diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,58240 @@ +{ + "best_global_step": 3626, + "best_metric": 0.08651281148195267, + "best_model_checkpoint": "saves/lora/llama-3-8b-instruct/train_piqa_1754507484/checkpoint-3626", + "epoch": 10.0, + "eval_steps": 1813, + "global_step": 36260, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0013789299503585218, + "grad_norm": 17.669382095336914, + "learning_rate": 5.5157198014340876e-08, + "loss": 1.0028, + "num_input_tokens_seen": 2848, + "step": 5 + }, + { + "epoch": 0.0027578599007170436, + "grad_norm": 12.519883155822754, + "learning_rate": 1.2410369553226697e-07, + "loss": 0.9575, + "num_input_tokens_seen": 5760, + "step": 10 + }, + { + "epoch": 0.004136789851075565, + "grad_norm": 16.950414657592773, + "learning_rate": 1.9305019305019306e-07, + "loss": 1.0904, + "num_input_tokens_seen": 8032, + "step": 15 + }, + { + "epoch": 0.005515719801434087, + "grad_norm": 20.871023178100586, + "learning_rate": 2.619966905681192e-07, + "loss": 0.9729, + "num_input_tokens_seen": 11712, + "step": 20 + }, + { + "epoch": 0.006894649751792609, + "grad_norm": 11.851168632507324, + "learning_rate": 3.309431880860452e-07, + "loss": 0.985, + "num_input_tokens_seen": 14112, + "step": 25 + }, + { + "epoch": 0.00827357970215113, + "grad_norm": 13.194154739379883, + "learning_rate": 3.998896856039713e-07, + "loss": 1.0233, + "num_input_tokens_seen": 18016, + "step": 30 + }, + { + "epoch": 0.009652509652509652, + "grad_norm": 10.514932632446289, + "learning_rate": 4.6883618312189744e-07, + "loss": 1.0284, + "num_input_tokens_seen": 21728, + "step": 35 + }, + { + "epoch": 0.011031439602868174, + "grad_norm": 14.124673843383789, + "learning_rate": 5.377826806398235e-07, + "loss": 0.9388, + "num_input_tokens_seen": 24224, + "step": 40 + }, + { + "epoch": 0.012410369553226696, + "grad_norm": 15.051218032836914, + "learning_rate": 6.067291781577496e-07, + "loss": 1.0727, + "num_input_tokens_seen": 26880, + "step": 45 + }, + { + "epoch": 0.013789299503585218, + "grad_norm": 10.610703468322754, + "learning_rate": 6.756756756756758e-07, + "loss": 0.8292, + "num_input_tokens_seen": 29056, + "step": 50 + }, + { + "epoch": 0.01516822945394374, + "grad_norm": 16.527925491333008, + "learning_rate": 7.446221731936018e-07, + "loss": 1.0232, + "num_input_tokens_seen": 31232, + "step": 55 + }, + { + "epoch": 0.01654715940430226, + "grad_norm": 13.132957458496094, + "learning_rate": 8.13568670711528e-07, + "loss": 0.6936, + "num_input_tokens_seen": 34080, + "step": 60 + }, + { + "epoch": 0.017926089354660783, + "grad_norm": 11.937808990478516, + "learning_rate": 8.82515168229454e-07, + "loss": 0.8574, + "num_input_tokens_seen": 37344, + "step": 65 + }, + { + "epoch": 0.019305019305019305, + "grad_norm": 12.995402336120605, + "learning_rate": 9.514616657473801e-07, + "loss": 0.709, + "num_input_tokens_seen": 39712, + "step": 70 + }, + { + "epoch": 0.020683949255377827, + "grad_norm": 13.506441116333008, + "learning_rate": 1.020408163265306e-06, + "loss": 1.0253, + "num_input_tokens_seen": 42400, + "step": 75 + }, + { + "epoch": 0.02206287920573635, + "grad_norm": 14.4645414352417, + "learning_rate": 1.0893546607832323e-06, + "loss": 0.8172, + "num_input_tokens_seen": 45344, + "step": 80 + }, + { + "epoch": 0.02344180915609487, + "grad_norm": 18.406402587890625, + "learning_rate": 1.1583011583011583e-06, + "loss": 1.0014, + "num_input_tokens_seen": 47840, + "step": 85 + }, + { + "epoch": 0.024820739106453393, + "grad_norm": 13.00782299041748, + "learning_rate": 1.2272476558190843e-06, + "loss": 0.4216, + "num_input_tokens_seen": 52352, + "step": 90 + }, + { + "epoch": 0.026199669056811915, + "grad_norm": 12.495691299438477, + "learning_rate": 1.2961941533370105e-06, + "loss": 0.878, + "num_input_tokens_seen": 55264, + "step": 95 + }, + { + "epoch": 0.027578599007170437, + "grad_norm": 10.091400146484375, + "learning_rate": 1.3651406508549365e-06, + "loss": 0.6017, + "num_input_tokens_seen": 59392, + "step": 100 + }, + { + "epoch": 0.02895752895752896, + "grad_norm": 9.02226448059082, + "learning_rate": 1.4340871483728628e-06, + "loss": 0.8679, + "num_input_tokens_seen": 62560, + "step": 105 + }, + { + "epoch": 0.03033645890788748, + "grad_norm": 10.768385887145996, + "learning_rate": 1.5030336458907888e-06, + "loss": 0.5288, + "num_input_tokens_seen": 65280, + "step": 110 + }, + { + "epoch": 0.031715388858246, + "grad_norm": 12.6966552734375, + "learning_rate": 1.5719801434087148e-06, + "loss": 0.4933, + "num_input_tokens_seen": 67744, + "step": 115 + }, + { + "epoch": 0.03309431880860452, + "grad_norm": 4.850258827209473, + "learning_rate": 1.6409266409266408e-06, + "loss": 0.6139, + "num_input_tokens_seen": 71776, + "step": 120 + }, + { + "epoch": 0.03447324875896304, + "grad_norm": 4.736606597900391, + "learning_rate": 1.709873138444567e-06, + "loss": 0.2704, + "num_input_tokens_seen": 74944, + "step": 125 + }, + { + "epoch": 0.035852178709321565, + "grad_norm": 2.542461633682251, + "learning_rate": 1.778819635962493e-06, + "loss": 0.447, + "num_input_tokens_seen": 77856, + "step": 130 + }, + { + "epoch": 0.03723110865968009, + "grad_norm": 1.265722393989563, + "learning_rate": 1.847766133480419e-06, + "loss": 0.1624, + "num_input_tokens_seen": 81472, + "step": 135 + }, + { + "epoch": 0.03861003861003861, + "grad_norm": 5.371358394622803, + "learning_rate": 1.9167126309983453e-06, + "loss": 0.2461, + "num_input_tokens_seen": 83968, + "step": 140 + }, + { + "epoch": 0.03998896856039713, + "grad_norm": 6.755163192749023, + "learning_rate": 1.9856591285162715e-06, + "loss": 0.1736, + "num_input_tokens_seen": 86528, + "step": 145 + }, + { + "epoch": 0.04136789851075565, + "grad_norm": 2.922980308532715, + "learning_rate": 2.0546056260341973e-06, + "loss": 0.3462, + "num_input_tokens_seen": 90880, + "step": 150 + }, + { + "epoch": 0.042746828461114175, + "grad_norm": 10.701380729675293, + "learning_rate": 2.1235521235521236e-06, + "loss": 0.2281, + "num_input_tokens_seen": 93536, + "step": 155 + }, + { + "epoch": 0.0441257584114727, + "grad_norm": 6.037044525146484, + "learning_rate": 2.1924986210700498e-06, + "loss": 0.414, + "num_input_tokens_seen": 97408, + "step": 160 + }, + { + "epoch": 0.04550468836183122, + "grad_norm": 0.928573489189148, + "learning_rate": 2.2614451185879756e-06, + "loss": 0.1749, + "num_input_tokens_seen": 100480, + "step": 165 + }, + { + "epoch": 0.04688361831218974, + "grad_norm": 5.118636131286621, + "learning_rate": 2.330391616105902e-06, + "loss": 0.1878, + "num_input_tokens_seen": 104704, + "step": 170 + }, + { + "epoch": 0.04826254826254826, + "grad_norm": 0.36207547783851624, + "learning_rate": 2.399338113623828e-06, + "loss": 0.025, + "num_input_tokens_seen": 106976, + "step": 175 + }, + { + "epoch": 0.049641478212906785, + "grad_norm": 9.21904468536377, + "learning_rate": 2.4682846111417543e-06, + "loss": 0.2351, + "num_input_tokens_seen": 109344, + "step": 180 + }, + { + "epoch": 0.05102040816326531, + "grad_norm": 2.6287364959716797, + "learning_rate": 2.53723110865968e-06, + "loss": 0.3986, + "num_input_tokens_seen": 112064, + "step": 185 + }, + { + "epoch": 0.05239933811362383, + "grad_norm": 4.83896541595459, + "learning_rate": 2.6061776061776063e-06, + "loss": 0.231, + "num_input_tokens_seen": 114528, + "step": 190 + }, + { + "epoch": 0.05377826806398235, + "grad_norm": 0.5552747845649719, + "learning_rate": 2.6751241036955325e-06, + "loss": 0.3256, + "num_input_tokens_seen": 118080, + "step": 195 + }, + { + "epoch": 0.05515719801434087, + "grad_norm": 9.750347137451172, + "learning_rate": 2.7440706012134583e-06, + "loss": 0.357, + "num_input_tokens_seen": 120320, + "step": 200 + }, + { + "epoch": 0.056536127964699395, + "grad_norm": 7.1485395431518555, + "learning_rate": 2.8130170987313846e-06, + "loss": 0.2071, + "num_input_tokens_seen": 122976, + "step": 205 + }, + { + "epoch": 0.05791505791505792, + "grad_norm": 5.439534664154053, + "learning_rate": 2.8819635962493108e-06, + "loss": 0.2674, + "num_input_tokens_seen": 125312, + "step": 210 + }, + { + "epoch": 0.05929398786541644, + "grad_norm": 0.04954156279563904, + "learning_rate": 2.950910093767237e-06, + "loss": 0.0931, + "num_input_tokens_seen": 129024, + "step": 215 + }, + { + "epoch": 0.06067291781577496, + "grad_norm": 4.755600452423096, + "learning_rate": 3.019856591285163e-06, + "loss": 0.3677, + "num_input_tokens_seen": 132128, + "step": 220 + }, + { + "epoch": 0.06205184776613348, + "grad_norm": 11.369396209716797, + "learning_rate": 3.088803088803089e-06, + "loss": 0.1927, + "num_input_tokens_seen": 134944, + "step": 225 + }, + { + "epoch": 0.063430777716492, + "grad_norm": 2.830983877182007, + "learning_rate": 3.157749586321015e-06, + "loss": 0.2736, + "num_input_tokens_seen": 138400, + "step": 230 + }, + { + "epoch": 0.06480970766685053, + "grad_norm": 4.4813361167907715, + "learning_rate": 3.226696083838941e-06, + "loss": 0.1733, + "num_input_tokens_seen": 141024, + "step": 235 + }, + { + "epoch": 0.06618863761720904, + "grad_norm": 9.926114082336426, + "learning_rate": 3.295642581356867e-06, + "loss": 0.1692, + "num_input_tokens_seen": 143616, + "step": 240 + }, + { + "epoch": 0.06756756756756757, + "grad_norm": 5.326972007751465, + "learning_rate": 3.3645890788747935e-06, + "loss": 0.2171, + "num_input_tokens_seen": 147392, + "step": 245 + }, + { + "epoch": 0.06894649751792609, + "grad_norm": 9.453497886657715, + "learning_rate": 3.4335355763927193e-06, + "loss": 0.1329, + "num_input_tokens_seen": 150432, + "step": 250 + }, + { + "epoch": 0.07032542746828462, + "grad_norm": 3.5676109790802, + "learning_rate": 3.5024820739106456e-06, + "loss": 0.2505, + "num_input_tokens_seen": 153632, + "step": 255 + }, + { + "epoch": 0.07170435741864313, + "grad_norm": 6.038978576660156, + "learning_rate": 3.5714285714285714e-06, + "loss": 0.1999, + "num_input_tokens_seen": 156192, + "step": 260 + }, + { + "epoch": 0.07308328736900166, + "grad_norm": 9.247962951660156, + "learning_rate": 3.640375068946498e-06, + "loss": 0.2638, + "num_input_tokens_seen": 160352, + "step": 265 + }, + { + "epoch": 0.07446221731936017, + "grad_norm": 1.5847299098968506, + "learning_rate": 3.709321566464424e-06, + "loss": 0.0676, + "num_input_tokens_seen": 163776, + "step": 270 + }, + { + "epoch": 0.0758411472697187, + "grad_norm": 3.1036360263824463, + "learning_rate": 3.77826806398235e-06, + "loss": 0.1888, + "num_input_tokens_seen": 167328, + "step": 275 + }, + { + "epoch": 0.07722007722007722, + "grad_norm": 7.843040466308594, + "learning_rate": 3.847214561500275e-06, + "loss": 0.3264, + "num_input_tokens_seen": 170336, + "step": 280 + }, + { + "epoch": 0.07859900717043575, + "grad_norm": 2.880110263824463, + "learning_rate": 3.916161059018202e-06, + "loss": 0.1758, + "num_input_tokens_seen": 173088, + "step": 285 + }, + { + "epoch": 0.07997793712079426, + "grad_norm": 5.429824352264404, + "learning_rate": 3.985107556536128e-06, + "loss": 0.3236, + "num_input_tokens_seen": 176992, + "step": 290 + }, + { + "epoch": 0.08135686707115279, + "grad_norm": 1.36207115650177, + "learning_rate": 4.0540540540540545e-06, + "loss": 0.0426, + "num_input_tokens_seen": 179968, + "step": 295 + }, + { + "epoch": 0.0827357970215113, + "grad_norm": 3.6918790340423584, + "learning_rate": 4.12300055157198e-06, + "loss": 0.3515, + "num_input_tokens_seen": 183616, + "step": 300 + }, + { + "epoch": 0.08411472697186984, + "grad_norm": 6.609649658203125, + "learning_rate": 4.191947049089907e-06, + "loss": 0.2225, + "num_input_tokens_seen": 186336, + "step": 305 + }, + { + "epoch": 0.08549365692222835, + "grad_norm": 3.1137285232543945, + "learning_rate": 4.260893546607833e-06, + "loss": 0.0901, + "num_input_tokens_seen": 189504, + "step": 310 + }, + { + "epoch": 0.08687258687258688, + "grad_norm": 0.6122715473175049, + "learning_rate": 4.329840044125759e-06, + "loss": 0.1338, + "num_input_tokens_seen": 191936, + "step": 315 + }, + { + "epoch": 0.0882515168229454, + "grad_norm": 1.1380311250686646, + "learning_rate": 4.398786541643684e-06, + "loss": 0.3522, + "num_input_tokens_seen": 194912, + "step": 320 + }, + { + "epoch": 0.08963044677330391, + "grad_norm": 6.456205368041992, + "learning_rate": 4.467733039161611e-06, + "loss": 0.1393, + "num_input_tokens_seen": 197824, + "step": 325 + }, + { + "epoch": 0.09100937672366244, + "grad_norm": 3.4968087673187256, + "learning_rate": 4.536679536679537e-06, + "loss": 0.1981, + "num_input_tokens_seen": 201344, + "step": 330 + }, + { + "epoch": 0.09238830667402095, + "grad_norm": 3.6531221866607666, + "learning_rate": 4.6056260341974635e-06, + "loss": 0.1377, + "num_input_tokens_seen": 204576, + "step": 335 + }, + { + "epoch": 0.09376723662437948, + "grad_norm": 2.0937891006469727, + "learning_rate": 4.674572531715389e-06, + "loss": 0.1207, + "num_input_tokens_seen": 207232, + "step": 340 + }, + { + "epoch": 0.095146166574738, + "grad_norm": 4.6248016357421875, + "learning_rate": 4.743519029233315e-06, + "loss": 0.2269, + "num_input_tokens_seen": 210528, + "step": 345 + }, + { + "epoch": 0.09652509652509653, + "grad_norm": 0.591311514377594, + "learning_rate": 4.812465526751241e-06, + "loss": 0.0605, + "num_input_tokens_seen": 212704, + "step": 350 + }, + { + "epoch": 0.09790402647545504, + "grad_norm": 3.344120979309082, + "learning_rate": 4.881412024269167e-06, + "loss": 0.0962, + "num_input_tokens_seen": 215776, + "step": 355 + }, + { + "epoch": 0.09928295642581357, + "grad_norm": 2.304752826690674, + "learning_rate": 4.950358521787093e-06, + "loss": 0.088, + "num_input_tokens_seen": 218496, + "step": 360 + }, + { + "epoch": 0.10066188637617209, + "grad_norm": 0.8403867483139038, + "learning_rate": 5.019305019305019e-06, + "loss": 0.1856, + "num_input_tokens_seen": 222240, + "step": 365 + }, + { + "epoch": 0.10204081632653061, + "grad_norm": 2.6182518005371094, + "learning_rate": 5.088251516822946e-06, + "loss": 0.0668, + "num_input_tokens_seen": 225248, + "step": 370 + }, + { + "epoch": 0.10341974627688913, + "grad_norm": 0.15572142601013184, + "learning_rate": 5.157198014340872e-06, + "loss": 0.0359, + "num_input_tokens_seen": 227872, + "step": 375 + }, + { + "epoch": 0.10479867622724766, + "grad_norm": 4.596789360046387, + "learning_rate": 5.226144511858798e-06, + "loss": 0.1299, + "num_input_tokens_seen": 230528, + "step": 380 + }, + { + "epoch": 0.10617760617760617, + "grad_norm": 2.9758574962615967, + "learning_rate": 5.295091009376723e-06, + "loss": 0.2488, + "num_input_tokens_seen": 233440, + "step": 385 + }, + { + "epoch": 0.1075565361279647, + "grad_norm": 4.243319034576416, + "learning_rate": 5.36403750689465e-06, + "loss": 0.2388, + "num_input_tokens_seen": 237216, + "step": 390 + }, + { + "epoch": 0.10893546607832322, + "grad_norm": 3.098375082015991, + "learning_rate": 5.432984004412576e-06, + "loss": 0.1085, + "num_input_tokens_seen": 239328, + "step": 395 + }, + { + "epoch": 0.11031439602868175, + "grad_norm": 0.4289655089378357, + "learning_rate": 5.501930501930502e-06, + "loss": 0.0505, + "num_input_tokens_seen": 241856, + "step": 400 + }, + { + "epoch": 0.11169332597904026, + "grad_norm": 2.5884387493133545, + "learning_rate": 5.570876999448428e-06, + "loss": 0.0662, + "num_input_tokens_seen": 244416, + "step": 405 + }, + { + "epoch": 0.11307225592939879, + "grad_norm": 3.5801708698272705, + "learning_rate": 5.639823496966355e-06, + "loss": 0.0659, + "num_input_tokens_seen": 247168, + "step": 410 + }, + { + "epoch": 0.1144511858797573, + "grad_norm": 0.6695566177368164, + "learning_rate": 5.708769994484281e-06, + "loss": 0.2146, + "num_input_tokens_seen": 250560, + "step": 415 + }, + { + "epoch": 0.11583011583011583, + "grad_norm": 6.132290840148926, + "learning_rate": 5.777716492002206e-06, + "loss": 0.1969, + "num_input_tokens_seen": 253440, + "step": 420 + }, + { + "epoch": 0.11720904578047435, + "grad_norm": 8.758849143981934, + "learning_rate": 5.846662989520132e-06, + "loss": 0.079, + "num_input_tokens_seen": 256032, + "step": 425 + }, + { + "epoch": 0.11858797573083288, + "grad_norm": 2.1929805278778076, + "learning_rate": 5.915609487038059e-06, + "loss": 0.1566, + "num_input_tokens_seen": 259424, + "step": 430 + }, + { + "epoch": 0.1199669056811914, + "grad_norm": 5.018036365509033, + "learning_rate": 5.984555984555985e-06, + "loss": 0.1678, + "num_input_tokens_seen": 262720, + "step": 435 + }, + { + "epoch": 0.12134583563154992, + "grad_norm": 2.2784295082092285, + "learning_rate": 6.053502482073911e-06, + "loss": 0.1697, + "num_input_tokens_seen": 265216, + "step": 440 + }, + { + "epoch": 0.12272476558190844, + "grad_norm": 2.239527702331543, + "learning_rate": 6.122448979591837e-06, + "loss": 0.0977, + "num_input_tokens_seen": 267808, + "step": 445 + }, + { + "epoch": 0.12410369553226697, + "grad_norm": 7.1208600997924805, + "learning_rate": 6.191395477109764e-06, + "loss": 0.2573, + "num_input_tokens_seen": 270848, + "step": 450 + }, + { + "epoch": 0.12548262548262548, + "grad_norm": 2.7061214447021484, + "learning_rate": 6.2603419746276896e-06, + "loss": 0.1549, + "num_input_tokens_seen": 273760, + "step": 455 + }, + { + "epoch": 0.126861555432984, + "grad_norm": 1.166291356086731, + "learning_rate": 6.329288472145615e-06, + "loss": 0.3127, + "num_input_tokens_seen": 276544, + "step": 460 + }, + { + "epoch": 0.12824048538334254, + "grad_norm": 1.9800664186477661, + "learning_rate": 6.398234969663541e-06, + "loss": 0.1809, + "num_input_tokens_seen": 279072, + "step": 465 + }, + { + "epoch": 0.12961941533370105, + "grad_norm": 1.9072437286376953, + "learning_rate": 6.467181467181467e-06, + "loss": 0.1863, + "num_input_tokens_seen": 283648, + "step": 470 + }, + { + "epoch": 0.13099834528405957, + "grad_norm": 2.879823923110962, + "learning_rate": 6.5361279646993945e-06, + "loss": 0.0864, + "num_input_tokens_seen": 286656, + "step": 475 + }, + { + "epoch": 0.13237727523441808, + "grad_norm": 2.3689773082733154, + "learning_rate": 6.60507446221732e-06, + "loss": 0.1479, + "num_input_tokens_seen": 289280, + "step": 480 + }, + { + "epoch": 0.1337562051847766, + "grad_norm": 2.9115259647369385, + "learning_rate": 6.674020959735246e-06, + "loss": 0.2347, + "num_input_tokens_seen": 291776, + "step": 485 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 1.5776349306106567, + "learning_rate": 6.742967457253172e-06, + "loss": 0.0784, + "num_input_tokens_seen": 294016, + "step": 490 + }, + { + "epoch": 0.13651406508549366, + "grad_norm": 4.238711833953857, + "learning_rate": 6.8119139547710985e-06, + "loss": 0.1207, + "num_input_tokens_seen": 296416, + "step": 495 + }, + { + "epoch": 0.13789299503585217, + "grad_norm": 3.7396655082702637, + "learning_rate": 6.880860452289024e-06, + "loss": 0.1375, + "num_input_tokens_seen": 300096, + "step": 500 + }, + { + "epoch": 0.1392719249862107, + "grad_norm": 8.749637603759766, + "learning_rate": 6.94980694980695e-06, + "loss": 0.1348, + "num_input_tokens_seen": 303040, + "step": 505 + }, + { + "epoch": 0.14065085493656923, + "grad_norm": 7.4990553855896, + "learning_rate": 7.018753447324876e-06, + "loss": 0.3134, + "num_input_tokens_seen": 306464, + "step": 510 + }, + { + "epoch": 0.14202978488692775, + "grad_norm": 3.6203975677490234, + "learning_rate": 7.087699944842802e-06, + "loss": 0.1024, + "num_input_tokens_seen": 309472, + "step": 515 + }, + { + "epoch": 0.14340871483728626, + "grad_norm": 0.17645098268985748, + "learning_rate": 7.156646442360728e-06, + "loss": 0.0612, + "num_input_tokens_seen": 313024, + "step": 520 + }, + { + "epoch": 0.14478764478764478, + "grad_norm": 1.150489330291748, + "learning_rate": 7.225592939878654e-06, + "loss": 0.1524, + "num_input_tokens_seen": 315968, + "step": 525 + }, + { + "epoch": 0.14616657473800332, + "grad_norm": 1.22951078414917, + "learning_rate": 7.29453943739658e-06, + "loss": 0.0898, + "num_input_tokens_seen": 319136, + "step": 530 + }, + { + "epoch": 0.14754550468836183, + "grad_norm": 8.32089614868164, + "learning_rate": 7.363485934914506e-06, + "loss": 0.3482, + "num_input_tokens_seen": 322816, + "step": 535 + }, + { + "epoch": 0.14892443463872035, + "grad_norm": 2.5876495838165283, + "learning_rate": 7.432432432432433e-06, + "loss": 0.1018, + "num_input_tokens_seen": 325536, + "step": 540 + }, + { + "epoch": 0.15030336458907886, + "grad_norm": 1.5536537170410156, + "learning_rate": 7.501378929950359e-06, + "loss": 0.1982, + "num_input_tokens_seen": 328384, + "step": 545 + }, + { + "epoch": 0.1516822945394374, + "grad_norm": 2.8949737548828125, + "learning_rate": 7.570325427468285e-06, + "loss": 0.0413, + "num_input_tokens_seen": 332000, + "step": 550 + }, + { + "epoch": 0.15306122448979592, + "grad_norm": 1.9417126178741455, + "learning_rate": 7.63927192498621e-06, + "loss": 0.0576, + "num_input_tokens_seen": 334752, + "step": 555 + }, + { + "epoch": 0.15444015444015444, + "grad_norm": 11.754399299621582, + "learning_rate": 7.708218422504138e-06, + "loss": 0.2015, + "num_input_tokens_seen": 337792, + "step": 560 + }, + { + "epoch": 0.15581908439051295, + "grad_norm": 1.8760361671447754, + "learning_rate": 7.777164920022063e-06, + "loss": 0.1955, + "num_input_tokens_seen": 340128, + "step": 565 + }, + { + "epoch": 0.1571980143408715, + "grad_norm": 3.6601195335388184, + "learning_rate": 7.84611141753999e-06, + "loss": 0.1704, + "num_input_tokens_seen": 342880, + "step": 570 + }, + { + "epoch": 0.15857694429123, + "grad_norm": 0.32053330540657043, + "learning_rate": 7.915057915057915e-06, + "loss": 0.1103, + "num_input_tokens_seen": 345824, + "step": 575 + }, + { + "epoch": 0.15995587424158852, + "grad_norm": 0.8565095067024231, + "learning_rate": 7.984004412575841e-06, + "loss": 0.1035, + "num_input_tokens_seen": 348128, + "step": 580 + }, + { + "epoch": 0.16133480419194704, + "grad_norm": 1.4693669080734253, + "learning_rate": 8.052950910093768e-06, + "loss": 0.1242, + "num_input_tokens_seen": 350976, + "step": 585 + }, + { + "epoch": 0.16271373414230558, + "grad_norm": 0.8791829347610474, + "learning_rate": 8.121897407611693e-06, + "loss": 0.0733, + "num_input_tokens_seen": 353984, + "step": 590 + }, + { + "epoch": 0.1640926640926641, + "grad_norm": 4.294346332550049, + "learning_rate": 8.19084390512962e-06, + "loss": 0.2745, + "num_input_tokens_seen": 356608, + "step": 595 + }, + { + "epoch": 0.1654715940430226, + "grad_norm": 4.228916645050049, + "learning_rate": 8.259790402647546e-06, + "loss": 0.1379, + "num_input_tokens_seen": 359360, + "step": 600 + }, + { + "epoch": 0.16685052399338113, + "grad_norm": 2.4526193141937256, + "learning_rate": 8.328736900165473e-06, + "loss": 0.0612, + "num_input_tokens_seen": 361696, + "step": 605 + }, + { + "epoch": 0.16822945394373967, + "grad_norm": 7.250700950622559, + "learning_rate": 8.397683397683398e-06, + "loss": 0.3031, + "num_input_tokens_seen": 364640, + "step": 610 + }, + { + "epoch": 0.16960838389409819, + "grad_norm": 0.9875712394714355, + "learning_rate": 8.466629895201323e-06, + "loss": 0.2168, + "num_input_tokens_seen": 367424, + "step": 615 + }, + { + "epoch": 0.1709873138444567, + "grad_norm": 3.5654375553131104, + "learning_rate": 8.535576392719251e-06, + "loss": 0.1513, + "num_input_tokens_seen": 370144, + "step": 620 + }, + { + "epoch": 0.17236624379481522, + "grad_norm": 7.824184417724609, + "learning_rate": 8.604522890237176e-06, + "loss": 0.1476, + "num_input_tokens_seen": 372960, + "step": 625 + }, + { + "epoch": 0.17374517374517376, + "grad_norm": 2.2566745281219482, + "learning_rate": 8.673469387755103e-06, + "loss": 0.0805, + "num_input_tokens_seen": 375584, + "step": 630 + }, + { + "epoch": 0.17512410369553227, + "grad_norm": 8.761214256286621, + "learning_rate": 8.742415885273028e-06, + "loss": 0.0878, + "num_input_tokens_seen": 378368, + "step": 635 + }, + { + "epoch": 0.1765030336458908, + "grad_norm": 2.407548189163208, + "learning_rate": 8.811362382790954e-06, + "loss": 0.2009, + "num_input_tokens_seen": 381408, + "step": 640 + }, + { + "epoch": 0.1778819635962493, + "grad_norm": 9.127442359924316, + "learning_rate": 8.880308880308881e-06, + "loss": 0.2805, + "num_input_tokens_seen": 385120, + "step": 645 + }, + { + "epoch": 0.17926089354660782, + "grad_norm": 4.0799689292907715, + "learning_rate": 8.949255377826806e-06, + "loss": 0.077, + "num_input_tokens_seen": 388736, + "step": 650 + }, + { + "epoch": 0.18063982349696636, + "grad_norm": 0.8196153044700623, + "learning_rate": 9.018201875344733e-06, + "loss": 0.0423, + "num_input_tokens_seen": 391296, + "step": 655 + }, + { + "epoch": 0.18201875344732488, + "grad_norm": 0.7264888882637024, + "learning_rate": 9.08714837286266e-06, + "loss": 0.1452, + "num_input_tokens_seen": 394464, + "step": 660 + }, + { + "epoch": 0.1833976833976834, + "grad_norm": 2.7621374130249023, + "learning_rate": 9.156094870380586e-06, + "loss": 0.1126, + "num_input_tokens_seen": 397728, + "step": 665 + }, + { + "epoch": 0.1847766133480419, + "grad_norm": 2.0738942623138428, + "learning_rate": 9.225041367898511e-06, + "loss": 0.1552, + "num_input_tokens_seen": 400288, + "step": 670 + }, + { + "epoch": 0.18615554329840045, + "grad_norm": 5.758554458618164, + "learning_rate": 9.293987865416438e-06, + "loss": 0.1061, + "num_input_tokens_seen": 403552, + "step": 675 + }, + { + "epoch": 0.18753447324875896, + "grad_norm": 3.4465649127960205, + "learning_rate": 9.362934362934363e-06, + "loss": 0.1482, + "num_input_tokens_seen": 406336, + "step": 680 + }, + { + "epoch": 0.18891340319911748, + "grad_norm": 3.8320937156677246, + "learning_rate": 9.43188086045229e-06, + "loss": 0.1531, + "num_input_tokens_seen": 409376, + "step": 685 + }, + { + "epoch": 0.190292333149476, + "grad_norm": 5.675673007965088, + "learning_rate": 9.500827357970216e-06, + "loss": 0.0885, + "num_input_tokens_seen": 412288, + "step": 690 + }, + { + "epoch": 0.19167126309983454, + "grad_norm": 2.0004618167877197, + "learning_rate": 9.56977385548814e-06, + "loss": 0.0576, + "num_input_tokens_seen": 416256, + "step": 695 + }, + { + "epoch": 0.19305019305019305, + "grad_norm": 13.650708198547363, + "learning_rate": 9.638720353006067e-06, + "loss": 0.2956, + "num_input_tokens_seen": 418912, + "step": 700 + }, + { + "epoch": 0.19442912300055157, + "grad_norm": 1.8551872968673706, + "learning_rate": 9.707666850523994e-06, + "loss": 0.1216, + "num_input_tokens_seen": 421920, + "step": 705 + }, + { + "epoch": 0.19580805295091008, + "grad_norm": 2.9785971641540527, + "learning_rate": 9.77661334804192e-06, + "loss": 0.0975, + "num_input_tokens_seen": 424000, + "step": 710 + }, + { + "epoch": 0.19718698290126863, + "grad_norm": 0.36172378063201904, + "learning_rate": 9.845559845559846e-06, + "loss": 0.2162, + "num_input_tokens_seen": 426720, + "step": 715 + }, + { + "epoch": 0.19856591285162714, + "grad_norm": 6.09363317489624, + "learning_rate": 9.914506343077772e-06, + "loss": 0.0845, + "num_input_tokens_seen": 429856, + "step": 720 + }, + { + "epoch": 0.19994484280198566, + "grad_norm": 8.574639320373535, + "learning_rate": 9.983452840595699e-06, + "loss": 0.1461, + "num_input_tokens_seen": 433184, + "step": 725 + }, + { + "epoch": 0.20132377275234417, + "grad_norm": 3.739149808883667, + "learning_rate": 1.0052399338113624e-05, + "loss": 0.2545, + "num_input_tokens_seen": 436640, + "step": 730 + }, + { + "epoch": 0.20270270270270271, + "grad_norm": 8.298757553100586, + "learning_rate": 1.012134583563155e-05, + "loss": 0.1301, + "num_input_tokens_seen": 440512, + "step": 735 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 0.5238338112831116, + "learning_rate": 1.0190292333149476e-05, + "loss": 0.1534, + "num_input_tokens_seen": 444032, + "step": 740 + }, + { + "epoch": 0.20546056260341974, + "grad_norm": 6.082416534423828, + "learning_rate": 1.0259238830667404e-05, + "loss": 0.1086, + "num_input_tokens_seen": 447136, + "step": 745 + }, + { + "epoch": 0.20683949255377826, + "grad_norm": 5.644254207611084, + "learning_rate": 1.0328185328185329e-05, + "loss": 0.1734, + "num_input_tokens_seen": 449728, + "step": 750 + }, + { + "epoch": 0.2082184225041368, + "grad_norm": 1.8225598335266113, + "learning_rate": 1.0397131825703254e-05, + "loss": 0.1162, + "num_input_tokens_seen": 453216, + "step": 755 + }, + { + "epoch": 0.20959735245449532, + "grad_norm": 1.6550841331481934, + "learning_rate": 1.046607832322118e-05, + "loss": 0.1742, + "num_input_tokens_seen": 455968, + "step": 760 + }, + { + "epoch": 0.21097628240485383, + "grad_norm": 0.6889269948005676, + "learning_rate": 1.0535024820739107e-05, + "loss": 0.0803, + "num_input_tokens_seen": 458176, + "step": 765 + }, + { + "epoch": 0.21235521235521235, + "grad_norm": 0.6242392659187317, + "learning_rate": 1.0603971318257034e-05, + "loss": 0.11, + "num_input_tokens_seen": 461280, + "step": 770 + }, + { + "epoch": 0.2137341423055709, + "grad_norm": 3.449282169342041, + "learning_rate": 1.0672917815774959e-05, + "loss": 0.1382, + "num_input_tokens_seen": 465312, + "step": 775 + }, + { + "epoch": 0.2151130722559294, + "grad_norm": 4.157360553741455, + "learning_rate": 1.0741864313292885e-05, + "loss": 0.2158, + "num_input_tokens_seen": 469088, + "step": 780 + }, + { + "epoch": 0.21649200220628792, + "grad_norm": 3.103505849838257, + "learning_rate": 1.0810810810810812e-05, + "loss": 0.0473, + "num_input_tokens_seen": 472320, + "step": 785 + }, + { + "epoch": 0.21787093215664644, + "grad_norm": 1.541084885597229, + "learning_rate": 1.0879757308328737e-05, + "loss": 0.0865, + "num_input_tokens_seen": 475040, + "step": 790 + }, + { + "epoch": 0.21924986210700498, + "grad_norm": 0.27069300413131714, + "learning_rate": 1.0948703805846664e-05, + "loss": 0.0691, + "num_input_tokens_seen": 477440, + "step": 795 + }, + { + "epoch": 0.2206287920573635, + "grad_norm": 6.6102166175842285, + "learning_rate": 1.1017650303364589e-05, + "loss": 0.1587, + "num_input_tokens_seen": 481216, + "step": 800 + }, + { + "epoch": 0.222007722007722, + "grad_norm": 0.9398313760757446, + "learning_rate": 1.1086596800882517e-05, + "loss": 0.0477, + "num_input_tokens_seen": 484608, + "step": 805 + }, + { + "epoch": 0.22338665195808052, + "grad_norm": 0.8227260708808899, + "learning_rate": 1.1155543298400442e-05, + "loss": 0.0163, + "num_input_tokens_seen": 487712, + "step": 810 + }, + { + "epoch": 0.22476558190843904, + "grad_norm": 2.493384838104248, + "learning_rate": 1.1224489795918369e-05, + "loss": 0.1218, + "num_input_tokens_seen": 490784, + "step": 815 + }, + { + "epoch": 0.22614451185879758, + "grad_norm": 0.1838986575603485, + "learning_rate": 1.1293436293436294e-05, + "loss": 0.2492, + "num_input_tokens_seen": 493472, + "step": 820 + }, + { + "epoch": 0.2275234418091561, + "grad_norm": 2.173588991165161, + "learning_rate": 1.136238279095422e-05, + "loss": 0.0771, + "num_input_tokens_seen": 496768, + "step": 825 + }, + { + "epoch": 0.2289023717595146, + "grad_norm": 6.349250316619873, + "learning_rate": 1.1431329288472147e-05, + "loss": 0.2474, + "num_input_tokens_seen": 499616, + "step": 830 + }, + { + "epoch": 0.23028130170987313, + "grad_norm": 5.4819183349609375, + "learning_rate": 1.1500275785990072e-05, + "loss": 0.1169, + "num_input_tokens_seen": 502144, + "step": 835 + }, + { + "epoch": 0.23166023166023167, + "grad_norm": 0.6358250975608826, + "learning_rate": 1.1569222283507998e-05, + "loss": 0.0383, + "num_input_tokens_seen": 505216, + "step": 840 + }, + { + "epoch": 0.23303916161059018, + "grad_norm": 1.7361162900924683, + "learning_rate": 1.1638168781025923e-05, + "loss": 0.0655, + "num_input_tokens_seen": 508032, + "step": 845 + }, + { + "epoch": 0.2344180915609487, + "grad_norm": 0.7072999477386475, + "learning_rate": 1.1707115278543852e-05, + "loss": 0.0876, + "num_input_tokens_seen": 511264, + "step": 850 + }, + { + "epoch": 0.23579702151130721, + "grad_norm": 5.997035503387451, + "learning_rate": 1.1776061776061777e-05, + "loss": 0.0729, + "num_input_tokens_seen": 516384, + "step": 855 + }, + { + "epoch": 0.23717595146166576, + "grad_norm": 0.02051747776567936, + "learning_rate": 1.1845008273579702e-05, + "loss": 0.037, + "num_input_tokens_seen": 519872, + "step": 860 + }, + { + "epoch": 0.23855488141202427, + "grad_norm": 6.058706283569336, + "learning_rate": 1.1913954771097628e-05, + "loss": 0.0903, + "num_input_tokens_seen": 522816, + "step": 865 + }, + { + "epoch": 0.2399338113623828, + "grad_norm": 0.9495835304260254, + "learning_rate": 1.1982901268615555e-05, + "loss": 0.2302, + "num_input_tokens_seen": 526240, + "step": 870 + }, + { + "epoch": 0.2413127413127413, + "grad_norm": 15.063459396362305, + "learning_rate": 1.2051847766133482e-05, + "loss": 0.3389, + "num_input_tokens_seen": 528960, + "step": 875 + }, + { + "epoch": 0.24269167126309985, + "grad_norm": 0.02238612249493599, + "learning_rate": 1.2120794263651407e-05, + "loss": 0.0851, + "num_input_tokens_seen": 532544, + "step": 880 + }, + { + "epoch": 0.24407060121345836, + "grad_norm": 10.013882637023926, + "learning_rate": 1.2189740761169333e-05, + "loss": 0.0875, + "num_input_tokens_seen": 535328, + "step": 885 + }, + { + "epoch": 0.24544953116381688, + "grad_norm": 7.876750946044922, + "learning_rate": 1.225868725868726e-05, + "loss": 0.1118, + "num_input_tokens_seen": 538048, + "step": 890 + }, + { + "epoch": 0.2468284611141754, + "grad_norm": 0.9142919778823853, + "learning_rate": 1.2327633756205185e-05, + "loss": 0.0619, + "num_input_tokens_seen": 540832, + "step": 895 + }, + { + "epoch": 0.24820739106453393, + "grad_norm": 4.673208713531494, + "learning_rate": 1.2396580253723111e-05, + "loss": 0.0613, + "num_input_tokens_seen": 545632, + "step": 900 + }, + { + "epoch": 0.24958632101489245, + "grad_norm": 7.063283920288086, + "learning_rate": 1.2465526751241036e-05, + "loss": 0.1849, + "num_input_tokens_seen": 547840, + "step": 905 + }, + { + "epoch": 0.25096525096525096, + "grad_norm": 2.792875051498413, + "learning_rate": 1.2534473248758963e-05, + "loss": 0.2089, + "num_input_tokens_seen": 549952, + "step": 910 + }, + { + "epoch": 0.2523441809156095, + "grad_norm": 0.5728846788406372, + "learning_rate": 1.260341974627689e-05, + "loss": 0.1107, + "num_input_tokens_seen": 552800, + "step": 915 + }, + { + "epoch": 0.253723110865968, + "grad_norm": 4.2356276512146, + "learning_rate": 1.2672366243794816e-05, + "loss": 0.148, + "num_input_tokens_seen": 556928, + "step": 920 + }, + { + "epoch": 0.25510204081632654, + "grad_norm": 0.20195533335208893, + "learning_rate": 1.2741312741312741e-05, + "loss": 0.0527, + "num_input_tokens_seen": 559744, + "step": 925 + }, + { + "epoch": 0.2564809707666851, + "grad_norm": 2.941655158996582, + "learning_rate": 1.2810259238830668e-05, + "loss": 0.1174, + "num_input_tokens_seen": 562272, + "step": 930 + }, + { + "epoch": 0.25785990071704357, + "grad_norm": 2.4425055980682373, + "learning_rate": 1.2879205736348593e-05, + "loss": 0.0961, + "num_input_tokens_seen": 564512, + "step": 935 + }, + { + "epoch": 0.2592388306674021, + "grad_norm": 0.46459758281707764, + "learning_rate": 1.294815223386652e-05, + "loss": 0.0335, + "num_input_tokens_seen": 568000, + "step": 940 + }, + { + "epoch": 0.2606177606177606, + "grad_norm": 1.22327721118927, + "learning_rate": 1.3017098731384448e-05, + "loss": 0.0943, + "num_input_tokens_seen": 575520, + "step": 945 + }, + { + "epoch": 0.26199669056811914, + "grad_norm": 1.694279670715332, + "learning_rate": 1.3086045228902371e-05, + "loss": 0.0553, + "num_input_tokens_seen": 578944, + "step": 950 + }, + { + "epoch": 0.2633756205184777, + "grad_norm": 4.443500995635986, + "learning_rate": 1.31549917264203e-05, + "loss": 0.0884, + "num_input_tokens_seen": 581920, + "step": 955 + }, + { + "epoch": 0.26475455046883617, + "grad_norm": 8.392375946044922, + "learning_rate": 1.3223938223938226e-05, + "loss": 0.3079, + "num_input_tokens_seen": 584288, + "step": 960 + }, + { + "epoch": 0.2661334804191947, + "grad_norm": 1.871351718902588, + "learning_rate": 1.329288472145615e-05, + "loss": 0.2113, + "num_input_tokens_seen": 588256, + "step": 965 + }, + { + "epoch": 0.2675124103695532, + "grad_norm": 4.619775295257568, + "learning_rate": 1.3361831218974078e-05, + "loss": 0.0899, + "num_input_tokens_seen": 590912, + "step": 970 + }, + { + "epoch": 0.26889134031991174, + "grad_norm": 0.030944593250751495, + "learning_rate": 1.3430777716492001e-05, + "loss": 0.0264, + "num_input_tokens_seen": 594016, + "step": 975 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 2.6039185523986816, + "learning_rate": 1.349972421400993e-05, + "loss": 0.0369, + "num_input_tokens_seen": 598592, + "step": 980 + }, + { + "epoch": 0.2716492002206288, + "grad_norm": 4.08351993560791, + "learning_rate": 1.3568670711527856e-05, + "loss": 0.0794, + "num_input_tokens_seen": 601280, + "step": 985 + }, + { + "epoch": 0.2730281301709873, + "grad_norm": 9.411627769470215, + "learning_rate": 1.3637617209045781e-05, + "loss": 0.2106, + "num_input_tokens_seen": 603712, + "step": 990 + }, + { + "epoch": 0.27440706012134586, + "grad_norm": 5.827423095703125, + "learning_rate": 1.3706563706563708e-05, + "loss": 0.3632, + "num_input_tokens_seen": 607136, + "step": 995 + }, + { + "epoch": 0.27578599007170435, + "grad_norm": 3.1451611518859863, + "learning_rate": 1.3775510204081633e-05, + "loss": 0.104, + "num_input_tokens_seen": 611648, + "step": 1000 + }, + { + "epoch": 0.2771649200220629, + "grad_norm": 6.672887802124023, + "learning_rate": 1.384445670159956e-05, + "loss": 0.1017, + "num_input_tokens_seen": 615872, + "step": 1005 + }, + { + "epoch": 0.2785438499724214, + "grad_norm": 0.8264055848121643, + "learning_rate": 1.3913403199117486e-05, + "loss": 0.0645, + "num_input_tokens_seen": 618880, + "step": 1010 + }, + { + "epoch": 0.2799227799227799, + "grad_norm": 1.9958924055099487, + "learning_rate": 1.3982349696635411e-05, + "loss": 0.0939, + "num_input_tokens_seen": 621600, + "step": 1015 + }, + { + "epoch": 0.28130170987313846, + "grad_norm": 4.987821102142334, + "learning_rate": 1.4051296194153338e-05, + "loss": 0.0655, + "num_input_tokens_seen": 624640, + "step": 1020 + }, + { + "epoch": 0.28268063982349695, + "grad_norm": 0.46736621856689453, + "learning_rate": 1.4120242691671264e-05, + "loss": 0.0496, + "num_input_tokens_seen": 627872, + "step": 1025 + }, + { + "epoch": 0.2840595697738555, + "grad_norm": 0.9190738797187805, + "learning_rate": 1.4189189189189189e-05, + "loss": 0.1605, + "num_input_tokens_seen": 630752, + "step": 1030 + }, + { + "epoch": 0.28543849972421403, + "grad_norm": 2.2415411472320557, + "learning_rate": 1.4258135686707116e-05, + "loss": 0.0605, + "num_input_tokens_seen": 634176, + "step": 1035 + }, + { + "epoch": 0.2868174296745725, + "grad_norm": 6.420763969421387, + "learning_rate": 1.432708218422504e-05, + "loss": 0.3454, + "num_input_tokens_seen": 637056, + "step": 1040 + }, + { + "epoch": 0.28819635962493106, + "grad_norm": 6.211145401000977, + "learning_rate": 1.4396028681742967e-05, + "loss": 0.1984, + "num_input_tokens_seen": 639936, + "step": 1045 + }, + { + "epoch": 0.28957528957528955, + "grad_norm": 3.225574493408203, + "learning_rate": 1.4464975179260896e-05, + "loss": 0.1364, + "num_input_tokens_seen": 642464, + "step": 1050 + }, + { + "epoch": 0.2909542195256481, + "grad_norm": 3.829580068588257, + "learning_rate": 1.4533921676778819e-05, + "loss": 0.1217, + "num_input_tokens_seen": 645184, + "step": 1055 + }, + { + "epoch": 0.29233314947600664, + "grad_norm": 4.017049312591553, + "learning_rate": 1.4602868174296747e-05, + "loss": 0.0617, + "num_input_tokens_seen": 648480, + "step": 1060 + }, + { + "epoch": 0.2937120794263651, + "grad_norm": 6.049471855163574, + "learning_rate": 1.4671814671814674e-05, + "loss": 0.1155, + "num_input_tokens_seen": 651680, + "step": 1065 + }, + { + "epoch": 0.29509100937672367, + "grad_norm": 1.8519240617752075, + "learning_rate": 1.4740761169332597e-05, + "loss": 0.0829, + "num_input_tokens_seen": 654752, + "step": 1070 + }, + { + "epoch": 0.2964699393270822, + "grad_norm": 0.4084640145301819, + "learning_rate": 1.4809707666850526e-05, + "loss": 0.1554, + "num_input_tokens_seen": 657952, + "step": 1075 + }, + { + "epoch": 0.2978488692774407, + "grad_norm": 3.1929094791412354, + "learning_rate": 1.4878654164368449e-05, + "loss": 0.0438, + "num_input_tokens_seen": 660800, + "step": 1080 + }, + { + "epoch": 0.29922779922779924, + "grad_norm": 5.491050720214844, + "learning_rate": 1.4947600661886377e-05, + "loss": 0.1605, + "num_input_tokens_seen": 664896, + "step": 1085 + }, + { + "epoch": 0.3006067291781577, + "grad_norm": 0.19832760095596313, + "learning_rate": 1.5016547159404304e-05, + "loss": 0.0205, + "num_input_tokens_seen": 668064, + "step": 1090 + }, + { + "epoch": 0.30198565912851627, + "grad_norm": 6.482064723968506, + "learning_rate": 1.5085493656922229e-05, + "loss": 0.0436, + "num_input_tokens_seen": 671552, + "step": 1095 + }, + { + "epoch": 0.3033645890788748, + "grad_norm": 3.919965982437134, + "learning_rate": 1.5154440154440155e-05, + "loss": 0.0727, + "num_input_tokens_seen": 674016, + "step": 1100 + }, + { + "epoch": 0.3047435190292333, + "grad_norm": 11.413976669311523, + "learning_rate": 1.5223386651958082e-05, + "loss": 0.0926, + "num_input_tokens_seen": 676576, + "step": 1105 + }, + { + "epoch": 0.30612244897959184, + "grad_norm": 2.274474859237671, + "learning_rate": 1.5292333149476005e-05, + "loss": 0.1329, + "num_input_tokens_seen": 679168, + "step": 1110 + }, + { + "epoch": 0.30750137892995033, + "grad_norm": 3.50107741355896, + "learning_rate": 1.5361279646993934e-05, + "loss": 0.3203, + "num_input_tokens_seen": 682816, + "step": 1115 + }, + { + "epoch": 0.3088803088803089, + "grad_norm": 9.460408210754395, + "learning_rate": 1.543022614451186e-05, + "loss": 0.111, + "num_input_tokens_seen": 686016, + "step": 1120 + }, + { + "epoch": 0.3102592388306674, + "grad_norm": 1.2672441005706787, + "learning_rate": 1.5499172642029787e-05, + "loss": 0.049, + "num_input_tokens_seen": 688480, + "step": 1125 + }, + { + "epoch": 0.3116381687810259, + "grad_norm": 0.7149562835693359, + "learning_rate": 1.5568119139547712e-05, + "loss": 0.0164, + "num_input_tokens_seen": 691488, + "step": 1130 + }, + { + "epoch": 0.31301709873138445, + "grad_norm": 0.4139590859413147, + "learning_rate": 1.5637065637065637e-05, + "loss": 0.0789, + "num_input_tokens_seen": 693984, + "step": 1135 + }, + { + "epoch": 0.314396028681743, + "grad_norm": 9.171241760253906, + "learning_rate": 1.5706012134583565e-05, + "loss": 0.212, + "num_input_tokens_seen": 696960, + "step": 1140 + }, + { + "epoch": 0.3157749586321015, + "grad_norm": 12.700246810913086, + "learning_rate": 1.577495863210149e-05, + "loss": 0.0622, + "num_input_tokens_seen": 699776, + "step": 1145 + }, + { + "epoch": 0.31715388858246, + "grad_norm": 10.507349967956543, + "learning_rate": 1.5843905129619415e-05, + "loss": 0.1457, + "num_input_tokens_seen": 702368, + "step": 1150 + }, + { + "epoch": 0.3185328185328185, + "grad_norm": 4.064347267150879, + "learning_rate": 1.5912851627137344e-05, + "loss": 0.1343, + "num_input_tokens_seen": 705120, + "step": 1155 + }, + { + "epoch": 0.31991174848317705, + "grad_norm": 9.012533187866211, + "learning_rate": 1.598179812465527e-05, + "loss": 0.1003, + "num_input_tokens_seen": 708032, + "step": 1160 + }, + { + "epoch": 0.3212906784335356, + "grad_norm": 0.02362436056137085, + "learning_rate": 1.6050744622173193e-05, + "loss": 0.034, + "num_input_tokens_seen": 710976, + "step": 1165 + }, + { + "epoch": 0.3226696083838941, + "grad_norm": 0.004872206132858992, + "learning_rate": 1.6119691119691122e-05, + "loss": 0.086, + "num_input_tokens_seen": 714240, + "step": 1170 + }, + { + "epoch": 0.3240485383342526, + "grad_norm": 7.10363245010376, + "learning_rate": 1.6188637617209047e-05, + "loss": 0.17, + "num_input_tokens_seen": 716736, + "step": 1175 + }, + { + "epoch": 0.32542746828461117, + "grad_norm": 6.804652214050293, + "learning_rate": 1.6257584114726972e-05, + "loss": 0.14, + "num_input_tokens_seen": 719904, + "step": 1180 + }, + { + "epoch": 0.32680639823496965, + "grad_norm": 1.2868844270706177, + "learning_rate": 1.6326530612244897e-05, + "loss": 0.0547, + "num_input_tokens_seen": 723584, + "step": 1185 + }, + { + "epoch": 0.3281853281853282, + "grad_norm": 0.2484966516494751, + "learning_rate": 1.6395477109762825e-05, + "loss": 0.0943, + "num_input_tokens_seen": 726528, + "step": 1190 + }, + { + "epoch": 0.3295642581356867, + "grad_norm": 5.457822322845459, + "learning_rate": 1.6464423607280753e-05, + "loss": 0.1174, + "num_input_tokens_seen": 728864, + "step": 1195 + }, + { + "epoch": 0.3309431880860452, + "grad_norm": 0.28373268246650696, + "learning_rate": 1.6533370104798675e-05, + "loss": 0.169, + "num_input_tokens_seen": 731232, + "step": 1200 + }, + { + "epoch": 0.33232211803640377, + "grad_norm": 0.5920527577400208, + "learning_rate": 1.6602316602316603e-05, + "loss": 0.089, + "num_input_tokens_seen": 734816, + "step": 1205 + }, + { + "epoch": 0.33370104798676226, + "grad_norm": 4.506520748138428, + "learning_rate": 1.667126309983453e-05, + "loss": 0.2132, + "num_input_tokens_seen": 737920, + "step": 1210 + }, + { + "epoch": 0.3350799779371208, + "grad_norm": 4.848778247833252, + "learning_rate": 1.6740209597352453e-05, + "loss": 0.0774, + "num_input_tokens_seen": 740224, + "step": 1215 + }, + { + "epoch": 0.33645890788747934, + "grad_norm": 5.427527904510498, + "learning_rate": 1.680915609487038e-05, + "loss": 0.1372, + "num_input_tokens_seen": 742752, + "step": 1220 + }, + { + "epoch": 0.33783783783783783, + "grad_norm": 4.0141801834106445, + "learning_rate": 1.6878102592388306e-05, + "loss": 0.2211, + "num_input_tokens_seen": 745472, + "step": 1225 + }, + { + "epoch": 0.33921676778819637, + "grad_norm": 0.5669493079185486, + "learning_rate": 1.6947049089906235e-05, + "loss": 0.1056, + "num_input_tokens_seen": 748640, + "step": 1230 + }, + { + "epoch": 0.34059569773855486, + "grad_norm": 4.648062229156494, + "learning_rate": 1.701599558742416e-05, + "loss": 0.0477, + "num_input_tokens_seen": 751040, + "step": 1235 + }, + { + "epoch": 0.3419746276889134, + "grad_norm": 6.684587001800537, + "learning_rate": 1.7084942084942085e-05, + "loss": 0.2482, + "num_input_tokens_seen": 754784, + "step": 1240 + }, + { + "epoch": 0.34335355763927194, + "grad_norm": 1.361372947692871, + "learning_rate": 1.7153888582460013e-05, + "loss": 0.048, + "num_input_tokens_seen": 757408, + "step": 1245 + }, + { + "epoch": 0.34473248758963043, + "grad_norm": 5.020578861236572, + "learning_rate": 1.7222835079977938e-05, + "loss": 0.0652, + "num_input_tokens_seen": 761152, + "step": 1250 + }, + { + "epoch": 0.346111417539989, + "grad_norm": 3.0704870223999023, + "learning_rate": 1.7291781577495863e-05, + "loss": 0.0411, + "num_input_tokens_seen": 764288, + "step": 1255 + }, + { + "epoch": 0.3474903474903475, + "grad_norm": 0.004998740740120411, + "learning_rate": 1.736072807501379e-05, + "loss": 0.1879, + "num_input_tokens_seen": 767104, + "step": 1260 + }, + { + "epoch": 0.348869277440706, + "grad_norm": 4.000087261199951, + "learning_rate": 1.7429674572531716e-05, + "loss": 0.0523, + "num_input_tokens_seen": 770752, + "step": 1265 + }, + { + "epoch": 0.35024820739106455, + "grad_norm": 5.549338340759277, + "learning_rate": 1.749862107004964e-05, + "loss": 0.099, + "num_input_tokens_seen": 775072, + "step": 1270 + }, + { + "epoch": 0.35162713734142304, + "grad_norm": 0.5776360034942627, + "learning_rate": 1.756756756756757e-05, + "loss": 0.0634, + "num_input_tokens_seen": 782592, + "step": 1275 + }, + { + "epoch": 0.3530060672917816, + "grad_norm": 2.2044599056243896, + "learning_rate": 1.7636514065085495e-05, + "loss": 0.0453, + "num_input_tokens_seen": 785216, + "step": 1280 + }, + { + "epoch": 0.3543849972421401, + "grad_norm": 0.11142788827419281, + "learning_rate": 1.770546056260342e-05, + "loss": 0.0389, + "num_input_tokens_seen": 788512, + "step": 1285 + }, + { + "epoch": 0.3557639271924986, + "grad_norm": 9.274991035461426, + "learning_rate": 1.7774407060121348e-05, + "loss": 0.2205, + "num_input_tokens_seen": 791712, + "step": 1290 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 0.6597835421562195, + "learning_rate": 1.7843353557639273e-05, + "loss": 0.0313, + "num_input_tokens_seen": 794048, + "step": 1295 + }, + { + "epoch": 0.35852178709321564, + "grad_norm": 0.07601190358400345, + "learning_rate": 1.79123000551572e-05, + "loss": 0.0477, + "num_input_tokens_seen": 796992, + "step": 1300 + }, + { + "epoch": 0.3599007170435742, + "grad_norm": 6.407130241394043, + "learning_rate": 1.7981246552675123e-05, + "loss": 0.3161, + "num_input_tokens_seen": 800672, + "step": 1305 + }, + { + "epoch": 0.3612796469939327, + "grad_norm": 10.700091361999512, + "learning_rate": 1.805019305019305e-05, + "loss": 0.1256, + "num_input_tokens_seen": 804032, + "step": 1310 + }, + { + "epoch": 0.3626585769442912, + "grad_norm": 3.261807441711426, + "learning_rate": 1.811913954771098e-05, + "loss": 0.1027, + "num_input_tokens_seen": 807552, + "step": 1315 + }, + { + "epoch": 0.36403750689464975, + "grad_norm": 10.20089340209961, + "learning_rate": 1.81880860452289e-05, + "loss": 0.0386, + "num_input_tokens_seen": 810976, + "step": 1320 + }, + { + "epoch": 0.3654164368450083, + "grad_norm": 7.529642105102539, + "learning_rate": 1.825703254274683e-05, + "loss": 0.2094, + "num_input_tokens_seen": 813248, + "step": 1325 + }, + { + "epoch": 0.3667953667953668, + "grad_norm": 4.451100826263428, + "learning_rate": 1.8325979040264754e-05, + "loss": 0.0563, + "num_input_tokens_seen": 818144, + "step": 1330 + }, + { + "epoch": 0.3681742967457253, + "grad_norm": 6.179165363311768, + "learning_rate": 1.8394925537782683e-05, + "loss": 0.0854, + "num_input_tokens_seen": 821280, + "step": 1335 + }, + { + "epoch": 0.3695532266960838, + "grad_norm": 0.2466423511505127, + "learning_rate": 1.8463872035300608e-05, + "loss": 0.1671, + "num_input_tokens_seen": 825312, + "step": 1340 + }, + { + "epoch": 0.37093215664644236, + "grad_norm": 0.24603629112243652, + "learning_rate": 1.8532818532818533e-05, + "loss": 0.0915, + "num_input_tokens_seen": 829856, + "step": 1345 + }, + { + "epoch": 0.3723110865968009, + "grad_norm": 5.0409440994262695, + "learning_rate": 1.860176503033646e-05, + "loss": 0.1025, + "num_input_tokens_seen": 832736, + "step": 1350 + }, + { + "epoch": 0.3736900165471594, + "grad_norm": 0.5382325649261475, + "learning_rate": 1.8670711527854386e-05, + "loss": 0.127, + "num_input_tokens_seen": 835904, + "step": 1355 + }, + { + "epoch": 0.37506894649751793, + "grad_norm": 4.611674785614014, + "learning_rate": 1.873965802537231e-05, + "loss": 0.068, + "num_input_tokens_seen": 840512, + "step": 1360 + }, + { + "epoch": 0.3764478764478765, + "grad_norm": 8.655329704284668, + "learning_rate": 1.880860452289024e-05, + "loss": 0.1928, + "num_input_tokens_seen": 843456, + "step": 1365 + }, + { + "epoch": 0.37782680639823496, + "grad_norm": 1.2465722560882568, + "learning_rate": 1.8877551020408164e-05, + "loss": 0.0686, + "num_input_tokens_seen": 846464, + "step": 1370 + }, + { + "epoch": 0.3792057363485935, + "grad_norm": 4.537691593170166, + "learning_rate": 1.894649751792609e-05, + "loss": 0.1055, + "num_input_tokens_seen": 850592, + "step": 1375 + }, + { + "epoch": 0.380584666298952, + "grad_norm": 8.882405281066895, + "learning_rate": 1.9015444015444017e-05, + "loss": 0.1617, + "num_input_tokens_seen": 852768, + "step": 1380 + }, + { + "epoch": 0.38196359624931053, + "grad_norm": 0.06805791705846786, + "learning_rate": 1.9084390512961942e-05, + "loss": 0.1458, + "num_input_tokens_seen": 856224, + "step": 1385 + }, + { + "epoch": 0.3833425261996691, + "grad_norm": 1.0910981893539429, + "learning_rate": 1.9153337010479867e-05, + "loss": 0.1465, + "num_input_tokens_seen": 858688, + "step": 1390 + }, + { + "epoch": 0.38472145615002756, + "grad_norm": 0.6252194046974182, + "learning_rate": 1.9222283507997796e-05, + "loss": 0.0437, + "num_input_tokens_seen": 861440, + "step": 1395 + }, + { + "epoch": 0.3861003861003861, + "grad_norm": 9.729853630065918, + "learning_rate": 1.929123000551572e-05, + "loss": 0.0893, + "num_input_tokens_seen": 863936, + "step": 1400 + }, + { + "epoch": 0.38747931605074465, + "grad_norm": 7.807980060577393, + "learning_rate": 1.936017650303365e-05, + "loss": 0.2189, + "num_input_tokens_seen": 868032, + "step": 1405 + }, + { + "epoch": 0.38885824600110314, + "grad_norm": 0.2961891293525696, + "learning_rate": 1.942912300055157e-05, + "loss": 0.1524, + "num_input_tokens_seen": 872000, + "step": 1410 + }, + { + "epoch": 0.3902371759514617, + "grad_norm": 0.2579430341720581, + "learning_rate": 1.94980694980695e-05, + "loss": 0.1867, + "num_input_tokens_seen": 875616, + "step": 1415 + }, + { + "epoch": 0.39161610590182017, + "grad_norm": 0.07147658616304398, + "learning_rate": 1.9567015995587427e-05, + "loss": 0.0809, + "num_input_tokens_seen": 879072, + "step": 1420 + }, + { + "epoch": 0.3929950358521787, + "grad_norm": 3.9888293743133545, + "learning_rate": 1.963596249310535e-05, + "loss": 0.3356, + "num_input_tokens_seen": 882368, + "step": 1425 + }, + { + "epoch": 0.39437396580253725, + "grad_norm": 0.5137974619865417, + "learning_rate": 1.9704908990623277e-05, + "loss": 0.0261, + "num_input_tokens_seen": 885536, + "step": 1430 + }, + { + "epoch": 0.39575289575289574, + "grad_norm": 3.77169132232666, + "learning_rate": 1.9773855488141205e-05, + "loss": 0.0582, + "num_input_tokens_seen": 890272, + "step": 1435 + }, + { + "epoch": 0.3971318257032543, + "grad_norm": 0.3292185366153717, + "learning_rate": 1.984280198565913e-05, + "loss": 0.0781, + "num_input_tokens_seen": 893376, + "step": 1440 + }, + { + "epoch": 0.39851075565361277, + "grad_norm": 2.3534421920776367, + "learning_rate": 1.9911748483177055e-05, + "loss": 0.0845, + "num_input_tokens_seen": 898528, + "step": 1445 + }, + { + "epoch": 0.3998896856039713, + "grad_norm": 5.648595333099365, + "learning_rate": 1.998069498069498e-05, + "loss": 0.2178, + "num_input_tokens_seen": 902272, + "step": 1450 + }, + { + "epoch": 0.40126861555432985, + "grad_norm": 5.668462753295898, + "learning_rate": 2.004964147821291e-05, + "loss": 0.0957, + "num_input_tokens_seen": 906112, + "step": 1455 + }, + { + "epoch": 0.40264754550468834, + "grad_norm": 3.3602707386016846, + "learning_rate": 2.0118587975730834e-05, + "loss": 0.0838, + "num_input_tokens_seen": 908384, + "step": 1460 + }, + { + "epoch": 0.4040264754550469, + "grad_norm": 0.23721858859062195, + "learning_rate": 2.018753447324876e-05, + "loss": 0.1298, + "num_input_tokens_seen": 911936, + "step": 1465 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 0.8730001449584961, + "learning_rate": 2.0256480970766687e-05, + "loss": 0.1378, + "num_input_tokens_seen": 914912, + "step": 1470 + }, + { + "epoch": 0.4067843353557639, + "grad_norm": 0.9133241176605225, + "learning_rate": 2.0325427468284612e-05, + "loss": 0.0387, + "num_input_tokens_seen": 918112, + "step": 1475 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 4.1819167137146, + "learning_rate": 2.0394373965802537e-05, + "loss": 0.1245, + "num_input_tokens_seen": 921440, + "step": 1480 + }, + { + "epoch": 0.40954219525648095, + "grad_norm": 0.1566476821899414, + "learning_rate": 2.0463320463320465e-05, + "loss": 0.0727, + "num_input_tokens_seen": 923968, + "step": 1485 + }, + { + "epoch": 0.4109211252068395, + "grad_norm": 5.6157917976379395, + "learning_rate": 2.053226696083839e-05, + "loss": 0.1256, + "num_input_tokens_seen": 926880, + "step": 1490 + }, + { + "epoch": 0.41230005515719803, + "grad_norm": 0.39236417412757874, + "learning_rate": 2.0601213458356315e-05, + "loss": 0.074, + "num_input_tokens_seen": 929408, + "step": 1495 + }, + { + "epoch": 0.4136789851075565, + "grad_norm": 6.661300182342529, + "learning_rate": 2.0670159955874243e-05, + "loss": 0.1072, + "num_input_tokens_seen": 932160, + "step": 1500 + }, + { + "epoch": 0.41505791505791506, + "grad_norm": 3.556769371032715, + "learning_rate": 2.073910645339217e-05, + "loss": 0.1157, + "num_input_tokens_seen": 934816, + "step": 1505 + }, + { + "epoch": 0.4164368450082736, + "grad_norm": 0.06843265891075134, + "learning_rate": 2.0808052950910097e-05, + "loss": 0.0287, + "num_input_tokens_seen": 938368, + "step": 1510 + }, + { + "epoch": 0.4178157749586321, + "grad_norm": 5.355690002441406, + "learning_rate": 2.087699944842802e-05, + "loss": 0.0598, + "num_input_tokens_seen": 940704, + "step": 1515 + }, + { + "epoch": 0.41919470490899063, + "grad_norm": 8.515641212463379, + "learning_rate": 2.0945945945945947e-05, + "loss": 0.2306, + "num_input_tokens_seen": 943264, + "step": 1520 + }, + { + "epoch": 0.4205736348593491, + "grad_norm": 0.02086109109222889, + "learning_rate": 2.1014892443463875e-05, + "loss": 0.0371, + "num_input_tokens_seen": 946144, + "step": 1525 + }, + { + "epoch": 0.42195256480970766, + "grad_norm": 6.699766635894775, + "learning_rate": 2.1083838940981797e-05, + "loss": 0.2095, + "num_input_tokens_seen": 950016, + "step": 1530 + }, + { + "epoch": 0.4233314947600662, + "grad_norm": 5.968180179595947, + "learning_rate": 2.1152785438499725e-05, + "loss": 0.1734, + "num_input_tokens_seen": 953728, + "step": 1535 + }, + { + "epoch": 0.4247104247104247, + "grad_norm": 5.086653232574463, + "learning_rate": 2.1221731936017653e-05, + "loss": 0.1515, + "num_input_tokens_seen": 956288, + "step": 1540 + }, + { + "epoch": 0.42608935466078324, + "grad_norm": 3.1363677978515625, + "learning_rate": 2.1290678433535578e-05, + "loss": 0.0931, + "num_input_tokens_seen": 958752, + "step": 1545 + }, + { + "epoch": 0.4274682846111418, + "grad_norm": 0.44003549218177795, + "learning_rate": 2.1359624931053503e-05, + "loss": 0.1013, + "num_input_tokens_seen": 960960, + "step": 1550 + }, + { + "epoch": 0.42884721456150027, + "grad_norm": 2.975982904434204, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.1027, + "num_input_tokens_seen": 964480, + "step": 1555 + }, + { + "epoch": 0.4302261445118588, + "grad_norm": 1.5900849103927612, + "learning_rate": 2.1497517926089356e-05, + "loss": 0.0457, + "num_input_tokens_seen": 968320, + "step": 1560 + }, + { + "epoch": 0.4316050744622173, + "grad_norm": 0.13776454329490662, + "learning_rate": 2.156646442360728e-05, + "loss": 0.0381, + "num_input_tokens_seen": 971808, + "step": 1565 + }, + { + "epoch": 0.43298400441257584, + "grad_norm": 6.98574161529541, + "learning_rate": 2.1635410921125206e-05, + "loss": 0.1548, + "num_input_tokens_seen": 974944, + "step": 1570 + }, + { + "epoch": 0.4343629343629344, + "grad_norm": 3.0480973720550537, + "learning_rate": 2.1704357418643135e-05, + "loss": 0.1647, + "num_input_tokens_seen": 979040, + "step": 1575 + }, + { + "epoch": 0.43574186431329287, + "grad_norm": 1.829881191253662, + "learning_rate": 2.177330391616106e-05, + "loss": 0.1813, + "num_input_tokens_seen": 981536, + "step": 1580 + }, + { + "epoch": 0.4371207942636514, + "grad_norm": 3.2942609786987305, + "learning_rate": 2.1842250413678985e-05, + "loss": 0.0513, + "num_input_tokens_seen": 984448, + "step": 1585 + }, + { + "epoch": 0.43849972421400996, + "grad_norm": 0.07823936641216278, + "learning_rate": 2.1911196911196913e-05, + "loss": 0.0778, + "num_input_tokens_seen": 987264, + "step": 1590 + }, + { + "epoch": 0.43987865416436844, + "grad_norm": 6.140478610992432, + "learning_rate": 2.1980143408714838e-05, + "loss": 0.1787, + "num_input_tokens_seen": 990752, + "step": 1595 + }, + { + "epoch": 0.441257584114727, + "grad_norm": 3.3624203205108643, + "learning_rate": 2.2049089906232763e-05, + "loss": 0.1242, + "num_input_tokens_seen": 993184, + "step": 1600 + }, + { + "epoch": 0.4426365140650855, + "grad_norm": 1.1051784753799438, + "learning_rate": 2.211803640375069e-05, + "loss": 0.0137, + "num_input_tokens_seen": 996640, + "step": 1605 + }, + { + "epoch": 0.444015444015444, + "grad_norm": 5.962798118591309, + "learning_rate": 2.2186982901268616e-05, + "loss": 0.0736, + "num_input_tokens_seen": 998624, + "step": 1610 + }, + { + "epoch": 0.44539437396580256, + "grad_norm": 2.3156449794769287, + "learning_rate": 2.2255929398786545e-05, + "loss": 0.1704, + "num_input_tokens_seen": 1002176, + "step": 1615 + }, + { + "epoch": 0.44677330391616105, + "grad_norm": 0.25911134481430054, + "learning_rate": 2.232487589630447e-05, + "loss": 0.249, + "num_input_tokens_seen": 1005024, + "step": 1620 + }, + { + "epoch": 0.4481522338665196, + "grad_norm": 5.002715110778809, + "learning_rate": 2.2393822393822394e-05, + "loss": 0.1029, + "num_input_tokens_seen": 1007904, + "step": 1625 + }, + { + "epoch": 0.4495311638168781, + "grad_norm": 2.588273525238037, + "learning_rate": 2.2462768891340323e-05, + "loss": 0.1166, + "num_input_tokens_seen": 1010336, + "step": 1630 + }, + { + "epoch": 0.4509100937672366, + "grad_norm": 0.298261433839798, + "learning_rate": 2.2531715388858244e-05, + "loss": 0.0395, + "num_input_tokens_seen": 1013312, + "step": 1635 + }, + { + "epoch": 0.45228902371759516, + "grad_norm": 1.7707610130310059, + "learning_rate": 2.2600661886376173e-05, + "loss": 0.1431, + "num_input_tokens_seen": 1016640, + "step": 1640 + }, + { + "epoch": 0.45366795366795365, + "grad_norm": 8.74593734741211, + "learning_rate": 2.26696083838941e-05, + "loss": 0.0781, + "num_input_tokens_seen": 1019136, + "step": 1645 + }, + { + "epoch": 0.4550468836183122, + "grad_norm": 6.445958614349365, + "learning_rate": 2.2738554881412026e-05, + "loss": 0.1515, + "num_input_tokens_seen": 1021504, + "step": 1650 + }, + { + "epoch": 0.45642581356867074, + "grad_norm": 0.22240667045116425, + "learning_rate": 2.280750137892995e-05, + "loss": 0.006, + "num_input_tokens_seen": 1024544, + "step": 1655 + }, + { + "epoch": 0.4578047435190292, + "grad_norm": 12.677586555480957, + "learning_rate": 2.2876447876447876e-05, + "loss": 0.1076, + "num_input_tokens_seen": 1027360, + "step": 1660 + }, + { + "epoch": 0.45918367346938777, + "grad_norm": 0.16342851519584656, + "learning_rate": 2.2945394373965804e-05, + "loss": 0.1448, + "num_input_tokens_seen": 1030656, + "step": 1665 + }, + { + "epoch": 0.46056260341974625, + "grad_norm": 8.210318565368652, + "learning_rate": 2.301434087148373e-05, + "loss": 0.2095, + "num_input_tokens_seen": 1033024, + "step": 1670 + }, + { + "epoch": 0.4619415333701048, + "grad_norm": 0.21876046061515808, + "learning_rate": 2.3083287369001654e-05, + "loss": 0.2896, + "num_input_tokens_seen": 1035424, + "step": 1675 + }, + { + "epoch": 0.46332046332046334, + "grad_norm": 0.20924821496009827, + "learning_rate": 2.3152233866519583e-05, + "loss": 0.0414, + "num_input_tokens_seen": 1038112, + "step": 1680 + }, + { + "epoch": 0.4646993932708218, + "grad_norm": 2.8531711101531982, + "learning_rate": 2.3221180364037508e-05, + "loss": 0.1063, + "num_input_tokens_seen": 1041856, + "step": 1685 + }, + { + "epoch": 0.46607832322118037, + "grad_norm": 0.5296519994735718, + "learning_rate": 2.3290126861555432e-05, + "loss": 0.0757, + "num_input_tokens_seen": 1044128, + "step": 1690 + }, + { + "epoch": 0.4674572531715389, + "grad_norm": 1.736057162284851, + "learning_rate": 2.335907335907336e-05, + "loss": 0.0896, + "num_input_tokens_seen": 1047264, + "step": 1695 + }, + { + "epoch": 0.4688361831218974, + "grad_norm": 1.1601896286010742, + "learning_rate": 2.3428019856591286e-05, + "loss": 0.071, + "num_input_tokens_seen": 1051072, + "step": 1700 + }, + { + "epoch": 0.47021511307225594, + "grad_norm": 0.05990123376250267, + "learning_rate": 2.349696635410921e-05, + "loss": 0.0073, + "num_input_tokens_seen": 1053920, + "step": 1705 + }, + { + "epoch": 0.47159404302261443, + "grad_norm": 0.27321502566337585, + "learning_rate": 2.356591285162714e-05, + "loss": 0.2374, + "num_input_tokens_seen": 1057056, + "step": 1710 + }, + { + "epoch": 0.47297297297297297, + "grad_norm": 4.743587017059326, + "learning_rate": 2.3634859349145064e-05, + "loss": 0.1277, + "num_input_tokens_seen": 1060128, + "step": 1715 + }, + { + "epoch": 0.4743519029233315, + "grad_norm": 0.03943540155887604, + "learning_rate": 2.3703805846662992e-05, + "loss": 0.083, + "num_input_tokens_seen": 1062528, + "step": 1720 + }, + { + "epoch": 0.47573083287369, + "grad_norm": 0.46500203013420105, + "learning_rate": 2.3772752344180917e-05, + "loss": 0.07, + "num_input_tokens_seen": 1065344, + "step": 1725 + }, + { + "epoch": 0.47710976282404854, + "grad_norm": 0.7369627952575684, + "learning_rate": 2.3841698841698842e-05, + "loss": 0.0535, + "num_input_tokens_seen": 1068352, + "step": 1730 + }, + { + "epoch": 0.4784886927744071, + "grad_norm": 2.1421046257019043, + "learning_rate": 2.391064533921677e-05, + "loss": 0.012, + "num_input_tokens_seen": 1071872, + "step": 1735 + }, + { + "epoch": 0.4798676227247656, + "grad_norm": 0.4438492953777313, + "learning_rate": 2.3979591836734696e-05, + "loss": 0.1598, + "num_input_tokens_seen": 1074848, + "step": 1740 + }, + { + "epoch": 0.4812465526751241, + "grad_norm": 16.753074645996094, + "learning_rate": 2.404853833425262e-05, + "loss": 0.1561, + "num_input_tokens_seen": 1077600, + "step": 1745 + }, + { + "epoch": 0.4826254826254826, + "grad_norm": 9.701845169067383, + "learning_rate": 2.411748483177055e-05, + "loss": 0.1743, + "num_input_tokens_seen": 1080768, + "step": 1750 + }, + { + "epoch": 0.48400441257584115, + "grad_norm": 0.11857236176729202, + "learning_rate": 2.4186431329288474e-05, + "loss": 0.1345, + "num_input_tokens_seen": 1083680, + "step": 1755 + }, + { + "epoch": 0.4853833425261997, + "grad_norm": 5.901041030883789, + "learning_rate": 2.42553778268064e-05, + "loss": 0.1239, + "num_input_tokens_seen": 1086240, + "step": 1760 + }, + { + "epoch": 0.4867622724765582, + "grad_norm": 5.464901447296143, + "learning_rate": 2.4324324324324327e-05, + "loss": 0.192, + "num_input_tokens_seen": 1088864, + "step": 1765 + }, + { + "epoch": 0.4881412024269167, + "grad_norm": 7.466026306152344, + "learning_rate": 2.4393270821842252e-05, + "loss": 0.1495, + "num_input_tokens_seen": 1091968, + "step": 1770 + }, + { + "epoch": 0.4895201323772752, + "grad_norm": 2.5550525188446045, + "learning_rate": 2.4462217319360177e-05, + "loss": 0.136, + "num_input_tokens_seen": 1095040, + "step": 1775 + }, + { + "epoch": 0.49089906232763375, + "grad_norm": 5.988354682922363, + "learning_rate": 2.4531163816878102e-05, + "loss": 0.1546, + "num_input_tokens_seen": 1098240, + "step": 1780 + }, + { + "epoch": 0.4922779922779923, + "grad_norm": 2.5407800674438477, + "learning_rate": 2.460011031439603e-05, + "loss": 0.2136, + "num_input_tokens_seen": 1100768, + "step": 1785 + }, + { + "epoch": 0.4936569222283508, + "grad_norm": 1.5038220882415771, + "learning_rate": 2.466905681191396e-05, + "loss": 0.0961, + "num_input_tokens_seen": 1103264, + "step": 1790 + }, + { + "epoch": 0.4950358521787093, + "grad_norm": 0.4832482933998108, + "learning_rate": 2.473800330943188e-05, + "loss": 0.054, + "num_input_tokens_seen": 1107008, + "step": 1795 + }, + { + "epoch": 0.49641478212906787, + "grad_norm": 6.219969749450684, + "learning_rate": 2.480694980694981e-05, + "loss": 0.1891, + "num_input_tokens_seen": 1109696, + "step": 1800 + }, + { + "epoch": 0.49779371207942635, + "grad_norm": 2.1243174076080322, + "learning_rate": 2.4875896304467734e-05, + "loss": 0.0502, + "num_input_tokens_seen": 1113120, + "step": 1805 + }, + { + "epoch": 0.4991726420297849, + "grad_norm": 8.754079818725586, + "learning_rate": 2.494484280198566e-05, + "loss": 0.1198, + "num_input_tokens_seen": 1116768, + "step": 1810 + }, + { + "epoch": 0.5, + "eval_loss": 0.10147319734096527, + "eval_runtime": 28.5327, + "eval_samples_per_second": 56.496, + "eval_steps_per_second": 14.124, + "num_input_tokens_seen": 1118368, + "step": 1813 + }, + { + "epoch": 0.5005515719801434, + "grad_norm": 1.660986065864563, + "learning_rate": 2.5013789299503587e-05, + "loss": 0.009, + "num_input_tokens_seen": 1119808, + "step": 1815 + }, + { + "epoch": 0.5019305019305019, + "grad_norm": 1.09003484249115, + "learning_rate": 2.5082735797021512e-05, + "loss": 0.1332, + "num_input_tokens_seen": 1123232, + "step": 1820 + }, + { + "epoch": 0.5033094318808604, + "grad_norm": 6.190017223358154, + "learning_rate": 2.5151682294539437e-05, + "loss": 0.0622, + "num_input_tokens_seen": 1126560, + "step": 1825 + }, + { + "epoch": 0.504688361831219, + "grad_norm": 0.1467948704957962, + "learning_rate": 2.5220628792057365e-05, + "loss": 0.0559, + "num_input_tokens_seen": 1129280, + "step": 1830 + }, + { + "epoch": 0.5060672917815775, + "grad_norm": 2.374411106109619, + "learning_rate": 2.528957528957529e-05, + "loss": 0.0881, + "num_input_tokens_seen": 1132000, + "step": 1835 + }, + { + "epoch": 0.507446221731936, + "grad_norm": 0.49070167541503906, + "learning_rate": 2.5358521787093215e-05, + "loss": 0.1972, + "num_input_tokens_seen": 1134400, + "step": 1840 + }, + { + "epoch": 0.5088251516822946, + "grad_norm": 8.327189445495605, + "learning_rate": 2.5427468284611143e-05, + "loss": 0.3009, + "num_input_tokens_seen": 1136928, + "step": 1845 + }, + { + "epoch": 0.5102040816326531, + "grad_norm": 0.15950776636600494, + "learning_rate": 2.549641478212907e-05, + "loss": 0.0272, + "num_input_tokens_seen": 1139520, + "step": 1850 + }, + { + "epoch": 0.5115830115830116, + "grad_norm": 2.5456883907318115, + "learning_rate": 2.5565361279646993e-05, + "loss": 0.1075, + "num_input_tokens_seen": 1142464, + "step": 1855 + }, + { + "epoch": 0.5129619415333702, + "grad_norm": 1.0204153060913086, + "learning_rate": 2.563430777716492e-05, + "loss": 0.0984, + "num_input_tokens_seen": 1145184, + "step": 1860 + }, + { + "epoch": 0.5143408714837286, + "grad_norm": 6.59412956237793, + "learning_rate": 2.5703254274682847e-05, + "loss": 0.1135, + "num_input_tokens_seen": 1147776, + "step": 1865 + }, + { + "epoch": 0.5157198014340871, + "grad_norm": 12.108519554138184, + "learning_rate": 2.577220077220077e-05, + "loss": 0.1322, + "num_input_tokens_seen": 1151232, + "step": 1870 + }, + { + "epoch": 0.5170987313844456, + "grad_norm": 0.2809518873691559, + "learning_rate": 2.5841147269718703e-05, + "loss": 0.212, + "num_input_tokens_seen": 1155104, + "step": 1875 + }, + { + "epoch": 0.5184776613348042, + "grad_norm": 12.977884292602539, + "learning_rate": 2.5910093767236625e-05, + "loss": 0.143, + "num_input_tokens_seen": 1158080, + "step": 1880 + }, + { + "epoch": 0.5198565912851627, + "grad_norm": 0.5341120362281799, + "learning_rate": 2.597904026475455e-05, + "loss": 0.1071, + "num_input_tokens_seen": 1160416, + "step": 1885 + }, + { + "epoch": 0.5212355212355212, + "grad_norm": 5.434915065765381, + "learning_rate": 2.604798676227248e-05, + "loss": 0.2723, + "num_input_tokens_seen": 1163072, + "step": 1890 + }, + { + "epoch": 0.5226144511858798, + "grad_norm": 8.6831693649292, + "learning_rate": 2.6116933259790406e-05, + "loss": 0.1552, + "num_input_tokens_seen": 1165824, + "step": 1895 + }, + { + "epoch": 0.5239933811362383, + "grad_norm": 9.973540306091309, + "learning_rate": 2.6185879757308328e-05, + "loss": 0.1455, + "num_input_tokens_seen": 1168256, + "step": 1900 + }, + { + "epoch": 0.5253723110865968, + "grad_norm": 5.9253411293029785, + "learning_rate": 2.6254826254826253e-05, + "loss": 0.1321, + "num_input_tokens_seen": 1171232, + "step": 1905 + }, + { + "epoch": 0.5267512410369554, + "grad_norm": 4.208742618560791, + "learning_rate": 2.6323772752344185e-05, + "loss": 0.1372, + "num_input_tokens_seen": 1175264, + "step": 1910 + }, + { + "epoch": 0.5281301709873139, + "grad_norm": 1.2168724536895752, + "learning_rate": 2.6392719249862106e-05, + "loss": 0.0739, + "num_input_tokens_seen": 1177344, + "step": 1915 + }, + { + "epoch": 0.5295091009376723, + "grad_norm": 0.9894731044769287, + "learning_rate": 2.646166574738003e-05, + "loss": 0.1298, + "num_input_tokens_seen": 1179840, + "step": 1920 + }, + { + "epoch": 0.5308880308880309, + "grad_norm": 1.3445439338684082, + "learning_rate": 2.6530612244897963e-05, + "loss": 0.1308, + "num_input_tokens_seen": 1182496, + "step": 1925 + }, + { + "epoch": 0.5322669608383894, + "grad_norm": 2.5733654499053955, + "learning_rate": 2.6599558742415888e-05, + "loss": 0.0318, + "num_input_tokens_seen": 1187168, + "step": 1930 + }, + { + "epoch": 0.5336458907887479, + "grad_norm": 0.16607564687728882, + "learning_rate": 2.666850523993381e-05, + "loss": 0.0435, + "num_input_tokens_seen": 1189952, + "step": 1935 + }, + { + "epoch": 0.5350248207391064, + "grad_norm": 3.1834781169891357, + "learning_rate": 2.673745173745174e-05, + "loss": 0.1421, + "num_input_tokens_seen": 1192736, + "step": 1940 + }, + { + "epoch": 0.536403750689465, + "grad_norm": 6.7203521728515625, + "learning_rate": 2.6806398234969666e-05, + "loss": 0.2053, + "num_input_tokens_seen": 1195840, + "step": 1945 + }, + { + "epoch": 0.5377826806398235, + "grad_norm": 0.1385383903980255, + "learning_rate": 2.687534473248759e-05, + "loss": 0.1251, + "num_input_tokens_seen": 1198528, + "step": 1950 + }, + { + "epoch": 0.539161610590182, + "grad_norm": 0.07705878466367722, + "learning_rate": 2.694429123000552e-05, + "loss": 0.0814, + "num_input_tokens_seen": 1201536, + "step": 1955 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 2.1229023933410645, + "learning_rate": 2.7013237727523444e-05, + "loss": 0.0894, + "num_input_tokens_seen": 1204864, + "step": 1960 + }, + { + "epoch": 0.5419194704908991, + "grad_norm": 2.527162790298462, + "learning_rate": 2.708218422504137e-05, + "loss": 0.1064, + "num_input_tokens_seen": 1207424, + "step": 1965 + }, + { + "epoch": 0.5432984004412575, + "grad_norm": 1.8530842065811157, + "learning_rate": 2.715113072255929e-05, + "loss": 0.0476, + "num_input_tokens_seen": 1210272, + "step": 1970 + }, + { + "epoch": 0.5446773303916161, + "grad_norm": 0.9830695390701294, + "learning_rate": 2.7220077220077223e-05, + "loss": 0.1799, + "num_input_tokens_seen": 1214560, + "step": 1975 + }, + { + "epoch": 0.5460562603419746, + "grad_norm": 1.2972301244735718, + "learning_rate": 2.7289023717595148e-05, + "loss": 0.1479, + "num_input_tokens_seen": 1217504, + "step": 1980 + }, + { + "epoch": 0.5474351902923331, + "grad_norm": 5.811416149139404, + "learning_rate": 2.7357970215113073e-05, + "loss": 0.1656, + "num_input_tokens_seen": 1220128, + "step": 1985 + }, + { + "epoch": 0.5488141202426917, + "grad_norm": 0.32834938168525696, + "learning_rate": 2.7426916712631e-05, + "loss": 0.1097, + "num_input_tokens_seen": 1223008, + "step": 1990 + }, + { + "epoch": 0.5501930501930502, + "grad_norm": 0.41213396191596985, + "learning_rate": 2.7495863210148926e-05, + "loss": 0.1829, + "num_input_tokens_seen": 1225952, + "step": 1995 + }, + { + "epoch": 0.5515719801434087, + "grad_norm": 1.0145084857940674, + "learning_rate": 2.756480970766685e-05, + "loss": 0.0664, + "num_input_tokens_seen": 1229088, + "step": 2000 + }, + { + "epoch": 0.5529509100937673, + "grad_norm": 3.183328866958618, + "learning_rate": 2.763375620518478e-05, + "loss": 0.0365, + "num_input_tokens_seen": 1231712, + "step": 2005 + }, + { + "epoch": 0.5543298400441258, + "grad_norm": 4.021912574768066, + "learning_rate": 2.7702702702702704e-05, + "loss": 0.0923, + "num_input_tokens_seen": 1235040, + "step": 2010 + }, + { + "epoch": 0.5557087699944843, + "grad_norm": 0.6835647821426392, + "learning_rate": 2.777164920022063e-05, + "loss": 0.0617, + "num_input_tokens_seen": 1237728, + "step": 2015 + }, + { + "epoch": 0.5570876999448428, + "grad_norm": 8.295140266418457, + "learning_rate": 2.7840595697738558e-05, + "loss": 0.0974, + "num_input_tokens_seen": 1242944, + "step": 2020 + }, + { + "epoch": 0.5584666298952013, + "grad_norm": 9.242447853088379, + "learning_rate": 2.7909542195256482e-05, + "loss": 0.3846, + "num_input_tokens_seen": 1245824, + "step": 2025 + }, + { + "epoch": 0.5598455598455598, + "grad_norm": 3.6018259525299072, + "learning_rate": 2.7978488692774407e-05, + "loss": 0.1735, + "num_input_tokens_seen": 1249472, + "step": 2030 + }, + { + "epoch": 0.5612244897959183, + "grad_norm": 7.400001525878906, + "learning_rate": 2.8047435190292336e-05, + "loss": 0.0513, + "num_input_tokens_seen": 1254176, + "step": 2035 + }, + { + "epoch": 0.5626034197462769, + "grad_norm": 0.2867836058139801, + "learning_rate": 2.811638168781026e-05, + "loss": 0.022, + "num_input_tokens_seen": 1258432, + "step": 2040 + }, + { + "epoch": 0.5639823496966354, + "grad_norm": 0.058686431497335434, + "learning_rate": 2.8185328185328186e-05, + "loss": 0.0249, + "num_input_tokens_seen": 1262208, + "step": 2045 + }, + { + "epoch": 0.5653612796469939, + "grad_norm": 6.485201358795166, + "learning_rate": 2.825427468284611e-05, + "loss": 0.3185, + "num_input_tokens_seen": 1265728, + "step": 2050 + }, + { + "epoch": 0.5667402095973525, + "grad_norm": 0.041742775589227676, + "learning_rate": 2.832322118036404e-05, + "loss": 0.0419, + "num_input_tokens_seen": 1268672, + "step": 2055 + }, + { + "epoch": 0.568119139547711, + "grad_norm": 8.760430335998535, + "learning_rate": 2.8392167677881964e-05, + "loss": 0.1559, + "num_input_tokens_seen": 1272256, + "step": 2060 + }, + { + "epoch": 0.5694980694980695, + "grad_norm": 9.475297927856445, + "learning_rate": 2.846111417539989e-05, + "loss": 0.2217, + "num_input_tokens_seen": 1274880, + "step": 2065 + }, + { + "epoch": 0.5708769994484281, + "grad_norm": 2.8891334533691406, + "learning_rate": 2.8530060672917817e-05, + "loss": 0.1041, + "num_input_tokens_seen": 1278592, + "step": 2070 + }, + { + "epoch": 0.5722559293987866, + "grad_norm": 1.4390019178390503, + "learning_rate": 2.8599007170435742e-05, + "loss": 0.0793, + "num_input_tokens_seen": 1281344, + "step": 2075 + }, + { + "epoch": 0.573634859349145, + "grad_norm": 0.05516906455159187, + "learning_rate": 2.8667953667953667e-05, + "loss": 0.1305, + "num_input_tokens_seen": 1283648, + "step": 2080 + }, + { + "epoch": 0.5750137892995035, + "grad_norm": 0.014886684715747833, + "learning_rate": 2.87369001654716e-05, + "loss": 0.1086, + "num_input_tokens_seen": 1287200, + "step": 2085 + }, + { + "epoch": 0.5763927192498621, + "grad_norm": 2.4239494800567627, + "learning_rate": 2.880584666298952e-05, + "loss": 0.0406, + "num_input_tokens_seen": 1289952, + "step": 2090 + }, + { + "epoch": 0.5777716492002206, + "grad_norm": 5.572246074676514, + "learning_rate": 2.8874793160507445e-05, + "loss": 0.1307, + "num_input_tokens_seen": 1292768, + "step": 2095 + }, + { + "epoch": 0.5791505791505791, + "grad_norm": 3.3128206729888916, + "learning_rate": 2.8943739658025377e-05, + "loss": 0.1264, + "num_input_tokens_seen": 1296576, + "step": 2100 + }, + { + "epoch": 0.5805295091009377, + "grad_norm": 0.14102326333522797, + "learning_rate": 2.9012686155543302e-05, + "loss": 0.0597, + "num_input_tokens_seen": 1299616, + "step": 2105 + }, + { + "epoch": 0.5819084390512962, + "grad_norm": 0.3873675763607025, + "learning_rate": 2.9081632653061224e-05, + "loss": 0.1115, + "num_input_tokens_seen": 1302720, + "step": 2110 + }, + { + "epoch": 0.5832873690016547, + "grad_norm": 0.06504801660776138, + "learning_rate": 2.915057915057915e-05, + "loss": 0.0786, + "num_input_tokens_seen": 1305792, + "step": 2115 + }, + { + "epoch": 0.5846662989520133, + "grad_norm": 0.6604000926017761, + "learning_rate": 2.921952564809708e-05, + "loss": 0.0732, + "num_input_tokens_seen": 1309600, + "step": 2120 + }, + { + "epoch": 0.5860452289023718, + "grad_norm": 0.25309649109840393, + "learning_rate": 2.9288472145615002e-05, + "loss": 0.0559, + "num_input_tokens_seen": 1312832, + "step": 2125 + }, + { + "epoch": 0.5874241588527302, + "grad_norm": 3.607342481613159, + "learning_rate": 2.9357418643132927e-05, + "loss": 0.1352, + "num_input_tokens_seen": 1316032, + "step": 2130 + }, + { + "epoch": 0.5888030888030888, + "grad_norm": 0.12399297952651978, + "learning_rate": 2.942636514065086e-05, + "loss": 0.0435, + "num_input_tokens_seen": 1318656, + "step": 2135 + }, + { + "epoch": 0.5901820187534473, + "grad_norm": 0.08407413959503174, + "learning_rate": 2.9495311638168784e-05, + "loss": 0.014, + "num_input_tokens_seen": 1321856, + "step": 2140 + }, + { + "epoch": 0.5915609487038058, + "grad_norm": 7.654115676879883, + "learning_rate": 2.9564258135686705e-05, + "loss": 0.144, + "num_input_tokens_seen": 1324384, + "step": 2145 + }, + { + "epoch": 0.5929398786541644, + "grad_norm": 3.352487802505493, + "learning_rate": 2.9633204633204637e-05, + "loss": 0.1924, + "num_input_tokens_seen": 1328544, + "step": 2150 + }, + { + "epoch": 0.5943188086045229, + "grad_norm": 0.09355659037828445, + "learning_rate": 2.9702151130722562e-05, + "loss": 0.0425, + "num_input_tokens_seen": 1331168, + "step": 2155 + }, + { + "epoch": 0.5956977385548814, + "grad_norm": 5.925647258758545, + "learning_rate": 2.9771097628240487e-05, + "loss": 0.127, + "num_input_tokens_seen": 1334368, + "step": 2160 + }, + { + "epoch": 0.5970766685052399, + "grad_norm": 0.11383512616157532, + "learning_rate": 2.9840044125758415e-05, + "loss": 0.2346, + "num_input_tokens_seen": 1336864, + "step": 2165 + }, + { + "epoch": 0.5984555984555985, + "grad_norm": 0.026722298935055733, + "learning_rate": 2.990899062327634e-05, + "loss": 0.0753, + "num_input_tokens_seen": 1339424, + "step": 2170 + }, + { + "epoch": 0.599834528405957, + "grad_norm": 1.6953461170196533, + "learning_rate": 2.9977937120794265e-05, + "loss": 0.1073, + "num_input_tokens_seen": 1343424, + "step": 2175 + }, + { + "epoch": 0.6012134583563155, + "grad_norm": 0.1091422364115715, + "learning_rate": 3.0046883618312193e-05, + "loss": 0.0997, + "num_input_tokens_seen": 1346016, + "step": 2180 + }, + { + "epoch": 0.602592388306674, + "grad_norm": 0.07782192528247833, + "learning_rate": 3.011583011583012e-05, + "loss": 0.1168, + "num_input_tokens_seen": 1348608, + "step": 2185 + }, + { + "epoch": 0.6039713182570325, + "grad_norm": 0.2629746198654175, + "learning_rate": 3.0184776613348043e-05, + "loss": 0.0315, + "num_input_tokens_seen": 1351520, + "step": 2190 + }, + { + "epoch": 0.605350248207391, + "grad_norm": 5.786989688873291, + "learning_rate": 3.0253723110865968e-05, + "loss": 0.0652, + "num_input_tokens_seen": 1354176, + "step": 2195 + }, + { + "epoch": 0.6067291781577496, + "grad_norm": 6.895953178405762, + "learning_rate": 3.0322669608383897e-05, + "loss": 0.0728, + "num_input_tokens_seen": 1358496, + "step": 2200 + }, + { + "epoch": 0.6081081081081081, + "grad_norm": 5.519217014312744, + "learning_rate": 3.039161610590182e-05, + "loss": 0.1098, + "num_input_tokens_seen": 1361216, + "step": 2205 + }, + { + "epoch": 0.6094870380584666, + "grad_norm": 4.614552021026611, + "learning_rate": 3.0460562603419747e-05, + "loss": 0.0992, + "num_input_tokens_seen": 1364160, + "step": 2210 + }, + { + "epoch": 0.6108659680088252, + "grad_norm": 4.746635437011719, + "learning_rate": 3.0529509100937675e-05, + "loss": 0.3409, + "num_input_tokens_seen": 1366816, + "step": 2215 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 0.2570168077945709, + "learning_rate": 3.05984555984556e-05, + "loss": 0.1729, + "num_input_tokens_seen": 1369888, + "step": 2220 + }, + { + "epoch": 0.6136238279095422, + "grad_norm": 1.9879940748214722, + "learning_rate": 3.0667402095973525e-05, + "loss": 0.0804, + "num_input_tokens_seen": 1372960, + "step": 2225 + }, + { + "epoch": 0.6150027578599007, + "grad_norm": 8.497381210327148, + "learning_rate": 3.073634859349145e-05, + "loss": 0.0885, + "num_input_tokens_seen": 1375936, + "step": 2230 + }, + { + "epoch": 0.6163816878102593, + "grad_norm": 0.2056403011083603, + "learning_rate": 3.080529509100938e-05, + "loss": 0.1035, + "num_input_tokens_seen": 1378880, + "step": 2235 + }, + { + "epoch": 0.6177606177606177, + "grad_norm": 3.0041897296905518, + "learning_rate": 3.08742415885273e-05, + "loss": 0.282, + "num_input_tokens_seen": 1381120, + "step": 2240 + }, + { + "epoch": 0.6191395477109762, + "grad_norm": 9.030269622802734, + "learning_rate": 3.094318808604523e-05, + "loss": 0.0637, + "num_input_tokens_seen": 1384480, + "step": 2245 + }, + { + "epoch": 0.6205184776613348, + "grad_norm": 0.942237377166748, + "learning_rate": 3.101213458356316e-05, + "loss": 0.1346, + "num_input_tokens_seen": 1387872, + "step": 2250 + }, + { + "epoch": 0.6218974076116933, + "grad_norm": 0.23082980513572693, + "learning_rate": 3.108108108108108e-05, + "loss": 0.0995, + "num_input_tokens_seen": 1390720, + "step": 2255 + }, + { + "epoch": 0.6232763375620518, + "grad_norm": 4.7957634925842285, + "learning_rate": 3.1150027578599e-05, + "loss": 0.0556, + "num_input_tokens_seen": 1393120, + "step": 2260 + }, + { + "epoch": 0.6246552675124104, + "grad_norm": 0.2046554535627365, + "learning_rate": 3.121897407611694e-05, + "loss": 0.0942, + "num_input_tokens_seen": 1396512, + "step": 2265 + }, + { + "epoch": 0.6260341974627689, + "grad_norm": 5.485140800476074, + "learning_rate": 3.128792057363486e-05, + "loss": 0.1337, + "num_input_tokens_seen": 1399296, + "step": 2270 + }, + { + "epoch": 0.6274131274131274, + "grad_norm": 2.3436756134033203, + "learning_rate": 3.135686707115279e-05, + "loss": 0.0446, + "num_input_tokens_seen": 1402752, + "step": 2275 + }, + { + "epoch": 0.628792057363486, + "grad_norm": 5.072534084320068, + "learning_rate": 3.1425813568670716e-05, + "loss": 0.0702, + "num_input_tokens_seen": 1404928, + "step": 2280 + }, + { + "epoch": 0.6301709873138445, + "grad_norm": 6.33219575881958, + "learning_rate": 3.149476006618864e-05, + "loss": 0.0576, + "num_input_tokens_seen": 1408096, + "step": 2285 + }, + { + "epoch": 0.631549917264203, + "grad_norm": 0.16712979972362518, + "learning_rate": 3.1563706563706566e-05, + "loss": 0.0661, + "num_input_tokens_seen": 1410432, + "step": 2290 + }, + { + "epoch": 0.6329288472145616, + "grad_norm": 0.014507051557302475, + "learning_rate": 3.1632653061224494e-05, + "loss": 0.013, + "num_input_tokens_seen": 1412672, + "step": 2295 + }, + { + "epoch": 0.63430777716492, + "grad_norm": 7.727139472961426, + "learning_rate": 3.1701599558742416e-05, + "loss": 0.107, + "num_input_tokens_seen": 1415968, + "step": 2300 + }, + { + "epoch": 0.6356867071152785, + "grad_norm": 6.056257247924805, + "learning_rate": 3.1770546056260344e-05, + "loss": 0.1823, + "num_input_tokens_seen": 1418240, + "step": 2305 + }, + { + "epoch": 0.637065637065637, + "grad_norm": 7.824832916259766, + "learning_rate": 3.183949255377827e-05, + "loss": 0.1726, + "num_input_tokens_seen": 1420704, + "step": 2310 + }, + { + "epoch": 0.6384445670159956, + "grad_norm": 13.371331214904785, + "learning_rate": 3.1908439051296194e-05, + "loss": 0.225, + "num_input_tokens_seen": 1424032, + "step": 2315 + }, + { + "epoch": 0.6398234969663541, + "grad_norm": 3.3832457065582275, + "learning_rate": 3.197738554881412e-05, + "loss": 0.1621, + "num_input_tokens_seen": 1426208, + "step": 2320 + }, + { + "epoch": 0.6412024269167126, + "grad_norm": 0.23296302556991577, + "learning_rate": 3.204633204633205e-05, + "loss": 0.023, + "num_input_tokens_seen": 1430144, + "step": 2325 + }, + { + "epoch": 0.6425813568670712, + "grad_norm": 3.7894344329833984, + "learning_rate": 3.211527854384997e-05, + "loss": 0.1229, + "num_input_tokens_seen": 1433568, + "step": 2330 + }, + { + "epoch": 0.6439602868174297, + "grad_norm": 8.840062141418457, + "learning_rate": 3.21842250413679e-05, + "loss": 0.0919, + "num_input_tokens_seen": 1437056, + "step": 2335 + }, + { + "epoch": 0.6453392167677882, + "grad_norm": 1.6767466068267822, + "learning_rate": 3.225317153888582e-05, + "loss": 0.1168, + "num_input_tokens_seen": 1439904, + "step": 2340 + }, + { + "epoch": 0.6467181467181468, + "grad_norm": 4.991452217102051, + "learning_rate": 3.232211803640375e-05, + "loss": 0.194, + "num_input_tokens_seen": 1442880, + "step": 2345 + }, + { + "epoch": 0.6480970766685052, + "grad_norm": 6.471498012542725, + "learning_rate": 3.239106453392168e-05, + "loss": 0.0779, + "num_input_tokens_seen": 1445504, + "step": 2350 + }, + { + "epoch": 0.6494760066188637, + "grad_norm": 0.2554498612880707, + "learning_rate": 3.24600110314396e-05, + "loss": 0.0866, + "num_input_tokens_seen": 1448544, + "step": 2355 + }, + { + "epoch": 0.6508549365692223, + "grad_norm": 0.7887869477272034, + "learning_rate": 3.252895752895753e-05, + "loss": 0.0836, + "num_input_tokens_seen": 1451456, + "step": 2360 + }, + { + "epoch": 0.6522338665195808, + "grad_norm": 6.1597514152526855, + "learning_rate": 3.259790402647546e-05, + "loss": 0.0678, + "num_input_tokens_seen": 1454112, + "step": 2365 + }, + { + "epoch": 0.6536127964699393, + "grad_norm": 0.9510801434516907, + "learning_rate": 3.266685052399338e-05, + "loss": 0.0199, + "num_input_tokens_seen": 1457088, + "step": 2370 + }, + { + "epoch": 0.6549917264202979, + "grad_norm": 0.2662651240825653, + "learning_rate": 3.2735797021511314e-05, + "loss": 0.0214, + "num_input_tokens_seen": 1459456, + "step": 2375 + }, + { + "epoch": 0.6563706563706564, + "grad_norm": 0.8001293540000916, + "learning_rate": 3.2804743519029236e-05, + "loss": 0.0392, + "num_input_tokens_seen": 1462976, + "step": 2380 + }, + { + "epoch": 0.6577495863210149, + "grad_norm": 6.260765075683594, + "learning_rate": 3.287369001654716e-05, + "loss": 0.0225, + "num_input_tokens_seen": 1465888, + "step": 2385 + }, + { + "epoch": 0.6591285162713734, + "grad_norm": 0.05373566225171089, + "learning_rate": 3.294263651406509e-05, + "loss": 0.1444, + "num_input_tokens_seen": 1468736, + "step": 2390 + }, + { + "epoch": 0.660507446221732, + "grad_norm": 6.304034233093262, + "learning_rate": 3.3011583011583014e-05, + "loss": 0.1538, + "num_input_tokens_seen": 1471424, + "step": 2395 + }, + { + "epoch": 0.6618863761720905, + "grad_norm": 8.79240894317627, + "learning_rate": 3.3080529509100936e-05, + "loss": 0.1042, + "num_input_tokens_seen": 1473856, + "step": 2400 + }, + { + "epoch": 0.6632653061224489, + "grad_norm": 8.998689651489258, + "learning_rate": 3.314947600661887e-05, + "loss": 0.3178, + "num_input_tokens_seen": 1476864, + "step": 2405 + }, + { + "epoch": 0.6646442360728075, + "grad_norm": 8.176047325134277, + "learning_rate": 3.321842250413679e-05, + "loss": 0.1385, + "num_input_tokens_seen": 1479520, + "step": 2410 + }, + { + "epoch": 0.666023166023166, + "grad_norm": 0.07526406645774841, + "learning_rate": 3.3287369001654714e-05, + "loss": 0.0466, + "num_input_tokens_seen": 1481984, + "step": 2415 + }, + { + "epoch": 0.6674020959735245, + "grad_norm": 14.484724998474121, + "learning_rate": 3.335631549917264e-05, + "loss": 0.2941, + "num_input_tokens_seen": 1484512, + "step": 2420 + }, + { + "epoch": 0.6687810259238831, + "grad_norm": 4.379820823669434, + "learning_rate": 3.342526199669057e-05, + "loss": 0.2442, + "num_input_tokens_seen": 1487296, + "step": 2425 + }, + { + "epoch": 0.6701599558742416, + "grad_norm": 6.694551944732666, + "learning_rate": 3.34942084942085e-05, + "loss": 0.0739, + "num_input_tokens_seen": 1490624, + "step": 2430 + }, + { + "epoch": 0.6715388858246001, + "grad_norm": 0.2674410343170166, + "learning_rate": 3.356315499172642e-05, + "loss": 0.089, + "num_input_tokens_seen": 1493472, + "step": 2435 + }, + { + "epoch": 0.6729178157749587, + "grad_norm": 3.4197864532470703, + "learning_rate": 3.363210148924435e-05, + "loss": 0.0793, + "num_input_tokens_seen": 1496768, + "step": 2440 + }, + { + "epoch": 0.6742967457253172, + "grad_norm": 1.3795017004013062, + "learning_rate": 3.370104798676228e-05, + "loss": 0.0332, + "num_input_tokens_seen": 1499680, + "step": 2445 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 4.506232738494873, + "learning_rate": 3.37699944842802e-05, + "loss": 0.1725, + "num_input_tokens_seen": 1502112, + "step": 2450 + }, + { + "epoch": 0.6770546056260341, + "grad_norm": 0.06383058428764343, + "learning_rate": 3.383894098179813e-05, + "loss": 0.112, + "num_input_tokens_seen": 1504352, + "step": 2455 + }, + { + "epoch": 0.6784335355763927, + "grad_norm": 3.3161685466766357, + "learning_rate": 3.3907887479316055e-05, + "loss": 0.1938, + "num_input_tokens_seen": 1507712, + "step": 2460 + }, + { + "epoch": 0.6798124655267512, + "grad_norm": 0.1713074892759323, + "learning_rate": 3.397683397683398e-05, + "loss": 0.2495, + "num_input_tokens_seen": 1510432, + "step": 2465 + }, + { + "epoch": 0.6811913954771097, + "grad_norm": 1.3210210800170898, + "learning_rate": 3.4045780474351905e-05, + "loss": 0.1033, + "num_input_tokens_seen": 1515392, + "step": 2470 + }, + { + "epoch": 0.6825703254274683, + "grad_norm": 3.892580032348633, + "learning_rate": 3.4114726971869834e-05, + "loss": 0.0779, + "num_input_tokens_seen": 1518144, + "step": 2475 + }, + { + "epoch": 0.6839492553778268, + "grad_norm": 4.431966781616211, + "learning_rate": 3.4183673469387755e-05, + "loss": 0.1845, + "num_input_tokens_seen": 1521760, + "step": 2480 + }, + { + "epoch": 0.6853281853281853, + "grad_norm": 4.819953918457031, + "learning_rate": 3.4252619966905683e-05, + "loss": 0.0601, + "num_input_tokens_seen": 1523936, + "step": 2485 + }, + { + "epoch": 0.6867071152785439, + "grad_norm": 1.2196515798568726, + "learning_rate": 3.432156646442361e-05, + "loss": 0.0932, + "num_input_tokens_seen": 1528576, + "step": 2490 + }, + { + "epoch": 0.6880860452289024, + "grad_norm": 4.945538520812988, + "learning_rate": 3.4390512961941533e-05, + "loss": 0.0975, + "num_input_tokens_seen": 1530976, + "step": 2495 + }, + { + "epoch": 0.6894649751792609, + "grad_norm": 3.000450372695923, + "learning_rate": 3.445945945945946e-05, + "loss": 0.2043, + "num_input_tokens_seen": 1534912, + "step": 2500 + }, + { + "epoch": 0.6908439051296195, + "grad_norm": 6.131906509399414, + "learning_rate": 3.452840595697739e-05, + "loss": 0.1447, + "num_input_tokens_seen": 1538656, + "step": 2505 + }, + { + "epoch": 0.692222835079978, + "grad_norm": 5.748903751373291, + "learning_rate": 3.459735245449531e-05, + "loss": 0.1128, + "num_input_tokens_seen": 1541664, + "step": 2510 + }, + { + "epoch": 0.6936017650303364, + "grad_norm": 1.9016544818878174, + "learning_rate": 3.466629895201324e-05, + "loss": 0.0272, + "num_input_tokens_seen": 1543872, + "step": 2515 + }, + { + "epoch": 0.694980694980695, + "grad_norm": 0.12568971514701843, + "learning_rate": 3.473524544953117e-05, + "loss": 0.0695, + "num_input_tokens_seen": 1547200, + "step": 2520 + }, + { + "epoch": 0.6963596249310535, + "grad_norm": 0.23596584796905518, + "learning_rate": 3.480419194704909e-05, + "loss": 0.2201, + "num_input_tokens_seen": 1550272, + "step": 2525 + }, + { + "epoch": 0.697738554881412, + "grad_norm": 6.655685901641846, + "learning_rate": 3.487313844456702e-05, + "loss": 0.1143, + "num_input_tokens_seen": 1553632, + "step": 2530 + }, + { + "epoch": 0.6991174848317705, + "grad_norm": 1.0255420207977295, + "learning_rate": 3.4942084942084947e-05, + "loss": 0.0543, + "num_input_tokens_seen": 1556928, + "step": 2535 + }, + { + "epoch": 0.7004964147821291, + "grad_norm": 5.422428131103516, + "learning_rate": 3.501103143960287e-05, + "loss": 0.0503, + "num_input_tokens_seen": 1559616, + "step": 2540 + }, + { + "epoch": 0.7018753447324876, + "grad_norm": 0.09733103215694427, + "learning_rate": 3.5079977937120797e-05, + "loss": 0.0647, + "num_input_tokens_seen": 1562208, + "step": 2545 + }, + { + "epoch": 0.7032542746828461, + "grad_norm": 1.2177071571350098, + "learning_rate": 3.5148924434638725e-05, + "loss": 0.0264, + "num_input_tokens_seen": 1564896, + "step": 2550 + }, + { + "epoch": 0.7046332046332047, + "grad_norm": 0.026209449395537376, + "learning_rate": 3.5217870932156646e-05, + "loss": 0.1724, + "num_input_tokens_seen": 1567616, + "step": 2555 + }, + { + "epoch": 0.7060121345835632, + "grad_norm": 5.959299087524414, + "learning_rate": 3.5286817429674575e-05, + "loss": 0.2113, + "num_input_tokens_seen": 1570400, + "step": 2560 + }, + { + "epoch": 0.7073910645339216, + "grad_norm": 0.5824492573738098, + "learning_rate": 3.5355763927192496e-05, + "loss": 0.0777, + "num_input_tokens_seen": 1573216, + "step": 2565 + }, + { + "epoch": 0.7087699944842802, + "grad_norm": 0.0569591224193573, + "learning_rate": 3.5424710424710425e-05, + "loss": 0.1384, + "num_input_tokens_seen": 1576160, + "step": 2570 + }, + { + "epoch": 0.7101489244346387, + "grad_norm": 0.10062297433614731, + "learning_rate": 3.549365692222835e-05, + "loss": 0.0312, + "num_input_tokens_seen": 1580480, + "step": 2575 + }, + { + "epoch": 0.7115278543849972, + "grad_norm": 1.9425493478775024, + "learning_rate": 3.5562603419746275e-05, + "loss": 0.0875, + "num_input_tokens_seen": 1583360, + "step": 2580 + }, + { + "epoch": 0.7129067843353558, + "grad_norm": 6.593130588531494, + "learning_rate": 3.563154991726421e-05, + "loss": 0.2725, + "num_input_tokens_seen": 1586688, + "step": 2585 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.04134595766663551, + "learning_rate": 3.570049641478213e-05, + "loss": 0.1997, + "num_input_tokens_seen": 1589568, + "step": 2590 + }, + { + "epoch": 0.7156646442360728, + "grad_norm": 11.10048770904541, + "learning_rate": 3.576944291230005e-05, + "loss": 0.1015, + "num_input_tokens_seen": 1593664, + "step": 2595 + }, + { + "epoch": 0.7170435741864313, + "grad_norm": 0.09492672234773636, + "learning_rate": 3.583838940981799e-05, + "loss": 0.0868, + "num_input_tokens_seen": 1596352, + "step": 2600 + }, + { + "epoch": 0.7184225041367899, + "grad_norm": 4.70669412612915, + "learning_rate": 3.590733590733591e-05, + "loss": 0.0878, + "num_input_tokens_seen": 1599424, + "step": 2605 + }, + { + "epoch": 0.7198014340871484, + "grad_norm": 5.048539161682129, + "learning_rate": 3.597628240485383e-05, + "loss": 0.0424, + "num_input_tokens_seen": 1602944, + "step": 2610 + }, + { + "epoch": 0.7211803640375068, + "grad_norm": 5.1314191818237305, + "learning_rate": 3.6045228902371766e-05, + "loss": 0.055, + "num_input_tokens_seen": 1605344, + "step": 2615 + }, + { + "epoch": 0.7225592939878654, + "grad_norm": 11.859545707702637, + "learning_rate": 3.611417539988969e-05, + "loss": 0.0635, + "num_input_tokens_seen": 1607968, + "step": 2620 + }, + { + "epoch": 0.7239382239382239, + "grad_norm": 9.679577827453613, + "learning_rate": 3.618312189740761e-05, + "loss": 0.0297, + "num_input_tokens_seen": 1611840, + "step": 2625 + }, + { + "epoch": 0.7253171538885824, + "grad_norm": 0.17780883610248566, + "learning_rate": 3.625206839492554e-05, + "loss": 0.0604, + "num_input_tokens_seen": 1614304, + "step": 2630 + }, + { + "epoch": 0.726696083838941, + "grad_norm": 0.0690511018037796, + "learning_rate": 3.6321014892443466e-05, + "loss": 0.1856, + "num_input_tokens_seen": 1618912, + "step": 2635 + }, + { + "epoch": 0.7280750137892995, + "grad_norm": 2.5393104553222656, + "learning_rate": 3.6389961389961394e-05, + "loss": 0.1637, + "num_input_tokens_seen": 1621984, + "step": 2640 + }, + { + "epoch": 0.729453943739658, + "grad_norm": 0.3032165467739105, + "learning_rate": 3.6458907887479316e-05, + "loss": 0.1229, + "num_input_tokens_seen": 1625152, + "step": 2645 + }, + { + "epoch": 0.7308328736900166, + "grad_norm": 7.105386257171631, + "learning_rate": 3.6527854384997244e-05, + "loss": 0.0907, + "num_input_tokens_seen": 1628064, + "step": 2650 + }, + { + "epoch": 0.7322118036403751, + "grad_norm": 0.30708470940589905, + "learning_rate": 3.659680088251517e-05, + "loss": 0.082, + "num_input_tokens_seen": 1630432, + "step": 2655 + }, + { + "epoch": 0.7335907335907336, + "grad_norm": 2.769322633743286, + "learning_rate": 3.6665747380033094e-05, + "loss": 0.1267, + "num_input_tokens_seen": 1633760, + "step": 2660 + }, + { + "epoch": 0.7349696635410922, + "grad_norm": 0.1136775016784668, + "learning_rate": 3.673469387755102e-05, + "loss": 0.1215, + "num_input_tokens_seen": 1636480, + "step": 2665 + }, + { + "epoch": 0.7363485934914507, + "grad_norm": 2.6449806690216064, + "learning_rate": 3.680364037506895e-05, + "loss": 0.0732, + "num_input_tokens_seen": 1639392, + "step": 2670 + }, + { + "epoch": 0.7377275234418091, + "grad_norm": 14.961329460144043, + "learning_rate": 3.687258687258687e-05, + "loss": 0.1353, + "num_input_tokens_seen": 1642656, + "step": 2675 + }, + { + "epoch": 0.7391064533921676, + "grad_norm": 1.4387799501419067, + "learning_rate": 3.69415333701048e-05, + "loss": 0.0709, + "num_input_tokens_seen": 1645664, + "step": 2680 + }, + { + "epoch": 0.7404853833425262, + "grad_norm": 7.984651565551758, + "learning_rate": 3.701047986762273e-05, + "loss": 0.0922, + "num_input_tokens_seen": 1648352, + "step": 2685 + }, + { + "epoch": 0.7418643132928847, + "grad_norm": 0.7510370016098022, + "learning_rate": 3.707942636514065e-05, + "loss": 0.0933, + "num_input_tokens_seen": 1651264, + "step": 2690 + }, + { + "epoch": 0.7432432432432432, + "grad_norm": 2.969583749771118, + "learning_rate": 3.714837286265858e-05, + "loss": 0.1197, + "num_input_tokens_seen": 1653568, + "step": 2695 + }, + { + "epoch": 0.7446221731936018, + "grad_norm": 3.699042558670044, + "learning_rate": 3.721731936017651e-05, + "loss": 0.0865, + "num_input_tokens_seen": 1656064, + "step": 2700 + }, + { + "epoch": 0.7460011031439603, + "grad_norm": 5.886753559112549, + "learning_rate": 3.728626585769443e-05, + "loss": 0.1132, + "num_input_tokens_seen": 1659968, + "step": 2705 + }, + { + "epoch": 0.7473800330943188, + "grad_norm": 0.48187121748924255, + "learning_rate": 3.735521235521236e-05, + "loss": 0.0702, + "num_input_tokens_seen": 1662208, + "step": 2710 + }, + { + "epoch": 0.7487589630446774, + "grad_norm": 0.1044844388961792, + "learning_rate": 3.7424158852730286e-05, + "loss": 0.027, + "num_input_tokens_seen": 1664960, + "step": 2715 + }, + { + "epoch": 0.7501378929950359, + "grad_norm": 1.0448791980743408, + "learning_rate": 3.749310535024821e-05, + "loss": 0.028, + "num_input_tokens_seen": 1668512, + "step": 2720 + }, + { + "epoch": 0.7515168229453943, + "grad_norm": 0.025273971259593964, + "learning_rate": 3.7562051847766136e-05, + "loss": 0.0214, + "num_input_tokens_seen": 1671808, + "step": 2725 + }, + { + "epoch": 0.752895752895753, + "grad_norm": 0.1559385359287262, + "learning_rate": 3.7630998345284064e-05, + "loss": 0.0771, + "num_input_tokens_seen": 1676096, + "step": 2730 + }, + { + "epoch": 0.7542746828461114, + "grad_norm": 0.07814324647188187, + "learning_rate": 3.7699944842801986e-05, + "loss": 0.0685, + "num_input_tokens_seen": 1678976, + "step": 2735 + }, + { + "epoch": 0.7556536127964699, + "grad_norm": 6.06104850769043, + "learning_rate": 3.7768891340319914e-05, + "loss": 0.0726, + "num_input_tokens_seen": 1681888, + "step": 2740 + }, + { + "epoch": 0.7570325427468284, + "grad_norm": 10.444547653198242, + "learning_rate": 3.783783783783784e-05, + "loss": 0.1273, + "num_input_tokens_seen": 1685120, + "step": 2745 + }, + { + "epoch": 0.758411472697187, + "grad_norm": 32.63896560668945, + "learning_rate": 3.7906784335355764e-05, + "loss": 0.2583, + "num_input_tokens_seen": 1688704, + "step": 2750 + }, + { + "epoch": 0.7597904026475455, + "grad_norm": 9.409741401672363, + "learning_rate": 3.797573083287369e-05, + "loss": 0.1743, + "num_input_tokens_seen": 1692000, + "step": 2755 + }, + { + "epoch": 0.761169332597904, + "grad_norm": 0.9996359944343567, + "learning_rate": 3.804467733039162e-05, + "loss": 0.1552, + "num_input_tokens_seen": 1694560, + "step": 2760 + }, + { + "epoch": 0.7625482625482626, + "grad_norm": 1.2191091775894165, + "learning_rate": 3.811362382790954e-05, + "loss": 0.0739, + "num_input_tokens_seen": 1698336, + "step": 2765 + }, + { + "epoch": 0.7639271924986211, + "grad_norm": 6.832093715667725, + "learning_rate": 3.818257032542747e-05, + "loss": 0.2098, + "num_input_tokens_seen": 1701344, + "step": 2770 + }, + { + "epoch": 0.7653061224489796, + "grad_norm": 4.37651252746582, + "learning_rate": 3.825151682294539e-05, + "loss": 0.1285, + "num_input_tokens_seen": 1705792, + "step": 2775 + }, + { + "epoch": 0.7666850523993382, + "grad_norm": 0.8834969401359558, + "learning_rate": 3.832046332046332e-05, + "loss": 0.0161, + "num_input_tokens_seen": 1709472, + "step": 2780 + }, + { + "epoch": 0.7680639823496966, + "grad_norm": 4.919412136077881, + "learning_rate": 3.838940981798125e-05, + "loss": 0.1431, + "num_input_tokens_seen": 1712704, + "step": 2785 + }, + { + "epoch": 0.7694429123000551, + "grad_norm": 2.1594014167785645, + "learning_rate": 3.845835631549917e-05, + "loss": 0.232, + "num_input_tokens_seen": 1715808, + "step": 2790 + }, + { + "epoch": 0.7708218422504137, + "grad_norm": 3.698489189147949, + "learning_rate": 3.8527302813017105e-05, + "loss": 0.1572, + "num_input_tokens_seen": 1719072, + "step": 2795 + }, + { + "epoch": 0.7722007722007722, + "grad_norm": 2.637949228286743, + "learning_rate": 3.859624931053503e-05, + "loss": 0.1145, + "num_input_tokens_seen": 1721952, + "step": 2800 + }, + { + "epoch": 0.7735797021511307, + "grad_norm": 3.4917685985565186, + "learning_rate": 3.866519580805295e-05, + "loss": 0.0641, + "num_input_tokens_seen": 1725120, + "step": 2805 + }, + { + "epoch": 0.7749586321014893, + "grad_norm": 0.4222739636898041, + "learning_rate": 3.8734142305570884e-05, + "loss": 0.0826, + "num_input_tokens_seen": 1728064, + "step": 2810 + }, + { + "epoch": 0.7763375620518478, + "grad_norm": 0.20833419263362885, + "learning_rate": 3.8803088803088805e-05, + "loss": 0.0471, + "num_input_tokens_seen": 1730528, + "step": 2815 + }, + { + "epoch": 0.7777164920022063, + "grad_norm": 6.151366710662842, + "learning_rate": 3.887203530060673e-05, + "loss": 0.3131, + "num_input_tokens_seen": 1733568, + "step": 2820 + }, + { + "epoch": 0.7790954219525648, + "grad_norm": 1.1345854997634888, + "learning_rate": 3.894098179812466e-05, + "loss": 0.1387, + "num_input_tokens_seen": 1736288, + "step": 2825 + }, + { + "epoch": 0.7804743519029234, + "grad_norm": 1.942640781402588, + "learning_rate": 3.9009928295642583e-05, + "loss": 0.0543, + "num_input_tokens_seen": 1739680, + "step": 2830 + }, + { + "epoch": 0.7818532818532818, + "grad_norm": 1.627700924873352, + "learning_rate": 3.9078874793160505e-05, + "loss": 0.0216, + "num_input_tokens_seen": 1742336, + "step": 2835 + }, + { + "epoch": 0.7832322118036403, + "grad_norm": 5.062927722930908, + "learning_rate": 3.914782129067844e-05, + "loss": 0.0699, + "num_input_tokens_seen": 1744960, + "step": 2840 + }, + { + "epoch": 0.7846111417539989, + "grad_norm": 0.24411122500896454, + "learning_rate": 3.921676778819636e-05, + "loss": 0.1273, + "num_input_tokens_seen": 1748000, + "step": 2845 + }, + { + "epoch": 0.7859900717043574, + "grad_norm": 0.15948347747325897, + "learning_rate": 3.928571428571429e-05, + "loss": 0.0636, + "num_input_tokens_seen": 1751008, + "step": 2850 + }, + { + "epoch": 0.7873690016547159, + "grad_norm": 0.0682261735200882, + "learning_rate": 3.935466078323221e-05, + "loss": 0.242, + "num_input_tokens_seen": 1753920, + "step": 2855 + }, + { + "epoch": 0.7887479316050745, + "grad_norm": 1.2946391105651855, + "learning_rate": 3.942360728075014e-05, + "loss": 0.0103, + "num_input_tokens_seen": 1756992, + "step": 2860 + }, + { + "epoch": 0.790126861555433, + "grad_norm": 0.5707349181175232, + "learning_rate": 3.949255377826807e-05, + "loss": 0.022, + "num_input_tokens_seen": 1759648, + "step": 2865 + }, + { + "epoch": 0.7915057915057915, + "grad_norm": 0.06669903546571732, + "learning_rate": 3.956150027578599e-05, + "loss": 0.0045, + "num_input_tokens_seen": 1763584, + "step": 2870 + }, + { + "epoch": 0.7928847214561501, + "grad_norm": 10.2459716796875, + "learning_rate": 3.963044677330392e-05, + "loss": 0.1253, + "num_input_tokens_seen": 1766272, + "step": 2875 + }, + { + "epoch": 0.7942636514065086, + "grad_norm": 0.7523417472839355, + "learning_rate": 3.9699393270821847e-05, + "loss": 0.0074, + "num_input_tokens_seen": 1768960, + "step": 2880 + }, + { + "epoch": 0.795642581356867, + "grad_norm": 11.05238151550293, + "learning_rate": 3.976833976833977e-05, + "loss": 0.201, + "num_input_tokens_seen": 1771808, + "step": 2885 + }, + { + "epoch": 0.7970215113072255, + "grad_norm": 9.55925178527832, + "learning_rate": 3.9837286265857696e-05, + "loss": 0.1036, + "num_input_tokens_seen": 1774752, + "step": 2890 + }, + { + "epoch": 0.7984004412575841, + "grad_norm": 7.654099941253662, + "learning_rate": 3.9906232763375625e-05, + "loss": 0.226, + "num_input_tokens_seen": 1777664, + "step": 2895 + }, + { + "epoch": 0.7997793712079426, + "grad_norm": 0.5101456046104431, + "learning_rate": 3.9975179260893546e-05, + "loss": 0.0149, + "num_input_tokens_seen": 1781856, + "step": 2900 + }, + { + "epoch": 0.8011583011583011, + "grad_norm": 10.510738372802734, + "learning_rate": 4.0044125758411475e-05, + "loss": 0.1517, + "num_input_tokens_seen": 1784800, + "step": 2905 + }, + { + "epoch": 0.8025372311086597, + "grad_norm": 4.3869500160217285, + "learning_rate": 4.01130722559294e-05, + "loss": 0.1705, + "num_input_tokens_seen": 1787264, + "step": 2910 + }, + { + "epoch": 0.8039161610590182, + "grad_norm": 0.6210131049156189, + "learning_rate": 4.0182018753447325e-05, + "loss": 0.1949, + "num_input_tokens_seen": 1789664, + "step": 2915 + }, + { + "epoch": 0.8052950910093767, + "grad_norm": 0.22112509608268738, + "learning_rate": 4.025096525096525e-05, + "loss": 0.0588, + "num_input_tokens_seen": 1792832, + "step": 2920 + }, + { + "epoch": 0.8066740209597353, + "grad_norm": 2.7136735916137695, + "learning_rate": 4.031991174848318e-05, + "loss": 0.0633, + "num_input_tokens_seen": 1795392, + "step": 2925 + }, + { + "epoch": 0.8080529509100938, + "grad_norm": 0.25835946202278137, + "learning_rate": 4.03888582460011e-05, + "loss": 0.0612, + "num_input_tokens_seen": 1797888, + "step": 2930 + }, + { + "epoch": 0.8094318808604523, + "grad_norm": 6.2610321044921875, + "learning_rate": 4.045780474351903e-05, + "loss": 0.1925, + "num_input_tokens_seen": 1801056, + "step": 2935 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.040409479290246964, + "learning_rate": 4.052675124103696e-05, + "loss": 0.0196, + "num_input_tokens_seen": 1803360, + "step": 2940 + }, + { + "epoch": 0.8121897407611693, + "grad_norm": 6.154172420501709, + "learning_rate": 4.059569773855488e-05, + "loss": 0.0799, + "num_input_tokens_seen": 1806912, + "step": 2945 + }, + { + "epoch": 0.8135686707115278, + "grad_norm": 0.10543497651815414, + "learning_rate": 4.066464423607281e-05, + "loss": 0.1033, + "num_input_tokens_seen": 1809632, + "step": 2950 + }, + { + "epoch": 0.8149476006618864, + "grad_norm": 7.65207576751709, + "learning_rate": 4.073359073359074e-05, + "loss": 0.0641, + "num_input_tokens_seen": 1814912, + "step": 2955 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 8.576464653015137, + "learning_rate": 4.080253723110866e-05, + "loss": 0.074, + "num_input_tokens_seen": 1818528, + "step": 2960 + }, + { + "epoch": 0.8177054605626034, + "grad_norm": 1.998853087425232, + "learning_rate": 4.087148372862659e-05, + "loss": 0.092, + "num_input_tokens_seen": 1821024, + "step": 2965 + }, + { + "epoch": 0.8190843905129619, + "grad_norm": 15.728647232055664, + "learning_rate": 4.0940430226144516e-05, + "loss": 0.1963, + "num_input_tokens_seen": 1824928, + "step": 2970 + }, + { + "epoch": 0.8204633204633205, + "grad_norm": 0.04061006009578705, + "learning_rate": 4.100937672366244e-05, + "loss": 0.1117, + "num_input_tokens_seen": 1827776, + "step": 2975 + }, + { + "epoch": 0.821842250413679, + "grad_norm": 8.615455627441406, + "learning_rate": 4.1078323221180366e-05, + "loss": 0.2001, + "num_input_tokens_seen": 1830912, + "step": 2980 + }, + { + "epoch": 0.8232211803640375, + "grad_norm": 3.1883351802825928, + "learning_rate": 4.1147269718698294e-05, + "loss": 0.0481, + "num_input_tokens_seen": 1833792, + "step": 2985 + }, + { + "epoch": 0.8246001103143961, + "grad_norm": 0.21295791864395142, + "learning_rate": 4.1216216216216216e-05, + "loss": 0.0066, + "num_input_tokens_seen": 1837216, + "step": 2990 + }, + { + "epoch": 0.8259790402647545, + "grad_norm": 0.2967985272407532, + "learning_rate": 4.1285162713734144e-05, + "loss": 0.059, + "num_input_tokens_seen": 1840352, + "step": 2995 + }, + { + "epoch": 0.827357970215113, + "grad_norm": 0.06827165931463242, + "learning_rate": 4.1354109211252066e-05, + "loss": 0.164, + "num_input_tokens_seen": 1843104, + "step": 3000 + }, + { + "epoch": 0.8287369001654716, + "grad_norm": 0.41797205805778503, + "learning_rate": 4.142305570877e-05, + "loss": 0.0584, + "num_input_tokens_seen": 1846624, + "step": 3005 + }, + { + "epoch": 0.8301158301158301, + "grad_norm": 1.9046740531921387, + "learning_rate": 4.149200220628792e-05, + "loss": 0.057, + "num_input_tokens_seen": 1849536, + "step": 3010 + }, + { + "epoch": 0.8314947600661886, + "grad_norm": 0.11945090442895889, + "learning_rate": 4.1560948703805844e-05, + "loss": 0.2107, + "num_input_tokens_seen": 1852192, + "step": 3015 + }, + { + "epoch": 0.8328736900165472, + "grad_norm": 3.528775691986084, + "learning_rate": 4.162989520132378e-05, + "loss": 0.1006, + "num_input_tokens_seen": 1855328, + "step": 3020 + }, + { + "epoch": 0.8342526199669057, + "grad_norm": 3.0572848320007324, + "learning_rate": 4.16988416988417e-05, + "loss": 0.1517, + "num_input_tokens_seen": 1857888, + "step": 3025 + }, + { + "epoch": 0.8356315499172642, + "grad_norm": 0.5243210792541504, + "learning_rate": 4.176778819635962e-05, + "loss": 0.0748, + "num_input_tokens_seen": 1860256, + "step": 3030 + }, + { + "epoch": 0.8370104798676227, + "grad_norm": 0.12658680975437164, + "learning_rate": 4.183673469387756e-05, + "loss": 0.0846, + "num_input_tokens_seen": 1863392, + "step": 3035 + }, + { + "epoch": 0.8383894098179813, + "grad_norm": 0.4533143639564514, + "learning_rate": 4.190568119139548e-05, + "loss": 0.0431, + "num_input_tokens_seen": 1866592, + "step": 3040 + }, + { + "epoch": 0.8397683397683398, + "grad_norm": 2.982806921005249, + "learning_rate": 4.19746276889134e-05, + "loss": 0.068, + "num_input_tokens_seen": 1870912, + "step": 3045 + }, + { + "epoch": 0.8411472697186982, + "grad_norm": 3.8754191398620605, + "learning_rate": 4.2043574186431336e-05, + "loss": 0.195, + "num_input_tokens_seen": 1873792, + "step": 3050 + }, + { + "epoch": 0.8425261996690568, + "grad_norm": 2.166022777557373, + "learning_rate": 4.211252068394926e-05, + "loss": 0.1046, + "num_input_tokens_seen": 1876384, + "step": 3055 + }, + { + "epoch": 0.8439051296194153, + "grad_norm": 4.242037296295166, + "learning_rate": 4.2181467181467186e-05, + "loss": 0.1489, + "num_input_tokens_seen": 1879744, + "step": 3060 + }, + { + "epoch": 0.8452840595697738, + "grad_norm": 5.913004398345947, + "learning_rate": 4.225041367898511e-05, + "loss": 0.1275, + "num_input_tokens_seen": 1882400, + "step": 3065 + }, + { + "epoch": 0.8466629895201324, + "grad_norm": 5.767578601837158, + "learning_rate": 4.2319360176503036e-05, + "loss": 0.1441, + "num_input_tokens_seen": 1885600, + "step": 3070 + }, + { + "epoch": 0.8480419194704909, + "grad_norm": 5.428081035614014, + "learning_rate": 4.2388306674020964e-05, + "loss": 0.0915, + "num_input_tokens_seen": 1889888, + "step": 3075 + }, + { + "epoch": 0.8494208494208494, + "grad_norm": 1.1155694723129272, + "learning_rate": 4.2457253171538885e-05, + "loss": 0.0336, + "num_input_tokens_seen": 1892608, + "step": 3080 + }, + { + "epoch": 0.850799779371208, + "grad_norm": 2.7428200244903564, + "learning_rate": 4.2526199669056814e-05, + "loss": 0.0253, + "num_input_tokens_seen": 1895712, + "step": 3085 + }, + { + "epoch": 0.8521787093215665, + "grad_norm": 1.9959285259246826, + "learning_rate": 4.259514616657474e-05, + "loss": 0.0289, + "num_input_tokens_seen": 1898336, + "step": 3090 + }, + { + "epoch": 0.853557639271925, + "grad_norm": 5.383462905883789, + "learning_rate": 4.2664092664092664e-05, + "loss": 0.2095, + "num_input_tokens_seen": 1900736, + "step": 3095 + }, + { + "epoch": 0.8549365692222836, + "grad_norm": 0.13149206340312958, + "learning_rate": 4.273303916161059e-05, + "loss": 0.1293, + "num_input_tokens_seen": 1904032, + "step": 3100 + }, + { + "epoch": 0.856315499172642, + "grad_norm": 7.08669376373291, + "learning_rate": 4.280198565912852e-05, + "loss": 0.33, + "num_input_tokens_seen": 1906336, + "step": 3105 + }, + { + "epoch": 0.8576944291230005, + "grad_norm": 0.260258287191391, + "learning_rate": 4.287093215664644e-05, + "loss": 0.0933, + "num_input_tokens_seen": 1909280, + "step": 3110 + }, + { + "epoch": 0.859073359073359, + "grad_norm": 0.26446259021759033, + "learning_rate": 4.293987865416437e-05, + "loss": 0.0673, + "num_input_tokens_seen": 1913088, + "step": 3115 + }, + { + "epoch": 0.8604522890237176, + "grad_norm": 6.108292102813721, + "learning_rate": 4.30088251516823e-05, + "loss": 0.1607, + "num_input_tokens_seen": 1915808, + "step": 3120 + }, + { + "epoch": 0.8618312189740761, + "grad_norm": 11.180524826049805, + "learning_rate": 4.307777164920022e-05, + "loss": 0.0511, + "num_input_tokens_seen": 1918496, + "step": 3125 + }, + { + "epoch": 0.8632101489244346, + "grad_norm": 3.3553597927093506, + "learning_rate": 4.314671814671815e-05, + "loss": 0.11, + "num_input_tokens_seen": 1921440, + "step": 3130 + }, + { + "epoch": 0.8645890788747932, + "grad_norm": 2.089757204055786, + "learning_rate": 4.321566464423608e-05, + "loss": 0.2242, + "num_input_tokens_seen": 1924288, + "step": 3135 + }, + { + "epoch": 0.8659680088251517, + "grad_norm": 6.430854320526123, + "learning_rate": 4.3284611141754e-05, + "loss": 0.0813, + "num_input_tokens_seen": 1927456, + "step": 3140 + }, + { + "epoch": 0.8673469387755102, + "grad_norm": 0.19162870943546295, + "learning_rate": 4.335355763927193e-05, + "loss": 0.0871, + "num_input_tokens_seen": 1929440, + "step": 3145 + }, + { + "epoch": 0.8687258687258688, + "grad_norm": 1.8628618717193604, + "learning_rate": 4.3422504136789855e-05, + "loss": 0.1009, + "num_input_tokens_seen": 1932096, + "step": 3150 + }, + { + "epoch": 0.8701047986762273, + "grad_norm": 0.821291446685791, + "learning_rate": 4.349145063430778e-05, + "loss": 0.0738, + "num_input_tokens_seen": 1934592, + "step": 3155 + }, + { + "epoch": 0.8714837286265857, + "grad_norm": 5.123189449310303, + "learning_rate": 4.3560397131825705e-05, + "loss": 0.0627, + "num_input_tokens_seen": 1937504, + "step": 3160 + }, + { + "epoch": 0.8728626585769443, + "grad_norm": 0.26415953040122986, + "learning_rate": 4.3629343629343633e-05, + "loss": 0.0319, + "num_input_tokens_seen": 1940096, + "step": 3165 + }, + { + "epoch": 0.8742415885273028, + "grad_norm": 2.69034743309021, + "learning_rate": 4.3698290126861555e-05, + "loss": 0.105, + "num_input_tokens_seen": 1942880, + "step": 3170 + }, + { + "epoch": 0.8756205184776613, + "grad_norm": 0.305533230304718, + "learning_rate": 4.376723662437948e-05, + "loss": 0.1383, + "num_input_tokens_seen": 1945856, + "step": 3175 + }, + { + "epoch": 0.8769994484280199, + "grad_norm": 0.0549052394926548, + "learning_rate": 4.383618312189741e-05, + "loss": 0.049, + "num_input_tokens_seen": 1948864, + "step": 3180 + }, + { + "epoch": 0.8783783783783784, + "grad_norm": 0.08387865871191025, + "learning_rate": 4.390512961941533e-05, + "loss": 0.0074, + "num_input_tokens_seen": 1951776, + "step": 3185 + }, + { + "epoch": 0.8797573083287369, + "grad_norm": 0.158961683511734, + "learning_rate": 4.397407611693326e-05, + "loss": 0.1516, + "num_input_tokens_seen": 1954272, + "step": 3190 + }, + { + "epoch": 0.8811362382790954, + "grad_norm": 1.0043755769729614, + "learning_rate": 4.404302261445119e-05, + "loss": 0.1985, + "num_input_tokens_seen": 1957600, + "step": 3195 + }, + { + "epoch": 0.882515168229454, + "grad_norm": 0.22964155673980713, + "learning_rate": 4.411196911196911e-05, + "loss": 0.157, + "num_input_tokens_seen": 1960064, + "step": 3200 + }, + { + "epoch": 0.8838940981798125, + "grad_norm": 3.8040783405303955, + "learning_rate": 4.418091560948704e-05, + "loss": 0.1, + "num_input_tokens_seen": 1962688, + "step": 3205 + }, + { + "epoch": 0.885273028130171, + "grad_norm": 1.852252721786499, + "learning_rate": 4.424986210700497e-05, + "loss": 0.0794, + "num_input_tokens_seen": 1965472, + "step": 3210 + }, + { + "epoch": 0.8866519580805295, + "grad_norm": 2.620889902114868, + "learning_rate": 4.4318808604522897e-05, + "loss": 0.2556, + "num_input_tokens_seen": 1968288, + "step": 3215 + }, + { + "epoch": 0.888030888030888, + "grad_norm": 0.6540868878364563, + "learning_rate": 4.438775510204082e-05, + "loss": 0.0443, + "num_input_tokens_seen": 1970976, + "step": 3220 + }, + { + "epoch": 0.8894098179812465, + "grad_norm": 0.22831250727176666, + "learning_rate": 4.445670159955874e-05, + "loss": 0.0552, + "num_input_tokens_seen": 1974496, + "step": 3225 + }, + { + "epoch": 0.8907887479316051, + "grad_norm": 0.15296447277069092, + "learning_rate": 4.4525648097076675e-05, + "loss": 0.0106, + "num_input_tokens_seen": 1978240, + "step": 3230 + }, + { + "epoch": 0.8921676778819636, + "grad_norm": 6.2793684005737305, + "learning_rate": 4.4594594594594596e-05, + "loss": 0.128, + "num_input_tokens_seen": 1981056, + "step": 3235 + }, + { + "epoch": 0.8935466078323221, + "grad_norm": 1.392009973526001, + "learning_rate": 4.466354109211252e-05, + "loss": 0.1089, + "num_input_tokens_seen": 1984064, + "step": 3240 + }, + { + "epoch": 0.8949255377826807, + "grad_norm": 1.3649619817733765, + "learning_rate": 4.473248758963045e-05, + "loss": 0.0079, + "num_input_tokens_seen": 1987168, + "step": 3245 + }, + { + "epoch": 0.8963044677330392, + "grad_norm": 2.1653430461883545, + "learning_rate": 4.4801434087148375e-05, + "loss": 0.0365, + "num_input_tokens_seen": 1989600, + "step": 3250 + }, + { + "epoch": 0.8976833976833977, + "grad_norm": 0.11724536120891571, + "learning_rate": 4.4870380584666296e-05, + "loss": 0.1119, + "num_input_tokens_seen": 1992736, + "step": 3255 + }, + { + "epoch": 0.8990623276337562, + "grad_norm": 6.477216720581055, + "learning_rate": 4.493932708218423e-05, + "loss": 0.16, + "num_input_tokens_seen": 1995424, + "step": 3260 + }, + { + "epoch": 0.9004412575841148, + "grad_norm": 4.354042053222656, + "learning_rate": 4.500827357970215e-05, + "loss": 0.1595, + "num_input_tokens_seen": 1998272, + "step": 3265 + }, + { + "epoch": 0.9018201875344732, + "grad_norm": 2.2000417709350586, + "learning_rate": 4.507722007722008e-05, + "loss": 0.0822, + "num_input_tokens_seen": 2000896, + "step": 3270 + }, + { + "epoch": 0.9031991174848317, + "grad_norm": 1.768319845199585, + "learning_rate": 4.514616657473801e-05, + "loss": 0.053, + "num_input_tokens_seen": 2003296, + "step": 3275 + }, + { + "epoch": 0.9045780474351903, + "grad_norm": 0.03771506994962692, + "learning_rate": 4.521511307225593e-05, + "loss": 0.1324, + "num_input_tokens_seen": 2007776, + "step": 3280 + }, + { + "epoch": 0.9059569773855488, + "grad_norm": 0.052950892597436905, + "learning_rate": 4.528405956977386e-05, + "loss": 0.0163, + "num_input_tokens_seen": 2011360, + "step": 3285 + }, + { + "epoch": 0.9073359073359073, + "grad_norm": 7.677261829376221, + "learning_rate": 4.535300606729178e-05, + "loss": 0.1154, + "num_input_tokens_seen": 2013920, + "step": 3290 + }, + { + "epoch": 0.9087148372862659, + "grad_norm": 3.4474124908447266, + "learning_rate": 4.542195256480971e-05, + "loss": 0.0876, + "num_input_tokens_seen": 2017312, + "step": 3295 + }, + { + "epoch": 0.9100937672366244, + "grad_norm": 2.1592090129852295, + "learning_rate": 4.549089906232764e-05, + "loss": 0.1144, + "num_input_tokens_seen": 2021088, + "step": 3300 + }, + { + "epoch": 0.9114726971869829, + "grad_norm": 0.9739417433738708, + "learning_rate": 4.555984555984556e-05, + "loss": 0.0241, + "num_input_tokens_seen": 2023392, + "step": 3305 + }, + { + "epoch": 0.9128516271373415, + "grad_norm": 2.456094264984131, + "learning_rate": 4.562879205736349e-05, + "loss": 0.1306, + "num_input_tokens_seen": 2025792, + "step": 3310 + }, + { + "epoch": 0.9142305570877, + "grad_norm": 2.1156139373779297, + "learning_rate": 4.5697738554881416e-05, + "loss": 0.1324, + "num_input_tokens_seen": 2028800, + "step": 3315 + }, + { + "epoch": 0.9156094870380584, + "grad_norm": 0.20599760115146637, + "learning_rate": 4.576668505239934e-05, + "loss": 0.121, + "num_input_tokens_seen": 2031264, + "step": 3320 + }, + { + "epoch": 0.916988416988417, + "grad_norm": 0.21698591113090515, + "learning_rate": 4.5835631549917266e-05, + "loss": 0.0175, + "num_input_tokens_seen": 2033952, + "step": 3325 + }, + { + "epoch": 0.9183673469387755, + "grad_norm": 11.347099304199219, + "learning_rate": 4.5904578047435194e-05, + "loss": 0.0973, + "num_input_tokens_seen": 2037664, + "step": 3330 + }, + { + "epoch": 0.919746276889134, + "grad_norm": 11.48883056640625, + "learning_rate": 4.5973524544953116e-05, + "loss": 0.1038, + "num_input_tokens_seen": 2040800, + "step": 3335 + }, + { + "epoch": 0.9211252068394925, + "grad_norm": 0.0836859866976738, + "learning_rate": 4.6042471042471044e-05, + "loss": 0.1484, + "num_input_tokens_seen": 2044000, + "step": 3340 + }, + { + "epoch": 0.9225041367898511, + "grad_norm": 4.41023588180542, + "learning_rate": 4.611141753998897e-05, + "loss": 0.237, + "num_input_tokens_seen": 2046944, + "step": 3345 + }, + { + "epoch": 0.9238830667402096, + "grad_norm": 0.12101664394140244, + "learning_rate": 4.6180364037506894e-05, + "loss": 0.068, + "num_input_tokens_seen": 2049920, + "step": 3350 + }, + { + "epoch": 0.9252619966905681, + "grad_norm": 9.579550743103027, + "learning_rate": 4.624931053502482e-05, + "loss": 0.123, + "num_input_tokens_seen": 2052480, + "step": 3355 + }, + { + "epoch": 0.9266409266409267, + "grad_norm": 0.17656105756759644, + "learning_rate": 4.631825703254275e-05, + "loss": 0.2072, + "num_input_tokens_seen": 2055296, + "step": 3360 + }, + { + "epoch": 0.9280198565912852, + "grad_norm": 3.7781026363372803, + "learning_rate": 4.638720353006067e-05, + "loss": 0.2313, + "num_input_tokens_seen": 2058304, + "step": 3365 + }, + { + "epoch": 0.9293987865416437, + "grad_norm": 0.16722194850444794, + "learning_rate": 4.64561500275786e-05, + "loss": 0.1628, + "num_input_tokens_seen": 2061632, + "step": 3370 + }, + { + "epoch": 0.9307777164920022, + "grad_norm": 0.6639604568481445, + "learning_rate": 4.652509652509653e-05, + "loss": 0.049, + "num_input_tokens_seen": 2064800, + "step": 3375 + }, + { + "epoch": 0.9321566464423607, + "grad_norm": 2.87056827545166, + "learning_rate": 4.659404302261445e-05, + "loss": 0.0872, + "num_input_tokens_seen": 2068704, + "step": 3380 + }, + { + "epoch": 0.9335355763927192, + "grad_norm": 7.74490213394165, + "learning_rate": 4.666298952013238e-05, + "loss": 0.0972, + "num_input_tokens_seen": 2071232, + "step": 3385 + }, + { + "epoch": 0.9349145063430778, + "grad_norm": 2.927354097366333, + "learning_rate": 4.673193601765031e-05, + "loss": 0.1721, + "num_input_tokens_seen": 2073952, + "step": 3390 + }, + { + "epoch": 0.9362934362934363, + "grad_norm": 0.6767141222953796, + "learning_rate": 4.680088251516823e-05, + "loss": 0.0702, + "num_input_tokens_seen": 2077312, + "step": 3395 + }, + { + "epoch": 0.9376723662437948, + "grad_norm": 0.28995025157928467, + "learning_rate": 4.686982901268616e-05, + "loss": 0.0219, + "num_input_tokens_seen": 2079744, + "step": 3400 + }, + { + "epoch": 0.9390512961941533, + "grad_norm": 0.7139922976493835, + "learning_rate": 4.6938775510204086e-05, + "loss": 0.1867, + "num_input_tokens_seen": 2082304, + "step": 3405 + }, + { + "epoch": 0.9404302261445119, + "grad_norm": 2.951464891433716, + "learning_rate": 4.700772200772201e-05, + "loss": 0.1831, + "num_input_tokens_seen": 2085088, + "step": 3410 + }, + { + "epoch": 0.9418091560948704, + "grad_norm": 6.571165561676025, + "learning_rate": 4.7076668505239935e-05, + "loss": 0.0837, + "num_input_tokens_seen": 2088416, + "step": 3415 + }, + { + "epoch": 0.9431880860452289, + "grad_norm": 4.172509670257568, + "learning_rate": 4.7145615002757864e-05, + "loss": 0.178, + "num_input_tokens_seen": 2090560, + "step": 3420 + }, + { + "epoch": 0.9445670159955875, + "grad_norm": 0.8526307344436646, + "learning_rate": 4.721456150027579e-05, + "loss": 0.1069, + "num_input_tokens_seen": 2093600, + "step": 3425 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 5.525157928466797, + "learning_rate": 4.7283507997793714e-05, + "loss": 0.0232, + "num_input_tokens_seen": 2096480, + "step": 3430 + }, + { + "epoch": 0.9473248758963044, + "grad_norm": 0.5511594414710999, + "learning_rate": 4.7352454495311635e-05, + "loss": 0.0763, + "num_input_tokens_seen": 2100256, + "step": 3435 + }, + { + "epoch": 0.948703805846663, + "grad_norm": 2.747608184814453, + "learning_rate": 4.742140099282957e-05, + "loss": 0.0513, + "num_input_tokens_seen": 2102688, + "step": 3440 + }, + { + "epoch": 0.9500827357970215, + "grad_norm": 0.6155707836151123, + "learning_rate": 4.749034749034749e-05, + "loss": 0.0543, + "num_input_tokens_seen": 2105600, + "step": 3445 + }, + { + "epoch": 0.95146166574738, + "grad_norm": 14.20343017578125, + "learning_rate": 4.7559293987865414e-05, + "loss": 0.2143, + "num_input_tokens_seen": 2109088, + "step": 3450 + }, + { + "epoch": 0.9528405956977386, + "grad_norm": 0.052178654819726944, + "learning_rate": 4.762824048538335e-05, + "loss": 0.1158, + "num_input_tokens_seen": 2112640, + "step": 3455 + }, + { + "epoch": 0.9542195256480971, + "grad_norm": 1.5520071983337402, + "learning_rate": 4.769718698290127e-05, + "loss": 0.0134, + "num_input_tokens_seen": 2116416, + "step": 3460 + }, + { + "epoch": 0.9555984555984556, + "grad_norm": 0.8645073175430298, + "learning_rate": 4.776613348041919e-05, + "loss": 0.0421, + "num_input_tokens_seen": 2119968, + "step": 3465 + }, + { + "epoch": 0.9569773855488142, + "grad_norm": 1.423525094985962, + "learning_rate": 4.783507997793713e-05, + "loss": 0.0267, + "num_input_tokens_seen": 2122112, + "step": 3470 + }, + { + "epoch": 0.9583563154991727, + "grad_norm": 2.4390432834625244, + "learning_rate": 4.790402647545505e-05, + "loss": 0.0401, + "num_input_tokens_seen": 2125504, + "step": 3475 + }, + { + "epoch": 0.9597352454495311, + "grad_norm": 0.14393925666809082, + "learning_rate": 4.797297297297298e-05, + "loss": 0.1501, + "num_input_tokens_seen": 2129184, + "step": 3480 + }, + { + "epoch": 0.9611141753998896, + "grad_norm": 0.49890974164009094, + "learning_rate": 4.8041919470490905e-05, + "loss": 0.0529, + "num_input_tokens_seen": 2132032, + "step": 3485 + }, + { + "epoch": 0.9624931053502482, + "grad_norm": 0.014863296411931515, + "learning_rate": 4.811086596800883e-05, + "loss": 0.0824, + "num_input_tokens_seen": 2134912, + "step": 3490 + }, + { + "epoch": 0.9638720353006067, + "grad_norm": 4.668734073638916, + "learning_rate": 4.8179812465526755e-05, + "loss": 0.1632, + "num_input_tokens_seen": 2137440, + "step": 3495 + }, + { + "epoch": 0.9652509652509652, + "grad_norm": 1.221113920211792, + "learning_rate": 4.8248758963044683e-05, + "loss": 0.1682, + "num_input_tokens_seen": 2140320, + "step": 3500 + }, + { + "epoch": 0.9666298952013238, + "grad_norm": 1.02767014503479, + "learning_rate": 4.8317705460562605e-05, + "loss": 0.2664, + "num_input_tokens_seen": 2143520, + "step": 3505 + }, + { + "epoch": 0.9680088251516823, + "grad_norm": 0.19032083451747894, + "learning_rate": 4.838665195808053e-05, + "loss": 0.0146, + "num_input_tokens_seen": 2146720, + "step": 3510 + }, + { + "epoch": 0.9693877551020408, + "grad_norm": 5.418373107910156, + "learning_rate": 4.8455598455598455e-05, + "loss": 0.188, + "num_input_tokens_seen": 2149600, + "step": 3515 + }, + { + "epoch": 0.9707666850523994, + "grad_norm": 2.961867570877075, + "learning_rate": 4.852454495311638e-05, + "loss": 0.1504, + "num_input_tokens_seen": 2152192, + "step": 3520 + }, + { + "epoch": 0.9721456150027579, + "grad_norm": 0.11790712177753448, + "learning_rate": 4.859349145063431e-05, + "loss": 0.0829, + "num_input_tokens_seen": 2154688, + "step": 3525 + }, + { + "epoch": 0.9735245449531164, + "grad_norm": 9.806912422180176, + "learning_rate": 4.866243794815223e-05, + "loss": 0.0658, + "num_input_tokens_seen": 2157184, + "step": 3530 + }, + { + "epoch": 0.974903474903475, + "grad_norm": 0.1553216129541397, + "learning_rate": 4.873138444567016e-05, + "loss": 0.0645, + "num_input_tokens_seen": 2160480, + "step": 3535 + }, + { + "epoch": 0.9762824048538334, + "grad_norm": 0.9137212634086609, + "learning_rate": 4.880033094318809e-05, + "loss": 0.1199, + "num_input_tokens_seen": 2163840, + "step": 3540 + }, + { + "epoch": 0.9776613348041919, + "grad_norm": 0.17573145031929016, + "learning_rate": 4.886927744070601e-05, + "loss": 0.1398, + "num_input_tokens_seen": 2166496, + "step": 3545 + }, + { + "epoch": 0.9790402647545504, + "grad_norm": 0.2678185701370239, + "learning_rate": 4.893822393822394e-05, + "loss": 0.0437, + "num_input_tokens_seen": 2169024, + "step": 3550 + }, + { + "epoch": 0.980419194704909, + "grad_norm": 1.9389441013336182, + "learning_rate": 4.900717043574187e-05, + "loss": 0.0487, + "num_input_tokens_seen": 2173216, + "step": 3555 + }, + { + "epoch": 0.9817981246552675, + "grad_norm": 2.586686849594116, + "learning_rate": 4.907611693325979e-05, + "loss": 0.2, + "num_input_tokens_seen": 2175136, + "step": 3560 + }, + { + "epoch": 0.983177054605626, + "grad_norm": 1.882624864578247, + "learning_rate": 4.914506343077772e-05, + "loss": 0.168, + "num_input_tokens_seen": 2177888, + "step": 3565 + }, + { + "epoch": 0.9845559845559846, + "grad_norm": 3.771322011947632, + "learning_rate": 4.9214009928295646e-05, + "loss": 0.071, + "num_input_tokens_seen": 2180864, + "step": 3570 + }, + { + "epoch": 0.9859349145063431, + "grad_norm": 0.02487442083656788, + "learning_rate": 4.928295642581357e-05, + "loss": 0.0842, + "num_input_tokens_seen": 2186016, + "step": 3575 + }, + { + "epoch": 0.9873138444567016, + "grad_norm": 0.2796657383441925, + "learning_rate": 4.9351902923331496e-05, + "loss": 0.0499, + "num_input_tokens_seen": 2189632, + "step": 3580 + }, + { + "epoch": 0.9886927744070602, + "grad_norm": 2.0946149826049805, + "learning_rate": 4.9420849420849425e-05, + "loss": 0.1817, + "num_input_tokens_seen": 2193248, + "step": 3585 + }, + { + "epoch": 0.9900717043574186, + "grad_norm": 3.7578768730163574, + "learning_rate": 4.9489795918367346e-05, + "loss": 0.1132, + "num_input_tokens_seen": 2196064, + "step": 3590 + }, + { + "epoch": 0.9914506343077771, + "grad_norm": 0.028002532199025154, + "learning_rate": 4.9558742415885275e-05, + "loss": 0.0887, + "num_input_tokens_seen": 2198976, + "step": 3595 + }, + { + "epoch": 0.9928295642581357, + "grad_norm": 0.7373881936073303, + "learning_rate": 4.96276889134032e-05, + "loss": 0.1159, + "num_input_tokens_seen": 2202688, + "step": 3600 + }, + { + "epoch": 0.9942084942084942, + "grad_norm": 0.22176633775234222, + "learning_rate": 4.9696635410921124e-05, + "loss": 0.0423, + "num_input_tokens_seen": 2205632, + "step": 3605 + }, + { + "epoch": 0.9955874241588527, + "grad_norm": 9.497326850891113, + "learning_rate": 4.976558190843905e-05, + "loss": 0.1084, + "num_input_tokens_seen": 2208512, + "step": 3610 + }, + { + "epoch": 0.9969663541092113, + "grad_norm": 5.214473247528076, + "learning_rate": 4.983452840595698e-05, + "loss": 0.222, + "num_input_tokens_seen": 2211232, + "step": 3615 + }, + { + "epoch": 0.9983452840595698, + "grad_norm": 3.0118696689605713, + "learning_rate": 4.99034749034749e-05, + "loss": 0.2909, + "num_input_tokens_seen": 2214176, + "step": 3620 + }, + { + "epoch": 0.9997242140099283, + "grad_norm": 3.3293216228485107, + "learning_rate": 4.997242140099283e-05, + "loss": 0.0609, + "num_input_tokens_seen": 2216480, + "step": 3625 + }, + { + "epoch": 1.0, + "eval_loss": 0.08651281148195267, + "eval_runtime": 28.4777, + "eval_samples_per_second": 56.606, + "eval_steps_per_second": 14.151, + "num_input_tokens_seen": 2216600, + "step": 3626 + }, + { + "epoch": 1.0011031439602869, + "grad_norm": 1.7720942497253418, + "learning_rate": 4.9999998957414585e-05, + "loss": 0.0502, + "num_input_tokens_seen": 2219160, + "step": 3630 + }, + { + "epoch": 1.0024820739106453, + "grad_norm": 2.659674644470215, + "learning_rate": 4.999999258605956e-05, + "loss": 0.0817, + "num_input_tokens_seen": 2221656, + "step": 3635 + }, + { + "epoch": 1.0038610038610039, + "grad_norm": 1.8885247707366943, + "learning_rate": 4.999998042256513e-05, + "loss": 0.0936, + "num_input_tokens_seen": 2225080, + "step": 3640 + }, + { + "epoch": 1.0052399338113625, + "grad_norm": 2.8303701877593994, + "learning_rate": 4.999996246693409e-05, + "loss": 0.1601, + "num_input_tokens_seen": 2228472, + "step": 3645 + }, + { + "epoch": 1.0066188637617208, + "grad_norm": 0.23795755207538605, + "learning_rate": 4.99999387191706e-05, + "loss": 0.0662, + "num_input_tokens_seen": 2231992, + "step": 3650 + }, + { + "epoch": 1.0079977937120794, + "grad_norm": 0.08700364828109741, + "learning_rate": 4.999990917928017e-05, + "loss": 0.1222, + "num_input_tokens_seen": 2235608, + "step": 3655 + }, + { + "epoch": 1.009376723662438, + "grad_norm": 3.6544244289398193, + "learning_rate": 4.9999873847269644e-05, + "loss": 0.1575, + "num_input_tokens_seen": 2238616, + "step": 3660 + }, + { + "epoch": 1.0107556536127964, + "grad_norm": 0.2387118935585022, + "learning_rate": 4.999983272314721e-05, + "loss": 0.0779, + "num_input_tokens_seen": 2242648, + "step": 3665 + }, + { + "epoch": 1.012134583563155, + "grad_norm": 0.15321312844753265, + "learning_rate": 4.999978580692238e-05, + "loss": 0.0741, + "num_input_tokens_seen": 2245240, + "step": 3670 + }, + { + "epoch": 1.0135135135135136, + "grad_norm": 0.28849002718925476, + "learning_rate": 4.999973309860605e-05, + "loss": 0.0431, + "num_input_tokens_seen": 2249080, + "step": 3675 + }, + { + "epoch": 1.014892443463872, + "grad_norm": 0.5908432602882385, + "learning_rate": 4.9999674598210416e-05, + "loss": 0.0659, + "num_input_tokens_seen": 2252120, + "step": 3680 + }, + { + "epoch": 1.0162713734142306, + "grad_norm": 0.1667693853378296, + "learning_rate": 4.9999610305749026e-05, + "loss": 0.0693, + "num_input_tokens_seen": 2255768, + "step": 3685 + }, + { + "epoch": 1.0176503033645892, + "grad_norm": 0.309799462556839, + "learning_rate": 4.999954022123679e-05, + "loss": 0.0206, + "num_input_tokens_seen": 2259320, + "step": 3690 + }, + { + "epoch": 1.0190292333149475, + "grad_norm": 0.36609575152397156, + "learning_rate": 4.999946434468994e-05, + "loss": 0.037, + "num_input_tokens_seen": 2262776, + "step": 3695 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 0.3032747805118561, + "learning_rate": 4.999938267612605e-05, + "loss": 0.118, + "num_input_tokens_seen": 2265400, + "step": 3700 + }, + { + "epoch": 1.0217870932156647, + "grad_norm": 0.5489907264709473, + "learning_rate": 4.9999295215564044e-05, + "loss": 0.0683, + "num_input_tokens_seen": 2268120, + "step": 3705 + }, + { + "epoch": 1.0231660231660231, + "grad_norm": 0.049612000584602356, + "learning_rate": 4.999920196302419e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2271096, + "step": 3710 + }, + { + "epoch": 1.0245449531163817, + "grad_norm": 0.11705705523490906, + "learning_rate": 4.9999102918528084e-05, + "loss": 0.0155, + "num_input_tokens_seen": 2273624, + "step": 3715 + }, + { + "epoch": 1.0259238830667403, + "grad_norm": 10.801252365112305, + "learning_rate": 4.999899808209868e-05, + "loss": 0.1544, + "num_input_tokens_seen": 2275864, + "step": 3720 + }, + { + "epoch": 1.0273028130170987, + "grad_norm": 5.185319900512695, + "learning_rate": 4.999888745376028e-05, + "loss": 0.0775, + "num_input_tokens_seen": 2279032, + "step": 3725 + }, + { + "epoch": 1.0286817429674573, + "grad_norm": 0.011336639523506165, + "learning_rate": 4.999877103353849e-05, + "loss": 0.0506, + "num_input_tokens_seen": 2282072, + "step": 3730 + }, + { + "epoch": 1.0300606729178157, + "grad_norm": 0.0012662335066124797, + "learning_rate": 4.9998648821460295e-05, + "loss": 0.0221, + "num_input_tokens_seen": 2286680, + "step": 3735 + }, + { + "epoch": 1.0314396028681743, + "grad_norm": 0.0070986500941216946, + "learning_rate": 4.999852081755401e-05, + "loss": 0.0764, + "num_input_tokens_seen": 2290136, + "step": 3740 + }, + { + "epoch": 1.0328185328185329, + "grad_norm": 2.1524527072906494, + "learning_rate": 4.999838702184929e-05, + "loss": 0.1169, + "num_input_tokens_seen": 2293016, + "step": 3745 + }, + { + "epoch": 1.0341974627688912, + "grad_norm": 0.035601720213890076, + "learning_rate": 4.9998247434377144e-05, + "loss": 0.0405, + "num_input_tokens_seen": 2295960, + "step": 3750 + }, + { + "epoch": 1.0355763927192498, + "grad_norm": 0.04994907230138779, + "learning_rate": 4.9998102055169896e-05, + "loss": 0.0993, + "num_input_tokens_seen": 2298808, + "step": 3755 + }, + { + "epoch": 1.0369553226696084, + "grad_norm": 2.1750614643096924, + "learning_rate": 4.999795088426123e-05, + "loss": 0.0596, + "num_input_tokens_seen": 2301848, + "step": 3760 + }, + { + "epoch": 1.0383342526199668, + "grad_norm": 2.996490240097046, + "learning_rate": 4.9997793921686174e-05, + "loss": 0.043, + "num_input_tokens_seen": 2303928, + "step": 3765 + }, + { + "epoch": 1.0397131825703254, + "grad_norm": 0.06916601210832596, + "learning_rate": 4.999763116748111e-05, + "loss": 0.0035, + "num_input_tokens_seen": 2307256, + "step": 3770 + }, + { + "epoch": 1.041092112520684, + "grad_norm": 4.176510810852051, + "learning_rate": 4.9997462621683714e-05, + "loss": 0.1608, + "num_input_tokens_seen": 2310200, + "step": 3775 + }, + { + "epoch": 1.0424710424710424, + "grad_norm": 5.156945705413818, + "learning_rate": 4.999728828433307e-05, + "loss": 0.1069, + "num_input_tokens_seen": 2312856, + "step": 3780 + }, + { + "epoch": 1.043849972421401, + "grad_norm": 3.751439332962036, + "learning_rate": 4.9997108155469534e-05, + "loss": 0.0576, + "num_input_tokens_seen": 2315544, + "step": 3785 + }, + { + "epoch": 1.0452289023717596, + "grad_norm": 0.13419796526432037, + "learning_rate": 4.999692223513487e-05, + "loss": 0.0653, + "num_input_tokens_seen": 2318616, + "step": 3790 + }, + { + "epoch": 1.046607832322118, + "grad_norm": 0.0119151771068573, + "learning_rate": 4.9996730523372135e-05, + "loss": 0.0045, + "num_input_tokens_seen": 2321400, + "step": 3795 + }, + { + "epoch": 1.0479867622724766, + "grad_norm": 1.0450764894485474, + "learning_rate": 4.999653302022575e-05, + "loss": 0.075, + "num_input_tokens_seen": 2323960, + "step": 3800 + }, + { + "epoch": 1.0493656922228352, + "grad_norm": 0.058475594967603683, + "learning_rate": 4.9996329725741475e-05, + "loss": 0.0892, + "num_input_tokens_seen": 2326872, + "step": 3805 + }, + { + "epoch": 1.0507446221731935, + "grad_norm": 5.107258319854736, + "learning_rate": 4.9996120639966414e-05, + "loss": 0.0624, + "num_input_tokens_seen": 2330200, + "step": 3810 + }, + { + "epoch": 1.0521235521235521, + "grad_norm": 0.2618681490421295, + "learning_rate": 4.9995905762949004e-05, + "loss": 0.0387, + "num_input_tokens_seen": 2333752, + "step": 3815 + }, + { + "epoch": 1.0535024820739107, + "grad_norm": 0.031077412888407707, + "learning_rate": 4.999568509473903e-05, + "loss": 0.0396, + "num_input_tokens_seen": 2337016, + "step": 3820 + }, + { + "epoch": 1.054881412024269, + "grad_norm": 0.03726869076490402, + "learning_rate": 4.999545863538762e-05, + "loss": 0.0838, + "num_input_tokens_seen": 2339352, + "step": 3825 + }, + { + "epoch": 1.0562603419746277, + "grad_norm": 0.013950329273939133, + "learning_rate": 4.999522638494724e-05, + "loss": 0.0854, + "num_input_tokens_seen": 2342360, + "step": 3830 + }, + { + "epoch": 1.0576392719249863, + "grad_norm": 0.02913321554660797, + "learning_rate": 4.9994988343471696e-05, + "loss": 0.0622, + "num_input_tokens_seen": 2344792, + "step": 3835 + }, + { + "epoch": 1.0590182018753447, + "grad_norm": 3.397449254989624, + "learning_rate": 4.999474451101615e-05, + "loss": 0.1504, + "num_input_tokens_seen": 2347832, + "step": 3840 + }, + { + "epoch": 1.0603971318257033, + "grad_norm": 0.09227946400642395, + "learning_rate": 4.999449488763707e-05, + "loss": 0.0536, + "num_input_tokens_seen": 2350616, + "step": 3845 + }, + { + "epoch": 1.0617760617760619, + "grad_norm": 0.06655764579772949, + "learning_rate": 4.999423947339231e-05, + "loss": 0.0422, + "num_input_tokens_seen": 2353304, + "step": 3850 + }, + { + "epoch": 1.0631549917264203, + "grad_norm": 0.10718357563018799, + "learning_rate": 4.999397826834105e-05, + "loss": 0.1291, + "num_input_tokens_seen": 2356024, + "step": 3855 + }, + { + "epoch": 1.0645339216767788, + "grad_norm": 0.07653839886188507, + "learning_rate": 4.99937112725438e-05, + "loss": 0.1656, + "num_input_tokens_seen": 2358360, + "step": 3860 + }, + { + "epoch": 1.0659128516271372, + "grad_norm": 2.3448965549468994, + "learning_rate": 4.9993438486062415e-05, + "loss": 0.0191, + "num_input_tokens_seen": 2360984, + "step": 3865 + }, + { + "epoch": 1.0672917815774958, + "grad_norm": 5.8264265060424805, + "learning_rate": 4.99931599089601e-05, + "loss": 0.1064, + "num_input_tokens_seen": 2363224, + "step": 3870 + }, + { + "epoch": 1.0686707115278544, + "grad_norm": 0.16936548054218292, + "learning_rate": 4.9992875541301384e-05, + "loss": 0.0491, + "num_input_tokens_seen": 2366136, + "step": 3875 + }, + { + "epoch": 1.0700496414782128, + "grad_norm": 0.9294981956481934, + "learning_rate": 4.999258538315218e-05, + "loss": 0.0457, + "num_input_tokens_seen": 2369496, + "step": 3880 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 6.5319132804870605, + "learning_rate": 4.999228943457969e-05, + "loss": 0.056, + "num_input_tokens_seen": 2372152, + "step": 3885 + }, + { + "epoch": 1.07280750137893, + "grad_norm": 7.832918167114258, + "learning_rate": 4.999198769565249e-05, + "loss": 0.1511, + "num_input_tokens_seen": 2375768, + "step": 3890 + }, + { + "epoch": 1.0741864313292884, + "grad_norm": 7.347504615783691, + "learning_rate": 4.9991680166440486e-05, + "loss": 0.1148, + "num_input_tokens_seen": 2378424, + "step": 3895 + }, + { + "epoch": 1.075565361279647, + "grad_norm": 3.8901894092559814, + "learning_rate": 4.9991366847014927e-05, + "loss": 0.1846, + "num_input_tokens_seen": 2381272, + "step": 3900 + }, + { + "epoch": 1.0769442912300056, + "grad_norm": 0.12902742624282837, + "learning_rate": 4.9991047737448403e-05, + "loss": 0.1026, + "num_input_tokens_seen": 2383672, + "step": 3905 + }, + { + "epoch": 1.078323221180364, + "grad_norm": 2.2081663608551025, + "learning_rate": 4.999072283781486e-05, + "loss": 0.0309, + "num_input_tokens_seen": 2386680, + "step": 3910 + }, + { + "epoch": 1.0797021511307225, + "grad_norm": 8.79592227935791, + "learning_rate": 4.999039214818955e-05, + "loss": 0.1569, + "num_input_tokens_seen": 2388824, + "step": 3915 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 8.86937427520752, + "learning_rate": 4.999005566864911e-05, + "loss": 0.2135, + "num_input_tokens_seen": 2392440, + "step": 3920 + }, + { + "epoch": 1.0824600110314395, + "grad_norm": 0.09051109105348587, + "learning_rate": 4.9989713399271486e-05, + "loss": 0.065, + "num_input_tokens_seen": 2395544, + "step": 3925 + }, + { + "epoch": 1.0838389409817981, + "grad_norm": 0.10981129854917526, + "learning_rate": 4.9989365340135984e-05, + "loss": 0.0416, + "num_input_tokens_seen": 2398040, + "step": 3930 + }, + { + "epoch": 1.0852178709321567, + "grad_norm": 0.4449312090873718, + "learning_rate": 4.998901149132324e-05, + "loss": 0.1198, + "num_input_tokens_seen": 2400216, + "step": 3935 + }, + { + "epoch": 1.086596800882515, + "grad_norm": 0.18507373332977295, + "learning_rate": 4.9988651852915236e-05, + "loss": 0.0285, + "num_input_tokens_seen": 2402456, + "step": 3940 + }, + { + "epoch": 1.0879757308328737, + "grad_norm": 2.2554244995117188, + "learning_rate": 4.998828642499529e-05, + "loss": 0.1247, + "num_input_tokens_seen": 2404632, + "step": 3945 + }, + { + "epoch": 1.0893546607832323, + "grad_norm": 1.9692541360855103, + "learning_rate": 4.998791520764808e-05, + "loss": 0.0497, + "num_input_tokens_seen": 2407960, + "step": 3950 + }, + { + "epoch": 1.0907335907335907, + "grad_norm": 0.08695077896118164, + "learning_rate": 4.9987538200959596e-05, + "loss": 0.0217, + "num_input_tokens_seen": 2410552, + "step": 3955 + }, + { + "epoch": 1.0921125206839493, + "grad_norm": 4.881943225860596, + "learning_rate": 4.99871554050172e-05, + "loss": 0.343, + "num_input_tokens_seen": 2412888, + "step": 3960 + }, + { + "epoch": 1.0934914506343079, + "grad_norm": 3.6716673374176025, + "learning_rate": 4.998676681990957e-05, + "loss": 0.085, + "num_input_tokens_seen": 2415064, + "step": 3965 + }, + { + "epoch": 1.0948703805846662, + "grad_norm": 0.4009288549423218, + "learning_rate": 4.998637244572674e-05, + "loss": 0.0627, + "num_input_tokens_seen": 2418072, + "step": 3970 + }, + { + "epoch": 1.0962493105350248, + "grad_norm": 0.3831866681575775, + "learning_rate": 4.998597228256007e-05, + "loss": 0.0614, + "num_input_tokens_seen": 2420440, + "step": 3975 + }, + { + "epoch": 1.0976282404853834, + "grad_norm": 0.7016960382461548, + "learning_rate": 4.9985566330502286e-05, + "loss": 0.051, + "num_input_tokens_seen": 2424568, + "step": 3980 + }, + { + "epoch": 1.0990071704357418, + "grad_norm": 7.662075042724609, + "learning_rate": 4.998515458964744e-05, + "loss": 0.1378, + "num_input_tokens_seen": 2427544, + "step": 3985 + }, + { + "epoch": 1.1003861003861004, + "grad_norm": 0.25920358300209045, + "learning_rate": 4.998473706009093e-05, + "loss": 0.0035, + "num_input_tokens_seen": 2430232, + "step": 3990 + }, + { + "epoch": 1.101765030336459, + "grad_norm": 0.18613605201244354, + "learning_rate": 4.9984313741929464e-05, + "loss": 0.0207, + "num_input_tokens_seen": 2432792, + "step": 3995 + }, + { + "epoch": 1.1031439602868174, + "grad_norm": 4.985142230987549, + "learning_rate": 4.9983884635261155e-05, + "loss": 0.3501, + "num_input_tokens_seen": 2435032, + "step": 4000 + }, + { + "epoch": 1.104522890237176, + "grad_norm": 5.503170967102051, + "learning_rate": 4.99834497401854e-05, + "loss": 0.1809, + "num_input_tokens_seen": 2437528, + "step": 4005 + }, + { + "epoch": 1.1059018201875346, + "grad_norm": 0.12521497905254364, + "learning_rate": 4.998300905680297e-05, + "loss": 0.0532, + "num_input_tokens_seen": 2440696, + "step": 4010 + }, + { + "epoch": 1.107280750137893, + "grad_norm": 4.682441234588623, + "learning_rate": 4.998256258521595e-05, + "loss": 0.0929, + "num_input_tokens_seen": 2443704, + "step": 4015 + }, + { + "epoch": 1.1086596800882516, + "grad_norm": 0.14358292520046234, + "learning_rate": 4.998211032552779e-05, + "loss": 0.018, + "num_input_tokens_seen": 2447704, + "step": 4020 + }, + { + "epoch": 1.1100386100386102, + "grad_norm": 1.78728449344635, + "learning_rate": 4.998165227784327e-05, + "loss": 0.0579, + "num_input_tokens_seen": 2451480, + "step": 4025 + }, + { + "epoch": 1.1114175399889685, + "grad_norm": 0.13231167197227478, + "learning_rate": 4.9981188442268525e-05, + "loss": 0.1361, + "num_input_tokens_seen": 2454680, + "step": 4030 + }, + { + "epoch": 1.1127964699393271, + "grad_norm": 0.2525733709335327, + "learning_rate": 4.9980718818910996e-05, + "loss": 0.0506, + "num_input_tokens_seen": 2458584, + "step": 4035 + }, + { + "epoch": 1.1141753998896855, + "grad_norm": 4.3362579345703125, + "learning_rate": 4.9980243407879504e-05, + "loss": 0.064, + "num_input_tokens_seen": 2461336, + "step": 4040 + }, + { + "epoch": 1.115554329840044, + "grad_norm": 0.06165822222828865, + "learning_rate": 4.99797622092842e-05, + "loss": 0.0064, + "num_input_tokens_seen": 2464728, + "step": 4045 + }, + { + "epoch": 1.1169332597904027, + "grad_norm": 0.06382539868354797, + "learning_rate": 4.997927522323655e-05, + "loss": 0.0694, + "num_input_tokens_seen": 2467544, + "step": 4050 + }, + { + "epoch": 1.118312189740761, + "grad_norm": 0.011654861271381378, + "learning_rate": 4.99787824498494e-05, + "loss": 0.0818, + "num_input_tokens_seen": 2470200, + "step": 4055 + }, + { + "epoch": 1.1196911196911197, + "grad_norm": 1.158982276916504, + "learning_rate": 4.997828388923691e-05, + "loss": 0.0739, + "num_input_tokens_seen": 2473272, + "step": 4060 + }, + { + "epoch": 1.1210700496414783, + "grad_norm": 0.057780299335718155, + "learning_rate": 4.997777954151459e-05, + "loss": 0.0332, + "num_input_tokens_seen": 2477016, + "step": 4065 + }, + { + "epoch": 1.1224489795918366, + "grad_norm": 0.1772746443748474, + "learning_rate": 4.99772694067993e-05, + "loss": 0.0033, + "num_input_tokens_seen": 2480856, + "step": 4070 + }, + { + "epoch": 1.1238279095421952, + "grad_norm": 0.08907276391983032, + "learning_rate": 4.9976753485209214e-05, + "loss": 0.158, + "num_input_tokens_seen": 2484248, + "step": 4075 + }, + { + "epoch": 1.1252068394925538, + "grad_norm": 0.06290222704410553, + "learning_rate": 4.997623177686388e-05, + "loss": 0.0142, + "num_input_tokens_seen": 2486936, + "step": 4080 + }, + { + "epoch": 1.1265857694429122, + "grad_norm": 6.0627264976501465, + "learning_rate": 4.9975704281884156e-05, + "loss": 0.076, + "num_input_tokens_seen": 2489944, + "step": 4085 + }, + { + "epoch": 1.1279646993932708, + "grad_norm": 11.323451042175293, + "learning_rate": 4.997517100039226e-05, + "loss": 0.0696, + "num_input_tokens_seen": 2492632, + "step": 4090 + }, + { + "epoch": 1.1293436293436294, + "grad_norm": 0.014708862639963627, + "learning_rate": 4.997463193251175e-05, + "loss": 0.1195, + "num_input_tokens_seen": 2494968, + "step": 4095 + }, + { + "epoch": 1.1307225592939878, + "grad_norm": 0.1663651168346405, + "learning_rate": 4.997408707836752e-05, + "loss": 0.0539, + "num_input_tokens_seen": 2499640, + "step": 4100 + }, + { + "epoch": 1.1321014892443464, + "grad_norm": 1.3173203468322754, + "learning_rate": 4.9973536438085796e-05, + "loss": 0.0924, + "num_input_tokens_seen": 2502648, + "step": 4105 + }, + { + "epoch": 1.133480419194705, + "grad_norm": 0.009554261341691017, + "learning_rate": 4.997298001179417e-05, + "loss": 0.0557, + "num_input_tokens_seen": 2505400, + "step": 4110 + }, + { + "epoch": 1.1348593491450634, + "grad_norm": 8.418587684631348, + "learning_rate": 4.997241779962155e-05, + "loss": 0.0612, + "num_input_tokens_seen": 2508344, + "step": 4115 + }, + { + "epoch": 1.136238279095422, + "grad_norm": 0.10009825229644775, + "learning_rate": 4.997184980169818e-05, + "loss": 0.0308, + "num_input_tokens_seen": 2511064, + "step": 4120 + }, + { + "epoch": 1.1376172090457806, + "grad_norm": 0.07224704325199127, + "learning_rate": 4.9971276018155665e-05, + "loss": 0.0028, + "num_input_tokens_seen": 2514904, + "step": 4125 + }, + { + "epoch": 1.138996138996139, + "grad_norm": 1.147613286972046, + "learning_rate": 4.997069644912695e-05, + "loss": 0.1425, + "num_input_tokens_seen": 2517048, + "step": 4130 + }, + { + "epoch": 1.1403750689464975, + "grad_norm": 11.448442459106445, + "learning_rate": 4.9970111094746315e-05, + "loss": 0.0177, + "num_input_tokens_seen": 2519768, + "step": 4135 + }, + { + "epoch": 1.141753998896856, + "grad_norm": 0.2978641986846924, + "learning_rate": 4.9969519955149366e-05, + "loss": 0.0437, + "num_input_tokens_seen": 2522584, + "step": 4140 + }, + { + "epoch": 1.1431329288472145, + "grad_norm": 7.999095439910889, + "learning_rate": 4.996892303047306e-05, + "loss": 0.1647, + "num_input_tokens_seen": 2525912, + "step": 4145 + }, + { + "epoch": 1.1445118587975731, + "grad_norm": 0.10396327078342438, + "learning_rate": 4.996832032085571e-05, + "loss": 0.0699, + "num_input_tokens_seen": 2528312, + "step": 4150 + }, + { + "epoch": 1.1458907887479315, + "grad_norm": 0.05002453178167343, + "learning_rate": 4.9967711826436944e-05, + "loss": 0.0092, + "num_input_tokens_seen": 2531032, + "step": 4155 + }, + { + "epoch": 1.14726971869829, + "grad_norm": 6.874320983886719, + "learning_rate": 4.996709754735774e-05, + "loss": 0.1605, + "num_input_tokens_seen": 2533016, + "step": 4160 + }, + { + "epoch": 1.1486486486486487, + "grad_norm": 6.4871392250061035, + "learning_rate": 4.996647748376043e-05, + "loss": 0.2101, + "num_input_tokens_seen": 2535544, + "step": 4165 + }, + { + "epoch": 1.150027578599007, + "grad_norm": 9.55147647857666, + "learning_rate": 4.9965851635788664e-05, + "loss": 0.137, + "num_input_tokens_seen": 2538264, + "step": 4170 + }, + { + "epoch": 1.1514065085493657, + "grad_norm": 0.04352531582117081, + "learning_rate": 4.9965220003587444e-05, + "loss": 0.2107, + "num_input_tokens_seen": 2540952, + "step": 4175 + }, + { + "epoch": 1.1527854384997243, + "grad_norm": 0.2565387785434723, + "learning_rate": 4.996458258730311e-05, + "loss": 0.0471, + "num_input_tokens_seen": 2544088, + "step": 4180 + }, + { + "epoch": 1.1541643684500826, + "grad_norm": 0.032098591327667236, + "learning_rate": 4.996393938708335e-05, + "loss": 0.0409, + "num_input_tokens_seen": 2546712, + "step": 4185 + }, + { + "epoch": 1.1555432984004412, + "grad_norm": 3.3605294227600098, + "learning_rate": 4.9963290403077165e-05, + "loss": 0.112, + "num_input_tokens_seen": 2548824, + "step": 4190 + }, + { + "epoch": 1.1569222283507998, + "grad_norm": 12.226232528686523, + "learning_rate": 4.996263563543493e-05, + "loss": 0.0507, + "num_input_tokens_seen": 2552120, + "step": 4195 + }, + { + "epoch": 1.1583011583011582, + "grad_norm": 3.6424951553344727, + "learning_rate": 4.9961975084308346e-05, + "loss": 0.0613, + "num_input_tokens_seen": 2555032, + "step": 4200 + }, + { + "epoch": 1.1596800882515168, + "grad_norm": 0.07803042232990265, + "learning_rate": 4.996130874985045e-05, + "loss": 0.0519, + "num_input_tokens_seen": 2557592, + "step": 4205 + }, + { + "epoch": 1.1610590182018754, + "grad_norm": 0.0855921059846878, + "learning_rate": 4.9960636632215616e-05, + "loss": 0.0239, + "num_input_tokens_seen": 2559672, + "step": 4210 + }, + { + "epoch": 1.1624379481522338, + "grad_norm": 5.020890712738037, + "learning_rate": 4.995995873155958e-05, + "loss": 0.1275, + "num_input_tokens_seen": 2563384, + "step": 4215 + }, + { + "epoch": 1.1638168781025924, + "grad_norm": 8.424508094787598, + "learning_rate": 4.995927504803939e-05, + "loss": 0.0847, + "num_input_tokens_seen": 2566648, + "step": 4220 + }, + { + "epoch": 1.165195808052951, + "grad_norm": 6.383438587188721, + "learning_rate": 4.995858558181344e-05, + "loss": 0.2501, + "num_input_tokens_seen": 2569080, + "step": 4225 + }, + { + "epoch": 1.1665747380033094, + "grad_norm": 0.08036423474550247, + "learning_rate": 4.995789033304148e-05, + "loss": 0.0747, + "num_input_tokens_seen": 2572472, + "step": 4230 + }, + { + "epoch": 1.167953667953668, + "grad_norm": 0.158022940158844, + "learning_rate": 4.995718930188458e-05, + "loss": 0.0109, + "num_input_tokens_seen": 2575192, + "step": 4235 + }, + { + "epoch": 1.1693325979040265, + "grad_norm": 0.03487376123666763, + "learning_rate": 4.995648248850518e-05, + "loss": 0.018, + "num_input_tokens_seen": 2577816, + "step": 4240 + }, + { + "epoch": 1.170711527854385, + "grad_norm": 8.034502029418945, + "learning_rate": 4.995576989306701e-05, + "loss": 0.0529, + "num_input_tokens_seen": 2579928, + "step": 4245 + }, + { + "epoch": 1.1720904578047435, + "grad_norm": 0.14811156690120697, + "learning_rate": 4.9955051515735185e-05, + "loss": 0.0507, + "num_input_tokens_seen": 2582936, + "step": 4250 + }, + { + "epoch": 1.1734693877551021, + "grad_norm": 0.3924770653247833, + "learning_rate": 4.995432735667613e-05, + "loss": 0.0465, + "num_input_tokens_seen": 2586776, + "step": 4255 + }, + { + "epoch": 1.1748483177054605, + "grad_norm": 0.061028704047203064, + "learning_rate": 4.995359741605765e-05, + "loss": 0.0945, + "num_input_tokens_seen": 2589144, + "step": 4260 + }, + { + "epoch": 1.176227247655819, + "grad_norm": 19.11005210876465, + "learning_rate": 4.995286169404884e-05, + "loss": 0.0322, + "num_input_tokens_seen": 2591672, + "step": 4265 + }, + { + "epoch": 1.1776061776061777, + "grad_norm": 0.009095538407564163, + "learning_rate": 4.9952120190820145e-05, + "loss": 0.028, + "num_input_tokens_seen": 2595320, + "step": 4270 + }, + { + "epoch": 1.178985107556536, + "grad_norm": 0.4115773141384125, + "learning_rate": 4.995137290654338e-05, + "loss": 0.002, + "num_input_tokens_seen": 2598424, + "step": 4275 + }, + { + "epoch": 1.1803640375068947, + "grad_norm": 0.022073378786444664, + "learning_rate": 4.995061984139168e-05, + "loss": 0.1002, + "num_input_tokens_seen": 2601016, + "step": 4280 + }, + { + "epoch": 1.1817429674572533, + "grad_norm": 0.016468945890665054, + "learning_rate": 4.994986099553952e-05, + "loss": 0.0795, + "num_input_tokens_seen": 2604408, + "step": 4285 + }, + { + "epoch": 1.1831218974076116, + "grad_norm": 0.05102419853210449, + "learning_rate": 4.9949096369162696e-05, + "loss": 0.1222, + "num_input_tokens_seen": 2609112, + "step": 4290 + }, + { + "epoch": 1.1845008273579702, + "grad_norm": 2.754512310028076, + "learning_rate": 4.994832596243838e-05, + "loss": 0.0983, + "num_input_tokens_seen": 2611192, + "step": 4295 + }, + { + "epoch": 1.1858797573083288, + "grad_norm": 8.860036849975586, + "learning_rate": 4.994754977554506e-05, + "loss": 0.1113, + "num_input_tokens_seen": 2614392, + "step": 4300 + }, + { + "epoch": 1.1872586872586872, + "grad_norm": 0.039162930101156235, + "learning_rate": 4.9946767808662555e-05, + "loss": 0.0049, + "num_input_tokens_seen": 2617464, + "step": 4305 + }, + { + "epoch": 1.1886376172090458, + "grad_norm": 6.7508344650268555, + "learning_rate": 4.9945980061972045e-05, + "loss": 0.0058, + "num_input_tokens_seen": 2620632, + "step": 4310 + }, + { + "epoch": 1.1900165471594044, + "grad_norm": 7.114245891571045, + "learning_rate": 4.9945186535656046e-05, + "loss": 0.0543, + "num_input_tokens_seen": 2623864, + "step": 4315 + }, + { + "epoch": 1.1913954771097628, + "grad_norm": 0.1306820511817932, + "learning_rate": 4.99443872298984e-05, + "loss": 0.169, + "num_input_tokens_seen": 2626616, + "step": 4320 + }, + { + "epoch": 1.1927744070601214, + "grad_norm": 0.06917699426412582, + "learning_rate": 4.994358214488429e-05, + "loss": 0.0436, + "num_input_tokens_seen": 2629080, + "step": 4325 + }, + { + "epoch": 1.19415333701048, + "grad_norm": 3.8183417320251465, + "learning_rate": 4.994277128080026e-05, + "loss": 0.0891, + "num_input_tokens_seen": 2631512, + "step": 4330 + }, + { + "epoch": 1.1955322669608384, + "grad_norm": 11.525378227233887, + "learning_rate": 4.994195463783416e-05, + "loss": 0.154, + "num_input_tokens_seen": 2634616, + "step": 4335 + }, + { + "epoch": 1.196911196911197, + "grad_norm": 6.822160720825195, + "learning_rate": 4.9941132216175184e-05, + "loss": 0.0822, + "num_input_tokens_seen": 2637464, + "step": 4340 + }, + { + "epoch": 1.1982901268615553, + "grad_norm": 5.738264083862305, + "learning_rate": 4.9940304016013894e-05, + "loss": 0.2401, + "num_input_tokens_seen": 2639960, + "step": 4345 + }, + { + "epoch": 1.199669056811914, + "grad_norm": 0.08501064777374268, + "learning_rate": 4.993947003754218e-05, + "loss": 0.0656, + "num_input_tokens_seen": 2643064, + "step": 4350 + }, + { + "epoch": 1.2010479867622725, + "grad_norm": 0.1598571091890335, + "learning_rate": 4.993863028095324e-05, + "loss": 0.0619, + "num_input_tokens_seen": 2645656, + "step": 4355 + }, + { + "epoch": 1.202426916712631, + "grad_norm": 0.16248205304145813, + "learning_rate": 4.993778474644164e-05, + "loss": 0.0101, + "num_input_tokens_seen": 2648408, + "step": 4360 + }, + { + "epoch": 1.2038058466629895, + "grad_norm": 0.22061173617839813, + "learning_rate": 4.993693343420329e-05, + "loss": 0.0867, + "num_input_tokens_seen": 2652152, + "step": 4365 + }, + { + "epoch": 1.205184776613348, + "grad_norm": 0.09153420478105545, + "learning_rate": 4.993607634443541e-05, + "loss": 0.0041, + "num_input_tokens_seen": 2656312, + "step": 4370 + }, + { + "epoch": 1.2065637065637065, + "grad_norm": 3.7225685119628906, + "learning_rate": 4.993521347733659e-05, + "loss": 0.0822, + "num_input_tokens_seen": 2659896, + "step": 4375 + }, + { + "epoch": 1.207942636514065, + "grad_norm": 8.315540313720703, + "learning_rate": 4.993434483310674e-05, + "loss": 0.1295, + "num_input_tokens_seen": 2662808, + "step": 4380 + }, + { + "epoch": 1.2093215664644237, + "grad_norm": 0.8611403703689575, + "learning_rate": 4.9933470411947105e-05, + "loss": 0.0349, + "num_input_tokens_seen": 2665240, + "step": 4385 + }, + { + "epoch": 1.210700496414782, + "grad_norm": 2.454836845397949, + "learning_rate": 4.993259021406028e-05, + "loss": 0.021, + "num_input_tokens_seen": 2668024, + "step": 4390 + }, + { + "epoch": 1.2120794263651407, + "grad_norm": 0.40100622177124023, + "learning_rate": 4.9931704239650204e-05, + "loss": 0.01, + "num_input_tokens_seen": 2671096, + "step": 4395 + }, + { + "epoch": 1.2134583563154993, + "grad_norm": 0.008670954965054989, + "learning_rate": 4.993081248892213e-05, + "loss": 0.0101, + "num_input_tokens_seen": 2673784, + "step": 4400 + }, + { + "epoch": 1.2148372862658576, + "grad_norm": 0.014210411347448826, + "learning_rate": 4.992991496208267e-05, + "loss": 0.0593, + "num_input_tokens_seen": 2676344, + "step": 4405 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.053314123302698135, + "learning_rate": 4.992901165933976e-05, + "loss": 0.1037, + "num_input_tokens_seen": 2680344, + "step": 4410 + }, + { + "epoch": 1.2175951461665748, + "grad_norm": 0.09009978175163269, + "learning_rate": 4.99281025809027e-05, + "loss": 0.0025, + "num_input_tokens_seen": 2683224, + "step": 4415 + }, + { + "epoch": 1.2189740761169332, + "grad_norm": 2.055298089981079, + "learning_rate": 4.99271877269821e-05, + "loss": 0.0511, + "num_input_tokens_seen": 2686648, + "step": 4420 + }, + { + "epoch": 1.2203530060672918, + "grad_norm": 0.059519946575164795, + "learning_rate": 4.992626709778991e-05, + "loss": 0.0021, + "num_input_tokens_seen": 2689688, + "step": 4425 + }, + { + "epoch": 1.2217319360176504, + "grad_norm": 0.0028740970883518457, + "learning_rate": 4.992534069353945e-05, + "loss": 0.0267, + "num_input_tokens_seen": 2692152, + "step": 4430 + }, + { + "epoch": 1.2231108659680088, + "grad_norm": 0.10300591588020325, + "learning_rate": 4.992440851444533e-05, + "loss": 0.0118, + "num_input_tokens_seen": 2694936, + "step": 4435 + }, + { + "epoch": 1.2244897959183674, + "grad_norm": 1.0979372262954712, + "learning_rate": 4.992347056072354e-05, + "loss": 0.2063, + "num_input_tokens_seen": 2697816, + "step": 4440 + }, + { + "epoch": 1.2258687258687258, + "grad_norm": 10.062100410461426, + "learning_rate": 4.9922526832591374e-05, + "loss": 0.1534, + "num_input_tokens_seen": 2700952, + "step": 4445 + }, + { + "epoch": 1.2272476558190843, + "grad_norm": 0.39898043870925903, + "learning_rate": 4.9921577330267494e-05, + "loss": 0.088, + "num_input_tokens_seen": 2703608, + "step": 4450 + }, + { + "epoch": 1.228626585769443, + "grad_norm": 10.8117094039917, + "learning_rate": 4.992062205397188e-05, + "loss": 0.0554, + "num_input_tokens_seen": 2706680, + "step": 4455 + }, + { + "epoch": 1.2300055157198013, + "grad_norm": 8.540412902832031, + "learning_rate": 4.991966100392586e-05, + "loss": 0.1426, + "num_input_tokens_seen": 2708696, + "step": 4460 + }, + { + "epoch": 1.23138444567016, + "grad_norm": 0.04041459411382675, + "learning_rate": 4.991869418035209e-05, + "loss": 0.0956, + "num_input_tokens_seen": 2712280, + "step": 4465 + }, + { + "epoch": 1.2327633756205185, + "grad_norm": 5.061349868774414, + "learning_rate": 4.9917721583474575e-05, + "loss": 0.0423, + "num_input_tokens_seen": 2715256, + "step": 4470 + }, + { + "epoch": 1.234142305570877, + "grad_norm": 0.04198548570275307, + "learning_rate": 4.991674321351865e-05, + "loss": 0.0879, + "num_input_tokens_seen": 2717976, + "step": 4475 + }, + { + "epoch": 1.2355212355212355, + "grad_norm": 0.19510291516780853, + "learning_rate": 4.9915759070710976e-05, + "loss": 0.2567, + "num_input_tokens_seen": 2720280, + "step": 4480 + }, + { + "epoch": 1.236900165471594, + "grad_norm": 12.073760986328125, + "learning_rate": 4.9914769155279585e-05, + "loss": 0.0334, + "num_input_tokens_seen": 2723832, + "step": 4485 + }, + { + "epoch": 1.2382790954219525, + "grad_norm": 10.777833938598633, + "learning_rate": 4.991377346745382e-05, + "loss": 0.1027, + "num_input_tokens_seen": 2726616, + "step": 4490 + }, + { + "epoch": 1.239658025372311, + "grad_norm": 0.0679757297039032, + "learning_rate": 4.9912772007464364e-05, + "loss": 0.0483, + "num_input_tokens_seen": 2729240, + "step": 4495 + }, + { + "epoch": 1.2410369553226697, + "grad_norm": 12.6544828414917, + "learning_rate": 4.991176477554324e-05, + "loss": 0.1484, + "num_input_tokens_seen": 2731512, + "step": 4500 + }, + { + "epoch": 1.242415885273028, + "grad_norm": 6.9301886558532715, + "learning_rate": 4.9910751771923814e-05, + "loss": 0.1718, + "num_input_tokens_seen": 2734424, + "step": 4505 + }, + { + "epoch": 1.2437948152233866, + "grad_norm": 4.936260223388672, + "learning_rate": 4.990973299684078e-05, + "loss": 0.1308, + "num_input_tokens_seen": 2736856, + "step": 4510 + }, + { + "epoch": 1.2451737451737452, + "grad_norm": 0.31956180930137634, + "learning_rate": 4.990870845053017e-05, + "loss": 0.1507, + "num_input_tokens_seen": 2739128, + "step": 4515 + }, + { + "epoch": 1.2465526751241036, + "grad_norm": 0.4774256646633148, + "learning_rate": 4.9907678133229375e-05, + "loss": 0.0261, + "num_input_tokens_seen": 2741880, + "step": 4520 + }, + { + "epoch": 1.2479316050744622, + "grad_norm": 0.09431387484073639, + "learning_rate": 4.990664204517709e-05, + "loss": 0.0474, + "num_input_tokens_seen": 2745464, + "step": 4525 + }, + { + "epoch": 1.2493105350248208, + "grad_norm": 0.2736356556415558, + "learning_rate": 4.990560018661336e-05, + "loss": 0.0816, + "num_input_tokens_seen": 2748440, + "step": 4530 + }, + { + "epoch": 1.2506894649751792, + "grad_norm": 0.028250237926840782, + "learning_rate": 4.990455255777956e-05, + "loss": 0.0354, + "num_input_tokens_seen": 2751256, + "step": 4535 + }, + { + "epoch": 1.2520683949255378, + "grad_norm": 0.013752961531281471, + "learning_rate": 4.9903499158918436e-05, + "loss": 0.0481, + "num_input_tokens_seen": 2755384, + "step": 4540 + }, + { + "epoch": 1.2534473248758964, + "grad_norm": 0.040023382753133774, + "learning_rate": 4.990243999027403e-05, + "loss": 0.1032, + "num_input_tokens_seen": 2758904, + "step": 4545 + }, + { + "epoch": 1.2548262548262548, + "grad_norm": 0.3773762285709381, + "learning_rate": 4.9901375052091735e-05, + "loss": 0.2039, + "num_input_tokens_seen": 2761560, + "step": 4550 + }, + { + "epoch": 1.2562051847766134, + "grad_norm": 1.5035544633865356, + "learning_rate": 4.9900304344618294e-05, + "loss": 0.0892, + "num_input_tokens_seen": 2764248, + "step": 4555 + }, + { + "epoch": 1.257584114726972, + "grad_norm": 0.059541743248701096, + "learning_rate": 4.989922786810176e-05, + "loss": 0.2153, + "num_input_tokens_seen": 2770552, + "step": 4560 + }, + { + "epoch": 1.2589630446773303, + "grad_norm": 0.25850406289100647, + "learning_rate": 4.9898145622791544e-05, + "loss": 0.0408, + "num_input_tokens_seen": 2773304, + "step": 4565 + }, + { + "epoch": 1.260341974627689, + "grad_norm": 2.0887584686279297, + "learning_rate": 4.989705760893838e-05, + "loss": 0.1007, + "num_input_tokens_seen": 2776152, + "step": 4570 + }, + { + "epoch": 1.2617209045780475, + "grad_norm": 0.25801292061805725, + "learning_rate": 4.989596382679436e-05, + "loss": 0.0107, + "num_input_tokens_seen": 2779864, + "step": 4575 + }, + { + "epoch": 1.263099834528406, + "grad_norm": 1.8378095626831055, + "learning_rate": 4.9894864276612884e-05, + "loss": 0.0111, + "num_input_tokens_seen": 2784216, + "step": 4580 + }, + { + "epoch": 1.2644787644787645, + "grad_norm": 5.810259819030762, + "learning_rate": 4.9893758958648706e-05, + "loss": 0.1247, + "num_input_tokens_seen": 2788280, + "step": 4585 + }, + { + "epoch": 1.265857694429123, + "grad_norm": 0.26121389865875244, + "learning_rate": 4.9892647873157916e-05, + "loss": 0.1457, + "num_input_tokens_seen": 2791288, + "step": 4590 + }, + { + "epoch": 1.2672366243794815, + "grad_norm": 0.13936854898929596, + "learning_rate": 4.989153102039793e-05, + "loss": 0.1118, + "num_input_tokens_seen": 2793688, + "step": 4595 + }, + { + "epoch": 1.26861555432984, + "grad_norm": 0.10729265213012695, + "learning_rate": 4.989040840062751e-05, + "loss": 0.1242, + "num_input_tokens_seen": 2796248, + "step": 4600 + }, + { + "epoch": 1.2699944842801987, + "grad_norm": 5.415771961212158, + "learning_rate": 4.988928001410675e-05, + "loss": 0.0606, + "num_input_tokens_seen": 2798616, + "step": 4605 + }, + { + "epoch": 1.271373414230557, + "grad_norm": 0.07672042399644852, + "learning_rate": 4.988814586109708e-05, + "loss": 0.0668, + "num_input_tokens_seen": 2801336, + "step": 4610 + }, + { + "epoch": 1.2727523441809157, + "grad_norm": 0.49032580852508545, + "learning_rate": 4.988700594186127e-05, + "loss": 0.0923, + "num_input_tokens_seen": 2805400, + "step": 4615 + }, + { + "epoch": 1.2741312741312742, + "grad_norm": 3.0410706996917725, + "learning_rate": 4.988586025666343e-05, + "loss": 0.1013, + "num_input_tokens_seen": 2808216, + "step": 4620 + }, + { + "epoch": 1.2755102040816326, + "grad_norm": 0.17548201978206635, + "learning_rate": 4.988470880576899e-05, + "loss": 0.1654, + "num_input_tokens_seen": 2811832, + "step": 4625 + }, + { + "epoch": 1.2768891340319912, + "grad_norm": 6.080624580383301, + "learning_rate": 4.9883551589444716e-05, + "loss": 0.0741, + "num_input_tokens_seen": 2816376, + "step": 4630 + }, + { + "epoch": 1.2782680639823498, + "grad_norm": 0.044147200882434845, + "learning_rate": 4.988238860795873e-05, + "loss": 0.168, + "num_input_tokens_seen": 2819320, + "step": 4635 + }, + { + "epoch": 1.2796469939327082, + "grad_norm": 0.01933838613331318, + "learning_rate": 4.988121986158048e-05, + "loss": 0.0483, + "num_input_tokens_seen": 2822392, + "step": 4640 + }, + { + "epoch": 1.2810259238830668, + "grad_norm": 11.869474411010742, + "learning_rate": 4.9880045350580737e-05, + "loss": 0.0259, + "num_input_tokens_seen": 2825560, + "step": 4645 + }, + { + "epoch": 1.2824048538334254, + "grad_norm": 0.7677313685417175, + "learning_rate": 4.9878865075231635e-05, + "loss": 0.0359, + "num_input_tokens_seen": 2829592, + "step": 4650 + }, + { + "epoch": 1.2837837837837838, + "grad_norm": 4.488178730010986, + "learning_rate": 4.987767903580661e-05, + "loss": 0.1265, + "num_input_tokens_seen": 2832024, + "step": 4655 + }, + { + "epoch": 1.2851627137341424, + "grad_norm": 7.293581008911133, + "learning_rate": 4.987648723258046e-05, + "loss": 0.0633, + "num_input_tokens_seen": 2834488, + "step": 4660 + }, + { + "epoch": 1.2865416436845007, + "grad_norm": 17.046829223632812, + "learning_rate": 4.9875289665829306e-05, + "loss": 0.2053, + "num_input_tokens_seen": 2836568, + "step": 4665 + }, + { + "epoch": 1.2879205736348593, + "grad_norm": 1.6567420959472656, + "learning_rate": 4.98740863358306e-05, + "loss": 0.088, + "num_input_tokens_seen": 2839480, + "step": 4670 + }, + { + "epoch": 1.289299503585218, + "grad_norm": 0.5260456800460815, + "learning_rate": 4.987287724286315e-05, + "loss": 0.0636, + "num_input_tokens_seen": 2842296, + "step": 4675 + }, + { + "epoch": 1.2906784335355763, + "grad_norm": 4.461923599243164, + "learning_rate": 4.987166238720707e-05, + "loss": 0.2179, + "num_input_tokens_seen": 2844440, + "step": 4680 + }, + { + "epoch": 1.292057363485935, + "grad_norm": 6.972059726715088, + "learning_rate": 4.9870441769143844e-05, + "loss": 0.154, + "num_input_tokens_seen": 2848408, + "step": 4685 + }, + { + "epoch": 1.2934362934362935, + "grad_norm": 6.0718793869018555, + "learning_rate": 4.986921538895626e-05, + "loss": 0.0685, + "num_input_tokens_seen": 2852408, + "step": 4690 + }, + { + "epoch": 1.294815223386652, + "grad_norm": 0.12765730917453766, + "learning_rate": 4.986798324692845e-05, + "loss": 0.0609, + "num_input_tokens_seen": 2855224, + "step": 4695 + }, + { + "epoch": 1.2961941533370105, + "grad_norm": 2.9334335327148438, + "learning_rate": 4.986674534334589e-05, + "loss": 0.0769, + "num_input_tokens_seen": 2857944, + "step": 4700 + }, + { + "epoch": 1.2975730832873689, + "grad_norm": 9.128582000732422, + "learning_rate": 4.9865501678495375e-05, + "loss": 0.0365, + "num_input_tokens_seen": 2861272, + "step": 4705 + }, + { + "epoch": 1.2989520132377275, + "grad_norm": 0.4286104440689087, + "learning_rate": 4.9864252252665054e-05, + "loss": 0.0271, + "num_input_tokens_seen": 2864248, + "step": 4710 + }, + { + "epoch": 1.300330943188086, + "grad_norm": 0.07222030311822891, + "learning_rate": 4.98629970661444e-05, + "loss": 0.0557, + "num_input_tokens_seen": 2866488, + "step": 4715 + }, + { + "epoch": 1.3017098731384444, + "grad_norm": 0.1459227353334427, + "learning_rate": 4.986173611922422e-05, + "loss": 0.0171, + "num_input_tokens_seen": 2868984, + "step": 4720 + }, + { + "epoch": 1.303088803088803, + "grad_norm": 0.07396038621664047, + "learning_rate": 4.9860469412196654e-05, + "loss": 0.2107, + "num_input_tokens_seen": 2872248, + "step": 4725 + }, + { + "epoch": 1.3044677330391616, + "grad_norm": 8.430999755859375, + "learning_rate": 4.985919694535518e-05, + "loss": 0.1031, + "num_input_tokens_seen": 2874840, + "step": 4730 + }, + { + "epoch": 1.30584666298952, + "grad_norm": 0.6238126158714294, + "learning_rate": 4.9857918718994625e-05, + "loss": 0.0092, + "num_input_tokens_seen": 2879480, + "step": 4735 + }, + { + "epoch": 1.3072255929398786, + "grad_norm": 0.5350829362869263, + "learning_rate": 4.9856634733411114e-05, + "loss": 0.0523, + "num_input_tokens_seen": 2882232, + "step": 4740 + }, + { + "epoch": 1.3086045228902372, + "grad_norm": 4.684084415435791, + "learning_rate": 4.985534498890214e-05, + "loss": 0.1249, + "num_input_tokens_seen": 2886424, + "step": 4745 + }, + { + "epoch": 1.3099834528405956, + "grad_norm": 0.004243891220539808, + "learning_rate": 4.985404948576652e-05, + "loss": 0.0773, + "num_input_tokens_seen": 2890424, + "step": 4750 + }, + { + "epoch": 1.3113623827909542, + "grad_norm": 5.122920513153076, + "learning_rate": 4.985274822430439e-05, + "loss": 0.1858, + "num_input_tokens_seen": 2894296, + "step": 4755 + }, + { + "epoch": 1.3127413127413128, + "grad_norm": 0.39850661158561707, + "learning_rate": 4.985144120481725e-05, + "loss": 0.0965, + "num_input_tokens_seen": 2896952, + "step": 4760 + }, + { + "epoch": 1.3141202426916712, + "grad_norm": 0.3284662067890167, + "learning_rate": 4.985012842760791e-05, + "loss": 0.0519, + "num_input_tokens_seen": 2900536, + "step": 4765 + }, + { + "epoch": 1.3154991726420298, + "grad_norm": 1.184168815612793, + "learning_rate": 4.984880989298052e-05, + "loss": 0.0521, + "num_input_tokens_seen": 2903768, + "step": 4770 + }, + { + "epoch": 1.3168781025923884, + "grad_norm": 6.759727954864502, + "learning_rate": 4.984748560124057e-05, + "loss": 0.1685, + "num_input_tokens_seen": 2906264, + "step": 4775 + }, + { + "epoch": 1.3182570325427467, + "grad_norm": 0.16309282183647156, + "learning_rate": 4.984615555269489e-05, + "loss": 0.0544, + "num_input_tokens_seen": 2909016, + "step": 4780 + }, + { + "epoch": 1.3196359624931053, + "grad_norm": 1.65602445602417, + "learning_rate": 4.984481974765161e-05, + "loss": 0.0165, + "num_input_tokens_seen": 2911608, + "step": 4785 + }, + { + "epoch": 1.321014892443464, + "grad_norm": 0.16463018953800201, + "learning_rate": 4.984347818642022e-05, + "loss": 0.0536, + "num_input_tokens_seen": 2914104, + "step": 4790 + }, + { + "epoch": 1.3223938223938223, + "grad_norm": 0.9659285545349121, + "learning_rate": 4.984213086931155e-05, + "loss": 0.0136, + "num_input_tokens_seen": 2917144, + "step": 4795 + }, + { + "epoch": 1.323772752344181, + "grad_norm": 7.555206775665283, + "learning_rate": 4.984077779663776e-05, + "loss": 0.0732, + "num_input_tokens_seen": 2920568, + "step": 4800 + }, + { + "epoch": 1.3251516822945395, + "grad_norm": 15.937397003173828, + "learning_rate": 4.9839418968712325e-05, + "loss": 0.0196, + "num_input_tokens_seen": 2923768, + "step": 4805 + }, + { + "epoch": 1.3265306122448979, + "grad_norm": 0.023984871804714203, + "learning_rate": 4.983805438585007e-05, + "loss": 0.014, + "num_input_tokens_seen": 2926968, + "step": 4810 + }, + { + "epoch": 1.3279095421952565, + "grad_norm": 10.44031810760498, + "learning_rate": 4.983668404836715e-05, + "loss": 0.0641, + "num_input_tokens_seen": 2930680, + "step": 4815 + }, + { + "epoch": 1.329288472145615, + "grad_norm": 0.23186001181602478, + "learning_rate": 4.983530795658106e-05, + "loss": 0.0129, + "num_input_tokens_seen": 2933464, + "step": 4820 + }, + { + "epoch": 1.3306674020959735, + "grad_norm": 0.03127210587263107, + "learning_rate": 4.98339261108106e-05, + "loss": 0.0974, + "num_input_tokens_seen": 2936504, + "step": 4825 + }, + { + "epoch": 1.332046332046332, + "grad_norm": 0.8216372132301331, + "learning_rate": 4.983253851137594e-05, + "loss": 0.0987, + "num_input_tokens_seen": 2938712, + "step": 4830 + }, + { + "epoch": 1.3334252619966906, + "grad_norm": 4.221699237823486, + "learning_rate": 4.983114515859857e-05, + "loss": 0.0954, + "num_input_tokens_seen": 2942968, + "step": 4835 + }, + { + "epoch": 1.334804191947049, + "grad_norm": 5.796261787414551, + "learning_rate": 4.98297460528013e-05, + "loss": 0.082, + "num_input_tokens_seen": 2946200, + "step": 4840 + }, + { + "epoch": 1.3361831218974076, + "grad_norm": 0.7555321455001831, + "learning_rate": 4.9828341194308285e-05, + "loss": 0.0069, + "num_input_tokens_seen": 2948632, + "step": 4845 + }, + { + "epoch": 1.3375620518477662, + "grad_norm": 4.748871803283691, + "learning_rate": 4.982693058344501e-05, + "loss": 0.111, + "num_input_tokens_seen": 2951160, + "step": 4850 + }, + { + "epoch": 1.3389409817981246, + "grad_norm": 6.175275802612305, + "learning_rate": 4.982551422053831e-05, + "loss": 0.2141, + "num_input_tokens_seen": 2954712, + "step": 4855 + }, + { + "epoch": 1.3403199117484832, + "grad_norm": 0.3354179263114929, + "learning_rate": 4.98240921059163e-05, + "loss": 0.1527, + "num_input_tokens_seen": 2958968, + "step": 4860 + }, + { + "epoch": 1.3416988416988418, + "grad_norm": 0.6153071522712708, + "learning_rate": 4.982266423990849e-05, + "loss": 0.028, + "num_input_tokens_seen": 2961688, + "step": 4865 + }, + { + "epoch": 1.3430777716492002, + "grad_norm": 5.543910980224609, + "learning_rate": 4.982123062284569e-05, + "loss": 0.0099, + "num_input_tokens_seen": 2964504, + "step": 4870 + }, + { + "epoch": 1.3444567015995588, + "grad_norm": 0.05919472873210907, + "learning_rate": 4.981979125506006e-05, + "loss": 0.0498, + "num_input_tokens_seen": 2967544, + "step": 4875 + }, + { + "epoch": 1.3458356315499174, + "grad_norm": 6.533029556274414, + "learning_rate": 4.981834613688506e-05, + "loss": 0.2042, + "num_input_tokens_seen": 2970680, + "step": 4880 + }, + { + "epoch": 1.3472145615002757, + "grad_norm": 0.7192809581756592, + "learning_rate": 4.9816895268655514e-05, + "loss": 0.0179, + "num_input_tokens_seen": 2973336, + "step": 4885 + }, + { + "epoch": 1.3485934914506343, + "grad_norm": 9.776811599731445, + "learning_rate": 4.981543865070758e-05, + "loss": 0.1396, + "num_input_tokens_seen": 2976440, + "step": 4890 + }, + { + "epoch": 1.349972421400993, + "grad_norm": 12.4075927734375, + "learning_rate": 4.98139762833787e-05, + "loss": 0.1761, + "num_input_tokens_seen": 2979384, + "step": 4895 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.08358795195817947, + "learning_rate": 4.9812508167007724e-05, + "loss": 0.0228, + "num_input_tokens_seen": 2982456, + "step": 4900 + }, + { + "epoch": 1.35273028130171, + "grad_norm": 0.3200560212135315, + "learning_rate": 4.981103430193476e-05, + "loss": 0.0132, + "num_input_tokens_seen": 2985112, + "step": 4905 + }, + { + "epoch": 1.3541092112520685, + "grad_norm": 25.306184768676758, + "learning_rate": 4.9809554688501304e-05, + "loss": 0.0291, + "num_input_tokens_seen": 2987480, + "step": 4910 + }, + { + "epoch": 1.3554881412024269, + "grad_norm": 0.058540135622024536, + "learning_rate": 4.980806932705016e-05, + "loss": 0.0951, + "num_input_tokens_seen": 2990168, + "step": 4915 + }, + { + "epoch": 1.3568670711527855, + "grad_norm": 8.764174461364746, + "learning_rate": 4.980657821792545e-05, + "loss": 0.1738, + "num_input_tokens_seen": 2992920, + "step": 4920 + }, + { + "epoch": 1.358246001103144, + "grad_norm": 0.3439711928367615, + "learning_rate": 4.980508136147265e-05, + "loss": 0.0792, + "num_input_tokens_seen": 2996824, + "step": 4925 + }, + { + "epoch": 1.3596249310535025, + "grad_norm": 1.7676482200622559, + "learning_rate": 4.980357875803856e-05, + "loss": 0.0643, + "num_input_tokens_seen": 2998904, + "step": 4930 + }, + { + "epoch": 1.361003861003861, + "grad_norm": 7.662014484405518, + "learning_rate": 4.980207040797131e-05, + "loss": 0.1429, + "num_input_tokens_seen": 3001784, + "step": 4935 + }, + { + "epoch": 1.3623827909542197, + "grad_norm": 2.2721927165985107, + "learning_rate": 4.980055631162037e-05, + "loss": 0.1043, + "num_input_tokens_seen": 3005048, + "step": 4940 + }, + { + "epoch": 1.363761720904578, + "grad_norm": 0.15479476749897003, + "learning_rate": 4.979903646933654e-05, + "loss": 0.0171, + "num_input_tokens_seen": 3007864, + "step": 4945 + }, + { + "epoch": 1.3651406508549366, + "grad_norm": 0.1755800098180771, + "learning_rate": 4.979751088147192e-05, + "loss": 0.0137, + "num_input_tokens_seen": 3010488, + "step": 4950 + }, + { + "epoch": 1.366519580805295, + "grad_norm": 3.2841289043426514, + "learning_rate": 4.979597954838e-05, + "loss": 0.1421, + "num_input_tokens_seen": 3013496, + "step": 4955 + }, + { + "epoch": 1.3678985107556536, + "grad_norm": 0.273391991853714, + "learning_rate": 4.9794442470415536e-05, + "loss": 0.1051, + "num_input_tokens_seen": 3016216, + "step": 4960 + }, + { + "epoch": 1.3692774407060122, + "grad_norm": 4.539875507354736, + "learning_rate": 4.979289964793467e-05, + "loss": 0.1533, + "num_input_tokens_seen": 3018488, + "step": 4965 + }, + { + "epoch": 1.3706563706563706, + "grad_norm": 0.2045557200908661, + "learning_rate": 4.979135108129483e-05, + "loss": 0.0049, + "num_input_tokens_seen": 3023736, + "step": 4970 + }, + { + "epoch": 1.3720353006067292, + "grad_norm": 5.673638343811035, + "learning_rate": 4.978979677085483e-05, + "loss": 0.0844, + "num_input_tokens_seen": 3027192, + "step": 4975 + }, + { + "epoch": 1.3734142305570878, + "grad_norm": 7.425889492034912, + "learning_rate": 4.978823671697475e-05, + "loss": 0.0979, + "num_input_tokens_seen": 3029816, + "step": 4980 + }, + { + "epoch": 1.3747931605074462, + "grad_norm": 6.090620517730713, + "learning_rate": 4.9786670920016044e-05, + "loss": 0.2593, + "num_input_tokens_seen": 3032792, + "step": 4985 + }, + { + "epoch": 1.3761720904578048, + "grad_norm": 6.3548712730407715, + "learning_rate": 4.978509938034148e-05, + "loss": 0.1098, + "num_input_tokens_seen": 3036024, + "step": 4990 + }, + { + "epoch": 1.3775510204081631, + "grad_norm": 16.891817092895508, + "learning_rate": 4.978352209831517e-05, + "loss": 0.0386, + "num_input_tokens_seen": 3039128, + "step": 4995 + }, + { + "epoch": 1.3789299503585217, + "grad_norm": 0.11147671192884445, + "learning_rate": 4.978193907430254e-05, + "loss": 0.1293, + "num_input_tokens_seen": 3041944, + "step": 5000 + }, + { + "epoch": 1.3803088803088803, + "grad_norm": 1.5514012575149536, + "learning_rate": 4.9780350308670365e-05, + "loss": 0.0512, + "num_input_tokens_seen": 3044600, + "step": 5005 + }, + { + "epoch": 1.3816878102592387, + "grad_norm": 0.09120886027812958, + "learning_rate": 4.9778755801786726e-05, + "loss": 0.1077, + "num_input_tokens_seen": 3047896, + "step": 5010 + }, + { + "epoch": 1.3830667402095973, + "grad_norm": 5.61194372177124, + "learning_rate": 4.977715555402105e-05, + "loss": 0.0411, + "num_input_tokens_seen": 3050456, + "step": 5015 + }, + { + "epoch": 1.384445670159956, + "grad_norm": 5.002738952636719, + "learning_rate": 4.977554956574409e-05, + "loss": 0.1553, + "num_input_tokens_seen": 3052888, + "step": 5020 + }, + { + "epoch": 1.3858246001103143, + "grad_norm": 16.504987716674805, + "learning_rate": 4.9773937837327944e-05, + "loss": 0.0316, + "num_input_tokens_seen": 3056216, + "step": 5025 + }, + { + "epoch": 1.3872035300606729, + "grad_norm": 0.05142489820718765, + "learning_rate": 4.977232036914601e-05, + "loss": 0.0407, + "num_input_tokens_seen": 3058936, + "step": 5030 + }, + { + "epoch": 1.3885824600110315, + "grad_norm": 0.15545634925365448, + "learning_rate": 4.9770697161573045e-05, + "loss": 0.1307, + "num_input_tokens_seen": 3061976, + "step": 5035 + }, + { + "epoch": 1.3899613899613898, + "grad_norm": 0.09190847724676132, + "learning_rate": 4.976906821498512e-05, + "loss": 0.1455, + "num_input_tokens_seen": 3065656, + "step": 5040 + }, + { + "epoch": 1.3913403199117484, + "grad_norm": 0.05521310865879059, + "learning_rate": 4.9767433529759624e-05, + "loss": 0.0881, + "num_input_tokens_seen": 3068760, + "step": 5045 + }, + { + "epoch": 1.392719249862107, + "grad_norm": 0.20732539892196655, + "learning_rate": 4.9765793106275304e-05, + "loss": 0.0443, + "num_input_tokens_seen": 3072760, + "step": 5050 + }, + { + "epoch": 1.3940981798124654, + "grad_norm": 0.40697410702705383, + "learning_rate": 4.976414694491222e-05, + "loss": 0.0848, + "num_input_tokens_seen": 3075320, + "step": 5055 + }, + { + "epoch": 1.395477109762824, + "grad_norm": 6.341583728790283, + "learning_rate": 4.976249504605177e-05, + "loss": 0.1031, + "num_input_tokens_seen": 3078872, + "step": 5060 + }, + { + "epoch": 1.3968560397131826, + "grad_norm": 4.982421398162842, + "learning_rate": 4.9760837410076664e-05, + "loss": 0.1528, + "num_input_tokens_seen": 3081944, + "step": 5065 + }, + { + "epoch": 1.398234969663541, + "grad_norm": 8.14011001586914, + "learning_rate": 4.9759174037370957e-05, + "loss": 0.0507, + "num_input_tokens_seen": 3084216, + "step": 5070 + }, + { + "epoch": 1.3996138996138996, + "grad_norm": 0.1535651683807373, + "learning_rate": 4.975750492832003e-05, + "loss": 0.0273, + "num_input_tokens_seen": 3089176, + "step": 5075 + }, + { + "epoch": 1.4009928295642582, + "grad_norm": 0.08731765300035477, + "learning_rate": 4.975583008331059e-05, + "loss": 0.13, + "num_input_tokens_seen": 3091544, + "step": 5080 + }, + { + "epoch": 1.4023717595146166, + "grad_norm": 8.07304573059082, + "learning_rate": 4.9754149502730676e-05, + "loss": 0.085, + "num_input_tokens_seen": 3094328, + "step": 5085 + }, + { + "epoch": 1.4037506894649752, + "grad_norm": 0.07806161791086197, + "learning_rate": 4.9752463186969656e-05, + "loss": 0.105, + "num_input_tokens_seen": 3098136, + "step": 5090 + }, + { + "epoch": 1.4051296194153338, + "grad_norm": 0.8923191428184509, + "learning_rate": 4.975077113641822e-05, + "loss": 0.2363, + "num_input_tokens_seen": 3101080, + "step": 5095 + }, + { + "epoch": 1.4065085493656921, + "grad_norm": 0.18526288866996765, + "learning_rate": 4.9749073351468394e-05, + "loss": 0.0253, + "num_input_tokens_seen": 3103544, + "step": 5100 + }, + { + "epoch": 1.4078874793160507, + "grad_norm": 15.112602233886719, + "learning_rate": 4.9747369832513535e-05, + "loss": 0.1286, + "num_input_tokens_seen": 3106008, + "step": 5105 + }, + { + "epoch": 1.4092664092664093, + "grad_norm": 10.763900756835938, + "learning_rate": 4.974566057994832e-05, + "loss": 0.1248, + "num_input_tokens_seen": 3110360, + "step": 5110 + }, + { + "epoch": 1.4106453392167677, + "grad_norm": 0.0619337223470211, + "learning_rate": 4.974394559416875e-05, + "loss": 0.0869, + "num_input_tokens_seen": 3114104, + "step": 5115 + }, + { + "epoch": 1.4120242691671263, + "grad_norm": 0.008062812499701977, + "learning_rate": 4.9742224875572176e-05, + "loss": 0.0103, + "num_input_tokens_seen": 3116568, + "step": 5120 + }, + { + "epoch": 1.413403199117485, + "grad_norm": 0.7026895880699158, + "learning_rate": 4.974049842455726e-05, + "loss": 0.0541, + "num_input_tokens_seen": 3120696, + "step": 5125 + }, + { + "epoch": 1.4147821290678433, + "grad_norm": 0.032906197011470795, + "learning_rate": 4.973876624152399e-05, + "loss": 0.0696, + "num_input_tokens_seen": 3123768, + "step": 5130 + }, + { + "epoch": 1.4161610590182019, + "grad_norm": 0.3509763181209564, + "learning_rate": 4.9737028326873706e-05, + "loss": 0.0051, + "num_input_tokens_seen": 3126680, + "step": 5135 + }, + { + "epoch": 1.4175399889685605, + "grad_norm": 0.018809828907251358, + "learning_rate": 4.973528468100903e-05, + "loss": 0.0021, + "num_input_tokens_seen": 3130584, + "step": 5140 + }, + { + "epoch": 1.4189189189189189, + "grad_norm": 22.761884689331055, + "learning_rate": 4.9733535304333954e-05, + "loss": 0.0802, + "num_input_tokens_seen": 3135704, + "step": 5145 + }, + { + "epoch": 1.4202978488692775, + "grad_norm": 4.582157611846924, + "learning_rate": 4.9731780197253796e-05, + "loss": 0.0999, + "num_input_tokens_seen": 3140760, + "step": 5150 + }, + { + "epoch": 1.421676778819636, + "grad_norm": 0.3318972885608673, + "learning_rate": 4.9730019360175165e-05, + "loss": 0.1233, + "num_input_tokens_seen": 3144344, + "step": 5155 + }, + { + "epoch": 1.4230557087699944, + "grad_norm": 9.014768600463867, + "learning_rate": 4.972825279350603e-05, + "loss": 0.1306, + "num_input_tokens_seen": 3147064, + "step": 5160 + }, + { + "epoch": 1.424434638720353, + "grad_norm": 25.343647003173828, + "learning_rate": 4.97264804976557e-05, + "loss": 0.171, + "num_input_tokens_seen": 3151768, + "step": 5165 + }, + { + "epoch": 1.4258135686707116, + "grad_norm": 0.04619147628545761, + "learning_rate": 4.972470247303476e-05, + "loss": 0.0782, + "num_input_tokens_seen": 3154552, + "step": 5170 + }, + { + "epoch": 1.42719249862107, + "grad_norm": 0.015360401012003422, + "learning_rate": 4.9722918720055174e-05, + "loss": 0.0384, + "num_input_tokens_seen": 3157688, + "step": 5175 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 9.461491584777832, + "learning_rate": 4.9721129239130195e-05, + "loss": 0.1673, + "num_input_tokens_seen": 3160472, + "step": 5180 + }, + { + "epoch": 1.4299503585217872, + "grad_norm": 0.07156772166490555, + "learning_rate": 4.9719334030674434e-05, + "loss": 0.1029, + "num_input_tokens_seen": 3164920, + "step": 5185 + }, + { + "epoch": 1.4313292884721456, + "grad_norm": 4.68951940536499, + "learning_rate": 4.9717533095103805e-05, + "loss": 0.083, + "num_input_tokens_seen": 3168888, + "step": 5190 + }, + { + "epoch": 1.4327082184225042, + "grad_norm": 0.6065105199813843, + "learning_rate": 4.971572643283557e-05, + "loss": 0.1605, + "num_input_tokens_seen": 3171608, + "step": 5195 + }, + { + "epoch": 1.4340871483728628, + "grad_norm": 15.965404510498047, + "learning_rate": 4.9713914044288293e-05, + "loss": 0.0333, + "num_input_tokens_seen": 3175288, + "step": 5200 + }, + { + "epoch": 1.4354660783232212, + "grad_norm": 0.6196745038032532, + "learning_rate": 4.97120959298819e-05, + "loss": 0.0318, + "num_input_tokens_seen": 3178168, + "step": 5205 + }, + { + "epoch": 1.4368450082735797, + "grad_norm": 0.08555012941360474, + "learning_rate": 4.971027209003759e-05, + "loss": 0.0231, + "num_input_tokens_seen": 3180632, + "step": 5210 + }, + { + "epoch": 1.4382239382239383, + "grad_norm": 4.53166389465332, + "learning_rate": 4.9708442525177945e-05, + "loss": 0.0579, + "num_input_tokens_seen": 3183896, + "step": 5215 + }, + { + "epoch": 1.4396028681742967, + "grad_norm": 7.896202564239502, + "learning_rate": 4.9706607235726845e-05, + "loss": 0.1587, + "num_input_tokens_seen": 3186168, + "step": 5220 + }, + { + "epoch": 1.4409817981246553, + "grad_norm": 7.477616310119629, + "learning_rate": 4.97047662221095e-05, + "loss": 0.1038, + "num_input_tokens_seen": 3189368, + "step": 5225 + }, + { + "epoch": 1.442360728075014, + "grad_norm": 0.03507375344634056, + "learning_rate": 4.970291948475244e-05, + "loss": 0.1027, + "num_input_tokens_seen": 3191672, + "step": 5230 + }, + { + "epoch": 1.4437396580253723, + "grad_norm": 0.8413487076759338, + "learning_rate": 4.970106702408353e-05, + "loss": 0.0563, + "num_input_tokens_seen": 3194872, + "step": 5235 + }, + { + "epoch": 1.445118587975731, + "grad_norm": 0.5362104773521423, + "learning_rate": 4.9699208840531965e-05, + "loss": 0.1001, + "num_input_tokens_seen": 3198200, + "step": 5240 + }, + { + "epoch": 1.4464975179260895, + "grad_norm": 3.4146790504455566, + "learning_rate": 4.969734493452824e-05, + "loss": 0.026, + "num_input_tokens_seen": 3200984, + "step": 5245 + }, + { + "epoch": 1.4478764478764479, + "grad_norm": 17.80752182006836, + "learning_rate": 4.969547530650423e-05, + "loss": 0.1016, + "num_input_tokens_seen": 3203992, + "step": 5250 + }, + { + "epoch": 1.4492553778268065, + "grad_norm": 0.6490567326545715, + "learning_rate": 4.969359995689307e-05, + "loss": 0.0884, + "num_input_tokens_seen": 3206616, + "step": 5255 + }, + { + "epoch": 1.4506343077771648, + "grad_norm": 0.5289170145988464, + "learning_rate": 4.969171888612927e-05, + "loss": 0.1022, + "num_input_tokens_seen": 3209720, + "step": 5260 + }, + { + "epoch": 1.4520132377275234, + "grad_norm": 3.2959673404693604, + "learning_rate": 4.968983209464863e-05, + "loss": 0.0983, + "num_input_tokens_seen": 3212888, + "step": 5265 + }, + { + "epoch": 1.453392167677882, + "grad_norm": 0.3073335886001587, + "learning_rate": 4.968793958288831e-05, + "loss": 0.0107, + "num_input_tokens_seen": 3216632, + "step": 5270 + }, + { + "epoch": 1.4547710976282404, + "grad_norm": 0.031206319108605385, + "learning_rate": 4.968604135128676e-05, + "loss": 0.005, + "num_input_tokens_seen": 3219032, + "step": 5275 + }, + { + "epoch": 1.456150027578599, + "grad_norm": 0.46984606981277466, + "learning_rate": 4.9684137400283785e-05, + "loss": 0.0727, + "num_input_tokens_seen": 3222360, + "step": 5280 + }, + { + "epoch": 1.4575289575289574, + "grad_norm": 3.851978063583374, + "learning_rate": 4.9682227730320505e-05, + "loss": 0.1149, + "num_input_tokens_seen": 3227032, + "step": 5285 + }, + { + "epoch": 1.458907887479316, + "grad_norm": 0.11528141796588898, + "learning_rate": 4.9680312341839355e-05, + "loss": 0.0076, + "num_input_tokens_seen": 3230552, + "step": 5290 + }, + { + "epoch": 1.4602868174296746, + "grad_norm": 0.11113119125366211, + "learning_rate": 4.9678391235284113e-05, + "loss": 0.1823, + "num_input_tokens_seen": 3234744, + "step": 5295 + }, + { + "epoch": 1.461665747380033, + "grad_norm": 0.3492363393306732, + "learning_rate": 4.9676464411099864e-05, + "loss": 0.0042, + "num_input_tokens_seen": 3238200, + "step": 5300 + }, + { + "epoch": 1.4630446773303916, + "grad_norm": 10.934854507446289, + "learning_rate": 4.967453186973302e-05, + "loss": 0.2085, + "num_input_tokens_seen": 3241816, + "step": 5305 + }, + { + "epoch": 1.4644236072807502, + "grad_norm": 0.0765661969780922, + "learning_rate": 4.967259361163134e-05, + "loss": 0.0089, + "num_input_tokens_seen": 3245784, + "step": 5310 + }, + { + "epoch": 1.4658025372311085, + "grad_norm": 2.348210334777832, + "learning_rate": 4.967064963724388e-05, + "loss": 0.2103, + "num_input_tokens_seen": 3249112, + "step": 5315 + }, + { + "epoch": 1.4671814671814671, + "grad_norm": 0.08844047039747238, + "learning_rate": 4.9668699947021024e-05, + "loss": 0.1818, + "num_input_tokens_seen": 3251576, + "step": 5320 + }, + { + "epoch": 1.4685603971318257, + "grad_norm": 0.4735047519207001, + "learning_rate": 4.96667445414145e-05, + "loss": 0.0535, + "num_input_tokens_seen": 3254776, + "step": 5325 + }, + { + "epoch": 1.469939327082184, + "grad_norm": 0.43472954630851746, + "learning_rate": 4.966478342087735e-05, + "loss": 0.0189, + "num_input_tokens_seen": 3257048, + "step": 5330 + }, + { + "epoch": 1.4713182570325427, + "grad_norm": 0.028759296983480453, + "learning_rate": 4.9662816585863916e-05, + "loss": 0.005, + "num_input_tokens_seen": 3259544, + "step": 5335 + }, + { + "epoch": 1.4726971869829013, + "grad_norm": 0.04992734640836716, + "learning_rate": 4.9660844036829905e-05, + "loss": 0.0662, + "num_input_tokens_seen": 3262584, + "step": 5340 + }, + { + "epoch": 1.4740761169332597, + "grad_norm": 5.356382369995117, + "learning_rate": 4.9658865774232326e-05, + "loss": 0.029, + "num_input_tokens_seen": 3265240, + "step": 5345 + }, + { + "epoch": 1.4754550468836183, + "grad_norm": 11.06531047821045, + "learning_rate": 4.9656881798529507e-05, + "loss": 0.0724, + "num_input_tokens_seen": 3267480, + "step": 5350 + }, + { + "epoch": 1.4768339768339769, + "grad_norm": 0.026656068861484528, + "learning_rate": 4.965489211018112e-05, + "loss": 0.0052, + "num_input_tokens_seen": 3270776, + "step": 5355 + }, + { + "epoch": 1.4782129067843353, + "grad_norm": 0.0056898463517427444, + "learning_rate": 4.965289670964812e-05, + "loss": 0.0104, + "num_input_tokens_seen": 3273784, + "step": 5360 + }, + { + "epoch": 1.4795918367346939, + "grad_norm": 11.491242408752441, + "learning_rate": 4.9650895597392845e-05, + "loss": 0.2423, + "num_input_tokens_seen": 3275896, + "step": 5365 + }, + { + "epoch": 1.4809707666850525, + "grad_norm": 14.616995811462402, + "learning_rate": 4.964888877387891e-05, + "loss": 0.0518, + "num_input_tokens_seen": 3278616, + "step": 5370 + }, + { + "epoch": 1.4823496966354108, + "grad_norm": 0.1786096841096878, + "learning_rate": 4.964687623957126e-05, + "loss": 0.0916, + "num_input_tokens_seen": 3280888, + "step": 5375 + }, + { + "epoch": 1.4837286265857694, + "grad_norm": 0.19362294673919678, + "learning_rate": 4.964485799493618e-05, + "loss": 0.0752, + "num_input_tokens_seen": 3283736, + "step": 5380 + }, + { + "epoch": 1.485107556536128, + "grad_norm": 3.463815450668335, + "learning_rate": 4.964283404044126e-05, + "loss": 0.0445, + "num_input_tokens_seen": 3286584, + "step": 5385 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.11622494459152222, + "learning_rate": 4.964080437655543e-05, + "loss": 0.107, + "num_input_tokens_seen": 3289592, + "step": 5390 + }, + { + "epoch": 1.487865416436845, + "grad_norm": 0.10920093208551407, + "learning_rate": 4.963876900374893e-05, + "loss": 0.0119, + "num_input_tokens_seen": 3292248, + "step": 5395 + }, + { + "epoch": 1.4892443463872036, + "grad_norm": 6.825786590576172, + "learning_rate": 4.963672792249333e-05, + "loss": 0.0716, + "num_input_tokens_seen": 3294648, + "step": 5400 + }, + { + "epoch": 1.490623276337562, + "grad_norm": 0.05573999136686325, + "learning_rate": 4.963468113326152e-05, + "loss": 0.0504, + "num_input_tokens_seen": 3297368, + "step": 5405 + }, + { + "epoch": 1.4920022062879206, + "grad_norm": 0.05666523426771164, + "learning_rate": 4.96326286365277e-05, + "loss": 0.0905, + "num_input_tokens_seen": 3300184, + "step": 5410 + }, + { + "epoch": 1.4933811362382792, + "grad_norm": 1.95875084400177, + "learning_rate": 4.963057043276741e-05, + "loss": 0.0087, + "num_input_tokens_seen": 3303160, + "step": 5415 + }, + { + "epoch": 1.4947600661886375, + "grad_norm": 1.1692731380462646, + "learning_rate": 4.9628506522457515e-05, + "loss": 0.0123, + "num_input_tokens_seen": 3306936, + "step": 5420 + }, + { + "epoch": 1.4961389961389961, + "grad_norm": 0.4490296542644501, + "learning_rate": 4.962643690607618e-05, + "loss": 0.111, + "num_input_tokens_seen": 3310392, + "step": 5425 + }, + { + "epoch": 1.4975179260893547, + "grad_norm": 6.230587959289551, + "learning_rate": 4.962436158410292e-05, + "loss": 0.0917, + "num_input_tokens_seen": 3313496, + "step": 5430 + }, + { + "epoch": 1.4988968560397131, + "grad_norm": 0.6472243070602417, + "learning_rate": 4.962228055701854e-05, + "loss": 0.0172, + "num_input_tokens_seen": 3317304, + "step": 5435 + }, + { + "epoch": 1.5, + "eval_loss": 0.11150172352790833, + "eval_runtime": 28.478, + "eval_samples_per_second": 56.605, + "eval_steps_per_second": 14.151, + "num_input_tokens_seen": 3320792, + "step": 5439 + }, + { + "epoch": 1.5002757859900717, + "grad_norm": 0.1371215581893921, + "learning_rate": 4.962019382530521e-05, + "loss": 0.0814, + "num_input_tokens_seen": 3321592, + "step": 5440 + }, + { + "epoch": 1.5016547159404303, + "grad_norm": 0.1741771548986435, + "learning_rate": 4.961810138944636e-05, + "loss": 0.0976, + "num_input_tokens_seen": 3324312, + "step": 5445 + }, + { + "epoch": 1.5030336458907887, + "grad_norm": 14.487725257873535, + "learning_rate": 4.9616003249926804e-05, + "loss": 0.1017, + "num_input_tokens_seen": 3327416, + "step": 5450 + }, + { + "epoch": 1.5044125758411473, + "grad_norm": 0.16206902265548706, + "learning_rate": 4.9613899407232645e-05, + "loss": 0.1469, + "num_input_tokens_seen": 3331000, + "step": 5455 + }, + { + "epoch": 1.505791505791506, + "grad_norm": 0.09785444289445877, + "learning_rate": 4.9611789861851316e-05, + "loss": 0.0051, + "num_input_tokens_seen": 3334424, + "step": 5460 + }, + { + "epoch": 1.5071704357418643, + "grad_norm": 0.04409218579530716, + "learning_rate": 4.960967461427156e-05, + "loss": 0.0358, + "num_input_tokens_seen": 3337848, + "step": 5465 + }, + { + "epoch": 1.5085493656922229, + "grad_norm": 2.466606855392456, + "learning_rate": 4.960755366498345e-05, + "loss": 0.1063, + "num_input_tokens_seen": 3341176, + "step": 5470 + }, + { + "epoch": 1.5099282956425815, + "grad_norm": 0.016871541738510132, + "learning_rate": 4.960542701447839e-05, + "loss": 0.0101, + "num_input_tokens_seen": 3344952, + "step": 5475 + }, + { + "epoch": 1.5113072255929398, + "grad_norm": 0.4920041859149933, + "learning_rate": 4.9603294663249075e-05, + "loss": 0.109, + "num_input_tokens_seen": 3347608, + "step": 5480 + }, + { + "epoch": 1.5126861555432984, + "grad_norm": 4.596229553222656, + "learning_rate": 4.9601156611789565e-05, + "loss": 0.1028, + "num_input_tokens_seen": 3350456, + "step": 5485 + }, + { + "epoch": 1.514065085493657, + "grad_norm": 4.7042694091796875, + "learning_rate": 4.95990128605952e-05, + "loss": 0.0807, + "num_input_tokens_seen": 3353272, + "step": 5490 + }, + { + "epoch": 1.5154440154440154, + "grad_norm": 0.4432671368122101, + "learning_rate": 4.959686341016266e-05, + "loss": 0.0056, + "num_input_tokens_seen": 3356184, + "step": 5495 + }, + { + "epoch": 1.516822945394374, + "grad_norm": 0.04262305423617363, + "learning_rate": 4.959470826098994e-05, + "loss": 0.0696, + "num_input_tokens_seen": 3358392, + "step": 5500 + }, + { + "epoch": 1.5182018753447326, + "grad_norm": 0.14360962808132172, + "learning_rate": 4.9592547413576364e-05, + "loss": 0.0154, + "num_input_tokens_seen": 3360440, + "step": 5505 + }, + { + "epoch": 1.519580805295091, + "grad_norm": 0.1600494682788849, + "learning_rate": 4.959038086842255e-05, + "loss": 0.1149, + "num_input_tokens_seen": 3363768, + "step": 5510 + }, + { + "epoch": 1.5209597352454496, + "grad_norm": 3.6025373935699463, + "learning_rate": 4.958820862603049e-05, + "loss": 0.1305, + "num_input_tokens_seen": 3367032, + "step": 5515 + }, + { + "epoch": 1.5223386651958082, + "grad_norm": 0.0611116848886013, + "learning_rate": 4.958603068690342e-05, + "loss": 0.1367, + "num_input_tokens_seen": 3370136, + "step": 5520 + }, + { + "epoch": 1.5237175951461666, + "grad_norm": 14.436418533325195, + "learning_rate": 4.958384705154597e-05, + "loss": 0.129, + "num_input_tokens_seen": 3372920, + "step": 5525 + }, + { + "epoch": 1.525096525096525, + "grad_norm": 1.3966645002365112, + "learning_rate": 4.958165772046404e-05, + "loss": 0.1134, + "num_input_tokens_seen": 3377784, + "step": 5530 + }, + { + "epoch": 1.5264754550468838, + "grad_norm": 0.14405836164951324, + "learning_rate": 4.957946269416488e-05, + "loss": 0.0256, + "num_input_tokens_seen": 3380344, + "step": 5535 + }, + { + "epoch": 1.5278543849972421, + "grad_norm": 0.522388756275177, + "learning_rate": 4.957726197315703e-05, + "loss": 0.1438, + "num_input_tokens_seen": 3383064, + "step": 5540 + }, + { + "epoch": 1.5292333149476005, + "grad_norm": 0.15123750269412994, + "learning_rate": 4.9575055557950376e-05, + "loss": 0.0924, + "num_input_tokens_seen": 3386520, + "step": 5545 + }, + { + "epoch": 1.5306122448979593, + "grad_norm": 0.127592071890831, + "learning_rate": 4.957284344905611e-05, + "loss": 0.1141, + "num_input_tokens_seen": 3389208, + "step": 5550 + }, + { + "epoch": 1.5319911748483177, + "grad_norm": 0.21743938326835632, + "learning_rate": 4.957062564698675e-05, + "loss": 0.0064, + "num_input_tokens_seen": 3392504, + "step": 5555 + }, + { + "epoch": 1.533370104798676, + "grad_norm": 2.8336257934570312, + "learning_rate": 4.9568402152256114e-05, + "loss": 0.0465, + "num_input_tokens_seen": 3394968, + "step": 5560 + }, + { + "epoch": 1.534749034749035, + "grad_norm": 4.183019161224365, + "learning_rate": 4.9566172965379364e-05, + "loss": 0.0845, + "num_input_tokens_seen": 3398456, + "step": 5565 + }, + { + "epoch": 1.5361279646993933, + "grad_norm": 0.024595461785793304, + "learning_rate": 4.9563938086872985e-05, + "loss": 0.0516, + "num_input_tokens_seen": 3402488, + "step": 5570 + }, + { + "epoch": 1.5375068946497517, + "grad_norm": 0.011681238189339638, + "learning_rate": 4.9561697517254746e-05, + "loss": 0.0495, + "num_input_tokens_seen": 3405304, + "step": 5575 + }, + { + "epoch": 1.5388858246001105, + "grad_norm": 6.168275833129883, + "learning_rate": 4.9559451257043754e-05, + "loss": 0.1657, + "num_input_tokens_seen": 3407800, + "step": 5580 + }, + { + "epoch": 1.5402647545504689, + "grad_norm": 0.2670768201351166, + "learning_rate": 4.955719930676045e-05, + "loss": 0.089, + "num_input_tokens_seen": 3410424, + "step": 5585 + }, + { + "epoch": 1.5416436845008272, + "grad_norm": 0.07976247370243073, + "learning_rate": 4.955494166692657e-05, + "loss": 0.1079, + "num_input_tokens_seen": 3413048, + "step": 5590 + }, + { + "epoch": 1.5430226144511858, + "grad_norm": 12.071476936340332, + "learning_rate": 4.955267833806517e-05, + "loss": 0.0892, + "num_input_tokens_seen": 3415992, + "step": 5595 + }, + { + "epoch": 1.5444015444015444, + "grad_norm": 0.34182286262512207, + "learning_rate": 4.955040932070065e-05, + "loss": 0.0068, + "num_input_tokens_seen": 3418200, + "step": 5600 + }, + { + "epoch": 1.5457804743519028, + "grad_norm": 7.132200717926025, + "learning_rate": 4.9548134615358696e-05, + "loss": 0.1633, + "num_input_tokens_seen": 3420440, + "step": 5605 + }, + { + "epoch": 1.5471594043022614, + "grad_norm": 0.07416162639856339, + "learning_rate": 4.954585422256633e-05, + "loss": 0.0367, + "num_input_tokens_seen": 3425784, + "step": 5610 + }, + { + "epoch": 1.54853833425262, + "grad_norm": 0.010376347228884697, + "learning_rate": 4.954356814285187e-05, + "loss": 0.0518, + "num_input_tokens_seen": 3427896, + "step": 5615 + }, + { + "epoch": 1.5499172642029784, + "grad_norm": 0.13708214461803436, + "learning_rate": 4.954127637674498e-05, + "loss": 0.1544, + "num_input_tokens_seen": 3431224, + "step": 5620 + }, + { + "epoch": 1.551296194153337, + "grad_norm": 5.041487216949463, + "learning_rate": 4.9538978924776634e-05, + "loss": 0.216, + "num_input_tokens_seen": 3434392, + "step": 5625 + }, + { + "epoch": 1.5526751241036956, + "grad_norm": 0.03210623562335968, + "learning_rate": 4.953667578747911e-05, + "loss": 0.0421, + "num_input_tokens_seen": 3437464, + "step": 5630 + }, + { + "epoch": 1.554054054054054, + "grad_norm": 0.5667808651924133, + "learning_rate": 4.953436696538603e-05, + "loss": 0.0088, + "num_input_tokens_seen": 3439896, + "step": 5635 + }, + { + "epoch": 1.5554329840044125, + "grad_norm": 0.040445443242788315, + "learning_rate": 4.953205245903228e-05, + "loss": 0.0089, + "num_input_tokens_seen": 3443192, + "step": 5640 + }, + { + "epoch": 1.5568119139547711, + "grad_norm": 4.081082344055176, + "learning_rate": 4.952973226895414e-05, + "loss": 0.0809, + "num_input_tokens_seen": 3445528, + "step": 5645 + }, + { + "epoch": 1.5581908439051295, + "grad_norm": 3.174957036972046, + "learning_rate": 4.952740639568913e-05, + "loss": 0.1053, + "num_input_tokens_seen": 3449272, + "step": 5650 + }, + { + "epoch": 1.5595697738554881, + "grad_norm": 4.739900588989258, + "learning_rate": 4.952507483977614e-05, + "loss": 0.1118, + "num_input_tokens_seen": 3452440, + "step": 5655 + }, + { + "epoch": 1.5609487038058467, + "grad_norm": 13.469058990478516, + "learning_rate": 4.952273760175535e-05, + "loss": 0.0588, + "num_input_tokens_seen": 3455288, + "step": 5660 + }, + { + "epoch": 1.562327633756205, + "grad_norm": 0.2554466426372528, + "learning_rate": 4.952039468216827e-05, + "loss": 0.013, + "num_input_tokens_seen": 3457528, + "step": 5665 + }, + { + "epoch": 1.5637065637065637, + "grad_norm": 0.03572073578834534, + "learning_rate": 4.9518046081557714e-05, + "loss": 0.0066, + "num_input_tokens_seen": 3461016, + "step": 5670 + }, + { + "epoch": 1.5650854936569223, + "grad_norm": 0.023445801809430122, + "learning_rate": 4.9515691800467826e-05, + "loss": 0.0023, + "num_input_tokens_seen": 3464152, + "step": 5675 + }, + { + "epoch": 1.5664644236072807, + "grad_norm": 0.017090464010834694, + "learning_rate": 4.951333183944406e-05, + "loss": 0.1049, + "num_input_tokens_seen": 3467384, + "step": 5680 + }, + { + "epoch": 1.5678433535576393, + "grad_norm": 0.023752909153699875, + "learning_rate": 4.9510966199033174e-05, + "loss": 0.059, + "num_input_tokens_seen": 3472760, + "step": 5685 + }, + { + "epoch": 1.5692222835079979, + "grad_norm": 3.6533703804016113, + "learning_rate": 4.950859487978326e-05, + "loss": 0.0686, + "num_input_tokens_seen": 3475640, + "step": 5690 + }, + { + "epoch": 1.5706012134583562, + "grad_norm": 0.016958434134721756, + "learning_rate": 4.9506217882243724e-05, + "loss": 0.313, + "num_input_tokens_seen": 3479224, + "step": 5695 + }, + { + "epoch": 1.5719801434087148, + "grad_norm": 0.473028302192688, + "learning_rate": 4.950383520696528e-05, + "loss": 0.0368, + "num_input_tokens_seen": 3482616, + "step": 5700 + }, + { + "epoch": 1.5733590733590734, + "grad_norm": 0.14230072498321533, + "learning_rate": 4.950144685449994e-05, + "loss": 0.0471, + "num_input_tokens_seen": 3485208, + "step": 5705 + }, + { + "epoch": 1.5747380033094318, + "grad_norm": 0.10025908052921295, + "learning_rate": 4.9499052825401085e-05, + "loss": 0.0229, + "num_input_tokens_seen": 3487736, + "step": 5710 + }, + { + "epoch": 1.5761169332597904, + "grad_norm": 0.12664291262626648, + "learning_rate": 4.949665312022336e-05, + "loss": 0.0187, + "num_input_tokens_seen": 3490808, + "step": 5715 + }, + { + "epoch": 1.577495863210149, + "grad_norm": 0.052522990852594376, + "learning_rate": 4.9494247739522735e-05, + "loss": 0.0116, + "num_input_tokens_seen": 3494776, + "step": 5720 + }, + { + "epoch": 1.5788747931605074, + "grad_norm": 6.15330171585083, + "learning_rate": 4.949183668385652e-05, + "loss": 0.066, + "num_input_tokens_seen": 3497720, + "step": 5725 + }, + { + "epoch": 1.580253723110866, + "grad_norm": 0.10656528174877167, + "learning_rate": 4.94894199537833e-05, + "loss": 0.1521, + "num_input_tokens_seen": 3500408, + "step": 5730 + }, + { + "epoch": 1.5816326530612246, + "grad_norm": 5.9091291427612305, + "learning_rate": 4.948699754986301e-05, + "loss": 0.1558, + "num_input_tokens_seen": 3502392, + "step": 5735 + }, + { + "epoch": 1.583011583011583, + "grad_norm": 0.08636471629142761, + "learning_rate": 4.9484569472656886e-05, + "loss": 0.0582, + "num_input_tokens_seen": 3504984, + "step": 5740 + }, + { + "epoch": 1.5843905129619416, + "grad_norm": 0.11192071437835693, + "learning_rate": 4.948213572272748e-05, + "loss": 0.0024, + "num_input_tokens_seen": 3507928, + "step": 5745 + }, + { + "epoch": 1.5857694429123002, + "grad_norm": 0.16028521955013275, + "learning_rate": 4.947969630063865e-05, + "loss": 0.2417, + "num_input_tokens_seen": 3512280, + "step": 5750 + }, + { + "epoch": 1.5871483728626585, + "grad_norm": 0.029822317883372307, + "learning_rate": 4.947725120695558e-05, + "loss": 0.0822, + "num_input_tokens_seen": 3515000, + "step": 5755 + }, + { + "epoch": 1.5885273028130171, + "grad_norm": 1.8530558347702026, + "learning_rate": 4.947480044224476e-05, + "loss": 0.1693, + "num_input_tokens_seen": 3517912, + "step": 5760 + }, + { + "epoch": 1.5899062327633757, + "grad_norm": 0.10540071874856949, + "learning_rate": 4.9472344007074003e-05, + "loss": 0.0098, + "num_input_tokens_seen": 3520472, + "step": 5765 + }, + { + "epoch": 1.591285162713734, + "grad_norm": 0.17895671725273132, + "learning_rate": 4.9469881902012426e-05, + "loss": 0.1299, + "num_input_tokens_seen": 3522520, + "step": 5770 + }, + { + "epoch": 1.5926640926640927, + "grad_norm": 2.7233614921569824, + "learning_rate": 4.946741412763046e-05, + "loss": 0.0384, + "num_input_tokens_seen": 3524792, + "step": 5775 + }, + { + "epoch": 1.5940430226144513, + "grad_norm": 0.06440497934818268, + "learning_rate": 4.946494068449986e-05, + "loss": 0.0065, + "num_input_tokens_seen": 3527480, + "step": 5780 + }, + { + "epoch": 1.5954219525648097, + "grad_norm": 5.582112789154053, + "learning_rate": 4.946246157319368e-05, + "loss": 0.0406, + "num_input_tokens_seen": 3530712, + "step": 5785 + }, + { + "epoch": 1.5968008825151683, + "grad_norm": 0.07465437054634094, + "learning_rate": 4.94599767942863e-05, + "loss": 0.1494, + "num_input_tokens_seen": 3533176, + "step": 5790 + }, + { + "epoch": 1.5981798124655269, + "grad_norm": 3.826685905456543, + "learning_rate": 4.945748634835341e-05, + "loss": 0.1036, + "num_input_tokens_seen": 3536184, + "step": 5795 + }, + { + "epoch": 1.5995587424158852, + "grad_norm": 0.43646442890167236, + "learning_rate": 4.9454990235972e-05, + "loss": 0.0056, + "num_input_tokens_seen": 3538744, + "step": 5800 + }, + { + "epoch": 1.6009376723662438, + "grad_norm": 0.04269237816333771, + "learning_rate": 4.945248845772039e-05, + "loss": 0.0653, + "num_input_tokens_seen": 3541496, + "step": 5805 + }, + { + "epoch": 1.6023166023166024, + "grad_norm": 17.95658302307129, + "learning_rate": 4.9449981014178204e-05, + "loss": 0.0169, + "num_input_tokens_seen": 3544376, + "step": 5810 + }, + { + "epoch": 1.6036955322669608, + "grad_norm": 5.416156768798828, + "learning_rate": 4.944746790592639e-05, + "loss": 0.0707, + "num_input_tokens_seen": 3548056, + "step": 5815 + }, + { + "epoch": 1.6050744622173192, + "grad_norm": 0.2148938924074173, + "learning_rate": 4.944494913354718e-05, + "loss": 0.0243, + "num_input_tokens_seen": 3550872, + "step": 5820 + }, + { + "epoch": 1.606453392167678, + "grad_norm": 0.03317075967788696, + "learning_rate": 4.9442424697624154e-05, + "loss": 0.0052, + "num_input_tokens_seen": 3554488, + "step": 5825 + }, + { + "epoch": 1.6078323221180364, + "grad_norm": 0.02118758298456669, + "learning_rate": 4.9439894598742184e-05, + "loss": 0.1171, + "num_input_tokens_seen": 3557848, + "step": 5830 + }, + { + "epoch": 1.6092112520683948, + "grad_norm": 0.10012377798557281, + "learning_rate": 4.9437358837487456e-05, + "loss": 0.003, + "num_input_tokens_seen": 3561560, + "step": 5835 + }, + { + "epoch": 1.6105901820187536, + "grad_norm": 0.02079702541232109, + "learning_rate": 4.943481741444748e-05, + "loss": 0.1022, + "num_input_tokens_seen": 3563768, + "step": 5840 + }, + { + "epoch": 1.611969111969112, + "grad_norm": 0.014981705695390701, + "learning_rate": 4.943227033021104e-05, + "loss": 0.0804, + "num_input_tokens_seen": 3566712, + "step": 5845 + }, + { + "epoch": 1.6133480419194703, + "grad_norm": 27.160951614379883, + "learning_rate": 4.942971758536828e-05, + "loss": 0.021, + "num_input_tokens_seen": 3568984, + "step": 5850 + }, + { + "epoch": 1.6147269718698292, + "grad_norm": 0.18628834187984467, + "learning_rate": 4.942715918051063e-05, + "loss": 0.0787, + "num_input_tokens_seen": 3572536, + "step": 5855 + }, + { + "epoch": 1.6161059018201875, + "grad_norm": 0.04080803319811821, + "learning_rate": 4.942459511623084e-05, + "loss": 0.034, + "num_input_tokens_seen": 3575320, + "step": 5860 + }, + { + "epoch": 1.617484831770546, + "grad_norm": 5.5744452476501465, + "learning_rate": 4.942202539312296e-05, + "loss": 0.1715, + "num_input_tokens_seen": 3578456, + "step": 5865 + }, + { + "epoch": 1.6188637617209047, + "grad_norm": 20.302270889282227, + "learning_rate": 4.941945001178235e-05, + "loss": 0.1038, + "num_input_tokens_seen": 3580824, + "step": 5870 + }, + { + "epoch": 1.6202426916712631, + "grad_norm": 8.918120384216309, + "learning_rate": 4.9416868972805716e-05, + "loss": 0.1041, + "num_input_tokens_seen": 3583960, + "step": 5875 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.15566612780094147, + "learning_rate": 4.9414282276791024e-05, + "loss": 0.0439, + "num_input_tokens_seen": 3587160, + "step": 5880 + }, + { + "epoch": 1.62300055157198, + "grad_norm": 0.07040894776582718, + "learning_rate": 4.941168992433758e-05, + "loss": 0.0768, + "num_input_tokens_seen": 3590264, + "step": 5885 + }, + { + "epoch": 1.6243794815223387, + "grad_norm": 0.04901428148150444, + "learning_rate": 4.9409091916046e-05, + "loss": 0.0366, + "num_input_tokens_seen": 3592920, + "step": 5890 + }, + { + "epoch": 1.625758411472697, + "grad_norm": 0.7531355619430542, + "learning_rate": 4.9406488252518203e-05, + "loss": 0.0402, + "num_input_tokens_seen": 3595384, + "step": 5895 + }, + { + "epoch": 1.6271373414230557, + "grad_norm": 0.08364991098642349, + "learning_rate": 4.940387893435741e-05, + "loss": 0.0248, + "num_input_tokens_seen": 3597880, + "step": 5900 + }, + { + "epoch": 1.6285162713734143, + "grad_norm": 0.3529362976551056, + "learning_rate": 4.9401263962168174e-05, + "loss": 0.0974, + "num_input_tokens_seen": 3600056, + "step": 5905 + }, + { + "epoch": 1.6298952013237726, + "grad_norm": 11.208588600158691, + "learning_rate": 4.939864333655635e-05, + "loss": 0.1846, + "num_input_tokens_seen": 3603896, + "step": 5910 + }, + { + "epoch": 1.6312741312741312, + "grad_norm": 0.04982038214802742, + "learning_rate": 4.9396017058129085e-05, + "loss": 0.0045, + "num_input_tokens_seen": 3607800, + "step": 5915 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 0.017248449847102165, + "learning_rate": 4.939338512749487e-05, + "loss": 0.0042, + "num_input_tokens_seen": 3610200, + "step": 5920 + }, + { + "epoch": 1.6340319911748482, + "grad_norm": 0.023112991824746132, + "learning_rate": 4.939074754526346e-05, + "loss": 0.0681, + "num_input_tokens_seen": 3612600, + "step": 5925 + }, + { + "epoch": 1.6354109211252068, + "grad_norm": 14.132001876831055, + "learning_rate": 4.938810431204597e-05, + "loss": 0.1507, + "num_input_tokens_seen": 3615032, + "step": 5930 + }, + { + "epoch": 1.6367898510755654, + "grad_norm": 0.09072190523147583, + "learning_rate": 4.9385455428454784e-05, + "loss": 0.073, + "num_input_tokens_seen": 3617656, + "step": 5935 + }, + { + "epoch": 1.6381687810259238, + "grad_norm": 0.09742973744869232, + "learning_rate": 4.9382800895103616e-05, + "loss": 0.0993, + "num_input_tokens_seen": 3621176, + "step": 5940 + }, + { + "epoch": 1.6395477109762824, + "grad_norm": 4.674243450164795, + "learning_rate": 4.938014071260748e-05, + "loss": 0.1224, + "num_input_tokens_seen": 3624824, + "step": 5945 + }, + { + "epoch": 1.640926640926641, + "grad_norm": 0.06306679546833038, + "learning_rate": 4.93774748815827e-05, + "loss": 0.0554, + "num_input_tokens_seen": 3627992, + "step": 5950 + }, + { + "epoch": 1.6423055708769994, + "grad_norm": 8.259720802307129, + "learning_rate": 4.937480340264692e-05, + "loss": 0.1119, + "num_input_tokens_seen": 3630808, + "step": 5955 + }, + { + "epoch": 1.643684500827358, + "grad_norm": 1.2190262079238892, + "learning_rate": 4.937212627641907e-05, + "loss": 0.0354, + "num_input_tokens_seen": 3633048, + "step": 5960 + }, + { + "epoch": 1.6450634307777166, + "grad_norm": 0.08495140075683594, + "learning_rate": 4.936944350351942e-05, + "loss": 0.0496, + "num_input_tokens_seen": 3636152, + "step": 5965 + }, + { + "epoch": 1.646442360728075, + "grad_norm": 0.5213479995727539, + "learning_rate": 4.936675508456952e-05, + "loss": 0.0376, + "num_input_tokens_seen": 3639448, + "step": 5970 + }, + { + "epoch": 1.6478212906784335, + "grad_norm": 5.4620771408081055, + "learning_rate": 4.9364061020192226e-05, + "loss": 0.1021, + "num_input_tokens_seen": 3642712, + "step": 5975 + }, + { + "epoch": 1.6492002206287921, + "grad_norm": 0.01429738663136959, + "learning_rate": 4.936136131101173e-05, + "loss": 0.0036, + "num_input_tokens_seen": 3645464, + "step": 5980 + }, + { + "epoch": 1.6505791505791505, + "grad_norm": 22.447933197021484, + "learning_rate": 4.9358655957653515e-05, + "loss": 0.0347, + "num_input_tokens_seen": 3648408, + "step": 5985 + }, + { + "epoch": 1.651958080529509, + "grad_norm": 14.597185134887695, + "learning_rate": 4.935594496074437e-05, + "loss": 0.1774, + "num_input_tokens_seen": 3653176, + "step": 5990 + }, + { + "epoch": 1.6533370104798677, + "grad_norm": 4.535642147064209, + "learning_rate": 4.935322832091239e-05, + "loss": 0.0545, + "num_input_tokens_seen": 3655928, + "step": 5995 + }, + { + "epoch": 1.654715940430226, + "grad_norm": 9.838122367858887, + "learning_rate": 4.935050603878698e-05, + "loss": 0.0519, + "num_input_tokens_seen": 3659416, + "step": 6000 + }, + { + "epoch": 1.6560948703805847, + "grad_norm": 5.721614360809326, + "learning_rate": 4.9347778114998867e-05, + "loss": 0.0091, + "num_input_tokens_seen": 3662232, + "step": 6005 + }, + { + "epoch": 1.6574738003309433, + "grad_norm": 8.374700546264648, + "learning_rate": 4.9345044550180056e-05, + "loss": 0.0377, + "num_input_tokens_seen": 3664888, + "step": 6010 + }, + { + "epoch": 1.6588527302813016, + "grad_norm": 1.4216573238372803, + "learning_rate": 4.934230534496389e-05, + "loss": 0.0851, + "num_input_tokens_seen": 3667256, + "step": 6015 + }, + { + "epoch": 1.6602316602316602, + "grad_norm": 0.45622751116752625, + "learning_rate": 4.933956049998499e-05, + "loss": 0.0631, + "num_input_tokens_seen": 3670776, + "step": 6020 + }, + { + "epoch": 1.6616105901820188, + "grad_norm": 0.01896779052913189, + "learning_rate": 4.93368100158793e-05, + "loss": 0.0568, + "num_input_tokens_seen": 3673592, + "step": 6025 + }, + { + "epoch": 1.6629895201323772, + "grad_norm": 0.07758942991495132, + "learning_rate": 4.9334053893284074e-05, + "loss": 0.217, + "num_input_tokens_seen": 3676888, + "step": 6030 + }, + { + "epoch": 1.6643684500827358, + "grad_norm": 2.2317328453063965, + "learning_rate": 4.933129213283786e-05, + "loss": 0.0698, + "num_input_tokens_seen": 3680504, + "step": 6035 + }, + { + "epoch": 1.6657473800330944, + "grad_norm": 0.02021419070661068, + "learning_rate": 4.932852473518052e-05, + "loss": 0.0329, + "num_input_tokens_seen": 3683224, + "step": 6040 + }, + { + "epoch": 1.6671263099834528, + "grad_norm": 0.18398411571979523, + "learning_rate": 4.9325751700953226e-05, + "loss": 0.1369, + "num_input_tokens_seen": 3685656, + "step": 6045 + }, + { + "epoch": 1.6685052399338114, + "grad_norm": 0.02169044315814972, + "learning_rate": 4.932297303079844e-05, + "loss": 0.0381, + "num_input_tokens_seen": 3689272, + "step": 6050 + }, + { + "epoch": 1.66988416988417, + "grad_norm": 0.024994079023599625, + "learning_rate": 4.932018872535995e-05, + "loss": 0.012, + "num_input_tokens_seen": 3692440, + "step": 6055 + }, + { + "epoch": 1.6712630998345284, + "grad_norm": 2.8070311546325684, + "learning_rate": 4.931739878528283e-05, + "loss": 0.1459, + "num_input_tokens_seen": 3696728, + "step": 6060 + }, + { + "epoch": 1.672642029784887, + "grad_norm": 6.114677429199219, + "learning_rate": 4.931460321121347e-05, + "loss": 0.1114, + "num_input_tokens_seen": 3699928, + "step": 6065 + }, + { + "epoch": 1.6740209597352456, + "grad_norm": 0.06717639416456223, + "learning_rate": 4.931180200379957e-05, + "loss": 0.0046, + "num_input_tokens_seen": 3703064, + "step": 6070 + }, + { + "epoch": 1.675399889685604, + "grad_norm": 0.03065355494618416, + "learning_rate": 4.930899516369013e-05, + "loss": 0.0174, + "num_input_tokens_seen": 3705656, + "step": 6075 + }, + { + "epoch": 1.6767788196359625, + "grad_norm": 0.3430216312408447, + "learning_rate": 4.9306182691535455e-05, + "loss": 0.1067, + "num_input_tokens_seen": 3708440, + "step": 6080 + }, + { + "epoch": 1.6781577495863211, + "grad_norm": 0.11719603836536407, + "learning_rate": 4.9303364587987146e-05, + "loss": 0.0803, + "num_input_tokens_seen": 3711448, + "step": 6085 + }, + { + "epoch": 1.6795366795366795, + "grad_norm": 0.23762013018131256, + "learning_rate": 4.9300540853698124e-05, + "loss": 0.1752, + "num_input_tokens_seen": 3714040, + "step": 6090 + }, + { + "epoch": 1.680915609487038, + "grad_norm": 0.3315000534057617, + "learning_rate": 4.929771148932261e-05, + "loss": 0.07, + "num_input_tokens_seen": 3716984, + "step": 6095 + }, + { + "epoch": 1.6822945394373967, + "grad_norm": 0.3614923357963562, + "learning_rate": 4.929487649551612e-05, + "loss": 0.0139, + "num_input_tokens_seen": 3720184, + "step": 6100 + }, + { + "epoch": 1.683673469387755, + "grad_norm": 1.4168767929077148, + "learning_rate": 4.929203587293548e-05, + "loss": 0.0564, + "num_input_tokens_seen": 3723064, + "step": 6105 + }, + { + "epoch": 1.6850523993381135, + "grad_norm": 0.013246800750494003, + "learning_rate": 4.928918962223884e-05, + "loss": 0.0096, + "num_input_tokens_seen": 3727224, + "step": 6110 + }, + { + "epoch": 1.6864313292884723, + "grad_norm": 0.08132466673851013, + "learning_rate": 4.928633774408561e-05, + "loss": 0.0596, + "num_input_tokens_seen": 3730424, + "step": 6115 + }, + { + "epoch": 1.6878102592388307, + "grad_norm": 2.265897750854492, + "learning_rate": 4.9283480239136546e-05, + "loss": 0.0616, + "num_input_tokens_seen": 3733528, + "step": 6120 + }, + { + "epoch": 1.689189189189189, + "grad_norm": 0.226248636841774, + "learning_rate": 4.9280617108053686e-05, + "loss": 0.016, + "num_input_tokens_seen": 3737688, + "step": 6125 + }, + { + "epoch": 1.6905681191395479, + "grad_norm": 3.226304292678833, + "learning_rate": 4.927774835150037e-05, + "loss": 0.0355, + "num_input_tokens_seen": 3743608, + "step": 6130 + }, + { + "epoch": 1.6919470490899062, + "grad_norm": 16.77685546875, + "learning_rate": 4.927487397014127e-05, + "loss": 0.1929, + "num_input_tokens_seen": 3746712, + "step": 6135 + }, + { + "epoch": 1.6933259790402646, + "grad_norm": 3.372830867767334, + "learning_rate": 4.9271993964642306e-05, + "loss": 0.0517, + "num_input_tokens_seen": 3750264, + "step": 6140 + }, + { + "epoch": 1.6947049089906234, + "grad_norm": 0.5032870173454285, + "learning_rate": 4.9269108335670755e-05, + "loss": 0.1078, + "num_input_tokens_seen": 3753176, + "step": 6145 + }, + { + "epoch": 1.6960838389409818, + "grad_norm": 0.12772108614444733, + "learning_rate": 4.926621708389517e-05, + "loss": 0.085, + "num_input_tokens_seen": 3756152, + "step": 6150 + }, + { + "epoch": 1.6974627688913402, + "grad_norm": 0.5865352153778076, + "learning_rate": 4.926332020998542e-05, + "loss": 0.0245, + "num_input_tokens_seen": 3758712, + "step": 6155 + }, + { + "epoch": 1.698841698841699, + "grad_norm": 2.2386634349823, + "learning_rate": 4.926041771461266e-05, + "loss": 0.1907, + "num_input_tokens_seen": 3761848, + "step": 6160 + }, + { + "epoch": 1.7002206287920574, + "grad_norm": 0.07992564141750336, + "learning_rate": 4.925750959844936e-05, + "loss": 0.055, + "num_input_tokens_seen": 3764632, + "step": 6165 + }, + { + "epoch": 1.7015995587424158, + "grad_norm": 4.448506832122803, + "learning_rate": 4.92545958621693e-05, + "loss": 0.1076, + "num_input_tokens_seen": 3767448, + "step": 6170 + }, + { + "epoch": 1.7029784886927746, + "grad_norm": 0.04182408004999161, + "learning_rate": 4.925167650644752e-05, + "loss": 0.0571, + "num_input_tokens_seen": 3770456, + "step": 6175 + }, + { + "epoch": 1.704357418643133, + "grad_norm": 0.11473249644041061, + "learning_rate": 4.924875153196042e-05, + "loss": 0.0649, + "num_input_tokens_seen": 3773112, + "step": 6180 + }, + { + "epoch": 1.7057363485934913, + "grad_norm": 8.754462242126465, + "learning_rate": 4.9245820939385664e-05, + "loss": 0.0438, + "num_input_tokens_seen": 3776312, + "step": 6185 + }, + { + "epoch": 1.70711527854385, + "grad_norm": 2.416804552078247, + "learning_rate": 4.924288472940224e-05, + "loss": 0.0987, + "num_input_tokens_seen": 3779416, + "step": 6190 + }, + { + "epoch": 1.7084942084942085, + "grad_norm": 0.26092395186424255, + "learning_rate": 4.923994290269041e-05, + "loss": 0.0695, + "num_input_tokens_seen": 3782808, + "step": 6195 + }, + { + "epoch": 1.709873138444567, + "grad_norm": 0.2164454162120819, + "learning_rate": 4.9236995459931764e-05, + "loss": 0.2093, + "num_input_tokens_seen": 3785336, + "step": 6200 + }, + { + "epoch": 1.7112520683949255, + "grad_norm": 0.7891464829444885, + "learning_rate": 4.923404240180918e-05, + "loss": 0.0536, + "num_input_tokens_seen": 3787992, + "step": 6205 + }, + { + "epoch": 1.712630998345284, + "grad_norm": 0.25373661518096924, + "learning_rate": 4.9231083729006825e-05, + "loss": 0.0152, + "num_input_tokens_seen": 3791384, + "step": 6210 + }, + { + "epoch": 1.7140099282956425, + "grad_norm": 0.12566149234771729, + "learning_rate": 4.92281194422102e-05, + "loss": 0.0359, + "num_input_tokens_seen": 3795064, + "step": 6215 + }, + { + "epoch": 1.715388858246001, + "grad_norm": 0.13641464710235596, + "learning_rate": 4.9225149542106085e-05, + "loss": 0.0489, + "num_input_tokens_seen": 3797784, + "step": 6220 + }, + { + "epoch": 1.7167677881963597, + "grad_norm": 0.0623808354139328, + "learning_rate": 4.922217402938255e-05, + "loss": 0.0874, + "num_input_tokens_seen": 3800312, + "step": 6225 + }, + { + "epoch": 1.718146718146718, + "grad_norm": 0.6357030868530273, + "learning_rate": 4.921919290472899e-05, + "loss": 0.0808, + "num_input_tokens_seen": 3803320, + "step": 6230 + }, + { + "epoch": 1.7195256480970766, + "grad_norm": 0.2097962498664856, + "learning_rate": 4.92162061688361e-05, + "loss": 0.24, + "num_input_tokens_seen": 3806072, + "step": 6235 + }, + { + "epoch": 1.7209045780474352, + "grad_norm": 0.0002275997248943895, + "learning_rate": 4.9213213822395836e-05, + "loss": 0.0011, + "num_input_tokens_seen": 3809816, + "step": 6240 + }, + { + "epoch": 1.7222835079977936, + "grad_norm": 0.011056779883801937, + "learning_rate": 4.92102158661015e-05, + "loss": 0.0935, + "num_input_tokens_seen": 3812696, + "step": 6245 + }, + { + "epoch": 1.7236624379481522, + "grad_norm": 0.010154884308576584, + "learning_rate": 4.9207212300647675e-05, + "loss": 0.005, + "num_input_tokens_seen": 3815576, + "step": 6250 + }, + { + "epoch": 1.7250413678985108, + "grad_norm": 0.07235867530107498, + "learning_rate": 4.920420312673023e-05, + "loss": 0.0734, + "num_input_tokens_seen": 3818200, + "step": 6255 + }, + { + "epoch": 1.7264202978488692, + "grad_norm": 16.857227325439453, + "learning_rate": 4.9201188345046365e-05, + "loss": 0.0735, + "num_input_tokens_seen": 3821112, + "step": 6260 + }, + { + "epoch": 1.7277992277992278, + "grad_norm": 0.05080268159508705, + "learning_rate": 4.919816795629456e-05, + "loss": 0.0802, + "num_input_tokens_seen": 3824024, + "step": 6265 + }, + { + "epoch": 1.7291781577495864, + "grad_norm": 5.2233781814575195, + "learning_rate": 4.9195141961174586e-05, + "loss": 0.1097, + "num_input_tokens_seen": 3826488, + "step": 6270 + }, + { + "epoch": 1.7305570876999448, + "grad_norm": 0.3680282235145569, + "learning_rate": 4.919211036038752e-05, + "loss": 0.1731, + "num_input_tokens_seen": 3828888, + "step": 6275 + }, + { + "epoch": 1.7319360176503034, + "grad_norm": 5.7594733238220215, + "learning_rate": 4.918907315463576e-05, + "loss": 0.1625, + "num_input_tokens_seen": 3832792, + "step": 6280 + }, + { + "epoch": 1.733314947600662, + "grad_norm": 1.49729585647583, + "learning_rate": 4.918603034462296e-05, + "loss": 0.0561, + "num_input_tokens_seen": 3835512, + "step": 6285 + }, + { + "epoch": 1.7346938775510203, + "grad_norm": 5.442667484283447, + "learning_rate": 4.9182981931054104e-05, + "loss": 0.1574, + "num_input_tokens_seen": 3838104, + "step": 6290 + }, + { + "epoch": 1.736072807501379, + "grad_norm": 6.028275489807129, + "learning_rate": 4.9179927914635474e-05, + "loss": 0.0641, + "num_input_tokens_seen": 3840728, + "step": 6295 + }, + { + "epoch": 1.7374517374517375, + "grad_norm": 0.01224227249622345, + "learning_rate": 4.9176868296074626e-05, + "loss": 0.0363, + "num_input_tokens_seen": 3843128, + "step": 6300 + }, + { + "epoch": 1.738830667402096, + "grad_norm": 0.9339450001716614, + "learning_rate": 4.917380307608045e-05, + "loss": 0.09, + "num_input_tokens_seen": 3845912, + "step": 6305 + }, + { + "epoch": 1.7402095973524545, + "grad_norm": 0.4195496439933777, + "learning_rate": 4.91707322553631e-05, + "loss": 0.043, + "num_input_tokens_seen": 3848504, + "step": 6310 + }, + { + "epoch": 1.741588527302813, + "grad_norm": 0.5413926243782043, + "learning_rate": 4.916765583463404e-05, + "loss": 0.0278, + "num_input_tokens_seen": 3851064, + "step": 6315 + }, + { + "epoch": 1.7429674572531715, + "grad_norm": 5.684319972991943, + "learning_rate": 4.916457381460603e-05, + "loss": 0.0732, + "num_input_tokens_seen": 3854904, + "step": 6320 + }, + { + "epoch": 1.74434638720353, + "grad_norm": 0.09966620802879333, + "learning_rate": 4.9161486195993146e-05, + "loss": 0.031, + "num_input_tokens_seen": 3857944, + "step": 6325 + }, + { + "epoch": 1.7457253171538887, + "grad_norm": 0.01838534139096737, + "learning_rate": 4.9158392979510735e-05, + "loss": 0.0635, + "num_input_tokens_seen": 3861304, + "step": 6330 + }, + { + "epoch": 1.747104247104247, + "grad_norm": 0.2217266708612442, + "learning_rate": 4.915529416587544e-05, + "loss": 0.0056, + "num_input_tokens_seen": 3863768, + "step": 6335 + }, + { + "epoch": 1.7484831770546057, + "grad_norm": 0.06326927244663239, + "learning_rate": 4.915218975580523e-05, + "loss": 0.0303, + "num_input_tokens_seen": 3867256, + "step": 6340 + }, + { + "epoch": 1.7498621070049643, + "grad_norm": 0.11609753221273422, + "learning_rate": 4.9149079750019346e-05, + "loss": 0.1175, + "num_input_tokens_seen": 3870008, + "step": 6345 + }, + { + "epoch": 1.7512410369553226, + "grad_norm": 0.0015633984003216028, + "learning_rate": 4.914596414923832e-05, + "loss": 0.0775, + "num_input_tokens_seen": 3872696, + "step": 6350 + }, + { + "epoch": 1.7526199669056812, + "grad_norm": 3.415329933166504, + "learning_rate": 4.914284295418401e-05, + "loss": 0.3722, + "num_input_tokens_seen": 3875032, + "step": 6355 + }, + { + "epoch": 1.7539988968560398, + "grad_norm": 0.009637940675020218, + "learning_rate": 4.913971616557955e-05, + "loss": 0.1957, + "num_input_tokens_seen": 3877688, + "step": 6360 + }, + { + "epoch": 1.7553778268063982, + "grad_norm": 0.5295152068138123, + "learning_rate": 4.913658378414936e-05, + "loss": 0.0034, + "num_input_tokens_seen": 3880792, + "step": 6365 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.09815610200166702, + "learning_rate": 4.9133445810619166e-05, + "loss": 0.0386, + "num_input_tokens_seen": 3883768, + "step": 6370 + }, + { + "epoch": 1.7581356867071154, + "grad_norm": 16.54078483581543, + "learning_rate": 4.913030224571601e-05, + "loss": 0.091, + "num_input_tokens_seen": 3886904, + "step": 6375 + }, + { + "epoch": 1.7595146166574738, + "grad_norm": 0.4059413969516754, + "learning_rate": 4.912715309016819e-05, + "loss": 0.0171, + "num_input_tokens_seen": 3889336, + "step": 6380 + }, + { + "epoch": 1.7608935466078324, + "grad_norm": 2.329383134841919, + "learning_rate": 4.9123998344705344e-05, + "loss": 0.1211, + "num_input_tokens_seen": 3891736, + "step": 6385 + }, + { + "epoch": 1.762272476558191, + "grad_norm": 0.05518307909369469, + "learning_rate": 4.912083801005836e-05, + "loss": 0.1119, + "num_input_tokens_seen": 3895256, + "step": 6390 + }, + { + "epoch": 1.7636514065085493, + "grad_norm": 0.5083315372467041, + "learning_rate": 4.911767208695944e-05, + "loss": 0.0081, + "num_input_tokens_seen": 3898264, + "step": 6395 + }, + { + "epoch": 1.7650303364589077, + "grad_norm": 0.04471089318394661, + "learning_rate": 4.9114500576142106e-05, + "loss": 0.1129, + "num_input_tokens_seen": 3902392, + "step": 6400 + }, + { + "epoch": 1.7664092664092665, + "grad_norm": 0.07639720290899277, + "learning_rate": 4.911132347834114e-05, + "loss": 0.0059, + "num_input_tokens_seen": 3905080, + "step": 6405 + }, + { + "epoch": 1.767788196359625, + "grad_norm": 0.09439694881439209, + "learning_rate": 4.910814079429262e-05, + "loss": 0.0305, + "num_input_tokens_seen": 3910168, + "step": 6410 + }, + { + "epoch": 1.7691671263099833, + "grad_norm": 2.5598011016845703, + "learning_rate": 4.9104952524733936e-05, + "loss": 0.0522, + "num_input_tokens_seen": 3914488, + "step": 6415 + }, + { + "epoch": 1.7705460562603421, + "grad_norm": 6.257484436035156, + "learning_rate": 4.910175867040377e-05, + "loss": 0.1422, + "num_input_tokens_seen": 3916632, + "step": 6420 + }, + { + "epoch": 1.7719249862107005, + "grad_norm": 0.18428707122802734, + "learning_rate": 4.9098559232042086e-05, + "loss": 0.0618, + "num_input_tokens_seen": 3919800, + "step": 6425 + }, + { + "epoch": 1.7733039161610589, + "grad_norm": 0.14134041965007782, + "learning_rate": 4.909535421039014e-05, + "loss": 0.0084, + "num_input_tokens_seen": 3922712, + "step": 6430 + }, + { + "epoch": 1.7746828461114177, + "grad_norm": 0.15336433053016663, + "learning_rate": 4.909214360619051e-05, + "loss": 0.0701, + "num_input_tokens_seen": 3925240, + "step": 6435 + }, + { + "epoch": 1.776061776061776, + "grad_norm": 0.0657072439789772, + "learning_rate": 4.908892742018703e-05, + "loss": 0.0432, + "num_input_tokens_seen": 3927576, + "step": 6440 + }, + { + "epoch": 1.7774407060121344, + "grad_norm": 6.361442565917969, + "learning_rate": 4.908570565312485e-05, + "loss": 0.0902, + "num_input_tokens_seen": 3930104, + "step": 6445 + }, + { + "epoch": 1.7788196359624933, + "grad_norm": 0.012179559096693993, + "learning_rate": 4.908247830575041e-05, + "loss": 0.062, + "num_input_tokens_seen": 3932472, + "step": 6450 + }, + { + "epoch": 1.7801985659128516, + "grad_norm": 9.412137985229492, + "learning_rate": 4.9079245378811436e-05, + "loss": 0.101, + "num_input_tokens_seen": 3935768, + "step": 6455 + }, + { + "epoch": 1.78157749586321, + "grad_norm": 0.28577500581741333, + "learning_rate": 4.907600687305694e-05, + "loss": 0.059, + "num_input_tokens_seen": 3938360, + "step": 6460 + }, + { + "epoch": 1.7829564258135688, + "grad_norm": 0.19389663636684418, + "learning_rate": 4.907276278923726e-05, + "loss": 0.1215, + "num_input_tokens_seen": 3941816, + "step": 6465 + }, + { + "epoch": 1.7843353557639272, + "grad_norm": 0.08342210948467255, + "learning_rate": 4.906951312810399e-05, + "loss": 0.1273, + "num_input_tokens_seen": 3944440, + "step": 6470 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 0.1810239851474762, + "learning_rate": 4.906625789041003e-05, + "loss": 0.0042, + "num_input_tokens_seen": 3946488, + "step": 6475 + }, + { + "epoch": 1.7870932156646442, + "grad_norm": 0.38282859325408936, + "learning_rate": 4.906299707690958e-05, + "loss": 0.0236, + "num_input_tokens_seen": 3950040, + "step": 6480 + }, + { + "epoch": 1.7884721456150028, + "grad_norm": 0.09041977673768997, + "learning_rate": 4.9059730688358105e-05, + "loss": 0.0655, + "num_input_tokens_seen": 3953048, + "step": 6485 + }, + { + "epoch": 1.7898510755653612, + "grad_norm": 3.658358573913574, + "learning_rate": 4.905645872551241e-05, + "loss": 0.1367, + "num_input_tokens_seen": 3957048, + "step": 6490 + }, + { + "epoch": 1.7912300055157198, + "grad_norm": 0.10829849541187286, + "learning_rate": 4.905318118913054e-05, + "loss": 0.0662, + "num_input_tokens_seen": 3959256, + "step": 6495 + }, + { + "epoch": 1.7926089354660784, + "grad_norm": 1.973666787147522, + "learning_rate": 4.9049898079971844e-05, + "loss": 0.113, + "num_input_tokens_seen": 3963512, + "step": 6500 + }, + { + "epoch": 1.7939878654164367, + "grad_norm": 9.233744621276855, + "learning_rate": 4.9046609398797e-05, + "loss": 0.03, + "num_input_tokens_seen": 3967384, + "step": 6505 + }, + { + "epoch": 1.7953667953667953, + "grad_norm": 0.13675209879875183, + "learning_rate": 4.904331514636793e-05, + "loss": 0.1139, + "num_input_tokens_seen": 3970264, + "step": 6510 + }, + { + "epoch": 1.796745725317154, + "grad_norm": 4.87539529800415, + "learning_rate": 4.9040015323447865e-05, + "loss": 0.0473, + "num_input_tokens_seen": 3973912, + "step": 6515 + }, + { + "epoch": 1.7981246552675123, + "grad_norm": 3.0412869453430176, + "learning_rate": 4.903670993080134e-05, + "loss": 0.1383, + "num_input_tokens_seen": 3977272, + "step": 6520 + }, + { + "epoch": 1.799503585217871, + "grad_norm": 0.11310960352420807, + "learning_rate": 4.9033398969194145e-05, + "loss": 0.0689, + "num_input_tokens_seen": 3980312, + "step": 6525 + }, + { + "epoch": 1.8008825151682295, + "grad_norm": 0.03477763757109642, + "learning_rate": 4.9030082439393396e-05, + "loss": 0.0915, + "num_input_tokens_seen": 3983608, + "step": 6530 + }, + { + "epoch": 1.8022614451185879, + "grad_norm": 3.526385545730591, + "learning_rate": 4.902676034216749e-05, + "loss": 0.0755, + "num_input_tokens_seen": 3987128, + "step": 6535 + }, + { + "epoch": 1.8036403750689465, + "grad_norm": 0.02537163533270359, + "learning_rate": 4.90234326782861e-05, + "loss": 0.0413, + "num_input_tokens_seen": 3989880, + "step": 6540 + }, + { + "epoch": 1.805019305019305, + "grad_norm": 0.3727788031101227, + "learning_rate": 4.9020099448520194e-05, + "loss": 0.1569, + "num_input_tokens_seen": 3992984, + "step": 6545 + }, + { + "epoch": 1.8063982349696635, + "grad_norm": 0.3213331997394562, + "learning_rate": 4.901676065364205e-05, + "loss": 0.061, + "num_input_tokens_seen": 3995800, + "step": 6550 + }, + { + "epoch": 1.807777164920022, + "grad_norm": 6.646751403808594, + "learning_rate": 4.901341629442521e-05, + "loss": 0.0849, + "num_input_tokens_seen": 3998008, + "step": 6555 + }, + { + "epoch": 1.8091560948703806, + "grad_norm": 0.4729776084423065, + "learning_rate": 4.90100663716445e-05, + "loss": 0.0168, + "num_input_tokens_seen": 4000984, + "step": 6560 + }, + { + "epoch": 1.810535024820739, + "grad_norm": 1.9163256883621216, + "learning_rate": 4.900671088607608e-05, + "loss": 0.0267, + "num_input_tokens_seen": 4003288, + "step": 6565 + }, + { + "epoch": 1.8119139547710976, + "grad_norm": 0.2561767101287842, + "learning_rate": 4.9003349838497335e-05, + "loss": 0.0026, + "num_input_tokens_seen": 4006744, + "step": 6570 + }, + { + "epoch": 1.8132928847214562, + "grad_norm": 0.11881697177886963, + "learning_rate": 4.8999983229686996e-05, + "loss": 0.0029, + "num_input_tokens_seen": 4009272, + "step": 6575 + }, + { + "epoch": 1.8146718146718146, + "grad_norm": 0.3693978190422058, + "learning_rate": 4.899661106042505e-05, + "loss": 0.0915, + "num_input_tokens_seen": 4011640, + "step": 6580 + }, + { + "epoch": 1.8160507446221732, + "grad_norm": 0.6121989488601685, + "learning_rate": 4.899323333149277e-05, + "loss": 0.0061, + "num_input_tokens_seen": 4013848, + "step": 6585 + }, + { + "epoch": 1.8174296745725318, + "grad_norm": 0.01794230006635189, + "learning_rate": 4.898985004367274e-05, + "loss": 0.0986, + "num_input_tokens_seen": 4016280, + "step": 6590 + }, + { + "epoch": 1.8188086045228902, + "grad_norm": 7.288510799407959, + "learning_rate": 4.8986461197748816e-05, + "loss": 0.1376, + "num_input_tokens_seen": 4019000, + "step": 6595 + }, + { + "epoch": 1.8201875344732488, + "grad_norm": 0.07375355809926987, + "learning_rate": 4.8983066794506146e-05, + "loss": 0.0055, + "num_input_tokens_seen": 4022904, + "step": 6600 + }, + { + "epoch": 1.8215664644236074, + "grad_norm": 0.24418970942497253, + "learning_rate": 4.8979666834731164e-05, + "loss": 0.0033, + "num_input_tokens_seen": 4027096, + "step": 6605 + }, + { + "epoch": 1.8229453943739657, + "grad_norm": 16.08022689819336, + "learning_rate": 4.8976261319211584e-05, + "loss": 0.0268, + "num_input_tokens_seen": 4030168, + "step": 6610 + }, + { + "epoch": 1.8243243243243243, + "grad_norm": 0.0011849774746224284, + "learning_rate": 4.8972850248736426e-05, + "loss": 0.17, + "num_input_tokens_seen": 4032536, + "step": 6615 + }, + { + "epoch": 1.825703254274683, + "grad_norm": 0.11649534106254578, + "learning_rate": 4.896943362409598e-05, + "loss": 0.1171, + "num_input_tokens_seen": 4034904, + "step": 6620 + }, + { + "epoch": 1.8270821842250413, + "grad_norm": 0.2512246072292328, + "learning_rate": 4.8966011446081826e-05, + "loss": 0.0742, + "num_input_tokens_seen": 4038744, + "step": 6625 + }, + { + "epoch": 1.8284611141754, + "grad_norm": 0.09292587637901306, + "learning_rate": 4.896258371548685e-05, + "loss": 0.0363, + "num_input_tokens_seen": 4041528, + "step": 6630 + }, + { + "epoch": 1.8298400441257585, + "grad_norm": 6.665251731872559, + "learning_rate": 4.895915043310519e-05, + "loss": 0.267, + "num_input_tokens_seen": 4044248, + "step": 6635 + }, + { + "epoch": 1.831218974076117, + "grad_norm": 2.131420373916626, + "learning_rate": 4.895571159973229e-05, + "loss": 0.1599, + "num_input_tokens_seen": 4048056, + "step": 6640 + }, + { + "epoch": 1.8325979040264755, + "grad_norm": 1.2908679246902466, + "learning_rate": 4.8952267216164894e-05, + "loss": 0.0507, + "num_input_tokens_seen": 4051544, + "step": 6645 + }, + { + "epoch": 1.833976833976834, + "grad_norm": 0.24555858969688416, + "learning_rate": 4.8948817283200995e-05, + "loss": 0.0487, + "num_input_tokens_seen": 4054776, + "step": 6650 + }, + { + "epoch": 1.8353557639271925, + "grad_norm": 0.40823954343795776, + "learning_rate": 4.89453618016399e-05, + "loss": 0.0951, + "num_input_tokens_seen": 4057368, + "step": 6655 + }, + { + "epoch": 1.836734693877551, + "grad_norm": 0.3756934106349945, + "learning_rate": 4.8941900772282215e-05, + "loss": 0.0668, + "num_input_tokens_seen": 4061048, + "step": 6660 + }, + { + "epoch": 1.8381136238279097, + "grad_norm": 0.30525290966033936, + "learning_rate": 4.893843419592977e-05, + "loss": 0.1131, + "num_input_tokens_seen": 4064056, + "step": 6665 + }, + { + "epoch": 1.839492553778268, + "grad_norm": 0.038278236985206604, + "learning_rate": 4.8934962073385756e-05, + "loss": 0.0486, + "num_input_tokens_seen": 4067448, + "step": 6670 + }, + { + "epoch": 1.8408714837286266, + "grad_norm": 0.3672383725643158, + "learning_rate": 4.893148440545461e-05, + "loss": 0.1098, + "num_input_tokens_seen": 4070072, + "step": 6675 + }, + { + "epoch": 1.8422504136789852, + "grad_norm": 0.04530451446771622, + "learning_rate": 4.8928001192942036e-05, + "loss": 0.035, + "num_input_tokens_seen": 4072696, + "step": 6680 + }, + { + "epoch": 1.8436293436293436, + "grad_norm": 14.22693920135498, + "learning_rate": 4.892451243665507e-05, + "loss": 0.1519, + "num_input_tokens_seen": 4075192, + "step": 6685 + }, + { + "epoch": 1.845008273579702, + "grad_norm": 0.23294411599636078, + "learning_rate": 4.892101813740199e-05, + "loss": 0.1134, + "num_input_tokens_seen": 4078744, + "step": 6690 + }, + { + "epoch": 1.8463872035300608, + "grad_norm": 0.4690794348716736, + "learning_rate": 4.891751829599237e-05, + "loss": 0.003, + "num_input_tokens_seen": 4081720, + "step": 6695 + }, + { + "epoch": 1.8477661334804192, + "grad_norm": 0.012863804586231709, + "learning_rate": 4.8914012913237096e-05, + "loss": 0.091, + "num_input_tokens_seen": 4084376, + "step": 6700 + }, + { + "epoch": 1.8491450634307776, + "grad_norm": 0.06375875324010849, + "learning_rate": 4.89105019899483e-05, + "loss": 0.0728, + "num_input_tokens_seen": 4087416, + "step": 6705 + }, + { + "epoch": 1.8505239933811364, + "grad_norm": 0.11805064976215363, + "learning_rate": 4.890698552693941e-05, + "loss": 0.088, + "num_input_tokens_seen": 4090840, + "step": 6710 + }, + { + "epoch": 1.8519029233314948, + "grad_norm": 5.301572799682617, + "learning_rate": 4.890346352502514e-05, + "loss": 0.2667, + "num_input_tokens_seen": 4093080, + "step": 6715 + }, + { + "epoch": 1.8532818532818531, + "grad_norm": 9.934910774230957, + "learning_rate": 4.88999359850215e-05, + "loss": 0.1864, + "num_input_tokens_seen": 4095544, + "step": 6720 + }, + { + "epoch": 1.854660783232212, + "grad_norm": 4.889917373657227, + "learning_rate": 4.8896402907745755e-05, + "loss": 0.055, + "num_input_tokens_seen": 4098296, + "step": 6725 + }, + { + "epoch": 1.8560397131825703, + "grad_norm": 0.33044612407684326, + "learning_rate": 4.889286429401648e-05, + "loss": 0.0384, + "num_input_tokens_seen": 4101720, + "step": 6730 + }, + { + "epoch": 1.8574186431329287, + "grad_norm": 0.05672832205891609, + "learning_rate": 4.888932014465352e-05, + "loss": 0.126, + "num_input_tokens_seen": 4104888, + "step": 6735 + }, + { + "epoch": 1.8587975730832875, + "grad_norm": 8.271101951599121, + "learning_rate": 4.8885770460478e-05, + "loss": 0.073, + "num_input_tokens_seen": 4108120, + "step": 6740 + }, + { + "epoch": 1.860176503033646, + "grad_norm": 0.14715009927749634, + "learning_rate": 4.888221524231233e-05, + "loss": 0.0026, + "num_input_tokens_seen": 4112056, + "step": 6745 + }, + { + "epoch": 1.8615554329840043, + "grad_norm": 2.598428249359131, + "learning_rate": 4.887865449098019e-05, + "loss": 0.0744, + "num_input_tokens_seen": 4114616, + "step": 6750 + }, + { + "epoch": 1.862934362934363, + "grad_norm": 9.243728637695312, + "learning_rate": 4.887508820730659e-05, + "loss": 0.18, + "num_input_tokens_seen": 4118776, + "step": 6755 + }, + { + "epoch": 1.8643132928847215, + "grad_norm": 0.5516256093978882, + "learning_rate": 4.887151639211775e-05, + "loss": 0.0355, + "num_input_tokens_seen": 4121720, + "step": 6760 + }, + { + "epoch": 1.8656922228350798, + "grad_norm": 0.32133209705352783, + "learning_rate": 4.886793904624123e-05, + "loss": 0.0073, + "num_input_tokens_seen": 4125688, + "step": 6765 + }, + { + "epoch": 1.8670711527854384, + "grad_norm": 0.044812433421611786, + "learning_rate": 4.886435617050584e-05, + "loss": 0.0276, + "num_input_tokens_seen": 4129016, + "step": 6770 + }, + { + "epoch": 1.868450082735797, + "grad_norm": 0.01978136971592903, + "learning_rate": 4.8860767765741685e-05, + "loss": 0.0627, + "num_input_tokens_seen": 4132024, + "step": 6775 + }, + { + "epoch": 1.8698290126861554, + "grad_norm": 0.0526522621512413, + "learning_rate": 4.8857173832780144e-05, + "loss": 0.1945, + "num_input_tokens_seen": 4135544, + "step": 6780 + }, + { + "epoch": 1.871207942636514, + "grad_norm": 2.3099396228790283, + "learning_rate": 4.885357437245388e-05, + "loss": 0.0043, + "num_input_tokens_seen": 4138360, + "step": 6785 + }, + { + "epoch": 1.8725868725868726, + "grad_norm": 2.047297239303589, + "learning_rate": 4.884996938559685e-05, + "loss": 0.1509, + "num_input_tokens_seen": 4140728, + "step": 6790 + }, + { + "epoch": 1.873965802537231, + "grad_norm": 0.4932037591934204, + "learning_rate": 4.8846358873044264e-05, + "loss": 0.0546, + "num_input_tokens_seen": 4143320, + "step": 6795 + }, + { + "epoch": 1.8753447324875896, + "grad_norm": 0.06317110359668732, + "learning_rate": 4.8842742835632616e-05, + "loss": 0.0092, + "num_input_tokens_seen": 4145816, + "step": 6800 + }, + { + "epoch": 1.8767236624379482, + "grad_norm": 6.873781204223633, + "learning_rate": 4.883912127419971e-05, + "loss": 0.1778, + "num_input_tokens_seen": 4149400, + "step": 6805 + }, + { + "epoch": 1.8781025923883066, + "grad_norm": 0.07402043789625168, + "learning_rate": 4.88354941895846e-05, + "loss": 0.0867, + "num_input_tokens_seen": 4152440, + "step": 6810 + }, + { + "epoch": 1.8794815223386652, + "grad_norm": 0.2978287637233734, + "learning_rate": 4.883186158262764e-05, + "loss": 0.1564, + "num_input_tokens_seen": 4154744, + "step": 6815 + }, + { + "epoch": 1.8808604522890238, + "grad_norm": 1.7482846975326538, + "learning_rate": 4.882822345417043e-05, + "loss": 0.1101, + "num_input_tokens_seen": 4158264, + "step": 6820 + }, + { + "epoch": 1.8822393822393821, + "grad_norm": 0.041420482099056244, + "learning_rate": 4.8824579805055894e-05, + "loss": 0.013, + "num_input_tokens_seen": 4160632, + "step": 6825 + }, + { + "epoch": 1.8836183121897407, + "grad_norm": 0.24059289693832397, + "learning_rate": 4.8820930636128195e-05, + "loss": 0.0297, + "num_input_tokens_seen": 4164024, + "step": 6830 + }, + { + "epoch": 1.8849972421400993, + "grad_norm": 15.451062202453613, + "learning_rate": 4.881727594823281e-05, + "loss": 0.0836, + "num_input_tokens_seen": 4169208, + "step": 6835 + }, + { + "epoch": 1.8863761720904577, + "grad_norm": 1.0170732736587524, + "learning_rate": 4.881361574221648e-05, + "loss": 0.0887, + "num_input_tokens_seen": 4171960, + "step": 6840 + }, + { + "epoch": 1.8877551020408163, + "grad_norm": 0.16603508591651917, + "learning_rate": 4.88099500189272e-05, + "loss": 0.0368, + "num_input_tokens_seen": 4174392, + "step": 6845 + }, + { + "epoch": 1.889134031991175, + "grad_norm": 0.009716344065964222, + "learning_rate": 4.880627877921428e-05, + "loss": 0.0675, + "num_input_tokens_seen": 4177432, + "step": 6850 + }, + { + "epoch": 1.8905129619415333, + "grad_norm": 0.04875760152935982, + "learning_rate": 4.88026020239283e-05, + "loss": 0.0156, + "num_input_tokens_seen": 4179864, + "step": 6855 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.21628601849079132, + "learning_rate": 4.87989197539211e-05, + "loss": 0.0493, + "num_input_tokens_seen": 4182392, + "step": 6860 + }, + { + "epoch": 1.8932708218422505, + "grad_norm": 3.1588988304138184, + "learning_rate": 4.879523197004581e-05, + "loss": 0.06, + "num_input_tokens_seen": 4185048, + "step": 6865 + }, + { + "epoch": 1.8946497517926089, + "grad_norm": 7.370657444000244, + "learning_rate": 4.879153867315684e-05, + "loss": 0.1219, + "num_input_tokens_seen": 4189912, + "step": 6870 + }, + { + "epoch": 1.8960286817429675, + "grad_norm": 0.04159415140748024, + "learning_rate": 4.878783986410988e-05, + "loss": 0.1734, + "num_input_tokens_seen": 4194264, + "step": 6875 + }, + { + "epoch": 1.897407611693326, + "grad_norm": 4.773725986480713, + "learning_rate": 4.8784135543761876e-05, + "loss": 0.0612, + "num_input_tokens_seen": 4196984, + "step": 6880 + }, + { + "epoch": 1.8987865416436844, + "grad_norm": 0.04335630312561989, + "learning_rate": 4.878042571297108e-05, + "loss": 0.0691, + "num_input_tokens_seen": 4201176, + "step": 6885 + }, + { + "epoch": 1.900165471594043, + "grad_norm": 0.012540698051452637, + "learning_rate": 4.877671037259699e-05, + "loss": 0.0459, + "num_input_tokens_seen": 4203160, + "step": 6890 + }, + { + "epoch": 1.9015444015444016, + "grad_norm": 3.541722059249878, + "learning_rate": 4.877298952350042e-05, + "loss": 0.2267, + "num_input_tokens_seen": 4205976, + "step": 6895 + }, + { + "epoch": 1.90292333149476, + "grad_norm": 1.0719889402389526, + "learning_rate": 4.8769263166543414e-05, + "loss": 0.1109, + "num_input_tokens_seen": 4209240, + "step": 6900 + }, + { + "epoch": 1.9043022614451186, + "grad_norm": 0.9471418261528015, + "learning_rate": 4.876553130258934e-05, + "loss": 0.0521, + "num_input_tokens_seen": 4212248, + "step": 6905 + }, + { + "epoch": 1.9056811913954772, + "grad_norm": 0.15593107044696808, + "learning_rate": 4.876179393250279e-05, + "loss": 0.1225, + "num_input_tokens_seen": 4215064, + "step": 6910 + }, + { + "epoch": 1.9070601213458356, + "grad_norm": 3.586320161819458, + "learning_rate": 4.875805105714968e-05, + "loss": 0.1367, + "num_input_tokens_seen": 4217848, + "step": 6915 + }, + { + "epoch": 1.9084390512961942, + "grad_norm": 4.476266860961914, + "learning_rate": 4.8754302677397165e-05, + "loss": 0.1141, + "num_input_tokens_seen": 4220536, + "step": 6920 + }, + { + "epoch": 1.9098179812465528, + "grad_norm": 0.05684522166848183, + "learning_rate": 4.875054879411371e-05, + "loss": 0.0498, + "num_input_tokens_seen": 4222648, + "step": 6925 + }, + { + "epoch": 1.9111969111969112, + "grad_norm": 4.220574378967285, + "learning_rate": 4.8746789408169025e-05, + "loss": 0.09, + "num_input_tokens_seen": 4225816, + "step": 6930 + }, + { + "epoch": 1.9125758411472698, + "grad_norm": 2.015505313873291, + "learning_rate": 4.87430245204341e-05, + "loss": 0.089, + "num_input_tokens_seen": 4228472, + "step": 6935 + }, + { + "epoch": 1.9139547710976283, + "grad_norm": 0.5053879618644714, + "learning_rate": 4.8739254131781207e-05, + "loss": 0.0492, + "num_input_tokens_seen": 4231928, + "step": 6940 + }, + { + "epoch": 1.9153337010479867, + "grad_norm": 3.7796754837036133, + "learning_rate": 4.87354782430839e-05, + "loss": 0.1591, + "num_input_tokens_seen": 4235832, + "step": 6945 + }, + { + "epoch": 1.9167126309983453, + "grad_norm": 0.09773415327072144, + "learning_rate": 4.873169685521699e-05, + "loss": 0.0167, + "num_input_tokens_seen": 4239320, + "step": 6950 + }, + { + "epoch": 1.918091560948704, + "grad_norm": 0.008422628976404667, + "learning_rate": 4.872790996905658e-05, + "loss": 0.1206, + "num_input_tokens_seen": 4242360, + "step": 6955 + }, + { + "epoch": 1.9194704908990623, + "grad_norm": 0.1215936616063118, + "learning_rate": 4.8724117585480025e-05, + "loss": 0.0116, + "num_input_tokens_seen": 4244856, + "step": 6960 + }, + { + "epoch": 1.920849420849421, + "grad_norm": 2.219386339187622, + "learning_rate": 4.8720319705365976e-05, + "loss": 0.1719, + "num_input_tokens_seen": 4247544, + "step": 6965 + }, + { + "epoch": 1.9222283507997795, + "grad_norm": 1.238510251045227, + "learning_rate": 4.871651632959434e-05, + "loss": 0.0726, + "num_input_tokens_seen": 4250808, + "step": 6970 + }, + { + "epoch": 1.9236072807501379, + "grad_norm": 0.22379335761070251, + "learning_rate": 4.8712707459046305e-05, + "loss": 0.0536, + "num_input_tokens_seen": 4253240, + "step": 6975 + }, + { + "epoch": 1.9249862107004965, + "grad_norm": 0.5763400793075562, + "learning_rate": 4.8708893094604335e-05, + "loss": 0.0282, + "num_input_tokens_seen": 4257016, + "step": 6980 + }, + { + "epoch": 1.926365140650855, + "grad_norm": 7.427257061004639, + "learning_rate": 4.8705073237152164e-05, + "loss": 0.0867, + "num_input_tokens_seen": 4260312, + "step": 6985 + }, + { + "epoch": 1.9277440706012134, + "grad_norm": 10.91754150390625, + "learning_rate": 4.87012478875748e-05, + "loss": 0.0816, + "num_input_tokens_seen": 4263992, + "step": 6990 + }, + { + "epoch": 1.9291230005515718, + "grad_norm": 0.05746429041028023, + "learning_rate": 4.869741704675851e-05, + "loss": 0.0022, + "num_input_tokens_seen": 4267288, + "step": 6995 + }, + { + "epoch": 1.9305019305019306, + "grad_norm": 14.335406303405762, + "learning_rate": 4.869358071559086e-05, + "loss": 0.09, + "num_input_tokens_seen": 4269816, + "step": 7000 + }, + { + "epoch": 1.931880860452289, + "grad_norm": 0.0023280964232981205, + "learning_rate": 4.868973889496066e-05, + "loss": 0.1443, + "num_input_tokens_seen": 4272280, + "step": 7005 + }, + { + "epoch": 1.9332597904026474, + "grad_norm": 0.37546753883361816, + "learning_rate": 4.8685891585758014e-05, + "loss": 0.1274, + "num_input_tokens_seen": 4274904, + "step": 7010 + }, + { + "epoch": 1.9346387203530062, + "grad_norm": 0.0280184056609869, + "learning_rate": 4.8682038788874286e-05, + "loss": 0.1018, + "num_input_tokens_seen": 4277912, + "step": 7015 + }, + { + "epoch": 1.9360176503033646, + "grad_norm": 0.17050734162330627, + "learning_rate": 4.867818050520211e-05, + "loss": 0.0414, + "num_input_tokens_seen": 4280536, + "step": 7020 + }, + { + "epoch": 1.937396580253723, + "grad_norm": 0.28095805644989014, + "learning_rate": 4.8674316735635395e-05, + "loss": 0.1339, + "num_input_tokens_seen": 4283480, + "step": 7025 + }, + { + "epoch": 1.9387755102040818, + "grad_norm": 0.024371549487113953, + "learning_rate": 4.867044748106932e-05, + "loss": 0.0037, + "num_input_tokens_seen": 4286744, + "step": 7030 + }, + { + "epoch": 1.9401544401544402, + "grad_norm": 0.29590219259262085, + "learning_rate": 4.866657274240035e-05, + "loss": 0.1058, + "num_input_tokens_seen": 4289528, + "step": 7035 + }, + { + "epoch": 1.9415333701047985, + "grad_norm": 4.197798728942871, + "learning_rate": 4.86626925205262e-05, + "loss": 0.1648, + "num_input_tokens_seen": 4292440, + "step": 7040 + }, + { + "epoch": 1.9429123000551574, + "grad_norm": 0.015577191486954689, + "learning_rate": 4.865880681634585e-05, + "loss": 0.0023, + "num_input_tokens_seen": 4294808, + "step": 7045 + }, + { + "epoch": 1.9442912300055157, + "grad_norm": 0.11780861020088196, + "learning_rate": 4.865491563075956e-05, + "loss": 0.096, + "num_input_tokens_seen": 4297112, + "step": 7050 + }, + { + "epoch": 1.9456701599558741, + "grad_norm": 0.21737545728683472, + "learning_rate": 4.8651018964668884e-05, + "loss": 0.0724, + "num_input_tokens_seen": 4299608, + "step": 7055 + }, + { + "epoch": 1.9470490899062327, + "grad_norm": 0.05469327047467232, + "learning_rate": 4.86471168189766e-05, + "loss": 0.0184, + "num_input_tokens_seen": 4301976, + "step": 7060 + }, + { + "epoch": 1.9484280198565913, + "grad_norm": 0.21553117036819458, + "learning_rate": 4.86432091945868e-05, + "loss": 0.0817, + "num_input_tokens_seen": 4304792, + "step": 7065 + }, + { + "epoch": 1.9498069498069497, + "grad_norm": 0.20498141646385193, + "learning_rate": 4.8639296092404806e-05, + "loss": 0.037, + "num_input_tokens_seen": 4307576, + "step": 7070 + }, + { + "epoch": 1.9511858797573083, + "grad_norm": 0.15360797941684723, + "learning_rate": 4.8635377513337246e-05, + "loss": 0.0866, + "num_input_tokens_seen": 4311512, + "step": 7075 + }, + { + "epoch": 1.9525648097076669, + "grad_norm": 0.16320839524269104, + "learning_rate": 4.8631453458291975e-05, + "loss": 0.0829, + "num_input_tokens_seen": 4314264, + "step": 7080 + }, + { + "epoch": 1.9539437396580253, + "grad_norm": 4.536736488342285, + "learning_rate": 4.862752392817816e-05, + "loss": 0.0969, + "num_input_tokens_seen": 4317144, + "step": 7085 + }, + { + "epoch": 1.9553226696083839, + "grad_norm": 0.09786000102758408, + "learning_rate": 4.862358892390621e-05, + "loss": 0.1101, + "num_input_tokens_seen": 4319192, + "step": 7090 + }, + { + "epoch": 1.9567015995587425, + "grad_norm": 0.18809539079666138, + "learning_rate": 4.861964844638781e-05, + "loss": 0.0089, + "num_input_tokens_seen": 4321528, + "step": 7095 + }, + { + "epoch": 1.9580805295091008, + "grad_norm": 13.995170593261719, + "learning_rate": 4.8615702496535906e-05, + "loss": 0.0583, + "num_input_tokens_seen": 4324376, + "step": 7100 + }, + { + "epoch": 1.9594594594594594, + "grad_norm": 0.021283334121108055, + "learning_rate": 4.861175107526473e-05, + "loss": 0.0376, + "num_input_tokens_seen": 4327288, + "step": 7105 + }, + { + "epoch": 1.960838389409818, + "grad_norm": 7.318396091461182, + "learning_rate": 4.860779418348976e-05, + "loss": 0.1157, + "num_input_tokens_seen": 4329816, + "step": 7110 + }, + { + "epoch": 1.9622173193601764, + "grad_norm": 30.49907875061035, + "learning_rate": 4.8603831822127755e-05, + "loss": 0.0752, + "num_input_tokens_seen": 4332440, + "step": 7115 + }, + { + "epoch": 1.963596249310535, + "grad_norm": 3.874824285507202, + "learning_rate": 4.859986399209674e-05, + "loss": 0.2467, + "num_input_tokens_seen": 4335128, + "step": 7120 + }, + { + "epoch": 1.9649751792608936, + "grad_norm": 17.346935272216797, + "learning_rate": 4.8595890694315996e-05, + "loss": 0.035, + "num_input_tokens_seen": 4338520, + "step": 7125 + }, + { + "epoch": 1.966354109211252, + "grad_norm": 0.12810257077217102, + "learning_rate": 4.859191192970608e-05, + "loss": 0.0868, + "num_input_tokens_seen": 4340952, + "step": 7130 + }, + { + "epoch": 1.9677330391616106, + "grad_norm": 0.17022410035133362, + "learning_rate": 4.858792769918883e-05, + "loss": 0.1362, + "num_input_tokens_seen": 4345752, + "step": 7135 + }, + { + "epoch": 1.9691119691119692, + "grad_norm": 0.10696089267730713, + "learning_rate": 4.8583938003687315e-05, + "loss": 0.0326, + "num_input_tokens_seen": 4348184, + "step": 7140 + }, + { + "epoch": 1.9704908990623275, + "grad_norm": 0.2239421010017395, + "learning_rate": 4.857994284412589e-05, + "loss": 0.0455, + "num_input_tokens_seen": 4351864, + "step": 7145 + }, + { + "epoch": 1.9718698290126861, + "grad_norm": 1.0026073455810547, + "learning_rate": 4.85759422214302e-05, + "loss": 0.0321, + "num_input_tokens_seen": 4354104, + "step": 7150 + }, + { + "epoch": 1.9732487589630447, + "grad_norm": 8.085583686828613, + "learning_rate": 4.857193613652711e-05, + "loss": 0.143, + "num_input_tokens_seen": 4356824, + "step": 7155 + }, + { + "epoch": 1.9746276889134031, + "grad_norm": 0.55838543176651, + "learning_rate": 4.856792459034477e-05, + "loss": 0.004, + "num_input_tokens_seen": 4359352, + "step": 7160 + }, + { + "epoch": 1.9760066188637617, + "grad_norm": 0.023091671988368034, + "learning_rate": 4.856390758381262e-05, + "loss": 0.1213, + "num_input_tokens_seen": 4362488, + "step": 7165 + }, + { + "epoch": 1.9773855488141203, + "grad_norm": 0.034737713634967804, + "learning_rate": 4.855988511786132e-05, + "loss": 0.1578, + "num_input_tokens_seen": 4367256, + "step": 7170 + }, + { + "epoch": 1.9787644787644787, + "grad_norm": 0.00980540830641985, + "learning_rate": 4.855585719342283e-05, + "loss": 0.0803, + "num_input_tokens_seen": 4370392, + "step": 7175 + }, + { + "epoch": 1.9801434087148373, + "grad_norm": 8.224844932556152, + "learning_rate": 4.8551823811430365e-05, + "loss": 0.0996, + "num_input_tokens_seen": 4373272, + "step": 7180 + }, + { + "epoch": 1.981522338665196, + "grad_norm": 0.024589868262410164, + "learning_rate": 4.854778497281839e-05, + "loss": 0.087, + "num_input_tokens_seen": 4377592, + "step": 7185 + }, + { + "epoch": 1.9829012686155543, + "grad_norm": 0.07042387127876282, + "learning_rate": 4.854374067852265e-05, + "loss": 0.007, + "num_input_tokens_seen": 4380536, + "step": 7190 + }, + { + "epoch": 1.9842801985659129, + "grad_norm": 2.393019199371338, + "learning_rate": 4.853969092948015e-05, + "loss": 0.116, + "num_input_tokens_seen": 4383736, + "step": 7195 + }, + { + "epoch": 1.9856591285162715, + "grad_norm": 2.7065720558166504, + "learning_rate": 4.8535635726629164e-05, + "loss": 0.081, + "num_input_tokens_seen": 4386776, + "step": 7200 + }, + { + "epoch": 1.9870380584666298, + "grad_norm": 0.08100377023220062, + "learning_rate": 4.853157507090922e-05, + "loss": 0.0034, + "num_input_tokens_seen": 4390616, + "step": 7205 + }, + { + "epoch": 1.9884169884169884, + "grad_norm": 0.06300540268421173, + "learning_rate": 4.8527508963261115e-05, + "loss": 0.0382, + "num_input_tokens_seen": 4394968, + "step": 7210 + }, + { + "epoch": 1.989795918367347, + "grad_norm": 0.20298729836940765, + "learning_rate": 4.8523437404626905e-05, + "loss": 0.1858, + "num_input_tokens_seen": 4397144, + "step": 7215 + }, + { + "epoch": 1.9911748483177054, + "grad_norm": 0.06540250778198242, + "learning_rate": 4.851936039594991e-05, + "loss": 0.1229, + "num_input_tokens_seen": 4399960, + "step": 7220 + }, + { + "epoch": 1.992553778268064, + "grad_norm": 0.12637247145175934, + "learning_rate": 4.8515277938174726e-05, + "loss": 0.0138, + "num_input_tokens_seen": 4402264, + "step": 7225 + }, + { + "epoch": 1.9939327082184226, + "grad_norm": 0.12967635691165924, + "learning_rate": 4.851119003224719e-05, + "loss": 0.0094, + "num_input_tokens_seen": 4405048, + "step": 7230 + }, + { + "epoch": 1.995311638168781, + "grad_norm": 1.7765380144119263, + "learning_rate": 4.85070966791144e-05, + "loss": 0.0461, + "num_input_tokens_seen": 4407960, + "step": 7235 + }, + { + "epoch": 1.9966905681191396, + "grad_norm": 0.011381231248378754, + "learning_rate": 4.850299787972476e-05, + "loss": 0.0106, + "num_input_tokens_seen": 4411032, + "step": 7240 + }, + { + "epoch": 1.9980694980694982, + "grad_norm": 8.633504867553711, + "learning_rate": 4.8498893635027877e-05, + "loss": 0.0909, + "num_input_tokens_seen": 4414424, + "step": 7245 + }, + { + "epoch": 1.9994484280198566, + "grad_norm": 14.863288879394531, + "learning_rate": 4.849478394597465e-05, + "loss": 0.1459, + "num_input_tokens_seen": 4418424, + "step": 7250 + }, + { + "epoch": 2.0, + "eval_loss": 0.10282225161790848, + "eval_runtime": 28.4856, + "eval_samples_per_second": 56.59, + "eval_steps_per_second": 14.147, + "num_input_tokens_seen": 4419000, + "step": 7252 + }, + { + "epoch": 2.000827357970215, + "grad_norm": 4.177727699279785, + "learning_rate": 4.8490668813517235e-05, + "loss": 0.125, + "num_input_tokens_seen": 4420568, + "step": 7255 + }, + { + "epoch": 2.0022062879205738, + "grad_norm": 0.7144536972045898, + "learning_rate": 4.8486548238609055e-05, + "loss": 0.0082, + "num_input_tokens_seen": 4422968, + "step": 7260 + }, + { + "epoch": 2.003585217870932, + "grad_norm": 0.03690946474671364, + "learning_rate": 4.8482422222204784e-05, + "loss": 0.0076, + "num_input_tokens_seen": 4425976, + "step": 7265 + }, + { + "epoch": 2.0049641478212905, + "grad_norm": 0.030650019645690918, + "learning_rate": 4.8478290765260345e-05, + "loss": 0.0031, + "num_input_tokens_seen": 4429304, + "step": 7270 + }, + { + "epoch": 2.0063430777716493, + "grad_norm": 0.08615747839212418, + "learning_rate": 4.847415386873297e-05, + "loss": 0.0835, + "num_input_tokens_seen": 4432216, + "step": 7275 + }, + { + "epoch": 2.0077220077220077, + "grad_norm": 0.25087764859199524, + "learning_rate": 4.84700115335811e-05, + "loss": 0.0034, + "num_input_tokens_seen": 4435864, + "step": 7280 + }, + { + "epoch": 2.009100937672366, + "grad_norm": 0.1648593693971634, + "learning_rate": 4.846586376076445e-05, + "loss": 0.0068, + "num_input_tokens_seen": 4438584, + "step": 7285 + }, + { + "epoch": 2.010479867622725, + "grad_norm": 0.03282011300325394, + "learning_rate": 4.846171055124401e-05, + "loss": 0.0029, + "num_input_tokens_seen": 4441656, + "step": 7290 + }, + { + "epoch": 2.0118587975730833, + "grad_norm": 0.004184122197329998, + "learning_rate": 4.845755190598201e-05, + "loss": 0.0022, + "num_input_tokens_seen": 4443736, + "step": 7295 + }, + { + "epoch": 2.0132377275234417, + "grad_norm": 0.006326358765363693, + "learning_rate": 4.8453387825941966e-05, + "loss": 0.0596, + "num_input_tokens_seen": 4446840, + "step": 7300 + }, + { + "epoch": 2.0146166574738005, + "grad_norm": 0.06023376062512398, + "learning_rate": 4.8449218312088604e-05, + "loss": 0.0004, + "num_input_tokens_seen": 4449368, + "step": 7305 + }, + { + "epoch": 2.015995587424159, + "grad_norm": 19.14274024963379, + "learning_rate": 4.8445043365387976e-05, + "loss": 0.0159, + "num_input_tokens_seen": 4452152, + "step": 7310 + }, + { + "epoch": 2.0173745173745172, + "grad_norm": 0.0001908824051497504, + "learning_rate": 4.844086298680733e-05, + "loss": 0.0002, + "num_input_tokens_seen": 4454808, + "step": 7315 + }, + { + "epoch": 2.018753447324876, + "grad_norm": 0.008413111791014671, + "learning_rate": 4.8436677177315214e-05, + "loss": 0.0002, + "num_input_tokens_seen": 4457208, + "step": 7320 + }, + { + "epoch": 2.0201323772752344, + "grad_norm": 0.0036981531884521246, + "learning_rate": 4.843248593788141e-05, + "loss": 0.0923, + "num_input_tokens_seen": 4459544, + "step": 7325 + }, + { + "epoch": 2.021511307225593, + "grad_norm": 0.16952703893184662, + "learning_rate": 4.842828926947698e-05, + "loss": 0.0016, + "num_input_tokens_seen": 4462360, + "step": 7330 + }, + { + "epoch": 2.0228902371759516, + "grad_norm": 0.04010908678174019, + "learning_rate": 4.842408717307422e-05, + "loss": 0.0008, + "num_input_tokens_seen": 4465880, + "step": 7335 + }, + { + "epoch": 2.02426916712631, + "grad_norm": 0.01480373740196228, + "learning_rate": 4.841987964964671e-05, + "loss": 0.0005, + "num_input_tokens_seen": 4469912, + "step": 7340 + }, + { + "epoch": 2.0256480970766684, + "grad_norm": 0.01822209730744362, + "learning_rate": 4.841566670016926e-05, + "loss": 0.0003, + "num_input_tokens_seen": 4473496, + "step": 7345 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 0.010105228051543236, + "learning_rate": 4.841144832561795e-05, + "loss": 0.0008, + "num_input_tokens_seen": 4478328, + "step": 7350 + }, + { + "epoch": 2.0284059569773856, + "grad_norm": 0.01810387894511223, + "learning_rate": 4.840722452697012e-05, + "loss": 0.0008, + "num_input_tokens_seen": 4481688, + "step": 7355 + }, + { + "epoch": 2.029784886927744, + "grad_norm": 0.045274827629327774, + "learning_rate": 4.840299530520437e-05, + "loss": 0.0182, + "num_input_tokens_seen": 4484440, + "step": 7360 + }, + { + "epoch": 2.0311638168781028, + "grad_norm": 0.00020439637592062354, + "learning_rate": 4.839876066130055e-05, + "loss": 0.0002, + "num_input_tokens_seen": 4488152, + "step": 7365 + }, + { + "epoch": 2.032542746828461, + "grad_norm": 0.007673789747059345, + "learning_rate": 4.8394520596239746e-05, + "loss": 0.0001, + "num_input_tokens_seen": 4491928, + "step": 7370 + }, + { + "epoch": 2.0339216767788195, + "grad_norm": 0.06790433079004288, + "learning_rate": 4.839027511100435e-05, + "loss": 0.0165, + "num_input_tokens_seen": 4494712, + "step": 7375 + }, + { + "epoch": 2.0353006067291783, + "grad_norm": 0.38852518796920776, + "learning_rate": 4.838602420657795e-05, + "loss": 0.001, + "num_input_tokens_seen": 4496984, + "step": 7380 + }, + { + "epoch": 2.0366795366795367, + "grad_norm": 7.489502429962158, + "learning_rate": 4.838176788394544e-05, + "loss": 0.0028, + "num_input_tokens_seen": 4500152, + "step": 7385 + }, + { + "epoch": 2.038058466629895, + "grad_norm": 0.000516770058311522, + "learning_rate": 4.837750614409294e-05, + "loss": 0.0026, + "num_input_tokens_seen": 4502424, + "step": 7390 + }, + { + "epoch": 2.039437396580254, + "grad_norm": 0.008240120485424995, + "learning_rate": 4.837323898800784e-05, + "loss": 0.0475, + "num_input_tokens_seen": 4505688, + "step": 7395 + }, + { + "epoch": 2.0408163265306123, + "grad_norm": 0.0002057852689176798, + "learning_rate": 4.836896641667878e-05, + "loss": 0.0001, + "num_input_tokens_seen": 4508184, + "step": 7400 + }, + { + "epoch": 2.0421952564809707, + "grad_norm": 0.03158336132764816, + "learning_rate": 4.8364688431095645e-05, + "loss": 0.094, + "num_input_tokens_seen": 4511096, + "step": 7405 + }, + { + "epoch": 2.0435741864313295, + "grad_norm": 11.441633224487305, + "learning_rate": 4.836040503224959e-05, + "loss": 0.0858, + "num_input_tokens_seen": 4513944, + "step": 7410 + }, + { + "epoch": 2.044953116381688, + "grad_norm": 0.008484289981424809, + "learning_rate": 4.835611622113301e-05, + "loss": 0.0013, + "num_input_tokens_seen": 4516536, + "step": 7415 + }, + { + "epoch": 2.0463320463320462, + "grad_norm": 0.0008954320219345391, + "learning_rate": 4.835182199873957e-05, + "loss": 0.0, + "num_input_tokens_seen": 4519032, + "step": 7420 + }, + { + "epoch": 2.047710976282405, + "grad_norm": 0.00040032676770351827, + "learning_rate": 4.834752236606417e-05, + "loss": 0.0285, + "num_input_tokens_seen": 4521432, + "step": 7425 + }, + { + "epoch": 2.0490899062327634, + "grad_norm": 0.0009505719062872231, + "learning_rate": 4.8343217324102976e-05, + "loss": 0.0, + "num_input_tokens_seen": 4524952, + "step": 7430 + }, + { + "epoch": 2.050468836183122, + "grad_norm": 0.0011093436041846871, + "learning_rate": 4.833890687385341e-05, + "loss": 0.0454, + "num_input_tokens_seen": 4528824, + "step": 7435 + }, + { + "epoch": 2.0518477661334806, + "grad_norm": 0.23833052814006805, + "learning_rate": 4.833459101631414e-05, + "loss": 0.0659, + "num_input_tokens_seen": 4531576, + "step": 7440 + }, + { + "epoch": 2.053226696083839, + "grad_norm": 0.02122810110449791, + "learning_rate": 4.833026975248508e-05, + "loss": 0.0725, + "num_input_tokens_seen": 4534264, + "step": 7445 + }, + { + "epoch": 2.0546056260341974, + "grad_norm": 3.80948513338808e-05, + "learning_rate": 4.832594308336741e-05, + "loss": 0.0, + "num_input_tokens_seen": 4537656, + "step": 7450 + }, + { + "epoch": 2.0559845559845558, + "grad_norm": 0.9870039820671082, + "learning_rate": 4.8321611009963566e-05, + "loss": 0.0017, + "num_input_tokens_seen": 4541208, + "step": 7455 + }, + { + "epoch": 2.0573634859349146, + "grad_norm": 0.0029811765998601913, + "learning_rate": 4.831727353327721e-05, + "loss": 0.0268, + "num_input_tokens_seen": 4544664, + "step": 7460 + }, + { + "epoch": 2.058742415885273, + "grad_norm": 0.00012829044135287404, + "learning_rate": 4.831293065431329e-05, + "loss": 0.0002, + "num_input_tokens_seen": 4550104, + "step": 7465 + }, + { + "epoch": 2.0601213458356313, + "grad_norm": 0.05362200364470482, + "learning_rate": 4.8308582374077984e-05, + "loss": 0.0007, + "num_input_tokens_seen": 4553432, + "step": 7470 + }, + { + "epoch": 2.06150027578599, + "grad_norm": 0.001176859950646758, + "learning_rate": 4.830422869357871e-05, + "loss": 0.0001, + "num_input_tokens_seen": 4556664, + "step": 7475 + }, + { + "epoch": 2.0628792057363485, + "grad_norm": 3.842118167085573e-05, + "learning_rate": 4.829986961382418e-05, + "loss": 0.0323, + "num_input_tokens_seen": 4561528, + "step": 7480 + }, + { + "epoch": 2.064258135686707, + "grad_norm": 0.021882131695747375, + "learning_rate": 4.829550513582431e-05, + "loss": 0.1502, + "num_input_tokens_seen": 4563960, + "step": 7485 + }, + { + "epoch": 2.0656370656370657, + "grad_norm": 0.00020051473984494805, + "learning_rate": 4.8291135260590294e-05, + "loss": 0.0, + "num_input_tokens_seen": 4566424, + "step": 7490 + }, + { + "epoch": 2.067015995587424, + "grad_norm": 0.0031675477512180805, + "learning_rate": 4.828675998913457e-05, + "loss": 0.0301, + "num_input_tokens_seen": 4569432, + "step": 7495 + }, + { + "epoch": 2.0683949255377825, + "grad_norm": 0.0002908394963014871, + "learning_rate": 4.828237932247083e-05, + "loss": 0.0001, + "num_input_tokens_seen": 4572376, + "step": 7500 + }, + { + "epoch": 2.0697738554881413, + "grad_norm": 0.0019071489805355668, + "learning_rate": 4.8277993261613995e-05, + "loss": 0.0009, + "num_input_tokens_seen": 4576632, + "step": 7505 + }, + { + "epoch": 2.0711527854384997, + "grad_norm": 0.00010411086987005547, + "learning_rate": 4.827360180758027e-05, + "loss": 0.0017, + "num_input_tokens_seen": 4579064, + "step": 7510 + }, + { + "epoch": 2.072531715388858, + "grad_norm": 0.00043569414992816746, + "learning_rate": 4.8269204961387084e-05, + "loss": 0.0001, + "num_input_tokens_seen": 4582392, + "step": 7515 + }, + { + "epoch": 2.073910645339217, + "grad_norm": 0.08264581859111786, + "learning_rate": 4.826480272405312e-05, + "loss": 0.0005, + "num_input_tokens_seen": 4586072, + "step": 7520 + }, + { + "epoch": 2.0752895752895753, + "grad_norm": 0.014562688767910004, + "learning_rate": 4.826039509659832e-05, + "loss": 0.0005, + "num_input_tokens_seen": 4589784, + "step": 7525 + }, + { + "epoch": 2.0766685052399336, + "grad_norm": 0.0049593946896493435, + "learning_rate": 4.825598208004387e-05, + "loss": 0.0235, + "num_input_tokens_seen": 4592376, + "step": 7530 + }, + { + "epoch": 2.0780474351902924, + "grad_norm": 0.0029818019829690456, + "learning_rate": 4.825156367541219e-05, + "loss": 0.063, + "num_input_tokens_seen": 4595256, + "step": 7535 + }, + { + "epoch": 2.079426365140651, + "grad_norm": 0.015358858741819859, + "learning_rate": 4.824713988372698e-05, + "loss": 0.0256, + "num_input_tokens_seen": 4597624, + "step": 7540 + }, + { + "epoch": 2.080805295091009, + "grad_norm": 0.06850120425224304, + "learning_rate": 4.824271070601315e-05, + "loss": 0.0965, + "num_input_tokens_seen": 4600824, + "step": 7545 + }, + { + "epoch": 2.082184225041368, + "grad_norm": 0.021166455000638962, + "learning_rate": 4.823827614329689e-05, + "loss": 0.0004, + "num_input_tokens_seen": 4604248, + "step": 7550 + }, + { + "epoch": 2.0835631549917264, + "grad_norm": 0.013280094601213932, + "learning_rate": 4.823383619660562e-05, + "loss": 0.0006, + "num_input_tokens_seen": 4606648, + "step": 7555 + }, + { + "epoch": 2.0849420849420848, + "grad_norm": 13.84605884552002, + "learning_rate": 4.8229390866968005e-05, + "loss": 0.0094, + "num_input_tokens_seen": 4609720, + "step": 7560 + }, + { + "epoch": 2.0863210148924436, + "grad_norm": 0.020453182980418205, + "learning_rate": 4.822494015541398e-05, + "loss": 0.0261, + "num_input_tokens_seen": 4612024, + "step": 7565 + }, + { + "epoch": 2.087699944842802, + "grad_norm": 0.007630655542016029, + "learning_rate": 4.822048406297469e-05, + "loss": 0.0943, + "num_input_tokens_seen": 4614968, + "step": 7570 + }, + { + "epoch": 2.0890788747931603, + "grad_norm": 0.15934963524341583, + "learning_rate": 4.821602259068257e-05, + "loss": 0.0005, + "num_input_tokens_seen": 4617752, + "step": 7575 + }, + { + "epoch": 2.090457804743519, + "grad_norm": 0.06984975188970566, + "learning_rate": 4.8211555739571254e-05, + "loss": 0.0008, + "num_input_tokens_seen": 4620152, + "step": 7580 + }, + { + "epoch": 2.0918367346938775, + "grad_norm": 0.0007115501211956143, + "learning_rate": 4.820708351067568e-05, + "loss": 0.0014, + "num_input_tokens_seen": 4622360, + "step": 7585 + }, + { + "epoch": 2.093215664644236, + "grad_norm": 0.1326388567686081, + "learning_rate": 4.820260590503197e-05, + "loss": 0.036, + "num_input_tokens_seen": 4625464, + "step": 7590 + }, + { + "epoch": 2.0945945945945947, + "grad_norm": 0.003518692683428526, + "learning_rate": 4.819812292367754e-05, + "loss": 0.0012, + "num_input_tokens_seen": 4628024, + "step": 7595 + }, + { + "epoch": 2.095973524544953, + "grad_norm": 0.05283913388848305, + "learning_rate": 4.8193634567651014e-05, + "loss": 0.0003, + "num_input_tokens_seen": 4630616, + "step": 7600 + }, + { + "epoch": 2.0973524544953115, + "grad_norm": 38.87104034423828, + "learning_rate": 4.8189140837992285e-05, + "loss": 0.0459, + "num_input_tokens_seen": 4633944, + "step": 7605 + }, + { + "epoch": 2.0987313844456703, + "grad_norm": 0.0358818918466568, + "learning_rate": 4.81846417357425e-05, + "loss": 0.0003, + "num_input_tokens_seen": 4636824, + "step": 7610 + }, + { + "epoch": 2.1001103143960287, + "grad_norm": 0.03555616736412048, + "learning_rate": 4.818013726194403e-05, + "loss": 0.0002, + "num_input_tokens_seen": 4639992, + "step": 7615 + }, + { + "epoch": 2.101489244346387, + "grad_norm": 0.0006330932374112308, + "learning_rate": 4.817562741764049e-05, + "loss": 0.0033, + "num_input_tokens_seen": 4643896, + "step": 7620 + }, + { + "epoch": 2.102868174296746, + "grad_norm": 0.05927877873182297, + "learning_rate": 4.817111220387675e-05, + "loss": 0.0016, + "num_input_tokens_seen": 4651576, + "step": 7625 + }, + { + "epoch": 2.1042471042471043, + "grad_norm": 7.602656842209399e-05, + "learning_rate": 4.816659162169892e-05, + "loss": 0.0011, + "num_input_tokens_seen": 4654232, + "step": 7630 + }, + { + "epoch": 2.1056260341974626, + "grad_norm": 0.03420206904411316, + "learning_rate": 4.816206567215436e-05, + "loss": 0.0008, + "num_input_tokens_seen": 4656984, + "step": 7635 + }, + { + "epoch": 2.1070049641478215, + "grad_norm": 0.0034066459629684687, + "learning_rate": 4.815753435629165e-05, + "loss": 0.0284, + "num_input_tokens_seen": 4659576, + "step": 7640 + }, + { + "epoch": 2.10838389409818, + "grad_norm": 0.001414724625647068, + "learning_rate": 4.815299767516065e-05, + "loss": 0.0005, + "num_input_tokens_seen": 4662840, + "step": 7645 + }, + { + "epoch": 2.109762824048538, + "grad_norm": 0.0015888414345681667, + "learning_rate": 4.814845562981244e-05, + "loss": 0.001, + "num_input_tokens_seen": 4665464, + "step": 7650 + }, + { + "epoch": 2.111141753998897, + "grad_norm": 0.00017371107242070138, + "learning_rate": 4.814390822129933e-05, + "loss": 0.0, + "num_input_tokens_seen": 4668248, + "step": 7655 + }, + { + "epoch": 2.1125206839492554, + "grad_norm": 0.018856145441532135, + "learning_rate": 4.813935545067492e-05, + "loss": 0.0006, + "num_input_tokens_seen": 4671128, + "step": 7660 + }, + { + "epoch": 2.113899613899614, + "grad_norm": 4.998915028409101e-05, + "learning_rate": 4.813479731899399e-05, + "loss": 0.0, + "num_input_tokens_seen": 4673976, + "step": 7665 + }, + { + "epoch": 2.1152785438499726, + "grad_norm": 0.0002354418975301087, + "learning_rate": 4.8130233827312617e-05, + "loss": 0.0005, + "num_input_tokens_seen": 4677080, + "step": 7670 + }, + { + "epoch": 2.116657473800331, + "grad_norm": 0.9374649524688721, + "learning_rate": 4.812566497668808e-05, + "loss": 0.0492, + "num_input_tokens_seen": 4680408, + "step": 7675 + }, + { + "epoch": 2.1180364037506894, + "grad_norm": 0.00013091755681671202, + "learning_rate": 4.8121090768178925e-05, + "loss": 0.0001, + "num_input_tokens_seen": 4683704, + "step": 7680 + }, + { + "epoch": 2.119415333701048, + "grad_norm": 17.9538516998291, + "learning_rate": 4.811651120284493e-05, + "loss": 0.0096, + "num_input_tokens_seen": 4686328, + "step": 7685 + }, + { + "epoch": 2.1207942636514066, + "grad_norm": 2.431088978482876e-05, + "learning_rate": 4.811192628174712e-05, + "loss": 0.0, + "num_input_tokens_seen": 4690168, + "step": 7690 + }, + { + "epoch": 2.122173193601765, + "grad_norm": 0.00018129318777937442, + "learning_rate": 4.810733600594775e-05, + "loss": 0.0649, + "num_input_tokens_seen": 4693464, + "step": 7695 + }, + { + "epoch": 2.1235521235521237, + "grad_norm": 0.0019499295158311725, + "learning_rate": 4.8102740376510304e-05, + "loss": 0.0042, + "num_input_tokens_seen": 4696632, + "step": 7700 + }, + { + "epoch": 2.124931053502482, + "grad_norm": 2.5680967155494727e-05, + "learning_rate": 4.809813939449955e-05, + "loss": 0.0, + "num_input_tokens_seen": 4699224, + "step": 7705 + }, + { + "epoch": 2.1263099834528405, + "grad_norm": 0.004300529137253761, + "learning_rate": 4.809353306098145e-05, + "loss": 0.0001, + "num_input_tokens_seen": 4702104, + "step": 7710 + }, + { + "epoch": 2.1276889134031993, + "grad_norm": 3.791433846345171e-05, + "learning_rate": 4.8088921377023246e-05, + "loss": 0.0, + "num_input_tokens_seen": 4704504, + "step": 7715 + }, + { + "epoch": 2.1290678433535577, + "grad_norm": 0.004459581803530455, + "learning_rate": 4.8084304343693384e-05, + "loss": 0.0, + "num_input_tokens_seen": 4707416, + "step": 7720 + }, + { + "epoch": 2.130446773303916, + "grad_norm": 0.022256556898355484, + "learning_rate": 4.807968196206156e-05, + "loss": 0.0012, + "num_input_tokens_seen": 4710424, + "step": 7725 + }, + { + "epoch": 2.1318257032542745, + "grad_norm": 0.003244422608986497, + "learning_rate": 4.807505423319873e-05, + "loss": 0.0, + "num_input_tokens_seen": 4713304, + "step": 7730 + }, + { + "epoch": 2.1332046332046333, + "grad_norm": 8.171344961738214e-05, + "learning_rate": 4.807042115817706e-05, + "loss": 0.0, + "num_input_tokens_seen": 4715672, + "step": 7735 + }, + { + "epoch": 2.1345835631549916, + "grad_norm": 5.834110561409034e-05, + "learning_rate": 4.806578273806997e-05, + "loss": 0.0, + "num_input_tokens_seen": 4719064, + "step": 7740 + }, + { + "epoch": 2.1359624931053505, + "grad_norm": 2.98488448606804e-05, + "learning_rate": 4.806113897395211e-05, + "loss": 0.1354, + "num_input_tokens_seen": 4721816, + "step": 7745 + }, + { + "epoch": 2.137341423055709, + "grad_norm": 0.001051494968123734, + "learning_rate": 4.805648986689939e-05, + "loss": 0.0, + "num_input_tokens_seen": 4725112, + "step": 7750 + }, + { + "epoch": 2.138720353006067, + "grad_norm": 7.484096568077803e-05, + "learning_rate": 4.805183541798892e-05, + "loss": 0.0, + "num_input_tokens_seen": 4728056, + "step": 7755 + }, + { + "epoch": 2.1400992829564256, + "grad_norm": 0.00016892270650714636, + "learning_rate": 4.804717562829909e-05, + "loss": 0.0002, + "num_input_tokens_seen": 4730456, + "step": 7760 + }, + { + "epoch": 2.1414782129067844, + "grad_norm": 0.0016789851943030953, + "learning_rate": 4.804251049890949e-05, + "loss": 0.0, + "num_input_tokens_seen": 4733720, + "step": 7765 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 10.367191314697266, + "learning_rate": 4.803784003090097e-05, + "loss": 0.2147, + "num_input_tokens_seen": 4735928, + "step": 7770 + }, + { + "epoch": 2.1442360728075016, + "grad_norm": 0.05299406871199608, + "learning_rate": 4.803316422535561e-05, + "loss": 0.0079, + "num_input_tokens_seen": 4739320, + "step": 7775 + }, + { + "epoch": 2.14561500275786, + "grad_norm": 0.020284362137317657, + "learning_rate": 4.8028483083356725e-05, + "loss": 0.0004, + "num_input_tokens_seen": 4741688, + "step": 7780 + }, + { + "epoch": 2.1469939327082184, + "grad_norm": 0.10039210319519043, + "learning_rate": 4.8023796605988866e-05, + "loss": 0.0009, + "num_input_tokens_seen": 4746296, + "step": 7785 + }, + { + "epoch": 2.1483728626585767, + "grad_norm": 0.008705668151378632, + "learning_rate": 4.8019104794337835e-05, + "loss": 0.0004, + "num_input_tokens_seen": 4749464, + "step": 7790 + }, + { + "epoch": 2.1497517926089356, + "grad_norm": 0.009763110429048538, + "learning_rate": 4.8014407649490645e-05, + "loss": 0.0027, + "num_input_tokens_seen": 4752280, + "step": 7795 + }, + { + "epoch": 2.151130722559294, + "grad_norm": 0.20672707259655, + "learning_rate": 4.800970517253556e-05, + "loss": 0.0023, + "num_input_tokens_seen": 4755256, + "step": 7800 + }, + { + "epoch": 2.1525096525096523, + "grad_norm": 0.00721574854105711, + "learning_rate": 4.8004997364562076e-05, + "loss": 0.0023, + "num_input_tokens_seen": 4758712, + "step": 7805 + }, + { + "epoch": 2.153888582460011, + "grad_norm": 7.037478446960449, + "learning_rate": 4.800028422666093e-05, + "loss": 0.0028, + "num_input_tokens_seen": 4762072, + "step": 7810 + }, + { + "epoch": 2.1552675124103695, + "grad_norm": 0.045229505747556686, + "learning_rate": 4.799556575992409e-05, + "loss": 0.0003, + "num_input_tokens_seen": 4765720, + "step": 7815 + }, + { + "epoch": 2.156646442360728, + "grad_norm": 0.010506034828722477, + "learning_rate": 4.7990841965444745e-05, + "loss": 0.0002, + "num_input_tokens_seen": 4769624, + "step": 7820 + }, + { + "epoch": 2.1580253723110867, + "grad_norm": 10.743090629577637, + "learning_rate": 4.798611284431733e-05, + "loss": 0.057, + "num_input_tokens_seen": 4772280, + "step": 7825 + }, + { + "epoch": 2.159404302261445, + "grad_norm": 0.009804551489651203, + "learning_rate": 4.798137839763753e-05, + "loss": 0.0001, + "num_input_tokens_seen": 4774648, + "step": 7830 + }, + { + "epoch": 2.1607832322118035, + "grad_norm": 0.08015867322683334, + "learning_rate": 4.7976638626502236e-05, + "loss": 0.0941, + "num_input_tokens_seen": 4777368, + "step": 7835 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.016036666929721832, + "learning_rate": 4.797189353200959e-05, + "loss": 0.0145, + "num_input_tokens_seen": 4780344, + "step": 7840 + }, + { + "epoch": 2.1635410921125207, + "grad_norm": 0.289614200592041, + "learning_rate": 4.796714311525896e-05, + "loss": 0.0009, + "num_input_tokens_seen": 4783160, + "step": 7845 + }, + { + "epoch": 2.164920022062879, + "grad_norm": 0.05423617735505104, + "learning_rate": 4.796238737735095e-05, + "loss": 0.0255, + "num_input_tokens_seen": 4785720, + "step": 7850 + }, + { + "epoch": 2.166298952013238, + "grad_norm": 0.0021321086678653955, + "learning_rate": 4.795762631938741e-05, + "loss": 0.1152, + "num_input_tokens_seen": 4788408, + "step": 7855 + }, + { + "epoch": 2.1676778819635962, + "grad_norm": 0.06523719429969788, + "learning_rate": 4.7952859942471376e-05, + "loss": 0.0594, + "num_input_tokens_seen": 4791576, + "step": 7860 + }, + { + "epoch": 2.1690568119139546, + "grad_norm": 0.0032718232832849026, + "learning_rate": 4.794808824770718e-05, + "loss": 0.0153, + "num_input_tokens_seen": 4794744, + "step": 7865 + }, + { + "epoch": 2.1704357418643134, + "grad_norm": 0.11418384313583374, + "learning_rate": 4.794331123620033e-05, + "loss": 0.0008, + "num_input_tokens_seen": 4797208, + "step": 7870 + }, + { + "epoch": 2.171814671814672, + "grad_norm": 0.010547755286097527, + "learning_rate": 4.793852890905762e-05, + "loss": 0.053, + "num_input_tokens_seen": 4800536, + "step": 7875 + }, + { + "epoch": 2.17319360176503, + "grad_norm": 0.00057777832262218, + "learning_rate": 4.793374126738702e-05, + "loss": 0.096, + "num_input_tokens_seen": 4805624, + "step": 7880 + }, + { + "epoch": 2.174572531715389, + "grad_norm": 0.010119891725480556, + "learning_rate": 4.7928948312297774e-05, + "loss": 0.0043, + "num_input_tokens_seen": 4808280, + "step": 7885 + }, + { + "epoch": 2.1759514616657474, + "grad_norm": 0.005404855590313673, + "learning_rate": 4.792415004490034e-05, + "loss": 0.0582, + "num_input_tokens_seen": 4810904, + "step": 7890 + }, + { + "epoch": 2.1773303916161058, + "grad_norm": 0.002785085467621684, + "learning_rate": 4.791934646630639e-05, + "loss": 0.0614, + "num_input_tokens_seen": 4813848, + "step": 7895 + }, + { + "epoch": 2.1787093215664646, + "grad_norm": 0.32739725708961487, + "learning_rate": 4.791453757762887e-05, + "loss": 0.052, + "num_input_tokens_seen": 4816120, + "step": 7900 + }, + { + "epoch": 2.180088251516823, + "grad_norm": 0.009992177598178387, + "learning_rate": 4.790972337998192e-05, + "loss": 0.0006, + "num_input_tokens_seen": 4818776, + "step": 7905 + }, + { + "epoch": 2.1814671814671813, + "grad_norm": 13.274517059326172, + "learning_rate": 4.790490387448091e-05, + "loss": 0.0498, + "num_input_tokens_seen": 4822904, + "step": 7910 + }, + { + "epoch": 2.18284611141754, + "grad_norm": 0.0032590145710855722, + "learning_rate": 4.790007906224246e-05, + "loss": 0.0262, + "num_input_tokens_seen": 4825720, + "step": 7915 + }, + { + "epoch": 2.1842250413678985, + "grad_norm": 0.933907151222229, + "learning_rate": 4.7895248944384416e-05, + "loss": 0.11, + "num_input_tokens_seen": 4829112, + "step": 7920 + }, + { + "epoch": 2.185603971318257, + "grad_norm": 0.028768440708518028, + "learning_rate": 4.7890413522025837e-05, + "loss": 0.0205, + "num_input_tokens_seen": 4832280, + "step": 7925 + }, + { + "epoch": 2.1869829012686157, + "grad_norm": 0.008535407483577728, + "learning_rate": 4.788557279628702e-05, + "loss": 0.0012, + "num_input_tokens_seen": 4835032, + "step": 7930 + }, + { + "epoch": 2.188361831218974, + "grad_norm": 0.02697306126356125, + "learning_rate": 4.78807267682895e-05, + "loss": 0.0762, + "num_input_tokens_seen": 4837368, + "step": 7935 + }, + { + "epoch": 2.1897407611693325, + "grad_norm": 0.009085102006793022, + "learning_rate": 4.7875875439156014e-05, + "loss": 0.0006, + "num_input_tokens_seen": 4840888, + "step": 7940 + }, + { + "epoch": 2.1911196911196913, + "grad_norm": 0.5052295327186584, + "learning_rate": 4.787101881001057e-05, + "loss": 0.1093, + "num_input_tokens_seen": 4844408, + "step": 7945 + }, + { + "epoch": 2.1924986210700497, + "grad_norm": 0.21841485798358917, + "learning_rate": 4.786615688197836e-05, + "loss": 0.0022, + "num_input_tokens_seen": 4847864, + "step": 7950 + }, + { + "epoch": 2.193877551020408, + "grad_norm": 0.015277073718607426, + "learning_rate": 4.7861289656185845e-05, + "loss": 0.012, + "num_input_tokens_seen": 4850424, + "step": 7955 + }, + { + "epoch": 2.195256480970767, + "grad_norm": 6.155749797821045, + "learning_rate": 4.785641713376067e-05, + "loss": 0.0831, + "num_input_tokens_seen": 4853496, + "step": 7960 + }, + { + "epoch": 2.1966354109211252, + "grad_norm": 0.19021515548229218, + "learning_rate": 4.785153931583173e-05, + "loss": 0.0008, + "num_input_tokens_seen": 4856440, + "step": 7965 + }, + { + "epoch": 2.1980143408714836, + "grad_norm": 0.007145737297832966, + "learning_rate": 4.784665620352916e-05, + "loss": 0.0002, + "num_input_tokens_seen": 4859832, + "step": 7970 + }, + { + "epoch": 2.1993932708218424, + "grad_norm": 0.19180673360824585, + "learning_rate": 4.7841767797984295e-05, + "loss": 0.0015, + "num_input_tokens_seen": 4862424, + "step": 7975 + }, + { + "epoch": 2.200772200772201, + "grad_norm": 0.04337785020470619, + "learning_rate": 4.783687410032971e-05, + "loss": 0.0004, + "num_input_tokens_seen": 4866264, + "step": 7980 + }, + { + "epoch": 2.202151130722559, + "grad_norm": 0.200333371758461, + "learning_rate": 4.7831975111699214e-05, + "loss": 0.1136, + "num_input_tokens_seen": 4869752, + "step": 7985 + }, + { + "epoch": 2.203530060672918, + "grad_norm": 1.7102524042129517, + "learning_rate": 4.7827070833227814e-05, + "loss": 0.0628, + "num_input_tokens_seen": 4872376, + "step": 7990 + }, + { + "epoch": 2.2049089906232764, + "grad_norm": 0.002229314995929599, + "learning_rate": 4.782216126605177e-05, + "loss": 0.0016, + "num_input_tokens_seen": 4875640, + "step": 7995 + }, + { + "epoch": 2.2062879205736348, + "grad_norm": 0.005118339788168669, + "learning_rate": 4.7817246411308567e-05, + "loss": 0.0009, + "num_input_tokens_seen": 4878968, + "step": 8000 + }, + { + "epoch": 2.2076668505239936, + "grad_norm": 0.1177566722035408, + "learning_rate": 4.78123262701369e-05, + "loss": 0.001, + "num_input_tokens_seen": 4882456, + "step": 8005 + }, + { + "epoch": 2.209045780474352, + "grad_norm": 0.001915040542371571, + "learning_rate": 4.780740084367668e-05, + "loss": 0.0942, + "num_input_tokens_seen": 4885496, + "step": 8010 + }, + { + "epoch": 2.2104247104247103, + "grad_norm": 0.0021556459832936525, + "learning_rate": 4.7802470133069086e-05, + "loss": 0.0575, + "num_input_tokens_seen": 4888088, + "step": 8015 + }, + { + "epoch": 2.211803640375069, + "grad_norm": 0.0739467591047287, + "learning_rate": 4.779753413945648e-05, + "loss": 0.0012, + "num_input_tokens_seen": 4891800, + "step": 8020 + }, + { + "epoch": 2.2131825703254275, + "grad_norm": 0.0066806538961827755, + "learning_rate": 4.779259286398245e-05, + "loss": 0.0003, + "num_input_tokens_seen": 4894008, + "step": 8025 + }, + { + "epoch": 2.214561500275786, + "grad_norm": 0.010499252937734127, + "learning_rate": 4.778764630779183e-05, + "loss": 0.0005, + "num_input_tokens_seen": 4896568, + "step": 8030 + }, + { + "epoch": 2.2159404302261443, + "grad_norm": 8.316432649735361e-05, + "learning_rate": 4.778269447203067e-05, + "loss": 0.0168, + "num_input_tokens_seen": 4899000, + "step": 8035 + }, + { + "epoch": 2.217319360176503, + "grad_norm": 0.0028023028280586004, + "learning_rate": 4.777773735784622e-05, + "loss": 0.0393, + "num_input_tokens_seen": 4903320, + "step": 8040 + }, + { + "epoch": 2.2186982901268615, + "grad_norm": 0.011933046393096447, + "learning_rate": 4.7772774966387005e-05, + "loss": 0.0023, + "num_input_tokens_seen": 4906104, + "step": 8045 + }, + { + "epoch": 2.2200772200772203, + "grad_norm": 0.0016082706861197948, + "learning_rate": 4.77678072988027e-05, + "loss": 0.0003, + "num_input_tokens_seen": 4909336, + "step": 8050 + }, + { + "epoch": 2.2214561500275787, + "grad_norm": 9.920806884765625, + "learning_rate": 4.776283435624427e-05, + "loss": 0.0591, + "num_input_tokens_seen": 4914872, + "step": 8055 + }, + { + "epoch": 2.222835079977937, + "grad_norm": 0.04260602593421936, + "learning_rate": 4.7757856139863875e-05, + "loss": 0.0004, + "num_input_tokens_seen": 4917592, + "step": 8060 + }, + { + "epoch": 2.2242140099282954, + "grad_norm": 0.004781103227287531, + "learning_rate": 4.775287265081488e-05, + "loss": 0.0012, + "num_input_tokens_seen": 4920184, + "step": 8065 + }, + { + "epoch": 2.2255929398786543, + "grad_norm": 0.030817177146673203, + "learning_rate": 4.77478838902519e-05, + "loss": 0.0596, + "num_input_tokens_seen": 4923320, + "step": 8070 + }, + { + "epoch": 2.2269718698290126, + "grad_norm": 19.36916160583496, + "learning_rate": 4.774288985933075e-05, + "loss": 0.1212, + "num_input_tokens_seen": 4926392, + "step": 8075 + }, + { + "epoch": 2.228350799779371, + "grad_norm": 6.789796829223633, + "learning_rate": 4.7737890559208484e-05, + "loss": 0.1336, + "num_input_tokens_seen": 4928728, + "step": 8080 + }, + { + "epoch": 2.22972972972973, + "grad_norm": 0.005511594470590353, + "learning_rate": 4.7732885991043365e-05, + "loss": 0.0005, + "num_input_tokens_seen": 4931640, + "step": 8085 + }, + { + "epoch": 2.231108659680088, + "grad_norm": 0.01814328320324421, + "learning_rate": 4.7727876155994875e-05, + "loss": 0.0015, + "num_input_tokens_seen": 4935608, + "step": 8090 + }, + { + "epoch": 2.2324875896304466, + "grad_norm": 0.001962002133950591, + "learning_rate": 4.772286105522373e-05, + "loss": 0.0011, + "num_input_tokens_seen": 4939448, + "step": 8095 + }, + { + "epoch": 2.2338665195808054, + "grad_norm": 0.17744943499565125, + "learning_rate": 4.771784068989186e-05, + "loss": 0.0738, + "num_input_tokens_seen": 4942392, + "step": 8100 + }, + { + "epoch": 2.2352454495311638, + "grad_norm": 0.016955208033323288, + "learning_rate": 4.771281506116239e-05, + "loss": 0.0006, + "num_input_tokens_seen": 4945944, + "step": 8105 + }, + { + "epoch": 2.236624379481522, + "grad_norm": 0.0012321334797888994, + "learning_rate": 4.770778417019971e-05, + "loss": 0.0737, + "num_input_tokens_seen": 4948184, + "step": 8110 + }, + { + "epoch": 2.238003309431881, + "grad_norm": 0.041360389441251755, + "learning_rate": 4.770274801816938e-05, + "loss": 0.0005, + "num_input_tokens_seen": 4950392, + "step": 8115 + }, + { + "epoch": 2.2393822393822393, + "grad_norm": 2.0602166652679443, + "learning_rate": 4.769770660623824e-05, + "loss": 0.0022, + "num_input_tokens_seen": 4952632, + "step": 8120 + }, + { + "epoch": 2.2407611693325977, + "grad_norm": 0.08881087601184845, + "learning_rate": 4.7692659935574276e-05, + "loss": 0.0013, + "num_input_tokens_seen": 4956056, + "step": 8125 + }, + { + "epoch": 2.2421400992829565, + "grad_norm": 0.002567342482507229, + "learning_rate": 4.7687608007346746e-05, + "loss": 0.0005, + "num_input_tokens_seen": 4958936, + "step": 8130 + }, + { + "epoch": 2.243519029233315, + "grad_norm": 0.0016064352821558714, + "learning_rate": 4.768255082272611e-05, + "loss": 0.0022, + "num_input_tokens_seen": 4961944, + "step": 8135 + }, + { + "epoch": 2.2448979591836733, + "grad_norm": 0.005334367509931326, + "learning_rate": 4.7677488382884045e-05, + "loss": 0.0002, + "num_input_tokens_seen": 4965304, + "step": 8140 + }, + { + "epoch": 2.246276889134032, + "grad_norm": 0.043229494243860245, + "learning_rate": 4.767242068899345e-05, + "loss": 0.0061, + "num_input_tokens_seen": 4969080, + "step": 8145 + }, + { + "epoch": 2.2476558190843905, + "grad_norm": 0.2006242722272873, + "learning_rate": 4.766734774222842e-05, + "loss": 0.0009, + "num_input_tokens_seen": 4971704, + "step": 8150 + }, + { + "epoch": 2.249034749034749, + "grad_norm": 44.84006881713867, + "learning_rate": 4.7662269543764294e-05, + "loss": 0.0339, + "num_input_tokens_seen": 4974232, + "step": 8155 + }, + { + "epoch": 2.2504136789851077, + "grad_norm": 0.0001465584064135328, + "learning_rate": 4.765718609477763e-05, + "loss": 0.0005, + "num_input_tokens_seen": 4977016, + "step": 8160 + }, + { + "epoch": 2.251792608935466, + "grad_norm": 13.83402156829834, + "learning_rate": 4.765209739644617e-05, + "loss": 0.0493, + "num_input_tokens_seen": 4980728, + "step": 8165 + }, + { + "epoch": 2.2531715388858244, + "grad_norm": 0.0002814457402564585, + "learning_rate": 4.7647003449948904e-05, + "loss": 0.0048, + "num_input_tokens_seen": 4983352, + "step": 8170 + }, + { + "epoch": 2.2545504688361833, + "grad_norm": 8.309236727654934e-05, + "learning_rate": 4.764190425646602e-05, + "loss": 0.0005, + "num_input_tokens_seen": 4985880, + "step": 8175 + }, + { + "epoch": 2.2559293987865416, + "grad_norm": 0.004243097268044949, + "learning_rate": 4.7636799817178934e-05, + "loss": 0.0001, + "num_input_tokens_seen": 4988696, + "step": 8180 + }, + { + "epoch": 2.2573083287369, + "grad_norm": 0.005705064162611961, + "learning_rate": 4.7631690133270266e-05, + "loss": 0.0, + "num_input_tokens_seen": 4992056, + "step": 8185 + }, + { + "epoch": 2.258687258687259, + "grad_norm": 8.605464245192707e-05, + "learning_rate": 4.762657520592386e-05, + "loss": 0.017, + "num_input_tokens_seen": 4995512, + "step": 8190 + }, + { + "epoch": 2.260066188637617, + "grad_norm": 0.002232351340353489, + "learning_rate": 4.762145503632477e-05, + "loss": 0.0177, + "num_input_tokens_seen": 4998360, + "step": 8195 + }, + { + "epoch": 2.2614451185879756, + "grad_norm": 0.0019424180500209332, + "learning_rate": 4.761632962565927e-05, + "loss": 0.1441, + "num_input_tokens_seen": 5001976, + "step": 8200 + }, + { + "epoch": 2.2628240485383344, + "grad_norm": 0.00043961548362858593, + "learning_rate": 4.7611198975114836e-05, + "loss": 0.0001, + "num_input_tokens_seen": 5004888, + "step": 8205 + }, + { + "epoch": 2.264202978488693, + "grad_norm": 0.009882324375212193, + "learning_rate": 4.760606308588017e-05, + "loss": 0.0, + "num_input_tokens_seen": 5007864, + "step": 8210 + }, + { + "epoch": 2.265581908439051, + "grad_norm": 0.00492492550984025, + "learning_rate": 4.760092195914518e-05, + "loss": 0.0001, + "num_input_tokens_seen": 5010296, + "step": 8215 + }, + { + "epoch": 2.26696083838941, + "grad_norm": 13.112621307373047, + "learning_rate": 4.759577559610101e-05, + "loss": 0.0859, + "num_input_tokens_seen": 5013144, + "step": 8220 + }, + { + "epoch": 2.2683397683397684, + "grad_norm": 0.00017482692783232778, + "learning_rate": 4.759062399793997e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5015896, + "step": 8225 + }, + { + "epoch": 2.2697186982901267, + "grad_norm": 9.678667265689e-05, + "learning_rate": 4.7585467165855634e-05, + "loss": 0.3032, + "num_input_tokens_seen": 5017944, + "step": 8230 + }, + { + "epoch": 2.2710976282404856, + "grad_norm": 0.08590690046548843, + "learning_rate": 4.7580305101042746e-05, + "loss": 0.0009, + "num_input_tokens_seen": 5021816, + "step": 8235 + }, + { + "epoch": 2.272476558190844, + "grad_norm": 0.05471651628613472, + "learning_rate": 4.75751378046973e-05, + "loss": 0.0861, + "num_input_tokens_seen": 5025432, + "step": 8240 + }, + { + "epoch": 2.2738554881412023, + "grad_norm": 0.23200520873069763, + "learning_rate": 4.756996527801648e-05, + "loss": 0.0024, + "num_input_tokens_seen": 5028088, + "step": 8245 + }, + { + "epoch": 2.275234418091561, + "grad_norm": 0.2467545121908188, + "learning_rate": 4.756478752219868e-05, + "loss": 0.0045, + "num_input_tokens_seen": 5030712, + "step": 8250 + }, + { + "epoch": 2.2766133480419195, + "grad_norm": 0.015832364559173584, + "learning_rate": 4.755960453844351e-05, + "loss": 0.0015, + "num_input_tokens_seen": 5034840, + "step": 8255 + }, + { + "epoch": 2.277992277992278, + "grad_norm": 0.012109086848795414, + "learning_rate": 4.7554416327951815e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5038264, + "step": 8260 + }, + { + "epoch": 2.2793712079426367, + "grad_norm": 0.001640096539631486, + "learning_rate": 4.75492228919256e-05, + "loss": 0.0005, + "num_input_tokens_seen": 5041912, + "step": 8265 + }, + { + "epoch": 2.280750137892995, + "grad_norm": 0.0267171673476696, + "learning_rate": 4.754402423156812e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5045592, + "step": 8270 + }, + { + "epoch": 2.2821290678433535, + "grad_norm": 0.0349535271525383, + "learning_rate": 4.7538820348083834e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5048408, + "step": 8275 + }, + { + "epoch": 2.283507997793712, + "grad_norm": 0.00945182517170906, + "learning_rate": 4.7533611242678414e-05, + "loss": 0.0005, + "num_input_tokens_seen": 5051064, + "step": 8280 + }, + { + "epoch": 2.2848869277440707, + "grad_norm": 5.293056964874268, + "learning_rate": 4.752839691655871e-05, + "loss": 0.0054, + "num_input_tokens_seen": 5054072, + "step": 8285 + }, + { + "epoch": 2.286265857694429, + "grad_norm": 0.0458347387611866, + "learning_rate": 4.7523177370932834e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5058776, + "step": 8290 + }, + { + "epoch": 2.287644787644788, + "grad_norm": 0.005794947501271963, + "learning_rate": 4.751795260701007e-05, + "loss": 0.082, + "num_input_tokens_seen": 5063544, + "step": 8295 + }, + { + "epoch": 2.2890237175951462, + "grad_norm": 0.017229130491614342, + "learning_rate": 4.7512722626000916e-05, + "loss": 0.0823, + "num_input_tokens_seen": 5066808, + "step": 8300 + }, + { + "epoch": 2.2904026475455046, + "grad_norm": 12.474126815795898, + "learning_rate": 4.7507487429117086e-05, + "loss": 0.0574, + "num_input_tokens_seen": 5069528, + "step": 8305 + }, + { + "epoch": 2.291781577495863, + "grad_norm": 0.0129832923412323, + "learning_rate": 4.75022470175715e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5072472, + "step": 8310 + }, + { + "epoch": 2.293160507446222, + "grad_norm": 0.0033000644762068987, + "learning_rate": 4.749700139257829e-05, + "loss": 0.0003, + "num_input_tokens_seen": 5075032, + "step": 8315 + }, + { + "epoch": 2.29453943739658, + "grad_norm": 0.23035913705825806, + "learning_rate": 4.749175055535279e-05, + "loss": 0.0988, + "num_input_tokens_seen": 5078072, + "step": 8320 + }, + { + "epoch": 2.295918367346939, + "grad_norm": 0.06012466922402382, + "learning_rate": 4.7486494507111535e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5080440, + "step": 8325 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 0.003411104204133153, + "learning_rate": 4.748123324907229e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5083096, + "step": 8330 + }, + { + "epoch": 2.2986762272476557, + "grad_norm": 0.3358425498008728, + "learning_rate": 4.7475966782453994e-05, + "loss": 0.0007, + "num_input_tokens_seen": 5085336, + "step": 8335 + }, + { + "epoch": 2.300055157198014, + "grad_norm": 0.007803561165928841, + "learning_rate": 4.747069510847684e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5087960, + "step": 8340 + }, + { + "epoch": 2.301434087148373, + "grad_norm": 0.021242741495370865, + "learning_rate": 4.7465418228362174e-05, + "loss": 0.0064, + "num_input_tokens_seen": 5090872, + "step": 8345 + }, + { + "epoch": 2.3028130170987313, + "grad_norm": 0.02718820609152317, + "learning_rate": 4.7460136143332586e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5093208, + "step": 8350 + }, + { + "epoch": 2.30419194704909, + "grad_norm": 0.0051672980189323425, + "learning_rate": 4.7454848854611856e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5095960, + "step": 8355 + }, + { + "epoch": 2.3055708769994485, + "grad_norm": 5.603193759918213, + "learning_rate": 4.744955636342497e-05, + "loss": 0.0672, + "num_input_tokens_seen": 5098872, + "step": 8360 + }, + { + "epoch": 2.306949806949807, + "grad_norm": 0.0012135255383327603, + "learning_rate": 4.744425867099812e-05, + "loss": 0.0551, + "num_input_tokens_seen": 5101176, + "step": 8365 + }, + { + "epoch": 2.3083287369001653, + "grad_norm": 0.00045952072832733393, + "learning_rate": 4.7438955778558724e-05, + "loss": 0.0007, + "num_input_tokens_seen": 5104568, + "step": 8370 + }, + { + "epoch": 2.309707666850524, + "grad_norm": 0.0017318548634648323, + "learning_rate": 4.7433647687335356e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5107096, + "step": 8375 + }, + { + "epoch": 2.3110865968008825, + "grad_norm": 0.07289092242717743, + "learning_rate": 4.7428334398557856e-05, + "loss": 0.0003, + "num_input_tokens_seen": 5109336, + "step": 8380 + }, + { + "epoch": 2.3124655267512413, + "grad_norm": 0.04814894124865532, + "learning_rate": 4.7423015913457214e-05, + "loss": 0.0003, + "num_input_tokens_seen": 5112600, + "step": 8385 + }, + { + "epoch": 2.3138444567015997, + "grad_norm": 0.0029308032244443893, + "learning_rate": 4.741769223326565e-05, + "loss": 0.006, + "num_input_tokens_seen": 5115640, + "step": 8390 + }, + { + "epoch": 2.315223386651958, + "grad_norm": 0.0404597744345665, + "learning_rate": 4.74123633592166e-05, + "loss": 0.0012, + "num_input_tokens_seen": 5117816, + "step": 8395 + }, + { + "epoch": 2.3166023166023164, + "grad_norm": 0.01574970968067646, + "learning_rate": 4.740702929254467e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5121208, + "step": 8400 + }, + { + "epoch": 2.3179812465526752, + "grad_norm": 0.002432251116260886, + "learning_rate": 4.740169003448569e-05, + "loss": 0.2896, + "num_input_tokens_seen": 5123928, + "step": 8405 + }, + { + "epoch": 2.3193601765030336, + "grad_norm": 0.007035411894321442, + "learning_rate": 4.73963455862767e-05, + "loss": 0.0571, + "num_input_tokens_seen": 5127512, + "step": 8410 + }, + { + "epoch": 2.320739106453392, + "grad_norm": 18.157039642333984, + "learning_rate": 4.7390995949155915e-05, + "loss": 0.0491, + "num_input_tokens_seen": 5130584, + "step": 8415 + }, + { + "epoch": 2.322118036403751, + "grad_norm": 0.02962314710021019, + "learning_rate": 4.738564112436279e-05, + "loss": 0.0012, + "num_input_tokens_seen": 5133432, + "step": 8420 + }, + { + "epoch": 2.323496966354109, + "grad_norm": 0.006946281064301729, + "learning_rate": 4.738028111313794e-05, + "loss": 0.0009, + "num_input_tokens_seen": 5136472, + "step": 8425 + }, + { + "epoch": 2.3248758963044676, + "grad_norm": 0.02260012924671173, + "learning_rate": 4.737491591672322e-05, + "loss": 0.0008, + "num_input_tokens_seen": 5139896, + "step": 8430 + }, + { + "epoch": 2.3262548262548264, + "grad_norm": 0.01909051649272442, + "learning_rate": 4.736954553636165e-05, + "loss": 0.0005, + "num_input_tokens_seen": 5143384, + "step": 8435 + }, + { + "epoch": 2.3276337562051848, + "grad_norm": 0.01435437798500061, + "learning_rate": 4.736416997329749e-05, + "loss": 0.0029, + "num_input_tokens_seen": 5146040, + "step": 8440 + }, + { + "epoch": 2.329012686155543, + "grad_norm": 0.04223458841443062, + "learning_rate": 4.735878922877617e-05, + "loss": 0.0122, + "num_input_tokens_seen": 5148344, + "step": 8445 + }, + { + "epoch": 2.330391616105902, + "grad_norm": 0.01451084204018116, + "learning_rate": 4.735340330404433e-05, + "loss": 0.011, + "num_input_tokens_seen": 5150936, + "step": 8450 + }, + { + "epoch": 2.3317705460562603, + "grad_norm": 0.0021338274236768484, + "learning_rate": 4.734801220034981e-05, + "loss": 0.0015, + "num_input_tokens_seen": 5153592, + "step": 8455 + }, + { + "epoch": 2.3331494760066187, + "grad_norm": 0.0040702903643250465, + "learning_rate": 4.7342615918941664e-05, + "loss": 0.0268, + "num_input_tokens_seen": 5156568, + "step": 8460 + }, + { + "epoch": 2.3345284059569775, + "grad_norm": 0.5371788740158081, + "learning_rate": 4.733721446107012e-05, + "loss": 0.0016, + "num_input_tokens_seen": 5159128, + "step": 8465 + }, + { + "epoch": 2.335907335907336, + "grad_norm": 0.010192508809268475, + "learning_rate": 4.733180782798663e-05, + "loss": 0.0003, + "num_input_tokens_seen": 5163256, + "step": 8470 + }, + { + "epoch": 2.3372862658576943, + "grad_norm": 0.009706133976578712, + "learning_rate": 4.732639602094382e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5165976, + "step": 8475 + }, + { + "epoch": 2.338665195808053, + "grad_norm": 0.000637288554571569, + "learning_rate": 4.732097904119554e-05, + "loss": 0.0754, + "num_input_tokens_seen": 5168120, + "step": 8480 + }, + { + "epoch": 2.3400441257584115, + "grad_norm": 0.0038682546000927687, + "learning_rate": 4.7315556889996814e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5170904, + "step": 8485 + }, + { + "epoch": 2.34142305570877, + "grad_norm": 14.468756675720215, + "learning_rate": 4.731012956860389e-05, + "loss": 0.059, + "num_input_tokens_seen": 5173976, + "step": 8490 + }, + { + "epoch": 2.3428019856591287, + "grad_norm": 0.03146521747112274, + "learning_rate": 4.73046970782742e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5176696, + "step": 8495 + }, + { + "epoch": 2.344180915609487, + "grad_norm": 0.005152450408786535, + "learning_rate": 4.729925942026635e-05, + "loss": 0.0001, + "num_input_tokens_seen": 5178904, + "step": 8500 + }, + { + "epoch": 2.3455598455598454, + "grad_norm": 0.007891075685620308, + "learning_rate": 4.72938165958402e-05, + "loss": 0.0139, + "num_input_tokens_seen": 5182104, + "step": 8505 + }, + { + "epoch": 2.3469387755102042, + "grad_norm": 0.8555788397789001, + "learning_rate": 4.728836860625675e-05, + "loss": 0.0019, + "num_input_tokens_seen": 5185080, + "step": 8510 + }, + { + "epoch": 2.3483177054605626, + "grad_norm": 0.0036063448060303926, + "learning_rate": 4.728291545277824e-05, + "loss": 0.0012, + "num_input_tokens_seen": 5187928, + "step": 8515 + }, + { + "epoch": 2.349696635410921, + "grad_norm": 0.01337185874581337, + "learning_rate": 4.7277457136668066e-05, + "loss": 0.0164, + "num_input_tokens_seen": 5190488, + "step": 8520 + }, + { + "epoch": 2.35107556536128, + "grad_norm": 0.07692402601242065, + "learning_rate": 4.727199365919086e-05, + "loss": 0.1003, + "num_input_tokens_seen": 5193048, + "step": 8525 + }, + { + "epoch": 2.352454495311638, + "grad_norm": 22.313493728637695, + "learning_rate": 4.7266525021612426e-05, + "loss": 0.0376, + "num_input_tokens_seen": 5195480, + "step": 8530 + }, + { + "epoch": 2.3538334252619966, + "grad_norm": 0.11572067439556122, + "learning_rate": 4.726105122519977e-05, + "loss": 0.0003, + "num_input_tokens_seen": 5199576, + "step": 8535 + }, + { + "epoch": 2.3552123552123554, + "grad_norm": 0.0033031783532351255, + "learning_rate": 4.725557227122108e-05, + "loss": 0.1351, + "num_input_tokens_seen": 5202936, + "step": 8540 + }, + { + "epoch": 2.3565912851627138, + "grad_norm": 0.0800480842590332, + "learning_rate": 4.725008816094577e-05, + "loss": 0.0012, + "num_input_tokens_seen": 5205944, + "step": 8545 + }, + { + "epoch": 2.357970215113072, + "grad_norm": 0.03346659615635872, + "learning_rate": 4.724459889564442e-05, + "loss": 0.001, + "num_input_tokens_seen": 5208984, + "step": 8550 + }, + { + "epoch": 2.359349145063431, + "grad_norm": 0.10549158602952957, + "learning_rate": 4.723910447658881e-05, + "loss": 0.0007, + "num_input_tokens_seen": 5211864, + "step": 8555 + }, + { + "epoch": 2.3607280750137893, + "grad_norm": 0.040662288665771484, + "learning_rate": 4.723360490505192e-05, + "loss": 0.0003, + "num_input_tokens_seen": 5215800, + "step": 8560 + }, + { + "epoch": 2.3621070049641477, + "grad_norm": 0.033044371753931046, + "learning_rate": 4.7228100182307934e-05, + "loss": 0.0003, + "num_input_tokens_seen": 5218552, + "step": 8565 + }, + { + "epoch": 2.3634859349145065, + "grad_norm": 8.46005916595459, + "learning_rate": 4.72225903096322e-05, + "loss": 0.088, + "num_input_tokens_seen": 5221272, + "step": 8570 + }, + { + "epoch": 2.364864864864865, + "grad_norm": 0.012313391081988811, + "learning_rate": 4.721707528830128e-05, + "loss": 0.0005, + "num_input_tokens_seen": 5226968, + "step": 8575 + }, + { + "epoch": 2.3662437948152233, + "grad_norm": 0.009912270121276379, + "learning_rate": 4.721155511959293e-05, + "loss": 0.0222, + "num_input_tokens_seen": 5229976, + "step": 8580 + }, + { + "epoch": 2.3676227247655817, + "grad_norm": 0.14595240354537964, + "learning_rate": 4.72060298047861e-05, + "loss": 0.0009, + "num_input_tokens_seen": 5232504, + "step": 8585 + }, + { + "epoch": 2.3690016547159405, + "grad_norm": 0.0766470730304718, + "learning_rate": 4.720049934516092e-05, + "loss": 0.0005, + "num_input_tokens_seen": 5236280, + "step": 8590 + }, + { + "epoch": 2.370380584666299, + "grad_norm": 31.402395248413086, + "learning_rate": 4.719496374199871e-05, + "loss": 0.0818, + "num_input_tokens_seen": 5238872, + "step": 8595 + }, + { + "epoch": 2.3717595146166577, + "grad_norm": 0.19313237071037292, + "learning_rate": 4.7189422996582e-05, + "loss": 0.0008, + "num_input_tokens_seen": 5243800, + "step": 8600 + }, + { + "epoch": 2.373138444567016, + "grad_norm": 0.20586919784545898, + "learning_rate": 4.71838771101945e-05, + "loss": 0.0325, + "num_input_tokens_seen": 5246936, + "step": 8605 + }, + { + "epoch": 2.3745173745173744, + "grad_norm": 14.080536842346191, + "learning_rate": 4.7178326084121105e-05, + "loss": 0.1329, + "num_input_tokens_seen": 5249752, + "step": 8610 + }, + { + "epoch": 2.375896304467733, + "grad_norm": 0.0007028020336292684, + "learning_rate": 4.7172769919647915e-05, + "loss": 0.1221, + "num_input_tokens_seen": 5252344, + "step": 8615 + }, + { + "epoch": 2.3772752344180916, + "grad_norm": 0.2364923655986786, + "learning_rate": 4.716720861806221e-05, + "loss": 0.09, + "num_input_tokens_seen": 5255224, + "step": 8620 + }, + { + "epoch": 2.37865416436845, + "grad_norm": 0.2564626634120941, + "learning_rate": 4.7161642180652464e-05, + "loss": 0.0043, + "num_input_tokens_seen": 5259224, + "step": 8625 + }, + { + "epoch": 2.380033094318809, + "grad_norm": 0.1163557693362236, + "learning_rate": 4.715607060870835e-05, + "loss": 0.0559, + "num_input_tokens_seen": 5261688, + "step": 8630 + }, + { + "epoch": 2.381412024269167, + "grad_norm": 0.055200833827257156, + "learning_rate": 4.715049390352071e-05, + "loss": 0.0005, + "num_input_tokens_seen": 5266904, + "step": 8635 + }, + { + "epoch": 2.3827909542195256, + "grad_norm": 0.202017143368721, + "learning_rate": 4.714491206638159e-05, + "loss": 0.0014, + "num_input_tokens_seen": 5269848, + "step": 8640 + }, + { + "epoch": 2.384169884169884, + "grad_norm": 14.261640548706055, + "learning_rate": 4.713932509858422e-05, + "loss": 0.0226, + "num_input_tokens_seen": 5272856, + "step": 8645 + }, + { + "epoch": 2.3855488141202428, + "grad_norm": 0.03277820348739624, + "learning_rate": 4.7133733001423025e-05, + "loss": 0.0325, + "num_input_tokens_seen": 5275608, + "step": 8650 + }, + { + "epoch": 2.386927744070601, + "grad_norm": 0.009095986373722553, + "learning_rate": 4.712813577619361e-05, + "loss": 0.0023, + "num_input_tokens_seen": 5278616, + "step": 8655 + }, + { + "epoch": 2.38830667402096, + "grad_norm": 36.98732376098633, + "learning_rate": 4.712253342419277e-05, + "loss": 0.0475, + "num_input_tokens_seen": 5281336, + "step": 8660 + }, + { + "epoch": 2.3896856039713184, + "grad_norm": 0.17704202234745026, + "learning_rate": 4.711692594671849e-05, + "loss": 0.0007, + "num_input_tokens_seen": 5284440, + "step": 8665 + }, + { + "epoch": 2.3910645339216767, + "grad_norm": 0.025667568668723106, + "learning_rate": 4.7111313345069953e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5286808, + "step": 8670 + }, + { + "epoch": 2.392443463872035, + "grad_norm": 2.636993646621704, + "learning_rate": 4.71056956205475e-05, + "loss": 0.0542, + "num_input_tokens_seen": 5290712, + "step": 8675 + }, + { + "epoch": 2.393822393822394, + "grad_norm": 0.0005180126172490418, + "learning_rate": 4.710007277445268e-05, + "loss": 0.0009, + "num_input_tokens_seen": 5293336, + "step": 8680 + }, + { + "epoch": 2.3952013237727523, + "grad_norm": 0.12996093928813934, + "learning_rate": 4.709444480808825e-05, + "loss": 0.0105, + "num_input_tokens_seen": 5295928, + "step": 8685 + }, + { + "epoch": 2.3965802537231107, + "grad_norm": 0.007955853827297688, + "learning_rate": 4.70888117227581e-05, + "loss": 0.119, + "num_input_tokens_seen": 5298648, + "step": 8690 + }, + { + "epoch": 2.3979591836734695, + "grad_norm": 25.599634170532227, + "learning_rate": 4.708317351976735e-05, + "loss": 0.1059, + "num_input_tokens_seen": 5302616, + "step": 8695 + }, + { + "epoch": 2.399338113623828, + "grad_norm": 0.0033976868726313114, + "learning_rate": 4.707753020042228e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5305656, + "step": 8700 + }, + { + "epoch": 2.4007170435741862, + "grad_norm": 0.20811563730239868, + "learning_rate": 4.707188176603038e-05, + "loss": 0.1824, + "num_input_tokens_seen": 5308344, + "step": 8705 + }, + { + "epoch": 2.402095973524545, + "grad_norm": 0.042688999325037, + "learning_rate": 4.70662282179003e-05, + "loss": 0.0021, + "num_input_tokens_seen": 5311384, + "step": 8710 + }, + { + "epoch": 2.4034749034749034, + "grad_norm": 10.583739280700684, + "learning_rate": 4.7060569557341896e-05, + "loss": 0.0905, + "num_input_tokens_seen": 5313976, + "step": 8715 + }, + { + "epoch": 2.404853833425262, + "grad_norm": 0.07278227061033249, + "learning_rate": 4.705490578566618e-05, + "loss": 0.003, + "num_input_tokens_seen": 5316984, + "step": 8720 + }, + { + "epoch": 2.4062327633756206, + "grad_norm": 0.5765966176986694, + "learning_rate": 4.704923690418539e-05, + "loss": 0.0062, + "num_input_tokens_seen": 5320952, + "step": 8725 + }, + { + "epoch": 2.407611693325979, + "grad_norm": 0.011819376610219479, + "learning_rate": 4.704356291421291e-05, + "loss": 0.0044, + "num_input_tokens_seen": 5323864, + "step": 8730 + }, + { + "epoch": 2.4089906232763374, + "grad_norm": 0.008882993832230568, + "learning_rate": 4.703788381706332e-05, + "loss": 0.001, + "num_input_tokens_seen": 5326808, + "step": 8735 + }, + { + "epoch": 2.410369553226696, + "grad_norm": 0.0019124305108562112, + "learning_rate": 4.70321996140524e-05, + "loss": 0.0014, + "num_input_tokens_seen": 5329304, + "step": 8740 + }, + { + "epoch": 2.4117484831770546, + "grad_norm": 0.7842869162559509, + "learning_rate": 4.7026510306497085e-05, + "loss": 0.0024, + "num_input_tokens_seen": 5332408, + "step": 8745 + }, + { + "epoch": 2.413127413127413, + "grad_norm": 0.03908592835068703, + "learning_rate": 4.702081589571551e-05, + "loss": 0.1084, + "num_input_tokens_seen": 5335192, + "step": 8750 + }, + { + "epoch": 2.414506343077772, + "grad_norm": 0.08864524215459824, + "learning_rate": 4.7015116383026984e-05, + "loss": 0.0678, + "num_input_tokens_seen": 5338840, + "step": 8755 + }, + { + "epoch": 2.41588527302813, + "grad_norm": 19.402191162109375, + "learning_rate": 4.700941176975201e-05, + "loss": 0.193, + "num_input_tokens_seen": 5341464, + "step": 8760 + }, + { + "epoch": 2.4172642029784885, + "grad_norm": 0.6172529458999634, + "learning_rate": 4.700370205721226e-05, + "loss": 0.0011, + "num_input_tokens_seen": 5344824, + "step": 8765 + }, + { + "epoch": 2.4186431329288474, + "grad_norm": 0.09809724241495132, + "learning_rate": 4.69979872467306e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5347672, + "step": 8770 + }, + { + "epoch": 2.4200220628792057, + "grad_norm": 0.002596144564449787, + "learning_rate": 4.699226733963105e-05, + "loss": 0.001, + "num_input_tokens_seen": 5350520, + "step": 8775 + }, + { + "epoch": 2.421400992829564, + "grad_norm": 0.22952012717723846, + "learning_rate": 4.698654233723885e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5354264, + "step": 8780 + }, + { + "epoch": 2.422779922779923, + "grad_norm": 0.014708043076097965, + "learning_rate": 4.6980812240880404e-05, + "loss": 0.0005, + "num_input_tokens_seen": 5357304, + "step": 8785 + }, + { + "epoch": 2.4241588527302813, + "grad_norm": 0.020894918590784073, + "learning_rate": 4.697507705188327e-05, + "loss": 0.0155, + "num_input_tokens_seen": 5360504, + "step": 8790 + }, + { + "epoch": 2.4255377826806397, + "grad_norm": 0.0037703653797507286, + "learning_rate": 4.696933677157623e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5362904, + "step": 8795 + }, + { + "epoch": 2.4269167126309985, + "grad_norm": 0.021561674773693085, + "learning_rate": 4.696359140128921e-05, + "loss": 0.0003, + "num_input_tokens_seen": 5365624, + "step": 8800 + }, + { + "epoch": 2.428295642581357, + "grad_norm": 0.005821035709232092, + "learning_rate": 4.695784094235335e-05, + "loss": 0.0022, + "num_input_tokens_seen": 5369112, + "step": 8805 + }, + { + "epoch": 2.4296745725317153, + "grad_norm": 0.058428164571523666, + "learning_rate": 4.6952085396100924e-05, + "loss": 0.1558, + "num_input_tokens_seen": 5371512, + "step": 8810 + }, + { + "epoch": 2.431053502482074, + "grad_norm": 0.4189552068710327, + "learning_rate": 4.6946324763865425e-05, + "loss": 0.0008, + "num_input_tokens_seen": 5375064, + "step": 8815 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 0.04185764864087105, + "learning_rate": 4.69405590469815e-05, + "loss": 0.038, + "num_input_tokens_seen": 5378136, + "step": 8820 + }, + { + "epoch": 2.433811362382791, + "grad_norm": 0.01143084280192852, + "learning_rate": 4.6934788246785e-05, + "loss": 0.0089, + "num_input_tokens_seen": 5382296, + "step": 8825 + }, + { + "epoch": 2.4351902923331497, + "grad_norm": 0.004074219148606062, + "learning_rate": 4.692901236461291e-05, + "loss": 0.0655, + "num_input_tokens_seen": 5386424, + "step": 8830 + }, + { + "epoch": 2.436569222283508, + "grad_norm": 0.18479104340076447, + "learning_rate": 4.6923231401803444e-05, + "loss": 0.0461, + "num_input_tokens_seen": 5388952, + "step": 8835 + }, + { + "epoch": 2.4379481522338664, + "grad_norm": 0.013900231570005417, + "learning_rate": 4.691744535969595e-05, + "loss": 0.0269, + "num_input_tokens_seen": 5391288, + "step": 8840 + }, + { + "epoch": 2.4393270821842252, + "grad_norm": 77.91913604736328, + "learning_rate": 4.691165423963099e-05, + "loss": 0.0598, + "num_input_tokens_seen": 5393912, + "step": 8845 + }, + { + "epoch": 2.4407060121345836, + "grad_norm": 0.002904615132138133, + "learning_rate": 4.690585804295026e-05, + "loss": 0.1343, + "num_input_tokens_seen": 5397080, + "step": 8850 + }, + { + "epoch": 2.442084942084942, + "grad_norm": 0.07822445034980774, + "learning_rate": 4.690005677099668e-05, + "loss": 0.0515, + "num_input_tokens_seen": 5399832, + "step": 8855 + }, + { + "epoch": 2.443463872035301, + "grad_norm": 0.04998864233493805, + "learning_rate": 4.6894250425114295e-05, + "loss": 0.001, + "num_input_tokens_seen": 5402808, + "step": 8860 + }, + { + "epoch": 2.444842801985659, + "grad_norm": 0.5291662216186523, + "learning_rate": 4.688843900664837e-05, + "loss": 0.0013, + "num_input_tokens_seen": 5405464, + "step": 8865 + }, + { + "epoch": 2.4462217319360176, + "grad_norm": 0.002702330471947789, + "learning_rate": 4.688262251694533e-05, + "loss": 0.0022, + "num_input_tokens_seen": 5408152, + "step": 8870 + }, + { + "epoch": 2.4476006618863764, + "grad_norm": 0.03593821078538895, + "learning_rate": 4.687680095735276e-05, + "loss": 0.1617, + "num_input_tokens_seen": 5411672, + "step": 8875 + }, + { + "epoch": 2.4489795918367347, + "grad_norm": 1.5397454500198364, + "learning_rate": 4.687097432921943e-05, + "loss": 0.0972, + "num_input_tokens_seen": 5415000, + "step": 8880 + }, + { + "epoch": 2.450358521787093, + "grad_norm": 9.968918800354004, + "learning_rate": 4.6865142633895295e-05, + "loss": 0.1051, + "num_input_tokens_seen": 5419192, + "step": 8885 + }, + { + "epoch": 2.4517374517374515, + "grad_norm": 1.5201336145401, + "learning_rate": 4.685930587273149e-05, + "loss": 0.0099, + "num_input_tokens_seen": 5422712, + "step": 8890 + }, + { + "epoch": 2.4531163816878103, + "grad_norm": 4.399312973022461, + "learning_rate": 4.6853464047080276e-05, + "loss": 0.0696, + "num_input_tokens_seen": 5425304, + "step": 8895 + }, + { + "epoch": 2.4544953116381687, + "grad_norm": 5.438581466674805, + "learning_rate": 4.684761715829514e-05, + "loss": 0.0576, + "num_input_tokens_seen": 5427704, + "step": 8900 + }, + { + "epoch": 2.4558742415885275, + "grad_norm": 0.011661121621727943, + "learning_rate": 4.684176520773072e-05, + "loss": 0.011, + "num_input_tokens_seen": 5430008, + "step": 8905 + }, + { + "epoch": 2.457253171538886, + "grad_norm": 0.034345611929893494, + "learning_rate": 4.683590819674282e-05, + "loss": 0.1214, + "num_input_tokens_seen": 5433720, + "step": 8910 + }, + { + "epoch": 2.4586321014892443, + "grad_norm": 0.01961633935570717, + "learning_rate": 4.6830046126688444e-05, + "loss": 0.0971, + "num_input_tokens_seen": 5436568, + "step": 8915 + }, + { + "epoch": 2.4600110314396026, + "grad_norm": 1.0893226861953735, + "learning_rate": 4.6824178998925726e-05, + "loss": 0.0039, + "num_input_tokens_seen": 5439928, + "step": 8920 + }, + { + "epoch": 2.4613899613899615, + "grad_norm": 0.01967366226017475, + "learning_rate": 4.681830681481402e-05, + "loss": 0.0014, + "num_input_tokens_seen": 5444280, + "step": 8925 + }, + { + "epoch": 2.46276889134032, + "grad_norm": 0.04733945429325104, + "learning_rate": 4.68124295757138e-05, + "loss": 0.0436, + "num_input_tokens_seen": 5447192, + "step": 8930 + }, + { + "epoch": 2.4641478212906787, + "grad_norm": 0.02472524531185627, + "learning_rate": 4.6806547282986764e-05, + "loss": 0.0022, + "num_input_tokens_seen": 5449400, + "step": 8935 + }, + { + "epoch": 2.465526751241037, + "grad_norm": 0.08817800879478455, + "learning_rate": 4.6800659937995736e-05, + "loss": 0.0017, + "num_input_tokens_seen": 5452568, + "step": 8940 + }, + { + "epoch": 2.4669056811913954, + "grad_norm": 0.02441728115081787, + "learning_rate": 4.679476754210474e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5455160, + "step": 8945 + }, + { + "epoch": 2.468284611141754, + "grad_norm": 0.07520361244678497, + "learning_rate": 4.678887009667896e-05, + "loss": 0.0007, + "num_input_tokens_seen": 5457592, + "step": 8950 + }, + { + "epoch": 2.4696635410921126, + "grad_norm": 0.000782095710746944, + "learning_rate": 4.678296760308474e-05, + "loss": 0.0093, + "num_input_tokens_seen": 5460664, + "step": 8955 + }, + { + "epoch": 2.471042471042471, + "grad_norm": 0.014373757876455784, + "learning_rate": 4.677706006268961e-05, + "loss": 0.0593, + "num_input_tokens_seen": 5464952, + "step": 8960 + }, + { + "epoch": 2.47242140099283, + "grad_norm": 0.004305767361074686, + "learning_rate": 4.6771147476862255e-05, + "loss": 0.0019, + "num_input_tokens_seen": 5467768, + "step": 8965 + }, + { + "epoch": 2.473800330943188, + "grad_norm": 0.01382092759013176, + "learning_rate": 4.676522984697255e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5470552, + "step": 8970 + }, + { + "epoch": 2.4751792608935466, + "grad_norm": 0.010495009832084179, + "learning_rate": 4.675930717439151e-05, + "loss": 0.0008, + "num_input_tokens_seen": 5473336, + "step": 8975 + }, + { + "epoch": 2.476558190843905, + "grad_norm": 0.49537038803100586, + "learning_rate": 4.675337946049134e-05, + "loss": 0.0011, + "num_input_tokens_seen": 5476344, + "step": 8980 + }, + { + "epoch": 2.4779371207942638, + "grad_norm": 0.06394021958112717, + "learning_rate": 4.6747446706645407e-05, + "loss": 0.0009, + "num_input_tokens_seen": 5479512, + "step": 8985 + }, + { + "epoch": 2.479316050744622, + "grad_norm": 0.005434753838926554, + "learning_rate": 4.674150891422824e-05, + "loss": 0.1189, + "num_input_tokens_seen": 5482776, + "step": 8990 + }, + { + "epoch": 2.4806949806949805, + "grad_norm": 8.305594444274902, + "learning_rate": 4.6735566084615556e-05, + "loss": 0.0798, + "num_input_tokens_seen": 5485048, + "step": 8995 + }, + { + "epoch": 2.4820739106453393, + "grad_norm": 0.03347676992416382, + "learning_rate": 4.67296182191842e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5487256, + "step": 9000 + }, + { + "epoch": 2.4834528405956977, + "grad_norm": 0.003948653116822243, + "learning_rate": 4.672366531931223e-05, + "loss": 0.0009, + "num_input_tokens_seen": 5489656, + "step": 9005 + }, + { + "epoch": 2.484831770546056, + "grad_norm": 0.11208905279636383, + "learning_rate": 4.671770738637883e-05, + "loss": 0.0008, + "num_input_tokens_seen": 5492952, + "step": 9010 + }, + { + "epoch": 2.486210700496415, + "grad_norm": 0.001648066914640367, + "learning_rate": 4.671174442176437e-05, + "loss": 0.0901, + "num_input_tokens_seen": 5495608, + "step": 9015 + }, + { + "epoch": 2.4875896304467733, + "grad_norm": 0.011942142620682716, + "learning_rate": 4.670577642685039e-05, + "loss": 0.0497, + "num_input_tokens_seen": 5498104, + "step": 9020 + }, + { + "epoch": 2.4889685603971317, + "grad_norm": 0.0010253688087686896, + "learning_rate": 4.6699803403019596e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5501144, + "step": 9025 + }, + { + "epoch": 2.4903474903474905, + "grad_norm": 0.10120375454425812, + "learning_rate": 4.669382535165583e-05, + "loss": 0.0011, + "num_input_tokens_seen": 5503736, + "step": 9030 + }, + { + "epoch": 2.491726420297849, + "grad_norm": 0.003131454810500145, + "learning_rate": 4.6687842274144136e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5506424, + "step": 9035 + }, + { + "epoch": 2.4931053502482072, + "grad_norm": 0.1423640251159668, + "learning_rate": 4.6681854171870695e-05, + "loss": 0.0013, + "num_input_tokens_seen": 5508952, + "step": 9040 + }, + { + "epoch": 2.494484280198566, + "grad_norm": 0.045378342270851135, + "learning_rate": 4.667586104622288e-05, + "loss": 0.0397, + "num_input_tokens_seen": 5513176, + "step": 9045 + }, + { + "epoch": 2.4958632101489244, + "grad_norm": 15.195469856262207, + "learning_rate": 4.6669862898589215e-05, + "loss": 0.0586, + "num_input_tokens_seen": 5515864, + "step": 9050 + }, + { + "epoch": 2.497242140099283, + "grad_norm": 0.008086958900094032, + "learning_rate": 4.666385973035936e-05, + "loss": 0.0008, + "num_input_tokens_seen": 5518808, + "step": 9055 + }, + { + "epoch": 2.4986210700496416, + "grad_norm": 0.15329612791538239, + "learning_rate": 4.665785154292418e-05, + "loss": 0.0008, + "num_input_tokens_seen": 5521592, + "step": 9060 + }, + { + "epoch": 2.5, + "grad_norm": 0.0029604234732687473, + "learning_rate": 4.665183833767569e-05, + "loss": 0.0007, + "num_input_tokens_seen": 5525176, + "step": 9065 + }, + { + "epoch": 2.5, + "eval_loss": 0.15005435049533844, + "eval_runtime": 28.5082, + "eval_samples_per_second": 56.545, + "eval_steps_per_second": 14.136, + "num_input_tokens_seen": 5525176, + "step": 9065 + }, + { + "epoch": 2.5013789299503584, + "grad_norm": 0.08254590630531311, + "learning_rate": 4.664582011600705e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5528568, + "step": 9070 + }, + { + "epoch": 2.502757859900717, + "grad_norm": 0.0011370591819286346, + "learning_rate": 4.66397968793126e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5532024, + "step": 9075 + }, + { + "epoch": 2.5041367898510756, + "grad_norm": 0.0008802907541394234, + "learning_rate": 4.663376862898785e-05, + "loss": 0.0784, + "num_input_tokens_seen": 5534744, + "step": 9080 + }, + { + "epoch": 2.505515719801434, + "grad_norm": 0.0011370385764166713, + "learning_rate": 4.6627735366429445e-05, + "loss": 0.0773, + "num_input_tokens_seen": 5537912, + "step": 9085 + }, + { + "epoch": 2.5068946497517928, + "grad_norm": 0.07917392998933792, + "learning_rate": 4.6621697093035205e-05, + "loss": 0.0043, + "num_input_tokens_seen": 5540472, + "step": 9090 + }, + { + "epoch": 2.508273579702151, + "grad_norm": 0.0004095729091204703, + "learning_rate": 4.661565381020412e-05, + "loss": 0.0005, + "num_input_tokens_seen": 5542648, + "step": 9095 + }, + { + "epoch": 2.5096525096525095, + "grad_norm": 0.04088958725333214, + "learning_rate": 4.6609605519336326e-05, + "loss": 0.0942, + "num_input_tokens_seen": 5545048, + "step": 9100 + }, + { + "epoch": 2.511031439602868, + "grad_norm": 0.005595454014837742, + "learning_rate": 4.660355222183312e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5549048, + "step": 9105 + }, + { + "epoch": 2.5124103695532267, + "grad_norm": 0.004533441737294197, + "learning_rate": 4.659749391909698e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5551480, + "step": 9110 + }, + { + "epoch": 2.513789299503585, + "grad_norm": 0.025853829458355904, + "learning_rate": 4.6591430612531515e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5554520, + "step": 9115 + }, + { + "epoch": 2.515168229453944, + "grad_norm": 0.003870980581268668, + "learning_rate": 4.658536230354151e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5558008, + "step": 9120 + }, + { + "epoch": 2.5165471594043023, + "grad_norm": 16.31989288330078, + "learning_rate": 4.657928899353291e-05, + "loss": 0.1192, + "num_input_tokens_seen": 5560632, + "step": 9125 + }, + { + "epoch": 2.5179260893546607, + "grad_norm": 0.046461910009384155, + "learning_rate": 4.6573210683912796e-05, + "loss": 0.0016, + "num_input_tokens_seen": 5564728, + "step": 9130 + }, + { + "epoch": 2.519305019305019, + "grad_norm": 0.004608998540788889, + "learning_rate": 4.656712737608945e-05, + "loss": 0.063, + "num_input_tokens_seen": 5568216, + "step": 9135 + }, + { + "epoch": 2.520683949255378, + "grad_norm": 0.0007407746743410826, + "learning_rate": 4.656103907147227e-05, + "loss": 0.0013, + "num_input_tokens_seen": 5571800, + "step": 9140 + }, + { + "epoch": 2.5220628792057362, + "grad_norm": 0.006712688598781824, + "learning_rate": 4.655494577147183e-05, + "loss": 0.0005, + "num_input_tokens_seen": 5574616, + "step": 9145 + }, + { + "epoch": 2.523441809156095, + "grad_norm": 0.03967801108956337, + "learning_rate": 4.654884747749987e-05, + "loss": 0.0414, + "num_input_tokens_seen": 5577816, + "step": 9150 + }, + { + "epoch": 2.5248207391064534, + "grad_norm": 0.01342230848968029, + "learning_rate": 4.654274419096927e-05, + "loss": 0.0007, + "num_input_tokens_seen": 5581208, + "step": 9155 + }, + { + "epoch": 2.526199669056812, + "grad_norm": 4.574436664581299, + "learning_rate": 4.6536635913294065e-05, + "loss": 0.0662, + "num_input_tokens_seen": 5584184, + "step": 9160 + }, + { + "epoch": 2.52757859900717, + "grad_norm": 0.0006113109411671758, + "learning_rate": 4.6530522645889475e-05, + "loss": 0.0003, + "num_input_tokens_seen": 5586808, + "step": 9165 + }, + { + "epoch": 2.528957528957529, + "grad_norm": 27.7352294921875, + "learning_rate": 4.652440439017184e-05, + "loss": 0.0973, + "num_input_tokens_seen": 5590264, + "step": 9170 + }, + { + "epoch": 2.5303364589078874, + "grad_norm": 0.010999603196978569, + "learning_rate": 4.6518281147558675e-05, + "loss": 0.0046, + "num_input_tokens_seen": 5594072, + "step": 9175 + }, + { + "epoch": 2.531715388858246, + "grad_norm": 7.742186069488525, + "learning_rate": 4.651215291946866e-05, + "loss": 0.0275, + "num_input_tokens_seen": 5597464, + "step": 9180 + }, + { + "epoch": 2.5330943188086046, + "grad_norm": 0.011926966719329357, + "learning_rate": 4.65060197073216e-05, + "loss": 0.0612, + "num_input_tokens_seen": 5601560, + "step": 9185 + }, + { + "epoch": 2.534473248758963, + "grad_norm": 0.04240646958351135, + "learning_rate": 4.649988151253849e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5604984, + "step": 9190 + }, + { + "epoch": 2.5358521787093213, + "grad_norm": 0.03210311383008957, + "learning_rate": 4.6493738336541446e-05, + "loss": 0.0035, + "num_input_tokens_seen": 5608120, + "step": 9195 + }, + { + "epoch": 2.53723110865968, + "grad_norm": 0.05674458667635918, + "learning_rate": 4.648759018075376e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5610328, + "step": 9200 + }, + { + "epoch": 2.5386100386100385, + "grad_norm": 0.23490676283836365, + "learning_rate": 4.648143704659987e-05, + "loss": 0.006, + "num_input_tokens_seen": 5612984, + "step": 9205 + }, + { + "epoch": 2.5399889685603974, + "grad_norm": 0.017901917919516563, + "learning_rate": 4.647527893550537e-05, + "loss": 0.0005, + "num_input_tokens_seen": 5616440, + "step": 9210 + }, + { + "epoch": 2.5413678985107557, + "grad_norm": 0.0004636941594071686, + "learning_rate": 4.6469115848897e-05, + "loss": 0.0009, + "num_input_tokens_seen": 5619064, + "step": 9215 + }, + { + "epoch": 2.542746828461114, + "grad_norm": 0.0005845716805197299, + "learning_rate": 4.6462947788202674e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5622168, + "step": 9220 + }, + { + "epoch": 2.5441257584114725, + "grad_norm": 0.0015458014095202088, + "learning_rate": 4.645677475485143e-05, + "loss": 0.0009, + "num_input_tokens_seen": 5624632, + "step": 9225 + }, + { + "epoch": 2.5455046883618313, + "grad_norm": 0.04051494225859642, + "learning_rate": 4.645059675027348e-05, + "loss": 0.0753, + "num_input_tokens_seen": 5628152, + "step": 9230 + }, + { + "epoch": 2.5468836183121897, + "grad_norm": 0.00130240258295089, + "learning_rate": 4.6444413775900165e-05, + "loss": 0.0551, + "num_input_tokens_seen": 5630776, + "step": 9235 + }, + { + "epoch": 2.5482625482625485, + "grad_norm": 0.08792446553707123, + "learning_rate": 4.643822583316401e-05, + "loss": 0.0083, + "num_input_tokens_seen": 5633592, + "step": 9240 + }, + { + "epoch": 2.549641478212907, + "grad_norm": 0.041054874658584595, + "learning_rate": 4.6432032923498656e-05, + "loss": 0.0776, + "num_input_tokens_seen": 5636664, + "step": 9245 + }, + { + "epoch": 2.5510204081632653, + "grad_norm": 30.00379180908203, + "learning_rate": 4.642583504833892e-05, + "loss": 0.136, + "num_input_tokens_seen": 5639832, + "step": 9250 + }, + { + "epoch": 2.5523993381136236, + "grad_norm": 0.002463668119162321, + "learning_rate": 4.641963220912076e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5642840, + "step": 9255 + }, + { + "epoch": 2.5537782680639824, + "grad_norm": 0.008395181968808174, + "learning_rate": 4.641342440728128e-05, + "loss": 0.0451, + "num_input_tokens_seen": 5645272, + "step": 9260 + }, + { + "epoch": 2.555157198014341, + "grad_norm": 0.00406790804117918, + "learning_rate": 4.6407211644258744e-05, + "loss": 0.036, + "num_input_tokens_seen": 5649080, + "step": 9265 + }, + { + "epoch": 2.5565361279646996, + "grad_norm": 0.08115560561418533, + "learning_rate": 4.640099392149255e-05, + "loss": 0.0014, + "num_input_tokens_seen": 5653080, + "step": 9270 + }, + { + "epoch": 2.557915057915058, + "grad_norm": 0.000694934802595526, + "learning_rate": 4.6394771240423274e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5655992, + "step": 9275 + }, + { + "epoch": 2.5592939878654164, + "grad_norm": 0.12480081617832184, + "learning_rate": 4.638854360249261e-05, + "loss": 0.0003, + "num_input_tokens_seen": 5658552, + "step": 9280 + }, + { + "epoch": 2.5606729178157748, + "grad_norm": 0.008272179402410984, + "learning_rate": 4.638231100914341e-05, + "loss": 0.0008, + "num_input_tokens_seen": 5661592, + "step": 9285 + }, + { + "epoch": 2.5620518477661336, + "grad_norm": 0.0025910816621035337, + "learning_rate": 4.637607346181969e-05, + "loss": 0.0569, + "num_input_tokens_seen": 5665368, + "step": 9290 + }, + { + "epoch": 2.563430777716492, + "grad_norm": 0.0039826626889407635, + "learning_rate": 4.636983096196658e-05, + "loss": 0.0009, + "num_input_tokens_seen": 5668728, + "step": 9295 + }, + { + "epoch": 2.564809707666851, + "grad_norm": 0.0006319154053926468, + "learning_rate": 4.6363583511030384e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5671928, + "step": 9300 + }, + { + "epoch": 2.566188637617209, + "grad_norm": 0.009487951174378395, + "learning_rate": 4.635733111045856e-05, + "loss": 0.0776, + "num_input_tokens_seen": 5674744, + "step": 9305 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.2733803391456604, + "learning_rate": 4.635107376169968e-05, + "loss": 0.0008, + "num_input_tokens_seen": 5677016, + "step": 9310 + }, + { + "epoch": 2.568946497517926, + "grad_norm": 0.0046328273601830006, + "learning_rate": 4.6344811466203506e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5679768, + "step": 9315 + }, + { + "epoch": 2.5703254274682847, + "grad_norm": 0.0006797423120588064, + "learning_rate": 4.6338544225420896e-05, + "loss": 0.0355, + "num_input_tokens_seen": 5683064, + "step": 9320 + }, + { + "epoch": 2.571704357418643, + "grad_norm": 0.004093751776963472, + "learning_rate": 4.6332272040803895e-05, + "loss": 0.0898, + "num_input_tokens_seen": 5686168, + "step": 9325 + }, + { + "epoch": 2.5730832873690015, + "grad_norm": 0.012873445637524128, + "learning_rate": 4.632599491380567e-05, + "loss": 0.0416, + "num_input_tokens_seen": 5688888, + "step": 9330 + }, + { + "epoch": 2.5744622173193603, + "grad_norm": 0.023885933682322502, + "learning_rate": 4.631971284588055e-05, + "loss": 0.0763, + "num_input_tokens_seen": 5692856, + "step": 9335 + }, + { + "epoch": 2.5758411472697187, + "grad_norm": 0.021867254748940468, + "learning_rate": 4.6313425838484e-05, + "loss": 0.0029, + "num_input_tokens_seen": 5695736, + "step": 9340 + }, + { + "epoch": 2.577220077220077, + "grad_norm": 0.136541947722435, + "learning_rate": 4.6307133893072616e-05, + "loss": 0.0517, + "num_input_tokens_seen": 5699256, + "step": 9345 + }, + { + "epoch": 2.578599007170436, + "grad_norm": 0.003035089699551463, + "learning_rate": 4.630083701110417e-05, + "loss": 0.0043, + "num_input_tokens_seen": 5701816, + "step": 9350 + }, + { + "epoch": 2.5799779371207943, + "grad_norm": 0.043144065886735916, + "learning_rate": 4.6294535194037546e-05, + "loss": 0.0961, + "num_input_tokens_seen": 5706360, + "step": 9355 + }, + { + "epoch": 2.5813568670711526, + "grad_norm": 0.01880503259599209, + "learning_rate": 4.628822844333278e-05, + "loss": 0.0375, + "num_input_tokens_seen": 5709688, + "step": 9360 + }, + { + "epoch": 2.5827357970215115, + "grad_norm": 0.0008439924567937851, + "learning_rate": 4.6281916760451074e-05, + "loss": 0.0038, + "num_input_tokens_seen": 5712408, + "step": 9365 + }, + { + "epoch": 2.58411472697187, + "grad_norm": 0.018823981285095215, + "learning_rate": 4.6275600146854745e-05, + "loss": 0.003, + "num_input_tokens_seen": 5714904, + "step": 9370 + }, + { + "epoch": 2.585493656922228, + "grad_norm": 0.03270116075873375, + "learning_rate": 4.6269278604007256e-05, + "loss": 0.0007, + "num_input_tokens_seen": 5717560, + "step": 9375 + }, + { + "epoch": 2.586872586872587, + "grad_norm": 0.05542680248618126, + "learning_rate": 4.626295213337322e-05, + "loss": 0.1106, + "num_input_tokens_seen": 5720184, + "step": 9380 + }, + { + "epoch": 2.5882515168229454, + "grad_norm": 0.020366976037621498, + "learning_rate": 4.62566207364184e-05, + "loss": 0.0003, + "num_input_tokens_seen": 5722488, + "step": 9385 + }, + { + "epoch": 2.589630446773304, + "grad_norm": 0.0001640630216570571, + "learning_rate": 4.625028441460968e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5727288, + "step": 9390 + }, + { + "epoch": 2.5910093767236626, + "grad_norm": 0.42988818883895874, + "learning_rate": 4.624394316941509e-05, + "loss": 0.0776, + "num_input_tokens_seen": 5729784, + "step": 9395 + }, + { + "epoch": 2.592388306674021, + "grad_norm": 18.40535545349121, + "learning_rate": 4.6237597002303826e-05, + "loss": 0.0115, + "num_input_tokens_seen": 5732664, + "step": 9400 + }, + { + "epoch": 2.5937672366243794, + "grad_norm": 0.13042059540748596, + "learning_rate": 4.6231245914746166e-05, + "loss": 0.0655, + "num_input_tokens_seen": 5735320, + "step": 9405 + }, + { + "epoch": 2.5951461665747377, + "grad_norm": 0.005219290032982826, + "learning_rate": 4.622488990821361e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5738328, + "step": 9410 + }, + { + "epoch": 2.5965250965250966, + "grad_norm": 4.7427897453308105, + "learning_rate": 4.621852898417873e-05, + "loss": 0.0591, + "num_input_tokens_seen": 5741432, + "step": 9415 + }, + { + "epoch": 2.597904026475455, + "grad_norm": 0.004313127137720585, + "learning_rate": 4.621216314411526e-05, + "loss": 0.0036, + "num_input_tokens_seen": 5746552, + "step": 9420 + }, + { + "epoch": 2.5992829564258138, + "grad_norm": 0.1133820191025734, + "learning_rate": 4.620579238949808e-05, + "loss": 0.0591, + "num_input_tokens_seen": 5749752, + "step": 9425 + }, + { + "epoch": 2.600661886376172, + "grad_norm": 0.02416601963341236, + "learning_rate": 4.61994167218032e-05, + "loss": 0.0007, + "num_input_tokens_seen": 5752856, + "step": 9430 + }, + { + "epoch": 2.6020408163265305, + "grad_norm": 0.05321787670254707, + "learning_rate": 4.619303614250777e-05, + "loss": 0.0013, + "num_input_tokens_seen": 5756536, + "step": 9435 + }, + { + "epoch": 2.603419746276889, + "grad_norm": 0.0025856713764369488, + "learning_rate": 4.618665065309008e-05, + "loss": 0.0013, + "num_input_tokens_seen": 5759800, + "step": 9440 + }, + { + "epoch": 2.6047986762272477, + "grad_norm": 0.07453746348619461, + "learning_rate": 4.618026025502956e-05, + "loss": 0.0819, + "num_input_tokens_seen": 5762552, + "step": 9445 + }, + { + "epoch": 2.606177606177606, + "grad_norm": 0.011901408433914185, + "learning_rate": 4.617386494980676e-05, + "loss": 0.1226, + "num_input_tokens_seen": 5765208, + "step": 9450 + }, + { + "epoch": 2.607556536127965, + "grad_norm": 0.10020314157009125, + "learning_rate": 4.61674647389034e-05, + "loss": 0.0005, + "num_input_tokens_seen": 5768312, + "step": 9455 + }, + { + "epoch": 2.6089354660783233, + "grad_norm": 7.509307861328125, + "learning_rate": 4.6161059623802295e-05, + "loss": 0.132, + "num_input_tokens_seen": 5770616, + "step": 9460 + }, + { + "epoch": 2.6103143960286816, + "grad_norm": 16.606809616088867, + "learning_rate": 4.6154649605987436e-05, + "loss": 0.1225, + "num_input_tokens_seen": 5773176, + "step": 9465 + }, + { + "epoch": 2.61169332597904, + "grad_norm": 0.3994920551776886, + "learning_rate": 4.614823468694393e-05, + "loss": 0.0547, + "num_input_tokens_seen": 5775288, + "step": 9470 + }, + { + "epoch": 2.613072255929399, + "grad_norm": 12.727315902709961, + "learning_rate": 4.6141814868158014e-05, + "loss": 0.0123, + "num_input_tokens_seen": 5778648, + "step": 9475 + }, + { + "epoch": 2.6144511858797572, + "grad_norm": 0.13862626254558563, + "learning_rate": 4.613539015111707e-05, + "loss": 0.0065, + "num_input_tokens_seen": 5781208, + "step": 9480 + }, + { + "epoch": 2.615830115830116, + "grad_norm": 0.006509636528789997, + "learning_rate": 4.612896053730962e-05, + "loss": 0.0018, + "num_input_tokens_seen": 5783928, + "step": 9485 + }, + { + "epoch": 2.6172090457804744, + "grad_norm": 0.17267553508281708, + "learning_rate": 4.61225260282253e-05, + "loss": 0.0185, + "num_input_tokens_seen": 5786296, + "step": 9490 + }, + { + "epoch": 2.618587975730833, + "grad_norm": 0.0056494055315852165, + "learning_rate": 4.61160866253549e-05, + "loss": 0.0018, + "num_input_tokens_seen": 5789336, + "step": 9495 + }, + { + "epoch": 2.619966905681191, + "grad_norm": 0.27493366599082947, + "learning_rate": 4.610964233019035e-05, + "loss": 0.0036, + "num_input_tokens_seen": 5792440, + "step": 9500 + }, + { + "epoch": 2.62134583563155, + "grad_norm": 0.010725991800427437, + "learning_rate": 4.6103193144224676e-05, + "loss": 0.0549, + "num_input_tokens_seen": 5795576, + "step": 9505 + }, + { + "epoch": 2.6227247655819084, + "grad_norm": 0.023195629939436913, + "learning_rate": 4.609673906895208e-05, + "loss": 0.009, + "num_input_tokens_seen": 5799288, + "step": 9510 + }, + { + "epoch": 2.624103695532267, + "grad_norm": 0.013851524330675602, + "learning_rate": 4.609028010586788e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5801976, + "step": 9515 + }, + { + "epoch": 2.6254826254826256, + "grad_norm": 0.4255392253398895, + "learning_rate": 4.608381625646851e-05, + "loss": 0.0009, + "num_input_tokens_seen": 5805048, + "step": 9520 + }, + { + "epoch": 2.626861555432984, + "grad_norm": 0.07722406834363937, + "learning_rate": 4.6077347522251556e-05, + "loss": 0.0861, + "num_input_tokens_seen": 5807992, + "step": 9525 + }, + { + "epoch": 2.6282404853833423, + "grad_norm": 0.014901147224009037, + "learning_rate": 4.607087390471574e-05, + "loss": 0.0003, + "num_input_tokens_seen": 5810424, + "step": 9530 + }, + { + "epoch": 2.629619415333701, + "grad_norm": 0.02276097796857357, + "learning_rate": 4.6064395405360904e-05, + "loss": 0.0244, + "num_input_tokens_seen": 5812824, + "step": 9535 + }, + { + "epoch": 2.6309983452840595, + "grad_norm": 0.003588685765862465, + "learning_rate": 4.605791202568801e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5815768, + "step": 9540 + }, + { + "epoch": 2.6323772752344183, + "grad_norm": 0.08102650940418243, + "learning_rate": 4.605142376719918e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5818968, + "step": 9545 + }, + { + "epoch": 2.6337562051847767, + "grad_norm": 0.0038093782495707273, + "learning_rate": 4.604493063139764e-05, + "loss": 0.0275, + "num_input_tokens_seen": 5821816, + "step": 9550 + }, + { + "epoch": 2.635135135135135, + "grad_norm": 0.008435815572738647, + "learning_rate": 4.603843261978777e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5826296, + "step": 9555 + }, + { + "epoch": 2.6365140650854935, + "grad_norm": 0.2784048616886139, + "learning_rate": 4.6031929733875046e-05, + "loss": 0.001, + "num_input_tokens_seen": 5828376, + "step": 9560 + }, + { + "epoch": 2.6378929950358523, + "grad_norm": 0.006387496832758188, + "learning_rate": 4.6025421975166114e-05, + "loss": 0.0817, + "num_input_tokens_seen": 5830712, + "step": 9565 + }, + { + "epoch": 2.6392719249862107, + "grad_norm": 0.053163621574640274, + "learning_rate": 4.601890934516871e-05, + "loss": 0.0942, + "num_input_tokens_seen": 5835448, + "step": 9570 + }, + { + "epoch": 2.6406508549365695, + "grad_norm": 0.3285813331604004, + "learning_rate": 4.6012391845391725e-05, + "loss": 0.0329, + "num_input_tokens_seen": 5838104, + "step": 9575 + }, + { + "epoch": 2.642029784886928, + "grad_norm": 0.009809989482164383, + "learning_rate": 4.6005869477345175e-05, + "loss": 0.0005, + "num_input_tokens_seen": 5841848, + "step": 9580 + }, + { + "epoch": 2.6434087148372862, + "grad_norm": 0.08584020286798477, + "learning_rate": 4.599934224254019e-05, + "loss": 0.1175, + "num_input_tokens_seen": 5845144, + "step": 9585 + }, + { + "epoch": 2.6447876447876446, + "grad_norm": 17.272863388061523, + "learning_rate": 4.599281014248904e-05, + "loss": 0.0564, + "num_input_tokens_seen": 5848120, + "step": 9590 + }, + { + "epoch": 2.6461665747380034, + "grad_norm": 10.540135383605957, + "learning_rate": 4.598627317870512e-05, + "loss": 0.0069, + "num_input_tokens_seen": 5852600, + "step": 9595 + }, + { + "epoch": 2.647545504688362, + "grad_norm": 0.5417038202285767, + "learning_rate": 4.597973135270296e-05, + "loss": 0.1001, + "num_input_tokens_seen": 5855416, + "step": 9600 + }, + { + "epoch": 2.6489244346387206, + "grad_norm": 0.2230595201253891, + "learning_rate": 4.5973184665998186e-05, + "loss": 0.0121, + "num_input_tokens_seen": 5857848, + "step": 9605 + }, + { + "epoch": 2.650303364589079, + "grad_norm": 0.22087812423706055, + "learning_rate": 4.596663312010758e-05, + "loss": 0.0184, + "num_input_tokens_seen": 5860344, + "step": 9610 + }, + { + "epoch": 2.6516822945394374, + "grad_norm": 0.5125907063484192, + "learning_rate": 4.5960076716549055e-05, + "loss": 0.0056, + "num_input_tokens_seen": 5862712, + "step": 9615 + }, + { + "epoch": 2.6530612244897958, + "grad_norm": 0.026497939601540565, + "learning_rate": 4.5953515456841624e-05, + "loss": 0.2018, + "num_input_tokens_seen": 5865208, + "step": 9620 + }, + { + "epoch": 2.6544401544401546, + "grad_norm": 0.03402886912226677, + "learning_rate": 4.594694934250543e-05, + "loss": 0.0489, + "num_input_tokens_seen": 5868312, + "step": 9625 + }, + { + "epoch": 2.655819084390513, + "grad_norm": 0.052977584302425385, + "learning_rate": 4.5940378375061755e-05, + "loss": 0.002, + "num_input_tokens_seen": 5871128, + "step": 9630 + }, + { + "epoch": 2.6571980143408713, + "grad_norm": 0.3108276426792145, + "learning_rate": 4.5933802556033e-05, + "loss": 0.0028, + "num_input_tokens_seen": 5873688, + "step": 9635 + }, + { + "epoch": 2.65857694429123, + "grad_norm": 0.07926106452941895, + "learning_rate": 4.592722188694269e-05, + "loss": 0.0012, + "num_input_tokens_seen": 5876792, + "step": 9640 + }, + { + "epoch": 2.6599558742415885, + "grad_norm": 0.0027469387277960777, + "learning_rate": 4.592063636931546e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5879224, + "step": 9645 + }, + { + "epoch": 2.661334804191947, + "grad_norm": 0.16905632615089417, + "learning_rate": 4.591404600467709e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5882584, + "step": 9650 + }, + { + "epoch": 2.6627137341423057, + "grad_norm": 0.08846912533044815, + "learning_rate": 4.5907450794554476e-05, + "loss": 0.0028, + "num_input_tokens_seen": 5885816, + "step": 9655 + }, + { + "epoch": 2.664092664092664, + "grad_norm": 0.006241519469767809, + "learning_rate": 4.5900850740475616e-05, + "loss": 0.0001, + "num_input_tokens_seen": 5888632, + "step": 9660 + }, + { + "epoch": 2.6654715940430225, + "grad_norm": 0.012178421020507812, + "learning_rate": 4.5894245843969664e-05, + "loss": 0.0964, + "num_input_tokens_seen": 5891768, + "step": 9665 + }, + { + "epoch": 2.6668505239933813, + "grad_norm": 0.0002099516859743744, + "learning_rate": 4.588763610656687e-05, + "loss": 0.0002, + "num_input_tokens_seen": 5895256, + "step": 9670 + }, + { + "epoch": 2.6682294539437397, + "grad_norm": 18.38767433166504, + "learning_rate": 4.588102152979863e-05, + "loss": 0.0694, + "num_input_tokens_seen": 5898136, + "step": 9675 + }, + { + "epoch": 2.669608383894098, + "grad_norm": 1.3781605958938599, + "learning_rate": 4.587440211519743e-05, + "loss": 0.0023, + "num_input_tokens_seen": 5900792, + "step": 9680 + }, + { + "epoch": 2.670987313844457, + "grad_norm": 0.14994552731513977, + "learning_rate": 4.58677778642969e-05, + "loss": 0.0026, + "num_input_tokens_seen": 5903832, + "step": 9685 + }, + { + "epoch": 2.6723662437948152, + "grad_norm": 5.795015811920166, + "learning_rate": 4.586114877863178e-05, + "loss": 0.0968, + "num_input_tokens_seen": 5907288, + "step": 9690 + }, + { + "epoch": 2.6737451737451736, + "grad_norm": 0.0009342428529635072, + "learning_rate": 4.5854514859737945e-05, + "loss": 0.0307, + "num_input_tokens_seen": 5910264, + "step": 9695 + }, + { + "epoch": 2.6751241036955324, + "grad_norm": 15.784904479980469, + "learning_rate": 4.5847876109152365e-05, + "loss": 0.0698, + "num_input_tokens_seen": 5913080, + "step": 9700 + }, + { + "epoch": 2.676503033645891, + "grad_norm": 0.0032608818728476763, + "learning_rate": 4.584123252841316e-05, + "loss": 0.0003, + "num_input_tokens_seen": 5915928, + "step": 9705 + }, + { + "epoch": 2.677881963596249, + "grad_norm": 0.05788255110383034, + "learning_rate": 4.583458411905953e-05, + "loss": 0.0009, + "num_input_tokens_seen": 5919000, + "step": 9710 + }, + { + "epoch": 2.6792608935466076, + "grad_norm": 29.551162719726562, + "learning_rate": 4.582793088263183e-05, + "loss": 0.027, + "num_input_tokens_seen": 5922072, + "step": 9715 + }, + { + "epoch": 2.6806398234969664, + "grad_norm": 0.08208800107240677, + "learning_rate": 4.5821272820671525e-05, + "loss": 0.0497, + "num_input_tokens_seen": 5924632, + "step": 9720 + }, + { + "epoch": 2.6820187534473248, + "grad_norm": 0.0033007559832185507, + "learning_rate": 4.5814609934721184e-05, + "loss": 0.0458, + "num_input_tokens_seen": 5927384, + "step": 9725 + }, + { + "epoch": 2.6833976833976836, + "grad_norm": 0.010448412969708443, + "learning_rate": 4.5807942226324494e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5929976, + "step": 9730 + }, + { + "epoch": 2.684776613348042, + "grad_norm": 0.0005915071233175695, + "learning_rate": 4.5801269697026285e-05, + "loss": 0.059, + "num_input_tokens_seen": 5932248, + "step": 9735 + }, + { + "epoch": 2.6861555432984003, + "grad_norm": 4.001693248748779, + "learning_rate": 4.579459234837247e-05, + "loss": 0.1711, + "num_input_tokens_seen": 5935640, + "step": 9740 + }, + { + "epoch": 2.6875344732487587, + "grad_norm": 2.2077879905700684, + "learning_rate": 4.578791018191011e-05, + "loss": 0.0771, + "num_input_tokens_seen": 5939096, + "step": 9745 + }, + { + "epoch": 2.6889134031991175, + "grad_norm": 0.00484183244407177, + "learning_rate": 4.578122319918735e-05, + "loss": 0.0011, + "num_input_tokens_seen": 5941592, + "step": 9750 + }, + { + "epoch": 2.690292333149476, + "grad_norm": 12.002338409423828, + "learning_rate": 4.5774531401753485e-05, + "loss": 0.1041, + "num_input_tokens_seen": 5944664, + "step": 9755 + }, + { + "epoch": 2.6916712630998347, + "grad_norm": 0.30136626958847046, + "learning_rate": 4.57678347911589e-05, + "loss": 0.0014, + "num_input_tokens_seen": 5947480, + "step": 9760 + }, + { + "epoch": 2.693050193050193, + "grad_norm": 0.012166544795036316, + "learning_rate": 4.57611333689551e-05, + "loss": 0.0621, + "num_input_tokens_seen": 5949912, + "step": 9765 + }, + { + "epoch": 2.6944291230005515, + "grad_norm": 0.029530344530940056, + "learning_rate": 4.575442713669471e-05, + "loss": 0.0013, + "num_input_tokens_seen": 5952184, + "step": 9770 + }, + { + "epoch": 2.69580805295091, + "grad_norm": 0.006176419090479612, + "learning_rate": 4.574771609593148e-05, + "loss": 0.0027, + "num_input_tokens_seen": 5955576, + "step": 9775 + }, + { + "epoch": 2.6971869829012687, + "grad_norm": 0.10868306457996368, + "learning_rate": 4.574100024822024e-05, + "loss": 0.064, + "num_input_tokens_seen": 5958264, + "step": 9780 + }, + { + "epoch": 2.698565912851627, + "grad_norm": 2.547797441482544, + "learning_rate": 4.573427959511698e-05, + "loss": 0.0074, + "num_input_tokens_seen": 5961400, + "step": 9785 + }, + { + "epoch": 2.699944842801986, + "grad_norm": 0.06361854076385498, + "learning_rate": 4.572755413817876e-05, + "loss": 0.0008, + "num_input_tokens_seen": 5964120, + "step": 9790 + }, + { + "epoch": 2.7013237727523443, + "grad_norm": 0.07127375155687332, + "learning_rate": 4.572082387896378e-05, + "loss": 0.0015, + "num_input_tokens_seen": 5966744, + "step": 9795 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 0.5351614356040955, + "learning_rate": 4.571408881903134e-05, + "loss": 0.0035, + "num_input_tokens_seen": 5970264, + "step": 9800 + }, + { + "epoch": 2.704081632653061, + "grad_norm": 0.009015747345983982, + "learning_rate": 4.570734895994186e-05, + "loss": 0.1695, + "num_input_tokens_seen": 5973624, + "step": 9805 + }, + { + "epoch": 2.70546056260342, + "grad_norm": 0.06932578235864639, + "learning_rate": 4.570060430325687e-05, + "loss": 0.0004, + "num_input_tokens_seen": 5978008, + "step": 9810 + }, + { + "epoch": 2.706839492553778, + "grad_norm": 0.05318516120314598, + "learning_rate": 4.569385485053901e-05, + "loss": 0.0009, + "num_input_tokens_seen": 5980792, + "step": 9815 + }, + { + "epoch": 2.708218422504137, + "grad_norm": 0.16174224019050598, + "learning_rate": 4.568710060335202e-05, + "loss": 0.0826, + "num_input_tokens_seen": 5983160, + "step": 9820 + }, + { + "epoch": 2.7095973524544954, + "grad_norm": 0.0115178432315588, + "learning_rate": 4.5680341563260785e-05, + "loss": 0.1387, + "num_input_tokens_seen": 5987320, + "step": 9825 + }, + { + "epoch": 2.7109762824048538, + "grad_norm": 0.09193850308656693, + "learning_rate": 4.5673577731831255e-05, + "loss": 0.0024, + "num_input_tokens_seen": 5990264, + "step": 9830 + }, + { + "epoch": 2.712355212355212, + "grad_norm": 0.05691106617450714, + "learning_rate": 4.5666809110630525e-05, + "loss": 0.0032, + "num_input_tokens_seen": 5992440, + "step": 9835 + }, + { + "epoch": 2.713734142305571, + "grad_norm": 0.14908695220947266, + "learning_rate": 4.566003570122678e-05, + "loss": 0.0052, + "num_input_tokens_seen": 5995800, + "step": 9840 + }, + { + "epoch": 2.7151130722559293, + "grad_norm": 0.0034089148975908756, + "learning_rate": 4.565325750518933e-05, + "loss": 0.0006, + "num_input_tokens_seen": 5998648, + "step": 9845 + }, + { + "epoch": 2.716492002206288, + "grad_norm": 0.026375167071819305, + "learning_rate": 4.564647452408858e-05, + "loss": 0.0063, + "num_input_tokens_seen": 6001080, + "step": 9850 + }, + { + "epoch": 2.7178709321566465, + "grad_norm": 0.05737481266260147, + "learning_rate": 4.5639686759496054e-05, + "loss": 0.001, + "num_input_tokens_seen": 6003928, + "step": 9855 + }, + { + "epoch": 2.719249862107005, + "grad_norm": 0.00499329948797822, + "learning_rate": 4.563289421298437e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6006776, + "step": 9860 + }, + { + "epoch": 2.7206287920573633, + "grad_norm": 0.05274489149451256, + "learning_rate": 4.562609688612728e-05, + "loss": 0.0196, + "num_input_tokens_seen": 6009304, + "step": 9865 + }, + { + "epoch": 2.722007722007722, + "grad_norm": 0.007098335772752762, + "learning_rate": 4.561929478049961e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6012664, + "step": 9870 + }, + { + "epoch": 2.7233866519580805, + "grad_norm": 0.016880780458450317, + "learning_rate": 4.561248789767731e-05, + "loss": 0.0018, + "num_input_tokens_seen": 6015352, + "step": 9875 + }, + { + "epoch": 2.7247655819084393, + "grad_norm": 0.04231645539402962, + "learning_rate": 4.560567623923746e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6018936, + "step": 9880 + }, + { + "epoch": 2.7261445118587977, + "grad_norm": 0.03882220759987831, + "learning_rate": 4.55988598067582e-05, + "loss": 0.0013, + "num_input_tokens_seen": 6022808, + "step": 9885 + }, + { + "epoch": 2.727523441809156, + "grad_norm": 0.0173183660954237, + "learning_rate": 4.559203860181881e-05, + "loss": 0.1273, + "num_input_tokens_seen": 6025176, + "step": 9890 + }, + { + "epoch": 2.7289023717595144, + "grad_norm": 0.0012924243928864598, + "learning_rate": 4.5585212625999664e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6028152, + "step": 9895 + }, + { + "epoch": 2.7302813017098733, + "grad_norm": 0.08724269270896912, + "learning_rate": 4.557838188088224e-05, + "loss": 0.0971, + "num_input_tokens_seen": 6031032, + "step": 9900 + }, + { + "epoch": 2.7316602316602316, + "grad_norm": 7.829470157623291, + "learning_rate": 4.5571546368049126e-05, + "loss": 0.0611, + "num_input_tokens_seen": 6034456, + "step": 9905 + }, + { + "epoch": 2.73303916161059, + "grad_norm": 0.01423654705286026, + "learning_rate": 4.5564706089084004e-05, + "loss": 0.0006, + "num_input_tokens_seen": 6037848, + "step": 9910 + }, + { + "epoch": 2.734418091560949, + "grad_norm": 0.04407702386379242, + "learning_rate": 4.555786104557168e-05, + "loss": 0.0377, + "num_input_tokens_seen": 6041752, + "step": 9915 + }, + { + "epoch": 2.735797021511307, + "grad_norm": 0.016609402373433113, + "learning_rate": 4.555101123909804e-05, + "loss": 0.0702, + "num_input_tokens_seen": 6044696, + "step": 9920 + }, + { + "epoch": 2.7371759514616656, + "grad_norm": 9.869611740112305, + "learning_rate": 4.554415667125011e-05, + "loss": 0.0176, + "num_input_tokens_seen": 6047992, + "step": 9925 + }, + { + "epoch": 2.7385548814120244, + "grad_norm": 0.03438381478190422, + "learning_rate": 4.553729734361597e-05, + "loss": 0.0013, + "num_input_tokens_seen": 6051096, + "step": 9930 + }, + { + "epoch": 2.739933811362383, + "grad_norm": 0.035918306559324265, + "learning_rate": 4.553043325778483e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6053496, + "step": 9935 + }, + { + "epoch": 2.741312741312741, + "grad_norm": 0.04298185184597969, + "learning_rate": 4.552356441534702e-05, + "loss": 0.0011, + "num_input_tokens_seen": 6056184, + "step": 9940 + }, + { + "epoch": 2.7426916712631, + "grad_norm": 0.00636459281668067, + "learning_rate": 4.551669081789393e-05, + "loss": 0.0011, + "num_input_tokens_seen": 6060568, + "step": 9945 + }, + { + "epoch": 2.7440706012134584, + "grad_norm": 0.054127588868141174, + "learning_rate": 4.550981246701809e-05, + "loss": 0.0012, + "num_input_tokens_seen": 6064472, + "step": 9950 + }, + { + "epoch": 2.7454495311638167, + "grad_norm": 0.08095432817935944, + "learning_rate": 4.55029293643131e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6067448, + "step": 9955 + }, + { + "epoch": 2.7468284611141756, + "grad_norm": 0.00303856935352087, + "learning_rate": 4.549604151137369e-05, + "loss": 0.0052, + "num_input_tokens_seen": 6070616, + "step": 9960 + }, + { + "epoch": 2.748207391064534, + "grad_norm": 0.03189950808882713, + "learning_rate": 4.5489148909795665e-05, + "loss": 0.108, + "num_input_tokens_seen": 6073336, + "step": 9965 + }, + { + "epoch": 2.7495863210148923, + "grad_norm": 0.0032798016909509897, + "learning_rate": 4.548225156117595e-05, + "loss": 0.0116, + "num_input_tokens_seen": 6075576, + "step": 9970 + }, + { + "epoch": 2.750965250965251, + "grad_norm": 22.757169723510742, + "learning_rate": 4.547534946711256e-05, + "loss": 0.147, + "num_input_tokens_seen": 6077944, + "step": 9975 + }, + { + "epoch": 2.7523441809156095, + "grad_norm": 14.492047309875488, + "learning_rate": 4.546844262920461e-05, + "loss": 0.042, + "num_input_tokens_seen": 6081144, + "step": 9980 + }, + { + "epoch": 2.753723110865968, + "grad_norm": 12.195418357849121, + "learning_rate": 4.546153104905232e-05, + "loss": 0.0735, + "num_input_tokens_seen": 6084472, + "step": 9985 + }, + { + "epoch": 2.7551020408163263, + "grad_norm": 0.01534749660640955, + "learning_rate": 4.5454614728256995e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6088024, + "step": 9990 + }, + { + "epoch": 2.756480970766685, + "grad_norm": 0.0009908076608553529, + "learning_rate": 4.5447693668421044e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6091128, + "step": 9995 + }, + { + "epoch": 2.7578599007170435, + "grad_norm": 0.03126302361488342, + "learning_rate": 4.544076787114799e-05, + "loss": 0.0861, + "num_input_tokens_seen": 6093752, + "step": 10000 + }, + { + "epoch": 2.7592388306674023, + "grad_norm": 0.045580193400382996, + "learning_rate": 4.5433837338042446e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6096824, + "step": 10005 + }, + { + "epoch": 2.7606177606177607, + "grad_norm": 0.003499743063002825, + "learning_rate": 4.5426902070710096e-05, + "loss": 0.0006, + "num_input_tokens_seen": 6099672, + "step": 10010 + }, + { + "epoch": 2.761996690568119, + "grad_norm": 0.002096957992762327, + "learning_rate": 4.5419962070757756e-05, + "loss": 0.0007, + "num_input_tokens_seen": 6102072, + "step": 10015 + }, + { + "epoch": 2.7633756205184774, + "grad_norm": 29.271129608154297, + "learning_rate": 4.541301733979332e-05, + "loss": 0.0536, + "num_input_tokens_seen": 6104952, + "step": 10020 + }, + { + "epoch": 2.7647545504688362, + "grad_norm": 2.103034019470215, + "learning_rate": 4.5406067879425786e-05, + "loss": 0.0017, + "num_input_tokens_seen": 6108152, + "step": 10025 + }, + { + "epoch": 2.7661334804191946, + "grad_norm": 36.98759460449219, + "learning_rate": 4.5399113691265234e-05, + "loss": 0.0122, + "num_input_tokens_seen": 6110648, + "step": 10030 + }, + { + "epoch": 2.7675124103695534, + "grad_norm": 0.023862497881054878, + "learning_rate": 4.539215477692286e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6115096, + "step": 10035 + }, + { + "epoch": 2.768891340319912, + "grad_norm": 0.008974360302090645, + "learning_rate": 4.538519113801094e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6118040, + "step": 10040 + }, + { + "epoch": 2.77027027027027, + "grad_norm": 0.0015732977772131562, + "learning_rate": 4.537822277614286e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6120824, + "step": 10045 + }, + { + "epoch": 2.7716492002206286, + "grad_norm": 0.02370639331638813, + "learning_rate": 4.5371249692933074e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6123128, + "step": 10050 + }, + { + "epoch": 2.7730281301709874, + "grad_norm": 0.0016711077187210321, + "learning_rate": 4.536427188999716e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6127160, + "step": 10055 + }, + { + "epoch": 2.7744070601213457, + "grad_norm": 0.004239362198859453, + "learning_rate": 4.535728936895175e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6129528, + "step": 10060 + }, + { + "epoch": 2.7757859900717046, + "grad_norm": 0.1898328810930252, + "learning_rate": 4.5350302131414625e-05, + "loss": 0.1128, + "num_input_tokens_seen": 6132696, + "step": 10065 + }, + { + "epoch": 2.777164920022063, + "grad_norm": 14.503101348876953, + "learning_rate": 4.534331017900461e-05, + "loss": 0.2047, + "num_input_tokens_seen": 6135704, + "step": 10070 + }, + { + "epoch": 2.7785438499724213, + "grad_norm": 0.000712959561496973, + "learning_rate": 4.533631351334164e-05, + "loss": 0.0004, + "num_input_tokens_seen": 6138424, + "step": 10075 + }, + { + "epoch": 2.7799227799227797, + "grad_norm": 0.006100221537053585, + "learning_rate": 4.5329312136046745e-05, + "loss": 0.0036, + "num_input_tokens_seen": 6141208, + "step": 10080 + }, + { + "epoch": 2.7813017098731385, + "grad_norm": 3.820530414581299, + "learning_rate": 4.5322306048742045e-05, + "loss": 0.0025, + "num_input_tokens_seen": 6143736, + "step": 10085 + }, + { + "epoch": 2.782680639823497, + "grad_norm": 0.0015356146031990647, + "learning_rate": 4.531529525305074e-05, + "loss": 0.001, + "num_input_tokens_seen": 6147384, + "step": 10090 + }, + { + "epoch": 2.7840595697738557, + "grad_norm": 0.00244556344114244, + "learning_rate": 4.530827975059715e-05, + "loss": 0.0107, + "num_input_tokens_seen": 6150392, + "step": 10095 + }, + { + "epoch": 2.785438499724214, + "grad_norm": 0.013405265286564827, + "learning_rate": 4.530125954300665e-05, + "loss": 0.0004, + "num_input_tokens_seen": 6152984, + "step": 10100 + }, + { + "epoch": 2.7868174296745725, + "grad_norm": 2.0208895206451416, + "learning_rate": 4.529423463190573e-05, + "loss": 0.0022, + "num_input_tokens_seen": 6155512, + "step": 10105 + }, + { + "epoch": 2.788196359624931, + "grad_norm": 31.649629592895508, + "learning_rate": 4.528720501892196e-05, + "loss": 0.049, + "num_input_tokens_seen": 6160152, + "step": 10110 + }, + { + "epoch": 2.7895752895752897, + "grad_norm": 0.024345185607671738, + "learning_rate": 4.528017070568399e-05, + "loss": 0.0696, + "num_input_tokens_seen": 6162936, + "step": 10115 + }, + { + "epoch": 2.790954219525648, + "grad_norm": 0.0023127831518650055, + "learning_rate": 4.5273131693821584e-05, + "loss": 0.0072, + "num_input_tokens_seen": 6166168, + "step": 10120 + }, + { + "epoch": 2.792333149476007, + "grad_norm": 1.4468169212341309, + "learning_rate": 4.526608798496557e-05, + "loss": 0.1699, + "num_input_tokens_seen": 6170264, + "step": 10125 + }, + { + "epoch": 2.7937120794263652, + "grad_norm": 14.588123321533203, + "learning_rate": 4.525903958074789e-05, + "loss": 0.0672, + "num_input_tokens_seen": 6172824, + "step": 10130 + }, + { + "epoch": 2.7950910093767236, + "grad_norm": 0.033464279025793076, + "learning_rate": 4.5251986482801524e-05, + "loss": 0.001, + "num_input_tokens_seen": 6176376, + "step": 10135 + }, + { + "epoch": 2.796469939327082, + "grad_norm": 0.09115374088287354, + "learning_rate": 4.5244928692760615e-05, + "loss": 0.0675, + "num_input_tokens_seen": 6179192, + "step": 10140 + }, + { + "epoch": 2.797848869277441, + "grad_norm": 0.010124199092388153, + "learning_rate": 4.5237866212260314e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6182872, + "step": 10145 + }, + { + "epoch": 2.799227799227799, + "grad_norm": 0.42042604088783264, + "learning_rate": 4.523079904293693e-05, + "loss": 0.0031, + "num_input_tokens_seen": 6186136, + "step": 10150 + }, + { + "epoch": 2.800606729178158, + "grad_norm": 80.32257080078125, + "learning_rate": 4.52237271864278e-05, + "loss": 0.0899, + "num_input_tokens_seen": 6190520, + "step": 10155 + }, + { + "epoch": 2.8019856591285164, + "grad_norm": 0.021262124180793762, + "learning_rate": 4.521665064437139e-05, + "loss": 0.0921, + "num_input_tokens_seen": 6195736, + "step": 10160 + }, + { + "epoch": 2.8033645890788748, + "grad_norm": 0.026382125914096832, + "learning_rate": 4.5209569418407215e-05, + "loss": 0.0016, + "num_input_tokens_seen": 6198872, + "step": 10165 + }, + { + "epoch": 2.804743519029233, + "grad_norm": 0.22928307950496674, + "learning_rate": 4.5202483510175906e-05, + "loss": 0.0009, + "num_input_tokens_seen": 6202168, + "step": 10170 + }, + { + "epoch": 2.806122448979592, + "grad_norm": 0.7136278748512268, + "learning_rate": 4.519539292131917e-05, + "loss": 0.1884, + "num_input_tokens_seen": 6205176, + "step": 10175 + }, + { + "epoch": 2.8075013789299503, + "grad_norm": 0.006958469748497009, + "learning_rate": 4.5188297653479774e-05, + "loss": 0.0475, + "num_input_tokens_seen": 6207640, + "step": 10180 + }, + { + "epoch": 2.808880308880309, + "grad_norm": 0.37703949213027954, + "learning_rate": 4.518119770830161e-05, + "loss": 0.0514, + "num_input_tokens_seen": 6210456, + "step": 10185 + }, + { + "epoch": 2.8102592388306675, + "grad_norm": 0.05718091130256653, + "learning_rate": 4.517409308742962e-05, + "loss": 0.0697, + "num_input_tokens_seen": 6213944, + "step": 10190 + }, + { + "epoch": 2.811638168781026, + "grad_norm": 0.8478893637657166, + "learning_rate": 4.5166983792509867e-05, + "loss": 0.0035, + "num_input_tokens_seen": 6216408, + "step": 10195 + }, + { + "epoch": 2.8130170987313843, + "grad_norm": 0.007911402732133865, + "learning_rate": 4.515986982518944e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6220536, + "step": 10200 + }, + { + "epoch": 2.814396028681743, + "grad_norm": 0.031013144180178642, + "learning_rate": 4.5152751187116556e-05, + "loss": 0.0621, + "num_input_tokens_seen": 6223928, + "step": 10205 + }, + { + "epoch": 2.8157749586321015, + "grad_norm": 0.18388253450393677, + "learning_rate": 4.514562787994051e-05, + "loss": 0.0016, + "num_input_tokens_seen": 6227160, + "step": 10210 + }, + { + "epoch": 2.81715388858246, + "grad_norm": 0.002061300678178668, + "learning_rate": 4.513849990531166e-05, + "loss": 0.0006, + "num_input_tokens_seen": 6230520, + "step": 10215 + }, + { + "epoch": 2.8185328185328187, + "grad_norm": 0.018474552780389786, + "learning_rate": 4.513136726488145e-05, + "loss": 0.0007, + "num_input_tokens_seen": 6233688, + "step": 10220 + }, + { + "epoch": 2.819911748483177, + "grad_norm": 2.7778830528259277, + "learning_rate": 4.512422996030243e-05, + "loss": 0.0063, + "num_input_tokens_seen": 6236696, + "step": 10225 + }, + { + "epoch": 2.8212906784335354, + "grad_norm": 0.10736433416604996, + "learning_rate": 4.5117087993228205e-05, + "loss": 0.1553, + "num_input_tokens_seen": 6240536, + "step": 10230 + }, + { + "epoch": 2.8226696083838942, + "grad_norm": 0.02547801285982132, + "learning_rate": 4.5109941365313444e-05, + "loss": 0.0008, + "num_input_tokens_seen": 6243256, + "step": 10235 + }, + { + "epoch": 2.8240485383342526, + "grad_norm": 0.016515817493200302, + "learning_rate": 4.510279007821395e-05, + "loss": 0.0364, + "num_input_tokens_seen": 6246968, + "step": 10240 + }, + { + "epoch": 2.825427468284611, + "grad_norm": 0.31103524565696716, + "learning_rate": 4.509563413358655e-05, + "loss": 0.0959, + "num_input_tokens_seen": 6250296, + "step": 10245 + }, + { + "epoch": 2.82680639823497, + "grad_norm": 4.921968936920166, + "learning_rate": 4.5088473533089184e-05, + "loss": 0.0686, + "num_input_tokens_seen": 6252952, + "step": 10250 + }, + { + "epoch": 2.828185328185328, + "grad_norm": 0.07914502173662186, + "learning_rate": 4.508130827838086e-05, + "loss": 0.052, + "num_input_tokens_seen": 6255192, + "step": 10255 + }, + { + "epoch": 2.8295642581356866, + "grad_norm": 1.9176335334777832, + "learning_rate": 4.5074138371121665e-05, + "loss": 0.0041, + "num_input_tokens_seen": 6258296, + "step": 10260 + }, + { + "epoch": 2.8309431880860454, + "grad_norm": 7.760822772979736, + "learning_rate": 4.506696381297276e-05, + "loss": 0.0555, + "num_input_tokens_seen": 6261368, + "step": 10265 + }, + { + "epoch": 2.8323221180364038, + "grad_norm": 0.16994662582874298, + "learning_rate": 4.505978460559639e-05, + "loss": 0.1063, + "num_input_tokens_seen": 6264056, + "step": 10270 + }, + { + "epoch": 2.833701047986762, + "grad_norm": 0.01744755730032921, + "learning_rate": 4.5052600750655875e-05, + "loss": 0.0007, + "num_input_tokens_seen": 6266488, + "step": 10275 + }, + { + "epoch": 2.835079977937121, + "grad_norm": 0.030534762889146805, + "learning_rate": 4.504541224981561e-05, + "loss": 0.0012, + "num_input_tokens_seen": 6272056, + "step": 10280 + }, + { + "epoch": 2.8364589078874793, + "grad_norm": 0.23905101418495178, + "learning_rate": 4.503821910474106e-05, + "loss": 0.0024, + "num_input_tokens_seen": 6274168, + "step": 10285 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 5.5993242263793945, + "learning_rate": 4.5031021317098785e-05, + "loss": 0.129, + "num_input_tokens_seen": 6277528, + "step": 10290 + }, + { + "epoch": 2.839216767788196, + "grad_norm": 0.0720846951007843, + "learning_rate": 4.502381888855641e-05, + "loss": 0.0014, + "num_input_tokens_seen": 6280312, + "step": 10295 + }, + { + "epoch": 2.840595697738555, + "grad_norm": 0.16489572823047638, + "learning_rate": 4.5016611820782626e-05, + "loss": 0.0012, + "num_input_tokens_seen": 6283352, + "step": 10300 + }, + { + "epoch": 2.8419746276889133, + "grad_norm": 0.5735064744949341, + "learning_rate": 4.5009400115447206e-05, + "loss": 0.0013, + "num_input_tokens_seen": 6286264, + "step": 10305 + }, + { + "epoch": 2.843353557639272, + "grad_norm": 0.251997172832489, + "learning_rate": 4.500218377422101e-05, + "loss": 0.0414, + "num_input_tokens_seen": 6288760, + "step": 10310 + }, + { + "epoch": 2.8447324875896305, + "grad_norm": 0.026185866445302963, + "learning_rate": 4.4994962798775943e-05, + "loss": 0.0008, + "num_input_tokens_seen": 6292088, + "step": 10315 + }, + { + "epoch": 2.846111417539989, + "grad_norm": 0.004011982120573521, + "learning_rate": 4.498773719078502e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6294456, + "step": 10320 + }, + { + "epoch": 2.8474903474903472, + "grad_norm": 0.033320050686597824, + "learning_rate": 4.4980506951922295e-05, + "loss": 0.0008, + "num_input_tokens_seen": 6297848, + "step": 10325 + }, + { + "epoch": 2.848869277440706, + "grad_norm": 0.17686417698860168, + "learning_rate": 4.4973272083862925e-05, + "loss": 0.1417, + "num_input_tokens_seen": 6300152, + "step": 10330 + }, + { + "epoch": 2.8502482073910644, + "grad_norm": 0.06295102089643478, + "learning_rate": 4.4966032588283115e-05, + "loss": 0.0756, + "num_input_tokens_seen": 6302584, + "step": 10335 + }, + { + "epoch": 2.8516271373414233, + "grad_norm": 0.02409297600388527, + "learning_rate": 4.4958788466860154e-05, + "loss": 0.0012, + "num_input_tokens_seen": 6305912, + "step": 10340 + }, + { + "epoch": 2.8530060672917816, + "grad_norm": 1.49952232837677, + "learning_rate": 4.49515397212724e-05, + "loss": 0.0025, + "num_input_tokens_seen": 6308216, + "step": 10345 + }, + { + "epoch": 2.85438499724214, + "grad_norm": 0.3009399175643921, + "learning_rate": 4.4944286353199286e-05, + "loss": 0.1243, + "num_input_tokens_seen": 6311512, + "step": 10350 + }, + { + "epoch": 2.8557639271924984, + "grad_norm": 0.011075940914452076, + "learning_rate": 4.493702836432132e-05, + "loss": 0.0009, + "num_input_tokens_seen": 6313944, + "step": 10355 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.034681905061006546, + "learning_rate": 4.492976575632005e-05, + "loss": 0.0042, + "num_input_tokens_seen": 6316760, + "step": 10360 + }, + { + "epoch": 2.8585217870932156, + "grad_norm": 0.005477363709360361, + "learning_rate": 4.4922498530878154e-05, + "loss": 0.004, + "num_input_tokens_seen": 6319064, + "step": 10365 + }, + { + "epoch": 2.8599007170435744, + "grad_norm": 0.0069626979529857635, + "learning_rate": 4.4915226689679316e-05, + "loss": 0.0736, + "num_input_tokens_seen": 6321752, + "step": 10370 + }, + { + "epoch": 2.861279646993933, + "grad_norm": 0.11698335409164429, + "learning_rate": 4.4907950234408326e-05, + "loss": 0.0378, + "num_input_tokens_seen": 6324024, + "step": 10375 + }, + { + "epoch": 2.862658576944291, + "grad_norm": 0.002037737052887678, + "learning_rate": 4.490066916675103e-05, + "loss": 0.1899, + "num_input_tokens_seen": 6326552, + "step": 10380 + }, + { + "epoch": 2.8640375068946495, + "grad_norm": 0.042732514441013336, + "learning_rate": 4.489338348839436e-05, + "loss": 0.0007, + "num_input_tokens_seen": 6329400, + "step": 10385 + }, + { + "epoch": 2.8654164368450084, + "grad_norm": 0.022964749485254288, + "learning_rate": 4.488609320102628e-05, + "loss": 0.0008, + "num_input_tokens_seen": 6333816, + "step": 10390 + }, + { + "epoch": 2.8667953667953667, + "grad_norm": 0.07485409080982208, + "learning_rate": 4.4878798306335865e-05, + "loss": 0.0018, + "num_input_tokens_seen": 6337144, + "step": 10395 + }, + { + "epoch": 2.8681742967457255, + "grad_norm": 0.13436555862426758, + "learning_rate": 4.4871498806013236e-05, + "loss": 0.054, + "num_input_tokens_seen": 6340184, + "step": 10400 + }, + { + "epoch": 2.869553226696084, + "grad_norm": 5.470850944519043, + "learning_rate": 4.486419470174957e-05, + "loss": 0.0053, + "num_input_tokens_seen": 6342136, + "step": 10405 + }, + { + "epoch": 2.8709321566464423, + "grad_norm": 0.04549013450741768, + "learning_rate": 4.485688599523714e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6345144, + "step": 10410 + }, + { + "epoch": 2.8723110865968007, + "grad_norm": 0.04623568058013916, + "learning_rate": 4.484957268816925e-05, + "loss": 0.0249, + "num_input_tokens_seen": 6347544, + "step": 10415 + }, + { + "epoch": 2.8736900165471595, + "grad_norm": 0.002897437661886215, + "learning_rate": 4.4842254782240304e-05, + "loss": 0.0029, + "num_input_tokens_seen": 6350168, + "step": 10420 + }, + { + "epoch": 2.875068946497518, + "grad_norm": 0.013306648470461369, + "learning_rate": 4.4834932279145746e-05, + "loss": 0.0014, + "num_input_tokens_seen": 6354040, + "step": 10425 + }, + { + "epoch": 2.8764478764478767, + "grad_norm": 0.0024375347420573235, + "learning_rate": 4.48276051805821e-05, + "loss": 0.0969, + "num_input_tokens_seen": 6358232, + "step": 10430 + }, + { + "epoch": 2.877826806398235, + "grad_norm": 0.002050186740234494, + "learning_rate": 4.482027348824694e-05, + "loss": 0.0207, + "num_input_tokens_seen": 6361656, + "step": 10435 + }, + { + "epoch": 2.8792057363485934, + "grad_norm": 0.018110794946551323, + "learning_rate": 4.481293720383893e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6364984, + "step": 10440 + }, + { + "epoch": 2.880584666298952, + "grad_norm": 0.005613088142126799, + "learning_rate": 4.480559632905778e-05, + "loss": 0.0503, + "num_input_tokens_seen": 6369176, + "step": 10445 + }, + { + "epoch": 2.8819635962493106, + "grad_norm": 14.63619613647461, + "learning_rate": 4.4798250865604244e-05, + "loss": 0.0605, + "num_input_tokens_seen": 6372376, + "step": 10450 + }, + { + "epoch": 2.883342526199669, + "grad_norm": 0.02614292874932289, + "learning_rate": 4.479090081518017e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6376088, + "step": 10455 + }, + { + "epoch": 2.884721456150028, + "grad_norm": 0.010429123416543007, + "learning_rate": 4.478354617948848e-05, + "loss": 0.0004, + "num_input_tokens_seen": 6379832, + "step": 10460 + }, + { + "epoch": 2.886100386100386, + "grad_norm": 0.17278793454170227, + "learning_rate": 4.4776186960233115e-05, + "loss": 0.0011, + "num_input_tokens_seen": 6382552, + "step": 10465 + }, + { + "epoch": 2.8874793160507446, + "grad_norm": 0.018890513107180595, + "learning_rate": 4.47688231591191e-05, + "loss": 0.0014, + "num_input_tokens_seen": 6384856, + "step": 10470 + }, + { + "epoch": 2.888858246001103, + "grad_norm": 6.681358814239502, + "learning_rate": 4.4761454777852543e-05, + "loss": 0.0695, + "num_input_tokens_seen": 6387416, + "step": 10475 + }, + { + "epoch": 2.890237175951462, + "grad_norm": 0.0027612841222435236, + "learning_rate": 4.475408181814057e-05, + "loss": 0.102, + "num_input_tokens_seen": 6389592, + "step": 10480 + }, + { + "epoch": 2.89161610590182, + "grad_norm": 0.028153028339147568, + "learning_rate": 4.47467042816914e-05, + "loss": 0.0889, + "num_input_tokens_seen": 6393240, + "step": 10485 + }, + { + "epoch": 2.892995035852179, + "grad_norm": 0.15119436383247375, + "learning_rate": 4.4739322170214294e-05, + "loss": 0.0011, + "num_input_tokens_seen": 6395704, + "step": 10490 + }, + { + "epoch": 2.8943739658025374, + "grad_norm": 0.06305905431509018, + "learning_rate": 4.473193548541959e-05, + "loss": 0.0025, + "num_input_tokens_seen": 6398008, + "step": 10495 + }, + { + "epoch": 2.8957528957528957, + "grad_norm": 1.2519676685333252, + "learning_rate": 4.472454422901868e-05, + "loss": 0.0587, + "num_input_tokens_seen": 6401304, + "step": 10500 + }, + { + "epoch": 2.897131825703254, + "grad_norm": 0.058937136083841324, + "learning_rate": 4.471714840272401e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6404760, + "step": 10505 + }, + { + "epoch": 2.898510755653613, + "grad_norm": 0.02699188143014908, + "learning_rate": 4.470974800824907e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6408248, + "step": 10510 + }, + { + "epoch": 2.8998896856039713, + "grad_norm": 0.014289679937064648, + "learning_rate": 4.470234304730845e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6410712, + "step": 10515 + }, + { + "epoch": 2.9012686155543297, + "grad_norm": 0.10490264743566513, + "learning_rate": 4.469493352161776e-05, + "loss": 0.001, + "num_input_tokens_seen": 6413272, + "step": 10520 + }, + { + "epoch": 2.9026475455046885, + "grad_norm": 0.004986542277038097, + "learning_rate": 4.468751943289368e-05, + "loss": 0.0321, + "num_input_tokens_seen": 6416056, + "step": 10525 + }, + { + "epoch": 2.904026475455047, + "grad_norm": 0.15001939237117767, + "learning_rate": 4.468010078285395e-05, + "loss": 0.0867, + "num_input_tokens_seen": 6418712, + "step": 10530 + }, + { + "epoch": 2.9054054054054053, + "grad_norm": 0.07129742950201035, + "learning_rate": 4.4672677573217364e-05, + "loss": 0.0006, + "num_input_tokens_seen": 6421560, + "step": 10535 + }, + { + "epoch": 2.906784335355764, + "grad_norm": 0.0010054020676761866, + "learning_rate": 4.466524980570378e-05, + "loss": 0.1687, + "num_input_tokens_seen": 6425688, + "step": 10540 + }, + { + "epoch": 2.9081632653061225, + "grad_norm": 0.006297330372035503, + "learning_rate": 4.4657817482034095e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6428632, + "step": 10545 + }, + { + "epoch": 2.909542195256481, + "grad_norm": 0.04108348488807678, + "learning_rate": 4.465038060393027e-05, + "loss": 0.0009, + "num_input_tokens_seen": 6430680, + "step": 10550 + }, + { + "epoch": 2.9109211252068397, + "grad_norm": 16.37411880493164, + "learning_rate": 4.464293917311534e-05, + "loss": 0.1707, + "num_input_tokens_seen": 6433336, + "step": 10555 + }, + { + "epoch": 2.912300055157198, + "grad_norm": 0.14821794629096985, + "learning_rate": 4.463549319131336e-05, + "loss": 0.0861, + "num_input_tokens_seen": 6435320, + "step": 10560 + }, + { + "epoch": 2.9136789851075564, + "grad_norm": 2.9681918621063232, + "learning_rate": 4.4628042660249465e-05, + "loss": 0.0449, + "num_input_tokens_seen": 6437912, + "step": 10565 + }, + { + "epoch": 2.915057915057915, + "grad_norm": 0.04241563752293587, + "learning_rate": 4.462058758164983e-05, + "loss": 0.0213, + "num_input_tokens_seen": 6440216, + "step": 10570 + }, + { + "epoch": 2.9164368450082736, + "grad_norm": 0.10312742739915848, + "learning_rate": 4.461312795724171e-05, + "loss": 0.0148, + "num_input_tokens_seen": 6443256, + "step": 10575 + }, + { + "epoch": 2.917815774958632, + "grad_norm": 0.014674448408186436, + "learning_rate": 4.460566378875336e-05, + "loss": 0.0132, + "num_input_tokens_seen": 6445976, + "step": 10580 + }, + { + "epoch": 2.919194704908991, + "grad_norm": 0.04640178754925728, + "learning_rate": 4.4598195077914145e-05, + "loss": 0.0011, + "num_input_tokens_seen": 6449464, + "step": 10585 + }, + { + "epoch": 2.920573634859349, + "grad_norm": 0.0018132575787603855, + "learning_rate": 4.459072182645445e-05, + "loss": 0.0364, + "num_input_tokens_seen": 6452056, + "step": 10590 + }, + { + "epoch": 2.9219525648097076, + "grad_norm": 0.07770655304193497, + "learning_rate": 4.458324403610572e-05, + "loss": 0.0013, + "num_input_tokens_seen": 6455192, + "step": 10595 + }, + { + "epoch": 2.923331494760066, + "grad_norm": 0.10735006630420685, + "learning_rate": 4.457576170860046e-05, + "loss": 0.0308, + "num_input_tokens_seen": 6458040, + "step": 10600 + }, + { + "epoch": 2.9247104247104247, + "grad_norm": 0.013587530702352524, + "learning_rate": 4.4568274845672206e-05, + "loss": 0.0012, + "num_input_tokens_seen": 6460952, + "step": 10605 + }, + { + "epoch": 2.926089354660783, + "grad_norm": 0.020768582820892334, + "learning_rate": 4.456078344905556e-05, + "loss": 0.0557, + "num_input_tokens_seen": 6463512, + "step": 10610 + }, + { + "epoch": 2.927468284611142, + "grad_norm": 0.03193704038858414, + "learning_rate": 4.455328752048618e-05, + "loss": 0.0044, + "num_input_tokens_seen": 6466744, + "step": 10615 + }, + { + "epoch": 2.9288472145615003, + "grad_norm": 0.00305745261721313, + "learning_rate": 4.454578706170075e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6469176, + "step": 10620 + }, + { + "epoch": 2.9302261445118587, + "grad_norm": 0.012013641186058521, + "learning_rate": 4.453828207443703e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6472024, + "step": 10625 + }, + { + "epoch": 2.931605074462217, + "grad_norm": 5.788723468780518, + "learning_rate": 4.453077256043381e-05, + "loss": 0.1775, + "num_input_tokens_seen": 6474840, + "step": 10630 + }, + { + "epoch": 2.932984004412576, + "grad_norm": 0.006596313789486885, + "learning_rate": 4.452325852143094e-05, + "loss": 0.0004, + "num_input_tokens_seen": 6478200, + "step": 10635 + }, + { + "epoch": 2.9343629343629343, + "grad_norm": 39.52164077758789, + "learning_rate": 4.451573995916932e-05, + "loss": 0.1373, + "num_input_tokens_seen": 6480984, + "step": 10640 + }, + { + "epoch": 2.935741864313293, + "grad_norm": 5.953429222106934, + "learning_rate": 4.4508216875390884e-05, + "loss": 0.0031, + "num_input_tokens_seen": 6483896, + "step": 10645 + }, + { + "epoch": 2.9371207942636515, + "grad_norm": 0.03893352299928665, + "learning_rate": 4.450068927183864e-05, + "loss": 0.0008, + "num_input_tokens_seen": 6487128, + "step": 10650 + }, + { + "epoch": 2.93849972421401, + "grad_norm": 0.015202446840703487, + "learning_rate": 4.449315715025659e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6489784, + "step": 10655 + }, + { + "epoch": 2.939878654164368, + "grad_norm": 0.009334960021078587, + "learning_rate": 4.4485620512389855e-05, + "loss": 0.013, + "num_input_tokens_seen": 6492920, + "step": 10660 + }, + { + "epoch": 2.941257584114727, + "grad_norm": 0.008522778749465942, + "learning_rate": 4.447807935998456e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6497016, + "step": 10665 + }, + { + "epoch": 2.9426365140650854, + "grad_norm": 0.009753022342920303, + "learning_rate": 4.4470533694787866e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6499928, + "step": 10670 + }, + { + "epoch": 2.9440154440154442, + "grad_norm": 0.012657031416893005, + "learning_rate": 4.4462983518548e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6502552, + "step": 10675 + }, + { + "epoch": 2.9453943739658026, + "grad_norm": 0.004253019113093615, + "learning_rate": 4.445542883301423e-05, + "loss": 0.1096, + "num_input_tokens_seen": 6505592, + "step": 10680 + }, + { + "epoch": 2.946773303916161, + "grad_norm": 0.010312465950846672, + "learning_rate": 4.444786963993688e-05, + "loss": 0.0004, + "num_input_tokens_seen": 6508248, + "step": 10685 + }, + { + "epoch": 2.9481522338665194, + "grad_norm": 0.0057909609749913216, + "learning_rate": 4.4440305941067295e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6511352, + "step": 10690 + }, + { + "epoch": 2.949531163816878, + "grad_norm": 0.008251119405031204, + "learning_rate": 4.4432737738157874e-05, + "loss": 0.0006, + "num_input_tokens_seen": 6515672, + "step": 10695 + }, + { + "epoch": 2.9509100937672366, + "grad_norm": 2.184880495071411, + "learning_rate": 4.442516503296207e-05, + "loss": 0.1459, + "num_input_tokens_seen": 6520088, + "step": 10700 + }, + { + "epoch": 2.9522890237175954, + "grad_norm": 0.15323659777641296, + "learning_rate": 4.441758782723436e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6522904, + "step": 10705 + }, + { + "epoch": 2.9536679536679538, + "grad_norm": 0.026013826951384544, + "learning_rate": 4.441000612273028e-05, + "loss": 0.0007, + "num_input_tokens_seen": 6526776, + "step": 10710 + }, + { + "epoch": 2.955046883618312, + "grad_norm": 0.10894976556301117, + "learning_rate": 4.440241992120641e-05, + "loss": 0.0022, + "num_input_tokens_seen": 6531416, + "step": 10715 + }, + { + "epoch": 2.9564258135686705, + "grad_norm": 23.335372924804688, + "learning_rate": 4.439482922442034e-05, + "loss": 0.0535, + "num_input_tokens_seen": 6534104, + "step": 10720 + }, + { + "epoch": 2.9578047435190293, + "grad_norm": 0.005596621427685022, + "learning_rate": 4.438723403413074e-05, + "loss": 0.0006, + "num_input_tokens_seen": 6537048, + "step": 10725 + }, + { + "epoch": 2.9591836734693877, + "grad_norm": 0.006763325072824955, + "learning_rate": 4.4379634352097316e-05, + "loss": 0.1108, + "num_input_tokens_seen": 6540280, + "step": 10730 + }, + { + "epoch": 2.9605626034197465, + "grad_norm": 0.021326430141925812, + "learning_rate": 4.4372030180080794e-05, + "loss": 0.001, + "num_input_tokens_seen": 6543320, + "step": 10735 + }, + { + "epoch": 2.961941533370105, + "grad_norm": 0.020112818107008934, + "learning_rate": 4.4364421519842953e-05, + "loss": 0.0008, + "num_input_tokens_seen": 6547032, + "step": 10740 + }, + { + "epoch": 2.9633204633204633, + "grad_norm": 8.283178329467773, + "learning_rate": 4.4356808373146606e-05, + "loss": 0.0716, + "num_input_tokens_seen": 6551160, + "step": 10745 + }, + { + "epoch": 2.9646993932708217, + "grad_norm": 0.0037360445130616426, + "learning_rate": 4.434919074175562e-05, + "loss": 0.0006, + "num_input_tokens_seen": 6553848, + "step": 10750 + }, + { + "epoch": 2.9660783232211805, + "grad_norm": 0.011530035175383091, + "learning_rate": 4.4341568627434894e-05, + "loss": 0.0008, + "num_input_tokens_seen": 6556792, + "step": 10755 + }, + { + "epoch": 2.967457253171539, + "grad_norm": 0.003892607521265745, + "learning_rate": 4.4333942031950334e-05, + "loss": 0.0591, + "num_input_tokens_seen": 6559192, + "step": 10760 + }, + { + "epoch": 2.9688361831218977, + "grad_norm": 0.0053358785808086395, + "learning_rate": 4.432631095706895e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6563512, + "step": 10765 + }, + { + "epoch": 2.970215113072256, + "grad_norm": 0.002390151610597968, + "learning_rate": 4.431867540455873e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6566328, + "step": 10770 + }, + { + "epoch": 2.9715940430226144, + "grad_norm": 0.008345220237970352, + "learning_rate": 4.431103537618872e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6568856, + "step": 10775 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.0011775653110817075, + "learning_rate": 4.430339087372902e-05, + "loss": 0.0004, + "num_input_tokens_seen": 6571544, + "step": 10780 + }, + { + "epoch": 2.9743519029233316, + "grad_norm": 0.028404375538229942, + "learning_rate": 4.429574189895074e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6574424, + "step": 10785 + }, + { + "epoch": 2.97573083287369, + "grad_norm": 0.03356450796127319, + "learning_rate": 4.428808845362605e-05, + "loss": 0.0267, + "num_input_tokens_seen": 6576600, + "step": 10790 + }, + { + "epoch": 2.9771097628240484, + "grad_norm": 5.372593879699707, + "learning_rate": 4.4280430539528125e-05, + "loss": 0.1969, + "num_input_tokens_seen": 6581240, + "step": 10795 + }, + { + "epoch": 2.978488692774407, + "grad_norm": 0.030364518985152245, + "learning_rate": 4.4272768158431204e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6584856, + "step": 10800 + }, + { + "epoch": 2.9798676227247656, + "grad_norm": 0.04572931304574013, + "learning_rate": 4.4265101312110553e-05, + "loss": 0.0008, + "num_input_tokens_seen": 6586808, + "step": 10805 + }, + { + "epoch": 2.981246552675124, + "grad_norm": 0.10664189606904984, + "learning_rate": 4.425743000234246e-05, + "loss": 0.0064, + "num_input_tokens_seen": 6589784, + "step": 10810 + }, + { + "epoch": 2.9826254826254828, + "grad_norm": 0.18323567509651184, + "learning_rate": 4.424975423090427e-05, + "loss": 0.0843, + "num_input_tokens_seen": 6592696, + "step": 10815 + }, + { + "epoch": 2.984004412575841, + "grad_norm": 0.056021977216005325, + "learning_rate": 4.424207399957435e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6595608, + "step": 10820 + }, + { + "epoch": 2.9853833425261995, + "grad_norm": 0.01012982428073883, + "learning_rate": 4.423438931013208e-05, + "loss": 0.0947, + "num_input_tokens_seen": 6598008, + "step": 10825 + }, + { + "epoch": 2.9867622724765583, + "grad_norm": 0.09982278198003769, + "learning_rate": 4.422670016435792e-05, + "loss": 0.0097, + "num_input_tokens_seen": 6600664, + "step": 10830 + }, + { + "epoch": 2.9881412024269167, + "grad_norm": 20.436809539794922, + "learning_rate": 4.421900656403332e-05, + "loss": 0.1181, + "num_input_tokens_seen": 6603032, + "step": 10835 + }, + { + "epoch": 2.989520132377275, + "grad_norm": 0.6345087885856628, + "learning_rate": 4.421130851094076e-05, + "loss": 0.0187, + "num_input_tokens_seen": 6605848, + "step": 10840 + }, + { + "epoch": 2.990899062327634, + "grad_norm": 0.00234212726354599, + "learning_rate": 4.42036060068638e-05, + "loss": 0.0066, + "num_input_tokens_seen": 6608824, + "step": 10845 + }, + { + "epoch": 2.9922779922779923, + "grad_norm": 35.006439208984375, + "learning_rate": 4.419589905358698e-05, + "loss": 0.0296, + "num_input_tokens_seen": 6611768, + "step": 10850 + }, + { + "epoch": 2.9936569222283507, + "grad_norm": 38.551700592041016, + "learning_rate": 4.41881876528959e-05, + "loss": 0.0186, + "num_input_tokens_seen": 6614680, + "step": 10855 + }, + { + "epoch": 2.9950358521787095, + "grad_norm": 0.003127358155325055, + "learning_rate": 4.418047180657717e-05, + "loss": 0.0015, + "num_input_tokens_seen": 6617208, + "step": 10860 + }, + { + "epoch": 2.996414782129068, + "grad_norm": 0.02974311262369156, + "learning_rate": 4.417275151641844e-05, + "loss": 0.0105, + "num_input_tokens_seen": 6619672, + "step": 10865 + }, + { + "epoch": 2.9977937120794262, + "grad_norm": 0.010090307332575321, + "learning_rate": 4.416502678420841e-05, + "loss": 0.0305, + "num_input_tokens_seen": 6622520, + "step": 10870 + }, + { + "epoch": 2.9991726420297846, + "grad_norm": 0.016157900914549828, + "learning_rate": 4.415729761173678e-05, + "loss": 0.0752, + "num_input_tokens_seen": 6626104, + "step": 10875 + }, + { + "epoch": 3.0, + "eval_loss": 0.15016040205955505, + "eval_runtime": 28.476, + "eval_samples_per_second": 56.609, + "eval_steps_per_second": 14.152, + "num_input_tokens_seen": 6628280, + "step": 10878 + }, + { + "epoch": 3.0005515719801434, + "grad_norm": 0.03740614652633667, + "learning_rate": 4.4149564000794276e-05, + "loss": 0.0018, + "num_input_tokens_seen": 6629848, + "step": 10880 + }, + { + "epoch": 3.001930501930502, + "grad_norm": 0.03280935436487198, + "learning_rate": 4.414182595317268e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6633112, + "step": 10885 + }, + { + "epoch": 3.0033094318808606, + "grad_norm": 0.0013388587394729257, + "learning_rate": 4.4134083470664775e-05, + "loss": 0.0014, + "num_input_tokens_seen": 6636184, + "step": 10890 + }, + { + "epoch": 3.004688361831219, + "grad_norm": 0.0008833166211843491, + "learning_rate": 4.4126336555064387e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6638712, + "step": 10895 + }, + { + "epoch": 3.0060672917815774, + "grad_norm": 0.09568516910076141, + "learning_rate": 4.411858520816637e-05, + "loss": 0.0045, + "num_input_tokens_seen": 6642712, + "step": 10900 + }, + { + "epoch": 3.007446221731936, + "grad_norm": 0.029924090951681137, + "learning_rate": 4.41108294317666e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6646520, + "step": 10905 + }, + { + "epoch": 3.0088251516822946, + "grad_norm": 0.01572927087545395, + "learning_rate": 4.410306922766196e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6650680, + "step": 10910 + }, + { + "epoch": 3.010204081632653, + "grad_norm": 0.008854391984641552, + "learning_rate": 4.409530459765041e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6653944, + "step": 10915 + }, + { + "epoch": 3.011583011583012, + "grad_norm": 0.018916498869657516, + "learning_rate": 4.408753554353088e-05, + "loss": 0.0004, + "num_input_tokens_seen": 6656632, + "step": 10920 + }, + { + "epoch": 3.01296194153337, + "grad_norm": 0.3783494532108307, + "learning_rate": 4.407976206710336e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6660344, + "step": 10925 + }, + { + "epoch": 3.0143408714837285, + "grad_norm": 57.24479675292969, + "learning_rate": 4.407198417016885e-05, + "loss": 0.0489, + "num_input_tokens_seen": 6663736, + "step": 10930 + }, + { + "epoch": 3.0157198014340874, + "grad_norm": 0.017785754054784775, + "learning_rate": 4.4064201854529365e-05, + "loss": 0.015, + "num_input_tokens_seen": 6667192, + "step": 10935 + }, + { + "epoch": 3.0170987313844457, + "grad_norm": 0.0016849016537889838, + "learning_rate": 4.405641512198797e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6670136, + "step": 10940 + }, + { + "epoch": 3.018477661334804, + "grad_norm": 0.001680663670413196, + "learning_rate": 4.404862397434874e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6672792, + "step": 10945 + }, + { + "epoch": 3.019856591285163, + "grad_norm": 0.0026263901963829994, + "learning_rate": 4.404082841341676e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6675256, + "step": 10950 + }, + { + "epoch": 3.0212355212355213, + "grad_norm": 0.008710453286767006, + "learning_rate": 4.4033028440998156e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6679000, + "step": 10955 + }, + { + "epoch": 3.0226144511858797, + "grad_norm": 0.006148626562207937, + "learning_rate": 4.402522405890008e-05, + "loss": 0.0, + "num_input_tokens_seen": 6684760, + "step": 10960 + }, + { + "epoch": 3.023993381136238, + "grad_norm": 0.00021929270587861538, + "learning_rate": 4.401741526893068e-05, + "loss": 0.0, + "num_input_tokens_seen": 6687352, + "step": 10965 + }, + { + "epoch": 3.025372311086597, + "grad_norm": 0.00011956981325056404, + "learning_rate": 4.400960207289915e-05, + "loss": 0.1084, + "num_input_tokens_seen": 6690712, + "step": 10970 + }, + { + "epoch": 3.0267512410369553, + "grad_norm": 0.006079493090510368, + "learning_rate": 4.400178447261568e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6694264, + "step": 10975 + }, + { + "epoch": 3.0281301709873136, + "grad_norm": 0.0008159158169291914, + "learning_rate": 4.399396246989152e-05, + "loss": 0.0795, + "num_input_tokens_seen": 6697208, + "step": 10980 + }, + { + "epoch": 3.0295091009376725, + "grad_norm": 0.0006077228463254869, + "learning_rate": 4.39861360665389e-05, + "loss": 0.0, + "num_input_tokens_seen": 6700024, + "step": 10985 + }, + { + "epoch": 3.030888030888031, + "grad_norm": 0.002199380425736308, + "learning_rate": 4.3978305264371084e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6702936, + "step": 10990 + }, + { + "epoch": 3.032266960838389, + "grad_norm": 0.006780579220503569, + "learning_rate": 4.397047006520236e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6705688, + "step": 10995 + }, + { + "epoch": 3.033645890788748, + "grad_norm": 0.0021143697667866945, + "learning_rate": 4.3962630470848045e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6708504, + "step": 11000 + }, + { + "epoch": 3.0350248207391064, + "grad_norm": 0.005147650372236967, + "learning_rate": 4.3954786483124435e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6711288, + "step": 11005 + }, + { + "epoch": 3.0364037506894648, + "grad_norm": 0.0016208374872803688, + "learning_rate": 4.394693810384888e-05, + "loss": 0.0269, + "num_input_tokens_seen": 6714616, + "step": 11010 + }, + { + "epoch": 3.0377826806398236, + "grad_norm": 0.006742921657860279, + "learning_rate": 4.393908533483975e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6717720, + "step": 11015 + }, + { + "epoch": 3.039161610590182, + "grad_norm": 0.045324042439460754, + "learning_rate": 4.39312281779164e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6720504, + "step": 11020 + }, + { + "epoch": 3.0405405405405403, + "grad_norm": 0.0017211772501468658, + "learning_rate": 4.392336663489922e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6723576, + "step": 11025 + }, + { + "epoch": 3.041919470490899, + "grad_norm": 0.007855923846364021, + "learning_rate": 4.391550070760963e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6726328, + "step": 11030 + }, + { + "epoch": 3.0432984004412575, + "grad_norm": 0.0006126536172814667, + "learning_rate": 4.390763039787006e-05, + "loss": 0.0, + "num_input_tokens_seen": 6729880, + "step": 11035 + }, + { + "epoch": 3.044677330391616, + "grad_norm": 0.007081735413521528, + "learning_rate": 4.389975570750392e-05, + "loss": 0.0006, + "num_input_tokens_seen": 6732984, + "step": 11040 + }, + { + "epoch": 3.0460562603419747, + "grad_norm": 0.0014207878848537803, + "learning_rate": 4.3891876638335684e-05, + "loss": 0.0, + "num_input_tokens_seen": 6736120, + "step": 11045 + }, + { + "epoch": 3.047435190292333, + "grad_norm": 0.009626436047255993, + "learning_rate": 4.3883993192190816e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6739064, + "step": 11050 + }, + { + "epoch": 3.0488141202426915, + "grad_norm": 0.003266685176640749, + "learning_rate": 4.3876105370895784e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6742360, + "step": 11055 + }, + { + "epoch": 3.0501930501930503, + "grad_norm": 0.0020330350380390882, + "learning_rate": 4.3868213176278104e-05, + "loss": 0.0, + "num_input_tokens_seen": 6745912, + "step": 11060 + }, + { + "epoch": 3.0515719801434087, + "grad_norm": 0.0001686032919678837, + "learning_rate": 4.386031661016627e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6748408, + "step": 11065 + }, + { + "epoch": 3.052950910093767, + "grad_norm": 0.00021681377256754786, + "learning_rate": 4.38524156743898e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6751352, + "step": 11070 + }, + { + "epoch": 3.054329840044126, + "grad_norm": 6.539683090522885e-05, + "learning_rate": 4.384451037077924e-05, + "loss": 0.0, + "num_input_tokens_seen": 6754040, + "step": 11075 + }, + { + "epoch": 3.0557087699944843, + "grad_norm": 0.1897459179162979, + "learning_rate": 4.383660070116613e-05, + "loss": 0.0512, + "num_input_tokens_seen": 6757816, + "step": 11080 + }, + { + "epoch": 3.0570876999448426, + "grad_norm": 0.002547170966863632, + "learning_rate": 4.3828686667383023e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6760792, + "step": 11085 + }, + { + "epoch": 3.0584666298952015, + "grad_norm": 0.007061878219246864, + "learning_rate": 4.382076827126349e-05, + "loss": 0.0, + "num_input_tokens_seen": 6763320, + "step": 11090 + }, + { + "epoch": 3.05984555984556, + "grad_norm": 0.0008805727702565491, + "learning_rate": 4.3812845514642106e-05, + "loss": 0.0012, + "num_input_tokens_seen": 6766264, + "step": 11095 + }, + { + "epoch": 3.061224489795918, + "grad_norm": 0.0026086345314979553, + "learning_rate": 4.380491839935447e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6769368, + "step": 11100 + }, + { + "epoch": 3.062603419746277, + "grad_norm": 0.07716258615255356, + "learning_rate": 4.379698692723718e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6773272, + "step": 11105 + }, + { + "epoch": 3.0639823496966354, + "grad_norm": 0.01741020940244198, + "learning_rate": 4.378905110012783e-05, + "loss": 0.0, + "num_input_tokens_seen": 6775544, + "step": 11110 + }, + { + "epoch": 3.065361279646994, + "grad_norm": 0.0002791764563880861, + "learning_rate": 4.378111091986504e-05, + "loss": 0.0, + "num_input_tokens_seen": 6778776, + "step": 11115 + }, + { + "epoch": 3.0667402095973526, + "grad_norm": 0.0005581142613664269, + "learning_rate": 4.377316638828846e-05, + "loss": 0.0, + "num_input_tokens_seen": 6781432, + "step": 11120 + }, + { + "epoch": 3.068119139547711, + "grad_norm": 0.0004373606061562896, + "learning_rate": 4.3765217507238686e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6784120, + "step": 11125 + }, + { + "epoch": 3.0694980694980694, + "grad_norm": 0.00852905958890915, + "learning_rate": 4.375726427855739e-05, + "loss": 0.0, + "num_input_tokens_seen": 6786680, + "step": 11130 + }, + { + "epoch": 3.070876999448428, + "grad_norm": 0.0012176376767456532, + "learning_rate": 4.3749306704087214e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6789624, + "step": 11135 + }, + { + "epoch": 3.0722559293987866, + "grad_norm": 0.0001092174497898668, + "learning_rate": 4.37413447856718e-05, + "loss": 0.1188, + "num_input_tokens_seen": 6792696, + "step": 11140 + }, + { + "epoch": 3.073634859349145, + "grad_norm": 0.0007403619820252061, + "learning_rate": 4.373337852515583e-05, + "loss": 0.0, + "num_input_tokens_seen": 6795864, + "step": 11145 + }, + { + "epoch": 3.0750137892995038, + "grad_norm": 0.026793967932462692, + "learning_rate": 4.372540792438495e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6798808, + "step": 11150 + }, + { + "epoch": 3.076392719249862, + "grad_norm": 6.828639016021043e-05, + "learning_rate": 4.3717432985205864e-05, + "loss": 0.0, + "num_input_tokens_seen": 6802040, + "step": 11155 + }, + { + "epoch": 3.0777716492002205, + "grad_norm": 0.0001072119630407542, + "learning_rate": 4.370945370946622e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6804792, + "step": 11160 + }, + { + "epoch": 3.0791505791505793, + "grad_norm": 0.0932232066988945, + "learning_rate": 4.3701470099014715e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6807832, + "step": 11165 + }, + { + "epoch": 3.0805295091009377, + "grad_norm": 0.0011224475456401706, + "learning_rate": 4.369348215570104e-05, + "loss": 0.0, + "num_input_tokens_seen": 6810168, + "step": 11170 + }, + { + "epoch": 3.081908439051296, + "grad_norm": 0.005091970320791006, + "learning_rate": 4.368548988137589e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6812824, + "step": 11175 + }, + { + "epoch": 3.083287369001655, + "grad_norm": 0.00022143991373013705, + "learning_rate": 4.367749327789095e-05, + "loss": 0.0, + "num_input_tokens_seen": 6815640, + "step": 11180 + }, + { + "epoch": 3.0846662989520133, + "grad_norm": 0.00012604534276761115, + "learning_rate": 4.366949234709892e-05, + "loss": 0.0, + "num_input_tokens_seen": 6819992, + "step": 11185 + }, + { + "epoch": 3.0860452289023717, + "grad_norm": 0.0005912964697927237, + "learning_rate": 4.36614870908535e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6822392, + "step": 11190 + }, + { + "epoch": 3.0874241588527305, + "grad_norm": 7.806299981893972e-05, + "learning_rate": 4.3653477511009395e-05, + "loss": 0.0, + "num_input_tokens_seen": 6827000, + "step": 11195 + }, + { + "epoch": 3.088803088803089, + "grad_norm": 0.0023375104647129774, + "learning_rate": 4.3645463609422316e-05, + "loss": 0.0, + "num_input_tokens_seen": 6829624, + "step": 11200 + }, + { + "epoch": 3.0901820187534472, + "grad_norm": 0.0009764437563717365, + "learning_rate": 4.3637445387948964e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6832088, + "step": 11205 + }, + { + "epoch": 3.091560948703806, + "grad_norm": 6.134053546702489e-05, + "learning_rate": 4.362942284844704e-05, + "loss": 0.0, + "num_input_tokens_seen": 6834840, + "step": 11210 + }, + { + "epoch": 3.0929398786541644, + "grad_norm": 0.0003216035838704556, + "learning_rate": 4.362139599277526e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6837528, + "step": 11215 + }, + { + "epoch": 3.094318808604523, + "grad_norm": 0.0009411487844772637, + "learning_rate": 4.361336482279333e-05, + "loss": 0.0, + "num_input_tokens_seen": 6841176, + "step": 11220 + }, + { + "epoch": 3.0956977385548816, + "grad_norm": 0.0029761497862637043, + "learning_rate": 4.360532934036195e-05, + "loss": 0.0, + "num_input_tokens_seen": 6844888, + "step": 11225 + }, + { + "epoch": 3.09707666850524, + "grad_norm": 0.0003328206657897681, + "learning_rate": 4.359728954734283e-05, + "loss": 0.0, + "num_input_tokens_seen": 6848088, + "step": 11230 + }, + { + "epoch": 3.0984555984555984, + "grad_norm": 0.012019454501569271, + "learning_rate": 4.358924544559868e-05, + "loss": 0.0, + "num_input_tokens_seen": 6850776, + "step": 11235 + }, + { + "epoch": 3.099834528405957, + "grad_norm": 0.005054401699453592, + "learning_rate": 4.3581197036993196e-05, + "loss": 0.0, + "num_input_tokens_seen": 6854168, + "step": 11240 + }, + { + "epoch": 3.1012134583563156, + "grad_norm": 9.972209930419922, + "learning_rate": 4.357314432339108e-05, + "loss": 0.1377, + "num_input_tokens_seen": 6856824, + "step": 11245 + }, + { + "epoch": 3.102592388306674, + "grad_norm": 0.0017383842496201396, + "learning_rate": 4.356508730665804e-05, + "loss": 0.0, + "num_input_tokens_seen": 6859160, + "step": 11250 + }, + { + "epoch": 3.1039713182570328, + "grad_norm": 7.797887519700453e-05, + "learning_rate": 4.355702598866075e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6862200, + "step": 11255 + }, + { + "epoch": 3.105350248207391, + "grad_norm": 0.00028600922087207437, + "learning_rate": 4.354896037126691e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6865016, + "step": 11260 + }, + { + "epoch": 3.1067291781577495, + "grad_norm": 0.11841735243797302, + "learning_rate": 4.354089045634522e-05, + "loss": 0.0004, + "num_input_tokens_seen": 6867992, + "step": 11265 + }, + { + "epoch": 3.108108108108108, + "grad_norm": 0.015732083469629288, + "learning_rate": 4.3532816245765364e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6870904, + "step": 11270 + }, + { + "epoch": 3.1094870380584667, + "grad_norm": 0.0008705485961399972, + "learning_rate": 4.3524737741398e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6874008, + "step": 11275 + }, + { + "epoch": 3.110865968008825, + "grad_norm": 0.0031808314379304647, + "learning_rate": 4.351665494511481e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6876664, + "step": 11280 + }, + { + "epoch": 3.1122448979591835, + "grad_norm": 29.936138153076172, + "learning_rate": 4.350856785878847e-05, + "loss": 0.1461, + "num_input_tokens_seen": 6879736, + "step": 11285 + }, + { + "epoch": 3.1136238279095423, + "grad_norm": 0.009312102571129799, + "learning_rate": 4.350047648429264e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6882840, + "step": 11290 + }, + { + "epoch": 3.1150027578599007, + "grad_norm": 0.0030555285047739744, + "learning_rate": 4.3492380823501975e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6885304, + "step": 11295 + }, + { + "epoch": 3.116381687810259, + "grad_norm": 0.003515803487971425, + "learning_rate": 4.348428087829211e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6888056, + "step": 11300 + }, + { + "epoch": 3.117760617760618, + "grad_norm": 0.003247283399105072, + "learning_rate": 4.3476176650539704e-05, + "loss": 0.0695, + "num_input_tokens_seen": 6890104, + "step": 11305 + }, + { + "epoch": 3.1191395477109762, + "grad_norm": 0.0006464805919677019, + "learning_rate": 4.3468068142122375e-05, + "loss": 0.0006, + "num_input_tokens_seen": 6892888, + "step": 11310 + }, + { + "epoch": 3.1205184776613346, + "grad_norm": 0.014889624901115894, + "learning_rate": 4.345995535491876e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6895704, + "step": 11315 + }, + { + "epoch": 3.1218974076116934, + "grad_norm": 0.006944125983864069, + "learning_rate": 4.3451838290808475e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6898680, + "step": 11320 + }, + { + "epoch": 3.123276337562052, + "grad_norm": 0.007178357802331448, + "learning_rate": 4.344371695167212e-05, + "loss": 0.0005, + "num_input_tokens_seen": 6900824, + "step": 11325 + }, + { + "epoch": 3.12465526751241, + "grad_norm": 0.0008805608376860619, + "learning_rate": 4.34355913393913e-05, + "loss": 0.0007, + "num_input_tokens_seen": 6903192, + "step": 11330 + }, + { + "epoch": 3.126034197462769, + "grad_norm": 0.006284829694777727, + "learning_rate": 4.342746145584859e-05, + "loss": 0.0004, + "num_input_tokens_seen": 6906296, + "step": 11335 + }, + { + "epoch": 3.1274131274131274, + "grad_norm": 0.0011313227005302906, + "learning_rate": 4.341932730292759e-05, + "loss": 0.0, + "num_input_tokens_seen": 6909240, + "step": 11340 + }, + { + "epoch": 3.1287920573634858, + "grad_norm": 0.0046898117288947105, + "learning_rate": 4.341118888251284e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6911704, + "step": 11345 + }, + { + "epoch": 3.1301709873138446, + "grad_norm": 0.00171209080144763, + "learning_rate": 4.3403046196489916e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6914168, + "step": 11350 + }, + { + "epoch": 3.131549917264203, + "grad_norm": 0.00042343948734924197, + "learning_rate": 4.339489924674535e-05, + "loss": 0.0, + "num_input_tokens_seen": 6916664, + "step": 11355 + }, + { + "epoch": 3.1329288472145613, + "grad_norm": 0.0003724884882103652, + "learning_rate": 4.338674803516668e-05, + "loss": 0.065, + "num_input_tokens_seen": 6919800, + "step": 11360 + }, + { + "epoch": 3.13430777716492, + "grad_norm": 0.03269362449645996, + "learning_rate": 4.337859256364242e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6922360, + "step": 11365 + }, + { + "epoch": 3.1356867071152785, + "grad_norm": 0.05823814496397972, + "learning_rate": 4.3370432834062076e-05, + "loss": 0.0003, + "num_input_tokens_seen": 6925208, + "step": 11370 + }, + { + "epoch": 3.137065637065637, + "grad_norm": 0.0013899396872147918, + "learning_rate": 4.336226884831614e-05, + "loss": 0.0, + "num_input_tokens_seen": 6927992, + "step": 11375 + }, + { + "epoch": 3.1384445670159957, + "grad_norm": 0.0015231478027999401, + "learning_rate": 4.3354100608296086e-05, + "loss": 0.0, + "num_input_tokens_seen": 6930584, + "step": 11380 + }, + { + "epoch": 3.139823496966354, + "grad_norm": 0.0006699340301565826, + "learning_rate": 4.334592811589439e-05, + "loss": 0.0, + "num_input_tokens_seen": 6933560, + "step": 11385 + }, + { + "epoch": 3.1412024269167125, + "grad_norm": 0.0004262955626472831, + "learning_rate": 4.33377513730045e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6936152, + "step": 11390 + }, + { + "epoch": 3.1425813568670713, + "grad_norm": 0.019219275563955307, + "learning_rate": 4.332957038152084e-05, + "loss": 0.0002, + "num_input_tokens_seen": 6938904, + "step": 11395 + }, + { + "epoch": 3.1439602868174297, + "grad_norm": 0.0007518504280596972, + "learning_rate": 4.332138514333883e-05, + "loss": 0.0, + "num_input_tokens_seen": 6941816, + "step": 11400 + }, + { + "epoch": 3.145339216767788, + "grad_norm": 0.04597600921988487, + "learning_rate": 4.331319566035488e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6944600, + "step": 11405 + }, + { + "epoch": 3.146718146718147, + "grad_norm": 0.00012270582374185324, + "learning_rate": 4.330500193446637e-05, + "loss": 0.0, + "num_input_tokens_seen": 6947832, + "step": 11410 + }, + { + "epoch": 3.1480970766685052, + "grad_norm": 0.007412131410092115, + "learning_rate": 4.329680396757166e-05, + "loss": 0.0, + "num_input_tokens_seen": 6950360, + "step": 11415 + }, + { + "epoch": 3.1494760066188636, + "grad_norm": 0.00041473034070804715, + "learning_rate": 4.328860176157012e-05, + "loss": 0.0, + "num_input_tokens_seen": 6953624, + "step": 11420 + }, + { + "epoch": 3.1508549365692224, + "grad_norm": 0.00017228002252522856, + "learning_rate": 4.3280395318362076e-05, + "loss": 0.0, + "num_input_tokens_seen": 6957176, + "step": 11425 + }, + { + "epoch": 3.152233866519581, + "grad_norm": 9.316811338067055e-05, + "learning_rate": 4.3272184639848836e-05, + "loss": 0.0, + "num_input_tokens_seen": 6961304, + "step": 11430 + }, + { + "epoch": 3.153612796469939, + "grad_norm": 0.0008522114367224276, + "learning_rate": 4.326396972793271e-05, + "loss": 0.0, + "num_input_tokens_seen": 6964280, + "step": 11435 + }, + { + "epoch": 3.154991726420298, + "grad_norm": 14.900379180908203, + "learning_rate": 4.325575058451695e-05, + "loss": 0.098, + "num_input_tokens_seen": 6968728, + "step": 11440 + }, + { + "epoch": 3.1563706563706564, + "grad_norm": 0.00043966376688331366, + "learning_rate": 4.3247527211505844e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6971832, + "step": 11445 + }, + { + "epoch": 3.1577495863210148, + "grad_norm": 7.915172318462282e-05, + "learning_rate": 4.3239299610804615e-05, + "loss": 0.0, + "num_input_tokens_seen": 6974776, + "step": 11450 + }, + { + "epoch": 3.1591285162713736, + "grad_norm": 0.0005760343628935516, + "learning_rate": 4.323106778431948e-05, + "loss": 0.0017, + "num_input_tokens_seen": 6977464, + "step": 11455 + }, + { + "epoch": 3.160507446221732, + "grad_norm": 0.0020790130365639925, + "learning_rate": 4.3222831733957634e-05, + "loss": 0.0, + "num_input_tokens_seen": 6981048, + "step": 11460 + }, + { + "epoch": 3.1618863761720903, + "grad_norm": 0.0004798321460839361, + "learning_rate": 4.321459146162725e-05, + "loss": 0.0, + "num_input_tokens_seen": 6984056, + "step": 11465 + }, + { + "epoch": 3.163265306122449, + "grad_norm": 0.00881137978285551, + "learning_rate": 4.3206346969237486e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6987224, + "step": 11470 + }, + { + "epoch": 3.1646442360728075, + "grad_norm": 0.01226765476167202, + "learning_rate": 4.319809825869847e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6990680, + "step": 11475 + }, + { + "epoch": 3.166023166023166, + "grad_norm": 0.0015549216186627746, + "learning_rate": 4.318984533192132e-05, + "loss": 0.0, + "num_input_tokens_seen": 6993528, + "step": 11480 + }, + { + "epoch": 3.1674020959735247, + "grad_norm": 0.00838538445532322, + "learning_rate": 4.31815881908181e-05, + "loss": 0.0001, + "num_input_tokens_seen": 6996056, + "step": 11485 + }, + { + "epoch": 3.168781025923883, + "grad_norm": 15.330096244812012, + "learning_rate": 4.317332683730189e-05, + "loss": 0.0736, + "num_input_tokens_seen": 6998488, + "step": 11490 + }, + { + "epoch": 3.1701599558742415, + "grad_norm": 0.014631516300141811, + "learning_rate": 4.316506127328671e-05, + "loss": 0.0373, + "num_input_tokens_seen": 7000888, + "step": 11495 + }, + { + "epoch": 3.1715388858246003, + "grad_norm": 0.007855266332626343, + "learning_rate": 4.315679150068759e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7003000, + "step": 11500 + }, + { + "epoch": 3.1729178157749587, + "grad_norm": 0.00014991704665590078, + "learning_rate": 4.314851752142051e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7006808, + "step": 11505 + }, + { + "epoch": 3.174296745725317, + "grad_norm": 0.0002430374443065375, + "learning_rate": 4.3140239337402426e-05, + "loss": 0.0, + "num_input_tokens_seen": 7009688, + "step": 11510 + }, + { + "epoch": 3.175675675675676, + "grad_norm": 0.00018179694598075002, + "learning_rate": 4.313195695055128e-05, + "loss": 0.0, + "num_input_tokens_seen": 7012984, + "step": 11515 + }, + { + "epoch": 3.1770546056260343, + "grad_norm": 0.000370145047781989, + "learning_rate": 4.312367036278599e-05, + "loss": 0.0, + "num_input_tokens_seen": 7016088, + "step": 11520 + }, + { + "epoch": 3.1784335355763926, + "grad_norm": 0.00012161608174210414, + "learning_rate": 4.311537957602643e-05, + "loss": 0.0, + "num_input_tokens_seen": 7018680, + "step": 11525 + }, + { + "epoch": 3.1798124655267515, + "grad_norm": 0.00010268048208672553, + "learning_rate": 4.310708459219345e-05, + "loss": 0.0, + "num_input_tokens_seen": 7021528, + "step": 11530 + }, + { + "epoch": 3.18119139547711, + "grad_norm": 8.031549805309623e-05, + "learning_rate": 4.3098785413208896e-05, + "loss": 0.0, + "num_input_tokens_seen": 7023800, + "step": 11535 + }, + { + "epoch": 3.182570325427468, + "grad_norm": 0.0009785550646483898, + "learning_rate": 4.309048204099555e-05, + "loss": 0.0355, + "num_input_tokens_seen": 7026424, + "step": 11540 + }, + { + "epoch": 3.1839492553778266, + "grad_norm": 0.002144457073882222, + "learning_rate": 4.3082174477477197e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7031512, + "step": 11545 + }, + { + "epoch": 3.1853281853281854, + "grad_norm": 0.02954285405576229, + "learning_rate": 4.3073862724578584e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7033944, + "step": 11550 + }, + { + "epoch": 3.186707115278544, + "grad_norm": 0.0006732586771249771, + "learning_rate": 4.3065546784225416e-05, + "loss": 0.0, + "num_input_tokens_seen": 7039128, + "step": 11555 + }, + { + "epoch": 3.1880860452289026, + "grad_norm": 0.004339973907917738, + "learning_rate": 4.3057226658344374e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7041752, + "step": 11560 + }, + { + "epoch": 3.189464975179261, + "grad_norm": 0.00015984068159013987, + "learning_rate": 4.3048902348863116e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7044248, + "step": 11565 + }, + { + "epoch": 3.1908439051296194, + "grad_norm": 3.708634903887287e-05, + "learning_rate": 4.304057385771027e-05, + "loss": 0.0, + "num_input_tokens_seen": 7047768, + "step": 11570 + }, + { + "epoch": 3.1922228350799777, + "grad_norm": 0.00216269725933671, + "learning_rate": 4.303224118681542e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7051000, + "step": 11575 + }, + { + "epoch": 3.1936017650303365, + "grad_norm": 0.0035157338716089725, + "learning_rate": 4.302390433810912e-05, + "loss": 0.0, + "num_input_tokens_seen": 7054200, + "step": 11580 + }, + { + "epoch": 3.194980694980695, + "grad_norm": 0.0014530925545841455, + "learning_rate": 4.3015563313522924e-05, + "loss": 0.0, + "num_input_tokens_seen": 7057976, + "step": 11585 + }, + { + "epoch": 3.1963596249310533, + "grad_norm": 0.001863851328380406, + "learning_rate": 4.30072181149893e-05, + "loss": 0.0, + "num_input_tokens_seen": 7061880, + "step": 11590 + }, + { + "epoch": 3.197738554881412, + "grad_norm": 0.0780901238322258, + "learning_rate": 4.2998868744441725e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7064856, + "step": 11595 + }, + { + "epoch": 3.1991174848317705, + "grad_norm": 0.00018452017684467137, + "learning_rate": 4.299051520381462e-05, + "loss": 0.0, + "num_input_tokens_seen": 7067224, + "step": 11600 + }, + { + "epoch": 3.200496414782129, + "grad_norm": 0.0028358311392366886, + "learning_rate": 4.29821574950434e-05, + "loss": 0.0, + "num_input_tokens_seen": 7069880, + "step": 11605 + }, + { + "epoch": 3.2018753447324877, + "grad_norm": 0.016518667340278625, + "learning_rate": 4.29737956200644e-05, + "loss": 0.0, + "num_input_tokens_seen": 7073336, + "step": 11610 + }, + { + "epoch": 3.203254274682846, + "grad_norm": 3.115079402923584, + "learning_rate": 4.2965429580814964e-05, + "loss": 0.2063, + "num_input_tokens_seen": 7077496, + "step": 11615 + }, + { + "epoch": 3.2046332046332044, + "grad_norm": 0.0029555647633969784, + "learning_rate": 4.295705937923337e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7080632, + "step": 11620 + }, + { + "epoch": 3.2060121345835633, + "grad_norm": 0.01566106081008911, + "learning_rate": 4.2948685017258896e-05, + "loss": 0.0003, + "num_input_tokens_seen": 7083192, + "step": 11625 + }, + { + "epoch": 3.2073910645339216, + "grad_norm": 0.001199032529257238, + "learning_rate": 4.294030649683173e-05, + "loss": 0.0004, + "num_input_tokens_seen": 7086904, + "step": 11630 + }, + { + "epoch": 3.20876999448428, + "grad_norm": 0.05022937059402466, + "learning_rate": 4.2931923819893074e-05, + "loss": 0.0004, + "num_input_tokens_seen": 7089944, + "step": 11635 + }, + { + "epoch": 3.210148924434639, + "grad_norm": 0.5031954646110535, + "learning_rate": 4.292353698838508e-05, + "loss": 0.0008, + "num_input_tokens_seen": 7092696, + "step": 11640 + }, + { + "epoch": 3.211527854384997, + "grad_norm": 0.026532962918281555, + "learning_rate": 4.2915146004250836e-05, + "loss": 0.0004, + "num_input_tokens_seen": 7095320, + "step": 11645 + }, + { + "epoch": 3.2129067843353556, + "grad_norm": 0.027760261669754982, + "learning_rate": 4.290675086943444e-05, + "loss": 0.0005, + "num_input_tokens_seen": 7097624, + "step": 11650 + }, + { + "epoch": 3.2142857142857144, + "grad_norm": 0.0014631436206400394, + "learning_rate": 4.2898351585880894e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7100760, + "step": 11655 + }, + { + "epoch": 3.215664644236073, + "grad_norm": 0.0027735817711800337, + "learning_rate": 4.288994815553622e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7104440, + "step": 11660 + }, + { + "epoch": 3.217043574186431, + "grad_norm": 0.0042485883459448814, + "learning_rate": 4.288154058034734e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7106936, + "step": 11665 + }, + { + "epoch": 3.21842250413679, + "grad_norm": 0.0099521828815341, + "learning_rate": 4.2873128862262204e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7109432, + "step": 11670 + }, + { + "epoch": 3.2198014340871484, + "grad_norm": 0.001830184948630631, + "learning_rate": 4.286471300322966e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7112728, + "step": 11675 + }, + { + "epoch": 3.2211803640375067, + "grad_norm": 0.0010891134152188897, + "learning_rate": 4.2856293005199555e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7115864, + "step": 11680 + }, + { + "epoch": 3.2225592939878656, + "grad_norm": 0.002954290946945548, + "learning_rate": 4.284786887012268e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7119096, + "step": 11685 + }, + { + "epoch": 3.223938223938224, + "grad_norm": 0.0004648844478651881, + "learning_rate": 4.283944059995078e-05, + "loss": 0.0, + "num_input_tokens_seen": 7121560, + "step": 11690 + }, + { + "epoch": 3.2253171538885823, + "grad_norm": 0.0014669529628008604, + "learning_rate": 4.2831008196636556e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7124920, + "step": 11695 + }, + { + "epoch": 3.226696083838941, + "grad_norm": 0.006462618242949247, + "learning_rate": 4.2822571662133705e-05, + "loss": 0.0, + "num_input_tokens_seen": 7127768, + "step": 11700 + }, + { + "epoch": 3.2280750137892995, + "grad_norm": 0.0032141529954969883, + "learning_rate": 4.2814130998396816e-05, + "loss": 0.1063, + "num_input_tokens_seen": 7130744, + "step": 11705 + }, + { + "epoch": 3.229453943739658, + "grad_norm": 0.0011405297555029392, + "learning_rate": 4.28056862073815e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7134104, + "step": 11710 + }, + { + "epoch": 3.2308328736900167, + "grad_norm": 0.0034941479098051786, + "learning_rate": 4.279723729104428e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7137080, + "step": 11715 + }, + { + "epoch": 3.232211803640375, + "grad_norm": 0.010180235840380192, + "learning_rate": 4.2788784251342637e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7139768, + "step": 11720 + }, + { + "epoch": 3.2335907335907335, + "grad_norm": 0.0020476211793720722, + "learning_rate": 4.278032709023504e-05, + "loss": 0.0004, + "num_input_tokens_seen": 7142872, + "step": 11725 + }, + { + "epoch": 3.2349696635410923, + "grad_norm": 0.0010998768266290426, + "learning_rate": 4.2771865809680875e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7145464, + "step": 11730 + }, + { + "epoch": 3.2363485934914507, + "grad_norm": 0.009306549094617367, + "learning_rate": 4.2763400411640514e-05, + "loss": 0.0048, + "num_input_tokens_seen": 7149880, + "step": 11735 + }, + { + "epoch": 3.237727523441809, + "grad_norm": 0.002969828899949789, + "learning_rate": 4.275493089807526e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7153336, + "step": 11740 + }, + { + "epoch": 3.239106453392168, + "grad_norm": 0.0027356387581676245, + "learning_rate": 4.2746457270947385e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7155896, + "step": 11745 + }, + { + "epoch": 3.2404853833425262, + "grad_norm": 0.013263723812997341, + "learning_rate": 4.273797953222009e-05, + "loss": 0.0549, + "num_input_tokens_seen": 7159096, + "step": 11750 + }, + { + "epoch": 3.2418643132928846, + "grad_norm": 0.010709513910114765, + "learning_rate": 4.2729497683857556e-05, + "loss": 0.0013, + "num_input_tokens_seen": 7162168, + "step": 11755 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 0.0042944797314703465, + "learning_rate": 4.272101172782491e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7165016, + "step": 11760 + }, + { + "epoch": 3.244622173193602, + "grad_norm": 0.0009573962306603789, + "learning_rate": 4.271252166608821e-05, + "loss": 0.0, + "num_input_tokens_seen": 7167224, + "step": 11765 + }, + { + "epoch": 3.24600110314396, + "grad_norm": 0.0009172451682388783, + "learning_rate": 4.270402750061451e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7170168, + "step": 11770 + }, + { + "epoch": 3.247380033094319, + "grad_norm": 0.0002575912803877145, + "learning_rate": 4.269552923337176e-05, + "loss": 0.0, + "num_input_tokens_seen": 7173240, + "step": 11775 + }, + { + "epoch": 3.2487589630446774, + "grad_norm": 0.0003969910030718893, + "learning_rate": 4.26870268663289e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7176440, + "step": 11780 + }, + { + "epoch": 3.2501378929950357, + "grad_norm": 0.0010612952755764127, + "learning_rate": 4.2678520401455795e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7179608, + "step": 11785 + }, + { + "epoch": 3.251516822945394, + "grad_norm": 0.004836036823689938, + "learning_rate": 4.267000984072328e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7182200, + "step": 11790 + }, + { + "epoch": 3.252895752895753, + "grad_norm": 0.0006631448050029576, + "learning_rate": 4.266149518610313e-05, + "loss": 0.0, + "num_input_tokens_seen": 7185176, + "step": 11795 + }, + { + "epoch": 3.2542746828461113, + "grad_norm": 0.00161137362010777, + "learning_rate": 4.2652976439568074e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7188216, + "step": 11800 + }, + { + "epoch": 3.25565361279647, + "grad_norm": 0.0016899239271879196, + "learning_rate": 4.264445360309176e-05, + "loss": 0.0, + "num_input_tokens_seen": 7191224, + "step": 11805 + }, + { + "epoch": 3.2570325427468285, + "grad_norm": 0.00014752900460734963, + "learning_rate": 4.263592667864883e-05, + "loss": 0.0, + "num_input_tokens_seen": 7195288, + "step": 11810 + }, + { + "epoch": 3.258411472697187, + "grad_norm": 0.0005913093336857855, + "learning_rate": 4.2627395668214843e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7197784, + "step": 11815 + }, + { + "epoch": 3.2597904026475453, + "grad_norm": 0.000840241031255573, + "learning_rate": 4.2618860573766305e-05, + "loss": 0.0, + "num_input_tokens_seen": 7200472, + "step": 11820 + }, + { + "epoch": 3.261169332597904, + "grad_norm": 0.00031895507709123194, + "learning_rate": 4.2610321397280684e-05, + "loss": 0.0, + "num_input_tokens_seen": 7203704, + "step": 11825 + }, + { + "epoch": 3.2625482625482625, + "grad_norm": 0.0005378396017476916, + "learning_rate": 4.260177814073638e-05, + "loss": 0.1063, + "num_input_tokens_seen": 7207288, + "step": 11830 + }, + { + "epoch": 3.2639271924986213, + "grad_norm": 0.001934459200128913, + "learning_rate": 4.259323080611275e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7209464, + "step": 11835 + }, + { + "epoch": 3.2653061224489797, + "grad_norm": 0.003496430814266205, + "learning_rate": 4.258467939539008e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7212024, + "step": 11840 + }, + { + "epoch": 3.266685052399338, + "grad_norm": 0.00030939950374886394, + "learning_rate": 4.257612391054961e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7215864, + "step": 11845 + }, + { + "epoch": 3.2680639823496964, + "grad_norm": 0.005613837856799364, + "learning_rate": 4.2567564353573525e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7218904, + "step": 11850 + }, + { + "epoch": 3.2694429123000552, + "grad_norm": 5.73789119720459, + "learning_rate": 4.2559000726444956e-05, + "loss": 0.098, + "num_input_tokens_seen": 7221208, + "step": 11855 + }, + { + "epoch": 3.2708218422504136, + "grad_norm": 0.006158013828098774, + "learning_rate": 4.2550433031147965e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7224376, + "step": 11860 + }, + { + "epoch": 3.2722007722007724, + "grad_norm": 0.0004870586271863431, + "learning_rate": 4.254186126966756e-05, + "loss": 0.0003, + "num_input_tokens_seen": 7226584, + "step": 11865 + }, + { + "epoch": 3.273579702151131, + "grad_norm": 0.005293484311550856, + "learning_rate": 4.2533285443989714e-05, + "loss": 0.0008, + "num_input_tokens_seen": 7229560, + "step": 11870 + }, + { + "epoch": 3.274958632101489, + "grad_norm": 0.006149296648800373, + "learning_rate": 4.2524705556101304e-05, + "loss": 0.0003, + "num_input_tokens_seen": 7232568, + "step": 11875 + }, + { + "epoch": 3.2763375620518476, + "grad_norm": 0.017530852928757668, + "learning_rate": 4.2516121607990175e-05, + "loss": 0.0004, + "num_input_tokens_seen": 7236664, + "step": 11880 + }, + { + "epoch": 3.2777164920022064, + "grad_norm": 0.002272647572681308, + "learning_rate": 4.25075336016451e-05, + "loss": 0.0399, + "num_input_tokens_seen": 7239896, + "step": 11885 + }, + { + "epoch": 3.2790954219525648, + "grad_norm": 0.002037188969552517, + "learning_rate": 4.2498941539055806e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7243288, + "step": 11890 + }, + { + "epoch": 3.2804743519029236, + "grad_norm": 0.0008921648841351271, + "learning_rate": 4.249034542221294e-05, + "loss": 0.0006, + "num_input_tokens_seen": 7246488, + "step": 11895 + }, + { + "epoch": 3.281853281853282, + "grad_norm": 0.013943901285529137, + "learning_rate": 4.248174525310811e-05, + "loss": 0.0299, + "num_input_tokens_seen": 7248888, + "step": 11900 + }, + { + "epoch": 3.2832322118036403, + "grad_norm": 0.006183581426739693, + "learning_rate": 4.247314103373383e-05, + "loss": 0.0006, + "num_input_tokens_seen": 7251224, + "step": 11905 + }, + { + "epoch": 3.2846111417539987, + "grad_norm": 0.0009436063701286912, + "learning_rate": 4.2464532766083595e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7254712, + "step": 11910 + }, + { + "epoch": 3.2859900717043575, + "grad_norm": 3.221309088985436e-05, + "learning_rate": 4.245592045215182e-05, + "loss": 0.0005, + "num_input_tokens_seen": 7257976, + "step": 11915 + }, + { + "epoch": 3.287369001654716, + "grad_norm": 0.0014165466418489814, + "learning_rate": 4.244730409393385e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7261272, + "step": 11920 + }, + { + "epoch": 3.2887479316050743, + "grad_norm": 0.003939894028007984, + "learning_rate": 4.243868369342595e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7263768, + "step": 11925 + }, + { + "epoch": 3.290126861555433, + "grad_norm": 0.005802876316010952, + "learning_rate": 4.2430059252625374e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7266616, + "step": 11930 + }, + { + "epoch": 3.2915057915057915, + "grad_norm": 0.0003942724724765867, + "learning_rate": 4.242143077353026e-05, + "loss": 0.1295, + "num_input_tokens_seen": 7269432, + "step": 11935 + }, + { + "epoch": 3.29288472145615, + "grad_norm": 0.002527924720197916, + "learning_rate": 4.241279825813972e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7272568, + "step": 11940 + }, + { + "epoch": 3.2942636514065087, + "grad_norm": 0.011342327110469341, + "learning_rate": 4.240416170845376e-05, + "loss": 0.0004, + "num_input_tokens_seen": 7275704, + "step": 11945 + }, + { + "epoch": 3.295642581356867, + "grad_norm": 0.005288524553179741, + "learning_rate": 4.239552112647337e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7278584, + "step": 11950 + }, + { + "epoch": 3.2970215113072254, + "grad_norm": 0.00609660055488348, + "learning_rate": 4.238687651420044e-05, + "loss": 0.0003, + "num_input_tokens_seen": 7281176, + "step": 11955 + }, + { + "epoch": 3.2984004412575842, + "grad_norm": 0.007085133343935013, + "learning_rate": 4.2378227873637785e-05, + "loss": 0.0003, + "num_input_tokens_seen": 7284888, + "step": 11960 + }, + { + "epoch": 3.2997793712079426, + "grad_norm": 0.017269432544708252, + "learning_rate": 4.2369575206789196e-05, + "loss": 0.0237, + "num_input_tokens_seen": 7287288, + "step": 11965 + }, + { + "epoch": 3.301158301158301, + "grad_norm": 0.0017261295579373837, + "learning_rate": 4.2360918515659366e-05, + "loss": 0.0006, + "num_input_tokens_seen": 7289944, + "step": 11970 + }, + { + "epoch": 3.30253723110866, + "grad_norm": 0.007170327939093113, + "learning_rate": 4.2352257802253914e-05, + "loss": 0.0006, + "num_input_tokens_seen": 7292568, + "step": 11975 + }, + { + "epoch": 3.303916161059018, + "grad_norm": 0.0016039611073210835, + "learning_rate": 4.234359306857941e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7295512, + "step": 11980 + }, + { + "epoch": 3.3052950910093766, + "grad_norm": 0.0021263931412249804, + "learning_rate": 4.233492431664335e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7299768, + "step": 11985 + }, + { + "epoch": 3.3066740209597354, + "grad_norm": 0.0034210619051009417, + "learning_rate": 4.232625154845417e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7303192, + "step": 11990 + }, + { + "epoch": 3.3080529509100938, + "grad_norm": 0.002248935168609023, + "learning_rate": 4.2317574766021203e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7307768, + "step": 11995 + }, + { + "epoch": 3.309431880860452, + "grad_norm": 0.002161960117518902, + "learning_rate": 4.230889397135475e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7310648, + "step": 12000 + }, + { + "epoch": 3.310810810810811, + "grad_norm": 0.005023464094847441, + "learning_rate": 4.2300209166466026e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7313080, + "step": 12005 + }, + { + "epoch": 3.3121897407611693, + "grad_norm": 0.0009308184962719679, + "learning_rate": 4.2291520353367165e-05, + "loss": 0.0, + "num_input_tokens_seen": 7316216, + "step": 12010 + }, + { + "epoch": 3.3135686707115277, + "grad_norm": 0.0009438966517336667, + "learning_rate": 4.2282827534071256e-05, + "loss": 0.0, + "num_input_tokens_seen": 7319224, + "step": 12015 + }, + { + "epoch": 3.3149476006618865, + "grad_norm": 0.000831877114251256, + "learning_rate": 4.227413071059229e-05, + "loss": 0.0, + "num_input_tokens_seen": 7322840, + "step": 12020 + }, + { + "epoch": 3.316326530612245, + "grad_norm": 0.0011950060725212097, + "learning_rate": 4.2265429884945196e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7326008, + "step": 12025 + }, + { + "epoch": 3.3177054605626033, + "grad_norm": 0.0011091150809079409, + "learning_rate": 4.225672505914583e-05, + "loss": 0.0009, + "num_input_tokens_seen": 7328888, + "step": 12030 + }, + { + "epoch": 3.319084390512962, + "grad_norm": 0.0013161271344870329, + "learning_rate": 4.224801623521098e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7331544, + "step": 12035 + }, + { + "epoch": 3.3204633204633205, + "grad_norm": 0.0022397583816200495, + "learning_rate": 4.223930341515836e-05, + "loss": 0.0011, + "num_input_tokens_seen": 7335096, + "step": 12040 + }, + { + "epoch": 3.321842250413679, + "grad_norm": 0.007636088877916336, + "learning_rate": 4.2230586601006593e-05, + "loss": 0.0939, + "num_input_tokens_seen": 7337368, + "step": 12045 + }, + { + "epoch": 3.3232211803640377, + "grad_norm": 0.008387607522308826, + "learning_rate": 4.222186579477525e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7339768, + "step": 12050 + }, + { + "epoch": 3.324600110314396, + "grad_norm": 8.421810150146484, + "learning_rate": 4.221314099848481e-05, + "loss": 0.1649, + "num_input_tokens_seen": 7342744, + "step": 12055 + }, + { + "epoch": 3.3259790402647544, + "grad_norm": 0.0003484303888399154, + "learning_rate": 4.220441221415668e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7345560, + "step": 12060 + }, + { + "epoch": 3.3273579702151133, + "grad_norm": 0.0004906922113150358, + "learning_rate": 4.219567944381321e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7348312, + "step": 12065 + }, + { + "epoch": 3.3287369001654716, + "grad_norm": 0.05352053791284561, + "learning_rate": 4.218694268947764e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7352056, + "step": 12070 + }, + { + "epoch": 3.33011583011583, + "grad_norm": 0.004777665715664625, + "learning_rate": 4.2178201953174156e-05, + "loss": 0.0003, + "num_input_tokens_seen": 7355768, + "step": 12075 + }, + { + "epoch": 3.331494760066189, + "grad_norm": 0.0010732949012890458, + "learning_rate": 4.216945723692786e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7358104, + "step": 12080 + }, + { + "epoch": 3.332873690016547, + "grad_norm": 0.0013537449995055795, + "learning_rate": 4.216070854276479e-05, + "loss": 0.0004, + "num_input_tokens_seen": 7361432, + "step": 12085 + }, + { + "epoch": 3.3342526199669056, + "grad_norm": 0.00238424981944263, + "learning_rate": 4.215195587271188e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7363640, + "step": 12090 + }, + { + "epoch": 3.335631549917264, + "grad_norm": 0.007810629904270172, + "learning_rate": 4.214319922879699e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7366616, + "step": 12095 + }, + { + "epoch": 3.337010479867623, + "grad_norm": 0.005447761621326208, + "learning_rate": 4.213443861304893e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7370424, + "step": 12100 + }, + { + "epoch": 3.338389409817981, + "grad_norm": 0.005307051353156567, + "learning_rate": 4.21256740274974e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7373752, + "step": 12105 + }, + { + "epoch": 3.33976833976834, + "grad_norm": 0.0021430382039397955, + "learning_rate": 4.211690547417302e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7376920, + "step": 12110 + }, + { + "epoch": 3.3411472697186984, + "grad_norm": 0.004281078465282917, + "learning_rate": 4.210813295510735e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7379992, + "step": 12115 + }, + { + "epoch": 3.3425261996690567, + "grad_norm": 0.0019242411945015192, + "learning_rate": 4.209935647233285e-05, + "loss": 0.0043, + "num_input_tokens_seen": 7382840, + "step": 12120 + }, + { + "epoch": 3.343905129619415, + "grad_norm": 0.0004251321079209447, + "learning_rate": 4.20905760278829e-05, + "loss": 0.0, + "num_input_tokens_seen": 7385560, + "step": 12125 + }, + { + "epoch": 3.345284059569774, + "grad_norm": 0.0013144055847078562, + "learning_rate": 4.208179162379182e-05, + "loss": 0.0549, + "num_input_tokens_seen": 7388088, + "step": 12130 + }, + { + "epoch": 3.3466629895201323, + "grad_norm": 0.00985940545797348, + "learning_rate": 4.207300326209481e-05, + "loss": 0.0005, + "num_input_tokens_seen": 7392088, + "step": 12135 + }, + { + "epoch": 3.348041919470491, + "grad_norm": 0.005419768858700991, + "learning_rate": 4.206421094482803e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7395256, + "step": 12140 + }, + { + "epoch": 3.3494208494208495, + "grad_norm": 0.0532696470618248, + "learning_rate": 4.2055414674028516e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7398072, + "step": 12145 + }, + { + "epoch": 3.350799779371208, + "grad_norm": 0.0008124826126731932, + "learning_rate": 4.2046614451734234e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7402008, + "step": 12150 + }, + { + "epoch": 3.3521787093215663, + "grad_norm": 0.0028394002001732588, + "learning_rate": 4.203781027998408e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7404152, + "step": 12155 + }, + { + "epoch": 3.353557639271925, + "grad_norm": 0.00036252872087061405, + "learning_rate": 4.2029002160817864e-05, + "loss": 0.0, + "num_input_tokens_seen": 7406424, + "step": 12160 + }, + { + "epoch": 3.3549365692222834, + "grad_norm": 0.0033257093746215105, + "learning_rate": 4.202019009627628e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7409720, + "step": 12165 + }, + { + "epoch": 3.3563154991726423, + "grad_norm": 0.005324434023350477, + "learning_rate": 4.201137408840097e-05, + "loss": 0.0, + "num_input_tokens_seen": 7412312, + "step": 12170 + }, + { + "epoch": 3.3576944291230006, + "grad_norm": 0.001503957319073379, + "learning_rate": 4.200255413923446e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7414616, + "step": 12175 + }, + { + "epoch": 3.359073359073359, + "grad_norm": 0.00021308976283762604, + "learning_rate": 4.199373025082023e-05, + "loss": 0.0, + "num_input_tokens_seen": 7417784, + "step": 12180 + }, + { + "epoch": 3.3604522890237174, + "grad_norm": 0.00017378441407345235, + "learning_rate": 4.1984902425202636e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7420088, + "step": 12185 + }, + { + "epoch": 3.361831218974076, + "grad_norm": 0.0037117076572030783, + "learning_rate": 4.197607066442695e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7424152, + "step": 12190 + }, + { + "epoch": 3.3632101489244346, + "grad_norm": 0.001210473943501711, + "learning_rate": 4.1967234970539384e-05, + "loss": 0.0013, + "num_input_tokens_seen": 7426616, + "step": 12195 + }, + { + "epoch": 3.364589078874793, + "grad_norm": 0.0002830368757713586, + "learning_rate": 4.195839534558702e-05, + "loss": 0.0, + "num_input_tokens_seen": 7429336, + "step": 12200 + }, + { + "epoch": 3.365968008825152, + "grad_norm": 0.00019406525825615972, + "learning_rate": 4.194955179161789e-05, + "loss": 0.0, + "num_input_tokens_seen": 7431640, + "step": 12205 + }, + { + "epoch": 3.36734693877551, + "grad_norm": 0.0002227967925136909, + "learning_rate": 4.19407043106809e-05, + "loss": 0.0, + "num_input_tokens_seen": 7434744, + "step": 12210 + }, + { + "epoch": 3.3687258687258685, + "grad_norm": 0.0007326032500714064, + "learning_rate": 4.193185290482591e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7437112, + "step": 12215 + }, + { + "epoch": 3.3701047986762274, + "grad_norm": 0.0014360245550051332, + "learning_rate": 4.192299757610364e-05, + "loss": 0.0, + "num_input_tokens_seen": 7439544, + "step": 12220 + }, + { + "epoch": 3.3714837286265857, + "grad_norm": 0.0011741507332772017, + "learning_rate": 4.1914138326565756e-05, + "loss": 0.0, + "num_input_tokens_seen": 7442360, + "step": 12225 + }, + { + "epoch": 3.372862658576944, + "grad_norm": 0.0015485970070585608, + "learning_rate": 4.1905275158264804e-05, + "loss": 0.0, + "num_input_tokens_seen": 7444472, + "step": 12230 + }, + { + "epoch": 3.374241588527303, + "grad_norm": 11.30135726928711, + "learning_rate": 4.189640807325428e-05, + "loss": 0.1657, + "num_input_tokens_seen": 7447160, + "step": 12235 + }, + { + "epoch": 3.3756205184776613, + "grad_norm": 0.0018227698747068644, + "learning_rate": 4.188753707358853e-05, + "loss": 0.0096, + "num_input_tokens_seen": 7449784, + "step": 12240 + }, + { + "epoch": 3.3769994484280197, + "grad_norm": 0.00915641337633133, + "learning_rate": 4.187866216132286e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7452632, + "step": 12245 + }, + { + "epoch": 3.3783783783783785, + "grad_norm": 0.0006694025942124426, + "learning_rate": 4.1869783338513435e-05, + "loss": 0.0006, + "num_input_tokens_seen": 7456120, + "step": 12250 + }, + { + "epoch": 3.379757308328737, + "grad_norm": 0.003303852863609791, + "learning_rate": 4.186090060721738e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7458840, + "step": 12255 + }, + { + "epoch": 3.3811362382790953, + "grad_norm": 0.0032748079393059015, + "learning_rate": 4.185201396949268e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7461272, + "step": 12260 + }, + { + "epoch": 3.382515168229454, + "grad_norm": 0.0027388050220906734, + "learning_rate": 4.1843123427398245e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7463992, + "step": 12265 + }, + { + "epoch": 3.3838940981798125, + "grad_norm": 0.0018339093076065183, + "learning_rate": 4.183422898299388e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7467096, + "step": 12270 + }, + { + "epoch": 3.385273028130171, + "grad_norm": 0.0005544518353417516, + "learning_rate": 4.1825330638340315e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7470232, + "step": 12275 + }, + { + "epoch": 3.3866519580805297, + "grad_norm": 0.011105762794613838, + "learning_rate": 4.181642839549915e-05, + "loss": 0.0007, + "num_input_tokens_seen": 7477144, + "step": 12280 + }, + { + "epoch": 3.388030888030888, + "grad_norm": 0.002614745404571295, + "learning_rate": 4.180752225653292e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7479704, + "step": 12285 + }, + { + "epoch": 3.3894098179812464, + "grad_norm": 0.0012999627506360412, + "learning_rate": 4.179861222350505e-05, + "loss": 0.0, + "num_input_tokens_seen": 7482296, + "step": 12290 + }, + { + "epoch": 3.3907887479316052, + "grad_norm": 0.0007954547181725502, + "learning_rate": 4.1789698298479854e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7486808, + "step": 12295 + }, + { + "epoch": 3.3921676778819636, + "grad_norm": 0.017041707411408424, + "learning_rate": 4.1780780483522575e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7489560, + "step": 12300 + }, + { + "epoch": 3.393546607832322, + "grad_norm": 0.010630912147462368, + "learning_rate": 4.177185878069933e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7493592, + "step": 12305 + }, + { + "epoch": 3.394925537782681, + "grad_norm": 0.006757946219295263, + "learning_rate": 4.176293319207716e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7496728, + "step": 12310 + }, + { + "epoch": 3.396304467733039, + "grad_norm": 0.0010657829698175192, + "learning_rate": 4.1754003719723985e-05, + "loss": 0.0, + "num_input_tokens_seen": 7499896, + "step": 12315 + }, + { + "epoch": 3.3976833976833976, + "grad_norm": 0.0028816896956413984, + "learning_rate": 4.174507036570865e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7502488, + "step": 12320 + }, + { + "epoch": 3.3990623276337564, + "grad_norm": 0.00027372216572985053, + "learning_rate": 4.173613313210087e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7505720, + "step": 12325 + }, + { + "epoch": 3.4004412575841148, + "grad_norm": 0.030034836381673813, + "learning_rate": 4.1727192020971296e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7508568, + "step": 12330 + }, + { + "epoch": 3.401820187534473, + "grad_norm": 0.0002686096995603293, + "learning_rate": 4.171824703439142e-05, + "loss": 0.0, + "num_input_tokens_seen": 7511704, + "step": 12335 + }, + { + "epoch": 3.403199117484832, + "grad_norm": 0.0004319969448260963, + "learning_rate": 4.1709298174433695e-05, + "loss": 0.0, + "num_input_tokens_seen": 7514136, + "step": 12340 + }, + { + "epoch": 3.4045780474351903, + "grad_norm": 0.0015183872310444713, + "learning_rate": 4.170034544317144e-05, + "loss": 0.0, + "num_input_tokens_seen": 7516856, + "step": 12345 + }, + { + "epoch": 3.4059569773855487, + "grad_norm": 0.0005707339150831103, + "learning_rate": 4.169138884267887e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7520920, + "step": 12350 + }, + { + "epoch": 3.4073359073359075, + "grad_norm": 0.005454918835312128, + "learning_rate": 4.168242837503109e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7524728, + "step": 12355 + }, + { + "epoch": 3.408714837286266, + "grad_norm": 0.00334317609667778, + "learning_rate": 4.1673464042304134e-05, + "loss": 0.0, + "num_input_tokens_seen": 7527256, + "step": 12360 + }, + { + "epoch": 3.4100937672366243, + "grad_norm": 0.0019263315480202436, + "learning_rate": 4.166449584657489e-05, + "loss": 0.0, + "num_input_tokens_seen": 7530136, + "step": 12365 + }, + { + "epoch": 3.411472697186983, + "grad_norm": 0.0004364679625723511, + "learning_rate": 4.165552378992117e-05, + "loss": 0.1126, + "num_input_tokens_seen": 7532888, + "step": 12370 + }, + { + "epoch": 3.4128516271373415, + "grad_norm": 0.00528338085860014, + "learning_rate": 4.164654787442167e-05, + "loss": 0.0, + "num_input_tokens_seen": 7536216, + "step": 12375 + }, + { + "epoch": 3.4142305570877, + "grad_norm": 0.0004696169635280967, + "learning_rate": 4.163756810215597e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7539224, + "step": 12380 + }, + { + "epoch": 3.4156094870380587, + "grad_norm": 0.002053876407444477, + "learning_rate": 4.162858447520457e-05, + "loss": 0.0004, + "num_input_tokens_seen": 7542392, + "step": 12385 + }, + { + "epoch": 3.416988416988417, + "grad_norm": 0.024383671581745148, + "learning_rate": 4.161959699564885e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7545592, + "step": 12390 + }, + { + "epoch": 3.4183673469387754, + "grad_norm": 0.003304985584691167, + "learning_rate": 4.161060566557106e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7548728, + "step": 12395 + }, + { + "epoch": 3.419746276889134, + "grad_norm": 0.002191668376326561, + "learning_rate": 4.1601610487054366e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7551960, + "step": 12400 + }, + { + "epoch": 3.4211252068394926, + "grad_norm": 0.0015510724624618888, + "learning_rate": 4.159261146218284e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7555128, + "step": 12405 + }, + { + "epoch": 3.422504136789851, + "grad_norm": 0.0141170434653759, + "learning_rate": 4.158360859304141e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7558584, + "step": 12410 + }, + { + "epoch": 3.42388306674021, + "grad_norm": 0.0010074996389448643, + "learning_rate": 4.157460188171593e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7562648, + "step": 12415 + }, + { + "epoch": 3.425261996690568, + "grad_norm": 0.0006082293693907559, + "learning_rate": 4.1565591330293094e-05, + "loss": 0.0, + "num_input_tokens_seen": 7565208, + "step": 12420 + }, + { + "epoch": 3.4266409266409266, + "grad_norm": 0.0073171574622392654, + "learning_rate": 4.155657694086055e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7568056, + "step": 12425 + }, + { + "epoch": 3.428019856591285, + "grad_norm": 0.0007639052346348763, + "learning_rate": 4.154755871550678e-05, + "loss": 0.0, + "num_input_tokens_seen": 7570968, + "step": 12430 + }, + { + "epoch": 3.4293987865416438, + "grad_norm": 0.0035709526855498552, + "learning_rate": 4.153853665632119e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7574552, + "step": 12435 + }, + { + "epoch": 3.430777716492002, + "grad_norm": 0.0018017265247181058, + "learning_rate": 4.1529510765394065e-05, + "loss": 0.0, + "num_input_tokens_seen": 7577528, + "step": 12440 + }, + { + "epoch": 3.432156646442361, + "grad_norm": 0.0006999772158451378, + "learning_rate": 4.152048104481655e-05, + "loss": 0.0, + "num_input_tokens_seen": 7581464, + "step": 12445 + }, + { + "epoch": 3.4335355763927193, + "grad_norm": 0.0029752550181001425, + "learning_rate": 4.151144749668073e-05, + "loss": 0.0, + "num_input_tokens_seen": 7585368, + "step": 12450 + }, + { + "epoch": 3.4349145063430777, + "grad_norm": 0.0009878758573904634, + "learning_rate": 4.150241012307954e-05, + "loss": 0.0, + "num_input_tokens_seen": 7589016, + "step": 12455 + }, + { + "epoch": 3.436293436293436, + "grad_norm": 0.002791385864838958, + "learning_rate": 4.1493368926106804e-05, + "loss": 0.0, + "num_input_tokens_seen": 7592216, + "step": 12460 + }, + { + "epoch": 3.437672366243795, + "grad_norm": 0.010248460806906223, + "learning_rate": 4.148432390785724e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7596664, + "step": 12465 + }, + { + "epoch": 3.4390512961941533, + "grad_norm": 0.001547045772895217, + "learning_rate": 4.147527507042644e-05, + "loss": 0.019, + "num_input_tokens_seen": 7599416, + "step": 12470 + }, + { + "epoch": 3.440430226144512, + "grad_norm": 0.0033336447086185217, + "learning_rate": 4.146622241591092e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7603064, + "step": 12475 + }, + { + "epoch": 3.4418091560948705, + "grad_norm": 0.0012358681997284293, + "learning_rate": 4.1457165946408016e-05, + "loss": 0.0, + "num_input_tokens_seen": 7606584, + "step": 12480 + }, + { + "epoch": 3.443188086045229, + "grad_norm": 0.0007786438218317926, + "learning_rate": 4.1448105664015996e-05, + "loss": 0.0, + "num_input_tokens_seen": 7610072, + "step": 12485 + }, + { + "epoch": 3.4445670159955872, + "grad_norm": 84.57879638671875, + "learning_rate": 4.1439041570833995e-05, + "loss": 0.0219, + "num_input_tokens_seen": 7612568, + "step": 12490 + }, + { + "epoch": 3.445945945945946, + "grad_norm": 0.0009850525530055165, + "learning_rate": 4.142997366896204e-05, + "loss": 0.0, + "num_input_tokens_seen": 7615256, + "step": 12495 + }, + { + "epoch": 3.4473248758963044, + "grad_norm": 0.0006714391056448221, + "learning_rate": 4.1420901960501025e-05, + "loss": 0.0, + "num_input_tokens_seen": 7617752, + "step": 12500 + }, + { + "epoch": 3.448703805846663, + "grad_norm": 0.003714134218171239, + "learning_rate": 4.1411826447552734e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7620024, + "step": 12505 + }, + { + "epoch": 3.4500827357970216, + "grad_norm": 0.0006631911383010447, + "learning_rate": 4.140274713221985e-05, + "loss": 0.0, + "num_input_tokens_seen": 7623832, + "step": 12510 + }, + { + "epoch": 3.45146166574738, + "grad_norm": 0.0001683118025539443, + "learning_rate": 4.1393664016605896e-05, + "loss": 0.0, + "num_input_tokens_seen": 7626136, + "step": 12515 + }, + { + "epoch": 3.4528405956977384, + "grad_norm": 0.005339134018868208, + "learning_rate": 4.138457710281532e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7629464, + "step": 12520 + }, + { + "epoch": 3.454219525648097, + "grad_norm": 0.017618028447031975, + "learning_rate": 4.1375486392953416e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7633016, + "step": 12525 + }, + { + "epoch": 3.4555984555984556, + "grad_norm": 0.0002826761920005083, + "learning_rate": 4.136639188912638e-05, + "loss": 0.0731, + "num_input_tokens_seen": 7635832, + "step": 12530 + }, + { + "epoch": 3.456977385548814, + "grad_norm": 0.00024848556495271623, + "learning_rate": 4.135729359344126e-05, + "loss": 0.0, + "num_input_tokens_seen": 7638744, + "step": 12535 + }, + { + "epoch": 3.4583563154991728, + "grad_norm": 0.012145240791141987, + "learning_rate": 4.134819150800603e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7641208, + "step": 12540 + }, + { + "epoch": 3.459735245449531, + "grad_norm": 0.0005190470837987959, + "learning_rate": 4.1339085634929485e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7643576, + "step": 12545 + }, + { + "epoch": 3.4611141753998895, + "grad_norm": 0.0004133028560318053, + "learning_rate": 4.1329975976321336e-05, + "loss": 0.0732, + "num_input_tokens_seen": 7648376, + "step": 12550 + }, + { + "epoch": 3.4624931053502483, + "grad_norm": 0.0001583023986313492, + "learning_rate": 4.132086253429217e-05, + "loss": 0.0, + "num_input_tokens_seen": 7651128, + "step": 12555 + }, + { + "epoch": 3.4638720353006067, + "grad_norm": 0.01287305261939764, + "learning_rate": 4.131174531095343e-05, + "loss": 0.0016, + "num_input_tokens_seen": 7653944, + "step": 12560 + }, + { + "epoch": 3.465250965250965, + "grad_norm": 0.010816365480422974, + "learning_rate": 4.1302624308417434e-05, + "loss": 0.0411, + "num_input_tokens_seen": 7657624, + "step": 12565 + }, + { + "epoch": 3.466629895201324, + "grad_norm": 0.014196987263858318, + "learning_rate": 4.129349952879741e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7660344, + "step": 12570 + }, + { + "epoch": 3.4680088251516823, + "grad_norm": 0.0005431630415841937, + "learning_rate": 4.1284370974207435e-05, + "loss": 0.0918, + "num_input_tokens_seen": 7664056, + "step": 12575 + }, + { + "epoch": 3.4693877551020407, + "grad_norm": 0.0027243762742727995, + "learning_rate": 4.127523864676245e-05, + "loss": 0.0, + "num_input_tokens_seen": 7666904, + "step": 12580 + }, + { + "epoch": 3.4707666850523995, + "grad_norm": 0.010814818553626537, + "learning_rate": 4.1266102548578286e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7670616, + "step": 12585 + }, + { + "epoch": 3.472145615002758, + "grad_norm": 0.067500039935112, + "learning_rate": 4.125696268177167e-05, + "loss": 0.0003, + "num_input_tokens_seen": 7673368, + "step": 12590 + }, + { + "epoch": 3.4735245449531162, + "grad_norm": 1.5943603515625, + "learning_rate": 4.1247819048460144e-05, + "loss": 0.0012, + "num_input_tokens_seen": 7678232, + "step": 12595 + }, + { + "epoch": 3.474903474903475, + "grad_norm": 0.008972211740911007, + "learning_rate": 4.123867165076218e-05, + "loss": 0.0003, + "num_input_tokens_seen": 7681240, + "step": 12600 + }, + { + "epoch": 3.4762824048538334, + "grad_norm": 0.4845327138900757, + "learning_rate": 4.122952049079709e-05, + "loss": 0.0004, + "num_input_tokens_seen": 7683832, + "step": 12605 + }, + { + "epoch": 3.477661334804192, + "grad_norm": 0.047421298921108246, + "learning_rate": 4.122036557068507e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7686008, + "step": 12610 + }, + { + "epoch": 3.4790402647545506, + "grad_norm": 0.7038504481315613, + "learning_rate": 4.121120689254718e-05, + "loss": 0.001, + "num_input_tokens_seen": 7688824, + "step": 12615 + }, + { + "epoch": 3.480419194704909, + "grad_norm": 0.0018805633299052715, + "learning_rate": 4.120204445850535e-05, + "loss": 0.0, + "num_input_tokens_seen": 7692568, + "step": 12620 + }, + { + "epoch": 3.4817981246552674, + "grad_norm": 0.0003508180088829249, + "learning_rate": 4.119287827068239e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7695928, + "step": 12625 + }, + { + "epoch": 3.483177054605626, + "grad_norm": 0.000164494282216765, + "learning_rate": 4.1183708331201983e-05, + "loss": 0.0, + "num_input_tokens_seen": 7699032, + "step": 12630 + }, + { + "epoch": 3.4845559845559846, + "grad_norm": 0.0008786116959527135, + "learning_rate": 4.117453464218866e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7701688, + "step": 12635 + }, + { + "epoch": 3.485934914506343, + "grad_norm": 0.0042850663885474205, + "learning_rate": 4.116535720576783e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7704248, + "step": 12640 + }, + { + "epoch": 3.487313844456702, + "grad_norm": 0.0004197019443381578, + "learning_rate": 4.115617602406578e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7707224, + "step": 12645 + }, + { + "epoch": 3.48869277440706, + "grad_norm": 0.00017392629524692893, + "learning_rate": 4.1146991099209675e-05, + "loss": 0.0, + "num_input_tokens_seen": 7710136, + "step": 12650 + }, + { + "epoch": 3.4900717043574185, + "grad_norm": 0.013477221131324768, + "learning_rate": 4.11378024333275e-05, + "loss": 0.0, + "num_input_tokens_seen": 7713624, + "step": 12655 + }, + { + "epoch": 3.4914506343077774, + "grad_norm": 0.00010827065125340596, + "learning_rate": 4.112861002854814e-05, + "loss": 0.0, + "num_input_tokens_seen": 7717176, + "step": 12660 + }, + { + "epoch": 3.4928295642581357, + "grad_norm": 0.002037416212260723, + "learning_rate": 4.111941388700137e-05, + "loss": 0.0, + "num_input_tokens_seen": 7720312, + "step": 12665 + }, + { + "epoch": 3.494208494208494, + "grad_norm": 0.0002597387065179646, + "learning_rate": 4.111021401081778e-05, + "loss": 0.0, + "num_input_tokens_seen": 7722936, + "step": 12670 + }, + { + "epoch": 3.4955874241588525, + "grad_norm": 0.00011734305007848889, + "learning_rate": 4.110101040212886e-05, + "loss": 0.0, + "num_input_tokens_seen": 7725720, + "step": 12675 + }, + { + "epoch": 3.4969663541092113, + "grad_norm": 0.000295525009278208, + "learning_rate": 4.109180306306695e-05, + "loss": 0.0, + "num_input_tokens_seen": 7728824, + "step": 12680 + }, + { + "epoch": 3.4983452840595697, + "grad_norm": 0.0005643147742375731, + "learning_rate": 4.108259199576525e-05, + "loss": 0.0, + "num_input_tokens_seen": 7732568, + "step": 12685 + }, + { + "epoch": 3.4997242140099285, + "grad_norm": 0.0009331084438599646, + "learning_rate": 4.107337720235785e-05, + "loss": 0.0, + "num_input_tokens_seen": 7735672, + "step": 12690 + }, + { + "epoch": 3.5, + "eval_loss": 0.22858379781246185, + "eval_runtime": 28.4895, + "eval_samples_per_second": 56.582, + "eval_steps_per_second": 14.146, + "num_input_tokens_seen": 7736376, + "step": 12691 + }, + { + "epoch": 3.501103143960287, + "grad_norm": 0.00040335129597224295, + "learning_rate": 4.106415868497967e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7738584, + "step": 12695 + }, + { + "epoch": 3.5024820739106453, + "grad_norm": 0.002964913612231612, + "learning_rate": 4.105493644576652e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7741112, + "step": 12700 + }, + { + "epoch": 3.5038610038610036, + "grad_norm": 0.004147175699472427, + "learning_rate": 4.1045710486855056e-05, + "loss": 0.0, + "num_input_tokens_seen": 7743576, + "step": 12705 + }, + { + "epoch": 3.5052399338113625, + "grad_norm": 0.00022111146245151758, + "learning_rate": 4.103648081038279e-05, + "loss": 0.0, + "num_input_tokens_seen": 7746552, + "step": 12710 + }, + { + "epoch": 3.506618863761721, + "grad_norm": 3.315895810374059e-05, + "learning_rate": 4.102724741848812e-05, + "loss": 0.0636, + "num_input_tokens_seen": 7748728, + "step": 12715 + }, + { + "epoch": 3.5079977937120796, + "grad_norm": 5.450618846225552e-05, + "learning_rate": 4.101801031331029e-05, + "loss": 0.0, + "num_input_tokens_seen": 7751896, + "step": 12720 + }, + { + "epoch": 3.509376723662438, + "grad_norm": 0.00022146361880004406, + "learning_rate": 4.1008769496989394e-05, + "loss": 0.0, + "num_input_tokens_seen": 7754808, + "step": 12725 + }, + { + "epoch": 3.5107556536127964, + "grad_norm": 0.00013806935749016702, + "learning_rate": 4.0999524971666403e-05, + "loss": 0.0, + "num_input_tokens_seen": 7757496, + "step": 12730 + }, + { + "epoch": 3.5121345835631548, + "grad_norm": 8.78446880960837e-05, + "learning_rate": 4.099027673948315e-05, + "loss": 0.0, + "num_input_tokens_seen": 7761080, + "step": 12735 + }, + { + "epoch": 3.5135135135135136, + "grad_norm": 0.00010648427269188687, + "learning_rate": 4.098102480258229e-05, + "loss": 0.0469, + "num_input_tokens_seen": 7763640, + "step": 12740 + }, + { + "epoch": 3.514892443463872, + "grad_norm": 0.003035677829757333, + "learning_rate": 4.097176916310739e-05, + "loss": 0.0711, + "num_input_tokens_seen": 7765944, + "step": 12745 + }, + { + "epoch": 3.516271373414231, + "grad_norm": 0.0008408325375057757, + "learning_rate": 4.0962509823202845e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7769336, + "step": 12750 + }, + { + "epoch": 3.517650303364589, + "grad_norm": 0.00016771614900790155, + "learning_rate": 4.09532467850139e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7772248, + "step": 12755 + }, + { + "epoch": 3.5190292333149475, + "grad_norm": 0.0013141910312697291, + "learning_rate": 4.094398005068667e-05, + "loss": 0.1126, + "num_input_tokens_seen": 7774296, + "step": 12760 + }, + { + "epoch": 3.520408163265306, + "grad_norm": 0.0006003208109177649, + "learning_rate": 4.093470962236814e-05, + "loss": 0.0, + "num_input_tokens_seen": 7778392, + "step": 12765 + }, + { + "epoch": 3.5217870932156647, + "grad_norm": 0.0002530872880015522, + "learning_rate": 4.092543550220612e-05, + "loss": 0.0004, + "num_input_tokens_seen": 7781880, + "step": 12770 + }, + { + "epoch": 3.523166023166023, + "grad_norm": 0.0022166951093822718, + "learning_rate": 4.091615769234929e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7784568, + "step": 12775 + }, + { + "epoch": 3.524544953116382, + "grad_norm": 0.0009190527489408851, + "learning_rate": 4.090687619494719e-05, + "loss": 0.0, + "num_input_tokens_seen": 7786936, + "step": 12780 + }, + { + "epoch": 3.5259238830667403, + "grad_norm": 0.005192392040044069, + "learning_rate": 4.0897591012150215e-05, + "loss": 0.0003, + "num_input_tokens_seen": 7790008, + "step": 12785 + }, + { + "epoch": 3.5273028130170987, + "grad_norm": 0.0010810414096340537, + "learning_rate": 4.0888302146109604e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7793272, + "step": 12790 + }, + { + "epoch": 3.528681742967457, + "grad_norm": 0.006430541630834341, + "learning_rate": 4.087900959897745e-05, + "loss": 0.0003, + "num_input_tokens_seen": 7796472, + "step": 12795 + }, + { + "epoch": 3.530060672917816, + "grad_norm": 22.532093048095703, + "learning_rate": 4.086971337290671e-05, + "loss": 0.0059, + "num_input_tokens_seen": 7799416, + "step": 12800 + }, + { + "epoch": 3.5314396028681743, + "grad_norm": 0.021260984241962433, + "learning_rate": 4.086041347005117e-05, + "loss": 0.0004, + "num_input_tokens_seen": 7801464, + "step": 12805 + }, + { + "epoch": 3.532818532818533, + "grad_norm": 0.002493176143616438, + "learning_rate": 4.085110989256551e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7803992, + "step": 12810 + }, + { + "epoch": 3.5341974627688915, + "grad_norm": 0.0003293644986115396, + "learning_rate": 4.0841802642605216e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7807320, + "step": 12815 + }, + { + "epoch": 3.53557639271925, + "grad_norm": 0.05270945280790329, + "learning_rate": 4.0832491722326646e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7810264, + "step": 12820 + }, + { + "epoch": 3.536955322669608, + "grad_norm": 0.000879129976965487, + "learning_rate": 4.082317713388702e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7813272, + "step": 12825 + }, + { + "epoch": 3.538334252619967, + "grad_norm": 0.005815768614411354, + "learning_rate": 4.0813858879444376e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7815800, + "step": 12830 + }, + { + "epoch": 3.5397131825703254, + "grad_norm": 0.00046890598605386913, + "learning_rate": 4.0804536961157624e-05, + "loss": 0.0, + "num_input_tokens_seen": 7819320, + "step": 12835 + }, + { + "epoch": 3.541092112520684, + "grad_norm": 0.0005536005483008921, + "learning_rate": 4.079521138118654e-05, + "loss": 0.0, + "num_input_tokens_seen": 7822296, + "step": 12840 + }, + { + "epoch": 3.5424710424710426, + "grad_norm": 0.00020049812155775726, + "learning_rate": 4.0785882141691694e-05, + "loss": 0.0, + "num_input_tokens_seen": 7825944, + "step": 12845 + }, + { + "epoch": 3.543849972421401, + "grad_norm": 0.0007867270614951849, + "learning_rate": 4.077654924483456e-05, + "loss": 0.0711, + "num_input_tokens_seen": 7829048, + "step": 12850 + }, + { + "epoch": 3.5452289023717594, + "grad_norm": 0.006766384933143854, + "learning_rate": 4.0767212692777424e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7831480, + "step": 12855 + }, + { + "epoch": 3.546607832322118, + "grad_norm": 0.00014231006207410246, + "learning_rate": 4.075787248768345e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7834616, + "step": 12860 + }, + { + "epoch": 3.5479867622724766, + "grad_norm": 0.00031905900686979294, + "learning_rate": 4.07485286317166e-05, + "loss": 0.0, + "num_input_tokens_seen": 7838392, + "step": 12865 + }, + { + "epoch": 3.549365692222835, + "grad_norm": 0.0016969708958640695, + "learning_rate": 4.0739181127041734e-05, + "loss": 0.0, + "num_input_tokens_seen": 7842168, + "step": 12870 + }, + { + "epoch": 3.5507446221731938, + "grad_norm": 0.00674729747697711, + "learning_rate": 4.072982997582453e-05, + "loss": 0.0, + "num_input_tokens_seen": 7847128, + "step": 12875 + }, + { + "epoch": 3.552123552123552, + "grad_norm": 0.0010851046536117792, + "learning_rate": 4.072047518023151e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7850584, + "step": 12880 + }, + { + "epoch": 3.5535024820739105, + "grad_norm": 0.0010734496172517538, + "learning_rate": 4.0711116742430044e-05, + "loss": 0.0, + "num_input_tokens_seen": 7853784, + "step": 12885 + }, + { + "epoch": 3.5548814120242693, + "grad_norm": 0.0016947168624028563, + "learning_rate": 4.070175466458836e-05, + "loss": 0.0, + "num_input_tokens_seen": 7855864, + "step": 12890 + }, + { + "epoch": 3.5562603419746277, + "grad_norm": 0.00029147620080038905, + "learning_rate": 4.069238894887551e-05, + "loss": 0.0, + "num_input_tokens_seen": 7858744, + "step": 12895 + }, + { + "epoch": 3.557639271924986, + "grad_norm": 0.00017534158541820943, + "learning_rate": 4.068301959746139e-05, + "loss": 0.0003, + "num_input_tokens_seen": 7861656, + "step": 12900 + }, + { + "epoch": 3.559018201875345, + "grad_norm": 0.00019891277770511806, + "learning_rate": 4.0673646612516753e-05, + "loss": 0.0, + "num_input_tokens_seen": 7864344, + "step": 12905 + }, + { + "epoch": 3.5603971318257033, + "grad_norm": 0.0023647849448025227, + "learning_rate": 4.066426999621318e-05, + "loss": 0.0, + "num_input_tokens_seen": 7867480, + "step": 12910 + }, + { + "epoch": 3.5617760617760617, + "grad_norm": 0.007712789345532656, + "learning_rate": 4.0654889750723105e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7871192, + "step": 12915 + }, + { + "epoch": 3.56315499172642, + "grad_norm": 0.11096856743097305, + "learning_rate": 4.064550587821979e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7873848, + "step": 12920 + }, + { + "epoch": 3.564533921676779, + "grad_norm": 0.00012706145935226232, + "learning_rate": 4.063611838087734e-05, + "loss": 0.0, + "num_input_tokens_seen": 7876312, + "step": 12925 + }, + { + "epoch": 3.5659128516271372, + "grad_norm": 0.0018072687089443207, + "learning_rate": 4.062672726087072e-05, + "loss": 0.0003, + "num_input_tokens_seen": 7879416, + "step": 12930 + }, + { + "epoch": 3.567291781577496, + "grad_norm": 0.0010351422242820263, + "learning_rate": 4.0617332520375696e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7882680, + "step": 12935 + }, + { + "epoch": 3.5686707115278544, + "grad_norm": 7.708925841143355e-05, + "learning_rate": 4.0607934161568906e-05, + "loss": 0.0917, + "num_input_tokens_seen": 7885048, + "step": 12940 + }, + { + "epoch": 3.570049641478213, + "grad_norm": 0.00105098239146173, + "learning_rate": 4.059853218662782e-05, + "loss": 0.0, + "num_input_tokens_seen": 7888184, + "step": 12945 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 0.005533318035304546, + "learning_rate": 4.058912659773073e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7891352, + "step": 12950 + }, + { + "epoch": 3.57280750137893, + "grad_norm": 7.487502443837002e-05, + "learning_rate": 4.0579717397056784e-05, + "loss": 0.0, + "num_input_tokens_seen": 7895128, + "step": 12955 + }, + { + "epoch": 3.5741864313292884, + "grad_norm": 1.5987807273631915e-05, + "learning_rate": 4.057030458678595e-05, + "loss": 0.0, + "num_input_tokens_seen": 7898808, + "step": 12960 + }, + { + "epoch": 3.575565361279647, + "grad_norm": 0.00015112973051145673, + "learning_rate": 4.056088816909904e-05, + "loss": 0.0, + "num_input_tokens_seen": 7901304, + "step": 12965 + }, + { + "epoch": 3.5769442912300056, + "grad_norm": 0.00036722299410030246, + "learning_rate": 4.055146814617772e-05, + "loss": 0.0, + "num_input_tokens_seen": 7903736, + "step": 12970 + }, + { + "epoch": 3.578323221180364, + "grad_norm": 0.07848536223173141, + "learning_rate": 4.054204452020446e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7907352, + "step": 12975 + }, + { + "epoch": 3.5797021511307223, + "grad_norm": 2.7280089852865785e-05, + "learning_rate": 4.053261729336258e-05, + "loss": 0.0, + "num_input_tokens_seen": 7909848, + "step": 12980 + }, + { + "epoch": 3.581081081081081, + "grad_norm": 0.001941213384270668, + "learning_rate": 4.052318646783623e-05, + "loss": 0.0, + "num_input_tokens_seen": 7913656, + "step": 12985 + }, + { + "epoch": 3.5824600110314395, + "grad_norm": 0.0005497548845596611, + "learning_rate": 4.0513752045810415e-05, + "loss": 0.0649, + "num_input_tokens_seen": 7917528, + "step": 12990 + }, + { + "epoch": 3.5838389409817983, + "grad_norm": 0.0003714998601935804, + "learning_rate": 4.050431402947093e-05, + "loss": 0.0, + "num_input_tokens_seen": 7919992, + "step": 12995 + }, + { + "epoch": 3.5852178709321567, + "grad_norm": 0.0006823048461228609, + "learning_rate": 4.0494872421004446e-05, + "loss": 0.1648, + "num_input_tokens_seen": 7922328, + "step": 13000 + }, + { + "epoch": 3.586596800882515, + "grad_norm": 7.07467261236161e-05, + "learning_rate": 4.048542722259844e-05, + "loss": 0.0, + "num_input_tokens_seen": 7925752, + "step": 13005 + }, + { + "epoch": 3.5879757308328735, + "grad_norm": 0.01571926474571228, + "learning_rate": 4.047597843644123e-05, + "loss": 0.0, + "num_input_tokens_seen": 7928536, + "step": 13010 + }, + { + "epoch": 3.5893546607832323, + "grad_norm": 0.00114693702198565, + "learning_rate": 4.0466526064721964e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7931288, + "step": 13015 + }, + { + "epoch": 3.5907335907335907, + "grad_norm": 0.0010113748721778393, + "learning_rate": 4.045707010963062e-05, + "loss": 0.1354, + "num_input_tokens_seen": 7933848, + "step": 13020 + }, + { + "epoch": 3.5921125206839495, + "grad_norm": 4.860210174228996e-05, + "learning_rate": 4.0447610573358014e-05, + "loss": 0.0003, + "num_input_tokens_seen": 7936792, + "step": 13025 + }, + { + "epoch": 3.593491450634308, + "grad_norm": 4.1291692468803376e-05, + "learning_rate": 4.0438147458095784e-05, + "loss": 0.003, + "num_input_tokens_seen": 7939512, + "step": 13030 + }, + { + "epoch": 3.5948703805846662, + "grad_norm": 6.72987152938731e-05, + "learning_rate": 4.0428680766036384e-05, + "loss": 0.0, + "num_input_tokens_seen": 7943352, + "step": 13035 + }, + { + "epoch": 3.5962493105350246, + "grad_norm": 0.0008852925966493785, + "learning_rate": 4.0419210499373114e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7945720, + "step": 13040 + }, + { + "epoch": 3.5976282404853834, + "grad_norm": 0.0011594134848564863, + "learning_rate": 4.040973666030012e-05, + "loss": 0.0005, + "num_input_tokens_seen": 7948856, + "step": 13045 + }, + { + "epoch": 3.599007170435742, + "grad_norm": 0.0009307304280810058, + "learning_rate": 4.040025925101234e-05, + "loss": 0.0, + "num_input_tokens_seen": 7951032, + "step": 13050 + }, + { + "epoch": 3.6003861003861006, + "grad_norm": 0.008395034819841385, + "learning_rate": 4.0390778273705544e-05, + "loss": 0.1188, + "num_input_tokens_seen": 7954392, + "step": 13055 + }, + { + "epoch": 3.601765030336459, + "grad_norm": 0.007144145201891661, + "learning_rate": 4.038129373057635e-05, + "loss": 0.0031, + "num_input_tokens_seen": 7957464, + "step": 13060 + }, + { + "epoch": 3.6031439602868174, + "grad_norm": 0.0006866384064778686, + "learning_rate": 4.0371805623822194e-05, + "loss": 0.0, + "num_input_tokens_seen": 7960920, + "step": 13065 + }, + { + "epoch": 3.6045228902371758, + "grad_norm": 0.034451499581336975, + "learning_rate": 4.036231395564132e-05, + "loss": 0.0, + "num_input_tokens_seen": 7964088, + "step": 13070 + }, + { + "epoch": 3.6059018201875346, + "grad_norm": 0.0005005364655517042, + "learning_rate": 4.035281872823283e-05, + "loss": 0.0087, + "num_input_tokens_seen": 7969336, + "step": 13075 + }, + { + "epoch": 3.607280750137893, + "grad_norm": 0.011553100310266018, + "learning_rate": 4.034331994379661e-05, + "loss": 0.0002, + "num_input_tokens_seen": 7971928, + "step": 13080 + }, + { + "epoch": 3.6086596800882518, + "grad_norm": 0.0005054581561125815, + "learning_rate": 4.033381760453341e-05, + "loss": 0.0003, + "num_input_tokens_seen": 7974968, + "step": 13085 + }, + { + "epoch": 3.61003861003861, + "grad_norm": 0.0002731469285208732, + "learning_rate": 4.032431171264478e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7978136, + "step": 13090 + }, + { + "epoch": 3.6114175399889685, + "grad_norm": 0.0017000887310132384, + "learning_rate": 4.0314802270333094e-05, + "loss": 0.0006, + "num_input_tokens_seen": 7981048, + "step": 13095 + }, + { + "epoch": 3.612796469939327, + "grad_norm": 0.0008932440541684628, + "learning_rate": 4.0305289279801564e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7983320, + "step": 13100 + }, + { + "epoch": 3.6141753998896857, + "grad_norm": 0.0017861861269921064, + "learning_rate": 4.02957727432542e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7985784, + "step": 13105 + }, + { + "epoch": 3.615554329840044, + "grad_norm": 0.0017890261951833963, + "learning_rate": 4.0286252662895855e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7988664, + "step": 13110 + }, + { + "epoch": 3.616933259790403, + "grad_norm": 0.0005288441316224635, + "learning_rate": 4.0276729040932196e-05, + "loss": 0.0, + "num_input_tokens_seen": 7991448, + "step": 13115 + }, + { + "epoch": 3.6183121897407613, + "grad_norm": 0.002843251219019294, + "learning_rate": 4.0267201879569694e-05, + "loss": 0.0, + "num_input_tokens_seen": 7994168, + "step": 13120 + }, + { + "epoch": 3.6196911196911197, + "grad_norm": 0.0004461981006897986, + "learning_rate": 4.0257671181015684e-05, + "loss": 0.0, + "num_input_tokens_seen": 7996504, + "step": 13125 + }, + { + "epoch": 3.621070049641478, + "grad_norm": 0.0003695618943311274, + "learning_rate": 4.0248136947478265e-05, + "loss": 0.0001, + "num_input_tokens_seen": 7999288, + "step": 13130 + }, + { + "epoch": 3.622448979591837, + "grad_norm": 0.005317933391779661, + "learning_rate": 4.0238599181166404e-05, + "loss": 0.1339, + "num_input_tokens_seen": 8003768, + "step": 13135 + }, + { + "epoch": 3.6238279095421952, + "grad_norm": 0.001170914270915091, + "learning_rate": 4.022905788428985e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8007160, + "step": 13140 + }, + { + "epoch": 3.6252068394925536, + "grad_norm": 0.0029949708841741085, + "learning_rate": 4.021951305905918e-05, + "loss": 0.1002, + "num_input_tokens_seen": 8010008, + "step": 13145 + }, + { + "epoch": 3.6265857694429124, + "grad_norm": 0.033684954047203064, + "learning_rate": 4.0209964707685807e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8012408, + "step": 13150 + }, + { + "epoch": 3.627964699393271, + "grad_norm": 0.005894639529287815, + "learning_rate": 4.020041283238195e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8015320, + "step": 13155 + }, + { + "epoch": 3.629343629343629, + "grad_norm": 0.006698247976601124, + "learning_rate": 4.019085743536063e-05, + "loss": 0.0004, + "num_input_tokens_seen": 8017752, + "step": 13160 + }, + { + "epoch": 3.630722559293988, + "grad_norm": 0.001499163219705224, + "learning_rate": 4.018129851883569e-05, + "loss": 0.0004, + "num_input_tokens_seen": 8022104, + "step": 13165 + }, + { + "epoch": 3.6321014892443464, + "grad_norm": 0.017209531739354134, + "learning_rate": 4.0171736085021805e-05, + "loss": 0.0005, + "num_input_tokens_seen": 8024664, + "step": 13170 + }, + { + "epoch": 3.6334804191947048, + "grad_norm": 0.003244356717914343, + "learning_rate": 4.0162170136134455e-05, + "loss": 0.0016, + "num_input_tokens_seen": 8026744, + "step": 13175 + }, + { + "epoch": 3.6348593491450636, + "grad_norm": 0.004175061360001564, + "learning_rate": 4.0152600674389925e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8029240, + "step": 13180 + }, + { + "epoch": 3.636238279095422, + "grad_norm": 0.002702976344153285, + "learning_rate": 4.014302770200533e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8031512, + "step": 13185 + }, + { + "epoch": 3.6376172090457803, + "grad_norm": 0.013400718569755554, + "learning_rate": 4.013345122119858e-05, + "loss": 0.0008, + "num_input_tokens_seen": 8034520, + "step": 13190 + }, + { + "epoch": 3.638996138996139, + "grad_norm": 0.007521723862737417, + "learning_rate": 4.012387123418841e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8037336, + "step": 13195 + }, + { + "epoch": 3.6403750689464975, + "grad_norm": 0.011572640389204025, + "learning_rate": 4.011428774319438e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8040568, + "step": 13200 + }, + { + "epoch": 3.641753998896856, + "grad_norm": 0.007382593583315611, + "learning_rate": 4.010470075043683e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8043480, + "step": 13205 + }, + { + "epoch": 3.6431329288472147, + "grad_norm": 0.007520476821810007, + "learning_rate": 4.009511025813694e-05, + "loss": 0.0464, + "num_input_tokens_seen": 8047608, + "step": 13210 + }, + { + "epoch": 3.644511858797573, + "grad_norm": 0.025059228762984276, + "learning_rate": 4.008551626851668e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8050264, + "step": 13215 + }, + { + "epoch": 3.6458907887479315, + "grad_norm": 0.0030355157796293497, + "learning_rate": 4.007591878379885e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8053592, + "step": 13220 + }, + { + "epoch": 3.64726971869829, + "grad_norm": 0.017416616901755333, + "learning_rate": 4.0066317806207036e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8056440, + "step": 13225 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 0.0004561247769743204, + "learning_rate": 4.005671333796566e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8059928, + "step": 13230 + }, + { + "epoch": 3.650027578599007, + "grad_norm": 0.004538930952548981, + "learning_rate": 4.004710538129993e-05, + "loss": 0.0107, + "num_input_tokens_seen": 8063672, + "step": 13235 + }, + { + "epoch": 3.651406508549366, + "grad_norm": 0.01519017107784748, + "learning_rate": 4.0037493938435885e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8066936, + "step": 13240 + }, + { + "epoch": 3.6527854384997243, + "grad_norm": 0.0003922161122318357, + "learning_rate": 4.002787901160034e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8070200, + "step": 13245 + }, + { + "epoch": 3.6541643684500826, + "grad_norm": 0.006539223250001669, + "learning_rate": 4.001826060302094e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8073816, + "step": 13250 + }, + { + "epoch": 3.655543298400441, + "grad_norm": 0.0018896848196163774, + "learning_rate": 4.000863871492615e-05, + "loss": 0.0, + "num_input_tokens_seen": 8077176, + "step": 13255 + }, + { + "epoch": 3.6569222283508, + "grad_norm": 0.0010633196216076612, + "learning_rate": 3.9999013349545205e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8079896, + "step": 13260 + }, + { + "epoch": 3.658301158301158, + "grad_norm": 0.0020972825586795807, + "learning_rate": 3.998938450910816e-05, + "loss": 0.0, + "num_input_tokens_seen": 8083608, + "step": 13265 + }, + { + "epoch": 3.659680088251517, + "grad_norm": 0.00177524343598634, + "learning_rate": 3.99797521958459e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8086552, + "step": 13270 + }, + { + "epoch": 3.6610590182018754, + "grad_norm": 0.007020385004580021, + "learning_rate": 3.9970116411990086e-05, + "loss": 0.0, + "num_input_tokens_seen": 8088792, + "step": 13275 + }, + { + "epoch": 3.662437948152234, + "grad_norm": 0.00048733691801317036, + "learning_rate": 3.996047715977318e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8091640, + "step": 13280 + }, + { + "epoch": 3.663816878102592, + "grad_norm": 0.00012300074740778655, + "learning_rate": 3.995083444142845e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8094392, + "step": 13285 + }, + { + "epoch": 3.665195808052951, + "grad_norm": 0.00449770363047719, + "learning_rate": 3.994118825919001e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8100440, + "step": 13290 + }, + { + "epoch": 3.6665747380033094, + "grad_norm": 0.0008983960142359138, + "learning_rate": 3.993153861529272e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8102808, + "step": 13295 + }, + { + "epoch": 3.667953667953668, + "grad_norm": 0.0015863754088059068, + "learning_rate": 3.992188551197226e-05, + "loss": 0.1001, + "num_input_tokens_seen": 8105784, + "step": 13300 + }, + { + "epoch": 3.6693325979040265, + "grad_norm": 0.0013777826679870486, + "learning_rate": 3.991222895146513e-05, + "loss": 0.0, + "num_input_tokens_seen": 8108472, + "step": 13305 + }, + { + "epoch": 3.670711527854385, + "grad_norm": 0.0028911964036524296, + "learning_rate": 3.99025689360086e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8111960, + "step": 13310 + }, + { + "epoch": 3.6720904578047433, + "grad_norm": 0.0034576740581542253, + "learning_rate": 3.989290546784077e-05, + "loss": 0.0017, + "num_input_tokens_seen": 8115672, + "step": 13315 + }, + { + "epoch": 3.673469387755102, + "grad_norm": 0.0015699273208156228, + "learning_rate": 3.988323854920052e-05, + "loss": 0.0, + "num_input_tokens_seen": 8117816, + "step": 13320 + }, + { + "epoch": 3.6748483177054605, + "grad_norm": 0.005252982024103403, + "learning_rate": 3.987356818232754e-05, + "loss": 0.0, + "num_input_tokens_seen": 8120984, + "step": 13325 + }, + { + "epoch": 3.6762272476558193, + "grad_norm": 0.0023115691728889942, + "learning_rate": 3.986389436946232e-05, + "loss": 0.0508, + "num_input_tokens_seen": 8128344, + "step": 13330 + }, + { + "epoch": 3.6776061776061777, + "grad_norm": 0.0008125992026180029, + "learning_rate": 3.985421711284613e-05, + "loss": 0.063, + "num_input_tokens_seen": 8130904, + "step": 13335 + }, + { + "epoch": 3.678985107556536, + "grad_norm": 0.019571218639612198, + "learning_rate": 3.984453641472105e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8133496, + "step": 13340 + }, + { + "epoch": 3.6803640375068944, + "grad_norm": 0.009293380193412304, + "learning_rate": 3.9834852277329974e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8136024, + "step": 13345 + }, + { + "epoch": 3.6817429674572533, + "grad_norm": 0.05233908072113991, + "learning_rate": 3.982516470291658e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8138296, + "step": 13350 + }, + { + "epoch": 3.6831218974076116, + "grad_norm": 0.1348547637462616, + "learning_rate": 3.981547369372532e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8141016, + "step": 13355 + }, + { + "epoch": 3.6845008273579705, + "grad_norm": 3.993227005004883, + "learning_rate": 3.980577925200148e-05, + "loss": 0.1417, + "num_input_tokens_seen": 8145240, + "step": 13360 + }, + { + "epoch": 3.685879757308329, + "grad_norm": 0.002249490236863494, + "learning_rate": 3.9796081379991114e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8148408, + "step": 13365 + }, + { + "epoch": 3.687258687258687, + "grad_norm": 0.002289652358740568, + "learning_rate": 3.978638007994108e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8151416, + "step": 13370 + }, + { + "epoch": 3.6886376172090456, + "grad_norm": 0.014677377417683601, + "learning_rate": 3.977667535409903e-05, + "loss": 0.0009, + "num_input_tokens_seen": 8154456, + "step": 13375 + }, + { + "epoch": 3.6900165471594044, + "grad_norm": 0.013875281438231468, + "learning_rate": 3.976696720471341e-05, + "loss": 0.0006, + "num_input_tokens_seen": 8156856, + "step": 13380 + }, + { + "epoch": 3.691395477109763, + "grad_norm": 0.0031881825998425484, + "learning_rate": 3.9757255634033465e-05, + "loss": 0.0309, + "num_input_tokens_seen": 8159992, + "step": 13385 + }, + { + "epoch": 3.6927744070601216, + "grad_norm": 0.001817893236875534, + "learning_rate": 3.974754064430922e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8162648, + "step": 13390 + }, + { + "epoch": 3.69415333701048, + "grad_norm": 0.0019584570545703173, + "learning_rate": 3.97378222377915e-05, + "loss": 0.0005, + "num_input_tokens_seen": 8165240, + "step": 13395 + }, + { + "epoch": 3.6955322669608384, + "grad_norm": 0.00876514334231615, + "learning_rate": 3.972810041673192e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8167960, + "step": 13400 + }, + { + "epoch": 3.6969111969111967, + "grad_norm": 0.12596112489700317, + "learning_rate": 3.9718375183382884e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8170552, + "step": 13405 + }, + { + "epoch": 3.6982901268615556, + "grad_norm": 71.2657699584961, + "learning_rate": 3.97086465399976e-05, + "loss": 0.0165, + "num_input_tokens_seen": 8173208, + "step": 13410 + }, + { + "epoch": 3.699669056811914, + "grad_norm": 0.002773612504824996, + "learning_rate": 3.969891448883003e-05, + "loss": 0.0835, + "num_input_tokens_seen": 8175864, + "step": 13415 + }, + { + "epoch": 3.7010479867622723, + "grad_norm": 0.0033547692000865936, + "learning_rate": 3.968917903213498e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8178584, + "step": 13420 + }, + { + "epoch": 3.702426916712631, + "grad_norm": 0.0024354197084903717, + "learning_rate": 3.9679440172168006e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8181944, + "step": 13425 + }, + { + "epoch": 3.7038058466629895, + "grad_norm": 0.1822153627872467, + "learning_rate": 3.9669697911185457e-05, + "loss": 0.0007, + "num_input_tokens_seen": 8185816, + "step": 13430 + }, + { + "epoch": 3.705184776613348, + "grad_norm": 0.009896057657897472, + "learning_rate": 3.9659952251444466e-05, + "loss": 0.0004, + "num_input_tokens_seen": 8188472, + "step": 13435 + }, + { + "epoch": 3.7065637065637067, + "grad_norm": 0.0002654219570104033, + "learning_rate": 3.965020319520298e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8191320, + "step": 13440 + }, + { + "epoch": 3.707942636514065, + "grad_norm": 0.00897709559649229, + "learning_rate": 3.964045074471971e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8193592, + "step": 13445 + }, + { + "epoch": 3.7093215664644235, + "grad_norm": 0.0076645128428936005, + "learning_rate": 3.963069490225416e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8196760, + "step": 13450 + }, + { + "epoch": 3.7107004964147823, + "grad_norm": 0.0031833031680434942, + "learning_rate": 3.962093567006662e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8199128, + "step": 13455 + }, + { + "epoch": 3.7120794263651407, + "grad_norm": 0.0024852128699421883, + "learning_rate": 3.961117305041815e-05, + "loss": 0.0004, + "num_input_tokens_seen": 8203544, + "step": 13460 + }, + { + "epoch": 3.713458356315499, + "grad_norm": 0.01407544407993555, + "learning_rate": 3.9601407045570626e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8205720, + "step": 13465 + }, + { + "epoch": 3.714837286265858, + "grad_norm": 0.004427584353834391, + "learning_rate": 3.959163765778668e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8208792, + "step": 13470 + }, + { + "epoch": 3.7162162162162162, + "grad_norm": 0.0003981698537245393, + "learning_rate": 3.9581864889329744e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8211352, + "step": 13475 + }, + { + "epoch": 3.7175951461665746, + "grad_norm": 0.0003884658799506724, + "learning_rate": 3.9572088742464034e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8214328, + "step": 13480 + }, + { + "epoch": 3.7189740761169334, + "grad_norm": 0.0013302003499120474, + "learning_rate": 3.956230921945453e-05, + "loss": 0.0, + "num_input_tokens_seen": 8217240, + "step": 13485 + }, + { + "epoch": 3.720353006067292, + "grad_norm": 0.0010959943756461143, + "learning_rate": 3.955252632256701e-05, + "loss": 0.0876, + "num_input_tokens_seen": 8220280, + "step": 13490 + }, + { + "epoch": 3.72173193601765, + "grad_norm": 0.0009164312505163252, + "learning_rate": 3.954274005406804e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8223608, + "step": 13495 + }, + { + "epoch": 3.7231108659680086, + "grad_norm": 0.0023763366043567657, + "learning_rate": 3.9532950416224956e-05, + "loss": 0.0, + "num_input_tokens_seen": 8226680, + "step": 13500 + }, + { + "epoch": 3.7244897959183674, + "grad_norm": 18.24535369873047, + "learning_rate": 3.952315741130586e-05, + "loss": 0.0088, + "num_input_tokens_seen": 8229368, + "step": 13505 + }, + { + "epoch": 3.7258687258687258, + "grad_norm": 0.010393153876066208, + "learning_rate": 3.951336104157967e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8232408, + "step": 13510 + }, + { + "epoch": 3.7272476558190846, + "grad_norm": 0.011879073455929756, + "learning_rate": 3.950356130931606e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8234840, + "step": 13515 + }, + { + "epoch": 3.728626585769443, + "grad_norm": 0.012068195268511772, + "learning_rate": 3.949375821678548e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8237688, + "step": 13520 + }, + { + "epoch": 3.7300055157198013, + "grad_norm": 0.002263491041958332, + "learning_rate": 3.948395176625918e-05, + "loss": 0.0, + "num_input_tokens_seen": 8240696, + "step": 13525 + }, + { + "epoch": 3.7313844456701597, + "grad_norm": 0.0002534442755859345, + "learning_rate": 3.947414196000915e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8244216, + "step": 13530 + }, + { + "epoch": 3.7327633756205185, + "grad_norm": 0.0014712369302287698, + "learning_rate": 3.94643288003082e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8247480, + "step": 13535 + }, + { + "epoch": 3.734142305570877, + "grad_norm": 0.023531479761004448, + "learning_rate": 3.9454512289429904e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8250168, + "step": 13540 + }, + { + "epoch": 3.7355212355212357, + "grad_norm": 0.149935781955719, + "learning_rate": 3.944469242964859e-05, + "loss": 0.0692, + "num_input_tokens_seen": 8252664, + "step": 13545 + }, + { + "epoch": 3.736900165471594, + "grad_norm": 0.0012755673378705978, + "learning_rate": 3.943486922323939e-05, + "loss": 0.0, + "num_input_tokens_seen": 8255896, + "step": 13550 + }, + { + "epoch": 3.7382790954219525, + "grad_norm": 0.00045455899089574814, + "learning_rate": 3.9425042672478184e-05, + "loss": 0.0, + "num_input_tokens_seen": 8258904, + "step": 13555 + }, + { + "epoch": 3.739658025372311, + "grad_norm": 0.01593303680419922, + "learning_rate": 3.9415212779641665e-05, + "loss": 0.0739, + "num_input_tokens_seen": 8261880, + "step": 13560 + }, + { + "epoch": 3.7410369553226697, + "grad_norm": 0.0003798259131144732, + "learning_rate": 3.940537954700727e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8264056, + "step": 13565 + }, + { + "epoch": 3.742415885273028, + "grad_norm": 0.0023413621820509434, + "learning_rate": 3.93955429768532e-05, + "loss": 0.0007, + "num_input_tokens_seen": 8267064, + "step": 13570 + }, + { + "epoch": 3.743794815223387, + "grad_norm": 0.000194178253877908, + "learning_rate": 3.938570307145847e-05, + "loss": 0.0, + "num_input_tokens_seen": 8269816, + "step": 13575 + }, + { + "epoch": 3.7451737451737452, + "grad_norm": 16.675071716308594, + "learning_rate": 3.937585983310284e-05, + "loss": 0.0878, + "num_input_tokens_seen": 8272728, + "step": 13580 + }, + { + "epoch": 3.7465526751241036, + "grad_norm": 0.0006618609186261892, + "learning_rate": 3.9366013264066836e-05, + "loss": 0.0, + "num_input_tokens_seen": 8276056, + "step": 13585 + }, + { + "epoch": 3.747931605074462, + "grad_norm": 0.00018556401482783258, + "learning_rate": 3.935616336663178e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8279128, + "step": 13590 + }, + { + "epoch": 3.749310535024821, + "grad_norm": 0.0017485287971794605, + "learning_rate": 3.934631014307975e-05, + "loss": 0.0, + "num_input_tokens_seen": 8281912, + "step": 13595 + }, + { + "epoch": 3.750689464975179, + "grad_norm": 0.0013782167807221413, + "learning_rate": 3.933645359569358e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8284408, + "step": 13600 + }, + { + "epoch": 3.752068394925538, + "grad_norm": 0.003684663213789463, + "learning_rate": 3.9326593726756916e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8287288, + "step": 13605 + }, + { + "epoch": 3.7534473248758964, + "grad_norm": 0.00029434970929287374, + "learning_rate": 3.9316730538554123e-05, + "loss": 0.0, + "num_input_tokens_seen": 8290840, + "step": 13610 + }, + { + "epoch": 3.7548262548262548, + "grad_norm": 0.07143149524927139, + "learning_rate": 3.930686403337039e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8293528, + "step": 13615 + }, + { + "epoch": 3.756205184776613, + "grad_norm": 0.001270086388103664, + "learning_rate": 3.929699421349161e-05, + "loss": 0.0034, + "num_input_tokens_seen": 8296152, + "step": 13620 + }, + { + "epoch": 3.757584114726972, + "grad_norm": 0.019627204164862633, + "learning_rate": 3.928712108120451e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8298584, + "step": 13625 + }, + { + "epoch": 3.7589630446773303, + "grad_norm": 0.0004911998985335231, + "learning_rate": 3.927724463879653e-05, + "loss": 0.0793, + "num_input_tokens_seen": 8300888, + "step": 13630 + }, + { + "epoch": 3.760341974627689, + "grad_norm": 0.00870292354375124, + "learning_rate": 3.9267364888555916e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8304408, + "step": 13635 + }, + { + "epoch": 3.7617209045780475, + "grad_norm": 1.3504323760571424e-05, + "learning_rate": 3.9257481832771646e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8307192, + "step": 13640 + }, + { + "epoch": 3.763099834528406, + "grad_norm": 0.00028301062411628664, + "learning_rate": 3.92475954737335e-05, + "loss": 0.0, + "num_input_tokens_seen": 8309688, + "step": 13645 + }, + { + "epoch": 3.7644787644787643, + "grad_norm": 0.0025022171903401613, + "learning_rate": 3.9237705813731995e-05, + "loss": 0.0, + "num_input_tokens_seen": 8312952, + "step": 13650 + }, + { + "epoch": 3.765857694429123, + "grad_norm": 0.00014200234727468342, + "learning_rate": 3.922781285505843e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8315256, + "step": 13655 + }, + { + "epoch": 3.7672366243794815, + "grad_norm": 0.0007676755776628852, + "learning_rate": 3.921791660000487e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8318200, + "step": 13660 + }, + { + "epoch": 3.7686155543298403, + "grad_norm": 0.0004206466837786138, + "learning_rate": 3.92080170508641e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8321176, + "step": 13665 + }, + { + "epoch": 3.7699944842801987, + "grad_norm": 0.0015731375897303224, + "learning_rate": 3.919811420992975e-05, + "loss": 0.0, + "num_input_tokens_seen": 8323608, + "step": 13670 + }, + { + "epoch": 3.771373414230557, + "grad_norm": 0.00023632515512872487, + "learning_rate": 3.918820807949612e-05, + "loss": 0.0, + "num_input_tokens_seen": 8328088, + "step": 13675 + }, + { + "epoch": 3.7727523441809154, + "grad_norm": 6.881004810566083e-05, + "learning_rate": 3.917829866185836e-05, + "loss": 0.0087, + "num_input_tokens_seen": 8331000, + "step": 13680 + }, + { + "epoch": 3.7741312741312742, + "grad_norm": 0.0007245290908031166, + "learning_rate": 3.9168385959312316e-05, + "loss": 0.1231, + "num_input_tokens_seen": 8334072, + "step": 13685 + }, + { + "epoch": 3.7755102040816326, + "grad_norm": 8.484707359457389e-05, + "learning_rate": 3.9158469974154625e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8336792, + "step": 13690 + }, + { + "epoch": 3.7768891340319914, + "grad_norm": 9.924666665028781e-05, + "learning_rate": 3.9148550708682675e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8340760, + "step": 13695 + }, + { + "epoch": 3.77826806398235, + "grad_norm": 0.003398499684408307, + "learning_rate": 3.913862816519462e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8343480, + "step": 13700 + }, + { + "epoch": 3.779646993932708, + "grad_norm": 0.39561358094215393, + "learning_rate": 3.912870234598937e-05, + "loss": 0.0004, + "num_input_tokens_seen": 8346520, + "step": 13705 + }, + { + "epoch": 3.7810259238830666, + "grad_norm": 0.001697547733783722, + "learning_rate": 3.9118773253366604e-05, + "loss": 0.0, + "num_input_tokens_seen": 8349976, + "step": 13710 + }, + { + "epoch": 3.7824048538334254, + "grad_norm": 0.020188283175230026, + "learning_rate": 3.910884088962673e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8353080, + "step": 13715 + }, + { + "epoch": 3.7837837837837838, + "grad_norm": 0.008955501019954681, + "learning_rate": 3.9098905257070955e-05, + "loss": 0.0, + "num_input_tokens_seen": 8356696, + "step": 13720 + }, + { + "epoch": 3.785162713734142, + "grad_norm": 0.2442154437303543, + "learning_rate": 3.908896635800121e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8360952, + "step": 13725 + }, + { + "epoch": 3.786541643684501, + "grad_norm": 5.350624269340187e-05, + "learning_rate": 3.9079024194720205e-05, + "loss": 0.0, + "num_input_tokens_seen": 8363608, + "step": 13730 + }, + { + "epoch": 3.7879205736348593, + "grad_norm": 0.00041195587255060673, + "learning_rate": 3.906907876953138e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8366680, + "step": 13735 + }, + { + "epoch": 3.7892995035852177, + "grad_norm": 7.599734817631543e-05, + "learning_rate": 3.905913008473896e-05, + "loss": 0.0, + "num_input_tokens_seen": 8370328, + "step": 13740 + }, + { + "epoch": 3.7906784335355765, + "grad_norm": 0.0012608547694981098, + "learning_rate": 3.9049178142647916e-05, + "loss": 0.0038, + "num_input_tokens_seen": 8372632, + "step": 13745 + }, + { + "epoch": 3.792057363485935, + "grad_norm": 0.003942818380892277, + "learning_rate": 3.903922294556396e-05, + "loss": 0.2293, + "num_input_tokens_seen": 8374904, + "step": 13750 + }, + { + "epoch": 3.7934362934362933, + "grad_norm": 0.002677327487617731, + "learning_rate": 3.902926449579357e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8377880, + "step": 13755 + }, + { + "epoch": 3.794815223386652, + "grad_norm": 0.001845623250119388, + "learning_rate": 3.901930279564399e-05, + "loss": 0.0005, + "num_input_tokens_seen": 8380856, + "step": 13760 + }, + { + "epoch": 3.7961941533370105, + "grad_norm": 0.003763906890526414, + "learning_rate": 3.900933784742318e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8384312, + "step": 13765 + }, + { + "epoch": 3.797573083287369, + "grad_norm": 0.0010883783688768744, + "learning_rate": 3.899936965343989e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8387736, + "step": 13770 + }, + { + "epoch": 3.7989520132377277, + "grad_norm": 2.066755771636963, + "learning_rate": 3.8989398216003604e-05, + "loss": 0.1285, + "num_input_tokens_seen": 8390584, + "step": 13775 + }, + { + "epoch": 3.800330943188086, + "grad_norm": 0.0025266380980610847, + "learning_rate": 3.8979423537424554e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8393880, + "step": 13780 + }, + { + "epoch": 3.8017098731384444, + "grad_norm": 0.03675956651568413, + "learning_rate": 3.896944562001375e-05, + "loss": 0.0177, + "num_input_tokens_seen": 8396632, + "step": 13785 + }, + { + "epoch": 3.8030888030888033, + "grad_norm": 0.0007311087683774531, + "learning_rate": 3.895946446608291e-05, + "loss": 0.0006, + "num_input_tokens_seen": 8399256, + "step": 13790 + }, + { + "epoch": 3.8044677330391616, + "grad_norm": 9.078010559082031, + "learning_rate": 3.8949480077944537e-05, + "loss": 0.1045, + "num_input_tokens_seen": 8401976, + "step": 13795 + }, + { + "epoch": 3.80584666298952, + "grad_norm": 0.00762053020298481, + "learning_rate": 3.8939492457911866e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8405304, + "step": 13800 + }, + { + "epoch": 3.8072255929398784, + "grad_norm": 0.0019864714704453945, + "learning_rate": 3.892950160829888e-05, + "loss": 0.0008, + "num_input_tokens_seen": 8407896, + "step": 13805 + }, + { + "epoch": 3.808604522890237, + "grad_norm": 0.022044116631150246, + "learning_rate": 3.891950753142033e-05, + "loss": 0.0006, + "num_input_tokens_seen": 8410776, + "step": 13810 + }, + { + "epoch": 3.8099834528405956, + "grad_norm": 0.0037514548748731613, + "learning_rate": 3.890951022959169e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8413432, + "step": 13815 + }, + { + "epoch": 3.8113623827909544, + "grad_norm": 0.00211413879878819, + "learning_rate": 3.8899509705129186e-05, + "loss": 0.0005, + "num_input_tokens_seen": 8415800, + "step": 13820 + }, + { + "epoch": 3.812741312741313, + "grad_norm": 0.04082673415541649, + "learning_rate": 3.8889505960349816e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8418488, + "step": 13825 + }, + { + "epoch": 3.814120242691671, + "grad_norm": 0.020029639825224876, + "learning_rate": 3.8879498997571275e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8421208, + "step": 13830 + }, + { + "epoch": 3.8154991726420295, + "grad_norm": 0.004676443058997393, + "learning_rate": 3.8869488819112056e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8423800, + "step": 13835 + }, + { + "epoch": 3.8168781025923884, + "grad_norm": 0.02997012622654438, + "learning_rate": 3.885947542729137e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8427192, + "step": 13840 + }, + { + "epoch": 3.8182570325427467, + "grad_norm": 0.006997817195951939, + "learning_rate": 3.884945882442916e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8431512, + "step": 13845 + }, + { + "epoch": 3.8196359624931056, + "grad_norm": 0.001702662673778832, + "learning_rate": 3.8839439012846155e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8434232, + "step": 13850 + }, + { + "epoch": 3.821014892443464, + "grad_norm": 0.020158493891358376, + "learning_rate": 3.8829415994863784e-05, + "loss": 0.0004, + "num_input_tokens_seen": 8438936, + "step": 13855 + }, + { + "epoch": 3.8223938223938223, + "grad_norm": 0.0027431813068687916, + "learning_rate": 3.881938977280424e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8442328, + "step": 13860 + }, + { + "epoch": 3.8237727523441807, + "grad_norm": 7.853109359741211, + "learning_rate": 3.8809360348990455e-05, + "loss": 0.1043, + "num_input_tokens_seen": 8445208, + "step": 13865 + }, + { + "epoch": 3.8251516822945395, + "grad_norm": 19.227453231811523, + "learning_rate": 3.879932772574609e-05, + "loss": 0.0657, + "num_input_tokens_seen": 8448152, + "step": 13870 + }, + { + "epoch": 3.826530612244898, + "grad_norm": 0.010998403653502464, + "learning_rate": 3.8789291905395584e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8450872, + "step": 13875 + }, + { + "epoch": 3.8279095421952567, + "grad_norm": 0.003701422829180956, + "learning_rate": 3.8779252890264084e-05, + "loss": 0.0005, + "num_input_tokens_seen": 8454840, + "step": 13880 + }, + { + "epoch": 3.829288472145615, + "grad_norm": 0.004529767669737339, + "learning_rate": 3.876921068267748e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8457592, + "step": 13885 + }, + { + "epoch": 3.8306674020959735, + "grad_norm": 0.0020828740671277046, + "learning_rate": 3.8759165284962406e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8460312, + "step": 13890 + }, + { + "epoch": 3.832046332046332, + "grad_norm": 0.0011385735124349594, + "learning_rate": 3.874911669944624e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8463096, + "step": 13895 + }, + { + "epoch": 3.8334252619966906, + "grad_norm": 0.0005038586095906794, + "learning_rate": 3.8739064928457104e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8465880, + "step": 13900 + }, + { + "epoch": 3.834804191947049, + "grad_norm": 0.0017869931180030107, + "learning_rate": 3.872900997432383e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8468344, + "step": 13905 + }, + { + "epoch": 3.836183121897408, + "grad_norm": 0.0005546158645302057, + "learning_rate": 3.8718951839376026e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8471544, + "step": 13910 + }, + { + "epoch": 3.837562051847766, + "grad_norm": 6.475228786468506, + "learning_rate": 3.8708890525944e-05, + "loss": 0.1002, + "num_input_tokens_seen": 8474264, + "step": 13915 + }, + { + "epoch": 3.8389409817981246, + "grad_norm": 0.002455963520333171, + "learning_rate": 3.869882603635883e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8477432, + "step": 13920 + }, + { + "epoch": 3.840319911748483, + "grad_norm": 0.00286113447509706, + "learning_rate": 3.86887583729523e-05, + "loss": 0.0006, + "num_input_tokens_seen": 8480216, + "step": 13925 + }, + { + "epoch": 3.841698841698842, + "grad_norm": 0.00912951584905386, + "learning_rate": 3.867868753805695e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8484280, + "step": 13930 + }, + { + "epoch": 3.8430777716492, + "grad_norm": 0.01539518777281046, + "learning_rate": 3.866861353400605e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8487192, + "step": 13935 + }, + { + "epoch": 3.844456701599559, + "grad_norm": 0.018167659640312195, + "learning_rate": 3.86585363631336e-05, + "loss": 0.0008, + "num_input_tokens_seen": 8490552, + "step": 13940 + }, + { + "epoch": 3.8458356315499174, + "grad_norm": 0.0020140320993959904, + "learning_rate": 3.864845602777433e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8492664, + "step": 13945 + }, + { + "epoch": 3.8472145615002757, + "grad_norm": 0.00017648658831603825, + "learning_rate": 3.8638372530263715e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8496216, + "step": 13950 + }, + { + "epoch": 3.848593491450634, + "grad_norm": 0.0004516136832535267, + "learning_rate": 3.862828587293796e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8499096, + "step": 13955 + }, + { + "epoch": 3.849972421400993, + "grad_norm": 0.0004698562261182815, + "learning_rate": 3.861819605813399e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8501304, + "step": 13960 + }, + { + "epoch": 3.8513513513513513, + "grad_norm": 0.0011632249224931002, + "learning_rate": 3.860810308818948e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8504472, + "step": 13965 + }, + { + "epoch": 3.85273028130171, + "grad_norm": 0.05992380529642105, + "learning_rate": 3.8598006965442815e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8508120, + "step": 13970 + }, + { + "epoch": 3.8541092112520685, + "grad_norm": 0.008089038543403149, + "learning_rate": 3.858790769223313e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8511000, + "step": 13975 + }, + { + "epoch": 3.855488141202427, + "grad_norm": 0.0012346263974905014, + "learning_rate": 3.857780527090028e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8513976, + "step": 13980 + }, + { + "epoch": 3.8568670711527853, + "grad_norm": 0.0006418422563001513, + "learning_rate": 3.8567699703784853e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8516440, + "step": 13985 + }, + { + "epoch": 3.858246001103144, + "grad_norm": 0.002822226844727993, + "learning_rate": 3.8557590993228155e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8519864, + "step": 13990 + }, + { + "epoch": 3.8596249310535025, + "grad_norm": 0.0024693431332707405, + "learning_rate": 3.854747914157224e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8522488, + "step": 13995 + }, + { + "epoch": 3.861003861003861, + "grad_norm": 1.388344168663025, + "learning_rate": 3.853736415115987e-05, + "loss": 0.1802, + "num_input_tokens_seen": 8526072, + "step": 14000 + }, + { + "epoch": 3.8623827909542197, + "grad_norm": 5.232800006866455, + "learning_rate": 3.852724602433455e-05, + "loss": 0.0011, + "num_input_tokens_seen": 8529080, + "step": 14005 + }, + { + "epoch": 3.863761720904578, + "grad_norm": 0.060032155364751816, + "learning_rate": 3.85171247634405e-05, + "loss": 0.0493, + "num_input_tokens_seen": 8532088, + "step": 14010 + }, + { + "epoch": 3.8651406508549364, + "grad_norm": 0.05752028897404671, + "learning_rate": 3.850700037082268e-05, + "loss": 0.0006, + "num_input_tokens_seen": 8534648, + "step": 14015 + }, + { + "epoch": 3.8665195808052952, + "grad_norm": 0.1384345293045044, + "learning_rate": 3.849687284882675e-05, + "loss": 0.0016, + "num_input_tokens_seen": 8537400, + "step": 14020 + }, + { + "epoch": 3.8678985107556536, + "grad_norm": 0.003966166637837887, + "learning_rate": 3.848674219979913e-05, + "loss": 0.0046, + "num_input_tokens_seen": 8541336, + "step": 14025 + }, + { + "epoch": 3.869277440706012, + "grad_norm": 0.0022221982944756746, + "learning_rate": 3.847660842608693e-05, + "loss": 0.0, + "num_input_tokens_seen": 8544568, + "step": 14030 + }, + { + "epoch": 3.870656370656371, + "grad_norm": 0.6309296488761902, + "learning_rate": 3.8466471530038e-05, + "loss": 0.0006, + "num_input_tokens_seen": 8547224, + "step": 14035 + }, + { + "epoch": 3.872035300606729, + "grad_norm": 0.000996616086922586, + "learning_rate": 3.845633151400093e-05, + "loss": 0.0, + "num_input_tokens_seen": 8550584, + "step": 14040 + }, + { + "epoch": 3.8734142305570876, + "grad_norm": 0.022307250648736954, + "learning_rate": 3.8446188380325e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8555032, + "step": 14045 + }, + { + "epoch": 3.8747931605074464, + "grad_norm": 21.887863159179688, + "learning_rate": 3.843604213136024e-05, + "loss": 0.0774, + "num_input_tokens_seen": 8557880, + "step": 14050 + }, + { + "epoch": 3.8761720904578048, + "grad_norm": 0.00010740837751654908, + "learning_rate": 3.8425892769457386e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8561528, + "step": 14055 + }, + { + "epoch": 3.877551020408163, + "grad_norm": 0.00014502413978334516, + "learning_rate": 3.8415740296967896e-05, + "loss": 0.0, + "num_input_tokens_seen": 8563960, + "step": 14060 + }, + { + "epoch": 3.878929950358522, + "grad_norm": 0.0028308513574302197, + "learning_rate": 3.8405584716243945e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8566072, + "step": 14065 + }, + { + "epoch": 3.8803088803088803, + "grad_norm": 0.00023881356173660606, + "learning_rate": 3.839542602963846e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8568536, + "step": 14070 + }, + { + "epoch": 3.8816878102592387, + "grad_norm": 0.002719649812206626, + "learning_rate": 3.838526423950504e-05, + "loss": 0.065, + "num_input_tokens_seen": 8571512, + "step": 14075 + }, + { + "epoch": 3.883066740209597, + "grad_norm": 0.0015505178598687053, + "learning_rate": 3.837509934819803e-05, + "loss": 0.0, + "num_input_tokens_seen": 8574744, + "step": 14080 + }, + { + "epoch": 3.884445670159956, + "grad_norm": 0.001251363311894238, + "learning_rate": 3.83649313580725e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8577336, + "step": 14085 + }, + { + "epoch": 3.8858246001103143, + "grad_norm": 0.0001164923669421114, + "learning_rate": 3.8354760271484215e-05, + "loss": 0.0, + "num_input_tokens_seen": 8580536, + "step": 14090 + }, + { + "epoch": 3.887203530060673, + "grad_norm": 0.04131250083446503, + "learning_rate": 3.834458609078968e-05, + "loss": 0.1147, + "num_input_tokens_seen": 8584824, + "step": 14095 + }, + { + "epoch": 3.8885824600110315, + "grad_norm": 0.0012068025534972548, + "learning_rate": 3.8334408818346095e-05, + "loss": 0.0, + "num_input_tokens_seen": 8587384, + "step": 14100 + }, + { + "epoch": 3.88996138996139, + "grad_norm": 0.004302692599594593, + "learning_rate": 3.832422845651139e-05, + "loss": 0.1123, + "num_input_tokens_seen": 8591480, + "step": 14105 + }, + { + "epoch": 3.8913403199117482, + "grad_norm": 0.5500175952911377, + "learning_rate": 3.831404500764423e-05, + "loss": 0.0007, + "num_input_tokens_seen": 8595960, + "step": 14110 + }, + { + "epoch": 3.892719249862107, + "grad_norm": 0.013104438781738281, + "learning_rate": 3.830385847410395e-05, + "loss": 0.1066, + "num_input_tokens_seen": 8599160, + "step": 14115 + }, + { + "epoch": 3.8940981798124654, + "grad_norm": 0.025211872532963753, + "learning_rate": 3.829366885825062e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8602040, + "step": 14120 + }, + { + "epoch": 3.8954771097628242, + "grad_norm": 0.005324623081833124, + "learning_rate": 3.828347616244505e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8604536, + "step": 14125 + }, + { + "epoch": 3.8968560397131826, + "grad_norm": 0.0011652971152216196, + "learning_rate": 3.8273280389048735e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8609368, + "step": 14130 + }, + { + "epoch": 3.898234969663541, + "grad_norm": 0.01673763617873192, + "learning_rate": 3.826308154042387e-05, + "loss": 0.0004, + "num_input_tokens_seen": 8612568, + "step": 14135 + }, + { + "epoch": 3.8996138996138994, + "grad_norm": 0.004753780085593462, + "learning_rate": 3.82528796189334e-05, + "loss": 0.0006, + "num_input_tokens_seen": 8614936, + "step": 14140 + }, + { + "epoch": 3.900992829564258, + "grad_norm": 0.003530558431521058, + "learning_rate": 3.8242674626940965e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8617304, + "step": 14145 + }, + { + "epoch": 3.9023717595146166, + "grad_norm": 0.06404106318950653, + "learning_rate": 3.823246656681091e-05, + "loss": 0.0005, + "num_input_tokens_seen": 8620056, + "step": 14150 + }, + { + "epoch": 3.9037506894649754, + "grad_norm": 0.026945756748318672, + "learning_rate": 3.822225544090829e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8624408, + "step": 14155 + }, + { + "epoch": 3.9051296194153338, + "grad_norm": 0.04102223366498947, + "learning_rate": 3.8212041251598884e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8627864, + "step": 14160 + }, + { + "epoch": 3.906508549365692, + "grad_norm": 0.0022325681056827307, + "learning_rate": 3.8201824001249173e-05, + "loss": 0.0138, + "num_input_tokens_seen": 8631288, + "step": 14165 + }, + { + "epoch": 3.9078874793160505, + "grad_norm": 0.0004951123264618218, + "learning_rate": 3.819160369222634e-05, + "loss": 0.0, + "num_input_tokens_seen": 8633944, + "step": 14170 + }, + { + "epoch": 3.9092664092664093, + "grad_norm": 0.004495569970458746, + "learning_rate": 3.81813803268983e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8636728, + "step": 14175 + }, + { + "epoch": 3.9106453392167677, + "grad_norm": 0.011209476739168167, + "learning_rate": 3.817115390763364e-05, + "loss": 0.0007, + "num_input_tokens_seen": 8640056, + "step": 14180 + }, + { + "epoch": 3.9120242691671265, + "grad_norm": 0.04912056773900986, + "learning_rate": 3.8160924436801685e-05, + "loss": 0.1337, + "num_input_tokens_seen": 8643640, + "step": 14185 + }, + { + "epoch": 3.913403199117485, + "grad_norm": 0.01857907325029373, + "learning_rate": 3.815069191677246e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8646424, + "step": 14190 + }, + { + "epoch": 3.9147821290678433, + "grad_norm": 0.005923304241150618, + "learning_rate": 3.814045634991669e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8649752, + "step": 14195 + }, + { + "epoch": 3.9161610590182017, + "grad_norm": 0.0010136603377759457, + "learning_rate": 3.81302177386058e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8651992, + "step": 14200 + }, + { + "epoch": 3.9175399889685605, + "grad_norm": 0.0004668919718824327, + "learning_rate": 3.8119976085211937e-05, + "loss": 0.0004, + "num_input_tokens_seen": 8654616, + "step": 14205 + }, + { + "epoch": 3.918918918918919, + "grad_norm": 0.0014073258498683572, + "learning_rate": 3.810973139210795e-05, + "loss": 0.0981, + "num_input_tokens_seen": 8657336, + "step": 14210 + }, + { + "epoch": 3.9202978488692777, + "grad_norm": 0.00327498372644186, + "learning_rate": 3.809948366166738e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8659800, + "step": 14215 + }, + { + "epoch": 3.921676778819636, + "grad_norm": 0.060734864324331284, + "learning_rate": 3.808923289626448e-05, + "loss": 0.0005, + "num_input_tokens_seen": 8663096, + "step": 14220 + }, + { + "epoch": 3.9230557087699944, + "grad_norm": 0.032874539494514465, + "learning_rate": 3.80789790982742e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8666232, + "step": 14225 + }, + { + "epoch": 3.924434638720353, + "grad_norm": 0.0034789075143635273, + "learning_rate": 3.806872227007222e-05, + "loss": 0.0011, + "num_input_tokens_seen": 8668952, + "step": 14230 + }, + { + "epoch": 3.9258135686707116, + "grad_norm": 0.01293415017426014, + "learning_rate": 3.8058462414034866e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8672728, + "step": 14235 + }, + { + "epoch": 3.92719249862107, + "grad_norm": 0.0011787419207394123, + "learning_rate": 3.804819953253923e-05, + "loss": 0.096, + "num_input_tokens_seen": 8675960, + "step": 14240 + }, + { + "epoch": 3.928571428571429, + "grad_norm": 0.0385882705450058, + "learning_rate": 3.8037933627963055e-05, + "loss": 0.0983, + "num_input_tokens_seen": 8679320, + "step": 14245 + }, + { + "epoch": 3.929950358521787, + "grad_norm": 0.018160516396164894, + "learning_rate": 3.8027664702684814e-05, + "loss": 0.0004, + "num_input_tokens_seen": 8681624, + "step": 14250 + }, + { + "epoch": 3.9313292884721456, + "grad_norm": 0.03206915780901909, + "learning_rate": 3.801739275908367e-05, + "loss": 0.0005, + "num_input_tokens_seen": 8684408, + "step": 14255 + }, + { + "epoch": 3.932708218422504, + "grad_norm": 2.714200973510742, + "learning_rate": 3.8007117799539475e-05, + "loss": 0.0537, + "num_input_tokens_seen": 8686904, + "step": 14260 + }, + { + "epoch": 3.9340871483728628, + "grad_norm": 0.0016414244892075658, + "learning_rate": 3.7996839826432796e-05, + "loss": 0.0005, + "num_input_tokens_seen": 8690488, + "step": 14265 + }, + { + "epoch": 3.935466078323221, + "grad_norm": 0.002688860986381769, + "learning_rate": 3.7986558842144895e-05, + "loss": 0.0351, + "num_input_tokens_seen": 8692984, + "step": 14270 + }, + { + "epoch": 3.93684500827358, + "grad_norm": 0.02999676764011383, + "learning_rate": 3.797627484905772e-05, + "loss": 0.0015, + "num_input_tokens_seen": 8695800, + "step": 14275 + }, + { + "epoch": 3.9382239382239383, + "grad_norm": 0.28651466965675354, + "learning_rate": 3.796598784955393e-05, + "loss": 0.0012, + "num_input_tokens_seen": 8699704, + "step": 14280 + }, + { + "epoch": 3.9396028681742967, + "grad_norm": 0.0041876607574522495, + "learning_rate": 3.795569784601688e-05, + "loss": 0.0017, + "num_input_tokens_seen": 8704376, + "step": 14285 + }, + { + "epoch": 3.940981798124655, + "grad_norm": 0.0035701743327081203, + "learning_rate": 3.79454048408306e-05, + "loss": 0.0532, + "num_input_tokens_seen": 8707320, + "step": 14290 + }, + { + "epoch": 3.942360728075014, + "grad_norm": 0.057362787425518036, + "learning_rate": 3.793510883637985e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8710200, + "step": 14295 + }, + { + "epoch": 3.9437396580253723, + "grad_norm": 0.0025778599083423615, + "learning_rate": 3.7924809835050065e-05, + "loss": 0.0636, + "num_input_tokens_seen": 8714232, + "step": 14300 + }, + { + "epoch": 3.9451185879757307, + "grad_norm": 0.0025223682168871164, + "learning_rate": 3.791450783922736e-05, + "loss": 0.0006, + "num_input_tokens_seen": 8717112, + "step": 14305 + }, + { + "epoch": 3.9464975179260895, + "grad_norm": 0.004062006715685129, + "learning_rate": 3.7904202851298565e-05, + "loss": 0.0004, + "num_input_tokens_seen": 8720152, + "step": 14310 + }, + { + "epoch": 3.947876447876448, + "grad_norm": 0.00019889470422640443, + "learning_rate": 3.789389487365121e-05, + "loss": 0.0004, + "num_input_tokens_seen": 8722776, + "step": 14315 + }, + { + "epoch": 3.9492553778268062, + "grad_norm": 0.0009167041862383485, + "learning_rate": 3.788358390867349e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8726392, + "step": 14320 + }, + { + "epoch": 3.950634307777165, + "grad_norm": 0.004048195667564869, + "learning_rate": 3.787326995875432e-05, + "loss": 0.0007, + "num_input_tokens_seen": 8728920, + "step": 14325 + }, + { + "epoch": 3.9520132377275234, + "grad_norm": 0.004991008434444666, + "learning_rate": 3.786295302628329e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8731544, + "step": 14330 + }, + { + "epoch": 3.953392167677882, + "grad_norm": 15.780783653259277, + "learning_rate": 3.785263311365068e-05, + "loss": 0.1379, + "num_input_tokens_seen": 8734712, + "step": 14335 + }, + { + "epoch": 3.9547710976282406, + "grad_norm": 0.006955367513000965, + "learning_rate": 3.784231022324748e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8737880, + "step": 14340 + }, + { + "epoch": 3.956150027578599, + "grad_norm": 32.5867805480957, + "learning_rate": 3.7831984357465335e-05, + "loss": 0.0593, + "num_input_tokens_seen": 8740152, + "step": 14345 + }, + { + "epoch": 3.9575289575289574, + "grad_norm": 0.09338022023439407, + "learning_rate": 3.782165551869661e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8742328, + "step": 14350 + }, + { + "epoch": 3.958907887479316, + "grad_norm": 14.442769050598145, + "learning_rate": 3.781132370933434e-05, + "loss": 0.0084, + "num_input_tokens_seen": 8745528, + "step": 14355 + }, + { + "epoch": 3.9602868174296746, + "grad_norm": 0.09991579502820969, + "learning_rate": 3.7800988931772285e-05, + "loss": 0.053, + "num_input_tokens_seen": 8748152, + "step": 14360 + }, + { + "epoch": 3.961665747380033, + "grad_norm": 0.02645624615252018, + "learning_rate": 3.779065118840484e-05, + "loss": 0.0005, + "num_input_tokens_seen": 8751448, + "step": 14365 + }, + { + "epoch": 3.963044677330392, + "grad_norm": 10.255952835083008, + "learning_rate": 3.778031048162711e-05, + "loss": 0.0549, + "num_input_tokens_seen": 8754296, + "step": 14370 + }, + { + "epoch": 3.96442360728075, + "grad_norm": 0.0011722492054104805, + "learning_rate": 3.77699668138349e-05, + "loss": 0.0038, + "num_input_tokens_seen": 8758072, + "step": 14375 + }, + { + "epoch": 3.9658025372311085, + "grad_norm": 9.583639144897461, + "learning_rate": 3.775962018742468e-05, + "loss": 0.0161, + "num_input_tokens_seen": 8761336, + "step": 14380 + }, + { + "epoch": 3.967181467181467, + "grad_norm": 0.0025389459915459156, + "learning_rate": 3.774927060479363e-05, + "loss": 0.0005, + "num_input_tokens_seen": 8764216, + "step": 14385 + }, + { + "epoch": 3.9685603971318257, + "grad_norm": 0.01481222826987505, + "learning_rate": 3.773891806833958e-05, + "loss": 0.0634, + "num_input_tokens_seen": 8767832, + "step": 14390 + }, + { + "epoch": 3.969939327082184, + "grad_norm": 0.00020189688075333834, + "learning_rate": 3.772856258046108e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8771608, + "step": 14395 + }, + { + "epoch": 3.971318257032543, + "grad_norm": 0.00019851130491588265, + "learning_rate": 3.771820414355733e-05, + "loss": 0.0006, + "num_input_tokens_seen": 8775032, + "step": 14400 + }, + { + "epoch": 3.9726971869829013, + "grad_norm": 0.0005925272707827389, + "learning_rate": 3.770784276002826e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8778168, + "step": 14405 + }, + { + "epoch": 3.9740761169332597, + "grad_norm": 0.0010954226600006223, + "learning_rate": 3.7697478432274424e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8784088, + "step": 14410 + }, + { + "epoch": 3.975455046883618, + "grad_norm": 0.537207841873169, + "learning_rate": 3.7687111162697096e-05, + "loss": 0.0006, + "num_input_tokens_seen": 8786616, + "step": 14415 + }, + { + "epoch": 3.976833976833977, + "grad_norm": 0.036396343261003494, + "learning_rate": 3.767674095369823e-05, + "loss": 0.1023, + "num_input_tokens_seen": 8791160, + "step": 14420 + }, + { + "epoch": 3.9782129067843353, + "grad_norm": 0.07480505853891373, + "learning_rate": 3.766636780768046e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8794104, + "step": 14425 + }, + { + "epoch": 3.979591836734694, + "grad_norm": 0.005529831163585186, + "learning_rate": 3.765599172704709e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8797496, + "step": 14430 + }, + { + "epoch": 3.9809707666850525, + "grad_norm": 0.011858387850224972, + "learning_rate": 3.764561271420209e-05, + "loss": 0.0239, + "num_input_tokens_seen": 8799544, + "step": 14435 + }, + { + "epoch": 3.982349696635411, + "grad_norm": 0.0016167437424883246, + "learning_rate": 3.763523077155016e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8802680, + "step": 14440 + }, + { + "epoch": 3.983728626585769, + "grad_norm": 0.0006997982272878289, + "learning_rate": 3.7624845901496626e-05, + "loss": 0.0004, + "num_input_tokens_seen": 8805848, + "step": 14445 + }, + { + "epoch": 3.985107556536128, + "grad_norm": 0.027822917327284813, + "learning_rate": 3.761445810644752e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8807896, + "step": 14450 + }, + { + "epoch": 3.9864864864864864, + "grad_norm": 0.0013614854542538524, + "learning_rate": 3.760406738880954e-05, + "loss": 0.0, + "num_input_tokens_seen": 8810424, + "step": 14455 + }, + { + "epoch": 3.9878654164368452, + "grad_norm": 0.0013756559928879142, + "learning_rate": 3.759367375099007e-05, + "loss": 0.0208, + "num_input_tokens_seen": 8813816, + "step": 14460 + }, + { + "epoch": 3.9892443463872036, + "grad_norm": 0.00026342959608882666, + "learning_rate": 3.758327719539717e-05, + "loss": 0.0004, + "num_input_tokens_seen": 8817048, + "step": 14465 + }, + { + "epoch": 3.990623276337562, + "grad_norm": 0.00010521979857003316, + "learning_rate": 3.757287772443957e-05, + "loss": 0.0131, + "num_input_tokens_seen": 8822264, + "step": 14470 + }, + { + "epoch": 3.9920022062879204, + "grad_norm": 0.0055816988460719585, + "learning_rate": 3.756247534052668e-05, + "loss": 0.0012, + "num_input_tokens_seen": 8825560, + "step": 14475 + }, + { + "epoch": 3.993381136238279, + "grad_norm": 0.003589016618207097, + "learning_rate": 3.755207004606858e-05, + "loss": 0.0762, + "num_input_tokens_seen": 8828888, + "step": 14480 + }, + { + "epoch": 3.9947600661886375, + "grad_norm": 0.001284954254515469, + "learning_rate": 3.7541661843476026e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8831992, + "step": 14485 + }, + { + "epoch": 3.9961389961389964, + "grad_norm": 0.0029916304629296064, + "learning_rate": 3.753125073516044e-05, + "loss": 0.0, + "num_input_tokens_seen": 8835256, + "step": 14490 + }, + { + "epoch": 3.9975179260893547, + "grad_norm": 3.605230085668154e-05, + "learning_rate": 3.752083672353395e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8838040, + "step": 14495 + }, + { + "epoch": 3.998896856039713, + "grad_norm": 0.0003334406646899879, + "learning_rate": 3.7510419811009325e-05, + "loss": 0.0453, + "num_input_tokens_seen": 8842104, + "step": 14500 + }, + { + "epoch": 4.0, + "eval_loss": 0.21640491485595703, + "eval_runtime": 28.4711, + "eval_samples_per_second": 56.619, + "eval_steps_per_second": 14.155, + "num_input_tokens_seen": 8844408, + "step": 14504 + }, + { + "epoch": 4.0002757859900715, + "grad_norm": 11.850436210632324, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.0629, + "num_input_tokens_seen": 8844920, + "step": 14505 + }, + { + "epoch": 4.00165471594043, + "grad_norm": 7.304669270524755e-05, + "learning_rate": 3.748957729292011e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8847544, + "step": 14510 + }, + { + "epoch": 4.003033645890789, + "grad_norm": 0.002618846483528614, + "learning_rate": 3.7479151692184446e-05, + "loss": 0.0, + "num_input_tokens_seen": 8851288, + "step": 14515 + }, + { + "epoch": 4.0044125758411475, + "grad_norm": 0.0003656522312667221, + "learning_rate": 3.7468723200208453e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8853912, + "step": 14520 + }, + { + "epoch": 4.005791505791506, + "grad_norm": 0.12357902526855469, + "learning_rate": 3.7458291819408285e-05, + "loss": 0.0005, + "num_input_tokens_seen": 8858104, + "step": 14525 + }, + { + "epoch": 4.007170435741864, + "grad_norm": 0.0008283788920380175, + "learning_rate": 3.7447857552200734e-05, + "loss": 0.0, + "num_input_tokens_seen": 8860472, + "step": 14530 + }, + { + "epoch": 4.008549365692223, + "grad_norm": 0.00037640516529791057, + "learning_rate": 3.743742040100326e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8863480, + "step": 14535 + }, + { + "epoch": 4.009928295642581, + "grad_norm": 46.6085090637207, + "learning_rate": 3.742698036823403e-05, + "loss": 0.0579, + "num_input_tokens_seen": 8866616, + "step": 14540 + }, + { + "epoch": 4.01130722559294, + "grad_norm": 0.008807887323200703, + "learning_rate": 3.741653745631182e-05, + "loss": 0.0, + "num_input_tokens_seen": 8870680, + "step": 14545 + }, + { + "epoch": 4.012686155543299, + "grad_norm": 0.0006956835277378559, + "learning_rate": 3.740609166765611e-05, + "loss": 0.0, + "num_input_tokens_seen": 8873304, + "step": 14550 + }, + { + "epoch": 4.014065085493657, + "grad_norm": 0.0003824675513897091, + "learning_rate": 3.739564300468705e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8876600, + "step": 14555 + }, + { + "epoch": 4.015444015444015, + "grad_norm": 0.007668101228773594, + "learning_rate": 3.738519146982543e-05, + "loss": 0.0447, + "num_input_tokens_seen": 8880056, + "step": 14560 + }, + { + "epoch": 4.016822945394374, + "grad_norm": 0.0029623403679579496, + "learning_rate": 3.737473706549274e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8883384, + "step": 14565 + }, + { + "epoch": 4.018201875344732, + "grad_norm": 0.008586783893406391, + "learning_rate": 3.736427979411109e-05, + "loss": 0.0, + "num_input_tokens_seen": 8886072, + "step": 14570 + }, + { + "epoch": 4.019580805295091, + "grad_norm": 0.0009432295919395983, + "learning_rate": 3.73538196581033e-05, + "loss": 0.0, + "num_input_tokens_seen": 8888760, + "step": 14575 + }, + { + "epoch": 4.02095973524545, + "grad_norm": 0.4787852168083191, + "learning_rate": 3.7343356659892834e-05, + "loss": 0.0006, + "num_input_tokens_seen": 8892408, + "step": 14580 + }, + { + "epoch": 4.022338665195808, + "grad_norm": 0.0002452838816680014, + "learning_rate": 3.733289080190381e-05, + "loss": 0.0, + "num_input_tokens_seen": 8895320, + "step": 14585 + }, + { + "epoch": 4.023717595146167, + "grad_norm": 0.00016686713206581771, + "learning_rate": 3.732242208656101e-05, + "loss": 0.0, + "num_input_tokens_seen": 8899736, + "step": 14590 + }, + { + "epoch": 4.025096525096525, + "grad_norm": 0.0012385715963318944, + "learning_rate": 3.731195051628989e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8903128, + "step": 14595 + }, + { + "epoch": 4.026475455046883, + "grad_norm": 0.07559910416603088, + "learning_rate": 3.730147609351659e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8905944, + "step": 14600 + }, + { + "epoch": 4.027854384997243, + "grad_norm": 0.0001424203801434487, + "learning_rate": 3.7290998820667834e-05, + "loss": 0.0, + "num_input_tokens_seen": 8908824, + "step": 14605 + }, + { + "epoch": 4.029233314947601, + "grad_norm": 0.004967537242919207, + "learning_rate": 3.7280518700171094e-05, + "loss": 0.0, + "num_input_tokens_seen": 8911480, + "step": 14610 + }, + { + "epoch": 4.030612244897959, + "grad_norm": 0.0002822061360348016, + "learning_rate": 3.7270035734454445e-05, + "loss": 0.0, + "num_input_tokens_seen": 8914104, + "step": 14615 + }, + { + "epoch": 4.031991174848318, + "grad_norm": 0.00024444167502224445, + "learning_rate": 3.7259549925946655e-05, + "loss": 0.1188, + "num_input_tokens_seen": 8917080, + "step": 14620 + }, + { + "epoch": 4.033370104798676, + "grad_norm": 9.65297149377875e-05, + "learning_rate": 3.724906127707712e-05, + "loss": 0.0, + "num_input_tokens_seen": 8921048, + "step": 14625 + }, + { + "epoch": 4.0347490347490345, + "grad_norm": 8.925137080950662e-05, + "learning_rate": 3.7238569790275914e-05, + "loss": 0.0002, + "num_input_tokens_seen": 8923544, + "step": 14630 + }, + { + "epoch": 4.036127964699393, + "grad_norm": 0.05139981582760811, + "learning_rate": 3.7228075467973765e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8926744, + "step": 14635 + }, + { + "epoch": 4.037506894649752, + "grad_norm": 0.00011289623944321647, + "learning_rate": 3.7217578312602064e-05, + "loss": 0.0, + "num_input_tokens_seen": 8929144, + "step": 14640 + }, + { + "epoch": 4.0388858246001105, + "grad_norm": 0.0008735382580198348, + "learning_rate": 3.7207078326592836e-05, + "loss": 0.0116, + "num_input_tokens_seen": 8931960, + "step": 14645 + }, + { + "epoch": 4.040264754550469, + "grad_norm": 0.0061987037770450115, + "learning_rate": 3.719657551237878e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8935320, + "step": 14650 + }, + { + "epoch": 4.041643684500827, + "grad_norm": 7.098936557769775, + "learning_rate": 3.718606987239327e-05, + "loss": 0.0027, + "num_input_tokens_seen": 8938232, + "step": 14655 + }, + { + "epoch": 4.043022614451186, + "grad_norm": 0.0019292898941785097, + "learning_rate": 3.717556140907029e-05, + "loss": 0.0, + "num_input_tokens_seen": 8940632, + "step": 14660 + }, + { + "epoch": 4.044401544401544, + "grad_norm": 0.0009451609221287072, + "learning_rate": 3.716505012484449e-05, + "loss": 0.0, + "num_input_tokens_seen": 8943704, + "step": 14665 + }, + { + "epoch": 4.045780474351903, + "grad_norm": 0.001336017856374383, + "learning_rate": 3.715453602215121e-05, + "loss": 0.0, + "num_input_tokens_seen": 8947480, + "step": 14670 + }, + { + "epoch": 4.047159404302262, + "grad_norm": 0.0001340899761999026, + "learning_rate": 3.714401910342641e-05, + "loss": 0.0, + "num_input_tokens_seen": 8949880, + "step": 14675 + }, + { + "epoch": 4.04853833425262, + "grad_norm": 3.065455530304462e-05, + "learning_rate": 3.713349937110669e-05, + "loss": 0.0003, + "num_input_tokens_seen": 8952472, + "step": 14680 + }, + { + "epoch": 4.049917264202978, + "grad_norm": 0.00023364061780739576, + "learning_rate": 3.712297682762934e-05, + "loss": 0.024, + "num_input_tokens_seen": 8954808, + "step": 14685 + }, + { + "epoch": 4.051296194153337, + "grad_norm": 0.0002689358952920884, + "learning_rate": 3.711245147543229e-05, + "loss": 0.0, + "num_input_tokens_seen": 8958296, + "step": 14690 + }, + { + "epoch": 4.052675124103695, + "grad_norm": 0.0001667701144469902, + "learning_rate": 3.7101923316954085e-05, + "loss": 0.0, + "num_input_tokens_seen": 8961272, + "step": 14695 + }, + { + "epoch": 4.054054054054054, + "grad_norm": 0.0005256707081571221, + "learning_rate": 3.709139235463397e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8963640, + "step": 14700 + }, + { + "epoch": 4.055432984004413, + "grad_norm": 0.001038038870319724, + "learning_rate": 3.7080858590911805e-05, + "loss": 0.0, + "num_input_tokens_seen": 8967224, + "step": 14705 + }, + { + "epoch": 4.056811913954771, + "grad_norm": 0.0008105036686174572, + "learning_rate": 3.707032202822813e-05, + "loss": 0.0, + "num_input_tokens_seen": 8969624, + "step": 14710 + }, + { + "epoch": 4.0581908439051295, + "grad_norm": 0.01235236506909132, + "learning_rate": 3.705978266902409e-05, + "loss": 0.0106, + "num_input_tokens_seen": 8972760, + "step": 14715 + }, + { + "epoch": 4.059569773855488, + "grad_norm": 4.923585493088467e-06, + "learning_rate": 3.704924051574153e-05, + "loss": 0.0, + "num_input_tokens_seen": 8974904, + "step": 14720 + }, + { + "epoch": 4.060948703805846, + "grad_norm": 4.039236591779627e-05, + "learning_rate": 3.703869557082289e-05, + "loss": 0.0, + "num_input_tokens_seen": 8977752, + "step": 14725 + }, + { + "epoch": 4.0623276337562055, + "grad_norm": 0.00032097159419208765, + "learning_rate": 3.70281478367113e-05, + "loss": 0.0, + "num_input_tokens_seen": 8980632, + "step": 14730 + }, + { + "epoch": 4.063706563706564, + "grad_norm": 0.008865858428180218, + "learning_rate": 3.701759731585052e-05, + "loss": 0.0, + "num_input_tokens_seen": 8983864, + "step": 14735 + }, + { + "epoch": 4.065085493656922, + "grad_norm": 0.0006617965409532189, + "learning_rate": 3.700704401068494e-05, + "loss": 0.0001, + "num_input_tokens_seen": 8986872, + "step": 14740 + }, + { + "epoch": 4.066464423607281, + "grad_norm": 0.00014954712241888046, + "learning_rate": 3.6996487923659615e-05, + "loss": 0.0, + "num_input_tokens_seen": 8989592, + "step": 14745 + }, + { + "epoch": 4.067843353557639, + "grad_norm": 0.00039527396438643336, + "learning_rate": 3.698592905722025e-05, + "loss": 0.0, + "num_input_tokens_seen": 8992504, + "step": 14750 + }, + { + "epoch": 4.069222283507997, + "grad_norm": 0.0010233111679553986, + "learning_rate": 3.6975367413813164e-05, + "loss": 0.0007, + "num_input_tokens_seen": 8996024, + "step": 14755 + }, + { + "epoch": 4.070601213458357, + "grad_norm": 2.1518044377444312e-05, + "learning_rate": 3.696480299588535e-05, + "loss": 0.0, + "num_input_tokens_seen": 8999864, + "step": 14760 + }, + { + "epoch": 4.071980143408715, + "grad_norm": 2.0886687707388774e-05, + "learning_rate": 3.6954235805884434e-05, + "loss": 0.0, + "num_input_tokens_seen": 9004216, + "step": 14765 + }, + { + "epoch": 4.073359073359073, + "grad_norm": 4.017625542473979e-05, + "learning_rate": 3.694366584625867e-05, + "loss": 0.0, + "num_input_tokens_seen": 9009016, + "step": 14770 + }, + { + "epoch": 4.074738003309432, + "grad_norm": 3.045686571567785e-05, + "learning_rate": 3.693309311945698e-05, + "loss": 0.0, + "num_input_tokens_seen": 9011832, + "step": 14775 + }, + { + "epoch": 4.07611693325979, + "grad_norm": 4.313693352742121e-06, + "learning_rate": 3.692251762792891e-05, + "loss": 0.0, + "num_input_tokens_seen": 9014168, + "step": 14780 + }, + { + "epoch": 4.077495863210149, + "grad_norm": 0.00018662922957446426, + "learning_rate": 3.6911939374124635e-05, + "loss": 0.0, + "num_input_tokens_seen": 9017656, + "step": 14785 + }, + { + "epoch": 4.078874793160508, + "grad_norm": 0.00042428483720868826, + "learning_rate": 3.690135836049501e-05, + "loss": 0.0, + "num_input_tokens_seen": 9020856, + "step": 14790 + }, + { + "epoch": 4.080253723110866, + "grad_norm": 0.00016314072126988322, + "learning_rate": 3.689077458949149e-05, + "loss": 0.0, + "num_input_tokens_seen": 9023256, + "step": 14795 + }, + { + "epoch": 4.081632653061225, + "grad_norm": 3.8932808820391074e-05, + "learning_rate": 3.688018806356617e-05, + "loss": 0.0, + "num_input_tokens_seen": 9026232, + "step": 14800 + }, + { + "epoch": 4.083011583011583, + "grad_norm": 4.6569575715693645e-06, + "learning_rate": 3.686959878517181e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9029272, + "step": 14805 + }, + { + "epoch": 4.084390512961941, + "grad_norm": 0.001592554384842515, + "learning_rate": 3.68590067567618e-05, + "loss": 0.0, + "num_input_tokens_seen": 9035448, + "step": 14810 + }, + { + "epoch": 4.0857694429123, + "grad_norm": 0.0002217196743004024, + "learning_rate": 3.684841198079016e-05, + "loss": 0.0, + "num_input_tokens_seen": 9038360, + "step": 14815 + }, + { + "epoch": 4.087148372862659, + "grad_norm": 0.00013138630311004817, + "learning_rate": 3.683781445971152e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9040920, + "step": 14820 + }, + { + "epoch": 4.088527302813017, + "grad_norm": 9.439892892260104e-05, + "learning_rate": 3.6827214195981215e-05, + "loss": 0.0, + "num_input_tokens_seen": 9043480, + "step": 14825 + }, + { + "epoch": 4.089906232763376, + "grad_norm": 1.9184848497388884e-05, + "learning_rate": 3.6816611192055145e-05, + "loss": 0.0, + "num_input_tokens_seen": 9046072, + "step": 14830 + }, + { + "epoch": 4.091285162713734, + "grad_norm": 0.00018584559438750148, + "learning_rate": 3.680600545038988e-05, + "loss": 0.0, + "num_input_tokens_seen": 9048984, + "step": 14835 + }, + { + "epoch": 4.0926640926640925, + "grad_norm": 6.357272650348023e-05, + "learning_rate": 3.679539697344262e-05, + "loss": 0.0, + "num_input_tokens_seen": 9052056, + "step": 14840 + }, + { + "epoch": 4.094043022614451, + "grad_norm": 0.014037741348147392, + "learning_rate": 3.67847857636712e-05, + "loss": 0.0, + "num_input_tokens_seen": 9055256, + "step": 14845 + }, + { + "epoch": 4.09542195256481, + "grad_norm": 2.114132439601235e-05, + "learning_rate": 3.677417182353409e-05, + "loss": 0.0, + "num_input_tokens_seen": 9057816, + "step": 14850 + }, + { + "epoch": 4.0968008825151685, + "grad_norm": 0.003868827363476157, + "learning_rate": 3.676355515549037e-05, + "loss": 0.0, + "num_input_tokens_seen": 9061176, + "step": 14855 + }, + { + "epoch": 4.098179812465527, + "grad_norm": 3.1490926630795e-05, + "learning_rate": 3.675293576199978e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9064536, + "step": 14860 + }, + { + "epoch": 4.099558742415885, + "grad_norm": 8.814165084913839e-06, + "learning_rate": 3.674231364552269e-05, + "loss": 0.0834, + "num_input_tokens_seen": 9067704, + "step": 14865 + }, + { + "epoch": 4.100937672366244, + "grad_norm": 0.01091432012617588, + "learning_rate": 3.6731688808520073e-05, + "loss": 0.0, + "num_input_tokens_seen": 9070072, + "step": 14870 + }, + { + "epoch": 4.102316602316602, + "grad_norm": 9.793874778551981e-05, + "learning_rate": 3.672106125345356e-05, + "loss": 0.0, + "num_input_tokens_seen": 9072920, + "step": 14875 + }, + { + "epoch": 4.103695532266961, + "grad_norm": 0.0004034037992823869, + "learning_rate": 3.67104309827854e-05, + "loss": 0.0, + "num_input_tokens_seen": 9075832, + "step": 14880 + }, + { + "epoch": 4.10507446221732, + "grad_norm": 5.336606773198582e-05, + "learning_rate": 3.669979799897849e-05, + "loss": 0.0, + "num_input_tokens_seen": 9078296, + "step": 14885 + }, + { + "epoch": 4.106453392167678, + "grad_norm": 0.029392581433057785, + "learning_rate": 3.668916230449633e-05, + "loss": 0.0, + "num_input_tokens_seen": 9080472, + "step": 14890 + }, + { + "epoch": 4.107832322118036, + "grad_norm": 0.00041560924728401005, + "learning_rate": 3.667852390180304e-05, + "loss": 0.0, + "num_input_tokens_seen": 9082744, + "step": 14895 + }, + { + "epoch": 4.109211252068395, + "grad_norm": 27.44247817993164, + "learning_rate": 3.6667882793363414e-05, + "loss": 0.0876, + "num_input_tokens_seen": 9085912, + "step": 14900 + }, + { + "epoch": 4.110590182018753, + "grad_norm": 0.0005066064768470824, + "learning_rate": 3.6657238981642816e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9088312, + "step": 14905 + }, + { + "epoch": 4.1119691119691115, + "grad_norm": 0.0017946711741387844, + "learning_rate": 3.664659246910729e-05, + "loss": 0.0, + "num_input_tokens_seen": 9091352, + "step": 14910 + }, + { + "epoch": 4.113348041919471, + "grad_norm": 0.00016730725474189967, + "learning_rate": 3.6635943258223466e-05, + "loss": 0.0, + "num_input_tokens_seen": 9095480, + "step": 14915 + }, + { + "epoch": 4.114726971869829, + "grad_norm": 0.00039618517621420324, + "learning_rate": 3.662529135145862e-05, + "loss": 0.0, + "num_input_tokens_seen": 9098552, + "step": 14920 + }, + { + "epoch": 4.1161059018201875, + "grad_norm": 34.83564376831055, + "learning_rate": 3.6614636751280635e-05, + "loss": 0.098, + "num_input_tokens_seen": 9101304, + "step": 14925 + }, + { + "epoch": 4.117484831770546, + "grad_norm": 0.7795924544334412, + "learning_rate": 3.660397946015804e-05, + "loss": 0.0011, + "num_input_tokens_seen": 9105848, + "step": 14930 + }, + { + "epoch": 4.118863761720904, + "grad_norm": 3.4007498470600694e-05, + "learning_rate": 3.6593319480559975e-05, + "loss": 0.0, + "num_input_tokens_seen": 9108600, + "step": 14935 + }, + { + "epoch": 4.120242691671263, + "grad_norm": 5.887266070203623e-06, + "learning_rate": 3.658265681495619e-05, + "loss": 0.0, + "num_input_tokens_seen": 9111256, + "step": 14940 + }, + { + "epoch": 4.121621621621622, + "grad_norm": 0.0005046160076744854, + "learning_rate": 3.657199146581709e-05, + "loss": 0.0, + "num_input_tokens_seen": 9113624, + "step": 14945 + }, + { + "epoch": 4.12300055157198, + "grad_norm": 0.0005080712726339698, + "learning_rate": 3.6561323435613684e-05, + "loss": 0.0, + "num_input_tokens_seen": 9116600, + "step": 14950 + }, + { + "epoch": 4.124379481522339, + "grad_norm": 0.0029886215925216675, + "learning_rate": 3.655065272681759e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9119384, + "step": 14955 + }, + { + "epoch": 4.125758411472697, + "grad_norm": 9.867858898360282e-05, + "learning_rate": 3.653997934190106e-05, + "loss": 0.0, + "num_input_tokens_seen": 9122104, + "step": 14960 + }, + { + "epoch": 4.127137341423055, + "grad_norm": 9.355494512419682e-06, + "learning_rate": 3.652930328333697e-05, + "loss": 0.0, + "num_input_tokens_seen": 9125304, + "step": 14965 + }, + { + "epoch": 4.128516271373414, + "grad_norm": 0.004071173258125782, + "learning_rate": 3.65186245535988e-05, + "loss": 0.0, + "num_input_tokens_seen": 9128696, + "step": 14970 + }, + { + "epoch": 4.129895201323773, + "grad_norm": 0.004471008665859699, + "learning_rate": 3.650794315516067e-05, + "loss": 0.0, + "num_input_tokens_seen": 9131352, + "step": 14975 + }, + { + "epoch": 4.1312741312741315, + "grad_norm": 12.037786483764648, + "learning_rate": 3.649725909049729e-05, + "loss": 0.003, + "num_input_tokens_seen": 9135192, + "step": 14980 + }, + { + "epoch": 4.13265306122449, + "grad_norm": 0.006741185672581196, + "learning_rate": 3.648657236208403e-05, + "loss": 0.0, + "num_input_tokens_seen": 9137944, + "step": 14985 + }, + { + "epoch": 4.134031991174848, + "grad_norm": 6.121678597992286e-05, + "learning_rate": 3.647588297239683e-05, + "loss": 0.0, + "num_input_tokens_seen": 9140472, + "step": 14990 + }, + { + "epoch": 4.135410921125207, + "grad_norm": 0.00015420698036905378, + "learning_rate": 3.646519092391227e-05, + "loss": 0.0, + "num_input_tokens_seen": 9142552, + "step": 14995 + }, + { + "epoch": 4.136789851075565, + "grad_norm": 0.1270378828048706, + "learning_rate": 3.645449621910756e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9144984, + "step": 15000 + }, + { + "epoch": 4.138168781025924, + "grad_norm": 0.012670539319515228, + "learning_rate": 3.644379886046049e-05, + "loss": 0.0, + "num_input_tokens_seen": 9147576, + "step": 15005 + }, + { + "epoch": 4.139547710976283, + "grad_norm": 5.196839538257336e-06, + "learning_rate": 3.64330988504495e-05, + "loss": 0.0, + "num_input_tokens_seen": 9149816, + "step": 15010 + }, + { + "epoch": 4.140926640926641, + "grad_norm": 6.851859507150948e-05, + "learning_rate": 3.6422396191553616e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9152696, + "step": 15015 + }, + { + "epoch": 4.142305570876999, + "grad_norm": 0.00010455436131451279, + "learning_rate": 3.64116908862525e-05, + "loss": 0.0, + "num_input_tokens_seen": 9155640, + "step": 15020 + }, + { + "epoch": 4.143684500827358, + "grad_norm": 1.829596294555813e-05, + "learning_rate": 3.640098293702641e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9158648, + "step": 15025 + }, + { + "epoch": 4.145063430777716, + "grad_norm": 0.21882449090480804, + "learning_rate": 3.6390272346356224e-05, + "loss": 0.0003, + "num_input_tokens_seen": 9161560, + "step": 15030 + }, + { + "epoch": 4.146442360728075, + "grad_norm": 6.960855898796581e-06, + "learning_rate": 3.637955911672345e-05, + "loss": 0.0, + "num_input_tokens_seen": 9165784, + "step": 15035 + }, + { + "epoch": 4.147821290678434, + "grad_norm": 3.7685058487113565e-06, + "learning_rate": 3.636884325061016e-05, + "loss": 0.0, + "num_input_tokens_seen": 9168856, + "step": 15040 + }, + { + "epoch": 4.149200220628792, + "grad_norm": 0.0002529381890781224, + "learning_rate": 3.6358124750499095e-05, + "loss": 0.0, + "num_input_tokens_seen": 9171384, + "step": 15045 + }, + { + "epoch": 4.1505791505791505, + "grad_norm": 5.999406857881695e-05, + "learning_rate": 3.6347403618873556e-05, + "loss": 0.0, + "num_input_tokens_seen": 9173912, + "step": 15050 + }, + { + "epoch": 4.151958080529509, + "grad_norm": 2.984137063322123e-05, + "learning_rate": 3.6336679858217485e-05, + "loss": 0.0, + "num_input_tokens_seen": 9176600, + "step": 15055 + }, + { + "epoch": 4.153337010479867, + "grad_norm": 0.0005777961341664195, + "learning_rate": 3.632595347101543e-05, + "loss": 0.0, + "num_input_tokens_seen": 9178968, + "step": 15060 + }, + { + "epoch": 4.1547159404302265, + "grad_norm": 6.635838417423656e-06, + "learning_rate": 3.631522445975252e-05, + "loss": 0.0, + "num_input_tokens_seen": 9182488, + "step": 15065 + }, + { + "epoch": 4.156094870380585, + "grad_norm": 1.3445031981973443e-05, + "learning_rate": 3.6304492826914535e-05, + "loss": 0.0, + "num_input_tokens_seen": 9185304, + "step": 15070 + }, + { + "epoch": 4.157473800330943, + "grad_norm": 3.561486437320127e-06, + "learning_rate": 3.629375857498784e-05, + "loss": 0.0, + "num_input_tokens_seen": 9189400, + "step": 15075 + }, + { + "epoch": 4.158852730281302, + "grad_norm": 17.329294204711914, + "learning_rate": 3.628302170645938e-05, + "loss": 0.0015, + "num_input_tokens_seen": 9192312, + "step": 15080 + }, + { + "epoch": 4.16023166023166, + "grad_norm": 1.5649957276764326e-05, + "learning_rate": 3.627228222381675e-05, + "loss": 0.0, + "num_input_tokens_seen": 9194968, + "step": 15085 + }, + { + "epoch": 4.161610590182018, + "grad_norm": 0.008511544205248356, + "learning_rate": 3.626154012954816e-05, + "loss": 0.0, + "num_input_tokens_seen": 9197688, + "step": 15090 + }, + { + "epoch": 4.162989520132378, + "grad_norm": 0.0011140508577227592, + "learning_rate": 3.625079542614236e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9200984, + "step": 15095 + }, + { + "epoch": 4.164368450082736, + "grad_norm": 5.326422251528129e-05, + "learning_rate": 3.624004811608876e-05, + "loss": 0.0, + "num_input_tokens_seen": 9204536, + "step": 15100 + }, + { + "epoch": 4.165747380033094, + "grad_norm": 4.086346598342061e-05, + "learning_rate": 3.622929820187736e-05, + "loss": 0.0, + "num_input_tokens_seen": 9206904, + "step": 15105 + }, + { + "epoch": 4.167126309983453, + "grad_norm": 3.5938428482040763e-06, + "learning_rate": 3.621854568599875e-05, + "loss": 0.0, + "num_input_tokens_seen": 9209752, + "step": 15110 + }, + { + "epoch": 4.168505239933811, + "grad_norm": 5.680723461409798e-06, + "learning_rate": 3.620779057094414e-05, + "loss": 0.0, + "num_input_tokens_seen": 9212760, + "step": 15115 + }, + { + "epoch": 4.1698841698841695, + "grad_norm": 0.00010409755486762151, + "learning_rate": 3.619703285920534e-05, + "loss": 0.0, + "num_input_tokens_seen": 9216600, + "step": 15120 + }, + { + "epoch": 4.171263099834529, + "grad_norm": 0.0004116458003409207, + "learning_rate": 3.6186272553274745e-05, + "loss": 0.0, + "num_input_tokens_seen": 9219832, + "step": 15125 + }, + { + "epoch": 4.172642029784887, + "grad_norm": 0.0013677809620276093, + "learning_rate": 3.617550965564538e-05, + "loss": 0.225, + "num_input_tokens_seen": 9222584, + "step": 15130 + }, + { + "epoch": 4.174020959735246, + "grad_norm": 0.012547001242637634, + "learning_rate": 3.616474416881085e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9226648, + "step": 15135 + }, + { + "epoch": 4.175399889685604, + "grad_norm": 0.0027361437678337097, + "learning_rate": 3.615397609526535e-05, + "loss": 0.0004, + "num_input_tokens_seen": 9230040, + "step": 15140 + }, + { + "epoch": 4.176778819635962, + "grad_norm": 0.0013476021122187376, + "learning_rate": 3.61432054375037e-05, + "loss": 0.0, + "num_input_tokens_seen": 9232376, + "step": 15145 + }, + { + "epoch": 4.178157749586321, + "grad_norm": 0.0007060233619995415, + "learning_rate": 3.613243219802131e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9235352, + "step": 15150 + }, + { + "epoch": 4.17953667953668, + "grad_norm": 0.0621793195605278, + "learning_rate": 3.612165637931417e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9237880, + "step": 15155 + }, + { + "epoch": 4.180915609487038, + "grad_norm": 0.0005242339102551341, + "learning_rate": 3.6110877983878896e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9240504, + "step": 15160 + }, + { + "epoch": 4.182294539437397, + "grad_norm": 0.0010777247371152043, + "learning_rate": 3.61000970142127e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9244280, + "step": 15165 + }, + { + "epoch": 4.183673469387755, + "grad_norm": 0.0003836188407149166, + "learning_rate": 3.608931347281334e-05, + "loss": 0.0, + "num_input_tokens_seen": 9246776, + "step": 15170 + }, + { + "epoch": 4.1850523993381135, + "grad_norm": 0.0006181768258102238, + "learning_rate": 3.607852736217924e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9250872, + "step": 15175 + }, + { + "epoch": 4.186431329288472, + "grad_norm": 0.00243822718039155, + "learning_rate": 3.6067738684809364e-05, + "loss": 0.0, + "num_input_tokens_seen": 9254200, + "step": 15180 + }, + { + "epoch": 4.18781025923883, + "grad_norm": 0.002072777831926942, + "learning_rate": 3.6056947443203314e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9256984, + "step": 15185 + }, + { + "epoch": 4.1891891891891895, + "grad_norm": 0.00033577284193597734, + "learning_rate": 3.604615363986126e-05, + "loss": 0.0, + "num_input_tokens_seen": 9260664, + "step": 15190 + }, + { + "epoch": 4.190568119139548, + "grad_norm": 0.0005969985504634678, + "learning_rate": 3.603535727728396e-05, + "loss": 0.0, + "num_input_tokens_seen": 9264088, + "step": 15195 + }, + { + "epoch": 4.191947049089906, + "grad_norm": 0.00016624679847154766, + "learning_rate": 3.6024558357972785e-05, + "loss": 0.0, + "num_input_tokens_seen": 9267096, + "step": 15200 + }, + { + "epoch": 4.193325979040265, + "grad_norm": 0.00043502941844053566, + "learning_rate": 3.6013756884429706e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9270104, + "step": 15205 + }, + { + "epoch": 4.194704908990623, + "grad_norm": 0.00014348176773637533, + "learning_rate": 3.600295285915724e-05, + "loss": 0.0, + "num_input_tokens_seen": 9272600, + "step": 15210 + }, + { + "epoch": 4.196083838940982, + "grad_norm": 0.000557563325855881, + "learning_rate": 3.599214628465854e-05, + "loss": 0.0, + "num_input_tokens_seen": 9275224, + "step": 15215 + }, + { + "epoch": 4.197462768891341, + "grad_norm": 0.00017007389396894723, + "learning_rate": 3.5981337163437326e-05, + "loss": 0.0, + "num_input_tokens_seen": 9277528, + "step": 15220 + }, + { + "epoch": 4.198841698841699, + "grad_norm": 0.006393698509782553, + "learning_rate": 3.597052549799792e-05, + "loss": 0.0, + "num_input_tokens_seen": 9280056, + "step": 15225 + }, + { + "epoch": 4.200220628792057, + "grad_norm": 0.00013713269436266273, + "learning_rate": 3.5959711290845246e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9283224, + "step": 15230 + }, + { + "epoch": 4.201599558742416, + "grad_norm": 7.229945913422853e-05, + "learning_rate": 3.594889454448478e-05, + "loss": 0.0, + "num_input_tokens_seen": 9285944, + "step": 15235 + }, + { + "epoch": 4.202978488692774, + "grad_norm": 2.9468255888787098e-05, + "learning_rate": 3.593807526142261e-05, + "loss": 0.0, + "num_input_tokens_seen": 9289048, + "step": 15240 + }, + { + "epoch": 4.2043574186431325, + "grad_norm": 0.004753808956593275, + "learning_rate": 3.592725344416542e-05, + "loss": 0.0, + "num_input_tokens_seen": 9291960, + "step": 15245 + }, + { + "epoch": 4.205736348593492, + "grad_norm": 5.027651059208438e-05, + "learning_rate": 3.591642909522045e-05, + "loss": 0.0, + "num_input_tokens_seen": 9294616, + "step": 15250 + }, + { + "epoch": 4.20711527854385, + "grad_norm": 0.0012182702776044607, + "learning_rate": 3.5905602217095564e-05, + "loss": 0.0, + "num_input_tokens_seen": 9297336, + "step": 15255 + }, + { + "epoch": 4.2084942084942085, + "grad_norm": 0.0019460992189124227, + "learning_rate": 3.589477281229918e-05, + "loss": 0.0, + "num_input_tokens_seen": 9301048, + "step": 15260 + }, + { + "epoch": 4.209873138444567, + "grad_norm": 0.0002975023235194385, + "learning_rate": 3.588394088334034e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9303448, + "step": 15265 + }, + { + "epoch": 4.211252068394925, + "grad_norm": 0.0010573638137429953, + "learning_rate": 3.587310643272862e-05, + "loss": 0.0, + "num_input_tokens_seen": 9306680, + "step": 15270 + }, + { + "epoch": 4.212630998345284, + "grad_norm": 0.0003363651630934328, + "learning_rate": 3.586226946297421e-05, + "loss": 0.0, + "num_input_tokens_seen": 9309240, + "step": 15275 + }, + { + "epoch": 4.214009928295643, + "grad_norm": 0.000573858036659658, + "learning_rate": 3.5851429976587897e-05, + "loss": 0.0, + "num_input_tokens_seen": 9312568, + "step": 15280 + }, + { + "epoch": 4.215388858246001, + "grad_norm": 0.0009190048440359533, + "learning_rate": 3.584058797608102e-05, + "loss": 0.0, + "num_input_tokens_seen": 9316504, + "step": 15285 + }, + { + "epoch": 4.21676778819636, + "grad_norm": 0.00010884036601055413, + "learning_rate": 3.582974346396551e-05, + "loss": 0.0, + "num_input_tokens_seen": 9320408, + "step": 15290 + }, + { + "epoch": 4.218146718146718, + "grad_norm": 0.0010887670796364546, + "learning_rate": 3.581889644275391e-05, + "loss": 0.0047, + "num_input_tokens_seen": 9323960, + "step": 15295 + }, + { + "epoch": 4.219525648097076, + "grad_norm": 0.004915567580610514, + "learning_rate": 3.5808046914959304e-05, + "loss": 0.0, + "num_input_tokens_seen": 9327448, + "step": 15300 + }, + { + "epoch": 4.220904578047435, + "grad_norm": 6.12670773989521e-05, + "learning_rate": 3.579719488309536e-05, + "loss": 0.1063, + "num_input_tokens_seen": 9329560, + "step": 15305 + }, + { + "epoch": 4.222283507997794, + "grad_norm": 0.029883291572332382, + "learning_rate": 3.578634034967636e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9332248, + "step": 15310 + }, + { + "epoch": 4.223662437948152, + "grad_norm": 0.0013859098544344306, + "learning_rate": 3.5775483317217115e-05, + "loss": 0.0, + "num_input_tokens_seen": 9335576, + "step": 15315 + }, + { + "epoch": 4.225041367898511, + "grad_norm": 0.0006173021974973381, + "learning_rate": 3.576462378823308e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9338296, + "step": 15320 + }, + { + "epoch": 4.226420297848869, + "grad_norm": 0.00044381365296430886, + "learning_rate": 3.5753761765240224e-05, + "loss": 0.0, + "num_input_tokens_seen": 9341784, + "step": 15325 + }, + { + "epoch": 4.227799227799228, + "grad_norm": 0.002232547616586089, + "learning_rate": 3.574289725075513e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9344760, + "step": 15330 + }, + { + "epoch": 4.229178157749586, + "grad_norm": 0.0008386418339796364, + "learning_rate": 3.573203024729496e-05, + "loss": 0.0, + "num_input_tokens_seen": 9347032, + "step": 15335 + }, + { + "epoch": 4.230557087699945, + "grad_norm": 0.0046493886038661, + "learning_rate": 3.572116075737743e-05, + "loss": 0.0, + "num_input_tokens_seen": 9350680, + "step": 15340 + }, + { + "epoch": 4.231936017650304, + "grad_norm": 0.014028752222657204, + "learning_rate": 3.571028878352084e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9354648, + "step": 15345 + }, + { + "epoch": 4.233314947600662, + "grad_norm": 0.00034603377571329474, + "learning_rate": 3.569941432824408e-05, + "loss": 0.0, + "num_input_tokens_seen": 9358040, + "step": 15350 + }, + { + "epoch": 4.23469387755102, + "grad_norm": 0.001516031101346016, + "learning_rate": 3.568853739406662e-05, + "loss": 0.0, + "num_input_tokens_seen": 9362584, + "step": 15355 + }, + { + "epoch": 4.236072807501379, + "grad_norm": 0.00020512523769866675, + "learning_rate": 3.567765798350846e-05, + "loss": 0.0, + "num_input_tokens_seen": 9365272, + "step": 15360 + }, + { + "epoch": 4.237451737451737, + "grad_norm": 0.00010471173300174996, + "learning_rate": 3.5666776099090213e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9369144, + "step": 15365 + }, + { + "epoch": 4.238830667402096, + "grad_norm": 0.0004018774488940835, + "learning_rate": 3.565589174333307e-05, + "loss": 0.0, + "num_input_tokens_seen": 9371480, + "step": 15370 + }, + { + "epoch": 4.240209597352455, + "grad_norm": 0.00013949039566796273, + "learning_rate": 3.564500491875877e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9374808, + "step": 15375 + }, + { + "epoch": 4.241588527302813, + "grad_norm": 0.0001049422353389673, + "learning_rate": 3.563411562788963e-05, + "loss": 0.0, + "num_input_tokens_seen": 9377880, + "step": 15380 + }, + { + "epoch": 4.2429674572531715, + "grad_norm": 0.000768961850553751, + "learning_rate": 3.562322387324854e-05, + "loss": 0.0, + "num_input_tokens_seen": 9380856, + "step": 15385 + }, + { + "epoch": 4.24434638720353, + "grad_norm": 0.0001516571792308241, + "learning_rate": 3.561232965735897e-05, + "loss": 0.0835, + "num_input_tokens_seen": 9383448, + "step": 15390 + }, + { + "epoch": 4.245725317153888, + "grad_norm": 0.02579726278781891, + "learning_rate": 3.560143298274496e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9386616, + "step": 15395 + }, + { + "epoch": 4.2471042471042475, + "grad_norm": 0.010424148291349411, + "learning_rate": 3.5590533851931096e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9390552, + "step": 15400 + }, + { + "epoch": 4.248483177054606, + "grad_norm": 0.0003429515054449439, + "learning_rate": 3.5579632267442564e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9393336, + "step": 15405 + }, + { + "epoch": 4.249862107004964, + "grad_norm": 0.002827544929459691, + "learning_rate": 3.55687282318051e-05, + "loss": 0.0003, + "num_input_tokens_seen": 9395576, + "step": 15410 + }, + { + "epoch": 4.251241036955323, + "grad_norm": 0.00012766479630954564, + "learning_rate": 3.5557821747545015e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9399512, + "step": 15415 + }, + { + "epoch": 4.252619966905681, + "grad_norm": 4.228422767482698e-05, + "learning_rate": 3.554691281718918e-05, + "loss": 0.0, + "num_input_tokens_seen": 9402424, + "step": 15420 + }, + { + "epoch": 4.253998896856039, + "grad_norm": 0.0013055766467005014, + "learning_rate": 3.5536001443265036e-05, + "loss": 0.0, + "num_input_tokens_seen": 9405592, + "step": 15425 + }, + { + "epoch": 4.255377826806399, + "grad_norm": 0.0010901812929660082, + "learning_rate": 3.55250876283006e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9409048, + "step": 15430 + }, + { + "epoch": 4.256756756756757, + "grad_norm": 0.003921166528016329, + "learning_rate": 3.5514171374824446e-05, + "loss": 0.0005, + "num_input_tokens_seen": 9412056, + "step": 15435 + }, + { + "epoch": 4.258135686707115, + "grad_norm": 0.0027680196799337864, + "learning_rate": 3.55032526853657e-05, + "loss": 0.0065, + "num_input_tokens_seen": 9415000, + "step": 15440 + }, + { + "epoch": 4.259514616657474, + "grad_norm": 0.0026113735511898994, + "learning_rate": 3.5492331562454086e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9418904, + "step": 15445 + }, + { + "epoch": 4.260893546607832, + "grad_norm": 0.0009845878230407834, + "learning_rate": 3.548140800861985e-05, + "loss": 0.0, + "num_input_tokens_seen": 9422296, + "step": 15450 + }, + { + "epoch": 4.2622724765581905, + "grad_norm": 0.00031609012512490153, + "learning_rate": 3.547048202639384e-05, + "loss": 0.0, + "num_input_tokens_seen": 9425432, + "step": 15455 + }, + { + "epoch": 4.263651406508549, + "grad_norm": 2.720165684877429e-05, + "learning_rate": 3.545955361830744e-05, + "loss": 0.0, + "num_input_tokens_seen": 9427768, + "step": 15460 + }, + { + "epoch": 4.265030336458908, + "grad_norm": 0.00011204789188923314, + "learning_rate": 3.54486227868926e-05, + "loss": 0.0, + "num_input_tokens_seen": 9430424, + "step": 15465 + }, + { + "epoch": 4.2664092664092665, + "grad_norm": 0.010255116038024426, + "learning_rate": 3.543768953468186e-05, + "loss": 0.0, + "num_input_tokens_seen": 9432568, + "step": 15470 + }, + { + "epoch": 4.267788196359625, + "grad_norm": 17.161516189575195, + "learning_rate": 3.542675386420827e-05, + "loss": 0.0548, + "num_input_tokens_seen": 9436344, + "step": 15475 + }, + { + "epoch": 4.269167126309983, + "grad_norm": 6.329041934804991e-05, + "learning_rate": 3.541581577800548e-05, + "loss": 0.1961, + "num_input_tokens_seen": 9439544, + "step": 15480 + }, + { + "epoch": 4.270546056260342, + "grad_norm": 0.03961891680955887, + "learning_rate": 3.540487527860769e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9442776, + "step": 15485 + }, + { + "epoch": 4.271924986210701, + "grad_norm": 0.012221190147101879, + "learning_rate": 3.539393236854966e-05, + "loss": 0.0009, + "num_input_tokens_seen": 9444824, + "step": 15490 + }, + { + "epoch": 4.273303916161059, + "grad_norm": 0.01157456636428833, + "learning_rate": 3.5382987050366694e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9447192, + "step": 15495 + }, + { + "epoch": 4.274682846111418, + "grad_norm": 0.010028047487139702, + "learning_rate": 3.537203932659466e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9450200, + "step": 15500 + }, + { + "epoch": 4.276061776061776, + "grad_norm": 0.00592257734388113, + "learning_rate": 3.5361089199770016e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9452984, + "step": 15505 + }, + { + "epoch": 4.277440706012134, + "grad_norm": 0.005456688813865185, + "learning_rate": 3.535013667242973e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9456088, + "step": 15510 + }, + { + "epoch": 4.278819635962493, + "grad_norm": 0.0027582633774727583, + "learning_rate": 3.5339181747111334e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9459256, + "step": 15515 + }, + { + "epoch": 4.280198565912851, + "grad_norm": 0.0003318684466648847, + "learning_rate": 3.532822442635295e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9461528, + "step": 15520 + }, + { + "epoch": 4.2815774958632105, + "grad_norm": 0.0035441573709249496, + "learning_rate": 3.531726471269322e-05, + "loss": 0.0003, + "num_input_tokens_seen": 9464792, + "step": 15525 + }, + { + "epoch": 4.282956425813569, + "grad_norm": 2.404608726501465, + "learning_rate": 3.530630260867135e-05, + "loss": 0.0009, + "num_input_tokens_seen": 9467416, + "step": 15530 + }, + { + "epoch": 4.284335355763927, + "grad_norm": 0.0007409917307086289, + "learning_rate": 3.5295338116827105e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9469816, + "step": 15535 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.004608759190887213, + "learning_rate": 3.528437123970081e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9473336, + "step": 15540 + }, + { + "epoch": 4.287093215664644, + "grad_norm": 0.00023398910707328469, + "learning_rate": 3.5273401979833323e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9477112, + "step": 15545 + }, + { + "epoch": 4.288472145615003, + "grad_norm": 0.0003510262758936733, + "learning_rate": 3.526243033976607e-05, + "loss": 0.0, + "num_input_tokens_seen": 9480280, + "step": 15550 + }, + { + "epoch": 4.289851075565362, + "grad_norm": 0.00028557286714203656, + "learning_rate": 3.525145632204101e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9483064, + "step": 15555 + }, + { + "epoch": 4.29123000551572, + "grad_norm": 0.0233883298933506, + "learning_rate": 3.524047992920068e-05, + "loss": 0.0856, + "num_input_tokens_seen": 9485528, + "step": 15560 + }, + { + "epoch": 4.292608935466078, + "grad_norm": 0.000269173615379259, + "learning_rate": 3.522950116378813e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9487960, + "step": 15565 + }, + { + "epoch": 4.293987865416437, + "grad_norm": 0.0011073154164478183, + "learning_rate": 3.5218520028347025e-05, + "loss": 0.0, + "num_input_tokens_seen": 9491512, + "step": 15570 + }, + { + "epoch": 4.295366795366795, + "grad_norm": 0.04216739535331726, + "learning_rate": 3.520753652542149e-05, + "loss": 0.055, + "num_input_tokens_seen": 9495000, + "step": 15575 + }, + { + "epoch": 4.2967457253171535, + "grad_norm": 0.01195742841809988, + "learning_rate": 3.5196550657556276e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9497656, + "step": 15580 + }, + { + "epoch": 4.298124655267513, + "grad_norm": 0.00041324232006445527, + "learning_rate": 3.518556242729664e-05, + "loss": 0.065, + "num_input_tokens_seen": 9500408, + "step": 15585 + }, + { + "epoch": 4.299503585217871, + "grad_norm": 1.693939447402954, + "learning_rate": 3.51745718371884e-05, + "loss": 0.0011, + "num_input_tokens_seen": 9503032, + "step": 15590 + }, + { + "epoch": 4.3008825151682295, + "grad_norm": 0.00018601810734253377, + "learning_rate": 3.516357888977791e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9505336, + "step": 15595 + }, + { + "epoch": 4.302261445118588, + "grad_norm": 0.0009293809998780489, + "learning_rate": 3.515258358761208e-05, + "loss": 0.0003, + "num_input_tokens_seen": 9507832, + "step": 15600 + }, + { + "epoch": 4.303640375068946, + "grad_norm": 0.00035100607783533633, + "learning_rate": 3.514158593323837e-05, + "loss": 0.0, + "num_input_tokens_seen": 9510456, + "step": 15605 + }, + { + "epoch": 4.305019305019305, + "grad_norm": 0.000970156048424542, + "learning_rate": 3.513058592920478e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9514872, + "step": 15610 + }, + { + "epoch": 4.306398234969664, + "grad_norm": 0.0019925374072045088, + "learning_rate": 3.5119583578059846e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9517592, + "step": 15615 + }, + { + "epoch": 4.307777164920022, + "grad_norm": 0.00033743574749678373, + "learning_rate": 3.510857888235266e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9519864, + "step": 15620 + }, + { + "epoch": 4.309156094870381, + "grad_norm": 0.0016766160260885954, + "learning_rate": 3.509757184463285e-05, + "loss": 0.0, + "num_input_tokens_seen": 9522456, + "step": 15625 + }, + { + "epoch": 4.310535024820739, + "grad_norm": 0.0004137096111662686, + "learning_rate": 3.508656246745058e-05, + "loss": 0.0003, + "num_input_tokens_seen": 9525784, + "step": 15630 + }, + { + "epoch": 4.311913954771097, + "grad_norm": 0.0005807499983347952, + "learning_rate": 3.507555075335658e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9528312, + "step": 15635 + }, + { + "epoch": 4.313292884721456, + "grad_norm": 0.0014687442453578115, + "learning_rate": 3.50645367049021e-05, + "loss": 0.0, + "num_input_tokens_seen": 9531768, + "step": 15640 + }, + { + "epoch": 4.314671814671815, + "grad_norm": 0.0029413236770778894, + "learning_rate": 3.505352032463893e-05, + "loss": 0.0, + "num_input_tokens_seen": 9534648, + "step": 15645 + }, + { + "epoch": 4.316050744622173, + "grad_norm": 0.0007240099366754293, + "learning_rate": 3.5042501615119424e-05, + "loss": 0.0006, + "num_input_tokens_seen": 9537656, + "step": 15650 + }, + { + "epoch": 4.317429674572532, + "grad_norm": 0.00013219731044955552, + "learning_rate": 3.503148057889644e-05, + "loss": 0.0, + "num_input_tokens_seen": 9540600, + "step": 15655 + }, + { + "epoch": 4.31880860452289, + "grad_norm": 0.00011977727990597486, + "learning_rate": 3.5020457218523405e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9544760, + "step": 15660 + }, + { + "epoch": 4.3201875344732485, + "grad_norm": 0.0012800968252122402, + "learning_rate": 3.500943153655427e-05, + "loss": 0.0, + "num_input_tokens_seen": 9548760, + "step": 15665 + }, + { + "epoch": 4.321566464423607, + "grad_norm": 0.0010940745705738664, + "learning_rate": 3.499840353554353e-05, + "loss": 0.0003, + "num_input_tokens_seen": 9551320, + "step": 15670 + }, + { + "epoch": 4.322945394373966, + "grad_norm": 0.001202438841573894, + "learning_rate": 3.4987373218046205e-05, + "loss": 0.0, + "num_input_tokens_seen": 9554616, + "step": 15675 + }, + { + "epoch": 4.324324324324325, + "grad_norm": 0.0011702249757945538, + "learning_rate": 3.4976340586617874e-05, + "loss": 0.0, + "num_input_tokens_seen": 9557592, + "step": 15680 + }, + { + "epoch": 4.325703254274683, + "grad_norm": 0.03407915681600571, + "learning_rate": 3.496530564381463e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9561624, + "step": 15685 + }, + { + "epoch": 4.327082184225041, + "grad_norm": 0.0010065222159028053, + "learning_rate": 3.495426839219311e-05, + "loss": 0.0, + "num_input_tokens_seen": 9565208, + "step": 15690 + }, + { + "epoch": 4.3284611141754, + "grad_norm": 0.0004155422793701291, + "learning_rate": 3.49432288343105e-05, + "loss": 0.0, + "num_input_tokens_seen": 9568152, + "step": 15695 + }, + { + "epoch": 4.329840044125758, + "grad_norm": 0.000410706561524421, + "learning_rate": 3.4932186972724504e-05, + "loss": 0.0, + "num_input_tokens_seen": 9572312, + "step": 15700 + }, + { + "epoch": 4.331218974076117, + "grad_norm": 9.05042325030081e-05, + "learning_rate": 3.492114280999334e-05, + "loss": 0.0, + "num_input_tokens_seen": 9574712, + "step": 15705 + }, + { + "epoch": 4.332597904026476, + "grad_norm": 0.00022073475702200085, + "learning_rate": 3.4910096348675806e-05, + "loss": 0.0, + "num_input_tokens_seen": 9577880, + "step": 15710 + }, + { + "epoch": 4.333976833976834, + "grad_norm": 0.00012280345254112035, + "learning_rate": 3.489904759133121e-05, + "loss": 0.0, + "num_input_tokens_seen": 9582200, + "step": 15715 + }, + { + "epoch": 4.3353557639271925, + "grad_norm": 0.0018778479425236583, + "learning_rate": 3.4887996540519375e-05, + "loss": 0.0, + "num_input_tokens_seen": 9584888, + "step": 15720 + }, + { + "epoch": 4.336734693877551, + "grad_norm": 7.578178338008001e-05, + "learning_rate": 3.487694319880068e-05, + "loss": 0.0, + "num_input_tokens_seen": 9587800, + "step": 15725 + }, + { + "epoch": 4.338113623827909, + "grad_norm": 0.00020690445671789348, + "learning_rate": 3.486588756873602e-05, + "loss": 0.0938, + "num_input_tokens_seen": 9590808, + "step": 15730 + }, + { + "epoch": 4.3394925537782685, + "grad_norm": 0.01088795904070139, + "learning_rate": 3.485482965288684e-05, + "loss": 0.0392, + "num_input_tokens_seen": 9593016, + "step": 15735 + }, + { + "epoch": 4.340871483728627, + "grad_norm": 0.00035804390790872276, + "learning_rate": 3.484376945381508e-05, + "loss": 0.0, + "num_input_tokens_seen": 9596600, + "step": 15740 + }, + { + "epoch": 4.342250413678985, + "grad_norm": 0.0009233855525963008, + "learning_rate": 3.483270697408323e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9599384, + "step": 15745 + }, + { + "epoch": 4.343629343629344, + "grad_norm": 9.626922110328451e-05, + "learning_rate": 3.4821642216254335e-05, + "loss": 0.0, + "num_input_tokens_seen": 9601944, + "step": 15750 + }, + { + "epoch": 4.345008273579702, + "grad_norm": 0.00012106230860808864, + "learning_rate": 3.4810575182891916e-05, + "loss": 0.0, + "num_input_tokens_seen": 9604888, + "step": 15755 + }, + { + "epoch": 4.34638720353006, + "grad_norm": 1.1828020811080933, + "learning_rate": 3.479950587656006e-05, + "loss": 0.0026, + "num_input_tokens_seen": 9607672, + "step": 15760 + }, + { + "epoch": 4.34776613348042, + "grad_norm": 0.0003092945262324065, + "learning_rate": 3.4788434299823345e-05, + "loss": 0.0, + "num_input_tokens_seen": 9610328, + "step": 15765 + }, + { + "epoch": 4.349145063430778, + "grad_norm": 0.0010277015389874578, + "learning_rate": 3.477736045524692e-05, + "loss": 0.0, + "num_input_tokens_seen": 9613560, + "step": 15770 + }, + { + "epoch": 4.350523993381136, + "grad_norm": 0.00037962410715408623, + "learning_rate": 3.4766284345396416e-05, + "loss": 0.0, + "num_input_tokens_seen": 9616984, + "step": 15775 + }, + { + "epoch": 4.351902923331495, + "grad_norm": 0.0002121459983754903, + "learning_rate": 3.4755205972838024e-05, + "loss": 0.0, + "num_input_tokens_seen": 9619704, + "step": 15780 + }, + { + "epoch": 4.353281853281853, + "grad_norm": 0.00013748226047027856, + "learning_rate": 3.474412534013843e-05, + "loss": 0.0, + "num_input_tokens_seen": 9622360, + "step": 15785 + }, + { + "epoch": 4.3546607832322115, + "grad_norm": 0.001388640608638525, + "learning_rate": 3.4733042449864886e-05, + "loss": 0.0, + "num_input_tokens_seen": 9625976, + "step": 15790 + }, + { + "epoch": 4.35603971318257, + "grad_norm": 0.000928206485696137, + "learning_rate": 3.4721957304585106e-05, + "loss": 0.0, + "num_input_tokens_seen": 9628984, + "step": 15795 + }, + { + "epoch": 4.357418643132929, + "grad_norm": 0.0005233434494584799, + "learning_rate": 3.471086990686737e-05, + "loss": 0.0, + "num_input_tokens_seen": 9631736, + "step": 15800 + }, + { + "epoch": 4.3587975730832875, + "grad_norm": 0.00026934861671179533, + "learning_rate": 3.469978025928047e-05, + "loss": 0.0, + "num_input_tokens_seen": 9634744, + "step": 15805 + }, + { + "epoch": 4.360176503033646, + "grad_norm": 0.003351126564666629, + "learning_rate": 3.468868836439372e-05, + "loss": 0.0, + "num_input_tokens_seen": 9637272, + "step": 15810 + }, + { + "epoch": 4.361555432984004, + "grad_norm": 0.0070546832866966724, + "learning_rate": 3.4677594224776955e-05, + "loss": 0.0006, + "num_input_tokens_seen": 9639896, + "step": 15815 + }, + { + "epoch": 4.362934362934363, + "grad_norm": 0.012589898891746998, + "learning_rate": 3.4666497843000524e-05, + "loss": 0.0, + "num_input_tokens_seen": 9642520, + "step": 15820 + }, + { + "epoch": 4.364313292884722, + "grad_norm": 0.0009189408156089485, + "learning_rate": 3.4655399221635296e-05, + "loss": 0.0, + "num_input_tokens_seen": 9645912, + "step": 15825 + }, + { + "epoch": 4.36569222283508, + "grad_norm": 0.00040700373938307166, + "learning_rate": 3.464429836325267e-05, + "loss": 0.0, + "num_input_tokens_seen": 9648088, + "step": 15830 + }, + { + "epoch": 4.367071152785439, + "grad_norm": 0.011313195340335369, + "learning_rate": 3.463319527042456e-05, + "loss": 0.0, + "num_input_tokens_seen": 9652120, + "step": 15835 + }, + { + "epoch": 4.368450082735797, + "grad_norm": 0.006962870713323355, + "learning_rate": 3.462208994572337e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9655608, + "step": 15840 + }, + { + "epoch": 4.369829012686155, + "grad_norm": 0.002591837663203478, + "learning_rate": 3.461098239172206e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9658648, + "step": 15845 + }, + { + "epoch": 4.371207942636514, + "grad_norm": 0.0003782804124057293, + "learning_rate": 3.4599872610994095e-05, + "loss": 0.0, + "num_input_tokens_seen": 9662136, + "step": 15850 + }, + { + "epoch": 4.372586872586872, + "grad_norm": 0.003627234371379018, + "learning_rate": 3.458876060611345e-05, + "loss": 0.0, + "num_input_tokens_seen": 9667512, + "step": 15855 + }, + { + "epoch": 4.373965802537231, + "grad_norm": 4.611101030604914e-05, + "learning_rate": 3.4577646379654605e-05, + "loss": 0.0, + "num_input_tokens_seen": 9671128, + "step": 15860 + }, + { + "epoch": 4.37534473248759, + "grad_norm": 0.001059801084920764, + "learning_rate": 3.4566529934192584e-05, + "loss": 0.0, + "num_input_tokens_seen": 9674872, + "step": 15865 + }, + { + "epoch": 4.376723662437948, + "grad_norm": 0.0007509560673497617, + "learning_rate": 3.455541127230289e-05, + "loss": 0.0, + "num_input_tokens_seen": 9678968, + "step": 15870 + }, + { + "epoch": 4.378102592388307, + "grad_norm": 0.00023121261619962752, + "learning_rate": 3.4544290396561574e-05, + "loss": 0.0, + "num_input_tokens_seen": 9682040, + "step": 15875 + }, + { + "epoch": 4.379481522338665, + "grad_norm": 5.6689106713747606e-05, + "learning_rate": 3.453316730954517e-05, + "loss": 0.0, + "num_input_tokens_seen": 9684984, + "step": 15880 + }, + { + "epoch": 4.380860452289023, + "grad_norm": 5.499037797562778e-05, + "learning_rate": 3.452204201383075e-05, + "loss": 0.0, + "num_input_tokens_seen": 9688088, + "step": 15885 + }, + { + "epoch": 4.382239382239383, + "grad_norm": 0.00017618802667129785, + "learning_rate": 3.4510914511995875e-05, + "loss": 0.0, + "num_input_tokens_seen": 9690328, + "step": 15890 + }, + { + "epoch": 4.383618312189741, + "grad_norm": 5.3307372581912205e-05, + "learning_rate": 3.449978480661863e-05, + "loss": 0.0, + "num_input_tokens_seen": 9694296, + "step": 15895 + }, + { + "epoch": 4.384997242140099, + "grad_norm": 0.006191091611981392, + "learning_rate": 3.448865290027761e-05, + "loss": 0.0, + "num_input_tokens_seen": 9698776, + "step": 15900 + }, + { + "epoch": 4.386376172090458, + "grad_norm": 4.010614793514833e-05, + "learning_rate": 3.4477518795551924e-05, + "loss": 0.0, + "num_input_tokens_seen": 9701432, + "step": 15905 + }, + { + "epoch": 4.387755102040816, + "grad_norm": 3.060775270569138e-05, + "learning_rate": 3.446638249502117e-05, + "loss": 0.0, + "num_input_tokens_seen": 9703640, + "step": 15910 + }, + { + "epoch": 4.3891340319911745, + "grad_norm": 0.00021763378754258156, + "learning_rate": 3.445524400126547e-05, + "loss": 0.0, + "num_input_tokens_seen": 9709112, + "step": 15915 + }, + { + "epoch": 4.390512961941534, + "grad_norm": 0.0041525475680828094, + "learning_rate": 3.4444103316865464e-05, + "loss": 0.0, + "num_input_tokens_seen": 9712568, + "step": 15920 + }, + { + "epoch": 4.391891891891892, + "grad_norm": 0.10989760607481003, + "learning_rate": 3.443296044440229e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9715192, + "step": 15925 + }, + { + "epoch": 4.3932708218422505, + "grad_norm": 4.4919892388861626e-05, + "learning_rate": 3.442181538645759e-05, + "loss": 0.0, + "num_input_tokens_seen": 9718616, + "step": 15930 + }, + { + "epoch": 4.394649751792609, + "grad_norm": 5.3571773605654016e-05, + "learning_rate": 3.441066814561349e-05, + "loss": 0.0, + "num_input_tokens_seen": 9722424, + "step": 15935 + }, + { + "epoch": 4.396028681742967, + "grad_norm": 0.00031900074100121856, + "learning_rate": 3.439951872445269e-05, + "loss": 0.0, + "num_input_tokens_seen": 9724504, + "step": 15940 + }, + { + "epoch": 4.397407611693326, + "grad_norm": 4.13788620790001e-05, + "learning_rate": 3.438836712555831e-05, + "loss": 0.0, + "num_input_tokens_seen": 9728728, + "step": 15945 + }, + { + "epoch": 4.398786541643685, + "grad_norm": 4.2499286792008206e-05, + "learning_rate": 3.4377213351514034e-05, + "loss": 0.0, + "num_input_tokens_seen": 9731864, + "step": 15950 + }, + { + "epoch": 4.400165471594043, + "grad_norm": 0.004068674053996801, + "learning_rate": 3.436605740490403e-05, + "loss": 0.0, + "num_input_tokens_seen": 9734264, + "step": 15955 + }, + { + "epoch": 4.401544401544402, + "grad_norm": 5.0964135880349204e-05, + "learning_rate": 3.435489928831297e-05, + "loss": 0.0, + "num_input_tokens_seen": 9738968, + "step": 15960 + }, + { + "epoch": 4.40292333149476, + "grad_norm": 0.0010104890679940581, + "learning_rate": 3.434373900432603e-05, + "loss": 0.0, + "num_input_tokens_seen": 9742968, + "step": 15965 + }, + { + "epoch": 4.404302261445118, + "grad_norm": 4.61545423604548e-05, + "learning_rate": 3.433257655552888e-05, + "loss": 0.0, + "num_input_tokens_seen": 9745208, + "step": 15970 + }, + { + "epoch": 4.405681191395477, + "grad_norm": 0.02005884423851967, + "learning_rate": 3.432141194450772e-05, + "loss": 0.0, + "num_input_tokens_seen": 9748312, + "step": 15975 + }, + { + "epoch": 4.407060121345836, + "grad_norm": 0.00021686244872398674, + "learning_rate": 3.431024517384921e-05, + "loss": 0.0, + "num_input_tokens_seen": 9751800, + "step": 15980 + }, + { + "epoch": 4.408439051296194, + "grad_norm": 4.3923588236793876e-05, + "learning_rate": 3.429907624614053e-05, + "loss": 0.0, + "num_input_tokens_seen": 9754424, + "step": 15985 + }, + { + "epoch": 4.409817981246553, + "grad_norm": 2.843793663487304e-05, + "learning_rate": 3.428790516396937e-05, + "loss": 0.0, + "num_input_tokens_seen": 9757528, + "step": 15990 + }, + { + "epoch": 4.411196911196911, + "grad_norm": 0.0009277626522816718, + "learning_rate": 3.4276731929923905e-05, + "loss": 0.0, + "num_input_tokens_seen": 9760504, + "step": 15995 + }, + { + "epoch": 4.4125758411472695, + "grad_norm": 0.0014669275842607021, + "learning_rate": 3.426555654659282e-05, + "loss": 0.0568, + "num_input_tokens_seen": 9763128, + "step": 16000 + }, + { + "epoch": 4.413954771097628, + "grad_norm": 0.008606551215052605, + "learning_rate": 3.425437901656528e-05, + "loss": 0.0, + "num_input_tokens_seen": 9765912, + "step": 16005 + }, + { + "epoch": 4.415333701047987, + "grad_norm": 1.0761015801108442e-05, + "learning_rate": 3.424319934243097e-05, + "loss": 0.0, + "num_input_tokens_seen": 9771640, + "step": 16010 + }, + { + "epoch": 4.4167126309983455, + "grad_norm": 0.00014637541607953608, + "learning_rate": 3.4232017526780036e-05, + "loss": 0.0, + "num_input_tokens_seen": 9773976, + "step": 16015 + }, + { + "epoch": 4.418091560948704, + "grad_norm": 5.198931467020884e-05, + "learning_rate": 3.422083357220317e-05, + "loss": 0.0, + "num_input_tokens_seen": 9776888, + "step": 16020 + }, + { + "epoch": 4.419470490899062, + "grad_norm": 4.40708790847566e-05, + "learning_rate": 3.420964748129153e-05, + "loss": 0.0, + "num_input_tokens_seen": 9780152, + "step": 16025 + }, + { + "epoch": 4.420849420849421, + "grad_norm": 3.184722663718276e-05, + "learning_rate": 3.419845925663677e-05, + "loss": 0.0, + "num_input_tokens_seen": 9783512, + "step": 16030 + }, + { + "epoch": 4.422228350799779, + "grad_norm": 2.4981187380035408e-05, + "learning_rate": 3.4187268900831035e-05, + "loss": 0.0, + "num_input_tokens_seen": 9786616, + "step": 16035 + }, + { + "epoch": 4.423607280750138, + "grad_norm": 0.00012325576972216368, + "learning_rate": 3.4176076416466965e-05, + "loss": 0.0, + "num_input_tokens_seen": 9789656, + "step": 16040 + }, + { + "epoch": 4.424986210700497, + "grad_norm": 0.00011090588668594137, + "learning_rate": 3.416488180613772e-05, + "loss": 0.0, + "num_input_tokens_seen": 9792408, + "step": 16045 + }, + { + "epoch": 4.426365140650855, + "grad_norm": 0.0007196409860625863, + "learning_rate": 3.4153685072436906e-05, + "loss": 0.0, + "num_input_tokens_seen": 9795224, + "step": 16050 + }, + { + "epoch": 4.4277440706012134, + "grad_norm": 18.086732864379883, + "learning_rate": 3.414248621795866e-05, + "loss": 0.1313, + "num_input_tokens_seen": 9797528, + "step": 16055 + }, + { + "epoch": 4.429123000551572, + "grad_norm": 0.0005067428573966026, + "learning_rate": 3.413128524529759e-05, + "loss": 0.0, + "num_input_tokens_seen": 9800152, + "step": 16060 + }, + { + "epoch": 4.43050193050193, + "grad_norm": 0.00011984720913460478, + "learning_rate": 3.412008215704881e-05, + "loss": 0.0, + "num_input_tokens_seen": 9802616, + "step": 16065 + }, + { + "epoch": 4.431880860452289, + "grad_norm": 0.0005817624623887241, + "learning_rate": 3.4108876955807904e-05, + "loss": 0.0, + "num_input_tokens_seen": 9805464, + "step": 16070 + }, + { + "epoch": 4.433259790402648, + "grad_norm": 0.0008353791199624538, + "learning_rate": 3.409766964417097e-05, + "loss": 0.0, + "num_input_tokens_seen": 9808696, + "step": 16075 + }, + { + "epoch": 4.434638720353006, + "grad_norm": 0.000665508268866688, + "learning_rate": 3.408646022473456e-05, + "loss": 0.0, + "num_input_tokens_seen": 9812824, + "step": 16080 + }, + { + "epoch": 4.436017650303365, + "grad_norm": 0.012846113182604313, + "learning_rate": 3.407524870009575e-05, + "loss": 0.0, + "num_input_tokens_seen": 9815832, + "step": 16085 + }, + { + "epoch": 4.437396580253723, + "grad_norm": 0.004295356571674347, + "learning_rate": 3.406403507285208e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9819544, + "step": 16090 + }, + { + "epoch": 4.438775510204081, + "grad_norm": 0.001835601986385882, + "learning_rate": 3.405281934560159e-05, + "loss": 0.001, + "num_input_tokens_seen": 9822136, + "step": 16095 + }, + { + "epoch": 4.440154440154441, + "grad_norm": 4.668298061005771e-05, + "learning_rate": 3.4041601520942825e-05, + "loss": 0.0, + "num_input_tokens_seen": 9825240, + "step": 16100 + }, + { + "epoch": 4.441533370104799, + "grad_norm": 0.0001456744794268161, + "learning_rate": 3.403038160147476e-05, + "loss": 0.0, + "num_input_tokens_seen": 9827704, + "step": 16105 + }, + { + "epoch": 4.442912300055157, + "grad_norm": 0.04367174580693245, + "learning_rate": 3.4019159589796904e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9831352, + "step": 16110 + }, + { + "epoch": 4.444291230005516, + "grad_norm": 0.00011144056770717725, + "learning_rate": 3.400793548850924e-05, + "loss": 0.0, + "num_input_tokens_seen": 9834392, + "step": 16115 + }, + { + "epoch": 4.445670159955874, + "grad_norm": 0.00029217134579084814, + "learning_rate": 3.3996709300212225e-05, + "loss": 0.0, + "num_input_tokens_seen": 9837464, + "step": 16120 + }, + { + "epoch": 4.4470490899062325, + "grad_norm": 0.000571342243347317, + "learning_rate": 3.39854810275068e-05, + "loss": 0.0, + "num_input_tokens_seen": 9840824, + "step": 16125 + }, + { + "epoch": 4.448428019856591, + "grad_norm": 0.0005041267722845078, + "learning_rate": 3.3974250672994417e-05, + "loss": 0.0, + "num_input_tokens_seen": 9843416, + "step": 16130 + }, + { + "epoch": 4.44980694980695, + "grad_norm": 0.005149338860064745, + "learning_rate": 3.3963018239276964e-05, + "loss": 0.1272, + "num_input_tokens_seen": 9846680, + "step": 16135 + }, + { + "epoch": 4.4511858797573085, + "grad_norm": 0.0011669083032757044, + "learning_rate": 3.395178372895685e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9850168, + "step": 16140 + }, + { + "epoch": 4.452564809707667, + "grad_norm": 0.0016152983298525214, + "learning_rate": 3.3940547144636934e-05, + "loss": 0.0058, + "num_input_tokens_seen": 9853368, + "step": 16145 + }, + { + "epoch": 4.453943739658025, + "grad_norm": 0.00019490686827339232, + "learning_rate": 3.39293084889206e-05, + "loss": 0.0284, + "num_input_tokens_seen": 9856184, + "step": 16150 + }, + { + "epoch": 4.455322669608384, + "grad_norm": 0.0013080502394586802, + "learning_rate": 3.3918067764411645e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9858616, + "step": 16155 + }, + { + "epoch": 4.456701599558742, + "grad_norm": 0.00023444199177902192, + "learning_rate": 3.39068249737144e-05, + "loss": 0.0, + "num_input_tokens_seen": 9860664, + "step": 16160 + }, + { + "epoch": 4.458080529509101, + "grad_norm": 0.024832410737872124, + "learning_rate": 3.389558011943367e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9863480, + "step": 16165 + }, + { + "epoch": 4.45945945945946, + "grad_norm": 0.0029540492687374353, + "learning_rate": 3.3884333204174724e-05, + "loss": 0.0319, + "num_input_tokens_seen": 9866104, + "step": 16170 + }, + { + "epoch": 4.460838389409818, + "grad_norm": 0.011478830128908157, + "learning_rate": 3.3873084230543295e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9868664, + "step": 16175 + }, + { + "epoch": 4.462217319360176, + "grad_norm": 0.1423056274652481, + "learning_rate": 3.3861833201145624e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9871896, + "step": 16180 + }, + { + "epoch": 4.463596249310535, + "grad_norm": 0.0013739224523305893, + "learning_rate": 3.3850580118588397e-05, + "loss": 0.0794, + "num_input_tokens_seen": 9875256, + "step": 16185 + }, + { + "epoch": 4.464975179260893, + "grad_norm": 0.0003965218784287572, + "learning_rate": 3.383932498547881e-05, + "loss": 0.0011, + "num_input_tokens_seen": 9878104, + "step": 16190 + }, + { + "epoch": 4.466354109211252, + "grad_norm": 0.0007337417919188738, + "learning_rate": 3.38280678044245e-05, + "loss": 0.0, + "num_input_tokens_seen": 9881048, + "step": 16195 + }, + { + "epoch": 4.467733039161611, + "grad_norm": 0.000944881874602288, + "learning_rate": 3.381680857803361e-05, + "loss": 0.0, + "num_input_tokens_seen": 9883448, + "step": 16200 + }, + { + "epoch": 4.469111969111969, + "grad_norm": 0.002563257236033678, + "learning_rate": 3.3805547308914734e-05, + "loss": 0.0, + "num_input_tokens_seen": 9886296, + "step": 16205 + }, + { + "epoch": 4.4704908990623275, + "grad_norm": 0.005661997012794018, + "learning_rate": 3.3794283999676934e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9889208, + "step": 16210 + }, + { + "epoch": 4.471869829012686, + "grad_norm": 0.00010631635814206675, + "learning_rate": 3.378301865292978e-05, + "loss": 0.0, + "num_input_tokens_seen": 9891992, + "step": 16215 + }, + { + "epoch": 4.473248758963044, + "grad_norm": 0.010324298404157162, + "learning_rate": 3.377175127128327e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9894776, + "step": 16220 + }, + { + "epoch": 4.474627688913404, + "grad_norm": 0.0014779367484152317, + "learning_rate": 3.37604818573479e-05, + "loss": 0.1148, + "num_input_tokens_seen": 9898360, + "step": 16225 + }, + { + "epoch": 4.476006618863762, + "grad_norm": 0.0006448336062021554, + "learning_rate": 3.3749210413734645e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9901752, + "step": 16230 + }, + { + "epoch": 4.47738554881412, + "grad_norm": 0.04073193296790123, + "learning_rate": 3.373793694305492e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9903992, + "step": 16235 + }, + { + "epoch": 4.478764478764479, + "grad_norm": 0.0014533783541992307, + "learning_rate": 3.3726661447920635e-05, + "loss": 0.0003, + "num_input_tokens_seen": 9907640, + "step": 16240 + }, + { + "epoch": 4.480143408714837, + "grad_norm": 0.0013293199008330703, + "learning_rate": 3.3715383930944164e-05, + "loss": 0.0088, + "num_input_tokens_seen": 9910200, + "step": 16245 + }, + { + "epoch": 4.4815223386651954, + "grad_norm": 0.029999999329447746, + "learning_rate": 3.370410439473834e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9913528, + "step": 16250 + }, + { + "epoch": 4.482901268615555, + "grad_norm": 0.0018755532801151276, + "learning_rate": 3.369282284191647e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9916856, + "step": 16255 + }, + { + "epoch": 4.484280198565913, + "grad_norm": 0.0007979520596563816, + "learning_rate": 3.368153927509232e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9919576, + "step": 16260 + }, + { + "epoch": 4.4856591285162715, + "grad_norm": 0.0015421579591929913, + "learning_rate": 3.367025369688015e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9922584, + "step": 16265 + }, + { + "epoch": 4.48703805846663, + "grad_norm": 0.011878064833581448, + "learning_rate": 3.365896610989465e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9925080, + "step": 16270 + }, + { + "epoch": 4.488416988416988, + "grad_norm": 0.028224479407072067, + "learning_rate": 3.364767651675099e-05, + "loss": 0.0005, + "num_input_tokens_seen": 9927672, + "step": 16275 + }, + { + "epoch": 4.489795918367347, + "grad_norm": 0.014095084741711617, + "learning_rate": 3.3636384920064824e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9929880, + "step": 16280 + }, + { + "epoch": 4.491174848317706, + "grad_norm": 0.0012738386867567897, + "learning_rate": 3.3625091322452236e-05, + "loss": 0.0005, + "num_input_tokens_seen": 9932632, + "step": 16285 + }, + { + "epoch": 4.492553778268064, + "grad_norm": 47.818145751953125, + "learning_rate": 3.36137957265298e-05, + "loss": 0.0457, + "num_input_tokens_seen": 9935640, + "step": 16290 + }, + { + "epoch": 4.493932708218423, + "grad_norm": 0.0009223257657140493, + "learning_rate": 3.360249813491454e-05, + "loss": 0.0002, + "num_input_tokens_seen": 9938744, + "step": 16295 + }, + { + "epoch": 4.495311638168781, + "grad_norm": 0.00458955904468894, + "learning_rate": 3.359119855022394e-05, + "loss": 0.0, + "num_input_tokens_seen": 9941464, + "step": 16300 + }, + { + "epoch": 4.496690568119139, + "grad_norm": 0.0006756510119885206, + "learning_rate": 3.3579896975075966e-05, + "loss": 0.0005, + "num_input_tokens_seen": 9944440, + "step": 16305 + }, + { + "epoch": 4.498069498069498, + "grad_norm": 0.001806046231649816, + "learning_rate": 3.356859341208901e-05, + "loss": 0.0, + "num_input_tokens_seen": 9947992, + "step": 16310 + }, + { + "epoch": 4.499448428019857, + "grad_norm": 0.00017299980390816927, + "learning_rate": 3.3557287863881967e-05, + "loss": 0.0, + "num_input_tokens_seen": 9950424, + "step": 16315 + }, + { + "epoch": 4.5, + "eval_loss": 0.20931921899318695, + "eval_runtime": 28.4949, + "eval_samples_per_second": 56.572, + "eval_steps_per_second": 14.143, + "num_input_tokens_seen": 9951832, + "step": 16317 + }, + { + "epoch": 4.500827357970215, + "grad_norm": 0.00015422760043293238, + "learning_rate": 3.354598033307417e-05, + "loss": 0.0, + "num_input_tokens_seen": 9953464, + "step": 16320 + }, + { + "epoch": 4.502206287920574, + "grad_norm": 0.00014170627400744706, + "learning_rate": 3.353467082228538e-05, + "loss": 0.0, + "num_input_tokens_seen": 9956408, + "step": 16325 + }, + { + "epoch": 4.503585217870932, + "grad_norm": 0.001359220826998353, + "learning_rate": 3.352335933413589e-05, + "loss": 0.0, + "num_input_tokens_seen": 9958904, + "step": 16330 + }, + { + "epoch": 4.5049641478212905, + "grad_norm": 0.0007005540537647903, + "learning_rate": 3.351204587124638e-05, + "loss": 0.0, + "num_input_tokens_seen": 9961528, + "step": 16335 + }, + { + "epoch": 4.506343077771649, + "grad_norm": 0.0007507216068916023, + "learning_rate": 3.350073043623803e-05, + "loss": 0.0, + "num_input_tokens_seen": 9963960, + "step": 16340 + }, + { + "epoch": 4.507722007722007, + "grad_norm": 0.0012165078660473228, + "learning_rate": 3.3489413031732464e-05, + "loss": 0.0, + "num_input_tokens_seen": 9967480, + "step": 16345 + }, + { + "epoch": 4.5091009376723665, + "grad_norm": 0.0042494614608585835, + "learning_rate": 3.347809366035176e-05, + "loss": 0.0, + "num_input_tokens_seen": 9970552, + "step": 16350 + }, + { + "epoch": 4.510479867622725, + "grad_norm": 0.0006911650998517871, + "learning_rate": 3.346677232471844e-05, + "loss": 0.0, + "num_input_tokens_seen": 9974520, + "step": 16355 + }, + { + "epoch": 4.511858797573083, + "grad_norm": 0.0002449484309181571, + "learning_rate": 3.345544902745553e-05, + "loss": 0.0, + "num_input_tokens_seen": 9977048, + "step": 16360 + }, + { + "epoch": 4.513237727523442, + "grad_norm": 4.7432578867301345e-05, + "learning_rate": 3.3444123771186436e-05, + "loss": 0.0, + "num_input_tokens_seen": 9979544, + "step": 16365 + }, + { + "epoch": 4.5146166574738, + "grad_norm": 0.0003505187341943383, + "learning_rate": 3.3432796558535076e-05, + "loss": 0.0001, + "num_input_tokens_seen": 9982936, + "step": 16370 + }, + { + "epoch": 4.515995587424159, + "grad_norm": 5.193896504351869e-05, + "learning_rate": 3.342146739212581e-05, + "loss": 0.0, + "num_input_tokens_seen": 9985656, + "step": 16375 + }, + { + "epoch": 4.517374517374518, + "grad_norm": 0.00012088046787539497, + "learning_rate": 3.341013627458343e-05, + "loss": 0.0, + "num_input_tokens_seen": 9988600, + "step": 16380 + }, + { + "epoch": 4.518753447324876, + "grad_norm": 0.0022368342615664005, + "learning_rate": 3.3398803208533194e-05, + "loss": 0.0, + "num_input_tokens_seen": 9992024, + "step": 16385 + }, + { + "epoch": 4.520132377275234, + "grad_norm": 0.0007150750025175512, + "learning_rate": 3.338746819660081e-05, + "loss": 0.0, + "num_input_tokens_seen": 9994424, + "step": 16390 + }, + { + "epoch": 4.521511307225593, + "grad_norm": 0.0011829104041680694, + "learning_rate": 3.337613124141246e-05, + "loss": 0.0, + "num_input_tokens_seen": 9996952, + "step": 16395 + }, + { + "epoch": 4.522890237175951, + "grad_norm": 0.021518517285585403, + "learning_rate": 3.336479234559472e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10000120, + "step": 16400 + }, + { + "epoch": 4.5242691671263096, + "grad_norm": 0.0011668700026348233, + "learning_rate": 3.335345151177467e-05, + "loss": 0.0, + "num_input_tokens_seen": 10002904, + "step": 16405 + }, + { + "epoch": 4.525648097076669, + "grad_norm": 0.0022518353071063757, + "learning_rate": 3.3342108742579815e-05, + "loss": 0.0, + "num_input_tokens_seen": 10005592, + "step": 16410 + }, + { + "epoch": 4.527027027027027, + "grad_norm": 0.0002914741635322571, + "learning_rate": 3.333076404063811e-05, + "loss": 0.0, + "num_input_tokens_seen": 10009528, + "step": 16415 + }, + { + "epoch": 4.528405956977386, + "grad_norm": 0.026020605117082596, + "learning_rate": 3.331941740857796e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10013976, + "step": 16420 + }, + { + "epoch": 4.529784886927744, + "grad_norm": 0.0009632954024709761, + "learning_rate": 3.330806884902822e-05, + "loss": 0.0, + "num_input_tokens_seen": 10018232, + "step": 16425 + }, + { + "epoch": 4.531163816878102, + "grad_norm": 5.914830035180785e-05, + "learning_rate": 3.3296718364618185e-05, + "loss": 0.0002, + "num_input_tokens_seen": 10021528, + "step": 16430 + }, + { + "epoch": 4.532542746828462, + "grad_norm": 0.0002861525281332433, + "learning_rate": 3.3285365957977596e-05, + "loss": 0.0, + "num_input_tokens_seen": 10024344, + "step": 16435 + }, + { + "epoch": 4.53392167677882, + "grad_norm": 0.00025808988721109927, + "learning_rate": 3.3274011631736644e-05, + "loss": 0.0, + "num_input_tokens_seen": 10027512, + "step": 16440 + }, + { + "epoch": 4.535300606729178, + "grad_norm": 5.5556480219820514e-05, + "learning_rate": 3.326265538852596e-05, + "loss": 0.0, + "num_input_tokens_seen": 10032376, + "step": 16445 + }, + { + "epoch": 4.536679536679537, + "grad_norm": 9.863469313131645e-05, + "learning_rate": 3.325129723097664e-05, + "loss": 0.0002, + "num_input_tokens_seen": 10035384, + "step": 16450 + }, + { + "epoch": 4.538058466629895, + "grad_norm": 0.0006263967952691019, + "learning_rate": 3.32399371617202e-05, + "loss": 0.0, + "num_input_tokens_seen": 10038872, + "step": 16455 + }, + { + "epoch": 4.5394373965802535, + "grad_norm": 0.00015360809629783034, + "learning_rate": 3.322857518338859e-05, + "loss": 0.0, + "num_input_tokens_seen": 10042168, + "step": 16460 + }, + { + "epoch": 4.540816326530612, + "grad_norm": 0.000145521349622868, + "learning_rate": 3.321721129861422e-05, + "loss": 0.0, + "num_input_tokens_seen": 10044664, + "step": 16465 + }, + { + "epoch": 4.542195256480971, + "grad_norm": 0.000913974829018116, + "learning_rate": 3.3205845510029955e-05, + "loss": 0.0, + "num_input_tokens_seen": 10048088, + "step": 16470 + }, + { + "epoch": 4.5435741864313295, + "grad_norm": 0.014945820905268192, + "learning_rate": 3.319447782026907e-05, + "loss": 0.0, + "num_input_tokens_seen": 10051448, + "step": 16475 + }, + { + "epoch": 4.544953116381688, + "grad_norm": 0.0005046158330515027, + "learning_rate": 3.3183108231965305e-05, + "loss": 0.0, + "num_input_tokens_seen": 10055672, + "step": 16480 + }, + { + "epoch": 4.546332046332046, + "grad_norm": 0.0013745279284194112, + "learning_rate": 3.317173674775282e-05, + "loss": 0.0, + "num_input_tokens_seen": 10059288, + "step": 16485 + }, + { + "epoch": 4.547710976282405, + "grad_norm": 2.451779437251389e-05, + "learning_rate": 3.3160363370266244e-05, + "loss": 0.0, + "num_input_tokens_seen": 10063032, + "step": 16490 + }, + { + "epoch": 4.549089906232763, + "grad_norm": 5.8990841353079304e-05, + "learning_rate": 3.3148988102140604e-05, + "loss": 0.0, + "num_input_tokens_seen": 10065432, + "step": 16495 + }, + { + "epoch": 4.550468836183122, + "grad_norm": 0.00011702861957019195, + "learning_rate": 3.31376109460114e-05, + "loss": 0.0, + "num_input_tokens_seen": 10068312, + "step": 16500 + }, + { + "epoch": 4.551847766133481, + "grad_norm": 0.0009977619629353285, + "learning_rate": 3.312623190451454e-05, + "loss": 0.0, + "num_input_tokens_seen": 10071896, + "step": 16505 + }, + { + "epoch": 4.553226696083839, + "grad_norm": 2.201347342634108e-05, + "learning_rate": 3.31148509802864e-05, + "loss": 0.0, + "num_input_tokens_seen": 10075608, + "step": 16510 + }, + { + "epoch": 4.554605626034197, + "grad_norm": 0.03403140604496002, + "learning_rate": 3.310346817596377e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10077848, + "step": 16515 + }, + { + "epoch": 4.555984555984556, + "grad_norm": 1.3154196739196777, + "learning_rate": 3.309208349418389e-05, + "loss": 0.0009, + "num_input_tokens_seen": 10081016, + "step": 16520 + }, + { + "epoch": 4.557363485934914, + "grad_norm": 0.0002539542329031974, + "learning_rate": 3.3080696937584414e-05, + "loss": 0.0, + "num_input_tokens_seen": 10083800, + "step": 16525 + }, + { + "epoch": 4.558742415885273, + "grad_norm": 0.058503568172454834, + "learning_rate": 3.306930850880346e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10086808, + "step": 16530 + }, + { + "epoch": 4.560121345835632, + "grad_norm": 8.866886491887271e-05, + "learning_rate": 3.305791821047955e-05, + "loss": 0.0, + "num_input_tokens_seen": 10090616, + "step": 16535 + }, + { + "epoch": 4.56150027578599, + "grad_norm": 0.03866775706410408, + "learning_rate": 3.304652604525166e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10093816, + "step": 16540 + }, + { + "epoch": 4.5628792057363485, + "grad_norm": 3.988154276157729e-05, + "learning_rate": 3.303513201575918e-05, + "loss": 0.0, + "num_input_tokens_seen": 10097080, + "step": 16545 + }, + { + "epoch": 4.564258135686707, + "grad_norm": 0.008869888260960579, + "learning_rate": 3.302373612464196e-05, + "loss": 0.0, + "num_input_tokens_seen": 10099672, + "step": 16550 + }, + { + "epoch": 4.565637065637065, + "grad_norm": 0.0005074369255453348, + "learning_rate": 3.3012338374540256e-05, + "loss": 0.0, + "num_input_tokens_seen": 10103672, + "step": 16555 + }, + { + "epoch": 4.567015995587424, + "grad_norm": 4.461795469978824e-05, + "learning_rate": 3.300093876809476e-05, + "loss": 0.0, + "num_input_tokens_seen": 10106136, + "step": 16560 + }, + { + "epoch": 4.568394925537783, + "grad_norm": 1.8121952962246723e-05, + "learning_rate": 3.298953730794661e-05, + "loss": 0.0, + "num_input_tokens_seen": 10109080, + "step": 16565 + }, + { + "epoch": 4.569773855488141, + "grad_norm": 1.9447274098638445e-05, + "learning_rate": 3.297813399673734e-05, + "loss": 0.0, + "num_input_tokens_seen": 10111832, + "step": 16570 + }, + { + "epoch": 4.5711527854385, + "grad_norm": 0.004797113128006458, + "learning_rate": 3.296672883710894e-05, + "loss": 0.0, + "num_input_tokens_seen": 10114840, + "step": 16575 + }, + { + "epoch": 4.572531715388858, + "grad_norm": 3.162436405546032e-05, + "learning_rate": 3.295532183170383e-05, + "loss": 0.0, + "num_input_tokens_seen": 10117880, + "step": 16580 + }, + { + "epoch": 4.573910645339216, + "grad_norm": 0.000464083394035697, + "learning_rate": 3.294391298316485e-05, + "loss": 0.0, + "num_input_tokens_seen": 10120312, + "step": 16585 + }, + { + "epoch": 4.575289575289576, + "grad_norm": 0.00029274667031131685, + "learning_rate": 3.2932502294135256e-05, + "loss": 0.0008, + "num_input_tokens_seen": 10123160, + "step": 16590 + }, + { + "epoch": 4.576668505239934, + "grad_norm": 0.00026158877881243825, + "learning_rate": 3.2921089767258756e-05, + "loss": 0.0, + "num_input_tokens_seen": 10125880, + "step": 16595 + }, + { + "epoch": 4.5780474351902924, + "grad_norm": 0.002128956140950322, + "learning_rate": 3.290967540517945e-05, + "loss": 0.0, + "num_input_tokens_seen": 10128152, + "step": 16600 + }, + { + "epoch": 4.579426365140651, + "grad_norm": 2.2368318241205998e-05, + "learning_rate": 3.28982592105419e-05, + "loss": 0.0, + "num_input_tokens_seen": 10131096, + "step": 16605 + }, + { + "epoch": 4.580805295091009, + "grad_norm": 0.00017429083527531475, + "learning_rate": 3.2886841185991065e-05, + "loss": 0.0, + "num_input_tokens_seen": 10134712, + "step": 16610 + }, + { + "epoch": 4.582184225041368, + "grad_norm": 2.0321194824646227e-05, + "learning_rate": 3.287542133417234e-05, + "loss": 0.0, + "num_input_tokens_seen": 10137336, + "step": 16615 + }, + { + "epoch": 4.583563154991726, + "grad_norm": 1.0454102266521659e-05, + "learning_rate": 3.2863999657731525e-05, + "loss": 0.0, + "num_input_tokens_seen": 10139896, + "step": 16620 + }, + { + "epoch": 4.584942084942085, + "grad_norm": 1.5279181752703153e-05, + "learning_rate": 3.285257615931488e-05, + "loss": 0.0, + "num_input_tokens_seen": 10143544, + "step": 16625 + }, + { + "epoch": 4.586321014892444, + "grad_norm": 2.678403143363539e-05, + "learning_rate": 3.2841150841569066e-05, + "loss": 0.0, + "num_input_tokens_seen": 10147544, + "step": 16630 + }, + { + "epoch": 4.587699944842802, + "grad_norm": 4.8470337787875906e-05, + "learning_rate": 3.282972370714115e-05, + "loss": 0.0, + "num_input_tokens_seen": 10151128, + "step": 16635 + }, + { + "epoch": 4.58907887479316, + "grad_norm": 0.00035677492269314826, + "learning_rate": 3.281829475867865e-05, + "loss": 0.0, + "num_input_tokens_seen": 10153848, + "step": 16640 + }, + { + "epoch": 4.590457804743519, + "grad_norm": 2.9253542379592545e-05, + "learning_rate": 3.280686399882947e-05, + "loss": 0.0, + "num_input_tokens_seen": 10156440, + "step": 16645 + }, + { + "epoch": 4.591836734693878, + "grad_norm": 2.1441861463245004e-05, + "learning_rate": 3.279543143024197e-05, + "loss": 0.0, + "num_input_tokens_seen": 10158744, + "step": 16650 + }, + { + "epoch": 4.593215664644236, + "grad_norm": 2.012523873418104e-05, + "learning_rate": 3.27839970555649e-05, + "loss": 0.0, + "num_input_tokens_seen": 10161624, + "step": 16655 + }, + { + "epoch": 4.594594594594595, + "grad_norm": 1.1589876521611586e-05, + "learning_rate": 3.2772560877447463e-05, + "loss": 0.0, + "num_input_tokens_seen": 10164504, + "step": 16660 + }, + { + "epoch": 4.595973524544953, + "grad_norm": 7.632758206455037e-05, + "learning_rate": 3.276112289853923e-05, + "loss": 0.0, + "num_input_tokens_seen": 10167128, + "step": 16665 + }, + { + "epoch": 4.5973524544953115, + "grad_norm": 0.009936998598277569, + "learning_rate": 3.274968312149023e-05, + "loss": 0.0, + "num_input_tokens_seen": 10169560, + "step": 16670 + }, + { + "epoch": 4.59873138444567, + "grad_norm": 0.001182060455903411, + "learning_rate": 3.273824154895089e-05, + "loss": 0.0, + "num_input_tokens_seen": 10172920, + "step": 16675 + }, + { + "epoch": 4.600110314396028, + "grad_norm": 2.4895012757042423e-05, + "learning_rate": 3.272679818357206e-05, + "loss": 0.0, + "num_input_tokens_seen": 10176344, + "step": 16680 + }, + { + "epoch": 4.6014892443463875, + "grad_norm": 1.1782497495005373e-05, + "learning_rate": 3.271535302800501e-05, + "loss": 0.0, + "num_input_tokens_seen": 10179032, + "step": 16685 + }, + { + "epoch": 4.602868174296746, + "grad_norm": 5.411309393821284e-05, + "learning_rate": 3.2703906084901406e-05, + "loss": 0.0, + "num_input_tokens_seen": 10183320, + "step": 16690 + }, + { + "epoch": 4.604247104247104, + "grad_norm": 0.00015132894623093307, + "learning_rate": 3.269245735691335e-05, + "loss": 0.0, + "num_input_tokens_seen": 10187320, + "step": 16695 + }, + { + "epoch": 4.605626034197463, + "grad_norm": 0.0001296362461289391, + "learning_rate": 3.268100684669336e-05, + "loss": 0.0, + "num_input_tokens_seen": 10190392, + "step": 16700 + }, + { + "epoch": 4.607004964147821, + "grad_norm": 0.0016024545766413212, + "learning_rate": 3.2669554556894324e-05, + "loss": 0.0, + "num_input_tokens_seen": 10193720, + "step": 16705 + }, + { + "epoch": 4.60838389409818, + "grad_norm": 0.00971545372158289, + "learning_rate": 3.265810049016959e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10196696, + "step": 16710 + }, + { + "epoch": 4.609762824048539, + "grad_norm": 3.4447290090611205e-05, + "learning_rate": 3.26466446491729e-05, + "loss": 0.0, + "num_input_tokens_seen": 10198808, + "step": 16715 + }, + { + "epoch": 4.611141753998897, + "grad_norm": 2.103985934809316e-05, + "learning_rate": 3.263518703655841e-05, + "loss": 0.0, + "num_input_tokens_seen": 10201528, + "step": 16720 + }, + { + "epoch": 4.612520683949255, + "grad_norm": 1.536258241685573e-05, + "learning_rate": 3.2623727654980686e-05, + "loss": 0.0, + "num_input_tokens_seen": 10204824, + "step": 16725 + }, + { + "epoch": 4.613899613899614, + "grad_norm": 5.617039732896956e-06, + "learning_rate": 3.261226650709469e-05, + "loss": 0.0, + "num_input_tokens_seen": 10207480, + "step": 16730 + }, + { + "epoch": 4.615278543849972, + "grad_norm": 6.175269663799554e-05, + "learning_rate": 3.260080359555582e-05, + "loss": 0.0, + "num_input_tokens_seen": 10209336, + "step": 16735 + }, + { + "epoch": 4.6166574738003305, + "grad_norm": 3.882851160597056e-05, + "learning_rate": 3.258933892301986e-05, + "loss": 0.0, + "num_input_tokens_seen": 10212248, + "step": 16740 + }, + { + "epoch": 4.61803640375069, + "grad_norm": 1.3830166608386207e-05, + "learning_rate": 3.257787249214302e-05, + "loss": 0.0, + "num_input_tokens_seen": 10214776, + "step": 16745 + }, + { + "epoch": 4.619415333701048, + "grad_norm": 0.000636248616501689, + "learning_rate": 3.2566404305581886e-05, + "loss": 0.0, + "num_input_tokens_seen": 10217560, + "step": 16750 + }, + { + "epoch": 4.6207942636514066, + "grad_norm": 0.0009548906236886978, + "learning_rate": 3.255493436599349e-05, + "loss": 0.0, + "num_input_tokens_seen": 10220472, + "step": 16755 + }, + { + "epoch": 4.622173193601765, + "grad_norm": 4.1062216041609645e-05, + "learning_rate": 3.254346267603525e-05, + "loss": 0.0, + "num_input_tokens_seen": 10223704, + "step": 16760 + }, + { + "epoch": 4.623552123552123, + "grad_norm": 0.035775382071733475, + "learning_rate": 3.2531989238364995e-05, + "loss": 0.0, + "num_input_tokens_seen": 10226424, + "step": 16765 + }, + { + "epoch": 4.624931053502483, + "grad_norm": 0.0015673238085582852, + "learning_rate": 3.252051405564095e-05, + "loss": 0.0, + "num_input_tokens_seen": 10230296, + "step": 16770 + }, + { + "epoch": 4.626309983452841, + "grad_norm": 0.0006537170265801251, + "learning_rate": 3.2509037130521744e-05, + "loss": 0.0, + "num_input_tokens_seen": 10232600, + "step": 16775 + }, + { + "epoch": 4.627688913403199, + "grad_norm": 1.1212203389732167e-05, + "learning_rate": 3.249755846566643e-05, + "loss": 0.0, + "num_input_tokens_seen": 10235672, + "step": 16780 + }, + { + "epoch": 4.629067843353558, + "grad_norm": 7.588173048134195e-06, + "learning_rate": 3.248607806373443e-05, + "loss": 0.0, + "num_input_tokens_seen": 10238744, + "step": 16785 + }, + { + "epoch": 4.630446773303916, + "grad_norm": 4.0967697714222595e-05, + "learning_rate": 3.247459592738561e-05, + "loss": 0.0, + "num_input_tokens_seen": 10241304, + "step": 16790 + }, + { + "epoch": 4.6318257032542745, + "grad_norm": 1.75071727426257e-05, + "learning_rate": 3.246311205928021e-05, + "loss": 0.0, + "num_input_tokens_seen": 10244632, + "step": 16795 + }, + { + "epoch": 4.633204633204633, + "grad_norm": 0.00013974074681755155, + "learning_rate": 3.245162646207887e-05, + "loss": 0.0, + "num_input_tokens_seen": 10247864, + "step": 16800 + }, + { + "epoch": 4.634583563154992, + "grad_norm": 1.5515026461798698e-05, + "learning_rate": 3.244013913844265e-05, + "loss": 0.0, + "num_input_tokens_seen": 10250584, + "step": 16805 + }, + { + "epoch": 4.6359624931053505, + "grad_norm": 0.0008641300373710692, + "learning_rate": 3.242865009103299e-05, + "loss": 0.0, + "num_input_tokens_seen": 10254008, + "step": 16810 + }, + { + "epoch": 4.637341423055709, + "grad_norm": 6.341366679407656e-05, + "learning_rate": 3.2417159322511725e-05, + "loss": 0.0, + "num_input_tokens_seen": 10256824, + "step": 16815 + }, + { + "epoch": 4.638720353006067, + "grad_norm": 1.1928803360206075e-05, + "learning_rate": 3.2405666835541117e-05, + "loss": 0.0814, + "num_input_tokens_seen": 10260216, + "step": 16820 + }, + { + "epoch": 4.640099282956426, + "grad_norm": 6.219510396476835e-05, + "learning_rate": 3.239417263278381e-05, + "loss": 0.0, + "num_input_tokens_seen": 10262808, + "step": 16825 + }, + { + "epoch": 4.641478212906784, + "grad_norm": 1.6986286937026307e-05, + "learning_rate": 3.238267671690285e-05, + "loss": 0.0, + "num_input_tokens_seen": 10267192, + "step": 16830 + }, + { + "epoch": 4.642857142857143, + "grad_norm": 0.00045948612387292087, + "learning_rate": 3.237117909056167e-05, + "loss": 0.0, + "num_input_tokens_seen": 10269368, + "step": 16835 + }, + { + "epoch": 4.644236072807502, + "grad_norm": 0.00017693820700515062, + "learning_rate": 3.235967975642409e-05, + "loss": 0.0, + "num_input_tokens_seen": 10271512, + "step": 16840 + }, + { + "epoch": 4.64561500275786, + "grad_norm": 4.463830919121392e-06, + "learning_rate": 3.234817871715436e-05, + "loss": 0.0, + "num_input_tokens_seen": 10274232, + "step": 16845 + }, + { + "epoch": 4.646993932708218, + "grad_norm": 1.0130941518582404e-05, + "learning_rate": 3.23366759754171e-05, + "loss": 0.0, + "num_input_tokens_seen": 10276728, + "step": 16850 + }, + { + "epoch": 4.648372862658577, + "grad_norm": 0.0008375118486583233, + "learning_rate": 3.232517153387733e-05, + "loss": 0.0, + "num_input_tokens_seen": 10282680, + "step": 16855 + }, + { + "epoch": 4.649751792608935, + "grad_norm": 2.786878394545056e-05, + "learning_rate": 3.231366539520047e-05, + "loss": 0.0, + "num_input_tokens_seen": 10284760, + "step": 16860 + }, + { + "epoch": 4.651130722559294, + "grad_norm": 7.612782792421058e-05, + "learning_rate": 3.230215756205232e-05, + "loss": 0.0, + "num_input_tokens_seen": 10288120, + "step": 16865 + }, + { + "epoch": 4.652509652509653, + "grad_norm": 1.0665535228326917e-05, + "learning_rate": 3.229064803709908e-05, + "loss": 0.0, + "num_input_tokens_seen": 10290648, + "step": 16870 + }, + { + "epoch": 4.653888582460011, + "grad_norm": 6.927584036020562e-05, + "learning_rate": 3.227913682300734e-05, + "loss": 0.0, + "num_input_tokens_seen": 10293176, + "step": 16875 + }, + { + "epoch": 4.6552675124103695, + "grad_norm": 0.00010147267312277108, + "learning_rate": 3.2267623922444086e-05, + "loss": 0.0, + "num_input_tokens_seen": 10295960, + "step": 16880 + }, + { + "epoch": 4.656646442360728, + "grad_norm": 0.0002351921721128747, + "learning_rate": 3.225610933807669e-05, + "loss": 0.0, + "num_input_tokens_seen": 10300056, + "step": 16885 + }, + { + "epoch": 4.658025372311086, + "grad_norm": 0.00011063687270507216, + "learning_rate": 3.2244593072572916e-05, + "loss": 0.0855, + "num_input_tokens_seen": 10303704, + "step": 16890 + }, + { + "epoch": 4.659404302261445, + "grad_norm": 0.0002012994373217225, + "learning_rate": 3.2233075128600926e-05, + "loss": 0.0, + "num_input_tokens_seen": 10306904, + "step": 16895 + }, + { + "epoch": 4.660783232211804, + "grad_norm": 0.00036082221777178347, + "learning_rate": 3.2221555508829246e-05, + "loss": 0.0, + "num_input_tokens_seen": 10309944, + "step": 16900 + }, + { + "epoch": 4.662162162162162, + "grad_norm": 7.872506103012711e-05, + "learning_rate": 3.221003421592683e-05, + "loss": 0.0, + "num_input_tokens_seen": 10312440, + "step": 16905 + }, + { + "epoch": 4.663541092112521, + "grad_norm": 5.22805166838225e-05, + "learning_rate": 3.219851125256298e-05, + "loss": 0.0, + "num_input_tokens_seen": 10315064, + "step": 16910 + }, + { + "epoch": 4.664920022062879, + "grad_norm": 0.00014201235899236053, + "learning_rate": 3.2186986621407396e-05, + "loss": 0.0, + "num_input_tokens_seen": 10318264, + "step": 16915 + }, + { + "epoch": 4.666298952013237, + "grad_norm": 5.025099017075263e-05, + "learning_rate": 3.2175460325130176e-05, + "loss": 0.0, + "num_input_tokens_seen": 10320536, + "step": 16920 + }, + { + "epoch": 4.667677881963597, + "grad_norm": 0.0007299146382138133, + "learning_rate": 3.21639323664018e-05, + "loss": 0.0, + "num_input_tokens_seen": 10323288, + "step": 16925 + }, + { + "epoch": 4.669056811913955, + "grad_norm": 0.0021087187342345715, + "learning_rate": 3.215240274789313e-05, + "loss": 0.0, + "num_input_tokens_seen": 10325208, + "step": 16930 + }, + { + "epoch": 4.670435741864313, + "grad_norm": 1.3518479136109818e-05, + "learning_rate": 3.214087147227541e-05, + "loss": 0.0, + "num_input_tokens_seen": 10327512, + "step": 16935 + }, + { + "epoch": 4.671814671814672, + "grad_norm": 0.00402987701818347, + "learning_rate": 3.212933854222027e-05, + "loss": 0.0, + "num_input_tokens_seen": 10329528, + "step": 16940 + }, + { + "epoch": 4.67319360176503, + "grad_norm": 0.00012574407446663827, + "learning_rate": 3.2117803960399736e-05, + "loss": 0.0, + "num_input_tokens_seen": 10332536, + "step": 16945 + }, + { + "epoch": 4.674572531715389, + "grad_norm": 5.244527073955396e-06, + "learning_rate": 3.210626772948619e-05, + "loss": 0.0002, + "num_input_tokens_seen": 10335448, + "step": 16950 + }, + { + "epoch": 4.675951461665747, + "grad_norm": 2.1041416403022595e-05, + "learning_rate": 3.209472985215243e-05, + "loss": 0.0, + "num_input_tokens_seen": 10337720, + "step": 16955 + }, + { + "epoch": 4.677330391616106, + "grad_norm": 1.9032202544622123e-05, + "learning_rate": 3.2083190331071586e-05, + "loss": 0.0, + "num_input_tokens_seen": 10340888, + "step": 16960 + }, + { + "epoch": 4.678709321566465, + "grad_norm": 3.984277282143012e-05, + "learning_rate": 3.207164916891723e-05, + "loss": 0.0, + "num_input_tokens_seen": 10343864, + "step": 16965 + }, + { + "epoch": 4.680088251516823, + "grad_norm": 3.729923628270626e-05, + "learning_rate": 3.206010636836326e-05, + "loss": 0.0, + "num_input_tokens_seen": 10348344, + "step": 16970 + }, + { + "epoch": 4.681467181467181, + "grad_norm": 1.0976305020449217e-05, + "learning_rate": 3.2048561932083997e-05, + "loss": 0.0, + "num_input_tokens_seen": 10351832, + "step": 16975 + }, + { + "epoch": 4.68284611141754, + "grad_norm": 0.0028476559091359377, + "learning_rate": 3.203701586275411e-05, + "loss": 0.0, + "num_input_tokens_seen": 10355192, + "step": 16980 + }, + { + "epoch": 4.684225041367899, + "grad_norm": 2.736362694122363e-05, + "learning_rate": 3.202546816304866e-05, + "loss": 0.0, + "num_input_tokens_seen": 10358328, + "step": 16985 + }, + { + "epoch": 4.685603971318257, + "grad_norm": 6.548134933836991e-06, + "learning_rate": 3.201391883564309e-05, + "loss": 0.0, + "num_input_tokens_seen": 10361176, + "step": 16990 + }, + { + "epoch": 4.686982901268616, + "grad_norm": 7.693150109844282e-05, + "learning_rate": 3.200236788321319e-05, + "loss": 0.0, + "num_input_tokens_seen": 10363672, + "step": 16995 + }, + { + "epoch": 4.688361831218974, + "grad_norm": 1.8226748579763807e-05, + "learning_rate": 3.1990815308435175e-05, + "loss": 0.0, + "num_input_tokens_seen": 10366904, + "step": 17000 + }, + { + "epoch": 4.6897407611693325, + "grad_norm": 3.347235906403512e-05, + "learning_rate": 3.1979261113985604e-05, + "loss": 0.0, + "num_input_tokens_seen": 10370104, + "step": 17005 + }, + { + "epoch": 4.691119691119691, + "grad_norm": 0.00011403483949834481, + "learning_rate": 3.1967705302541415e-05, + "loss": 0.0, + "num_input_tokens_seen": 10373464, + "step": 17010 + }, + { + "epoch": 4.692498621070049, + "grad_norm": 1.3512702025764156e-05, + "learning_rate": 3.1956147876779924e-05, + "loss": 0.0, + "num_input_tokens_seen": 10376600, + "step": 17015 + }, + { + "epoch": 4.6938775510204085, + "grad_norm": 4.280274879420176e-05, + "learning_rate": 3.1944588839378825e-05, + "loss": 0.0, + "num_input_tokens_seen": 10379544, + "step": 17020 + }, + { + "epoch": 4.695256480970767, + "grad_norm": 5.092228821013123e-05, + "learning_rate": 3.193302819301617e-05, + "loss": 0.0, + "num_input_tokens_seen": 10382744, + "step": 17025 + }, + { + "epoch": 4.696635410921125, + "grad_norm": 1.4206716514308937e-05, + "learning_rate": 3.19214659403704e-05, + "loss": 0.0, + "num_input_tokens_seen": 10385304, + "step": 17030 + }, + { + "epoch": 4.698014340871484, + "grad_norm": 0.0040566460229456425, + "learning_rate": 3.1909902084120336e-05, + "loss": 0.0, + "num_input_tokens_seen": 10388760, + "step": 17035 + }, + { + "epoch": 4.699393270821842, + "grad_norm": 1.8976688807015307e-05, + "learning_rate": 3.189833662694515e-05, + "loss": 0.0, + "num_input_tokens_seen": 10391704, + "step": 17040 + }, + { + "epoch": 4.700772200772201, + "grad_norm": 0.0006795360241085291, + "learning_rate": 3.188676957152438e-05, + "loss": 0.0, + "num_input_tokens_seen": 10395672, + "step": 17045 + }, + { + "epoch": 4.70215113072256, + "grad_norm": 1.659219378780108e-05, + "learning_rate": 3.1875200920537954e-05, + "loss": 0.0, + "num_input_tokens_seen": 10398872, + "step": 17050 + }, + { + "epoch": 4.703530060672918, + "grad_norm": 2.9737373552052304e-05, + "learning_rate": 3.186363067666617e-05, + "loss": 0.0, + "num_input_tokens_seen": 10400920, + "step": 17055 + }, + { + "epoch": 4.704908990623276, + "grad_norm": 0.00014108112372923642, + "learning_rate": 3.1852058842589675e-05, + "loss": 0.0, + "num_input_tokens_seen": 10404696, + "step": 17060 + }, + { + "epoch": 4.706287920573635, + "grad_norm": 5.0868607104348484e-06, + "learning_rate": 3.184048542098951e-05, + "loss": 0.0, + "num_input_tokens_seen": 10407576, + "step": 17065 + }, + { + "epoch": 4.707666850523993, + "grad_norm": 7.1665781433694065e-06, + "learning_rate": 3.182891041454706e-05, + "loss": 0.0, + "num_input_tokens_seen": 10411640, + "step": 17070 + }, + { + "epoch": 4.7090457804743515, + "grad_norm": 1.4616422049584799e-05, + "learning_rate": 3.18173338259441e-05, + "loss": 0.0, + "num_input_tokens_seen": 10414392, + "step": 17075 + }, + { + "epoch": 4.710424710424711, + "grad_norm": 5.757427425123751e-05, + "learning_rate": 3.1805755657862745e-05, + "loss": 0.0, + "num_input_tokens_seen": 10417752, + "step": 17080 + }, + { + "epoch": 4.711803640375069, + "grad_norm": 1.2668772797042038e-05, + "learning_rate": 3.17941759129855e-05, + "loss": 0.0, + "num_input_tokens_seen": 10420728, + "step": 17085 + }, + { + "epoch": 4.7131825703254275, + "grad_norm": 7.250966154970229e-05, + "learning_rate": 3.178259459399522e-05, + "loss": 0.0116, + "num_input_tokens_seen": 10423864, + "step": 17090 + }, + { + "epoch": 4.714561500275786, + "grad_norm": 4.8586853154120035e-06, + "learning_rate": 3.177101170357513e-05, + "loss": 0.0, + "num_input_tokens_seen": 10426328, + "step": 17095 + }, + { + "epoch": 4.715940430226144, + "grad_norm": 4.699120108853094e-05, + "learning_rate": 3.175942724440882e-05, + "loss": 0.0, + "num_input_tokens_seen": 10428888, + "step": 17100 + }, + { + "epoch": 4.717319360176503, + "grad_norm": 1.0084392670250963e-05, + "learning_rate": 3.174784121918025e-05, + "loss": 0.0, + "num_input_tokens_seen": 10432120, + "step": 17105 + }, + { + "epoch": 4.718698290126862, + "grad_norm": 2.933649557235185e-05, + "learning_rate": 3.173625363057373e-05, + "loss": 0.0, + "num_input_tokens_seen": 10435160, + "step": 17110 + }, + { + "epoch": 4.72007722007722, + "grad_norm": 9.70501969277393e-06, + "learning_rate": 3.172466448127395e-05, + "loss": 0.0, + "num_input_tokens_seen": 10437816, + "step": 17115 + }, + { + "epoch": 4.721456150027579, + "grad_norm": 3.231043592677452e-05, + "learning_rate": 3.1713073773965926e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10440792, + "step": 17120 + }, + { + "epoch": 4.722835079977937, + "grad_norm": 6.744901475030929e-05, + "learning_rate": 3.170148151133508e-05, + "loss": 0.0, + "num_input_tokens_seen": 10443384, + "step": 17125 + }, + { + "epoch": 4.724214009928295, + "grad_norm": 3.3384585549356416e-06, + "learning_rate": 3.168988769606715e-05, + "loss": 0.0, + "num_input_tokens_seen": 10446456, + "step": 17130 + }, + { + "epoch": 4.725592939878654, + "grad_norm": 2.5017428924911655e-05, + "learning_rate": 3.167829233084827e-05, + "loss": 0.0, + "num_input_tokens_seen": 10448952, + "step": 17135 + }, + { + "epoch": 4.726971869829013, + "grad_norm": 2.4549079171265475e-05, + "learning_rate": 3.1666695418364934e-05, + "loss": 0.0, + "num_input_tokens_seen": 10451288, + "step": 17140 + }, + { + "epoch": 4.7283507997793714, + "grad_norm": 1.4748942703590728e-05, + "learning_rate": 3.165509696130396e-05, + "loss": 0.0, + "num_input_tokens_seen": 10454744, + "step": 17145 + }, + { + "epoch": 4.72972972972973, + "grad_norm": 6.731417215632973e-06, + "learning_rate": 3.1643496962352545e-05, + "loss": 0.0, + "num_input_tokens_seen": 10457304, + "step": 17150 + }, + { + "epoch": 4.731108659680088, + "grad_norm": 6.44062165520154e-05, + "learning_rate": 3.163189542419825e-05, + "loss": 0.0, + "num_input_tokens_seen": 10460440, + "step": 17155 + }, + { + "epoch": 4.732487589630447, + "grad_norm": 5.341047653928399e-05, + "learning_rate": 3.162029234952899e-05, + "loss": 0.0, + "num_input_tokens_seen": 10463224, + "step": 17160 + }, + { + "epoch": 4.733866519580805, + "grad_norm": 5.040187716076616e-06, + "learning_rate": 3.1608687741033014e-05, + "loss": 0.0021, + "num_input_tokens_seen": 10467160, + "step": 17165 + }, + { + "epoch": 4.735245449531163, + "grad_norm": 1.1856101082230452e-05, + "learning_rate": 3.159708160139895e-05, + "loss": 0.0, + "num_input_tokens_seen": 10470552, + "step": 17170 + }, + { + "epoch": 4.736624379481523, + "grad_norm": 5.152129597263411e-05, + "learning_rate": 3.1585473933315786e-05, + "loss": 0.0, + "num_input_tokens_seen": 10473048, + "step": 17175 + }, + { + "epoch": 4.738003309431881, + "grad_norm": 1.983103902603034e-05, + "learning_rate": 3.1573864739472835e-05, + "loss": 0.0, + "num_input_tokens_seen": 10475768, + "step": 17180 + }, + { + "epoch": 4.739382239382239, + "grad_norm": 0.0012360865948721766, + "learning_rate": 3.156225402255979e-05, + "loss": 0.0917, + "num_input_tokens_seen": 10477912, + "step": 17185 + }, + { + "epoch": 4.740761169332598, + "grad_norm": 1.0091543117596302e-05, + "learning_rate": 3.1550641785266696e-05, + "loss": 0.0, + "num_input_tokens_seen": 10480920, + "step": 17190 + }, + { + "epoch": 4.742140099282956, + "grad_norm": 9.17972738534445e-06, + "learning_rate": 3.1539028030283915e-05, + "loss": 0.0, + "num_input_tokens_seen": 10484024, + "step": 17195 + }, + { + "epoch": 4.743519029233315, + "grad_norm": 0.0003387411416042596, + "learning_rate": 3.152741276030221e-05, + "loss": 0.0, + "num_input_tokens_seen": 10486552, + "step": 17200 + }, + { + "epoch": 4.744897959183674, + "grad_norm": 3.0558199796359986e-05, + "learning_rate": 3.151579597801266e-05, + "loss": 0.0, + "num_input_tokens_seen": 10489080, + "step": 17205 + }, + { + "epoch": 4.746276889134032, + "grad_norm": 1.0753838978416752e-05, + "learning_rate": 3.150417768610672e-05, + "loss": 0.0, + "num_input_tokens_seen": 10491576, + "step": 17210 + }, + { + "epoch": 4.7476558190843905, + "grad_norm": 0.12224911153316498, + "learning_rate": 3.1492557887276173e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10495000, + "step": 17215 + }, + { + "epoch": 4.749034749034749, + "grad_norm": 7.471300705219619e-06, + "learning_rate": 3.1480936584213156e-05, + "loss": 0.0, + "num_input_tokens_seen": 10498552, + "step": 17220 + }, + { + "epoch": 4.750413678985107, + "grad_norm": 9.684713404567447e-06, + "learning_rate": 3.1469313779610164e-05, + "loss": 0.0, + "num_input_tokens_seen": 10501912, + "step": 17225 + }, + { + "epoch": 4.751792608935466, + "grad_norm": 1.233355487784138e-05, + "learning_rate": 3.145768947616004e-05, + "loss": 0.0005, + "num_input_tokens_seen": 10504856, + "step": 17230 + }, + { + "epoch": 4.753171538885825, + "grad_norm": 4.975878255208954e-06, + "learning_rate": 3.144606367655595e-05, + "loss": 0.0, + "num_input_tokens_seen": 10507352, + "step": 17235 + }, + { + "epoch": 4.754550468836183, + "grad_norm": 0.0003281443496234715, + "learning_rate": 3.143443638349144e-05, + "loss": 0.0, + "num_input_tokens_seen": 10509976, + "step": 17240 + }, + { + "epoch": 4.755929398786542, + "grad_norm": 0.0002212274121120572, + "learning_rate": 3.142280759966039e-05, + "loss": 0.0, + "num_input_tokens_seen": 10512472, + "step": 17245 + }, + { + "epoch": 4.7573083287369, + "grad_norm": 0.000649155699647963, + "learning_rate": 3.1411177327757e-05, + "loss": 0.0, + "num_input_tokens_seen": 10515000, + "step": 17250 + }, + { + "epoch": 4.758687258687258, + "grad_norm": 1.0921076864178758e-05, + "learning_rate": 3.139954557047586e-05, + "loss": 0.0, + "num_input_tokens_seen": 10517976, + "step": 17255 + }, + { + "epoch": 4.760066188637618, + "grad_norm": 7.634209396201186e-06, + "learning_rate": 3.138791233051187e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10521176, + "step": 17260 + }, + { + "epoch": 4.761445118587976, + "grad_norm": 0.00020653479441534728, + "learning_rate": 3.13762776105603e-05, + "loss": 0.0, + "num_input_tokens_seen": 10524792, + "step": 17265 + }, + { + "epoch": 4.762824048538334, + "grad_norm": 0.000713262299541384, + "learning_rate": 3.136464141331672e-05, + "loss": 0.0, + "num_input_tokens_seen": 10527288, + "step": 17270 + }, + { + "epoch": 4.764202978488693, + "grad_norm": 9.5362302090507e-05, + "learning_rate": 3.135300374147709e-05, + "loss": 0.0, + "num_input_tokens_seen": 10530360, + "step": 17275 + }, + { + "epoch": 4.765581908439051, + "grad_norm": 1.261607394553721e-05, + "learning_rate": 3.1341364597737686e-05, + "loss": 0.0, + "num_input_tokens_seen": 10532952, + "step": 17280 + }, + { + "epoch": 4.7669608383894095, + "grad_norm": 1.6336656699422747e-05, + "learning_rate": 3.132972398479513e-05, + "loss": 0.0, + "num_input_tokens_seen": 10535352, + "step": 17285 + }, + { + "epoch": 4.768339768339768, + "grad_norm": 1.2748304470733274e-05, + "learning_rate": 3.13180819053464e-05, + "loss": 0.0, + "num_input_tokens_seen": 10538040, + "step": 17290 + }, + { + "epoch": 4.769718698290127, + "grad_norm": 1.1442652976256795e-05, + "learning_rate": 3.130643836208877e-05, + "loss": 0.0, + "num_input_tokens_seen": 10541080, + "step": 17295 + }, + { + "epoch": 4.7710976282404856, + "grad_norm": 3.80130313715199e-06, + "learning_rate": 3.12947933577199e-05, + "loss": 0.0, + "num_input_tokens_seen": 10543864, + "step": 17300 + }, + { + "epoch": 4.772476558190844, + "grad_norm": 3.0720339054823853e-06, + "learning_rate": 3.128314689493777e-05, + "loss": 0.0, + "num_input_tokens_seen": 10546424, + "step": 17305 + }, + { + "epoch": 4.773855488141202, + "grad_norm": 1.003799116006121e-05, + "learning_rate": 3.127149897644069e-05, + "loss": 0.0, + "num_input_tokens_seen": 10549880, + "step": 17310 + }, + { + "epoch": 4.775234418091561, + "grad_norm": 0.00012145918299211189, + "learning_rate": 3.125984960492732e-05, + "loss": 0.0, + "num_input_tokens_seen": 10552888, + "step": 17315 + }, + { + "epoch": 4.77661334804192, + "grad_norm": 8.906300536182243e-06, + "learning_rate": 3.124819878309666e-05, + "loss": 0.0, + "num_input_tokens_seen": 10554776, + "step": 17320 + }, + { + "epoch": 4.777992277992278, + "grad_norm": 0.0002471881452947855, + "learning_rate": 3.1236546513648034e-05, + "loss": 0.0, + "num_input_tokens_seen": 10557720, + "step": 17325 + }, + { + "epoch": 4.779371207942637, + "grad_norm": 6.083166226744652e-06, + "learning_rate": 3.12248927992811e-05, + "loss": 0.0, + "num_input_tokens_seen": 10560280, + "step": 17330 + }, + { + "epoch": 4.780750137892995, + "grad_norm": 5.483984296006383e-06, + "learning_rate": 3.121323764269585e-05, + "loss": 0.0, + "num_input_tokens_seen": 10562776, + "step": 17335 + }, + { + "epoch": 4.7821290678433535, + "grad_norm": 0.0001071913939085789, + "learning_rate": 3.120158104659263e-05, + "loss": 0.0, + "num_input_tokens_seen": 10565368, + "step": 17340 + }, + { + "epoch": 4.783507997793712, + "grad_norm": 8.067553426371887e-05, + "learning_rate": 3.118992301367211e-05, + "loss": 0.0, + "num_input_tokens_seen": 10568216, + "step": 17345 + }, + { + "epoch": 4.78488692774407, + "grad_norm": 7.266584725584835e-05, + "learning_rate": 3.117826354663527e-05, + "loss": 0.0, + "num_input_tokens_seen": 10571480, + "step": 17350 + }, + { + "epoch": 4.7862658576944295, + "grad_norm": 7.261937753355596e-06, + "learning_rate": 3.116660264818346e-05, + "loss": 0.0, + "num_input_tokens_seen": 10574296, + "step": 17355 + }, + { + "epoch": 4.787644787644788, + "grad_norm": 6.352905074891169e-06, + "learning_rate": 3.1154940321018336e-05, + "loss": 0.0, + "num_input_tokens_seen": 10576856, + "step": 17360 + }, + { + "epoch": 4.789023717595146, + "grad_norm": 3.0342936952365562e-05, + "learning_rate": 3.114327656784188e-05, + "loss": 0.0, + "num_input_tokens_seen": 10579480, + "step": 17365 + }, + { + "epoch": 4.790402647545505, + "grad_norm": 0.11836980283260345, + "learning_rate": 3.113161139135643e-05, + "loss": 0.0, + "num_input_tokens_seen": 10581816, + "step": 17370 + }, + { + "epoch": 4.791781577495863, + "grad_norm": 6.960496830288321e-05, + "learning_rate": 3.111994479426464e-05, + "loss": 0.0, + "num_input_tokens_seen": 10584984, + "step": 17375 + }, + { + "epoch": 4.793160507446221, + "grad_norm": 28.533334732055664, + "learning_rate": 3.110827677926949e-05, + "loss": 0.1167, + "num_input_tokens_seen": 10588312, + "step": 17380 + }, + { + "epoch": 4.794539437396581, + "grad_norm": 2.089304189212271e-06, + "learning_rate": 3.109660734907428e-05, + "loss": 0.0, + "num_input_tokens_seen": 10591032, + "step": 17385 + }, + { + "epoch": 4.795918367346939, + "grad_norm": 7.129622645152267e-06, + "learning_rate": 3.1084936506382665e-05, + "loss": 0.0, + "num_input_tokens_seen": 10594968, + "step": 17390 + }, + { + "epoch": 4.797297297297297, + "grad_norm": 0.02904530055820942, + "learning_rate": 3.107326425389861e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10598392, + "step": 17395 + }, + { + "epoch": 4.798676227247656, + "grad_norm": 5.282937763695372e-06, + "learning_rate": 3.1061590594326414e-05, + "loss": 0.0, + "num_input_tokens_seen": 10600440, + "step": 17400 + }, + { + "epoch": 4.800055157198014, + "grad_norm": 3.4825650800485164e-05, + "learning_rate": 3.104991553037068e-05, + "loss": 0.0, + "num_input_tokens_seen": 10603128, + "step": 17405 + }, + { + "epoch": 4.8014340871483725, + "grad_norm": 7.800450111972168e-05, + "learning_rate": 3.103823906473636e-05, + "loss": 0.15, + "num_input_tokens_seen": 10605432, + "step": 17410 + }, + { + "epoch": 4.802813017098732, + "grad_norm": 0.01592634804546833, + "learning_rate": 3.1026561200128724e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10607768, + "step": 17415 + }, + { + "epoch": 4.80419194704909, + "grad_norm": 0.00022794422693550587, + "learning_rate": 3.101488193925336e-05, + "loss": 0.0, + "num_input_tokens_seen": 10610552, + "step": 17420 + }, + { + "epoch": 4.8055708769994485, + "grad_norm": 0.0002953452349174768, + "learning_rate": 3.10032012848162e-05, + "loss": 0.0, + "num_input_tokens_seen": 10613496, + "step": 17425 + }, + { + "epoch": 4.806949806949807, + "grad_norm": 0.017758842557668686, + "learning_rate": 3.099151923952347e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10616664, + "step": 17430 + }, + { + "epoch": 4.808328736900165, + "grad_norm": 0.00010569753794698045, + "learning_rate": 3.097983580608175e-05, + "loss": 0.0, + "num_input_tokens_seen": 10619832, + "step": 17435 + }, + { + "epoch": 4.809707666850524, + "grad_norm": 0.0009082379401661456, + "learning_rate": 3.0968150987197895e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10623320, + "step": 17440 + }, + { + "epoch": 4.811086596800882, + "grad_norm": 0.0012108118971809745, + "learning_rate": 3.0956464785579124e-05, + "loss": 0.0, + "num_input_tokens_seen": 10626296, + "step": 17445 + }, + { + "epoch": 4.812465526751241, + "grad_norm": 0.00027978222351521254, + "learning_rate": 3.094477720393297e-05, + "loss": 0.0042, + "num_input_tokens_seen": 10629784, + "step": 17450 + }, + { + "epoch": 4.8138444567016, + "grad_norm": 0.001241275924257934, + "learning_rate": 3.093308824496728e-05, + "loss": 0.0, + "num_input_tokens_seen": 10632920, + "step": 17455 + }, + { + "epoch": 4.815223386651958, + "grad_norm": 2.6899795557255857e-05, + "learning_rate": 3.0921397911390196e-05, + "loss": 0.0, + "num_input_tokens_seen": 10636440, + "step": 17460 + }, + { + "epoch": 4.816602316602316, + "grad_norm": 0.000235783911193721, + "learning_rate": 3.090970620591022e-05, + "loss": 0.1396, + "num_input_tokens_seen": 10638840, + "step": 17465 + }, + { + "epoch": 4.817981246552675, + "grad_norm": 0.0011260341852903366, + "learning_rate": 3.0898013131236144e-05, + "loss": 0.0, + "num_input_tokens_seen": 10641400, + "step": 17470 + }, + { + "epoch": 4.819360176503034, + "grad_norm": 0.06666763871908188, + "learning_rate": 3.08863186900771e-05, + "loss": 0.0003, + "num_input_tokens_seen": 10645400, + "step": 17475 + }, + { + "epoch": 4.820739106453392, + "grad_norm": 0.08799973130226135, + "learning_rate": 3.087462288514249e-05, + "loss": 0.0006, + "num_input_tokens_seen": 10648536, + "step": 17480 + }, + { + "epoch": 4.822118036403751, + "grad_norm": 0.013697516173124313, + "learning_rate": 3.08629257191421e-05, + "loss": 0.0002, + "num_input_tokens_seen": 10651000, + "step": 17485 + }, + { + "epoch": 4.823496966354109, + "grad_norm": 8.853446006774902, + "learning_rate": 3.085122719478597e-05, + "loss": 0.094, + "num_input_tokens_seen": 10653944, + "step": 17490 + }, + { + "epoch": 4.824875896304468, + "grad_norm": 0.0010986414272338152, + "learning_rate": 3.08395273147845e-05, + "loss": 0.0714, + "num_input_tokens_seen": 10657272, + "step": 17495 + }, + { + "epoch": 4.826254826254826, + "grad_norm": 0.012705660425126553, + "learning_rate": 3.082782608184837e-05, + "loss": 0.0005, + "num_input_tokens_seen": 10659640, + "step": 17500 + }, + { + "epoch": 4.827633756205184, + "grad_norm": 0.02243926003575325, + "learning_rate": 3.081612349868859e-05, + "loss": 0.0007, + "num_input_tokens_seen": 10661880, + "step": 17505 + }, + { + "epoch": 4.829012686155544, + "grad_norm": 0.0036612222902476788, + "learning_rate": 3.0804419568016484e-05, + "loss": 0.0898, + "num_input_tokens_seen": 10664344, + "step": 17510 + }, + { + "epoch": 4.830391616105902, + "grad_norm": 9.81159496307373, + "learning_rate": 3.079271429254368e-05, + "loss": 0.0663, + "num_input_tokens_seen": 10667352, + "step": 17515 + }, + { + "epoch": 4.83177054605626, + "grad_norm": 0.01661880500614643, + "learning_rate": 3.0781007674982126e-05, + "loss": 0.0005, + "num_input_tokens_seen": 10670776, + "step": 17520 + }, + { + "epoch": 4.833149476006619, + "grad_norm": 0.0009512307005934417, + "learning_rate": 3.0769299718044074e-05, + "loss": 0.0006, + "num_input_tokens_seen": 10673400, + "step": 17525 + }, + { + "epoch": 4.834528405956977, + "grad_norm": 0.00035841172211803496, + "learning_rate": 3.075759042444211e-05, + "loss": 0.0002, + "num_input_tokens_seen": 10675928, + "step": 17530 + }, + { + "epoch": 4.835907335907336, + "grad_norm": 0.02209821343421936, + "learning_rate": 3.0745879796889074e-05, + "loss": 0.0008, + "num_input_tokens_seen": 10679960, + "step": 17535 + }, + { + "epoch": 4.837286265857695, + "grad_norm": 0.0027847406454384327, + "learning_rate": 3.073416783809817e-05, + "loss": 0.0004, + "num_input_tokens_seen": 10683576, + "step": 17540 + }, + { + "epoch": 4.838665195808053, + "grad_norm": 0.07555381953716278, + "learning_rate": 3.072245455078289e-05, + "loss": 0.0531, + "num_input_tokens_seen": 10686968, + "step": 17545 + }, + { + "epoch": 4.8400441257584115, + "grad_norm": 0.005412194412201643, + "learning_rate": 3.071073993765703e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10689784, + "step": 17550 + }, + { + "epoch": 4.84142305570877, + "grad_norm": 0.0063603101298213005, + "learning_rate": 3.069902400143471e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10693112, + "step": 17555 + }, + { + "epoch": 4.842801985659128, + "grad_norm": 0.0028908185195177794, + "learning_rate": 3.068730674483033e-05, + "loss": 0.0, + "num_input_tokens_seen": 10695640, + "step": 17560 + }, + { + "epoch": 4.844180915609487, + "grad_norm": 0.001292925444431603, + "learning_rate": 3.067558817055861e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10698040, + "step": 17565 + }, + { + "epoch": 4.845559845559846, + "grad_norm": 0.00792643427848816, + "learning_rate": 3.0663868281334596e-05, + "loss": 0.0003, + "num_input_tokens_seen": 10700952, + "step": 17570 + }, + { + "epoch": 4.846938775510204, + "grad_norm": 0.0006719690281897783, + "learning_rate": 3.065214707987359e-05, + "loss": 0.0005, + "num_input_tokens_seen": 10705272, + "step": 17575 + }, + { + "epoch": 4.848317705460563, + "grad_norm": 0.00021811090118717402, + "learning_rate": 3.064042456889124e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10707608, + "step": 17580 + }, + { + "epoch": 4.849696635410921, + "grad_norm": 0.001486431690864265, + "learning_rate": 3.0628700751103485e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10710552, + "step": 17585 + }, + { + "epoch": 4.851075565361279, + "grad_norm": 0.026196349412202835, + "learning_rate": 3.0616975629226556e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10712984, + "step": 17590 + }, + { + "epoch": 4.852454495311639, + "grad_norm": 0.014266056008636951, + "learning_rate": 3.060524920597701e-05, + "loss": 0.0814, + "num_input_tokens_seen": 10714904, + "step": 17595 + }, + { + "epoch": 4.853833425261997, + "grad_norm": 0.0019019456813111901, + "learning_rate": 3.059352148407168e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10717528, + "step": 17600 + }, + { + "epoch": 4.855212355212355, + "grad_norm": 0.008400309830904007, + "learning_rate": 3.058179246622772e-05, + "loss": 0.0186, + "num_input_tokens_seen": 10720472, + "step": 17605 + }, + { + "epoch": 4.856591285162714, + "grad_norm": 0.00016949689597822726, + "learning_rate": 3.057006215516256e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10724344, + "step": 17610 + }, + { + "epoch": 4.857970215113072, + "grad_norm": 0.0006815851666033268, + "learning_rate": 3.055833055359396e-05, + "loss": 0.0, + "num_input_tokens_seen": 10727512, + "step": 17615 + }, + { + "epoch": 4.8593491450634305, + "grad_norm": 0.0003656506596598774, + "learning_rate": 3.054659766423995e-05, + "loss": 0.0, + "num_input_tokens_seen": 10730584, + "step": 17620 + }, + { + "epoch": 4.860728075013789, + "grad_norm": 0.001142687862738967, + "learning_rate": 3.053486348981889e-05, + "loss": 0.1295, + "num_input_tokens_seen": 10733464, + "step": 17625 + }, + { + "epoch": 4.862107004964148, + "grad_norm": 0.0017071402398869395, + "learning_rate": 3.05231280330494e-05, + "loss": 0.0002, + "num_input_tokens_seen": 10736248, + "step": 17630 + }, + { + "epoch": 4.8634859349145065, + "grad_norm": 0.0003856137627735734, + "learning_rate": 3.0511391296650426e-05, + "loss": 0.0002, + "num_input_tokens_seen": 10739032, + "step": 17635 + }, + { + "epoch": 4.864864864864865, + "grad_norm": 0.005058635026216507, + "learning_rate": 3.0499653283341205e-05, + "loss": 0.0002, + "num_input_tokens_seen": 10742456, + "step": 17640 + }, + { + "epoch": 4.866243794815223, + "grad_norm": 0.017511798068881035, + "learning_rate": 3.0487913995841266e-05, + "loss": 0.0002, + "num_input_tokens_seen": 10746232, + "step": 17645 + }, + { + "epoch": 4.867622724765582, + "grad_norm": 15.037210464477539, + "learning_rate": 3.0476173436870427e-05, + "loss": 0.0629, + "num_input_tokens_seen": 10749048, + "step": 17650 + }, + { + "epoch": 4.86900165471594, + "grad_norm": 0.021085307002067566, + "learning_rate": 3.0464431609148803e-05, + "loss": 0.0002, + "num_input_tokens_seen": 10751576, + "step": 17655 + }, + { + "epoch": 4.870380584666299, + "grad_norm": 0.006634852383285761, + "learning_rate": 3.0452688515396822e-05, + "loss": 0.0003, + "num_input_tokens_seen": 10754200, + "step": 17660 + }, + { + "epoch": 4.871759514616658, + "grad_norm": 0.01798805221915245, + "learning_rate": 3.0440944158335183e-05, + "loss": 0.0003, + "num_input_tokens_seen": 10756920, + "step": 17665 + }, + { + "epoch": 4.873138444567016, + "grad_norm": 0.04578861594200134, + "learning_rate": 3.042919854068488e-05, + "loss": 0.0003, + "num_input_tokens_seen": 10760728, + "step": 17670 + }, + { + "epoch": 4.874517374517374, + "grad_norm": 0.0015434629749506712, + "learning_rate": 3.041745166516721e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10763736, + "step": 17675 + }, + { + "epoch": 4.875896304467733, + "grad_norm": 0.0028570247814059258, + "learning_rate": 3.0405703534503754e-05, + "loss": 0.0008, + "num_input_tokens_seen": 10766712, + "step": 17680 + }, + { + "epoch": 4.877275234418091, + "grad_norm": 0.0013601267710328102, + "learning_rate": 3.039395415141638e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10769656, + "step": 17685 + }, + { + "epoch": 4.8786541643684505, + "grad_norm": 0.0008736332529224455, + "learning_rate": 3.0382203518627262e-05, + "loss": 0.0, + "num_input_tokens_seen": 10771896, + "step": 17690 + }, + { + "epoch": 4.880033094318809, + "grad_norm": 0.0004562098765745759, + "learning_rate": 3.0370451638858845e-05, + "loss": 0.0, + "num_input_tokens_seen": 10775896, + "step": 17695 + }, + { + "epoch": 4.881412024269167, + "grad_norm": 0.009197314269840717, + "learning_rate": 3.035869851483387e-05, + "loss": 0.0027, + "num_input_tokens_seen": 10778616, + "step": 17700 + }, + { + "epoch": 4.882790954219526, + "grad_norm": 0.0001891944120870903, + "learning_rate": 3.0346944149275374e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10781656, + "step": 17705 + }, + { + "epoch": 4.884169884169884, + "grad_norm": 0.007496461272239685, + "learning_rate": 3.0335188544906674e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10784408, + "step": 17710 + }, + { + "epoch": 4.885548814120242, + "grad_norm": 0.002065056934952736, + "learning_rate": 3.0323431704451366e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10787416, + "step": 17715 + }, + { + "epoch": 4.886927744070602, + "grad_norm": 0.0005500860861502588, + "learning_rate": 3.031167363063335e-05, + "loss": 0.0002, + "num_input_tokens_seen": 10791256, + "step": 17720 + }, + { + "epoch": 4.88830667402096, + "grad_norm": 0.0011419015936553478, + "learning_rate": 3.0299914326176792e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10794104, + "step": 17725 + }, + { + "epoch": 4.889685603971318, + "grad_norm": 0.03947971388697624, + "learning_rate": 3.0288153793806173e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10798232, + "step": 17730 + }, + { + "epoch": 4.891064533921677, + "grad_norm": 9.578156459610909e-05, + "learning_rate": 3.0276392036246222e-05, + "loss": 0.0, + "num_input_tokens_seen": 10800984, + "step": 17735 + }, + { + "epoch": 4.892443463872035, + "grad_norm": 7.82927090767771e-05, + "learning_rate": 3.0264629056221982e-05, + "loss": 0.0, + "num_input_tokens_seen": 10803480, + "step": 17740 + }, + { + "epoch": 4.8938223938223935, + "grad_norm": 7.750895747449249e-05, + "learning_rate": 3.025286485645875e-05, + "loss": 0.0, + "num_input_tokens_seen": 10806264, + "step": 17745 + }, + { + "epoch": 4.895201323772753, + "grad_norm": 0.002984666032716632, + "learning_rate": 3.024109943968214e-05, + "loss": 0.0, + "num_input_tokens_seen": 10809144, + "step": 17750 + }, + { + "epoch": 4.896580253723111, + "grad_norm": 4.372244075057097e-05, + "learning_rate": 3.022933280861802e-05, + "loss": 0.0, + "num_input_tokens_seen": 10811576, + "step": 17755 + }, + { + "epoch": 4.8979591836734695, + "grad_norm": 82.19187927246094, + "learning_rate": 3.0217564965992557e-05, + "loss": 0.0176, + "num_input_tokens_seen": 10814712, + "step": 17760 + }, + { + "epoch": 4.899338113623828, + "grad_norm": 0.0007231985800899565, + "learning_rate": 3.020579591453218e-05, + "loss": 0.0, + "num_input_tokens_seen": 10818616, + "step": 17765 + }, + { + "epoch": 4.900717043574186, + "grad_norm": 2.0029496226925403e-05, + "learning_rate": 3.019402565696362e-05, + "loss": 0.0, + "num_input_tokens_seen": 10821080, + "step": 17770 + }, + { + "epoch": 4.902095973524545, + "grad_norm": 0.00012439479178283364, + "learning_rate": 3.0182254196013875e-05, + "loss": 0.0, + "num_input_tokens_seen": 10825080, + "step": 17775 + }, + { + "epoch": 4.903474903474903, + "grad_norm": 0.0013271450297906995, + "learning_rate": 3.0170481534410217e-05, + "loss": 0.0, + "num_input_tokens_seen": 10827544, + "step": 17780 + }, + { + "epoch": 4.904853833425262, + "grad_norm": 0.006744956132024527, + "learning_rate": 3.0158707674880205e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10830552, + "step": 17785 + }, + { + "epoch": 4.906232763375621, + "grad_norm": 0.020490236580371857, + "learning_rate": 3.014693262015168e-05, + "loss": 0.0319, + "num_input_tokens_seen": 10832984, + "step": 17790 + }, + { + "epoch": 4.907611693325979, + "grad_norm": 0.053041961044073105, + "learning_rate": 3.0135156372952746e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10835576, + "step": 17795 + }, + { + "epoch": 4.908990623276337, + "grad_norm": 2.279970249219332e-05, + "learning_rate": 3.0123378936011787e-05, + "loss": 0.0, + "num_input_tokens_seen": 10839384, + "step": 17800 + }, + { + "epoch": 4.910369553226696, + "grad_norm": 0.0003261385136283934, + "learning_rate": 3.0111600312057476e-05, + "loss": 0.0, + "num_input_tokens_seen": 10842040, + "step": 17805 + }, + { + "epoch": 4.911748483177055, + "grad_norm": 0.0016888832906261086, + "learning_rate": 3.009982050381875e-05, + "loss": 0.0, + "num_input_tokens_seen": 10845176, + "step": 17810 + }, + { + "epoch": 4.913127413127413, + "grad_norm": 0.00014966065646149218, + "learning_rate": 3.0088039514024818e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10848216, + "step": 17815 + }, + { + "epoch": 4.914506343077772, + "grad_norm": 0.0004992152680642903, + "learning_rate": 3.0076257345405163e-05, + "loss": 0.0, + "num_input_tokens_seen": 10852248, + "step": 17820 + }, + { + "epoch": 4.91588527302813, + "grad_norm": 0.002080717356875539, + "learning_rate": 3.0064474000689548e-05, + "loss": 0.0065, + "num_input_tokens_seen": 10855384, + "step": 17825 + }, + { + "epoch": 4.9172642029784885, + "grad_norm": 0.00016945494280662388, + "learning_rate": 3.0052689482607997e-05, + "loss": 0.0002, + "num_input_tokens_seen": 10859128, + "step": 17830 + }, + { + "epoch": 4.918643132928847, + "grad_norm": 1.4337323591462336e-05, + "learning_rate": 3.0040903793890836e-05, + "loss": 0.0, + "num_input_tokens_seen": 10862520, + "step": 17835 + }, + { + "epoch": 4.920022062879205, + "grad_norm": 0.0002809724537655711, + "learning_rate": 3.002911693726862e-05, + "loss": 0.0, + "num_input_tokens_seen": 10865592, + "step": 17840 + }, + { + "epoch": 4.921400992829565, + "grad_norm": 0.0025500531774014235, + "learning_rate": 3.0017328915472198e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10868056, + "step": 17845 + }, + { + "epoch": 4.922779922779923, + "grad_norm": 4.9597561883274466e-05, + "learning_rate": 3.000553973123269e-05, + "loss": 0.0, + "num_input_tokens_seen": 10871832, + "step": 17850 + }, + { + "epoch": 4.924158852730281, + "grad_norm": 6.221999501576647e-05, + "learning_rate": 2.999374938728148e-05, + "loss": 0.0, + "num_input_tokens_seen": 10876504, + "step": 17855 + }, + { + "epoch": 4.92553778268064, + "grad_norm": 7.648825703654438e-05, + "learning_rate": 2.998195788635021e-05, + "loss": 0.0, + "num_input_tokens_seen": 10881176, + "step": 17860 + }, + { + "epoch": 4.926916712630998, + "grad_norm": 9.227164264302701e-05, + "learning_rate": 2.997016523117081e-05, + "loss": 0.0, + "num_input_tokens_seen": 10883640, + "step": 17865 + }, + { + "epoch": 4.928295642581357, + "grad_norm": 0.0006912652752362192, + "learning_rate": 2.995837142447548e-05, + "loss": 0.0, + "num_input_tokens_seen": 10886168, + "step": 17870 + }, + { + "epoch": 4.929674572531716, + "grad_norm": 1.7803648006520234e-05, + "learning_rate": 2.994657646899666e-05, + "loss": 0.0, + "num_input_tokens_seen": 10889336, + "step": 17875 + }, + { + "epoch": 4.931053502482074, + "grad_norm": 0.00012293671898078173, + "learning_rate": 2.9934780367467076e-05, + "loss": 0.0, + "num_input_tokens_seen": 10892088, + "step": 17880 + }, + { + "epoch": 4.9324324324324325, + "grad_norm": 1.397914002154721e-05, + "learning_rate": 2.992298312261972e-05, + "loss": 0.0, + "num_input_tokens_seen": 10896120, + "step": 17885 + }, + { + "epoch": 4.933811362382791, + "grad_norm": 0.00020145135931670666, + "learning_rate": 2.9911184737187847e-05, + "loss": 0.0, + "num_input_tokens_seen": 10898392, + "step": 17890 + }, + { + "epoch": 4.935190292333149, + "grad_norm": 3.272225148975849e-05, + "learning_rate": 2.989938521390495e-05, + "loss": 0.0, + "num_input_tokens_seen": 10902680, + "step": 17895 + }, + { + "epoch": 4.936569222283508, + "grad_norm": 6.011638561176369e-06, + "learning_rate": 2.988758455550484e-05, + "loss": 0.0, + "num_input_tokens_seen": 10904920, + "step": 17900 + }, + { + "epoch": 4.937948152233867, + "grad_norm": 1.0976954399666283e-05, + "learning_rate": 2.9875782764721545e-05, + "loss": 0.0, + "num_input_tokens_seen": 10907352, + "step": 17905 + }, + { + "epoch": 4.939327082184225, + "grad_norm": 7.165547140175477e-05, + "learning_rate": 2.9863979844289374e-05, + "loss": 0.0, + "num_input_tokens_seen": 10911512, + "step": 17910 + }, + { + "epoch": 4.940706012134584, + "grad_norm": 9.154763392871246e-05, + "learning_rate": 2.9852175796942895e-05, + "loss": 0.0, + "num_input_tokens_seen": 10915352, + "step": 17915 + }, + { + "epoch": 4.942084942084942, + "grad_norm": 0.0038908738642930984, + "learning_rate": 2.9840370625416925e-05, + "loss": 0.0, + "num_input_tokens_seen": 10918584, + "step": 17920 + }, + { + "epoch": 4.9434638720353, + "grad_norm": 2.0314781068009324e-05, + "learning_rate": 2.9828564332446567e-05, + "loss": 0.1208, + "num_input_tokens_seen": 10921784, + "step": 17925 + }, + { + "epoch": 4.94484280198566, + "grad_norm": 0.001198188285343349, + "learning_rate": 2.9816756920767164e-05, + "loss": 0.0, + "num_input_tokens_seen": 10924600, + "step": 17930 + }, + { + "epoch": 4.946221731936018, + "grad_norm": 0.116768978536129, + "learning_rate": 2.9804948393114324e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10928056, + "step": 17935 + }, + { + "epoch": 4.947600661886376, + "grad_norm": 0.05210451781749725, + "learning_rate": 2.9793138752223914e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10930712, + "step": 17940 + }, + { + "epoch": 4.948979591836735, + "grad_norm": 0.0004703766026068479, + "learning_rate": 2.978132800083206e-05, + "loss": 0.0003, + "num_input_tokens_seen": 10933848, + "step": 17945 + }, + { + "epoch": 4.950358521787093, + "grad_norm": 0.004788098391145468, + "learning_rate": 2.9769516141675135e-05, + "loss": 0.0757, + "num_input_tokens_seen": 10937112, + "step": 17950 + }, + { + "epoch": 4.9517374517374515, + "grad_norm": 8.656476711621508e-05, + "learning_rate": 2.9757703177489794e-05, + "loss": 0.0, + "num_input_tokens_seen": 10941144, + "step": 17955 + }, + { + "epoch": 4.95311638168781, + "grad_norm": 3.664466566988267e-05, + "learning_rate": 2.974588911101291e-05, + "loss": 0.0073, + "num_input_tokens_seen": 10945240, + "step": 17960 + }, + { + "epoch": 4.954495311638169, + "grad_norm": 0.000637764111161232, + "learning_rate": 2.973407394498165e-05, + "loss": 0.0, + "num_input_tokens_seen": 10947928, + "step": 17965 + }, + { + "epoch": 4.9558742415885275, + "grad_norm": 0.00011923899000976235, + "learning_rate": 2.9722257682133407e-05, + "loss": 0.0, + "num_input_tokens_seen": 10950872, + "step": 17970 + }, + { + "epoch": 4.957253171538886, + "grad_norm": 1.6642070477246307e-05, + "learning_rate": 2.9710440325205847e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10952856, + "step": 17975 + }, + { + "epoch": 4.958632101489244, + "grad_norm": 0.05077773705124855, + "learning_rate": 2.9698621876936878e-05, + "loss": 0.0015, + "num_input_tokens_seen": 10955864, + "step": 17980 + }, + { + "epoch": 4.960011031439603, + "grad_norm": 0.0002191205567214638, + "learning_rate": 2.9686802340064674e-05, + "loss": 0.0, + "num_input_tokens_seen": 10958232, + "step": 17985 + }, + { + "epoch": 4.961389961389961, + "grad_norm": 6.022039815434255e-05, + "learning_rate": 2.9674981717327633e-05, + "loss": 0.0, + "num_input_tokens_seen": 10960792, + "step": 17990 + }, + { + "epoch": 4.96276889134032, + "grad_norm": 0.002248239004984498, + "learning_rate": 2.966316001146444e-05, + "loss": 0.0, + "num_input_tokens_seen": 10963416, + "step": 17995 + }, + { + "epoch": 4.964147821290679, + "grad_norm": 0.000655523908790201, + "learning_rate": 2.965133722521401e-05, + "loss": 0.0, + "num_input_tokens_seen": 10966232, + "step": 18000 + }, + { + "epoch": 4.965526751241037, + "grad_norm": 0.005150991026312113, + "learning_rate": 2.963951336131551e-05, + "loss": 0.0, + "num_input_tokens_seen": 10968888, + "step": 18005 + }, + { + "epoch": 4.966905681191395, + "grad_norm": 6.904233305249363e-05, + "learning_rate": 2.962768842250836e-05, + "loss": 0.0, + "num_input_tokens_seen": 10971544, + "step": 18010 + }, + { + "epoch": 4.968284611141754, + "grad_norm": 1.0422018021927215e-05, + "learning_rate": 2.9615862411532236e-05, + "loss": 0.0013, + "num_input_tokens_seen": 10974648, + "step": 18015 + }, + { + "epoch": 4.969663541092112, + "grad_norm": 0.00017618508718442172, + "learning_rate": 2.9604035331127045e-05, + "loss": 0.0, + "num_input_tokens_seen": 10977432, + "step": 18020 + }, + { + "epoch": 4.971042471042471, + "grad_norm": 0.0002594723482616246, + "learning_rate": 2.959220718403296e-05, + "loss": 0.0835, + "num_input_tokens_seen": 10980504, + "step": 18025 + }, + { + "epoch": 4.97242140099283, + "grad_norm": 0.09079520404338837, + "learning_rate": 2.9580377972990385e-05, + "loss": 0.0001, + "num_input_tokens_seen": 10983576, + "step": 18030 + }, + { + "epoch": 4.973800330943188, + "grad_norm": 0.00024749970179982483, + "learning_rate": 2.9568547700739985e-05, + "loss": 0.0, + "num_input_tokens_seen": 10986456, + "step": 18035 + }, + { + "epoch": 4.975179260893547, + "grad_norm": 9.825411325437017e-06, + "learning_rate": 2.9556716370022658e-05, + "loss": 0.0, + "num_input_tokens_seen": 10989272, + "step": 18040 + }, + { + "epoch": 4.976558190843905, + "grad_norm": 4.750049265567213e-05, + "learning_rate": 2.9544883983579557e-05, + "loss": 0.0, + "num_input_tokens_seen": 10991992, + "step": 18045 + }, + { + "epoch": 4.977937120794263, + "grad_norm": 0.02110186591744423, + "learning_rate": 2.953305054415208e-05, + "loss": 0.0, + "num_input_tokens_seen": 10998040, + "step": 18050 + }, + { + "epoch": 4.979316050744622, + "grad_norm": 0.00039953141822479665, + "learning_rate": 2.952121605448186e-05, + "loss": 0.0, + "num_input_tokens_seen": 11001656, + "step": 18055 + }, + { + "epoch": 4.980694980694981, + "grad_norm": 0.0050485036335885525, + "learning_rate": 2.950938051731078e-05, + "loss": 0.0876, + "num_input_tokens_seen": 11004504, + "step": 18060 + }, + { + "epoch": 4.982073910645339, + "grad_norm": 3.673022985458374, + "learning_rate": 2.9497543935380968e-05, + "loss": 0.009, + "num_input_tokens_seen": 11008184, + "step": 18065 + }, + { + "epoch": 4.983452840595698, + "grad_norm": 0.0001955726620508358, + "learning_rate": 2.9485706311434775e-05, + "loss": 0.0, + "num_input_tokens_seen": 11011800, + "step": 18070 + }, + { + "epoch": 4.984831770546056, + "grad_norm": 0.015622361563146114, + "learning_rate": 2.947386764821482e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11014968, + "step": 18075 + }, + { + "epoch": 4.9862107004964145, + "grad_norm": 0.00028720867703668773, + "learning_rate": 2.946202794846396e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11018040, + "step": 18080 + }, + { + "epoch": 4.987589630446774, + "grad_norm": 0.0038540579844266176, + "learning_rate": 2.945018721492527e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11020888, + "step": 18085 + }, + { + "epoch": 4.988968560397132, + "grad_norm": 0.00012636277824640274, + "learning_rate": 2.943834545034208e-05, + "loss": 0.1338, + "num_input_tokens_seen": 11023640, + "step": 18090 + }, + { + "epoch": 4.9903474903474905, + "grad_norm": 0.005680772941559553, + "learning_rate": 2.942650265745796e-05, + "loss": 0.0097, + "num_input_tokens_seen": 11027224, + "step": 18095 + }, + { + "epoch": 4.991726420297849, + "grad_norm": 0.19337543845176697, + "learning_rate": 2.9414658839016722e-05, + "loss": 0.0007, + "num_input_tokens_seen": 11031128, + "step": 18100 + }, + { + "epoch": 4.993105350248207, + "grad_norm": 0.016032865270972252, + "learning_rate": 2.9402813997762384e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11033592, + "step": 18105 + }, + { + "epoch": 4.994484280198566, + "grad_norm": 0.0021177465096116066, + "learning_rate": 2.9390968136439257e-05, + "loss": 0.0, + "num_input_tokens_seen": 11036280, + "step": 18110 + }, + { + "epoch": 4.995863210148924, + "grad_norm": 0.0010466099483892322, + "learning_rate": 2.937912125779184e-05, + "loss": 0.0004, + "num_input_tokens_seen": 11040024, + "step": 18115 + }, + { + "epoch": 4.997242140099283, + "grad_norm": 0.0074313124641776085, + "learning_rate": 2.9367273364564883e-05, + "loss": 0.0003, + "num_input_tokens_seen": 11042968, + "step": 18120 + }, + { + "epoch": 4.998621070049642, + "grad_norm": 0.002035992918536067, + "learning_rate": 2.9355424459503382e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11046072, + "step": 18125 + }, + { + "epoch": 5.0, + "grad_norm": 2.377169039391447e-05, + "learning_rate": 2.9343574545352548e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11048200, + "step": 18130 + }, + { + "epoch": 5.0, + "eval_loss": 0.18892398476600647, + "eval_runtime": 203.4364, + "eval_samples_per_second": 7.924, + "eval_steps_per_second": 1.981, + "num_input_tokens_seen": 11048200, + "step": 18130 + }, + { + "epoch": 5.001378929950358, + "grad_norm": 0.00874176062643528, + "learning_rate": 2.9331723624857854e-05, + "loss": 0.0609, + "num_input_tokens_seen": 11051720, + "step": 18135 + }, + { + "epoch": 5.002757859900717, + "grad_norm": 0.0010959227802231908, + "learning_rate": 2.9319871700764954e-05, + "loss": 0.0, + "num_input_tokens_seen": 11055464, + "step": 18140 + }, + { + "epoch": 5.004136789851075, + "grad_norm": 0.002319841179996729, + "learning_rate": 2.9308018775819808e-05, + "loss": 0.0004, + "num_input_tokens_seen": 11058568, + "step": 18145 + }, + { + "epoch": 5.005515719801434, + "grad_norm": 0.020502526313066483, + "learning_rate": 2.9296164852768543e-05, + "loss": 0.0, + "num_input_tokens_seen": 11062600, + "step": 18150 + }, + { + "epoch": 5.006894649751793, + "grad_norm": 0.012018871493637562, + "learning_rate": 2.9284309934357556e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11064840, + "step": 18155 + }, + { + "epoch": 5.008273579702151, + "grad_norm": 0.0006381261628121138, + "learning_rate": 2.9272454023333453e-05, + "loss": 0.0, + "num_input_tokens_seen": 11070216, + "step": 18160 + }, + { + "epoch": 5.0096525096525095, + "grad_norm": 0.0001286791666643694, + "learning_rate": 2.926059712244308e-05, + "loss": 0.0, + "num_input_tokens_seen": 11073000, + "step": 18165 + }, + { + "epoch": 5.011031439602868, + "grad_norm": 0.0038279995787888765, + "learning_rate": 2.924873923443351e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11075624, + "step": 18170 + }, + { + "epoch": 5.012410369553226, + "grad_norm": 6.163943908177316e-05, + "learning_rate": 2.923688036205205e-05, + "loss": 0.0, + "num_input_tokens_seen": 11077800, + "step": 18175 + }, + { + "epoch": 5.0137892995035855, + "grad_norm": 0.002634921111166477, + "learning_rate": 2.9225020508046232e-05, + "loss": 0.0, + "num_input_tokens_seen": 11079944, + "step": 18180 + }, + { + "epoch": 5.015168229453944, + "grad_norm": 6.391623173840344e-05, + "learning_rate": 2.9213159675163804e-05, + "loss": 0.0, + "num_input_tokens_seen": 11082376, + "step": 18185 + }, + { + "epoch": 5.016547159404302, + "grad_norm": 0.0010157593060284853, + "learning_rate": 2.9201297866152754e-05, + "loss": 0.0003, + "num_input_tokens_seen": 11085064, + "step": 18190 + }, + { + "epoch": 5.017926089354661, + "grad_norm": 0.00045443797716870904, + "learning_rate": 2.9189435083761307e-05, + "loss": 0.0002, + "num_input_tokens_seen": 11087912, + "step": 18195 + }, + { + "epoch": 5.019305019305019, + "grad_norm": 0.0002962930884677917, + "learning_rate": 2.9177571330737886e-05, + "loss": 0.0, + "num_input_tokens_seen": 11091432, + "step": 18200 + }, + { + "epoch": 5.020683949255377, + "grad_norm": 0.0001136969804065302, + "learning_rate": 2.9165706609831146e-05, + "loss": 0.0, + "num_input_tokens_seen": 11094120, + "step": 18205 + }, + { + "epoch": 5.022062879205737, + "grad_norm": 0.0008752308785915375, + "learning_rate": 2.9153840923789983e-05, + "loss": 0.0, + "num_input_tokens_seen": 11097512, + "step": 18210 + }, + { + "epoch": 5.023441809156095, + "grad_norm": 0.00021406933956313878, + "learning_rate": 2.9141974275363505e-05, + "loss": 0.0, + "num_input_tokens_seen": 11101032, + "step": 18215 + }, + { + "epoch": 5.024820739106453, + "grad_norm": 0.00010104964167112485, + "learning_rate": 2.9130106667301044e-05, + "loss": 0.0, + "num_input_tokens_seen": 11104552, + "step": 18220 + }, + { + "epoch": 5.026199669056812, + "grad_norm": 0.0004241852730046958, + "learning_rate": 2.9118238102352154e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11108296, + "step": 18225 + }, + { + "epoch": 5.02757859900717, + "grad_norm": 7.422691851388663e-05, + "learning_rate": 2.9106368583266612e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11111080, + "step": 18230 + }, + { + "epoch": 5.028957528957529, + "grad_norm": 0.0001752990938257426, + "learning_rate": 2.9094498112794416e-05, + "loss": 0.0, + "num_input_tokens_seen": 11113960, + "step": 18235 + }, + { + "epoch": 5.030336458907888, + "grad_norm": 0.00015734863700345159, + "learning_rate": 2.9082626693685777e-05, + "loss": 0.0, + "num_input_tokens_seen": 11117512, + "step": 18240 + }, + { + "epoch": 5.031715388858246, + "grad_norm": 0.008134079165756702, + "learning_rate": 2.9070754328691136e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11120936, + "step": 18245 + }, + { + "epoch": 5.033094318808605, + "grad_norm": 4.627397356671281e-05, + "learning_rate": 2.9058881020561147e-05, + "loss": 0.0, + "num_input_tokens_seen": 11124008, + "step": 18250 + }, + { + "epoch": 5.034473248758963, + "grad_norm": 1.4975874364608899e-05, + "learning_rate": 2.904700677204669e-05, + "loss": 0.0, + "num_input_tokens_seen": 11128104, + "step": 18255 + }, + { + "epoch": 5.035852178709321, + "grad_norm": 0.000672043941449374, + "learning_rate": 2.903513158589886e-05, + "loss": 0.0, + "num_input_tokens_seen": 11130600, + "step": 18260 + }, + { + "epoch": 5.03723110865968, + "grad_norm": 0.029526902362704277, + "learning_rate": 2.9023255464868965e-05, + "loss": 0.1334, + "num_input_tokens_seen": 11135816, + "step": 18265 + }, + { + "epoch": 5.038610038610039, + "grad_norm": 0.02727402187883854, + "learning_rate": 2.9011378411708534e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11139848, + "step": 18270 + }, + { + "epoch": 5.039988968560397, + "grad_norm": 0.005616194102913141, + "learning_rate": 2.8999500429169296e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11142472, + "step": 18275 + }, + { + "epoch": 5.041367898510756, + "grad_norm": 0.0011192945530638099, + "learning_rate": 2.8987621520003217e-05, + "loss": 0.0, + "num_input_tokens_seen": 11145160, + "step": 18280 + }, + { + "epoch": 5.042746828461114, + "grad_norm": 0.04283200949430466, + "learning_rate": 2.8975741686962477e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11147112, + "step": 18285 + }, + { + "epoch": 5.0441257584114725, + "grad_norm": 0.00038636618410237134, + "learning_rate": 2.8963860932799464e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11151080, + "step": 18290 + }, + { + "epoch": 5.045504688361831, + "grad_norm": 9.772664634510875e-05, + "learning_rate": 2.895197926026677e-05, + "loss": 0.0, + "num_input_tokens_seen": 11153192, + "step": 18295 + }, + { + "epoch": 5.04688361831219, + "grad_norm": 0.006232786923646927, + "learning_rate": 2.89400966721172e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11156328, + "step": 18300 + }, + { + "epoch": 5.0482625482625485, + "grad_norm": 0.00035906818811781704, + "learning_rate": 2.89282131711038e-05, + "loss": 0.0002, + "num_input_tokens_seen": 11160008, + "step": 18305 + }, + { + "epoch": 5.049641478212907, + "grad_norm": 3.6550114600686356e-05, + "learning_rate": 2.8916328759979793e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11163208, + "step": 18310 + }, + { + "epoch": 5.051020408163265, + "grad_norm": 0.0005714327562600374, + "learning_rate": 2.890444344149862e-05, + "loss": 0.0528, + "num_input_tokens_seen": 11165704, + "step": 18315 + }, + { + "epoch": 5.052399338113624, + "grad_norm": 0.002480008639395237, + "learning_rate": 2.889255721841395e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11169128, + "step": 18320 + }, + { + "epoch": 5.053778268063982, + "grad_norm": 0.0008987895562313497, + "learning_rate": 2.888067009347966e-05, + "loss": 0.0002, + "num_input_tokens_seen": 11171688, + "step": 18325 + }, + { + "epoch": 5.055157198014341, + "grad_norm": 0.0006107112276367843, + "learning_rate": 2.8868782069449808e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11174728, + "step": 18330 + }, + { + "epoch": 5.0565361279647, + "grad_norm": 0.0003126814553979784, + "learning_rate": 2.885689314907868e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11178024, + "step": 18335 + }, + { + "epoch": 5.057915057915058, + "grad_norm": 0.00030604342464357615, + "learning_rate": 2.8845003335120773e-05, + "loss": 0.0, + "num_input_tokens_seen": 11181768, + "step": 18340 + }, + { + "epoch": 5.059293987865416, + "grad_norm": 5.161159424460493e-05, + "learning_rate": 2.8833112630330796e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11185672, + "step": 18345 + }, + { + "epoch": 5.060672917815775, + "grad_norm": 0.0014663878828287125, + "learning_rate": 2.882122103746363e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11189256, + "step": 18350 + }, + { + "epoch": 5.062051847766133, + "grad_norm": 0.00029076289501972497, + "learning_rate": 2.8809328559274407e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11192008, + "step": 18355 + }, + { + "epoch": 5.063430777716492, + "grad_norm": 0.012819348834455013, + "learning_rate": 2.879743519851844e-05, + "loss": 0.0, + "num_input_tokens_seen": 11194440, + "step": 18360 + }, + { + "epoch": 5.064809707666851, + "grad_norm": 0.0005339765921235085, + "learning_rate": 2.878554095795125e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11197736, + "step": 18365 + }, + { + "epoch": 5.066188637617209, + "grad_norm": 0.0042512016370892525, + "learning_rate": 2.8773645840328554e-05, + "loss": 0.0, + "num_input_tokens_seen": 11200520, + "step": 18370 + }, + { + "epoch": 5.0675675675675675, + "grad_norm": 0.00015423861623276025, + "learning_rate": 2.876174984840629e-05, + "loss": 0.0, + "num_input_tokens_seen": 11203560, + "step": 18375 + }, + { + "epoch": 5.068946497517926, + "grad_norm": 0.02260131947696209, + "learning_rate": 2.874985298494058e-05, + "loss": 0.0002, + "num_input_tokens_seen": 11206248, + "step": 18380 + }, + { + "epoch": 5.070325427468284, + "grad_norm": 0.006662032566964626, + "learning_rate": 2.873795525268776e-05, + "loss": 0.0, + "num_input_tokens_seen": 11209704, + "step": 18385 + }, + { + "epoch": 5.071704357418644, + "grad_norm": 0.001011278247460723, + "learning_rate": 2.872605665440436e-05, + "loss": 0.0, + "num_input_tokens_seen": 11212168, + "step": 18390 + }, + { + "epoch": 5.073083287369002, + "grad_norm": 0.0005155376275070012, + "learning_rate": 2.8714157192847124e-05, + "loss": 0.0, + "num_input_tokens_seen": 11215240, + "step": 18395 + }, + { + "epoch": 5.07446221731936, + "grad_norm": 9.065663107321598e-06, + "learning_rate": 2.8702256870772988e-05, + "loss": 0.0, + "num_input_tokens_seen": 11217800, + "step": 18400 + }, + { + "epoch": 5.075841147269719, + "grad_norm": 4.4812415580963716e-05, + "learning_rate": 2.869035569093907e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11220456, + "step": 18405 + }, + { + "epoch": 5.077220077220077, + "grad_norm": 3.9003596612019464e-05, + "learning_rate": 2.867845365610271e-05, + "loss": 0.0, + "num_input_tokens_seen": 11222952, + "step": 18410 + }, + { + "epoch": 5.078599007170435, + "grad_norm": 0.0004300532164052129, + "learning_rate": 2.8666550769021444e-05, + "loss": 0.0355, + "num_input_tokens_seen": 11225896, + "step": 18415 + }, + { + "epoch": 5.079977937120795, + "grad_norm": 6.285153358476236e-05, + "learning_rate": 2.8654647032452997e-05, + "loss": 0.0, + "num_input_tokens_seen": 11230472, + "step": 18420 + }, + { + "epoch": 5.081356867071153, + "grad_norm": 0.00044979818630963564, + "learning_rate": 2.8642742449155284e-05, + "loss": 0.0, + "num_input_tokens_seen": 11233096, + "step": 18425 + }, + { + "epoch": 5.0827357970215115, + "grad_norm": 0.00013998258509673178, + "learning_rate": 2.863083702188645e-05, + "loss": 0.0, + "num_input_tokens_seen": 11236648, + "step": 18430 + }, + { + "epoch": 5.08411472697187, + "grad_norm": 4.8803365643834695e-05, + "learning_rate": 2.861893075340478e-05, + "loss": 0.0, + "num_input_tokens_seen": 11239880, + "step": 18435 + }, + { + "epoch": 5.085493656922228, + "grad_norm": 0.0007255150121636689, + "learning_rate": 2.8607023646468815e-05, + "loss": 0.0, + "num_input_tokens_seen": 11242024, + "step": 18440 + }, + { + "epoch": 5.086872586872587, + "grad_norm": 0.006799870636314154, + "learning_rate": 2.8595115703837245e-05, + "loss": 0.0, + "num_input_tokens_seen": 11244840, + "step": 18445 + }, + { + "epoch": 5.088251516822945, + "grad_norm": 0.0012848654296249151, + "learning_rate": 2.8583206928268968e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11249000, + "step": 18450 + }, + { + "epoch": 5.089630446773304, + "grad_norm": 0.00031170505098998547, + "learning_rate": 2.8571297322523082e-05, + "loss": 0.0, + "num_input_tokens_seen": 11253032, + "step": 18455 + }, + { + "epoch": 5.091009376723663, + "grad_norm": 0.00709070498123765, + "learning_rate": 2.8559386889358873e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11256872, + "step": 18460 + }, + { + "epoch": 5.092388306674021, + "grad_norm": 0.0001192930358229205, + "learning_rate": 2.8547475631535814e-05, + "loss": 0.0, + "num_input_tokens_seen": 11260008, + "step": 18465 + }, + { + "epoch": 5.093767236624379, + "grad_norm": 6.557263986906037e-05, + "learning_rate": 2.853556355181357e-05, + "loss": 0.0, + "num_input_tokens_seen": 11262664, + "step": 18470 + }, + { + "epoch": 5.095146166574738, + "grad_norm": 0.005996436346322298, + "learning_rate": 2.852365065295201e-05, + "loss": 0.0002, + "num_input_tokens_seen": 11266184, + "step": 18475 + }, + { + "epoch": 5.096525096525096, + "grad_norm": 0.0007312675006687641, + "learning_rate": 2.8511736937711164e-05, + "loss": 0.0, + "num_input_tokens_seen": 11268648, + "step": 18480 + }, + { + "epoch": 5.097904026475455, + "grad_norm": 0.003773090662434697, + "learning_rate": 2.8499822408851285e-05, + "loss": 0.0, + "num_input_tokens_seen": 11271048, + "step": 18485 + }, + { + "epoch": 5.099282956425814, + "grad_norm": 0.015925632789731026, + "learning_rate": 2.848790706913278e-05, + "loss": 0.0, + "num_input_tokens_seen": 11273544, + "step": 18490 + }, + { + "epoch": 5.100661886376172, + "grad_norm": 1.3277843208925333e-05, + "learning_rate": 2.8475990921316292e-05, + "loss": 0.0, + "num_input_tokens_seen": 11276008, + "step": 18495 + }, + { + "epoch": 5.1020408163265305, + "grad_norm": 0.0003047818609047681, + "learning_rate": 2.8464073968162593e-05, + "loss": 0.0, + "num_input_tokens_seen": 11279272, + "step": 18500 + }, + { + "epoch": 5.103419746276889, + "grad_norm": 4.026084570796229e-05, + "learning_rate": 2.845215621243268e-05, + "loss": 0.0, + "num_input_tokens_seen": 11281512, + "step": 18505 + }, + { + "epoch": 5.104798676227247, + "grad_norm": 3.088998346356675e-05, + "learning_rate": 2.844023765688773e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11284712, + "step": 18510 + }, + { + "epoch": 5.1061776061776065, + "grad_norm": 0.0003976167645305395, + "learning_rate": 2.8428318304289098e-05, + "loss": 0.0, + "num_input_tokens_seen": 11288136, + "step": 18515 + }, + { + "epoch": 5.107556536127965, + "grad_norm": 0.011002707295119762, + "learning_rate": 2.8416398157398328e-05, + "loss": 0.0, + "num_input_tokens_seen": 11290472, + "step": 18520 + }, + { + "epoch": 5.108935466078323, + "grad_norm": 0.00018960333545692265, + "learning_rate": 2.8404477218977132e-05, + "loss": 0.0336, + "num_input_tokens_seen": 11293064, + "step": 18525 + }, + { + "epoch": 5.110314396028682, + "grad_norm": 0.0009883602615445852, + "learning_rate": 2.839255549178744e-05, + "loss": 0.0, + "num_input_tokens_seen": 11295656, + "step": 18530 + }, + { + "epoch": 5.11169332597904, + "grad_norm": 3.803193976636976e-05, + "learning_rate": 2.8380632978591348e-05, + "loss": 0.0, + "num_input_tokens_seen": 11298344, + "step": 18535 + }, + { + "epoch": 5.113072255929398, + "grad_norm": 0.00018561423348728567, + "learning_rate": 2.836870968215112e-05, + "loss": 0.0, + "num_input_tokens_seen": 11301256, + "step": 18540 + }, + { + "epoch": 5.114451185879758, + "grad_norm": 0.00716059748083353, + "learning_rate": 2.8356785605229207e-05, + "loss": 0.0, + "num_input_tokens_seen": 11303624, + "step": 18545 + }, + { + "epoch": 5.115830115830116, + "grad_norm": 0.000937186588998884, + "learning_rate": 2.834486075058826e-05, + "loss": 0.0, + "num_input_tokens_seen": 11306376, + "step": 18550 + }, + { + "epoch": 5.117209045780474, + "grad_norm": 8.845295087667182e-05, + "learning_rate": 2.8332935120991084e-05, + "loss": 0.0, + "num_input_tokens_seen": 11309544, + "step": 18555 + }, + { + "epoch": 5.118587975730833, + "grad_norm": 0.0006373701035045087, + "learning_rate": 2.832100871920068e-05, + "loss": 0.0, + "num_input_tokens_seen": 11311976, + "step": 18560 + }, + { + "epoch": 5.119966905681191, + "grad_norm": 0.018391739577054977, + "learning_rate": 2.8309081547980238e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11314376, + "step": 18565 + }, + { + "epoch": 5.1213458356315495, + "grad_norm": 0.0007665991433896124, + "learning_rate": 2.82971536100931e-05, + "loss": 0.0, + "num_input_tokens_seen": 11317736, + "step": 18570 + }, + { + "epoch": 5.122724765581909, + "grad_norm": 3.5510678571881726e-05, + "learning_rate": 2.8285224908302798e-05, + "loss": 0.0, + "num_input_tokens_seen": 11321928, + "step": 18575 + }, + { + "epoch": 5.124103695532267, + "grad_norm": 4.83159747091122e-05, + "learning_rate": 2.827329544537304e-05, + "loss": 0.0, + "num_input_tokens_seen": 11325352, + "step": 18580 + }, + { + "epoch": 5.125482625482626, + "grad_norm": 3.081112663494423e-05, + "learning_rate": 2.826136522406771e-05, + "loss": 0.0002, + "num_input_tokens_seen": 11327912, + "step": 18585 + }, + { + "epoch": 5.126861555432984, + "grad_norm": 2.094319097523112e-05, + "learning_rate": 2.8249434247150876e-05, + "loss": 0.0, + "num_input_tokens_seen": 11330760, + "step": 18590 + }, + { + "epoch": 5.128240485383342, + "grad_norm": 0.00012053106183884665, + "learning_rate": 2.8237502517386773e-05, + "loss": 0.0, + "num_input_tokens_seen": 11333800, + "step": 18595 + }, + { + "epoch": 5.129619415333701, + "grad_norm": 3.49541223840788e-05, + "learning_rate": 2.8225570037539807e-05, + "loss": 0.0, + "num_input_tokens_seen": 11336488, + "step": 18600 + }, + { + "epoch": 5.13099834528406, + "grad_norm": 0.0005371518200263381, + "learning_rate": 2.821363681037456e-05, + "loss": 0.0, + "num_input_tokens_seen": 11339208, + "step": 18605 + }, + { + "epoch": 5.132377275234418, + "grad_norm": 0.00024923577439039946, + "learning_rate": 2.820170283865581e-05, + "loss": 0.0, + "num_input_tokens_seen": 11341992, + "step": 18610 + }, + { + "epoch": 5.133756205184777, + "grad_norm": 0.0005735279992222786, + "learning_rate": 2.8189768125148452e-05, + "loss": 0.0, + "num_input_tokens_seen": 11344136, + "step": 18615 + }, + { + "epoch": 5.135135135135135, + "grad_norm": 5.487493399414234e-05, + "learning_rate": 2.8177832672617616e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11346792, + "step": 18620 + }, + { + "epoch": 5.1365140650854935, + "grad_norm": 0.00047260531573556364, + "learning_rate": 2.8165896483828563e-05, + "loss": 0.0, + "num_input_tokens_seen": 11349352, + "step": 18625 + }, + { + "epoch": 5.137892995035852, + "grad_norm": 5.5423046433134004e-05, + "learning_rate": 2.815395956154674e-05, + "loss": 0.0, + "num_input_tokens_seen": 11351784, + "step": 18630 + }, + { + "epoch": 5.139271924986211, + "grad_norm": 2.873146877391264e-05, + "learning_rate": 2.8142021908537765e-05, + "loss": 0.0, + "num_input_tokens_seen": 11355112, + "step": 18635 + }, + { + "epoch": 5.1406508549365695, + "grad_norm": 0.0001481424260418862, + "learning_rate": 2.8130083527567407e-05, + "loss": 0.0, + "num_input_tokens_seen": 11359528, + "step": 18640 + }, + { + "epoch": 5.142029784886928, + "grad_norm": 4.333034667070024e-06, + "learning_rate": 2.8118144421401634e-05, + "loss": 0.0, + "num_input_tokens_seen": 11361992, + "step": 18645 + }, + { + "epoch": 5.143408714837286, + "grad_norm": 1.667491414991673e-05, + "learning_rate": 2.810620459280655e-05, + "loss": 0.0, + "num_input_tokens_seen": 11364584, + "step": 18650 + }, + { + "epoch": 5.144787644787645, + "grad_norm": 3.340924195072148e-06, + "learning_rate": 2.8094264044548456e-05, + "loss": 0.0, + "num_input_tokens_seen": 11367592, + "step": 18655 + }, + { + "epoch": 5.146166574738003, + "grad_norm": 0.0014162285951897502, + "learning_rate": 2.8082322779393793e-05, + "loss": 0.0, + "num_input_tokens_seen": 11370120, + "step": 18660 + }, + { + "epoch": 5.147545504688362, + "grad_norm": 4.6271743485704064e-06, + "learning_rate": 2.8070380800109192e-05, + "loss": 0.0, + "num_input_tokens_seen": 11372776, + "step": 18665 + }, + { + "epoch": 5.148924434638721, + "grad_norm": 0.008528552949428558, + "learning_rate": 2.8058438109461434e-05, + "loss": 0.0, + "num_input_tokens_seen": 11375944, + "step": 18670 + }, + { + "epoch": 5.150303364589079, + "grad_norm": 0.00013738883717451245, + "learning_rate": 2.8046494710217465e-05, + "loss": 0.0, + "num_input_tokens_seen": 11378408, + "step": 18675 + }, + { + "epoch": 5.151682294539437, + "grad_norm": 0.0004283135349396616, + "learning_rate": 2.80345506051444e-05, + "loss": 0.0, + "num_input_tokens_seen": 11381896, + "step": 18680 + }, + { + "epoch": 5.153061224489796, + "grad_norm": 2.0834713723161258e-05, + "learning_rate": 2.8022605797009528e-05, + "loss": 0.0, + "num_input_tokens_seen": 11385032, + "step": 18685 + }, + { + "epoch": 5.154440154440154, + "grad_norm": 0.00011283083586022258, + "learning_rate": 2.8010660288580276e-05, + "loss": 0.0, + "num_input_tokens_seen": 11388072, + "step": 18690 + }, + { + "epoch": 5.155819084390513, + "grad_norm": 2.6316627554479055e-05, + "learning_rate": 2.799871408262425e-05, + "loss": 0.0, + "num_input_tokens_seen": 11391464, + "step": 18695 + }, + { + "epoch": 5.157198014340872, + "grad_norm": 3.2841944630490616e-05, + "learning_rate": 2.7986767181909207e-05, + "loss": 0.0, + "num_input_tokens_seen": 11394056, + "step": 18700 + }, + { + "epoch": 5.15857694429123, + "grad_norm": 8.467251973343082e-06, + "learning_rate": 2.7974819589203098e-05, + "loss": 0.0, + "num_input_tokens_seen": 11397960, + "step": 18705 + }, + { + "epoch": 5.1599558742415885, + "grad_norm": 0.0025831328239291906, + "learning_rate": 2.7962871307273985e-05, + "loss": 0.0, + "num_input_tokens_seen": 11401864, + "step": 18710 + }, + { + "epoch": 5.161334804191947, + "grad_norm": 8.350599819095805e-05, + "learning_rate": 2.7950922338890117e-05, + "loss": 0.0, + "num_input_tokens_seen": 11404712, + "step": 18715 + }, + { + "epoch": 5.162713734142305, + "grad_norm": 0.00011634387919912115, + "learning_rate": 2.7938972686819902e-05, + "loss": 0.0, + "num_input_tokens_seen": 11407240, + "step": 18720 + }, + { + "epoch": 5.164092664092664, + "grad_norm": 6.771163043595152e-06, + "learning_rate": 2.7927022353831912e-05, + "loss": 0.0, + "num_input_tokens_seen": 11411144, + "step": 18725 + }, + { + "epoch": 5.165471594043023, + "grad_norm": 0.00021697088959626853, + "learning_rate": 2.7915071342694854e-05, + "loss": 0.0, + "num_input_tokens_seen": 11414312, + "step": 18730 + }, + { + "epoch": 5.166850523993381, + "grad_norm": 0.00022283883299678564, + "learning_rate": 2.7903119656177602e-05, + "loss": 0.0, + "num_input_tokens_seen": 11416840, + "step": 18735 + }, + { + "epoch": 5.16822945394374, + "grad_norm": 1.7163707525469363e-05, + "learning_rate": 2.7891167297049204e-05, + "loss": 0.0, + "num_input_tokens_seen": 11419304, + "step": 18740 + }, + { + "epoch": 5.169608383894098, + "grad_norm": 1.1534324585227296e-05, + "learning_rate": 2.7879214268078847e-05, + "loss": 0.0, + "num_input_tokens_seen": 11421960, + "step": 18745 + }, + { + "epoch": 5.170987313844456, + "grad_norm": 3.784145155805163e-05, + "learning_rate": 2.7867260572035868e-05, + "loss": 0.0, + "num_input_tokens_seen": 11424328, + "step": 18750 + }, + { + "epoch": 5.172366243794815, + "grad_norm": 2.4750124794081785e-05, + "learning_rate": 2.7855306211689773e-05, + "loss": 0.0, + "num_input_tokens_seen": 11427560, + "step": 18755 + }, + { + "epoch": 5.173745173745174, + "grad_norm": 1.5742474715807475e-05, + "learning_rate": 2.7843351189810225e-05, + "loss": 0.0, + "num_input_tokens_seen": 11430024, + "step": 18760 + }, + { + "epoch": 5.175124103695532, + "grad_norm": 3.195325916749425e-05, + "learning_rate": 2.7831395509167008e-05, + "loss": 0.0, + "num_input_tokens_seen": 11432360, + "step": 18765 + }, + { + "epoch": 5.176503033645891, + "grad_norm": 0.06618854403495789, + "learning_rate": 2.7819439172530098e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11435688, + "step": 18770 + }, + { + "epoch": 5.177881963596249, + "grad_norm": 0.00035134507925249636, + "learning_rate": 2.7807482182669603e-05, + "loss": 0.0, + "num_input_tokens_seen": 11438152, + "step": 18775 + }, + { + "epoch": 5.179260893546608, + "grad_norm": 1.983231595659163e-05, + "learning_rate": 2.7795524542355793e-05, + "loss": 0.0, + "num_input_tokens_seen": 11441000, + "step": 18780 + }, + { + "epoch": 5.180639823496966, + "grad_norm": 0.0010131191229447722, + "learning_rate": 2.778356625435907e-05, + "loss": 0.0, + "num_input_tokens_seen": 11443816, + "step": 18785 + }, + { + "epoch": 5.182018753447325, + "grad_norm": 2.1803074560011737e-05, + "learning_rate": 2.777160732145e-05, + "loss": 0.0, + "num_input_tokens_seen": 11446504, + "step": 18790 + }, + { + "epoch": 5.183397683397684, + "grad_norm": 0.0006103437044657767, + "learning_rate": 2.7759647746399304e-05, + "loss": 0.0, + "num_input_tokens_seen": 11450568, + "step": 18795 + }, + { + "epoch": 5.184776613348042, + "grad_norm": 7.728749551461078e-06, + "learning_rate": 2.7747687531977833e-05, + "loss": 0.0, + "num_input_tokens_seen": 11453224, + "step": 18800 + }, + { + "epoch": 5.1861555432984, + "grad_norm": 0.00028297913377173245, + "learning_rate": 2.7735726680956604e-05, + "loss": 0.0, + "num_input_tokens_seen": 11456200, + "step": 18805 + }, + { + "epoch": 5.187534473248759, + "grad_norm": 9.639830386731774e-05, + "learning_rate": 2.772376519610677e-05, + "loss": 0.0, + "num_input_tokens_seen": 11459208, + "step": 18810 + }, + { + "epoch": 5.188913403199117, + "grad_norm": 0.00017381046200171113, + "learning_rate": 2.7711803080199644e-05, + "loss": 0.0, + "num_input_tokens_seen": 11462440, + "step": 18815 + }, + { + "epoch": 5.190292333149476, + "grad_norm": 9.051533561432734e-05, + "learning_rate": 2.7699840336006665e-05, + "loss": 0.0, + "num_input_tokens_seen": 11466280, + "step": 18820 + }, + { + "epoch": 5.191671263099835, + "grad_norm": 0.0002005735004786402, + "learning_rate": 2.7687876966299442e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11469864, + "step": 18825 + }, + { + "epoch": 5.193050193050193, + "grad_norm": 0.00038320320891216397, + "learning_rate": 2.7675912973849704e-05, + "loss": 0.0, + "num_input_tokens_seen": 11472456, + "step": 18830 + }, + { + "epoch": 5.1944291230005515, + "grad_norm": 7.393374289677013e-06, + "learning_rate": 2.7663948361429337e-05, + "loss": 0.0, + "num_input_tokens_seen": 11474600, + "step": 18835 + }, + { + "epoch": 5.19580805295091, + "grad_norm": 3.2522275432711467e-06, + "learning_rate": 2.7651983131810377e-05, + "loss": 0.0, + "num_input_tokens_seen": 11477480, + "step": 18840 + }, + { + "epoch": 5.197186982901268, + "grad_norm": 0.001511273323558271, + "learning_rate": 2.7640017287764996e-05, + "loss": 0.0, + "num_input_tokens_seen": 11480488, + "step": 18845 + }, + { + "epoch": 5.1985659128516275, + "grad_norm": 5.039964526076801e-05, + "learning_rate": 2.7628050832065506e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11482280, + "step": 18850 + }, + { + "epoch": 5.199944842801986, + "grad_norm": 0.001601710100658238, + "learning_rate": 2.7616083767484365e-05, + "loss": 0.0, + "num_input_tokens_seen": 11485480, + "step": 18855 + }, + { + "epoch": 5.201323772752344, + "grad_norm": 8.019601409614552e-06, + "learning_rate": 2.7604116096794157e-05, + "loss": 0.0, + "num_input_tokens_seen": 11488136, + "step": 18860 + }, + { + "epoch": 5.202702702702703, + "grad_norm": 9.529595445201267e-06, + "learning_rate": 2.759214782276764e-05, + "loss": 0.0, + "num_input_tokens_seen": 11490824, + "step": 18865 + }, + { + "epoch": 5.204081632653061, + "grad_norm": 4.9016425691661425e-06, + "learning_rate": 2.7580178948177677e-05, + "loss": 0.0, + "num_input_tokens_seen": 11494216, + "step": 18870 + }, + { + "epoch": 5.205460562603419, + "grad_norm": 1.0782207027659751e-05, + "learning_rate": 2.7568209475797285e-05, + "loss": 0.0, + "num_input_tokens_seen": 11497704, + "step": 18875 + }, + { + "epoch": 5.206839492553779, + "grad_norm": 0.00011339247430441901, + "learning_rate": 2.7556239408399625e-05, + "loss": 0.0006, + "num_input_tokens_seen": 11500872, + "step": 18880 + }, + { + "epoch": 5.208218422504137, + "grad_norm": 3.6560097669280367e-06, + "learning_rate": 2.7544268748757988e-05, + "loss": 0.0, + "num_input_tokens_seen": 11503528, + "step": 18885 + }, + { + "epoch": 5.209597352454495, + "grad_norm": 6.664823740720749e-05, + "learning_rate": 2.7532297499645805e-05, + "loss": 0.0, + "num_input_tokens_seen": 11506280, + "step": 18890 + }, + { + "epoch": 5.210976282404854, + "grad_norm": 9.149872312264051e-06, + "learning_rate": 2.7520325663836644e-05, + "loss": 0.0, + "num_input_tokens_seen": 11508968, + "step": 18895 + }, + { + "epoch": 5.212355212355212, + "grad_norm": 8.16818283055909e-06, + "learning_rate": 2.7508353244104203e-05, + "loss": 0.0, + "num_input_tokens_seen": 11512200, + "step": 18900 + }, + { + "epoch": 5.2137341423055705, + "grad_norm": 0.00029344536596909165, + "learning_rate": 2.7496380243222314e-05, + "loss": 0.0, + "num_input_tokens_seen": 11514376, + "step": 18905 + }, + { + "epoch": 5.21511307225593, + "grad_norm": 0.006535123568028212, + "learning_rate": 2.7484406663964964e-05, + "loss": 0.0, + "num_input_tokens_seen": 11518632, + "step": 18910 + }, + { + "epoch": 5.216492002206288, + "grad_norm": 2.7177613901585573e-06, + "learning_rate": 2.7472432509106248e-05, + "loss": 0.0, + "num_input_tokens_seen": 11522344, + "step": 18915 + }, + { + "epoch": 5.2178709321566465, + "grad_norm": 1.5097988580237143e-05, + "learning_rate": 2.746045778142041e-05, + "loss": 0.001, + "num_input_tokens_seen": 11525096, + "step": 18920 + }, + { + "epoch": 5.219249862107005, + "grad_norm": 5.546754437091295e-06, + "learning_rate": 2.7448482483681824e-05, + "loss": 0.0, + "num_input_tokens_seen": 11528680, + "step": 18925 + }, + { + "epoch": 5.220628792057363, + "grad_norm": 4.9331924856232945e-06, + "learning_rate": 2.7436506618664998e-05, + "loss": 0.0, + "num_input_tokens_seen": 11531464, + "step": 18930 + }, + { + "epoch": 5.222007722007722, + "grad_norm": 9.3277512860368e-06, + "learning_rate": 2.7424530189144558e-05, + "loss": 0.0, + "num_input_tokens_seen": 11534568, + "step": 18935 + }, + { + "epoch": 5.223386651958081, + "grad_norm": 2.4566363208577968e-05, + "learning_rate": 2.741255319789527e-05, + "loss": 0.0, + "num_input_tokens_seen": 11537832, + "step": 18940 + }, + { + "epoch": 5.224765581908439, + "grad_norm": 7.546358392573893e-05, + "learning_rate": 2.7400575647692046e-05, + "loss": 0.0, + "num_input_tokens_seen": 11540904, + "step": 18945 + }, + { + "epoch": 5.226144511858798, + "grad_norm": 1.1499056199681945e-05, + "learning_rate": 2.7388597541309902e-05, + "loss": 0.0, + "num_input_tokens_seen": 11544328, + "step": 18950 + }, + { + "epoch": 5.227523441809156, + "grad_norm": 0.00024401303380727768, + "learning_rate": 2.737661888152399e-05, + "loss": 0.0, + "num_input_tokens_seen": 11546600, + "step": 18955 + }, + { + "epoch": 5.2289023717595144, + "grad_norm": 2.486783887434285e-05, + "learning_rate": 2.7364639671109604e-05, + "loss": 0.0, + "num_input_tokens_seen": 11550120, + "step": 18960 + }, + { + "epoch": 5.230281301709873, + "grad_norm": 1.0770195331133436e-05, + "learning_rate": 2.7352659912842143e-05, + "loss": 0.0, + "num_input_tokens_seen": 11552776, + "step": 18965 + }, + { + "epoch": 5.231660231660232, + "grad_norm": 0.00012790512118954211, + "learning_rate": 2.734067960949716e-05, + "loss": 0.0, + "num_input_tokens_seen": 11555368, + "step": 18970 + }, + { + "epoch": 5.2330391616105905, + "grad_norm": 3.0358553431142354e-06, + "learning_rate": 2.7328698763850303e-05, + "loss": 0.0, + "num_input_tokens_seen": 11558184, + "step": 18975 + }, + { + "epoch": 5.234418091560949, + "grad_norm": 1.8067557903123088e-06, + "learning_rate": 2.731671737867737e-05, + "loss": 0.0, + "num_input_tokens_seen": 11560712, + "step": 18980 + }, + { + "epoch": 5.235797021511307, + "grad_norm": 1.0010614460043143e-05, + "learning_rate": 2.7304735456754282e-05, + "loss": 0.0, + "num_input_tokens_seen": 11565032, + "step": 18985 + }, + { + "epoch": 5.237175951461666, + "grad_norm": 3.8608061004197225e-05, + "learning_rate": 2.7292753000857064e-05, + "loss": 0.0, + "num_input_tokens_seen": 11567976, + "step": 18990 + }, + { + "epoch": 5.238554881412024, + "grad_norm": 3.0495213650283404e-05, + "learning_rate": 2.7280770013761893e-05, + "loss": 0.0, + "num_input_tokens_seen": 11570408, + "step": 18995 + }, + { + "epoch": 5.239933811362382, + "grad_norm": 4.098162389709614e-05, + "learning_rate": 2.726878649824504e-05, + "loss": 0.0, + "num_input_tokens_seen": 11573320, + "step": 19000 + }, + { + "epoch": 5.241312741312742, + "grad_norm": 0.0002882324333768338, + "learning_rate": 2.7256802457082938e-05, + "loss": 0.0, + "num_input_tokens_seen": 11577768, + "step": 19005 + }, + { + "epoch": 5.2426916712631, + "grad_norm": 9.334741662314627e-06, + "learning_rate": 2.724481789305208e-05, + "loss": 0.0, + "num_input_tokens_seen": 11581512, + "step": 19010 + }, + { + "epoch": 5.244070601213458, + "grad_norm": 45.52155303955078, + "learning_rate": 2.7232832808929155e-05, + "loss": 0.0938, + "num_input_tokens_seen": 11587528, + "step": 19015 + }, + { + "epoch": 5.245449531163817, + "grad_norm": 6.686383130727336e-06, + "learning_rate": 2.7220847207490913e-05, + "loss": 0.0, + "num_input_tokens_seen": 11590792, + "step": 19020 + }, + { + "epoch": 5.246828461114175, + "grad_norm": 0.002818429609760642, + "learning_rate": 2.720886109151426e-05, + "loss": 0.0, + "num_input_tokens_seen": 11593928, + "step": 19025 + }, + { + "epoch": 5.248207391064534, + "grad_norm": 0.0005158307612873614, + "learning_rate": 2.7196874463776196e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11596648, + "step": 19030 + }, + { + "epoch": 5.249586321014893, + "grad_norm": 3.646342065621866e-06, + "learning_rate": 2.718488732705385e-05, + "loss": 0.0, + "num_input_tokens_seen": 11599880, + "step": 19035 + }, + { + "epoch": 5.250965250965251, + "grad_norm": 2.2739623091183603e-05, + "learning_rate": 2.7172899684124475e-05, + "loss": 0.0, + "num_input_tokens_seen": 11603240, + "step": 19040 + }, + { + "epoch": 5.2523441809156095, + "grad_norm": 5.4090282901597675e-06, + "learning_rate": 2.7160911537765432e-05, + "loss": 0.0, + "num_input_tokens_seen": 11606248, + "step": 19045 + }, + { + "epoch": 5.253723110865968, + "grad_norm": 5.296373274177313e-05, + "learning_rate": 2.7148922890754203e-05, + "loss": 0.0, + "num_input_tokens_seen": 11608968, + "step": 19050 + }, + { + "epoch": 5.255102040816326, + "grad_norm": 2.5204730263794772e-05, + "learning_rate": 2.713693374586839e-05, + "loss": 0.0, + "num_input_tokens_seen": 11612584, + "step": 19055 + }, + { + "epoch": 5.256480970766685, + "grad_norm": 9.294372830481734e-06, + "learning_rate": 2.7124944105885704e-05, + "loss": 0.0, + "num_input_tokens_seen": 11616040, + "step": 19060 + }, + { + "epoch": 5.257859900717044, + "grad_norm": 2.354339585508569e-06, + "learning_rate": 2.711295397358397e-05, + "loss": 0.0, + "num_input_tokens_seen": 11619240, + "step": 19065 + }, + { + "epoch": 5.259238830667402, + "grad_norm": 2.7368410883354954e-05, + "learning_rate": 2.7100963351741127e-05, + "loss": 0.0, + "num_input_tokens_seen": 11624264, + "step": 19070 + }, + { + "epoch": 5.260617760617761, + "grad_norm": 0.3015938699245453, + "learning_rate": 2.7088972243135235e-05, + "loss": 0.0002, + "num_input_tokens_seen": 11627656, + "step": 19075 + }, + { + "epoch": 5.261996690568119, + "grad_norm": 0.00010436483717057854, + "learning_rate": 2.7076980650544465e-05, + "loss": 0.1229, + "num_input_tokens_seen": 11631112, + "step": 19080 + }, + { + "epoch": 5.263375620518477, + "grad_norm": 120.11204528808594, + "learning_rate": 2.7064988576747085e-05, + "loss": 0.0234, + "num_input_tokens_seen": 11635048, + "step": 19085 + }, + { + "epoch": 5.264754550468836, + "grad_norm": 6.684259915346047e-06, + "learning_rate": 2.7052996024521498e-05, + "loss": 0.0, + "num_input_tokens_seen": 11638856, + "step": 19090 + }, + { + "epoch": 5.266133480419195, + "grad_norm": 2.193404725403525e-05, + "learning_rate": 2.70410029966462e-05, + "loss": 0.0096, + "num_input_tokens_seen": 11641352, + "step": 19095 + }, + { + "epoch": 5.267512410369553, + "grad_norm": 0.005896823015064001, + "learning_rate": 2.702900949589981e-05, + "loss": 0.0, + "num_input_tokens_seen": 11644040, + "step": 19100 + }, + { + "epoch": 5.268891340319912, + "grad_norm": 0.006872141268104315, + "learning_rate": 2.701701552506104e-05, + "loss": 0.0, + "num_input_tokens_seen": 11646728, + "step": 19105 + }, + { + "epoch": 5.27027027027027, + "grad_norm": 1.4757583812752273e-05, + "learning_rate": 2.700502108690873e-05, + "loss": 0.0, + "num_input_tokens_seen": 11650504, + "step": 19110 + }, + { + "epoch": 5.2716492002206286, + "grad_norm": 5.268118002277333e-06, + "learning_rate": 2.6993026184221804e-05, + "loss": 0.0, + "num_input_tokens_seen": 11653384, + "step": 19115 + }, + { + "epoch": 5.273028130170987, + "grad_norm": 8.702993000042625e-06, + "learning_rate": 2.698103081977933e-05, + "loss": 0.0, + "num_input_tokens_seen": 11655784, + "step": 19120 + }, + { + "epoch": 5.274407060121346, + "grad_norm": 1.041301766235847e-05, + "learning_rate": 2.696903499636045e-05, + "loss": 0.0, + "num_input_tokens_seen": 11658696, + "step": 19125 + }, + { + "epoch": 5.275785990071705, + "grad_norm": 0.00010443369683343917, + "learning_rate": 2.695703871674442e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11661864, + "step": 19130 + }, + { + "epoch": 5.277164920022063, + "grad_norm": 0.0001308699429500848, + "learning_rate": 2.6945041983710616e-05, + "loss": 0.0, + "num_input_tokens_seen": 11665384, + "step": 19135 + }, + { + "epoch": 5.278543849972421, + "grad_norm": 0.00013869398389942944, + "learning_rate": 2.693304480003851e-05, + "loss": 0.0087, + "num_input_tokens_seen": 11668328, + "step": 19140 + }, + { + "epoch": 5.27992277992278, + "grad_norm": 0.0005929131875745952, + "learning_rate": 2.6921047168507668e-05, + "loss": 0.0, + "num_input_tokens_seen": 11670984, + "step": 19145 + }, + { + "epoch": 5.281301709873138, + "grad_norm": 0.0010357230203226209, + "learning_rate": 2.6909049091897766e-05, + "loss": 0.0006, + "num_input_tokens_seen": 11673992, + "step": 19150 + }, + { + "epoch": 5.282680639823497, + "grad_norm": 8.917280501918867e-05, + "learning_rate": 2.689705057298859e-05, + "loss": 0.0, + "num_input_tokens_seen": 11676392, + "step": 19155 + }, + { + "epoch": 5.284059569773856, + "grad_norm": 8.219642040785402e-06, + "learning_rate": 2.6885051614560042e-05, + "loss": 0.0, + "num_input_tokens_seen": 11682056, + "step": 19160 + }, + { + "epoch": 5.285438499724214, + "grad_norm": 0.001163663575425744, + "learning_rate": 2.6873052219392088e-05, + "loss": 0.0, + "num_input_tokens_seen": 11685032, + "step": 19165 + }, + { + "epoch": 5.2868174296745725, + "grad_norm": 0.001514492672868073, + "learning_rate": 2.6861052390264818e-05, + "loss": 0.0, + "num_input_tokens_seen": 11687880, + "step": 19170 + }, + { + "epoch": 5.288196359624931, + "grad_norm": 7.259602716658264e-05, + "learning_rate": 2.684905212995843e-05, + "loss": 0.0, + "num_input_tokens_seen": 11690024, + "step": 19175 + }, + { + "epoch": 5.289575289575289, + "grad_norm": 0.0001905220269691199, + "learning_rate": 2.68370514412532e-05, + "loss": 0.0, + "num_input_tokens_seen": 11693672, + "step": 19180 + }, + { + "epoch": 5.2909542195256485, + "grad_norm": 5.445872011478059e-05, + "learning_rate": 2.682505032692952e-05, + "loss": 0.0, + "num_input_tokens_seen": 11696968, + "step": 19185 + }, + { + "epoch": 5.292333149476007, + "grad_norm": 1.8255112081533298e-05, + "learning_rate": 2.6813048789767876e-05, + "loss": 0.0, + "num_input_tokens_seen": 11700872, + "step": 19190 + }, + { + "epoch": 5.293712079426365, + "grad_norm": 1.3719841263082344e-05, + "learning_rate": 2.680104683254886e-05, + "loss": 0.0, + "num_input_tokens_seen": 11703400, + "step": 19195 + }, + { + "epoch": 5.295091009376724, + "grad_norm": 4.924937093164772e-05, + "learning_rate": 2.6789044458053135e-05, + "loss": 0.0, + "num_input_tokens_seen": 11706856, + "step": 19200 + }, + { + "epoch": 5.296469939327082, + "grad_norm": 0.00035116897197440267, + "learning_rate": 2.6777041669061496e-05, + "loss": 0.0528, + "num_input_tokens_seen": 11708936, + "step": 19205 + }, + { + "epoch": 5.29784886927744, + "grad_norm": 4.050090865348466e-06, + "learning_rate": 2.6765038468354813e-05, + "loss": 0.0, + "num_input_tokens_seen": 11711848, + "step": 19210 + }, + { + "epoch": 5.2992277992278, + "grad_norm": 6.444350583478808e-05, + "learning_rate": 2.675303485871405e-05, + "loss": 0.0, + "num_input_tokens_seen": 11715880, + "step": 19215 + }, + { + "epoch": 5.300606729178158, + "grad_norm": 0.008194392547011375, + "learning_rate": 2.6741030842920273e-05, + "loss": 0.0, + "num_input_tokens_seen": 11717928, + "step": 19220 + }, + { + "epoch": 5.301985659128516, + "grad_norm": 1.5105777492863126e-05, + "learning_rate": 2.6729026423754645e-05, + "loss": 0.0, + "num_input_tokens_seen": 11721224, + "step": 19225 + }, + { + "epoch": 5.303364589078875, + "grad_norm": 2.6818619517143816e-05, + "learning_rate": 2.671702160399841e-05, + "loss": 0.0, + "num_input_tokens_seen": 11724328, + "step": 19230 + }, + { + "epoch": 5.304743519029233, + "grad_norm": 5.7637931604404e-05, + "learning_rate": 2.6705016386432924e-05, + "loss": 0.0, + "num_input_tokens_seen": 11728232, + "step": 19235 + }, + { + "epoch": 5.3061224489795915, + "grad_norm": 5.657831934513524e-06, + "learning_rate": 2.6693010773839605e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11730728, + "step": 19240 + }, + { + "epoch": 5.307501378929951, + "grad_norm": 9.28477766137803e-06, + "learning_rate": 2.6681004768999995e-05, + "loss": 0.0, + "num_input_tokens_seen": 11733896, + "step": 19245 + }, + { + "epoch": 5.308880308880309, + "grad_norm": 1.2151655027992092e-05, + "learning_rate": 2.6668998374695713e-05, + "loss": 0.0, + "num_input_tokens_seen": 11737672, + "step": 19250 + }, + { + "epoch": 5.3102592388306675, + "grad_norm": 1.2890697689726949e-05, + "learning_rate": 2.6656991593708464e-05, + "loss": 0.0, + "num_input_tokens_seen": 11741000, + "step": 19255 + }, + { + "epoch": 5.311638168781026, + "grad_norm": 3.986940555478213e-06, + "learning_rate": 2.6644984428820048e-05, + "loss": 0.0, + "num_input_tokens_seen": 11743688, + "step": 19260 + }, + { + "epoch": 5.313017098731384, + "grad_norm": 0.0003001881414093077, + "learning_rate": 2.663297688281235e-05, + "loss": 0.0, + "num_input_tokens_seen": 11746312, + "step": 19265 + }, + { + "epoch": 5.314396028681743, + "grad_norm": 3.876036498695612e-05, + "learning_rate": 2.6620968958467352e-05, + "loss": 0.0, + "num_input_tokens_seen": 11749512, + "step": 19270 + }, + { + "epoch": 5.315774958632101, + "grad_norm": 2.1423122234409675e-05, + "learning_rate": 2.6608960658567116e-05, + "loss": 0.0, + "num_input_tokens_seen": 11753864, + "step": 19275 + }, + { + "epoch": 5.31715388858246, + "grad_norm": 0.0023705177009105682, + "learning_rate": 2.6596951985893786e-05, + "loss": 0.0, + "num_input_tokens_seen": 11757448, + "step": 19280 + }, + { + "epoch": 5.318532818532819, + "grad_norm": 0.0013385703787207603, + "learning_rate": 2.6584942943229607e-05, + "loss": 0.0, + "num_input_tokens_seen": 11761288, + "step": 19285 + }, + { + "epoch": 5.319911748483177, + "grad_norm": 1.091577360057272e-05, + "learning_rate": 2.65729335333569e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11763944, + "step": 19290 + }, + { + "epoch": 5.321290678433535, + "grad_norm": 3.142530840705149e-05, + "learning_rate": 2.6560923759058064e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11766184, + "step": 19295 + }, + { + "epoch": 5.322669608383894, + "grad_norm": 8.149590939865448e-06, + "learning_rate": 2.6548913623115608e-05, + "loss": 0.0, + "num_input_tokens_seen": 11770472, + "step": 19300 + }, + { + "epoch": 5.324048538334253, + "grad_norm": 4.5416389184538275e-05, + "learning_rate": 2.6536903128312096e-05, + "loss": 0.0, + "num_input_tokens_seen": 11773000, + "step": 19305 + }, + { + "epoch": 5.325427468284611, + "grad_norm": 3.5506579934008187e-06, + "learning_rate": 2.6524892277430198e-05, + "loss": 0.0, + "num_input_tokens_seen": 11776104, + "step": 19310 + }, + { + "epoch": 5.32680639823497, + "grad_norm": 0.0002712023560889065, + "learning_rate": 2.651288107325264e-05, + "loss": 0.0, + "num_input_tokens_seen": 11779848, + "step": 19315 + }, + { + "epoch": 5.328185328185328, + "grad_norm": 4.832714330404997e-05, + "learning_rate": 2.6500869518562255e-05, + "loss": 0.0, + "num_input_tokens_seen": 11783368, + "step": 19320 + }, + { + "epoch": 5.329564258135687, + "grad_norm": 0.0001001887212623842, + "learning_rate": 2.6488857616141943e-05, + "loss": 0.0074, + "num_input_tokens_seen": 11786984, + "step": 19325 + }, + { + "epoch": 5.330943188086045, + "grad_norm": 0.00010570897575234994, + "learning_rate": 2.64768453687747e-05, + "loss": 0.0, + "num_input_tokens_seen": 11789448, + "step": 19330 + }, + { + "epoch": 5.332322118036403, + "grad_norm": 7.104261021595448e-05, + "learning_rate": 2.6464832779243574e-05, + "loss": 0.0, + "num_input_tokens_seen": 11792648, + "step": 19335 + }, + { + "epoch": 5.333701047986763, + "grad_norm": 6.110906724643428e-06, + "learning_rate": 2.6452819850331728e-05, + "loss": 0.0, + "num_input_tokens_seen": 11796648, + "step": 19340 + }, + { + "epoch": 5.335079977937121, + "grad_norm": 9.722103641252033e-06, + "learning_rate": 2.6440806584822385e-05, + "loss": 0.0, + "num_input_tokens_seen": 11799432, + "step": 19345 + }, + { + "epoch": 5.336458907887479, + "grad_norm": 3.0243986657296773e-06, + "learning_rate": 2.642879298549883e-05, + "loss": 0.0, + "num_input_tokens_seen": 11802408, + "step": 19350 + }, + { + "epoch": 5.337837837837838, + "grad_norm": 1.615182736713905e-05, + "learning_rate": 2.6416779055144447e-05, + "loss": 0.0, + "num_input_tokens_seen": 11804904, + "step": 19355 + }, + { + "epoch": 5.339216767788196, + "grad_norm": 0.06796339899301529, + "learning_rate": 2.6404764796542692e-05, + "loss": 0.0, + "num_input_tokens_seen": 11808072, + "step": 19360 + }, + { + "epoch": 5.3405956977385545, + "grad_norm": 4.979606728738872e-06, + "learning_rate": 2.6392750212477103e-05, + "loss": 0.0, + "num_input_tokens_seen": 11810760, + "step": 19365 + }, + { + "epoch": 5.341974627688914, + "grad_norm": 5.015745045966469e-06, + "learning_rate": 2.638073530573128e-05, + "loss": 0.0, + "num_input_tokens_seen": 11813800, + "step": 19370 + }, + { + "epoch": 5.343353557639272, + "grad_norm": 2.6952211555908434e-05, + "learning_rate": 2.636872007908891e-05, + "loss": 0.0001, + "num_input_tokens_seen": 11817576, + "step": 19375 + }, + { + "epoch": 5.3447324875896305, + "grad_norm": 2.6863940547627863e-06, + "learning_rate": 2.6356704535333742e-05, + "loss": 0.0, + "num_input_tokens_seen": 11820584, + "step": 19380 + }, + { + "epoch": 5.346111417539989, + "grad_norm": 2.711005072342232e-05, + "learning_rate": 2.6344688677249614e-05, + "loss": 0.0, + "num_input_tokens_seen": 11823528, + "step": 19385 + }, + { + "epoch": 5.347490347490347, + "grad_norm": 0.0012355012586340308, + "learning_rate": 2.6332672507620414e-05, + "loss": 0.0008, + "num_input_tokens_seen": 11826504, + "step": 19390 + }, + { + "epoch": 5.348869277440706, + "grad_norm": 6.754834885214223e-06, + "learning_rate": 2.6320656029230118e-05, + "loss": 0.0, + "num_input_tokens_seen": 11830024, + "step": 19395 + }, + { + "epoch": 5.350248207391065, + "grad_norm": 1.0167868822463788e-05, + "learning_rate": 2.6308639244862786e-05, + "loss": 0.0007, + "num_input_tokens_seen": 11832840, + "step": 19400 + }, + { + "epoch": 5.351627137341423, + "grad_norm": 1.1400449693610426e-05, + "learning_rate": 2.629662215730253e-05, + "loss": 0.0, + "num_input_tokens_seen": 11835784, + "step": 19405 + }, + { + "epoch": 5.353006067291782, + "grad_norm": 1.4979115803726017e-05, + "learning_rate": 2.6284604769333527e-05, + "loss": 0.0, + "num_input_tokens_seen": 11841160, + "step": 19410 + }, + { + "epoch": 5.35438499724214, + "grad_norm": 1.836961246226565e-06, + "learning_rate": 2.6272587083740036e-05, + "loss": 0.0, + "num_input_tokens_seen": 11843688, + "step": 19415 + }, + { + "epoch": 5.355763927192498, + "grad_norm": 0.00030549601069651544, + "learning_rate": 2.6260569103306387e-05, + "loss": 0.0, + "num_input_tokens_seen": 11847336, + "step": 19420 + }, + { + "epoch": 5.357142857142857, + "grad_norm": 1.2148336281825323e-05, + "learning_rate": 2.6248550830816977e-05, + "loss": 0.0, + "num_input_tokens_seen": 11849896, + "step": 19425 + }, + { + "epoch": 5.358521787093216, + "grad_norm": 2.6229859031445812e-06, + "learning_rate": 2.623653226905625e-05, + "loss": 0.0, + "num_input_tokens_seen": 11852552, + "step": 19430 + }, + { + "epoch": 5.359900717043574, + "grad_norm": 5.5435615649912506e-05, + "learning_rate": 2.6224513420808755e-05, + "loss": 0.0, + "num_input_tokens_seen": 11854984, + "step": 19435 + }, + { + "epoch": 5.361279646993933, + "grad_norm": 0.0003097957815043628, + "learning_rate": 2.621249428885908e-05, + "loss": 0.0, + "num_input_tokens_seen": 11858216, + "step": 19440 + }, + { + "epoch": 5.362658576944291, + "grad_norm": 3.7678291846532375e-06, + "learning_rate": 2.6200474875991882e-05, + "loss": 0.0, + "num_input_tokens_seen": 11860968, + "step": 19445 + }, + { + "epoch": 5.3640375068946495, + "grad_norm": 9.846678040048573e-06, + "learning_rate": 2.618845518499188e-05, + "loss": 0.0, + "num_input_tokens_seen": 11863976, + "step": 19450 + }, + { + "epoch": 5.365416436845008, + "grad_norm": 5.6469325500074774e-05, + "learning_rate": 2.6176435218643874e-05, + "loss": 0.0, + "num_input_tokens_seen": 11866856, + "step": 19455 + }, + { + "epoch": 5.366795366795367, + "grad_norm": 4.3162094698345754e-06, + "learning_rate": 2.6164414979732715e-05, + "loss": 0.0, + "num_input_tokens_seen": 11869768, + "step": 19460 + }, + { + "epoch": 5.3681742967457255, + "grad_norm": 4.179262305115117e-06, + "learning_rate": 2.6152394471043324e-05, + "loss": 0.0, + "num_input_tokens_seen": 11872424, + "step": 19465 + }, + { + "epoch": 5.369553226696084, + "grad_norm": 3.748332892428152e-05, + "learning_rate": 2.614037369536067e-05, + "loss": 0.0, + "num_input_tokens_seen": 11876776, + "step": 19470 + }, + { + "epoch": 5.370932156646442, + "grad_norm": 3.2553514756727964e-05, + "learning_rate": 2.61283526554698e-05, + "loss": 0.0, + "num_input_tokens_seen": 11879272, + "step": 19475 + }, + { + "epoch": 5.372311086596801, + "grad_norm": 2.936827058874769e-06, + "learning_rate": 2.6116331354155826e-05, + "loss": 0.0, + "num_input_tokens_seen": 11881768, + "step": 19480 + }, + { + "epoch": 5.373690016547159, + "grad_norm": 3.3654391700110864e-06, + "learning_rate": 2.6104309794203897e-05, + "loss": 0.0, + "num_input_tokens_seen": 11883976, + "step": 19485 + }, + { + "epoch": 5.375068946497518, + "grad_norm": 5.817765668325592e-06, + "learning_rate": 2.6092287978399238e-05, + "loss": 0.0, + "num_input_tokens_seen": 11886664, + "step": 19490 + }, + { + "epoch": 5.376447876447877, + "grad_norm": 6.24009917373769e-05, + "learning_rate": 2.608026590952713e-05, + "loss": 0.0, + "num_input_tokens_seen": 11890728, + "step": 19495 + }, + { + "epoch": 5.377826806398235, + "grad_norm": 4.391209131426876e-06, + "learning_rate": 2.606824359037292e-05, + "loss": 0.0, + "num_input_tokens_seen": 11894504, + "step": 19500 + }, + { + "epoch": 5.3792057363485934, + "grad_norm": 1.036530466080876e-05, + "learning_rate": 2.6056221023722005e-05, + "loss": 0.0336, + "num_input_tokens_seen": 11897896, + "step": 19505 + }, + { + "epoch": 5.380584666298952, + "grad_norm": 4.269340479368111e-06, + "learning_rate": 2.6044198212359845e-05, + "loss": 0.0, + "num_input_tokens_seen": 11901256, + "step": 19510 + }, + { + "epoch": 5.38196359624931, + "grad_norm": 9.916256203723606e-06, + "learning_rate": 2.6032175159071947e-05, + "loss": 0.0, + "num_input_tokens_seen": 11905064, + "step": 19515 + }, + { + "epoch": 5.3833425261996695, + "grad_norm": 7.63983371143695e-06, + "learning_rate": 2.6020151866643875e-05, + "loss": 0.0, + "num_input_tokens_seen": 11907496, + "step": 19520 + }, + { + "epoch": 5.384721456150028, + "grad_norm": 1.2291073289816268e-05, + "learning_rate": 2.600812833786126e-05, + "loss": 0.0, + "num_input_tokens_seen": 11910344, + "step": 19525 + }, + { + "epoch": 5.386100386100386, + "grad_norm": 4.310496478865389e-06, + "learning_rate": 2.5996104575509784e-05, + "loss": 0.0, + "num_input_tokens_seen": 11913416, + "step": 19530 + }, + { + "epoch": 5.387479316050745, + "grad_norm": 2.007289913308341e-05, + "learning_rate": 2.5984080582375175e-05, + "loss": 0.0, + "num_input_tokens_seen": 11917000, + "step": 19535 + }, + { + "epoch": 5.388858246001103, + "grad_norm": 7.402063602057751e-06, + "learning_rate": 2.597205636124322e-05, + "loss": 0.0, + "num_input_tokens_seen": 11920072, + "step": 19540 + }, + { + "epoch": 5.390237175951461, + "grad_norm": 6.450594082707539e-05, + "learning_rate": 2.5960031914899758e-05, + "loss": 0.0, + "num_input_tokens_seen": 11922408, + "step": 19545 + }, + { + "epoch": 5.39161610590182, + "grad_norm": 1.9770632206927985e-06, + "learning_rate": 2.5948007246130683e-05, + "loss": 0.0, + "num_input_tokens_seen": 11925736, + "step": 19550 + }, + { + "epoch": 5.392995035852179, + "grad_norm": 5.307466835802188e-06, + "learning_rate": 2.5935982357721944e-05, + "loss": 0.0, + "num_input_tokens_seen": 11928424, + "step": 19555 + }, + { + "epoch": 5.394373965802537, + "grad_norm": 0.0007169678574427962, + "learning_rate": 2.592395725245952e-05, + "loss": 0.0, + "num_input_tokens_seen": 11932136, + "step": 19560 + }, + { + "epoch": 5.395752895752896, + "grad_norm": 1.8298042050446384e-05, + "learning_rate": 2.5911931933129462e-05, + "loss": 0.0, + "num_input_tokens_seen": 11934952, + "step": 19565 + }, + { + "epoch": 5.397131825703254, + "grad_norm": 4.414226168591995e-06, + "learning_rate": 2.5899906402517865e-05, + "loss": 0.0, + "num_input_tokens_seen": 11938024, + "step": 19570 + }, + { + "epoch": 5.3985107556536125, + "grad_norm": 0.0001231074274983257, + "learning_rate": 2.588788066341088e-05, + "loss": 0.0, + "num_input_tokens_seen": 11941064, + "step": 19575 + }, + { + "epoch": 5.399889685603972, + "grad_norm": 3.5818211472360417e-06, + "learning_rate": 2.587585471859469e-05, + "loss": 0.0, + "num_input_tokens_seen": 11943880, + "step": 19580 + }, + { + "epoch": 5.40126861555433, + "grad_norm": 5.7879592532117385e-06, + "learning_rate": 2.5863828570855532e-05, + "loss": 0.0, + "num_input_tokens_seen": 11946664, + "step": 19585 + }, + { + "epoch": 5.4026475455046885, + "grad_norm": 1.1836438716272824e-05, + "learning_rate": 2.5851802222979686e-05, + "loss": 0.0, + "num_input_tokens_seen": 11948840, + "step": 19590 + }, + { + "epoch": 5.404026475455047, + "grad_norm": 3.18346292260685e-06, + "learning_rate": 2.583977567775351e-05, + "loss": 0.0, + "num_input_tokens_seen": 11951304, + "step": 19595 + }, + { + "epoch": 5.405405405405405, + "grad_norm": 1.3061618119536433e-05, + "learning_rate": 2.582774893796334e-05, + "loss": 0.0, + "num_input_tokens_seen": 11954280, + "step": 19600 + }, + { + "epoch": 5.406784335355764, + "grad_norm": 1.0215990187134594e-05, + "learning_rate": 2.581572200639564e-05, + "loss": 0.0, + "num_input_tokens_seen": 11957192, + "step": 19605 + }, + { + "epoch": 5.408163265306122, + "grad_norm": 0.002089559566229582, + "learning_rate": 2.580369488583686e-05, + "loss": 0.0, + "num_input_tokens_seen": 11960424, + "step": 19610 + }, + { + "epoch": 5.409542195256481, + "grad_norm": 5.402932220022194e-05, + "learning_rate": 2.5791667579073504e-05, + "loss": 0.0, + "num_input_tokens_seen": 11963048, + "step": 19615 + }, + { + "epoch": 5.41092112520684, + "grad_norm": 4.2466363083804026e-05, + "learning_rate": 2.5779640088892136e-05, + "loss": 0.0, + "num_input_tokens_seen": 11966024, + "step": 19620 + }, + { + "epoch": 5.412300055157198, + "grad_norm": 1.2346576113486663e-05, + "learning_rate": 2.5767612418079352e-05, + "loss": 0.0, + "num_input_tokens_seen": 11968328, + "step": 19625 + }, + { + "epoch": 5.413678985107556, + "grad_norm": 2.782464662232087e-06, + "learning_rate": 2.5755584569421793e-05, + "loss": 0.0, + "num_input_tokens_seen": 11970920, + "step": 19630 + }, + { + "epoch": 5.415057915057915, + "grad_norm": 1.3425101315078791e-05, + "learning_rate": 2.5743556545706128e-05, + "loss": 0.0, + "num_input_tokens_seen": 11973864, + "step": 19635 + }, + { + "epoch": 5.416436845008274, + "grad_norm": 1.627952224225737e-05, + "learning_rate": 2.5731528349719086e-05, + "loss": 0.0, + "num_input_tokens_seen": 11977000, + "step": 19640 + }, + { + "epoch": 5.417815774958632, + "grad_norm": 0.00037915559369139373, + "learning_rate": 2.571949998424743e-05, + "loss": 0.0, + "num_input_tokens_seen": 11979432, + "step": 19645 + }, + { + "epoch": 5.419194704908991, + "grad_norm": 2.2822632672614418e-05, + "learning_rate": 2.570747145207796e-05, + "loss": 0.0, + "num_input_tokens_seen": 11983112, + "step": 19650 + }, + { + "epoch": 5.420573634859349, + "grad_norm": 9.203336958307773e-06, + "learning_rate": 2.56954427559975e-05, + "loss": 0.0, + "num_input_tokens_seen": 11985704, + "step": 19655 + }, + { + "epoch": 5.4219525648097076, + "grad_norm": 5.229988346400205e-06, + "learning_rate": 2.568341389879294e-05, + "loss": 0.0, + "num_input_tokens_seen": 11989192, + "step": 19660 + }, + { + "epoch": 5.423331494760066, + "grad_norm": 4.89458261654363e-06, + "learning_rate": 2.5671384883251187e-05, + "loss": 0.0, + "num_input_tokens_seen": 11992168, + "step": 19665 + }, + { + "epoch": 5.424710424710424, + "grad_norm": 1.1025107596651651e-05, + "learning_rate": 2.5659355712159194e-05, + "loss": 0.0, + "num_input_tokens_seen": 11994696, + "step": 19670 + }, + { + "epoch": 5.426089354660784, + "grad_norm": 0.006818878464400768, + "learning_rate": 2.564732638830395e-05, + "loss": 0.0, + "num_input_tokens_seen": 11997608, + "step": 19675 + }, + { + "epoch": 5.427468284611142, + "grad_norm": 0.0011233114637434483, + "learning_rate": 2.5635296914472475e-05, + "loss": 0.0, + "num_input_tokens_seen": 12001864, + "step": 19680 + }, + { + "epoch": 5.4288472145615, + "grad_norm": 3.2107818697113544e-06, + "learning_rate": 2.5623267293451826e-05, + "loss": 0.0, + "num_input_tokens_seen": 12004808, + "step": 19685 + }, + { + "epoch": 5.430226144511859, + "grad_norm": 1.3817490980727598e-05, + "learning_rate": 2.5611237528029085e-05, + "loss": 0.0, + "num_input_tokens_seen": 12007400, + "step": 19690 + }, + { + "epoch": 5.431605074462217, + "grad_norm": 8.070317562669516e-05, + "learning_rate": 2.559920762099139e-05, + "loss": 0.0, + "num_input_tokens_seen": 12011048, + "step": 19695 + }, + { + "epoch": 5.4329840044125755, + "grad_norm": 1.1647745850495994e-05, + "learning_rate": 2.5587177575125886e-05, + "loss": 0.0, + "num_input_tokens_seen": 12013992, + "step": 19700 + }, + { + "epoch": 5.434362934362935, + "grad_norm": 3.3761289159883745e-06, + "learning_rate": 2.5575147393219766e-05, + "loss": 0.0, + "num_input_tokens_seen": 12016904, + "step": 19705 + }, + { + "epoch": 5.435741864313293, + "grad_norm": 7.062137683533365e-06, + "learning_rate": 2.556311707806025e-05, + "loss": 0.0, + "num_input_tokens_seen": 12019976, + "step": 19710 + }, + { + "epoch": 5.4371207942636515, + "grad_norm": 1.3777957974525634e-05, + "learning_rate": 2.5551086632434594e-05, + "loss": 0.0, + "num_input_tokens_seen": 12022952, + "step": 19715 + }, + { + "epoch": 5.43849972421401, + "grad_norm": 6.188243423821405e-05, + "learning_rate": 2.5539056059130073e-05, + "loss": 0.0, + "num_input_tokens_seen": 12025960, + "step": 19720 + }, + { + "epoch": 5.439878654164368, + "grad_norm": 2.9871029255446047e-05, + "learning_rate": 2.5527025360934003e-05, + "loss": 0.0, + "num_input_tokens_seen": 12028584, + "step": 19725 + }, + { + "epoch": 5.441257584114727, + "grad_norm": 5.2069066441617906e-05, + "learning_rate": 2.5514994540633723e-05, + "loss": 0.0, + "num_input_tokens_seen": 12031944, + "step": 19730 + }, + { + "epoch": 5.442636514065086, + "grad_norm": 5.571359724854119e-05, + "learning_rate": 2.55029636010166e-05, + "loss": 0.0, + "num_input_tokens_seen": 12035688, + "step": 19735 + }, + { + "epoch": 5.444015444015444, + "grad_norm": 9.604597835277673e-06, + "learning_rate": 2.5490932544870028e-05, + "loss": 0.0855, + "num_input_tokens_seen": 12037896, + "step": 19740 + }, + { + "epoch": 5.445394373965803, + "grad_norm": 3.115604158665519e-06, + "learning_rate": 2.547890137498143e-05, + "loss": 0.0, + "num_input_tokens_seen": 12040392, + "step": 19745 + }, + { + "epoch": 5.446773303916161, + "grad_norm": 2.02921455638716e-05, + "learning_rate": 2.5466870094138262e-05, + "loss": 0.0, + "num_input_tokens_seen": 12043016, + "step": 19750 + }, + { + "epoch": 5.448152233866519, + "grad_norm": 7.294441729754908e-06, + "learning_rate": 2.5454838705127993e-05, + "loss": 0.0, + "num_input_tokens_seen": 12046888, + "step": 19755 + }, + { + "epoch": 5.449531163816878, + "grad_norm": 85.9041748046875, + "learning_rate": 2.5442807210738123e-05, + "loss": 0.0629, + "num_input_tokens_seen": 12049576, + "step": 19760 + }, + { + "epoch": 5.450910093767237, + "grad_norm": 0.00026519381208345294, + "learning_rate": 2.5430775613756186e-05, + "loss": 0.0, + "num_input_tokens_seen": 12052296, + "step": 19765 + }, + { + "epoch": 5.452289023717595, + "grad_norm": 7.203961104096379e-06, + "learning_rate": 2.541874391696972e-05, + "loss": 0.0, + "num_input_tokens_seen": 12054600, + "step": 19770 + }, + { + "epoch": 5.453667953667954, + "grad_norm": 9.387076715938747e-06, + "learning_rate": 2.5406712123166293e-05, + "loss": 0.0, + "num_input_tokens_seen": 12057000, + "step": 19775 + }, + { + "epoch": 5.455046883618312, + "grad_norm": 0.0007815684075467288, + "learning_rate": 2.539468023513351e-05, + "loss": 0.0, + "num_input_tokens_seen": 12059656, + "step": 19780 + }, + { + "epoch": 5.4564258135686705, + "grad_norm": 4.441920737008331e-06, + "learning_rate": 2.538264825565898e-05, + "loss": 0.0, + "num_input_tokens_seen": 12061896, + "step": 19785 + }, + { + "epoch": 5.457804743519029, + "grad_norm": 2.2449545213021338e-05, + "learning_rate": 2.5370616187530344e-05, + "loss": 0.0, + "num_input_tokens_seen": 12065064, + "step": 19790 + }, + { + "epoch": 5.459183673469388, + "grad_norm": 0.0009928677463904023, + "learning_rate": 2.5358584033535255e-05, + "loss": 0.0, + "num_input_tokens_seen": 12068456, + "step": 19795 + }, + { + "epoch": 5.4605626034197465, + "grad_norm": 5.600308577413671e-05, + "learning_rate": 2.53465517964614e-05, + "loss": 0.0, + "num_input_tokens_seen": 12071752, + "step": 19800 + }, + { + "epoch": 5.461941533370105, + "grad_norm": 9.479579603066668e-06, + "learning_rate": 2.5334519479096463e-05, + "loss": 0.0, + "num_input_tokens_seen": 12074504, + "step": 19805 + }, + { + "epoch": 5.463320463320463, + "grad_norm": 3.5182081774109975e-05, + "learning_rate": 2.5322487084228158e-05, + "loss": 0.0, + "num_input_tokens_seen": 12076776, + "step": 19810 + }, + { + "epoch": 5.464699393270822, + "grad_norm": 4.610674295690842e-06, + "learning_rate": 2.531045461464423e-05, + "loss": 0.0, + "num_input_tokens_seen": 12080072, + "step": 19815 + }, + { + "epoch": 5.46607832322118, + "grad_norm": 0.00013471575221046805, + "learning_rate": 2.5298422073132427e-05, + "loss": 0.0, + "num_input_tokens_seen": 12083368, + "step": 19820 + }, + { + "epoch": 5.467457253171539, + "grad_norm": 0.00037670941674150527, + "learning_rate": 2.5286389462480513e-05, + "loss": 0.0, + "num_input_tokens_seen": 12086952, + "step": 19825 + }, + { + "epoch": 5.468836183121898, + "grad_norm": 5.169164523977088e-06, + "learning_rate": 2.527435678547627e-05, + "loss": 0.0, + "num_input_tokens_seen": 12089384, + "step": 19830 + }, + { + "epoch": 5.470215113072256, + "grad_norm": 2.3606766262673773e-05, + "learning_rate": 2.5262324044907497e-05, + "loss": 0.0, + "num_input_tokens_seen": 12092936, + "step": 19835 + }, + { + "epoch": 5.471594043022614, + "grad_norm": 0.0007175630307756364, + "learning_rate": 2.5250291243562013e-05, + "loss": 0.0, + "num_input_tokens_seen": 12095336, + "step": 19840 + }, + { + "epoch": 5.472972972972973, + "grad_norm": 5.701823465642519e-05, + "learning_rate": 2.523825838422763e-05, + "loss": 0.0, + "num_input_tokens_seen": 12097832, + "step": 19845 + }, + { + "epoch": 5.474351902923331, + "grad_norm": 9.216711623594165e-05, + "learning_rate": 2.5226225469692205e-05, + "loss": 0.0, + "num_input_tokens_seen": 12100904, + "step": 19850 + }, + { + "epoch": 5.4757308328736904, + "grad_norm": 0.00010191972978645936, + "learning_rate": 2.521419250274359e-05, + "loss": 0.0, + "num_input_tokens_seen": 12103880, + "step": 19855 + }, + { + "epoch": 5.477109762824049, + "grad_norm": 0.0002268807147629559, + "learning_rate": 2.5202159486169646e-05, + "loss": 0.0, + "num_input_tokens_seen": 12106376, + "step": 19860 + }, + { + "epoch": 5.478488692774407, + "grad_norm": 1.5420371710206382e-05, + "learning_rate": 2.519012642275825e-05, + "loss": 0.1479, + "num_input_tokens_seen": 12109032, + "step": 19865 + }, + { + "epoch": 5.479867622724766, + "grad_norm": 6.267955177463591e-05, + "learning_rate": 2.517809331529729e-05, + "loss": 0.0, + "num_input_tokens_seen": 12111560, + "step": 19870 + }, + { + "epoch": 5.481246552675124, + "grad_norm": 0.00038699491415172815, + "learning_rate": 2.5166060166574664e-05, + "loss": 0.0, + "num_input_tokens_seen": 12113960, + "step": 19875 + }, + { + "epoch": 5.482625482625482, + "grad_norm": 0.001941961352713406, + "learning_rate": 2.5154026979378282e-05, + "loss": 0.0, + "num_input_tokens_seen": 12116552, + "step": 19880 + }, + { + "epoch": 5.484004412575841, + "grad_norm": 0.00016707975009921938, + "learning_rate": 2.5141993756496057e-05, + "loss": 0.0003, + "num_input_tokens_seen": 12120264, + "step": 19885 + }, + { + "epoch": 5.4853833425262, + "grad_norm": 0.0003570041444618255, + "learning_rate": 2.5129960500715923e-05, + "loss": 0.0, + "num_input_tokens_seen": 12123464, + "step": 19890 + }, + { + "epoch": 5.486762272476558, + "grad_norm": 6.63344471831806e-05, + "learning_rate": 2.511792721482581e-05, + "loss": 0.0, + "num_input_tokens_seen": 12126856, + "step": 19895 + }, + { + "epoch": 5.488141202426917, + "grad_norm": 0.007842816412448883, + "learning_rate": 2.5105893901613643e-05, + "loss": 0.0, + "num_input_tokens_seen": 12129608, + "step": 19900 + }, + { + "epoch": 5.489520132377275, + "grad_norm": 4.976615673513152e-05, + "learning_rate": 2.5093860563867384e-05, + "loss": 0.0, + "num_input_tokens_seen": 12132488, + "step": 19905 + }, + { + "epoch": 5.4908990623276335, + "grad_norm": 0.0044846138916909695, + "learning_rate": 2.508182720437498e-05, + "loss": 0.0, + "num_input_tokens_seen": 12135656, + "step": 19910 + }, + { + "epoch": 5.492277992277993, + "grad_norm": 0.00021780414681416005, + "learning_rate": 2.5069793825924387e-05, + "loss": 0.0, + "num_input_tokens_seen": 12138760, + "step": 19915 + }, + { + "epoch": 5.493656922228351, + "grad_norm": 4.146758510614745e-05, + "learning_rate": 2.5057760431303562e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12141992, + "step": 19920 + }, + { + "epoch": 5.4950358521787095, + "grad_norm": 0.0011159765999764204, + "learning_rate": 2.504572702330048e-05, + "loss": 0.0, + "num_input_tokens_seen": 12145480, + "step": 19925 + }, + { + "epoch": 5.496414782129068, + "grad_norm": 0.00014506281877402216, + "learning_rate": 2.50336936047031e-05, + "loss": 0.0002, + "num_input_tokens_seen": 12148008, + "step": 19930 + }, + { + "epoch": 5.497793712079426, + "grad_norm": 0.00015192478895187378, + "learning_rate": 2.50216601782994e-05, + "loss": 0.0, + "num_input_tokens_seen": 12151336, + "step": 19935 + }, + { + "epoch": 5.499172642029785, + "grad_norm": 9.494089681538753e-06, + "learning_rate": 2.5009626746877345e-05, + "loss": 0.0, + "num_input_tokens_seen": 12155208, + "step": 19940 + }, + { + "epoch": 5.5, + "eval_loss": 0.2586555480957031, + "eval_runtime": 28.4925, + "eval_samples_per_second": 56.576, + "eval_steps_per_second": 14.144, + "num_input_tokens_seen": 12157032, + "step": 19943 + }, + { + "epoch": 5.500551571980143, + "grad_norm": 0.002198194619268179, + "learning_rate": 2.499759331322491e-05, + "loss": 0.0, + "num_input_tokens_seen": 12159176, + "step": 19945 + }, + { + "epoch": 5.501930501930502, + "grad_norm": 2.3999131371965632e-05, + "learning_rate": 2.498555988013007e-05, + "loss": 0.0, + "num_input_tokens_seen": 12163304, + "step": 19950 + }, + { + "epoch": 5.503309431880861, + "grad_norm": 0.0004315305850468576, + "learning_rate": 2.497352645038079e-05, + "loss": 0.0, + "num_input_tokens_seen": 12166248, + "step": 19955 + }, + { + "epoch": 5.504688361831219, + "grad_norm": 0.0004786110657732934, + "learning_rate": 2.4961493026765064e-05, + "loss": 0.0, + "num_input_tokens_seen": 12169480, + "step": 19960 + }, + { + "epoch": 5.506067291781577, + "grad_norm": 0.00010326381743652746, + "learning_rate": 2.4949459612070856e-05, + "loss": 0.0, + "num_input_tokens_seen": 12171912, + "step": 19965 + }, + { + "epoch": 5.507446221731936, + "grad_norm": 0.0026989257894456387, + "learning_rate": 2.4937426209086113e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12175496, + "step": 19970 + }, + { + "epoch": 5.508825151682295, + "grad_norm": 4.132794856559485e-05, + "learning_rate": 2.4925392820598833e-05, + "loss": 0.0, + "num_input_tokens_seen": 12178344, + "step": 19975 + }, + { + "epoch": 5.510204081632653, + "grad_norm": 0.001299477182328701, + "learning_rate": 2.4913359449396958e-05, + "loss": 0.0, + "num_input_tokens_seen": 12181064, + "step": 19980 + }, + { + "epoch": 5.511583011583012, + "grad_norm": 9.486533599556424e-06, + "learning_rate": 2.4901326098268452e-05, + "loss": 0.0, + "num_input_tokens_seen": 12183976, + "step": 19985 + }, + { + "epoch": 5.51296194153337, + "grad_norm": 7.235557131934911e-05, + "learning_rate": 2.488929277000128e-05, + "loss": 0.0, + "num_input_tokens_seen": 12187240, + "step": 19990 + }, + { + "epoch": 5.5143408714837285, + "grad_norm": 1.7006470443448052e-05, + "learning_rate": 2.4877259467383376e-05, + "loss": 0.0, + "num_input_tokens_seen": 12190568, + "step": 19995 + }, + { + "epoch": 5.515719801434087, + "grad_norm": 3.732918003152008e-06, + "learning_rate": 2.4865226193202704e-05, + "loss": 0.0, + "num_input_tokens_seen": 12193896, + "step": 20000 + }, + { + "epoch": 5.517098731384445, + "grad_norm": 0.0003128045063931495, + "learning_rate": 2.4853192950247187e-05, + "loss": 0.0, + "num_input_tokens_seen": 12197544, + "step": 20005 + }, + { + "epoch": 5.5184776613348046, + "grad_norm": 8.296292435261421e-06, + "learning_rate": 2.4841159741304743e-05, + "loss": 0.0, + "num_input_tokens_seen": 12200392, + "step": 20010 + }, + { + "epoch": 5.519856591285163, + "grad_norm": 0.0002964120649266988, + "learning_rate": 2.4829126569163318e-05, + "loss": 0.0, + "num_input_tokens_seen": 12203720, + "step": 20015 + }, + { + "epoch": 5.521235521235521, + "grad_norm": 0.0001954317995114252, + "learning_rate": 2.481709343661081e-05, + "loss": 0.0, + "num_input_tokens_seen": 12207336, + "step": 20020 + }, + { + "epoch": 5.52261445118588, + "grad_norm": 2.746974496403709e-05, + "learning_rate": 2.4805060346435125e-05, + "loss": 0.0, + "num_input_tokens_seen": 12211336, + "step": 20025 + }, + { + "epoch": 5.523993381136238, + "grad_norm": 0.0001106590498238802, + "learning_rate": 2.4793027301424164e-05, + "loss": 0.0, + "num_input_tokens_seen": 12215144, + "step": 20030 + }, + { + "epoch": 5.525372311086596, + "grad_norm": 1.0747507076303009e-05, + "learning_rate": 2.478099430436581e-05, + "loss": 0.0, + "num_input_tokens_seen": 12218280, + "step": 20035 + }, + { + "epoch": 5.526751241036956, + "grad_norm": 0.07174389064311981, + "learning_rate": 2.476896135804792e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12220424, + "step": 20040 + }, + { + "epoch": 5.528130170987314, + "grad_norm": 3.6850281048828037e-06, + "learning_rate": 2.475692846525838e-05, + "loss": 0.0, + "num_input_tokens_seen": 12223240, + "step": 20045 + }, + { + "epoch": 5.5295091009376725, + "grad_norm": 0.0001528809079900384, + "learning_rate": 2.4744895628785016e-05, + "loss": 0.0, + "num_input_tokens_seen": 12226504, + "step": 20050 + }, + { + "epoch": 5.530888030888031, + "grad_norm": 2.9468126740539446e-05, + "learning_rate": 2.4732862851415673e-05, + "loss": 0.0, + "num_input_tokens_seen": 12230440, + "step": 20055 + }, + { + "epoch": 5.532266960838389, + "grad_norm": 5.494346169143682e-06, + "learning_rate": 2.472083013593818e-05, + "loss": 0.0, + "num_input_tokens_seen": 12232904, + "step": 20060 + }, + { + "epoch": 5.533645890788748, + "grad_norm": 8.084739238256589e-05, + "learning_rate": 2.4708797485140328e-05, + "loss": 0.0, + "num_input_tokens_seen": 12236136, + "step": 20065 + }, + { + "epoch": 5.535024820739107, + "grad_norm": 4.673128842114238e-06, + "learning_rate": 2.4696764901809926e-05, + "loss": 0.0, + "num_input_tokens_seen": 12240744, + "step": 20070 + }, + { + "epoch": 5.536403750689465, + "grad_norm": 0.0009279162622988224, + "learning_rate": 2.4684732388734748e-05, + "loss": 0.0042, + "num_input_tokens_seen": 12243656, + "step": 20075 + }, + { + "epoch": 5.537782680639824, + "grad_norm": 6.304876478679944e-06, + "learning_rate": 2.4672699948702535e-05, + "loss": 0.0, + "num_input_tokens_seen": 12247624, + "step": 20080 + }, + { + "epoch": 5.539161610590182, + "grad_norm": 7.371547690127045e-05, + "learning_rate": 2.4660667584501054e-05, + "loss": 0.0, + "num_input_tokens_seen": 12250824, + "step": 20085 + }, + { + "epoch": 5.54054054054054, + "grad_norm": 3.596429678509594e-06, + "learning_rate": 2.464863529891802e-05, + "loss": 0.0, + "num_input_tokens_seen": 12254152, + "step": 20090 + }, + { + "epoch": 5.541919470490899, + "grad_norm": 0.007934198714792728, + "learning_rate": 2.463660309474115e-05, + "loss": 0.0, + "num_input_tokens_seen": 12256904, + "step": 20095 + }, + { + "epoch": 5.543298400441257, + "grad_norm": 0.0020251786336302757, + "learning_rate": 2.4624570974758125e-05, + "loss": 0.0, + "num_input_tokens_seen": 12260040, + "step": 20100 + }, + { + "epoch": 5.544677330391616, + "grad_norm": 6.18778431089595e-05, + "learning_rate": 2.4612538941756607e-05, + "loss": 0.0, + "num_input_tokens_seen": 12263400, + "step": 20105 + }, + { + "epoch": 5.546056260341975, + "grad_norm": 2.0412127923918888e-05, + "learning_rate": 2.4600506998524267e-05, + "loss": 0.0, + "num_input_tokens_seen": 12266536, + "step": 20110 + }, + { + "epoch": 5.547435190292333, + "grad_norm": 12.777944564819336, + "learning_rate": 2.458847514784872e-05, + "loss": 0.1354, + "num_input_tokens_seen": 12271240, + "step": 20115 + }, + { + "epoch": 5.5488141202426915, + "grad_norm": 0.0008758516632951796, + "learning_rate": 2.4576443392517563e-05, + "loss": 0.0, + "num_input_tokens_seen": 12274120, + "step": 20120 + }, + { + "epoch": 5.55019305019305, + "grad_norm": 9.087924991035834e-05, + "learning_rate": 2.4564411735318397e-05, + "loss": 0.0, + "num_input_tokens_seen": 12276648, + "step": 20125 + }, + { + "epoch": 5.551571980143409, + "grad_norm": 0.008270183578133583, + "learning_rate": 2.4552380179038785e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12279240, + "step": 20130 + }, + { + "epoch": 5.5529509100937675, + "grad_norm": 0.000282476277789101, + "learning_rate": 2.454034872646625e-05, + "loss": 0.0, + "num_input_tokens_seen": 12282344, + "step": 20135 + }, + { + "epoch": 5.554329840044126, + "grad_norm": 0.009041732177138329, + "learning_rate": 2.4528317380388328e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12286024, + "step": 20140 + }, + { + "epoch": 5.555708769994484, + "grad_norm": 0.0021390696056187153, + "learning_rate": 2.4516286143592487e-05, + "loss": 0.0002, + "num_input_tokens_seen": 12289192, + "step": 20145 + }, + { + "epoch": 5.557087699944843, + "grad_norm": 0.0003498357837088406, + "learning_rate": 2.4504255018866214e-05, + "loss": 0.0, + "num_input_tokens_seen": 12292680, + "step": 20150 + }, + { + "epoch": 5.558466629895201, + "grad_norm": 0.002258171560242772, + "learning_rate": 2.449222400899693e-05, + "loss": 0.0, + "num_input_tokens_seen": 12295848, + "step": 20155 + }, + { + "epoch": 5.559845559845559, + "grad_norm": 0.0002536312094889581, + "learning_rate": 2.4480193116772058e-05, + "loss": 0.0, + "num_input_tokens_seen": 12298632, + "step": 20160 + }, + { + "epoch": 5.561224489795919, + "grad_norm": 0.0009731830796226859, + "learning_rate": 2.4468162344978976e-05, + "loss": 0.0, + "num_input_tokens_seen": 12301384, + "step": 20165 + }, + { + "epoch": 5.562603419746277, + "grad_norm": 0.0012875907123088837, + "learning_rate": 2.445613169640505e-05, + "loss": 0.0, + "num_input_tokens_seen": 12304488, + "step": 20170 + }, + { + "epoch": 5.563982349696635, + "grad_norm": 0.0012799928663298488, + "learning_rate": 2.4444101173837593e-05, + "loss": 0.0, + "num_input_tokens_seen": 12307080, + "step": 20175 + }, + { + "epoch": 5.565361279646994, + "grad_norm": 8.150233770720661e-05, + "learning_rate": 2.4432070780063924e-05, + "loss": 0.0, + "num_input_tokens_seen": 12309928, + "step": 20180 + }, + { + "epoch": 5.566740209597352, + "grad_norm": 0.06592108309268951, + "learning_rate": 2.4420040517871295e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12312712, + "step": 20185 + }, + { + "epoch": 5.568119139547711, + "grad_norm": 0.02011682465672493, + "learning_rate": 2.4408010390046964e-05, + "loss": 0.0, + "num_input_tokens_seen": 12316680, + "step": 20190 + }, + { + "epoch": 5.56949806949807, + "grad_norm": 0.003201004583388567, + "learning_rate": 2.4395980399378122e-05, + "loss": 0.0, + "num_input_tokens_seen": 12319624, + "step": 20195 + }, + { + "epoch": 5.570876999448428, + "grad_norm": 0.002673415932804346, + "learning_rate": 2.4383950548651943e-05, + "loss": 0.0, + "num_input_tokens_seen": 12324200, + "step": 20200 + }, + { + "epoch": 5.5722559293987866, + "grad_norm": 0.004780936054885387, + "learning_rate": 2.437192084065559e-05, + "loss": 0.0, + "num_input_tokens_seen": 12326664, + "step": 20205 + }, + { + "epoch": 5.573634859349145, + "grad_norm": 0.0020854303147643805, + "learning_rate": 2.4359891278176163e-05, + "loss": 0.0, + "num_input_tokens_seen": 12329640, + "step": 20210 + }, + { + "epoch": 5.575013789299503, + "grad_norm": 0.00012629888078663498, + "learning_rate": 2.4347861864000733e-05, + "loss": 0.0, + "num_input_tokens_seen": 12332488, + "step": 20215 + }, + { + "epoch": 5.576392719249862, + "grad_norm": 0.00040129682747647166, + "learning_rate": 2.4335832600916353e-05, + "loss": 0.0, + "num_input_tokens_seen": 12334920, + "step": 20220 + }, + { + "epoch": 5.577771649200221, + "grad_norm": 0.0006740711396560073, + "learning_rate": 2.432380349171002e-05, + "loss": 0.0, + "num_input_tokens_seen": 12338088, + "step": 20225 + }, + { + "epoch": 5.579150579150579, + "grad_norm": 0.0009352520573884249, + "learning_rate": 2.4311774539168708e-05, + "loss": 0.0, + "num_input_tokens_seen": 12340968, + "step": 20230 + }, + { + "epoch": 5.580529509100938, + "grad_norm": 0.002482243347913027, + "learning_rate": 2.4299745746079366e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12343528, + "step": 20235 + }, + { + "epoch": 5.581908439051296, + "grad_norm": 3.102327900705859e-05, + "learning_rate": 2.4287717115228872e-05, + "loss": 0.0, + "num_input_tokens_seen": 12345800, + "step": 20240 + }, + { + "epoch": 5.5832873690016545, + "grad_norm": 5.50014701730106e-05, + "learning_rate": 2.427568864940411e-05, + "loss": 0.0, + "num_input_tokens_seen": 12349000, + "step": 20245 + }, + { + "epoch": 5.584666298952014, + "grad_norm": 0.0010236401576548815, + "learning_rate": 2.4263660351391893e-05, + "loss": 0.0, + "num_input_tokens_seen": 12351752, + "step": 20250 + }, + { + "epoch": 5.586045228902372, + "grad_norm": 0.0032754421699792147, + "learning_rate": 2.425163222397899e-05, + "loss": 0.0, + "num_input_tokens_seen": 12354376, + "step": 20255 + }, + { + "epoch": 5.5874241588527305, + "grad_norm": 0.0001351332466583699, + "learning_rate": 2.423960426995216e-05, + "loss": 0.0, + "num_input_tokens_seen": 12358024, + "step": 20260 + }, + { + "epoch": 5.588803088803089, + "grad_norm": 0.00025511812418699265, + "learning_rate": 2.4227576492098115e-05, + "loss": 0.0, + "num_input_tokens_seen": 12362376, + "step": 20265 + }, + { + "epoch": 5.590182018753447, + "grad_norm": 0.00012321180838625878, + "learning_rate": 2.42155488932035e-05, + "loss": 0.0, + "num_input_tokens_seen": 12364904, + "step": 20270 + }, + { + "epoch": 5.591560948703806, + "grad_norm": 0.001006945502012968, + "learning_rate": 2.4203521476054953e-05, + "loss": 0.0, + "num_input_tokens_seen": 12369672, + "step": 20275 + }, + { + "epoch": 5.592939878654164, + "grad_norm": 5.308711843099445e-05, + "learning_rate": 2.4191494243439035e-05, + "loss": 0.0, + "num_input_tokens_seen": 12372168, + "step": 20280 + }, + { + "epoch": 5.594318808604523, + "grad_norm": 0.00033871576306410134, + "learning_rate": 2.4179467198142312e-05, + "loss": 0.0, + "num_input_tokens_seen": 12375112, + "step": 20285 + }, + { + "epoch": 5.595697738554882, + "grad_norm": 0.0036299529019743204, + "learning_rate": 2.4167440342951254e-05, + "loss": 0.0, + "num_input_tokens_seen": 12377960, + "step": 20290 + }, + { + "epoch": 5.59707666850524, + "grad_norm": 0.0007294053793884814, + "learning_rate": 2.415541368065231e-05, + "loss": 0.0, + "num_input_tokens_seen": 12381832, + "step": 20295 + }, + { + "epoch": 5.598455598455598, + "grad_norm": 0.0001572621549712494, + "learning_rate": 2.41433872140319e-05, + "loss": 0.0, + "num_input_tokens_seen": 12384808, + "step": 20300 + }, + { + "epoch": 5.599834528405957, + "grad_norm": 0.00020802460494451225, + "learning_rate": 2.413136094587638e-05, + "loss": 0.0, + "num_input_tokens_seen": 12388712, + "step": 20305 + }, + { + "epoch": 5.601213458356315, + "grad_norm": 6.807102909078822e-05, + "learning_rate": 2.411933487897205e-05, + "loss": 0.0, + "num_input_tokens_seen": 12392328, + "step": 20310 + }, + { + "epoch": 5.602592388306674, + "grad_norm": 8.785052341409028e-05, + "learning_rate": 2.4107309016105196e-05, + "loss": 0.0, + "num_input_tokens_seen": 12394696, + "step": 20315 + }, + { + "epoch": 5.603971318257033, + "grad_norm": 0.000118998323159758, + "learning_rate": 2.4095283360062033e-05, + "loss": 0.0, + "num_input_tokens_seen": 12397032, + "step": 20320 + }, + { + "epoch": 5.605350248207391, + "grad_norm": 6.417412805603817e-05, + "learning_rate": 2.4083257913628713e-05, + "loss": 0.0, + "num_input_tokens_seen": 12399816, + "step": 20325 + }, + { + "epoch": 5.6067291781577495, + "grad_norm": 5.643611075356603e-05, + "learning_rate": 2.407123267959138e-05, + "loss": 0.0, + "num_input_tokens_seen": 12403176, + "step": 20330 + }, + { + "epoch": 5.608108108108108, + "grad_norm": 4.1536968637956306e-05, + "learning_rate": 2.4059207660736108e-05, + "loss": 0.0, + "num_input_tokens_seen": 12406056, + "step": 20335 + }, + { + "epoch": 5.609487038058466, + "grad_norm": 6.69170476612635e-05, + "learning_rate": 2.4047182859848915e-05, + "loss": 0.0, + "num_input_tokens_seen": 12408968, + "step": 20340 + }, + { + "epoch": 5.6108659680088255, + "grad_norm": 0.0001694322272669524, + "learning_rate": 2.403515827971578e-05, + "loss": 0.0, + "num_input_tokens_seen": 12411624, + "step": 20345 + }, + { + "epoch": 5.612244897959184, + "grad_norm": 6.0642618336714804e-05, + "learning_rate": 2.4023133923122612e-05, + "loss": 0.0, + "num_input_tokens_seen": 12414568, + "step": 20350 + }, + { + "epoch": 5.613623827909542, + "grad_norm": 0.0061182123608887196, + "learning_rate": 2.4011109792855306e-05, + "loss": 0.0, + "num_input_tokens_seen": 12417096, + "step": 20355 + }, + { + "epoch": 5.615002757859901, + "grad_norm": 0.0012286518467590213, + "learning_rate": 2.3999085891699655e-05, + "loss": 0.0, + "num_input_tokens_seen": 12419272, + "step": 20360 + }, + { + "epoch": 5.616381687810259, + "grad_norm": 7.967400597408414e-05, + "learning_rate": 2.3987062222441435e-05, + "loss": 0.0, + "num_input_tokens_seen": 12422056, + "step": 20365 + }, + { + "epoch": 5.617760617760617, + "grad_norm": 0.00016453796706628054, + "learning_rate": 2.3975038787866365e-05, + "loss": 0.0, + "num_input_tokens_seen": 12426408, + "step": 20370 + }, + { + "epoch": 5.619139547710976, + "grad_norm": 0.00436857994645834, + "learning_rate": 2.396301559076008e-05, + "loss": 0.0, + "num_input_tokens_seen": 12429384, + "step": 20375 + }, + { + "epoch": 5.620518477661335, + "grad_norm": 1.2202570360386744e-05, + "learning_rate": 2.3950992633908216e-05, + "loss": 0.0, + "num_input_tokens_seen": 12432424, + "step": 20380 + }, + { + "epoch": 5.621897407611693, + "grad_norm": 5.7560821005608886e-05, + "learning_rate": 2.39389699200963e-05, + "loss": 0.0, + "num_input_tokens_seen": 12436904, + "step": 20385 + }, + { + "epoch": 5.623276337562052, + "grad_norm": 1.1659861229418311e-05, + "learning_rate": 2.3926947452109807e-05, + "loss": 0.0897, + "num_input_tokens_seen": 12439528, + "step": 20390 + }, + { + "epoch": 5.62465526751241, + "grad_norm": 2.4126202333718538e-05, + "learning_rate": 2.3914925232734203e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12443912, + "step": 20395 + }, + { + "epoch": 5.626034197462769, + "grad_norm": 0.0001440227497369051, + "learning_rate": 2.3902903264754838e-05, + "loss": 0.0, + "num_input_tokens_seen": 12448392, + "step": 20400 + }, + { + "epoch": 5.627413127413128, + "grad_norm": 3.487656431389041e-05, + "learning_rate": 2.3890881550957032e-05, + "loss": 0.0, + "num_input_tokens_seen": 12451144, + "step": 20405 + }, + { + "epoch": 5.628792057363486, + "grad_norm": 2.6412777515361086e-05, + "learning_rate": 2.3878860094126065e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12453576, + "step": 20410 + }, + { + "epoch": 5.630170987313845, + "grad_norm": 0.00027950486401095986, + "learning_rate": 2.3866838897047116e-05, + "loss": 0.0, + "num_input_tokens_seen": 12456936, + "step": 20415 + }, + { + "epoch": 5.631549917264203, + "grad_norm": 4.301476292312145e-05, + "learning_rate": 2.385481796250532e-05, + "loss": 0.0, + "num_input_tokens_seen": 12459624, + "step": 20420 + }, + { + "epoch": 5.632928847214561, + "grad_norm": 2.3734464775770903e-05, + "learning_rate": 2.3842797293285778e-05, + "loss": 0.0, + "num_input_tokens_seen": 12462440, + "step": 20425 + }, + { + "epoch": 5.63430777716492, + "grad_norm": 8.627067472843919e-06, + "learning_rate": 2.3830776892173478e-05, + "loss": 0.0, + "num_input_tokens_seen": 12464648, + "step": 20430 + }, + { + "epoch": 5.635686707115278, + "grad_norm": 0.003201376413926482, + "learning_rate": 2.3818756761953396e-05, + "loss": 0.0, + "num_input_tokens_seen": 12468072, + "step": 20435 + }, + { + "epoch": 5.637065637065637, + "grad_norm": 0.00015575600264128298, + "learning_rate": 2.380673690541042e-05, + "loss": 0.0, + "num_input_tokens_seen": 12472040, + "step": 20440 + }, + { + "epoch": 5.638444567015996, + "grad_norm": 4.619639639713569e-06, + "learning_rate": 2.379471732532936e-05, + "loss": 0.0, + "num_input_tokens_seen": 12474408, + "step": 20445 + }, + { + "epoch": 5.639823496966354, + "grad_norm": 0.008151965215802193, + "learning_rate": 2.378269802449501e-05, + "loss": 0.0, + "num_input_tokens_seen": 12476776, + "step": 20450 + }, + { + "epoch": 5.6412024269167125, + "grad_norm": 2.2355126930051483e-05, + "learning_rate": 2.3770679005692053e-05, + "loss": 0.0, + "num_input_tokens_seen": 12479432, + "step": 20455 + }, + { + "epoch": 5.642581356867071, + "grad_norm": 1.2258106835361104e-05, + "learning_rate": 2.3758660271705114e-05, + "loss": 0.0, + "num_input_tokens_seen": 12482248, + "step": 20460 + }, + { + "epoch": 5.64396028681743, + "grad_norm": 7.930147694423795e-05, + "learning_rate": 2.374664182531878e-05, + "loss": 0.0, + "num_input_tokens_seen": 12485064, + "step": 20465 + }, + { + "epoch": 5.6453392167677885, + "grad_norm": 0.00030692951986566186, + "learning_rate": 2.373462366931754e-05, + "loss": 0.0, + "num_input_tokens_seen": 12487752, + "step": 20470 + }, + { + "epoch": 5.646718146718147, + "grad_norm": 0.0001261059078387916, + "learning_rate": 2.3722605806485825e-05, + "loss": 0.0, + "num_input_tokens_seen": 12490536, + "step": 20475 + }, + { + "epoch": 5.648097076668505, + "grad_norm": 9.873451199382544e-05, + "learning_rate": 2.3710588239608023e-05, + "loss": 0.0, + "num_input_tokens_seen": 12492840, + "step": 20480 + }, + { + "epoch": 5.649476006618864, + "grad_norm": 0.0009011356742121279, + "learning_rate": 2.3698570971468404e-05, + "loss": 0.0, + "num_input_tokens_seen": 12495752, + "step": 20485 + }, + { + "epoch": 5.650854936569222, + "grad_norm": 8.829414582578465e-06, + "learning_rate": 2.368655400485122e-05, + "loss": 0.0, + "num_input_tokens_seen": 12498184, + "step": 20490 + }, + { + "epoch": 5.65223386651958, + "grad_norm": 1.001169493974885e-05, + "learning_rate": 2.367453734254062e-05, + "loss": 0.0, + "num_input_tokens_seen": 12501256, + "step": 20495 + }, + { + "epoch": 5.65361279646994, + "grad_norm": 0.0005471776239573956, + "learning_rate": 2.366252098732068e-05, + "loss": 0.0, + "num_input_tokens_seen": 12503720, + "step": 20500 + }, + { + "epoch": 5.654991726420298, + "grad_norm": 1.6117855921038426e-05, + "learning_rate": 2.3650504941975435e-05, + "loss": 0.0, + "num_input_tokens_seen": 12506632, + "step": 20505 + }, + { + "epoch": 5.656370656370656, + "grad_norm": 1.5891422663116828e-05, + "learning_rate": 2.3638489209288828e-05, + "loss": 0.0, + "num_input_tokens_seen": 12509960, + "step": 20510 + }, + { + "epoch": 5.657749586321015, + "grad_norm": 3.450504436841584e-06, + "learning_rate": 2.3626473792044715e-05, + "loss": 0.0, + "num_input_tokens_seen": 12512552, + "step": 20515 + }, + { + "epoch": 5.659128516271373, + "grad_norm": 5.219184822635725e-05, + "learning_rate": 2.3614458693026922e-05, + "loss": 0.0, + "num_input_tokens_seen": 12515272, + "step": 20520 + }, + { + "epoch": 5.660507446221732, + "grad_norm": 0.039114415645599365, + "learning_rate": 2.3602443915019153e-05, + "loss": 0.0, + "num_input_tokens_seen": 12518920, + "step": 20525 + }, + { + "epoch": 5.661886376172091, + "grad_norm": 0.00018905258912127465, + "learning_rate": 2.3590429460805073e-05, + "loss": 0.0, + "num_input_tokens_seen": 12521672, + "step": 20530 + }, + { + "epoch": 5.663265306122449, + "grad_norm": 3.0587459605158074e-06, + "learning_rate": 2.357841533316825e-05, + "loss": 0.0042, + "num_input_tokens_seen": 12524200, + "step": 20535 + }, + { + "epoch": 5.6646442360728075, + "grad_norm": 0.003143318695947528, + "learning_rate": 2.3566401534892187e-05, + "loss": 0.0, + "num_input_tokens_seen": 12526408, + "step": 20540 + }, + { + "epoch": 5.666023166023166, + "grad_norm": 0.00018032797379419208, + "learning_rate": 2.355438806876031e-05, + "loss": 0.0, + "num_input_tokens_seen": 12529032, + "step": 20545 + }, + { + "epoch": 5.667402095973524, + "grad_norm": 2.890900032070931e-05, + "learning_rate": 2.3542374937555974e-05, + "loss": 0.0, + "num_input_tokens_seen": 12531624, + "step": 20550 + }, + { + "epoch": 5.668781025923883, + "grad_norm": 1.71071442309767e-05, + "learning_rate": 2.3530362144062432e-05, + "loss": 0.0, + "num_input_tokens_seen": 12535496, + "step": 20555 + }, + { + "epoch": 5.670159955874242, + "grad_norm": 2.666419095476158e-05, + "learning_rate": 2.3518349691062894e-05, + "loss": 0.0, + "num_input_tokens_seen": 12538888, + "step": 20560 + }, + { + "epoch": 5.6715388858246, + "grad_norm": 33.9599494934082, + "learning_rate": 2.3506337581340464e-05, + "loss": 0.003, + "num_input_tokens_seen": 12542408, + "step": 20565 + }, + { + "epoch": 5.672917815774959, + "grad_norm": 1.7008791473926976e-05, + "learning_rate": 2.3494325817678165e-05, + "loss": 0.0, + "num_input_tokens_seen": 12545416, + "step": 20570 + }, + { + "epoch": 5.674296745725317, + "grad_norm": 0.00015756468928884715, + "learning_rate": 2.348231440285897e-05, + "loss": 0.0, + "num_input_tokens_seen": 12548328, + "step": 20575 + }, + { + "epoch": 5.675675675675675, + "grad_norm": 0.023344064131379128, + "learning_rate": 2.347030333966573e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12551016, + "step": 20580 + }, + { + "epoch": 5.677054605626034, + "grad_norm": 9.059951116796583e-05, + "learning_rate": 2.3458292630881266e-05, + "loss": 0.0004, + "num_input_tokens_seen": 12553800, + "step": 20585 + }, + { + "epoch": 5.678433535576393, + "grad_norm": 8.327075920533389e-05, + "learning_rate": 2.3446282279288264e-05, + "loss": 0.0, + "num_input_tokens_seen": 12556712, + "step": 20590 + }, + { + "epoch": 5.6798124655267515, + "grad_norm": 4.938055553793674e-06, + "learning_rate": 2.3434272287669347e-05, + "loss": 0.0, + "num_input_tokens_seen": 12559752, + "step": 20595 + }, + { + "epoch": 5.68119139547711, + "grad_norm": 0.00033721141517162323, + "learning_rate": 2.3422262658807075e-05, + "loss": 0.0, + "num_input_tokens_seen": 12563848, + "step": 20600 + }, + { + "epoch": 5.682570325427468, + "grad_norm": 2.1951257167529548e-06, + "learning_rate": 2.3410253395483894e-05, + "loss": 0.0, + "num_input_tokens_seen": 12566696, + "step": 20605 + }, + { + "epoch": 5.683949255377827, + "grad_norm": 0.00014962493150960654, + "learning_rate": 2.339824450048218e-05, + "loss": 0.0, + "num_input_tokens_seen": 12570312, + "step": 20610 + }, + { + "epoch": 5.685328185328185, + "grad_norm": 2.118134216289036e-05, + "learning_rate": 2.3386235976584226e-05, + "loss": 0.0, + "num_input_tokens_seen": 12575144, + "step": 20615 + }, + { + "epoch": 5.686707115278544, + "grad_norm": 2.0308589228079654e-05, + "learning_rate": 2.337422782657222e-05, + "loss": 0.0, + "num_input_tokens_seen": 12578312, + "step": 20620 + }, + { + "epoch": 5.688086045228903, + "grad_norm": 1.1140528840769548e-05, + "learning_rate": 2.3362220053228305e-05, + "loss": 0.0, + "num_input_tokens_seen": 12581448, + "step": 20625 + }, + { + "epoch": 5.689464975179261, + "grad_norm": 4.532993898465065e-06, + "learning_rate": 2.3350212659334493e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12584872, + "step": 20630 + }, + { + "epoch": 5.690843905129619, + "grad_norm": 3.2042519251263e-06, + "learning_rate": 2.3338205647672718e-05, + "loss": 0.0, + "num_input_tokens_seen": 12587400, + "step": 20635 + }, + { + "epoch": 5.692222835079978, + "grad_norm": 0.00026048338622786105, + "learning_rate": 2.3326199021024846e-05, + "loss": 0.0, + "num_input_tokens_seen": 12592712, + "step": 20640 + }, + { + "epoch": 5.693601765030336, + "grad_norm": 3.741687032743357e-05, + "learning_rate": 2.3314192782172635e-05, + "loss": 0.0, + "num_input_tokens_seen": 12595464, + "step": 20645 + }, + { + "epoch": 5.694980694980695, + "grad_norm": 0.0009181847563013434, + "learning_rate": 2.3302186933897745e-05, + "loss": 0.0, + "num_input_tokens_seen": 12598440, + "step": 20650 + }, + { + "epoch": 5.696359624931054, + "grad_norm": 0.00043300705146975815, + "learning_rate": 2.3290181478981788e-05, + "loss": 0.0, + "num_input_tokens_seen": 12601320, + "step": 20655 + }, + { + "epoch": 5.697738554881412, + "grad_norm": 6.233884050743654e-05, + "learning_rate": 2.327817642020624e-05, + "loss": 0.0, + "num_input_tokens_seen": 12604296, + "step": 20660 + }, + { + "epoch": 5.6991174848317705, + "grad_norm": 0.00020030456653330475, + "learning_rate": 2.3266171760352483e-05, + "loss": 0.1125, + "num_input_tokens_seen": 12607208, + "step": 20665 + }, + { + "epoch": 5.700496414782129, + "grad_norm": 1.6997910279314965e-05, + "learning_rate": 2.3254167502201855e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12610248, + "step": 20670 + }, + { + "epoch": 5.701875344732487, + "grad_norm": 0.008412220515310764, + "learning_rate": 2.324216364853554e-05, + "loss": 0.0, + "num_input_tokens_seen": 12613128, + "step": 20675 + }, + { + "epoch": 5.7032542746828465, + "grad_norm": 0.00015819356485735625, + "learning_rate": 2.3230160202134686e-05, + "loss": 0.0, + "num_input_tokens_seen": 12616232, + "step": 20680 + }, + { + "epoch": 5.704633204633205, + "grad_norm": 1.7584556189831346e-05, + "learning_rate": 2.3218157165780316e-05, + "loss": 0.0, + "num_input_tokens_seen": 12618920, + "step": 20685 + }, + { + "epoch": 5.706012134583563, + "grad_norm": 4.1395443986402825e-05, + "learning_rate": 2.3206154542253338e-05, + "loss": 0.0, + "num_input_tokens_seen": 12621128, + "step": 20690 + }, + { + "epoch": 5.707391064533922, + "grad_norm": 9.822405445447657e-06, + "learning_rate": 2.319415233433462e-05, + "loss": 0.0, + "num_input_tokens_seen": 12623272, + "step": 20695 + }, + { + "epoch": 5.70876999448428, + "grad_norm": 1.2651894394366536e-05, + "learning_rate": 2.3182150544804876e-05, + "loss": 0.0, + "num_input_tokens_seen": 12625992, + "step": 20700 + }, + { + "epoch": 5.710148924434638, + "grad_norm": 0.0008005529525689781, + "learning_rate": 2.317014917644475e-05, + "loss": 0.0, + "num_input_tokens_seen": 12628616, + "step": 20705 + }, + { + "epoch": 5.711527854384997, + "grad_norm": 2.779107126116287e-05, + "learning_rate": 2.3158148232034805e-05, + "loss": 0.0, + "num_input_tokens_seen": 12631368, + "step": 20710 + }, + { + "epoch": 5.712906784335356, + "grad_norm": 22.126537322998047, + "learning_rate": 2.3146147714355462e-05, + "loss": 0.1391, + "num_input_tokens_seen": 12634696, + "step": 20715 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 29.766233444213867, + "learning_rate": 2.3134147626187088e-05, + "loss": 0.0959, + "num_input_tokens_seen": 12637192, + "step": 20720 + }, + { + "epoch": 5.715664644236073, + "grad_norm": 0.0011491070035845041, + "learning_rate": 2.312214797030993e-05, + "loss": 0.0, + "num_input_tokens_seen": 12640808, + "step": 20725 + }, + { + "epoch": 5.717043574186431, + "grad_norm": 0.00013216274965088814, + "learning_rate": 2.3110148749504126e-05, + "loss": 0.0, + "num_input_tokens_seen": 12643400, + "step": 20730 + }, + { + "epoch": 5.7184225041367895, + "grad_norm": 6.065593879611697e-06, + "learning_rate": 2.309814996654974e-05, + "loss": 0.0, + "num_input_tokens_seen": 12645544, + "step": 20735 + }, + { + "epoch": 5.719801434087149, + "grad_norm": 5.95164510741597e-06, + "learning_rate": 2.3086151624226702e-05, + "loss": 0.0, + "num_input_tokens_seen": 12648712, + "step": 20740 + }, + { + "epoch": 5.721180364037507, + "grad_norm": 0.28139641880989075, + "learning_rate": 2.3074153725314853e-05, + "loss": 0.0003, + "num_input_tokens_seen": 12652232, + "step": 20745 + }, + { + "epoch": 5.722559293987866, + "grad_norm": 0.0003426389303058386, + "learning_rate": 2.3062156272593946e-05, + "loss": 0.0, + "num_input_tokens_seen": 12655336, + "step": 20750 + }, + { + "epoch": 5.723938223938224, + "grad_norm": 0.036478932946920395, + "learning_rate": 2.305015926884362e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12657608, + "step": 20755 + }, + { + "epoch": 5.725317153888582, + "grad_norm": 0.0002457723021507263, + "learning_rate": 2.303816271684339e-05, + "loss": 0.0, + "num_input_tokens_seen": 12661416, + "step": 20760 + }, + { + "epoch": 5.726696083838941, + "grad_norm": 7.988839570316486e-06, + "learning_rate": 2.302616661937271e-05, + "loss": 0.0629, + "num_input_tokens_seen": 12663304, + "step": 20765 + }, + { + "epoch": 5.728075013789299, + "grad_norm": 8.024297130759805e-05, + "learning_rate": 2.301417097921088e-05, + "loss": 0.0, + "num_input_tokens_seen": 12666056, + "step": 20770 + }, + { + "epoch": 5.729453943739658, + "grad_norm": 0.10810677707195282, + "learning_rate": 2.3002175799137137e-05, + "loss": 0.0004, + "num_input_tokens_seen": 12669128, + "step": 20775 + }, + { + "epoch": 5.730832873690017, + "grad_norm": 0.007361290045082569, + "learning_rate": 2.299018108193058e-05, + "loss": 0.0, + "num_input_tokens_seen": 12673384, + "step": 20780 + }, + { + "epoch": 5.732211803640375, + "grad_norm": 0.00030985771445557475, + "learning_rate": 2.2978186830370218e-05, + "loss": 0.0, + "num_input_tokens_seen": 12675880, + "step": 20785 + }, + { + "epoch": 5.7335907335907335, + "grad_norm": 9.263085303246044e-06, + "learning_rate": 2.2966193047234946e-05, + "loss": 0.1043, + "num_input_tokens_seen": 12680104, + "step": 20790 + }, + { + "epoch": 5.734969663541092, + "grad_norm": 0.0028491036500781775, + "learning_rate": 2.2954199735303554e-05, + "loss": 0.0, + "num_input_tokens_seen": 12682760, + "step": 20795 + }, + { + "epoch": 5.736348593491451, + "grad_norm": 0.021695757284760475, + "learning_rate": 2.2942206897354705e-05, + "loss": 0.0004, + "num_input_tokens_seen": 12686184, + "step": 20800 + }, + { + "epoch": 5.7377275234418095, + "grad_norm": 3.945080243283883e-05, + "learning_rate": 2.2930214536166995e-05, + "loss": 0.0, + "num_input_tokens_seen": 12688776, + "step": 20805 + }, + { + "epoch": 5.739106453392168, + "grad_norm": 1.0276231478201225e-05, + "learning_rate": 2.2918222654518852e-05, + "loss": 0.0, + "num_input_tokens_seen": 12691496, + "step": 20810 + }, + { + "epoch": 5.740485383342526, + "grad_norm": 4.004541187896393e-05, + "learning_rate": 2.2906231255188648e-05, + "loss": 0.0, + "num_input_tokens_seen": 12694600, + "step": 20815 + }, + { + "epoch": 5.741864313292885, + "grad_norm": 7.953350177558605e-06, + "learning_rate": 2.2894240340954605e-05, + "loss": 0.0, + "num_input_tokens_seen": 12697672, + "step": 20820 + }, + { + "epoch": 5.743243243243243, + "grad_norm": 6.360953557305038e-05, + "learning_rate": 2.288224991459484e-05, + "loss": 0.0, + "num_input_tokens_seen": 12700872, + "step": 20825 + }, + { + "epoch": 5.744622173193601, + "grad_norm": 1.6170624803635292e-05, + "learning_rate": 2.287025997888738e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12703112, + "step": 20830 + }, + { + "epoch": 5.746001103143961, + "grad_norm": 1.201775830850238e-05, + "learning_rate": 2.2858270536610113e-05, + "loss": 0.0, + "num_input_tokens_seen": 12705512, + "step": 20835 + }, + { + "epoch": 5.747380033094319, + "grad_norm": 2.561624387453776e-05, + "learning_rate": 2.2846281590540806e-05, + "loss": 0.0, + "num_input_tokens_seen": 12709224, + "step": 20840 + }, + { + "epoch": 5.748758963044677, + "grad_norm": 0.0004588552110362798, + "learning_rate": 2.2834293143457156e-05, + "loss": 0.1292, + "num_input_tokens_seen": 12711784, + "step": 20845 + }, + { + "epoch": 5.750137892995036, + "grad_norm": 2.1151683540665545e-05, + "learning_rate": 2.282230519813669e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12715560, + "step": 20850 + }, + { + "epoch": 5.751516822945394, + "grad_norm": 3.683989416458644e-05, + "learning_rate": 2.281031775735684e-05, + "loss": 0.0, + "num_input_tokens_seen": 12718984, + "step": 20855 + }, + { + "epoch": 5.752895752895753, + "grad_norm": 1.7409647625754587e-05, + "learning_rate": 2.2798330823894952e-05, + "loss": 0.0, + "num_input_tokens_seen": 12722088, + "step": 20860 + }, + { + "epoch": 5.754274682846112, + "grad_norm": 0.00011116822133772075, + "learning_rate": 2.2786344400528192e-05, + "loss": 0.0004, + "num_input_tokens_seen": 12725192, + "step": 20865 + }, + { + "epoch": 5.75565361279647, + "grad_norm": 0.001333108521066606, + "learning_rate": 2.2774358490033675e-05, + "loss": 0.0, + "num_input_tokens_seen": 12727624, + "step": 20870 + }, + { + "epoch": 5.7570325427468285, + "grad_norm": 0.016054587438702583, + "learning_rate": 2.276237309518834e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12732616, + "step": 20875 + }, + { + "epoch": 5.758411472697187, + "grad_norm": 0.0028460577595978975, + "learning_rate": 2.2750388218769032e-05, + "loss": 0.0, + "num_input_tokens_seen": 12735048, + "step": 20880 + }, + { + "epoch": 5.759790402647545, + "grad_norm": 7.61729315854609e-05, + "learning_rate": 2.2738403863552496e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12737576, + "step": 20885 + }, + { + "epoch": 5.761169332597904, + "grad_norm": 3.100543835898861e-05, + "learning_rate": 2.272642003231531e-05, + "loss": 0.0, + "num_input_tokens_seen": 12740232, + "step": 20890 + }, + { + "epoch": 5.762548262548263, + "grad_norm": 0.0003590805281419307, + "learning_rate": 2.2714436727833964e-05, + "loss": 0.0, + "num_input_tokens_seen": 12742792, + "step": 20895 + }, + { + "epoch": 5.763927192498621, + "grad_norm": 0.010819923132658005, + "learning_rate": 2.2702453952884835e-05, + "loss": 0.0, + "num_input_tokens_seen": 12744968, + "step": 20900 + }, + { + "epoch": 5.76530612244898, + "grad_norm": 0.0032360360492020845, + "learning_rate": 2.269047171024414e-05, + "loss": 0.0, + "num_input_tokens_seen": 12747944, + "step": 20905 + }, + { + "epoch": 5.766685052399338, + "grad_norm": 7.067308615660295e-05, + "learning_rate": 2.2678490002687987e-05, + "loss": 0.0, + "num_input_tokens_seen": 12751176, + "step": 20910 + }, + { + "epoch": 5.768063982349696, + "grad_norm": 7.075266330502927e-05, + "learning_rate": 2.266650883299239e-05, + "loss": 0.0, + "num_input_tokens_seen": 12754216, + "step": 20915 + }, + { + "epoch": 5.769442912300055, + "grad_norm": 22.306427001953125, + "learning_rate": 2.265452820393319e-05, + "loss": 0.0731, + "num_input_tokens_seen": 12757000, + "step": 20920 + }, + { + "epoch": 5.770821842250414, + "grad_norm": 3.635563189163804e-05, + "learning_rate": 2.264254811828614e-05, + "loss": 0.0, + "num_input_tokens_seen": 12759624, + "step": 20925 + }, + { + "epoch": 5.772200772200772, + "grad_norm": 2.5741623176145367e-05, + "learning_rate": 2.2630568578826855e-05, + "loss": 0.0, + "num_input_tokens_seen": 12763592, + "step": 20930 + }, + { + "epoch": 5.773579702151131, + "grad_norm": 0.000566591857932508, + "learning_rate": 2.261858958833081e-05, + "loss": 0.0, + "num_input_tokens_seen": 12766888, + "step": 20935 + }, + { + "epoch": 5.774958632101489, + "grad_norm": 0.00012072546815034002, + "learning_rate": 2.2606611149573386e-05, + "loss": 0.0, + "num_input_tokens_seen": 12769288, + "step": 20940 + }, + { + "epoch": 5.776337562051848, + "grad_norm": 0.20453742146492004, + "learning_rate": 2.2594633265329797e-05, + "loss": 0.0002, + "num_input_tokens_seen": 12771656, + "step": 20945 + }, + { + "epoch": 5.777716492002206, + "grad_norm": 7.781508611515164e-05, + "learning_rate": 2.2582655938375147e-05, + "loss": 0.0, + "num_input_tokens_seen": 12775208, + "step": 20950 + }, + { + "epoch": 5.779095421952565, + "grad_norm": 0.002385783242061734, + "learning_rate": 2.2570679171484422e-05, + "loss": 0.0, + "num_input_tokens_seen": 12779304, + "step": 20955 + }, + { + "epoch": 5.780474351902924, + "grad_norm": 0.00034332586801610887, + "learning_rate": 2.255870296743246e-05, + "loss": 0.0, + "num_input_tokens_seen": 12781928, + "step": 20960 + }, + { + "epoch": 5.781853281853282, + "grad_norm": 6.113573817856377e-06, + "learning_rate": 2.2546727328993976e-05, + "loss": 0.0, + "num_input_tokens_seen": 12784040, + "step": 20965 + }, + { + "epoch": 5.78323221180364, + "grad_norm": 0.0010231139604002237, + "learning_rate": 2.2534752258943564e-05, + "loss": 0.0, + "num_input_tokens_seen": 12786472, + "step": 20970 + }, + { + "epoch": 5.784611141753999, + "grad_norm": 1.1078330317104701e-05, + "learning_rate": 2.252277776005565e-05, + "loss": 0.0004, + "num_input_tokens_seen": 12789256, + "step": 20975 + }, + { + "epoch": 5.785990071704357, + "grad_norm": 8.290194818982854e-05, + "learning_rate": 2.251080383510459e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12792488, + "step": 20980 + }, + { + "epoch": 5.7873690016547155, + "grad_norm": 0.12109862267971039, + "learning_rate": 2.249883048686454e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12796456, + "step": 20985 + }, + { + "epoch": 5.788747931605075, + "grad_norm": 1.1497282685013488e-05, + "learning_rate": 2.2486857718109558e-05, + "loss": 0.0, + "num_input_tokens_seen": 12798568, + "step": 20990 + }, + { + "epoch": 5.790126861555433, + "grad_norm": 8.522561256540939e-05, + "learning_rate": 2.2474885531613574e-05, + "loss": 0.0, + "num_input_tokens_seen": 12800872, + "step": 20995 + }, + { + "epoch": 5.7915057915057915, + "grad_norm": 8.710794645594433e-06, + "learning_rate": 2.246291393015037e-05, + "loss": 0.0, + "num_input_tokens_seen": 12803368, + "step": 21000 + }, + { + "epoch": 5.79288472145615, + "grad_norm": 0.019703662022948265, + "learning_rate": 2.245094291649358e-05, + "loss": 0.0, + "num_input_tokens_seen": 12806440, + "step": 21005 + }, + { + "epoch": 5.794263651406508, + "grad_norm": 8.480051292281132e-06, + "learning_rate": 2.2438972493416732e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12808392, + "step": 21010 + }, + { + "epoch": 5.7956425813568675, + "grad_norm": 2.834114638972096e-05, + "learning_rate": 2.242700266369319e-05, + "loss": 0.0, + "num_input_tokens_seen": 12812072, + "step": 21015 + }, + { + "epoch": 5.797021511307226, + "grad_norm": 5.397303175413981e-05, + "learning_rate": 2.2415033430096204e-05, + "loss": 0.0, + "num_input_tokens_seen": 12815048, + "step": 21020 + }, + { + "epoch": 5.798400441257584, + "grad_norm": 0.00021335230849217623, + "learning_rate": 2.2403064795398863e-05, + "loss": 0.0002, + "num_input_tokens_seen": 12817096, + "step": 21025 + }, + { + "epoch": 5.799779371207943, + "grad_norm": 0.00010102834494318813, + "learning_rate": 2.239109676237413e-05, + "loss": 0.0, + "num_input_tokens_seen": 12820648, + "step": 21030 + }, + { + "epoch": 5.801158301158301, + "grad_norm": 1.1180908586538862e-05, + "learning_rate": 2.237912933379483e-05, + "loss": 0.0, + "num_input_tokens_seen": 12823208, + "step": 21035 + }, + { + "epoch": 5.802537231108659, + "grad_norm": 0.0014174493262544274, + "learning_rate": 2.236716251243365e-05, + "loss": 0.0, + "num_input_tokens_seen": 12826152, + "step": 21040 + }, + { + "epoch": 5.803916161059018, + "grad_norm": 0.0008711767732165754, + "learning_rate": 2.2355196301063104e-05, + "loss": 0.0, + "num_input_tokens_seen": 12828840, + "step": 21045 + }, + { + "epoch": 5.805295091009377, + "grad_norm": 9.143425268121064e-05, + "learning_rate": 2.234323070245563e-05, + "loss": 0.0, + "num_input_tokens_seen": 12831176, + "step": 21050 + }, + { + "epoch": 5.806674020959735, + "grad_norm": 3.0498127671307884e-05, + "learning_rate": 2.2331265719383447e-05, + "loss": 0.0, + "num_input_tokens_seen": 12834408, + "step": 21055 + }, + { + "epoch": 5.808052950910094, + "grad_norm": 0.0007618811796419322, + "learning_rate": 2.2319301354618706e-05, + "loss": 0.0, + "num_input_tokens_seen": 12837192, + "step": 21060 + }, + { + "epoch": 5.809431880860452, + "grad_norm": 1.1870605703734327e-05, + "learning_rate": 2.2307337610933355e-05, + "loss": 0.0, + "num_input_tokens_seen": 12839656, + "step": 21065 + }, + { + "epoch": 5.8108108108108105, + "grad_norm": 0.0645868331193924, + "learning_rate": 2.229537449109922e-05, + "loss": 0.0, + "num_input_tokens_seen": 12842120, + "step": 21070 + }, + { + "epoch": 5.81218974076117, + "grad_norm": 3.672714956337586e-05, + "learning_rate": 2.2283411997888005e-05, + "loss": 0.0, + "num_input_tokens_seen": 12845832, + "step": 21075 + }, + { + "epoch": 5.813568670711528, + "grad_norm": 1.0775153896247502e-05, + "learning_rate": 2.2271450134071233e-05, + "loss": 0.0015, + "num_input_tokens_seen": 12848200, + "step": 21080 + }, + { + "epoch": 5.8149476006618865, + "grad_norm": 0.0007987075368873775, + "learning_rate": 2.225948890242029e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12851592, + "step": 21085 + }, + { + "epoch": 5.816326530612245, + "grad_norm": 3.4602896903379587e-06, + "learning_rate": 2.224752830570644e-05, + "loss": 0.0, + "num_input_tokens_seen": 12854568, + "step": 21090 + }, + { + "epoch": 5.817705460562603, + "grad_norm": 0.0005499697872437537, + "learning_rate": 2.2235568346700763e-05, + "loss": 0.0, + "num_input_tokens_seen": 12859048, + "step": 21095 + }, + { + "epoch": 5.819084390512962, + "grad_norm": 0.00014240099699236453, + "learning_rate": 2.2223609028174213e-05, + "loss": 0.0, + "num_input_tokens_seen": 12862344, + "step": 21100 + }, + { + "epoch": 5.82046332046332, + "grad_norm": 2.8270760594750755e-06, + "learning_rate": 2.221165035289761e-05, + "loss": 0.0, + "num_input_tokens_seen": 12865768, + "step": 21105 + }, + { + "epoch": 5.821842250413679, + "grad_norm": 0.009565526619553566, + "learning_rate": 2.2199692323641584e-05, + "loss": 0.0, + "num_input_tokens_seen": 12868520, + "step": 21110 + }, + { + "epoch": 5.823221180364038, + "grad_norm": 0.0011087956372648478, + "learning_rate": 2.2187734943176663e-05, + "loss": 0.0, + "num_input_tokens_seen": 12871176, + "step": 21115 + }, + { + "epoch": 5.824600110314396, + "grad_norm": 2.1105074665683787e-06, + "learning_rate": 2.2175778214273185e-05, + "loss": 0.0, + "num_input_tokens_seen": 12873352, + "step": 21120 + }, + { + "epoch": 5.825979040264754, + "grad_norm": 0.07397367805242538, + "learning_rate": 2.2163822139701347e-05, + "loss": 0.0002, + "num_input_tokens_seen": 12876520, + "step": 21125 + }, + { + "epoch": 5.827357970215113, + "grad_norm": 2.562363624747377e-05, + "learning_rate": 2.215186672223121e-05, + "loss": 0.0, + "num_input_tokens_seen": 12879176, + "step": 21130 + }, + { + "epoch": 5.828736900165472, + "grad_norm": 7.072991593304323e-06, + "learning_rate": 2.2139911964632675e-05, + "loss": 0.0019, + "num_input_tokens_seen": 12882952, + "step": 21135 + }, + { + "epoch": 5.8301158301158305, + "grad_norm": 1.341673851129599e-05, + "learning_rate": 2.2127957869675474e-05, + "loss": 0.0, + "num_input_tokens_seen": 12885928, + "step": 21140 + }, + { + "epoch": 5.831494760066189, + "grad_norm": 3.099379682680592e-06, + "learning_rate": 2.2116004440129216e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12888232, + "step": 21145 + }, + { + "epoch": 5.832873690016547, + "grad_norm": 2.116156565534766e-06, + "learning_rate": 2.2104051678763323e-05, + "loss": 0.0, + "num_input_tokens_seen": 12891272, + "step": 21150 + }, + { + "epoch": 5.834252619966906, + "grad_norm": 7.420494057441829e-06, + "learning_rate": 2.20920995883471e-05, + "loss": 0.0, + "num_input_tokens_seen": 12894120, + "step": 21155 + }, + { + "epoch": 5.835631549917264, + "grad_norm": 0.017223723232746124, + "learning_rate": 2.2080148171649655e-05, + "loss": 0.0, + "num_input_tokens_seen": 12898024, + "step": 21160 + }, + { + "epoch": 5.837010479867622, + "grad_norm": 3.0174180210451595e-05, + "learning_rate": 2.2068197431439957e-05, + "loss": 0.0, + "num_input_tokens_seen": 12900392, + "step": 21165 + }, + { + "epoch": 5.838389409817982, + "grad_norm": 1.1243267181271221e-05, + "learning_rate": 2.2056247370486833e-05, + "loss": 0.0, + "num_input_tokens_seen": 12902856, + "step": 21170 + }, + { + "epoch": 5.83976833976834, + "grad_norm": 0.0009759026579558849, + "learning_rate": 2.204429799155895e-05, + "loss": 0.0, + "num_input_tokens_seen": 12906824, + "step": 21175 + }, + { + "epoch": 5.841147269718698, + "grad_norm": 0.016492342576384544, + "learning_rate": 2.2032349297424777e-05, + "loss": 0.0, + "num_input_tokens_seen": 12910152, + "step": 21180 + }, + { + "epoch": 5.842526199669057, + "grad_norm": 0.0017374510644003749, + "learning_rate": 2.202040129085269e-05, + "loss": 0.0, + "num_input_tokens_seen": 12912744, + "step": 21185 + }, + { + "epoch": 5.843905129619415, + "grad_norm": 2.81644406641135e-05, + "learning_rate": 2.200845397461085e-05, + "loss": 0.0, + "num_input_tokens_seen": 12916264, + "step": 21190 + }, + { + "epoch": 5.8452840595697735, + "grad_norm": 1.1078577699663583e-05, + "learning_rate": 2.199650735146727e-05, + "loss": 0.0, + "num_input_tokens_seen": 12918984, + "step": 21195 + }, + { + "epoch": 5.846662989520133, + "grad_norm": 2.682314152480103e-05, + "learning_rate": 2.1984561424189827e-05, + "loss": 0.0, + "num_input_tokens_seen": 12921736, + "step": 21200 + }, + { + "epoch": 5.848041919470491, + "grad_norm": 7.727599586360157e-05, + "learning_rate": 2.1972616195546223e-05, + "loss": 0.0, + "num_input_tokens_seen": 12924296, + "step": 21205 + }, + { + "epoch": 5.8494208494208495, + "grad_norm": 8.807650010567158e-05, + "learning_rate": 2.1960671668303986e-05, + "loss": 0.1021, + "num_input_tokens_seen": 12927400, + "step": 21210 + }, + { + "epoch": 5.850799779371208, + "grad_norm": 7.651480700587854e-05, + "learning_rate": 2.1948727845230504e-05, + "loss": 0.0006, + "num_input_tokens_seen": 12929736, + "step": 21215 + }, + { + "epoch": 5.852178709321566, + "grad_norm": 5.334191882866435e-06, + "learning_rate": 2.193678472909297e-05, + "loss": 0.0, + "num_input_tokens_seen": 12932744, + "step": 21220 + }, + { + "epoch": 5.853557639271925, + "grad_norm": 3.0162873372319154e-05, + "learning_rate": 2.192484232265845e-05, + "loss": 0.0, + "num_input_tokens_seen": 12935720, + "step": 21225 + }, + { + "epoch": 5.854936569222284, + "grad_norm": 1.1643865036603529e-05, + "learning_rate": 2.1912900628693818e-05, + "loss": 0.0, + "num_input_tokens_seen": 12938152, + "step": 21230 + }, + { + "epoch": 5.856315499172642, + "grad_norm": 5.94608936808072e-05, + "learning_rate": 2.19009596499658e-05, + "loss": 0.0, + "num_input_tokens_seen": 12940872, + "step": 21235 + }, + { + "epoch": 5.857694429123001, + "grad_norm": 0.0007861661142669618, + "learning_rate": 2.1889019389240945e-05, + "loss": 0.0, + "num_input_tokens_seen": 12943624, + "step": 21240 + }, + { + "epoch": 5.859073359073359, + "grad_norm": 0.00010319207649445161, + "learning_rate": 2.1877079849285635e-05, + "loss": 0.0, + "num_input_tokens_seen": 12946248, + "step": 21245 + }, + { + "epoch": 5.860452289023717, + "grad_norm": 0.0007867299136705697, + "learning_rate": 2.186514103286611e-05, + "loss": 0.0, + "num_input_tokens_seen": 12948680, + "step": 21250 + }, + { + "epoch": 5.861831218974076, + "grad_norm": 3.073283369303681e-05, + "learning_rate": 2.18532029427484e-05, + "loss": 0.0, + "num_input_tokens_seen": 12951592, + "step": 21255 + }, + { + "epoch": 5.863210148924434, + "grad_norm": 0.00018757343059405684, + "learning_rate": 2.1841265581698394e-05, + "loss": 0.0, + "num_input_tokens_seen": 12954568, + "step": 21260 + }, + { + "epoch": 5.864589078874793, + "grad_norm": 0.00023298486485145986, + "learning_rate": 2.182932895248182e-05, + "loss": 0.0, + "num_input_tokens_seen": 12956808, + "step": 21265 + }, + { + "epoch": 5.865968008825152, + "grad_norm": 4.526306383922929e-06, + "learning_rate": 2.1817393057864212e-05, + "loss": 0.0, + "num_input_tokens_seen": 12960456, + "step": 21270 + }, + { + "epoch": 5.86734693877551, + "grad_norm": 0.00012783448619302362, + "learning_rate": 2.1805457900610938e-05, + "loss": 0.0, + "num_input_tokens_seen": 12962920, + "step": 21275 + }, + { + "epoch": 5.8687258687258685, + "grad_norm": 5.086006785859354e-05, + "learning_rate": 2.1793523483487226e-05, + "loss": 0.0, + "num_input_tokens_seen": 12965512, + "step": 21280 + }, + { + "epoch": 5.870104798676227, + "grad_norm": 1.2556457477330696e-05, + "learning_rate": 2.1781589809258095e-05, + "loss": 0.0, + "num_input_tokens_seen": 12968328, + "step": 21285 + }, + { + "epoch": 5.871483728626586, + "grad_norm": 0.0001916462351800874, + "learning_rate": 2.1769656880688398e-05, + "loss": 0.0, + "num_input_tokens_seen": 12970504, + "step": 21290 + }, + { + "epoch": 5.872862658576945, + "grad_norm": 3.425834393055993e-06, + "learning_rate": 2.175772470054284e-05, + "loss": 0.0, + "num_input_tokens_seen": 12973160, + "step": 21295 + }, + { + "epoch": 5.874241588527303, + "grad_norm": 5.088319812784903e-05, + "learning_rate": 2.1745793271585917e-05, + "loss": 0.0, + "num_input_tokens_seen": 12975848, + "step": 21300 + }, + { + "epoch": 5.875620518477661, + "grad_norm": 2.3449596483260393e-05, + "learning_rate": 2.173386259658199e-05, + "loss": 0.0, + "num_input_tokens_seen": 12978504, + "step": 21305 + }, + { + "epoch": 5.87699944842802, + "grad_norm": 8.669699127494823e-06, + "learning_rate": 2.1721932678295216e-05, + "loss": 0.0, + "num_input_tokens_seen": 12981320, + "step": 21310 + }, + { + "epoch": 5.878378378378378, + "grad_norm": 0.0017708772793412209, + "learning_rate": 2.1710003519489578e-05, + "loss": 0.0, + "num_input_tokens_seen": 12985256, + "step": 21315 + }, + { + "epoch": 5.879757308328736, + "grad_norm": 4.7653857109253295e-06, + "learning_rate": 2.169807512292891e-05, + "loss": 0.0, + "num_input_tokens_seen": 12987528, + "step": 21320 + }, + { + "epoch": 5.881136238279096, + "grad_norm": 6.03282023803331e-06, + "learning_rate": 2.1686147491376838e-05, + "loss": 0.0, + "num_input_tokens_seen": 12990600, + "step": 21325 + }, + { + "epoch": 5.882515168229454, + "grad_norm": 4.035353867948288e-06, + "learning_rate": 2.1674220627596812e-05, + "loss": 0.0, + "num_input_tokens_seen": 12993512, + "step": 21330 + }, + { + "epoch": 5.8838940981798125, + "grad_norm": 0.002219711896032095, + "learning_rate": 2.166229453435214e-05, + "loss": 0.0, + "num_input_tokens_seen": 12996872, + "step": 21335 + }, + { + "epoch": 5.885273028130171, + "grad_norm": 0.00027739963843487203, + "learning_rate": 2.16503692144059e-05, + "loss": 0.0001, + "num_input_tokens_seen": 12999944, + "step": 21340 + }, + { + "epoch": 5.886651958080529, + "grad_norm": 1.1853826435981318e-06, + "learning_rate": 2.1638444670521035e-05, + "loss": 0.0, + "num_input_tokens_seen": 13002760, + "step": 21345 + }, + { + "epoch": 5.8880308880308885, + "grad_norm": 0.0017154620727524161, + "learning_rate": 2.1626520905460296e-05, + "loss": 0.0, + "num_input_tokens_seen": 13005736, + "step": 21350 + }, + { + "epoch": 5.889409817981247, + "grad_norm": 1.8710907170316204e-06, + "learning_rate": 2.1614597921986225e-05, + "loss": 0.0, + "num_input_tokens_seen": 13009576, + "step": 21355 + }, + { + "epoch": 5.890788747931605, + "grad_norm": 1.3882305211154744e-05, + "learning_rate": 2.1602675722861235e-05, + "loss": 0.0, + "num_input_tokens_seen": 13012872, + "step": 21360 + }, + { + "epoch": 5.892167677881964, + "grad_norm": 0.00040601182263344526, + "learning_rate": 2.159075431084751e-05, + "loss": 0.0, + "num_input_tokens_seen": 13015688, + "step": 21365 + }, + { + "epoch": 5.893546607832322, + "grad_norm": 6.239080539671704e-05, + "learning_rate": 2.157883368870706e-05, + "loss": 0.0, + "num_input_tokens_seen": 13018568, + "step": 21370 + }, + { + "epoch": 5.89492553778268, + "grad_norm": 9.130348189501092e-05, + "learning_rate": 2.156691385920175e-05, + "loss": 0.0, + "num_input_tokens_seen": 13021256, + "step": 21375 + }, + { + "epoch": 5.896304467733039, + "grad_norm": 0.001093538012355566, + "learning_rate": 2.1554994825093218e-05, + "loss": 0.0, + "num_input_tokens_seen": 13025000, + "step": 21380 + }, + { + "epoch": 5.897683397683398, + "grad_norm": 2.0420879081939347e-05, + "learning_rate": 2.1543076589142923e-05, + "loss": 0.0, + "num_input_tokens_seen": 13028968, + "step": 21385 + }, + { + "epoch": 5.899062327633756, + "grad_norm": 1.665992385824211e-06, + "learning_rate": 2.1531159154112172e-05, + "loss": 0.0, + "num_input_tokens_seen": 13031464, + "step": 21390 + }, + { + "epoch": 5.900441257584115, + "grad_norm": 0.0019800930749624968, + "learning_rate": 2.151924252276204e-05, + "loss": 0.0, + "num_input_tokens_seen": 13033960, + "step": 21395 + }, + { + "epoch": 5.901820187534473, + "grad_norm": 0.005967181641608477, + "learning_rate": 2.1507326697853465e-05, + "loss": 0.0, + "num_input_tokens_seen": 13036936, + "step": 21400 + }, + { + "epoch": 5.9031991174848315, + "grad_norm": 1.769154732755851e-05, + "learning_rate": 2.149541168214715e-05, + "loss": 0.0, + "num_input_tokens_seen": 13040168, + "step": 21405 + }, + { + "epoch": 5.904578047435191, + "grad_norm": 0.0010392649564892054, + "learning_rate": 2.148349747840364e-05, + "loss": 0.0, + "num_input_tokens_seen": 13043528, + "step": 21410 + }, + { + "epoch": 5.905956977385549, + "grad_norm": 0.00015122193144634366, + "learning_rate": 2.147158408938329e-05, + "loss": 0.0, + "num_input_tokens_seen": 13045544, + "step": 21415 + }, + { + "epoch": 5.9073359073359075, + "grad_norm": 7.034704140096437e-06, + "learning_rate": 2.1459671517846265e-05, + "loss": 0.0, + "num_input_tokens_seen": 13047688, + "step": 21420 + }, + { + "epoch": 5.908714837286266, + "grad_norm": 1.4664637092209887e-05, + "learning_rate": 2.1447759766552518e-05, + "loss": 0.0, + "num_input_tokens_seen": 13051784, + "step": 21425 + }, + { + "epoch": 5.910093767236624, + "grad_norm": 1.2298793308218592e-06, + "learning_rate": 2.143584883826185e-05, + "loss": 0.0, + "num_input_tokens_seen": 13055368, + "step": 21430 + }, + { + "epoch": 5.911472697186983, + "grad_norm": 1.1736476153600961e-05, + "learning_rate": 2.1423938735733852e-05, + "loss": 0.0, + "num_input_tokens_seen": 13057992, + "step": 21435 + }, + { + "epoch": 5.912851627137341, + "grad_norm": 8.154018360073678e-06, + "learning_rate": 2.14120294617279e-05, + "loss": 0.0, + "num_input_tokens_seen": 13061352, + "step": 21440 + }, + { + "epoch": 5.9142305570877, + "grad_norm": 3.969032604800304e-06, + "learning_rate": 2.1400121019003228e-05, + "loss": 0.0, + "num_input_tokens_seen": 13065576, + "step": 21445 + }, + { + "epoch": 5.915609487038059, + "grad_norm": 3.2909156288951635e-06, + "learning_rate": 2.138821341031883e-05, + "loss": 0.0, + "num_input_tokens_seen": 13068456, + "step": 21450 + }, + { + "epoch": 5.916988416988417, + "grad_norm": 1.5057272548801848e-06, + "learning_rate": 2.1376306638433558e-05, + "loss": 0.0, + "num_input_tokens_seen": 13070984, + "step": 21455 + }, + { + "epoch": 5.918367346938775, + "grad_norm": 4.151832399656996e-05, + "learning_rate": 2.1364400706106016e-05, + "loss": 0.0, + "num_input_tokens_seen": 13073896, + "step": 21460 + }, + { + "epoch": 5.919746276889134, + "grad_norm": 7.047321560094133e-05, + "learning_rate": 2.1352495616094638e-05, + "loss": 0.0, + "num_input_tokens_seen": 13076616, + "step": 21465 + }, + { + "epoch": 5.921125206839492, + "grad_norm": 1.8165142137149815e-06, + "learning_rate": 2.1340591371157676e-05, + "loss": 0.0, + "num_input_tokens_seen": 13079688, + "step": 21470 + }, + { + "epoch": 5.922504136789851, + "grad_norm": 0.0010464488295838237, + "learning_rate": 2.1328687974053157e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13082536, + "step": 21475 + }, + { + "epoch": 5.92388306674021, + "grad_norm": 4.620951585820876e-06, + "learning_rate": 2.131678542753894e-05, + "loss": 0.0, + "num_input_tokens_seen": 13085960, + "step": 21480 + }, + { + "epoch": 5.925261996690568, + "grad_norm": 0.0005388995632529259, + "learning_rate": 2.1304883734372663e-05, + "loss": 0.0, + "num_input_tokens_seen": 13089896, + "step": 21485 + }, + { + "epoch": 5.926640926640927, + "grad_norm": 0.00021738189388997853, + "learning_rate": 2.129298289731178e-05, + "loss": 0.0, + "num_input_tokens_seen": 13092712, + "step": 21490 + }, + { + "epoch": 5.928019856591285, + "grad_norm": 2.4569114884798182e-06, + "learning_rate": 2.128108291911356e-05, + "loss": 0.0, + "num_input_tokens_seen": 13095240, + "step": 21495 + }, + { + "epoch": 5.929398786541643, + "grad_norm": 4.951683877152391e-06, + "learning_rate": 2.1269183802535044e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13097672, + "step": 21500 + }, + { + "epoch": 5.930777716492003, + "grad_norm": 0.0004223448922857642, + "learning_rate": 2.1257285550333077e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13100520, + "step": 21505 + }, + { + "epoch": 5.932156646442361, + "grad_norm": 1.6406931990786688e-06, + "learning_rate": 2.1245388165264336e-05, + "loss": 0.0, + "num_input_tokens_seen": 13103304, + "step": 21510 + }, + { + "epoch": 5.933535576392719, + "grad_norm": 0.00040065133362077177, + "learning_rate": 2.1233491650085252e-05, + "loss": 0.0, + "num_input_tokens_seen": 13106312, + "step": 21515 + }, + { + "epoch": 5.934914506343078, + "grad_norm": 9.488232899457216e-05, + "learning_rate": 2.1221596007552084e-05, + "loss": 0.0, + "num_input_tokens_seen": 13110056, + "step": 21520 + }, + { + "epoch": 5.936293436293436, + "grad_norm": 1.3618150660477113e-06, + "learning_rate": 2.1209701240420894e-05, + "loss": 0.0, + "num_input_tokens_seen": 13112552, + "step": 21525 + }, + { + "epoch": 5.9376723662437945, + "grad_norm": 0.00011792072473326698, + "learning_rate": 2.119780735144752e-05, + "loss": 0.0, + "num_input_tokens_seen": 13117448, + "step": 21530 + }, + { + "epoch": 5.939051296194153, + "grad_norm": 0.005740241147577763, + "learning_rate": 2.1185914343387595e-05, + "loss": 0.0, + "num_input_tokens_seen": 13119944, + "step": 21535 + }, + { + "epoch": 5.940430226144512, + "grad_norm": 1.3156267414160538e-05, + "learning_rate": 2.1174022218996575e-05, + "loss": 0.0, + "num_input_tokens_seen": 13122472, + "step": 21540 + }, + { + "epoch": 5.9418091560948705, + "grad_norm": 2.023769638981321e-06, + "learning_rate": 2.116213098102968e-05, + "loss": 0.0, + "num_input_tokens_seen": 13126376, + "step": 21545 + }, + { + "epoch": 5.943188086045229, + "grad_norm": 6.770811887690797e-05, + "learning_rate": 2.1150240632241945e-05, + "loss": 0.0, + "num_input_tokens_seen": 13129160, + "step": 21550 + }, + { + "epoch": 5.944567015995587, + "grad_norm": 1.2177368262200616e-05, + "learning_rate": 2.1138351175388207e-05, + "loss": 0.0, + "num_input_tokens_seen": 13131656, + "step": 21555 + }, + { + "epoch": 5.945945945945946, + "grad_norm": 1.7240747638425091e-06, + "learning_rate": 2.112646261322306e-05, + "loss": 0.0, + "num_input_tokens_seen": 13134632, + "step": 21560 + }, + { + "epoch": 5.947324875896305, + "grad_norm": 0.000276139413472265, + "learning_rate": 2.111457494850093e-05, + "loss": 0.0, + "num_input_tokens_seen": 13137832, + "step": 21565 + }, + { + "epoch": 5.948703805846663, + "grad_norm": 1.4222288200471667e-06, + "learning_rate": 2.110268818397601e-05, + "loss": 0.0, + "num_input_tokens_seen": 13141352, + "step": 21570 + }, + { + "epoch": 5.950082735797022, + "grad_norm": 1.4608205674448982e-05, + "learning_rate": 2.1090802322402285e-05, + "loss": 0.1229, + "num_input_tokens_seen": 13143752, + "step": 21575 + }, + { + "epoch": 5.95146166574738, + "grad_norm": 0.00573309138417244, + "learning_rate": 2.1078917366533554e-05, + "loss": 0.0, + "num_input_tokens_seen": 13146632, + "step": 21580 + }, + { + "epoch": 5.952840595697738, + "grad_norm": 0.002183592412620783, + "learning_rate": 2.1067033319123383e-05, + "loss": 0.0, + "num_input_tokens_seen": 13149544, + "step": 21585 + }, + { + "epoch": 5.954219525648097, + "grad_norm": 6.650049726886209e-06, + "learning_rate": 2.1055150182925138e-05, + "loss": 0.0, + "num_input_tokens_seen": 13152392, + "step": 21590 + }, + { + "epoch": 5.955598455598455, + "grad_norm": 0.0030907364562153816, + "learning_rate": 2.1043267960691977e-05, + "loss": 0.0, + "num_input_tokens_seen": 13154664, + "step": 21595 + }, + { + "epoch": 5.956977385548814, + "grad_norm": 0.0010089920833706856, + "learning_rate": 2.1031386655176815e-05, + "loss": 0.1854, + "num_input_tokens_seen": 13157512, + "step": 21600 + }, + { + "epoch": 5.958356315499173, + "grad_norm": 53.327911376953125, + "learning_rate": 2.1019506269132417e-05, + "loss": 0.0488, + "num_input_tokens_seen": 13159944, + "step": 21605 + }, + { + "epoch": 5.959735245449531, + "grad_norm": 0.005673205479979515, + "learning_rate": 2.1007626805311272e-05, + "loss": 0.0, + "num_input_tokens_seen": 13162984, + "step": 21610 + }, + { + "epoch": 5.9611141753998895, + "grad_norm": 0.006962941959500313, + "learning_rate": 2.099574826646568e-05, + "loss": 0.0, + "num_input_tokens_seen": 13166600, + "step": 21615 + }, + { + "epoch": 5.962493105350248, + "grad_norm": 0.000330706185195595, + "learning_rate": 2.0983870655347743e-05, + "loss": 0.0, + "num_input_tokens_seen": 13170024, + "step": 21620 + }, + { + "epoch": 5.963872035300607, + "grad_norm": 0.0005831493763253093, + "learning_rate": 2.097199397470933e-05, + "loss": 0.0, + "num_input_tokens_seen": 13172584, + "step": 21625 + }, + { + "epoch": 5.9652509652509655, + "grad_norm": 0.0020441035740077496, + "learning_rate": 2.096011822730208e-05, + "loss": 0.0, + "num_input_tokens_seen": 13175016, + "step": 21630 + }, + { + "epoch": 5.966629895201324, + "grad_norm": 0.0007431079866364598, + "learning_rate": 2.094824341587746e-05, + "loss": 0.0, + "num_input_tokens_seen": 13178984, + "step": 21635 + }, + { + "epoch": 5.968008825151682, + "grad_norm": 0.03981265053153038, + "learning_rate": 2.0936369543186668e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13181640, + "step": 21640 + }, + { + "epoch": 5.969387755102041, + "grad_norm": 0.0027489885687828064, + "learning_rate": 2.0924496611980734e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13184456, + "step": 21645 + }, + { + "epoch": 5.970766685052399, + "grad_norm": 0.0006314139463938773, + "learning_rate": 2.0912624625010425e-05, + "loss": 0.0, + "num_input_tokens_seen": 13187272, + "step": 21650 + }, + { + "epoch": 5.972145615002757, + "grad_norm": 0.002148849656805396, + "learning_rate": 2.0900753585026324e-05, + "loss": 0.1834, + "num_input_tokens_seen": 13190664, + "step": 21655 + }, + { + "epoch": 5.973524544953117, + "grad_norm": 0.007998395711183548, + "learning_rate": 2.088888349477877e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13193864, + "step": 21660 + }, + { + "epoch": 5.974903474903475, + "grad_norm": 0.00066443404648453, + "learning_rate": 2.0877014357017908e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13197800, + "step": 21665 + }, + { + "epoch": 5.976282404853833, + "grad_norm": 0.007833038456737995, + "learning_rate": 2.0865146174493622e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13200232, + "step": 21670 + }, + { + "epoch": 5.977661334804192, + "grad_norm": 0.0024856794625520706, + "learning_rate": 2.0853278949955625e-05, + "loss": 0.0002, + "num_input_tokens_seen": 13202632, + "step": 21675 + }, + { + "epoch": 5.97904026475455, + "grad_norm": 0.007623368874192238, + "learning_rate": 2.084141268615336e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13205896, + "step": 21680 + }, + { + "epoch": 5.9804191947049095, + "grad_norm": 0.4952266812324524, + "learning_rate": 2.0829547385836094e-05, + "loss": 0.0007, + "num_input_tokens_seen": 13209032, + "step": 21685 + }, + { + "epoch": 5.981798124655268, + "grad_norm": 0.00029558208188973367, + "learning_rate": 2.0817683051752828e-05, + "loss": 0.0002, + "num_input_tokens_seen": 13212168, + "step": 21690 + }, + { + "epoch": 5.983177054605626, + "grad_norm": 0.003722110530361533, + "learning_rate": 2.080581968665236e-05, + "loss": 0.0002, + "num_input_tokens_seen": 13217352, + "step": 21695 + }, + { + "epoch": 5.984555984555985, + "grad_norm": 0.00018969998927786946, + "learning_rate": 2.0793957293283274e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13221128, + "step": 21700 + }, + { + "epoch": 5.985934914506343, + "grad_norm": 0.009527960792183876, + "learning_rate": 2.078209587439391e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13225928, + "step": 21705 + }, + { + "epoch": 5.987313844456701, + "grad_norm": 0.005556968972086906, + "learning_rate": 2.0770235432732374e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13228456, + "step": 21710 + }, + { + "epoch": 5.98869277440706, + "grad_norm": 0.005665516946464777, + "learning_rate": 2.0758375971046583e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13234152, + "step": 21715 + }, + { + "epoch": 5.990071704357419, + "grad_norm": 0.000660562131088227, + "learning_rate": 2.074651749208419e-05, + "loss": 0.0043, + "num_input_tokens_seen": 13237032, + "step": 21720 + }, + { + "epoch": 5.991450634307777, + "grad_norm": 0.0035329724196344614, + "learning_rate": 2.073465999859263e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13239688, + "step": 21725 + }, + { + "epoch": 5.992829564258136, + "grad_norm": 0.0007427586824633181, + "learning_rate": 2.0722803493319133e-05, + "loss": 0.0, + "num_input_tokens_seen": 13242536, + "step": 21730 + }, + { + "epoch": 5.994208494208494, + "grad_norm": 0.00136480329092592, + "learning_rate": 2.0710947979010662e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13245096, + "step": 21735 + }, + { + "epoch": 5.9955874241588525, + "grad_norm": 0.0002645708154886961, + "learning_rate": 2.0699093458413984e-05, + "loss": 0.0, + "num_input_tokens_seen": 13247816, + "step": 21740 + }, + { + "epoch": 5.996966354109212, + "grad_norm": 9.596405288903043e-05, + "learning_rate": 2.0687239934275617e-05, + "loss": 0.0, + "num_input_tokens_seen": 13251240, + "step": 21745 + }, + { + "epoch": 5.99834528405957, + "grad_norm": 0.00016876636072993279, + "learning_rate": 2.067538740934184e-05, + "loss": 0.0, + "num_input_tokens_seen": 13255176, + "step": 21750 + }, + { + "epoch": 5.9997242140099285, + "grad_norm": 0.0002607361238915473, + "learning_rate": 2.0663535886358734e-05, + "loss": 0.0, + "num_input_tokens_seen": 13257512, + "step": 21755 + }, + { + "epoch": 6.0, + "eval_loss": 0.23504623770713806, + "eval_runtime": 28.4839, + "eval_samples_per_second": 56.593, + "eval_steps_per_second": 14.148, + "num_input_tokens_seen": 13257624, + "step": 21756 + }, + { + "epoch": 6.001103143960287, + "grad_norm": 0.003034877823665738, + "learning_rate": 2.065168536807211e-05, + "loss": 0.0, + "num_input_tokens_seen": 13259736, + "step": 21760 + }, + { + "epoch": 6.002482073910645, + "grad_norm": 7.161432586144656e-05, + "learning_rate": 2.0639835857227562e-05, + "loss": 0.0, + "num_input_tokens_seen": 13262136, + "step": 21765 + }, + { + "epoch": 6.003861003861004, + "grad_norm": 0.0011049899039790034, + "learning_rate": 2.062798735657047e-05, + "loss": 0.0, + "num_input_tokens_seen": 13265304, + "step": 21770 + }, + { + "epoch": 6.005239933811362, + "grad_norm": 0.00037946683005429804, + "learning_rate": 2.0616139868845937e-05, + "loss": 0.0002, + "num_input_tokens_seen": 13268824, + "step": 21775 + }, + { + "epoch": 6.006618863761721, + "grad_norm": 0.0011382298544049263, + "learning_rate": 2.0604293396798882e-05, + "loss": 0.0, + "num_input_tokens_seen": 13271736, + "step": 21780 + }, + { + "epoch": 6.00799779371208, + "grad_norm": 0.0010105964029207826, + "learning_rate": 2.059244794317395e-05, + "loss": 0.0, + "num_input_tokens_seen": 13274520, + "step": 21785 + }, + { + "epoch": 6.009376723662438, + "grad_norm": 0.002211674116551876, + "learning_rate": 2.0580603510715547e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13277272, + "step": 21790 + }, + { + "epoch": 6.010755653612796, + "grad_norm": 0.0011985186720266938, + "learning_rate": 2.056876010216788e-05, + "loss": 0.0, + "num_input_tokens_seen": 13279704, + "step": 21795 + }, + { + "epoch": 6.012134583563155, + "grad_norm": 0.0004284258757252246, + "learning_rate": 2.0556917720274893e-05, + "loss": 0.0, + "num_input_tokens_seen": 13282296, + "step": 21800 + }, + { + "epoch": 6.013513513513513, + "grad_norm": 0.004105695988982916, + "learning_rate": 2.0545076367780285e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13285368, + "step": 21805 + }, + { + "epoch": 6.014892443463872, + "grad_norm": 0.0007183481939136982, + "learning_rate": 2.0533236047427547e-05, + "loss": 0.0, + "num_input_tokens_seen": 13288280, + "step": 21810 + }, + { + "epoch": 6.016271373414231, + "grad_norm": 0.0004856416198890656, + "learning_rate": 2.0521396761959894e-05, + "loss": 0.0, + "num_input_tokens_seen": 13290392, + "step": 21815 + }, + { + "epoch": 6.017650303364589, + "grad_norm": 0.008980890735983849, + "learning_rate": 2.0509558514120314e-05, + "loss": 0.0, + "num_input_tokens_seen": 13293752, + "step": 21820 + }, + { + "epoch": 6.0190292333149475, + "grad_norm": 0.0027380799874663353, + "learning_rate": 2.0497721306651574e-05, + "loss": 0.0, + "num_input_tokens_seen": 13296824, + "step": 21825 + }, + { + "epoch": 6.020408163265306, + "grad_norm": 0.0025446112267673016, + "learning_rate": 2.048588514229618e-05, + "loss": 0.0, + "num_input_tokens_seen": 13300440, + "step": 21830 + }, + { + "epoch": 6.021787093215664, + "grad_norm": 0.0005323171499185264, + "learning_rate": 2.0474050023796402e-05, + "loss": 0.0, + "num_input_tokens_seen": 13303384, + "step": 21835 + }, + { + "epoch": 6.023166023166024, + "grad_norm": 3.164766894769855e-05, + "learning_rate": 2.046221595389427e-05, + "loss": 0.0, + "num_input_tokens_seen": 13305848, + "step": 21840 + }, + { + "epoch": 6.024544953116382, + "grad_norm": 4.112311580684036e-05, + "learning_rate": 2.0450382935331555e-05, + "loss": 0.0, + "num_input_tokens_seen": 13308408, + "step": 21845 + }, + { + "epoch": 6.02592388306674, + "grad_norm": 0.0001685336756054312, + "learning_rate": 2.043855097084982e-05, + "loss": 0.0, + "num_input_tokens_seen": 13310936, + "step": 21850 + }, + { + "epoch": 6.027302813017099, + "grad_norm": 0.0038101342506706715, + "learning_rate": 2.0426720063190335e-05, + "loss": 0.0, + "num_input_tokens_seen": 13313496, + "step": 21855 + }, + { + "epoch": 6.028681742967457, + "grad_norm": 0.001829806948080659, + "learning_rate": 2.041489021509417e-05, + "loss": 0.0, + "num_input_tokens_seen": 13316664, + "step": 21860 + }, + { + "epoch": 6.0300606729178154, + "grad_norm": 0.00045198368025012314, + "learning_rate": 2.0403061429302127e-05, + "loss": 0.0, + "num_input_tokens_seen": 13319800, + "step": 21865 + }, + { + "epoch": 6.031439602868175, + "grad_norm": 0.0001879269548226148, + "learning_rate": 2.0391233708554765e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13323768, + "step": 21870 + }, + { + "epoch": 6.032818532818533, + "grad_norm": 0.0003031042870134115, + "learning_rate": 2.037940705559239e-05, + "loss": 0.0, + "num_input_tokens_seen": 13327352, + "step": 21875 + }, + { + "epoch": 6.0341974627688915, + "grad_norm": 9.583349310560152e-05, + "learning_rate": 2.0367581473155084e-05, + "loss": 0.0, + "num_input_tokens_seen": 13330744, + "step": 21880 + }, + { + "epoch": 6.03557639271925, + "grad_norm": 5.8867139159701765e-05, + "learning_rate": 2.0355756963982643e-05, + "loss": 0.0, + "num_input_tokens_seen": 13335224, + "step": 21885 + }, + { + "epoch": 6.036955322669608, + "grad_norm": 0.00032692094100639224, + "learning_rate": 2.0343933530814657e-05, + "loss": 0.0, + "num_input_tokens_seen": 13337752, + "step": 21890 + }, + { + "epoch": 6.038334252619967, + "grad_norm": 0.0003408103948459029, + "learning_rate": 2.0332111176390435e-05, + "loss": 0.0, + "num_input_tokens_seen": 13340440, + "step": 21895 + }, + { + "epoch": 6.039713182570326, + "grad_norm": 3.03657325275708e-05, + "learning_rate": 2.032028990344904e-05, + "loss": 0.0, + "num_input_tokens_seen": 13343384, + "step": 21900 + }, + { + "epoch": 6.041092112520684, + "grad_norm": 0.0021202529314905405, + "learning_rate": 2.0308469714729314e-05, + "loss": 0.0, + "num_input_tokens_seen": 13346488, + "step": 21905 + }, + { + "epoch": 6.042471042471043, + "grad_norm": 3.946598371840082e-05, + "learning_rate": 2.0296650612969807e-05, + "loss": 0.0, + "num_input_tokens_seen": 13349464, + "step": 21910 + }, + { + "epoch": 6.043849972421401, + "grad_norm": 0.0005501738633029163, + "learning_rate": 2.0284832600908836e-05, + "loss": 0.0, + "num_input_tokens_seen": 13353240, + "step": 21915 + }, + { + "epoch": 6.045228902371759, + "grad_norm": 0.0034587588161230087, + "learning_rate": 2.027301568128447e-05, + "loss": 0.0, + "num_input_tokens_seen": 13356024, + "step": 21920 + }, + { + "epoch": 6.046607832322118, + "grad_norm": 0.00020292722911108285, + "learning_rate": 2.0261199856834513e-05, + "loss": 0.0, + "num_input_tokens_seen": 13358808, + "step": 21925 + }, + { + "epoch": 6.047986762272476, + "grad_norm": 8.910865290090442e-05, + "learning_rate": 2.0249385130296532e-05, + "loss": 0.0, + "num_input_tokens_seen": 13362360, + "step": 21930 + }, + { + "epoch": 6.049365692222835, + "grad_norm": 3.299365926068276e-05, + "learning_rate": 2.023757150440782e-05, + "loss": 0.0, + "num_input_tokens_seen": 13366520, + "step": 21935 + }, + { + "epoch": 6.050744622173194, + "grad_norm": 0.00040184156387113035, + "learning_rate": 2.0225758981905428e-05, + "loss": 0.0, + "num_input_tokens_seen": 13370648, + "step": 21940 + }, + { + "epoch": 6.052123552123552, + "grad_norm": 0.0006328661693260074, + "learning_rate": 2.0213947565526153e-05, + "loss": 0.0, + "num_input_tokens_seen": 13372888, + "step": 21945 + }, + { + "epoch": 6.0535024820739105, + "grad_norm": 0.0002834261395037174, + "learning_rate": 2.020213725800653e-05, + "loss": 0.0, + "num_input_tokens_seen": 13376056, + "step": 21950 + }, + { + "epoch": 6.054881412024269, + "grad_norm": 6.442457379307598e-05, + "learning_rate": 2.0190328062082818e-05, + "loss": 0.0, + "num_input_tokens_seen": 13379064, + "step": 21955 + }, + { + "epoch": 6.056260341974627, + "grad_norm": 0.2046235203742981, + "learning_rate": 2.017851998049107e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13381752, + "step": 21960 + }, + { + "epoch": 6.0576392719249865, + "grad_norm": 0.00023382806102745235, + "learning_rate": 2.0166713015967025e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13383992, + "step": 21965 + }, + { + "epoch": 6.059018201875345, + "grad_norm": 0.0002630798553582281, + "learning_rate": 2.0154907171246194e-05, + "loss": 0.0, + "num_input_tokens_seen": 13386456, + "step": 21970 + }, + { + "epoch": 6.060397131825703, + "grad_norm": 2.3235495973494835e-05, + "learning_rate": 2.0143102449063827e-05, + "loss": 0.0, + "num_input_tokens_seen": 13389784, + "step": 21975 + }, + { + "epoch": 6.061776061776062, + "grad_norm": 0.002563583431765437, + "learning_rate": 2.0131298852154897e-05, + "loss": 0.0, + "num_input_tokens_seen": 13393656, + "step": 21980 + }, + { + "epoch": 6.06315499172642, + "grad_norm": 6.113142262620386e-06, + "learning_rate": 2.0119496383254147e-05, + "loss": 0.0, + "num_input_tokens_seen": 13396504, + "step": 21985 + }, + { + "epoch": 6.064533921676778, + "grad_norm": 2.4396043954766355e-05, + "learning_rate": 2.0107695045096028e-05, + "loss": 0.0, + "num_input_tokens_seen": 13399896, + "step": 21990 + }, + { + "epoch": 6.065912851627138, + "grad_norm": 0.0002115839160978794, + "learning_rate": 2.0095894840414732e-05, + "loss": 0.0, + "num_input_tokens_seen": 13402744, + "step": 21995 + }, + { + "epoch": 6.067291781577496, + "grad_norm": 6.645983376074582e-05, + "learning_rate": 2.0084095771944204e-05, + "loss": 0.0, + "num_input_tokens_seen": 13405912, + "step": 22000 + }, + { + "epoch": 6.068670711527854, + "grad_norm": 5.6585697166156024e-05, + "learning_rate": 2.0072297842418135e-05, + "loss": 0.0, + "num_input_tokens_seen": 13408952, + "step": 22005 + }, + { + "epoch": 6.070049641478213, + "grad_norm": 4.21351105615031e-05, + "learning_rate": 2.0060501054569906e-05, + "loss": 0.0, + "num_input_tokens_seen": 13411160, + "step": 22010 + }, + { + "epoch": 6.071428571428571, + "grad_norm": 0.0007621402037329972, + "learning_rate": 2.004870541113269e-05, + "loss": 0.0, + "num_input_tokens_seen": 13414008, + "step": 22015 + }, + { + "epoch": 6.0728075013789296, + "grad_norm": 4.4949789298698306e-05, + "learning_rate": 2.0036910914839346e-05, + "loss": 0.0, + "num_input_tokens_seen": 13418520, + "step": 22020 + }, + { + "epoch": 6.074186431329289, + "grad_norm": 3.838062548311427e-05, + "learning_rate": 2.0025117568422512e-05, + "loss": 0.0, + "num_input_tokens_seen": 13422104, + "step": 22025 + }, + { + "epoch": 6.075565361279647, + "grad_norm": 2.1065558030386455e-05, + "learning_rate": 2.001332537461453e-05, + "loss": 0.0, + "num_input_tokens_seen": 13425272, + "step": 22030 + }, + { + "epoch": 6.076944291230006, + "grad_norm": 0.0005129347555339336, + "learning_rate": 2.0001534336147462e-05, + "loss": 0.0, + "num_input_tokens_seen": 13429208, + "step": 22035 + }, + { + "epoch": 6.078323221180364, + "grad_norm": 0.0003167448448948562, + "learning_rate": 1.9989744455753145e-05, + "loss": 0.0, + "num_input_tokens_seen": 13432152, + "step": 22040 + }, + { + "epoch": 6.079702151130722, + "grad_norm": 0.0009611842106096447, + "learning_rate": 1.9977955736163122e-05, + "loss": 0.0, + "num_input_tokens_seen": 13434392, + "step": 22045 + }, + { + "epoch": 6.081081081081081, + "grad_norm": 2.3875301849329844e-05, + "learning_rate": 1.996616818010866e-05, + "loss": 0.0, + "num_input_tokens_seen": 13437400, + "step": 22050 + }, + { + "epoch": 6.08246001103144, + "grad_norm": 0.00023064151173457503, + "learning_rate": 1.995438179032078e-05, + "loss": 0.0, + "num_input_tokens_seen": 13440216, + "step": 22055 + }, + { + "epoch": 6.083838940981798, + "grad_norm": 0.00032188245677389205, + "learning_rate": 1.9942596569530216e-05, + "loss": 0.0, + "num_input_tokens_seen": 13442776, + "step": 22060 + }, + { + "epoch": 6.085217870932157, + "grad_norm": 5.201676322030835e-05, + "learning_rate": 1.9930812520467416e-05, + "loss": 0.0, + "num_input_tokens_seen": 13445496, + "step": 22065 + }, + { + "epoch": 6.086596800882515, + "grad_norm": 0.00035255777765996754, + "learning_rate": 1.9919029645862598e-05, + "loss": 0.0, + "num_input_tokens_seen": 13448024, + "step": 22070 + }, + { + "epoch": 6.0879757308328735, + "grad_norm": 1.7537171515868977e-05, + "learning_rate": 1.9907247948445674e-05, + "loss": 0.0, + "num_input_tokens_seen": 13452984, + "step": 22075 + }, + { + "epoch": 6.089354660783232, + "grad_norm": 0.0001827690430218354, + "learning_rate": 1.9895467430946297e-05, + "loss": 0.0, + "num_input_tokens_seen": 13456600, + "step": 22080 + }, + { + "epoch": 6.090733590733591, + "grad_norm": 0.0007577783544547856, + "learning_rate": 1.988368809609385e-05, + "loss": 0.0, + "num_input_tokens_seen": 13460472, + "step": 22085 + }, + { + "epoch": 6.0921125206839495, + "grad_norm": 2.7378697268432006e-05, + "learning_rate": 1.9871909946617418e-05, + "loss": 0.0, + "num_input_tokens_seen": 13463736, + "step": 22090 + }, + { + "epoch": 6.093491450634308, + "grad_norm": 8.443711521977093e-06, + "learning_rate": 1.9860132985245855e-05, + "loss": 0.0, + "num_input_tokens_seen": 13466488, + "step": 22095 + }, + { + "epoch": 6.094870380584666, + "grad_norm": 5.88924485782627e-05, + "learning_rate": 1.984835721470769e-05, + "loss": 0.0, + "num_input_tokens_seen": 13468824, + "step": 22100 + }, + { + "epoch": 6.096249310535025, + "grad_norm": 2.5458060918026604e-05, + "learning_rate": 1.983658263773121e-05, + "loss": 0.0, + "num_input_tokens_seen": 13471608, + "step": 22105 + }, + { + "epoch": 6.097628240485383, + "grad_norm": 0.00017846256378106773, + "learning_rate": 1.9824809257044415e-05, + "loss": 0.0, + "num_input_tokens_seen": 13474776, + "step": 22110 + }, + { + "epoch": 6.099007170435742, + "grad_norm": 3.3468160836491734e-05, + "learning_rate": 1.9813037075375025e-05, + "loss": 0.0, + "num_input_tokens_seen": 13477432, + "step": 22115 + }, + { + "epoch": 6.100386100386101, + "grad_norm": 0.00011780412751249969, + "learning_rate": 1.98012660954505e-05, + "loss": 0.0, + "num_input_tokens_seen": 13480344, + "step": 22120 + }, + { + "epoch": 6.101765030336459, + "grad_norm": 2.654222771525383e-05, + "learning_rate": 1.9789496319997992e-05, + "loss": 0.0, + "num_input_tokens_seen": 13483096, + "step": 22125 + }, + { + "epoch": 6.103143960286817, + "grad_norm": 2.0218492863932624e-05, + "learning_rate": 1.9777727751744386e-05, + "loss": 0.0, + "num_input_tokens_seen": 13485752, + "step": 22130 + }, + { + "epoch": 6.104522890237176, + "grad_norm": 0.00023205741308629513, + "learning_rate": 1.9765960393416305e-05, + "loss": 0.0, + "num_input_tokens_seen": 13488248, + "step": 22135 + }, + { + "epoch": 6.105901820187534, + "grad_norm": 3.4710341424215585e-05, + "learning_rate": 1.9754194247740066e-05, + "loss": 0.0, + "num_input_tokens_seen": 13491128, + "step": 22140 + }, + { + "epoch": 6.107280750137893, + "grad_norm": 0.0015330564929172397, + "learning_rate": 1.974242931744171e-05, + "loss": 0.0, + "num_input_tokens_seen": 13494072, + "step": 22145 + }, + { + "epoch": 6.108659680088252, + "grad_norm": 0.00018620913033373654, + "learning_rate": 1.9730665605247028e-05, + "loss": 0.0, + "num_input_tokens_seen": 13496760, + "step": 22150 + }, + { + "epoch": 6.11003861003861, + "grad_norm": 0.00921031553298235, + "learning_rate": 1.9718903113881483e-05, + "loss": 0.0, + "num_input_tokens_seen": 13501752, + "step": 22155 + }, + { + "epoch": 6.1114175399889685, + "grad_norm": 1.5168412573984824e-05, + "learning_rate": 1.970714184607027e-05, + "loss": 0.0, + "num_input_tokens_seen": 13504280, + "step": 22160 + }, + { + "epoch": 6.112796469939327, + "grad_norm": 0.0003932148974854499, + "learning_rate": 1.9695381804538327e-05, + "loss": 0.0, + "num_input_tokens_seen": 13507768, + "step": 22165 + }, + { + "epoch": 6.114175399889685, + "grad_norm": 0.00011862949759233743, + "learning_rate": 1.9683622992010265e-05, + "loss": 0.0, + "num_input_tokens_seen": 13510424, + "step": 22170 + }, + { + "epoch": 6.1155543298400445, + "grad_norm": 2.973746450152248e-05, + "learning_rate": 1.9671865411210445e-05, + "loss": 0.0, + "num_input_tokens_seen": 13513592, + "step": 22175 + }, + { + "epoch": 6.116933259790403, + "grad_norm": 5.2250325097702444e-05, + "learning_rate": 1.9660109064862933e-05, + "loss": 0.0, + "num_input_tokens_seen": 13516120, + "step": 22180 + }, + { + "epoch": 6.118312189740761, + "grad_norm": 0.0007716939435340464, + "learning_rate": 1.964835395569149e-05, + "loss": 0.0, + "num_input_tokens_seen": 13519064, + "step": 22185 + }, + { + "epoch": 6.11969111969112, + "grad_norm": 3.526596265146509e-05, + "learning_rate": 1.9636600086419625e-05, + "loss": 0.0, + "num_input_tokens_seen": 13522552, + "step": 22190 + }, + { + "epoch": 6.121070049641478, + "grad_norm": 0.00032615134841762483, + "learning_rate": 1.9624847459770535e-05, + "loss": 0.0, + "num_input_tokens_seen": 13525304, + "step": 22195 + }, + { + "epoch": 6.122448979591836, + "grad_norm": 0.002883519744500518, + "learning_rate": 1.9613096078467116e-05, + "loss": 0.0, + "num_input_tokens_seen": 13527960, + "step": 22200 + }, + { + "epoch": 6.123827909542196, + "grad_norm": 0.0006013179663568735, + "learning_rate": 1.9601345945232023e-05, + "loss": 0.0, + "num_input_tokens_seen": 13531672, + "step": 22205 + }, + { + "epoch": 6.125206839492554, + "grad_norm": 2.1368436136981472e-05, + "learning_rate": 1.958959706278757e-05, + "loss": 0.0, + "num_input_tokens_seen": 13534456, + "step": 22210 + }, + { + "epoch": 6.126585769442912, + "grad_norm": 1.4797617950534914e-05, + "learning_rate": 1.9577849433855817e-05, + "loss": 0.0, + "num_input_tokens_seen": 13536952, + "step": 22215 + }, + { + "epoch": 6.127964699393271, + "grad_norm": 0.0001357279543299228, + "learning_rate": 1.9566103061158524e-05, + "loss": 0.0, + "num_input_tokens_seen": 13539960, + "step": 22220 + }, + { + "epoch": 6.129343629343629, + "grad_norm": 3.0221986889955588e-05, + "learning_rate": 1.9554357947417142e-05, + "loss": 0.0, + "num_input_tokens_seen": 13542936, + "step": 22225 + }, + { + "epoch": 6.130722559293988, + "grad_norm": 8.506149606546387e-05, + "learning_rate": 1.9542614095352864e-05, + "loss": 0.0, + "num_input_tokens_seen": 13545432, + "step": 22230 + }, + { + "epoch": 6.132101489244347, + "grad_norm": 2.7244457669439726e-05, + "learning_rate": 1.9530871507686565e-05, + "loss": 0.0, + "num_input_tokens_seen": 13548600, + "step": 22235 + }, + { + "epoch": 6.133480419194705, + "grad_norm": 9.93042704067193e-05, + "learning_rate": 1.9519130187138817e-05, + "loss": 0.0, + "num_input_tokens_seen": 13552984, + "step": 22240 + }, + { + "epoch": 6.134859349145064, + "grad_norm": 3.232083145121578e-06, + "learning_rate": 1.950739013642994e-05, + "loss": 0.0, + "num_input_tokens_seen": 13555352, + "step": 22245 + }, + { + "epoch": 6.136238279095422, + "grad_norm": 0.00015868853370193392, + "learning_rate": 1.9495651358279927e-05, + "loss": 0.0, + "num_input_tokens_seen": 13557592, + "step": 22250 + }, + { + "epoch": 6.13761720904578, + "grad_norm": 0.00016546026745345443, + "learning_rate": 1.9483913855408474e-05, + "loss": 0.0, + "num_input_tokens_seen": 13560184, + "step": 22255 + }, + { + "epoch": 6.138996138996139, + "grad_norm": 0.0005183650064282119, + "learning_rate": 1.947217763053501e-05, + "loss": 0.0, + "num_input_tokens_seen": 13562328, + "step": 22260 + }, + { + "epoch": 6.140375068946497, + "grad_norm": 3.70879060938023e-05, + "learning_rate": 1.946044268637863e-05, + "loss": 0.0, + "num_input_tokens_seen": 13566648, + "step": 22265 + }, + { + "epoch": 6.141753998896856, + "grad_norm": 0.0003251040179748088, + "learning_rate": 1.944870902565817e-05, + "loss": 0.0, + "num_input_tokens_seen": 13569432, + "step": 22270 + }, + { + "epoch": 6.143132928847215, + "grad_norm": 5.8786959016288165e-06, + "learning_rate": 1.9436976651092144e-05, + "loss": 0.0, + "num_input_tokens_seen": 13572248, + "step": 22275 + }, + { + "epoch": 6.144511858797573, + "grad_norm": 6.618161569349468e-05, + "learning_rate": 1.942524556539877e-05, + "loss": 0.0, + "num_input_tokens_seen": 13574680, + "step": 22280 + }, + { + "epoch": 6.1458907887479315, + "grad_norm": 0.00024323804245796055, + "learning_rate": 1.9413515771295975e-05, + "loss": 0.0, + "num_input_tokens_seen": 13578104, + "step": 22285 + }, + { + "epoch": 6.14726971869829, + "grad_norm": 0.0002880444226320833, + "learning_rate": 1.9401787271501394e-05, + "loss": 0.0, + "num_input_tokens_seen": 13581080, + "step": 22290 + }, + { + "epoch": 6.148648648648648, + "grad_norm": 1.055249958881177e-05, + "learning_rate": 1.939006006873233e-05, + "loss": 0.0, + "num_input_tokens_seen": 13584920, + "step": 22295 + }, + { + "epoch": 6.1500275785990075, + "grad_norm": 2.2745427486370318e-05, + "learning_rate": 1.937833416570583e-05, + "loss": 0.0, + "num_input_tokens_seen": 13587224, + "step": 22300 + }, + { + "epoch": 6.151406508549366, + "grad_norm": 6.013268375681946e-06, + "learning_rate": 1.9366609565138603e-05, + "loss": 0.0, + "num_input_tokens_seen": 13590264, + "step": 22305 + }, + { + "epoch": 6.152785438499724, + "grad_norm": 9.473596583120525e-05, + "learning_rate": 1.935488626974708e-05, + "loss": 0.0, + "num_input_tokens_seen": 13593112, + "step": 22310 + }, + { + "epoch": 6.154164368450083, + "grad_norm": 7.107094279490411e-05, + "learning_rate": 1.9343164282247373e-05, + "loss": 0.0, + "num_input_tokens_seen": 13596184, + "step": 22315 + }, + { + "epoch": 6.155543298400441, + "grad_norm": 4.783503391081467e-05, + "learning_rate": 1.9331443605355295e-05, + "loss": 0.0, + "num_input_tokens_seen": 13599704, + "step": 22320 + }, + { + "epoch": 6.156922228350799, + "grad_norm": 0.00017963019490707666, + "learning_rate": 1.9319724241786378e-05, + "loss": 0.0, + "num_input_tokens_seen": 13603320, + "step": 22325 + }, + { + "epoch": 6.158301158301159, + "grad_norm": 3.1482992199016735e-05, + "learning_rate": 1.9308006194255814e-05, + "loss": 0.0, + "num_input_tokens_seen": 13605912, + "step": 22330 + }, + { + "epoch": 6.159680088251517, + "grad_norm": 4.965676907886518e-06, + "learning_rate": 1.92962894654785e-05, + "loss": 0.0, + "num_input_tokens_seen": 13609048, + "step": 22335 + }, + { + "epoch": 6.161059018201875, + "grad_norm": 6.64437684463337e-05, + "learning_rate": 1.928457405816906e-05, + "loss": 0.0, + "num_input_tokens_seen": 13612024, + "step": 22340 + }, + { + "epoch": 6.162437948152234, + "grad_norm": 0.00041553209302946925, + "learning_rate": 1.9272859975041754e-05, + "loss": 0.0, + "num_input_tokens_seen": 13614744, + "step": 22345 + }, + { + "epoch": 6.163816878102592, + "grad_norm": 2.1949412257527e-05, + "learning_rate": 1.9261147218810582e-05, + "loss": 0.0, + "num_input_tokens_seen": 13617496, + "step": 22350 + }, + { + "epoch": 6.1651958080529505, + "grad_norm": 0.0002435647475067526, + "learning_rate": 1.924943579218923e-05, + "loss": 0.0, + "num_input_tokens_seen": 13619832, + "step": 22355 + }, + { + "epoch": 6.16657473800331, + "grad_norm": 1.3979367395222653e-05, + "learning_rate": 1.923772569789105e-05, + "loss": 0.0, + "num_input_tokens_seen": 13621944, + "step": 22360 + }, + { + "epoch": 6.167953667953668, + "grad_norm": 1.5997031368897296e-05, + "learning_rate": 1.922601693862912e-05, + "loss": 0.0, + "num_input_tokens_seen": 13626360, + "step": 22365 + }, + { + "epoch": 6.1693325979040265, + "grad_norm": 2.1416113668237813e-05, + "learning_rate": 1.9214309517116187e-05, + "loss": 0.0, + "num_input_tokens_seen": 13629528, + "step": 22370 + }, + { + "epoch": 6.170711527854385, + "grad_norm": 2.1606763766612858e-05, + "learning_rate": 1.9202603436064677e-05, + "loss": 0.0, + "num_input_tokens_seen": 13632664, + "step": 22375 + }, + { + "epoch": 6.172090457804743, + "grad_norm": 2.6636180336936377e-05, + "learning_rate": 1.919089869818674e-05, + "loss": 0.0, + "num_input_tokens_seen": 13635608, + "step": 22380 + }, + { + "epoch": 6.173469387755102, + "grad_norm": 1.2327773220022209e-05, + "learning_rate": 1.9179195306194188e-05, + "loss": 0.0, + "num_input_tokens_seen": 13640120, + "step": 22385 + }, + { + "epoch": 6.174848317705461, + "grad_norm": 4.955566328135319e-05, + "learning_rate": 1.9167493262798518e-05, + "loss": 0.0, + "num_input_tokens_seen": 13643480, + "step": 22390 + }, + { + "epoch": 6.176227247655819, + "grad_norm": 0.00033764532417990267, + "learning_rate": 1.915579257071095e-05, + "loss": 0.0, + "num_input_tokens_seen": 13646232, + "step": 22395 + }, + { + "epoch": 6.177606177606178, + "grad_norm": 0.0005456550279632211, + "learning_rate": 1.9144093232642355e-05, + "loss": 0.0, + "num_input_tokens_seen": 13650360, + "step": 22400 + }, + { + "epoch": 6.178985107556536, + "grad_norm": 1.046834859153023e-05, + "learning_rate": 1.9132395251303288e-05, + "loss": 0.0, + "num_input_tokens_seen": 13653016, + "step": 22405 + }, + { + "epoch": 6.1803640375068944, + "grad_norm": 0.0014121966669335961, + "learning_rate": 1.9120698629404026e-05, + "loss": 0.0, + "num_input_tokens_seen": 13655704, + "step": 22410 + }, + { + "epoch": 6.181742967457253, + "grad_norm": 0.0003417621774133295, + "learning_rate": 1.9109003369654486e-05, + "loss": 0.0, + "num_input_tokens_seen": 13658296, + "step": 22415 + }, + { + "epoch": 6.183121897407612, + "grad_norm": 0.0010052823927253485, + "learning_rate": 1.909730947476431e-05, + "loss": 0.0, + "num_input_tokens_seen": 13660824, + "step": 22420 + }, + { + "epoch": 6.1845008273579705, + "grad_norm": 0.00021095415286254138, + "learning_rate": 1.9085616947442804e-05, + "loss": 0.0, + "num_input_tokens_seen": 13663896, + "step": 22425 + }, + { + "epoch": 6.185879757308329, + "grad_norm": 2.5363615350215696e-05, + "learning_rate": 1.907392579039894e-05, + "loss": 0.0, + "num_input_tokens_seen": 13667000, + "step": 22430 + }, + { + "epoch": 6.187258687258687, + "grad_norm": 0.0022306551691144705, + "learning_rate": 1.906223600634142e-05, + "loss": 0.0, + "num_input_tokens_seen": 13669816, + "step": 22435 + }, + { + "epoch": 6.188637617209046, + "grad_norm": 0.002439978066831827, + "learning_rate": 1.905054759797858e-05, + "loss": 0.0, + "num_input_tokens_seen": 13672536, + "step": 22440 + }, + { + "epoch": 6.190016547159404, + "grad_norm": 2.8983129595872015e-05, + "learning_rate": 1.9038860568018457e-05, + "loss": 0.0, + "num_input_tokens_seen": 13675544, + "step": 22445 + }, + { + "epoch": 6.191395477109763, + "grad_norm": 8.942381100496277e-05, + "learning_rate": 1.9027174919168772e-05, + "loss": 0.0, + "num_input_tokens_seen": 13677784, + "step": 22450 + }, + { + "epoch": 6.192774407060122, + "grad_norm": 1.1597403499763459e-05, + "learning_rate": 1.901549065413692e-05, + "loss": 0.0, + "num_input_tokens_seen": 13680056, + "step": 22455 + }, + { + "epoch": 6.19415333701048, + "grad_norm": 1.2522148608695716e-05, + "learning_rate": 1.9003807775629982e-05, + "loss": 0.0, + "num_input_tokens_seen": 13684248, + "step": 22460 + }, + { + "epoch": 6.195532266960838, + "grad_norm": 6.752394710929366e-06, + "learning_rate": 1.8992126286354716e-05, + "loss": 0.0, + "num_input_tokens_seen": 13686872, + "step": 22465 + }, + { + "epoch": 6.196911196911197, + "grad_norm": 0.00010459157056175172, + "learning_rate": 1.8980446189017538e-05, + "loss": 0.0, + "num_input_tokens_seen": 13689816, + "step": 22470 + }, + { + "epoch": 6.198290126861555, + "grad_norm": 0.00012698651698883623, + "learning_rate": 1.8968767486324583e-05, + "loss": 0.0, + "num_input_tokens_seen": 13692344, + "step": 22475 + }, + { + "epoch": 6.199669056811914, + "grad_norm": 1.1207788702449761e-05, + "learning_rate": 1.8957090180981628e-05, + "loss": 0.0, + "num_input_tokens_seen": 13694840, + "step": 22480 + }, + { + "epoch": 6.201047986762273, + "grad_norm": 5.128169505042024e-05, + "learning_rate": 1.894541427569412e-05, + "loss": 0.0, + "num_input_tokens_seen": 13698200, + "step": 22485 + }, + { + "epoch": 6.202426916712631, + "grad_norm": 0.003954608924686909, + "learning_rate": 1.8933739773167224e-05, + "loss": 0.0, + "num_input_tokens_seen": 13701048, + "step": 22490 + }, + { + "epoch": 6.2038058466629895, + "grad_norm": 0.00037394603714346886, + "learning_rate": 1.8922066676105747e-05, + "loss": 0.0, + "num_input_tokens_seen": 13704408, + "step": 22495 + }, + { + "epoch": 6.205184776613348, + "grad_norm": 2.3258415239979513e-05, + "learning_rate": 1.8910394987214167e-05, + "loss": 0.0, + "num_input_tokens_seen": 13707032, + "step": 22500 + }, + { + "epoch": 6.206563706563706, + "grad_norm": 1.1562072359083686e-05, + "learning_rate": 1.8898724709196666e-05, + "loss": 0.0, + "num_input_tokens_seen": 13711544, + "step": 22505 + }, + { + "epoch": 6.2079426365140655, + "grad_norm": 5.083380892756395e-05, + "learning_rate": 1.888705584475706e-05, + "loss": 0.0, + "num_input_tokens_seen": 13713624, + "step": 22510 + }, + { + "epoch": 6.209321566464424, + "grad_norm": 1.0156079952139407e-05, + "learning_rate": 1.8875388396598874e-05, + "loss": 0.0, + "num_input_tokens_seen": 13716760, + "step": 22515 + }, + { + "epoch": 6.210700496414782, + "grad_norm": 0.002975305775180459, + "learning_rate": 1.886372236742528e-05, + "loss": 0.0, + "num_input_tokens_seen": 13720088, + "step": 22520 + }, + { + "epoch": 6.212079426365141, + "grad_norm": 7.738218300801236e-06, + "learning_rate": 1.885205775993913e-05, + "loss": 0.0, + "num_input_tokens_seen": 13722968, + "step": 22525 + }, + { + "epoch": 6.213458356315499, + "grad_norm": 1.889626400952693e-05, + "learning_rate": 1.8840394576842946e-05, + "loss": 0.0, + "num_input_tokens_seen": 13725560, + "step": 22530 + }, + { + "epoch": 6.214837286265857, + "grad_norm": 4.061789877596311e-05, + "learning_rate": 1.882873282083893e-05, + "loss": 0.0, + "num_input_tokens_seen": 13728920, + "step": 22535 + }, + { + "epoch": 6.216216216216216, + "grad_norm": 0.0008557065157219768, + "learning_rate": 1.8817072494628918e-05, + "loss": 0.0, + "num_input_tokens_seen": 13731672, + "step": 22540 + }, + { + "epoch": 6.217595146166575, + "grad_norm": 3.2938638469204307e-05, + "learning_rate": 1.880541360091447e-05, + "loss": 0.0, + "num_input_tokens_seen": 13733912, + "step": 22545 + }, + { + "epoch": 6.218974076116933, + "grad_norm": 4.6472494432237e-05, + "learning_rate": 1.879375614239676e-05, + "loss": 0.0, + "num_input_tokens_seen": 13736920, + "step": 22550 + }, + { + "epoch": 6.220353006067292, + "grad_norm": 6.749704880348872e-06, + "learning_rate": 1.8782100121776674e-05, + "loss": 0.0, + "num_input_tokens_seen": 13739992, + "step": 22555 + }, + { + "epoch": 6.22173193601765, + "grad_norm": 1.9104991224594414e-05, + "learning_rate": 1.8770445541754733e-05, + "loss": 0.0, + "num_input_tokens_seen": 13742328, + "step": 22560 + }, + { + "epoch": 6.2231108659680086, + "grad_norm": 1.2120490282541141e-05, + "learning_rate": 1.875879240503113e-05, + "loss": 0.0, + "num_input_tokens_seen": 13744952, + "step": 22565 + }, + { + "epoch": 6.224489795918367, + "grad_norm": 4.169913154328242e-05, + "learning_rate": 1.874714071430575e-05, + "loss": 0.0, + "num_input_tokens_seen": 13747672, + "step": 22570 + }, + { + "epoch": 6.225868725868726, + "grad_norm": 9.112348925555125e-05, + "learning_rate": 1.873549047227811e-05, + "loss": 0.0, + "num_input_tokens_seen": 13751192, + "step": 22575 + }, + { + "epoch": 6.227247655819085, + "grad_norm": 9.898127609631047e-05, + "learning_rate": 1.8723841681647397e-05, + "loss": 0.0, + "num_input_tokens_seen": 13754680, + "step": 22580 + }, + { + "epoch": 6.228626585769443, + "grad_norm": 1.8835717128240503e-05, + "learning_rate": 1.8712194345112482e-05, + "loss": 0.0, + "num_input_tokens_seen": 13757656, + "step": 22585 + }, + { + "epoch": 6.230005515719801, + "grad_norm": 0.0019489751430228353, + "learning_rate": 1.8700548465371874e-05, + "loss": 0.0, + "num_input_tokens_seen": 13760024, + "step": 22590 + }, + { + "epoch": 6.23138444567016, + "grad_norm": 2.3354499717243016e-05, + "learning_rate": 1.868890404512376e-05, + "loss": 0.0, + "num_input_tokens_seen": 13762552, + "step": 22595 + }, + { + "epoch": 6.232763375620518, + "grad_norm": 5.4012078180676326e-05, + "learning_rate": 1.8677261087065996e-05, + "loss": 0.098, + "num_input_tokens_seen": 13765144, + "step": 22600 + }, + { + "epoch": 6.234142305570877, + "grad_norm": 1.8654638552106917e-05, + "learning_rate": 1.8665619593896065e-05, + "loss": 0.0, + "num_input_tokens_seen": 13768344, + "step": 22605 + }, + { + "epoch": 6.235521235521236, + "grad_norm": 4.219191396259703e-06, + "learning_rate": 1.865397956831116e-05, + "loss": 0.0, + "num_input_tokens_seen": 13770808, + "step": 22610 + }, + { + "epoch": 6.236900165471594, + "grad_norm": 9.287278953706846e-05, + "learning_rate": 1.8642341013008097e-05, + "loss": 0.0, + "num_input_tokens_seen": 13774616, + "step": 22615 + }, + { + "epoch": 6.2382790954219525, + "grad_norm": 0.000333744625095278, + "learning_rate": 1.8630703930683345e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13777144, + "step": 22620 + }, + { + "epoch": 6.239658025372311, + "grad_norm": 82.66070556640625, + "learning_rate": 1.861906832403308e-05, + "loss": 0.0959, + "num_input_tokens_seen": 13780184, + "step": 22625 + }, + { + "epoch": 6.241036955322669, + "grad_norm": 0.0004745411570183933, + "learning_rate": 1.860743419575309e-05, + "loss": 0.0, + "num_input_tokens_seen": 13784184, + "step": 22630 + }, + { + "epoch": 6.2424158852730285, + "grad_norm": 0.001417644671164453, + "learning_rate": 1.8595801548538823e-05, + "loss": 0.0, + "num_input_tokens_seen": 13786616, + "step": 22635 + }, + { + "epoch": 6.243794815223387, + "grad_norm": 0.00014629792713094503, + "learning_rate": 1.858417038508542e-05, + "loss": 0.0, + "num_input_tokens_seen": 13790584, + "step": 22640 + }, + { + "epoch": 6.245173745173745, + "grad_norm": 1.3889379260945134e-05, + "learning_rate": 1.8572540708087633e-05, + "loss": 0.0, + "num_input_tokens_seen": 13793496, + "step": 22645 + }, + { + "epoch": 6.246552675124104, + "grad_norm": 6.280901288846508e-05, + "learning_rate": 1.8560912520239908e-05, + "loss": 0.0, + "num_input_tokens_seen": 13796696, + "step": 22650 + }, + { + "epoch": 6.247931605074462, + "grad_norm": 7.505457324441522e-05, + "learning_rate": 1.8549285824236327e-05, + "loss": 0.0, + "num_input_tokens_seen": 13799128, + "step": 22655 + }, + { + "epoch": 6.24931053502482, + "grad_norm": 7.324944454012439e-05, + "learning_rate": 1.8537660622770615e-05, + "loss": 0.0, + "num_input_tokens_seen": 13801944, + "step": 22660 + }, + { + "epoch": 6.25068946497518, + "grad_norm": 0.0031466390937566757, + "learning_rate": 1.8526036918536176e-05, + "loss": 0.0, + "num_input_tokens_seen": 13805144, + "step": 22665 + }, + { + "epoch": 6.252068394925538, + "grad_norm": 1.233732564287493e-05, + "learning_rate": 1.8514414714226057e-05, + "loss": 0.0, + "num_input_tokens_seen": 13807864, + "step": 22670 + }, + { + "epoch": 6.253447324875896, + "grad_norm": 1.6517582480446436e-05, + "learning_rate": 1.850279401253295e-05, + "loss": 0.0, + "num_input_tokens_seen": 13810200, + "step": 22675 + }, + { + "epoch": 6.254826254826255, + "grad_norm": 3.2593292417004704e-05, + "learning_rate": 1.849117481614921e-05, + "loss": 0.0, + "num_input_tokens_seen": 13813912, + "step": 22680 + }, + { + "epoch": 6.256205184776613, + "grad_norm": 9.98785108095035e-05, + "learning_rate": 1.8479557127766833e-05, + "loss": 0.0, + "num_input_tokens_seen": 13816536, + "step": 22685 + }, + { + "epoch": 6.2575841147269715, + "grad_norm": 0.0015070138033479452, + "learning_rate": 1.8467940950077466e-05, + "loss": 0.0, + "num_input_tokens_seen": 13819512, + "step": 22690 + }, + { + "epoch": 6.258963044677331, + "grad_norm": 0.0002154147223336622, + "learning_rate": 1.845632628577242e-05, + "loss": 0.0, + "num_input_tokens_seen": 13822648, + "step": 22695 + }, + { + "epoch": 6.260341974627689, + "grad_norm": 1.2585814147314522e-05, + "learning_rate": 1.8444713137542642e-05, + "loss": 0.0071, + "num_input_tokens_seen": 13825880, + "step": 22700 + }, + { + "epoch": 6.2617209045780475, + "grad_norm": 0.0008332717698067427, + "learning_rate": 1.8433101508078735e-05, + "loss": 0.098, + "num_input_tokens_seen": 13830040, + "step": 22705 + }, + { + "epoch": 6.263099834528406, + "grad_norm": 0.00013426985242404044, + "learning_rate": 1.842149140007095e-05, + "loss": 0.0, + "num_input_tokens_seen": 13833464, + "step": 22710 + }, + { + "epoch": 6.264478764478764, + "grad_norm": 0.0004264572053216398, + "learning_rate": 1.8409882816209163e-05, + "loss": 0.0, + "num_input_tokens_seen": 13835800, + "step": 22715 + }, + { + "epoch": 6.265857694429123, + "grad_norm": 3.503203333821148e-05, + "learning_rate": 1.8398275759182947e-05, + "loss": 0.0, + "num_input_tokens_seen": 13838584, + "step": 22720 + }, + { + "epoch": 6.267236624379482, + "grad_norm": 0.001175395678728819, + "learning_rate": 1.8386670231681464e-05, + "loss": 0.0, + "num_input_tokens_seen": 13841464, + "step": 22725 + }, + { + "epoch": 6.26861555432984, + "grad_norm": 0.0009637379553169012, + "learning_rate": 1.8375066236393556e-05, + "loss": 0.0, + "num_input_tokens_seen": 13843928, + "step": 22730 + }, + { + "epoch": 6.269994484280199, + "grad_norm": 0.0004343029286246747, + "learning_rate": 1.8363463776007707e-05, + "loss": 0.0, + "num_input_tokens_seen": 13847864, + "step": 22735 + }, + { + "epoch": 6.271373414230557, + "grad_norm": 0.0010847951052710414, + "learning_rate": 1.835186285321204e-05, + "loss": 0.0, + "num_input_tokens_seen": 13851224, + "step": 22740 + }, + { + "epoch": 6.272752344180915, + "grad_norm": 0.0011750604026019573, + "learning_rate": 1.8340263470694315e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13855768, + "step": 22745 + }, + { + "epoch": 6.274131274131274, + "grad_norm": 1.303429417021107e-05, + "learning_rate": 1.832866563114195e-05, + "loss": 0.0, + "num_input_tokens_seen": 13858392, + "step": 22750 + }, + { + "epoch": 6.275510204081632, + "grad_norm": 0.0002985381579492241, + "learning_rate": 1.831706933724199e-05, + "loss": 0.0, + "num_input_tokens_seen": 13861176, + "step": 22755 + }, + { + "epoch": 6.2768891340319914, + "grad_norm": 2.4253478841274045e-05, + "learning_rate": 1.8305474591681142e-05, + "loss": 0.0, + "num_input_tokens_seen": 13863704, + "step": 22760 + }, + { + "epoch": 6.27826806398235, + "grad_norm": 0.00019095753668807447, + "learning_rate": 1.8293881397145735e-05, + "loss": 0.0, + "num_input_tokens_seen": 13867160, + "step": 22765 + }, + { + "epoch": 6.279646993932708, + "grad_norm": 0.0010164290433749557, + "learning_rate": 1.8282289756321735e-05, + "loss": 0.0, + "num_input_tokens_seen": 13869464, + "step": 22770 + }, + { + "epoch": 6.281025923883067, + "grad_norm": 0.00012249211431480944, + "learning_rate": 1.8270699671894782e-05, + "loss": 0.0, + "num_input_tokens_seen": 13872664, + "step": 22775 + }, + { + "epoch": 6.282404853833425, + "grad_norm": 0.0016310240607708693, + "learning_rate": 1.8259111146550123e-05, + "loss": 0.0, + "num_input_tokens_seen": 13876120, + "step": 22780 + }, + { + "epoch": 6.283783783783784, + "grad_norm": 0.0003852414956782013, + "learning_rate": 1.8247524182972635e-05, + "loss": 0.0, + "num_input_tokens_seen": 13878232, + "step": 22785 + }, + { + "epoch": 6.285162713734143, + "grad_norm": 6.042767927283421e-05, + "learning_rate": 1.8235938783846878e-05, + "loss": 0.0, + "num_input_tokens_seen": 13880504, + "step": 22790 + }, + { + "epoch": 6.286541643684501, + "grad_norm": 1.9181838069926016e-05, + "learning_rate": 1.8224354951857003e-05, + "loss": 0.0, + "num_input_tokens_seen": 13883928, + "step": 22795 + }, + { + "epoch": 6.287920573634859, + "grad_norm": 0.00011769300908781588, + "learning_rate": 1.821277268968683e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13886328, + "step": 22800 + }, + { + "epoch": 6.289299503585218, + "grad_norm": 9.545772627461702e-06, + "learning_rate": 1.8201192000019796e-05, + "loss": 0.0, + "num_input_tokens_seen": 13890296, + "step": 22805 + }, + { + "epoch": 6.290678433535576, + "grad_norm": 0.00173956248909235, + "learning_rate": 1.818961288553898e-05, + "loss": 0.0, + "num_input_tokens_seen": 13892856, + "step": 22810 + }, + { + "epoch": 6.2920573634859345, + "grad_norm": 1.0826936886587646e-05, + "learning_rate": 1.8178035348927105e-05, + "loss": 0.1125, + "num_input_tokens_seen": 13895672, + "step": 22815 + }, + { + "epoch": 6.293436293436294, + "grad_norm": 0.006893922574818134, + "learning_rate": 1.8166459392866516e-05, + "loss": 0.0, + "num_input_tokens_seen": 13897976, + "step": 22820 + }, + { + "epoch": 6.294815223386652, + "grad_norm": 0.00010969779395963997, + "learning_rate": 1.8154885020039184e-05, + "loss": 0.0, + "num_input_tokens_seen": 13900440, + "step": 22825 + }, + { + "epoch": 6.2961941533370105, + "grad_norm": 0.0002664403000380844, + "learning_rate": 1.8143312233126746e-05, + "loss": 0.0, + "num_input_tokens_seen": 13902712, + "step": 22830 + }, + { + "epoch": 6.297573083287369, + "grad_norm": 3.922111864085309e-05, + "learning_rate": 1.8131741034810435e-05, + "loss": 0.0, + "num_input_tokens_seen": 13905944, + "step": 22835 + }, + { + "epoch": 6.298952013237727, + "grad_norm": 0.0006578936008736491, + "learning_rate": 1.812017142777113e-05, + "loss": 0.0, + "num_input_tokens_seen": 13908056, + "step": 22840 + }, + { + "epoch": 6.3003309431880865, + "grad_norm": 0.00022780383005738258, + "learning_rate": 1.8108603414689357e-05, + "loss": 0.0, + "num_input_tokens_seen": 13912920, + "step": 22845 + }, + { + "epoch": 6.301709873138445, + "grad_norm": 2.6180263375863433e-05, + "learning_rate": 1.8097036998245247e-05, + "loss": 0.0, + "num_input_tokens_seen": 13915416, + "step": 22850 + }, + { + "epoch": 6.303088803088803, + "grad_norm": 0.00028860894963145256, + "learning_rate": 1.808547218111858e-05, + "loss": 0.0, + "num_input_tokens_seen": 13918808, + "step": 22855 + }, + { + "epoch": 6.304467733039162, + "grad_norm": 0.002938097110018134, + "learning_rate": 1.807390896598876e-05, + "loss": 0.0001, + "num_input_tokens_seen": 13922040, + "step": 22860 + }, + { + "epoch": 6.30584666298952, + "grad_norm": 0.00016869515820872039, + "learning_rate": 1.80623473555348e-05, + "loss": 0.0, + "num_input_tokens_seen": 13925080, + "step": 22865 + }, + { + "epoch": 6.307225592939878, + "grad_norm": 2.8047430532751605e-05, + "learning_rate": 1.8050787352435374e-05, + "loss": 0.0, + "num_input_tokens_seen": 13928152, + "step": 22870 + }, + { + "epoch": 6.308604522890237, + "grad_norm": 0.0021614523138850927, + "learning_rate": 1.8039228959368776e-05, + "loss": 0.0, + "num_input_tokens_seen": 13930840, + "step": 22875 + }, + { + "epoch": 6.309983452840596, + "grad_norm": 0.0005774041637778282, + "learning_rate": 1.80276721790129e-05, + "loss": 0.0, + "num_input_tokens_seen": 13934392, + "step": 22880 + }, + { + "epoch": 6.311362382790954, + "grad_norm": 0.00045982649317011237, + "learning_rate": 1.8016117014045302e-05, + "loss": 0.0, + "num_input_tokens_seen": 13938872, + "step": 22885 + }, + { + "epoch": 6.312741312741313, + "grad_norm": 0.00032681922311894596, + "learning_rate": 1.800456346714314e-05, + "loss": 0.0, + "num_input_tokens_seen": 13942232, + "step": 22890 + }, + { + "epoch": 6.314120242691671, + "grad_norm": 0.00011855310731334612, + "learning_rate": 1.7993011540983208e-05, + "loss": 0.0, + "num_input_tokens_seen": 13945176, + "step": 22895 + }, + { + "epoch": 6.3154991726420295, + "grad_norm": 0.0002546369796618819, + "learning_rate": 1.798146123824192e-05, + "loss": 0.0, + "num_input_tokens_seen": 13948056, + "step": 22900 + }, + { + "epoch": 6.316878102592388, + "grad_norm": 9.79457672656281e-06, + "learning_rate": 1.7969912561595317e-05, + "loss": 0.0, + "num_input_tokens_seen": 13951672, + "step": 22905 + }, + { + "epoch": 6.318257032542747, + "grad_norm": 0.0001577793445903808, + "learning_rate": 1.795836551371906e-05, + "loss": 0.0, + "num_input_tokens_seen": 13954520, + "step": 22910 + }, + { + "epoch": 6.3196359624931056, + "grad_norm": 4.954118048772216e-05, + "learning_rate": 1.7946820097288443e-05, + "loss": 0.0, + "num_input_tokens_seen": 13958264, + "step": 22915 + }, + { + "epoch": 6.321014892443464, + "grad_norm": 4.2184161429759115e-05, + "learning_rate": 1.793527631497835e-05, + "loss": 0.0, + "num_input_tokens_seen": 13960760, + "step": 22920 + }, + { + "epoch": 6.322393822393822, + "grad_norm": 0.022304853424429893, + "learning_rate": 1.7923734169463336e-05, + "loss": 0.0, + "num_input_tokens_seen": 13963960, + "step": 22925 + }, + { + "epoch": 6.323772752344181, + "grad_norm": 0.00017233753169421107, + "learning_rate": 1.791219366341754e-05, + "loss": 0.0, + "num_input_tokens_seen": 13966968, + "step": 22930 + }, + { + "epoch": 6.325151682294539, + "grad_norm": 4.915365207125433e-05, + "learning_rate": 1.790065479951472e-05, + "loss": 0.0, + "num_input_tokens_seen": 13969432, + "step": 22935 + }, + { + "epoch": 6.326530612244898, + "grad_norm": 0.0018501064041629434, + "learning_rate": 1.788911758042828e-05, + "loss": 0.0, + "num_input_tokens_seen": 13973368, + "step": 22940 + }, + { + "epoch": 6.327909542195257, + "grad_norm": 0.0002575663384050131, + "learning_rate": 1.7877582008831224e-05, + "loss": 0.0, + "num_input_tokens_seen": 13976376, + "step": 22945 + }, + { + "epoch": 6.329288472145615, + "grad_norm": 0.0006070370436646044, + "learning_rate": 1.7866048087396185e-05, + "loss": 0.0, + "num_input_tokens_seen": 13980408, + "step": 22950 + }, + { + "epoch": 6.3306674020959735, + "grad_norm": 0.00010803595796460286, + "learning_rate": 1.78545158187954e-05, + "loss": 0.0, + "num_input_tokens_seen": 13984536, + "step": 22955 + }, + { + "epoch": 6.332046332046332, + "grad_norm": 9.039574797498062e-05, + "learning_rate": 1.7842985205700722e-05, + "loss": 0.0, + "num_input_tokens_seen": 13987320, + "step": 22960 + }, + { + "epoch": 6.33342526199669, + "grad_norm": 0.00034688241430558264, + "learning_rate": 1.7831456250783647e-05, + "loss": 0.0, + "num_input_tokens_seen": 13991128, + "step": 22965 + }, + { + "epoch": 6.3348041919470495, + "grad_norm": 1.5020029422885273e-05, + "learning_rate": 1.781992895671525e-05, + "loss": 0.0, + "num_input_tokens_seen": 13993880, + "step": 22970 + }, + { + "epoch": 6.336183121897408, + "grad_norm": 2.4346560167032294e-05, + "learning_rate": 1.780840332616625e-05, + "loss": 0.0, + "num_input_tokens_seen": 13998296, + "step": 22975 + }, + { + "epoch": 6.337562051847766, + "grad_norm": 0.0004979103687219322, + "learning_rate": 1.7796879361806967e-05, + "loss": 0.0, + "num_input_tokens_seen": 14000600, + "step": 22980 + }, + { + "epoch": 6.338940981798125, + "grad_norm": 0.00034212222089990973, + "learning_rate": 1.7785357066307338e-05, + "loss": 0.0, + "num_input_tokens_seen": 14002872, + "step": 22985 + }, + { + "epoch": 6.340319911748483, + "grad_norm": 2.0801086066057906e-05, + "learning_rate": 1.7773836442336923e-05, + "loss": 0.0, + "num_input_tokens_seen": 14005336, + "step": 22990 + }, + { + "epoch": 6.341698841698841, + "grad_norm": 0.0006550775724463165, + "learning_rate": 1.7762317492564876e-05, + "loss": 0.0, + "num_input_tokens_seen": 14008856, + "step": 22995 + }, + { + "epoch": 6.343077771649201, + "grad_norm": 1.5951196473906748e-05, + "learning_rate": 1.7750800219659963e-05, + "loss": 0.0, + "num_input_tokens_seen": 14011480, + "step": 23000 + }, + { + "epoch": 6.344456701599559, + "grad_norm": 0.00037148562842048705, + "learning_rate": 1.7739284626290592e-05, + "loss": 0.0, + "num_input_tokens_seen": 14014392, + "step": 23005 + }, + { + "epoch": 6.345835631549917, + "grad_norm": 0.0015464565949514508, + "learning_rate": 1.7727770715124745e-05, + "loss": 0.0, + "num_input_tokens_seen": 14017688, + "step": 23010 + }, + { + "epoch": 6.347214561500276, + "grad_norm": 5.430831151898019e-05, + "learning_rate": 1.771625848883003e-05, + "loss": 0.0, + "num_input_tokens_seen": 14020408, + "step": 23015 + }, + { + "epoch": 6.348593491450634, + "grad_norm": 9.633370609662961e-06, + "learning_rate": 1.7704747950073677e-05, + "loss": 0.0, + "num_input_tokens_seen": 14023224, + "step": 23020 + }, + { + "epoch": 6.3499724214009925, + "grad_norm": 2.3180860807769932e-05, + "learning_rate": 1.7693239101522508e-05, + "loss": 0.0, + "num_input_tokens_seen": 14027736, + "step": 23025 + }, + { + "epoch": 6.351351351351352, + "grad_norm": 5.156246334081516e-05, + "learning_rate": 1.768173194584294e-05, + "loss": 0.0, + "num_input_tokens_seen": 14031096, + "step": 23030 + }, + { + "epoch": 6.35273028130171, + "grad_norm": 0.00012442695151548833, + "learning_rate": 1.7670226485701045e-05, + "loss": 0.0, + "num_input_tokens_seen": 14033560, + "step": 23035 + }, + { + "epoch": 6.3541092112520685, + "grad_norm": 0.005731618031859398, + "learning_rate": 1.7658722723762448e-05, + "loss": 0.0, + "num_input_tokens_seen": 14037304, + "step": 23040 + }, + { + "epoch": 6.355488141202427, + "grad_norm": 0.00016622802650090307, + "learning_rate": 1.7647220662692422e-05, + "loss": 0.0, + "num_input_tokens_seen": 14039800, + "step": 23045 + }, + { + "epoch": 6.356867071152785, + "grad_norm": 0.0001621398696443066, + "learning_rate": 1.7635720305155823e-05, + "loss": 0.0, + "num_input_tokens_seen": 14044856, + "step": 23050 + }, + { + "epoch": 6.358246001103144, + "grad_norm": 2.887978007493075e-05, + "learning_rate": 1.7624221653817113e-05, + "loss": 0.0, + "num_input_tokens_seen": 14048248, + "step": 23055 + }, + { + "epoch": 6.359624931053503, + "grad_norm": 3.249336441513151e-05, + "learning_rate": 1.7612724711340384e-05, + "loss": 0.0, + "num_input_tokens_seen": 14050968, + "step": 23060 + }, + { + "epoch": 6.361003861003861, + "grad_norm": 0.008782675489783287, + "learning_rate": 1.7601229480389298e-05, + "loss": 0.0, + "num_input_tokens_seen": 14053720, + "step": 23065 + }, + { + "epoch": 6.36238279095422, + "grad_norm": 6.820290582254529e-05, + "learning_rate": 1.7589735963627125e-05, + "loss": 0.0, + "num_input_tokens_seen": 14056248, + "step": 23070 + }, + { + "epoch": 6.363761720904578, + "grad_norm": 4.830537727684714e-05, + "learning_rate": 1.757824416371677e-05, + "loss": 0.0, + "num_input_tokens_seen": 14058968, + "step": 23075 + }, + { + "epoch": 6.365140650854936, + "grad_norm": 1.238007826032117e-05, + "learning_rate": 1.756675408332069e-05, + "loss": 0.0, + "num_input_tokens_seen": 14061528, + "step": 23080 + }, + { + "epoch": 6.366519580805295, + "grad_norm": 1.535946285002865e-05, + "learning_rate": 1.7555265725101e-05, + "loss": 0.0, + "num_input_tokens_seen": 14063832, + "step": 23085 + }, + { + "epoch": 6.367898510755653, + "grad_norm": 5.023854555474827e-06, + "learning_rate": 1.7543779091719383e-05, + "loss": 0.0, + "num_input_tokens_seen": 14067480, + "step": 23090 + }, + { + "epoch": 6.369277440706012, + "grad_norm": 0.004564777947962284, + "learning_rate": 1.7532294185837113e-05, + "loss": 0.0, + "num_input_tokens_seen": 14070136, + "step": 23095 + }, + { + "epoch": 6.370656370656371, + "grad_norm": 0.0001978373184101656, + "learning_rate": 1.752081101011509e-05, + "loss": 0.0, + "num_input_tokens_seen": 14074936, + "step": 23100 + }, + { + "epoch": 6.372035300606729, + "grad_norm": 5.1842329412465915e-05, + "learning_rate": 1.7509329567213802e-05, + "loss": 0.0, + "num_input_tokens_seen": 14077432, + "step": 23105 + }, + { + "epoch": 6.373414230557088, + "grad_norm": 4.8010693717515096e-05, + "learning_rate": 1.7497849859793313e-05, + "loss": 0.0, + "num_input_tokens_seen": 14080728, + "step": 23110 + }, + { + "epoch": 6.374793160507446, + "grad_norm": 0.0001415401347912848, + "learning_rate": 1.7486371890513327e-05, + "loss": 0.0, + "num_input_tokens_seen": 14083512, + "step": 23115 + }, + { + "epoch": 6.376172090457805, + "grad_norm": 0.00017025020497385412, + "learning_rate": 1.747489566203313e-05, + "loss": 0.0, + "num_input_tokens_seen": 14087288, + "step": 23120 + }, + { + "epoch": 6.377551020408164, + "grad_norm": 6.715804829582339e-06, + "learning_rate": 1.746342117701158e-05, + "loss": 0.0, + "num_input_tokens_seen": 14089784, + "step": 23125 + }, + { + "epoch": 6.378929950358522, + "grad_norm": 0.00022700874251313508, + "learning_rate": 1.7451948438107164e-05, + "loss": 0.0, + "num_input_tokens_seen": 14093272, + "step": 23130 + }, + { + "epoch": 6.38030888030888, + "grad_norm": 5.097250938415527, + "learning_rate": 1.7440477447977942e-05, + "loss": 0.0013, + "num_input_tokens_seen": 14097144, + "step": 23135 + }, + { + "epoch": 6.381687810259239, + "grad_norm": 0.00016813093679957092, + "learning_rate": 1.7429008209281594e-05, + "loss": 0.0, + "num_input_tokens_seen": 14100568, + "step": 23140 + }, + { + "epoch": 6.383066740209597, + "grad_norm": 0.00020836610929109156, + "learning_rate": 1.741754072467536e-05, + "loss": 0.0, + "num_input_tokens_seen": 14103448, + "step": 23145 + }, + { + "epoch": 6.3844456701599555, + "grad_norm": 8.327751129399985e-05, + "learning_rate": 1.7406074996816096e-05, + "loss": 0.0, + "num_input_tokens_seen": 14107448, + "step": 23150 + }, + { + "epoch": 6.385824600110315, + "grad_norm": 7.725066097918898e-05, + "learning_rate": 1.739461102836025e-05, + "loss": 0.0, + "num_input_tokens_seen": 14110456, + "step": 23155 + }, + { + "epoch": 6.387203530060673, + "grad_norm": 2.8624010155908763e-05, + "learning_rate": 1.7383148821963863e-05, + "loss": 0.0, + "num_input_tokens_seen": 14113976, + "step": 23160 + }, + { + "epoch": 6.3885824600110315, + "grad_norm": 0.00011530439223861322, + "learning_rate": 1.7371688380282545e-05, + "loss": 0.0001, + "num_input_tokens_seen": 14117656, + "step": 23165 + }, + { + "epoch": 6.38996138996139, + "grad_norm": 3.1316434615291655e-05, + "learning_rate": 1.736022970597155e-05, + "loss": 0.0, + "num_input_tokens_seen": 14121720, + "step": 23170 + }, + { + "epoch": 6.391340319911748, + "grad_norm": 0.004966146312654018, + "learning_rate": 1.7348772801685652e-05, + "loss": 0.0, + "num_input_tokens_seen": 14124216, + "step": 23175 + }, + { + "epoch": 6.392719249862107, + "grad_norm": 0.0001146530921687372, + "learning_rate": 1.733731767007927e-05, + "loss": 0.0, + "num_input_tokens_seen": 14126392, + "step": 23180 + }, + { + "epoch": 6.394098179812466, + "grad_norm": 4.167616498307325e-05, + "learning_rate": 1.7325864313806392e-05, + "loss": 0.0001, + "num_input_tokens_seen": 14129048, + "step": 23185 + }, + { + "epoch": 6.395477109762824, + "grad_norm": 0.00013819488231092691, + "learning_rate": 1.731441273552059e-05, + "loss": 0.0, + "num_input_tokens_seen": 14132024, + "step": 23190 + }, + { + "epoch": 6.396856039713183, + "grad_norm": 1.7990412743529305e-05, + "learning_rate": 1.7302962937875045e-05, + "loss": 0.0, + "num_input_tokens_seen": 14135224, + "step": 23195 + }, + { + "epoch": 6.398234969663541, + "grad_norm": 0.0017188796773552895, + "learning_rate": 1.7291514923522496e-05, + "loss": 0.0, + "num_input_tokens_seen": 14137752, + "step": 23200 + }, + { + "epoch": 6.399613899613899, + "grad_norm": 0.00025977654149755836, + "learning_rate": 1.728006869511528e-05, + "loss": 0.0, + "num_input_tokens_seen": 14141560, + "step": 23205 + }, + { + "epoch": 6.400992829564258, + "grad_norm": 9.482164750806987e-05, + "learning_rate": 1.7268624255305347e-05, + "loss": 0.0, + "num_input_tokens_seen": 14144408, + "step": 23210 + }, + { + "epoch": 6.402371759514617, + "grad_norm": 0.0001362623879685998, + "learning_rate": 1.7257181606744187e-05, + "loss": 0.0, + "num_input_tokens_seen": 14147352, + "step": 23215 + }, + { + "epoch": 6.403750689464975, + "grad_norm": 1.7984584701480344e-05, + "learning_rate": 1.72457407520829e-05, + "loss": 0.0, + "num_input_tokens_seen": 14150648, + "step": 23220 + }, + { + "epoch": 6.405129619415334, + "grad_norm": 2.766482430160977e-05, + "learning_rate": 1.723430169397218e-05, + "loss": 0.0, + "num_input_tokens_seen": 14152888, + "step": 23225 + }, + { + "epoch": 6.406508549365692, + "grad_norm": 1.7716927686706185e-05, + "learning_rate": 1.722286443506228e-05, + "loss": 0.0, + "num_input_tokens_seen": 14155736, + "step": 23230 + }, + { + "epoch": 6.4078874793160505, + "grad_norm": 3.7102137866895646e-05, + "learning_rate": 1.7211428978003065e-05, + "loss": 0.0, + "num_input_tokens_seen": 14158200, + "step": 23235 + }, + { + "epoch": 6.409266409266409, + "grad_norm": 2.515218875487335e-05, + "learning_rate": 1.719999532544395e-05, + "loss": 0.0, + "num_input_tokens_seen": 14162584, + "step": 23240 + }, + { + "epoch": 6.410645339216768, + "grad_norm": 0.0006786913727410138, + "learning_rate": 1.7188563480033946e-05, + "loss": 0.0, + "num_input_tokens_seen": 14165784, + "step": 23245 + }, + { + "epoch": 6.4120242691671265, + "grad_norm": 2.8208669391460717e-06, + "learning_rate": 1.7177133444421666e-05, + "loss": 0.0, + "num_input_tokens_seen": 14169144, + "step": 23250 + }, + { + "epoch": 6.413403199117485, + "grad_norm": 1.2425520253600553e-05, + "learning_rate": 1.716570522125527e-05, + "loss": 0.0, + "num_input_tokens_seen": 14172472, + "step": 23255 + }, + { + "epoch": 6.414782129067843, + "grad_norm": 2.4415614461759105e-05, + "learning_rate": 1.7154278813182508e-05, + "loss": 0.0, + "num_input_tokens_seen": 14175352, + "step": 23260 + }, + { + "epoch": 6.416161059018202, + "grad_norm": 0.0002450055326335132, + "learning_rate": 1.714285422285074e-05, + "loss": 0.0469, + "num_input_tokens_seen": 14177944, + "step": 23265 + }, + { + "epoch": 6.41753998896856, + "grad_norm": 0.0001475980825489387, + "learning_rate": 1.7131431452906858e-05, + "loss": 0.0, + "num_input_tokens_seen": 14180376, + "step": 23270 + }, + { + "epoch": 6.418918918918919, + "grad_norm": 3.7684269045712426e-05, + "learning_rate": 1.7120010505997353e-05, + "loss": 0.0, + "num_input_tokens_seen": 14183224, + "step": 23275 + }, + { + "epoch": 6.420297848869278, + "grad_norm": 0.00014477729564532638, + "learning_rate": 1.710859138476831e-05, + "loss": 0.0, + "num_input_tokens_seen": 14187064, + "step": 23280 + }, + { + "epoch": 6.421676778819636, + "grad_norm": 3.1099164061743068e-06, + "learning_rate": 1.709717409186536e-05, + "loss": 0.0, + "num_input_tokens_seen": 14190104, + "step": 23285 + }, + { + "epoch": 6.423055708769994, + "grad_norm": 2.6110652470379137e-05, + "learning_rate": 1.708575862993373e-05, + "loss": 0.0, + "num_input_tokens_seen": 14193752, + "step": 23290 + }, + { + "epoch": 6.424434638720353, + "grad_norm": 0.0015297755599021912, + "learning_rate": 1.707434500161823e-05, + "loss": 0.0, + "num_input_tokens_seen": 14197592, + "step": 23295 + }, + { + "epoch": 6.425813568670711, + "grad_norm": 5.255139058135683e-06, + "learning_rate": 1.7062933209563218e-05, + "loss": 0.0, + "num_input_tokens_seen": 14200312, + "step": 23300 + }, + { + "epoch": 6.4271924986210704, + "grad_norm": 7.303752499865368e-05, + "learning_rate": 1.7051523256412656e-05, + "loss": 0.0, + "num_input_tokens_seen": 14203096, + "step": 23305 + }, + { + "epoch": 6.428571428571429, + "grad_norm": 0.0001264705351786688, + "learning_rate": 1.704011514481006e-05, + "loss": 0.0, + "num_input_tokens_seen": 14206136, + "step": 23310 + }, + { + "epoch": 6.429950358521787, + "grad_norm": 2.1023750377935357e-05, + "learning_rate": 1.7028708877398516e-05, + "loss": 0.0, + "num_input_tokens_seen": 14210552, + "step": 23315 + }, + { + "epoch": 6.431329288472146, + "grad_norm": 8.420677659159992e-06, + "learning_rate": 1.701730445682071e-05, + "loss": 0.0, + "num_input_tokens_seen": 14213496, + "step": 23320 + }, + { + "epoch": 6.432708218422504, + "grad_norm": 6.058874532754999e-06, + "learning_rate": 1.700590188571887e-05, + "loss": 0.0, + "num_input_tokens_seen": 14216216, + "step": 23325 + }, + { + "epoch": 6.434087148372862, + "grad_norm": 6.300864333752543e-05, + "learning_rate": 1.699450116673481e-05, + "loss": 0.0, + "num_input_tokens_seen": 14218648, + "step": 23330 + }, + { + "epoch": 6.435466078323222, + "grad_norm": 0.00029637437546625733, + "learning_rate": 1.6983102302509925e-05, + "loss": 0.0, + "num_input_tokens_seen": 14221784, + "step": 23335 + }, + { + "epoch": 6.43684500827358, + "grad_norm": 5.317851901054382e-05, + "learning_rate": 1.6971705295685147e-05, + "loss": 0.0, + "num_input_tokens_seen": 14224248, + "step": 23340 + }, + { + "epoch": 6.438223938223938, + "grad_norm": 0.00015997541777323931, + "learning_rate": 1.6960310148901025e-05, + "loss": 0.0, + "num_input_tokens_seen": 14227896, + "step": 23345 + }, + { + "epoch": 6.439602868174297, + "grad_norm": 0.00010653011850081384, + "learning_rate": 1.694891686479763e-05, + "loss": 0.0, + "num_input_tokens_seen": 14230808, + "step": 23350 + }, + { + "epoch": 6.440981798124655, + "grad_norm": 0.00016738162958063185, + "learning_rate": 1.6937525446014624e-05, + "loss": 0.0, + "num_input_tokens_seen": 14234424, + "step": 23355 + }, + { + "epoch": 6.4423607280750135, + "grad_norm": 6.773040513508022e-05, + "learning_rate": 1.6926135895191242e-05, + "loss": 0.0, + "num_input_tokens_seen": 14237208, + "step": 23360 + }, + { + "epoch": 6.443739658025372, + "grad_norm": 1.6674131984473206e-05, + "learning_rate": 1.691474821496628e-05, + "loss": 0.0, + "num_input_tokens_seen": 14239640, + "step": 23365 + }, + { + "epoch": 6.445118587975731, + "grad_norm": 0.0021720535587519407, + "learning_rate": 1.6903362407978092e-05, + "loss": 0.0, + "num_input_tokens_seen": 14241912, + "step": 23370 + }, + { + "epoch": 6.4464975179260895, + "grad_norm": 0.0002666068612597883, + "learning_rate": 1.6891978476864618e-05, + "loss": 0.0, + "num_input_tokens_seen": 14244984, + "step": 23375 + }, + { + "epoch": 6.447876447876448, + "grad_norm": 3.348618702148087e-05, + "learning_rate": 1.6880596424263338e-05, + "loss": 0.0, + "num_input_tokens_seen": 14249208, + "step": 23380 + }, + { + "epoch": 6.449255377826806, + "grad_norm": 5.98017231823178e-06, + "learning_rate": 1.6869216252811327e-05, + "loss": 0.0, + "num_input_tokens_seen": 14252248, + "step": 23385 + }, + { + "epoch": 6.450634307777165, + "grad_norm": 1.7636049960856326e-05, + "learning_rate": 1.685783796514519e-05, + "loss": 0.0, + "num_input_tokens_seen": 14254968, + "step": 23390 + }, + { + "epoch": 6.452013237727524, + "grad_norm": 3.5377772292122245e-05, + "learning_rate": 1.684646156390112e-05, + "loss": 0.0, + "num_input_tokens_seen": 14258712, + "step": 23395 + }, + { + "epoch": 6.453392167677882, + "grad_norm": 3.3838980471045943e-06, + "learning_rate": 1.683508705171487e-05, + "loss": 0.0, + "num_input_tokens_seen": 14261656, + "step": 23400 + }, + { + "epoch": 6.454771097628241, + "grad_norm": 4.520587754086591e-05, + "learning_rate": 1.6823714431221747e-05, + "loss": 0.0, + "num_input_tokens_seen": 14263960, + "step": 23405 + }, + { + "epoch": 6.456150027578599, + "grad_norm": 7.078326689224923e-06, + "learning_rate": 1.681234370505662e-05, + "loss": 0.0, + "num_input_tokens_seen": 14266968, + "step": 23410 + }, + { + "epoch": 6.457528957528957, + "grad_norm": 0.0004590281459968537, + "learning_rate": 1.6800974875853937e-05, + "loss": 0.0, + "num_input_tokens_seen": 14269144, + "step": 23415 + }, + { + "epoch": 6.458907887479316, + "grad_norm": 7.382342300843447e-05, + "learning_rate": 1.678960794624767e-05, + "loss": 0.0, + "num_input_tokens_seen": 14271864, + "step": 23420 + }, + { + "epoch": 6.460286817429674, + "grad_norm": 0.000543157453648746, + "learning_rate": 1.67782429188714e-05, + "loss": 0.0, + "num_input_tokens_seen": 14274680, + "step": 23425 + }, + { + "epoch": 6.461665747380033, + "grad_norm": 0.00513939606025815, + "learning_rate": 1.676687979635822e-05, + "loss": 0.0, + "num_input_tokens_seen": 14277464, + "step": 23430 + }, + { + "epoch": 6.463044677330392, + "grad_norm": 1.371942380501423e-05, + "learning_rate": 1.6755518581340804e-05, + "loss": 0.0, + "num_input_tokens_seen": 14281688, + "step": 23435 + }, + { + "epoch": 6.46442360728075, + "grad_norm": 6.56812044326216e-05, + "learning_rate": 1.6744159276451397e-05, + "loss": 0.0, + "num_input_tokens_seen": 14284216, + "step": 23440 + }, + { + "epoch": 6.4658025372311085, + "grad_norm": 7.971166633069515e-05, + "learning_rate": 1.673280188432178e-05, + "loss": 0.0, + "num_input_tokens_seen": 14286904, + "step": 23445 + }, + { + "epoch": 6.467181467181467, + "grad_norm": 0.004231921397149563, + "learning_rate": 1.6721446407583284e-05, + "loss": 0.0, + "num_input_tokens_seen": 14289496, + "step": 23450 + }, + { + "epoch": 6.468560397131825, + "grad_norm": 1.4966433809604496e-05, + "learning_rate": 1.6710092848866834e-05, + "loss": 0.0, + "num_input_tokens_seen": 14292376, + "step": 23455 + }, + { + "epoch": 6.4699393270821846, + "grad_norm": 0.0001098633510991931, + "learning_rate": 1.6698741210802867e-05, + "loss": 0.0, + "num_input_tokens_seen": 14296088, + "step": 23460 + }, + { + "epoch": 6.471318257032543, + "grad_norm": 2.429874257359188e-06, + "learning_rate": 1.6687391496021397e-05, + "loss": 0.0, + "num_input_tokens_seen": 14298680, + "step": 23465 + }, + { + "epoch": 6.472697186982901, + "grad_norm": 0.00646260567009449, + "learning_rate": 1.6676043707152007e-05, + "loss": 0.0, + "num_input_tokens_seen": 14302136, + "step": 23470 + }, + { + "epoch": 6.47407611693326, + "grad_norm": 5.185075224289903e-06, + "learning_rate": 1.666469784682379e-05, + "loss": 0.0, + "num_input_tokens_seen": 14305080, + "step": 23475 + }, + { + "epoch": 6.475455046883618, + "grad_norm": 3.919495611626189e-06, + "learning_rate": 1.665335391766545e-05, + "loss": 0.0, + "num_input_tokens_seen": 14308056, + "step": 23480 + }, + { + "epoch": 6.476833976833976, + "grad_norm": 0.005868515465408564, + "learning_rate": 1.664201192230519e-05, + "loss": 0.0, + "num_input_tokens_seen": 14310104, + "step": 23485 + }, + { + "epoch": 6.478212906784336, + "grad_norm": 2.324082925042603e-05, + "learning_rate": 1.6630671863370784e-05, + "loss": 0.0, + "num_input_tokens_seen": 14314520, + "step": 23490 + }, + { + "epoch": 6.479591836734694, + "grad_norm": 2.844141454261262e-05, + "learning_rate": 1.661933374348958e-05, + "loss": 0.0, + "num_input_tokens_seen": 14316984, + "step": 23495 + }, + { + "epoch": 6.4809707666850525, + "grad_norm": 6.764614954590797e-05, + "learning_rate": 1.6607997565288446e-05, + "loss": 0.0, + "num_input_tokens_seen": 14320344, + "step": 23500 + }, + { + "epoch": 6.482349696635411, + "grad_norm": 0.0002665662323124707, + "learning_rate": 1.659666333139381e-05, + "loss": 0.0, + "num_input_tokens_seen": 14322968, + "step": 23505 + }, + { + "epoch": 6.483728626585769, + "grad_norm": 1.7354360579702188e-06, + "learning_rate": 1.6585331044431662e-05, + "loss": 0.0, + "num_input_tokens_seen": 14326328, + "step": 23510 + }, + { + "epoch": 6.485107556536128, + "grad_norm": 0.00011998268018942326, + "learning_rate": 1.6574000707027516e-05, + "loss": 0.0, + "num_input_tokens_seen": 14329400, + "step": 23515 + }, + { + "epoch": 6.486486486486487, + "grad_norm": 1.0762530109786894e-05, + "learning_rate": 1.656267232180646e-05, + "loss": 0.0, + "num_input_tokens_seen": 14332696, + "step": 23520 + }, + { + "epoch": 6.487865416436845, + "grad_norm": 2.445634891046211e-06, + "learning_rate": 1.655134589139312e-05, + "loss": 0.0, + "num_input_tokens_seen": 14335384, + "step": 23525 + }, + { + "epoch": 6.489244346387204, + "grad_norm": 3.794341409957269e-06, + "learning_rate": 1.6540021418411648e-05, + "loss": 0.0, + "num_input_tokens_seen": 14338456, + "step": 23530 + }, + { + "epoch": 6.490623276337562, + "grad_norm": 5.789535498479381e-05, + "learning_rate": 1.6528698905485784e-05, + "loss": 0.0, + "num_input_tokens_seen": 14340856, + "step": 23535 + }, + { + "epoch": 6.49200220628792, + "grad_norm": 2.2765258108847775e-05, + "learning_rate": 1.6517378355238783e-05, + "loss": 0.0, + "num_input_tokens_seen": 14343736, + "step": 23540 + }, + { + "epoch": 6.493381136238279, + "grad_norm": 4.27042232331587e-06, + "learning_rate": 1.650605977029345e-05, + "loss": 0.0, + "num_input_tokens_seen": 14346616, + "step": 23545 + }, + { + "epoch": 6.494760066188638, + "grad_norm": 6.321926775854081e-06, + "learning_rate": 1.6494743153272147e-05, + "loss": 0.0, + "num_input_tokens_seen": 14349688, + "step": 23550 + }, + { + "epoch": 6.496138996138996, + "grad_norm": 2.1522553652175702e-05, + "learning_rate": 1.6483428506796772e-05, + "loss": 0.0, + "num_input_tokens_seen": 14352504, + "step": 23555 + }, + { + "epoch": 6.497517926089355, + "grad_norm": 0.00379698327742517, + "learning_rate": 1.6472115833488747e-05, + "loss": 0.0, + "num_input_tokens_seen": 14355448, + "step": 23560 + }, + { + "epoch": 6.498896856039713, + "grad_norm": 1.9552277080947533e-05, + "learning_rate": 1.6460805135969077e-05, + "loss": 0.0, + "num_input_tokens_seen": 14358584, + "step": 23565 + }, + { + "epoch": 6.5, + "eval_loss": 0.3192580044269562, + "eval_runtime": 28.4995, + "eval_samples_per_second": 56.562, + "eval_steps_per_second": 14.141, + "num_input_tokens_seen": 14360952, + "step": 23569 + }, + { + "epoch": 6.5002757859900715, + "grad_norm": 0.00028169533470645547, + "learning_rate": 1.6449496416858284e-05, + "loss": 0.0, + "num_input_tokens_seen": 14361400, + "step": 23570 + }, + { + "epoch": 6.50165471594043, + "grad_norm": 1.4363409718498588e-05, + "learning_rate": 1.643818967877643e-05, + "loss": 0.0, + "num_input_tokens_seen": 14364664, + "step": 23575 + }, + { + "epoch": 6.503033645890788, + "grad_norm": 0.00021909267525188625, + "learning_rate": 1.6426884924343135e-05, + "loss": 0.0, + "num_input_tokens_seen": 14367704, + "step": 23580 + }, + { + "epoch": 6.5044125758411475, + "grad_norm": 6.287185442488408e-06, + "learning_rate": 1.6415582156177534e-05, + "loss": 0.0, + "num_input_tokens_seen": 14370776, + "step": 23585 + }, + { + "epoch": 6.505791505791506, + "grad_norm": 0.0002180666633648798, + "learning_rate": 1.6404281376898328e-05, + "loss": 0.0, + "num_input_tokens_seen": 14374360, + "step": 23590 + }, + { + "epoch": 6.507170435741864, + "grad_norm": 1.7332710058326484e-06, + "learning_rate": 1.639298258912374e-05, + "loss": 0.0, + "num_input_tokens_seen": 14377304, + "step": 23595 + }, + { + "epoch": 6.508549365692223, + "grad_norm": 2.5107597139140125e-06, + "learning_rate": 1.638168579547153e-05, + "loss": 0.0, + "num_input_tokens_seen": 14380152, + "step": 23600 + }, + { + "epoch": 6.509928295642581, + "grad_norm": 1.6512778984179022e-06, + "learning_rate": 1.6370390998559014e-05, + "loss": 0.0, + "num_input_tokens_seen": 14383288, + "step": 23605 + }, + { + "epoch": 6.51130722559294, + "grad_norm": 2.5140767320408486e-05, + "learning_rate": 1.6359098201003038e-05, + "loss": 0.0, + "num_input_tokens_seen": 14385976, + "step": 23610 + }, + { + "epoch": 6.512686155543299, + "grad_norm": 5.008102561987471e-06, + "learning_rate": 1.6347807405419962e-05, + "loss": 0.0, + "num_input_tokens_seen": 14390456, + "step": 23615 + }, + { + "epoch": 6.514065085493657, + "grad_norm": 0.0003732366021722555, + "learning_rate": 1.633651861442572e-05, + "loss": 0.0, + "num_input_tokens_seen": 14394136, + "step": 23620 + }, + { + "epoch": 6.515444015444015, + "grad_norm": 0.00021857618412468582, + "learning_rate": 1.632523183063575e-05, + "loss": 0.0, + "num_input_tokens_seen": 14397272, + "step": 23625 + }, + { + "epoch": 6.516822945394374, + "grad_norm": 6.422470323741436e-05, + "learning_rate": 1.6313947056665056e-05, + "loss": 0.0, + "num_input_tokens_seen": 14400408, + "step": 23630 + }, + { + "epoch": 6.518201875344732, + "grad_norm": 3.808494875556789e-05, + "learning_rate": 1.6302664295128133e-05, + "loss": 0.0, + "num_input_tokens_seen": 14402456, + "step": 23635 + }, + { + "epoch": 6.5195808052950905, + "grad_norm": 0.0001570853200973943, + "learning_rate": 1.629138354863905e-05, + "loss": 0.0, + "num_input_tokens_seen": 14406200, + "step": 23640 + }, + { + "epoch": 6.52095973524545, + "grad_norm": 1.0107301932293922e-05, + "learning_rate": 1.6280104819811397e-05, + "loss": 0.0, + "num_input_tokens_seen": 14410040, + "step": 23645 + }, + { + "epoch": 6.522338665195808, + "grad_norm": 3.908782309736125e-05, + "learning_rate": 1.6268828111258288e-05, + "loss": 0.0, + "num_input_tokens_seen": 14413304, + "step": 23650 + }, + { + "epoch": 6.523717595146167, + "grad_norm": 1.049180264089955e-05, + "learning_rate": 1.6257553425592368e-05, + "loss": 0.0, + "num_input_tokens_seen": 14416088, + "step": 23655 + }, + { + "epoch": 6.525096525096525, + "grad_norm": 1.2509583484643372e-06, + "learning_rate": 1.6246280765425835e-05, + "loss": 0.0, + "num_input_tokens_seen": 14419064, + "step": 23660 + }, + { + "epoch": 6.526475455046883, + "grad_norm": 8.26572249934543e-06, + "learning_rate": 1.6235010133370383e-05, + "loss": 0.0, + "num_input_tokens_seen": 14422168, + "step": 23665 + }, + { + "epoch": 6.527854384997243, + "grad_norm": 6.452219167840667e-06, + "learning_rate": 1.622374153203727e-05, + "loss": 0.0, + "num_input_tokens_seen": 14425144, + "step": 23670 + }, + { + "epoch": 6.529233314947601, + "grad_norm": 1.3842844964528922e-05, + "learning_rate": 1.6212474964037278e-05, + "loss": 0.0002, + "num_input_tokens_seen": 14428280, + "step": 23675 + }, + { + "epoch": 6.530612244897959, + "grad_norm": 5.889490694244159e-06, + "learning_rate": 1.6201210431980678e-05, + "loss": 0.0, + "num_input_tokens_seen": 14430456, + "step": 23680 + }, + { + "epoch": 6.531991174848318, + "grad_norm": 5.493161097547272e-06, + "learning_rate": 1.6189947938477338e-05, + "loss": 0.0013, + "num_input_tokens_seen": 14433016, + "step": 23685 + }, + { + "epoch": 6.533370104798676, + "grad_norm": 1.5605550061081885e-06, + "learning_rate": 1.6178687486136592e-05, + "loss": 0.0, + "num_input_tokens_seen": 14435256, + "step": 23690 + }, + { + "epoch": 6.5347490347490345, + "grad_norm": 5.98447622905951e-05, + "learning_rate": 1.6167429077567323e-05, + "loss": 0.0, + "num_input_tokens_seen": 14438328, + "step": 23695 + }, + { + "epoch": 6.536127964699393, + "grad_norm": 9.683307325758506e-06, + "learning_rate": 1.615617271537796e-05, + "loss": 0.0, + "num_input_tokens_seen": 14441080, + "step": 23700 + }, + { + "epoch": 6.537506894649752, + "grad_norm": 6.29564601695165e-06, + "learning_rate": 1.6144918402176425e-05, + "loss": 0.0, + "num_input_tokens_seen": 14443544, + "step": 23705 + }, + { + "epoch": 6.5388858246001105, + "grad_norm": 3.1273184504243545e-06, + "learning_rate": 1.613366614057018e-05, + "loss": 0.0, + "num_input_tokens_seen": 14446104, + "step": 23710 + }, + { + "epoch": 6.540264754550469, + "grad_norm": 4.830634225072572e-06, + "learning_rate": 1.6122415933166228e-05, + "loss": 0.0, + "num_input_tokens_seen": 14448472, + "step": 23715 + }, + { + "epoch": 6.541643684500827, + "grad_norm": 2.7349344236426987e-05, + "learning_rate": 1.611116778257107e-05, + "loss": 0.0, + "num_input_tokens_seen": 14451672, + "step": 23720 + }, + { + "epoch": 6.543022614451186, + "grad_norm": 6.24989752395777e-06, + "learning_rate": 1.6099921691390747e-05, + "loss": 0.0, + "num_input_tokens_seen": 14454936, + "step": 23725 + }, + { + "epoch": 6.544401544401545, + "grad_norm": 8.67773087520618e-06, + "learning_rate": 1.608867766223081e-05, + "loss": 0.0, + "num_input_tokens_seen": 14457688, + "step": 23730 + }, + { + "epoch": 6.545780474351903, + "grad_norm": 2.1085887055960484e-05, + "learning_rate": 1.6077435697696335e-05, + "loss": 0.0, + "num_input_tokens_seen": 14460760, + "step": 23735 + }, + { + "epoch": 6.547159404302262, + "grad_norm": 3.0249871088017244e-06, + "learning_rate": 1.6066195800391937e-05, + "loss": 0.0, + "num_input_tokens_seen": 14467096, + "step": 23740 + }, + { + "epoch": 6.54853833425262, + "grad_norm": 1.738413811835926e-05, + "learning_rate": 1.6054957972921735e-05, + "loss": 0.0, + "num_input_tokens_seen": 14469336, + "step": 23745 + }, + { + "epoch": 6.549917264202978, + "grad_norm": 4.357533543952741e-05, + "learning_rate": 1.6043722217889363e-05, + "loss": 0.0, + "num_input_tokens_seen": 14472120, + "step": 23750 + }, + { + "epoch": 6.551296194153337, + "grad_norm": 2.8419999580364674e-06, + "learning_rate": 1.6032488537897993e-05, + "loss": 0.0, + "num_input_tokens_seen": 14475192, + "step": 23755 + }, + { + "epoch": 6.552675124103695, + "grad_norm": 0.004060519393533468, + "learning_rate": 1.6021256935550304e-05, + "loss": 0.0, + "num_input_tokens_seen": 14477848, + "step": 23760 + }, + { + "epoch": 6.554054054054054, + "grad_norm": 5.309214884618996e-06, + "learning_rate": 1.6010027413448503e-05, + "loss": 0.0, + "num_input_tokens_seen": 14480664, + "step": 23765 + }, + { + "epoch": 6.555432984004413, + "grad_norm": 2.9913273465353996e-05, + "learning_rate": 1.5998799974194304e-05, + "loss": 0.0, + "num_input_tokens_seen": 14483672, + "step": 23770 + }, + { + "epoch": 6.556811913954771, + "grad_norm": 1.917832150866161e-06, + "learning_rate": 1.598757462038894e-05, + "loss": 0.0, + "num_input_tokens_seen": 14487544, + "step": 23775 + }, + { + "epoch": 6.5581908439051295, + "grad_norm": 0.003425265895202756, + "learning_rate": 1.597635135463317e-05, + "loss": 0.0, + "num_input_tokens_seen": 14491864, + "step": 23780 + }, + { + "epoch": 6.559569773855488, + "grad_norm": 5.102618160890415e-06, + "learning_rate": 1.5965130179527267e-05, + "loss": 0.0, + "num_input_tokens_seen": 14494392, + "step": 23785 + }, + { + "epoch": 6.560948703805847, + "grad_norm": 2.109327760990709e-05, + "learning_rate": 1.5953911097671e-05, + "loss": 0.0, + "num_input_tokens_seen": 14497176, + "step": 23790 + }, + { + "epoch": 6.5623276337562055, + "grad_norm": 7.045009624562226e-06, + "learning_rate": 1.5942694111663692e-05, + "loss": 0.0, + "num_input_tokens_seen": 14501880, + "step": 23795 + }, + { + "epoch": 6.563706563706564, + "grad_norm": 1.353351876787201e-06, + "learning_rate": 1.5931479224104146e-05, + "loss": 0.0, + "num_input_tokens_seen": 14504504, + "step": 23800 + }, + { + "epoch": 6.565085493656922, + "grad_norm": 5.43043142897659e-06, + "learning_rate": 1.592026643759068e-05, + "loss": 0.0, + "num_input_tokens_seen": 14507512, + "step": 23805 + }, + { + "epoch": 6.566464423607281, + "grad_norm": 9.410636994289234e-05, + "learning_rate": 1.590905575472115e-05, + "loss": 0.0, + "num_input_tokens_seen": 14510264, + "step": 23810 + }, + { + "epoch": 6.567843353557639, + "grad_norm": 1.833157875807956e-05, + "learning_rate": 1.58978471780929e-05, + "loss": 0.0, + "num_input_tokens_seen": 14512536, + "step": 23815 + }, + { + "epoch": 6.569222283507997, + "grad_norm": 1.4206633750291076e-05, + "learning_rate": 1.5886640710302816e-05, + "loss": 0.0, + "num_input_tokens_seen": 14515256, + "step": 23820 + }, + { + "epoch": 6.570601213458357, + "grad_norm": 7.9672499850858e-06, + "learning_rate": 1.587543635394726e-05, + "loss": 0.0, + "num_input_tokens_seen": 14520408, + "step": 23825 + }, + { + "epoch": 6.571980143408715, + "grad_norm": 3.148439418509952e-06, + "learning_rate": 1.5864234111622116e-05, + "loss": 0.0, + "num_input_tokens_seen": 14524248, + "step": 23830 + }, + { + "epoch": 6.573359073359073, + "grad_norm": 1.831692316045519e-05, + "learning_rate": 1.58530339859228e-05, + "loss": 0.0, + "num_input_tokens_seen": 14527384, + "step": 23835 + }, + { + "epoch": 6.574738003309432, + "grad_norm": 1.37251026899321e-06, + "learning_rate": 1.5841835979444204e-05, + "loss": 0.0, + "num_input_tokens_seen": 14529784, + "step": 23840 + }, + { + "epoch": 6.57611693325979, + "grad_norm": 0.0018110350938513875, + "learning_rate": 1.5830640094780747e-05, + "loss": 0.0, + "num_input_tokens_seen": 14533784, + "step": 23845 + }, + { + "epoch": 6.577495863210149, + "grad_norm": 3.536477152010775e-06, + "learning_rate": 1.581944633452636e-05, + "loss": 0.0, + "num_input_tokens_seen": 14537240, + "step": 23850 + }, + { + "epoch": 6.578874793160508, + "grad_norm": 7.346033817157149e-05, + "learning_rate": 1.5808254701274477e-05, + "loss": 0.0, + "num_input_tokens_seen": 14540696, + "step": 23855 + }, + { + "epoch": 6.580253723110866, + "grad_norm": 2.8904669306939468e-06, + "learning_rate": 1.5797065197618044e-05, + "loss": 0.0, + "num_input_tokens_seen": 14544280, + "step": 23860 + }, + { + "epoch": 6.581632653061225, + "grad_norm": 7.193828878371278e-06, + "learning_rate": 1.57858778261495e-05, + "loss": 0.0, + "num_input_tokens_seen": 14546936, + "step": 23865 + }, + { + "epoch": 6.583011583011583, + "grad_norm": 4.352640098659322e-05, + "learning_rate": 1.5774692589460794e-05, + "loss": 0.0, + "num_input_tokens_seen": 14549816, + "step": 23870 + }, + { + "epoch": 6.584390512961941, + "grad_norm": 2.7159101136930985e-06, + "learning_rate": 1.5763509490143398e-05, + "loss": 0.0002, + "num_input_tokens_seen": 14553272, + "step": 23875 + }, + { + "epoch": 6.5857694429123, + "grad_norm": 1.588769919180777e-06, + "learning_rate": 1.575232853078826e-05, + "loss": 0.0, + "num_input_tokens_seen": 14558360, + "step": 23880 + }, + { + "epoch": 6.587148372862659, + "grad_norm": 2.6063964924105676e-06, + "learning_rate": 1.574114971398586e-05, + "loss": 0.0, + "num_input_tokens_seen": 14560888, + "step": 23885 + }, + { + "epoch": 6.588527302813017, + "grad_norm": 0.0001892686850624159, + "learning_rate": 1.5729973042326175e-05, + "loss": 0.0, + "num_input_tokens_seen": 14563608, + "step": 23890 + }, + { + "epoch": 6.589906232763376, + "grad_norm": 6.855976607766934e-06, + "learning_rate": 1.5718798518398665e-05, + "loss": 0.0, + "num_input_tokens_seen": 14565624, + "step": 23895 + }, + { + "epoch": 6.591285162713734, + "grad_norm": 3.5957077670900617e-06, + "learning_rate": 1.5707626144792303e-05, + "loss": 0.0, + "num_input_tokens_seen": 14568888, + "step": 23900 + }, + { + "epoch": 6.5926640926640925, + "grad_norm": 4.950831225869479e-06, + "learning_rate": 1.5696455924095586e-05, + "loss": 0.0, + "num_input_tokens_seen": 14571768, + "step": 23905 + }, + { + "epoch": 6.594043022614451, + "grad_norm": 3.5178645703126676e-06, + "learning_rate": 1.568528785889648e-05, + "loss": 0.0, + "num_input_tokens_seen": 14574360, + "step": 23910 + }, + { + "epoch": 6.595421952564809, + "grad_norm": 2.2592016648559365e-06, + "learning_rate": 1.5674121951782473e-05, + "loss": 0.0, + "num_input_tokens_seen": 14577496, + "step": 23915 + }, + { + "epoch": 6.5968008825151685, + "grad_norm": 0.002930060727521777, + "learning_rate": 1.5662958205340543e-05, + "loss": 0.0, + "num_input_tokens_seen": 14580312, + "step": 23920 + }, + { + "epoch": 6.598179812465527, + "grad_norm": 1.238731442754215e-06, + "learning_rate": 1.5651796622157162e-05, + "loss": 0.0, + "num_input_tokens_seen": 14583096, + "step": 23925 + }, + { + "epoch": 6.599558742415885, + "grad_norm": 1.313850816586637e-06, + "learning_rate": 1.5640637204818325e-05, + "loss": 0.0, + "num_input_tokens_seen": 14585432, + "step": 23930 + }, + { + "epoch": 6.600937672366244, + "grad_norm": 3.20723734148487e-06, + "learning_rate": 1.5629479955909498e-05, + "loss": 0.0, + "num_input_tokens_seen": 14588152, + "step": 23935 + }, + { + "epoch": 6.602316602316602, + "grad_norm": 1.119164721785637e-06, + "learning_rate": 1.561832487801565e-05, + "loss": 0.0, + "num_input_tokens_seen": 14592376, + "step": 23940 + }, + { + "epoch": 6.603695532266961, + "grad_norm": 0.00046641670633107424, + "learning_rate": 1.560717197372126e-05, + "loss": 0.0, + "num_input_tokens_seen": 14595960, + "step": 23945 + }, + { + "epoch": 6.60507446221732, + "grad_norm": 7.5686957643483765e-06, + "learning_rate": 1.5596021245610297e-05, + "loss": 0.0, + "num_input_tokens_seen": 14598872, + "step": 23950 + }, + { + "epoch": 6.606453392167678, + "grad_norm": 1.442385655536782e-05, + "learning_rate": 1.5584872696266223e-05, + "loss": 0.0, + "num_input_tokens_seen": 14601176, + "step": 23955 + }, + { + "epoch": 6.607832322118036, + "grad_norm": 1.7592115909792483e-05, + "learning_rate": 1.5573726328272004e-05, + "loss": 0.1521, + "num_input_tokens_seen": 14604152, + "step": 23960 + }, + { + "epoch": 6.609211252068395, + "grad_norm": 3.238302087993361e-05, + "learning_rate": 1.5562582144210073e-05, + "loss": 0.0012, + "num_input_tokens_seen": 14607704, + "step": 23965 + }, + { + "epoch": 6.610590182018753, + "grad_norm": 2.8437549190130085e-05, + "learning_rate": 1.55514401466624e-05, + "loss": 0.0, + "num_input_tokens_seen": 14609880, + "step": 23970 + }, + { + "epoch": 6.6119691119691115, + "grad_norm": 0.00014415860641747713, + "learning_rate": 1.5540300338210418e-05, + "loss": 0.0, + "num_input_tokens_seen": 14612696, + "step": 23975 + }, + { + "epoch": 6.613348041919471, + "grad_norm": 1.166624224424595e-05, + "learning_rate": 1.5529162721435047e-05, + "loss": 0.0, + "num_input_tokens_seen": 14615320, + "step": 23980 + }, + { + "epoch": 6.614726971869829, + "grad_norm": 5.586547194980085e-05, + "learning_rate": 1.5518027298916734e-05, + "loss": 0.0, + "num_input_tokens_seen": 14618520, + "step": 23985 + }, + { + "epoch": 6.6161059018201875, + "grad_norm": 0.0005430664750747383, + "learning_rate": 1.550689407323539e-05, + "loss": 0.0, + "num_input_tokens_seen": 14620984, + "step": 23990 + }, + { + "epoch": 6.617484831770546, + "grad_norm": 1.3784978136754944e-06, + "learning_rate": 1.5495763046970413e-05, + "loss": 0.0, + "num_input_tokens_seen": 14624632, + "step": 23995 + }, + { + "epoch": 6.618863761720904, + "grad_norm": 3.1607289656676585e-06, + "learning_rate": 1.5484634222700717e-05, + "loss": 0.0, + "num_input_tokens_seen": 14627160, + "step": 24000 + }, + { + "epoch": 6.620242691671264, + "grad_norm": 1.5604078726028092e-05, + "learning_rate": 1.547350760300468e-05, + "loss": 0.0, + "num_input_tokens_seen": 14629400, + "step": 24005 + }, + { + "epoch": 6.621621621621622, + "grad_norm": 7.106359407771379e-05, + "learning_rate": 1.546238319046019e-05, + "loss": 0.0, + "num_input_tokens_seen": 14632568, + "step": 24010 + }, + { + "epoch": 6.62300055157198, + "grad_norm": 3.889339495799504e-06, + "learning_rate": 1.54512609876446e-05, + "loss": 0.0, + "num_input_tokens_seen": 14636088, + "step": 24015 + }, + { + "epoch": 6.624379481522339, + "grad_norm": 3.4977315408468712e-06, + "learning_rate": 1.544014099713478e-05, + "loss": 0.0, + "num_input_tokens_seen": 14638296, + "step": 24020 + }, + { + "epoch": 6.625758411472697, + "grad_norm": 5.890849934075959e-05, + "learning_rate": 1.5429023221507055e-05, + "loss": 0.0, + "num_input_tokens_seen": 14641208, + "step": 24025 + }, + { + "epoch": 6.627137341423055, + "grad_norm": 8.25232345960103e-05, + "learning_rate": 1.5417907663337274e-05, + "loss": 0.0, + "num_input_tokens_seen": 14644248, + "step": 24030 + }, + { + "epoch": 6.628516271373414, + "grad_norm": 1.996991159103345e-06, + "learning_rate": 1.5406794325200732e-05, + "loss": 0.0, + "num_input_tokens_seen": 14648056, + "step": 24035 + }, + { + "epoch": 6.629895201323773, + "grad_norm": 7.164961971284356e-06, + "learning_rate": 1.539568320967225e-05, + "loss": 0.0, + "num_input_tokens_seen": 14651032, + "step": 24040 + }, + { + "epoch": 6.6312741312741315, + "grad_norm": 3.189780545653775e-05, + "learning_rate": 1.5384574319326096e-05, + "loss": 0.0, + "num_input_tokens_seen": 14653720, + "step": 24045 + }, + { + "epoch": 6.63265306122449, + "grad_norm": 3.964648840337759e-06, + "learning_rate": 1.5373467656736053e-05, + "loss": 0.0001, + "num_input_tokens_seen": 14656120, + "step": 24050 + }, + { + "epoch": 6.634031991174848, + "grad_norm": 9.260531805921346e-05, + "learning_rate": 1.5362363224475372e-05, + "loss": 0.0, + "num_input_tokens_seen": 14658968, + "step": 24055 + }, + { + "epoch": 6.635410921125207, + "grad_norm": 4.97282380820252e-05, + "learning_rate": 1.535126102511678e-05, + "loss": 0.0, + "num_input_tokens_seen": 14662168, + "step": 24060 + }, + { + "epoch": 6.636789851075566, + "grad_norm": 6.8317331169964746e-06, + "learning_rate": 1.5340161061232516e-05, + "loss": 0.0, + "num_input_tokens_seen": 14664440, + "step": 24065 + }, + { + "epoch": 6.638168781025924, + "grad_norm": 0.0003719966043718159, + "learning_rate": 1.5329063335394274e-05, + "loss": 0.0, + "num_input_tokens_seen": 14667672, + "step": 24070 + }, + { + "epoch": 6.639547710976283, + "grad_norm": 6.390328053385019e-06, + "learning_rate": 1.5317967850173225e-05, + "loss": 0.0, + "num_input_tokens_seen": 14670616, + "step": 24075 + }, + { + "epoch": 6.640926640926641, + "grad_norm": 3.3285382414760534e-06, + "learning_rate": 1.530687460814006e-05, + "loss": 0.0, + "num_input_tokens_seen": 14672408, + "step": 24080 + }, + { + "epoch": 6.642305570876999, + "grad_norm": 1.9506847820593975e-05, + "learning_rate": 1.52957836118649e-05, + "loss": 0.0, + "num_input_tokens_seen": 14675416, + "step": 24085 + }, + { + "epoch": 6.643684500827358, + "grad_norm": 1.7722877601045184e-05, + "learning_rate": 1.5284694863917373e-05, + "loss": 0.0, + "num_input_tokens_seen": 14678584, + "step": 24090 + }, + { + "epoch": 6.645063430777716, + "grad_norm": 4.450424057722557e-06, + "learning_rate": 1.5273608366866603e-05, + "loss": 0.0, + "num_input_tokens_seen": 14682264, + "step": 24095 + }, + { + "epoch": 6.646442360728075, + "grad_norm": 0.00010010643018176779, + "learning_rate": 1.526252412328114e-05, + "loss": 0.0, + "num_input_tokens_seen": 14685240, + "step": 24100 + }, + { + "epoch": 6.647821290678434, + "grad_norm": 4.103007540834369e-06, + "learning_rate": 1.5251442135729077e-05, + "loss": 0.0, + "num_input_tokens_seen": 14688920, + "step": 24105 + }, + { + "epoch": 6.649200220628792, + "grad_norm": 2.513583012841991e-06, + "learning_rate": 1.5240362406777935e-05, + "loss": 0.0, + "num_input_tokens_seen": 14691384, + "step": 24110 + }, + { + "epoch": 6.6505791505791505, + "grad_norm": 5.409894129115855e-06, + "learning_rate": 1.5229284938994723e-05, + "loss": 0.0, + "num_input_tokens_seen": 14695288, + "step": 24115 + }, + { + "epoch": 6.651958080529509, + "grad_norm": 1.1010802154487465e-06, + "learning_rate": 1.5218209734945944e-05, + "loss": 0.0, + "num_input_tokens_seen": 14697816, + "step": 24120 + }, + { + "epoch": 6.653337010479867, + "grad_norm": 6.704249699396314e-06, + "learning_rate": 1.5207136797197553e-05, + "loss": 0.0, + "num_input_tokens_seen": 14700216, + "step": 24125 + }, + { + "epoch": 6.6547159404302265, + "grad_norm": 1.845926817622967e-06, + "learning_rate": 1.519606612831499e-05, + "loss": 0.0, + "num_input_tokens_seen": 14703320, + "step": 24130 + }, + { + "epoch": 6.656094870380585, + "grad_norm": 0.00023006167612038553, + "learning_rate": 1.5184997730863188e-05, + "loss": 0.0, + "num_input_tokens_seen": 14706808, + "step": 24135 + }, + { + "epoch": 6.657473800330943, + "grad_norm": 3.720688255270943e-05, + "learning_rate": 1.5173931607406522e-05, + "loss": 0.0, + "num_input_tokens_seen": 14710680, + "step": 24140 + }, + { + "epoch": 6.658852730281302, + "grad_norm": 0.0007959556533023715, + "learning_rate": 1.5162867760508842e-05, + "loss": 0.0, + "num_input_tokens_seen": 14714008, + "step": 24145 + }, + { + "epoch": 6.66023166023166, + "grad_norm": 1.4888669284118805e-06, + "learning_rate": 1.5151806192733508e-05, + "loss": 0.0, + "num_input_tokens_seen": 14716664, + "step": 24150 + }, + { + "epoch": 6.661610590182018, + "grad_norm": 3.1658692023484036e-05, + "learning_rate": 1.5140746906643302e-05, + "loss": 0.0, + "num_input_tokens_seen": 14720088, + "step": 24155 + }, + { + "epoch": 6.662989520132378, + "grad_norm": 7.323582849494414e-06, + "learning_rate": 1.512968990480052e-05, + "loss": 0.0, + "num_input_tokens_seen": 14724632, + "step": 24160 + }, + { + "epoch": 6.664368450082736, + "grad_norm": 2.427695562801091e-06, + "learning_rate": 1.511863518976691e-05, + "loss": 0.0, + "num_input_tokens_seen": 14727896, + "step": 24165 + }, + { + "epoch": 6.665747380033094, + "grad_norm": 7.123239629436284e-05, + "learning_rate": 1.5107582764103675e-05, + "loss": 0.0, + "num_input_tokens_seen": 14731480, + "step": 24170 + }, + { + "epoch": 6.667126309983453, + "grad_norm": 0.005688847042620182, + "learning_rate": 1.5096532630371524e-05, + "loss": 0.0, + "num_input_tokens_seen": 14734904, + "step": 24175 + }, + { + "epoch": 6.668505239933811, + "grad_norm": 2.5675948563730344e-05, + "learning_rate": 1.5085484791130605e-05, + "loss": 0.0, + "num_input_tokens_seen": 14737624, + "step": 24180 + }, + { + "epoch": 6.6698841698841695, + "grad_norm": 0.0017758748726919293, + "learning_rate": 1.5074439248940531e-05, + "loss": 0.0, + "num_input_tokens_seen": 14740888, + "step": 24185 + }, + { + "epoch": 6.671263099834528, + "grad_norm": 0.00015649078704882413, + "learning_rate": 1.5063396006360414e-05, + "loss": 0.0, + "num_input_tokens_seen": 14743544, + "step": 24190 + }, + { + "epoch": 6.672642029784887, + "grad_norm": 0.0005621150485239923, + "learning_rate": 1.5052355065948803e-05, + "loss": 0.0, + "num_input_tokens_seen": 14745848, + "step": 24195 + }, + { + "epoch": 6.674020959735246, + "grad_norm": 4.810499740415253e-05, + "learning_rate": 1.5041316430263735e-05, + "loss": 0.0, + "num_input_tokens_seen": 14748664, + "step": 24200 + }, + { + "epoch": 6.675399889685604, + "grad_norm": 7.829903552192263e-06, + "learning_rate": 1.5030280101862704e-05, + "loss": 0.0, + "num_input_tokens_seen": 14751384, + "step": 24205 + }, + { + "epoch": 6.676778819635962, + "grad_norm": 5.4960401030257344e-06, + "learning_rate": 1.5019246083302654e-05, + "loss": 0.0, + "num_input_tokens_seen": 14755384, + "step": 24210 + }, + { + "epoch": 6.678157749586321, + "grad_norm": 1.7141694570455002e-06, + "learning_rate": 1.5008214377140029e-05, + "loss": 0.0, + "num_input_tokens_seen": 14758552, + "step": 24215 + }, + { + "epoch": 6.67953667953668, + "grad_norm": 6.041562755854102e-06, + "learning_rate": 1.4997184985930701e-05, + "loss": 0.0, + "num_input_tokens_seen": 14763256, + "step": 24220 + }, + { + "epoch": 6.680915609487038, + "grad_norm": 0.00013471124111674726, + "learning_rate": 1.4986157912230025e-05, + "loss": 0.0, + "num_input_tokens_seen": 14765560, + "step": 24225 + }, + { + "epoch": 6.682294539437397, + "grad_norm": 0.0001437178871128708, + "learning_rate": 1.4975133158592818e-05, + "loss": 0.0, + "num_input_tokens_seen": 14769816, + "step": 24230 + }, + { + "epoch": 6.683673469387755, + "grad_norm": 4.029713636555243e-06, + "learning_rate": 1.4964110727573367e-05, + "loss": 0.0, + "num_input_tokens_seen": 14772408, + "step": 24235 + }, + { + "epoch": 6.6850523993381135, + "grad_norm": 2.415077460682369e-06, + "learning_rate": 1.4953090621725385e-05, + "loss": 0.0, + "num_input_tokens_seen": 14775064, + "step": 24240 + }, + { + "epoch": 6.686431329288472, + "grad_norm": 0.0003640828072093427, + "learning_rate": 1.49420728436021e-05, + "loss": 0.0, + "num_input_tokens_seen": 14777752, + "step": 24245 + }, + { + "epoch": 6.68781025923883, + "grad_norm": 0.00039317330811172724, + "learning_rate": 1.4931057395756154e-05, + "loss": 0.0, + "num_input_tokens_seen": 14780728, + "step": 24250 + }, + { + "epoch": 6.6891891891891895, + "grad_norm": 3.236191696487367e-05, + "learning_rate": 1.4920044280739682e-05, + "loss": 0.0, + "num_input_tokens_seen": 14784152, + "step": 24255 + }, + { + "epoch": 6.690568119139548, + "grad_norm": 2.058943937299773e-05, + "learning_rate": 1.4909033501104256e-05, + "loss": 0.0, + "num_input_tokens_seen": 14788504, + "step": 24260 + }, + { + "epoch": 6.691947049089906, + "grad_norm": 9.130365469900426e-06, + "learning_rate": 1.4898025059400912e-05, + "loss": 0.0, + "num_input_tokens_seen": 14791832, + "step": 24265 + }, + { + "epoch": 6.693325979040265, + "grad_norm": 3.0924064049031585e-05, + "learning_rate": 1.4887018958180163e-05, + "loss": 0.0, + "num_input_tokens_seen": 14796728, + "step": 24270 + }, + { + "epoch": 6.694704908990623, + "grad_norm": 2.4593184207333252e-05, + "learning_rate": 1.4876015199991955e-05, + "loss": 0.0, + "num_input_tokens_seen": 14799384, + "step": 24275 + }, + { + "epoch": 6.696083838940982, + "grad_norm": 0.0035194328520447016, + "learning_rate": 1.4865013787385693e-05, + "loss": 0.0, + "num_input_tokens_seen": 14801560, + "step": 24280 + }, + { + "epoch": 6.697462768891341, + "grad_norm": 1.222492983288248e-06, + "learning_rate": 1.4854014722910264e-05, + "loss": 0.0, + "num_input_tokens_seen": 14804408, + "step": 24285 + }, + { + "epoch": 6.698841698841699, + "grad_norm": 1.3404323908616789e-06, + "learning_rate": 1.4843018009113974e-05, + "loss": 0.0, + "num_input_tokens_seen": 14807896, + "step": 24290 + }, + { + "epoch": 6.700220628792057, + "grad_norm": 0.00025488369283266366, + "learning_rate": 1.4832023648544624e-05, + "loss": 0.0, + "num_input_tokens_seen": 14810872, + "step": 24295 + }, + { + "epoch": 6.701599558742416, + "grad_norm": 5.05731850353186e-06, + "learning_rate": 1.4821031643749433e-05, + "loss": 0.0, + "num_input_tokens_seen": 14815000, + "step": 24300 + }, + { + "epoch": 6.702978488692774, + "grad_norm": 0.0014568448532372713, + "learning_rate": 1.4810041997275092e-05, + "loss": 0.0, + "num_input_tokens_seen": 14817496, + "step": 24305 + }, + { + "epoch": 6.7043574186431325, + "grad_norm": 1.6669636124788667e-06, + "learning_rate": 1.479905471166776e-05, + "loss": 0.0, + "num_input_tokens_seen": 14820088, + "step": 24310 + }, + { + "epoch": 6.705736348593492, + "grad_norm": 2.7193527785129845e-06, + "learning_rate": 1.4788069789473022e-05, + "loss": 0.0, + "num_input_tokens_seen": 14823224, + "step": 24315 + }, + { + "epoch": 6.70711527854385, + "grad_norm": 6.524379386974033e-06, + "learning_rate": 1.477708723323592e-05, + "loss": 0.0, + "num_input_tokens_seen": 14826552, + "step": 24320 + }, + { + "epoch": 6.7084942084942085, + "grad_norm": 4.933555374009302e-06, + "learning_rate": 1.4766107045500968e-05, + "loss": 0.0, + "num_input_tokens_seen": 14829336, + "step": 24325 + }, + { + "epoch": 6.709873138444567, + "grad_norm": 3.1845902412896976e-05, + "learning_rate": 1.4755129228812108e-05, + "loss": 0.0, + "num_input_tokens_seen": 14832280, + "step": 24330 + }, + { + "epoch": 6.711252068394925, + "grad_norm": 1.68291933277942e-06, + "learning_rate": 1.4744153785712744e-05, + "loss": 0.0, + "num_input_tokens_seen": 14835192, + "step": 24335 + }, + { + "epoch": 6.7126309983452845, + "grad_norm": 0.0004343698383308947, + "learning_rate": 1.473318071874574e-05, + "loss": 0.0, + "num_input_tokens_seen": 14838936, + "step": 24340 + }, + { + "epoch": 6.714009928295643, + "grad_norm": 1.4068413292989135e-05, + "learning_rate": 1.4722210030453376e-05, + "loss": 0.0, + "num_input_tokens_seen": 14841912, + "step": 24345 + }, + { + "epoch": 6.715388858246001, + "grad_norm": 1.2060666449542623e-05, + "learning_rate": 1.471124172337743e-05, + "loss": 0.0, + "num_input_tokens_seen": 14844664, + "step": 24350 + }, + { + "epoch": 6.71676778819636, + "grad_norm": 2.608981230878271e-05, + "learning_rate": 1.4700275800059087e-05, + "loss": 0.0, + "num_input_tokens_seen": 14846904, + "step": 24355 + }, + { + "epoch": 6.718146718146718, + "grad_norm": 4.80871312902309e-05, + "learning_rate": 1.4689312263038985e-05, + "loss": 0.0, + "num_input_tokens_seen": 14849528, + "step": 24360 + }, + { + "epoch": 6.719525648097076, + "grad_norm": 3.7731540487584425e-06, + "learning_rate": 1.467835111485723e-05, + "loss": 0.0, + "num_input_tokens_seen": 14852792, + "step": 24365 + }, + { + "epoch": 6.720904578047435, + "grad_norm": 6.442193262046203e-05, + "learning_rate": 1.4667392358053367e-05, + "loss": 0.0, + "num_input_tokens_seen": 14857112, + "step": 24370 + }, + { + "epoch": 6.722283507997794, + "grad_norm": 1.925771584865288e-06, + "learning_rate": 1.4656435995166368e-05, + "loss": 0.0, + "num_input_tokens_seen": 14859704, + "step": 24375 + }, + { + "epoch": 6.723662437948152, + "grad_norm": 3.0153705665725283e-06, + "learning_rate": 1.4645482028734681e-05, + "loss": 0.0, + "num_input_tokens_seen": 14862424, + "step": 24380 + }, + { + "epoch": 6.725041367898511, + "grad_norm": 1.3481389942171518e-06, + "learning_rate": 1.4634530461296165e-05, + "loss": 0.0, + "num_input_tokens_seen": 14865624, + "step": 24385 + }, + { + "epoch": 6.726420297848869, + "grad_norm": 1.0630933502397966e-05, + "learning_rate": 1.4623581295388161e-05, + "loss": 0.0, + "num_input_tokens_seen": 14868056, + "step": 24390 + }, + { + "epoch": 6.727799227799228, + "grad_norm": 0.00012665466056205332, + "learning_rate": 1.4612634533547423e-05, + "loss": 0.0, + "num_input_tokens_seen": 14871128, + "step": 24395 + }, + { + "epoch": 6.729178157749586, + "grad_norm": 3.419207132537849e-05, + "learning_rate": 1.4601690178310151e-05, + "loss": 0.0, + "num_input_tokens_seen": 14874936, + "step": 24400 + }, + { + "epoch": 6.730557087699945, + "grad_norm": 2.6184020498476457e-06, + "learning_rate": 1.459074823221201e-05, + "loss": 0.0, + "num_input_tokens_seen": 14878872, + "step": 24405 + }, + { + "epoch": 6.731936017650304, + "grad_norm": 1.1410414799684077e-06, + "learning_rate": 1.4579808697788086e-05, + "loss": 0.0, + "num_input_tokens_seen": 14884088, + "step": 24410 + }, + { + "epoch": 6.733314947600662, + "grad_norm": 3.923907206626609e-05, + "learning_rate": 1.4568871577572901e-05, + "loss": 0.0, + "num_input_tokens_seen": 14886296, + "step": 24415 + }, + { + "epoch": 6.73469387755102, + "grad_norm": 3.652896566563868e-06, + "learning_rate": 1.455793687410044e-05, + "loss": 0.0, + "num_input_tokens_seen": 14889464, + "step": 24420 + }, + { + "epoch": 6.736072807501379, + "grad_norm": 0.00010175893839914352, + "learning_rate": 1.4547004589904123e-05, + "loss": 0.0, + "num_input_tokens_seen": 14892184, + "step": 24425 + }, + { + "epoch": 6.737451737451737, + "grad_norm": 3.833166374533903e-06, + "learning_rate": 1.4536074727516785e-05, + "loss": 0.0, + "num_input_tokens_seen": 14895096, + "step": 24430 + }, + { + "epoch": 6.738830667402096, + "grad_norm": 3.1620711524738e-05, + "learning_rate": 1.4525147289470741e-05, + "loss": 0.0, + "num_input_tokens_seen": 14898456, + "step": 24435 + }, + { + "epoch": 6.740209597352455, + "grad_norm": 8.717565833649132e-06, + "learning_rate": 1.45142222782977e-05, + "loss": 0.0, + "num_input_tokens_seen": 14902040, + "step": 24440 + }, + { + "epoch": 6.741588527302813, + "grad_norm": 3.33748321281746e-05, + "learning_rate": 1.450329969652885e-05, + "loss": 0.0, + "num_input_tokens_seen": 14905464, + "step": 24445 + }, + { + "epoch": 6.7429674572531715, + "grad_norm": 3.4656328352866694e-05, + "learning_rate": 1.4492379546694784e-05, + "loss": 0.0, + "num_input_tokens_seen": 14907928, + "step": 24450 + }, + { + "epoch": 6.74434638720353, + "grad_norm": 0.0007044892990961671, + "learning_rate": 1.4481461831325538e-05, + "loss": 0.0, + "num_input_tokens_seen": 14910744, + "step": 24455 + }, + { + "epoch": 6.745725317153888, + "grad_norm": 5.471194072015351e-06, + "learning_rate": 1.4470546552950608e-05, + "loss": 0.0, + "num_input_tokens_seen": 14913528, + "step": 24460 + }, + { + "epoch": 6.747104247104247, + "grad_norm": 4.9293912525172345e-06, + "learning_rate": 1.44596337140989e-05, + "loss": 0.0, + "num_input_tokens_seen": 14916568, + "step": 24465 + }, + { + "epoch": 6.748483177054606, + "grad_norm": 2.864689349735272e-06, + "learning_rate": 1.4448723317298746e-05, + "loss": 0.0, + "num_input_tokens_seen": 14919000, + "step": 24470 + }, + { + "epoch": 6.749862107004964, + "grad_norm": 3.1687675800640136e-05, + "learning_rate": 1.4437815365077956e-05, + "loss": 0.0, + "num_input_tokens_seen": 14921240, + "step": 24475 + }, + { + "epoch": 6.751241036955323, + "grad_norm": 1.1480231478344649e-05, + "learning_rate": 1.4426909859963717e-05, + "loss": 0.0, + "num_input_tokens_seen": 14923512, + "step": 24480 + }, + { + "epoch": 6.752619966905681, + "grad_norm": 1.047689806910057e-06, + "learning_rate": 1.4416006804482705e-05, + "loss": 0.0, + "num_input_tokens_seen": 14926520, + "step": 24485 + }, + { + "epoch": 6.753998896856039, + "grad_norm": 0.00020405145187396556, + "learning_rate": 1.4405106201160979e-05, + "loss": 0.0, + "num_input_tokens_seen": 14929048, + "step": 24490 + }, + { + "epoch": 6.755377826806399, + "grad_norm": 0.0003227576380595565, + "learning_rate": 1.4394208052524061e-05, + "loss": 0.0, + "num_input_tokens_seen": 14932120, + "step": 24495 + }, + { + "epoch": 6.756756756756757, + "grad_norm": 2.125744458680856e-06, + "learning_rate": 1.438331236109691e-05, + "loss": 0.0, + "num_input_tokens_seen": 14935864, + "step": 24500 + }, + { + "epoch": 6.758135686707115, + "grad_norm": 2.1932871732133208e-06, + "learning_rate": 1.4372419129403885e-05, + "loss": 0.0, + "num_input_tokens_seen": 14938776, + "step": 24505 + }, + { + "epoch": 6.759514616657474, + "grad_norm": 1.2235998383403057e-06, + "learning_rate": 1.436152835996879e-05, + "loss": 0.0, + "num_input_tokens_seen": 14941656, + "step": 24510 + }, + { + "epoch": 6.760893546607832, + "grad_norm": 8.884258932084776e-06, + "learning_rate": 1.4350640055314874e-05, + "loss": 0.0, + "num_input_tokens_seen": 14944184, + "step": 24515 + }, + { + "epoch": 6.7622724765581905, + "grad_norm": 3.7400943710963475e-06, + "learning_rate": 1.433975421796479e-05, + "loss": 0.0, + "num_input_tokens_seen": 14946968, + "step": 24520 + }, + { + "epoch": 6.763651406508549, + "grad_norm": 1.525238167232601e-05, + "learning_rate": 1.432887085044062e-05, + "loss": 0.0, + "num_input_tokens_seen": 14949784, + "step": 24525 + }, + { + "epoch": 6.765030336458908, + "grad_norm": 1.1300312507955823e-06, + "learning_rate": 1.4317989955263911e-05, + "loss": 0.0, + "num_input_tokens_seen": 14953368, + "step": 24530 + }, + { + "epoch": 6.7664092664092665, + "grad_norm": 1.1327757647450198e-06, + "learning_rate": 1.4307111534955581e-05, + "loss": 0.0, + "num_input_tokens_seen": 14955832, + "step": 24535 + }, + { + "epoch": 6.767788196359625, + "grad_norm": 0.00016681117995176464, + "learning_rate": 1.4296235592036028e-05, + "loss": 0.0, + "num_input_tokens_seen": 14959256, + "step": 24540 + }, + { + "epoch": 6.769167126309983, + "grad_norm": 3.6028829981660238e-06, + "learning_rate": 1.4285362129025042e-05, + "loss": 0.0, + "num_input_tokens_seen": 14962008, + "step": 24545 + }, + { + "epoch": 6.770546056260342, + "grad_norm": 3.389760968275368e-05, + "learning_rate": 1.4274491148441844e-05, + "loss": 0.0, + "num_input_tokens_seen": 14966072, + "step": 24550 + }, + { + "epoch": 6.771924986210701, + "grad_norm": 3.986062802141532e-05, + "learning_rate": 1.4263622652805093e-05, + "loss": 0.0, + "num_input_tokens_seen": 14970808, + "step": 24555 + }, + { + "epoch": 6.773303916161059, + "grad_norm": 3.1466499876842136e-06, + "learning_rate": 1.425275664463285e-05, + "loss": 0.0, + "num_input_tokens_seen": 14974424, + "step": 24560 + }, + { + "epoch": 6.774682846111418, + "grad_norm": 2.989514723594766e-05, + "learning_rate": 1.4241893126442623e-05, + "loss": 0.0, + "num_input_tokens_seen": 14976536, + "step": 24565 + }, + { + "epoch": 6.776061776061776, + "grad_norm": 2.52076688411762e-06, + "learning_rate": 1.4231032100751341e-05, + "loss": 0.0, + "num_input_tokens_seen": 14979192, + "step": 24570 + }, + { + "epoch": 6.777440706012134, + "grad_norm": 1.6495932868565433e-06, + "learning_rate": 1.422017357007534e-05, + "loss": 0.0, + "num_input_tokens_seen": 14982808, + "step": 24575 + }, + { + "epoch": 6.778819635962493, + "grad_norm": 9.019974640978035e-06, + "learning_rate": 1.4209317536930371e-05, + "loss": 0.0, + "num_input_tokens_seen": 14985656, + "step": 24580 + }, + { + "epoch": 6.780198565912851, + "grad_norm": 4.1639232222223654e-06, + "learning_rate": 1.419846400383165e-05, + "loss": 0.0, + "num_input_tokens_seen": 14988184, + "step": 24585 + }, + { + "epoch": 6.7815774958632105, + "grad_norm": 5.092405444884207e-06, + "learning_rate": 1.4187612973293758e-05, + "loss": 0.0, + "num_input_tokens_seen": 14990776, + "step": 24590 + }, + { + "epoch": 6.782956425813569, + "grad_norm": 1.6571527794440044e-06, + "learning_rate": 1.4176764447830743e-05, + "loss": 0.0, + "num_input_tokens_seen": 14994680, + "step": 24595 + }, + { + "epoch": 6.784335355763927, + "grad_norm": 9.930325859386357e-07, + "learning_rate": 1.4165918429956044e-05, + "loss": 0.0, + "num_input_tokens_seen": 14997784, + "step": 24600 + }, + { + "epoch": 6.785714285714286, + "grad_norm": 1.0250196282868274e-05, + "learning_rate": 1.415507492218252e-05, + "loss": 0.0, + "num_input_tokens_seen": 15000312, + "step": 24605 + }, + { + "epoch": 6.787093215664644, + "grad_norm": 2.352259343751939e-06, + "learning_rate": 1.4144233927022471e-05, + "loss": 0.0, + "num_input_tokens_seen": 15003256, + "step": 24610 + }, + { + "epoch": 6.788472145615003, + "grad_norm": 7.653873581148218e-06, + "learning_rate": 1.4133395446987596e-05, + "loss": 0.0, + "num_input_tokens_seen": 15006136, + "step": 24615 + }, + { + "epoch": 6.789851075565362, + "grad_norm": 2.0558134110615356e-06, + "learning_rate": 1.4122559484588998e-05, + "loss": 0.0, + "num_input_tokens_seen": 15009080, + "step": 24620 + }, + { + "epoch": 6.79123000551572, + "grad_norm": 2.9042032565484988e-06, + "learning_rate": 1.4111726042337231e-05, + "loss": 0.0, + "num_input_tokens_seen": 15012312, + "step": 24625 + }, + { + "epoch": 6.792608935466078, + "grad_norm": 0.006340759340673685, + "learning_rate": 1.4100895122742257e-05, + "loss": 0.0, + "num_input_tokens_seen": 15014648, + "step": 24630 + }, + { + "epoch": 6.793987865416437, + "grad_norm": 5.656861685565673e-05, + "learning_rate": 1.409006672831342e-05, + "loss": 0.0, + "num_input_tokens_seen": 15017976, + "step": 24635 + }, + { + "epoch": 6.795366795366795, + "grad_norm": 1.2171100934210699e-05, + "learning_rate": 1.4079240861559528e-05, + "loss": 0.0, + "num_input_tokens_seen": 15020344, + "step": 24640 + }, + { + "epoch": 6.7967457253171535, + "grad_norm": 0.002570971380919218, + "learning_rate": 1.4068417524988753e-05, + "loss": 0.0, + "num_input_tokens_seen": 15023864, + "step": 24645 + }, + { + "epoch": 6.798124655267513, + "grad_norm": 2.5256051230826415e-05, + "learning_rate": 1.4057596721108735e-05, + "loss": 0.0, + "num_input_tokens_seen": 15026840, + "step": 24650 + }, + { + "epoch": 6.799503585217871, + "grad_norm": 2.214355845353566e-06, + "learning_rate": 1.4046778452426485e-05, + "loss": 0.0, + "num_input_tokens_seen": 15029560, + "step": 24655 + }, + { + "epoch": 6.8008825151682295, + "grad_norm": 1.2885938076578896e-06, + "learning_rate": 1.4035962721448429e-05, + "loss": 0.0, + "num_input_tokens_seen": 15032472, + "step": 24660 + }, + { + "epoch": 6.802261445118588, + "grad_norm": 3.3115768019342795e-05, + "learning_rate": 1.4025149530680437e-05, + "loss": 0.0, + "num_input_tokens_seen": 15035224, + "step": 24665 + }, + { + "epoch": 6.803640375068946, + "grad_norm": 3.1641479836252984e-06, + "learning_rate": 1.4014338882627765e-05, + "loss": 0.0, + "num_input_tokens_seen": 15037624, + "step": 24670 + }, + { + "epoch": 6.805019305019305, + "grad_norm": 4.7909172280924395e-05, + "learning_rate": 1.4003530779795065e-05, + "loss": 0.0, + "num_input_tokens_seen": 15040376, + "step": 24675 + }, + { + "epoch": 6.806398234969664, + "grad_norm": 1.0557208042882849e-05, + "learning_rate": 1.399272522468645e-05, + "loss": 0.0, + "num_input_tokens_seen": 15044248, + "step": 24680 + }, + { + "epoch": 6.807777164920022, + "grad_norm": 9.694332447907072e-07, + "learning_rate": 1.3981922219805388e-05, + "loss": 0.0, + "num_input_tokens_seen": 15047352, + "step": 24685 + }, + { + "epoch": 6.809156094870381, + "grad_norm": 2.40284061874263e-05, + "learning_rate": 1.3971121767654796e-05, + "loss": 0.0, + "num_input_tokens_seen": 15051256, + "step": 24690 + }, + { + "epoch": 6.810535024820739, + "grad_norm": 2.15811869566096e-05, + "learning_rate": 1.3960323870736968e-05, + "loss": 0.0, + "num_input_tokens_seen": 15054040, + "step": 24695 + }, + { + "epoch": 6.811913954771097, + "grad_norm": 3.1504707749263616e-06, + "learning_rate": 1.3949528531553635e-05, + "loss": 0.0, + "num_input_tokens_seen": 15057144, + "step": 24700 + }, + { + "epoch": 6.813292884721456, + "grad_norm": 8.01544047135394e-06, + "learning_rate": 1.3938735752605924e-05, + "loss": 0.0, + "num_input_tokens_seen": 15060152, + "step": 24705 + }, + { + "epoch": 6.814671814671815, + "grad_norm": 5.751552180299768e-06, + "learning_rate": 1.3927945536394363e-05, + "loss": 0.0, + "num_input_tokens_seen": 15063544, + "step": 24710 + }, + { + "epoch": 6.816050744622173, + "grad_norm": 5.030944066675147e-06, + "learning_rate": 1.3917157885418878e-05, + "loss": 0.0, + "num_input_tokens_seen": 15066200, + "step": 24715 + }, + { + "epoch": 6.817429674572532, + "grad_norm": 2.9140865080989897e-05, + "learning_rate": 1.390637280217883e-05, + "loss": 0.0, + "num_input_tokens_seen": 15069336, + "step": 24720 + }, + { + "epoch": 6.81880860452289, + "grad_norm": 1.6073015558504267e-06, + "learning_rate": 1.3895590289172954e-05, + "loss": 0.0, + "num_input_tokens_seen": 15071896, + "step": 24725 + }, + { + "epoch": 6.8201875344732485, + "grad_norm": 0.000626255467068404, + "learning_rate": 1.388481034889942e-05, + "loss": 0.0, + "num_input_tokens_seen": 15074456, + "step": 24730 + }, + { + "epoch": 6.821566464423607, + "grad_norm": 2.3604863599757664e-05, + "learning_rate": 1.3874032983855777e-05, + "loss": 0.0, + "num_input_tokens_seen": 15077176, + "step": 24735 + }, + { + "epoch": 6.822945394373966, + "grad_norm": 2.3332429918809794e-05, + "learning_rate": 1.3863258196538976e-05, + "loss": 0.0, + "num_input_tokens_seen": 15081016, + "step": 24740 + }, + { + "epoch": 6.824324324324325, + "grad_norm": 9.477129424340092e-06, + "learning_rate": 1.3852485989445397e-05, + "loss": 0.0, + "num_input_tokens_seen": 15084056, + "step": 24745 + }, + { + "epoch": 6.825703254274683, + "grad_norm": 7.253217518154997e-06, + "learning_rate": 1.3841716365070801e-05, + "loss": 0.0, + "num_input_tokens_seen": 15087256, + "step": 24750 + }, + { + "epoch": 6.827082184225041, + "grad_norm": 3.937104338547215e-06, + "learning_rate": 1.3830949325910339e-05, + "loss": 0.0, + "num_input_tokens_seen": 15090680, + "step": 24755 + }, + { + "epoch": 6.8284611141754, + "grad_norm": 1.7699859427011688e-06, + "learning_rate": 1.3820184874458603e-05, + "loss": 0.0, + "num_input_tokens_seen": 15094360, + "step": 24760 + }, + { + "epoch": 6.829840044125758, + "grad_norm": 4.754439487442141e-06, + "learning_rate": 1.3809423013209543e-05, + "loss": 0.0, + "num_input_tokens_seen": 15097464, + "step": 24765 + }, + { + "epoch": 6.831218974076117, + "grad_norm": 2.077709041259368e-06, + "learning_rate": 1.3798663744656536e-05, + "loss": 0.0, + "num_input_tokens_seen": 15100536, + "step": 24770 + }, + { + "epoch": 6.832597904026476, + "grad_norm": 4.834143510379363e-06, + "learning_rate": 1.3787907071292364e-05, + "loss": 0.0, + "num_input_tokens_seen": 15103832, + "step": 24775 + }, + { + "epoch": 6.833976833976834, + "grad_norm": 1.3397013844951289e-06, + "learning_rate": 1.3777152995609166e-05, + "loss": 0.0, + "num_input_tokens_seen": 15108024, + "step": 24780 + }, + { + "epoch": 6.8353557639271925, + "grad_norm": 7.037220711936243e-06, + "learning_rate": 1.3766401520098535e-05, + "loss": 0.0, + "num_input_tokens_seen": 15110584, + "step": 24785 + }, + { + "epoch": 6.836734693877551, + "grad_norm": 0.0005854811170138419, + "learning_rate": 1.3755652647251416e-05, + "loss": 0.0, + "num_input_tokens_seen": 15113432, + "step": 24790 + }, + { + "epoch": 6.838113623827909, + "grad_norm": 3.8533344195457175e-05, + "learning_rate": 1.3744906379558165e-05, + "loss": 0.0, + "num_input_tokens_seen": 15116440, + "step": 24795 + }, + { + "epoch": 6.839492553778268, + "grad_norm": 1.2740802048938349e-06, + "learning_rate": 1.3734162719508558e-05, + "loss": 0.0, + "num_input_tokens_seen": 15119448, + "step": 24800 + }, + { + "epoch": 6.840871483728627, + "grad_norm": 0.0001924027019413188, + "learning_rate": 1.3723421669591729e-05, + "loss": 0.0, + "num_input_tokens_seen": 15121880, + "step": 24805 + }, + { + "epoch": 6.842250413678985, + "grad_norm": 7.616182301717345e-06, + "learning_rate": 1.3712683232296225e-05, + "loss": 0.0, + "num_input_tokens_seen": 15124856, + "step": 24810 + }, + { + "epoch": 6.843629343629344, + "grad_norm": 3.8093435250630137e-06, + "learning_rate": 1.370194741011e-05, + "loss": 0.0, + "num_input_tokens_seen": 15128504, + "step": 24815 + }, + { + "epoch": 6.845008273579702, + "grad_norm": 0.00011589325731620193, + "learning_rate": 1.3691214205520375e-05, + "loss": 0.0, + "num_input_tokens_seen": 15130744, + "step": 24820 + }, + { + "epoch": 6.84638720353006, + "grad_norm": 5.857935320818797e-06, + "learning_rate": 1.3680483621014095e-05, + "loss": 0.0, + "num_input_tokens_seen": 15133016, + "step": 24825 + }, + { + "epoch": 6.84776613348042, + "grad_norm": 3.536732037900947e-06, + "learning_rate": 1.3669755659077269e-05, + "loss": 0.0, + "num_input_tokens_seen": 15136728, + "step": 24830 + }, + { + "epoch": 6.849145063430778, + "grad_norm": 9.522664186079055e-07, + "learning_rate": 1.3659030322195427e-05, + "loss": 0.0, + "num_input_tokens_seen": 15139800, + "step": 24835 + }, + { + "epoch": 6.850523993381136, + "grad_norm": 4.233641448081471e-06, + "learning_rate": 1.3648307612853454e-05, + "loss": 0.0, + "num_input_tokens_seen": 15142040, + "step": 24840 + }, + { + "epoch": 6.851902923331495, + "grad_norm": 2.2489746243081754e-06, + "learning_rate": 1.363758753353567e-05, + "loss": 0.0, + "num_input_tokens_seen": 15144664, + "step": 24845 + }, + { + "epoch": 6.853281853281853, + "grad_norm": 3.673560422612354e-05, + "learning_rate": 1.3626870086725746e-05, + "loss": 0.0, + "num_input_tokens_seen": 15148120, + "step": 24850 + }, + { + "epoch": 6.8546607832322115, + "grad_norm": 1.599194911250379e-05, + "learning_rate": 1.3616155274906775e-05, + "loss": 0.0, + "num_input_tokens_seen": 15150968, + "step": 24855 + }, + { + "epoch": 6.85603971318257, + "grad_norm": 0.0004868154355790466, + "learning_rate": 1.3605443100561221e-05, + "loss": 0.0, + "num_input_tokens_seen": 15153016, + "step": 24860 + }, + { + "epoch": 6.857418643132929, + "grad_norm": 2.534849500079872e-06, + "learning_rate": 1.3594733566170926e-05, + "loss": 0.0, + "num_input_tokens_seen": 15155512, + "step": 24865 + }, + { + "epoch": 6.8587975730832875, + "grad_norm": 0.0001237895921804011, + "learning_rate": 1.3584026674217159e-05, + "loss": 0.0, + "num_input_tokens_seen": 15157816, + "step": 24870 + }, + { + "epoch": 6.860176503033646, + "grad_norm": 2.0703710106317885e-06, + "learning_rate": 1.3573322427180533e-05, + "loss": 0.0, + "num_input_tokens_seen": 15161176, + "step": 24875 + }, + { + "epoch": 6.861555432984004, + "grad_norm": 1.0744852261268534e-06, + "learning_rate": 1.3562620827541084e-05, + "loss": 0.0, + "num_input_tokens_seen": 15164088, + "step": 24880 + }, + { + "epoch": 6.862934362934363, + "grad_norm": 1.7541685792821227e-06, + "learning_rate": 1.3551921877778218e-05, + "loss": 0.0, + "num_input_tokens_seen": 15166264, + "step": 24885 + }, + { + "epoch": 6.864313292884722, + "grad_norm": 3.317593836982269e-06, + "learning_rate": 1.3541225580370709e-05, + "loss": 0.0, + "num_input_tokens_seen": 15170104, + "step": 24890 + }, + { + "epoch": 6.86569222283508, + "grad_norm": 9.480974085818161e-07, + "learning_rate": 1.353053193779676e-05, + "loss": 0.0, + "num_input_tokens_seen": 15172728, + "step": 24895 + }, + { + "epoch": 6.867071152785439, + "grad_norm": 2.9768339118163567e-06, + "learning_rate": 1.3519840952533908e-05, + "loss": 0.0, + "num_input_tokens_seen": 15175704, + "step": 24900 + }, + { + "epoch": 6.868450082735797, + "grad_norm": 5.766143658547662e-06, + "learning_rate": 1.3509152627059133e-05, + "loss": 0.0, + "num_input_tokens_seen": 15178488, + "step": 24905 + }, + { + "epoch": 6.869829012686155, + "grad_norm": 6.984847004787298e-06, + "learning_rate": 1.3498466963848738e-05, + "loss": 0.0, + "num_input_tokens_seen": 15181080, + "step": 24910 + }, + { + "epoch": 6.871207942636514, + "grad_norm": 2.4459104679408483e-05, + "learning_rate": 1.3487783965378448e-05, + "loss": 0.0, + "num_input_tokens_seen": 15184376, + "step": 24915 + }, + { + "epoch": 6.872586872586872, + "grad_norm": 2.0288849555072375e-05, + "learning_rate": 1.3477103634123372e-05, + "loss": 0.0, + "num_input_tokens_seen": 15186936, + "step": 24920 + }, + { + "epoch": 6.873965802537231, + "grad_norm": 1.275419663215871e-06, + "learning_rate": 1.346642597255798e-05, + "loss": 0.0, + "num_input_tokens_seen": 15190360, + "step": 24925 + }, + { + "epoch": 6.87534473248759, + "grad_norm": 2.119511691489606e-06, + "learning_rate": 1.3455750983156123e-05, + "loss": 0.0, + "num_input_tokens_seen": 15192856, + "step": 24930 + }, + { + "epoch": 6.876723662437948, + "grad_norm": 1.3917881460656645e-06, + "learning_rate": 1.3445078668391065e-05, + "loss": 0.0, + "num_input_tokens_seen": 15195608, + "step": 24935 + }, + { + "epoch": 6.878102592388307, + "grad_norm": 8.289997595056775e-07, + "learning_rate": 1.3434409030735411e-05, + "loss": 0.0, + "num_input_tokens_seen": 15198104, + "step": 24940 + }, + { + "epoch": 6.879481522338665, + "grad_norm": 2.5731142159202136e-06, + "learning_rate": 1.3423742072661158e-05, + "loss": 0.0, + "num_input_tokens_seen": 15200984, + "step": 24945 + }, + { + "epoch": 6.880860452289024, + "grad_norm": 4.416353476699442e-06, + "learning_rate": 1.3413077796639705e-05, + "loss": 0.0, + "num_input_tokens_seen": 15204216, + "step": 24950 + }, + { + "epoch": 6.882239382239383, + "grad_norm": 9.282558721679379e-07, + "learning_rate": 1.3402416205141808e-05, + "loss": 0.0, + "num_input_tokens_seen": 15206712, + "step": 24955 + }, + { + "epoch": 6.883618312189741, + "grad_norm": 1.3612672091767308e-06, + "learning_rate": 1.3391757300637587e-05, + "loss": 0.0, + "num_input_tokens_seen": 15210424, + "step": 24960 + }, + { + "epoch": 6.884997242140099, + "grad_norm": 1.7247547248189221e-06, + "learning_rate": 1.3381101085596578e-05, + "loss": 0.0, + "num_input_tokens_seen": 15213016, + "step": 24965 + }, + { + "epoch": 6.886376172090458, + "grad_norm": 5.456387953017838e-05, + "learning_rate": 1.3370447562487657e-05, + "loss": 0.0, + "num_input_tokens_seen": 15217208, + "step": 24970 + }, + { + "epoch": 6.887755102040816, + "grad_norm": 1.4433812793868128e-06, + "learning_rate": 1.33597967337791e-05, + "loss": 0.0, + "num_input_tokens_seen": 15220504, + "step": 24975 + }, + { + "epoch": 6.8891340319911745, + "grad_norm": 2.8218692023074254e-06, + "learning_rate": 1.334914860193856e-05, + "loss": 0.0, + "num_input_tokens_seen": 15223352, + "step": 24980 + }, + { + "epoch": 6.890512961941534, + "grad_norm": 1.0324677077733213e-06, + "learning_rate": 1.333850316943304e-05, + "loss": 0.0, + "num_input_tokens_seen": 15226552, + "step": 24985 + }, + { + "epoch": 6.891891891891892, + "grad_norm": 2.051032606686931e-05, + "learning_rate": 1.332786043872895e-05, + "loss": 0.0, + "num_input_tokens_seen": 15229080, + "step": 24990 + }, + { + "epoch": 6.8932708218422505, + "grad_norm": 1.8477855974197155e-06, + "learning_rate": 1.3317220412292047e-05, + "loss": 0.0, + "num_input_tokens_seen": 15231992, + "step": 24995 + }, + { + "epoch": 6.894649751792609, + "grad_norm": 1.1667954140648362e-06, + "learning_rate": 1.3306583092587471e-05, + "loss": 0.0, + "num_input_tokens_seen": 15235064, + "step": 25000 + }, + { + "epoch": 6.896028681742967, + "grad_norm": 5.8801178965950385e-06, + "learning_rate": 1.3295948482079746e-05, + "loss": 0.0, + "num_input_tokens_seen": 15239000, + "step": 25005 + }, + { + "epoch": 6.897407611693326, + "grad_norm": 2.366809212617227e-06, + "learning_rate": 1.3285316583232754e-05, + "loss": 0.0, + "num_input_tokens_seen": 15241400, + "step": 25010 + }, + { + "epoch": 6.898786541643685, + "grad_norm": 0.00012383803550619632, + "learning_rate": 1.3274687398509745e-05, + "loss": 0.0, + "num_input_tokens_seen": 15243768, + "step": 25015 + }, + { + "epoch": 6.900165471594043, + "grad_norm": 0.00021490879589691758, + "learning_rate": 1.3264060930373359e-05, + "loss": 0.0, + "num_input_tokens_seen": 15246648, + "step": 25020 + }, + { + "epoch": 6.901544401544402, + "grad_norm": 9.672006626715302e-07, + "learning_rate": 1.3253437181285588e-05, + "loss": 0.0, + "num_input_tokens_seen": 15249560, + "step": 25025 + }, + { + "epoch": 6.90292333149476, + "grad_norm": 1.2266042176634073e-05, + "learning_rate": 1.3242816153707815e-05, + "loss": 0.0, + "num_input_tokens_seen": 15252760, + "step": 25030 + }, + { + "epoch": 6.904302261445118, + "grad_norm": 8.473480193060823e-06, + "learning_rate": 1.3232197850100772e-05, + "loss": 0.0, + "num_input_tokens_seen": 15257112, + "step": 25035 + }, + { + "epoch": 6.905681191395477, + "grad_norm": 4.6543937060050666e-05, + "learning_rate": 1.3221582272924557e-05, + "loss": 0.0, + "num_input_tokens_seen": 15259640, + "step": 25040 + }, + { + "epoch": 6.907060121345836, + "grad_norm": 1.5225009519781452e-05, + "learning_rate": 1.3210969424638658e-05, + "loss": 0.0, + "num_input_tokens_seen": 15262584, + "step": 25045 + }, + { + "epoch": 6.908439051296194, + "grad_norm": 1.0895857940340647e-06, + "learning_rate": 1.3200359307701926e-05, + "loss": 0.0, + "num_input_tokens_seen": 15267800, + "step": 25050 + }, + { + "epoch": 6.909817981246553, + "grad_norm": 0.00011223066394450143, + "learning_rate": 1.3189751924572557e-05, + "loss": 0.0, + "num_input_tokens_seen": 15270776, + "step": 25055 + }, + { + "epoch": 6.911196911196911, + "grad_norm": 1.061505372490501e-06, + "learning_rate": 1.3179147277708152e-05, + "loss": 0.1, + "num_input_tokens_seen": 15273464, + "step": 25060 + }, + { + "epoch": 6.9125758411472695, + "grad_norm": 1.0700247230488458e-06, + "learning_rate": 1.3168545369565627e-05, + "loss": 0.0, + "num_input_tokens_seen": 15276184, + "step": 25065 + }, + { + "epoch": 6.913954771097628, + "grad_norm": 2.3892532681202283e-06, + "learning_rate": 1.3157946202601315e-05, + "loss": 0.0, + "num_input_tokens_seen": 15278872, + "step": 25070 + }, + { + "epoch": 6.915333701047986, + "grad_norm": 5.332910404831637e-06, + "learning_rate": 1.3147349779270885e-05, + "loss": 0.0, + "num_input_tokens_seen": 15282360, + "step": 25075 + }, + { + "epoch": 6.9167126309983455, + "grad_norm": 4.775056822836632e-06, + "learning_rate": 1.313675610202936e-05, + "loss": 0.0, + "num_input_tokens_seen": 15285304, + "step": 25080 + }, + { + "epoch": 6.918091560948704, + "grad_norm": 0.03991034999489784, + "learning_rate": 1.3126165173331168e-05, + "loss": 0.0, + "num_input_tokens_seen": 15288120, + "step": 25085 + }, + { + "epoch": 6.919470490899062, + "grad_norm": 2.2505732886202168e-06, + "learning_rate": 1.3115576995630063e-05, + "loss": 0.0, + "num_input_tokens_seen": 15291256, + "step": 25090 + }, + { + "epoch": 6.920849420849421, + "grad_norm": 5.249073183222208e-06, + "learning_rate": 1.3104991571379166e-05, + "loss": 0.0, + "num_input_tokens_seen": 15294072, + "step": 25095 + }, + { + "epoch": 6.922228350799779, + "grad_norm": 1.990455075429054e-06, + "learning_rate": 1.3094408903030983e-05, + "loss": 0.0, + "num_input_tokens_seen": 15298488, + "step": 25100 + }, + { + "epoch": 6.923607280750138, + "grad_norm": 1.310743664362235e-05, + "learning_rate": 1.308382899303735e-05, + "loss": 0.0, + "num_input_tokens_seen": 15301208, + "step": 25105 + }, + { + "epoch": 6.924986210700497, + "grad_norm": 1.8775465377984801e-06, + "learning_rate": 1.3073251843849501e-05, + "loss": 0.0, + "num_input_tokens_seen": 15304440, + "step": 25110 + }, + { + "epoch": 6.926365140650855, + "grad_norm": 5.946151941316202e-05, + "learning_rate": 1.3062677457917988e-05, + "loss": 0.0, + "num_input_tokens_seen": 15306904, + "step": 25115 + }, + { + "epoch": 6.9277440706012134, + "grad_norm": 0.005455652717500925, + "learning_rate": 1.3052105837692754e-05, + "loss": 0.0, + "num_input_tokens_seen": 15310712, + "step": 25120 + }, + { + "epoch": 6.929123000551572, + "grad_norm": 1.552518256175972e-06, + "learning_rate": 1.3041536985623105e-05, + "loss": 0.0, + "num_input_tokens_seen": 15314488, + "step": 25125 + }, + { + "epoch": 6.93050193050193, + "grad_norm": 1.2965997484570835e-06, + "learning_rate": 1.3030970904157675e-05, + "loss": 0.0, + "num_input_tokens_seen": 15318008, + "step": 25130 + }, + { + "epoch": 6.931880860452289, + "grad_norm": 1.4930681118130451e-06, + "learning_rate": 1.3020407595744471e-05, + "loss": 0.0, + "num_input_tokens_seen": 15320984, + "step": 25135 + }, + { + "epoch": 6.933259790402648, + "grad_norm": 9.596768677511136e-07, + "learning_rate": 1.3009847062830877e-05, + "loss": 0.0003, + "num_input_tokens_seen": 15323160, + "step": 25140 + }, + { + "epoch": 6.934638720353006, + "grad_norm": 0.00011441045353421941, + "learning_rate": 1.2999289307863604e-05, + "loss": 0.0, + "num_input_tokens_seen": 15326552, + "step": 25145 + }, + { + "epoch": 6.936017650303365, + "grad_norm": 3.3775700103433337e-06, + "learning_rate": 1.2988734333288726e-05, + "loss": 0.0, + "num_input_tokens_seen": 15329304, + "step": 25150 + }, + { + "epoch": 6.937396580253723, + "grad_norm": 4.706493200501427e-06, + "learning_rate": 1.2978182141551697e-05, + "loss": 0.0, + "num_input_tokens_seen": 15332024, + "step": 25155 + }, + { + "epoch": 6.938775510204081, + "grad_norm": 5.027960469305981e-06, + "learning_rate": 1.2967632735097296e-05, + "loss": 0.0, + "num_input_tokens_seen": 15335576, + "step": 25160 + }, + { + "epoch": 6.940154440154441, + "grad_norm": 1.1056820767407771e-05, + "learning_rate": 1.2957086116369677e-05, + "loss": 0.0, + "num_input_tokens_seen": 15338712, + "step": 25165 + }, + { + "epoch": 6.941533370104799, + "grad_norm": 4.0026006900006905e-05, + "learning_rate": 1.2946542287812336e-05, + "loss": 0.0, + "num_input_tokens_seen": 15342072, + "step": 25170 + }, + { + "epoch": 6.942912300055157, + "grad_norm": 1.7263086192542687e-05, + "learning_rate": 1.2936001251868119e-05, + "loss": 0.0, + "num_input_tokens_seen": 15344376, + "step": 25175 + }, + { + "epoch": 6.944291230005516, + "grad_norm": 9.59628050622996e-06, + "learning_rate": 1.292546301097925e-05, + "loss": 0.0, + "num_input_tokens_seen": 15347256, + "step": 25180 + }, + { + "epoch": 6.945670159955874, + "grad_norm": 1.3907671927881893e-05, + "learning_rate": 1.2914927567587265e-05, + "loss": 0.0, + "num_input_tokens_seen": 15349848, + "step": 25185 + }, + { + "epoch": 6.9470490899062325, + "grad_norm": 1.1104839359177276e-06, + "learning_rate": 1.290439492413309e-05, + "loss": 0.0, + "num_input_tokens_seen": 15352312, + "step": 25190 + }, + { + "epoch": 6.948428019856591, + "grad_norm": 2.6910340693575563e-06, + "learning_rate": 1.2893865083057e-05, + "loss": 0.0, + "num_input_tokens_seen": 15355384, + "step": 25195 + }, + { + "epoch": 6.94980694980695, + "grad_norm": 1.1714475931512425e-06, + "learning_rate": 1.2883338046798587e-05, + "loss": 0.0, + "num_input_tokens_seen": 15358040, + "step": 25200 + }, + { + "epoch": 6.9511858797573085, + "grad_norm": 5.9174994930799585e-06, + "learning_rate": 1.2872813817796814e-05, + "loss": 0.0, + "num_input_tokens_seen": 15360632, + "step": 25205 + }, + { + "epoch": 6.952564809707667, + "grad_norm": 1.58217574153241e-06, + "learning_rate": 1.2862292398490011e-05, + "loss": 0.0, + "num_input_tokens_seen": 15363416, + "step": 25210 + }, + { + "epoch": 6.953943739658025, + "grad_norm": 2.3074867385730613e-06, + "learning_rate": 1.2851773791315819e-05, + "loss": 0.0, + "num_input_tokens_seen": 15366104, + "step": 25215 + }, + { + "epoch": 6.955322669608384, + "grad_norm": 1.1236798854952212e-05, + "learning_rate": 1.284125799871127e-05, + "loss": 0.0, + "num_input_tokens_seen": 15368568, + "step": 25220 + }, + { + "epoch": 6.956701599558743, + "grad_norm": 5.007140862289816e-05, + "learning_rate": 1.2830745023112709e-05, + "loss": 0.0, + "num_input_tokens_seen": 15371352, + "step": 25225 + }, + { + "epoch": 6.958080529509101, + "grad_norm": 2.2730687305738684e-06, + "learning_rate": 1.2820234866955832e-05, + "loss": 0.0, + "num_input_tokens_seen": 15374072, + "step": 25230 + }, + { + "epoch": 6.95945945945946, + "grad_norm": 9.248561582353432e-06, + "learning_rate": 1.280972753267572e-05, + "loss": 0.0, + "num_input_tokens_seen": 15378584, + "step": 25235 + }, + { + "epoch": 6.960838389409818, + "grad_norm": 3.6255351005820557e-06, + "learning_rate": 1.2799223022706747e-05, + "loss": 0.0, + "num_input_tokens_seen": 15380984, + "step": 25240 + }, + { + "epoch": 6.962217319360176, + "grad_norm": 2.729856987571111e-06, + "learning_rate": 1.2788721339482656e-05, + "loss": 0.0, + "num_input_tokens_seen": 15383224, + "step": 25245 + }, + { + "epoch": 6.963596249310535, + "grad_norm": 3.229379217373207e-06, + "learning_rate": 1.2778222485436548e-05, + "loss": 0.0, + "num_input_tokens_seen": 15386520, + "step": 25250 + }, + { + "epoch": 6.964975179260893, + "grad_norm": 1.1780732620536583e-06, + "learning_rate": 1.2767726463000861e-05, + "loss": 0.0, + "num_input_tokens_seen": 15388632, + "step": 25255 + }, + { + "epoch": 6.966354109211252, + "grad_norm": 2.3320678792515537e-06, + "learning_rate": 1.2757233274607355e-05, + "loss": 0.0, + "num_input_tokens_seen": 15391256, + "step": 25260 + }, + { + "epoch": 6.967733039161611, + "grad_norm": 0.00013817913713864982, + "learning_rate": 1.2746742922687171e-05, + "loss": 0.0, + "num_input_tokens_seen": 15394328, + "step": 25265 + }, + { + "epoch": 6.969111969111969, + "grad_norm": 5.5635297030676156e-05, + "learning_rate": 1.2736255409670758e-05, + "loss": 0.0, + "num_input_tokens_seen": 15397400, + "step": 25270 + }, + { + "epoch": 6.9704908990623275, + "grad_norm": 5.813399548060261e-05, + "learning_rate": 1.2725770737987935e-05, + "loss": 0.0, + "num_input_tokens_seen": 15399928, + "step": 25275 + }, + { + "epoch": 6.971869829012686, + "grad_norm": 0.00010053989535663277, + "learning_rate": 1.271528891006784e-05, + "loss": 0.0, + "num_input_tokens_seen": 15402840, + "step": 25280 + }, + { + "epoch": 6.973248758963044, + "grad_norm": 1.5342745882662712e-06, + "learning_rate": 1.2704809928338956e-05, + "loss": 0.0, + "num_input_tokens_seen": 15406456, + "step": 25285 + }, + { + "epoch": 6.974627688913404, + "grad_norm": 9.659925854066387e-05, + "learning_rate": 1.2694333795229132e-05, + "loss": 0.0, + "num_input_tokens_seen": 15409496, + "step": 25290 + }, + { + "epoch": 6.976006618863762, + "grad_norm": 4.369336693343939e-06, + "learning_rate": 1.2683860513165529e-05, + "loss": 0.0, + "num_input_tokens_seen": 15413208, + "step": 25295 + }, + { + "epoch": 6.97738554881412, + "grad_norm": 1.5671601431677118e-05, + "learning_rate": 1.2673390084574641e-05, + "loss": 0.0, + "num_input_tokens_seen": 15416152, + "step": 25300 + }, + { + "epoch": 6.978764478764479, + "grad_norm": 1.5690183090555365e-06, + "learning_rate": 1.2662922511882341e-05, + "loss": 0.0, + "num_input_tokens_seen": 15418392, + "step": 25305 + }, + { + "epoch": 6.980143408714837, + "grad_norm": 3.846274921670556e-06, + "learning_rate": 1.2652457797513789e-05, + "loss": 0.0, + "num_input_tokens_seen": 15421112, + "step": 25310 + }, + { + "epoch": 6.9815223386651954, + "grad_norm": 7.470496825590089e-07, + "learning_rate": 1.2641995943893536e-05, + "loss": 0.0, + "num_input_tokens_seen": 15425336, + "step": 25315 + }, + { + "epoch": 6.982901268615555, + "grad_norm": 1.2977527603652561e-06, + "learning_rate": 1.2631536953445425e-05, + "loss": 0.0, + "num_input_tokens_seen": 15428760, + "step": 25320 + }, + { + "epoch": 6.984280198565913, + "grad_norm": 9.177259130410675e-07, + "learning_rate": 1.2621080828592666e-05, + "loss": 0.0, + "num_input_tokens_seen": 15431736, + "step": 25325 + }, + { + "epoch": 6.9856591285162715, + "grad_norm": 1.6310650607920252e-05, + "learning_rate": 1.2610627571757778e-05, + "loss": 0.0, + "num_input_tokens_seen": 15434456, + "step": 25330 + }, + { + "epoch": 6.98703805846663, + "grad_norm": 2.011065589613281e-06, + "learning_rate": 1.2600177185362646e-05, + "loss": 0.0, + "num_input_tokens_seen": 15438168, + "step": 25335 + }, + { + "epoch": 6.988416988416988, + "grad_norm": 1.8618836747918976e-06, + "learning_rate": 1.2589729671828463e-05, + "loss": 0.0, + "num_input_tokens_seen": 15440728, + "step": 25340 + }, + { + "epoch": 6.989795918367347, + "grad_norm": 9.9732978924294e-06, + "learning_rate": 1.2579285033575777e-05, + "loss": 0.0, + "num_input_tokens_seen": 15443832, + "step": 25345 + }, + { + "epoch": 6.991174848317705, + "grad_norm": 1.748880322338664e-06, + "learning_rate": 1.256884327302445e-05, + "loss": 0.0, + "num_input_tokens_seen": 15446744, + "step": 25350 + }, + { + "epoch": 6.992553778268064, + "grad_norm": 1.143462213804014e-05, + "learning_rate": 1.2558404392593705e-05, + "loss": 0.0, + "num_input_tokens_seen": 15450456, + "step": 25355 + }, + { + "epoch": 6.993932708218423, + "grad_norm": 8.204355594898516e-07, + "learning_rate": 1.254796839470207e-05, + "loss": 0.0, + "num_input_tokens_seen": 15453720, + "step": 25360 + }, + { + "epoch": 6.995311638168781, + "grad_norm": 2.6435739073349396e-06, + "learning_rate": 1.2537535281767406e-05, + "loss": 0.0, + "num_input_tokens_seen": 15456312, + "step": 25365 + }, + { + "epoch": 6.996690568119139, + "grad_norm": 3.954459316446446e-06, + "learning_rate": 1.2527105056206936e-05, + "loss": 0.0, + "num_input_tokens_seen": 15461336, + "step": 25370 + }, + { + "epoch": 6.998069498069498, + "grad_norm": 4.7918447307893075e-06, + "learning_rate": 1.251667772043718e-05, + "loss": 0.0, + "num_input_tokens_seen": 15464600, + "step": 25375 + }, + { + "epoch": 6.999448428019857, + "grad_norm": 1.885518395283725e-05, + "learning_rate": 1.2506253276874002e-05, + "loss": 0.0, + "num_input_tokens_seen": 15468056, + "step": 25380 + }, + { + "epoch": 7.0, + "eval_loss": 0.3668937087059021, + "eval_runtime": 28.4987, + "eval_samples_per_second": 56.564, + "eval_steps_per_second": 14.141, + "num_input_tokens_seen": 15468632, + "step": 25382 + }, + { + "epoch": 7.000827357970215, + "grad_norm": 2.9059467578917975e-06, + "learning_rate": 1.2495831727932605e-05, + "loss": 0.0, + "num_input_tokens_seen": 15470424, + "step": 25385 + }, + { + "epoch": 7.002206287920574, + "grad_norm": 9.512856195215136e-06, + "learning_rate": 1.2485413076027497e-05, + "loss": 0.0, + "num_input_tokens_seen": 15473336, + "step": 25390 + }, + { + "epoch": 7.003585217870932, + "grad_norm": 7.308926797122695e-07, + "learning_rate": 1.247499732357254e-05, + "loss": 0.0, + "num_input_tokens_seen": 15476504, + "step": 25395 + }, + { + "epoch": 7.0049641478212905, + "grad_norm": 6.20177888777107e-06, + "learning_rate": 1.2464584472980923e-05, + "loss": 0.0, + "num_input_tokens_seen": 15480696, + "step": 25400 + }, + { + "epoch": 7.006343077771649, + "grad_norm": 8.014086461116676e-07, + "learning_rate": 1.2454174526665136e-05, + "loss": 0.0, + "num_input_tokens_seen": 15483128, + "step": 25405 + }, + { + "epoch": 7.007722007722008, + "grad_norm": 1.302530449720507e-06, + "learning_rate": 1.2443767487037034e-05, + "loss": 0.0, + "num_input_tokens_seen": 15485752, + "step": 25410 + }, + { + "epoch": 7.0091009376723665, + "grad_norm": 0.0001326764322584495, + "learning_rate": 1.2433363356507766e-05, + "loss": 0.0, + "num_input_tokens_seen": 15489144, + "step": 25415 + }, + { + "epoch": 7.010479867622725, + "grad_norm": 1.5621897091477877e-06, + "learning_rate": 1.2422962137487812e-05, + "loss": 0.0, + "num_input_tokens_seen": 15492440, + "step": 25420 + }, + { + "epoch": 7.011858797573083, + "grad_norm": 1.604697354196105e-05, + "learning_rate": 1.2412563832387003e-05, + "loss": 0.0, + "num_input_tokens_seen": 15495416, + "step": 25425 + }, + { + "epoch": 7.013237727523442, + "grad_norm": 0.0008851865422911942, + "learning_rate": 1.240216844361447e-05, + "loss": 0.0, + "num_input_tokens_seen": 15498520, + "step": 25430 + }, + { + "epoch": 7.0146166574738, + "grad_norm": 5.95278379478259e-06, + "learning_rate": 1.2391775973578665e-05, + "loss": 0.0, + "num_input_tokens_seen": 15501496, + "step": 25435 + }, + { + "epoch": 7.015995587424159, + "grad_norm": 2.521914439057582e-06, + "learning_rate": 1.2381386424687395e-05, + "loss": 0.0, + "num_input_tokens_seen": 15504088, + "step": 25440 + }, + { + "epoch": 7.017374517374518, + "grad_norm": 3.173275399603881e-05, + "learning_rate": 1.2370999799347752e-05, + "loss": 0.0, + "num_input_tokens_seen": 15507288, + "step": 25445 + }, + { + "epoch": 7.018753447324876, + "grad_norm": 2.8375031888572266e-06, + "learning_rate": 1.2360616099966168e-05, + "loss": 0.0, + "num_input_tokens_seen": 15509784, + "step": 25450 + }, + { + "epoch": 7.020132377275234, + "grad_norm": 2.2038981114747003e-05, + "learning_rate": 1.2350235328948411e-05, + "loss": 0.0, + "num_input_tokens_seen": 15512440, + "step": 25455 + }, + { + "epoch": 7.021511307225593, + "grad_norm": 0.00010586568532744423, + "learning_rate": 1.2339857488699538e-05, + "loss": 0.0, + "num_input_tokens_seen": 15516760, + "step": 25460 + }, + { + "epoch": 7.022890237175951, + "grad_norm": 1.8527942984292167e-06, + "learning_rate": 1.2329482581623957e-05, + "loss": 0.1063, + "num_input_tokens_seen": 15519608, + "step": 25465 + }, + { + "epoch": 7.0242691671263096, + "grad_norm": 9.403209332958795e-06, + "learning_rate": 1.2319110610125392e-05, + "loss": 0.0, + "num_input_tokens_seen": 15523864, + "step": 25470 + }, + { + "epoch": 7.025648097076669, + "grad_norm": 1.1583824743865989e-05, + "learning_rate": 1.2308741576606864e-05, + "loss": 0.0, + "num_input_tokens_seen": 15526712, + "step": 25475 + }, + { + "epoch": 7.027027027027027, + "grad_norm": 4.564469782053493e-05, + "learning_rate": 1.2298375483470745e-05, + "loss": 0.0, + "num_input_tokens_seen": 15529688, + "step": 25480 + }, + { + "epoch": 7.028405956977386, + "grad_norm": 1.1087129223597003e-06, + "learning_rate": 1.2288012333118703e-05, + "loss": 0.0, + "num_input_tokens_seen": 15533816, + "step": 25485 + }, + { + "epoch": 7.029784886927744, + "grad_norm": 1.8477942376193823e-06, + "learning_rate": 1.2277652127951723e-05, + "loss": 0.0, + "num_input_tokens_seen": 15536664, + "step": 25490 + }, + { + "epoch": 7.031163816878102, + "grad_norm": 0.019002458080649376, + "learning_rate": 1.2267294870370127e-05, + "loss": 0.0, + "num_input_tokens_seen": 15540184, + "step": 25495 + }, + { + "epoch": 7.032542746828461, + "grad_norm": 1.1192194051545812e-06, + "learning_rate": 1.2256940562773534e-05, + "loss": 0.0, + "num_input_tokens_seen": 15542744, + "step": 25500 + }, + { + "epoch": 7.03392167677882, + "grad_norm": 8.164889209183457e-07, + "learning_rate": 1.2246589207560901e-05, + "loss": 0.0001, + "num_input_tokens_seen": 15546328, + "step": 25505 + }, + { + "epoch": 7.035300606729178, + "grad_norm": 2.606605903565651e-06, + "learning_rate": 1.2236240807130481e-05, + "loss": 0.0, + "num_input_tokens_seen": 15549208, + "step": 25510 + }, + { + "epoch": 7.036679536679537, + "grad_norm": 8.576474101573694e-06, + "learning_rate": 1.222589536387984e-05, + "loss": 0.0, + "num_input_tokens_seen": 15552024, + "step": 25515 + }, + { + "epoch": 7.038058466629895, + "grad_norm": 2.3570955818286166e-06, + "learning_rate": 1.2215552880205889e-05, + "loss": 0.0, + "num_input_tokens_seen": 15555096, + "step": 25520 + }, + { + "epoch": 7.0394373965802535, + "grad_norm": 2.1833175196661614e-05, + "learning_rate": 1.2205213358504811e-05, + "loss": 0.0, + "num_input_tokens_seen": 15557720, + "step": 25525 + }, + { + "epoch": 7.040816326530612, + "grad_norm": 2.7270937152934493e-06, + "learning_rate": 1.2194876801172148e-05, + "loss": 0.0, + "num_input_tokens_seen": 15560504, + "step": 25530 + }, + { + "epoch": 7.042195256480971, + "grad_norm": 1.958895836651209e-06, + "learning_rate": 1.2184543210602709e-05, + "loss": 0.0, + "num_input_tokens_seen": 15563800, + "step": 25535 + }, + { + "epoch": 7.0435741864313295, + "grad_norm": 4.944504325976595e-06, + "learning_rate": 1.2174212589190659e-05, + "loss": 0.0, + "num_input_tokens_seen": 15566360, + "step": 25540 + }, + { + "epoch": 7.044953116381688, + "grad_norm": 2.1016012397012673e-06, + "learning_rate": 1.2163884939329433e-05, + "loss": 0.0, + "num_input_tokens_seen": 15568856, + "step": 25545 + }, + { + "epoch": 7.046332046332046, + "grad_norm": 6.512218533316627e-06, + "learning_rate": 1.2153560263411826e-05, + "loss": 0.0, + "num_input_tokens_seen": 15571672, + "step": 25550 + }, + { + "epoch": 7.047710976282405, + "grad_norm": 1.1180616183992242e-06, + "learning_rate": 1.2143238563829892e-05, + "loss": 0.0, + "num_input_tokens_seen": 15574040, + "step": 25555 + }, + { + "epoch": 7.049089906232763, + "grad_norm": 4.714400347438641e-05, + "learning_rate": 1.213291984297504e-05, + "loss": 0.0, + "num_input_tokens_seen": 15578616, + "step": 25560 + }, + { + "epoch": 7.050468836183122, + "grad_norm": 0.00011034676572307944, + "learning_rate": 1.212260410323796e-05, + "loss": 0.0, + "num_input_tokens_seen": 15580856, + "step": 25565 + }, + { + "epoch": 7.051847766133481, + "grad_norm": 0.00021250943245831877, + "learning_rate": 1.2112291347008653e-05, + "loss": 0.0, + "num_input_tokens_seen": 15584312, + "step": 25570 + }, + { + "epoch": 7.053226696083839, + "grad_norm": 0.00010170504538109526, + "learning_rate": 1.2101981576676454e-05, + "loss": 0.0, + "num_input_tokens_seen": 15587192, + "step": 25575 + }, + { + "epoch": 7.054605626034197, + "grad_norm": 1.0993694559147116e-05, + "learning_rate": 1.2091674794629984e-05, + "loss": 0.0, + "num_input_tokens_seen": 15590136, + "step": 25580 + }, + { + "epoch": 7.055984555984556, + "grad_norm": 1.57006270455895e-05, + "learning_rate": 1.208137100325716e-05, + "loss": 0.0, + "num_input_tokens_seen": 15592600, + "step": 25585 + }, + { + "epoch": 7.057363485934914, + "grad_norm": 1.6871165371412644e-06, + "learning_rate": 1.2071070204945246e-05, + "loss": 0.0, + "num_input_tokens_seen": 15596376, + "step": 25590 + }, + { + "epoch": 7.058742415885273, + "grad_norm": 1.3840121937391814e-06, + "learning_rate": 1.2060772402080767e-05, + "loss": 0.0, + "num_input_tokens_seen": 15599096, + "step": 25595 + }, + { + "epoch": 7.060121345835632, + "grad_norm": 1.2937446172145428e-06, + "learning_rate": 1.2050477597049597e-05, + "loss": 0.0, + "num_input_tokens_seen": 15601464, + "step": 25600 + }, + { + "epoch": 7.06150027578599, + "grad_norm": 2.9582349725387758e-06, + "learning_rate": 1.2040185792236874e-05, + "loss": 0.0, + "num_input_tokens_seen": 15604696, + "step": 25605 + }, + { + "epoch": 7.0628792057363485, + "grad_norm": 2.5238250600523315e-06, + "learning_rate": 1.2029896990027076e-05, + "loss": 0.0, + "num_input_tokens_seen": 15608888, + "step": 25610 + }, + { + "epoch": 7.064258135686707, + "grad_norm": 4.5290103116713e-06, + "learning_rate": 1.2019611192803973e-05, + "loss": 0.0, + "num_input_tokens_seen": 15612184, + "step": 25615 + }, + { + "epoch": 7.065637065637065, + "grad_norm": 1.3373476122069405e-06, + "learning_rate": 1.2009328402950634e-05, + "loss": 0.0, + "num_input_tokens_seen": 15615768, + "step": 25620 + }, + { + "epoch": 7.0670159955874245, + "grad_norm": 7.586102128698258e-07, + "learning_rate": 1.1999048622849418e-05, + "loss": 0.0, + "num_input_tokens_seen": 15618904, + "step": 25625 + }, + { + "epoch": 7.068394925537783, + "grad_norm": 1.8906372133642435e-05, + "learning_rate": 1.1988771854882027e-05, + "loss": 0.0, + "num_input_tokens_seen": 15621912, + "step": 25630 + }, + { + "epoch": 7.069773855488141, + "grad_norm": 1.4394669960893225e-05, + "learning_rate": 1.1978498101429427e-05, + "loss": 0.0, + "num_input_tokens_seen": 15625400, + "step": 25635 + }, + { + "epoch": 7.0711527854385, + "grad_norm": 6.101836788729997e-06, + "learning_rate": 1.1968227364871892e-05, + "loss": 0.0, + "num_input_tokens_seen": 15628152, + "step": 25640 + }, + { + "epoch": 7.072531715388858, + "grad_norm": 3.3930857625819044e-06, + "learning_rate": 1.195795964758902e-05, + "loss": 0.0, + "num_input_tokens_seen": 15631064, + "step": 25645 + }, + { + "epoch": 7.073910645339216, + "grad_norm": 1.3268639804664417e-06, + "learning_rate": 1.1947694951959682e-05, + "loss": 0.0, + "num_input_tokens_seen": 15633848, + "step": 25650 + }, + { + "epoch": 7.075289575289576, + "grad_norm": 1.0152049298994825e-06, + "learning_rate": 1.193743328036207e-05, + "loss": 0.0, + "num_input_tokens_seen": 15636536, + "step": 25655 + }, + { + "epoch": 7.076668505239934, + "grad_norm": 3.538369128364138e-05, + "learning_rate": 1.1927174635173665e-05, + "loss": 0.0, + "num_input_tokens_seen": 15639480, + "step": 25660 + }, + { + "epoch": 7.0780474351902924, + "grad_norm": 8.93014293978922e-06, + "learning_rate": 1.1916919018771232e-05, + "loss": 0.0, + "num_input_tokens_seen": 15643224, + "step": 25665 + }, + { + "epoch": 7.079426365140651, + "grad_norm": 5.95232222622144e-06, + "learning_rate": 1.1906666433530864e-05, + "loss": 0.0, + "num_input_tokens_seen": 15646392, + "step": 25670 + }, + { + "epoch": 7.080805295091009, + "grad_norm": 1.1522305612743367e-05, + "learning_rate": 1.1896416881827947e-05, + "loss": 0.0, + "num_input_tokens_seen": 15649784, + "step": 25675 + }, + { + "epoch": 7.082184225041368, + "grad_norm": 4.593166522681713e-05, + "learning_rate": 1.188617036603713e-05, + "loss": 0.0, + "num_input_tokens_seen": 15653272, + "step": 25680 + }, + { + "epoch": 7.083563154991727, + "grad_norm": 1.78046047949465e-06, + "learning_rate": 1.1875926888532412e-05, + "loss": 0.0, + "num_input_tokens_seen": 15655576, + "step": 25685 + }, + { + "epoch": 7.084942084942085, + "grad_norm": 1.0728317647590302e-06, + "learning_rate": 1.1865686451687033e-05, + "loss": 0.0, + "num_input_tokens_seen": 15659128, + "step": 25690 + }, + { + "epoch": 7.086321014892444, + "grad_norm": 1.044779310177546e-06, + "learning_rate": 1.185544905787358e-05, + "loss": 0.0, + "num_input_tokens_seen": 15661432, + "step": 25695 + }, + { + "epoch": 7.087699944842802, + "grad_norm": 0.0001855111913755536, + "learning_rate": 1.1845214709463895e-05, + "loss": 0.0, + "num_input_tokens_seen": 15664056, + "step": 25700 + }, + { + "epoch": 7.08907887479316, + "grad_norm": 5.505565241037402e-06, + "learning_rate": 1.1834983408829123e-05, + "loss": 0.0, + "num_input_tokens_seen": 15667928, + "step": 25705 + }, + { + "epoch": 7.090457804743519, + "grad_norm": 1.0763541467895266e-05, + "learning_rate": 1.1824755158339732e-05, + "loss": 0.0, + "num_input_tokens_seen": 15671352, + "step": 25710 + }, + { + "epoch": 7.091836734693878, + "grad_norm": 8.241264367825352e-06, + "learning_rate": 1.1814529960365445e-05, + "loss": 0.0, + "num_input_tokens_seen": 15675736, + "step": 25715 + }, + { + "epoch": 7.093215664644236, + "grad_norm": 0.005151711869984865, + "learning_rate": 1.1804307817275286e-05, + "loss": 0.0, + "num_input_tokens_seen": 15678936, + "step": 25720 + }, + { + "epoch": 7.094594594594595, + "grad_norm": 6.678954378003255e-05, + "learning_rate": 1.17940887314376e-05, + "loss": 0.0, + "num_input_tokens_seen": 15682040, + "step": 25725 + }, + { + "epoch": 7.095973524544953, + "grad_norm": 5.5560676628374495e-06, + "learning_rate": 1.1783872705219992e-05, + "loss": 0.0, + "num_input_tokens_seen": 15686072, + "step": 25730 + }, + { + "epoch": 7.0973524544953115, + "grad_norm": 1.3090144420857541e-05, + "learning_rate": 1.1773659740989359e-05, + "loss": 0.0, + "num_input_tokens_seen": 15689080, + "step": 25735 + }, + { + "epoch": 7.09873138444567, + "grad_norm": 4.247441029292531e-05, + "learning_rate": 1.1763449841111906e-05, + "loss": 0.0, + "num_input_tokens_seen": 15691320, + "step": 25740 + }, + { + "epoch": 7.100110314396028, + "grad_norm": 1.4828463008598192e-06, + "learning_rate": 1.1753243007953125e-05, + "loss": 0.0, + "num_input_tokens_seen": 15693912, + "step": 25745 + }, + { + "epoch": 7.1014892443463875, + "grad_norm": 9.796541235118639e-06, + "learning_rate": 1.1743039243877797e-05, + "loss": 0.0, + "num_input_tokens_seen": 15697400, + "step": 25750 + }, + { + "epoch": 7.102868174296746, + "grad_norm": 2.999468279085704e-06, + "learning_rate": 1.1732838551249978e-05, + "loss": 0.0629, + "num_input_tokens_seen": 15699672, + "step": 25755 + }, + { + "epoch": 7.104247104247104, + "grad_norm": 9.898435564537067e-06, + "learning_rate": 1.1722640932433018e-05, + "loss": 0.0001, + "num_input_tokens_seen": 15702904, + "step": 25760 + }, + { + "epoch": 7.105626034197463, + "grad_norm": 6.212986772879958e-05, + "learning_rate": 1.1712446389789572e-05, + "loss": 0.0, + "num_input_tokens_seen": 15705560, + "step": 25765 + }, + { + "epoch": 7.107004964147821, + "grad_norm": 0.0010138641810044646, + "learning_rate": 1.1702254925681566e-05, + "loss": 0.0, + "num_input_tokens_seen": 15708440, + "step": 25770 + }, + { + "epoch": 7.108383894098179, + "grad_norm": 7.966912107804092e-07, + "learning_rate": 1.1692066542470201e-05, + "loss": 0.0, + "num_input_tokens_seen": 15710968, + "step": 25775 + }, + { + "epoch": 7.109762824048539, + "grad_norm": 2.3798636448191246e-06, + "learning_rate": 1.1681881242516e-05, + "loss": 0.0, + "num_input_tokens_seen": 15713464, + "step": 25780 + }, + { + "epoch": 7.111141753998897, + "grad_norm": 5.675804004567908e-06, + "learning_rate": 1.1671699028178729e-05, + "loss": 0.0, + "num_input_tokens_seen": 15716504, + "step": 25785 + }, + { + "epoch": 7.112520683949255, + "grad_norm": 7.256716344272718e-05, + "learning_rate": 1.1661519901817485e-05, + "loss": 0.0, + "num_input_tokens_seen": 15719512, + "step": 25790 + }, + { + "epoch": 7.113899613899614, + "grad_norm": 2.772909283521585e-05, + "learning_rate": 1.1651343865790615e-05, + "loss": 0.0, + "num_input_tokens_seen": 15722648, + "step": 25795 + }, + { + "epoch": 7.115278543849972, + "grad_norm": 2.166253125324147e-06, + "learning_rate": 1.1641170922455747e-05, + "loss": 0.0, + "num_input_tokens_seen": 15725784, + "step": 25800 + }, + { + "epoch": 7.1166574738003305, + "grad_norm": 4.121954589209054e-06, + "learning_rate": 1.1631001074169829e-05, + "loss": 0.0, + "num_input_tokens_seen": 15728120, + "step": 25805 + }, + { + "epoch": 7.11803640375069, + "grad_norm": 2.880380634451285e-06, + "learning_rate": 1.1620834323289049e-05, + "loss": 0.0, + "num_input_tokens_seen": 15730936, + "step": 25810 + }, + { + "epoch": 7.119415333701048, + "grad_norm": 1.7268217789023765e-06, + "learning_rate": 1.1610670672168906e-05, + "loss": 0.0, + "num_input_tokens_seen": 15733400, + "step": 25815 + }, + { + "epoch": 7.1207942636514066, + "grad_norm": 1.1275998076598626e-05, + "learning_rate": 1.160051012316418e-05, + "loss": 0.0, + "num_input_tokens_seen": 15736280, + "step": 25820 + }, + { + "epoch": 7.122173193601765, + "grad_norm": 1.764933813319658e-06, + "learning_rate": 1.1590352678628917e-05, + "loss": 0.0, + "num_input_tokens_seen": 15739640, + "step": 25825 + }, + { + "epoch": 7.123552123552123, + "grad_norm": 6.833886800450273e-06, + "learning_rate": 1.158019834091644e-05, + "loss": 0.0, + "num_input_tokens_seen": 15742040, + "step": 25830 + }, + { + "epoch": 7.124931053502482, + "grad_norm": 0.0006867878837510943, + "learning_rate": 1.1570047112379386e-05, + "loss": 0.0, + "num_input_tokens_seen": 15745912, + "step": 25835 + }, + { + "epoch": 7.126309983452841, + "grad_norm": 2.1403060600277968e-05, + "learning_rate": 1.1559898995369625e-05, + "loss": 0.0, + "num_input_tokens_seen": 15749144, + "step": 25840 + }, + { + "epoch": 7.127688913403199, + "grad_norm": 1.575904661876848e-06, + "learning_rate": 1.1549753992238354e-05, + "loss": 0.0, + "num_input_tokens_seen": 15752600, + "step": 25845 + }, + { + "epoch": 7.129067843353558, + "grad_norm": 7.040608579700347e-06, + "learning_rate": 1.1539612105336011e-05, + "loss": 0.0, + "num_input_tokens_seen": 15755576, + "step": 25850 + }, + { + "epoch": 7.130446773303916, + "grad_norm": 6.813922482251655e-06, + "learning_rate": 1.1529473337012317e-05, + "loss": 0.0, + "num_input_tokens_seen": 15758456, + "step": 25855 + }, + { + "epoch": 7.1318257032542745, + "grad_norm": 3.5642917737277457e-06, + "learning_rate": 1.15193376896163e-05, + "loss": 0.0, + "num_input_tokens_seen": 15762456, + "step": 25860 + }, + { + "epoch": 7.133204633204633, + "grad_norm": 1.6732059293644852e-06, + "learning_rate": 1.1509205165496233e-05, + "loss": 0.0, + "num_input_tokens_seen": 15765208, + "step": 25865 + }, + { + "epoch": 7.134583563154992, + "grad_norm": 1.2638408861675998e-06, + "learning_rate": 1.1499075766999667e-05, + "loss": 0.0, + "num_input_tokens_seen": 15768504, + "step": 25870 + }, + { + "epoch": 7.1359624931053505, + "grad_norm": 1.0950238902296405e-06, + "learning_rate": 1.1488949496473448e-05, + "loss": 0.0, + "num_input_tokens_seen": 15771672, + "step": 25875 + }, + { + "epoch": 7.137341423055709, + "grad_norm": 4.571314548229566e-06, + "learning_rate": 1.1478826356263698e-05, + "loss": 0.0, + "num_input_tokens_seen": 15774616, + "step": 25880 + }, + { + "epoch": 7.138720353006067, + "grad_norm": 3.609436589613324e-06, + "learning_rate": 1.1468706348715779e-05, + "loss": 0.0, + "num_input_tokens_seen": 15778136, + "step": 25885 + }, + { + "epoch": 7.140099282956426, + "grad_norm": 2.6981631435774034e-06, + "learning_rate": 1.1458589476174378e-05, + "loss": 0.0, + "num_input_tokens_seen": 15780344, + "step": 25890 + }, + { + "epoch": 7.141478212906784, + "grad_norm": 4.5417164074024186e-05, + "learning_rate": 1.1448475740983405e-05, + "loss": 0.0, + "num_input_tokens_seen": 15782520, + "step": 25895 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 2.8430008569557685e-06, + "learning_rate": 1.143836514548609e-05, + "loss": 0.0, + "num_input_tokens_seen": 15785176, + "step": 25900 + }, + { + "epoch": 7.144236072807502, + "grad_norm": 3.990554432675708e-06, + "learning_rate": 1.1428257692024903e-05, + "loss": 0.0, + "num_input_tokens_seen": 15787736, + "step": 25905 + }, + { + "epoch": 7.14561500275786, + "grad_norm": 8.860446882863471e-07, + "learning_rate": 1.141815338294158e-05, + "loss": 0.0006, + "num_input_tokens_seen": 15791032, + "step": 25910 + }, + { + "epoch": 7.146993932708218, + "grad_norm": 3.0705709832545836e-06, + "learning_rate": 1.140805222057717e-05, + "loss": 0.0, + "num_input_tokens_seen": 15793976, + "step": 25915 + }, + { + "epoch": 7.148372862658577, + "grad_norm": 4.0756247472018003e-05, + "learning_rate": 1.1397954207271958e-05, + "loss": 0.0, + "num_input_tokens_seen": 15797208, + "step": 25920 + }, + { + "epoch": 7.149751792608935, + "grad_norm": 6.533366104122251e-06, + "learning_rate": 1.1387859345365495e-05, + "loss": 0.0, + "num_input_tokens_seen": 15800568, + "step": 25925 + }, + { + "epoch": 7.151130722559294, + "grad_norm": 5.962438535789261e-06, + "learning_rate": 1.1377767637196638e-05, + "loss": 0.0, + "num_input_tokens_seen": 15803640, + "step": 25930 + }, + { + "epoch": 7.152509652509653, + "grad_norm": 2.9115562938386574e-06, + "learning_rate": 1.136767908510347e-05, + "loss": 0.0, + "num_input_tokens_seen": 15806744, + "step": 25935 + }, + { + "epoch": 7.153888582460011, + "grad_norm": 5.23677772434894e-06, + "learning_rate": 1.1357593691423381e-05, + "loss": 0.0, + "num_input_tokens_seen": 15809752, + "step": 25940 + }, + { + "epoch": 7.1552675124103695, + "grad_norm": 1.080451056623133e-06, + "learning_rate": 1.1347511458493e-05, + "loss": 0.0, + "num_input_tokens_seen": 15812568, + "step": 25945 + }, + { + "epoch": 7.156646442360728, + "grad_norm": 1.8108936501448625e-06, + "learning_rate": 1.1337432388648247e-05, + "loss": 0.0, + "num_input_tokens_seen": 15815416, + "step": 25950 + }, + { + "epoch": 7.158025372311086, + "grad_norm": 0.0013162720715627074, + "learning_rate": 1.1327356484224283e-05, + "loss": 0.0, + "num_input_tokens_seen": 15818392, + "step": 25955 + }, + { + "epoch": 7.1594043022614455, + "grad_norm": 0.0001410255063092336, + "learning_rate": 1.1317283747555568e-05, + "loss": 0.0, + "num_input_tokens_seen": 15821400, + "step": 25960 + }, + { + "epoch": 7.160783232211804, + "grad_norm": 0.2176869511604309, + "learning_rate": 1.1307214180975791e-05, + "loss": 0.0001, + "num_input_tokens_seen": 15824408, + "step": 25965 + }, + { + "epoch": 7.162162162162162, + "grad_norm": 1.3364419828576501e-06, + "learning_rate": 1.1297147786817947e-05, + "loss": 0.0, + "num_input_tokens_seen": 15827640, + "step": 25970 + }, + { + "epoch": 7.163541092112521, + "grad_norm": 1.088914905267302e-05, + "learning_rate": 1.1287084567414263e-05, + "loss": 0.0, + "num_input_tokens_seen": 15830232, + "step": 25975 + }, + { + "epoch": 7.164920022062879, + "grad_norm": 4.057230853504734e-06, + "learning_rate": 1.1277024525096235e-05, + "loss": 0.0, + "num_input_tokens_seen": 15833208, + "step": 25980 + }, + { + "epoch": 7.166298952013237, + "grad_norm": 2.5560766516719013e-05, + "learning_rate": 1.1266967662194646e-05, + "loss": 0.0, + "num_input_tokens_seen": 15836568, + "step": 25985 + }, + { + "epoch": 7.167677881963597, + "grad_norm": 4.022148459625896e-06, + "learning_rate": 1.1256913981039515e-05, + "loss": 0.0, + "num_input_tokens_seen": 15839704, + "step": 25990 + }, + { + "epoch": 7.169056811913955, + "grad_norm": 0.0002634481352288276, + "learning_rate": 1.124686348396015e-05, + "loss": 0.0, + "num_input_tokens_seen": 15842456, + "step": 25995 + }, + { + "epoch": 7.170435741864313, + "grad_norm": 6.326129550870974e-06, + "learning_rate": 1.1236816173285095e-05, + "loss": 0.0, + "num_input_tokens_seen": 15844952, + "step": 26000 + }, + { + "epoch": 7.171814671814672, + "grad_norm": 6.308491151685303e-07, + "learning_rate": 1.122677205134216e-05, + "loss": 0.0, + "num_input_tokens_seen": 15847224, + "step": 26005 + }, + { + "epoch": 7.17319360176503, + "grad_norm": 1.829696884669829e-05, + "learning_rate": 1.1216731120458443e-05, + "loss": 0.0, + "num_input_tokens_seen": 15850776, + "step": 26010 + }, + { + "epoch": 7.174572531715389, + "grad_norm": 1.0114885299117304e-05, + "learning_rate": 1.1206693382960268e-05, + "loss": 0.0, + "num_input_tokens_seen": 15853688, + "step": 26015 + }, + { + "epoch": 7.175951461665747, + "grad_norm": 1.3969169003757997e-06, + "learning_rate": 1.119665884117324e-05, + "loss": 0.0, + "num_input_tokens_seen": 15856568, + "step": 26020 + }, + { + "epoch": 7.177330391616106, + "grad_norm": 1.3522026165446732e-06, + "learning_rate": 1.1186627497422227e-05, + "loss": 0.0, + "num_input_tokens_seen": 15858840, + "step": 26025 + }, + { + "epoch": 7.178709321566465, + "grad_norm": 6.24521362624364e-06, + "learning_rate": 1.1176599354031333e-05, + "loss": 0.0, + "num_input_tokens_seen": 15861080, + "step": 26030 + }, + { + "epoch": 7.180088251516823, + "grad_norm": 7.732786571068573e-07, + "learning_rate": 1.1166574413323951e-05, + "loss": 0.0, + "num_input_tokens_seen": 15865080, + "step": 26035 + }, + { + "epoch": 7.181467181467181, + "grad_norm": 3.277905989307328e-06, + "learning_rate": 1.1156552677622707e-05, + "loss": 0.0, + "num_input_tokens_seen": 15867352, + "step": 26040 + }, + { + "epoch": 7.18284611141754, + "grad_norm": 3.146021754218964e-06, + "learning_rate": 1.1146534149249483e-05, + "loss": 0.0, + "num_input_tokens_seen": 15869592, + "step": 26045 + }, + { + "epoch": 7.184225041367899, + "grad_norm": 0.013491456396877766, + "learning_rate": 1.1136518830525447e-05, + "loss": 0.0, + "num_input_tokens_seen": 15873272, + "step": 26050 + }, + { + "epoch": 7.185603971318257, + "grad_norm": 6.15028475294821e-05, + "learning_rate": 1.1126506723770996e-05, + "loss": 0.0, + "num_input_tokens_seen": 15876760, + "step": 26055 + }, + { + "epoch": 7.186982901268616, + "grad_norm": 0.00043956094305031, + "learning_rate": 1.1116497831305781e-05, + "loss": 0.0, + "num_input_tokens_seen": 15880056, + "step": 26060 + }, + { + "epoch": 7.188361831218974, + "grad_norm": 9.45924603001913e-06, + "learning_rate": 1.110649215544874e-05, + "loss": 0.0, + "num_input_tokens_seen": 15882296, + "step": 26065 + }, + { + "epoch": 7.1897407611693325, + "grad_norm": 1.024058860821242e-06, + "learning_rate": 1.1096489698518032e-05, + "loss": 0.0, + "num_input_tokens_seen": 15885976, + "step": 26070 + }, + { + "epoch": 7.191119691119691, + "grad_norm": 9.163406389234297e-07, + "learning_rate": 1.1086490462831073e-05, + "loss": 0.0, + "num_input_tokens_seen": 15888696, + "step": 26075 + }, + { + "epoch": 7.192498621070049, + "grad_norm": 3.8030887026252458e-06, + "learning_rate": 1.1076494450704566e-05, + "loss": 0.0, + "num_input_tokens_seen": 15891416, + "step": 26080 + }, + { + "epoch": 7.1938775510204085, + "grad_norm": 0.0007324914913624525, + "learning_rate": 1.106650166445442e-05, + "loss": 0.0, + "num_input_tokens_seen": 15894872, + "step": 26085 + }, + { + "epoch": 7.195256480970767, + "grad_norm": 2.349493797737523e-06, + "learning_rate": 1.105651210639583e-05, + "loss": 0.0, + "num_input_tokens_seen": 15897432, + "step": 26090 + }, + { + "epoch": 7.196635410921125, + "grad_norm": 1.126094161918445e-06, + "learning_rate": 1.1046525778843244e-05, + "loss": 0.0, + "num_input_tokens_seen": 15900920, + "step": 26095 + }, + { + "epoch": 7.198014340871484, + "grad_norm": 1.9300678104627877e-06, + "learning_rate": 1.1036542684110334e-05, + "loss": 0.0, + "num_input_tokens_seen": 15904376, + "step": 26100 + }, + { + "epoch": 7.199393270821842, + "grad_norm": 9.79156880021037e-07, + "learning_rate": 1.1026562824510056e-05, + "loss": 0.0, + "num_input_tokens_seen": 15907352, + "step": 26105 + }, + { + "epoch": 7.2007722007722, + "grad_norm": 1.5580740182485897e-06, + "learning_rate": 1.101658620235459e-05, + "loss": 0.0, + "num_input_tokens_seen": 15909976, + "step": 26110 + }, + { + "epoch": 7.20215113072256, + "grad_norm": 3.061644019908272e-05, + "learning_rate": 1.1006612819955367e-05, + "loss": 0.0, + "num_input_tokens_seen": 15914968, + "step": 26115 + }, + { + "epoch": 7.203530060672918, + "grad_norm": 2.9545168217737228e-05, + "learning_rate": 1.0996642679623095e-05, + "loss": 0.0, + "num_input_tokens_seen": 15917240, + "step": 26120 + }, + { + "epoch": 7.204908990623276, + "grad_norm": 3.445008815106121e-06, + "learning_rate": 1.0986675783667696e-05, + "loss": 0.0, + "num_input_tokens_seen": 15920600, + "step": 26125 + }, + { + "epoch": 7.206287920573635, + "grad_norm": 1.1587702601900673e-06, + "learning_rate": 1.0976712134398373e-05, + "loss": 0.0, + "num_input_tokens_seen": 15923640, + "step": 26130 + }, + { + "epoch": 7.207666850523993, + "grad_norm": 2.4723519800318172e-06, + "learning_rate": 1.0966751734123554e-05, + "loss": 0.0, + "num_input_tokens_seen": 15927032, + "step": 26135 + }, + { + "epoch": 7.2090457804743515, + "grad_norm": 2.66506845036929e-06, + "learning_rate": 1.0956794585150909e-05, + "loss": 0.0, + "num_input_tokens_seen": 15930008, + "step": 26140 + }, + { + "epoch": 7.210424710424711, + "grad_norm": 2.649092948558973e-06, + "learning_rate": 1.0946840689787385e-05, + "loss": 0.0, + "num_input_tokens_seen": 15932344, + "step": 26145 + }, + { + "epoch": 7.211803640375069, + "grad_norm": 1.0524490789975971e-05, + "learning_rate": 1.093689005033914e-05, + "loss": 0.0, + "num_input_tokens_seen": 15935000, + "step": 26150 + }, + { + "epoch": 7.2131825703254275, + "grad_norm": 2.5978458779718494e-06, + "learning_rate": 1.0926942669111612e-05, + "loss": 0.0, + "num_input_tokens_seen": 15937688, + "step": 26155 + }, + { + "epoch": 7.214561500275786, + "grad_norm": 2.8043198199156905e-06, + "learning_rate": 1.0916998548409449e-05, + "loss": 0.0, + "num_input_tokens_seen": 15940984, + "step": 26160 + }, + { + "epoch": 7.215940430226144, + "grad_norm": 1.1362524219293846e-06, + "learning_rate": 1.0907057690536577e-05, + "loss": 0.0219, + "num_input_tokens_seen": 15944696, + "step": 26165 + }, + { + "epoch": 7.217319360176503, + "grad_norm": 1.0543623147896142e-06, + "learning_rate": 1.0897120097796131e-05, + "loss": 0.0, + "num_input_tokens_seen": 15948888, + "step": 26170 + }, + { + "epoch": 7.218698290126862, + "grad_norm": 0.00022038294991943985, + "learning_rate": 1.0887185772490533e-05, + "loss": 0.0, + "num_input_tokens_seen": 15953112, + "step": 26175 + }, + { + "epoch": 7.22007722007722, + "grad_norm": 2.7493277229950763e-05, + "learning_rate": 1.0877254716921398e-05, + "loss": 0.0, + "num_input_tokens_seen": 15956408, + "step": 26180 + }, + { + "epoch": 7.221456150027579, + "grad_norm": 1.2239739817232476e-06, + "learning_rate": 1.086732693338963e-05, + "loss": 0.0, + "num_input_tokens_seen": 15959096, + "step": 26185 + }, + { + "epoch": 7.222835079977937, + "grad_norm": 2.590306849015178e-06, + "learning_rate": 1.0857402424195346e-05, + "loss": 0.0, + "num_input_tokens_seen": 15962424, + "step": 26190 + }, + { + "epoch": 7.224214009928295, + "grad_norm": 0.00014495472714770585, + "learning_rate": 1.0847481191637901e-05, + "loss": 0.0, + "num_input_tokens_seen": 15965304, + "step": 26195 + }, + { + "epoch": 7.225592939878654, + "grad_norm": 1.0692922387534054e-06, + "learning_rate": 1.0837563238015924e-05, + "loss": 0.0, + "num_input_tokens_seen": 15967960, + "step": 26200 + }, + { + "epoch": 7.226971869829013, + "grad_norm": 1.1442269851613673e-06, + "learning_rate": 1.0827648565627253e-05, + "loss": 0.0, + "num_input_tokens_seen": 15970488, + "step": 26205 + }, + { + "epoch": 7.2283507997793714, + "grad_norm": 1.4753979939996498e-06, + "learning_rate": 1.081773717676896e-05, + "loss": 0.0, + "num_input_tokens_seen": 15974232, + "step": 26210 + }, + { + "epoch": 7.22972972972973, + "grad_norm": 2.790999815260875e-06, + "learning_rate": 1.0807829073737394e-05, + "loss": 0.0, + "num_input_tokens_seen": 15976600, + "step": 26215 + }, + { + "epoch": 7.231108659680088, + "grad_norm": 4.572807301883586e-05, + "learning_rate": 1.0797924258828104e-05, + "loss": 0.0, + "num_input_tokens_seen": 15979384, + "step": 26220 + }, + { + "epoch": 7.232487589630447, + "grad_norm": 6.80857567658677e-07, + "learning_rate": 1.0788022734335912e-05, + "loss": 0.0, + "num_input_tokens_seen": 15981816, + "step": 26225 + }, + { + "epoch": 7.233866519580805, + "grad_norm": 2.6406189135741442e-05, + "learning_rate": 1.0778124502554834e-05, + "loss": 0.0, + "num_input_tokens_seen": 15984408, + "step": 26230 + }, + { + "epoch": 7.235245449531164, + "grad_norm": 1.6427593436674215e-06, + "learning_rate": 1.0768229565778165e-05, + "loss": 0.0, + "num_input_tokens_seen": 15987096, + "step": 26235 + }, + { + "epoch": 7.236624379481523, + "grad_norm": 0.0002595064870547503, + "learning_rate": 1.0758337926298426e-05, + "loss": 0.0, + "num_input_tokens_seen": 15989368, + "step": 26240 + }, + { + "epoch": 7.238003309431881, + "grad_norm": 3.689436925924383e-05, + "learning_rate": 1.0748449586407358e-05, + "loss": 0.0, + "num_input_tokens_seen": 15991960, + "step": 26245 + }, + { + "epoch": 7.239382239382239, + "grad_norm": 1.5108014395082137e-06, + "learning_rate": 1.0738564548395938e-05, + "loss": 0.0, + "num_input_tokens_seen": 15994968, + "step": 26250 + }, + { + "epoch": 7.240761169332598, + "grad_norm": 0.00020964570285286754, + "learning_rate": 1.0728682814554408e-05, + "loss": 0.0, + "num_input_tokens_seen": 15998168, + "step": 26255 + }, + { + "epoch": 7.242140099282956, + "grad_norm": 8.240286888394621e-07, + "learning_rate": 1.0718804387172213e-05, + "loss": 0.0, + "num_input_tokens_seen": 16001272, + "step": 26260 + }, + { + "epoch": 7.243519029233315, + "grad_norm": 1.2417744983395096e-05, + "learning_rate": 1.0708929268538034e-05, + "loss": 0.0, + "num_input_tokens_seen": 16005176, + "step": 26265 + }, + { + "epoch": 7.244897959183674, + "grad_norm": 9.670226427260786e-05, + "learning_rate": 1.0699057460939812e-05, + "loss": 0.0, + "num_input_tokens_seen": 16008120, + "step": 26270 + }, + { + "epoch": 7.246276889134032, + "grad_norm": 3.544508444974781e-06, + "learning_rate": 1.0689188966664687e-05, + "loss": 0.0, + "num_input_tokens_seen": 16011608, + "step": 26275 + }, + { + "epoch": 7.2476558190843905, + "grad_norm": 1.6291057818307308e-06, + "learning_rate": 1.0679323787999063e-05, + "loss": 0.0, + "num_input_tokens_seen": 16014872, + "step": 26280 + }, + { + "epoch": 7.249034749034749, + "grad_norm": 2.366653234275873e-06, + "learning_rate": 1.0669461927228553e-05, + "loss": 0.0, + "num_input_tokens_seen": 16017848, + "step": 26285 + }, + { + "epoch": 7.250413678985107, + "grad_norm": 7.3313090069859754e-06, + "learning_rate": 1.0659603386637998e-05, + "loss": 0.0, + "num_input_tokens_seen": 16020312, + "step": 26290 + }, + { + "epoch": 7.251792608935466, + "grad_norm": 9.13601616048254e-06, + "learning_rate": 1.064974816851149e-05, + "loss": 0.0, + "num_input_tokens_seen": 16023736, + "step": 26295 + }, + { + "epoch": 7.253171538885825, + "grad_norm": 1.1426612900322652e-06, + "learning_rate": 1.063989627513235e-05, + "loss": 0.0003, + "num_input_tokens_seen": 16026840, + "step": 26300 + }, + { + "epoch": 7.254550468836183, + "grad_norm": 1.3193501899877447e-06, + "learning_rate": 1.0630047708783106e-05, + "loss": 0.0, + "num_input_tokens_seen": 16029560, + "step": 26305 + }, + { + "epoch": 7.255929398786542, + "grad_norm": 1.0474100236024242e-05, + "learning_rate": 1.0620202471745541e-05, + "loss": 0.0, + "num_input_tokens_seen": 16033208, + "step": 26310 + }, + { + "epoch": 7.2573083287369, + "grad_norm": 5.001596946385689e-05, + "learning_rate": 1.0610360566300651e-05, + "loss": 0.0, + "num_input_tokens_seen": 16036120, + "step": 26315 + }, + { + "epoch": 7.258687258687258, + "grad_norm": 6.998395292612258e-06, + "learning_rate": 1.0600521994728655e-05, + "loss": 0.0, + "num_input_tokens_seen": 16038616, + "step": 26320 + }, + { + "epoch": 7.260066188637618, + "grad_norm": 4.982048722013133e-06, + "learning_rate": 1.0590686759309023e-05, + "loss": 0.0, + "num_input_tokens_seen": 16042392, + "step": 26325 + }, + { + "epoch": 7.261445118587976, + "grad_norm": 3.682705937535502e-06, + "learning_rate": 1.0580854862320428e-05, + "loss": 0.0, + "num_input_tokens_seen": 16044728, + "step": 26330 + }, + { + "epoch": 7.262824048538334, + "grad_norm": 2.498252797522582e-06, + "learning_rate": 1.0571026306040788e-05, + "loss": 0.0, + "num_input_tokens_seen": 16047640, + "step": 26335 + }, + { + "epoch": 7.264202978488693, + "grad_norm": 119.77142333984375, + "learning_rate": 1.0561201092747234e-05, + "loss": 0.043, + "num_input_tokens_seen": 16050264, + "step": 26340 + }, + { + "epoch": 7.265581908439051, + "grad_norm": 0.000708308769389987, + "learning_rate": 1.0551379224716116e-05, + "loss": 0.0, + "num_input_tokens_seen": 16053240, + "step": 26345 + }, + { + "epoch": 7.2669608383894095, + "grad_norm": 8.48800118546933e-06, + "learning_rate": 1.0541560704223038e-05, + "loss": 0.0, + "num_input_tokens_seen": 16056376, + "step": 26350 + }, + { + "epoch": 7.268339768339768, + "grad_norm": 2.5027195533766644e-06, + "learning_rate": 1.0531745533542805e-05, + "loss": 0.0, + "num_input_tokens_seen": 16059512, + "step": 26355 + }, + { + "epoch": 7.269718698290127, + "grad_norm": 4.621875632437877e-06, + "learning_rate": 1.0521933714949439e-05, + "loss": 0.0, + "num_input_tokens_seen": 16061688, + "step": 26360 + }, + { + "epoch": 7.2710976282404856, + "grad_norm": 4.5236035361995164e-07, + "learning_rate": 1.0512125250716207e-05, + "loss": 0.0, + "num_input_tokens_seen": 16064824, + "step": 26365 + }, + { + "epoch": 7.272476558190844, + "grad_norm": 1.1254453511355678e-06, + "learning_rate": 1.0502320143115599e-05, + "loss": 0.0, + "num_input_tokens_seen": 16068952, + "step": 26370 + }, + { + "epoch": 7.273855488141202, + "grad_norm": 1.100788836083666e-06, + "learning_rate": 1.0492518394419299e-05, + "loss": 0.0, + "num_input_tokens_seen": 16072632, + "step": 26375 + }, + { + "epoch": 7.275234418091561, + "grad_norm": 4.444450041773962e-06, + "learning_rate": 1.0482720006898247e-05, + "loss": 0.0, + "num_input_tokens_seen": 16074936, + "step": 26380 + }, + { + "epoch": 7.276613348041919, + "grad_norm": 8.806729852040007e-07, + "learning_rate": 1.0472924982822574e-05, + "loss": 0.0, + "num_input_tokens_seen": 16077752, + "step": 26385 + }, + { + "epoch": 7.277992277992278, + "grad_norm": 0.00025087068206630647, + "learning_rate": 1.0463133324461666e-05, + "loss": 0.0, + "num_input_tokens_seen": 16080952, + "step": 26390 + }, + { + "epoch": 7.279371207942637, + "grad_norm": 0.020466910675168037, + "learning_rate": 1.0453345034084098e-05, + "loss": 0.0, + "num_input_tokens_seen": 16083736, + "step": 26395 + }, + { + "epoch": 7.280750137892995, + "grad_norm": 1.28500312257529e-06, + "learning_rate": 1.0443560113957665e-05, + "loss": 0.0, + "num_input_tokens_seen": 16086488, + "step": 26400 + }, + { + "epoch": 7.2821290678433535, + "grad_norm": 1.2672750926867593e-05, + "learning_rate": 1.0433778566349417e-05, + "loss": 0.0, + "num_input_tokens_seen": 16089560, + "step": 26405 + }, + { + "epoch": 7.283507997793712, + "grad_norm": 1.0982803360093385e-06, + "learning_rate": 1.0424000393525582e-05, + "loss": 0.0, + "num_input_tokens_seen": 16092632, + "step": 26410 + }, + { + "epoch": 7.28488692774407, + "grad_norm": 6.46136982140888e-07, + "learning_rate": 1.041422559775162e-05, + "loss": 0.0, + "num_input_tokens_seen": 16096824, + "step": 26415 + }, + { + "epoch": 7.2862658576944295, + "grad_norm": 6.771561515961366e-07, + "learning_rate": 1.0404454181292223e-05, + "loss": 0.0, + "num_input_tokens_seen": 16099704, + "step": 26420 + }, + { + "epoch": 7.287644787644788, + "grad_norm": 2.0108396711293608e-05, + "learning_rate": 1.0394686146411272e-05, + "loss": 0.0, + "num_input_tokens_seen": 16103096, + "step": 26425 + }, + { + "epoch": 7.289023717595146, + "grad_norm": 8.703664207132533e-06, + "learning_rate": 1.0384921495371902e-05, + "loss": 0.0, + "num_input_tokens_seen": 16105880, + "step": 26430 + }, + { + "epoch": 7.290402647545505, + "grad_norm": 5.943098949501291e-05, + "learning_rate": 1.037516023043642e-05, + "loss": 0.0, + "num_input_tokens_seen": 16109592, + "step": 26435 + }, + { + "epoch": 7.291781577495863, + "grad_norm": 1.1303349083391367e-06, + "learning_rate": 1.0365402353866382e-05, + "loss": 0.0, + "num_input_tokens_seen": 16112824, + "step": 26440 + }, + { + "epoch": 7.293160507446221, + "grad_norm": 1.235009904121398e-06, + "learning_rate": 1.0355647867922558e-05, + "loss": 0.0, + "num_input_tokens_seen": 16115672, + "step": 26445 + }, + { + "epoch": 7.294539437396581, + "grad_norm": 9.420400033377518e-07, + "learning_rate": 1.0345896774864911e-05, + "loss": 0.0, + "num_input_tokens_seen": 16119032, + "step": 26450 + }, + { + "epoch": 7.295918367346939, + "grad_norm": 9.477605090069119e-06, + "learning_rate": 1.033614907695262e-05, + "loss": 0.0, + "num_input_tokens_seen": 16121656, + "step": 26455 + }, + { + "epoch": 7.297297297297297, + "grad_norm": 1.6442869537058868e-06, + "learning_rate": 1.0326404776444104e-05, + "loss": 0.0, + "num_input_tokens_seen": 16124952, + "step": 26460 + }, + { + "epoch": 7.298676227247656, + "grad_norm": 5.238436187937623e-06, + "learning_rate": 1.0316663875596963e-05, + "loss": 0.0, + "num_input_tokens_seen": 16127544, + "step": 26465 + }, + { + "epoch": 7.300055157198014, + "grad_norm": 2.9828527203790145e-06, + "learning_rate": 1.030692637666804e-05, + "loss": 0.0, + "num_input_tokens_seen": 16130360, + "step": 26470 + }, + { + "epoch": 7.3014340871483725, + "grad_norm": 6.603905262636545e-07, + "learning_rate": 1.0297192281913368e-05, + "loss": 0.0, + "num_input_tokens_seen": 16133432, + "step": 26475 + }, + { + "epoch": 7.302813017098732, + "grad_norm": 9.999246231018333e-07, + "learning_rate": 1.0287461593588183e-05, + "loss": 0.0, + "num_input_tokens_seen": 16136952, + "step": 26480 + }, + { + "epoch": 7.30419194704909, + "grad_norm": 3.413470767554827e-05, + "learning_rate": 1.0277734313946963e-05, + "loss": 0.0, + "num_input_tokens_seen": 16139576, + "step": 26485 + }, + { + "epoch": 7.3055708769994485, + "grad_norm": 7.724519264229457e-07, + "learning_rate": 1.0268010445243375e-05, + "loss": 0.0, + "num_input_tokens_seen": 16142360, + "step": 26490 + }, + { + "epoch": 7.306949806949807, + "grad_norm": 1.1693138048940455e-06, + "learning_rate": 1.0258289989730289e-05, + "loss": 0.0, + "num_input_tokens_seen": 16145400, + "step": 26495 + }, + { + "epoch": 7.308328736900165, + "grad_norm": 7.233837095554918e-05, + "learning_rate": 1.0248572949659812e-05, + "loss": 0.0, + "num_input_tokens_seen": 16147640, + "step": 26500 + }, + { + "epoch": 7.309707666850524, + "grad_norm": 1.519858756182657e-06, + "learning_rate": 1.0238859327283227e-05, + "loss": 0.0, + "num_input_tokens_seen": 16150552, + "step": 26505 + }, + { + "epoch": 7.311086596800883, + "grad_norm": 7.303729603336251e-07, + "learning_rate": 1.0229149124851048e-05, + "loss": 0.0, + "num_input_tokens_seen": 16153880, + "step": 26510 + }, + { + "epoch": 7.312465526751241, + "grad_norm": 1.4636225387221202e-05, + "learning_rate": 1.0219442344612997e-05, + "loss": 0.0, + "num_input_tokens_seen": 16156184, + "step": 26515 + }, + { + "epoch": 7.3138444567016, + "grad_norm": 1.8200922568212263e-05, + "learning_rate": 1.0209738988817979e-05, + "loss": 0.0, + "num_input_tokens_seen": 16158904, + "step": 26520 + }, + { + "epoch": 7.315223386651958, + "grad_norm": 0.00028713347273878753, + "learning_rate": 1.0200039059714143e-05, + "loss": 0.0, + "num_input_tokens_seen": 16161528, + "step": 26525 + }, + { + "epoch": 7.316602316602316, + "grad_norm": 5.17019032031385e-07, + "learning_rate": 1.019034255954881e-05, + "loss": 0.0, + "num_input_tokens_seen": 16163544, + "step": 26530 + }, + { + "epoch": 7.317981246552675, + "grad_norm": 7.224449518616893e-07, + "learning_rate": 1.0180649490568514e-05, + "loss": 0.0, + "num_input_tokens_seen": 16167576, + "step": 26535 + }, + { + "epoch": 7.319360176503034, + "grad_norm": 5.2246807172195986e-05, + "learning_rate": 1.0170959855019015e-05, + "loss": 0.0, + "num_input_tokens_seen": 16170488, + "step": 26540 + }, + { + "epoch": 7.320739106453392, + "grad_norm": 7.55503549498826e-07, + "learning_rate": 1.016127365514526e-05, + "loss": 0.0, + "num_input_tokens_seen": 16173400, + "step": 26545 + }, + { + "epoch": 7.322118036403751, + "grad_norm": 6.118547162259347e-07, + "learning_rate": 1.0151590893191385e-05, + "loss": 0.0, + "num_input_tokens_seen": 16176984, + "step": 26550 + }, + { + "epoch": 7.323496966354109, + "grad_norm": 2.340090213692747e-05, + "learning_rate": 1.014191157140077e-05, + "loss": 0.0, + "num_input_tokens_seen": 16180536, + "step": 26555 + }, + { + "epoch": 7.324875896304468, + "grad_norm": 3.2405579986516386e-05, + "learning_rate": 1.0132235692015953e-05, + "loss": 0.0, + "num_input_tokens_seen": 16182872, + "step": 26560 + }, + { + "epoch": 7.326254826254826, + "grad_norm": 5.493014327839774e-07, + "learning_rate": 1.012256325727872e-05, + "loss": 0.0, + "num_input_tokens_seen": 16185816, + "step": 26565 + }, + { + "epoch": 7.327633756205184, + "grad_norm": 8.578792289881676e-07, + "learning_rate": 1.0112894269430015e-05, + "loss": 0.0, + "num_input_tokens_seen": 16188344, + "step": 26570 + }, + { + "epoch": 7.329012686155544, + "grad_norm": 5.261597948447161e-07, + "learning_rate": 1.0103228730710018e-05, + "loss": 0.0, + "num_input_tokens_seen": 16191736, + "step": 26575 + }, + { + "epoch": 7.330391616105902, + "grad_norm": 2.193875616285368e-06, + "learning_rate": 1.0093566643358083e-05, + "loss": 0.0, + "num_input_tokens_seen": 16195192, + "step": 26580 + }, + { + "epoch": 7.33177054605626, + "grad_norm": 1.3000739045310183e-06, + "learning_rate": 1.0083908009612791e-05, + "loss": 0.0, + "num_input_tokens_seen": 16197912, + "step": 26585 + }, + { + "epoch": 7.333149476006619, + "grad_norm": 1.0435526291985298e-06, + "learning_rate": 1.0074252831711891e-05, + "loss": 0.0, + "num_input_tokens_seen": 16200632, + "step": 26590 + }, + { + "epoch": 7.334528405956977, + "grad_norm": 2.6702807645051507e-06, + "learning_rate": 1.0064601111892371e-05, + "loss": 0.0, + "num_input_tokens_seen": 16205784, + "step": 26595 + }, + { + "epoch": 7.335907335907336, + "grad_norm": 1.5504806469834875e-06, + "learning_rate": 1.0054952852390385e-05, + "loss": 0.0, + "num_input_tokens_seen": 16208920, + "step": 26600 + }, + { + "epoch": 7.337286265857695, + "grad_norm": 1.0056855899165384e-05, + "learning_rate": 1.0045308055441285e-05, + "loss": 0.0, + "num_input_tokens_seen": 16211448, + "step": 26605 + }, + { + "epoch": 7.338665195808053, + "grad_norm": 1.0974266615448869e-06, + "learning_rate": 1.0035666723279651e-05, + "loss": 0.0, + "num_input_tokens_seen": 16215864, + "step": 26610 + }, + { + "epoch": 7.3400441257584115, + "grad_norm": 2.873118774004979e-06, + "learning_rate": 1.0026028858139225e-05, + "loss": 0.0, + "num_input_tokens_seen": 16218648, + "step": 26615 + }, + { + "epoch": 7.34142305570877, + "grad_norm": 2.4135890726029174e-06, + "learning_rate": 1.0016394462252979e-05, + "loss": 0.0, + "num_input_tokens_seen": 16220888, + "step": 26620 + }, + { + "epoch": 7.342801985659128, + "grad_norm": 1.4619763533119112e-06, + "learning_rate": 1.0006763537853053e-05, + "loss": 0.0, + "num_input_tokens_seen": 16223704, + "step": 26625 + }, + { + "epoch": 7.344180915609487, + "grad_norm": 1.842039068833401e-06, + "learning_rate": 9.997136087170791e-06, + "loss": 0.0, + "num_input_tokens_seen": 16226552, + "step": 26630 + }, + { + "epoch": 7.345559845559846, + "grad_norm": 1.3002094192415825e-06, + "learning_rate": 9.987512112436744e-06, + "loss": 0.0, + "num_input_tokens_seen": 16229240, + "step": 26635 + }, + { + "epoch": 7.346938775510204, + "grad_norm": 3.4252768728038063e-06, + "learning_rate": 9.977891615880641e-06, + "loss": 0.0, + "num_input_tokens_seen": 16232024, + "step": 26640 + }, + { + "epoch": 7.348317705460563, + "grad_norm": 7.309212151085376e-07, + "learning_rate": 9.968274599731425e-06, + "loss": 0.0, + "num_input_tokens_seen": 16235160, + "step": 26645 + }, + { + "epoch": 7.349696635410921, + "grad_norm": 2.2013132365827914e-06, + "learning_rate": 9.9586610662172e-06, + "loss": 0.0, + "num_input_tokens_seen": 16237816, + "step": 26650 + }, + { + "epoch": 7.351075565361279, + "grad_norm": 7.385123126368853e-07, + "learning_rate": 9.9490510175653e-06, + "loss": 0.0, + "num_input_tokens_seen": 16240472, + "step": 26655 + }, + { + "epoch": 7.352454495311639, + "grad_norm": 7.380261877187877e-07, + "learning_rate": 9.939444456002239e-06, + "loss": 0.0, + "num_input_tokens_seen": 16243352, + "step": 26660 + }, + { + "epoch": 7.353833425261997, + "grad_norm": 0.00025606006965972483, + "learning_rate": 9.929841383753713e-06, + "loss": 0.0, + "num_input_tokens_seen": 16246552, + "step": 26665 + }, + { + "epoch": 7.355212355212355, + "grad_norm": 1.1809005400209571e-06, + "learning_rate": 9.920241803044606e-06, + "loss": 0.0, + "num_input_tokens_seen": 16249400, + "step": 26670 + }, + { + "epoch": 7.356591285162714, + "grad_norm": 9.684806627774378e-07, + "learning_rate": 9.910645716099017e-06, + "loss": 0.0, + "num_input_tokens_seen": 16251864, + "step": 26675 + }, + { + "epoch": 7.357970215113072, + "grad_norm": 7.773797960908269e-07, + "learning_rate": 9.901053125140223e-06, + "loss": 0.0, + "num_input_tokens_seen": 16257240, + "step": 26680 + }, + { + "epoch": 7.3593491450634305, + "grad_norm": 9.905631486617494e-06, + "learning_rate": 9.89146403239067e-06, + "loss": 0.0, + "num_input_tokens_seen": 16260376, + "step": 26685 + }, + { + "epoch": 7.360728075013789, + "grad_norm": 1.201911800308153e-06, + "learning_rate": 9.881878440072037e-06, + "loss": 0.0, + "num_input_tokens_seen": 16264024, + "step": 26690 + }, + { + "epoch": 7.362107004964148, + "grad_norm": 1.3980820767756086e-06, + "learning_rate": 9.872296350405154e-06, + "loss": 0.0, + "num_input_tokens_seen": 16267000, + "step": 26695 + }, + { + "epoch": 7.3634859349145065, + "grad_norm": 4.085536147613311e-06, + "learning_rate": 9.862717765610055e-06, + "loss": 0.0, + "num_input_tokens_seen": 16269528, + "step": 26700 + }, + { + "epoch": 7.364864864864865, + "grad_norm": 8.607572681285092e-07, + "learning_rate": 9.853142687905967e-06, + "loss": 0.0, + "num_input_tokens_seen": 16272472, + "step": 26705 + }, + { + "epoch": 7.366243794815223, + "grad_norm": 3.6013290809933096e-05, + "learning_rate": 9.843571119511286e-06, + "loss": 0.0, + "num_input_tokens_seen": 16275032, + "step": 26710 + }, + { + "epoch": 7.367622724765582, + "grad_norm": 3.0338023861986585e-05, + "learning_rate": 9.834003062643615e-06, + "loss": 0.0, + "num_input_tokens_seen": 16278616, + "step": 26715 + }, + { + "epoch": 7.36900165471594, + "grad_norm": 9.440030908081098e-07, + "learning_rate": 9.824438519519743e-06, + "loss": 0.0, + "num_input_tokens_seen": 16281496, + "step": 26720 + }, + { + "epoch": 7.370380584666299, + "grad_norm": 1.57347153617593e-06, + "learning_rate": 9.814877492355622e-06, + "loss": 0.0, + "num_input_tokens_seen": 16285048, + "step": 26725 + }, + { + "epoch": 7.371759514616658, + "grad_norm": 1.4150446077110246e-06, + "learning_rate": 9.80531998336642e-06, + "loss": 0.0, + "num_input_tokens_seen": 16288920, + "step": 26730 + }, + { + "epoch": 7.373138444567016, + "grad_norm": 5.616726070911682e-07, + "learning_rate": 9.79576599476647e-06, + "loss": 0.0, + "num_input_tokens_seen": 16292152, + "step": 26735 + }, + { + "epoch": 7.374517374517374, + "grad_norm": 1.1552684782145661e-06, + "learning_rate": 9.78621552876928e-06, + "loss": 0.0, + "num_input_tokens_seen": 16294520, + "step": 26740 + }, + { + "epoch": 7.375896304467733, + "grad_norm": 1.3396950180322165e-06, + "learning_rate": 9.776668587587575e-06, + "loss": 0.0, + "num_input_tokens_seen": 16297560, + "step": 26745 + }, + { + "epoch": 7.377275234418091, + "grad_norm": 6.335964371828595e-06, + "learning_rate": 9.767125173433228e-06, + "loss": 0.0, + "num_input_tokens_seen": 16302872, + "step": 26750 + }, + { + "epoch": 7.3786541643684505, + "grad_norm": 7.643184289918281e-06, + "learning_rate": 9.757585288517328e-06, + "loss": 0.0, + "num_input_tokens_seen": 16306232, + "step": 26755 + }, + { + "epoch": 7.380033094318809, + "grad_norm": 1.1055030881834682e-06, + "learning_rate": 9.748048935050121e-06, + "loss": 0.0, + "num_input_tokens_seen": 16310328, + "step": 26760 + }, + { + "epoch": 7.381412024269167, + "grad_norm": 5.358572252589511e-06, + "learning_rate": 9.738516115241034e-06, + "loss": 0.0, + "num_input_tokens_seen": 16313592, + "step": 26765 + }, + { + "epoch": 7.382790954219526, + "grad_norm": 1.6967715055216104e-06, + "learning_rate": 9.728986831298703e-06, + "loss": 0.0, + "num_input_tokens_seen": 16319032, + "step": 26770 + }, + { + "epoch": 7.384169884169884, + "grad_norm": 9.286813451581111e-07, + "learning_rate": 9.719461085430917e-06, + "loss": 0.0, + "num_input_tokens_seen": 16321400, + "step": 26775 + }, + { + "epoch": 7.385548814120242, + "grad_norm": 4.587140210787766e-06, + "learning_rate": 9.709938879844643e-06, + "loss": 0.0, + "num_input_tokens_seen": 16323640, + "step": 26780 + }, + { + "epoch": 7.386927744070602, + "grad_norm": 7.949937753437553e-06, + "learning_rate": 9.70042021674605e-06, + "loss": 0.0, + "num_input_tokens_seen": 16327736, + "step": 26785 + }, + { + "epoch": 7.38830667402096, + "grad_norm": 1.0811090760398656e-06, + "learning_rate": 9.690905098340489e-06, + "loss": 0.0, + "num_input_tokens_seen": 16331512, + "step": 26790 + }, + { + "epoch": 7.389685603971318, + "grad_norm": 1.3944556485512294e-05, + "learning_rate": 9.681393526832453e-06, + "loss": 0.0, + "num_input_tokens_seen": 16333976, + "step": 26795 + }, + { + "epoch": 7.391064533921677, + "grad_norm": 0.00013757306442130357, + "learning_rate": 9.671885504425654e-06, + "loss": 0.0, + "num_input_tokens_seen": 16337016, + "step": 26800 + }, + { + "epoch": 7.392443463872035, + "grad_norm": 1.5221329476844403e-06, + "learning_rate": 9.662381033322951e-06, + "loss": 0.0, + "num_input_tokens_seen": 16339928, + "step": 26805 + }, + { + "epoch": 7.3938223938223935, + "grad_norm": 4.4942393628844e-07, + "learning_rate": 9.652880115726407e-06, + "loss": 0.0, + "num_input_tokens_seen": 16343384, + "step": 26810 + }, + { + "epoch": 7.395201323772753, + "grad_norm": 9.327813131676521e-06, + "learning_rate": 9.64338275383724e-06, + "loss": 0.0, + "num_input_tokens_seen": 16345912, + "step": 26815 + }, + { + "epoch": 7.396580253723111, + "grad_norm": 2.476333747836179e-06, + "learning_rate": 9.633888949855849e-06, + "loss": 0.0, + "num_input_tokens_seen": 16348504, + "step": 26820 + }, + { + "epoch": 7.3979591836734695, + "grad_norm": 5.908700131840305e-06, + "learning_rate": 9.624398705981821e-06, + "loss": 0.0, + "num_input_tokens_seen": 16351928, + "step": 26825 + }, + { + "epoch": 7.399338113623828, + "grad_norm": 1.872128223112668e-06, + "learning_rate": 9.614912024413908e-06, + "loss": 0.0, + "num_input_tokens_seen": 16354904, + "step": 26830 + }, + { + "epoch": 7.400717043574186, + "grad_norm": 1.6686385606590193e-06, + "learning_rate": 9.605428907350026e-06, + "loss": 0.0, + "num_input_tokens_seen": 16358008, + "step": 26835 + }, + { + "epoch": 7.402095973524545, + "grad_norm": 9.108781569011626e-07, + "learning_rate": 9.595949356987294e-06, + "loss": 0.0, + "num_input_tokens_seen": 16361368, + "step": 26840 + }, + { + "epoch": 7.403474903474904, + "grad_norm": 6.144351232251211e-07, + "learning_rate": 9.58647337552197e-06, + "loss": 0.0, + "num_input_tokens_seen": 16364248, + "step": 26845 + }, + { + "epoch": 7.404853833425262, + "grad_norm": 6.798606136726448e-06, + "learning_rate": 9.577000965149523e-06, + "loss": 0.0, + "num_input_tokens_seen": 16366616, + "step": 26850 + }, + { + "epoch": 7.406232763375621, + "grad_norm": 0.0002498811518307775, + "learning_rate": 9.567532128064555e-06, + "loss": 0.0, + "num_input_tokens_seen": 16369112, + "step": 26855 + }, + { + "epoch": 7.407611693325979, + "grad_norm": 1.6305892131640576e-05, + "learning_rate": 9.558066866460869e-06, + "loss": 0.0, + "num_input_tokens_seen": 16372088, + "step": 26860 + }, + { + "epoch": 7.408990623276337, + "grad_norm": 8.215642992581706e-06, + "learning_rate": 9.548605182531439e-06, + "loss": 0.0, + "num_input_tokens_seen": 16375768, + "step": 26865 + }, + { + "epoch": 7.410369553226696, + "grad_norm": 6.223073114597355e-07, + "learning_rate": 9.53914707846839e-06, + "loss": 0.0, + "num_input_tokens_seen": 16378904, + "step": 26870 + }, + { + "epoch": 7.411748483177055, + "grad_norm": 6.76007630318054e-07, + "learning_rate": 9.529692556463024e-06, + "loss": 0.0, + "num_input_tokens_seen": 16381368, + "step": 26875 + }, + { + "epoch": 7.413127413127413, + "grad_norm": 0.00017192481027450413, + "learning_rate": 9.520241618705836e-06, + "loss": 0.0, + "num_input_tokens_seen": 16383960, + "step": 26880 + }, + { + "epoch": 7.414506343077772, + "grad_norm": 2.0997508727305103e-06, + "learning_rate": 9.51079426738646e-06, + "loss": 0.1521, + "num_input_tokens_seen": 16386488, + "step": 26885 + }, + { + "epoch": 7.41588527302813, + "grad_norm": 9.830604312810465e-07, + "learning_rate": 9.501350504693704e-06, + "loss": 0.0, + "num_input_tokens_seen": 16388728, + "step": 26890 + }, + { + "epoch": 7.4172642029784885, + "grad_norm": 2.827315938702668e-06, + "learning_rate": 9.491910332815573e-06, + "loss": 0.0, + "num_input_tokens_seen": 16391832, + "step": 26895 + }, + { + "epoch": 7.418643132928847, + "grad_norm": 7.755492674732523e-07, + "learning_rate": 9.4824737539392e-06, + "loss": 0.0002, + "num_input_tokens_seen": 16394840, + "step": 26900 + }, + { + "epoch": 7.420022062879205, + "grad_norm": 3.241695594624616e-06, + "learning_rate": 9.473040770250918e-06, + "loss": 0.0, + "num_input_tokens_seen": 16397976, + "step": 26905 + }, + { + "epoch": 7.421400992829565, + "grad_norm": 5.2399154810700566e-05, + "learning_rate": 9.463611383936213e-06, + "loss": 0.0, + "num_input_tokens_seen": 16402264, + "step": 26910 + }, + { + "epoch": 7.422779922779923, + "grad_norm": 5.090244030725444e-06, + "learning_rate": 9.454185597179725e-06, + "loss": 0.0, + "num_input_tokens_seen": 16405208, + "step": 26915 + }, + { + "epoch": 7.424158852730281, + "grad_norm": 5.194197456148686e-06, + "learning_rate": 9.444763412165289e-06, + "loss": 0.0, + "num_input_tokens_seen": 16407704, + "step": 26920 + }, + { + "epoch": 7.42553778268064, + "grad_norm": 3.3341118523821933e-06, + "learning_rate": 9.43534483107588e-06, + "loss": 0.0, + "num_input_tokens_seen": 16410264, + "step": 26925 + }, + { + "epoch": 7.426916712630998, + "grad_norm": 2.6653581244318048e-06, + "learning_rate": 9.425929856093647e-06, + "loss": 0.0, + "num_input_tokens_seen": 16412856, + "step": 26930 + }, + { + "epoch": 7.428295642581357, + "grad_norm": 1.4204860235622618e-05, + "learning_rate": 9.416518489399919e-06, + "loss": 0.0, + "num_input_tokens_seen": 16416248, + "step": 26935 + }, + { + "epoch": 7.429674572531716, + "grad_norm": 1.3098358977003954e-05, + "learning_rate": 9.407110733175167e-06, + "loss": 0.0, + "num_input_tokens_seen": 16419320, + "step": 26940 + }, + { + "epoch": 7.431053502482074, + "grad_norm": 4.373001866042614e-05, + "learning_rate": 9.397706589599022e-06, + "loss": 0.0, + "num_input_tokens_seen": 16422616, + "step": 26945 + }, + { + "epoch": 7.4324324324324325, + "grad_norm": 3.801079401455354e-06, + "learning_rate": 9.388306060850308e-06, + "loss": 0.0, + "num_input_tokens_seen": 16425496, + "step": 26950 + }, + { + "epoch": 7.433811362382791, + "grad_norm": 4.373547199065797e-06, + "learning_rate": 9.378909149106973e-06, + "loss": 0.0, + "num_input_tokens_seen": 16429592, + "step": 26955 + }, + { + "epoch": 7.435190292333149, + "grad_norm": 1.8270936834596796e-06, + "learning_rate": 9.369515856546166e-06, + "loss": 0.0, + "num_input_tokens_seen": 16432248, + "step": 26960 + }, + { + "epoch": 7.436569222283508, + "grad_norm": 2.4590635803178884e-05, + "learning_rate": 9.360126185344168e-06, + "loss": 0.0, + "num_input_tokens_seen": 16435000, + "step": 26965 + }, + { + "epoch": 7.437948152233867, + "grad_norm": 0.0004872600839007646, + "learning_rate": 9.350740137676423e-06, + "loss": 0.0, + "num_input_tokens_seen": 16437720, + "step": 26970 + }, + { + "epoch": 7.439327082184225, + "grad_norm": 1.127385621657595e-05, + "learning_rate": 9.34135771571756e-06, + "loss": 0.0, + "num_input_tokens_seen": 16440824, + "step": 26975 + }, + { + "epoch": 7.440706012134584, + "grad_norm": 1.5845416783122346e-05, + "learning_rate": 9.331978921641344e-06, + "loss": 0.0, + "num_input_tokens_seen": 16444536, + "step": 26980 + }, + { + "epoch": 7.442084942084942, + "grad_norm": 7.29104649508372e-05, + "learning_rate": 9.3226037576207e-06, + "loss": 0.0, + "num_input_tokens_seen": 16447384, + "step": 26985 + }, + { + "epoch": 7.4434638720353, + "grad_norm": 1.1837254305646638e-06, + "learning_rate": 9.313232225827724e-06, + "loss": 0.0, + "num_input_tokens_seen": 16449656, + "step": 26990 + }, + { + "epoch": 7.444842801985659, + "grad_norm": 2.0319075701991096e-06, + "learning_rate": 9.303864328433673e-06, + "loss": 0.0, + "num_input_tokens_seen": 16452376, + "step": 26995 + }, + { + "epoch": 7.446221731936018, + "grad_norm": 1.1061099485232262e-06, + "learning_rate": 9.29450006760894e-06, + "loss": 0.0, + "num_input_tokens_seen": 16455640, + "step": 27000 + }, + { + "epoch": 7.447600661886376, + "grad_norm": 1.100037570722634e-05, + "learning_rate": 9.285139445523108e-06, + "loss": 0.0, + "num_input_tokens_seen": 16458424, + "step": 27005 + }, + { + "epoch": 7.448979591836735, + "grad_norm": 8.784091733105015e-06, + "learning_rate": 9.275782464344879e-06, + "loss": 0.0, + "num_input_tokens_seen": 16460568, + "step": 27010 + }, + { + "epoch": 7.450358521787093, + "grad_norm": 6.343604127323488e-06, + "learning_rate": 9.266429126242152e-06, + "loss": 0.0, + "num_input_tokens_seen": 16463448, + "step": 27015 + }, + { + "epoch": 7.4517374517374515, + "grad_norm": 0.0001855692098615691, + "learning_rate": 9.257079433381948e-06, + "loss": 0.0, + "num_input_tokens_seen": 16466296, + "step": 27020 + }, + { + "epoch": 7.45311638168781, + "grad_norm": 2.046066356342635e-06, + "learning_rate": 9.247733387930451e-06, + "loss": 0.0, + "num_input_tokens_seen": 16470168, + "step": 27025 + }, + { + "epoch": 7.454495311638169, + "grad_norm": 1.4868179505356238e-06, + "learning_rate": 9.238390992053023e-06, + "loss": 0.0, + "num_input_tokens_seen": 16472952, + "step": 27030 + }, + { + "epoch": 7.4558742415885275, + "grad_norm": 0.00046654028119519353, + "learning_rate": 9.229052247914153e-06, + "loss": 0.0, + "num_input_tokens_seen": 16477176, + "step": 27035 + }, + { + "epoch": 7.457253171538886, + "grad_norm": 0.0005231545073911548, + "learning_rate": 9.219717157677483e-06, + "loss": 0.0, + "num_input_tokens_seen": 16480056, + "step": 27040 + }, + { + "epoch": 7.458632101489244, + "grad_norm": 8.285375952254981e-05, + "learning_rate": 9.210385723505841e-06, + "loss": 0.0, + "num_input_tokens_seen": 16483000, + "step": 27045 + }, + { + "epoch": 7.460011031439603, + "grad_norm": 4.520165475696558e-06, + "learning_rate": 9.201057947561167e-06, + "loss": 0.0, + "num_input_tokens_seen": 16485304, + "step": 27050 + }, + { + "epoch": 7.461389961389961, + "grad_norm": 0.0005597668350674212, + "learning_rate": 9.191733832004593e-06, + "loss": 0.0, + "num_input_tokens_seen": 16489560, + "step": 27055 + }, + { + "epoch": 7.46276889134032, + "grad_norm": 1.7076201402232982e-05, + "learning_rate": 9.18241337899636e-06, + "loss": 0.0, + "num_input_tokens_seen": 16494072, + "step": 27060 + }, + { + "epoch": 7.464147821290679, + "grad_norm": 4.532069942797534e-05, + "learning_rate": 9.173096590695898e-06, + "loss": 0.0, + "num_input_tokens_seen": 16497752, + "step": 27065 + }, + { + "epoch": 7.465526751241037, + "grad_norm": 1.4344510645969422e-06, + "learning_rate": 9.163783469261774e-06, + "loss": 0.0, + "num_input_tokens_seen": 16500792, + "step": 27070 + }, + { + "epoch": 7.466905681191395, + "grad_norm": 8.360706260646111e-07, + "learning_rate": 9.154474016851706e-06, + "loss": 0.0, + "num_input_tokens_seen": 16505144, + "step": 27075 + }, + { + "epoch": 7.468284611141754, + "grad_norm": 4.843838178203441e-05, + "learning_rate": 9.145168235622542e-06, + "loss": 0.0, + "num_input_tokens_seen": 16507576, + "step": 27080 + }, + { + "epoch": 7.469663541092112, + "grad_norm": 6.565498551935889e-06, + "learning_rate": 9.135866127730323e-06, + "loss": 0.0, + "num_input_tokens_seen": 16509816, + "step": 27085 + }, + { + "epoch": 7.471042471042471, + "grad_norm": 0.0001120611559599638, + "learning_rate": 9.126567695330188e-06, + "loss": 0.0, + "num_input_tokens_seen": 16512376, + "step": 27090 + }, + { + "epoch": 7.47242140099283, + "grad_norm": 5.668275662173983e-06, + "learning_rate": 9.117272940576478e-06, + "loss": 0.0, + "num_input_tokens_seen": 16515576, + "step": 27095 + }, + { + "epoch": 7.473800330943188, + "grad_norm": 1.0336224249840598e-06, + "learning_rate": 9.107981865622637e-06, + "loss": 0.0, + "num_input_tokens_seen": 16518232, + "step": 27100 + }, + { + "epoch": 7.475179260893547, + "grad_norm": 4.112830538360868e-06, + "learning_rate": 9.098694472621269e-06, + "loss": 0.0855, + "num_input_tokens_seen": 16521016, + "step": 27105 + }, + { + "epoch": 7.476558190843905, + "grad_norm": 7.702135462750448e-07, + "learning_rate": 9.089410763724145e-06, + "loss": 0.0, + "num_input_tokens_seen": 16524728, + "step": 27110 + }, + { + "epoch": 7.477937120794263, + "grad_norm": 2.0660295376728754e-06, + "learning_rate": 9.080130741082163e-06, + "loss": 0.0, + "num_input_tokens_seen": 16527960, + "step": 27115 + }, + { + "epoch": 7.479316050744623, + "grad_norm": 3.3088506370404502e-06, + "learning_rate": 9.070854406845358e-06, + "loss": 0.0, + "num_input_tokens_seen": 16531224, + "step": 27120 + }, + { + "epoch": 7.480694980694981, + "grad_norm": 0.00042271194979548454, + "learning_rate": 9.061581763162941e-06, + "loss": 0.0, + "num_input_tokens_seen": 16533944, + "step": 27125 + }, + { + "epoch": 7.482073910645339, + "grad_norm": 4.696803443948738e-05, + "learning_rate": 9.052312812183233e-06, + "loss": 0.0, + "num_input_tokens_seen": 16536568, + "step": 27130 + }, + { + "epoch": 7.483452840595698, + "grad_norm": 2.15309137274744e-06, + "learning_rate": 9.04304755605373e-06, + "loss": 0.0, + "num_input_tokens_seen": 16539224, + "step": 27135 + }, + { + "epoch": 7.484831770546056, + "grad_norm": 0.0003153696015942842, + "learning_rate": 9.033785996921062e-06, + "loss": 0.0, + "num_input_tokens_seen": 16541528, + "step": 27140 + }, + { + "epoch": 7.4862107004964145, + "grad_norm": 1.542051904834807e-05, + "learning_rate": 9.024528136930985e-06, + "loss": 0.0, + "num_input_tokens_seen": 16545592, + "step": 27145 + }, + { + "epoch": 7.487589630446774, + "grad_norm": 6.471146889452939e-07, + "learning_rate": 9.015273978228428e-06, + "loss": 0.0, + "num_input_tokens_seen": 16548728, + "step": 27150 + }, + { + "epoch": 7.488968560397132, + "grad_norm": 0.005510705057531595, + "learning_rate": 9.006023522957438e-06, + "loss": 0.0, + "num_input_tokens_seen": 16552248, + "step": 27155 + }, + { + "epoch": 7.4903474903474905, + "grad_norm": 0.0003186753601767123, + "learning_rate": 8.996776773261207e-06, + "loss": 0.0, + "num_input_tokens_seen": 16555800, + "step": 27160 + }, + { + "epoch": 7.491726420297849, + "grad_norm": 2.421221097392845e-06, + "learning_rate": 8.987533731282085e-06, + "loss": 0.0, + "num_input_tokens_seen": 16558296, + "step": 27165 + }, + { + "epoch": 7.493105350248207, + "grad_norm": 0.02054881863296032, + "learning_rate": 8.978294399161553e-06, + "loss": 0.0, + "num_input_tokens_seen": 16561816, + "step": 27170 + }, + { + "epoch": 7.494484280198566, + "grad_norm": 0.0001314811233896762, + "learning_rate": 8.969058779040215e-06, + "loss": 0.0, + "num_input_tokens_seen": 16565080, + "step": 27175 + }, + { + "epoch": 7.495863210148924, + "grad_norm": 1.619972135813441e-05, + "learning_rate": 8.959826873057853e-06, + "loss": 0.0, + "num_input_tokens_seen": 16567320, + "step": 27180 + }, + { + "epoch": 7.497242140099283, + "grad_norm": 1.326465280726552e-06, + "learning_rate": 8.950598683353347e-06, + "loss": 0.0, + "num_input_tokens_seen": 16569624, + "step": 27185 + }, + { + "epoch": 7.498621070049642, + "grad_norm": 9.233611308445688e-06, + "learning_rate": 8.941374212064754e-06, + "loss": 0.0, + "num_input_tokens_seen": 16572216, + "step": 27190 + }, + { + "epoch": 7.5, + "grad_norm": 9.468370762988343e-07, + "learning_rate": 8.932153461329238e-06, + "loss": 0.0, + "num_input_tokens_seen": 16574840, + "step": 27195 + }, + { + "epoch": 7.5, + "eval_loss": 0.319726824760437, + "eval_runtime": 28.4977, + "eval_samples_per_second": 56.566, + "eval_steps_per_second": 14.142, + "num_input_tokens_seen": 16574840, + "step": 27195 + }, + { + "epoch": 7.501378929950358, + "grad_norm": 6.359339295158861e-06, + "learning_rate": 8.922936433283127e-06, + "loss": 0.0, + "num_input_tokens_seen": 16578136, + "step": 27200 + }, + { + "epoch": 7.502757859900717, + "grad_norm": 4.2415766074554995e-05, + "learning_rate": 8.91372313006186e-06, + "loss": 0.0, + "num_input_tokens_seen": 16580696, + "step": 27205 + }, + { + "epoch": 7.504136789851076, + "grad_norm": 1.2314489140408114e-05, + "learning_rate": 8.904513553800042e-06, + "loss": 0.0, + "num_input_tokens_seen": 16583832, + "step": 27210 + }, + { + "epoch": 7.505515719801434, + "grad_norm": 2.153207333321916e-06, + "learning_rate": 8.895307706631383e-06, + "loss": 0.0, + "num_input_tokens_seen": 16587992, + "step": 27215 + }, + { + "epoch": 7.506894649751793, + "grad_norm": 0.0001147434304584749, + "learning_rate": 8.886105590688768e-06, + "loss": 0.0, + "num_input_tokens_seen": 16590264, + "step": 27220 + }, + { + "epoch": 7.508273579702151, + "grad_norm": 1.1600194511629525e-06, + "learning_rate": 8.876907208104176e-06, + "loss": 0.0, + "num_input_tokens_seen": 16592408, + "step": 27225 + }, + { + "epoch": 7.5096525096525095, + "grad_norm": 7.585167622892186e-06, + "learning_rate": 8.867712561008743e-06, + "loss": 0.0, + "num_input_tokens_seen": 16595224, + "step": 27230 + }, + { + "epoch": 7.511031439602868, + "grad_norm": 9.972511634259718e-07, + "learning_rate": 8.858521651532745e-06, + "loss": 0.0, + "num_input_tokens_seen": 16598328, + "step": 27235 + }, + { + "epoch": 7.512410369553226, + "grad_norm": 7.836072654754389e-06, + "learning_rate": 8.849334481805571e-06, + "loss": 0.0, + "num_input_tokens_seen": 16601368, + "step": 27240 + }, + { + "epoch": 7.5137892995035855, + "grad_norm": 1.3534928257286083e-05, + "learning_rate": 8.840151053955773e-06, + "loss": 0.0, + "num_input_tokens_seen": 16604856, + "step": 27245 + }, + { + "epoch": 7.515168229453944, + "grad_norm": 0.0005592751549556851, + "learning_rate": 8.83097137011101e-06, + "loss": 0.0, + "num_input_tokens_seen": 16607992, + "step": 27250 + }, + { + "epoch": 7.516547159404302, + "grad_norm": 1.494498746978934e-06, + "learning_rate": 8.821795432398075e-06, + "loss": 0.0, + "num_input_tokens_seen": 16610872, + "step": 27255 + }, + { + "epoch": 7.517926089354661, + "grad_norm": 9.317962394561619e-06, + "learning_rate": 8.812623242942919e-06, + "loss": 0.0, + "num_input_tokens_seen": 16614584, + "step": 27260 + }, + { + "epoch": 7.519305019305019, + "grad_norm": 5.5101222642406356e-06, + "learning_rate": 8.803454803870587e-06, + "loss": 0.0, + "num_input_tokens_seen": 16617336, + "step": 27265 + }, + { + "epoch": 7.520683949255378, + "grad_norm": 1.6297673937515356e-05, + "learning_rate": 8.794290117305296e-06, + "loss": 0.0, + "num_input_tokens_seen": 16620504, + "step": 27270 + }, + { + "epoch": 7.522062879205737, + "grad_norm": 7.310984528885456e-06, + "learning_rate": 8.78512918537035e-06, + "loss": 0.0, + "num_input_tokens_seen": 16624248, + "step": 27275 + }, + { + "epoch": 7.523441809156095, + "grad_norm": 1.3990596698931768e-06, + "learning_rate": 8.775972010188226e-06, + "loss": 0.0, + "num_input_tokens_seen": 16626872, + "step": 27280 + }, + { + "epoch": 7.524820739106453, + "grad_norm": 5.6564326769148465e-06, + "learning_rate": 8.766818593880493e-06, + "loss": 0.0, + "num_input_tokens_seen": 16629464, + "step": 27285 + }, + { + "epoch": 7.526199669056812, + "grad_norm": 4.924199674860574e-05, + "learning_rate": 8.757668938567881e-06, + "loss": 0.0, + "num_input_tokens_seen": 16632920, + "step": 27290 + }, + { + "epoch": 7.52757859900717, + "grad_norm": 1.6495912404934643e-06, + "learning_rate": 8.74852304637022e-06, + "loss": 0.0, + "num_input_tokens_seen": 16635864, + "step": 27295 + }, + { + "epoch": 7.528957528957529, + "grad_norm": 1.881505704659503e-05, + "learning_rate": 8.739380919406495e-06, + "loss": 0.0, + "num_input_tokens_seen": 16641272, + "step": 27300 + }, + { + "epoch": 7.530336458907888, + "grad_norm": 0.00045785008114762604, + "learning_rate": 8.730242559794801e-06, + "loss": 0.0, + "num_input_tokens_seen": 16644248, + "step": 27305 + }, + { + "epoch": 7.531715388858246, + "grad_norm": 1.6244712242041714e-06, + "learning_rate": 8.721107969652353e-06, + "loss": 0.0, + "num_input_tokens_seen": 16646360, + "step": 27310 + }, + { + "epoch": 7.533094318808605, + "grad_norm": 2.3735030481475405e-05, + "learning_rate": 8.711977151095524e-06, + "loss": 0.0, + "num_input_tokens_seen": 16649240, + "step": 27315 + }, + { + "epoch": 7.534473248758963, + "grad_norm": 1.2544582205009647e-05, + "learning_rate": 8.702850106239784e-06, + "loss": 0.0, + "num_input_tokens_seen": 16651960, + "step": 27320 + }, + { + "epoch": 7.535852178709321, + "grad_norm": 9.494637924944982e-05, + "learning_rate": 8.69372683719973e-06, + "loss": 0.0, + "num_input_tokens_seen": 16655032, + "step": 27325 + }, + { + "epoch": 7.53723110865968, + "grad_norm": 2.7429216515884036e-06, + "learning_rate": 8.684607346089108e-06, + "loss": 0.0, + "num_input_tokens_seen": 16658968, + "step": 27330 + }, + { + "epoch": 7.538610038610039, + "grad_norm": 0.000148173508932814, + "learning_rate": 8.675491635020757e-06, + "loss": 0.0, + "num_input_tokens_seen": 16661336, + "step": 27335 + }, + { + "epoch": 7.539988968560397, + "grad_norm": 1.9996496121166274e-06, + "learning_rate": 8.666379706106666e-06, + "loss": 0.0, + "num_input_tokens_seen": 16663896, + "step": 27340 + }, + { + "epoch": 7.541367898510756, + "grad_norm": 6.563412171090022e-05, + "learning_rate": 8.657271561457947e-06, + "loss": 0.0, + "num_input_tokens_seen": 16666776, + "step": 27345 + }, + { + "epoch": 7.542746828461114, + "grad_norm": 0.00010602332622511312, + "learning_rate": 8.64816720318481e-06, + "loss": 0.0, + "num_input_tokens_seen": 16671896, + "step": 27350 + }, + { + "epoch": 7.5441257584114725, + "grad_norm": 2.8959500923519954e-05, + "learning_rate": 8.639066633396617e-06, + "loss": 0.0, + "num_input_tokens_seen": 16674680, + "step": 27355 + }, + { + "epoch": 7.545504688361831, + "grad_norm": 2.801804384944262e-06, + "learning_rate": 8.629969854201833e-06, + "loss": 0.0, + "num_input_tokens_seen": 16678200, + "step": 27360 + }, + { + "epoch": 7.54688361831219, + "grad_norm": 6.577615295100259e-06, + "learning_rate": 8.62087686770804e-06, + "loss": 0.0, + "num_input_tokens_seen": 16682232, + "step": 27365 + }, + { + "epoch": 7.5482625482625485, + "grad_norm": 0.0003931967366952449, + "learning_rate": 8.61178767602198e-06, + "loss": 0.0, + "num_input_tokens_seen": 16684920, + "step": 27370 + }, + { + "epoch": 7.549641478212907, + "grad_norm": 6.850711088191019e-06, + "learning_rate": 8.602702281249467e-06, + "loss": 0.0, + "num_input_tokens_seen": 16688184, + "step": 27375 + }, + { + "epoch": 7.551020408163265, + "grad_norm": 1.4373854355653748e-06, + "learning_rate": 8.593620685495455e-06, + "loss": 0.0, + "num_input_tokens_seen": 16690904, + "step": 27380 + }, + { + "epoch": 7.552399338113624, + "grad_norm": 9.355841029901057e-06, + "learning_rate": 8.584542890864031e-06, + "loss": 0.0, + "num_input_tokens_seen": 16695096, + "step": 27385 + }, + { + "epoch": 7.553778268063982, + "grad_norm": 1.1875217751367018e-06, + "learning_rate": 8.57546889945838e-06, + "loss": 0.0, + "num_input_tokens_seen": 16698360, + "step": 27390 + }, + { + "epoch": 7.55515719801434, + "grad_norm": 1.3125535360813956e-06, + "learning_rate": 8.566398713380827e-06, + "loss": 0.0, + "num_input_tokens_seen": 16701720, + "step": 27395 + }, + { + "epoch": 7.5565361279647, + "grad_norm": 1.0618891792546492e-06, + "learning_rate": 8.557332334732799e-06, + "loss": 0.0, + "num_input_tokens_seen": 16704568, + "step": 27400 + }, + { + "epoch": 7.557915057915058, + "grad_norm": 1.3357382158574183e-05, + "learning_rate": 8.548269765614835e-06, + "loss": 0.0, + "num_input_tokens_seen": 16706712, + "step": 27405 + }, + { + "epoch": 7.559293987865416, + "grad_norm": 3.583349825930782e-05, + "learning_rate": 8.53921100812661e-06, + "loss": 0.0, + "num_input_tokens_seen": 16709304, + "step": 27410 + }, + { + "epoch": 7.560672917815775, + "grad_norm": 2.0443438188522123e-05, + "learning_rate": 8.530156064366921e-06, + "loss": 0.0, + "num_input_tokens_seen": 16713688, + "step": 27415 + }, + { + "epoch": 7.562051847766133, + "grad_norm": 1.5352533182522166e-06, + "learning_rate": 8.521104936433644e-06, + "loss": 0.0, + "num_input_tokens_seen": 16716312, + "step": 27420 + }, + { + "epoch": 7.563430777716492, + "grad_norm": 1.2004555856037769e-06, + "learning_rate": 8.51205762642382e-06, + "loss": 0.0, + "num_input_tokens_seen": 16718776, + "step": 27425 + }, + { + "epoch": 7.564809707666851, + "grad_norm": 3.3576422993064625e-06, + "learning_rate": 8.50301413643356e-06, + "loss": 0.0, + "num_input_tokens_seen": 16722616, + "step": 27430 + }, + { + "epoch": 7.566188637617209, + "grad_norm": 4.8050085752038285e-05, + "learning_rate": 8.493974468558127e-06, + "loss": 0.0, + "num_input_tokens_seen": 16726488, + "step": 27435 + }, + { + "epoch": 7.5675675675675675, + "grad_norm": 1.2267257716302993e-06, + "learning_rate": 8.484938624891875e-06, + "loss": 0.0, + "num_input_tokens_seen": 16729112, + "step": 27440 + }, + { + "epoch": 7.568946497517926, + "grad_norm": 4.639339749701321e-05, + "learning_rate": 8.475906607528272e-06, + "loss": 0.0, + "num_input_tokens_seen": 16731960, + "step": 27445 + }, + { + "epoch": 7.570325427468284, + "grad_norm": 1.1499545280457824e-06, + "learning_rate": 8.466878418559923e-06, + "loss": 0.0, + "num_input_tokens_seen": 16735160, + "step": 27450 + }, + { + "epoch": 7.571704357418643, + "grad_norm": 8.542325304006226e-06, + "learning_rate": 8.45785406007852e-06, + "loss": 0.0, + "num_input_tokens_seen": 16738072, + "step": 27455 + }, + { + "epoch": 7.573083287369002, + "grad_norm": 3.1237090297508985e-05, + "learning_rate": 8.448833534174872e-06, + "loss": 0.0, + "num_input_tokens_seen": 16742200, + "step": 27460 + }, + { + "epoch": 7.57446221731936, + "grad_norm": 1.3848134585714433e-05, + "learning_rate": 8.439816842938919e-06, + "loss": 0.0, + "num_input_tokens_seen": 16745720, + "step": 27465 + }, + { + "epoch": 7.575841147269719, + "grad_norm": 9.699347174318973e-07, + "learning_rate": 8.430803988459683e-06, + "loss": 0.0, + "num_input_tokens_seen": 16748568, + "step": 27470 + }, + { + "epoch": 7.577220077220077, + "grad_norm": 1.681730168456852e-06, + "learning_rate": 8.42179497282533e-06, + "loss": 0.0, + "num_input_tokens_seen": 16751352, + "step": 27475 + }, + { + "epoch": 7.578599007170435, + "grad_norm": 1.2668677300098352e-05, + "learning_rate": 8.412789798123102e-06, + "loss": 0.0, + "num_input_tokens_seen": 16754200, + "step": 27480 + }, + { + "epoch": 7.579977937120795, + "grad_norm": 1.3051962923782412e-05, + "learning_rate": 8.40378846643938e-06, + "loss": 0.0, + "num_input_tokens_seen": 16757432, + "step": 27485 + }, + { + "epoch": 7.581356867071153, + "grad_norm": 8.407217683270574e-05, + "learning_rate": 8.394790979859649e-06, + "loss": 0.0, + "num_input_tokens_seen": 16760440, + "step": 27490 + }, + { + "epoch": 7.5827357970215115, + "grad_norm": 1.4033776096766815e-06, + "learning_rate": 8.38579734046849e-06, + "loss": 0.0, + "num_input_tokens_seen": 16763352, + "step": 27495 + }, + { + "epoch": 7.58411472697187, + "grad_norm": 1.45898502523778e-05, + "learning_rate": 8.376807550349591e-06, + "loss": 0.0, + "num_input_tokens_seen": 16766200, + "step": 27500 + }, + { + "epoch": 7.585493656922228, + "grad_norm": 9.908075071507483e-07, + "learning_rate": 8.367821611585774e-06, + "loss": 0.0, + "num_input_tokens_seen": 16770584, + "step": 27505 + }, + { + "epoch": 7.586872586872587, + "grad_norm": 3.402148649911396e-05, + "learning_rate": 8.358839526258946e-06, + "loss": 0.0, + "num_input_tokens_seen": 16773816, + "step": 27510 + }, + { + "epoch": 7.588251516822945, + "grad_norm": 3.919338723790133e-06, + "learning_rate": 8.349861296450115e-06, + "loss": 0.0, + "num_input_tokens_seen": 16776504, + "step": 27515 + }, + { + "epoch": 7.589630446773304, + "grad_norm": 1.6070836181825143e-06, + "learning_rate": 8.34088692423943e-06, + "loss": 0.0, + "num_input_tokens_seen": 16779032, + "step": 27520 + }, + { + "epoch": 7.591009376723663, + "grad_norm": 2.820333520503482e-06, + "learning_rate": 8.331916411706101e-06, + "loss": 0.0, + "num_input_tokens_seen": 16782296, + "step": 27525 + }, + { + "epoch": 7.592388306674021, + "grad_norm": 1.0088441740663256e-05, + "learning_rate": 8.32294976092849e-06, + "loss": 0.0, + "num_input_tokens_seen": 16784920, + "step": 27530 + }, + { + "epoch": 7.593767236624379, + "grad_norm": 1.3199761497162399e-06, + "learning_rate": 8.313986973984028e-06, + "loss": 0.0, + "num_input_tokens_seen": 16787800, + "step": 27535 + }, + { + "epoch": 7.595146166574738, + "grad_norm": 5.5109635468397755e-06, + "learning_rate": 8.305028052949257e-06, + "loss": 0.0, + "num_input_tokens_seen": 16790360, + "step": 27540 + }, + { + "epoch": 7.596525096525097, + "grad_norm": 0.0001945894182426855, + "learning_rate": 8.296072999899851e-06, + "loss": 0.0, + "num_input_tokens_seen": 16793688, + "step": 27545 + }, + { + "epoch": 7.597904026475455, + "grad_norm": 8.559983371014823e-07, + "learning_rate": 8.287121816910545e-06, + "loss": 0.0, + "num_input_tokens_seen": 16796504, + "step": 27550 + }, + { + "epoch": 7.599282956425814, + "grad_norm": 7.057211064420699e-07, + "learning_rate": 8.278174506055212e-06, + "loss": 0.0001, + "num_input_tokens_seen": 16798776, + "step": 27555 + }, + { + "epoch": 7.600661886376172, + "grad_norm": 3.547516598700895e-06, + "learning_rate": 8.269231069406818e-06, + "loss": 0.0, + "num_input_tokens_seen": 16801304, + "step": 27560 + }, + { + "epoch": 7.6020408163265305, + "grad_norm": 1.0746418411144987e-05, + "learning_rate": 8.26029150903743e-06, + "loss": 0.0, + "num_input_tokens_seen": 16803768, + "step": 27565 + }, + { + "epoch": 7.603419746276889, + "grad_norm": 1.7249494703719392e-05, + "learning_rate": 8.251355827018198e-06, + "loss": 0.0, + "num_input_tokens_seen": 16806296, + "step": 27570 + }, + { + "epoch": 7.604798676227247, + "grad_norm": 6.807173349443474e-07, + "learning_rate": 8.242424025419415e-06, + "loss": 0.0, + "num_input_tokens_seen": 16808664, + "step": 27575 + }, + { + "epoch": 7.6061776061776065, + "grad_norm": 5.162207799003227e-06, + "learning_rate": 8.233496106310432e-06, + "loss": 0.0, + "num_input_tokens_seen": 16811288, + "step": 27580 + }, + { + "epoch": 7.607556536127965, + "grad_norm": 5.108042387291789e-06, + "learning_rate": 8.224572071759734e-06, + "loss": 0.0, + "num_input_tokens_seen": 16813720, + "step": 27585 + }, + { + "epoch": 7.608935466078323, + "grad_norm": 2.8606209525605664e-05, + "learning_rate": 8.215651923834886e-06, + "loss": 0.0, + "num_input_tokens_seen": 16816568, + "step": 27590 + }, + { + "epoch": 7.610314396028682, + "grad_norm": 2.1219959307927638e-05, + "learning_rate": 8.20673566460255e-06, + "loss": 0.0, + "num_input_tokens_seen": 16819480, + "step": 27595 + }, + { + "epoch": 7.61169332597904, + "grad_norm": 2.3529300960944965e-06, + "learning_rate": 8.19782329612851e-06, + "loss": 0.0, + "num_input_tokens_seen": 16822648, + "step": 27600 + }, + { + "epoch": 7.613072255929398, + "grad_norm": 2.658678113220958e-06, + "learning_rate": 8.188914820477628e-06, + "loss": 0.0, + "num_input_tokens_seen": 16825272, + "step": 27605 + }, + { + "epoch": 7.614451185879758, + "grad_norm": 0.00019084030645899475, + "learning_rate": 8.180010239713858e-06, + "loss": 0.0, + "num_input_tokens_seen": 16827544, + "step": 27610 + }, + { + "epoch": 7.615830115830116, + "grad_norm": 7.817272489774041e-06, + "learning_rate": 8.171109555900278e-06, + "loss": 0.0, + "num_input_tokens_seen": 16830104, + "step": 27615 + }, + { + "epoch": 7.617209045780474, + "grad_norm": 1.0616771533022984e-06, + "learning_rate": 8.162212771099051e-06, + "loss": 0.0, + "num_input_tokens_seen": 16832792, + "step": 27620 + }, + { + "epoch": 7.618587975730833, + "grad_norm": 2.3201935164252063e-06, + "learning_rate": 8.153319887371418e-06, + "loss": 0.0, + "num_input_tokens_seen": 16835928, + "step": 27625 + }, + { + "epoch": 7.619966905681191, + "grad_norm": 1.5688472558395006e-05, + "learning_rate": 8.144430906777755e-06, + "loss": 0.0, + "num_input_tokens_seen": 16838360, + "step": 27630 + }, + { + "epoch": 7.6213458356315495, + "grad_norm": 7.735909548500786e-07, + "learning_rate": 8.13554583137749e-06, + "loss": 0.0, + "num_input_tokens_seen": 16841624, + "step": 27635 + }, + { + "epoch": 7.622724765581909, + "grad_norm": 0.00010355854465160519, + "learning_rate": 8.126664663229182e-06, + "loss": 0.0, + "num_input_tokens_seen": 16844408, + "step": 27640 + }, + { + "epoch": 7.624103695532267, + "grad_norm": 2.3608783976669656e-06, + "learning_rate": 8.117787404390468e-06, + "loss": 0.0, + "num_input_tokens_seen": 16847000, + "step": 27645 + }, + { + "epoch": 7.625482625482626, + "grad_norm": 1.890664634629502e-06, + "learning_rate": 8.108914056918069e-06, + "loss": 0.0, + "num_input_tokens_seen": 16849624, + "step": 27650 + }, + { + "epoch": 7.626861555432984, + "grad_norm": 9.516979275758786e-07, + "learning_rate": 8.100044622867828e-06, + "loss": 0.0, + "num_input_tokens_seen": 16852568, + "step": 27655 + }, + { + "epoch": 7.628240485383342, + "grad_norm": 0.00014708992966916412, + "learning_rate": 8.091179104294657e-06, + "loss": 0.0, + "num_input_tokens_seen": 16855384, + "step": 27660 + }, + { + "epoch": 7.629619415333701, + "grad_norm": 2.561888550189906e-06, + "learning_rate": 8.082317503252568e-06, + "loss": 0.0, + "num_input_tokens_seen": 16857720, + "step": 27665 + }, + { + "epoch": 7.63099834528406, + "grad_norm": 9.489348826718924e-07, + "learning_rate": 8.073459821794673e-06, + "loss": 0.0, + "num_input_tokens_seen": 16860024, + "step": 27670 + }, + { + "epoch": 7.632377275234418, + "grad_norm": 1.760949999152217e-05, + "learning_rate": 8.064606061973163e-06, + "loss": 0.0, + "num_input_tokens_seen": 16862936, + "step": 27675 + }, + { + "epoch": 7.633756205184777, + "grad_norm": 3.4007719023065874e-06, + "learning_rate": 8.055756225839336e-06, + "loss": 0.0, + "num_input_tokens_seen": 16867608, + "step": 27680 + }, + { + "epoch": 7.635135135135135, + "grad_norm": 2.6686207093007397e-06, + "learning_rate": 8.04691031544356e-06, + "loss": 0.0, + "num_input_tokens_seen": 16870424, + "step": 27685 + }, + { + "epoch": 7.6365140650854935, + "grad_norm": 2.1695016130252043e-06, + "learning_rate": 8.03806833283532e-06, + "loss": 0.0, + "num_input_tokens_seen": 16872856, + "step": 27690 + }, + { + "epoch": 7.637892995035852, + "grad_norm": 2.185912535423995e-06, + "learning_rate": 8.029230280063158e-06, + "loss": 0.0, + "num_input_tokens_seen": 16876280, + "step": 27695 + }, + { + "epoch": 7.639271924986211, + "grad_norm": 0.00018124801863450557, + "learning_rate": 8.020396159174746e-06, + "loss": 0.0, + "num_input_tokens_seen": 16878968, + "step": 27700 + }, + { + "epoch": 7.6406508549365695, + "grad_norm": 1.680742684584402e-06, + "learning_rate": 8.011565972216801e-06, + "loss": 0.0, + "num_input_tokens_seen": 16882264, + "step": 27705 + }, + { + "epoch": 7.642029784886928, + "grad_norm": 1.4307428273241385e-06, + "learning_rate": 8.00273972123517e-06, + "loss": 0.0, + "num_input_tokens_seen": 16884888, + "step": 27710 + }, + { + "epoch": 7.643408714837286, + "grad_norm": 7.158467633416876e-05, + "learning_rate": 7.993917408274763e-06, + "loss": 0.0, + "num_input_tokens_seen": 16888216, + "step": 27715 + }, + { + "epoch": 7.644787644787645, + "grad_norm": 1.913941559905652e-05, + "learning_rate": 7.985099035379568e-06, + "loss": 0.0, + "num_input_tokens_seen": 16891672, + "step": 27720 + }, + { + "epoch": 7.646166574738003, + "grad_norm": 8.776878530625254e-07, + "learning_rate": 7.976284604592697e-06, + "loss": 0.0, + "num_input_tokens_seen": 16894296, + "step": 27725 + }, + { + "epoch": 7.647545504688361, + "grad_norm": 1.646415694267489e-05, + "learning_rate": 7.967474117956309e-06, + "loss": 0.0, + "num_input_tokens_seen": 16897368, + "step": 27730 + }, + { + "epoch": 7.648924434638721, + "grad_norm": 2.834600309142843e-06, + "learning_rate": 7.958667577511683e-06, + "loss": 0.0, + "num_input_tokens_seen": 16899768, + "step": 27735 + }, + { + "epoch": 7.650303364589079, + "grad_norm": 2.8818532882723957e-05, + "learning_rate": 7.94986498529916e-06, + "loss": 0.0, + "num_input_tokens_seen": 16903160, + "step": 27740 + }, + { + "epoch": 7.651682294539437, + "grad_norm": 1.0162078751818626e-06, + "learning_rate": 7.94106634335817e-06, + "loss": 0.0, + "num_input_tokens_seen": 16905656, + "step": 27745 + }, + { + "epoch": 7.653061224489796, + "grad_norm": 1.9339688606123673e-06, + "learning_rate": 7.932271653727245e-06, + "loss": 0.0, + "num_input_tokens_seen": 16908120, + "step": 27750 + }, + { + "epoch": 7.654440154440154, + "grad_norm": 1.2898635759484023e-06, + "learning_rate": 7.92348091844397e-06, + "loss": 0.0, + "num_input_tokens_seen": 16910616, + "step": 27755 + }, + { + "epoch": 7.655819084390513, + "grad_norm": 2.4513457901775837e-05, + "learning_rate": 7.914694139545045e-06, + "loss": 0.0, + "num_input_tokens_seen": 16913400, + "step": 27760 + }, + { + "epoch": 7.657198014340872, + "grad_norm": 6.956769993848866e-06, + "learning_rate": 7.905911319066246e-06, + "loss": 0.0, + "num_input_tokens_seen": 16917048, + "step": 27765 + }, + { + "epoch": 7.65857694429123, + "grad_norm": 6.617079293391726e-07, + "learning_rate": 7.897132459042408e-06, + "loss": 0.0, + "num_input_tokens_seen": 16920184, + "step": 27770 + }, + { + "epoch": 7.6599558742415885, + "grad_norm": 1.2868279100075597e-06, + "learning_rate": 7.888357561507486e-06, + "loss": 0.0, + "num_input_tokens_seen": 16924408, + "step": 27775 + }, + { + "epoch": 7.661334804191947, + "grad_norm": 1.4482556025541271e-06, + "learning_rate": 7.879586628494493e-06, + "loss": 0.0, + "num_input_tokens_seen": 16930232, + "step": 27780 + }, + { + "epoch": 7.662713734142305, + "grad_norm": 3.4320453323744005e-06, + "learning_rate": 7.870819662035513e-06, + "loss": 0.0, + "num_input_tokens_seen": 16934424, + "step": 27785 + }, + { + "epoch": 7.664092664092664, + "grad_norm": 1.083987172023626e-06, + "learning_rate": 7.86205666416175e-06, + "loss": 0.0, + "num_input_tokens_seen": 16937304, + "step": 27790 + }, + { + "epoch": 7.665471594043023, + "grad_norm": 3.1031493108457653e-06, + "learning_rate": 7.853297636903448e-06, + "loss": 0.0, + "num_input_tokens_seen": 16940568, + "step": 27795 + }, + { + "epoch": 7.666850523993381, + "grad_norm": 8.755785074754385e-07, + "learning_rate": 7.844542582289946e-06, + "loss": 0.0, + "num_input_tokens_seen": 16943192, + "step": 27800 + }, + { + "epoch": 7.66822945394374, + "grad_norm": 6.931648499630683e-07, + "learning_rate": 7.835791502349678e-06, + "loss": 0.0, + "num_input_tokens_seen": 16947192, + "step": 27805 + }, + { + "epoch": 7.669608383894098, + "grad_norm": 1.0783411198644899e-05, + "learning_rate": 7.82704439911014e-06, + "loss": 0.0, + "num_input_tokens_seen": 16949720, + "step": 27810 + }, + { + "epoch": 7.670987313844456, + "grad_norm": 1.4330023532238556e-06, + "learning_rate": 7.818301274597897e-06, + "loss": 0.0, + "num_input_tokens_seen": 16952728, + "step": 27815 + }, + { + "epoch": 7.672366243794816, + "grad_norm": 2.7167288862983696e-05, + "learning_rate": 7.809562130838621e-06, + "loss": 0.0, + "num_input_tokens_seen": 16956536, + "step": 27820 + }, + { + "epoch": 7.673745173745174, + "grad_norm": 5.056040026829578e-05, + "learning_rate": 7.800826969857037e-06, + "loss": 0.0, + "num_input_tokens_seen": 16959192, + "step": 27825 + }, + { + "epoch": 7.675124103695532, + "grad_norm": 8.956365036283387e-07, + "learning_rate": 7.792095793676957e-06, + "loss": 0.0, + "num_input_tokens_seen": 16963736, + "step": 27830 + }, + { + "epoch": 7.676503033645891, + "grad_norm": 3.5017446862184443e-06, + "learning_rate": 7.78336860432128e-06, + "loss": 0.0, + "num_input_tokens_seen": 16965624, + "step": 27835 + }, + { + "epoch": 7.677881963596249, + "grad_norm": 3.138254669465823e-06, + "learning_rate": 7.774645403811956e-06, + "loss": 0.0, + "num_input_tokens_seen": 16968696, + "step": 27840 + }, + { + "epoch": 7.679260893546608, + "grad_norm": 6.090095212130109e-06, + "learning_rate": 7.765926194170039e-06, + "loss": 0.0, + "num_input_tokens_seen": 16971064, + "step": 27845 + }, + { + "epoch": 7.680639823496966, + "grad_norm": 9.103504226004588e-07, + "learning_rate": 7.757210977415638e-06, + "loss": 0.0, + "num_input_tokens_seen": 16973720, + "step": 27850 + }, + { + "epoch": 7.682018753447325, + "grad_norm": 0.002628081012517214, + "learning_rate": 7.748499755567934e-06, + "loss": 0.0, + "num_input_tokens_seen": 16976216, + "step": 27855 + }, + { + "epoch": 7.683397683397684, + "grad_norm": 1.1892431757587474e-05, + "learning_rate": 7.739792530645213e-06, + "loss": 0.0, + "num_input_tokens_seen": 16978712, + "step": 27860 + }, + { + "epoch": 7.684776613348042, + "grad_norm": 1.1762713256757706e-05, + "learning_rate": 7.731089304664791e-06, + "loss": 0.0, + "num_input_tokens_seen": 16982232, + "step": 27865 + }, + { + "epoch": 7.6861555432984, + "grad_norm": 4.497418558457866e-06, + "learning_rate": 7.7223900796431e-06, + "loss": 0.0, + "num_input_tokens_seen": 16985368, + "step": 27870 + }, + { + "epoch": 7.687534473248759, + "grad_norm": 9.86650320555782e-07, + "learning_rate": 7.713694857595618e-06, + "loss": 0.0, + "num_input_tokens_seen": 16988472, + "step": 27875 + }, + { + "epoch": 7.688913403199118, + "grad_norm": 7.1071112870413344e-06, + "learning_rate": 7.705003640536896e-06, + "loss": 0.0, + "num_input_tokens_seen": 16991384, + "step": 27880 + }, + { + "epoch": 7.690292333149476, + "grad_norm": 6.298125299508683e-06, + "learning_rate": 7.696316430480577e-06, + "loss": 0.0, + "num_input_tokens_seen": 16995512, + "step": 27885 + }, + { + "epoch": 7.691671263099835, + "grad_norm": 1.1893359896930633e-06, + "learning_rate": 7.687633229439348e-06, + "loss": 0.0, + "num_input_tokens_seen": 16997912, + "step": 27890 + }, + { + "epoch": 7.693050193050193, + "grad_norm": 4.238587280269712e-05, + "learning_rate": 7.678954039425e-06, + "loss": 0.0, + "num_input_tokens_seen": 17001208, + "step": 27895 + }, + { + "epoch": 7.6944291230005515, + "grad_norm": 6.0001068050041795e-05, + "learning_rate": 7.67027886244836e-06, + "loss": 0.0, + "num_input_tokens_seen": 17004120, + "step": 27900 + }, + { + "epoch": 7.69580805295091, + "grad_norm": 6.425401579690515e-07, + "learning_rate": 7.661607700519358e-06, + "loss": 0.0, + "num_input_tokens_seen": 17006936, + "step": 27905 + }, + { + "epoch": 7.697186982901268, + "grad_norm": 1.4073514194024028e-06, + "learning_rate": 7.65294055564696e-06, + "loss": 0.0, + "num_input_tokens_seen": 17012152, + "step": 27910 + }, + { + "epoch": 7.6985659128516275, + "grad_norm": 3.1712647796666715e-06, + "learning_rate": 7.644277429839237e-06, + "loss": 0.0, + "num_input_tokens_seen": 17014552, + "step": 27915 + }, + { + "epoch": 7.699944842801986, + "grad_norm": 2.257995402032975e-05, + "learning_rate": 7.635618325103294e-06, + "loss": 0.0, + "num_input_tokens_seen": 17017368, + "step": 27920 + }, + { + "epoch": 7.701323772752344, + "grad_norm": 4.009705662610941e-06, + "learning_rate": 7.626963243445337e-06, + "loss": 0.0, + "num_input_tokens_seen": 17020984, + "step": 27925 + }, + { + "epoch": 7.702702702702703, + "grad_norm": 1.2509442512964597e-06, + "learning_rate": 7.6183121868706155e-06, + "loss": 0.0, + "num_input_tokens_seen": 17023448, + "step": 27930 + }, + { + "epoch": 7.704081632653061, + "grad_norm": 1.0174921953876037e-06, + "learning_rate": 7.60966515738345e-06, + "loss": 0.0, + "num_input_tokens_seen": 17025976, + "step": 27935 + }, + { + "epoch": 7.705460562603419, + "grad_norm": 1.9706943930941634e-05, + "learning_rate": 7.6010221569872475e-06, + "loss": 0.0, + "num_input_tokens_seen": 17028504, + "step": 27940 + }, + { + "epoch": 7.706839492553779, + "grad_norm": 0.001692100428044796, + "learning_rate": 7.592383187684457e-06, + "loss": 0.0, + "num_input_tokens_seen": 17031032, + "step": 27945 + }, + { + "epoch": 7.708218422504137, + "grad_norm": 9.054212569026276e-06, + "learning_rate": 7.5837482514766e-06, + "loss": 0.0, + "num_input_tokens_seen": 17034712, + "step": 27950 + }, + { + "epoch": 7.709597352454495, + "grad_norm": 3.350172482896596e-05, + "learning_rate": 7.57511735036428e-06, + "loss": 0.0, + "num_input_tokens_seen": 17037464, + "step": 27955 + }, + { + "epoch": 7.710976282404854, + "grad_norm": 3.2027634006226435e-05, + "learning_rate": 7.566490486347136e-06, + "loss": 0.0, + "num_input_tokens_seen": 17040888, + "step": 27960 + }, + { + "epoch": 7.712355212355212, + "grad_norm": 1.031094825520995e-06, + "learning_rate": 7.557867661423909e-06, + "loss": 0.0, + "num_input_tokens_seen": 17044152, + "step": 27965 + }, + { + "epoch": 7.7137341423055705, + "grad_norm": 9.996971357395523e-07, + "learning_rate": 7.5492488775923655e-06, + "loss": 0.0, + "num_input_tokens_seen": 17046904, + "step": 27970 + }, + { + "epoch": 7.71511307225593, + "grad_norm": 7.586384526803158e-06, + "learning_rate": 7.5406341368493585e-06, + "loss": 0.0, + "num_input_tokens_seen": 17050072, + "step": 27975 + }, + { + "epoch": 7.716492002206288, + "grad_norm": 6.02095497015398e-06, + "learning_rate": 7.5320234411908135e-06, + "loss": 0.0, + "num_input_tokens_seen": 17052760, + "step": 27980 + }, + { + "epoch": 7.7178709321566465, + "grad_norm": 9.241795851266943e-06, + "learning_rate": 7.523416792611693e-06, + "loss": 0.0, + "num_input_tokens_seen": 17056056, + "step": 27985 + }, + { + "epoch": 7.719249862107005, + "grad_norm": 0.00026265037013217807, + "learning_rate": 7.514814193106029e-06, + "loss": 0.0, + "num_input_tokens_seen": 17058680, + "step": 27990 + }, + { + "epoch": 7.720628792057363, + "grad_norm": 8.919378160499036e-06, + "learning_rate": 7.506215644666933e-06, + "loss": 0.0, + "num_input_tokens_seen": 17061112, + "step": 27995 + }, + { + "epoch": 7.722007722007722, + "grad_norm": 2.2393519429897424e-06, + "learning_rate": 7.49762114928656e-06, + "loss": 0.0, + "num_input_tokens_seen": 17063864, + "step": 28000 + }, + { + "epoch": 7.72338665195808, + "grad_norm": 1.4167817425914109e-06, + "learning_rate": 7.489030708956124e-06, + "loss": 0.0, + "num_input_tokens_seen": 17067576, + "step": 28005 + }, + { + "epoch": 7.724765581908439, + "grad_norm": 1.032988507176924e-06, + "learning_rate": 7.480444325665917e-06, + "loss": 0.0, + "num_input_tokens_seen": 17070232, + "step": 28010 + }, + { + "epoch": 7.726144511858798, + "grad_norm": 0.00032469184952788055, + "learning_rate": 7.471862001405272e-06, + "loss": 0.0, + "num_input_tokens_seen": 17072760, + "step": 28015 + }, + { + "epoch": 7.727523441809156, + "grad_norm": 0.0002603453758638352, + "learning_rate": 7.463283738162602e-06, + "loss": 0.0, + "num_input_tokens_seen": 17075992, + "step": 28020 + }, + { + "epoch": 7.7289023717595144, + "grad_norm": 4.2053820834553335e-06, + "learning_rate": 7.454709537925358e-06, + "loss": 0.0, + "num_input_tokens_seen": 17079160, + "step": 28025 + }, + { + "epoch": 7.730281301709873, + "grad_norm": 8.877100299287122e-06, + "learning_rate": 7.446139402680058e-06, + "loss": 0.0, + "num_input_tokens_seen": 17081848, + "step": 28030 + }, + { + "epoch": 7.731660231660232, + "grad_norm": 0.0002295290178153664, + "learning_rate": 7.437573334412279e-06, + "loss": 0.0, + "num_input_tokens_seen": 17085816, + "step": 28035 + }, + { + "epoch": 7.7330391616105905, + "grad_norm": 3.91892126572202e-06, + "learning_rate": 7.42901133510667e-06, + "loss": 0.0, + "num_input_tokens_seen": 17088632, + "step": 28040 + }, + { + "epoch": 7.734418091560949, + "grad_norm": 5.335091941560677e-07, + "learning_rate": 7.420453406746905e-06, + "loss": 0.0, + "num_input_tokens_seen": 17091000, + "step": 28045 + }, + { + "epoch": 7.735797021511307, + "grad_norm": 1.3150008726370288e-06, + "learning_rate": 7.411899551315748e-06, + "loss": 0.0, + "num_input_tokens_seen": 17093784, + "step": 28050 + }, + { + "epoch": 7.737175951461666, + "grad_norm": 2.335356384719489e-06, + "learning_rate": 7.403349770794993e-06, + "loss": 0.0, + "num_input_tokens_seen": 17096504, + "step": 28055 + }, + { + "epoch": 7.738554881412024, + "grad_norm": 1.2869459169451147e-06, + "learning_rate": 7.3948040671655115e-06, + "loss": 0.0, + "num_input_tokens_seen": 17099224, + "step": 28060 + }, + { + "epoch": 7.739933811362382, + "grad_norm": 7.532411814281659e-07, + "learning_rate": 7.3862624424072185e-06, + "loss": 0.0, + "num_input_tokens_seen": 17102328, + "step": 28065 + }, + { + "epoch": 7.741312741312742, + "grad_norm": 2.3702184989815578e-05, + "learning_rate": 7.377724898499072e-06, + "loss": 0.0, + "num_input_tokens_seen": 17105208, + "step": 28070 + }, + { + "epoch": 7.7426916712631, + "grad_norm": 3.862885478156386e-06, + "learning_rate": 7.369191437419118e-06, + "loss": 0.0, + "num_input_tokens_seen": 17109240, + "step": 28075 + }, + { + "epoch": 7.744070601213458, + "grad_norm": 6.612158927055134e-07, + "learning_rate": 7.360662061144427e-06, + "loss": 0.0, + "num_input_tokens_seen": 17112696, + "step": 28080 + }, + { + "epoch": 7.745449531163817, + "grad_norm": 0.0001394614955643192, + "learning_rate": 7.352136771651124e-06, + "loss": 0.0, + "num_input_tokens_seen": 17116600, + "step": 28085 + }, + { + "epoch": 7.746828461114175, + "grad_norm": 0.0003536158474162221, + "learning_rate": 7.343615570914417e-06, + "loss": 0.0, + "num_input_tokens_seen": 17119352, + "step": 28090 + }, + { + "epoch": 7.748207391064534, + "grad_norm": 2.2203735170478467e-06, + "learning_rate": 7.3350984609085345e-06, + "loss": 0.0, + "num_input_tokens_seen": 17123928, + "step": 28095 + }, + { + "epoch": 7.749586321014893, + "grad_norm": 1.6615907952655107e-05, + "learning_rate": 7.326585443606762e-06, + "loss": 0.0, + "num_input_tokens_seen": 17128152, + "step": 28100 + }, + { + "epoch": 7.750965250965251, + "grad_norm": 3.1501691410085186e-05, + "learning_rate": 7.318076520981451e-06, + "loss": 0.0, + "num_input_tokens_seen": 17131096, + "step": 28105 + }, + { + "epoch": 7.7523441809156095, + "grad_norm": 2.961402742585051e-06, + "learning_rate": 7.309571695003994e-06, + "loss": 0.0, + "num_input_tokens_seen": 17133624, + "step": 28110 + }, + { + "epoch": 7.753723110865968, + "grad_norm": 9.332346735391184e-07, + "learning_rate": 7.301070967644849e-06, + "loss": 0.0, + "num_input_tokens_seen": 17136152, + "step": 28115 + }, + { + "epoch": 7.755102040816326, + "grad_norm": 1.9258313841419294e-05, + "learning_rate": 7.292574340873501e-06, + "loss": 0.0, + "num_input_tokens_seen": 17138840, + "step": 28120 + }, + { + "epoch": 7.756480970766685, + "grad_norm": 1.2211389541625977, + "learning_rate": 7.284081816658491e-06, + "loss": 0.0001, + "num_input_tokens_seen": 17142712, + "step": 28125 + }, + { + "epoch": 7.757859900717044, + "grad_norm": 9.444781881029485e-07, + "learning_rate": 7.275593396967434e-06, + "loss": 0.0, + "num_input_tokens_seen": 17145912, + "step": 28130 + }, + { + "epoch": 7.759238830667402, + "grad_norm": 2.525131321817753e-06, + "learning_rate": 7.26710908376696e-06, + "loss": 0.0, + "num_input_tokens_seen": 17149816, + "step": 28135 + }, + { + "epoch": 7.760617760617761, + "grad_norm": 8.762405059314915e-07, + "learning_rate": 7.258628879022758e-06, + "loss": 0.0, + "num_input_tokens_seen": 17152952, + "step": 28140 + }, + { + "epoch": 7.761996690568119, + "grad_norm": 6.778989813938097e-07, + "learning_rate": 7.250152784699585e-06, + "loss": 0.0, + "num_input_tokens_seen": 17154968, + "step": 28145 + }, + { + "epoch": 7.763375620518477, + "grad_norm": 4.110393547307467e-06, + "learning_rate": 7.241680802761225e-06, + "loss": 0.0, + "num_input_tokens_seen": 17158040, + "step": 28150 + }, + { + "epoch": 7.764754550468837, + "grad_norm": 3.5631892387755215e-06, + "learning_rate": 7.233212935170502e-06, + "loss": 0.0, + "num_input_tokens_seen": 17160888, + "step": 28155 + }, + { + "epoch": 7.766133480419195, + "grad_norm": 2.6110153612535214e-06, + "learning_rate": 7.224749183889321e-06, + "loss": 0.0, + "num_input_tokens_seen": 17163384, + "step": 28160 + }, + { + "epoch": 7.767512410369553, + "grad_norm": 6.219683200470172e-06, + "learning_rate": 7.216289550878591e-06, + "loss": 0.0, + "num_input_tokens_seen": 17165912, + "step": 28165 + }, + { + "epoch": 7.768891340319912, + "grad_norm": 1.4368537449627183e-05, + "learning_rate": 7.207834038098305e-06, + "loss": 0.1167, + "num_input_tokens_seen": 17168888, + "step": 28170 + }, + { + "epoch": 7.77027027027027, + "grad_norm": 7.381182172139233e-07, + "learning_rate": 7.199382647507469e-06, + "loss": 0.0, + "num_input_tokens_seen": 17171608, + "step": 28175 + }, + { + "epoch": 7.7716492002206286, + "grad_norm": 8.036474810069194e-07, + "learning_rate": 7.190935381064156e-06, + "loss": 0.0, + "num_input_tokens_seen": 17175160, + "step": 28180 + }, + { + "epoch": 7.773028130170987, + "grad_norm": 3.677028189486009e-06, + "learning_rate": 7.182492240725483e-06, + "loss": 0.0, + "num_input_tokens_seen": 17178264, + "step": 28185 + }, + { + "epoch": 7.774407060121346, + "grad_norm": 0.00011015699419658631, + "learning_rate": 7.1740532284476e-06, + "loss": 0.0, + "num_input_tokens_seen": 17181176, + "step": 28190 + }, + { + "epoch": 7.775785990071705, + "grad_norm": 3.1790161756362068e-06, + "learning_rate": 7.165618346185693e-06, + "loss": 0.0, + "num_input_tokens_seen": 17184984, + "step": 28195 + }, + { + "epoch": 7.777164920022063, + "grad_norm": 9.01286257430911e-05, + "learning_rate": 7.157187595894022e-06, + "loss": 0.0, + "num_input_tokens_seen": 17188120, + "step": 28200 + }, + { + "epoch": 7.778543849972421, + "grad_norm": 6.5243301605733e-07, + "learning_rate": 7.148760979525853e-06, + "loss": 0.0, + "num_input_tokens_seen": 17190872, + "step": 28205 + }, + { + "epoch": 7.77992277992278, + "grad_norm": 1.590553665664629e-06, + "learning_rate": 7.140338499033531e-06, + "loss": 0.0, + "num_input_tokens_seen": 17193272, + "step": 28210 + }, + { + "epoch": 7.781301709873138, + "grad_norm": 0.0018848171457648277, + "learning_rate": 7.13192015636841e-06, + "loss": 0.0, + "num_input_tokens_seen": 17196088, + "step": 28215 + }, + { + "epoch": 7.782680639823497, + "grad_norm": 0.00914641935378313, + "learning_rate": 7.1235059534808966e-06, + "loss": 0.0, + "num_input_tokens_seen": 17198968, + "step": 28220 + }, + { + "epoch": 7.784059569773856, + "grad_norm": 1.529835572000593e-06, + "learning_rate": 7.115095892320456e-06, + "loss": 0.0, + "num_input_tokens_seen": 17201528, + "step": 28225 + }, + { + "epoch": 7.785438499724214, + "grad_norm": 3.228050627512857e-05, + "learning_rate": 7.106689974835568e-06, + "loss": 0.0, + "num_input_tokens_seen": 17204376, + "step": 28230 + }, + { + "epoch": 7.7868174296745725, + "grad_norm": 3.045617404495715e-06, + "learning_rate": 7.098288202973757e-06, + "loss": 0.0, + "num_input_tokens_seen": 17207608, + "step": 28235 + }, + { + "epoch": 7.788196359624931, + "grad_norm": 5.863594765287417e-07, + "learning_rate": 7.0898905786816015e-06, + "loss": 0.0, + "num_input_tokens_seen": 17210584, + "step": 28240 + }, + { + "epoch": 7.789575289575289, + "grad_norm": 5.630331543216016e-06, + "learning_rate": 7.081497103904719e-06, + "loss": 0.0, + "num_input_tokens_seen": 17213464, + "step": 28245 + }, + { + "epoch": 7.7909542195256485, + "grad_norm": 8.886279374564765e-07, + "learning_rate": 7.0731077805877375e-06, + "loss": 0.0, + "num_input_tokens_seen": 17215896, + "step": 28250 + }, + { + "epoch": 7.792333149476007, + "grad_norm": 8.588835385126004e-07, + "learning_rate": 7.064722610674363e-06, + "loss": 0.0, + "num_input_tokens_seen": 17219384, + "step": 28255 + }, + { + "epoch": 7.793712079426365, + "grad_norm": 8.525578323315131e-07, + "learning_rate": 7.056341596107299e-06, + "loss": 0.0, + "num_input_tokens_seen": 17222360, + "step": 28260 + }, + { + "epoch": 7.795091009376724, + "grad_norm": 3.3309781883872347e-06, + "learning_rate": 7.047964738828325e-06, + "loss": 0.0, + "num_input_tokens_seen": 17225144, + "step": 28265 + }, + { + "epoch": 7.796469939327082, + "grad_norm": 7.063878229018883e-07, + "learning_rate": 7.039592040778231e-06, + "loss": 0.0, + "num_input_tokens_seen": 17228184, + "step": 28270 + }, + { + "epoch": 7.79784886927744, + "grad_norm": 2.2023803012416465e-06, + "learning_rate": 7.031223503896842e-06, + "loss": 0.0, + "num_input_tokens_seen": 17231288, + "step": 28275 + }, + { + "epoch": 7.799227799227799, + "grad_norm": 6.89285889166058e-06, + "learning_rate": 7.022859130123042e-06, + "loss": 0.0, + "num_input_tokens_seen": 17233880, + "step": 28280 + }, + { + "epoch": 7.800606729178158, + "grad_norm": 3.1685644898971077e-06, + "learning_rate": 7.01449892139473e-06, + "loss": 0.0, + "num_input_tokens_seen": 17236536, + "step": 28285 + }, + { + "epoch": 7.801985659128516, + "grad_norm": 1.3143950354788103e-06, + "learning_rate": 7.0061428796488416e-06, + "loss": 0.0, + "num_input_tokens_seen": 17239192, + "step": 28290 + }, + { + "epoch": 7.803364589078875, + "grad_norm": 2.003645022341516e-05, + "learning_rate": 6.997791006821361e-06, + "loss": 0.0, + "num_input_tokens_seen": 17241688, + "step": 28295 + }, + { + "epoch": 7.804743519029233, + "grad_norm": 4.878590971202357e-06, + "learning_rate": 6.989443304847287e-06, + "loss": 0.0, + "num_input_tokens_seen": 17245624, + "step": 28300 + }, + { + "epoch": 7.8061224489795915, + "grad_norm": 8.684745807840955e-06, + "learning_rate": 6.9810997756606735e-06, + "loss": 0.0, + "num_input_tokens_seen": 17248760, + "step": 28305 + }, + { + "epoch": 7.807501378929951, + "grad_norm": 1.0218941497441847e-06, + "learning_rate": 6.972760421194588e-06, + "loss": 0.0, + "num_input_tokens_seen": 17251320, + "step": 28310 + }, + { + "epoch": 7.808880308880309, + "grad_norm": 4.835427716898266e-06, + "learning_rate": 6.964425243381145e-06, + "loss": 0.0, + "num_input_tokens_seen": 17255320, + "step": 28315 + }, + { + "epoch": 7.8102592388306675, + "grad_norm": 8.368262933799997e-05, + "learning_rate": 6.956094244151478e-06, + "loss": 0.0, + "num_input_tokens_seen": 17257688, + "step": 28320 + }, + { + "epoch": 7.811638168781026, + "grad_norm": 7.927371370897163e-06, + "learning_rate": 6.947767425435772e-06, + "loss": 0.0, + "num_input_tokens_seen": 17261336, + "step": 28325 + }, + { + "epoch": 7.813017098731384, + "grad_norm": 1.2861970617450424e-06, + "learning_rate": 6.939444789163216e-06, + "loss": 0.0, + "num_input_tokens_seen": 17264600, + "step": 28330 + }, + { + "epoch": 7.814396028681743, + "grad_norm": 5.757639883086085e-06, + "learning_rate": 6.9311263372620616e-06, + "loss": 0.0, + "num_input_tokens_seen": 17267512, + "step": 28335 + }, + { + "epoch": 7.815774958632101, + "grad_norm": 1.2936417306264048e-06, + "learning_rate": 6.922812071659568e-06, + "loss": 0.0, + "num_input_tokens_seen": 17269848, + "step": 28340 + }, + { + "epoch": 7.81715388858246, + "grad_norm": 1.841455855355889e-06, + "learning_rate": 6.914501994282022e-06, + "loss": 0.0, + "num_input_tokens_seen": 17273336, + "step": 28345 + }, + { + "epoch": 7.818532818532819, + "grad_norm": 7.59752765588928e-07, + "learning_rate": 6.906196107054763e-06, + "loss": 0.0, + "num_input_tokens_seen": 17276856, + "step": 28350 + }, + { + "epoch": 7.819911748483177, + "grad_norm": 3.0950404834584333e-06, + "learning_rate": 6.897894411902131e-06, + "loss": 0.0, + "num_input_tokens_seen": 17279704, + "step": 28355 + }, + { + "epoch": 7.821290678433535, + "grad_norm": 1.2038299246341921e-06, + "learning_rate": 6.889596910747529e-06, + "loss": 0.0, + "num_input_tokens_seen": 17282232, + "step": 28360 + }, + { + "epoch": 7.822669608383894, + "grad_norm": 6.589503027498722e-05, + "learning_rate": 6.881303605513356e-06, + "loss": 0.0, + "num_input_tokens_seen": 17285112, + "step": 28365 + }, + { + "epoch": 7.824048538334253, + "grad_norm": 1.265736955247121e-06, + "learning_rate": 6.8730144981210494e-06, + "loss": 0.0, + "num_input_tokens_seen": 17287704, + "step": 28370 + }, + { + "epoch": 7.825427468284611, + "grad_norm": 6.676086741208564e-07, + "learning_rate": 6.8647295904910865e-06, + "loss": 0.0, + "num_input_tokens_seen": 17294136, + "step": 28375 + }, + { + "epoch": 7.82680639823497, + "grad_norm": 6.080244929762557e-07, + "learning_rate": 6.8564488845429515e-06, + "loss": 0.0, + "num_input_tokens_seen": 17296824, + "step": 28380 + }, + { + "epoch": 7.828185328185328, + "grad_norm": 6.67049789626617e-07, + "learning_rate": 6.848172382195167e-06, + "loss": 0.0, + "num_input_tokens_seen": 17299320, + "step": 28385 + }, + { + "epoch": 7.829564258135687, + "grad_norm": 2.1136293071322143e-06, + "learning_rate": 6.839900085365292e-06, + "loss": 0.0, + "num_input_tokens_seen": 17302104, + "step": 28390 + }, + { + "epoch": 7.830943188086045, + "grad_norm": 0.00014805764658376575, + "learning_rate": 6.831631995969881e-06, + "loss": 0.0, + "num_input_tokens_seen": 17304856, + "step": 28395 + }, + { + "epoch": 7.832322118036403, + "grad_norm": 8.207335326915199e-07, + "learning_rate": 6.823368115924547e-06, + "loss": 0.0, + "num_input_tokens_seen": 17307544, + "step": 28400 + }, + { + "epoch": 7.833701047986763, + "grad_norm": 6.421365105779842e-05, + "learning_rate": 6.815108447143906e-06, + "loss": 0.0, + "num_input_tokens_seen": 17310520, + "step": 28405 + }, + { + "epoch": 7.835079977937121, + "grad_norm": 1.0556905181147158e-06, + "learning_rate": 6.806852991541596e-06, + "loss": 0.0, + "num_input_tokens_seen": 17313432, + "step": 28410 + }, + { + "epoch": 7.836458907887479, + "grad_norm": 5.491206593433162e-06, + "learning_rate": 6.798601751030303e-06, + "loss": 0.0, + "num_input_tokens_seen": 17316184, + "step": 28415 + }, + { + "epoch": 7.837837837837838, + "grad_norm": 1.6064579995145323e-06, + "learning_rate": 6.790354727521716e-06, + "loss": 0.0, + "num_input_tokens_seen": 17318648, + "step": 28420 + }, + { + "epoch": 7.839216767788196, + "grad_norm": 1.4826537153567187e-06, + "learning_rate": 6.78211192292654e-06, + "loss": 0.0, + "num_input_tokens_seen": 17321272, + "step": 28425 + }, + { + "epoch": 7.840595697738555, + "grad_norm": 3.115598474323633e-06, + "learning_rate": 6.773873339154535e-06, + "loss": 0.0, + "num_input_tokens_seen": 17323864, + "step": 28430 + }, + { + "epoch": 7.841974627688914, + "grad_norm": 0.0033166457433253527, + "learning_rate": 6.76563897811445e-06, + "loss": 0.0, + "num_input_tokens_seen": 17326712, + "step": 28435 + }, + { + "epoch": 7.843353557639272, + "grad_norm": 0.0003280230157542974, + "learning_rate": 6.757408841714061e-06, + "loss": 0.0, + "num_input_tokens_seen": 17329304, + "step": 28440 + }, + { + "epoch": 7.8447324875896305, + "grad_norm": 7.514058211199881e-07, + "learning_rate": 6.7491829318601905e-06, + "loss": 0.0, + "num_input_tokens_seen": 17333336, + "step": 28445 + }, + { + "epoch": 7.846111417539989, + "grad_norm": 6.168517757032532e-06, + "learning_rate": 6.7409612504586465e-06, + "loss": 0.0, + "num_input_tokens_seen": 17335960, + "step": 28450 + }, + { + "epoch": 7.847490347490347, + "grad_norm": 6.551021670020418e-06, + "learning_rate": 6.732743799414285e-06, + "loss": 0.0, + "num_input_tokens_seen": 17339928, + "step": 28455 + }, + { + "epoch": 7.848869277440706, + "grad_norm": 7.416625749101513e-07, + "learning_rate": 6.724530580630972e-06, + "loss": 0.0, + "num_input_tokens_seen": 17342456, + "step": 28460 + }, + { + "epoch": 7.850248207391065, + "grad_norm": 1.055601387633942e-06, + "learning_rate": 6.716321596011588e-06, + "loss": 0.0, + "num_input_tokens_seen": 17346392, + "step": 28465 + }, + { + "epoch": 7.851627137341423, + "grad_norm": 1.322502725997765e-06, + "learning_rate": 6.708116847458043e-06, + "loss": 0.0, + "num_input_tokens_seen": 17349016, + "step": 28470 + }, + { + "epoch": 7.853006067291782, + "grad_norm": 7.92250148151652e-07, + "learning_rate": 6.699916336871254e-06, + "loss": 0.0, + "num_input_tokens_seen": 17351704, + "step": 28475 + }, + { + "epoch": 7.85438499724214, + "grad_norm": 5.114488430990605e-06, + "learning_rate": 6.6917200661511585e-06, + "loss": 0.0, + "num_input_tokens_seen": 17355000, + "step": 28480 + }, + { + "epoch": 7.855763927192498, + "grad_norm": 5.543940460484009e-06, + "learning_rate": 6.683528037196724e-06, + "loss": 0.0, + "num_input_tokens_seen": 17358104, + "step": 28485 + }, + { + "epoch": 7.857142857142857, + "grad_norm": 1.416839722878649e-06, + "learning_rate": 6.675340251905912e-06, + "loss": 0.0, + "num_input_tokens_seen": 17361400, + "step": 28490 + }, + { + "epoch": 7.858521787093216, + "grad_norm": 1.2802917126464308e-06, + "learning_rate": 6.667156712175735e-06, + "loss": 0.0, + "num_input_tokens_seen": 17365336, + "step": 28495 + }, + { + "epoch": 7.859900717043574, + "grad_norm": 2.033307737292489e-06, + "learning_rate": 6.658977419902188e-06, + "loss": 0.0, + "num_input_tokens_seen": 17367992, + "step": 28500 + }, + { + "epoch": 7.861279646993933, + "grad_norm": 7.231174663502316e-07, + "learning_rate": 6.650802376980292e-06, + "loss": 0.0, + "num_input_tokens_seen": 17370776, + "step": 28505 + }, + { + "epoch": 7.862658576944291, + "grad_norm": 2.4175431008188752e-06, + "learning_rate": 6.642631585304099e-06, + "loss": 0.0, + "num_input_tokens_seen": 17373240, + "step": 28510 + }, + { + "epoch": 7.8640375068946495, + "grad_norm": 1.654983634580276e-06, + "learning_rate": 6.634465046766652e-06, + "loss": 0.0, + "num_input_tokens_seen": 17376888, + "step": 28515 + }, + { + "epoch": 7.865416436845008, + "grad_norm": 0.00012228856212459505, + "learning_rate": 6.6263027632600336e-06, + "loss": 0.0, + "num_input_tokens_seen": 17379864, + "step": 28520 + }, + { + "epoch": 7.866795366795367, + "grad_norm": 0.00017159745038952678, + "learning_rate": 6.618144736675314e-06, + "loss": 0.0, + "num_input_tokens_seen": 17383544, + "step": 28525 + }, + { + "epoch": 7.8681742967457255, + "grad_norm": 9.85985025181435e-07, + "learning_rate": 6.609990968902607e-06, + "loss": 0.0, + "num_input_tokens_seen": 17386808, + "step": 28530 + }, + { + "epoch": 7.869553226696084, + "grad_norm": 0.0001747197238728404, + "learning_rate": 6.601841461831004e-06, + "loss": 0.0, + "num_input_tokens_seen": 17389368, + "step": 28535 + }, + { + "epoch": 7.870932156646442, + "grad_norm": 8.536026143701747e-07, + "learning_rate": 6.593696217348649e-06, + "loss": 0.0, + "num_input_tokens_seen": 17392312, + "step": 28540 + }, + { + "epoch": 7.872311086596801, + "grad_norm": 1.7462222103858949e-06, + "learning_rate": 6.585555237342661e-06, + "loss": 0.0, + "num_input_tokens_seen": 17395704, + "step": 28545 + }, + { + "epoch": 7.873690016547159, + "grad_norm": 5.5486457313236315e-06, + "learning_rate": 6.577418523699206e-06, + "loss": 0.0, + "num_input_tokens_seen": 17398488, + "step": 28550 + }, + { + "epoch": 7.875068946497517, + "grad_norm": 5.943504106653563e-07, + "learning_rate": 6.56928607830343e-06, + "loss": 0.0, + "num_input_tokens_seen": 17401240, + "step": 28555 + }, + { + "epoch": 7.876447876447877, + "grad_norm": 8.341536158695817e-05, + "learning_rate": 6.5611579030395e-06, + "loss": 0.0, + "num_input_tokens_seen": 17406360, + "step": 28560 + }, + { + "epoch": 7.877826806398235, + "grad_norm": 1.3238962992545567e-06, + "learning_rate": 6.553033999790611e-06, + "loss": 0.0, + "num_input_tokens_seen": 17409528, + "step": 28565 + }, + { + "epoch": 7.8792057363485934, + "grad_norm": 1.0292642400600016e-06, + "learning_rate": 6.544914370438954e-06, + "loss": 0.0, + "num_input_tokens_seen": 17413464, + "step": 28570 + }, + { + "epoch": 7.880584666298952, + "grad_norm": 3.2315169846697245e-06, + "learning_rate": 6.536799016865713e-06, + "loss": 0.0, + "num_input_tokens_seen": 17415704, + "step": 28575 + }, + { + "epoch": 7.88196359624931, + "grad_norm": 7.642876198588056e-07, + "learning_rate": 6.5286879409511195e-06, + "loss": 0.0, + "num_input_tokens_seen": 17418136, + "step": 28580 + }, + { + "epoch": 7.8833425261996695, + "grad_norm": 2.1091957478347467e-06, + "learning_rate": 6.520581144574381e-06, + "loss": 0.0, + "num_input_tokens_seen": 17420888, + "step": 28585 + }, + { + "epoch": 7.884721456150028, + "grad_norm": 4.311869361117715e-06, + "learning_rate": 6.512478629613733e-06, + "loss": 0.0, + "num_input_tokens_seen": 17424440, + "step": 28590 + }, + { + "epoch": 7.886100386100386, + "grad_norm": 6.314277811725333e-07, + "learning_rate": 6.504380397946405e-06, + "loss": 0.0, + "num_input_tokens_seen": 17427576, + "step": 28595 + }, + { + "epoch": 7.887479316050745, + "grad_norm": 1.0223786830465542e-06, + "learning_rate": 6.496286451448641e-06, + "loss": 0.0, + "num_input_tokens_seen": 17431096, + "step": 28600 + }, + { + "epoch": 7.888858246001103, + "grad_norm": 1.2080651003998355e-06, + "learning_rate": 6.4881967919957074e-06, + "loss": 0.0, + "num_input_tokens_seen": 17434712, + "step": 28605 + }, + { + "epoch": 7.890237175951461, + "grad_norm": 4.29829833592521e-06, + "learning_rate": 6.480111421461849e-06, + "loss": 0.0, + "num_input_tokens_seen": 17437336, + "step": 28610 + }, + { + "epoch": 7.89161610590182, + "grad_norm": 2.8668637241935357e-06, + "learning_rate": 6.472030341720325e-06, + "loss": 0.0, + "num_input_tokens_seen": 17441304, + "step": 28615 + }, + { + "epoch": 7.892995035852179, + "grad_norm": 4.426591203809949e-06, + "learning_rate": 6.463953554643421e-06, + "loss": 0.0, + "num_input_tokens_seen": 17443896, + "step": 28620 + }, + { + "epoch": 7.894373965802537, + "grad_norm": 5.8864220591203775e-06, + "learning_rate": 6.455881062102401e-06, + "loss": 0.0, + "num_input_tokens_seen": 17447096, + "step": 28625 + }, + { + "epoch": 7.895752895752896, + "grad_norm": 3.821857262664707e-06, + "learning_rate": 6.447812865967543e-06, + "loss": 0.0, + "num_input_tokens_seen": 17449688, + "step": 28630 + }, + { + "epoch": 7.897131825703254, + "grad_norm": 4.964445452060318e-06, + "learning_rate": 6.439748968108148e-06, + "loss": 0.0, + "num_input_tokens_seen": 17453816, + "step": 28635 + }, + { + "epoch": 7.8985107556536125, + "grad_norm": 1.5072276937644347e-06, + "learning_rate": 6.431689370392485e-06, + "loss": 0.0, + "num_input_tokens_seen": 17457016, + "step": 28640 + }, + { + "epoch": 7.899889685603972, + "grad_norm": 1.3415776720648864e-06, + "learning_rate": 6.423634074687865e-06, + "loss": 0.0, + "num_input_tokens_seen": 17459544, + "step": 28645 + }, + { + "epoch": 7.90126861555433, + "grad_norm": 1.765474053172511e-06, + "learning_rate": 6.4155830828605764e-06, + "loss": 0.0, + "num_input_tokens_seen": 17462104, + "step": 28650 + }, + { + "epoch": 7.9026475455046885, + "grad_norm": 0.012541414238512516, + "learning_rate": 6.407536396775915e-06, + "loss": 0.0, + "num_input_tokens_seen": 17464856, + "step": 28655 + }, + { + "epoch": 7.904026475455047, + "grad_norm": 1.3630208741233218e-05, + "learning_rate": 6.399494018298183e-06, + "loss": 0.0, + "num_input_tokens_seen": 17467608, + "step": 28660 + }, + { + "epoch": 7.905405405405405, + "grad_norm": 7.083766035975714e-07, + "learning_rate": 6.391455949290698e-06, + "loss": 0.0, + "num_input_tokens_seen": 17470808, + "step": 28665 + }, + { + "epoch": 7.906784335355764, + "grad_norm": 2.5362332962686196e-05, + "learning_rate": 6.383422191615746e-06, + "loss": 0.0, + "num_input_tokens_seen": 17473912, + "step": 28670 + }, + { + "epoch": 7.908163265306122, + "grad_norm": 9.637657285566092e-07, + "learning_rate": 6.375392747134648e-06, + "loss": 0.0, + "num_input_tokens_seen": 17476792, + "step": 28675 + }, + { + "epoch": 7.909542195256481, + "grad_norm": 6.665807177341776e-07, + "learning_rate": 6.367367617707706e-06, + "loss": 0.0, + "num_input_tokens_seen": 17479384, + "step": 28680 + }, + { + "epoch": 7.91092112520684, + "grad_norm": 1.4985763527874951e-06, + "learning_rate": 6.359346805194219e-06, + "loss": 0.0, + "num_input_tokens_seen": 17482072, + "step": 28685 + }, + { + "epoch": 7.912300055157198, + "grad_norm": 7.799155241627886e-07, + "learning_rate": 6.351330311452508e-06, + "loss": 0.0, + "num_input_tokens_seen": 17484472, + "step": 28690 + }, + { + "epoch": 7.913678985107556, + "grad_norm": 2.8172614747745683e-06, + "learning_rate": 6.3433181383398655e-06, + "loss": 0.0, + "num_input_tokens_seen": 17488472, + "step": 28695 + }, + { + "epoch": 7.915057915057915, + "grad_norm": 1.899271296679217e-06, + "learning_rate": 6.335310287712612e-06, + "loss": 0.0, + "num_input_tokens_seen": 17491192, + "step": 28700 + }, + { + "epoch": 7.916436845008274, + "grad_norm": 1.2236394013598328e-06, + "learning_rate": 6.327306761426046e-06, + "loss": 0.0, + "num_input_tokens_seen": 17494168, + "step": 28705 + }, + { + "epoch": 7.917815774958632, + "grad_norm": 1.0517568398427102e-06, + "learning_rate": 6.319307561334459e-06, + "loss": 0.0, + "num_input_tokens_seen": 17497464, + "step": 28710 + }, + { + "epoch": 7.919194704908991, + "grad_norm": 5.366353070712648e-05, + "learning_rate": 6.311312689291166e-06, + "loss": 0.0, + "num_input_tokens_seen": 17500472, + "step": 28715 + }, + { + "epoch": 7.920573634859349, + "grad_norm": 1.5497785170737188e-06, + "learning_rate": 6.303322147148458e-06, + "loss": 0.0, + "num_input_tokens_seen": 17502680, + "step": 28720 + }, + { + "epoch": 7.9219525648097076, + "grad_norm": 3.2063403523352463e-06, + "learning_rate": 6.295335936757621e-06, + "loss": 0.0, + "num_input_tokens_seen": 17506168, + "step": 28725 + }, + { + "epoch": 7.923331494760066, + "grad_norm": 1.744980295370624e-06, + "learning_rate": 6.287354059968955e-06, + "loss": 0.0, + "num_input_tokens_seen": 17509016, + "step": 28730 + }, + { + "epoch": 7.924710424710424, + "grad_norm": 1.017656245494436e-06, + "learning_rate": 6.2793765186317495e-06, + "loss": 0.0, + "num_input_tokens_seen": 17511736, + "step": 28735 + }, + { + "epoch": 7.926089354660784, + "grad_norm": 1.7215952539118007e-06, + "learning_rate": 6.271403314594274e-06, + "loss": 0.0, + "num_input_tokens_seen": 17513816, + "step": 28740 + }, + { + "epoch": 7.927468284611142, + "grad_norm": 1.6402877008658834e-05, + "learning_rate": 6.26343444970382e-06, + "loss": 0.0, + "num_input_tokens_seen": 17516376, + "step": 28745 + }, + { + "epoch": 7.9288472145615, + "grad_norm": 8.11949234957865e-07, + "learning_rate": 6.255469925806643e-06, + "loss": 0.0, + "num_input_tokens_seen": 17519512, + "step": 28750 + }, + { + "epoch": 7.930226144511859, + "grad_norm": 1.309135996052646e-06, + "learning_rate": 6.2475097447480255e-06, + "loss": 0.0, + "num_input_tokens_seen": 17523032, + "step": 28755 + }, + { + "epoch": 7.931605074462217, + "grad_norm": 1.2800639979104744e-06, + "learning_rate": 6.23955390837222e-06, + "loss": 0.0, + "num_input_tokens_seen": 17526392, + "step": 28760 + }, + { + "epoch": 7.932984004412576, + "grad_norm": 5.67646907256858e-07, + "learning_rate": 6.23160241852247e-06, + "loss": 0.0, + "num_input_tokens_seen": 17529400, + "step": 28765 + }, + { + "epoch": 7.934362934362935, + "grad_norm": 0.00013927850523032248, + "learning_rate": 6.2236552770410376e-06, + "loss": 0.0, + "num_input_tokens_seen": 17532984, + "step": 28770 + }, + { + "epoch": 7.935741864313293, + "grad_norm": 1.2997182921026251e-06, + "learning_rate": 6.215712485769154e-06, + "loss": 0.0, + "num_input_tokens_seen": 17536312, + "step": 28775 + }, + { + "epoch": 7.9371207942636515, + "grad_norm": 7.6106897495265e-07, + "learning_rate": 6.207774046547044e-06, + "loss": 0.0, + "num_input_tokens_seen": 17539480, + "step": 28780 + }, + { + "epoch": 7.93849972421401, + "grad_norm": 0.00013079245400149375, + "learning_rate": 6.199839961213941e-06, + "loss": 0.0, + "num_input_tokens_seen": 17542520, + "step": 28785 + }, + { + "epoch": 7.939878654164368, + "grad_norm": 3.720801032613963e-06, + "learning_rate": 6.191910231608047e-06, + "loss": 0.0, + "num_input_tokens_seen": 17545560, + "step": 28790 + }, + { + "epoch": 7.941257584114727, + "grad_norm": 4.856781379203312e-05, + "learning_rate": 6.18398485956658e-06, + "loss": 0.0, + "num_input_tokens_seen": 17547992, + "step": 28795 + }, + { + "epoch": 7.942636514065086, + "grad_norm": 7.420725296469755e-07, + "learning_rate": 6.17606384692572e-06, + "loss": 0.0, + "num_input_tokens_seen": 17550744, + "step": 28800 + }, + { + "epoch": 7.944015444015444, + "grad_norm": 1.7534910057293018e-06, + "learning_rate": 6.168147195520662e-06, + "loss": 0.0, + "num_input_tokens_seen": 17553208, + "step": 28805 + }, + { + "epoch": 7.945394373965803, + "grad_norm": 9.355613656225614e-06, + "learning_rate": 6.160234907185586e-06, + "loss": 0.0, + "num_input_tokens_seen": 17557048, + "step": 28810 + }, + { + "epoch": 7.946773303916161, + "grad_norm": 3.0570977287425194e-06, + "learning_rate": 6.1523269837536455e-06, + "loss": 0.0, + "num_input_tokens_seen": 17560856, + "step": 28815 + }, + { + "epoch": 7.948152233866519, + "grad_norm": 1.0734253919508774e-05, + "learning_rate": 6.144423427056989e-06, + "loss": 0.0, + "num_input_tokens_seen": 17564728, + "step": 28820 + }, + { + "epoch": 7.949531163816878, + "grad_norm": 7.107840065145865e-05, + "learning_rate": 6.136524238926772e-06, + "loss": 0.0, + "num_input_tokens_seen": 17567224, + "step": 28825 + }, + { + "epoch": 7.950910093767237, + "grad_norm": 8.570263162255287e-06, + "learning_rate": 6.128629421193107e-06, + "loss": 0.0, + "num_input_tokens_seen": 17570328, + "step": 28830 + }, + { + "epoch": 7.952289023717595, + "grad_norm": 1.49069671806501e-06, + "learning_rate": 6.120738975685125e-06, + "loss": 0.0, + "num_input_tokens_seen": 17572632, + "step": 28835 + }, + { + "epoch": 7.953667953667954, + "grad_norm": 6.21957553903485e-07, + "learning_rate": 6.1128529042309205e-06, + "loss": 0.0, + "num_input_tokens_seen": 17575768, + "step": 28840 + }, + { + "epoch": 7.955046883618312, + "grad_norm": 3.7796812648593914e-06, + "learning_rate": 6.1049712086575785e-06, + "loss": 0.0, + "num_input_tokens_seen": 17579192, + "step": 28845 + }, + { + "epoch": 7.9564258135686705, + "grad_norm": 1.7420553604097222e-06, + "learning_rate": 6.0970938907911865e-06, + "loss": 0.0, + "num_input_tokens_seen": 17582488, + "step": 28850 + }, + { + "epoch": 7.957804743519029, + "grad_norm": 9.806030902836937e-07, + "learning_rate": 6.089220952456798e-06, + "loss": 0.0, + "num_input_tokens_seen": 17585592, + "step": 28855 + }, + { + "epoch": 7.959183673469388, + "grad_norm": 7.90848162068869e-07, + "learning_rate": 6.081352395478457e-06, + "loss": 0.0, + "num_input_tokens_seen": 17588856, + "step": 28860 + }, + { + "epoch": 7.9605626034197465, + "grad_norm": 0.0007262416766025126, + "learning_rate": 6.073488221679205e-06, + "loss": 0.0, + "num_input_tokens_seen": 17591448, + "step": 28865 + }, + { + "epoch": 7.961941533370105, + "grad_norm": 1.7055823491318733e-06, + "learning_rate": 6.065628432881046e-06, + "loss": 0.0, + "num_input_tokens_seen": 17594008, + "step": 28870 + }, + { + "epoch": 7.963320463320463, + "grad_norm": 1.097046379072708e-06, + "learning_rate": 6.05777303090499e-06, + "loss": 0.0, + "num_input_tokens_seen": 17597272, + "step": 28875 + }, + { + "epoch": 7.964699393270822, + "grad_norm": 7.790613381075673e-06, + "learning_rate": 6.049922017571022e-06, + "loss": 0.0, + "num_input_tokens_seen": 17600248, + "step": 28880 + }, + { + "epoch": 7.96607832322118, + "grad_norm": 2.014863184740534e-06, + "learning_rate": 6.042075394698099e-06, + "loss": 0.0, + "num_input_tokens_seen": 17603480, + "step": 28885 + }, + { + "epoch": 7.967457253171538, + "grad_norm": 1.5923475302770385e-06, + "learning_rate": 6.0342331641041836e-06, + "loss": 0.0, + "num_input_tokens_seen": 17606232, + "step": 28890 + }, + { + "epoch": 7.968836183121898, + "grad_norm": 1.264495836039714e-06, + "learning_rate": 6.026395327606202e-06, + "loss": 0.0, + "num_input_tokens_seen": 17609048, + "step": 28895 + }, + { + "epoch": 7.970215113072256, + "grad_norm": 3.799655360126053e-06, + "learning_rate": 6.018561887020061e-06, + "loss": 0.0, + "num_input_tokens_seen": 17611288, + "step": 28900 + }, + { + "epoch": 7.971594043022614, + "grad_norm": 9.195428560815344e-07, + "learning_rate": 6.010732844160669e-06, + "loss": 0.0, + "num_input_tokens_seen": 17614136, + "step": 28905 + }, + { + "epoch": 7.972972972972973, + "grad_norm": 1.4195646826919983e-06, + "learning_rate": 6.002908200841901e-06, + "loss": 0.0, + "num_input_tokens_seen": 17617656, + "step": 28910 + }, + { + "epoch": 7.974351902923331, + "grad_norm": 0.00021896172256674618, + "learning_rate": 5.9950879588766e-06, + "loss": 0.0, + "num_input_tokens_seen": 17620504, + "step": 28915 + }, + { + "epoch": 7.9757308328736904, + "grad_norm": 1.1010684829670936e-05, + "learning_rate": 5.98727212007662e-06, + "loss": 0.0, + "num_input_tokens_seen": 17623192, + "step": 28920 + }, + { + "epoch": 7.977109762824049, + "grad_norm": 8.790059382590698e-07, + "learning_rate": 5.979460686252769e-06, + "loss": 0.0, + "num_input_tokens_seen": 17626072, + "step": 28925 + }, + { + "epoch": 7.978488692774407, + "grad_norm": 8.012231660359248e-07, + "learning_rate": 5.971653659214854e-06, + "loss": 0.0, + "num_input_tokens_seen": 17629016, + "step": 28930 + }, + { + "epoch": 7.979867622724766, + "grad_norm": 7.52589755848021e-07, + "learning_rate": 5.9638510407716394e-06, + "loss": 0.0, + "num_input_tokens_seen": 17632376, + "step": 28935 + }, + { + "epoch": 7.981246552675124, + "grad_norm": 1.1536266129041906e-06, + "learning_rate": 5.956052832730891e-06, + "loss": 0.0, + "num_input_tokens_seen": 17635352, + "step": 28940 + }, + { + "epoch": 7.982625482625482, + "grad_norm": 1.0132839634025004e-06, + "learning_rate": 5.948259036899332e-06, + "loss": 0.0, + "num_input_tokens_seen": 17637688, + "step": 28945 + }, + { + "epoch": 7.984004412575841, + "grad_norm": 6.896979357406963e-06, + "learning_rate": 5.940469655082681e-06, + "loss": 0.0, + "num_input_tokens_seen": 17640472, + "step": 28950 + }, + { + "epoch": 7.9853833425262, + "grad_norm": 8.219049618674035e-07, + "learning_rate": 5.932684689085619e-06, + "loss": 0.0, + "num_input_tokens_seen": 17643320, + "step": 28955 + }, + { + "epoch": 7.986762272476558, + "grad_norm": 7.71410486777313e-06, + "learning_rate": 5.924904140711818e-06, + "loss": 0.0, + "num_input_tokens_seen": 17646296, + "step": 28960 + }, + { + "epoch": 7.988141202426917, + "grad_norm": 1.170013001683401e-05, + "learning_rate": 5.9171280117639176e-06, + "loss": 0.0, + "num_input_tokens_seen": 17649464, + "step": 28965 + }, + { + "epoch": 7.989520132377275, + "grad_norm": 0.003216825658455491, + "learning_rate": 5.909356304043526e-06, + "loss": 0.0, + "num_input_tokens_seen": 17653144, + "step": 28970 + }, + { + "epoch": 7.9908990623276335, + "grad_norm": 3.6667199765361147e-06, + "learning_rate": 5.90158901935125e-06, + "loss": 0.0, + "num_input_tokens_seen": 17655640, + "step": 28975 + }, + { + "epoch": 7.992277992277993, + "grad_norm": 1.0078809964397806e-06, + "learning_rate": 5.893826159486646e-06, + "loss": 0.0, + "num_input_tokens_seen": 17658552, + "step": 28980 + }, + { + "epoch": 7.993656922228351, + "grad_norm": 6.890233521517075e-07, + "learning_rate": 5.886067726248267e-06, + "loss": 0.0, + "num_input_tokens_seen": 17663032, + "step": 28985 + }, + { + "epoch": 7.9950358521787095, + "grad_norm": 6.497849653896992e-07, + "learning_rate": 5.878313721433629e-06, + "loss": 0.0, + "num_input_tokens_seen": 17666104, + "step": 28990 + }, + { + "epoch": 7.996414782129068, + "grad_norm": 3.7287936720531434e-06, + "learning_rate": 5.8705641468392135e-06, + "loss": 0.0, + "num_input_tokens_seen": 17669400, + "step": 28995 + }, + { + "epoch": 7.997793712079426, + "grad_norm": 3.946635843021795e-06, + "learning_rate": 5.8628190042605e-06, + "loss": 0.0, + "num_input_tokens_seen": 17672600, + "step": 29000 + }, + { + "epoch": 7.999172642029785, + "grad_norm": 6.004864303577051e-07, + "learning_rate": 5.855078295491914e-06, + "loss": 0.0, + "num_input_tokens_seen": 17676600, + "step": 29005 + }, + { + "epoch": 8.0, + "eval_loss": 0.3509182929992676, + "eval_runtime": 28.5311, + "eval_samples_per_second": 56.5, + "eval_steps_per_second": 14.125, + "num_input_tokens_seen": 17678024, + "step": 29008 + }, + { + "epoch": 8.000551571980143, + "grad_norm": 6.303437203314388e-07, + "learning_rate": 5.847342022326879e-06, + "loss": 0.0, + "num_input_tokens_seen": 17679144, + "step": 29010 + }, + { + "epoch": 8.001930501930502, + "grad_norm": 3.2836578611750156e-06, + "learning_rate": 5.839610186557765e-06, + "loss": 0.0, + "num_input_tokens_seen": 17682024, + "step": 29015 + }, + { + "epoch": 8.00330943188086, + "grad_norm": 9.050633025253774e-07, + "learning_rate": 5.83188278997594e-06, + "loss": 0.0, + "num_input_tokens_seen": 17685288, + "step": 29020 + }, + { + "epoch": 8.004688361831219, + "grad_norm": 6.182032393553527e-06, + "learning_rate": 5.82415983437172e-06, + "loss": 0.0, + "num_input_tokens_seen": 17688232, + "step": 29025 + }, + { + "epoch": 8.006067291781578, + "grad_norm": 6.749388035132142e-07, + "learning_rate": 5.816441321534416e-06, + "loss": 0.0, + "num_input_tokens_seen": 17691496, + "step": 29030 + }, + { + "epoch": 8.007446221731936, + "grad_norm": 2.9920338420197368e-06, + "learning_rate": 5.80872725325228e-06, + "loss": 0.0, + "num_input_tokens_seen": 17694696, + "step": 29035 + }, + { + "epoch": 8.008825151682295, + "grad_norm": 6.703595545332064e-07, + "learning_rate": 5.801017631312566e-06, + "loss": 0.0, + "num_input_tokens_seen": 17697352, + "step": 29040 + }, + { + "epoch": 8.010204081632653, + "grad_norm": 8.060866093728691e-06, + "learning_rate": 5.793312457501479e-06, + "loss": 0.0, + "num_input_tokens_seen": 17700360, + "step": 29045 + }, + { + "epoch": 8.011583011583012, + "grad_norm": 6.798858294132515e-07, + "learning_rate": 5.785611733604188e-06, + "loss": 0.0, + "num_input_tokens_seen": 17703080, + "step": 29050 + }, + { + "epoch": 8.01296194153337, + "grad_norm": 3.2347877549909754e-06, + "learning_rate": 5.777915461404853e-06, + "loss": 0.0, + "num_input_tokens_seen": 17705544, + "step": 29055 + }, + { + "epoch": 8.014340871483729, + "grad_norm": 8.544550382794114e-07, + "learning_rate": 5.770223642686584e-06, + "loss": 0.0, + "num_input_tokens_seen": 17708488, + "step": 29060 + }, + { + "epoch": 8.015719801434088, + "grad_norm": 2.603507027743035e-06, + "learning_rate": 5.762536279231456e-06, + "loss": 0.0, + "num_input_tokens_seen": 17711144, + "step": 29065 + }, + { + "epoch": 8.017098731384445, + "grad_norm": 1.314380028816231e-06, + "learning_rate": 5.754853372820537e-06, + "loss": 0.0, + "num_input_tokens_seen": 17713224, + "step": 29070 + }, + { + "epoch": 8.018477661334805, + "grad_norm": 1.3353220538192545e-06, + "learning_rate": 5.747174925233834e-06, + "loss": 0.0, + "num_input_tokens_seen": 17715720, + "step": 29075 + }, + { + "epoch": 8.019856591285162, + "grad_norm": 1.916635255838628e-06, + "learning_rate": 5.739500938250336e-06, + "loss": 0.0, + "num_input_tokens_seen": 17719432, + "step": 29080 + }, + { + "epoch": 8.021235521235521, + "grad_norm": 0.00014014860789757222, + "learning_rate": 5.731831413648003e-06, + "loss": 0.0, + "num_input_tokens_seen": 17721544, + "step": 29085 + }, + { + "epoch": 8.02261445118588, + "grad_norm": 0.00014983209257479757, + "learning_rate": 5.7241663532037415e-06, + "loss": 0.0, + "num_input_tokens_seen": 17724616, + "step": 29090 + }, + { + "epoch": 8.023993381136238, + "grad_norm": 5.388333647715626e-06, + "learning_rate": 5.7165057586934525e-06, + "loss": 0.0, + "num_input_tokens_seen": 17729512, + "step": 29095 + }, + { + "epoch": 8.025372311086597, + "grad_norm": 6.150689273454191e-07, + "learning_rate": 5.708849631891972e-06, + "loss": 0.0, + "num_input_tokens_seen": 17733352, + "step": 29100 + }, + { + "epoch": 8.026751241036955, + "grad_norm": 1.0473441989233834e-06, + "learning_rate": 5.701197974573114e-06, + "loss": 0.0, + "num_input_tokens_seen": 17736200, + "step": 29105 + }, + { + "epoch": 8.028130170987314, + "grad_norm": 2.4192672753997613e-06, + "learning_rate": 5.693550788509669e-06, + "loss": 0.0, + "num_input_tokens_seen": 17739304, + "step": 29110 + }, + { + "epoch": 8.029509100937672, + "grad_norm": 1.8409931726637296e-05, + "learning_rate": 5.685908075473373e-06, + "loss": 0.0, + "num_input_tokens_seen": 17742312, + "step": 29115 + }, + { + "epoch": 8.03088803088803, + "grad_norm": 6.038132482899528e-07, + "learning_rate": 5.6782698372349255e-06, + "loss": 0.0, + "num_input_tokens_seen": 17745800, + "step": 29120 + }, + { + "epoch": 8.03226696083839, + "grad_norm": 1.044266014105233e-06, + "learning_rate": 5.670636075564012e-06, + "loss": 0.0, + "num_input_tokens_seen": 17749160, + "step": 29125 + }, + { + "epoch": 8.033645890788748, + "grad_norm": 7.928741752039059e-07, + "learning_rate": 5.66300679222925e-06, + "loss": 0.0, + "num_input_tokens_seen": 17752552, + "step": 29130 + }, + { + "epoch": 8.035024820739107, + "grad_norm": 2.5774368623388e-05, + "learning_rate": 5.655381988998252e-06, + "loss": 0.0, + "num_input_tokens_seen": 17756104, + "step": 29135 + }, + { + "epoch": 8.036403750689464, + "grad_norm": 1.7602042134967633e-06, + "learning_rate": 5.647761667637564e-06, + "loss": 0.0, + "num_input_tokens_seen": 17758952, + "step": 29140 + }, + { + "epoch": 8.037782680639824, + "grad_norm": 6.424066896215663e-07, + "learning_rate": 5.640145829912702e-06, + "loss": 0.0, + "num_input_tokens_seen": 17761864, + "step": 29145 + }, + { + "epoch": 8.039161610590183, + "grad_norm": 6.167702849779744e-06, + "learning_rate": 5.63253447758815e-06, + "loss": 0.0, + "num_input_tokens_seen": 17764008, + "step": 29150 + }, + { + "epoch": 8.04054054054054, + "grad_norm": 3.4307249734411016e-05, + "learning_rate": 5.624927612427361e-06, + "loss": 0.0, + "num_input_tokens_seen": 17767080, + "step": 29155 + }, + { + "epoch": 8.0419194704909, + "grad_norm": 1.9516671727615176e-06, + "learning_rate": 5.617325236192716e-06, + "loss": 0.0, + "num_input_tokens_seen": 17769480, + "step": 29160 + }, + { + "epoch": 8.043298400441257, + "grad_norm": 2.9773812002531486e-06, + "learning_rate": 5.609727350645597e-06, + "loss": 0.0, + "num_input_tokens_seen": 17774280, + "step": 29165 + }, + { + "epoch": 8.044677330391616, + "grad_norm": 1.3912474059907254e-05, + "learning_rate": 5.6021339575463044e-06, + "loss": 0.0, + "num_input_tokens_seen": 17776392, + "step": 29170 + }, + { + "epoch": 8.046056260341974, + "grad_norm": 5.092578021503869e-07, + "learning_rate": 5.594545058654138e-06, + "loss": 0.0, + "num_input_tokens_seen": 17779048, + "step": 29175 + }, + { + "epoch": 8.047435190292333, + "grad_norm": 2.882876742660301e-06, + "learning_rate": 5.586960655727327e-06, + "loss": 0.0, + "num_input_tokens_seen": 17782088, + "step": 29180 + }, + { + "epoch": 8.048814120242692, + "grad_norm": 7.716949426139763e-07, + "learning_rate": 5.579380750523061e-06, + "loss": 0.0, + "num_input_tokens_seen": 17785128, + "step": 29185 + }, + { + "epoch": 8.05019305019305, + "grad_norm": 6.847171334811719e-06, + "learning_rate": 5.5718053447975125e-06, + "loss": 0.0, + "num_input_tokens_seen": 17790216, + "step": 29190 + }, + { + "epoch": 8.05157198014341, + "grad_norm": 8.132395123539027e-06, + "learning_rate": 5.564234440305782e-06, + "loss": 0.0, + "num_input_tokens_seen": 17793352, + "step": 29195 + }, + { + "epoch": 8.052950910093767, + "grad_norm": 2.6137326130992733e-05, + "learning_rate": 5.556668038801938e-06, + "loss": 0.0, + "num_input_tokens_seen": 17796456, + "step": 29200 + }, + { + "epoch": 8.054329840044126, + "grad_norm": 1.9503768271533772e-06, + "learning_rate": 5.549106142039018e-06, + "loss": 0.0, + "num_input_tokens_seen": 17800264, + "step": 29205 + }, + { + "epoch": 8.055708769994485, + "grad_norm": 8.23324683096871e-07, + "learning_rate": 5.5415487517689904e-06, + "loss": 0.0, + "num_input_tokens_seen": 17803560, + "step": 29210 + }, + { + "epoch": 8.057087699944843, + "grad_norm": 5.782594598713331e-07, + "learning_rate": 5.5339958697428115e-06, + "loss": 0.0, + "num_input_tokens_seen": 17806568, + "step": 29215 + }, + { + "epoch": 8.058466629895202, + "grad_norm": 2.378482349740807e-06, + "learning_rate": 5.526447497710357e-06, + "loss": 0.0, + "num_input_tokens_seen": 17809096, + "step": 29220 + }, + { + "epoch": 8.05984555984556, + "grad_norm": 9.674815373728052e-06, + "learning_rate": 5.518903637420486e-06, + "loss": 0.0, + "num_input_tokens_seen": 17811464, + "step": 29225 + }, + { + "epoch": 8.061224489795919, + "grad_norm": 2.7354140001989435e-06, + "learning_rate": 5.51136429062101e-06, + "loss": 0.0, + "num_input_tokens_seen": 17815496, + "step": 29230 + }, + { + "epoch": 8.062603419746276, + "grad_norm": 1.2059815162501764e-05, + "learning_rate": 5.5038294590586816e-06, + "loss": 0.0, + "num_input_tokens_seen": 17818184, + "step": 29235 + }, + { + "epoch": 8.063982349696635, + "grad_norm": 1.7014752302202396e-05, + "learning_rate": 5.496299144479203e-06, + "loss": 0.0, + "num_input_tokens_seen": 17821064, + "step": 29240 + }, + { + "epoch": 8.065361279646995, + "grad_norm": 3.0453948056674562e-06, + "learning_rate": 5.488773348627254e-06, + "loss": 0.0, + "num_input_tokens_seen": 17823464, + "step": 29245 + }, + { + "epoch": 8.066740209597352, + "grad_norm": 6.54093867069605e-07, + "learning_rate": 5.481252073246448e-06, + "loss": 0.0, + "num_input_tokens_seen": 17827336, + "step": 29250 + }, + { + "epoch": 8.068119139547711, + "grad_norm": 2.3374100237560924e-06, + "learning_rate": 5.473735320079351e-06, + "loss": 0.0, + "num_input_tokens_seen": 17829576, + "step": 29255 + }, + { + "epoch": 8.069498069498069, + "grad_norm": 1.0020327181337052e-06, + "learning_rate": 5.466223090867498e-06, + "loss": 0.0, + "num_input_tokens_seen": 17833256, + "step": 29260 + }, + { + "epoch": 8.070876999448428, + "grad_norm": 7.975388029990427e-07, + "learning_rate": 5.458715387351352e-06, + "loss": 0.0, + "num_input_tokens_seen": 17836424, + "step": 29265 + }, + { + "epoch": 8.072255929398786, + "grad_norm": 2.851196586561855e-05, + "learning_rate": 5.451212211270354e-06, + "loss": 0.0, + "num_input_tokens_seen": 17839336, + "step": 29270 + }, + { + "epoch": 8.073634859349145, + "grad_norm": 9.35058960749302e-06, + "learning_rate": 5.443713564362876e-06, + "loss": 0.0, + "num_input_tokens_seen": 17842120, + "step": 29275 + }, + { + "epoch": 8.075013789299504, + "grad_norm": 0.0007304656319320202, + "learning_rate": 5.436219448366237e-06, + "loss": 0.0, + "num_input_tokens_seen": 17845192, + "step": 29280 + }, + { + "epoch": 8.076392719249862, + "grad_norm": 7.295460591194569e-07, + "learning_rate": 5.428729865016727e-06, + "loss": 0.0, + "num_input_tokens_seen": 17849064, + "step": 29285 + }, + { + "epoch": 8.077771649200221, + "grad_norm": 6.552248805746785e-07, + "learning_rate": 5.421244816049578e-06, + "loss": 0.0, + "num_input_tokens_seen": 17853032, + "step": 29290 + }, + { + "epoch": 8.079150579150578, + "grad_norm": 2.730414735196973e-06, + "learning_rate": 5.4137643031989565e-06, + "loss": 0.0, + "num_input_tokens_seen": 17855848, + "step": 29295 + }, + { + "epoch": 8.080529509100938, + "grad_norm": 1.3532162483897991e-06, + "learning_rate": 5.406288328198006e-06, + "loss": 0.0, + "num_input_tokens_seen": 17860040, + "step": 29300 + }, + { + "epoch": 8.081908439051297, + "grad_norm": 5.395814355324546e-07, + "learning_rate": 5.398816892778791e-06, + "loss": 0.0, + "num_input_tokens_seen": 17862600, + "step": 29305 + }, + { + "epoch": 8.083287369001654, + "grad_norm": 2.0471552488743328e-06, + "learning_rate": 5.391349998672335e-06, + "loss": 0.0, + "num_input_tokens_seen": 17865224, + "step": 29310 + }, + { + "epoch": 8.084666298952014, + "grad_norm": 4.468013685254846e-06, + "learning_rate": 5.383887647608618e-06, + "loss": 0.0, + "num_input_tokens_seen": 17869768, + "step": 29315 + }, + { + "epoch": 8.086045228902371, + "grad_norm": 3.227316483389586e-06, + "learning_rate": 5.376429841316546e-06, + "loss": 0.0, + "num_input_tokens_seen": 17872360, + "step": 29320 + }, + { + "epoch": 8.08742415885273, + "grad_norm": 3.055560591747053e-05, + "learning_rate": 5.3689765815240045e-06, + "loss": 0.0, + "num_input_tokens_seen": 17875336, + "step": 29325 + }, + { + "epoch": 8.088803088803088, + "grad_norm": 9.46155414567329e-05, + "learning_rate": 5.361527869957797e-06, + "loss": 0.0, + "num_input_tokens_seen": 17879912, + "step": 29330 + }, + { + "epoch": 8.090182018753447, + "grad_norm": 5.453981088976434e-07, + "learning_rate": 5.354083708343677e-06, + "loss": 0.0, + "num_input_tokens_seen": 17883208, + "step": 29335 + }, + { + "epoch": 8.091560948703806, + "grad_norm": 1.1921924851776566e-06, + "learning_rate": 5.346644098406362e-06, + "loss": 0.0, + "num_input_tokens_seen": 17885960, + "step": 29340 + }, + { + "epoch": 8.092939878654164, + "grad_norm": 1.0339059599573375e-06, + "learning_rate": 5.339209041869494e-06, + "loss": 0.0, + "num_input_tokens_seen": 17889448, + "step": 29345 + }, + { + "epoch": 8.094318808604523, + "grad_norm": 2.634802967804717e-06, + "learning_rate": 5.331778540455667e-06, + "loss": 0.0, + "num_input_tokens_seen": 17891656, + "step": 29350 + }, + { + "epoch": 8.09569773855488, + "grad_norm": 1.0261434226777055e-06, + "learning_rate": 5.324352595886428e-06, + "loss": 0.0, + "num_input_tokens_seen": 17893992, + "step": 29355 + }, + { + "epoch": 8.09707666850524, + "grad_norm": 6.124403739704576e-07, + "learning_rate": 5.316931209882267e-06, + "loss": 0.0, + "num_input_tokens_seen": 17896872, + "step": 29360 + }, + { + "epoch": 8.0984555984556, + "grad_norm": 0.001273138215765357, + "learning_rate": 5.3095143841625975e-06, + "loss": 0.0, + "num_input_tokens_seen": 17899976, + "step": 29365 + }, + { + "epoch": 8.099834528405957, + "grad_norm": 7.483815807063365e-07, + "learning_rate": 5.302102120445807e-06, + "loss": 0.0, + "num_input_tokens_seen": 17903752, + "step": 29370 + }, + { + "epoch": 8.101213458356316, + "grad_norm": 5.701130021407153e-07, + "learning_rate": 5.294694420449195e-06, + "loss": 0.0, + "num_input_tokens_seen": 17908040, + "step": 29375 + }, + { + "epoch": 8.102592388306673, + "grad_norm": 2.675822315723053e-06, + "learning_rate": 5.2872912858890346e-06, + "loss": 0.0, + "num_input_tokens_seen": 17910920, + "step": 29380 + }, + { + "epoch": 8.103971318257033, + "grad_norm": 6.185653660395474e-07, + "learning_rate": 5.279892718480519e-06, + "loss": 0.0, + "num_input_tokens_seen": 17913448, + "step": 29385 + }, + { + "epoch": 8.10535024820739, + "grad_norm": 3.247977474529762e-06, + "learning_rate": 5.272498719937782e-06, + "loss": 0.0, + "num_input_tokens_seen": 17915656, + "step": 29390 + }, + { + "epoch": 8.10672917815775, + "grad_norm": 1.3863024150850833e-06, + "learning_rate": 5.265109291973924e-06, + "loss": 0.0, + "num_input_tokens_seen": 17918344, + "step": 29395 + }, + { + "epoch": 8.108108108108109, + "grad_norm": 2.7579681045608595e-06, + "learning_rate": 5.257724436300959e-06, + "loss": 0.0, + "num_input_tokens_seen": 17921000, + "step": 29400 + }, + { + "epoch": 8.109487038058466, + "grad_norm": 1.072694658432738e-06, + "learning_rate": 5.250344154629846e-06, + "loss": 0.0, + "num_input_tokens_seen": 17923944, + "step": 29405 + }, + { + "epoch": 8.110865968008826, + "grad_norm": 1.9287303985038307e-06, + "learning_rate": 5.242968448670504e-06, + "loss": 0.0, + "num_input_tokens_seen": 17926440, + "step": 29410 + }, + { + "epoch": 8.112244897959183, + "grad_norm": 7.34168907001731e-07, + "learning_rate": 5.235597320131766e-06, + "loss": 0.0, + "num_input_tokens_seen": 17929096, + "step": 29415 + }, + { + "epoch": 8.113623827909542, + "grad_norm": 7.453273428836837e-05, + "learning_rate": 5.228230770721429e-06, + "loss": 0.0, + "num_input_tokens_seen": 17931560, + "step": 29420 + }, + { + "epoch": 8.115002757859902, + "grad_norm": 2.147012082787114e-06, + "learning_rate": 5.220868802146203e-06, + "loss": 0.0, + "num_input_tokens_seen": 17934984, + "step": 29425 + }, + { + "epoch": 8.116381687810259, + "grad_norm": 5.834418175254541e-07, + "learning_rate": 5.213511416111758e-06, + "loss": 0.0, + "num_input_tokens_seen": 17938024, + "step": 29430 + }, + { + "epoch": 8.117760617760618, + "grad_norm": 5.509223797162122e-07, + "learning_rate": 5.206158614322701e-06, + "loss": 0.0, + "num_input_tokens_seen": 17940840, + "step": 29435 + }, + { + "epoch": 8.119139547710976, + "grad_norm": 0.0004186531878076494, + "learning_rate": 5.198810398482562e-06, + "loss": 0.0, + "num_input_tokens_seen": 17943720, + "step": 29440 + }, + { + "epoch": 8.120518477661335, + "grad_norm": 5.234844593360322e-07, + "learning_rate": 5.191466770293815e-06, + "loss": 0.0, + "num_input_tokens_seen": 17946312, + "step": 29445 + }, + { + "epoch": 8.121897407611693, + "grad_norm": 0.0004534851759672165, + "learning_rate": 5.184127731457883e-06, + "loss": 0.0, + "num_input_tokens_seen": 17948584, + "step": 29450 + }, + { + "epoch": 8.123276337562052, + "grad_norm": 6.465334081440233e-07, + "learning_rate": 5.17679328367511e-06, + "loss": 0.0, + "num_input_tokens_seen": 17951848, + "step": 29455 + }, + { + "epoch": 8.124655267512411, + "grad_norm": 3.6490550883172546e-06, + "learning_rate": 5.169463428644778e-06, + "loss": 0.0, + "num_input_tokens_seen": 17954952, + "step": 29460 + }, + { + "epoch": 8.126034197462769, + "grad_norm": 7.6044757406634744e-06, + "learning_rate": 5.162138168065117e-06, + "loss": 0.0, + "num_input_tokens_seen": 17957928, + "step": 29465 + }, + { + "epoch": 8.127413127413128, + "grad_norm": 1.1066899787692819e-06, + "learning_rate": 5.154817503633275e-06, + "loss": 0.0, + "num_input_tokens_seen": 17962184, + "step": 29470 + }, + { + "epoch": 8.128792057363485, + "grad_norm": 3.5825982195092365e-05, + "learning_rate": 5.14750143704536e-06, + "loss": 0.0, + "num_input_tokens_seen": 17965736, + "step": 29475 + }, + { + "epoch": 8.130170987313845, + "grad_norm": 8.529305546289834e-07, + "learning_rate": 5.140189969996389e-06, + "loss": 0.0, + "num_input_tokens_seen": 17969576, + "step": 29480 + }, + { + "epoch": 8.131549917264204, + "grad_norm": 2.1228595414868323e-06, + "learning_rate": 5.1328831041803224e-06, + "loss": 0.0, + "num_input_tokens_seen": 17974280, + "step": 29485 + }, + { + "epoch": 8.132928847214561, + "grad_norm": 8.403947049373528e-07, + "learning_rate": 5.125580841290062e-06, + "loss": 0.0, + "num_input_tokens_seen": 17976968, + "step": 29490 + }, + { + "epoch": 8.13430777716492, + "grad_norm": 6.40031259990792e-07, + "learning_rate": 5.1182831830174305e-06, + "loss": 0.0, + "num_input_tokens_seen": 17979720, + "step": 29495 + }, + { + "epoch": 8.135686707115278, + "grad_norm": 2.859122287190985e-05, + "learning_rate": 5.110990131053195e-06, + "loss": 0.0, + "num_input_tokens_seen": 17982344, + "step": 29500 + }, + { + "epoch": 8.137065637065637, + "grad_norm": 1.4120181504040374e-06, + "learning_rate": 5.103701687087059e-06, + "loss": 0.0, + "num_input_tokens_seen": 17987176, + "step": 29505 + }, + { + "epoch": 8.138444567015995, + "grad_norm": 8.019583219720516e-07, + "learning_rate": 5.096417852807633e-06, + "loss": 0.0, + "num_input_tokens_seen": 17989960, + "step": 29510 + }, + { + "epoch": 8.139823496966354, + "grad_norm": 1.081005166270188e-06, + "learning_rate": 5.089138629902493e-06, + "loss": 0.0, + "num_input_tokens_seen": 17992392, + "step": 29515 + }, + { + "epoch": 8.141202426916713, + "grad_norm": 1.0402986845292617e-05, + "learning_rate": 5.081864020058125e-06, + "loss": 0.0, + "num_input_tokens_seen": 17994632, + "step": 29520 + }, + { + "epoch": 8.14258135686707, + "grad_norm": 8.560833180126792e-07, + "learning_rate": 5.074594024959944e-06, + "loss": 0.0, + "num_input_tokens_seen": 17997672, + "step": 29525 + }, + { + "epoch": 8.14396028681743, + "grad_norm": 5.578099808190018e-05, + "learning_rate": 5.067328646292316e-06, + "loss": 0.0, + "num_input_tokens_seen": 18000264, + "step": 29530 + }, + { + "epoch": 8.145339216767788, + "grad_norm": 7.813955562596675e-07, + "learning_rate": 5.060067885738517e-06, + "loss": 0.0, + "num_input_tokens_seen": 18003816, + "step": 29535 + }, + { + "epoch": 8.146718146718147, + "grad_norm": 2.2613860437559197e-06, + "learning_rate": 5.052811744980757e-06, + "loss": 0.0, + "num_input_tokens_seen": 18006344, + "step": 29540 + }, + { + "epoch": 8.148097076668506, + "grad_norm": 2.7406701974541647e-06, + "learning_rate": 5.045560225700191e-06, + "loss": 0.0, + "num_input_tokens_seen": 18008872, + "step": 29545 + }, + { + "epoch": 8.149476006618864, + "grad_norm": 0.0002774492313619703, + "learning_rate": 5.038313329576888e-06, + "loss": 0.0, + "num_input_tokens_seen": 18011528, + "step": 29550 + }, + { + "epoch": 8.150854936569223, + "grad_norm": 7.624861382282688e-07, + "learning_rate": 5.031071058289841e-06, + "loss": 0.0, + "num_input_tokens_seen": 18014888, + "step": 29555 + }, + { + "epoch": 8.15223386651958, + "grad_norm": 1.0573305644356878e-06, + "learning_rate": 5.0238334135169875e-06, + "loss": 0.0, + "num_input_tokens_seen": 18017992, + "step": 29560 + }, + { + "epoch": 8.15361279646994, + "grad_norm": 1.3159844911569962e-06, + "learning_rate": 5.016600396935192e-06, + "loss": 0.0, + "num_input_tokens_seen": 18020808, + "step": 29565 + }, + { + "epoch": 8.154991726420297, + "grad_norm": 5.656946200360835e-07, + "learning_rate": 5.009372010220228e-06, + "loss": 0.0, + "num_input_tokens_seen": 18023400, + "step": 29570 + }, + { + "epoch": 8.156370656370656, + "grad_norm": 1.1881739965247107e-06, + "learning_rate": 5.002148255046821e-06, + "loss": 0.0, + "num_input_tokens_seen": 18026088, + "step": 29575 + }, + { + "epoch": 8.157749586321016, + "grad_norm": 6.057387054170249e-07, + "learning_rate": 4.9949291330886e-06, + "loss": 0.0, + "num_input_tokens_seen": 18029448, + "step": 29580 + }, + { + "epoch": 8.159128516271373, + "grad_norm": 1.65654773809365e-06, + "learning_rate": 4.9877146460181465e-06, + "loss": 0.0, + "num_input_tokens_seen": 18033384, + "step": 29585 + }, + { + "epoch": 8.160507446221732, + "grad_norm": 6.3278885136242025e-06, + "learning_rate": 4.9805047955069405e-06, + "loss": 0.0, + "num_input_tokens_seen": 18037192, + "step": 29590 + }, + { + "epoch": 8.16188637617209, + "grad_norm": 6.226504183359793e-07, + "learning_rate": 4.973299583225402e-06, + "loss": 0.0, + "num_input_tokens_seen": 18040520, + "step": 29595 + }, + { + "epoch": 8.16326530612245, + "grad_norm": 8.889142009138595e-07, + "learning_rate": 4.966099010842881e-06, + "loss": 0.0, + "num_input_tokens_seen": 18043368, + "step": 29600 + }, + { + "epoch": 8.164644236072807, + "grad_norm": 0.00020292034605517983, + "learning_rate": 4.958903080027641e-06, + "loss": 0.0, + "num_input_tokens_seen": 18047080, + "step": 29605 + }, + { + "epoch": 8.166023166023166, + "grad_norm": 9.964663831851794e-07, + "learning_rate": 4.951711792446886e-06, + "loss": 0.0, + "num_input_tokens_seen": 18050568, + "step": 29610 + }, + { + "epoch": 8.167402095973525, + "grad_norm": 9.694003892946057e-07, + "learning_rate": 4.9445251497667235e-06, + "loss": 0.0, + "num_input_tokens_seen": 18053192, + "step": 29615 + }, + { + "epoch": 8.168781025923883, + "grad_norm": 1.0813156450240058e-06, + "learning_rate": 4.937343153652196e-06, + "loss": 0.0, + "num_input_tokens_seen": 18055912, + "step": 29620 + }, + { + "epoch": 8.170159955874242, + "grad_norm": 2.7666285404848168e-06, + "learning_rate": 4.930165805767278e-06, + "loss": 0.0, + "num_input_tokens_seen": 18058824, + "step": 29625 + }, + { + "epoch": 8.1715388858246, + "grad_norm": 0.00015798929962329566, + "learning_rate": 4.922993107774845e-06, + "loss": 0.0, + "num_input_tokens_seen": 18063144, + "step": 29630 + }, + { + "epoch": 8.172917815774959, + "grad_norm": 2.4790231236693216e-06, + "learning_rate": 4.915825061336721e-06, + "loss": 0.0, + "num_input_tokens_seen": 18066728, + "step": 29635 + }, + { + "epoch": 8.174296745725318, + "grad_norm": 9.005213428281422e-07, + "learning_rate": 4.908661668113626e-06, + "loss": 0.0, + "num_input_tokens_seen": 18069288, + "step": 29640 + }, + { + "epoch": 8.175675675675675, + "grad_norm": 0.00021410042245406657, + "learning_rate": 4.901502929765231e-06, + "loss": 0.0, + "num_input_tokens_seen": 18074344, + "step": 29645 + }, + { + "epoch": 8.177054605626035, + "grad_norm": 1.0976025350828422e-06, + "learning_rate": 4.8943488479501005e-06, + "loss": 0.0, + "num_input_tokens_seen": 18077192, + "step": 29650 + }, + { + "epoch": 8.178433535576392, + "grad_norm": 2.390607505731168e-06, + "learning_rate": 4.887199424325739e-06, + "loss": 0.0, + "num_input_tokens_seen": 18080008, + "step": 29655 + }, + { + "epoch": 8.179812465526751, + "grad_norm": 6.428853680517932e-07, + "learning_rate": 4.880054660548561e-06, + "loss": 0.0, + "num_input_tokens_seen": 18082408, + "step": 29660 + }, + { + "epoch": 8.181191395477109, + "grad_norm": 1.4277707123255823e-06, + "learning_rate": 4.872914558273911e-06, + "loss": 0.0, + "num_input_tokens_seen": 18084904, + "step": 29665 + }, + { + "epoch": 8.182570325427468, + "grad_norm": 7.848726113479643e-07, + "learning_rate": 4.865779119156047e-06, + "loss": 0.0, + "num_input_tokens_seen": 18088104, + "step": 29670 + }, + { + "epoch": 8.183949255377827, + "grad_norm": 0.00011219275620533153, + "learning_rate": 4.8586483448481404e-06, + "loss": 0.0, + "num_input_tokens_seen": 18091240, + "step": 29675 + }, + { + "epoch": 8.185328185328185, + "grad_norm": 1.5478358363907319e-06, + "learning_rate": 4.851522237002298e-06, + "loss": 0.0, + "num_input_tokens_seen": 18095528, + "step": 29680 + }, + { + "epoch": 8.186707115278544, + "grad_norm": 7.322194051084807e-07, + "learning_rate": 4.8444007972695386e-06, + "loss": 0.0, + "num_input_tokens_seen": 18099208, + "step": 29685 + }, + { + "epoch": 8.188086045228902, + "grad_norm": 7.69365669839317e-07, + "learning_rate": 4.837284027299782e-06, + "loss": 0.0, + "num_input_tokens_seen": 18102120, + "step": 29690 + }, + { + "epoch": 8.189464975179261, + "grad_norm": 4.806053766515106e-05, + "learning_rate": 4.8301719287419e-06, + "loss": 0.0, + "num_input_tokens_seen": 18105576, + "step": 29695 + }, + { + "epoch": 8.19084390512962, + "grad_norm": 1.2773429034496075e-06, + "learning_rate": 4.823064503243649e-06, + "loss": 0.0, + "num_input_tokens_seen": 18108424, + "step": 29700 + }, + { + "epoch": 8.192222835079978, + "grad_norm": 6.345478595903842e-07, + "learning_rate": 4.8159617524517255e-06, + "loss": 0.0, + "num_input_tokens_seen": 18110792, + "step": 29705 + }, + { + "epoch": 8.193601765030337, + "grad_norm": 0.0002666898653842509, + "learning_rate": 4.808863678011738e-06, + "loss": 0.0, + "num_input_tokens_seen": 18113032, + "step": 29710 + }, + { + "epoch": 8.194980694980694, + "grad_norm": 4.911378709948622e-06, + "learning_rate": 4.8017702815681994e-06, + "loss": 0.0, + "num_input_tokens_seen": 18116424, + "step": 29715 + }, + { + "epoch": 8.196359624931054, + "grad_norm": 3.5215048228565138e-06, + "learning_rate": 4.794681564764558e-06, + "loss": 0.0, + "num_input_tokens_seen": 18118696, + "step": 29720 + }, + { + "epoch": 8.197738554881411, + "grad_norm": 5.2878954193147365e-06, + "learning_rate": 4.787597529243165e-06, + "loss": 0.0, + "num_input_tokens_seen": 18122120, + "step": 29725 + }, + { + "epoch": 8.19911748483177, + "grad_norm": 0.0014642456080764532, + "learning_rate": 4.7805181766452766e-06, + "loss": 0.0, + "num_input_tokens_seen": 18124456, + "step": 29730 + }, + { + "epoch": 8.20049641478213, + "grad_norm": 6.671688424830791e-06, + "learning_rate": 4.773443508611098e-06, + "loss": 0.0, + "num_input_tokens_seen": 18127912, + "step": 29735 + }, + { + "epoch": 8.201875344732487, + "grad_norm": 1.272053737011447e-06, + "learning_rate": 4.766373526779716e-06, + "loss": 0.0, + "num_input_tokens_seen": 18131272, + "step": 29740 + }, + { + "epoch": 8.203254274682847, + "grad_norm": 7.603082963214547e-07, + "learning_rate": 4.759308232789139e-06, + "loss": 0.0, + "num_input_tokens_seen": 18134504, + "step": 29745 + }, + { + "epoch": 8.204633204633204, + "grad_norm": 1.0107177104146103e-06, + "learning_rate": 4.75224762827631e-06, + "loss": 0.0, + "num_input_tokens_seen": 18137192, + "step": 29750 + }, + { + "epoch": 8.206012134583563, + "grad_norm": 1.222813079948537e-05, + "learning_rate": 4.745191714877054e-06, + "loss": 0.0, + "num_input_tokens_seen": 18140776, + "step": 29755 + }, + { + "epoch": 8.207391064533923, + "grad_norm": 0.0006260817172005773, + "learning_rate": 4.738140494226134e-06, + "loss": 0.0, + "num_input_tokens_seen": 18143496, + "step": 29760 + }, + { + "epoch": 8.20876999448428, + "grad_norm": 0.0009447151678614318, + "learning_rate": 4.731093967957218e-06, + "loss": 0.0, + "num_input_tokens_seen": 18145992, + "step": 29765 + }, + { + "epoch": 8.21014892443464, + "grad_norm": 2.5542849471094087e-06, + "learning_rate": 4.724052137702875e-06, + "loss": 0.0, + "num_input_tokens_seen": 18147944, + "step": 29770 + }, + { + "epoch": 8.211527854384997, + "grad_norm": 5.641400093736593e-06, + "learning_rate": 4.717015005094599e-06, + "loss": 0.0, + "num_input_tokens_seen": 18150472, + "step": 29775 + }, + { + "epoch": 8.212906784335356, + "grad_norm": 6.011034656694392e-06, + "learning_rate": 4.709982571762805e-06, + "loss": 0.0, + "num_input_tokens_seen": 18153288, + "step": 29780 + }, + { + "epoch": 8.214285714285714, + "grad_norm": 1.0137770004803315e-05, + "learning_rate": 4.70295483933679e-06, + "loss": 0.0, + "num_input_tokens_seen": 18155976, + "step": 29785 + }, + { + "epoch": 8.215664644236073, + "grad_norm": 6.48557033855468e-07, + "learning_rate": 4.6959318094447955e-06, + "loss": 0.0, + "num_input_tokens_seen": 18158984, + "step": 29790 + }, + { + "epoch": 8.217043574186432, + "grad_norm": 1.8239293240185361e-06, + "learning_rate": 4.688913483713939e-06, + "loss": 0.0, + "num_input_tokens_seen": 18161448, + "step": 29795 + }, + { + "epoch": 8.21842250413679, + "grad_norm": 1.1407455531298183e-06, + "learning_rate": 4.681899863770278e-06, + "loss": 0.0, + "num_input_tokens_seen": 18165096, + "step": 29800 + }, + { + "epoch": 8.219801434087149, + "grad_norm": 1.4731523378941347e-06, + "learning_rate": 4.67489095123877e-06, + "loss": 0.0, + "num_input_tokens_seen": 18168616, + "step": 29805 + }, + { + "epoch": 8.221180364037506, + "grad_norm": 6.901300821482437e-06, + "learning_rate": 4.6678867477432605e-06, + "loss": 0.0, + "num_input_tokens_seen": 18172584, + "step": 29810 + }, + { + "epoch": 8.222559293987866, + "grad_norm": 1.1711343859133194e-06, + "learning_rate": 4.660887254906548e-06, + "loss": 0.0, + "num_input_tokens_seen": 18176104, + "step": 29815 + }, + { + "epoch": 8.223938223938223, + "grad_norm": 1.3382377801463008e-06, + "learning_rate": 4.653892474350299e-06, + "loss": 0.0, + "num_input_tokens_seen": 18179176, + "step": 29820 + }, + { + "epoch": 8.225317153888582, + "grad_norm": 1.402275483997073e-05, + "learning_rate": 4.646902407695103e-06, + "loss": 0.0, + "num_input_tokens_seen": 18183240, + "step": 29825 + }, + { + "epoch": 8.226696083838942, + "grad_norm": 1.3065556458968786e-06, + "learning_rate": 4.639917056560467e-06, + "loss": 0.0, + "num_input_tokens_seen": 18187016, + "step": 29830 + }, + { + "epoch": 8.228075013789299, + "grad_norm": 7.32383853119245e-07, + "learning_rate": 4.632936422564785e-06, + "loss": 0.0, + "num_input_tokens_seen": 18189832, + "step": 29835 + }, + { + "epoch": 8.229453943739658, + "grad_norm": 7.195003036031267e-06, + "learning_rate": 4.625960507325383e-06, + "loss": 0.0, + "num_input_tokens_seen": 18192232, + "step": 29840 + }, + { + "epoch": 8.230832873690016, + "grad_norm": 1.4808023252044222e-06, + "learning_rate": 4.618989312458469e-06, + "loss": 0.0, + "num_input_tokens_seen": 18194696, + "step": 29845 + }, + { + "epoch": 8.232211803640375, + "grad_norm": 7.126628247533517e-07, + "learning_rate": 4.612022839579172e-06, + "loss": 0.0, + "num_input_tokens_seen": 18197864, + "step": 29850 + }, + { + "epoch": 8.233590733590734, + "grad_norm": 9.497439350525383e-07, + "learning_rate": 4.605061090301532e-06, + "loss": 0.0, + "num_input_tokens_seen": 18200712, + "step": 29855 + }, + { + "epoch": 8.234969663541092, + "grad_norm": 1.0916432984231506e-06, + "learning_rate": 4.5981040662384815e-06, + "loss": 0.0, + "num_input_tokens_seen": 18204488, + "step": 29860 + }, + { + "epoch": 8.236348593491451, + "grad_norm": 3.84075201509404e-06, + "learning_rate": 4.591151769001853e-06, + "loss": 0.0, + "num_input_tokens_seen": 18207432, + "step": 29865 + }, + { + "epoch": 8.237727523441809, + "grad_norm": 7.205507017715718e-07, + "learning_rate": 4.5842042002024105e-06, + "loss": 0.0, + "num_input_tokens_seen": 18211528, + "step": 29870 + }, + { + "epoch": 8.239106453392168, + "grad_norm": 2.3066177163855173e-06, + "learning_rate": 4.5772613614497975e-06, + "loss": 0.0, + "num_input_tokens_seen": 18213896, + "step": 29875 + }, + { + "epoch": 8.240485383342525, + "grad_norm": 9.950057346941321e-07, + "learning_rate": 4.570323254352562e-06, + "loss": 0.0, + "num_input_tokens_seen": 18216456, + "step": 29880 + }, + { + "epoch": 8.241864313292885, + "grad_norm": 1.210884761349007e-06, + "learning_rate": 4.563389880518182e-06, + "loss": 0.0, + "num_input_tokens_seen": 18220808, + "step": 29885 + }, + { + "epoch": 8.243243243243244, + "grad_norm": 7.462384701284464e-07, + "learning_rate": 4.556461241553003e-06, + "loss": 0.0, + "num_input_tokens_seen": 18223752, + "step": 29890 + }, + { + "epoch": 8.244622173193601, + "grad_norm": 5.7695388022693805e-06, + "learning_rate": 4.549537339062307e-06, + "loss": 0.0, + "num_input_tokens_seen": 18226312, + "step": 29895 + }, + { + "epoch": 8.24600110314396, + "grad_norm": 0.00018018324044533074, + "learning_rate": 4.542618174650254e-06, + "loss": 0.0, + "num_input_tokens_seen": 18229320, + "step": 29900 + }, + { + "epoch": 8.247380033094318, + "grad_norm": 1.0506519174668938e-06, + "learning_rate": 4.535703749919909e-06, + "loss": 0.0, + "num_input_tokens_seen": 18232264, + "step": 29905 + }, + { + "epoch": 8.248758963044677, + "grad_norm": 1.166090169135714e-05, + "learning_rate": 4.528794066473258e-06, + "loss": 0.0, + "num_input_tokens_seen": 18236616, + "step": 29910 + }, + { + "epoch": 8.250137892995037, + "grad_norm": 1.723641958051303e-06, + "learning_rate": 4.521889125911164e-06, + "loss": 0.0, + "num_input_tokens_seen": 18239144, + "step": 29915 + }, + { + "epoch": 8.251516822945394, + "grad_norm": 5.187405349715846e-06, + "learning_rate": 4.514988929833408e-06, + "loss": 0.0, + "num_input_tokens_seen": 18242024, + "step": 29920 + }, + { + "epoch": 8.252895752895753, + "grad_norm": 1.1163253475388046e-05, + "learning_rate": 4.508093479838671e-06, + "loss": 0.0, + "num_input_tokens_seen": 18244520, + "step": 29925 + }, + { + "epoch": 8.25427468284611, + "grad_norm": 1.5556802281935234e-06, + "learning_rate": 4.50120277752453e-06, + "loss": 0.0, + "num_input_tokens_seen": 18247752, + "step": 29930 + }, + { + "epoch": 8.25565361279647, + "grad_norm": 2.3430925466527697e-06, + "learning_rate": 4.494316824487446e-06, + "loss": 0.0, + "num_input_tokens_seen": 18251272, + "step": 29935 + }, + { + "epoch": 8.257032542746828, + "grad_norm": 1.1625160141193192e-06, + "learning_rate": 4.487435622322814e-06, + "loss": 0.0, + "num_input_tokens_seen": 18254312, + "step": 29940 + }, + { + "epoch": 8.258411472697187, + "grad_norm": 9.942480119207175e-07, + "learning_rate": 4.480559172624898e-06, + "loss": 0.0, + "num_input_tokens_seen": 18257096, + "step": 29945 + }, + { + "epoch": 8.259790402647546, + "grad_norm": 1.1179349712620024e-06, + "learning_rate": 4.473687476986882e-06, + "loss": 0.0, + "num_input_tokens_seen": 18260168, + "step": 29950 + }, + { + "epoch": 8.261169332597904, + "grad_norm": 1.3226691635281895e-06, + "learning_rate": 4.466820537000835e-06, + "loss": 0.0, + "num_input_tokens_seen": 18263560, + "step": 29955 + }, + { + "epoch": 8.262548262548263, + "grad_norm": 7.910752515272179e-07, + "learning_rate": 4.459958354257721e-06, + "loss": 0.0, + "num_input_tokens_seen": 18266824, + "step": 29960 + }, + { + "epoch": 8.26392719249862, + "grad_norm": 1.4313982319436036e-06, + "learning_rate": 4.45310093034742e-06, + "loss": 0.0, + "num_input_tokens_seen": 18269672, + "step": 29965 + }, + { + "epoch": 8.26530612244898, + "grad_norm": 8.537412554687762e-07, + "learning_rate": 4.446248266858696e-06, + "loss": 0.0, + "num_input_tokens_seen": 18272712, + "step": 29970 + }, + { + "epoch": 8.266685052399339, + "grad_norm": 6.0838401623186655e-06, + "learning_rate": 4.439400365379206e-06, + "loss": 0.0, + "num_input_tokens_seen": 18275528, + "step": 29975 + }, + { + "epoch": 8.268063982349696, + "grad_norm": 1.0005079502661829e-06, + "learning_rate": 4.432557227495515e-06, + "loss": 0.0, + "num_input_tokens_seen": 18278664, + "step": 29980 + }, + { + "epoch": 8.269442912300056, + "grad_norm": 4.880092546954984e-06, + "learning_rate": 4.425718854793087e-06, + "loss": 0.0, + "num_input_tokens_seen": 18282664, + "step": 29985 + }, + { + "epoch": 8.270821842250413, + "grad_norm": 6.375441330419562e-07, + "learning_rate": 4.418885248856264e-06, + "loss": 0.0, + "num_input_tokens_seen": 18286664, + "step": 29990 + }, + { + "epoch": 8.272200772200772, + "grad_norm": 6.097121513448656e-06, + "learning_rate": 4.412056411268304e-06, + "loss": 0.0, + "num_input_tokens_seen": 18288968, + "step": 29995 + }, + { + "epoch": 8.27357970215113, + "grad_norm": 2.2644412638328504e-06, + "learning_rate": 4.4052323436113374e-06, + "loss": 0.0, + "num_input_tokens_seen": 18291400, + "step": 30000 + }, + { + "epoch": 8.27495863210149, + "grad_norm": 1.0207716059085215e-06, + "learning_rate": 4.398413047466418e-06, + "loss": 0.0, + "num_input_tokens_seen": 18295560, + "step": 30005 + }, + { + "epoch": 8.276337562051848, + "grad_norm": 8.86665475263726e-07, + "learning_rate": 4.391598524413473e-06, + "loss": 0.0, + "num_input_tokens_seen": 18298408, + "step": 30010 + }, + { + "epoch": 8.277716492002206, + "grad_norm": 6.446682527894154e-05, + "learning_rate": 4.384788776031321e-06, + "loss": 0.0, + "num_input_tokens_seen": 18301160, + "step": 30015 + }, + { + "epoch": 8.279095421952565, + "grad_norm": 3.0776955100009218e-06, + "learning_rate": 4.3779838038976974e-06, + "loss": 0.0, + "num_input_tokens_seen": 18303784, + "step": 30020 + }, + { + "epoch": 8.280474351902923, + "grad_norm": 1.4667699588244432e-06, + "learning_rate": 4.371183609589208e-06, + "loss": 0.0, + "num_input_tokens_seen": 18306376, + "step": 30025 + }, + { + "epoch": 8.281853281853282, + "grad_norm": 8.064220651249343e-07, + "learning_rate": 4.364388194681357e-06, + "loss": 0.0, + "num_input_tokens_seen": 18310248, + "step": 30030 + }, + { + "epoch": 8.283232211803641, + "grad_norm": 6.231321663108247e-07, + "learning_rate": 4.357597560748553e-06, + "loss": 0.0, + "num_input_tokens_seen": 18313256, + "step": 30035 + }, + { + "epoch": 8.284611141753999, + "grad_norm": 8.806047162579489e-07, + "learning_rate": 4.35081170936408e-06, + "loss": 0.0, + "num_input_tokens_seen": 18315720, + "step": 30040 + }, + { + "epoch": 8.285990071704358, + "grad_norm": 4.789055196852132e-07, + "learning_rate": 4.344030642100133e-06, + "loss": 0.0, + "num_input_tokens_seen": 18318984, + "step": 30045 + }, + { + "epoch": 8.287369001654715, + "grad_norm": 7.750086297164671e-07, + "learning_rate": 4.337254360527773e-06, + "loss": 0.0, + "num_input_tokens_seen": 18322472, + "step": 30050 + }, + { + "epoch": 8.288747931605075, + "grad_norm": 3.080251917708665e-05, + "learning_rate": 4.330482866216984e-06, + "loss": 0.0, + "num_input_tokens_seen": 18325736, + "step": 30055 + }, + { + "epoch": 8.290126861555432, + "grad_norm": 2.606937960081268e-05, + "learning_rate": 4.32371616073661e-06, + "loss": 0.0, + "num_input_tokens_seen": 18328168, + "step": 30060 + }, + { + "epoch": 8.291505791505791, + "grad_norm": 6.833168981756899e-07, + "learning_rate": 4.316954245654411e-06, + "loss": 0.0, + "num_input_tokens_seen": 18331368, + "step": 30065 + }, + { + "epoch": 8.29288472145615, + "grad_norm": 1.348521550426085e-06, + "learning_rate": 4.310197122537016e-06, + "loss": 0.0, + "num_input_tokens_seen": 18333704, + "step": 30070 + }, + { + "epoch": 8.294263651406508, + "grad_norm": 1.6547226096008671e-06, + "learning_rate": 4.303444792949962e-06, + "loss": 0.0, + "num_input_tokens_seen": 18337032, + "step": 30075 + }, + { + "epoch": 8.295642581356867, + "grad_norm": 7.44888495773921e-07, + "learning_rate": 4.296697258457663e-06, + "loss": 0.0, + "num_input_tokens_seen": 18340040, + "step": 30080 + }, + { + "epoch": 8.297021511307225, + "grad_norm": 1.537798198114615e-06, + "learning_rate": 4.2899545206234186e-06, + "loss": 0.0, + "num_input_tokens_seen": 18343272, + "step": 30085 + }, + { + "epoch": 8.298400441257584, + "grad_norm": 2.10182824957883e-05, + "learning_rate": 4.283216581009436e-06, + "loss": 0.0, + "num_input_tokens_seen": 18346344, + "step": 30090 + }, + { + "epoch": 8.299779371207944, + "grad_norm": 6.378632519954408e-07, + "learning_rate": 4.27648344117679e-06, + "loss": 0.0, + "num_input_tokens_seen": 18349096, + "step": 30095 + }, + { + "epoch": 8.301158301158301, + "grad_norm": 1.0567817980700056e-06, + "learning_rate": 4.269755102685461e-06, + "loss": 0.0, + "num_input_tokens_seen": 18352392, + "step": 30100 + }, + { + "epoch": 8.30253723110866, + "grad_norm": 1.9012754819414113e-06, + "learning_rate": 4.263031567094306e-06, + "loss": 0.0, + "num_input_tokens_seen": 18354856, + "step": 30105 + }, + { + "epoch": 8.303916161059018, + "grad_norm": 7.0851256168680266e-06, + "learning_rate": 4.2563128359610625e-06, + "loss": 0.0, + "num_input_tokens_seen": 18357288, + "step": 30110 + }, + { + "epoch": 8.305295091009377, + "grad_norm": 6.713885341014247e-07, + "learning_rate": 4.249598910842378e-06, + "loss": 0.0, + "num_input_tokens_seen": 18359816, + "step": 30115 + }, + { + "epoch": 8.306674020959735, + "grad_norm": 4.859543878410477e-06, + "learning_rate": 4.242889793293758e-06, + "loss": 0.0, + "num_input_tokens_seen": 18362248, + "step": 30120 + }, + { + "epoch": 8.308052950910094, + "grad_norm": 1.4224142432794906e-06, + "learning_rate": 4.236185484869618e-06, + "loss": 0.0, + "num_input_tokens_seen": 18364712, + "step": 30125 + }, + { + "epoch": 8.309431880860453, + "grad_norm": 6.625753599109885e-07, + "learning_rate": 4.229485987123255e-06, + "loss": 0.0, + "num_input_tokens_seen": 18367656, + "step": 30130 + }, + { + "epoch": 8.31081081081081, + "grad_norm": 2.8606609703274444e-06, + "learning_rate": 4.222791301606835e-06, + "loss": 0.0, + "num_input_tokens_seen": 18370472, + "step": 30135 + }, + { + "epoch": 8.31218974076117, + "grad_norm": 5.216506451688474e-06, + "learning_rate": 4.21610142987143e-06, + "loss": 0.0, + "num_input_tokens_seen": 18373928, + "step": 30140 + }, + { + "epoch": 8.313568670711527, + "grad_norm": 1.1738682587747462e-05, + "learning_rate": 4.209416373466982e-06, + "loss": 0.0, + "num_input_tokens_seen": 18376680, + "step": 30145 + }, + { + "epoch": 8.314947600661887, + "grad_norm": 2.620610075609875e-06, + "learning_rate": 4.20273613394232e-06, + "loss": 0.0, + "num_input_tokens_seen": 18379752, + "step": 30150 + }, + { + "epoch": 8.316326530612244, + "grad_norm": 3.987488071288681e-06, + "learning_rate": 4.196060712845168e-06, + "loss": 0.0, + "num_input_tokens_seen": 18382216, + "step": 30155 + }, + { + "epoch": 8.317705460562603, + "grad_norm": 5.559956548495393e-07, + "learning_rate": 4.189390111722119e-06, + "loss": 0.0, + "num_input_tokens_seen": 18384680, + "step": 30160 + }, + { + "epoch": 8.319084390512963, + "grad_norm": 3.144148286082782e-05, + "learning_rate": 4.182724332118651e-06, + "loss": 0.0, + "num_input_tokens_seen": 18387752, + "step": 30165 + }, + { + "epoch": 8.32046332046332, + "grad_norm": 3.536359599820571e-06, + "learning_rate": 4.176063375579142e-06, + "loss": 0.0, + "num_input_tokens_seen": 18391304, + "step": 30170 + }, + { + "epoch": 8.32184225041368, + "grad_norm": 5.236320248513948e-07, + "learning_rate": 4.169407243646836e-06, + "loss": 0.0, + "num_input_tokens_seen": 18394632, + "step": 30175 + }, + { + "epoch": 8.323221180364037, + "grad_norm": 5.819743455504067e-05, + "learning_rate": 4.162755937863852e-06, + "loss": 0.0, + "num_input_tokens_seen": 18398600, + "step": 30180 + }, + { + "epoch": 8.324600110314396, + "grad_norm": 0.00013000697072129697, + "learning_rate": 4.1561094597712155e-06, + "loss": 0.0, + "num_input_tokens_seen": 18402376, + "step": 30185 + }, + { + "epoch": 8.325979040264755, + "grad_norm": 0.00034534596488811076, + "learning_rate": 4.149467810908811e-06, + "loss": 0.0, + "num_input_tokens_seen": 18405864, + "step": 30190 + }, + { + "epoch": 8.327357970215113, + "grad_norm": 8.550188681510917e-07, + "learning_rate": 4.142830992815416e-06, + "loss": 0.0, + "num_input_tokens_seen": 18408008, + "step": 30195 + }, + { + "epoch": 8.328736900165472, + "grad_norm": 0.00535447197034955, + "learning_rate": 4.136199007028696e-06, + "loss": 0.0, + "num_input_tokens_seen": 18411528, + "step": 30200 + }, + { + "epoch": 8.33011583011583, + "grad_norm": 1.3499292435881216e-05, + "learning_rate": 4.129571855085171e-06, + "loss": 0.0, + "num_input_tokens_seen": 18414248, + "step": 30205 + }, + { + "epoch": 8.331494760066189, + "grad_norm": 8.259712558356114e-07, + "learning_rate": 4.122949538520268e-06, + "loss": 0.0, + "num_input_tokens_seen": 18417416, + "step": 30210 + }, + { + "epoch": 8.332873690016546, + "grad_norm": 7.949819860186835e-07, + "learning_rate": 4.11633205886828e-06, + "loss": 0.0, + "num_input_tokens_seen": 18419976, + "step": 30215 + }, + { + "epoch": 8.334252619966906, + "grad_norm": 1.093212267733179e-05, + "learning_rate": 4.1097194176623775e-06, + "loss": 0.0, + "num_input_tokens_seen": 18423144, + "step": 30220 + }, + { + "epoch": 8.335631549917265, + "grad_norm": 1.2668821227634908e-06, + "learning_rate": 4.1031116164346214e-06, + "loss": 0.0, + "num_input_tokens_seen": 18426984, + "step": 30225 + }, + { + "epoch": 8.337010479867622, + "grad_norm": 6.9823490775888786e-06, + "learning_rate": 4.096508656715936e-06, + "loss": 0.0, + "num_input_tokens_seen": 18429512, + "step": 30230 + }, + { + "epoch": 8.338389409817982, + "grad_norm": 8.132217885759019e-07, + "learning_rate": 4.089910540036143e-06, + "loss": 0.0, + "num_input_tokens_seen": 18432840, + "step": 30235 + }, + { + "epoch": 8.339768339768339, + "grad_norm": 1.541662300041935e-06, + "learning_rate": 4.083317267923925e-06, + "loss": 0.0, + "num_input_tokens_seen": 18434952, + "step": 30240 + }, + { + "epoch": 8.341147269718698, + "grad_norm": 9.193771575155552e-07, + "learning_rate": 4.076728841906846e-06, + "loss": 0.0, + "num_input_tokens_seen": 18437320, + "step": 30245 + }, + { + "epoch": 8.342526199669058, + "grad_norm": 6.719125167364837e-07, + "learning_rate": 4.070145263511358e-06, + "loss": 0.0, + "num_input_tokens_seen": 18440232, + "step": 30250 + }, + { + "epoch": 8.343905129619415, + "grad_norm": 1.324684376413643e-06, + "learning_rate": 4.063566534262769e-06, + "loss": 0.0, + "num_input_tokens_seen": 18442792, + "step": 30255 + }, + { + "epoch": 8.345284059569774, + "grad_norm": 1.9219598925701575e-06, + "learning_rate": 4.056992655685287e-06, + "loss": 0.0, + "num_input_tokens_seen": 18445160, + "step": 30260 + }, + { + "epoch": 8.346662989520132, + "grad_norm": 0.0011585626052692533, + "learning_rate": 4.05042362930198e-06, + "loss": 0.0, + "num_input_tokens_seen": 18448552, + "step": 30265 + }, + { + "epoch": 8.348041919470491, + "grad_norm": 7.830186063984002e-07, + "learning_rate": 4.0438594566348044e-06, + "loss": 0.0, + "num_input_tokens_seen": 18451592, + "step": 30270 + }, + { + "epoch": 8.349420849420849, + "grad_norm": 2.35986226471141e-06, + "learning_rate": 4.0373001392045755e-06, + "loss": 0.0, + "num_input_tokens_seen": 18454728, + "step": 30275 + }, + { + "epoch": 8.350799779371208, + "grad_norm": 5.020611934014596e-05, + "learning_rate": 4.0307456785309974e-06, + "loss": 0.0, + "num_input_tokens_seen": 18459816, + "step": 30280 + }, + { + "epoch": 8.352178709321567, + "grad_norm": 3.1141453291638754e-06, + "learning_rate": 4.024196076132644e-06, + "loss": 0.0, + "num_input_tokens_seen": 18462792, + "step": 30285 + }, + { + "epoch": 8.353557639271925, + "grad_norm": 6.590836960640445e-07, + "learning_rate": 4.017651333526967e-06, + "loss": 0.0, + "num_input_tokens_seen": 18466312, + "step": 30290 + }, + { + "epoch": 8.354936569222284, + "grad_norm": 5.452602636069059e-07, + "learning_rate": 4.011111452230287e-06, + "loss": 0.0, + "num_input_tokens_seen": 18469096, + "step": 30295 + }, + { + "epoch": 8.356315499172641, + "grad_norm": 7.052349246805534e-07, + "learning_rate": 4.004576433757792e-06, + "loss": 0.0, + "num_input_tokens_seen": 18472072, + "step": 30300 + }, + { + "epoch": 8.357694429123, + "grad_norm": 3.787701280089095e-05, + "learning_rate": 3.998046279623572e-06, + "loss": 0.0, + "num_input_tokens_seen": 18474408, + "step": 30305 + }, + { + "epoch": 8.35907335907336, + "grad_norm": 5.204406988923438e-05, + "learning_rate": 3.991520991340552e-06, + "loss": 0.0, + "num_input_tokens_seen": 18476584, + "step": 30310 + }, + { + "epoch": 8.360452289023717, + "grad_norm": 4.512047780735884e-06, + "learning_rate": 3.9850005704205494e-06, + "loss": 0.0, + "num_input_tokens_seen": 18478824, + "step": 30315 + }, + { + "epoch": 8.361831218974077, + "grad_norm": 6.502476139758073e-07, + "learning_rate": 3.978485018374265e-06, + "loss": 0.0, + "num_input_tokens_seen": 18482152, + "step": 30320 + }, + { + "epoch": 8.363210148924434, + "grad_norm": 6.923429509697598e-07, + "learning_rate": 3.971974336711243e-06, + "loss": 0.0, + "num_input_tokens_seen": 18484424, + "step": 30325 + }, + { + "epoch": 8.364589078874793, + "grad_norm": 0.0016010933322831988, + "learning_rate": 3.965468526939928e-06, + "loss": 0.0, + "num_input_tokens_seen": 18487304, + "step": 30330 + }, + { + "epoch": 8.365968008825151, + "grad_norm": 9.192907555188867e-07, + "learning_rate": 3.958967590567608e-06, + "loss": 0.0, + "num_input_tokens_seen": 18489864, + "step": 30335 + }, + { + "epoch": 8.36734693877551, + "grad_norm": 1.2518989933596458e-06, + "learning_rate": 3.9524715291004685e-06, + "loss": 0.0, + "num_input_tokens_seen": 18492648, + "step": 30340 + }, + { + "epoch": 8.36872586872587, + "grad_norm": 1.444500185243669e-06, + "learning_rate": 3.945980344043557e-06, + "loss": 0.0, + "num_input_tokens_seen": 18495144, + "step": 30345 + }, + { + "epoch": 8.370104798676227, + "grad_norm": 4.703417744167382e-06, + "learning_rate": 3.939494036900779e-06, + "loss": 0.0, + "num_input_tokens_seen": 18497928, + "step": 30350 + }, + { + "epoch": 8.371483728626586, + "grad_norm": 2.1533815015573055e-06, + "learning_rate": 3.933012609174919e-06, + "loss": 0.0, + "num_input_tokens_seen": 18500968, + "step": 30355 + }, + { + "epoch": 8.372862658576944, + "grad_norm": 6.940383627807023e-07, + "learning_rate": 3.926536062367636e-06, + "loss": 0.0, + "num_input_tokens_seen": 18504296, + "step": 30360 + }, + { + "epoch": 8.374241588527303, + "grad_norm": 6.801267318223836e-07, + "learning_rate": 3.920064397979451e-06, + "loss": 0.0, + "num_input_tokens_seen": 18506952, + "step": 30365 + }, + { + "epoch": 8.37562051847766, + "grad_norm": 7.684950560360448e-07, + "learning_rate": 3.913597617509748e-06, + "loss": 0.0, + "num_input_tokens_seen": 18509128, + "step": 30370 + }, + { + "epoch": 8.37699944842802, + "grad_norm": 6.436463877435017e-07, + "learning_rate": 3.907135722456801e-06, + "loss": 0.0, + "num_input_tokens_seen": 18511784, + "step": 30375 + }, + { + "epoch": 8.378378378378379, + "grad_norm": 2.014586243603844e-06, + "learning_rate": 3.900678714317724e-06, + "loss": 0.0, + "num_input_tokens_seen": 18514920, + "step": 30380 + }, + { + "epoch": 8.379757308328736, + "grad_norm": 1.155845552602841e-06, + "learning_rate": 3.8942265945885266e-06, + "loss": 0.0, + "num_input_tokens_seen": 18517416, + "step": 30385 + }, + { + "epoch": 8.381136238279096, + "grad_norm": 1.0067420816994854e-06, + "learning_rate": 3.887779364764066e-06, + "loss": 0.0, + "num_input_tokens_seen": 18519880, + "step": 30390 + }, + { + "epoch": 8.382515168229453, + "grad_norm": 1.4944557733542752e-06, + "learning_rate": 3.881337026338067e-06, + "loss": 0.0, + "num_input_tokens_seen": 18522312, + "step": 30395 + }, + { + "epoch": 8.383894098179812, + "grad_norm": 0.000670113367959857, + "learning_rate": 3.874899580803134e-06, + "loss": 0.0, + "num_input_tokens_seen": 18524648, + "step": 30400 + }, + { + "epoch": 8.385273028130172, + "grad_norm": 8.879317761056882e-07, + "learning_rate": 3.868467029650736e-06, + "loss": 0.0, + "num_input_tokens_seen": 18526920, + "step": 30405 + }, + { + "epoch": 8.38665195808053, + "grad_norm": 7.852326575630286e-07, + "learning_rate": 3.862039374371187e-06, + "loss": 0.0, + "num_input_tokens_seen": 18529960, + "step": 30410 + }, + { + "epoch": 8.388030888030888, + "grad_norm": 0.0015813878271728754, + "learning_rate": 3.8556166164537e-06, + "loss": 0.0, + "num_input_tokens_seen": 18532328, + "step": 30415 + }, + { + "epoch": 8.389409817981246, + "grad_norm": 8.078467317318427e-07, + "learning_rate": 3.849198757386327e-06, + "loss": 0.0, + "num_input_tokens_seen": 18535080, + "step": 30420 + }, + { + "epoch": 8.390788747931605, + "grad_norm": 5.142582608641533e-07, + "learning_rate": 3.84278579865599e-06, + "loss": 0.0, + "num_input_tokens_seen": 18537992, + "step": 30425 + }, + { + "epoch": 8.392167677881964, + "grad_norm": 5.962002660453436e-07, + "learning_rate": 3.83637774174849e-06, + "loss": 0.0, + "num_input_tokens_seen": 18540744, + "step": 30430 + }, + { + "epoch": 8.393546607832322, + "grad_norm": 2.9375644317042315e-06, + "learning_rate": 3.829974588148466e-06, + "loss": 0.0, + "num_input_tokens_seen": 18545064, + "step": 30435 + }, + { + "epoch": 8.394925537782681, + "grad_norm": 1.0456966265337542e-05, + "learning_rate": 3.823576339339454e-06, + "loss": 0.0, + "num_input_tokens_seen": 18548296, + "step": 30440 + }, + { + "epoch": 8.396304467733039, + "grad_norm": 2.85767578134255e-06, + "learning_rate": 3.817182996803831e-06, + "loss": 0.0, + "num_input_tokens_seen": 18551240, + "step": 30445 + }, + { + "epoch": 8.397683397683398, + "grad_norm": 3.838551947410451e-06, + "learning_rate": 3.8107945620228347e-06, + "loss": 0.0, + "num_input_tokens_seen": 18553448, + "step": 30450 + }, + { + "epoch": 8.399062327633755, + "grad_norm": 1.1148181329190265e-06, + "learning_rate": 3.8044110364765824e-06, + "loss": 0.0, + "num_input_tokens_seen": 18556200, + "step": 30455 + }, + { + "epoch": 8.400441257584115, + "grad_norm": 6.011707682773704e-07, + "learning_rate": 3.7980324216440462e-06, + "loss": 0.0, + "num_input_tokens_seen": 18559272, + "step": 30460 + }, + { + "epoch": 8.401820187534474, + "grad_norm": 7.30600447695906e-07, + "learning_rate": 3.7916587190030467e-06, + "loss": 0.0, + "num_input_tokens_seen": 18561928, + "step": 30465 + }, + { + "epoch": 8.403199117484832, + "grad_norm": 1.4906189562680083e-06, + "learning_rate": 3.7852899300302917e-06, + "loss": 0.0, + "num_input_tokens_seen": 18565096, + "step": 30470 + }, + { + "epoch": 8.40457804743519, + "grad_norm": 9.761922683537705e-07, + "learning_rate": 3.7789260562013317e-06, + "loss": 0.0, + "num_input_tokens_seen": 18567976, + "step": 30475 + }, + { + "epoch": 8.405956977385548, + "grad_norm": 8.281479608740483e-07, + "learning_rate": 3.7725670989905953e-06, + "loss": 0.0, + "num_input_tokens_seen": 18571016, + "step": 30480 + }, + { + "epoch": 8.407335907335908, + "grad_norm": 1.0437478294988978e-06, + "learning_rate": 3.766213059871357e-06, + "loss": 0.0, + "num_input_tokens_seen": 18573544, + "step": 30485 + }, + { + "epoch": 8.408714837286265, + "grad_norm": 1.3606718312075827e-06, + "learning_rate": 3.7598639403157445e-06, + "loss": 0.0, + "num_input_tokens_seen": 18576360, + "step": 30490 + }, + { + "epoch": 8.410093767236624, + "grad_norm": 6.622713613069209e-07, + "learning_rate": 3.7535197417947753e-06, + "loss": 0.0, + "num_input_tokens_seen": 18579688, + "step": 30495 + }, + { + "epoch": 8.411472697186984, + "grad_norm": 0.00010216973896604031, + "learning_rate": 3.7471804657783022e-06, + "loss": 0.0, + "num_input_tokens_seen": 18581992, + "step": 30500 + }, + { + "epoch": 8.412851627137341, + "grad_norm": 1.0546664270805195e-06, + "learning_rate": 3.7408461137350333e-06, + "loss": 0.0, + "num_input_tokens_seen": 18585640, + "step": 30505 + }, + { + "epoch": 8.4142305570877, + "grad_norm": 6.426967047445942e-07, + "learning_rate": 3.7345166871325666e-06, + "loss": 0.0, + "num_input_tokens_seen": 18588648, + "step": 30510 + }, + { + "epoch": 8.415609487038058, + "grad_norm": 2.562672079875483e-06, + "learning_rate": 3.7281921874373293e-06, + "loss": 0.0, + "num_input_tokens_seen": 18591528, + "step": 30515 + }, + { + "epoch": 8.416988416988417, + "grad_norm": 2.6983198040397838e-06, + "learning_rate": 3.7218726161146104e-06, + "loss": 0.0, + "num_input_tokens_seen": 18595752, + "step": 30520 + }, + { + "epoch": 8.418367346938776, + "grad_norm": 6.962916927477636e-07, + "learning_rate": 3.715557974628578e-06, + "loss": 0.0, + "num_input_tokens_seen": 18598376, + "step": 30525 + }, + { + "epoch": 8.419746276889134, + "grad_norm": 2.583090463303961e-06, + "learning_rate": 3.7092482644422306e-06, + "loss": 0.0, + "num_input_tokens_seen": 18601416, + "step": 30530 + }, + { + "epoch": 8.421125206839493, + "grad_norm": 6.757973665116879e-07, + "learning_rate": 3.702943487017449e-06, + "loss": 0.0, + "num_input_tokens_seen": 18605064, + "step": 30535 + }, + { + "epoch": 8.42250413678985, + "grad_norm": 2.0647958081099205e-06, + "learning_rate": 3.696643643814948e-06, + "loss": 0.0, + "num_input_tokens_seen": 18608424, + "step": 30540 + }, + { + "epoch": 8.42388306674021, + "grad_norm": 1.0639391803124454e-06, + "learning_rate": 3.6903487362943217e-06, + "loss": 0.0, + "num_input_tokens_seen": 18611432, + "step": 30545 + }, + { + "epoch": 8.425261996690567, + "grad_norm": 1.870885512289533e-06, + "learning_rate": 3.6840587659140085e-06, + "loss": 0.0, + "num_input_tokens_seen": 18614280, + "step": 30550 + }, + { + "epoch": 8.426640926640927, + "grad_norm": 4.623542554327287e-05, + "learning_rate": 3.677773734131301e-06, + "loss": 0.0, + "num_input_tokens_seen": 18617832, + "step": 30555 + }, + { + "epoch": 8.428019856591286, + "grad_norm": 3.303857511127717e-06, + "learning_rate": 3.671493642402346e-06, + "loss": 0.0, + "num_input_tokens_seen": 18620872, + "step": 30560 + }, + { + "epoch": 8.429398786541643, + "grad_norm": 1.6807365682325326e-05, + "learning_rate": 3.6652184921821598e-06, + "loss": 0.0, + "num_input_tokens_seen": 18624584, + "step": 30565 + }, + { + "epoch": 8.430777716492003, + "grad_norm": 7.039006959530525e-07, + "learning_rate": 3.658948284924596e-06, + "loss": 0.0, + "num_input_tokens_seen": 18626888, + "step": 30570 + }, + { + "epoch": 8.43215664644236, + "grad_norm": 5.821112267767603e-07, + "learning_rate": 3.6526830220823814e-06, + "loss": 0.0, + "num_input_tokens_seen": 18629320, + "step": 30575 + }, + { + "epoch": 8.43353557639272, + "grad_norm": 4.564380162719317e-07, + "learning_rate": 3.646422705107083e-06, + "loss": 0.0, + "num_input_tokens_seen": 18632360, + "step": 30580 + }, + { + "epoch": 8.434914506343079, + "grad_norm": 8.272109539575467e-07, + "learning_rate": 3.640167335449121e-06, + "loss": 0.0, + "num_input_tokens_seen": 18636104, + "step": 30585 + }, + { + "epoch": 8.436293436293436, + "grad_norm": 0.00038806171505711973, + "learning_rate": 3.6339169145577833e-06, + "loss": 0.0, + "num_input_tokens_seen": 18638376, + "step": 30590 + }, + { + "epoch": 8.437672366243795, + "grad_norm": 1.0138156767425244e-06, + "learning_rate": 3.6276714438811975e-06, + "loss": 0.0, + "num_input_tokens_seen": 18641832, + "step": 30595 + }, + { + "epoch": 8.439051296194153, + "grad_norm": 6.091550517339783e-07, + "learning_rate": 3.621430924866348e-06, + "loss": 0.0, + "num_input_tokens_seen": 18644680, + "step": 30600 + }, + { + "epoch": 8.440430226144512, + "grad_norm": 1.3285304021337652e-06, + "learning_rate": 3.6151953589590755e-06, + "loss": 0.0, + "num_input_tokens_seen": 18647912, + "step": 30605 + }, + { + "epoch": 8.44180915609487, + "grad_norm": 1.1978809197898954e-06, + "learning_rate": 3.6089647476040766e-06, + "loss": 0.0, + "num_input_tokens_seen": 18651496, + "step": 30610 + }, + { + "epoch": 8.443188086045229, + "grad_norm": 9.225349640473723e-05, + "learning_rate": 3.6027390922448833e-06, + "loss": 0.0, + "num_input_tokens_seen": 18654184, + "step": 30615 + }, + { + "epoch": 8.444567015995588, + "grad_norm": 0.0003184352826792747, + "learning_rate": 3.5965183943239056e-06, + "loss": 0.0, + "num_input_tokens_seen": 18656104, + "step": 30620 + }, + { + "epoch": 8.445945945945946, + "grad_norm": 8.637617270323972e-07, + "learning_rate": 3.590302655282374e-06, + "loss": 0.0, + "num_input_tokens_seen": 18660040, + "step": 30625 + }, + { + "epoch": 8.447324875896305, + "grad_norm": 1.6694770010872162e-06, + "learning_rate": 3.584091876560397e-06, + "loss": 0.0, + "num_input_tokens_seen": 18663272, + "step": 30630 + }, + { + "epoch": 8.448703805846662, + "grad_norm": 7.56317888317426e-07, + "learning_rate": 3.5778860595969215e-06, + "loss": 0.0, + "num_input_tokens_seen": 18665864, + "step": 30635 + }, + { + "epoch": 8.450082735797022, + "grad_norm": 3.4154825243604137e-06, + "learning_rate": 3.571685205829739e-06, + "loss": 0.0, + "num_input_tokens_seen": 18670984, + "step": 30640 + }, + { + "epoch": 8.451461665747381, + "grad_norm": 0.0003394206869415939, + "learning_rate": 3.5654893166955083e-06, + "loss": 0.0, + "num_input_tokens_seen": 18673832, + "step": 30645 + }, + { + "epoch": 8.452840595697738, + "grad_norm": 1.1873190487676766e-05, + "learning_rate": 3.559298393629723e-06, + "loss": 0.0, + "num_input_tokens_seen": 18676232, + "step": 30650 + }, + { + "epoch": 8.454219525648098, + "grad_norm": 9.308885751124762e-07, + "learning_rate": 3.5531124380667245e-06, + "loss": 0.0, + "num_input_tokens_seen": 18680232, + "step": 30655 + }, + { + "epoch": 8.455598455598455, + "grad_norm": 1.0470940878803958e-06, + "learning_rate": 3.5469314514397225e-06, + "loss": 0.0, + "num_input_tokens_seen": 18682504, + "step": 30660 + }, + { + "epoch": 8.456977385548814, + "grad_norm": 1.488542920924374e-06, + "learning_rate": 3.5407554351807508e-06, + "loss": 0.0, + "num_input_tokens_seen": 18685416, + "step": 30665 + }, + { + "epoch": 8.458356315499172, + "grad_norm": 6.450101750488102e-07, + "learning_rate": 3.534584390720716e-06, + "loss": 0.0, + "num_input_tokens_seen": 18687528, + "step": 30670 + }, + { + "epoch": 8.459735245449531, + "grad_norm": 1.0344863312639063e-06, + "learning_rate": 3.5284183194893488e-06, + "loss": 0.0, + "num_input_tokens_seen": 18690536, + "step": 30675 + }, + { + "epoch": 8.46111417539989, + "grad_norm": 8.130018045449106e-07, + "learning_rate": 3.522257222915251e-06, + "loss": 0.0, + "num_input_tokens_seen": 18695048, + "step": 30680 + }, + { + "epoch": 8.462493105350248, + "grad_norm": 1.268207256543974e-06, + "learning_rate": 3.5161011024258476e-06, + "loss": 0.0, + "num_input_tokens_seen": 18697512, + "step": 30685 + }, + { + "epoch": 8.463872035300607, + "grad_norm": 0.0005346211255528033, + "learning_rate": 3.509949959447434e-06, + "loss": 0.0, + "num_input_tokens_seen": 18700904, + "step": 30690 + }, + { + "epoch": 8.465250965250965, + "grad_norm": 2.2636857011093525e-06, + "learning_rate": 3.5038037954051334e-06, + "loss": 0.0, + "num_input_tokens_seen": 18704040, + "step": 30695 + }, + { + "epoch": 8.466629895201324, + "grad_norm": 1.125296194004477e-06, + "learning_rate": 3.497662611722935e-06, + "loss": 0.0, + "num_input_tokens_seen": 18706536, + "step": 30700 + }, + { + "epoch": 8.468008825151681, + "grad_norm": 3.2341333735530498e-06, + "learning_rate": 3.4915264098236535e-06, + "loss": 0.0, + "num_input_tokens_seen": 18709096, + "step": 30705 + }, + { + "epoch": 8.46938775510204, + "grad_norm": 8.24930850740202e-07, + "learning_rate": 3.4853951911289567e-06, + "loss": 0.0, + "num_input_tokens_seen": 18712168, + "step": 30710 + }, + { + "epoch": 8.4707666850524, + "grad_norm": 1.504938154539559e-06, + "learning_rate": 3.47926895705937e-06, + "loss": 0.0, + "num_input_tokens_seen": 18715688, + "step": 30715 + }, + { + "epoch": 8.472145615002757, + "grad_norm": 9.067227324521809e-07, + "learning_rate": 3.4731477090342413e-06, + "loss": 0.0, + "num_input_tokens_seen": 18718344, + "step": 30720 + }, + { + "epoch": 8.473524544953117, + "grad_norm": 8.883241389412433e-05, + "learning_rate": 3.4670314484717916e-06, + "loss": 0.0, + "num_input_tokens_seen": 18722344, + "step": 30725 + }, + { + "epoch": 8.474903474903474, + "grad_norm": 5.589899046753999e-06, + "learning_rate": 3.460920176789059e-06, + "loss": 0.0, + "num_input_tokens_seen": 18725128, + "step": 30730 + }, + { + "epoch": 8.476282404853833, + "grad_norm": 1.6389981283282395e-06, + "learning_rate": 3.454813895401937e-06, + "loss": 0.0, + "num_input_tokens_seen": 18727464, + "step": 30735 + }, + { + "epoch": 8.477661334804193, + "grad_norm": 2.620275608933298e-06, + "learning_rate": 3.448712605725171e-06, + "loss": 0.0, + "num_input_tokens_seen": 18729800, + "step": 30740 + }, + { + "epoch": 8.47904026475455, + "grad_norm": 2.2449346488429e-06, + "learning_rate": 3.4426163091723336e-06, + "loss": 0.0, + "num_input_tokens_seen": 18733352, + "step": 30745 + }, + { + "epoch": 8.48041919470491, + "grad_norm": 9.372143540531397e-05, + "learning_rate": 3.4365250071558526e-06, + "loss": 0.0, + "num_input_tokens_seen": 18736392, + "step": 30750 + }, + { + "epoch": 8.481798124655267, + "grad_norm": 9.59890712692868e-07, + "learning_rate": 3.430438701087002e-06, + "loss": 0.0, + "num_input_tokens_seen": 18739976, + "step": 30755 + }, + { + "epoch": 8.483177054605626, + "grad_norm": 4.83714700294513e-07, + "learning_rate": 3.424357392375879e-06, + "loss": 0.0, + "num_input_tokens_seen": 18742504, + "step": 30760 + }, + { + "epoch": 8.484555984555984, + "grad_norm": 1.8293366110810894e-06, + "learning_rate": 3.4182810824314487e-06, + "loss": 0.0, + "num_input_tokens_seen": 18745448, + "step": 30765 + }, + { + "epoch": 8.485934914506343, + "grad_norm": 1.1084777042924543e-06, + "learning_rate": 3.4122097726614984e-06, + "loss": 0.0, + "num_input_tokens_seen": 18748872, + "step": 30770 + }, + { + "epoch": 8.487313844456702, + "grad_norm": 4.933700097353722e-07, + "learning_rate": 3.406143464472661e-06, + "loss": 0.0, + "num_input_tokens_seen": 18752488, + "step": 30775 + }, + { + "epoch": 8.48869277440706, + "grad_norm": 4.119745426578447e-06, + "learning_rate": 3.400082159270418e-06, + "loss": 0.0, + "num_input_tokens_seen": 18755464, + "step": 30780 + }, + { + "epoch": 8.490071704357419, + "grad_norm": 8.568104021833278e-07, + "learning_rate": 3.394025858459085e-06, + "loss": 0.0, + "num_input_tokens_seen": 18758376, + "step": 30785 + }, + { + "epoch": 8.491450634307776, + "grad_norm": 9.659038369136397e-07, + "learning_rate": 3.387974563441812e-06, + "loss": 0.0, + "num_input_tokens_seen": 18761544, + "step": 30790 + }, + { + "epoch": 8.492829564258136, + "grad_norm": 6.299267170106759e-07, + "learning_rate": 3.3819282756206134e-06, + "loss": 0.0, + "num_input_tokens_seen": 18764584, + "step": 30795 + }, + { + "epoch": 8.494208494208495, + "grad_norm": 5.475120019582391e-07, + "learning_rate": 3.3758869963963157e-06, + "loss": 0.0, + "num_input_tokens_seen": 18768552, + "step": 30800 + }, + { + "epoch": 8.495587424158852, + "grad_norm": 7.312656293834152e-07, + "learning_rate": 3.369850727168597e-06, + "loss": 0.0, + "num_input_tokens_seen": 18771304, + "step": 30805 + }, + { + "epoch": 8.496966354109212, + "grad_norm": 1.689523742243182e-05, + "learning_rate": 3.36381946933598e-06, + "loss": 0.0, + "num_input_tokens_seen": 18774184, + "step": 30810 + }, + { + "epoch": 8.49834528405957, + "grad_norm": 1.237825244970736e-06, + "learning_rate": 3.3577932242958104e-06, + "loss": 0.0, + "num_input_tokens_seen": 18776904, + "step": 30815 + }, + { + "epoch": 8.499724214009929, + "grad_norm": 5.49234187019465e-07, + "learning_rate": 3.3517719934442914e-06, + "loss": 0.0, + "num_input_tokens_seen": 18779592, + "step": 30820 + }, + { + "epoch": 8.5, + "eval_loss": 0.36956456303596497, + "eval_runtime": 28.4951, + "eval_samples_per_second": 56.571, + "eval_steps_per_second": 14.143, + "num_input_tokens_seen": 18780040, + "step": 30821 + }, + { + "epoch": 8.501103143960286, + "grad_norm": 4.101479862583801e-05, + "learning_rate": 3.345755778176457e-06, + "loss": 0.0, + "num_input_tokens_seen": 18781864, + "step": 30825 + }, + { + "epoch": 8.502482073910645, + "grad_norm": 1.8942256474474561e-06, + "learning_rate": 3.3397445798861727e-06, + "loss": 0.0, + "num_input_tokens_seen": 18784776, + "step": 30830 + }, + { + "epoch": 8.503861003861005, + "grad_norm": 5.728617793465673e-07, + "learning_rate": 3.333738399966152e-06, + "loss": 0.0, + "num_input_tokens_seen": 18788168, + "step": 30835 + }, + { + "epoch": 8.505239933811362, + "grad_norm": 7.33348133508116e-05, + "learning_rate": 3.3277372398079377e-06, + "loss": 0.0, + "num_input_tokens_seen": 18790600, + "step": 30840 + }, + { + "epoch": 8.506618863761721, + "grad_norm": 4.137052746955305e-05, + "learning_rate": 3.3217411008019063e-06, + "loss": 0.0, + "num_input_tokens_seen": 18794280, + "step": 30845 + }, + { + "epoch": 8.507997793712079, + "grad_norm": 6.584404036402702e-05, + "learning_rate": 3.315749984337288e-06, + "loss": 0.0, + "num_input_tokens_seen": 18798632, + "step": 30850 + }, + { + "epoch": 8.509376723662438, + "grad_norm": 1.87854300293111e-06, + "learning_rate": 3.309763891802134e-06, + "loss": 0.0, + "num_input_tokens_seen": 18801064, + "step": 30855 + }, + { + "epoch": 8.510755653612797, + "grad_norm": 9.651421351009049e-07, + "learning_rate": 3.303782824583332e-06, + "loss": 0.0, + "num_input_tokens_seen": 18803784, + "step": 30860 + }, + { + "epoch": 8.512134583563155, + "grad_norm": 1.5856074924158747e-06, + "learning_rate": 3.2978067840666175e-06, + "loss": 0.0, + "num_input_tokens_seen": 18806152, + "step": 30865 + }, + { + "epoch": 8.513513513513514, + "grad_norm": 1.380409230478108e-06, + "learning_rate": 3.291835771636542e-06, + "loss": 0.0, + "num_input_tokens_seen": 18808456, + "step": 30870 + }, + { + "epoch": 8.514892443463872, + "grad_norm": 8.264963753390475e-07, + "learning_rate": 3.2858697886765153e-06, + "loss": 0.0, + "num_input_tokens_seen": 18812584, + "step": 30875 + }, + { + "epoch": 8.51627137341423, + "grad_norm": 6.38641552086483e-07, + "learning_rate": 3.2799088365687598e-06, + "loss": 0.0, + "num_input_tokens_seen": 18814696, + "step": 30880 + }, + { + "epoch": 8.517650303364588, + "grad_norm": 4.603590241458733e-06, + "learning_rate": 3.2739529166943534e-06, + "loss": 0.0, + "num_input_tokens_seen": 18817384, + "step": 30885 + }, + { + "epoch": 8.519029233314948, + "grad_norm": 3.020235681105987e-06, + "learning_rate": 3.268002030433187e-06, + "loss": 0.0, + "num_input_tokens_seen": 18819720, + "step": 30890 + }, + { + "epoch": 8.520408163265307, + "grad_norm": 6.161237138258002e-07, + "learning_rate": 3.2620561791640026e-06, + "loss": 0.0, + "num_input_tokens_seen": 18822600, + "step": 30895 + }, + { + "epoch": 8.521787093215664, + "grad_norm": 3.6209257814334705e-06, + "learning_rate": 3.2561153642643614e-06, + "loss": 0.0, + "num_input_tokens_seen": 18825352, + "step": 30900 + }, + { + "epoch": 8.523166023166024, + "grad_norm": 8.997556619760871e-07, + "learning_rate": 3.250179587110677e-06, + "loss": 0.0, + "num_input_tokens_seen": 18827656, + "step": 30905 + }, + { + "epoch": 8.524544953116381, + "grad_norm": 6.655196216343029e-07, + "learning_rate": 3.2442488490781687e-06, + "loss": 0.0, + "num_input_tokens_seen": 18831720, + "step": 30910 + }, + { + "epoch": 8.52592388306674, + "grad_norm": 7.876425911490514e-07, + "learning_rate": 3.2383231515409176e-06, + "loss": 0.0, + "num_input_tokens_seen": 18833928, + "step": 30915 + }, + { + "epoch": 8.527302813017098, + "grad_norm": 1.9121187506243587e-05, + "learning_rate": 3.232402495871814e-06, + "loss": 0.0, + "num_input_tokens_seen": 18836136, + "step": 30920 + }, + { + "epoch": 8.528681742967457, + "grad_norm": 1.603000782779418e-06, + "learning_rate": 3.2264868834425865e-06, + "loss": 0.0, + "num_input_tokens_seen": 18838952, + "step": 30925 + }, + { + "epoch": 8.530060672917816, + "grad_norm": 9.370307907374809e-07, + "learning_rate": 3.220576315623808e-06, + "loss": 0.0, + "num_input_tokens_seen": 18843368, + "step": 30930 + }, + { + "epoch": 8.531439602868174, + "grad_norm": 0.0005576022667810321, + "learning_rate": 3.214670793784866e-06, + "loss": 0.0, + "num_input_tokens_seen": 18845928, + "step": 30935 + }, + { + "epoch": 8.532818532818533, + "grad_norm": 9.014598276735342e-07, + "learning_rate": 3.20877031929398e-06, + "loss": 0.0, + "num_input_tokens_seen": 18849128, + "step": 30940 + }, + { + "epoch": 8.53419746276889, + "grad_norm": 7.225589797599241e-07, + "learning_rate": 3.202874893518215e-06, + "loss": 0.0, + "num_input_tokens_seen": 18851944, + "step": 30945 + }, + { + "epoch": 8.53557639271925, + "grad_norm": 1.4071322311792755e-06, + "learning_rate": 3.1969845178234483e-06, + "loss": 0.0, + "num_input_tokens_seen": 18855080, + "step": 30950 + }, + { + "epoch": 8.536955322669609, + "grad_norm": 1.271003384317737e-06, + "learning_rate": 3.191099193574401e-06, + "loss": 0.0, + "num_input_tokens_seen": 18858952, + "step": 30955 + }, + { + "epoch": 8.538334252619967, + "grad_norm": 3.515089929351234e-06, + "learning_rate": 3.185218922134614e-06, + "loss": 0.0, + "num_input_tokens_seen": 18860712, + "step": 30960 + }, + { + "epoch": 8.539713182570326, + "grad_norm": 0.0011949999025091529, + "learning_rate": 3.179343704866464e-06, + "loss": 0.0, + "num_input_tokens_seen": 18864840, + "step": 30965 + }, + { + "epoch": 8.541092112520683, + "grad_norm": 5.599442829407053e-06, + "learning_rate": 3.1734735431311614e-06, + "loss": 0.0, + "num_input_tokens_seen": 18867496, + "step": 30970 + }, + { + "epoch": 8.542471042471043, + "grad_norm": 1.33514913613908e-06, + "learning_rate": 3.16760843828873e-06, + "loss": 0.0, + "num_input_tokens_seen": 18870088, + "step": 30975 + }, + { + "epoch": 8.543849972421402, + "grad_norm": 1.3239324289315846e-05, + "learning_rate": 3.1617483916980246e-06, + "loss": 0.0, + "num_input_tokens_seen": 18873000, + "step": 30980 + }, + { + "epoch": 8.54522890237176, + "grad_norm": 4.408209861139767e-06, + "learning_rate": 3.1558934047167503e-06, + "loss": 0.0, + "num_input_tokens_seen": 18876072, + "step": 30985 + }, + { + "epoch": 8.546607832322119, + "grad_norm": 6.510742309728812e-07, + "learning_rate": 3.1500434787014118e-06, + "loss": 0.0, + "num_input_tokens_seen": 18879016, + "step": 30990 + }, + { + "epoch": 8.547986762272476, + "grad_norm": 7.714109528933477e-07, + "learning_rate": 3.1441986150073515e-06, + "loss": 0.0, + "num_input_tokens_seen": 18882248, + "step": 30995 + }, + { + "epoch": 8.549365692222835, + "grad_norm": 8.327048703904438e-07, + "learning_rate": 3.138358814988751e-06, + "loss": 0.0, + "num_input_tokens_seen": 18885736, + "step": 31000 + }, + { + "epoch": 8.550744622173193, + "grad_norm": 2.372915332671255e-05, + "learning_rate": 3.132524079998597e-06, + "loss": 0.0, + "num_input_tokens_seen": 18888264, + "step": 31005 + }, + { + "epoch": 8.552123552123552, + "grad_norm": 1.017378167489369e-06, + "learning_rate": 3.1266944113887227e-06, + "loss": 0.0, + "num_input_tokens_seen": 18891304, + "step": 31010 + }, + { + "epoch": 8.553502482073911, + "grad_norm": 0.00017532506899442524, + "learning_rate": 3.1208698105097745e-06, + "loss": 0.0, + "num_input_tokens_seen": 18894632, + "step": 31015 + }, + { + "epoch": 8.554881412024269, + "grad_norm": 1.0545358009039774e-06, + "learning_rate": 3.1150502787112234e-06, + "loss": 0.0, + "num_input_tokens_seen": 18897192, + "step": 31020 + }, + { + "epoch": 8.556260341974628, + "grad_norm": 8.3015976315437e-07, + "learning_rate": 3.1092358173413797e-06, + "loss": 0.0, + "num_input_tokens_seen": 18900040, + "step": 31025 + }, + { + "epoch": 8.557639271924986, + "grad_norm": 7.094628813320014e-07, + "learning_rate": 3.103426427747372e-06, + "loss": 0.0, + "num_input_tokens_seen": 18902920, + "step": 31030 + }, + { + "epoch": 8.559018201875345, + "grad_norm": 1.1389905694159097e-06, + "learning_rate": 3.0976221112751418e-06, + "loss": 0.0, + "num_input_tokens_seen": 18904936, + "step": 31035 + }, + { + "epoch": 8.560397131825702, + "grad_norm": 1.0384778761363123e-06, + "learning_rate": 3.0918228692694785e-06, + "loss": 0.0, + "num_input_tokens_seen": 18907976, + "step": 31040 + }, + { + "epoch": 8.561776061776062, + "grad_norm": 6.0192367527633905e-05, + "learning_rate": 3.086028703073976e-06, + "loss": 0.0, + "num_input_tokens_seen": 18910600, + "step": 31045 + }, + { + "epoch": 8.563154991726421, + "grad_norm": 4.172706394456327e-05, + "learning_rate": 3.0802396140310553e-06, + "loss": 0.0, + "num_input_tokens_seen": 18913480, + "step": 31050 + }, + { + "epoch": 8.564533921676778, + "grad_norm": 0.00037684914423152804, + "learning_rate": 3.0744556034819765e-06, + "loss": 0.0, + "num_input_tokens_seen": 18915912, + "step": 31055 + }, + { + "epoch": 8.565912851627138, + "grad_norm": 1.645988959353417e-05, + "learning_rate": 3.0686766727668016e-06, + "loss": 0.0, + "num_input_tokens_seen": 18918792, + "step": 31060 + }, + { + "epoch": 8.567291781577495, + "grad_norm": 3.0020186386536807e-05, + "learning_rate": 3.0629028232244338e-06, + "loss": 0.0, + "num_input_tokens_seen": 18922632, + "step": 31065 + }, + { + "epoch": 8.568670711527854, + "grad_norm": 1.2145547998443362e-06, + "learning_rate": 3.0571340561925897e-06, + "loss": 0.0, + "num_input_tokens_seen": 18925384, + "step": 31070 + }, + { + "epoch": 8.570049641478214, + "grad_norm": 1.7565902453497984e-05, + "learning_rate": 3.0513703730078004e-06, + "loss": 0.0, + "num_input_tokens_seen": 18929352, + "step": 31075 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 7.2431876105838455e-06, + "learning_rate": 3.0456117750054415e-06, + "loss": 0.0, + "num_input_tokens_seen": 18932840, + "step": 31080 + }, + { + "epoch": 8.57280750137893, + "grad_norm": 1.9169726783729857e-06, + "learning_rate": 3.039858263519693e-06, + "loss": 0.0, + "num_input_tokens_seen": 18935272, + "step": 31085 + }, + { + "epoch": 8.574186431329288, + "grad_norm": 6.24721849362686e-07, + "learning_rate": 3.0341098398835533e-06, + "loss": 0.0, + "num_input_tokens_seen": 18937224, + "step": 31090 + }, + { + "epoch": 8.575565361279647, + "grad_norm": 6.613400387323054e-07, + "learning_rate": 3.028366505428856e-06, + "loss": 0.0, + "num_input_tokens_seen": 18940264, + "step": 31095 + }, + { + "epoch": 8.576944291230006, + "grad_norm": 1.7823803091232548e-06, + "learning_rate": 3.0226282614862545e-06, + "loss": 0.0, + "num_input_tokens_seen": 18943080, + "step": 31100 + }, + { + "epoch": 8.578323221180364, + "grad_norm": 6.913969627930783e-07, + "learning_rate": 3.016895109385209e-06, + "loss": 0.0, + "num_input_tokens_seen": 18945480, + "step": 31105 + }, + { + "epoch": 8.579702151130723, + "grad_norm": 5.352484890863707e-07, + "learning_rate": 3.011167050454014e-06, + "loss": 0.0, + "num_input_tokens_seen": 18948264, + "step": 31110 + }, + { + "epoch": 8.58108108108108, + "grad_norm": 9.798244491321384e-07, + "learning_rate": 3.0054440860197737e-06, + "loss": 0.0, + "num_input_tokens_seen": 18951272, + "step": 31115 + }, + { + "epoch": 8.58246001103144, + "grad_norm": 7.38676192213461e-07, + "learning_rate": 2.9997262174084244e-06, + "loss": 0.0, + "num_input_tokens_seen": 18953896, + "step": 31120 + }, + { + "epoch": 8.583838940981797, + "grad_norm": 1.0958626717183506e-06, + "learning_rate": 2.99401344594471e-06, + "loss": 0.0, + "num_input_tokens_seen": 18957224, + "step": 31125 + }, + { + "epoch": 8.585217870932157, + "grad_norm": 1.3055795534455683e-05, + "learning_rate": 2.988305772952191e-06, + "loss": 0.0, + "num_input_tokens_seen": 18959752, + "step": 31130 + }, + { + "epoch": 8.586596800882516, + "grad_norm": 6.740772846569598e-07, + "learning_rate": 2.9826031997532616e-06, + "loss": 0.0, + "num_input_tokens_seen": 18962376, + "step": 31135 + }, + { + "epoch": 8.587975730832873, + "grad_norm": 6.754332275704655e-07, + "learning_rate": 2.976905727669127e-06, + "loss": 0.0, + "num_input_tokens_seen": 18965064, + "step": 31140 + }, + { + "epoch": 8.589354660783233, + "grad_norm": 6.2707686083740555e-06, + "learning_rate": 2.9712133580198002e-06, + "loss": 0.0, + "num_input_tokens_seen": 18968680, + "step": 31145 + }, + { + "epoch": 8.59073359073359, + "grad_norm": 5.541087375604548e-07, + "learning_rate": 2.9655260921241352e-06, + "loss": 0.0, + "num_input_tokens_seen": 18971240, + "step": 31150 + }, + { + "epoch": 8.59211252068395, + "grad_norm": 6.935846954547742e-07, + "learning_rate": 2.9598439312997745e-06, + "loss": 0.0, + "num_input_tokens_seen": 18974632, + "step": 31155 + }, + { + "epoch": 8.593491450634307, + "grad_norm": 2.4179134925361723e-05, + "learning_rate": 2.9541668768632068e-06, + "loss": 0.0, + "num_input_tokens_seen": 18978568, + "step": 31160 + }, + { + "epoch": 8.594870380584666, + "grad_norm": 6.026135679348954e-07, + "learning_rate": 2.9484949301297166e-06, + "loss": 0.0, + "num_input_tokens_seen": 18981768, + "step": 31165 + }, + { + "epoch": 8.596249310535026, + "grad_norm": 3.963904873671709e-06, + "learning_rate": 2.9428280924134104e-06, + "loss": 0.0, + "num_input_tokens_seen": 18984456, + "step": 31170 + }, + { + "epoch": 8.597628240485383, + "grad_norm": 6.547268640133552e-06, + "learning_rate": 2.9371663650272248e-06, + "loss": 0.0, + "num_input_tokens_seen": 18988584, + "step": 31175 + }, + { + "epoch": 8.599007170435742, + "grad_norm": 8.031282163756259e-07, + "learning_rate": 2.931509749282893e-06, + "loss": 0.0, + "num_input_tokens_seen": 18991240, + "step": 31180 + }, + { + "epoch": 8.6003861003861, + "grad_norm": 1.2998410738873645e-06, + "learning_rate": 2.925858246490967e-06, + "loss": 0.0, + "num_input_tokens_seen": 18993736, + "step": 31185 + }, + { + "epoch": 8.601765030336459, + "grad_norm": 1.597325081093004e-06, + "learning_rate": 2.9202118579608277e-06, + "loss": 0.0, + "num_input_tokens_seen": 18997000, + "step": 31190 + }, + { + "epoch": 8.603143960286818, + "grad_norm": 3.528693923726678e-07, + "learning_rate": 2.9145705850006534e-06, + "loss": 0.0, + "num_input_tokens_seen": 19000616, + "step": 31195 + }, + { + "epoch": 8.604522890237176, + "grad_norm": 3.6189323964208597e-06, + "learning_rate": 2.9089344289174563e-06, + "loss": 0.0, + "num_input_tokens_seen": 19004424, + "step": 31200 + }, + { + "epoch": 8.605901820187535, + "grad_norm": 6.34968955637305e-07, + "learning_rate": 2.903303391017048e-06, + "loss": 0.0, + "num_input_tokens_seen": 19009288, + "step": 31205 + }, + { + "epoch": 8.607280750137893, + "grad_norm": 1.6710728232283145e-06, + "learning_rate": 2.8976774726040546e-06, + "loss": 0.0, + "num_input_tokens_seen": 19012232, + "step": 31210 + }, + { + "epoch": 8.608659680088252, + "grad_norm": 6.434290753531968e-06, + "learning_rate": 2.892056674981927e-06, + "loss": 0.0, + "num_input_tokens_seen": 19015272, + "step": 31215 + }, + { + "epoch": 8.61003861003861, + "grad_norm": 2.5072415610338794e-06, + "learning_rate": 2.8864409994529244e-06, + "loss": 0.0, + "num_input_tokens_seen": 19017992, + "step": 31220 + }, + { + "epoch": 8.611417539988969, + "grad_norm": 4.568767337786994e-07, + "learning_rate": 2.8808304473181088e-06, + "loss": 0.0, + "num_input_tokens_seen": 19020840, + "step": 31225 + }, + { + "epoch": 8.612796469939328, + "grad_norm": 1.3989165381644852e-06, + "learning_rate": 2.8752250198773726e-06, + "loss": 0.0, + "num_input_tokens_seen": 19024104, + "step": 31230 + }, + { + "epoch": 8.614175399889685, + "grad_norm": 2.0826510080951266e-05, + "learning_rate": 2.869624718429409e-06, + "loss": 0.0, + "num_input_tokens_seen": 19026984, + "step": 31235 + }, + { + "epoch": 8.615554329840045, + "grad_norm": 9.25755728076183e-07, + "learning_rate": 2.864029544271729e-06, + "loss": 0.0, + "num_input_tokens_seen": 19029800, + "step": 31240 + }, + { + "epoch": 8.616933259790402, + "grad_norm": 1.274300302611664e-06, + "learning_rate": 2.85843949870066e-06, + "loss": 0.0, + "num_input_tokens_seen": 19032648, + "step": 31245 + }, + { + "epoch": 8.618312189740761, + "grad_norm": 2.244813458673889e-06, + "learning_rate": 2.8528545830113248e-06, + "loss": 0.0, + "num_input_tokens_seen": 19036104, + "step": 31250 + }, + { + "epoch": 8.619691119691119, + "grad_norm": 4.2609019601513864e-07, + "learning_rate": 2.847274798497676e-06, + "loss": 0.0, + "num_input_tokens_seen": 19039208, + "step": 31255 + }, + { + "epoch": 8.621070049641478, + "grad_norm": 6.879051852592966e-07, + "learning_rate": 2.8417001464524695e-06, + "loss": 0.0, + "num_input_tokens_seen": 19041768, + "step": 31260 + }, + { + "epoch": 8.622448979591837, + "grad_norm": 1.4184473684508703e-06, + "learning_rate": 2.8361306281672636e-06, + "loss": 0.0, + "num_input_tokens_seen": 19044264, + "step": 31265 + }, + { + "epoch": 8.623827909542195, + "grad_norm": 4.269886176189175e-06, + "learning_rate": 2.8305662449324478e-06, + "loss": 0.0, + "num_input_tokens_seen": 19047176, + "step": 31270 + }, + { + "epoch": 8.625206839492554, + "grad_norm": 3.7016181977378437e-06, + "learning_rate": 2.8250069980372024e-06, + "loss": 0.0, + "num_input_tokens_seen": 19049576, + "step": 31275 + }, + { + "epoch": 8.626585769442912, + "grad_norm": 1.0634462341840845e-06, + "learning_rate": 2.8194528887695193e-06, + "loss": 0.0, + "num_input_tokens_seen": 19052808, + "step": 31280 + }, + { + "epoch": 8.62796469939327, + "grad_norm": 8.290900836982473e-07, + "learning_rate": 2.8139039184162217e-06, + "loss": 0.0, + "num_input_tokens_seen": 19055560, + "step": 31285 + }, + { + "epoch": 8.62934362934363, + "grad_norm": 6.493033879451104e-07, + "learning_rate": 2.8083600882629154e-06, + "loss": 0.0, + "num_input_tokens_seen": 19059592, + "step": 31290 + }, + { + "epoch": 8.630722559293988, + "grad_norm": 1.4506813386105932e-06, + "learning_rate": 2.8028213995940245e-06, + "loss": 0.0, + "num_input_tokens_seen": 19062952, + "step": 31295 + }, + { + "epoch": 8.632101489244347, + "grad_norm": 5.032093213230837e-07, + "learning_rate": 2.797287853692787e-06, + "loss": 0.0, + "num_input_tokens_seen": 19067784, + "step": 31300 + }, + { + "epoch": 8.633480419194704, + "grad_norm": 6.598239679078688e-07, + "learning_rate": 2.7917594518412525e-06, + "loss": 0.0, + "num_input_tokens_seen": 19070696, + "step": 31305 + }, + { + "epoch": 8.634859349145064, + "grad_norm": 1.2643154150282498e-06, + "learning_rate": 2.7862361953202604e-06, + "loss": 0.0, + "num_input_tokens_seen": 19073320, + "step": 31310 + }, + { + "epoch": 8.636238279095423, + "grad_norm": 1.1097455399067258e-06, + "learning_rate": 2.780718085409484e-06, + "loss": 0.0, + "num_input_tokens_seen": 19075720, + "step": 31315 + }, + { + "epoch": 8.63761720904578, + "grad_norm": 5.366796358430292e-07, + "learning_rate": 2.775205123387373e-06, + "loss": 0.0, + "num_input_tokens_seen": 19077736, + "step": 31320 + }, + { + "epoch": 8.63899613899614, + "grad_norm": 0.0003571145061869174, + "learning_rate": 2.769697310531219e-06, + "loss": 0.0, + "num_input_tokens_seen": 19081096, + "step": 31325 + }, + { + "epoch": 8.640375068946497, + "grad_norm": 3.1906890853861114e-06, + "learning_rate": 2.7641946481170936e-06, + "loss": 0.0, + "num_input_tokens_seen": 19084232, + "step": 31330 + }, + { + "epoch": 8.641753998896856, + "grad_norm": 5.186045655136695e-07, + "learning_rate": 2.7586971374198827e-06, + "loss": 0.0, + "num_input_tokens_seen": 19087304, + "step": 31335 + }, + { + "epoch": 8.643132928847214, + "grad_norm": 5.3471754654310644e-05, + "learning_rate": 2.7532047797132867e-06, + "loss": 0.0, + "num_input_tokens_seen": 19090504, + "step": 31340 + }, + { + "epoch": 8.644511858797573, + "grad_norm": 2.322820819244953e-06, + "learning_rate": 2.747717576269801e-06, + "loss": 0.0, + "num_input_tokens_seen": 19093992, + "step": 31345 + }, + { + "epoch": 8.645890788747932, + "grad_norm": 7.9959323784351e-07, + "learning_rate": 2.7422355283607373e-06, + "loss": 0.0, + "num_input_tokens_seen": 19096776, + "step": 31350 + }, + { + "epoch": 8.64726971869829, + "grad_norm": 9.025936833495507e-07, + "learning_rate": 2.7367586372562037e-06, + "loss": 0.0, + "num_input_tokens_seen": 19100232, + "step": 31355 + }, + { + "epoch": 8.64864864864865, + "grad_norm": 4.873572379437974e-06, + "learning_rate": 2.7312869042251138e-06, + "loss": 0.0, + "num_input_tokens_seen": 19103848, + "step": 31360 + }, + { + "epoch": 8.650027578599007, + "grad_norm": 9.815964858717052e-07, + "learning_rate": 2.7258203305352004e-06, + "loss": 0.0, + "num_input_tokens_seen": 19107368, + "step": 31365 + }, + { + "epoch": 8.651406508549366, + "grad_norm": 1.1214611959076137e-06, + "learning_rate": 2.720358917452978e-06, + "loss": 0.0, + "num_input_tokens_seen": 19109928, + "step": 31370 + }, + { + "epoch": 8.652785438499723, + "grad_norm": 8.164003020283417e-07, + "learning_rate": 2.7149026662437877e-06, + "loss": 0.0, + "num_input_tokens_seen": 19112968, + "step": 31375 + }, + { + "epoch": 8.654164368450083, + "grad_norm": 5.794077537757403e-07, + "learning_rate": 2.7094515781717537e-06, + "loss": 0.0, + "num_input_tokens_seen": 19115272, + "step": 31380 + }, + { + "epoch": 8.655543298400442, + "grad_norm": 1.5786679341545096e-06, + "learning_rate": 2.704005654499825e-06, + "loss": 0.0, + "num_input_tokens_seen": 19119080, + "step": 31385 + }, + { + "epoch": 8.6569222283508, + "grad_norm": 8.921329026634339e-07, + "learning_rate": 2.698564896489739e-06, + "loss": 0.0, + "num_input_tokens_seen": 19122184, + "step": 31390 + }, + { + "epoch": 8.658301158301159, + "grad_norm": 6.739205105077417e-07, + "learning_rate": 2.693129305402045e-06, + "loss": 0.0, + "num_input_tokens_seen": 19125032, + "step": 31395 + }, + { + "epoch": 8.659680088251516, + "grad_norm": 1.0750508181445184e-06, + "learning_rate": 2.687698882496084e-06, + "loss": 0.0, + "num_input_tokens_seen": 19128072, + "step": 31400 + }, + { + "epoch": 8.661059018201875, + "grad_norm": 7.379471185231523e-07, + "learning_rate": 2.682273629030016e-06, + "loss": 0.0, + "num_input_tokens_seen": 19130664, + "step": 31405 + }, + { + "epoch": 8.662437948152235, + "grad_norm": 5.058273018221371e-05, + "learning_rate": 2.676853546260791e-06, + "loss": 0.0, + "num_input_tokens_seen": 19133096, + "step": 31410 + }, + { + "epoch": 8.663816878102592, + "grad_norm": 5.182999984754133e-07, + "learning_rate": 2.6714386354441595e-06, + "loss": 0.0, + "num_input_tokens_seen": 19136360, + "step": 31415 + }, + { + "epoch": 8.665195808052951, + "grad_norm": 6.75032026720146e-07, + "learning_rate": 2.6660288978346854e-06, + "loss": 0.0, + "num_input_tokens_seen": 19139304, + "step": 31420 + }, + { + "epoch": 8.666574738003309, + "grad_norm": 4.6235720674303593e-07, + "learning_rate": 2.660624334685724e-06, + "loss": 0.0, + "num_input_tokens_seen": 19142664, + "step": 31425 + }, + { + "epoch": 8.667953667953668, + "grad_norm": 4.4922802544533624e-07, + "learning_rate": 2.655224947249432e-06, + "loss": 0.0, + "num_input_tokens_seen": 19145448, + "step": 31430 + }, + { + "epoch": 8.669332597904026, + "grad_norm": 7.956637091410812e-06, + "learning_rate": 2.649830736776779e-06, + "loss": 0.0, + "num_input_tokens_seen": 19148712, + "step": 31435 + }, + { + "epoch": 8.670711527854385, + "grad_norm": 0.0001791750983102247, + "learning_rate": 2.644441704517517e-06, + "loss": 0.0, + "num_input_tokens_seen": 19151304, + "step": 31440 + }, + { + "epoch": 8.672090457804744, + "grad_norm": 3.932917024940252e-06, + "learning_rate": 2.6390578517202088e-06, + "loss": 0.0, + "num_input_tokens_seen": 19153832, + "step": 31445 + }, + { + "epoch": 8.673469387755102, + "grad_norm": 4.314841135055758e-06, + "learning_rate": 2.633679179632223e-06, + "loss": 0.0, + "num_input_tokens_seen": 19156648, + "step": 31450 + }, + { + "epoch": 8.674848317705461, + "grad_norm": 4.955250574312231e-07, + "learning_rate": 2.6283056894997106e-06, + "loss": 0.0, + "num_input_tokens_seen": 19160008, + "step": 31455 + }, + { + "epoch": 8.676227247655818, + "grad_norm": 8.815569003672863e-07, + "learning_rate": 2.622937382567642e-06, + "loss": 0.0, + "num_input_tokens_seen": 19163144, + "step": 31460 + }, + { + "epoch": 8.677606177606178, + "grad_norm": 0.00016531645087525249, + "learning_rate": 2.617574260079772e-06, + "loss": 0.0, + "num_input_tokens_seen": 19166472, + "step": 31465 + }, + { + "epoch": 8.678985107556537, + "grad_norm": 1.6307586747643654e-06, + "learning_rate": 2.6122163232786545e-06, + "loss": 0.0, + "num_input_tokens_seen": 19169128, + "step": 31470 + }, + { + "epoch": 8.680364037506894, + "grad_norm": 1.2710379451164044e-06, + "learning_rate": 2.6068635734056575e-06, + "loss": 0.0, + "num_input_tokens_seen": 19172488, + "step": 31475 + }, + { + "epoch": 8.681742967457254, + "grad_norm": 1.0227517122984864e-05, + "learning_rate": 2.6015160117009275e-06, + "loss": 0.0, + "num_input_tokens_seen": 19175368, + "step": 31480 + }, + { + "epoch": 8.683121897407611, + "grad_norm": 1.0557554560364224e-05, + "learning_rate": 2.596173639403418e-06, + "loss": 0.0, + "num_input_tokens_seen": 19179112, + "step": 31485 + }, + { + "epoch": 8.68450082735797, + "grad_norm": 8.668137638778717e-07, + "learning_rate": 2.5908364577508835e-06, + "loss": 0.0, + "num_input_tokens_seen": 19181608, + "step": 31490 + }, + { + "epoch": 8.685879757308328, + "grad_norm": 1.2044404229527572e-06, + "learning_rate": 2.5855044679798713e-06, + "loss": 0.0, + "num_input_tokens_seen": 19185608, + "step": 31495 + }, + { + "epoch": 8.687258687258687, + "grad_norm": 7.555542538284499e-07, + "learning_rate": 2.5801776713257285e-06, + "loss": 0.0, + "num_input_tokens_seen": 19188808, + "step": 31500 + }, + { + "epoch": 8.688637617209046, + "grad_norm": 9.104517175728688e-07, + "learning_rate": 2.5748560690225966e-06, + "loss": 0.0, + "num_input_tokens_seen": 19191208, + "step": 31505 + }, + { + "epoch": 8.690016547159404, + "grad_norm": 4.4452372094383463e-07, + "learning_rate": 2.569539662303408e-06, + "loss": 0.0, + "num_input_tokens_seen": 19193576, + "step": 31510 + }, + { + "epoch": 8.691395477109763, + "grad_norm": 6.764616387044953e-07, + "learning_rate": 2.5642284523999027e-06, + "loss": 0.0, + "num_input_tokens_seen": 19196552, + "step": 31515 + }, + { + "epoch": 8.69277440706012, + "grad_norm": 3.5052537441515597e-06, + "learning_rate": 2.558922440542619e-06, + "loss": 0.0, + "num_input_tokens_seen": 19199272, + "step": 31520 + }, + { + "epoch": 8.69415333701048, + "grad_norm": 4.11531473218929e-05, + "learning_rate": 2.5536216279608727e-06, + "loss": 0.0, + "num_input_tokens_seen": 19202056, + "step": 31525 + }, + { + "epoch": 8.69553226696084, + "grad_norm": 3.194928240191075e-06, + "learning_rate": 2.5483260158827932e-06, + "loss": 0.0, + "num_input_tokens_seen": 19205224, + "step": 31530 + }, + { + "epoch": 8.696911196911197, + "grad_norm": 1.6453313946840353e-05, + "learning_rate": 2.5430356055352904e-06, + "loss": 0.0, + "num_input_tokens_seen": 19209352, + "step": 31535 + }, + { + "epoch": 8.698290126861556, + "grad_norm": 1.1637811212494853e-06, + "learning_rate": 2.537750398144087e-06, + "loss": 0.0, + "num_input_tokens_seen": 19211976, + "step": 31540 + }, + { + "epoch": 8.699669056811913, + "grad_norm": 8.279584449155664e-07, + "learning_rate": 2.532470394933684e-06, + "loss": 0.0, + "num_input_tokens_seen": 19214952, + "step": 31545 + }, + { + "epoch": 8.701047986762273, + "grad_norm": 5.875467081750685e-07, + "learning_rate": 2.527195597127377e-06, + "loss": 0.0, + "num_input_tokens_seen": 19218056, + "step": 31550 + }, + { + "epoch": 8.70242691671263, + "grad_norm": 1.3979788491269574e-05, + "learning_rate": 2.5219260059472697e-06, + "loss": 0.0, + "num_input_tokens_seen": 19220296, + "step": 31555 + }, + { + "epoch": 8.70380584666299, + "grad_norm": 7.468685362255201e-05, + "learning_rate": 2.51666162261425e-06, + "loss": 0.0, + "num_input_tokens_seen": 19223624, + "step": 31560 + }, + { + "epoch": 8.705184776613349, + "grad_norm": 3.561093308235286e-06, + "learning_rate": 2.511402448347991e-06, + "loss": 0.0, + "num_input_tokens_seen": 19227112, + "step": 31565 + }, + { + "epoch": 8.706563706563706, + "grad_norm": 1.4319529100248474e-06, + "learning_rate": 2.5061484843669767e-06, + "loss": 0.0, + "num_input_tokens_seen": 19229384, + "step": 31570 + }, + { + "epoch": 8.707942636514066, + "grad_norm": 8.06058324087644e-07, + "learning_rate": 2.5008997318884703e-06, + "loss": 0.0, + "num_input_tokens_seen": 19231944, + "step": 31575 + }, + { + "epoch": 8.709321566464423, + "grad_norm": 2.1393532279034844e-06, + "learning_rate": 2.4956561921285372e-06, + "loss": 0.0, + "num_input_tokens_seen": 19234760, + "step": 31580 + }, + { + "epoch": 8.710700496414782, + "grad_norm": 4.149598225922091e-06, + "learning_rate": 2.4904178663020224e-06, + "loss": 0.0, + "num_input_tokens_seen": 19237736, + "step": 31585 + }, + { + "epoch": 8.71207942636514, + "grad_norm": 5.03541627949744e-07, + "learning_rate": 2.485184755622577e-06, + "loss": 0.0, + "num_input_tokens_seen": 19241384, + "step": 31590 + }, + { + "epoch": 8.713458356315499, + "grad_norm": 6.831798486928164e-07, + "learning_rate": 2.479956861302643e-06, + "loss": 0.0, + "num_input_tokens_seen": 19244392, + "step": 31595 + }, + { + "epoch": 8.714837286265858, + "grad_norm": 1.393521529280406e-06, + "learning_rate": 2.47473418455344e-06, + "loss": 0.0, + "num_input_tokens_seen": 19247080, + "step": 31600 + }, + { + "epoch": 8.716216216216216, + "grad_norm": 2.3413708731823135e-06, + "learning_rate": 2.4695167265849835e-06, + "loss": 0.0, + "num_input_tokens_seen": 19249736, + "step": 31605 + }, + { + "epoch": 8.717595146166575, + "grad_norm": 1.016621808958007e-06, + "learning_rate": 2.464304488606098e-06, + "loss": 0.0, + "num_input_tokens_seen": 19252680, + "step": 31610 + }, + { + "epoch": 8.718974076116933, + "grad_norm": 1.2180917110526934e-06, + "learning_rate": 2.4590974718243733e-06, + "loss": 0.0, + "num_input_tokens_seen": 19255272, + "step": 31615 + }, + { + "epoch": 8.720353006067292, + "grad_norm": 4.234787866153056e-06, + "learning_rate": 2.453895677446197e-06, + "loss": 0.0, + "num_input_tokens_seen": 19258728, + "step": 31620 + }, + { + "epoch": 8.721731936017651, + "grad_norm": 8.479050279674993e-07, + "learning_rate": 2.4486991066767606e-06, + "loss": 0.0, + "num_input_tokens_seen": 19260936, + "step": 31625 + }, + { + "epoch": 8.723110865968009, + "grad_norm": 9.44303519645473e-06, + "learning_rate": 2.4435077607200265e-06, + "loss": 0.0, + "num_input_tokens_seen": 19263240, + "step": 31630 + }, + { + "epoch": 8.724489795918368, + "grad_norm": 5.681657739842194e-07, + "learning_rate": 2.4383216407787652e-06, + "loss": 0.0, + "num_input_tokens_seen": 19265832, + "step": 31635 + }, + { + "epoch": 8.725868725868725, + "grad_norm": 6.867811066513241e-07, + "learning_rate": 2.433140748054519e-06, + "loss": 0.0, + "num_input_tokens_seen": 19268232, + "step": 31640 + }, + { + "epoch": 8.727247655819085, + "grad_norm": 7.865776296966942e-07, + "learning_rate": 2.4279650837476213e-06, + "loss": 0.0, + "num_input_tokens_seen": 19271592, + "step": 31645 + }, + { + "epoch": 8.728626585769444, + "grad_norm": 4.6519153329427354e-07, + "learning_rate": 2.4227946490572055e-06, + "loss": 0.0, + "num_input_tokens_seen": 19275240, + "step": 31650 + }, + { + "epoch": 8.730005515719801, + "grad_norm": 1.7280106476391666e-05, + "learning_rate": 2.417629445181194e-06, + "loss": 0.0, + "num_input_tokens_seen": 19277928, + "step": 31655 + }, + { + "epoch": 8.73138444567016, + "grad_norm": 4.870250904787099e-07, + "learning_rate": 2.4124694733162756e-06, + "loss": 0.0, + "num_input_tokens_seen": 19280648, + "step": 31660 + }, + { + "epoch": 8.732763375620518, + "grad_norm": 1.269176323148713e-06, + "learning_rate": 2.4073147346579533e-06, + "loss": 0.0, + "num_input_tokens_seen": 19283560, + "step": 31665 + }, + { + "epoch": 8.734142305570877, + "grad_norm": 1.1862265409945394e-06, + "learning_rate": 2.402165230400505e-06, + "loss": 0.0, + "num_input_tokens_seen": 19286440, + "step": 31670 + }, + { + "epoch": 8.735521235521235, + "grad_norm": 4.190545496385312e-06, + "learning_rate": 2.397020961736987e-06, + "loss": 0.0, + "num_input_tokens_seen": 19289704, + "step": 31675 + }, + { + "epoch": 8.736900165471594, + "grad_norm": 1.61968807788071e-06, + "learning_rate": 2.391881929859266e-06, + "loss": 0.0, + "num_input_tokens_seen": 19292776, + "step": 31680 + }, + { + "epoch": 8.738279095421953, + "grad_norm": 9.018672244565096e-06, + "learning_rate": 2.3867481359579695e-06, + "loss": 0.0, + "num_input_tokens_seen": 19295272, + "step": 31685 + }, + { + "epoch": 8.73965802537231, + "grad_norm": 3.7522245293075684e-06, + "learning_rate": 2.3816195812225354e-06, + "loss": 0.0, + "num_input_tokens_seen": 19298472, + "step": 31690 + }, + { + "epoch": 8.74103695532267, + "grad_norm": 4.557513193503837e-07, + "learning_rate": 2.3764962668411688e-06, + "loss": 0.0, + "num_input_tokens_seen": 19301096, + "step": 31695 + }, + { + "epoch": 8.742415885273028, + "grad_norm": 1.3366600342124002e-06, + "learning_rate": 2.371378194000867e-06, + "loss": 0.0, + "num_input_tokens_seen": 19305480, + "step": 31700 + }, + { + "epoch": 8.743794815223387, + "grad_norm": 7.474981771338207e-07, + "learning_rate": 2.366265363887421e-06, + "loss": 0.0, + "num_input_tokens_seen": 19309032, + "step": 31705 + }, + { + "epoch": 8.745173745173744, + "grad_norm": 1.0634216778271366e-06, + "learning_rate": 2.3611577776853966e-06, + "loss": 0.0, + "num_input_tokens_seen": 19314376, + "step": 31710 + }, + { + "epoch": 8.746552675124104, + "grad_norm": 5.111495511300745e-07, + "learning_rate": 2.356055436578142e-06, + "loss": 0.0, + "num_input_tokens_seen": 19318760, + "step": 31715 + }, + { + "epoch": 8.747931605074463, + "grad_norm": 2.255149183838512e-06, + "learning_rate": 2.3509583417478026e-06, + "loss": 0.0, + "num_input_tokens_seen": 19321736, + "step": 31720 + }, + { + "epoch": 8.74931053502482, + "grad_norm": 4.3183297293580836e-07, + "learning_rate": 2.345866494375307e-06, + "loss": 0.0, + "num_input_tokens_seen": 19324264, + "step": 31725 + }, + { + "epoch": 8.75068946497518, + "grad_norm": 1.0587440328890807e-06, + "learning_rate": 2.3407798956403494e-06, + "loss": 0.0, + "num_input_tokens_seen": 19327048, + "step": 31730 + }, + { + "epoch": 8.752068394925537, + "grad_norm": 1.4067509255255572e-06, + "learning_rate": 2.335698546721438e-06, + "loss": 0.0, + "num_input_tokens_seen": 19329512, + "step": 31735 + }, + { + "epoch": 8.753447324875896, + "grad_norm": 6.013263487147924e-07, + "learning_rate": 2.3306224487958356e-06, + "loss": 0.0, + "num_input_tokens_seen": 19332040, + "step": 31740 + }, + { + "epoch": 8.754826254826256, + "grad_norm": 6.608613034586597e-07, + "learning_rate": 2.3255516030396108e-06, + "loss": 0.0, + "num_input_tokens_seen": 19335016, + "step": 31745 + }, + { + "epoch": 8.756205184776613, + "grad_norm": 9.493224411016854e-07, + "learning_rate": 2.3204860106275984e-06, + "loss": 0.0, + "num_input_tokens_seen": 19338024, + "step": 31750 + }, + { + "epoch": 8.757584114726972, + "grad_norm": 1.297951257583918e-06, + "learning_rate": 2.3154256727334232e-06, + "loss": 0.0, + "num_input_tokens_seen": 19340968, + "step": 31755 + }, + { + "epoch": 8.75896304467733, + "grad_norm": 1.144934003605158e-06, + "learning_rate": 2.310370590529501e-06, + "loss": 0.0, + "num_input_tokens_seen": 19343304, + "step": 31760 + }, + { + "epoch": 8.76034197462769, + "grad_norm": 5.973743100184947e-06, + "learning_rate": 2.3053207651870164e-06, + "loss": 0.0, + "num_input_tokens_seen": 19345800, + "step": 31765 + }, + { + "epoch": 8.761720904578047, + "grad_norm": 9.345542366645532e-07, + "learning_rate": 2.3002761978759364e-06, + "loss": 0.0, + "num_input_tokens_seen": 19349032, + "step": 31770 + }, + { + "epoch": 8.763099834528406, + "grad_norm": 8.471910746266076e-07, + "learning_rate": 2.2952368897650236e-06, + "loss": 0.0, + "num_input_tokens_seen": 19351656, + "step": 31775 + }, + { + "epoch": 8.764478764478765, + "grad_norm": 8.678824201524549e-07, + "learning_rate": 2.2902028420218073e-06, + "loss": 0.0, + "num_input_tokens_seen": 19355080, + "step": 31780 + }, + { + "epoch": 8.765857694429123, + "grad_norm": 5.466929451358737e-07, + "learning_rate": 2.2851740558126084e-06, + "loss": 0.0, + "num_input_tokens_seen": 19358920, + "step": 31785 + }, + { + "epoch": 8.767236624379482, + "grad_norm": 5.304038950271206e-07, + "learning_rate": 2.280150532302522e-06, + "loss": 0.0, + "num_input_tokens_seen": 19361992, + "step": 31790 + }, + { + "epoch": 8.76861555432984, + "grad_norm": 6.802314373999252e-07, + "learning_rate": 2.2751322726554224e-06, + "loss": 0.0, + "num_input_tokens_seen": 19364520, + "step": 31795 + }, + { + "epoch": 8.769994484280199, + "grad_norm": 1.013358883028559e-06, + "learning_rate": 2.2701192780339803e-06, + "loss": 0.0, + "num_input_tokens_seen": 19368520, + "step": 31800 + }, + { + "epoch": 8.771373414230556, + "grad_norm": 5.007022991776466e-07, + "learning_rate": 2.2651115495996273e-06, + "loss": 0.0, + "num_input_tokens_seen": 19372200, + "step": 31805 + }, + { + "epoch": 8.772752344180915, + "grad_norm": 1.783734887794708e-06, + "learning_rate": 2.260109088512577e-06, + "loss": 0.0, + "num_input_tokens_seen": 19375912, + "step": 31810 + }, + { + "epoch": 8.774131274131275, + "grad_norm": 6.114439656812465e-07, + "learning_rate": 2.2551118959318356e-06, + "loss": 0.0, + "num_input_tokens_seen": 19378824, + "step": 31815 + }, + { + "epoch": 8.775510204081632, + "grad_norm": 2.226792275905609e-06, + "learning_rate": 2.25011997301518e-06, + "loss": 0.0, + "num_input_tokens_seen": 19381768, + "step": 31820 + }, + { + "epoch": 8.776889134031991, + "grad_norm": 8.979296239886025e-07, + "learning_rate": 2.24513332091916e-06, + "loss": 0.0, + "num_input_tokens_seen": 19384168, + "step": 31825 + }, + { + "epoch": 8.778268063982349, + "grad_norm": 6.591025112356874e-07, + "learning_rate": 2.2401519407991234e-06, + "loss": 0.0, + "num_input_tokens_seen": 19387816, + "step": 31830 + }, + { + "epoch": 8.779646993932708, + "grad_norm": 2.6399904982099542e-06, + "learning_rate": 2.2351758338091724e-06, + "loss": 0.0, + "num_input_tokens_seen": 19390344, + "step": 31835 + }, + { + "epoch": 8.781025923883067, + "grad_norm": 9.55755808718095e-07, + "learning_rate": 2.2302050011022065e-06, + "loss": 0.0, + "num_input_tokens_seen": 19393064, + "step": 31840 + }, + { + "epoch": 8.782404853833425, + "grad_norm": 3.1372187550005037e-06, + "learning_rate": 2.225239443829896e-06, + "loss": 0.0, + "num_input_tokens_seen": 19395272, + "step": 31845 + }, + { + "epoch": 8.783783783783784, + "grad_norm": 1.9327901554788696e-06, + "learning_rate": 2.2202791631426857e-06, + "loss": 0.0, + "num_input_tokens_seen": 19398824, + "step": 31850 + }, + { + "epoch": 8.785162713734142, + "grad_norm": 7.602212690471788e-07, + "learning_rate": 2.215324160189805e-06, + "loss": 0.0, + "num_input_tokens_seen": 19401640, + "step": 31855 + }, + { + "epoch": 8.786541643684501, + "grad_norm": 3.0626467832917115e-06, + "learning_rate": 2.210374436119253e-06, + "loss": 0.0, + "num_input_tokens_seen": 19404360, + "step": 31860 + }, + { + "epoch": 8.78792057363486, + "grad_norm": 1.9031277815884096e-06, + "learning_rate": 2.205429992077812e-06, + "loss": 0.0, + "num_input_tokens_seen": 19407560, + "step": 31865 + }, + { + "epoch": 8.789299503585218, + "grad_norm": 9.236122195943608e-07, + "learning_rate": 2.2004908292110438e-06, + "loss": 0.0, + "num_input_tokens_seen": 19410696, + "step": 31870 + }, + { + "epoch": 8.790678433535577, + "grad_norm": 1.7060357322407071e-06, + "learning_rate": 2.1955569486632716e-06, + "loss": 0.0, + "num_input_tokens_seen": 19414280, + "step": 31875 + }, + { + "epoch": 8.792057363485934, + "grad_norm": 5.15583963078825e-07, + "learning_rate": 2.1906283515776145e-06, + "loss": 0.0, + "num_input_tokens_seen": 19418408, + "step": 31880 + }, + { + "epoch": 8.793436293436294, + "grad_norm": 1.4025750942892046e-06, + "learning_rate": 2.1857050390959505e-06, + "loss": 0.0, + "num_input_tokens_seen": 19421352, + "step": 31885 + }, + { + "epoch": 8.794815223386651, + "grad_norm": 9.552649089528131e-07, + "learning_rate": 2.1807870123589423e-06, + "loss": 0.0, + "num_input_tokens_seen": 19424776, + "step": 31890 + }, + { + "epoch": 8.79619415333701, + "grad_norm": 7.325774618038849e-07, + "learning_rate": 2.175874272506029e-06, + "loss": 0.0, + "num_input_tokens_seen": 19427432, + "step": 31895 + }, + { + "epoch": 8.79757308328737, + "grad_norm": 4.744942430079391e-07, + "learning_rate": 2.1709668206754215e-06, + "loss": 0.0, + "num_input_tokens_seen": 19431112, + "step": 31900 + }, + { + "epoch": 8.798952013237727, + "grad_norm": 4.999965312890708e-07, + "learning_rate": 2.1660646580040965e-06, + "loss": 0.0, + "num_input_tokens_seen": 19434056, + "step": 31905 + }, + { + "epoch": 8.800330943188087, + "grad_norm": 1.4691592014060006e-06, + "learning_rate": 2.1611677856278294e-06, + "loss": 0.0, + "num_input_tokens_seen": 19436776, + "step": 31910 + }, + { + "epoch": 8.801709873138444, + "grad_norm": 4.5208358301351836e-07, + "learning_rate": 2.1562762046811484e-06, + "loss": 0.0, + "num_input_tokens_seen": 19439624, + "step": 31915 + }, + { + "epoch": 8.803088803088803, + "grad_norm": 4.660184345084417e-07, + "learning_rate": 2.1513899162973582e-06, + "loss": 0.0, + "num_input_tokens_seen": 19442568, + "step": 31920 + }, + { + "epoch": 8.80446773303916, + "grad_norm": 7.291050110325159e-07, + "learning_rate": 2.146508921608542e-06, + "loss": 0.0, + "num_input_tokens_seen": 19445608, + "step": 31925 + }, + { + "epoch": 8.80584666298952, + "grad_norm": 5.358558041734796e-07, + "learning_rate": 2.1416332217455674e-06, + "loss": 0.0, + "num_input_tokens_seen": 19448200, + "step": 31930 + }, + { + "epoch": 8.80722559293988, + "grad_norm": 8.694556186128466e-07, + "learning_rate": 2.1367628178380533e-06, + "loss": 0.0, + "num_input_tokens_seen": 19451720, + "step": 31935 + }, + { + "epoch": 8.808604522890237, + "grad_norm": 2.37303083849838e-06, + "learning_rate": 2.1318977110144077e-06, + "loss": 0.0, + "num_input_tokens_seen": 19456424, + "step": 31940 + }, + { + "epoch": 8.809983452840596, + "grad_norm": 4.884673785454652e-07, + "learning_rate": 2.1270379024018017e-06, + "loss": 0.0, + "num_input_tokens_seen": 19459080, + "step": 31945 + }, + { + "epoch": 8.811362382790954, + "grad_norm": 6.458293455580133e-07, + "learning_rate": 2.1221833931261897e-06, + "loss": 0.0, + "num_input_tokens_seen": 19461896, + "step": 31950 + }, + { + "epoch": 8.812741312741313, + "grad_norm": 4.5225240796753496e-07, + "learning_rate": 2.117334184312286e-06, + "loss": 0.0, + "num_input_tokens_seen": 19465768, + "step": 31955 + }, + { + "epoch": 8.814120242691672, + "grad_norm": 1.024080120259896e-06, + "learning_rate": 2.1124902770835813e-06, + "loss": 0.0, + "num_input_tokens_seen": 19468552, + "step": 31960 + }, + { + "epoch": 8.81549917264203, + "grad_norm": 9.193711889565748e-07, + "learning_rate": 2.107651672562347e-06, + "loss": 0.0, + "num_input_tokens_seen": 19471432, + "step": 31965 + }, + { + "epoch": 8.816878102592389, + "grad_norm": 3.952219049097039e-05, + "learning_rate": 2.1028183718696083e-06, + "loss": 0.0, + "num_input_tokens_seen": 19474376, + "step": 31970 + }, + { + "epoch": 8.818257032542746, + "grad_norm": 7.463279985131521e-07, + "learning_rate": 2.0979903761251833e-06, + "loss": 0.0, + "num_input_tokens_seen": 19477192, + "step": 31975 + }, + { + "epoch": 8.819635962493106, + "grad_norm": 2.2716462808602955e-06, + "learning_rate": 2.0931676864476417e-06, + "loss": 0.0, + "num_input_tokens_seen": 19480360, + "step": 31980 + }, + { + "epoch": 8.821014892443463, + "grad_norm": 2.210845423178398e-06, + "learning_rate": 2.0883503039543283e-06, + "loss": 0.0, + "num_input_tokens_seen": 19483368, + "step": 31985 + }, + { + "epoch": 8.822393822393822, + "grad_norm": 5.692211288987892e-07, + "learning_rate": 2.0835382297613726e-06, + "loss": 0.0, + "num_input_tokens_seen": 19486504, + "step": 31990 + }, + { + "epoch": 8.823772752344182, + "grad_norm": 5.111060090712272e-07, + "learning_rate": 2.078731464983649e-06, + "loss": 0.0, + "num_input_tokens_seen": 19489160, + "step": 31995 + }, + { + "epoch": 8.825151682294539, + "grad_norm": 8.013058732103673e-07, + "learning_rate": 2.0739300107348286e-06, + "loss": 0.0, + "num_input_tokens_seen": 19492904, + "step": 32000 + }, + { + "epoch": 8.826530612244898, + "grad_norm": 3.466177804511972e-05, + "learning_rate": 2.069133868127332e-06, + "loss": 0.0, + "num_input_tokens_seen": 19496104, + "step": 32005 + }, + { + "epoch": 8.827909542195256, + "grad_norm": 5.152707217348507e-06, + "learning_rate": 2.064343038272362e-06, + "loss": 0.0, + "num_input_tokens_seen": 19499976, + "step": 32010 + }, + { + "epoch": 8.829288472145615, + "grad_norm": 8.651553002891887e-07, + "learning_rate": 2.059557522279876e-06, + "loss": 0.0, + "num_input_tokens_seen": 19503560, + "step": 32015 + }, + { + "epoch": 8.830667402095974, + "grad_norm": 1.2673489209191757e-06, + "learning_rate": 2.054777321258622e-06, + "loss": 0.0, + "num_input_tokens_seen": 19506024, + "step": 32020 + }, + { + "epoch": 8.832046332046332, + "grad_norm": 1.3541458656618488e-06, + "learning_rate": 2.0500024363160926e-06, + "loss": 0.0, + "num_input_tokens_seen": 19509576, + "step": 32025 + }, + { + "epoch": 8.833425261996691, + "grad_norm": 8.593352163188683e-07, + "learning_rate": 2.0452328685585663e-06, + "loss": 0.0, + "num_input_tokens_seen": 19512008, + "step": 32030 + }, + { + "epoch": 8.834804191947049, + "grad_norm": 1.215472934745776e-06, + "learning_rate": 2.040468619091085e-06, + "loss": 0.0, + "num_input_tokens_seen": 19515368, + "step": 32035 + }, + { + "epoch": 8.836183121897408, + "grad_norm": 8.315424224747403e-07, + "learning_rate": 2.035709689017448e-06, + "loss": 0.0, + "num_input_tokens_seen": 19518184, + "step": 32040 + }, + { + "epoch": 8.837562051847765, + "grad_norm": 6.090083957133174e-07, + "learning_rate": 2.030956079440244e-06, + "loss": 0.0, + "num_input_tokens_seen": 19520808, + "step": 32045 + }, + { + "epoch": 8.838940981798125, + "grad_norm": 2.088561814161949e-06, + "learning_rate": 2.0262077914608047e-06, + "loss": 0.0, + "num_input_tokens_seen": 19523912, + "step": 32050 + }, + { + "epoch": 8.840319911748484, + "grad_norm": 1.6957120578808826e-06, + "learning_rate": 2.0214648261792425e-06, + "loss": 0.0, + "num_input_tokens_seen": 19527688, + "step": 32055 + }, + { + "epoch": 8.841698841698841, + "grad_norm": 1.1751717465813272e-06, + "learning_rate": 2.0167271846944417e-06, + "loss": 0.0, + "num_input_tokens_seen": 19530728, + "step": 32060 + }, + { + "epoch": 8.8430777716492, + "grad_norm": 5.007402069168165e-05, + "learning_rate": 2.0119948681040364e-06, + "loss": 0.0, + "num_input_tokens_seen": 19534024, + "step": 32065 + }, + { + "epoch": 8.844456701599558, + "grad_norm": 8.696372333361069e-07, + "learning_rate": 2.00726787750444e-06, + "loss": 0.0, + "num_input_tokens_seen": 19537544, + "step": 32070 + }, + { + "epoch": 8.845835631549917, + "grad_norm": 1.5730147424619645e-06, + "learning_rate": 2.002546213990833e-06, + "loss": 0.0, + "num_input_tokens_seen": 19540552, + "step": 32075 + }, + { + "epoch": 8.847214561500277, + "grad_norm": 1.5091991372173652e-06, + "learning_rate": 1.997829878657148e-06, + "loss": 0.0, + "num_input_tokens_seen": 19543112, + "step": 32080 + }, + { + "epoch": 8.848593491450634, + "grad_norm": 1.275492309105175e-06, + "learning_rate": 1.993118872596103e-06, + "loss": 0.0, + "num_input_tokens_seen": 19545256, + "step": 32085 + }, + { + "epoch": 8.849972421400993, + "grad_norm": 8.855476494318282e-07, + "learning_rate": 1.988413196899164e-06, + "loss": 0.0, + "num_input_tokens_seen": 19548584, + "step": 32090 + }, + { + "epoch": 8.85135135135135, + "grad_norm": 1.7828898535299231e-06, + "learning_rate": 1.9837128526565664e-06, + "loss": 0.0, + "num_input_tokens_seen": 19551304, + "step": 32095 + }, + { + "epoch": 8.85273028130171, + "grad_norm": 7.736547331660404e-07, + "learning_rate": 1.9790178409573174e-06, + "loss": 0.0, + "num_input_tokens_seen": 19553992, + "step": 32100 + }, + { + "epoch": 8.854109211252068, + "grad_norm": 0.0005129269557073712, + "learning_rate": 1.974328162889183e-06, + "loss": 0.0, + "num_input_tokens_seen": 19557000, + "step": 32105 + }, + { + "epoch": 8.855488141202427, + "grad_norm": 4.956811494594149e-07, + "learning_rate": 1.9696438195386884e-06, + "loss": 0.0, + "num_input_tokens_seen": 19560616, + "step": 32110 + }, + { + "epoch": 8.856867071152786, + "grad_norm": 1.219404794028378e-06, + "learning_rate": 1.964964811991138e-06, + "loss": 0.0, + "num_input_tokens_seen": 19565448, + "step": 32115 + }, + { + "epoch": 8.858246001103144, + "grad_norm": 2.529066023271298e-06, + "learning_rate": 1.9602911413305835e-06, + "loss": 0.0, + "num_input_tokens_seen": 19568776, + "step": 32120 + }, + { + "epoch": 8.859624931053503, + "grad_norm": 4.350900439931138e-07, + "learning_rate": 1.955622808639851e-06, + "loss": 0.0, + "num_input_tokens_seen": 19571752, + "step": 32125 + }, + { + "epoch": 8.86100386100386, + "grad_norm": 6.160980206004751e-07, + "learning_rate": 1.9509598150005278e-06, + "loss": 0.0, + "num_input_tokens_seen": 19575304, + "step": 32130 + }, + { + "epoch": 8.86238279095422, + "grad_norm": 9.035264270096377e-07, + "learning_rate": 1.946302161492952e-06, + "loss": 0.0, + "num_input_tokens_seen": 19578312, + "step": 32135 + }, + { + "epoch": 8.863761720904577, + "grad_norm": 1.2330563095019897e-06, + "learning_rate": 1.9416498491962474e-06, + "loss": 0.0, + "num_input_tokens_seen": 19582280, + "step": 32140 + }, + { + "epoch": 8.865140650854936, + "grad_norm": 7.997172701834643e-07, + "learning_rate": 1.937002879188285e-06, + "loss": 0.0, + "num_input_tokens_seen": 19584904, + "step": 32145 + }, + { + "epoch": 8.866519580805296, + "grad_norm": 2.3965792479430092e-06, + "learning_rate": 1.9323612525456946e-06, + "loss": 0.0, + "num_input_tokens_seen": 19587368, + "step": 32150 + }, + { + "epoch": 8.867898510755653, + "grad_norm": 1.962045644177124e-06, + "learning_rate": 1.927724970343886e-06, + "loss": 0.0, + "num_input_tokens_seen": 19590152, + "step": 32155 + }, + { + "epoch": 8.869277440706012, + "grad_norm": 9.246259651263244e-07, + "learning_rate": 1.923094033657011e-06, + "loss": 0.0, + "num_input_tokens_seen": 19595944, + "step": 32160 + }, + { + "epoch": 8.87065637065637, + "grad_norm": 2.107461568812141e-06, + "learning_rate": 1.918468443557989e-06, + "loss": 0.0, + "num_input_tokens_seen": 19598600, + "step": 32165 + }, + { + "epoch": 8.87203530060673, + "grad_norm": 0.00014100957196205854, + "learning_rate": 1.9138482011185093e-06, + "loss": 0.0, + "num_input_tokens_seen": 19601928, + "step": 32170 + }, + { + "epoch": 8.873414230557088, + "grad_norm": 1.571232360220165e-06, + "learning_rate": 1.909233307409011e-06, + "loss": 0.0, + "num_input_tokens_seen": 19605608, + "step": 32175 + }, + { + "epoch": 8.874793160507446, + "grad_norm": 7.550000873379759e-07, + "learning_rate": 1.9046237634987057e-06, + "loss": 0.0, + "num_input_tokens_seen": 19608808, + "step": 32180 + }, + { + "epoch": 8.876172090457805, + "grad_norm": 5.45708246590948e-07, + "learning_rate": 1.9000195704555524e-06, + "loss": 0.0, + "num_input_tokens_seen": 19611720, + "step": 32185 + }, + { + "epoch": 8.877551020408163, + "grad_norm": 1.4426218513108324e-06, + "learning_rate": 1.8954207293462728e-06, + "loss": 0.0, + "num_input_tokens_seen": 19614216, + "step": 32190 + }, + { + "epoch": 8.878929950358522, + "grad_norm": 6.053188030819001e-07, + "learning_rate": 1.890827241236362e-06, + "loss": 0.0, + "num_input_tokens_seen": 19617608, + "step": 32195 + }, + { + "epoch": 8.880308880308881, + "grad_norm": 9.085579222301021e-07, + "learning_rate": 1.8862391071900576e-06, + "loss": 0.0, + "num_input_tokens_seen": 19620040, + "step": 32200 + }, + { + "epoch": 8.881687810259239, + "grad_norm": 5.194324330659583e-07, + "learning_rate": 1.8816563282703703e-06, + "loss": 0.0, + "num_input_tokens_seen": 19624072, + "step": 32205 + }, + { + "epoch": 8.883066740209598, + "grad_norm": 2.7647888600768056e-06, + "learning_rate": 1.8770789055390564e-06, + "loss": 0.0, + "num_input_tokens_seen": 19627176, + "step": 32210 + }, + { + "epoch": 8.884445670159955, + "grad_norm": 4.7668538627476664e-07, + "learning_rate": 1.8725068400566454e-06, + "loss": 0.0, + "num_input_tokens_seen": 19630216, + "step": 32215 + }, + { + "epoch": 8.885824600110315, + "grad_norm": 9.275871093450405e-07, + "learning_rate": 1.8679401328824203e-06, + "loss": 0.0, + "num_input_tokens_seen": 19633768, + "step": 32220 + }, + { + "epoch": 8.887203530060672, + "grad_norm": 8.093717838164594e-07, + "learning_rate": 1.8633787850744212e-06, + "loss": 0.0, + "num_input_tokens_seen": 19636296, + "step": 32225 + }, + { + "epoch": 8.888582460011031, + "grad_norm": 6.646071710747492e-07, + "learning_rate": 1.8588227976894413e-06, + "loss": 0.0, + "num_input_tokens_seen": 19639048, + "step": 32230 + }, + { + "epoch": 8.88996138996139, + "grad_norm": 1.2296956811042037e-05, + "learning_rate": 1.8542721717830419e-06, + "loss": 0.0, + "num_input_tokens_seen": 19642344, + "step": 32235 + }, + { + "epoch": 8.891340319911748, + "grad_norm": 7.465784506166528e-07, + "learning_rate": 1.8497269084095381e-06, + "loss": 0.0, + "num_input_tokens_seen": 19645448, + "step": 32240 + }, + { + "epoch": 8.892719249862107, + "grad_norm": 1.8334640117245726e-06, + "learning_rate": 1.8451870086219985e-06, + "loss": 0.0, + "num_input_tokens_seen": 19648232, + "step": 32245 + }, + { + "epoch": 8.894098179812465, + "grad_norm": 6.372409870891715e-07, + "learning_rate": 1.840652473472257e-06, + "loss": 0.0, + "num_input_tokens_seen": 19650312, + "step": 32250 + }, + { + "epoch": 8.895477109762824, + "grad_norm": 1.0810935009430978e-06, + "learning_rate": 1.8361233040109004e-06, + "loss": 0.0, + "num_input_tokens_seen": 19652872, + "step": 32255 + }, + { + "epoch": 8.896856039713182, + "grad_norm": 6.610231366721564e-07, + "learning_rate": 1.831599501287265e-06, + "loss": 0.0, + "num_input_tokens_seen": 19655944, + "step": 32260 + }, + { + "epoch": 8.898234969663541, + "grad_norm": 8.271805427284562e-07, + "learning_rate": 1.827081066349459e-06, + "loss": 0.0, + "num_input_tokens_seen": 19658248, + "step": 32265 + }, + { + "epoch": 8.8996138996139, + "grad_norm": 2.789936843328178e-06, + "learning_rate": 1.8225680002443346e-06, + "loss": 0.0, + "num_input_tokens_seen": 19660616, + "step": 32270 + }, + { + "epoch": 8.900992829564258, + "grad_norm": 8.086251455097226e-07, + "learning_rate": 1.8180603040175075e-06, + "loss": 0.0, + "num_input_tokens_seen": 19663720, + "step": 32275 + }, + { + "epoch": 8.902371759514617, + "grad_norm": 3.651936367532471e-06, + "learning_rate": 1.8135579787133427e-06, + "loss": 0.0, + "num_input_tokens_seen": 19666184, + "step": 32280 + }, + { + "epoch": 8.903750689464974, + "grad_norm": 6.862060786261281e-07, + "learning_rate": 1.809061025374964e-06, + "loss": 0.0, + "num_input_tokens_seen": 19669384, + "step": 32285 + }, + { + "epoch": 8.905129619415334, + "grad_norm": 5.724775746784871e-07, + "learning_rate": 1.80456944504426e-06, + "loss": 0.0, + "num_input_tokens_seen": 19673096, + "step": 32290 + }, + { + "epoch": 8.906508549365693, + "grad_norm": 7.458583581865241e-07, + "learning_rate": 1.8000832387618571e-06, + "loss": 0.0, + "num_input_tokens_seen": 19676296, + "step": 32295 + }, + { + "epoch": 8.90788747931605, + "grad_norm": 5.654292749568413e-07, + "learning_rate": 1.7956024075671402e-06, + "loss": 0.0, + "num_input_tokens_seen": 19679752, + "step": 32300 + }, + { + "epoch": 8.90926640926641, + "grad_norm": 1.0874435929508763e-06, + "learning_rate": 1.7911269524982676e-06, + "loss": 0.0, + "num_input_tokens_seen": 19682888, + "step": 32305 + }, + { + "epoch": 8.910645339216767, + "grad_norm": 5.352844164008275e-05, + "learning_rate": 1.7866568745921241e-06, + "loss": 0.0, + "num_input_tokens_seen": 19687080, + "step": 32310 + }, + { + "epoch": 8.912024269167127, + "grad_norm": 5.432999046206533e-07, + "learning_rate": 1.7821921748843728e-06, + "loss": 0.0, + "num_input_tokens_seen": 19689960, + "step": 32315 + }, + { + "epoch": 8.913403199117484, + "grad_norm": 1.1887914297403768e-06, + "learning_rate": 1.7777328544094169e-06, + "loss": 0.0, + "num_input_tokens_seen": 19693128, + "step": 32320 + }, + { + "epoch": 8.914782129067843, + "grad_norm": 4.0447534388476925e-07, + "learning_rate": 1.773278914200413e-06, + "loss": 0.0, + "num_input_tokens_seen": 19696872, + "step": 32325 + }, + { + "epoch": 8.916161059018203, + "grad_norm": 1.6888711797946598e-06, + "learning_rate": 1.7688303552892804e-06, + "loss": 0.0, + "num_input_tokens_seen": 19699944, + "step": 32330 + }, + { + "epoch": 8.91753998896856, + "grad_norm": 4.300451905692171e-07, + "learning_rate": 1.7643871787066862e-06, + "loss": 0.0, + "num_input_tokens_seen": 19703080, + "step": 32335 + }, + { + "epoch": 8.91891891891892, + "grad_norm": 6.782049126741185e-07, + "learning_rate": 1.759949385482046e-06, + "loss": 0.0, + "num_input_tokens_seen": 19705384, + "step": 32340 + }, + { + "epoch": 8.920297848869277, + "grad_norm": 7.96017843640584e-07, + "learning_rate": 1.7555169766435343e-06, + "loss": 0.0, + "num_input_tokens_seen": 19708520, + "step": 32345 + }, + { + "epoch": 8.921676778819636, + "grad_norm": 8.141352623169951e-07, + "learning_rate": 1.7510899532180825e-06, + "loss": 0.0, + "num_input_tokens_seen": 19711176, + "step": 32350 + }, + { + "epoch": 8.923055708769994, + "grad_norm": 9.625628081266768e-07, + "learning_rate": 1.746668316231359e-06, + "loss": 0.0, + "num_input_tokens_seen": 19714888, + "step": 32355 + }, + { + "epoch": 8.924434638720353, + "grad_norm": 4.93458287564863e-07, + "learning_rate": 1.742252066707803e-06, + "loss": 0.0, + "num_input_tokens_seen": 19717960, + "step": 32360 + }, + { + "epoch": 8.925813568670712, + "grad_norm": 4.232871106069069e-06, + "learning_rate": 1.7378412056705873e-06, + "loss": 0.0, + "num_input_tokens_seen": 19720776, + "step": 32365 + }, + { + "epoch": 8.92719249862107, + "grad_norm": 4.22118546339334e-07, + "learning_rate": 1.733435734141653e-06, + "loss": 0.0, + "num_input_tokens_seen": 19723176, + "step": 32370 + }, + { + "epoch": 8.928571428571429, + "grad_norm": 5.820343744744605e-07, + "learning_rate": 1.7290356531416836e-06, + "loss": 0.0, + "num_input_tokens_seen": 19726408, + "step": 32375 + }, + { + "epoch": 8.929950358521786, + "grad_norm": 3.471220770734362e-05, + "learning_rate": 1.724640963690105e-06, + "loss": 0.0, + "num_input_tokens_seen": 19729448, + "step": 32380 + }, + { + "epoch": 8.931329288472146, + "grad_norm": 1.1602166978263995e-06, + "learning_rate": 1.7202516668051172e-06, + "loss": 0.0, + "num_input_tokens_seen": 19733096, + "step": 32385 + }, + { + "epoch": 8.932708218422505, + "grad_norm": 1.3277564221425564e-06, + "learning_rate": 1.715867763503648e-06, + "loss": 0.0, + "num_input_tokens_seen": 19735848, + "step": 32390 + }, + { + "epoch": 8.934087148372862, + "grad_norm": 4.2306731984353974e-07, + "learning_rate": 1.7114892548013879e-06, + "loss": 0.0, + "num_input_tokens_seen": 19739048, + "step": 32395 + }, + { + "epoch": 8.935466078323222, + "grad_norm": 3.255407818869571e-06, + "learning_rate": 1.7071161417127752e-06, + "loss": 0.0, + "num_input_tokens_seen": 19742216, + "step": 32400 + }, + { + "epoch": 8.936845008273579, + "grad_norm": 4.4709062763104157e-07, + "learning_rate": 1.7027484252509939e-06, + "loss": 0.0, + "num_input_tokens_seen": 19745192, + "step": 32405 + }, + { + "epoch": 8.938223938223938, + "grad_norm": 6.157562211228651e-07, + "learning_rate": 1.6983861064279877e-06, + "loss": 0.0, + "num_input_tokens_seen": 19748648, + "step": 32410 + }, + { + "epoch": 8.939602868174298, + "grad_norm": 1.3746430340688676e-06, + "learning_rate": 1.6940291862544339e-06, + "loss": 0.0, + "num_input_tokens_seen": 19751624, + "step": 32415 + }, + { + "epoch": 8.940981798124655, + "grad_norm": 0.00016904644144233316, + "learning_rate": 1.6896776657397807e-06, + "loss": 0.0, + "num_input_tokens_seen": 19758952, + "step": 32420 + }, + { + "epoch": 8.942360728075014, + "grad_norm": 6.120508828644233e-07, + "learning_rate": 1.6853315458921992e-06, + "loss": 0.0, + "num_input_tokens_seen": 19763304, + "step": 32425 + }, + { + "epoch": 8.943739658025372, + "grad_norm": 1.2790617347491207e-06, + "learning_rate": 1.680990827718637e-06, + "loss": 0.0, + "num_input_tokens_seen": 19766760, + "step": 32430 + }, + { + "epoch": 8.945118587975731, + "grad_norm": 3.9288850530283526e-05, + "learning_rate": 1.6766555122247618e-06, + "loss": 0.0, + "num_input_tokens_seen": 19769512, + "step": 32435 + }, + { + "epoch": 8.946497517926089, + "grad_norm": 1.4521048115057056e-06, + "learning_rate": 1.6723256004150173e-06, + "loss": 0.0, + "num_input_tokens_seen": 19771432, + "step": 32440 + }, + { + "epoch": 8.947876447876448, + "grad_norm": 1.8308282960788347e-06, + "learning_rate": 1.668001093292576e-06, + "loss": 0.0, + "num_input_tokens_seen": 19774472, + "step": 32445 + }, + { + "epoch": 8.949255377826807, + "grad_norm": 6.557042979693506e-07, + "learning_rate": 1.6636819918593616e-06, + "loss": 0.0, + "num_input_tokens_seen": 19777960, + "step": 32450 + }, + { + "epoch": 8.950634307777165, + "grad_norm": 1.318432055086305e-06, + "learning_rate": 1.6593682971160512e-06, + "loss": 0.0, + "num_input_tokens_seen": 19781352, + "step": 32455 + }, + { + "epoch": 8.952013237727524, + "grad_norm": 9.629876558392425e-07, + "learning_rate": 1.6550600100620621e-06, + "loss": 0.0, + "num_input_tokens_seen": 19784168, + "step": 32460 + }, + { + "epoch": 8.953392167677881, + "grad_norm": 1.061712055161479e-06, + "learning_rate": 1.6507571316955711e-06, + "loss": 0.0, + "num_input_tokens_seen": 19786568, + "step": 32465 + }, + { + "epoch": 8.95477109762824, + "grad_norm": 3.3104652175097726e-07, + "learning_rate": 1.646459663013486e-06, + "loss": 0.0, + "num_input_tokens_seen": 19791944, + "step": 32470 + }, + { + "epoch": 8.956150027578598, + "grad_norm": 7.491674409720872e-07, + "learning_rate": 1.6421676050114687e-06, + "loss": 0.0, + "num_input_tokens_seen": 19795080, + "step": 32475 + }, + { + "epoch": 8.957528957528957, + "grad_norm": 9.381201380165294e-07, + "learning_rate": 1.6378809586839323e-06, + "loss": 0.0, + "num_input_tokens_seen": 19798760, + "step": 32480 + }, + { + "epoch": 8.958907887479317, + "grad_norm": 1.3484491319104563e-06, + "learning_rate": 1.6335997250240236e-06, + "loss": 0.0, + "num_input_tokens_seen": 19802376, + "step": 32485 + }, + { + "epoch": 8.960286817429674, + "grad_norm": 4.798330905941839e-07, + "learning_rate": 1.629323905023647e-06, + "loss": 0.0, + "num_input_tokens_seen": 19805224, + "step": 32490 + }, + { + "epoch": 8.961665747380033, + "grad_norm": 5.422074309535674e-07, + "learning_rate": 1.6250534996734512e-06, + "loss": 0.0, + "num_input_tokens_seen": 19807176, + "step": 32495 + }, + { + "epoch": 8.963044677330391, + "grad_norm": 8.084251703621703e-07, + "learning_rate": 1.6207885099628257e-06, + "loss": 0.0, + "num_input_tokens_seen": 19810088, + "step": 32500 + }, + { + "epoch": 8.96442360728075, + "grad_norm": 5.938937874816475e-07, + "learning_rate": 1.6165289368799101e-06, + "loss": 0.0, + "num_input_tokens_seen": 19813160, + "step": 32505 + }, + { + "epoch": 8.96580253723111, + "grad_norm": 5.376616059038497e-07, + "learning_rate": 1.6122747814115823e-06, + "loss": 0.0, + "num_input_tokens_seen": 19818792, + "step": 32510 + }, + { + "epoch": 8.967181467181467, + "grad_norm": 4.645623903343221e-07, + "learning_rate": 1.6080260445434675e-06, + "loss": 0.0, + "num_input_tokens_seen": 19821320, + "step": 32515 + }, + { + "epoch": 8.968560397131826, + "grad_norm": 0.00016884681826923043, + "learning_rate": 1.603782727259942e-06, + "loss": 0.0, + "num_input_tokens_seen": 19823688, + "step": 32520 + }, + { + "epoch": 8.969939327082184, + "grad_norm": 5.803780140922754e-07, + "learning_rate": 1.5995448305441224e-06, + "loss": 0.0, + "num_input_tokens_seen": 19826664, + "step": 32525 + }, + { + "epoch": 8.971318257032543, + "grad_norm": 9.154275630862685e-07, + "learning_rate": 1.5953123553778593e-06, + "loss": 0.0, + "num_input_tokens_seen": 19830376, + "step": 32530 + }, + { + "epoch": 8.972697186982902, + "grad_norm": 1.0659149438652094e-06, + "learning_rate": 1.5910853027417683e-06, + "loss": 0.0, + "num_input_tokens_seen": 19832904, + "step": 32535 + }, + { + "epoch": 8.97407611693326, + "grad_norm": 9.05009528651135e-06, + "learning_rate": 1.586863673615191e-06, + "loss": 0.0, + "num_input_tokens_seen": 19835400, + "step": 32540 + }, + { + "epoch": 8.975455046883619, + "grad_norm": 4.276823801774299e-06, + "learning_rate": 1.5826474689762145e-06, + "loss": 0.0, + "num_input_tokens_seen": 19837832, + "step": 32545 + }, + { + "epoch": 8.976833976833976, + "grad_norm": 7.063686666697322e-07, + "learning_rate": 1.5784366898016794e-06, + "loss": 0.0, + "num_input_tokens_seen": 19841576, + "step": 32550 + }, + { + "epoch": 8.978212906784336, + "grad_norm": 1.4341757150759804e-06, + "learning_rate": 1.5742313370671585e-06, + "loss": 0.0, + "num_input_tokens_seen": 19844616, + "step": 32555 + }, + { + "epoch": 8.979591836734693, + "grad_norm": 1.3192832284403266e-06, + "learning_rate": 1.5700314117469722e-06, + "loss": 0.0, + "num_input_tokens_seen": 19848808, + "step": 32560 + }, + { + "epoch": 8.980970766685052, + "grad_norm": 3.492281450689916e-07, + "learning_rate": 1.5658369148141894e-06, + "loss": 0.0, + "num_input_tokens_seen": 19852584, + "step": 32565 + }, + { + "epoch": 8.982349696635412, + "grad_norm": 1.9737961338250898e-05, + "learning_rate": 1.5616478472406048e-06, + "loss": 0.0, + "num_input_tokens_seen": 19855400, + "step": 32570 + }, + { + "epoch": 8.98372862658577, + "grad_norm": 5.265013669486507e-07, + "learning_rate": 1.557464209996773e-06, + "loss": 0.0, + "num_input_tokens_seen": 19861000, + "step": 32575 + }, + { + "epoch": 8.985107556536128, + "grad_norm": 1.7525370594739798e-06, + "learning_rate": 1.5532860040519793e-06, + "loss": 0.0, + "num_input_tokens_seen": 19864104, + "step": 32580 + }, + { + "epoch": 8.986486486486486, + "grad_norm": 5.308455115482502e-07, + "learning_rate": 1.5491132303742522e-06, + "loss": 0.0, + "num_input_tokens_seen": 19866888, + "step": 32585 + }, + { + "epoch": 8.987865416436845, + "grad_norm": 5.0311570021222e-07, + "learning_rate": 1.5449458899303686e-06, + "loss": 0.0, + "num_input_tokens_seen": 19869160, + "step": 32590 + }, + { + "epoch": 8.989244346387203, + "grad_norm": 1.1460214182079653e-06, + "learning_rate": 1.5407839836858368e-06, + "loss": 0.0, + "num_input_tokens_seen": 19872456, + "step": 32595 + }, + { + "epoch": 8.990623276337562, + "grad_norm": 3.9641793136979686e-07, + "learning_rate": 1.5366275126049157e-06, + "loss": 0.0, + "num_input_tokens_seen": 19874696, + "step": 32600 + }, + { + "epoch": 8.992002206287921, + "grad_norm": 6.522845524159493e-06, + "learning_rate": 1.5324764776505934e-06, + "loss": 0.0, + "num_input_tokens_seen": 19878728, + "step": 32605 + }, + { + "epoch": 8.993381136238279, + "grad_norm": 1.8111187500835513e-06, + "learning_rate": 1.528330879784609e-06, + "loss": 0.0, + "num_input_tokens_seen": 19881032, + "step": 32610 + }, + { + "epoch": 8.994760066188638, + "grad_norm": 0.00011069014726672322, + "learning_rate": 1.5241907199674387e-06, + "loss": 0.0, + "num_input_tokens_seen": 19883656, + "step": 32615 + }, + { + "epoch": 8.996138996138995, + "grad_norm": 4.0216684737970354e-07, + "learning_rate": 1.5200559991582959e-06, + "loss": 0.0, + "num_input_tokens_seen": 19886248, + "step": 32620 + }, + { + "epoch": 8.997517926089355, + "grad_norm": 1.3331276704775519e-06, + "learning_rate": 1.5159267183151387e-06, + "loss": 0.0, + "num_input_tokens_seen": 19890024, + "step": 32625 + }, + { + "epoch": 8.998896856039714, + "grad_norm": 1.1854607464556466e-06, + "learning_rate": 1.5118028783946552e-06, + "loss": 0.0, + "num_input_tokens_seen": 19892680, + "step": 32630 + }, + { + "epoch": 9.0, + "eval_loss": 0.38864049315452576, + "eval_runtime": 28.4877, + "eval_samples_per_second": 56.586, + "eval_steps_per_second": 14.146, + "num_input_tokens_seen": 19894712, + "step": 32634 + }, + { + "epoch": 9.000275785990071, + "grad_norm": 1.099319888453465e-06, + "learning_rate": 1.5076844803522922e-06, + "loss": 0.0, + "num_input_tokens_seen": 19896120, + "step": 32635 + }, + { + "epoch": 9.00165471594043, + "grad_norm": 3.256708168919431e-06, + "learning_rate": 1.5035715251422112e-06, + "loss": 0.0, + "num_input_tokens_seen": 19898488, + "step": 32640 + }, + { + "epoch": 9.003033645890788, + "grad_norm": 4.572639682010049e-06, + "learning_rate": 1.4994640137173332e-06, + "loss": 0.0, + "num_input_tokens_seen": 19901496, + "step": 32645 + }, + { + "epoch": 9.004412575841148, + "grad_norm": 8.082458862190833e-07, + "learning_rate": 1.4953619470293057e-06, + "loss": 0.0, + "num_input_tokens_seen": 19904792, + "step": 32650 + }, + { + "epoch": 9.005791505791505, + "grad_norm": 6.90643616962916e-07, + "learning_rate": 1.491265326028521e-06, + "loss": 0.0, + "num_input_tokens_seen": 19907608, + "step": 32655 + }, + { + "epoch": 9.007170435741864, + "grad_norm": 7.678669135202654e-06, + "learning_rate": 1.4871741516641059e-06, + "loss": 0.0, + "num_input_tokens_seen": 19911800, + "step": 32660 + }, + { + "epoch": 9.008549365692224, + "grad_norm": 6.441527489187138e-07, + "learning_rate": 1.4830884248839244e-06, + "loss": 0.0, + "num_input_tokens_seen": 19914488, + "step": 32665 + }, + { + "epoch": 9.009928295642581, + "grad_norm": 5.294565426083864e-07, + "learning_rate": 1.4790081466345861e-06, + "loss": 0.0, + "num_input_tokens_seen": 19916984, + "step": 32670 + }, + { + "epoch": 9.01130722559294, + "grad_norm": 1.5147861631703563e-06, + "learning_rate": 1.4749333178614322e-06, + "loss": 0.0, + "num_input_tokens_seen": 19920280, + "step": 32675 + }, + { + "epoch": 9.012686155543298, + "grad_norm": 6.924239528416365e-07, + "learning_rate": 1.4708639395085322e-06, + "loss": 0.0, + "num_input_tokens_seen": 19923576, + "step": 32680 + }, + { + "epoch": 9.014065085493657, + "grad_norm": 8.296817100017506e-07, + "learning_rate": 1.466800012518718e-06, + "loss": 0.0, + "num_input_tokens_seen": 19925688, + "step": 32685 + }, + { + "epoch": 9.015444015444016, + "grad_norm": 1.407847094014869e-06, + "learning_rate": 1.4627415378335285e-06, + "loss": 0.0, + "num_input_tokens_seen": 19927960, + "step": 32690 + }, + { + "epoch": 9.016822945394374, + "grad_norm": 6.076245995245699e-07, + "learning_rate": 1.4586885163932668e-06, + "loss": 0.0, + "num_input_tokens_seen": 19931512, + "step": 32695 + }, + { + "epoch": 9.018201875344733, + "grad_norm": 3.102136588495341e-06, + "learning_rate": 1.4546409491369483e-06, + "loss": 0.0, + "num_input_tokens_seen": 19934776, + "step": 32700 + }, + { + "epoch": 9.01958080529509, + "grad_norm": 1.0743297025328502e-06, + "learning_rate": 1.4505988370023427e-06, + "loss": 0.0, + "num_input_tokens_seen": 19939128, + "step": 32705 + }, + { + "epoch": 9.02095973524545, + "grad_norm": 6.240731522666465e-07, + "learning_rate": 1.4465621809259532e-06, + "loss": 0.0, + "num_input_tokens_seen": 19941688, + "step": 32710 + }, + { + "epoch": 9.022338665195807, + "grad_norm": 8.32455771160312e-05, + "learning_rate": 1.44253098184301e-06, + "loss": 0.0, + "num_input_tokens_seen": 19944344, + "step": 32715 + }, + { + "epoch": 9.023717595146167, + "grad_norm": 1.626727339498757e-06, + "learning_rate": 1.4385052406874793e-06, + "loss": 0.0, + "num_input_tokens_seen": 19949528, + "step": 32720 + }, + { + "epoch": 9.025096525096526, + "grad_norm": 5.004508238926064e-07, + "learning_rate": 1.4344849583920793e-06, + "loss": 0.0, + "num_input_tokens_seen": 19952568, + "step": 32725 + }, + { + "epoch": 9.026475455046883, + "grad_norm": 3.808972905972041e-06, + "learning_rate": 1.4304701358882427e-06, + "loss": 0.0, + "num_input_tokens_seen": 19954968, + "step": 32730 + }, + { + "epoch": 9.027854384997243, + "grad_norm": 6.432826467062114e-07, + "learning_rate": 1.4264607741061474e-06, + "loss": 0.0, + "num_input_tokens_seen": 19958392, + "step": 32735 + }, + { + "epoch": 9.0292333149476, + "grad_norm": 5.02367527133174e-07, + "learning_rate": 1.4224568739747063e-06, + "loss": 0.0, + "num_input_tokens_seen": 19960824, + "step": 32740 + }, + { + "epoch": 9.03061224489796, + "grad_norm": 5.182453151064692e-07, + "learning_rate": 1.4184584364215658e-06, + "loss": 0.0, + "num_input_tokens_seen": 19963416, + "step": 32745 + }, + { + "epoch": 9.031991174848319, + "grad_norm": 3.4836389204428997e-06, + "learning_rate": 1.4144654623731075e-06, + "loss": 0.0, + "num_input_tokens_seen": 19966456, + "step": 32750 + }, + { + "epoch": 9.033370104798676, + "grad_norm": 1.1381454214642872e-06, + "learning_rate": 1.4104779527544437e-06, + "loss": 0.0, + "num_input_tokens_seen": 19970584, + "step": 32755 + }, + { + "epoch": 9.034749034749035, + "grad_norm": 1.743099687701033e-06, + "learning_rate": 1.406495908489422e-06, + "loss": 0.0, + "num_input_tokens_seen": 19973464, + "step": 32760 + }, + { + "epoch": 9.036127964699393, + "grad_norm": 1.3659049500347464e-06, + "learning_rate": 1.402519330500629e-06, + "loss": 0.0, + "num_input_tokens_seen": 19976088, + "step": 32765 + }, + { + "epoch": 9.037506894649752, + "grad_norm": 2.0596012291207444e-06, + "learning_rate": 1.3985482197093807e-06, + "loss": 0.0, + "num_input_tokens_seen": 19978936, + "step": 32770 + }, + { + "epoch": 9.03888582460011, + "grad_norm": 1.4153035863273544e-06, + "learning_rate": 1.3945825770357219e-06, + "loss": 0.0, + "num_input_tokens_seen": 19981656, + "step": 32775 + }, + { + "epoch": 9.040264754550469, + "grad_norm": 5.465374215418706e-07, + "learning_rate": 1.3906224033984394e-06, + "loss": 0.0, + "num_input_tokens_seen": 19984536, + "step": 32780 + }, + { + "epoch": 9.041643684500828, + "grad_norm": 5.00600037867116e-07, + "learning_rate": 1.3866676997150496e-06, + "loss": 0.0, + "num_input_tokens_seen": 19988504, + "step": 32785 + }, + { + "epoch": 9.043022614451186, + "grad_norm": 4.0723421989241615e-05, + "learning_rate": 1.3827184669017945e-06, + "loss": 0.0, + "num_input_tokens_seen": 19991640, + "step": 32790 + }, + { + "epoch": 9.044401544401545, + "grad_norm": 8.685129841978778e-07, + "learning_rate": 1.3787747058736616e-06, + "loss": 0.0, + "num_input_tokens_seen": 19994520, + "step": 32795 + }, + { + "epoch": 9.045780474351902, + "grad_norm": 4.2812753235921264e-05, + "learning_rate": 1.3748364175443561e-06, + "loss": 0.0, + "num_input_tokens_seen": 19997496, + "step": 32800 + }, + { + "epoch": 9.047159404302262, + "grad_norm": 5.017928401684912e-07, + "learning_rate": 1.370903602826329e-06, + "loss": 0.0, + "num_input_tokens_seen": 20001592, + "step": 32805 + }, + { + "epoch": 9.048538334252619, + "grad_norm": 5.321167009242345e-07, + "learning_rate": 1.3669762626307565e-06, + "loss": 0.0, + "num_input_tokens_seen": 20004920, + "step": 32810 + }, + { + "epoch": 9.049917264202978, + "grad_norm": 1.4863959449940012e-06, + "learning_rate": 1.3630543978675443e-06, + "loss": 0.0, + "num_input_tokens_seen": 20008152, + "step": 32815 + }, + { + "epoch": 9.051296194153338, + "grad_norm": 6.918558028701227e-07, + "learning_rate": 1.359138009445335e-06, + "loss": 0.0, + "num_input_tokens_seen": 20011416, + "step": 32820 + }, + { + "epoch": 9.052675124103695, + "grad_norm": 4.886491069555632e-07, + "learning_rate": 1.3552270982714971e-06, + "loss": 0.0, + "num_input_tokens_seen": 20013944, + "step": 32825 + }, + { + "epoch": 9.054054054054054, + "grad_norm": 4.919846219308965e-07, + "learning_rate": 1.351321665252131e-06, + "loss": 0.0, + "num_input_tokens_seen": 20017176, + "step": 32830 + }, + { + "epoch": 9.055432984004412, + "grad_norm": 1.885397750811535e-06, + "learning_rate": 1.3474217112920712e-06, + "loss": 0.0, + "num_input_tokens_seen": 20020696, + "step": 32835 + }, + { + "epoch": 9.056811913954771, + "grad_norm": 5.659297812599107e-07, + "learning_rate": 1.3435272372948837e-06, + "loss": 0.0, + "num_input_tokens_seen": 20024216, + "step": 32840 + }, + { + "epoch": 9.05819084390513, + "grad_norm": 2.280378566865693e-06, + "learning_rate": 1.3396382441628636e-06, + "loss": 0.0, + "num_input_tokens_seen": 20026904, + "step": 32845 + }, + { + "epoch": 9.059569773855488, + "grad_norm": 1.3802110743199592e-06, + "learning_rate": 1.3357547327970289e-06, + "loss": 0.0, + "num_input_tokens_seen": 20029336, + "step": 32850 + }, + { + "epoch": 9.060948703805847, + "grad_norm": 9.718070259623346e-07, + "learning_rate": 1.3318767040971347e-06, + "loss": 0.0, + "num_input_tokens_seen": 20031768, + "step": 32855 + }, + { + "epoch": 9.062327633756205, + "grad_norm": 8.808164011497865e-07, + "learning_rate": 1.3280041589616681e-06, + "loss": 0.0, + "num_input_tokens_seen": 20034264, + "step": 32860 + }, + { + "epoch": 9.063706563706564, + "grad_norm": 2.7195872007723665e-06, + "learning_rate": 1.324137098287842e-06, + "loss": 0.0, + "num_input_tokens_seen": 20038104, + "step": 32865 + }, + { + "epoch": 9.065085493656921, + "grad_norm": 1.0110995390277822e-05, + "learning_rate": 1.3202755229715923e-06, + "loss": 0.0, + "num_input_tokens_seen": 20041016, + "step": 32870 + }, + { + "epoch": 9.06646442360728, + "grad_norm": 6.067065783099679e-07, + "learning_rate": 1.316419433907598e-06, + "loss": 0.0, + "num_input_tokens_seen": 20043544, + "step": 32875 + }, + { + "epoch": 9.06784335355764, + "grad_norm": 4.373980857508286e-07, + "learning_rate": 1.312568831989258e-06, + "loss": 0.0, + "num_input_tokens_seen": 20046424, + "step": 32880 + }, + { + "epoch": 9.069222283507997, + "grad_norm": 2.214167579950299e-06, + "learning_rate": 1.3087237181086982e-06, + "loss": 0.0, + "num_input_tokens_seen": 20049688, + "step": 32885 + }, + { + "epoch": 9.070601213458357, + "grad_norm": 2.9888265089539345e-06, + "learning_rate": 1.30488409315678e-06, + "loss": 0.0, + "num_input_tokens_seen": 20052056, + "step": 32890 + }, + { + "epoch": 9.071980143408714, + "grad_norm": 7.055246555864869e-07, + "learning_rate": 1.3010499580230845e-06, + "loss": 0.0, + "num_input_tokens_seen": 20055224, + "step": 32895 + }, + { + "epoch": 9.073359073359073, + "grad_norm": 2.939055320894113e-06, + "learning_rate": 1.297221313595931e-06, + "loss": 0.0, + "num_input_tokens_seen": 20058744, + "step": 32900 + }, + { + "epoch": 9.074738003309433, + "grad_norm": 4.60968692550523e-07, + "learning_rate": 1.2933981607623575e-06, + "loss": 0.0, + "num_input_tokens_seen": 20061336, + "step": 32905 + }, + { + "epoch": 9.07611693325979, + "grad_norm": 4.4243239472052664e-07, + "learning_rate": 1.2895805004081357e-06, + "loss": 0.0, + "num_input_tokens_seen": 20063768, + "step": 32910 + }, + { + "epoch": 9.07749586321015, + "grad_norm": 7.920077678136295e-07, + "learning_rate": 1.2857683334177666e-06, + "loss": 0.0, + "num_input_tokens_seen": 20067704, + "step": 32915 + }, + { + "epoch": 9.078874793160507, + "grad_norm": 4.34397350090876e-07, + "learning_rate": 1.2819616606744688e-06, + "loss": 0.0, + "num_input_tokens_seen": 20070200, + "step": 32920 + }, + { + "epoch": 9.080253723110866, + "grad_norm": 1.0550223805694259e-06, + "learning_rate": 1.2781604830601923e-06, + "loss": 0.0, + "num_input_tokens_seen": 20072888, + "step": 32925 + }, + { + "epoch": 9.081632653061224, + "grad_norm": 5.583397069131024e-07, + "learning_rate": 1.2743648014556242e-06, + "loss": 0.0, + "num_input_tokens_seen": 20075864, + "step": 32930 + }, + { + "epoch": 9.083011583011583, + "grad_norm": 1.276684179174481e-05, + "learning_rate": 1.2705746167401583e-06, + "loss": 0.0, + "num_input_tokens_seen": 20079032, + "step": 32935 + }, + { + "epoch": 9.084390512961942, + "grad_norm": 1.8272571651323233e-06, + "learning_rate": 1.2667899297919367e-06, + "loss": 0.0, + "num_input_tokens_seen": 20082296, + "step": 32940 + }, + { + "epoch": 9.0857694429123, + "grad_norm": 1.1706359828167479e-06, + "learning_rate": 1.2630107414878106e-06, + "loss": 0.0, + "num_input_tokens_seen": 20085944, + "step": 32945 + }, + { + "epoch": 9.087148372862659, + "grad_norm": 4.1236779679820756e-07, + "learning_rate": 1.2592370527033631e-06, + "loss": 0.0, + "num_input_tokens_seen": 20089048, + "step": 32950 + }, + { + "epoch": 9.088527302813016, + "grad_norm": 8.905363415578904e-07, + "learning_rate": 1.2554688643129058e-06, + "loss": 0.0, + "num_input_tokens_seen": 20092248, + "step": 32955 + }, + { + "epoch": 9.089906232763376, + "grad_norm": 4.4893224071529403e-07, + "learning_rate": 1.2517061771894767e-06, + "loss": 0.0, + "num_input_tokens_seen": 20095448, + "step": 32960 + }, + { + "epoch": 9.091285162713735, + "grad_norm": 6.70507972699852e-07, + "learning_rate": 1.2479489922048282e-06, + "loss": 0.0, + "num_input_tokens_seen": 20098136, + "step": 32965 + }, + { + "epoch": 9.092664092664092, + "grad_norm": 5.397106406235253e-07, + "learning_rate": 1.244197310229453e-06, + "loss": 0.0, + "num_input_tokens_seen": 20101432, + "step": 32970 + }, + { + "epoch": 9.094043022614452, + "grad_norm": 8.292058168990479e-07, + "learning_rate": 1.2404511321325613e-06, + "loss": 0.0, + "num_input_tokens_seen": 20104728, + "step": 32975 + }, + { + "epoch": 9.09542195256481, + "grad_norm": 4.2410516698510037e-07, + "learning_rate": 1.2367104587820867e-06, + "loss": 0.0, + "num_input_tokens_seen": 20107256, + "step": 32980 + }, + { + "epoch": 9.096800882515168, + "grad_norm": 1.2813100056519033e-06, + "learning_rate": 1.2329752910446913e-06, + "loss": 0.0, + "num_input_tokens_seen": 20109944, + "step": 32985 + }, + { + "epoch": 9.098179812465526, + "grad_norm": 8.143579179886729e-07, + "learning_rate": 1.229245629785758e-06, + "loss": 0.0, + "num_input_tokens_seen": 20112760, + "step": 32990 + }, + { + "epoch": 9.099558742415885, + "grad_norm": 2.2182726752362214e-05, + "learning_rate": 1.2255214758693985e-06, + "loss": 0.0, + "num_input_tokens_seen": 20115352, + "step": 32995 + }, + { + "epoch": 9.100937672366245, + "grad_norm": 4.814079943571414e-07, + "learning_rate": 1.2218028301584472e-06, + "loss": 0.0, + "num_input_tokens_seen": 20118552, + "step": 33000 + }, + { + "epoch": 9.102316602316602, + "grad_norm": 2.4979728550533764e-05, + "learning_rate": 1.2180896935144514e-06, + "loss": 0.0, + "num_input_tokens_seen": 20121368, + "step": 33005 + }, + { + "epoch": 9.103695532266961, + "grad_norm": 5.288317765916872e-07, + "learning_rate": 1.2143820667977036e-06, + "loss": 0.0, + "num_input_tokens_seen": 20126584, + "step": 33010 + }, + { + "epoch": 9.105074462217319, + "grad_norm": 6.874952305224724e-07, + "learning_rate": 1.2106799508671995e-06, + "loss": 0.0, + "num_input_tokens_seen": 20129880, + "step": 33015 + }, + { + "epoch": 9.106453392167678, + "grad_norm": 8.671902946844057e-07, + "learning_rate": 1.2069833465806673e-06, + "loss": 0.0, + "num_input_tokens_seen": 20132792, + "step": 33020 + }, + { + "epoch": 9.107832322118037, + "grad_norm": 3.74407960634926e-07, + "learning_rate": 1.2032922547945635e-06, + "loss": 0.0, + "num_input_tokens_seen": 20135416, + "step": 33025 + }, + { + "epoch": 9.109211252068395, + "grad_norm": 2.915339337050682e-06, + "learning_rate": 1.1996066763640512e-06, + "loss": 0.0, + "num_input_tokens_seen": 20138488, + "step": 33030 + }, + { + "epoch": 9.110590182018754, + "grad_norm": 5.08806465404632e-07, + "learning_rate": 1.1959266121430334e-06, + "loss": 0.0, + "num_input_tokens_seen": 20141848, + "step": 33035 + }, + { + "epoch": 9.111969111969112, + "grad_norm": 9.58572059062135e-07, + "learning_rate": 1.1922520629841226e-06, + "loss": 0.0, + "num_input_tokens_seen": 20144760, + "step": 33040 + }, + { + "epoch": 9.11334804191947, + "grad_norm": 5.948575676484325e-07, + "learning_rate": 1.1885830297386685e-06, + "loss": 0.0, + "num_input_tokens_seen": 20147960, + "step": 33045 + }, + { + "epoch": 9.114726971869828, + "grad_norm": 2.3033673642203212e-05, + "learning_rate": 1.1849195132567215e-06, + "loss": 0.0, + "num_input_tokens_seen": 20150968, + "step": 33050 + }, + { + "epoch": 9.116105901820188, + "grad_norm": 5.267293090582825e-07, + "learning_rate": 1.1812615143870748e-06, + "loss": 0.0, + "num_input_tokens_seen": 20153656, + "step": 33055 + }, + { + "epoch": 9.117484831770547, + "grad_norm": 6.839934485469712e-07, + "learning_rate": 1.1776090339772283e-06, + "loss": 0.0, + "num_input_tokens_seen": 20156696, + "step": 33060 + }, + { + "epoch": 9.118863761720904, + "grad_norm": 8.683169312462269e-07, + "learning_rate": 1.1739620728734163e-06, + "loss": 0.0, + "num_input_tokens_seen": 20159128, + "step": 33065 + }, + { + "epoch": 9.120242691671264, + "grad_norm": 1.186107851935958e-06, + "learning_rate": 1.1703206319205823e-06, + "loss": 0.0, + "num_input_tokens_seen": 20161720, + "step": 33070 + }, + { + "epoch": 9.121621621621621, + "grad_norm": 1.9798178527707933e-06, + "learning_rate": 1.166684711962393e-06, + "loss": 0.0, + "num_input_tokens_seen": 20166648, + "step": 33075 + }, + { + "epoch": 9.12300055157198, + "grad_norm": 0.00013573473552241921, + "learning_rate": 1.163054313841247e-06, + "loss": 0.0, + "num_input_tokens_seen": 20170616, + "step": 33080 + }, + { + "epoch": 9.124379481522338, + "grad_norm": 5.6386754295090213e-05, + "learning_rate": 1.1594294383982462e-06, + "loss": 0.0, + "num_input_tokens_seen": 20173496, + "step": 33085 + }, + { + "epoch": 9.125758411472697, + "grad_norm": 1.6313123296640697e-06, + "learning_rate": 1.155810086473233e-06, + "loss": 0.0, + "num_input_tokens_seen": 20175832, + "step": 33090 + }, + { + "epoch": 9.127137341423056, + "grad_norm": 0.00011182212620042264, + "learning_rate": 1.1521962589047503e-06, + "loss": 0.0, + "num_input_tokens_seen": 20179672, + "step": 33095 + }, + { + "epoch": 9.128516271373414, + "grad_norm": 4.364491815067595e-06, + "learning_rate": 1.14858795653007e-06, + "loss": 0.0, + "num_input_tokens_seen": 20182328, + "step": 33100 + }, + { + "epoch": 9.129895201323773, + "grad_norm": 5.524382800103922e-07, + "learning_rate": 1.1449851801851903e-06, + "loss": 0.0, + "num_input_tokens_seen": 20185688, + "step": 33105 + }, + { + "epoch": 9.13127413127413, + "grad_norm": 6.81153665027523e-07, + "learning_rate": 1.141387930704818e-06, + "loss": 0.0, + "num_input_tokens_seen": 20188696, + "step": 33110 + }, + { + "epoch": 9.13265306122449, + "grad_norm": 3.757840386242606e-05, + "learning_rate": 1.1377962089223843e-06, + "loss": 0.0, + "num_input_tokens_seen": 20192280, + "step": 33115 + }, + { + "epoch": 9.134031991174849, + "grad_norm": 0.00019841777975670993, + "learning_rate": 1.1342100156700425e-06, + "loss": 0.0, + "num_input_tokens_seen": 20195128, + "step": 33120 + }, + { + "epoch": 9.135410921125207, + "grad_norm": 7.15509543169901e-07, + "learning_rate": 1.1306293517786614e-06, + "loss": 0.0, + "num_input_tokens_seen": 20198200, + "step": 33125 + }, + { + "epoch": 9.136789851075566, + "grad_norm": 6.021668923494872e-07, + "learning_rate": 1.1270542180778276e-06, + "loss": 0.0, + "num_input_tokens_seen": 20201784, + "step": 33130 + }, + { + "epoch": 9.138168781025923, + "grad_norm": 1.9854171569022583e-06, + "learning_rate": 1.1234846153958506e-06, + "loss": 0.0, + "num_input_tokens_seen": 20205272, + "step": 33135 + }, + { + "epoch": 9.139547710976283, + "grad_norm": 6.543481845255883e-07, + "learning_rate": 1.1199205445597493e-06, + "loss": 0.0, + "num_input_tokens_seen": 20208664, + "step": 33140 + }, + { + "epoch": 9.14092664092664, + "grad_norm": 8.51884010444337e-07, + "learning_rate": 1.11636200639528e-06, + "loss": 0.0, + "num_input_tokens_seen": 20212504, + "step": 33145 + }, + { + "epoch": 9.142305570877, + "grad_norm": 5.034980290474778e-07, + "learning_rate": 1.112809001726897e-06, + "loss": 0.0, + "num_input_tokens_seen": 20215224, + "step": 33150 + }, + { + "epoch": 9.143684500827359, + "grad_norm": 1.9410483673709678e-06, + "learning_rate": 1.1092615313777782e-06, + "loss": 0.0, + "num_input_tokens_seen": 20218360, + "step": 33155 + }, + { + "epoch": 9.145063430777716, + "grad_norm": 3.365964175827685e-06, + "learning_rate": 1.105719596169827e-06, + "loss": 0.0, + "num_input_tokens_seen": 20220888, + "step": 33160 + }, + { + "epoch": 9.146442360728075, + "grad_norm": 1.0413102700113086e-06, + "learning_rate": 1.1021831969236562e-06, + "loss": 0.0, + "num_input_tokens_seen": 20223672, + "step": 33165 + }, + { + "epoch": 9.147821290678433, + "grad_norm": 4.6646229634461633e-07, + "learning_rate": 1.0986523344585998e-06, + "loss": 0.0, + "num_input_tokens_seen": 20226136, + "step": 33170 + }, + { + "epoch": 9.149200220628792, + "grad_norm": 6.937669354556419e-07, + "learning_rate": 1.0951270095927085e-06, + "loss": 0.0, + "num_input_tokens_seen": 20229080, + "step": 33175 + }, + { + "epoch": 9.150579150579151, + "grad_norm": 1.6265036038021208e-06, + "learning_rate": 1.0916072231427432e-06, + "loss": 0.0, + "num_input_tokens_seen": 20231384, + "step": 33180 + }, + { + "epoch": 9.151958080529509, + "grad_norm": 3.7949970987938286e-07, + "learning_rate": 1.088092975924196e-06, + "loss": 0.0, + "num_input_tokens_seen": 20234104, + "step": 33185 + }, + { + "epoch": 9.153337010479868, + "grad_norm": 2.3536822482128628e-06, + "learning_rate": 1.084584268751268e-06, + "loss": 0.0, + "num_input_tokens_seen": 20237336, + "step": 33190 + }, + { + "epoch": 9.154715940430226, + "grad_norm": 0.0007112575694918633, + "learning_rate": 1.0810811024368678e-06, + "loss": 0.0, + "num_input_tokens_seen": 20240376, + "step": 33195 + }, + { + "epoch": 9.156094870380585, + "grad_norm": 0.00012500496814027429, + "learning_rate": 1.0775834777926375e-06, + "loss": 0.0, + "num_input_tokens_seen": 20242616, + "step": 33200 + }, + { + "epoch": 9.157473800330942, + "grad_norm": 4.119386630918598e-06, + "learning_rate": 1.0740913956289233e-06, + "loss": 0.0, + "num_input_tokens_seen": 20244856, + "step": 33205 + }, + { + "epoch": 9.158852730281302, + "grad_norm": 1.5439829894603463e-06, + "learning_rate": 1.0706048567547866e-06, + "loss": 0.0, + "num_input_tokens_seen": 20248632, + "step": 33210 + }, + { + "epoch": 9.160231660231661, + "grad_norm": 1.0964278089886648e-06, + "learning_rate": 1.0671238619780172e-06, + "loss": 0.0, + "num_input_tokens_seen": 20251576, + "step": 33215 + }, + { + "epoch": 9.161610590182018, + "grad_norm": 1.4213009080776828e-06, + "learning_rate": 1.063648412105106e-06, + "loss": 0.0, + "num_input_tokens_seen": 20254424, + "step": 33220 + }, + { + "epoch": 9.162989520132378, + "grad_norm": 4.957705641572829e-07, + "learning_rate": 1.0601785079412591e-06, + "loss": 0.0, + "num_input_tokens_seen": 20257880, + "step": 33225 + }, + { + "epoch": 9.164368450082735, + "grad_norm": 1.225542405336455e-06, + "learning_rate": 1.0567141502904165e-06, + "loss": 0.0, + "num_input_tokens_seen": 20260504, + "step": 33230 + }, + { + "epoch": 9.165747380033094, + "grad_norm": 1.634371619729791e-05, + "learning_rate": 1.0532553399552086e-06, + "loss": 0.0, + "num_input_tokens_seen": 20263640, + "step": 33235 + }, + { + "epoch": 9.167126309983454, + "grad_norm": 0.0003134650469291955, + "learning_rate": 1.0498020777369999e-06, + "loss": 0.0, + "num_input_tokens_seen": 20268312, + "step": 33240 + }, + { + "epoch": 9.168505239933811, + "grad_norm": 4.984858037460072e-07, + "learning_rate": 1.0463543644358532e-06, + "loss": 0.0, + "num_input_tokens_seen": 20270520, + "step": 33245 + }, + { + "epoch": 9.16988416988417, + "grad_norm": 2.1150949578441214e-06, + "learning_rate": 1.0429122008505655e-06, + "loss": 0.0, + "num_input_tokens_seen": 20273464, + "step": 33250 + }, + { + "epoch": 9.171263099834528, + "grad_norm": 9.473418458583183e-07, + "learning_rate": 1.0394755877786266e-06, + "loss": 0.0, + "num_input_tokens_seen": 20275800, + "step": 33255 + }, + { + "epoch": 9.172642029784887, + "grad_norm": 5.890175316380919e-07, + "learning_rate": 1.0360445260162582e-06, + "loss": 0.0, + "num_input_tokens_seen": 20278136, + "step": 33260 + }, + { + "epoch": 9.174020959735245, + "grad_norm": 9.86457962426357e-07, + "learning_rate": 1.0326190163583826e-06, + "loss": 0.0, + "num_input_tokens_seen": 20281976, + "step": 33265 + }, + { + "epoch": 9.175399889685604, + "grad_norm": 8.983538464235608e-07, + "learning_rate": 1.0291990595986428e-06, + "loss": 0.0, + "num_input_tokens_seen": 20285016, + "step": 33270 + }, + { + "epoch": 9.176778819635963, + "grad_norm": 1.7203644802066265e-06, + "learning_rate": 1.025784656529394e-06, + "loss": 0.0, + "num_input_tokens_seen": 20290040, + "step": 33275 + }, + { + "epoch": 9.17815774958632, + "grad_norm": 1.696623075986281e-05, + "learning_rate": 1.0223758079417056e-06, + "loss": 0.0, + "num_input_tokens_seen": 20292568, + "step": 33280 + }, + { + "epoch": 9.17953667953668, + "grad_norm": 3.828911303571658e-06, + "learning_rate": 1.0189725146253576e-06, + "loss": 0.0, + "num_input_tokens_seen": 20294680, + "step": 33285 + }, + { + "epoch": 9.180915609487037, + "grad_norm": 1.6125941328937188e-05, + "learning_rate": 1.0155747773688413e-06, + "loss": 0.0, + "num_input_tokens_seen": 20297080, + "step": 33290 + }, + { + "epoch": 9.182294539437397, + "grad_norm": 7.020748284958245e-07, + "learning_rate": 1.0121825969593713e-06, + "loss": 0.0, + "num_input_tokens_seen": 20299672, + "step": 33295 + }, + { + "epoch": 9.183673469387756, + "grad_norm": 9.83953032118734e-06, + "learning_rate": 1.0087959741828606e-06, + "loss": 0.0, + "num_input_tokens_seen": 20302520, + "step": 33300 + }, + { + "epoch": 9.185052399338113, + "grad_norm": 1.036737899084983e-06, + "learning_rate": 1.0054149098239402e-06, + "loss": 0.0, + "num_input_tokens_seen": 20305688, + "step": 33305 + }, + { + "epoch": 9.186431329288473, + "grad_norm": 4.2932612132062786e-07, + "learning_rate": 1.0020394046659582e-06, + "loss": 0.0, + "num_input_tokens_seen": 20309048, + "step": 33310 + }, + { + "epoch": 9.18781025923883, + "grad_norm": 1.937777597049717e-06, + "learning_rate": 9.986694594909697e-07, + "loss": 0.0, + "num_input_tokens_seen": 20311672, + "step": 33315 + }, + { + "epoch": 9.18918918918919, + "grad_norm": 4.57800865660829e-07, + "learning_rate": 9.953050750797415e-07, + "loss": 0.0, + "num_input_tokens_seen": 20314616, + "step": 33320 + }, + { + "epoch": 9.190568119139547, + "grad_norm": 2.2019621610525064e-06, + "learning_rate": 9.919462522117534e-07, + "loss": 0.0, + "num_input_tokens_seen": 20318328, + "step": 33325 + }, + { + "epoch": 9.191947049089906, + "grad_norm": 4.852669235333451e-07, + "learning_rate": 9.885929916651964e-07, + "loss": 0.0, + "num_input_tokens_seen": 20320856, + "step": 33330 + }, + { + "epoch": 9.193325979040265, + "grad_norm": 5.356010319701454e-07, + "learning_rate": 9.852452942169771e-07, + "loss": 0.0, + "num_input_tokens_seen": 20323512, + "step": 33335 + }, + { + "epoch": 9.194704908990623, + "grad_norm": 4.3684488559847523e-07, + "learning_rate": 9.819031606427054e-07, + "loss": 0.0, + "num_input_tokens_seen": 20327448, + "step": 33340 + }, + { + "epoch": 9.196083838940982, + "grad_norm": 1.0503290468477644e-06, + "learning_rate": 9.78566591716701e-07, + "loss": 0.0, + "num_input_tokens_seen": 20330456, + "step": 33345 + }, + { + "epoch": 9.19746276889134, + "grad_norm": 1.3422567235465976e-06, + "learning_rate": 9.752355882120061e-07, + "loss": 0.0, + "num_input_tokens_seen": 20332472, + "step": 33350 + }, + { + "epoch": 9.198841698841699, + "grad_norm": 7.772389380988898e-07, + "learning_rate": 9.719101509003653e-07, + "loss": 0.0, + "num_input_tokens_seen": 20335672, + "step": 33355 + }, + { + "epoch": 9.200220628792056, + "grad_norm": 7.172147320488875e-07, + "learning_rate": 9.685902805522285e-07, + "loss": 0.0, + "num_input_tokens_seen": 20337848, + "step": 33360 + }, + { + "epoch": 9.201599558742416, + "grad_norm": 7.620354836035403e-07, + "learning_rate": 9.65275977936772e-07, + "loss": 0.0, + "num_input_tokens_seen": 20340664, + "step": 33365 + }, + { + "epoch": 9.202978488692775, + "grad_norm": 4.6155159338923113e-07, + "learning_rate": 9.619672438218624e-07, + "loss": 0.0, + "num_input_tokens_seen": 20343608, + "step": 33370 + }, + { + "epoch": 9.204357418643133, + "grad_norm": 4.77314131330786e-07, + "learning_rate": 9.586640789740946e-07, + "loss": 0.0, + "num_input_tokens_seen": 20346712, + "step": 33375 + }, + { + "epoch": 9.205736348593492, + "grad_norm": 1.401552708557574e-06, + "learning_rate": 9.553664841587596e-07, + "loss": 0.0, + "num_input_tokens_seen": 20349624, + "step": 33380 + }, + { + "epoch": 9.20711527854385, + "grad_norm": 6.546041504407185e-07, + "learning_rate": 9.520744601398596e-07, + "loss": 0.0, + "num_input_tokens_seen": 20354008, + "step": 33385 + }, + { + "epoch": 9.208494208494209, + "grad_norm": 1.2814737146982225e-06, + "learning_rate": 9.487880076801153e-07, + "loss": 0.0, + "num_input_tokens_seen": 20356728, + "step": 33390 + }, + { + "epoch": 9.209873138444568, + "grad_norm": 7.089338964760827e-07, + "learning_rate": 9.455071275409483e-07, + "loss": 0.0, + "num_input_tokens_seen": 20360056, + "step": 33395 + }, + { + "epoch": 9.211252068394925, + "grad_norm": 3.764732525723957e-07, + "learning_rate": 9.422318204824921e-07, + "loss": 0.0, + "num_input_tokens_seen": 20364120, + "step": 33400 + }, + { + "epoch": 9.212630998345285, + "grad_norm": 5.079958782516769e-07, + "learning_rate": 9.389620872635868e-07, + "loss": 0.0, + "num_input_tokens_seen": 20366840, + "step": 33405 + }, + { + "epoch": 9.214009928295642, + "grad_norm": 7.92374294178444e-07, + "learning_rate": 9.356979286417878e-07, + "loss": 0.0, + "num_input_tokens_seen": 20369880, + "step": 33410 + }, + { + "epoch": 9.215388858246001, + "grad_norm": 3.724216242062539e-07, + "learning_rate": 9.324393453733426e-07, + "loss": 0.0, + "num_input_tokens_seen": 20372408, + "step": 33415 + }, + { + "epoch": 9.216767788196359, + "grad_norm": 5.490816192832426e-07, + "learning_rate": 9.291863382132309e-07, + "loss": 0.0, + "num_input_tokens_seen": 20374808, + "step": 33420 + }, + { + "epoch": 9.218146718146718, + "grad_norm": 6.99703946338559e-07, + "learning_rate": 9.259389079151193e-07, + "loss": 0.0, + "num_input_tokens_seen": 20376632, + "step": 33425 + }, + { + "epoch": 9.219525648097077, + "grad_norm": 0.00013326633779797703, + "learning_rate": 9.226970552313946e-07, + "loss": 0.0, + "num_input_tokens_seen": 20379512, + "step": 33430 + }, + { + "epoch": 9.220904578047435, + "grad_norm": 5.123652044858318e-07, + "learning_rate": 9.194607809131478e-07, + "loss": 0.0, + "num_input_tokens_seen": 20382328, + "step": 33435 + }, + { + "epoch": 9.222283507997794, + "grad_norm": 3.9664908513259434e-07, + "learning_rate": 9.162300857101707e-07, + "loss": 0.0, + "num_input_tokens_seen": 20385400, + "step": 33440 + }, + { + "epoch": 9.223662437948152, + "grad_norm": 0.0002302992215845734, + "learning_rate": 9.130049703709786e-07, + "loss": 0.0, + "num_input_tokens_seen": 20389080, + "step": 33445 + }, + { + "epoch": 9.22504136789851, + "grad_norm": 9.716122804093175e-07, + "learning_rate": 9.09785435642782e-07, + "loss": 0.0, + "num_input_tokens_seen": 20391416, + "step": 33450 + }, + { + "epoch": 9.22642029784887, + "grad_norm": 1.8298628674529027e-06, + "learning_rate": 9.065714822714927e-07, + "loss": 0.0, + "num_input_tokens_seen": 20393976, + "step": 33455 + }, + { + "epoch": 9.227799227799228, + "grad_norm": 9.34158606469282e-07, + "learning_rate": 9.033631110017454e-07, + "loss": 0.0, + "num_input_tokens_seen": 20397464, + "step": 33460 + }, + { + "epoch": 9.229178157749587, + "grad_norm": 5.303994612404495e-07, + "learning_rate": 9.001603225768762e-07, + "loss": 0.0, + "num_input_tokens_seen": 20400888, + "step": 33465 + }, + { + "epoch": 9.230557087699944, + "grad_norm": 1.3524046380553045e-06, + "learning_rate": 8.969631177389165e-07, + "loss": 0.0, + "num_input_tokens_seen": 20403384, + "step": 33470 + }, + { + "epoch": 9.231936017650304, + "grad_norm": 0.0001504190149717033, + "learning_rate": 8.937714972286209e-07, + "loss": 0.0, + "num_input_tokens_seen": 20406136, + "step": 33475 + }, + { + "epoch": 9.233314947600661, + "grad_norm": 1.7969424561670166e-06, + "learning_rate": 8.905854617854343e-07, + "loss": 0.0, + "num_input_tokens_seen": 20411128, + "step": 33480 + }, + { + "epoch": 9.23469387755102, + "grad_norm": 0.0002354078460484743, + "learning_rate": 8.874050121475242e-07, + "loss": 0.0, + "num_input_tokens_seen": 20413496, + "step": 33485 + }, + { + "epoch": 9.23607280750138, + "grad_norm": 6.633764542129938e-07, + "learning_rate": 8.842301490517485e-07, + "loss": 0.0, + "num_input_tokens_seen": 20416120, + "step": 33490 + }, + { + "epoch": 9.237451737451737, + "grad_norm": 6.334800559670839e-07, + "learning_rate": 8.810608732336773e-07, + "loss": 0.0, + "num_input_tokens_seen": 20420152, + "step": 33495 + }, + { + "epoch": 9.238830667402096, + "grad_norm": 1.7975669379666215e-06, + "learning_rate": 8.778971854275897e-07, + "loss": 0.0, + "num_input_tokens_seen": 20423192, + "step": 33500 + }, + { + "epoch": 9.240209597352454, + "grad_norm": 6.833243446635606e-07, + "learning_rate": 8.74739086366469e-07, + "loss": 0.0, + "num_input_tokens_seen": 20427768, + "step": 33505 + }, + { + "epoch": 9.241588527302813, + "grad_norm": 4.459197043615859e-06, + "learning_rate": 8.715865767819908e-07, + "loss": 0.0, + "num_input_tokens_seen": 20431480, + "step": 33510 + }, + { + "epoch": 9.242967457253172, + "grad_norm": 7.148046734073432e-06, + "learning_rate": 8.6843965740456e-07, + "loss": 0.0, + "num_input_tokens_seen": 20434520, + "step": 33515 + }, + { + "epoch": 9.24434638720353, + "grad_norm": 5.805456453344959e-07, + "learning_rate": 8.652983289632599e-07, + "loss": 0.0, + "num_input_tokens_seen": 20437176, + "step": 33520 + }, + { + "epoch": 9.24572531715389, + "grad_norm": 1.3840133306075586e-06, + "learning_rate": 8.621625921859028e-07, + "loss": 0.0, + "num_input_tokens_seen": 20439864, + "step": 33525 + }, + { + "epoch": 9.247104247104247, + "grad_norm": 2.747702865235624e-06, + "learning_rate": 8.590324477989852e-07, + "loss": 0.0, + "num_input_tokens_seen": 20442296, + "step": 33530 + }, + { + "epoch": 9.248483177054606, + "grad_norm": 2.2499150418298086e-06, + "learning_rate": 8.55907896527719e-07, + "loss": 0.0, + "num_input_tokens_seen": 20445304, + "step": 33535 + }, + { + "epoch": 9.249862107004963, + "grad_norm": 1.5887329709585174e-06, + "learning_rate": 8.52788939096022e-07, + "loss": 0.0, + "num_input_tokens_seen": 20448856, + "step": 33540 + }, + { + "epoch": 9.251241036955323, + "grad_norm": 4.5323551489673264e-07, + "learning_rate": 8.49675576226508e-07, + "loss": 0.0, + "num_input_tokens_seen": 20452856, + "step": 33545 + }, + { + "epoch": 9.252619966905682, + "grad_norm": 1.650474246162048e-06, + "learning_rate": 8.465678086404971e-07, + "loss": 0.0, + "num_input_tokens_seen": 20455352, + "step": 33550 + }, + { + "epoch": 9.25399889685604, + "grad_norm": 7.350405439865426e-07, + "learning_rate": 8.43465637058019e-07, + "loss": 0.0, + "num_input_tokens_seen": 20457976, + "step": 33555 + }, + { + "epoch": 9.255377826806399, + "grad_norm": 5.598034817921871e-07, + "learning_rate": 8.403690621977989e-07, + "loss": 0.0, + "num_input_tokens_seen": 20460280, + "step": 33560 + }, + { + "epoch": 9.256756756756756, + "grad_norm": 9.505347406957299e-07, + "learning_rate": 8.372780847772682e-07, + "loss": 0.0, + "num_input_tokens_seen": 20464120, + "step": 33565 + }, + { + "epoch": 9.258135686707115, + "grad_norm": 1.0255300821881974e-06, + "learning_rate": 8.341927055125654e-07, + "loss": 0.0, + "num_input_tokens_seen": 20467192, + "step": 33570 + }, + { + "epoch": 9.259514616657473, + "grad_norm": 1.0376753607488354e-06, + "learning_rate": 8.311129251185212e-07, + "loss": 0.0, + "num_input_tokens_seen": 20469720, + "step": 33575 + }, + { + "epoch": 9.260893546607832, + "grad_norm": 1.5087997553564492e-06, + "learning_rate": 8.280387443086873e-07, + "loss": 0.0, + "num_input_tokens_seen": 20472472, + "step": 33580 + }, + { + "epoch": 9.262272476558191, + "grad_norm": 8.718873232282931e-07, + "learning_rate": 8.249701637953022e-07, + "loss": 0.0, + "num_input_tokens_seen": 20475448, + "step": 33585 + }, + { + "epoch": 9.263651406508549, + "grad_norm": 5.473539204103872e-07, + "learning_rate": 8.219071842893084e-07, + "loss": 0.0, + "num_input_tokens_seen": 20478456, + "step": 33590 + }, + { + "epoch": 9.265030336458908, + "grad_norm": 4.992818389837339e-07, + "learning_rate": 8.188498065003603e-07, + "loss": 0.0, + "num_input_tokens_seen": 20481400, + "step": 33595 + }, + { + "epoch": 9.266409266409266, + "grad_norm": 1.258134489035001e-06, + "learning_rate": 8.157980311368024e-07, + "loss": 0.0, + "num_input_tokens_seen": 20483800, + "step": 33600 + }, + { + "epoch": 9.267788196359625, + "grad_norm": 3.680304700992565e-07, + "learning_rate": 8.127518589056915e-07, + "loss": 0.0, + "num_input_tokens_seen": 20487256, + "step": 33605 + }, + { + "epoch": 9.269167126309984, + "grad_norm": 6.656968594143109e-07, + "learning_rate": 8.097112905127823e-07, + "loss": 0.0, + "num_input_tokens_seen": 20490232, + "step": 33610 + }, + { + "epoch": 9.270546056260342, + "grad_norm": 0.00010123817628482357, + "learning_rate": 8.066763266625282e-07, + "loss": 0.0, + "num_input_tokens_seen": 20493816, + "step": 33615 + }, + { + "epoch": 9.271924986210701, + "grad_norm": 9.092008212974179e-07, + "learning_rate": 8.036469680580888e-07, + "loss": 0.0, + "num_input_tokens_seen": 20500568, + "step": 33620 + }, + { + "epoch": 9.273303916161058, + "grad_norm": 4.421391111009143e-07, + "learning_rate": 8.006232154013249e-07, + "loss": 0.0, + "num_input_tokens_seen": 20503032, + "step": 33625 + }, + { + "epoch": 9.274682846111418, + "grad_norm": 4.4130922560725594e-07, + "learning_rate": 7.9760506939279e-07, + "loss": 0.0, + "num_input_tokens_seen": 20505304, + "step": 33630 + }, + { + "epoch": 9.276061776061777, + "grad_norm": 6.609765841858461e-05, + "learning_rate": 7.945925307317497e-07, + "loss": 0.0, + "num_input_tokens_seen": 20507960, + "step": 33635 + }, + { + "epoch": 9.277440706012134, + "grad_norm": 7.664436452614609e-06, + "learning_rate": 7.915856001161681e-07, + "loss": 0.0, + "num_input_tokens_seen": 20510360, + "step": 33640 + }, + { + "epoch": 9.278819635962494, + "grad_norm": 1.28049134673347e-06, + "learning_rate": 7.885842782427016e-07, + "loss": 0.0, + "num_input_tokens_seen": 20513400, + "step": 33645 + }, + { + "epoch": 9.280198565912851, + "grad_norm": 4.566743882605806e-05, + "learning_rate": 7.855885658067219e-07, + "loss": 0.0, + "num_input_tokens_seen": 20516408, + "step": 33650 + }, + { + "epoch": 9.28157749586321, + "grad_norm": 9.166492986878438e-07, + "learning_rate": 7.825984635022848e-07, + "loss": 0.0, + "num_input_tokens_seen": 20518584, + "step": 33655 + }, + { + "epoch": 9.282956425813568, + "grad_norm": 0.0015673971502110362, + "learning_rate": 7.796139720221557e-07, + "loss": 0.0, + "num_input_tokens_seen": 20521272, + "step": 33660 + }, + { + "epoch": 9.284335355763927, + "grad_norm": 1.2547086498670978e-06, + "learning_rate": 7.766350920577981e-07, + "loss": 0.0, + "num_input_tokens_seen": 20523896, + "step": 33665 + }, + { + "epoch": 9.285714285714286, + "grad_norm": 1.4522039236908313e-05, + "learning_rate": 7.73661824299382e-07, + "loss": 0.0, + "num_input_tokens_seen": 20526584, + "step": 33670 + }, + { + "epoch": 9.287093215664644, + "grad_norm": 6.61642388877226e-07, + "learning_rate": 7.706941694357623e-07, + "loss": 0.0, + "num_input_tokens_seen": 20529080, + "step": 33675 + }, + { + "epoch": 9.288472145615003, + "grad_norm": 7.981186058714229e-07, + "learning_rate": 7.67732128154508e-07, + "loss": 0.0, + "num_input_tokens_seen": 20531736, + "step": 33680 + }, + { + "epoch": 9.28985107556536, + "grad_norm": 1.590552164998371e-05, + "learning_rate": 7.64775701141876e-07, + "loss": 0.0, + "num_input_tokens_seen": 20533976, + "step": 33685 + }, + { + "epoch": 9.29123000551572, + "grad_norm": 8.437616543233162e-07, + "learning_rate": 7.618248890828377e-07, + "loss": 0.0, + "num_input_tokens_seen": 20536472, + "step": 33690 + }, + { + "epoch": 9.292608935466077, + "grad_norm": 4.980062158210785e-07, + "learning_rate": 7.588796926610436e-07, + "loss": 0.0, + "num_input_tokens_seen": 20538936, + "step": 33695 + }, + { + "epoch": 9.293987865416437, + "grad_norm": 1.0906321676884545e-06, + "learning_rate": 7.559401125588561e-07, + "loss": 0.0, + "num_input_tokens_seen": 20542232, + "step": 33700 + }, + { + "epoch": 9.295366795366796, + "grad_norm": 1.6692146118657547e-06, + "learning_rate": 7.530061494573387e-07, + "loss": 0.0, + "num_input_tokens_seen": 20545208, + "step": 33705 + }, + { + "epoch": 9.296745725317153, + "grad_norm": 7.044304766168352e-07, + "learning_rate": 7.500778040362422e-07, + "loss": 0.0, + "num_input_tokens_seen": 20547608, + "step": 33710 + }, + { + "epoch": 9.298124655267513, + "grad_norm": 4.474169600143796e-06, + "learning_rate": 7.471550769740265e-07, + "loss": 0.0, + "num_input_tokens_seen": 20552248, + "step": 33715 + }, + { + "epoch": 9.29950358521787, + "grad_norm": 5.499016992871475e-07, + "learning_rate": 7.442379689478446e-07, + "loss": 0.0, + "num_input_tokens_seen": 20556504, + "step": 33720 + }, + { + "epoch": 9.30088251516823, + "grad_norm": 7.722850909885892e-07, + "learning_rate": 7.413264806335474e-07, + "loss": 0.0, + "num_input_tokens_seen": 20559384, + "step": 33725 + }, + { + "epoch": 9.302261445118589, + "grad_norm": 4.5948533511364076e-07, + "learning_rate": 7.384206127056842e-07, + "loss": 0.0, + "num_input_tokens_seen": 20562584, + "step": 33730 + }, + { + "epoch": 9.303640375068946, + "grad_norm": 3.349387407070026e-05, + "learning_rate": 7.355203658375026e-07, + "loss": 0.0, + "num_input_tokens_seen": 20565144, + "step": 33735 + }, + { + "epoch": 9.305019305019306, + "grad_norm": 8.663846529088914e-07, + "learning_rate": 7.32625740700954e-07, + "loss": 0.0, + "num_input_tokens_seen": 20568792, + "step": 33740 + }, + { + "epoch": 9.306398234969663, + "grad_norm": 0.0018480042926967144, + "learning_rate": 7.297367379666742e-07, + "loss": 0.0, + "num_input_tokens_seen": 20572440, + "step": 33745 + }, + { + "epoch": 9.307777164920022, + "grad_norm": 7.882199497544207e-07, + "learning_rate": 7.268533583040083e-07, + "loss": 0.0, + "num_input_tokens_seen": 20575224, + "step": 33750 + }, + { + "epoch": 9.30915609487038, + "grad_norm": 0.00018532971444074064, + "learning_rate": 7.239756023809885e-07, + "loss": 0.0, + "num_input_tokens_seen": 20577400, + "step": 33755 + }, + { + "epoch": 9.310535024820739, + "grad_norm": 5.864016543455364e-07, + "learning_rate": 7.211034708643594e-07, + "loss": 0.0, + "num_input_tokens_seen": 20580024, + "step": 33760 + }, + { + "epoch": 9.311913954771098, + "grad_norm": 5.014831572225376e-07, + "learning_rate": 7.182369644195414e-07, + "loss": 0.0, + "num_input_tokens_seen": 20583256, + "step": 33765 + }, + { + "epoch": 9.313292884721456, + "grad_norm": 5.329407031240407e-07, + "learning_rate": 7.153760837106699e-07, + "loss": 0.0, + "num_input_tokens_seen": 20587544, + "step": 33770 + }, + { + "epoch": 9.314671814671815, + "grad_norm": 8.552391591365449e-06, + "learning_rate": 7.125208294005703e-07, + "loss": 0.0, + "num_input_tokens_seen": 20591128, + "step": 33775 + }, + { + "epoch": 9.316050744622173, + "grad_norm": 7.075494750097278e-07, + "learning_rate": 7.096712021507579e-07, + "loss": 0.0, + "num_input_tokens_seen": 20594904, + "step": 33780 + }, + { + "epoch": 9.317429674572532, + "grad_norm": 8.337896133525646e-07, + "learning_rate": 7.068272026214573e-07, + "loss": 0.0, + "num_input_tokens_seen": 20597752, + "step": 33785 + }, + { + "epoch": 9.318808604522891, + "grad_norm": 5.027699785387085e-07, + "learning_rate": 7.039888314715804e-07, + "loss": 0.0, + "num_input_tokens_seen": 20600248, + "step": 33790 + }, + { + "epoch": 9.320187534473249, + "grad_norm": 3.2888581813494966e-07, + "learning_rate": 7.011560893587343e-07, + "loss": 0.0, + "num_input_tokens_seen": 20603160, + "step": 33795 + }, + { + "epoch": 9.321566464423608, + "grad_norm": 5.933475790698139e-07, + "learning_rate": 6.983289769392276e-07, + "loss": 0.0, + "num_input_tokens_seen": 20605656, + "step": 33800 + }, + { + "epoch": 9.322945394373965, + "grad_norm": 9.405838454767945e-07, + "learning_rate": 6.955074948680557e-07, + "loss": 0.0, + "num_input_tokens_seen": 20610200, + "step": 33805 + }, + { + "epoch": 9.324324324324325, + "grad_norm": 4.866811309511831e-07, + "learning_rate": 6.926916437989234e-07, + "loss": 0.0, + "num_input_tokens_seen": 20612440, + "step": 33810 + }, + { + "epoch": 9.325703254274682, + "grad_norm": 9.026158522829064e-07, + "learning_rate": 6.8988142438422e-07, + "loss": 0.0, + "num_input_tokens_seen": 20615896, + "step": 33815 + }, + { + "epoch": 9.327082184225041, + "grad_norm": 4.968455868947785e-07, + "learning_rate": 6.870768372750331e-07, + "loss": 0.0, + "num_input_tokens_seen": 20619352, + "step": 33820 + }, + { + "epoch": 9.3284611141754, + "grad_norm": 1.1041878451578668e-06, + "learning_rate": 6.842778831211455e-07, + "loss": 0.0, + "num_input_tokens_seen": 20622008, + "step": 33825 + }, + { + "epoch": 9.329840044125758, + "grad_norm": 6.608974558730552e-07, + "learning_rate": 6.814845625710359e-07, + "loss": 0.0, + "num_input_tokens_seen": 20624856, + "step": 33830 + }, + { + "epoch": 9.331218974076117, + "grad_norm": 1.6024173419282306e-06, + "learning_rate": 6.786968762718726e-07, + "loss": 0.0, + "num_input_tokens_seen": 20627448, + "step": 33835 + }, + { + "epoch": 9.332597904026475, + "grad_norm": 9.649825187807437e-07, + "learning_rate": 6.759148248695252e-07, + "loss": 0.0, + "num_input_tokens_seen": 20630712, + "step": 33840 + }, + { + "epoch": 9.333976833976834, + "grad_norm": 9.822085758059984e-07, + "learning_rate": 6.731384090085557e-07, + "loss": 0.0, + "num_input_tokens_seen": 20633976, + "step": 33845 + }, + { + "epoch": 9.335355763927193, + "grad_norm": 5.025925702284439e-07, + "learning_rate": 6.703676293322164e-07, + "loss": 0.0, + "num_input_tokens_seen": 20636312, + "step": 33850 + }, + { + "epoch": 9.33673469387755, + "grad_norm": 3.145748451061081e-06, + "learning_rate": 6.676024864824632e-07, + "loss": 0.0, + "num_input_tokens_seen": 20638808, + "step": 33855 + }, + { + "epoch": 9.33811362382791, + "grad_norm": 9.86882696452085e-07, + "learning_rate": 6.648429810999335e-07, + "loss": 0.0, + "num_input_tokens_seen": 20641784, + "step": 33860 + }, + { + "epoch": 9.339492553778268, + "grad_norm": 8.78443529472861e-07, + "learning_rate": 6.620891138239687e-07, + "loss": 0.0, + "num_input_tokens_seen": 20645336, + "step": 33865 + }, + { + "epoch": 9.340871483728627, + "grad_norm": 8.464341590297408e-06, + "learning_rate": 6.593408852926003e-07, + "loss": 0.0, + "num_input_tokens_seen": 20648120, + "step": 33870 + }, + { + "epoch": 9.342250413678984, + "grad_norm": 9.793021717996453e-07, + "learning_rate": 6.565982961425521e-07, + "loss": 0.0, + "num_input_tokens_seen": 20652696, + "step": 33875 + }, + { + "epoch": 9.343629343629344, + "grad_norm": 4.3382839066907763e-05, + "learning_rate": 6.538613470092409e-07, + "loss": 0.0, + "num_input_tokens_seen": 20655160, + "step": 33880 + }, + { + "epoch": 9.345008273579703, + "grad_norm": 1.1603694929362973e-06, + "learning_rate": 6.511300385267843e-07, + "loss": 0.0, + "num_input_tokens_seen": 20658456, + "step": 33885 + }, + { + "epoch": 9.34638720353006, + "grad_norm": 6.3138952555164e-07, + "learning_rate": 6.484043713279819e-07, + "loss": 0.0, + "num_input_tokens_seen": 20661144, + "step": 33890 + }, + { + "epoch": 9.34776613348042, + "grad_norm": 7.325277806558006e-07, + "learning_rate": 6.456843460443368e-07, + "loss": 0.0, + "num_input_tokens_seen": 20664536, + "step": 33895 + }, + { + "epoch": 9.349145063430777, + "grad_norm": 3.8126393064885633e-06, + "learning_rate": 6.42969963306031e-07, + "loss": 0.0, + "num_input_tokens_seen": 20668120, + "step": 33900 + }, + { + "epoch": 9.350523993381136, + "grad_norm": 7.283167064997542e-07, + "learning_rate": 6.402612237419586e-07, + "loss": 0.0, + "num_input_tokens_seen": 20670424, + "step": 33905 + }, + { + "epoch": 9.351902923331494, + "grad_norm": 5.238507128524361e-06, + "learning_rate": 6.375581279796899e-07, + "loss": 0.0, + "num_input_tokens_seen": 20672888, + "step": 33910 + }, + { + "epoch": 9.353281853281853, + "grad_norm": 4.144958438700996e-06, + "learning_rate": 6.348606766454934e-07, + "loss": 0.0, + "num_input_tokens_seen": 20676376, + "step": 33915 + }, + { + "epoch": 9.354660783232212, + "grad_norm": 1.3020972801314201e-05, + "learning_rate": 6.321688703643303e-07, + "loss": 0.0, + "num_input_tokens_seen": 20679704, + "step": 33920 + }, + { + "epoch": 9.35603971318257, + "grad_norm": 4.3208976308051206e-07, + "learning_rate": 6.294827097598571e-07, + "loss": 0.0, + "num_input_tokens_seen": 20683352, + "step": 33925 + }, + { + "epoch": 9.35741864313293, + "grad_norm": 3.7323289348023536e-07, + "learning_rate": 6.268021954544096e-07, + "loss": 0.0, + "num_input_tokens_seen": 20686392, + "step": 33930 + }, + { + "epoch": 9.358797573083287, + "grad_norm": 2.180851879529655e-05, + "learning_rate": 6.241273280690352e-07, + "loss": 0.0, + "num_input_tokens_seen": 20689304, + "step": 33935 + }, + { + "epoch": 9.360176503033646, + "grad_norm": 2.8039621611242183e-06, + "learning_rate": 6.21458108223455e-07, + "loss": 0.0, + "num_input_tokens_seen": 20692536, + "step": 33940 + }, + { + "epoch": 9.361555432984005, + "grad_norm": 5.181267397347256e-07, + "learning_rate": 6.187945365360908e-07, + "loss": 0.0, + "num_input_tokens_seen": 20695704, + "step": 33945 + }, + { + "epoch": 9.362934362934363, + "grad_norm": 5.596739356406033e-07, + "learning_rate": 6.161366136240549e-07, + "loss": 0.0, + "num_input_tokens_seen": 20699736, + "step": 33950 + }, + { + "epoch": 9.364313292884722, + "grad_norm": 7.935291796457022e-05, + "learning_rate": 6.134843401031487e-07, + "loss": 0.0, + "num_input_tokens_seen": 20702200, + "step": 33955 + }, + { + "epoch": 9.36569222283508, + "grad_norm": 3.357338584919489e-07, + "learning_rate": 6.108377165878671e-07, + "loss": 0.0, + "num_input_tokens_seen": 20706872, + "step": 33960 + }, + { + "epoch": 9.367071152785439, + "grad_norm": 8.092033567663748e-07, + "learning_rate": 6.081967436913971e-07, + "loss": 0.0, + "num_input_tokens_seen": 20711128, + "step": 33965 + }, + { + "epoch": 9.368450082735798, + "grad_norm": 0.0003309429157525301, + "learning_rate": 6.055614220256078e-07, + "loss": 0.0, + "num_input_tokens_seen": 20714104, + "step": 33970 + }, + { + "epoch": 9.369829012686155, + "grad_norm": 4.424968551575148e-07, + "learning_rate": 6.029317522010719e-07, + "loss": 0.0, + "num_input_tokens_seen": 20716760, + "step": 33975 + }, + { + "epoch": 9.371207942636515, + "grad_norm": 8.262216510956932e-07, + "learning_rate": 6.003077348270408e-07, + "loss": 0.0, + "num_input_tokens_seen": 20719512, + "step": 33980 + }, + { + "epoch": 9.372586872586872, + "grad_norm": 9.85793917607225e-07, + "learning_rate": 5.976893705114644e-07, + "loss": 0.0, + "num_input_tokens_seen": 20722552, + "step": 33985 + }, + { + "epoch": 9.373965802537231, + "grad_norm": 1.2574248557939427e-06, + "learning_rate": 5.950766598609797e-07, + "loss": 0.0, + "num_input_tokens_seen": 20726712, + "step": 33990 + }, + { + "epoch": 9.375344732487589, + "grad_norm": 4.873475631939073e-07, + "learning_rate": 5.924696034809163e-07, + "loss": 0.0, + "num_input_tokens_seen": 20729752, + "step": 33995 + }, + { + "epoch": 9.376723662437948, + "grad_norm": 3.832806214632001e-07, + "learning_rate": 5.898682019752883e-07, + "loss": 0.0, + "num_input_tokens_seen": 20732280, + "step": 34000 + }, + { + "epoch": 9.378102592388307, + "grad_norm": 4.518534524322604e-07, + "learning_rate": 5.872724559468052e-07, + "loss": 0.0, + "num_input_tokens_seen": 20735704, + "step": 34005 + }, + { + "epoch": 9.379481522338665, + "grad_norm": 1.3262282436699024e-06, + "learning_rate": 5.846823659968609e-07, + "loss": 0.0, + "num_input_tokens_seen": 20737976, + "step": 34010 + }, + { + "epoch": 9.380860452289024, + "grad_norm": 2.866930572054116e-06, + "learning_rate": 5.820979327255477e-07, + "loss": 0.0, + "num_input_tokens_seen": 20740600, + "step": 34015 + }, + { + "epoch": 9.382239382239382, + "grad_norm": 8.786758485257451e-07, + "learning_rate": 5.795191567316366e-07, + "loss": 0.0, + "num_input_tokens_seen": 20744376, + "step": 34020 + }, + { + "epoch": 9.383618312189741, + "grad_norm": 6.120651505625574e-07, + "learning_rate": 5.769460386125968e-07, + "loss": 0.0, + "num_input_tokens_seen": 20746968, + "step": 34025 + }, + { + "epoch": 9.384997242140098, + "grad_norm": 9.796405038287048e-07, + "learning_rate": 5.743785789645795e-07, + "loss": 0.0, + "num_input_tokens_seen": 20749400, + "step": 34030 + }, + { + "epoch": 9.386376172090458, + "grad_norm": 3.9591790823578776e-07, + "learning_rate": 5.718167783824335e-07, + "loss": 0.0, + "num_input_tokens_seen": 20753176, + "step": 34035 + }, + { + "epoch": 9.387755102040817, + "grad_norm": 4.888083822152112e-07, + "learning_rate": 5.692606374596815e-07, + "loss": 0.0, + "num_input_tokens_seen": 20758968, + "step": 34040 + }, + { + "epoch": 9.389134031991174, + "grad_norm": 5.418240789367701e-07, + "learning_rate": 5.667101567885552e-07, + "loss": 0.0, + "num_input_tokens_seen": 20761624, + "step": 34045 + }, + { + "epoch": 9.390512961941534, + "grad_norm": 5.282152528707229e-07, + "learning_rate": 5.64165336959957e-07, + "loss": 0.0, + "num_input_tokens_seen": 20764248, + "step": 34050 + }, + { + "epoch": 9.391891891891891, + "grad_norm": 5.80344305944891e-07, + "learning_rate": 5.616261785634902e-07, + "loss": 0.0, + "num_input_tokens_seen": 20767128, + "step": 34055 + }, + { + "epoch": 9.39327082184225, + "grad_norm": 1.1662561519187875e-05, + "learning_rate": 5.590926821874398e-07, + "loss": 0.0, + "num_input_tokens_seen": 20769752, + "step": 34060 + }, + { + "epoch": 9.39464975179261, + "grad_norm": 1.4932126759958919e-05, + "learning_rate": 5.56564848418778e-07, + "loss": 0.0, + "num_input_tokens_seen": 20771928, + "step": 34065 + }, + { + "epoch": 9.396028681742967, + "grad_norm": 2.804582663884503e-06, + "learning_rate": 5.540426778431723e-07, + "loss": 0.0, + "num_input_tokens_seen": 20775000, + "step": 34070 + }, + { + "epoch": 9.397407611693327, + "grad_norm": 3.4534648420958547e-06, + "learning_rate": 5.515261710449693e-07, + "loss": 0.0, + "num_input_tokens_seen": 20777592, + "step": 34075 + }, + { + "epoch": 9.398786541643684, + "grad_norm": 6.904355700498854e-07, + "learning_rate": 5.490153286072053e-07, + "loss": 0.0, + "num_input_tokens_seen": 20780152, + "step": 34080 + }, + { + "epoch": 9.400165471594043, + "grad_norm": 8.02389820364624e-07, + "learning_rate": 5.465101511116122e-07, + "loss": 0.0, + "num_input_tokens_seen": 20782776, + "step": 34085 + }, + { + "epoch": 9.4015444015444, + "grad_norm": 5.528112978936406e-06, + "learning_rate": 5.440106391386007e-07, + "loss": 0.0, + "num_input_tokens_seen": 20787032, + "step": 34090 + }, + { + "epoch": 9.40292333149476, + "grad_norm": 7.98707446847402e-07, + "learning_rate": 5.415167932672716e-07, + "loss": 0.0, + "num_input_tokens_seen": 20790840, + "step": 34095 + }, + { + "epoch": 9.40430226144512, + "grad_norm": 7.38361336516391e-07, + "learning_rate": 5.390286140754181e-07, + "loss": 0.0, + "num_input_tokens_seen": 20794040, + "step": 34100 + }, + { + "epoch": 9.405681191395477, + "grad_norm": 2.8944936275365762e-05, + "learning_rate": 5.365461021395096e-07, + "loss": 0.0, + "num_input_tokens_seen": 20796472, + "step": 34105 + }, + { + "epoch": 9.407060121345836, + "grad_norm": 8.988065474113682e-07, + "learning_rate": 5.340692580347112e-07, + "loss": 0.0, + "num_input_tokens_seen": 20799672, + "step": 34110 + }, + { + "epoch": 9.408439051296194, + "grad_norm": 4.0278916912939167e-07, + "learning_rate": 5.315980823348693e-07, + "loss": 0.0, + "num_input_tokens_seen": 20802808, + "step": 34115 + }, + { + "epoch": 9.409817981246553, + "grad_norm": 1.1151571470691124e-06, + "learning_rate": 5.291325756125231e-07, + "loss": 0.0, + "num_input_tokens_seen": 20806168, + "step": 34120 + }, + { + "epoch": 9.411196911196912, + "grad_norm": 1.019014121084183e-06, + "learning_rate": 5.266727384388964e-07, + "loss": 0.0, + "num_input_tokens_seen": 20809176, + "step": 34125 + }, + { + "epoch": 9.41257584114727, + "grad_norm": 8.79676406384533e-07, + "learning_rate": 5.242185713838943e-07, + "loss": 0.0, + "num_input_tokens_seen": 20811832, + "step": 34130 + }, + { + "epoch": 9.413954771097629, + "grad_norm": 5.39907603069878e-07, + "learning_rate": 5.217700750161119e-07, + "loss": 0.0, + "num_input_tokens_seen": 20815224, + "step": 34135 + }, + { + "epoch": 9.415333701047986, + "grad_norm": 6.052195203665178e-06, + "learning_rate": 5.193272499028345e-07, + "loss": 0.0, + "num_input_tokens_seen": 20818616, + "step": 34140 + }, + { + "epoch": 9.416712630998346, + "grad_norm": 4.92010144625965e-07, + "learning_rate": 5.168900966100232e-07, + "loss": 0.0, + "num_input_tokens_seen": 20821304, + "step": 34145 + }, + { + "epoch": 9.418091560948703, + "grad_norm": 6.782021273465944e-07, + "learning_rate": 5.144586157023402e-07, + "loss": 0.0, + "num_input_tokens_seen": 20824600, + "step": 34150 + }, + { + "epoch": 9.419470490899062, + "grad_norm": 5.879754780835356e-07, + "learning_rate": 5.12032807743118e-07, + "loss": 0.0, + "num_input_tokens_seen": 20828056, + "step": 34155 + }, + { + "epoch": 9.420849420849422, + "grad_norm": 6.19514992195036e-07, + "learning_rate": 5.096126732943824e-07, + "loss": 0.0, + "num_input_tokens_seen": 20830872, + "step": 34160 + }, + { + "epoch": 9.422228350799779, + "grad_norm": 7.182749754974793e-07, + "learning_rate": 5.071982129168484e-07, + "loss": 0.0, + "num_input_tokens_seen": 20834104, + "step": 34165 + }, + { + "epoch": 9.423607280750138, + "grad_norm": 1.718362796054862e-06, + "learning_rate": 5.047894271699077e-07, + "loss": 0.0, + "num_input_tokens_seen": 20839128, + "step": 34170 + }, + { + "epoch": 9.424986210700496, + "grad_norm": 4.7162026817204605e-07, + "learning_rate": 5.023863166116389e-07, + "loss": 0.0, + "num_input_tokens_seen": 20842040, + "step": 34175 + }, + { + "epoch": 9.426365140650855, + "grad_norm": 9.541142844682327e-07, + "learning_rate": 4.999888817988158e-07, + "loss": 0.0, + "num_input_tokens_seen": 20844568, + "step": 34180 + }, + { + "epoch": 9.427744070601214, + "grad_norm": 6.314620577541064e-07, + "learning_rate": 4.975971232868859e-07, + "loss": 0.0, + "num_input_tokens_seen": 20846968, + "step": 34185 + }, + { + "epoch": 9.429123000551572, + "grad_norm": 1.307192974309146e-06, + "learning_rate": 4.95211041629981e-07, + "loss": 0.0, + "num_input_tokens_seen": 20850168, + "step": 34190 + }, + { + "epoch": 9.430501930501931, + "grad_norm": 5.156391580385389e-06, + "learning_rate": 4.928306373809282e-07, + "loss": 0.0, + "num_input_tokens_seen": 20853496, + "step": 34195 + }, + { + "epoch": 9.431880860452289, + "grad_norm": 9.047058711075806e-07, + "learning_rate": 4.904559110912283e-07, + "loss": 0.0, + "num_input_tokens_seen": 20856472, + "step": 34200 + }, + { + "epoch": 9.433259790402648, + "grad_norm": 1.2176294148957822e-05, + "learning_rate": 4.880868633110741e-07, + "loss": 0.0, + "num_input_tokens_seen": 20860376, + "step": 34205 + }, + { + "epoch": 9.434638720353005, + "grad_norm": 7.259299650286266e-07, + "learning_rate": 4.857234945893407e-07, + "loss": 0.0, + "num_input_tokens_seen": 20863448, + "step": 34210 + }, + { + "epoch": 9.436017650303365, + "grad_norm": 8.941033797782438e-07, + "learning_rate": 4.833658054735845e-07, + "loss": 0.0, + "num_input_tokens_seen": 20866104, + "step": 34215 + }, + { + "epoch": 9.437396580253724, + "grad_norm": 8.208053259295411e-06, + "learning_rate": 4.810137965100491e-07, + "loss": 0.0, + "num_input_tokens_seen": 20868376, + "step": 34220 + }, + { + "epoch": 9.438775510204081, + "grad_norm": 6.262540637180791e-07, + "learning_rate": 4.786674682436598e-07, + "loss": 0.0, + "num_input_tokens_seen": 20871960, + "step": 34225 + }, + { + "epoch": 9.44015444015444, + "grad_norm": 4.919481284559879e-07, + "learning_rate": 4.7632682121802886e-07, + "loss": 0.0, + "num_input_tokens_seen": 20875544, + "step": 34230 + }, + { + "epoch": 9.441533370104798, + "grad_norm": 8.523738301846606e-07, + "learning_rate": 4.73991855975453e-07, + "loss": 0.0, + "num_input_tokens_seen": 20878456, + "step": 34235 + }, + { + "epoch": 9.442912300055157, + "grad_norm": 7.221293572001741e-07, + "learning_rate": 4.716625730569052e-07, + "loss": 0.0, + "num_input_tokens_seen": 20881272, + "step": 34240 + }, + { + "epoch": 9.444291230005515, + "grad_norm": 1.3617637932838988e-06, + "learning_rate": 4.6933897300205085e-07, + "loss": 0.0, + "num_input_tokens_seen": 20884472, + "step": 34245 + }, + { + "epoch": 9.445670159955874, + "grad_norm": 6.170719188958174e-07, + "learning_rate": 4.6702105634923166e-07, + "loss": 0.0, + "num_input_tokens_seen": 20887384, + "step": 34250 + }, + { + "epoch": 9.447049089906233, + "grad_norm": 3.806286201779585e-07, + "learning_rate": 4.647088236354763e-07, + "loss": 0.0, + "num_input_tokens_seen": 20891160, + "step": 34255 + }, + { + "epoch": 9.44842801985659, + "grad_norm": 7.605502219121263e-07, + "learning_rate": 4.6240227539649793e-07, + "loss": 0.0, + "num_input_tokens_seen": 20893560, + "step": 34260 + }, + { + "epoch": 9.44980694980695, + "grad_norm": 1.0082372909892001e-06, + "learning_rate": 4.6010141216669136e-07, + "loss": 0.0, + "num_input_tokens_seen": 20895896, + "step": 34265 + }, + { + "epoch": 9.451185879757308, + "grad_norm": 9.897481731968583e-07, + "learning_rate": 4.578062344791273e-07, + "loss": 0.0, + "num_input_tokens_seen": 20898872, + "step": 34270 + }, + { + "epoch": 9.452564809707667, + "grad_norm": 1.173056625702884e-05, + "learning_rate": 4.555167428655721e-07, + "loss": 0.0, + "num_input_tokens_seen": 20901816, + "step": 34275 + }, + { + "epoch": 9.453943739658026, + "grad_norm": 3.2515142720512813e-06, + "learning_rate": 4.5323293785646816e-07, + "loss": 0.0, + "num_input_tokens_seen": 20905176, + "step": 34280 + }, + { + "epoch": 9.455322669608384, + "grad_norm": 2.6370050818513846e-06, + "learning_rate": 4.5095481998093383e-07, + "loss": 0.0, + "num_input_tokens_seen": 20909304, + "step": 34285 + }, + { + "epoch": 9.456701599558743, + "grad_norm": 4.261414972006605e-07, + "learning_rate": 4.4868238976678023e-07, + "loss": 0.0, + "num_input_tokens_seen": 20911960, + "step": 34290 + }, + { + "epoch": 9.4580805295091, + "grad_norm": 7.327763569264789e-07, + "learning_rate": 4.4641564774049736e-07, + "loss": 0.0, + "num_input_tokens_seen": 20914424, + "step": 34295 + }, + { + "epoch": 9.45945945945946, + "grad_norm": 2.683171260287054e-06, + "learning_rate": 4.441545944272568e-07, + "loss": 0.0, + "num_input_tokens_seen": 20917976, + "step": 34300 + }, + { + "epoch": 9.460838389409817, + "grad_norm": 1.1301308404654264e-06, + "learning_rate": 4.418992303509145e-07, + "loss": 0.0, + "num_input_tokens_seen": 20920920, + "step": 34305 + }, + { + "epoch": 9.462217319360176, + "grad_norm": 3.3711617106746417e-06, + "learning_rate": 4.396495560339997e-07, + "loss": 0.0, + "num_input_tokens_seen": 20926264, + "step": 34310 + }, + { + "epoch": 9.463596249310536, + "grad_norm": 5.004407057640492e-07, + "learning_rate": 4.374055719977344e-07, + "loss": 0.0, + "num_input_tokens_seen": 20930008, + "step": 34315 + }, + { + "epoch": 9.464975179260893, + "grad_norm": 1.0625029744915082e-06, + "learning_rate": 4.3516727876201924e-07, + "loss": 0.0, + "num_input_tokens_seen": 20934584, + "step": 34320 + }, + { + "epoch": 9.466354109211252, + "grad_norm": 3.9529075479549647e-07, + "learning_rate": 4.3293467684542567e-07, + "loss": 0.0, + "num_input_tokens_seen": 20939000, + "step": 34325 + }, + { + "epoch": 9.46773303916161, + "grad_norm": 3.905722678609891e-07, + "learning_rate": 4.3070776676522594e-07, + "loss": 0.0, + "num_input_tokens_seen": 20942360, + "step": 34330 + }, + { + "epoch": 9.46911196911197, + "grad_norm": 1.1310872650938109e-05, + "learning_rate": 4.284865490373574e-07, + "loss": 0.0, + "num_input_tokens_seen": 20944920, + "step": 34335 + }, + { + "epoch": 9.470490899062328, + "grad_norm": 1.0143726285605226e-06, + "learning_rate": 4.2627102417644726e-07, + "loss": 0.0, + "num_input_tokens_seen": 20948152, + "step": 34340 + }, + { + "epoch": 9.471869829012686, + "grad_norm": 6.168664299366355e-07, + "learning_rate": 4.2406119269579604e-07, + "loss": 0.0, + "num_input_tokens_seen": 20950392, + "step": 34345 + }, + { + "epoch": 9.473248758963045, + "grad_norm": 4.708909671080619e-07, + "learning_rate": 4.2185705510739415e-07, + "loss": 0.0, + "num_input_tokens_seen": 20953112, + "step": 34350 + }, + { + "epoch": 9.474627688913403, + "grad_norm": 6.044992915121838e-07, + "learning_rate": 4.1965861192190815e-07, + "loss": 0.0, + "num_input_tokens_seen": 20955896, + "step": 34355 + }, + { + "epoch": 9.476006618863762, + "grad_norm": 7.667127306376642e-07, + "learning_rate": 4.174658636486861e-07, + "loss": 0.0, + "num_input_tokens_seen": 20959640, + "step": 34360 + }, + { + "epoch": 9.47738554881412, + "grad_norm": 5.182228619560192e-07, + "learning_rate": 4.15278810795755e-07, + "loss": 0.0, + "num_input_tokens_seen": 20963384, + "step": 34365 + }, + { + "epoch": 9.478764478764479, + "grad_norm": 1.2055113529640948e-06, + "learning_rate": 4.1309745386982066e-07, + "loss": 0.0, + "num_input_tokens_seen": 20965944, + "step": 34370 + }, + { + "epoch": 9.480143408714838, + "grad_norm": 5.164356480236165e-07, + "learning_rate": 4.109217933762788e-07, + "loss": 0.0, + "num_input_tokens_seen": 20969496, + "step": 34375 + }, + { + "epoch": 9.481522338665195, + "grad_norm": 6.823176477155357e-07, + "learning_rate": 4.087518298191956e-07, + "loss": 0.0, + "num_input_tokens_seen": 20972536, + "step": 34380 + }, + { + "epoch": 9.482901268615555, + "grad_norm": 4.61857723621506e-07, + "learning_rate": 4.065875637013189e-07, + "loss": 0.0, + "num_input_tokens_seen": 20976056, + "step": 34385 + }, + { + "epoch": 9.484280198565912, + "grad_norm": 6.111098400651827e-07, + "learning_rate": 4.0442899552408097e-07, + "loss": 0.0, + "num_input_tokens_seen": 20978840, + "step": 34390 + }, + { + "epoch": 9.485659128516271, + "grad_norm": 6.771650191694789e-07, + "learning_rate": 4.0227612578758733e-07, + "loss": 0.0, + "num_input_tokens_seen": 20982264, + "step": 34395 + }, + { + "epoch": 9.48703805846663, + "grad_norm": 7.354714739449264e-07, + "learning_rate": 4.001289549906334e-07, + "loss": 0.0, + "num_input_tokens_seen": 20984632, + "step": 34400 + }, + { + "epoch": 9.488416988416988, + "grad_norm": 5.693531761608028e-07, + "learning_rate": 3.979874836306796e-07, + "loss": 0.0, + "num_input_tokens_seen": 20988504, + "step": 34405 + }, + { + "epoch": 9.489795918367347, + "grad_norm": 5.4932684179220814e-06, + "learning_rate": 3.958517122038791e-07, + "loss": 0.0, + "num_input_tokens_seen": 20991160, + "step": 34410 + }, + { + "epoch": 9.491174848317705, + "grad_norm": 2.2336214442475466e-06, + "learning_rate": 3.9372164120506094e-07, + "loss": 0.0, + "num_input_tokens_seen": 20995576, + "step": 34415 + }, + { + "epoch": 9.492553778268064, + "grad_norm": 9.23229492855171e-07, + "learning_rate": 3.915972711277277e-07, + "loss": 0.0, + "num_input_tokens_seen": 20998200, + "step": 34420 + }, + { + "epoch": 9.493932708218422, + "grad_norm": 5.566444087889977e-07, + "learning_rate": 3.894786024640662e-07, + "loss": 0.0, + "num_input_tokens_seen": 21001560, + "step": 34425 + }, + { + "epoch": 9.495311638168781, + "grad_norm": 1.5076451518325484e-06, + "learning_rate": 3.8736563570494487e-07, + "loss": 0.0, + "num_input_tokens_seen": 21005400, + "step": 34430 + }, + { + "epoch": 9.49669056811914, + "grad_norm": 8.922031702240929e-05, + "learning_rate": 3.8525837133990274e-07, + "loss": 0.0, + "num_input_tokens_seen": 21008504, + "step": 34435 + }, + { + "epoch": 9.498069498069498, + "grad_norm": 5.583599431702169e-07, + "learning_rate": 3.831568098571686e-07, + "loss": 0.0, + "num_input_tokens_seen": 21010904, + "step": 34440 + }, + { + "epoch": 9.499448428019857, + "grad_norm": 6.163963917060755e-07, + "learning_rate": 3.810609517436392e-07, + "loss": 0.0, + "num_input_tokens_seen": 21013752, + "step": 34445 + }, + { + "epoch": 9.5, + "eval_loss": 0.39586055278778076, + "eval_runtime": 28.484, + "eval_samples_per_second": 56.593, + "eval_steps_per_second": 14.148, + "num_input_tokens_seen": 21014840, + "step": 34447 + }, + { + "epoch": 9.500827357970214, + "grad_norm": 9.596466952643823e-06, + "learning_rate": 3.789707974848983e-07, + "loss": 0.0, + "num_input_tokens_seen": 21016280, + "step": 34450 + }, + { + "epoch": 9.502206287920574, + "grad_norm": 6.84080532664666e-07, + "learning_rate": 3.768863475652029e-07, + "loss": 0.0, + "num_input_tokens_seen": 21020280, + "step": 34455 + }, + { + "epoch": 9.503585217870931, + "grad_norm": 6.567393029399682e-07, + "learning_rate": 3.74807602467489e-07, + "loss": 0.0, + "num_input_tokens_seen": 21023224, + "step": 34460 + }, + { + "epoch": 9.50496414782129, + "grad_norm": 5.929318831476849e-07, + "learning_rate": 3.7273456267337415e-07, + "loss": 0.0, + "num_input_tokens_seen": 21025432, + "step": 34465 + }, + { + "epoch": 9.50634307777165, + "grad_norm": 2.8092235879739746e-06, + "learning_rate": 3.7066722866315186e-07, + "loss": 0.0, + "num_input_tokens_seen": 21027480, + "step": 34470 + }, + { + "epoch": 9.507722007722007, + "grad_norm": 9.702512761577964e-05, + "learning_rate": 3.686056009157918e-07, + "loss": 0.0, + "num_input_tokens_seen": 21030392, + "step": 34475 + }, + { + "epoch": 9.509100937672367, + "grad_norm": 5.367818084778264e-05, + "learning_rate": 3.665496799089452e-07, + "loss": 0.0, + "num_input_tokens_seen": 21033336, + "step": 34480 + }, + { + "epoch": 9.510479867622724, + "grad_norm": 6.390228008967824e-06, + "learning_rate": 3.644994661189366e-07, + "loss": 0.0, + "num_input_tokens_seen": 21036312, + "step": 34485 + }, + { + "epoch": 9.511858797573083, + "grad_norm": 6.736890441061405e-07, + "learning_rate": 3.624549600207749e-07, + "loss": 0.0, + "num_input_tokens_seen": 21039416, + "step": 34490 + }, + { + "epoch": 9.513237727523443, + "grad_norm": 1.10143980691646e-06, + "learning_rate": 3.6041616208814246e-07, + "loss": 0.0, + "num_input_tokens_seen": 21043864, + "step": 34495 + }, + { + "epoch": 9.5146166574738, + "grad_norm": 4.441150736056443e-07, + "learning_rate": 3.583830727933973e-07, + "loss": 0.0, + "num_input_tokens_seen": 21045944, + "step": 34500 + }, + { + "epoch": 9.51599558742416, + "grad_norm": 3.866467750412994e-07, + "learning_rate": 3.563556926075767e-07, + "loss": 0.0, + "num_input_tokens_seen": 21048600, + "step": 34505 + }, + { + "epoch": 9.517374517374517, + "grad_norm": 8.683539363119053e-07, + "learning_rate": 3.543340220003993e-07, + "loss": 0.0, + "num_input_tokens_seen": 21051640, + "step": 34510 + }, + { + "epoch": 9.518753447324876, + "grad_norm": 5.012384463043418e-07, + "learning_rate": 3.523180614402516e-07, + "loss": 0.0, + "num_input_tokens_seen": 21054456, + "step": 34515 + }, + { + "epoch": 9.520132377275235, + "grad_norm": 1.1229861911488115e-06, + "learning_rate": 3.503078113942071e-07, + "loss": 0.0, + "num_input_tokens_seen": 21059128, + "step": 34520 + }, + { + "epoch": 9.521511307225593, + "grad_norm": 4.63696915176115e-06, + "learning_rate": 3.4830327232801274e-07, + "loss": 0.0, + "num_input_tokens_seen": 21062840, + "step": 34525 + }, + { + "epoch": 9.522890237175952, + "grad_norm": 8.185540536942426e-06, + "learning_rate": 3.463044447060887e-07, + "loss": 0.0, + "num_input_tokens_seen": 21066680, + "step": 34530 + }, + { + "epoch": 9.52426916712631, + "grad_norm": 6.965974534978159e-07, + "learning_rate": 3.4431132899153395e-07, + "loss": 0.0, + "num_input_tokens_seen": 21069656, + "step": 34535 + }, + { + "epoch": 9.525648097076669, + "grad_norm": 4.474032380130666e-07, + "learning_rate": 3.423239256461264e-07, + "loss": 0.0, + "num_input_tokens_seen": 21073496, + "step": 34540 + }, + { + "epoch": 9.527027027027026, + "grad_norm": 8.713201395949e-07, + "learning_rate": 3.4034223513031983e-07, + "loss": 0.0, + "num_input_tokens_seen": 21076056, + "step": 34545 + }, + { + "epoch": 9.528405956977386, + "grad_norm": 5.37091807473189e-07, + "learning_rate": 3.383662579032415e-07, + "loss": 0.0, + "num_input_tokens_seen": 21079480, + "step": 34550 + }, + { + "epoch": 9.529784886927745, + "grad_norm": 2.6638169856596505e-06, + "learning_rate": 3.3639599442269744e-07, + "loss": 0.0, + "num_input_tokens_seen": 21081912, + "step": 34555 + }, + { + "epoch": 9.531163816878102, + "grad_norm": 3.5277474808026454e-07, + "learning_rate": 3.3443144514516965e-07, + "loss": 0.0, + "num_input_tokens_seen": 21085112, + "step": 34560 + }, + { + "epoch": 9.532542746828462, + "grad_norm": 5.307039145918679e-07, + "learning_rate": 3.3247261052581635e-07, + "loss": 0.0, + "num_input_tokens_seen": 21087576, + "step": 34565 + }, + { + "epoch": 9.533921676778819, + "grad_norm": 6.595732884306926e-07, + "learning_rate": 3.305194910184717e-07, + "loss": 0.0, + "num_input_tokens_seen": 21091384, + "step": 34570 + }, + { + "epoch": 9.535300606729178, + "grad_norm": 8.279224061880086e-07, + "learning_rate": 3.2857208707564315e-07, + "loss": 0.0, + "num_input_tokens_seen": 21094328, + "step": 34575 + }, + { + "epoch": 9.536679536679536, + "grad_norm": 4.877950686932309e-06, + "learning_rate": 3.26630399148517e-07, + "loss": 0.0, + "num_input_tokens_seen": 21097400, + "step": 34580 + }, + { + "epoch": 9.538058466629895, + "grad_norm": 1.7050385849870509e-06, + "learning_rate": 3.2469442768695843e-07, + "loss": 0.0, + "num_input_tokens_seen": 21100408, + "step": 34585 + }, + { + "epoch": 9.539437396580254, + "grad_norm": 1.971808887901716e-05, + "learning_rate": 3.2276417313950035e-07, + "loss": 0.0, + "num_input_tokens_seen": 21103224, + "step": 34590 + }, + { + "epoch": 9.540816326530612, + "grad_norm": 6.787279858144757e-07, + "learning_rate": 3.208396359533572e-07, + "loss": 0.0, + "num_input_tokens_seen": 21105912, + "step": 34595 + }, + { + "epoch": 9.542195256480971, + "grad_norm": 7.505893790948903e-07, + "learning_rate": 3.1892081657441396e-07, + "loss": 0.0, + "num_input_tokens_seen": 21108920, + "step": 34600 + }, + { + "epoch": 9.543574186431329, + "grad_norm": 2.3239840629685204e-06, + "learning_rate": 3.1700771544723453e-07, + "loss": 0.0, + "num_input_tokens_seen": 21112376, + "step": 34605 + }, + { + "epoch": 9.544953116381688, + "grad_norm": 9.539958227833267e-07, + "learning_rate": 3.151003330150587e-07, + "loss": 0.0, + "num_input_tokens_seen": 21116600, + "step": 34610 + }, + { + "epoch": 9.546332046332047, + "grad_norm": 7.046206746963435e-07, + "learning_rate": 3.1319866971979697e-07, + "loss": 0.0, + "num_input_tokens_seen": 21119864, + "step": 34615 + }, + { + "epoch": 9.547710976282405, + "grad_norm": 5.4470319810207e-07, + "learning_rate": 3.113027260020412e-07, + "loss": 0.0, + "num_input_tokens_seen": 21122904, + "step": 34620 + }, + { + "epoch": 9.549089906232764, + "grad_norm": 2.033907321674633e-06, + "learning_rate": 3.0941250230104857e-07, + "loss": 0.0, + "num_input_tokens_seen": 21126456, + "step": 34625 + }, + { + "epoch": 9.550468836183121, + "grad_norm": 9.478880542701518e-07, + "learning_rate": 3.0752799905476025e-07, + "loss": 0.0, + "num_input_tokens_seen": 21129816, + "step": 34630 + }, + { + "epoch": 9.55184776613348, + "grad_norm": 2.063593683487852e-06, + "learning_rate": 3.05649216699791e-07, + "loss": 0.0, + "num_input_tokens_seen": 21132600, + "step": 34635 + }, + { + "epoch": 9.553226696083838, + "grad_norm": 1.053177697940555e-06, + "learning_rate": 3.03776155671423e-07, + "loss": 0.0, + "num_input_tokens_seen": 21134744, + "step": 34640 + }, + { + "epoch": 9.554605626034197, + "grad_norm": 2.4734847556828754e-06, + "learning_rate": 3.0190881640362044e-07, + "loss": 0.0, + "num_input_tokens_seen": 21137464, + "step": 34645 + }, + { + "epoch": 9.555984555984557, + "grad_norm": 1.4298803989731823e-06, + "learning_rate": 3.000471993290177e-07, + "loss": 0.0, + "num_input_tokens_seen": 21140120, + "step": 34650 + }, + { + "epoch": 9.557363485934914, + "grad_norm": 1.0034518709289841e-06, + "learning_rate": 2.9819130487892534e-07, + "loss": 0.0, + "num_input_tokens_seen": 21144824, + "step": 34655 + }, + { + "epoch": 9.558742415885273, + "grad_norm": 3.8480177977362473e-07, + "learning_rate": 2.963411334833299e-07, + "loss": 0.0, + "num_input_tokens_seen": 21147672, + "step": 34660 + }, + { + "epoch": 9.560121345835631, + "grad_norm": 5.556209998758277e-07, + "learning_rate": 2.9449668557088304e-07, + "loss": 0.0, + "num_input_tokens_seen": 21151576, + "step": 34665 + }, + { + "epoch": 9.56150027578599, + "grad_norm": 4.238226836150716e-07, + "learning_rate": 2.9265796156892343e-07, + "loss": 0.0, + "num_input_tokens_seen": 21154968, + "step": 34670 + }, + { + "epoch": 9.56287920573635, + "grad_norm": 7.093617000464292e-07, + "learning_rate": 2.9082496190345477e-07, + "loss": 0.0, + "num_input_tokens_seen": 21158104, + "step": 34675 + }, + { + "epoch": 9.564258135686707, + "grad_norm": 6.77117611758149e-07, + "learning_rate": 2.8899768699915676e-07, + "loss": 0.0, + "num_input_tokens_seen": 21161144, + "step": 34680 + }, + { + "epoch": 9.565637065637066, + "grad_norm": 6.058801886865695e-07, + "learning_rate": 2.871761372793824e-07, + "loss": 0.0, + "num_input_tokens_seen": 21163320, + "step": 34685 + }, + { + "epoch": 9.567015995587424, + "grad_norm": 7.242422270792304e-07, + "learning_rate": 2.853603131661581e-07, + "loss": 0.0, + "num_input_tokens_seen": 21166840, + "step": 34690 + }, + { + "epoch": 9.568394925537783, + "grad_norm": 1.6541849845452816e-06, + "learning_rate": 2.835502150801833e-07, + "loss": 0.0, + "num_input_tokens_seen": 21169144, + "step": 34695 + }, + { + "epoch": 9.56977385548814, + "grad_norm": 2.1643693344230996e-06, + "learning_rate": 2.8174584344083655e-07, + "loss": 0.0, + "num_input_tokens_seen": 21172024, + "step": 34700 + }, + { + "epoch": 9.5711527854385, + "grad_norm": 4.956471570949361e-07, + "learning_rate": 2.799471986661584e-07, + "loss": 0.0, + "num_input_tokens_seen": 21175416, + "step": 34705 + }, + { + "epoch": 9.572531715388859, + "grad_norm": 6.970524850657966e-07, + "learning_rate": 2.7815428117287376e-07, + "loss": 0.0, + "num_input_tokens_seen": 21178072, + "step": 34710 + }, + { + "epoch": 9.573910645339216, + "grad_norm": 5.581102868745802e-07, + "learning_rate": 2.763670913763755e-07, + "loss": 0.0, + "num_input_tokens_seen": 21181112, + "step": 34715 + }, + { + "epoch": 9.575289575289576, + "grad_norm": 8.057889431256626e-07, + "learning_rate": 2.745856296907268e-07, + "loss": 0.0, + "num_input_tokens_seen": 21183544, + "step": 34720 + }, + { + "epoch": 9.576668505239933, + "grad_norm": 8.676994980305608e-07, + "learning_rate": 2.728098965286724e-07, + "loss": 0.0, + "num_input_tokens_seen": 21186008, + "step": 34725 + }, + { + "epoch": 9.578047435190292, + "grad_norm": 6.792516273890215e-07, + "learning_rate": 2.7103989230161663e-07, + "loss": 0.0, + "num_input_tokens_seen": 21189240, + "step": 34730 + }, + { + "epoch": 9.579426365140652, + "grad_norm": 9.553900781611446e-07, + "learning_rate": 2.6927561741964815e-07, + "loss": 0.0, + "num_input_tokens_seen": 21193368, + "step": 34735 + }, + { + "epoch": 9.58080529509101, + "grad_norm": 0.0003314657078590244, + "learning_rate": 2.67517072291526e-07, + "loss": 0.0, + "num_input_tokens_seen": 21196024, + "step": 34740 + }, + { + "epoch": 9.582184225041368, + "grad_norm": 5.3857422699366e-07, + "learning_rate": 2.6576425732467715e-07, + "loss": 0.0, + "num_input_tokens_seen": 21198744, + "step": 34745 + }, + { + "epoch": 9.583563154991726, + "grad_norm": 5.579840376412903e-07, + "learning_rate": 2.6401717292520447e-07, + "loss": 0.0, + "num_input_tokens_seen": 21201496, + "step": 34750 + }, + { + "epoch": 9.584942084942085, + "grad_norm": 5.189708076613897e-07, + "learning_rate": 2.622758194978814e-07, + "loss": 0.0, + "num_input_tokens_seen": 21204888, + "step": 34755 + }, + { + "epoch": 9.586321014892443, + "grad_norm": 7.657336027477868e-07, + "learning_rate": 2.6054019744615465e-07, + "loss": 0.0, + "num_input_tokens_seen": 21207640, + "step": 34760 + }, + { + "epoch": 9.587699944842802, + "grad_norm": 1.5080350976859336e-06, + "learning_rate": 2.588103071721443e-07, + "loss": 0.0, + "num_input_tokens_seen": 21211640, + "step": 34765 + }, + { + "epoch": 9.589078874793161, + "grad_norm": 6.728477728756843e-06, + "learning_rate": 2.57086149076638e-07, + "loss": 0.0, + "num_input_tokens_seen": 21214168, + "step": 34770 + }, + { + "epoch": 9.590457804743519, + "grad_norm": 8.93407445801131e-07, + "learning_rate": 2.553677235591051e-07, + "loss": 0.0, + "num_input_tokens_seen": 21216888, + "step": 34775 + }, + { + "epoch": 9.591836734693878, + "grad_norm": 6.505557848868193e-06, + "learning_rate": 2.5365503101767163e-07, + "loss": 0.0, + "num_input_tokens_seen": 21219256, + "step": 34780 + }, + { + "epoch": 9.593215664644235, + "grad_norm": 1.179503556159034e-06, + "learning_rate": 2.519480718491507e-07, + "loss": 0.0, + "num_input_tokens_seen": 21222232, + "step": 34785 + }, + { + "epoch": 9.594594594594595, + "grad_norm": 6.078907972550951e-06, + "learning_rate": 2.502468464490176e-07, + "loss": 0.0, + "num_input_tokens_seen": 21224664, + "step": 34790 + }, + { + "epoch": 9.595973524544952, + "grad_norm": 5.977279329272278e-07, + "learning_rate": 2.4855135521142104e-07, + "loss": 0.0, + "num_input_tokens_seen": 21227128, + "step": 34795 + }, + { + "epoch": 9.597352454495311, + "grad_norm": 0.00010492922592675313, + "learning_rate": 2.4686159852918e-07, + "loss": 0.0, + "num_input_tokens_seen": 21229336, + "step": 34800 + }, + { + "epoch": 9.59873138444567, + "grad_norm": 1.0832452517206548e-06, + "learning_rate": 2.4517757679379526e-07, + "loss": 0.0, + "num_input_tokens_seen": 21231576, + "step": 34805 + }, + { + "epoch": 9.600110314396028, + "grad_norm": 6.979623776715016e-07, + "learning_rate": 2.434992903954214e-07, + "loss": 0.0, + "num_input_tokens_seen": 21235064, + "step": 34810 + }, + { + "epoch": 9.601489244346388, + "grad_norm": 6.359042004078219e-07, + "learning_rate": 2.418267397228974e-07, + "loss": 0.0, + "num_input_tokens_seen": 21238040, + "step": 34815 + }, + { + "epoch": 9.602868174296745, + "grad_norm": 4.782546056958381e-07, + "learning_rate": 2.401599251637271e-07, + "loss": 0.0, + "num_input_tokens_seen": 21240120, + "step": 34820 + }, + { + "epoch": 9.604247104247104, + "grad_norm": 1.071035740096704e-06, + "learning_rate": 2.3849884710409053e-07, + "loss": 0.0, + "num_input_tokens_seen": 21243256, + "step": 34825 + }, + { + "epoch": 9.605626034197464, + "grad_norm": 2.1979085431667045e-06, + "learning_rate": 2.3684350592883542e-07, + "loss": 0.0, + "num_input_tokens_seen": 21245816, + "step": 34830 + }, + { + "epoch": 9.607004964147821, + "grad_norm": 9.861061016636086e-07, + "learning_rate": 2.3519390202147718e-07, + "loss": 0.0, + "num_input_tokens_seen": 21250072, + "step": 34835 + }, + { + "epoch": 9.60838389409818, + "grad_norm": 1.1433666031734901e-06, + "learning_rate": 2.335500357642073e-07, + "loss": 0.0, + "num_input_tokens_seen": 21252664, + "step": 34840 + }, + { + "epoch": 9.609762824048538, + "grad_norm": 6.678879458377196e-07, + "learning_rate": 2.3191190753788784e-07, + "loss": 0.0, + "num_input_tokens_seen": 21255352, + "step": 34845 + }, + { + "epoch": 9.611141753998897, + "grad_norm": 6.566428783116862e-05, + "learning_rate": 2.302795177220457e-07, + "loss": 0.0, + "num_input_tokens_seen": 21259480, + "step": 34850 + }, + { + "epoch": 9.612520683949256, + "grad_norm": 7.034886380097305e-07, + "learning_rate": 2.2865286669488396e-07, + "loss": 0.0, + "num_input_tokens_seen": 21262264, + "step": 34855 + }, + { + "epoch": 9.613899613899614, + "grad_norm": 9.429157898921403e-07, + "learning_rate": 2.270319548332761e-07, + "loss": 0.0, + "num_input_tokens_seen": 21264440, + "step": 34860 + }, + { + "epoch": 9.615278543849973, + "grad_norm": 6.389363420566951e-07, + "learning_rate": 2.2541678251276343e-07, + "loss": 0.0, + "num_input_tokens_seen": 21266840, + "step": 34865 + }, + { + "epoch": 9.61665747380033, + "grad_norm": 1.5229429664032068e-06, + "learning_rate": 2.238073501075577e-07, + "loss": 0.0, + "num_input_tokens_seen": 21268792, + "step": 34870 + }, + { + "epoch": 9.61803640375069, + "grad_norm": 5.855354174855165e-07, + "learning_rate": 2.2220365799053843e-07, + "loss": 0.0, + "num_input_tokens_seen": 21271896, + "step": 34875 + }, + { + "epoch": 9.619415333701047, + "grad_norm": 7.570176876470214e-07, + "learning_rate": 2.206057065332612e-07, + "loss": 0.0, + "num_input_tokens_seen": 21274968, + "step": 34880 + }, + { + "epoch": 9.620794263651407, + "grad_norm": 4.550097401079256e-07, + "learning_rate": 2.1901349610594924e-07, + "loss": 0.0, + "num_input_tokens_seen": 21278072, + "step": 34885 + }, + { + "epoch": 9.622173193601766, + "grad_norm": 4.3580868691606156e-07, + "learning_rate": 2.174270270774964e-07, + "loss": 0.0, + "num_input_tokens_seen": 21281144, + "step": 34890 + }, + { + "epoch": 9.623552123552123, + "grad_norm": 6.359622375384788e-07, + "learning_rate": 2.158462998154559e-07, + "loss": 0.0, + "num_input_tokens_seen": 21284472, + "step": 34895 + }, + { + "epoch": 9.624931053502483, + "grad_norm": 9.612292615202023e-07, + "learning_rate": 2.1427131468607088e-07, + "loss": 0.0, + "num_input_tokens_seen": 21287896, + "step": 34900 + }, + { + "epoch": 9.62630998345284, + "grad_norm": 5.331920078788244e-07, + "learning_rate": 2.1270207205423554e-07, + "loss": 0.0, + "num_input_tokens_seen": 21290040, + "step": 34905 + }, + { + "epoch": 9.6276889134032, + "grad_norm": 6.0031004522898e-07, + "learning_rate": 2.111385722835202e-07, + "loss": 0.0, + "num_input_tokens_seen": 21292920, + "step": 34910 + }, + { + "epoch": 9.629067843353557, + "grad_norm": 1.0799780056913733e-06, + "learning_rate": 2.0958081573617118e-07, + "loss": 0.0, + "num_input_tokens_seen": 21295992, + "step": 34915 + }, + { + "epoch": 9.630446773303916, + "grad_norm": 6.651089847764524e-07, + "learning_rate": 2.0802880277309146e-07, + "loss": 0.0, + "num_input_tokens_seen": 21298680, + "step": 34920 + }, + { + "epoch": 9.631825703254275, + "grad_norm": 4.990276920580072e-07, + "learning_rate": 2.064825337538656e-07, + "loss": 0.0, + "num_input_tokens_seen": 21301688, + "step": 34925 + }, + { + "epoch": 9.633204633204633, + "grad_norm": 1.1132027793792076e-05, + "learning_rate": 2.0494200903674044e-07, + "loss": 0.0, + "num_input_tokens_seen": 21305144, + "step": 34930 + }, + { + "epoch": 9.634583563154992, + "grad_norm": 7.784057379467413e-05, + "learning_rate": 2.0340722897863317e-07, + "loss": 0.0, + "num_input_tokens_seen": 21307576, + "step": 34935 + }, + { + "epoch": 9.63596249310535, + "grad_norm": 1.1049152135456097e-06, + "learning_rate": 2.0187819393513164e-07, + "loss": 0.0, + "num_input_tokens_seen": 21309912, + "step": 34940 + }, + { + "epoch": 9.637341423055709, + "grad_norm": 8.330801506417629e-07, + "learning_rate": 2.0035490426048854e-07, + "loss": 0.0, + "num_input_tokens_seen": 21313272, + "step": 34945 + }, + { + "epoch": 9.638720353006068, + "grad_norm": 4.019707944280526e-07, + "learning_rate": 1.988373603076299e-07, + "loss": 0.0, + "num_input_tokens_seen": 21316216, + "step": 34950 + }, + { + "epoch": 9.640099282956426, + "grad_norm": 5.035908543504775e-07, + "learning_rate": 1.973255624281467e-07, + "loss": 0.0, + "num_input_tokens_seen": 21319832, + "step": 34955 + }, + { + "epoch": 9.641478212906785, + "grad_norm": 2.7454746032162802e-06, + "learning_rate": 1.95819510972306e-07, + "loss": 0.0, + "num_input_tokens_seen": 21323032, + "step": 34960 + }, + { + "epoch": 9.642857142857142, + "grad_norm": 7.46418720609654e-07, + "learning_rate": 1.9431920628903145e-07, + "loss": 0.0, + "num_input_tokens_seen": 21326264, + "step": 34965 + }, + { + "epoch": 9.644236072807502, + "grad_norm": 7.457995252480032e-07, + "learning_rate": 1.928246487259283e-07, + "loss": 0.0, + "num_input_tokens_seen": 21328920, + "step": 34970 + }, + { + "epoch": 9.645615002757859, + "grad_norm": 6.542815071952646e-07, + "learning_rate": 1.913358386292613e-07, + "loss": 0.0, + "num_input_tokens_seen": 21332056, + "step": 34975 + }, + { + "epoch": 9.646993932708218, + "grad_norm": 6.121113074186724e-07, + "learning_rate": 1.8985277634396558e-07, + "loss": 0.0, + "num_input_tokens_seen": 21335224, + "step": 34980 + }, + { + "epoch": 9.648372862658578, + "grad_norm": 1.0401483905297937e-06, + "learning_rate": 1.883754622136441e-07, + "loss": 0.0, + "num_input_tokens_seen": 21337464, + "step": 34985 + }, + { + "epoch": 9.649751792608935, + "grad_norm": 5.139012841937074e-07, + "learning_rate": 1.8690389658057582e-07, + "loss": 0.0, + "num_input_tokens_seen": 21340088, + "step": 34990 + }, + { + "epoch": 9.651130722559294, + "grad_norm": 5.518126044989913e-07, + "learning_rate": 1.8543807978569362e-07, + "loss": 0.0, + "num_input_tokens_seen": 21342648, + "step": 34995 + }, + { + "epoch": 9.652509652509652, + "grad_norm": 8.658812475914601e-07, + "learning_rate": 1.8397801216861189e-07, + "loss": 0.0, + "num_input_tokens_seen": 21345368, + "step": 35000 + }, + { + "epoch": 9.653888582460011, + "grad_norm": 3.45014086633455e-06, + "learning_rate": 1.825236940676045e-07, + "loss": 0.0, + "num_input_tokens_seen": 21347928, + "step": 35005 + }, + { + "epoch": 9.655267512410369, + "grad_norm": 4.4461833681452845e-07, + "learning_rate": 1.8107512581961584e-07, + "loss": 0.0, + "num_input_tokens_seen": 21350360, + "step": 35010 + }, + { + "epoch": 9.656646442360728, + "grad_norm": 4.90003799313854e-07, + "learning_rate": 1.7963230776025797e-07, + "loss": 0.0, + "num_input_tokens_seen": 21353176, + "step": 35015 + }, + { + "epoch": 9.658025372311087, + "grad_norm": 6.365081048897991e-07, + "learning_rate": 1.781952402238163e-07, + "loss": 0.0, + "num_input_tokens_seen": 21356152, + "step": 35020 + }, + { + "epoch": 9.659404302261445, + "grad_norm": 9.011450856633019e-06, + "learning_rate": 1.767639235432328e-07, + "loss": 0.0, + "num_input_tokens_seen": 21359160, + "step": 35025 + }, + { + "epoch": 9.660783232211804, + "grad_norm": 8.40530265122652e-06, + "learning_rate": 1.7533835805012556e-07, + "loss": 0.0, + "num_input_tokens_seen": 21362232, + "step": 35030 + }, + { + "epoch": 9.662162162162161, + "grad_norm": 6.7727351051871665e-06, + "learning_rate": 1.7391854407477482e-07, + "loss": 0.0, + "num_input_tokens_seen": 21365816, + "step": 35035 + }, + { + "epoch": 9.66354109211252, + "grad_norm": 5.14198347900674e-07, + "learning_rate": 1.7250448194613688e-07, + "loss": 0.0, + "num_input_tokens_seen": 21368120, + "step": 35040 + }, + { + "epoch": 9.66492002206288, + "grad_norm": 1.335853994532954e-06, + "learning_rate": 1.710961719918247e-07, + "loss": 0.0, + "num_input_tokens_seen": 21371352, + "step": 35045 + }, + { + "epoch": 9.666298952013237, + "grad_norm": 6.251233912735188e-07, + "learning_rate": 1.696936145381245e-07, + "loss": 0.0, + "num_input_tokens_seen": 21374520, + "step": 35050 + }, + { + "epoch": 9.667677881963597, + "grad_norm": 4.5431106627802365e-06, + "learning_rate": 1.682968099099874e-07, + "loss": 0.0, + "num_input_tokens_seen": 21378744, + "step": 35055 + }, + { + "epoch": 9.669056811913954, + "grad_norm": 1.7830341221269919e-06, + "learning_rate": 1.6690575843103796e-07, + "loss": 0.0, + "num_input_tokens_seen": 21381112, + "step": 35060 + }, + { + "epoch": 9.670435741864313, + "grad_norm": 1.750660430843709e-06, + "learning_rate": 1.6552046042355717e-07, + "loss": 0.0, + "num_input_tokens_seen": 21383544, + "step": 35065 + }, + { + "epoch": 9.671814671814673, + "grad_norm": 1.0863952866202453e-06, + "learning_rate": 1.6414091620850224e-07, + "loss": 0.0, + "num_input_tokens_seen": 21385912, + "step": 35070 + }, + { + "epoch": 9.67319360176503, + "grad_norm": 1.3523091411116184e-06, + "learning_rate": 1.627671261054925e-07, + "loss": 0.0, + "num_input_tokens_seen": 21388440, + "step": 35075 + }, + { + "epoch": 9.67457253171539, + "grad_norm": 5.435573484646739e-07, + "learning_rate": 1.6139909043281775e-07, + "loss": 0.0, + "num_input_tokens_seen": 21391608, + "step": 35080 + }, + { + "epoch": 9.675951461665747, + "grad_norm": 5.223258199293923e-07, + "learning_rate": 1.6003680950742728e-07, + "loss": 0.0, + "num_input_tokens_seen": 21394328, + "step": 35085 + }, + { + "epoch": 9.677330391616106, + "grad_norm": 7.432064421664109e-07, + "learning_rate": 1.5868028364494357e-07, + "loss": 0.0, + "num_input_tokens_seen": 21397272, + "step": 35090 + }, + { + "epoch": 9.678709321566464, + "grad_norm": 7.197027684924251e-07, + "learning_rate": 1.573295131596597e-07, + "loss": 0.0, + "num_input_tokens_seen": 21400728, + "step": 35095 + }, + { + "epoch": 9.680088251516823, + "grad_norm": 3.902801495314634e-07, + "learning_rate": 1.5598449836452257e-07, + "loss": 0.0, + "num_input_tokens_seen": 21403512, + "step": 35100 + }, + { + "epoch": 9.681467181467182, + "grad_norm": 3.958000434067799e-07, + "learning_rate": 1.5464523957115507e-07, + "loss": 0.0, + "num_input_tokens_seen": 21406168, + "step": 35105 + }, + { + "epoch": 9.68284611141754, + "grad_norm": 1.0096516689372947e-06, + "learning_rate": 1.5331173708984514e-07, + "loss": 0.0, + "num_input_tokens_seen": 21409688, + "step": 35110 + }, + { + "epoch": 9.684225041367899, + "grad_norm": 1.0370141581006465e-06, + "learning_rate": 1.5198399122954843e-07, + "loss": 0.0, + "num_input_tokens_seen": 21412248, + "step": 35115 + }, + { + "epoch": 9.685603971318256, + "grad_norm": 4.495763050726964e-07, + "learning_rate": 1.5066200229787998e-07, + "loss": 0.0, + "num_input_tokens_seen": 21415160, + "step": 35120 + }, + { + "epoch": 9.686982901268616, + "grad_norm": 8.08201093605021e-07, + "learning_rate": 1.493457706011281e-07, + "loss": 0.0, + "num_input_tokens_seen": 21419800, + "step": 35125 + }, + { + "epoch": 9.688361831218973, + "grad_norm": 6.475677878370334e-07, + "learning_rate": 1.480352964442433e-07, + "loss": 0.0, + "num_input_tokens_seen": 21422712, + "step": 35130 + }, + { + "epoch": 9.689740761169332, + "grad_norm": 8.298628017655574e-06, + "learning_rate": 1.467305801308466e-07, + "loss": 0.0, + "num_input_tokens_seen": 21425336, + "step": 35135 + }, + { + "epoch": 9.691119691119692, + "grad_norm": 1.1474681969048106e-06, + "learning_rate": 1.4543162196321847e-07, + "loss": 0.0, + "num_input_tokens_seen": 21429240, + "step": 35140 + }, + { + "epoch": 9.69249862107005, + "grad_norm": 7.17357465873647e-07, + "learning_rate": 1.4413842224231255e-07, + "loss": 0.0, + "num_input_tokens_seen": 21432376, + "step": 35145 + }, + { + "epoch": 9.693877551020408, + "grad_norm": 2.340792889299337e-06, + "learning_rate": 1.42850981267742e-07, + "loss": 0.0, + "num_input_tokens_seen": 21435384, + "step": 35150 + }, + { + "epoch": 9.695256480970766, + "grad_norm": 7.638588613190223e-06, + "learning_rate": 1.4156929933778762e-07, + "loss": 0.0, + "num_input_tokens_seen": 21438744, + "step": 35155 + }, + { + "epoch": 9.696635410921125, + "grad_norm": 5.51088248812448e-07, + "learning_rate": 1.402933767494008e-07, + "loss": 0.0, + "num_input_tokens_seen": 21442296, + "step": 35160 + }, + { + "epoch": 9.698014340871485, + "grad_norm": 7.228927643154748e-07, + "learning_rate": 1.390232137981895e-07, + "loss": 0.0, + "num_input_tokens_seen": 21445688, + "step": 35165 + }, + { + "epoch": 9.699393270821842, + "grad_norm": 6.649763690802502e-07, + "learning_rate": 1.3775881077843777e-07, + "loss": 0.0, + "num_input_tokens_seen": 21449336, + "step": 35170 + }, + { + "epoch": 9.700772200772201, + "grad_norm": 4.333329854944168e-07, + "learning_rate": 1.3650016798308352e-07, + "loss": 0.0, + "num_input_tokens_seen": 21452568, + "step": 35175 + }, + { + "epoch": 9.702151130722559, + "grad_norm": 7.216465292003704e-07, + "learning_rate": 1.3524728570374067e-07, + "loss": 0.0, + "num_input_tokens_seen": 21456024, + "step": 35180 + }, + { + "epoch": 9.703530060672918, + "grad_norm": 6.207055776030757e-07, + "learning_rate": 1.3400016423068262e-07, + "loss": 0.0, + "num_input_tokens_seen": 21458808, + "step": 35185 + }, + { + "epoch": 9.704908990623277, + "grad_norm": 5.233527531345317e-07, + "learning_rate": 1.3275880385284767e-07, + "loss": 0.0, + "num_input_tokens_seen": 21461368, + "step": 35190 + }, + { + "epoch": 9.706287920573635, + "grad_norm": 6.461211228270258e-07, + "learning_rate": 1.3152320485784464e-07, + "loss": 0.0, + "num_input_tokens_seen": 21463736, + "step": 35195 + }, + { + "epoch": 9.707666850523994, + "grad_norm": 9.805602303458727e-07, + "learning_rate": 1.3029336753194177e-07, + "loss": 0.0, + "num_input_tokens_seen": 21466488, + "step": 35200 + }, + { + "epoch": 9.709045780474352, + "grad_norm": 7.794883458700497e-07, + "learning_rate": 1.2906929216007502e-07, + "loss": 0.0, + "num_input_tokens_seen": 21469560, + "step": 35205 + }, + { + "epoch": 9.71042471042471, + "grad_norm": 2.9637278203153983e-05, + "learning_rate": 1.2785097902584808e-07, + "loss": 0.0, + "num_input_tokens_seen": 21472088, + "step": 35210 + }, + { + "epoch": 9.711803640375068, + "grad_norm": 2.8527690574264852e-06, + "learning_rate": 1.2663842841152129e-07, + "loss": 0.0, + "num_input_tokens_seen": 21475160, + "step": 35215 + }, + { + "epoch": 9.713182570325428, + "grad_norm": 6.799150469305459e-07, + "learning_rate": 1.2543164059802826e-07, + "loss": 0.0, + "num_input_tokens_seen": 21477464, + "step": 35220 + }, + { + "epoch": 9.714561500275787, + "grad_norm": 4.62675501466947e-07, + "learning_rate": 1.2423061586496477e-07, + "loss": 0.0, + "num_input_tokens_seen": 21480664, + "step": 35225 + }, + { + "epoch": 9.715940430226144, + "grad_norm": 4.3673543359545874e-07, + "learning_rate": 1.2303535449059156e-07, + "loss": 0.0, + "num_input_tokens_seen": 21485208, + "step": 35230 + }, + { + "epoch": 9.717319360176504, + "grad_norm": 5.056376153333986e-07, + "learning_rate": 1.218458567518288e-07, + "loss": 0.0, + "num_input_tokens_seen": 21488792, + "step": 35235 + }, + { + "epoch": 9.718698290126861, + "grad_norm": 9.133834737440338e-07, + "learning_rate": 1.206621229242727e-07, + "loss": 0.0, + "num_input_tokens_seen": 21491768, + "step": 35240 + }, + { + "epoch": 9.72007722007722, + "grad_norm": 1.2508202189565054e-06, + "learning_rate": 1.1948415328217332e-07, + "loss": 0.0, + "num_input_tokens_seen": 21495576, + "step": 35245 + }, + { + "epoch": 9.721456150027578, + "grad_norm": 6.33747504252824e-07, + "learning_rate": 1.1831194809844847e-07, + "loss": 0.0, + "num_input_tokens_seen": 21498136, + "step": 35250 + }, + { + "epoch": 9.722835079977937, + "grad_norm": 7.984343710631947e-07, + "learning_rate": 1.1714550764468646e-07, + "loss": 0.0, + "num_input_tokens_seen": 21502456, + "step": 35255 + }, + { + "epoch": 9.724214009928296, + "grad_norm": 5.00802016176749e-05, + "learning_rate": 1.1598483219112944e-07, + "loss": 0.0, + "num_input_tokens_seen": 21504696, + "step": 35260 + }, + { + "epoch": 9.725592939878654, + "grad_norm": 0.00010720524733187631, + "learning_rate": 1.1482992200669008e-07, + "loss": 0.0, + "num_input_tokens_seen": 21507416, + "step": 35265 + }, + { + "epoch": 9.726971869829013, + "grad_norm": 5.921199885960959e-07, + "learning_rate": 1.13680777358946e-07, + "loss": 0.0, + "num_input_tokens_seen": 21510104, + "step": 35270 + }, + { + "epoch": 9.72835079977937, + "grad_norm": 2.414221398794325e-06, + "learning_rate": 1.125373985141398e-07, + "loss": 0.0, + "num_input_tokens_seen": 21512376, + "step": 35275 + }, + { + "epoch": 9.72972972972973, + "grad_norm": 6.352693731059844e-07, + "learning_rate": 1.1139978573717069e-07, + "loss": 0.0, + "num_input_tokens_seen": 21514712, + "step": 35280 + }, + { + "epoch": 9.731108659680089, + "grad_norm": 8.768116686042049e-07, + "learning_rate": 1.1026793929161116e-07, + "loss": 0.0, + "num_input_tokens_seen": 21517240, + "step": 35285 + }, + { + "epoch": 9.732487589630447, + "grad_norm": 7.945043307699962e-07, + "learning_rate": 1.0914185943969035e-07, + "loss": 0.0, + "num_input_tokens_seen": 21520216, + "step": 35290 + }, + { + "epoch": 9.733866519580806, + "grad_norm": 4.6590415081482206e-07, + "learning_rate": 1.0802154644230788e-07, + "loss": 0.0, + "num_input_tokens_seen": 21523544, + "step": 35295 + }, + { + "epoch": 9.735245449531163, + "grad_norm": 8.783923840383068e-05, + "learning_rate": 1.0690700055902281e-07, + "loss": 0.0, + "num_input_tokens_seen": 21526392, + "step": 35300 + }, + { + "epoch": 9.736624379481523, + "grad_norm": 5.232337230154371e-07, + "learning_rate": 1.0579822204806188e-07, + "loss": 0.0, + "num_input_tokens_seen": 21529336, + "step": 35305 + }, + { + "epoch": 9.73800330943188, + "grad_norm": 6.406980332940293e-07, + "learning_rate": 1.0469521116630854e-07, + "loss": 0.0, + "num_input_tokens_seen": 21532568, + "step": 35310 + }, + { + "epoch": 9.73938223938224, + "grad_norm": 5.949327146481664e-07, + "learning_rate": 1.0359796816931944e-07, + "loss": 0.0, + "num_input_tokens_seen": 21536536, + "step": 35315 + }, + { + "epoch": 9.740761169332599, + "grad_norm": 7.560807375739387e-07, + "learning_rate": 1.0250649331130513e-07, + "loss": 0.0, + "num_input_tokens_seen": 21539768, + "step": 35320 + }, + { + "epoch": 9.742140099282956, + "grad_norm": 1.389964745612815e-05, + "learning_rate": 1.0142078684514943e-07, + "loss": 0.0, + "num_input_tokens_seen": 21543736, + "step": 35325 + }, + { + "epoch": 9.743519029233315, + "grad_norm": 5.313682436280942e-07, + "learning_rate": 1.0034084902239282e-07, + "loss": 0.0, + "num_input_tokens_seen": 21546776, + "step": 35330 + }, + { + "epoch": 9.744897959183673, + "grad_norm": 8.404411460105621e-07, + "learning_rate": 9.926668009323792e-08, + "loss": 0.0, + "num_input_tokens_seen": 21549880, + "step": 35335 + }, + { + "epoch": 9.746276889134032, + "grad_norm": 9.155785960501817e-07, + "learning_rate": 9.819828030656064e-08, + "loss": 0.0, + "num_input_tokens_seen": 21552600, + "step": 35340 + }, + { + "epoch": 9.74765581908439, + "grad_norm": 0.00042869074968621135, + "learning_rate": 9.713564990989077e-08, + "loss": 0.0, + "num_input_tokens_seen": 21556120, + "step": 35345 + }, + { + "epoch": 9.749034749034749, + "grad_norm": 6.236918466129282e-07, + "learning_rate": 9.607878914942303e-08, + "loss": 0.0, + "num_input_tokens_seen": 21558680, + "step": 35350 + }, + { + "epoch": 9.750413678985108, + "grad_norm": 8.221588245760358e-07, + "learning_rate": 9.502769827001989e-08, + "loss": 0.0, + "num_input_tokens_seen": 21561528, + "step": 35355 + }, + { + "epoch": 9.751792608935466, + "grad_norm": 6.667836487395107e-07, + "learning_rate": 9.398237751520323e-08, + "loss": 0.0, + "num_input_tokens_seen": 21564792, + "step": 35360 + }, + { + "epoch": 9.753171538885825, + "grad_norm": 4.0966696701616456e-07, + "learning_rate": 9.294282712715707e-08, + "loss": 0.0, + "num_input_tokens_seen": 21568216, + "step": 35365 + }, + { + "epoch": 9.754550468836182, + "grad_norm": 1.1810334399342537e-06, + "learning_rate": 9.190904734673045e-08, + "loss": 0.0, + "num_input_tokens_seen": 21570488, + "step": 35370 + }, + { + "epoch": 9.755929398786542, + "grad_norm": 1.4936504157958552e-06, + "learning_rate": 9.088103841343731e-08, + "loss": 0.0, + "num_input_tokens_seen": 21572824, + "step": 35375 + }, + { + "epoch": 9.757308328736901, + "grad_norm": 6.472731683970778e-07, + "learning_rate": 8.985880056545381e-08, + "loss": 0.0, + "num_input_tokens_seen": 21576600, + "step": 35380 + }, + { + "epoch": 9.758687258687258, + "grad_norm": 4.6401441977650393e-07, + "learning_rate": 8.884233403961273e-08, + "loss": 0.0, + "num_input_tokens_seen": 21579160, + "step": 35385 + }, + { + "epoch": 9.760066188637618, + "grad_norm": 5.855633276041772e-07, + "learning_rate": 8.783163907141733e-08, + "loss": 0.0, + "num_input_tokens_seen": 21581592, + "step": 35390 + }, + { + "epoch": 9.761445118587975, + "grad_norm": 0.00041601574048399925, + "learning_rate": 8.682671589503311e-08, + "loss": 0.0, + "num_input_tokens_seen": 21584696, + "step": 35395 + }, + { + "epoch": 9.762824048538334, + "grad_norm": 1.0216169812338194e-06, + "learning_rate": 8.582756474328491e-08, + "loss": 0.0, + "num_input_tokens_seen": 21587480, + "step": 35400 + }, + { + "epoch": 9.764202978488694, + "grad_norm": 7.058148412397713e-07, + "learning_rate": 8.483418584765979e-08, + "loss": 0.0, + "num_input_tokens_seen": 21591448, + "step": 35405 + }, + { + "epoch": 9.765581908439051, + "grad_norm": 4.453700626072532e-07, + "learning_rate": 8.384657943831253e-08, + "loss": 0.0, + "num_input_tokens_seen": 21595576, + "step": 35410 + }, + { + "epoch": 9.76696083838941, + "grad_norm": 3.945169737562537e-06, + "learning_rate": 8.286474574405456e-08, + "loss": 0.0, + "num_input_tokens_seen": 21598904, + "step": 35415 + }, + { + "epoch": 9.768339768339768, + "grad_norm": 8.233204766838753e-07, + "learning_rate": 8.188868499236501e-08, + "loss": 0.0, + "num_input_tokens_seen": 21601624, + "step": 35420 + }, + { + "epoch": 9.769718698290127, + "grad_norm": 5.164613412489416e-07, + "learning_rate": 8.091839740937967e-08, + "loss": 0.0, + "num_input_tokens_seen": 21604632, + "step": 35425 + }, + { + "epoch": 9.771097628240485, + "grad_norm": 0.00019077675824519247, + "learning_rate": 7.995388321990483e-08, + "loss": 0.0, + "num_input_tokens_seen": 21608440, + "step": 35430 + }, + { + "epoch": 9.772476558190844, + "grad_norm": 1.4823951914877398e-06, + "learning_rate": 7.899514264740337e-08, + "loss": 0.0, + "num_input_tokens_seen": 21610744, + "step": 35435 + }, + { + "epoch": 9.773855488141203, + "grad_norm": 6.73312740673282e-07, + "learning_rate": 7.804217591400042e-08, + "loss": 0.0, + "num_input_tokens_seen": 21613496, + "step": 35440 + }, + { + "epoch": 9.77523441809156, + "grad_norm": 8.922128245103522e-07, + "learning_rate": 7.709498324048325e-08, + "loss": 0.0, + "num_input_tokens_seen": 21616280, + "step": 35445 + }, + { + "epoch": 9.77661334804192, + "grad_norm": 5.659226189891342e-07, + "learning_rate": 7.615356484630409e-08, + "loss": 0.0, + "num_input_tokens_seen": 21619512, + "step": 35450 + }, + { + "epoch": 9.777992277992277, + "grad_norm": 2.198095444327919e-06, + "learning_rate": 7.521792094958014e-08, + "loss": 0.0, + "num_input_tokens_seen": 21622520, + "step": 35455 + }, + { + "epoch": 9.779371207942637, + "grad_norm": 1.0436416459924658e-06, + "learning_rate": 7.428805176707965e-08, + "loss": 0.0, + "num_input_tokens_seen": 21625624, + "step": 35460 + }, + { + "epoch": 9.780750137892994, + "grad_norm": 7.874007224017987e-07, + "learning_rate": 7.336395751424696e-08, + "loss": 0.0, + "num_input_tokens_seen": 21628824, + "step": 35465 + }, + { + "epoch": 9.782129067843353, + "grad_norm": 2.0119443888688693e-06, + "learning_rate": 7.244563840517471e-08, + "loss": 0.0, + "num_input_tokens_seen": 21631288, + "step": 35470 + }, + { + "epoch": 9.783507997793713, + "grad_norm": 7.214516699605156e-07, + "learning_rate": 7.153309465262881e-08, + "loss": 0.0, + "num_input_tokens_seen": 21634008, + "step": 35475 + }, + { + "epoch": 9.78488692774407, + "grad_norm": 3.3534111025801394e-06, + "learning_rate": 7.062632646802903e-08, + "loss": 0.0, + "num_input_tokens_seen": 21637848, + "step": 35480 + }, + { + "epoch": 9.78626585769443, + "grad_norm": 1.050656351253565e-06, + "learning_rate": 6.972533406146564e-08, + "loss": 0.0, + "num_input_tokens_seen": 21642808, + "step": 35485 + }, + { + "epoch": 9.787644787644787, + "grad_norm": 9.513738041277975e-05, + "learning_rate": 6.883011764168002e-08, + "loss": 0.0, + "num_input_tokens_seen": 21646008, + "step": 35490 + }, + { + "epoch": 9.789023717595146, + "grad_norm": 8.933050139603438e-07, + "learning_rate": 6.794067741608678e-08, + "loss": 0.0, + "num_input_tokens_seen": 21648216, + "step": 35495 + }, + { + "epoch": 9.790402647545505, + "grad_norm": 1.2885975593235344e-06, + "learning_rate": 6.705701359075168e-08, + "loss": 0.0, + "num_input_tokens_seen": 21651096, + "step": 35500 + }, + { + "epoch": 9.791781577495863, + "grad_norm": 1.600927816980402e-06, + "learning_rate": 6.617912637040812e-08, + "loss": 0.0, + "num_input_tokens_seen": 21654104, + "step": 35505 + }, + { + "epoch": 9.793160507446222, + "grad_norm": 4.803815727427718e-07, + "learning_rate": 6.530701595845179e-08, + "loss": 0.0, + "num_input_tokens_seen": 21656696, + "step": 35510 + }, + { + "epoch": 9.79453943739658, + "grad_norm": 9.232465458808292e-07, + "learning_rate": 6.444068255693492e-08, + "loss": 0.0, + "num_input_tokens_seen": 21659160, + "step": 35515 + }, + { + "epoch": 9.795918367346939, + "grad_norm": 1.3128283171681687e-06, + "learning_rate": 6.358012636657751e-08, + "loss": 0.0, + "num_input_tokens_seen": 21662136, + "step": 35520 + }, + { + "epoch": 9.797297297297296, + "grad_norm": 4.2247609144396847e-07, + "learning_rate": 6.272534758675619e-08, + "loss": 0.0, + "num_input_tokens_seen": 21665240, + "step": 35525 + }, + { + "epoch": 9.798676227247656, + "grad_norm": 6.23755227024958e-07, + "learning_rate": 6.187634641551254e-08, + "loss": 0.0, + "num_input_tokens_seen": 21668504, + "step": 35530 + }, + { + "epoch": 9.800055157198015, + "grad_norm": 1.2872840216004988e-06, + "learning_rate": 6.103312304954756e-08, + "loss": 0.0, + "num_input_tokens_seen": 21672856, + "step": 35535 + }, + { + "epoch": 9.801434087148372, + "grad_norm": 1.3043588751315838e-06, + "learning_rate": 6.019567768422163e-08, + "loss": 0.0, + "num_input_tokens_seen": 21675064, + "step": 35540 + }, + { + "epoch": 9.802813017098732, + "grad_norm": 4.722739390672359e-07, + "learning_rate": 5.936401051356288e-08, + "loss": 0.0, + "num_input_tokens_seen": 21677720, + "step": 35545 + }, + { + "epoch": 9.80419194704909, + "grad_norm": 1.345608552583144e-06, + "learning_rate": 5.85381217302533e-08, + "loss": 0.0, + "num_input_tokens_seen": 21680024, + "step": 35550 + }, + { + "epoch": 9.805570876999449, + "grad_norm": 6.64840456465754e-07, + "learning_rate": 5.771801152563982e-08, + "loss": 0.0, + "num_input_tokens_seen": 21683608, + "step": 35555 + }, + { + "epoch": 9.806949806949808, + "grad_norm": 1.8043804175249534e-06, + "learning_rate": 5.690368008973157e-08, + "loss": 0.0, + "num_input_tokens_seen": 21686840, + "step": 35560 + }, + { + "epoch": 9.808328736900165, + "grad_norm": 1.062743194779614e-06, + "learning_rate": 5.609512761119706e-08, + "loss": 0.0, + "num_input_tokens_seen": 21689656, + "step": 35565 + }, + { + "epoch": 9.809707666850525, + "grad_norm": 5.408151650954096e-07, + "learning_rate": 5.529235427736701e-08, + "loss": 0.0, + "num_input_tokens_seen": 21692184, + "step": 35570 + }, + { + "epoch": 9.811086596800882, + "grad_norm": 8.094965551208588e-07, + "learning_rate": 5.4495360274231524e-08, + "loss": 0.0, + "num_input_tokens_seen": 21698168, + "step": 35575 + }, + { + "epoch": 9.812465526751241, + "grad_norm": 4.899609962194518e-07, + "learning_rate": 5.370414578644289e-08, + "loss": 0.0, + "num_input_tokens_seen": 21701048, + "step": 35580 + }, + { + "epoch": 9.813844456701599, + "grad_norm": 5.8214092859998345e-05, + "learning_rate": 5.29187109973156e-08, + "loss": 0.0, + "num_input_tokens_seen": 21703544, + "step": 35585 + }, + { + "epoch": 9.815223386651958, + "grad_norm": 7.402857704619237e-07, + "learning_rate": 5.2139056088817975e-08, + "loss": 0.0, + "num_input_tokens_seen": 21706648, + "step": 35590 + }, + { + "epoch": 9.816602316602317, + "grad_norm": 6.579687692465086e-07, + "learning_rate": 5.136518124159162e-08, + "loss": 0.0, + "num_input_tokens_seen": 21710616, + "step": 35595 + }, + { + "epoch": 9.817981246552675, + "grad_norm": 5.003050773666473e-07, + "learning_rate": 5.0597086634929234e-08, + "loss": 0.0, + "num_input_tokens_seen": 21713528, + "step": 35600 + }, + { + "epoch": 9.819360176503034, + "grad_norm": 7.699116395087913e-06, + "learning_rate": 4.983477244678847e-08, + "loss": 0.0, + "num_input_tokens_seen": 21716984, + "step": 35605 + }, + { + "epoch": 9.820739106453392, + "grad_norm": 4.869227723247604e-07, + "learning_rate": 4.9078238853783596e-08, + "loss": 0.0, + "num_input_tokens_seen": 21720376, + "step": 35610 + }, + { + "epoch": 9.82211803640375, + "grad_norm": 4.615417765307939e-06, + "learning_rate": 4.8327486031196636e-08, + "loss": 0.0, + "num_input_tokens_seen": 21723288, + "step": 35615 + }, + { + "epoch": 9.82349696635411, + "grad_norm": 7.485637638637854e-07, + "learning_rate": 4.7582514152963444e-08, + "loss": 0.0, + "num_input_tokens_seen": 21726232, + "step": 35620 + }, + { + "epoch": 9.824875896304468, + "grad_norm": 5.977855153105338e-07, + "learning_rate": 4.684332339168485e-08, + "loss": 0.0, + "num_input_tokens_seen": 21729080, + "step": 35625 + }, + { + "epoch": 9.826254826254827, + "grad_norm": 5.968839218439825e-07, + "learning_rate": 4.61099139186183e-08, + "loss": 0.0, + "num_input_tokens_seen": 21731544, + "step": 35630 + }, + { + "epoch": 9.827633756205184, + "grad_norm": 8.222855285566766e-07, + "learning_rate": 4.538228590368898e-08, + "loss": 0.0, + "num_input_tokens_seen": 21733848, + "step": 35635 + }, + { + "epoch": 9.829012686155544, + "grad_norm": 6.823345870543562e-07, + "learning_rate": 4.4660439515470385e-08, + "loss": 0.0, + "num_input_tokens_seen": 21736888, + "step": 35640 + }, + { + "epoch": 9.830391616105901, + "grad_norm": 4.385144904972549e-07, + "learning_rate": 4.394437492121206e-08, + "loss": 0.0, + "num_input_tokens_seen": 21739544, + "step": 35645 + }, + { + "epoch": 9.83177054605626, + "grad_norm": 8.085122544798651e-07, + "learning_rate": 4.323409228681186e-08, + "loss": 0.0, + "num_input_tokens_seen": 21742104, + "step": 35650 + }, + { + "epoch": 9.83314947600662, + "grad_norm": 5.902650173084112e-07, + "learning_rate": 4.252959177683258e-08, + "loss": 0.0, + "num_input_tokens_seen": 21745144, + "step": 35655 + }, + { + "epoch": 9.834528405956977, + "grad_norm": 6.650939212704543e-07, + "learning_rate": 4.183087355449644e-08, + "loss": 0.0, + "num_input_tokens_seen": 21747768, + "step": 35660 + }, + { + "epoch": 9.835907335907336, + "grad_norm": 5.372575060391682e-07, + "learning_rate": 4.113793778168784e-08, + "loss": 0.0, + "num_input_tokens_seen": 21750648, + "step": 35665 + }, + { + "epoch": 9.837286265857694, + "grad_norm": 5.338450250746973e-07, + "learning_rate": 4.04507846189478e-08, + "loss": 0.0, + "num_input_tokens_seen": 21753176, + "step": 35670 + }, + { + "epoch": 9.838665195808053, + "grad_norm": 8.527904924449103e-07, + "learning_rate": 3.9769414225482305e-08, + "loss": 0.0, + "num_input_tokens_seen": 21755256, + "step": 35675 + }, + { + "epoch": 9.84004412575841, + "grad_norm": 7.788129323671455e-07, + "learning_rate": 3.9093826759153964e-08, + "loss": 0.0, + "num_input_tokens_seen": 21758424, + "step": 35680 + }, + { + "epoch": 9.84142305570877, + "grad_norm": 1.2402731044858228e-05, + "learning_rate": 3.842402237648479e-08, + "loss": 0.0, + "num_input_tokens_seen": 21761240, + "step": 35685 + }, + { + "epoch": 9.84280198565913, + "grad_norm": 6.950282909201633e-07, + "learning_rate": 3.7760001232661765e-08, + "loss": 0.0, + "num_input_tokens_seen": 21764984, + "step": 35690 + }, + { + "epoch": 9.844180915609487, + "grad_norm": 5.480671347868338e-07, + "learning_rate": 3.7101763481528494e-08, + "loss": 0.0, + "num_input_tokens_seen": 21767256, + "step": 35695 + }, + { + "epoch": 9.845559845559846, + "grad_norm": 4.134049902404513e-07, + "learning_rate": 3.6449309275587986e-08, + "loss": 0.0, + "num_input_tokens_seen": 21771928, + "step": 35700 + }, + { + "epoch": 9.846938775510203, + "grad_norm": 5.951739581178117e-07, + "learning_rate": 3.580263876600543e-08, + "loss": 0.0, + "num_input_tokens_seen": 21774680, + "step": 35705 + }, + { + "epoch": 9.848317705460563, + "grad_norm": 1.3154302678231033e-06, + "learning_rate": 3.5161752102605417e-08, + "loss": 0.0, + "num_input_tokens_seen": 21777176, + "step": 35710 + }, + { + "epoch": 9.849696635410922, + "grad_norm": 2.670680032679229e-06, + "learning_rate": 3.452664943386919e-08, + "loss": 0.0, + "num_input_tokens_seen": 21780696, + "step": 35715 + }, + { + "epoch": 9.85107556536128, + "grad_norm": 7.138978048715217e-07, + "learning_rate": 3.389733090694569e-08, + "loss": 0.0, + "num_input_tokens_seen": 21783128, + "step": 35720 + }, + { + "epoch": 9.852454495311639, + "grad_norm": 4.653046232760971e-07, + "learning_rate": 3.327379666763775e-08, + "loss": 0.0, + "num_input_tokens_seen": 21785528, + "step": 35725 + }, + { + "epoch": 9.853833425261996, + "grad_norm": 1.4414357565328828e-06, + "learning_rate": 3.26560468604048e-08, + "loss": 0.0, + "num_input_tokens_seen": 21788344, + "step": 35730 + }, + { + "epoch": 9.855212355212355, + "grad_norm": 6.55227950119297e-07, + "learning_rate": 3.20440816283768e-08, + "loss": 0.0, + "num_input_tokens_seen": 21791000, + "step": 35735 + }, + { + "epoch": 9.856591285162715, + "grad_norm": 6.649722195106733e-07, + "learning_rate": 3.143790111333478e-08, + "loss": 0.0, + "num_input_tokens_seen": 21793336, + "step": 35740 + }, + { + "epoch": 9.857970215113072, + "grad_norm": 5.619082230623462e-07, + "learning_rate": 3.0837505455719176e-08, + "loss": 0.0, + "num_input_tokens_seen": 21798296, + "step": 35745 + }, + { + "epoch": 9.859349145063431, + "grad_norm": 6.126992388999497e-07, + "learning_rate": 3.024289479463816e-08, + "loss": 0.0, + "num_input_tokens_seen": 21801464, + "step": 35750 + }, + { + "epoch": 9.860728075013789, + "grad_norm": 5.507567379936518e-07, + "learning_rate": 2.965406926785097e-08, + "loss": 0.0, + "num_input_tokens_seen": 21803960, + "step": 35755 + }, + { + "epoch": 9.862107004964148, + "grad_norm": 3.9412364571944636e-07, + "learning_rate": 2.907102901177905e-08, + "loss": 0.0, + "num_input_tokens_seen": 21807736, + "step": 35760 + }, + { + "epoch": 9.863485934914506, + "grad_norm": 6.943531616343535e-07, + "learning_rate": 2.849377416150878e-08, + "loss": 0.0, + "num_input_tokens_seen": 21810264, + "step": 35765 + }, + { + "epoch": 9.864864864864865, + "grad_norm": 4.98069709919946e-07, + "learning_rate": 2.79223048507804e-08, + "loss": 0.0, + "num_input_tokens_seen": 21813944, + "step": 35770 + }, + { + "epoch": 9.866243794815224, + "grad_norm": 5.243640543994843e-07, + "learning_rate": 2.735662121199356e-08, + "loss": 0.0, + "num_input_tokens_seen": 21816760, + "step": 35775 + }, + { + "epoch": 9.867622724765582, + "grad_norm": 2.2582280507776886e-06, + "learning_rate": 2.6796723376210087e-08, + "loss": 0.0, + "num_input_tokens_seen": 21819224, + "step": 35780 + }, + { + "epoch": 9.869001654715941, + "grad_norm": 6.285062568167632e-07, + "learning_rate": 2.6242611473148436e-08, + "loss": 0.0, + "num_input_tokens_seen": 21822424, + "step": 35785 + }, + { + "epoch": 9.870380584666298, + "grad_norm": 5.586086899711518e-07, + "learning_rate": 2.5694285631189252e-08, + "loss": 0.0, + "num_input_tokens_seen": 21826200, + "step": 35790 + }, + { + "epoch": 9.871759514616658, + "grad_norm": 3.8428999005191145e-07, + "learning_rate": 2.515174597737535e-08, + "loss": 0.0, + "num_input_tokens_seen": 21828952, + "step": 35795 + }, + { + "epoch": 9.873138444567015, + "grad_norm": 4.808758831131854e-07, + "learning_rate": 2.4614992637400637e-08, + "loss": 0.0, + "num_input_tokens_seen": 21831736, + "step": 35800 + }, + { + "epoch": 9.874517374517374, + "grad_norm": 6.666530225629685e-07, + "learning_rate": 2.408402573562396e-08, + "loss": 0.0, + "num_input_tokens_seen": 21835288, + "step": 35805 + }, + { + "epoch": 9.875896304467734, + "grad_norm": 1.8848180616259924e-06, + "learning_rate": 2.3558845395066364e-08, + "loss": 0.0, + "num_input_tokens_seen": 21838488, + "step": 35810 + }, + { + "epoch": 9.877275234418091, + "grad_norm": 7.838021360839775e-07, + "learning_rate": 2.3039451737399964e-08, + "loss": 0.0, + "num_input_tokens_seen": 21843000, + "step": 35815 + }, + { + "epoch": 9.87865416436845, + "grad_norm": 1.0437653145345394e-05, + "learning_rate": 2.2525844882964607e-08, + "loss": 0.0, + "num_input_tokens_seen": 21845624, + "step": 35820 + }, + { + "epoch": 9.880033094318808, + "grad_norm": 7.434039162035333e-07, + "learning_rate": 2.2018024950751225e-08, + "loss": 0.0, + "num_input_tokens_seen": 21848408, + "step": 35825 + }, + { + "epoch": 9.881412024269167, + "grad_norm": 7.795754868311633e-07, + "learning_rate": 2.1515992058418476e-08, + "loss": 0.0, + "num_input_tokens_seen": 21852440, + "step": 35830 + }, + { + "epoch": 9.882790954219526, + "grad_norm": 6.672918857475452e-07, + "learning_rate": 2.1019746322278878e-08, + "loss": 0.0, + "num_input_tokens_seen": 21855288, + "step": 35835 + }, + { + "epoch": 9.884169884169884, + "grad_norm": 6.755545314263145e-07, + "learning_rate": 2.0529287857304347e-08, + "loss": 0.0, + "num_input_tokens_seen": 21857912, + "step": 35840 + }, + { + "epoch": 9.885548814120243, + "grad_norm": 5.227918791206321e-07, + "learning_rate": 2.0044616777128988e-08, + "loss": 0.0, + "num_input_tokens_seen": 21860632, + "step": 35845 + }, + { + "epoch": 9.8869277440706, + "grad_norm": 5.404786520557536e-07, + "learning_rate": 1.9565733194043534e-08, + "loss": 0.0, + "num_input_tokens_seen": 21862840, + "step": 35850 + }, + { + "epoch": 9.88830667402096, + "grad_norm": 2.855455932149198e-06, + "learning_rate": 1.9092637218995347e-08, + "loss": 0.0, + "num_input_tokens_seen": 21866104, + "step": 35855 + }, + { + "epoch": 9.889685603971317, + "grad_norm": 4.624845075795747e-07, + "learning_rate": 1.862532896159952e-08, + "loss": 0.0, + "num_input_tokens_seen": 21869272, + "step": 35860 + }, + { + "epoch": 9.891064533921677, + "grad_norm": 6.432854888771544e-07, + "learning_rate": 1.8163808530119453e-08, + "loss": 0.0, + "num_input_tokens_seen": 21872504, + "step": 35865 + }, + { + "epoch": 9.892443463872036, + "grad_norm": 6.957604341550905e-07, + "learning_rate": 1.7708076031486276e-08, + "loss": 0.0, + "num_input_tokens_seen": 21875096, + "step": 35870 + }, + { + "epoch": 9.893822393822393, + "grad_norm": 1.6347937616956187e-06, + "learning_rate": 1.725813157128775e-08, + "loss": 0.0, + "num_input_tokens_seen": 21877816, + "step": 35875 + }, + { + "epoch": 9.895201323772753, + "grad_norm": 4.4664659526461037e-07, + "learning_rate": 1.6813975253765492e-08, + "loss": 0.0, + "num_input_tokens_seen": 21881464, + "step": 35880 + }, + { + "epoch": 9.89658025372311, + "grad_norm": 1.4204815670382231e-06, + "learning_rate": 1.637560718182607e-08, + "loss": 0.0, + "num_input_tokens_seen": 21884184, + "step": 35885 + }, + { + "epoch": 9.89795918367347, + "grad_norm": 6.42418683582946e-07, + "learning_rate": 1.594302745703269e-08, + "loss": 0.0, + "num_input_tokens_seen": 21887256, + "step": 35890 + }, + { + "epoch": 9.899338113623827, + "grad_norm": 8.799298143458145e-07, + "learning_rate": 1.5516236179607957e-08, + "loss": 0.0, + "num_input_tokens_seen": 21889848, + "step": 35895 + }, + { + "epoch": 9.900717043574186, + "grad_norm": 4.863588287662424e-07, + "learning_rate": 1.509523344843389e-08, + "loss": 0.0, + "num_input_tokens_seen": 21893176, + "step": 35900 + }, + { + "epoch": 9.902095973524546, + "grad_norm": 3.8651128875244467e-07, + "learning_rate": 1.4680019361049125e-08, + "loss": 0.0, + "num_input_tokens_seen": 21896152, + "step": 35905 + }, + { + "epoch": 9.903474903474903, + "grad_norm": 9.246515446648118e-07, + "learning_rate": 1.4270594013654493e-08, + "loss": 0.0, + "num_input_tokens_seen": 21898808, + "step": 35910 + }, + { + "epoch": 9.904853833425262, + "grad_norm": 4.401821058763744e-07, + "learning_rate": 1.3866957501110223e-08, + "loss": 0.0, + "num_input_tokens_seen": 21901464, + "step": 35915 + }, + { + "epoch": 9.90623276337562, + "grad_norm": 5.680370804839185e-07, + "learning_rate": 1.3469109916927624e-08, + "loss": 0.0, + "num_input_tokens_seen": 21904248, + "step": 35920 + }, + { + "epoch": 9.907611693325979, + "grad_norm": 5.904877070861403e-06, + "learning_rate": 1.3077051353285741e-08, + "loss": 0.0, + "num_input_tokens_seen": 21906328, + "step": 35925 + }, + { + "epoch": 9.908990623276338, + "grad_norm": 7.894014970588614e-07, + "learning_rate": 1.2690781901020244e-08, + "loss": 0.0, + "num_input_tokens_seen": 21909080, + "step": 35930 + }, + { + "epoch": 9.910369553226696, + "grad_norm": 5.725242999687907e-07, + "learning_rate": 1.2310301649620658e-08, + "loss": 0.0, + "num_input_tokens_seen": 21911992, + "step": 35935 + }, + { + "epoch": 9.911748483177055, + "grad_norm": 1.3695567758986726e-05, + "learning_rate": 1.1935610687238696e-08, + "loss": 0.0, + "num_input_tokens_seen": 21917240, + "step": 35940 + }, + { + "epoch": 9.913127413127413, + "grad_norm": 0.0008070399053394794, + "learning_rate": 1.1566709100688244e-08, + "loss": 0.0, + "num_input_tokens_seen": 21919832, + "step": 35945 + }, + { + "epoch": 9.914506343077772, + "grad_norm": 8.646536571177421e-07, + "learning_rate": 1.1203596975437047e-08, + "loss": 0.0, + "num_input_tokens_seen": 21922744, + "step": 35950 + }, + { + "epoch": 9.915885273028131, + "grad_norm": 4.2187153326267435e-07, + "learning_rate": 1.0846274395612255e-08, + "loss": 0.0, + "num_input_tokens_seen": 21925400, + "step": 35955 + }, + { + "epoch": 9.917264202978489, + "grad_norm": 7.065095246616693e-07, + "learning_rate": 1.0494741444000423e-08, + "loss": 0.0, + "num_input_tokens_seen": 21928664, + "step": 35960 + }, + { + "epoch": 9.918643132928848, + "grad_norm": 6.830892402831523e-07, + "learning_rate": 1.014899820204751e-08, + "loss": 0.0, + "num_input_tokens_seen": 21931352, + "step": 35965 + }, + { + "epoch": 9.920022062879205, + "grad_norm": 1.2671697504629265e-06, + "learning_rate": 9.809044749856111e-09, + "loss": 0.0, + "num_input_tokens_seen": 21933784, + "step": 35970 + }, + { + "epoch": 9.921400992829565, + "grad_norm": 1.7520400206194608e-06, + "learning_rate": 9.474881166190997e-09, + "loss": 0.0, + "num_input_tokens_seen": 21937112, + "step": 35975 + }, + { + "epoch": 9.922779922779922, + "grad_norm": 3.821458847141912e-07, + "learning_rate": 9.146507528470793e-09, + "loss": 0.0, + "num_input_tokens_seen": 21939800, + "step": 35980 + }, + { + "epoch": 9.924158852730281, + "grad_norm": 3.2216321415035054e-05, + "learning_rate": 8.823923912776311e-09, + "loss": 0.0, + "num_input_tokens_seen": 21942168, + "step": 35985 + }, + { + "epoch": 9.92553778268064, + "grad_norm": 2.8794199806725373e-06, + "learning_rate": 8.507130393842211e-09, + "loss": 0.0, + "num_input_tokens_seen": 21944888, + "step": 35990 + }, + { + "epoch": 9.926916712630998, + "grad_norm": 5.926012818235904e-07, + "learning_rate": 8.19612704507089e-09, + "loss": 0.0, + "num_input_tokens_seen": 21947768, + "step": 35995 + }, + { + "epoch": 9.928295642581357, + "grad_norm": 2.322390400877339e-06, + "learning_rate": 7.890913938510269e-09, + "loss": 0.0, + "num_input_tokens_seen": 21951288, + "step": 36000 + }, + { + "epoch": 9.929674572531715, + "grad_norm": 1.1187331665496458e-06, + "learning_rate": 7.591491144881557e-09, + "loss": 0.0, + "num_input_tokens_seen": 21954648, + "step": 36005 + }, + { + "epoch": 9.931053502482074, + "grad_norm": 7.77564309828449e-07, + "learning_rate": 7.297858733551488e-09, + "loss": 0.0, + "num_input_tokens_seen": 21957688, + "step": 36010 + }, + { + "epoch": 9.932432432432432, + "grad_norm": 7.62687818678387e-07, + "learning_rate": 7.010016772548977e-09, + "loss": 0.0, + "num_input_tokens_seen": 21960248, + "step": 36015 + }, + { + "epoch": 9.93381136238279, + "grad_norm": 1.1719409940269543e-06, + "learning_rate": 6.727965328570673e-09, + "loss": 0.0, + "num_input_tokens_seen": 21963512, + "step": 36020 + }, + { + "epoch": 9.93519029233315, + "grad_norm": 6.73290685426764e-07, + "learning_rate": 6.451704466955976e-09, + "loss": 0.0, + "num_input_tokens_seen": 21966168, + "step": 36025 + }, + { + "epoch": 9.936569222283508, + "grad_norm": 3.9710724308861245e-07, + "learning_rate": 6.18123425171202e-09, + "loss": 0.0, + "num_input_tokens_seen": 21969336, + "step": 36030 + }, + { + "epoch": 9.937948152233867, + "grad_norm": 1.5146789564823848e-06, + "learning_rate": 5.9165547455053426e-09, + "loss": 0.0, + "num_input_tokens_seen": 21973432, + "step": 36035 + }, + { + "epoch": 9.939327082184224, + "grad_norm": 2.676375152077526e-05, + "learning_rate": 5.6576660096591124e-09, + "loss": 0.0, + "num_input_tokens_seen": 21976376, + "step": 36040 + }, + { + "epoch": 9.940706012134584, + "grad_norm": 4.425915562933369e-07, + "learning_rate": 5.404568104150354e-09, + "loss": 0.0, + "num_input_tokens_seen": 21978808, + "step": 36045 + }, + { + "epoch": 9.942084942084943, + "grad_norm": 5.449539912660839e-07, + "learning_rate": 5.157261087621046e-09, + "loss": 0.0, + "num_input_tokens_seen": 21981624, + "step": 36050 + }, + { + "epoch": 9.9434638720353, + "grad_norm": 1.0231076430500252e-06, + "learning_rate": 4.915745017367024e-09, + "loss": 0.0, + "num_input_tokens_seen": 21985208, + "step": 36055 + }, + { + "epoch": 9.94484280198566, + "grad_norm": 7.981041107996134e-07, + "learning_rate": 4.680019949346304e-09, + "loss": 0.0, + "num_input_tokens_seen": 21988344, + "step": 36060 + }, + { + "epoch": 9.946221731936017, + "grad_norm": 4.7383994683514175e-07, + "learning_rate": 4.450085938170756e-09, + "loss": 0.0, + "num_input_tokens_seen": 21991096, + "step": 36065 + }, + { + "epoch": 9.947600661886376, + "grad_norm": 7.25682753000001e-07, + "learning_rate": 4.225943037114433e-09, + "loss": 0.0, + "num_input_tokens_seen": 21993272, + "step": 36070 + }, + { + "epoch": 9.948979591836736, + "grad_norm": 5.250740287010558e-07, + "learning_rate": 4.007591298108015e-09, + "loss": 0.0, + "num_input_tokens_seen": 21995864, + "step": 36075 + }, + { + "epoch": 9.950358521787093, + "grad_norm": 0.0002021529944613576, + "learning_rate": 3.795030771738817e-09, + "loss": 0.0, + "num_input_tokens_seen": 21999864, + "step": 36080 + }, + { + "epoch": 9.951737451737452, + "grad_norm": 6.362558337968949e-07, + "learning_rate": 3.588261507253554e-09, + "loss": 0.0, + "num_input_tokens_seen": 22002648, + "step": 36085 + }, + { + "epoch": 9.95311638168781, + "grad_norm": 3.3170344977406785e-06, + "learning_rate": 3.3872835525611268e-09, + "loss": 0.0, + "num_input_tokens_seen": 22005080, + "step": 36090 + }, + { + "epoch": 9.95449531163817, + "grad_norm": 1.0288703151672962e-06, + "learning_rate": 3.192096954221513e-09, + "loss": 0.0, + "num_input_tokens_seen": 22007992, + "step": 36095 + }, + { + "epoch": 9.955874241588527, + "grad_norm": 2.8364675017655827e-06, + "learning_rate": 3.0027017574596472e-09, + "loss": 0.0, + "num_input_tokens_seen": 22010328, + "step": 36100 + }, + { + "epoch": 9.957253171538886, + "grad_norm": 4.909428525934345e-07, + "learning_rate": 2.819098006151544e-09, + "loss": 0.0, + "num_input_tokens_seen": 22014104, + "step": 36105 + }, + { + "epoch": 9.958632101489245, + "grad_norm": 1.1836027624667622e-06, + "learning_rate": 2.6412857428409486e-09, + "loss": 0.0, + "num_input_tokens_seen": 22017144, + "step": 36110 + }, + { + "epoch": 9.960011031439603, + "grad_norm": 1.3121126585247112e-06, + "learning_rate": 2.4692650087226875e-09, + "loss": 0.0, + "num_input_tokens_seen": 22019448, + "step": 36115 + }, + { + "epoch": 9.961389961389962, + "grad_norm": 0.0010133803589269519, + "learning_rate": 2.3030358436482156e-09, + "loss": 0.0, + "num_input_tokens_seen": 22022104, + "step": 36120 + }, + { + "epoch": 9.96276889134032, + "grad_norm": 3.5144355933880433e-05, + "learning_rate": 2.1425982861339456e-09, + "loss": 0.0, + "num_input_tokens_seen": 22024824, + "step": 36125 + }, + { + "epoch": 9.964147821290679, + "grad_norm": 6.487444466074521e-07, + "learning_rate": 1.987952373350144e-09, + "loss": 0.0, + "num_input_tokens_seen": 22027864, + "step": 36130 + }, + { + "epoch": 9.965526751241036, + "grad_norm": 2.240005869680317e-06, + "learning_rate": 1.8390981411264829e-09, + "loss": 0.0, + "num_input_tokens_seen": 22029880, + "step": 36135 + }, + { + "epoch": 9.966905681191395, + "grad_norm": 4.209372548302781e-07, + "learning_rate": 1.6960356239492659e-09, + "loss": 0.0, + "num_input_tokens_seen": 22033144, + "step": 36140 + }, + { + "epoch": 9.968284611141755, + "grad_norm": 7.883182888690499e-07, + "learning_rate": 1.5587648549642008e-09, + "loss": 0.0, + "num_input_tokens_seen": 22035576, + "step": 36145 + }, + { + "epoch": 9.969663541092112, + "grad_norm": 5.178037554287584e-07, + "learning_rate": 1.4272858659736266e-09, + "loss": 0.0, + "num_input_tokens_seen": 22038200, + "step": 36150 + }, + { + "epoch": 9.971042471042471, + "grad_norm": 0.0001611936022527516, + "learning_rate": 1.3015986874420627e-09, + "loss": 0.0, + "num_input_tokens_seen": 22040248, + "step": 36155 + }, + { + "epoch": 9.972421400992829, + "grad_norm": 1.555216294946149e-05, + "learning_rate": 1.1817033484878836e-09, + "loss": 0.0, + "num_input_tokens_seen": 22043576, + "step": 36160 + }, + { + "epoch": 9.973800330943188, + "grad_norm": 1.2847077414335217e-05, + "learning_rate": 1.067599876891645e-09, + "loss": 0.0, + "num_input_tokens_seen": 22045912, + "step": 36165 + }, + { + "epoch": 9.975179260893547, + "grad_norm": 1.7431993910577148e-06, + "learning_rate": 9.592882990849817e-10, + "loss": 0.0, + "num_input_tokens_seen": 22049304, + "step": 36170 + }, + { + "epoch": 9.976558190843905, + "grad_norm": 9.045755291481328e-07, + "learning_rate": 8.567686401644847e-10, + "loss": 0.0, + "num_input_tokens_seen": 22051672, + "step": 36175 + }, + { + "epoch": 9.977937120794264, + "grad_norm": 0.001084186602383852, + "learning_rate": 7.600409238833762e-10, + "loss": 0.0, + "num_input_tokens_seen": 22055992, + "step": 36180 + }, + { + "epoch": 9.979316050744622, + "grad_norm": 5.257885504761362e-07, + "learning_rate": 6.691051726515074e-10, + "loss": 0.0, + "num_input_tokens_seen": 22059096, + "step": 36185 + }, + { + "epoch": 9.980694980694981, + "grad_norm": 4.945561045133218e-07, + "learning_rate": 5.839614075353606e-10, + "loss": 0.0, + "num_input_tokens_seen": 22062008, + "step": 36190 + }, + { + "epoch": 9.982073910645338, + "grad_norm": 2.677265229067416e-06, + "learning_rate": 5.046096482663743e-10, + "loss": 0.0, + "num_input_tokens_seen": 22064952, + "step": 36195 + }, + { + "epoch": 9.983452840595698, + "grad_norm": 6.281203241087496e-06, + "learning_rate": 4.3104991322429066e-10, + "loss": 0.0, + "num_input_tokens_seen": 22066808, + "step": 36200 + }, + { + "epoch": 9.984831770546057, + "grad_norm": 7.921661335785757e-07, + "learning_rate": 3.6328221945103324e-10, + "loss": 0.0, + "num_input_tokens_seen": 22069496, + "step": 36205 + }, + { + "epoch": 9.986210700496414, + "grad_norm": 4.902004775431124e-07, + "learning_rate": 3.013065826534822e-10, + "loss": 0.0, + "num_input_tokens_seen": 22072120, + "step": 36210 + }, + { + "epoch": 9.987589630446774, + "grad_norm": 3.313726892884006e-06, + "learning_rate": 2.451230171868213e-10, + "loss": 0.0, + "num_input_tokens_seen": 22074904, + "step": 36215 + }, + { + "epoch": 9.988968560397131, + "grad_norm": 1.2273881111468654e-06, + "learning_rate": 1.9473153606563987e-10, + "loss": 0.0, + "num_input_tokens_seen": 22077656, + "step": 36220 + }, + { + "epoch": 9.99034749034749, + "grad_norm": 7.225496005958121e-07, + "learning_rate": 1.5013215096948418e-10, + "loss": 0.0, + "num_input_tokens_seen": 22080216, + "step": 36225 + }, + { + "epoch": 9.991726420297848, + "grad_norm": 9.305090316047426e-07, + "learning_rate": 1.1132487222897947e-10, + "loss": 0.0, + "num_input_tokens_seen": 22083352, + "step": 36230 + }, + { + "epoch": 9.993105350248207, + "grad_norm": 7.915922469692305e-05, + "learning_rate": 7.830970883415666e-11, + "loss": 0.0, + "num_input_tokens_seen": 22085912, + "step": 36235 + }, + { + "epoch": 9.994484280198566, + "grad_norm": 1.2895435247628484e-05, + "learning_rate": 5.108666843722798e-11, + "loss": 0.0, + "num_input_tokens_seen": 22089720, + "step": 36240 + }, + { + "epoch": 9.995863210148924, + "grad_norm": 4.969319320480281e-07, + "learning_rate": 2.965575734426018e-11, + "loss": 0.0, + "num_input_tokens_seen": 22093656, + "step": 36245 + }, + { + "epoch": 9.997242140099283, + "grad_norm": 3.9428394416063384e-07, + "learning_rate": 1.4016980517950196e-11, + "loss": 0.0, + "num_input_tokens_seen": 22096568, + "step": 36250 + }, + { + "epoch": 9.99862107004964, + "grad_norm": 8.59858914736833e-07, + "learning_rate": 4.170341585951754e-12, + "loss": 0.0, + "num_input_tokens_seen": 22099960, + "step": 36255 + }, + { + "epoch": 10.0, + "grad_norm": 7.729754543106537e-07, + "learning_rate": 1.158428242220566e-13, + "loss": 0.0, + "num_input_tokens_seen": 22103448, + "step": 36260 + }, + { + "epoch": 10.0, + "eval_loss": 0.3993869423866272, + "eval_runtime": 28.5094, + "eval_samples_per_second": 56.543, + "eval_steps_per_second": 14.136, + "num_input_tokens_seen": 22103448, + "step": 36260 + }, + { + "epoch": 10.0, + "num_input_tokens_seen": 22103448, + "step": 36260, + "total_flos": 9.972551535763784e+17, + "train_loss": 0.028105780128666196, + "train_runtime": 9959.3212, + "train_samples_per_second": 14.56, + "train_steps_per_second": 3.641 + } + ], + "logging_steps": 5, + "max_steps": 36260, + "num_input_tokens_seen": 22103448, + "num_train_epochs": 10, + "save_steps": 1813, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.972551535763784e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}