diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,98304 @@ +{ + "best_global_step": 49040, + "best_metric": 0.1356717050075531, + "best_model_checkpoint": "saves/ia3/llama-3-8b-instruct/train_multirc_1753094162/checkpoint-49040", + "epoch": 10.0, + "eval_steps": 3065, + "global_step": 61300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008156606851549756, + "grad_norm": 1.3330285549163818, + "learning_rate": 3.262642740619902e-08, + "loss": 1.8354, + "num_input_tokens_seen": 8928, + "step": 5 + }, + { + "epoch": 0.0016313213703099511, + "grad_norm": 8.133570671081543, + "learning_rate": 7.34094616639478e-08, + "loss": 1.138, + "num_input_tokens_seen": 20448, + "step": 10 + }, + { + "epoch": 0.0024469820554649264, + "grad_norm": 3.1184449195861816, + "learning_rate": 1.1419249592169658e-07, + "loss": 0.51, + "num_input_tokens_seen": 31072, + "step": 15 + }, + { + "epoch": 0.0032626427406199023, + "grad_norm": 9.383626937866211, + "learning_rate": 1.5497553017944535e-07, + "loss": 1.0466, + "num_input_tokens_seen": 42528, + "step": 20 + }, + { + "epoch": 0.004078303425774877, + "grad_norm": 4.462859153747559, + "learning_rate": 1.9575856443719413e-07, + "loss": 1.2972, + "num_input_tokens_seen": 52576, + "step": 25 + }, + { + "epoch": 0.004893964110929853, + "grad_norm": 7.290808200836182, + "learning_rate": 2.365415986949429e-07, + "loss": 0.7665, + "num_input_tokens_seen": 62944, + "step": 30 + }, + { + "epoch": 0.005709624796084829, + "grad_norm": 2.5626280307769775, + "learning_rate": 2.773246329526917e-07, + "loss": 0.7818, + "num_input_tokens_seen": 73504, + "step": 35 + }, + { + "epoch": 0.0065252854812398045, + "grad_norm": 7.564657211303711, + "learning_rate": 3.1810766721044045e-07, + "loss": 1.0131, + "num_input_tokens_seen": 84640, + "step": 40 + }, + { + "epoch": 0.00734094616639478, + "grad_norm": 1.8235329389572144, + "learning_rate": 3.5889070146818926e-07, + "loss": 0.7292, + "num_input_tokens_seen": 96288, + "step": 45 + }, + { + "epoch": 0.008156606851549755, + "grad_norm": 5.445663928985596, + "learning_rate": 3.99673735725938e-07, + "loss": 1.0332, + "num_input_tokens_seen": 107360, + "step": 50 + }, + { + "epoch": 0.00897226753670473, + "grad_norm": 7.110819339752197, + "learning_rate": 4.4045676998368683e-07, + "loss": 0.8706, + "num_input_tokens_seen": 118432, + "step": 55 + }, + { + "epoch": 0.009787928221859706, + "grad_norm": 2.586118459701538, + "learning_rate": 4.812398042414356e-07, + "loss": 1.0135, + "num_input_tokens_seen": 128416, + "step": 60 + }, + { + "epoch": 0.010603588907014683, + "grad_norm": 2.204103708267212, + "learning_rate": 5.220228384991843e-07, + "loss": 0.4435, + "num_input_tokens_seen": 138432, + "step": 65 + }, + { + "epoch": 0.011419249592169658, + "grad_norm": 8.153916358947754, + "learning_rate": 5.628058727569332e-07, + "loss": 0.6451, + "num_input_tokens_seen": 149504, + "step": 70 + }, + { + "epoch": 0.012234910277324634, + "grad_norm": 3.9400570392608643, + "learning_rate": 6.03588907014682e-07, + "loss": 0.662, + "num_input_tokens_seen": 160192, + "step": 75 + }, + { + "epoch": 0.013050570962479609, + "grad_norm": 2.5303783416748047, + "learning_rate": 6.443719412724307e-07, + "loss": 0.7694, + "num_input_tokens_seen": 170432, + "step": 80 + }, + { + "epoch": 0.013866231647634585, + "grad_norm": 2.4371073246002197, + "learning_rate": 6.851549755301795e-07, + "loss": 0.6462, + "num_input_tokens_seen": 180672, + "step": 85 + }, + { + "epoch": 0.01468189233278956, + "grad_norm": 3.1941189765930176, + "learning_rate": 7.259380097879283e-07, + "loss": 0.756, + "num_input_tokens_seen": 192352, + "step": 90 + }, + { + "epoch": 0.015497553017944535, + "grad_norm": 8.172453880310059, + "learning_rate": 7.66721044045677e-07, + "loss": 0.7517, + "num_input_tokens_seen": 203040, + "step": 95 + }, + { + "epoch": 0.01631321370309951, + "grad_norm": 5.06707239151001, + "learning_rate": 8.075040783034258e-07, + "loss": 1.0138, + "num_input_tokens_seen": 214272, + "step": 100 + }, + { + "epoch": 0.017128874388254486, + "grad_norm": 4.949384689331055, + "learning_rate": 8.482871125611746e-07, + "loss": 0.6656, + "num_input_tokens_seen": 225664, + "step": 105 + }, + { + "epoch": 0.01794453507340946, + "grad_norm": 3.9158895015716553, + "learning_rate": 8.890701468189233e-07, + "loss": 1.6558, + "num_input_tokens_seen": 237216, + "step": 110 + }, + { + "epoch": 0.018760195758564437, + "grad_norm": 7.355237007141113, + "learning_rate": 9.298531810766722e-07, + "loss": 0.754, + "num_input_tokens_seen": 246560, + "step": 115 + }, + { + "epoch": 0.01957585644371941, + "grad_norm": 8.249229431152344, + "learning_rate": 9.70636215334421e-07, + "loss": 1.3942, + "num_input_tokens_seen": 257184, + "step": 120 + }, + { + "epoch": 0.020391517128874388, + "grad_norm": 6.793741703033447, + "learning_rate": 1.0114192495921699e-06, + "loss": 0.7729, + "num_input_tokens_seen": 268384, + "step": 125 + }, + { + "epoch": 0.021207177814029365, + "grad_norm": 3.0679988861083984, + "learning_rate": 1.0522022838499183e-06, + "loss": 0.6073, + "num_input_tokens_seen": 279680, + "step": 130 + }, + { + "epoch": 0.02202283849918434, + "grad_norm": 6.015023231506348, + "learning_rate": 1.0929853181076673e-06, + "loss": 1.3118, + "num_input_tokens_seen": 290688, + "step": 135 + }, + { + "epoch": 0.022838499184339316, + "grad_norm": 7.075500011444092, + "learning_rate": 1.133768352365416e-06, + "loss": 0.8988, + "num_input_tokens_seen": 301408, + "step": 140 + }, + { + "epoch": 0.02365415986949429, + "grad_norm": 2.774397373199463, + "learning_rate": 1.1745513866231649e-06, + "loss": 1.1197, + "num_input_tokens_seen": 311968, + "step": 145 + }, + { + "epoch": 0.024469820554649267, + "grad_norm": 5.70680046081543, + "learning_rate": 1.2153344208809136e-06, + "loss": 1.413, + "num_input_tokens_seen": 322080, + "step": 150 + }, + { + "epoch": 0.02528548123980424, + "grad_norm": 5.242013454437256, + "learning_rate": 1.2561174551386625e-06, + "loss": 1.3037, + "num_input_tokens_seen": 333824, + "step": 155 + }, + { + "epoch": 0.026101141924959218, + "grad_norm": 3.010617256164551, + "learning_rate": 1.296900489396411e-06, + "loss": 1.6965, + "num_input_tokens_seen": 344896, + "step": 160 + }, + { + "epoch": 0.026916802610114192, + "grad_norm": 2.9609925746917725, + "learning_rate": 1.33768352365416e-06, + "loss": 0.9141, + "num_input_tokens_seen": 356192, + "step": 165 + }, + { + "epoch": 0.02773246329526917, + "grad_norm": 3.0405545234680176, + "learning_rate": 1.3784665579119086e-06, + "loss": 0.6744, + "num_input_tokens_seen": 367136, + "step": 170 + }, + { + "epoch": 0.028548123980424143, + "grad_norm": 2.0657386779785156, + "learning_rate": 1.4192495921696575e-06, + "loss": 0.7884, + "num_input_tokens_seen": 376832, + "step": 175 + }, + { + "epoch": 0.02936378466557912, + "grad_norm": 2.7704222202301025, + "learning_rate": 1.4600326264274062e-06, + "loss": 0.5125, + "num_input_tokens_seen": 387552, + "step": 180 + }, + { + "epoch": 0.030179445350734094, + "grad_norm": 9.020060539245605, + "learning_rate": 1.5008156606851552e-06, + "loss": 1.0444, + "num_input_tokens_seen": 398528, + "step": 185 + }, + { + "epoch": 0.03099510603588907, + "grad_norm": 6.435461044311523, + "learning_rate": 1.5415986949429036e-06, + "loss": 0.9178, + "num_input_tokens_seen": 408896, + "step": 190 + }, + { + "epoch": 0.03181076672104405, + "grad_norm": 2.2490572929382324, + "learning_rate": 1.5823817292006523e-06, + "loss": 0.7729, + "num_input_tokens_seen": 420416, + "step": 195 + }, + { + "epoch": 0.03262642740619902, + "grad_norm": 9.05227279663086, + "learning_rate": 1.6231647634584013e-06, + "loss": 0.6115, + "num_input_tokens_seen": 430912, + "step": 200 + }, + { + "epoch": 0.033442088091353996, + "grad_norm": 9.257620811462402, + "learning_rate": 1.6639477977161502e-06, + "loss": 1.0006, + "num_input_tokens_seen": 442176, + "step": 205 + }, + { + "epoch": 0.03425774877650897, + "grad_norm": 5.566224098205566, + "learning_rate": 1.704730831973899e-06, + "loss": 0.6711, + "num_input_tokens_seen": 454496, + "step": 210 + }, + { + "epoch": 0.03507340946166395, + "grad_norm": 5.373349189758301, + "learning_rate": 1.7455138662316478e-06, + "loss": 0.5347, + "num_input_tokens_seen": 464896, + "step": 215 + }, + { + "epoch": 0.03588907014681892, + "grad_norm": 3.9684319496154785, + "learning_rate": 1.7862969004893963e-06, + "loss": 1.2468, + "num_input_tokens_seen": 477024, + "step": 220 + }, + { + "epoch": 0.0367047308319739, + "grad_norm": 8.038186073303223, + "learning_rate": 1.8270799347471452e-06, + "loss": 0.7641, + "num_input_tokens_seen": 487776, + "step": 225 + }, + { + "epoch": 0.037520391517128875, + "grad_norm": 1.7324097156524658, + "learning_rate": 1.8678629690048941e-06, + "loss": 0.7528, + "num_input_tokens_seen": 498528, + "step": 230 + }, + { + "epoch": 0.03833605220228385, + "grad_norm": 3.343194007873535, + "learning_rate": 1.908646003262643e-06, + "loss": 0.4611, + "num_input_tokens_seen": 509184, + "step": 235 + }, + { + "epoch": 0.03915171288743882, + "grad_norm": 3.391310691833496, + "learning_rate": 1.9494290375203913e-06, + "loss": 0.7424, + "num_input_tokens_seen": 520224, + "step": 240 + }, + { + "epoch": 0.0399673735725938, + "grad_norm": 7.827118396759033, + "learning_rate": 1.9902120717781402e-06, + "loss": 1.8242, + "num_input_tokens_seen": 530464, + "step": 245 + }, + { + "epoch": 0.040783034257748776, + "grad_norm": 8.592738151550293, + "learning_rate": 2.030995106035889e-06, + "loss": 1.3644, + "num_input_tokens_seen": 541952, + "step": 250 + }, + { + "epoch": 0.041598694942903754, + "grad_norm": 2.5240113735198975, + "learning_rate": 2.071778140293638e-06, + "loss": 0.5893, + "num_input_tokens_seen": 551424, + "step": 255 + }, + { + "epoch": 0.04241435562805873, + "grad_norm": 5.229653358459473, + "learning_rate": 2.1125611745513866e-06, + "loss": 1.5085, + "num_input_tokens_seen": 561440, + "step": 260 + }, + { + "epoch": 0.0432300163132137, + "grad_norm": 6.8039751052856445, + "learning_rate": 2.1533442088091355e-06, + "loss": 1.1373, + "num_input_tokens_seen": 573440, + "step": 265 + }, + { + "epoch": 0.04404567699836868, + "grad_norm": 4.531702518463135, + "learning_rate": 2.1941272430668844e-06, + "loss": 0.529, + "num_input_tokens_seen": 584032, + "step": 270 + }, + { + "epoch": 0.044861337683523655, + "grad_norm": 9.74407958984375, + "learning_rate": 2.2349102773246333e-06, + "loss": 1.0011, + "num_input_tokens_seen": 595328, + "step": 275 + }, + { + "epoch": 0.04567699836867863, + "grad_norm": 3.3898370265960693, + "learning_rate": 2.275693311582382e-06, + "loss": 1.2622, + "num_input_tokens_seen": 605760, + "step": 280 + }, + { + "epoch": 0.0464926590538336, + "grad_norm": 4.497292995452881, + "learning_rate": 2.3164763458401307e-06, + "loss": 0.8188, + "num_input_tokens_seen": 616384, + "step": 285 + }, + { + "epoch": 0.04730831973898858, + "grad_norm": 3.677025318145752, + "learning_rate": 2.357259380097879e-06, + "loss": 0.8144, + "num_input_tokens_seen": 626880, + "step": 290 + }, + { + "epoch": 0.04812398042414356, + "grad_norm": 1.748076319694519, + "learning_rate": 2.398042414355628e-06, + "loss": 1.0358, + "num_input_tokens_seen": 637536, + "step": 295 + }, + { + "epoch": 0.048939641109298535, + "grad_norm": 7.991934299468994, + "learning_rate": 2.4388254486133766e-06, + "loss": 0.9524, + "num_input_tokens_seen": 648000, + "step": 300 + }, + { + "epoch": 0.049755301794453505, + "grad_norm": 3.546271324157715, + "learning_rate": 2.4796084828711255e-06, + "loss": 1.5631, + "num_input_tokens_seen": 659648, + "step": 305 + }, + { + "epoch": 0.05057096247960848, + "grad_norm": 3.526733636856079, + "learning_rate": 2.5203915171288745e-06, + "loss": 0.7609, + "num_input_tokens_seen": 669920, + "step": 310 + }, + { + "epoch": 0.05138662316476346, + "grad_norm": 3.7879226207733154, + "learning_rate": 2.5611745513866234e-06, + "loss": 1.0211, + "num_input_tokens_seen": 681216, + "step": 315 + }, + { + "epoch": 0.052202283849918436, + "grad_norm": 4.732941627502441, + "learning_rate": 2.6019575856443723e-06, + "loss": 1.2922, + "num_input_tokens_seen": 691872, + "step": 320 + }, + { + "epoch": 0.05301794453507341, + "grad_norm": 6.640748977661133, + "learning_rate": 2.6427406199021208e-06, + "loss": 0.6389, + "num_input_tokens_seen": 702208, + "step": 325 + }, + { + "epoch": 0.053833605220228384, + "grad_norm": 5.588031768798828, + "learning_rate": 2.6835236541598697e-06, + "loss": 0.8367, + "num_input_tokens_seen": 712288, + "step": 330 + }, + { + "epoch": 0.05464926590538336, + "grad_norm": 9.681129455566406, + "learning_rate": 2.7243066884176186e-06, + "loss": 1.524, + "num_input_tokens_seen": 722976, + "step": 335 + }, + { + "epoch": 0.05546492659053834, + "grad_norm": 2.907257318496704, + "learning_rate": 2.7650897226753675e-06, + "loss": 0.7539, + "num_input_tokens_seen": 733760, + "step": 340 + }, + { + "epoch": 0.05628058727569331, + "grad_norm": 2.9278414249420166, + "learning_rate": 2.805872756933116e-06, + "loss": 0.9993, + "num_input_tokens_seen": 743520, + "step": 345 + }, + { + "epoch": 0.057096247960848286, + "grad_norm": 2.478440046310425, + "learning_rate": 2.8466557911908645e-06, + "loss": 0.2686, + "num_input_tokens_seen": 753984, + "step": 350 + }, + { + "epoch": 0.05791190864600326, + "grad_norm": 3.4064347743988037, + "learning_rate": 2.8874388254486134e-06, + "loss": 0.6754, + "num_input_tokens_seen": 765376, + "step": 355 + }, + { + "epoch": 0.05872756933115824, + "grad_norm": 4.806743621826172, + "learning_rate": 2.9282218597063623e-06, + "loss": 1.2103, + "num_input_tokens_seen": 775904, + "step": 360 + }, + { + "epoch": 0.05954323001631321, + "grad_norm": 2.5767345428466797, + "learning_rate": 2.969004893964111e-06, + "loss": 1.1001, + "num_input_tokens_seen": 788000, + "step": 365 + }, + { + "epoch": 0.06035889070146819, + "grad_norm": 7.398701190948486, + "learning_rate": 3.0097879282218597e-06, + "loss": 1.3833, + "num_input_tokens_seen": 798816, + "step": 370 + }, + { + "epoch": 0.061174551386623165, + "grad_norm": 2.2194998264312744, + "learning_rate": 3.0505709624796087e-06, + "loss": 0.535, + "num_input_tokens_seen": 809856, + "step": 375 + }, + { + "epoch": 0.06199021207177814, + "grad_norm": 2.4408185482025146, + "learning_rate": 3.0913539967373576e-06, + "loss": 0.8373, + "num_input_tokens_seen": 820448, + "step": 380 + }, + { + "epoch": 0.06280587275693311, + "grad_norm": 7.254020690917969, + "learning_rate": 3.132137030995106e-06, + "loss": 1.4735, + "num_input_tokens_seen": 830624, + "step": 385 + }, + { + "epoch": 0.0636215334420881, + "grad_norm": 2.5482029914855957, + "learning_rate": 3.1729200652528554e-06, + "loss": 0.918, + "num_input_tokens_seen": 840224, + "step": 390 + }, + { + "epoch": 0.06443719412724307, + "grad_norm": 6.943422794342041, + "learning_rate": 3.213703099510604e-06, + "loss": 0.9464, + "num_input_tokens_seen": 850176, + "step": 395 + }, + { + "epoch": 0.06525285481239804, + "grad_norm": 3.934837579727173, + "learning_rate": 3.2544861337683524e-06, + "loss": 0.6153, + "num_input_tokens_seen": 861056, + "step": 400 + }, + { + "epoch": 0.06606851549755302, + "grad_norm": 3.1032795906066895, + "learning_rate": 3.2952691680261013e-06, + "loss": 1.1709, + "num_input_tokens_seen": 871360, + "step": 405 + }, + { + "epoch": 0.06688417618270799, + "grad_norm": 9.995521545410156, + "learning_rate": 3.33605220228385e-06, + "loss": 2.0494, + "num_input_tokens_seen": 882656, + "step": 410 + }, + { + "epoch": 0.06769983686786298, + "grad_norm": 3.1664140224456787, + "learning_rate": 3.3768352365415987e-06, + "loss": 0.8906, + "num_input_tokens_seen": 893952, + "step": 415 + }, + { + "epoch": 0.06851549755301795, + "grad_norm": 6.3491387367248535, + "learning_rate": 3.4176182707993476e-06, + "loss": 0.8215, + "num_input_tokens_seen": 905472, + "step": 420 + }, + { + "epoch": 0.06933115823817292, + "grad_norm": 2.482991933822632, + "learning_rate": 3.458401305057096e-06, + "loss": 0.9371, + "num_input_tokens_seen": 916000, + "step": 425 + }, + { + "epoch": 0.0701468189233279, + "grad_norm": 3.66642689704895, + "learning_rate": 3.4991843393148455e-06, + "loss": 1.0555, + "num_input_tokens_seen": 926944, + "step": 430 + }, + { + "epoch": 0.07096247960848287, + "grad_norm": 3.7871196269989014, + "learning_rate": 3.539967373572594e-06, + "loss": 1.1158, + "num_input_tokens_seen": 938496, + "step": 435 + }, + { + "epoch": 0.07177814029363784, + "grad_norm": 1.7797225713729858, + "learning_rate": 3.5807504078303425e-06, + "loss": 1.1119, + "num_input_tokens_seen": 949472, + "step": 440 + }, + { + "epoch": 0.07259380097879282, + "grad_norm": 8.044276237487793, + "learning_rate": 3.621533442088092e-06, + "loss": 1.1674, + "num_input_tokens_seen": 959712, + "step": 445 + }, + { + "epoch": 0.0734094616639478, + "grad_norm": 7.116984844207764, + "learning_rate": 3.6623164763458403e-06, + "loss": 1.5467, + "num_input_tokens_seen": 969888, + "step": 450 + }, + { + "epoch": 0.07422512234910278, + "grad_norm": 3.5360755920410156, + "learning_rate": 3.7030995106035896e-06, + "loss": 1.2204, + "num_input_tokens_seen": 981024, + "step": 455 + }, + { + "epoch": 0.07504078303425775, + "grad_norm": 8.104681015014648, + "learning_rate": 3.743882544861338e-06, + "loss": 0.6813, + "num_input_tokens_seen": 991680, + "step": 460 + }, + { + "epoch": 0.07585644371941272, + "grad_norm": 4.070034503936768, + "learning_rate": 3.7846655791190866e-06, + "loss": 1.3768, + "num_input_tokens_seen": 1003008, + "step": 465 + }, + { + "epoch": 0.0766721044045677, + "grad_norm": 2.2653777599334717, + "learning_rate": 3.8254486133768355e-06, + "loss": 0.7309, + "num_input_tokens_seen": 1014336, + "step": 470 + }, + { + "epoch": 0.07748776508972267, + "grad_norm": 5.1151227951049805, + "learning_rate": 3.866231647634584e-06, + "loss": 1.3279, + "num_input_tokens_seen": 1024960, + "step": 475 + }, + { + "epoch": 0.07830342577487764, + "grad_norm": 2.843921184539795, + "learning_rate": 3.9070146818923325e-06, + "loss": 0.483, + "num_input_tokens_seen": 1034880, + "step": 480 + }, + { + "epoch": 0.07911908646003263, + "grad_norm": 6.089860439300537, + "learning_rate": 3.947797716150082e-06, + "loss": 1.6538, + "num_input_tokens_seen": 1046112, + "step": 485 + }, + { + "epoch": 0.0799347471451876, + "grad_norm": 7.09505033493042, + "learning_rate": 3.98858075040783e-06, + "loss": 1.3348, + "num_input_tokens_seen": 1056864, + "step": 490 + }, + { + "epoch": 0.08075040783034258, + "grad_norm": 2.155876398086548, + "learning_rate": 4.02936378466558e-06, + "loss": 0.8634, + "num_input_tokens_seen": 1067072, + "step": 495 + }, + { + "epoch": 0.08156606851549755, + "grad_norm": 6.383486747741699, + "learning_rate": 4.070146818923328e-06, + "loss": 0.8931, + "num_input_tokens_seen": 1079136, + "step": 500 + }, + { + "epoch": 0.08238172920065252, + "grad_norm": 3.3095953464508057, + "learning_rate": 4.110929853181077e-06, + "loss": 1.2502, + "num_input_tokens_seen": 1089408, + "step": 505 + }, + { + "epoch": 0.08319738988580751, + "grad_norm": 4.448249340057373, + "learning_rate": 4.151712887438826e-06, + "loss": 0.6225, + "num_input_tokens_seen": 1099488, + "step": 510 + }, + { + "epoch": 0.08401305057096248, + "grad_norm": 6.322920322418213, + "learning_rate": 4.1924959216965745e-06, + "loss": 1.2978, + "num_input_tokens_seen": 1110464, + "step": 515 + }, + { + "epoch": 0.08482871125611746, + "grad_norm": 7.788602352142334, + "learning_rate": 4.233278955954323e-06, + "loss": 0.7351, + "num_input_tokens_seen": 1121216, + "step": 520 + }, + { + "epoch": 0.08564437194127243, + "grad_norm": 8.174678802490234, + "learning_rate": 4.274061990212072e-06, + "loss": 1.1831, + "num_input_tokens_seen": 1132192, + "step": 525 + }, + { + "epoch": 0.0864600326264274, + "grad_norm": 4.423396110534668, + "learning_rate": 4.314845024469821e-06, + "loss": 1.2018, + "num_input_tokens_seen": 1142944, + "step": 530 + }, + { + "epoch": 0.08727569331158239, + "grad_norm": 6.594726085662842, + "learning_rate": 4.35562805872757e-06, + "loss": 1.1559, + "num_input_tokens_seen": 1154080, + "step": 535 + }, + { + "epoch": 0.08809135399673736, + "grad_norm": 5.156267166137695, + "learning_rate": 4.396411092985319e-06, + "loss": 1.0537, + "num_input_tokens_seen": 1165120, + "step": 540 + }, + { + "epoch": 0.08890701468189233, + "grad_norm": 4.245129585266113, + "learning_rate": 4.437194127243067e-06, + "loss": 0.7722, + "num_input_tokens_seen": 1176352, + "step": 545 + }, + { + "epoch": 0.08972267536704731, + "grad_norm": 3.9522593021392822, + "learning_rate": 4.477977161500816e-06, + "loss": 0.4317, + "num_input_tokens_seen": 1188288, + "step": 550 + }, + { + "epoch": 0.09053833605220228, + "grad_norm": 2.710925817489624, + "learning_rate": 4.518760195758565e-06, + "loss": 0.9999, + "num_input_tokens_seen": 1200800, + "step": 555 + }, + { + "epoch": 0.09135399673735727, + "grad_norm": 6.8713459968566895, + "learning_rate": 4.5595432300163135e-06, + "loss": 1.0474, + "num_input_tokens_seen": 1211360, + "step": 560 + }, + { + "epoch": 0.09216965742251224, + "grad_norm": 2.6632473468780518, + "learning_rate": 4.600326264274062e-06, + "loss": 0.6629, + "num_input_tokens_seen": 1221184, + "step": 565 + }, + { + "epoch": 0.0929853181076672, + "grad_norm": 2.766153335571289, + "learning_rate": 4.6411092985318105e-06, + "loss": 1.2558, + "num_input_tokens_seen": 1231808, + "step": 570 + }, + { + "epoch": 0.09380097879282219, + "grad_norm": 4.6623640060424805, + "learning_rate": 4.68189233278956e-06, + "loss": 0.5445, + "num_input_tokens_seen": 1242560, + "step": 575 + }, + { + "epoch": 0.09461663947797716, + "grad_norm": 2.658595085144043, + "learning_rate": 4.722675367047308e-06, + "loss": 0.5414, + "num_input_tokens_seen": 1253440, + "step": 580 + }, + { + "epoch": 0.09543230016313213, + "grad_norm": 6.455071449279785, + "learning_rate": 4.763458401305057e-06, + "loss": 1.5696, + "num_input_tokens_seen": 1263616, + "step": 585 + }, + { + "epoch": 0.09624796084828711, + "grad_norm": 5.618657112121582, + "learning_rate": 4.804241435562806e-06, + "loss": 0.506, + "num_input_tokens_seen": 1275488, + "step": 590 + }, + { + "epoch": 0.09706362153344208, + "grad_norm": 2.9246697425842285, + "learning_rate": 4.845024469820555e-06, + "loss": 0.8798, + "num_input_tokens_seen": 1287200, + "step": 595 + }, + { + "epoch": 0.09787928221859707, + "grad_norm": 3.4241175651550293, + "learning_rate": 4.885807504078304e-06, + "loss": 1.3947, + "num_input_tokens_seen": 1298336, + "step": 600 + }, + { + "epoch": 0.09869494290375204, + "grad_norm": 5.635351657867432, + "learning_rate": 4.9265905383360524e-06, + "loss": 1.0792, + "num_input_tokens_seen": 1309056, + "step": 605 + }, + { + "epoch": 0.09951060358890701, + "grad_norm": 6.215715408325195, + "learning_rate": 4.967373572593801e-06, + "loss": 0.7391, + "num_input_tokens_seen": 1320864, + "step": 610 + }, + { + "epoch": 0.100326264274062, + "grad_norm": 4.762406826019287, + "learning_rate": 5.00815660685155e-06, + "loss": 1.3666, + "num_input_tokens_seen": 1332448, + "step": 615 + }, + { + "epoch": 0.10114192495921696, + "grad_norm": 4.497780799865723, + "learning_rate": 5.048939641109299e-06, + "loss": 1.1731, + "num_input_tokens_seen": 1342368, + "step": 620 + }, + { + "epoch": 0.10195758564437195, + "grad_norm": 7.246380805969238, + "learning_rate": 5.089722675367047e-06, + "loss": 0.9549, + "num_input_tokens_seen": 1353024, + "step": 625 + }, + { + "epoch": 0.10277324632952692, + "grad_norm": 7.045072555541992, + "learning_rate": 5.130505709624797e-06, + "loss": 0.9715, + "num_input_tokens_seen": 1364000, + "step": 630 + }, + { + "epoch": 0.10358890701468189, + "grad_norm": 6.610607624053955, + "learning_rate": 5.171288743882545e-06, + "loss": 1.5019, + "num_input_tokens_seen": 1374784, + "step": 635 + }, + { + "epoch": 0.10440456769983687, + "grad_norm": 2.0536601543426514, + "learning_rate": 5.2120717781402944e-06, + "loss": 0.8855, + "num_input_tokens_seen": 1384192, + "step": 640 + }, + { + "epoch": 0.10522022838499184, + "grad_norm": 3.158419609069824, + "learning_rate": 5.252854812398043e-06, + "loss": 0.7457, + "num_input_tokens_seen": 1395392, + "step": 645 + }, + { + "epoch": 0.10603588907014681, + "grad_norm": 5.4659624099731445, + "learning_rate": 5.293637846655791e-06, + "loss": 0.7337, + "num_input_tokens_seen": 1406208, + "step": 650 + }, + { + "epoch": 0.1068515497553018, + "grad_norm": 4.692897796630859, + "learning_rate": 5.334420880913541e-06, + "loss": 0.6057, + "num_input_tokens_seen": 1417024, + "step": 655 + }, + { + "epoch": 0.10766721044045677, + "grad_norm": 3.56672739982605, + "learning_rate": 5.375203915171289e-06, + "loss": 1.0009, + "num_input_tokens_seen": 1427808, + "step": 660 + }, + { + "epoch": 0.10848287112561175, + "grad_norm": 2.1490917205810547, + "learning_rate": 5.415986949429038e-06, + "loss": 0.5541, + "num_input_tokens_seen": 1437344, + "step": 665 + }, + { + "epoch": 0.10929853181076672, + "grad_norm": 5.393669128417969, + "learning_rate": 5.456769983686786e-06, + "loss": 1.0449, + "num_input_tokens_seen": 1448352, + "step": 670 + }, + { + "epoch": 0.11011419249592169, + "grad_norm": 3.691850423812866, + "learning_rate": 5.4975530179445356e-06, + "loss": 0.5626, + "num_input_tokens_seen": 1459520, + "step": 675 + }, + { + "epoch": 0.11092985318107668, + "grad_norm": 4.579554557800293, + "learning_rate": 5.538336052202284e-06, + "loss": 0.7368, + "num_input_tokens_seen": 1470432, + "step": 680 + }, + { + "epoch": 0.11174551386623165, + "grad_norm": 3.071270704269409, + "learning_rate": 5.5791190864600326e-06, + "loss": 0.7172, + "num_input_tokens_seen": 1480544, + "step": 685 + }, + { + "epoch": 0.11256117455138662, + "grad_norm": 5.810830116271973, + "learning_rate": 5.619902120717781e-06, + "loss": 1.094, + "num_input_tokens_seen": 1490752, + "step": 690 + }, + { + "epoch": 0.1133768352365416, + "grad_norm": 8.176682472229004, + "learning_rate": 5.66068515497553e-06, + "loss": 0.9902, + "num_input_tokens_seen": 1502240, + "step": 695 + }, + { + "epoch": 0.11419249592169657, + "grad_norm": 4.889057159423828, + "learning_rate": 5.701468189233279e-06, + "loss": 1.1874, + "num_input_tokens_seen": 1512256, + "step": 700 + }, + { + "epoch": 0.11500815660685156, + "grad_norm": 1.9960483312606812, + "learning_rate": 5.742251223491028e-06, + "loss": 1.0746, + "num_input_tokens_seen": 1523168, + "step": 705 + }, + { + "epoch": 0.11582381729200653, + "grad_norm": 2.9320225715637207, + "learning_rate": 5.783034257748777e-06, + "loss": 0.5092, + "num_input_tokens_seen": 1534976, + "step": 710 + }, + { + "epoch": 0.1166394779771615, + "grad_norm": 2.7793667316436768, + "learning_rate": 5.823817292006525e-06, + "loss": 1.1876, + "num_input_tokens_seen": 1545856, + "step": 715 + }, + { + "epoch": 0.11745513866231648, + "grad_norm": 2.030134677886963, + "learning_rate": 5.8646003262642745e-06, + "loss": 0.316, + "num_input_tokens_seen": 1555680, + "step": 720 + }, + { + "epoch": 0.11827079934747145, + "grad_norm": 1.2774195671081543, + "learning_rate": 5.905383360522023e-06, + "loss": 0.3569, + "num_input_tokens_seen": 1567328, + "step": 725 + }, + { + "epoch": 0.11908646003262642, + "grad_norm": 4.449801445007324, + "learning_rate": 5.9461663947797715e-06, + "loss": 1.2828, + "num_input_tokens_seen": 1577184, + "step": 730 + }, + { + "epoch": 0.1199021207177814, + "grad_norm": 9.261007308959961, + "learning_rate": 5.986949429037521e-06, + "loss": 1.1463, + "num_input_tokens_seen": 1588544, + "step": 735 + }, + { + "epoch": 0.12071778140293637, + "grad_norm": 3.738579511642456, + "learning_rate": 6.027732463295269e-06, + "loss": 0.8595, + "num_input_tokens_seen": 1599648, + "step": 740 + }, + { + "epoch": 0.12153344208809136, + "grad_norm": 2.616783857345581, + "learning_rate": 6.068515497553019e-06, + "loss": 1.1717, + "num_input_tokens_seen": 1612096, + "step": 745 + }, + { + "epoch": 0.12234910277324633, + "grad_norm": 2.2141339778900146, + "learning_rate": 6.109298531810767e-06, + "loss": 0.5894, + "num_input_tokens_seen": 1623584, + "step": 750 + }, + { + "epoch": 0.1231647634584013, + "grad_norm": 6.61582612991333, + "learning_rate": 6.150081566068516e-06, + "loss": 1.0738, + "num_input_tokens_seen": 1634688, + "step": 755 + }, + { + "epoch": 0.12398042414355628, + "grad_norm": 3.1965808868408203, + "learning_rate": 6.190864600326265e-06, + "loss": 1.2733, + "num_input_tokens_seen": 1645600, + "step": 760 + }, + { + "epoch": 0.12479608482871125, + "grad_norm": 5.271690845489502, + "learning_rate": 6.2316476345840135e-06, + "loss": 0.4003, + "num_input_tokens_seen": 1655968, + "step": 765 + }, + { + "epoch": 0.12561174551386622, + "grad_norm": 4.063238620758057, + "learning_rate": 6.272430668841763e-06, + "loss": 1.2057, + "num_input_tokens_seen": 1666848, + "step": 770 + }, + { + "epoch": 0.1264274061990212, + "grad_norm": 8.91235637664795, + "learning_rate": 6.3132137030995105e-06, + "loss": 0.9312, + "num_input_tokens_seen": 1676512, + "step": 775 + }, + { + "epoch": 0.1272430668841762, + "grad_norm": 1.5295811891555786, + "learning_rate": 6.35399673735726e-06, + "loss": 0.93, + "num_input_tokens_seen": 1687488, + "step": 780 + }, + { + "epoch": 0.12805872756933115, + "grad_norm": 1.8265514373779297, + "learning_rate": 6.394779771615008e-06, + "loss": 1.135, + "num_input_tokens_seen": 1697312, + "step": 785 + }, + { + "epoch": 0.12887438825448613, + "grad_norm": 6.960163593292236, + "learning_rate": 6.435562805872757e-06, + "loss": 0.99, + "num_input_tokens_seen": 1708096, + "step": 790 + }, + { + "epoch": 0.12969004893964112, + "grad_norm": 1.4655123949050903, + "learning_rate": 6.476345840130506e-06, + "loss": 0.6007, + "num_input_tokens_seen": 1717824, + "step": 795 + }, + { + "epoch": 0.13050570962479607, + "grad_norm": 4.673121929168701, + "learning_rate": 6.517128874388255e-06, + "loss": 1.5462, + "num_input_tokens_seen": 1728608, + "step": 800 + }, + { + "epoch": 0.13132137030995106, + "grad_norm": 7.8018622398376465, + "learning_rate": 6.557911908646004e-06, + "loss": 0.8731, + "num_input_tokens_seen": 1739456, + "step": 805 + }, + { + "epoch": 0.13213703099510604, + "grad_norm": 3.048567771911621, + "learning_rate": 6.598694942903752e-06, + "loss": 0.8261, + "num_input_tokens_seen": 1750304, + "step": 810 + }, + { + "epoch": 0.132952691680261, + "grad_norm": 7.084068775177002, + "learning_rate": 6.639477977161501e-06, + "loss": 0.5847, + "num_input_tokens_seen": 1762496, + "step": 815 + }, + { + "epoch": 0.13376835236541598, + "grad_norm": 3.578463554382324, + "learning_rate": 6.68026101141925e-06, + "loss": 0.9494, + "num_input_tokens_seen": 1772640, + "step": 820 + }, + { + "epoch": 0.13458401305057097, + "grad_norm": 0.9019239544868469, + "learning_rate": 6.721044045676998e-06, + "loss": 0.5508, + "num_input_tokens_seen": 1782272, + "step": 825 + }, + { + "epoch": 0.13539967373572595, + "grad_norm": 0.43010106682777405, + "learning_rate": 6.761827079934747e-06, + "loss": 0.5446, + "num_input_tokens_seen": 1793120, + "step": 830 + }, + { + "epoch": 0.1362153344208809, + "grad_norm": 2.677826404571533, + "learning_rate": 6.802610114192497e-06, + "loss": 0.7698, + "num_input_tokens_seen": 1803264, + "step": 835 + }, + { + "epoch": 0.1370309951060359, + "grad_norm": 3.998385429382324, + "learning_rate": 6.843393148450244e-06, + "loss": 0.2121, + "num_input_tokens_seen": 1813088, + "step": 840 + }, + { + "epoch": 0.13784665579119088, + "grad_norm": 0.8953873515129089, + "learning_rate": 6.884176182707994e-06, + "loss": 0.8705, + "num_input_tokens_seen": 1824032, + "step": 845 + }, + { + "epoch": 0.13866231647634583, + "grad_norm": 2.784470319747925, + "learning_rate": 6.924959216965743e-06, + "loss": 0.9661, + "num_input_tokens_seen": 1836512, + "step": 850 + }, + { + "epoch": 0.13947797716150082, + "grad_norm": 2.844362258911133, + "learning_rate": 6.965742251223491e-06, + "loss": 0.4676, + "num_input_tokens_seen": 1848256, + "step": 855 + }, + { + "epoch": 0.1402936378466558, + "grad_norm": 4.121376991271973, + "learning_rate": 7.00652528548124e-06, + "loss": 0.4564, + "num_input_tokens_seen": 1859040, + "step": 860 + }, + { + "epoch": 0.14110929853181076, + "grad_norm": 0.6507602334022522, + "learning_rate": 7.047308319738989e-06, + "loss": 0.5684, + "num_input_tokens_seen": 1869248, + "step": 865 + }, + { + "epoch": 0.14192495921696574, + "grad_norm": 5.145756244659424, + "learning_rate": 7.088091353996739e-06, + "loss": 1.0076, + "num_input_tokens_seen": 1881184, + "step": 870 + }, + { + "epoch": 0.14274061990212072, + "grad_norm": 0.485206663608551, + "learning_rate": 7.128874388254486e-06, + "loss": 0.6238, + "num_input_tokens_seen": 1893760, + "step": 875 + }, + { + "epoch": 0.14355628058727568, + "grad_norm": 3.875338077545166, + "learning_rate": 7.169657422512236e-06, + "loss": 0.45, + "num_input_tokens_seen": 1904800, + "step": 880 + }, + { + "epoch": 0.14437194127243066, + "grad_norm": 1.1071562767028809, + "learning_rate": 7.210440456769985e-06, + "loss": 0.5303, + "num_input_tokens_seen": 1915296, + "step": 885 + }, + { + "epoch": 0.14518760195758565, + "grad_norm": 10.392167091369629, + "learning_rate": 7.251223491027733e-06, + "loss": 1.091, + "num_input_tokens_seen": 1925152, + "step": 890 + }, + { + "epoch": 0.14600326264274063, + "grad_norm": 4.559696197509766, + "learning_rate": 7.292006525285482e-06, + "loss": 0.4553, + "num_input_tokens_seen": 1935200, + "step": 895 + }, + { + "epoch": 0.1468189233278956, + "grad_norm": 0.3478938937187195, + "learning_rate": 7.3327895595432304e-06, + "loss": 0.7881, + "num_input_tokens_seen": 1945632, + "step": 900 + }, + { + "epoch": 0.14763458401305057, + "grad_norm": 6.111306190490723, + "learning_rate": 7.373572593800979e-06, + "loss": 1.103, + "num_input_tokens_seen": 1956992, + "step": 905 + }, + { + "epoch": 0.14845024469820556, + "grad_norm": 0.4845161437988281, + "learning_rate": 7.414355628058728e-06, + "loss": 0.8208, + "num_input_tokens_seen": 1968800, + "step": 910 + }, + { + "epoch": 0.14926590538336051, + "grad_norm": 2.187330722808838, + "learning_rate": 7.455138662316477e-06, + "loss": 0.5876, + "num_input_tokens_seen": 1979424, + "step": 915 + }, + { + "epoch": 0.1500815660685155, + "grad_norm": 0.325928270816803, + "learning_rate": 7.495921696574225e-06, + "loss": 1.0476, + "num_input_tokens_seen": 1990976, + "step": 920 + }, + { + "epoch": 0.15089722675367048, + "grad_norm": 6.140865325927734, + "learning_rate": 7.536704730831974e-06, + "loss": 0.4415, + "num_input_tokens_seen": 2002432, + "step": 925 + }, + { + "epoch": 0.15171288743882544, + "grad_norm": 0.2894853353500366, + "learning_rate": 7.577487765089723e-06, + "loss": 0.3785, + "num_input_tokens_seen": 2013632, + "step": 930 + }, + { + "epoch": 0.15252854812398042, + "grad_norm": 5.740682601928711, + "learning_rate": 7.6182707993474724e-06, + "loss": 0.9366, + "num_input_tokens_seen": 2024960, + "step": 935 + }, + { + "epoch": 0.1533442088091354, + "grad_norm": 5.101397514343262, + "learning_rate": 7.659053833605221e-06, + "loss": 0.5329, + "num_input_tokens_seen": 2035712, + "step": 940 + }, + { + "epoch": 0.15415986949429036, + "grad_norm": 0.26534295082092285, + "learning_rate": 7.69983686786297e-06, + "loss": 0.3169, + "num_input_tokens_seen": 2046656, + "step": 945 + }, + { + "epoch": 0.15497553017944535, + "grad_norm": 0.6722267270088196, + "learning_rate": 7.740619902120718e-06, + "loss": 0.8392, + "num_input_tokens_seen": 2058112, + "step": 950 + }, + { + "epoch": 0.15579119086460033, + "grad_norm": 4.746047496795654, + "learning_rate": 7.781402936378467e-06, + "loss": 0.7101, + "num_input_tokens_seen": 2068768, + "step": 955 + }, + { + "epoch": 0.1566068515497553, + "grad_norm": 5.68007755279541, + "learning_rate": 7.822185970636217e-06, + "loss": 0.6575, + "num_input_tokens_seen": 2080928, + "step": 960 + }, + { + "epoch": 0.15742251223491027, + "grad_norm": 0.674821138381958, + "learning_rate": 7.862969004893964e-06, + "loss": 0.5901, + "num_input_tokens_seen": 2090784, + "step": 965 + }, + { + "epoch": 0.15823817292006526, + "grad_norm": 6.950295448303223, + "learning_rate": 7.903752039151714e-06, + "loss": 0.3123, + "num_input_tokens_seen": 2100416, + "step": 970 + }, + { + "epoch": 0.15905383360522024, + "grad_norm": 0.8636399507522583, + "learning_rate": 7.944535073409461e-06, + "loss": 0.1567, + "num_input_tokens_seen": 2111424, + "step": 975 + }, + { + "epoch": 0.1598694942903752, + "grad_norm": 0.2521192133426666, + "learning_rate": 7.98531810766721e-06, + "loss": 0.3353, + "num_input_tokens_seen": 2122176, + "step": 980 + }, + { + "epoch": 0.16068515497553018, + "grad_norm": 0.13819894194602966, + "learning_rate": 8.026101141924958e-06, + "loss": 0.1369, + "num_input_tokens_seen": 2132928, + "step": 985 + }, + { + "epoch": 0.16150081566068517, + "grad_norm": 3.2420027256011963, + "learning_rate": 8.066884176182708e-06, + "loss": 0.1868, + "num_input_tokens_seen": 2144480, + "step": 990 + }, + { + "epoch": 0.16231647634584012, + "grad_norm": 2.645141124725342, + "learning_rate": 8.107667210440457e-06, + "loss": 0.2884, + "num_input_tokens_seen": 2154272, + "step": 995 + }, + { + "epoch": 0.1631321370309951, + "grad_norm": 0.6991066932678223, + "learning_rate": 8.148450244698205e-06, + "loss": 0.0201, + "num_input_tokens_seen": 2163936, + "step": 1000 + }, + { + "epoch": 0.1639477977161501, + "grad_norm": 0.7994592189788818, + "learning_rate": 8.189233278955954e-06, + "loss": 0.3993, + "num_input_tokens_seen": 2175392, + "step": 1005 + }, + { + "epoch": 0.16476345840130505, + "grad_norm": 0.10106869786977768, + "learning_rate": 8.230016313213703e-06, + "loss": 0.4307, + "num_input_tokens_seen": 2185120, + "step": 1010 + }, + { + "epoch": 0.16557911908646003, + "grad_norm": 7.514805793762207, + "learning_rate": 8.270799347471453e-06, + "loss": 0.7977, + "num_input_tokens_seen": 2196864, + "step": 1015 + }, + { + "epoch": 0.16639477977161501, + "grad_norm": 4.973484039306641, + "learning_rate": 8.3115823817292e-06, + "loss": 0.4691, + "num_input_tokens_seen": 2207744, + "step": 1020 + }, + { + "epoch": 0.16721044045676997, + "grad_norm": 3.2014658451080322, + "learning_rate": 8.35236541598695e-06, + "loss": 0.9073, + "num_input_tokens_seen": 2218528, + "step": 1025 + }, + { + "epoch": 0.16802610114192496, + "grad_norm": 7.67025899887085, + "learning_rate": 8.393148450244699e-06, + "loss": 0.3349, + "num_input_tokens_seen": 2228320, + "step": 1030 + }, + { + "epoch": 0.16884176182707994, + "grad_norm": 3.3980448246002197, + "learning_rate": 8.433931484502447e-06, + "loss": 0.4665, + "num_input_tokens_seen": 2238464, + "step": 1035 + }, + { + "epoch": 0.16965742251223492, + "grad_norm": 0.09088382869958878, + "learning_rate": 8.474714518760196e-06, + "loss": 0.2671, + "num_input_tokens_seen": 2249440, + "step": 1040 + }, + { + "epoch": 0.17047308319738988, + "grad_norm": 0.13072648644447327, + "learning_rate": 8.515497553017945e-06, + "loss": 0.3816, + "num_input_tokens_seen": 2260576, + "step": 1045 + }, + { + "epoch": 0.17128874388254486, + "grad_norm": 0.9632706642150879, + "learning_rate": 8.556280587275693e-06, + "loss": 0.2803, + "num_input_tokens_seen": 2271904, + "step": 1050 + }, + { + "epoch": 0.17210440456769985, + "grad_norm": 3.8651349544525146, + "learning_rate": 8.597063621533442e-06, + "loss": 0.1723, + "num_input_tokens_seen": 2281568, + "step": 1055 + }, + { + "epoch": 0.1729200652528548, + "grad_norm": 3.410445213317871, + "learning_rate": 8.637846655791192e-06, + "loss": 0.2324, + "num_input_tokens_seen": 2292704, + "step": 1060 + }, + { + "epoch": 0.1737357259380098, + "grad_norm": 0.21268483996391296, + "learning_rate": 8.67862969004894e-06, + "loss": 0.455, + "num_input_tokens_seen": 2303904, + "step": 1065 + }, + { + "epoch": 0.17455138662316477, + "grad_norm": 3.510143995285034, + "learning_rate": 8.719412724306688e-06, + "loss": 0.1439, + "num_input_tokens_seen": 2314336, + "step": 1070 + }, + { + "epoch": 0.17536704730831973, + "grad_norm": 0.07627126574516296, + "learning_rate": 8.760195758564438e-06, + "loss": 0.0848, + "num_input_tokens_seen": 2325024, + "step": 1075 + }, + { + "epoch": 0.1761827079934747, + "grad_norm": 0.10416768491268158, + "learning_rate": 8.800978792822187e-06, + "loss": 0.3712, + "num_input_tokens_seen": 2336224, + "step": 1080 + }, + { + "epoch": 0.1769983686786297, + "grad_norm": 0.1290075033903122, + "learning_rate": 8.841761827079935e-06, + "loss": 0.0914, + "num_input_tokens_seen": 2346880, + "step": 1085 + }, + { + "epoch": 0.17781402936378465, + "grad_norm": 0.12905515730381012, + "learning_rate": 8.882544861337684e-06, + "loss": 0.2428, + "num_input_tokens_seen": 2357632, + "step": 1090 + }, + { + "epoch": 0.17862969004893964, + "grad_norm": 1.0012727975845337, + "learning_rate": 8.923327895595434e-06, + "loss": 0.2445, + "num_input_tokens_seen": 2370592, + "step": 1095 + }, + { + "epoch": 0.17944535073409462, + "grad_norm": 2.7375569343566895, + "learning_rate": 8.964110929853181e-06, + "loss": 0.5748, + "num_input_tokens_seen": 2381024, + "step": 1100 + }, + { + "epoch": 0.1802610114192496, + "grad_norm": 1.375232458114624, + "learning_rate": 9.00489396411093e-06, + "loss": 0.1205, + "num_input_tokens_seen": 2392160, + "step": 1105 + }, + { + "epoch": 0.18107667210440456, + "grad_norm": 2.8965327739715576, + "learning_rate": 9.04567699836868e-06, + "loss": 0.7723, + "num_input_tokens_seen": 2403616, + "step": 1110 + }, + { + "epoch": 0.18189233278955955, + "grad_norm": 2.3710503578186035, + "learning_rate": 9.086460032626427e-06, + "loss": 0.8811, + "num_input_tokens_seen": 2416032, + "step": 1115 + }, + { + "epoch": 0.18270799347471453, + "grad_norm": 0.08530598133802414, + "learning_rate": 9.127243066884177e-06, + "loss": 0.3497, + "num_input_tokens_seen": 2426112, + "step": 1120 + }, + { + "epoch": 0.1835236541598695, + "grad_norm": 3.0593976974487305, + "learning_rate": 9.168026101141926e-06, + "loss": 0.3664, + "num_input_tokens_seen": 2437344, + "step": 1125 + }, + { + "epoch": 0.18433931484502447, + "grad_norm": 0.09175080060958862, + "learning_rate": 9.208809135399674e-06, + "loss": 0.4302, + "num_input_tokens_seen": 2448672, + "step": 1130 + }, + { + "epoch": 0.18515497553017946, + "grad_norm": 0.07254120707511902, + "learning_rate": 9.249592169657423e-06, + "loss": 0.224, + "num_input_tokens_seen": 2459552, + "step": 1135 + }, + { + "epoch": 0.1859706362153344, + "grad_norm": 0.37052246928215027, + "learning_rate": 9.290375203915172e-06, + "loss": 0.1629, + "num_input_tokens_seen": 2469184, + "step": 1140 + }, + { + "epoch": 0.1867862969004894, + "grad_norm": 0.08568814396858215, + "learning_rate": 9.33115823817292e-06, + "loss": 0.1471, + "num_input_tokens_seen": 2481120, + "step": 1145 + }, + { + "epoch": 0.18760195758564438, + "grad_norm": 2.281127691268921, + "learning_rate": 9.37194127243067e-06, + "loss": 0.223, + "num_input_tokens_seen": 2490976, + "step": 1150 + }, + { + "epoch": 0.18841761827079934, + "grad_norm": 2.210665702819824, + "learning_rate": 9.412724306688419e-06, + "loss": 0.3644, + "num_input_tokens_seen": 2501024, + "step": 1155 + }, + { + "epoch": 0.18923327895595432, + "grad_norm": 3.8169009685516357, + "learning_rate": 9.453507340946168e-06, + "loss": 0.5174, + "num_input_tokens_seen": 2511264, + "step": 1160 + }, + { + "epoch": 0.1900489396411093, + "grad_norm": 0.08619657903909683, + "learning_rate": 9.494290375203916e-06, + "loss": 0.2152, + "num_input_tokens_seen": 2521920, + "step": 1165 + }, + { + "epoch": 0.19086460032626426, + "grad_norm": 2.668630361557007, + "learning_rate": 9.535073409461665e-06, + "loss": 0.7419, + "num_input_tokens_seen": 2533056, + "step": 1170 + }, + { + "epoch": 0.19168026101141925, + "grad_norm": 3.9117813110351562, + "learning_rate": 9.575856443719414e-06, + "loss": 0.6141, + "num_input_tokens_seen": 2544320, + "step": 1175 + }, + { + "epoch": 0.19249592169657423, + "grad_norm": 2.8921947479248047, + "learning_rate": 9.616639477977162e-06, + "loss": 0.4961, + "num_input_tokens_seen": 2555168, + "step": 1180 + }, + { + "epoch": 0.1933115823817292, + "grad_norm": 0.14350508153438568, + "learning_rate": 9.657422512234911e-06, + "loss": 0.2523, + "num_input_tokens_seen": 2566304, + "step": 1185 + }, + { + "epoch": 0.19412724306688417, + "grad_norm": 0.05376644805073738, + "learning_rate": 9.69820554649266e-06, + "loss": 0.6567, + "num_input_tokens_seen": 2575936, + "step": 1190 + }, + { + "epoch": 0.19494290375203915, + "grad_norm": 5.182711124420166, + "learning_rate": 9.738988580750408e-06, + "loss": 0.4341, + "num_input_tokens_seen": 2586528, + "step": 1195 + }, + { + "epoch": 0.19575856443719414, + "grad_norm": 1.0402570962905884, + "learning_rate": 9.779771615008158e-06, + "loss": 0.4649, + "num_input_tokens_seen": 2596064, + "step": 1200 + }, + { + "epoch": 0.1965742251223491, + "grad_norm": 0.6327175498008728, + "learning_rate": 9.820554649265905e-06, + "loss": 0.53, + "num_input_tokens_seen": 2606464, + "step": 1205 + }, + { + "epoch": 0.19738988580750408, + "grad_norm": 2.819385290145874, + "learning_rate": 9.861337683523655e-06, + "loss": 0.4592, + "num_input_tokens_seen": 2617568, + "step": 1210 + }, + { + "epoch": 0.19820554649265906, + "grad_norm": 0.046848125755786896, + "learning_rate": 9.902120717781402e-06, + "loss": 0.2271, + "num_input_tokens_seen": 2628928, + "step": 1215 + }, + { + "epoch": 0.19902120717781402, + "grad_norm": 3.546823024749756, + "learning_rate": 9.942903752039152e-06, + "loss": 0.2925, + "num_input_tokens_seen": 2641344, + "step": 1220 + }, + { + "epoch": 0.199836867862969, + "grad_norm": 7.206093788146973, + "learning_rate": 9.983686786296901e-06, + "loss": 0.1126, + "num_input_tokens_seen": 2651584, + "step": 1225 + }, + { + "epoch": 0.200652528548124, + "grad_norm": 2.7622852325439453, + "learning_rate": 1.0024469820554649e-05, + "loss": 0.1872, + "num_input_tokens_seen": 2663360, + "step": 1230 + }, + { + "epoch": 0.20146818923327894, + "grad_norm": 0.7607209086418152, + "learning_rate": 1.0065252854812398e-05, + "loss": 0.3735, + "num_input_tokens_seen": 2675168, + "step": 1235 + }, + { + "epoch": 0.20228384991843393, + "grad_norm": 4.236608028411865, + "learning_rate": 1.0106035889070147e-05, + "loss": 0.6866, + "num_input_tokens_seen": 2685952, + "step": 1240 + }, + { + "epoch": 0.2030995106035889, + "grad_norm": 2.818138837814331, + "learning_rate": 1.0146818923327895e-05, + "loss": 0.2428, + "num_input_tokens_seen": 2695488, + "step": 1245 + }, + { + "epoch": 0.2039151712887439, + "grad_norm": 3.535099983215332, + "learning_rate": 1.0187601957585644e-05, + "loss": 0.3125, + "num_input_tokens_seen": 2706880, + "step": 1250 + }, + { + "epoch": 0.20473083197389885, + "grad_norm": 2.7588858604431152, + "learning_rate": 1.0228384991843394e-05, + "loss": 0.8008, + "num_input_tokens_seen": 2717184, + "step": 1255 + }, + { + "epoch": 0.20554649265905384, + "grad_norm": 0.056320056319236755, + "learning_rate": 1.0269168026101141e-05, + "loss": 0.2729, + "num_input_tokens_seen": 2729184, + "step": 1260 + }, + { + "epoch": 0.20636215334420882, + "grad_norm": 2.690307140350342, + "learning_rate": 1.030995106035889e-05, + "loss": 0.2161, + "num_input_tokens_seen": 2740768, + "step": 1265 + }, + { + "epoch": 0.20717781402936378, + "grad_norm": 0.5788370370864868, + "learning_rate": 1.035073409461664e-05, + "loss": 0.3405, + "num_input_tokens_seen": 2751328, + "step": 1270 + }, + { + "epoch": 0.20799347471451876, + "grad_norm": 0.14811070263385773, + "learning_rate": 1.0391517128874388e-05, + "loss": 0.4065, + "num_input_tokens_seen": 2762464, + "step": 1275 + }, + { + "epoch": 0.20880913539967375, + "grad_norm": 0.13844524323940277, + "learning_rate": 1.0432300163132137e-05, + "loss": 0.1291, + "num_input_tokens_seen": 2773824, + "step": 1280 + }, + { + "epoch": 0.2096247960848287, + "grad_norm": 2.7933778762817383, + "learning_rate": 1.0473083197389886e-05, + "loss": 0.6, + "num_input_tokens_seen": 2784544, + "step": 1285 + }, + { + "epoch": 0.21044045676998369, + "grad_norm": 0.03066830337047577, + "learning_rate": 1.0513866231647634e-05, + "loss": 0.1149, + "num_input_tokens_seen": 2796032, + "step": 1290 + }, + { + "epoch": 0.21125611745513867, + "grad_norm": 0.04977479204535484, + "learning_rate": 1.0554649265905383e-05, + "loss": 0.3102, + "num_input_tokens_seen": 2808096, + "step": 1295 + }, + { + "epoch": 0.21207177814029363, + "grad_norm": 2.464812994003296, + "learning_rate": 1.0595432300163133e-05, + "loss": 0.136, + "num_input_tokens_seen": 2818080, + "step": 1300 + }, + { + "epoch": 0.2128874388254486, + "grad_norm": 0.20755323767662048, + "learning_rate": 1.0636215334420882e-05, + "loss": 0.3619, + "num_input_tokens_seen": 2829248, + "step": 1305 + }, + { + "epoch": 0.2137030995106036, + "grad_norm": 0.06139184162020683, + "learning_rate": 1.067699836867863e-05, + "loss": 0.2034, + "num_input_tokens_seen": 2840288, + "step": 1310 + }, + { + "epoch": 0.21451876019575855, + "grad_norm": 2.8132433891296387, + "learning_rate": 1.0717781402936379e-05, + "loss": 0.4125, + "num_input_tokens_seen": 2849440, + "step": 1315 + }, + { + "epoch": 0.21533442088091354, + "grad_norm": 0.027258845046162605, + "learning_rate": 1.0758564437194128e-05, + "loss": 0.2683, + "num_input_tokens_seen": 2861056, + "step": 1320 + }, + { + "epoch": 0.21615008156606852, + "grad_norm": 1.155172348022461, + "learning_rate": 1.0799347471451876e-05, + "loss": 0.1487, + "num_input_tokens_seen": 2872832, + "step": 1325 + }, + { + "epoch": 0.2169657422512235, + "grad_norm": 4.498603820800781, + "learning_rate": 1.0840130505709625e-05, + "loss": 0.762, + "num_input_tokens_seen": 2883968, + "step": 1330 + }, + { + "epoch": 0.21778140293637846, + "grad_norm": 0.03457179293036461, + "learning_rate": 1.0880913539967375e-05, + "loss": 0.1753, + "num_input_tokens_seen": 2894880, + "step": 1335 + }, + { + "epoch": 0.21859706362153344, + "grad_norm": 3.0344653129577637, + "learning_rate": 1.0921696574225122e-05, + "loss": 0.5336, + "num_input_tokens_seen": 2905216, + "step": 1340 + }, + { + "epoch": 0.21941272430668843, + "grad_norm": 4.007531642913818, + "learning_rate": 1.0962479608482872e-05, + "loss": 0.1149, + "num_input_tokens_seen": 2917152, + "step": 1345 + }, + { + "epoch": 0.22022838499184338, + "grad_norm": 2.827204942703247, + "learning_rate": 1.1003262642740621e-05, + "loss": 0.2197, + "num_input_tokens_seen": 2928448, + "step": 1350 + }, + { + "epoch": 0.22104404567699837, + "grad_norm": 0.9208163022994995, + "learning_rate": 1.1044045676998369e-05, + "loss": 0.542, + "num_input_tokens_seen": 2938336, + "step": 1355 + }, + { + "epoch": 0.22185970636215335, + "grad_norm": 3.4270288944244385, + "learning_rate": 1.1084828711256118e-05, + "loss": 0.1137, + "num_input_tokens_seen": 2950496, + "step": 1360 + }, + { + "epoch": 0.2226753670473083, + "grad_norm": 4.8768391609191895, + "learning_rate": 1.1125611745513867e-05, + "loss": 0.4521, + "num_input_tokens_seen": 2961024, + "step": 1365 + }, + { + "epoch": 0.2234910277324633, + "grad_norm": 2.3663644790649414, + "learning_rate": 1.1166394779771617e-05, + "loss": 0.1375, + "num_input_tokens_seen": 2971744, + "step": 1370 + }, + { + "epoch": 0.22430668841761828, + "grad_norm": 2.867866039276123, + "learning_rate": 1.1207177814029364e-05, + "loss": 0.2323, + "num_input_tokens_seen": 2984192, + "step": 1375 + }, + { + "epoch": 0.22512234910277323, + "grad_norm": 2.736905336380005, + "learning_rate": 1.1247960848287114e-05, + "loss": 0.3018, + "num_input_tokens_seen": 2994816, + "step": 1380 + }, + { + "epoch": 0.22593800978792822, + "grad_norm": 0.12237265706062317, + "learning_rate": 1.1288743882544863e-05, + "loss": 0.2125, + "num_input_tokens_seen": 3005568, + "step": 1385 + }, + { + "epoch": 0.2267536704730832, + "grad_norm": 0.3546059727668762, + "learning_rate": 1.132952691680261e-05, + "loss": 0.4865, + "num_input_tokens_seen": 3015680, + "step": 1390 + }, + { + "epoch": 0.2275693311582382, + "grad_norm": 1.0771702527999878, + "learning_rate": 1.137030995106036e-05, + "loss": 0.1847, + "num_input_tokens_seen": 3026752, + "step": 1395 + }, + { + "epoch": 0.22838499184339314, + "grad_norm": 0.05117671936750412, + "learning_rate": 1.141109298531811e-05, + "loss": 0.2351, + "num_input_tokens_seen": 3036704, + "step": 1400 + }, + { + "epoch": 0.22920065252854813, + "grad_norm": 2.687425136566162, + "learning_rate": 1.1451876019575857e-05, + "loss": 0.4731, + "num_input_tokens_seen": 3048960, + "step": 1405 + }, + { + "epoch": 0.2300163132137031, + "grad_norm": 3.091639280319214, + "learning_rate": 1.1492659053833606e-05, + "loss": 0.5082, + "num_input_tokens_seen": 3059456, + "step": 1410 + }, + { + "epoch": 0.23083197389885807, + "grad_norm": 0.7175241708755493, + "learning_rate": 1.1533442088091356e-05, + "loss": 0.2151, + "num_input_tokens_seen": 3070112, + "step": 1415 + }, + { + "epoch": 0.23164763458401305, + "grad_norm": 0.03852963447570801, + "learning_rate": 1.1574225122349103e-05, + "loss": 0.2183, + "num_input_tokens_seen": 3080224, + "step": 1420 + }, + { + "epoch": 0.23246329526916804, + "grad_norm": 2.1732800006866455, + "learning_rate": 1.1615008156606853e-05, + "loss": 0.5715, + "num_input_tokens_seen": 3090304, + "step": 1425 + }, + { + "epoch": 0.233278955954323, + "grad_norm": 3.8749501705169678, + "learning_rate": 1.1655791190864602e-05, + "loss": 0.3049, + "num_input_tokens_seen": 3099232, + "step": 1430 + }, + { + "epoch": 0.23409461663947798, + "grad_norm": 3.7227132320404053, + "learning_rate": 1.169657422512235e-05, + "loss": 0.5939, + "num_input_tokens_seen": 3109696, + "step": 1435 + }, + { + "epoch": 0.23491027732463296, + "grad_norm": 3.6950790882110596, + "learning_rate": 1.1737357259380099e-05, + "loss": 0.2211, + "num_input_tokens_seen": 3120160, + "step": 1440 + }, + { + "epoch": 0.23572593800978792, + "grad_norm": 1.2648481130599976, + "learning_rate": 1.1778140293637847e-05, + "loss": 0.1113, + "num_input_tokens_seen": 3130784, + "step": 1445 + }, + { + "epoch": 0.2365415986949429, + "grad_norm": 2.5034701824188232, + "learning_rate": 1.1818923327895596e-05, + "loss": 0.5047, + "num_input_tokens_seen": 3141536, + "step": 1450 + }, + { + "epoch": 0.23735725938009788, + "grad_norm": 0.06958629190921783, + "learning_rate": 1.1859706362153344e-05, + "loss": 0.3159, + "num_input_tokens_seen": 3152160, + "step": 1455 + }, + { + "epoch": 0.23817292006525284, + "grad_norm": 0.3680306673049927, + "learning_rate": 1.1900489396411093e-05, + "loss": 0.1365, + "num_input_tokens_seen": 3162560, + "step": 1460 + }, + { + "epoch": 0.23898858075040783, + "grad_norm": 0.9419664740562439, + "learning_rate": 1.1941272430668842e-05, + "loss": 0.2142, + "num_input_tokens_seen": 3175360, + "step": 1465 + }, + { + "epoch": 0.2398042414355628, + "grad_norm": 2.8168485164642334, + "learning_rate": 1.198205546492659e-05, + "loss": 0.1516, + "num_input_tokens_seen": 3187936, + "step": 1470 + }, + { + "epoch": 0.2406199021207178, + "grad_norm": 4.710270881652832, + "learning_rate": 1.202283849918434e-05, + "loss": 0.627, + "num_input_tokens_seen": 3199424, + "step": 1475 + }, + { + "epoch": 0.24143556280587275, + "grad_norm": 0.07906500995159149, + "learning_rate": 1.2063621533442089e-05, + "loss": 0.4768, + "num_input_tokens_seen": 3210272, + "step": 1480 + }, + { + "epoch": 0.24225122349102773, + "grad_norm": 2.2458198070526123, + "learning_rate": 1.2104404567699836e-05, + "loss": 0.1451, + "num_input_tokens_seen": 3219680, + "step": 1485 + }, + { + "epoch": 0.24306688417618272, + "grad_norm": 0.22720396518707275, + "learning_rate": 1.2145187601957586e-05, + "loss": 0.2042, + "num_input_tokens_seen": 3229376, + "step": 1490 + }, + { + "epoch": 0.24388254486133767, + "grad_norm": 0.9453988671302795, + "learning_rate": 1.2185970636215335e-05, + "loss": 0.2855, + "num_input_tokens_seen": 3240896, + "step": 1495 + }, + { + "epoch": 0.24469820554649266, + "grad_norm": 2.903646230697632, + "learning_rate": 1.2226753670473083e-05, + "loss": 0.5893, + "num_input_tokens_seen": 3251808, + "step": 1500 + }, + { + "epoch": 0.24551386623164764, + "grad_norm": 0.6642008423805237, + "learning_rate": 1.2267536704730832e-05, + "loss": 0.1373, + "num_input_tokens_seen": 3262688, + "step": 1505 + }, + { + "epoch": 0.2463295269168026, + "grad_norm": 2.305981159210205, + "learning_rate": 1.2308319738988581e-05, + "loss": 0.0784, + "num_input_tokens_seen": 3273344, + "step": 1510 + }, + { + "epoch": 0.24714518760195758, + "grad_norm": 3.1640007495880127, + "learning_rate": 1.234910277324633e-05, + "loss": 0.3315, + "num_input_tokens_seen": 3283520, + "step": 1515 + }, + { + "epoch": 0.24796084828711257, + "grad_norm": 0.06524429470300674, + "learning_rate": 1.2389885807504078e-05, + "loss": 0.3914, + "num_input_tokens_seen": 3294816, + "step": 1520 + }, + { + "epoch": 0.24877650897226752, + "grad_norm": 1.6377649307250977, + "learning_rate": 1.2430668841761828e-05, + "loss": 0.4014, + "num_input_tokens_seen": 3305152, + "step": 1525 + }, + { + "epoch": 0.2495921696574225, + "grad_norm": 0.03360249474644661, + "learning_rate": 1.2471451876019577e-05, + "loss": 0.2807, + "num_input_tokens_seen": 3315264, + "step": 1530 + }, + { + "epoch": 0.25040783034257746, + "grad_norm": 2.121892213821411, + "learning_rate": 1.2512234910277326e-05, + "loss": 0.3732, + "num_input_tokens_seen": 3326208, + "step": 1535 + }, + { + "epoch": 0.25122349102773245, + "grad_norm": 2.8405683040618896, + "learning_rate": 1.2553017944535072e-05, + "loss": 0.4747, + "num_input_tokens_seen": 3337472, + "step": 1540 + }, + { + "epoch": 0.25203915171288743, + "grad_norm": 0.6029368042945862, + "learning_rate": 1.2593800978792822e-05, + "loss": 0.2285, + "num_input_tokens_seen": 3348576, + "step": 1545 + }, + { + "epoch": 0.2528548123980424, + "grad_norm": 4.591424465179443, + "learning_rate": 1.2634584013050571e-05, + "loss": 0.4104, + "num_input_tokens_seen": 3359200, + "step": 1550 + }, + { + "epoch": 0.2536704730831974, + "grad_norm": 0.08392064273357391, + "learning_rate": 1.267536704730832e-05, + "loss": 0.0638, + "num_input_tokens_seen": 3369696, + "step": 1555 + }, + { + "epoch": 0.2544861337683524, + "grad_norm": 2.326383113861084, + "learning_rate": 1.271615008156607e-05, + "loss": 0.3437, + "num_input_tokens_seen": 3381088, + "step": 1560 + }, + { + "epoch": 0.2553017944535073, + "grad_norm": 3.015758514404297, + "learning_rate": 1.2756933115823819e-05, + "loss": 0.2132, + "num_input_tokens_seen": 3392704, + "step": 1565 + }, + { + "epoch": 0.2561174551386623, + "grad_norm": 0.19748492538928986, + "learning_rate": 1.2797716150081568e-05, + "loss": 0.5774, + "num_input_tokens_seen": 3403136, + "step": 1570 + }, + { + "epoch": 0.2569331158238173, + "grad_norm": 0.028329474851489067, + "learning_rate": 1.2838499184339314e-05, + "loss": 0.1412, + "num_input_tokens_seen": 3411968, + "step": 1575 + }, + { + "epoch": 0.25774877650897227, + "grad_norm": 0.06005063280463219, + "learning_rate": 1.2879282218597064e-05, + "loss": 0.1646, + "num_input_tokens_seen": 3421216, + "step": 1580 + }, + { + "epoch": 0.25856443719412725, + "grad_norm": 2.730849504470825, + "learning_rate": 1.2920065252854813e-05, + "loss": 0.4178, + "num_input_tokens_seen": 3433120, + "step": 1585 + }, + { + "epoch": 0.25938009787928223, + "grad_norm": 2.926846981048584, + "learning_rate": 1.2960848287112562e-05, + "loss": 0.2884, + "num_input_tokens_seen": 3444960, + "step": 1590 + }, + { + "epoch": 0.2601957585644372, + "grad_norm": 0.7243711352348328, + "learning_rate": 1.3001631321370312e-05, + "loss": 0.0888, + "num_input_tokens_seen": 3456512, + "step": 1595 + }, + { + "epoch": 0.26101141924959215, + "grad_norm": 2.007214069366455, + "learning_rate": 1.3042414355628061e-05, + "loss": 0.1463, + "num_input_tokens_seen": 3466880, + "step": 1600 + }, + { + "epoch": 0.26182707993474713, + "grad_norm": 0.31235232949256897, + "learning_rate": 1.3083197389885807e-05, + "loss": 0.1514, + "num_input_tokens_seen": 3478528, + "step": 1605 + }, + { + "epoch": 0.2626427406199021, + "grad_norm": 0.21175625920295715, + "learning_rate": 1.3123980424143556e-05, + "loss": 0.2653, + "num_input_tokens_seen": 3490240, + "step": 1610 + }, + { + "epoch": 0.2634584013050571, + "grad_norm": 2.254345178604126, + "learning_rate": 1.3164763458401306e-05, + "loss": 0.3273, + "num_input_tokens_seen": 3501152, + "step": 1615 + }, + { + "epoch": 0.2642740619902121, + "grad_norm": 0.034375693649053574, + "learning_rate": 1.3205546492659055e-05, + "loss": 0.3663, + "num_input_tokens_seen": 3511360, + "step": 1620 + }, + { + "epoch": 0.26508972267536707, + "grad_norm": 2.29891037940979, + "learning_rate": 1.3246329526916804e-05, + "loss": 0.1089, + "num_input_tokens_seen": 3522880, + "step": 1625 + }, + { + "epoch": 0.265905383360522, + "grad_norm": 0.027485765516757965, + "learning_rate": 1.3287112561174554e-05, + "loss": 0.1571, + "num_input_tokens_seen": 3534720, + "step": 1630 + }, + { + "epoch": 0.266721044045677, + "grad_norm": 0.606846809387207, + "learning_rate": 1.3327895595432303e-05, + "loss": 0.1151, + "num_input_tokens_seen": 3544128, + "step": 1635 + }, + { + "epoch": 0.26753670473083196, + "grad_norm": 2.736981153488159, + "learning_rate": 1.3368678629690049e-05, + "loss": 0.2174, + "num_input_tokens_seen": 3555552, + "step": 1640 + }, + { + "epoch": 0.26835236541598695, + "grad_norm": 0.1346738338470459, + "learning_rate": 1.3409461663947798e-05, + "loss": 0.1377, + "num_input_tokens_seen": 3565824, + "step": 1645 + }, + { + "epoch": 0.26916802610114193, + "grad_norm": 3.4124577045440674, + "learning_rate": 1.3450244698205548e-05, + "loss": 0.1383, + "num_input_tokens_seen": 3576864, + "step": 1650 + }, + { + "epoch": 0.2699836867862969, + "grad_norm": 4.092293739318848, + "learning_rate": 1.3491027732463297e-05, + "loss": 0.1128, + "num_input_tokens_seen": 3588032, + "step": 1655 + }, + { + "epoch": 0.2707993474714519, + "grad_norm": 3.4838578701019287, + "learning_rate": 1.3531810766721044e-05, + "loss": 0.1371, + "num_input_tokens_seen": 3599488, + "step": 1660 + }, + { + "epoch": 0.27161500815660683, + "grad_norm": 0.26558977365493774, + "learning_rate": 1.3572593800978794e-05, + "loss": 0.0373, + "num_input_tokens_seen": 3610368, + "step": 1665 + }, + { + "epoch": 0.2724306688417618, + "grad_norm": 3.7105157375335693, + "learning_rate": 1.3613376835236541e-05, + "loss": 0.3408, + "num_input_tokens_seen": 3620128, + "step": 1670 + }, + { + "epoch": 0.2732463295269168, + "grad_norm": 1.4476635456085205, + "learning_rate": 1.365415986949429e-05, + "loss": 0.2187, + "num_input_tokens_seen": 3631872, + "step": 1675 + }, + { + "epoch": 0.2740619902120718, + "grad_norm": 5.08590841293335, + "learning_rate": 1.369494290375204e-05, + "loss": 0.5114, + "num_input_tokens_seen": 3643296, + "step": 1680 + }, + { + "epoch": 0.27487765089722677, + "grad_norm": 1.5059062242507935, + "learning_rate": 1.3735725938009788e-05, + "loss": 0.3789, + "num_input_tokens_seen": 3654592, + "step": 1685 + }, + { + "epoch": 0.27569331158238175, + "grad_norm": 4.924588203430176, + "learning_rate": 1.3776508972267537e-05, + "loss": 0.7009, + "num_input_tokens_seen": 3665312, + "step": 1690 + }, + { + "epoch": 0.2765089722675367, + "grad_norm": 2.3354132175445557, + "learning_rate": 1.3817292006525286e-05, + "loss": 0.18, + "num_input_tokens_seen": 3677024, + "step": 1695 + }, + { + "epoch": 0.27732463295269166, + "grad_norm": 1.0145519971847534, + "learning_rate": 1.3858075040783036e-05, + "loss": 0.4344, + "num_input_tokens_seen": 3687904, + "step": 1700 + }, + { + "epoch": 0.27814029363784665, + "grad_norm": 1.554020881652832, + "learning_rate": 1.3898858075040783e-05, + "loss": 0.4536, + "num_input_tokens_seen": 3698848, + "step": 1705 + }, + { + "epoch": 0.27895595432300163, + "grad_norm": 3.5304179191589355, + "learning_rate": 1.3939641109298531e-05, + "loss": 0.2728, + "num_input_tokens_seen": 3709856, + "step": 1710 + }, + { + "epoch": 0.2797716150081566, + "grad_norm": 0.2529675364494324, + "learning_rate": 1.398042414355628e-05, + "loss": 0.2319, + "num_input_tokens_seen": 3720736, + "step": 1715 + }, + { + "epoch": 0.2805872756933116, + "grad_norm": 1.7375285625457764, + "learning_rate": 1.402120717781403e-05, + "loss": 0.3889, + "num_input_tokens_seen": 3731264, + "step": 1720 + }, + { + "epoch": 0.2814029363784666, + "grad_norm": 3.5078318119049072, + "learning_rate": 1.4061990212071779e-05, + "loss": 0.3773, + "num_input_tokens_seen": 3743072, + "step": 1725 + }, + { + "epoch": 0.2822185970636215, + "grad_norm": 1.6043356657028198, + "learning_rate": 1.4102773246329528e-05, + "loss": 0.3117, + "num_input_tokens_seen": 3754656, + "step": 1730 + }, + { + "epoch": 0.2830342577487765, + "grad_norm": 2.2843124866485596, + "learning_rate": 1.4143556280587274e-05, + "loss": 0.2169, + "num_input_tokens_seen": 3765248, + "step": 1735 + }, + { + "epoch": 0.2838499184339315, + "grad_norm": 2.9326183795928955, + "learning_rate": 1.4184339314845024e-05, + "loss": 0.5629, + "num_input_tokens_seen": 3774464, + "step": 1740 + }, + { + "epoch": 0.28466557911908646, + "grad_norm": 4.325689792633057, + "learning_rate": 1.4225122349102773e-05, + "loss": 0.4558, + "num_input_tokens_seen": 3784608, + "step": 1745 + }, + { + "epoch": 0.28548123980424145, + "grad_norm": 1.4701961278915405, + "learning_rate": 1.4265905383360522e-05, + "loss": 0.2626, + "num_input_tokens_seen": 3796672, + "step": 1750 + }, + { + "epoch": 0.28629690048939643, + "grad_norm": 1.277824878692627, + "learning_rate": 1.4306688417618272e-05, + "loss": 0.1767, + "num_input_tokens_seen": 3807712, + "step": 1755 + }, + { + "epoch": 0.28711256117455136, + "grad_norm": 2.2586658000946045, + "learning_rate": 1.4347471451876021e-05, + "loss": 0.1704, + "num_input_tokens_seen": 3817984, + "step": 1760 + }, + { + "epoch": 0.28792822185970635, + "grad_norm": 1.4830759763717651, + "learning_rate": 1.438825448613377e-05, + "loss": 0.0795, + "num_input_tokens_seen": 3829152, + "step": 1765 + }, + { + "epoch": 0.28874388254486133, + "grad_norm": 3.7856686115264893, + "learning_rate": 1.4429037520391516e-05, + "loss": 0.5235, + "num_input_tokens_seen": 3840352, + "step": 1770 + }, + { + "epoch": 0.2895595432300163, + "grad_norm": 0.06657853722572327, + "learning_rate": 1.4469820554649266e-05, + "loss": 0.4019, + "num_input_tokens_seen": 3851456, + "step": 1775 + }, + { + "epoch": 0.2903752039151713, + "grad_norm": 0.5419802665710449, + "learning_rate": 1.4510603588907015e-05, + "loss": 0.0283, + "num_input_tokens_seen": 3863040, + "step": 1780 + }, + { + "epoch": 0.2911908646003263, + "grad_norm": 1.3362468481063843, + "learning_rate": 1.4551386623164764e-05, + "loss": 0.0885, + "num_input_tokens_seen": 3874176, + "step": 1785 + }, + { + "epoch": 0.29200652528548127, + "grad_norm": 0.02550133503973484, + "learning_rate": 1.4592169657422514e-05, + "loss": 0.3812, + "num_input_tokens_seen": 3884736, + "step": 1790 + }, + { + "epoch": 0.2928221859706362, + "grad_norm": 2.9662318229675293, + "learning_rate": 1.4632952691680263e-05, + "loss": 0.2143, + "num_input_tokens_seen": 3895584, + "step": 1795 + }, + { + "epoch": 0.2936378466557912, + "grad_norm": 0.13017387688159943, + "learning_rate": 1.4673735725938009e-05, + "loss": 0.2513, + "num_input_tokens_seen": 3907584, + "step": 1800 + }, + { + "epoch": 0.29445350734094616, + "grad_norm": 1.6621451377868652, + "learning_rate": 1.4714518760195758e-05, + "loss": 0.2098, + "num_input_tokens_seen": 3919424, + "step": 1805 + }, + { + "epoch": 0.29526916802610115, + "grad_norm": 2.7320353984832764, + "learning_rate": 1.4755301794453508e-05, + "loss": 0.1553, + "num_input_tokens_seen": 3929152, + "step": 1810 + }, + { + "epoch": 0.29608482871125613, + "grad_norm": 2.471790075302124, + "learning_rate": 1.4796084828711257e-05, + "loss": 0.2417, + "num_input_tokens_seen": 3940928, + "step": 1815 + }, + { + "epoch": 0.2969004893964111, + "grad_norm": 1.8731759786605835, + "learning_rate": 1.4836867862969006e-05, + "loss": 0.1361, + "num_input_tokens_seen": 3949568, + "step": 1820 + }, + { + "epoch": 0.29771615008156604, + "grad_norm": 0.9011683464050293, + "learning_rate": 1.4877650897226756e-05, + "loss": 0.1557, + "num_input_tokens_seen": 3961088, + "step": 1825 + }, + { + "epoch": 0.29853181076672103, + "grad_norm": 0.10952156782150269, + "learning_rate": 1.4918433931484505e-05, + "loss": 0.4044, + "num_input_tokens_seen": 3973088, + "step": 1830 + }, + { + "epoch": 0.299347471451876, + "grad_norm": 0.0665321946144104, + "learning_rate": 1.4959216965742251e-05, + "loss": 0.1858, + "num_input_tokens_seen": 3982976, + "step": 1835 + }, + { + "epoch": 0.300163132137031, + "grad_norm": 2.553307056427002, + "learning_rate": 1.5e-05, + "loss": 0.3501, + "num_input_tokens_seen": 3993440, + "step": 1840 + }, + { + "epoch": 0.300978792822186, + "grad_norm": 0.1930677741765976, + "learning_rate": 1.504078303425775e-05, + "loss": 0.074, + "num_input_tokens_seen": 4002720, + "step": 1845 + }, + { + "epoch": 0.30179445350734097, + "grad_norm": 1.4660422801971436, + "learning_rate": 1.5081566068515499e-05, + "loss": 0.5167, + "num_input_tokens_seen": 4012896, + "step": 1850 + }, + { + "epoch": 0.30261011419249595, + "grad_norm": 0.9541784524917603, + "learning_rate": 1.5122349102773248e-05, + "loss": 0.1193, + "num_input_tokens_seen": 4023680, + "step": 1855 + }, + { + "epoch": 0.3034257748776509, + "grad_norm": 0.206839457154274, + "learning_rate": 1.5163132137030998e-05, + "loss": 0.0706, + "num_input_tokens_seen": 4033760, + "step": 1860 + }, + { + "epoch": 0.30424143556280586, + "grad_norm": 2.893082618713379, + "learning_rate": 1.5203915171288744e-05, + "loss": 0.2395, + "num_input_tokens_seen": 4044192, + "step": 1865 + }, + { + "epoch": 0.30505709624796085, + "grad_norm": 1.9797323942184448, + "learning_rate": 1.5244698205546493e-05, + "loss": 0.2518, + "num_input_tokens_seen": 4055520, + "step": 1870 + }, + { + "epoch": 0.30587275693311583, + "grad_norm": 0.22603654861450195, + "learning_rate": 1.5285481239804242e-05, + "loss": 0.146, + "num_input_tokens_seen": 4067264, + "step": 1875 + }, + { + "epoch": 0.3066884176182708, + "grad_norm": 2.636371374130249, + "learning_rate": 1.532626427406199e-05, + "loss": 0.3543, + "num_input_tokens_seen": 4078016, + "step": 1880 + }, + { + "epoch": 0.3075040783034258, + "grad_norm": 0.04059653729200363, + "learning_rate": 1.536704730831974e-05, + "loss": 0.2304, + "num_input_tokens_seen": 4088352, + "step": 1885 + }, + { + "epoch": 0.3083197389885807, + "grad_norm": 3.9373421669006348, + "learning_rate": 1.540783034257749e-05, + "loss": 0.2363, + "num_input_tokens_seen": 4100416, + "step": 1890 + }, + { + "epoch": 0.3091353996737357, + "grad_norm": 2.2121036052703857, + "learning_rate": 1.5448613376835236e-05, + "loss": 0.3215, + "num_input_tokens_seen": 4110080, + "step": 1895 + }, + { + "epoch": 0.3099510603588907, + "grad_norm": 2.7927119731903076, + "learning_rate": 1.5489396411092984e-05, + "loss": 0.5094, + "num_input_tokens_seen": 4120736, + "step": 1900 + }, + { + "epoch": 0.3107667210440457, + "grad_norm": 0.2541435658931732, + "learning_rate": 1.5530179445350735e-05, + "loss": 0.1742, + "num_input_tokens_seen": 4132736, + "step": 1905 + }, + { + "epoch": 0.31158238172920066, + "grad_norm": 4.164838790893555, + "learning_rate": 1.5570962479608483e-05, + "loss": 0.3136, + "num_input_tokens_seen": 4143808, + "step": 1910 + }, + { + "epoch": 0.31239804241435565, + "grad_norm": 2.5148730278015137, + "learning_rate": 1.5611745513866234e-05, + "loss": 0.2455, + "num_input_tokens_seen": 4154976, + "step": 1915 + }, + { + "epoch": 0.3132137030995106, + "grad_norm": 1.5994421243667603, + "learning_rate": 1.565252854812398e-05, + "loss": 0.3177, + "num_input_tokens_seen": 4165504, + "step": 1920 + }, + { + "epoch": 0.31402936378466556, + "grad_norm": 4.83409309387207, + "learning_rate": 1.5693311582381732e-05, + "loss": 0.3132, + "num_input_tokens_seen": 4177184, + "step": 1925 + }, + { + "epoch": 0.31484502446982054, + "grad_norm": 2.7300570011138916, + "learning_rate": 1.5734094616639477e-05, + "loss": 0.2454, + "num_input_tokens_seen": 4189088, + "step": 1930 + }, + { + "epoch": 0.31566068515497553, + "grad_norm": 0.14544816315174103, + "learning_rate": 1.5774877650897228e-05, + "loss": 0.0791, + "num_input_tokens_seen": 4200064, + "step": 1935 + }, + { + "epoch": 0.3164763458401305, + "grad_norm": 0.3472960889339447, + "learning_rate": 1.5815660685154975e-05, + "loss": 0.2672, + "num_input_tokens_seen": 4211936, + "step": 1940 + }, + { + "epoch": 0.3172920065252855, + "grad_norm": 1.0601612329483032, + "learning_rate": 1.5856443719412726e-05, + "loss": 0.1873, + "num_input_tokens_seen": 4221952, + "step": 1945 + }, + { + "epoch": 0.3181076672104405, + "grad_norm": 0.15293537080287933, + "learning_rate": 1.5897226753670474e-05, + "loss": 0.4064, + "num_input_tokens_seen": 4233024, + "step": 1950 + }, + { + "epoch": 0.3189233278955954, + "grad_norm": 1.2910592555999756, + "learning_rate": 1.5938009787928225e-05, + "loss": 0.1133, + "num_input_tokens_seen": 4244480, + "step": 1955 + }, + { + "epoch": 0.3197389885807504, + "grad_norm": 0.03099215403199196, + "learning_rate": 1.597879282218597e-05, + "loss": 0.2308, + "num_input_tokens_seen": 4255904, + "step": 1960 + }, + { + "epoch": 0.3205546492659054, + "grad_norm": 2.3486950397491455, + "learning_rate": 1.601957585644372e-05, + "loss": 0.5981, + "num_input_tokens_seen": 4265792, + "step": 1965 + }, + { + "epoch": 0.32137030995106036, + "grad_norm": 0.0227492805570364, + "learning_rate": 1.6060358890701468e-05, + "loss": 0.1188, + "num_input_tokens_seen": 4276160, + "step": 1970 + }, + { + "epoch": 0.32218597063621535, + "grad_norm": 2.6325185298919678, + "learning_rate": 1.610114192495922e-05, + "loss": 0.3278, + "num_input_tokens_seen": 4286624, + "step": 1975 + }, + { + "epoch": 0.32300163132137033, + "grad_norm": 1.837198257446289, + "learning_rate": 1.6141924959216967e-05, + "loss": 0.1318, + "num_input_tokens_seen": 4297600, + "step": 1980 + }, + { + "epoch": 0.32381729200652526, + "grad_norm": 3.0125441551208496, + "learning_rate": 1.6182707993474718e-05, + "loss": 0.2808, + "num_input_tokens_seen": 4308352, + "step": 1985 + }, + { + "epoch": 0.32463295269168024, + "grad_norm": 3.2271201610565186, + "learning_rate": 1.6223491027732465e-05, + "loss": 0.3265, + "num_input_tokens_seen": 4319360, + "step": 1990 + }, + { + "epoch": 0.3254486133768352, + "grad_norm": 1.7345423698425293, + "learning_rate": 1.6264274061990213e-05, + "loss": 0.0784, + "num_input_tokens_seen": 4331552, + "step": 1995 + }, + { + "epoch": 0.3262642740619902, + "grad_norm": 0.14787054061889648, + "learning_rate": 1.630505709624796e-05, + "loss": 0.3881, + "num_input_tokens_seen": 4343744, + "step": 2000 + }, + { + "epoch": 0.3270799347471452, + "grad_norm": 0.034362901002168655, + "learning_rate": 1.634584013050571e-05, + "loss": 0.2557, + "num_input_tokens_seen": 4354304, + "step": 2005 + }, + { + "epoch": 0.3278955954323002, + "grad_norm": 2.122854471206665, + "learning_rate": 1.638662316476346e-05, + "loss": 0.1945, + "num_input_tokens_seen": 4365696, + "step": 2010 + }, + { + "epoch": 0.32871125611745516, + "grad_norm": 6.897362232208252, + "learning_rate": 1.6427406199021207e-05, + "loss": 0.4621, + "num_input_tokens_seen": 4376736, + "step": 2015 + }, + { + "epoch": 0.3295269168026101, + "grad_norm": 0.3468558192253113, + "learning_rate": 1.6468189233278958e-05, + "loss": 0.227, + "num_input_tokens_seen": 4388064, + "step": 2020 + }, + { + "epoch": 0.3303425774877651, + "grad_norm": 8.244028091430664, + "learning_rate": 1.6508972267536706e-05, + "loss": 0.2322, + "num_input_tokens_seen": 4398912, + "step": 2025 + }, + { + "epoch": 0.33115823817292006, + "grad_norm": 3.219264507293701, + "learning_rate": 1.6549755301794453e-05, + "loss": 0.253, + "num_input_tokens_seen": 4409856, + "step": 2030 + }, + { + "epoch": 0.33197389885807504, + "grad_norm": 1.5489206314086914, + "learning_rate": 1.6590538336052204e-05, + "loss": 0.1869, + "num_input_tokens_seen": 4419552, + "step": 2035 + }, + { + "epoch": 0.33278955954323003, + "grad_norm": 0.2392227053642273, + "learning_rate": 1.6631321370309952e-05, + "loss": 0.1862, + "num_input_tokens_seen": 4430080, + "step": 2040 + }, + { + "epoch": 0.333605220228385, + "grad_norm": 2.250466823577881, + "learning_rate": 1.66721044045677e-05, + "loss": 0.2848, + "num_input_tokens_seen": 4441568, + "step": 2045 + }, + { + "epoch": 0.33442088091353994, + "grad_norm": 3.4350407123565674, + "learning_rate": 1.671288743882545e-05, + "loss": 0.3952, + "num_input_tokens_seen": 4451200, + "step": 2050 + }, + { + "epoch": 0.3352365415986949, + "grad_norm": 1.0180439949035645, + "learning_rate": 1.6753670473083198e-05, + "loss": 0.2798, + "num_input_tokens_seen": 4461888, + "step": 2055 + }, + { + "epoch": 0.3360522022838499, + "grad_norm": 1.7880240678787231, + "learning_rate": 1.6794453507340946e-05, + "loss": 0.1433, + "num_input_tokens_seen": 4471392, + "step": 2060 + }, + { + "epoch": 0.3368678629690049, + "grad_norm": 1.1054818630218506, + "learning_rate": 1.6835236541598694e-05, + "loss": 0.1511, + "num_input_tokens_seen": 4482656, + "step": 2065 + }, + { + "epoch": 0.3376835236541599, + "grad_norm": 2.177093267440796, + "learning_rate": 1.6876019575856445e-05, + "loss": 0.1576, + "num_input_tokens_seen": 4494336, + "step": 2070 + }, + { + "epoch": 0.33849918433931486, + "grad_norm": 2.8875629901885986, + "learning_rate": 1.6916802610114192e-05, + "loss": 0.3951, + "num_input_tokens_seen": 4505760, + "step": 2075 + }, + { + "epoch": 0.33931484502446985, + "grad_norm": 2.9531240463256836, + "learning_rate": 1.6957585644371943e-05, + "loss": 0.284, + "num_input_tokens_seen": 4514976, + "step": 2080 + }, + { + "epoch": 0.3401305057096248, + "grad_norm": 0.029251810163259506, + "learning_rate": 1.699836867862969e-05, + "loss": 0.1051, + "num_input_tokens_seen": 4524928, + "step": 2085 + }, + { + "epoch": 0.34094616639477976, + "grad_norm": 2.049252510070801, + "learning_rate": 1.703915171288744e-05, + "loss": 0.2748, + "num_input_tokens_seen": 4536192, + "step": 2090 + }, + { + "epoch": 0.34176182707993474, + "grad_norm": 0.21637818217277527, + "learning_rate": 1.7079934747145186e-05, + "loss": 0.1316, + "num_input_tokens_seen": 4548160, + "step": 2095 + }, + { + "epoch": 0.3425774877650897, + "grad_norm": 1.3389133214950562, + "learning_rate": 1.7120717781402937e-05, + "loss": 0.2887, + "num_input_tokens_seen": 4558560, + "step": 2100 + }, + { + "epoch": 0.3433931484502447, + "grad_norm": 2.581313371658325, + "learning_rate": 1.7161500815660685e-05, + "loss": 0.3082, + "num_input_tokens_seen": 4570272, + "step": 2105 + }, + { + "epoch": 0.3442088091353997, + "grad_norm": 0.6610245704650879, + "learning_rate": 1.7202283849918436e-05, + "loss": 0.2744, + "num_input_tokens_seen": 4580608, + "step": 2110 + }, + { + "epoch": 0.3450244698205546, + "grad_norm": 1.2604070901870728, + "learning_rate": 1.7243066884176184e-05, + "loss": 0.1228, + "num_input_tokens_seen": 4591072, + "step": 2115 + }, + { + "epoch": 0.3458401305057096, + "grad_norm": 0.6086477637290955, + "learning_rate": 1.7283849918433935e-05, + "loss": 0.1535, + "num_input_tokens_seen": 4602368, + "step": 2120 + }, + { + "epoch": 0.3466557911908646, + "grad_norm": 2.3657615184783936, + "learning_rate": 1.732463295269168e-05, + "loss": 0.1893, + "num_input_tokens_seen": 4613600, + "step": 2125 + }, + { + "epoch": 0.3474714518760196, + "grad_norm": 0.03862304985523224, + "learning_rate": 1.736541598694943e-05, + "loss": 0.1583, + "num_input_tokens_seen": 4623872, + "step": 2130 + }, + { + "epoch": 0.34828711256117456, + "grad_norm": 2.298746347427368, + "learning_rate": 1.7406199021207178e-05, + "loss": 0.3444, + "num_input_tokens_seen": 4635136, + "step": 2135 + }, + { + "epoch": 0.34910277324632955, + "grad_norm": 0.142240971326828, + "learning_rate": 1.744698205546493e-05, + "loss": 0.0519, + "num_input_tokens_seen": 4645152, + "step": 2140 + }, + { + "epoch": 0.34991843393148453, + "grad_norm": 2.246483564376831, + "learning_rate": 1.7487765089722676e-05, + "loss": 0.2155, + "num_input_tokens_seen": 4654240, + "step": 2145 + }, + { + "epoch": 0.35073409461663946, + "grad_norm": 0.9099391102790833, + "learning_rate": 1.7528548123980427e-05, + "loss": 0.2148, + "num_input_tokens_seen": 4666240, + "step": 2150 + }, + { + "epoch": 0.35154975530179444, + "grad_norm": 1.1253048181533813, + "learning_rate": 1.756933115823817e-05, + "loss": 0.1624, + "num_input_tokens_seen": 4678528, + "step": 2155 + }, + { + "epoch": 0.3523654159869494, + "grad_norm": 1.7053184509277344, + "learning_rate": 1.7610114192495923e-05, + "loss": 0.2618, + "num_input_tokens_seen": 4688768, + "step": 2160 + }, + { + "epoch": 0.3531810766721044, + "grad_norm": 1.7153773307800293, + "learning_rate": 1.765089722675367e-05, + "loss": 0.1411, + "num_input_tokens_seen": 4698496, + "step": 2165 + }, + { + "epoch": 0.3539967373572594, + "grad_norm": 0.8576794266700745, + "learning_rate": 1.769168026101142e-05, + "loss": 0.0939, + "num_input_tokens_seen": 4710208, + "step": 2170 + }, + { + "epoch": 0.3548123980424144, + "grad_norm": 1.1655031442642212, + "learning_rate": 1.773246329526917e-05, + "loss": 0.2645, + "num_input_tokens_seen": 4720480, + "step": 2175 + }, + { + "epoch": 0.3556280587275693, + "grad_norm": 0.38470765948295593, + "learning_rate": 1.777324632952692e-05, + "loss": 0.4119, + "num_input_tokens_seen": 4730560, + "step": 2180 + }, + { + "epoch": 0.3564437194127243, + "grad_norm": 1.034359335899353, + "learning_rate": 1.7814029363784668e-05, + "loss": 0.0481, + "num_input_tokens_seen": 4740768, + "step": 2185 + }, + { + "epoch": 0.3572593800978793, + "grad_norm": 0.9203453063964844, + "learning_rate": 1.7854812398042415e-05, + "loss": 0.5743, + "num_input_tokens_seen": 4751040, + "step": 2190 + }, + { + "epoch": 0.35807504078303426, + "grad_norm": 1.765923023223877, + "learning_rate": 1.7895595432300163e-05, + "loss": 0.1223, + "num_input_tokens_seen": 4762016, + "step": 2195 + }, + { + "epoch": 0.35889070146818924, + "grad_norm": 1.7131823301315308, + "learning_rate": 1.7936378466557914e-05, + "loss": 0.1777, + "num_input_tokens_seen": 4771648, + "step": 2200 + }, + { + "epoch": 0.35970636215334423, + "grad_norm": 2.4293699264526367, + "learning_rate": 1.797716150081566e-05, + "loss": 0.2052, + "num_input_tokens_seen": 4781440, + "step": 2205 + }, + { + "epoch": 0.3605220228384992, + "grad_norm": 1.0557916164398193, + "learning_rate": 1.8017944535073413e-05, + "loss": 0.1978, + "num_input_tokens_seen": 4793312, + "step": 2210 + }, + { + "epoch": 0.36133768352365414, + "grad_norm": 2.4802889823913574, + "learning_rate": 1.805872756933116e-05, + "loss": 0.1264, + "num_input_tokens_seen": 4804288, + "step": 2215 + }, + { + "epoch": 0.3621533442088091, + "grad_norm": 4.415771007537842, + "learning_rate": 1.8099510603588908e-05, + "loss": 0.4321, + "num_input_tokens_seen": 4814784, + "step": 2220 + }, + { + "epoch": 0.3629690048939641, + "grad_norm": 0.17939838767051697, + "learning_rate": 1.8140293637846655e-05, + "loss": 0.4842, + "num_input_tokens_seen": 4825152, + "step": 2225 + }, + { + "epoch": 0.3637846655791191, + "grad_norm": 0.45780378580093384, + "learning_rate": 1.8181076672104407e-05, + "loss": 0.3107, + "num_input_tokens_seen": 4835776, + "step": 2230 + }, + { + "epoch": 0.3646003262642741, + "grad_norm": 0.2483983337879181, + "learning_rate": 1.8221859706362154e-05, + "loss": 0.1113, + "num_input_tokens_seen": 4846848, + "step": 2235 + }, + { + "epoch": 0.36541598694942906, + "grad_norm": 0.27436283230781555, + "learning_rate": 1.8262642740619905e-05, + "loss": 0.0192, + "num_input_tokens_seen": 4857664, + "step": 2240 + }, + { + "epoch": 0.366231647634584, + "grad_norm": 4.137628555297852, + "learning_rate": 1.8303425774877653e-05, + "loss": 0.3617, + "num_input_tokens_seen": 4868800, + "step": 2245 + }, + { + "epoch": 0.367047308319739, + "grad_norm": 0.1305709332227707, + "learning_rate": 1.83442088091354e-05, + "loss": 0.2852, + "num_input_tokens_seen": 4879776, + "step": 2250 + }, + { + "epoch": 0.36786296900489396, + "grad_norm": 0.24522808194160461, + "learning_rate": 1.8384991843393148e-05, + "loss": 0.2796, + "num_input_tokens_seen": 4890816, + "step": 2255 + }, + { + "epoch": 0.36867862969004894, + "grad_norm": 1.8962805271148682, + "learning_rate": 1.84257748776509e-05, + "loss": 0.257, + "num_input_tokens_seen": 4899136, + "step": 2260 + }, + { + "epoch": 0.3694942903752039, + "grad_norm": 0.062327928841114044, + "learning_rate": 1.8466557911908647e-05, + "loss": 0.111, + "num_input_tokens_seen": 4910112, + "step": 2265 + }, + { + "epoch": 0.3703099510603589, + "grad_norm": 2.150012731552124, + "learning_rate": 1.8507340946166394e-05, + "loss": 0.2978, + "num_input_tokens_seen": 4920032, + "step": 2270 + }, + { + "epoch": 0.37112561174551384, + "grad_norm": 2.0880093574523926, + "learning_rate": 1.8548123980424145e-05, + "loss": 0.2289, + "num_input_tokens_seen": 4930752, + "step": 2275 + }, + { + "epoch": 0.3719412724306688, + "grad_norm": 0.07187820971012115, + "learning_rate": 1.8588907014681893e-05, + "loss": 0.1885, + "num_input_tokens_seen": 4940960, + "step": 2280 + }, + { + "epoch": 0.3727569331158238, + "grad_norm": 1.5199735164642334, + "learning_rate": 1.862969004893964e-05, + "loss": 0.2155, + "num_input_tokens_seen": 4952128, + "step": 2285 + }, + { + "epoch": 0.3735725938009788, + "grad_norm": 0.8438881635665894, + "learning_rate": 1.867047308319739e-05, + "loss": 0.4123, + "num_input_tokens_seen": 4962848, + "step": 2290 + }, + { + "epoch": 0.3743882544861338, + "grad_norm": 1.3021920919418335, + "learning_rate": 1.871125611745514e-05, + "loss": 0.0762, + "num_input_tokens_seen": 4972448, + "step": 2295 + }, + { + "epoch": 0.37520391517128876, + "grad_norm": 0.41465264558792114, + "learning_rate": 1.8752039151712887e-05, + "loss": 0.1731, + "num_input_tokens_seen": 4982432, + "step": 2300 + }, + { + "epoch": 0.37601957585644374, + "grad_norm": 0.060559842735528946, + "learning_rate": 1.8792822185970638e-05, + "loss": 0.1006, + "num_input_tokens_seen": 4992800, + "step": 2305 + }, + { + "epoch": 0.3768352365415987, + "grad_norm": 0.752964973449707, + "learning_rate": 1.8833605220228386e-05, + "loss": 0.0503, + "num_input_tokens_seen": 5004352, + "step": 2310 + }, + { + "epoch": 0.37765089722675366, + "grad_norm": 0.1636490672826767, + "learning_rate": 1.8874388254486133e-05, + "loss": 0.1866, + "num_input_tokens_seen": 5015968, + "step": 2315 + }, + { + "epoch": 0.37846655791190864, + "grad_norm": 1.6338448524475098, + "learning_rate": 1.891517128874388e-05, + "loss": 0.0685, + "num_input_tokens_seen": 5026624, + "step": 2320 + }, + { + "epoch": 0.3792822185970636, + "grad_norm": 2.927778482437134, + "learning_rate": 1.8955954323001632e-05, + "loss": 0.1551, + "num_input_tokens_seen": 5037312, + "step": 2325 + }, + { + "epoch": 0.3800978792822186, + "grad_norm": 0.3640718460083008, + "learning_rate": 1.899673735725938e-05, + "loss": 0.1937, + "num_input_tokens_seen": 5048544, + "step": 2330 + }, + { + "epoch": 0.3809135399673736, + "grad_norm": 0.9198599457740784, + "learning_rate": 1.903752039151713e-05, + "loss": 0.178, + "num_input_tokens_seen": 5059584, + "step": 2335 + }, + { + "epoch": 0.3817292006525285, + "grad_norm": 1.4417325258255005, + "learning_rate": 1.907830342577488e-05, + "loss": 0.2625, + "num_input_tokens_seen": 5072736, + "step": 2340 + }, + { + "epoch": 0.3825448613376835, + "grad_norm": 0.7423872947692871, + "learning_rate": 1.911908646003263e-05, + "loss": 0.5037, + "num_input_tokens_seen": 5083456, + "step": 2345 + }, + { + "epoch": 0.3833605220228385, + "grad_norm": 1.0033279657363892, + "learning_rate": 1.9159869494290374e-05, + "loss": 0.1649, + "num_input_tokens_seen": 5094240, + "step": 2350 + }, + { + "epoch": 0.3841761827079935, + "grad_norm": 2.8702170848846436, + "learning_rate": 1.9200652528548125e-05, + "loss": 0.1889, + "num_input_tokens_seen": 5103744, + "step": 2355 + }, + { + "epoch": 0.38499184339314846, + "grad_norm": 1.840469241142273, + "learning_rate": 1.9241435562805872e-05, + "loss": 0.2305, + "num_input_tokens_seen": 5114048, + "step": 2360 + }, + { + "epoch": 0.38580750407830344, + "grad_norm": 2.1254985332489014, + "learning_rate": 1.9282218597063623e-05, + "loss": 0.3999, + "num_input_tokens_seen": 5123744, + "step": 2365 + }, + { + "epoch": 0.3866231647634584, + "grad_norm": 1.9873484373092651, + "learning_rate": 1.932300163132137e-05, + "loss": 0.2445, + "num_input_tokens_seen": 5133888, + "step": 2370 + }, + { + "epoch": 0.38743882544861336, + "grad_norm": 0.04943738505244255, + "learning_rate": 1.9363784665579122e-05, + "loss": 0.1918, + "num_input_tokens_seen": 5145024, + "step": 2375 + }, + { + "epoch": 0.38825448613376834, + "grad_norm": 0.7362540364265442, + "learning_rate": 1.9404567699836866e-05, + "loss": 0.0386, + "num_input_tokens_seen": 5156544, + "step": 2380 + }, + { + "epoch": 0.3890701468189233, + "grad_norm": 1.9308018684387207, + "learning_rate": 1.9445350734094617e-05, + "loss": 0.3522, + "num_input_tokens_seen": 5168192, + "step": 2385 + }, + { + "epoch": 0.3898858075040783, + "grad_norm": 0.8454206585884094, + "learning_rate": 1.9486133768352365e-05, + "loss": 0.2643, + "num_input_tokens_seen": 5178240, + "step": 2390 + }, + { + "epoch": 0.3907014681892333, + "grad_norm": 0.1646224409341812, + "learning_rate": 1.9526916802610116e-05, + "loss": 0.0951, + "num_input_tokens_seen": 5189248, + "step": 2395 + }, + { + "epoch": 0.3915171288743883, + "grad_norm": 0.9636086225509644, + "learning_rate": 1.9567699836867864e-05, + "loss": 0.1067, + "num_input_tokens_seen": 5199392, + "step": 2400 + }, + { + "epoch": 0.3923327895595432, + "grad_norm": 2.3385391235351562, + "learning_rate": 1.9608482871125615e-05, + "loss": 0.2225, + "num_input_tokens_seen": 5210240, + "step": 2405 + }, + { + "epoch": 0.3931484502446982, + "grad_norm": 3.836735486984253, + "learning_rate": 1.9649265905383362e-05, + "loss": 0.3199, + "num_input_tokens_seen": 5220000, + "step": 2410 + }, + { + "epoch": 0.3939641109298532, + "grad_norm": 1.7201495170593262, + "learning_rate": 1.969004893964111e-05, + "loss": 0.1009, + "num_input_tokens_seen": 5231520, + "step": 2415 + }, + { + "epoch": 0.39477977161500816, + "grad_norm": 2.4716289043426514, + "learning_rate": 1.9730831973898858e-05, + "loss": 0.2958, + "num_input_tokens_seen": 5243200, + "step": 2420 + }, + { + "epoch": 0.39559543230016314, + "grad_norm": 1.8361541032791138, + "learning_rate": 1.977161500815661e-05, + "loss": 0.1447, + "num_input_tokens_seen": 5253632, + "step": 2425 + }, + { + "epoch": 0.3964110929853181, + "grad_norm": 0.3441266119480133, + "learning_rate": 1.9812398042414356e-05, + "loss": 0.1608, + "num_input_tokens_seen": 5264736, + "step": 2430 + }, + { + "epoch": 0.3972267536704731, + "grad_norm": 0.6578155159950256, + "learning_rate": 1.9853181076672107e-05, + "loss": 0.2224, + "num_input_tokens_seen": 5276192, + "step": 2435 + }, + { + "epoch": 0.39804241435562804, + "grad_norm": 0.09727490693330765, + "learning_rate": 1.9893964110929855e-05, + "loss": 0.1859, + "num_input_tokens_seen": 5287296, + "step": 2440 + }, + { + "epoch": 0.398858075040783, + "grad_norm": 0.6685360074043274, + "learning_rate": 1.9934747145187603e-05, + "loss": 0.2449, + "num_input_tokens_seen": 5298336, + "step": 2445 + }, + { + "epoch": 0.399673735725938, + "grad_norm": 0.2587214410305023, + "learning_rate": 1.997553017944535e-05, + "loss": 0.1042, + "num_input_tokens_seen": 5309248, + "step": 2450 + }, + { + "epoch": 0.400489396411093, + "grad_norm": 0.3933566212654114, + "learning_rate": 2.00163132137031e-05, + "loss": 0.3934, + "num_input_tokens_seen": 5320384, + "step": 2455 + }, + { + "epoch": 0.401305057096248, + "grad_norm": 0.21555542945861816, + "learning_rate": 2.005709624796085e-05, + "loss": 0.0456, + "num_input_tokens_seen": 5331904, + "step": 2460 + }, + { + "epoch": 0.40212071778140296, + "grad_norm": 0.1750049740076065, + "learning_rate": 2.00978792822186e-05, + "loss": 0.2169, + "num_input_tokens_seen": 5343904, + "step": 2465 + }, + { + "epoch": 0.4029363784665579, + "grad_norm": 0.8155176639556885, + "learning_rate": 2.0138662316476348e-05, + "loss": 0.093, + "num_input_tokens_seen": 5354336, + "step": 2470 + }, + { + "epoch": 0.40375203915171287, + "grad_norm": 0.45638391375541687, + "learning_rate": 2.0179445350734095e-05, + "loss": 0.2681, + "num_input_tokens_seen": 5366144, + "step": 2475 + }, + { + "epoch": 0.40456769983686786, + "grad_norm": 2.086141347885132, + "learning_rate": 2.0220228384991843e-05, + "loss": 0.1274, + "num_input_tokens_seen": 5377344, + "step": 2480 + }, + { + "epoch": 0.40538336052202284, + "grad_norm": 1.5685988664627075, + "learning_rate": 2.0261011419249594e-05, + "loss": 0.0944, + "num_input_tokens_seen": 5386752, + "step": 2485 + }, + { + "epoch": 0.4061990212071778, + "grad_norm": 0.15431497991085052, + "learning_rate": 2.030179445350734e-05, + "loss": 0.0835, + "num_input_tokens_seen": 5396384, + "step": 2490 + }, + { + "epoch": 0.4070146818923328, + "grad_norm": 2.3109543323516846, + "learning_rate": 2.034257748776509e-05, + "loss": 0.2147, + "num_input_tokens_seen": 5408256, + "step": 2495 + }, + { + "epoch": 0.4078303425774878, + "grad_norm": 0.2294061928987503, + "learning_rate": 2.038336052202284e-05, + "loss": 0.2026, + "num_input_tokens_seen": 5418752, + "step": 2500 + }, + { + "epoch": 0.4086460032626427, + "grad_norm": 0.07159079611301422, + "learning_rate": 2.0424143556280588e-05, + "loss": 0.1354, + "num_input_tokens_seen": 5429792, + "step": 2505 + }, + { + "epoch": 0.4094616639477977, + "grad_norm": 2.351248025894165, + "learning_rate": 2.0464926590538336e-05, + "loss": 0.2966, + "num_input_tokens_seen": 5440608, + "step": 2510 + }, + { + "epoch": 0.4102773246329527, + "grad_norm": 1.2306334972381592, + "learning_rate": 2.0505709624796087e-05, + "loss": 0.3043, + "num_input_tokens_seen": 5450016, + "step": 2515 + }, + { + "epoch": 0.4110929853181077, + "grad_norm": 5.2972235679626465, + "learning_rate": 2.0546492659053834e-05, + "loss": 0.5052, + "num_input_tokens_seen": 5461120, + "step": 2520 + }, + { + "epoch": 0.41190864600326266, + "grad_norm": 2.6186749935150146, + "learning_rate": 2.0587275693311582e-05, + "loss": 0.0897, + "num_input_tokens_seen": 5471648, + "step": 2525 + }, + { + "epoch": 0.41272430668841764, + "grad_norm": 0.3563755750656128, + "learning_rate": 2.0628058727569333e-05, + "loss": 0.2818, + "num_input_tokens_seen": 5482656, + "step": 2530 + }, + { + "epoch": 0.41353996737357257, + "grad_norm": 1.6189361810684204, + "learning_rate": 2.066884176182708e-05, + "loss": 0.0645, + "num_input_tokens_seen": 5491168, + "step": 2535 + }, + { + "epoch": 0.41435562805872755, + "grad_norm": 0.7304040193557739, + "learning_rate": 2.070962479608483e-05, + "loss": 0.2919, + "num_input_tokens_seen": 5501792, + "step": 2540 + }, + { + "epoch": 0.41517128874388254, + "grad_norm": 1.8033251762390137, + "learning_rate": 2.0750407830342576e-05, + "loss": 0.32, + "num_input_tokens_seen": 5513152, + "step": 2545 + }, + { + "epoch": 0.4159869494290375, + "grad_norm": 0.4581279158592224, + "learning_rate": 2.0791190864600327e-05, + "loss": 0.0537, + "num_input_tokens_seen": 5524480, + "step": 2550 + }, + { + "epoch": 0.4168026101141925, + "grad_norm": 0.060318563133478165, + "learning_rate": 2.0831973898858075e-05, + "loss": 0.1207, + "num_input_tokens_seen": 5535008, + "step": 2555 + }, + { + "epoch": 0.4176182707993475, + "grad_norm": 0.5620632171630859, + "learning_rate": 2.0872756933115826e-05, + "loss": 0.1074, + "num_input_tokens_seen": 5546048, + "step": 2560 + }, + { + "epoch": 0.4184339314845024, + "grad_norm": 3.0411903858184814, + "learning_rate": 2.0913539967373573e-05, + "loss": 0.3709, + "num_input_tokens_seen": 5557760, + "step": 2565 + }, + { + "epoch": 0.4192495921696574, + "grad_norm": 0.382484495639801, + "learning_rate": 2.0954323001631324e-05, + "loss": 0.2466, + "num_input_tokens_seen": 5567488, + "step": 2570 + }, + { + "epoch": 0.4200652528548124, + "grad_norm": 1.7020468711853027, + "learning_rate": 2.099510603588907e-05, + "loss": 0.2884, + "num_input_tokens_seen": 5578592, + "step": 2575 + }, + { + "epoch": 0.42088091353996737, + "grad_norm": 1.1323612928390503, + "learning_rate": 2.103588907014682e-05, + "loss": 0.0619, + "num_input_tokens_seen": 5588576, + "step": 2580 + }, + { + "epoch": 0.42169657422512236, + "grad_norm": 0.7371749877929688, + "learning_rate": 2.1076672104404567e-05, + "loss": 0.1083, + "num_input_tokens_seen": 5599264, + "step": 2585 + }, + { + "epoch": 0.42251223491027734, + "grad_norm": 0.2735646367073059, + "learning_rate": 2.1117455138662318e-05, + "loss": 0.1351, + "num_input_tokens_seen": 5609504, + "step": 2590 + }, + { + "epoch": 0.4233278955954323, + "grad_norm": 4.150676727294922, + "learning_rate": 2.1158238172920066e-05, + "loss": 0.2879, + "num_input_tokens_seen": 5620256, + "step": 2595 + }, + { + "epoch": 0.42414355628058725, + "grad_norm": 2.109612226486206, + "learning_rate": 2.1199021207177817e-05, + "loss": 0.4743, + "num_input_tokens_seen": 5630592, + "step": 2600 + }, + { + "epoch": 0.42495921696574224, + "grad_norm": 2.8089146614074707, + "learning_rate": 2.123980424143556e-05, + "loss": 0.3867, + "num_input_tokens_seen": 5641536, + "step": 2605 + }, + { + "epoch": 0.4257748776508972, + "grad_norm": 0.268686980009079, + "learning_rate": 2.1280587275693312e-05, + "loss": 0.0419, + "num_input_tokens_seen": 5652416, + "step": 2610 + }, + { + "epoch": 0.4265905383360522, + "grad_norm": 1.8767969608306885, + "learning_rate": 2.132137030995106e-05, + "loss": 0.2235, + "num_input_tokens_seen": 5662528, + "step": 2615 + }, + { + "epoch": 0.4274061990212072, + "grad_norm": 1.4668070077896118, + "learning_rate": 2.136215334420881e-05, + "loss": 0.1764, + "num_input_tokens_seen": 5672288, + "step": 2620 + }, + { + "epoch": 0.4282218597063622, + "grad_norm": 0.6798267960548401, + "learning_rate": 2.140293637846656e-05, + "loss": 0.1158, + "num_input_tokens_seen": 5682976, + "step": 2625 + }, + { + "epoch": 0.4290375203915171, + "grad_norm": 0.4762680232524872, + "learning_rate": 2.144371941272431e-05, + "loss": 0.1031, + "num_input_tokens_seen": 5694240, + "step": 2630 + }, + { + "epoch": 0.4298531810766721, + "grad_norm": 0.16496841609477997, + "learning_rate": 2.1484502446982057e-05, + "loss": 0.138, + "num_input_tokens_seen": 5704192, + "step": 2635 + }, + { + "epoch": 0.43066884176182707, + "grad_norm": 0.2772187292575836, + "learning_rate": 2.1525285481239805e-05, + "loss": 0.0869, + "num_input_tokens_seen": 5715968, + "step": 2640 + }, + { + "epoch": 0.43148450244698205, + "grad_norm": 0.35814934968948364, + "learning_rate": 2.1566068515497553e-05, + "loss": 0.0846, + "num_input_tokens_seen": 5726784, + "step": 2645 + }, + { + "epoch": 0.43230016313213704, + "grad_norm": 2.368760824203491, + "learning_rate": 2.1606851549755304e-05, + "loss": 0.3894, + "num_input_tokens_seen": 5737952, + "step": 2650 + }, + { + "epoch": 0.433115823817292, + "grad_norm": 0.5970998406410217, + "learning_rate": 2.164763458401305e-05, + "loss": 0.1214, + "num_input_tokens_seen": 5748512, + "step": 2655 + }, + { + "epoch": 0.433931484502447, + "grad_norm": 0.20811328291893005, + "learning_rate": 2.1688417618270802e-05, + "loss": 0.1749, + "num_input_tokens_seen": 5758944, + "step": 2660 + }, + { + "epoch": 0.43474714518760194, + "grad_norm": 1.0056008100509644, + "learning_rate": 2.172920065252855e-05, + "loss": 0.2438, + "num_input_tokens_seen": 5769632, + "step": 2665 + }, + { + "epoch": 0.4355628058727569, + "grad_norm": 1.3665837049484253, + "learning_rate": 2.1769983686786298e-05, + "loss": 0.1153, + "num_input_tokens_seen": 5781248, + "step": 2670 + }, + { + "epoch": 0.4363784665579119, + "grad_norm": 0.5735511183738708, + "learning_rate": 2.1810766721044045e-05, + "loss": 0.2311, + "num_input_tokens_seen": 5792192, + "step": 2675 + }, + { + "epoch": 0.4371941272430669, + "grad_norm": 1.1685433387756348, + "learning_rate": 2.1851549755301796e-05, + "loss": 0.0838, + "num_input_tokens_seen": 5803264, + "step": 2680 + }, + { + "epoch": 0.43800978792822187, + "grad_norm": 0.046221062541007996, + "learning_rate": 2.1892332789559544e-05, + "loss": 0.0684, + "num_input_tokens_seen": 5815072, + "step": 2685 + }, + { + "epoch": 0.43882544861337686, + "grad_norm": 0.18644335865974426, + "learning_rate": 2.1933115823817295e-05, + "loss": 0.265, + "num_input_tokens_seen": 5825248, + "step": 2690 + }, + { + "epoch": 0.4396411092985318, + "grad_norm": 2.524604558944702, + "learning_rate": 2.1973898858075043e-05, + "loss": 0.2813, + "num_input_tokens_seen": 5835904, + "step": 2695 + }, + { + "epoch": 0.44045676998368677, + "grad_norm": 1.5085242986679077, + "learning_rate": 2.201468189233279e-05, + "loss": 0.1735, + "num_input_tokens_seen": 5844736, + "step": 2700 + }, + { + "epoch": 0.44127243066884175, + "grad_norm": 0.23170897364616394, + "learning_rate": 2.2055464926590538e-05, + "loss": 0.3299, + "num_input_tokens_seen": 5856672, + "step": 2705 + }, + { + "epoch": 0.44208809135399674, + "grad_norm": 0.09748858213424683, + "learning_rate": 2.209624796084829e-05, + "loss": 0.156, + "num_input_tokens_seen": 5866848, + "step": 2710 + }, + { + "epoch": 0.4429037520391517, + "grad_norm": 0.21378208696842194, + "learning_rate": 2.2137030995106037e-05, + "loss": 0.0494, + "num_input_tokens_seen": 5877856, + "step": 2715 + }, + { + "epoch": 0.4437194127243067, + "grad_norm": 0.13683302700519562, + "learning_rate": 2.2177814029363788e-05, + "loss": 0.1523, + "num_input_tokens_seen": 5888896, + "step": 2720 + }, + { + "epoch": 0.4445350734094617, + "grad_norm": 0.30655649304389954, + "learning_rate": 2.2218597063621535e-05, + "loss": 0.2155, + "num_input_tokens_seen": 5899872, + "step": 2725 + }, + { + "epoch": 0.4453507340946166, + "grad_norm": 2.0962772369384766, + "learning_rate": 2.2259380097879283e-05, + "loss": 0.128, + "num_input_tokens_seen": 5910272, + "step": 2730 + }, + { + "epoch": 0.4461663947797716, + "grad_norm": 1.9508531093597412, + "learning_rate": 2.230016313213703e-05, + "loss": 0.2445, + "num_input_tokens_seen": 5921312, + "step": 2735 + }, + { + "epoch": 0.4469820554649266, + "grad_norm": 0.789364218711853, + "learning_rate": 2.234094616639478e-05, + "loss": 0.1091, + "num_input_tokens_seen": 5932032, + "step": 2740 + }, + { + "epoch": 0.44779771615008157, + "grad_norm": 0.5286423563957214, + "learning_rate": 2.238172920065253e-05, + "loss": 0.1172, + "num_input_tokens_seen": 5942176, + "step": 2745 + }, + { + "epoch": 0.44861337683523655, + "grad_norm": 2.813351631164551, + "learning_rate": 2.2422512234910277e-05, + "loss": 0.4982, + "num_input_tokens_seen": 5954336, + "step": 2750 + }, + { + "epoch": 0.44942903752039154, + "grad_norm": 4.15123987197876, + "learning_rate": 2.2463295269168028e-05, + "loss": 0.3503, + "num_input_tokens_seen": 5965152, + "step": 2755 + }, + { + "epoch": 0.45024469820554647, + "grad_norm": 1.0213944911956787, + "learning_rate": 2.2504078303425776e-05, + "loss": 0.2533, + "num_input_tokens_seen": 5977216, + "step": 2760 + }, + { + "epoch": 0.45106035889070145, + "grad_norm": 0.5648565292358398, + "learning_rate": 2.2544861337683527e-05, + "loss": 0.1975, + "num_input_tokens_seen": 5988000, + "step": 2765 + }, + { + "epoch": 0.45187601957585644, + "grad_norm": 2.686412811279297, + "learning_rate": 2.258564437194127e-05, + "loss": 0.1684, + "num_input_tokens_seen": 5998400, + "step": 2770 + }, + { + "epoch": 0.4526916802610114, + "grad_norm": 1.376602292060852, + "learning_rate": 2.2626427406199022e-05, + "loss": 0.2157, + "num_input_tokens_seen": 6010176, + "step": 2775 + }, + { + "epoch": 0.4535073409461664, + "grad_norm": 1.753621220588684, + "learning_rate": 2.266721044045677e-05, + "loss": 0.1655, + "num_input_tokens_seen": 6021312, + "step": 2780 + }, + { + "epoch": 0.4543230016313214, + "grad_norm": 2.4146690368652344, + "learning_rate": 2.270799347471452e-05, + "loss": 0.1919, + "num_input_tokens_seen": 6033024, + "step": 2785 + }, + { + "epoch": 0.4551386623164764, + "grad_norm": 0.09019044041633606, + "learning_rate": 2.2748776508972268e-05, + "loss": 0.1043, + "num_input_tokens_seen": 6042432, + "step": 2790 + }, + { + "epoch": 0.4559543230016313, + "grad_norm": 1.6199525594711304, + "learning_rate": 2.278955954323002e-05, + "loss": 0.2129, + "num_input_tokens_seen": 6054304, + "step": 2795 + }, + { + "epoch": 0.4567699836867863, + "grad_norm": 1.1388384103775024, + "learning_rate": 2.2830342577487763e-05, + "loss": 0.1772, + "num_input_tokens_seen": 6065312, + "step": 2800 + }, + { + "epoch": 0.45758564437194127, + "grad_norm": 2.106416702270508, + "learning_rate": 2.2871125611745514e-05, + "loss": 0.4018, + "num_input_tokens_seen": 6075072, + "step": 2805 + }, + { + "epoch": 0.45840130505709625, + "grad_norm": 1.8106433153152466, + "learning_rate": 2.2911908646003262e-05, + "loss": 0.0895, + "num_input_tokens_seen": 6084992, + "step": 2810 + }, + { + "epoch": 0.45921696574225124, + "grad_norm": 3.2367923259735107, + "learning_rate": 2.2952691680261013e-05, + "loss": 0.1232, + "num_input_tokens_seen": 6095424, + "step": 2815 + }, + { + "epoch": 0.4600326264274062, + "grad_norm": 1.3443167209625244, + "learning_rate": 2.299347471451876e-05, + "loss": 0.1228, + "num_input_tokens_seen": 6107328, + "step": 2820 + }, + { + "epoch": 0.46084828711256115, + "grad_norm": 0.49068576097488403, + "learning_rate": 2.3034257748776512e-05, + "loss": 0.0234, + "num_input_tokens_seen": 6118048, + "step": 2825 + }, + { + "epoch": 0.46166394779771613, + "grad_norm": 1.948642373085022, + "learning_rate": 2.307504078303426e-05, + "loss": 0.3937, + "num_input_tokens_seen": 6128512, + "step": 2830 + }, + { + "epoch": 0.4624796084828711, + "grad_norm": 0.6373794674873352, + "learning_rate": 2.3115823817292007e-05, + "loss": 0.0599, + "num_input_tokens_seen": 6138720, + "step": 2835 + }, + { + "epoch": 0.4632952691680261, + "grad_norm": 2.4720773696899414, + "learning_rate": 2.3156606851549755e-05, + "loss": 0.2404, + "num_input_tokens_seen": 6149600, + "step": 2840 + }, + { + "epoch": 0.4641109298531811, + "grad_norm": 2.3904786109924316, + "learning_rate": 2.3197389885807506e-05, + "loss": 0.134, + "num_input_tokens_seen": 6160608, + "step": 2845 + }, + { + "epoch": 0.46492659053833607, + "grad_norm": 1.3608145713806152, + "learning_rate": 2.3238172920065253e-05, + "loss": 0.2464, + "num_input_tokens_seen": 6171968, + "step": 2850 + }, + { + "epoch": 0.46574225122349105, + "grad_norm": 1.800126075744629, + "learning_rate": 2.3278955954323004e-05, + "loss": 0.1798, + "num_input_tokens_seen": 6183360, + "step": 2855 + }, + { + "epoch": 0.466557911908646, + "grad_norm": 1.3392291069030762, + "learning_rate": 2.3319738988580752e-05, + "loss": 0.0942, + "num_input_tokens_seen": 6193664, + "step": 2860 + }, + { + "epoch": 0.46737357259380097, + "grad_norm": 0.522480309009552, + "learning_rate": 2.33605220228385e-05, + "loss": 0.0443, + "num_input_tokens_seen": 6204320, + "step": 2865 + }, + { + "epoch": 0.46818923327895595, + "grad_norm": 1.7256078720092773, + "learning_rate": 2.3401305057096247e-05, + "loss": 0.3283, + "num_input_tokens_seen": 6214560, + "step": 2870 + }, + { + "epoch": 0.46900489396411094, + "grad_norm": 1.0729522705078125, + "learning_rate": 2.3442088091354e-05, + "loss": 0.293, + "num_input_tokens_seen": 6226176, + "step": 2875 + }, + { + "epoch": 0.4698205546492659, + "grad_norm": 0.8386189341545105, + "learning_rate": 2.3482871125611746e-05, + "loss": 0.4924, + "num_input_tokens_seen": 6236768, + "step": 2880 + }, + { + "epoch": 0.4706362153344209, + "grad_norm": 3.0372092723846436, + "learning_rate": 2.3523654159869497e-05, + "loss": 0.1802, + "num_input_tokens_seen": 6247424, + "step": 2885 + }, + { + "epoch": 0.47145187601957583, + "grad_norm": 0.3947007358074188, + "learning_rate": 2.3564437194127245e-05, + "loss": 0.0537, + "num_input_tokens_seen": 6257280, + "step": 2890 + }, + { + "epoch": 0.4722675367047308, + "grad_norm": 1.7978843450546265, + "learning_rate": 2.3605220228384996e-05, + "loss": 0.1934, + "num_input_tokens_seen": 6268160, + "step": 2895 + }, + { + "epoch": 0.4730831973898858, + "grad_norm": 0.2250419408082962, + "learning_rate": 2.364600326264274e-05, + "loss": 0.092, + "num_input_tokens_seen": 6278976, + "step": 2900 + }, + { + "epoch": 0.4738988580750408, + "grad_norm": 0.0751931443810463, + "learning_rate": 2.368678629690049e-05, + "loss": 0.1814, + "num_input_tokens_seen": 6290144, + "step": 2905 + }, + { + "epoch": 0.47471451876019577, + "grad_norm": 0.7637354731559753, + "learning_rate": 2.372756933115824e-05, + "loss": 0.1224, + "num_input_tokens_seen": 6301760, + "step": 2910 + }, + { + "epoch": 0.47553017944535075, + "grad_norm": 0.12859490513801575, + "learning_rate": 2.376835236541599e-05, + "loss": 0.4383, + "num_input_tokens_seen": 6312416, + "step": 2915 + }, + { + "epoch": 0.4763458401305057, + "grad_norm": 0.5231532454490662, + "learning_rate": 2.3809135399673737e-05, + "loss": 0.1181, + "num_input_tokens_seen": 6323488, + "step": 2920 + }, + { + "epoch": 0.47716150081566067, + "grad_norm": 2.2103540897369385, + "learning_rate": 2.384991843393149e-05, + "loss": 0.3103, + "num_input_tokens_seen": 6334496, + "step": 2925 + }, + { + "epoch": 0.47797716150081565, + "grad_norm": 1.1206231117248535, + "learning_rate": 2.3890701468189233e-05, + "loss": 0.3338, + "num_input_tokens_seen": 6344320, + "step": 2930 + }, + { + "epoch": 0.47879282218597063, + "grad_norm": 2.323606491088867, + "learning_rate": 2.3931484502446984e-05, + "loss": 0.2765, + "num_input_tokens_seen": 6355904, + "step": 2935 + }, + { + "epoch": 0.4796084828711256, + "grad_norm": 2.122671127319336, + "learning_rate": 2.397226753670473e-05, + "loss": 0.3437, + "num_input_tokens_seen": 6366016, + "step": 2940 + }, + { + "epoch": 0.4804241435562806, + "grad_norm": 1.5134105682373047, + "learning_rate": 2.4013050570962482e-05, + "loss": 0.0754, + "num_input_tokens_seen": 6375712, + "step": 2945 + }, + { + "epoch": 0.4812398042414356, + "grad_norm": 0.1432291567325592, + "learning_rate": 2.405383360522023e-05, + "loss": 0.1341, + "num_input_tokens_seen": 6386720, + "step": 2950 + }, + { + "epoch": 0.4820554649265905, + "grad_norm": 0.10092782974243164, + "learning_rate": 2.4094616639477978e-05, + "loss": 0.0839, + "num_input_tokens_seen": 6397536, + "step": 2955 + }, + { + "epoch": 0.4828711256117455, + "grad_norm": 1.9689542055130005, + "learning_rate": 2.4135399673735725e-05, + "loss": 0.2548, + "num_input_tokens_seen": 6408384, + "step": 2960 + }, + { + "epoch": 0.4836867862969005, + "grad_norm": 1.3863824605941772, + "learning_rate": 2.4176182707993476e-05, + "loss": 0.1177, + "num_input_tokens_seen": 6420640, + "step": 2965 + }, + { + "epoch": 0.48450244698205547, + "grad_norm": 0.24607890844345093, + "learning_rate": 2.4216965742251224e-05, + "loss": 0.1626, + "num_input_tokens_seen": 6431680, + "step": 2970 + }, + { + "epoch": 0.48531810766721045, + "grad_norm": 0.569159209728241, + "learning_rate": 2.425774877650897e-05, + "loss": 0.0798, + "num_input_tokens_seen": 6442368, + "step": 2975 + }, + { + "epoch": 0.48613376835236544, + "grad_norm": 0.019526738673448563, + "learning_rate": 2.4298531810766723e-05, + "loss": 0.1184, + "num_input_tokens_seen": 6453856, + "step": 2980 + }, + { + "epoch": 0.48694942903752036, + "grad_norm": 0.6877635717391968, + "learning_rate": 2.433931484502447e-05, + "loss": 0.2915, + "num_input_tokens_seen": 6464640, + "step": 2985 + }, + { + "epoch": 0.48776508972267535, + "grad_norm": 2.784242868423462, + "learning_rate": 2.438009787928222e-05, + "loss": 0.1751, + "num_input_tokens_seen": 6475392, + "step": 2990 + }, + { + "epoch": 0.48858075040783033, + "grad_norm": 1.3387733697891235, + "learning_rate": 2.442088091353997e-05, + "loss": 0.1517, + "num_input_tokens_seen": 6486880, + "step": 2995 + }, + { + "epoch": 0.4893964110929853, + "grad_norm": 0.1691960245370865, + "learning_rate": 2.4461663947797717e-05, + "loss": 0.141, + "num_input_tokens_seen": 6497216, + "step": 3000 + }, + { + "epoch": 0.4902120717781403, + "grad_norm": 1.3357646465301514, + "learning_rate": 2.4502446982055464e-05, + "loss": 0.2395, + "num_input_tokens_seen": 6508320, + "step": 3005 + }, + { + "epoch": 0.4910277324632953, + "grad_norm": 2.1551332473754883, + "learning_rate": 2.4543230016313215e-05, + "loss": 0.2423, + "num_input_tokens_seen": 6519520, + "step": 3010 + }, + { + "epoch": 0.49184339314845027, + "grad_norm": 0.9169336557388306, + "learning_rate": 2.4584013050570963e-05, + "loss": 0.1753, + "num_input_tokens_seen": 6530240, + "step": 3015 + }, + { + "epoch": 0.4926590538336052, + "grad_norm": 1.8618212938308716, + "learning_rate": 2.4624796084828714e-05, + "loss": 0.3145, + "num_input_tokens_seen": 6540192, + "step": 3020 + }, + { + "epoch": 0.4934747145187602, + "grad_norm": 0.2941511869430542, + "learning_rate": 2.466557911908646e-05, + "loss": 0.0944, + "num_input_tokens_seen": 6550016, + "step": 3025 + }, + { + "epoch": 0.49429037520391517, + "grad_norm": 0.23173175752162933, + "learning_rate": 2.470636215334421e-05, + "loss": 0.134, + "num_input_tokens_seen": 6560672, + "step": 3030 + }, + { + "epoch": 0.49510603588907015, + "grad_norm": 0.37472182512283325, + "learning_rate": 2.4747145187601957e-05, + "loss": 0.0945, + "num_input_tokens_seen": 6571808, + "step": 3035 + }, + { + "epoch": 0.49592169657422513, + "grad_norm": 0.8540402054786682, + "learning_rate": 2.4787928221859708e-05, + "loss": 0.2594, + "num_input_tokens_seen": 6582624, + "step": 3040 + }, + { + "epoch": 0.4967373572593801, + "grad_norm": 0.6165176630020142, + "learning_rate": 2.4828711256117456e-05, + "loss": 0.2534, + "num_input_tokens_seen": 6595008, + "step": 3045 + }, + { + "epoch": 0.49755301794453505, + "grad_norm": 1.5081135034561157, + "learning_rate": 2.4869494290375207e-05, + "loss": 0.1354, + "num_input_tokens_seen": 6606976, + "step": 3050 + }, + { + "epoch": 0.49836867862969003, + "grad_norm": 1.721940040588379, + "learning_rate": 2.4910277324632954e-05, + "loss": 0.1365, + "num_input_tokens_seen": 6617760, + "step": 3055 + }, + { + "epoch": 0.499184339314845, + "grad_norm": 2.795393943786621, + "learning_rate": 2.4951060358890702e-05, + "loss": 0.1433, + "num_input_tokens_seen": 6628896, + "step": 3060 + }, + { + "epoch": 0.5, + "grad_norm": 2.8703060150146484, + "learning_rate": 2.499184339314845e-05, + "loss": 0.2946, + "num_input_tokens_seen": 6639424, + "step": 3065 + }, + { + "epoch": 0.5, + "eval_loss": 0.1869235336780548, + "eval_runtime": 132.3268, + "eval_samples_per_second": 20.593, + "eval_steps_per_second": 5.154, + "num_input_tokens_seen": 6639424, + "step": 3065 + }, + { + "epoch": 0.5008156606851549, + "grad_norm": 1.3174020051956177, + "learning_rate": 2.5032626427406204e-05, + "loss": 0.2417, + "num_input_tokens_seen": 6649280, + "step": 3070 + }, + { + "epoch": 0.50163132137031, + "grad_norm": 0.6986455321311951, + "learning_rate": 2.507340946166395e-05, + "loss": 0.1122, + "num_input_tokens_seen": 6659008, + "step": 3075 + }, + { + "epoch": 0.5024469820554649, + "grad_norm": 1.5233278274536133, + "learning_rate": 2.5114192495921696e-05, + "loss": 0.1664, + "num_input_tokens_seen": 6669312, + "step": 3080 + }, + { + "epoch": 0.5032626427406199, + "grad_norm": 1.634453296661377, + "learning_rate": 2.5154975530179447e-05, + "loss": 0.3517, + "num_input_tokens_seen": 6679968, + "step": 3085 + }, + { + "epoch": 0.5040783034257749, + "grad_norm": 2.461886167526245, + "learning_rate": 2.5195758564437195e-05, + "loss": 0.2109, + "num_input_tokens_seen": 6689728, + "step": 3090 + }, + { + "epoch": 0.5048939641109299, + "grad_norm": 1.9994043111801147, + "learning_rate": 2.5236541598694946e-05, + "loss": 0.4812, + "num_input_tokens_seen": 6700448, + "step": 3095 + }, + { + "epoch": 0.5057096247960848, + "grad_norm": 1.5028187036514282, + "learning_rate": 2.5277324632952693e-05, + "loss": 0.1368, + "num_input_tokens_seen": 6711424, + "step": 3100 + }, + { + "epoch": 0.5065252854812398, + "grad_norm": 1.0185061693191528, + "learning_rate": 2.5318107667210438e-05, + "loss": 0.0759, + "num_input_tokens_seen": 6722752, + "step": 3105 + }, + { + "epoch": 0.5073409461663948, + "grad_norm": 0.6809740662574768, + "learning_rate": 2.5358890701468192e-05, + "loss": 0.3854, + "num_input_tokens_seen": 6734080, + "step": 3110 + }, + { + "epoch": 0.5081566068515497, + "grad_norm": 0.5781834721565247, + "learning_rate": 2.5399673735725936e-05, + "loss": 0.1012, + "num_input_tokens_seen": 6744672, + "step": 3115 + }, + { + "epoch": 0.5089722675367048, + "grad_norm": 1.9785404205322266, + "learning_rate": 2.544045676998369e-05, + "loss": 0.2218, + "num_input_tokens_seen": 6755104, + "step": 3120 + }, + { + "epoch": 0.5097879282218597, + "grad_norm": 1.2857749462127686, + "learning_rate": 2.5481239804241435e-05, + "loss": 0.2261, + "num_input_tokens_seen": 6767168, + "step": 3125 + }, + { + "epoch": 0.5106035889070146, + "grad_norm": 0.2167927622795105, + "learning_rate": 2.552202283849919e-05, + "loss": 0.187, + "num_input_tokens_seen": 6777472, + "step": 3130 + }, + { + "epoch": 0.5114192495921697, + "grad_norm": 3.0851855278015137, + "learning_rate": 2.5562805872756934e-05, + "loss": 0.2878, + "num_input_tokens_seen": 6787776, + "step": 3135 + }, + { + "epoch": 0.5122349102773246, + "grad_norm": 1.2497187852859497, + "learning_rate": 2.560358890701468e-05, + "loss": 0.3231, + "num_input_tokens_seen": 6799008, + "step": 3140 + }, + { + "epoch": 0.5130505709624796, + "grad_norm": 0.16389091312885284, + "learning_rate": 2.5644371941272432e-05, + "loss": 0.1161, + "num_input_tokens_seen": 6809440, + "step": 3145 + }, + { + "epoch": 0.5138662316476346, + "grad_norm": 0.12030970305204391, + "learning_rate": 2.568515497553018e-05, + "loss": 0.0774, + "num_input_tokens_seen": 6819456, + "step": 3150 + }, + { + "epoch": 0.5146818923327896, + "grad_norm": 1.5228848457336426, + "learning_rate": 2.572593800978793e-05, + "loss": 0.1101, + "num_input_tokens_seen": 6829312, + "step": 3155 + }, + { + "epoch": 0.5154975530179445, + "grad_norm": 1.2761099338531494, + "learning_rate": 2.576672104404568e-05, + "loss": 0.3629, + "num_input_tokens_seen": 6839456, + "step": 3160 + }, + { + "epoch": 0.5163132137030995, + "grad_norm": 0.3170281946659088, + "learning_rate": 2.580750407830343e-05, + "loss": 0.2687, + "num_input_tokens_seen": 6850752, + "step": 3165 + }, + { + "epoch": 0.5171288743882545, + "grad_norm": 0.7285338044166565, + "learning_rate": 2.5848287112561177e-05, + "loss": 0.1214, + "num_input_tokens_seen": 6862144, + "step": 3170 + }, + { + "epoch": 0.5179445350734094, + "grad_norm": 2.7989420890808105, + "learning_rate": 2.588907014681892e-05, + "loss": 0.2117, + "num_input_tokens_seen": 6872704, + "step": 3175 + }, + { + "epoch": 0.5187601957585645, + "grad_norm": 2.5124666690826416, + "learning_rate": 2.5929853181076673e-05, + "loss": 0.1702, + "num_input_tokens_seen": 6883232, + "step": 3180 + }, + { + "epoch": 0.5195758564437194, + "grad_norm": 1.7549992799758911, + "learning_rate": 2.597063621533442e-05, + "loss": 0.3014, + "num_input_tokens_seen": 6894368, + "step": 3185 + }, + { + "epoch": 0.5203915171288744, + "grad_norm": 1.7443325519561768, + "learning_rate": 2.601141924959217e-05, + "loss": 0.1908, + "num_input_tokens_seen": 6906336, + "step": 3190 + }, + { + "epoch": 0.5212071778140294, + "grad_norm": 0.22508245706558228, + "learning_rate": 2.605220228384992e-05, + "loss": 0.1156, + "num_input_tokens_seen": 6917440, + "step": 3195 + }, + { + "epoch": 0.5220228384991843, + "grad_norm": 1.6739757061004639, + "learning_rate": 2.609298531810767e-05, + "loss": 0.3984, + "num_input_tokens_seen": 6928736, + "step": 3200 + }, + { + "epoch": 0.5228384991843393, + "grad_norm": 1.3097021579742432, + "learning_rate": 2.6133768352365418e-05, + "loss": 0.1948, + "num_input_tokens_seen": 6939488, + "step": 3205 + }, + { + "epoch": 0.5236541598694943, + "grad_norm": 0.4659516215324402, + "learning_rate": 2.6174551386623165e-05, + "loss": 0.0611, + "num_input_tokens_seen": 6950016, + "step": 3210 + }, + { + "epoch": 0.5244698205546493, + "grad_norm": 2.1895430088043213, + "learning_rate": 2.6215334420880916e-05, + "loss": 0.2994, + "num_input_tokens_seen": 6961280, + "step": 3215 + }, + { + "epoch": 0.5252854812398042, + "grad_norm": 0.9825932383537292, + "learning_rate": 2.6256117455138664e-05, + "loss": 0.1014, + "num_input_tokens_seen": 6971360, + "step": 3220 + }, + { + "epoch": 0.5261011419249593, + "grad_norm": 2.113788366317749, + "learning_rate": 2.6296900489396415e-05, + "loss": 0.4298, + "num_input_tokens_seen": 6982528, + "step": 3225 + }, + { + "epoch": 0.5269168026101142, + "grad_norm": 0.8410818576812744, + "learning_rate": 2.633768352365416e-05, + "loss": 0.0912, + "num_input_tokens_seen": 6992704, + "step": 3230 + }, + { + "epoch": 0.5277324632952691, + "grad_norm": 1.4548765420913696, + "learning_rate": 2.6378466557911907e-05, + "loss": 0.2967, + "num_input_tokens_seen": 7003072, + "step": 3235 + }, + { + "epoch": 0.5285481239804242, + "grad_norm": 1.0208672285079956, + "learning_rate": 2.6419249592169658e-05, + "loss": 0.1798, + "num_input_tokens_seen": 7013728, + "step": 3240 + }, + { + "epoch": 0.5293637846655791, + "grad_norm": 1.641165018081665, + "learning_rate": 2.6460032626427406e-05, + "loss": 0.0907, + "num_input_tokens_seen": 7024256, + "step": 3245 + }, + { + "epoch": 0.5301794453507341, + "grad_norm": 0.9203339219093323, + "learning_rate": 2.6500815660685157e-05, + "loss": 0.3748, + "num_input_tokens_seen": 7034688, + "step": 3250 + }, + { + "epoch": 0.5309951060358891, + "grad_norm": 1.3660693168640137, + "learning_rate": 2.6541598694942904e-05, + "loss": 0.1606, + "num_input_tokens_seen": 7046080, + "step": 3255 + }, + { + "epoch": 0.531810766721044, + "grad_norm": 1.6320232152938843, + "learning_rate": 2.6582381729200655e-05, + "loss": 0.1978, + "num_input_tokens_seen": 7056032, + "step": 3260 + }, + { + "epoch": 0.532626427406199, + "grad_norm": 1.6352487802505493, + "learning_rate": 2.6623164763458403e-05, + "loss": 0.2057, + "num_input_tokens_seen": 7066880, + "step": 3265 + }, + { + "epoch": 0.533442088091354, + "grad_norm": 2.2797794342041016, + "learning_rate": 2.666394779771615e-05, + "loss": 0.2925, + "num_input_tokens_seen": 7077664, + "step": 3270 + }, + { + "epoch": 0.534257748776509, + "grad_norm": 2.437703847885132, + "learning_rate": 2.67047308319739e-05, + "loss": 0.1696, + "num_input_tokens_seen": 7088928, + "step": 3275 + }, + { + "epoch": 0.5350734094616639, + "grad_norm": 0.5809819102287292, + "learning_rate": 2.6745513866231646e-05, + "loss": 0.1999, + "num_input_tokens_seen": 7100800, + "step": 3280 + }, + { + "epoch": 0.535889070146819, + "grad_norm": 1.1705827713012695, + "learning_rate": 2.67862969004894e-05, + "loss": 0.1386, + "num_input_tokens_seen": 7112160, + "step": 3285 + }, + { + "epoch": 0.5367047308319739, + "grad_norm": 0.7980926632881165, + "learning_rate": 2.6827079934747145e-05, + "loss": 0.1649, + "num_input_tokens_seen": 7122848, + "step": 3290 + }, + { + "epoch": 0.5375203915171288, + "grad_norm": 1.3709770441055298, + "learning_rate": 2.68678629690049e-05, + "loss": 0.1931, + "num_input_tokens_seen": 7132800, + "step": 3295 + }, + { + "epoch": 0.5383360522022839, + "grad_norm": 0.3947302997112274, + "learning_rate": 2.6908646003262643e-05, + "loss": 0.1402, + "num_input_tokens_seen": 7144064, + "step": 3300 + }, + { + "epoch": 0.5391517128874388, + "grad_norm": 0.9864395260810852, + "learning_rate": 2.694942903752039e-05, + "loss": 0.1774, + "num_input_tokens_seen": 7155744, + "step": 3305 + }, + { + "epoch": 0.5399673735725938, + "grad_norm": 0.21754276752471924, + "learning_rate": 2.6990212071778142e-05, + "loss": 0.0467, + "num_input_tokens_seen": 7166336, + "step": 3310 + }, + { + "epoch": 0.5407830342577488, + "grad_norm": 0.10675032436847687, + "learning_rate": 2.703099510603589e-05, + "loss": 0.2205, + "num_input_tokens_seen": 7176832, + "step": 3315 + }, + { + "epoch": 0.5415986949429038, + "grad_norm": 1.3385330438613892, + "learning_rate": 2.707177814029364e-05, + "loss": 0.1988, + "num_input_tokens_seen": 7187616, + "step": 3320 + }, + { + "epoch": 0.5424143556280587, + "grad_norm": 0.06198494881391525, + "learning_rate": 2.7112561174551388e-05, + "loss": 0.1784, + "num_input_tokens_seen": 7198144, + "step": 3325 + }, + { + "epoch": 0.5432300163132137, + "grad_norm": 1.267365574836731, + "learning_rate": 2.715334420880914e-05, + "loss": 0.1195, + "num_input_tokens_seen": 7208960, + "step": 3330 + }, + { + "epoch": 0.5440456769983687, + "grad_norm": 0.09285911917686462, + "learning_rate": 2.7194127243066887e-05, + "loss": 0.3097, + "num_input_tokens_seen": 7220064, + "step": 3335 + }, + { + "epoch": 0.5448613376835236, + "grad_norm": 0.37407758831977844, + "learning_rate": 2.723491027732463e-05, + "loss": 0.2128, + "num_input_tokens_seen": 7231008, + "step": 3340 + }, + { + "epoch": 0.5456769983686787, + "grad_norm": 1.2999653816223145, + "learning_rate": 2.7275693311582386e-05, + "loss": 0.3561, + "num_input_tokens_seen": 7242144, + "step": 3345 + }, + { + "epoch": 0.5464926590538336, + "grad_norm": 0.8629180192947388, + "learning_rate": 2.731647634584013e-05, + "loss": 0.2733, + "num_input_tokens_seen": 7252448, + "step": 3350 + }, + { + "epoch": 0.5473083197389886, + "grad_norm": 0.23098208010196686, + "learning_rate": 2.7357259380097884e-05, + "loss": 0.1719, + "num_input_tokens_seen": 7261856, + "step": 3355 + }, + { + "epoch": 0.5481239804241436, + "grad_norm": 0.3350803852081299, + "learning_rate": 2.739804241435563e-05, + "loss": 0.1187, + "num_input_tokens_seen": 7272800, + "step": 3360 + }, + { + "epoch": 0.5489396411092985, + "grad_norm": 1.9838169813156128, + "learning_rate": 2.7438825448613376e-05, + "loss": 0.2015, + "num_input_tokens_seen": 7283808, + "step": 3365 + }, + { + "epoch": 0.5497553017944535, + "grad_norm": 0.6555441617965698, + "learning_rate": 2.7479608482871127e-05, + "loss": 0.0606, + "num_input_tokens_seen": 7294976, + "step": 3370 + }, + { + "epoch": 0.5505709624796085, + "grad_norm": 0.5716204047203064, + "learning_rate": 2.7520391517128875e-05, + "loss": 0.1756, + "num_input_tokens_seen": 7306560, + "step": 3375 + }, + { + "epoch": 0.5513866231647635, + "grad_norm": 0.3020555078983307, + "learning_rate": 2.7561174551386626e-05, + "loss": 0.0402, + "num_input_tokens_seen": 7317600, + "step": 3380 + }, + { + "epoch": 0.5522022838499184, + "grad_norm": 1.7957122325897217, + "learning_rate": 2.7601957585644373e-05, + "loss": 0.1115, + "num_input_tokens_seen": 7330144, + "step": 3385 + }, + { + "epoch": 0.5530179445350734, + "grad_norm": 1.741129994392395, + "learning_rate": 2.7642740619902125e-05, + "loss": 0.2083, + "num_input_tokens_seen": 7341248, + "step": 3390 + }, + { + "epoch": 0.5538336052202284, + "grad_norm": 1.6629507541656494, + "learning_rate": 2.7683523654159872e-05, + "loss": 0.2427, + "num_input_tokens_seen": 7351200, + "step": 3395 + }, + { + "epoch": 0.5546492659053833, + "grad_norm": 0.6279044151306152, + "learning_rate": 2.7724306688417616e-05, + "loss": 0.0465, + "num_input_tokens_seen": 7361792, + "step": 3400 + }, + { + "epoch": 0.5554649265905384, + "grad_norm": 1.7130972146987915, + "learning_rate": 2.776508972267537e-05, + "loss": 0.1665, + "num_input_tokens_seen": 7372160, + "step": 3405 + }, + { + "epoch": 0.5562805872756933, + "grad_norm": 0.32833874225616455, + "learning_rate": 2.7805872756933115e-05, + "loss": 0.0832, + "num_input_tokens_seen": 7381408, + "step": 3410 + }, + { + "epoch": 0.5570962479608483, + "grad_norm": 2.365764617919922, + "learning_rate": 2.7846655791190866e-05, + "loss": 0.2382, + "num_input_tokens_seen": 7392512, + "step": 3415 + }, + { + "epoch": 0.5579119086460033, + "grad_norm": 1.3806045055389404, + "learning_rate": 2.7887438825448614e-05, + "loss": 0.2299, + "num_input_tokens_seen": 7404192, + "step": 3420 + }, + { + "epoch": 0.5587275693311582, + "grad_norm": 0.024680066853761673, + "learning_rate": 2.7928221859706365e-05, + "loss": 0.1904, + "num_input_tokens_seen": 7415008, + "step": 3425 + }, + { + "epoch": 0.5595432300163132, + "grad_norm": 1.3443577289581299, + "learning_rate": 2.7969004893964112e-05, + "loss": 0.1258, + "num_input_tokens_seen": 7425088, + "step": 3430 + }, + { + "epoch": 0.5603588907014682, + "grad_norm": 1.059841513633728, + "learning_rate": 2.800978792822186e-05, + "loss": 0.2935, + "num_input_tokens_seen": 7436608, + "step": 3435 + }, + { + "epoch": 0.5611745513866232, + "grad_norm": 0.3945305347442627, + "learning_rate": 2.805057096247961e-05, + "loss": 0.1878, + "num_input_tokens_seen": 7446784, + "step": 3440 + }, + { + "epoch": 0.5619902120717781, + "grad_norm": 0.2547900676727295, + "learning_rate": 2.809135399673736e-05, + "loss": 0.1722, + "num_input_tokens_seen": 7456640, + "step": 3445 + }, + { + "epoch": 0.5628058727569332, + "grad_norm": 0.36951524019241333, + "learning_rate": 2.813213703099511e-05, + "loss": 0.2653, + "num_input_tokens_seen": 7468096, + "step": 3450 + }, + { + "epoch": 0.5636215334420881, + "grad_norm": 1.3985049724578857, + "learning_rate": 2.8172920065252857e-05, + "loss": 0.2331, + "num_input_tokens_seen": 7480096, + "step": 3455 + }, + { + "epoch": 0.564437194127243, + "grad_norm": 0.41456514596939087, + "learning_rate": 2.8213703099510602e-05, + "loss": 0.137, + "num_input_tokens_seen": 7491136, + "step": 3460 + }, + { + "epoch": 0.5652528548123981, + "grad_norm": 0.2760438323020935, + "learning_rate": 2.8254486133768353e-05, + "loss": 0.0909, + "num_input_tokens_seen": 7502304, + "step": 3465 + }, + { + "epoch": 0.566068515497553, + "grad_norm": 0.47616180777549744, + "learning_rate": 2.82952691680261e-05, + "loss": 0.1592, + "num_input_tokens_seen": 7512928, + "step": 3470 + }, + { + "epoch": 0.566884176182708, + "grad_norm": 0.2349841594696045, + "learning_rate": 2.833605220228385e-05, + "loss": 0.2282, + "num_input_tokens_seen": 7523904, + "step": 3475 + }, + { + "epoch": 0.567699836867863, + "grad_norm": 2.1416139602661133, + "learning_rate": 2.83768352365416e-05, + "loss": 0.369, + "num_input_tokens_seen": 7534496, + "step": 3480 + }, + { + "epoch": 0.5685154975530179, + "grad_norm": 2.0568671226501465, + "learning_rate": 2.841761827079935e-05, + "loss": 0.3968, + "num_input_tokens_seen": 7545120, + "step": 3485 + }, + { + "epoch": 0.5693311582381729, + "grad_norm": 1.0830539464950562, + "learning_rate": 2.8458401305057098e-05, + "loss": 0.1145, + "num_input_tokens_seen": 7555712, + "step": 3490 + }, + { + "epoch": 0.5701468189233279, + "grad_norm": 0.37009501457214355, + "learning_rate": 2.8499184339314845e-05, + "loss": 0.0786, + "num_input_tokens_seen": 7567200, + "step": 3495 + }, + { + "epoch": 0.5709624796084829, + "grad_norm": 1.5800613164901733, + "learning_rate": 2.8539967373572596e-05, + "loss": 0.251, + "num_input_tokens_seen": 7578464, + "step": 3500 + }, + { + "epoch": 0.5717781402936378, + "grad_norm": 1.018689751625061, + "learning_rate": 2.858075040783034e-05, + "loss": 0.184, + "num_input_tokens_seen": 7588256, + "step": 3505 + }, + { + "epoch": 0.5725938009787929, + "grad_norm": 1.336984634399414, + "learning_rate": 2.8621533442088095e-05, + "loss": 0.3123, + "num_input_tokens_seen": 7598848, + "step": 3510 + }, + { + "epoch": 0.5734094616639478, + "grad_norm": 1.050323724746704, + "learning_rate": 2.866231647634584e-05, + "loss": 0.1251, + "num_input_tokens_seen": 7609696, + "step": 3515 + }, + { + "epoch": 0.5742251223491027, + "grad_norm": 0.5272635221481323, + "learning_rate": 2.8703099510603594e-05, + "loss": 0.0874, + "num_input_tokens_seen": 7620864, + "step": 3520 + }, + { + "epoch": 0.5750407830342578, + "grad_norm": 1.886742353439331, + "learning_rate": 2.8743882544861338e-05, + "loss": 0.0998, + "num_input_tokens_seen": 7631392, + "step": 3525 + }, + { + "epoch": 0.5758564437194127, + "grad_norm": 0.27875006198883057, + "learning_rate": 2.8784665579119086e-05, + "loss": 0.1168, + "num_input_tokens_seen": 7643232, + "step": 3530 + }, + { + "epoch": 0.5766721044045677, + "grad_norm": 0.23623165488243103, + "learning_rate": 2.8825448613376837e-05, + "loss": 0.2091, + "num_input_tokens_seen": 7654944, + "step": 3535 + }, + { + "epoch": 0.5774877650897227, + "grad_norm": 1.4585511684417725, + "learning_rate": 2.8866231647634584e-05, + "loss": 0.1117, + "num_input_tokens_seen": 7665760, + "step": 3540 + }, + { + "epoch": 0.5783034257748777, + "grad_norm": 0.29399406909942627, + "learning_rate": 2.8907014681892335e-05, + "loss": 0.0878, + "num_input_tokens_seen": 7676576, + "step": 3545 + }, + { + "epoch": 0.5791190864600326, + "grad_norm": 0.6470755338668823, + "learning_rate": 2.8947797716150083e-05, + "loss": 0.301, + "num_input_tokens_seen": 7687104, + "step": 3550 + }, + { + "epoch": 0.5799347471451876, + "grad_norm": 1.6273497343063354, + "learning_rate": 2.8988580750407834e-05, + "loss": 0.1052, + "num_input_tokens_seen": 7698080, + "step": 3555 + }, + { + "epoch": 0.5807504078303426, + "grad_norm": 1.4072333574295044, + "learning_rate": 2.9029363784665582e-05, + "loss": 0.2493, + "num_input_tokens_seen": 7709632, + "step": 3560 + }, + { + "epoch": 0.5815660685154975, + "grad_norm": 0.6595863699913025, + "learning_rate": 2.9070146818923326e-05, + "loss": 0.0776, + "num_input_tokens_seen": 7719936, + "step": 3565 + }, + { + "epoch": 0.5823817292006526, + "grad_norm": 0.5274279117584229, + "learning_rate": 2.911092985318108e-05, + "loss": 0.1038, + "num_input_tokens_seen": 7730080, + "step": 3570 + }, + { + "epoch": 0.5831973898858075, + "grad_norm": 0.9455099701881409, + "learning_rate": 2.9151712887438825e-05, + "loss": 0.1505, + "num_input_tokens_seen": 7740576, + "step": 3575 + }, + { + "epoch": 0.5840130505709625, + "grad_norm": 0.07032564282417297, + "learning_rate": 2.919249592169658e-05, + "loss": 0.2131, + "num_input_tokens_seen": 7751040, + "step": 3580 + }, + { + "epoch": 0.5848287112561175, + "grad_norm": 0.052492592483758926, + "learning_rate": 2.9233278955954323e-05, + "loss": 0.0535, + "num_input_tokens_seen": 7760768, + "step": 3585 + }, + { + "epoch": 0.5856443719412724, + "grad_norm": 0.19630420207977295, + "learning_rate": 2.927406199021207e-05, + "loss": 0.0816, + "num_input_tokens_seen": 7771680, + "step": 3590 + }, + { + "epoch": 0.5864600326264274, + "grad_norm": 0.35516485571861267, + "learning_rate": 2.9314845024469822e-05, + "loss": 0.1278, + "num_input_tokens_seen": 7783072, + "step": 3595 + }, + { + "epoch": 0.5872756933115824, + "grad_norm": 0.2818853259086609, + "learning_rate": 2.935562805872757e-05, + "loss": 0.095, + "num_input_tokens_seen": 7792608, + "step": 3600 + }, + { + "epoch": 0.5880913539967374, + "grad_norm": 0.17740480601787567, + "learning_rate": 2.939641109298532e-05, + "loss": 0.2733, + "num_input_tokens_seen": 7803744, + "step": 3605 + }, + { + "epoch": 0.5889070146818923, + "grad_norm": 0.8036673665046692, + "learning_rate": 2.943719412724307e-05, + "loss": 0.1092, + "num_input_tokens_seen": 7815008, + "step": 3610 + }, + { + "epoch": 0.5897226753670473, + "grad_norm": 0.9958035349845886, + "learning_rate": 2.947797716150082e-05, + "loss": 0.2809, + "num_input_tokens_seen": 7825536, + "step": 3615 + }, + { + "epoch": 0.5905383360522023, + "grad_norm": 0.15587951242923737, + "learning_rate": 2.9518760195758567e-05, + "loss": 0.1764, + "num_input_tokens_seen": 7836160, + "step": 3620 + }, + { + "epoch": 0.5913539967373572, + "grad_norm": 1.1619831323623657, + "learning_rate": 2.955954323001631e-05, + "loss": 0.0826, + "num_input_tokens_seen": 7848864, + "step": 3625 + }, + { + "epoch": 0.5921696574225123, + "grad_norm": 1.8797065019607544, + "learning_rate": 2.9600326264274066e-05, + "loss": 0.1503, + "num_input_tokens_seen": 7859488, + "step": 3630 + }, + { + "epoch": 0.5929853181076672, + "grad_norm": 0.7004976272583008, + "learning_rate": 2.964110929853181e-05, + "loss": 0.168, + "num_input_tokens_seen": 7869600, + "step": 3635 + }, + { + "epoch": 0.5938009787928222, + "grad_norm": 1.4462761878967285, + "learning_rate": 2.968189233278956e-05, + "loss": 0.2242, + "num_input_tokens_seen": 7879840, + "step": 3640 + }, + { + "epoch": 0.5946166394779772, + "grad_norm": 0.0964532122015953, + "learning_rate": 2.972267536704731e-05, + "loss": 0.0933, + "num_input_tokens_seen": 7890272, + "step": 3645 + }, + { + "epoch": 0.5954323001631321, + "grad_norm": 3.1874380111694336, + "learning_rate": 2.976345840130506e-05, + "loss": 0.3486, + "num_input_tokens_seen": 7901120, + "step": 3650 + }, + { + "epoch": 0.5962479608482871, + "grad_norm": 1.3083733320236206, + "learning_rate": 2.9804241435562807e-05, + "loss": 0.0864, + "num_input_tokens_seen": 7912608, + "step": 3655 + }, + { + "epoch": 0.5970636215334421, + "grad_norm": 0.9140175580978394, + "learning_rate": 2.9845024469820555e-05, + "loss": 0.2499, + "num_input_tokens_seen": 7923936, + "step": 3660 + }, + { + "epoch": 0.5978792822185971, + "grad_norm": 1.9506852626800537, + "learning_rate": 2.9885807504078306e-05, + "loss": 0.284, + "num_input_tokens_seen": 7934496, + "step": 3665 + }, + { + "epoch": 0.598694942903752, + "grad_norm": 1.6006230115890503, + "learning_rate": 2.9926590538336054e-05, + "loss": 0.2032, + "num_input_tokens_seen": 7945728, + "step": 3670 + }, + { + "epoch": 0.5995106035889071, + "grad_norm": 0.18785792589187622, + "learning_rate": 2.9967373572593805e-05, + "loss": 0.1365, + "num_input_tokens_seen": 7956960, + "step": 3675 + }, + { + "epoch": 0.600326264274062, + "grad_norm": 1.094731092453003, + "learning_rate": 3.0008156606851552e-05, + "loss": 0.1421, + "num_input_tokens_seen": 7969440, + "step": 3680 + }, + { + "epoch": 0.6011419249592169, + "grad_norm": 1.0951755046844482, + "learning_rate": 3.0048939641109303e-05, + "loss": 0.1337, + "num_input_tokens_seen": 7978560, + "step": 3685 + }, + { + "epoch": 0.601957585644372, + "grad_norm": 0.8618472814559937, + "learning_rate": 3.0089722675367048e-05, + "loss": 0.2517, + "num_input_tokens_seen": 7990400, + "step": 3690 + }, + { + "epoch": 0.6027732463295269, + "grad_norm": 1.4578768014907837, + "learning_rate": 3.0130505709624795e-05, + "loss": 0.1591, + "num_input_tokens_seen": 8000128, + "step": 3695 + }, + { + "epoch": 0.6035889070146819, + "grad_norm": 0.08261454850435257, + "learning_rate": 3.0171288743882546e-05, + "loss": 0.1251, + "num_input_tokens_seen": 8010592, + "step": 3700 + }, + { + "epoch": 0.6044045676998369, + "grad_norm": 1.2404423952102661, + "learning_rate": 3.0212071778140294e-05, + "loss": 0.2306, + "num_input_tokens_seen": 8022240, + "step": 3705 + }, + { + "epoch": 0.6052202283849919, + "grad_norm": 0.3733838200569153, + "learning_rate": 3.0252854812398045e-05, + "loss": 0.2092, + "num_input_tokens_seen": 8032096, + "step": 3710 + }, + { + "epoch": 0.6060358890701468, + "grad_norm": 2.7544119358062744, + "learning_rate": 3.0293637846655793e-05, + "loss": 0.2861, + "num_input_tokens_seen": 8042400, + "step": 3715 + }, + { + "epoch": 0.6068515497553018, + "grad_norm": 0.295664519071579, + "learning_rate": 3.033442088091354e-05, + "loss": 0.0915, + "num_input_tokens_seen": 8052672, + "step": 3720 + }, + { + "epoch": 0.6076672104404568, + "grad_norm": 0.6411544680595398, + "learning_rate": 3.037520391517129e-05, + "loss": 0.1512, + "num_input_tokens_seen": 8063712, + "step": 3725 + }, + { + "epoch": 0.6084828711256117, + "grad_norm": 1.3559644222259521, + "learning_rate": 3.041598694942904e-05, + "loss": 0.1158, + "num_input_tokens_seen": 8073600, + "step": 3730 + }, + { + "epoch": 0.6092985318107668, + "grad_norm": 0.3749676048755646, + "learning_rate": 3.045676998368679e-05, + "loss": 0.1844, + "num_input_tokens_seen": 8085664, + "step": 3735 + }, + { + "epoch": 0.6101141924959217, + "grad_norm": 1.4180079698562622, + "learning_rate": 3.0497553017944534e-05, + "loss": 0.1959, + "num_input_tokens_seen": 8097152, + "step": 3740 + }, + { + "epoch": 0.6109298531810766, + "grad_norm": 1.2302302122116089, + "learning_rate": 3.0538336052202285e-05, + "loss": 0.1121, + "num_input_tokens_seen": 8107648, + "step": 3745 + }, + { + "epoch": 0.6117455138662317, + "grad_norm": 1.7100049257278442, + "learning_rate": 3.057911908646003e-05, + "loss": 0.3169, + "num_input_tokens_seen": 8118272, + "step": 3750 + }, + { + "epoch": 0.6125611745513866, + "grad_norm": 0.8462449908256531, + "learning_rate": 3.061990212071778e-05, + "loss": 0.1118, + "num_input_tokens_seen": 8129120, + "step": 3755 + }, + { + "epoch": 0.6133768352365416, + "grad_norm": 0.2153860479593277, + "learning_rate": 3.0660685154975535e-05, + "loss": 0.1762, + "num_input_tokens_seen": 8139200, + "step": 3760 + }, + { + "epoch": 0.6141924959216966, + "grad_norm": 0.7844712734222412, + "learning_rate": 3.0701468189233276e-05, + "loss": 0.132, + "num_input_tokens_seen": 8150272, + "step": 3765 + }, + { + "epoch": 0.6150081566068516, + "grad_norm": 2.743211269378662, + "learning_rate": 3.074225122349103e-05, + "loss": 0.2935, + "num_input_tokens_seen": 8162496, + "step": 3770 + }, + { + "epoch": 0.6158238172920065, + "grad_norm": 0.4109259843826294, + "learning_rate": 3.078303425774878e-05, + "loss": 0.1799, + "num_input_tokens_seen": 8173664, + "step": 3775 + }, + { + "epoch": 0.6166394779771615, + "grad_norm": 0.3772255778312683, + "learning_rate": 3.082381729200653e-05, + "loss": 0.3055, + "num_input_tokens_seen": 8184512, + "step": 3780 + }, + { + "epoch": 0.6174551386623165, + "grad_norm": 1.7265052795410156, + "learning_rate": 3.086460032626427e-05, + "loss": 0.2311, + "num_input_tokens_seen": 8194432, + "step": 3785 + }, + { + "epoch": 0.6182707993474714, + "grad_norm": 1.5105286836624146, + "learning_rate": 3.090538336052202e-05, + "loss": 0.1113, + "num_input_tokens_seen": 8205152, + "step": 3790 + }, + { + "epoch": 0.6190864600326265, + "grad_norm": 0.2853895425796509, + "learning_rate": 3.0946166394779775e-05, + "loss": 0.1774, + "num_input_tokens_seen": 8215360, + "step": 3795 + }, + { + "epoch": 0.6199021207177814, + "grad_norm": 0.24685432016849518, + "learning_rate": 3.098694942903752e-05, + "loss": 0.1131, + "num_input_tokens_seen": 8225216, + "step": 3800 + }, + { + "epoch": 0.6207177814029364, + "grad_norm": 1.8795397281646729, + "learning_rate": 3.102773246329527e-05, + "loss": 0.196, + "num_input_tokens_seen": 8235488, + "step": 3805 + }, + { + "epoch": 0.6215334420880914, + "grad_norm": 0.7603802680969238, + "learning_rate": 3.106851549755302e-05, + "loss": 0.1461, + "num_input_tokens_seen": 8246208, + "step": 3810 + }, + { + "epoch": 0.6223491027732463, + "grad_norm": 1.0045170783996582, + "learning_rate": 3.1109298531810766e-05, + "loss": 0.1635, + "num_input_tokens_seen": 8258048, + "step": 3815 + }, + { + "epoch": 0.6231647634584013, + "grad_norm": 1.6275588274002075, + "learning_rate": 3.115008156606852e-05, + "loss": 0.1145, + "num_input_tokens_seen": 8269472, + "step": 3820 + }, + { + "epoch": 0.6239804241435563, + "grad_norm": 2.998537302017212, + "learning_rate": 3.119086460032626e-05, + "loss": 0.3316, + "num_input_tokens_seen": 8280512, + "step": 3825 + }, + { + "epoch": 0.6247960848287113, + "grad_norm": 0.6193655729293823, + "learning_rate": 3.1231647634584016e-05, + "loss": 0.1959, + "num_input_tokens_seen": 8291360, + "step": 3830 + }, + { + "epoch": 0.6256117455138662, + "grad_norm": 0.23274919390678406, + "learning_rate": 3.127243066884176e-05, + "loss": 0.0575, + "num_input_tokens_seen": 8301440, + "step": 3835 + }, + { + "epoch": 0.6264274061990212, + "grad_norm": 2.0886945724487305, + "learning_rate": 3.131321370309952e-05, + "loss": 0.2852, + "num_input_tokens_seen": 8313472, + "step": 3840 + }, + { + "epoch": 0.6272430668841762, + "grad_norm": 0.09431463479995728, + "learning_rate": 3.135399673735726e-05, + "loss": 0.0282, + "num_input_tokens_seen": 8323488, + "step": 3845 + }, + { + "epoch": 0.6280587275693311, + "grad_norm": 0.4216545820236206, + "learning_rate": 3.1394779771615006e-05, + "loss": 0.0485, + "num_input_tokens_seen": 8336096, + "step": 3850 + }, + { + "epoch": 0.6288743882544862, + "grad_norm": 1.8134320974349976, + "learning_rate": 3.143556280587276e-05, + "loss": 0.2596, + "num_input_tokens_seen": 8344736, + "step": 3855 + }, + { + "epoch": 0.6296900489396411, + "grad_norm": 0.18447917699813843, + "learning_rate": 3.147634584013051e-05, + "loss": 0.1132, + "num_input_tokens_seen": 8356416, + "step": 3860 + }, + { + "epoch": 0.6305057096247961, + "grad_norm": 0.24256375432014465, + "learning_rate": 3.1517128874388256e-05, + "loss": 0.2093, + "num_input_tokens_seen": 8367712, + "step": 3865 + }, + { + "epoch": 0.6313213703099511, + "grad_norm": 0.13160741329193115, + "learning_rate": 3.1557911908646004e-05, + "loss": 0.2129, + "num_input_tokens_seen": 8378752, + "step": 3870 + }, + { + "epoch": 0.632137030995106, + "grad_norm": 1.558432936668396, + "learning_rate": 3.159869494290376e-05, + "loss": 0.2321, + "num_input_tokens_seen": 8389472, + "step": 3875 + }, + { + "epoch": 0.632952691680261, + "grad_norm": 0.15893925726413727, + "learning_rate": 3.1639477977161506e-05, + "loss": 0.1981, + "num_input_tokens_seen": 8400064, + "step": 3880 + }, + { + "epoch": 0.633768352365416, + "grad_norm": 0.5538126826286316, + "learning_rate": 3.1680261011419246e-05, + "loss": 0.1092, + "num_input_tokens_seen": 8410176, + "step": 3885 + }, + { + "epoch": 0.634584013050571, + "grad_norm": 0.9902182817459106, + "learning_rate": 3.1721044045677e-05, + "loss": 0.3046, + "num_input_tokens_seen": 8420256, + "step": 3890 + }, + { + "epoch": 0.6353996737357259, + "grad_norm": 0.2671620845794678, + "learning_rate": 3.176182707993475e-05, + "loss": 0.1703, + "num_input_tokens_seen": 8431808, + "step": 3895 + }, + { + "epoch": 0.636215334420881, + "grad_norm": 0.5846095681190491, + "learning_rate": 3.1802610114192496e-05, + "loss": 0.0737, + "num_input_tokens_seen": 8444064, + "step": 3900 + }, + { + "epoch": 0.6370309951060359, + "grad_norm": 1.4352977275848389, + "learning_rate": 3.1843393148450244e-05, + "loss": 0.2703, + "num_input_tokens_seen": 8454944, + "step": 3905 + }, + { + "epoch": 0.6378466557911908, + "grad_norm": 0.41372212767601013, + "learning_rate": 3.1884176182708e-05, + "loss": 0.0663, + "num_input_tokens_seen": 8466208, + "step": 3910 + }, + { + "epoch": 0.6386623164763459, + "grad_norm": 0.3155847489833832, + "learning_rate": 3.1924959216965746e-05, + "loss": 0.1165, + "num_input_tokens_seen": 8476480, + "step": 3915 + }, + { + "epoch": 0.6394779771615008, + "grad_norm": 1.5493156909942627, + "learning_rate": 3.1965742251223494e-05, + "loss": 0.1826, + "num_input_tokens_seen": 8486688, + "step": 3920 + }, + { + "epoch": 0.6402936378466558, + "grad_norm": 0.19440703094005585, + "learning_rate": 3.200652528548124e-05, + "loss": 0.195, + "num_input_tokens_seen": 8499200, + "step": 3925 + }, + { + "epoch": 0.6411092985318108, + "grad_norm": 1.2557703256607056, + "learning_rate": 3.204730831973899e-05, + "loss": 0.1563, + "num_input_tokens_seen": 8509728, + "step": 3930 + }, + { + "epoch": 0.6419249592169658, + "grad_norm": 2.224519729614258, + "learning_rate": 3.208809135399674e-05, + "loss": 0.1559, + "num_input_tokens_seen": 8520928, + "step": 3935 + }, + { + "epoch": 0.6427406199021207, + "grad_norm": 0.10380819439888, + "learning_rate": 3.2128874388254484e-05, + "loss": 0.1696, + "num_input_tokens_seen": 8532448, + "step": 3940 + }, + { + "epoch": 0.6435562805872757, + "grad_norm": 0.3382575511932373, + "learning_rate": 3.216965742251223e-05, + "loss": 0.2216, + "num_input_tokens_seen": 8543360, + "step": 3945 + }, + { + "epoch": 0.6443719412724307, + "grad_norm": 0.44937607645988464, + "learning_rate": 3.2210440456769986e-05, + "loss": 0.2427, + "num_input_tokens_seen": 8554176, + "step": 3950 + }, + { + "epoch": 0.6451876019575856, + "grad_norm": 0.6430274248123169, + "learning_rate": 3.2251223491027734e-05, + "loss": 0.1447, + "num_input_tokens_seen": 8565376, + "step": 3955 + }, + { + "epoch": 0.6460032626427407, + "grad_norm": 0.2522242069244385, + "learning_rate": 3.229200652528548e-05, + "loss": 0.2942, + "num_input_tokens_seen": 8576480, + "step": 3960 + }, + { + "epoch": 0.6468189233278956, + "grad_norm": 0.1704854965209961, + "learning_rate": 3.233278955954323e-05, + "loss": 0.1101, + "num_input_tokens_seen": 8586944, + "step": 3965 + }, + { + "epoch": 0.6476345840130505, + "grad_norm": 0.2656114101409912, + "learning_rate": 3.2373572593800984e-05, + "loss": 0.1653, + "num_input_tokens_seen": 8596640, + "step": 3970 + }, + { + "epoch": 0.6484502446982056, + "grad_norm": 0.32051053643226624, + "learning_rate": 3.241435562805873e-05, + "loss": 0.088, + "num_input_tokens_seen": 8608256, + "step": 3975 + }, + { + "epoch": 0.6492659053833605, + "grad_norm": 1.062200665473938, + "learning_rate": 3.245513866231648e-05, + "loss": 0.1907, + "num_input_tokens_seen": 8618752, + "step": 3980 + }, + { + "epoch": 0.6500815660685155, + "grad_norm": 0.08999307453632355, + "learning_rate": 3.2495921696574226e-05, + "loss": 0.1677, + "num_input_tokens_seen": 8629824, + "step": 3985 + }, + { + "epoch": 0.6508972267536705, + "grad_norm": 0.1572684496641159, + "learning_rate": 3.2536704730831974e-05, + "loss": 0.2017, + "num_input_tokens_seen": 8640832, + "step": 3990 + }, + { + "epoch": 0.6517128874388255, + "grad_norm": 0.16816380620002747, + "learning_rate": 3.257748776508973e-05, + "loss": 0.211, + "num_input_tokens_seen": 8651104, + "step": 3995 + }, + { + "epoch": 0.6525285481239804, + "grad_norm": 1.6684448719024658, + "learning_rate": 3.261827079934747e-05, + "loss": 0.2144, + "num_input_tokens_seen": 8662112, + "step": 4000 + }, + { + "epoch": 0.6533442088091354, + "grad_norm": 1.6554170846939087, + "learning_rate": 3.2659053833605224e-05, + "loss": 0.1038, + "num_input_tokens_seen": 8672960, + "step": 4005 + }, + { + "epoch": 0.6541598694942904, + "grad_norm": 0.1408659964799881, + "learning_rate": 3.269983686786297e-05, + "loss": 0.1135, + "num_input_tokens_seen": 8683968, + "step": 4010 + }, + { + "epoch": 0.6549755301794453, + "grad_norm": 2.1625583171844482, + "learning_rate": 3.274061990212072e-05, + "loss": 0.1519, + "num_input_tokens_seen": 8695200, + "step": 4015 + }, + { + "epoch": 0.6557911908646004, + "grad_norm": 1.3420214653015137, + "learning_rate": 3.278140293637847e-05, + "loss": 0.2179, + "num_input_tokens_seen": 8705408, + "step": 4020 + }, + { + "epoch": 0.6566068515497553, + "grad_norm": 1.9934234619140625, + "learning_rate": 3.2822185970636214e-05, + "loss": 0.144, + "num_input_tokens_seen": 8716128, + "step": 4025 + }, + { + "epoch": 0.6574225122349103, + "grad_norm": 0.6495137214660645, + "learning_rate": 3.286296900489397e-05, + "loss": 0.3275, + "num_input_tokens_seen": 8727744, + "step": 4030 + }, + { + "epoch": 0.6582381729200653, + "grad_norm": 0.8493125438690186, + "learning_rate": 3.2903752039151716e-05, + "loss": 0.194, + "num_input_tokens_seen": 8740128, + "step": 4035 + }, + { + "epoch": 0.6590538336052202, + "grad_norm": 2.768663167953491, + "learning_rate": 3.2944535073409464e-05, + "loss": 0.2515, + "num_input_tokens_seen": 8752320, + "step": 4040 + }, + { + "epoch": 0.6598694942903752, + "grad_norm": 0.4861355125904083, + "learning_rate": 3.298531810766721e-05, + "loss": 0.1941, + "num_input_tokens_seen": 8762784, + "step": 4045 + }, + { + "epoch": 0.6606851549755302, + "grad_norm": 0.24323298037052155, + "learning_rate": 3.302610114192496e-05, + "loss": 0.1335, + "num_input_tokens_seen": 8773312, + "step": 4050 + }, + { + "epoch": 0.6615008156606852, + "grad_norm": 0.635265052318573, + "learning_rate": 3.3066884176182714e-05, + "loss": 0.1895, + "num_input_tokens_seen": 8783744, + "step": 4055 + }, + { + "epoch": 0.6623164763458401, + "grad_norm": 0.32602640986442566, + "learning_rate": 3.3107667210440455e-05, + "loss": 0.071, + "num_input_tokens_seen": 8795104, + "step": 4060 + }, + { + "epoch": 0.6631321370309952, + "grad_norm": 0.5591351985931396, + "learning_rate": 3.314845024469821e-05, + "loss": 0.1261, + "num_input_tokens_seen": 8805760, + "step": 4065 + }, + { + "epoch": 0.6639477977161501, + "grad_norm": 2.0875802040100098, + "learning_rate": 3.318923327895596e-05, + "loss": 0.1011, + "num_input_tokens_seen": 8817536, + "step": 4070 + }, + { + "epoch": 0.664763458401305, + "grad_norm": 0.8281918168067932, + "learning_rate": 3.3230016313213704e-05, + "loss": 0.0537, + "num_input_tokens_seen": 8829184, + "step": 4075 + }, + { + "epoch": 0.6655791190864601, + "grad_norm": 0.09719588607549667, + "learning_rate": 3.327079934747145e-05, + "loss": 0.1126, + "num_input_tokens_seen": 8839328, + "step": 4080 + }, + { + "epoch": 0.666394779771615, + "grad_norm": 1.0670149326324463, + "learning_rate": 3.33115823817292e-05, + "loss": 0.1794, + "num_input_tokens_seen": 8850336, + "step": 4085 + }, + { + "epoch": 0.66721044045677, + "grad_norm": 0.5992256999015808, + "learning_rate": 3.3352365415986954e-05, + "loss": 0.1848, + "num_input_tokens_seen": 8859392, + "step": 4090 + }, + { + "epoch": 0.668026101141925, + "grad_norm": 0.10344874113798141, + "learning_rate": 3.33931484502447e-05, + "loss": 0.1048, + "num_input_tokens_seen": 8871008, + "step": 4095 + }, + { + "epoch": 0.6688417618270799, + "grad_norm": 0.1503552496433258, + "learning_rate": 3.343393148450245e-05, + "loss": 0.2035, + "num_input_tokens_seen": 8881760, + "step": 4100 + }, + { + "epoch": 0.6696574225122349, + "grad_norm": 1.8119827508926392, + "learning_rate": 3.34747145187602e-05, + "loss": 0.3326, + "num_input_tokens_seen": 8892544, + "step": 4105 + }, + { + "epoch": 0.6704730831973899, + "grad_norm": 2.0805768966674805, + "learning_rate": 3.3515497553017945e-05, + "loss": 0.3225, + "num_input_tokens_seen": 8903936, + "step": 4110 + }, + { + "epoch": 0.6712887438825449, + "grad_norm": 0.1837233603000641, + "learning_rate": 3.35562805872757e-05, + "loss": 0.1804, + "num_input_tokens_seen": 8913952, + "step": 4115 + }, + { + "epoch": 0.6721044045676998, + "grad_norm": 1.9629334211349487, + "learning_rate": 3.359706362153344e-05, + "loss": 0.2121, + "num_input_tokens_seen": 8925120, + "step": 4120 + }, + { + "epoch": 0.6729200652528549, + "grad_norm": 0.1613834798336029, + "learning_rate": 3.3637846655791194e-05, + "loss": 0.1407, + "num_input_tokens_seen": 8935872, + "step": 4125 + }, + { + "epoch": 0.6737357259380098, + "grad_norm": 0.21415840089321136, + "learning_rate": 3.367862969004894e-05, + "loss": 0.1668, + "num_input_tokens_seen": 8946624, + "step": 4130 + }, + { + "epoch": 0.6745513866231647, + "grad_norm": 0.2994208037853241, + "learning_rate": 3.371941272430669e-05, + "loss": 0.1243, + "num_input_tokens_seen": 8957568, + "step": 4135 + }, + { + "epoch": 0.6753670473083198, + "grad_norm": 1.561730146408081, + "learning_rate": 3.376019575856444e-05, + "loss": 0.2402, + "num_input_tokens_seen": 8968160, + "step": 4140 + }, + { + "epoch": 0.6761827079934747, + "grad_norm": 0.7743952870368958, + "learning_rate": 3.3800978792822185e-05, + "loss": 0.1476, + "num_input_tokens_seen": 8978560, + "step": 4145 + }, + { + "epoch": 0.6769983686786297, + "grad_norm": 1.5301467180252075, + "learning_rate": 3.384176182707994e-05, + "loss": 0.1735, + "num_input_tokens_seen": 8988896, + "step": 4150 + }, + { + "epoch": 0.6778140293637847, + "grad_norm": 1.3827495574951172, + "learning_rate": 3.388254486133769e-05, + "loss": 0.2049, + "num_input_tokens_seen": 9000928, + "step": 4155 + }, + { + "epoch": 0.6786296900489397, + "grad_norm": 0.32373371720314026, + "learning_rate": 3.3923327895595435e-05, + "loss": 0.1088, + "num_input_tokens_seen": 9011712, + "step": 4160 + }, + { + "epoch": 0.6794453507340946, + "grad_norm": 0.9602869749069214, + "learning_rate": 3.396411092985318e-05, + "loss": 0.2013, + "num_input_tokens_seen": 9022624, + "step": 4165 + }, + { + "epoch": 0.6802610114192496, + "grad_norm": 0.07210908830165863, + "learning_rate": 3.400489396411093e-05, + "loss": 0.1921, + "num_input_tokens_seen": 9033088, + "step": 4170 + }, + { + "epoch": 0.6810766721044046, + "grad_norm": 3.103898286819458, + "learning_rate": 3.404567699836868e-05, + "loss": 0.2776, + "num_input_tokens_seen": 9043680, + "step": 4175 + }, + { + "epoch": 0.6818923327895595, + "grad_norm": 1.3201634883880615, + "learning_rate": 3.4086460032626425e-05, + "loss": 0.1743, + "num_input_tokens_seen": 9055424, + "step": 4180 + }, + { + "epoch": 0.6827079934747146, + "grad_norm": 1.4667233228683472, + "learning_rate": 3.412724306688418e-05, + "loss": 0.0815, + "num_input_tokens_seen": 9066816, + "step": 4185 + }, + { + "epoch": 0.6835236541598695, + "grad_norm": 0.30274611711502075, + "learning_rate": 3.416802610114193e-05, + "loss": 0.1795, + "num_input_tokens_seen": 9077792, + "step": 4190 + }, + { + "epoch": 0.6843393148450244, + "grad_norm": 0.20953816175460815, + "learning_rate": 3.4208809135399675e-05, + "loss": 0.259, + "num_input_tokens_seen": 9088160, + "step": 4195 + }, + { + "epoch": 0.6851549755301795, + "grad_norm": 0.11291848123073578, + "learning_rate": 3.424959216965742e-05, + "loss": 0.0544, + "num_input_tokens_seen": 9099264, + "step": 4200 + }, + { + "epoch": 0.6859706362153344, + "grad_norm": 1.5547826290130615, + "learning_rate": 3.429037520391517e-05, + "loss": 0.2437, + "num_input_tokens_seen": 9110336, + "step": 4205 + }, + { + "epoch": 0.6867862969004894, + "grad_norm": 1.3994802236557007, + "learning_rate": 3.4331158238172925e-05, + "loss": 0.0677, + "num_input_tokens_seen": 9121248, + "step": 4210 + }, + { + "epoch": 0.6876019575856444, + "grad_norm": 1.909999132156372, + "learning_rate": 3.4371941272430666e-05, + "loss": 0.1214, + "num_input_tokens_seen": 9132896, + "step": 4215 + }, + { + "epoch": 0.6884176182707994, + "grad_norm": 0.07696279138326645, + "learning_rate": 3.441272430668842e-05, + "loss": 0.1172, + "num_input_tokens_seen": 9143968, + "step": 4220 + }, + { + "epoch": 0.6892332789559543, + "grad_norm": 0.1180364266037941, + "learning_rate": 3.445350734094617e-05, + "loss": 0.1202, + "num_input_tokens_seen": 9154848, + "step": 4225 + }, + { + "epoch": 0.6900489396411092, + "grad_norm": 0.08901848644018173, + "learning_rate": 3.449429037520392e-05, + "loss": 0.196, + "num_input_tokens_seen": 9167264, + "step": 4230 + }, + { + "epoch": 0.6908646003262643, + "grad_norm": 0.15825361013412476, + "learning_rate": 3.453507340946166e-05, + "loss": 0.0831, + "num_input_tokens_seen": 9178016, + "step": 4235 + }, + { + "epoch": 0.6916802610114192, + "grad_norm": 0.39295828342437744, + "learning_rate": 3.457585644371941e-05, + "loss": 0.1276, + "num_input_tokens_seen": 9188224, + "step": 4240 + }, + { + "epoch": 0.6924959216965743, + "grad_norm": 0.03429195657372475, + "learning_rate": 3.4616639477977165e-05, + "loss": 0.0444, + "num_input_tokens_seen": 9198208, + "step": 4245 + }, + { + "epoch": 0.6933115823817292, + "grad_norm": 0.4621954560279846, + "learning_rate": 3.465742251223491e-05, + "loss": 0.1553, + "num_input_tokens_seen": 9208128, + "step": 4250 + }, + { + "epoch": 0.6941272430668842, + "grad_norm": 0.09658543765544891, + "learning_rate": 3.469820554649266e-05, + "loss": 0.0859, + "num_input_tokens_seen": 9218080, + "step": 4255 + }, + { + "epoch": 0.6949429037520392, + "grad_norm": 0.23600126802921295, + "learning_rate": 3.473898858075041e-05, + "loss": 0.0246, + "num_input_tokens_seen": 9228416, + "step": 4260 + }, + { + "epoch": 0.6957585644371941, + "grad_norm": 1.2098424434661865, + "learning_rate": 3.477977161500816e-05, + "loss": 0.2637, + "num_input_tokens_seen": 9239296, + "step": 4265 + }, + { + "epoch": 0.6965742251223491, + "grad_norm": 0.5443731546401978, + "learning_rate": 3.482055464926591e-05, + "loss": 0.243, + "num_input_tokens_seen": 9249472, + "step": 4270 + }, + { + "epoch": 0.697389885807504, + "grad_norm": 1.7058247327804565, + "learning_rate": 3.486133768352365e-05, + "loss": 0.1599, + "num_input_tokens_seen": 9259616, + "step": 4275 + }, + { + "epoch": 0.6982055464926591, + "grad_norm": 0.880430281162262, + "learning_rate": 3.4902120717781405e-05, + "loss": 0.098, + "num_input_tokens_seen": 9271360, + "step": 4280 + }, + { + "epoch": 0.699021207177814, + "grad_norm": 0.3391721248626709, + "learning_rate": 3.494290375203915e-05, + "loss": 0.1757, + "num_input_tokens_seen": 9281952, + "step": 4285 + }, + { + "epoch": 0.6998368678629691, + "grad_norm": 2.916903257369995, + "learning_rate": 3.498368678629691e-05, + "loss": 0.2134, + "num_input_tokens_seen": 9292480, + "step": 4290 + }, + { + "epoch": 0.700652528548124, + "grad_norm": 0.10674115270376205, + "learning_rate": 3.502446982055465e-05, + "loss": 0.3358, + "num_input_tokens_seen": 9301696, + "step": 4295 + }, + { + "epoch": 0.7014681892332789, + "grad_norm": 0.8638542890548706, + "learning_rate": 3.5065252854812396e-05, + "loss": 0.0722, + "num_input_tokens_seen": 9311360, + "step": 4300 + }, + { + "epoch": 0.702283849918434, + "grad_norm": 1.422343134880066, + "learning_rate": 3.510603588907015e-05, + "loss": 0.2793, + "num_input_tokens_seen": 9321504, + "step": 4305 + }, + { + "epoch": 0.7030995106035889, + "grad_norm": 0.9247894287109375, + "learning_rate": 3.51468189233279e-05, + "loss": 0.1935, + "num_input_tokens_seen": 9333024, + "step": 4310 + }, + { + "epoch": 0.7039151712887439, + "grad_norm": 1.024821162223816, + "learning_rate": 3.5187601957585646e-05, + "loss": 0.1292, + "num_input_tokens_seen": 9343424, + "step": 4315 + }, + { + "epoch": 0.7047308319738989, + "grad_norm": 0.027629321441054344, + "learning_rate": 3.522838499184339e-05, + "loss": 0.0612, + "num_input_tokens_seen": 9354016, + "step": 4320 + }, + { + "epoch": 0.7055464926590538, + "grad_norm": 0.12272980809211731, + "learning_rate": 3.526916802610115e-05, + "loss": 0.2035, + "num_input_tokens_seen": 9365600, + "step": 4325 + }, + { + "epoch": 0.7063621533442088, + "grad_norm": 0.27316489815711975, + "learning_rate": 3.5309951060358895e-05, + "loss": 0.1831, + "num_input_tokens_seen": 9377216, + "step": 4330 + }, + { + "epoch": 0.7071778140293637, + "grad_norm": 0.7400538325309753, + "learning_rate": 3.5350734094616636e-05, + "loss": 0.1277, + "num_input_tokens_seen": 9388000, + "step": 4335 + }, + { + "epoch": 0.7079934747145188, + "grad_norm": 1.2733510732650757, + "learning_rate": 3.539151712887439e-05, + "loss": 0.2185, + "num_input_tokens_seen": 9399616, + "step": 4340 + }, + { + "epoch": 0.7088091353996737, + "grad_norm": 1.2391202449798584, + "learning_rate": 3.543230016313214e-05, + "loss": 0.1477, + "num_input_tokens_seen": 9411872, + "step": 4345 + }, + { + "epoch": 0.7096247960848288, + "grad_norm": 0.8934322595596313, + "learning_rate": 3.5473083197389886e-05, + "loss": 0.2035, + "num_input_tokens_seen": 9422912, + "step": 4350 + }, + { + "epoch": 0.7104404567699837, + "grad_norm": 0.6965172290802002, + "learning_rate": 3.5513866231647634e-05, + "loss": 0.039, + "num_input_tokens_seen": 9434176, + "step": 4355 + }, + { + "epoch": 0.7112561174551386, + "grad_norm": 0.6112057566642761, + "learning_rate": 3.555464926590539e-05, + "loss": 0.157, + "num_input_tokens_seen": 9443936, + "step": 4360 + }, + { + "epoch": 0.7120717781402937, + "grad_norm": 0.17153038084506989, + "learning_rate": 3.5595432300163136e-05, + "loss": 0.0937, + "num_input_tokens_seen": 9453984, + "step": 4365 + }, + { + "epoch": 0.7128874388254486, + "grad_norm": 0.26257970929145813, + "learning_rate": 3.563621533442088e-05, + "loss": 0.1822, + "num_input_tokens_seen": 9465408, + "step": 4370 + }, + { + "epoch": 0.7137030995106036, + "grad_norm": 0.8564168214797974, + "learning_rate": 3.567699836867863e-05, + "loss": 0.1755, + "num_input_tokens_seen": 9475392, + "step": 4375 + }, + { + "epoch": 0.7145187601957586, + "grad_norm": 1.4156333208084106, + "learning_rate": 3.571778140293638e-05, + "loss": 0.2171, + "num_input_tokens_seen": 9485280, + "step": 4380 + }, + { + "epoch": 0.7153344208809136, + "grad_norm": 0.9891328811645508, + "learning_rate": 3.575856443719413e-05, + "loss": 0.0834, + "num_input_tokens_seen": 9496224, + "step": 4385 + }, + { + "epoch": 0.7161500815660685, + "grad_norm": 0.7931239008903503, + "learning_rate": 3.579934747145188e-05, + "loss": 0.124, + "num_input_tokens_seen": 9507456, + "step": 4390 + }, + { + "epoch": 0.7169657422512234, + "grad_norm": 1.8769522905349731, + "learning_rate": 3.584013050570963e-05, + "loss": 0.3355, + "num_input_tokens_seen": 9517952, + "step": 4395 + }, + { + "epoch": 0.7177814029363785, + "grad_norm": 1.9964611530303955, + "learning_rate": 3.5880913539967376e-05, + "loss": 0.1526, + "num_input_tokens_seen": 9528672, + "step": 4400 + }, + { + "epoch": 0.7185970636215334, + "grad_norm": 0.6138824820518494, + "learning_rate": 3.5921696574225124e-05, + "loss": 0.0632, + "num_input_tokens_seen": 9539552, + "step": 4405 + }, + { + "epoch": 0.7194127243066885, + "grad_norm": 1.445238471031189, + "learning_rate": 3.596247960848287e-05, + "loss": 0.2525, + "num_input_tokens_seen": 9550112, + "step": 4410 + }, + { + "epoch": 0.7202283849918434, + "grad_norm": 0.4267001152038574, + "learning_rate": 3.600326264274062e-05, + "loss": 0.1952, + "num_input_tokens_seen": 9560544, + "step": 4415 + }, + { + "epoch": 0.7210440456769984, + "grad_norm": 3.369941234588623, + "learning_rate": 3.604404567699837e-05, + "loss": 0.2682, + "num_input_tokens_seen": 9571232, + "step": 4420 + }, + { + "epoch": 0.7218597063621534, + "grad_norm": 0.1950523555278778, + "learning_rate": 3.608482871125612e-05, + "loss": 0.0653, + "num_input_tokens_seen": 9581920, + "step": 4425 + }, + { + "epoch": 0.7226753670473083, + "grad_norm": 0.6953341364860535, + "learning_rate": 3.612561174551387e-05, + "loss": 0.0725, + "num_input_tokens_seen": 9592448, + "step": 4430 + }, + { + "epoch": 0.7234910277324633, + "grad_norm": 0.3738235533237457, + "learning_rate": 3.6166394779771616e-05, + "loss": 0.1782, + "num_input_tokens_seen": 9603360, + "step": 4435 + }, + { + "epoch": 0.7243066884176182, + "grad_norm": 0.6815865635871887, + "learning_rate": 3.6207177814029364e-05, + "loss": 0.1549, + "num_input_tokens_seen": 9614336, + "step": 4440 + }, + { + "epoch": 0.7251223491027733, + "grad_norm": 0.20119497179985046, + "learning_rate": 3.624796084828712e-05, + "loss": 0.1119, + "num_input_tokens_seen": 9624512, + "step": 4445 + }, + { + "epoch": 0.7259380097879282, + "grad_norm": 0.20808187127113342, + "learning_rate": 3.628874388254486e-05, + "loss": 0.2274, + "num_input_tokens_seen": 9635168, + "step": 4450 + }, + { + "epoch": 0.7267536704730831, + "grad_norm": 1.6445269584655762, + "learning_rate": 3.6329526916802614e-05, + "loss": 0.2661, + "num_input_tokens_seen": 9645280, + "step": 4455 + }, + { + "epoch": 0.7275693311582382, + "grad_norm": 0.6649954319000244, + "learning_rate": 3.637030995106036e-05, + "loss": 0.1286, + "num_input_tokens_seen": 9656032, + "step": 4460 + }, + { + "epoch": 0.7283849918433931, + "grad_norm": 0.14932653307914734, + "learning_rate": 3.641109298531811e-05, + "loss": 0.2725, + "num_input_tokens_seen": 9667968, + "step": 4465 + }, + { + "epoch": 0.7292006525285482, + "grad_norm": 1.340802788734436, + "learning_rate": 3.6451876019575856e-05, + "loss": 0.0956, + "num_input_tokens_seen": 9680032, + "step": 4470 + }, + { + "epoch": 0.7300163132137031, + "grad_norm": 1.3877623081207275, + "learning_rate": 3.6492659053833604e-05, + "loss": 0.2648, + "num_input_tokens_seen": 9691424, + "step": 4475 + }, + { + "epoch": 0.7308319738988581, + "grad_norm": 1.4375659227371216, + "learning_rate": 3.653344208809136e-05, + "loss": 0.2401, + "num_input_tokens_seen": 9702400, + "step": 4480 + }, + { + "epoch": 0.731647634584013, + "grad_norm": 0.1922251284122467, + "learning_rate": 3.6574225122349106e-05, + "loss": 0.0745, + "num_input_tokens_seen": 9712928, + "step": 4485 + }, + { + "epoch": 0.732463295269168, + "grad_norm": 2.3438916206359863, + "learning_rate": 3.6615008156606854e-05, + "loss": 0.5354, + "num_input_tokens_seen": 9723360, + "step": 4490 + }, + { + "epoch": 0.733278955954323, + "grad_norm": 1.0176266431808472, + "learning_rate": 3.66557911908646e-05, + "loss": 0.0963, + "num_input_tokens_seen": 9734496, + "step": 4495 + }, + { + "epoch": 0.734094616639478, + "grad_norm": 1.480460286140442, + "learning_rate": 3.669657422512235e-05, + "loss": 0.2322, + "num_input_tokens_seen": 9744288, + "step": 4500 + }, + { + "epoch": 0.734910277324633, + "grad_norm": 1.7686793804168701, + "learning_rate": 3.6737357259380104e-05, + "loss": 0.2655, + "num_input_tokens_seen": 9755072, + "step": 4505 + }, + { + "epoch": 0.7357259380097879, + "grad_norm": 0.3204210698604584, + "learning_rate": 3.6778140293637844e-05, + "loss": 0.1352, + "num_input_tokens_seen": 9764192, + "step": 4510 + }, + { + "epoch": 0.736541598694943, + "grad_norm": 2.595867156982422, + "learning_rate": 3.68189233278956e-05, + "loss": 0.3091, + "num_input_tokens_seen": 9775392, + "step": 4515 + }, + { + "epoch": 0.7373572593800979, + "grad_norm": 0.5504496693611145, + "learning_rate": 3.6859706362153346e-05, + "loss": 0.1145, + "num_input_tokens_seen": 9784864, + "step": 4520 + }, + { + "epoch": 0.7381729200652528, + "grad_norm": 0.4828990399837494, + "learning_rate": 3.6900489396411094e-05, + "loss": 0.0566, + "num_input_tokens_seen": 9796512, + "step": 4525 + }, + { + "epoch": 0.7389885807504079, + "grad_norm": 1.1352708339691162, + "learning_rate": 3.694127243066884e-05, + "loss": 0.1091, + "num_input_tokens_seen": 9808416, + "step": 4530 + }, + { + "epoch": 0.7398042414355628, + "grad_norm": 2.0075783729553223, + "learning_rate": 3.698205546492659e-05, + "loss": 0.1451, + "num_input_tokens_seen": 9819552, + "step": 4535 + }, + { + "epoch": 0.7406199021207178, + "grad_norm": 0.424578994512558, + "learning_rate": 3.7022838499184344e-05, + "loss": 0.0948, + "num_input_tokens_seen": 9831168, + "step": 4540 + }, + { + "epoch": 0.7414355628058727, + "grad_norm": 1.360968828201294, + "learning_rate": 3.706362153344209e-05, + "loss": 0.0739, + "num_input_tokens_seen": 9841280, + "step": 4545 + }, + { + "epoch": 0.7422512234910277, + "grad_norm": 1.1034801006317139, + "learning_rate": 3.710440456769984e-05, + "loss": 0.1962, + "num_input_tokens_seen": 9852224, + "step": 4550 + }, + { + "epoch": 0.7430668841761827, + "grad_norm": 0.3710602819919586, + "learning_rate": 3.714518760195759e-05, + "loss": 0.3535, + "num_input_tokens_seen": 9862656, + "step": 4555 + }, + { + "epoch": 0.7438825448613376, + "grad_norm": 0.2828501760959625, + "learning_rate": 3.7185970636215334e-05, + "loss": 0.0395, + "num_input_tokens_seen": 9874304, + "step": 4560 + }, + { + "epoch": 0.7446982055464927, + "grad_norm": 0.6698804497718811, + "learning_rate": 3.722675367047309e-05, + "loss": 0.1083, + "num_input_tokens_seen": 9884128, + "step": 4565 + }, + { + "epoch": 0.7455138662316476, + "grad_norm": 2.002762794494629, + "learning_rate": 3.726753670473083e-05, + "loss": 0.132, + "num_input_tokens_seen": 9894208, + "step": 4570 + }, + { + "epoch": 0.7463295269168027, + "grad_norm": 2.6305642127990723, + "learning_rate": 3.7308319738988584e-05, + "loss": 0.2288, + "num_input_tokens_seen": 9904864, + "step": 4575 + }, + { + "epoch": 0.7471451876019576, + "grad_norm": 1.0546863079071045, + "learning_rate": 3.734910277324633e-05, + "loss": 0.1684, + "num_input_tokens_seen": 9915904, + "step": 4580 + }, + { + "epoch": 0.7479608482871125, + "grad_norm": 0.10014204680919647, + "learning_rate": 3.738988580750408e-05, + "loss": 0.2295, + "num_input_tokens_seen": 9926048, + "step": 4585 + }, + { + "epoch": 0.7487765089722676, + "grad_norm": 0.3518209457397461, + "learning_rate": 3.743066884176183e-05, + "loss": 0.0796, + "num_input_tokens_seen": 9936736, + "step": 4590 + }, + { + "epoch": 0.7495921696574225, + "grad_norm": 1.6999297142028809, + "learning_rate": 3.7471451876019575e-05, + "loss": 0.1722, + "num_input_tokens_seen": 9949408, + "step": 4595 + }, + { + "epoch": 0.7504078303425775, + "grad_norm": 0.7619360089302063, + "learning_rate": 3.751223491027733e-05, + "loss": 0.1096, + "num_input_tokens_seen": 9960160, + "step": 4600 + }, + { + "epoch": 0.7512234910277324, + "grad_norm": 1.5584419965744019, + "learning_rate": 3.755301794453508e-05, + "loss": 0.2107, + "num_input_tokens_seen": 9970624, + "step": 4605 + }, + { + "epoch": 0.7520391517128875, + "grad_norm": 0.8687570095062256, + "learning_rate": 3.7593800978792824e-05, + "loss": 0.1334, + "num_input_tokens_seen": 9982080, + "step": 4610 + }, + { + "epoch": 0.7528548123980424, + "grad_norm": 0.2780604958534241, + "learning_rate": 3.763458401305057e-05, + "loss": 0.0209, + "num_input_tokens_seen": 9992480, + "step": 4615 + }, + { + "epoch": 0.7536704730831973, + "grad_norm": 0.3853524327278137, + "learning_rate": 3.7675367047308326e-05, + "loss": 0.1928, + "num_input_tokens_seen": 10002496, + "step": 4620 + }, + { + "epoch": 0.7544861337683524, + "grad_norm": 1.3870559930801392, + "learning_rate": 3.771615008156607e-05, + "loss": 0.3045, + "num_input_tokens_seen": 10013408, + "step": 4625 + }, + { + "epoch": 0.7553017944535073, + "grad_norm": 0.8942268490791321, + "learning_rate": 3.7756933115823815e-05, + "loss": 0.1699, + "num_input_tokens_seen": 10022880, + "step": 4630 + }, + { + "epoch": 0.7561174551386624, + "grad_norm": 1.521225929260254, + "learning_rate": 3.779771615008157e-05, + "loss": 0.1045, + "num_input_tokens_seen": 10034208, + "step": 4635 + }, + { + "epoch": 0.7569331158238173, + "grad_norm": 1.199190616607666, + "learning_rate": 3.783849918433932e-05, + "loss": 0.165, + "num_input_tokens_seen": 10044832, + "step": 4640 + }, + { + "epoch": 0.7577487765089723, + "grad_norm": 0.2519892454147339, + "learning_rate": 3.7879282218597065e-05, + "loss": 0.1857, + "num_input_tokens_seen": 10055712, + "step": 4645 + }, + { + "epoch": 0.7585644371941273, + "grad_norm": 1.7427067756652832, + "learning_rate": 3.792006525285481e-05, + "loss": 0.187, + "num_input_tokens_seen": 10066976, + "step": 4650 + }, + { + "epoch": 0.7593800978792822, + "grad_norm": 1.1058930158615112, + "learning_rate": 3.796084828711256e-05, + "loss": 0.2178, + "num_input_tokens_seen": 10078304, + "step": 4655 + }, + { + "epoch": 0.7601957585644372, + "grad_norm": 3.2263786792755127, + "learning_rate": 3.8001631321370314e-05, + "loss": 0.3636, + "num_input_tokens_seen": 10089056, + "step": 4660 + }, + { + "epoch": 0.7610114192495921, + "grad_norm": 0.13389304280281067, + "learning_rate": 3.804241435562806e-05, + "loss": 0.2377, + "num_input_tokens_seen": 10099904, + "step": 4665 + }, + { + "epoch": 0.7618270799347472, + "grad_norm": 1.0420982837677002, + "learning_rate": 3.808319738988581e-05, + "loss": 0.0753, + "num_input_tokens_seen": 10111552, + "step": 4670 + }, + { + "epoch": 0.7626427406199021, + "grad_norm": 2.139227867126465, + "learning_rate": 3.812398042414356e-05, + "loss": 0.2722, + "num_input_tokens_seen": 10121568, + "step": 4675 + }, + { + "epoch": 0.763458401305057, + "grad_norm": 0.5846371054649353, + "learning_rate": 3.816476345840131e-05, + "loss": 0.2317, + "num_input_tokens_seen": 10132000, + "step": 4680 + }, + { + "epoch": 0.7642740619902121, + "grad_norm": 1.8069263696670532, + "learning_rate": 3.820554649265905e-05, + "loss": 0.2095, + "num_input_tokens_seen": 10141952, + "step": 4685 + }, + { + "epoch": 0.765089722675367, + "grad_norm": 0.3705402612686157, + "learning_rate": 3.82463295269168e-05, + "loss": 0.127, + "num_input_tokens_seen": 10151776, + "step": 4690 + }, + { + "epoch": 0.765905383360522, + "grad_norm": 0.8749772310256958, + "learning_rate": 3.8287112561174555e-05, + "loss": 0.1638, + "num_input_tokens_seen": 10162688, + "step": 4695 + }, + { + "epoch": 0.766721044045677, + "grad_norm": 0.11825856566429138, + "learning_rate": 3.83278955954323e-05, + "loss": 0.024, + "num_input_tokens_seen": 10174624, + "step": 4700 + }, + { + "epoch": 0.767536704730832, + "grad_norm": 0.36333194375038147, + "learning_rate": 3.836867862969005e-05, + "loss": 0.0694, + "num_input_tokens_seen": 10185248, + "step": 4705 + }, + { + "epoch": 0.768352365415987, + "grad_norm": 0.04190446063876152, + "learning_rate": 3.84094616639478e-05, + "loss": 0.0494, + "num_input_tokens_seen": 10196768, + "step": 4710 + }, + { + "epoch": 0.7691680261011419, + "grad_norm": 0.31323742866516113, + "learning_rate": 3.845024469820555e-05, + "loss": 0.0796, + "num_input_tokens_seen": 10206816, + "step": 4715 + }, + { + "epoch": 0.7699836867862969, + "grad_norm": 0.3558478057384491, + "learning_rate": 3.84910277324633e-05, + "loss": 0.1009, + "num_input_tokens_seen": 10217600, + "step": 4720 + }, + { + "epoch": 0.7707993474714518, + "grad_norm": 0.6769245266914368, + "learning_rate": 3.853181076672104e-05, + "loss": 0.194, + "num_input_tokens_seen": 10228544, + "step": 4725 + }, + { + "epoch": 0.7716150081566069, + "grad_norm": 0.138211190700531, + "learning_rate": 3.8572593800978795e-05, + "loss": 0.1273, + "num_input_tokens_seen": 10239168, + "step": 4730 + }, + { + "epoch": 0.7724306688417618, + "grad_norm": 0.5706911683082581, + "learning_rate": 3.861337683523654e-05, + "loss": 0.1449, + "num_input_tokens_seen": 10250784, + "step": 4735 + }, + { + "epoch": 0.7732463295269169, + "grad_norm": 1.215019941329956, + "learning_rate": 3.86541598694943e-05, + "loss": 0.0834, + "num_input_tokens_seen": 10260480, + "step": 4740 + }, + { + "epoch": 0.7740619902120718, + "grad_norm": 2.0132105350494385, + "learning_rate": 3.869494290375204e-05, + "loss": 0.2036, + "num_input_tokens_seen": 10272064, + "step": 4745 + }, + { + "epoch": 0.7748776508972267, + "grad_norm": 0.08571137487888336, + "learning_rate": 3.873572593800979e-05, + "loss": 0.0635, + "num_input_tokens_seen": 10281472, + "step": 4750 + }, + { + "epoch": 0.7756933115823818, + "grad_norm": 0.6688966155052185, + "learning_rate": 3.877650897226754e-05, + "loss": 0.1107, + "num_input_tokens_seen": 10293536, + "step": 4755 + }, + { + "epoch": 0.7765089722675367, + "grad_norm": 0.21157002449035645, + "learning_rate": 3.881729200652529e-05, + "loss": 0.1192, + "num_input_tokens_seen": 10303968, + "step": 4760 + }, + { + "epoch": 0.7773246329526917, + "grad_norm": 0.2018115222454071, + "learning_rate": 3.8858075040783035e-05, + "loss": 0.0848, + "num_input_tokens_seen": 10316064, + "step": 4765 + }, + { + "epoch": 0.7781402936378466, + "grad_norm": 0.28213027119636536, + "learning_rate": 3.889885807504078e-05, + "loss": 0.1271, + "num_input_tokens_seen": 10327328, + "step": 4770 + }, + { + "epoch": 0.7789559543230016, + "grad_norm": 1.6592588424682617, + "learning_rate": 3.893964110929854e-05, + "loss": 0.2611, + "num_input_tokens_seen": 10337088, + "step": 4775 + }, + { + "epoch": 0.7797716150081566, + "grad_norm": 1.4524939060211182, + "learning_rate": 3.8980424143556285e-05, + "loss": 0.2837, + "num_input_tokens_seen": 10347488, + "step": 4780 + }, + { + "epoch": 0.7805872756933115, + "grad_norm": 1.8038417100906372, + "learning_rate": 3.9021207177814026e-05, + "loss": 0.2948, + "num_input_tokens_seen": 10358656, + "step": 4785 + }, + { + "epoch": 0.7814029363784666, + "grad_norm": 0.8687059879302979, + "learning_rate": 3.906199021207178e-05, + "loss": 0.1719, + "num_input_tokens_seen": 10370816, + "step": 4790 + }, + { + "epoch": 0.7822185970636215, + "grad_norm": 0.5117902159690857, + "learning_rate": 3.910277324632953e-05, + "loss": 0.0759, + "num_input_tokens_seen": 10381856, + "step": 4795 + }, + { + "epoch": 0.7830342577487766, + "grad_norm": 1.738427758216858, + "learning_rate": 3.914355628058728e-05, + "loss": 0.2084, + "num_input_tokens_seen": 10392320, + "step": 4800 + }, + { + "epoch": 0.7838499184339315, + "grad_norm": 1.0825542211532593, + "learning_rate": 3.918433931484502e-05, + "loss": 0.2627, + "num_input_tokens_seen": 10401568, + "step": 4805 + }, + { + "epoch": 0.7846655791190864, + "grad_norm": 0.48563969135284424, + "learning_rate": 3.922512234910278e-05, + "loss": 0.1719, + "num_input_tokens_seen": 10413248, + "step": 4810 + }, + { + "epoch": 0.7854812398042414, + "grad_norm": 1.2269260883331299, + "learning_rate": 3.9265905383360525e-05, + "loss": 0.0768, + "num_input_tokens_seen": 10423168, + "step": 4815 + }, + { + "epoch": 0.7862969004893964, + "grad_norm": 0.3527114689350128, + "learning_rate": 3.930668841761827e-05, + "loss": 0.1562, + "num_input_tokens_seen": 10433792, + "step": 4820 + }, + { + "epoch": 0.7871125611745514, + "grad_norm": 1.4367517232894897, + "learning_rate": 3.934747145187602e-05, + "loss": 0.1495, + "num_input_tokens_seen": 10445120, + "step": 4825 + }, + { + "epoch": 0.7879282218597063, + "grad_norm": 0.7354501485824585, + "learning_rate": 3.938825448613377e-05, + "loss": 0.1943, + "num_input_tokens_seen": 10455584, + "step": 4830 + }, + { + "epoch": 0.7887438825448614, + "grad_norm": 0.07956749945878983, + "learning_rate": 3.942903752039152e-05, + "loss": 0.0885, + "num_input_tokens_seen": 10466752, + "step": 4835 + }, + { + "epoch": 0.7895595432300163, + "grad_norm": 1.2896878719329834, + "learning_rate": 3.946982055464927e-05, + "loss": 0.2006, + "num_input_tokens_seen": 10477376, + "step": 4840 + }, + { + "epoch": 0.7903752039151712, + "grad_norm": 2.072166919708252, + "learning_rate": 3.951060358890702e-05, + "loss": 0.1727, + "num_input_tokens_seen": 10487392, + "step": 4845 + }, + { + "epoch": 0.7911908646003263, + "grad_norm": 1.685652494430542, + "learning_rate": 3.9551386623164766e-05, + "loss": 0.329, + "num_input_tokens_seen": 10498208, + "step": 4850 + }, + { + "epoch": 0.7920065252854812, + "grad_norm": 0.7367632389068604, + "learning_rate": 3.959216965742251e-05, + "loss": 0.1194, + "num_input_tokens_seen": 10507200, + "step": 4855 + }, + { + "epoch": 0.7928221859706363, + "grad_norm": 0.2606205940246582, + "learning_rate": 3.963295269168026e-05, + "loss": 0.1665, + "num_input_tokens_seen": 10516416, + "step": 4860 + }, + { + "epoch": 0.7936378466557912, + "grad_norm": 1.37810480594635, + "learning_rate": 3.967373572593801e-05, + "loss": 0.2398, + "num_input_tokens_seen": 10527648, + "step": 4865 + }, + { + "epoch": 0.7944535073409462, + "grad_norm": 0.56053626537323, + "learning_rate": 3.971451876019576e-05, + "loss": 0.0479, + "num_input_tokens_seen": 10537952, + "step": 4870 + }, + { + "epoch": 0.7952691680261011, + "grad_norm": 1.7219877243041992, + "learning_rate": 3.975530179445351e-05, + "loss": 0.1383, + "num_input_tokens_seen": 10548320, + "step": 4875 + }, + { + "epoch": 0.7960848287112561, + "grad_norm": 0.7819518446922302, + "learning_rate": 3.979608482871126e-05, + "loss": 0.1256, + "num_input_tokens_seen": 10557824, + "step": 4880 + }, + { + "epoch": 0.7969004893964111, + "grad_norm": 0.21302464604377747, + "learning_rate": 3.9836867862969006e-05, + "loss": 0.0375, + "num_input_tokens_seen": 10568896, + "step": 4885 + }, + { + "epoch": 0.797716150081566, + "grad_norm": 1.3075305223464966, + "learning_rate": 3.9877650897226754e-05, + "loss": 0.3594, + "num_input_tokens_seen": 10580064, + "step": 4890 + }, + { + "epoch": 0.7985318107667211, + "grad_norm": 0.07132778316736221, + "learning_rate": 3.991843393148451e-05, + "loss": 0.165, + "num_input_tokens_seen": 10590976, + "step": 4895 + }, + { + "epoch": 0.799347471451876, + "grad_norm": 1.4289264678955078, + "learning_rate": 3.995921696574225e-05, + "loss": 0.1515, + "num_input_tokens_seen": 10600448, + "step": 4900 + }, + { + "epoch": 0.8001631321370309, + "grad_norm": 0.8216736316680908, + "learning_rate": 4e-05, + "loss": 0.1747, + "num_input_tokens_seen": 10611904, + "step": 4905 + }, + { + "epoch": 0.800978792822186, + "grad_norm": 1.2880563735961914, + "learning_rate": 4.004078303425775e-05, + "loss": 0.1272, + "num_input_tokens_seen": 10622240, + "step": 4910 + }, + { + "epoch": 0.8017944535073409, + "grad_norm": 3.0397720336914062, + "learning_rate": 4.00815660685155e-05, + "loss": 0.2796, + "num_input_tokens_seen": 10632864, + "step": 4915 + }, + { + "epoch": 0.802610114192496, + "grad_norm": 0.2946607172489166, + "learning_rate": 4.0122349102773246e-05, + "loss": 0.0773, + "num_input_tokens_seen": 10642464, + "step": 4920 + }, + { + "epoch": 0.8034257748776509, + "grad_norm": 0.7557851076126099, + "learning_rate": 4.0163132137030994e-05, + "loss": 0.172, + "num_input_tokens_seen": 10653024, + "step": 4925 + }, + { + "epoch": 0.8042414355628059, + "grad_norm": 0.37043091654777527, + "learning_rate": 4.020391517128875e-05, + "loss": 0.1451, + "num_input_tokens_seen": 10664384, + "step": 4930 + }, + { + "epoch": 0.8050570962479608, + "grad_norm": 2.1550283432006836, + "learning_rate": 4.0244698205546496e-05, + "loss": 0.3184, + "num_input_tokens_seen": 10675360, + "step": 4935 + }, + { + "epoch": 0.8058727569331158, + "grad_norm": 0.9118832945823669, + "learning_rate": 4.0285481239804244e-05, + "loss": 0.1691, + "num_input_tokens_seen": 10685824, + "step": 4940 + }, + { + "epoch": 0.8066884176182708, + "grad_norm": 1.750726580619812, + "learning_rate": 4.032626427406199e-05, + "loss": 0.2137, + "num_input_tokens_seen": 10697536, + "step": 4945 + }, + { + "epoch": 0.8075040783034257, + "grad_norm": 0.39037588238716125, + "learning_rate": 4.036704730831974e-05, + "loss": 0.0892, + "num_input_tokens_seen": 10709888, + "step": 4950 + }, + { + "epoch": 0.8083197389885808, + "grad_norm": 0.06759205460548401, + "learning_rate": 4.040783034257749e-05, + "loss": 0.0831, + "num_input_tokens_seen": 10721216, + "step": 4955 + }, + { + "epoch": 0.8091353996737357, + "grad_norm": 0.24060052633285522, + "learning_rate": 4.0448613376835234e-05, + "loss": 0.0988, + "num_input_tokens_seen": 10729952, + "step": 4960 + }, + { + "epoch": 0.8099510603588908, + "grad_norm": 1.1551861763000488, + "learning_rate": 4.048939641109299e-05, + "loss": 0.1192, + "num_input_tokens_seen": 10741216, + "step": 4965 + }, + { + "epoch": 0.8107667210440457, + "grad_norm": 0.18729320168495178, + "learning_rate": 4.0530179445350736e-05, + "loss": 0.1721, + "num_input_tokens_seen": 10752544, + "step": 4970 + }, + { + "epoch": 0.8115823817292006, + "grad_norm": 0.21509476006031036, + "learning_rate": 4.057096247960849e-05, + "loss": 0.2014, + "num_input_tokens_seen": 10764960, + "step": 4975 + }, + { + "epoch": 0.8123980424143556, + "grad_norm": 1.0672246217727661, + "learning_rate": 4.061174551386623e-05, + "loss": 0.0837, + "num_input_tokens_seen": 10775296, + "step": 4980 + }, + { + "epoch": 0.8132137030995106, + "grad_norm": 0.8526793122291565, + "learning_rate": 4.065252854812398e-05, + "loss": 0.1499, + "num_input_tokens_seen": 10786112, + "step": 4985 + }, + { + "epoch": 0.8140293637846656, + "grad_norm": 0.4695553481578827, + "learning_rate": 4.0693311582381734e-05, + "loss": 0.1442, + "num_input_tokens_seen": 10797152, + "step": 4990 + }, + { + "epoch": 0.8148450244698205, + "grad_norm": 1.9879764318466187, + "learning_rate": 4.073409461663948e-05, + "loss": 0.1837, + "num_input_tokens_seen": 10808480, + "step": 4995 + }, + { + "epoch": 0.8156606851549756, + "grad_norm": 1.4153037071228027, + "learning_rate": 4.077487765089723e-05, + "loss": 0.0652, + "num_input_tokens_seen": 10819424, + "step": 5000 + }, + { + "epoch": 0.8164763458401305, + "grad_norm": 0.5849806666374207, + "learning_rate": 4.0815660685154977e-05, + "loss": 0.1465, + "num_input_tokens_seen": 10830048, + "step": 5005 + }, + { + "epoch": 0.8172920065252854, + "grad_norm": 0.5245445370674133, + "learning_rate": 4.0856443719412724e-05, + "loss": 0.206, + "num_input_tokens_seen": 10840768, + "step": 5010 + }, + { + "epoch": 0.8181076672104405, + "grad_norm": 0.12575313448905945, + "learning_rate": 4.089722675367048e-05, + "loss": 0.19, + "num_input_tokens_seen": 10849792, + "step": 5015 + }, + { + "epoch": 0.8189233278955954, + "grad_norm": 0.1241908073425293, + "learning_rate": 4.093800978792822e-05, + "loss": 0.0407, + "num_input_tokens_seen": 10861472, + "step": 5020 + }, + { + "epoch": 0.8197389885807504, + "grad_norm": 1.677001953125, + "learning_rate": 4.0978792822185974e-05, + "loss": 0.1386, + "num_input_tokens_seen": 10871968, + "step": 5025 + }, + { + "epoch": 0.8205546492659054, + "grad_norm": 1.0621230602264404, + "learning_rate": 4.101957585644372e-05, + "loss": 0.2058, + "num_input_tokens_seen": 10882560, + "step": 5030 + }, + { + "epoch": 0.8213703099510603, + "grad_norm": 1.5921025276184082, + "learning_rate": 4.106035889070147e-05, + "loss": 0.178, + "num_input_tokens_seen": 10894048, + "step": 5035 + }, + { + "epoch": 0.8221859706362153, + "grad_norm": 1.776806116104126, + "learning_rate": 4.110114192495922e-05, + "loss": 0.2245, + "num_input_tokens_seen": 10905408, + "step": 5040 + }, + { + "epoch": 0.8230016313213703, + "grad_norm": 1.5049265623092651, + "learning_rate": 4.1141924959216964e-05, + "loss": 0.1237, + "num_input_tokens_seen": 10915648, + "step": 5045 + }, + { + "epoch": 0.8238172920065253, + "grad_norm": 1.1862596273422241, + "learning_rate": 4.118270799347472e-05, + "loss": 0.1369, + "num_input_tokens_seen": 10926880, + "step": 5050 + }, + { + "epoch": 0.8246329526916802, + "grad_norm": 0.80622398853302, + "learning_rate": 4.1223491027732467e-05, + "loss": 0.2345, + "num_input_tokens_seen": 10938432, + "step": 5055 + }, + { + "epoch": 0.8254486133768353, + "grad_norm": 0.7072303891181946, + "learning_rate": 4.1264274061990214e-05, + "loss": 0.2554, + "num_input_tokens_seen": 10948992, + "step": 5060 + }, + { + "epoch": 0.8262642740619902, + "grad_norm": 3.396085739135742, + "learning_rate": 4.130505709624796e-05, + "loss": 0.274, + "num_input_tokens_seen": 10958272, + "step": 5065 + }, + { + "epoch": 0.8270799347471451, + "grad_norm": 0.9278431534767151, + "learning_rate": 4.1345840130505716e-05, + "loss": 0.1213, + "num_input_tokens_seen": 10968512, + "step": 5070 + }, + { + "epoch": 0.8278955954323002, + "grad_norm": 0.3846895396709442, + "learning_rate": 4.1386623164763464e-05, + "loss": 0.1198, + "num_input_tokens_seen": 10979072, + "step": 5075 + }, + { + "epoch": 0.8287112561174551, + "grad_norm": 1.023067593574524, + "learning_rate": 4.1427406199021205e-05, + "loss": 0.1355, + "num_input_tokens_seen": 10988736, + "step": 5080 + }, + { + "epoch": 0.8295269168026101, + "grad_norm": 1.1759504079818726, + "learning_rate": 4.146818923327896e-05, + "loss": 0.1603, + "num_input_tokens_seen": 11000512, + "step": 5085 + }, + { + "epoch": 0.8303425774877651, + "grad_norm": 1.1155422925949097, + "learning_rate": 4.150897226753671e-05, + "loss": 0.2278, + "num_input_tokens_seen": 11010240, + "step": 5090 + }, + { + "epoch": 0.8311582381729201, + "grad_norm": 0.1764405220746994, + "learning_rate": 4.1549755301794454e-05, + "loss": 0.2163, + "num_input_tokens_seen": 11020672, + "step": 5095 + }, + { + "epoch": 0.831973898858075, + "grad_norm": 0.24654601514339447, + "learning_rate": 4.15905383360522e-05, + "loss": 0.1056, + "num_input_tokens_seen": 11031808, + "step": 5100 + }, + { + "epoch": 0.83278955954323, + "grad_norm": 1.0771883726119995, + "learning_rate": 4.1631321370309957e-05, + "loss": 0.148, + "num_input_tokens_seen": 11042144, + "step": 5105 + }, + { + "epoch": 0.833605220228385, + "grad_norm": 0.22585289180278778, + "learning_rate": 4.1672104404567704e-05, + "loss": 0.1057, + "num_input_tokens_seen": 11053728, + "step": 5110 + }, + { + "epoch": 0.8344208809135399, + "grad_norm": 0.7839706540107727, + "learning_rate": 4.171288743882545e-05, + "loss": 0.2071, + "num_input_tokens_seen": 11064032, + "step": 5115 + }, + { + "epoch": 0.835236541598695, + "grad_norm": 1.941272497177124, + "learning_rate": 4.17536704730832e-05, + "loss": 0.2429, + "num_input_tokens_seen": 11073792, + "step": 5120 + }, + { + "epoch": 0.8360522022838499, + "grad_norm": 0.32143521308898926, + "learning_rate": 4.179445350734095e-05, + "loss": 0.1, + "num_input_tokens_seen": 11085216, + "step": 5125 + }, + { + "epoch": 0.8368678629690048, + "grad_norm": 0.44522032141685486, + "learning_rate": 4.18352365415987e-05, + "loss": 0.2765, + "num_input_tokens_seen": 11096288, + "step": 5130 + }, + { + "epoch": 0.8376835236541599, + "grad_norm": 0.37198135256767273, + "learning_rate": 4.187601957585644e-05, + "loss": 0.1328, + "num_input_tokens_seen": 11107584, + "step": 5135 + }, + { + "epoch": 0.8384991843393148, + "grad_norm": 0.5887154340744019, + "learning_rate": 4.191680261011419e-05, + "loss": 0.0831, + "num_input_tokens_seen": 11118752, + "step": 5140 + }, + { + "epoch": 0.8393148450244698, + "grad_norm": 1.0385169982910156, + "learning_rate": 4.1957585644371944e-05, + "loss": 0.1236, + "num_input_tokens_seen": 11127840, + "step": 5145 + }, + { + "epoch": 0.8401305057096248, + "grad_norm": 0.05005195364356041, + "learning_rate": 4.199836867862969e-05, + "loss": 0.1596, + "num_input_tokens_seen": 11137952, + "step": 5150 + }, + { + "epoch": 0.8409461663947798, + "grad_norm": 0.21136224269866943, + "learning_rate": 4.203915171288744e-05, + "loss": 0.1626, + "num_input_tokens_seen": 11147008, + "step": 5155 + }, + { + "epoch": 0.8417618270799347, + "grad_norm": 1.8165090084075928, + "learning_rate": 4.207993474714519e-05, + "loss": 0.1093, + "num_input_tokens_seen": 11158912, + "step": 5160 + }, + { + "epoch": 0.8425774877650897, + "grad_norm": 0.837884783744812, + "learning_rate": 4.212071778140294e-05, + "loss": 0.0955, + "num_input_tokens_seen": 11169440, + "step": 5165 + }, + { + "epoch": 0.8433931484502447, + "grad_norm": 0.4252052903175354, + "learning_rate": 4.216150081566069e-05, + "loss": 0.1811, + "num_input_tokens_seen": 11179488, + "step": 5170 + }, + { + "epoch": 0.8442088091353996, + "grad_norm": 0.9325119256973267, + "learning_rate": 4.220228384991843e-05, + "loss": 0.2256, + "num_input_tokens_seen": 11190912, + "step": 5175 + }, + { + "epoch": 0.8450244698205547, + "grad_norm": 0.7317050695419312, + "learning_rate": 4.2243066884176185e-05, + "loss": 0.1869, + "num_input_tokens_seen": 11202208, + "step": 5180 + }, + { + "epoch": 0.8458401305057096, + "grad_norm": 1.7350088357925415, + "learning_rate": 4.228384991843393e-05, + "loss": 0.3059, + "num_input_tokens_seen": 11212448, + "step": 5185 + }, + { + "epoch": 0.8466557911908646, + "grad_norm": 0.6131577491760254, + "learning_rate": 4.232463295269169e-05, + "loss": 0.1622, + "num_input_tokens_seen": 11223648, + "step": 5190 + }, + { + "epoch": 0.8474714518760196, + "grad_norm": 0.14720016717910767, + "learning_rate": 4.236541598694943e-05, + "loss": 0.2356, + "num_input_tokens_seen": 11234880, + "step": 5195 + }, + { + "epoch": 0.8482871125611745, + "grad_norm": 0.25731566548347473, + "learning_rate": 4.240619902120718e-05, + "loss": 0.1095, + "num_input_tokens_seen": 11245184, + "step": 5200 + }, + { + "epoch": 0.8491027732463295, + "grad_norm": 1.2384376525878906, + "learning_rate": 4.244698205546493e-05, + "loss": 0.1025, + "num_input_tokens_seen": 11256288, + "step": 5205 + }, + { + "epoch": 0.8499184339314845, + "grad_norm": 0.6569077372550964, + "learning_rate": 4.248776508972268e-05, + "loss": 0.158, + "num_input_tokens_seen": 11265728, + "step": 5210 + }, + { + "epoch": 0.8507340946166395, + "grad_norm": 0.17277978360652924, + "learning_rate": 4.2528548123980425e-05, + "loss": 0.113, + "num_input_tokens_seen": 11275712, + "step": 5215 + }, + { + "epoch": 0.8515497553017944, + "grad_norm": 0.697318971157074, + "learning_rate": 4.256933115823817e-05, + "loss": 0.1213, + "num_input_tokens_seen": 11287776, + "step": 5220 + }, + { + "epoch": 0.8523654159869495, + "grad_norm": 1.730143427848816, + "learning_rate": 4.261011419249593e-05, + "loss": 0.18, + "num_input_tokens_seen": 11298528, + "step": 5225 + }, + { + "epoch": 0.8531810766721044, + "grad_norm": 0.7245425581932068, + "learning_rate": 4.2650897226753675e-05, + "loss": 0.2551, + "num_input_tokens_seen": 11309536, + "step": 5230 + }, + { + "epoch": 0.8539967373572593, + "grad_norm": 0.6406300067901611, + "learning_rate": 4.2691680261011416e-05, + "loss": 0.3078, + "num_input_tokens_seen": 11319200, + "step": 5235 + }, + { + "epoch": 0.8548123980424144, + "grad_norm": 0.18651285767555237, + "learning_rate": 4.273246329526917e-05, + "loss": 0.1082, + "num_input_tokens_seen": 11329120, + "step": 5240 + }, + { + "epoch": 0.8556280587275693, + "grad_norm": 0.11495926976203918, + "learning_rate": 4.277324632952692e-05, + "loss": 0.1453, + "num_input_tokens_seen": 11339360, + "step": 5245 + }, + { + "epoch": 0.8564437194127243, + "grad_norm": 2.393460273742676, + "learning_rate": 4.281402936378467e-05, + "loss": 0.2379, + "num_input_tokens_seen": 11350848, + "step": 5250 + }, + { + "epoch": 0.8572593800978793, + "grad_norm": 0.3224658668041229, + "learning_rate": 4.285481239804241e-05, + "loss": 0.147, + "num_input_tokens_seen": 11360704, + "step": 5255 + }, + { + "epoch": 0.8580750407830342, + "grad_norm": 1.0622302293777466, + "learning_rate": 4.289559543230017e-05, + "loss": 0.0937, + "num_input_tokens_seen": 11370816, + "step": 5260 + }, + { + "epoch": 0.8588907014681892, + "grad_norm": 1.4305747747421265, + "learning_rate": 4.2936378466557915e-05, + "loss": 0.1332, + "num_input_tokens_seen": 11382080, + "step": 5265 + }, + { + "epoch": 0.8597063621533442, + "grad_norm": 0.24062344431877136, + "learning_rate": 4.297716150081566e-05, + "loss": 0.267, + "num_input_tokens_seen": 11392736, + "step": 5270 + }, + { + "epoch": 0.8605220228384992, + "grad_norm": 0.07406114786863327, + "learning_rate": 4.301794453507341e-05, + "loss": 0.1649, + "num_input_tokens_seen": 11402720, + "step": 5275 + }, + { + "epoch": 0.8613376835236541, + "grad_norm": 0.5666096806526184, + "learning_rate": 4.305872756933116e-05, + "loss": 0.0589, + "num_input_tokens_seen": 11413984, + "step": 5280 + }, + { + "epoch": 0.8621533442088092, + "grad_norm": 1.814202070236206, + "learning_rate": 4.309951060358891e-05, + "loss": 0.2191, + "num_input_tokens_seen": 11424832, + "step": 5285 + }, + { + "epoch": 0.8629690048939641, + "grad_norm": 0.7600120306015015, + "learning_rate": 4.314029363784666e-05, + "loss": 0.0772, + "num_input_tokens_seen": 11436128, + "step": 5290 + }, + { + "epoch": 0.863784665579119, + "grad_norm": 0.039884574711322784, + "learning_rate": 4.318107667210441e-05, + "loss": 0.2, + "num_input_tokens_seen": 11448448, + "step": 5295 + }, + { + "epoch": 0.8646003262642741, + "grad_norm": 0.5914639830589294, + "learning_rate": 4.3221859706362155e-05, + "loss": 0.1376, + "num_input_tokens_seen": 11458176, + "step": 5300 + }, + { + "epoch": 0.865415986949429, + "grad_norm": 1.1345813274383545, + "learning_rate": 4.32626427406199e-05, + "loss": 0.181, + "num_input_tokens_seen": 11468480, + "step": 5305 + }, + { + "epoch": 0.866231647634584, + "grad_norm": 1.281274437904358, + "learning_rate": 4.330342577487765e-05, + "loss": 0.2074, + "num_input_tokens_seen": 11478880, + "step": 5310 + }, + { + "epoch": 0.867047308319739, + "grad_norm": 1.0489345788955688, + "learning_rate": 4.33442088091354e-05, + "loss": 0.1851, + "num_input_tokens_seen": 11490176, + "step": 5315 + }, + { + "epoch": 0.867862969004894, + "grad_norm": 0.5587987899780273, + "learning_rate": 4.338499184339315e-05, + "loss": 0.1632, + "num_input_tokens_seen": 11500992, + "step": 5320 + }, + { + "epoch": 0.8686786296900489, + "grad_norm": 0.36505672335624695, + "learning_rate": 4.34257748776509e-05, + "loss": 0.0492, + "num_input_tokens_seen": 11512288, + "step": 5325 + }, + { + "epoch": 0.8694942903752039, + "grad_norm": 2.027203321456909, + "learning_rate": 4.346655791190865e-05, + "loss": 0.2303, + "num_input_tokens_seen": 11521664, + "step": 5330 + }, + { + "epoch": 0.8703099510603589, + "grad_norm": 0.7583109140396118, + "learning_rate": 4.3507340946166396e-05, + "loss": 0.0481, + "num_input_tokens_seen": 11532608, + "step": 5335 + }, + { + "epoch": 0.8711256117455138, + "grad_norm": 2.0775530338287354, + "learning_rate": 4.354812398042414e-05, + "loss": 0.2386, + "num_input_tokens_seen": 11543072, + "step": 5340 + }, + { + "epoch": 0.8719412724306689, + "grad_norm": 0.7719337940216064, + "learning_rate": 4.35889070146819e-05, + "loss": 0.0884, + "num_input_tokens_seen": 11554144, + "step": 5345 + }, + { + "epoch": 0.8727569331158238, + "grad_norm": 0.2318546026945114, + "learning_rate": 4.3629690048939645e-05, + "loss": 0.0907, + "num_input_tokens_seen": 11565376, + "step": 5350 + }, + { + "epoch": 0.8735725938009788, + "grad_norm": 1.7471778392791748, + "learning_rate": 4.367047308319739e-05, + "loss": 0.2911, + "num_input_tokens_seen": 11576128, + "step": 5355 + }, + { + "epoch": 0.8743882544861338, + "grad_norm": 0.11528843641281128, + "learning_rate": 4.371125611745514e-05, + "loss": 0.0963, + "num_input_tokens_seen": 11588000, + "step": 5360 + }, + { + "epoch": 0.8752039151712887, + "grad_norm": 0.4404895007610321, + "learning_rate": 4.375203915171289e-05, + "loss": 0.09, + "num_input_tokens_seen": 11599520, + "step": 5365 + }, + { + "epoch": 0.8760195758564437, + "grad_norm": 0.6694369316101074, + "learning_rate": 4.3792822185970636e-05, + "loss": 0.1199, + "num_input_tokens_seen": 11611168, + "step": 5370 + }, + { + "epoch": 0.8768352365415987, + "grad_norm": 0.35590821504592896, + "learning_rate": 4.3833605220228384e-05, + "loss": 0.0738, + "num_input_tokens_seen": 11621632, + "step": 5375 + }, + { + "epoch": 0.8776508972267537, + "grad_norm": 2.2024123668670654, + "learning_rate": 4.387438825448614e-05, + "loss": 0.3895, + "num_input_tokens_seen": 11632160, + "step": 5380 + }, + { + "epoch": 0.8784665579119086, + "grad_norm": 0.2284500002861023, + "learning_rate": 4.3915171288743886e-05, + "loss": 0.1732, + "num_input_tokens_seen": 11641344, + "step": 5385 + }, + { + "epoch": 0.8792822185970636, + "grad_norm": 0.24190641939640045, + "learning_rate": 4.395595432300163e-05, + "loss": 0.1387, + "num_input_tokens_seen": 11652128, + "step": 5390 + }, + { + "epoch": 0.8800978792822186, + "grad_norm": 0.1385887861251831, + "learning_rate": 4.399673735725938e-05, + "loss": 0.0886, + "num_input_tokens_seen": 11662528, + "step": 5395 + }, + { + "epoch": 0.8809135399673735, + "grad_norm": 0.2485819011926651, + "learning_rate": 4.403752039151713e-05, + "loss": 0.1541, + "num_input_tokens_seen": 11672544, + "step": 5400 + }, + { + "epoch": 0.8817292006525286, + "grad_norm": 1.0426441431045532, + "learning_rate": 4.407830342577488e-05, + "loss": 0.1451, + "num_input_tokens_seen": 11683552, + "step": 5405 + }, + { + "epoch": 0.8825448613376835, + "grad_norm": 1.1441318988800049, + "learning_rate": 4.4119086460032624e-05, + "loss": 0.1314, + "num_input_tokens_seen": 11693408, + "step": 5410 + }, + { + "epoch": 0.8833605220228385, + "grad_norm": 0.4553411900997162, + "learning_rate": 4.415986949429038e-05, + "loss": 0.139, + "num_input_tokens_seen": 11704160, + "step": 5415 + }, + { + "epoch": 0.8841761827079935, + "grad_norm": 0.2870623469352722, + "learning_rate": 4.4200652528548126e-05, + "loss": 0.1321, + "num_input_tokens_seen": 11714752, + "step": 5420 + }, + { + "epoch": 0.8849918433931484, + "grad_norm": 0.17791546881198883, + "learning_rate": 4.424143556280588e-05, + "loss": 0.0651, + "num_input_tokens_seen": 11725408, + "step": 5425 + }, + { + "epoch": 0.8858075040783034, + "grad_norm": 0.6911072134971619, + "learning_rate": 4.428221859706362e-05, + "loss": 0.1687, + "num_input_tokens_seen": 11736640, + "step": 5430 + }, + { + "epoch": 0.8866231647634584, + "grad_norm": 1.1713777780532837, + "learning_rate": 4.432300163132137e-05, + "loss": 0.1402, + "num_input_tokens_seen": 11747008, + "step": 5435 + }, + { + "epoch": 0.8874388254486134, + "grad_norm": 1.284684181213379, + "learning_rate": 4.436378466557912e-05, + "loss": 0.128, + "num_input_tokens_seen": 11757728, + "step": 5440 + }, + { + "epoch": 0.8882544861337683, + "grad_norm": 0.5087106227874756, + "learning_rate": 4.440456769983687e-05, + "loss": 0.1775, + "num_input_tokens_seen": 11768704, + "step": 5445 + }, + { + "epoch": 0.8890701468189234, + "grad_norm": 3.5427370071411133, + "learning_rate": 4.444535073409462e-05, + "loss": 0.2346, + "num_input_tokens_seen": 11780288, + "step": 5450 + }, + { + "epoch": 0.8898858075040783, + "grad_norm": 2.012118339538574, + "learning_rate": 4.4486133768352366e-05, + "loss": 0.3374, + "num_input_tokens_seen": 11790080, + "step": 5455 + }, + { + "epoch": 0.8907014681892332, + "grad_norm": 1.3392176628112793, + "learning_rate": 4.4526916802610114e-05, + "loss": 0.3619, + "num_input_tokens_seen": 11801952, + "step": 5460 + }, + { + "epoch": 0.8915171288743883, + "grad_norm": 1.2538071870803833, + "learning_rate": 4.456769983686787e-05, + "loss": 0.103, + "num_input_tokens_seen": 11811520, + "step": 5465 + }, + { + "epoch": 0.8923327895595432, + "grad_norm": 0.46419066190719604, + "learning_rate": 4.460848287112561e-05, + "loss": 0.2166, + "num_input_tokens_seen": 11822240, + "step": 5470 + }, + { + "epoch": 0.8931484502446982, + "grad_norm": 2.079055070877075, + "learning_rate": 4.4649265905383364e-05, + "loss": 0.2427, + "num_input_tokens_seen": 11833632, + "step": 5475 + }, + { + "epoch": 0.8939641109298532, + "grad_norm": 0.03738432750105858, + "learning_rate": 4.469004893964111e-05, + "loss": 0.0696, + "num_input_tokens_seen": 11844640, + "step": 5480 + }, + { + "epoch": 0.8947797716150081, + "grad_norm": 1.4639662504196167, + "learning_rate": 4.4730831973898866e-05, + "loss": 0.1692, + "num_input_tokens_seen": 11855776, + "step": 5485 + }, + { + "epoch": 0.8955954323001631, + "grad_norm": 0.6367776989936829, + "learning_rate": 4.4771615008156607e-05, + "loss": 0.3454, + "num_input_tokens_seen": 11865856, + "step": 5490 + }, + { + "epoch": 0.8964110929853181, + "grad_norm": 0.8310841917991638, + "learning_rate": 4.4812398042414354e-05, + "loss": 0.1215, + "num_input_tokens_seen": 11877504, + "step": 5495 + }, + { + "epoch": 0.8972267536704731, + "grad_norm": 0.17663313448429108, + "learning_rate": 4.485318107667211e-05, + "loss": 0.2363, + "num_input_tokens_seen": 11888320, + "step": 5500 + }, + { + "epoch": 0.898042414355628, + "grad_norm": 0.2376878559589386, + "learning_rate": 4.4893964110929856e-05, + "loss": 0.1693, + "num_input_tokens_seen": 11899456, + "step": 5505 + }, + { + "epoch": 0.8988580750407831, + "grad_norm": 1.4082831144332886, + "learning_rate": 4.4934747145187604e-05, + "loss": 0.1686, + "num_input_tokens_seen": 11910240, + "step": 5510 + }, + { + "epoch": 0.899673735725938, + "grad_norm": 1.111380934715271, + "learning_rate": 4.497553017944535e-05, + "loss": 0.0734, + "num_input_tokens_seen": 11920512, + "step": 5515 + }, + { + "epoch": 0.9004893964110929, + "grad_norm": 0.21907494962215424, + "learning_rate": 4.5016313213703106e-05, + "loss": 0.2208, + "num_input_tokens_seen": 11930592, + "step": 5520 + }, + { + "epoch": 0.901305057096248, + "grad_norm": 0.5766258835792542, + "learning_rate": 4.5057096247960854e-05, + "loss": 0.3175, + "num_input_tokens_seen": 11941088, + "step": 5525 + }, + { + "epoch": 0.9021207177814029, + "grad_norm": 0.47398489713668823, + "learning_rate": 4.5097879282218594e-05, + "loss": 0.1328, + "num_input_tokens_seen": 11952320, + "step": 5530 + }, + { + "epoch": 0.9029363784665579, + "grad_norm": 0.47731080651283264, + "learning_rate": 4.513866231647635e-05, + "loss": 0.1759, + "num_input_tokens_seen": 11962880, + "step": 5535 + }, + { + "epoch": 0.9037520391517129, + "grad_norm": 0.8351296186447144, + "learning_rate": 4.5179445350734097e-05, + "loss": 0.0753, + "num_input_tokens_seen": 11973472, + "step": 5540 + }, + { + "epoch": 0.9045676998368679, + "grad_norm": 1.259040355682373, + "learning_rate": 4.5220228384991844e-05, + "loss": 0.3654, + "num_input_tokens_seen": 11984096, + "step": 5545 + }, + { + "epoch": 0.9053833605220228, + "grad_norm": 0.8817420601844788, + "learning_rate": 4.526101141924959e-05, + "loss": 0.1056, + "num_input_tokens_seen": 11995488, + "step": 5550 + }, + { + "epoch": 0.9061990212071778, + "grad_norm": 2.741581678390503, + "learning_rate": 4.5301794453507346e-05, + "loss": 0.1359, + "num_input_tokens_seen": 12007104, + "step": 5555 + }, + { + "epoch": 0.9070146818923328, + "grad_norm": 0.9292651414871216, + "learning_rate": 4.5342577487765094e-05, + "loss": 0.2138, + "num_input_tokens_seen": 12018688, + "step": 5560 + }, + { + "epoch": 0.9078303425774877, + "grad_norm": 0.2391597181558609, + "learning_rate": 4.538336052202284e-05, + "loss": 0.0773, + "num_input_tokens_seen": 12028096, + "step": 5565 + }, + { + "epoch": 0.9086460032626428, + "grad_norm": 1.3455731868743896, + "learning_rate": 4.542414355628059e-05, + "loss": 0.1585, + "num_input_tokens_seen": 12038976, + "step": 5570 + }, + { + "epoch": 0.9094616639477977, + "grad_norm": 0.8469296097755432, + "learning_rate": 4.546492659053834e-05, + "loss": 0.2196, + "num_input_tokens_seen": 12050816, + "step": 5575 + }, + { + "epoch": 0.9102773246329527, + "grad_norm": 0.13299940526485443, + "learning_rate": 4.550570962479609e-05, + "loss": 0.1925, + "num_input_tokens_seen": 12062336, + "step": 5580 + }, + { + "epoch": 0.9110929853181077, + "grad_norm": 2.074770450592041, + "learning_rate": 4.554649265905383e-05, + "loss": 0.1843, + "num_input_tokens_seen": 12072096, + "step": 5585 + }, + { + "epoch": 0.9119086460032626, + "grad_norm": 0.3727168142795563, + "learning_rate": 4.558727569331158e-05, + "loss": 0.0873, + "num_input_tokens_seen": 12083136, + "step": 5590 + }, + { + "epoch": 0.9127243066884176, + "grad_norm": 0.5380368828773499, + "learning_rate": 4.5628058727569334e-05, + "loss": 0.2181, + "num_input_tokens_seen": 12094272, + "step": 5595 + }, + { + "epoch": 0.9135399673735726, + "grad_norm": 0.2822546660900116, + "learning_rate": 4.566884176182708e-05, + "loss": 0.2264, + "num_input_tokens_seen": 12105280, + "step": 5600 + }, + { + "epoch": 0.9143556280587276, + "grad_norm": 0.6293612718582153, + "learning_rate": 4.570962479608483e-05, + "loss": 0.2106, + "num_input_tokens_seen": 12115968, + "step": 5605 + }, + { + "epoch": 0.9151712887438825, + "grad_norm": 1.490959882736206, + "learning_rate": 4.575040783034258e-05, + "loss": 0.1572, + "num_input_tokens_seen": 12127136, + "step": 5610 + }, + { + "epoch": 0.9159869494290375, + "grad_norm": 2.4018568992614746, + "learning_rate": 4.579119086460033e-05, + "loss": 0.1311, + "num_input_tokens_seen": 12136704, + "step": 5615 + }, + { + "epoch": 0.9168026101141925, + "grad_norm": 0.21975982189178467, + "learning_rate": 4.583197389885808e-05, + "loss": 0.0696, + "num_input_tokens_seen": 12146784, + "step": 5620 + }, + { + "epoch": 0.9176182707993474, + "grad_norm": 0.9235070943832397, + "learning_rate": 4.587275693311583e-05, + "loss": 0.08, + "num_input_tokens_seen": 12157248, + "step": 5625 + }, + { + "epoch": 0.9184339314845025, + "grad_norm": 0.9278425574302673, + "learning_rate": 4.5913539967373574e-05, + "loss": 0.1844, + "num_input_tokens_seen": 12168256, + "step": 5630 + }, + { + "epoch": 0.9192495921696574, + "grad_norm": 1.2807331085205078, + "learning_rate": 4.595432300163132e-05, + "loss": 0.128, + "num_input_tokens_seen": 12177760, + "step": 5635 + }, + { + "epoch": 0.9200652528548124, + "grad_norm": 0.6914818286895752, + "learning_rate": 4.5995106035889077e-05, + "loss": 0.0844, + "num_input_tokens_seen": 12188192, + "step": 5640 + }, + { + "epoch": 0.9208809135399674, + "grad_norm": 1.558292031288147, + "learning_rate": 4.603588907014682e-05, + "loss": 0.2201, + "num_input_tokens_seen": 12198880, + "step": 5645 + }, + { + "epoch": 0.9216965742251223, + "grad_norm": 1.367709994316101, + "learning_rate": 4.607667210440457e-05, + "loss": 0.1987, + "num_input_tokens_seen": 12208800, + "step": 5650 + }, + { + "epoch": 0.9225122349102773, + "grad_norm": 0.4747437834739685, + "learning_rate": 4.611745513866232e-05, + "loss": 0.1817, + "num_input_tokens_seen": 12219360, + "step": 5655 + }, + { + "epoch": 0.9233278955954323, + "grad_norm": 1.9532448053359985, + "learning_rate": 4.615823817292007e-05, + "loss": 0.1327, + "num_input_tokens_seen": 12229376, + "step": 5660 + }, + { + "epoch": 0.9241435562805873, + "grad_norm": 1.470059871673584, + "learning_rate": 4.6199021207177815e-05, + "loss": 0.2263, + "num_input_tokens_seen": 12241152, + "step": 5665 + }, + { + "epoch": 0.9249592169657422, + "grad_norm": 0.4245757460594177, + "learning_rate": 4.623980424143556e-05, + "loss": 0.2578, + "num_input_tokens_seen": 12252192, + "step": 5670 + }, + { + "epoch": 0.9257748776508973, + "grad_norm": 0.7320389747619629, + "learning_rate": 4.628058727569332e-05, + "loss": 0.2001, + "num_input_tokens_seen": 12262848, + "step": 5675 + }, + { + "epoch": 0.9265905383360522, + "grad_norm": 1.2751731872558594, + "learning_rate": 4.6321370309951064e-05, + "loss": 0.2186, + "num_input_tokens_seen": 12273728, + "step": 5680 + }, + { + "epoch": 0.9274061990212071, + "grad_norm": 0.6518067121505737, + "learning_rate": 4.636215334420881e-05, + "loss": 0.1263, + "num_input_tokens_seen": 12284512, + "step": 5685 + }, + { + "epoch": 0.9282218597063622, + "grad_norm": 1.670360803604126, + "learning_rate": 4.640293637846656e-05, + "loss": 0.1982, + "num_input_tokens_seen": 12295968, + "step": 5690 + }, + { + "epoch": 0.9290375203915171, + "grad_norm": 0.5132317543029785, + "learning_rate": 4.644371941272431e-05, + "loss": 0.1443, + "num_input_tokens_seen": 12307616, + "step": 5695 + }, + { + "epoch": 0.9298531810766721, + "grad_norm": 0.8424690365791321, + "learning_rate": 4.648450244698206e-05, + "loss": 0.1441, + "num_input_tokens_seen": 12318400, + "step": 5700 + }, + { + "epoch": 0.9306688417618271, + "grad_norm": 0.5365008115768433, + "learning_rate": 4.65252854812398e-05, + "loss": 0.1231, + "num_input_tokens_seen": 12329472, + "step": 5705 + }, + { + "epoch": 0.9314845024469821, + "grad_norm": 0.7158522605895996, + "learning_rate": 4.656606851549756e-05, + "loss": 0.1002, + "num_input_tokens_seen": 12340032, + "step": 5710 + }, + { + "epoch": 0.932300163132137, + "grad_norm": 0.6860803365707397, + "learning_rate": 4.6606851549755305e-05, + "loss": 0.0833, + "num_input_tokens_seen": 12351584, + "step": 5715 + }, + { + "epoch": 0.933115823817292, + "grad_norm": 1.2812353372573853, + "learning_rate": 4.664763458401305e-05, + "loss": 0.3253, + "num_input_tokens_seen": 12362752, + "step": 5720 + }, + { + "epoch": 0.933931484502447, + "grad_norm": 0.7990608811378479, + "learning_rate": 4.66884176182708e-05, + "loss": 0.2509, + "num_input_tokens_seen": 12374720, + "step": 5725 + }, + { + "epoch": 0.9347471451876019, + "grad_norm": 0.26863712072372437, + "learning_rate": 4.672920065252855e-05, + "loss": 0.1199, + "num_input_tokens_seen": 12384896, + "step": 5730 + }, + { + "epoch": 0.935562805872757, + "grad_norm": 0.9292393922805786, + "learning_rate": 4.67699836867863e-05, + "loss": 0.1231, + "num_input_tokens_seen": 12395808, + "step": 5735 + }, + { + "epoch": 0.9363784665579119, + "grad_norm": 1.1364026069641113, + "learning_rate": 4.681076672104405e-05, + "loss": 0.1516, + "num_input_tokens_seen": 12406880, + "step": 5740 + }, + { + "epoch": 0.9371941272430668, + "grad_norm": 0.7874767780303955, + "learning_rate": 4.68515497553018e-05, + "loss": 0.2467, + "num_input_tokens_seen": 12418240, + "step": 5745 + }, + { + "epoch": 0.9380097879282219, + "grad_norm": 0.8493223190307617, + "learning_rate": 4.6892332789559545e-05, + "loss": 0.1887, + "num_input_tokens_seen": 12428896, + "step": 5750 + }, + { + "epoch": 0.9388254486133768, + "grad_norm": 0.2502117455005646, + "learning_rate": 4.693311582381729e-05, + "loss": 0.0573, + "num_input_tokens_seen": 12439872, + "step": 5755 + }, + { + "epoch": 0.9396411092985318, + "grad_norm": 0.14836087822914124, + "learning_rate": 4.697389885807505e-05, + "loss": 0.0549, + "num_input_tokens_seen": 12451840, + "step": 5760 + }, + { + "epoch": 0.9404567699836868, + "grad_norm": 0.43802669644355774, + "learning_rate": 4.701468189233279e-05, + "loss": 0.1463, + "num_input_tokens_seen": 12463328, + "step": 5765 + }, + { + "epoch": 0.9412724306688418, + "grad_norm": 0.3718009293079376, + "learning_rate": 4.705546492659054e-05, + "loss": 0.2845, + "num_input_tokens_seen": 12474080, + "step": 5770 + }, + { + "epoch": 0.9420880913539967, + "grad_norm": 0.19554634392261505, + "learning_rate": 4.709624796084829e-05, + "loss": 0.354, + "num_input_tokens_seen": 12484960, + "step": 5775 + }, + { + "epoch": 0.9429037520391517, + "grad_norm": 0.17014552652835846, + "learning_rate": 4.713703099510604e-05, + "loss": 0.0754, + "num_input_tokens_seen": 12495584, + "step": 5780 + }, + { + "epoch": 0.9437194127243067, + "grad_norm": 0.4325193762779236, + "learning_rate": 4.7177814029363785e-05, + "loss": 0.1834, + "num_input_tokens_seen": 12505568, + "step": 5785 + }, + { + "epoch": 0.9445350734094616, + "grad_norm": 0.2593797445297241, + "learning_rate": 4.721859706362153e-05, + "loss": 0.221, + "num_input_tokens_seen": 12516480, + "step": 5790 + }, + { + "epoch": 0.9453507340946167, + "grad_norm": 1.714267373085022, + "learning_rate": 4.725938009787929e-05, + "loss": 0.3079, + "num_input_tokens_seen": 12527136, + "step": 5795 + }, + { + "epoch": 0.9461663947797716, + "grad_norm": 0.13458195328712463, + "learning_rate": 4.7300163132137035e-05, + "loss": 0.3024, + "num_input_tokens_seen": 12537568, + "step": 5800 + }, + { + "epoch": 0.9469820554649266, + "grad_norm": 0.42326828837394714, + "learning_rate": 4.734094616639478e-05, + "loss": 0.1171, + "num_input_tokens_seen": 12548736, + "step": 5805 + }, + { + "epoch": 0.9477977161500816, + "grad_norm": 0.08009657263755798, + "learning_rate": 4.738172920065253e-05, + "loss": 0.1037, + "num_input_tokens_seen": 12560192, + "step": 5810 + }, + { + "epoch": 0.9486133768352365, + "grad_norm": 0.39937567710876465, + "learning_rate": 4.742251223491028e-05, + "loss": 0.1102, + "num_input_tokens_seen": 12570816, + "step": 5815 + }, + { + "epoch": 0.9494290375203915, + "grad_norm": 2.496629238128662, + "learning_rate": 4.7463295269168026e-05, + "loss": 0.2396, + "num_input_tokens_seen": 12582272, + "step": 5820 + }, + { + "epoch": 0.9502446982055465, + "grad_norm": 0.38591089844703674, + "learning_rate": 4.750407830342577e-05, + "loss": 0.2388, + "num_input_tokens_seen": 12594432, + "step": 5825 + }, + { + "epoch": 0.9510603588907015, + "grad_norm": 0.4665520489215851, + "learning_rate": 4.754486133768353e-05, + "loss": 0.0422, + "num_input_tokens_seen": 12606048, + "step": 5830 + }, + { + "epoch": 0.9518760195758564, + "grad_norm": 0.8105296492576599, + "learning_rate": 4.7585644371941275e-05, + "loss": 0.2875, + "num_input_tokens_seen": 12617088, + "step": 5835 + }, + { + "epoch": 0.9526916802610114, + "grad_norm": 1.4674079418182373, + "learning_rate": 4.762642740619902e-05, + "loss": 0.2773, + "num_input_tokens_seen": 12627808, + "step": 5840 + }, + { + "epoch": 0.9535073409461664, + "grad_norm": 0.25272461771965027, + "learning_rate": 4.766721044045677e-05, + "loss": 0.0775, + "num_input_tokens_seen": 12638720, + "step": 5845 + }, + { + "epoch": 0.9543230016313213, + "grad_norm": 0.5733208656311035, + "learning_rate": 4.770799347471452e-05, + "loss": 0.1082, + "num_input_tokens_seen": 12648608, + "step": 5850 + }, + { + "epoch": 0.9551386623164764, + "grad_norm": 0.5689278244972229, + "learning_rate": 4.774877650897227e-05, + "loss": 0.0891, + "num_input_tokens_seen": 12658944, + "step": 5855 + }, + { + "epoch": 0.9559543230016313, + "grad_norm": 2.0646185874938965, + "learning_rate": 4.7789559543230014e-05, + "loss": 0.1715, + "num_input_tokens_seen": 12669856, + "step": 5860 + }, + { + "epoch": 0.9567699836867863, + "grad_norm": 0.8185526728630066, + "learning_rate": 4.783034257748777e-05, + "loss": 0.2019, + "num_input_tokens_seen": 12680096, + "step": 5865 + }, + { + "epoch": 0.9575856443719413, + "grad_norm": 0.09415222704410553, + "learning_rate": 4.7871125611745516e-05, + "loss": 0.0397, + "num_input_tokens_seen": 12691040, + "step": 5870 + }, + { + "epoch": 0.9584013050570962, + "grad_norm": 0.1611718088388443, + "learning_rate": 4.791190864600327e-05, + "loss": 0.1566, + "num_input_tokens_seen": 12701728, + "step": 5875 + }, + { + "epoch": 0.9592169657422512, + "grad_norm": 0.2526058256626129, + "learning_rate": 4.795269168026101e-05, + "loss": 0.1256, + "num_input_tokens_seen": 12712224, + "step": 5880 + }, + { + "epoch": 0.9600326264274062, + "grad_norm": 1.040116310119629, + "learning_rate": 4.799347471451876e-05, + "loss": 0.2642, + "num_input_tokens_seen": 12723264, + "step": 5885 + }, + { + "epoch": 0.9608482871125612, + "grad_norm": 0.8266168832778931, + "learning_rate": 4.803425774877651e-05, + "loss": 0.2048, + "num_input_tokens_seen": 12734400, + "step": 5890 + }, + { + "epoch": 0.9616639477977161, + "grad_norm": 0.06330566853284836, + "learning_rate": 4.807504078303426e-05, + "loss": 0.142, + "num_input_tokens_seen": 12745504, + "step": 5895 + }, + { + "epoch": 0.9624796084828712, + "grad_norm": 0.9322842359542847, + "learning_rate": 4.811582381729201e-05, + "loss": 0.2408, + "num_input_tokens_seen": 12755520, + "step": 5900 + }, + { + "epoch": 0.9632952691680261, + "grad_norm": 0.5328028798103333, + "learning_rate": 4.8156606851549756e-05, + "loss": 0.0552, + "num_input_tokens_seen": 12766336, + "step": 5905 + }, + { + "epoch": 0.964110929853181, + "grad_norm": 1.391656756401062, + "learning_rate": 4.819738988580751e-05, + "loss": 0.1345, + "num_input_tokens_seen": 12776704, + "step": 5910 + }, + { + "epoch": 0.9649265905383361, + "grad_norm": 0.3866509199142456, + "learning_rate": 4.823817292006526e-05, + "loss": 0.2547, + "num_input_tokens_seen": 12787872, + "step": 5915 + }, + { + "epoch": 0.965742251223491, + "grad_norm": 0.30649709701538086, + "learning_rate": 4.8278955954323e-05, + "loss": 0.0888, + "num_input_tokens_seen": 12798880, + "step": 5920 + }, + { + "epoch": 0.966557911908646, + "grad_norm": 1.4070444107055664, + "learning_rate": 4.831973898858075e-05, + "loss": 0.1847, + "num_input_tokens_seen": 12809696, + "step": 5925 + }, + { + "epoch": 0.967373572593801, + "grad_norm": 0.49175766110420227, + "learning_rate": 4.83605220228385e-05, + "loss": 0.1251, + "num_input_tokens_seen": 12820192, + "step": 5930 + }, + { + "epoch": 0.968189233278956, + "grad_norm": 2.166771173477173, + "learning_rate": 4.8401305057096255e-05, + "loss": 0.0684, + "num_input_tokens_seen": 12832384, + "step": 5935 + }, + { + "epoch": 0.9690048939641109, + "grad_norm": 0.283640056848526, + "learning_rate": 4.8442088091353996e-05, + "loss": 0.0936, + "num_input_tokens_seen": 12843936, + "step": 5940 + }, + { + "epoch": 0.9698205546492659, + "grad_norm": 0.5705147385597229, + "learning_rate": 4.8482871125611744e-05, + "loss": 0.1492, + "num_input_tokens_seen": 12854752, + "step": 5945 + }, + { + "epoch": 0.9706362153344209, + "grad_norm": 0.775317370891571, + "learning_rate": 4.85236541598695e-05, + "loss": 0.0695, + "num_input_tokens_seen": 12865248, + "step": 5950 + }, + { + "epoch": 0.9714518760195758, + "grad_norm": 1.848908543586731, + "learning_rate": 4.8564437194127246e-05, + "loss": 0.1654, + "num_input_tokens_seen": 12876128, + "step": 5955 + }, + { + "epoch": 0.9722675367047309, + "grad_norm": 1.3205891847610474, + "learning_rate": 4.8605220228384994e-05, + "loss": 0.264, + "num_input_tokens_seen": 12887008, + "step": 5960 + }, + { + "epoch": 0.9730831973898858, + "grad_norm": 0.07514811307191849, + "learning_rate": 4.864600326264274e-05, + "loss": 0.0718, + "num_input_tokens_seen": 12898624, + "step": 5965 + }, + { + "epoch": 0.9738988580750407, + "grad_norm": 0.3022977411746979, + "learning_rate": 4.8686786296900496e-05, + "loss": 0.1064, + "num_input_tokens_seen": 12909888, + "step": 5970 + }, + { + "epoch": 0.9747145187601958, + "grad_norm": 2.25614333152771, + "learning_rate": 4.872756933115824e-05, + "loss": 0.2958, + "num_input_tokens_seen": 12920032, + "step": 5975 + }, + { + "epoch": 0.9755301794453507, + "grad_norm": 2.3989357948303223, + "learning_rate": 4.8768352365415984e-05, + "loss": 0.3869, + "num_input_tokens_seen": 12931392, + "step": 5980 + }, + { + "epoch": 0.9763458401305057, + "grad_norm": 1.3274757862091064, + "learning_rate": 4.880913539967374e-05, + "loss": 0.1832, + "num_input_tokens_seen": 12941472, + "step": 5985 + }, + { + "epoch": 0.9771615008156607, + "grad_norm": 0.6780110597610474, + "learning_rate": 4.8849918433931486e-05, + "loss": 0.1049, + "num_input_tokens_seen": 12952192, + "step": 5990 + }, + { + "epoch": 0.9779771615008157, + "grad_norm": 1.677978754043579, + "learning_rate": 4.8890701468189234e-05, + "loss": 0.2348, + "num_input_tokens_seen": 12963200, + "step": 5995 + }, + { + "epoch": 0.9787928221859706, + "grad_norm": 0.7129308581352234, + "learning_rate": 4.893148450244698e-05, + "loss": 0.1958, + "num_input_tokens_seen": 12972768, + "step": 6000 + }, + { + "epoch": 0.9796084828711256, + "grad_norm": 0.9679998755455017, + "learning_rate": 4.8972267536704736e-05, + "loss": 0.0804, + "num_input_tokens_seen": 12984064, + "step": 6005 + }, + { + "epoch": 0.9804241435562806, + "grad_norm": 0.5357950925827026, + "learning_rate": 4.9013050570962484e-05, + "loss": 0.1893, + "num_input_tokens_seen": 12994336, + "step": 6010 + }, + { + "epoch": 0.9812398042414355, + "grad_norm": 0.10595850646495819, + "learning_rate": 4.905383360522023e-05, + "loss": 0.045, + "num_input_tokens_seen": 13004096, + "step": 6015 + }, + { + "epoch": 0.9820554649265906, + "grad_norm": 0.2538086175918579, + "learning_rate": 4.909461663947798e-05, + "loss": 0.0548, + "num_input_tokens_seen": 13015168, + "step": 6020 + }, + { + "epoch": 0.9828711256117455, + "grad_norm": 0.17063258588314056, + "learning_rate": 4.9135399673735727e-05, + "loss": 0.0699, + "num_input_tokens_seen": 13026080, + "step": 6025 + }, + { + "epoch": 0.9836867862969005, + "grad_norm": 1.8540911674499512, + "learning_rate": 4.917618270799348e-05, + "loss": 0.2788, + "num_input_tokens_seen": 13037888, + "step": 6030 + }, + { + "epoch": 0.9845024469820555, + "grad_norm": 2.5009257793426514, + "learning_rate": 4.921696574225123e-05, + "loss": 0.3754, + "num_input_tokens_seen": 13049440, + "step": 6035 + }, + { + "epoch": 0.9853181076672104, + "grad_norm": 0.28608807921409607, + "learning_rate": 4.9257748776508976e-05, + "loss": 0.067, + "num_input_tokens_seen": 13060480, + "step": 6040 + }, + { + "epoch": 0.9861337683523654, + "grad_norm": 0.5712888836860657, + "learning_rate": 4.9298531810766724e-05, + "loss": 0.1072, + "num_input_tokens_seen": 13071584, + "step": 6045 + }, + { + "epoch": 0.9869494290375204, + "grad_norm": 0.9310219287872314, + "learning_rate": 4.933931484502447e-05, + "loss": 0.1502, + "num_input_tokens_seen": 13081376, + "step": 6050 + }, + { + "epoch": 0.9877650897226754, + "grad_norm": 0.0705684944987297, + "learning_rate": 4.938009787928222e-05, + "loss": 0.0891, + "num_input_tokens_seen": 13092768, + "step": 6055 + }, + { + "epoch": 0.9885807504078303, + "grad_norm": 0.4895913898944855, + "learning_rate": 4.942088091353997e-05, + "loss": 0.2225, + "num_input_tokens_seen": 13104096, + "step": 6060 + }, + { + "epoch": 0.9893964110929854, + "grad_norm": 0.19624266028404236, + "learning_rate": 4.946166394779772e-05, + "loss": 0.0958, + "num_input_tokens_seen": 13115488, + "step": 6065 + }, + { + "epoch": 0.9902120717781403, + "grad_norm": 0.07742413133382797, + "learning_rate": 4.950244698205547e-05, + "loss": 0.073, + "num_input_tokens_seen": 13126752, + "step": 6070 + }, + { + "epoch": 0.9910277324632952, + "grad_norm": 0.24820929765701294, + "learning_rate": 4.9543230016313217e-05, + "loss": 0.1363, + "num_input_tokens_seen": 13137920, + "step": 6075 + }, + { + "epoch": 0.9918433931484503, + "grad_norm": 1.0225684642791748, + "learning_rate": 4.9584013050570964e-05, + "loss": 0.1826, + "num_input_tokens_seen": 13148416, + "step": 6080 + }, + { + "epoch": 0.9926590538336052, + "grad_norm": 1.4646437168121338, + "learning_rate": 4.962479608482871e-05, + "loss": 0.0883, + "num_input_tokens_seen": 13157920, + "step": 6085 + }, + { + "epoch": 0.9934747145187602, + "grad_norm": 0.3324906826019287, + "learning_rate": 4.9665579119086466e-05, + "loss": 0.0596, + "num_input_tokens_seen": 13167968, + "step": 6090 + }, + { + "epoch": 0.9942903752039152, + "grad_norm": 1.351397156715393, + "learning_rate": 4.970636215334421e-05, + "loss": 0.2292, + "num_input_tokens_seen": 13179648, + "step": 6095 + }, + { + "epoch": 0.9951060358890701, + "grad_norm": 0.1321014165878296, + "learning_rate": 4.974714518760196e-05, + "loss": 0.1588, + "num_input_tokens_seen": 13190656, + "step": 6100 + }, + { + "epoch": 0.9959216965742251, + "grad_norm": 0.22870512306690216, + "learning_rate": 4.978792822185971e-05, + "loss": 0.1892, + "num_input_tokens_seen": 13201120, + "step": 6105 + }, + { + "epoch": 0.9967373572593801, + "grad_norm": 0.39421209692955017, + "learning_rate": 4.982871125611746e-05, + "loss": 0.0889, + "num_input_tokens_seen": 13211744, + "step": 6110 + }, + { + "epoch": 0.9975530179445351, + "grad_norm": 0.03809139505028725, + "learning_rate": 4.9869494290375205e-05, + "loss": 0.182, + "num_input_tokens_seen": 13222752, + "step": 6115 + }, + { + "epoch": 0.99836867862969, + "grad_norm": 0.3277501165866852, + "learning_rate": 4.991027732463295e-05, + "loss": 0.2179, + "num_input_tokens_seen": 13233696, + "step": 6120 + }, + { + "epoch": 0.9991843393148451, + "grad_norm": 0.5695142149925232, + "learning_rate": 4.9951060358890707e-05, + "loss": 0.1774, + "num_input_tokens_seen": 13245504, + "step": 6125 + }, + { + "epoch": 1.0, + "grad_norm": 0.062192730605602264, + "learning_rate": 4.9991843393148454e-05, + "loss": 0.2248, + "num_input_tokens_seen": 13255424, + "step": 6130 + }, + { + "epoch": 1.0, + "eval_loss": 0.15951211750507355, + "eval_runtime": 132.3275, + "eval_samples_per_second": 20.593, + "eval_steps_per_second": 5.154, + "num_input_tokens_seen": 13255424, + "step": 6130 + }, + { + "epoch": 1.000815660685155, + "grad_norm": 0.8447925448417664, + "learning_rate": 4.999999935147941e-05, + "loss": 0.168, + "num_input_tokens_seen": 13265952, + "step": 6135 + }, + { + "epoch": 1.0016313213703099, + "grad_norm": 0.753044605255127, + "learning_rate": 4.999999671686456e-05, + "loss": 0.0932, + "num_input_tokens_seen": 13276768, + "step": 6140 + }, + { + "epoch": 1.002446982055465, + "grad_norm": 0.42759910225868225, + "learning_rate": 4.999999205562312e-05, + "loss": 0.1425, + "num_input_tokens_seen": 13287776, + "step": 6145 + }, + { + "epoch": 1.00326264274062, + "grad_norm": 0.2797618806362152, + "learning_rate": 4.999998536775549e-05, + "loss": 0.2427, + "num_input_tokens_seen": 13299488, + "step": 6150 + }, + { + "epoch": 1.004078303425775, + "grad_norm": 0.5948951840400696, + "learning_rate": 4.9999976653262184e-05, + "loss": 0.1046, + "num_input_tokens_seen": 13310208, + "step": 6155 + }, + { + "epoch": 1.0048939641109298, + "grad_norm": 0.07903174310922623, + "learning_rate": 4.999996591214392e-05, + "loss": 0.071, + "num_input_tokens_seen": 13319904, + "step": 6160 + }, + { + "epoch": 1.0057096247960848, + "grad_norm": 0.35774946212768555, + "learning_rate": 4.999995314440158e-05, + "loss": 0.1436, + "num_input_tokens_seen": 13330528, + "step": 6165 + }, + { + "epoch": 1.0065252854812399, + "grad_norm": 0.1773616373538971, + "learning_rate": 4.999993835003618e-05, + "loss": 0.1189, + "num_input_tokens_seen": 13341760, + "step": 6170 + }, + { + "epoch": 1.0073409461663947, + "grad_norm": 0.03791346400976181, + "learning_rate": 4.9999921529048945e-05, + "loss": 0.2111, + "num_input_tokens_seen": 13352448, + "step": 6175 + }, + { + "epoch": 1.0081566068515497, + "grad_norm": 0.8501187562942505, + "learning_rate": 4.9999902681441205e-05, + "loss": 0.2517, + "num_input_tokens_seen": 13363296, + "step": 6180 + }, + { + "epoch": 1.0089722675367048, + "grad_norm": 0.9683763384819031, + "learning_rate": 4.9999881807214515e-05, + "loss": 0.0898, + "num_input_tokens_seen": 13374368, + "step": 6185 + }, + { + "epoch": 1.0097879282218598, + "grad_norm": 1.0407763719558716, + "learning_rate": 4.9999858906370553e-05, + "loss": 0.1489, + "num_input_tokens_seen": 13385184, + "step": 6190 + }, + { + "epoch": 1.0106035889070146, + "grad_norm": 1.3565641641616821, + "learning_rate": 4.9999833978911185e-05, + "loss": 0.296, + "num_input_tokens_seen": 13395104, + "step": 6195 + }, + { + "epoch": 1.0114192495921697, + "grad_norm": 0.7545968294143677, + "learning_rate": 4.999980702483842e-05, + "loss": 0.1138, + "num_input_tokens_seen": 13406080, + "step": 6200 + }, + { + "epoch": 1.0122349102773247, + "grad_norm": 0.342725545167923, + "learning_rate": 4.999977804415446e-05, + "loss": 0.1759, + "num_input_tokens_seen": 13417952, + "step": 6205 + }, + { + "epoch": 1.0130505709624795, + "grad_norm": 1.4529454708099365, + "learning_rate": 4.999974703686164e-05, + "loss": 0.1407, + "num_input_tokens_seen": 13428512, + "step": 6210 + }, + { + "epoch": 1.0138662316476346, + "grad_norm": 0.20423713326454163, + "learning_rate": 4.9999714002962474e-05, + "loss": 0.0499, + "num_input_tokens_seen": 13439648, + "step": 6215 + }, + { + "epoch": 1.0146818923327896, + "grad_norm": 0.20951935648918152, + "learning_rate": 4.999967894245965e-05, + "loss": 0.1284, + "num_input_tokens_seen": 13451520, + "step": 6220 + }, + { + "epoch": 1.0154975530179446, + "grad_norm": 1.618605613708496, + "learning_rate": 4.9999641855355995e-05, + "loss": 0.112, + "num_input_tokens_seen": 13460480, + "step": 6225 + }, + { + "epoch": 1.0163132137030995, + "grad_norm": 0.4594557285308838, + "learning_rate": 4.999960274165453e-05, + "loss": 0.0938, + "num_input_tokens_seen": 13471328, + "step": 6230 + }, + { + "epoch": 1.0171288743882545, + "grad_norm": 1.3834564685821533, + "learning_rate": 4.999956160135842e-05, + "loss": 0.1153, + "num_input_tokens_seen": 13482048, + "step": 6235 + }, + { + "epoch": 1.0179445350734095, + "grad_norm": 0.5826573967933655, + "learning_rate": 4.999951843447099e-05, + "loss": 0.1958, + "num_input_tokens_seen": 13492512, + "step": 6240 + }, + { + "epoch": 1.0187601957585644, + "grad_norm": 0.9123461842536926, + "learning_rate": 4.999947324099576e-05, + "loss": 0.1434, + "num_input_tokens_seen": 13504000, + "step": 6245 + }, + { + "epoch": 1.0195758564437194, + "grad_norm": 0.32763737440109253, + "learning_rate": 4.999942602093638e-05, + "loss": 0.1353, + "num_input_tokens_seen": 13514784, + "step": 6250 + }, + { + "epoch": 1.0203915171288744, + "grad_norm": 0.6798973083496094, + "learning_rate": 4.999937677429669e-05, + "loss": 0.108, + "num_input_tokens_seen": 13525376, + "step": 6255 + }, + { + "epoch": 1.0212071778140293, + "grad_norm": 0.24766962230205536, + "learning_rate": 4.9999325501080666e-05, + "loss": 0.1644, + "num_input_tokens_seen": 13535872, + "step": 6260 + }, + { + "epoch": 1.0220228384991843, + "grad_norm": 0.5509926676750183, + "learning_rate": 4.999927220129247e-05, + "loss": 0.096, + "num_input_tokens_seen": 13546464, + "step": 6265 + }, + { + "epoch": 1.0228384991843393, + "grad_norm": 0.4833585023880005, + "learning_rate": 4.9999216874936426e-05, + "loss": 0.2495, + "num_input_tokens_seen": 13555936, + "step": 6270 + }, + { + "epoch": 1.0236541598694944, + "grad_norm": 0.3273831605911255, + "learning_rate": 4.9999159522017015e-05, + "loss": 0.0229, + "num_input_tokens_seen": 13566528, + "step": 6275 + }, + { + "epoch": 1.0244698205546492, + "grad_norm": 0.09724698215723038, + "learning_rate": 4.999910014253889e-05, + "loss": 0.1671, + "num_input_tokens_seen": 13577152, + "step": 6280 + }, + { + "epoch": 1.0252854812398042, + "grad_norm": 1.1974483728408813, + "learning_rate": 4.999903873650687e-05, + "loss": 0.115, + "num_input_tokens_seen": 13588704, + "step": 6285 + }, + { + "epoch": 1.0261011419249593, + "grad_norm": 1.0667773485183716, + "learning_rate": 4.999897530392591e-05, + "loss": 0.0547, + "num_input_tokens_seen": 13599552, + "step": 6290 + }, + { + "epoch": 1.026916802610114, + "grad_norm": 0.15526823699474335, + "learning_rate": 4.9998909844801176e-05, + "loss": 0.0425, + "num_input_tokens_seen": 13611264, + "step": 6295 + }, + { + "epoch": 1.0277324632952691, + "grad_norm": 0.7670542597770691, + "learning_rate": 4.999884235913797e-05, + "loss": 0.2132, + "num_input_tokens_seen": 13621760, + "step": 6300 + }, + { + "epoch": 1.0285481239804242, + "grad_norm": 1.479518175125122, + "learning_rate": 4.999877284694177e-05, + "loss": 0.2331, + "num_input_tokens_seen": 13632672, + "step": 6305 + }, + { + "epoch": 1.0293637846655792, + "grad_norm": 0.13447560369968414, + "learning_rate": 4.999870130821818e-05, + "loss": 0.0405, + "num_input_tokens_seen": 13643584, + "step": 6310 + }, + { + "epoch": 1.030179445350734, + "grad_norm": 0.3454958200454712, + "learning_rate": 4.9998627742973025e-05, + "loss": 0.291, + "num_input_tokens_seen": 13655488, + "step": 6315 + }, + { + "epoch": 1.030995106035889, + "grad_norm": 1.1404590606689453, + "learning_rate": 4.9998552151212276e-05, + "loss": 0.165, + "num_input_tokens_seen": 13666368, + "step": 6320 + }, + { + "epoch": 1.031810766721044, + "grad_norm": 0.5154155492782593, + "learning_rate": 4.999847453294204e-05, + "loss": 0.2877, + "num_input_tokens_seen": 13678048, + "step": 6325 + }, + { + "epoch": 1.032626427406199, + "grad_norm": 2.051875591278076, + "learning_rate": 4.999839488816861e-05, + "loss": 0.0974, + "num_input_tokens_seen": 13689024, + "step": 6330 + }, + { + "epoch": 1.033442088091354, + "grad_norm": 0.5902764201164246, + "learning_rate": 4.9998313216898454e-05, + "loss": 0.1673, + "num_input_tokens_seen": 13699744, + "step": 6335 + }, + { + "epoch": 1.034257748776509, + "grad_norm": 0.3263436555862427, + "learning_rate": 4.99982295191382e-05, + "loss": 0.0911, + "num_input_tokens_seen": 13710016, + "step": 6340 + }, + { + "epoch": 1.035073409461664, + "grad_norm": 0.6336426734924316, + "learning_rate": 4.99981437948946e-05, + "loss": 0.1427, + "num_input_tokens_seen": 13721664, + "step": 6345 + }, + { + "epoch": 1.0358890701468189, + "grad_norm": 0.9716700911521912, + "learning_rate": 4.999805604417464e-05, + "loss": 0.0655, + "num_input_tokens_seen": 13734752, + "step": 6350 + }, + { + "epoch": 1.036704730831974, + "grad_norm": 0.22332896292209625, + "learning_rate": 4.999796626698542e-05, + "loss": 0.0963, + "num_input_tokens_seen": 13745504, + "step": 6355 + }, + { + "epoch": 1.037520391517129, + "grad_norm": 3.418821096420288, + "learning_rate": 4.999787446333421e-05, + "loss": 0.2489, + "num_input_tokens_seen": 13755968, + "step": 6360 + }, + { + "epoch": 1.0383360522022838, + "grad_norm": 1.5685642957687378, + "learning_rate": 4.999778063322846e-05, + "loss": 0.1757, + "num_input_tokens_seen": 13767296, + "step": 6365 + }, + { + "epoch": 1.0391517128874388, + "grad_norm": 0.1339409053325653, + "learning_rate": 4.9997684776675775e-05, + "loss": 0.2306, + "num_input_tokens_seen": 13778656, + "step": 6370 + }, + { + "epoch": 1.0399673735725938, + "grad_norm": 1.9527429342269897, + "learning_rate": 4.999758689368392e-05, + "loss": 0.1324, + "num_input_tokens_seen": 13789216, + "step": 6375 + }, + { + "epoch": 1.0407830342577489, + "grad_norm": 0.9353014826774597, + "learning_rate": 4.999748698426084e-05, + "loss": 0.1053, + "num_input_tokens_seen": 13799680, + "step": 6380 + }, + { + "epoch": 1.0415986949429037, + "grad_norm": 0.15532203018665314, + "learning_rate": 4.9997385048414624e-05, + "loss": 0.1455, + "num_input_tokens_seen": 13810720, + "step": 6385 + }, + { + "epoch": 1.0424143556280587, + "grad_norm": 0.24766068160533905, + "learning_rate": 4.999728108615355e-05, + "loss": 0.1358, + "num_input_tokens_seen": 13821856, + "step": 6390 + }, + { + "epoch": 1.0432300163132138, + "grad_norm": 0.41371721029281616, + "learning_rate": 4.9997175097486026e-05, + "loss": 0.1819, + "num_input_tokens_seen": 13833984, + "step": 6395 + }, + { + "epoch": 1.0440456769983686, + "grad_norm": 0.7519470453262329, + "learning_rate": 4.9997067082420655e-05, + "loss": 0.2345, + "num_input_tokens_seen": 13844928, + "step": 6400 + }, + { + "epoch": 1.0448613376835236, + "grad_norm": 2.41269850730896, + "learning_rate": 4.9996957040966205e-05, + "loss": 0.1976, + "num_input_tokens_seen": 13856384, + "step": 6405 + }, + { + "epoch": 1.0456769983686787, + "grad_norm": 0.38167575001716614, + "learning_rate": 4.999684497313157e-05, + "loss": 0.1104, + "num_input_tokens_seen": 13867456, + "step": 6410 + }, + { + "epoch": 1.0464926590538337, + "grad_norm": 0.27223873138427734, + "learning_rate": 4.9996730878925856e-05, + "loss": 0.0321, + "num_input_tokens_seen": 13877216, + "step": 6415 + }, + { + "epoch": 1.0473083197389885, + "grad_norm": 1.3951562643051147, + "learning_rate": 4.99966147583583e-05, + "loss": 0.3813, + "num_input_tokens_seen": 13888192, + "step": 6420 + }, + { + "epoch": 1.0481239804241436, + "grad_norm": 0.4671041667461395, + "learning_rate": 4.9996496611438326e-05, + "loss": 0.0254, + "num_input_tokens_seen": 13899072, + "step": 6425 + }, + { + "epoch": 1.0489396411092986, + "grad_norm": 2.1099801063537598, + "learning_rate": 4.99963764381755e-05, + "loss": 0.2004, + "num_input_tokens_seen": 13909792, + "step": 6430 + }, + { + "epoch": 1.0497553017944534, + "grad_norm": 0.638484537601471, + "learning_rate": 4.999625423857958e-05, + "loss": 0.127, + "num_input_tokens_seen": 13920800, + "step": 6435 + }, + { + "epoch": 1.0505709624796085, + "grad_norm": 0.8620742559432983, + "learning_rate": 4.999613001266045e-05, + "loss": 0.093, + "num_input_tokens_seen": 13930976, + "step": 6440 + }, + { + "epoch": 1.0513866231647635, + "grad_norm": 0.5324145555496216, + "learning_rate": 4.999600376042819e-05, + "loss": 0.06, + "num_input_tokens_seen": 13942080, + "step": 6445 + }, + { + "epoch": 1.0522022838499185, + "grad_norm": 1.362005352973938, + "learning_rate": 4.999587548189305e-05, + "loss": 0.099, + "num_input_tokens_seen": 13952736, + "step": 6450 + }, + { + "epoch": 1.0530179445350734, + "grad_norm": 1.0698500871658325, + "learning_rate": 4.99957451770654e-05, + "loss": 0.1306, + "num_input_tokens_seen": 13962400, + "step": 6455 + }, + { + "epoch": 1.0538336052202284, + "grad_norm": 0.051931142807006836, + "learning_rate": 4.999561284595583e-05, + "loss": 0.0392, + "num_input_tokens_seen": 13974304, + "step": 6460 + }, + { + "epoch": 1.0546492659053834, + "grad_norm": 0.628566563129425, + "learning_rate": 4.9995478488575054e-05, + "loss": 0.1208, + "num_input_tokens_seen": 13985248, + "step": 6465 + }, + { + "epoch": 1.0554649265905383, + "grad_norm": 2.0032753944396973, + "learning_rate": 4.999534210493396e-05, + "loss": 0.0949, + "num_input_tokens_seen": 13995840, + "step": 6470 + }, + { + "epoch": 1.0562805872756933, + "grad_norm": 0.1899144947528839, + "learning_rate": 4.999520369504362e-05, + "loss": 0.1009, + "num_input_tokens_seen": 14007104, + "step": 6475 + }, + { + "epoch": 1.0570962479608483, + "grad_norm": 1.2809895277023315, + "learning_rate": 4.9995063258915235e-05, + "loss": 0.0999, + "num_input_tokens_seen": 14018144, + "step": 6480 + }, + { + "epoch": 1.0579119086460032, + "grad_norm": 0.4021928310394287, + "learning_rate": 4.9994920796560205e-05, + "loss": 0.1025, + "num_input_tokens_seen": 14029024, + "step": 6485 + }, + { + "epoch": 1.0587275693311582, + "grad_norm": 0.08657373487949371, + "learning_rate": 4.999477630799007e-05, + "loss": 0.2045, + "num_input_tokens_seen": 14040480, + "step": 6490 + }, + { + "epoch": 1.0595432300163132, + "grad_norm": 0.1352744847536087, + "learning_rate": 4.999462979321654e-05, + "loss": 0.0102, + "num_input_tokens_seen": 14050976, + "step": 6495 + }, + { + "epoch": 1.0603588907014683, + "grad_norm": 0.7184095978736877, + "learning_rate": 4.9994481252251506e-05, + "loss": 0.0252, + "num_input_tokens_seen": 14060352, + "step": 6500 + }, + { + "epoch": 1.061174551386623, + "grad_norm": 2.4281158447265625, + "learning_rate": 4.999433068510699e-05, + "loss": 0.0934, + "num_input_tokens_seen": 14071680, + "step": 6505 + }, + { + "epoch": 1.0619902120717781, + "grad_norm": 0.5886656641960144, + "learning_rate": 4.999417809179523e-05, + "loss": 0.1177, + "num_input_tokens_seen": 14083296, + "step": 6510 + }, + { + "epoch": 1.0628058727569332, + "grad_norm": 2.2036728858947754, + "learning_rate": 4.9994023472328555e-05, + "loss": 0.1343, + "num_input_tokens_seen": 14094752, + "step": 6515 + }, + { + "epoch": 1.0636215334420882, + "grad_norm": 2.3913283348083496, + "learning_rate": 4.999386682671953e-05, + "loss": 0.3164, + "num_input_tokens_seen": 14105088, + "step": 6520 + }, + { + "epoch": 1.064437194127243, + "grad_norm": 2.9640257358551025, + "learning_rate": 4.9993708154980836e-05, + "loss": 0.2238, + "num_input_tokens_seen": 14114976, + "step": 6525 + }, + { + "epoch": 1.065252854812398, + "grad_norm": 1.4815361499786377, + "learning_rate": 4.999354745712534e-05, + "loss": 0.1679, + "num_input_tokens_seen": 14126592, + "step": 6530 + }, + { + "epoch": 1.066068515497553, + "grad_norm": 1.130543828010559, + "learning_rate": 4.999338473316607e-05, + "loss": 0.1804, + "num_input_tokens_seen": 14138048, + "step": 6535 + }, + { + "epoch": 1.066884176182708, + "grad_norm": 0.17946594953536987, + "learning_rate": 4.9993219983116223e-05, + "loss": 0.0981, + "num_input_tokens_seen": 14148128, + "step": 6540 + }, + { + "epoch": 1.067699836867863, + "grad_norm": 0.8463963270187378, + "learning_rate": 4.999305320698915e-05, + "loss": 0.1231, + "num_input_tokens_seen": 14160096, + "step": 6545 + }, + { + "epoch": 1.068515497553018, + "grad_norm": 0.2815023958683014, + "learning_rate": 4.999288440479837e-05, + "loss": 0.086, + "num_input_tokens_seen": 14170752, + "step": 6550 + }, + { + "epoch": 1.0693311582381728, + "grad_norm": 0.10955648124217987, + "learning_rate": 4.999271357655757e-05, + "loss": 0.3752, + "num_input_tokens_seen": 14181952, + "step": 6555 + }, + { + "epoch": 1.0701468189233279, + "grad_norm": 0.230208158493042, + "learning_rate": 4.999254072228059e-05, + "loss": 0.197, + "num_input_tokens_seen": 14192384, + "step": 6560 + }, + { + "epoch": 1.070962479608483, + "grad_norm": 0.05787502974271774, + "learning_rate": 4.9992365841981456e-05, + "loss": 0.0359, + "num_input_tokens_seen": 14204384, + "step": 6565 + }, + { + "epoch": 1.071778140293638, + "grad_norm": 0.22051464021205902, + "learning_rate": 4.9992188935674335e-05, + "loss": 0.1302, + "num_input_tokens_seen": 14215776, + "step": 6570 + }, + { + "epoch": 1.0725938009787928, + "grad_norm": 0.4692552387714386, + "learning_rate": 4.999201000337356e-05, + "loss": 0.1385, + "num_input_tokens_seen": 14227520, + "step": 6575 + }, + { + "epoch": 1.0734094616639478, + "grad_norm": 0.25476914644241333, + "learning_rate": 4.999182904509366e-05, + "loss": 0.1798, + "num_input_tokens_seen": 14238976, + "step": 6580 + }, + { + "epoch": 1.0742251223491028, + "grad_norm": 0.032368458807468414, + "learning_rate": 4.9991646060849285e-05, + "loss": 0.1537, + "num_input_tokens_seen": 14249344, + "step": 6585 + }, + { + "epoch": 1.0750407830342577, + "grad_norm": 0.4669268727302551, + "learning_rate": 4.9991461050655264e-05, + "loss": 0.0312, + "num_input_tokens_seen": 14259840, + "step": 6590 + }, + { + "epoch": 1.0758564437194127, + "grad_norm": 0.36859166622161865, + "learning_rate": 4.999127401452662e-05, + "loss": 0.2027, + "num_input_tokens_seen": 14271456, + "step": 6595 + }, + { + "epoch": 1.0766721044045677, + "grad_norm": 1.335395097732544, + "learning_rate": 4.999108495247849e-05, + "loss": 0.119, + "num_input_tokens_seen": 14282688, + "step": 6600 + }, + { + "epoch": 1.0774877650897228, + "grad_norm": 1.1209423542022705, + "learning_rate": 4.9990893864526214e-05, + "loss": 0.0792, + "num_input_tokens_seen": 14293920, + "step": 6605 + }, + { + "epoch": 1.0783034257748776, + "grad_norm": 2.257000684738159, + "learning_rate": 4.9990700750685276e-05, + "loss": 0.0515, + "num_input_tokens_seen": 14304512, + "step": 6610 + }, + { + "epoch": 1.0791190864600326, + "grad_norm": 0.6861785054206848, + "learning_rate": 4.999050561097134e-05, + "loss": 0.0484, + "num_input_tokens_seen": 14315616, + "step": 6615 + }, + { + "epoch": 1.0799347471451877, + "grad_norm": 0.31465965509414673, + "learning_rate": 4.999030844540021e-05, + "loss": 0.0547, + "num_input_tokens_seen": 14326400, + "step": 6620 + }, + { + "epoch": 1.0807504078303425, + "grad_norm": 0.03909171000123024, + "learning_rate": 4.999010925398788e-05, + "loss": 0.1774, + "num_input_tokens_seen": 14336288, + "step": 6625 + }, + { + "epoch": 1.0815660685154975, + "grad_norm": 0.4094997048377991, + "learning_rate": 4.99899080367505e-05, + "loss": 0.1463, + "num_input_tokens_seen": 14347616, + "step": 6630 + }, + { + "epoch": 1.0823817292006526, + "grad_norm": 0.19153444468975067, + "learning_rate": 4.9989704793704374e-05, + "loss": 0.1032, + "num_input_tokens_seen": 14357664, + "step": 6635 + }, + { + "epoch": 1.0831973898858076, + "grad_norm": 1.2632654905319214, + "learning_rate": 4.998949952486598e-05, + "loss": 0.3619, + "num_input_tokens_seen": 14367232, + "step": 6640 + }, + { + "epoch": 1.0840130505709624, + "grad_norm": 0.11972885578870773, + "learning_rate": 4.998929223025196e-05, + "loss": 0.1099, + "num_input_tokens_seen": 14378048, + "step": 6645 + }, + { + "epoch": 1.0848287112561175, + "grad_norm": 1.1563533544540405, + "learning_rate": 4.998908290987913e-05, + "loss": 0.2028, + "num_input_tokens_seen": 14389248, + "step": 6650 + }, + { + "epoch": 1.0856443719412725, + "grad_norm": 0.39810478687286377, + "learning_rate": 4.998887156376443e-05, + "loss": 0.1079, + "num_input_tokens_seen": 14399616, + "step": 6655 + }, + { + "epoch": 1.0864600326264273, + "grad_norm": 2.4849796295166016, + "learning_rate": 4.998865819192501e-05, + "loss": 0.3332, + "num_input_tokens_seen": 14412000, + "step": 6660 + }, + { + "epoch": 1.0872756933115824, + "grad_norm": 0.21648432314395905, + "learning_rate": 4.9988442794378166e-05, + "loss": 0.2316, + "num_input_tokens_seen": 14423392, + "step": 6665 + }, + { + "epoch": 1.0880913539967374, + "grad_norm": 0.6873315572738647, + "learning_rate": 4.998822537114136e-05, + "loss": 0.2595, + "num_input_tokens_seen": 14433792, + "step": 6670 + }, + { + "epoch": 1.0889070146818924, + "grad_norm": 0.2786778211593628, + "learning_rate": 4.998800592223222e-05, + "loss": 0.1057, + "num_input_tokens_seen": 14445152, + "step": 6675 + }, + { + "epoch": 1.0897226753670473, + "grad_norm": 0.6130539774894714, + "learning_rate": 4.9987784447668526e-05, + "loss": 0.1302, + "num_input_tokens_seen": 14455968, + "step": 6680 + }, + { + "epoch": 1.0905383360522023, + "grad_norm": 0.3063286244869232, + "learning_rate": 4.9987560947468245e-05, + "loss": 0.0482, + "num_input_tokens_seen": 14465824, + "step": 6685 + }, + { + "epoch": 1.0913539967373573, + "grad_norm": 2.032806634902954, + "learning_rate": 4.998733542164948e-05, + "loss": 0.1605, + "num_input_tokens_seen": 14476480, + "step": 6690 + }, + { + "epoch": 1.0921696574225122, + "grad_norm": 1.2560557126998901, + "learning_rate": 4.998710787023053e-05, + "loss": 0.1347, + "num_input_tokens_seen": 14486144, + "step": 6695 + }, + { + "epoch": 1.0929853181076672, + "grad_norm": 0.6270899772644043, + "learning_rate": 4.998687829322983e-05, + "loss": 0.2322, + "num_input_tokens_seen": 14497664, + "step": 6700 + }, + { + "epoch": 1.0938009787928222, + "grad_norm": 1.8091199398040771, + "learning_rate": 4.9986646690665996e-05, + "loss": 0.1403, + "num_input_tokens_seen": 14509888, + "step": 6705 + }, + { + "epoch": 1.094616639477977, + "grad_norm": 0.5705227255821228, + "learning_rate": 4.998641306255779e-05, + "loss": 0.2003, + "num_input_tokens_seen": 14521664, + "step": 6710 + }, + { + "epoch": 1.095432300163132, + "grad_norm": 1.049680471420288, + "learning_rate": 4.998617740892417e-05, + "loss": 0.1745, + "num_input_tokens_seen": 14532768, + "step": 6715 + }, + { + "epoch": 1.0962479608482871, + "grad_norm": 0.9702552556991577, + "learning_rate": 4.998593972978423e-05, + "loss": 0.1622, + "num_input_tokens_seen": 14544160, + "step": 6720 + }, + { + "epoch": 1.0970636215334422, + "grad_norm": 0.6705875992774963, + "learning_rate": 4.9985700025157236e-05, + "loss": 0.071, + "num_input_tokens_seen": 14554112, + "step": 6725 + }, + { + "epoch": 1.097879282218597, + "grad_norm": 0.7697087526321411, + "learning_rate": 4.998545829506263e-05, + "loss": 0.078, + "num_input_tokens_seen": 14564096, + "step": 6730 + }, + { + "epoch": 1.098694942903752, + "grad_norm": 0.22131772339344025, + "learning_rate": 4.998521453951999e-05, + "loss": 0.1209, + "num_input_tokens_seen": 14574592, + "step": 6735 + }, + { + "epoch": 1.099510603588907, + "grad_norm": 0.05516809597611427, + "learning_rate": 4.998496875854908e-05, + "loss": 0.0336, + "num_input_tokens_seen": 14585728, + "step": 6740 + }, + { + "epoch": 1.100326264274062, + "grad_norm": 0.6439073085784912, + "learning_rate": 4.998472095216984e-05, + "loss": 0.2168, + "num_input_tokens_seen": 14594464, + "step": 6745 + }, + { + "epoch": 1.101141924959217, + "grad_norm": 1.1700901985168457, + "learning_rate": 4.998447112040235e-05, + "loss": 0.1454, + "num_input_tokens_seen": 14604480, + "step": 6750 + }, + { + "epoch": 1.101957585644372, + "grad_norm": 1.356703758239746, + "learning_rate": 4.998421926326685e-05, + "loss": 0.0969, + "num_input_tokens_seen": 14616128, + "step": 6755 + }, + { + "epoch": 1.102773246329527, + "grad_norm": 0.047582969069480896, + "learning_rate": 4.998396538078378e-05, + "loss": 0.0498, + "num_input_tokens_seen": 14627008, + "step": 6760 + }, + { + "epoch": 1.1035889070146818, + "grad_norm": 0.5306851267814636, + "learning_rate": 4.99837094729737e-05, + "loss": 0.2199, + "num_input_tokens_seen": 14637280, + "step": 6765 + }, + { + "epoch": 1.1044045676998369, + "grad_norm": 2.032977342605591, + "learning_rate": 4.998345153985738e-05, + "loss": 0.1974, + "num_input_tokens_seen": 14646976, + "step": 6770 + }, + { + "epoch": 1.105220228384992, + "grad_norm": 5.861591339111328, + "learning_rate": 4.998319158145569e-05, + "loss": 0.1176, + "num_input_tokens_seen": 14657152, + "step": 6775 + }, + { + "epoch": 1.1060358890701467, + "grad_norm": 0.5621002912521362, + "learning_rate": 4.998292959778974e-05, + "loss": 0.288, + "num_input_tokens_seen": 14667488, + "step": 6780 + }, + { + "epoch": 1.1068515497553018, + "grad_norm": 0.8230745792388916, + "learning_rate": 4.9982665588880753e-05, + "loss": 0.2166, + "num_input_tokens_seen": 14678112, + "step": 6785 + }, + { + "epoch": 1.1076672104404568, + "grad_norm": 0.047583892941474915, + "learning_rate": 4.9982399554750136e-05, + "loss": 0.0652, + "num_input_tokens_seen": 14689312, + "step": 6790 + }, + { + "epoch": 1.1084828711256118, + "grad_norm": 0.11870197206735611, + "learning_rate": 4.998213149541945e-05, + "loss": 0.2976, + "num_input_tokens_seen": 14700032, + "step": 6795 + }, + { + "epoch": 1.1092985318107667, + "grad_norm": 0.6663447618484497, + "learning_rate": 4.9981861410910424e-05, + "loss": 0.2148, + "num_input_tokens_seen": 14711008, + "step": 6800 + }, + { + "epoch": 1.1101141924959217, + "grad_norm": 0.050990715622901917, + "learning_rate": 4.9981589301244956e-05, + "loss": 0.0831, + "num_input_tokens_seen": 14720448, + "step": 6805 + }, + { + "epoch": 1.1109298531810767, + "grad_norm": 0.11604470014572144, + "learning_rate": 4.99813151664451e-05, + "loss": 0.098, + "num_input_tokens_seen": 14732128, + "step": 6810 + }, + { + "epoch": 1.1117455138662315, + "grad_norm": 0.08551973104476929, + "learning_rate": 4.998103900653309e-05, + "loss": 0.121, + "num_input_tokens_seen": 14743200, + "step": 6815 + }, + { + "epoch": 1.1125611745513866, + "grad_norm": 1.8084325790405273, + "learning_rate": 4.9980760821531304e-05, + "loss": 0.2622, + "num_input_tokens_seen": 14754464, + "step": 6820 + }, + { + "epoch": 1.1133768352365416, + "grad_norm": 0.7951775193214417, + "learning_rate": 4.998048061146229e-05, + "loss": 0.2508, + "num_input_tokens_seen": 14763840, + "step": 6825 + }, + { + "epoch": 1.1141924959216967, + "grad_norm": 0.238115131855011, + "learning_rate": 4.9980198376348774e-05, + "loss": 0.102, + "num_input_tokens_seen": 14774304, + "step": 6830 + }, + { + "epoch": 1.1150081566068515, + "grad_norm": 1.3298671245574951, + "learning_rate": 4.997991411621362e-05, + "loss": 0.0776, + "num_input_tokens_seen": 14783456, + "step": 6835 + }, + { + "epoch": 1.1158238172920065, + "grad_norm": 0.053829267621040344, + "learning_rate": 4.9979627831079894e-05, + "loss": 0.0313, + "num_input_tokens_seen": 14793664, + "step": 6840 + }, + { + "epoch": 1.1166394779771616, + "grad_norm": 1.8658376932144165, + "learning_rate": 4.997933952097078e-05, + "loss": 0.132, + "num_input_tokens_seen": 14804256, + "step": 6845 + }, + { + "epoch": 1.1174551386623164, + "grad_norm": 0.38412636518478394, + "learning_rate": 4.997904918590966e-05, + "loss": 0.2334, + "num_input_tokens_seen": 14815744, + "step": 6850 + }, + { + "epoch": 1.1182707993474714, + "grad_norm": 0.24277693033218384, + "learning_rate": 4.997875682592008e-05, + "loss": 0.0553, + "num_input_tokens_seen": 14826304, + "step": 6855 + }, + { + "epoch": 1.1190864600326265, + "grad_norm": 0.733185887336731, + "learning_rate": 4.997846244102573e-05, + "loss": 0.2171, + "num_input_tokens_seen": 14838016, + "step": 6860 + }, + { + "epoch": 1.1199021207177815, + "grad_norm": 0.9400540590286255, + "learning_rate": 4.997816603125047e-05, + "loss": 0.0612, + "num_input_tokens_seen": 14849344, + "step": 6865 + }, + { + "epoch": 1.1207177814029363, + "grad_norm": 0.7809391021728516, + "learning_rate": 4.9977867596618333e-05, + "loss": 0.2377, + "num_input_tokens_seen": 14860608, + "step": 6870 + }, + { + "epoch": 1.1215334420880914, + "grad_norm": 1.7023645639419556, + "learning_rate": 4.997756713715352e-05, + "loss": 0.0998, + "num_input_tokens_seen": 14870976, + "step": 6875 + }, + { + "epoch": 1.1223491027732464, + "grad_norm": 0.807488739490509, + "learning_rate": 4.997726465288037e-05, + "loss": 0.0937, + "num_input_tokens_seen": 14880960, + "step": 6880 + }, + { + "epoch": 1.1231647634584012, + "grad_norm": 0.03930335119366646, + "learning_rate": 4.997696014382341e-05, + "loss": 0.144, + "num_input_tokens_seen": 14892256, + "step": 6885 + }, + { + "epoch": 1.1239804241435563, + "grad_norm": 1.7069370746612549, + "learning_rate": 4.997665361000735e-05, + "loss": 0.3029, + "num_input_tokens_seen": 14902304, + "step": 6890 + }, + { + "epoch": 1.1247960848287113, + "grad_norm": 0.17382943630218506, + "learning_rate": 4.9976345051456995e-05, + "loss": 0.1372, + "num_input_tokens_seen": 14913536, + "step": 6895 + }, + { + "epoch": 1.1256117455138663, + "grad_norm": 1.1773525476455688, + "learning_rate": 4.99760344681974e-05, + "loss": 0.1922, + "num_input_tokens_seen": 14924480, + "step": 6900 + }, + { + "epoch": 1.1264274061990212, + "grad_norm": 0.7289412021636963, + "learning_rate": 4.997572186025371e-05, + "loss": 0.1316, + "num_input_tokens_seen": 14935264, + "step": 6905 + }, + { + "epoch": 1.1272430668841762, + "grad_norm": 1.301597237586975, + "learning_rate": 4.997540722765128e-05, + "loss": 0.1229, + "num_input_tokens_seen": 14947072, + "step": 6910 + }, + { + "epoch": 1.1280587275693312, + "grad_norm": 0.6722368597984314, + "learning_rate": 4.997509057041563e-05, + "loss": 0.1798, + "num_input_tokens_seen": 14957888, + "step": 6915 + }, + { + "epoch": 1.128874388254486, + "grad_norm": 1.1180039644241333, + "learning_rate": 4.99747718885724e-05, + "loss": 0.22, + "num_input_tokens_seen": 14967424, + "step": 6920 + }, + { + "epoch": 1.129690048939641, + "grad_norm": 2.016648769378662, + "learning_rate": 4.9974451182147456e-05, + "loss": 0.1344, + "num_input_tokens_seen": 14977536, + "step": 6925 + }, + { + "epoch": 1.1305057096247961, + "grad_norm": 0.24831824004650116, + "learning_rate": 4.997412845116677e-05, + "loss": 0.0525, + "num_input_tokens_seen": 14987008, + "step": 6930 + }, + { + "epoch": 1.131321370309951, + "grad_norm": 0.13983696699142456, + "learning_rate": 4.997380369565652e-05, + "loss": 0.351, + "num_input_tokens_seen": 14998272, + "step": 6935 + }, + { + "epoch": 1.132137030995106, + "grad_norm": 1.9715735912322998, + "learning_rate": 4.9973476915643015e-05, + "loss": 0.2973, + "num_input_tokens_seen": 15007392, + "step": 6940 + }, + { + "epoch": 1.132952691680261, + "grad_norm": 2.0275111198425293, + "learning_rate": 4.997314811115277e-05, + "loss": 0.3122, + "num_input_tokens_seen": 15016608, + "step": 6945 + }, + { + "epoch": 1.133768352365416, + "grad_norm": 1.2770103216171265, + "learning_rate": 4.997281728221242e-05, + "loss": 0.1898, + "num_input_tokens_seen": 15027712, + "step": 6950 + }, + { + "epoch": 1.1345840130505709, + "grad_norm": 0.6556134819984436, + "learning_rate": 4.997248442884879e-05, + "loss": 0.1061, + "num_input_tokens_seen": 15038080, + "step": 6955 + }, + { + "epoch": 1.135399673735726, + "grad_norm": 0.29350876808166504, + "learning_rate": 4.997214955108887e-05, + "loss": 0.1782, + "num_input_tokens_seen": 15050432, + "step": 6960 + }, + { + "epoch": 1.136215334420881, + "grad_norm": 0.2736978828907013, + "learning_rate": 4.9971812648959796e-05, + "loss": 0.1679, + "num_input_tokens_seen": 15060992, + "step": 6965 + }, + { + "epoch": 1.137030995106036, + "grad_norm": 0.7794458866119385, + "learning_rate": 4.997147372248887e-05, + "loss": 0.1139, + "num_input_tokens_seen": 15070816, + "step": 6970 + }, + { + "epoch": 1.1378466557911908, + "grad_norm": 0.3859100043773651, + "learning_rate": 4.99711327717036e-05, + "loss": 0.1378, + "num_input_tokens_seen": 15080128, + "step": 6975 + }, + { + "epoch": 1.1386623164763459, + "grad_norm": 0.21588242053985596, + "learning_rate": 4.997078979663159e-05, + "loss": 0.1239, + "num_input_tokens_seen": 15090464, + "step": 6980 + }, + { + "epoch": 1.139477977161501, + "grad_norm": 1.0051765441894531, + "learning_rate": 4.997044479730067e-05, + "loss": 0.2535, + "num_input_tokens_seen": 15101472, + "step": 6985 + }, + { + "epoch": 1.1402936378466557, + "grad_norm": 1.1905962228775024, + "learning_rate": 4.997009777373879e-05, + "loss": 0.1208, + "num_input_tokens_seen": 15111616, + "step": 6990 + }, + { + "epoch": 1.1411092985318108, + "grad_norm": 1.3619177341461182, + "learning_rate": 4.9969748725974085e-05, + "loss": 0.26, + "num_input_tokens_seen": 15122496, + "step": 6995 + }, + { + "epoch": 1.1419249592169658, + "grad_norm": 0.23088783025741577, + "learning_rate": 4.996939765403486e-05, + "loss": 0.1522, + "num_input_tokens_seen": 15134112, + "step": 7000 + }, + { + "epoch": 1.1427406199021206, + "grad_norm": 1.2237498760223389, + "learning_rate": 4.996904455794956e-05, + "loss": 0.1085, + "num_input_tokens_seen": 15144352, + "step": 7005 + }, + { + "epoch": 1.1435562805872757, + "grad_norm": 0.619831919670105, + "learning_rate": 4.996868943774683e-05, + "loss": 0.1502, + "num_input_tokens_seen": 15154304, + "step": 7010 + }, + { + "epoch": 1.1443719412724307, + "grad_norm": 0.6305994987487793, + "learning_rate": 4.9968332293455433e-05, + "loss": 0.0921, + "num_input_tokens_seen": 15165024, + "step": 7015 + }, + { + "epoch": 1.1451876019575857, + "grad_norm": 1.0472581386566162, + "learning_rate": 4.996797312510433e-05, + "loss": 0.1807, + "num_input_tokens_seen": 15176224, + "step": 7020 + }, + { + "epoch": 1.1460032626427405, + "grad_norm": 1.7032833099365234, + "learning_rate": 4.9967611932722645e-05, + "loss": 0.1155, + "num_input_tokens_seen": 15186464, + "step": 7025 + }, + { + "epoch": 1.1468189233278956, + "grad_norm": 1.0751842260360718, + "learning_rate": 4.9967248716339656e-05, + "loss": 0.1861, + "num_input_tokens_seen": 15197408, + "step": 7030 + }, + { + "epoch": 1.1476345840130506, + "grad_norm": 0.5629591941833496, + "learning_rate": 4.9966883475984796e-05, + "loss": 0.1064, + "num_input_tokens_seen": 15209504, + "step": 7035 + }, + { + "epoch": 1.1484502446982057, + "grad_norm": 1.2936196327209473, + "learning_rate": 4.996651621168768e-05, + "loss": 0.2456, + "num_input_tokens_seen": 15220576, + "step": 7040 + }, + { + "epoch": 1.1492659053833605, + "grad_norm": 0.5926993489265442, + "learning_rate": 4.9966146923478086e-05, + "loss": 0.1263, + "num_input_tokens_seen": 15230752, + "step": 7045 + }, + { + "epoch": 1.1500815660685155, + "grad_norm": 1.2118645906448364, + "learning_rate": 4.996577561138594e-05, + "loss": 0.0749, + "num_input_tokens_seen": 15241312, + "step": 7050 + }, + { + "epoch": 1.1508972267536706, + "grad_norm": 1.2220784425735474, + "learning_rate": 4.996540227544136e-05, + "loss": 0.14, + "num_input_tokens_seen": 15252512, + "step": 7055 + }, + { + "epoch": 1.1517128874388254, + "grad_norm": 0.7331756949424744, + "learning_rate": 4.9965026915674584e-05, + "loss": 0.0871, + "num_input_tokens_seen": 15262400, + "step": 7060 + }, + { + "epoch": 1.1525285481239804, + "grad_norm": 1.393388032913208, + "learning_rate": 4.9964649532116065e-05, + "loss": 0.2159, + "num_input_tokens_seen": 15274816, + "step": 7065 + }, + { + "epoch": 1.1533442088091355, + "grad_norm": 0.09148511290550232, + "learning_rate": 4.996427012479638e-05, + "loss": 0.1077, + "num_input_tokens_seen": 15285248, + "step": 7070 + }, + { + "epoch": 1.1541598694942903, + "grad_norm": 1.4246196746826172, + "learning_rate": 4.9963888693746294e-05, + "loss": 0.0709, + "num_input_tokens_seen": 15295552, + "step": 7075 + }, + { + "epoch": 1.1549755301794453, + "grad_norm": 0.508366584777832, + "learning_rate": 4.996350523899672e-05, + "loss": 0.0815, + "num_input_tokens_seen": 15305248, + "step": 7080 + }, + { + "epoch": 1.1557911908646004, + "grad_norm": 0.15868118405342102, + "learning_rate": 4.9963119760578756e-05, + "loss": 0.0432, + "num_input_tokens_seen": 15316576, + "step": 7085 + }, + { + "epoch": 1.1566068515497552, + "grad_norm": 0.045865725725889206, + "learning_rate": 4.996273225852364e-05, + "loss": 0.1516, + "num_input_tokens_seen": 15328256, + "step": 7090 + }, + { + "epoch": 1.1574225122349102, + "grad_norm": 0.9741826057434082, + "learning_rate": 4.996234273286278e-05, + "loss": 0.2447, + "num_input_tokens_seen": 15338752, + "step": 7095 + }, + { + "epoch": 1.1582381729200653, + "grad_norm": 0.1629514843225479, + "learning_rate": 4.996195118362777e-05, + "loss": 0.0492, + "num_input_tokens_seen": 15348224, + "step": 7100 + }, + { + "epoch": 1.1590538336052203, + "grad_norm": 0.2998805046081543, + "learning_rate": 4.996155761085034e-05, + "loss": 0.0985, + "num_input_tokens_seen": 15359008, + "step": 7105 + }, + { + "epoch": 1.1598694942903751, + "grad_norm": 0.21384000778198242, + "learning_rate": 4.996116201456239e-05, + "loss": 0.0541, + "num_input_tokens_seen": 15369408, + "step": 7110 + }, + { + "epoch": 1.1606851549755302, + "grad_norm": 0.2932111620903015, + "learning_rate": 4.9960764394796e-05, + "loss": 0.0156, + "num_input_tokens_seen": 15379392, + "step": 7115 + }, + { + "epoch": 1.1615008156606852, + "grad_norm": 0.5352867245674133, + "learning_rate": 4.99603647515834e-05, + "loss": 0.1227, + "num_input_tokens_seen": 15389920, + "step": 7120 + }, + { + "epoch": 1.1623164763458402, + "grad_norm": 0.3886188864707947, + "learning_rate": 4.9959963084956986e-05, + "loss": 0.1532, + "num_input_tokens_seen": 15400896, + "step": 7125 + }, + { + "epoch": 1.163132137030995, + "grad_norm": 0.546933650970459, + "learning_rate": 4.9959559394949315e-05, + "loss": 0.0707, + "num_input_tokens_seen": 15412320, + "step": 7130 + }, + { + "epoch": 1.16394779771615, + "grad_norm": 0.0992317721247673, + "learning_rate": 4.9959153681593114e-05, + "loss": 0.0966, + "num_input_tokens_seen": 15423680, + "step": 7135 + }, + { + "epoch": 1.1647634584013051, + "grad_norm": 0.29832249879837036, + "learning_rate": 4.9958745944921275e-05, + "loss": 0.1239, + "num_input_tokens_seen": 15433952, + "step": 7140 + }, + { + "epoch": 1.16557911908646, + "grad_norm": 0.6798011064529419, + "learning_rate": 4.995833618496685e-05, + "loss": 0.1939, + "num_input_tokens_seen": 15444896, + "step": 7145 + }, + { + "epoch": 1.166394779771615, + "grad_norm": 0.2682628035545349, + "learning_rate": 4.9957924401763065e-05, + "loss": 0.1, + "num_input_tokens_seen": 15456192, + "step": 7150 + }, + { + "epoch": 1.16721044045677, + "grad_norm": 2.1139400005340576, + "learning_rate": 4.9957510595343285e-05, + "loss": 0.2636, + "num_input_tokens_seen": 15465312, + "step": 7155 + }, + { + "epoch": 1.1680261011419248, + "grad_norm": 0.21100583672523499, + "learning_rate": 4.995709476574106e-05, + "loss": 0.1281, + "num_input_tokens_seen": 15474496, + "step": 7160 + }, + { + "epoch": 1.1688417618270799, + "grad_norm": 1.7074402570724487, + "learning_rate": 4.9956676912990105e-05, + "loss": 0.1582, + "num_input_tokens_seen": 15484992, + "step": 7165 + }, + { + "epoch": 1.169657422512235, + "grad_norm": 0.3699171245098114, + "learning_rate": 4.99562570371243e-05, + "loss": 0.0855, + "num_input_tokens_seen": 15495232, + "step": 7170 + }, + { + "epoch": 1.17047308319739, + "grad_norm": 0.12341846525669098, + "learning_rate": 4.9955835138177667e-05, + "loss": 0.1398, + "num_input_tokens_seen": 15506304, + "step": 7175 + }, + { + "epoch": 1.1712887438825448, + "grad_norm": 0.3367205858230591, + "learning_rate": 4.9955411216184414e-05, + "loss": 0.1967, + "num_input_tokens_seen": 15517248, + "step": 7180 + }, + { + "epoch": 1.1721044045676998, + "grad_norm": 0.649776816368103, + "learning_rate": 4.9954985271178903e-05, + "loss": 0.0662, + "num_input_tokens_seen": 15527040, + "step": 7185 + }, + { + "epoch": 1.1729200652528549, + "grad_norm": 1.0134059190750122, + "learning_rate": 4.995455730319566e-05, + "loss": 0.251, + "num_input_tokens_seen": 15538368, + "step": 7190 + }, + { + "epoch": 1.17373572593801, + "grad_norm": 0.3646966516971588, + "learning_rate": 4.9954127312269386e-05, + "loss": 0.0361, + "num_input_tokens_seen": 15548608, + "step": 7195 + }, + { + "epoch": 1.1745513866231647, + "grad_norm": 0.26996538043022156, + "learning_rate": 4.9953695298434944e-05, + "loss": 0.1934, + "num_input_tokens_seen": 15560032, + "step": 7200 + }, + { + "epoch": 1.1753670473083198, + "grad_norm": 0.6783331632614136, + "learning_rate": 4.9953261261727334e-05, + "loss": 0.1632, + "num_input_tokens_seen": 15571616, + "step": 7205 + }, + { + "epoch": 1.1761827079934748, + "grad_norm": 0.19382016360759735, + "learning_rate": 4.9952825202181766e-05, + "loss": 0.1233, + "num_input_tokens_seen": 15583392, + "step": 7210 + }, + { + "epoch": 1.1769983686786296, + "grad_norm": 0.26040735840797424, + "learning_rate": 4.995238711983358e-05, + "loss": 0.2527, + "num_input_tokens_seen": 15593504, + "step": 7215 + }, + { + "epoch": 1.1778140293637847, + "grad_norm": 0.6082053780555725, + "learning_rate": 4.995194701471828e-05, + "loss": 0.0826, + "num_input_tokens_seen": 15604928, + "step": 7220 + }, + { + "epoch": 1.1786296900489397, + "grad_norm": 1.430023431777954, + "learning_rate": 4.9951504886871545e-05, + "loss": 0.1768, + "num_input_tokens_seen": 15615200, + "step": 7225 + }, + { + "epoch": 1.1794453507340945, + "grad_norm": 0.6038452982902527, + "learning_rate": 4.995106073632924e-05, + "loss": 0.0599, + "num_input_tokens_seen": 15626176, + "step": 7230 + }, + { + "epoch": 1.1802610114192496, + "grad_norm": 0.7641096115112305, + "learning_rate": 4.995061456312733e-05, + "loss": 0.0991, + "num_input_tokens_seen": 15636768, + "step": 7235 + }, + { + "epoch": 1.1810766721044046, + "grad_norm": 0.8587701320648193, + "learning_rate": 4.995016636730202e-05, + "loss": 0.1913, + "num_input_tokens_seen": 15648512, + "step": 7240 + }, + { + "epoch": 1.1818923327895596, + "grad_norm": 0.8954089879989624, + "learning_rate": 4.994971614888962e-05, + "loss": 0.0791, + "num_input_tokens_seen": 15659328, + "step": 7245 + }, + { + "epoch": 1.1827079934747144, + "grad_norm": 0.25724783539772034, + "learning_rate": 4.994926390792664e-05, + "loss": 0.054, + "num_input_tokens_seen": 15669920, + "step": 7250 + }, + { + "epoch": 1.1835236541598695, + "grad_norm": 0.4564119577407837, + "learning_rate": 4.9948809644449734e-05, + "loss": 0.2072, + "num_input_tokens_seen": 15681152, + "step": 7255 + }, + { + "epoch": 1.1843393148450245, + "grad_norm": 1.4996201992034912, + "learning_rate": 4.994835335849573e-05, + "loss": 0.12, + "num_input_tokens_seen": 15690912, + "step": 7260 + }, + { + "epoch": 1.1851549755301796, + "grad_norm": 1.4564710855484009, + "learning_rate": 4.994789505010161e-05, + "loss": 0.1581, + "num_input_tokens_seen": 15701216, + "step": 7265 + }, + { + "epoch": 1.1859706362153344, + "grad_norm": 0.9515605568885803, + "learning_rate": 4.994743471930454e-05, + "loss": 0.185, + "num_input_tokens_seen": 15712000, + "step": 7270 + }, + { + "epoch": 1.1867862969004894, + "grad_norm": 1.1618497371673584, + "learning_rate": 4.994697236614183e-05, + "loss": 0.2476, + "num_input_tokens_seen": 15722848, + "step": 7275 + }, + { + "epoch": 1.1876019575856445, + "grad_norm": 0.47113335132598877, + "learning_rate": 4.994650799065096e-05, + "loss": 0.1981, + "num_input_tokens_seen": 15734816, + "step": 7280 + }, + { + "epoch": 1.1884176182707993, + "grad_norm": 0.528962254524231, + "learning_rate": 4.9946041592869576e-05, + "loss": 0.3232, + "num_input_tokens_seen": 15745664, + "step": 7285 + }, + { + "epoch": 1.1892332789559543, + "grad_norm": 2.544632911682129, + "learning_rate": 4.994557317283548e-05, + "loss": 0.1918, + "num_input_tokens_seen": 15756896, + "step": 7290 + }, + { + "epoch": 1.1900489396411094, + "grad_norm": 0.4806921184062958, + "learning_rate": 4.9945102730586655e-05, + "loss": 0.1748, + "num_input_tokens_seen": 15766656, + "step": 7295 + }, + { + "epoch": 1.1908646003262642, + "grad_norm": 1.549131989479065, + "learning_rate": 4.994463026616123e-05, + "loss": 0.2553, + "num_input_tokens_seen": 15777792, + "step": 7300 + }, + { + "epoch": 1.1916802610114192, + "grad_norm": 0.5866542458534241, + "learning_rate": 4.994415577959751e-05, + "loss": 0.2183, + "num_input_tokens_seen": 15787968, + "step": 7305 + }, + { + "epoch": 1.1924959216965743, + "grad_norm": 1.180175542831421, + "learning_rate": 4.9943679270933954e-05, + "loss": 0.2973, + "num_input_tokens_seen": 15797920, + "step": 7310 + }, + { + "epoch": 1.1933115823817293, + "grad_norm": 0.43190494179725647, + "learning_rate": 4.99432007402092e-05, + "loss": 0.1577, + "num_input_tokens_seen": 15809376, + "step": 7315 + }, + { + "epoch": 1.1941272430668841, + "grad_norm": 2.9713072776794434, + "learning_rate": 4.9942720187462025e-05, + "loss": 0.1272, + "num_input_tokens_seen": 15821408, + "step": 7320 + }, + { + "epoch": 1.1949429037520392, + "grad_norm": 0.32094860076904297, + "learning_rate": 4.9942237612731395e-05, + "loss": 0.1251, + "num_input_tokens_seen": 15830816, + "step": 7325 + }, + { + "epoch": 1.1957585644371942, + "grad_norm": 0.33393827080726624, + "learning_rate": 4.994175301605644e-05, + "loss": 0.0512, + "num_input_tokens_seen": 15841792, + "step": 7330 + }, + { + "epoch": 1.196574225122349, + "grad_norm": 0.24921761453151703, + "learning_rate": 4.9941266397476414e-05, + "loss": 0.3004, + "num_input_tokens_seen": 15852768, + "step": 7335 + }, + { + "epoch": 1.197389885807504, + "grad_norm": 0.3066304624080658, + "learning_rate": 4.9940777757030796e-05, + "loss": 0.087, + "num_input_tokens_seen": 15863680, + "step": 7340 + }, + { + "epoch": 1.198205546492659, + "grad_norm": 0.7808701395988464, + "learning_rate": 4.994028709475917e-05, + "loss": 0.1463, + "num_input_tokens_seen": 15874976, + "step": 7345 + }, + { + "epoch": 1.1990212071778141, + "grad_norm": 0.1130456030368805, + "learning_rate": 4.993979441070135e-05, + "loss": 0.0395, + "num_input_tokens_seen": 15885408, + "step": 7350 + }, + { + "epoch": 1.199836867862969, + "grad_norm": 2.009401559829712, + "learning_rate": 4.9939299704897236e-05, + "loss": 0.1948, + "num_input_tokens_seen": 15895168, + "step": 7355 + }, + { + "epoch": 1.200652528548124, + "grad_norm": 0.9587863683700562, + "learning_rate": 4.993880297738694e-05, + "loss": 0.1424, + "num_input_tokens_seen": 15906240, + "step": 7360 + }, + { + "epoch": 1.201468189233279, + "grad_norm": 2.167450428009033, + "learning_rate": 4.9938304228210754e-05, + "loss": 0.1603, + "num_input_tokens_seen": 15917024, + "step": 7365 + }, + { + "epoch": 1.2022838499184338, + "grad_norm": 2.505861759185791, + "learning_rate": 4.9937803457409084e-05, + "loss": 0.292, + "num_input_tokens_seen": 15926496, + "step": 7370 + }, + { + "epoch": 1.2030995106035889, + "grad_norm": 1.0467236042022705, + "learning_rate": 4.9937300665022535e-05, + "loss": 0.0955, + "num_input_tokens_seen": 15937600, + "step": 7375 + }, + { + "epoch": 1.203915171288744, + "grad_norm": 1.475075602531433, + "learning_rate": 4.9936795851091854e-05, + "loss": 0.0931, + "num_input_tokens_seen": 15948064, + "step": 7380 + }, + { + "epoch": 1.2047308319738987, + "grad_norm": 0.07853451371192932, + "learning_rate": 4.993628901565799e-05, + "loss": 0.1031, + "num_input_tokens_seen": 15958752, + "step": 7385 + }, + { + "epoch": 1.2055464926590538, + "grad_norm": 2.1304450035095215, + "learning_rate": 4.9935780158762e-05, + "loss": 0.2477, + "num_input_tokens_seen": 15970080, + "step": 7390 + }, + { + "epoch": 1.2063621533442088, + "grad_norm": 0.3613418936729431, + "learning_rate": 4.993526928044515e-05, + "loss": 0.1021, + "num_input_tokens_seen": 15982080, + "step": 7395 + }, + { + "epoch": 1.2071778140293639, + "grad_norm": 0.36740222573280334, + "learning_rate": 4.9934756380748846e-05, + "loss": 0.0634, + "num_input_tokens_seen": 15993056, + "step": 7400 + }, + { + "epoch": 1.2079934747145187, + "grad_norm": 1.30686354637146, + "learning_rate": 4.993424145971468e-05, + "loss": 0.1026, + "num_input_tokens_seen": 16003584, + "step": 7405 + }, + { + "epoch": 1.2088091353996737, + "grad_norm": 1.3798381090164185, + "learning_rate": 4.993372451738439e-05, + "loss": 0.1793, + "num_input_tokens_seen": 16014912, + "step": 7410 + }, + { + "epoch": 1.2096247960848288, + "grad_norm": 0.5113927721977234, + "learning_rate": 4.993320555379987e-05, + "loss": 0.1199, + "num_input_tokens_seen": 16026304, + "step": 7415 + }, + { + "epoch": 1.2104404567699838, + "grad_norm": 0.5137270092964172, + "learning_rate": 4.9932684569003205e-05, + "loss": 0.0953, + "num_input_tokens_seen": 16038464, + "step": 7420 + }, + { + "epoch": 1.2112561174551386, + "grad_norm": 2.1859731674194336, + "learning_rate": 4.993216156303662e-05, + "loss": 0.1282, + "num_input_tokens_seen": 16049728, + "step": 7425 + }, + { + "epoch": 1.2120717781402937, + "grad_norm": 2.0353329181671143, + "learning_rate": 4.9931636535942506e-05, + "loss": 0.4122, + "num_input_tokens_seen": 16060032, + "step": 7430 + }, + { + "epoch": 1.2128874388254487, + "grad_norm": 2.7048187255859375, + "learning_rate": 4.993110948776344e-05, + "loss": 0.1216, + "num_input_tokens_seen": 16072064, + "step": 7435 + }, + { + "epoch": 1.2137030995106035, + "grad_norm": 0.8658931255340576, + "learning_rate": 4.993058041854214e-05, + "loss": 0.1384, + "num_input_tokens_seen": 16082112, + "step": 7440 + }, + { + "epoch": 1.2145187601957586, + "grad_norm": 0.2680533230304718, + "learning_rate": 4.9930049328321495e-05, + "loss": 0.2049, + "num_input_tokens_seen": 16093280, + "step": 7445 + }, + { + "epoch": 1.2153344208809136, + "grad_norm": 1.7213209867477417, + "learning_rate": 4.9929516217144554e-05, + "loss": 0.2732, + "num_input_tokens_seen": 16104352, + "step": 7450 + }, + { + "epoch": 1.2161500815660684, + "grad_norm": 0.29289597272872925, + "learning_rate": 4.992898108505454e-05, + "loss": 0.0802, + "num_input_tokens_seen": 16114784, + "step": 7455 + }, + { + "epoch": 1.2169657422512234, + "grad_norm": 0.5455002188682556, + "learning_rate": 4.992844393209483e-05, + "loss": 0.1129, + "num_input_tokens_seen": 16126080, + "step": 7460 + }, + { + "epoch": 1.2177814029363785, + "grad_norm": 1.0198004245758057, + "learning_rate": 4.992790475830896e-05, + "loss": 0.2508, + "num_input_tokens_seen": 16136480, + "step": 7465 + }, + { + "epoch": 1.2185970636215335, + "grad_norm": 1.7931045293807983, + "learning_rate": 4.992736356374066e-05, + "loss": 0.3175, + "num_input_tokens_seen": 16147424, + "step": 7470 + }, + { + "epoch": 1.2194127243066883, + "grad_norm": 0.43799713253974915, + "learning_rate": 4.992682034843379e-05, + "loss": 0.2348, + "num_input_tokens_seen": 16157280, + "step": 7475 + }, + { + "epoch": 1.2202283849918434, + "grad_norm": 1.3121196031570435, + "learning_rate": 4.992627511243238e-05, + "loss": 0.1318, + "num_input_tokens_seen": 16168320, + "step": 7480 + }, + { + "epoch": 1.2210440456769984, + "grad_norm": 0.10270816087722778, + "learning_rate": 4.992572785578063e-05, + "loss": 0.0784, + "num_input_tokens_seen": 16178976, + "step": 7485 + }, + { + "epoch": 1.2218597063621535, + "grad_norm": 0.07832954078912735, + "learning_rate": 4.9925178578522914e-05, + "loss": 0.1405, + "num_input_tokens_seen": 16190208, + "step": 7490 + }, + { + "epoch": 1.2226753670473083, + "grad_norm": 0.5278720855712891, + "learning_rate": 4.992462728070375e-05, + "loss": 0.2194, + "num_input_tokens_seen": 16199488, + "step": 7495 + }, + { + "epoch": 1.2234910277324633, + "grad_norm": 0.27939480543136597, + "learning_rate": 4.992407396236784e-05, + "loss": 0.2035, + "num_input_tokens_seen": 16210432, + "step": 7500 + }, + { + "epoch": 1.2243066884176184, + "grad_norm": 0.9180857539176941, + "learning_rate": 4.992351862356003e-05, + "loss": 0.095, + "num_input_tokens_seen": 16221664, + "step": 7505 + }, + { + "epoch": 1.2251223491027732, + "grad_norm": 0.13531503081321716, + "learning_rate": 4.992296126432533e-05, + "loss": 0.1922, + "num_input_tokens_seen": 16232064, + "step": 7510 + }, + { + "epoch": 1.2259380097879282, + "grad_norm": 0.10459175705909729, + "learning_rate": 4.992240188470894e-05, + "loss": 0.1158, + "num_input_tokens_seen": 16242976, + "step": 7515 + }, + { + "epoch": 1.2267536704730833, + "grad_norm": 0.08982200920581818, + "learning_rate": 4.99218404847562e-05, + "loss": 0.2032, + "num_input_tokens_seen": 16253440, + "step": 7520 + }, + { + "epoch": 1.227569331158238, + "grad_norm": 0.5903329849243164, + "learning_rate": 4.9921277064512614e-05, + "loss": 0.1141, + "num_input_tokens_seen": 16263680, + "step": 7525 + }, + { + "epoch": 1.2283849918433931, + "grad_norm": 0.13364842534065247, + "learning_rate": 4.992071162402386e-05, + "loss": 0.266, + "num_input_tokens_seen": 16274944, + "step": 7530 + }, + { + "epoch": 1.2292006525285482, + "grad_norm": 0.5839671492576599, + "learning_rate": 4.992014416333577e-05, + "loss": 0.0877, + "num_input_tokens_seen": 16285376, + "step": 7535 + }, + { + "epoch": 1.2300163132137032, + "grad_norm": 1.150698184967041, + "learning_rate": 4.991957468249436e-05, + "loss": 0.1107, + "num_input_tokens_seen": 16296544, + "step": 7540 + }, + { + "epoch": 1.230831973898858, + "grad_norm": 0.32738006114959717, + "learning_rate": 4.991900318154578e-05, + "loss": 0.2051, + "num_input_tokens_seen": 16306752, + "step": 7545 + }, + { + "epoch": 1.231647634584013, + "grad_norm": 1.637184500694275, + "learning_rate": 4.991842966053637e-05, + "loss": 0.1699, + "num_input_tokens_seen": 16317536, + "step": 7550 + }, + { + "epoch": 1.232463295269168, + "grad_norm": 1.793697714805603, + "learning_rate": 4.991785411951261e-05, + "loss": 0.1461, + "num_input_tokens_seen": 16328640, + "step": 7555 + }, + { + "epoch": 1.233278955954323, + "grad_norm": 1.5189714431762695, + "learning_rate": 4.9917276558521164e-05, + "loss": 0.2252, + "num_input_tokens_seen": 16340032, + "step": 7560 + }, + { + "epoch": 1.234094616639478, + "grad_norm": 1.2340056896209717, + "learning_rate": 4.9916696977608855e-05, + "loss": 0.2405, + "num_input_tokens_seen": 16351328, + "step": 7565 + }, + { + "epoch": 1.234910277324633, + "grad_norm": 0.7973819971084595, + "learning_rate": 4.991611537682266e-05, + "loss": 0.1239, + "num_input_tokens_seen": 16360832, + "step": 7570 + }, + { + "epoch": 1.235725938009788, + "grad_norm": 0.13952307403087616, + "learning_rate": 4.991553175620973e-05, + "loss": 0.0385, + "num_input_tokens_seen": 16372640, + "step": 7575 + }, + { + "epoch": 1.2365415986949428, + "grad_norm": 0.35601678490638733, + "learning_rate": 4.991494611581738e-05, + "loss": 0.0679, + "num_input_tokens_seen": 16383360, + "step": 7580 + }, + { + "epoch": 1.2373572593800979, + "grad_norm": 0.7161945104598999, + "learning_rate": 4.9914358455693076e-05, + "loss": 0.1668, + "num_input_tokens_seen": 16393728, + "step": 7585 + }, + { + "epoch": 1.238172920065253, + "grad_norm": 2.169713258743286, + "learning_rate": 4.991376877588446e-05, + "loss": 0.2717, + "num_input_tokens_seen": 16405440, + "step": 7590 + }, + { + "epoch": 1.2389885807504077, + "grad_norm": 1.271612524986267, + "learning_rate": 4.991317707643934e-05, + "loss": 0.2383, + "num_input_tokens_seen": 16416672, + "step": 7595 + }, + { + "epoch": 1.2398042414355628, + "grad_norm": 1.0383329391479492, + "learning_rate": 4.991258335740568e-05, + "loss": 0.2126, + "num_input_tokens_seen": 16427136, + "step": 7600 + }, + { + "epoch": 1.2406199021207178, + "grad_norm": 0.4886912703514099, + "learning_rate": 4.99119876188316e-05, + "loss": 0.1222, + "num_input_tokens_seen": 16437984, + "step": 7605 + }, + { + "epoch": 1.2414355628058726, + "grad_norm": 0.422086238861084, + "learning_rate": 4.9911389860765406e-05, + "loss": 0.1822, + "num_input_tokens_seen": 16448640, + "step": 7610 + }, + { + "epoch": 1.2422512234910277, + "grad_norm": 1.4241257905960083, + "learning_rate": 4.9910790083255555e-05, + "loss": 0.2439, + "num_input_tokens_seen": 16457856, + "step": 7615 + }, + { + "epoch": 1.2430668841761827, + "grad_norm": 0.08201615512371063, + "learning_rate": 4.991018828635066e-05, + "loss": 0.0726, + "num_input_tokens_seen": 16469056, + "step": 7620 + }, + { + "epoch": 1.2438825448613378, + "grad_norm": 0.15234707295894623, + "learning_rate": 4.99095844700995e-05, + "loss": 0.1294, + "num_input_tokens_seen": 16479616, + "step": 7625 + }, + { + "epoch": 1.2446982055464926, + "grad_norm": 0.41127660870552063, + "learning_rate": 4.9908978634551045e-05, + "loss": 0.0704, + "num_input_tokens_seen": 16489952, + "step": 7630 + }, + { + "epoch": 1.2455138662316476, + "grad_norm": 0.31107231974601746, + "learning_rate": 4.990837077975439e-05, + "loss": 0.112, + "num_input_tokens_seen": 16501152, + "step": 7635 + }, + { + "epoch": 1.2463295269168027, + "grad_norm": 0.3260094225406647, + "learning_rate": 4.990776090575881e-05, + "loss": 0.0365, + "num_input_tokens_seen": 16512608, + "step": 7640 + }, + { + "epoch": 1.2471451876019577, + "grad_norm": 1.7661806344985962, + "learning_rate": 4.990714901261376e-05, + "loss": 0.1728, + "num_input_tokens_seen": 16524064, + "step": 7645 + }, + { + "epoch": 1.2479608482871125, + "grad_norm": 1.20936119556427, + "learning_rate": 4.990653510036883e-05, + "loss": 0.108, + "num_input_tokens_seen": 16534688, + "step": 7650 + }, + { + "epoch": 1.2487765089722676, + "grad_norm": 1.638940453529358, + "learning_rate": 4.99059191690738e-05, + "loss": 0.1414, + "num_input_tokens_seen": 16544928, + "step": 7655 + }, + { + "epoch": 1.2495921696574226, + "grad_norm": 1.4426946640014648, + "learning_rate": 4.9905301218778575e-05, + "loss": 0.2486, + "num_input_tokens_seen": 16555200, + "step": 7660 + }, + { + "epoch": 1.2504078303425774, + "grad_norm": 1.4018425941467285, + "learning_rate": 4.990468124953328e-05, + "loss": 0.2622, + "num_input_tokens_seen": 16566304, + "step": 7665 + }, + { + "epoch": 1.2512234910277324, + "grad_norm": 1.3975560665130615, + "learning_rate": 4.990405926138815e-05, + "loss": 0.2468, + "num_input_tokens_seen": 16578560, + "step": 7670 + }, + { + "epoch": 1.2520391517128875, + "grad_norm": 0.316589891910553, + "learning_rate": 4.9903435254393616e-05, + "loss": 0.0979, + "num_input_tokens_seen": 16588704, + "step": 7675 + }, + { + "epoch": 1.2528548123980423, + "grad_norm": 0.8762066960334778, + "learning_rate": 4.990280922860026e-05, + "loss": 0.0474, + "num_input_tokens_seen": 16599776, + "step": 7680 + }, + { + "epoch": 1.2536704730831973, + "grad_norm": 0.216337189078331, + "learning_rate": 4.990218118405883e-05, + "loss": 0.1011, + "num_input_tokens_seen": 16609824, + "step": 7685 + }, + { + "epoch": 1.2544861337683524, + "grad_norm": 1.617035984992981, + "learning_rate": 4.990155112082024e-05, + "loss": 0.2895, + "num_input_tokens_seen": 16621280, + "step": 7690 + }, + { + "epoch": 1.2553017944535072, + "grad_norm": 0.4105816185474396, + "learning_rate": 4.9900919038935564e-05, + "loss": 0.1266, + "num_input_tokens_seen": 16631776, + "step": 7695 + }, + { + "epoch": 1.2561174551386622, + "grad_norm": 2.1970608234405518, + "learning_rate": 4.9900284938456056e-05, + "loss": 0.1201, + "num_input_tokens_seen": 16643584, + "step": 7700 + }, + { + "epoch": 1.2569331158238173, + "grad_norm": 0.08868924528360367, + "learning_rate": 4.98996488194331e-05, + "loss": 0.0943, + "num_input_tokens_seen": 16654432, + "step": 7705 + }, + { + "epoch": 1.2577487765089723, + "grad_norm": 0.08532081544399261, + "learning_rate": 4.989901068191828e-05, + "loss": 0.1171, + "num_input_tokens_seen": 16664064, + "step": 7710 + }, + { + "epoch": 1.2585644371941274, + "grad_norm": 0.5159860253334045, + "learning_rate": 4.9898370525963314e-05, + "loss": 0.1174, + "num_input_tokens_seen": 16674432, + "step": 7715 + }, + { + "epoch": 1.2593800978792822, + "grad_norm": 1.3835340738296509, + "learning_rate": 4.9897728351620085e-05, + "loss": 0.1254, + "num_input_tokens_seen": 16685920, + "step": 7720 + }, + { + "epoch": 1.2601957585644372, + "grad_norm": 0.719484806060791, + "learning_rate": 4.989708415894069e-05, + "loss": 0.1113, + "num_input_tokens_seen": 16696320, + "step": 7725 + }, + { + "epoch": 1.2610114192495923, + "grad_norm": 0.33740100264549255, + "learning_rate": 4.9896437947977306e-05, + "loss": 0.0982, + "num_input_tokens_seen": 16707040, + "step": 7730 + }, + { + "epoch": 1.261827079934747, + "grad_norm": 1.6353504657745361, + "learning_rate": 4.989578971878235e-05, + "loss": 0.3064, + "num_input_tokens_seen": 16718208, + "step": 7735 + }, + { + "epoch": 1.2626427406199021, + "grad_norm": 1.0411760807037354, + "learning_rate": 4.9895139471408356e-05, + "loss": 0.0723, + "num_input_tokens_seen": 16728480, + "step": 7740 + }, + { + "epoch": 1.2634584013050572, + "grad_norm": 0.15755613148212433, + "learning_rate": 4.9894487205908044e-05, + "loss": 0.1486, + "num_input_tokens_seen": 16740384, + "step": 7745 + }, + { + "epoch": 1.264274061990212, + "grad_norm": 1.3491051197052002, + "learning_rate": 4.9893832922334285e-05, + "loss": 0.2727, + "num_input_tokens_seen": 16749888, + "step": 7750 + }, + { + "epoch": 1.265089722675367, + "grad_norm": 1.58836030960083, + "learning_rate": 4.989317662074011e-05, + "loss": 0.2815, + "num_input_tokens_seen": 16761024, + "step": 7755 + }, + { + "epoch": 1.265905383360522, + "grad_norm": 0.08986806869506836, + "learning_rate": 4.989251830117874e-05, + "loss": 0.0376, + "num_input_tokens_seen": 16771296, + "step": 7760 + }, + { + "epoch": 1.2667210440456769, + "grad_norm": 0.8587936162948608, + "learning_rate": 4.9891857963703535e-05, + "loss": 0.0914, + "num_input_tokens_seen": 16781984, + "step": 7765 + }, + { + "epoch": 1.267536704730832, + "grad_norm": 0.9356215000152588, + "learning_rate": 4.989119560836802e-05, + "loss": 0.1476, + "num_input_tokens_seen": 16793056, + "step": 7770 + }, + { + "epoch": 1.268352365415987, + "grad_norm": 0.548858642578125, + "learning_rate": 4.989053123522589e-05, + "loss": 0.1021, + "num_input_tokens_seen": 16805568, + "step": 7775 + }, + { + "epoch": 1.269168026101142, + "grad_norm": 0.2826305627822876, + "learning_rate": 4.988986484433101e-05, + "loss": 0.1, + "num_input_tokens_seen": 16815808, + "step": 7780 + }, + { + "epoch": 1.269983686786297, + "grad_norm": 0.15965187549591064, + "learning_rate": 4.988919643573739e-05, + "loss": 0.1995, + "num_input_tokens_seen": 16826272, + "step": 7785 + }, + { + "epoch": 1.2707993474714518, + "grad_norm": 2.4767823219299316, + "learning_rate": 4.9888526009499223e-05, + "loss": 0.1948, + "num_input_tokens_seen": 16836576, + "step": 7790 + }, + { + "epoch": 1.2716150081566069, + "grad_norm": 0.5599492192268372, + "learning_rate": 4.9887853565670854e-05, + "loss": 0.0478, + "num_input_tokens_seen": 16846720, + "step": 7795 + }, + { + "epoch": 1.272430668841762, + "grad_norm": 0.6882991790771484, + "learning_rate": 4.9887179104306796e-05, + "loss": 0.1607, + "num_input_tokens_seen": 16856960, + "step": 7800 + }, + { + "epoch": 1.2732463295269167, + "grad_norm": 0.5968769192695618, + "learning_rate": 4.988650262546173e-05, + "loss": 0.1883, + "num_input_tokens_seen": 16868512, + "step": 7805 + }, + { + "epoch": 1.2740619902120718, + "grad_norm": 0.1988547295331955, + "learning_rate": 4.9885824129190476e-05, + "loss": 0.1175, + "num_input_tokens_seen": 16879744, + "step": 7810 + }, + { + "epoch": 1.2748776508972268, + "grad_norm": 0.7639335989952087, + "learning_rate": 4.988514361554806e-05, + "loss": 0.1728, + "num_input_tokens_seen": 16889440, + "step": 7815 + }, + { + "epoch": 1.2756933115823816, + "grad_norm": 0.3733547329902649, + "learning_rate": 4.988446108458963e-05, + "loss": 0.1629, + "num_input_tokens_seen": 16900768, + "step": 7820 + }, + { + "epoch": 1.2765089722675367, + "grad_norm": 0.3817235827445984, + "learning_rate": 4.988377653637052e-05, + "loss": 0.1407, + "num_input_tokens_seen": 16910240, + "step": 7825 + }, + { + "epoch": 1.2773246329526917, + "grad_norm": 0.3502112030982971, + "learning_rate": 4.988308997094623e-05, + "loss": 0.0652, + "num_input_tokens_seen": 16921600, + "step": 7830 + }, + { + "epoch": 1.2781402936378465, + "grad_norm": 1.5680334568023682, + "learning_rate": 4.988240138837241e-05, + "loss": 0.204, + "num_input_tokens_seen": 16933344, + "step": 7835 + }, + { + "epoch": 1.2789559543230016, + "grad_norm": 1.2958720922470093, + "learning_rate": 4.988171078870488e-05, + "loss": 0.2589, + "num_input_tokens_seen": 16944416, + "step": 7840 + }, + { + "epoch": 1.2797716150081566, + "grad_norm": 2.0183675289154053, + "learning_rate": 4.988101817199963e-05, + "loss": 0.242, + "num_input_tokens_seen": 16954816, + "step": 7845 + }, + { + "epoch": 1.2805872756933117, + "grad_norm": 0.9942153692245483, + "learning_rate": 4.988032353831279e-05, + "loss": 0.11, + "num_input_tokens_seen": 16964992, + "step": 7850 + }, + { + "epoch": 1.2814029363784667, + "grad_norm": 0.14058445394039154, + "learning_rate": 4.9879626887700694e-05, + "loss": 0.0533, + "num_input_tokens_seen": 16975200, + "step": 7855 + }, + { + "epoch": 1.2822185970636215, + "grad_norm": 0.6203305721282959, + "learning_rate": 4.98789282202198e-05, + "loss": 0.0949, + "num_input_tokens_seen": 16985984, + "step": 7860 + }, + { + "epoch": 1.2830342577487766, + "grad_norm": 0.19137662649154663, + "learning_rate": 4.9878227535926745e-05, + "loss": 0.1324, + "num_input_tokens_seen": 16997504, + "step": 7865 + }, + { + "epoch": 1.2838499184339316, + "grad_norm": 0.04926902428269386, + "learning_rate": 4.987752483487834e-05, + "loss": 0.0629, + "num_input_tokens_seen": 17007296, + "step": 7870 + }, + { + "epoch": 1.2846655791190864, + "grad_norm": 1.0828146934509277, + "learning_rate": 4.987682011713155e-05, + "loss": 0.1819, + "num_input_tokens_seen": 17017792, + "step": 7875 + }, + { + "epoch": 1.2854812398042414, + "grad_norm": 0.42353543639183044, + "learning_rate": 4.9876113382743496e-05, + "loss": 0.1167, + "num_input_tokens_seen": 17029472, + "step": 7880 + }, + { + "epoch": 1.2862969004893965, + "grad_norm": 0.1622113585472107, + "learning_rate": 4.987540463177147e-05, + "loss": 0.0625, + "num_input_tokens_seen": 17039808, + "step": 7885 + }, + { + "epoch": 1.2871125611745513, + "grad_norm": 0.32840630412101746, + "learning_rate": 4.987469386427292e-05, + "loss": 0.0895, + "num_input_tokens_seen": 17050720, + "step": 7890 + }, + { + "epoch": 1.2879282218597063, + "grad_norm": 0.5184624791145325, + "learning_rate": 4.987398108030548e-05, + "loss": 0.2141, + "num_input_tokens_seen": 17061792, + "step": 7895 + }, + { + "epoch": 1.2887438825448614, + "grad_norm": 0.8181111812591553, + "learning_rate": 4.987326627992692e-05, + "loss": 0.1032, + "num_input_tokens_seen": 17072512, + "step": 7900 + }, + { + "epoch": 1.2895595432300162, + "grad_norm": 0.46939966082572937, + "learning_rate": 4.98725494631952e-05, + "loss": 0.1469, + "num_input_tokens_seen": 17084288, + "step": 7905 + }, + { + "epoch": 1.2903752039151712, + "grad_norm": 1.5741486549377441, + "learning_rate": 4.9871830630168404e-05, + "loss": 0.3427, + "num_input_tokens_seen": 17095648, + "step": 7910 + }, + { + "epoch": 1.2911908646003263, + "grad_norm": 0.7115363478660583, + "learning_rate": 4.987110978090482e-05, + "loss": 0.1753, + "num_input_tokens_seen": 17106976, + "step": 7915 + }, + { + "epoch": 1.2920065252854813, + "grad_norm": 1.0173192024230957, + "learning_rate": 4.9870386915462894e-05, + "loss": 0.1013, + "num_input_tokens_seen": 17116640, + "step": 7920 + }, + { + "epoch": 1.2928221859706361, + "grad_norm": 0.28692835569381714, + "learning_rate": 4.986966203390121e-05, + "loss": 0.2261, + "num_input_tokens_seen": 17127584, + "step": 7925 + }, + { + "epoch": 1.2936378466557912, + "grad_norm": 0.0319373644888401, + "learning_rate": 4.986893513627853e-05, + "loss": 0.0622, + "num_input_tokens_seen": 17137760, + "step": 7930 + }, + { + "epoch": 1.2944535073409462, + "grad_norm": 1.6692686080932617, + "learning_rate": 4.9868206222653785e-05, + "loss": 0.2997, + "num_input_tokens_seen": 17149056, + "step": 7935 + }, + { + "epoch": 1.2952691680261013, + "grad_norm": 0.08771282434463501, + "learning_rate": 4.9867475293086066e-05, + "loss": 0.1534, + "num_input_tokens_seen": 17160096, + "step": 7940 + }, + { + "epoch": 1.296084828711256, + "grad_norm": 1.5949183702468872, + "learning_rate": 4.9866742347634624e-05, + "loss": 0.145, + "num_input_tokens_seen": 17170592, + "step": 7945 + }, + { + "epoch": 1.2969004893964111, + "grad_norm": 1.7492725849151611, + "learning_rate": 4.986600738635887e-05, + "loss": 0.3093, + "num_input_tokens_seen": 17181760, + "step": 7950 + }, + { + "epoch": 1.2977161500815662, + "grad_norm": 2.251246452331543, + "learning_rate": 4.986527040931839e-05, + "loss": 0.265, + "num_input_tokens_seen": 17192416, + "step": 7955 + }, + { + "epoch": 1.298531810766721, + "grad_norm": 1.159475326538086, + "learning_rate": 4.9864531416572926e-05, + "loss": 0.0856, + "num_input_tokens_seen": 17204256, + "step": 7960 + }, + { + "epoch": 1.299347471451876, + "grad_norm": 0.8298521637916565, + "learning_rate": 4.986379040818239e-05, + "loss": 0.2257, + "num_input_tokens_seen": 17215520, + "step": 7965 + }, + { + "epoch": 1.300163132137031, + "grad_norm": 1.4584145545959473, + "learning_rate": 4.9863047384206835e-05, + "loss": 0.223, + "num_input_tokens_seen": 17226560, + "step": 7970 + }, + { + "epoch": 1.3009787928221859, + "grad_norm": 0.20686374604701996, + "learning_rate": 4.986230234470651e-05, + "loss": 0.1074, + "num_input_tokens_seen": 17236864, + "step": 7975 + }, + { + "epoch": 1.301794453507341, + "grad_norm": 1.08961021900177, + "learning_rate": 4.986155528974181e-05, + "loss": 0.1901, + "num_input_tokens_seen": 17247424, + "step": 7980 + }, + { + "epoch": 1.302610114192496, + "grad_norm": 1.1100622415542603, + "learning_rate": 4.986080621937329e-05, + "loss": 0.0835, + "num_input_tokens_seen": 17258208, + "step": 7985 + }, + { + "epoch": 1.3034257748776508, + "grad_norm": 1.0842260122299194, + "learning_rate": 4.9860055133661675e-05, + "loss": 0.1275, + "num_input_tokens_seen": 17269280, + "step": 7990 + }, + { + "epoch": 1.3042414355628058, + "grad_norm": 0.436748206615448, + "learning_rate": 4.985930203266785e-05, + "loss": 0.0584, + "num_input_tokens_seen": 17278816, + "step": 7995 + }, + { + "epoch": 1.3050570962479608, + "grad_norm": 0.6630215048789978, + "learning_rate": 4.985854691645287e-05, + "loss": 0.054, + "num_input_tokens_seen": 17289824, + "step": 8000 + }, + { + "epoch": 1.3058727569331159, + "grad_norm": 0.44978564977645874, + "learning_rate": 4.985778978507795e-05, + "loss": 0.0918, + "num_input_tokens_seen": 17300448, + "step": 8005 + }, + { + "epoch": 1.306688417618271, + "grad_norm": 1.4119718074798584, + "learning_rate": 4.9857030638604454e-05, + "loss": 0.262, + "num_input_tokens_seen": 17310592, + "step": 8010 + }, + { + "epoch": 1.3075040783034257, + "grad_norm": 0.17335893213748932, + "learning_rate": 4.985626947709393e-05, + "loss": 0.1419, + "num_input_tokens_seen": 17321632, + "step": 8015 + }, + { + "epoch": 1.3083197389885808, + "grad_norm": 0.4426310956478119, + "learning_rate": 4.985550630060809e-05, + "loss": 0.1529, + "num_input_tokens_seen": 17332960, + "step": 8020 + }, + { + "epoch": 1.3091353996737358, + "grad_norm": 2.697675943374634, + "learning_rate": 4.985474110920879e-05, + "loss": 0.3281, + "num_input_tokens_seen": 17343808, + "step": 8025 + }, + { + "epoch": 1.3099510603588906, + "grad_norm": 0.15749400854110718, + "learning_rate": 4.985397390295807e-05, + "loss": 0.1706, + "num_input_tokens_seen": 17353696, + "step": 8030 + }, + { + "epoch": 1.3107667210440457, + "grad_norm": 0.5040116310119629, + "learning_rate": 4.985320468191811e-05, + "loss": 0.1685, + "num_input_tokens_seen": 17364000, + "step": 8035 + }, + { + "epoch": 1.3115823817292007, + "grad_norm": 0.49932393431663513, + "learning_rate": 4.985243344615128e-05, + "loss": 0.1334, + "num_input_tokens_seen": 17371936, + "step": 8040 + }, + { + "epoch": 1.3123980424143555, + "grad_norm": 0.7465859055519104, + "learning_rate": 4.9851660195720095e-05, + "loss": 0.1548, + "num_input_tokens_seen": 17382304, + "step": 8045 + }, + { + "epoch": 1.3132137030995106, + "grad_norm": 0.48878148198127747, + "learning_rate": 4.985088493068724e-05, + "loss": 0.0598, + "num_input_tokens_seen": 17392000, + "step": 8050 + }, + { + "epoch": 1.3140293637846656, + "grad_norm": 0.3741835951805115, + "learning_rate": 4.985010765111555e-05, + "loss": 0.0919, + "num_input_tokens_seen": 17402528, + "step": 8055 + }, + { + "epoch": 1.3148450244698204, + "grad_norm": 0.13645236194133759, + "learning_rate": 4.984932835706805e-05, + "loss": 0.2114, + "num_input_tokens_seen": 17412480, + "step": 8060 + }, + { + "epoch": 1.3156606851549755, + "grad_norm": 0.39734262228012085, + "learning_rate": 4.984854704860791e-05, + "loss": 0.0834, + "num_input_tokens_seen": 17423520, + "step": 8065 + }, + { + "epoch": 1.3164763458401305, + "grad_norm": 0.7645502090454102, + "learning_rate": 4.984776372579847e-05, + "loss": 0.1225, + "num_input_tokens_seen": 17434176, + "step": 8070 + }, + { + "epoch": 1.3172920065252856, + "grad_norm": 1.7395343780517578, + "learning_rate": 4.984697838870322e-05, + "loss": 0.1818, + "num_input_tokens_seen": 17445504, + "step": 8075 + }, + { + "epoch": 1.3181076672104406, + "grad_norm": 0.3358175456523895, + "learning_rate": 4.984619103738584e-05, + "loss": 0.073, + "num_input_tokens_seen": 17456544, + "step": 8080 + }, + { + "epoch": 1.3189233278955954, + "grad_norm": 0.7949738502502441, + "learning_rate": 4.984540167191014e-05, + "loss": 0.1209, + "num_input_tokens_seen": 17466432, + "step": 8085 + }, + { + "epoch": 1.3197389885807504, + "grad_norm": 1.1140042543411255, + "learning_rate": 4.984461029234011e-05, + "loss": 0.0658, + "num_input_tokens_seen": 17477472, + "step": 8090 + }, + { + "epoch": 1.3205546492659055, + "grad_norm": 0.4260055124759674, + "learning_rate": 4.9843816898739913e-05, + "loss": 0.0858, + "num_input_tokens_seen": 17487360, + "step": 8095 + }, + { + "epoch": 1.3213703099510603, + "grad_norm": 0.19757315516471863, + "learning_rate": 4.984302149117387e-05, + "loss": 0.0619, + "num_input_tokens_seen": 17497568, + "step": 8100 + }, + { + "epoch": 1.3221859706362153, + "grad_norm": 0.6601164937019348, + "learning_rate": 4.984222406970644e-05, + "loss": 0.0679, + "num_input_tokens_seen": 17508768, + "step": 8105 + }, + { + "epoch": 1.3230016313213704, + "grad_norm": 0.2028186321258545, + "learning_rate": 4.984142463440229e-05, + "loss": 0.117, + "num_input_tokens_seen": 17518368, + "step": 8110 + }, + { + "epoch": 1.3238172920065252, + "grad_norm": 2.52878475189209, + "learning_rate": 4.984062318532621e-05, + "loss": 0.3555, + "num_input_tokens_seen": 17528416, + "step": 8115 + }, + { + "epoch": 1.3246329526916802, + "grad_norm": 0.315536767244339, + "learning_rate": 4.983981972254317e-05, + "loss": 0.2875, + "num_input_tokens_seen": 17537600, + "step": 8120 + }, + { + "epoch": 1.3254486133768353, + "grad_norm": 0.35498911142349243, + "learning_rate": 4.983901424611832e-05, + "loss": 0.0818, + "num_input_tokens_seen": 17548736, + "step": 8125 + }, + { + "epoch": 1.32626427406199, + "grad_norm": 0.7185875177383423, + "learning_rate": 4.9838206756116926e-05, + "loss": 0.2549, + "num_input_tokens_seen": 17558336, + "step": 8130 + }, + { + "epoch": 1.3270799347471451, + "grad_norm": 0.46769359707832336, + "learning_rate": 4.983739725260448e-05, + "loss": 0.0392, + "num_input_tokens_seen": 17570272, + "step": 8135 + }, + { + "epoch": 1.3278955954323002, + "grad_norm": 0.6646812558174133, + "learning_rate": 4.983658573564658e-05, + "loss": 0.0906, + "num_input_tokens_seen": 17580960, + "step": 8140 + }, + { + "epoch": 1.3287112561174552, + "grad_norm": 0.15906447172164917, + "learning_rate": 4.983577220530902e-05, + "loss": 0.1132, + "num_input_tokens_seen": 17590528, + "step": 8145 + }, + { + "epoch": 1.32952691680261, + "grad_norm": 0.7903897762298584, + "learning_rate": 4.983495666165775e-05, + "loss": 0.1427, + "num_input_tokens_seen": 17600672, + "step": 8150 + }, + { + "epoch": 1.330342577487765, + "grad_norm": 0.5599838495254517, + "learning_rate": 4.983413910475889e-05, + "loss": 0.2473, + "num_input_tokens_seen": 17611648, + "step": 8155 + }, + { + "epoch": 1.3311582381729201, + "grad_norm": 0.671582043170929, + "learning_rate": 4.98333195346787e-05, + "loss": 0.2115, + "num_input_tokens_seen": 17622656, + "step": 8160 + }, + { + "epoch": 1.3319738988580752, + "grad_norm": 0.2762494683265686, + "learning_rate": 4.983249795148363e-05, + "loss": 0.0928, + "num_input_tokens_seen": 17632864, + "step": 8165 + }, + { + "epoch": 1.33278955954323, + "grad_norm": 0.266267865896225, + "learning_rate": 4.983167435524027e-05, + "loss": 0.1476, + "num_input_tokens_seen": 17643968, + "step": 8170 + }, + { + "epoch": 1.333605220228385, + "grad_norm": 0.8275823593139648, + "learning_rate": 4.98308487460154e-05, + "loss": 0.1651, + "num_input_tokens_seen": 17654272, + "step": 8175 + }, + { + "epoch": 1.33442088091354, + "grad_norm": 0.41866549849510193, + "learning_rate": 4.983002112387594e-05, + "loss": 0.1055, + "num_input_tokens_seen": 17665728, + "step": 8180 + }, + { + "epoch": 1.3352365415986949, + "grad_norm": 0.3109361231327057, + "learning_rate": 4.982919148888897e-05, + "loss": 0.2513, + "num_input_tokens_seen": 17677856, + "step": 8185 + }, + { + "epoch": 1.33605220228385, + "grad_norm": 0.15104494988918304, + "learning_rate": 4.982835984112177e-05, + "loss": 0.1163, + "num_input_tokens_seen": 17688672, + "step": 8190 + }, + { + "epoch": 1.336867862969005, + "grad_norm": 0.3408213257789612, + "learning_rate": 4.982752618064174e-05, + "loss": 0.1176, + "num_input_tokens_seen": 17699456, + "step": 8195 + }, + { + "epoch": 1.3376835236541598, + "grad_norm": 1.8632510900497437, + "learning_rate": 4.982669050751646e-05, + "loss": 0.1681, + "num_input_tokens_seen": 17710976, + "step": 8200 + }, + { + "epoch": 1.3384991843393148, + "grad_norm": 0.7814512848854065, + "learning_rate": 4.982585282181368e-05, + "loss": 0.081, + "num_input_tokens_seen": 17721152, + "step": 8205 + }, + { + "epoch": 1.3393148450244698, + "grad_norm": 0.7978337407112122, + "learning_rate": 4.9825013123601305e-05, + "loss": 0.1563, + "num_input_tokens_seen": 17731200, + "step": 8210 + }, + { + "epoch": 1.3401305057096247, + "grad_norm": 1.2878050804138184, + "learning_rate": 4.9824171412947404e-05, + "loss": 0.1819, + "num_input_tokens_seen": 17741760, + "step": 8215 + }, + { + "epoch": 1.3409461663947797, + "grad_norm": 1.3785074949264526, + "learning_rate": 4.982332768992021e-05, + "loss": 0.1556, + "num_input_tokens_seen": 17754368, + "step": 8220 + }, + { + "epoch": 1.3417618270799347, + "grad_norm": 1.0371918678283691, + "learning_rate": 4.982248195458812e-05, + "loss": 0.2015, + "num_input_tokens_seen": 17764096, + "step": 8225 + }, + { + "epoch": 1.3425774877650898, + "grad_norm": 0.667453944683075, + "learning_rate": 4.98216342070197e-05, + "loss": 0.1339, + "num_input_tokens_seen": 17774784, + "step": 8230 + }, + { + "epoch": 1.3433931484502448, + "grad_norm": 0.10895339399576187, + "learning_rate": 4.982078444728367e-05, + "loss": 0.0464, + "num_input_tokens_seen": 17786112, + "step": 8235 + }, + { + "epoch": 1.3442088091353996, + "grad_norm": 1.465915560722351, + "learning_rate": 4.981993267544891e-05, + "loss": 0.0822, + "num_input_tokens_seen": 17797120, + "step": 8240 + }, + { + "epoch": 1.3450244698205547, + "grad_norm": 1.2549196481704712, + "learning_rate": 4.9819078891584467e-05, + "loss": 0.163, + "num_input_tokens_seen": 17808032, + "step": 8245 + }, + { + "epoch": 1.3458401305057097, + "grad_norm": 0.29555994272232056, + "learning_rate": 4.981822309575956e-05, + "loss": 0.042, + "num_input_tokens_seen": 17818144, + "step": 8250 + }, + { + "epoch": 1.3466557911908645, + "grad_norm": 0.1517149806022644, + "learning_rate": 4.981736528804357e-05, + "loss": 0.0513, + "num_input_tokens_seen": 17828032, + "step": 8255 + }, + { + "epoch": 1.3474714518760196, + "grad_norm": 0.580276370048523, + "learning_rate": 4.9816505468506026e-05, + "loss": 0.1013, + "num_input_tokens_seen": 17839232, + "step": 8260 + }, + { + "epoch": 1.3482871125611746, + "grad_norm": 0.7274616956710815, + "learning_rate": 4.981564363721663e-05, + "loss": 0.0498, + "num_input_tokens_seen": 17849088, + "step": 8265 + }, + { + "epoch": 1.3491027732463294, + "grad_norm": 0.1916886568069458, + "learning_rate": 4.981477979424524e-05, + "loss": 0.0665, + "num_input_tokens_seen": 17860512, + "step": 8270 + }, + { + "epoch": 1.3499184339314845, + "grad_norm": 0.2475271075963974, + "learning_rate": 4.98139139396619e-05, + "loss": 0.3531, + "num_input_tokens_seen": 17872000, + "step": 8275 + }, + { + "epoch": 1.3507340946166395, + "grad_norm": 0.3772023916244507, + "learning_rate": 4.981304607353678e-05, + "loss": 0.1573, + "num_input_tokens_seen": 17883712, + "step": 8280 + }, + { + "epoch": 1.3515497553017943, + "grad_norm": 1.699342966079712, + "learning_rate": 4.981217619594026e-05, + "loss": 0.1251, + "num_input_tokens_seen": 17894240, + "step": 8285 + }, + { + "epoch": 1.3523654159869494, + "grad_norm": 0.10178104043006897, + "learning_rate": 4.981130430694283e-05, + "loss": 0.1736, + "num_input_tokens_seen": 17903904, + "step": 8290 + }, + { + "epoch": 1.3531810766721044, + "grad_norm": 0.3703417181968689, + "learning_rate": 4.9810430406615194e-05, + "loss": 0.1283, + "num_input_tokens_seen": 17914208, + "step": 8295 + }, + { + "epoch": 1.3539967373572595, + "grad_norm": 0.030938509851694107, + "learning_rate": 4.980955449502818e-05, + "loss": 0.0186, + "num_input_tokens_seen": 17924640, + "step": 8300 + }, + { + "epoch": 1.3548123980424145, + "grad_norm": 0.10887324810028076, + "learning_rate": 4.980867657225279e-05, + "loss": 0.0906, + "num_input_tokens_seen": 17935520, + "step": 8305 + }, + { + "epoch": 1.3556280587275693, + "grad_norm": 0.5587227940559387, + "learning_rate": 4.980779663836019e-05, + "loss": 0.1314, + "num_input_tokens_seen": 17946848, + "step": 8310 + }, + { + "epoch": 1.3564437194127243, + "grad_norm": 0.6571934223175049, + "learning_rate": 4.980691469342174e-05, + "loss": 0.085, + "num_input_tokens_seen": 17957568, + "step": 8315 + }, + { + "epoch": 1.3572593800978794, + "grad_norm": 1.8414629697799683, + "learning_rate": 4.98060307375089e-05, + "loss": 0.0845, + "num_input_tokens_seen": 17967776, + "step": 8320 + }, + { + "epoch": 1.3580750407830342, + "grad_norm": 1.246471643447876, + "learning_rate": 4.980514477069336e-05, + "loss": 0.087, + "num_input_tokens_seen": 17978688, + "step": 8325 + }, + { + "epoch": 1.3588907014681892, + "grad_norm": 0.18067152798175812, + "learning_rate": 4.980425679304691e-05, + "loss": 0.1734, + "num_input_tokens_seen": 17990432, + "step": 8330 + }, + { + "epoch": 1.3597063621533443, + "grad_norm": 2.093881607055664, + "learning_rate": 4.9803366804641556e-05, + "loss": 0.185, + "num_input_tokens_seen": 18001888, + "step": 8335 + }, + { + "epoch": 1.360522022838499, + "grad_norm": 0.1183953806757927, + "learning_rate": 4.980247480554944e-05, + "loss": 0.0873, + "num_input_tokens_seen": 18013536, + "step": 8340 + }, + { + "epoch": 1.3613376835236541, + "grad_norm": 2.777961015701294, + "learning_rate": 4.980158079584286e-05, + "loss": 0.3329, + "num_input_tokens_seen": 18024864, + "step": 8345 + }, + { + "epoch": 1.3621533442088092, + "grad_norm": 1.3634042739868164, + "learning_rate": 4.9800684775594306e-05, + "loss": 0.0522, + "num_input_tokens_seen": 18036192, + "step": 8350 + }, + { + "epoch": 1.362969004893964, + "grad_norm": 1.7045334577560425, + "learning_rate": 4.979978674487641e-05, + "loss": 0.1548, + "num_input_tokens_seen": 18044864, + "step": 8355 + }, + { + "epoch": 1.363784665579119, + "grad_norm": 0.21142731606960297, + "learning_rate": 4.979888670376196e-05, + "loss": 0.2387, + "num_input_tokens_seen": 18056192, + "step": 8360 + }, + { + "epoch": 1.364600326264274, + "grad_norm": 0.1007235124707222, + "learning_rate": 4.979798465232393e-05, + "loss": 0.114, + "num_input_tokens_seen": 18066592, + "step": 8365 + }, + { + "epoch": 1.3654159869494291, + "grad_norm": 1.0888813734054565, + "learning_rate": 4.9797080590635434e-05, + "loss": 0.1423, + "num_input_tokens_seen": 18077408, + "step": 8370 + }, + { + "epoch": 1.366231647634584, + "grad_norm": 0.8058589100837708, + "learning_rate": 4.979617451876978e-05, + "loss": 0.0991, + "num_input_tokens_seen": 18088544, + "step": 8375 + }, + { + "epoch": 1.367047308319739, + "grad_norm": 0.6500251889228821, + "learning_rate": 4.979526643680039e-05, + "loss": 0.2222, + "num_input_tokens_seen": 18100192, + "step": 8380 + }, + { + "epoch": 1.367862969004894, + "grad_norm": 0.5268667936325073, + "learning_rate": 4.9794356344800894e-05, + "loss": 0.2683, + "num_input_tokens_seen": 18110432, + "step": 8385 + }, + { + "epoch": 1.368678629690049, + "grad_norm": 0.27150434255599976, + "learning_rate": 4.9793444242845075e-05, + "loss": 0.191, + "num_input_tokens_seen": 18120928, + "step": 8390 + }, + { + "epoch": 1.3694942903752039, + "grad_norm": 0.36934569478034973, + "learning_rate": 4.979253013100686e-05, + "loss": 0.1732, + "num_input_tokens_seen": 18131104, + "step": 8395 + }, + { + "epoch": 1.370309951060359, + "grad_norm": 0.04967869445681572, + "learning_rate": 4.979161400936036e-05, + "loss": 0.0421, + "num_input_tokens_seen": 18142720, + "step": 8400 + }, + { + "epoch": 1.371125611745514, + "grad_norm": 0.14478720724582672, + "learning_rate": 4.979069587797984e-05, + "loss": 0.0889, + "num_input_tokens_seen": 18153664, + "step": 8405 + }, + { + "epoch": 1.3719412724306688, + "grad_norm": 1.8310569524765015, + "learning_rate": 4.978977573693972e-05, + "loss": 0.4896, + "num_input_tokens_seen": 18164512, + "step": 8410 + }, + { + "epoch": 1.3727569331158238, + "grad_norm": 0.5026527643203735, + "learning_rate": 4.97888535863146e-05, + "loss": 0.2821, + "num_input_tokens_seen": 18174368, + "step": 8415 + }, + { + "epoch": 1.3735725938009788, + "grad_norm": 1.0323867797851562, + "learning_rate": 4.9787929426179224e-05, + "loss": 0.0948, + "num_input_tokens_seen": 18185216, + "step": 8420 + }, + { + "epoch": 1.3743882544861337, + "grad_norm": 0.6572347283363342, + "learning_rate": 4.978700325660852e-05, + "loss": 0.1065, + "num_input_tokens_seen": 18196256, + "step": 8425 + }, + { + "epoch": 1.3752039151712887, + "grad_norm": 0.8093132376670837, + "learning_rate": 4.978607507767757e-05, + "loss": 0.0809, + "num_input_tokens_seen": 18207264, + "step": 8430 + }, + { + "epoch": 1.3760195758564437, + "grad_norm": 2.1320583820343018, + "learning_rate": 4.9785144889461606e-05, + "loss": 0.2629, + "num_input_tokens_seen": 18217760, + "step": 8435 + }, + { + "epoch": 1.3768352365415986, + "grad_norm": 0.1304975301027298, + "learning_rate": 4.978421269203604e-05, + "loss": 0.1169, + "num_input_tokens_seen": 18228608, + "step": 8440 + }, + { + "epoch": 1.3776508972267536, + "grad_norm": 2.013760805130005, + "learning_rate": 4.9783278485476434e-05, + "loss": 0.247, + "num_input_tokens_seen": 18239200, + "step": 8445 + }, + { + "epoch": 1.3784665579119086, + "grad_norm": 3.0486900806427, + "learning_rate": 4.978234226985853e-05, + "loss": 0.2465, + "num_input_tokens_seen": 18251200, + "step": 8450 + }, + { + "epoch": 1.3792822185970637, + "grad_norm": 1.1449403762817383, + "learning_rate": 4.978140404525822e-05, + "loss": 0.0657, + "num_input_tokens_seen": 18261856, + "step": 8455 + }, + { + "epoch": 1.3800978792822187, + "grad_norm": 0.3155995011329651, + "learning_rate": 4.978046381175155e-05, + "loss": 0.0392, + "num_input_tokens_seen": 18273152, + "step": 8460 + }, + { + "epoch": 1.3809135399673735, + "grad_norm": 0.1719769984483719, + "learning_rate": 4.977952156941476e-05, + "loss": 0.0717, + "num_input_tokens_seen": 18284896, + "step": 8465 + }, + { + "epoch": 1.3817292006525286, + "grad_norm": 0.17510537803173065, + "learning_rate": 4.977857731832421e-05, + "loss": 0.1339, + "num_input_tokens_seen": 18294912, + "step": 8470 + }, + { + "epoch": 1.3825448613376836, + "grad_norm": 0.4680802822113037, + "learning_rate": 4.977763105855646e-05, + "loss": 0.1251, + "num_input_tokens_seen": 18304672, + "step": 8475 + }, + { + "epoch": 1.3833605220228384, + "grad_norm": 0.3379485011100769, + "learning_rate": 4.9776682790188225e-05, + "loss": 0.1585, + "num_input_tokens_seen": 18316256, + "step": 8480 + }, + { + "epoch": 1.3841761827079935, + "grad_norm": 0.14064888656139374, + "learning_rate": 4.977573251329636e-05, + "loss": 0.2052, + "num_input_tokens_seen": 18327904, + "step": 8485 + }, + { + "epoch": 1.3849918433931485, + "grad_norm": 0.08874785900115967, + "learning_rate": 4.97747802279579e-05, + "loss": 0.0462, + "num_input_tokens_seen": 18338752, + "step": 8490 + }, + { + "epoch": 1.3858075040783033, + "grad_norm": 0.14626598358154297, + "learning_rate": 4.9773825934250056e-05, + "loss": 0.1088, + "num_input_tokens_seen": 18350464, + "step": 8495 + }, + { + "epoch": 1.3866231647634584, + "grad_norm": 0.7591391205787659, + "learning_rate": 4.977286963225018e-05, + "loss": 0.1956, + "num_input_tokens_seen": 18360832, + "step": 8500 + }, + { + "epoch": 1.3874388254486134, + "grad_norm": 1.4445750713348389, + "learning_rate": 4.9771911322035794e-05, + "loss": 0.1698, + "num_input_tokens_seen": 18370976, + "step": 8505 + }, + { + "epoch": 1.3882544861337682, + "grad_norm": 0.1082734763622284, + "learning_rate": 4.977095100368459e-05, + "loss": 0.1462, + "num_input_tokens_seen": 18381920, + "step": 8510 + }, + { + "epoch": 1.3890701468189233, + "grad_norm": 0.4161073863506317, + "learning_rate": 4.9769988677274405e-05, + "loss": 0.1804, + "num_input_tokens_seen": 18393152, + "step": 8515 + }, + { + "epoch": 1.3898858075040783, + "grad_norm": 0.6001592874526978, + "learning_rate": 4.976902434288326e-05, + "loss": 0.0962, + "num_input_tokens_seen": 18403168, + "step": 8520 + }, + { + "epoch": 1.3907014681892333, + "grad_norm": 0.1181526631116867, + "learning_rate": 4.9768058000589325e-05, + "loss": 0.1848, + "num_input_tokens_seen": 18413792, + "step": 8525 + }, + { + "epoch": 1.3915171288743884, + "grad_norm": 1.6027510166168213, + "learning_rate": 4.976708965047093e-05, + "loss": 0.1399, + "num_input_tokens_seen": 18424864, + "step": 8530 + }, + { + "epoch": 1.3923327895595432, + "grad_norm": 1.0224151611328125, + "learning_rate": 4.976611929260659e-05, + "loss": 0.0672, + "num_input_tokens_seen": 18434144, + "step": 8535 + }, + { + "epoch": 1.3931484502446982, + "grad_norm": 0.995125412940979, + "learning_rate": 4.976514692707496e-05, + "loss": 0.1808, + "num_input_tokens_seen": 18444576, + "step": 8540 + }, + { + "epoch": 1.3939641109298533, + "grad_norm": 0.36804354190826416, + "learning_rate": 4.9764172553954855e-05, + "loss": 0.2286, + "num_input_tokens_seen": 18455808, + "step": 8545 + }, + { + "epoch": 1.394779771615008, + "grad_norm": 0.5762017965316772, + "learning_rate": 4.976319617332527e-05, + "loss": 0.0675, + "num_input_tokens_seen": 18466848, + "step": 8550 + }, + { + "epoch": 1.3955954323001631, + "grad_norm": 0.5209258198738098, + "learning_rate": 4.9762217785265356e-05, + "loss": 0.2224, + "num_input_tokens_seen": 18479040, + "step": 8555 + }, + { + "epoch": 1.3964110929853182, + "grad_norm": 0.7207282781600952, + "learning_rate": 4.976123738985443e-05, + "loss": 0.1473, + "num_input_tokens_seen": 18490080, + "step": 8560 + }, + { + "epoch": 1.397226753670473, + "grad_norm": 1.8721143007278442, + "learning_rate": 4.976025498717196e-05, + "loss": 0.2597, + "num_input_tokens_seen": 18501632, + "step": 8565 + }, + { + "epoch": 1.398042414355628, + "grad_norm": 0.06844497472047806, + "learning_rate": 4.9759270577297603e-05, + "loss": 0.1596, + "num_input_tokens_seen": 18513056, + "step": 8570 + }, + { + "epoch": 1.398858075040783, + "grad_norm": 0.39333680272102356, + "learning_rate": 4.975828416031113e-05, + "loss": 0.0197, + "num_input_tokens_seen": 18523616, + "step": 8575 + }, + { + "epoch": 1.399673735725938, + "grad_norm": 0.41392782330513, + "learning_rate": 4.975729573629252e-05, + "loss": 0.1474, + "num_input_tokens_seen": 18534688, + "step": 8580 + }, + { + "epoch": 1.400489396411093, + "grad_norm": 0.24080877006053925, + "learning_rate": 4.9756305305321906e-05, + "loss": 0.2205, + "num_input_tokens_seen": 18545120, + "step": 8585 + }, + { + "epoch": 1.401305057096248, + "grad_norm": 0.3021838068962097, + "learning_rate": 4.975531286747958e-05, + "loss": 0.0964, + "num_input_tokens_seen": 18556224, + "step": 8590 + }, + { + "epoch": 1.402120717781403, + "grad_norm": 0.889634370803833, + "learning_rate": 4.975431842284597e-05, + "loss": 0.4437, + "num_input_tokens_seen": 18566272, + "step": 8595 + }, + { + "epoch": 1.4029363784665578, + "grad_norm": 0.4034481346607208, + "learning_rate": 4.975332197150171e-05, + "loss": 0.2043, + "num_input_tokens_seen": 18577216, + "step": 8600 + }, + { + "epoch": 1.4037520391517129, + "grad_norm": 0.19787530601024628, + "learning_rate": 4.975232351352758e-05, + "loss": 0.3172, + "num_input_tokens_seen": 18587072, + "step": 8605 + }, + { + "epoch": 1.404567699836868, + "grad_norm": 1.3807878494262695, + "learning_rate": 4.975132304900451e-05, + "loss": 0.1201, + "num_input_tokens_seen": 18598112, + "step": 8610 + }, + { + "epoch": 1.405383360522023, + "grad_norm": 0.6855478286743164, + "learning_rate": 4.975032057801361e-05, + "loss": 0.1936, + "num_input_tokens_seen": 18608384, + "step": 8615 + }, + { + "epoch": 1.4061990212071778, + "grad_norm": 0.7795236110687256, + "learning_rate": 4.974931610063613e-05, + "loss": 0.0801, + "num_input_tokens_seen": 18618944, + "step": 8620 + }, + { + "epoch": 1.4070146818923328, + "grad_norm": 0.21920494735240936, + "learning_rate": 4.974830961695353e-05, + "loss": 0.1119, + "num_input_tokens_seen": 18629248, + "step": 8625 + }, + { + "epoch": 1.4078303425774878, + "grad_norm": 0.38066649436950684, + "learning_rate": 4.9747301127047366e-05, + "loss": 0.0501, + "num_input_tokens_seen": 18641248, + "step": 8630 + }, + { + "epoch": 1.4086460032626427, + "grad_norm": 0.17510920763015747, + "learning_rate": 4.974629063099942e-05, + "loss": 0.0327, + "num_input_tokens_seen": 18651520, + "step": 8635 + }, + { + "epoch": 1.4094616639477977, + "grad_norm": 0.573754072189331, + "learning_rate": 4.974527812889158e-05, + "loss": 0.1827, + "num_input_tokens_seen": 18662304, + "step": 8640 + }, + { + "epoch": 1.4102773246329527, + "grad_norm": 0.028728865087032318, + "learning_rate": 4.974426362080594e-05, + "loss": 0.1177, + "num_input_tokens_seen": 18672416, + "step": 8645 + }, + { + "epoch": 1.4110929853181076, + "grad_norm": 1.069749355316162, + "learning_rate": 4.974324710682474e-05, + "loss": 0.1013, + "num_input_tokens_seen": 18683136, + "step": 8650 + }, + { + "epoch": 1.4119086460032626, + "grad_norm": 0.23686392605304718, + "learning_rate": 4.974222858703039e-05, + "loss": 0.2617, + "num_input_tokens_seen": 18693472, + "step": 8655 + }, + { + "epoch": 1.4127243066884176, + "grad_norm": 0.28056275844573975, + "learning_rate": 4.9741208061505454e-05, + "loss": 0.0634, + "num_input_tokens_seen": 18704128, + "step": 8660 + }, + { + "epoch": 1.4135399673735725, + "grad_norm": 0.546768307685852, + "learning_rate": 4.974018553033264e-05, + "loss": 0.0737, + "num_input_tokens_seen": 18715456, + "step": 8665 + }, + { + "epoch": 1.4143556280587275, + "grad_norm": 0.5857595801353455, + "learning_rate": 4.973916099359487e-05, + "loss": 0.1853, + "num_input_tokens_seen": 18726240, + "step": 8670 + }, + { + "epoch": 1.4151712887438825, + "grad_norm": 0.23354992270469666, + "learning_rate": 4.973813445137518e-05, + "loss": 0.0639, + "num_input_tokens_seen": 18736992, + "step": 8675 + }, + { + "epoch": 1.4159869494290376, + "grad_norm": 0.4560316503047943, + "learning_rate": 4.9737105903756794e-05, + "loss": 0.0609, + "num_input_tokens_seen": 18749216, + "step": 8680 + }, + { + "epoch": 1.4168026101141926, + "grad_norm": 0.06857717037200928, + "learning_rate": 4.973607535082309e-05, + "loss": 0.0809, + "num_input_tokens_seen": 18760352, + "step": 8685 + }, + { + "epoch": 1.4176182707993474, + "grad_norm": 0.2685120105743408, + "learning_rate": 4.97350427926576e-05, + "loss": 0.0874, + "num_input_tokens_seen": 18769792, + "step": 8690 + }, + { + "epoch": 1.4184339314845025, + "grad_norm": 0.1969255954027176, + "learning_rate": 4.973400822934404e-05, + "loss": 0.2258, + "num_input_tokens_seen": 18781696, + "step": 8695 + }, + { + "epoch": 1.4192495921696575, + "grad_norm": 0.8738628029823303, + "learning_rate": 4.973297166096628e-05, + "loss": 0.1723, + "num_input_tokens_seen": 18790912, + "step": 8700 + }, + { + "epoch": 1.4200652528548123, + "grad_norm": 1.7920475006103516, + "learning_rate": 4.9731933087608334e-05, + "loss": 0.1838, + "num_input_tokens_seen": 18800672, + "step": 8705 + }, + { + "epoch": 1.4208809135399674, + "grad_norm": 2.3198161125183105, + "learning_rate": 4.973089250935441e-05, + "loss": 0.192, + "num_input_tokens_seen": 18812032, + "step": 8710 + }, + { + "epoch": 1.4216965742251224, + "grad_norm": 1.627617597579956, + "learning_rate": 4.972984992628885e-05, + "loss": 0.2335, + "num_input_tokens_seen": 18823328, + "step": 8715 + }, + { + "epoch": 1.4225122349102772, + "grad_norm": 1.3859310150146484, + "learning_rate": 4.972880533849619e-05, + "loss": 0.2196, + "num_input_tokens_seen": 18834048, + "step": 8720 + }, + { + "epoch": 1.4233278955954323, + "grad_norm": 0.9107431173324585, + "learning_rate": 4.9727758746061084e-05, + "loss": 0.151, + "num_input_tokens_seen": 18845216, + "step": 8725 + }, + { + "epoch": 1.4241435562805873, + "grad_norm": 0.14224939048290253, + "learning_rate": 4.972671014906839e-05, + "loss": 0.1602, + "num_input_tokens_seen": 18854848, + "step": 8730 + }, + { + "epoch": 1.4249592169657421, + "grad_norm": 1.058025598526001, + "learning_rate": 4.972565954760311e-05, + "loss": 0.1411, + "num_input_tokens_seen": 18866368, + "step": 8735 + }, + { + "epoch": 1.4257748776508972, + "grad_norm": 0.9419047832489014, + "learning_rate": 4.9724606941750406e-05, + "loss": 0.2699, + "num_input_tokens_seen": 18878464, + "step": 8740 + }, + { + "epoch": 1.4265905383360522, + "grad_norm": 0.18219062685966492, + "learning_rate": 4.972355233159562e-05, + "loss": 0.173, + "num_input_tokens_seen": 18889856, + "step": 8745 + }, + { + "epoch": 1.4274061990212072, + "grad_norm": 0.2437511682510376, + "learning_rate": 4.972249571722423e-05, + "loss": 0.1658, + "num_input_tokens_seen": 18900352, + "step": 8750 + }, + { + "epoch": 1.4282218597063623, + "grad_norm": 0.16856686770915985, + "learning_rate": 4.97214370987219e-05, + "loss": 0.1017, + "num_input_tokens_seen": 18911936, + "step": 8755 + }, + { + "epoch": 1.429037520391517, + "grad_norm": 0.2286577671766281, + "learning_rate": 4.972037647617444e-05, + "loss": 0.0951, + "num_input_tokens_seen": 18922048, + "step": 8760 + }, + { + "epoch": 1.4298531810766721, + "grad_norm": 1.2201122045516968, + "learning_rate": 4.9719313849667835e-05, + "loss": 0.1325, + "num_input_tokens_seen": 18932288, + "step": 8765 + }, + { + "epoch": 1.4306688417618272, + "grad_norm": 0.7941123247146606, + "learning_rate": 4.9718249219288226e-05, + "loss": 0.1715, + "num_input_tokens_seen": 18943008, + "step": 8770 + }, + { + "epoch": 1.431484502446982, + "grad_norm": 1.2975040674209595, + "learning_rate": 4.971718258512191e-05, + "loss": 0.1248, + "num_input_tokens_seen": 18953376, + "step": 8775 + }, + { + "epoch": 1.432300163132137, + "grad_norm": 0.3932364284992218, + "learning_rate": 4.971611394725537e-05, + "loss": 0.0497, + "num_input_tokens_seen": 18965600, + "step": 8780 + }, + { + "epoch": 1.433115823817292, + "grad_norm": 0.587518036365509, + "learning_rate": 4.971504330577521e-05, + "loss": 0.1076, + "num_input_tokens_seen": 18975936, + "step": 8785 + }, + { + "epoch": 1.433931484502447, + "grad_norm": 0.0882759690284729, + "learning_rate": 4.971397066076825e-05, + "loss": 0.0652, + "num_input_tokens_seen": 18986048, + "step": 8790 + }, + { + "epoch": 1.434747145187602, + "grad_norm": 0.5696509480476379, + "learning_rate": 4.971289601232143e-05, + "loss": 0.1327, + "num_input_tokens_seen": 18997056, + "step": 8795 + }, + { + "epoch": 1.435562805872757, + "grad_norm": 0.1068899929523468, + "learning_rate": 4.971181936052186e-05, + "loss": 0.0336, + "num_input_tokens_seen": 19007744, + "step": 8800 + }, + { + "epoch": 1.4363784665579118, + "grad_norm": 0.7700384855270386, + "learning_rate": 4.971074070545684e-05, + "loss": 0.1753, + "num_input_tokens_seen": 19018784, + "step": 8805 + }, + { + "epoch": 1.4371941272430668, + "grad_norm": 0.8065531849861145, + "learning_rate": 4.970966004721378e-05, + "loss": 0.139, + "num_input_tokens_seen": 19030176, + "step": 8810 + }, + { + "epoch": 1.4380097879282219, + "grad_norm": 0.11314158886671066, + "learning_rate": 4.970857738588031e-05, + "loss": 0.0934, + "num_input_tokens_seen": 19041088, + "step": 8815 + }, + { + "epoch": 1.438825448613377, + "grad_norm": 0.37165123224258423, + "learning_rate": 4.9707492721544185e-05, + "loss": 0.028, + "num_input_tokens_seen": 19052352, + "step": 8820 + }, + { + "epoch": 1.4396411092985317, + "grad_norm": 0.7877383828163147, + "learning_rate": 4.970640605429334e-05, + "loss": 0.0835, + "num_input_tokens_seen": 19062688, + "step": 8825 + }, + { + "epoch": 1.4404567699836868, + "grad_norm": 1.3908308744430542, + "learning_rate": 4.970531738421585e-05, + "loss": 0.1344, + "num_input_tokens_seen": 19073504, + "step": 8830 + }, + { + "epoch": 1.4412724306688418, + "grad_norm": 0.9590302109718323, + "learning_rate": 4.970422671139999e-05, + "loss": 0.2386, + "num_input_tokens_seen": 19085792, + "step": 8835 + }, + { + "epoch": 1.4420880913539968, + "grad_norm": 0.3002943694591522, + "learning_rate": 4.970313403593416e-05, + "loss": 0.1472, + "num_input_tokens_seen": 19096512, + "step": 8840 + }, + { + "epoch": 1.4429037520391517, + "grad_norm": 1.3112082481384277, + "learning_rate": 4.970203935790695e-05, + "loss": 0.1708, + "num_input_tokens_seen": 19106336, + "step": 8845 + }, + { + "epoch": 1.4437194127243067, + "grad_norm": 1.2811903953552246, + "learning_rate": 4.970094267740708e-05, + "loss": 0.1314, + "num_input_tokens_seen": 19116800, + "step": 8850 + }, + { + "epoch": 1.4445350734094617, + "grad_norm": 1.212189793586731, + "learning_rate": 4.969984399452347e-05, + "loss": 0.1782, + "num_input_tokens_seen": 19127488, + "step": 8855 + }, + { + "epoch": 1.4453507340946166, + "grad_norm": 0.06619177758693695, + "learning_rate": 4.9698743309345184e-05, + "loss": 0.2327, + "num_input_tokens_seen": 19135264, + "step": 8860 + }, + { + "epoch": 1.4461663947797716, + "grad_norm": 0.8010154366493225, + "learning_rate": 4.969764062196145e-05, + "loss": 0.1471, + "num_input_tokens_seen": 19146656, + "step": 8865 + }, + { + "epoch": 1.4469820554649266, + "grad_norm": 0.06665907800197601, + "learning_rate": 4.969653593246164e-05, + "loss": 0.0299, + "num_input_tokens_seen": 19158880, + "step": 8870 + }, + { + "epoch": 1.4477977161500815, + "grad_norm": 0.8259228467941284, + "learning_rate": 4.9695429240935335e-05, + "loss": 0.2108, + "num_input_tokens_seen": 19170240, + "step": 8875 + }, + { + "epoch": 1.4486133768352365, + "grad_norm": 0.1983632594347, + "learning_rate": 4.9694320547472215e-05, + "loss": 0.068, + "num_input_tokens_seen": 19180960, + "step": 8880 + }, + { + "epoch": 1.4494290375203915, + "grad_norm": 1.8711177110671997, + "learning_rate": 4.9693209852162184e-05, + "loss": 0.402, + "num_input_tokens_seen": 19192576, + "step": 8885 + }, + { + "epoch": 1.4502446982055464, + "grad_norm": 0.5487431883811951, + "learning_rate": 4.969209715509526e-05, + "loss": 0.099, + "num_input_tokens_seen": 19202752, + "step": 8890 + }, + { + "epoch": 1.4510603588907014, + "grad_norm": 0.4326847195625305, + "learning_rate": 4.969098245636167e-05, + "loss": 0.0707, + "num_input_tokens_seen": 19214048, + "step": 8895 + }, + { + "epoch": 1.4518760195758564, + "grad_norm": 2.1383697986602783, + "learning_rate": 4.968986575605175e-05, + "loss": 0.2908, + "num_input_tokens_seen": 19225056, + "step": 8900 + }, + { + "epoch": 1.4526916802610115, + "grad_norm": 3.8658246994018555, + "learning_rate": 4.968874705425604e-05, + "loss": 0.2181, + "num_input_tokens_seen": 19236480, + "step": 8905 + }, + { + "epoch": 1.4535073409461665, + "grad_norm": 1.6951333284378052, + "learning_rate": 4.968762635106522e-05, + "loss": 0.2015, + "num_input_tokens_seen": 19248192, + "step": 8910 + }, + { + "epoch": 1.4543230016313213, + "grad_norm": 0.4172704517841339, + "learning_rate": 4.9686503646570146e-05, + "loss": 0.1555, + "num_input_tokens_seen": 19258368, + "step": 8915 + }, + { + "epoch": 1.4551386623164764, + "grad_norm": 0.14649607241153717, + "learning_rate": 4.9685378940861826e-05, + "loss": 0.0852, + "num_input_tokens_seen": 19269088, + "step": 8920 + }, + { + "epoch": 1.4559543230016314, + "grad_norm": 1.296694278717041, + "learning_rate": 4.9684252234031446e-05, + "loss": 0.1346, + "num_input_tokens_seen": 19280384, + "step": 8925 + }, + { + "epoch": 1.4567699836867862, + "grad_norm": 0.18951445817947388, + "learning_rate": 4.968312352617033e-05, + "loss": 0.2175, + "num_input_tokens_seen": 19292192, + "step": 8930 + }, + { + "epoch": 1.4575856443719413, + "grad_norm": 1.1053181886672974, + "learning_rate": 4.968199281736997e-05, + "loss": 0.1384, + "num_input_tokens_seen": 19303904, + "step": 8935 + }, + { + "epoch": 1.4584013050570963, + "grad_norm": 0.5205485820770264, + "learning_rate": 4.968086010772205e-05, + "loss": 0.1084, + "num_input_tokens_seen": 19315712, + "step": 8940 + }, + { + "epoch": 1.4592169657422511, + "grad_norm": 0.3917433023452759, + "learning_rate": 4.9679725397318375e-05, + "loss": 0.1991, + "num_input_tokens_seen": 19326848, + "step": 8945 + }, + { + "epoch": 1.4600326264274062, + "grad_norm": 3.2067322731018066, + "learning_rate": 4.967858868625094e-05, + "loss": 0.3383, + "num_input_tokens_seen": 19338496, + "step": 8950 + }, + { + "epoch": 1.4608482871125612, + "grad_norm": 1.1065832376480103, + "learning_rate": 4.967744997461188e-05, + "loss": 0.2726, + "num_input_tokens_seen": 19348992, + "step": 8955 + }, + { + "epoch": 1.461663947797716, + "grad_norm": 0.5372388362884521, + "learning_rate": 4.9676309262493513e-05, + "loss": 0.1745, + "num_input_tokens_seen": 19359680, + "step": 8960 + }, + { + "epoch": 1.462479608482871, + "grad_norm": 1.2104640007019043, + "learning_rate": 4.9675166549988314e-05, + "loss": 0.0938, + "num_input_tokens_seen": 19370848, + "step": 8965 + }, + { + "epoch": 1.463295269168026, + "grad_norm": 0.2635592222213745, + "learning_rate": 4.9674021837188917e-05, + "loss": 0.1332, + "num_input_tokens_seen": 19381888, + "step": 8970 + }, + { + "epoch": 1.4641109298531811, + "grad_norm": 0.06277162581682205, + "learning_rate": 4.967287512418811e-05, + "loss": 0.095, + "num_input_tokens_seen": 19392768, + "step": 8975 + }, + { + "epoch": 1.4649265905383362, + "grad_norm": 0.21775400638580322, + "learning_rate": 4.9671726411078864e-05, + "loss": 0.0645, + "num_input_tokens_seen": 19403488, + "step": 8980 + }, + { + "epoch": 1.465742251223491, + "grad_norm": 0.49373579025268555, + "learning_rate": 4.967057569795428e-05, + "loss": 0.2339, + "num_input_tokens_seen": 19414656, + "step": 8985 + }, + { + "epoch": 1.466557911908646, + "grad_norm": 0.29496142268180847, + "learning_rate": 4.966942298490767e-05, + "loss": 0.2418, + "num_input_tokens_seen": 19424960, + "step": 8990 + }, + { + "epoch": 1.467373572593801, + "grad_norm": 2.0113983154296875, + "learning_rate": 4.966826827203245e-05, + "loss": 0.146, + "num_input_tokens_seen": 19436512, + "step": 8995 + }, + { + "epoch": 1.468189233278956, + "grad_norm": 2.2392690181732178, + "learning_rate": 4.966711155942223e-05, + "loss": 0.1537, + "num_input_tokens_seen": 19447808, + "step": 9000 + }, + { + "epoch": 1.469004893964111, + "grad_norm": 0.40393295884132385, + "learning_rate": 4.966595284717081e-05, + "loss": 0.0763, + "num_input_tokens_seen": 19457984, + "step": 9005 + }, + { + "epoch": 1.469820554649266, + "grad_norm": 0.23018547892570496, + "learning_rate": 4.966479213537207e-05, + "loss": 0.0843, + "num_input_tokens_seen": 19467680, + "step": 9010 + }, + { + "epoch": 1.4706362153344208, + "grad_norm": 0.49305325746536255, + "learning_rate": 4.966362942412015e-05, + "loss": 0.1547, + "num_input_tokens_seen": 19475968, + "step": 9015 + }, + { + "epoch": 1.4714518760195758, + "grad_norm": 0.7353869080543518, + "learning_rate": 4.9662464713509285e-05, + "loss": 0.2338, + "num_input_tokens_seen": 19487328, + "step": 9020 + }, + { + "epoch": 1.4722675367047309, + "grad_norm": 0.10189709812402725, + "learning_rate": 4.966129800363389e-05, + "loss": 0.0365, + "num_input_tokens_seen": 19497280, + "step": 9025 + }, + { + "epoch": 1.4730831973898857, + "grad_norm": 0.29267412424087524, + "learning_rate": 4.9660129294588554e-05, + "loss": 0.1735, + "num_input_tokens_seen": 19507264, + "step": 9030 + }, + { + "epoch": 1.4738988580750407, + "grad_norm": 0.6233241558074951, + "learning_rate": 4.965895858646801e-05, + "loss": 0.1096, + "num_input_tokens_seen": 19518080, + "step": 9035 + }, + { + "epoch": 1.4747145187601958, + "grad_norm": 0.7910251617431641, + "learning_rate": 4.9657785879367166e-05, + "loss": 0.1122, + "num_input_tokens_seen": 19528288, + "step": 9040 + }, + { + "epoch": 1.4755301794453508, + "grad_norm": 1.2512730360031128, + "learning_rate": 4.965661117338108e-05, + "loss": 0.1439, + "num_input_tokens_seen": 19539072, + "step": 9045 + }, + { + "epoch": 1.4763458401305056, + "grad_norm": 1.572228193283081, + "learning_rate": 4.9655434468605e-05, + "loss": 0.2325, + "num_input_tokens_seen": 19550496, + "step": 9050 + }, + { + "epoch": 1.4771615008156607, + "grad_norm": 0.1463225930929184, + "learning_rate": 4.9654255765134294e-05, + "loss": 0.0731, + "num_input_tokens_seen": 19562752, + "step": 9055 + }, + { + "epoch": 1.4779771615008157, + "grad_norm": 0.2149760127067566, + "learning_rate": 4.965307506306452e-05, + "loss": 0.2358, + "num_input_tokens_seen": 19572512, + "step": 9060 + }, + { + "epoch": 1.4787928221859707, + "grad_norm": 0.18090412020683289, + "learning_rate": 4.965189236249139e-05, + "loss": 0.1767, + "num_input_tokens_seen": 19582752, + "step": 9065 + }, + { + "epoch": 1.4796084828711256, + "grad_norm": 2.174182653427124, + "learning_rate": 4.9650707663510785e-05, + "loss": 0.2044, + "num_input_tokens_seen": 19593184, + "step": 9070 + }, + { + "epoch": 1.4804241435562806, + "grad_norm": 0.36408257484436035, + "learning_rate": 4.9649520966218744e-05, + "loss": 0.0459, + "num_input_tokens_seen": 19604768, + "step": 9075 + }, + { + "epoch": 1.4812398042414356, + "grad_norm": 0.33235734701156616, + "learning_rate": 4.9648332270711463e-05, + "loss": 0.1169, + "num_input_tokens_seen": 19616576, + "step": 9080 + }, + { + "epoch": 1.4820554649265905, + "grad_norm": 0.19606205821037292, + "learning_rate": 4.96471415770853e-05, + "loss": 0.0446, + "num_input_tokens_seen": 19628384, + "step": 9085 + }, + { + "epoch": 1.4828711256117455, + "grad_norm": 0.7749315500259399, + "learning_rate": 4.964594888543678e-05, + "loss": 0.1177, + "num_input_tokens_seen": 19639136, + "step": 9090 + }, + { + "epoch": 1.4836867862969005, + "grad_norm": 1.4512215852737427, + "learning_rate": 4.9644754195862597e-05, + "loss": 0.0996, + "num_input_tokens_seen": 19649856, + "step": 9095 + }, + { + "epoch": 1.4845024469820554, + "grad_norm": 0.5395937561988831, + "learning_rate": 4.964355750845959e-05, + "loss": 0.1312, + "num_input_tokens_seen": 19661120, + "step": 9100 + }, + { + "epoch": 1.4853181076672104, + "grad_norm": 1.6249747276306152, + "learning_rate": 4.9642358823324776e-05, + "loss": 0.1377, + "num_input_tokens_seen": 19671648, + "step": 9105 + }, + { + "epoch": 1.4861337683523654, + "grad_norm": 0.23380792140960693, + "learning_rate": 4.964115814055531e-05, + "loss": 0.0968, + "num_input_tokens_seen": 19683584, + "step": 9110 + }, + { + "epoch": 1.4869494290375203, + "grad_norm": 0.8709216713905334, + "learning_rate": 4.963995546024854e-05, + "loss": 0.2856, + "num_input_tokens_seen": 19694080, + "step": 9115 + }, + { + "epoch": 1.4877650897226753, + "grad_norm": 1.859592080116272, + "learning_rate": 4.963875078250197e-05, + "loss": 0.3416, + "num_input_tokens_seen": 19705824, + "step": 9120 + }, + { + "epoch": 1.4885807504078303, + "grad_norm": 1.4653548002243042, + "learning_rate": 4.963754410741324e-05, + "loss": 0.412, + "num_input_tokens_seen": 19717056, + "step": 9125 + }, + { + "epoch": 1.4893964110929854, + "grad_norm": 0.6876802444458008, + "learning_rate": 4.9636335435080174e-05, + "loss": 0.0869, + "num_input_tokens_seen": 19728512, + "step": 9130 + }, + { + "epoch": 1.4902120717781404, + "grad_norm": 1.0502352714538574, + "learning_rate": 4.963512476560075e-05, + "loss": 0.076, + "num_input_tokens_seen": 19740768, + "step": 9135 + }, + { + "epoch": 1.4910277324632952, + "grad_norm": 1.7944258451461792, + "learning_rate": 4.963391209907312e-05, + "loss": 0.203, + "num_input_tokens_seen": 19752576, + "step": 9140 + }, + { + "epoch": 1.4918433931484503, + "grad_norm": 0.07644007354974747, + "learning_rate": 4.9632697435595585e-05, + "loss": 0.0431, + "num_input_tokens_seen": 19764064, + "step": 9145 + }, + { + "epoch": 1.4926590538336053, + "grad_norm": 0.12093212455511093, + "learning_rate": 4.96314807752666e-05, + "loss": 0.0915, + "num_input_tokens_seen": 19776608, + "step": 9150 + }, + { + "epoch": 1.4934747145187601, + "grad_norm": 0.17137907445430756, + "learning_rate": 4.963026211818482e-05, + "loss": 0.1878, + "num_input_tokens_seen": 19787488, + "step": 9155 + }, + { + "epoch": 1.4942903752039152, + "grad_norm": 0.543830156326294, + "learning_rate": 4.962904146444901e-05, + "loss": 0.2602, + "num_input_tokens_seen": 19798240, + "step": 9160 + }, + { + "epoch": 1.4951060358890702, + "grad_norm": 0.07553970068693161, + "learning_rate": 4.962781881415814e-05, + "loss": 0.278, + "num_input_tokens_seen": 19808384, + "step": 9165 + }, + { + "epoch": 1.495921696574225, + "grad_norm": 0.549642026424408, + "learning_rate": 4.962659416741131e-05, + "loss": 0.1705, + "num_input_tokens_seen": 19817696, + "step": 9170 + }, + { + "epoch": 1.49673735725938, + "grad_norm": 0.5360369682312012, + "learning_rate": 4.962536752430781e-05, + "loss": 0.1487, + "num_input_tokens_seen": 19828384, + "step": 9175 + }, + { + "epoch": 1.497553017944535, + "grad_norm": 0.29280364513397217, + "learning_rate": 4.962413888494706e-05, + "loss": 0.1137, + "num_input_tokens_seen": 19840704, + "step": 9180 + }, + { + "epoch": 1.49836867862969, + "grad_norm": 0.5337381958961487, + "learning_rate": 4.9622908249428676e-05, + "loss": 0.1559, + "num_input_tokens_seen": 19851648, + "step": 9185 + }, + { + "epoch": 1.499184339314845, + "grad_norm": 2.0410237312316895, + "learning_rate": 4.962167561785241e-05, + "loss": 0.133, + "num_input_tokens_seen": 19861568, + "step": 9190 + }, + { + "epoch": 1.5, + "grad_norm": 0.10696162283420563, + "learning_rate": 4.962044099031819e-05, + "loss": 0.0889, + "num_input_tokens_seen": 19871232, + "step": 9195 + }, + { + "epoch": 1.5, + "eval_loss": 0.1495753973722458, + "eval_runtime": 132.2481, + "eval_samples_per_second": 20.605, + "eval_steps_per_second": 5.157, + "num_input_tokens_seen": 19871232, + "step": 9195 + }, + { + "epoch": 1.5008156606851548, + "grad_norm": 1.2080838680267334, + "learning_rate": 4.9619204366926106e-05, + "loss": 0.0565, + "num_input_tokens_seen": 19882400, + "step": 9200 + }, + { + "epoch": 1.50163132137031, + "grad_norm": 0.41133129596710205, + "learning_rate": 4.96179657477764e-05, + "loss": 0.1433, + "num_input_tokens_seen": 19893824, + "step": 9205 + }, + { + "epoch": 1.502446982055465, + "grad_norm": 0.13738945126533508, + "learning_rate": 4.961672513296948e-05, + "loss": 0.098, + "num_input_tokens_seen": 19905280, + "step": 9210 + }, + { + "epoch": 1.50326264274062, + "grad_norm": 0.4024055004119873, + "learning_rate": 4.9615482522605915e-05, + "loss": 0.0493, + "num_input_tokens_seen": 19916032, + "step": 9215 + }, + { + "epoch": 1.504078303425775, + "grad_norm": 0.390259712934494, + "learning_rate": 4.9614237916786434e-05, + "loss": 0.0965, + "num_input_tokens_seen": 19927232, + "step": 9220 + }, + { + "epoch": 1.5048939641109298, + "grad_norm": 0.6851797103881836, + "learning_rate": 4.961299131561194e-05, + "loss": 0.0253, + "num_input_tokens_seen": 19936992, + "step": 9225 + }, + { + "epoch": 1.5057096247960848, + "grad_norm": 0.4839438199996948, + "learning_rate": 4.961174271918349e-05, + "loss": 0.1637, + "num_input_tokens_seen": 19948256, + "step": 9230 + }, + { + "epoch": 1.5065252854812399, + "grad_norm": 0.6803037524223328, + "learning_rate": 4.961049212760229e-05, + "loss": 0.1072, + "num_input_tokens_seen": 19960416, + "step": 9235 + }, + { + "epoch": 1.5073409461663947, + "grad_norm": 1.012634038925171, + "learning_rate": 4.960923954096972e-05, + "loss": 0.1085, + "num_input_tokens_seen": 19970656, + "step": 9240 + }, + { + "epoch": 1.5081566068515497, + "grad_norm": 0.05316442251205444, + "learning_rate": 4.960798495938734e-05, + "loss": 0.1011, + "num_input_tokens_seen": 19982176, + "step": 9245 + }, + { + "epoch": 1.5089722675367048, + "grad_norm": 1.71255624294281, + "learning_rate": 4.960672838295683e-05, + "loss": 0.2674, + "num_input_tokens_seen": 19991968, + "step": 9250 + }, + { + "epoch": 1.5097879282218596, + "grad_norm": 1.4141844511032104, + "learning_rate": 4.960546981178007e-05, + "loss": 0.2149, + "num_input_tokens_seen": 20003968, + "step": 9255 + }, + { + "epoch": 1.5106035889070146, + "grad_norm": 1.2355879545211792, + "learning_rate": 4.9604209245959076e-05, + "loss": 0.1005, + "num_input_tokens_seen": 20014784, + "step": 9260 + }, + { + "epoch": 1.5114192495921697, + "grad_norm": 0.4144745171070099, + "learning_rate": 4.960294668559604e-05, + "loss": 0.2639, + "num_input_tokens_seen": 20024192, + "step": 9265 + }, + { + "epoch": 1.5122349102773245, + "grad_norm": 0.2137766182422638, + "learning_rate": 4.960168213079331e-05, + "loss": 0.0684, + "num_input_tokens_seen": 20034368, + "step": 9270 + }, + { + "epoch": 1.5130505709624797, + "grad_norm": 3.488988161087036, + "learning_rate": 4.9600415581653406e-05, + "loss": 0.2013, + "num_input_tokens_seen": 20045216, + "step": 9275 + }, + { + "epoch": 1.5138662316476346, + "grad_norm": 0.32379117608070374, + "learning_rate": 4.9599147038278984e-05, + "loss": 0.1101, + "num_input_tokens_seen": 20056928, + "step": 9280 + }, + { + "epoch": 1.5146818923327896, + "grad_norm": 1.9183076620101929, + "learning_rate": 4.95978765007729e-05, + "loss": 0.1871, + "num_input_tokens_seen": 20067520, + "step": 9285 + }, + { + "epoch": 1.5154975530179446, + "grad_norm": 0.1402137130498886, + "learning_rate": 4.959660396923813e-05, + "loss": 0.0566, + "num_input_tokens_seen": 20077568, + "step": 9290 + }, + { + "epoch": 1.5163132137030995, + "grad_norm": 0.7633365988731384, + "learning_rate": 4.9595329443777836e-05, + "loss": 0.284, + "num_input_tokens_seen": 20087584, + "step": 9295 + }, + { + "epoch": 1.5171288743882545, + "grad_norm": 0.49616220593452454, + "learning_rate": 4.959405292449535e-05, + "loss": 0.1271, + "num_input_tokens_seen": 20098880, + "step": 9300 + }, + { + "epoch": 1.5179445350734095, + "grad_norm": 2.778726577758789, + "learning_rate": 4.959277441149415e-05, + "loss": 0.2166, + "num_input_tokens_seen": 20109568, + "step": 9305 + }, + { + "epoch": 1.5187601957585644, + "grad_norm": 0.29544439911842346, + "learning_rate": 4.959149390487786e-05, + "loss": 0.0571, + "num_input_tokens_seen": 20121184, + "step": 9310 + }, + { + "epoch": 1.5195758564437194, + "grad_norm": 0.22960929572582245, + "learning_rate": 4.959021140475031e-05, + "loss": 0.1075, + "num_input_tokens_seen": 20132608, + "step": 9315 + }, + { + "epoch": 1.5203915171288744, + "grad_norm": 0.9171779751777649, + "learning_rate": 4.958892691121545e-05, + "loss": 0.0477, + "num_input_tokens_seen": 20143200, + "step": 9320 + }, + { + "epoch": 1.5212071778140293, + "grad_norm": 0.046947382390499115, + "learning_rate": 4.958764042437741e-05, + "loss": 0.109, + "num_input_tokens_seen": 20153344, + "step": 9325 + }, + { + "epoch": 1.5220228384991843, + "grad_norm": 0.12833619117736816, + "learning_rate": 4.958635194434048e-05, + "loss": 0.1385, + "num_input_tokens_seen": 20164288, + "step": 9330 + }, + { + "epoch": 1.5228384991843393, + "grad_norm": 0.23062317073345184, + "learning_rate": 4.958506147120912e-05, + "loss": 0.1118, + "num_input_tokens_seen": 20174880, + "step": 9335 + }, + { + "epoch": 1.5236541598694942, + "grad_norm": 0.18649493157863617, + "learning_rate": 4.958376900508792e-05, + "loss": 0.122, + "num_input_tokens_seen": 20184960, + "step": 9340 + }, + { + "epoch": 1.5244698205546494, + "grad_norm": 0.26624226570129395, + "learning_rate": 4.958247454608167e-05, + "loss": 0.0858, + "num_input_tokens_seen": 20194464, + "step": 9345 + }, + { + "epoch": 1.5252854812398042, + "grad_norm": 0.3583889901638031, + "learning_rate": 4.958117809429531e-05, + "loss": 0.3853, + "num_input_tokens_seen": 20204352, + "step": 9350 + }, + { + "epoch": 1.5261011419249593, + "grad_norm": 0.3248310387134552, + "learning_rate": 4.9579879649833925e-05, + "loss": 0.0895, + "num_input_tokens_seen": 20214816, + "step": 9355 + }, + { + "epoch": 1.5269168026101143, + "grad_norm": 2.3007090091705322, + "learning_rate": 4.957857921280279e-05, + "loss": 0.2512, + "num_input_tokens_seen": 20225664, + "step": 9360 + }, + { + "epoch": 1.5277324632952691, + "grad_norm": 1.0970346927642822, + "learning_rate": 4.9577276783307296e-05, + "loss": 0.0791, + "num_input_tokens_seen": 20236672, + "step": 9365 + }, + { + "epoch": 1.5285481239804242, + "grad_norm": 1.7510260343551636, + "learning_rate": 4.9575972361453046e-05, + "loss": 0.2107, + "num_input_tokens_seen": 20246784, + "step": 9370 + }, + { + "epoch": 1.5293637846655792, + "grad_norm": 0.1852952390909195, + "learning_rate": 4.957466594734579e-05, + "loss": 0.0936, + "num_input_tokens_seen": 20257088, + "step": 9375 + }, + { + "epoch": 1.530179445350734, + "grad_norm": 0.10161633044481277, + "learning_rate": 4.9573357541091414e-05, + "loss": 0.0475, + "num_input_tokens_seen": 20267520, + "step": 9380 + }, + { + "epoch": 1.530995106035889, + "grad_norm": 0.1877739280462265, + "learning_rate": 4.957204714279599e-05, + "loss": 0.0612, + "num_input_tokens_seen": 20277568, + "step": 9385 + }, + { + "epoch": 1.531810766721044, + "grad_norm": 0.6494584083557129, + "learning_rate": 4.957073475256575e-05, + "loss": 0.0968, + "num_input_tokens_seen": 20288768, + "step": 9390 + }, + { + "epoch": 1.532626427406199, + "grad_norm": 0.5662254095077515, + "learning_rate": 4.9569420370507087e-05, + "loss": 0.1272, + "num_input_tokens_seen": 20299968, + "step": 9395 + }, + { + "epoch": 1.533442088091354, + "grad_norm": 1.2351518869400024, + "learning_rate": 4.956810399672653e-05, + "loss": 0.3059, + "num_input_tokens_seen": 20310592, + "step": 9400 + }, + { + "epoch": 1.534257748776509, + "grad_norm": 1.697769045829773, + "learning_rate": 4.956678563133082e-05, + "loss": 0.3561, + "num_input_tokens_seen": 20321088, + "step": 9405 + }, + { + "epoch": 1.5350734094616638, + "grad_norm": 0.1440660059452057, + "learning_rate": 4.956546527442681e-05, + "loss": 0.1573, + "num_input_tokens_seen": 20331136, + "step": 9410 + }, + { + "epoch": 1.535889070146819, + "grad_norm": 1.1753937005996704, + "learning_rate": 4.956414292612154e-05, + "loss": 0.179, + "num_input_tokens_seen": 20343136, + "step": 9415 + }, + { + "epoch": 1.536704730831974, + "grad_norm": 0.29920467734336853, + "learning_rate": 4.9562818586522206e-05, + "loss": 0.1666, + "num_input_tokens_seen": 20354656, + "step": 9420 + }, + { + "epoch": 1.5375203915171287, + "grad_norm": 0.18486274778842926, + "learning_rate": 4.9561492255736175e-05, + "loss": 0.031, + "num_input_tokens_seen": 20365856, + "step": 9425 + }, + { + "epoch": 1.538336052202284, + "grad_norm": 1.6574335098266602, + "learning_rate": 4.9560163933870954e-05, + "loss": 0.2931, + "num_input_tokens_seen": 20376064, + "step": 9430 + }, + { + "epoch": 1.5391517128874388, + "grad_norm": 0.6979897022247314, + "learning_rate": 4.9558833621034224e-05, + "loss": 0.0887, + "num_input_tokens_seen": 20387104, + "step": 9435 + }, + { + "epoch": 1.5399673735725938, + "grad_norm": 1.5539965629577637, + "learning_rate": 4.955750131733383e-05, + "loss": 0.1297, + "num_input_tokens_seen": 20398112, + "step": 9440 + }, + { + "epoch": 1.5407830342577489, + "grad_norm": 0.47621679306030273, + "learning_rate": 4.955616702287778e-05, + "loss": 0.0533, + "num_input_tokens_seen": 20409152, + "step": 9445 + }, + { + "epoch": 1.5415986949429037, + "grad_norm": 0.3893702030181885, + "learning_rate": 4.9554830737774226e-05, + "loss": 0.0944, + "num_input_tokens_seen": 20420704, + "step": 9450 + }, + { + "epoch": 1.5424143556280587, + "grad_norm": 0.380730539560318, + "learning_rate": 4.955349246213151e-05, + "loss": 0.1056, + "num_input_tokens_seen": 20431456, + "step": 9455 + }, + { + "epoch": 1.5432300163132138, + "grad_norm": 0.8857797980308533, + "learning_rate": 4.9552152196058114e-05, + "loss": 0.1442, + "num_input_tokens_seen": 20442880, + "step": 9460 + }, + { + "epoch": 1.5440456769983686, + "grad_norm": 0.7813159227371216, + "learning_rate": 4.955080993966268e-05, + "loss": 0.0493, + "num_input_tokens_seen": 20454624, + "step": 9465 + }, + { + "epoch": 1.5448613376835236, + "grad_norm": 0.6499303579330444, + "learning_rate": 4.954946569305402e-05, + "loss": 0.1499, + "num_input_tokens_seen": 20465824, + "step": 9470 + }, + { + "epoch": 1.5456769983686787, + "grad_norm": 0.26304349303245544, + "learning_rate": 4.9548119456341114e-05, + "loss": 0.1452, + "num_input_tokens_seen": 20477088, + "step": 9475 + }, + { + "epoch": 1.5464926590538335, + "grad_norm": 0.03185722231864929, + "learning_rate": 4.954677122963309e-05, + "loss": 0.0557, + "num_input_tokens_seen": 20487296, + "step": 9480 + }, + { + "epoch": 1.5473083197389887, + "grad_norm": 0.1737225502729416, + "learning_rate": 4.954542101303924e-05, + "loss": 0.18, + "num_input_tokens_seen": 20498080, + "step": 9485 + }, + { + "epoch": 1.5481239804241436, + "grad_norm": 1.4441016912460327, + "learning_rate": 4.954406880666902e-05, + "loss": 0.2301, + "num_input_tokens_seen": 20509824, + "step": 9490 + }, + { + "epoch": 1.5489396411092984, + "grad_norm": 0.5287127494812012, + "learning_rate": 4.954271461063204e-05, + "loss": 0.3027, + "num_input_tokens_seen": 20520928, + "step": 9495 + }, + { + "epoch": 1.5497553017944536, + "grad_norm": 0.9144262075424194, + "learning_rate": 4.9541358425038095e-05, + "loss": 0.2492, + "num_input_tokens_seen": 20532288, + "step": 9500 + }, + { + "epoch": 1.5505709624796085, + "grad_norm": 0.5827791094779968, + "learning_rate": 4.954000024999711e-05, + "loss": 0.2535, + "num_input_tokens_seen": 20543456, + "step": 9505 + }, + { + "epoch": 1.5513866231647635, + "grad_norm": 0.7168580889701843, + "learning_rate": 4.9538640085619184e-05, + "loss": 0.0882, + "num_input_tokens_seen": 20554592, + "step": 9510 + }, + { + "epoch": 1.5522022838499185, + "grad_norm": 0.9091903567314148, + "learning_rate": 4.953727793201459e-05, + "loss": 0.113, + "num_input_tokens_seen": 20565504, + "step": 9515 + }, + { + "epoch": 1.5530179445350734, + "grad_norm": 1.2968186140060425, + "learning_rate": 4.953591378929375e-05, + "loss": 0.2994, + "num_input_tokens_seen": 20576896, + "step": 9520 + }, + { + "epoch": 1.5538336052202284, + "grad_norm": 0.768661379814148, + "learning_rate": 4.953454765756724e-05, + "loss": 0.1123, + "num_input_tokens_seen": 20586720, + "step": 9525 + }, + { + "epoch": 1.5546492659053834, + "grad_norm": 1.616100788116455, + "learning_rate": 4.953317953694582e-05, + "loss": 0.1474, + "num_input_tokens_seen": 20597376, + "step": 9530 + }, + { + "epoch": 1.5554649265905383, + "grad_norm": 1.1429634094238281, + "learning_rate": 4.953180942754037e-05, + "loss": 0.0751, + "num_input_tokens_seen": 20608608, + "step": 9535 + }, + { + "epoch": 1.5562805872756933, + "grad_norm": 0.5231560468673706, + "learning_rate": 4.9530437329461987e-05, + "loss": 0.1062, + "num_input_tokens_seen": 20620128, + "step": 9540 + }, + { + "epoch": 1.5570962479608483, + "grad_norm": 0.14536641538143158, + "learning_rate": 4.952906324282188e-05, + "loss": 0.0678, + "num_input_tokens_seen": 20631200, + "step": 9545 + }, + { + "epoch": 1.5579119086460032, + "grad_norm": 0.0671345517039299, + "learning_rate": 4.952768716773145e-05, + "loss": 0.0594, + "num_input_tokens_seen": 20640352, + "step": 9550 + }, + { + "epoch": 1.5587275693311582, + "grad_norm": 1.3912783861160278, + "learning_rate": 4.9526309104302246e-05, + "loss": 0.2762, + "num_input_tokens_seen": 20652832, + "step": 9555 + }, + { + "epoch": 1.5595432300163132, + "grad_norm": 0.4908728003501892, + "learning_rate": 4.952492905264599e-05, + "loss": 0.0737, + "num_input_tokens_seen": 20663488, + "step": 9560 + }, + { + "epoch": 1.560358890701468, + "grad_norm": 0.24295347929000854, + "learning_rate": 4.9523547012874524e-05, + "loss": 0.0646, + "num_input_tokens_seen": 20674464, + "step": 9565 + }, + { + "epoch": 1.5611745513866233, + "grad_norm": 0.5301098227500916, + "learning_rate": 4.952216298509993e-05, + "loss": 0.1303, + "num_input_tokens_seen": 20685760, + "step": 9570 + }, + { + "epoch": 1.5619902120717781, + "grad_norm": 2.7111635208129883, + "learning_rate": 4.952077696943437e-05, + "loss": 0.2442, + "num_input_tokens_seen": 20697280, + "step": 9575 + }, + { + "epoch": 1.5628058727569332, + "grad_norm": 0.04234471544623375, + "learning_rate": 4.951938896599021e-05, + "loss": 0.1222, + "num_input_tokens_seen": 20708000, + "step": 9580 + }, + { + "epoch": 1.5636215334420882, + "grad_norm": 0.3402857184410095, + "learning_rate": 4.951799897487997e-05, + "loss": 0.1231, + "num_input_tokens_seen": 20719424, + "step": 9585 + }, + { + "epoch": 1.564437194127243, + "grad_norm": 1.4403290748596191, + "learning_rate": 4.951660699621633e-05, + "loss": 0.1699, + "num_input_tokens_seen": 20728096, + "step": 9590 + }, + { + "epoch": 1.565252854812398, + "grad_norm": 0.7373093366622925, + "learning_rate": 4.9515213030112135e-05, + "loss": 0.2208, + "num_input_tokens_seen": 20738816, + "step": 9595 + }, + { + "epoch": 1.566068515497553, + "grad_norm": 0.20920532941818237, + "learning_rate": 4.951381707668038e-05, + "loss": 0.1606, + "num_input_tokens_seen": 20749408, + "step": 9600 + }, + { + "epoch": 1.566884176182708, + "grad_norm": 0.47896191477775574, + "learning_rate": 4.951241913603423e-05, + "loss": 0.0416, + "num_input_tokens_seen": 20759200, + "step": 9605 + }, + { + "epoch": 1.567699836867863, + "grad_norm": 0.6874516606330872, + "learning_rate": 4.9511019208287014e-05, + "loss": 0.0909, + "num_input_tokens_seen": 20770560, + "step": 9610 + }, + { + "epoch": 1.568515497553018, + "grad_norm": 0.6775272488594055, + "learning_rate": 4.9509617293552215e-05, + "loss": 0.1533, + "num_input_tokens_seen": 20781056, + "step": 9615 + }, + { + "epoch": 1.5693311582381728, + "grad_norm": 1.1651034355163574, + "learning_rate": 4.9508213391943467e-05, + "loss": 0.0839, + "num_input_tokens_seen": 20791648, + "step": 9620 + }, + { + "epoch": 1.5701468189233279, + "grad_norm": 0.9377660751342773, + "learning_rate": 4.950680750357459e-05, + "loss": 0.1667, + "num_input_tokens_seen": 20804128, + "step": 9625 + }, + { + "epoch": 1.570962479608483, + "grad_norm": 1.8491570949554443, + "learning_rate": 4.950539962855956e-05, + "loss": 0.1251, + "num_input_tokens_seen": 20814752, + "step": 9630 + }, + { + "epoch": 1.5717781402936377, + "grad_norm": 0.5825570821762085, + "learning_rate": 4.9503989767012493e-05, + "loss": 0.16, + "num_input_tokens_seen": 20826720, + "step": 9635 + }, + { + "epoch": 1.572593800978793, + "grad_norm": 0.5232163071632385, + "learning_rate": 4.950257791904768e-05, + "loss": 0.0682, + "num_input_tokens_seen": 20837536, + "step": 9640 + }, + { + "epoch": 1.5734094616639478, + "grad_norm": 2.3789517879486084, + "learning_rate": 4.950116408477958e-05, + "loss": 0.1683, + "num_input_tokens_seen": 20847744, + "step": 9645 + }, + { + "epoch": 1.5742251223491026, + "grad_norm": 1.4860109090805054, + "learning_rate": 4.94997482643228e-05, + "loss": 0.3066, + "num_input_tokens_seen": 20859136, + "step": 9650 + }, + { + "epoch": 1.5750407830342579, + "grad_norm": 2.4730606079101562, + "learning_rate": 4.949833045779212e-05, + "loss": 0.1892, + "num_input_tokens_seen": 20868832, + "step": 9655 + }, + { + "epoch": 1.5758564437194127, + "grad_norm": 0.6081188321113586, + "learning_rate": 4.9496910665302467e-05, + "loss": 0.1066, + "num_input_tokens_seen": 20878080, + "step": 9660 + }, + { + "epoch": 1.5766721044045677, + "grad_norm": 0.22755779325962067, + "learning_rate": 4.949548888696893e-05, + "loss": 0.144, + "num_input_tokens_seen": 20889600, + "step": 9665 + }, + { + "epoch": 1.5774877650897228, + "grad_norm": 1.3751211166381836, + "learning_rate": 4.9494065122906787e-05, + "loss": 0.2198, + "num_input_tokens_seen": 20899392, + "step": 9670 + }, + { + "epoch": 1.5783034257748776, + "grad_norm": 0.5511062145233154, + "learning_rate": 4.9492639373231436e-05, + "loss": 0.2023, + "num_input_tokens_seen": 20909888, + "step": 9675 + }, + { + "epoch": 1.5791190864600326, + "grad_norm": 1.0774787664413452, + "learning_rate": 4.949121163805847e-05, + "loss": 0.0548, + "num_input_tokens_seen": 20920032, + "step": 9680 + }, + { + "epoch": 1.5799347471451877, + "grad_norm": 0.07253263890743256, + "learning_rate": 4.948978191750362e-05, + "loss": 0.1983, + "num_input_tokens_seen": 20931136, + "step": 9685 + }, + { + "epoch": 1.5807504078303425, + "grad_norm": 0.07003948837518692, + "learning_rate": 4.948835021168278e-05, + "loss": 0.0865, + "num_input_tokens_seen": 20941792, + "step": 9690 + }, + { + "epoch": 1.5815660685154975, + "grad_norm": 1.1390676498413086, + "learning_rate": 4.9486916520712026e-05, + "loss": 0.135, + "num_input_tokens_seen": 20952672, + "step": 9695 + }, + { + "epoch": 1.5823817292006526, + "grad_norm": 0.3242737948894501, + "learning_rate": 4.948548084470757e-05, + "loss": 0.0539, + "num_input_tokens_seen": 20964480, + "step": 9700 + }, + { + "epoch": 1.5831973898858074, + "grad_norm": 0.28581514954566956, + "learning_rate": 4.94840431837858e-05, + "loss": 0.0603, + "num_input_tokens_seen": 20975392, + "step": 9705 + }, + { + "epoch": 1.5840130505709626, + "grad_norm": 0.12777112424373627, + "learning_rate": 4.948260353806326e-05, + "loss": 0.1669, + "num_input_tokens_seen": 20985344, + "step": 9710 + }, + { + "epoch": 1.5848287112561175, + "grad_norm": 0.972244381904602, + "learning_rate": 4.948116190765665e-05, + "loss": 0.1976, + "num_input_tokens_seen": 20995456, + "step": 9715 + }, + { + "epoch": 1.5856443719412723, + "grad_norm": 0.1589186042547226, + "learning_rate": 4.9479718292682846e-05, + "loss": 0.057, + "num_input_tokens_seen": 21006816, + "step": 9720 + }, + { + "epoch": 1.5864600326264275, + "grad_norm": 1.5028563737869263, + "learning_rate": 4.9478272693258866e-05, + "loss": 0.1877, + "num_input_tokens_seen": 21017344, + "step": 9725 + }, + { + "epoch": 1.5872756933115824, + "grad_norm": 0.8287039399147034, + "learning_rate": 4.94768251095019e-05, + "loss": 0.1132, + "num_input_tokens_seen": 21027040, + "step": 9730 + }, + { + "epoch": 1.5880913539967374, + "grad_norm": 0.6289673447608948, + "learning_rate": 4.9475375541529294e-05, + "loss": 0.0676, + "num_input_tokens_seen": 21037472, + "step": 9735 + }, + { + "epoch": 1.5889070146818924, + "grad_norm": 2.664294958114624, + "learning_rate": 4.947392398945856e-05, + "loss": 0.2786, + "num_input_tokens_seen": 21048800, + "step": 9740 + }, + { + "epoch": 1.5897226753670473, + "grad_norm": 1.306288480758667, + "learning_rate": 4.9472470453407374e-05, + "loss": 0.2767, + "num_input_tokens_seen": 21060672, + "step": 9745 + }, + { + "epoch": 1.5905383360522023, + "grad_norm": 0.13333579897880554, + "learning_rate": 4.947101493349355e-05, + "loss": 0.2598, + "num_input_tokens_seen": 21070880, + "step": 9750 + }, + { + "epoch": 1.5913539967373573, + "grad_norm": 0.9839193820953369, + "learning_rate": 4.94695574298351e-05, + "loss": 0.3102, + "num_input_tokens_seen": 21081792, + "step": 9755 + }, + { + "epoch": 1.5921696574225122, + "grad_norm": 0.2415851503610611, + "learning_rate": 4.946809794255016e-05, + "loss": 0.0967, + "num_input_tokens_seen": 21093280, + "step": 9760 + }, + { + "epoch": 1.5929853181076672, + "grad_norm": 0.8959052562713623, + "learning_rate": 4.946663647175706e-05, + "loss": 0.0904, + "num_input_tokens_seen": 21103264, + "step": 9765 + }, + { + "epoch": 1.5938009787928222, + "grad_norm": 0.16394798457622528, + "learning_rate": 4.946517301757426e-05, + "loss": 0.151, + "num_input_tokens_seen": 21113632, + "step": 9770 + }, + { + "epoch": 1.594616639477977, + "grad_norm": 1.1182037591934204, + "learning_rate": 4.94637075801204e-05, + "loss": 0.191, + "num_input_tokens_seen": 21124448, + "step": 9775 + }, + { + "epoch": 1.595432300163132, + "grad_norm": 1.0492414236068726, + "learning_rate": 4.946224015951427e-05, + "loss": 0.13, + "num_input_tokens_seen": 21135776, + "step": 9780 + }, + { + "epoch": 1.5962479608482871, + "grad_norm": 0.20796242356300354, + "learning_rate": 4.946077075587484e-05, + "loss": 0.2224, + "num_input_tokens_seen": 21147072, + "step": 9785 + }, + { + "epoch": 1.597063621533442, + "grad_norm": 0.20031939446926117, + "learning_rate": 4.945929936932122e-05, + "loss": 0.1214, + "num_input_tokens_seen": 21158848, + "step": 9790 + }, + { + "epoch": 1.5978792822185972, + "grad_norm": 0.7567790150642395, + "learning_rate": 4.945782599997269e-05, + "loss": 0.1098, + "num_input_tokens_seen": 21168448, + "step": 9795 + }, + { + "epoch": 1.598694942903752, + "grad_norm": 1.9094181060791016, + "learning_rate": 4.945635064794869e-05, + "loss": 0.1855, + "num_input_tokens_seen": 21179744, + "step": 9800 + }, + { + "epoch": 1.599510603588907, + "grad_norm": 1.5208712816238403, + "learning_rate": 4.94548733133688e-05, + "loss": 0.1807, + "num_input_tokens_seen": 21191904, + "step": 9805 + }, + { + "epoch": 1.600326264274062, + "grad_norm": 0.1920945644378662, + "learning_rate": 4.945339399635281e-05, + "loss": 0.1203, + "num_input_tokens_seen": 21202848, + "step": 9810 + }, + { + "epoch": 1.601141924959217, + "grad_norm": 0.1028672382235527, + "learning_rate": 4.945191269702062e-05, + "loss": 0.0716, + "num_input_tokens_seen": 21214304, + "step": 9815 + }, + { + "epoch": 1.601957585644372, + "grad_norm": 0.4249165654182434, + "learning_rate": 4.945042941549233e-05, + "loss": 0.1113, + "num_input_tokens_seen": 21224448, + "step": 9820 + }, + { + "epoch": 1.602773246329527, + "grad_norm": 2.1583309173583984, + "learning_rate": 4.944894415188815e-05, + "loss": 0.2607, + "num_input_tokens_seen": 21235392, + "step": 9825 + }, + { + "epoch": 1.6035889070146818, + "grad_norm": 0.985519528388977, + "learning_rate": 4.944745690632852e-05, + "loss": 0.1823, + "num_input_tokens_seen": 21246656, + "step": 9830 + }, + { + "epoch": 1.6044045676998369, + "grad_norm": 0.8249574899673462, + "learning_rate": 4.944596767893399e-05, + "loss": 0.0798, + "num_input_tokens_seen": 21257376, + "step": 9835 + }, + { + "epoch": 1.605220228384992, + "grad_norm": 0.5455015897750854, + "learning_rate": 4.944447646982529e-05, + "loss": 0.1298, + "num_input_tokens_seen": 21267456, + "step": 9840 + }, + { + "epoch": 1.6060358890701467, + "grad_norm": 0.8807271718978882, + "learning_rate": 4.9442983279123276e-05, + "loss": 0.1497, + "num_input_tokens_seen": 21279104, + "step": 9845 + }, + { + "epoch": 1.6068515497553018, + "grad_norm": 0.15970806777477264, + "learning_rate": 4.944148810694903e-05, + "loss": 0.2251, + "num_input_tokens_seen": 21291328, + "step": 9850 + }, + { + "epoch": 1.6076672104404568, + "grad_norm": 0.675246000289917, + "learning_rate": 4.9439990953423735e-05, + "loss": 0.0994, + "num_input_tokens_seen": 21302016, + "step": 9855 + }, + { + "epoch": 1.6084828711256116, + "grad_norm": 0.3762994706630707, + "learning_rate": 4.943849181866876e-05, + "loss": 0.1017, + "num_input_tokens_seen": 21312448, + "step": 9860 + }, + { + "epoch": 1.6092985318107669, + "grad_norm": 0.8290408253669739, + "learning_rate": 4.943699070280565e-05, + "loss": 0.0838, + "num_input_tokens_seen": 21323136, + "step": 9865 + }, + { + "epoch": 1.6101141924959217, + "grad_norm": 0.5779378414154053, + "learning_rate": 4.9435487605956084e-05, + "loss": 0.0839, + "num_input_tokens_seen": 21334368, + "step": 9870 + }, + { + "epoch": 1.6109298531810765, + "grad_norm": 1.1362475156784058, + "learning_rate": 4.94339825282419e-05, + "loss": 0.1657, + "num_input_tokens_seen": 21344992, + "step": 9875 + }, + { + "epoch": 1.6117455138662318, + "grad_norm": 0.2843942642211914, + "learning_rate": 4.943247546978512e-05, + "loss": 0.1201, + "num_input_tokens_seen": 21354688, + "step": 9880 + }, + { + "epoch": 1.6125611745513866, + "grad_norm": 2.1728832721710205, + "learning_rate": 4.943096643070791e-05, + "loss": 0.2832, + "num_input_tokens_seen": 21365696, + "step": 9885 + }, + { + "epoch": 1.6133768352365416, + "grad_norm": 1.555431604385376, + "learning_rate": 4.9429455411132596e-05, + "loss": 0.2512, + "num_input_tokens_seen": 21378240, + "step": 9890 + }, + { + "epoch": 1.6141924959216967, + "grad_norm": 2.1490399837493896, + "learning_rate": 4.942794241118167e-05, + "loss": 0.4715, + "num_input_tokens_seen": 21388992, + "step": 9895 + }, + { + "epoch": 1.6150081566068515, + "grad_norm": 0.5063406825065613, + "learning_rate": 4.9426427430977796e-05, + "loss": 0.1583, + "num_input_tokens_seen": 21399808, + "step": 9900 + }, + { + "epoch": 1.6158238172920065, + "grad_norm": 0.9175553321838379, + "learning_rate": 4.942491047064377e-05, + "loss": 0.0677, + "num_input_tokens_seen": 21409696, + "step": 9905 + }, + { + "epoch": 1.6166394779771616, + "grad_norm": 0.11037199944257736, + "learning_rate": 4.942339153030257e-05, + "loss": 0.0959, + "num_input_tokens_seen": 21420416, + "step": 9910 + }, + { + "epoch": 1.6174551386623164, + "grad_norm": 2.0186550617218018, + "learning_rate": 4.942187061007732e-05, + "loss": 0.3781, + "num_input_tokens_seen": 21431648, + "step": 9915 + }, + { + "epoch": 1.6182707993474714, + "grad_norm": 0.45069754123687744, + "learning_rate": 4.942034771009134e-05, + "loss": 0.2518, + "num_input_tokens_seen": 21441856, + "step": 9920 + }, + { + "epoch": 1.6190864600326265, + "grad_norm": 0.8029191493988037, + "learning_rate": 4.941882283046806e-05, + "loss": 0.1848, + "num_input_tokens_seen": 21452736, + "step": 9925 + }, + { + "epoch": 1.6199021207177813, + "grad_norm": 0.1578453779220581, + "learning_rate": 4.94172959713311e-05, + "loss": 0.1012, + "num_input_tokens_seen": 21463552, + "step": 9930 + }, + { + "epoch": 1.6207177814029365, + "grad_norm": 1.4020339250564575, + "learning_rate": 4.941576713280424e-05, + "loss": 0.2008, + "num_input_tokens_seen": 21475136, + "step": 9935 + }, + { + "epoch": 1.6215334420880914, + "grad_norm": 0.037428390234708786, + "learning_rate": 4.941423631501141e-05, + "loss": 0.1878, + "num_input_tokens_seen": 21485536, + "step": 9940 + }, + { + "epoch": 1.6223491027732462, + "grad_norm": 0.13916999101638794, + "learning_rate": 4.941270351807671e-05, + "loss": 0.0887, + "num_input_tokens_seen": 21496320, + "step": 9945 + }, + { + "epoch": 1.6231647634584014, + "grad_norm": 0.9028674364089966, + "learning_rate": 4.941116874212439e-05, + "loss": 0.1172, + "num_input_tokens_seen": 21506688, + "step": 9950 + }, + { + "epoch": 1.6239804241435563, + "grad_norm": 1.7922662496566772, + "learning_rate": 4.940963198727887e-05, + "loss": 0.308, + "num_input_tokens_seen": 21517184, + "step": 9955 + }, + { + "epoch": 1.6247960848287113, + "grad_norm": 0.27482205629348755, + "learning_rate": 4.940809325366473e-05, + "loss": 0.0782, + "num_input_tokens_seen": 21527104, + "step": 9960 + }, + { + "epoch": 1.6256117455138663, + "grad_norm": 1.8799694776535034, + "learning_rate": 4.9406552541406707e-05, + "loss": 0.2803, + "num_input_tokens_seen": 21537024, + "step": 9965 + }, + { + "epoch": 1.6264274061990212, + "grad_norm": 1.7185105085372925, + "learning_rate": 4.94050098506297e-05, + "loss": 0.1119, + "num_input_tokens_seen": 21547648, + "step": 9970 + }, + { + "epoch": 1.6272430668841762, + "grad_norm": 1.6852678060531616, + "learning_rate": 4.940346518145876e-05, + "loss": 0.1787, + "num_input_tokens_seen": 21558944, + "step": 9975 + }, + { + "epoch": 1.6280587275693312, + "grad_norm": 2.3451287746429443, + "learning_rate": 4.940191853401911e-05, + "loss": 0.3344, + "num_input_tokens_seen": 21569408, + "step": 9980 + }, + { + "epoch": 1.628874388254486, + "grad_norm": 1.089866280555725, + "learning_rate": 4.940036990843613e-05, + "loss": 0.1551, + "num_input_tokens_seen": 21580384, + "step": 9985 + }, + { + "epoch": 1.629690048939641, + "grad_norm": 0.8225944638252258, + "learning_rate": 4.9398819304835364e-05, + "loss": 0.1086, + "num_input_tokens_seen": 21591328, + "step": 9990 + }, + { + "epoch": 1.6305057096247961, + "grad_norm": 0.5891323685646057, + "learning_rate": 4.93972667233425e-05, + "loss": 0.0842, + "num_input_tokens_seen": 21603456, + "step": 9995 + }, + { + "epoch": 1.631321370309951, + "grad_norm": 0.5437803864479065, + "learning_rate": 4.9395712164083406e-05, + "loss": 0.2339, + "num_input_tokens_seen": 21613248, + "step": 10000 + }, + { + "epoch": 1.632137030995106, + "grad_norm": 0.3118618130683899, + "learning_rate": 4.93941556271841e-05, + "loss": 0.0669, + "num_input_tokens_seen": 21624256, + "step": 10005 + }, + { + "epoch": 1.632952691680261, + "grad_norm": 0.4542217552661896, + "learning_rate": 4.9392597112770765e-05, + "loss": 0.1009, + "num_input_tokens_seen": 21636032, + "step": 10010 + }, + { + "epoch": 1.6337683523654158, + "grad_norm": 0.8307268023490906, + "learning_rate": 4.939103662096974e-05, + "loss": 0.1153, + "num_input_tokens_seen": 21647264, + "step": 10015 + }, + { + "epoch": 1.634584013050571, + "grad_norm": 1.407177209854126, + "learning_rate": 4.938947415190754e-05, + "loss": 0.1077, + "num_input_tokens_seen": 21658496, + "step": 10020 + }, + { + "epoch": 1.635399673735726, + "grad_norm": 0.7017089128494263, + "learning_rate": 4.93879097057108e-05, + "loss": 0.1896, + "num_input_tokens_seen": 21669248, + "step": 10025 + }, + { + "epoch": 1.636215334420881, + "grad_norm": 0.8109764456748962, + "learning_rate": 4.938634328250636e-05, + "loss": 0.1401, + "num_input_tokens_seen": 21679840, + "step": 10030 + }, + { + "epoch": 1.637030995106036, + "grad_norm": 0.21435219049453735, + "learning_rate": 4.93847748824212e-05, + "loss": 0.1602, + "num_input_tokens_seen": 21691904, + "step": 10035 + }, + { + "epoch": 1.6378466557911908, + "grad_norm": 0.26665621995925903, + "learning_rate": 4.938320450558246e-05, + "loss": 0.0513, + "num_input_tokens_seen": 21701408, + "step": 10040 + }, + { + "epoch": 1.6386623164763459, + "grad_norm": 0.31883466243743896, + "learning_rate": 4.938163215211745e-05, + "loss": 0.0552, + "num_input_tokens_seen": 21712256, + "step": 10045 + }, + { + "epoch": 1.639477977161501, + "grad_norm": 0.39129558205604553, + "learning_rate": 4.938005782215362e-05, + "loss": 0.1025, + "num_input_tokens_seen": 21722304, + "step": 10050 + }, + { + "epoch": 1.6402936378466557, + "grad_norm": 0.3782632648944855, + "learning_rate": 4.93784815158186e-05, + "loss": 0.0897, + "num_input_tokens_seen": 21733152, + "step": 10055 + }, + { + "epoch": 1.6411092985318108, + "grad_norm": 0.5986658930778503, + "learning_rate": 4.937690323324017e-05, + "loss": 0.1457, + "num_input_tokens_seen": 21744160, + "step": 10060 + }, + { + "epoch": 1.6419249592169658, + "grad_norm": 0.7843287587165833, + "learning_rate": 4.9375322974546285e-05, + "loss": 0.0735, + "num_input_tokens_seen": 21754432, + "step": 10065 + }, + { + "epoch": 1.6427406199021206, + "grad_norm": 0.11766274273395538, + "learning_rate": 4.937374073986504e-05, + "loss": 0.0303, + "num_input_tokens_seen": 21764928, + "step": 10070 + }, + { + "epoch": 1.6435562805872757, + "grad_norm": 1.3721957206726074, + "learning_rate": 4.937215652932469e-05, + "loss": 0.2468, + "num_input_tokens_seen": 21775008, + "step": 10075 + }, + { + "epoch": 1.6443719412724307, + "grad_norm": 1.8610183000564575, + "learning_rate": 4.937057034305368e-05, + "loss": 0.1499, + "num_input_tokens_seen": 21785792, + "step": 10080 + }, + { + "epoch": 1.6451876019575855, + "grad_norm": 0.22046580910682678, + "learning_rate": 4.9368982181180576e-05, + "loss": 0.1417, + "num_input_tokens_seen": 21796704, + "step": 10085 + }, + { + "epoch": 1.6460032626427408, + "grad_norm": 0.0671902522444725, + "learning_rate": 4.936739204383413e-05, + "loss": 0.2904, + "num_input_tokens_seen": 21807840, + "step": 10090 + }, + { + "epoch": 1.6468189233278956, + "grad_norm": 1.2413488626480103, + "learning_rate": 4.936579993114324e-05, + "loss": 0.1713, + "num_input_tokens_seen": 21818528, + "step": 10095 + }, + { + "epoch": 1.6476345840130504, + "grad_norm": 0.02881753072142601, + "learning_rate": 4.936420584323699e-05, + "loss": 0.062, + "num_input_tokens_seen": 21829472, + "step": 10100 + }, + { + "epoch": 1.6484502446982057, + "grad_norm": 1.38197922706604, + "learning_rate": 4.936260978024458e-05, + "loss": 0.4128, + "num_input_tokens_seen": 21841024, + "step": 10105 + }, + { + "epoch": 1.6492659053833605, + "grad_norm": 0.6728406548500061, + "learning_rate": 4.936101174229541e-05, + "loss": 0.3775, + "num_input_tokens_seen": 21852096, + "step": 10110 + }, + { + "epoch": 1.6500815660685155, + "grad_norm": 1.3667972087860107, + "learning_rate": 4.935941172951902e-05, + "loss": 0.2138, + "num_input_tokens_seen": 21863904, + "step": 10115 + }, + { + "epoch": 1.6508972267536706, + "grad_norm": 0.6908622980117798, + "learning_rate": 4.9357809742045126e-05, + "loss": 0.1076, + "num_input_tokens_seen": 21874048, + "step": 10120 + }, + { + "epoch": 1.6517128874388254, + "grad_norm": 2.3066461086273193, + "learning_rate": 4.935620578000358e-05, + "loss": 0.1759, + "num_input_tokens_seen": 21885632, + "step": 10125 + }, + { + "epoch": 1.6525285481239804, + "grad_norm": 0.21415647864341736, + "learning_rate": 4.935459984352441e-05, + "loss": 0.2532, + "num_input_tokens_seen": 21896864, + "step": 10130 + }, + { + "epoch": 1.6533442088091355, + "grad_norm": 1.0546451807022095, + "learning_rate": 4.93529919327378e-05, + "loss": 0.1254, + "num_input_tokens_seen": 21907712, + "step": 10135 + }, + { + "epoch": 1.6541598694942903, + "grad_norm": 0.517832338809967, + "learning_rate": 4.9351382047774095e-05, + "loss": 0.179, + "num_input_tokens_seen": 21918464, + "step": 10140 + }, + { + "epoch": 1.6549755301794453, + "grad_norm": 0.15570271015167236, + "learning_rate": 4.934977018876381e-05, + "loss": 0.2078, + "num_input_tokens_seen": 21929664, + "step": 10145 + }, + { + "epoch": 1.6557911908646004, + "grad_norm": 1.1648714542388916, + "learning_rate": 4.93481563558376e-05, + "loss": 0.1464, + "num_input_tokens_seen": 21940288, + "step": 10150 + }, + { + "epoch": 1.6566068515497552, + "grad_norm": 1.036941409111023, + "learning_rate": 4.9346540549126305e-05, + "loss": 0.0646, + "num_input_tokens_seen": 21950464, + "step": 10155 + }, + { + "epoch": 1.6574225122349104, + "grad_norm": 1.05499267578125, + "learning_rate": 4.934492276876089e-05, + "loss": 0.1213, + "num_input_tokens_seen": 21961696, + "step": 10160 + }, + { + "epoch": 1.6582381729200653, + "grad_norm": 0.17494390904903412, + "learning_rate": 4.934330301487251e-05, + "loss": 0.2046, + "num_input_tokens_seen": 21972192, + "step": 10165 + }, + { + "epoch": 1.65905383360522, + "grad_norm": 0.17510263621807098, + "learning_rate": 4.934168128759248e-05, + "loss": 0.2103, + "num_input_tokens_seen": 21982112, + "step": 10170 + }, + { + "epoch": 1.6598694942903753, + "grad_norm": 1.8037638664245605, + "learning_rate": 4.9340057587052245e-05, + "loss": 0.1726, + "num_input_tokens_seen": 21993568, + "step": 10175 + }, + { + "epoch": 1.6606851549755302, + "grad_norm": 0.6196677088737488, + "learning_rate": 4.9338431913383444e-05, + "loss": 0.0954, + "num_input_tokens_seen": 22003296, + "step": 10180 + }, + { + "epoch": 1.6615008156606852, + "grad_norm": 0.0629924088716507, + "learning_rate": 4.9336804266717864e-05, + "loss": 0.0823, + "num_input_tokens_seen": 22013344, + "step": 10185 + }, + { + "epoch": 1.6623164763458402, + "grad_norm": 0.3144052028656006, + "learning_rate": 4.933517464718744e-05, + "loss": 0.2354, + "num_input_tokens_seen": 22022112, + "step": 10190 + }, + { + "epoch": 1.663132137030995, + "grad_norm": 0.17219026386737823, + "learning_rate": 4.933354305492429e-05, + "loss": 0.1326, + "num_input_tokens_seen": 22032704, + "step": 10195 + }, + { + "epoch": 1.66394779771615, + "grad_norm": 0.22085992991924286, + "learning_rate": 4.933190949006068e-05, + "loss": 0.121, + "num_input_tokens_seen": 22044320, + "step": 10200 + }, + { + "epoch": 1.6647634584013051, + "grad_norm": 0.08180168271064758, + "learning_rate": 4.933027395272901e-05, + "loss": 0.0158, + "num_input_tokens_seen": 22055008, + "step": 10205 + }, + { + "epoch": 1.66557911908646, + "grad_norm": 0.3453916311264038, + "learning_rate": 4.9328636443061894e-05, + "loss": 0.1634, + "num_input_tokens_seen": 22065920, + "step": 10210 + }, + { + "epoch": 1.666394779771615, + "grad_norm": 0.11126921325922012, + "learning_rate": 4.932699696119207e-05, + "loss": 0.0658, + "num_input_tokens_seen": 22077024, + "step": 10215 + }, + { + "epoch": 1.66721044045677, + "grad_norm": 1.3116127252578735, + "learning_rate": 4.932535550725243e-05, + "loss": 0.3547, + "num_input_tokens_seen": 22088320, + "step": 10220 + }, + { + "epoch": 1.6680261011419248, + "grad_norm": 0.2266954630613327, + "learning_rate": 4.932371208137605e-05, + "loss": 0.027, + "num_input_tokens_seen": 22098528, + "step": 10225 + }, + { + "epoch": 1.6688417618270799, + "grad_norm": 1.731363296508789, + "learning_rate": 4.932206668369615e-05, + "loss": 0.1775, + "num_input_tokens_seen": 22109792, + "step": 10230 + }, + { + "epoch": 1.669657422512235, + "grad_norm": 0.2706277370452881, + "learning_rate": 4.932041931434611e-05, + "loss": 0.3014, + "num_input_tokens_seen": 22120064, + "step": 10235 + }, + { + "epoch": 1.6704730831973897, + "grad_norm": 0.09294142574071884, + "learning_rate": 4.931876997345949e-05, + "loss": 0.084, + "num_input_tokens_seen": 22131200, + "step": 10240 + }, + { + "epoch": 1.671288743882545, + "grad_norm": 0.16164569556713104, + "learning_rate": 4.931711866116998e-05, + "loss": 0.1494, + "num_input_tokens_seen": 22142880, + "step": 10245 + }, + { + "epoch": 1.6721044045676998, + "grad_norm": 0.08114420622587204, + "learning_rate": 4.9315465377611445e-05, + "loss": 0.2449, + "num_input_tokens_seen": 22151744, + "step": 10250 + }, + { + "epoch": 1.6729200652528549, + "grad_norm": 1.4241777658462524, + "learning_rate": 4.9313810122917914e-05, + "loss": 0.1629, + "num_input_tokens_seen": 22162432, + "step": 10255 + }, + { + "epoch": 1.67373572593801, + "grad_norm": 0.6075181365013123, + "learning_rate": 4.931215289722357e-05, + "loss": 0.0705, + "num_input_tokens_seen": 22174432, + "step": 10260 + }, + { + "epoch": 1.6745513866231647, + "grad_norm": 1.6232726573944092, + "learning_rate": 4.931049370066275e-05, + "loss": 0.2168, + "num_input_tokens_seen": 22185184, + "step": 10265 + }, + { + "epoch": 1.6753670473083198, + "grad_norm": 0.2952570617198944, + "learning_rate": 4.930883253336996e-05, + "loss": 0.1121, + "num_input_tokens_seen": 22196480, + "step": 10270 + }, + { + "epoch": 1.6761827079934748, + "grad_norm": 0.1785585731267929, + "learning_rate": 4.930716939547986e-05, + "loss": 0.1351, + "num_input_tokens_seen": 22207392, + "step": 10275 + }, + { + "epoch": 1.6769983686786296, + "grad_norm": 0.8336212038993835, + "learning_rate": 4.930550428712728e-05, + "loss": 0.1412, + "num_input_tokens_seen": 22218880, + "step": 10280 + }, + { + "epoch": 1.6778140293637847, + "grad_norm": 0.08800079673528671, + "learning_rate": 4.93038372084472e-05, + "loss": 0.2175, + "num_input_tokens_seen": 22229184, + "step": 10285 + }, + { + "epoch": 1.6786296900489397, + "grad_norm": 1.9098169803619385, + "learning_rate": 4.9302168159574756e-05, + "loss": 0.1689, + "num_input_tokens_seen": 22240096, + "step": 10290 + }, + { + "epoch": 1.6794453507340945, + "grad_norm": 0.13753287494182587, + "learning_rate": 4.930049714064525e-05, + "loss": 0.3052, + "num_input_tokens_seen": 22251616, + "step": 10295 + }, + { + "epoch": 1.6802610114192496, + "grad_norm": 0.432573139667511, + "learning_rate": 4.9298824151794154e-05, + "loss": 0.1389, + "num_input_tokens_seen": 22262464, + "step": 10300 + }, + { + "epoch": 1.6810766721044046, + "grad_norm": 0.7999908924102783, + "learning_rate": 4.9297149193157075e-05, + "loss": 0.1628, + "num_input_tokens_seen": 22272640, + "step": 10305 + }, + { + "epoch": 1.6818923327895594, + "grad_norm": 1.4646841287612915, + "learning_rate": 4.9295472264869804e-05, + "loss": 0.1422, + "num_input_tokens_seen": 22283264, + "step": 10310 + }, + { + "epoch": 1.6827079934747147, + "grad_norm": 0.14251692593097687, + "learning_rate": 4.929379336706827e-05, + "loss": 0.0654, + "num_input_tokens_seen": 22294112, + "step": 10315 + }, + { + "epoch": 1.6835236541598695, + "grad_norm": 0.1267152726650238, + "learning_rate": 4.9292112499888584e-05, + "loss": 0.2165, + "num_input_tokens_seen": 22305184, + "step": 10320 + }, + { + "epoch": 1.6843393148450243, + "grad_norm": 0.6478993892669678, + "learning_rate": 4.929042966346701e-05, + "loss": 0.148, + "num_input_tokens_seen": 22315840, + "step": 10325 + }, + { + "epoch": 1.6851549755301796, + "grad_norm": 1.3357243537902832, + "learning_rate": 4.928874485793995e-05, + "loss": 0.1624, + "num_input_tokens_seen": 22325728, + "step": 10330 + }, + { + "epoch": 1.6859706362153344, + "grad_norm": 0.8223444819450378, + "learning_rate": 4.928705808344399e-05, + "loss": 0.1977, + "num_input_tokens_seen": 22335904, + "step": 10335 + }, + { + "epoch": 1.6867862969004894, + "grad_norm": 0.3010203540325165, + "learning_rate": 4.928536934011587e-05, + "loss": 0.2011, + "num_input_tokens_seen": 22345184, + "step": 10340 + }, + { + "epoch": 1.6876019575856445, + "grad_norm": 0.9725590348243713, + "learning_rate": 4.92836786280925e-05, + "loss": 0.1623, + "num_input_tokens_seen": 22356736, + "step": 10345 + }, + { + "epoch": 1.6884176182707993, + "grad_norm": 0.8771800398826599, + "learning_rate": 4.9281985947510915e-05, + "loss": 0.1583, + "num_input_tokens_seen": 22368704, + "step": 10350 + }, + { + "epoch": 1.6892332789559543, + "grad_norm": 0.4385643005371094, + "learning_rate": 4.9280291298508355e-05, + "loss": 0.2261, + "num_input_tokens_seen": 22379904, + "step": 10355 + }, + { + "epoch": 1.6900489396411094, + "grad_norm": 1.6599299907684326, + "learning_rate": 4.927859468122217e-05, + "loss": 0.1981, + "num_input_tokens_seen": 22391776, + "step": 10360 + }, + { + "epoch": 1.6908646003262642, + "grad_norm": 0.9857207536697388, + "learning_rate": 4.9276896095789924e-05, + "loss": 0.1187, + "num_input_tokens_seen": 22401792, + "step": 10365 + }, + { + "epoch": 1.6916802610114192, + "grad_norm": 0.6576722264289856, + "learning_rate": 4.927519554234929e-05, + "loss": 0.0858, + "num_input_tokens_seen": 22411968, + "step": 10370 + }, + { + "epoch": 1.6924959216965743, + "grad_norm": 0.495442271232605, + "learning_rate": 4.9273493021038146e-05, + "loss": 0.1421, + "num_input_tokens_seen": 22424000, + "step": 10375 + }, + { + "epoch": 1.693311582381729, + "grad_norm": 0.7892906069755554, + "learning_rate": 4.927178853199449e-05, + "loss": 0.1024, + "num_input_tokens_seen": 22434048, + "step": 10380 + }, + { + "epoch": 1.6941272430668843, + "grad_norm": 0.11855455487966537, + "learning_rate": 4.927008207535651e-05, + "loss": 0.0916, + "num_input_tokens_seen": 22446176, + "step": 10385 + }, + { + "epoch": 1.6949429037520392, + "grad_norm": 0.31140080094337463, + "learning_rate": 4.9268373651262515e-05, + "loss": 0.0597, + "num_input_tokens_seen": 22457600, + "step": 10390 + }, + { + "epoch": 1.695758564437194, + "grad_norm": 0.820087194442749, + "learning_rate": 4.9266663259851025e-05, + "loss": 0.1224, + "num_input_tokens_seen": 22468768, + "step": 10395 + }, + { + "epoch": 1.6965742251223492, + "grad_norm": 1.4946815967559814, + "learning_rate": 4.926495090126068e-05, + "loss": 0.235, + "num_input_tokens_seen": 22480448, + "step": 10400 + }, + { + "epoch": 1.697389885807504, + "grad_norm": 0.8153831362724304, + "learning_rate": 4.92632365756303e-05, + "loss": 0.1046, + "num_input_tokens_seen": 22490880, + "step": 10405 + }, + { + "epoch": 1.698205546492659, + "grad_norm": 0.1213165670633316, + "learning_rate": 4.926152028309885e-05, + "loss": 0.3573, + "num_input_tokens_seen": 22500704, + "step": 10410 + }, + { + "epoch": 1.6990212071778141, + "grad_norm": 0.19575339555740356, + "learning_rate": 4.9259802023805466e-05, + "loss": 0.0701, + "num_input_tokens_seen": 22511040, + "step": 10415 + }, + { + "epoch": 1.699836867862969, + "grad_norm": 0.2000800222158432, + "learning_rate": 4.9258081797889434e-05, + "loss": 0.1138, + "num_input_tokens_seen": 22521664, + "step": 10420 + }, + { + "epoch": 1.700652528548124, + "grad_norm": 0.36520588397979736, + "learning_rate": 4.925635960549021e-05, + "loss": 0.1666, + "num_input_tokens_seen": 22532192, + "step": 10425 + }, + { + "epoch": 1.701468189233279, + "grad_norm": 0.27685093879699707, + "learning_rate": 4.92546354467474e-05, + "loss": 0.1684, + "num_input_tokens_seen": 22543712, + "step": 10430 + }, + { + "epoch": 1.7022838499184338, + "grad_norm": 0.45899200439453125, + "learning_rate": 4.9252909321800775e-05, + "loss": 0.1718, + "num_input_tokens_seen": 22555232, + "step": 10435 + }, + { + "epoch": 1.7030995106035889, + "grad_norm": 0.6299252510070801, + "learning_rate": 4.925118123079026e-05, + "loss": 0.0539, + "num_input_tokens_seen": 22566720, + "step": 10440 + }, + { + "epoch": 1.703915171288744, + "grad_norm": 0.5591800808906555, + "learning_rate": 4.924945117385594e-05, + "loss": 0.0828, + "num_input_tokens_seen": 22577376, + "step": 10445 + }, + { + "epoch": 1.7047308319738987, + "grad_norm": 1.9399120807647705, + "learning_rate": 4.9247719151138086e-05, + "loss": 0.2485, + "num_input_tokens_seen": 22589568, + "step": 10450 + }, + { + "epoch": 1.7055464926590538, + "grad_norm": 0.23141559958457947, + "learning_rate": 4.924598516277707e-05, + "loss": 0.1502, + "num_input_tokens_seen": 22600672, + "step": 10455 + }, + { + "epoch": 1.7063621533442088, + "grad_norm": 0.35681289434432983, + "learning_rate": 4.924424920891347e-05, + "loss": 0.2757, + "num_input_tokens_seen": 22611200, + "step": 10460 + }, + { + "epoch": 1.7071778140293636, + "grad_norm": 0.2804974317550659, + "learning_rate": 4.9242511289688024e-05, + "loss": 0.1118, + "num_input_tokens_seen": 22620608, + "step": 10465 + }, + { + "epoch": 1.707993474714519, + "grad_norm": 0.053446583449840546, + "learning_rate": 4.924077140524161e-05, + "loss": 0.1974, + "num_input_tokens_seen": 22631168, + "step": 10470 + }, + { + "epoch": 1.7088091353996737, + "grad_norm": 0.3095388114452362, + "learning_rate": 4.9239029555715264e-05, + "loss": 0.2151, + "num_input_tokens_seen": 22640640, + "step": 10475 + }, + { + "epoch": 1.7096247960848288, + "grad_norm": 0.5745346546173096, + "learning_rate": 4.92372857412502e-05, + "loss": 0.1654, + "num_input_tokens_seen": 22650976, + "step": 10480 + }, + { + "epoch": 1.7104404567699838, + "grad_norm": 0.922829806804657, + "learning_rate": 4.9235539961987766e-05, + "loss": 0.3304, + "num_input_tokens_seen": 22661472, + "step": 10485 + }, + { + "epoch": 1.7112561174551386, + "grad_norm": 0.6742053031921387, + "learning_rate": 4.9233792218069494e-05, + "loss": 0.1689, + "num_input_tokens_seen": 22673024, + "step": 10490 + }, + { + "epoch": 1.7120717781402937, + "grad_norm": 0.11383309960365295, + "learning_rate": 4.923204250963707e-05, + "loss": 0.1549, + "num_input_tokens_seen": 22684416, + "step": 10495 + }, + { + "epoch": 1.7128874388254487, + "grad_norm": 0.20021244883537292, + "learning_rate": 4.923029083683233e-05, + "loss": 0.0821, + "num_input_tokens_seen": 22695136, + "step": 10500 + }, + { + "epoch": 1.7137030995106035, + "grad_norm": 0.28709983825683594, + "learning_rate": 4.9228537199797263e-05, + "loss": 0.1746, + "num_input_tokens_seen": 22705408, + "step": 10505 + }, + { + "epoch": 1.7145187601957586, + "grad_norm": 1.1379327774047852, + "learning_rate": 4.9226781598674047e-05, + "loss": 0.1094, + "num_input_tokens_seen": 22716000, + "step": 10510 + }, + { + "epoch": 1.7153344208809136, + "grad_norm": 1.510632872581482, + "learning_rate": 4.922502403360498e-05, + "loss": 0.2379, + "num_input_tokens_seen": 22727936, + "step": 10515 + }, + { + "epoch": 1.7161500815660684, + "grad_norm": 0.9037133455276489, + "learning_rate": 4.922326450473255e-05, + "loss": 0.2981, + "num_input_tokens_seen": 22737632, + "step": 10520 + }, + { + "epoch": 1.7169657422512234, + "grad_norm": 2.1707892417907715, + "learning_rate": 4.9221503012199386e-05, + "loss": 0.2136, + "num_input_tokens_seen": 22748288, + "step": 10525 + }, + { + "epoch": 1.7177814029363785, + "grad_norm": 1.1838117837905884, + "learning_rate": 4.92197395561483e-05, + "loss": 0.2154, + "num_input_tokens_seen": 22760288, + "step": 10530 + }, + { + "epoch": 1.7185970636215333, + "grad_norm": 0.379041850566864, + "learning_rate": 4.9217974136722235e-05, + "loss": 0.1557, + "num_input_tokens_seen": 22772032, + "step": 10535 + }, + { + "epoch": 1.7194127243066886, + "grad_norm": 0.47969967126846313, + "learning_rate": 4.92162067540643e-05, + "loss": 0.1455, + "num_input_tokens_seen": 22783392, + "step": 10540 + }, + { + "epoch": 1.7202283849918434, + "grad_norm": 0.09209180623292923, + "learning_rate": 4.921443740831778e-05, + "loss": 0.1246, + "num_input_tokens_seen": 22795808, + "step": 10545 + }, + { + "epoch": 1.7210440456769984, + "grad_norm": 0.39106622338294983, + "learning_rate": 4.9212666099626095e-05, + "loss": 0.2615, + "num_input_tokens_seen": 22806400, + "step": 10550 + }, + { + "epoch": 1.7218597063621535, + "grad_norm": 0.9700261950492859, + "learning_rate": 4.9210892828132835e-05, + "loss": 0.1532, + "num_input_tokens_seen": 22816256, + "step": 10555 + }, + { + "epoch": 1.7226753670473083, + "grad_norm": 0.5284584164619446, + "learning_rate": 4.920911759398177e-05, + "loss": 0.051, + "num_input_tokens_seen": 22827200, + "step": 10560 + }, + { + "epoch": 1.7234910277324633, + "grad_norm": 0.5419508814811707, + "learning_rate": 4.920734039731679e-05, + "loss": 0.1243, + "num_input_tokens_seen": 22837248, + "step": 10565 + }, + { + "epoch": 1.7243066884176184, + "grad_norm": 0.6644659638404846, + "learning_rate": 4.9205561238281985e-05, + "loss": 0.0788, + "num_input_tokens_seen": 22846048, + "step": 10570 + }, + { + "epoch": 1.7251223491027732, + "grad_norm": 1.179039478302002, + "learning_rate": 4.920378011702155e-05, + "loss": 0.165, + "num_input_tokens_seen": 22856672, + "step": 10575 + }, + { + "epoch": 1.7259380097879282, + "grad_norm": 1.3489512205123901, + "learning_rate": 4.92019970336799e-05, + "loss": 0.1481, + "num_input_tokens_seen": 22868224, + "step": 10580 + }, + { + "epoch": 1.7267536704730833, + "grad_norm": 1.0470668077468872, + "learning_rate": 4.920021198840157e-05, + "loss": 0.3194, + "num_input_tokens_seen": 22879584, + "step": 10585 + }, + { + "epoch": 1.727569331158238, + "grad_norm": 0.09701337665319443, + "learning_rate": 4.919842498133126e-05, + "loss": 0.0464, + "num_input_tokens_seen": 22889696, + "step": 10590 + }, + { + "epoch": 1.7283849918433931, + "grad_norm": 0.6184234023094177, + "learning_rate": 4.919663601261384e-05, + "loss": 0.1005, + "num_input_tokens_seen": 22900224, + "step": 10595 + }, + { + "epoch": 1.7292006525285482, + "grad_norm": 1.0333664417266846, + "learning_rate": 4.919484508239434e-05, + "loss": 0.246, + "num_input_tokens_seen": 22911136, + "step": 10600 + }, + { + "epoch": 1.730016313213703, + "grad_norm": 0.3574853241443634, + "learning_rate": 4.9193052190817926e-05, + "loss": 0.2171, + "num_input_tokens_seen": 22922400, + "step": 10605 + }, + { + "epoch": 1.7308319738988582, + "grad_norm": 0.49208998680114746, + "learning_rate": 4.919125733802995e-05, + "loss": 0.1073, + "num_input_tokens_seen": 22933664, + "step": 10610 + }, + { + "epoch": 1.731647634584013, + "grad_norm": 0.7305833697319031, + "learning_rate": 4.9189460524175915e-05, + "loss": 0.1909, + "num_input_tokens_seen": 22943104, + "step": 10615 + }, + { + "epoch": 1.7324632952691679, + "grad_norm": 1.228419542312622, + "learning_rate": 4.918766174940146e-05, + "loss": 0.1749, + "num_input_tokens_seen": 22953344, + "step": 10620 + }, + { + "epoch": 1.7332789559543231, + "grad_norm": 0.548836886882782, + "learning_rate": 4.918586101385243e-05, + "loss": 0.1522, + "num_input_tokens_seen": 22964928, + "step": 10625 + }, + { + "epoch": 1.734094616639478, + "grad_norm": 0.39504456520080566, + "learning_rate": 4.918405831767478e-05, + "loss": 0.2018, + "num_input_tokens_seen": 22977152, + "step": 10630 + }, + { + "epoch": 1.734910277324633, + "grad_norm": 1.330342411994934, + "learning_rate": 4.9182253661014656e-05, + "loss": 0.1515, + "num_input_tokens_seen": 22988448, + "step": 10635 + }, + { + "epoch": 1.735725938009788, + "grad_norm": 0.11297988891601562, + "learning_rate": 4.9180447044018354e-05, + "loss": 0.1094, + "num_input_tokens_seen": 22998592, + "step": 10640 + }, + { + "epoch": 1.7365415986949428, + "grad_norm": 0.4581161439418793, + "learning_rate": 4.917863846683232e-05, + "loss": 0.108, + "num_input_tokens_seen": 23009344, + "step": 10645 + }, + { + "epoch": 1.7373572593800979, + "grad_norm": 0.7921245694160461, + "learning_rate": 4.9176827929603176e-05, + "loss": 0.2636, + "num_input_tokens_seen": 23019584, + "step": 10650 + }, + { + "epoch": 1.738172920065253, + "grad_norm": 1.5789525508880615, + "learning_rate": 4.9175015432477686e-05, + "loss": 0.2173, + "num_input_tokens_seen": 23030336, + "step": 10655 + }, + { + "epoch": 1.7389885807504077, + "grad_norm": 0.37940162420272827, + "learning_rate": 4.9173200975602776e-05, + "loss": 0.1923, + "num_input_tokens_seen": 23041760, + "step": 10660 + }, + { + "epoch": 1.7398042414355628, + "grad_norm": 1.8595452308654785, + "learning_rate": 4.917138455912555e-05, + "loss": 0.2132, + "num_input_tokens_seen": 23051840, + "step": 10665 + }, + { + "epoch": 1.7406199021207178, + "grad_norm": 1.222503900527954, + "learning_rate": 4.916956618319324e-05, + "loss": 0.0679, + "num_input_tokens_seen": 23061632, + "step": 10670 + }, + { + "epoch": 1.7414355628058726, + "grad_norm": 0.919315755367279, + "learning_rate": 4.916774584795327e-05, + "loss": 0.1863, + "num_input_tokens_seen": 23071520, + "step": 10675 + }, + { + "epoch": 1.7422512234910277, + "grad_norm": 0.23044638335704803, + "learning_rate": 4.916592355355318e-05, + "loss": 0.2763, + "num_input_tokens_seen": 23082304, + "step": 10680 + }, + { + "epoch": 1.7430668841761827, + "grad_norm": 1.9861356019973755, + "learning_rate": 4.916409930014073e-05, + "loss": 0.2502, + "num_input_tokens_seen": 23092416, + "step": 10685 + }, + { + "epoch": 1.7438825448613375, + "grad_norm": 2.1736693382263184, + "learning_rate": 4.916227308786377e-05, + "loss": 0.2047, + "num_input_tokens_seen": 23102720, + "step": 10690 + }, + { + "epoch": 1.7446982055464928, + "grad_norm": 0.4964856505393982, + "learning_rate": 4.916044491687036e-05, + "loss": 0.0832, + "num_input_tokens_seen": 23114016, + "step": 10695 + }, + { + "epoch": 1.7455138662316476, + "grad_norm": 0.41666483879089355, + "learning_rate": 4.915861478730869e-05, + "loss": 0.0744, + "num_input_tokens_seen": 23125888, + "step": 10700 + }, + { + "epoch": 1.7463295269168027, + "grad_norm": 0.8853898048400879, + "learning_rate": 4.915678269932713e-05, + "loss": 0.0799, + "num_input_tokens_seen": 23137248, + "step": 10705 + }, + { + "epoch": 1.7471451876019577, + "grad_norm": 0.40365004539489746, + "learning_rate": 4.91549486530742e-05, + "loss": 0.1813, + "num_input_tokens_seen": 23148064, + "step": 10710 + }, + { + "epoch": 1.7479608482871125, + "grad_norm": 0.7827680706977844, + "learning_rate": 4.9153112648698565e-05, + "loss": 0.0257, + "num_input_tokens_seen": 23158208, + "step": 10715 + }, + { + "epoch": 1.7487765089722676, + "grad_norm": 1.1238071918487549, + "learning_rate": 4.915127468634906e-05, + "loss": 0.1526, + "num_input_tokens_seen": 23170144, + "step": 10720 + }, + { + "epoch": 1.7495921696574226, + "grad_norm": 0.22247803211212158, + "learning_rate": 4.9149434766174695e-05, + "loss": 0.2003, + "num_input_tokens_seen": 23180768, + "step": 10725 + }, + { + "epoch": 1.7504078303425774, + "grad_norm": 1.6841139793395996, + "learning_rate": 4.914759288832462e-05, + "loss": 0.1683, + "num_input_tokens_seen": 23191840, + "step": 10730 + }, + { + "epoch": 1.7512234910277324, + "grad_norm": 0.41562068462371826, + "learning_rate": 4.914574905294813e-05, + "loss": 0.151, + "num_input_tokens_seen": 23201760, + "step": 10735 + }, + { + "epoch": 1.7520391517128875, + "grad_norm": 0.18658669292926788, + "learning_rate": 4.9143903260194715e-05, + "loss": 0.1312, + "num_input_tokens_seen": 23213024, + "step": 10740 + }, + { + "epoch": 1.7528548123980423, + "grad_norm": 0.27012789249420166, + "learning_rate": 4.914205551021399e-05, + "loss": 0.1377, + "num_input_tokens_seen": 23224512, + "step": 10745 + }, + { + "epoch": 1.7536704730831973, + "grad_norm": 0.3188875615596771, + "learning_rate": 4.914020580315576e-05, + "loss": 0.0977, + "num_input_tokens_seen": 23235040, + "step": 10750 + }, + { + "epoch": 1.7544861337683524, + "grad_norm": 0.29813435673713684, + "learning_rate": 4.913835413916996e-05, + "loss": 0.1532, + "num_input_tokens_seen": 23245056, + "step": 10755 + }, + { + "epoch": 1.7553017944535072, + "grad_norm": 0.5771848559379578, + "learning_rate": 4.9136500518406694e-05, + "loss": 0.1436, + "num_input_tokens_seen": 23257824, + "step": 10760 + }, + { + "epoch": 1.7561174551386625, + "grad_norm": 0.7467983365058899, + "learning_rate": 4.913464494101622e-05, + "loss": 0.1541, + "num_input_tokens_seen": 23269376, + "step": 10765 + }, + { + "epoch": 1.7569331158238173, + "grad_norm": 1.8137843608856201, + "learning_rate": 4.913278740714898e-05, + "loss": 0.1262, + "num_input_tokens_seen": 23282560, + "step": 10770 + }, + { + "epoch": 1.7577487765089723, + "grad_norm": 0.5314048528671265, + "learning_rate": 4.913092791695554e-05, + "loss": 0.0951, + "num_input_tokens_seen": 23292896, + "step": 10775 + }, + { + "epoch": 1.7585644371941274, + "grad_norm": 0.42371276021003723, + "learning_rate": 4.912906647058664e-05, + "loss": 0.0372, + "num_input_tokens_seen": 23303168, + "step": 10780 + }, + { + "epoch": 1.7593800978792822, + "grad_norm": 0.2400047928094864, + "learning_rate": 4.912720306819319e-05, + "loss": 0.1845, + "num_input_tokens_seen": 23314784, + "step": 10785 + }, + { + "epoch": 1.7601957585644372, + "grad_norm": 0.8695508241653442, + "learning_rate": 4.9125337709926235e-05, + "loss": 0.1433, + "num_input_tokens_seen": 23325696, + "step": 10790 + }, + { + "epoch": 1.7610114192495923, + "grad_norm": 1.5511823892593384, + "learning_rate": 4.9123470395937e-05, + "loss": 0.233, + "num_input_tokens_seen": 23337120, + "step": 10795 + }, + { + "epoch": 1.761827079934747, + "grad_norm": 1.1077746152877808, + "learning_rate": 4.9121601126376845e-05, + "loss": 0.1725, + "num_input_tokens_seen": 23347584, + "step": 10800 + }, + { + "epoch": 1.7626427406199021, + "grad_norm": 1.2157357931137085, + "learning_rate": 4.9119729901397313e-05, + "loss": 0.2838, + "num_input_tokens_seen": 23357504, + "step": 10805 + }, + { + "epoch": 1.7634584013050572, + "grad_norm": 0.2532455325126648, + "learning_rate": 4.9117856721150095e-05, + "loss": 0.1722, + "num_input_tokens_seen": 23367456, + "step": 10810 + }, + { + "epoch": 1.764274061990212, + "grad_norm": 0.16372384130954742, + "learning_rate": 4.911598158578704e-05, + "loss": 0.1232, + "num_input_tokens_seen": 23377984, + "step": 10815 + }, + { + "epoch": 1.765089722675367, + "grad_norm": 0.38018280267715454, + "learning_rate": 4.9114104495460154e-05, + "loss": 0.2359, + "num_input_tokens_seen": 23390272, + "step": 10820 + }, + { + "epoch": 1.765905383360522, + "grad_norm": 0.1829068958759308, + "learning_rate": 4.9112225450321606e-05, + "loss": 0.037, + "num_input_tokens_seen": 23400416, + "step": 10825 + }, + { + "epoch": 1.7667210440456769, + "grad_norm": 1.073758840560913, + "learning_rate": 4.911034445052371e-05, + "loss": 0.1488, + "num_input_tokens_seen": 23412704, + "step": 10830 + }, + { + "epoch": 1.7675367047308321, + "grad_norm": 0.23486199975013733, + "learning_rate": 4.910846149621896e-05, + "loss": 0.1556, + "num_input_tokens_seen": 23421952, + "step": 10835 + }, + { + "epoch": 1.768352365415987, + "grad_norm": 0.11121225357055664, + "learning_rate": 4.9106576587560006e-05, + "loss": 0.12, + "num_input_tokens_seen": 23431136, + "step": 10840 + }, + { + "epoch": 1.7691680261011418, + "grad_norm": 0.22460253536701202, + "learning_rate": 4.9104689724699625e-05, + "loss": 0.1465, + "num_input_tokens_seen": 23442880, + "step": 10845 + }, + { + "epoch": 1.769983686786297, + "grad_norm": 0.8907060027122498, + "learning_rate": 4.91028009077908e-05, + "loss": 0.1782, + "num_input_tokens_seen": 23453120, + "step": 10850 + }, + { + "epoch": 1.7707993474714518, + "grad_norm": 0.6183016300201416, + "learning_rate": 4.910091013698663e-05, + "loss": 0.2306, + "num_input_tokens_seen": 23464192, + "step": 10855 + }, + { + "epoch": 1.7716150081566069, + "grad_norm": 0.7102295160293579, + "learning_rate": 4.90990174124404e-05, + "loss": 0.2203, + "num_input_tokens_seen": 23475264, + "step": 10860 + }, + { + "epoch": 1.772430668841762, + "grad_norm": 0.5815507769584656, + "learning_rate": 4.909712273430554e-05, + "loss": 0.1244, + "num_input_tokens_seen": 23485440, + "step": 10865 + }, + { + "epoch": 1.7732463295269167, + "grad_norm": 0.4833236336708069, + "learning_rate": 4.9095226102735645e-05, + "loss": 0.1119, + "num_input_tokens_seen": 23496768, + "step": 10870 + }, + { + "epoch": 1.7740619902120718, + "grad_norm": 0.0349065437912941, + "learning_rate": 4.909332751788447e-05, + "loss": 0.1553, + "num_input_tokens_seen": 23507328, + "step": 10875 + }, + { + "epoch": 1.7748776508972268, + "grad_norm": 0.49040454626083374, + "learning_rate": 4.909142697990591e-05, + "loss": 0.1055, + "num_input_tokens_seen": 23518976, + "step": 10880 + }, + { + "epoch": 1.7756933115823816, + "grad_norm": 0.16327893733978271, + "learning_rate": 4.908952448895404e-05, + "loss": 0.1443, + "num_input_tokens_seen": 23530720, + "step": 10885 + }, + { + "epoch": 1.7765089722675367, + "grad_norm": 0.5573715567588806, + "learning_rate": 4.908762004518309e-05, + "loss": 0.108, + "num_input_tokens_seen": 23541312, + "step": 10890 + }, + { + "epoch": 1.7773246329526917, + "grad_norm": 0.3981970548629761, + "learning_rate": 4.908571364874743e-05, + "loss": 0.1453, + "num_input_tokens_seen": 23552064, + "step": 10895 + }, + { + "epoch": 1.7781402936378465, + "grad_norm": 0.322843074798584, + "learning_rate": 4.9083805299801626e-05, + "loss": 0.1604, + "num_input_tokens_seen": 23563104, + "step": 10900 + }, + { + "epoch": 1.7789559543230016, + "grad_norm": 0.6862454414367676, + "learning_rate": 4.908189499850036e-05, + "loss": 0.1095, + "num_input_tokens_seen": 23572992, + "step": 10905 + }, + { + "epoch": 1.7797716150081566, + "grad_norm": 1.444200873374939, + "learning_rate": 4.907998274499849e-05, + "loss": 0.2169, + "num_input_tokens_seen": 23583328, + "step": 10910 + }, + { + "epoch": 1.7805872756933114, + "grad_norm": 0.339087575674057, + "learning_rate": 4.9078068539451045e-05, + "loss": 0.097, + "num_input_tokens_seen": 23593888, + "step": 10915 + }, + { + "epoch": 1.7814029363784667, + "grad_norm": 1.4434795379638672, + "learning_rate": 4.907615238201319e-05, + "loss": 0.158, + "num_input_tokens_seen": 23604544, + "step": 10920 + }, + { + "epoch": 1.7822185970636215, + "grad_norm": 0.3660069704055786, + "learning_rate": 4.907423427284026e-05, + "loss": 0.2129, + "num_input_tokens_seen": 23615584, + "step": 10925 + }, + { + "epoch": 1.7830342577487766, + "grad_norm": 0.06385023146867752, + "learning_rate": 4.907231421208775e-05, + "loss": 0.0904, + "num_input_tokens_seen": 23625568, + "step": 10930 + }, + { + "epoch": 1.7838499184339316, + "grad_norm": 0.08754198253154755, + "learning_rate": 4.907039219991131e-05, + "loss": 0.0462, + "num_input_tokens_seen": 23636320, + "step": 10935 + }, + { + "epoch": 1.7846655791190864, + "grad_norm": 0.5627201795578003, + "learning_rate": 4.906846823646675e-05, + "loss": 0.2143, + "num_input_tokens_seen": 23646912, + "step": 10940 + }, + { + "epoch": 1.7854812398042414, + "grad_norm": 0.12414195388555527, + "learning_rate": 4.906654232191002e-05, + "loss": 0.1287, + "num_input_tokens_seen": 23657120, + "step": 10945 + }, + { + "epoch": 1.7862969004893965, + "grad_norm": 0.9161733984947205, + "learning_rate": 4.906461445639726e-05, + "loss": 0.2149, + "num_input_tokens_seen": 23667712, + "step": 10950 + }, + { + "epoch": 1.7871125611745513, + "grad_norm": 0.23685620725154877, + "learning_rate": 4.906268464008476e-05, + "loss": 0.1095, + "num_input_tokens_seen": 23678720, + "step": 10955 + }, + { + "epoch": 1.7879282218597063, + "grad_norm": 0.6284694671630859, + "learning_rate": 4.9060752873128946e-05, + "loss": 0.0687, + "num_input_tokens_seen": 23688160, + "step": 10960 + }, + { + "epoch": 1.7887438825448614, + "grad_norm": 1.4317481517791748, + "learning_rate": 4.905881915568642e-05, + "loss": 0.2153, + "num_input_tokens_seen": 23699680, + "step": 10965 + }, + { + "epoch": 1.7895595432300162, + "grad_norm": 0.9173634052276611, + "learning_rate": 4.905688348791394e-05, + "loss": 0.1222, + "num_input_tokens_seen": 23710880, + "step": 10970 + }, + { + "epoch": 1.7903752039151712, + "grad_norm": 0.47638052701950073, + "learning_rate": 4.905494586996842e-05, + "loss": 0.1092, + "num_input_tokens_seen": 23722304, + "step": 10975 + }, + { + "epoch": 1.7911908646003263, + "grad_norm": 1.5935089588165283, + "learning_rate": 4.905300630200693e-05, + "loss": 0.1977, + "num_input_tokens_seen": 23734048, + "step": 10980 + }, + { + "epoch": 1.792006525285481, + "grad_norm": 0.05882372334599495, + "learning_rate": 4.9051064784186704e-05, + "loss": 0.1129, + "num_input_tokens_seen": 23745600, + "step": 10985 + }, + { + "epoch": 1.7928221859706364, + "grad_norm": 0.29162392020225525, + "learning_rate": 4.9049121316665146e-05, + "loss": 0.0552, + "num_input_tokens_seen": 23756800, + "step": 10990 + }, + { + "epoch": 1.7936378466557912, + "grad_norm": 0.7141324877738953, + "learning_rate": 4.904717589959978e-05, + "loss": 0.2243, + "num_input_tokens_seen": 23766880, + "step": 10995 + }, + { + "epoch": 1.7944535073409462, + "grad_norm": 0.08671049028635025, + "learning_rate": 4.904522853314833e-05, + "loss": 0.1478, + "num_input_tokens_seen": 23777472, + "step": 11000 + }, + { + "epoch": 1.7952691680261013, + "grad_norm": 0.7424882054328918, + "learning_rate": 4.904327921746864e-05, + "loss": 0.0872, + "num_input_tokens_seen": 23786336, + "step": 11005 + }, + { + "epoch": 1.796084828711256, + "grad_norm": 0.6772093772888184, + "learning_rate": 4.904132795271875e-05, + "loss": 0.1564, + "num_input_tokens_seen": 23797184, + "step": 11010 + }, + { + "epoch": 1.7969004893964111, + "grad_norm": 0.16423743963241577, + "learning_rate": 4.9039374739056825e-05, + "loss": 0.1133, + "num_input_tokens_seen": 23808288, + "step": 11015 + }, + { + "epoch": 1.7977161500815662, + "grad_norm": 1.107019305229187, + "learning_rate": 4.903741957664121e-05, + "loss": 0.2249, + "num_input_tokens_seen": 23818176, + "step": 11020 + }, + { + "epoch": 1.798531810766721, + "grad_norm": 1.168736457824707, + "learning_rate": 4.903546246563041e-05, + "loss": 0.1547, + "num_input_tokens_seen": 23829344, + "step": 11025 + }, + { + "epoch": 1.799347471451876, + "grad_norm": 1.4475007057189941, + "learning_rate": 4.9033503406183055e-05, + "loss": 0.1423, + "num_input_tokens_seen": 23839936, + "step": 11030 + }, + { + "epoch": 1.800163132137031, + "grad_norm": 1.1656544208526611, + "learning_rate": 4.9031542398457974e-05, + "loss": 0.1082, + "num_input_tokens_seen": 23852736, + "step": 11035 + }, + { + "epoch": 1.8009787928221859, + "grad_norm": 0.2272362858057022, + "learning_rate": 4.902957944261413e-05, + "loss": 0.1568, + "num_input_tokens_seen": 23864736, + "step": 11040 + }, + { + "epoch": 1.801794453507341, + "grad_norm": 1.4763901233673096, + "learning_rate": 4.902761453881065e-05, + "loss": 0.1908, + "num_input_tokens_seen": 23874432, + "step": 11045 + }, + { + "epoch": 1.802610114192496, + "grad_norm": 0.4412177801132202, + "learning_rate": 4.9025647687206824e-05, + "loss": 0.0565, + "num_input_tokens_seen": 23885056, + "step": 11050 + }, + { + "epoch": 1.8034257748776508, + "grad_norm": 0.5296454429626465, + "learning_rate": 4.90236788879621e-05, + "loss": 0.139, + "num_input_tokens_seen": 23896096, + "step": 11055 + }, + { + "epoch": 1.804241435562806, + "grad_norm": 0.3225826025009155, + "learning_rate": 4.9021708141236056e-05, + "loss": 0.1102, + "num_input_tokens_seen": 23907200, + "step": 11060 + }, + { + "epoch": 1.8050570962479608, + "grad_norm": 0.41703757643699646, + "learning_rate": 4.901973544718847e-05, + "loss": 0.1011, + "num_input_tokens_seen": 23916576, + "step": 11065 + }, + { + "epoch": 1.8058727569331157, + "grad_norm": 0.10879310965538025, + "learning_rate": 4.901776080597926e-05, + "loss": 0.0916, + "num_input_tokens_seen": 23926784, + "step": 11070 + }, + { + "epoch": 1.806688417618271, + "grad_norm": 1.0350428819656372, + "learning_rate": 4.9015784217768487e-05, + "loss": 0.2252, + "num_input_tokens_seen": 23937632, + "step": 11075 + }, + { + "epoch": 1.8075040783034257, + "grad_norm": 0.9058722853660583, + "learning_rate": 4.901380568271639e-05, + "loss": 0.0377, + "num_input_tokens_seen": 23949024, + "step": 11080 + }, + { + "epoch": 1.8083197389885808, + "grad_norm": 0.7912176251411438, + "learning_rate": 4.901182520098336e-05, + "loss": 0.1899, + "num_input_tokens_seen": 23960192, + "step": 11085 + }, + { + "epoch": 1.8091353996737358, + "grad_norm": 0.641477644443512, + "learning_rate": 4.9009842772729944e-05, + "loss": 0.1941, + "num_input_tokens_seen": 23970304, + "step": 11090 + }, + { + "epoch": 1.8099510603588906, + "grad_norm": 0.22677692770957947, + "learning_rate": 4.9007858398116856e-05, + "loss": 0.1048, + "num_input_tokens_seen": 23980544, + "step": 11095 + }, + { + "epoch": 1.8107667210440457, + "grad_norm": 1.4863380193710327, + "learning_rate": 4.9005872077304944e-05, + "loss": 0.0962, + "num_input_tokens_seen": 23991680, + "step": 11100 + }, + { + "epoch": 1.8115823817292007, + "grad_norm": 0.3254639804363251, + "learning_rate": 4.900388381045524e-05, + "loss": 0.2548, + "num_input_tokens_seen": 24000672, + "step": 11105 + }, + { + "epoch": 1.8123980424143555, + "grad_norm": 0.38307857513427734, + "learning_rate": 4.9001893597728915e-05, + "loss": 0.1999, + "num_input_tokens_seen": 24010496, + "step": 11110 + }, + { + "epoch": 1.8132137030995106, + "grad_norm": 0.20040693879127502, + "learning_rate": 4.899990143928731e-05, + "loss": 0.1024, + "num_input_tokens_seen": 24021056, + "step": 11115 + }, + { + "epoch": 1.8140293637846656, + "grad_norm": 0.13844339549541473, + "learning_rate": 4.899790733529193e-05, + "loss": 0.1328, + "num_input_tokens_seen": 24031136, + "step": 11120 + }, + { + "epoch": 1.8148450244698204, + "grad_norm": 0.22696815431118011, + "learning_rate": 4.8995911285904404e-05, + "loss": 0.0261, + "num_input_tokens_seen": 24040864, + "step": 11125 + }, + { + "epoch": 1.8156606851549757, + "grad_norm": 0.3906095027923584, + "learning_rate": 4.899391329128656e-05, + "loss": 0.2294, + "num_input_tokens_seen": 24051168, + "step": 11130 + }, + { + "epoch": 1.8164763458401305, + "grad_norm": 1.1550642251968384, + "learning_rate": 4.899191335160037e-05, + "loss": 0.1693, + "num_input_tokens_seen": 24060448, + "step": 11135 + }, + { + "epoch": 1.8172920065252853, + "grad_norm": 0.3427199423313141, + "learning_rate": 4.898991146700794e-05, + "loss": 0.3734, + "num_input_tokens_seen": 24070208, + "step": 11140 + }, + { + "epoch": 1.8181076672104406, + "grad_norm": 0.18821614980697632, + "learning_rate": 4.898790763767157e-05, + "loss": 0.0749, + "num_input_tokens_seen": 24080000, + "step": 11145 + }, + { + "epoch": 1.8189233278955954, + "grad_norm": 0.9808987975120544, + "learning_rate": 4.8985901863753694e-05, + "loss": 0.1195, + "num_input_tokens_seen": 24089120, + "step": 11150 + }, + { + "epoch": 1.8197389885807504, + "grad_norm": 0.12925080955028534, + "learning_rate": 4.8983894145416896e-05, + "loss": 0.0743, + "num_input_tokens_seen": 24099968, + "step": 11155 + }, + { + "epoch": 1.8205546492659055, + "grad_norm": 0.5045719146728516, + "learning_rate": 4.898188448282396e-05, + "loss": 0.0648, + "num_input_tokens_seen": 24110560, + "step": 11160 + }, + { + "epoch": 1.8213703099510603, + "grad_norm": 0.07339629530906677, + "learning_rate": 4.897987287613778e-05, + "loss": 0.1302, + "num_input_tokens_seen": 24121984, + "step": 11165 + }, + { + "epoch": 1.8221859706362153, + "grad_norm": 1.2263209819793701, + "learning_rate": 4.897785932552143e-05, + "loss": 0.2561, + "num_input_tokens_seen": 24132544, + "step": 11170 + }, + { + "epoch": 1.8230016313213704, + "grad_norm": 1.0832867622375488, + "learning_rate": 4.897584383113814e-05, + "loss": 0.1706, + "num_input_tokens_seen": 24143808, + "step": 11175 + }, + { + "epoch": 1.8238172920065252, + "grad_norm": 0.8655292391777039, + "learning_rate": 4.89738263931513e-05, + "loss": 0.2561, + "num_input_tokens_seen": 24154496, + "step": 11180 + }, + { + "epoch": 1.8246329526916802, + "grad_norm": 1.0648332834243774, + "learning_rate": 4.8971807011724444e-05, + "loss": 0.162, + "num_input_tokens_seen": 24165472, + "step": 11185 + }, + { + "epoch": 1.8254486133768353, + "grad_norm": 0.16703511774539948, + "learning_rate": 4.8969785687021294e-05, + "loss": 0.0576, + "num_input_tokens_seen": 24177088, + "step": 11190 + }, + { + "epoch": 1.82626427406199, + "grad_norm": 0.18617329001426697, + "learning_rate": 4.8967762419205684e-05, + "loss": 0.1119, + "num_input_tokens_seen": 24188768, + "step": 11195 + }, + { + "epoch": 1.8270799347471451, + "grad_norm": 0.2792664170265198, + "learning_rate": 4.896573720844164e-05, + "loss": 0.1685, + "num_input_tokens_seen": 24199712, + "step": 11200 + }, + { + "epoch": 1.8278955954323002, + "grad_norm": 0.4508182108402252, + "learning_rate": 4.896371005489334e-05, + "loss": 0.1611, + "num_input_tokens_seen": 24210912, + "step": 11205 + }, + { + "epoch": 1.828711256117455, + "grad_norm": 1.4522744417190552, + "learning_rate": 4.896168095872511e-05, + "loss": 0.1429, + "num_input_tokens_seen": 24221824, + "step": 11210 + }, + { + "epoch": 1.8295269168026103, + "grad_norm": 0.11767175793647766, + "learning_rate": 4.895964992010145e-05, + "loss": 0.0863, + "num_input_tokens_seen": 24231168, + "step": 11215 + }, + { + "epoch": 1.830342577487765, + "grad_norm": 0.06283705681562424, + "learning_rate": 4.895761693918699e-05, + "loss": 0.0234, + "num_input_tokens_seen": 24241408, + "step": 11220 + }, + { + "epoch": 1.8311582381729201, + "grad_norm": 0.5714608430862427, + "learning_rate": 4.895558201614654e-05, + "loss": 0.179, + "num_input_tokens_seen": 24251520, + "step": 11225 + }, + { + "epoch": 1.8319738988580752, + "grad_norm": 0.12997697293758392, + "learning_rate": 4.895354515114506e-05, + "loss": 0.2542, + "num_input_tokens_seen": 24262144, + "step": 11230 + }, + { + "epoch": 1.83278955954323, + "grad_norm": 1.1499295234680176, + "learning_rate": 4.895150634434769e-05, + "loss": 0.1397, + "num_input_tokens_seen": 24273440, + "step": 11235 + }, + { + "epoch": 1.833605220228385, + "grad_norm": 0.8665029406547546, + "learning_rate": 4.894946559591966e-05, + "loss": 0.0522, + "num_input_tokens_seen": 24284832, + "step": 11240 + }, + { + "epoch": 1.83442088091354, + "grad_norm": 0.4014985263347626, + "learning_rate": 4.8947422906026446e-05, + "loss": 0.1711, + "num_input_tokens_seen": 24295488, + "step": 11245 + }, + { + "epoch": 1.8352365415986949, + "grad_norm": 0.4329793453216553, + "learning_rate": 4.894537827483362e-05, + "loss": 0.1445, + "num_input_tokens_seen": 24306624, + "step": 11250 + }, + { + "epoch": 1.83605220228385, + "grad_norm": 0.19141638278961182, + "learning_rate": 4.8943331702506935e-05, + "loss": 0.0864, + "num_input_tokens_seen": 24318176, + "step": 11255 + }, + { + "epoch": 1.836867862969005, + "grad_norm": 0.5342246890068054, + "learning_rate": 4.894128318921229e-05, + "loss": 0.1482, + "num_input_tokens_seen": 24329024, + "step": 11260 + }, + { + "epoch": 1.8376835236541598, + "grad_norm": 0.05448802933096886, + "learning_rate": 4.893923273511576e-05, + "loss": 0.0734, + "num_input_tokens_seen": 24339680, + "step": 11265 + }, + { + "epoch": 1.8384991843393148, + "grad_norm": 0.053307320922613144, + "learning_rate": 4.893718034038355e-05, + "loss": 0.2316, + "num_input_tokens_seen": 24350368, + "step": 11270 + }, + { + "epoch": 1.8393148450244698, + "grad_norm": 0.3530235290527344, + "learning_rate": 4.8935126005182056e-05, + "loss": 0.0914, + "num_input_tokens_seen": 24361536, + "step": 11275 + }, + { + "epoch": 1.8401305057096247, + "grad_norm": 0.06855583935976028, + "learning_rate": 4.8933069729677795e-05, + "loss": 0.1889, + "num_input_tokens_seen": 24372768, + "step": 11280 + }, + { + "epoch": 1.84094616639478, + "grad_norm": 1.6704508066177368, + "learning_rate": 4.893101151403747e-05, + "loss": 0.1617, + "num_input_tokens_seen": 24382592, + "step": 11285 + }, + { + "epoch": 1.8417618270799347, + "grad_norm": 0.514264702796936, + "learning_rate": 4.892895135842792e-05, + "loss": 0.1061, + "num_input_tokens_seen": 24393248, + "step": 11290 + }, + { + "epoch": 1.8425774877650896, + "grad_norm": 0.6328234076499939, + "learning_rate": 4.892688926301616e-05, + "loss": 0.1388, + "num_input_tokens_seen": 24403232, + "step": 11295 + }, + { + "epoch": 1.8433931484502448, + "grad_norm": 0.4304945468902588, + "learning_rate": 4.892482522796936e-05, + "loss": 0.0954, + "num_input_tokens_seen": 24413792, + "step": 11300 + }, + { + "epoch": 1.8442088091353996, + "grad_norm": 0.24159249663352966, + "learning_rate": 4.892275925345483e-05, + "loss": 0.078, + "num_input_tokens_seen": 24423328, + "step": 11305 + }, + { + "epoch": 1.8450244698205547, + "grad_norm": 0.6167858839035034, + "learning_rate": 4.8920691339640055e-05, + "loss": 0.2761, + "num_input_tokens_seen": 24434464, + "step": 11310 + }, + { + "epoch": 1.8458401305057097, + "grad_norm": 0.09842648357152939, + "learning_rate": 4.8918621486692663e-05, + "loss": 0.1146, + "num_input_tokens_seen": 24444224, + "step": 11315 + }, + { + "epoch": 1.8466557911908645, + "grad_norm": 0.12907062470912933, + "learning_rate": 4.8916549694780455e-05, + "loss": 0.1194, + "num_input_tokens_seen": 24454912, + "step": 11320 + }, + { + "epoch": 1.8474714518760196, + "grad_norm": 0.6324743628501892, + "learning_rate": 4.891447596407137e-05, + "loss": 0.0572, + "num_input_tokens_seen": 24466304, + "step": 11325 + }, + { + "epoch": 1.8482871125611746, + "grad_norm": 0.5863937735557556, + "learning_rate": 4.8912400294733526e-05, + "loss": 0.1656, + "num_input_tokens_seen": 24476960, + "step": 11330 + }, + { + "epoch": 1.8491027732463294, + "grad_norm": 0.7638999223709106, + "learning_rate": 4.891032268693519e-05, + "loss": 0.2082, + "num_input_tokens_seen": 24486784, + "step": 11335 + }, + { + "epoch": 1.8499184339314845, + "grad_norm": 0.16736260056495667, + "learning_rate": 4.8908243140844765e-05, + "loss": 0.0616, + "num_input_tokens_seen": 24497120, + "step": 11340 + }, + { + "epoch": 1.8507340946166395, + "grad_norm": 0.3171115219593048, + "learning_rate": 4.890616165663085e-05, + "loss": 0.1409, + "num_input_tokens_seen": 24507200, + "step": 11345 + }, + { + "epoch": 1.8515497553017943, + "grad_norm": 0.9361408948898315, + "learning_rate": 4.890407823446218e-05, + "loss": 0.267, + "num_input_tokens_seen": 24517920, + "step": 11350 + }, + { + "epoch": 1.8523654159869496, + "grad_norm": 0.29708027839660645, + "learning_rate": 4.890199287450763e-05, + "loss": 0.1161, + "num_input_tokens_seen": 24528640, + "step": 11355 + }, + { + "epoch": 1.8531810766721044, + "grad_norm": 1.4451370239257812, + "learning_rate": 4.889990557693626e-05, + "loss": 0.2154, + "num_input_tokens_seen": 24540352, + "step": 11360 + }, + { + "epoch": 1.8539967373572592, + "grad_norm": 0.8457237482070923, + "learning_rate": 4.889781634191728e-05, + "loss": 0.0786, + "num_input_tokens_seen": 24551584, + "step": 11365 + }, + { + "epoch": 1.8548123980424145, + "grad_norm": 0.3698783218860626, + "learning_rate": 4.889572516962006e-05, + "loss": 0.1148, + "num_input_tokens_seen": 24562400, + "step": 11370 + }, + { + "epoch": 1.8556280587275693, + "grad_norm": 0.17349012196063995, + "learning_rate": 4.889363206021409e-05, + "loss": 0.1338, + "num_input_tokens_seen": 24572736, + "step": 11375 + }, + { + "epoch": 1.8564437194127243, + "grad_norm": 1.4691931009292603, + "learning_rate": 4.889153701386908e-05, + "loss": 0.2128, + "num_input_tokens_seen": 24584416, + "step": 11380 + }, + { + "epoch": 1.8572593800978794, + "grad_norm": 1.9815444946289062, + "learning_rate": 4.888944003075486e-05, + "loss": 0.2383, + "num_input_tokens_seen": 24593472, + "step": 11385 + }, + { + "epoch": 1.8580750407830342, + "grad_norm": 0.09037842601537704, + "learning_rate": 4.888734111104142e-05, + "loss": 0.3937, + "num_input_tokens_seen": 24604544, + "step": 11390 + }, + { + "epoch": 1.8588907014681892, + "grad_norm": 0.14706182479858398, + "learning_rate": 4.88852402548989e-05, + "loss": 0.1211, + "num_input_tokens_seen": 24614784, + "step": 11395 + }, + { + "epoch": 1.8597063621533443, + "grad_norm": 0.8038565516471863, + "learning_rate": 4.8883137462497615e-05, + "loss": 0.1234, + "num_input_tokens_seen": 24625504, + "step": 11400 + }, + { + "epoch": 1.860522022838499, + "grad_norm": 0.8292802572250366, + "learning_rate": 4.8881032734008024e-05, + "loss": 0.1179, + "num_input_tokens_seen": 24636192, + "step": 11405 + }, + { + "epoch": 1.8613376835236541, + "grad_norm": 0.6918166279792786, + "learning_rate": 4.887892606960075e-05, + "loss": 0.1459, + "num_input_tokens_seen": 24647200, + "step": 11410 + }, + { + "epoch": 1.8621533442088092, + "grad_norm": 0.10024622082710266, + "learning_rate": 4.887681746944657e-05, + "loss": 0.0639, + "num_input_tokens_seen": 24656640, + "step": 11415 + }, + { + "epoch": 1.862969004893964, + "grad_norm": 1.9872785806655884, + "learning_rate": 4.8874706933716406e-05, + "loss": 0.0825, + "num_input_tokens_seen": 24665664, + "step": 11420 + }, + { + "epoch": 1.863784665579119, + "grad_norm": 0.703723669052124, + "learning_rate": 4.887259446258137e-05, + "loss": 0.1545, + "num_input_tokens_seen": 24676896, + "step": 11425 + }, + { + "epoch": 1.864600326264274, + "grad_norm": 0.4607495963573456, + "learning_rate": 4.887048005621269e-05, + "loss": 0.1044, + "num_input_tokens_seen": 24688960, + "step": 11430 + }, + { + "epoch": 1.865415986949429, + "grad_norm": 0.5059289336204529, + "learning_rate": 4.886836371478178e-05, + "loss": 0.0842, + "num_input_tokens_seen": 24699200, + "step": 11435 + }, + { + "epoch": 1.8662316476345842, + "grad_norm": 0.730379581451416, + "learning_rate": 4.8866245438460215e-05, + "loss": 0.1003, + "num_input_tokens_seen": 24709600, + "step": 11440 + }, + { + "epoch": 1.867047308319739, + "grad_norm": 1.9107879400253296, + "learning_rate": 4.886412522741968e-05, + "loss": 0.2707, + "num_input_tokens_seen": 24719968, + "step": 11445 + }, + { + "epoch": 1.867862969004894, + "grad_norm": 0.10750371962785721, + "learning_rate": 4.886200308183207e-05, + "loss": 0.0364, + "num_input_tokens_seen": 24731520, + "step": 11450 + }, + { + "epoch": 1.868678629690049, + "grad_norm": 0.4764733612537384, + "learning_rate": 4.885987900186943e-05, + "loss": 0.0785, + "num_input_tokens_seen": 24742880, + "step": 11455 + }, + { + "epoch": 1.8694942903752039, + "grad_norm": 0.04602759703993797, + "learning_rate": 4.8857752987703924e-05, + "loss": 0.1717, + "num_input_tokens_seen": 24753600, + "step": 11460 + }, + { + "epoch": 1.870309951060359, + "grad_norm": 0.22615784406661987, + "learning_rate": 4.8855625039507916e-05, + "loss": 0.2017, + "num_input_tokens_seen": 24764608, + "step": 11465 + }, + { + "epoch": 1.871125611745514, + "grad_norm": 1.6973901987075806, + "learning_rate": 4.8853495157453886e-05, + "loss": 0.1853, + "num_input_tokens_seen": 24775872, + "step": 11470 + }, + { + "epoch": 1.8719412724306688, + "grad_norm": 0.9939471483230591, + "learning_rate": 4.885136334171452e-05, + "loss": 0.1505, + "num_input_tokens_seen": 24786208, + "step": 11475 + }, + { + "epoch": 1.8727569331158238, + "grad_norm": 1.6235151290893555, + "learning_rate": 4.8849229592462615e-05, + "loss": 0.3, + "num_input_tokens_seen": 24795296, + "step": 11480 + }, + { + "epoch": 1.8735725938009788, + "grad_norm": 0.16128504276275635, + "learning_rate": 4.884709390987115e-05, + "loss": 0.1011, + "num_input_tokens_seen": 24805632, + "step": 11485 + }, + { + "epoch": 1.8743882544861337, + "grad_norm": 0.8719653487205505, + "learning_rate": 4.8844956294113255e-05, + "loss": 0.0986, + "num_input_tokens_seen": 24815552, + "step": 11490 + }, + { + "epoch": 1.8752039151712887, + "grad_norm": 0.08685186505317688, + "learning_rate": 4.884281674536221e-05, + "loss": 0.2034, + "num_input_tokens_seen": 24827264, + "step": 11495 + }, + { + "epoch": 1.8760195758564437, + "grad_norm": 0.08469411730766296, + "learning_rate": 4.884067526379147e-05, + "loss": 0.1035, + "num_input_tokens_seen": 24839168, + "step": 11500 + }, + { + "epoch": 1.8768352365415986, + "grad_norm": 0.7849903702735901, + "learning_rate": 4.8838531849574624e-05, + "loss": 0.1625, + "num_input_tokens_seen": 24850336, + "step": 11505 + }, + { + "epoch": 1.8776508972267538, + "grad_norm": 0.17188666760921478, + "learning_rate": 4.8836386502885426e-05, + "loss": 0.0351, + "num_input_tokens_seen": 24861984, + "step": 11510 + }, + { + "epoch": 1.8784665579119086, + "grad_norm": 0.9335209727287292, + "learning_rate": 4.88342392238978e-05, + "loss": 0.1219, + "num_input_tokens_seen": 24872160, + "step": 11515 + }, + { + "epoch": 1.8792822185970635, + "grad_norm": 0.9970380067825317, + "learning_rate": 4.88320900127858e-05, + "loss": 0.2172, + "num_input_tokens_seen": 24882272, + "step": 11520 + }, + { + "epoch": 1.8800978792822187, + "grad_norm": 1.2653907537460327, + "learning_rate": 4.882993886972367e-05, + "loss": 0.247, + "num_input_tokens_seen": 24892448, + "step": 11525 + }, + { + "epoch": 1.8809135399673735, + "grad_norm": 0.4529324173927307, + "learning_rate": 4.882778579488578e-05, + "loss": 0.0928, + "num_input_tokens_seen": 24902016, + "step": 11530 + }, + { + "epoch": 1.8817292006525286, + "grad_norm": 1.9510047435760498, + "learning_rate": 4.882563078844668e-05, + "loss": 0.2199, + "num_input_tokens_seen": 24913344, + "step": 11535 + }, + { + "epoch": 1.8825448613376836, + "grad_norm": 0.7624040246009827, + "learning_rate": 4.882347385058105e-05, + "loss": 0.2063, + "num_input_tokens_seen": 24923840, + "step": 11540 + }, + { + "epoch": 1.8833605220228384, + "grad_norm": 0.13526307046413422, + "learning_rate": 4.882131498146375e-05, + "loss": 0.1974, + "num_input_tokens_seen": 24933568, + "step": 11545 + }, + { + "epoch": 1.8841761827079935, + "grad_norm": 0.533092737197876, + "learning_rate": 4.881915418126979e-05, + "loss": 0.0793, + "num_input_tokens_seen": 24943520, + "step": 11550 + }, + { + "epoch": 1.8849918433931485, + "grad_norm": 2.3084022998809814, + "learning_rate": 4.8816991450174334e-05, + "loss": 0.1972, + "num_input_tokens_seen": 24954976, + "step": 11555 + }, + { + "epoch": 1.8858075040783033, + "grad_norm": 0.3834567070007324, + "learning_rate": 4.881482678835271e-05, + "loss": 0.0552, + "num_input_tokens_seen": 24964384, + "step": 11560 + }, + { + "epoch": 1.8866231647634584, + "grad_norm": 0.5558674931526184, + "learning_rate": 4.881266019598039e-05, + "loss": 0.0574, + "num_input_tokens_seen": 24975584, + "step": 11565 + }, + { + "epoch": 1.8874388254486134, + "grad_norm": 1.696108102798462, + "learning_rate": 4.8810491673233006e-05, + "loss": 0.0959, + "num_input_tokens_seen": 24987520, + "step": 11570 + }, + { + "epoch": 1.8882544861337682, + "grad_norm": 0.16730380058288574, + "learning_rate": 4.880832122028635e-05, + "loss": 0.1392, + "num_input_tokens_seen": 24999424, + "step": 11575 + }, + { + "epoch": 1.8890701468189235, + "grad_norm": 1.977611780166626, + "learning_rate": 4.880614883731638e-05, + "loss": 0.2573, + "num_input_tokens_seen": 25010624, + "step": 11580 + }, + { + "epoch": 1.8898858075040783, + "grad_norm": 2.798954486846924, + "learning_rate": 4.88039745244992e-05, + "loss": 0.249, + "num_input_tokens_seen": 25021696, + "step": 11585 + }, + { + "epoch": 1.8907014681892331, + "grad_norm": 0.8682471513748169, + "learning_rate": 4.880179828201106e-05, + "loss": 0.1363, + "num_input_tokens_seen": 25032640, + "step": 11590 + }, + { + "epoch": 1.8915171288743884, + "grad_norm": 0.0452408529818058, + "learning_rate": 4.8799620110028375e-05, + "loss": 0.24, + "num_input_tokens_seen": 25043744, + "step": 11595 + }, + { + "epoch": 1.8923327895595432, + "grad_norm": 1.2058929204940796, + "learning_rate": 4.879744000872774e-05, + "loss": 0.2806, + "num_input_tokens_seen": 25055136, + "step": 11600 + }, + { + "epoch": 1.8931484502446982, + "grad_norm": 0.06619522720575333, + "learning_rate": 4.879525797828585e-05, + "loss": 0.1416, + "num_input_tokens_seen": 25066528, + "step": 11605 + }, + { + "epoch": 1.8939641109298533, + "grad_norm": 0.4435395896434784, + "learning_rate": 4.879307401887963e-05, + "loss": 0.0471, + "num_input_tokens_seen": 25077280, + "step": 11610 + }, + { + "epoch": 1.894779771615008, + "grad_norm": 1.5265368223190308, + "learning_rate": 4.87908881306861e-05, + "loss": 0.1938, + "num_input_tokens_seen": 25088544, + "step": 11615 + }, + { + "epoch": 1.8955954323001631, + "grad_norm": 0.16881117224693298, + "learning_rate": 4.878870031388246e-05, + "loss": 0.0455, + "num_input_tokens_seen": 25097344, + "step": 11620 + }, + { + "epoch": 1.8964110929853182, + "grad_norm": 0.9110477566719055, + "learning_rate": 4.8786510568646074e-05, + "loss": 0.2074, + "num_input_tokens_seen": 25108736, + "step": 11625 + }, + { + "epoch": 1.897226753670473, + "grad_norm": 0.9393763542175293, + "learning_rate": 4.878431889515445e-05, + "loss": 0.1776, + "num_input_tokens_seen": 25119648, + "step": 11630 + }, + { + "epoch": 1.898042414355628, + "grad_norm": 0.2141273021697998, + "learning_rate": 4.8782125293585255e-05, + "loss": 0.0488, + "num_input_tokens_seen": 25130048, + "step": 11635 + }, + { + "epoch": 1.898858075040783, + "grad_norm": 1.2580442428588867, + "learning_rate": 4.877992976411632e-05, + "loss": 0.1296, + "num_input_tokens_seen": 25140032, + "step": 11640 + }, + { + "epoch": 1.899673735725938, + "grad_norm": 0.20478147268295288, + "learning_rate": 4.8777732306925614e-05, + "loss": 0.1742, + "num_input_tokens_seen": 25151136, + "step": 11645 + }, + { + "epoch": 1.900489396411093, + "grad_norm": 0.23843401670455933, + "learning_rate": 4.877553292219128e-05, + "loss": 0.1225, + "num_input_tokens_seen": 25161408, + "step": 11650 + }, + { + "epoch": 1.901305057096248, + "grad_norm": 1.666159749031067, + "learning_rate": 4.877333161009161e-05, + "loss": 0.1647, + "num_input_tokens_seen": 25172224, + "step": 11655 + }, + { + "epoch": 1.9021207177814028, + "grad_norm": 0.6406264901161194, + "learning_rate": 4.8771128370805066e-05, + "loss": 0.3146, + "num_input_tokens_seen": 25183424, + "step": 11660 + }, + { + "epoch": 1.902936378466558, + "grad_norm": 0.9234243631362915, + "learning_rate": 4.876892320451023e-05, + "loss": 0.1173, + "num_input_tokens_seen": 25194944, + "step": 11665 + }, + { + "epoch": 1.9037520391517129, + "grad_norm": 0.7858626842498779, + "learning_rate": 4.876671611138588e-05, + "loss": 0.1227, + "num_input_tokens_seen": 25205728, + "step": 11670 + }, + { + "epoch": 1.904567699836868, + "grad_norm": 0.1792234182357788, + "learning_rate": 4.876450709161093e-05, + "loss": 0.1879, + "num_input_tokens_seen": 25216576, + "step": 11675 + }, + { + "epoch": 1.905383360522023, + "grad_norm": 0.03908146917819977, + "learning_rate": 4.876229614536446e-05, + "loss": 0.0204, + "num_input_tokens_seen": 25227616, + "step": 11680 + }, + { + "epoch": 1.9061990212071778, + "grad_norm": 0.42534196376800537, + "learning_rate": 4.8760083272825695e-05, + "loss": 0.123, + "num_input_tokens_seen": 25237856, + "step": 11685 + }, + { + "epoch": 1.9070146818923328, + "grad_norm": 1.8126550912857056, + "learning_rate": 4.875786847417402e-05, + "loss": 0.1758, + "num_input_tokens_seen": 25248256, + "step": 11690 + }, + { + "epoch": 1.9078303425774878, + "grad_norm": 0.19900965690612793, + "learning_rate": 4.875565174958898e-05, + "loss": 0.1028, + "num_input_tokens_seen": 25258944, + "step": 11695 + }, + { + "epoch": 1.9086460032626427, + "grad_norm": 0.22923874855041504, + "learning_rate": 4.8753433099250276e-05, + "loss": 0.1239, + "num_input_tokens_seen": 25269120, + "step": 11700 + }, + { + "epoch": 1.9094616639477977, + "grad_norm": 1.0266461372375488, + "learning_rate": 4.875121252333776e-05, + "loss": 0.1272, + "num_input_tokens_seen": 25280160, + "step": 11705 + }, + { + "epoch": 1.9102773246329527, + "grad_norm": 1.143520474433899, + "learning_rate": 4.874899002203145e-05, + "loss": 0.1807, + "num_input_tokens_seen": 25290112, + "step": 11710 + }, + { + "epoch": 1.9110929853181076, + "grad_norm": 0.4171189069747925, + "learning_rate": 4.8746765595511504e-05, + "loss": 0.1507, + "num_input_tokens_seen": 25301664, + "step": 11715 + }, + { + "epoch": 1.9119086460032626, + "grad_norm": 0.8873015642166138, + "learning_rate": 4.874453924395824e-05, + "loss": 0.2218, + "num_input_tokens_seen": 25311264, + "step": 11720 + }, + { + "epoch": 1.9127243066884176, + "grad_norm": 0.5969108939170837, + "learning_rate": 4.874231096755216e-05, + "loss": 0.0435, + "num_input_tokens_seen": 25321984, + "step": 11725 + }, + { + "epoch": 1.9135399673735725, + "grad_norm": 0.4166800081729889, + "learning_rate": 4.8740080766473876e-05, + "loss": 0.0926, + "num_input_tokens_seen": 25333664, + "step": 11730 + }, + { + "epoch": 1.9143556280587277, + "grad_norm": 0.799900233745575, + "learning_rate": 4.873784864090419e-05, + "loss": 0.1268, + "num_input_tokens_seen": 25342944, + "step": 11735 + }, + { + "epoch": 1.9151712887438825, + "grad_norm": 0.6129142642021179, + "learning_rate": 4.873561459102406e-05, + "loss": 0.1427, + "num_input_tokens_seen": 25353568, + "step": 11740 + }, + { + "epoch": 1.9159869494290374, + "grad_norm": 0.40975579619407654, + "learning_rate": 4.873337861701456e-05, + "loss": 0.1283, + "num_input_tokens_seen": 25364736, + "step": 11745 + }, + { + "epoch": 1.9168026101141926, + "grad_norm": 0.04878057911992073, + "learning_rate": 4.8731140719056977e-05, + "loss": 0.1168, + "num_input_tokens_seen": 25374528, + "step": 11750 + }, + { + "epoch": 1.9176182707993474, + "grad_norm": 0.2008107602596283, + "learning_rate": 4.872890089733272e-05, + "loss": 0.1348, + "num_input_tokens_seen": 25385440, + "step": 11755 + }, + { + "epoch": 1.9184339314845025, + "grad_norm": 1.533408522605896, + "learning_rate": 4.8726659152023356e-05, + "loss": 0.2526, + "num_input_tokens_seen": 25397344, + "step": 11760 + }, + { + "epoch": 1.9192495921696575, + "grad_norm": 0.13346454501152039, + "learning_rate": 4.87244154833106e-05, + "loss": 0.0628, + "num_input_tokens_seen": 25408448, + "step": 11765 + }, + { + "epoch": 1.9200652528548123, + "grad_norm": 0.4927123486995697, + "learning_rate": 4.872216989137637e-05, + "loss": 0.1862, + "num_input_tokens_seen": 25418336, + "step": 11770 + }, + { + "epoch": 1.9208809135399674, + "grad_norm": 0.6781282424926758, + "learning_rate": 4.871992237640267e-05, + "loss": 0.1731, + "num_input_tokens_seen": 25429824, + "step": 11775 + }, + { + "epoch": 1.9216965742251224, + "grad_norm": 0.25802409648895264, + "learning_rate": 4.871767293857171e-05, + "loss": 0.1972, + "num_input_tokens_seen": 25439584, + "step": 11780 + }, + { + "epoch": 1.9225122349102772, + "grad_norm": 0.8111163973808289, + "learning_rate": 4.871542157806584e-05, + "loss": 0.1574, + "num_input_tokens_seen": 25451744, + "step": 11785 + }, + { + "epoch": 1.9233278955954323, + "grad_norm": 0.19863565266132355, + "learning_rate": 4.871316829506757e-05, + "loss": 0.0374, + "num_input_tokens_seen": 25461856, + "step": 11790 + }, + { + "epoch": 1.9241435562805873, + "grad_norm": 2.4421234130859375, + "learning_rate": 4.871091308975955e-05, + "loss": 0.2828, + "num_input_tokens_seen": 25473120, + "step": 11795 + }, + { + "epoch": 1.9249592169657421, + "grad_norm": 0.857035756111145, + "learning_rate": 4.8708655962324615e-05, + "loss": 0.0552, + "num_input_tokens_seen": 25485632, + "step": 11800 + }, + { + "epoch": 1.9257748776508974, + "grad_norm": 0.5191669464111328, + "learning_rate": 4.870639691294573e-05, + "loss": 0.0577, + "num_input_tokens_seen": 25496160, + "step": 11805 + }, + { + "epoch": 1.9265905383360522, + "grad_norm": 1.2635668516159058, + "learning_rate": 4.8704135941806016e-05, + "loss": 0.1589, + "num_input_tokens_seen": 25507712, + "step": 11810 + }, + { + "epoch": 1.927406199021207, + "grad_norm": 2.340160846710205, + "learning_rate": 4.870187304908878e-05, + "loss": 0.2222, + "num_input_tokens_seen": 25519328, + "step": 11815 + }, + { + "epoch": 1.9282218597063623, + "grad_norm": 0.6560558080673218, + "learning_rate": 4.869960823497745e-05, + "loss": 0.058, + "num_input_tokens_seen": 25528992, + "step": 11820 + }, + { + "epoch": 1.929037520391517, + "grad_norm": 0.2504071593284607, + "learning_rate": 4.8697341499655626e-05, + "loss": 0.1589, + "num_input_tokens_seen": 25540032, + "step": 11825 + }, + { + "epoch": 1.9298531810766721, + "grad_norm": 3.292569160461426, + "learning_rate": 4.8695072843307064e-05, + "loss": 0.1331, + "num_input_tokens_seen": 25551392, + "step": 11830 + }, + { + "epoch": 1.9306688417618272, + "grad_norm": 1.5722140073776245, + "learning_rate": 4.869280226611567e-05, + "loss": 0.1327, + "num_input_tokens_seen": 25561856, + "step": 11835 + }, + { + "epoch": 1.931484502446982, + "grad_norm": 0.6263178586959839, + "learning_rate": 4.86905297682655e-05, + "loss": 0.1727, + "num_input_tokens_seen": 25571712, + "step": 11840 + }, + { + "epoch": 1.932300163132137, + "grad_norm": 0.16666299104690552, + "learning_rate": 4.868825534994078e-05, + "loss": 0.0736, + "num_input_tokens_seen": 25581088, + "step": 11845 + }, + { + "epoch": 1.933115823817292, + "grad_norm": 0.07264748960733414, + "learning_rate": 4.86859790113259e-05, + "loss": 0.0273, + "num_input_tokens_seen": 25593376, + "step": 11850 + }, + { + "epoch": 1.933931484502447, + "grad_norm": 1.1715854406356812, + "learning_rate": 4.868370075260538e-05, + "loss": 0.213, + "num_input_tokens_seen": 25603488, + "step": 11855 + }, + { + "epoch": 1.934747145187602, + "grad_norm": 0.8068843483924866, + "learning_rate": 4.86814205739639e-05, + "loss": 0.3003, + "num_input_tokens_seen": 25613216, + "step": 11860 + }, + { + "epoch": 1.935562805872757, + "grad_norm": 0.6677848100662231, + "learning_rate": 4.86791384755863e-05, + "loss": 0.0721, + "num_input_tokens_seen": 25624096, + "step": 11865 + }, + { + "epoch": 1.9363784665579118, + "grad_norm": 0.18287482857704163, + "learning_rate": 4.86768544576576e-05, + "loss": 0.1752, + "num_input_tokens_seen": 25633760, + "step": 11870 + }, + { + "epoch": 1.9371941272430668, + "grad_norm": 0.3792082667350769, + "learning_rate": 4.867456852036295e-05, + "loss": 0.1306, + "num_input_tokens_seen": 25645216, + "step": 11875 + }, + { + "epoch": 1.9380097879282219, + "grad_norm": 0.3422375023365021, + "learning_rate": 4.867228066388765e-05, + "loss": 0.2554, + "num_input_tokens_seen": 25656288, + "step": 11880 + }, + { + "epoch": 1.9388254486133767, + "grad_norm": 0.2528005838394165, + "learning_rate": 4.866999088841716e-05, + "loss": 0.177, + "num_input_tokens_seen": 25667488, + "step": 11885 + }, + { + "epoch": 1.939641109298532, + "grad_norm": 2.0795843601226807, + "learning_rate": 4.866769919413711e-05, + "loss": 0.142, + "num_input_tokens_seen": 25678848, + "step": 11890 + }, + { + "epoch": 1.9404567699836868, + "grad_norm": 0.4998319447040558, + "learning_rate": 4.866540558123328e-05, + "loss": 0.132, + "num_input_tokens_seen": 25689280, + "step": 11895 + }, + { + "epoch": 1.9412724306688418, + "grad_norm": 1.2122588157653809, + "learning_rate": 4.8663110049891595e-05, + "loss": 0.1086, + "num_input_tokens_seen": 25699552, + "step": 11900 + }, + { + "epoch": 1.9420880913539968, + "grad_norm": 0.0686386451125145, + "learning_rate": 4.866081260029813e-05, + "loss": 0.1795, + "num_input_tokens_seen": 25709920, + "step": 11905 + }, + { + "epoch": 1.9429037520391517, + "grad_norm": 2.137895107269287, + "learning_rate": 4.8658513232639155e-05, + "loss": 0.2703, + "num_input_tokens_seen": 25722592, + "step": 11910 + }, + { + "epoch": 1.9437194127243067, + "grad_norm": 0.1718495786190033, + "learning_rate": 4.8656211947101054e-05, + "loss": 0.0387, + "num_input_tokens_seen": 25733696, + "step": 11915 + }, + { + "epoch": 1.9445350734094617, + "grad_norm": 0.5469145774841309, + "learning_rate": 4.865390874387038e-05, + "loss": 0.1981, + "num_input_tokens_seen": 25743712, + "step": 11920 + }, + { + "epoch": 1.9453507340946166, + "grad_norm": 0.15845216810703278, + "learning_rate": 4.865160362313384e-05, + "loss": 0.2128, + "num_input_tokens_seen": 25753952, + "step": 11925 + }, + { + "epoch": 1.9461663947797716, + "grad_norm": 0.34621137380599976, + "learning_rate": 4.8649296585078316e-05, + "loss": 0.1607, + "num_input_tokens_seen": 25764896, + "step": 11930 + }, + { + "epoch": 1.9469820554649266, + "grad_norm": 0.8383191823959351, + "learning_rate": 4.864698762989081e-05, + "loss": 0.1407, + "num_input_tokens_seen": 25774944, + "step": 11935 + }, + { + "epoch": 1.9477977161500815, + "grad_norm": 0.11861728876829147, + "learning_rate": 4.86446767577585e-05, + "loss": 0.0999, + "num_input_tokens_seen": 25785472, + "step": 11940 + }, + { + "epoch": 1.9486133768352365, + "grad_norm": 0.5963906049728394, + "learning_rate": 4.864236396886872e-05, + "loss": 0.2359, + "num_input_tokens_seen": 25796448, + "step": 11945 + }, + { + "epoch": 1.9494290375203915, + "grad_norm": 0.1360093057155609, + "learning_rate": 4.864004926340896e-05, + "loss": 0.1039, + "num_input_tokens_seen": 25807168, + "step": 11950 + }, + { + "epoch": 1.9502446982055464, + "grad_norm": 1.496074914932251, + "learning_rate": 4.8637732641566855e-05, + "loss": 0.1274, + "num_input_tokens_seen": 25817792, + "step": 11955 + }, + { + "epoch": 1.9510603588907016, + "grad_norm": 0.3960314393043518, + "learning_rate": 4.8635414103530205e-05, + "loss": 0.2113, + "num_input_tokens_seen": 25827392, + "step": 11960 + }, + { + "epoch": 1.9518760195758564, + "grad_norm": 0.1562812328338623, + "learning_rate": 4.863309364948697e-05, + "loss": 0.0454, + "num_input_tokens_seen": 25838720, + "step": 11965 + }, + { + "epoch": 1.9526916802610113, + "grad_norm": 1.62313973903656, + "learning_rate": 4.863077127962524e-05, + "loss": 0.0831, + "num_input_tokens_seen": 25850496, + "step": 11970 + }, + { + "epoch": 1.9535073409461665, + "grad_norm": 0.19097167253494263, + "learning_rate": 4.8628446994133306e-05, + "loss": 0.1356, + "num_input_tokens_seen": 25862240, + "step": 11975 + }, + { + "epoch": 1.9543230016313213, + "grad_norm": 0.5543133020401001, + "learning_rate": 4.8626120793199545e-05, + "loss": 0.2657, + "num_input_tokens_seen": 25873280, + "step": 11980 + }, + { + "epoch": 1.9551386623164764, + "grad_norm": 0.1144879162311554, + "learning_rate": 4.862379267701257e-05, + "loss": 0.0624, + "num_input_tokens_seen": 25883872, + "step": 11985 + }, + { + "epoch": 1.9559543230016314, + "grad_norm": 0.15556301176548004, + "learning_rate": 4.86214626457611e-05, + "loss": 0.0344, + "num_input_tokens_seen": 25895424, + "step": 11990 + }, + { + "epoch": 1.9567699836867862, + "grad_norm": 2.012242317199707, + "learning_rate": 4.8619130699633994e-05, + "loss": 0.2551, + "num_input_tokens_seen": 25906944, + "step": 11995 + }, + { + "epoch": 1.9575856443719413, + "grad_norm": 0.972852885723114, + "learning_rate": 4.861679683882033e-05, + "loss": 0.1318, + "num_input_tokens_seen": 25918720, + "step": 12000 + }, + { + "epoch": 1.9584013050570963, + "grad_norm": 0.6439188718795776, + "learning_rate": 4.861446106350928e-05, + "loss": 0.1001, + "num_input_tokens_seen": 25929248, + "step": 12005 + }, + { + "epoch": 1.9592169657422511, + "grad_norm": 1.367970585823059, + "learning_rate": 4.861212337389019e-05, + "loss": 0.2416, + "num_input_tokens_seen": 25939040, + "step": 12010 + }, + { + "epoch": 1.9600326264274062, + "grad_norm": 0.16289880871772766, + "learning_rate": 4.8609783770152575e-05, + "loss": 0.1661, + "num_input_tokens_seen": 25949024, + "step": 12015 + }, + { + "epoch": 1.9608482871125612, + "grad_norm": 0.08855515718460083, + "learning_rate": 4.8607442252486095e-05, + "loss": 0.2403, + "num_input_tokens_seen": 25959264, + "step": 12020 + }, + { + "epoch": 1.961663947797716, + "grad_norm": 0.6799286603927612, + "learning_rate": 4.8605098821080564e-05, + "loss": 0.0986, + "num_input_tokens_seen": 25970432, + "step": 12025 + }, + { + "epoch": 1.9624796084828713, + "grad_norm": 2.4543135166168213, + "learning_rate": 4.8602753476125954e-05, + "loss": 0.2287, + "num_input_tokens_seen": 25981856, + "step": 12030 + }, + { + "epoch": 1.963295269168026, + "grad_norm": 0.48550450801849365, + "learning_rate": 4.860040621781238e-05, + "loss": 0.3435, + "num_input_tokens_seen": 25993696, + "step": 12035 + }, + { + "epoch": 1.964110929853181, + "grad_norm": 1.076973557472229, + "learning_rate": 4.8598057046330135e-05, + "loss": 0.1972, + "num_input_tokens_seen": 26004736, + "step": 12040 + }, + { + "epoch": 1.9649265905383362, + "grad_norm": 0.27697324752807617, + "learning_rate": 4.8595705961869656e-05, + "loss": 0.1054, + "num_input_tokens_seen": 26016480, + "step": 12045 + }, + { + "epoch": 1.965742251223491, + "grad_norm": 0.7125768661499023, + "learning_rate": 4.859335296462152e-05, + "loss": 0.0422, + "num_input_tokens_seen": 26027072, + "step": 12050 + }, + { + "epoch": 1.966557911908646, + "grad_norm": 0.7342801094055176, + "learning_rate": 4.859099805477648e-05, + "loss": 0.117, + "num_input_tokens_seen": 26036096, + "step": 12055 + }, + { + "epoch": 1.967373572593801, + "grad_norm": 0.2170349359512329, + "learning_rate": 4.858864123252544e-05, + "loss": 0.1251, + "num_input_tokens_seen": 26047264, + "step": 12060 + }, + { + "epoch": 1.968189233278956, + "grad_norm": 0.22995510697364807, + "learning_rate": 4.8586282498059456e-05, + "loss": 0.0978, + "num_input_tokens_seen": 26058176, + "step": 12065 + }, + { + "epoch": 1.969004893964111, + "grad_norm": 0.3823881447315216, + "learning_rate": 4.8583921851569735e-05, + "loss": 0.0775, + "num_input_tokens_seen": 26069664, + "step": 12070 + }, + { + "epoch": 1.969820554649266, + "grad_norm": 0.12915287911891937, + "learning_rate": 4.8581559293247655e-05, + "loss": 0.1097, + "num_input_tokens_seen": 26080640, + "step": 12075 + }, + { + "epoch": 1.9706362153344208, + "grad_norm": 1.0469787120819092, + "learning_rate": 4.857919482328471e-05, + "loss": 0.0706, + "num_input_tokens_seen": 26091360, + "step": 12080 + }, + { + "epoch": 1.9714518760195758, + "grad_norm": 0.28623881936073303, + "learning_rate": 4.857682844187261e-05, + "loss": 0.0681, + "num_input_tokens_seen": 26102080, + "step": 12085 + }, + { + "epoch": 1.9722675367047309, + "grad_norm": 2.706929922103882, + "learning_rate": 4.857446014920316e-05, + "loss": 0.2622, + "num_input_tokens_seen": 26112192, + "step": 12090 + }, + { + "epoch": 1.9730831973898857, + "grad_norm": 2.286761999130249, + "learning_rate": 4.857208994546836e-05, + "loss": 0.2026, + "num_input_tokens_seen": 26122496, + "step": 12095 + }, + { + "epoch": 1.9738988580750407, + "grad_norm": 1.504563331604004, + "learning_rate": 4.856971783086034e-05, + "loss": 0.1021, + "num_input_tokens_seen": 26134048, + "step": 12100 + }, + { + "epoch": 1.9747145187601958, + "grad_norm": 0.19639308750629425, + "learning_rate": 4.85673438055714e-05, + "loss": 0.2021, + "num_input_tokens_seen": 26144192, + "step": 12105 + }, + { + "epoch": 1.9755301794453506, + "grad_norm": 0.3045138716697693, + "learning_rate": 4.856496786979399e-05, + "loss": 0.1193, + "num_input_tokens_seen": 26155328, + "step": 12110 + }, + { + "epoch": 1.9763458401305058, + "grad_norm": 0.1441960334777832, + "learning_rate": 4.8562590023720725e-05, + "loss": 0.2069, + "num_input_tokens_seen": 26166272, + "step": 12115 + }, + { + "epoch": 1.9771615008156607, + "grad_norm": 0.882371187210083, + "learning_rate": 4.8560210267544345e-05, + "loss": 0.1913, + "num_input_tokens_seen": 26175520, + "step": 12120 + }, + { + "epoch": 1.9779771615008157, + "grad_norm": 1.4831676483154297, + "learning_rate": 4.855782860145779e-05, + "loss": 0.3348, + "num_input_tokens_seen": 26187008, + "step": 12125 + }, + { + "epoch": 1.9787928221859707, + "grad_norm": 2.7548205852508545, + "learning_rate": 4.8555445025654116e-05, + "loss": 0.1483, + "num_input_tokens_seen": 26198016, + "step": 12130 + }, + { + "epoch": 1.9796084828711256, + "grad_norm": 0.08197253197431564, + "learning_rate": 4.855305954032655e-05, + "loss": 0.2079, + "num_input_tokens_seen": 26209440, + "step": 12135 + }, + { + "epoch": 1.9804241435562806, + "grad_norm": 0.4331893026828766, + "learning_rate": 4.855067214566846e-05, + "loss": 0.0919, + "num_input_tokens_seen": 26220736, + "step": 12140 + }, + { + "epoch": 1.9812398042414356, + "grad_norm": 1.6063554286956787, + "learning_rate": 4.85482828418734e-05, + "loss": 0.096, + "num_input_tokens_seen": 26231456, + "step": 12145 + }, + { + "epoch": 1.9820554649265905, + "grad_norm": 0.32582035660743713, + "learning_rate": 4.854589162913505e-05, + "loss": 0.0637, + "num_input_tokens_seen": 26241568, + "step": 12150 + }, + { + "epoch": 1.9828711256117455, + "grad_norm": 1.8624845743179321, + "learning_rate": 4.854349850764725e-05, + "loss": 0.2939, + "num_input_tokens_seen": 26252480, + "step": 12155 + }, + { + "epoch": 1.9836867862969005, + "grad_norm": 0.36018335819244385, + "learning_rate": 4.8541103477604e-05, + "loss": 0.0933, + "num_input_tokens_seen": 26263264, + "step": 12160 + }, + { + "epoch": 1.9845024469820554, + "grad_norm": 2.063706159591675, + "learning_rate": 4.853870653919946e-05, + "loss": 0.1345, + "num_input_tokens_seen": 26274496, + "step": 12165 + }, + { + "epoch": 1.9853181076672104, + "grad_norm": 0.5936631560325623, + "learning_rate": 4.853630769262794e-05, + "loss": 0.139, + "num_input_tokens_seen": 26285152, + "step": 12170 + }, + { + "epoch": 1.9861337683523654, + "grad_norm": 1.0229272842407227, + "learning_rate": 4.853390693808388e-05, + "loss": 0.2255, + "num_input_tokens_seen": 26296064, + "step": 12175 + }, + { + "epoch": 1.9869494290375203, + "grad_norm": 1.0816926956176758, + "learning_rate": 4.853150427576193e-05, + "loss": 0.1394, + "num_input_tokens_seen": 26305984, + "step": 12180 + }, + { + "epoch": 1.9877650897226755, + "grad_norm": 0.6024875640869141, + "learning_rate": 4.852909970585684e-05, + "loss": 0.1049, + "num_input_tokens_seen": 26316896, + "step": 12185 + }, + { + "epoch": 1.9885807504078303, + "grad_norm": 0.3424450159072876, + "learning_rate": 4.852669322856354e-05, + "loss": 0.1012, + "num_input_tokens_seen": 26326976, + "step": 12190 + }, + { + "epoch": 1.9893964110929854, + "grad_norm": 0.6647361516952515, + "learning_rate": 4.8524284844077116e-05, + "loss": 0.2253, + "num_input_tokens_seen": 26337504, + "step": 12195 + }, + { + "epoch": 1.9902120717781404, + "grad_norm": 1.5201817750930786, + "learning_rate": 4.8521874552592805e-05, + "loss": 0.176, + "num_input_tokens_seen": 26348608, + "step": 12200 + }, + { + "epoch": 1.9910277324632952, + "grad_norm": 2.8657894134521484, + "learning_rate": 4.851946235430599e-05, + "loss": 0.2444, + "num_input_tokens_seen": 26358240, + "step": 12205 + }, + { + "epoch": 1.9918433931484503, + "grad_norm": 0.10733562707901001, + "learning_rate": 4.851704824941222e-05, + "loss": 0.0874, + "num_input_tokens_seen": 26369568, + "step": 12210 + }, + { + "epoch": 1.9926590538336053, + "grad_norm": 0.9179078936576843, + "learning_rate": 4.8514632238107194e-05, + "loss": 0.1552, + "num_input_tokens_seen": 26379936, + "step": 12215 + }, + { + "epoch": 1.9934747145187601, + "grad_norm": 0.8692367076873779, + "learning_rate": 4.851221432058677e-05, + "loss": 0.1564, + "num_input_tokens_seen": 26390624, + "step": 12220 + }, + { + "epoch": 1.9942903752039152, + "grad_norm": 0.29343292117118835, + "learning_rate": 4.850979449704695e-05, + "loss": 0.1145, + "num_input_tokens_seen": 26399616, + "step": 12225 + }, + { + "epoch": 1.9951060358890702, + "grad_norm": 0.47389349341392517, + "learning_rate": 4.85073727676839e-05, + "loss": 0.118, + "num_input_tokens_seen": 26410208, + "step": 12230 + }, + { + "epoch": 1.995921696574225, + "grad_norm": 0.4006916284561157, + "learning_rate": 4.8504949132693936e-05, + "loss": 0.0966, + "num_input_tokens_seen": 26420160, + "step": 12235 + }, + { + "epoch": 1.99673735725938, + "grad_norm": 0.49663811922073364, + "learning_rate": 4.850252359227353e-05, + "loss": 0.0998, + "num_input_tokens_seen": 26430432, + "step": 12240 + }, + { + "epoch": 1.997553017944535, + "grad_norm": 0.49484026432037354, + "learning_rate": 4.8500096146619325e-05, + "loss": 0.0989, + "num_input_tokens_seen": 26440128, + "step": 12245 + }, + { + "epoch": 1.99836867862969, + "grad_norm": 1.0403273105621338, + "learning_rate": 4.849766679592808e-05, + "loss": 0.0815, + "num_input_tokens_seen": 26450496, + "step": 12250 + }, + { + "epoch": 1.9991843393148452, + "grad_norm": 1.3465441465377808, + "learning_rate": 4.849523554039673e-05, + "loss": 0.1152, + "num_input_tokens_seen": 26461824, + "step": 12255 + }, + { + "epoch": 2.0, + "grad_norm": 1.1869151592254639, + "learning_rate": 4.8492802380222393e-05, + "loss": 0.1594, + "num_input_tokens_seen": 26471216, + "step": 12260 + }, + { + "epoch": 2.0, + "eval_loss": 0.14601776003837585, + "eval_runtime": 132.8156, + "eval_samples_per_second": 20.517, + "eval_steps_per_second": 5.135, + "num_input_tokens_seen": 26471216, + "step": 12260 + }, + { + "epoch": 2.000815660685155, + "grad_norm": 1.3409366607666016, + "learning_rate": 4.849036731560228e-05, + "loss": 0.1488, + "num_input_tokens_seen": 26481904, + "step": 12265 + }, + { + "epoch": 2.00163132137031, + "grad_norm": 1.9051913022994995, + "learning_rate": 4.84879303467338e-05, + "loss": 0.2374, + "num_input_tokens_seen": 26491344, + "step": 12270 + }, + { + "epoch": 2.002446982055465, + "grad_norm": 0.2385687232017517, + "learning_rate": 4.8485491473814514e-05, + "loss": 0.0715, + "num_input_tokens_seen": 26502160, + "step": 12275 + }, + { + "epoch": 2.0032626427406197, + "grad_norm": 1.2004241943359375, + "learning_rate": 4.8483050697042135e-05, + "loss": 0.1027, + "num_input_tokens_seen": 26512816, + "step": 12280 + }, + { + "epoch": 2.004078303425775, + "grad_norm": 0.08586379885673523, + "learning_rate": 4.8480608016614504e-05, + "loss": 0.0319, + "num_input_tokens_seen": 26522640, + "step": 12285 + }, + { + "epoch": 2.00489396411093, + "grad_norm": 0.08238863199949265, + "learning_rate": 4.847816343272965e-05, + "loss": 0.2255, + "num_input_tokens_seen": 26533072, + "step": 12290 + }, + { + "epoch": 2.0057096247960846, + "grad_norm": 0.07972168177366257, + "learning_rate": 4.847571694558574e-05, + "loss": 0.1096, + "num_input_tokens_seen": 26543408, + "step": 12295 + }, + { + "epoch": 2.00652528548124, + "grad_norm": 0.15178793668746948, + "learning_rate": 4.84732685553811e-05, + "loss": 0.1704, + "num_input_tokens_seen": 26554320, + "step": 12300 + }, + { + "epoch": 2.0073409461663947, + "grad_norm": 0.17462073266506195, + "learning_rate": 4.847081826231421e-05, + "loss": 0.1323, + "num_input_tokens_seen": 26565008, + "step": 12305 + }, + { + "epoch": 2.00815660685155, + "grad_norm": 0.05253322422504425, + "learning_rate": 4.846836606658371e-05, + "loss": 0.0498, + "num_input_tokens_seen": 26576912, + "step": 12310 + }, + { + "epoch": 2.0089722675367048, + "grad_norm": 0.8922966718673706, + "learning_rate": 4.8465911968388364e-05, + "loss": 0.4385, + "num_input_tokens_seen": 26586800, + "step": 12315 + }, + { + "epoch": 2.0097879282218596, + "grad_norm": 0.502031147480011, + "learning_rate": 4.846345596792713e-05, + "loss": 0.1005, + "num_input_tokens_seen": 26598160, + "step": 12320 + }, + { + "epoch": 2.010603588907015, + "grad_norm": 1.2205913066864014, + "learning_rate": 4.846099806539911e-05, + "loss": 0.1888, + "num_input_tokens_seen": 26609136, + "step": 12325 + }, + { + "epoch": 2.0114192495921697, + "grad_norm": 0.2916317880153656, + "learning_rate": 4.845853826100355e-05, + "loss": 0.1068, + "num_input_tokens_seen": 26620752, + "step": 12330 + }, + { + "epoch": 2.0122349102773245, + "grad_norm": 1.8768889904022217, + "learning_rate": 4.845607655493984e-05, + "loss": 0.2504, + "num_input_tokens_seen": 26630256, + "step": 12335 + }, + { + "epoch": 2.0130505709624797, + "grad_norm": 1.4205341339111328, + "learning_rate": 4.8453612947407564e-05, + "loss": 0.2848, + "num_input_tokens_seen": 26641136, + "step": 12340 + }, + { + "epoch": 2.0138662316476346, + "grad_norm": 0.5545447468757629, + "learning_rate": 4.8451147438606416e-05, + "loss": 0.0809, + "num_input_tokens_seen": 26651824, + "step": 12345 + }, + { + "epoch": 2.0146818923327894, + "grad_norm": 1.3467308282852173, + "learning_rate": 4.844868002873626e-05, + "loss": 0.2855, + "num_input_tokens_seen": 26662064, + "step": 12350 + }, + { + "epoch": 2.0154975530179446, + "grad_norm": 0.4422735571861267, + "learning_rate": 4.844621071799712e-05, + "loss": 0.1316, + "num_input_tokens_seen": 26673104, + "step": 12355 + }, + { + "epoch": 2.0163132137030995, + "grad_norm": 1.3082959651947021, + "learning_rate": 4.844373950658918e-05, + "loss": 0.276, + "num_input_tokens_seen": 26682736, + "step": 12360 + }, + { + "epoch": 2.0171288743882543, + "grad_norm": 0.27886345982551575, + "learning_rate": 4.844126639471277e-05, + "loss": 0.1506, + "num_input_tokens_seen": 26694224, + "step": 12365 + }, + { + "epoch": 2.0179445350734095, + "grad_norm": 0.31053614616394043, + "learning_rate": 4.843879138256836e-05, + "loss": 0.0955, + "num_input_tokens_seen": 26704880, + "step": 12370 + }, + { + "epoch": 2.0187601957585644, + "grad_norm": 0.911743700504303, + "learning_rate": 4.843631447035659e-05, + "loss": 0.2319, + "num_input_tokens_seen": 26713648, + "step": 12375 + }, + { + "epoch": 2.0195758564437196, + "grad_norm": 0.057457875460386276, + "learning_rate": 4.843383565827826e-05, + "loss": 0.0756, + "num_input_tokens_seen": 26724560, + "step": 12380 + }, + { + "epoch": 2.0203915171288744, + "grad_norm": 1.05793035030365, + "learning_rate": 4.843135494653431e-05, + "loss": 0.1569, + "num_input_tokens_seen": 26735344, + "step": 12385 + }, + { + "epoch": 2.0212071778140293, + "grad_norm": 1.6903791427612305, + "learning_rate": 4.842887233532584e-05, + "loss": 0.1425, + "num_input_tokens_seen": 26746288, + "step": 12390 + }, + { + "epoch": 2.0220228384991845, + "grad_norm": 1.5788512229919434, + "learning_rate": 4.842638782485409e-05, + "loss": 0.1888, + "num_input_tokens_seen": 26757936, + "step": 12395 + }, + { + "epoch": 2.0228384991843393, + "grad_norm": 0.5548219680786133, + "learning_rate": 4.8423901415320486e-05, + "loss": 0.1444, + "num_input_tokens_seen": 26770256, + "step": 12400 + }, + { + "epoch": 2.023654159869494, + "grad_norm": 1.330373764038086, + "learning_rate": 4.8421413106926586e-05, + "loss": 0.0834, + "num_input_tokens_seen": 26780240, + "step": 12405 + }, + { + "epoch": 2.0244698205546494, + "grad_norm": 0.42746448516845703, + "learning_rate": 4.841892289987409e-05, + "loss": 0.1383, + "num_input_tokens_seen": 26791824, + "step": 12410 + }, + { + "epoch": 2.0252854812398042, + "grad_norm": 0.13744373619556427, + "learning_rate": 4.841643079436489e-05, + "loss": 0.1308, + "num_input_tokens_seen": 26802576, + "step": 12415 + }, + { + "epoch": 2.026101141924959, + "grad_norm": 0.26373177766799927, + "learning_rate": 4.841393679060099e-05, + "loss": 0.1577, + "num_input_tokens_seen": 26814256, + "step": 12420 + }, + { + "epoch": 2.0269168026101143, + "grad_norm": 1.4538960456848145, + "learning_rate": 4.841144088878457e-05, + "loss": 0.3266, + "num_input_tokens_seen": 26824528, + "step": 12425 + }, + { + "epoch": 2.027732463295269, + "grad_norm": 0.7568187117576599, + "learning_rate": 4.8408943089117964e-05, + "loss": 0.0759, + "num_input_tokens_seen": 26836816, + "step": 12430 + }, + { + "epoch": 2.028548123980424, + "grad_norm": 0.5350803136825562, + "learning_rate": 4.840644339180366e-05, + "loss": 0.1662, + "num_input_tokens_seen": 26846800, + "step": 12435 + }, + { + "epoch": 2.029363784665579, + "grad_norm": 0.5850176811218262, + "learning_rate": 4.8403941797044286e-05, + "loss": 0.1125, + "num_input_tokens_seen": 26858288, + "step": 12440 + }, + { + "epoch": 2.030179445350734, + "grad_norm": 0.5619671940803528, + "learning_rate": 4.840143830504264e-05, + "loss": 0.0991, + "num_input_tokens_seen": 26868624, + "step": 12445 + }, + { + "epoch": 2.0309951060358893, + "grad_norm": 0.3465394377708435, + "learning_rate": 4.839893291600167e-05, + "loss": 0.191, + "num_input_tokens_seen": 26879536, + "step": 12450 + }, + { + "epoch": 2.031810766721044, + "grad_norm": 0.9153656959533691, + "learning_rate": 4.839642563012447e-05, + "loss": 0.4034, + "num_input_tokens_seen": 26890320, + "step": 12455 + }, + { + "epoch": 2.032626427406199, + "grad_norm": 0.24927516281604767, + "learning_rate": 4.83939164476143e-05, + "loss": 0.1446, + "num_input_tokens_seen": 26900784, + "step": 12460 + }, + { + "epoch": 2.033442088091354, + "grad_norm": 0.2581087648868561, + "learning_rate": 4.839140536867456e-05, + "loss": 0.0857, + "num_input_tokens_seen": 26912944, + "step": 12465 + }, + { + "epoch": 2.034257748776509, + "grad_norm": 0.07100389897823334, + "learning_rate": 4.838889239350881e-05, + "loss": 0.1669, + "num_input_tokens_seen": 26923792, + "step": 12470 + }, + { + "epoch": 2.035073409461664, + "grad_norm": 0.30283212661743164, + "learning_rate": 4.838637752232078e-05, + "loss": 0.0698, + "num_input_tokens_seen": 26934736, + "step": 12475 + }, + { + "epoch": 2.035889070146819, + "grad_norm": 0.16381867229938507, + "learning_rate": 4.838386075531432e-05, + "loss": 0.1316, + "num_input_tokens_seen": 26945840, + "step": 12480 + }, + { + "epoch": 2.036704730831974, + "grad_norm": 0.10557590425014496, + "learning_rate": 4.8381342092693464e-05, + "loss": 0.0871, + "num_input_tokens_seen": 26956304, + "step": 12485 + }, + { + "epoch": 2.0375203915171287, + "grad_norm": 0.666679322719574, + "learning_rate": 4.837882153466237e-05, + "loss": 0.0813, + "num_input_tokens_seen": 26967632, + "step": 12490 + }, + { + "epoch": 2.038336052202284, + "grad_norm": 0.38799768686294556, + "learning_rate": 4.837629908142539e-05, + "loss": 0.1338, + "num_input_tokens_seen": 26978160, + "step": 12495 + }, + { + "epoch": 2.039151712887439, + "grad_norm": 0.7140949368476868, + "learning_rate": 4.837377473318699e-05, + "loss": 0.1888, + "num_input_tokens_seen": 26987568, + "step": 12500 + }, + { + "epoch": 2.0399673735725936, + "grad_norm": 0.23693375289440155, + "learning_rate": 4.837124849015182e-05, + "loss": 0.107, + "num_input_tokens_seen": 26999760, + "step": 12505 + }, + { + "epoch": 2.040783034257749, + "grad_norm": 0.9824971556663513, + "learning_rate": 4.8368720352524655e-05, + "loss": 0.0707, + "num_input_tokens_seen": 27011696, + "step": 12510 + }, + { + "epoch": 2.0415986949429037, + "grad_norm": 0.38591650128364563, + "learning_rate": 4.8366190320510454e-05, + "loss": 0.1149, + "num_input_tokens_seen": 27022608, + "step": 12515 + }, + { + "epoch": 2.0424143556280585, + "grad_norm": 0.5156813859939575, + "learning_rate": 4.83636583943143e-05, + "loss": 0.1516, + "num_input_tokens_seen": 27033680, + "step": 12520 + }, + { + "epoch": 2.0432300163132138, + "grad_norm": 0.9841905832290649, + "learning_rate": 4.8361124574141455e-05, + "loss": 0.0632, + "num_input_tokens_seen": 27045136, + "step": 12525 + }, + { + "epoch": 2.0440456769983686, + "grad_norm": 0.4595491886138916, + "learning_rate": 4.835858886019732e-05, + "loss": 0.1846, + "num_input_tokens_seen": 27055760, + "step": 12530 + }, + { + "epoch": 2.044861337683524, + "grad_norm": 0.2079661637544632, + "learning_rate": 4.835605125268745e-05, + "loss": 0.094, + "num_input_tokens_seen": 27066352, + "step": 12535 + }, + { + "epoch": 2.0456769983686787, + "grad_norm": 1.188047170639038, + "learning_rate": 4.835351175181755e-05, + "loss": 0.1367, + "num_input_tokens_seen": 27077872, + "step": 12540 + }, + { + "epoch": 2.0464926590538335, + "grad_norm": 0.8767662048339844, + "learning_rate": 4.83509703577935e-05, + "loss": 0.1057, + "num_input_tokens_seen": 27088912, + "step": 12545 + }, + { + "epoch": 2.0473083197389887, + "grad_norm": 0.053366128355264664, + "learning_rate": 4.834842707082131e-05, + "loss": 0.0336, + "num_input_tokens_seen": 27099216, + "step": 12550 + }, + { + "epoch": 2.0481239804241436, + "grad_norm": 0.24288392066955566, + "learning_rate": 4.834588189110716e-05, + "loss": 0.1678, + "num_input_tokens_seen": 27109712, + "step": 12555 + }, + { + "epoch": 2.0489396411092984, + "grad_norm": 0.564766526222229, + "learning_rate": 4.834333481885735e-05, + "loss": 0.064, + "num_input_tokens_seen": 27120304, + "step": 12560 + }, + { + "epoch": 2.0497553017944536, + "grad_norm": 0.19480040669441223, + "learning_rate": 4.8340785854278395e-05, + "loss": 0.1257, + "num_input_tokens_seen": 27130992, + "step": 12565 + }, + { + "epoch": 2.0505709624796085, + "grad_norm": 1.2062691450119019, + "learning_rate": 4.83382349975769e-05, + "loss": 0.1223, + "num_input_tokens_seen": 27141136, + "step": 12570 + }, + { + "epoch": 2.0513866231647633, + "grad_norm": 0.4995480477809906, + "learning_rate": 4.833568224895967e-05, + "loss": 0.1182, + "num_input_tokens_seen": 27152208, + "step": 12575 + }, + { + "epoch": 2.0522022838499185, + "grad_norm": 0.29473036527633667, + "learning_rate": 4.833312760863362e-05, + "loss": 0.1857, + "num_input_tokens_seen": 27164016, + "step": 12580 + }, + { + "epoch": 2.0530179445350734, + "grad_norm": 0.8190780282020569, + "learning_rate": 4.833057107680586e-05, + "loss": 0.1273, + "num_input_tokens_seen": 27174800, + "step": 12585 + }, + { + "epoch": 2.053833605220228, + "grad_norm": 0.8921218514442444, + "learning_rate": 4.832801265368363e-05, + "loss": 0.1619, + "num_input_tokens_seen": 27184784, + "step": 12590 + }, + { + "epoch": 2.0546492659053834, + "grad_norm": 0.37578675150871277, + "learning_rate": 4.832545233947433e-05, + "loss": 0.0693, + "num_input_tokens_seen": 27195600, + "step": 12595 + }, + { + "epoch": 2.0554649265905383, + "grad_norm": 0.28351399302482605, + "learning_rate": 4.832289013438551e-05, + "loss": 0.1214, + "num_input_tokens_seen": 27206416, + "step": 12600 + }, + { + "epoch": 2.0562805872756935, + "grad_norm": 0.6984574198722839, + "learning_rate": 4.8320326038624875e-05, + "loss": 0.1536, + "num_input_tokens_seen": 27217360, + "step": 12605 + }, + { + "epoch": 2.0570962479608483, + "grad_norm": 0.5631441473960876, + "learning_rate": 4.831776005240029e-05, + "loss": 0.2002, + "num_input_tokens_seen": 27228304, + "step": 12610 + }, + { + "epoch": 2.057911908646003, + "grad_norm": 0.2803304195404053, + "learning_rate": 4.831519217591976e-05, + "loss": 0.1585, + "num_input_tokens_seen": 27240208, + "step": 12615 + }, + { + "epoch": 2.0587275693311584, + "grad_norm": 0.2613511085510254, + "learning_rate": 4.831262240939144e-05, + "loss": 0.0648, + "num_input_tokens_seen": 27250256, + "step": 12620 + }, + { + "epoch": 2.0595432300163132, + "grad_norm": 0.9238356351852417, + "learning_rate": 4.8310050753023674e-05, + "loss": 0.1192, + "num_input_tokens_seen": 27260080, + "step": 12625 + }, + { + "epoch": 2.060358890701468, + "grad_norm": 0.49744144082069397, + "learning_rate": 4.8307477207024923e-05, + "loss": 0.0562, + "num_input_tokens_seen": 27272112, + "step": 12630 + }, + { + "epoch": 2.0611745513866233, + "grad_norm": 0.32926130294799805, + "learning_rate": 4.83049017716038e-05, + "loss": 0.151, + "num_input_tokens_seen": 27282672, + "step": 12635 + }, + { + "epoch": 2.061990212071778, + "grad_norm": 0.5016634464263916, + "learning_rate": 4.8302324446969094e-05, + "loss": 0.062, + "num_input_tokens_seen": 27294704, + "step": 12640 + }, + { + "epoch": 2.062805872756933, + "grad_norm": 0.3072339594364166, + "learning_rate": 4.829974523332973e-05, + "loss": 0.0735, + "num_input_tokens_seen": 27306320, + "step": 12645 + }, + { + "epoch": 2.063621533442088, + "grad_norm": 0.3885962665081024, + "learning_rate": 4.8297164130894804e-05, + "loss": 0.1364, + "num_input_tokens_seen": 27316720, + "step": 12650 + }, + { + "epoch": 2.064437194127243, + "grad_norm": 0.21602122485637665, + "learning_rate": 4.8294581139873544e-05, + "loss": 0.0252, + "num_input_tokens_seen": 27327408, + "step": 12655 + }, + { + "epoch": 2.065252854812398, + "grad_norm": 1.0989031791687012, + "learning_rate": 4.829199626047534e-05, + "loss": 0.1565, + "num_input_tokens_seen": 27338672, + "step": 12660 + }, + { + "epoch": 2.066068515497553, + "grad_norm": 1.1340826749801636, + "learning_rate": 4.8289409492909726e-05, + "loss": 0.2788, + "num_input_tokens_seen": 27350352, + "step": 12665 + }, + { + "epoch": 2.066884176182708, + "grad_norm": 0.12903577089309692, + "learning_rate": 4.8286820837386416e-05, + "loss": 0.1127, + "num_input_tokens_seen": 27360432, + "step": 12670 + }, + { + "epoch": 2.067699836867863, + "grad_norm": 0.7638895511627197, + "learning_rate": 4.828423029411526e-05, + "loss": 0.2098, + "num_input_tokens_seen": 27369840, + "step": 12675 + }, + { + "epoch": 2.068515497553018, + "grad_norm": 0.2819768190383911, + "learning_rate": 4.828163786330624e-05, + "loss": 0.0449, + "num_input_tokens_seen": 27380016, + "step": 12680 + }, + { + "epoch": 2.069331158238173, + "grad_norm": 0.7492182850837708, + "learning_rate": 4.8279043545169535e-05, + "loss": 0.0993, + "num_input_tokens_seen": 27391504, + "step": 12685 + }, + { + "epoch": 2.070146818923328, + "grad_norm": 0.187462717294693, + "learning_rate": 4.8276447339915446e-05, + "loss": 0.0393, + "num_input_tokens_seen": 27402064, + "step": 12690 + }, + { + "epoch": 2.070962479608483, + "grad_norm": 0.09270130097866058, + "learning_rate": 4.827384924775442e-05, + "loss": 0.0835, + "num_input_tokens_seen": 27413808, + "step": 12695 + }, + { + "epoch": 2.0717781402936377, + "grad_norm": 0.46471574902534485, + "learning_rate": 4.8271249268897094e-05, + "loss": 0.2293, + "num_input_tokens_seen": 27425520, + "step": 12700 + }, + { + "epoch": 2.072593800978793, + "grad_norm": 2.0366370677948, + "learning_rate": 4.826864740355422e-05, + "loss": 0.2342, + "num_input_tokens_seen": 27436624, + "step": 12705 + }, + { + "epoch": 2.073409461663948, + "grad_norm": 0.3269873261451721, + "learning_rate": 4.826604365193673e-05, + "loss": 0.1591, + "num_input_tokens_seen": 27448176, + "step": 12710 + }, + { + "epoch": 2.0742251223491026, + "grad_norm": 0.47529107332229614, + "learning_rate": 4.8263438014255687e-05, + "loss": 0.1148, + "num_input_tokens_seen": 27459568, + "step": 12715 + }, + { + "epoch": 2.075040783034258, + "grad_norm": 2.134490966796875, + "learning_rate": 4.8260830490722317e-05, + "loss": 0.1914, + "num_input_tokens_seen": 27470800, + "step": 12720 + }, + { + "epoch": 2.0758564437194127, + "grad_norm": 1.0811941623687744, + "learning_rate": 4.8258221081548004e-05, + "loss": 0.1051, + "num_input_tokens_seen": 27481168, + "step": 12725 + }, + { + "epoch": 2.0766721044045675, + "grad_norm": 0.5238149762153625, + "learning_rate": 4.825560978694429e-05, + "loss": 0.1445, + "num_input_tokens_seen": 27492528, + "step": 12730 + }, + { + "epoch": 2.0774877650897228, + "grad_norm": 0.2232326716184616, + "learning_rate": 4.8252996607122835e-05, + "loss": 0.1407, + "num_input_tokens_seen": 27503152, + "step": 12735 + }, + { + "epoch": 2.0783034257748776, + "grad_norm": 0.6695789098739624, + "learning_rate": 4.82503815422955e-05, + "loss": 0.1231, + "num_input_tokens_seen": 27513904, + "step": 12740 + }, + { + "epoch": 2.0791190864600324, + "grad_norm": 1.1702998876571655, + "learning_rate": 4.824776459267426e-05, + "loss": 0.2385, + "num_input_tokens_seen": 27525648, + "step": 12745 + }, + { + "epoch": 2.0799347471451877, + "grad_norm": 1.2954697608947754, + "learning_rate": 4.824514575847127e-05, + "loss": 0.1491, + "num_input_tokens_seen": 27537264, + "step": 12750 + }, + { + "epoch": 2.0807504078303425, + "grad_norm": 0.5932684540748596, + "learning_rate": 4.824252503989881e-05, + "loss": 0.0885, + "num_input_tokens_seen": 27548144, + "step": 12755 + }, + { + "epoch": 2.0815660685154977, + "grad_norm": 0.21580569446086884, + "learning_rate": 4.823990243716935e-05, + "loss": 0.0445, + "num_input_tokens_seen": 27559568, + "step": 12760 + }, + { + "epoch": 2.0823817292006526, + "grad_norm": 1.9167141914367676, + "learning_rate": 4.823727795049548e-05, + "loss": 0.2163, + "num_input_tokens_seen": 27570544, + "step": 12765 + }, + { + "epoch": 2.0831973898858074, + "grad_norm": 1.0613116025924683, + "learning_rate": 4.8234651580089945e-05, + "loss": 0.2039, + "num_input_tokens_seen": 27581616, + "step": 12770 + }, + { + "epoch": 2.0840130505709626, + "grad_norm": 0.0771641731262207, + "learning_rate": 4.823202332616567e-05, + "loss": 0.1966, + "num_input_tokens_seen": 27592112, + "step": 12775 + }, + { + "epoch": 2.0848287112561175, + "grad_norm": 1.1727176904678345, + "learning_rate": 4.8229393188935703e-05, + "loss": 0.1984, + "num_input_tokens_seen": 27603056, + "step": 12780 + }, + { + "epoch": 2.0856443719412723, + "grad_norm": 0.7516143321990967, + "learning_rate": 4.8226761168613255e-05, + "loss": 0.2328, + "num_input_tokens_seen": 27613616, + "step": 12785 + }, + { + "epoch": 2.0864600326264275, + "grad_norm": 0.5738093852996826, + "learning_rate": 4.82241272654117e-05, + "loss": 0.1813, + "num_input_tokens_seen": 27624784, + "step": 12790 + }, + { + "epoch": 2.0872756933115824, + "grad_norm": 0.5610936880111694, + "learning_rate": 4.822149147954455e-05, + "loss": 0.0927, + "num_input_tokens_seen": 27636048, + "step": 12795 + }, + { + "epoch": 2.088091353996737, + "grad_norm": 0.8456146717071533, + "learning_rate": 4.8218853811225475e-05, + "loss": 0.1439, + "num_input_tokens_seen": 27645328, + "step": 12800 + }, + { + "epoch": 2.0889070146818924, + "grad_norm": 0.10529472678899765, + "learning_rate": 4.8216214260668304e-05, + "loss": 0.1259, + "num_input_tokens_seen": 27655024, + "step": 12805 + }, + { + "epoch": 2.0897226753670473, + "grad_norm": 1.9029359817504883, + "learning_rate": 4.8213572828087e-05, + "loss": 0.2071, + "num_input_tokens_seen": 27666224, + "step": 12810 + }, + { + "epoch": 2.090538336052202, + "grad_norm": 0.09167110919952393, + "learning_rate": 4.82109295136957e-05, + "loss": 0.0321, + "num_input_tokens_seen": 27677904, + "step": 12815 + }, + { + "epoch": 2.0913539967373573, + "grad_norm": 1.1922001838684082, + "learning_rate": 4.820828431770868e-05, + "loss": 0.1603, + "num_input_tokens_seen": 27687088, + "step": 12820 + }, + { + "epoch": 2.092169657422512, + "grad_norm": 1.5374292135238647, + "learning_rate": 4.820563724034039e-05, + "loss": 0.1899, + "num_input_tokens_seen": 27697008, + "step": 12825 + }, + { + "epoch": 2.0929853181076674, + "grad_norm": 1.5661110877990723, + "learning_rate": 4.820298828180538e-05, + "loss": 0.0784, + "num_input_tokens_seen": 27708208, + "step": 12830 + }, + { + "epoch": 2.0938009787928222, + "grad_norm": 0.21569599211215973, + "learning_rate": 4.8200337442318424e-05, + "loss": 0.2801, + "num_input_tokens_seen": 27718704, + "step": 12835 + }, + { + "epoch": 2.094616639477977, + "grad_norm": 0.14340727031230927, + "learning_rate": 4.819768472209439e-05, + "loss": 0.1436, + "num_input_tokens_seen": 27728816, + "step": 12840 + }, + { + "epoch": 2.0954323001631323, + "grad_norm": 0.5164428949356079, + "learning_rate": 4.8195030121348336e-05, + "loss": 0.0975, + "num_input_tokens_seen": 27738992, + "step": 12845 + }, + { + "epoch": 2.096247960848287, + "grad_norm": 0.38452836871147156, + "learning_rate": 4.819237364029544e-05, + "loss": 0.1547, + "num_input_tokens_seen": 27750384, + "step": 12850 + }, + { + "epoch": 2.097063621533442, + "grad_norm": 0.9049424529075623, + "learning_rate": 4.818971527915107e-05, + "loss": 0.1284, + "num_input_tokens_seen": 27760784, + "step": 12855 + }, + { + "epoch": 2.097879282218597, + "grad_norm": 0.21945752203464508, + "learning_rate": 4.818705503813071e-05, + "loss": 0.0746, + "num_input_tokens_seen": 27771952, + "step": 12860 + }, + { + "epoch": 2.098694942903752, + "grad_norm": 0.35159236192703247, + "learning_rate": 4.818439291745002e-05, + "loss": 0.1581, + "num_input_tokens_seen": 27781744, + "step": 12865 + }, + { + "epoch": 2.099510603588907, + "grad_norm": 0.11637181043624878, + "learning_rate": 4.81817289173248e-05, + "loss": 0.0648, + "num_input_tokens_seen": 27792944, + "step": 12870 + }, + { + "epoch": 2.100326264274062, + "grad_norm": 0.5314282774925232, + "learning_rate": 4.8179063037971016e-05, + "loss": 0.1088, + "num_input_tokens_seen": 27804048, + "step": 12875 + }, + { + "epoch": 2.101141924959217, + "grad_norm": 0.7320392727851868, + "learning_rate": 4.817639527960477e-05, + "loss": 0.0872, + "num_input_tokens_seen": 27814960, + "step": 12880 + }, + { + "epoch": 2.1019575856443717, + "grad_norm": 0.20118214190006256, + "learning_rate": 4.817372564244233e-05, + "loss": 0.096, + "num_input_tokens_seen": 27827280, + "step": 12885 + }, + { + "epoch": 2.102773246329527, + "grad_norm": 0.4085347056388855, + "learning_rate": 4.817105412670011e-05, + "loss": 0.0789, + "num_input_tokens_seen": 27837520, + "step": 12890 + }, + { + "epoch": 2.103588907014682, + "grad_norm": 0.8810507655143738, + "learning_rate": 4.8168380732594666e-05, + "loss": 0.0662, + "num_input_tokens_seen": 27848912, + "step": 12895 + }, + { + "epoch": 2.104404567699837, + "grad_norm": 0.6321456432342529, + "learning_rate": 4.816570546034273e-05, + "loss": 0.1006, + "num_input_tokens_seen": 27859504, + "step": 12900 + }, + { + "epoch": 2.105220228384992, + "grad_norm": 0.5121043920516968, + "learning_rate": 4.816302831016116e-05, + "loss": 0.0758, + "num_input_tokens_seen": 27870736, + "step": 12905 + }, + { + "epoch": 2.1060358890701467, + "grad_norm": 1.1151807308197021, + "learning_rate": 4.8160349282266995e-05, + "loss": 0.1855, + "num_input_tokens_seen": 27880720, + "step": 12910 + }, + { + "epoch": 2.106851549755302, + "grad_norm": 1.4390844106674194, + "learning_rate": 4.81576683768774e-05, + "loss": 0.2797, + "num_input_tokens_seen": 27892048, + "step": 12915 + }, + { + "epoch": 2.107667210440457, + "grad_norm": 0.5143570303916931, + "learning_rate": 4.81549855942097e-05, + "loss": 0.0647, + "num_input_tokens_seen": 27903760, + "step": 12920 + }, + { + "epoch": 2.1084828711256116, + "grad_norm": 0.08628338575363159, + "learning_rate": 4.8152300934481384e-05, + "loss": 0.1181, + "num_input_tokens_seen": 27914192, + "step": 12925 + }, + { + "epoch": 2.109298531810767, + "grad_norm": 0.23242877423763275, + "learning_rate": 4.8149614397910094e-05, + "loss": 0.0236, + "num_input_tokens_seen": 27925936, + "step": 12930 + }, + { + "epoch": 2.1101141924959217, + "grad_norm": 0.5154436230659485, + "learning_rate": 4.8146925984713585e-05, + "loss": 0.3177, + "num_input_tokens_seen": 27937392, + "step": 12935 + }, + { + "epoch": 2.1109298531810765, + "grad_norm": 0.3259274959564209, + "learning_rate": 4.814423569510981e-05, + "loss": 0.1004, + "num_input_tokens_seen": 27948304, + "step": 12940 + }, + { + "epoch": 2.1117455138662318, + "grad_norm": 1.2795716524124146, + "learning_rate": 4.814154352931687e-05, + "loss": 0.0874, + "num_input_tokens_seen": 27959344, + "step": 12945 + }, + { + "epoch": 2.1125611745513866, + "grad_norm": 1.5269575119018555, + "learning_rate": 4.813884948755298e-05, + "loss": 0.3456, + "num_input_tokens_seen": 27970032, + "step": 12950 + }, + { + "epoch": 2.1133768352365414, + "grad_norm": 0.5126486420631409, + "learning_rate": 4.8136153570036544e-05, + "loss": 0.0754, + "num_input_tokens_seen": 27979984, + "step": 12955 + }, + { + "epoch": 2.1141924959216967, + "grad_norm": 0.08545016497373581, + "learning_rate": 4.8133455776986114e-05, + "loss": 0.1363, + "num_input_tokens_seen": 27989200, + "step": 12960 + }, + { + "epoch": 2.1150081566068515, + "grad_norm": 0.2537959814071655, + "learning_rate": 4.813075610862038e-05, + "loss": 0.1314, + "num_input_tokens_seen": 28000432, + "step": 12965 + }, + { + "epoch": 2.1158238172920063, + "grad_norm": 0.35370925068855286, + "learning_rate": 4.8128054565158196e-05, + "loss": 0.0778, + "num_input_tokens_seen": 28010064, + "step": 12970 + }, + { + "epoch": 2.1166394779771616, + "grad_norm": 1.472081184387207, + "learning_rate": 4.8125351146818556e-05, + "loss": 0.1957, + "num_input_tokens_seen": 28021776, + "step": 12975 + }, + { + "epoch": 2.1174551386623164, + "grad_norm": 0.3381919264793396, + "learning_rate": 4.8122645853820604e-05, + "loss": 0.2146, + "num_input_tokens_seen": 28032656, + "step": 12980 + }, + { + "epoch": 2.1182707993474716, + "grad_norm": 0.04557975381612778, + "learning_rate": 4.811993868638367e-05, + "loss": 0.0451, + "num_input_tokens_seen": 28043728, + "step": 12985 + }, + { + "epoch": 2.1190864600326265, + "grad_norm": 0.235945463180542, + "learning_rate": 4.811722964472719e-05, + "loss": 0.2021, + "num_input_tokens_seen": 28055120, + "step": 12990 + }, + { + "epoch": 2.1199021207177813, + "grad_norm": 1.984135627746582, + "learning_rate": 4.811451872907078e-05, + "loss": 0.1774, + "num_input_tokens_seen": 28065936, + "step": 12995 + }, + { + "epoch": 2.1207177814029365, + "grad_norm": 2.077986478805542, + "learning_rate": 4.8111805939634204e-05, + "loss": 0.1503, + "num_input_tokens_seen": 28077040, + "step": 13000 + }, + { + "epoch": 2.1215334420880914, + "grad_norm": 0.9240676760673523, + "learning_rate": 4.810909127663736e-05, + "loss": 0.1062, + "num_input_tokens_seen": 28088624, + "step": 13005 + }, + { + "epoch": 2.122349102773246, + "grad_norm": 1.3506577014923096, + "learning_rate": 4.810637474030033e-05, + "loss": 0.197, + "num_input_tokens_seen": 28097520, + "step": 13010 + }, + { + "epoch": 2.1231647634584014, + "grad_norm": 0.4019344449043274, + "learning_rate": 4.810365633084333e-05, + "loss": 0.2969, + "num_input_tokens_seen": 28107504, + "step": 13015 + }, + { + "epoch": 2.1239804241435563, + "grad_norm": 0.7906027436256409, + "learning_rate": 4.810093604848671e-05, + "loss": 0.1751, + "num_input_tokens_seen": 28118352, + "step": 13020 + }, + { + "epoch": 2.124796084828711, + "grad_norm": 0.9880836009979248, + "learning_rate": 4.8098213893451005e-05, + "loss": 0.132, + "num_input_tokens_seen": 28129936, + "step": 13025 + }, + { + "epoch": 2.1256117455138663, + "grad_norm": 1.0452935695648193, + "learning_rate": 4.809548986595688e-05, + "loss": 0.1542, + "num_input_tokens_seen": 28140496, + "step": 13030 + }, + { + "epoch": 2.126427406199021, + "grad_norm": 0.5881481170654297, + "learning_rate": 4.809276396622516e-05, + "loss": 0.0849, + "num_input_tokens_seen": 28151440, + "step": 13035 + }, + { + "epoch": 2.1272430668841764, + "grad_norm": 0.5999038815498352, + "learning_rate": 4.809003619447683e-05, + "loss": 0.1234, + "num_input_tokens_seen": 28161200, + "step": 13040 + }, + { + "epoch": 2.1280587275693312, + "grad_norm": 1.0349979400634766, + "learning_rate": 4.8087306550932996e-05, + "loss": 0.0693, + "num_input_tokens_seen": 28171568, + "step": 13045 + }, + { + "epoch": 2.128874388254486, + "grad_norm": 0.6646943092346191, + "learning_rate": 4.808457503581496e-05, + "loss": 0.1435, + "num_input_tokens_seen": 28183344, + "step": 13050 + }, + { + "epoch": 2.1296900489396413, + "grad_norm": 0.23816418647766113, + "learning_rate": 4.808184164934414e-05, + "loss": 0.0554, + "num_input_tokens_seen": 28193552, + "step": 13055 + }, + { + "epoch": 2.130505709624796, + "grad_norm": 0.29680532217025757, + "learning_rate": 4.8079106391742115e-05, + "loss": 0.1796, + "num_input_tokens_seen": 28203600, + "step": 13060 + }, + { + "epoch": 2.131321370309951, + "grad_norm": 0.4732407331466675, + "learning_rate": 4.807636926323063e-05, + "loss": 0.3656, + "num_input_tokens_seen": 28213584, + "step": 13065 + }, + { + "epoch": 2.132137030995106, + "grad_norm": 0.14734923839569092, + "learning_rate": 4.8073630264031556e-05, + "loss": 0.0848, + "num_input_tokens_seen": 28224016, + "step": 13070 + }, + { + "epoch": 2.132952691680261, + "grad_norm": 0.29372870922088623, + "learning_rate": 4.807088939436695e-05, + "loss": 0.1949, + "num_input_tokens_seen": 28235760, + "step": 13075 + }, + { + "epoch": 2.133768352365416, + "grad_norm": 1.180466651916504, + "learning_rate": 4.806814665445898e-05, + "loss": 0.1623, + "num_input_tokens_seen": 28244656, + "step": 13080 + }, + { + "epoch": 2.134584013050571, + "grad_norm": 0.41686689853668213, + "learning_rate": 4.8065402044529994e-05, + "loss": 0.0497, + "num_input_tokens_seen": 28254096, + "step": 13085 + }, + { + "epoch": 2.135399673735726, + "grad_norm": 0.667001485824585, + "learning_rate": 4.806265556480249e-05, + "loss": 0.1042, + "num_input_tokens_seen": 28264880, + "step": 13090 + }, + { + "epoch": 2.1362153344208807, + "grad_norm": 0.8959311842918396, + "learning_rate": 4.80599072154991e-05, + "loss": 0.1696, + "num_input_tokens_seen": 28274448, + "step": 13095 + }, + { + "epoch": 2.137030995106036, + "grad_norm": 0.5145247578620911, + "learning_rate": 4.805715699684264e-05, + "loss": 0.0733, + "num_input_tokens_seen": 28285776, + "step": 13100 + }, + { + "epoch": 2.137846655791191, + "grad_norm": 1.338154673576355, + "learning_rate": 4.8054404909056036e-05, + "loss": 0.083, + "num_input_tokens_seen": 28296016, + "step": 13105 + }, + { + "epoch": 2.1386623164763456, + "grad_norm": 1.5141409635543823, + "learning_rate": 4.805165095236239e-05, + "loss": 0.2283, + "num_input_tokens_seen": 28307952, + "step": 13110 + }, + { + "epoch": 2.139477977161501, + "grad_norm": 1.474342942237854, + "learning_rate": 4.804889512698496e-05, + "loss": 0.1739, + "num_input_tokens_seen": 28318960, + "step": 13115 + }, + { + "epoch": 2.1402936378466557, + "grad_norm": 0.1864023506641388, + "learning_rate": 4.804613743314714e-05, + "loss": 0.0254, + "num_input_tokens_seen": 28329136, + "step": 13120 + }, + { + "epoch": 2.141109298531811, + "grad_norm": 0.14857469499111176, + "learning_rate": 4.804337787107248e-05, + "loss": 0.0492, + "num_input_tokens_seen": 28339696, + "step": 13125 + }, + { + "epoch": 2.141924959216966, + "grad_norm": 0.626284122467041, + "learning_rate": 4.80406164409847e-05, + "loss": 0.1838, + "num_input_tokens_seen": 28351440, + "step": 13130 + }, + { + "epoch": 2.1427406199021206, + "grad_norm": 0.31992587447166443, + "learning_rate": 4.8037853143107634e-05, + "loss": 0.1963, + "num_input_tokens_seen": 28361232, + "step": 13135 + }, + { + "epoch": 2.143556280587276, + "grad_norm": 0.15971119701862335, + "learning_rate": 4.8035087977665304e-05, + "loss": 0.3099, + "num_input_tokens_seen": 28370864, + "step": 13140 + }, + { + "epoch": 2.1443719412724307, + "grad_norm": 0.763587474822998, + "learning_rate": 4.803232094488186e-05, + "loss": 0.063, + "num_input_tokens_seen": 28381456, + "step": 13145 + }, + { + "epoch": 2.1451876019575855, + "grad_norm": 0.14533881843090057, + "learning_rate": 4.802955204498162e-05, + "loss": 0.0667, + "num_input_tokens_seen": 28391600, + "step": 13150 + }, + { + "epoch": 2.1460032626427408, + "grad_norm": 0.8553251624107361, + "learning_rate": 4.802678127818904e-05, + "loss": 0.176, + "num_input_tokens_seen": 28402640, + "step": 13155 + }, + { + "epoch": 2.1468189233278956, + "grad_norm": 0.3594006896018982, + "learning_rate": 4.802400864472873e-05, + "loss": 0.2611, + "num_input_tokens_seen": 28412272, + "step": 13160 + }, + { + "epoch": 2.1476345840130504, + "grad_norm": 0.6907150149345398, + "learning_rate": 4.8021234144825456e-05, + "loss": 0.1914, + "num_input_tokens_seen": 28423312, + "step": 13165 + }, + { + "epoch": 2.1484502446982057, + "grad_norm": 0.9241585731506348, + "learning_rate": 4.801845777870414e-05, + "loss": 0.2672, + "num_input_tokens_seen": 28433424, + "step": 13170 + }, + { + "epoch": 2.1492659053833605, + "grad_norm": 1.8309253454208374, + "learning_rate": 4.801567954658984e-05, + "loss": 0.2254, + "num_input_tokens_seen": 28443984, + "step": 13175 + }, + { + "epoch": 2.1500815660685153, + "grad_norm": 0.6105179786682129, + "learning_rate": 4.801289944870777e-05, + "loss": 0.1658, + "num_input_tokens_seen": 28454832, + "step": 13180 + }, + { + "epoch": 2.1508972267536706, + "grad_norm": 0.1066540777683258, + "learning_rate": 4.8010117485283305e-05, + "loss": 0.1523, + "num_input_tokens_seen": 28467280, + "step": 13185 + }, + { + "epoch": 2.1517128874388254, + "grad_norm": 0.6311045289039612, + "learning_rate": 4.800733365654197e-05, + "loss": 0.2282, + "num_input_tokens_seen": 28478128, + "step": 13190 + }, + { + "epoch": 2.15252854812398, + "grad_norm": 0.9728257656097412, + "learning_rate": 4.8004547962709424e-05, + "loss": 0.0996, + "num_input_tokens_seen": 28489200, + "step": 13195 + }, + { + "epoch": 2.1533442088091355, + "grad_norm": 0.23620663583278656, + "learning_rate": 4.80017604040115e-05, + "loss": 0.0391, + "num_input_tokens_seen": 28500240, + "step": 13200 + }, + { + "epoch": 2.1541598694942903, + "grad_norm": 1.053244948387146, + "learning_rate": 4.799897098067417e-05, + "loss": 0.1141, + "num_input_tokens_seen": 28511824, + "step": 13205 + }, + { + "epoch": 2.1549755301794455, + "grad_norm": 0.11474776268005371, + "learning_rate": 4.799617969292355e-05, + "loss": 0.1419, + "num_input_tokens_seen": 28521456, + "step": 13210 + }, + { + "epoch": 2.1557911908646004, + "grad_norm": 0.6445015668869019, + "learning_rate": 4.799338654098593e-05, + "loss": 0.13, + "num_input_tokens_seen": 28531888, + "step": 13215 + }, + { + "epoch": 2.156606851549755, + "grad_norm": 0.06628404557704926, + "learning_rate": 4.799059152508773e-05, + "loss": 0.0783, + "num_input_tokens_seen": 28541616, + "step": 13220 + }, + { + "epoch": 2.1574225122349104, + "grad_norm": 0.8802301287651062, + "learning_rate": 4.798779464545552e-05, + "loss": 0.1669, + "num_input_tokens_seen": 28553776, + "step": 13225 + }, + { + "epoch": 2.1582381729200653, + "grad_norm": 1.7193315029144287, + "learning_rate": 4.7984995902316045e-05, + "loss": 0.1264, + "num_input_tokens_seen": 28565680, + "step": 13230 + }, + { + "epoch": 2.15905383360522, + "grad_norm": 1.3896609544754028, + "learning_rate": 4.798219529589618e-05, + "loss": 0.2182, + "num_input_tokens_seen": 28576560, + "step": 13235 + }, + { + "epoch": 2.1598694942903753, + "grad_norm": 0.40329039096832275, + "learning_rate": 4.797939282642294e-05, + "loss": 0.3333, + "num_input_tokens_seen": 28586352, + "step": 13240 + }, + { + "epoch": 2.16068515497553, + "grad_norm": 0.7308264970779419, + "learning_rate": 4.797658849412353e-05, + "loss": 0.2526, + "num_input_tokens_seen": 28598416, + "step": 13245 + }, + { + "epoch": 2.161500815660685, + "grad_norm": 0.11722517758607864, + "learning_rate": 4.797378229922528e-05, + "loss": 0.1273, + "num_input_tokens_seen": 28609424, + "step": 13250 + }, + { + "epoch": 2.1623164763458402, + "grad_norm": 0.6328141689300537, + "learning_rate": 4.797097424195566e-05, + "loss": 0.0816, + "num_input_tokens_seen": 28619184, + "step": 13255 + }, + { + "epoch": 2.163132137030995, + "grad_norm": 0.46326833963394165, + "learning_rate": 4.796816432254232e-05, + "loss": 0.1612, + "num_input_tokens_seen": 28629232, + "step": 13260 + }, + { + "epoch": 2.1639477977161503, + "grad_norm": 0.6138639450073242, + "learning_rate": 4.796535254121304e-05, + "loss": 0.0655, + "num_input_tokens_seen": 28639408, + "step": 13265 + }, + { + "epoch": 2.164763458401305, + "grad_norm": 0.2836679518222809, + "learning_rate": 4.7962538898195754e-05, + "loss": 0.1313, + "num_input_tokens_seen": 28650736, + "step": 13270 + }, + { + "epoch": 2.16557911908646, + "grad_norm": 1.6075170040130615, + "learning_rate": 4.7959723393718556e-05, + "loss": 0.1367, + "num_input_tokens_seen": 28660784, + "step": 13275 + }, + { + "epoch": 2.166394779771615, + "grad_norm": 1.1576753854751587, + "learning_rate": 4.7956906028009683e-05, + "loss": 0.1081, + "num_input_tokens_seen": 28671248, + "step": 13280 + }, + { + "epoch": 2.16721044045677, + "grad_norm": 0.6806620955467224, + "learning_rate": 4.795408680129753e-05, + "loss": 0.1335, + "num_input_tokens_seen": 28682000, + "step": 13285 + }, + { + "epoch": 2.168026101141925, + "grad_norm": 0.4646163582801819, + "learning_rate": 4.795126571381062e-05, + "loss": 0.2369, + "num_input_tokens_seen": 28692912, + "step": 13290 + }, + { + "epoch": 2.16884176182708, + "grad_norm": 2.3161160945892334, + "learning_rate": 4.794844276577767e-05, + "loss": 0.2329, + "num_input_tokens_seen": 28703504, + "step": 13295 + }, + { + "epoch": 2.169657422512235, + "grad_norm": 0.15537142753601074, + "learning_rate": 4.794561795742751e-05, + "loss": 0.0576, + "num_input_tokens_seen": 28714416, + "step": 13300 + }, + { + "epoch": 2.1704730831973897, + "grad_norm": 1.7142517566680908, + "learning_rate": 4.794279128898913e-05, + "loss": 0.1073, + "num_input_tokens_seen": 28725200, + "step": 13305 + }, + { + "epoch": 2.171288743882545, + "grad_norm": 0.5713295936584473, + "learning_rate": 4.7939962760691675e-05, + "loss": 0.1202, + "num_input_tokens_seen": 28736176, + "step": 13310 + }, + { + "epoch": 2.1721044045677, + "grad_norm": 0.2936367392539978, + "learning_rate": 4.793713237276445e-05, + "loss": 0.0882, + "num_input_tokens_seen": 28746928, + "step": 13315 + }, + { + "epoch": 2.1729200652528546, + "grad_norm": 1.2313017845153809, + "learning_rate": 4.7934300125436885e-05, + "loss": 0.1868, + "num_input_tokens_seen": 28757584, + "step": 13320 + }, + { + "epoch": 2.17373572593801, + "grad_norm": 0.5504234433174133, + "learning_rate": 4.7931466018938586e-05, + "loss": 0.098, + "num_input_tokens_seen": 28766224, + "step": 13325 + }, + { + "epoch": 2.1745513866231647, + "grad_norm": 0.2147180736064911, + "learning_rate": 4.79286300534993e-05, + "loss": 0.0824, + "num_input_tokens_seen": 28775984, + "step": 13330 + }, + { + "epoch": 2.1753670473083195, + "grad_norm": 0.049584973603487015, + "learning_rate": 4.792579222934892e-05, + "loss": 0.0831, + "num_input_tokens_seen": 28787056, + "step": 13335 + }, + { + "epoch": 2.176182707993475, + "grad_norm": 0.3779813349246979, + "learning_rate": 4.79229525467175e-05, + "loss": 0.0951, + "num_input_tokens_seen": 28797648, + "step": 13340 + }, + { + "epoch": 2.1769983686786296, + "grad_norm": 0.13990169763565063, + "learning_rate": 4.792011100583524e-05, + "loss": 0.0994, + "num_input_tokens_seen": 28808656, + "step": 13345 + }, + { + "epoch": 2.177814029363785, + "grad_norm": 1.3999111652374268, + "learning_rate": 4.791726760693248e-05, + "loss": 0.0726, + "num_input_tokens_seen": 28818448, + "step": 13350 + }, + { + "epoch": 2.1786296900489397, + "grad_norm": 1.634453535079956, + "learning_rate": 4.791442235023974e-05, + "loss": 0.1477, + "num_input_tokens_seen": 28828752, + "step": 13355 + }, + { + "epoch": 2.1794453507340945, + "grad_norm": 2.022653579711914, + "learning_rate": 4.7911575235987644e-05, + "loss": 0.1933, + "num_input_tokens_seen": 28838896, + "step": 13360 + }, + { + "epoch": 2.1802610114192498, + "grad_norm": 0.4094257354736328, + "learning_rate": 4.790872626440701e-05, + "loss": 0.1722, + "num_input_tokens_seen": 28848880, + "step": 13365 + }, + { + "epoch": 2.1810766721044046, + "grad_norm": 0.548117458820343, + "learning_rate": 4.790587543572879e-05, + "loss": 0.344, + "num_input_tokens_seen": 28860592, + "step": 13370 + }, + { + "epoch": 2.1818923327895594, + "grad_norm": 0.37602749466896057, + "learning_rate": 4.790302275018408e-05, + "loss": 0.1698, + "num_input_tokens_seen": 28872432, + "step": 13375 + }, + { + "epoch": 2.1827079934747147, + "grad_norm": 0.6694759726524353, + "learning_rate": 4.790016820800414e-05, + "loss": 0.0546, + "num_input_tokens_seen": 28883856, + "step": 13380 + }, + { + "epoch": 2.1835236541598695, + "grad_norm": 0.804273247718811, + "learning_rate": 4.789731180942037e-05, + "loss": 0.2271, + "num_input_tokens_seen": 28894384, + "step": 13385 + }, + { + "epoch": 2.1843393148450243, + "grad_norm": 1.8568830490112305, + "learning_rate": 4.7894453554664325e-05, + "loss": 0.1547, + "num_input_tokens_seen": 28902288, + "step": 13390 + }, + { + "epoch": 2.1851549755301796, + "grad_norm": 1.6628495454788208, + "learning_rate": 4.7891593443967706e-05, + "loss": 0.1666, + "num_input_tokens_seen": 28913296, + "step": 13395 + }, + { + "epoch": 2.1859706362153344, + "grad_norm": 0.06772861629724503, + "learning_rate": 4.788873147756238e-05, + "loss": 0.1343, + "num_input_tokens_seen": 28923888, + "step": 13400 + }, + { + "epoch": 2.186786296900489, + "grad_norm": 0.3587746322154999, + "learning_rate": 4.788586765568034e-05, + "loss": 0.2296, + "num_input_tokens_seen": 28934640, + "step": 13405 + }, + { + "epoch": 2.1876019575856445, + "grad_norm": 0.1269921511411667, + "learning_rate": 4.788300197855374e-05, + "loss": 0.1596, + "num_input_tokens_seen": 28945648, + "step": 13410 + }, + { + "epoch": 2.1884176182707993, + "grad_norm": 0.7698563933372498, + "learning_rate": 4.788013444641491e-05, + "loss": 0.0645, + "num_input_tokens_seen": 28957392, + "step": 13415 + }, + { + "epoch": 2.189233278955954, + "grad_norm": 0.5774688124656677, + "learning_rate": 4.7877265059496266e-05, + "loss": 0.0758, + "num_input_tokens_seen": 28967056, + "step": 13420 + }, + { + "epoch": 2.1900489396411094, + "grad_norm": 0.30158737301826477, + "learning_rate": 4.7874393818030456e-05, + "loss": 0.2367, + "num_input_tokens_seen": 28977808, + "step": 13425 + }, + { + "epoch": 2.190864600326264, + "grad_norm": 0.37706589698791504, + "learning_rate": 4.7871520722250214e-05, + "loss": 0.0897, + "num_input_tokens_seen": 28987440, + "step": 13430 + }, + { + "epoch": 2.1916802610114194, + "grad_norm": 1.0010120868682861, + "learning_rate": 4.786864577238845e-05, + "loss": 0.1282, + "num_input_tokens_seen": 28997392, + "step": 13435 + }, + { + "epoch": 2.1924959216965743, + "grad_norm": 0.6086024045944214, + "learning_rate": 4.7865768968678226e-05, + "loss": 0.2132, + "num_input_tokens_seen": 29007824, + "step": 13440 + }, + { + "epoch": 2.193311582381729, + "grad_norm": 0.2765220105648041, + "learning_rate": 4.786289031135275e-05, + "loss": 0.1333, + "num_input_tokens_seen": 29019024, + "step": 13445 + }, + { + "epoch": 2.1941272430668843, + "grad_norm": 1.2010226249694824, + "learning_rate": 4.786000980064538e-05, + "loss": 0.165, + "num_input_tokens_seen": 29031600, + "step": 13450 + }, + { + "epoch": 2.194942903752039, + "grad_norm": 1.0528693199157715, + "learning_rate": 4.785712743678963e-05, + "loss": 0.1506, + "num_input_tokens_seen": 29043056, + "step": 13455 + }, + { + "epoch": 2.195758564437194, + "grad_norm": 0.5378763675689697, + "learning_rate": 4.785424322001915e-05, + "loss": 0.1189, + "num_input_tokens_seen": 29053648, + "step": 13460 + }, + { + "epoch": 2.1965742251223492, + "grad_norm": 0.38808873295783997, + "learning_rate": 4.785135715056775e-05, + "loss": 0.0685, + "num_input_tokens_seen": 29064112, + "step": 13465 + }, + { + "epoch": 2.197389885807504, + "grad_norm": 0.1925218403339386, + "learning_rate": 4.78484692286694e-05, + "loss": 0.047, + "num_input_tokens_seen": 29075216, + "step": 13470 + }, + { + "epoch": 2.198205546492659, + "grad_norm": 1.6443582773208618, + "learning_rate": 4.7845579454558196e-05, + "loss": 0.1414, + "num_input_tokens_seen": 29084368, + "step": 13475 + }, + { + "epoch": 2.199021207177814, + "grad_norm": 0.13677892088890076, + "learning_rate": 4.784268782846841e-05, + "loss": 0.0581, + "num_input_tokens_seen": 29096016, + "step": 13480 + }, + { + "epoch": 2.199836867862969, + "grad_norm": 0.5304481983184814, + "learning_rate": 4.783979435063445e-05, + "loss": 0.0965, + "num_input_tokens_seen": 29106768, + "step": 13485 + }, + { + "epoch": 2.200652528548124, + "grad_norm": 2.075303077697754, + "learning_rate": 4.783689902129086e-05, + "loss": 0.1879, + "num_input_tokens_seen": 29116560, + "step": 13490 + }, + { + "epoch": 2.201468189233279, + "grad_norm": 0.5959886312484741, + "learning_rate": 4.783400184067237e-05, + "loss": 0.1832, + "num_input_tokens_seen": 29127376, + "step": 13495 + }, + { + "epoch": 2.202283849918434, + "grad_norm": 0.5153542757034302, + "learning_rate": 4.783110280901383e-05, + "loss": 0.1606, + "num_input_tokens_seen": 29139088, + "step": 13500 + }, + { + "epoch": 2.203099510603589, + "grad_norm": 0.32251691818237305, + "learning_rate": 4.7828201926550245e-05, + "loss": 0.1493, + "num_input_tokens_seen": 29149488, + "step": 13505 + }, + { + "epoch": 2.203915171288744, + "grad_norm": 0.15522581338882446, + "learning_rate": 4.7825299193516794e-05, + "loss": 0.0689, + "num_input_tokens_seen": 29159728, + "step": 13510 + }, + { + "epoch": 2.2047308319738987, + "grad_norm": 0.5815375447273254, + "learning_rate": 4.782239461014877e-05, + "loss": 0.1041, + "num_input_tokens_seen": 29170928, + "step": 13515 + }, + { + "epoch": 2.205546492659054, + "grad_norm": 1.9519193172454834, + "learning_rate": 4.781948817668164e-05, + "loss": 0.2686, + "num_input_tokens_seen": 29180912, + "step": 13520 + }, + { + "epoch": 2.206362153344209, + "grad_norm": 0.18937018513679504, + "learning_rate": 4.7816579893351014e-05, + "loss": 0.1495, + "num_input_tokens_seen": 29191216, + "step": 13525 + }, + { + "epoch": 2.2071778140293636, + "grad_norm": 0.15450355410575867, + "learning_rate": 4.781366976039265e-05, + "loss": 0.0456, + "num_input_tokens_seen": 29201424, + "step": 13530 + }, + { + "epoch": 2.207993474714519, + "grad_norm": 0.12843057513237, + "learning_rate": 4.781075777804246e-05, + "loss": 0.2095, + "num_input_tokens_seen": 29211152, + "step": 13535 + }, + { + "epoch": 2.2088091353996737, + "grad_norm": 1.973917007446289, + "learning_rate": 4.7807843946536514e-05, + "loss": 0.1831, + "num_input_tokens_seen": 29221744, + "step": 13540 + }, + { + "epoch": 2.2096247960848285, + "grad_norm": 0.8128254413604736, + "learning_rate": 4.7804928266110996e-05, + "loss": 0.2249, + "num_input_tokens_seen": 29231568, + "step": 13545 + }, + { + "epoch": 2.210440456769984, + "grad_norm": 2.464848756790161, + "learning_rate": 4.780201073700229e-05, + "loss": 0.3909, + "num_input_tokens_seen": 29242128, + "step": 13550 + }, + { + "epoch": 2.2112561174551386, + "grad_norm": 0.4939432740211487, + "learning_rate": 4.7799091359446905e-05, + "loss": 0.1565, + "num_input_tokens_seen": 29250992, + "step": 13555 + }, + { + "epoch": 2.2120717781402934, + "grad_norm": 0.5552804470062256, + "learning_rate": 4.779617013368148e-05, + "loss": 0.0725, + "num_input_tokens_seen": 29261072, + "step": 13560 + }, + { + "epoch": 2.2128874388254487, + "grad_norm": 0.4202837646007538, + "learning_rate": 4.7793247059942845e-05, + "loss": 0.0908, + "num_input_tokens_seen": 29271760, + "step": 13565 + }, + { + "epoch": 2.2137030995106035, + "grad_norm": 0.2647188603878021, + "learning_rate": 4.779032213846795e-05, + "loss": 0.2231, + "num_input_tokens_seen": 29282480, + "step": 13570 + }, + { + "epoch": 2.2145187601957588, + "grad_norm": 1.0521947145462036, + "learning_rate": 4.77873953694939e-05, + "loss": 0.1331, + "num_input_tokens_seen": 29292912, + "step": 13575 + }, + { + "epoch": 2.2153344208809136, + "grad_norm": 0.17444144189357758, + "learning_rate": 4.778446675325796e-05, + "loss": 0.1292, + "num_input_tokens_seen": 29302800, + "step": 13580 + }, + { + "epoch": 2.2161500815660684, + "grad_norm": 0.41427701711654663, + "learning_rate": 4.778153628999754e-05, + "loss": 0.0948, + "num_input_tokens_seen": 29313424, + "step": 13585 + }, + { + "epoch": 2.2169657422512237, + "grad_norm": 0.7632957100868225, + "learning_rate": 4.7778603979950196e-05, + "loss": 0.1298, + "num_input_tokens_seen": 29323856, + "step": 13590 + }, + { + "epoch": 2.2177814029363785, + "grad_norm": 0.09347732365131378, + "learning_rate": 4.777566982335364e-05, + "loss": 0.0652, + "num_input_tokens_seen": 29334832, + "step": 13595 + }, + { + "epoch": 2.2185970636215333, + "grad_norm": 0.06450121849775314, + "learning_rate": 4.777273382044572e-05, + "loss": 0.1003, + "num_input_tokens_seen": 29345936, + "step": 13600 + }, + { + "epoch": 2.2194127243066886, + "grad_norm": 3.384652853012085, + "learning_rate": 4.7769795971464456e-05, + "loss": 0.1855, + "num_input_tokens_seen": 29356560, + "step": 13605 + }, + { + "epoch": 2.2202283849918434, + "grad_norm": 0.1759355664253235, + "learning_rate": 4.7766856276647986e-05, + "loss": 0.0624, + "num_input_tokens_seen": 29367024, + "step": 13610 + }, + { + "epoch": 2.221044045676998, + "grad_norm": 0.5091504454612732, + "learning_rate": 4.776391473623464e-05, + "loss": 0.0644, + "num_input_tokens_seen": 29376944, + "step": 13615 + }, + { + "epoch": 2.2218597063621535, + "grad_norm": 2.0757315158843994, + "learning_rate": 4.7760971350462856e-05, + "loss": 0.1894, + "num_input_tokens_seen": 29389552, + "step": 13620 + }, + { + "epoch": 2.2226753670473083, + "grad_norm": 0.5930010676383972, + "learning_rate": 4.775802611957125e-05, + "loss": 0.0428, + "num_input_tokens_seen": 29399600, + "step": 13625 + }, + { + "epoch": 2.223491027732463, + "grad_norm": 1.1199795007705688, + "learning_rate": 4.7755079043798565e-05, + "loss": 0.0813, + "num_input_tokens_seen": 29410896, + "step": 13630 + }, + { + "epoch": 2.2243066884176184, + "grad_norm": 1.2806106805801392, + "learning_rate": 4.775213012338373e-05, + "loss": 0.1638, + "num_input_tokens_seen": 29421136, + "step": 13635 + }, + { + "epoch": 2.225122349102773, + "grad_norm": 1.1316274404525757, + "learning_rate": 4.774917935856577e-05, + "loss": 0.1025, + "num_input_tokens_seen": 29431536, + "step": 13640 + }, + { + "epoch": 2.225938009787928, + "grad_norm": 0.10398954898118973, + "learning_rate": 4.774622674958391e-05, + "loss": 0.1368, + "num_input_tokens_seen": 29442416, + "step": 13645 + }, + { + "epoch": 2.2267536704730833, + "grad_norm": 0.8221560120582581, + "learning_rate": 4.7743272296677495e-05, + "loss": 0.0742, + "num_input_tokens_seen": 29453232, + "step": 13650 + }, + { + "epoch": 2.227569331158238, + "grad_norm": 0.6162348985671997, + "learning_rate": 4.774031600008603e-05, + "loss": 0.0923, + "num_input_tokens_seen": 29464976, + "step": 13655 + }, + { + "epoch": 2.2283849918433933, + "grad_norm": 1.2692580223083496, + "learning_rate": 4.7737357860049164e-05, + "loss": 0.1944, + "num_input_tokens_seen": 29475536, + "step": 13660 + }, + { + "epoch": 2.229200652528548, + "grad_norm": 0.11307224631309509, + "learning_rate": 4.7734397876806704e-05, + "loss": 0.0304, + "num_input_tokens_seen": 29486256, + "step": 13665 + }, + { + "epoch": 2.230016313213703, + "grad_norm": 1.295657992362976, + "learning_rate": 4.77314360505986e-05, + "loss": 0.2264, + "num_input_tokens_seen": 29498512, + "step": 13670 + }, + { + "epoch": 2.2308319738988582, + "grad_norm": 0.8491688370704651, + "learning_rate": 4.772847238166495e-05, + "loss": 0.172, + "num_input_tokens_seen": 29509360, + "step": 13675 + }, + { + "epoch": 2.231647634584013, + "grad_norm": 0.7462703585624695, + "learning_rate": 4.7725506870246006e-05, + "loss": 0.0326, + "num_input_tokens_seen": 29520400, + "step": 13680 + }, + { + "epoch": 2.232463295269168, + "grad_norm": 0.5510059595108032, + "learning_rate": 4.772253951658217e-05, + "loss": 0.1895, + "num_input_tokens_seen": 29532016, + "step": 13685 + }, + { + "epoch": 2.233278955954323, + "grad_norm": 0.32409805059432983, + "learning_rate": 4.771957032091398e-05, + "loss": 0.0839, + "num_input_tokens_seen": 29543728, + "step": 13690 + }, + { + "epoch": 2.234094616639478, + "grad_norm": 0.9810343980789185, + "learning_rate": 4.771659928348214e-05, + "loss": 0.5666, + "num_input_tokens_seen": 29554352, + "step": 13695 + }, + { + "epoch": 2.2349102773246328, + "grad_norm": 0.8076919913291931, + "learning_rate": 4.7713626404527514e-05, + "loss": 0.1277, + "num_input_tokens_seen": 29565168, + "step": 13700 + }, + { + "epoch": 2.235725938009788, + "grad_norm": 1.105963945388794, + "learning_rate": 4.7710651684291074e-05, + "loss": 0.2664, + "num_input_tokens_seen": 29575792, + "step": 13705 + }, + { + "epoch": 2.236541598694943, + "grad_norm": 0.2101021707057953, + "learning_rate": 4.770767512301398e-05, + "loss": 0.1594, + "num_input_tokens_seen": 29586384, + "step": 13710 + }, + { + "epoch": 2.237357259380098, + "grad_norm": 0.10565786063671112, + "learning_rate": 4.770469672093752e-05, + "loss": 0.0731, + "num_input_tokens_seen": 29597232, + "step": 13715 + }, + { + "epoch": 2.238172920065253, + "grad_norm": 2.147580862045288, + "learning_rate": 4.7701716478303135e-05, + "loss": 0.1412, + "num_input_tokens_seen": 29606896, + "step": 13720 + }, + { + "epoch": 2.2389885807504077, + "grad_norm": 0.11202830821275711, + "learning_rate": 4.769873439535244e-05, + "loss": 0.0652, + "num_input_tokens_seen": 29617648, + "step": 13725 + }, + { + "epoch": 2.239804241435563, + "grad_norm": 1.2284941673278809, + "learning_rate": 4.769575047232715e-05, + "loss": 0.2647, + "num_input_tokens_seen": 29627664, + "step": 13730 + }, + { + "epoch": 2.240619902120718, + "grad_norm": 0.24147869646549225, + "learning_rate": 4.769276470946917e-05, + "loss": 0.0667, + "num_input_tokens_seen": 29639088, + "step": 13735 + }, + { + "epoch": 2.2414355628058726, + "grad_norm": 0.7831531167030334, + "learning_rate": 4.768977710702055e-05, + "loss": 0.0966, + "num_input_tokens_seen": 29649648, + "step": 13740 + }, + { + "epoch": 2.242251223491028, + "grad_norm": 0.46271488070487976, + "learning_rate": 4.768678766522347e-05, + "loss": 0.2031, + "num_input_tokens_seen": 29661360, + "step": 13745 + }, + { + "epoch": 2.2430668841761827, + "grad_norm": 0.4586954116821289, + "learning_rate": 4.768379638432026e-05, + "loss": 0.102, + "num_input_tokens_seen": 29672336, + "step": 13750 + }, + { + "epoch": 2.2438825448613375, + "grad_norm": 2.5928611755371094, + "learning_rate": 4.768080326455343e-05, + "loss": 0.3049, + "num_input_tokens_seen": 29683472, + "step": 13755 + }, + { + "epoch": 2.244698205546493, + "grad_norm": 0.30377769470214844, + "learning_rate": 4.7677808306165596e-05, + "loss": 0.0201, + "num_input_tokens_seen": 29695472, + "step": 13760 + }, + { + "epoch": 2.2455138662316476, + "grad_norm": 0.8730031251907349, + "learning_rate": 4.767481150939956e-05, + "loss": 0.2246, + "num_input_tokens_seen": 29705520, + "step": 13765 + }, + { + "epoch": 2.2463295269168024, + "grad_norm": 0.43833523988723755, + "learning_rate": 4.767181287449825e-05, + "loss": 0.0621, + "num_input_tokens_seen": 29715824, + "step": 13770 + }, + { + "epoch": 2.2471451876019577, + "grad_norm": 0.5170870423316956, + "learning_rate": 4.766881240170475e-05, + "loss": 0.0495, + "num_input_tokens_seen": 29726864, + "step": 13775 + }, + { + "epoch": 2.2479608482871125, + "grad_norm": 0.15601758658885956, + "learning_rate": 4.7665810091262305e-05, + "loss": 0.0331, + "num_input_tokens_seen": 29738128, + "step": 13780 + }, + { + "epoch": 2.2487765089722673, + "grad_norm": 0.18609844148159027, + "learning_rate": 4.766280594341428e-05, + "loss": 0.2147, + "num_input_tokens_seen": 29747152, + "step": 13785 + }, + { + "epoch": 2.2495921696574226, + "grad_norm": 1.5417094230651855, + "learning_rate": 4.7659799958404225e-05, + "loss": 0.1723, + "num_input_tokens_seen": 29756912, + "step": 13790 + }, + { + "epoch": 2.2504078303425774, + "grad_norm": 1.5286859273910522, + "learning_rate": 4.7656792136475804e-05, + "loss": 0.2052, + "num_input_tokens_seen": 29766768, + "step": 13795 + }, + { + "epoch": 2.2512234910277327, + "grad_norm": 1.218888282775879, + "learning_rate": 4.765378247787285e-05, + "loss": 0.1301, + "num_input_tokens_seen": 29778032, + "step": 13800 + }, + { + "epoch": 2.2520391517128875, + "grad_norm": 1.1892277002334595, + "learning_rate": 4.765077098283935e-05, + "loss": 0.2085, + "num_input_tokens_seen": 29787408, + "step": 13805 + }, + { + "epoch": 2.2528548123980423, + "grad_norm": 0.3803302049636841, + "learning_rate": 4.764775765161943e-05, + "loss": 0.0679, + "num_input_tokens_seen": 29797232, + "step": 13810 + }, + { + "epoch": 2.2536704730831976, + "grad_norm": 0.847817599773407, + "learning_rate": 4.764474248445735e-05, + "loss": 0.2341, + "num_input_tokens_seen": 29808016, + "step": 13815 + }, + { + "epoch": 2.2544861337683524, + "grad_norm": 0.3727641701698303, + "learning_rate": 4.764172548159755e-05, + "loss": 0.0618, + "num_input_tokens_seen": 29818640, + "step": 13820 + }, + { + "epoch": 2.255301794453507, + "grad_norm": 0.16063395142555237, + "learning_rate": 4.7638706643284605e-05, + "loss": 0.1552, + "num_input_tokens_seen": 29829776, + "step": 13825 + }, + { + "epoch": 2.2561174551386625, + "grad_norm": 0.1389966458082199, + "learning_rate": 4.7635685969763225e-05, + "loss": 0.0869, + "num_input_tokens_seen": 29840304, + "step": 13830 + }, + { + "epoch": 2.2569331158238173, + "grad_norm": 0.28809016942977905, + "learning_rate": 4.763266346127829e-05, + "loss": 0.2421, + "num_input_tokens_seen": 29852048, + "step": 13835 + }, + { + "epoch": 2.257748776508972, + "grad_norm": 1.2391031980514526, + "learning_rate": 4.7629639118074816e-05, + "loss": 0.0989, + "num_input_tokens_seen": 29861264, + "step": 13840 + }, + { + "epoch": 2.2585644371941274, + "grad_norm": 0.05801759287714958, + "learning_rate": 4.7626612940397976e-05, + "loss": 0.2238, + "num_input_tokens_seen": 29871984, + "step": 13845 + }, + { + "epoch": 2.259380097879282, + "grad_norm": 0.7271353602409363, + "learning_rate": 4.762358492849308e-05, + "loss": 0.221, + "num_input_tokens_seen": 29882256, + "step": 13850 + }, + { + "epoch": 2.2601957585644374, + "grad_norm": 1.014772891998291, + "learning_rate": 4.762055508260561e-05, + "loss": 0.1183, + "num_input_tokens_seen": 29891696, + "step": 13855 + }, + { + "epoch": 2.2610114192495923, + "grad_norm": 0.44359707832336426, + "learning_rate": 4.7617523402981155e-05, + "loss": 0.0833, + "num_input_tokens_seen": 29903856, + "step": 13860 + }, + { + "epoch": 2.261827079934747, + "grad_norm": 0.2842143476009369, + "learning_rate": 4.7614489889865506e-05, + "loss": 0.0897, + "num_input_tokens_seen": 29913520, + "step": 13865 + }, + { + "epoch": 2.262642740619902, + "grad_norm": 0.9333368539810181, + "learning_rate": 4.761145454350455e-05, + "loss": 0.0484, + "num_input_tokens_seen": 29924016, + "step": 13870 + }, + { + "epoch": 2.263458401305057, + "grad_norm": 1.7074891328811646, + "learning_rate": 4.760841736414437e-05, + "loss": 0.2495, + "num_input_tokens_seen": 29935120, + "step": 13875 + }, + { + "epoch": 2.264274061990212, + "grad_norm": 1.2954754829406738, + "learning_rate": 4.760537835203116e-05, + "loss": 0.1746, + "num_input_tokens_seen": 29946288, + "step": 13880 + }, + { + "epoch": 2.2650897226753672, + "grad_norm": 1.4761862754821777, + "learning_rate": 4.760233750741128e-05, + "loss": 0.1646, + "num_input_tokens_seen": 29957008, + "step": 13885 + }, + { + "epoch": 2.265905383360522, + "grad_norm": 0.1096840649843216, + "learning_rate": 4.7599294830531235e-05, + "loss": 0.083, + "num_input_tokens_seen": 29966896, + "step": 13890 + }, + { + "epoch": 2.266721044045677, + "grad_norm": 2.2314236164093018, + "learning_rate": 4.759625032163769e-05, + "loss": 0.2202, + "num_input_tokens_seen": 29976528, + "step": 13895 + }, + { + "epoch": 2.267536704730832, + "grad_norm": 0.29714876413345337, + "learning_rate": 4.7593203980977444e-05, + "loss": 0.0944, + "num_input_tokens_seen": 29987120, + "step": 13900 + }, + { + "epoch": 2.268352365415987, + "grad_norm": 0.5204722881317139, + "learning_rate": 4.759015580879744e-05, + "loss": 0.0705, + "num_input_tokens_seen": 29997680, + "step": 13905 + }, + { + "epoch": 2.2691680261011418, + "grad_norm": 0.20128268003463745, + "learning_rate": 4.758710580534479e-05, + "loss": 0.2356, + "num_input_tokens_seen": 30008784, + "step": 13910 + }, + { + "epoch": 2.269983686786297, + "grad_norm": 0.10542436689138412, + "learning_rate": 4.758405397086674e-05, + "loss": 0.3545, + "num_input_tokens_seen": 30018416, + "step": 13915 + }, + { + "epoch": 2.270799347471452, + "grad_norm": 1.2286351919174194, + "learning_rate": 4.758100030561068e-05, + "loss": 0.1174, + "num_input_tokens_seen": 30030352, + "step": 13920 + }, + { + "epoch": 2.2716150081566067, + "grad_norm": 0.3744695782661438, + "learning_rate": 4.757794480982416e-05, + "loss": 0.1071, + "num_input_tokens_seen": 30041392, + "step": 13925 + }, + { + "epoch": 2.272430668841762, + "grad_norm": 0.3680442273616791, + "learning_rate": 4.757488748375487e-05, + "loss": 0.0294, + "num_input_tokens_seen": 30052240, + "step": 13930 + }, + { + "epoch": 2.2732463295269167, + "grad_norm": 0.4691818654537201, + "learning_rate": 4.757182832765067e-05, + "loss": 0.2776, + "num_input_tokens_seen": 30062192, + "step": 13935 + }, + { + "epoch": 2.274061990212072, + "grad_norm": 0.5376624464988708, + "learning_rate": 4.7568767341759526e-05, + "loss": 0.1331, + "num_input_tokens_seen": 30073808, + "step": 13940 + }, + { + "epoch": 2.274877650897227, + "grad_norm": 0.5299275517463684, + "learning_rate": 4.756570452632959e-05, + "loss": 0.178, + "num_input_tokens_seen": 30084016, + "step": 13945 + }, + { + "epoch": 2.2756933115823816, + "grad_norm": 1.015104055404663, + "learning_rate": 4.756263988160915e-05, + "loss": 0.0898, + "num_input_tokens_seen": 30095600, + "step": 13950 + }, + { + "epoch": 2.2765089722675365, + "grad_norm": 0.8894478678703308, + "learning_rate": 4.755957340784664e-05, + "loss": 0.0888, + "num_input_tokens_seen": 30106800, + "step": 13955 + }, + { + "epoch": 2.2773246329526917, + "grad_norm": 0.29795682430267334, + "learning_rate": 4.755650510529064e-05, + "loss": 0.0336, + "num_input_tokens_seen": 30118288, + "step": 13960 + }, + { + "epoch": 2.2781402936378465, + "grad_norm": 0.6013714671134949, + "learning_rate": 4.755343497418989e-05, + "loss": 0.0872, + "num_input_tokens_seen": 30129552, + "step": 13965 + }, + { + "epoch": 2.278955954323002, + "grad_norm": 0.36108410358428955, + "learning_rate": 4.7550363014793264e-05, + "loss": 0.1239, + "num_input_tokens_seen": 30140720, + "step": 13970 + }, + { + "epoch": 2.2797716150081566, + "grad_norm": 0.9697116017341614, + "learning_rate": 4.754728922734979e-05, + "loss": 0.1035, + "num_input_tokens_seen": 30150640, + "step": 13975 + }, + { + "epoch": 2.2805872756933114, + "grad_norm": 0.8726990818977356, + "learning_rate": 4.754421361210865e-05, + "loss": 0.1191, + "num_input_tokens_seen": 30160976, + "step": 13980 + }, + { + "epoch": 2.2814029363784667, + "grad_norm": 2.0480940341949463, + "learning_rate": 4.7541136169319165e-05, + "loss": 0.2516, + "num_input_tokens_seen": 30170576, + "step": 13985 + }, + { + "epoch": 2.2822185970636215, + "grad_norm": 1.4654812812805176, + "learning_rate": 4.7538056899230815e-05, + "loss": 0.1386, + "num_input_tokens_seen": 30181904, + "step": 13990 + }, + { + "epoch": 2.2830342577487763, + "grad_norm": 0.3451170325279236, + "learning_rate": 4.753497580209321e-05, + "loss": 0.0573, + "num_input_tokens_seen": 30193392, + "step": 13995 + }, + { + "epoch": 2.2838499184339316, + "grad_norm": 1.2247956991195679, + "learning_rate": 4.7531892878156125e-05, + "loss": 0.0781, + "num_input_tokens_seen": 30204528, + "step": 14000 + }, + { + "epoch": 2.2846655791190864, + "grad_norm": 1.555415391921997, + "learning_rate": 4.752880812766948e-05, + "loss": 0.1339, + "num_input_tokens_seen": 30215760, + "step": 14005 + }, + { + "epoch": 2.2854812398042412, + "grad_norm": 0.3646087944507599, + "learning_rate": 4.752572155088334e-05, + "loss": 0.0457, + "num_input_tokens_seen": 30226096, + "step": 14010 + }, + { + "epoch": 2.2862969004893965, + "grad_norm": 0.18460646271705627, + "learning_rate": 4.752263314804791e-05, + "loss": 0.0923, + "num_input_tokens_seen": 30236784, + "step": 14015 + }, + { + "epoch": 2.2871125611745513, + "grad_norm": 0.35536134243011475, + "learning_rate": 4.7519542919413566e-05, + "loss": 0.121, + "num_input_tokens_seen": 30248432, + "step": 14020 + }, + { + "epoch": 2.2879282218597066, + "grad_norm": 0.3636711537837982, + "learning_rate": 4.751645086523081e-05, + "loss": 0.1703, + "num_input_tokens_seen": 30258448, + "step": 14025 + }, + { + "epoch": 2.2887438825448614, + "grad_norm": 0.6909719705581665, + "learning_rate": 4.75133569857503e-05, + "loss": 0.2268, + "num_input_tokens_seen": 30269424, + "step": 14030 + }, + { + "epoch": 2.289559543230016, + "grad_norm": 0.29647842049598694, + "learning_rate": 4.751026128122283e-05, + "loss": 0.0697, + "num_input_tokens_seen": 30280144, + "step": 14035 + }, + { + "epoch": 2.2903752039151715, + "grad_norm": 0.41081520915031433, + "learning_rate": 4.7507163751899374e-05, + "loss": 0.0646, + "num_input_tokens_seen": 30290064, + "step": 14040 + }, + { + "epoch": 2.2911908646003263, + "grad_norm": 0.2059166580438614, + "learning_rate": 4.750406439803102e-05, + "loss": 0.1677, + "num_input_tokens_seen": 30299984, + "step": 14045 + }, + { + "epoch": 2.292006525285481, + "grad_norm": 0.45396357774734497, + "learning_rate": 4.750096321986902e-05, + "loss": 0.0512, + "num_input_tokens_seen": 30311248, + "step": 14050 + }, + { + "epoch": 2.2928221859706364, + "grad_norm": 0.19897764921188354, + "learning_rate": 4.749786021766478e-05, + "loss": 0.1314, + "num_input_tokens_seen": 30322160, + "step": 14055 + }, + { + "epoch": 2.293637846655791, + "grad_norm": 1.5904041528701782, + "learning_rate": 4.749475539166983e-05, + "loss": 0.1452, + "num_input_tokens_seen": 30333616, + "step": 14060 + }, + { + "epoch": 2.294453507340946, + "grad_norm": 2.0654308795928955, + "learning_rate": 4.749164874213588e-05, + "loss": 0.0906, + "num_input_tokens_seen": 30344112, + "step": 14065 + }, + { + "epoch": 2.2952691680261013, + "grad_norm": 0.3560764491558075, + "learning_rate": 4.7488540269314756e-05, + "loss": 0.0513, + "num_input_tokens_seen": 30354192, + "step": 14070 + }, + { + "epoch": 2.296084828711256, + "grad_norm": 1.486275315284729, + "learning_rate": 4.748542997345845e-05, + "loss": 0.1184, + "num_input_tokens_seen": 30365168, + "step": 14075 + }, + { + "epoch": 2.2969004893964113, + "grad_norm": 1.106338381767273, + "learning_rate": 4.74823178548191e-05, + "loss": 0.1336, + "num_input_tokens_seen": 30376464, + "step": 14080 + }, + { + "epoch": 2.297716150081566, + "grad_norm": 0.53264981508255, + "learning_rate": 4.7479203913649e-05, + "loss": 0.1012, + "num_input_tokens_seen": 30387472, + "step": 14085 + }, + { + "epoch": 2.298531810766721, + "grad_norm": 1.3906505107879639, + "learning_rate": 4.747608815020056e-05, + "loss": 0.2618, + "num_input_tokens_seen": 30398224, + "step": 14090 + }, + { + "epoch": 2.299347471451876, + "grad_norm": 1.6780986785888672, + "learning_rate": 4.747297056472638e-05, + "loss": 0.2984, + "num_input_tokens_seen": 30409872, + "step": 14095 + }, + { + "epoch": 2.300163132137031, + "grad_norm": 1.0715991258621216, + "learning_rate": 4.7469851157479177e-05, + "loss": 0.1576, + "num_input_tokens_seen": 30420144, + "step": 14100 + }, + { + "epoch": 2.300978792822186, + "grad_norm": 0.7742687463760376, + "learning_rate": 4.746672992871183e-05, + "loss": 0.1019, + "num_input_tokens_seen": 30430704, + "step": 14105 + }, + { + "epoch": 2.301794453507341, + "grad_norm": 0.26867881417274475, + "learning_rate": 4.746360687867736e-05, + "loss": 0.1122, + "num_input_tokens_seen": 30441328, + "step": 14110 + }, + { + "epoch": 2.302610114192496, + "grad_norm": 0.4401240944862366, + "learning_rate": 4.746048200762893e-05, + "loss": 0.1127, + "num_input_tokens_seen": 30452208, + "step": 14115 + }, + { + "epoch": 2.3034257748776508, + "grad_norm": 0.48443764448165894, + "learning_rate": 4.7457355315819874e-05, + "loss": 0.0992, + "num_input_tokens_seen": 30462960, + "step": 14120 + }, + { + "epoch": 2.304241435562806, + "grad_norm": 1.0418888330459595, + "learning_rate": 4.745422680350364e-05, + "loss": 0.1803, + "num_input_tokens_seen": 30473168, + "step": 14125 + }, + { + "epoch": 2.305057096247961, + "grad_norm": 1.5463100671768188, + "learning_rate": 4.745109647093385e-05, + "loss": 0.1137, + "num_input_tokens_seen": 30483824, + "step": 14130 + }, + { + "epoch": 2.3058727569331157, + "grad_norm": 0.6509619355201721, + "learning_rate": 4.744796431836428e-05, + "loss": 0.1748, + "num_input_tokens_seen": 30495632, + "step": 14135 + }, + { + "epoch": 2.306688417618271, + "grad_norm": 0.687272310256958, + "learning_rate": 4.7444830346048804e-05, + "loss": 0.1087, + "num_input_tokens_seen": 30505808, + "step": 14140 + }, + { + "epoch": 2.3075040783034257, + "grad_norm": 0.5971601605415344, + "learning_rate": 4.744169455424151e-05, + "loss": 0.0816, + "num_input_tokens_seen": 30516368, + "step": 14145 + }, + { + "epoch": 2.3083197389885806, + "grad_norm": 0.13959559798240662, + "learning_rate": 4.7438556943196574e-05, + "loss": 0.0669, + "num_input_tokens_seen": 30527856, + "step": 14150 + }, + { + "epoch": 2.309135399673736, + "grad_norm": 1.5694308280944824, + "learning_rate": 4.743541751316837e-05, + "loss": 0.3163, + "num_input_tokens_seen": 30537936, + "step": 14155 + }, + { + "epoch": 2.3099510603588906, + "grad_norm": 0.7941884994506836, + "learning_rate": 4.743227626441139e-05, + "loss": 0.1303, + "num_input_tokens_seen": 30548304, + "step": 14160 + }, + { + "epoch": 2.310766721044046, + "grad_norm": 0.9225988388061523, + "learning_rate": 4.7429133197180264e-05, + "loss": 0.0475, + "num_input_tokens_seen": 30559728, + "step": 14165 + }, + { + "epoch": 2.3115823817292007, + "grad_norm": 0.3583184778690338, + "learning_rate": 4.7425988311729805e-05, + "loss": 0.0785, + "num_input_tokens_seen": 30569840, + "step": 14170 + }, + { + "epoch": 2.3123980424143555, + "grad_norm": 0.36166587471961975, + "learning_rate": 4.742284160831494e-05, + "loss": 0.1091, + "num_input_tokens_seen": 30580400, + "step": 14175 + }, + { + "epoch": 2.3132137030995104, + "grad_norm": 1.7419312000274658, + "learning_rate": 4.741969308719076e-05, + "loss": 0.2601, + "num_input_tokens_seen": 30594544, + "step": 14180 + }, + { + "epoch": 2.3140293637846656, + "grad_norm": 0.2997038662433624, + "learning_rate": 4.741654274861251e-05, + "loss": 0.095, + "num_input_tokens_seen": 30604912, + "step": 14185 + }, + { + "epoch": 2.3148450244698204, + "grad_norm": 0.4252568781375885, + "learning_rate": 4.741339059283556e-05, + "loss": 0.0789, + "num_input_tokens_seen": 30616400, + "step": 14190 + }, + { + "epoch": 2.3156606851549757, + "grad_norm": 0.467245489358902, + "learning_rate": 4.7410236620115444e-05, + "loss": 0.0799, + "num_input_tokens_seen": 30627824, + "step": 14195 + }, + { + "epoch": 2.3164763458401305, + "grad_norm": 0.9956581592559814, + "learning_rate": 4.740708083070784e-05, + "loss": 0.09, + "num_input_tokens_seen": 30638320, + "step": 14200 + }, + { + "epoch": 2.3172920065252853, + "grad_norm": 0.1976730227470398, + "learning_rate": 4.740392322486858e-05, + "loss": 0.2316, + "num_input_tokens_seen": 30649744, + "step": 14205 + }, + { + "epoch": 2.3181076672104406, + "grad_norm": 1.3490827083587646, + "learning_rate": 4.740076380285361e-05, + "loss": 0.2802, + "num_input_tokens_seen": 30661136, + "step": 14210 + }, + { + "epoch": 2.3189233278955954, + "grad_norm": 0.46415022015571594, + "learning_rate": 4.739760256491908e-05, + "loss": 0.099, + "num_input_tokens_seen": 30672368, + "step": 14215 + }, + { + "epoch": 2.3197389885807502, + "grad_norm": 1.4267038106918335, + "learning_rate": 4.7394439511321225e-05, + "loss": 0.1775, + "num_input_tokens_seen": 30684784, + "step": 14220 + }, + { + "epoch": 2.3205546492659055, + "grad_norm": 0.711530327796936, + "learning_rate": 4.7391274642316485e-05, + "loss": 0.077, + "num_input_tokens_seen": 30696432, + "step": 14225 + }, + { + "epoch": 2.3213703099510603, + "grad_norm": 0.6221045851707458, + "learning_rate": 4.7388107958161414e-05, + "loss": 0.2366, + "num_input_tokens_seen": 30706256, + "step": 14230 + }, + { + "epoch": 2.322185970636215, + "grad_norm": 1.8335391283035278, + "learning_rate": 4.738493945911271e-05, + "loss": 0.101, + "num_input_tokens_seen": 30716976, + "step": 14235 + }, + { + "epoch": 2.3230016313213704, + "grad_norm": 1.7002390623092651, + "learning_rate": 4.738176914542723e-05, + "loss": 0.2518, + "num_input_tokens_seen": 30726768, + "step": 14240 + }, + { + "epoch": 2.323817292006525, + "grad_norm": 1.8443176746368408, + "learning_rate": 4.737859701736199e-05, + "loss": 0.1196, + "num_input_tokens_seen": 30737072, + "step": 14245 + }, + { + "epoch": 2.3246329526916805, + "grad_norm": 0.7565651535987854, + "learning_rate": 4.737542307517413e-05, + "loss": 0.1377, + "num_input_tokens_seen": 30748400, + "step": 14250 + }, + { + "epoch": 2.3254486133768353, + "grad_norm": 1.7013037204742432, + "learning_rate": 4.737224731912093e-05, + "loss": 0.1524, + "num_input_tokens_seen": 30760240, + "step": 14255 + }, + { + "epoch": 2.32626427406199, + "grad_norm": 0.112869493663311, + "learning_rate": 4.736906974945986e-05, + "loss": 0.1127, + "num_input_tokens_seen": 30770928, + "step": 14260 + }, + { + "epoch": 2.3270799347471454, + "grad_norm": 1.2279354333877563, + "learning_rate": 4.736589036644848e-05, + "loss": 0.083, + "num_input_tokens_seen": 30780240, + "step": 14265 + }, + { + "epoch": 2.3278955954323, + "grad_norm": 0.09147945046424866, + "learning_rate": 4.736270917034456e-05, + "loss": 0.0712, + "num_input_tokens_seen": 30792144, + "step": 14270 + }, + { + "epoch": 2.328711256117455, + "grad_norm": 0.9014620780944824, + "learning_rate": 4.735952616140597e-05, + "loss": 0.1459, + "num_input_tokens_seen": 30802416, + "step": 14275 + }, + { + "epoch": 2.3295269168026103, + "grad_norm": 0.8106676936149597, + "learning_rate": 4.735634133989072e-05, + "loss": 0.141, + "num_input_tokens_seen": 30813872, + "step": 14280 + }, + { + "epoch": 2.330342577487765, + "grad_norm": 0.4920821487903595, + "learning_rate": 4.735315470605702e-05, + "loss": 0.0641, + "num_input_tokens_seen": 30825584, + "step": 14285 + }, + { + "epoch": 2.33115823817292, + "grad_norm": 0.7780404090881348, + "learning_rate": 4.734996626016317e-05, + "loss": 0.061, + "num_input_tokens_seen": 30836688, + "step": 14290 + }, + { + "epoch": 2.331973898858075, + "grad_norm": 0.25258079171180725, + "learning_rate": 4.7346776002467664e-05, + "loss": 0.1014, + "num_input_tokens_seen": 30848816, + "step": 14295 + }, + { + "epoch": 2.33278955954323, + "grad_norm": 1.1454508304595947, + "learning_rate": 4.73435839332291e-05, + "loss": 0.1582, + "num_input_tokens_seen": 30858608, + "step": 14300 + }, + { + "epoch": 2.3336052202283852, + "grad_norm": 0.1330079436302185, + "learning_rate": 4.734039005270625e-05, + "loss": 0.0914, + "num_input_tokens_seen": 30868560, + "step": 14305 + }, + { + "epoch": 2.33442088091354, + "grad_norm": 0.10161491483449936, + "learning_rate": 4.733719436115804e-05, + "loss": 0.0256, + "num_input_tokens_seen": 30878192, + "step": 14310 + }, + { + "epoch": 2.335236541598695, + "grad_norm": 0.15230320394039154, + "learning_rate": 4.733399685884351e-05, + "loss": 0.0646, + "num_input_tokens_seen": 30888432, + "step": 14315 + }, + { + "epoch": 2.3360522022838497, + "grad_norm": 0.107599638402462, + "learning_rate": 4.7330797546021876e-05, + "loss": 0.0904, + "num_input_tokens_seen": 30898864, + "step": 14320 + }, + { + "epoch": 2.336867862969005, + "grad_norm": 0.8647338151931763, + "learning_rate": 4.732759642295248e-05, + "loss": 0.1808, + "num_input_tokens_seen": 30909424, + "step": 14325 + }, + { + "epoch": 2.3376835236541598, + "grad_norm": 0.42740312218666077, + "learning_rate": 4.732439348989484e-05, + "loss": 0.0844, + "num_input_tokens_seen": 30919600, + "step": 14330 + }, + { + "epoch": 2.338499184339315, + "grad_norm": 0.03750680014491081, + "learning_rate": 4.732118874710858e-05, + "loss": 0.0472, + "num_input_tokens_seen": 30930416, + "step": 14335 + }, + { + "epoch": 2.33931484502447, + "grad_norm": 0.13734659552574158, + "learning_rate": 4.731798219485351e-05, + "loss": 0.0624, + "num_input_tokens_seen": 30940560, + "step": 14340 + }, + { + "epoch": 2.3401305057096247, + "grad_norm": 0.9373392462730408, + "learning_rate": 4.7314773833389567e-05, + "loss": 0.2276, + "num_input_tokens_seen": 30951440, + "step": 14345 + }, + { + "epoch": 2.34094616639478, + "grad_norm": 0.06662897765636444, + "learning_rate": 4.731156366297682e-05, + "loss": 0.1753, + "num_input_tokens_seen": 30961936, + "step": 14350 + }, + { + "epoch": 2.3417618270799347, + "grad_norm": 0.8662267327308655, + "learning_rate": 4.730835168387553e-05, + "loss": 0.1532, + "num_input_tokens_seen": 30972112, + "step": 14355 + }, + { + "epoch": 2.3425774877650896, + "grad_norm": 1.7627376317977905, + "learning_rate": 4.730513789634605e-05, + "loss": 0.1798, + "num_input_tokens_seen": 30984272, + "step": 14360 + }, + { + "epoch": 2.343393148450245, + "grad_norm": 0.2197243869304657, + "learning_rate": 4.7301922300648926e-05, + "loss": 0.2306, + "num_input_tokens_seen": 30996368, + "step": 14365 + }, + { + "epoch": 2.3442088091353996, + "grad_norm": 1.5141834020614624, + "learning_rate": 4.729870489704481e-05, + "loss": 0.2171, + "num_input_tokens_seen": 31007216, + "step": 14370 + }, + { + "epoch": 2.3450244698205545, + "grad_norm": 0.26824548840522766, + "learning_rate": 4.729548568579454e-05, + "loss": 0.0389, + "num_input_tokens_seen": 31018416, + "step": 14375 + }, + { + "epoch": 2.3458401305057097, + "grad_norm": 0.12216578423976898, + "learning_rate": 4.729226466715907e-05, + "loss": 0.0438, + "num_input_tokens_seen": 31030416, + "step": 14380 + }, + { + "epoch": 2.3466557911908645, + "grad_norm": 0.8907936811447144, + "learning_rate": 4.728904184139952e-05, + "loss": 0.088, + "num_input_tokens_seen": 31040752, + "step": 14385 + }, + { + "epoch": 2.34747145187602, + "grad_norm": 0.18704326450824738, + "learning_rate": 4.728581720877715e-05, + "loss": 0.1037, + "num_input_tokens_seen": 31052048, + "step": 14390 + }, + { + "epoch": 2.3482871125611746, + "grad_norm": 2.3546788692474365, + "learning_rate": 4.7282590769553346e-05, + "loss": 0.1808, + "num_input_tokens_seen": 31062320, + "step": 14395 + }, + { + "epoch": 2.3491027732463294, + "grad_norm": 1.8572067022323608, + "learning_rate": 4.727936252398969e-05, + "loss": 0.1432, + "num_input_tokens_seen": 31071856, + "step": 14400 + }, + { + "epoch": 2.3499184339314847, + "grad_norm": 0.018110258504748344, + "learning_rate": 4.727613247234785e-05, + "loss": 0.0269, + "num_input_tokens_seen": 31082096, + "step": 14405 + }, + { + "epoch": 2.3507340946166395, + "grad_norm": 0.8953160047531128, + "learning_rate": 4.727290061488969e-05, + "loss": 0.2294, + "num_input_tokens_seen": 31092400, + "step": 14410 + }, + { + "epoch": 2.3515497553017943, + "grad_norm": 0.22126181423664093, + "learning_rate": 4.726966695187719e-05, + "loss": 0.184, + "num_input_tokens_seen": 31102768, + "step": 14415 + }, + { + "epoch": 2.3523654159869496, + "grad_norm": 1.0845212936401367, + "learning_rate": 4.7266431483572495e-05, + "loss": 0.067, + "num_input_tokens_seen": 31113936, + "step": 14420 + }, + { + "epoch": 2.3531810766721044, + "grad_norm": 0.1874828338623047, + "learning_rate": 4.726319421023789e-05, + "loss": 0.0701, + "num_input_tokens_seen": 31123920, + "step": 14425 + }, + { + "epoch": 2.3539967373572592, + "grad_norm": 0.7864197492599487, + "learning_rate": 4.725995513213579e-05, + "loss": 0.2754, + "num_input_tokens_seen": 31135472, + "step": 14430 + }, + { + "epoch": 2.3548123980424145, + "grad_norm": 0.04870380461215973, + "learning_rate": 4.725671424952879e-05, + "loss": 0.1484, + "num_input_tokens_seen": 31145552, + "step": 14435 + }, + { + "epoch": 2.3556280587275693, + "grad_norm": 1.8649710416793823, + "learning_rate": 4.7253471562679594e-05, + "loss": 0.135, + "num_input_tokens_seen": 31157680, + "step": 14440 + }, + { + "epoch": 2.356443719412724, + "grad_norm": 0.3857799172401428, + "learning_rate": 4.725022707185109e-05, + "loss": 0.0794, + "num_input_tokens_seen": 31168720, + "step": 14445 + }, + { + "epoch": 2.3572593800978794, + "grad_norm": 1.9375417232513428, + "learning_rate": 4.724698077730628e-05, + "loss": 0.3399, + "num_input_tokens_seen": 31178832, + "step": 14450 + }, + { + "epoch": 2.358075040783034, + "grad_norm": 1.3214128017425537, + "learning_rate": 4.7243732679308325e-05, + "loss": 0.1427, + "num_input_tokens_seen": 31190288, + "step": 14455 + }, + { + "epoch": 2.358890701468189, + "grad_norm": 0.30480295419692993, + "learning_rate": 4.724048277812054e-05, + "loss": 0.1267, + "num_input_tokens_seen": 31202128, + "step": 14460 + }, + { + "epoch": 2.3597063621533443, + "grad_norm": 0.20748089253902435, + "learning_rate": 4.7237231074006374e-05, + "loss": 0.0703, + "num_input_tokens_seen": 31213232, + "step": 14465 + }, + { + "epoch": 2.360522022838499, + "grad_norm": 1.4566761255264282, + "learning_rate": 4.723397756722942e-05, + "loss": 0.2064, + "num_input_tokens_seen": 31224272, + "step": 14470 + }, + { + "epoch": 2.3613376835236544, + "grad_norm": 0.2989595830440521, + "learning_rate": 4.7230722258053434e-05, + "loss": 0.1191, + "num_input_tokens_seen": 31234064, + "step": 14475 + }, + { + "epoch": 2.362153344208809, + "grad_norm": 0.12316528707742691, + "learning_rate": 4.7227465146742304e-05, + "loss": 0.3132, + "num_input_tokens_seen": 31245776, + "step": 14480 + }, + { + "epoch": 2.362969004893964, + "grad_norm": 0.07490243762731552, + "learning_rate": 4.722420623356007e-05, + "loss": 0.1375, + "num_input_tokens_seen": 31257392, + "step": 14485 + }, + { + "epoch": 2.3637846655791193, + "grad_norm": 0.46264785528182983, + "learning_rate": 4.722094551877091e-05, + "loss": 0.0689, + "num_input_tokens_seen": 31267408, + "step": 14490 + }, + { + "epoch": 2.364600326264274, + "grad_norm": 0.6599071621894836, + "learning_rate": 4.7217683002639165e-05, + "loss": 0.1159, + "num_input_tokens_seen": 31278960, + "step": 14495 + }, + { + "epoch": 2.365415986949429, + "grad_norm": 1.388808250427246, + "learning_rate": 4.7214418685429295e-05, + "loss": 0.2899, + "num_input_tokens_seen": 31289264, + "step": 14500 + }, + { + "epoch": 2.366231647634584, + "grad_norm": 1.8117046356201172, + "learning_rate": 4.721115256740594e-05, + "loss": 0.1206, + "num_input_tokens_seen": 31299152, + "step": 14505 + }, + { + "epoch": 2.367047308319739, + "grad_norm": 0.6270184516906738, + "learning_rate": 4.720788464883385e-05, + "loss": 0.1178, + "num_input_tokens_seen": 31308944, + "step": 14510 + }, + { + "epoch": 2.367862969004894, + "grad_norm": 0.020576151087880135, + "learning_rate": 4.720461492997796e-05, + "loss": 0.089, + "num_input_tokens_seen": 31319824, + "step": 14515 + }, + { + "epoch": 2.368678629690049, + "grad_norm": 0.8532291650772095, + "learning_rate": 4.720134341110332e-05, + "loss": 0.1732, + "num_input_tokens_seen": 31332016, + "step": 14520 + }, + { + "epoch": 2.369494290375204, + "grad_norm": 1.643574833869934, + "learning_rate": 4.719807009247513e-05, + "loss": 0.2692, + "num_input_tokens_seen": 31342512, + "step": 14525 + }, + { + "epoch": 2.370309951060359, + "grad_norm": 0.1683911830186844, + "learning_rate": 4.7194794974358744e-05, + "loss": 0.057, + "num_input_tokens_seen": 31352528, + "step": 14530 + }, + { + "epoch": 2.371125611745514, + "grad_norm": 0.3722173571586609, + "learning_rate": 4.719151805701966e-05, + "loss": 0.0481, + "num_input_tokens_seen": 31361520, + "step": 14535 + }, + { + "epoch": 2.3719412724306688, + "grad_norm": 1.0543298721313477, + "learning_rate": 4.7188239340723526e-05, + "loss": 0.1555, + "num_input_tokens_seen": 31372304, + "step": 14540 + }, + { + "epoch": 2.3727569331158236, + "grad_norm": 0.5424573421478271, + "learning_rate": 4.7184958825736135e-05, + "loss": 0.1995, + "num_input_tokens_seen": 31382960, + "step": 14545 + }, + { + "epoch": 2.373572593800979, + "grad_norm": 1.3532980680465698, + "learning_rate": 4.718167651232341e-05, + "loss": 0.1998, + "num_input_tokens_seen": 31394128, + "step": 14550 + }, + { + "epoch": 2.3743882544861337, + "grad_norm": 0.6146543622016907, + "learning_rate": 4.7178392400751433e-05, + "loss": 0.0609, + "num_input_tokens_seen": 31405072, + "step": 14555 + }, + { + "epoch": 2.375203915171289, + "grad_norm": 0.4889201521873474, + "learning_rate": 4.7175106491286446e-05, + "loss": 0.061, + "num_input_tokens_seen": 31415280, + "step": 14560 + }, + { + "epoch": 2.3760195758564437, + "grad_norm": 0.6021022200584412, + "learning_rate": 4.717181878419481e-05, + "loss": 0.0528, + "num_input_tokens_seen": 31426384, + "step": 14565 + }, + { + "epoch": 2.3768352365415986, + "grad_norm": 0.4806508421897888, + "learning_rate": 4.7168529279743046e-05, + "loss": 0.0932, + "num_input_tokens_seen": 31436304, + "step": 14570 + }, + { + "epoch": 2.377650897226754, + "grad_norm": 0.1382075846195221, + "learning_rate": 4.716523797819781e-05, + "loss": 0.1305, + "num_input_tokens_seen": 31448240, + "step": 14575 + }, + { + "epoch": 2.3784665579119086, + "grad_norm": 0.04626614600419998, + "learning_rate": 4.716194487982592e-05, + "loss": 0.2382, + "num_input_tokens_seen": 31458928, + "step": 14580 + }, + { + "epoch": 2.3792822185970635, + "grad_norm": 0.34228023886680603, + "learning_rate": 4.715864998489433e-05, + "loss": 0.1472, + "num_input_tokens_seen": 31470480, + "step": 14585 + }, + { + "epoch": 2.3800978792822187, + "grad_norm": 0.18011803925037384, + "learning_rate": 4.715535329367014e-05, + "loss": 0.0707, + "num_input_tokens_seen": 31481968, + "step": 14590 + }, + { + "epoch": 2.3809135399673735, + "grad_norm": 0.5706026554107666, + "learning_rate": 4.71520548064206e-05, + "loss": 0.1317, + "num_input_tokens_seen": 31493360, + "step": 14595 + }, + { + "epoch": 2.3817292006525284, + "grad_norm": 1.7832030057907104, + "learning_rate": 4.71487545234131e-05, + "loss": 0.1606, + "num_input_tokens_seen": 31505552, + "step": 14600 + }, + { + "epoch": 2.3825448613376836, + "grad_norm": 0.046002451330423355, + "learning_rate": 4.7145452444915175e-05, + "loss": 0.0689, + "num_input_tokens_seen": 31516336, + "step": 14605 + }, + { + "epoch": 2.3833605220228384, + "grad_norm": 0.06458590179681778, + "learning_rate": 4.71421485711945e-05, + "loss": 0.0616, + "num_input_tokens_seen": 31527152, + "step": 14610 + }, + { + "epoch": 2.3841761827079937, + "grad_norm": 0.05975591391324997, + "learning_rate": 4.713884290251892e-05, + "loss": 0.0312, + "num_input_tokens_seen": 31536400, + "step": 14615 + }, + { + "epoch": 2.3849918433931485, + "grad_norm": 1.1592563390731812, + "learning_rate": 4.713553543915641e-05, + "loss": 0.0677, + "num_input_tokens_seen": 31547056, + "step": 14620 + }, + { + "epoch": 2.3858075040783033, + "grad_norm": 0.21437254548072815, + "learning_rate": 4.713222618137508e-05, + "loss": 0.1262, + "num_input_tokens_seen": 31557840, + "step": 14625 + }, + { + "epoch": 2.3866231647634586, + "grad_norm": 0.05477537214756012, + "learning_rate": 4.71289151294432e-05, + "loss": 0.0263, + "num_input_tokens_seen": 31569424, + "step": 14630 + }, + { + "epoch": 2.3874388254486134, + "grad_norm": 0.7810143828392029, + "learning_rate": 4.7125602283629166e-05, + "loss": 0.347, + "num_input_tokens_seen": 31579504, + "step": 14635 + }, + { + "epoch": 2.3882544861337682, + "grad_norm": 0.6122269034385681, + "learning_rate": 4.7122287644201556e-05, + "loss": 0.2281, + "num_input_tokens_seen": 31591120, + "step": 14640 + }, + { + "epoch": 2.3890701468189235, + "grad_norm": 0.5043045878410339, + "learning_rate": 4.711897121142906e-05, + "loss": 0.0413, + "num_input_tokens_seen": 31601968, + "step": 14645 + }, + { + "epoch": 2.3898858075040783, + "grad_norm": 0.21912886202335358, + "learning_rate": 4.711565298558053e-05, + "loss": 0.2151, + "num_input_tokens_seen": 31612944, + "step": 14650 + }, + { + "epoch": 2.390701468189233, + "grad_norm": 0.717047929763794, + "learning_rate": 4.711233296692495e-05, + "loss": 0.0281, + "num_input_tokens_seen": 31623472, + "step": 14655 + }, + { + "epoch": 2.3915171288743884, + "grad_norm": 0.3582729697227478, + "learning_rate": 4.7109011155731475e-05, + "loss": 0.1342, + "num_input_tokens_seen": 31635248, + "step": 14660 + }, + { + "epoch": 2.392332789559543, + "grad_norm": 1.228150725364685, + "learning_rate": 4.710568755226936e-05, + "loss": 0.238, + "num_input_tokens_seen": 31646320, + "step": 14665 + }, + { + "epoch": 2.393148450244698, + "grad_norm": 0.11674576252698898, + "learning_rate": 4.710236215680806e-05, + "loss": 0.1025, + "num_input_tokens_seen": 31658608, + "step": 14670 + }, + { + "epoch": 2.3939641109298533, + "grad_norm": 1.0883936882019043, + "learning_rate": 4.709903496961713e-05, + "loss": 0.1258, + "num_input_tokens_seen": 31669680, + "step": 14675 + }, + { + "epoch": 2.394779771615008, + "grad_norm": 0.6012787818908691, + "learning_rate": 4.7095705990966306e-05, + "loss": 0.243, + "num_input_tokens_seen": 31679696, + "step": 14680 + }, + { + "epoch": 2.395595432300163, + "grad_norm": 2.014378786087036, + "learning_rate": 4.709237522112543e-05, + "loss": 0.1934, + "num_input_tokens_seen": 31691472, + "step": 14685 + }, + { + "epoch": 2.396411092985318, + "grad_norm": 0.8689379692077637, + "learning_rate": 4.708904266036453e-05, + "loss": 0.1876, + "num_input_tokens_seen": 31701968, + "step": 14690 + }, + { + "epoch": 2.397226753670473, + "grad_norm": 0.11904332041740417, + "learning_rate": 4.7085708308953754e-05, + "loss": 0.2578, + "num_input_tokens_seen": 31713104, + "step": 14695 + }, + { + "epoch": 2.3980424143556283, + "grad_norm": 0.1358756422996521, + "learning_rate": 4.7082372167163394e-05, + "loss": 0.1179, + "num_input_tokens_seen": 31724560, + "step": 14700 + }, + { + "epoch": 2.398858075040783, + "grad_norm": 0.10790514200925827, + "learning_rate": 4.707903423526391e-05, + "loss": 0.0833, + "num_input_tokens_seen": 31733776, + "step": 14705 + }, + { + "epoch": 2.399673735725938, + "grad_norm": 0.32612109184265137, + "learning_rate": 4.707569451352588e-05, + "loss": 0.1272, + "num_input_tokens_seen": 31745680, + "step": 14710 + }, + { + "epoch": 2.400489396411093, + "grad_norm": 1.0225720405578613, + "learning_rate": 4.707235300222004e-05, + "loss": 0.1105, + "num_input_tokens_seen": 31756240, + "step": 14715 + }, + { + "epoch": 2.401305057096248, + "grad_norm": 0.2811700105667114, + "learning_rate": 4.706900970161727e-05, + "loss": 0.053, + "num_input_tokens_seen": 31767056, + "step": 14720 + }, + { + "epoch": 2.402120717781403, + "grad_norm": 0.4984097182750702, + "learning_rate": 4.7065664611988596e-05, + "loss": 0.0722, + "num_input_tokens_seen": 31777968, + "step": 14725 + }, + { + "epoch": 2.402936378466558, + "grad_norm": 0.333994060754776, + "learning_rate": 4.7062317733605185e-05, + "loss": 0.0887, + "num_input_tokens_seen": 31788976, + "step": 14730 + }, + { + "epoch": 2.403752039151713, + "grad_norm": 0.3533784747123718, + "learning_rate": 4.705896906673837e-05, + "loss": 0.0803, + "num_input_tokens_seen": 31799280, + "step": 14735 + }, + { + "epoch": 2.4045676998368677, + "grad_norm": 0.24654172360897064, + "learning_rate": 4.705561861165959e-05, + "loss": 0.1387, + "num_input_tokens_seen": 31810288, + "step": 14740 + }, + { + "epoch": 2.405383360522023, + "grad_norm": 0.4522712230682373, + "learning_rate": 4.705226636864045e-05, + "loss": 0.1037, + "num_input_tokens_seen": 31820880, + "step": 14745 + }, + { + "epoch": 2.4061990212071778, + "grad_norm": 0.19621139764785767, + "learning_rate": 4.704891233795271e-05, + "loss": 0.0606, + "num_input_tokens_seen": 31830704, + "step": 14750 + }, + { + "epoch": 2.407014681892333, + "grad_norm": 0.5182687640190125, + "learning_rate": 4.704555651986826e-05, + "loss": 0.1268, + "num_input_tokens_seen": 31842608, + "step": 14755 + }, + { + "epoch": 2.407830342577488, + "grad_norm": 0.10586079210042953, + "learning_rate": 4.704219891465914e-05, + "loss": 0.1398, + "num_input_tokens_seen": 31853456, + "step": 14760 + }, + { + "epoch": 2.4086460032626427, + "grad_norm": 0.3188978135585785, + "learning_rate": 4.703883952259754e-05, + "loss": 0.2535, + "num_input_tokens_seen": 31864240, + "step": 14765 + }, + { + "epoch": 2.4094616639477975, + "grad_norm": 0.3462498188018799, + "learning_rate": 4.7035478343955774e-05, + "loss": 0.1771, + "num_input_tokens_seen": 31874896, + "step": 14770 + }, + { + "epoch": 2.4102773246329527, + "grad_norm": 0.7719521522521973, + "learning_rate": 4.7032115379006337e-05, + "loss": 0.1068, + "num_input_tokens_seen": 31887120, + "step": 14775 + }, + { + "epoch": 2.4110929853181076, + "grad_norm": 2.0137553215026855, + "learning_rate": 4.7028750628021834e-05, + "loss": 0.1987, + "num_input_tokens_seen": 31899216, + "step": 14780 + }, + { + "epoch": 2.411908646003263, + "grad_norm": 2.035888195037842, + "learning_rate": 4.702538409127503e-05, + "loss": 0.1124, + "num_input_tokens_seen": 31911248, + "step": 14785 + }, + { + "epoch": 2.4127243066884176, + "grad_norm": 0.5035562515258789, + "learning_rate": 4.7022015769038844e-05, + "loss": 0.0826, + "num_input_tokens_seen": 31921264, + "step": 14790 + }, + { + "epoch": 2.4135399673735725, + "grad_norm": 0.4913134276866913, + "learning_rate": 4.701864566158631e-05, + "loss": 0.0518, + "num_input_tokens_seen": 31932208, + "step": 14795 + }, + { + "epoch": 2.4143556280587277, + "grad_norm": 1.8825125694274902, + "learning_rate": 4.701527376919064e-05, + "loss": 0.2025, + "num_input_tokens_seen": 31943792, + "step": 14800 + }, + { + "epoch": 2.4151712887438825, + "grad_norm": 0.04977178946137428, + "learning_rate": 4.701190009212518e-05, + "loss": 0.2783, + "num_input_tokens_seen": 31954128, + "step": 14805 + }, + { + "epoch": 2.4159869494290374, + "grad_norm": 0.8827961683273315, + "learning_rate": 4.700852463066341e-05, + "loss": 0.1346, + "num_input_tokens_seen": 31964176, + "step": 14810 + }, + { + "epoch": 2.4168026101141926, + "grad_norm": 0.48535624146461487, + "learning_rate": 4.7005147385078956e-05, + "loss": 0.0924, + "num_input_tokens_seen": 31975312, + "step": 14815 + }, + { + "epoch": 2.4176182707993474, + "grad_norm": 0.5695900321006775, + "learning_rate": 4.700176835564561e-05, + "loss": 0.1188, + "num_input_tokens_seen": 31985520, + "step": 14820 + }, + { + "epoch": 2.4184339314845023, + "grad_norm": 0.059951383620500565, + "learning_rate": 4.699838754263728e-05, + "loss": 0.025, + "num_input_tokens_seen": 31996656, + "step": 14825 + }, + { + "epoch": 2.4192495921696575, + "grad_norm": 0.6425120830535889, + "learning_rate": 4.6995004946328035e-05, + "loss": 0.0497, + "num_input_tokens_seen": 32005680, + "step": 14830 + }, + { + "epoch": 2.4200652528548123, + "grad_norm": 0.061006706207990646, + "learning_rate": 4.699162056699209e-05, + "loss": 0.0854, + "num_input_tokens_seen": 32017232, + "step": 14835 + }, + { + "epoch": 2.4208809135399676, + "grad_norm": 0.21986700594425201, + "learning_rate": 4.698823440490381e-05, + "loss": 0.1101, + "num_input_tokens_seen": 32028112, + "step": 14840 + }, + { + "epoch": 2.4216965742251224, + "grad_norm": 2.0775399208068848, + "learning_rate": 4.6984846460337664e-05, + "loss": 0.207, + "num_input_tokens_seen": 32038608, + "step": 14845 + }, + { + "epoch": 2.4225122349102772, + "grad_norm": 0.3241261839866638, + "learning_rate": 4.698145673356832e-05, + "loss": 0.167, + "num_input_tokens_seen": 32048880, + "step": 14850 + }, + { + "epoch": 2.4233278955954325, + "grad_norm": 1.0995069742202759, + "learning_rate": 4.697806522487056e-05, + "loss": 0.1945, + "num_input_tokens_seen": 32058864, + "step": 14855 + }, + { + "epoch": 2.4241435562805873, + "grad_norm": 0.10011187940835953, + "learning_rate": 4.697467193451932e-05, + "loss": 0.0647, + "num_input_tokens_seen": 32068816, + "step": 14860 + }, + { + "epoch": 2.424959216965742, + "grad_norm": 1.4558048248291016, + "learning_rate": 4.6971276862789674e-05, + "loss": 0.2524, + "num_input_tokens_seen": 32080848, + "step": 14865 + }, + { + "epoch": 2.4257748776508974, + "grad_norm": 1.086951732635498, + "learning_rate": 4.6967880009956845e-05, + "loss": 0.092, + "num_input_tokens_seen": 32091824, + "step": 14870 + }, + { + "epoch": 2.426590538336052, + "grad_norm": 0.7518462538719177, + "learning_rate": 4.69644813762962e-05, + "loss": 0.1219, + "num_input_tokens_seen": 32102448, + "step": 14875 + }, + { + "epoch": 2.427406199021207, + "grad_norm": 1.028389811515808, + "learning_rate": 4.696108096208324e-05, + "loss": 0.0658, + "num_input_tokens_seen": 32114000, + "step": 14880 + }, + { + "epoch": 2.4282218597063623, + "grad_norm": 0.28185686469078064, + "learning_rate": 4.695767876759363e-05, + "loss": 0.103, + "num_input_tokens_seen": 32126192, + "step": 14885 + }, + { + "epoch": 2.429037520391517, + "grad_norm": 1.017129898071289, + "learning_rate": 4.695427479310317e-05, + "loss": 0.0374, + "num_input_tokens_seen": 32136880, + "step": 14890 + }, + { + "epoch": 2.429853181076672, + "grad_norm": 1.0681246519088745, + "learning_rate": 4.6950869038887804e-05, + "loss": 0.1997, + "num_input_tokens_seen": 32147408, + "step": 14895 + }, + { + "epoch": 2.430668841761827, + "grad_norm": 0.6264070272445679, + "learning_rate": 4.6947461505223614e-05, + "loss": 0.2489, + "num_input_tokens_seen": 32158416, + "step": 14900 + }, + { + "epoch": 2.431484502446982, + "grad_norm": 0.49671533703804016, + "learning_rate": 4.6944052192386836e-05, + "loss": 0.2793, + "num_input_tokens_seen": 32169264, + "step": 14905 + }, + { + "epoch": 2.432300163132137, + "grad_norm": 0.11606377363204956, + "learning_rate": 4.6940641100653834e-05, + "loss": 0.1825, + "num_input_tokens_seen": 32181872, + "step": 14910 + }, + { + "epoch": 2.433115823817292, + "grad_norm": 0.31747573614120483, + "learning_rate": 4.693722823030114e-05, + "loss": 0.2824, + "num_input_tokens_seen": 32192816, + "step": 14915 + }, + { + "epoch": 2.433931484502447, + "grad_norm": 0.573392927646637, + "learning_rate": 4.693381358160543e-05, + "loss": 0.0804, + "num_input_tokens_seen": 32203952, + "step": 14920 + }, + { + "epoch": 2.434747145187602, + "grad_norm": 0.1746041625738144, + "learning_rate": 4.693039715484349e-05, + "loss": 0.1424, + "num_input_tokens_seen": 32215088, + "step": 14925 + }, + { + "epoch": 2.435562805872757, + "grad_norm": 0.13036420941352844, + "learning_rate": 4.692697895029229e-05, + "loss": 0.0601, + "num_input_tokens_seen": 32226256, + "step": 14930 + }, + { + "epoch": 2.436378466557912, + "grad_norm": 0.5963720083236694, + "learning_rate": 4.6923558968228906e-05, + "loss": 0.1875, + "num_input_tokens_seen": 32238096, + "step": 14935 + }, + { + "epoch": 2.437194127243067, + "grad_norm": 0.2690431475639343, + "learning_rate": 4.692013720893061e-05, + "loss": 0.1197, + "num_input_tokens_seen": 32247440, + "step": 14940 + }, + { + "epoch": 2.438009787928222, + "grad_norm": 0.10556929558515549, + "learning_rate": 4.691671367267476e-05, + "loss": 0.0333, + "num_input_tokens_seen": 32257488, + "step": 14945 + }, + { + "epoch": 2.4388254486133767, + "grad_norm": 0.23260244727134705, + "learning_rate": 4.6913288359738895e-05, + "loss": 0.0737, + "num_input_tokens_seen": 32268816, + "step": 14950 + }, + { + "epoch": 2.439641109298532, + "grad_norm": 0.47709760069847107, + "learning_rate": 4.690986127040069e-05, + "loss": 0.1521, + "num_input_tokens_seen": 32280048, + "step": 14955 + }, + { + "epoch": 2.4404567699836868, + "grad_norm": 0.7583755254745483, + "learning_rate": 4.690643240493797e-05, + "loss": 0.1006, + "num_input_tokens_seen": 32289680, + "step": 14960 + }, + { + "epoch": 2.4412724306688416, + "grad_norm": 0.2382224202156067, + "learning_rate": 4.690300176362867e-05, + "loss": 0.0595, + "num_input_tokens_seen": 32300656, + "step": 14965 + }, + { + "epoch": 2.442088091353997, + "grad_norm": 1.3572629690170288, + "learning_rate": 4.6899569346750924e-05, + "loss": 0.1253, + "num_input_tokens_seen": 32311888, + "step": 14970 + }, + { + "epoch": 2.4429037520391517, + "grad_norm": 0.3309600055217743, + "learning_rate": 4.689613515458297e-05, + "loss": 0.1592, + "num_input_tokens_seen": 32323248, + "step": 14975 + }, + { + "epoch": 2.443719412724307, + "grad_norm": 0.17021049559116364, + "learning_rate": 4.68926991874032e-05, + "loss": 0.0542, + "num_input_tokens_seen": 32333808, + "step": 14980 + }, + { + "epoch": 2.4445350734094617, + "grad_norm": 2.2337186336517334, + "learning_rate": 4.688926144549015e-05, + "loss": 0.2809, + "num_input_tokens_seen": 32345648, + "step": 14985 + }, + { + "epoch": 2.4453507340946166, + "grad_norm": 0.5330019593238831, + "learning_rate": 4.6885821929122497e-05, + "loss": 0.079, + "num_input_tokens_seen": 32356272, + "step": 14990 + }, + { + "epoch": 2.4461663947797714, + "grad_norm": 0.28825223445892334, + "learning_rate": 4.688238063857908e-05, + "loss": 0.2508, + "num_input_tokens_seen": 32366800, + "step": 14995 + }, + { + "epoch": 2.4469820554649266, + "grad_norm": 0.09029711037874222, + "learning_rate": 4.687893757413885e-05, + "loss": 0.2802, + "num_input_tokens_seen": 32377072, + "step": 15000 + }, + { + "epoch": 2.4477977161500815, + "grad_norm": 1.1115494966506958, + "learning_rate": 4.6875492736080935e-05, + "loss": 0.2219, + "num_input_tokens_seen": 32386640, + "step": 15005 + }, + { + "epoch": 2.4486133768352367, + "grad_norm": 1.1848427057266235, + "learning_rate": 4.687204612468458e-05, + "loss": 0.1258, + "num_input_tokens_seen": 32396496, + "step": 15010 + }, + { + "epoch": 2.4494290375203915, + "grad_norm": 0.575787365436554, + "learning_rate": 4.6868597740229186e-05, + "loss": 0.1294, + "num_input_tokens_seen": 32407824, + "step": 15015 + }, + { + "epoch": 2.4502446982055464, + "grad_norm": 0.1655394583940506, + "learning_rate": 4.68651475829943e-05, + "loss": 0.1352, + "num_input_tokens_seen": 32417968, + "step": 15020 + }, + { + "epoch": 2.4510603588907016, + "grad_norm": 0.13729065656661987, + "learning_rate": 4.686169565325961e-05, + "loss": 0.2296, + "num_input_tokens_seen": 32428240, + "step": 15025 + }, + { + "epoch": 2.4518760195758564, + "grad_norm": 0.6209558248519897, + "learning_rate": 4.685824195130495e-05, + "loss": 0.0654, + "num_input_tokens_seen": 32438704, + "step": 15030 + }, + { + "epoch": 2.4526916802610113, + "grad_norm": 0.7342514395713806, + "learning_rate": 4.6854786477410286e-05, + "loss": 0.1471, + "num_input_tokens_seen": 32449776, + "step": 15035 + }, + { + "epoch": 2.4535073409461665, + "grad_norm": 0.24594002962112427, + "learning_rate": 4.6851329231855736e-05, + "loss": 0.0347, + "num_input_tokens_seen": 32460240, + "step": 15040 + }, + { + "epoch": 2.4543230016313213, + "grad_norm": 0.07647109031677246, + "learning_rate": 4.6847870214921566e-05, + "loss": 0.0756, + "num_input_tokens_seen": 32469808, + "step": 15045 + }, + { + "epoch": 2.455138662316476, + "grad_norm": 0.8174765110015869, + "learning_rate": 4.6844409426888186e-05, + "loss": 0.0772, + "num_input_tokens_seen": 32481232, + "step": 15050 + }, + { + "epoch": 2.4559543230016314, + "grad_norm": 0.8108428120613098, + "learning_rate": 4.684094686803614e-05, + "loss": 0.2775, + "num_input_tokens_seen": 32492272, + "step": 15055 + }, + { + "epoch": 2.4567699836867862, + "grad_norm": 0.2994794249534607, + "learning_rate": 4.683748253864612e-05, + "loss": 0.1501, + "num_input_tokens_seen": 32503312, + "step": 15060 + }, + { + "epoch": 2.4575856443719415, + "grad_norm": 1.955729603767395, + "learning_rate": 4.6834016438998965e-05, + "loss": 0.1775, + "num_input_tokens_seen": 32513424, + "step": 15065 + }, + { + "epoch": 2.4584013050570963, + "grad_norm": 1.58058500289917, + "learning_rate": 4.6830548569375645e-05, + "loss": 0.1239, + "num_input_tokens_seen": 32525264, + "step": 15070 + }, + { + "epoch": 2.459216965742251, + "grad_norm": 0.7459841966629028, + "learning_rate": 4.68270789300573e-05, + "loss": 0.1625, + "num_input_tokens_seen": 32536592, + "step": 15075 + }, + { + "epoch": 2.4600326264274064, + "grad_norm": 0.22226285934448242, + "learning_rate": 4.682360752132518e-05, + "loss": 0.0605, + "num_input_tokens_seen": 32547344, + "step": 15080 + }, + { + "epoch": 2.460848287112561, + "grad_norm": 0.4719882011413574, + "learning_rate": 4.682013434346071e-05, + "loss": 0.0553, + "num_input_tokens_seen": 32558800, + "step": 15085 + }, + { + "epoch": 2.461663947797716, + "grad_norm": 1.493306279182434, + "learning_rate": 4.6816659396745424e-05, + "loss": 0.0858, + "num_input_tokens_seen": 32569392, + "step": 15090 + }, + { + "epoch": 2.4624796084828713, + "grad_norm": 0.05719485133886337, + "learning_rate": 4.6813182681461044e-05, + "loss": 0.1088, + "num_input_tokens_seen": 32579632, + "step": 15095 + }, + { + "epoch": 2.463295269168026, + "grad_norm": 0.3330918550491333, + "learning_rate": 4.680970419788939e-05, + "loss": 0.0779, + "num_input_tokens_seen": 32590128, + "step": 15100 + }, + { + "epoch": 2.464110929853181, + "grad_norm": 0.14283914864063263, + "learning_rate": 4.6806223946312455e-05, + "loss": 0.0902, + "num_input_tokens_seen": 32600656, + "step": 15105 + }, + { + "epoch": 2.464926590538336, + "grad_norm": 0.23892106115818024, + "learning_rate": 4.6802741927012363e-05, + "loss": 0.1035, + "num_input_tokens_seen": 32610928, + "step": 15110 + }, + { + "epoch": 2.465742251223491, + "grad_norm": 0.22045449912548065, + "learning_rate": 4.679925814027138e-05, + "loss": 0.0892, + "num_input_tokens_seen": 32621488, + "step": 15115 + }, + { + "epoch": 2.466557911908646, + "grad_norm": 0.44638726115226746, + "learning_rate": 4.6795772586371934e-05, + "loss": 0.1594, + "num_input_tokens_seen": 32633712, + "step": 15120 + }, + { + "epoch": 2.467373572593801, + "grad_norm": 0.05972493067383766, + "learning_rate": 4.679228526559656e-05, + "loss": 0.1735, + "num_input_tokens_seen": 32644560, + "step": 15125 + }, + { + "epoch": 2.468189233278956, + "grad_norm": 0.06434395164251328, + "learning_rate": 4.678879617822798e-05, + "loss": 0.0768, + "num_input_tokens_seen": 32655888, + "step": 15130 + }, + { + "epoch": 2.4690048939641107, + "grad_norm": 0.4070717394351959, + "learning_rate": 4.6785305324549016e-05, + "loss": 0.1025, + "num_input_tokens_seen": 32666832, + "step": 15135 + }, + { + "epoch": 2.469820554649266, + "grad_norm": 0.7208000421524048, + "learning_rate": 4.678181270484267e-05, + "loss": 0.0716, + "num_input_tokens_seen": 32677616, + "step": 15140 + }, + { + "epoch": 2.470636215334421, + "grad_norm": 1.3808907270431519, + "learning_rate": 4.677831831939207e-05, + "loss": 0.1082, + "num_input_tokens_seen": 32687696, + "step": 15145 + }, + { + "epoch": 2.471451876019576, + "grad_norm": 2.2134146690368652, + "learning_rate": 4.6774822168480476e-05, + "loss": 0.2151, + "num_input_tokens_seen": 32698096, + "step": 15150 + }, + { + "epoch": 2.472267536704731, + "grad_norm": 0.18488378822803497, + "learning_rate": 4.677132425239132e-05, + "loss": 0.0956, + "num_input_tokens_seen": 32707920, + "step": 15155 + }, + { + "epoch": 2.4730831973898857, + "grad_norm": 1.223698377609253, + "learning_rate": 4.676782457140815e-05, + "loss": 0.2074, + "num_input_tokens_seen": 32719472, + "step": 15160 + }, + { + "epoch": 2.473898858075041, + "grad_norm": 2.0003459453582764, + "learning_rate": 4.676432312581467e-05, + "loss": 0.1386, + "num_input_tokens_seen": 32730064, + "step": 15165 + }, + { + "epoch": 2.4747145187601958, + "grad_norm": 0.19623111188411713, + "learning_rate": 4.676081991589473e-05, + "loss": 0.1681, + "num_input_tokens_seen": 32740944, + "step": 15170 + }, + { + "epoch": 2.4755301794453506, + "grad_norm": 0.14517223834991455, + "learning_rate": 4.6757314941932315e-05, + "loss": 0.1064, + "num_input_tokens_seen": 32751856, + "step": 15175 + }, + { + "epoch": 2.476345840130506, + "grad_norm": 0.46274054050445557, + "learning_rate": 4.6753808204211554e-05, + "loss": 0.1718, + "num_input_tokens_seen": 32762928, + "step": 15180 + }, + { + "epoch": 2.4771615008156607, + "grad_norm": 0.8498165011405945, + "learning_rate": 4.675029970301672e-05, + "loss": 0.0623, + "num_input_tokens_seen": 32773840, + "step": 15185 + }, + { + "epoch": 2.4779771615008155, + "grad_norm": 0.41249653697013855, + "learning_rate": 4.674678943863223e-05, + "loss": 0.209, + "num_input_tokens_seen": 32786064, + "step": 15190 + }, + { + "epoch": 2.4787928221859707, + "grad_norm": 1.7439165115356445, + "learning_rate": 4.674327741134266e-05, + "loss": 0.1692, + "num_input_tokens_seen": 32796816, + "step": 15195 + }, + { + "epoch": 2.4796084828711256, + "grad_norm": 0.6460889577865601, + "learning_rate": 4.673976362143269e-05, + "loss": 0.2441, + "num_input_tokens_seen": 32807088, + "step": 15200 + }, + { + "epoch": 2.480424143556281, + "grad_norm": 3.0972390174865723, + "learning_rate": 4.673624806918717e-05, + "loss": 0.2705, + "num_input_tokens_seen": 32817424, + "step": 15205 + }, + { + "epoch": 2.4812398042414356, + "grad_norm": 1.1106595993041992, + "learning_rate": 4.673273075489109e-05, + "loss": 0.1589, + "num_input_tokens_seen": 32827984, + "step": 15210 + }, + { + "epoch": 2.4820554649265905, + "grad_norm": 0.7116230726242065, + "learning_rate": 4.6729211678829595e-05, + "loss": 0.1538, + "num_input_tokens_seen": 32839664, + "step": 15215 + }, + { + "epoch": 2.4828711256117453, + "grad_norm": 1.4497753381729126, + "learning_rate": 4.672569084128794e-05, + "loss": 0.2326, + "num_input_tokens_seen": 32851280, + "step": 15220 + }, + { + "epoch": 2.4836867862969005, + "grad_norm": 0.11486872285604477, + "learning_rate": 4.6722168242551554e-05, + "loss": 0.1311, + "num_input_tokens_seen": 32862224, + "step": 15225 + }, + { + "epoch": 2.4845024469820554, + "grad_norm": 0.09041553735733032, + "learning_rate": 4.671864388290599e-05, + "loss": 0.0291, + "num_input_tokens_seen": 32872816, + "step": 15230 + }, + { + "epoch": 2.4853181076672106, + "grad_norm": 0.9516128897666931, + "learning_rate": 4.671511776263696e-05, + "loss": 0.1999, + "num_input_tokens_seen": 32882896, + "step": 15235 + }, + { + "epoch": 2.4861337683523654, + "grad_norm": 0.6295701861381531, + "learning_rate": 4.67115898820303e-05, + "loss": 0.0964, + "num_input_tokens_seen": 32894256, + "step": 15240 + }, + { + "epoch": 2.4869494290375203, + "grad_norm": 0.05139046534895897, + "learning_rate": 4.6708060241372e-05, + "loss": 0.0922, + "num_input_tokens_seen": 32905392, + "step": 15245 + }, + { + "epoch": 2.4877650897226755, + "grad_norm": 0.12630115449428558, + "learning_rate": 4.670452884094819e-05, + "loss": 0.148, + "num_input_tokens_seen": 32916592, + "step": 15250 + }, + { + "epoch": 2.4885807504078303, + "grad_norm": 0.1252480000257492, + "learning_rate": 4.6700995681045144e-05, + "loss": 0.0338, + "num_input_tokens_seen": 32927216, + "step": 15255 + }, + { + "epoch": 2.489396411092985, + "grad_norm": 0.6140846014022827, + "learning_rate": 4.669746076194928e-05, + "loss": 0.0499, + "num_input_tokens_seen": 32937648, + "step": 15260 + }, + { + "epoch": 2.4902120717781404, + "grad_norm": 0.13702353835105896, + "learning_rate": 4.669392408394716e-05, + "loss": 0.1907, + "num_input_tokens_seen": 32948496, + "step": 15265 + }, + { + "epoch": 2.4910277324632952, + "grad_norm": 0.7108393311500549, + "learning_rate": 4.669038564732548e-05, + "loss": 0.1562, + "num_input_tokens_seen": 32958832, + "step": 15270 + }, + { + "epoch": 2.49184339314845, + "grad_norm": 1.1924853324890137, + "learning_rate": 4.668684545237107e-05, + "loss": 0.1917, + "num_input_tokens_seen": 32969456, + "step": 15275 + }, + { + "epoch": 2.4926590538336053, + "grad_norm": 0.05113023892045021, + "learning_rate": 4.668330349937093e-05, + "loss": 0.117, + "num_input_tokens_seen": 32980496, + "step": 15280 + }, + { + "epoch": 2.49347471451876, + "grad_norm": 0.5148568153381348, + "learning_rate": 4.6679759788612205e-05, + "loss": 0.1682, + "num_input_tokens_seen": 32991888, + "step": 15285 + }, + { + "epoch": 2.4942903752039154, + "grad_norm": 1.4265097379684448, + "learning_rate": 4.667621432038214e-05, + "loss": 0.0926, + "num_input_tokens_seen": 33002672, + "step": 15290 + }, + { + "epoch": 2.49510603588907, + "grad_norm": 1.5880223512649536, + "learning_rate": 4.6672667094968156e-05, + "loss": 0.115, + "num_input_tokens_seen": 33012720, + "step": 15295 + }, + { + "epoch": 2.495921696574225, + "grad_norm": 0.2819592356681824, + "learning_rate": 4.6669118112657814e-05, + "loss": 0.1535, + "num_input_tokens_seen": 33023760, + "step": 15300 + }, + { + "epoch": 2.4967373572593803, + "grad_norm": 0.8198356628417969, + "learning_rate": 4.666556737373881e-05, + "loss": 0.1627, + "num_input_tokens_seen": 33034192, + "step": 15305 + }, + { + "epoch": 2.497553017944535, + "grad_norm": 2.330695867538452, + "learning_rate": 4.666201487849898e-05, + "loss": 0.2342, + "num_input_tokens_seen": 33044080, + "step": 15310 + }, + { + "epoch": 2.49836867862969, + "grad_norm": 0.5785046815872192, + "learning_rate": 4.665846062722632e-05, + "loss": 0.075, + "num_input_tokens_seen": 33055184, + "step": 15315 + }, + { + "epoch": 2.499184339314845, + "grad_norm": 1.3179028034210205, + "learning_rate": 4.665490462020895e-05, + "loss": 0.1225, + "num_input_tokens_seen": 33065712, + "step": 15320 + }, + { + "epoch": 2.5, + "grad_norm": 0.7252205610275269, + "learning_rate": 4.665134685773513e-05, + "loss": 0.0945, + "num_input_tokens_seen": 33075856, + "step": 15325 + }, + { + "epoch": 2.5, + "eval_loss": 0.14460259675979614, + "eval_runtime": 132.04, + "eval_samples_per_second": 20.638, + "eval_steps_per_second": 5.165, + "num_input_tokens_seen": 33075856, + "step": 15325 + }, + { + "epoch": 2.500815660685155, + "grad_norm": 0.09335429221391678, + "learning_rate": 4.664778734009327e-05, + "loss": 0.032, + "num_input_tokens_seen": 33087312, + "step": 15330 + }, + { + "epoch": 2.50163132137031, + "grad_norm": 0.07399007678031921, + "learning_rate": 4.664422606757194e-05, + "loss": 0.1527, + "num_input_tokens_seen": 33098960, + "step": 15335 + }, + { + "epoch": 2.502446982055465, + "grad_norm": 0.183058962225914, + "learning_rate": 4.664066304045982e-05, + "loss": 0.1662, + "num_input_tokens_seen": 33109712, + "step": 15340 + }, + { + "epoch": 2.50326264274062, + "grad_norm": 1.4244166612625122, + "learning_rate": 4.6637098259045744e-05, + "loss": 0.1183, + "num_input_tokens_seen": 33121008, + "step": 15345 + }, + { + "epoch": 2.504078303425775, + "grad_norm": 0.2134961038827896, + "learning_rate": 4.66335317236187e-05, + "loss": 0.1491, + "num_input_tokens_seen": 33133360, + "step": 15350 + }, + { + "epoch": 2.50489396411093, + "grad_norm": 0.37712299823760986, + "learning_rate": 4.662996343446781e-05, + "loss": 0.0744, + "num_input_tokens_seen": 33144688, + "step": 15355 + }, + { + "epoch": 2.5057096247960846, + "grad_norm": 0.09144271165132523, + "learning_rate": 4.6626393391882326e-05, + "loss": 0.1208, + "num_input_tokens_seen": 33155024, + "step": 15360 + }, + { + "epoch": 2.50652528548124, + "grad_norm": 0.12681259214878082, + "learning_rate": 4.6622821596151676e-05, + "loss": 0.3281, + "num_input_tokens_seen": 33166576, + "step": 15365 + }, + { + "epoch": 2.5073409461663947, + "grad_norm": 0.5135420560836792, + "learning_rate": 4.6619248047565386e-05, + "loss": 0.1071, + "num_input_tokens_seen": 33176720, + "step": 15370 + }, + { + "epoch": 2.50815660685155, + "grad_norm": 1.862266182899475, + "learning_rate": 4.6615672746413156e-05, + "loss": 0.1288, + "num_input_tokens_seen": 33187824, + "step": 15375 + }, + { + "epoch": 2.5089722675367048, + "grad_norm": 0.38648244738578796, + "learning_rate": 4.661209569298482e-05, + "loss": 0.138, + "num_input_tokens_seen": 33198800, + "step": 15380 + }, + { + "epoch": 2.5097879282218596, + "grad_norm": 1.7704272270202637, + "learning_rate": 4.660851688757034e-05, + "loss": 0.1355, + "num_input_tokens_seen": 33209232, + "step": 15385 + }, + { + "epoch": 2.5106035889070144, + "grad_norm": 1.3502709865570068, + "learning_rate": 4.6604936330459845e-05, + "loss": 0.3126, + "num_input_tokens_seen": 33219760, + "step": 15390 + }, + { + "epoch": 2.5114192495921697, + "grad_norm": 0.17795352637767792, + "learning_rate": 4.660135402194359e-05, + "loss": 0.0951, + "num_input_tokens_seen": 33229040, + "step": 15395 + }, + { + "epoch": 2.5122349102773245, + "grad_norm": 0.24164390563964844, + "learning_rate": 4.6597769962311975e-05, + "loss": 0.095, + "num_input_tokens_seen": 33238960, + "step": 15400 + }, + { + "epoch": 2.5130505709624797, + "grad_norm": 1.6595830917358398, + "learning_rate": 4.6594184151855536e-05, + "loss": 0.2356, + "num_input_tokens_seen": 33247920, + "step": 15405 + }, + { + "epoch": 2.5138662316476346, + "grad_norm": 0.23753483593463898, + "learning_rate": 4.6590596590864966e-05, + "loss": 0.1379, + "num_input_tokens_seen": 33259504, + "step": 15410 + }, + { + "epoch": 2.5146818923327894, + "grad_norm": 0.08064033836126328, + "learning_rate": 4.658700727963109e-05, + "loss": 0.0757, + "num_input_tokens_seen": 33269872, + "step": 15415 + }, + { + "epoch": 2.5154975530179446, + "grad_norm": 0.39265960454940796, + "learning_rate": 4.6583416218444866e-05, + "loss": 0.0439, + "num_input_tokens_seen": 33282192, + "step": 15420 + }, + { + "epoch": 2.5163132137030995, + "grad_norm": 0.7837746739387512, + "learning_rate": 4.657982340759741e-05, + "loss": 0.1112, + "num_input_tokens_seen": 33293648, + "step": 15425 + }, + { + "epoch": 2.5171288743882547, + "grad_norm": 0.7057381272315979, + "learning_rate": 4.657622884737998e-05, + "loss": 0.2722, + "num_input_tokens_seen": 33304752, + "step": 15430 + }, + { + "epoch": 2.5179445350734095, + "grad_norm": 1.1379543542861938, + "learning_rate": 4.657263253808396e-05, + "loss": 0.1117, + "num_input_tokens_seen": 33315216, + "step": 15435 + }, + { + "epoch": 2.5187601957585644, + "grad_norm": 2.8032920360565186, + "learning_rate": 4.6569034480000887e-05, + "loss": 0.2415, + "num_input_tokens_seen": 33325392, + "step": 15440 + }, + { + "epoch": 2.519575856443719, + "grad_norm": 0.21161817014217377, + "learning_rate": 4.656543467342244e-05, + "loss": 0.1179, + "num_input_tokens_seen": 33336656, + "step": 15445 + }, + { + "epoch": 2.5203915171288744, + "grad_norm": 0.37448322772979736, + "learning_rate": 4.656183311864043e-05, + "loss": 0.1034, + "num_input_tokens_seen": 33347856, + "step": 15450 + }, + { + "epoch": 2.5212071778140293, + "grad_norm": 0.44176727533340454, + "learning_rate": 4.655822981594683e-05, + "loss": 0.1708, + "num_input_tokens_seen": 33358640, + "step": 15455 + }, + { + "epoch": 2.5220228384991845, + "grad_norm": 0.480294406414032, + "learning_rate": 4.6554624765633734e-05, + "loss": 0.1485, + "num_input_tokens_seen": 33369200, + "step": 15460 + }, + { + "epoch": 2.5228384991843393, + "grad_norm": 0.29898691177368164, + "learning_rate": 4.655101796799338e-05, + "loss": 0.1115, + "num_input_tokens_seen": 33378320, + "step": 15465 + }, + { + "epoch": 2.523654159869494, + "grad_norm": 0.08569514751434326, + "learning_rate": 4.654740942331818e-05, + "loss": 0.1261, + "num_input_tokens_seen": 33388976, + "step": 15470 + }, + { + "epoch": 2.5244698205546494, + "grad_norm": 0.33493202924728394, + "learning_rate": 4.6543799131900625e-05, + "loss": 0.1476, + "num_input_tokens_seen": 33399664, + "step": 15475 + }, + { + "epoch": 2.5252854812398042, + "grad_norm": 1.21054208278656, + "learning_rate": 4.6540187094033407e-05, + "loss": 0.176, + "num_input_tokens_seen": 33409616, + "step": 15480 + }, + { + "epoch": 2.5261011419249595, + "grad_norm": 0.2401600480079651, + "learning_rate": 4.6536573310009326e-05, + "loss": 0.0562, + "num_input_tokens_seen": 33420624, + "step": 15485 + }, + { + "epoch": 2.5269168026101143, + "grad_norm": 1.6534537076950073, + "learning_rate": 4.653295778012134e-05, + "loss": 0.133, + "num_input_tokens_seen": 33430960, + "step": 15490 + }, + { + "epoch": 2.527732463295269, + "grad_norm": 0.4734901189804077, + "learning_rate": 4.652934050466254e-05, + "loss": 0.0593, + "num_input_tokens_seen": 33443344, + "step": 15495 + }, + { + "epoch": 2.528548123980424, + "grad_norm": 0.39705759286880493, + "learning_rate": 4.652572148392616e-05, + "loss": 0.1062, + "num_input_tokens_seen": 33454032, + "step": 15500 + }, + { + "epoch": 2.529363784665579, + "grad_norm": 1.5619351863861084, + "learning_rate": 4.652210071820557e-05, + "loss": 0.1279, + "num_input_tokens_seen": 33465360, + "step": 15505 + }, + { + "epoch": 2.530179445350734, + "grad_norm": 0.19738373160362244, + "learning_rate": 4.6518478207794304e-05, + "loss": 0.0498, + "num_input_tokens_seen": 33476336, + "step": 15510 + }, + { + "epoch": 2.5309951060358893, + "grad_norm": 0.05532718822360039, + "learning_rate": 4.6514853952986e-05, + "loss": 0.0284, + "num_input_tokens_seen": 33486736, + "step": 15515 + }, + { + "epoch": 2.531810766721044, + "grad_norm": 0.7342164516448975, + "learning_rate": 4.6511227954074476e-05, + "loss": 0.0954, + "num_input_tokens_seen": 33497680, + "step": 15520 + }, + { + "epoch": 2.532626427406199, + "grad_norm": 1.6016055345535278, + "learning_rate": 4.650760021135366e-05, + "loss": 0.1713, + "num_input_tokens_seen": 33509456, + "step": 15525 + }, + { + "epoch": 2.5334420880913537, + "grad_norm": 1.9737138748168945, + "learning_rate": 4.650397072511765e-05, + "loss": 0.1379, + "num_input_tokens_seen": 33521040, + "step": 15530 + }, + { + "epoch": 2.534257748776509, + "grad_norm": 1.1645753383636475, + "learning_rate": 4.650033949566066e-05, + "loss": 0.1326, + "num_input_tokens_seen": 33531472, + "step": 15535 + }, + { + "epoch": 2.535073409461664, + "grad_norm": 0.3861691653728485, + "learning_rate": 4.6496706523277054e-05, + "loss": 0.1175, + "num_input_tokens_seen": 33541552, + "step": 15540 + }, + { + "epoch": 2.535889070146819, + "grad_norm": 0.27864256501197815, + "learning_rate": 4.649307180826136e-05, + "loss": 0.0616, + "num_input_tokens_seen": 33551920, + "step": 15545 + }, + { + "epoch": 2.536704730831974, + "grad_norm": 0.4444292485713959, + "learning_rate": 4.64894353509082e-05, + "loss": 0.3376, + "num_input_tokens_seen": 33562960, + "step": 15550 + }, + { + "epoch": 2.5375203915171287, + "grad_norm": 0.12517836689949036, + "learning_rate": 4.648579715151237e-05, + "loss": 0.1266, + "num_input_tokens_seen": 33572848, + "step": 15555 + }, + { + "epoch": 2.538336052202284, + "grad_norm": 1.0811668634414673, + "learning_rate": 4.648215721036881e-05, + "loss": 0.219, + "num_input_tokens_seen": 33582224, + "step": 15560 + }, + { + "epoch": 2.539151712887439, + "grad_norm": 0.5149428844451904, + "learning_rate": 4.647851552777258e-05, + "loss": 0.0403, + "num_input_tokens_seen": 33593680, + "step": 15565 + }, + { + "epoch": 2.539967373572594, + "grad_norm": 0.47650372982025146, + "learning_rate": 4.6474872104018907e-05, + "loss": 0.1603, + "num_input_tokens_seen": 33605616, + "step": 15570 + }, + { + "epoch": 2.540783034257749, + "grad_norm": 0.30104705691337585, + "learning_rate": 4.6471226939403145e-05, + "loss": 0.1442, + "num_input_tokens_seen": 33616752, + "step": 15575 + }, + { + "epoch": 2.5415986949429037, + "grad_norm": 0.12388172745704651, + "learning_rate": 4.646758003422077e-05, + "loss": 0.1351, + "num_input_tokens_seen": 33628048, + "step": 15580 + }, + { + "epoch": 2.5424143556280585, + "grad_norm": 0.6462289094924927, + "learning_rate": 4.646393138876745e-05, + "loss": 0.0379, + "num_input_tokens_seen": 33639280, + "step": 15585 + }, + { + "epoch": 2.5432300163132138, + "grad_norm": 0.680925190448761, + "learning_rate": 4.6460281003338924e-05, + "loss": 0.0724, + "num_input_tokens_seen": 33649808, + "step": 15590 + }, + { + "epoch": 2.5440456769983686, + "grad_norm": 0.09795048832893372, + "learning_rate": 4.6456628878231144e-05, + "loss": 0.2248, + "num_input_tokens_seen": 33658992, + "step": 15595 + }, + { + "epoch": 2.544861337683524, + "grad_norm": 0.19079826772212982, + "learning_rate": 4.645297501374015e-05, + "loss": 0.046, + "num_input_tokens_seen": 33669904, + "step": 15600 + }, + { + "epoch": 2.5456769983686787, + "grad_norm": 0.07357697933912277, + "learning_rate": 4.644931941016216e-05, + "loss": 0.14, + "num_input_tokens_seen": 33680080, + "step": 15605 + }, + { + "epoch": 2.5464926590538335, + "grad_norm": 0.10278824716806412, + "learning_rate": 4.644566206779349e-05, + "loss": 0.1192, + "num_input_tokens_seen": 33692208, + "step": 15610 + }, + { + "epoch": 2.5473083197389887, + "grad_norm": 0.3182622492313385, + "learning_rate": 4.6442002986930656e-05, + "loss": 0.2185, + "num_input_tokens_seen": 33703088, + "step": 15615 + }, + { + "epoch": 2.5481239804241436, + "grad_norm": 0.054895609617233276, + "learning_rate": 4.6438342167870255e-05, + "loss": 0.1831, + "num_input_tokens_seen": 33712944, + "step": 15620 + }, + { + "epoch": 2.5489396411092984, + "grad_norm": 0.06905942410230637, + "learning_rate": 4.643467961090906e-05, + "loss": 0.1182, + "num_input_tokens_seen": 33723408, + "step": 15625 + }, + { + "epoch": 2.5497553017944536, + "grad_norm": 0.7037061452865601, + "learning_rate": 4.643101531634399e-05, + "loss": 0.1148, + "num_input_tokens_seen": 33733488, + "step": 15630 + }, + { + "epoch": 2.5505709624796085, + "grad_norm": 0.06155296415090561, + "learning_rate": 4.642734928447207e-05, + "loss": 0.1175, + "num_input_tokens_seen": 33742640, + "step": 15635 + }, + { + "epoch": 2.5513866231647633, + "grad_norm": 0.2660094201564789, + "learning_rate": 4.642368151559049e-05, + "loss": 0.2637, + "num_input_tokens_seen": 33753584, + "step": 15640 + }, + { + "epoch": 2.5522022838499185, + "grad_norm": 0.24558298289775848, + "learning_rate": 4.642001200999659e-05, + "loss": 0.0279, + "num_input_tokens_seen": 33764656, + "step": 15645 + }, + { + "epoch": 2.5530179445350734, + "grad_norm": 0.12981551885604858, + "learning_rate": 4.6416340767987833e-05, + "loss": 0.1148, + "num_input_tokens_seen": 33775632, + "step": 15650 + }, + { + "epoch": 2.5538336052202286, + "grad_norm": 0.04761983081698418, + "learning_rate": 4.641266778986182e-05, + "loss": 0.0945, + "num_input_tokens_seen": 33787536, + "step": 15655 + }, + { + "epoch": 2.5546492659053834, + "grad_norm": 1.2215356826782227, + "learning_rate": 4.640899307591632e-05, + "loss": 0.1567, + "num_input_tokens_seen": 33798736, + "step": 15660 + }, + { + "epoch": 2.5554649265905383, + "grad_norm": 0.37796738743782043, + "learning_rate": 4.64053166264492e-05, + "loss": 0.0565, + "num_input_tokens_seen": 33810256, + "step": 15665 + }, + { + "epoch": 2.556280587275693, + "grad_norm": 0.47339576482772827, + "learning_rate": 4.640163844175852e-05, + "loss": 0.075, + "num_input_tokens_seen": 33820272, + "step": 15670 + }, + { + "epoch": 2.5570962479608483, + "grad_norm": 0.495961993932724, + "learning_rate": 4.6397958522142426e-05, + "loss": 0.1761, + "num_input_tokens_seen": 33830448, + "step": 15675 + }, + { + "epoch": 2.557911908646003, + "grad_norm": 0.6126897931098938, + "learning_rate": 4.639427686789924e-05, + "loss": 0.3655, + "num_input_tokens_seen": 33841808, + "step": 15680 + }, + { + "epoch": 2.5587275693311584, + "grad_norm": 0.17688283324241638, + "learning_rate": 4.6390593479327424e-05, + "loss": 0.1172, + "num_input_tokens_seen": 33851088, + "step": 15685 + }, + { + "epoch": 2.5595432300163132, + "grad_norm": 0.5221394896507263, + "learning_rate": 4.6386908356725564e-05, + "loss": 0.0489, + "num_input_tokens_seen": 33863600, + "step": 15690 + }, + { + "epoch": 2.560358890701468, + "grad_norm": 0.27286484837532043, + "learning_rate": 4.63832215003924e-05, + "loss": 0.1112, + "num_input_tokens_seen": 33873712, + "step": 15695 + }, + { + "epoch": 2.5611745513866233, + "grad_norm": 0.5750360488891602, + "learning_rate": 4.63795329106268e-05, + "loss": 0.1565, + "num_input_tokens_seen": 33884496, + "step": 15700 + }, + { + "epoch": 2.561990212071778, + "grad_norm": 1.9597563743591309, + "learning_rate": 4.637584258772779e-05, + "loss": 0.2458, + "num_input_tokens_seen": 33896176, + "step": 15705 + }, + { + "epoch": 2.5628058727569334, + "grad_norm": 0.18684956431388855, + "learning_rate": 4.637215053199451e-05, + "loss": 0.0893, + "num_input_tokens_seen": 33907760, + "step": 15710 + }, + { + "epoch": 2.563621533442088, + "grad_norm": 0.20780570805072784, + "learning_rate": 4.6368456743726276e-05, + "loss": 0.1696, + "num_input_tokens_seen": 33918864, + "step": 15715 + }, + { + "epoch": 2.564437194127243, + "grad_norm": 0.7037215828895569, + "learning_rate": 4.636476122322251e-05, + "loss": 0.0953, + "num_input_tokens_seen": 33929072, + "step": 15720 + }, + { + "epoch": 2.565252854812398, + "grad_norm": 0.10402847081422806, + "learning_rate": 4.636106397078279e-05, + "loss": 0.106, + "num_input_tokens_seen": 33939440, + "step": 15725 + }, + { + "epoch": 2.566068515497553, + "grad_norm": 0.7178180813789368, + "learning_rate": 4.635736498670685e-05, + "loss": 0.0681, + "num_input_tokens_seen": 33949168, + "step": 15730 + }, + { + "epoch": 2.566884176182708, + "grad_norm": 0.11257173120975494, + "learning_rate": 4.635366427129454e-05, + "loss": 0.0355, + "num_input_tokens_seen": 33958800, + "step": 15735 + }, + { + "epoch": 2.567699836867863, + "grad_norm": 0.21552638709545135, + "learning_rate": 4.634996182484584e-05, + "loss": 0.1341, + "num_input_tokens_seen": 33968144, + "step": 15740 + }, + { + "epoch": 2.568515497553018, + "grad_norm": 0.5312675833702087, + "learning_rate": 4.634625764766093e-05, + "loss": 0.1171, + "num_input_tokens_seen": 33979344, + "step": 15745 + }, + { + "epoch": 2.569331158238173, + "grad_norm": 1.0321571826934814, + "learning_rate": 4.6342551740040053e-05, + "loss": 0.1639, + "num_input_tokens_seen": 33990160, + "step": 15750 + }, + { + "epoch": 2.5701468189233276, + "grad_norm": 1.6210813522338867, + "learning_rate": 4.633884410228364e-05, + "loss": 0.1113, + "num_input_tokens_seen": 34000336, + "step": 15755 + }, + { + "epoch": 2.570962479608483, + "grad_norm": 0.41350024938583374, + "learning_rate": 4.633513473469225e-05, + "loss": 0.2525, + "num_input_tokens_seen": 34011696, + "step": 15760 + }, + { + "epoch": 2.5717781402936377, + "grad_norm": 0.6767852902412415, + "learning_rate": 4.633142363756658e-05, + "loss": 0.1419, + "num_input_tokens_seen": 34022256, + "step": 15765 + }, + { + "epoch": 2.572593800978793, + "grad_norm": 1.0389457941055298, + "learning_rate": 4.6327710811207486e-05, + "loss": 0.2378, + "num_input_tokens_seen": 34032848, + "step": 15770 + }, + { + "epoch": 2.573409461663948, + "grad_norm": 0.6689066290855408, + "learning_rate": 4.6323996255915936e-05, + "loss": 0.1781, + "num_input_tokens_seen": 34043888, + "step": 15775 + }, + { + "epoch": 2.5742251223491026, + "grad_norm": 1.5574312210083008, + "learning_rate": 4.6320279971993055e-05, + "loss": 0.3016, + "num_input_tokens_seen": 34055760, + "step": 15780 + }, + { + "epoch": 2.575040783034258, + "grad_norm": 0.6770592331886292, + "learning_rate": 4.631656195974009e-05, + "loss": 0.1215, + "num_input_tokens_seen": 34067536, + "step": 15785 + }, + { + "epoch": 2.5758564437194127, + "grad_norm": 0.296342134475708, + "learning_rate": 4.631284221945846e-05, + "loss": 0.0495, + "num_input_tokens_seen": 34078896, + "step": 15790 + }, + { + "epoch": 2.576672104404568, + "grad_norm": 0.08414090424776077, + "learning_rate": 4.6309120751449706e-05, + "loss": 0.0147, + "num_input_tokens_seen": 34089648, + "step": 15795 + }, + { + "epoch": 2.5774877650897228, + "grad_norm": 0.155085027217865, + "learning_rate": 4.63053975560155e-05, + "loss": 0.0801, + "num_input_tokens_seen": 34101392, + "step": 15800 + }, + { + "epoch": 2.5783034257748776, + "grad_norm": 0.5450553297996521, + "learning_rate": 4.630167263345766e-05, + "loss": 0.094, + "num_input_tokens_seen": 34111184, + "step": 15805 + }, + { + "epoch": 2.5791190864600324, + "grad_norm": 0.2866578698158264, + "learning_rate": 4.629794598407815e-05, + "loss": 0.1913, + "num_input_tokens_seen": 34121872, + "step": 15810 + }, + { + "epoch": 2.5799347471451877, + "grad_norm": 0.03585745766758919, + "learning_rate": 4.629421760817908e-05, + "loss": 0.0762, + "num_input_tokens_seen": 34132912, + "step": 15815 + }, + { + "epoch": 2.5807504078303425, + "grad_norm": 0.10831768810749054, + "learning_rate": 4.6290487506062685e-05, + "loss": 0.0852, + "num_input_tokens_seen": 34143632, + "step": 15820 + }, + { + "epoch": 2.5815660685154977, + "grad_norm": 0.12537044286727905, + "learning_rate": 4.6286755678031344e-05, + "loss": 0.0267, + "num_input_tokens_seen": 34154928, + "step": 15825 + }, + { + "epoch": 2.5823817292006526, + "grad_norm": 0.3769458532333374, + "learning_rate": 4.628302212438758e-05, + "loss": 0.1184, + "num_input_tokens_seen": 34165168, + "step": 15830 + }, + { + "epoch": 2.5831973898858074, + "grad_norm": 0.8016397356987, + "learning_rate": 4.627928684543406e-05, + "loss": 0.1541, + "num_input_tokens_seen": 34176304, + "step": 15835 + }, + { + "epoch": 2.5840130505709626, + "grad_norm": 0.24759170413017273, + "learning_rate": 4.627554984147357e-05, + "loss": 0.0602, + "num_input_tokens_seen": 34186928, + "step": 15840 + }, + { + "epoch": 2.5848287112561175, + "grad_norm": 1.3067725896835327, + "learning_rate": 4.627181111280906e-05, + "loss": 0.1243, + "num_input_tokens_seen": 34197712, + "step": 15845 + }, + { + "epoch": 2.5856443719412723, + "grad_norm": 0.43489405512809753, + "learning_rate": 4.6268070659743605e-05, + "loss": 0.0581, + "num_input_tokens_seen": 34208144, + "step": 15850 + }, + { + "epoch": 2.5864600326264275, + "grad_norm": 0.6601294279098511, + "learning_rate": 4.626432848258044e-05, + "loss": 0.2012, + "num_input_tokens_seen": 34219504, + "step": 15855 + }, + { + "epoch": 2.5872756933115824, + "grad_norm": 0.04888039082288742, + "learning_rate": 4.62605845816229e-05, + "loss": 0.2289, + "num_input_tokens_seen": 34230032, + "step": 15860 + }, + { + "epoch": 2.588091353996737, + "grad_norm": 0.6863899827003479, + "learning_rate": 4.625683895717451e-05, + "loss": 0.0876, + "num_input_tokens_seen": 34240400, + "step": 15865 + }, + { + "epoch": 2.5889070146818924, + "grad_norm": 0.4643964171409607, + "learning_rate": 4.62530916095389e-05, + "loss": 0.105, + "num_input_tokens_seen": 34252048, + "step": 15870 + }, + { + "epoch": 2.5897226753670473, + "grad_norm": 0.5524716973304749, + "learning_rate": 4.6249342539019844e-05, + "loss": 0.0746, + "num_input_tokens_seen": 34262832, + "step": 15875 + }, + { + "epoch": 2.5905383360522025, + "grad_norm": 0.5016497373580933, + "learning_rate": 4.6245591745921254e-05, + "loss": 0.0307, + "num_input_tokens_seen": 34274512, + "step": 15880 + }, + { + "epoch": 2.5913539967373573, + "grad_norm": 1.5911133289337158, + "learning_rate": 4.624183923054721e-05, + "loss": 0.135, + "num_input_tokens_seen": 34284240, + "step": 15885 + }, + { + "epoch": 2.592169657422512, + "grad_norm": 0.08369740098714828, + "learning_rate": 4.623808499320189e-05, + "loss": 0.0864, + "num_input_tokens_seen": 34293744, + "step": 15890 + }, + { + "epoch": 2.592985318107667, + "grad_norm": 0.3344554007053375, + "learning_rate": 4.623432903418965e-05, + "loss": 0.0686, + "num_input_tokens_seen": 34304144, + "step": 15895 + }, + { + "epoch": 2.5938009787928222, + "grad_norm": 0.008899877779185772, + "learning_rate": 4.6230571353814944e-05, + "loss": 0.2038, + "num_input_tokens_seen": 34314992, + "step": 15900 + }, + { + "epoch": 2.594616639477977, + "grad_norm": 0.9765336513519287, + "learning_rate": 4.622681195238241e-05, + "loss": 0.1803, + "num_input_tokens_seen": 34326768, + "step": 15905 + }, + { + "epoch": 2.5954323001631323, + "grad_norm": 0.5287724137306213, + "learning_rate": 4.622305083019679e-05, + "loss": 0.1379, + "num_input_tokens_seen": 34336080, + "step": 15910 + }, + { + "epoch": 2.596247960848287, + "grad_norm": 0.2865585684776306, + "learning_rate": 4.621928798756299e-05, + "loss": 0.0982, + "num_input_tokens_seen": 34347568, + "step": 15915 + }, + { + "epoch": 2.597063621533442, + "grad_norm": 2.188908338546753, + "learning_rate": 4.621552342478604e-05, + "loss": 0.2364, + "num_input_tokens_seen": 34358864, + "step": 15920 + }, + { + "epoch": 2.597879282218597, + "grad_norm": 0.6300801634788513, + "learning_rate": 4.6211757142171105e-05, + "loss": 0.0924, + "num_input_tokens_seen": 34371024, + "step": 15925 + }, + { + "epoch": 2.598694942903752, + "grad_norm": 0.1324428915977478, + "learning_rate": 4.620798914002352e-05, + "loss": 0.217, + "num_input_tokens_seen": 34381520, + "step": 15930 + }, + { + "epoch": 2.5995106035889073, + "grad_norm": 0.41740360856056213, + "learning_rate": 4.6204219418648724e-05, + "loss": 0.0191, + "num_input_tokens_seen": 34392880, + "step": 15935 + }, + { + "epoch": 2.600326264274062, + "grad_norm": 1.6882859468460083, + "learning_rate": 4.6200447978352315e-05, + "loss": 0.188, + "num_input_tokens_seen": 34404816, + "step": 15940 + }, + { + "epoch": 2.601141924959217, + "grad_norm": 0.763187050819397, + "learning_rate": 4.6196674819440015e-05, + "loss": 0.3311, + "num_input_tokens_seen": 34414896, + "step": 15945 + }, + { + "epoch": 2.6019575856443717, + "grad_norm": 1.806559443473816, + "learning_rate": 4.619289994221771e-05, + "loss": 0.145, + "num_input_tokens_seen": 34426480, + "step": 15950 + }, + { + "epoch": 2.602773246329527, + "grad_norm": 0.8962505459785461, + "learning_rate": 4.61891233469914e-05, + "loss": 0.0738, + "num_input_tokens_seen": 34437328, + "step": 15955 + }, + { + "epoch": 2.603588907014682, + "grad_norm": 0.5685733556747437, + "learning_rate": 4.618534503406724e-05, + "loss": 0.1389, + "num_input_tokens_seen": 34449584, + "step": 15960 + }, + { + "epoch": 2.604404567699837, + "grad_norm": 0.4446079730987549, + "learning_rate": 4.6181565003751525e-05, + "loss": 0.1331, + "num_input_tokens_seen": 34460592, + "step": 15965 + }, + { + "epoch": 2.605220228384992, + "grad_norm": 0.5192838311195374, + "learning_rate": 4.617778325635067e-05, + "loss": 0.0656, + "num_input_tokens_seen": 34472944, + "step": 15970 + }, + { + "epoch": 2.6060358890701467, + "grad_norm": 0.06653329730033875, + "learning_rate": 4.617399979217125e-05, + "loss": 0.0226, + "num_input_tokens_seen": 34483536, + "step": 15975 + }, + { + "epoch": 2.6068515497553015, + "grad_norm": 0.16525577008724213, + "learning_rate": 4.617021461151997e-05, + "loss": 0.1578, + "num_input_tokens_seen": 34495248, + "step": 15980 + }, + { + "epoch": 2.607667210440457, + "grad_norm": 1.2620959281921387, + "learning_rate": 4.616642771470367e-05, + "loss": 0.1126, + "num_input_tokens_seen": 34506832, + "step": 15985 + }, + { + "epoch": 2.6084828711256116, + "grad_norm": 0.07461298257112503, + "learning_rate": 4.616263910202936e-05, + "loss": 0.0843, + "num_input_tokens_seen": 34517168, + "step": 15990 + }, + { + "epoch": 2.609298531810767, + "grad_norm": 0.2591688632965088, + "learning_rate": 4.615884877380413e-05, + "loss": 0.085, + "num_input_tokens_seen": 34528272, + "step": 15995 + }, + { + "epoch": 2.6101141924959217, + "grad_norm": 0.132327601313591, + "learning_rate": 4.6155056730335274e-05, + "loss": 0.0201, + "num_input_tokens_seen": 34537904, + "step": 16000 + }, + { + "epoch": 2.6109298531810765, + "grad_norm": 0.414215087890625, + "learning_rate": 4.615126297193017e-05, + "loss": 0.1923, + "num_input_tokens_seen": 34548368, + "step": 16005 + }, + { + "epoch": 2.6117455138662318, + "grad_norm": 0.4052756428718567, + "learning_rate": 4.614746749889637e-05, + "loss": 0.1269, + "num_input_tokens_seen": 34559024, + "step": 16010 + }, + { + "epoch": 2.6125611745513866, + "grad_norm": 0.4717433452606201, + "learning_rate": 4.614367031154155e-05, + "loss": 0.1719, + "num_input_tokens_seen": 34570224, + "step": 16015 + }, + { + "epoch": 2.613376835236542, + "grad_norm": 2.031895399093628, + "learning_rate": 4.613987141017354e-05, + "loss": 0.0634, + "num_input_tokens_seen": 34581968, + "step": 16020 + }, + { + "epoch": 2.6141924959216967, + "grad_norm": 0.21029925346374512, + "learning_rate": 4.6136070795100285e-05, + "loss": 0.1471, + "num_input_tokens_seen": 34593392, + "step": 16025 + }, + { + "epoch": 2.6150081566068515, + "grad_norm": 0.7136368751525879, + "learning_rate": 4.613226846662989e-05, + "loss": 0.0423, + "num_input_tokens_seen": 34604176, + "step": 16030 + }, + { + "epoch": 2.6158238172920063, + "grad_norm": 0.6394966244697571, + "learning_rate": 4.6128464425070595e-05, + "loss": 0.0583, + "num_input_tokens_seen": 34615024, + "step": 16035 + }, + { + "epoch": 2.6166394779771616, + "grad_norm": 1.993090033531189, + "learning_rate": 4.612465867073076e-05, + "loss": 0.2008, + "num_input_tokens_seen": 34626160, + "step": 16040 + }, + { + "epoch": 2.6174551386623164, + "grad_norm": 1.21798837184906, + "learning_rate": 4.612085120391891e-05, + "loss": 0.0695, + "num_input_tokens_seen": 34635632, + "step": 16045 + }, + { + "epoch": 2.6182707993474716, + "grad_norm": 0.5826882123947144, + "learning_rate": 4.61170420249437e-05, + "loss": 0.0521, + "num_input_tokens_seen": 34646288, + "step": 16050 + }, + { + "epoch": 2.6190864600326265, + "grad_norm": 0.036645203828811646, + "learning_rate": 4.611323113411391e-05, + "loss": 0.3037, + "num_input_tokens_seen": 34657232, + "step": 16055 + }, + { + "epoch": 2.6199021207177813, + "grad_norm": 1.8137896060943604, + "learning_rate": 4.610941853173848e-05, + "loss": 0.2472, + "num_input_tokens_seen": 34667408, + "step": 16060 + }, + { + "epoch": 2.6207177814029365, + "grad_norm": 1.5853043794631958, + "learning_rate": 4.610560421812647e-05, + "loss": 0.1572, + "num_input_tokens_seen": 34677616, + "step": 16065 + }, + { + "epoch": 2.6215334420880914, + "grad_norm": 0.08630457520484924, + "learning_rate": 4.6101788193587103e-05, + "loss": 0.043, + "num_input_tokens_seen": 34688688, + "step": 16070 + }, + { + "epoch": 2.622349102773246, + "grad_norm": 0.16816207766532898, + "learning_rate": 4.609797045842972e-05, + "loss": 0.1064, + "num_input_tokens_seen": 34700272, + "step": 16075 + }, + { + "epoch": 2.6231647634584014, + "grad_norm": 0.4720619320869446, + "learning_rate": 4.609415101296379e-05, + "loss": 0.2784, + "num_input_tokens_seen": 34710832, + "step": 16080 + }, + { + "epoch": 2.6239804241435563, + "grad_norm": 1.5798563957214355, + "learning_rate": 4.609032985749895e-05, + "loss": 0.2568, + "num_input_tokens_seen": 34722224, + "step": 16085 + }, + { + "epoch": 2.624796084828711, + "grad_norm": 0.04864642769098282, + "learning_rate": 4.6086506992344956e-05, + "loss": 0.0684, + "num_input_tokens_seen": 34733136, + "step": 16090 + }, + { + "epoch": 2.6256117455138663, + "grad_norm": 0.15153317153453827, + "learning_rate": 4.608268241781172e-05, + "loss": 0.0906, + "num_input_tokens_seen": 34743408, + "step": 16095 + }, + { + "epoch": 2.626427406199021, + "grad_norm": 3.4676260948181152, + "learning_rate": 4.6078856134209284e-05, + "loss": 0.3451, + "num_input_tokens_seen": 34755184, + "step": 16100 + }, + { + "epoch": 2.6272430668841764, + "grad_norm": 1.2685582637786865, + "learning_rate": 4.6075028141847795e-05, + "loss": 0.0992, + "num_input_tokens_seen": 34765040, + "step": 16105 + }, + { + "epoch": 2.6280587275693312, + "grad_norm": 0.31038355827331543, + "learning_rate": 4.607119844103761e-05, + "loss": 0.1272, + "num_input_tokens_seen": 34775312, + "step": 16110 + }, + { + "epoch": 2.628874388254486, + "grad_norm": 0.6987268924713135, + "learning_rate": 4.606736703208916e-05, + "loss": 0.134, + "num_input_tokens_seen": 34784400, + "step": 16115 + }, + { + "epoch": 2.629690048939641, + "grad_norm": 0.21948063373565674, + "learning_rate": 4.606353391531304e-05, + "loss": 0.083, + "num_input_tokens_seen": 34796080, + "step": 16120 + }, + { + "epoch": 2.630505709624796, + "grad_norm": 0.4202493727207184, + "learning_rate": 4.605969909101998e-05, + "loss": 0.0205, + "num_input_tokens_seen": 34807280, + "step": 16125 + }, + { + "epoch": 2.631321370309951, + "grad_norm": 0.30618178844451904, + "learning_rate": 4.605586255952087e-05, + "loss": 0.0837, + "num_input_tokens_seen": 34818896, + "step": 16130 + }, + { + "epoch": 2.632137030995106, + "grad_norm": 0.3213944733142853, + "learning_rate": 4.6052024321126695e-05, + "loss": 0.0617, + "num_input_tokens_seen": 34830288, + "step": 16135 + }, + { + "epoch": 2.632952691680261, + "grad_norm": 0.15917760133743286, + "learning_rate": 4.6048184376148616e-05, + "loss": 0.0973, + "num_input_tokens_seen": 34840688, + "step": 16140 + }, + { + "epoch": 2.633768352365416, + "grad_norm": 1.5158135890960693, + "learning_rate": 4.6044342724897915e-05, + "loss": 0.1202, + "num_input_tokens_seen": 34852848, + "step": 16145 + }, + { + "epoch": 2.634584013050571, + "grad_norm": 1.9742897748947144, + "learning_rate": 4.604049936768601e-05, + "loss": 0.2881, + "num_input_tokens_seen": 34863728, + "step": 16150 + }, + { + "epoch": 2.635399673735726, + "grad_norm": 0.19183100759983063, + "learning_rate": 4.6036654304824464e-05, + "loss": 0.1483, + "num_input_tokens_seen": 34874672, + "step": 16155 + }, + { + "epoch": 2.636215334420881, + "grad_norm": 0.4226238429546356, + "learning_rate": 4.603280753662499e-05, + "loss": 0.0756, + "num_input_tokens_seen": 34885904, + "step": 16160 + }, + { + "epoch": 2.637030995106036, + "grad_norm": 0.542097270488739, + "learning_rate": 4.602895906339941e-05, + "loss": 0.1896, + "num_input_tokens_seen": 34896144, + "step": 16165 + }, + { + "epoch": 2.637846655791191, + "grad_norm": 2.1888411045074463, + "learning_rate": 4.6025108885459725e-05, + "loss": 0.2527, + "num_input_tokens_seen": 34906928, + "step": 16170 + }, + { + "epoch": 2.6386623164763456, + "grad_norm": 0.5332846641540527, + "learning_rate": 4.602125700311801e-05, + "loss": 0.0632, + "num_input_tokens_seen": 34917392, + "step": 16175 + }, + { + "epoch": 2.639477977161501, + "grad_norm": 0.16631820797920227, + "learning_rate": 4.6017403416686555e-05, + "loss": 0.0581, + "num_input_tokens_seen": 34928240, + "step": 16180 + }, + { + "epoch": 2.6402936378466557, + "grad_norm": 0.6476415991783142, + "learning_rate": 4.601354812647774e-05, + "loss": 0.0225, + "num_input_tokens_seen": 34939824, + "step": 16185 + }, + { + "epoch": 2.641109298531811, + "grad_norm": 0.4088219106197357, + "learning_rate": 4.600969113280409e-05, + "loss": 0.118, + "num_input_tokens_seen": 34950704, + "step": 16190 + }, + { + "epoch": 2.641924959216966, + "grad_norm": 0.28635913133621216, + "learning_rate": 4.6005832435978266e-05, + "loss": 0.1194, + "num_input_tokens_seen": 34962224, + "step": 16195 + }, + { + "epoch": 2.6427406199021206, + "grad_norm": 0.24838003516197205, + "learning_rate": 4.600197203631309e-05, + "loss": 0.1014, + "num_input_tokens_seen": 34973936, + "step": 16200 + }, + { + "epoch": 2.6435562805872754, + "grad_norm": 0.4665522873401642, + "learning_rate": 4.5998109934121494e-05, + "loss": 0.0416, + "num_input_tokens_seen": 34985296, + "step": 16205 + }, + { + "epoch": 2.6443719412724307, + "grad_norm": 1.907543659210205, + "learning_rate": 4.599424612971657e-05, + "loss": 0.226, + "num_input_tokens_seen": 34996528, + "step": 16210 + }, + { + "epoch": 2.6451876019575855, + "grad_norm": 1.5283608436584473, + "learning_rate": 4.599038062341153e-05, + "loss": 0.1545, + "num_input_tokens_seen": 35007056, + "step": 16215 + }, + { + "epoch": 2.6460032626427408, + "grad_norm": 0.9238367676734924, + "learning_rate": 4.598651341551973e-05, + "loss": 0.0961, + "num_input_tokens_seen": 35017744, + "step": 16220 + }, + { + "epoch": 2.6468189233278956, + "grad_norm": 0.7059518694877625, + "learning_rate": 4.5982644506354666e-05, + "loss": 0.3024, + "num_input_tokens_seen": 35029104, + "step": 16225 + }, + { + "epoch": 2.6476345840130504, + "grad_norm": 0.12639828026294708, + "learning_rate": 4.5978773896229977e-05, + "loss": 0.1562, + "num_input_tokens_seen": 35039632, + "step": 16230 + }, + { + "epoch": 2.6484502446982057, + "grad_norm": 0.9235341548919678, + "learning_rate": 4.597490158545943e-05, + "loss": 0.1684, + "num_input_tokens_seen": 35050704, + "step": 16235 + }, + { + "epoch": 2.6492659053833605, + "grad_norm": 0.0709356740117073, + "learning_rate": 4.5971027574356926e-05, + "loss": 0.1058, + "num_input_tokens_seen": 35060688, + "step": 16240 + }, + { + "epoch": 2.6500815660685157, + "grad_norm": 0.856269359588623, + "learning_rate": 4.5967151863236534e-05, + "loss": 0.0818, + "num_input_tokens_seen": 35071920, + "step": 16245 + }, + { + "epoch": 2.6508972267536706, + "grad_norm": 0.08339370787143707, + "learning_rate": 4.5963274452412416e-05, + "loss": 0.0369, + "num_input_tokens_seen": 35083056, + "step": 16250 + }, + { + "epoch": 2.6517128874388254, + "grad_norm": 0.4824131429195404, + "learning_rate": 4.595939534219891e-05, + "loss": 0.1273, + "num_input_tokens_seen": 35094320, + "step": 16255 + }, + { + "epoch": 2.65252854812398, + "grad_norm": 0.08939515799283981, + "learning_rate": 4.595551453291047e-05, + "loss": 0.1978, + "num_input_tokens_seen": 35104560, + "step": 16260 + }, + { + "epoch": 2.6533442088091355, + "grad_norm": 2.270421028137207, + "learning_rate": 4.5951632024861694e-05, + "loss": 0.2512, + "num_input_tokens_seen": 35115376, + "step": 16265 + }, + { + "epoch": 2.6541598694942903, + "grad_norm": 0.3267267644405365, + "learning_rate": 4.594774781836732e-05, + "loss": 0.1268, + "num_input_tokens_seen": 35126192, + "step": 16270 + }, + { + "epoch": 2.6549755301794455, + "grad_norm": 0.6081148982048035, + "learning_rate": 4.594386191374221e-05, + "loss": 0.1972, + "num_input_tokens_seen": 35137936, + "step": 16275 + }, + { + "epoch": 2.6557911908646004, + "grad_norm": 0.5752279162406921, + "learning_rate": 4.5939974311301406e-05, + "loss": 0.1344, + "num_input_tokens_seen": 35148560, + "step": 16280 + }, + { + "epoch": 2.656606851549755, + "grad_norm": 0.441112220287323, + "learning_rate": 4.593608501136002e-05, + "loss": 0.0778, + "num_input_tokens_seen": 35159632, + "step": 16285 + }, + { + "epoch": 2.6574225122349104, + "grad_norm": 0.09887488931417465, + "learning_rate": 4.5932194014233356e-05, + "loss": 0.1725, + "num_input_tokens_seen": 35170000, + "step": 16290 + }, + { + "epoch": 2.6582381729200653, + "grad_norm": 0.058287106454372406, + "learning_rate": 4.592830132023684e-05, + "loss": 0.0344, + "num_input_tokens_seen": 35181360, + "step": 16295 + }, + { + "epoch": 2.65905383360522, + "grad_norm": 0.22812767326831818, + "learning_rate": 4.592440692968602e-05, + "loss": 0.1311, + "num_input_tokens_seen": 35191600, + "step": 16300 + }, + { + "epoch": 2.6598694942903753, + "grad_norm": 0.2456549108028412, + "learning_rate": 4.5920510842896624e-05, + "loss": 0.045, + "num_input_tokens_seen": 35201136, + "step": 16305 + }, + { + "epoch": 2.66068515497553, + "grad_norm": 0.8075782656669617, + "learning_rate": 4.591661306018446e-05, + "loss": 0.0506, + "num_input_tokens_seen": 35211600, + "step": 16310 + }, + { + "epoch": 2.661500815660685, + "grad_norm": 0.16152742505073547, + "learning_rate": 4.591271358186551e-05, + "loss": 0.2012, + "num_input_tokens_seen": 35222416, + "step": 16315 + }, + { + "epoch": 2.6623164763458402, + "grad_norm": 0.6005259156227112, + "learning_rate": 4.5908812408255884e-05, + "loss": 0.1525, + "num_input_tokens_seen": 35233904, + "step": 16320 + }, + { + "epoch": 2.663132137030995, + "grad_norm": 0.670436680316925, + "learning_rate": 4.5904909539671836e-05, + "loss": 0.0927, + "num_input_tokens_seen": 35243984, + "step": 16325 + }, + { + "epoch": 2.6639477977161503, + "grad_norm": 0.30571600794792175, + "learning_rate": 4.590100497642975e-05, + "loss": 0.0621, + "num_input_tokens_seen": 35255216, + "step": 16330 + }, + { + "epoch": 2.664763458401305, + "grad_norm": 1.8096356391906738, + "learning_rate": 4.589709871884615e-05, + "loss": 0.2343, + "num_input_tokens_seen": 35265136, + "step": 16335 + }, + { + "epoch": 2.66557911908646, + "grad_norm": 0.06903491169214249, + "learning_rate": 4.58931907672377e-05, + "loss": 0.0513, + "num_input_tokens_seen": 35275600, + "step": 16340 + }, + { + "epoch": 2.6663947797716148, + "grad_norm": 0.9679623246192932, + "learning_rate": 4.588928112192119e-05, + "loss": 0.1391, + "num_input_tokens_seen": 35286928, + "step": 16345 + }, + { + "epoch": 2.66721044045677, + "grad_norm": 1.4216017723083496, + "learning_rate": 4.588536978321357e-05, + "loss": 0.1688, + "num_input_tokens_seen": 35299024, + "step": 16350 + }, + { + "epoch": 2.668026101141925, + "grad_norm": 0.10653942078351974, + "learning_rate": 4.588145675143189e-05, + "loss": 0.0439, + "num_input_tokens_seen": 35310320, + "step": 16355 + }, + { + "epoch": 2.66884176182708, + "grad_norm": 0.10458323359489441, + "learning_rate": 4.5877542026893395e-05, + "loss": 0.0861, + "num_input_tokens_seen": 35321104, + "step": 16360 + }, + { + "epoch": 2.669657422512235, + "grad_norm": 1.0414234399795532, + "learning_rate": 4.5873625609915393e-05, + "loss": 0.1775, + "num_input_tokens_seen": 35332688, + "step": 16365 + }, + { + "epoch": 2.6704730831973897, + "grad_norm": 0.14578625559806824, + "learning_rate": 4.58697075008154e-05, + "loss": 0.1651, + "num_input_tokens_seen": 35342416, + "step": 16370 + }, + { + "epoch": 2.671288743882545, + "grad_norm": 0.2125830054283142, + "learning_rate": 4.586578769991102e-05, + "loss": 0.0928, + "num_input_tokens_seen": 35353616, + "step": 16375 + }, + { + "epoch": 2.6721044045677, + "grad_norm": 0.7333959937095642, + "learning_rate": 4.586186620752001e-05, + "loss": 0.0849, + "num_input_tokens_seen": 35364304, + "step": 16380 + }, + { + "epoch": 2.672920065252855, + "grad_norm": 0.3010815680027008, + "learning_rate": 4.585794302396028e-05, + "loss": 0.1479, + "num_input_tokens_seen": 35374992, + "step": 16385 + }, + { + "epoch": 2.67373572593801, + "grad_norm": 1.923797845840454, + "learning_rate": 4.585401814954986e-05, + "loss": 0.182, + "num_input_tokens_seen": 35386544, + "step": 16390 + }, + { + "epoch": 2.6745513866231647, + "grad_norm": 0.07290996611118317, + "learning_rate": 4.5850091584606906e-05, + "loss": 0.2067, + "num_input_tokens_seen": 35397616, + "step": 16395 + }, + { + "epoch": 2.6753670473083195, + "grad_norm": 0.22814522683620453, + "learning_rate": 4.5846163329449745e-05, + "loss": 0.2687, + "num_input_tokens_seen": 35407408, + "step": 16400 + }, + { + "epoch": 2.676182707993475, + "grad_norm": 0.5660831928253174, + "learning_rate": 4.584223338439681e-05, + "loss": 0.0332, + "num_input_tokens_seen": 35418512, + "step": 16405 + }, + { + "epoch": 2.6769983686786296, + "grad_norm": 1.0347036123275757, + "learning_rate": 4.583830174976669e-05, + "loss": 0.2343, + "num_input_tokens_seen": 35429712, + "step": 16410 + }, + { + "epoch": 2.677814029363785, + "grad_norm": 0.8323891162872314, + "learning_rate": 4.5834368425878085e-05, + "loss": 0.2077, + "num_input_tokens_seen": 35440560, + "step": 16415 + }, + { + "epoch": 2.6786296900489397, + "grad_norm": 0.40187332034111023, + "learning_rate": 4.583043341304987e-05, + "loss": 0.2169, + "num_input_tokens_seen": 35451504, + "step": 16420 + }, + { + "epoch": 2.6794453507340945, + "grad_norm": 0.7098586559295654, + "learning_rate": 4.582649671160103e-05, + "loss": 0.1784, + "num_input_tokens_seen": 35462640, + "step": 16425 + }, + { + "epoch": 2.6802610114192493, + "grad_norm": 0.05750785022974014, + "learning_rate": 4.5822558321850696e-05, + "loss": 0.1002, + "num_input_tokens_seen": 35474160, + "step": 16430 + }, + { + "epoch": 2.6810766721044046, + "grad_norm": 0.605217456817627, + "learning_rate": 4.5818618244118126e-05, + "loss": 0.1207, + "num_input_tokens_seen": 35485200, + "step": 16435 + }, + { + "epoch": 2.6818923327895594, + "grad_norm": 0.33244821429252625, + "learning_rate": 4.581467647872273e-05, + "loss": 0.466, + "num_input_tokens_seen": 35496176, + "step": 16440 + }, + { + "epoch": 2.6827079934747147, + "grad_norm": 0.1863093376159668, + "learning_rate": 4.5810733025984045e-05, + "loss": 0.1287, + "num_input_tokens_seen": 35506960, + "step": 16445 + }, + { + "epoch": 2.6835236541598695, + "grad_norm": 1.217970609664917, + "learning_rate": 4.580678788622176e-05, + "loss": 0.1918, + "num_input_tokens_seen": 35517744, + "step": 16450 + }, + { + "epoch": 2.6843393148450243, + "grad_norm": 1.358182430267334, + "learning_rate": 4.580284105975566e-05, + "loss": 0.2966, + "num_input_tokens_seen": 35529680, + "step": 16455 + }, + { + "epoch": 2.6851549755301796, + "grad_norm": 0.14840909838676453, + "learning_rate": 4.5798892546905726e-05, + "loss": 0.1057, + "num_input_tokens_seen": 35540496, + "step": 16460 + }, + { + "epoch": 2.6859706362153344, + "grad_norm": 1.3857533931732178, + "learning_rate": 4.579494234799202e-05, + "loss": 0.171, + "num_input_tokens_seen": 35552176, + "step": 16465 + }, + { + "epoch": 2.6867862969004896, + "grad_norm": 0.08094238489866257, + "learning_rate": 4.579099046333477e-05, + "loss": 0.0394, + "num_input_tokens_seen": 35563440, + "step": 16470 + }, + { + "epoch": 2.6876019575856445, + "grad_norm": 0.5446646213531494, + "learning_rate": 4.5787036893254355e-05, + "loss": 0.0376, + "num_input_tokens_seen": 35574256, + "step": 16475 + }, + { + "epoch": 2.6884176182707993, + "grad_norm": 0.9475445747375488, + "learning_rate": 4.578308163807125e-05, + "loss": 0.0906, + "num_input_tokens_seen": 35584368, + "step": 16480 + }, + { + "epoch": 2.689233278955954, + "grad_norm": 1.0300689935684204, + "learning_rate": 4.577912469810609e-05, + "loss": 0.1341, + "num_input_tokens_seen": 35594896, + "step": 16485 + }, + { + "epoch": 2.6900489396411094, + "grad_norm": 0.6346067786216736, + "learning_rate": 4.577516607367965e-05, + "loss": 0.1287, + "num_input_tokens_seen": 35605488, + "step": 16490 + }, + { + "epoch": 2.690864600326264, + "grad_norm": 1.5552700757980347, + "learning_rate": 4.577120576511285e-05, + "loss": 0.0768, + "num_input_tokens_seen": 35617616, + "step": 16495 + }, + { + "epoch": 2.6916802610114194, + "grad_norm": 0.23210765421390533, + "learning_rate": 4.5767243772726706e-05, + "loss": 0.1825, + "num_input_tokens_seen": 35628304, + "step": 16500 + }, + { + "epoch": 2.6924959216965743, + "grad_norm": 0.24755330383777618, + "learning_rate": 4.576328009684241e-05, + "loss": 0.1577, + "num_input_tokens_seen": 35640016, + "step": 16505 + }, + { + "epoch": 2.693311582381729, + "grad_norm": 1.3579243421554565, + "learning_rate": 4.5759314737781275e-05, + "loss": 0.1833, + "num_input_tokens_seen": 35651632, + "step": 16510 + }, + { + "epoch": 2.6941272430668843, + "grad_norm": 0.8150209188461304, + "learning_rate": 4.575534769586477e-05, + "loss": 0.0414, + "num_input_tokens_seen": 35661200, + "step": 16515 + }, + { + "epoch": 2.694942903752039, + "grad_norm": 1.5602037906646729, + "learning_rate": 4.575137897141446e-05, + "loss": 0.1734, + "num_input_tokens_seen": 35671472, + "step": 16520 + }, + { + "epoch": 2.695758564437194, + "grad_norm": 0.8422805666923523, + "learning_rate": 4.574740856475207e-05, + "loss": 0.1404, + "num_input_tokens_seen": 35682480, + "step": 16525 + }, + { + "epoch": 2.6965742251223492, + "grad_norm": 1.7045531272888184, + "learning_rate": 4.574343647619949e-05, + "loss": 0.2981, + "num_input_tokens_seen": 35693168, + "step": 16530 + }, + { + "epoch": 2.697389885807504, + "grad_norm": 0.4193389117717743, + "learning_rate": 4.573946270607868e-05, + "loss": 0.0868, + "num_input_tokens_seen": 35704624, + "step": 16535 + }, + { + "epoch": 2.698205546492659, + "grad_norm": 0.38796287775039673, + "learning_rate": 4.573548725471181e-05, + "loss": 0.0907, + "num_input_tokens_seen": 35714768, + "step": 16540 + }, + { + "epoch": 2.699021207177814, + "grad_norm": 0.13406537473201752, + "learning_rate": 4.573151012242112e-05, + "loss": 0.0986, + "num_input_tokens_seen": 35724240, + "step": 16545 + }, + { + "epoch": 2.699836867862969, + "grad_norm": 0.45840421319007874, + "learning_rate": 4.5727531309529024e-05, + "loss": 0.0731, + "num_input_tokens_seen": 35735472, + "step": 16550 + }, + { + "epoch": 2.700652528548124, + "grad_norm": 0.04955285042524338, + "learning_rate": 4.5723550816358076e-05, + "loss": 0.0663, + "num_input_tokens_seen": 35747376, + "step": 16555 + }, + { + "epoch": 2.701468189233279, + "grad_norm": 0.08433847874403, + "learning_rate": 4.571956864323095e-05, + "loss": 0.0269, + "num_input_tokens_seen": 35758960, + "step": 16560 + }, + { + "epoch": 2.702283849918434, + "grad_norm": 1.8160960674285889, + "learning_rate": 4.571558479047046e-05, + "loss": 0.2026, + "num_input_tokens_seen": 35770512, + "step": 16565 + }, + { + "epoch": 2.7030995106035887, + "grad_norm": 0.12067871540784836, + "learning_rate": 4.571159925839956e-05, + "loss": 0.1944, + "num_input_tokens_seen": 35781424, + "step": 16570 + }, + { + "epoch": 2.703915171288744, + "grad_norm": 1.2132900953292847, + "learning_rate": 4.570761204734133e-05, + "loss": 0.1535, + "num_input_tokens_seen": 35793200, + "step": 16575 + }, + { + "epoch": 2.7047308319738987, + "grad_norm": 0.9339560866355896, + "learning_rate": 4.5703623157619e-05, + "loss": 0.0487, + "num_input_tokens_seen": 35804880, + "step": 16580 + }, + { + "epoch": 2.705546492659054, + "grad_norm": 0.15661920607089996, + "learning_rate": 4.5699632589555924e-05, + "loss": 0.2616, + "num_input_tokens_seen": 35815376, + "step": 16585 + }, + { + "epoch": 2.706362153344209, + "grad_norm": 0.7422894239425659, + "learning_rate": 4.569564034347561e-05, + "loss": 0.2518, + "num_input_tokens_seen": 35825008, + "step": 16590 + }, + { + "epoch": 2.7071778140293636, + "grad_norm": 0.7921921610832214, + "learning_rate": 4.5691646419701675e-05, + "loss": 0.1044, + "num_input_tokens_seen": 35834384, + "step": 16595 + }, + { + "epoch": 2.707993474714519, + "grad_norm": 2.0096499919891357, + "learning_rate": 4.5687650818557884e-05, + "loss": 0.1594, + "num_input_tokens_seen": 35844752, + "step": 16600 + }, + { + "epoch": 2.7088091353996737, + "grad_norm": 0.06112353503704071, + "learning_rate": 4.568365354036816e-05, + "loss": 0.1371, + "num_input_tokens_seen": 35855664, + "step": 16605 + }, + { + "epoch": 2.709624796084829, + "grad_norm": 0.22191616892814636, + "learning_rate": 4.567965458545653e-05, + "loss": 0.0413, + "num_input_tokens_seen": 35867472, + "step": 16610 + }, + { + "epoch": 2.710440456769984, + "grad_norm": 0.14753541350364685, + "learning_rate": 4.5675653954147174e-05, + "loss": 0.1204, + "num_input_tokens_seen": 35878544, + "step": 16615 + }, + { + "epoch": 2.7112561174551386, + "grad_norm": 0.6620187163352966, + "learning_rate": 4.56716516467644e-05, + "loss": 0.1318, + "num_input_tokens_seen": 35890704, + "step": 16620 + }, + { + "epoch": 2.7120717781402934, + "grad_norm": 0.06728707998991013, + "learning_rate": 4.5667647663632653e-05, + "loss": 0.065, + "num_input_tokens_seen": 35901424, + "step": 16625 + }, + { + "epoch": 2.7128874388254487, + "grad_norm": 1.1215720176696777, + "learning_rate": 4.566364200507652e-05, + "loss": 0.2785, + "num_input_tokens_seen": 35912688, + "step": 16630 + }, + { + "epoch": 2.7137030995106035, + "grad_norm": 1.1600323915481567, + "learning_rate": 4.565963467142073e-05, + "loss": 0.1513, + "num_input_tokens_seen": 35924272, + "step": 16635 + }, + { + "epoch": 2.7145187601957588, + "grad_norm": 0.21545198559761047, + "learning_rate": 4.565562566299012e-05, + "loss": 0.1675, + "num_input_tokens_seen": 35935984, + "step": 16640 + }, + { + "epoch": 2.7153344208809136, + "grad_norm": 0.5861642360687256, + "learning_rate": 4.5651614980109684e-05, + "loss": 0.259, + "num_input_tokens_seen": 35945840, + "step": 16645 + }, + { + "epoch": 2.7161500815660684, + "grad_norm": 0.39166441559791565, + "learning_rate": 4.564760262310456e-05, + "loss": 0.1378, + "num_input_tokens_seen": 35956336, + "step": 16650 + }, + { + "epoch": 2.7169657422512232, + "grad_norm": 0.08788899332284927, + "learning_rate": 4.5643588592300004e-05, + "loss": 0.0581, + "num_input_tokens_seen": 35967664, + "step": 16655 + }, + { + "epoch": 2.7177814029363785, + "grad_norm": 0.629030168056488, + "learning_rate": 4.56395728880214e-05, + "loss": 0.2868, + "num_input_tokens_seen": 35978480, + "step": 16660 + }, + { + "epoch": 2.7185970636215333, + "grad_norm": 0.0898665189743042, + "learning_rate": 4.5635555510594304e-05, + "loss": 0.1317, + "num_input_tokens_seen": 35989040, + "step": 16665 + }, + { + "epoch": 2.7194127243066886, + "grad_norm": 0.4992068409919739, + "learning_rate": 4.563153646034437e-05, + "loss": 0.1761, + "num_input_tokens_seen": 35999792, + "step": 16670 + }, + { + "epoch": 2.7202283849918434, + "grad_norm": 0.110963374376297, + "learning_rate": 4.5627515737597406e-05, + "loss": 0.0323, + "num_input_tokens_seen": 36009392, + "step": 16675 + }, + { + "epoch": 2.721044045676998, + "grad_norm": 0.24389059841632843, + "learning_rate": 4.562349334267936e-05, + "loss": 0.1174, + "num_input_tokens_seen": 36019952, + "step": 16680 + }, + { + "epoch": 2.7218597063621535, + "grad_norm": 0.1974208652973175, + "learning_rate": 4.5619469275916294e-05, + "loss": 0.0656, + "num_input_tokens_seen": 36031152, + "step": 16685 + }, + { + "epoch": 2.7226753670473083, + "grad_norm": 0.42634639143943787, + "learning_rate": 4.5615443537634425e-05, + "loss": 0.1298, + "num_input_tokens_seen": 36041808, + "step": 16690 + }, + { + "epoch": 2.7234910277324635, + "grad_norm": 1.8849952220916748, + "learning_rate": 4.561141612816011e-05, + "loss": 0.1368, + "num_input_tokens_seen": 36051248, + "step": 16695 + }, + { + "epoch": 2.7243066884176184, + "grad_norm": 0.06307632476091385, + "learning_rate": 4.560738704781982e-05, + "loss": 0.0755, + "num_input_tokens_seen": 36063024, + "step": 16700 + }, + { + "epoch": 2.725122349102773, + "grad_norm": 0.22482647001743317, + "learning_rate": 4.560335629694018e-05, + "loss": 0.2008, + "num_input_tokens_seen": 36074800, + "step": 16705 + }, + { + "epoch": 2.725938009787928, + "grad_norm": 0.35101455450057983, + "learning_rate": 4.559932387584792e-05, + "loss": 0.1085, + "num_input_tokens_seen": 36084784, + "step": 16710 + }, + { + "epoch": 2.7267536704730833, + "grad_norm": 1.6113200187683105, + "learning_rate": 4.559528978486997e-05, + "loss": 0.1033, + "num_input_tokens_seen": 36095248, + "step": 16715 + }, + { + "epoch": 2.727569331158238, + "grad_norm": 0.24735985696315765, + "learning_rate": 4.5591254024333304e-05, + "loss": 0.2691, + "num_input_tokens_seen": 36106288, + "step": 16720 + }, + { + "epoch": 2.7283849918433933, + "grad_norm": 0.4454752802848816, + "learning_rate": 4.558721659456513e-05, + "loss": 0.0849, + "num_input_tokens_seen": 36116976, + "step": 16725 + }, + { + "epoch": 2.729200652528548, + "grad_norm": 0.15588189661502838, + "learning_rate": 4.558317749589271e-05, + "loss": 0.1419, + "num_input_tokens_seen": 36127504, + "step": 16730 + }, + { + "epoch": 2.730016313213703, + "grad_norm": 0.0390903614461422, + "learning_rate": 4.557913672864349e-05, + "loss": 0.035, + "num_input_tokens_seen": 36137744, + "step": 16735 + }, + { + "epoch": 2.7308319738988582, + "grad_norm": 0.3506168723106384, + "learning_rate": 4.5575094293145025e-05, + "loss": 0.137, + "num_input_tokens_seen": 36149328, + "step": 16740 + }, + { + "epoch": 2.731647634584013, + "grad_norm": 0.10427909344434738, + "learning_rate": 4.557105018972502e-05, + "loss": 0.0998, + "num_input_tokens_seen": 36160688, + "step": 16745 + }, + { + "epoch": 2.732463295269168, + "grad_norm": 0.22597669064998627, + "learning_rate": 4.5567004418711314e-05, + "loss": 0.0679, + "num_input_tokens_seen": 36171056, + "step": 16750 + }, + { + "epoch": 2.733278955954323, + "grad_norm": 0.0786878690123558, + "learning_rate": 4.556295698043187e-05, + "loss": 0.1493, + "num_input_tokens_seen": 36181552, + "step": 16755 + }, + { + "epoch": 2.734094616639478, + "grad_norm": 0.8831868767738342, + "learning_rate": 4.55589078752148e-05, + "loss": 0.1294, + "num_input_tokens_seen": 36191472, + "step": 16760 + }, + { + "epoch": 2.7349102773246328, + "grad_norm": 1.0989148616790771, + "learning_rate": 4.5554857103388336e-05, + "loss": 0.192, + "num_input_tokens_seen": 36204272, + "step": 16765 + }, + { + "epoch": 2.735725938009788, + "grad_norm": 0.48297038674354553, + "learning_rate": 4.555080466528087e-05, + "loss": 0.1713, + "num_input_tokens_seen": 36216144, + "step": 16770 + }, + { + "epoch": 2.736541598694943, + "grad_norm": 0.8950685858726501, + "learning_rate": 4.5546750561220896e-05, + "loss": 0.0679, + "num_input_tokens_seen": 36226672, + "step": 16775 + }, + { + "epoch": 2.737357259380098, + "grad_norm": 1.8489067554473877, + "learning_rate": 4.554269479153708e-05, + "loss": 0.2505, + "num_input_tokens_seen": 36238448, + "step": 16780 + }, + { + "epoch": 2.738172920065253, + "grad_norm": 0.44560274481773376, + "learning_rate": 4.553863735655818e-05, + "loss": 0.2509, + "num_input_tokens_seen": 36248400, + "step": 16785 + }, + { + "epoch": 2.7389885807504077, + "grad_norm": 0.5334349870681763, + "learning_rate": 4.553457825661313e-05, + "loss": 0.0523, + "num_input_tokens_seen": 36259824, + "step": 16790 + }, + { + "epoch": 2.7398042414355626, + "grad_norm": 0.8576014041900635, + "learning_rate": 4.553051749203097e-05, + "loss": 0.0565, + "num_input_tokens_seen": 36269168, + "step": 16795 + }, + { + "epoch": 2.740619902120718, + "grad_norm": 0.5859591960906982, + "learning_rate": 4.5526455063140894e-05, + "loss": 0.0502, + "num_input_tokens_seen": 36278288, + "step": 16800 + }, + { + "epoch": 2.7414355628058726, + "grad_norm": 0.4079974591732025, + "learning_rate": 4.552239097027222e-05, + "loss": 0.1087, + "num_input_tokens_seen": 36289200, + "step": 16805 + }, + { + "epoch": 2.742251223491028, + "grad_norm": 1.2780177593231201, + "learning_rate": 4.551832521375441e-05, + "loss": 0.2154, + "num_input_tokens_seen": 36300112, + "step": 16810 + }, + { + "epoch": 2.7430668841761827, + "grad_norm": 0.9378383159637451, + "learning_rate": 4.551425779391705e-05, + "loss": 0.3166, + "num_input_tokens_seen": 36309872, + "step": 16815 + }, + { + "epoch": 2.7438825448613375, + "grad_norm": 0.06925444304943085, + "learning_rate": 4.551018871108985e-05, + "loss": 0.1212, + "num_input_tokens_seen": 36320080, + "step": 16820 + }, + { + "epoch": 2.744698205546493, + "grad_norm": 0.17695577442646027, + "learning_rate": 4.55061179656027e-05, + "loss": 0.2374, + "num_input_tokens_seen": 36330448, + "step": 16825 + }, + { + "epoch": 2.7455138662316476, + "grad_norm": 0.5816348195075989, + "learning_rate": 4.550204555778558e-05, + "loss": 0.2082, + "num_input_tokens_seen": 36341008, + "step": 16830 + }, + { + "epoch": 2.746329526916803, + "grad_norm": 0.20593750476837158, + "learning_rate": 4.549797148796861e-05, + "loss": 0.0926, + "num_input_tokens_seen": 36351728, + "step": 16835 + }, + { + "epoch": 2.7471451876019577, + "grad_norm": 0.2827848494052887, + "learning_rate": 4.549389575648208e-05, + "loss": 0.2039, + "num_input_tokens_seen": 36362832, + "step": 16840 + }, + { + "epoch": 2.7479608482871125, + "grad_norm": 1.070474624633789, + "learning_rate": 4.548981836365636e-05, + "loss": 0.1296, + "num_input_tokens_seen": 36373520, + "step": 16845 + }, + { + "epoch": 2.7487765089722673, + "grad_norm": 1.367502212524414, + "learning_rate": 4.5485739309822e-05, + "loss": 0.0661, + "num_input_tokens_seen": 36384912, + "step": 16850 + }, + { + "epoch": 2.7495921696574226, + "grad_norm": 0.1122259795665741, + "learning_rate": 4.548165859530968e-05, + "loss": 0.0238, + "num_input_tokens_seen": 36395760, + "step": 16855 + }, + { + "epoch": 2.7504078303425774, + "grad_norm": 0.7411473393440247, + "learning_rate": 4.547757622045018e-05, + "loss": 0.0719, + "num_input_tokens_seen": 36406768, + "step": 16860 + }, + { + "epoch": 2.7512234910277327, + "grad_norm": 0.6618984937667847, + "learning_rate": 4.5473492185574465e-05, + "loss": 0.0847, + "num_input_tokens_seen": 36415728, + "step": 16865 + }, + { + "epoch": 2.7520391517128875, + "grad_norm": 0.21528784930706024, + "learning_rate": 4.546940649101358e-05, + "loss": 0.2554, + "num_input_tokens_seen": 36427728, + "step": 16870 + }, + { + "epoch": 2.7528548123980423, + "grad_norm": 0.059602174907922745, + "learning_rate": 4.546531913709874e-05, + "loss": 0.2978, + "num_input_tokens_seen": 36438640, + "step": 16875 + }, + { + "epoch": 2.753670473083197, + "grad_norm": 1.7860591411590576, + "learning_rate": 4.5461230124161294e-05, + "loss": 0.1529, + "num_input_tokens_seen": 36448976, + "step": 16880 + }, + { + "epoch": 2.7544861337683524, + "grad_norm": 0.4750521779060364, + "learning_rate": 4.545713945253272e-05, + "loss": 0.1966, + "num_input_tokens_seen": 36458736, + "step": 16885 + }, + { + "epoch": 2.755301794453507, + "grad_norm": 2.8948001861572266, + "learning_rate": 4.545304712254462e-05, + "loss": 0.2695, + "num_input_tokens_seen": 36468144, + "step": 16890 + }, + { + "epoch": 2.7561174551386625, + "grad_norm": 0.24813991785049438, + "learning_rate": 4.544895313452875e-05, + "loss": 0.2001, + "num_input_tokens_seen": 36478512, + "step": 16895 + }, + { + "epoch": 2.7569331158238173, + "grad_norm": 1.0546525716781616, + "learning_rate": 4.544485748881697e-05, + "loss": 0.1519, + "num_input_tokens_seen": 36490384, + "step": 16900 + }, + { + "epoch": 2.757748776508972, + "grad_norm": 1.2791383266448975, + "learning_rate": 4.544076018574131e-05, + "loss": 0.0825, + "num_input_tokens_seen": 36500944, + "step": 16905 + }, + { + "epoch": 2.7585644371941274, + "grad_norm": 0.7771432995796204, + "learning_rate": 4.5436661225633915e-05, + "loss": 0.0873, + "num_input_tokens_seen": 36511056, + "step": 16910 + }, + { + "epoch": 2.759380097879282, + "grad_norm": 0.5250051617622375, + "learning_rate": 4.543256060882707e-05, + "loss": 0.1484, + "num_input_tokens_seen": 36522448, + "step": 16915 + }, + { + "epoch": 2.7601957585644374, + "grad_norm": 0.16145603358745575, + "learning_rate": 4.542845833565318e-05, + "loss": 0.1669, + "num_input_tokens_seen": 36533456, + "step": 16920 + }, + { + "epoch": 2.7610114192495923, + "grad_norm": 0.2961632013320923, + "learning_rate": 4.5424354406444815e-05, + "loss": 0.1125, + "num_input_tokens_seen": 36544656, + "step": 16925 + }, + { + "epoch": 2.761827079934747, + "grad_norm": 0.7707560062408447, + "learning_rate": 4.542024882153464e-05, + "loss": 0.1193, + "num_input_tokens_seen": 36556144, + "step": 16930 + }, + { + "epoch": 2.762642740619902, + "grad_norm": 0.3775326907634735, + "learning_rate": 4.541614158125549e-05, + "loss": 0.1259, + "num_input_tokens_seen": 36566064, + "step": 16935 + }, + { + "epoch": 2.763458401305057, + "grad_norm": 0.5492393970489502, + "learning_rate": 4.541203268594031e-05, + "loss": 0.0817, + "num_input_tokens_seen": 36576528, + "step": 16940 + }, + { + "epoch": 2.764274061990212, + "grad_norm": 0.910723090171814, + "learning_rate": 4.5407922135922194e-05, + "loss": 0.1111, + "num_input_tokens_seen": 36587856, + "step": 16945 + }, + { + "epoch": 2.7650897226753672, + "grad_norm": 1.4997965097427368, + "learning_rate": 4.5403809931534355e-05, + "loss": 0.2877, + "num_input_tokens_seen": 36597936, + "step": 16950 + }, + { + "epoch": 2.765905383360522, + "grad_norm": 0.38866764307022095, + "learning_rate": 4.5399696073110166e-05, + "loss": 0.0649, + "num_input_tokens_seen": 36608688, + "step": 16955 + }, + { + "epoch": 2.766721044045677, + "grad_norm": 0.7356373071670532, + "learning_rate": 4.53955805609831e-05, + "loss": 0.1884, + "num_input_tokens_seen": 36619824, + "step": 16960 + }, + { + "epoch": 2.767536704730832, + "grad_norm": 1.927843451499939, + "learning_rate": 4.5391463395486784e-05, + "loss": 0.1915, + "num_input_tokens_seen": 36630960, + "step": 16965 + }, + { + "epoch": 2.768352365415987, + "grad_norm": 0.1633632779121399, + "learning_rate": 4.538734457695498e-05, + "loss": 0.1277, + "num_input_tokens_seen": 36640976, + "step": 16970 + }, + { + "epoch": 2.7691680261011418, + "grad_norm": 0.33883246779441833, + "learning_rate": 4.5383224105721586e-05, + "loss": 0.2424, + "num_input_tokens_seen": 36651792, + "step": 16975 + }, + { + "epoch": 2.769983686786297, + "grad_norm": 1.3714739084243774, + "learning_rate": 4.537910198212061e-05, + "loss": 0.0679, + "num_input_tokens_seen": 36662896, + "step": 16980 + }, + { + "epoch": 2.770799347471452, + "grad_norm": 0.733548104763031, + "learning_rate": 4.537497820648624e-05, + "loss": 0.165, + "num_input_tokens_seen": 36673040, + "step": 16985 + }, + { + "epoch": 2.7716150081566067, + "grad_norm": 1.4941461086273193, + "learning_rate": 4.537085277915275e-05, + "loss": 0.1943, + "num_input_tokens_seen": 36684464, + "step": 16990 + }, + { + "epoch": 2.772430668841762, + "grad_norm": 0.6606377959251404, + "learning_rate": 4.536672570045457e-05, + "loss": 0.1338, + "num_input_tokens_seen": 36696368, + "step": 16995 + }, + { + "epoch": 2.7732463295269167, + "grad_norm": 0.25244760513305664, + "learning_rate": 4.536259697072627e-05, + "loss": 0.141, + "num_input_tokens_seen": 36707600, + "step": 17000 + }, + { + "epoch": 2.774061990212072, + "grad_norm": 1.2988156080245972, + "learning_rate": 4.535846659030254e-05, + "loss": 0.2017, + "num_input_tokens_seen": 36718992, + "step": 17005 + }, + { + "epoch": 2.774877650897227, + "grad_norm": 0.31207531690597534, + "learning_rate": 4.5354334559518205e-05, + "loss": 0.1074, + "num_input_tokens_seen": 36729840, + "step": 17010 + }, + { + "epoch": 2.7756933115823816, + "grad_norm": 0.27934396266937256, + "learning_rate": 4.535020087870824e-05, + "loss": 0.1037, + "num_input_tokens_seen": 36738672, + "step": 17015 + }, + { + "epoch": 2.7765089722675365, + "grad_norm": 0.5810121297836304, + "learning_rate": 4.5346065548207727e-05, + "loss": 0.0707, + "num_input_tokens_seen": 36749104, + "step": 17020 + }, + { + "epoch": 2.7773246329526917, + "grad_norm": 0.25529494881629944, + "learning_rate": 4.5341928568351915e-05, + "loss": 0.0973, + "num_input_tokens_seen": 36760560, + "step": 17025 + }, + { + "epoch": 2.7781402936378465, + "grad_norm": 0.39088091254234314, + "learning_rate": 4.533778993947615e-05, + "loss": 0.1018, + "num_input_tokens_seen": 36771728, + "step": 17030 + }, + { + "epoch": 2.778955954323002, + "grad_norm": 1.2304964065551758, + "learning_rate": 4.533364966191595e-05, + "loss": 0.1923, + "num_input_tokens_seen": 36783472, + "step": 17035 + }, + { + "epoch": 2.7797716150081566, + "grad_norm": 0.056510500609874725, + "learning_rate": 4.532950773600694e-05, + "loss": 0.2439, + "num_input_tokens_seen": 36794672, + "step": 17040 + }, + { + "epoch": 2.7805872756933114, + "grad_norm": 0.0985405296087265, + "learning_rate": 4.532536416208487e-05, + "loss": 0.159, + "num_input_tokens_seen": 36805840, + "step": 17045 + }, + { + "epoch": 2.7814029363784667, + "grad_norm": 0.47774678468704224, + "learning_rate": 4.532121894048566e-05, + "loss": 0.105, + "num_input_tokens_seen": 36816368, + "step": 17050 + }, + { + "epoch": 2.7822185970636215, + "grad_norm": 1.1948124170303345, + "learning_rate": 4.531707207154532e-05, + "loss": 0.1086, + "num_input_tokens_seen": 36827920, + "step": 17055 + }, + { + "epoch": 2.7830342577487768, + "grad_norm": 1.9611790180206299, + "learning_rate": 4.531292355560004e-05, + "loss": 0.3168, + "num_input_tokens_seen": 36839312, + "step": 17060 + }, + { + "epoch": 2.7838499184339316, + "grad_norm": 1.444595217704773, + "learning_rate": 4.5308773392986115e-05, + "loss": 0.1291, + "num_input_tokens_seen": 36849264, + "step": 17065 + }, + { + "epoch": 2.7846655791190864, + "grad_norm": 1.1180214881896973, + "learning_rate": 4.530462158403996e-05, + "loss": 0.2698, + "num_input_tokens_seen": 36859664, + "step": 17070 + }, + { + "epoch": 2.7854812398042412, + "grad_norm": 0.21255655586719513, + "learning_rate": 4.5300468129098165e-05, + "loss": 0.1186, + "num_input_tokens_seen": 36869776, + "step": 17075 + }, + { + "epoch": 2.7862969004893965, + "grad_norm": 0.5140745043754578, + "learning_rate": 4.529631302849742e-05, + "loss": 0.0838, + "num_input_tokens_seen": 36880912, + "step": 17080 + }, + { + "epoch": 2.7871125611745513, + "grad_norm": 1.1539146900177002, + "learning_rate": 4.529215628257455e-05, + "loss": 0.2702, + "num_input_tokens_seen": 36891056, + "step": 17085 + }, + { + "epoch": 2.7879282218597066, + "grad_norm": 0.3217758238315582, + "learning_rate": 4.528799789166654e-05, + "loss": 0.1335, + "num_input_tokens_seen": 36901456, + "step": 17090 + }, + { + "epoch": 2.7887438825448614, + "grad_norm": 0.13648879528045654, + "learning_rate": 4.5283837856110474e-05, + "loss": 0.1693, + "num_input_tokens_seen": 36912336, + "step": 17095 + }, + { + "epoch": 2.789559543230016, + "grad_norm": 0.06339794397354126, + "learning_rate": 4.5279676176243596e-05, + "loss": 0.0701, + "num_input_tokens_seen": 36921456, + "step": 17100 + }, + { + "epoch": 2.790375203915171, + "grad_norm": 1.032213807106018, + "learning_rate": 4.527551285240327e-05, + "loss": 0.1242, + "num_input_tokens_seen": 36932272, + "step": 17105 + }, + { + "epoch": 2.7911908646003263, + "grad_norm": 0.22263789176940918, + "learning_rate": 4.527134788492698e-05, + "loss": 0.1089, + "num_input_tokens_seen": 36942640, + "step": 17110 + }, + { + "epoch": 2.792006525285481, + "grad_norm": 1.1650749444961548, + "learning_rate": 4.526718127415239e-05, + "loss": 0.4081, + "num_input_tokens_seen": 36954192, + "step": 17115 + }, + { + "epoch": 2.7928221859706364, + "grad_norm": 0.9025839567184448, + "learning_rate": 4.5263013020417254e-05, + "loss": 0.1786, + "num_input_tokens_seen": 36964880, + "step": 17120 + }, + { + "epoch": 2.793637846655791, + "grad_norm": 0.6315277814865112, + "learning_rate": 4.5258843124059466e-05, + "loss": 0.1307, + "num_input_tokens_seen": 36976112, + "step": 17125 + }, + { + "epoch": 2.794453507340946, + "grad_norm": 0.39560723304748535, + "learning_rate": 4.5254671585417056e-05, + "loss": 0.0703, + "num_input_tokens_seen": 36985232, + "step": 17130 + }, + { + "epoch": 2.7952691680261013, + "grad_norm": 0.3881431818008423, + "learning_rate": 4.52504984048282e-05, + "loss": 0.0744, + "num_input_tokens_seen": 36995376, + "step": 17135 + }, + { + "epoch": 2.796084828711256, + "grad_norm": 0.8088746666908264, + "learning_rate": 4.5246323582631196e-05, + "loss": 0.109, + "num_input_tokens_seen": 37006576, + "step": 17140 + }, + { + "epoch": 2.7969004893964113, + "grad_norm": 0.7311804294586182, + "learning_rate": 4.524214711916447e-05, + "loss": 0.1397, + "num_input_tokens_seen": 37017264, + "step": 17145 + }, + { + "epoch": 2.797716150081566, + "grad_norm": 0.2638937830924988, + "learning_rate": 4.523796901476659e-05, + "loss": 0.0402, + "num_input_tokens_seen": 37027472, + "step": 17150 + }, + { + "epoch": 2.798531810766721, + "grad_norm": 0.465883731842041, + "learning_rate": 4.5233789269776264e-05, + "loss": 0.129, + "num_input_tokens_seen": 37038768, + "step": 17155 + }, + { + "epoch": 2.799347471451876, + "grad_norm": 0.9292364120483398, + "learning_rate": 4.5229607884532306e-05, + "loss": 0.16, + "num_input_tokens_seen": 37049424, + "step": 17160 + }, + { + "epoch": 2.800163132137031, + "grad_norm": 2.3898096084594727, + "learning_rate": 4.522542485937369e-05, + "loss": 0.2354, + "num_input_tokens_seen": 37060912, + "step": 17165 + }, + { + "epoch": 2.800978792822186, + "grad_norm": 0.898797333240509, + "learning_rate": 4.5221240194639514e-05, + "loss": 0.18, + "num_input_tokens_seen": 37070032, + "step": 17170 + }, + { + "epoch": 2.801794453507341, + "grad_norm": 1.2207622528076172, + "learning_rate": 4.5217053890669004e-05, + "loss": 0.1241, + "num_input_tokens_seen": 37079792, + "step": 17175 + }, + { + "epoch": 2.802610114192496, + "grad_norm": 0.3187756836414337, + "learning_rate": 4.521286594780152e-05, + "loss": 0.1129, + "num_input_tokens_seen": 37091344, + "step": 17180 + }, + { + "epoch": 2.8034257748776508, + "grad_norm": 0.5261462330818176, + "learning_rate": 4.5208676366376574e-05, + "loss": 0.0754, + "num_input_tokens_seen": 37102800, + "step": 17185 + }, + { + "epoch": 2.804241435562806, + "grad_norm": 0.7514358162879944, + "learning_rate": 4.520448514673378e-05, + "loss": 0.2016, + "num_input_tokens_seen": 37114512, + "step": 17190 + }, + { + "epoch": 2.805057096247961, + "grad_norm": 1.8172250986099243, + "learning_rate": 4.52002922892129e-05, + "loss": 0.3408, + "num_input_tokens_seen": 37125232, + "step": 17195 + }, + { + "epoch": 2.8058727569331157, + "grad_norm": 0.9880380630493164, + "learning_rate": 4.519609779415384e-05, + "loss": 0.129, + "num_input_tokens_seen": 37136656, + "step": 17200 + }, + { + "epoch": 2.806688417618271, + "grad_norm": 0.44648081064224243, + "learning_rate": 4.519190166189661e-05, + "loss": 0.1656, + "num_input_tokens_seen": 37148112, + "step": 17205 + }, + { + "epoch": 2.8075040783034257, + "grad_norm": 0.22080466151237488, + "learning_rate": 4.518770389278138e-05, + "loss": 0.0565, + "num_input_tokens_seen": 37160208, + "step": 17210 + }, + { + "epoch": 2.8083197389885806, + "grad_norm": 0.20874586701393127, + "learning_rate": 4.5183504487148444e-05, + "loss": 0.0665, + "num_input_tokens_seen": 37170288, + "step": 17215 + }, + { + "epoch": 2.809135399673736, + "grad_norm": 0.3296731114387512, + "learning_rate": 4.517930344533822e-05, + "loss": 0.0688, + "num_input_tokens_seen": 37179408, + "step": 17220 + }, + { + "epoch": 2.8099510603588906, + "grad_norm": 0.1705022007226944, + "learning_rate": 4.517510076769128e-05, + "loss": 0.0832, + "num_input_tokens_seen": 37191472, + "step": 17225 + }, + { + "epoch": 2.810766721044046, + "grad_norm": 0.40433767437934875, + "learning_rate": 4.517089645454829e-05, + "loss": 0.0823, + "num_input_tokens_seen": 37200528, + "step": 17230 + }, + { + "epoch": 2.8115823817292007, + "grad_norm": 0.30362287163734436, + "learning_rate": 4.51666905062501e-05, + "loss": 0.1228, + "num_input_tokens_seen": 37210960, + "step": 17235 + }, + { + "epoch": 2.8123980424143555, + "grad_norm": 0.10895878821611404, + "learning_rate": 4.516248292313765e-05, + "loss": 0.1664, + "num_input_tokens_seen": 37222192, + "step": 17240 + }, + { + "epoch": 2.8132137030995104, + "grad_norm": 1.6326020956039429, + "learning_rate": 4.515827370555202e-05, + "loss": 0.192, + "num_input_tokens_seen": 37234064, + "step": 17245 + }, + { + "epoch": 2.8140293637846656, + "grad_norm": 0.3068557381629944, + "learning_rate": 4.515406285383446e-05, + "loss": 0.2506, + "num_input_tokens_seen": 37245968, + "step": 17250 + }, + { + "epoch": 2.8148450244698204, + "grad_norm": 1.399659514427185, + "learning_rate": 4.514985036832628e-05, + "loss": 0.2222, + "num_input_tokens_seen": 37256432, + "step": 17255 + }, + { + "epoch": 2.8156606851549757, + "grad_norm": 0.5596020817756653, + "learning_rate": 4.514563624936901e-05, + "loss": 0.2231, + "num_input_tokens_seen": 37267312, + "step": 17260 + }, + { + "epoch": 2.8164763458401305, + "grad_norm": 0.4561832547187805, + "learning_rate": 4.514142049730424e-05, + "loss": 0.1377, + "num_input_tokens_seen": 37278256, + "step": 17265 + }, + { + "epoch": 2.8172920065252853, + "grad_norm": 0.17966203391551971, + "learning_rate": 4.513720311247374e-05, + "loss": 0.1031, + "num_input_tokens_seen": 37289968, + "step": 17270 + }, + { + "epoch": 2.8181076672104406, + "grad_norm": 0.1193300262093544, + "learning_rate": 4.5132984095219364e-05, + "loss": 0.0997, + "num_input_tokens_seen": 37301072, + "step": 17275 + }, + { + "epoch": 2.8189233278955954, + "grad_norm": 2.619776487350464, + "learning_rate": 4.512876344588315e-05, + "loss": 0.2036, + "num_input_tokens_seen": 37311664, + "step": 17280 + }, + { + "epoch": 2.8197389885807507, + "grad_norm": 0.6041295528411865, + "learning_rate": 4.512454116480724e-05, + "loss": 0.0903, + "num_input_tokens_seen": 37323120, + "step": 17285 + }, + { + "epoch": 2.8205546492659055, + "grad_norm": 0.23590534925460815, + "learning_rate": 4.512031725233391e-05, + "loss": 0.1002, + "num_input_tokens_seen": 37334672, + "step": 17290 + }, + { + "epoch": 2.8213703099510603, + "grad_norm": 0.10666978359222412, + "learning_rate": 4.5116091708805575e-05, + "loss": 0.0982, + "num_input_tokens_seen": 37346384, + "step": 17295 + }, + { + "epoch": 2.822185970636215, + "grad_norm": 0.3725242018699646, + "learning_rate": 4.5111864534564776e-05, + "loss": 0.0486, + "num_input_tokens_seen": 37358256, + "step": 17300 + }, + { + "epoch": 2.8230016313213704, + "grad_norm": 1.4836148023605347, + "learning_rate": 4.510763572995419e-05, + "loss": 0.1664, + "num_input_tokens_seen": 37369968, + "step": 17305 + }, + { + "epoch": 2.823817292006525, + "grad_norm": 0.49886706471443176, + "learning_rate": 4.5103405295316634e-05, + "loss": 0.1833, + "num_input_tokens_seen": 37380432, + "step": 17310 + }, + { + "epoch": 2.8246329526916805, + "grad_norm": 2.0635175704956055, + "learning_rate": 4.509917323099504e-05, + "loss": 0.2829, + "num_input_tokens_seen": 37390480, + "step": 17315 + }, + { + "epoch": 2.8254486133768353, + "grad_norm": 0.25237855315208435, + "learning_rate": 4.5094939537332475e-05, + "loss": 0.1165, + "num_input_tokens_seen": 37402608, + "step": 17320 + }, + { + "epoch": 2.82626427406199, + "grad_norm": 0.46054792404174805, + "learning_rate": 4.5090704214672155e-05, + "loss": 0.0612, + "num_input_tokens_seen": 37413936, + "step": 17325 + }, + { + "epoch": 2.827079934747145, + "grad_norm": 0.14470918476581573, + "learning_rate": 4.508646726335741e-05, + "loss": 0.0793, + "num_input_tokens_seen": 37426480, + "step": 17330 + }, + { + "epoch": 2.8278955954323, + "grad_norm": 0.7988441586494446, + "learning_rate": 4.508222868373171e-05, + "loss": 0.2153, + "num_input_tokens_seen": 37438224, + "step": 17335 + }, + { + "epoch": 2.828711256117455, + "grad_norm": 0.05291635915637016, + "learning_rate": 4.507798847613866e-05, + "loss": 0.0995, + "num_input_tokens_seen": 37448176, + "step": 17340 + }, + { + "epoch": 2.8295269168026103, + "grad_norm": 0.18277090787887573, + "learning_rate": 4.507374664092199e-05, + "loss": 0.1494, + "num_input_tokens_seen": 37458672, + "step": 17345 + }, + { + "epoch": 2.830342577487765, + "grad_norm": 0.25506582856178284, + "learning_rate": 4.506950317842556e-05, + "loss": 0.2082, + "num_input_tokens_seen": 37469840, + "step": 17350 + }, + { + "epoch": 2.83115823817292, + "grad_norm": 1.5982588529586792, + "learning_rate": 4.506525808899337e-05, + "loss": 0.2768, + "num_input_tokens_seen": 37479376, + "step": 17355 + }, + { + "epoch": 2.831973898858075, + "grad_norm": 1.5113115310668945, + "learning_rate": 4.506101137296955e-05, + "loss": 0.2181, + "num_input_tokens_seen": 37489392, + "step": 17360 + }, + { + "epoch": 2.83278955954323, + "grad_norm": 0.44505587220191956, + "learning_rate": 4.505676303069837e-05, + "loss": 0.1205, + "num_input_tokens_seen": 37499504, + "step": 17365 + }, + { + "epoch": 2.8336052202283852, + "grad_norm": 0.5240213871002197, + "learning_rate": 4.5052513062524196e-05, + "loss": 0.1481, + "num_input_tokens_seen": 37510512, + "step": 17370 + }, + { + "epoch": 2.83442088091354, + "grad_norm": 0.8518088459968567, + "learning_rate": 4.504826146879158e-05, + "loss": 0.186, + "num_input_tokens_seen": 37521424, + "step": 17375 + }, + { + "epoch": 2.835236541598695, + "grad_norm": 1.586323618888855, + "learning_rate": 4.504400824984516e-05, + "loss": 0.273, + "num_input_tokens_seen": 37532976, + "step": 17380 + }, + { + "epoch": 2.8360522022838497, + "grad_norm": 0.5222753286361694, + "learning_rate": 4.503975340602973e-05, + "loss": 0.1829, + "num_input_tokens_seen": 37543504, + "step": 17385 + }, + { + "epoch": 2.836867862969005, + "grad_norm": 0.11114467680454254, + "learning_rate": 4.50354969376902e-05, + "loss": 0.1386, + "num_input_tokens_seen": 37554000, + "step": 17390 + }, + { + "epoch": 2.8376835236541598, + "grad_norm": 0.36704790592193604, + "learning_rate": 4.5031238845171644e-05, + "loss": 0.092, + "num_input_tokens_seen": 37565360, + "step": 17395 + }, + { + "epoch": 2.838499184339315, + "grad_norm": 0.1285860687494278, + "learning_rate": 4.502697912881923e-05, + "loss": 0.2233, + "num_input_tokens_seen": 37576368, + "step": 17400 + }, + { + "epoch": 2.83931484502447, + "grad_norm": 0.26471173763275146, + "learning_rate": 4.502271778897825e-05, + "loss": 0.1417, + "num_input_tokens_seen": 37587664, + "step": 17405 + }, + { + "epoch": 2.8401305057096247, + "grad_norm": 0.4637354910373688, + "learning_rate": 4.50184548259942e-05, + "loss": 0.2213, + "num_input_tokens_seen": 37598256, + "step": 17410 + }, + { + "epoch": 2.84094616639478, + "grad_norm": 0.3331049978733063, + "learning_rate": 4.501419024021261e-05, + "loss": 0.1829, + "num_input_tokens_seen": 37608976, + "step": 17415 + }, + { + "epoch": 2.8417618270799347, + "grad_norm": 0.568318784236908, + "learning_rate": 4.500992403197921e-05, + "loss": 0.1491, + "num_input_tokens_seen": 37618352, + "step": 17420 + }, + { + "epoch": 2.8425774877650896, + "grad_norm": 0.6415582299232483, + "learning_rate": 4.500565620163985e-05, + "loss": 0.1846, + "num_input_tokens_seen": 37628816, + "step": 17425 + }, + { + "epoch": 2.843393148450245, + "grad_norm": 0.09981326013803482, + "learning_rate": 4.500138674954047e-05, + "loss": 0.0548, + "num_input_tokens_seen": 37639824, + "step": 17430 + }, + { + "epoch": 2.8442088091353996, + "grad_norm": 0.6605005860328674, + "learning_rate": 4.499711567602721e-05, + "loss": 0.1326, + "num_input_tokens_seen": 37651504, + "step": 17435 + }, + { + "epoch": 2.8450244698205545, + "grad_norm": 0.15242959558963776, + "learning_rate": 4.499284298144629e-05, + "loss": 0.107, + "num_input_tokens_seen": 37662992, + "step": 17440 + }, + { + "epoch": 2.8458401305057097, + "grad_norm": 0.07054515928030014, + "learning_rate": 4.498856866614407e-05, + "loss": 0.0561, + "num_input_tokens_seen": 37673904, + "step": 17445 + }, + { + "epoch": 2.8466557911908645, + "grad_norm": 0.7039295434951782, + "learning_rate": 4.498429273046705e-05, + "loss": 0.1133, + "num_input_tokens_seen": 37684368, + "step": 17450 + }, + { + "epoch": 2.84747145187602, + "grad_norm": 0.250881552696228, + "learning_rate": 4.498001517476187e-05, + "loss": 0.0679, + "num_input_tokens_seen": 37694480, + "step": 17455 + }, + { + "epoch": 2.8482871125611746, + "grad_norm": 0.0969104990363121, + "learning_rate": 4.497573599937528e-05, + "loss": 0.0241, + "num_input_tokens_seen": 37705040, + "step": 17460 + }, + { + "epoch": 2.8491027732463294, + "grad_norm": 1.1526011228561401, + "learning_rate": 4.497145520465417e-05, + "loss": 0.2124, + "num_input_tokens_seen": 37716304, + "step": 17465 + }, + { + "epoch": 2.8499184339314843, + "grad_norm": 0.2592758536338806, + "learning_rate": 4.4967172790945565e-05, + "loss": 0.0879, + "num_input_tokens_seen": 37726352, + "step": 17470 + }, + { + "epoch": 2.8507340946166395, + "grad_norm": 0.40564852952957153, + "learning_rate": 4.496288875859663e-05, + "loss": 0.0566, + "num_input_tokens_seen": 37736048, + "step": 17475 + }, + { + "epoch": 2.8515497553017943, + "grad_norm": 1.9470528364181519, + "learning_rate": 4.4958603107954635e-05, + "loss": 0.2083, + "num_input_tokens_seen": 37746704, + "step": 17480 + }, + { + "epoch": 2.8523654159869496, + "grad_norm": 0.14689616858959198, + "learning_rate": 4.4954315839367006e-05, + "loss": 0.0597, + "num_input_tokens_seen": 37758000, + "step": 17485 + }, + { + "epoch": 2.8531810766721044, + "grad_norm": 0.1450406312942505, + "learning_rate": 4.495002695318129e-05, + "loss": 0.3516, + "num_input_tokens_seen": 37768240, + "step": 17490 + }, + { + "epoch": 2.8539967373572592, + "grad_norm": 0.42605870962142944, + "learning_rate": 4.494573644974516e-05, + "loss": 0.2342, + "num_input_tokens_seen": 37778576, + "step": 17495 + }, + { + "epoch": 2.8548123980424145, + "grad_norm": 0.17584563791751862, + "learning_rate": 4.494144432940643e-05, + "loss": 0.0757, + "num_input_tokens_seen": 37788880, + "step": 17500 + }, + { + "epoch": 2.8556280587275693, + "grad_norm": 0.08138612657785416, + "learning_rate": 4.493715059251304e-05, + "loss": 0.1147, + "num_input_tokens_seen": 37799856, + "step": 17505 + }, + { + "epoch": 2.8564437194127246, + "grad_norm": 0.2507184147834778, + "learning_rate": 4.4932855239413065e-05, + "loss": 0.1988, + "num_input_tokens_seen": 37809808, + "step": 17510 + }, + { + "epoch": 2.8572593800978794, + "grad_norm": 1.6355557441711426, + "learning_rate": 4.49285582704547e-05, + "loss": 0.234, + "num_input_tokens_seen": 37819856, + "step": 17515 + }, + { + "epoch": 2.858075040783034, + "grad_norm": 0.5372338891029358, + "learning_rate": 4.492425968598629e-05, + "loss": 0.1552, + "num_input_tokens_seen": 37831792, + "step": 17520 + }, + { + "epoch": 2.858890701468189, + "grad_norm": 0.5974254012107849, + "learning_rate": 4.49199594863563e-05, + "loss": 0.1134, + "num_input_tokens_seen": 37843280, + "step": 17525 + }, + { + "epoch": 2.8597063621533443, + "grad_norm": 0.6360036730766296, + "learning_rate": 4.4915657671913314e-05, + "loss": 0.1221, + "num_input_tokens_seen": 37854928, + "step": 17530 + }, + { + "epoch": 2.860522022838499, + "grad_norm": 0.4384385645389557, + "learning_rate": 4.491135424300607e-05, + "loss": 0.1114, + "num_input_tokens_seen": 37865424, + "step": 17535 + }, + { + "epoch": 2.8613376835236544, + "grad_norm": 0.5827024579048157, + "learning_rate": 4.490704919998342e-05, + "loss": 0.0928, + "num_input_tokens_seen": 37876624, + "step": 17540 + }, + { + "epoch": 2.862153344208809, + "grad_norm": 0.7195541262626648, + "learning_rate": 4.4902742543194356e-05, + "loss": 0.1706, + "num_input_tokens_seen": 37886896, + "step": 17545 + }, + { + "epoch": 2.862969004893964, + "grad_norm": 0.918659508228302, + "learning_rate": 4.4898434272988e-05, + "loss": 0.2845, + "num_input_tokens_seen": 37897424, + "step": 17550 + }, + { + "epoch": 2.863784665579119, + "grad_norm": 0.19184179604053497, + "learning_rate": 4.489412438971359e-05, + "loss": 0.0822, + "num_input_tokens_seen": 37908080, + "step": 17555 + }, + { + "epoch": 2.864600326264274, + "grad_norm": 0.4937117397785187, + "learning_rate": 4.488981289372052e-05, + "loss": 0.0548, + "num_input_tokens_seen": 37918768, + "step": 17560 + }, + { + "epoch": 2.865415986949429, + "grad_norm": 0.6364811658859253, + "learning_rate": 4.488549978535829e-05, + "loss": 0.265, + "num_input_tokens_seen": 37929488, + "step": 17565 + }, + { + "epoch": 2.866231647634584, + "grad_norm": 0.4120334982872009, + "learning_rate": 4.4881185064976553e-05, + "loss": 0.2033, + "num_input_tokens_seen": 37940560, + "step": 17570 + }, + { + "epoch": 2.867047308319739, + "grad_norm": 0.5884972214698792, + "learning_rate": 4.487686873292508e-05, + "loss": 0.2155, + "num_input_tokens_seen": 37951248, + "step": 17575 + }, + { + "epoch": 2.867862969004894, + "grad_norm": 0.03316609561443329, + "learning_rate": 4.487255078955378e-05, + "loss": 0.0564, + "num_input_tokens_seen": 37961936, + "step": 17580 + }, + { + "epoch": 2.868678629690049, + "grad_norm": 1.3251910209655762, + "learning_rate": 4.486823123521267e-05, + "loss": 0.1929, + "num_input_tokens_seen": 37972560, + "step": 17585 + }, + { + "epoch": 2.869494290375204, + "grad_norm": 0.20283091068267822, + "learning_rate": 4.4863910070251927e-05, + "loss": 0.076, + "num_input_tokens_seen": 37984016, + "step": 17590 + }, + { + "epoch": 2.870309951060359, + "grad_norm": 0.1907118409872055, + "learning_rate": 4.485958729502185e-05, + "loss": 0.0962, + "num_input_tokens_seen": 37994896, + "step": 17595 + }, + { + "epoch": 2.871125611745514, + "grad_norm": 0.42410850524902344, + "learning_rate": 4.485526290987286e-05, + "loss": 0.1847, + "num_input_tokens_seen": 38006832, + "step": 17600 + }, + { + "epoch": 2.8719412724306688, + "grad_norm": 1.8681787252426147, + "learning_rate": 4.485093691515551e-05, + "loss": 0.1349, + "num_input_tokens_seen": 38018832, + "step": 17605 + }, + { + "epoch": 2.8727569331158236, + "grad_norm": 0.10706133395433426, + "learning_rate": 4.4846609311220494e-05, + "loss": 0.2282, + "num_input_tokens_seen": 38029520, + "step": 17610 + }, + { + "epoch": 2.873572593800979, + "grad_norm": 0.15446341037750244, + "learning_rate": 4.484228009841863e-05, + "loss": 0.1103, + "num_input_tokens_seen": 38039568, + "step": 17615 + }, + { + "epoch": 2.8743882544861337, + "grad_norm": 0.11967872828245163, + "learning_rate": 4.483794927710085e-05, + "loss": 0.1195, + "num_input_tokens_seen": 38051344, + "step": 17620 + }, + { + "epoch": 2.875203915171289, + "grad_norm": 1.3119794130325317, + "learning_rate": 4.483361684761826e-05, + "loss": 0.1372, + "num_input_tokens_seen": 38062448, + "step": 17625 + }, + { + "epoch": 2.8760195758564437, + "grad_norm": 0.4687134027481079, + "learning_rate": 4.482928281032205e-05, + "loss": 0.1052, + "num_input_tokens_seen": 38073520, + "step": 17630 + }, + { + "epoch": 2.8768352365415986, + "grad_norm": 0.6964274644851685, + "learning_rate": 4.482494716556356e-05, + "loss": 0.1578, + "num_input_tokens_seen": 38084400, + "step": 17635 + }, + { + "epoch": 2.877650897226754, + "grad_norm": 0.034998729825019836, + "learning_rate": 4.482060991369426e-05, + "loss": 0.2065, + "num_input_tokens_seen": 38094288, + "step": 17640 + }, + { + "epoch": 2.8784665579119086, + "grad_norm": 1.0125142335891724, + "learning_rate": 4.481627105506575e-05, + "loss": 0.0765, + "num_input_tokens_seen": 38104752, + "step": 17645 + }, + { + "epoch": 2.8792822185970635, + "grad_norm": 0.2941353917121887, + "learning_rate": 4.481193059002976e-05, + "loss": 0.1129, + "num_input_tokens_seen": 38113456, + "step": 17650 + }, + { + "epoch": 2.8800978792822187, + "grad_norm": 0.3303471803665161, + "learning_rate": 4.480758851893816e-05, + "loss": 0.1024, + "num_input_tokens_seen": 38123888, + "step": 17655 + }, + { + "epoch": 2.8809135399673735, + "grad_norm": 1.2039132118225098, + "learning_rate": 4.480324484214293e-05, + "loss": 0.1193, + "num_input_tokens_seen": 38134448, + "step": 17660 + }, + { + "epoch": 2.8817292006525284, + "grad_norm": 2.5589773654937744, + "learning_rate": 4.479889955999619e-05, + "loss": 0.2881, + "num_input_tokens_seen": 38145104, + "step": 17665 + }, + { + "epoch": 2.8825448613376836, + "grad_norm": 1.4553834199905396, + "learning_rate": 4.4794552672850185e-05, + "loss": 0.2345, + "num_input_tokens_seen": 38155984, + "step": 17670 + }, + { + "epoch": 2.8833605220228384, + "grad_norm": 1.2841482162475586, + "learning_rate": 4.479020418105732e-05, + "loss": 0.1973, + "num_input_tokens_seen": 38167408, + "step": 17675 + }, + { + "epoch": 2.8841761827079937, + "grad_norm": 1.139731526374817, + "learning_rate": 4.478585408497008e-05, + "loss": 0.08, + "num_input_tokens_seen": 38178096, + "step": 17680 + }, + { + "epoch": 2.8849918433931485, + "grad_norm": 0.5754477977752686, + "learning_rate": 4.478150238494112e-05, + "loss": 0.0504, + "num_input_tokens_seen": 38188304, + "step": 17685 + }, + { + "epoch": 2.8858075040783033, + "grad_norm": 1.1190640926361084, + "learning_rate": 4.47771490813232e-05, + "loss": 0.1085, + "num_input_tokens_seen": 38198512, + "step": 17690 + }, + { + "epoch": 2.886623164763458, + "grad_norm": 0.10027656704187393, + "learning_rate": 4.4772794174469234e-05, + "loss": 0.1269, + "num_input_tokens_seen": 38209840, + "step": 17695 + }, + { + "epoch": 2.8874388254486134, + "grad_norm": 0.20489782094955444, + "learning_rate": 4.4768437664732244e-05, + "loss": 0.1183, + "num_input_tokens_seen": 38219760, + "step": 17700 + }, + { + "epoch": 2.8882544861337682, + "grad_norm": 0.42330506443977356, + "learning_rate": 4.4764079552465385e-05, + "loss": 0.0535, + "num_input_tokens_seen": 38231376, + "step": 17705 + }, + { + "epoch": 2.8890701468189235, + "grad_norm": 1.9222708940505981, + "learning_rate": 4.475971983802196e-05, + "loss": 0.2268, + "num_input_tokens_seen": 38242096, + "step": 17710 + }, + { + "epoch": 2.8898858075040783, + "grad_norm": 0.07726768404245377, + "learning_rate": 4.475535852175539e-05, + "loss": 0.0642, + "num_input_tokens_seen": 38252624, + "step": 17715 + }, + { + "epoch": 2.890701468189233, + "grad_norm": 0.7972289323806763, + "learning_rate": 4.475099560401922e-05, + "loss": 0.1099, + "num_input_tokens_seen": 38262864, + "step": 17720 + }, + { + "epoch": 2.8915171288743884, + "grad_norm": 0.1802748292684555, + "learning_rate": 4.474663108516713e-05, + "loss": 0.0565, + "num_input_tokens_seen": 38274640, + "step": 17725 + }, + { + "epoch": 2.892332789559543, + "grad_norm": 0.053632523864507675, + "learning_rate": 4.474226496555293e-05, + "loss": 0.0377, + "num_input_tokens_seen": 38286512, + "step": 17730 + }, + { + "epoch": 2.8931484502446985, + "grad_norm": 0.03768714889883995, + "learning_rate": 4.473789724553056e-05, + "loss": 0.0843, + "num_input_tokens_seen": 38297456, + "step": 17735 + }, + { + "epoch": 2.8939641109298533, + "grad_norm": 1.2284891605377197, + "learning_rate": 4.473352792545409e-05, + "loss": 0.1616, + "num_input_tokens_seen": 38308304, + "step": 17740 + }, + { + "epoch": 2.894779771615008, + "grad_norm": 0.7691360116004944, + "learning_rate": 4.4729157005677724e-05, + "loss": 0.2649, + "num_input_tokens_seen": 38318640, + "step": 17745 + }, + { + "epoch": 2.895595432300163, + "grad_norm": 0.6107876896858215, + "learning_rate": 4.472478448655578e-05, + "loss": 0.0542, + "num_input_tokens_seen": 38330320, + "step": 17750 + }, + { + "epoch": 2.896411092985318, + "grad_norm": 0.3791916072368622, + "learning_rate": 4.4720410368442724e-05, + "loss": 0.0541, + "num_input_tokens_seen": 38340432, + "step": 17755 + }, + { + "epoch": 2.897226753670473, + "grad_norm": 0.8735598921775818, + "learning_rate": 4.471603465169314e-05, + "loss": 0.1175, + "num_input_tokens_seen": 38351760, + "step": 17760 + }, + { + "epoch": 2.8980424143556283, + "grad_norm": 0.37647947669029236, + "learning_rate": 4.471165733666176e-05, + "loss": 0.0598, + "num_input_tokens_seen": 38361136, + "step": 17765 + }, + { + "epoch": 2.898858075040783, + "grad_norm": 0.11474911868572235, + "learning_rate": 4.4707278423703415e-05, + "loss": 0.0343, + "num_input_tokens_seen": 38371408, + "step": 17770 + }, + { + "epoch": 2.899673735725938, + "grad_norm": 1.1632682085037231, + "learning_rate": 4.470289791317308e-05, + "loss": 0.0996, + "num_input_tokens_seen": 38383056, + "step": 17775 + }, + { + "epoch": 2.9004893964110927, + "grad_norm": 0.34774431586265564, + "learning_rate": 4.4698515805425876e-05, + "loss": 0.0613, + "num_input_tokens_seen": 38393840, + "step": 17780 + }, + { + "epoch": 2.901305057096248, + "grad_norm": 0.3920598328113556, + "learning_rate": 4.469413210081703e-05, + "loss": 0.0726, + "num_input_tokens_seen": 38406096, + "step": 17785 + }, + { + "epoch": 2.902120717781403, + "grad_norm": 0.3049202859401703, + "learning_rate": 4.468974679970191e-05, + "loss": 0.1265, + "num_input_tokens_seen": 38416912, + "step": 17790 + }, + { + "epoch": 2.902936378466558, + "grad_norm": 1.6396926641464233, + "learning_rate": 4.468535990243601e-05, + "loss": 0.1378, + "num_input_tokens_seen": 38428016, + "step": 17795 + }, + { + "epoch": 2.903752039151713, + "grad_norm": 0.9648082852363586, + "learning_rate": 4.468097140937495e-05, + "loss": 0.1378, + "num_input_tokens_seen": 38438480, + "step": 17800 + }, + { + "epoch": 2.9045676998368677, + "grad_norm": 0.67011559009552, + "learning_rate": 4.467658132087449e-05, + "loss": 0.0535, + "num_input_tokens_seen": 38448112, + "step": 17805 + }, + { + "epoch": 2.905383360522023, + "grad_norm": 0.7131406664848328, + "learning_rate": 4.4672189637290505e-05, + "loss": 0.2097, + "num_input_tokens_seen": 38457872, + "step": 17810 + }, + { + "epoch": 2.9061990212071778, + "grad_norm": 0.26138898730278015, + "learning_rate": 4.466779635897902e-05, + "loss": 0.1453, + "num_input_tokens_seen": 38469040, + "step": 17815 + }, + { + "epoch": 2.907014681892333, + "grad_norm": 2.393496036529541, + "learning_rate": 4.466340148629617e-05, + "loss": 0.2414, + "num_input_tokens_seen": 38479344, + "step": 17820 + }, + { + "epoch": 2.907830342577488, + "grad_norm": 1.7140467166900635, + "learning_rate": 4.465900501959822e-05, + "loss": 0.1397, + "num_input_tokens_seen": 38489552, + "step": 17825 + }, + { + "epoch": 2.9086460032626427, + "grad_norm": 0.8202376365661621, + "learning_rate": 4.465460695924157e-05, + "loss": 0.158, + "num_input_tokens_seen": 38500752, + "step": 17830 + }, + { + "epoch": 2.9094616639477975, + "grad_norm": 1.6571744680404663, + "learning_rate": 4.4650207305582756e-05, + "loss": 0.1684, + "num_input_tokens_seen": 38512432, + "step": 17835 + }, + { + "epoch": 2.9102773246329527, + "grad_norm": 0.02976834774017334, + "learning_rate": 4.464580605897844e-05, + "loss": 0.0963, + "num_input_tokens_seen": 38522256, + "step": 17840 + }, + { + "epoch": 2.9110929853181076, + "grad_norm": 0.35234424471855164, + "learning_rate": 4.4641403219785396e-05, + "loss": 0.1544, + "num_input_tokens_seen": 38532784, + "step": 17845 + }, + { + "epoch": 2.911908646003263, + "grad_norm": 1.7993569374084473, + "learning_rate": 4.463699878836055e-05, + "loss": 0.2406, + "num_input_tokens_seen": 38542640, + "step": 17850 + }, + { + "epoch": 2.9127243066884176, + "grad_norm": 0.9576764106750488, + "learning_rate": 4.463259276506095e-05, + "loss": 0.1692, + "num_input_tokens_seen": 38553008, + "step": 17855 + }, + { + "epoch": 2.9135399673735725, + "grad_norm": 1.0256035327911377, + "learning_rate": 4.462818515024376e-05, + "loss": 0.2576, + "num_input_tokens_seen": 38565200, + "step": 17860 + }, + { + "epoch": 2.9143556280587277, + "grad_norm": 0.21867474913597107, + "learning_rate": 4.462377594426629e-05, + "loss": 0.1472, + "num_input_tokens_seen": 38574736, + "step": 17865 + }, + { + "epoch": 2.9151712887438825, + "grad_norm": 0.2697612941265106, + "learning_rate": 4.461936514748597e-05, + "loss": 0.0342, + "num_input_tokens_seen": 38584496, + "step": 17870 + }, + { + "epoch": 2.9159869494290374, + "grad_norm": 0.09283608198165894, + "learning_rate": 4.4614952760260366e-05, + "loss": 0.1266, + "num_input_tokens_seen": 38594448, + "step": 17875 + }, + { + "epoch": 2.9168026101141926, + "grad_norm": 0.14076977968215942, + "learning_rate": 4.4610538782947166e-05, + "loss": 0.0805, + "num_input_tokens_seen": 38604976, + "step": 17880 + }, + { + "epoch": 2.9176182707993474, + "grad_norm": 0.14565713703632355, + "learning_rate": 4.460612321590419e-05, + "loss": 0.2369, + "num_input_tokens_seen": 38615792, + "step": 17885 + }, + { + "epoch": 2.9184339314845023, + "grad_norm": 1.3420040607452393, + "learning_rate": 4.460170605948939e-05, + "loss": 0.2022, + "num_input_tokens_seen": 38626320, + "step": 17890 + }, + { + "epoch": 2.9192495921696575, + "grad_norm": 0.5683112740516663, + "learning_rate": 4.459728731406083e-05, + "loss": 0.1128, + "num_input_tokens_seen": 38637008, + "step": 17895 + }, + { + "epoch": 2.9200652528548123, + "grad_norm": 0.7237491607666016, + "learning_rate": 4.4592866979976725e-05, + "loss": 0.0709, + "num_input_tokens_seen": 38647056, + "step": 17900 + }, + { + "epoch": 2.9208809135399676, + "grad_norm": 1.4114866256713867, + "learning_rate": 4.458844505759542e-05, + "loss": 0.3881, + "num_input_tokens_seen": 38658512, + "step": 17905 + }, + { + "epoch": 2.9216965742251224, + "grad_norm": 0.4005456864833832, + "learning_rate": 4.4584021547275356e-05, + "loss": 0.142, + "num_input_tokens_seen": 38669328, + "step": 17910 + }, + { + "epoch": 2.9225122349102772, + "grad_norm": 0.49808740615844727, + "learning_rate": 4.457959644937514e-05, + "loss": 0.1171, + "num_input_tokens_seen": 38680048, + "step": 17915 + }, + { + "epoch": 2.923327895595432, + "grad_norm": 0.3495057225227356, + "learning_rate": 4.457516976425349e-05, + "loss": 0.2552, + "num_input_tokens_seen": 38691952, + "step": 17920 + }, + { + "epoch": 2.9241435562805873, + "grad_norm": 0.19406715035438538, + "learning_rate": 4.457074149226926e-05, + "loss": 0.0906, + "num_input_tokens_seen": 38701872, + "step": 17925 + }, + { + "epoch": 2.924959216965742, + "grad_norm": 0.05754147842526436, + "learning_rate": 4.456631163378142e-05, + "loss": 0.0483, + "num_input_tokens_seen": 38713104, + "step": 17930 + }, + { + "epoch": 2.9257748776508974, + "grad_norm": 0.24740785360336304, + "learning_rate": 4.456188018914908e-05, + "loss": 0.1836, + "num_input_tokens_seen": 38723952, + "step": 17935 + }, + { + "epoch": 2.926590538336052, + "grad_norm": 0.14805801212787628, + "learning_rate": 4.455744715873148e-05, + "loss": 0.0525, + "num_input_tokens_seen": 38735216, + "step": 17940 + }, + { + "epoch": 2.927406199021207, + "grad_norm": 0.21716822683811188, + "learning_rate": 4.455301254288797e-05, + "loss": 0.1111, + "num_input_tokens_seen": 38746032, + "step": 17945 + }, + { + "epoch": 2.9282218597063623, + "grad_norm": 0.9031400084495544, + "learning_rate": 4.454857634197806e-05, + "loss": 0.1799, + "num_input_tokens_seen": 38756464, + "step": 17950 + }, + { + "epoch": 2.929037520391517, + "grad_norm": 0.5946336984634399, + "learning_rate": 4.4544138556361364e-05, + "loss": 0.1794, + "num_input_tokens_seen": 38767952, + "step": 17955 + }, + { + "epoch": 2.9298531810766724, + "grad_norm": 0.1019158884882927, + "learning_rate": 4.453969918639763e-05, + "loss": 0.0653, + "num_input_tokens_seen": 38779312, + "step": 17960 + }, + { + "epoch": 2.930668841761827, + "grad_norm": 0.15273791551589966, + "learning_rate": 4.453525823244673e-05, + "loss": 0.0604, + "num_input_tokens_seen": 38790576, + "step": 17965 + }, + { + "epoch": 2.931484502446982, + "grad_norm": 0.45562729239463806, + "learning_rate": 4.453081569486869e-05, + "loss": 0.1013, + "num_input_tokens_seen": 38802032, + "step": 17970 + }, + { + "epoch": 2.932300163132137, + "grad_norm": 1.308039903640747, + "learning_rate": 4.452637157402362e-05, + "loss": 0.2619, + "num_input_tokens_seen": 38812080, + "step": 17975 + }, + { + "epoch": 2.933115823817292, + "grad_norm": 1.3412467241287231, + "learning_rate": 4.45219258702718e-05, + "loss": 0.2176, + "num_input_tokens_seen": 38823664, + "step": 17980 + }, + { + "epoch": 2.933931484502447, + "grad_norm": 1.9245426654815674, + "learning_rate": 4.451747858397361e-05, + "loss": 0.1315, + "num_input_tokens_seen": 38833840, + "step": 17985 + }, + { + "epoch": 2.934747145187602, + "grad_norm": 1.2497422695159912, + "learning_rate": 4.451302971548958e-05, + "loss": 0.1884, + "num_input_tokens_seen": 38844400, + "step": 17990 + }, + { + "epoch": 2.935562805872757, + "grad_norm": 0.22795526683330536, + "learning_rate": 4.450857926518035e-05, + "loss": 0.0926, + "num_input_tokens_seen": 38855248, + "step": 17995 + }, + { + "epoch": 2.936378466557912, + "grad_norm": 0.1688574254512787, + "learning_rate": 4.45041272334067e-05, + "loss": 0.1007, + "num_input_tokens_seen": 38867472, + "step": 18000 + }, + { + "epoch": 2.9371941272430666, + "grad_norm": 0.3018284738063812, + "learning_rate": 4.449967362052954e-05, + "loss": 0.0798, + "num_input_tokens_seen": 38878864, + "step": 18005 + }, + { + "epoch": 2.938009787928222, + "grad_norm": 1.7203642129898071, + "learning_rate": 4.449521842690989e-05, + "loss": 0.1818, + "num_input_tokens_seen": 38890032, + "step": 18010 + }, + { + "epoch": 2.9388254486133767, + "grad_norm": 0.204995259642601, + "learning_rate": 4.449076165290892e-05, + "loss": 0.1107, + "num_input_tokens_seen": 38900624, + "step": 18015 + }, + { + "epoch": 2.939641109298532, + "grad_norm": 0.07036878168582916, + "learning_rate": 4.448630329888791e-05, + "loss": 0.0529, + "num_input_tokens_seen": 38911984, + "step": 18020 + }, + { + "epoch": 2.9404567699836868, + "grad_norm": 0.7506133317947388, + "learning_rate": 4.448184336520829e-05, + "loss": 0.0724, + "num_input_tokens_seen": 38922896, + "step": 18025 + }, + { + "epoch": 2.9412724306688416, + "grad_norm": 2.3020055294036865, + "learning_rate": 4.447738185223158e-05, + "loss": 0.2738, + "num_input_tokens_seen": 38933872, + "step": 18030 + }, + { + "epoch": 2.942088091353997, + "grad_norm": 0.6001795530319214, + "learning_rate": 4.447291876031949e-05, + "loss": 0.0467, + "num_input_tokens_seen": 38943088, + "step": 18035 + }, + { + "epoch": 2.9429037520391517, + "grad_norm": 1.334251880645752, + "learning_rate": 4.446845408983379e-05, + "loss": 0.2284, + "num_input_tokens_seen": 38954064, + "step": 18040 + }, + { + "epoch": 2.943719412724307, + "grad_norm": 0.5289679765701294, + "learning_rate": 4.446398784113642e-05, + "loss": 0.0909, + "num_input_tokens_seen": 38965904, + "step": 18045 + }, + { + "epoch": 2.9445350734094617, + "grad_norm": 0.5277372002601624, + "learning_rate": 4.445952001458944e-05, + "loss": 0.1179, + "num_input_tokens_seen": 38976080, + "step": 18050 + }, + { + "epoch": 2.9453507340946166, + "grad_norm": 0.1693275421857834, + "learning_rate": 4.445505061055503e-05, + "loss": 0.0849, + "num_input_tokens_seen": 38987344, + "step": 18055 + }, + { + "epoch": 2.9461663947797714, + "grad_norm": 3.4813265800476074, + "learning_rate": 4.44505796293955e-05, + "loss": 0.1043, + "num_input_tokens_seen": 38999472, + "step": 18060 + }, + { + "epoch": 2.9469820554649266, + "grad_norm": 0.15672938525676727, + "learning_rate": 4.44461070714733e-05, + "loss": 0.1303, + "num_input_tokens_seen": 39010352, + "step": 18065 + }, + { + "epoch": 2.9477977161500815, + "grad_norm": 0.11651710420846939, + "learning_rate": 4.4441632937150984e-05, + "loss": 0.1348, + "num_input_tokens_seen": 39020816, + "step": 18070 + }, + { + "epoch": 2.9486133768352367, + "grad_norm": 0.8391134142875671, + "learning_rate": 4.443715722679126e-05, + "loss": 0.1101, + "num_input_tokens_seen": 39032112, + "step": 18075 + }, + { + "epoch": 2.9494290375203915, + "grad_norm": 0.058513179421424866, + "learning_rate": 4.443267994075695e-05, + "loss": 0.0271, + "num_input_tokens_seen": 39043536, + "step": 18080 + }, + { + "epoch": 2.9502446982055464, + "grad_norm": 0.7790514826774597, + "learning_rate": 4.4428201079411004e-05, + "loss": 0.1408, + "num_input_tokens_seen": 39053232, + "step": 18085 + }, + { + "epoch": 2.9510603588907016, + "grad_norm": 0.5750375986099243, + "learning_rate": 4.4423720643116495e-05, + "loss": 0.04, + "num_input_tokens_seen": 39064368, + "step": 18090 + }, + { + "epoch": 2.9518760195758564, + "grad_norm": 0.3989630937576294, + "learning_rate": 4.441923863223664e-05, + "loss": 0.3298, + "num_input_tokens_seen": 39072560, + "step": 18095 + }, + { + "epoch": 2.9526916802610113, + "grad_norm": 0.6084967255592346, + "learning_rate": 4.441475504713477e-05, + "loss": 0.1174, + "num_input_tokens_seen": 39084080, + "step": 18100 + }, + { + "epoch": 2.9535073409461665, + "grad_norm": 0.8417239785194397, + "learning_rate": 4.4410269888174346e-05, + "loss": 0.1826, + "num_input_tokens_seen": 39095056, + "step": 18105 + }, + { + "epoch": 2.9543230016313213, + "grad_norm": 0.14589500427246094, + "learning_rate": 4.440578315571896e-05, + "loss": 0.1364, + "num_input_tokens_seen": 39106000, + "step": 18110 + }, + { + "epoch": 2.955138662316476, + "grad_norm": 0.9415616989135742, + "learning_rate": 4.4401294850132324e-05, + "loss": 0.2401, + "num_input_tokens_seen": 39115760, + "step": 18115 + }, + { + "epoch": 2.9559543230016314, + "grad_norm": 2.6689510345458984, + "learning_rate": 4.439680497177829e-05, + "loss": 0.2381, + "num_input_tokens_seen": 39126160, + "step": 18120 + }, + { + "epoch": 2.9567699836867862, + "grad_norm": 0.5960155725479126, + "learning_rate": 4.439231352102082e-05, + "loss": 0.0914, + "num_input_tokens_seen": 39137264, + "step": 18125 + }, + { + "epoch": 2.9575856443719415, + "grad_norm": 1.2798892259597778, + "learning_rate": 4.438782049822403e-05, + "loss": 0.09, + "num_input_tokens_seen": 39147824, + "step": 18130 + }, + { + "epoch": 2.9584013050570963, + "grad_norm": 0.19436176121234894, + "learning_rate": 4.4383325903752124e-05, + "loss": 0.1968, + "num_input_tokens_seen": 39158768, + "step": 18135 + }, + { + "epoch": 2.959216965742251, + "grad_norm": 0.641007125377655, + "learning_rate": 4.437882973796948e-05, + "loss": 0.0911, + "num_input_tokens_seen": 39168752, + "step": 18140 + }, + { + "epoch": 2.960032626427406, + "grad_norm": 0.25964975357055664, + "learning_rate": 4.437433200124057e-05, + "loss": 0.0314, + "num_input_tokens_seen": 39180528, + "step": 18145 + }, + { + "epoch": 2.960848287112561, + "grad_norm": 1.0983214378356934, + "learning_rate": 4.4369832693930005e-05, + "loss": 0.2843, + "num_input_tokens_seen": 39192336, + "step": 18150 + }, + { + "epoch": 2.961663947797716, + "grad_norm": 0.6376581788063049, + "learning_rate": 4.436533181640252e-05, + "loss": 0.1074, + "num_input_tokens_seen": 39203056, + "step": 18155 + }, + { + "epoch": 2.9624796084828713, + "grad_norm": 0.30192241072654724, + "learning_rate": 4.436082936902297e-05, + "loss": 0.1225, + "num_input_tokens_seen": 39213104, + "step": 18160 + }, + { + "epoch": 2.963295269168026, + "grad_norm": 1.4236564636230469, + "learning_rate": 4.435632535215637e-05, + "loss": 0.2519, + "num_input_tokens_seen": 39223792, + "step": 18165 + }, + { + "epoch": 2.964110929853181, + "grad_norm": 2.221045732498169, + "learning_rate": 4.435181976616781e-05, + "loss": 0.1274, + "num_input_tokens_seen": 39235248, + "step": 18170 + }, + { + "epoch": 2.964926590538336, + "grad_norm": 0.05879782885313034, + "learning_rate": 4.4347312611422555e-05, + "loss": 0.1437, + "num_input_tokens_seen": 39245424, + "step": 18175 + }, + { + "epoch": 2.965742251223491, + "grad_norm": 0.3388291299343109, + "learning_rate": 4.434280388828598e-05, + "loss": 0.1927, + "num_input_tokens_seen": 39255344, + "step": 18180 + }, + { + "epoch": 2.9665579119086463, + "grad_norm": 1.3704702854156494, + "learning_rate": 4.433829359712356e-05, + "loss": 0.1153, + "num_input_tokens_seen": 39266192, + "step": 18185 + }, + { + "epoch": 2.967373572593801, + "grad_norm": 0.9216246008872986, + "learning_rate": 4.4333781738300954e-05, + "loss": 0.1998, + "num_input_tokens_seen": 39277040, + "step": 18190 + }, + { + "epoch": 2.968189233278956, + "grad_norm": 1.8098597526550293, + "learning_rate": 4.43292683121839e-05, + "loss": 0.1817, + "num_input_tokens_seen": 39286160, + "step": 18195 + }, + { + "epoch": 2.9690048939641107, + "grad_norm": 0.3070986866950989, + "learning_rate": 4.432475331913828e-05, + "loss": 0.0585, + "num_input_tokens_seen": 39296496, + "step": 18200 + }, + { + "epoch": 2.969820554649266, + "grad_norm": 1.2749452590942383, + "learning_rate": 4.4320236759530095e-05, + "loss": 0.1595, + "num_input_tokens_seen": 39306672, + "step": 18205 + }, + { + "epoch": 2.970636215334421, + "grad_norm": 0.11077670753002167, + "learning_rate": 4.43157186337255e-05, + "loss": 0.036, + "num_input_tokens_seen": 39316816, + "step": 18210 + }, + { + "epoch": 2.971451876019576, + "grad_norm": 0.6073323488235474, + "learning_rate": 4.431119894209074e-05, + "loss": 0.1586, + "num_input_tokens_seen": 39327984, + "step": 18215 + }, + { + "epoch": 2.972267536704731, + "grad_norm": 0.4559842348098755, + "learning_rate": 4.430667768499221e-05, + "loss": 0.1295, + "num_input_tokens_seen": 39338832, + "step": 18220 + }, + { + "epoch": 2.9730831973898857, + "grad_norm": 0.4654099643230438, + "learning_rate": 4.4302154862796425e-05, + "loss": 0.1717, + "num_input_tokens_seen": 39349104, + "step": 18225 + }, + { + "epoch": 2.9738988580750405, + "grad_norm": 1.4881434440612793, + "learning_rate": 4.4297630475870025e-05, + "loss": 0.358, + "num_input_tokens_seen": 39358640, + "step": 18230 + }, + { + "epoch": 2.9747145187601958, + "grad_norm": 0.23656068742275238, + "learning_rate": 4.429310452457979e-05, + "loss": 0.0992, + "num_input_tokens_seen": 39368688, + "step": 18235 + }, + { + "epoch": 2.9755301794453506, + "grad_norm": 0.9751746654510498, + "learning_rate": 4.428857700929261e-05, + "loss": 0.1168, + "num_input_tokens_seen": 39378224, + "step": 18240 + }, + { + "epoch": 2.976345840130506, + "grad_norm": 0.42957207560539246, + "learning_rate": 4.428404793037551e-05, + "loss": 0.0751, + "num_input_tokens_seen": 39388368, + "step": 18245 + }, + { + "epoch": 2.9771615008156607, + "grad_norm": 1.0870577096939087, + "learning_rate": 4.427951728819564e-05, + "loss": 0.1384, + "num_input_tokens_seen": 39399120, + "step": 18250 + }, + { + "epoch": 2.9779771615008155, + "grad_norm": 0.3751599192619324, + "learning_rate": 4.427498508312026e-05, + "loss": 0.1041, + "num_input_tokens_seen": 39408560, + "step": 18255 + }, + { + "epoch": 2.9787928221859707, + "grad_norm": 0.41700828075408936, + "learning_rate": 4.4270451315516807e-05, + "loss": 0.1222, + "num_input_tokens_seen": 39419664, + "step": 18260 + }, + { + "epoch": 2.9796084828711256, + "grad_norm": 0.9439231157302856, + "learning_rate": 4.426591598575278e-05, + "loss": 0.0679, + "num_input_tokens_seen": 39430480, + "step": 18265 + }, + { + "epoch": 2.980424143556281, + "grad_norm": 0.34425288438796997, + "learning_rate": 4.4261379094195856e-05, + "loss": 0.0901, + "num_input_tokens_seen": 39441008, + "step": 18270 + }, + { + "epoch": 2.9812398042414356, + "grad_norm": 0.7729831337928772, + "learning_rate": 4.425684064121381e-05, + "loss": 0.0837, + "num_input_tokens_seen": 39451984, + "step": 18275 + }, + { + "epoch": 2.9820554649265905, + "grad_norm": 1.4117343425750732, + "learning_rate": 4.425230062717455e-05, + "loss": 0.2798, + "num_input_tokens_seen": 39461904, + "step": 18280 + }, + { + "epoch": 2.9828711256117453, + "grad_norm": 0.3930506408214569, + "learning_rate": 4.424775905244612e-05, + "loss": 0.045, + "num_input_tokens_seen": 39472656, + "step": 18285 + }, + { + "epoch": 2.9836867862969005, + "grad_norm": 0.11412102729082108, + "learning_rate": 4.424321591739668e-05, + "loss": 0.1971, + "num_input_tokens_seen": 39483376, + "step": 18290 + }, + { + "epoch": 2.9845024469820554, + "grad_norm": 0.3438469469547272, + "learning_rate": 4.423867122239451e-05, + "loss": 0.0845, + "num_input_tokens_seen": 39496144, + "step": 18295 + }, + { + "epoch": 2.9853181076672106, + "grad_norm": 0.36180293560028076, + "learning_rate": 4.4234124967808044e-05, + "loss": 0.2112, + "num_input_tokens_seen": 39506128, + "step": 18300 + }, + { + "epoch": 2.9861337683523654, + "grad_norm": 1.2927416563034058, + "learning_rate": 4.4229577154005814e-05, + "loss": 0.4327, + "num_input_tokens_seen": 39515440, + "step": 18305 + }, + { + "epoch": 2.9869494290375203, + "grad_norm": 0.14284566044807434, + "learning_rate": 4.4225027781356484e-05, + "loss": 0.0432, + "num_input_tokens_seen": 39526512, + "step": 18310 + }, + { + "epoch": 2.9877650897226755, + "grad_norm": 0.05157662183046341, + "learning_rate": 4.4220476850228866e-05, + "loss": 0.0289, + "num_input_tokens_seen": 39537136, + "step": 18315 + }, + { + "epoch": 2.9885807504078303, + "grad_norm": 0.4630555808544159, + "learning_rate": 4.421592436099186e-05, + "loss": 0.0852, + "num_input_tokens_seen": 39545808, + "step": 18320 + }, + { + "epoch": 2.9893964110929856, + "grad_norm": 0.6020138263702393, + "learning_rate": 4.4211370314014534e-05, + "loss": 0.0639, + "num_input_tokens_seen": 39558000, + "step": 18325 + }, + { + "epoch": 2.9902120717781404, + "grad_norm": 0.36315277218818665, + "learning_rate": 4.4206814709666046e-05, + "loss": 0.1977, + "num_input_tokens_seen": 39570000, + "step": 18330 + }, + { + "epoch": 2.9910277324632952, + "grad_norm": 0.2751581370830536, + "learning_rate": 4.4202257548315704e-05, + "loss": 0.1383, + "num_input_tokens_seen": 39580560, + "step": 18335 + }, + { + "epoch": 2.99184339314845, + "grad_norm": 0.7463333010673523, + "learning_rate": 4.4197698830332934e-05, + "loss": 0.0385, + "num_input_tokens_seen": 39591408, + "step": 18340 + }, + { + "epoch": 2.9926590538336053, + "grad_norm": 1.3330538272857666, + "learning_rate": 4.419313855608729e-05, + "loss": 0.0925, + "num_input_tokens_seen": 39601872, + "step": 18345 + }, + { + "epoch": 2.99347471451876, + "grad_norm": 0.4952561557292938, + "learning_rate": 4.418857672594845e-05, + "loss": 0.0804, + "num_input_tokens_seen": 39613008, + "step": 18350 + }, + { + "epoch": 2.9942903752039154, + "grad_norm": 0.7994943261146545, + "learning_rate": 4.4184013340286215e-05, + "loss": 0.1065, + "num_input_tokens_seen": 39624272, + "step": 18355 + }, + { + "epoch": 2.99510603588907, + "grad_norm": 0.23224081099033356, + "learning_rate": 4.417944839947053e-05, + "loss": 0.0792, + "num_input_tokens_seen": 39635216, + "step": 18360 + }, + { + "epoch": 2.995921696574225, + "grad_norm": 0.45264649391174316, + "learning_rate": 4.417488190387144e-05, + "loss": 0.1112, + "num_input_tokens_seen": 39645808, + "step": 18365 + }, + { + "epoch": 2.99673735725938, + "grad_norm": 0.3124370872974396, + "learning_rate": 4.417031385385911e-05, + "loss": 0.1439, + "num_input_tokens_seen": 39656560, + "step": 18370 + }, + { + "epoch": 2.997553017944535, + "grad_norm": 1.0669280290603638, + "learning_rate": 4.416574424980389e-05, + "loss": 0.0778, + "num_input_tokens_seen": 39666704, + "step": 18375 + }, + { + "epoch": 2.99836867862969, + "grad_norm": 1.818014144897461, + "learning_rate": 4.4161173092076194e-05, + "loss": 0.0956, + "num_input_tokens_seen": 39677008, + "step": 18380 + }, + { + "epoch": 2.999184339314845, + "grad_norm": 0.46440309286117554, + "learning_rate": 4.415660038104658e-05, + "loss": 0.0398, + "num_input_tokens_seen": 39686416, + "step": 18385 + }, + { + "epoch": 3.0, + "grad_norm": 0.01660398580133915, + "learning_rate": 4.4152026117085735e-05, + "loss": 0.1203, + "num_input_tokens_seen": 39694112, + "step": 18390 + }, + { + "epoch": 3.0, + "eval_loss": 0.1438286006450653, + "eval_runtime": 132.0097, + "eval_samples_per_second": 20.642, + "eval_steps_per_second": 5.166, + "num_input_tokens_seen": 39694112, + "step": 18390 + }, + { + "epoch": 3.000815660685155, + "grad_norm": 0.08183811604976654, + "learning_rate": 4.4147450300564485e-05, + "loss": 0.018, + "num_input_tokens_seen": 39703936, + "step": 18395 + }, + { + "epoch": 3.00163132137031, + "grad_norm": 0.08592738956212997, + "learning_rate": 4.414287293185376e-05, + "loss": 0.0707, + "num_input_tokens_seen": 39715968, + "step": 18400 + }, + { + "epoch": 3.002446982055465, + "grad_norm": 0.31839555501937866, + "learning_rate": 4.413829401132462e-05, + "loss": 0.1118, + "num_input_tokens_seen": 39726912, + "step": 18405 + }, + { + "epoch": 3.0032626427406197, + "grad_norm": 1.4366300106048584, + "learning_rate": 4.4133713539348266e-05, + "loss": 0.235, + "num_input_tokens_seen": 39737504, + "step": 18410 + }, + { + "epoch": 3.004078303425775, + "grad_norm": 2.085970163345337, + "learning_rate": 4.4129131516296006e-05, + "loss": 0.3727, + "num_input_tokens_seen": 39749792, + "step": 18415 + }, + { + "epoch": 3.00489396411093, + "grad_norm": 1.1832972764968872, + "learning_rate": 4.412454794253928e-05, + "loss": 0.0645, + "num_input_tokens_seen": 39760928, + "step": 18420 + }, + { + "epoch": 3.0057096247960846, + "grad_norm": 1.4493836164474487, + "learning_rate": 4.4119962818449665e-05, + "loss": 0.0699, + "num_input_tokens_seen": 39772096, + "step": 18425 + }, + { + "epoch": 3.00652528548124, + "grad_norm": 0.401859849691391, + "learning_rate": 4.411537614439886e-05, + "loss": 0.0791, + "num_input_tokens_seen": 39782528, + "step": 18430 + }, + { + "epoch": 3.0073409461663947, + "grad_norm": 0.1731359213590622, + "learning_rate": 4.4110787920758663e-05, + "loss": 0.0118, + "num_input_tokens_seen": 39794336, + "step": 18435 + }, + { + "epoch": 3.00815660685155, + "grad_norm": 1.5221755504608154, + "learning_rate": 4.4106198147901035e-05, + "loss": 0.227, + "num_input_tokens_seen": 39805376, + "step": 18440 + }, + { + "epoch": 3.0089722675367048, + "grad_norm": 1.9144115447998047, + "learning_rate": 4.410160682619803e-05, + "loss": 0.1314, + "num_input_tokens_seen": 39816768, + "step": 18445 + }, + { + "epoch": 3.0097879282218596, + "grad_norm": 0.1511722207069397, + "learning_rate": 4.409701395602187e-05, + "loss": 0.1877, + "num_input_tokens_seen": 39827520, + "step": 18450 + }, + { + "epoch": 3.010603588907015, + "grad_norm": 0.3139670789241791, + "learning_rate": 4.4092419537744854e-05, + "loss": 0.1541, + "num_input_tokens_seen": 39839072, + "step": 18455 + }, + { + "epoch": 3.0114192495921697, + "grad_norm": 0.42930933833122253, + "learning_rate": 4.408782357173944e-05, + "loss": 0.1131, + "num_input_tokens_seen": 39850432, + "step": 18460 + }, + { + "epoch": 3.0122349102773245, + "grad_norm": 0.06151362136006355, + "learning_rate": 4.408322605837819e-05, + "loss": 0.088, + "num_input_tokens_seen": 39861600, + "step": 18465 + }, + { + "epoch": 3.0130505709624797, + "grad_norm": 0.9429694414138794, + "learning_rate": 4.407862699803381e-05, + "loss": 0.1052, + "num_input_tokens_seen": 39872288, + "step": 18470 + }, + { + "epoch": 3.0138662316476346, + "grad_norm": 0.08354208618402481, + "learning_rate": 4.4074026391079126e-05, + "loss": 0.2913, + "num_input_tokens_seen": 39881984, + "step": 18475 + }, + { + "epoch": 3.0146818923327894, + "grad_norm": 0.042720261961221695, + "learning_rate": 4.406942423788708e-05, + "loss": 0.0653, + "num_input_tokens_seen": 39893184, + "step": 18480 + }, + { + "epoch": 3.0154975530179446, + "grad_norm": 0.20680686831474304, + "learning_rate": 4.406482053883075e-05, + "loss": 0.0397, + "num_input_tokens_seen": 39903200, + "step": 18485 + }, + { + "epoch": 3.0163132137030995, + "grad_norm": 0.2586461901664734, + "learning_rate": 4.406021529428334e-05, + "loss": 0.1261, + "num_input_tokens_seen": 39913728, + "step": 18490 + }, + { + "epoch": 3.0171288743882543, + "grad_norm": 0.8558509349822998, + "learning_rate": 4.405560850461815e-05, + "loss": 0.0585, + "num_input_tokens_seen": 39924576, + "step": 18495 + }, + { + "epoch": 3.0179445350734095, + "grad_norm": 0.9878755211830139, + "learning_rate": 4.405100017020866e-05, + "loss": 0.1417, + "num_input_tokens_seen": 39935808, + "step": 18500 + }, + { + "epoch": 3.0187601957585644, + "grad_norm": 0.21263906359672546, + "learning_rate": 4.4046390291428426e-05, + "loss": 0.1291, + "num_input_tokens_seen": 39946592, + "step": 18505 + }, + { + "epoch": 3.0195758564437196, + "grad_norm": 0.2276342511177063, + "learning_rate": 4.4041778868651155e-05, + "loss": 0.0488, + "num_input_tokens_seen": 39958592, + "step": 18510 + }, + { + "epoch": 3.0203915171288744, + "grad_norm": 0.9263331890106201, + "learning_rate": 4.4037165902250676e-05, + "loss": 0.1464, + "num_input_tokens_seen": 39969760, + "step": 18515 + }, + { + "epoch": 3.0212071778140293, + "grad_norm": 0.8194901347160339, + "learning_rate": 4.403255139260093e-05, + "loss": 0.1154, + "num_input_tokens_seen": 39980064, + "step": 18520 + }, + { + "epoch": 3.0220228384991845, + "grad_norm": 1.2494325637817383, + "learning_rate": 4.4027935340076005e-05, + "loss": 0.0942, + "num_input_tokens_seen": 39991840, + "step": 18525 + }, + { + "epoch": 3.0228384991843393, + "grad_norm": 0.045885100960731506, + "learning_rate": 4.402331774505009e-05, + "loss": 0.1054, + "num_input_tokens_seen": 40002912, + "step": 18530 + }, + { + "epoch": 3.023654159869494, + "grad_norm": 0.16370925307273865, + "learning_rate": 4.4018698607897515e-05, + "loss": 0.2045, + "num_input_tokens_seen": 40014624, + "step": 18535 + }, + { + "epoch": 3.0244698205546494, + "grad_norm": 0.5207610130310059, + "learning_rate": 4.4014077928992736e-05, + "loss": 0.2794, + "num_input_tokens_seen": 40025408, + "step": 18540 + }, + { + "epoch": 3.0252854812398042, + "grad_norm": 0.715366005897522, + "learning_rate": 4.4009455708710315e-05, + "loss": 0.1203, + "num_input_tokens_seen": 40036352, + "step": 18545 + }, + { + "epoch": 3.026101141924959, + "grad_norm": 0.454301655292511, + "learning_rate": 4.4004831947424967e-05, + "loss": 0.0772, + "num_input_tokens_seen": 40045888, + "step": 18550 + }, + { + "epoch": 3.0269168026101143, + "grad_norm": 0.08731784671545029, + "learning_rate": 4.400020664551151e-05, + "loss": 0.1467, + "num_input_tokens_seen": 40056928, + "step": 18555 + }, + { + "epoch": 3.027732463295269, + "grad_norm": 0.1427256166934967, + "learning_rate": 4.39955798033449e-05, + "loss": 0.0447, + "num_input_tokens_seen": 40067456, + "step": 18560 + }, + { + "epoch": 3.028548123980424, + "grad_norm": 0.18552502989768982, + "learning_rate": 4.399095142130021e-05, + "loss": 0.1158, + "num_input_tokens_seen": 40077760, + "step": 18565 + }, + { + "epoch": 3.029363784665579, + "grad_norm": 0.3631187677383423, + "learning_rate": 4.398632149975263e-05, + "loss": 0.0778, + "num_input_tokens_seen": 40089376, + "step": 18570 + }, + { + "epoch": 3.030179445350734, + "grad_norm": 1.3221142292022705, + "learning_rate": 4.39816900390775e-05, + "loss": 0.1101, + "num_input_tokens_seen": 40100544, + "step": 18575 + }, + { + "epoch": 3.0309951060358893, + "grad_norm": 0.13682685792446136, + "learning_rate": 4.397705703965026e-05, + "loss": 0.0235, + "num_input_tokens_seen": 40111616, + "step": 18580 + }, + { + "epoch": 3.031810766721044, + "grad_norm": 0.4029770791530609, + "learning_rate": 4.397242250184649e-05, + "loss": 0.1962, + "num_input_tokens_seen": 40123712, + "step": 18585 + }, + { + "epoch": 3.032626427406199, + "grad_norm": 0.4951936602592468, + "learning_rate": 4.396778642604188e-05, + "loss": 0.0507, + "num_input_tokens_seen": 40134976, + "step": 18590 + }, + { + "epoch": 3.033442088091354, + "grad_norm": 1.5448215007781982, + "learning_rate": 4.396314881261227e-05, + "loss": 0.1591, + "num_input_tokens_seen": 40144992, + "step": 18595 + }, + { + "epoch": 3.034257748776509, + "grad_norm": 0.03320423513650894, + "learning_rate": 4.39585096619336e-05, + "loss": 0.0448, + "num_input_tokens_seen": 40156928, + "step": 18600 + }, + { + "epoch": 3.035073409461664, + "grad_norm": 0.44172874093055725, + "learning_rate": 4.395386897438194e-05, + "loss": 0.1038, + "num_input_tokens_seen": 40167264, + "step": 18605 + }, + { + "epoch": 3.035889070146819, + "grad_norm": 0.8511459827423096, + "learning_rate": 4.3949226750333484e-05, + "loss": 0.2253, + "num_input_tokens_seen": 40178944, + "step": 18610 + }, + { + "epoch": 3.036704730831974, + "grad_norm": 0.4897553324699402, + "learning_rate": 4.3944582990164565e-05, + "loss": 0.1914, + "num_input_tokens_seen": 40190240, + "step": 18615 + }, + { + "epoch": 3.0375203915171287, + "grad_norm": 1.310648798942566, + "learning_rate": 4.393993769425162e-05, + "loss": 0.1168, + "num_input_tokens_seen": 40201696, + "step": 18620 + }, + { + "epoch": 3.038336052202284, + "grad_norm": 0.22728601098060608, + "learning_rate": 4.393529086297123e-05, + "loss": 0.2142, + "num_input_tokens_seen": 40211424, + "step": 18625 + }, + { + "epoch": 3.039151712887439, + "grad_norm": 1.2889529466629028, + "learning_rate": 4.3930642496700084e-05, + "loss": 0.1307, + "num_input_tokens_seen": 40221888, + "step": 18630 + }, + { + "epoch": 3.0399673735725936, + "grad_norm": 0.13192445039749146, + "learning_rate": 4.392599259581501e-05, + "loss": 0.0609, + "num_input_tokens_seen": 40233120, + "step": 18635 + }, + { + "epoch": 3.040783034257749, + "grad_norm": 1.8392046689987183, + "learning_rate": 4.392134116069294e-05, + "loss": 0.1268, + "num_input_tokens_seen": 40243104, + "step": 18640 + }, + { + "epoch": 3.0415986949429037, + "grad_norm": 0.08789340406656265, + "learning_rate": 4.391668819171095e-05, + "loss": 0.1387, + "num_input_tokens_seen": 40253344, + "step": 18645 + }, + { + "epoch": 3.0424143556280585, + "grad_norm": 1.5099657773971558, + "learning_rate": 4.391203368924623e-05, + "loss": 0.291, + "num_input_tokens_seen": 40265280, + "step": 18650 + }, + { + "epoch": 3.0432300163132138, + "grad_norm": 0.64671391248703, + "learning_rate": 4.3907377653676104e-05, + "loss": 0.1026, + "num_input_tokens_seen": 40276224, + "step": 18655 + }, + { + "epoch": 3.0440456769983686, + "grad_norm": 0.14876797795295715, + "learning_rate": 4.390272008537802e-05, + "loss": 0.2004, + "num_input_tokens_seen": 40287168, + "step": 18660 + }, + { + "epoch": 3.044861337683524, + "grad_norm": 0.8279629945755005, + "learning_rate": 4.3898060984729526e-05, + "loss": 0.0482, + "num_input_tokens_seen": 40297184, + "step": 18665 + }, + { + "epoch": 3.0456769983686787, + "grad_norm": 0.9890760779380798, + "learning_rate": 4.389340035210832e-05, + "loss": 0.0649, + "num_input_tokens_seen": 40308128, + "step": 18670 + }, + { + "epoch": 3.0464926590538335, + "grad_norm": 0.6120040416717529, + "learning_rate": 4.388873818789222e-05, + "loss": 0.0884, + "num_input_tokens_seen": 40320352, + "step": 18675 + }, + { + "epoch": 3.0473083197389887, + "grad_norm": 0.639624834060669, + "learning_rate": 4.388407449245916e-05, + "loss": 0.1798, + "num_input_tokens_seen": 40331424, + "step": 18680 + }, + { + "epoch": 3.0481239804241436, + "grad_norm": 0.41215428709983826, + "learning_rate": 4.38794092661872e-05, + "loss": 0.089, + "num_input_tokens_seen": 40343264, + "step": 18685 + }, + { + "epoch": 3.0489396411092984, + "grad_norm": 1.0802353620529175, + "learning_rate": 4.3874742509454536e-05, + "loss": 0.1074, + "num_input_tokens_seen": 40356608, + "step": 18690 + }, + { + "epoch": 3.0497553017944536, + "grad_norm": 1.400597333908081, + "learning_rate": 4.387007422263948e-05, + "loss": 0.1649, + "num_input_tokens_seen": 40367456, + "step": 18695 + }, + { + "epoch": 3.0505709624796085, + "grad_norm": 1.1025152206420898, + "learning_rate": 4.3865404406120456e-05, + "loss": 0.0807, + "num_input_tokens_seen": 40377728, + "step": 18700 + }, + { + "epoch": 3.0513866231647633, + "grad_norm": 0.6560204029083252, + "learning_rate": 4.3860733060276025e-05, + "loss": 0.0921, + "num_input_tokens_seen": 40389184, + "step": 18705 + }, + { + "epoch": 3.0522022838499185, + "grad_norm": 0.7588626742362976, + "learning_rate": 4.385606018548488e-05, + "loss": 0.0898, + "num_input_tokens_seen": 40400512, + "step": 18710 + }, + { + "epoch": 3.0530179445350734, + "grad_norm": 0.6728503704071045, + "learning_rate": 4.385138578212582e-05, + "loss": 0.1118, + "num_input_tokens_seen": 40411008, + "step": 18715 + }, + { + "epoch": 3.053833605220228, + "grad_norm": 1.480348825454712, + "learning_rate": 4.384670985057778e-05, + "loss": 0.1256, + "num_input_tokens_seen": 40422528, + "step": 18720 + }, + { + "epoch": 3.0546492659053834, + "grad_norm": 0.03438263759016991, + "learning_rate": 4.3842032391219804e-05, + "loss": 0.0864, + "num_input_tokens_seen": 40432384, + "step": 18725 + }, + { + "epoch": 3.0554649265905383, + "grad_norm": 0.15686731040477753, + "learning_rate": 4.3837353404431086e-05, + "loss": 0.2127, + "num_input_tokens_seen": 40443424, + "step": 18730 + }, + { + "epoch": 3.0562805872756935, + "grad_norm": 0.12685614824295044, + "learning_rate": 4.383267289059092e-05, + "loss": 0.0522, + "num_input_tokens_seen": 40453760, + "step": 18735 + }, + { + "epoch": 3.0570962479608483, + "grad_norm": 1.4601800441741943, + "learning_rate": 4.382799085007873e-05, + "loss": 0.1135, + "num_input_tokens_seen": 40464832, + "step": 18740 + }, + { + "epoch": 3.057911908646003, + "grad_norm": 3.748215913772583, + "learning_rate": 4.382330728327407e-05, + "loss": 0.2857, + "num_input_tokens_seen": 40475136, + "step": 18745 + }, + { + "epoch": 3.0587275693311584, + "grad_norm": 1.167231798171997, + "learning_rate": 4.3818622190556624e-05, + "loss": 0.2329, + "num_input_tokens_seen": 40485856, + "step": 18750 + }, + { + "epoch": 3.0595432300163132, + "grad_norm": 0.3400525152683258, + "learning_rate": 4.381393557230617e-05, + "loss": 0.1536, + "num_input_tokens_seen": 40497120, + "step": 18755 + }, + { + "epoch": 3.060358890701468, + "grad_norm": 0.9300292134284973, + "learning_rate": 4.380924742890264e-05, + "loss": 0.0894, + "num_input_tokens_seen": 40508928, + "step": 18760 + }, + { + "epoch": 3.0611745513866233, + "grad_norm": 0.15938006341457367, + "learning_rate": 4.380455776072607e-05, + "loss": 0.1894, + "num_input_tokens_seen": 40519904, + "step": 18765 + }, + { + "epoch": 3.061990212071778, + "grad_norm": 1.8551231622695923, + "learning_rate": 4.3799866568156634e-05, + "loss": 0.2049, + "num_input_tokens_seen": 40529568, + "step": 18770 + }, + { + "epoch": 3.062805872756933, + "grad_norm": 0.7621637582778931, + "learning_rate": 4.379517385157463e-05, + "loss": 0.1291, + "num_input_tokens_seen": 40540512, + "step": 18775 + }, + { + "epoch": 3.063621533442088, + "grad_norm": 0.19398127496242523, + "learning_rate": 4.3790479611360466e-05, + "loss": 0.15, + "num_input_tokens_seen": 40550592, + "step": 18780 + }, + { + "epoch": 3.064437194127243, + "grad_norm": 0.5826227068901062, + "learning_rate": 4.378578384789469e-05, + "loss": 0.0662, + "num_input_tokens_seen": 40561824, + "step": 18785 + }, + { + "epoch": 3.065252854812398, + "grad_norm": 0.46185946464538574, + "learning_rate": 4.378108656155795e-05, + "loss": 0.1018, + "num_input_tokens_seen": 40572416, + "step": 18790 + }, + { + "epoch": 3.066068515497553, + "grad_norm": 1.4502075910568237, + "learning_rate": 4.377638775273104e-05, + "loss": 0.2308, + "num_input_tokens_seen": 40583712, + "step": 18795 + }, + { + "epoch": 3.066884176182708, + "grad_norm": 1.5293116569519043, + "learning_rate": 4.3771687421794866e-05, + "loss": 0.195, + "num_input_tokens_seen": 40594560, + "step": 18800 + }, + { + "epoch": 3.067699836867863, + "grad_norm": 1.3609498739242554, + "learning_rate": 4.3766985569130465e-05, + "loss": 0.1342, + "num_input_tokens_seen": 40605536, + "step": 18805 + }, + { + "epoch": 3.068515497553018, + "grad_norm": 0.528692364692688, + "learning_rate": 4.376228219511899e-05, + "loss": 0.0983, + "num_input_tokens_seen": 40615328, + "step": 18810 + }, + { + "epoch": 3.069331158238173, + "grad_norm": 0.8694379925727844, + "learning_rate": 4.375757730014172e-05, + "loss": 0.0823, + "num_input_tokens_seen": 40625504, + "step": 18815 + }, + { + "epoch": 3.070146818923328, + "grad_norm": 1.3805750608444214, + "learning_rate": 4.3752870884580065e-05, + "loss": 0.2362, + "num_input_tokens_seen": 40636896, + "step": 18820 + }, + { + "epoch": 3.070962479608483, + "grad_norm": 1.1829862594604492, + "learning_rate": 4.374816294881554e-05, + "loss": 0.2166, + "num_input_tokens_seen": 40647456, + "step": 18825 + }, + { + "epoch": 3.0717781402936377, + "grad_norm": 0.7003613114356995, + "learning_rate": 4.37434534932298e-05, + "loss": 0.1304, + "num_input_tokens_seen": 40656832, + "step": 18830 + }, + { + "epoch": 3.072593800978793, + "grad_norm": 1.031459927558899, + "learning_rate": 4.373874251820462e-05, + "loss": 0.0707, + "num_input_tokens_seen": 40666752, + "step": 18835 + }, + { + "epoch": 3.073409461663948, + "grad_norm": 0.9121919274330139, + "learning_rate": 4.3734030024121886e-05, + "loss": 0.0615, + "num_input_tokens_seen": 40678368, + "step": 18840 + }, + { + "epoch": 3.0742251223491026, + "grad_norm": 2.0097029209136963, + "learning_rate": 4.372931601136363e-05, + "loss": 0.234, + "num_input_tokens_seen": 40689632, + "step": 18845 + }, + { + "epoch": 3.075040783034258, + "grad_norm": 0.7301902174949646, + "learning_rate": 4.372460048031198e-05, + "loss": 0.085, + "num_input_tokens_seen": 40700576, + "step": 18850 + }, + { + "epoch": 3.0758564437194127, + "grad_norm": 1.0843689441680908, + "learning_rate": 4.3719883431349206e-05, + "loss": 0.1635, + "num_input_tokens_seen": 40711488, + "step": 18855 + }, + { + "epoch": 3.0766721044045675, + "grad_norm": 0.4920283257961273, + "learning_rate": 4.3715164864857705e-05, + "loss": 0.1525, + "num_input_tokens_seen": 40720832, + "step": 18860 + }, + { + "epoch": 3.0774877650897228, + "grad_norm": 2.3246185779571533, + "learning_rate": 4.371044478121998e-05, + "loss": 0.1649, + "num_input_tokens_seen": 40731200, + "step": 18865 + }, + { + "epoch": 3.0783034257748776, + "grad_norm": 1.1510441303253174, + "learning_rate": 4.370572318081866e-05, + "loss": 0.0979, + "num_input_tokens_seen": 40742432, + "step": 18870 + }, + { + "epoch": 3.0791190864600324, + "grad_norm": 0.24556031823158264, + "learning_rate": 4.3701000064036504e-05, + "loss": 0.2515, + "num_input_tokens_seen": 40753856, + "step": 18875 + }, + { + "epoch": 3.0799347471451877, + "grad_norm": 0.5569616556167603, + "learning_rate": 4.3696275431256405e-05, + "loss": 0.1656, + "num_input_tokens_seen": 40762976, + "step": 18880 + }, + { + "epoch": 3.0807504078303425, + "grad_norm": 1.5287224054336548, + "learning_rate": 4.369154928286134e-05, + "loss": 0.1287, + "num_input_tokens_seen": 40773504, + "step": 18885 + }, + { + "epoch": 3.0815660685154977, + "grad_norm": 0.2851041853427887, + "learning_rate": 4.368682161923447e-05, + "loss": 0.0281, + "num_input_tokens_seen": 40784576, + "step": 18890 + }, + { + "epoch": 3.0823817292006526, + "grad_norm": 0.3248263895511627, + "learning_rate": 4.368209244075901e-05, + "loss": 0.0377, + "num_input_tokens_seen": 40794240, + "step": 18895 + }, + { + "epoch": 3.0831973898858074, + "grad_norm": 0.08944778889417648, + "learning_rate": 4.3677361747818355e-05, + "loss": 0.0678, + "num_input_tokens_seen": 40805600, + "step": 18900 + }, + { + "epoch": 3.0840130505709626, + "grad_norm": 0.3848574459552765, + "learning_rate": 4.3672629540795976e-05, + "loss": 0.1129, + "num_input_tokens_seen": 40816352, + "step": 18905 + }, + { + "epoch": 3.0848287112561175, + "grad_norm": 0.37914204597473145, + "learning_rate": 4.366789582007551e-05, + "loss": 0.0674, + "num_input_tokens_seen": 40826720, + "step": 18910 + }, + { + "epoch": 3.0856443719412723, + "grad_norm": 1.0957794189453125, + "learning_rate": 4.366316058604069e-05, + "loss": 0.208, + "num_input_tokens_seen": 40837216, + "step": 18915 + }, + { + "epoch": 3.0864600326264275, + "grad_norm": 2.306138753890991, + "learning_rate": 4.3658423839075376e-05, + "loss": 0.0791, + "num_input_tokens_seen": 40847904, + "step": 18920 + }, + { + "epoch": 3.0872756933115824, + "grad_norm": 0.18481816351413727, + "learning_rate": 4.3653685579563555e-05, + "loss": 0.1498, + "num_input_tokens_seen": 40858304, + "step": 18925 + }, + { + "epoch": 3.088091353996737, + "grad_norm": 0.2291443645954132, + "learning_rate": 4.364894580788932e-05, + "loss": 0.0621, + "num_input_tokens_seen": 40869088, + "step": 18930 + }, + { + "epoch": 3.0889070146818924, + "grad_norm": 0.6047766804695129, + "learning_rate": 4.364420452443693e-05, + "loss": 0.0668, + "num_input_tokens_seen": 40878976, + "step": 18935 + }, + { + "epoch": 3.0897226753670473, + "grad_norm": 1.5093845129013062, + "learning_rate": 4.363946172959071e-05, + "loss": 0.1801, + "num_input_tokens_seen": 40890144, + "step": 18940 + }, + { + "epoch": 3.090538336052202, + "grad_norm": 1.04195237159729, + "learning_rate": 4.363471742373516e-05, + "loss": 0.1353, + "num_input_tokens_seen": 40900448, + "step": 18945 + }, + { + "epoch": 3.0913539967373573, + "grad_norm": 0.1638570874929428, + "learning_rate": 4.3629971607254855e-05, + "loss": 0.0441, + "num_input_tokens_seen": 40910880, + "step": 18950 + }, + { + "epoch": 3.092169657422512, + "grad_norm": 0.21140503883361816, + "learning_rate": 4.362522428053453e-05, + "loss": 0.0558, + "num_input_tokens_seen": 40922752, + "step": 18955 + }, + { + "epoch": 3.0929853181076674, + "grad_norm": 0.8337807059288025, + "learning_rate": 4.3620475443959016e-05, + "loss": 0.1121, + "num_input_tokens_seen": 40932640, + "step": 18960 + }, + { + "epoch": 3.0938009787928222, + "grad_norm": 0.2842220366001129, + "learning_rate": 4.361572509791328e-05, + "loss": 0.1216, + "num_input_tokens_seen": 40943264, + "step": 18965 + }, + { + "epoch": 3.094616639477977, + "grad_norm": 0.751213788986206, + "learning_rate": 4.361097324278242e-05, + "loss": 0.1035, + "num_input_tokens_seen": 40953664, + "step": 18970 + }, + { + "epoch": 3.0954323001631323, + "grad_norm": 1.8274370431900024, + "learning_rate": 4.3606219878951623e-05, + "loss": 0.196, + "num_input_tokens_seen": 40964544, + "step": 18975 + }, + { + "epoch": 3.096247960848287, + "grad_norm": 0.05019150301814079, + "learning_rate": 4.360146500680625e-05, + "loss": 0.0339, + "num_input_tokens_seen": 40975520, + "step": 18980 + }, + { + "epoch": 3.097063621533442, + "grad_norm": 0.7254641652107239, + "learning_rate": 4.3596708626731744e-05, + "loss": 0.0902, + "num_input_tokens_seen": 40985728, + "step": 18985 + }, + { + "epoch": 3.097879282218597, + "grad_norm": 0.3125148415565491, + "learning_rate": 4.359195073911367e-05, + "loss": 0.1564, + "num_input_tokens_seen": 40996064, + "step": 18990 + }, + { + "epoch": 3.098694942903752, + "grad_norm": 0.07035113871097565, + "learning_rate": 4.3587191344337735e-05, + "loss": 0.1022, + "num_input_tokens_seen": 41007456, + "step": 18995 + }, + { + "epoch": 3.099510603588907, + "grad_norm": 1.704017162322998, + "learning_rate": 4.358243044278976e-05, + "loss": 0.2497, + "num_input_tokens_seen": 41017792, + "step": 19000 + }, + { + "epoch": 3.100326264274062, + "grad_norm": 0.15614904463291168, + "learning_rate": 4.3577668034855684e-05, + "loss": 0.1156, + "num_input_tokens_seen": 41028512, + "step": 19005 + }, + { + "epoch": 3.101141924959217, + "grad_norm": 0.979960024356842, + "learning_rate": 4.357290412092158e-05, + "loss": 0.0972, + "num_input_tokens_seen": 41039520, + "step": 19010 + }, + { + "epoch": 3.1019575856443717, + "grad_norm": 0.15796735882759094, + "learning_rate": 4.356813870137363e-05, + "loss": 0.1807, + "num_input_tokens_seen": 41050336, + "step": 19015 + }, + { + "epoch": 3.102773246329527, + "grad_norm": 0.18841435015201569, + "learning_rate": 4.356337177659814e-05, + "loss": 0.0177, + "num_input_tokens_seen": 41061088, + "step": 19020 + }, + { + "epoch": 3.103588907014682, + "grad_norm": 0.04877398535609245, + "learning_rate": 4.355860334698154e-05, + "loss": 0.2228, + "num_input_tokens_seen": 41071680, + "step": 19025 + }, + { + "epoch": 3.104404567699837, + "grad_norm": 0.06247614324092865, + "learning_rate": 4.3553833412910395e-05, + "loss": 0.1332, + "num_input_tokens_seen": 41083744, + "step": 19030 + }, + { + "epoch": 3.105220228384992, + "grad_norm": 0.9955107569694519, + "learning_rate": 4.354906197477137e-05, + "loss": 0.1498, + "num_input_tokens_seen": 41094688, + "step": 19035 + }, + { + "epoch": 3.1060358890701467, + "grad_norm": 0.06094314157962799, + "learning_rate": 4.354428903295126e-05, + "loss": 0.2318, + "num_input_tokens_seen": 41106656, + "step": 19040 + }, + { + "epoch": 3.106851549755302, + "grad_norm": 0.25092825293540955, + "learning_rate": 4.353951458783699e-05, + "loss": 0.0868, + "num_input_tokens_seen": 41117280, + "step": 19045 + }, + { + "epoch": 3.107667210440457, + "grad_norm": 0.41358324885368347, + "learning_rate": 4.3534738639815606e-05, + "loss": 0.1268, + "num_input_tokens_seen": 41128064, + "step": 19050 + }, + { + "epoch": 3.1084828711256116, + "grad_norm": 0.5759807229042053, + "learning_rate": 4.352996118927426e-05, + "loss": 0.121, + "num_input_tokens_seen": 41137440, + "step": 19055 + }, + { + "epoch": 3.109298531810767, + "grad_norm": 1.0411663055419922, + "learning_rate": 4.3525182236600235e-05, + "loss": 0.1172, + "num_input_tokens_seen": 41147648, + "step": 19060 + }, + { + "epoch": 3.1101141924959217, + "grad_norm": 0.6615650653839111, + "learning_rate": 4.3520401782180954e-05, + "loss": 0.1591, + "num_input_tokens_seen": 41158688, + "step": 19065 + }, + { + "epoch": 3.1109298531810765, + "grad_norm": 0.3392117917537689, + "learning_rate": 4.351561982640392e-05, + "loss": 0.0192, + "num_input_tokens_seen": 41168160, + "step": 19070 + }, + { + "epoch": 3.1117455138662318, + "grad_norm": 1.338409185409546, + "learning_rate": 4.35108363696568e-05, + "loss": 0.1288, + "num_input_tokens_seen": 41179936, + "step": 19075 + }, + { + "epoch": 3.1125611745513866, + "grad_norm": 0.11319505423307419, + "learning_rate": 4.3506051412327364e-05, + "loss": 0.1749, + "num_input_tokens_seen": 41190528, + "step": 19080 + }, + { + "epoch": 3.1133768352365414, + "grad_norm": 0.6742737293243408, + "learning_rate": 4.3501264954803495e-05, + "loss": 0.0856, + "num_input_tokens_seen": 41200128, + "step": 19085 + }, + { + "epoch": 3.1141924959216967, + "grad_norm": 1.2940088510513306, + "learning_rate": 4.3496476997473216e-05, + "loss": 0.0999, + "num_input_tokens_seen": 41211040, + "step": 19090 + }, + { + "epoch": 3.1150081566068515, + "grad_norm": 0.47692346572875977, + "learning_rate": 4.349168754072467e-05, + "loss": 0.3969, + "num_input_tokens_seen": 41223392, + "step": 19095 + }, + { + "epoch": 3.1158238172920063, + "grad_norm": 0.8293249607086182, + "learning_rate": 4.3486896584946094e-05, + "loss": 0.0961, + "num_input_tokens_seen": 41233344, + "step": 19100 + }, + { + "epoch": 3.1166394779771616, + "grad_norm": 2.1680586338043213, + "learning_rate": 4.348210413052589e-05, + "loss": 0.3403, + "num_input_tokens_seen": 41244512, + "step": 19105 + }, + { + "epoch": 3.1174551386623164, + "grad_norm": 0.3612120449542999, + "learning_rate": 4.3477310177852537e-05, + "loss": 0.1858, + "num_input_tokens_seen": 41255360, + "step": 19110 + }, + { + "epoch": 3.1182707993474716, + "grad_norm": 1.0091067552566528, + "learning_rate": 4.347251472731467e-05, + "loss": 0.0906, + "num_input_tokens_seen": 41265856, + "step": 19115 + }, + { + "epoch": 3.1190864600326265, + "grad_norm": 0.10623046010732651, + "learning_rate": 4.3467717779301046e-05, + "loss": 0.1945, + "num_input_tokens_seen": 41277216, + "step": 19120 + }, + { + "epoch": 3.1199021207177813, + "grad_norm": 3.0946784019470215, + "learning_rate": 4.3462919334200494e-05, + "loss": 0.3645, + "num_input_tokens_seen": 41288256, + "step": 19125 + }, + { + "epoch": 3.1207177814029365, + "grad_norm": 0.21693742275238037, + "learning_rate": 4.345811939240203e-05, + "loss": 0.133, + "num_input_tokens_seen": 41298784, + "step": 19130 + }, + { + "epoch": 3.1215334420880914, + "grad_norm": 0.20810531079769135, + "learning_rate": 4.3453317954294755e-05, + "loss": 0.1813, + "num_input_tokens_seen": 41309792, + "step": 19135 + }, + { + "epoch": 3.122349102773246, + "grad_norm": 0.8111361861228943, + "learning_rate": 4.3448515020267896e-05, + "loss": 0.1328, + "num_input_tokens_seen": 41320320, + "step": 19140 + }, + { + "epoch": 3.1231647634584014, + "grad_norm": 0.9469971060752869, + "learning_rate": 4.3443710590710795e-05, + "loss": 0.073, + "num_input_tokens_seen": 41331008, + "step": 19145 + }, + { + "epoch": 3.1239804241435563, + "grad_norm": 0.27218252420425415, + "learning_rate": 4.343890466601294e-05, + "loss": 0.2258, + "num_input_tokens_seen": 41341888, + "step": 19150 + }, + { + "epoch": 3.124796084828711, + "grad_norm": 0.3061143755912781, + "learning_rate": 4.343409724656391e-05, + "loss": 0.132, + "num_input_tokens_seen": 41351840, + "step": 19155 + }, + { + "epoch": 3.1256117455138663, + "grad_norm": 0.3389440178871155, + "learning_rate": 4.342928833275341e-05, + "loss": 0.045, + "num_input_tokens_seen": 41362592, + "step": 19160 + }, + { + "epoch": 3.126427406199021, + "grad_norm": 0.5431530475616455, + "learning_rate": 4.342447792497131e-05, + "loss": 0.134, + "num_input_tokens_seen": 41372704, + "step": 19165 + }, + { + "epoch": 3.1272430668841764, + "grad_norm": 0.38043951988220215, + "learning_rate": 4.341966602360754e-05, + "loss": 0.16, + "num_input_tokens_seen": 41383584, + "step": 19170 + }, + { + "epoch": 3.1280587275693312, + "grad_norm": 2.071164608001709, + "learning_rate": 4.3414852629052175e-05, + "loss": 0.3902, + "num_input_tokens_seen": 41394976, + "step": 19175 + }, + { + "epoch": 3.128874388254486, + "grad_norm": 1.2924818992614746, + "learning_rate": 4.341003774169542e-05, + "loss": 0.1606, + "num_input_tokens_seen": 41406016, + "step": 19180 + }, + { + "epoch": 3.1296900489396413, + "grad_norm": 0.06956911832094193, + "learning_rate": 4.34052213619276e-05, + "loss": 0.0813, + "num_input_tokens_seen": 41417376, + "step": 19185 + }, + { + "epoch": 3.130505709624796, + "grad_norm": 1.0201656818389893, + "learning_rate": 4.340040349013915e-05, + "loss": 0.2108, + "num_input_tokens_seen": 41428416, + "step": 19190 + }, + { + "epoch": 3.131321370309951, + "grad_norm": 1.1185380220413208, + "learning_rate": 4.3395584126720626e-05, + "loss": 0.2451, + "num_input_tokens_seen": 41439552, + "step": 19195 + }, + { + "epoch": 3.132137030995106, + "grad_norm": 0.19940805435180664, + "learning_rate": 4.339076327206272e-05, + "loss": 0.0386, + "num_input_tokens_seen": 41448928, + "step": 19200 + }, + { + "epoch": 3.132952691680261, + "grad_norm": 0.5618808269500732, + "learning_rate": 4.338594092655622e-05, + "loss": 0.115, + "num_input_tokens_seen": 41460096, + "step": 19205 + }, + { + "epoch": 3.133768352365416, + "grad_norm": 1.5302211046218872, + "learning_rate": 4.338111709059206e-05, + "loss": 0.1315, + "num_input_tokens_seen": 41469696, + "step": 19210 + }, + { + "epoch": 3.134584013050571, + "grad_norm": 0.17556603252887726, + "learning_rate": 4.337629176456129e-05, + "loss": 0.039, + "num_input_tokens_seen": 41480224, + "step": 19215 + }, + { + "epoch": 3.135399673735726, + "grad_norm": 0.28390225768089294, + "learning_rate": 4.337146494885507e-05, + "loss": 0.1134, + "num_input_tokens_seen": 41490176, + "step": 19220 + }, + { + "epoch": 3.1362153344208807, + "grad_norm": 0.8928486108779907, + "learning_rate": 4.336663664386468e-05, + "loss": 0.177, + "num_input_tokens_seen": 41500928, + "step": 19225 + }, + { + "epoch": 3.137030995106036, + "grad_norm": 0.1325121372938156, + "learning_rate": 4.3361806849981535e-05, + "loss": 0.1128, + "num_input_tokens_seen": 41512608, + "step": 19230 + }, + { + "epoch": 3.137846655791191, + "grad_norm": 0.5142335891723633, + "learning_rate": 4.335697556759716e-05, + "loss": 0.0584, + "num_input_tokens_seen": 41522880, + "step": 19235 + }, + { + "epoch": 3.1386623164763456, + "grad_norm": 0.5122372508049011, + "learning_rate": 4.3352142797103204e-05, + "loss": 0.1945, + "num_input_tokens_seen": 41533632, + "step": 19240 + }, + { + "epoch": 3.139477977161501, + "grad_norm": 0.9903202056884766, + "learning_rate": 4.334730853889143e-05, + "loss": 0.0995, + "num_input_tokens_seen": 41546112, + "step": 19245 + }, + { + "epoch": 3.1402936378466557, + "grad_norm": 1.1909246444702148, + "learning_rate": 4.3342472793353736e-05, + "loss": 0.0688, + "num_input_tokens_seen": 41557376, + "step": 19250 + }, + { + "epoch": 3.141109298531811, + "grad_norm": 0.5558847188949585, + "learning_rate": 4.333763556088213e-05, + "loss": 0.2517, + "num_input_tokens_seen": 41568160, + "step": 19255 + }, + { + "epoch": 3.141924959216966, + "grad_norm": 0.9271166920661926, + "learning_rate": 4.333279684186874e-05, + "loss": 0.17, + "num_input_tokens_seen": 41577696, + "step": 19260 + }, + { + "epoch": 3.1427406199021206, + "grad_norm": 0.1923040747642517, + "learning_rate": 4.332795663670581e-05, + "loss": 0.0618, + "num_input_tokens_seen": 41589088, + "step": 19265 + }, + { + "epoch": 3.143556280587276, + "grad_norm": 0.6338872909545898, + "learning_rate": 4.332311494578573e-05, + "loss": 0.1049, + "num_input_tokens_seen": 41599904, + "step": 19270 + }, + { + "epoch": 3.1443719412724307, + "grad_norm": 0.17090697586536407, + "learning_rate": 4.331827176950098e-05, + "loss": 0.0506, + "num_input_tokens_seen": 41610784, + "step": 19275 + }, + { + "epoch": 3.1451876019575855, + "grad_norm": 0.09775925427675247, + "learning_rate": 4.3313427108244175e-05, + "loss": 0.0965, + "num_input_tokens_seen": 41620768, + "step": 19280 + }, + { + "epoch": 3.1460032626427408, + "grad_norm": 0.18484753370285034, + "learning_rate": 4.330858096240804e-05, + "loss": 0.0996, + "num_input_tokens_seen": 41632384, + "step": 19285 + }, + { + "epoch": 3.1468189233278956, + "grad_norm": 1.5860660076141357, + "learning_rate": 4.3303733332385446e-05, + "loss": 0.1157, + "num_input_tokens_seen": 41643680, + "step": 19290 + }, + { + "epoch": 3.1476345840130504, + "grad_norm": 1.4998079538345337, + "learning_rate": 4.329888421856936e-05, + "loss": 0.1056, + "num_input_tokens_seen": 41655424, + "step": 19295 + }, + { + "epoch": 3.1484502446982057, + "grad_norm": 0.6520251631736755, + "learning_rate": 4.3294033621352856e-05, + "loss": 0.0898, + "num_input_tokens_seen": 41666144, + "step": 19300 + }, + { + "epoch": 3.1492659053833605, + "grad_norm": 0.7482738494873047, + "learning_rate": 4.3289181541129174e-05, + "loss": 0.0551, + "num_input_tokens_seen": 41677312, + "step": 19305 + }, + { + "epoch": 3.1500815660685153, + "grad_norm": 1.515718936920166, + "learning_rate": 4.328432797829164e-05, + "loss": 0.0769, + "num_input_tokens_seen": 41689344, + "step": 19310 + }, + { + "epoch": 3.1508972267536706, + "grad_norm": 0.09883121401071548, + "learning_rate": 4.3279472933233696e-05, + "loss": 0.0402, + "num_input_tokens_seen": 41700032, + "step": 19315 + }, + { + "epoch": 3.1517128874388254, + "grad_norm": 0.043051522225141525, + "learning_rate": 4.327461640634893e-05, + "loss": 0.0823, + "num_input_tokens_seen": 41711040, + "step": 19320 + }, + { + "epoch": 3.15252854812398, + "grad_norm": 0.30574384331703186, + "learning_rate": 4.3269758398031037e-05, + "loss": 0.1469, + "num_input_tokens_seen": 41721184, + "step": 19325 + }, + { + "epoch": 3.1533442088091355, + "grad_norm": 1.342074990272522, + "learning_rate": 4.3264898908673826e-05, + "loss": 0.1902, + "num_input_tokens_seen": 41731616, + "step": 19330 + }, + { + "epoch": 3.1541598694942903, + "grad_norm": 0.24574804306030273, + "learning_rate": 4.3260037938671237e-05, + "loss": 0.1433, + "num_input_tokens_seen": 41741568, + "step": 19335 + }, + { + "epoch": 3.1549755301794455, + "grad_norm": 1.8171104192733765, + "learning_rate": 4.325517548841732e-05, + "loss": 0.1588, + "num_input_tokens_seen": 41752512, + "step": 19340 + }, + { + "epoch": 3.1557911908646004, + "grad_norm": 1.4050531387329102, + "learning_rate": 4.3250311558306255e-05, + "loss": 0.142, + "num_input_tokens_seen": 41763040, + "step": 19345 + }, + { + "epoch": 3.156606851549755, + "grad_norm": 0.246576726436615, + "learning_rate": 4.324544614873233e-05, + "loss": 0.2784, + "num_input_tokens_seen": 41773472, + "step": 19350 + }, + { + "epoch": 3.1574225122349104, + "grad_norm": 0.8389762043952942, + "learning_rate": 4.324057926008997e-05, + "loss": 0.0464, + "num_input_tokens_seen": 41784096, + "step": 19355 + }, + { + "epoch": 3.1582381729200653, + "grad_norm": 0.14780451357364655, + "learning_rate": 4.323571089277369e-05, + "loss": 0.0339, + "num_input_tokens_seen": 41795616, + "step": 19360 + }, + { + "epoch": 3.15905383360522, + "grad_norm": 0.31709998846054077, + "learning_rate": 4.3230841047178175e-05, + "loss": 0.018, + "num_input_tokens_seen": 41806784, + "step": 19365 + }, + { + "epoch": 3.1598694942903753, + "grad_norm": 0.9949445128440857, + "learning_rate": 4.3225969723698165e-05, + "loss": 0.1329, + "num_input_tokens_seen": 41817920, + "step": 19370 + }, + { + "epoch": 3.16068515497553, + "grad_norm": 1.248502492904663, + "learning_rate": 4.322109692272858e-05, + "loss": 0.2829, + "num_input_tokens_seen": 41827520, + "step": 19375 + }, + { + "epoch": 3.161500815660685, + "grad_norm": 0.5448068380355835, + "learning_rate": 4.321622264466443e-05, + "loss": 0.1224, + "num_input_tokens_seen": 41838496, + "step": 19380 + }, + { + "epoch": 3.1623164763458402, + "grad_norm": 0.855376124382019, + "learning_rate": 4.321134688990084e-05, + "loss": 0.1373, + "num_input_tokens_seen": 41848800, + "step": 19385 + }, + { + "epoch": 3.163132137030995, + "grad_norm": 0.21709460020065308, + "learning_rate": 4.320646965883307e-05, + "loss": 0.3064, + "num_input_tokens_seen": 41859616, + "step": 19390 + }, + { + "epoch": 3.1639477977161503, + "grad_norm": 0.5100330114364624, + "learning_rate": 4.320159095185648e-05, + "loss": 0.1084, + "num_input_tokens_seen": 41870432, + "step": 19395 + }, + { + "epoch": 3.164763458401305, + "grad_norm": 0.5862342119216919, + "learning_rate": 4.3196710769366585e-05, + "loss": 0.2062, + "num_input_tokens_seen": 41880960, + "step": 19400 + }, + { + "epoch": 3.16557911908646, + "grad_norm": 1.0539770126342773, + "learning_rate": 4.3191829111758985e-05, + "loss": 0.274, + "num_input_tokens_seen": 41891456, + "step": 19405 + }, + { + "epoch": 3.166394779771615, + "grad_norm": 0.6242407560348511, + "learning_rate": 4.318694597942941e-05, + "loss": 0.0482, + "num_input_tokens_seen": 41903040, + "step": 19410 + }, + { + "epoch": 3.16721044045677, + "grad_norm": 0.9187711477279663, + "learning_rate": 4.318206137277372e-05, + "loss": 0.1359, + "num_input_tokens_seen": 41914272, + "step": 19415 + }, + { + "epoch": 3.168026101141925, + "grad_norm": 0.7398778796195984, + "learning_rate": 4.317717529218788e-05, + "loss": 0.1908, + "num_input_tokens_seen": 41925184, + "step": 19420 + }, + { + "epoch": 3.16884176182708, + "grad_norm": 2.367769479751587, + "learning_rate": 4.317228773806797e-05, + "loss": 0.2196, + "num_input_tokens_seen": 41936288, + "step": 19425 + }, + { + "epoch": 3.169657422512235, + "grad_norm": 1.5590695142745972, + "learning_rate": 4.316739871081021e-05, + "loss": 0.1212, + "num_input_tokens_seen": 41945984, + "step": 19430 + }, + { + "epoch": 3.1704730831973897, + "grad_norm": 0.6591187715530396, + "learning_rate": 4.3162508210810936e-05, + "loss": 0.2495, + "num_input_tokens_seen": 41956256, + "step": 19435 + }, + { + "epoch": 3.171288743882545, + "grad_norm": 1.3236651420593262, + "learning_rate": 4.31576162384666e-05, + "loss": 0.1218, + "num_input_tokens_seen": 41967040, + "step": 19440 + }, + { + "epoch": 3.1721044045677, + "grad_norm": 0.03740663826465607, + "learning_rate": 4.315272279417375e-05, + "loss": 0.0642, + "num_input_tokens_seen": 41977664, + "step": 19445 + }, + { + "epoch": 3.1729200652528546, + "grad_norm": 0.9564058184623718, + "learning_rate": 4.314782787832908e-05, + "loss": 0.1053, + "num_input_tokens_seen": 41988384, + "step": 19450 + }, + { + "epoch": 3.17373572593801, + "grad_norm": 1.5615700483322144, + "learning_rate": 4.314293149132941e-05, + "loss": 0.29, + "num_input_tokens_seen": 42000576, + "step": 19455 + }, + { + "epoch": 3.1745513866231647, + "grad_norm": 0.26619860529899597, + "learning_rate": 4.313803363357166e-05, + "loss": 0.1141, + "num_input_tokens_seen": 42010176, + "step": 19460 + }, + { + "epoch": 3.1753670473083195, + "grad_norm": 0.6811047792434692, + "learning_rate": 4.313313430545286e-05, + "loss": 0.1098, + "num_input_tokens_seen": 42021248, + "step": 19465 + }, + { + "epoch": 3.176182707993475, + "grad_norm": 0.1675572693347931, + "learning_rate": 4.3128233507370196e-05, + "loss": 0.1205, + "num_input_tokens_seen": 42031712, + "step": 19470 + }, + { + "epoch": 3.1769983686786296, + "grad_norm": 0.34925577044487, + "learning_rate": 4.312333123972094e-05, + "loss": 0.1564, + "num_input_tokens_seen": 42042848, + "step": 19475 + }, + { + "epoch": 3.177814029363785, + "grad_norm": 1.3687608242034912, + "learning_rate": 4.31184275029025e-05, + "loss": 0.1271, + "num_input_tokens_seen": 42053856, + "step": 19480 + }, + { + "epoch": 3.1786296900489397, + "grad_norm": 1.1697301864624023, + "learning_rate": 4.311352229731239e-05, + "loss": 0.1372, + "num_input_tokens_seen": 42064864, + "step": 19485 + }, + { + "epoch": 3.1794453507340945, + "grad_norm": 0.9826184511184692, + "learning_rate": 4.310861562334826e-05, + "loss": 0.0799, + "num_input_tokens_seen": 42076864, + "step": 19490 + }, + { + "epoch": 3.1802610114192498, + "grad_norm": 0.2797046899795532, + "learning_rate": 4.310370748140786e-05, + "loss": 0.0844, + "num_input_tokens_seen": 42086592, + "step": 19495 + }, + { + "epoch": 3.1810766721044046, + "grad_norm": 0.32359981536865234, + "learning_rate": 4.3098797871889075e-05, + "loss": 0.174, + "num_input_tokens_seen": 42096736, + "step": 19500 + }, + { + "epoch": 3.1818923327895594, + "grad_norm": 0.16912338137626648, + "learning_rate": 4.30938867951899e-05, + "loss": 0.1239, + "num_input_tokens_seen": 42106592, + "step": 19505 + }, + { + "epoch": 3.1827079934747147, + "grad_norm": 0.28766751289367676, + "learning_rate": 4.308897425170846e-05, + "loss": 0.1145, + "num_input_tokens_seen": 42117024, + "step": 19510 + }, + { + "epoch": 3.1835236541598695, + "grad_norm": 0.47387129068374634, + "learning_rate": 4.3084060241842984e-05, + "loss": 0.0927, + "num_input_tokens_seen": 42127776, + "step": 19515 + }, + { + "epoch": 3.1843393148450243, + "grad_norm": 0.3382481634616852, + "learning_rate": 4.307914476599182e-05, + "loss": 0.0719, + "num_input_tokens_seen": 42138976, + "step": 19520 + }, + { + "epoch": 3.1851549755301796, + "grad_norm": 1.3960458040237427, + "learning_rate": 4.307422782455346e-05, + "loss": 0.0645, + "num_input_tokens_seen": 42150752, + "step": 19525 + }, + { + "epoch": 3.1859706362153344, + "grad_norm": 0.4591168165206909, + "learning_rate": 4.306930941792648e-05, + "loss": 0.0706, + "num_input_tokens_seen": 42161696, + "step": 19530 + }, + { + "epoch": 3.186786296900489, + "grad_norm": 1.163291335105896, + "learning_rate": 4.3064389546509585e-05, + "loss": 0.1479, + "num_input_tokens_seen": 42173312, + "step": 19535 + }, + { + "epoch": 3.1876019575856445, + "grad_norm": 0.32388246059417725, + "learning_rate": 4.305946821070163e-05, + "loss": 0.2754, + "num_input_tokens_seen": 42185344, + "step": 19540 + }, + { + "epoch": 3.1884176182707993, + "grad_norm": 0.429171085357666, + "learning_rate": 4.3054545410901547e-05, + "loss": 0.1185, + "num_input_tokens_seen": 42196480, + "step": 19545 + }, + { + "epoch": 3.189233278955954, + "grad_norm": 0.32626718282699585, + "learning_rate": 4.30496211475084e-05, + "loss": 0.0502, + "num_input_tokens_seen": 42206624, + "step": 19550 + }, + { + "epoch": 3.1900489396411094, + "grad_norm": 1.9694180488586426, + "learning_rate": 4.3044695420921386e-05, + "loss": 0.1965, + "num_input_tokens_seen": 42218688, + "step": 19555 + }, + { + "epoch": 3.190864600326264, + "grad_norm": 0.06955645233392715, + "learning_rate": 4.30397682315398e-05, + "loss": 0.0264, + "num_input_tokens_seen": 42228896, + "step": 19560 + }, + { + "epoch": 3.1916802610114194, + "grad_norm": 0.6951987147331238, + "learning_rate": 4.303483957976306e-05, + "loss": 0.2809, + "num_input_tokens_seen": 42239424, + "step": 19565 + }, + { + "epoch": 3.1924959216965743, + "grad_norm": 0.6288555264472961, + "learning_rate": 4.302990946599073e-05, + "loss": 0.1774, + "num_input_tokens_seen": 42250272, + "step": 19570 + }, + { + "epoch": 3.193311582381729, + "grad_norm": 0.5508313775062561, + "learning_rate": 4.302497789062245e-05, + "loss": 0.118, + "num_input_tokens_seen": 42261024, + "step": 19575 + }, + { + "epoch": 3.1941272430668843, + "grad_norm": 0.07586044073104858, + "learning_rate": 4.3020044854058e-05, + "loss": 0.1073, + "num_input_tokens_seen": 42271680, + "step": 19580 + }, + { + "epoch": 3.194942903752039, + "grad_norm": 0.06826034188270569, + "learning_rate": 4.3015110356697285e-05, + "loss": 0.0946, + "num_input_tokens_seen": 42281792, + "step": 19585 + }, + { + "epoch": 3.195758564437194, + "grad_norm": 0.1197437047958374, + "learning_rate": 4.301017439894032e-05, + "loss": 0.0232, + "num_input_tokens_seen": 42292128, + "step": 19590 + }, + { + "epoch": 3.1965742251223492, + "grad_norm": 0.5561928749084473, + "learning_rate": 4.300523698118722e-05, + "loss": 0.1089, + "num_input_tokens_seen": 42303232, + "step": 19595 + }, + { + "epoch": 3.197389885807504, + "grad_norm": 0.09985547512769699, + "learning_rate": 4.3000298103838274e-05, + "loss": 0.0511, + "num_input_tokens_seen": 42313056, + "step": 19600 + }, + { + "epoch": 3.198205546492659, + "grad_norm": 0.3217068910598755, + "learning_rate": 4.299535776729382e-05, + "loss": 0.076, + "num_input_tokens_seen": 42324320, + "step": 19605 + }, + { + "epoch": 3.199021207177814, + "grad_norm": 0.2861423194408417, + "learning_rate": 4.2990415971954364e-05, + "loss": 0.073, + "num_input_tokens_seen": 42332800, + "step": 19610 + }, + { + "epoch": 3.199836867862969, + "grad_norm": 2.0094244480133057, + "learning_rate": 4.29854727182205e-05, + "loss": 0.3601, + "num_input_tokens_seen": 42344448, + "step": 19615 + }, + { + "epoch": 3.200652528548124, + "grad_norm": 0.1763935089111328, + "learning_rate": 4.298052800649296e-05, + "loss": 0.0754, + "num_input_tokens_seen": 42355520, + "step": 19620 + }, + { + "epoch": 3.201468189233279, + "grad_norm": 0.6537269353866577, + "learning_rate": 4.297558183717259e-05, + "loss": 0.093, + "num_input_tokens_seen": 42366656, + "step": 19625 + }, + { + "epoch": 3.202283849918434, + "grad_norm": 0.4111579954624176, + "learning_rate": 4.297063421066035e-05, + "loss": 0.0791, + "num_input_tokens_seen": 42377824, + "step": 19630 + }, + { + "epoch": 3.203099510603589, + "grad_norm": 1.2444483041763306, + "learning_rate": 4.296568512735732e-05, + "loss": 0.0968, + "num_input_tokens_seen": 42388064, + "step": 19635 + }, + { + "epoch": 3.203915171288744, + "grad_norm": 0.1037173941731453, + "learning_rate": 4.29607345876647e-05, + "loss": 0.1217, + "num_input_tokens_seen": 42399648, + "step": 19640 + }, + { + "epoch": 3.2047308319738987, + "grad_norm": 0.03805389255285263, + "learning_rate": 4.2955782591983795e-05, + "loss": 0.0777, + "num_input_tokens_seen": 42409024, + "step": 19645 + }, + { + "epoch": 3.205546492659054, + "grad_norm": 0.2333722561597824, + "learning_rate": 4.295082914071604e-05, + "loss": 0.0298, + "num_input_tokens_seen": 42419552, + "step": 19650 + }, + { + "epoch": 3.206362153344209, + "grad_norm": 1.0125876665115356, + "learning_rate": 4.294587423426301e-05, + "loss": 0.2287, + "num_input_tokens_seen": 42428992, + "step": 19655 + }, + { + "epoch": 3.2071778140293636, + "grad_norm": 0.18161317706108093, + "learning_rate": 4.294091787302634e-05, + "loss": 0.1186, + "num_input_tokens_seen": 42440736, + "step": 19660 + }, + { + "epoch": 3.207993474714519, + "grad_norm": 0.23576763272285461, + "learning_rate": 4.2935960057407855e-05, + "loss": 0.0741, + "num_input_tokens_seen": 42451168, + "step": 19665 + }, + { + "epoch": 3.2088091353996737, + "grad_norm": 1.8303405046463013, + "learning_rate": 4.2931000787809426e-05, + "loss": 0.1107, + "num_input_tokens_seen": 42462528, + "step": 19670 + }, + { + "epoch": 3.2096247960848285, + "grad_norm": 0.16819153726100922, + "learning_rate": 4.29260400646331e-05, + "loss": 0.0803, + "num_input_tokens_seen": 42471648, + "step": 19675 + }, + { + "epoch": 3.210440456769984, + "grad_norm": 0.5990731120109558, + "learning_rate": 4.2921077888281014e-05, + "loss": 0.0927, + "num_input_tokens_seen": 42482944, + "step": 19680 + }, + { + "epoch": 3.2112561174551386, + "grad_norm": 1.1605374813079834, + "learning_rate": 4.2916114259155414e-05, + "loss": 0.186, + "num_input_tokens_seen": 42493152, + "step": 19685 + }, + { + "epoch": 3.2120717781402934, + "grad_norm": 1.5479902029037476, + "learning_rate": 4.291114917765869e-05, + "loss": 0.1087, + "num_input_tokens_seen": 42504544, + "step": 19690 + }, + { + "epoch": 3.2128874388254487, + "grad_norm": 0.7968707084655762, + "learning_rate": 4.290618264419334e-05, + "loss": 0.1754, + "num_input_tokens_seen": 42514496, + "step": 19695 + }, + { + "epoch": 3.2137030995106035, + "grad_norm": 0.3627569377422333, + "learning_rate": 4.290121465916196e-05, + "loss": 0.076, + "num_input_tokens_seen": 42524704, + "step": 19700 + }, + { + "epoch": 3.2145187601957588, + "grad_norm": 2.599414110183716, + "learning_rate": 4.2896245222967296e-05, + "loss": 0.2253, + "num_input_tokens_seen": 42534688, + "step": 19705 + }, + { + "epoch": 3.2153344208809136, + "grad_norm": 0.6555103659629822, + "learning_rate": 4.2891274336012186e-05, + "loss": 0.2313, + "num_input_tokens_seen": 42546016, + "step": 19710 + }, + { + "epoch": 3.2161500815660684, + "grad_norm": 1.0902552604675293, + "learning_rate": 4.288630199869961e-05, + "loss": 0.1404, + "num_input_tokens_seen": 42556672, + "step": 19715 + }, + { + "epoch": 3.2169657422512237, + "grad_norm": 0.283772736787796, + "learning_rate": 4.2881328211432626e-05, + "loss": 0.0823, + "num_input_tokens_seen": 42567168, + "step": 19720 + }, + { + "epoch": 3.2177814029363785, + "grad_norm": 0.09754502773284912, + "learning_rate": 4.2876352974614456e-05, + "loss": 0.023, + "num_input_tokens_seen": 42577760, + "step": 19725 + }, + { + "epoch": 3.2185970636215333, + "grad_norm": 1.3344135284423828, + "learning_rate": 4.28713762886484e-05, + "loss": 0.1565, + "num_input_tokens_seen": 42588672, + "step": 19730 + }, + { + "epoch": 3.2194127243066886, + "grad_norm": 1.4590020179748535, + "learning_rate": 4.286639815393791e-05, + "loss": 0.0951, + "num_input_tokens_seen": 42600480, + "step": 19735 + }, + { + "epoch": 3.2202283849918434, + "grad_norm": 0.02089184708893299, + "learning_rate": 4.286141857088654e-05, + "loss": 0.099, + "num_input_tokens_seen": 42612992, + "step": 19740 + }, + { + "epoch": 3.221044045676998, + "grad_norm": 1.185897707939148, + "learning_rate": 4.285643753989794e-05, + "loss": 0.087, + "num_input_tokens_seen": 42624576, + "step": 19745 + }, + { + "epoch": 3.2218597063621535, + "grad_norm": 0.21461841464042664, + "learning_rate": 4.2851455061375924e-05, + "loss": 0.1442, + "num_input_tokens_seen": 42636128, + "step": 19750 + }, + { + "epoch": 3.2226753670473083, + "grad_norm": 0.11228448897600174, + "learning_rate": 4.2846471135724376e-05, + "loss": 0.1335, + "num_input_tokens_seen": 42646304, + "step": 19755 + }, + { + "epoch": 3.223491027732463, + "grad_norm": 0.8357821702957153, + "learning_rate": 4.2841485763347324e-05, + "loss": 0.24, + "num_input_tokens_seen": 42656576, + "step": 19760 + }, + { + "epoch": 3.2243066884176184, + "grad_norm": 0.4937847852706909, + "learning_rate": 4.2836498944648904e-05, + "loss": 0.0209, + "num_input_tokens_seen": 42666624, + "step": 19765 + }, + { + "epoch": 3.225122349102773, + "grad_norm": 0.4595547616481781, + "learning_rate": 4.2831510680033394e-05, + "loss": 0.1303, + "num_input_tokens_seen": 42676096, + "step": 19770 + }, + { + "epoch": 3.225938009787928, + "grad_norm": 0.7994150519371033, + "learning_rate": 4.2826520969905134e-05, + "loss": 0.1408, + "num_input_tokens_seen": 42686752, + "step": 19775 + }, + { + "epoch": 3.2267536704730833, + "grad_norm": 0.9644463658332825, + "learning_rate": 4.282152981466865e-05, + "loss": 0.2064, + "num_input_tokens_seen": 42697792, + "step": 19780 + }, + { + "epoch": 3.227569331158238, + "grad_norm": 0.7333102226257324, + "learning_rate": 4.2816537214728524e-05, + "loss": 0.1273, + "num_input_tokens_seen": 42708384, + "step": 19785 + }, + { + "epoch": 3.2283849918433933, + "grad_norm": 0.6132882237434387, + "learning_rate": 4.281154317048949e-05, + "loss": 0.1337, + "num_input_tokens_seen": 42719136, + "step": 19790 + }, + { + "epoch": 3.229200652528548, + "grad_norm": 0.1036171242594719, + "learning_rate": 4.28065476823564e-05, + "loss": 0.1667, + "num_input_tokens_seen": 42728896, + "step": 19795 + }, + { + "epoch": 3.230016313213703, + "grad_norm": 1.068676233291626, + "learning_rate": 4.2801550750734195e-05, + "loss": 0.1643, + "num_input_tokens_seen": 42740512, + "step": 19800 + }, + { + "epoch": 3.2308319738988582, + "grad_norm": 0.07642659544944763, + "learning_rate": 4.279655237602796e-05, + "loss": 0.0254, + "num_input_tokens_seen": 42750240, + "step": 19805 + }, + { + "epoch": 3.231647634584013, + "grad_norm": 0.1155475452542305, + "learning_rate": 4.279155255864291e-05, + "loss": 0.0245, + "num_input_tokens_seen": 42760672, + "step": 19810 + }, + { + "epoch": 3.232463295269168, + "grad_norm": 1.8495310544967651, + "learning_rate": 4.2786551298984315e-05, + "loss": 0.2059, + "num_input_tokens_seen": 42771680, + "step": 19815 + }, + { + "epoch": 3.233278955954323, + "grad_norm": 0.825230598449707, + "learning_rate": 4.278154859745763e-05, + "loss": 0.1768, + "num_input_tokens_seen": 42782048, + "step": 19820 + }, + { + "epoch": 3.234094616639478, + "grad_norm": 1.421612024307251, + "learning_rate": 4.27765444544684e-05, + "loss": 0.1164, + "num_input_tokens_seen": 42792864, + "step": 19825 + }, + { + "epoch": 3.2349102773246328, + "grad_norm": 0.04358692094683647, + "learning_rate": 4.277153887042227e-05, + "loss": 0.1927, + "num_input_tokens_seen": 42803904, + "step": 19830 + }, + { + "epoch": 3.235725938009788, + "grad_norm": 1.249319076538086, + "learning_rate": 4.2766531845725036e-05, + "loss": 0.1244, + "num_input_tokens_seen": 42814720, + "step": 19835 + }, + { + "epoch": 3.236541598694943, + "grad_norm": 0.38734400272369385, + "learning_rate": 4.276152338078258e-05, + "loss": 0.0416, + "num_input_tokens_seen": 42825856, + "step": 19840 + }, + { + "epoch": 3.237357259380098, + "grad_norm": 0.6942612528800964, + "learning_rate": 4.275651347600092e-05, + "loss": 0.288, + "num_input_tokens_seen": 42837408, + "step": 19845 + }, + { + "epoch": 3.238172920065253, + "grad_norm": 0.6387715339660645, + "learning_rate": 4.275150213178618e-05, + "loss": 0.0833, + "num_input_tokens_seen": 42848960, + "step": 19850 + }, + { + "epoch": 3.2389885807504077, + "grad_norm": 0.06872618943452835, + "learning_rate": 4.27464893485446e-05, + "loss": 0.2268, + "num_input_tokens_seen": 42859168, + "step": 19855 + }, + { + "epoch": 3.239804241435563, + "grad_norm": 0.1838718056678772, + "learning_rate": 4.274147512668256e-05, + "loss": 0.1335, + "num_input_tokens_seen": 42867872, + "step": 19860 + }, + { + "epoch": 3.240619902120718, + "grad_norm": 1.6283977031707764, + "learning_rate": 4.273645946660652e-05, + "loss": 0.2621, + "num_input_tokens_seen": 42879264, + "step": 19865 + }, + { + "epoch": 3.2414355628058726, + "grad_norm": 0.21383120119571686, + "learning_rate": 4.273144236872308e-05, + "loss": 0.041, + "num_input_tokens_seen": 42890816, + "step": 19870 + }, + { + "epoch": 3.242251223491028, + "grad_norm": 1.9044957160949707, + "learning_rate": 4.2726423833438964e-05, + "loss": 0.1954, + "num_input_tokens_seen": 42902400, + "step": 19875 + }, + { + "epoch": 3.2430668841761827, + "grad_norm": 1.3370946645736694, + "learning_rate": 4.272140386116098e-05, + "loss": 0.3637, + "num_input_tokens_seen": 42913152, + "step": 19880 + }, + { + "epoch": 3.2438825448613375, + "grad_norm": 1.6648623943328857, + "learning_rate": 4.2716382452296086e-05, + "loss": 0.1778, + "num_input_tokens_seen": 42924672, + "step": 19885 + }, + { + "epoch": 3.244698205546493, + "grad_norm": 0.2898411750793457, + "learning_rate": 4.271135960725133e-05, + "loss": 0.2482, + "num_input_tokens_seen": 42934880, + "step": 19890 + }, + { + "epoch": 3.2455138662316476, + "grad_norm": 0.07163754105567932, + "learning_rate": 4.270633532643391e-05, + "loss": 0.0479, + "num_input_tokens_seen": 42946176, + "step": 19895 + }, + { + "epoch": 3.2463295269168024, + "grad_norm": 0.20770087838172913, + "learning_rate": 4.27013096102511e-05, + "loss": 0.189, + "num_input_tokens_seen": 42958016, + "step": 19900 + }, + { + "epoch": 3.2471451876019577, + "grad_norm": 0.3763732314109802, + "learning_rate": 4.269628245911031e-05, + "loss": 0.0706, + "num_input_tokens_seen": 42968320, + "step": 19905 + }, + { + "epoch": 3.2479608482871125, + "grad_norm": 1.5136264562606812, + "learning_rate": 4.269125387341909e-05, + "loss": 0.1785, + "num_input_tokens_seen": 42980096, + "step": 19910 + }, + { + "epoch": 3.2487765089722673, + "grad_norm": 0.7315645813941956, + "learning_rate": 4.268622385358506e-05, + "loss": 0.2143, + "num_input_tokens_seen": 42990304, + "step": 19915 + }, + { + "epoch": 3.2495921696574226, + "grad_norm": 0.09218769520521164, + "learning_rate": 4.268119240001598e-05, + "loss": 0.2586, + "num_input_tokens_seen": 43001280, + "step": 19920 + }, + { + "epoch": 3.2504078303425774, + "grad_norm": 0.34302768111228943, + "learning_rate": 4.267615951311974e-05, + "loss": 0.036, + "num_input_tokens_seen": 43011584, + "step": 19925 + }, + { + "epoch": 3.2512234910277327, + "grad_norm": 0.12174876779317856, + "learning_rate": 4.267112519330432e-05, + "loss": 0.11, + "num_input_tokens_seen": 43022432, + "step": 19930 + }, + { + "epoch": 3.2520391517128875, + "grad_norm": 0.5081053376197815, + "learning_rate": 4.266608944097782e-05, + "loss": 0.1691, + "num_input_tokens_seen": 43033792, + "step": 19935 + }, + { + "epoch": 3.2528548123980423, + "grad_norm": 0.4410194158554077, + "learning_rate": 4.266105225654848e-05, + "loss": 0.2555, + "num_input_tokens_seen": 43045504, + "step": 19940 + }, + { + "epoch": 3.2536704730831976, + "grad_norm": 0.21058671176433563, + "learning_rate": 4.265601364042463e-05, + "loss": 0.1597, + "num_input_tokens_seen": 43056544, + "step": 19945 + }, + { + "epoch": 3.2544861337683524, + "grad_norm": 1.4391343593597412, + "learning_rate": 4.2650973593014734e-05, + "loss": 0.1979, + "num_input_tokens_seen": 43065184, + "step": 19950 + }, + { + "epoch": 3.255301794453507, + "grad_norm": 0.6140344142913818, + "learning_rate": 4.264593211472735e-05, + "loss": 0.0867, + "num_input_tokens_seen": 43074784, + "step": 19955 + }, + { + "epoch": 3.2561174551386625, + "grad_norm": 0.8795890808105469, + "learning_rate": 4.264088920597118e-05, + "loss": 0.0634, + "num_input_tokens_seen": 43084864, + "step": 19960 + }, + { + "epoch": 3.2569331158238173, + "grad_norm": 0.2027873545885086, + "learning_rate": 4.263584486715503e-05, + "loss": 0.1948, + "num_input_tokens_seen": 43096576, + "step": 19965 + }, + { + "epoch": 3.257748776508972, + "grad_norm": 1.4493507146835327, + "learning_rate": 4.2630799098687804e-05, + "loss": 0.1633, + "num_input_tokens_seen": 43106816, + "step": 19970 + }, + { + "epoch": 3.2585644371941274, + "grad_norm": 1.7667884826660156, + "learning_rate": 4.262575190097854e-05, + "loss": 0.0927, + "num_input_tokens_seen": 43118304, + "step": 19975 + }, + { + "epoch": 3.259380097879282, + "grad_norm": 0.040716901421546936, + "learning_rate": 4.262070327443639e-05, + "loss": 0.1682, + "num_input_tokens_seen": 43128384, + "step": 19980 + }, + { + "epoch": 3.2601957585644374, + "grad_norm": 0.9542805552482605, + "learning_rate": 4.261565321947064e-05, + "loss": 0.2024, + "num_input_tokens_seen": 43139136, + "step": 19985 + }, + { + "epoch": 3.2610114192495923, + "grad_norm": 0.6979206800460815, + "learning_rate": 4.261060173649065e-05, + "loss": 0.0877, + "num_input_tokens_seen": 43150048, + "step": 19990 + }, + { + "epoch": 3.261827079934747, + "grad_norm": 1.7044378519058228, + "learning_rate": 4.260554882590594e-05, + "loss": 0.2374, + "num_input_tokens_seen": 43159456, + "step": 19995 + }, + { + "epoch": 3.262642740619902, + "grad_norm": 1.571637749671936, + "learning_rate": 4.2600494488126104e-05, + "loss": 0.2515, + "num_input_tokens_seen": 43170816, + "step": 20000 + }, + { + "epoch": 3.263458401305057, + "grad_norm": 1.2443839311599731, + "learning_rate": 4.259543872356088e-05, + "loss": 0.2272, + "num_input_tokens_seen": 43182336, + "step": 20005 + }, + { + "epoch": 3.264274061990212, + "grad_norm": 0.5157975554466248, + "learning_rate": 4.259038153262012e-05, + "loss": 0.259, + "num_input_tokens_seen": 43192384, + "step": 20010 + }, + { + "epoch": 3.2650897226753672, + "grad_norm": 0.09112334251403809, + "learning_rate": 4.2585322915713774e-05, + "loss": 0.1731, + "num_input_tokens_seen": 43203744, + "step": 20015 + }, + { + "epoch": 3.265905383360522, + "grad_norm": 0.794873058795929, + "learning_rate": 4.258026287325192e-05, + "loss": 0.1009, + "num_input_tokens_seen": 43213376, + "step": 20020 + }, + { + "epoch": 3.266721044045677, + "grad_norm": 1.6490790843963623, + "learning_rate": 4.2575201405644764e-05, + "loss": 0.2431, + "num_input_tokens_seen": 43223744, + "step": 20025 + }, + { + "epoch": 3.267536704730832, + "grad_norm": 0.832360565662384, + "learning_rate": 4.257013851330261e-05, + "loss": 0.0857, + "num_input_tokens_seen": 43234752, + "step": 20030 + }, + { + "epoch": 3.268352365415987, + "grad_norm": 1.08843195438385, + "learning_rate": 4.256507419663587e-05, + "loss": 0.1683, + "num_input_tokens_seen": 43245280, + "step": 20035 + }, + { + "epoch": 3.2691680261011418, + "grad_norm": 1.2418919801712036, + "learning_rate": 4.25600084560551e-05, + "loss": 0.1029, + "num_input_tokens_seen": 43256672, + "step": 20040 + }, + { + "epoch": 3.269983686786297, + "grad_norm": 0.6986849904060364, + "learning_rate": 4.255494129197094e-05, + "loss": 0.3023, + "num_input_tokens_seen": 43268288, + "step": 20045 + }, + { + "epoch": 3.270799347471452, + "grad_norm": 0.30457615852355957, + "learning_rate": 4.254987270479417e-05, + "loss": 0.0307, + "num_input_tokens_seen": 43278080, + "step": 20050 + }, + { + "epoch": 3.2716150081566067, + "grad_norm": 0.1305709183216095, + "learning_rate": 4.254480269493567e-05, + "loss": 0.0497, + "num_input_tokens_seen": 43287392, + "step": 20055 + }, + { + "epoch": 3.272430668841762, + "grad_norm": 0.7469885945320129, + "learning_rate": 4.253973126280644e-05, + "loss": 0.0916, + "num_input_tokens_seen": 43297824, + "step": 20060 + }, + { + "epoch": 3.2732463295269167, + "grad_norm": 0.7403779625892639, + "learning_rate": 4.2534658408817595e-05, + "loss": 0.0736, + "num_input_tokens_seen": 43308864, + "step": 20065 + }, + { + "epoch": 3.274061990212072, + "grad_norm": 1.2645503282546997, + "learning_rate": 4.252958413338038e-05, + "loss": 0.1733, + "num_input_tokens_seen": 43318912, + "step": 20070 + }, + { + "epoch": 3.274877650897227, + "grad_norm": 0.06170913204550743, + "learning_rate": 4.2524508436906124e-05, + "loss": 0.1182, + "num_input_tokens_seen": 43328896, + "step": 20075 + }, + { + "epoch": 3.2756933115823816, + "grad_norm": 0.98072749376297, + "learning_rate": 4.251943131980629e-05, + "loss": 0.1007, + "num_input_tokens_seen": 43340512, + "step": 20080 + }, + { + "epoch": 3.2765089722675365, + "grad_norm": 0.5822535157203674, + "learning_rate": 4.2514352782492475e-05, + "loss": 0.1409, + "num_input_tokens_seen": 43351840, + "step": 20085 + }, + { + "epoch": 3.2773246329526917, + "grad_norm": 1.7935006618499756, + "learning_rate": 4.250927282537635e-05, + "loss": 0.3036, + "num_input_tokens_seen": 43363328, + "step": 20090 + }, + { + "epoch": 3.2781402936378465, + "grad_norm": 0.49228498339653015, + "learning_rate": 4.2504191448869716e-05, + "loss": 0.1866, + "num_input_tokens_seen": 43374688, + "step": 20095 + }, + { + "epoch": 3.278955954323002, + "grad_norm": 0.29795724153518677, + "learning_rate": 4.249910865338452e-05, + "loss": 0.1233, + "num_input_tokens_seen": 43384832, + "step": 20100 + }, + { + "epoch": 3.2797716150081566, + "grad_norm": 0.6365963220596313, + "learning_rate": 4.249402443933279e-05, + "loss": 0.2361, + "num_input_tokens_seen": 43396352, + "step": 20105 + }, + { + "epoch": 3.2805872756933114, + "grad_norm": 2.330982208251953, + "learning_rate": 4.248893880712667e-05, + "loss": 0.1175, + "num_input_tokens_seen": 43407104, + "step": 20110 + }, + { + "epoch": 3.2814029363784667, + "grad_norm": 0.40792331099510193, + "learning_rate": 4.248385175717843e-05, + "loss": 0.0663, + "num_input_tokens_seen": 43418816, + "step": 20115 + }, + { + "epoch": 3.2822185970636215, + "grad_norm": 0.08269398659467697, + "learning_rate": 4.247876328990046e-05, + "loss": 0.1358, + "num_input_tokens_seen": 43429088, + "step": 20120 + }, + { + "epoch": 3.2830342577487763, + "grad_norm": 0.644342839717865, + "learning_rate": 4.247367340570525e-05, + "loss": 0.0861, + "num_input_tokens_seen": 43439744, + "step": 20125 + }, + { + "epoch": 3.2838499184339316, + "grad_norm": 0.6780996322631836, + "learning_rate": 4.2468582105005413e-05, + "loss": 0.0516, + "num_input_tokens_seen": 43451264, + "step": 20130 + }, + { + "epoch": 3.2846655791190864, + "grad_norm": 0.13126252591609955, + "learning_rate": 4.246348938821367e-05, + "loss": 0.1791, + "num_input_tokens_seen": 43461696, + "step": 20135 + }, + { + "epoch": 3.2854812398042412, + "grad_norm": 0.9957919120788574, + "learning_rate": 4.2458395255742875e-05, + "loss": 0.0566, + "num_input_tokens_seen": 43471904, + "step": 20140 + }, + { + "epoch": 3.2862969004893965, + "grad_norm": 0.2616921365261078, + "learning_rate": 4.245329970800597e-05, + "loss": 0.1821, + "num_input_tokens_seen": 43482400, + "step": 20145 + }, + { + "epoch": 3.2871125611745513, + "grad_norm": 0.5326260328292847, + "learning_rate": 4.244820274541604e-05, + "loss": 0.2034, + "num_input_tokens_seen": 43492160, + "step": 20150 + }, + { + "epoch": 3.2879282218597066, + "grad_norm": 0.5996872186660767, + "learning_rate": 4.244310436838627e-05, + "loss": 0.1096, + "num_input_tokens_seen": 43502688, + "step": 20155 + }, + { + "epoch": 3.2887438825448614, + "grad_norm": 0.7041240930557251, + "learning_rate": 4.2438004577329946e-05, + "loss": 0.2114, + "num_input_tokens_seen": 43513184, + "step": 20160 + }, + { + "epoch": 3.289559543230016, + "grad_norm": 1.571988821029663, + "learning_rate": 4.243290337266049e-05, + "loss": 0.2231, + "num_input_tokens_seen": 43522592, + "step": 20165 + }, + { + "epoch": 3.2903752039151715, + "grad_norm": 1.0301952362060547, + "learning_rate": 4.242780075479143e-05, + "loss": 0.0943, + "num_input_tokens_seen": 43533312, + "step": 20170 + }, + { + "epoch": 3.2911908646003263, + "grad_norm": 1.0821579694747925, + "learning_rate": 4.242269672413643e-05, + "loss": 0.1712, + "num_input_tokens_seen": 43543808, + "step": 20175 + }, + { + "epoch": 3.292006525285481, + "grad_norm": 1.8149852752685547, + "learning_rate": 4.241759128110922e-05, + "loss": 0.2643, + "num_input_tokens_seen": 43554528, + "step": 20180 + }, + { + "epoch": 3.2928221859706364, + "grad_norm": 0.21128012239933014, + "learning_rate": 4.241248442612368e-05, + "loss": 0.0635, + "num_input_tokens_seen": 43564960, + "step": 20185 + }, + { + "epoch": 3.293637846655791, + "grad_norm": 0.07297689467668533, + "learning_rate": 4.240737615959381e-05, + "loss": 0.0923, + "num_input_tokens_seen": 43576576, + "step": 20190 + }, + { + "epoch": 3.294453507340946, + "grad_norm": 1.0048854351043701, + "learning_rate": 4.2402266481933706e-05, + "loss": 0.1676, + "num_input_tokens_seen": 43588576, + "step": 20195 + }, + { + "epoch": 3.2952691680261013, + "grad_norm": 0.5553324818611145, + "learning_rate": 4.2397155393557574e-05, + "loss": 0.2803, + "num_input_tokens_seen": 43598912, + "step": 20200 + }, + { + "epoch": 3.296084828711256, + "grad_norm": 0.9605953097343445, + "learning_rate": 4.239204289487976e-05, + "loss": 0.1537, + "num_input_tokens_seen": 43609568, + "step": 20205 + }, + { + "epoch": 3.2969004893964113, + "grad_norm": 1.0060200691223145, + "learning_rate": 4.23869289863147e-05, + "loss": 0.132, + "num_input_tokens_seen": 43619904, + "step": 20210 + }, + { + "epoch": 3.297716150081566, + "grad_norm": 0.16473597288131714, + "learning_rate": 4.238181366827696e-05, + "loss": 0.1209, + "num_input_tokens_seen": 43630976, + "step": 20215 + }, + { + "epoch": 3.298531810766721, + "grad_norm": 0.5122063755989075, + "learning_rate": 4.237669694118121e-05, + "loss": 0.0556, + "num_input_tokens_seen": 43640384, + "step": 20220 + }, + { + "epoch": 3.299347471451876, + "grad_norm": 0.6019430160522461, + "learning_rate": 4.237157880544223e-05, + "loss": 0.1691, + "num_input_tokens_seen": 43650912, + "step": 20225 + }, + { + "epoch": 3.300163132137031, + "grad_norm": 0.38875749707221985, + "learning_rate": 4.2366459261474933e-05, + "loss": 0.1534, + "num_input_tokens_seen": 43662368, + "step": 20230 + }, + { + "epoch": 3.300978792822186, + "grad_norm": 1.6273040771484375, + "learning_rate": 4.2361338309694335e-05, + "loss": 0.153, + "num_input_tokens_seen": 43674048, + "step": 20235 + }, + { + "epoch": 3.301794453507341, + "grad_norm": 0.3639886677265167, + "learning_rate": 4.235621595051556e-05, + "loss": 0.1542, + "num_input_tokens_seen": 43685120, + "step": 20240 + }, + { + "epoch": 3.302610114192496, + "grad_norm": 0.10949182510375977, + "learning_rate": 4.2351092184353855e-05, + "loss": 0.1205, + "num_input_tokens_seen": 43696288, + "step": 20245 + }, + { + "epoch": 3.3034257748776508, + "grad_norm": 0.16684170067310333, + "learning_rate": 4.234596701162458e-05, + "loss": 0.1669, + "num_input_tokens_seen": 43708000, + "step": 20250 + }, + { + "epoch": 3.304241435562806, + "grad_norm": 0.15032793581485748, + "learning_rate": 4.2340840432743206e-05, + "loss": 0.0711, + "num_input_tokens_seen": 43720416, + "step": 20255 + }, + { + "epoch": 3.305057096247961, + "grad_norm": 0.09865568578243256, + "learning_rate": 4.2335712448125316e-05, + "loss": 0.1905, + "num_input_tokens_seen": 43730080, + "step": 20260 + }, + { + "epoch": 3.3058727569331157, + "grad_norm": 1.0571610927581787, + "learning_rate": 4.233058305818662e-05, + "loss": 0.0712, + "num_input_tokens_seen": 43740896, + "step": 20265 + }, + { + "epoch": 3.306688417618271, + "grad_norm": 1.819212794303894, + "learning_rate": 4.232545226334293e-05, + "loss": 0.1437, + "num_input_tokens_seen": 43752128, + "step": 20270 + }, + { + "epoch": 3.3075040783034257, + "grad_norm": 0.12400731444358826, + "learning_rate": 4.232032006401017e-05, + "loss": 0.0623, + "num_input_tokens_seen": 43763008, + "step": 20275 + }, + { + "epoch": 3.3083197389885806, + "grad_norm": 0.5329647064208984, + "learning_rate": 4.231518646060438e-05, + "loss": 0.1858, + "num_input_tokens_seen": 43774144, + "step": 20280 + }, + { + "epoch": 3.309135399673736, + "grad_norm": 0.5324128866195679, + "learning_rate": 4.231005145354172e-05, + "loss": 0.0834, + "num_input_tokens_seen": 43785280, + "step": 20285 + }, + { + "epoch": 3.3099510603588906, + "grad_norm": 0.10381724685430527, + "learning_rate": 4.230491504323846e-05, + "loss": 0.0589, + "num_input_tokens_seen": 43796288, + "step": 20290 + }, + { + "epoch": 3.310766721044046, + "grad_norm": 0.5866633653640747, + "learning_rate": 4.229977723011097e-05, + "loss": 0.0872, + "num_input_tokens_seen": 43807392, + "step": 20295 + }, + { + "epoch": 3.3115823817292007, + "grad_norm": 1.193349838256836, + "learning_rate": 4.2294638014575774e-05, + "loss": 0.2391, + "num_input_tokens_seen": 43819328, + "step": 20300 + }, + { + "epoch": 3.3123980424143555, + "grad_norm": 0.37874382734298706, + "learning_rate": 4.228949739704946e-05, + "loss": 0.0625, + "num_input_tokens_seen": 43829632, + "step": 20305 + }, + { + "epoch": 3.3132137030995104, + "grad_norm": 0.3584655523300171, + "learning_rate": 4.228435537794877e-05, + "loss": 0.1354, + "num_input_tokens_seen": 43839520, + "step": 20310 + }, + { + "epoch": 3.3140293637846656, + "grad_norm": 1.3893345594406128, + "learning_rate": 4.227921195769053e-05, + "loss": 0.0942, + "num_input_tokens_seen": 43850048, + "step": 20315 + }, + { + "epoch": 3.3148450244698204, + "grad_norm": 0.5104585289955139, + "learning_rate": 4.227406713669169e-05, + "loss": 0.0456, + "num_input_tokens_seen": 43860480, + "step": 20320 + }, + { + "epoch": 3.3156606851549757, + "grad_norm": 0.19100333750247955, + "learning_rate": 4.226892091536933e-05, + "loss": 0.0626, + "num_input_tokens_seen": 43871456, + "step": 20325 + }, + { + "epoch": 3.3164763458401305, + "grad_norm": 1.0890214443206787, + "learning_rate": 4.226377329414061e-05, + "loss": 0.2032, + "num_input_tokens_seen": 43882208, + "step": 20330 + }, + { + "epoch": 3.3172920065252853, + "grad_norm": 0.024444876238703728, + "learning_rate": 4.225862427342283e-05, + "loss": 0.0315, + "num_input_tokens_seen": 43893568, + "step": 20335 + }, + { + "epoch": 3.3181076672104406, + "grad_norm": 0.611129879951477, + "learning_rate": 4.2253473853633405e-05, + "loss": 0.1803, + "num_input_tokens_seen": 43904736, + "step": 20340 + }, + { + "epoch": 3.3189233278955954, + "grad_norm": 0.5708674192428589, + "learning_rate": 4.2248322035189835e-05, + "loss": 0.0792, + "num_input_tokens_seen": 43915232, + "step": 20345 + }, + { + "epoch": 3.3197389885807502, + "grad_norm": 0.12921273708343506, + "learning_rate": 4.224316881850977e-05, + "loss": 0.2528, + "num_input_tokens_seen": 43925664, + "step": 20350 + }, + { + "epoch": 3.3205546492659055, + "grad_norm": 0.6584367156028748, + "learning_rate": 4.223801420401095e-05, + "loss": 0.1019, + "num_input_tokens_seen": 43936736, + "step": 20355 + }, + { + "epoch": 3.3213703099510603, + "grad_norm": 1.5508123636245728, + "learning_rate": 4.223285819211124e-05, + "loss": 0.2618, + "num_input_tokens_seen": 43948320, + "step": 20360 + }, + { + "epoch": 3.322185970636215, + "grad_norm": 0.24931399524211884, + "learning_rate": 4.2227700783228594e-05, + "loss": 0.0193, + "num_input_tokens_seen": 43958304, + "step": 20365 + }, + { + "epoch": 3.3230016313213704, + "grad_norm": 0.0947088897228241, + "learning_rate": 4.222254197778112e-05, + "loss": 0.1424, + "num_input_tokens_seen": 43969120, + "step": 20370 + }, + { + "epoch": 3.323817292006525, + "grad_norm": 0.19435550272464752, + "learning_rate": 4.2217381776187005e-05, + "loss": 0.1046, + "num_input_tokens_seen": 43980768, + "step": 20375 + }, + { + "epoch": 3.3246329526916805, + "grad_norm": 0.5655499696731567, + "learning_rate": 4.2212220178864556e-05, + "loss": 0.14, + "num_input_tokens_seen": 43991360, + "step": 20380 + }, + { + "epoch": 3.3254486133768353, + "grad_norm": 0.5781283378601074, + "learning_rate": 4.2207057186232215e-05, + "loss": 0.0741, + "num_input_tokens_seen": 44002464, + "step": 20385 + }, + { + "epoch": 3.32626427406199, + "grad_norm": 0.47785571217536926, + "learning_rate": 4.220189279870851e-05, + "loss": 0.0526, + "num_input_tokens_seen": 44011264, + "step": 20390 + }, + { + "epoch": 3.3270799347471454, + "grad_norm": 0.1998063623905182, + "learning_rate": 4.219672701671209e-05, + "loss": 0.075, + "num_input_tokens_seen": 44021632, + "step": 20395 + }, + { + "epoch": 3.3278955954323, + "grad_norm": 0.05213431268930435, + "learning_rate": 4.219155984066171e-05, + "loss": 0.135, + "num_input_tokens_seen": 44031872, + "step": 20400 + }, + { + "epoch": 3.328711256117455, + "grad_norm": 0.4799228012561798, + "learning_rate": 4.218639127097628e-05, + "loss": 0.2122, + "num_input_tokens_seen": 44044064, + "step": 20405 + }, + { + "epoch": 3.3295269168026103, + "grad_norm": 2.4497756958007812, + "learning_rate": 4.218122130807476e-05, + "loss": 0.3232, + "num_input_tokens_seen": 44055488, + "step": 20410 + }, + { + "epoch": 3.330342577487765, + "grad_norm": 0.3697224259376526, + "learning_rate": 4.2176049952376265e-05, + "loss": 0.0418, + "num_input_tokens_seen": 44066496, + "step": 20415 + }, + { + "epoch": 3.33115823817292, + "grad_norm": 0.09690236300230026, + "learning_rate": 4.217087720430002e-05, + "loss": 0.1884, + "num_input_tokens_seen": 44077664, + "step": 20420 + }, + { + "epoch": 3.331973898858075, + "grad_norm": 0.3146320879459381, + "learning_rate": 4.2165703064265335e-05, + "loss": 0.1508, + "num_input_tokens_seen": 44089344, + "step": 20425 + }, + { + "epoch": 3.33278955954323, + "grad_norm": 0.3257324993610382, + "learning_rate": 4.216052753269166e-05, + "loss": 0.0182, + "num_input_tokens_seen": 44100288, + "step": 20430 + }, + { + "epoch": 3.3336052202283852, + "grad_norm": 0.8148209452629089, + "learning_rate": 4.215535060999856e-05, + "loss": 0.0484, + "num_input_tokens_seen": 44111040, + "step": 20435 + }, + { + "epoch": 3.33442088091354, + "grad_norm": 0.23131063580513, + "learning_rate": 4.215017229660569e-05, + "loss": 0.1102, + "num_input_tokens_seen": 44122144, + "step": 20440 + }, + { + "epoch": 3.335236541598695, + "grad_norm": 0.37140095233917236, + "learning_rate": 4.214499259293283e-05, + "loss": 0.1253, + "num_input_tokens_seen": 44133344, + "step": 20445 + }, + { + "epoch": 3.3360522022838497, + "grad_norm": 1.0770279169082642, + "learning_rate": 4.213981149939988e-05, + "loss": 0.0838, + "num_input_tokens_seen": 44143520, + "step": 20450 + }, + { + "epoch": 3.336867862969005, + "grad_norm": 0.07707896828651428, + "learning_rate": 4.213462901642685e-05, + "loss": 0.0426, + "num_input_tokens_seen": 44154496, + "step": 20455 + }, + { + "epoch": 3.3376835236541598, + "grad_norm": 0.15634286403656006, + "learning_rate": 4.212944514443384e-05, + "loss": 0.0291, + "num_input_tokens_seen": 44165504, + "step": 20460 + }, + { + "epoch": 3.338499184339315, + "grad_norm": 0.5779223442077637, + "learning_rate": 4.21242598838411e-05, + "loss": 0.2184, + "num_input_tokens_seen": 44177472, + "step": 20465 + }, + { + "epoch": 3.33931484502447, + "grad_norm": 0.7172437906265259, + "learning_rate": 4.211907323506897e-05, + "loss": 0.1691, + "num_input_tokens_seen": 44187712, + "step": 20470 + }, + { + "epoch": 3.3401305057096247, + "grad_norm": 0.8406465649604797, + "learning_rate": 4.21138851985379e-05, + "loss": 0.3039, + "num_input_tokens_seen": 44198816, + "step": 20475 + }, + { + "epoch": 3.34094616639478, + "grad_norm": 1.40019690990448, + "learning_rate": 4.210869577466846e-05, + "loss": 0.255, + "num_input_tokens_seen": 44209600, + "step": 20480 + }, + { + "epoch": 3.3417618270799347, + "grad_norm": 1.1587885618209839, + "learning_rate": 4.210350496388133e-05, + "loss": 0.1049, + "num_input_tokens_seen": 44220768, + "step": 20485 + }, + { + "epoch": 3.3425774877650896, + "grad_norm": 0.13724230229854584, + "learning_rate": 4.2098312766597305e-05, + "loss": 0.1961, + "num_input_tokens_seen": 44231936, + "step": 20490 + }, + { + "epoch": 3.343393148450245, + "grad_norm": 0.18048794567584991, + "learning_rate": 4.209311918323729e-05, + "loss": 0.1459, + "num_input_tokens_seen": 44242912, + "step": 20495 + }, + { + "epoch": 3.3442088091353996, + "grad_norm": 0.3322440981864929, + "learning_rate": 4.208792421422231e-05, + "loss": 0.098, + "num_input_tokens_seen": 44254144, + "step": 20500 + }, + { + "epoch": 3.3450244698205545, + "grad_norm": 0.39575862884521484, + "learning_rate": 4.208272785997348e-05, + "loss": 0.1036, + "num_input_tokens_seen": 44264832, + "step": 20505 + }, + { + "epoch": 3.3458401305057097, + "grad_norm": 1.4289801120758057, + "learning_rate": 4.207753012091207e-05, + "loss": 0.1488, + "num_input_tokens_seen": 44276800, + "step": 20510 + }, + { + "epoch": 3.3466557911908645, + "grad_norm": 0.9356715679168701, + "learning_rate": 4.20723309974594e-05, + "loss": 0.1238, + "num_input_tokens_seen": 44287584, + "step": 20515 + }, + { + "epoch": 3.34747145187602, + "grad_norm": 0.9041482210159302, + "learning_rate": 4.2067130490036964e-05, + "loss": 0.08, + "num_input_tokens_seen": 44298688, + "step": 20520 + }, + { + "epoch": 3.3482871125611746, + "grad_norm": 0.28811243176460266, + "learning_rate": 4.206192859906633e-05, + "loss": 0.0523, + "num_input_tokens_seen": 44309184, + "step": 20525 + }, + { + "epoch": 3.3491027732463294, + "grad_norm": 0.37969598174095154, + "learning_rate": 4.205672532496919e-05, + "loss": 0.2095, + "num_input_tokens_seen": 44319872, + "step": 20530 + }, + { + "epoch": 3.3499184339314847, + "grad_norm": 1.5401928424835205, + "learning_rate": 4.205152066816736e-05, + "loss": 0.168, + "num_input_tokens_seen": 44330432, + "step": 20535 + }, + { + "epoch": 3.3507340946166395, + "grad_norm": 0.34568822383880615, + "learning_rate": 4.204631462908274e-05, + "loss": 0.2247, + "num_input_tokens_seen": 44341920, + "step": 20540 + }, + { + "epoch": 3.3515497553017943, + "grad_norm": 0.7561233639717102, + "learning_rate": 4.2041107208137366e-05, + "loss": 0.0731, + "num_input_tokens_seen": 44352096, + "step": 20545 + }, + { + "epoch": 3.3523654159869496, + "grad_norm": 0.9558727741241455, + "learning_rate": 4.203589840575337e-05, + "loss": 0.1079, + "num_input_tokens_seen": 44363456, + "step": 20550 + }, + { + "epoch": 3.3531810766721044, + "grad_norm": 0.23312394320964813, + "learning_rate": 4.203068822235302e-05, + "loss": 0.0634, + "num_input_tokens_seen": 44375168, + "step": 20555 + }, + { + "epoch": 3.3539967373572592, + "grad_norm": 0.10804706066846848, + "learning_rate": 4.2025476658358656e-05, + "loss": 0.0863, + "num_input_tokens_seen": 44385824, + "step": 20560 + }, + { + "epoch": 3.3548123980424145, + "grad_norm": 0.4815885126590729, + "learning_rate": 4.202026371419278e-05, + "loss": 0.245, + "num_input_tokens_seen": 44396736, + "step": 20565 + }, + { + "epoch": 3.3556280587275693, + "grad_norm": 0.11740241199731827, + "learning_rate": 4.201504939027796e-05, + "loss": 0.1194, + "num_input_tokens_seen": 44407648, + "step": 20570 + }, + { + "epoch": 3.356443719412724, + "grad_norm": 0.36060845851898193, + "learning_rate": 4.20098336870369e-05, + "loss": 0.1162, + "num_input_tokens_seen": 44418336, + "step": 20575 + }, + { + "epoch": 3.3572593800978794, + "grad_norm": 1.8596771955490112, + "learning_rate": 4.200461660489242e-05, + "loss": 0.1813, + "num_input_tokens_seen": 44428960, + "step": 20580 + }, + { + "epoch": 3.358075040783034, + "grad_norm": 0.42297646403312683, + "learning_rate": 4.199939814426744e-05, + "loss": 0.073, + "num_input_tokens_seen": 44440608, + "step": 20585 + }, + { + "epoch": 3.358890701468189, + "grad_norm": 0.12916216254234314, + "learning_rate": 4.1994178305584996e-05, + "loss": 0.1428, + "num_input_tokens_seen": 44451040, + "step": 20590 + }, + { + "epoch": 3.3597063621533443, + "grad_norm": 0.8602046370506287, + "learning_rate": 4.198895708926822e-05, + "loss": 0.1019, + "num_input_tokens_seen": 44461792, + "step": 20595 + }, + { + "epoch": 3.360522022838499, + "grad_norm": 0.06754378229379654, + "learning_rate": 4.198373449574039e-05, + "loss": 0.0429, + "num_input_tokens_seen": 44472000, + "step": 20600 + }, + { + "epoch": 3.3613376835236544, + "grad_norm": 0.6893155574798584, + "learning_rate": 4.197851052542486e-05, + "loss": 0.0834, + "num_input_tokens_seen": 44482752, + "step": 20605 + }, + { + "epoch": 3.362153344208809, + "grad_norm": 0.25513550639152527, + "learning_rate": 4.197328517874513e-05, + "loss": 0.1554, + "num_input_tokens_seen": 44494624, + "step": 20610 + }, + { + "epoch": 3.362969004893964, + "grad_norm": 1.2258827686309814, + "learning_rate": 4.1968058456124756e-05, + "loss": 0.2055, + "num_input_tokens_seen": 44504256, + "step": 20615 + }, + { + "epoch": 3.3637846655791193, + "grad_norm": 0.05405048653483391, + "learning_rate": 4.196283035798749e-05, + "loss": 0.0313, + "num_input_tokens_seen": 44515680, + "step": 20620 + }, + { + "epoch": 3.364600326264274, + "grad_norm": 0.803203821182251, + "learning_rate": 4.1957600884757124e-05, + "loss": 0.0884, + "num_input_tokens_seen": 44527360, + "step": 20625 + }, + { + "epoch": 3.365415986949429, + "grad_norm": 0.08740226179361343, + "learning_rate": 4.195237003685759e-05, + "loss": 0.1593, + "num_input_tokens_seen": 44537600, + "step": 20630 + }, + { + "epoch": 3.366231647634584, + "grad_norm": 0.7047392129898071, + "learning_rate": 4.194713781471292e-05, + "loss": 0.0399, + "num_input_tokens_seen": 44548704, + "step": 20635 + }, + { + "epoch": 3.367047308319739, + "grad_norm": 0.16498588025569916, + "learning_rate": 4.194190421874727e-05, + "loss": 0.1819, + "num_input_tokens_seen": 44559584, + "step": 20640 + }, + { + "epoch": 3.367862969004894, + "grad_norm": 0.5149936676025391, + "learning_rate": 4.193666924938491e-05, + "loss": 0.0209, + "num_input_tokens_seen": 44568576, + "step": 20645 + }, + { + "epoch": 3.368678629690049, + "grad_norm": 0.8327103853225708, + "learning_rate": 4.1931432907050196e-05, + "loss": 0.2361, + "num_input_tokens_seen": 44579968, + "step": 20650 + }, + { + "epoch": 3.369494290375204, + "grad_norm": 0.12923701107501984, + "learning_rate": 4.192619519216763e-05, + "loss": 0.2273, + "num_input_tokens_seen": 44591552, + "step": 20655 + }, + { + "epoch": 3.370309951060359, + "grad_norm": 0.5959347486495972, + "learning_rate": 4.192095610516179e-05, + "loss": 0.1221, + "num_input_tokens_seen": 44602016, + "step": 20660 + }, + { + "epoch": 3.371125611745514, + "grad_norm": 0.06743135303258896, + "learning_rate": 4.1915715646457385e-05, + "loss": 0.0739, + "num_input_tokens_seen": 44611936, + "step": 20665 + }, + { + "epoch": 3.3719412724306688, + "grad_norm": 0.16741137206554413, + "learning_rate": 4.191047381647925e-05, + "loss": 0.1291, + "num_input_tokens_seen": 44623232, + "step": 20670 + }, + { + "epoch": 3.3727569331158236, + "grad_norm": 0.16705454885959625, + "learning_rate": 4.190523061565231e-05, + "loss": 0.0506, + "num_input_tokens_seen": 44634592, + "step": 20675 + }, + { + "epoch": 3.373572593800979, + "grad_norm": 0.023698626086115837, + "learning_rate": 4.189998604440159e-05, + "loss": 0.0472, + "num_input_tokens_seen": 44645568, + "step": 20680 + }, + { + "epoch": 3.3743882544861337, + "grad_norm": 2.219172477722168, + "learning_rate": 4.189474010315226e-05, + "loss": 0.2142, + "num_input_tokens_seen": 44656992, + "step": 20685 + }, + { + "epoch": 3.375203915171289, + "grad_norm": 0.22574280202388763, + "learning_rate": 4.188949279232958e-05, + "loss": 0.066, + "num_input_tokens_seen": 44669248, + "step": 20690 + }, + { + "epoch": 3.3760195758564437, + "grad_norm": 0.32711589336395264, + "learning_rate": 4.188424411235891e-05, + "loss": 0.0982, + "num_input_tokens_seen": 44680864, + "step": 20695 + }, + { + "epoch": 3.3768352365415986, + "grad_norm": 0.10940727591514587, + "learning_rate": 4.1878994063665734e-05, + "loss": 0.1583, + "num_input_tokens_seen": 44691456, + "step": 20700 + }, + { + "epoch": 3.377650897226754, + "grad_norm": 0.09100750833749771, + "learning_rate": 4.187374264667566e-05, + "loss": 0.0408, + "num_input_tokens_seen": 44702336, + "step": 20705 + }, + { + "epoch": 3.3784665579119086, + "grad_norm": 1.3329205513000488, + "learning_rate": 4.1868489861814394e-05, + "loss": 0.3791, + "num_input_tokens_seen": 44713728, + "step": 20710 + }, + { + "epoch": 3.3792822185970635, + "grad_norm": 0.5405666828155518, + "learning_rate": 4.1863235709507755e-05, + "loss": 0.116, + "num_input_tokens_seen": 44723968, + "step": 20715 + }, + { + "epoch": 3.3800978792822187, + "grad_norm": 0.11696047335863113, + "learning_rate": 4.1857980190181655e-05, + "loss": 0.093, + "num_input_tokens_seen": 44734304, + "step": 20720 + }, + { + "epoch": 3.3809135399673735, + "grad_norm": 1.093609094619751, + "learning_rate": 4.1852723304262145e-05, + "loss": 0.1968, + "num_input_tokens_seen": 44745568, + "step": 20725 + }, + { + "epoch": 3.3817292006525284, + "grad_norm": 0.8003025054931641, + "learning_rate": 4.1847465052175386e-05, + "loss": 0.0744, + "num_input_tokens_seen": 44757216, + "step": 20730 + }, + { + "epoch": 3.3825448613376836, + "grad_norm": 0.0745125487446785, + "learning_rate": 4.184220543434762e-05, + "loss": 0.0658, + "num_input_tokens_seen": 44767808, + "step": 20735 + }, + { + "epoch": 3.3833605220228384, + "grad_norm": 1.2239478826522827, + "learning_rate": 4.1836944451205215e-05, + "loss": 0.166, + "num_input_tokens_seen": 44778208, + "step": 20740 + }, + { + "epoch": 3.3841761827079937, + "grad_norm": 0.1356264054775238, + "learning_rate": 4.1831682103174676e-05, + "loss": 0.1234, + "num_input_tokens_seen": 44788032, + "step": 20745 + }, + { + "epoch": 3.3849918433931485, + "grad_norm": 1.318473219871521, + "learning_rate": 4.182641839068259e-05, + "loss": 0.2178, + "num_input_tokens_seen": 44797440, + "step": 20750 + }, + { + "epoch": 3.3858075040783033, + "grad_norm": 1.7627304792404175, + "learning_rate": 4.182115331415564e-05, + "loss": 0.21, + "num_input_tokens_seen": 44809248, + "step": 20755 + }, + { + "epoch": 3.3866231647634586, + "grad_norm": 0.8951590657234192, + "learning_rate": 4.1815886874020646e-05, + "loss": 0.1744, + "num_input_tokens_seen": 44820768, + "step": 20760 + }, + { + "epoch": 3.3874388254486134, + "grad_norm": 0.23522402346134186, + "learning_rate": 4.181061907070455e-05, + "loss": 0.0455, + "num_input_tokens_seen": 44832128, + "step": 20765 + }, + { + "epoch": 3.3882544861337682, + "grad_norm": 0.5904980897903442, + "learning_rate": 4.180534990463437e-05, + "loss": 0.1136, + "num_input_tokens_seen": 44840992, + "step": 20770 + }, + { + "epoch": 3.3890701468189235, + "grad_norm": 0.26223689317703247, + "learning_rate": 4.1800079376237265e-05, + "loss": 0.1036, + "num_input_tokens_seen": 44851232, + "step": 20775 + }, + { + "epoch": 3.3898858075040783, + "grad_norm": 0.13141851127147675, + "learning_rate": 4.179480748594048e-05, + "loss": 0.0478, + "num_input_tokens_seen": 44860672, + "step": 20780 + }, + { + "epoch": 3.390701468189233, + "grad_norm": 0.4659656584262848, + "learning_rate": 4.178953423417138e-05, + "loss": 0.0459, + "num_input_tokens_seen": 44870688, + "step": 20785 + }, + { + "epoch": 3.3915171288743884, + "grad_norm": 0.6954128742218018, + "learning_rate": 4.1784259621357444e-05, + "loss": 0.0697, + "num_input_tokens_seen": 44881760, + "step": 20790 + }, + { + "epoch": 3.392332789559543, + "grad_norm": 0.34243524074554443, + "learning_rate": 4.1778983647926274e-05, + "loss": 0.0494, + "num_input_tokens_seen": 44892896, + "step": 20795 + }, + { + "epoch": 3.393148450244698, + "grad_norm": 1.208329439163208, + "learning_rate": 4.177370631430554e-05, + "loss": 0.1316, + "num_input_tokens_seen": 44903296, + "step": 20800 + }, + { + "epoch": 3.3939641109298533, + "grad_norm": 0.4378150701522827, + "learning_rate": 4.176842762092307e-05, + "loss": 0.1134, + "num_input_tokens_seen": 44913664, + "step": 20805 + }, + { + "epoch": 3.394779771615008, + "grad_norm": 1.2153003215789795, + "learning_rate": 4.176314756820677e-05, + "loss": 0.181, + "num_input_tokens_seen": 44924864, + "step": 20810 + }, + { + "epoch": 3.395595432300163, + "grad_norm": 0.37409481406211853, + "learning_rate": 4.175786615658468e-05, + "loss": 0.1383, + "num_input_tokens_seen": 44935008, + "step": 20815 + }, + { + "epoch": 3.396411092985318, + "grad_norm": 0.038138728588819504, + "learning_rate": 4.175258338648493e-05, + "loss": 0.1054, + "num_input_tokens_seen": 44945280, + "step": 20820 + }, + { + "epoch": 3.397226753670473, + "grad_norm": 0.1878923922777176, + "learning_rate": 4.174729925833576e-05, + "loss": 0.0702, + "num_input_tokens_seen": 44956064, + "step": 20825 + }, + { + "epoch": 3.3980424143556283, + "grad_norm": 0.17885097861289978, + "learning_rate": 4.174201377256555e-05, + "loss": 0.0808, + "num_input_tokens_seen": 44967424, + "step": 20830 + }, + { + "epoch": 3.398858075040783, + "grad_norm": 0.33815720677375793, + "learning_rate": 4.173672692960274e-05, + "loss": 0.1003, + "num_input_tokens_seen": 44977952, + "step": 20835 + }, + { + "epoch": 3.399673735725938, + "grad_norm": 0.11137424409389496, + "learning_rate": 4.173143872987594e-05, + "loss": 0.1285, + "num_input_tokens_seen": 44988384, + "step": 20840 + }, + { + "epoch": 3.400489396411093, + "grad_norm": 0.019554266706109047, + "learning_rate": 4.172614917381381e-05, + "loss": 0.0359, + "num_input_tokens_seen": 44998816, + "step": 20845 + }, + { + "epoch": 3.401305057096248, + "grad_norm": 0.9815075397491455, + "learning_rate": 4.1720858261845166e-05, + "loss": 0.1502, + "num_input_tokens_seen": 45010112, + "step": 20850 + }, + { + "epoch": 3.402120717781403, + "grad_norm": 1.9425601959228516, + "learning_rate": 4.171556599439891e-05, + "loss": 0.2215, + "num_input_tokens_seen": 45021184, + "step": 20855 + }, + { + "epoch": 3.402936378466558, + "grad_norm": 0.6406822204589844, + "learning_rate": 4.1710272371904055e-05, + "loss": 0.2221, + "num_input_tokens_seen": 45030976, + "step": 20860 + }, + { + "epoch": 3.403752039151713, + "grad_norm": 1.1519079208374023, + "learning_rate": 4.170497739478974e-05, + "loss": 0.189, + "num_input_tokens_seen": 45042400, + "step": 20865 + }, + { + "epoch": 3.4045676998368677, + "grad_norm": 0.07258306443691254, + "learning_rate": 4.16996810634852e-05, + "loss": 0.0968, + "num_input_tokens_seen": 45052352, + "step": 20870 + }, + { + "epoch": 3.405383360522023, + "grad_norm": 0.678307056427002, + "learning_rate": 4.1694383378419774e-05, + "loss": 0.0405, + "num_input_tokens_seen": 45063520, + "step": 20875 + }, + { + "epoch": 3.4061990212071778, + "grad_norm": 0.8699730634689331, + "learning_rate": 4.168908434002292e-05, + "loss": 0.0833, + "num_input_tokens_seen": 45074144, + "step": 20880 + }, + { + "epoch": 3.407014681892333, + "grad_norm": 1.3913657665252686, + "learning_rate": 4.168378394872422e-05, + "loss": 0.1421, + "num_input_tokens_seen": 45084704, + "step": 20885 + }, + { + "epoch": 3.407830342577488, + "grad_norm": 1.0087631940841675, + "learning_rate": 4.167848220495334e-05, + "loss": 0.1036, + "num_input_tokens_seen": 45095744, + "step": 20890 + }, + { + "epoch": 3.4086460032626427, + "grad_norm": 0.23521724343299866, + "learning_rate": 4.167317910914006e-05, + "loss": 0.0887, + "num_input_tokens_seen": 45105952, + "step": 20895 + }, + { + "epoch": 3.4094616639477975, + "grad_norm": 0.2431805580854416, + "learning_rate": 4.166787466171429e-05, + "loss": 0.0323, + "num_input_tokens_seen": 45118080, + "step": 20900 + }, + { + "epoch": 3.4102773246329527, + "grad_norm": 0.04132357984781265, + "learning_rate": 4.166256886310602e-05, + "loss": 0.0346, + "num_input_tokens_seen": 45129024, + "step": 20905 + }, + { + "epoch": 3.4110929853181076, + "grad_norm": 1.831483244895935, + "learning_rate": 4.165726171374538e-05, + "loss": 0.0612, + "num_input_tokens_seen": 45139200, + "step": 20910 + }, + { + "epoch": 3.411908646003263, + "grad_norm": 0.040500421077013016, + "learning_rate": 4.165195321406259e-05, + "loss": 0.0698, + "num_input_tokens_seen": 45150752, + "step": 20915 + }, + { + "epoch": 3.4127243066884176, + "grad_norm": 0.1911318302154541, + "learning_rate": 4.164664336448797e-05, + "loss": 0.1586, + "num_input_tokens_seen": 45162016, + "step": 20920 + }, + { + "epoch": 3.4135399673735725, + "grad_norm": 0.4213143289089203, + "learning_rate": 4.164133216545199e-05, + "loss": 0.1209, + "num_input_tokens_seen": 45171904, + "step": 20925 + }, + { + "epoch": 3.4143556280587277, + "grad_norm": 2.1356234550476074, + "learning_rate": 4.163601961738517e-05, + "loss": 0.2055, + "num_input_tokens_seen": 45183712, + "step": 20930 + }, + { + "epoch": 3.4151712887438825, + "grad_norm": 1.140232801437378, + "learning_rate": 4.16307057207182e-05, + "loss": 0.0839, + "num_input_tokens_seen": 45194656, + "step": 20935 + }, + { + "epoch": 3.4159869494290374, + "grad_norm": 0.05285272374749184, + "learning_rate": 4.162539047588183e-05, + "loss": 0.1871, + "num_input_tokens_seen": 45204800, + "step": 20940 + }, + { + "epoch": 3.4168026101141926, + "grad_norm": 1.2122323513031006, + "learning_rate": 4.162007388330696e-05, + "loss": 0.1439, + "num_input_tokens_seen": 45215776, + "step": 20945 + }, + { + "epoch": 3.4176182707993474, + "grad_norm": 0.3053913414478302, + "learning_rate": 4.1614755943424575e-05, + "loss": 0.0781, + "num_input_tokens_seen": 45225952, + "step": 20950 + }, + { + "epoch": 3.4184339314845023, + "grad_norm": 0.4541253447532654, + "learning_rate": 4.160943665666577e-05, + "loss": 0.0877, + "num_input_tokens_seen": 45237312, + "step": 20955 + }, + { + "epoch": 3.4192495921696575, + "grad_norm": 0.5633123517036438, + "learning_rate": 4.160411602346175e-05, + "loss": 0.052, + "num_input_tokens_seen": 45246912, + "step": 20960 + }, + { + "epoch": 3.4200652528548123, + "grad_norm": 0.18381142616271973, + "learning_rate": 4.159879404424384e-05, + "loss": 0.0859, + "num_input_tokens_seen": 45257440, + "step": 20965 + }, + { + "epoch": 3.4208809135399676, + "grad_norm": 0.3028866648674011, + "learning_rate": 4.159347071944346e-05, + "loss": 0.1447, + "num_input_tokens_seen": 45268416, + "step": 20970 + }, + { + "epoch": 3.4216965742251224, + "grad_norm": 0.132730171084404, + "learning_rate": 4.158814604949215e-05, + "loss": 0.1235, + "num_input_tokens_seen": 45278848, + "step": 20975 + }, + { + "epoch": 3.4225122349102772, + "grad_norm": 0.28434517979621887, + "learning_rate": 4.158282003482156e-05, + "loss": 0.0441, + "num_input_tokens_seen": 45291072, + "step": 20980 + }, + { + "epoch": 3.4233278955954325, + "grad_norm": 0.5835362672805786, + "learning_rate": 4.157749267586343e-05, + "loss": 0.1276, + "num_input_tokens_seen": 45302336, + "step": 20985 + }, + { + "epoch": 3.4241435562805873, + "grad_norm": 0.17687270045280457, + "learning_rate": 4.1572163973049624e-05, + "loss": 0.043, + "num_input_tokens_seen": 45312864, + "step": 20990 + }, + { + "epoch": 3.424959216965742, + "grad_norm": 0.5782681107521057, + "learning_rate": 4.1566833926812135e-05, + "loss": 0.164, + "num_input_tokens_seen": 45323744, + "step": 20995 + }, + { + "epoch": 3.4257748776508974, + "grad_norm": 0.42257899045944214, + "learning_rate": 4.1561502537583016e-05, + "loss": 0.1895, + "num_input_tokens_seen": 45333760, + "step": 21000 + }, + { + "epoch": 3.426590538336052, + "grad_norm": 0.5675125122070312, + "learning_rate": 4.155616980579447e-05, + "loss": 0.2282, + "num_input_tokens_seen": 45344896, + "step": 21005 + }, + { + "epoch": 3.427406199021207, + "grad_norm": 1.7860918045043945, + "learning_rate": 4.155083573187881e-05, + "loss": 0.1148, + "num_input_tokens_seen": 45354336, + "step": 21010 + }, + { + "epoch": 3.4282218597063623, + "grad_norm": 0.09939247369766235, + "learning_rate": 4.154550031626842e-05, + "loss": 0.2634, + "num_input_tokens_seen": 45365376, + "step": 21015 + }, + { + "epoch": 3.429037520391517, + "grad_norm": 0.8350715637207031, + "learning_rate": 4.1540163559395816e-05, + "loss": 0.072, + "num_input_tokens_seen": 45376416, + "step": 21020 + }, + { + "epoch": 3.429853181076672, + "grad_norm": 0.209537535905838, + "learning_rate": 4.153482546169364e-05, + "loss": 0.1577, + "num_input_tokens_seen": 45386272, + "step": 21025 + }, + { + "epoch": 3.430668841761827, + "grad_norm": 0.04453764483332634, + "learning_rate": 4.15294860235946e-05, + "loss": 0.067, + "num_input_tokens_seen": 45395936, + "step": 21030 + }, + { + "epoch": 3.431484502446982, + "grad_norm": 0.09188912063837051, + "learning_rate": 4.152414524553156e-05, + "loss": 0.0703, + "num_input_tokens_seen": 45407680, + "step": 21035 + }, + { + "epoch": 3.432300163132137, + "grad_norm": 1.3226426839828491, + "learning_rate": 4.1518803127937464e-05, + "loss": 0.0541, + "num_input_tokens_seen": 45417088, + "step": 21040 + }, + { + "epoch": 3.433115823817292, + "grad_norm": 0.32461977005004883, + "learning_rate": 4.1513459671245384e-05, + "loss": 0.0962, + "num_input_tokens_seen": 45429536, + "step": 21045 + }, + { + "epoch": 3.433931484502447, + "grad_norm": 0.13267523050308228, + "learning_rate": 4.150811487588846e-05, + "loss": 0.0472, + "num_input_tokens_seen": 45440320, + "step": 21050 + }, + { + "epoch": 3.434747145187602, + "grad_norm": 0.8796838521957397, + "learning_rate": 4.150276874229999e-05, + "loss": 0.1874, + "num_input_tokens_seen": 45451008, + "step": 21055 + }, + { + "epoch": 3.435562805872757, + "grad_norm": 0.6520984768867493, + "learning_rate": 4.149742127091335e-05, + "loss": 0.066, + "num_input_tokens_seen": 45462432, + "step": 21060 + }, + { + "epoch": 3.436378466557912, + "grad_norm": 2.018832206726074, + "learning_rate": 4.149207246216203e-05, + "loss": 0.1727, + "num_input_tokens_seen": 45472544, + "step": 21065 + }, + { + "epoch": 3.437194127243067, + "grad_norm": 0.187293142080307, + "learning_rate": 4.1486722316479635e-05, + "loss": 0.0803, + "num_input_tokens_seen": 45483168, + "step": 21070 + }, + { + "epoch": 3.438009787928222, + "grad_norm": 0.08988659083843231, + "learning_rate": 4.1481370834299884e-05, + "loss": 0.0289, + "num_input_tokens_seen": 45492832, + "step": 21075 + }, + { + "epoch": 3.4388254486133767, + "grad_norm": 1.0860850811004639, + "learning_rate": 4.1476018016056583e-05, + "loss": 0.2663, + "num_input_tokens_seen": 45504064, + "step": 21080 + }, + { + "epoch": 3.439641109298532, + "grad_norm": 1.0470513105392456, + "learning_rate": 4.1470663862183664e-05, + "loss": 0.156, + "num_input_tokens_seen": 45514336, + "step": 21085 + }, + { + "epoch": 3.4404567699836868, + "grad_norm": 0.2596949338912964, + "learning_rate": 4.146530837311516e-05, + "loss": 0.0917, + "num_input_tokens_seen": 45524480, + "step": 21090 + }, + { + "epoch": 3.4412724306688416, + "grad_norm": 2.8096089363098145, + "learning_rate": 4.145995154928521e-05, + "loss": 0.2803, + "num_input_tokens_seen": 45535264, + "step": 21095 + }, + { + "epoch": 3.442088091353997, + "grad_norm": 1.0723637342453003, + "learning_rate": 4.1454593391128084e-05, + "loss": 0.0681, + "num_input_tokens_seen": 45546560, + "step": 21100 + }, + { + "epoch": 3.4429037520391517, + "grad_norm": 0.26469871401786804, + "learning_rate": 4.144923389907812e-05, + "loss": 0.0624, + "num_input_tokens_seen": 45558016, + "step": 21105 + }, + { + "epoch": 3.443719412724307, + "grad_norm": 0.1301802545785904, + "learning_rate": 4.1443873073569796e-05, + "loss": 0.1188, + "num_input_tokens_seen": 45568512, + "step": 21110 + }, + { + "epoch": 3.4445350734094617, + "grad_norm": 0.4232877492904663, + "learning_rate": 4.143851091503768e-05, + "loss": 0.131, + "num_input_tokens_seen": 45579968, + "step": 21115 + }, + { + "epoch": 3.4453507340946166, + "grad_norm": 0.5492163896560669, + "learning_rate": 4.1433147423916466e-05, + "loss": 0.0642, + "num_input_tokens_seen": 45592064, + "step": 21120 + }, + { + "epoch": 3.4461663947797714, + "grad_norm": 0.1130504384636879, + "learning_rate": 4.1427782600640943e-05, + "loss": 0.1292, + "num_input_tokens_seen": 45603040, + "step": 21125 + }, + { + "epoch": 3.4469820554649266, + "grad_norm": 0.13097329437732697, + "learning_rate": 4.1422416445646e-05, + "loss": 0.0441, + "num_input_tokens_seen": 45613536, + "step": 21130 + }, + { + "epoch": 3.4477977161500815, + "grad_norm": 0.37002208828926086, + "learning_rate": 4.141704895936666e-05, + "loss": 0.0693, + "num_input_tokens_seen": 45625120, + "step": 21135 + }, + { + "epoch": 3.4486133768352367, + "grad_norm": 2.2499969005584717, + "learning_rate": 4.141168014223803e-05, + "loss": 0.1569, + "num_input_tokens_seen": 45635744, + "step": 21140 + }, + { + "epoch": 3.4494290375203915, + "grad_norm": 0.15823428332805634, + "learning_rate": 4.1406309994695335e-05, + "loss": 0.166, + "num_input_tokens_seen": 45646432, + "step": 21145 + }, + { + "epoch": 3.4502446982055464, + "grad_norm": 1.2551742792129517, + "learning_rate": 4.1400938517173905e-05, + "loss": 0.1132, + "num_input_tokens_seen": 45656576, + "step": 21150 + }, + { + "epoch": 3.4510603588907016, + "grad_norm": 0.6153711676597595, + "learning_rate": 4.139556571010919e-05, + "loss": 0.2611, + "num_input_tokens_seen": 45667904, + "step": 21155 + }, + { + "epoch": 3.4518760195758564, + "grad_norm": 0.07157787680625916, + "learning_rate": 4.139019157393672e-05, + "loss": 0.039, + "num_input_tokens_seen": 45678528, + "step": 21160 + }, + { + "epoch": 3.4526916802610113, + "grad_norm": 1.532470703125, + "learning_rate": 4.138481610909216e-05, + "loss": 0.21, + "num_input_tokens_seen": 45689312, + "step": 21165 + }, + { + "epoch": 3.4535073409461665, + "grad_norm": 1.0007938146591187, + "learning_rate": 4.137943931601127e-05, + "loss": 0.0527, + "num_input_tokens_seen": 45700160, + "step": 21170 + }, + { + "epoch": 3.4543230016313213, + "grad_norm": 1.9624125957489014, + "learning_rate": 4.1374061195129924e-05, + "loss": 0.2307, + "num_input_tokens_seen": 45710432, + "step": 21175 + }, + { + "epoch": 3.455138662316476, + "grad_norm": 0.902416467666626, + "learning_rate": 4.13686817468841e-05, + "loss": 0.0474, + "num_input_tokens_seen": 45721248, + "step": 21180 + }, + { + "epoch": 3.4559543230016314, + "grad_norm": 0.5320085287094116, + "learning_rate": 4.1363300971709866e-05, + "loss": 0.0987, + "num_input_tokens_seen": 45732768, + "step": 21185 + }, + { + "epoch": 3.4567699836867862, + "grad_norm": 0.15607000887393951, + "learning_rate": 4.135791887004344e-05, + "loss": 0.0244, + "num_input_tokens_seen": 45743200, + "step": 21190 + }, + { + "epoch": 3.4575856443719415, + "grad_norm": 0.23226548731327057, + "learning_rate": 4.13525354423211e-05, + "loss": 0.155, + "num_input_tokens_seen": 45753376, + "step": 21195 + }, + { + "epoch": 3.4584013050570963, + "grad_norm": 0.5796207189559937, + "learning_rate": 4.1347150688979275e-05, + "loss": 0.085, + "num_input_tokens_seen": 45764064, + "step": 21200 + }, + { + "epoch": 3.459216965742251, + "grad_norm": 1.11313796043396, + "learning_rate": 4.134176461045447e-05, + "loss": 0.2084, + "num_input_tokens_seen": 45774944, + "step": 21205 + }, + { + "epoch": 3.4600326264274064, + "grad_norm": 0.7064036726951599, + "learning_rate": 4.133637720718331e-05, + "loss": 0.1837, + "num_input_tokens_seen": 45785856, + "step": 21210 + }, + { + "epoch": 3.460848287112561, + "grad_norm": 2.124871253967285, + "learning_rate": 4.133098847960252e-05, + "loss": 0.3672, + "num_input_tokens_seen": 45796352, + "step": 21215 + }, + { + "epoch": 3.461663947797716, + "grad_norm": 1.523603081703186, + "learning_rate": 4.1325598428148935e-05, + "loss": 0.0892, + "num_input_tokens_seen": 45808160, + "step": 21220 + }, + { + "epoch": 3.4624796084828713, + "grad_norm": 0.21834760904312134, + "learning_rate": 4.132020705325952e-05, + "loss": 0.0671, + "num_input_tokens_seen": 45819744, + "step": 21225 + }, + { + "epoch": 3.463295269168026, + "grad_norm": 0.03198466822504997, + "learning_rate": 4.13148143553713e-05, + "loss": 0.1836, + "num_input_tokens_seen": 45830048, + "step": 21230 + }, + { + "epoch": 3.464110929853181, + "grad_norm": 1.2551103830337524, + "learning_rate": 4.130942033492146e-05, + "loss": 0.0877, + "num_input_tokens_seen": 45841216, + "step": 21235 + }, + { + "epoch": 3.464926590538336, + "grad_norm": 0.18150414526462555, + "learning_rate": 4.1304024992347245e-05, + "loss": 0.0807, + "num_input_tokens_seen": 45850720, + "step": 21240 + }, + { + "epoch": 3.465742251223491, + "grad_norm": 1.5147252082824707, + "learning_rate": 4.129862832808604e-05, + "loss": 0.2355, + "num_input_tokens_seen": 45862272, + "step": 21245 + }, + { + "epoch": 3.466557911908646, + "grad_norm": 0.2292776107788086, + "learning_rate": 4.129323034257533e-05, + "loss": 0.2022, + "num_input_tokens_seen": 45872576, + "step": 21250 + }, + { + "epoch": 3.467373572593801, + "grad_norm": 0.05998830869793892, + "learning_rate": 4.128783103625269e-05, + "loss": 0.1693, + "num_input_tokens_seen": 45882816, + "step": 21255 + }, + { + "epoch": 3.468189233278956, + "grad_norm": 1.3748859167099, + "learning_rate": 4.128243040955583e-05, + "loss": 0.1778, + "num_input_tokens_seen": 45893856, + "step": 21260 + }, + { + "epoch": 3.4690048939641107, + "grad_norm": 1.3118088245391846, + "learning_rate": 4.1277028462922535e-05, + "loss": 0.1083, + "num_input_tokens_seen": 45904128, + "step": 21265 + }, + { + "epoch": 3.469820554649266, + "grad_norm": 1.030863642692566, + "learning_rate": 4.127162519679073e-05, + "loss": 0.1523, + "num_input_tokens_seen": 45914624, + "step": 21270 + }, + { + "epoch": 3.470636215334421, + "grad_norm": 1.3712297677993774, + "learning_rate": 4.126622061159843e-05, + "loss": 0.1713, + "num_input_tokens_seen": 45926432, + "step": 21275 + }, + { + "epoch": 3.471451876019576, + "grad_norm": 0.9359384775161743, + "learning_rate": 4.126081470778375e-05, + "loss": 0.0654, + "num_input_tokens_seen": 45936608, + "step": 21280 + }, + { + "epoch": 3.472267536704731, + "grad_norm": 0.13954168558120728, + "learning_rate": 4.125540748578491e-05, + "loss": 0.1627, + "num_input_tokens_seen": 45948768, + "step": 21285 + }, + { + "epoch": 3.4730831973898857, + "grad_norm": 0.03404529020190239, + "learning_rate": 4.124999894604028e-05, + "loss": 0.0597, + "num_input_tokens_seen": 45958080, + "step": 21290 + }, + { + "epoch": 3.473898858075041, + "grad_norm": 1.006542682647705, + "learning_rate": 4.124458908898827e-05, + "loss": 0.1759, + "num_input_tokens_seen": 45968768, + "step": 21295 + }, + { + "epoch": 3.4747145187601958, + "grad_norm": 1.4531551599502563, + "learning_rate": 4.1239177915067454e-05, + "loss": 0.2055, + "num_input_tokens_seen": 45979360, + "step": 21300 + }, + { + "epoch": 3.4755301794453506, + "grad_norm": 0.3092147409915924, + "learning_rate": 4.123376542471648e-05, + "loss": 0.0547, + "num_input_tokens_seen": 45990880, + "step": 21305 + }, + { + "epoch": 3.476345840130506, + "grad_norm": 0.032198067754507065, + "learning_rate": 4.122835161837409e-05, + "loss": 0.1668, + "num_input_tokens_seen": 46001344, + "step": 21310 + }, + { + "epoch": 3.4771615008156607, + "grad_norm": 1.4173557758331299, + "learning_rate": 4.12229364964792e-05, + "loss": 0.1047, + "num_input_tokens_seen": 46013152, + "step": 21315 + }, + { + "epoch": 3.4779771615008155, + "grad_norm": 0.36894047260284424, + "learning_rate": 4.121752005947076e-05, + "loss": 0.1562, + "num_input_tokens_seen": 46023424, + "step": 21320 + }, + { + "epoch": 3.4787928221859707, + "grad_norm": 0.7583985328674316, + "learning_rate": 4.121210230778785e-05, + "loss": 0.0674, + "num_input_tokens_seen": 46033568, + "step": 21325 + }, + { + "epoch": 3.4796084828711256, + "grad_norm": 0.5663972496986389, + "learning_rate": 4.120668324186967e-05, + "loss": 0.0511, + "num_input_tokens_seen": 46045088, + "step": 21330 + }, + { + "epoch": 3.480424143556281, + "grad_norm": 0.18726637959480286, + "learning_rate": 4.120126286215552e-05, + "loss": 0.1247, + "num_input_tokens_seen": 46056640, + "step": 21335 + }, + { + "epoch": 3.4812398042414356, + "grad_norm": 0.2130858302116394, + "learning_rate": 4.119584116908478e-05, + "loss": 0.0954, + "num_input_tokens_seen": 46067008, + "step": 21340 + }, + { + "epoch": 3.4820554649265905, + "grad_norm": 0.07508160173892975, + "learning_rate": 4.1190418163097e-05, + "loss": 0.0619, + "num_input_tokens_seen": 46078048, + "step": 21345 + }, + { + "epoch": 3.4828711256117453, + "grad_norm": 0.08054173737764359, + "learning_rate": 4.118499384463176e-05, + "loss": 0.1242, + "num_input_tokens_seen": 46089568, + "step": 21350 + }, + { + "epoch": 3.4836867862969005, + "grad_norm": 1.273911476135254, + "learning_rate": 4.1179568214128805e-05, + "loss": 0.3072, + "num_input_tokens_seen": 46101024, + "step": 21355 + }, + { + "epoch": 3.4845024469820554, + "grad_norm": 1.4062918424606323, + "learning_rate": 4.117414127202795e-05, + "loss": 0.1803, + "num_input_tokens_seen": 46112064, + "step": 21360 + }, + { + "epoch": 3.4853181076672106, + "grad_norm": 1.2397923469543457, + "learning_rate": 4.116871301876914e-05, + "loss": 0.1246, + "num_input_tokens_seen": 46122080, + "step": 21365 + }, + { + "epoch": 3.4861337683523654, + "grad_norm": 1.5942530632019043, + "learning_rate": 4.116328345479241e-05, + "loss": 0.1755, + "num_input_tokens_seen": 46132992, + "step": 21370 + }, + { + "epoch": 3.4869494290375203, + "grad_norm": 1.6760292053222656, + "learning_rate": 4.115785258053792e-05, + "loss": 0.1877, + "num_input_tokens_seen": 46144576, + "step": 21375 + }, + { + "epoch": 3.4877650897226755, + "grad_norm": 0.25462713837623596, + "learning_rate": 4.1152420396445915e-05, + "loss": 0.2598, + "num_input_tokens_seen": 46155296, + "step": 21380 + }, + { + "epoch": 3.4885807504078303, + "grad_norm": 2.0932681560516357, + "learning_rate": 4.1146986902956745e-05, + "loss": 0.1551, + "num_input_tokens_seen": 46166528, + "step": 21385 + }, + { + "epoch": 3.489396411092985, + "grad_norm": 0.7462074756622314, + "learning_rate": 4.1141552100510896e-05, + "loss": 0.1652, + "num_input_tokens_seen": 46176672, + "step": 21390 + }, + { + "epoch": 3.4902120717781404, + "grad_norm": 0.2554551362991333, + "learning_rate": 4.1136115989548926e-05, + "loss": 0.0773, + "num_input_tokens_seen": 46186880, + "step": 21395 + }, + { + "epoch": 3.4910277324632952, + "grad_norm": 1.4749929904937744, + "learning_rate": 4.113067857051153e-05, + "loss": 0.15, + "num_input_tokens_seen": 46196320, + "step": 21400 + }, + { + "epoch": 3.49184339314845, + "grad_norm": 0.04471868276596069, + "learning_rate": 4.112523984383948e-05, + "loss": 0.0612, + "num_input_tokens_seen": 46207040, + "step": 21405 + }, + { + "epoch": 3.4926590538336053, + "grad_norm": 0.6691175103187561, + "learning_rate": 4.111979980997366e-05, + "loss": 0.1117, + "num_input_tokens_seen": 46217248, + "step": 21410 + }, + { + "epoch": 3.49347471451876, + "grad_norm": 0.2741221785545349, + "learning_rate": 4.1114358469355084e-05, + "loss": 0.1398, + "num_input_tokens_seen": 46226688, + "step": 21415 + }, + { + "epoch": 3.4942903752039154, + "grad_norm": 0.379067599773407, + "learning_rate": 4.110891582242485e-05, + "loss": 0.1426, + "num_input_tokens_seen": 46236768, + "step": 21420 + }, + { + "epoch": 3.49510603588907, + "grad_norm": 0.39113929867744446, + "learning_rate": 4.1103471869624154e-05, + "loss": 0.0898, + "num_input_tokens_seen": 46247872, + "step": 21425 + }, + { + "epoch": 3.495921696574225, + "grad_norm": 1.0275293588638306, + "learning_rate": 4.109802661139433e-05, + "loss": 0.21, + "num_input_tokens_seen": 46258688, + "step": 21430 + }, + { + "epoch": 3.4967373572593803, + "grad_norm": 0.6147500872612, + "learning_rate": 4.109258004817679e-05, + "loss": 0.116, + "num_input_tokens_seen": 46269504, + "step": 21435 + }, + { + "epoch": 3.497553017944535, + "grad_norm": 0.8244101405143738, + "learning_rate": 4.1087132180413047e-05, + "loss": 0.1231, + "num_input_tokens_seen": 46280928, + "step": 21440 + }, + { + "epoch": 3.49836867862969, + "grad_norm": 0.08629418164491653, + "learning_rate": 4.108168300854475e-05, + "loss": 0.1589, + "num_input_tokens_seen": 46291648, + "step": 21445 + }, + { + "epoch": 3.499184339314845, + "grad_norm": 1.069753646850586, + "learning_rate": 4.1076232533013635e-05, + "loss": 0.1139, + "num_input_tokens_seen": 46302592, + "step": 21450 + }, + { + "epoch": 3.5, + "grad_norm": 0.29237836599349976, + "learning_rate": 4.1070780754261533e-05, + "loss": 0.1356, + "num_input_tokens_seen": 46313216, + "step": 21455 + }, + { + "epoch": 3.5, + "eval_loss": 0.14037171006202698, + "eval_runtime": 131.8673, + "eval_samples_per_second": 20.665, + "eval_steps_per_second": 5.172, + "num_input_tokens_seen": 46313216, + "step": 21455 + }, + { + "epoch": 3.500815660685155, + "grad_norm": 0.2440900206565857, + "learning_rate": 4.10653276727304e-05, + "loss": 0.1917, + "num_input_tokens_seen": 46322656, + "step": 21460 + }, + { + "epoch": 3.50163132137031, + "grad_norm": 0.5341936945915222, + "learning_rate": 4.105987328886229e-05, + "loss": 0.1053, + "num_input_tokens_seen": 46332576, + "step": 21465 + }, + { + "epoch": 3.502446982055465, + "grad_norm": 0.012847920879721642, + "learning_rate": 4.1054417603099376e-05, + "loss": 0.1641, + "num_input_tokens_seen": 46343584, + "step": 21470 + }, + { + "epoch": 3.50326264274062, + "grad_norm": 0.05842230096459389, + "learning_rate": 4.104896061588391e-05, + "loss": 0.0357, + "num_input_tokens_seen": 46354240, + "step": 21475 + }, + { + "epoch": 3.504078303425775, + "grad_norm": 1.0171583890914917, + "learning_rate": 4.1043502327658256e-05, + "loss": 0.137, + "num_input_tokens_seen": 46364736, + "step": 21480 + }, + { + "epoch": 3.50489396411093, + "grad_norm": 0.11827445775270462, + "learning_rate": 4.1038042738864906e-05, + "loss": 0.1955, + "num_input_tokens_seen": 46374976, + "step": 21485 + }, + { + "epoch": 3.5057096247960846, + "grad_norm": 1.4060189723968506, + "learning_rate": 4.103258184994644e-05, + "loss": 0.182, + "num_input_tokens_seen": 46386080, + "step": 21490 + }, + { + "epoch": 3.50652528548124, + "grad_norm": 1.009465217590332, + "learning_rate": 4.102711966134553e-05, + "loss": 0.0998, + "num_input_tokens_seen": 46397152, + "step": 21495 + }, + { + "epoch": 3.5073409461663947, + "grad_norm": 0.4787003993988037, + "learning_rate": 4.102165617350498e-05, + "loss": 0.1134, + "num_input_tokens_seen": 46408480, + "step": 21500 + }, + { + "epoch": 3.50815660685155, + "grad_norm": 0.1543492078781128, + "learning_rate": 4.101619138686769e-05, + "loss": 0.1387, + "num_input_tokens_seen": 46419712, + "step": 21505 + }, + { + "epoch": 3.5089722675367048, + "grad_norm": 2.3671770095825195, + "learning_rate": 4.101072530187666e-05, + "loss": 0.2317, + "num_input_tokens_seen": 46431072, + "step": 21510 + }, + { + "epoch": 3.5097879282218596, + "grad_norm": 0.2627931833267212, + "learning_rate": 4.100525791897501e-05, + "loss": 0.0798, + "num_input_tokens_seen": 46441280, + "step": 21515 + }, + { + "epoch": 3.5106035889070144, + "grad_norm": 0.24285852909088135, + "learning_rate": 4.0999789238605925e-05, + "loss": 0.1658, + "num_input_tokens_seen": 46451808, + "step": 21520 + }, + { + "epoch": 3.5114192495921697, + "grad_norm": 1.985195517539978, + "learning_rate": 4.099431926121276e-05, + "loss": 0.1827, + "num_input_tokens_seen": 46463136, + "step": 21525 + }, + { + "epoch": 3.5122349102773245, + "grad_norm": 0.44379639625549316, + "learning_rate": 4.098884798723891e-05, + "loss": 0.1309, + "num_input_tokens_seen": 46473280, + "step": 21530 + }, + { + "epoch": 3.5130505709624797, + "grad_norm": 0.9799425005912781, + "learning_rate": 4.098337541712791e-05, + "loss": 0.0988, + "num_input_tokens_seen": 46484032, + "step": 21535 + }, + { + "epoch": 3.5138662316476346, + "grad_norm": 0.11051211506128311, + "learning_rate": 4.0977901551323414e-05, + "loss": 0.0389, + "num_input_tokens_seen": 46495488, + "step": 21540 + }, + { + "epoch": 3.5146818923327894, + "grad_norm": 0.9421526193618774, + "learning_rate": 4.097242639026914e-05, + "loss": 0.1917, + "num_input_tokens_seen": 46506944, + "step": 21545 + }, + { + "epoch": 3.5154975530179446, + "grad_norm": 0.6799578070640564, + "learning_rate": 4.0966949934408946e-05, + "loss": 0.1312, + "num_input_tokens_seen": 46517216, + "step": 21550 + }, + { + "epoch": 3.5163132137030995, + "grad_norm": 1.4050284624099731, + "learning_rate": 4.0961472184186766e-05, + "loss": 0.2027, + "num_input_tokens_seen": 46528864, + "step": 21555 + }, + { + "epoch": 3.5171288743882547, + "grad_norm": 0.21580615639686584, + "learning_rate": 4.0955993140046665e-05, + "loss": 0.159, + "num_input_tokens_seen": 46539072, + "step": 21560 + }, + { + "epoch": 3.5179445350734095, + "grad_norm": 0.3019736409187317, + "learning_rate": 4.095051280243281e-05, + "loss": 0.0904, + "num_input_tokens_seen": 46550016, + "step": 21565 + }, + { + "epoch": 3.5187601957585644, + "grad_norm": 0.6989091038703918, + "learning_rate": 4.0945031171789435e-05, + "loss": 0.1069, + "num_input_tokens_seen": 46559968, + "step": 21570 + }, + { + "epoch": 3.519575856443719, + "grad_norm": 0.41753318905830383, + "learning_rate": 4.0939548248560946e-05, + "loss": 0.0525, + "num_input_tokens_seen": 46571456, + "step": 21575 + }, + { + "epoch": 3.5203915171288744, + "grad_norm": 0.15549147129058838, + "learning_rate": 4.093406403319179e-05, + "loss": 0.0986, + "num_input_tokens_seen": 46583104, + "step": 21580 + }, + { + "epoch": 3.5212071778140293, + "grad_norm": 0.6112769842147827, + "learning_rate": 4.0928578526126566e-05, + "loss": 0.1561, + "num_input_tokens_seen": 46592384, + "step": 21585 + }, + { + "epoch": 3.5220228384991845, + "grad_norm": 0.6356746554374695, + "learning_rate": 4.092309172780994e-05, + "loss": 0.0628, + "num_input_tokens_seen": 46603488, + "step": 21590 + }, + { + "epoch": 3.5228384991843393, + "grad_norm": 0.09601086378097534, + "learning_rate": 4.09176036386867e-05, + "loss": 0.189, + "num_input_tokens_seen": 46614528, + "step": 21595 + }, + { + "epoch": 3.523654159869494, + "grad_norm": 0.8101431131362915, + "learning_rate": 4.091211425920175e-05, + "loss": 0.0653, + "num_input_tokens_seen": 46625920, + "step": 21600 + }, + { + "epoch": 3.5244698205546494, + "grad_norm": 0.023896178230643272, + "learning_rate": 4.090662358980009e-05, + "loss": 0.0536, + "num_input_tokens_seen": 46636288, + "step": 21605 + }, + { + "epoch": 3.5252854812398042, + "grad_norm": 1.5607773065567017, + "learning_rate": 4.0901131630926794e-05, + "loss": 0.2219, + "num_input_tokens_seen": 46645536, + "step": 21610 + }, + { + "epoch": 3.5261011419249595, + "grad_norm": 0.1344711035490036, + "learning_rate": 4.089563838302709e-05, + "loss": 0.064, + "num_input_tokens_seen": 46656224, + "step": 21615 + }, + { + "epoch": 3.5269168026101143, + "grad_norm": 0.9610986709594727, + "learning_rate": 4.089014384654629e-05, + "loss": 0.0829, + "num_input_tokens_seen": 46665920, + "step": 21620 + }, + { + "epoch": 3.527732463295269, + "grad_norm": 0.463426798582077, + "learning_rate": 4.088464802192981e-05, + "loss": 0.0811, + "num_input_tokens_seen": 46676864, + "step": 21625 + }, + { + "epoch": 3.528548123980424, + "grad_norm": 0.0913710668683052, + "learning_rate": 4.0879150909623156e-05, + "loss": 0.1334, + "num_input_tokens_seen": 46688192, + "step": 21630 + }, + { + "epoch": 3.529363784665579, + "grad_norm": 0.07081405073404312, + "learning_rate": 4.0873652510071955e-05, + "loss": 0.1223, + "num_input_tokens_seen": 46698848, + "step": 21635 + }, + { + "epoch": 3.530179445350734, + "grad_norm": 1.4261358976364136, + "learning_rate": 4.086815282372195e-05, + "loss": 0.1152, + "num_input_tokens_seen": 46709920, + "step": 21640 + }, + { + "epoch": 3.5309951060358893, + "grad_norm": 0.7629926800727844, + "learning_rate": 4.086265185101895e-05, + "loss": 0.1456, + "num_input_tokens_seen": 46721152, + "step": 21645 + }, + { + "epoch": 3.531810766721044, + "grad_norm": 0.4027405381202698, + "learning_rate": 4.0857149592408914e-05, + "loss": 0.0889, + "num_input_tokens_seen": 46732320, + "step": 21650 + }, + { + "epoch": 3.532626427406199, + "grad_norm": 0.15421830117702484, + "learning_rate": 4.085164604833788e-05, + "loss": 0.1385, + "num_input_tokens_seen": 46744032, + "step": 21655 + }, + { + "epoch": 3.5334420880913537, + "grad_norm": 0.5925560593605042, + "learning_rate": 4.084614121925198e-05, + "loss": 0.1605, + "num_input_tokens_seen": 46755584, + "step": 21660 + }, + { + "epoch": 3.534257748776509, + "grad_norm": 1.2748488187789917, + "learning_rate": 4.084063510559746e-05, + "loss": 0.1267, + "num_input_tokens_seen": 46766368, + "step": 21665 + }, + { + "epoch": 3.535073409461664, + "grad_norm": 0.3488668203353882, + "learning_rate": 4.0835127707820696e-05, + "loss": 0.1471, + "num_input_tokens_seen": 46777920, + "step": 21670 + }, + { + "epoch": 3.535889070146819, + "grad_norm": 0.07630845904350281, + "learning_rate": 4.0829619026368134e-05, + "loss": 0.2018, + "num_input_tokens_seen": 46789312, + "step": 21675 + }, + { + "epoch": 3.536704730831974, + "grad_norm": 0.08554045855998993, + "learning_rate": 4.0824109061686325e-05, + "loss": 0.0661, + "num_input_tokens_seen": 46800736, + "step": 21680 + }, + { + "epoch": 3.5375203915171287, + "grad_norm": 2.337381601333618, + "learning_rate": 4.081859781422195e-05, + "loss": 0.246, + "num_input_tokens_seen": 46812096, + "step": 21685 + }, + { + "epoch": 3.538336052202284, + "grad_norm": 0.41747793555259705, + "learning_rate": 4.0813085284421774e-05, + "loss": 0.2092, + "num_input_tokens_seen": 46822624, + "step": 21690 + }, + { + "epoch": 3.539151712887439, + "grad_norm": 2.2528226375579834, + "learning_rate": 4.080757147273267e-05, + "loss": 0.2123, + "num_input_tokens_seen": 46833760, + "step": 21695 + }, + { + "epoch": 3.539967373572594, + "grad_norm": 1.1469388008117676, + "learning_rate": 4.080205637960162e-05, + "loss": 0.1799, + "num_input_tokens_seen": 46845312, + "step": 21700 + }, + { + "epoch": 3.540783034257749, + "grad_norm": 0.6369066834449768, + "learning_rate": 4.07965400054757e-05, + "loss": 0.12, + "num_input_tokens_seen": 46855904, + "step": 21705 + }, + { + "epoch": 3.5415986949429037, + "grad_norm": 0.3036232888698578, + "learning_rate": 4.0791022350802086e-05, + "loss": 0.2013, + "num_input_tokens_seen": 46865760, + "step": 21710 + }, + { + "epoch": 3.5424143556280585, + "grad_norm": 0.37198108434677124, + "learning_rate": 4.078550341602809e-05, + "loss": 0.0357, + "num_input_tokens_seen": 46877888, + "step": 21715 + }, + { + "epoch": 3.5432300163132138, + "grad_norm": 0.4884043335914612, + "learning_rate": 4.077998320160109e-05, + "loss": 0.097, + "num_input_tokens_seen": 46889312, + "step": 21720 + }, + { + "epoch": 3.5440456769983686, + "grad_norm": 1.3437817096710205, + "learning_rate": 4.077446170796858e-05, + "loss": 0.1638, + "num_input_tokens_seen": 46899392, + "step": 21725 + }, + { + "epoch": 3.544861337683524, + "grad_norm": 0.5742864608764648, + "learning_rate": 4.076893893557816e-05, + "loss": 0.083, + "num_input_tokens_seen": 46910784, + "step": 21730 + }, + { + "epoch": 3.5456769983686787, + "grad_norm": 0.4358750581741333, + "learning_rate": 4.076341488487755e-05, + "loss": 0.2556, + "num_input_tokens_seen": 46921920, + "step": 21735 + }, + { + "epoch": 3.5464926590538335, + "grad_norm": 1.0535780191421509, + "learning_rate": 4.0757889556314545e-05, + "loss": 0.1065, + "num_input_tokens_seen": 46931776, + "step": 21740 + }, + { + "epoch": 3.5473083197389887, + "grad_norm": 1.8264961242675781, + "learning_rate": 4.0752362950337054e-05, + "loss": 0.2372, + "num_input_tokens_seen": 46942848, + "step": 21745 + }, + { + "epoch": 3.5481239804241436, + "grad_norm": 0.08123189955949783, + "learning_rate": 4.0746835067393096e-05, + "loss": 0.043, + "num_input_tokens_seen": 46953408, + "step": 21750 + }, + { + "epoch": 3.5489396411092984, + "grad_norm": 0.3914510905742645, + "learning_rate": 4.074130590793079e-05, + "loss": 0.0777, + "num_input_tokens_seen": 46964416, + "step": 21755 + }, + { + "epoch": 3.5497553017944536, + "grad_norm": 0.15825699269771576, + "learning_rate": 4.073577547239836e-05, + "loss": 0.1157, + "num_input_tokens_seen": 46974304, + "step": 21760 + }, + { + "epoch": 3.5505709624796085, + "grad_norm": 0.21117083728313446, + "learning_rate": 4.073024376124412e-05, + "loss": 0.0785, + "num_input_tokens_seen": 46984256, + "step": 21765 + }, + { + "epoch": 3.5513866231647633, + "grad_norm": 0.5812256336212158, + "learning_rate": 4.072471077491651e-05, + "loss": 0.2018, + "num_input_tokens_seen": 46995424, + "step": 21770 + }, + { + "epoch": 3.5522022838499185, + "grad_norm": 0.15163183212280273, + "learning_rate": 4.071917651386406e-05, + "loss": 0.2636, + "num_input_tokens_seen": 47006880, + "step": 21775 + }, + { + "epoch": 3.5530179445350734, + "grad_norm": 0.7479328513145447, + "learning_rate": 4.071364097853541e-05, + "loss": 0.1664, + "num_input_tokens_seen": 47016864, + "step": 21780 + }, + { + "epoch": 3.5538336052202286, + "grad_norm": 1.6031537055969238, + "learning_rate": 4.070810416937927e-05, + "loss": 0.2058, + "num_input_tokens_seen": 47029248, + "step": 21785 + }, + { + "epoch": 3.5546492659053834, + "grad_norm": 0.24858638644218445, + "learning_rate": 4.070256608684452e-05, + "loss": 0.081, + "num_input_tokens_seen": 47040480, + "step": 21790 + }, + { + "epoch": 3.5554649265905383, + "grad_norm": 0.3296593427658081, + "learning_rate": 4.069702673138009e-05, + "loss": 0.0687, + "num_input_tokens_seen": 47051872, + "step": 21795 + }, + { + "epoch": 3.556280587275693, + "grad_norm": 0.34410977363586426, + "learning_rate": 4.0691486103435025e-05, + "loss": 0.041, + "num_input_tokens_seen": 47063008, + "step": 21800 + }, + { + "epoch": 3.5570962479608483, + "grad_norm": 0.11920095235109329, + "learning_rate": 4.0685944203458476e-05, + "loss": 0.097, + "num_input_tokens_seen": 47073792, + "step": 21805 + }, + { + "epoch": 3.557911908646003, + "grad_norm": 0.18055571615695953, + "learning_rate": 4.06804010318997e-05, + "loss": 0.0923, + "num_input_tokens_seen": 47084832, + "step": 21810 + }, + { + "epoch": 3.5587275693311584, + "grad_norm": 0.5717656016349792, + "learning_rate": 4.0674856589208063e-05, + "loss": 0.0424, + "num_input_tokens_seen": 47096032, + "step": 21815 + }, + { + "epoch": 3.5595432300163132, + "grad_norm": 0.15171322226524353, + "learning_rate": 4.066931087583301e-05, + "loss": 0.0933, + "num_input_tokens_seen": 47106720, + "step": 21820 + }, + { + "epoch": 3.560358890701468, + "grad_norm": 0.14937041699886322, + "learning_rate": 4.0663763892224114e-05, + "loss": 0.0683, + "num_input_tokens_seen": 47118592, + "step": 21825 + }, + { + "epoch": 3.5611745513866233, + "grad_norm": 2.1657557487487793, + "learning_rate": 4.065821563883104e-05, + "loss": 0.1162, + "num_input_tokens_seen": 47129792, + "step": 21830 + }, + { + "epoch": 3.561990212071778, + "grad_norm": 0.28118646144866943, + "learning_rate": 4.0652666116103556e-05, + "loss": 0.0507, + "num_input_tokens_seen": 47140224, + "step": 21835 + }, + { + "epoch": 3.5628058727569334, + "grad_norm": 0.4537132680416107, + "learning_rate": 4.064711532449153e-05, + "loss": 0.0709, + "num_input_tokens_seen": 47151904, + "step": 21840 + }, + { + "epoch": 3.563621533442088, + "grad_norm": 1.0330581665039062, + "learning_rate": 4.0641563264444946e-05, + "loss": 0.0662, + "num_input_tokens_seen": 47161920, + "step": 21845 + }, + { + "epoch": 3.564437194127243, + "grad_norm": 0.16755837202072144, + "learning_rate": 4.063600993641389e-05, + "loss": 0.1493, + "num_input_tokens_seen": 47172128, + "step": 21850 + }, + { + "epoch": 3.565252854812398, + "grad_norm": 1.8683198690414429, + "learning_rate": 4.0630455340848525e-05, + "loss": 0.2185, + "num_input_tokens_seen": 47182784, + "step": 21855 + }, + { + "epoch": 3.566068515497553, + "grad_norm": 0.03482047840952873, + "learning_rate": 4.062489947819914e-05, + "loss": 0.0393, + "num_input_tokens_seen": 47194336, + "step": 21860 + }, + { + "epoch": 3.566884176182708, + "grad_norm": 0.6464216113090515, + "learning_rate": 4.061934234891612e-05, + "loss": 0.1244, + "num_input_tokens_seen": 47204832, + "step": 21865 + }, + { + "epoch": 3.567699836867863, + "grad_norm": 0.6125308275222778, + "learning_rate": 4.0613783953449966e-05, + "loss": 0.1689, + "num_input_tokens_seen": 47215648, + "step": 21870 + }, + { + "epoch": 3.568515497553018, + "grad_norm": 0.0632922500371933, + "learning_rate": 4.0608224292251264e-05, + "loss": 0.1221, + "num_input_tokens_seen": 47226080, + "step": 21875 + }, + { + "epoch": 3.569331158238173, + "grad_norm": 0.6401284337043762, + "learning_rate": 4.0602663365770696e-05, + "loss": 0.1952, + "num_input_tokens_seen": 47238112, + "step": 21880 + }, + { + "epoch": 3.5701468189233276, + "grad_norm": 1.7300751209259033, + "learning_rate": 4.0597101174459074e-05, + "loss": 0.1264, + "num_input_tokens_seen": 47249216, + "step": 21885 + }, + { + "epoch": 3.570962479608483, + "grad_norm": 0.045195698738098145, + "learning_rate": 4.0591537718767284e-05, + "loss": 0.0893, + "num_input_tokens_seen": 47259296, + "step": 21890 + }, + { + "epoch": 3.5717781402936377, + "grad_norm": 0.17726784944534302, + "learning_rate": 4.058597299914634e-05, + "loss": 0.1599, + "num_input_tokens_seen": 47270336, + "step": 21895 + }, + { + "epoch": 3.572593800978793, + "grad_norm": 0.8649657964706421, + "learning_rate": 4.0580407016047345e-05, + "loss": 0.0848, + "num_input_tokens_seen": 47280736, + "step": 21900 + }, + { + "epoch": 3.573409461663948, + "grad_norm": 1.4209532737731934, + "learning_rate": 4.0574839769921504e-05, + "loss": 0.1633, + "num_input_tokens_seen": 47291328, + "step": 21905 + }, + { + "epoch": 3.5742251223491026, + "grad_norm": 0.29775622487068176, + "learning_rate": 4.056927126122012e-05, + "loss": 0.1208, + "num_input_tokens_seen": 47302368, + "step": 21910 + }, + { + "epoch": 3.575040783034258, + "grad_norm": 1.8403836488723755, + "learning_rate": 4.056370149039461e-05, + "loss": 0.0836, + "num_input_tokens_seen": 47312768, + "step": 21915 + }, + { + "epoch": 3.5758564437194127, + "grad_norm": 0.11581604182720184, + "learning_rate": 4.05581304578965e-05, + "loss": 0.0534, + "num_input_tokens_seen": 47323520, + "step": 21920 + }, + { + "epoch": 3.576672104404568, + "grad_norm": 1.7136850357055664, + "learning_rate": 4.055255816417738e-05, + "loss": 0.1787, + "num_input_tokens_seen": 47334688, + "step": 21925 + }, + { + "epoch": 3.5774877650897228, + "grad_norm": 1.021734595298767, + "learning_rate": 4.054698460968899e-05, + "loss": 0.1255, + "num_input_tokens_seen": 47345184, + "step": 21930 + }, + { + "epoch": 3.5783034257748776, + "grad_norm": 0.522864043712616, + "learning_rate": 4.054140979488314e-05, + "loss": 0.0589, + "num_input_tokens_seen": 47356608, + "step": 21935 + }, + { + "epoch": 3.5791190864600324, + "grad_norm": 1.7569857835769653, + "learning_rate": 4.0535833720211755e-05, + "loss": 0.2997, + "num_input_tokens_seen": 47367520, + "step": 21940 + }, + { + "epoch": 3.5799347471451877, + "grad_norm": 0.5944653749465942, + "learning_rate": 4.053025638612686e-05, + "loss": 0.2031, + "num_input_tokens_seen": 47376288, + "step": 21945 + }, + { + "epoch": 3.5807504078303425, + "grad_norm": 0.4408424198627472, + "learning_rate": 4.052467779308058e-05, + "loss": 0.0578, + "num_input_tokens_seen": 47386592, + "step": 21950 + }, + { + "epoch": 3.5815660685154977, + "grad_norm": 1.2043009996414185, + "learning_rate": 4.051909794152515e-05, + "loss": 0.2358, + "num_input_tokens_seen": 47397792, + "step": 21955 + }, + { + "epoch": 3.5823817292006526, + "grad_norm": 0.0817117914557457, + "learning_rate": 4.05135168319129e-05, + "loss": 0.0317, + "num_input_tokens_seen": 47409280, + "step": 21960 + }, + { + "epoch": 3.5831973898858074, + "grad_norm": 0.057694174349308014, + "learning_rate": 4.050793446469626e-05, + "loss": 0.067, + "num_input_tokens_seen": 47419680, + "step": 21965 + }, + { + "epoch": 3.5840130505709626, + "grad_norm": 1.0496121644973755, + "learning_rate": 4.0502350840327764e-05, + "loss": 0.0994, + "num_input_tokens_seen": 47431296, + "step": 21970 + }, + { + "epoch": 3.5848287112561175, + "grad_norm": 2.1446094512939453, + "learning_rate": 4.0496765959260055e-05, + "loss": 0.1759, + "num_input_tokens_seen": 47441824, + "step": 21975 + }, + { + "epoch": 3.5856443719412723, + "grad_norm": 2.98195219039917, + "learning_rate": 4.049117982194586e-05, + "loss": 0.0627, + "num_input_tokens_seen": 47452192, + "step": 21980 + }, + { + "epoch": 3.5864600326264275, + "grad_norm": 1.482277750968933, + "learning_rate": 4.048559242883804e-05, + "loss": 0.0962, + "num_input_tokens_seen": 47461696, + "step": 21985 + }, + { + "epoch": 3.5872756933115824, + "grad_norm": 0.2997550368309021, + "learning_rate": 4.0480003780389507e-05, + "loss": 0.1468, + "num_input_tokens_seen": 47472960, + "step": 21990 + }, + { + "epoch": 3.588091353996737, + "grad_norm": 0.3548039495944977, + "learning_rate": 4.0474413877053335e-05, + "loss": 0.222, + "num_input_tokens_seen": 47483712, + "step": 21995 + }, + { + "epoch": 3.5889070146818924, + "grad_norm": 0.34236854314804077, + "learning_rate": 4.0468822719282654e-05, + "loss": 0.1979, + "num_input_tokens_seen": 47493728, + "step": 22000 + }, + { + "epoch": 3.5897226753670473, + "grad_norm": 0.1833425611257553, + "learning_rate": 4.046323030753071e-05, + "loss": 0.0204, + "num_input_tokens_seen": 47504928, + "step": 22005 + }, + { + "epoch": 3.5905383360522025, + "grad_norm": 0.5381787419319153, + "learning_rate": 4.045763664225087e-05, + "loss": 0.1173, + "num_input_tokens_seen": 47514880, + "step": 22010 + }, + { + "epoch": 3.5913539967373573, + "grad_norm": 0.9563619494438171, + "learning_rate": 4.045204172389656e-05, + "loss": 0.0574, + "num_input_tokens_seen": 47526688, + "step": 22015 + }, + { + "epoch": 3.592169657422512, + "grad_norm": 0.11658233404159546, + "learning_rate": 4.044644555292135e-05, + "loss": 0.0374, + "num_input_tokens_seen": 47536768, + "step": 22020 + }, + { + "epoch": 3.592985318107667, + "grad_norm": 0.07268639653921127, + "learning_rate": 4.04408481297789e-05, + "loss": 0.1688, + "num_input_tokens_seen": 47548768, + "step": 22025 + }, + { + "epoch": 3.5938009787928222, + "grad_norm": 0.07244197279214859, + "learning_rate": 4.043524945492294e-05, + "loss": 0.1012, + "num_input_tokens_seen": 47559968, + "step": 22030 + }, + { + "epoch": 3.594616639477977, + "grad_norm": 2.005072832107544, + "learning_rate": 4.042964952880734e-05, + "loss": 0.1372, + "num_input_tokens_seen": 47571040, + "step": 22035 + }, + { + "epoch": 3.5954323001631323, + "grad_norm": 0.7973790764808655, + "learning_rate": 4.042404835188607e-05, + "loss": 0.1056, + "num_input_tokens_seen": 47582816, + "step": 22040 + }, + { + "epoch": 3.596247960848287, + "grad_norm": 0.2094675898551941, + "learning_rate": 4.041844592461318e-05, + "loss": 0.0871, + "num_input_tokens_seen": 47594112, + "step": 22045 + }, + { + "epoch": 3.597063621533442, + "grad_norm": 0.48479169607162476, + "learning_rate": 4.0412842247442815e-05, + "loss": 0.1393, + "num_input_tokens_seen": 47604320, + "step": 22050 + }, + { + "epoch": 3.597879282218597, + "grad_norm": 0.08563528954982758, + "learning_rate": 4.040723732082927e-05, + "loss": 0.1528, + "num_input_tokens_seen": 47613472, + "step": 22055 + }, + { + "epoch": 3.598694942903752, + "grad_norm": 2.2007954120635986, + "learning_rate": 4.040163114522689e-05, + "loss": 0.2985, + "num_input_tokens_seen": 47624096, + "step": 22060 + }, + { + "epoch": 3.5995106035889073, + "grad_norm": 1.2863943576812744, + "learning_rate": 4.039602372109014e-05, + "loss": 0.0894, + "num_input_tokens_seen": 47635456, + "step": 22065 + }, + { + "epoch": 3.600326264274062, + "grad_norm": 1.0406988859176636, + "learning_rate": 4.0390415048873584e-05, + "loss": 0.0523, + "num_input_tokens_seen": 47644448, + "step": 22070 + }, + { + "epoch": 3.601141924959217, + "grad_norm": 1.0334203243255615, + "learning_rate": 4.03848051290319e-05, + "loss": 0.1798, + "num_input_tokens_seen": 47654144, + "step": 22075 + }, + { + "epoch": 3.6019575856443717, + "grad_norm": 0.051889363676309586, + "learning_rate": 4.037919396201985e-05, + "loss": 0.0475, + "num_input_tokens_seen": 47664672, + "step": 22080 + }, + { + "epoch": 3.602773246329527, + "grad_norm": 1.4356606006622314, + "learning_rate": 4.0373581548292305e-05, + "loss": 0.1216, + "num_input_tokens_seen": 47676512, + "step": 22085 + }, + { + "epoch": 3.603588907014682, + "grad_norm": 1.1495347023010254, + "learning_rate": 4.036796788830423e-05, + "loss": 0.2286, + "num_input_tokens_seen": 47687168, + "step": 22090 + }, + { + "epoch": 3.604404567699837, + "grad_norm": 2.364736318588257, + "learning_rate": 4.036235298251071e-05, + "loss": 0.2351, + "num_input_tokens_seen": 47698208, + "step": 22095 + }, + { + "epoch": 3.605220228384992, + "grad_norm": 0.6184305548667908, + "learning_rate": 4.03567368313669e-05, + "loss": 0.0791, + "num_input_tokens_seen": 47708448, + "step": 22100 + }, + { + "epoch": 3.6060358890701467, + "grad_norm": 0.4950851500034332, + "learning_rate": 4.035111943532808e-05, + "loss": 0.097, + "num_input_tokens_seen": 47718624, + "step": 22105 + }, + { + "epoch": 3.6068515497553015, + "grad_norm": 1.4330027103424072, + "learning_rate": 4.034550079484964e-05, + "loss": 0.3217, + "num_input_tokens_seen": 47729184, + "step": 22110 + }, + { + "epoch": 3.607667210440457, + "grad_norm": 0.8900971412658691, + "learning_rate": 4.033988091038704e-05, + "loss": 0.0989, + "num_input_tokens_seen": 47740992, + "step": 22115 + }, + { + "epoch": 3.6084828711256116, + "grad_norm": 0.39197415113449097, + "learning_rate": 4.0334259782395855e-05, + "loss": 0.1494, + "num_input_tokens_seen": 47751520, + "step": 22120 + }, + { + "epoch": 3.609298531810767, + "grad_norm": 0.37524616718292236, + "learning_rate": 4.032863741133177e-05, + "loss": 0.1712, + "num_input_tokens_seen": 47761920, + "step": 22125 + }, + { + "epoch": 3.6101141924959217, + "grad_norm": 0.31316274404525757, + "learning_rate": 4.0323013797650556e-05, + "loss": 0.2141, + "num_input_tokens_seen": 47773376, + "step": 22130 + }, + { + "epoch": 3.6109298531810765, + "grad_norm": 0.9302644729614258, + "learning_rate": 4.0317388941808096e-05, + "loss": 0.0788, + "num_input_tokens_seen": 47784064, + "step": 22135 + }, + { + "epoch": 3.6117455138662318, + "grad_norm": 1.0205985307693481, + "learning_rate": 4.0311762844260377e-05, + "loss": 0.0635, + "num_input_tokens_seen": 47794592, + "step": 22140 + }, + { + "epoch": 3.6125611745513866, + "grad_norm": 0.7004141211509705, + "learning_rate": 4.030613550546347e-05, + "loss": 0.0931, + "num_input_tokens_seen": 47805504, + "step": 22145 + }, + { + "epoch": 3.613376835236542, + "grad_norm": 0.30105674266815186, + "learning_rate": 4.030050692587355e-05, + "loss": 0.1258, + "num_input_tokens_seen": 47816128, + "step": 22150 + }, + { + "epoch": 3.6141924959216967, + "grad_norm": 1.0422030687332153, + "learning_rate": 4.02948771059469e-05, + "loss": 0.2052, + "num_input_tokens_seen": 47827264, + "step": 22155 + }, + { + "epoch": 3.6150081566068515, + "grad_norm": 0.40735524892807007, + "learning_rate": 4.028924604613991e-05, + "loss": 0.1051, + "num_input_tokens_seen": 47838208, + "step": 22160 + }, + { + "epoch": 3.6158238172920063, + "grad_norm": 0.4317876398563385, + "learning_rate": 4.028361374690906e-05, + "loss": 0.3057, + "num_input_tokens_seen": 47848192, + "step": 22165 + }, + { + "epoch": 3.6166394779771616, + "grad_norm": 0.3251751661300659, + "learning_rate": 4.027798020871093e-05, + "loss": 0.1093, + "num_input_tokens_seen": 47858688, + "step": 22170 + }, + { + "epoch": 3.6174551386623164, + "grad_norm": 0.3699907064437866, + "learning_rate": 4.027234543200221e-05, + "loss": 0.0244, + "num_input_tokens_seen": 47870048, + "step": 22175 + }, + { + "epoch": 3.6182707993474716, + "grad_norm": 1.4837071895599365, + "learning_rate": 4.026670941723968e-05, + "loss": 0.2303, + "num_input_tokens_seen": 47881984, + "step": 22180 + }, + { + "epoch": 3.6190864600326265, + "grad_norm": 0.6615767478942871, + "learning_rate": 4.026107216488022e-05, + "loss": 0.0553, + "num_input_tokens_seen": 47893376, + "step": 22185 + }, + { + "epoch": 3.6199021207177813, + "grad_norm": 0.30396920442581177, + "learning_rate": 4.0255433675380803e-05, + "loss": 0.1506, + "num_input_tokens_seen": 47903680, + "step": 22190 + }, + { + "epoch": 3.6207177814029365, + "grad_norm": 1.5533841848373413, + "learning_rate": 4.024979394919855e-05, + "loss": 0.1991, + "num_input_tokens_seen": 47914144, + "step": 22195 + }, + { + "epoch": 3.6215334420880914, + "grad_norm": 0.21553345024585724, + "learning_rate": 4.0244152986790604e-05, + "loss": 0.0177, + "num_input_tokens_seen": 47924224, + "step": 22200 + }, + { + "epoch": 3.622349102773246, + "grad_norm": 0.2753080427646637, + "learning_rate": 4.0238510788614276e-05, + "loss": 0.0626, + "num_input_tokens_seen": 47935264, + "step": 22205 + }, + { + "epoch": 3.6231647634584014, + "grad_norm": 2.341655969619751, + "learning_rate": 4.0232867355126934e-05, + "loss": 0.1012, + "num_input_tokens_seen": 47946432, + "step": 22210 + }, + { + "epoch": 3.6239804241435563, + "grad_norm": 0.15099309384822845, + "learning_rate": 4.0227222686786084e-05, + "loss": 0.0854, + "num_input_tokens_seen": 47956608, + "step": 22215 + }, + { + "epoch": 3.624796084828711, + "grad_norm": 0.31910523772239685, + "learning_rate": 4.02215767840493e-05, + "loss": 0.1922, + "num_input_tokens_seen": 47967552, + "step": 22220 + }, + { + "epoch": 3.6256117455138663, + "grad_norm": 1.5229593515396118, + "learning_rate": 4.021592964737427e-05, + "loss": 0.1861, + "num_input_tokens_seen": 47977888, + "step": 22225 + }, + { + "epoch": 3.626427406199021, + "grad_norm": 0.734809935092926, + "learning_rate": 4.021028127721878e-05, + "loss": 0.052, + "num_input_tokens_seen": 47988576, + "step": 22230 + }, + { + "epoch": 3.6272430668841764, + "grad_norm": 1.2613019943237305, + "learning_rate": 4.020463167404071e-05, + "loss": 0.2818, + "num_input_tokens_seen": 47999776, + "step": 22235 + }, + { + "epoch": 3.6280587275693312, + "grad_norm": 0.05445049703121185, + "learning_rate": 4.019898083829804e-05, + "loss": 0.1623, + "num_input_tokens_seen": 48010688, + "step": 22240 + }, + { + "epoch": 3.628874388254486, + "grad_norm": 0.25354236364364624, + "learning_rate": 4.019332877044888e-05, + "loss": 0.0254, + "num_input_tokens_seen": 48021152, + "step": 22245 + }, + { + "epoch": 3.629690048939641, + "grad_norm": 1.2857609987258911, + "learning_rate": 4.018767547095139e-05, + "loss": 0.2103, + "num_input_tokens_seen": 48030176, + "step": 22250 + }, + { + "epoch": 3.630505709624796, + "grad_norm": 1.0982242822647095, + "learning_rate": 4.018202094026386e-05, + "loss": 0.1832, + "num_input_tokens_seen": 48039232, + "step": 22255 + }, + { + "epoch": 3.631321370309951, + "grad_norm": 0.14467552304267883, + "learning_rate": 4.01763651788447e-05, + "loss": 0.118, + "num_input_tokens_seen": 48049984, + "step": 22260 + }, + { + "epoch": 3.632137030995106, + "grad_norm": 0.19703271985054016, + "learning_rate": 4.017070818715235e-05, + "loss": 0.079, + "num_input_tokens_seen": 48061088, + "step": 22265 + }, + { + "epoch": 3.632952691680261, + "grad_norm": 1.2680519819259644, + "learning_rate": 4.016504996564544e-05, + "loss": 0.1207, + "num_input_tokens_seen": 48072704, + "step": 22270 + }, + { + "epoch": 3.633768352365416, + "grad_norm": 0.1493595689535141, + "learning_rate": 4.015939051478262e-05, + "loss": 0.0994, + "num_input_tokens_seen": 48082848, + "step": 22275 + }, + { + "epoch": 3.634584013050571, + "grad_norm": 1.462408423423767, + "learning_rate": 4.0153729835022685e-05, + "loss": 0.1354, + "num_input_tokens_seen": 48094464, + "step": 22280 + }, + { + "epoch": 3.635399673735726, + "grad_norm": 1.018476963043213, + "learning_rate": 4.014806792682453e-05, + "loss": 0.1483, + "num_input_tokens_seen": 48105088, + "step": 22285 + }, + { + "epoch": 3.636215334420881, + "grad_norm": 1.4754915237426758, + "learning_rate": 4.0142404790647124e-05, + "loss": 0.137, + "num_input_tokens_seen": 48114432, + "step": 22290 + }, + { + "epoch": 3.637030995106036, + "grad_norm": 0.8720031380653381, + "learning_rate": 4.0136740426949546e-05, + "loss": 0.0883, + "num_input_tokens_seen": 48124288, + "step": 22295 + }, + { + "epoch": 3.637846655791191, + "grad_norm": 0.845069408416748, + "learning_rate": 4.0131074836191e-05, + "loss": 0.2384, + "num_input_tokens_seen": 48134144, + "step": 22300 + }, + { + "epoch": 3.6386623164763456, + "grad_norm": 0.28406667709350586, + "learning_rate": 4.0125408018830744e-05, + "loss": 0.104, + "num_input_tokens_seen": 48144416, + "step": 22305 + }, + { + "epoch": 3.639477977161501, + "grad_norm": 0.26834163069725037, + "learning_rate": 4.011973997532818e-05, + "loss": 0.0486, + "num_input_tokens_seen": 48155584, + "step": 22310 + }, + { + "epoch": 3.6402936378466557, + "grad_norm": 2.4336163997650146, + "learning_rate": 4.011407070614276e-05, + "loss": 0.2485, + "num_input_tokens_seen": 48167232, + "step": 22315 + }, + { + "epoch": 3.641109298531811, + "grad_norm": 0.9544999599456787, + "learning_rate": 4.010840021173409e-05, + "loss": 0.2031, + "num_input_tokens_seen": 48178208, + "step": 22320 + }, + { + "epoch": 3.641924959216966, + "grad_norm": 0.1387774795293808, + "learning_rate": 4.010272849256184e-05, + "loss": 0.0386, + "num_input_tokens_seen": 48188896, + "step": 22325 + }, + { + "epoch": 3.6427406199021206, + "grad_norm": 1.3935996294021606, + "learning_rate": 4.0097055549085784e-05, + "loss": 0.3519, + "num_input_tokens_seen": 48200000, + "step": 22330 + }, + { + "epoch": 3.6435562805872754, + "grad_norm": 1.2360919713974, + "learning_rate": 4.009138138176581e-05, + "loss": 0.1708, + "num_input_tokens_seen": 48211200, + "step": 22335 + }, + { + "epoch": 3.6443719412724307, + "grad_norm": 1.4774643182754517, + "learning_rate": 4.008570599106188e-05, + "loss": 0.2943, + "num_input_tokens_seen": 48221504, + "step": 22340 + }, + { + "epoch": 3.6451876019575855, + "grad_norm": 0.7293899059295654, + "learning_rate": 4.008002937743409e-05, + "loss": 0.1112, + "num_input_tokens_seen": 48230976, + "step": 22345 + }, + { + "epoch": 3.6460032626427408, + "grad_norm": 0.9118521809577942, + "learning_rate": 4.0074351541342595e-05, + "loss": 0.0718, + "num_input_tokens_seen": 48242368, + "step": 22350 + }, + { + "epoch": 3.6468189233278956, + "grad_norm": 0.8160725831985474, + "learning_rate": 4.006867248324767e-05, + "loss": 0.1449, + "num_input_tokens_seen": 48253696, + "step": 22355 + }, + { + "epoch": 3.6476345840130504, + "grad_norm": 0.2279084026813507, + "learning_rate": 4.006299220360971e-05, + "loss": 0.1015, + "num_input_tokens_seen": 48264768, + "step": 22360 + }, + { + "epoch": 3.6484502446982057, + "grad_norm": 0.27244338393211365, + "learning_rate": 4.0057310702889164e-05, + "loss": 0.1966, + "num_input_tokens_seen": 48275520, + "step": 22365 + }, + { + "epoch": 3.6492659053833605, + "grad_norm": 0.36927640438079834, + "learning_rate": 4.005162798154661e-05, + "loss": 0.1276, + "num_input_tokens_seen": 48284960, + "step": 22370 + }, + { + "epoch": 3.6500815660685157, + "grad_norm": 0.5293595790863037, + "learning_rate": 4.004594404004273e-05, + "loss": 0.0307, + "num_input_tokens_seen": 48295840, + "step": 22375 + }, + { + "epoch": 3.6508972267536706, + "grad_norm": 0.5546556115150452, + "learning_rate": 4.0040258878838284e-05, + "loss": 0.0745, + "num_input_tokens_seen": 48306880, + "step": 22380 + }, + { + "epoch": 3.6517128874388254, + "grad_norm": 2.104471206665039, + "learning_rate": 4.003457249839413e-05, + "loss": 0.1571, + "num_input_tokens_seen": 48318336, + "step": 22385 + }, + { + "epoch": 3.65252854812398, + "grad_norm": 2.189279794692993, + "learning_rate": 4.002888489917126e-05, + "loss": 0.1884, + "num_input_tokens_seen": 48329152, + "step": 22390 + }, + { + "epoch": 3.6533442088091355, + "grad_norm": 0.5096259713172913, + "learning_rate": 4.002319608163071e-05, + "loss": 0.0344, + "num_input_tokens_seen": 48339136, + "step": 22395 + }, + { + "epoch": 3.6541598694942903, + "grad_norm": 0.193783238530159, + "learning_rate": 4.0017506046233664e-05, + "loss": 0.1081, + "num_input_tokens_seen": 48350208, + "step": 22400 + }, + { + "epoch": 3.6549755301794455, + "grad_norm": 0.4924594759941101, + "learning_rate": 4.001181479344138e-05, + "loss": 0.2898, + "num_input_tokens_seen": 48362016, + "step": 22405 + }, + { + "epoch": 3.6557911908646004, + "grad_norm": 0.7075284123420715, + "learning_rate": 4.000612232371522e-05, + "loss": 0.1259, + "num_input_tokens_seen": 48371424, + "step": 22410 + }, + { + "epoch": 3.656606851549755, + "grad_norm": 0.13252605497837067, + "learning_rate": 4.000042863751664e-05, + "loss": 0.2515, + "num_input_tokens_seen": 48383328, + "step": 22415 + }, + { + "epoch": 3.6574225122349104, + "grad_norm": 0.11614397168159485, + "learning_rate": 3.999473373530721e-05, + "loss": 0.0473, + "num_input_tokens_seen": 48394304, + "step": 22420 + }, + { + "epoch": 3.6582381729200653, + "grad_norm": 0.2915717363357544, + "learning_rate": 3.9989037617548575e-05, + "loss": 0.0721, + "num_input_tokens_seen": 48405376, + "step": 22425 + }, + { + "epoch": 3.65905383360522, + "grad_norm": 1.2218226194381714, + "learning_rate": 3.9983340284702495e-05, + "loss": 0.0902, + "num_input_tokens_seen": 48416352, + "step": 22430 + }, + { + "epoch": 3.6598694942903753, + "grad_norm": 0.44890114665031433, + "learning_rate": 3.9977641737230833e-05, + "loss": 0.2198, + "num_input_tokens_seen": 48427488, + "step": 22435 + }, + { + "epoch": 3.66068515497553, + "grad_norm": 0.1842951476573944, + "learning_rate": 3.9971941975595535e-05, + "loss": 0.0781, + "num_input_tokens_seen": 48439520, + "step": 22440 + }, + { + "epoch": 3.661500815660685, + "grad_norm": 0.7876251935958862, + "learning_rate": 3.996624100025865e-05, + "loss": 0.063, + "num_input_tokens_seen": 48449984, + "step": 22445 + }, + { + "epoch": 3.6623164763458402, + "grad_norm": 2.320216417312622, + "learning_rate": 3.9960538811682334e-05, + "loss": 0.1849, + "num_input_tokens_seen": 48460672, + "step": 22450 + }, + { + "epoch": 3.663132137030995, + "grad_norm": 1.0451196432113647, + "learning_rate": 3.9954835410328836e-05, + "loss": 0.0857, + "num_input_tokens_seen": 48471328, + "step": 22455 + }, + { + "epoch": 3.6639477977161503, + "grad_norm": 0.05244923755526543, + "learning_rate": 3.9949130796660496e-05, + "loss": 0.0406, + "num_input_tokens_seen": 48481664, + "step": 22460 + }, + { + "epoch": 3.664763458401305, + "grad_norm": 0.1188025176525116, + "learning_rate": 3.994342497113977e-05, + "loss": 0.1285, + "num_input_tokens_seen": 48492896, + "step": 22465 + }, + { + "epoch": 3.66557911908646, + "grad_norm": 0.29004615545272827, + "learning_rate": 3.993771793422918e-05, + "loss": 0.0486, + "num_input_tokens_seen": 48503648, + "step": 22470 + }, + { + "epoch": 3.6663947797716148, + "grad_norm": 0.5214669108390808, + "learning_rate": 3.993200968639139e-05, + "loss": 0.0675, + "num_input_tokens_seen": 48514368, + "step": 22475 + }, + { + "epoch": 3.66721044045677, + "grad_norm": 0.16497154533863068, + "learning_rate": 3.9926300228089124e-05, + "loss": 0.1326, + "num_input_tokens_seen": 48525504, + "step": 22480 + }, + { + "epoch": 3.668026101141925, + "grad_norm": 0.8711979389190674, + "learning_rate": 3.992058955978523e-05, + "loss": 0.1125, + "num_input_tokens_seen": 48534880, + "step": 22485 + }, + { + "epoch": 3.66884176182708, + "grad_norm": 0.8170343637466431, + "learning_rate": 3.9914877681942645e-05, + "loss": 0.1497, + "num_input_tokens_seen": 48546656, + "step": 22490 + }, + { + "epoch": 3.669657422512235, + "grad_norm": 0.05931103974580765, + "learning_rate": 3.99091645950244e-05, + "loss": 0.0523, + "num_input_tokens_seen": 48557952, + "step": 22495 + }, + { + "epoch": 3.6704730831973897, + "grad_norm": 0.9163515567779541, + "learning_rate": 3.990345029949361e-05, + "loss": 0.0874, + "num_input_tokens_seen": 48568832, + "step": 22500 + }, + { + "epoch": 3.671288743882545, + "grad_norm": 1.342931866645813, + "learning_rate": 3.9897734795813524e-05, + "loss": 0.1659, + "num_input_tokens_seen": 48578784, + "step": 22505 + }, + { + "epoch": 3.6721044045677, + "grad_norm": 0.08631537854671478, + "learning_rate": 3.989201808444747e-05, + "loss": 0.1209, + "num_input_tokens_seen": 48590016, + "step": 22510 + }, + { + "epoch": 3.672920065252855, + "grad_norm": 0.764109194278717, + "learning_rate": 3.988630016585887e-05, + "loss": 0.0933, + "num_input_tokens_seen": 48600608, + "step": 22515 + }, + { + "epoch": 3.67373572593801, + "grad_norm": 0.054367855191230774, + "learning_rate": 3.988058104051124e-05, + "loss": 0.0342, + "num_input_tokens_seen": 48610912, + "step": 22520 + }, + { + "epoch": 3.6745513866231647, + "grad_norm": 0.17264679074287415, + "learning_rate": 3.987486070886821e-05, + "loss": 0.0734, + "num_input_tokens_seen": 48622272, + "step": 22525 + }, + { + "epoch": 3.6753670473083195, + "grad_norm": 0.12014711648225784, + "learning_rate": 3.98691391713935e-05, + "loss": 0.0559, + "num_input_tokens_seen": 48633376, + "step": 22530 + }, + { + "epoch": 3.676182707993475, + "grad_norm": 0.864189863204956, + "learning_rate": 3.986341642855092e-05, + "loss": 0.1936, + "num_input_tokens_seen": 48644608, + "step": 22535 + }, + { + "epoch": 3.6769983686786296, + "grad_norm": 0.4444725215435028, + "learning_rate": 3.985769248080439e-05, + "loss": 0.0741, + "num_input_tokens_seen": 48656448, + "step": 22540 + }, + { + "epoch": 3.677814029363785, + "grad_norm": 0.9247855544090271, + "learning_rate": 3.9851967328617925e-05, + "loss": 0.2447, + "num_input_tokens_seen": 48667936, + "step": 22545 + }, + { + "epoch": 3.6786296900489397, + "grad_norm": 0.38801804184913635, + "learning_rate": 3.984624097245562e-05, + "loss": 0.2071, + "num_input_tokens_seen": 48678240, + "step": 22550 + }, + { + "epoch": 3.6794453507340945, + "grad_norm": 1.4992053508758545, + "learning_rate": 3.98405134127817e-05, + "loss": 0.142, + "num_input_tokens_seen": 48687296, + "step": 22555 + }, + { + "epoch": 3.6802610114192493, + "grad_norm": 0.06119745597243309, + "learning_rate": 3.983478465006045e-05, + "loss": 0.0423, + "num_input_tokens_seen": 48698720, + "step": 22560 + }, + { + "epoch": 3.6810766721044046, + "grad_norm": 0.9964190721511841, + "learning_rate": 3.9829054684756304e-05, + "loss": 0.0623, + "num_input_tokens_seen": 48709696, + "step": 22565 + }, + { + "epoch": 3.6818923327895594, + "grad_norm": 0.5649317502975464, + "learning_rate": 3.982332351733373e-05, + "loss": 0.0754, + "num_input_tokens_seen": 48720032, + "step": 22570 + }, + { + "epoch": 3.6827079934747147, + "grad_norm": 1.9308198690414429, + "learning_rate": 3.981759114825735e-05, + "loss": 0.1619, + "num_input_tokens_seen": 48729664, + "step": 22575 + }, + { + "epoch": 3.6835236541598695, + "grad_norm": 0.2885560095310211, + "learning_rate": 3.981185757799184e-05, + "loss": 0.059, + "num_input_tokens_seen": 48742336, + "step": 22580 + }, + { + "epoch": 3.6843393148450243, + "grad_norm": 0.2751719355583191, + "learning_rate": 3.9806122807002e-05, + "loss": 0.0384, + "num_input_tokens_seen": 48754048, + "step": 22585 + }, + { + "epoch": 3.6851549755301796, + "grad_norm": 0.6433579325675964, + "learning_rate": 3.9800386835752726e-05, + "loss": 0.1, + "num_input_tokens_seen": 48765344, + "step": 22590 + }, + { + "epoch": 3.6859706362153344, + "grad_norm": 0.5814077854156494, + "learning_rate": 3.979464966470899e-05, + "loss": 0.2033, + "num_input_tokens_seen": 48777152, + "step": 22595 + }, + { + "epoch": 3.6867862969004896, + "grad_norm": 0.13814793527126312, + "learning_rate": 3.978891129433588e-05, + "loss": 0.1695, + "num_input_tokens_seen": 48786880, + "step": 22600 + }, + { + "epoch": 3.6876019575856445, + "grad_norm": 0.05158331245183945, + "learning_rate": 3.97831717250986e-05, + "loss": 0.0213, + "num_input_tokens_seen": 48796256, + "step": 22605 + }, + { + "epoch": 3.6884176182707993, + "grad_norm": 0.16841058433055878, + "learning_rate": 3.97774309574624e-05, + "loss": 0.1496, + "num_input_tokens_seen": 48808096, + "step": 22610 + }, + { + "epoch": 3.689233278955954, + "grad_norm": 0.4499695301055908, + "learning_rate": 3.977168899189267e-05, + "loss": 0.1136, + "num_input_tokens_seen": 48818848, + "step": 22615 + }, + { + "epoch": 3.6900489396411094, + "grad_norm": 0.26728183031082153, + "learning_rate": 3.9765945828854876e-05, + "loss": 0.0603, + "num_input_tokens_seen": 48831296, + "step": 22620 + }, + { + "epoch": 3.690864600326264, + "grad_norm": 0.41662153601646423, + "learning_rate": 3.97602014688146e-05, + "loss": 0.1813, + "num_input_tokens_seen": 48841536, + "step": 22625 + }, + { + "epoch": 3.6916802610114194, + "grad_norm": 1.2493749856948853, + "learning_rate": 3.9754455912237486e-05, + "loss": 0.3123, + "num_input_tokens_seen": 48853152, + "step": 22630 + }, + { + "epoch": 3.6924959216965743, + "grad_norm": 0.17014853656291962, + "learning_rate": 3.974870915958932e-05, + "loss": 0.1205, + "num_input_tokens_seen": 48862688, + "step": 22635 + }, + { + "epoch": 3.693311582381729, + "grad_norm": 0.38713201880455017, + "learning_rate": 3.974296121133596e-05, + "loss": 0.0821, + "num_input_tokens_seen": 48874688, + "step": 22640 + }, + { + "epoch": 3.6941272430668843, + "grad_norm": 0.5559442043304443, + "learning_rate": 3.9737212067943354e-05, + "loss": 0.0795, + "num_input_tokens_seen": 48885856, + "step": 22645 + }, + { + "epoch": 3.694942903752039, + "grad_norm": 0.8259474635124207, + "learning_rate": 3.973146172987756e-05, + "loss": 0.1952, + "num_input_tokens_seen": 48896096, + "step": 22650 + }, + { + "epoch": 3.695758564437194, + "grad_norm": 1.3626903295516968, + "learning_rate": 3.9725710197604735e-05, + "loss": 0.2153, + "num_input_tokens_seen": 48907328, + "step": 22655 + }, + { + "epoch": 3.6965742251223492, + "grad_norm": 1.5414537191390991, + "learning_rate": 3.971995747159113e-05, + "loss": 0.1361, + "num_input_tokens_seen": 48918304, + "step": 22660 + }, + { + "epoch": 3.697389885807504, + "grad_norm": 0.2773340344429016, + "learning_rate": 3.971420355230308e-05, + "loss": 0.072, + "num_input_tokens_seen": 48929440, + "step": 22665 + }, + { + "epoch": 3.698205546492659, + "grad_norm": 0.4002946615219116, + "learning_rate": 3.9708448440207026e-05, + "loss": 0.0264, + "num_input_tokens_seen": 48940256, + "step": 22670 + }, + { + "epoch": 3.699021207177814, + "grad_norm": 0.8330409526824951, + "learning_rate": 3.970269213576951e-05, + "loss": 0.0706, + "num_input_tokens_seen": 48949856, + "step": 22675 + }, + { + "epoch": 3.699836867862969, + "grad_norm": 0.5058496594429016, + "learning_rate": 3.969693463945717e-05, + "loss": 0.0811, + "num_input_tokens_seen": 48960768, + "step": 22680 + }, + { + "epoch": 3.700652528548124, + "grad_norm": 1.9459131956100464, + "learning_rate": 3.9691175951736745e-05, + "loss": 0.2167, + "num_input_tokens_seen": 48971264, + "step": 22685 + }, + { + "epoch": 3.701468189233279, + "grad_norm": 0.068778395652771, + "learning_rate": 3.9685416073075045e-05, + "loss": 0.1499, + "num_input_tokens_seen": 48981632, + "step": 22690 + }, + { + "epoch": 3.702283849918434, + "grad_norm": 1.405644178390503, + "learning_rate": 3.967965500393901e-05, + "loss": 0.3085, + "num_input_tokens_seen": 48993024, + "step": 22695 + }, + { + "epoch": 3.7030995106035887, + "grad_norm": 0.04470760002732277, + "learning_rate": 3.9673892744795655e-05, + "loss": 0.2117, + "num_input_tokens_seen": 49004064, + "step": 22700 + }, + { + "epoch": 3.703915171288744, + "grad_norm": 1.3761003017425537, + "learning_rate": 3.96681292961121e-05, + "loss": 0.1894, + "num_input_tokens_seen": 49016032, + "step": 22705 + }, + { + "epoch": 3.7047308319738987, + "grad_norm": 0.5630801916122437, + "learning_rate": 3.9662364658355555e-05, + "loss": 0.0788, + "num_input_tokens_seen": 49026336, + "step": 22710 + }, + { + "epoch": 3.705546492659054, + "grad_norm": 1.011101245880127, + "learning_rate": 3.965659883199334e-05, + "loss": 0.1198, + "num_input_tokens_seen": 49037312, + "step": 22715 + }, + { + "epoch": 3.706362153344209, + "grad_norm": 0.4040263891220093, + "learning_rate": 3.9650831817492864e-05, + "loss": 0.1337, + "num_input_tokens_seen": 49048480, + "step": 22720 + }, + { + "epoch": 3.7071778140293636, + "grad_norm": 1.2166802883148193, + "learning_rate": 3.964506361532161e-05, + "loss": 0.4087, + "num_input_tokens_seen": 49059840, + "step": 22725 + }, + { + "epoch": 3.707993474714519, + "grad_norm": 0.24933300912380219, + "learning_rate": 3.96392942259472e-05, + "loss": 0.0902, + "num_input_tokens_seen": 49070720, + "step": 22730 + }, + { + "epoch": 3.7088091353996737, + "grad_norm": 0.11950570344924927, + "learning_rate": 3.963352364983731e-05, + "loss": 0.0917, + "num_input_tokens_seen": 49081408, + "step": 22735 + }, + { + "epoch": 3.709624796084829, + "grad_norm": 1.7170554399490356, + "learning_rate": 3.962775188745975e-05, + "loss": 0.1013, + "num_input_tokens_seen": 49092416, + "step": 22740 + }, + { + "epoch": 3.710440456769984, + "grad_norm": 0.4253525733947754, + "learning_rate": 3.9621978939282405e-05, + "loss": 0.1175, + "num_input_tokens_seen": 49102464, + "step": 22745 + }, + { + "epoch": 3.7112561174551386, + "grad_norm": 0.12798340618610382, + "learning_rate": 3.961620480577325e-05, + "loss": 0.1697, + "num_input_tokens_seen": 49113920, + "step": 22750 + }, + { + "epoch": 3.7120717781402934, + "grad_norm": 0.13918446004390717, + "learning_rate": 3.961042948740038e-05, + "loss": 0.1314, + "num_input_tokens_seen": 49124096, + "step": 22755 + }, + { + "epoch": 3.7128874388254487, + "grad_norm": 0.17866289615631104, + "learning_rate": 3.960465298463195e-05, + "loss": 0.0911, + "num_input_tokens_seen": 49134048, + "step": 22760 + }, + { + "epoch": 3.7137030995106035, + "grad_norm": 1.0138061046600342, + "learning_rate": 3.959887529793625e-05, + "loss": 0.0566, + "num_input_tokens_seen": 49144800, + "step": 22765 + }, + { + "epoch": 3.7145187601957588, + "grad_norm": 1.3053709268569946, + "learning_rate": 3.9593096427781665e-05, + "loss": 0.2524, + "num_input_tokens_seen": 49156128, + "step": 22770 + }, + { + "epoch": 3.7153344208809136, + "grad_norm": 0.18055230379104614, + "learning_rate": 3.958731637463662e-05, + "loss": 0.0893, + "num_input_tokens_seen": 49167168, + "step": 22775 + }, + { + "epoch": 3.7161500815660684, + "grad_norm": 0.1404314488172531, + "learning_rate": 3.958153513896969e-05, + "loss": 0.0521, + "num_input_tokens_seen": 49177248, + "step": 22780 + }, + { + "epoch": 3.7169657422512232, + "grad_norm": 0.5247876048088074, + "learning_rate": 3.957575272124954e-05, + "loss": 0.0378, + "num_input_tokens_seen": 49189568, + "step": 22785 + }, + { + "epoch": 3.7177814029363785, + "grad_norm": 1.062398076057434, + "learning_rate": 3.9569969121944925e-05, + "loss": 0.1428, + "num_input_tokens_seen": 49198912, + "step": 22790 + }, + { + "epoch": 3.7185970636215333, + "grad_norm": 0.29455673694610596, + "learning_rate": 3.956418434152467e-05, + "loss": 0.2117, + "num_input_tokens_seen": 49210112, + "step": 22795 + }, + { + "epoch": 3.7194127243066886, + "grad_norm": 1.2261962890625, + "learning_rate": 3.955839838045775e-05, + "loss": 0.182, + "num_input_tokens_seen": 49221344, + "step": 22800 + }, + { + "epoch": 3.7202283849918434, + "grad_norm": 0.035390034317970276, + "learning_rate": 3.9552611239213185e-05, + "loss": 0.1192, + "num_input_tokens_seen": 49232064, + "step": 22805 + }, + { + "epoch": 3.721044045676998, + "grad_norm": 0.9223475456237793, + "learning_rate": 3.954682291826011e-05, + "loss": 0.3075, + "num_input_tokens_seen": 49242144, + "step": 22810 + }, + { + "epoch": 3.7218597063621535, + "grad_norm": 0.587470531463623, + "learning_rate": 3.9541033418067765e-05, + "loss": 0.1114, + "num_input_tokens_seen": 49253248, + "step": 22815 + }, + { + "epoch": 3.7226753670473083, + "grad_norm": 1.172481894493103, + "learning_rate": 3.953524273910546e-05, + "loss": 0.1413, + "num_input_tokens_seen": 49264384, + "step": 22820 + }, + { + "epoch": 3.7234910277324635, + "grad_norm": 0.12987253069877625, + "learning_rate": 3.952945088184264e-05, + "loss": 0.0662, + "num_input_tokens_seen": 49276416, + "step": 22825 + }, + { + "epoch": 3.7243066884176184, + "grad_norm": 0.32261407375335693, + "learning_rate": 3.952365784674881e-05, + "loss": 0.0783, + "num_input_tokens_seen": 49288800, + "step": 22830 + }, + { + "epoch": 3.725122349102773, + "grad_norm": 1.9350452423095703, + "learning_rate": 3.951786363429357e-05, + "loss": 0.1663, + "num_input_tokens_seen": 49299328, + "step": 22835 + }, + { + "epoch": 3.725938009787928, + "grad_norm": 1.3163487911224365, + "learning_rate": 3.951206824494665e-05, + "loss": 0.1711, + "num_input_tokens_seen": 49311360, + "step": 22840 + }, + { + "epoch": 3.7267536704730833, + "grad_norm": 0.23962199687957764, + "learning_rate": 3.950627167917784e-05, + "loss": 0.1724, + "num_input_tokens_seen": 49321632, + "step": 22845 + }, + { + "epoch": 3.727569331158238, + "grad_norm": 0.7122876644134521, + "learning_rate": 3.950047393745705e-05, + "loss": 0.0944, + "num_input_tokens_seen": 49332256, + "step": 22850 + }, + { + "epoch": 3.7283849918433933, + "grad_norm": 1.5773470401763916, + "learning_rate": 3.949467502025426e-05, + "loss": 0.1947, + "num_input_tokens_seen": 49343360, + "step": 22855 + }, + { + "epoch": 3.729200652528548, + "grad_norm": 0.10519284009933472, + "learning_rate": 3.948887492803957e-05, + "loss": 0.1231, + "num_input_tokens_seen": 49354112, + "step": 22860 + }, + { + "epoch": 3.730016313213703, + "grad_norm": 0.26630064845085144, + "learning_rate": 3.948307366128316e-05, + "loss": 0.1117, + "num_input_tokens_seen": 49365120, + "step": 22865 + }, + { + "epoch": 3.7308319738988582, + "grad_norm": 1.193199872970581, + "learning_rate": 3.9477271220455323e-05, + "loss": 0.2939, + "num_input_tokens_seen": 49377600, + "step": 22870 + }, + { + "epoch": 3.731647634584013, + "grad_norm": 0.1426955610513687, + "learning_rate": 3.947146760602642e-05, + "loss": 0.0838, + "num_input_tokens_seen": 49387648, + "step": 22875 + }, + { + "epoch": 3.732463295269168, + "grad_norm": 1.9097248315811157, + "learning_rate": 3.946566281846692e-05, + "loss": 0.1854, + "num_input_tokens_seen": 49398368, + "step": 22880 + }, + { + "epoch": 3.733278955954323, + "grad_norm": 1.200536847114563, + "learning_rate": 3.9459856858247404e-05, + "loss": 0.1821, + "num_input_tokens_seen": 49408416, + "step": 22885 + }, + { + "epoch": 3.734094616639478, + "grad_norm": 0.6233472228050232, + "learning_rate": 3.945404972583851e-05, + "loss": 0.246, + "num_input_tokens_seen": 49417888, + "step": 22890 + }, + { + "epoch": 3.7349102773246328, + "grad_norm": 0.4906785190105438, + "learning_rate": 3.9448241421711004e-05, + "loss": 0.2116, + "num_input_tokens_seen": 49427296, + "step": 22895 + }, + { + "epoch": 3.735725938009788, + "grad_norm": 0.1995089203119278, + "learning_rate": 3.9442431946335755e-05, + "loss": 0.1539, + "num_input_tokens_seen": 49437984, + "step": 22900 + }, + { + "epoch": 3.736541598694943, + "grad_norm": 1.0107371807098389, + "learning_rate": 3.943662130018368e-05, + "loss": 0.1603, + "num_input_tokens_seen": 49448288, + "step": 22905 + }, + { + "epoch": 3.737357259380098, + "grad_norm": 0.09762635827064514, + "learning_rate": 3.943080948372583e-05, + "loss": 0.1212, + "num_input_tokens_seen": 49458624, + "step": 22910 + }, + { + "epoch": 3.738172920065253, + "grad_norm": 1.2493212223052979, + "learning_rate": 3.942499649743335e-05, + "loss": 0.1601, + "num_input_tokens_seen": 49468032, + "step": 22915 + }, + { + "epoch": 3.7389885807504077, + "grad_norm": 0.32032954692840576, + "learning_rate": 3.941918234177746e-05, + "loss": 0.0395, + "num_input_tokens_seen": 49479744, + "step": 22920 + }, + { + "epoch": 3.7398042414355626, + "grad_norm": 1.4508816003799438, + "learning_rate": 3.941336701722949e-05, + "loss": 0.1665, + "num_input_tokens_seen": 49490688, + "step": 22925 + }, + { + "epoch": 3.740619902120718, + "grad_norm": 0.2547200322151184, + "learning_rate": 3.940755052426085e-05, + "loss": 0.0792, + "num_input_tokens_seen": 49499936, + "step": 22930 + }, + { + "epoch": 3.7414355628058726, + "grad_norm": 0.06837546080350876, + "learning_rate": 3.940173286334307e-05, + "loss": 0.0462, + "num_input_tokens_seen": 49510080, + "step": 22935 + }, + { + "epoch": 3.742251223491028, + "grad_norm": 0.6467730402946472, + "learning_rate": 3.9395914034947744e-05, + "loss": 0.1061, + "num_input_tokens_seen": 49521056, + "step": 22940 + }, + { + "epoch": 3.7430668841761827, + "grad_norm": 0.3209196925163269, + "learning_rate": 3.939009403954659e-05, + "loss": 0.0499, + "num_input_tokens_seen": 49531712, + "step": 22945 + }, + { + "epoch": 3.7438825448613375, + "grad_norm": 0.6693896055221558, + "learning_rate": 3.9384272877611384e-05, + "loss": 0.1149, + "num_input_tokens_seen": 49542912, + "step": 22950 + }, + { + "epoch": 3.744698205546493, + "grad_norm": 0.2495461106300354, + "learning_rate": 3.9378450549614044e-05, + "loss": 0.0272, + "num_input_tokens_seen": 49553888, + "step": 22955 + }, + { + "epoch": 3.7455138662316476, + "grad_norm": 0.48277395963668823, + "learning_rate": 3.9372627056026544e-05, + "loss": 0.1055, + "num_input_tokens_seen": 49565376, + "step": 22960 + }, + { + "epoch": 3.746329526916803, + "grad_norm": 0.4428081810474396, + "learning_rate": 3.9366802397320966e-05, + "loss": 0.125, + "num_input_tokens_seen": 49577184, + "step": 22965 + }, + { + "epoch": 3.7471451876019577, + "grad_norm": 0.8581382632255554, + "learning_rate": 3.9360976573969494e-05, + "loss": 0.0552, + "num_input_tokens_seen": 49588608, + "step": 22970 + }, + { + "epoch": 3.7479608482871125, + "grad_norm": 0.49911001324653625, + "learning_rate": 3.935514958644439e-05, + "loss": 0.0377, + "num_input_tokens_seen": 49599840, + "step": 22975 + }, + { + "epoch": 3.7487765089722673, + "grad_norm": 0.17319731414318085, + "learning_rate": 3.934932143521803e-05, + "loss": 0.2326, + "num_input_tokens_seen": 49611040, + "step": 22980 + }, + { + "epoch": 3.7495921696574226, + "grad_norm": 0.3340170681476593, + "learning_rate": 3.934349212076286e-05, + "loss": 0.042, + "num_input_tokens_seen": 49621472, + "step": 22985 + }, + { + "epoch": 3.7504078303425774, + "grad_norm": 0.19907572865486145, + "learning_rate": 3.933766164355145e-05, + "loss": 0.136, + "num_input_tokens_seen": 49632832, + "step": 22990 + }, + { + "epoch": 3.7512234910277327, + "grad_norm": 1.1386948823928833, + "learning_rate": 3.9331830004056424e-05, + "loss": 0.3451, + "num_input_tokens_seen": 49643136, + "step": 22995 + }, + { + "epoch": 3.7520391517128875, + "grad_norm": 0.5631598830223083, + "learning_rate": 3.932599720275055e-05, + "loss": 0.1493, + "num_input_tokens_seen": 49653920, + "step": 23000 + }, + { + "epoch": 3.7528548123980423, + "grad_norm": 0.08074790239334106, + "learning_rate": 3.9320163240106656e-05, + "loss": 0.077, + "num_input_tokens_seen": 49664992, + "step": 23005 + }, + { + "epoch": 3.753670473083197, + "grad_norm": 0.08432282507419586, + "learning_rate": 3.931432811659766e-05, + "loss": 0.0297, + "num_input_tokens_seen": 49676000, + "step": 23010 + }, + { + "epoch": 3.7544861337683524, + "grad_norm": 0.26251158118247986, + "learning_rate": 3.9308491832696596e-05, + "loss": 0.0611, + "num_input_tokens_seen": 49687712, + "step": 23015 + }, + { + "epoch": 3.755301794453507, + "grad_norm": 1.1447269916534424, + "learning_rate": 3.930265438887659e-05, + "loss": 0.0742, + "num_input_tokens_seen": 49698944, + "step": 23020 + }, + { + "epoch": 3.7561174551386625, + "grad_norm": 0.08241933584213257, + "learning_rate": 3.929681578561084e-05, + "loss": 0.1013, + "num_input_tokens_seen": 49710048, + "step": 23025 + }, + { + "epoch": 3.7569331158238173, + "grad_norm": 0.5000652074813843, + "learning_rate": 3.929097602337267e-05, + "loss": 0.134, + "num_input_tokens_seen": 49720672, + "step": 23030 + }, + { + "epoch": 3.757748776508972, + "grad_norm": 0.3214322626590729, + "learning_rate": 3.9285135102635474e-05, + "loss": 0.1006, + "num_input_tokens_seen": 49731776, + "step": 23035 + }, + { + "epoch": 3.7585644371941274, + "grad_norm": 0.24787583947181702, + "learning_rate": 3.9279293023872745e-05, + "loss": 0.1738, + "num_input_tokens_seen": 49742336, + "step": 23040 + }, + { + "epoch": 3.759380097879282, + "grad_norm": 1.998279333114624, + "learning_rate": 3.927344978755806e-05, + "loss": 0.2901, + "num_input_tokens_seen": 49753408, + "step": 23045 + }, + { + "epoch": 3.7601957585644374, + "grad_norm": 0.07503266632556915, + "learning_rate": 3.926760539416512e-05, + "loss": 0.0856, + "num_input_tokens_seen": 49764192, + "step": 23050 + }, + { + "epoch": 3.7610114192495923, + "grad_norm": 0.6417959332466125, + "learning_rate": 3.926175984416769e-05, + "loss": 0.0649, + "num_input_tokens_seen": 49775840, + "step": 23055 + }, + { + "epoch": 3.761827079934747, + "grad_norm": 0.10385168343782425, + "learning_rate": 3.9255913138039645e-05, + "loss": 0.0841, + "num_input_tokens_seen": 49785408, + "step": 23060 + }, + { + "epoch": 3.762642740619902, + "grad_norm": 1.364176630973816, + "learning_rate": 3.925006527625494e-05, + "loss": 0.2447, + "num_input_tokens_seen": 49796192, + "step": 23065 + }, + { + "epoch": 3.763458401305057, + "grad_norm": 0.09965293854475021, + "learning_rate": 3.924421625928765e-05, + "loss": 0.0545, + "num_input_tokens_seen": 49807296, + "step": 23070 + }, + { + "epoch": 3.764274061990212, + "grad_norm": 1.5386382341384888, + "learning_rate": 3.923836608761192e-05, + "loss": 0.2164, + "num_input_tokens_seen": 49818368, + "step": 23075 + }, + { + "epoch": 3.7650897226753672, + "grad_norm": 0.0982886403799057, + "learning_rate": 3.923251476170198e-05, + "loss": 0.1696, + "num_input_tokens_seen": 49830368, + "step": 23080 + }, + { + "epoch": 3.765905383360522, + "grad_norm": 1.012694001197815, + "learning_rate": 3.922666228203218e-05, + "loss": 0.1974, + "num_input_tokens_seen": 49840960, + "step": 23085 + }, + { + "epoch": 3.766721044045677, + "grad_norm": 0.1931735724210739, + "learning_rate": 3.9220808649076954e-05, + "loss": 0.3229, + "num_input_tokens_seen": 49851776, + "step": 23090 + }, + { + "epoch": 3.767536704730832, + "grad_norm": 0.376737117767334, + "learning_rate": 3.921495386331082e-05, + "loss": 0.1761, + "num_input_tokens_seen": 49862336, + "step": 23095 + }, + { + "epoch": 3.768352365415987, + "grad_norm": 1.650534749031067, + "learning_rate": 3.9209097925208405e-05, + "loss": 0.2525, + "num_input_tokens_seen": 49874208, + "step": 23100 + }, + { + "epoch": 3.7691680261011418, + "grad_norm": 1.1905802488327026, + "learning_rate": 3.920324083524441e-05, + "loss": 0.0542, + "num_input_tokens_seen": 49885440, + "step": 23105 + }, + { + "epoch": 3.769983686786297, + "grad_norm": 0.49513110518455505, + "learning_rate": 3.919738259389365e-05, + "loss": 0.0925, + "num_input_tokens_seen": 49896832, + "step": 23110 + }, + { + "epoch": 3.770799347471452, + "grad_norm": 0.4361059367656708, + "learning_rate": 3.919152320163101e-05, + "loss": 0.1115, + "num_input_tokens_seen": 49907008, + "step": 23115 + }, + { + "epoch": 3.7716150081566067, + "grad_norm": 0.5083498954772949, + "learning_rate": 3.91856626589315e-05, + "loss": 0.0539, + "num_input_tokens_seen": 49917600, + "step": 23120 + }, + { + "epoch": 3.772430668841762, + "grad_norm": 0.9718011021614075, + "learning_rate": 3.91798009662702e-05, + "loss": 0.1393, + "num_input_tokens_seen": 49929280, + "step": 23125 + }, + { + "epoch": 3.7732463295269167, + "grad_norm": 1.3941649198532104, + "learning_rate": 3.917393812412229e-05, + "loss": 0.1084, + "num_input_tokens_seen": 49940384, + "step": 23130 + }, + { + "epoch": 3.774061990212072, + "grad_norm": 0.029787305742502213, + "learning_rate": 3.916807413296303e-05, + "loss": 0.2013, + "num_input_tokens_seen": 49949696, + "step": 23135 + }, + { + "epoch": 3.774877650897227, + "grad_norm": 1.042540192604065, + "learning_rate": 3.916220899326779e-05, + "loss": 0.1747, + "num_input_tokens_seen": 49959680, + "step": 23140 + }, + { + "epoch": 3.7756933115823816, + "grad_norm": 0.6403712034225464, + "learning_rate": 3.915634270551204e-05, + "loss": 0.1463, + "num_input_tokens_seen": 49970592, + "step": 23145 + }, + { + "epoch": 3.7765089722675365, + "grad_norm": 0.5920913219451904, + "learning_rate": 3.915047527017132e-05, + "loss": 0.2635, + "num_input_tokens_seen": 49981440, + "step": 23150 + }, + { + "epoch": 3.7773246329526917, + "grad_norm": 0.8670738339424133, + "learning_rate": 3.914460668772127e-05, + "loss": 0.1116, + "num_input_tokens_seen": 49993024, + "step": 23155 + }, + { + "epoch": 3.7781402936378465, + "grad_norm": 0.8891991972923279, + "learning_rate": 3.913873695863763e-05, + "loss": 0.1474, + "num_input_tokens_seen": 50004704, + "step": 23160 + }, + { + "epoch": 3.778955954323002, + "grad_norm": 0.45362144708633423, + "learning_rate": 3.913286608339625e-05, + "loss": 0.1064, + "num_input_tokens_seen": 50015424, + "step": 23165 + }, + { + "epoch": 3.7797716150081566, + "grad_norm": 1.3983337879180908, + "learning_rate": 3.9126994062473013e-05, + "loss": 0.0679, + "num_input_tokens_seen": 50026368, + "step": 23170 + }, + { + "epoch": 3.7805872756933114, + "grad_norm": 1.1244664192199707, + "learning_rate": 3.912112089634397e-05, + "loss": 0.179, + "num_input_tokens_seen": 50036512, + "step": 23175 + }, + { + "epoch": 3.7814029363784667, + "grad_norm": 0.20910879969596863, + "learning_rate": 3.911524658548522e-05, + "loss": 0.1232, + "num_input_tokens_seen": 50047776, + "step": 23180 + }, + { + "epoch": 3.7822185970636215, + "grad_norm": 3.1859097480773926, + "learning_rate": 3.9109371130372956e-05, + "loss": 0.2619, + "num_input_tokens_seen": 50058048, + "step": 23185 + }, + { + "epoch": 3.7830342577487768, + "grad_norm": 0.14704057574272156, + "learning_rate": 3.910349453148348e-05, + "loss": 0.0512, + "num_input_tokens_seen": 50068992, + "step": 23190 + }, + { + "epoch": 3.7838499184339316, + "grad_norm": 0.626315712928772, + "learning_rate": 3.909761678929318e-05, + "loss": 0.0792, + "num_input_tokens_seen": 50079776, + "step": 23195 + }, + { + "epoch": 3.7846655791190864, + "grad_norm": 0.08173764497041702, + "learning_rate": 3.909173790427852e-05, + "loss": 0.0617, + "num_input_tokens_seen": 50090592, + "step": 23200 + }, + { + "epoch": 3.7854812398042412, + "grad_norm": 0.33168113231658936, + "learning_rate": 3.90858578769161e-05, + "loss": 0.1486, + "num_input_tokens_seen": 50102240, + "step": 23205 + }, + { + "epoch": 3.7862969004893965, + "grad_norm": 1.611867904663086, + "learning_rate": 3.907997670768256e-05, + "loss": 0.1611, + "num_input_tokens_seen": 50113472, + "step": 23210 + }, + { + "epoch": 3.7871125611745513, + "grad_norm": 0.2886215150356293, + "learning_rate": 3.907409439705467e-05, + "loss": 0.0449, + "num_input_tokens_seen": 50124224, + "step": 23215 + }, + { + "epoch": 3.7879282218597066, + "grad_norm": 0.22796235978603363, + "learning_rate": 3.9068210945509276e-05, + "loss": 0.0854, + "num_input_tokens_seen": 50135264, + "step": 23220 + }, + { + "epoch": 3.7887438825448614, + "grad_norm": 1.9141652584075928, + "learning_rate": 3.906232635352333e-05, + "loss": 0.2567, + "num_input_tokens_seen": 50145824, + "step": 23225 + }, + { + "epoch": 3.789559543230016, + "grad_norm": 0.013708163984119892, + "learning_rate": 3.9056440621573855e-05, + "loss": 0.1448, + "num_input_tokens_seen": 50156864, + "step": 23230 + }, + { + "epoch": 3.790375203915171, + "grad_norm": 0.40456247329711914, + "learning_rate": 3.9050553750137975e-05, + "loss": 0.0942, + "num_input_tokens_seen": 50167872, + "step": 23235 + }, + { + "epoch": 3.7911908646003263, + "grad_norm": 0.48312655091285706, + "learning_rate": 3.904466573969292e-05, + "loss": 0.1011, + "num_input_tokens_seen": 50178880, + "step": 23240 + }, + { + "epoch": 3.792006525285481, + "grad_norm": 0.4104769229888916, + "learning_rate": 3.9038776590716e-05, + "loss": 0.3079, + "num_input_tokens_seen": 50189280, + "step": 23245 + }, + { + "epoch": 3.7928221859706364, + "grad_norm": 0.12508687376976013, + "learning_rate": 3.903288630368461e-05, + "loss": 0.0523, + "num_input_tokens_seen": 50200736, + "step": 23250 + }, + { + "epoch": 3.793637846655791, + "grad_norm": 0.05616260692477226, + "learning_rate": 3.902699487907626e-05, + "loss": 0.0501, + "num_input_tokens_seen": 50210208, + "step": 23255 + }, + { + "epoch": 3.794453507340946, + "grad_norm": 0.44050133228302, + "learning_rate": 3.902110231736853e-05, + "loss": 0.0708, + "num_input_tokens_seen": 50220960, + "step": 23260 + }, + { + "epoch": 3.7952691680261013, + "grad_norm": 0.5427454113960266, + "learning_rate": 3.901520861903911e-05, + "loss": 0.1043, + "num_input_tokens_seen": 50232384, + "step": 23265 + }, + { + "epoch": 3.796084828711256, + "grad_norm": 1.3041061162948608, + "learning_rate": 3.900931378456576e-05, + "loss": 0.073, + "num_input_tokens_seen": 50243328, + "step": 23270 + }, + { + "epoch": 3.7969004893964113, + "grad_norm": 0.24816113710403442, + "learning_rate": 3.9003417814426346e-05, + "loss": 0.1226, + "num_input_tokens_seen": 50254080, + "step": 23275 + }, + { + "epoch": 3.797716150081566, + "grad_norm": 0.04790763184428215, + "learning_rate": 3.8997520709098845e-05, + "loss": 0.0531, + "num_input_tokens_seen": 50264384, + "step": 23280 + }, + { + "epoch": 3.798531810766721, + "grad_norm": 1.0503261089324951, + "learning_rate": 3.899162246906129e-05, + "loss": 0.1754, + "num_input_tokens_seen": 50275232, + "step": 23285 + }, + { + "epoch": 3.799347471451876, + "grad_norm": 0.6647178530693054, + "learning_rate": 3.8985723094791814e-05, + "loss": 0.1233, + "num_input_tokens_seen": 50286048, + "step": 23290 + }, + { + "epoch": 3.800163132137031, + "grad_norm": 0.2584688067436218, + "learning_rate": 3.897982258676867e-05, + "loss": 0.0949, + "num_input_tokens_seen": 50296192, + "step": 23295 + }, + { + "epoch": 3.800978792822186, + "grad_norm": 1.8845710754394531, + "learning_rate": 3.8973920945470174e-05, + "loss": 0.1769, + "num_input_tokens_seen": 50305888, + "step": 23300 + }, + { + "epoch": 3.801794453507341, + "grad_norm": 0.8670610785484314, + "learning_rate": 3.896801817137474e-05, + "loss": 0.1148, + "num_input_tokens_seen": 50316640, + "step": 23305 + }, + { + "epoch": 3.802610114192496, + "grad_norm": 0.5635920166969299, + "learning_rate": 3.8962114264960894e-05, + "loss": 0.172, + "num_input_tokens_seen": 50328832, + "step": 23310 + }, + { + "epoch": 3.8034257748776508, + "grad_norm": 0.43233147263526917, + "learning_rate": 3.8956209226707206e-05, + "loss": 0.2284, + "num_input_tokens_seen": 50339136, + "step": 23315 + }, + { + "epoch": 3.804241435562806, + "grad_norm": 0.04135040193796158, + "learning_rate": 3.8950303057092386e-05, + "loss": 0.1747, + "num_input_tokens_seen": 50350336, + "step": 23320 + }, + { + "epoch": 3.805057096247961, + "grad_norm": 1.72965407371521, + "learning_rate": 3.8944395756595225e-05, + "loss": 0.1767, + "num_input_tokens_seen": 50361376, + "step": 23325 + }, + { + "epoch": 3.8058727569331157, + "grad_norm": 0.07645891606807709, + "learning_rate": 3.893848732569458e-05, + "loss": 0.0804, + "num_input_tokens_seen": 50372064, + "step": 23330 + }, + { + "epoch": 3.806688417618271, + "grad_norm": 0.822197437286377, + "learning_rate": 3.893257776486944e-05, + "loss": 0.1268, + "num_input_tokens_seen": 50382304, + "step": 23335 + }, + { + "epoch": 3.8075040783034257, + "grad_norm": 1.3751492500305176, + "learning_rate": 3.8926667074598846e-05, + "loss": 0.1868, + "num_input_tokens_seen": 50392416, + "step": 23340 + }, + { + "epoch": 3.8083197389885806, + "grad_norm": 1.5053339004516602, + "learning_rate": 3.892075525536196e-05, + "loss": 0.2373, + "num_input_tokens_seen": 50402816, + "step": 23345 + }, + { + "epoch": 3.809135399673736, + "grad_norm": 0.6124274730682373, + "learning_rate": 3.891484230763802e-05, + "loss": 0.1374, + "num_input_tokens_seen": 50412224, + "step": 23350 + }, + { + "epoch": 3.8099510603588906, + "grad_norm": 0.8519467711448669, + "learning_rate": 3.890892823190636e-05, + "loss": 0.2473, + "num_input_tokens_seen": 50423680, + "step": 23355 + }, + { + "epoch": 3.810766721044046, + "grad_norm": 0.38249069452285767, + "learning_rate": 3.890301302864641e-05, + "loss": 0.03, + "num_input_tokens_seen": 50435072, + "step": 23360 + }, + { + "epoch": 3.8115823817292007, + "grad_norm": 0.7696945667266846, + "learning_rate": 3.889709669833767e-05, + "loss": 0.2374, + "num_input_tokens_seen": 50446336, + "step": 23365 + }, + { + "epoch": 3.8123980424143555, + "grad_norm": 0.5149552822113037, + "learning_rate": 3.8891179241459766e-05, + "loss": 0.088, + "num_input_tokens_seen": 50456928, + "step": 23370 + }, + { + "epoch": 3.8132137030995104, + "grad_norm": 0.345041960477829, + "learning_rate": 3.888526065849238e-05, + "loss": 0.1492, + "num_input_tokens_seen": 50467488, + "step": 23375 + }, + { + "epoch": 3.8140293637846656, + "grad_norm": 0.5555599927902222, + "learning_rate": 3.887934094991531e-05, + "loss": 0.0485, + "num_input_tokens_seen": 50479168, + "step": 23380 + }, + { + "epoch": 3.8148450244698204, + "grad_norm": 0.23931854963302612, + "learning_rate": 3.887342011620845e-05, + "loss": 0.0173, + "num_input_tokens_seen": 50490048, + "step": 23385 + }, + { + "epoch": 3.8156606851549757, + "grad_norm": 1.3563354015350342, + "learning_rate": 3.886749815785176e-05, + "loss": 0.1421, + "num_input_tokens_seen": 50500192, + "step": 23390 + }, + { + "epoch": 3.8164763458401305, + "grad_norm": 0.07795385271310806, + "learning_rate": 3.8861575075325304e-05, + "loss": 0.1308, + "num_input_tokens_seen": 50510944, + "step": 23395 + }, + { + "epoch": 3.8172920065252853, + "grad_norm": 0.13973784446716309, + "learning_rate": 3.8855650869109246e-05, + "loss": 0.1566, + "num_input_tokens_seen": 50521344, + "step": 23400 + }, + { + "epoch": 3.8181076672104406, + "grad_norm": 0.1453014612197876, + "learning_rate": 3.884972553968382e-05, + "loss": 0.0842, + "num_input_tokens_seen": 50532064, + "step": 23405 + }, + { + "epoch": 3.8189233278955954, + "grad_norm": 0.5889809131622314, + "learning_rate": 3.884379908752936e-05, + "loss": 0.1045, + "num_input_tokens_seen": 50542816, + "step": 23410 + }, + { + "epoch": 3.8197389885807507, + "grad_norm": 0.15662984549999237, + "learning_rate": 3.883787151312632e-05, + "loss": 0.0358, + "num_input_tokens_seen": 50553088, + "step": 23415 + }, + { + "epoch": 3.8205546492659055, + "grad_norm": 0.13717705011367798, + "learning_rate": 3.88319428169552e-05, + "loss": 0.0571, + "num_input_tokens_seen": 50561792, + "step": 23420 + }, + { + "epoch": 3.8213703099510603, + "grad_norm": 0.45364710688591003, + "learning_rate": 3.882601299949661e-05, + "loss": 0.1145, + "num_input_tokens_seen": 50573280, + "step": 23425 + }, + { + "epoch": 3.822185970636215, + "grad_norm": 0.11154213547706604, + "learning_rate": 3.882008206123125e-05, + "loss": 0.0916, + "num_input_tokens_seen": 50584384, + "step": 23430 + }, + { + "epoch": 3.8230016313213704, + "grad_norm": 0.07244039326906204, + "learning_rate": 3.881415000263991e-05, + "loss": 0.0706, + "num_input_tokens_seen": 50595904, + "step": 23435 + }, + { + "epoch": 3.823817292006525, + "grad_norm": 1.1152913570404053, + "learning_rate": 3.8808216824203494e-05, + "loss": 0.039, + "num_input_tokens_seen": 50608224, + "step": 23440 + }, + { + "epoch": 3.8246329526916805, + "grad_norm": 0.6293681859970093, + "learning_rate": 3.880228252640295e-05, + "loss": 0.2133, + "num_input_tokens_seen": 50618912, + "step": 23445 + }, + { + "epoch": 3.8254486133768353, + "grad_norm": 0.24831336736679077, + "learning_rate": 3.879634710971935e-05, + "loss": 0.0987, + "num_input_tokens_seen": 50629728, + "step": 23450 + }, + { + "epoch": 3.82626427406199, + "grad_norm": 0.1669679433107376, + "learning_rate": 3.8790410574633854e-05, + "loss": 0.0924, + "num_input_tokens_seen": 50640096, + "step": 23455 + }, + { + "epoch": 3.827079934747145, + "grad_norm": 0.09684005379676819, + "learning_rate": 3.8784472921627715e-05, + "loss": 0.1298, + "num_input_tokens_seen": 50650176, + "step": 23460 + }, + { + "epoch": 3.8278955954323, + "grad_norm": 0.09691516309976578, + "learning_rate": 3.877853415118224e-05, + "loss": 0.1424, + "num_input_tokens_seen": 50661504, + "step": 23465 + }, + { + "epoch": 3.828711256117455, + "grad_norm": 0.33604544401168823, + "learning_rate": 3.877259426377889e-05, + "loss": 0.0365, + "num_input_tokens_seen": 50672384, + "step": 23470 + }, + { + "epoch": 3.8295269168026103, + "grad_norm": 0.1445193588733673, + "learning_rate": 3.8766653259899165e-05, + "loss": 0.1171, + "num_input_tokens_seen": 50683168, + "step": 23475 + }, + { + "epoch": 3.830342577487765, + "grad_norm": 1.3966519832611084, + "learning_rate": 3.8760711140024677e-05, + "loss": 0.163, + "num_input_tokens_seen": 50692512, + "step": 23480 + }, + { + "epoch": 3.83115823817292, + "grad_norm": 0.38481605052948, + "learning_rate": 3.875476790463712e-05, + "loss": 0.1816, + "num_input_tokens_seen": 50703168, + "step": 23485 + }, + { + "epoch": 3.831973898858075, + "grad_norm": 0.4352918267250061, + "learning_rate": 3.8748823554218286e-05, + "loss": 0.2153, + "num_input_tokens_seen": 50714112, + "step": 23490 + }, + { + "epoch": 3.83278955954323, + "grad_norm": 1.5068241357803345, + "learning_rate": 3.8742878089250043e-05, + "loss": 0.1684, + "num_input_tokens_seen": 50724576, + "step": 23495 + }, + { + "epoch": 3.8336052202283852, + "grad_norm": 0.5219002366065979, + "learning_rate": 3.8736931510214385e-05, + "loss": 0.1178, + "num_input_tokens_seen": 50735392, + "step": 23500 + }, + { + "epoch": 3.83442088091354, + "grad_norm": 0.19887638092041016, + "learning_rate": 3.873098381759336e-05, + "loss": 0.0732, + "num_input_tokens_seen": 50747488, + "step": 23505 + }, + { + "epoch": 3.835236541598695, + "grad_norm": 0.6984885334968567, + "learning_rate": 3.872503501186911e-05, + "loss": 0.0998, + "num_input_tokens_seen": 50757120, + "step": 23510 + }, + { + "epoch": 3.8360522022838497, + "grad_norm": 0.06157013773918152, + "learning_rate": 3.871908509352388e-05, + "loss": 0.1385, + "num_input_tokens_seen": 50767136, + "step": 23515 + }, + { + "epoch": 3.836867862969005, + "grad_norm": 0.55604088306427, + "learning_rate": 3.871313406304001e-05, + "loss": 0.0332, + "num_input_tokens_seen": 50777728, + "step": 23520 + }, + { + "epoch": 3.8376835236541598, + "grad_norm": 1.1752097606658936, + "learning_rate": 3.87071819208999e-05, + "loss": 0.219, + "num_input_tokens_seen": 50787488, + "step": 23525 + }, + { + "epoch": 3.838499184339315, + "grad_norm": 1.4207184314727783, + "learning_rate": 3.870122866758609e-05, + "loss": 0.0975, + "num_input_tokens_seen": 50797760, + "step": 23530 + }, + { + "epoch": 3.83931484502447, + "grad_norm": 0.8263128399848938, + "learning_rate": 3.869527430358116e-05, + "loss": 0.0512, + "num_input_tokens_seen": 50808832, + "step": 23535 + }, + { + "epoch": 3.8401305057096247, + "grad_norm": 0.2943103015422821, + "learning_rate": 3.8689318829367796e-05, + "loss": 0.1108, + "num_input_tokens_seen": 50819296, + "step": 23540 + }, + { + "epoch": 3.84094616639478, + "grad_norm": 0.6642223596572876, + "learning_rate": 3.86833622454288e-05, + "loss": 0.0936, + "num_input_tokens_seen": 50831424, + "step": 23545 + }, + { + "epoch": 3.8417618270799347, + "grad_norm": 0.7188926935195923, + "learning_rate": 3.8677404552247024e-05, + "loss": 0.1214, + "num_input_tokens_seen": 50841152, + "step": 23550 + }, + { + "epoch": 3.8425774877650896, + "grad_norm": 0.7999119162559509, + "learning_rate": 3.8671445750305444e-05, + "loss": 0.2542, + "num_input_tokens_seen": 50850752, + "step": 23555 + }, + { + "epoch": 3.843393148450245, + "grad_norm": 0.43848463892936707, + "learning_rate": 3.8665485840087104e-05, + "loss": 0.0804, + "num_input_tokens_seen": 50860896, + "step": 23560 + }, + { + "epoch": 3.8442088091353996, + "grad_norm": 1.9523582458496094, + "learning_rate": 3.865952482207513e-05, + "loss": 0.1639, + "num_input_tokens_seen": 50871072, + "step": 23565 + }, + { + "epoch": 3.8450244698205545, + "grad_norm": 0.882192075252533, + "learning_rate": 3.865356269675278e-05, + "loss": 0.121, + "num_input_tokens_seen": 50880960, + "step": 23570 + }, + { + "epoch": 3.8458401305057097, + "grad_norm": 0.8030144572257996, + "learning_rate": 3.8647599464603355e-05, + "loss": 0.0862, + "num_input_tokens_seen": 50890080, + "step": 23575 + }, + { + "epoch": 3.8466557911908645, + "grad_norm": 0.20192958414554596, + "learning_rate": 3.864163512611028e-05, + "loss": 0.0285, + "num_input_tokens_seen": 50901216, + "step": 23580 + }, + { + "epoch": 3.84747145187602, + "grad_norm": 1.838523268699646, + "learning_rate": 3.863566968175703e-05, + "loss": 0.1469, + "num_input_tokens_seen": 50912416, + "step": 23585 + }, + { + "epoch": 3.8482871125611746, + "grad_norm": 0.28233271837234497, + "learning_rate": 3.862970313202722e-05, + "loss": 0.0678, + "num_input_tokens_seen": 50922752, + "step": 23590 + }, + { + "epoch": 3.8491027732463294, + "grad_norm": 0.1266106367111206, + "learning_rate": 3.86237354774045e-05, + "loss": 0.0251, + "num_input_tokens_seen": 50935168, + "step": 23595 + }, + { + "epoch": 3.8499184339314843, + "grad_norm": 1.2682360410690308, + "learning_rate": 3.861776671837267e-05, + "loss": 0.2057, + "num_input_tokens_seen": 50946304, + "step": 23600 + }, + { + "epoch": 3.8507340946166395, + "grad_norm": 0.4979730546474457, + "learning_rate": 3.861179685541557e-05, + "loss": 0.0692, + "num_input_tokens_seen": 50955872, + "step": 23605 + }, + { + "epoch": 3.8515497553017943, + "grad_norm": 0.44400954246520996, + "learning_rate": 3.8605825889017156e-05, + "loss": 0.1705, + "num_input_tokens_seen": 50967488, + "step": 23610 + }, + { + "epoch": 3.8523654159869496, + "grad_norm": 0.1429114043712616, + "learning_rate": 3.859985381966146e-05, + "loss": 0.0658, + "num_input_tokens_seen": 50977664, + "step": 23615 + }, + { + "epoch": 3.8531810766721044, + "grad_norm": 0.5022424459457397, + "learning_rate": 3.8593880647832606e-05, + "loss": 0.0503, + "num_input_tokens_seen": 50989920, + "step": 23620 + }, + { + "epoch": 3.8539967373572592, + "grad_norm": 1.4202947616577148, + "learning_rate": 3.858790637401482e-05, + "loss": 0.1106, + "num_input_tokens_seen": 50999136, + "step": 23625 + }, + { + "epoch": 3.8548123980424145, + "grad_norm": 0.025258250534534454, + "learning_rate": 3.858193099869239e-05, + "loss": 0.2241, + "num_input_tokens_seen": 51010336, + "step": 23630 + }, + { + "epoch": 3.8556280587275693, + "grad_norm": 1.193731427192688, + "learning_rate": 3.857595452234971e-05, + "loss": 0.21, + "num_input_tokens_seen": 51021792, + "step": 23635 + }, + { + "epoch": 3.8564437194127246, + "grad_norm": 0.06789184361696243, + "learning_rate": 3.856997694547129e-05, + "loss": 0.1368, + "num_input_tokens_seen": 51032416, + "step": 23640 + }, + { + "epoch": 3.8572593800978794, + "grad_norm": 1.0217015743255615, + "learning_rate": 3.856399826854168e-05, + "loss": 0.1144, + "num_input_tokens_seen": 51042304, + "step": 23645 + }, + { + "epoch": 3.858075040783034, + "grad_norm": 0.44159433245658875, + "learning_rate": 3.855801849204555e-05, + "loss": 0.1526, + "num_input_tokens_seen": 51053664, + "step": 23650 + }, + { + "epoch": 3.858890701468189, + "grad_norm": 0.22466833889484406, + "learning_rate": 3.855203761646764e-05, + "loss": 0.0576, + "num_input_tokens_seen": 51064608, + "step": 23655 + }, + { + "epoch": 3.8597063621533443, + "grad_norm": 0.07724909484386444, + "learning_rate": 3.85460556422928e-05, + "loss": 0.1347, + "num_input_tokens_seen": 51074208, + "step": 23660 + }, + { + "epoch": 3.860522022838499, + "grad_norm": 1.6864056587219238, + "learning_rate": 3.854007257000596e-05, + "loss": 0.1166, + "num_input_tokens_seen": 51085600, + "step": 23665 + }, + { + "epoch": 3.8613376835236544, + "grad_norm": 0.37252840399742126, + "learning_rate": 3.853408840009214e-05, + "loss": 0.1253, + "num_input_tokens_seen": 51097344, + "step": 23670 + }, + { + "epoch": 3.862153344208809, + "grad_norm": 0.08836593478918076, + "learning_rate": 3.8528103133036434e-05, + "loss": 0.042, + "num_input_tokens_seen": 51109184, + "step": 23675 + }, + { + "epoch": 3.862969004893964, + "grad_norm": 0.22672097384929657, + "learning_rate": 3.8522116769324056e-05, + "loss": 0.0964, + "num_input_tokens_seen": 51120800, + "step": 23680 + }, + { + "epoch": 3.863784665579119, + "grad_norm": 0.30504074692726135, + "learning_rate": 3.851612930944027e-05, + "loss": 0.0573, + "num_input_tokens_seen": 51131008, + "step": 23685 + }, + { + "epoch": 3.864600326264274, + "grad_norm": 1.2891191244125366, + "learning_rate": 3.851014075387048e-05, + "loss": 0.2495, + "num_input_tokens_seen": 51141600, + "step": 23690 + }, + { + "epoch": 3.865415986949429, + "grad_norm": 0.7296348214149475, + "learning_rate": 3.850415110310012e-05, + "loss": 0.0557, + "num_input_tokens_seen": 51152128, + "step": 23695 + }, + { + "epoch": 3.866231647634584, + "grad_norm": 0.11870774626731873, + "learning_rate": 3.8498160357614756e-05, + "loss": 0.0996, + "num_input_tokens_seen": 51163840, + "step": 23700 + }, + { + "epoch": 3.867047308319739, + "grad_norm": 0.5508372187614441, + "learning_rate": 3.8492168517900016e-05, + "loss": 0.1129, + "num_input_tokens_seen": 51173984, + "step": 23705 + }, + { + "epoch": 3.867862969004894, + "grad_norm": 0.6811441779136658, + "learning_rate": 3.8486175584441643e-05, + "loss": 0.1295, + "num_input_tokens_seen": 51185120, + "step": 23710 + }, + { + "epoch": 3.868678629690049, + "grad_norm": 0.03830844908952713, + "learning_rate": 3.8480181557725455e-05, + "loss": 0.0358, + "num_input_tokens_seen": 51195968, + "step": 23715 + }, + { + "epoch": 3.869494290375204, + "grad_norm": 0.3983684182167053, + "learning_rate": 3.847418643823735e-05, + "loss": 0.0728, + "num_input_tokens_seen": 51206368, + "step": 23720 + }, + { + "epoch": 3.870309951060359, + "grad_norm": 0.260383278131485, + "learning_rate": 3.8468190226463316e-05, + "loss": 0.1293, + "num_input_tokens_seen": 51216928, + "step": 23725 + }, + { + "epoch": 3.871125611745514, + "grad_norm": 1.1635099649429321, + "learning_rate": 3.846219292288945e-05, + "loss": 0.3199, + "num_input_tokens_seen": 51227936, + "step": 23730 + }, + { + "epoch": 3.8719412724306688, + "grad_norm": 0.5541387796401978, + "learning_rate": 3.845619452800192e-05, + "loss": 0.1601, + "num_input_tokens_seen": 51238880, + "step": 23735 + }, + { + "epoch": 3.8727569331158236, + "grad_norm": 1.0262653827667236, + "learning_rate": 3.845019504228699e-05, + "loss": 0.1039, + "num_input_tokens_seen": 51250560, + "step": 23740 + }, + { + "epoch": 3.873572593800979, + "grad_norm": 0.09979766607284546, + "learning_rate": 3.8444194466230994e-05, + "loss": 0.043, + "num_input_tokens_seen": 51261440, + "step": 23745 + }, + { + "epoch": 3.8743882544861337, + "grad_norm": 0.2696254253387451, + "learning_rate": 3.843819280032038e-05, + "loss": 0.1995, + "num_input_tokens_seen": 51272640, + "step": 23750 + }, + { + "epoch": 3.875203915171289, + "grad_norm": 1.562738060951233, + "learning_rate": 3.843219004504168e-05, + "loss": 0.2958, + "num_input_tokens_seen": 51284064, + "step": 23755 + }, + { + "epoch": 3.8760195758564437, + "grad_norm": 1.2842038869857788, + "learning_rate": 3.84261862008815e-05, + "loss": 0.1277, + "num_input_tokens_seen": 51294368, + "step": 23760 + }, + { + "epoch": 3.8768352365415986, + "grad_norm": 0.18769167363643646, + "learning_rate": 3.8420181268326536e-05, + "loss": 0.0618, + "num_input_tokens_seen": 51304704, + "step": 23765 + }, + { + "epoch": 3.877650897226754, + "grad_norm": 1.0148802995681763, + "learning_rate": 3.841417524786359e-05, + "loss": 0.0957, + "num_input_tokens_seen": 51316128, + "step": 23770 + }, + { + "epoch": 3.8784665579119086, + "grad_norm": 0.01527851726859808, + "learning_rate": 3.840816813997954e-05, + "loss": 0.0666, + "num_input_tokens_seen": 51327712, + "step": 23775 + }, + { + "epoch": 3.8792822185970635, + "grad_norm": 1.5775858163833618, + "learning_rate": 3.8402159945161346e-05, + "loss": 0.1928, + "num_input_tokens_seen": 51338624, + "step": 23780 + }, + { + "epoch": 3.8800978792822187, + "grad_norm": 0.03907257691025734, + "learning_rate": 3.839615066389607e-05, + "loss": 0.0538, + "num_input_tokens_seen": 51348928, + "step": 23785 + }, + { + "epoch": 3.8809135399673735, + "grad_norm": 0.9343230724334717, + "learning_rate": 3.839014029667084e-05, + "loss": 0.1348, + "num_input_tokens_seen": 51359744, + "step": 23790 + }, + { + "epoch": 3.8817292006525284, + "grad_norm": 0.08258652687072754, + "learning_rate": 3.83841288439729e-05, + "loss": 0.1826, + "num_input_tokens_seen": 51371616, + "step": 23795 + }, + { + "epoch": 3.8825448613376836, + "grad_norm": 0.1941588968038559, + "learning_rate": 3.837811630628957e-05, + "loss": 0.1384, + "num_input_tokens_seen": 51382752, + "step": 23800 + }, + { + "epoch": 3.8833605220228384, + "grad_norm": 1.1839029788970947, + "learning_rate": 3.837210268410824e-05, + "loss": 0.3149, + "num_input_tokens_seen": 51393440, + "step": 23805 + }, + { + "epoch": 3.8841761827079937, + "grad_norm": 0.28859883546829224, + "learning_rate": 3.836608797791642e-05, + "loss": 0.0342, + "num_input_tokens_seen": 51401792, + "step": 23810 + }, + { + "epoch": 3.8849918433931485, + "grad_norm": 0.5674793720245361, + "learning_rate": 3.8360072188201704e-05, + "loss": 0.2795, + "num_input_tokens_seen": 51413216, + "step": 23815 + }, + { + "epoch": 3.8858075040783033, + "grad_norm": 0.3157980740070343, + "learning_rate": 3.835405531545173e-05, + "loss": 0.1818, + "num_input_tokens_seen": 51423840, + "step": 23820 + }, + { + "epoch": 3.886623164763458, + "grad_norm": 0.09118959307670593, + "learning_rate": 3.834803736015428e-05, + "loss": 0.0728, + "num_input_tokens_seen": 51434464, + "step": 23825 + }, + { + "epoch": 3.8874388254486134, + "grad_norm": 0.4672378897666931, + "learning_rate": 3.8342018322797205e-05, + "loss": 0.1922, + "num_input_tokens_seen": 51446112, + "step": 23830 + }, + { + "epoch": 3.8882544861337682, + "grad_norm": 1.4092789888381958, + "learning_rate": 3.833599820386842e-05, + "loss": 0.2304, + "num_input_tokens_seen": 51456704, + "step": 23835 + }, + { + "epoch": 3.8890701468189235, + "grad_norm": 1.4864208698272705, + "learning_rate": 3.8329977003855956e-05, + "loss": 0.161, + "num_input_tokens_seen": 51467424, + "step": 23840 + }, + { + "epoch": 3.8898858075040783, + "grad_norm": 0.2715340256690979, + "learning_rate": 3.832395472324791e-05, + "loss": 0.0563, + "num_input_tokens_seen": 51477792, + "step": 23845 + }, + { + "epoch": 3.890701468189233, + "grad_norm": 1.171617865562439, + "learning_rate": 3.83179313625325e-05, + "loss": 0.2706, + "num_input_tokens_seen": 51486368, + "step": 23850 + }, + { + "epoch": 3.8915171288743884, + "grad_norm": 1.808974027633667, + "learning_rate": 3.8311906922198005e-05, + "loss": 0.1553, + "num_input_tokens_seen": 51496224, + "step": 23855 + }, + { + "epoch": 3.892332789559543, + "grad_norm": 0.08512428402900696, + "learning_rate": 3.830588140273278e-05, + "loss": 0.139, + "num_input_tokens_seen": 51507872, + "step": 23860 + }, + { + "epoch": 3.8931484502446985, + "grad_norm": 0.7790274620056152, + "learning_rate": 3.829985480462529e-05, + "loss": 0.0879, + "num_input_tokens_seen": 51518112, + "step": 23865 + }, + { + "epoch": 3.8939641109298533, + "grad_norm": 1.6626739501953125, + "learning_rate": 3.82938271283641e-05, + "loss": 0.1121, + "num_input_tokens_seen": 51529120, + "step": 23870 + }, + { + "epoch": 3.894779771615008, + "grad_norm": 0.5072335004806519, + "learning_rate": 3.828779837443783e-05, + "loss": 0.1461, + "num_input_tokens_seen": 51540768, + "step": 23875 + }, + { + "epoch": 3.895595432300163, + "grad_norm": 0.31471213698387146, + "learning_rate": 3.8281768543335195e-05, + "loss": 0.1653, + "num_input_tokens_seen": 51550304, + "step": 23880 + }, + { + "epoch": 3.896411092985318, + "grad_norm": 0.41274648904800415, + "learning_rate": 3.827573763554502e-05, + "loss": 0.1372, + "num_input_tokens_seen": 51561088, + "step": 23885 + }, + { + "epoch": 3.897226753670473, + "grad_norm": 0.4272482097148895, + "learning_rate": 3.826970565155618e-05, + "loss": 0.1772, + "num_input_tokens_seen": 51571264, + "step": 23890 + }, + { + "epoch": 3.8980424143556283, + "grad_norm": 0.10646409541368484, + "learning_rate": 3.8263672591857666e-05, + "loss": 0.0683, + "num_input_tokens_seen": 51582912, + "step": 23895 + }, + { + "epoch": 3.898858075040783, + "grad_norm": 0.5710664391517639, + "learning_rate": 3.825763845693857e-05, + "loss": 0.0757, + "num_input_tokens_seen": 51594144, + "step": 23900 + }, + { + "epoch": 3.899673735725938, + "grad_norm": 0.13217692077159882, + "learning_rate": 3.825160324728802e-05, + "loss": 0.1047, + "num_input_tokens_seen": 51604928, + "step": 23905 + }, + { + "epoch": 3.9004893964110927, + "grad_norm": 1.7496857643127441, + "learning_rate": 3.824556696339528e-05, + "loss": 0.2591, + "num_input_tokens_seen": 51614176, + "step": 23910 + }, + { + "epoch": 3.901305057096248, + "grad_norm": 1.383554458618164, + "learning_rate": 3.823952960574967e-05, + "loss": 0.0997, + "num_input_tokens_seen": 51624992, + "step": 23915 + }, + { + "epoch": 3.902120717781403, + "grad_norm": 0.5849953293800354, + "learning_rate": 3.823349117484062e-05, + "loss": 0.0359, + "num_input_tokens_seen": 51635744, + "step": 23920 + }, + { + "epoch": 3.902936378466558, + "grad_norm": 0.4051600396633148, + "learning_rate": 3.822745167115762e-05, + "loss": 0.0408, + "num_input_tokens_seen": 51645472, + "step": 23925 + }, + { + "epoch": 3.903752039151713, + "grad_norm": 1.854121208190918, + "learning_rate": 3.822141109519027e-05, + "loss": 0.1931, + "num_input_tokens_seen": 51656128, + "step": 23930 + }, + { + "epoch": 3.9045676998368677, + "grad_norm": 0.7237974405288696, + "learning_rate": 3.821536944742827e-05, + "loss": 0.2701, + "num_input_tokens_seen": 51666368, + "step": 23935 + }, + { + "epoch": 3.905383360522023, + "grad_norm": 1.121891736984253, + "learning_rate": 3.820932672836135e-05, + "loss": 0.1194, + "num_input_tokens_seen": 51677792, + "step": 23940 + }, + { + "epoch": 3.9061990212071778, + "grad_norm": 0.38926252722740173, + "learning_rate": 3.820328293847939e-05, + "loss": 0.2014, + "num_input_tokens_seen": 51689280, + "step": 23945 + }, + { + "epoch": 3.907014681892333, + "grad_norm": 0.21109870076179504, + "learning_rate": 3.819723807827232e-05, + "loss": 0.0663, + "num_input_tokens_seen": 51699712, + "step": 23950 + }, + { + "epoch": 3.907830342577488, + "grad_norm": 1.0770444869995117, + "learning_rate": 3.8191192148230176e-05, + "loss": 0.2194, + "num_input_tokens_seen": 51710624, + "step": 23955 + }, + { + "epoch": 3.9086460032626427, + "grad_norm": 1.0308058261871338, + "learning_rate": 3.818514514884306e-05, + "loss": 0.1133, + "num_input_tokens_seen": 51721952, + "step": 23960 + }, + { + "epoch": 3.9094616639477975, + "grad_norm": 0.10407707840204239, + "learning_rate": 3.8179097080601175e-05, + "loss": 0.1688, + "num_input_tokens_seen": 51731584, + "step": 23965 + }, + { + "epoch": 3.9102773246329527, + "grad_norm": 0.13172735273838043, + "learning_rate": 3.817304794399481e-05, + "loss": 0.1651, + "num_input_tokens_seen": 51743232, + "step": 23970 + }, + { + "epoch": 3.9110929853181076, + "grad_norm": 0.2634405791759491, + "learning_rate": 3.816699773951434e-05, + "loss": 0.1224, + "num_input_tokens_seen": 51755712, + "step": 23975 + }, + { + "epoch": 3.911908646003263, + "grad_norm": 0.3391109108924866, + "learning_rate": 3.8160946467650226e-05, + "loss": 0.0569, + "num_input_tokens_seen": 51767776, + "step": 23980 + }, + { + "epoch": 3.9127243066884176, + "grad_norm": 0.639576256275177, + "learning_rate": 3.815489412889302e-05, + "loss": 0.1449, + "num_input_tokens_seen": 51778176, + "step": 23985 + }, + { + "epoch": 3.9135399673735725, + "grad_norm": 0.3108399510383606, + "learning_rate": 3.8148840723733335e-05, + "loss": 0.1425, + "num_input_tokens_seen": 51789024, + "step": 23990 + }, + { + "epoch": 3.9143556280587277, + "grad_norm": 0.4041057527065277, + "learning_rate": 3.814278625266191e-05, + "loss": 0.0662, + "num_input_tokens_seen": 51798752, + "step": 23995 + }, + { + "epoch": 3.9151712887438825, + "grad_norm": 0.8107935190200806, + "learning_rate": 3.8136730716169554e-05, + "loss": 0.141, + "num_input_tokens_seen": 51809856, + "step": 24000 + }, + { + "epoch": 3.9159869494290374, + "grad_norm": 0.24182677268981934, + "learning_rate": 3.8130674114747146e-05, + "loss": 0.0365, + "num_input_tokens_seen": 51819584, + "step": 24005 + }, + { + "epoch": 3.9168026101141926, + "grad_norm": 1.1841930150985718, + "learning_rate": 3.812461644888566e-05, + "loss": 0.1156, + "num_input_tokens_seen": 51831168, + "step": 24010 + }, + { + "epoch": 3.9176182707993474, + "grad_norm": 0.5022777915000916, + "learning_rate": 3.8118557719076186e-05, + "loss": 0.074, + "num_input_tokens_seen": 51839776, + "step": 24015 + }, + { + "epoch": 3.9184339314845023, + "grad_norm": 0.06826812773942947, + "learning_rate": 3.811249792580985e-05, + "loss": 0.1507, + "num_input_tokens_seen": 51850368, + "step": 24020 + }, + { + "epoch": 3.9192495921696575, + "grad_norm": 1.5066953897476196, + "learning_rate": 3.810643706957791e-05, + "loss": 0.1565, + "num_input_tokens_seen": 51861248, + "step": 24025 + }, + { + "epoch": 3.9200652528548123, + "grad_norm": 1.1135014295578003, + "learning_rate": 3.810037515087167e-05, + "loss": 0.2856, + "num_input_tokens_seen": 51872928, + "step": 24030 + }, + { + "epoch": 3.9208809135399676, + "grad_norm": 1.7112643718719482, + "learning_rate": 3.809431217018255e-05, + "loss": 0.2618, + "num_input_tokens_seen": 51883680, + "step": 24035 + }, + { + "epoch": 3.9216965742251224, + "grad_norm": 1.332648515701294, + "learning_rate": 3.8088248128002044e-05, + "loss": 0.319, + "num_input_tokens_seen": 51895808, + "step": 24040 + }, + { + "epoch": 3.9225122349102772, + "grad_norm": 0.5167086720466614, + "learning_rate": 3.808218302482175e-05, + "loss": 0.1046, + "num_input_tokens_seen": 51905760, + "step": 24045 + }, + { + "epoch": 3.923327895595432, + "grad_norm": 0.6628379821777344, + "learning_rate": 3.8076116861133305e-05, + "loss": 0.2098, + "num_input_tokens_seen": 51916288, + "step": 24050 + }, + { + "epoch": 3.9241435562805873, + "grad_norm": 1.499902606010437, + "learning_rate": 3.8070049637428485e-05, + "loss": 0.058, + "num_input_tokens_seen": 51927040, + "step": 24055 + }, + { + "epoch": 3.924959216965742, + "grad_norm": 0.08888544887304306, + "learning_rate": 3.806398135419913e-05, + "loss": 0.0392, + "num_input_tokens_seen": 51937760, + "step": 24060 + }, + { + "epoch": 3.9257748776508974, + "grad_norm": 0.7418966293334961, + "learning_rate": 3.805791201193716e-05, + "loss": 0.0551, + "num_input_tokens_seen": 51948384, + "step": 24065 + }, + { + "epoch": 3.926590538336052, + "grad_norm": 0.6177875399589539, + "learning_rate": 3.8051841611134576e-05, + "loss": 0.1724, + "num_input_tokens_seen": 51958496, + "step": 24070 + }, + { + "epoch": 3.927406199021207, + "grad_norm": 0.13845227658748627, + "learning_rate": 3.804577015228349e-05, + "loss": 0.0485, + "num_input_tokens_seen": 51969216, + "step": 24075 + }, + { + "epoch": 3.9282218597063623, + "grad_norm": 0.12449220567941666, + "learning_rate": 3.803969763587609e-05, + "loss": 0.0663, + "num_input_tokens_seen": 51980768, + "step": 24080 + }, + { + "epoch": 3.929037520391517, + "grad_norm": 0.3435952365398407, + "learning_rate": 3.803362406240463e-05, + "loss": 0.0652, + "num_input_tokens_seen": 51990976, + "step": 24085 + }, + { + "epoch": 3.9298531810766724, + "grad_norm": 0.8796844482421875, + "learning_rate": 3.802754943236148e-05, + "loss": 0.1154, + "num_input_tokens_seen": 52002240, + "step": 24090 + }, + { + "epoch": 3.930668841761827, + "grad_norm": 1.4424340724945068, + "learning_rate": 3.8021473746239064e-05, + "loss": 0.1089, + "num_input_tokens_seen": 52013760, + "step": 24095 + }, + { + "epoch": 3.931484502446982, + "grad_norm": 0.2785103917121887, + "learning_rate": 3.801539700452992e-05, + "loss": 0.0993, + "num_input_tokens_seen": 52024960, + "step": 24100 + }, + { + "epoch": 3.932300163132137, + "grad_norm": 0.11304652690887451, + "learning_rate": 3.800931920772666e-05, + "loss": 0.0986, + "num_input_tokens_seen": 52035872, + "step": 24105 + }, + { + "epoch": 3.933115823817292, + "grad_norm": 0.06485164910554886, + "learning_rate": 3.8003240356321965e-05, + "loss": 0.1797, + "num_input_tokens_seen": 52047136, + "step": 24110 + }, + { + "epoch": 3.933931484502447, + "grad_norm": 0.059327129274606705, + "learning_rate": 3.7997160450808634e-05, + "loss": 0.2418, + "num_input_tokens_seen": 52059040, + "step": 24115 + }, + { + "epoch": 3.934747145187602, + "grad_norm": 0.12350811809301376, + "learning_rate": 3.7991079491679524e-05, + "loss": 0.109, + "num_input_tokens_seen": 52070176, + "step": 24120 + }, + { + "epoch": 3.935562805872757, + "grad_norm": 0.188985213637352, + "learning_rate": 3.79849974794276e-05, + "loss": 0.1542, + "num_input_tokens_seen": 52080576, + "step": 24125 + }, + { + "epoch": 3.936378466557912, + "grad_norm": 0.5610558986663818, + "learning_rate": 3.7978914414545895e-05, + "loss": 0.0991, + "num_input_tokens_seen": 52091744, + "step": 24130 + }, + { + "epoch": 3.9371941272430666, + "grad_norm": 0.2215762734413147, + "learning_rate": 3.797283029752753e-05, + "loss": 0.2482, + "num_input_tokens_seen": 52102432, + "step": 24135 + }, + { + "epoch": 3.938009787928222, + "grad_norm": 0.10263261944055557, + "learning_rate": 3.796674512886573e-05, + "loss": 0.1286, + "num_input_tokens_seen": 52111968, + "step": 24140 + }, + { + "epoch": 3.9388254486133767, + "grad_norm": 0.33701106905937195, + "learning_rate": 3.7960658909053766e-05, + "loss": 0.1213, + "num_input_tokens_seen": 52123872, + "step": 24145 + }, + { + "epoch": 3.939641109298532, + "grad_norm": 0.1156352236866951, + "learning_rate": 3.7954571638585035e-05, + "loss": 0.0445, + "num_input_tokens_seen": 52134400, + "step": 24150 + }, + { + "epoch": 3.9404567699836868, + "grad_norm": 0.07244640588760376, + "learning_rate": 3.7948483317952985e-05, + "loss": 0.0356, + "num_input_tokens_seen": 52144448, + "step": 24155 + }, + { + "epoch": 3.9412724306688416, + "grad_norm": 1.311730146408081, + "learning_rate": 3.794239394765119e-05, + "loss": 0.2277, + "num_input_tokens_seen": 52155840, + "step": 24160 + }, + { + "epoch": 3.942088091353997, + "grad_norm": 0.18571235239505768, + "learning_rate": 3.793630352817327e-05, + "loss": 0.073, + "num_input_tokens_seen": 52167968, + "step": 24165 + }, + { + "epoch": 3.9429037520391517, + "grad_norm": 0.8546648621559143, + "learning_rate": 3.7930212060012946e-05, + "loss": 0.0655, + "num_input_tokens_seen": 52177792, + "step": 24170 + }, + { + "epoch": 3.943719412724307, + "grad_norm": 0.7812620401382446, + "learning_rate": 3.792411954366402e-05, + "loss": 0.0824, + "num_input_tokens_seen": 52188992, + "step": 24175 + }, + { + "epoch": 3.9445350734094617, + "grad_norm": 1.3593146800994873, + "learning_rate": 3.791802597962039e-05, + "loss": 0.1438, + "num_input_tokens_seen": 52200192, + "step": 24180 + }, + { + "epoch": 3.9453507340946166, + "grad_norm": 1.0203273296356201, + "learning_rate": 3.791193136837603e-05, + "loss": 0.1783, + "num_input_tokens_seen": 52210944, + "step": 24185 + }, + { + "epoch": 3.9461663947797714, + "grad_norm": 0.5696756839752197, + "learning_rate": 3.7905835710425e-05, + "loss": 0.1653, + "num_input_tokens_seen": 52220960, + "step": 24190 + }, + { + "epoch": 3.9469820554649266, + "grad_norm": 1.8119621276855469, + "learning_rate": 3.789973900626145e-05, + "loss": 0.1628, + "num_input_tokens_seen": 52231232, + "step": 24195 + }, + { + "epoch": 3.9477977161500815, + "grad_norm": 0.04743614420294762, + "learning_rate": 3.78936412563796e-05, + "loss": 0.1206, + "num_input_tokens_seen": 52240032, + "step": 24200 + }, + { + "epoch": 3.9486133768352367, + "grad_norm": 0.16305673122406006, + "learning_rate": 3.788754246127375e-05, + "loss": 0.0767, + "num_input_tokens_seen": 52251968, + "step": 24205 + }, + { + "epoch": 3.9494290375203915, + "grad_norm": 0.020633898675441742, + "learning_rate": 3.7881442621438333e-05, + "loss": 0.0373, + "num_input_tokens_seen": 52263072, + "step": 24210 + }, + { + "epoch": 3.9502446982055464, + "grad_norm": 1.8059253692626953, + "learning_rate": 3.787534173736782e-05, + "loss": 0.2029, + "num_input_tokens_seen": 52274336, + "step": 24215 + }, + { + "epoch": 3.9510603588907016, + "grad_norm": 1.3515090942382812, + "learning_rate": 3.786923980955678e-05, + "loss": 0.1441, + "num_input_tokens_seen": 52284992, + "step": 24220 + }, + { + "epoch": 3.9518760195758564, + "grad_norm": 0.7591730952262878, + "learning_rate": 3.7863136838499855e-05, + "loss": 0.0785, + "num_input_tokens_seen": 52296032, + "step": 24225 + }, + { + "epoch": 3.9526916802610113, + "grad_norm": 1.2754985094070435, + "learning_rate": 3.785703282469179e-05, + "loss": 0.2643, + "num_input_tokens_seen": 52306432, + "step": 24230 + }, + { + "epoch": 3.9535073409461665, + "grad_norm": 0.17808884382247925, + "learning_rate": 3.785092776862741e-05, + "loss": 0.0653, + "num_input_tokens_seen": 52316640, + "step": 24235 + }, + { + "epoch": 3.9543230016313213, + "grad_norm": 1.3192689418792725, + "learning_rate": 3.784482167080162e-05, + "loss": 0.1519, + "num_input_tokens_seen": 52327104, + "step": 24240 + }, + { + "epoch": 3.955138662316476, + "grad_norm": 0.08583094924688339, + "learning_rate": 3.783871453170941e-05, + "loss": 0.1753, + "num_input_tokens_seen": 52338080, + "step": 24245 + }, + { + "epoch": 3.9559543230016314, + "grad_norm": 0.2242617905139923, + "learning_rate": 3.783260635184586e-05, + "loss": 0.3616, + "num_input_tokens_seen": 52348576, + "step": 24250 + }, + { + "epoch": 3.9567699836867862, + "grad_norm": 0.07552061975002289, + "learning_rate": 3.782649713170613e-05, + "loss": 0.1719, + "num_input_tokens_seen": 52358848, + "step": 24255 + }, + { + "epoch": 3.9575856443719415, + "grad_norm": 0.1420482099056244, + "learning_rate": 3.7820386871785455e-05, + "loss": 0.057, + "num_input_tokens_seen": 52369248, + "step": 24260 + }, + { + "epoch": 3.9584013050570963, + "grad_norm": 0.8325637578964233, + "learning_rate": 3.7814275572579175e-05, + "loss": 0.1851, + "num_input_tokens_seen": 52380640, + "step": 24265 + }, + { + "epoch": 3.959216965742251, + "grad_norm": 0.6581172347068787, + "learning_rate": 3.780816323458269e-05, + "loss": 0.1566, + "num_input_tokens_seen": 52391136, + "step": 24270 + }, + { + "epoch": 3.960032626427406, + "grad_norm": 0.34368014335632324, + "learning_rate": 3.7802049858291515e-05, + "loss": 0.0385, + "num_input_tokens_seen": 52401824, + "step": 24275 + }, + { + "epoch": 3.960848287112561, + "grad_norm": 1.1478301286697388, + "learning_rate": 3.779593544420122e-05, + "loss": 0.4089, + "num_input_tokens_seen": 52412576, + "step": 24280 + }, + { + "epoch": 3.961663947797716, + "grad_norm": 1.023537516593933, + "learning_rate": 3.7789819992807474e-05, + "loss": 0.1366, + "num_input_tokens_seen": 52422720, + "step": 24285 + }, + { + "epoch": 3.9624796084828713, + "grad_norm": 1.440125823020935, + "learning_rate": 3.778370350460601e-05, + "loss": 0.1428, + "num_input_tokens_seen": 52432800, + "step": 24290 + }, + { + "epoch": 3.963295269168026, + "grad_norm": 0.2123410850763321, + "learning_rate": 3.777758598009269e-05, + "loss": 0.0675, + "num_input_tokens_seen": 52443680, + "step": 24295 + }, + { + "epoch": 3.964110929853181, + "grad_norm": 0.06274532526731491, + "learning_rate": 3.777146741976342e-05, + "loss": 0.184, + "num_input_tokens_seen": 52454976, + "step": 24300 + }, + { + "epoch": 3.964926590538336, + "grad_norm": 0.8209527730941772, + "learning_rate": 3.776534782411419e-05, + "loss": 0.2984, + "num_input_tokens_seen": 52465376, + "step": 24305 + }, + { + "epoch": 3.965742251223491, + "grad_norm": 0.7401530742645264, + "learning_rate": 3.77592271936411e-05, + "loss": 0.2109, + "num_input_tokens_seen": 52477408, + "step": 24310 + }, + { + "epoch": 3.9665579119086463, + "grad_norm": 0.6581721305847168, + "learning_rate": 3.775310552884031e-05, + "loss": 0.1035, + "num_input_tokens_seen": 52489472, + "step": 24315 + }, + { + "epoch": 3.967373572593801, + "grad_norm": 0.2654457986354828, + "learning_rate": 3.7746982830208075e-05, + "loss": 0.0496, + "num_input_tokens_seen": 52499648, + "step": 24320 + }, + { + "epoch": 3.968189233278956, + "grad_norm": 0.24737143516540527, + "learning_rate": 3.774085909824074e-05, + "loss": 0.2856, + "num_input_tokens_seen": 52508096, + "step": 24325 + }, + { + "epoch": 3.9690048939641107, + "grad_norm": 0.327981173992157, + "learning_rate": 3.7734734333434726e-05, + "loss": 0.0581, + "num_input_tokens_seen": 52520224, + "step": 24330 + }, + { + "epoch": 3.969820554649266, + "grad_norm": 0.11817782372236252, + "learning_rate": 3.772860853628652e-05, + "loss": 0.0824, + "num_input_tokens_seen": 52531520, + "step": 24335 + }, + { + "epoch": 3.970636215334421, + "grad_norm": 0.12562112510204315, + "learning_rate": 3.772248170729272e-05, + "loss": 0.1482, + "num_input_tokens_seen": 52543040, + "step": 24340 + }, + { + "epoch": 3.971451876019576, + "grad_norm": 1.1199324131011963, + "learning_rate": 3.771635384695001e-05, + "loss": 0.3657, + "num_input_tokens_seen": 52553760, + "step": 24345 + }, + { + "epoch": 3.972267536704731, + "grad_norm": 0.5529919266700745, + "learning_rate": 3.771022495575513e-05, + "loss": 0.108, + "num_input_tokens_seen": 52564416, + "step": 24350 + }, + { + "epoch": 3.9730831973898857, + "grad_norm": 1.1147733926773071, + "learning_rate": 3.770409503420492e-05, + "loss": 0.1784, + "num_input_tokens_seen": 52574880, + "step": 24355 + }, + { + "epoch": 3.9738988580750405, + "grad_norm": 0.4246824383735657, + "learning_rate": 3.769796408279631e-05, + "loss": 0.2941, + "num_input_tokens_seen": 52586464, + "step": 24360 + }, + { + "epoch": 3.9747145187601958, + "grad_norm": 0.08181697875261307, + "learning_rate": 3.76918321020263e-05, + "loss": 0.0387, + "num_input_tokens_seen": 52596608, + "step": 24365 + }, + { + "epoch": 3.9755301794453506, + "grad_norm": 0.45836085081100464, + "learning_rate": 3.768569909239199e-05, + "loss": 0.1128, + "num_input_tokens_seen": 52607680, + "step": 24370 + }, + { + "epoch": 3.976345840130506, + "grad_norm": 1.3314580917358398, + "learning_rate": 3.767956505439054e-05, + "loss": 0.207, + "num_input_tokens_seen": 52619648, + "step": 24375 + }, + { + "epoch": 3.9771615008156607, + "grad_norm": 0.10198111087083817, + "learning_rate": 3.767342998851921e-05, + "loss": 0.0896, + "num_input_tokens_seen": 52630528, + "step": 24380 + }, + { + "epoch": 3.9779771615008155, + "grad_norm": 0.07803508639335632, + "learning_rate": 3.766729389527535e-05, + "loss": 0.0437, + "num_input_tokens_seen": 52640608, + "step": 24385 + }, + { + "epoch": 3.9787928221859707, + "grad_norm": 1.2967859506607056, + "learning_rate": 3.766115677515637e-05, + "loss": 0.1706, + "num_input_tokens_seen": 52651872, + "step": 24390 + }, + { + "epoch": 3.9796084828711256, + "grad_norm": 0.48636394739151, + "learning_rate": 3.765501862865976e-05, + "loss": 0.1172, + "num_input_tokens_seen": 52663904, + "step": 24395 + }, + { + "epoch": 3.980424143556281, + "grad_norm": 0.9684281349182129, + "learning_rate": 3.764887945628315e-05, + "loss": 0.184, + "num_input_tokens_seen": 52674848, + "step": 24400 + }, + { + "epoch": 3.9812398042414356, + "grad_norm": 1.0375940799713135, + "learning_rate": 3.76427392585242e-05, + "loss": 0.3536, + "num_input_tokens_seen": 52686656, + "step": 24405 + }, + { + "epoch": 3.9820554649265905, + "grad_norm": 0.09999796748161316, + "learning_rate": 3.7636598035880633e-05, + "loss": 0.0714, + "num_input_tokens_seen": 52696544, + "step": 24410 + }, + { + "epoch": 3.9828711256117453, + "grad_norm": 0.05761435627937317, + "learning_rate": 3.763045578885033e-05, + "loss": 0.0934, + "num_input_tokens_seen": 52706912, + "step": 24415 + }, + { + "epoch": 3.9836867862969005, + "grad_norm": 0.10995554178953171, + "learning_rate": 3.762431251793118e-05, + "loss": 0.1095, + "num_input_tokens_seen": 52718016, + "step": 24420 + }, + { + "epoch": 3.9845024469820554, + "grad_norm": 1.7158801555633545, + "learning_rate": 3.7618168223621215e-05, + "loss": 0.4554, + "num_input_tokens_seen": 52728288, + "step": 24425 + }, + { + "epoch": 3.9853181076672106, + "grad_norm": 0.2522057592868805, + "learning_rate": 3.761202290641851e-05, + "loss": 0.0947, + "num_input_tokens_seen": 52739104, + "step": 24430 + }, + { + "epoch": 3.9861337683523654, + "grad_norm": 1.1959582567214966, + "learning_rate": 3.760587656682122e-05, + "loss": 0.0819, + "num_input_tokens_seen": 52749536, + "step": 24435 + }, + { + "epoch": 3.9869494290375203, + "grad_norm": 1.26105797290802, + "learning_rate": 3.759972920532762e-05, + "loss": 0.2281, + "num_input_tokens_seen": 52760416, + "step": 24440 + }, + { + "epoch": 3.9877650897226755, + "grad_norm": 0.14686807990074158, + "learning_rate": 3.759358082243604e-05, + "loss": 0.124, + "num_input_tokens_seen": 52772160, + "step": 24445 + }, + { + "epoch": 3.9885807504078303, + "grad_norm": 0.6119867563247681, + "learning_rate": 3.7587431418644906e-05, + "loss": 0.075, + "num_input_tokens_seen": 52781984, + "step": 24450 + }, + { + "epoch": 3.9893964110929856, + "grad_norm": 1.3627238273620605, + "learning_rate": 3.758128099445271e-05, + "loss": 0.1438, + "num_input_tokens_seen": 52791776, + "step": 24455 + }, + { + "epoch": 3.9902120717781404, + "grad_norm": 0.12167369574308395, + "learning_rate": 3.757512955035804e-05, + "loss": 0.0941, + "num_input_tokens_seen": 52802528, + "step": 24460 + }, + { + "epoch": 3.9910277324632952, + "grad_norm": 0.3350487947463989, + "learning_rate": 3.7568977086859566e-05, + "loss": 0.1232, + "num_input_tokens_seen": 52812640, + "step": 24465 + }, + { + "epoch": 3.99184339314845, + "grad_norm": 0.4135972559452057, + "learning_rate": 3.7562823604456035e-05, + "loss": 0.1266, + "num_input_tokens_seen": 52823744, + "step": 24470 + }, + { + "epoch": 3.9926590538336053, + "grad_norm": 1.1936187744140625, + "learning_rate": 3.7556669103646266e-05, + "loss": 0.0618, + "num_input_tokens_seen": 52834016, + "step": 24475 + }, + { + "epoch": 3.99347471451876, + "grad_norm": 1.8035624027252197, + "learning_rate": 3.75505135849292e-05, + "loss": 0.1936, + "num_input_tokens_seen": 52845248, + "step": 24480 + }, + { + "epoch": 3.9942903752039154, + "grad_norm": 0.4107532799243927, + "learning_rate": 3.7544357048803824e-05, + "loss": 0.0477, + "num_input_tokens_seen": 52855104, + "step": 24485 + }, + { + "epoch": 3.99510603588907, + "grad_norm": 0.2738112509250641, + "learning_rate": 3.7538199495769214e-05, + "loss": 0.0617, + "num_input_tokens_seen": 52866048, + "step": 24490 + }, + { + "epoch": 3.995921696574225, + "grad_norm": 0.11858531087636948, + "learning_rate": 3.753204092632454e-05, + "loss": 0.0503, + "num_input_tokens_seen": 52876768, + "step": 24495 + }, + { + "epoch": 3.99673735725938, + "grad_norm": 0.19328926503658295, + "learning_rate": 3.752588134096903e-05, + "loss": 0.0898, + "num_input_tokens_seen": 52887872, + "step": 24500 + }, + { + "epoch": 3.997553017944535, + "grad_norm": 0.6338810324668884, + "learning_rate": 3.751972074020202e-05, + "loss": 0.1215, + "num_input_tokens_seen": 52898624, + "step": 24505 + }, + { + "epoch": 3.99836867862969, + "grad_norm": 0.40014877915382385, + "learning_rate": 3.751355912452294e-05, + "loss": 0.098, + "num_input_tokens_seen": 52908768, + "step": 24510 + }, + { + "epoch": 3.999184339314845, + "grad_norm": 0.07681459933519363, + "learning_rate": 3.7507396494431246e-05, + "loss": 0.0524, + "num_input_tokens_seen": 52920064, + "step": 24515 + }, + { + "epoch": 4.0, + "grad_norm": 0.05157363414764404, + "learning_rate": 3.750123285042654e-05, + "loss": 0.0207, + "num_input_tokens_seen": 52929744, + "step": 24520 + }, + { + "epoch": 4.0, + "eval_loss": 0.1367577761411667, + "eval_runtime": 131.7973, + "eval_samples_per_second": 20.676, + "eval_steps_per_second": 5.175, + "num_input_tokens_seen": 52929744, + "step": 24520 + }, + { + "epoch": 4.000815660685155, + "grad_norm": 1.2774838209152222, + "learning_rate": 3.749506819300846e-05, + "loss": 0.1715, + "num_input_tokens_seen": 52941840, + "step": 24525 + }, + { + "epoch": 4.00163132137031, + "grad_norm": 0.31644344329833984, + "learning_rate": 3.748890252267676e-05, + "loss": 0.1315, + "num_input_tokens_seen": 52952560, + "step": 24530 + }, + { + "epoch": 4.002446982055465, + "grad_norm": 0.4536781907081604, + "learning_rate": 3.748273583993126e-05, + "loss": 0.1468, + "num_input_tokens_seen": 52961968, + "step": 24535 + }, + { + "epoch": 4.00326264274062, + "grad_norm": 1.9920074939727783, + "learning_rate": 3.747656814527185e-05, + "loss": 0.1758, + "num_input_tokens_seen": 52971408, + "step": 24540 + }, + { + "epoch": 4.004078303425775, + "grad_norm": 0.06796783208847046, + "learning_rate": 3.747039943919852e-05, + "loss": 0.1995, + "num_input_tokens_seen": 52982832, + "step": 24545 + }, + { + "epoch": 4.00489396411093, + "grad_norm": 1.2921401262283325, + "learning_rate": 3.746422972221134e-05, + "loss": 0.1012, + "num_input_tokens_seen": 52993648, + "step": 24550 + }, + { + "epoch": 4.005709624796085, + "grad_norm": 0.22168010473251343, + "learning_rate": 3.745805899481045e-05, + "loss": 0.1607, + "num_input_tokens_seen": 53004272, + "step": 24555 + }, + { + "epoch": 4.006525285481239, + "grad_norm": 1.0081449747085571, + "learning_rate": 3.745188725749609e-05, + "loss": 0.2114, + "num_input_tokens_seen": 53013872, + "step": 24560 + }, + { + "epoch": 4.007340946166395, + "grad_norm": 0.09465651214122772, + "learning_rate": 3.744571451076856e-05, + "loss": 0.0418, + "num_input_tokens_seen": 53024912, + "step": 24565 + }, + { + "epoch": 4.00815660685155, + "grad_norm": 1.807223916053772, + "learning_rate": 3.7439540755128276e-05, + "loss": 0.1907, + "num_input_tokens_seen": 53037360, + "step": 24570 + }, + { + "epoch": 4.008972267536705, + "grad_norm": 0.9933118224143982, + "learning_rate": 3.7433365991075695e-05, + "loss": 0.1806, + "num_input_tokens_seen": 53048688, + "step": 24575 + }, + { + "epoch": 4.00978792822186, + "grad_norm": 0.540823221206665, + "learning_rate": 3.742719021911138e-05, + "loss": 0.0761, + "num_input_tokens_seen": 53059568, + "step": 24580 + }, + { + "epoch": 4.010603588907014, + "grad_norm": 0.6057183146476746, + "learning_rate": 3.742101343973598e-05, + "loss": 0.0862, + "num_input_tokens_seen": 53071312, + "step": 24585 + }, + { + "epoch": 4.011419249592169, + "grad_norm": 1.1052379608154297, + "learning_rate": 3.741483565345019e-05, + "loss": 0.1233, + "num_input_tokens_seen": 53082544, + "step": 24590 + }, + { + "epoch": 4.012234910277325, + "grad_norm": 0.08853403478860855, + "learning_rate": 3.740865686075484e-05, + "loss": 0.0311, + "num_input_tokens_seen": 53093168, + "step": 24595 + }, + { + "epoch": 4.01305057096248, + "grad_norm": 0.12357772141695023, + "learning_rate": 3.7402477062150795e-05, + "loss": 0.1771, + "num_input_tokens_seen": 53104688, + "step": 24600 + }, + { + "epoch": 4.013866231647635, + "grad_norm": 0.30767208337783813, + "learning_rate": 3.739629625813904e-05, + "loss": 0.1878, + "num_input_tokens_seen": 53116528, + "step": 24605 + }, + { + "epoch": 4.014681892332789, + "grad_norm": 0.5002086758613586, + "learning_rate": 3.739011444922061e-05, + "loss": 0.1391, + "num_input_tokens_seen": 53128016, + "step": 24610 + }, + { + "epoch": 4.015497553017944, + "grad_norm": 0.1352892816066742, + "learning_rate": 3.7383931635896634e-05, + "loss": 0.0809, + "num_input_tokens_seen": 53139120, + "step": 24615 + }, + { + "epoch": 4.0163132137031, + "grad_norm": 0.1858925223350525, + "learning_rate": 3.737774781866833e-05, + "loss": 0.0412, + "num_input_tokens_seen": 53149392, + "step": 24620 + }, + { + "epoch": 4.017128874388255, + "grad_norm": 1.5910470485687256, + "learning_rate": 3.737156299803698e-05, + "loss": 0.1066, + "num_input_tokens_seen": 53160304, + "step": 24625 + }, + { + "epoch": 4.0179445350734095, + "grad_norm": 0.5914395451545715, + "learning_rate": 3.7365377174503956e-05, + "loss": 0.1207, + "num_input_tokens_seen": 53171248, + "step": 24630 + }, + { + "epoch": 4.018760195758564, + "grad_norm": 0.6934479475021362, + "learning_rate": 3.7359190348570726e-05, + "loss": 0.1961, + "num_input_tokens_seen": 53183312, + "step": 24635 + }, + { + "epoch": 4.019575856443719, + "grad_norm": 0.08266226947307587, + "learning_rate": 3.735300252073881e-05, + "loss": 0.1611, + "num_input_tokens_seen": 53194192, + "step": 24640 + }, + { + "epoch": 4.020391517128874, + "grad_norm": 0.3198448717594147, + "learning_rate": 3.734681369150983e-05, + "loss": 0.3379, + "num_input_tokens_seen": 53205296, + "step": 24645 + }, + { + "epoch": 4.02120717781403, + "grad_norm": 0.12528522312641144, + "learning_rate": 3.7340623861385496e-05, + "loss": 0.1085, + "num_input_tokens_seen": 53215632, + "step": 24650 + }, + { + "epoch": 4.0220228384991845, + "grad_norm": 0.39002397656440735, + "learning_rate": 3.7334433030867564e-05, + "loss": 0.1415, + "num_input_tokens_seen": 53226704, + "step": 24655 + }, + { + "epoch": 4.022838499184339, + "grad_norm": 0.08449193090200424, + "learning_rate": 3.732824120045791e-05, + "loss": 0.0388, + "num_input_tokens_seen": 53237328, + "step": 24660 + }, + { + "epoch": 4.023654159869494, + "grad_norm": 0.2958545386791229, + "learning_rate": 3.732204837065847e-05, + "loss": 0.0641, + "num_input_tokens_seen": 53248912, + "step": 24665 + }, + { + "epoch": 4.024469820554649, + "grad_norm": 0.08546962589025497, + "learning_rate": 3.731585454197127e-05, + "loss": 0.0761, + "num_input_tokens_seen": 53260528, + "step": 24670 + }, + { + "epoch": 4.025285481239805, + "grad_norm": 0.6369078755378723, + "learning_rate": 3.7309659714898404e-05, + "loss": 0.1144, + "num_input_tokens_seen": 53272016, + "step": 24675 + }, + { + "epoch": 4.0261011419249595, + "grad_norm": 0.9323785901069641, + "learning_rate": 3.730346388994207e-05, + "loss": 0.0973, + "num_input_tokens_seen": 53282640, + "step": 24680 + }, + { + "epoch": 4.026916802610114, + "grad_norm": 0.023752475157380104, + "learning_rate": 3.729726706760452e-05, + "loss": 0.0967, + "num_input_tokens_seen": 53293904, + "step": 24685 + }, + { + "epoch": 4.027732463295269, + "grad_norm": 0.07631927728652954, + "learning_rate": 3.729106924838812e-05, + "loss": 0.0524, + "num_input_tokens_seen": 53305904, + "step": 24690 + }, + { + "epoch": 4.028548123980424, + "grad_norm": 0.08418624848127365, + "learning_rate": 3.728487043279527e-05, + "loss": 0.0302, + "num_input_tokens_seen": 53316528, + "step": 24695 + }, + { + "epoch": 4.029363784665579, + "grad_norm": 0.8092179298400879, + "learning_rate": 3.727867062132849e-05, + "loss": 0.1365, + "num_input_tokens_seen": 53328464, + "step": 24700 + }, + { + "epoch": 4.0301794453507345, + "grad_norm": 0.8613394498825073, + "learning_rate": 3.7272469814490376e-05, + "loss": 0.1359, + "num_input_tokens_seen": 53339440, + "step": 24705 + }, + { + "epoch": 4.030995106035889, + "grad_norm": 0.1917508989572525, + "learning_rate": 3.726626801278358e-05, + "loss": 0.1184, + "num_input_tokens_seen": 53350000, + "step": 24710 + }, + { + "epoch": 4.031810766721044, + "grad_norm": 0.06437835842370987, + "learning_rate": 3.726006521671086e-05, + "loss": 0.0423, + "num_input_tokens_seen": 53360528, + "step": 24715 + }, + { + "epoch": 4.032626427406199, + "grad_norm": 0.06264664232730865, + "learning_rate": 3.7253861426775056e-05, + "loss": 0.0736, + "num_input_tokens_seen": 53371600, + "step": 24720 + }, + { + "epoch": 4.033442088091354, + "grad_norm": 0.33831343054771423, + "learning_rate": 3.7247656643479064e-05, + "loss": 0.107, + "num_input_tokens_seen": 53382032, + "step": 24725 + }, + { + "epoch": 4.034257748776509, + "grad_norm": 0.6865877509117126, + "learning_rate": 3.724145086732588e-05, + "loss": 0.2081, + "num_input_tokens_seen": 53392816, + "step": 24730 + }, + { + "epoch": 4.035073409461664, + "grad_norm": 0.4536660611629486, + "learning_rate": 3.7235244098818576e-05, + "loss": 0.0766, + "num_input_tokens_seen": 53404624, + "step": 24735 + }, + { + "epoch": 4.035889070146819, + "grad_norm": 1.0716814994812012, + "learning_rate": 3.722903633846031e-05, + "loss": 0.1844, + "num_input_tokens_seen": 53416048, + "step": 24740 + }, + { + "epoch": 4.036704730831974, + "grad_norm": 1.0233968496322632, + "learning_rate": 3.72228275867543e-05, + "loss": 0.0652, + "num_input_tokens_seen": 53426704, + "step": 24745 + }, + { + "epoch": 4.037520391517129, + "grad_norm": 1.786224365234375, + "learning_rate": 3.721661784420387e-05, + "loss": 0.1535, + "num_input_tokens_seen": 53439248, + "step": 24750 + }, + { + "epoch": 4.0383360522022835, + "grad_norm": 0.6464876532554626, + "learning_rate": 3.721040711131242e-05, + "loss": 0.2456, + "num_input_tokens_seen": 53448400, + "step": 24755 + }, + { + "epoch": 4.039151712887439, + "grad_norm": 0.9258679151535034, + "learning_rate": 3.72041953885834e-05, + "loss": 0.2706, + "num_input_tokens_seen": 53460336, + "step": 24760 + }, + { + "epoch": 4.039967373572594, + "grad_norm": 1.335443139076233, + "learning_rate": 3.719798267652038e-05, + "loss": 0.1818, + "num_input_tokens_seen": 53471568, + "step": 24765 + }, + { + "epoch": 4.040783034257749, + "grad_norm": 0.18895019590854645, + "learning_rate": 3.719176897562701e-05, + "loss": 0.0888, + "num_input_tokens_seen": 53483568, + "step": 24770 + }, + { + "epoch": 4.041598694942904, + "grad_norm": 0.7087806463241577, + "learning_rate": 3.718555428640697e-05, + "loss": 0.1203, + "num_input_tokens_seen": 53492848, + "step": 24775 + }, + { + "epoch": 4.0424143556280585, + "grad_norm": 1.3410437107086182, + "learning_rate": 3.717933860936407e-05, + "loss": 0.1916, + "num_input_tokens_seen": 53502992, + "step": 24780 + }, + { + "epoch": 4.043230016313213, + "grad_norm": 0.8567341566085815, + "learning_rate": 3.7173121945002197e-05, + "loss": 0.1341, + "num_input_tokens_seen": 53513264, + "step": 24785 + }, + { + "epoch": 4.044045676998369, + "grad_norm": 0.41637760400772095, + "learning_rate": 3.716690429382529e-05, + "loss": 0.1064, + "num_input_tokens_seen": 53523984, + "step": 24790 + }, + { + "epoch": 4.044861337683524, + "grad_norm": 0.2588373124599457, + "learning_rate": 3.716068565633738e-05, + "loss": 0.1545, + "num_input_tokens_seen": 53534672, + "step": 24795 + }, + { + "epoch": 4.045676998368679, + "grad_norm": 0.23870563507080078, + "learning_rate": 3.715446603304259e-05, + "loss": 0.0974, + "num_input_tokens_seen": 53546224, + "step": 24800 + }, + { + "epoch": 4.0464926590538335, + "grad_norm": 0.09742946177721024, + "learning_rate": 3.7148245424445114e-05, + "loss": 0.0222, + "num_input_tokens_seen": 53555952, + "step": 24805 + }, + { + "epoch": 4.047308319738988, + "grad_norm": 0.1346934288740158, + "learning_rate": 3.7142023831049226e-05, + "loss": 0.1104, + "num_input_tokens_seen": 53567024, + "step": 24810 + }, + { + "epoch": 4.048123980424143, + "grad_norm": 0.21870392560958862, + "learning_rate": 3.713580125335928e-05, + "loss": 0.0687, + "num_input_tokens_seen": 53578032, + "step": 24815 + }, + { + "epoch": 4.048939641109299, + "grad_norm": 0.46705853939056396, + "learning_rate": 3.7129577691879694e-05, + "loss": 0.1212, + "num_input_tokens_seen": 53589968, + "step": 24820 + }, + { + "epoch": 4.049755301794454, + "grad_norm": 0.35032111406326294, + "learning_rate": 3.712335314711501e-05, + "loss": 0.0526, + "num_input_tokens_seen": 53599472, + "step": 24825 + }, + { + "epoch": 4.0505709624796085, + "grad_norm": 0.04890364408493042, + "learning_rate": 3.7117127619569796e-05, + "loss": 0.1931, + "num_input_tokens_seen": 53611376, + "step": 24830 + }, + { + "epoch": 4.051386623164763, + "grad_norm": 0.8556477427482605, + "learning_rate": 3.7110901109748745e-05, + "loss": 0.0732, + "num_input_tokens_seen": 53623760, + "step": 24835 + }, + { + "epoch": 4.052202283849918, + "grad_norm": 1.8577442169189453, + "learning_rate": 3.710467361815659e-05, + "loss": 0.3333, + "num_input_tokens_seen": 53634416, + "step": 24840 + }, + { + "epoch": 4.053017944535074, + "grad_norm": 0.05822547897696495, + "learning_rate": 3.709844514529818e-05, + "loss": 0.0872, + "num_input_tokens_seen": 53644528, + "step": 24845 + }, + { + "epoch": 4.053833605220229, + "grad_norm": 0.3316808342933655, + "learning_rate": 3.709221569167842e-05, + "loss": 0.083, + "num_input_tokens_seen": 53654480, + "step": 24850 + }, + { + "epoch": 4.054649265905383, + "grad_norm": 0.08146774768829346, + "learning_rate": 3.70859852578023e-05, + "loss": 0.1873, + "num_input_tokens_seen": 53665136, + "step": 24855 + }, + { + "epoch": 4.055464926590538, + "grad_norm": 0.2368049919605255, + "learning_rate": 3.70797538441749e-05, + "loss": 0.1768, + "num_input_tokens_seen": 53677104, + "step": 24860 + }, + { + "epoch": 4.056280587275693, + "grad_norm": 0.32325300574302673, + "learning_rate": 3.707352145130135e-05, + "loss": 0.0556, + "num_input_tokens_seen": 53688016, + "step": 24865 + }, + { + "epoch": 4.057096247960848, + "grad_norm": 0.4269058108329773, + "learning_rate": 3.706728807968689e-05, + "loss": 0.1468, + "num_input_tokens_seen": 53698544, + "step": 24870 + }, + { + "epoch": 4.057911908646004, + "grad_norm": 0.8478068709373474, + "learning_rate": 3.706105372983683e-05, + "loss": 0.0717, + "num_input_tokens_seen": 53710000, + "step": 24875 + }, + { + "epoch": 4.058727569331158, + "grad_norm": 0.08615349978208542, + "learning_rate": 3.705481840225656e-05, + "loss": 0.0809, + "num_input_tokens_seen": 53720656, + "step": 24880 + }, + { + "epoch": 4.059543230016313, + "grad_norm": 0.11225626617670059, + "learning_rate": 3.704858209745155e-05, + "loss": 0.0742, + "num_input_tokens_seen": 53731280, + "step": 24885 + }, + { + "epoch": 4.060358890701468, + "grad_norm": 0.6187756061553955, + "learning_rate": 3.704234481592733e-05, + "loss": 0.1049, + "num_input_tokens_seen": 53741712, + "step": 24890 + }, + { + "epoch": 4.061174551386623, + "grad_norm": 0.2808443307876587, + "learning_rate": 3.703610655818955e-05, + "loss": 0.0917, + "num_input_tokens_seen": 53753360, + "step": 24895 + }, + { + "epoch": 4.061990212071779, + "grad_norm": 1.3474351167678833, + "learning_rate": 3.702986732474389e-05, + "loss": 0.1719, + "num_input_tokens_seen": 53764112, + "step": 24900 + }, + { + "epoch": 4.062805872756933, + "grad_norm": 0.1584075391292572, + "learning_rate": 3.702362711609615e-05, + "loss": 0.0977, + "num_input_tokens_seen": 53774320, + "step": 24905 + }, + { + "epoch": 4.063621533442088, + "grad_norm": 0.12264853715896606, + "learning_rate": 3.701738593275219e-05, + "loss": 0.1393, + "num_input_tokens_seen": 53784848, + "step": 24910 + }, + { + "epoch": 4.064437194127243, + "grad_norm": 0.1146339401602745, + "learning_rate": 3.701114377521795e-05, + "loss": 0.0893, + "num_input_tokens_seen": 53795632, + "step": 24915 + }, + { + "epoch": 4.065252854812398, + "grad_norm": 0.4898930788040161, + "learning_rate": 3.700490064399945e-05, + "loss": 0.1272, + "num_input_tokens_seen": 53806480, + "step": 24920 + }, + { + "epoch": 4.066068515497553, + "grad_norm": 0.45654791593551636, + "learning_rate": 3.6998656539602795e-05, + "loss": 0.0658, + "num_input_tokens_seen": 53816816, + "step": 24925 + }, + { + "epoch": 4.066884176182708, + "grad_norm": 0.08093341439962387, + "learning_rate": 3.699241146253416e-05, + "loss": 0.1253, + "num_input_tokens_seen": 53828720, + "step": 24930 + }, + { + "epoch": 4.067699836867863, + "grad_norm": 0.36073917150497437, + "learning_rate": 3.69861654132998e-05, + "loss": 0.1087, + "num_input_tokens_seen": 53840816, + "step": 24935 + }, + { + "epoch": 4.068515497553018, + "grad_norm": 0.5552253723144531, + "learning_rate": 3.6979918392406055e-05, + "loss": 0.0777, + "num_input_tokens_seen": 53851120, + "step": 24940 + }, + { + "epoch": 4.069331158238173, + "grad_norm": 0.4011973440647125, + "learning_rate": 3.697367040035934e-05, + "loss": 0.1369, + "num_input_tokens_seen": 53862288, + "step": 24945 + }, + { + "epoch": 4.070146818923328, + "grad_norm": 1.221205472946167, + "learning_rate": 3.696742143766615e-05, + "loss": 0.2403, + "num_input_tokens_seen": 53873584, + "step": 24950 + }, + { + "epoch": 4.0709624796084825, + "grad_norm": 1.2464250326156616, + "learning_rate": 3.696117150483306e-05, + "loss": 0.2838, + "num_input_tokens_seen": 53885328, + "step": 24955 + }, + { + "epoch": 4.071778140293638, + "grad_norm": 0.5011881589889526, + "learning_rate": 3.695492060236671e-05, + "loss": 0.0233, + "num_input_tokens_seen": 53895376, + "step": 24960 + }, + { + "epoch": 4.072593800978793, + "grad_norm": 0.22178144752979279, + "learning_rate": 3.694866873077384e-05, + "loss": 0.0981, + "num_input_tokens_seen": 53906640, + "step": 24965 + }, + { + "epoch": 4.073409461663948, + "grad_norm": 0.9080899357795715, + "learning_rate": 3.6942415890561254e-05, + "loss": 0.1672, + "num_input_tokens_seen": 53917264, + "step": 24970 + }, + { + "epoch": 4.074225122349103, + "grad_norm": 0.15988712012767792, + "learning_rate": 3.6936162082235844e-05, + "loss": 0.1748, + "num_input_tokens_seen": 53927984, + "step": 24975 + }, + { + "epoch": 4.075040783034257, + "grad_norm": 0.2903548777103424, + "learning_rate": 3.692990730630457e-05, + "loss": 0.0575, + "num_input_tokens_seen": 53938672, + "step": 24980 + }, + { + "epoch": 4.075856443719413, + "grad_norm": 0.858127236366272, + "learning_rate": 3.692365156327448e-05, + "loss": 0.1001, + "num_input_tokens_seen": 53951120, + "step": 24985 + }, + { + "epoch": 4.076672104404568, + "grad_norm": 0.3399229049682617, + "learning_rate": 3.691739485365269e-05, + "loss": 0.0857, + "num_input_tokens_seen": 53962192, + "step": 24990 + }, + { + "epoch": 4.077487765089723, + "grad_norm": 0.6332297325134277, + "learning_rate": 3.691113717794641e-05, + "loss": 0.2119, + "num_input_tokens_seen": 53973488, + "step": 24995 + }, + { + "epoch": 4.078303425774878, + "grad_norm": 0.23857589066028595, + "learning_rate": 3.6904878536662904e-05, + "loss": 0.1418, + "num_input_tokens_seen": 53984272, + "step": 25000 + }, + { + "epoch": 4.079119086460032, + "grad_norm": 1.9001691341400146, + "learning_rate": 3.6898618930309556e-05, + "loss": 0.2021, + "num_input_tokens_seen": 53994704, + "step": 25005 + }, + { + "epoch": 4.079934747145187, + "grad_norm": 0.11337598413228989, + "learning_rate": 3.6892358359393767e-05, + "loss": 0.1006, + "num_input_tokens_seen": 54005552, + "step": 25010 + }, + { + "epoch": 4.080750407830343, + "grad_norm": 0.4456023573875427, + "learning_rate": 3.688609682442308e-05, + "loss": 0.0402, + "num_input_tokens_seen": 54016560, + "step": 25015 + }, + { + "epoch": 4.081566068515498, + "grad_norm": 0.9385554790496826, + "learning_rate": 3.687983432590507e-05, + "loss": 0.1313, + "num_input_tokens_seen": 54026736, + "step": 25020 + }, + { + "epoch": 4.082381729200653, + "grad_norm": 0.8946731090545654, + "learning_rate": 3.6873570864347415e-05, + "loss": 0.197, + "num_input_tokens_seen": 54036592, + "step": 25025 + }, + { + "epoch": 4.083197389885807, + "grad_norm": 0.6972407698631287, + "learning_rate": 3.686730644025786e-05, + "loss": 0.3495, + "num_input_tokens_seen": 54046864, + "step": 25030 + }, + { + "epoch": 4.084013050570962, + "grad_norm": 0.7378352880477905, + "learning_rate": 3.686104105414423e-05, + "loss": 0.0526, + "num_input_tokens_seen": 54058448, + "step": 25035 + }, + { + "epoch": 4.084828711256117, + "grad_norm": 1.850986361503601, + "learning_rate": 3.6854774706514424e-05, + "loss": 0.2452, + "num_input_tokens_seen": 54068848, + "step": 25040 + }, + { + "epoch": 4.085644371941273, + "grad_norm": 0.17961834371089935, + "learning_rate": 3.684850739787644e-05, + "loss": 0.2734, + "num_input_tokens_seen": 54079504, + "step": 25045 + }, + { + "epoch": 4.0864600326264275, + "grad_norm": 0.1486385017633438, + "learning_rate": 3.684223912873832e-05, + "loss": 0.0703, + "num_input_tokens_seen": 54089424, + "step": 25050 + }, + { + "epoch": 4.087275693311582, + "grad_norm": 0.10096079856157303, + "learning_rate": 3.683596989960821e-05, + "loss": 0.0443, + "num_input_tokens_seen": 54100848, + "step": 25055 + }, + { + "epoch": 4.088091353996737, + "grad_norm": 0.8930910229682922, + "learning_rate": 3.682969971099433e-05, + "loss": 0.098, + "num_input_tokens_seen": 54111376, + "step": 25060 + }, + { + "epoch": 4.088907014681892, + "grad_norm": 0.5028018355369568, + "learning_rate": 3.682342856340496e-05, + "loss": 0.0416, + "num_input_tokens_seen": 54123408, + "step": 25065 + }, + { + "epoch": 4.089722675367048, + "grad_norm": 0.21989706158638, + "learning_rate": 3.681715645734848e-05, + "loss": 0.0982, + "num_input_tokens_seen": 54133296, + "step": 25070 + }, + { + "epoch": 4.0905383360522025, + "grad_norm": 0.09897215664386749, + "learning_rate": 3.681088339333334e-05, + "loss": 0.0598, + "num_input_tokens_seen": 54144240, + "step": 25075 + }, + { + "epoch": 4.091353996737357, + "grad_norm": 0.07659962773323059, + "learning_rate": 3.680460937186807e-05, + "loss": 0.0679, + "num_input_tokens_seen": 54157008, + "step": 25080 + }, + { + "epoch": 4.092169657422512, + "grad_norm": 0.24242481589317322, + "learning_rate": 3.679833439346126e-05, + "loss": 0.1199, + "num_input_tokens_seen": 54167792, + "step": 25085 + }, + { + "epoch": 4.092985318107667, + "grad_norm": 0.35850679874420166, + "learning_rate": 3.6792058458621607e-05, + "loss": 0.1049, + "num_input_tokens_seen": 54178672, + "step": 25090 + }, + { + "epoch": 4.093800978792822, + "grad_norm": 0.07504113018512726, + "learning_rate": 3.678578156785786e-05, + "loss": 0.0757, + "num_input_tokens_seen": 54190288, + "step": 25095 + }, + { + "epoch": 4.0946166394779775, + "grad_norm": 1.3511338233947754, + "learning_rate": 3.677950372167885e-05, + "loss": 0.1637, + "num_input_tokens_seen": 54199728, + "step": 25100 + }, + { + "epoch": 4.095432300163132, + "grad_norm": 0.3655468821525574, + "learning_rate": 3.677322492059352e-05, + "loss": 0.0512, + "num_input_tokens_seen": 54209840, + "step": 25105 + }, + { + "epoch": 4.096247960848287, + "grad_norm": 0.02954595908522606, + "learning_rate": 3.676694516511083e-05, + "loss": 0.1784, + "num_input_tokens_seen": 54219856, + "step": 25110 + }, + { + "epoch": 4.097063621533442, + "grad_norm": 0.08775818347930908, + "learning_rate": 3.676066445573986e-05, + "loss": 0.0895, + "num_input_tokens_seen": 54230096, + "step": 25115 + }, + { + "epoch": 4.097879282218597, + "grad_norm": 0.11006388813257217, + "learning_rate": 3.675438279298975e-05, + "loss": 0.0347, + "num_input_tokens_seen": 54241840, + "step": 25120 + }, + { + "epoch": 4.0986949429037525, + "grad_norm": 0.8215053677558899, + "learning_rate": 3.674810017736974e-05, + "loss": 0.0863, + "num_input_tokens_seen": 54251920, + "step": 25125 + }, + { + "epoch": 4.099510603588907, + "grad_norm": 0.20904460549354553, + "learning_rate": 3.674181660938911e-05, + "loss": 0.1209, + "num_input_tokens_seen": 54263344, + "step": 25130 + }, + { + "epoch": 4.100326264274062, + "grad_norm": 0.8680446147918701, + "learning_rate": 3.6735532089557256e-05, + "loss": 0.1413, + "num_input_tokens_seen": 54274384, + "step": 25135 + }, + { + "epoch": 4.101141924959217, + "grad_norm": 0.09279170632362366, + "learning_rate": 3.672924661838362e-05, + "loss": 0.052, + "num_input_tokens_seen": 54285296, + "step": 25140 + }, + { + "epoch": 4.101957585644372, + "grad_norm": 0.3266103267669678, + "learning_rate": 3.672296019637774e-05, + "loss": 0.1243, + "num_input_tokens_seen": 54295504, + "step": 25145 + }, + { + "epoch": 4.102773246329527, + "grad_norm": 0.6176956295967102, + "learning_rate": 3.6716672824049234e-05, + "loss": 0.1165, + "num_input_tokens_seen": 54306704, + "step": 25150 + }, + { + "epoch": 4.103588907014682, + "grad_norm": 0.024190787225961685, + "learning_rate": 3.671038450190777e-05, + "loss": 0.1621, + "num_input_tokens_seen": 54316720, + "step": 25155 + }, + { + "epoch": 4.104404567699837, + "grad_norm": 0.9360352158546448, + "learning_rate": 3.670409523046312e-05, + "loss": 0.1422, + "num_input_tokens_seen": 54327408, + "step": 25160 + }, + { + "epoch": 4.105220228384992, + "grad_norm": 0.46476319432258606, + "learning_rate": 3.669780501022513e-05, + "loss": 0.0939, + "num_input_tokens_seen": 54337104, + "step": 25165 + }, + { + "epoch": 4.106035889070147, + "grad_norm": 0.3910299241542816, + "learning_rate": 3.669151384170371e-05, + "loss": 0.2314, + "num_input_tokens_seen": 54347408, + "step": 25170 + }, + { + "epoch": 4.1068515497553015, + "grad_norm": 1.3800137042999268, + "learning_rate": 3.668522172540886e-05, + "loss": 0.111, + "num_input_tokens_seen": 54357264, + "step": 25175 + }, + { + "epoch": 4.107667210440456, + "grad_norm": 0.7701561450958252, + "learning_rate": 3.667892866185064e-05, + "loss": 0.1522, + "num_input_tokens_seen": 54368816, + "step": 25180 + }, + { + "epoch": 4.108482871125612, + "grad_norm": 0.352112740278244, + "learning_rate": 3.6672634651539205e-05, + "loss": 0.1338, + "num_input_tokens_seen": 54380848, + "step": 25185 + }, + { + "epoch": 4.109298531810767, + "grad_norm": 0.4230813980102539, + "learning_rate": 3.6666339694984785e-05, + "loss": 0.0795, + "num_input_tokens_seen": 54391760, + "step": 25190 + }, + { + "epoch": 4.110114192495922, + "grad_norm": 0.41941729187965393, + "learning_rate": 3.666004379269766e-05, + "loss": 0.247, + "num_input_tokens_seen": 54402384, + "step": 25195 + }, + { + "epoch": 4.1109298531810765, + "grad_norm": 0.19789518415927887, + "learning_rate": 3.665374694518824e-05, + "loss": 0.0426, + "num_input_tokens_seen": 54413680, + "step": 25200 + }, + { + "epoch": 4.111745513866231, + "grad_norm": 1.7383402585983276, + "learning_rate": 3.664744915296695e-05, + "loss": 0.1499, + "num_input_tokens_seen": 54422640, + "step": 25205 + }, + { + "epoch": 4.112561174551387, + "grad_norm": 1.1434688568115234, + "learning_rate": 3.664115041654434e-05, + "loss": 0.2142, + "num_input_tokens_seen": 54434576, + "step": 25210 + }, + { + "epoch": 4.113376835236542, + "grad_norm": 0.11980898678302765, + "learning_rate": 3.663485073643102e-05, + "loss": 0.0409, + "num_input_tokens_seen": 54444272, + "step": 25215 + }, + { + "epoch": 4.114192495921697, + "grad_norm": 0.5465552806854248, + "learning_rate": 3.6628550113137635e-05, + "loss": 0.0947, + "num_input_tokens_seen": 54456016, + "step": 25220 + }, + { + "epoch": 4.1150081566068515, + "grad_norm": 0.03726860135793686, + "learning_rate": 3.6622248547175e-05, + "loss": 0.0202, + "num_input_tokens_seen": 54466448, + "step": 25225 + }, + { + "epoch": 4.115823817292006, + "grad_norm": 0.4719489514827728, + "learning_rate": 3.661594603905392e-05, + "loss": 0.15, + "num_input_tokens_seen": 54476592, + "step": 25230 + }, + { + "epoch": 4.116639477977161, + "grad_norm": 0.032816484570503235, + "learning_rate": 3.660964258928532e-05, + "loss": 0.1095, + "num_input_tokens_seen": 54488048, + "step": 25235 + }, + { + "epoch": 4.117455138662317, + "grad_norm": 0.35289138555526733, + "learning_rate": 3.660333819838018e-05, + "loss": 0.0758, + "num_input_tokens_seen": 54499120, + "step": 25240 + }, + { + "epoch": 4.118270799347472, + "grad_norm": 1.2858176231384277, + "learning_rate": 3.659703286684957e-05, + "loss": 0.1721, + "num_input_tokens_seen": 54510576, + "step": 25245 + }, + { + "epoch": 4.1190864600326265, + "grad_norm": 0.16514073312282562, + "learning_rate": 3.659072659520463e-05, + "loss": 0.1381, + "num_input_tokens_seen": 54522160, + "step": 25250 + }, + { + "epoch": 4.119902120717781, + "grad_norm": 0.10417579859495163, + "learning_rate": 3.658441938395659e-05, + "loss": 0.0233, + "num_input_tokens_seen": 54533872, + "step": 25255 + }, + { + "epoch": 4.120717781402936, + "grad_norm": 0.49221304059028625, + "learning_rate": 3.6578111233616726e-05, + "loss": 0.153, + "num_input_tokens_seen": 54545200, + "step": 25260 + }, + { + "epoch": 4.121533442088092, + "grad_norm": 0.19402079284191132, + "learning_rate": 3.657180214469643e-05, + "loss": 0.1173, + "num_input_tokens_seen": 54554960, + "step": 25265 + }, + { + "epoch": 4.122349102773247, + "grad_norm": 0.41792765259742737, + "learning_rate": 3.656549211770713e-05, + "loss": 0.1701, + "num_input_tokens_seen": 54565456, + "step": 25270 + }, + { + "epoch": 4.123164763458401, + "grad_norm": 0.22035710513591766, + "learning_rate": 3.655918115316036e-05, + "loss": 0.1643, + "num_input_tokens_seen": 54575536, + "step": 25275 + }, + { + "epoch": 4.123980424143556, + "grad_norm": 0.42917466163635254, + "learning_rate": 3.655286925156772e-05, + "loss": 0.1069, + "num_input_tokens_seen": 54585296, + "step": 25280 + }, + { + "epoch": 4.124796084828711, + "grad_norm": 0.31499963998794556, + "learning_rate": 3.654655641344087e-05, + "loss": 0.1415, + "num_input_tokens_seen": 54595600, + "step": 25285 + }, + { + "epoch": 4.125611745513866, + "grad_norm": 0.10761100798845291, + "learning_rate": 3.654024263929157e-05, + "loss": 0.0919, + "num_input_tokens_seen": 54606160, + "step": 25290 + }, + { + "epoch": 4.126427406199022, + "grad_norm": 0.28989946842193604, + "learning_rate": 3.653392792963165e-05, + "loss": 0.0564, + "num_input_tokens_seen": 54617232, + "step": 25295 + }, + { + "epoch": 4.127243066884176, + "grad_norm": 1.7709081172943115, + "learning_rate": 3.652761228497301e-05, + "loss": 0.1064, + "num_input_tokens_seen": 54628688, + "step": 25300 + }, + { + "epoch": 4.128058727569331, + "grad_norm": 0.26981890201568604, + "learning_rate": 3.652129570582763e-05, + "loss": 0.1534, + "num_input_tokens_seen": 54640016, + "step": 25305 + }, + { + "epoch": 4.128874388254486, + "grad_norm": 0.08452721685171127, + "learning_rate": 3.651497819270756e-05, + "loss": 0.0813, + "num_input_tokens_seen": 54648848, + "step": 25310 + }, + { + "epoch": 4.129690048939641, + "grad_norm": 1.4110102653503418, + "learning_rate": 3.650865974612493e-05, + "loss": 0.2361, + "num_input_tokens_seen": 54659376, + "step": 25315 + }, + { + "epoch": 4.130505709624796, + "grad_norm": 0.34259700775146484, + "learning_rate": 3.650234036659195e-05, + "loss": 0.0509, + "num_input_tokens_seen": 54670928, + "step": 25320 + }, + { + "epoch": 4.131321370309951, + "grad_norm": 0.21498428285121918, + "learning_rate": 3.649602005462089e-05, + "loss": 0.1147, + "num_input_tokens_seen": 54681584, + "step": 25325 + }, + { + "epoch": 4.132137030995106, + "grad_norm": 0.08928987383842468, + "learning_rate": 3.648969881072412e-05, + "loss": 0.141, + "num_input_tokens_seen": 54692432, + "step": 25330 + }, + { + "epoch": 4.132952691680261, + "grad_norm": 0.1472281515598297, + "learning_rate": 3.648337663541407e-05, + "loss": 0.1525, + "num_input_tokens_seen": 54703152, + "step": 25335 + }, + { + "epoch": 4.133768352365416, + "grad_norm": 0.19706130027770996, + "learning_rate": 3.647705352920324e-05, + "loss": 0.0301, + "num_input_tokens_seen": 54714032, + "step": 25340 + }, + { + "epoch": 4.134584013050571, + "grad_norm": 0.8202407956123352, + "learning_rate": 3.647072949260422e-05, + "loss": 0.0752, + "num_input_tokens_seen": 54724272, + "step": 25345 + }, + { + "epoch": 4.135399673735726, + "grad_norm": 0.40144866704940796, + "learning_rate": 3.646440452612965e-05, + "loss": 0.0297, + "num_input_tokens_seen": 54734864, + "step": 25350 + }, + { + "epoch": 4.136215334420881, + "grad_norm": 0.13663925230503082, + "learning_rate": 3.645807863029229e-05, + "loss": 0.172, + "num_input_tokens_seen": 54744464, + "step": 25355 + }, + { + "epoch": 4.137030995106036, + "grad_norm": 1.2778240442276, + "learning_rate": 3.645175180560495e-05, + "loss": 0.2119, + "num_input_tokens_seen": 54754672, + "step": 25360 + }, + { + "epoch": 4.137846655791191, + "grad_norm": 0.2102060467004776, + "learning_rate": 3.644542405258049e-05, + "loss": 0.1862, + "num_input_tokens_seen": 54766256, + "step": 25365 + }, + { + "epoch": 4.138662316476346, + "grad_norm": 0.22705677151679993, + "learning_rate": 3.643909537173188e-05, + "loss": 0.084, + "num_input_tokens_seen": 54777296, + "step": 25370 + }, + { + "epoch": 4.1394779771615005, + "grad_norm": 1.310612678527832, + "learning_rate": 3.643276576357216e-05, + "loss": 0.2143, + "num_input_tokens_seen": 54787728, + "step": 25375 + }, + { + "epoch": 4.140293637846656, + "grad_norm": 0.4837663471698761, + "learning_rate": 3.642643522861444e-05, + "loss": 0.1334, + "num_input_tokens_seen": 54798864, + "step": 25380 + }, + { + "epoch": 4.141109298531811, + "grad_norm": 0.039813052862882614, + "learning_rate": 3.642010376737191e-05, + "loss": 0.072, + "num_input_tokens_seen": 54809872, + "step": 25385 + }, + { + "epoch": 4.141924959216966, + "grad_norm": 0.08044595271348953, + "learning_rate": 3.641377138035782e-05, + "loss": 0.1825, + "num_input_tokens_seen": 54819248, + "step": 25390 + }, + { + "epoch": 4.142740619902121, + "grad_norm": 0.5242161750793457, + "learning_rate": 3.640743806808551e-05, + "loss": 0.1116, + "num_input_tokens_seen": 54830448, + "step": 25395 + }, + { + "epoch": 4.143556280587275, + "grad_norm": 0.22783935070037842, + "learning_rate": 3.640110383106838e-05, + "loss": 0.104, + "num_input_tokens_seen": 54841776, + "step": 25400 + }, + { + "epoch": 4.14437194127243, + "grad_norm": 2.0598368644714355, + "learning_rate": 3.639476866981993e-05, + "loss": 0.2037, + "num_input_tokens_seen": 54851888, + "step": 25405 + }, + { + "epoch": 4.145187601957586, + "grad_norm": 1.638221263885498, + "learning_rate": 3.638843258485372e-05, + "loss": 0.2902, + "num_input_tokens_seen": 54861968, + "step": 25410 + }, + { + "epoch": 4.146003262642741, + "grad_norm": 0.22306685149669647, + "learning_rate": 3.638209557668337e-05, + "loss": 0.1308, + "num_input_tokens_seen": 54873104, + "step": 25415 + }, + { + "epoch": 4.146818923327896, + "grad_norm": 0.08311817049980164, + "learning_rate": 3.637575764582261e-05, + "loss": 0.017, + "num_input_tokens_seen": 54883600, + "step": 25420 + }, + { + "epoch": 4.14763458401305, + "grad_norm": 0.14503639936447144, + "learning_rate": 3.636941879278522e-05, + "loss": 0.0269, + "num_input_tokens_seen": 54895184, + "step": 25425 + }, + { + "epoch": 4.148450244698205, + "grad_norm": 0.26186805963516235, + "learning_rate": 3.636307901808504e-05, + "loss": 0.1287, + "num_input_tokens_seen": 54906640, + "step": 25430 + }, + { + "epoch": 4.149265905383361, + "grad_norm": 0.40495967864990234, + "learning_rate": 3.635673832223603e-05, + "loss": 0.133, + "num_input_tokens_seen": 54918192, + "step": 25435 + }, + { + "epoch": 4.150081566068516, + "grad_norm": 0.1886124610900879, + "learning_rate": 3.635039670575218e-05, + "loss": 0.1107, + "num_input_tokens_seen": 54927536, + "step": 25440 + }, + { + "epoch": 4.150897226753671, + "grad_norm": 0.33092430233955383, + "learning_rate": 3.6344054169147584e-05, + "loss": 0.1114, + "num_input_tokens_seen": 54938224, + "step": 25445 + }, + { + "epoch": 4.151712887438825, + "grad_norm": 0.6605625748634338, + "learning_rate": 3.63377107129364e-05, + "loss": 0.0424, + "num_input_tokens_seen": 54949104, + "step": 25450 + }, + { + "epoch": 4.15252854812398, + "grad_norm": 0.09767262637615204, + "learning_rate": 3.633136633763286e-05, + "loss": 0.0472, + "num_input_tokens_seen": 54960848, + "step": 25455 + }, + { + "epoch": 4.153344208809135, + "grad_norm": 1.6139311790466309, + "learning_rate": 3.632502104375127e-05, + "loss": 0.1375, + "num_input_tokens_seen": 54971600, + "step": 25460 + }, + { + "epoch": 4.154159869494291, + "grad_norm": 0.2696635127067566, + "learning_rate": 3.6318674831806e-05, + "loss": 0.0343, + "num_input_tokens_seen": 54981936, + "step": 25465 + }, + { + "epoch": 4.1549755301794455, + "grad_norm": 1.133078694343567, + "learning_rate": 3.6312327702311536e-05, + "loss": 0.1001, + "num_input_tokens_seen": 54993552, + "step": 25470 + }, + { + "epoch": 4.1557911908646, + "grad_norm": 0.1421635001897812, + "learning_rate": 3.630597965578238e-05, + "loss": 0.0718, + "num_input_tokens_seen": 55005200, + "step": 25475 + }, + { + "epoch": 4.156606851549755, + "grad_norm": 0.07731588184833527, + "learning_rate": 3.629963069273315e-05, + "loss": 0.0173, + "num_input_tokens_seen": 55016464, + "step": 25480 + }, + { + "epoch": 4.15742251223491, + "grad_norm": 0.17452064156532288, + "learning_rate": 3.6293280813678523e-05, + "loss": 0.2472, + "num_input_tokens_seen": 55026576, + "step": 25485 + }, + { + "epoch": 4.158238172920065, + "grad_norm": 0.030154768377542496, + "learning_rate": 3.628693001913325e-05, + "loss": 0.074, + "num_input_tokens_seen": 55037904, + "step": 25490 + }, + { + "epoch": 4.1590538336052205, + "grad_norm": 0.4178576171398163, + "learning_rate": 3.6280578309612165e-05, + "loss": 0.0903, + "num_input_tokens_seen": 55047408, + "step": 25495 + }, + { + "epoch": 4.159869494290375, + "grad_norm": 1.372735619544983, + "learning_rate": 3.6274225685630156e-05, + "loss": 0.1826, + "num_input_tokens_seen": 55058064, + "step": 25500 + }, + { + "epoch": 4.16068515497553, + "grad_norm": 0.42113596200942993, + "learning_rate": 3.626787214770221e-05, + "loss": 0.0644, + "num_input_tokens_seen": 55069008, + "step": 25505 + }, + { + "epoch": 4.161500815660685, + "grad_norm": 1.2067970037460327, + "learning_rate": 3.626151769634338e-05, + "loss": 0.1901, + "num_input_tokens_seen": 55079792, + "step": 25510 + }, + { + "epoch": 4.16231647634584, + "grad_norm": 0.7609421610832214, + "learning_rate": 3.6255162332068785e-05, + "loss": 0.1787, + "num_input_tokens_seen": 55090864, + "step": 25515 + }, + { + "epoch": 4.1631321370309955, + "grad_norm": 0.9008023142814636, + "learning_rate": 3.624880605539362e-05, + "loss": 0.1782, + "num_input_tokens_seen": 55101904, + "step": 25520 + }, + { + "epoch": 4.16394779771615, + "grad_norm": 1.3827122449874878, + "learning_rate": 3.6242448866833164e-05, + "loss": 0.0917, + "num_input_tokens_seen": 55112656, + "step": 25525 + }, + { + "epoch": 4.164763458401305, + "grad_norm": 0.04189121350646019, + "learning_rate": 3.623609076690275e-05, + "loss": 0.2468, + "num_input_tokens_seen": 55123024, + "step": 25530 + }, + { + "epoch": 4.16557911908646, + "grad_norm": 0.1630854308605194, + "learning_rate": 3.622973175611781e-05, + "loss": 0.1906, + "num_input_tokens_seen": 55133136, + "step": 25535 + }, + { + "epoch": 4.166394779771615, + "grad_norm": 0.48869696259498596, + "learning_rate": 3.622337183499384e-05, + "loss": 0.1226, + "num_input_tokens_seen": 55142864, + "step": 25540 + }, + { + "epoch": 4.16721044045677, + "grad_norm": 0.804943323135376, + "learning_rate": 3.6217011004046404e-05, + "loss": 0.1481, + "num_input_tokens_seen": 55152848, + "step": 25545 + }, + { + "epoch": 4.168026101141925, + "grad_norm": 0.9470813274383545, + "learning_rate": 3.621064926379114e-05, + "loss": 0.1366, + "num_input_tokens_seen": 55162032, + "step": 25550 + }, + { + "epoch": 4.16884176182708, + "grad_norm": 0.4964427947998047, + "learning_rate": 3.620428661474377e-05, + "loss": 0.1526, + "num_input_tokens_seen": 55173616, + "step": 25555 + }, + { + "epoch": 4.169657422512235, + "grad_norm": 1.1587247848510742, + "learning_rate": 3.619792305742006e-05, + "loss": 0.1217, + "num_input_tokens_seen": 55185328, + "step": 25560 + }, + { + "epoch": 4.17047308319739, + "grad_norm": 0.8051002621650696, + "learning_rate": 3.619155859233589e-05, + "loss": 0.1944, + "num_input_tokens_seen": 55197072, + "step": 25565 + }, + { + "epoch": 4.171288743882545, + "grad_norm": 0.470247745513916, + "learning_rate": 3.6185193220007214e-05, + "loss": 0.0264, + "num_input_tokens_seen": 55205936, + "step": 25570 + }, + { + "epoch": 4.1721044045677, + "grad_norm": 0.5290273427963257, + "learning_rate": 3.617882694095001e-05, + "loss": 0.0326, + "num_input_tokens_seen": 55216560, + "step": 25575 + }, + { + "epoch": 4.172920065252855, + "grad_norm": 0.3143501877784729, + "learning_rate": 3.617245975568038e-05, + "loss": 0.0407, + "num_input_tokens_seen": 55227504, + "step": 25580 + }, + { + "epoch": 4.17373572593801, + "grad_norm": 0.8730102181434631, + "learning_rate": 3.616609166471447e-05, + "loss": 0.2182, + "num_input_tokens_seen": 55238032, + "step": 25585 + }, + { + "epoch": 4.174551386623165, + "grad_norm": 0.07136286795139313, + "learning_rate": 3.615972266856851e-05, + "loss": 0.087, + "num_input_tokens_seen": 55248080, + "step": 25590 + }, + { + "epoch": 4.1753670473083195, + "grad_norm": 1.5332361459732056, + "learning_rate": 3.6153352767758816e-05, + "loss": 0.1644, + "num_input_tokens_seen": 55259760, + "step": 25595 + }, + { + "epoch": 4.176182707993474, + "grad_norm": 0.4458472430706024, + "learning_rate": 3.6146981962801744e-05, + "loss": 0.0556, + "num_input_tokens_seen": 55270128, + "step": 25600 + }, + { + "epoch": 4.17699836867863, + "grad_norm": 0.6772699952125549, + "learning_rate": 3.6140610254213756e-05, + "loss": 0.1705, + "num_input_tokens_seen": 55280592, + "step": 25605 + }, + { + "epoch": 4.177814029363785, + "grad_norm": 0.30372413992881775, + "learning_rate": 3.613423764251138e-05, + "loss": 0.036, + "num_input_tokens_seen": 55291344, + "step": 25610 + }, + { + "epoch": 4.17862969004894, + "grad_norm": 1.471801519393921, + "learning_rate": 3.61278641282112e-05, + "loss": 0.1129, + "num_input_tokens_seen": 55301360, + "step": 25615 + }, + { + "epoch": 4.1794453507340945, + "grad_norm": 1.4281028509140015, + "learning_rate": 3.612148971182989e-05, + "loss": 0.0934, + "num_input_tokens_seen": 55311696, + "step": 25620 + }, + { + "epoch": 4.180261011419249, + "grad_norm": 0.0837947428226471, + "learning_rate": 3.6115114393884206e-05, + "loss": 0.1821, + "num_input_tokens_seen": 55322704, + "step": 25625 + }, + { + "epoch": 4.181076672104404, + "grad_norm": 1.1480833292007446, + "learning_rate": 3.6108738174890944e-05, + "loss": 0.1186, + "num_input_tokens_seen": 55334704, + "step": 25630 + }, + { + "epoch": 4.18189233278956, + "grad_norm": 0.19892379641532898, + "learning_rate": 3.6102361055367e-05, + "loss": 0.1346, + "num_input_tokens_seen": 55344912, + "step": 25635 + }, + { + "epoch": 4.182707993474715, + "grad_norm": 0.17817413806915283, + "learning_rate": 3.609598303582934e-05, + "loss": 0.1241, + "num_input_tokens_seen": 55354832, + "step": 25640 + }, + { + "epoch": 4.1835236541598695, + "grad_norm": 0.2573038339614868, + "learning_rate": 3.608960411679499e-05, + "loss": 0.0645, + "num_input_tokens_seen": 55365776, + "step": 25645 + }, + { + "epoch": 4.184339314845024, + "grad_norm": 0.10212865471839905, + "learning_rate": 3.608322429878107e-05, + "loss": 0.2227, + "num_input_tokens_seen": 55376208, + "step": 25650 + }, + { + "epoch": 4.185154975530179, + "grad_norm": 1.046486258506775, + "learning_rate": 3.6076843582304744e-05, + "loss": 0.1749, + "num_input_tokens_seen": 55386512, + "step": 25655 + }, + { + "epoch": 4.185970636215335, + "grad_norm": 0.25114744901657104, + "learning_rate": 3.607046196788328e-05, + "loss": 0.0612, + "num_input_tokens_seen": 55397168, + "step": 25660 + }, + { + "epoch": 4.18678629690049, + "grad_norm": 0.04706627503037453, + "learning_rate": 3.6064079456033996e-05, + "loss": 0.2515, + "num_input_tokens_seen": 55408912, + "step": 25665 + }, + { + "epoch": 4.1876019575856445, + "grad_norm": 0.44897714257240295, + "learning_rate": 3.6057696047274285e-05, + "loss": 0.1593, + "num_input_tokens_seen": 55418672, + "step": 25670 + }, + { + "epoch": 4.188417618270799, + "grad_norm": 0.9102382659912109, + "learning_rate": 3.605131174212164e-05, + "loss": 0.0444, + "num_input_tokens_seen": 55430768, + "step": 25675 + }, + { + "epoch": 4.189233278955954, + "grad_norm": 1.2203656435012817, + "learning_rate": 3.604492654109357e-05, + "loss": 0.1726, + "num_input_tokens_seen": 55441232, + "step": 25680 + }, + { + "epoch": 4.190048939641109, + "grad_norm": 1.440864086151123, + "learning_rate": 3.6038540444707734e-05, + "loss": 0.1524, + "num_input_tokens_seen": 55453040, + "step": 25685 + }, + { + "epoch": 4.190864600326265, + "grad_norm": 0.08262733370065689, + "learning_rate": 3.603215345348179e-05, + "loss": 0.0963, + "num_input_tokens_seen": 55462768, + "step": 25690 + }, + { + "epoch": 4.191680261011419, + "grad_norm": 0.2157970368862152, + "learning_rate": 3.602576556793352e-05, + "loss": 0.0773, + "num_input_tokens_seen": 55474416, + "step": 25695 + }, + { + "epoch": 4.192495921696574, + "grad_norm": 0.09424997121095657, + "learning_rate": 3.601937678858074e-05, + "loss": 0.1024, + "num_input_tokens_seen": 55484272, + "step": 25700 + }, + { + "epoch": 4.193311582381729, + "grad_norm": 0.528780460357666, + "learning_rate": 3.601298711594137e-05, + "loss": 0.0946, + "num_input_tokens_seen": 55496336, + "step": 25705 + }, + { + "epoch": 4.194127243066884, + "grad_norm": 0.3041703402996063, + "learning_rate": 3.6006596550533385e-05, + "loss": 0.0475, + "num_input_tokens_seen": 55508656, + "step": 25710 + }, + { + "epoch": 4.19494290375204, + "grad_norm": 1.6468514204025269, + "learning_rate": 3.600020509287483e-05, + "loss": 0.2685, + "num_input_tokens_seen": 55519664, + "step": 25715 + }, + { + "epoch": 4.195758564437194, + "grad_norm": 0.3250485062599182, + "learning_rate": 3.599381274348385e-05, + "loss": 0.1096, + "num_input_tokens_seen": 55529744, + "step": 25720 + }, + { + "epoch": 4.196574225122349, + "grad_norm": 0.5316824316978455, + "learning_rate": 3.598741950287861e-05, + "loss": 0.0548, + "num_input_tokens_seen": 55540400, + "step": 25725 + }, + { + "epoch": 4.197389885807504, + "grad_norm": 2.317739486694336, + "learning_rate": 3.5981025371577404e-05, + "loss": 0.2875, + "num_input_tokens_seen": 55551312, + "step": 25730 + }, + { + "epoch": 4.198205546492659, + "grad_norm": 0.1082012802362442, + "learning_rate": 3.5974630350098566e-05, + "loss": 0.079, + "num_input_tokens_seen": 55562448, + "step": 25735 + }, + { + "epoch": 4.199021207177814, + "grad_norm": 0.10781194269657135, + "learning_rate": 3.5968234438960505e-05, + "loss": 0.0367, + "num_input_tokens_seen": 55572848, + "step": 25740 + }, + { + "epoch": 4.199836867862969, + "grad_norm": 0.24173392355442047, + "learning_rate": 3.5961837638681714e-05, + "loss": 0.1306, + "num_input_tokens_seen": 55582800, + "step": 25745 + }, + { + "epoch": 4.200652528548124, + "grad_norm": 0.7261694073677063, + "learning_rate": 3.595543994978073e-05, + "loss": 0.1213, + "num_input_tokens_seen": 55594096, + "step": 25750 + }, + { + "epoch": 4.201468189233279, + "grad_norm": 0.2567303776741028, + "learning_rate": 3.594904137277621e-05, + "loss": 0.0978, + "num_input_tokens_seen": 55605456, + "step": 25755 + }, + { + "epoch": 4.202283849918434, + "grad_norm": 0.13034598529338837, + "learning_rate": 3.594264190818683e-05, + "loss": 0.027, + "num_input_tokens_seen": 55617072, + "step": 25760 + }, + { + "epoch": 4.203099510603589, + "grad_norm": 0.40154901146888733, + "learning_rate": 3.593624155653138e-05, + "loss": 0.3191, + "num_input_tokens_seen": 55628176, + "step": 25765 + }, + { + "epoch": 4.2039151712887435, + "grad_norm": 0.10001322627067566, + "learning_rate": 3.592984031832871e-05, + "loss": 0.0802, + "num_input_tokens_seen": 55637744, + "step": 25770 + }, + { + "epoch": 4.204730831973899, + "grad_norm": 0.3643575608730316, + "learning_rate": 3.5923438194097715e-05, + "loss": 0.0227, + "num_input_tokens_seen": 55649104, + "step": 25775 + }, + { + "epoch": 4.205546492659054, + "grad_norm": 1.2852152585983276, + "learning_rate": 3.591703518435739e-05, + "loss": 0.1398, + "num_input_tokens_seen": 55660272, + "step": 25780 + }, + { + "epoch": 4.206362153344209, + "grad_norm": 0.4267362356185913, + "learning_rate": 3.591063128962681e-05, + "loss": 0.1358, + "num_input_tokens_seen": 55670864, + "step": 25785 + }, + { + "epoch": 4.207177814029364, + "grad_norm": 1.2621310949325562, + "learning_rate": 3.5904226510425095e-05, + "loss": 0.0753, + "num_input_tokens_seen": 55681456, + "step": 25790 + }, + { + "epoch": 4.2079934747145185, + "grad_norm": 0.616682231426239, + "learning_rate": 3.5897820847271446e-05, + "loss": 0.0633, + "num_input_tokens_seen": 55692464, + "step": 25795 + }, + { + "epoch": 4.208809135399674, + "grad_norm": 0.20670576393604279, + "learning_rate": 3.5891414300685155e-05, + "loss": 0.1502, + "num_input_tokens_seen": 55702704, + "step": 25800 + }, + { + "epoch": 4.209624796084829, + "grad_norm": 0.05077836662530899, + "learning_rate": 3.588500687118555e-05, + "loss": 0.1303, + "num_input_tokens_seen": 55713040, + "step": 25805 + }, + { + "epoch": 4.210440456769984, + "grad_norm": 0.3071328103542328, + "learning_rate": 3.587859855929207e-05, + "loss": 0.1069, + "num_input_tokens_seen": 55723920, + "step": 25810 + }, + { + "epoch": 4.211256117455139, + "grad_norm": 2.030940294265747, + "learning_rate": 3.5872189365524175e-05, + "loss": 0.1308, + "num_input_tokens_seen": 55735440, + "step": 25815 + }, + { + "epoch": 4.212071778140293, + "grad_norm": 0.7463493347167969, + "learning_rate": 3.586577929040146e-05, + "loss": 0.1124, + "num_input_tokens_seen": 55745680, + "step": 25820 + }, + { + "epoch": 4.212887438825448, + "grad_norm": 0.79369056224823, + "learning_rate": 3.5859368334443536e-05, + "loss": 0.1371, + "num_input_tokens_seen": 55757680, + "step": 25825 + }, + { + "epoch": 4.213703099510604, + "grad_norm": 1.6905449628829956, + "learning_rate": 3.585295649817011e-05, + "loss": 0.098, + "num_input_tokens_seen": 55767536, + "step": 25830 + }, + { + "epoch": 4.214518760195759, + "grad_norm": 0.45130985975265503, + "learning_rate": 3.5846543782100974e-05, + "loss": 0.1003, + "num_input_tokens_seen": 55778032, + "step": 25835 + }, + { + "epoch": 4.215334420880914, + "grad_norm": 0.5166543126106262, + "learning_rate": 3.584013018675596e-05, + "loss": 0.1078, + "num_input_tokens_seen": 55790000, + "step": 25840 + }, + { + "epoch": 4.216150081566068, + "grad_norm": 0.10297074913978577, + "learning_rate": 3.583371571265498e-05, + "loss": 0.1184, + "num_input_tokens_seen": 55800688, + "step": 25845 + }, + { + "epoch": 4.216965742251223, + "grad_norm": 0.23115047812461853, + "learning_rate": 3.582730036031805e-05, + "loss": 0.0456, + "num_input_tokens_seen": 55812208, + "step": 25850 + }, + { + "epoch": 4.217781402936378, + "grad_norm": 1.3173105716705322, + "learning_rate": 3.582088413026521e-05, + "loss": 0.0813, + "num_input_tokens_seen": 55823120, + "step": 25855 + }, + { + "epoch": 4.218597063621534, + "grad_norm": 0.06299196928739548, + "learning_rate": 3.581446702301659e-05, + "loss": 0.0931, + "num_input_tokens_seen": 55834416, + "step": 25860 + }, + { + "epoch": 4.219412724306689, + "grad_norm": 0.13791410624980927, + "learning_rate": 3.5808049039092414e-05, + "loss": 0.1496, + "num_input_tokens_seen": 55846480, + "step": 25865 + }, + { + "epoch": 4.220228384991843, + "grad_norm": 1.4749735593795776, + "learning_rate": 3.580163017901295e-05, + "loss": 0.1082, + "num_input_tokens_seen": 55857200, + "step": 25870 + }, + { + "epoch": 4.221044045676998, + "grad_norm": 0.29208216071128845, + "learning_rate": 3.579521044329852e-05, + "loss": 0.0919, + "num_input_tokens_seen": 55868976, + "step": 25875 + }, + { + "epoch": 4.221859706362153, + "grad_norm": 2.0320160388946533, + "learning_rate": 3.578878983246956e-05, + "loss": 0.1272, + "num_input_tokens_seen": 55881360, + "step": 25880 + }, + { + "epoch": 4.222675367047309, + "grad_norm": 0.43313920497894287, + "learning_rate": 3.578236834704656e-05, + "loss": 0.1276, + "num_input_tokens_seen": 55892208, + "step": 25885 + }, + { + "epoch": 4.2234910277324635, + "grad_norm": 0.022502034902572632, + "learning_rate": 3.577594598755006e-05, + "loss": 0.0692, + "num_input_tokens_seen": 55901936, + "step": 25890 + }, + { + "epoch": 4.224306688417618, + "grad_norm": 0.7390857934951782, + "learning_rate": 3.5769522754500714e-05, + "loss": 0.0431, + "num_input_tokens_seen": 55912784, + "step": 25895 + }, + { + "epoch": 4.225122349102773, + "grad_norm": 0.32230597734451294, + "learning_rate": 3.5763098648419216e-05, + "loss": 0.2846, + "num_input_tokens_seen": 55924432, + "step": 25900 + }, + { + "epoch": 4.225938009787928, + "grad_norm": 0.13515491783618927, + "learning_rate": 3.575667366982631e-05, + "loss": 0.1414, + "num_input_tokens_seen": 55935216, + "step": 25905 + }, + { + "epoch": 4.226753670473083, + "grad_norm": 0.04559844732284546, + "learning_rate": 3.575024781924288e-05, + "loss": 0.092, + "num_input_tokens_seen": 55945680, + "step": 25910 + }, + { + "epoch": 4.2275693311582385, + "grad_norm": 0.5143098831176758, + "learning_rate": 3.574382109718979e-05, + "loss": 0.1727, + "num_input_tokens_seen": 55956496, + "step": 25915 + }, + { + "epoch": 4.228384991843393, + "grad_norm": 0.04272894561290741, + "learning_rate": 3.573739350418806e-05, + "loss": 0.0521, + "num_input_tokens_seen": 55967280, + "step": 25920 + }, + { + "epoch": 4.229200652528548, + "grad_norm": 0.2506392002105713, + "learning_rate": 3.573096504075874e-05, + "loss": 0.0861, + "num_input_tokens_seen": 55978160, + "step": 25925 + }, + { + "epoch": 4.230016313213703, + "grad_norm": 0.42394155263900757, + "learning_rate": 3.572453570742294e-05, + "loss": 0.0625, + "num_input_tokens_seen": 55989808, + "step": 25930 + }, + { + "epoch": 4.230831973898858, + "grad_norm": 0.06342624872922897, + "learning_rate": 3.571810550470186e-05, + "loss": 0.1375, + "num_input_tokens_seen": 56000944, + "step": 25935 + }, + { + "epoch": 4.231647634584013, + "grad_norm": 0.05216793343424797, + "learning_rate": 3.571167443311676e-05, + "loss": 0.0883, + "num_input_tokens_seen": 56011408, + "step": 25940 + }, + { + "epoch": 4.232463295269168, + "grad_norm": 1.4375618696212769, + "learning_rate": 3.5705242493188986e-05, + "loss": 0.0449, + "num_input_tokens_seen": 56020944, + "step": 25945 + }, + { + "epoch": 4.233278955954323, + "grad_norm": 0.0753653421998024, + "learning_rate": 3.569880968543994e-05, + "loss": 0.0139, + "num_input_tokens_seen": 56031792, + "step": 25950 + }, + { + "epoch": 4.234094616639478, + "grad_norm": 0.5021356344223022, + "learning_rate": 3.569237601039109e-05, + "loss": 0.0331, + "num_input_tokens_seen": 56041936, + "step": 25955 + }, + { + "epoch": 4.234910277324633, + "grad_norm": 0.6169167160987854, + "learning_rate": 3.5685941468563985e-05, + "loss": 0.032, + "num_input_tokens_seen": 56052272, + "step": 25960 + }, + { + "epoch": 4.235725938009788, + "grad_norm": 0.03036423958837986, + "learning_rate": 3.567950606048025e-05, + "loss": 0.0097, + "num_input_tokens_seen": 56061808, + "step": 25965 + }, + { + "epoch": 4.236541598694943, + "grad_norm": 1.337295413017273, + "learning_rate": 3.5673069786661566e-05, + "loss": 0.185, + "num_input_tokens_seen": 56071536, + "step": 25970 + }, + { + "epoch": 4.237357259380098, + "grad_norm": 0.19458641111850739, + "learning_rate": 3.566663264762969e-05, + "loss": 0.0849, + "num_input_tokens_seen": 56083568, + "step": 25975 + }, + { + "epoch": 4.238172920065253, + "grad_norm": 0.23963822424411774, + "learning_rate": 3.5660194643906455e-05, + "loss": 0.0856, + "num_input_tokens_seen": 56094000, + "step": 25980 + }, + { + "epoch": 4.238988580750408, + "grad_norm": 1.3819525241851807, + "learning_rate": 3.5653755776013745e-05, + "loss": 0.2663, + "num_input_tokens_seen": 56103952, + "step": 25985 + }, + { + "epoch": 4.239804241435563, + "grad_norm": 0.32894936203956604, + "learning_rate": 3.5647316044473537e-05, + "loss": 0.0429, + "num_input_tokens_seen": 56114224, + "step": 25990 + }, + { + "epoch": 4.240619902120717, + "grad_norm": 0.09477546066045761, + "learning_rate": 3.564087544980786e-05, + "loss": 0.0447, + "num_input_tokens_seen": 56124464, + "step": 25995 + }, + { + "epoch": 4.241435562805873, + "grad_norm": 0.07513494044542313, + "learning_rate": 3.563443399253883e-05, + "loss": 0.0478, + "num_input_tokens_seen": 56135280, + "step": 26000 + }, + { + "epoch": 4.242251223491028, + "grad_norm": 0.3155718743801117, + "learning_rate": 3.5627991673188624e-05, + "loss": 0.0944, + "num_input_tokens_seen": 56144976, + "step": 26005 + }, + { + "epoch": 4.243066884176183, + "grad_norm": 0.9494279623031616, + "learning_rate": 3.562154849227949e-05, + "loss": 0.2717, + "num_input_tokens_seen": 56156688, + "step": 26010 + }, + { + "epoch": 4.2438825448613375, + "grad_norm": 0.24719959497451782, + "learning_rate": 3.561510445033375e-05, + "loss": 0.0713, + "num_input_tokens_seen": 56167856, + "step": 26015 + }, + { + "epoch": 4.244698205546492, + "grad_norm": 0.7306587100028992, + "learning_rate": 3.560865954787377e-05, + "loss": 0.0863, + "num_input_tokens_seen": 56178928, + "step": 26020 + }, + { + "epoch": 4.245513866231648, + "grad_norm": 0.1199502944946289, + "learning_rate": 3.5602213785422025e-05, + "loss": 0.1694, + "num_input_tokens_seen": 56189584, + "step": 26025 + }, + { + "epoch": 4.246329526916803, + "grad_norm": 1.226730465888977, + "learning_rate": 3.5595767163501034e-05, + "loss": 0.1826, + "num_input_tokens_seen": 56200080, + "step": 26030 + }, + { + "epoch": 4.247145187601958, + "grad_norm": 0.04272512346506119, + "learning_rate": 3.5589319682633393e-05, + "loss": 0.3384, + "num_input_tokens_seen": 56210288, + "step": 26035 + }, + { + "epoch": 4.2479608482871125, + "grad_norm": 0.37325000762939453, + "learning_rate": 3.558287134334177e-05, + "loss": 0.054, + "num_input_tokens_seen": 56220976, + "step": 26040 + }, + { + "epoch": 4.248776508972267, + "grad_norm": 0.6238862872123718, + "learning_rate": 3.55764221461489e-05, + "loss": 0.1047, + "num_input_tokens_seen": 56232176, + "step": 26045 + }, + { + "epoch": 4.249592169657422, + "grad_norm": 1.1938815116882324, + "learning_rate": 3.556997209157759e-05, + "loss": 0.2887, + "num_input_tokens_seen": 56244400, + "step": 26050 + }, + { + "epoch": 4.250407830342578, + "grad_norm": 0.46962520480155945, + "learning_rate": 3.5563521180150704e-05, + "loss": 0.0419, + "num_input_tokens_seen": 56255952, + "step": 26055 + }, + { + "epoch": 4.251223491027733, + "grad_norm": 0.2528083324432373, + "learning_rate": 3.55570694123912e-05, + "loss": 0.1145, + "num_input_tokens_seen": 56267760, + "step": 26060 + }, + { + "epoch": 4.2520391517128875, + "grad_norm": 0.8862475156784058, + "learning_rate": 3.5550616788822074e-05, + "loss": 0.0621, + "num_input_tokens_seen": 56278800, + "step": 26065 + }, + { + "epoch": 4.252854812398042, + "grad_norm": 0.03742888569831848, + "learning_rate": 3.5544163309966425e-05, + "loss": 0.0761, + "num_input_tokens_seen": 56289552, + "step": 26070 + }, + { + "epoch": 4.253670473083197, + "grad_norm": 0.39713752269744873, + "learning_rate": 3.5537708976347386e-05, + "loss": 0.1594, + "num_input_tokens_seen": 56301808, + "step": 26075 + }, + { + "epoch": 4.254486133768353, + "grad_norm": 1.3396848440170288, + "learning_rate": 3.55312537884882e-05, + "loss": 0.0842, + "num_input_tokens_seen": 56311536, + "step": 26080 + }, + { + "epoch": 4.255301794453508, + "grad_norm": 0.9397637248039246, + "learning_rate": 3.552479774691215e-05, + "loss": 0.1083, + "num_input_tokens_seen": 56323344, + "step": 26085 + }, + { + "epoch": 4.2561174551386625, + "grad_norm": 0.7461183667182922, + "learning_rate": 3.5518340852142587e-05, + "loss": 0.1214, + "num_input_tokens_seen": 56333904, + "step": 26090 + }, + { + "epoch": 4.256933115823817, + "grad_norm": 1.041058897972107, + "learning_rate": 3.5511883104702943e-05, + "loss": 0.0632, + "num_input_tokens_seen": 56343312, + "step": 26095 + }, + { + "epoch": 4.257748776508972, + "grad_norm": 0.5209500789642334, + "learning_rate": 3.5505424505116714e-05, + "loss": 0.0788, + "num_input_tokens_seen": 56354960, + "step": 26100 + }, + { + "epoch": 4.258564437194127, + "grad_norm": 0.20629717409610748, + "learning_rate": 3.549896505390748e-05, + "loss": 0.0969, + "num_input_tokens_seen": 56366032, + "step": 26105 + }, + { + "epoch": 4.259380097879283, + "grad_norm": 0.8508964776992798, + "learning_rate": 3.549250475159887e-05, + "loss": 0.1127, + "num_input_tokens_seen": 56376496, + "step": 26110 + }, + { + "epoch": 4.260195758564437, + "grad_norm": 0.05899472162127495, + "learning_rate": 3.5486043598714576e-05, + "loss": 0.1539, + "num_input_tokens_seen": 56387408, + "step": 26115 + }, + { + "epoch": 4.261011419249592, + "grad_norm": 0.7480545043945312, + "learning_rate": 3.547958159577839e-05, + "loss": 0.1693, + "num_input_tokens_seen": 56397328, + "step": 26120 + }, + { + "epoch": 4.261827079934747, + "grad_norm": 0.5579689145088196, + "learning_rate": 3.547311874331414e-05, + "loss": 0.1808, + "num_input_tokens_seen": 56406640, + "step": 26125 + }, + { + "epoch": 4.262642740619902, + "grad_norm": 1.08989679813385, + "learning_rate": 3.546665504184575e-05, + "loss": 0.118, + "num_input_tokens_seen": 56416400, + "step": 26130 + }, + { + "epoch": 4.263458401305057, + "grad_norm": 1.8761835098266602, + "learning_rate": 3.5460190491897195e-05, + "loss": 0.155, + "num_input_tokens_seen": 56426320, + "step": 26135 + }, + { + "epoch": 4.264274061990212, + "grad_norm": 0.397647500038147, + "learning_rate": 3.5453725093992526e-05, + "loss": 0.3792, + "num_input_tokens_seen": 56436400, + "step": 26140 + }, + { + "epoch": 4.265089722675367, + "grad_norm": 1.5778895616531372, + "learning_rate": 3.544725884865585e-05, + "loss": 0.2232, + "num_input_tokens_seen": 56445712, + "step": 26145 + }, + { + "epoch": 4.265905383360522, + "grad_norm": 0.10646571964025497, + "learning_rate": 3.544079175641137e-05, + "loss": 0.136, + "num_input_tokens_seen": 56456720, + "step": 26150 + }, + { + "epoch": 4.266721044045677, + "grad_norm": 0.3575814962387085, + "learning_rate": 3.543432381778333e-05, + "loss": 0.105, + "num_input_tokens_seen": 56468464, + "step": 26155 + }, + { + "epoch": 4.267536704730832, + "grad_norm": 2.1906797885894775, + "learning_rate": 3.5427855033296056e-05, + "loss": 0.2673, + "num_input_tokens_seen": 56478096, + "step": 26160 + }, + { + "epoch": 4.268352365415987, + "grad_norm": 1.9281293153762817, + "learning_rate": 3.542138540347395e-05, + "loss": 0.3854, + "num_input_tokens_seen": 56489776, + "step": 26165 + }, + { + "epoch": 4.269168026101142, + "grad_norm": 0.4006623923778534, + "learning_rate": 3.5414914928841467e-05, + "loss": 0.1291, + "num_input_tokens_seen": 56501232, + "step": 26170 + }, + { + "epoch": 4.269983686786297, + "grad_norm": 0.24187839031219482, + "learning_rate": 3.540844360992313e-05, + "loss": 0.0618, + "num_input_tokens_seen": 56511664, + "step": 26175 + }, + { + "epoch": 4.270799347471452, + "grad_norm": 0.6317567229270935, + "learning_rate": 3.5401971447243545e-05, + "loss": 0.1001, + "num_input_tokens_seen": 56522032, + "step": 26180 + }, + { + "epoch": 4.271615008156607, + "grad_norm": 0.07449998706579208, + "learning_rate": 3.539549844132737e-05, + "loss": 0.1928, + "num_input_tokens_seen": 56533008, + "step": 26185 + }, + { + "epoch": 4.2724306688417615, + "grad_norm": 0.6358280777931213, + "learning_rate": 3.538902459269935e-05, + "loss": 0.0546, + "num_input_tokens_seen": 56544176, + "step": 26190 + }, + { + "epoch": 4.273246329526917, + "grad_norm": 0.48262444138526917, + "learning_rate": 3.538254990188429e-05, + "loss": 0.1513, + "num_input_tokens_seen": 56555376, + "step": 26195 + }, + { + "epoch": 4.274061990212072, + "grad_norm": 0.356067419052124, + "learning_rate": 3.5376074369407044e-05, + "loss": 0.135, + "num_input_tokens_seen": 56566256, + "step": 26200 + }, + { + "epoch": 4.274877650897227, + "grad_norm": 0.7803857326507568, + "learning_rate": 3.536959799579256e-05, + "loss": 0.1165, + "num_input_tokens_seen": 56576976, + "step": 26205 + }, + { + "epoch": 4.275693311582382, + "grad_norm": 0.15594719350337982, + "learning_rate": 3.5363120781565854e-05, + "loss": 0.1167, + "num_input_tokens_seen": 56587504, + "step": 26210 + }, + { + "epoch": 4.2765089722675365, + "grad_norm": 1.1101912260055542, + "learning_rate": 3.535664272725199e-05, + "loss": 0.1502, + "num_input_tokens_seen": 56598800, + "step": 26215 + }, + { + "epoch": 4.277324632952691, + "grad_norm": 2.448267698287964, + "learning_rate": 3.5350163833376124e-05, + "loss": 0.1752, + "num_input_tokens_seen": 56609584, + "step": 26220 + }, + { + "epoch": 4.278140293637847, + "grad_norm": 0.044354744255542755, + "learning_rate": 3.534368410046346e-05, + "loss": 0.1076, + "num_input_tokens_seen": 56619984, + "step": 26225 + }, + { + "epoch": 4.278955954323002, + "grad_norm": 0.16405673325061798, + "learning_rate": 3.5337203529039275e-05, + "loss": 0.1473, + "num_input_tokens_seen": 56630128, + "step": 26230 + }, + { + "epoch": 4.279771615008157, + "grad_norm": 1.6509475708007812, + "learning_rate": 3.533072211962892e-05, + "loss": 0.1671, + "num_input_tokens_seen": 56641264, + "step": 26235 + }, + { + "epoch": 4.280587275693311, + "grad_norm": 0.22296209633350372, + "learning_rate": 3.532423987275782e-05, + "loss": 0.041, + "num_input_tokens_seen": 56652688, + "step": 26240 + }, + { + "epoch": 4.281402936378466, + "grad_norm": 1.1326755285263062, + "learning_rate": 3.531775678895145e-05, + "loss": 0.132, + "num_input_tokens_seen": 56663600, + "step": 26245 + }, + { + "epoch": 4.282218597063622, + "grad_norm": 0.05160877853631973, + "learning_rate": 3.531127286873536e-05, + "loss": 0.0442, + "num_input_tokens_seen": 56674544, + "step": 26250 + }, + { + "epoch": 4.283034257748777, + "grad_norm": 0.20932632684707642, + "learning_rate": 3.530478811263518e-05, + "loss": 0.0302, + "num_input_tokens_seen": 56685680, + "step": 26255 + }, + { + "epoch": 4.283849918433932, + "grad_norm": 1.5372573137283325, + "learning_rate": 3.529830252117657e-05, + "loss": 0.1304, + "num_input_tokens_seen": 56696752, + "step": 26260 + }, + { + "epoch": 4.284665579119086, + "grad_norm": 0.1781013160943985, + "learning_rate": 3.529181609488532e-05, + "loss": 0.1398, + "num_input_tokens_seen": 56707600, + "step": 26265 + }, + { + "epoch": 4.285481239804241, + "grad_norm": 0.3733862340450287, + "learning_rate": 3.528532883428724e-05, + "loss": 0.169, + "num_input_tokens_seen": 56718608, + "step": 26270 + }, + { + "epoch": 4.286296900489396, + "grad_norm": 0.1379006952047348, + "learning_rate": 3.527884073990822e-05, + "loss": 0.2042, + "num_input_tokens_seen": 56729840, + "step": 26275 + }, + { + "epoch": 4.287112561174552, + "grad_norm": 0.4404539465904236, + "learning_rate": 3.52723518122742e-05, + "loss": 0.1133, + "num_input_tokens_seen": 56739984, + "step": 26280 + }, + { + "epoch": 4.287928221859707, + "grad_norm": 0.2862788140773773, + "learning_rate": 3.526586205191123e-05, + "loss": 0.2972, + "num_input_tokens_seen": 56751248, + "step": 26285 + }, + { + "epoch": 4.288743882544861, + "grad_norm": 0.22141849994659424, + "learning_rate": 3.525937145934539e-05, + "loss": 0.1507, + "num_input_tokens_seen": 56762480, + "step": 26290 + }, + { + "epoch": 4.289559543230016, + "grad_norm": 1.8074235916137695, + "learning_rate": 3.525288003510285e-05, + "loss": 0.1818, + "num_input_tokens_seen": 56772016, + "step": 26295 + }, + { + "epoch": 4.290375203915171, + "grad_norm": 0.7941063046455383, + "learning_rate": 3.524638777970982e-05, + "loss": 0.0673, + "num_input_tokens_seen": 56781168, + "step": 26300 + }, + { + "epoch": 4.291190864600326, + "grad_norm": 1.107401728630066, + "learning_rate": 3.523989469369262e-05, + "loss": 0.1175, + "num_input_tokens_seen": 56792720, + "step": 26305 + }, + { + "epoch": 4.2920065252854815, + "grad_norm": 0.05031752586364746, + "learning_rate": 3.523340077757759e-05, + "loss": 0.0937, + "num_input_tokens_seen": 56803856, + "step": 26310 + }, + { + "epoch": 4.292822185970636, + "grad_norm": 0.21116431057453156, + "learning_rate": 3.522690603189117e-05, + "loss": 0.0987, + "num_input_tokens_seen": 56814480, + "step": 26315 + }, + { + "epoch": 4.293637846655791, + "grad_norm": 0.13427197933197021, + "learning_rate": 3.522041045715986e-05, + "loss": 0.1494, + "num_input_tokens_seen": 56824816, + "step": 26320 + }, + { + "epoch": 4.294453507340946, + "grad_norm": 0.5051047801971436, + "learning_rate": 3.521391405391022e-05, + "loss": 0.2542, + "num_input_tokens_seen": 56834928, + "step": 26325 + }, + { + "epoch": 4.295269168026101, + "grad_norm": 0.15627947449684143, + "learning_rate": 3.520741682266888e-05, + "loss": 0.0419, + "num_input_tokens_seen": 56845680, + "step": 26330 + }, + { + "epoch": 4.2960848287112565, + "grad_norm": 0.05145386978983879, + "learning_rate": 3.520091876396255e-05, + "loss": 0.0893, + "num_input_tokens_seen": 56856464, + "step": 26335 + }, + { + "epoch": 4.296900489396411, + "grad_norm": 0.21678097546100616, + "learning_rate": 3.5194419878317975e-05, + "loss": 0.1518, + "num_input_tokens_seen": 56867920, + "step": 26340 + }, + { + "epoch": 4.297716150081566, + "grad_norm": 0.09596415609121323, + "learning_rate": 3.518792016626201e-05, + "loss": 0.0717, + "num_input_tokens_seen": 56878672, + "step": 26345 + }, + { + "epoch": 4.298531810766721, + "grad_norm": 0.3578111231327057, + "learning_rate": 3.518141962832153e-05, + "loss": 0.0518, + "num_input_tokens_seen": 56889616, + "step": 26350 + }, + { + "epoch": 4.299347471451876, + "grad_norm": 0.1397036761045456, + "learning_rate": 3.517491826502352e-05, + "loss": 0.1106, + "num_input_tokens_seen": 56899184, + "step": 26355 + }, + { + "epoch": 4.300163132137031, + "grad_norm": 0.3154468238353729, + "learning_rate": 3.516841607689501e-05, + "loss": 0.1152, + "num_input_tokens_seen": 56910480, + "step": 26360 + }, + { + "epoch": 4.300978792822186, + "grad_norm": 0.4044772982597351, + "learning_rate": 3.516191306446309e-05, + "loss": 0.247, + "num_input_tokens_seen": 56921040, + "step": 26365 + }, + { + "epoch": 4.301794453507341, + "grad_norm": 1.0980392694473267, + "learning_rate": 3.5155409228254946e-05, + "loss": 0.1836, + "num_input_tokens_seen": 56931920, + "step": 26370 + }, + { + "epoch": 4.302610114192496, + "grad_norm": 0.6669036746025085, + "learning_rate": 3.5148904568797805e-05, + "loss": 0.1124, + "num_input_tokens_seen": 56942960, + "step": 26375 + }, + { + "epoch": 4.303425774877651, + "grad_norm": 0.5535319447517395, + "learning_rate": 3.514239908661896e-05, + "loss": 0.1558, + "num_input_tokens_seen": 56955728, + "step": 26380 + }, + { + "epoch": 4.304241435562806, + "grad_norm": 1.8101848363876343, + "learning_rate": 3.513589278224577e-05, + "loss": 0.0955, + "num_input_tokens_seen": 56966224, + "step": 26385 + }, + { + "epoch": 4.30505709624796, + "grad_norm": 1.3432228565216064, + "learning_rate": 3.5129385656205696e-05, + "loss": 0.2257, + "num_input_tokens_seen": 56975440, + "step": 26390 + }, + { + "epoch": 4.305872756933116, + "grad_norm": 1.1615512371063232, + "learning_rate": 3.512287770902623e-05, + "loss": 0.0748, + "num_input_tokens_seen": 56986416, + "step": 26395 + }, + { + "epoch": 4.306688417618271, + "grad_norm": 0.4676556885242462, + "learning_rate": 3.5116368941234924e-05, + "loss": 0.1281, + "num_input_tokens_seen": 56997392, + "step": 26400 + }, + { + "epoch": 4.307504078303426, + "grad_norm": 0.41042599081993103, + "learning_rate": 3.510985935335943e-05, + "loss": 0.1236, + "num_input_tokens_seen": 57008144, + "step": 26405 + }, + { + "epoch": 4.308319738988581, + "grad_norm": 0.5026414394378662, + "learning_rate": 3.510334894592743e-05, + "loss": 0.0982, + "num_input_tokens_seen": 57019472, + "step": 26410 + }, + { + "epoch": 4.309135399673735, + "grad_norm": 0.10582801699638367, + "learning_rate": 3.509683771946671e-05, + "loss": 0.1822, + "num_input_tokens_seen": 57029136, + "step": 26415 + }, + { + "epoch": 4.309951060358891, + "grad_norm": 1.0920722484588623, + "learning_rate": 3.509032567450508e-05, + "loss": 0.0978, + "num_input_tokens_seen": 57039504, + "step": 26420 + }, + { + "epoch": 4.310766721044046, + "grad_norm": 0.6725613474845886, + "learning_rate": 3.508381281157046e-05, + "loss": 0.1577, + "num_input_tokens_seen": 57050064, + "step": 26425 + }, + { + "epoch": 4.311582381729201, + "grad_norm": 0.5091227889060974, + "learning_rate": 3.507729913119081e-05, + "loss": 0.0466, + "num_input_tokens_seen": 57061232, + "step": 26430 + }, + { + "epoch": 4.3123980424143555, + "grad_norm": 0.0650944635272026, + "learning_rate": 3.507078463389417e-05, + "loss": 0.2027, + "num_input_tokens_seen": 57072016, + "step": 26435 + }, + { + "epoch": 4.31321370309951, + "grad_norm": 0.44764551520347595, + "learning_rate": 3.506426932020861e-05, + "loss": 0.0871, + "num_input_tokens_seen": 57082576, + "step": 26440 + }, + { + "epoch": 4.314029363784665, + "grad_norm": 0.5810356736183167, + "learning_rate": 3.505775319066233e-05, + "loss": 0.1069, + "num_input_tokens_seen": 57093712, + "step": 26445 + }, + { + "epoch": 4.314845024469821, + "grad_norm": 0.5216653347015381, + "learning_rate": 3.5051236245783536e-05, + "loss": 0.1014, + "num_input_tokens_seen": 57104432, + "step": 26450 + }, + { + "epoch": 4.315660685154976, + "grad_norm": 0.42035406827926636, + "learning_rate": 3.5044718486100536e-05, + "loss": 0.0481, + "num_input_tokens_seen": 57114576, + "step": 26455 + }, + { + "epoch": 4.3164763458401305, + "grad_norm": 1.5372929573059082, + "learning_rate": 3.503819991214168e-05, + "loss": 0.2989, + "num_input_tokens_seen": 57125744, + "step": 26460 + }, + { + "epoch": 4.317292006525285, + "grad_norm": 0.11556963622570038, + "learning_rate": 3.503168052443542e-05, + "loss": 0.1798, + "num_input_tokens_seen": 57136784, + "step": 26465 + }, + { + "epoch": 4.31810766721044, + "grad_norm": 1.2873648405075073, + "learning_rate": 3.502516032351022e-05, + "loss": 0.0825, + "num_input_tokens_seen": 57147888, + "step": 26470 + }, + { + "epoch": 4.318923327895595, + "grad_norm": 0.06128234416246414, + "learning_rate": 3.501863930989467e-05, + "loss": 0.0788, + "num_input_tokens_seen": 57158704, + "step": 26475 + }, + { + "epoch": 4.319738988580751, + "grad_norm": 0.0289164986461401, + "learning_rate": 3.501211748411738e-05, + "loss": 0.044, + "num_input_tokens_seen": 57170096, + "step": 26480 + }, + { + "epoch": 4.3205546492659055, + "grad_norm": 0.2324293553829193, + "learning_rate": 3.500559484670705e-05, + "loss": 0.0727, + "num_input_tokens_seen": 57181008, + "step": 26485 + }, + { + "epoch": 4.32137030995106, + "grad_norm": 0.799298107624054, + "learning_rate": 3.499907139819242e-05, + "loss": 0.1176, + "num_input_tokens_seen": 57192752, + "step": 26490 + }, + { + "epoch": 4.322185970636215, + "grad_norm": 0.6321514248847961, + "learning_rate": 3.499254713910234e-05, + "loss": 0.2007, + "num_input_tokens_seen": 57204976, + "step": 26495 + }, + { + "epoch": 4.32300163132137, + "grad_norm": 0.887978732585907, + "learning_rate": 3.498602206996569e-05, + "loss": 0.1751, + "num_input_tokens_seen": 57216592, + "step": 26500 + }, + { + "epoch": 4.323817292006526, + "grad_norm": 1.5273164510726929, + "learning_rate": 3.497949619131141e-05, + "loss": 0.0683, + "num_input_tokens_seen": 57227568, + "step": 26505 + }, + { + "epoch": 4.3246329526916805, + "grad_norm": 1.0891623497009277, + "learning_rate": 3.497296950366854e-05, + "loss": 0.0863, + "num_input_tokens_seen": 57238096, + "step": 26510 + }, + { + "epoch": 4.325448613376835, + "grad_norm": 0.29711759090423584, + "learning_rate": 3.4966442007566165e-05, + "loss": 0.0605, + "num_input_tokens_seen": 57250416, + "step": 26515 + }, + { + "epoch": 4.32626427406199, + "grad_norm": 0.08259692043066025, + "learning_rate": 3.495991370353342e-05, + "loss": 0.1746, + "num_input_tokens_seen": 57261008, + "step": 26520 + }, + { + "epoch": 4.327079934747145, + "grad_norm": 1.6772282123565674, + "learning_rate": 3.4953384592099536e-05, + "loss": 0.1753, + "num_input_tokens_seen": 57271344, + "step": 26525 + }, + { + "epoch": 4.327895595432301, + "grad_norm": 1.9395228624343872, + "learning_rate": 3.494685467379381e-05, + "loss": 0.1226, + "num_input_tokens_seen": 57280752, + "step": 26530 + }, + { + "epoch": 4.328711256117455, + "grad_norm": 0.26149705052375793, + "learning_rate": 3.494032394914555e-05, + "loss": 0.1855, + "num_input_tokens_seen": 57290736, + "step": 26535 + }, + { + "epoch": 4.32952691680261, + "grad_norm": 0.5657910704612732, + "learning_rate": 3.493379241868421e-05, + "loss": 0.0973, + "num_input_tokens_seen": 57301680, + "step": 26540 + }, + { + "epoch": 4.330342577487765, + "grad_norm": 0.08677627891302109, + "learning_rate": 3.492726008293925e-05, + "loss": 0.1496, + "num_input_tokens_seen": 57313200, + "step": 26545 + }, + { + "epoch": 4.33115823817292, + "grad_norm": 0.9367263317108154, + "learning_rate": 3.4920726942440215e-05, + "loss": 0.1075, + "num_input_tokens_seen": 57323856, + "step": 26550 + }, + { + "epoch": 4.331973898858075, + "grad_norm": 1.0195482969284058, + "learning_rate": 3.4914192997716724e-05, + "loss": 0.0607, + "num_input_tokens_seen": 57334768, + "step": 26555 + }, + { + "epoch": 4.33278955954323, + "grad_norm": 1.3827000856399536, + "learning_rate": 3.4907658249298435e-05, + "loss": 0.1819, + "num_input_tokens_seen": 57344720, + "step": 26560 + }, + { + "epoch": 4.333605220228385, + "grad_norm": 0.7743793725967407, + "learning_rate": 3.4901122697715096e-05, + "loss": 0.1882, + "num_input_tokens_seen": 57356784, + "step": 26565 + }, + { + "epoch": 4.33442088091354, + "grad_norm": 0.5187420845031738, + "learning_rate": 3.4894586343496524e-05, + "loss": 0.077, + "num_input_tokens_seen": 57367184, + "step": 26570 + }, + { + "epoch": 4.335236541598695, + "grad_norm": 0.22432343661785126, + "learning_rate": 3.4888049187172566e-05, + "loss": 0.1517, + "num_input_tokens_seen": 57377840, + "step": 26575 + }, + { + "epoch": 4.33605220228385, + "grad_norm": 0.1150059923529625, + "learning_rate": 3.4881511229273175e-05, + "loss": 0.1093, + "num_input_tokens_seen": 57389232, + "step": 26580 + }, + { + "epoch": 4.3368678629690045, + "grad_norm": 1.5456725358963013, + "learning_rate": 3.487497247032835e-05, + "loss": 0.1492, + "num_input_tokens_seen": 57400176, + "step": 26585 + }, + { + "epoch": 4.33768352365416, + "grad_norm": 0.037683289498090744, + "learning_rate": 3.4868432910868156e-05, + "loss": 0.1473, + "num_input_tokens_seen": 57410768, + "step": 26590 + }, + { + "epoch": 4.338499184339315, + "grad_norm": 0.9300975203514099, + "learning_rate": 3.48618925514227e-05, + "loss": 0.1581, + "num_input_tokens_seen": 57421680, + "step": 26595 + }, + { + "epoch": 4.33931484502447, + "grad_norm": 2.4267826080322266, + "learning_rate": 3.4855351392522214e-05, + "loss": 0.1712, + "num_input_tokens_seen": 57431600, + "step": 26600 + }, + { + "epoch": 4.340130505709625, + "grad_norm": 0.11666043102741241, + "learning_rate": 3.4848809434696924e-05, + "loss": 0.0348, + "num_input_tokens_seen": 57442640, + "step": 26605 + }, + { + "epoch": 4.3409461663947795, + "grad_norm": 1.2314481735229492, + "learning_rate": 3.484226667847718e-05, + "loss": 0.13, + "num_input_tokens_seen": 57453072, + "step": 26610 + }, + { + "epoch": 4.341761827079935, + "grad_norm": 0.4626401960849762, + "learning_rate": 3.4835723124393347e-05, + "loss": 0.0919, + "num_input_tokens_seen": 57463792, + "step": 26615 + }, + { + "epoch": 4.34257748776509, + "grad_norm": 0.562355101108551, + "learning_rate": 3.48291787729759e-05, + "loss": 0.1353, + "num_input_tokens_seen": 57474448, + "step": 26620 + }, + { + "epoch": 4.343393148450245, + "grad_norm": 0.638850748538971, + "learning_rate": 3.482263362475535e-05, + "loss": 0.0817, + "num_input_tokens_seen": 57485904, + "step": 26625 + }, + { + "epoch": 4.3442088091354, + "grad_norm": 0.34423160552978516, + "learning_rate": 3.4816087680262275e-05, + "loss": 0.2367, + "num_input_tokens_seen": 57497008, + "step": 26630 + }, + { + "epoch": 4.3450244698205545, + "grad_norm": 0.18855653703212738, + "learning_rate": 3.480954094002733e-05, + "loss": 0.0751, + "num_input_tokens_seen": 57507920, + "step": 26635 + }, + { + "epoch": 4.345840130505709, + "grad_norm": 0.3912961483001709, + "learning_rate": 3.480299340458123e-05, + "loss": 0.1243, + "num_input_tokens_seen": 57518832, + "step": 26640 + }, + { + "epoch": 4.346655791190865, + "grad_norm": 0.08973410725593567, + "learning_rate": 3.479644507445473e-05, + "loss": 0.1097, + "num_input_tokens_seen": 57529808, + "step": 26645 + }, + { + "epoch": 4.34747145187602, + "grad_norm": 1.5727320909500122, + "learning_rate": 3.4789895950178694e-05, + "loss": 0.1045, + "num_input_tokens_seen": 57540944, + "step": 26650 + }, + { + "epoch": 4.348287112561175, + "grad_norm": 0.2457934468984604, + "learning_rate": 3.478334603228401e-05, + "loss": 0.1682, + "num_input_tokens_seen": 57552176, + "step": 26655 + }, + { + "epoch": 4.349102773246329, + "grad_norm": 0.09513163566589355, + "learning_rate": 3.477679532130167e-05, + "loss": 0.0888, + "num_input_tokens_seen": 57563184, + "step": 26660 + }, + { + "epoch": 4.349918433931484, + "grad_norm": 0.5264205932617188, + "learning_rate": 3.4770243817762686e-05, + "loss": 0.3067, + "num_input_tokens_seen": 57574320, + "step": 26665 + }, + { + "epoch": 4.350734094616639, + "grad_norm": 0.21896077692508698, + "learning_rate": 3.476369152219817e-05, + "loss": 0.1146, + "num_input_tokens_seen": 57584944, + "step": 26670 + }, + { + "epoch": 4.351549755301795, + "grad_norm": 0.7277307510375977, + "learning_rate": 3.4757138435139274e-05, + "loss": 0.0759, + "num_input_tokens_seen": 57595600, + "step": 26675 + }, + { + "epoch": 4.35236541598695, + "grad_norm": 2.621506452560425, + "learning_rate": 3.4750584557117234e-05, + "loss": 0.3312, + "num_input_tokens_seen": 57607472, + "step": 26680 + }, + { + "epoch": 4.353181076672104, + "grad_norm": 2.0350959300994873, + "learning_rate": 3.4744029888663326e-05, + "loss": 0.3159, + "num_input_tokens_seen": 57618800, + "step": 26685 + }, + { + "epoch": 4.353996737357259, + "grad_norm": 0.4514561593532562, + "learning_rate": 3.473747443030892e-05, + "loss": 0.0375, + "num_input_tokens_seen": 57629040, + "step": 26690 + }, + { + "epoch": 4.354812398042414, + "grad_norm": 1.2259180545806885, + "learning_rate": 3.473091818258543e-05, + "loss": 0.2311, + "num_input_tokens_seen": 57639728, + "step": 26695 + }, + { + "epoch": 4.35562805872757, + "grad_norm": 0.9718542098999023, + "learning_rate": 3.472436114602433e-05, + "loss": 0.131, + "num_input_tokens_seen": 57650512, + "step": 26700 + }, + { + "epoch": 4.356443719412725, + "grad_norm": 0.16658152639865875, + "learning_rate": 3.471780332115719e-05, + "loss": 0.1681, + "num_input_tokens_seen": 57662096, + "step": 26705 + }, + { + "epoch": 4.357259380097879, + "grad_norm": 0.19467680156230927, + "learning_rate": 3.47112447085156e-05, + "loss": 0.2168, + "num_input_tokens_seen": 57672080, + "step": 26710 + }, + { + "epoch": 4.358075040783034, + "grad_norm": 0.3133710026741028, + "learning_rate": 3.470468530863123e-05, + "loss": 0.269, + "num_input_tokens_seen": 57682704, + "step": 26715 + }, + { + "epoch": 4.358890701468189, + "grad_norm": 1.3399832248687744, + "learning_rate": 3.469812512203584e-05, + "loss": 0.181, + "num_input_tokens_seen": 57693520, + "step": 26720 + }, + { + "epoch": 4.359706362153344, + "grad_norm": 1.0464636087417603, + "learning_rate": 3.469156414926121e-05, + "loss": 0.1533, + "num_input_tokens_seen": 57703312, + "step": 26725 + }, + { + "epoch": 4.3605220228384995, + "grad_norm": 1.700222373008728, + "learning_rate": 3.4685002390839226e-05, + "loss": 0.1596, + "num_input_tokens_seen": 57714672, + "step": 26730 + }, + { + "epoch": 4.361337683523654, + "grad_norm": 0.03180016949772835, + "learning_rate": 3.467843984730179e-05, + "loss": 0.0485, + "num_input_tokens_seen": 57724464, + "step": 26735 + }, + { + "epoch": 4.362153344208809, + "grad_norm": 0.8383585214614868, + "learning_rate": 3.467187651918093e-05, + "loss": 0.0544, + "num_input_tokens_seen": 57735856, + "step": 26740 + }, + { + "epoch": 4.362969004893964, + "grad_norm": 0.41382721066474915, + "learning_rate": 3.466531240700868e-05, + "loss": 0.1176, + "num_input_tokens_seen": 57747440, + "step": 26745 + }, + { + "epoch": 4.363784665579119, + "grad_norm": 0.7681185603141785, + "learning_rate": 3.465874751131716e-05, + "loss": 0.1149, + "num_input_tokens_seen": 57757168, + "step": 26750 + }, + { + "epoch": 4.364600326264274, + "grad_norm": 0.10247297585010529, + "learning_rate": 3.4652181832638566e-05, + "loss": 0.0392, + "num_input_tokens_seen": 57768112, + "step": 26755 + }, + { + "epoch": 4.365415986949429, + "grad_norm": 0.3853526711463928, + "learning_rate": 3.464561537150513e-05, + "loss": 0.1363, + "num_input_tokens_seen": 57778480, + "step": 26760 + }, + { + "epoch": 4.366231647634584, + "grad_norm": 0.13290642201900482, + "learning_rate": 3.4639048128449175e-05, + "loss": 0.0573, + "num_input_tokens_seen": 57789328, + "step": 26765 + }, + { + "epoch": 4.367047308319739, + "grad_norm": 0.17501631379127502, + "learning_rate": 3.463248010400307e-05, + "loss": 0.0941, + "num_input_tokens_seen": 57800112, + "step": 26770 + }, + { + "epoch": 4.367862969004894, + "grad_norm": 0.2592281401157379, + "learning_rate": 3.462591129869925e-05, + "loss": 0.2253, + "num_input_tokens_seen": 57810064, + "step": 26775 + }, + { + "epoch": 4.368678629690049, + "grad_norm": 0.9170477986335754, + "learning_rate": 3.461934171307022e-05, + "loss": 0.0708, + "num_input_tokens_seen": 57821680, + "step": 26780 + }, + { + "epoch": 4.369494290375204, + "grad_norm": 1.4142638444900513, + "learning_rate": 3.461277134764855e-05, + "loss": 0.1754, + "num_input_tokens_seen": 57832496, + "step": 26785 + }, + { + "epoch": 4.370309951060359, + "grad_norm": 0.3251838684082031, + "learning_rate": 3.460620020296684e-05, + "loss": 0.1741, + "num_input_tokens_seen": 57842768, + "step": 26790 + }, + { + "epoch": 4.371125611745514, + "grad_norm": 0.9456160068511963, + "learning_rate": 3.459962827955782e-05, + "loss": 0.2533, + "num_input_tokens_seen": 57853008, + "step": 26795 + }, + { + "epoch": 4.371941272430669, + "grad_norm": 0.9515712261199951, + "learning_rate": 3.459305557795422e-05, + "loss": 0.1029, + "num_input_tokens_seen": 57863312, + "step": 26800 + }, + { + "epoch": 4.372756933115824, + "grad_norm": 1.0975676774978638, + "learning_rate": 3.458648209868886e-05, + "loss": 0.1256, + "num_input_tokens_seen": 57873552, + "step": 26805 + }, + { + "epoch": 4.373572593800978, + "grad_norm": 1.0465668439865112, + "learning_rate": 3.4579907842294614e-05, + "loss": 0.1852, + "num_input_tokens_seen": 57884336, + "step": 26810 + }, + { + "epoch": 4.374388254486134, + "grad_norm": 1.0310214757919312, + "learning_rate": 3.457333280930444e-05, + "loss": 0.2138, + "num_input_tokens_seen": 57894512, + "step": 26815 + }, + { + "epoch": 4.375203915171289, + "grad_norm": 0.05769374594092369, + "learning_rate": 3.456675700025132e-05, + "loss": 0.1153, + "num_input_tokens_seen": 57906576, + "step": 26820 + }, + { + "epoch": 4.376019575856444, + "grad_norm": 0.2756873071193695, + "learning_rate": 3.4560180415668354e-05, + "loss": 0.1253, + "num_input_tokens_seen": 57916816, + "step": 26825 + }, + { + "epoch": 4.376835236541599, + "grad_norm": 0.06464431434869766, + "learning_rate": 3.455360305608865e-05, + "loss": 0.0893, + "num_input_tokens_seen": 57927088, + "step": 26830 + }, + { + "epoch": 4.377650897226753, + "grad_norm": 0.35234585404396057, + "learning_rate": 3.4547024922045405e-05, + "loss": 0.0822, + "num_input_tokens_seen": 57939120, + "step": 26835 + }, + { + "epoch": 4.378466557911908, + "grad_norm": 0.67051762342453, + "learning_rate": 3.454044601407187e-05, + "loss": 0.1537, + "num_input_tokens_seen": 57949264, + "step": 26840 + }, + { + "epoch": 4.379282218597064, + "grad_norm": 0.9753795266151428, + "learning_rate": 3.453386633270138e-05, + "loss": 0.2095, + "num_input_tokens_seen": 57961328, + "step": 26845 + }, + { + "epoch": 4.380097879282219, + "grad_norm": 0.9662653207778931, + "learning_rate": 3.4527285878467305e-05, + "loss": 0.1904, + "num_input_tokens_seen": 57972816, + "step": 26850 + }, + { + "epoch": 4.3809135399673735, + "grad_norm": 0.9608920812606812, + "learning_rate": 3.45207046519031e-05, + "loss": 0.112, + "num_input_tokens_seen": 57983440, + "step": 26855 + }, + { + "epoch": 4.381729200652528, + "grad_norm": 0.8774608373641968, + "learning_rate": 3.451412265354227e-05, + "loss": 0.1952, + "num_input_tokens_seen": 57994384, + "step": 26860 + }, + { + "epoch": 4.382544861337683, + "grad_norm": 0.8014524579048157, + "learning_rate": 3.450753988391839e-05, + "loss": 0.0889, + "num_input_tokens_seen": 58006192, + "step": 26865 + }, + { + "epoch": 4.383360522022839, + "grad_norm": 0.9717660546302795, + "learning_rate": 3.450095634356508e-05, + "loss": 0.0538, + "num_input_tokens_seen": 58016784, + "step": 26870 + }, + { + "epoch": 4.384176182707994, + "grad_norm": 1.014916181564331, + "learning_rate": 3.449437203301604e-05, + "loss": 0.1077, + "num_input_tokens_seen": 58026736, + "step": 26875 + }, + { + "epoch": 4.3849918433931485, + "grad_norm": 0.1395779550075531, + "learning_rate": 3.4487786952805035e-05, + "loss": 0.0828, + "num_input_tokens_seen": 58037200, + "step": 26880 + }, + { + "epoch": 4.385807504078303, + "grad_norm": 0.2716046869754791, + "learning_rate": 3.4481201103465875e-05, + "loss": 0.0615, + "num_input_tokens_seen": 58049200, + "step": 26885 + }, + { + "epoch": 4.386623164763458, + "grad_norm": 0.3383435010910034, + "learning_rate": 3.447461448553245e-05, + "loss": 0.1075, + "num_input_tokens_seen": 58060144, + "step": 26890 + }, + { + "epoch": 4.387438825448613, + "grad_norm": 0.2942950129508972, + "learning_rate": 3.4468027099538694e-05, + "loss": 0.1435, + "num_input_tokens_seen": 58071600, + "step": 26895 + }, + { + "epoch": 4.388254486133769, + "grad_norm": 0.5068498849868774, + "learning_rate": 3.446143894601862e-05, + "loss": 0.2088, + "num_input_tokens_seen": 58081616, + "step": 26900 + }, + { + "epoch": 4.3890701468189235, + "grad_norm": 0.3725895583629608, + "learning_rate": 3.44548500255063e-05, + "loss": 0.2309, + "num_input_tokens_seen": 58092976, + "step": 26905 + }, + { + "epoch": 4.389885807504078, + "grad_norm": 0.5698264837265015, + "learning_rate": 3.444826033853587e-05, + "loss": 0.0787, + "num_input_tokens_seen": 58103536, + "step": 26910 + }, + { + "epoch": 4.390701468189233, + "grad_norm": 0.17807742953300476, + "learning_rate": 3.4441669885641517e-05, + "loss": 0.0345, + "num_input_tokens_seen": 58115504, + "step": 26915 + }, + { + "epoch": 4.391517128874388, + "grad_norm": 0.25091543793678284, + "learning_rate": 3.443507866735749e-05, + "loss": 0.1126, + "num_input_tokens_seen": 58125648, + "step": 26920 + }, + { + "epoch": 4.392332789559543, + "grad_norm": 1.6614545583724976, + "learning_rate": 3.4428486684218116e-05, + "loss": 0.1482, + "num_input_tokens_seen": 58136816, + "step": 26925 + }, + { + "epoch": 4.3931484502446985, + "grad_norm": 1.5614811182022095, + "learning_rate": 3.442189393675777e-05, + "loss": 0.2114, + "num_input_tokens_seen": 58146832, + "step": 26930 + }, + { + "epoch": 4.393964110929853, + "grad_norm": 0.7088298201560974, + "learning_rate": 3.44153004255109e-05, + "loss": 0.2413, + "num_input_tokens_seen": 58156912, + "step": 26935 + }, + { + "epoch": 4.394779771615008, + "grad_norm": 1.40430748462677, + "learning_rate": 3.4408706151012e-05, + "loss": 0.0998, + "num_input_tokens_seen": 58167856, + "step": 26940 + }, + { + "epoch": 4.395595432300163, + "grad_norm": 0.8166909217834473, + "learning_rate": 3.440211111379564e-05, + "loss": 0.1695, + "num_input_tokens_seen": 58178928, + "step": 26945 + }, + { + "epoch": 4.396411092985318, + "grad_norm": 0.07193876802921295, + "learning_rate": 3.4395515314396445e-05, + "loss": 0.064, + "num_input_tokens_seen": 58187888, + "step": 26950 + }, + { + "epoch": 4.397226753670473, + "grad_norm": 0.8012107014656067, + "learning_rate": 3.4388918753349106e-05, + "loss": 0.1056, + "num_input_tokens_seen": 58198160, + "step": 26955 + }, + { + "epoch": 4.398042414355628, + "grad_norm": 0.1735745370388031, + "learning_rate": 3.438232143118838e-05, + "loss": 0.0573, + "num_input_tokens_seen": 58209808, + "step": 26960 + }, + { + "epoch": 4.398858075040783, + "grad_norm": 1.5717047452926636, + "learning_rate": 3.437572334844907e-05, + "loss": 0.1055, + "num_input_tokens_seen": 58221104, + "step": 26965 + }, + { + "epoch": 4.399673735725938, + "grad_norm": 1.4278390407562256, + "learning_rate": 3.436912450566606e-05, + "loss": 0.1658, + "num_input_tokens_seen": 58230832, + "step": 26970 + }, + { + "epoch": 4.400489396411093, + "grad_norm": 0.3996203541755676, + "learning_rate": 3.436252490337428e-05, + "loss": 0.173, + "num_input_tokens_seen": 58241328, + "step": 26975 + }, + { + "epoch": 4.401305057096248, + "grad_norm": 0.08070327341556549, + "learning_rate": 3.4355924542108716e-05, + "loss": 0.0332, + "num_input_tokens_seen": 58252720, + "step": 26980 + }, + { + "epoch": 4.402120717781403, + "grad_norm": 1.7117465734481812, + "learning_rate": 3.4349323422404444e-05, + "loss": 0.1092, + "num_input_tokens_seen": 58263216, + "step": 26985 + }, + { + "epoch": 4.402936378466558, + "grad_norm": 0.12779639661312103, + "learning_rate": 3.434272154479657e-05, + "loss": 0.025, + "num_input_tokens_seen": 58274320, + "step": 26990 + }, + { + "epoch": 4.403752039151713, + "grad_norm": 0.11658629029989243, + "learning_rate": 3.4336118909820295e-05, + "loss": 0.2183, + "num_input_tokens_seen": 58285392, + "step": 26995 + }, + { + "epoch": 4.404567699836868, + "grad_norm": 1.0423623323440552, + "learning_rate": 3.432951551801084e-05, + "loss": 0.1994, + "num_input_tokens_seen": 58295728, + "step": 27000 + }, + { + "epoch": 4.4053833605220225, + "grad_norm": 0.15781480073928833, + "learning_rate": 3.432291136990352e-05, + "loss": 0.1423, + "num_input_tokens_seen": 58306704, + "step": 27005 + }, + { + "epoch": 4.406199021207178, + "grad_norm": 0.42623457312583923, + "learning_rate": 3.4316306466033704e-05, + "loss": 0.1602, + "num_input_tokens_seen": 58318288, + "step": 27010 + }, + { + "epoch": 4.407014681892333, + "grad_norm": 0.6194913387298584, + "learning_rate": 3.430970080693681e-05, + "loss": 0.1255, + "num_input_tokens_seen": 58330416, + "step": 27015 + }, + { + "epoch": 4.407830342577488, + "grad_norm": 1.5639581680297852, + "learning_rate": 3.430309439314834e-05, + "loss": 0.1379, + "num_input_tokens_seen": 58341456, + "step": 27020 + }, + { + "epoch": 4.408646003262643, + "grad_norm": 0.19171683490276337, + "learning_rate": 3.4296487225203825e-05, + "loss": 0.1293, + "num_input_tokens_seen": 58351568, + "step": 27025 + }, + { + "epoch": 4.4094616639477975, + "grad_norm": 1.249830961227417, + "learning_rate": 3.42898793036389e-05, + "loss": 0.2383, + "num_input_tokens_seen": 58361968, + "step": 27030 + }, + { + "epoch": 4.410277324632952, + "grad_norm": 0.48567724227905273, + "learning_rate": 3.428327062898921e-05, + "loss": 0.1292, + "num_input_tokens_seen": 58373200, + "step": 27035 + }, + { + "epoch": 4.411092985318108, + "grad_norm": 0.23019640147686005, + "learning_rate": 3.4276661201790506e-05, + "loss": 0.0649, + "num_input_tokens_seen": 58383632, + "step": 27040 + }, + { + "epoch": 4.411908646003263, + "grad_norm": 1.212049961090088, + "learning_rate": 3.427005102257857e-05, + "loss": 0.1752, + "num_input_tokens_seen": 58394768, + "step": 27045 + }, + { + "epoch": 4.412724306688418, + "grad_norm": 1.2833452224731445, + "learning_rate": 3.426344009188927e-05, + "loss": 0.0733, + "num_input_tokens_seen": 58403856, + "step": 27050 + }, + { + "epoch": 4.4135399673735725, + "grad_norm": 1.628224492073059, + "learning_rate": 3.425682841025851e-05, + "loss": 0.193, + "num_input_tokens_seen": 58415152, + "step": 27055 + }, + { + "epoch": 4.414355628058727, + "grad_norm": 1.3059850931167603, + "learning_rate": 3.4250215978222264e-05, + "loss": 0.075, + "num_input_tokens_seen": 58425584, + "step": 27060 + }, + { + "epoch": 4.415171288743883, + "grad_norm": 0.18833720684051514, + "learning_rate": 3.424360279631659e-05, + "loss": 0.1464, + "num_input_tokens_seen": 58435088, + "step": 27065 + }, + { + "epoch": 4.415986949429038, + "grad_norm": 0.2394917905330658, + "learning_rate": 3.423698886507756e-05, + "loss": 0.118, + "num_input_tokens_seen": 58447088, + "step": 27070 + }, + { + "epoch": 4.416802610114193, + "grad_norm": 1.9370800256729126, + "learning_rate": 3.4230374185041346e-05, + "loss": 0.1995, + "num_input_tokens_seen": 58458928, + "step": 27075 + }, + { + "epoch": 4.417618270799347, + "grad_norm": 0.31403490900993347, + "learning_rate": 3.4223758756744176e-05, + "loss": 0.1446, + "num_input_tokens_seen": 58470224, + "step": 27080 + }, + { + "epoch": 4.418433931484502, + "grad_norm": 0.23613710701465607, + "learning_rate": 3.421714258072231e-05, + "loss": 0.0253, + "num_input_tokens_seen": 58480816, + "step": 27085 + }, + { + "epoch": 4.419249592169657, + "grad_norm": 0.7164018154144287, + "learning_rate": 3.421052565751209e-05, + "loss": 0.2045, + "num_input_tokens_seen": 58492528, + "step": 27090 + }, + { + "epoch": 4.420065252854813, + "grad_norm": 0.14931237697601318, + "learning_rate": 3.420390798764995e-05, + "loss": 0.0584, + "num_input_tokens_seen": 58502640, + "step": 27095 + }, + { + "epoch": 4.420880913539968, + "grad_norm": 0.7027750611305237, + "learning_rate": 3.4197289571672316e-05, + "loss": 0.0791, + "num_input_tokens_seen": 58514064, + "step": 27100 + }, + { + "epoch": 4.421696574225122, + "grad_norm": 0.320191353559494, + "learning_rate": 3.4190670410115724e-05, + "loss": 0.0279, + "num_input_tokens_seen": 58526032, + "step": 27105 + }, + { + "epoch": 4.422512234910277, + "grad_norm": 0.6264593601226807, + "learning_rate": 3.418405050351674e-05, + "loss": 0.0416, + "num_input_tokens_seen": 58537904, + "step": 27110 + }, + { + "epoch": 4.423327895595432, + "grad_norm": 1.469906210899353, + "learning_rate": 3.417742985241205e-05, + "loss": 0.2423, + "num_input_tokens_seen": 58548208, + "step": 27115 + }, + { + "epoch": 4.424143556280587, + "grad_norm": 0.14992353320121765, + "learning_rate": 3.417080845733831e-05, + "loss": 0.1966, + "num_input_tokens_seen": 58559568, + "step": 27120 + }, + { + "epoch": 4.424959216965743, + "grad_norm": 0.32697710394859314, + "learning_rate": 3.416418631883231e-05, + "loss": 0.133, + "num_input_tokens_seen": 58570576, + "step": 27125 + }, + { + "epoch": 4.425774877650897, + "grad_norm": 0.18551237881183624, + "learning_rate": 3.415756343743088e-05, + "loss": 0.1172, + "num_input_tokens_seen": 58581680, + "step": 27130 + }, + { + "epoch": 4.426590538336052, + "grad_norm": 0.1071120947599411, + "learning_rate": 3.4150939813670886e-05, + "loss": 0.0974, + "num_input_tokens_seen": 58593040, + "step": 27135 + }, + { + "epoch": 4.427406199021207, + "grad_norm": 0.032717082649469376, + "learning_rate": 3.414431544808928e-05, + "loss": 0.0361, + "num_input_tokens_seen": 58603728, + "step": 27140 + }, + { + "epoch": 4.428221859706362, + "grad_norm": 0.0985318124294281, + "learning_rate": 3.413769034122306e-05, + "loss": 0.0452, + "num_input_tokens_seen": 58614640, + "step": 27145 + }, + { + "epoch": 4.4290375203915175, + "grad_norm": 0.10255564004182816, + "learning_rate": 3.41310644936093e-05, + "loss": 0.0559, + "num_input_tokens_seen": 58623536, + "step": 27150 + }, + { + "epoch": 4.429853181076672, + "grad_norm": 1.4460506439208984, + "learning_rate": 3.412443790578512e-05, + "loss": 0.1218, + "num_input_tokens_seen": 58633648, + "step": 27155 + }, + { + "epoch": 4.430668841761827, + "grad_norm": 0.09258942306041718, + "learning_rate": 3.4117810578287704e-05, + "loss": 0.1462, + "num_input_tokens_seen": 58644624, + "step": 27160 + }, + { + "epoch": 4.431484502446982, + "grad_norm": 0.2656846046447754, + "learning_rate": 3.411118251165431e-05, + "loss": 0.0621, + "num_input_tokens_seen": 58655344, + "step": 27165 + }, + { + "epoch": 4.432300163132137, + "grad_norm": 0.8349721431732178, + "learning_rate": 3.410455370642221e-05, + "loss": 0.099, + "num_input_tokens_seen": 58666448, + "step": 27170 + }, + { + "epoch": 4.433115823817292, + "grad_norm": 0.17413672804832458, + "learning_rate": 3.409792416312881e-05, + "loss": 0.1219, + "num_input_tokens_seen": 58678672, + "step": 27175 + }, + { + "epoch": 4.433931484502447, + "grad_norm": 0.37481555342674255, + "learning_rate": 3.409129388231151e-05, + "loss": 0.0537, + "num_input_tokens_seen": 58690672, + "step": 27180 + }, + { + "epoch": 4.434747145187602, + "grad_norm": 2.170048713684082, + "learning_rate": 3.40846628645078e-05, + "loss": 0.1988, + "num_input_tokens_seen": 58702320, + "step": 27185 + }, + { + "epoch": 4.435562805872757, + "grad_norm": 0.35049837827682495, + "learning_rate": 3.407803111025522e-05, + "loss": 0.1341, + "num_input_tokens_seen": 58714192, + "step": 27190 + }, + { + "epoch": 4.436378466557912, + "grad_norm": 0.07406556606292725, + "learning_rate": 3.407139862009138e-05, + "loss": 0.1537, + "num_input_tokens_seen": 58725488, + "step": 27195 + }, + { + "epoch": 4.437194127243067, + "grad_norm": 0.7855865955352783, + "learning_rate": 3.406476539455394e-05, + "loss": 0.1328, + "num_input_tokens_seen": 58736208, + "step": 27200 + }, + { + "epoch": 4.438009787928221, + "grad_norm": 0.3180878460407257, + "learning_rate": 3.405813143418062e-05, + "loss": 0.0465, + "num_input_tokens_seen": 58746608, + "step": 27205 + }, + { + "epoch": 4.438825448613377, + "grad_norm": 0.3696017265319824, + "learning_rate": 3.4051496739509216e-05, + "loss": 0.2101, + "num_input_tokens_seen": 58756880, + "step": 27210 + }, + { + "epoch": 4.439641109298532, + "grad_norm": 0.5498877763748169, + "learning_rate": 3.404486131107754e-05, + "loss": 0.0247, + "num_input_tokens_seen": 58768080, + "step": 27215 + }, + { + "epoch": 4.440456769983687, + "grad_norm": 1.4799346923828125, + "learning_rate": 3.403822514942353e-05, + "loss": 0.2006, + "num_input_tokens_seen": 58778896, + "step": 27220 + }, + { + "epoch": 4.441272430668842, + "grad_norm": 2.7045750617980957, + "learning_rate": 3.4031588255085126e-05, + "loss": 0.3228, + "num_input_tokens_seen": 58790032, + "step": 27225 + }, + { + "epoch": 4.442088091353996, + "grad_norm": 0.3792399764060974, + "learning_rate": 3.4024950628600345e-05, + "loss": 0.0755, + "num_input_tokens_seen": 58800880, + "step": 27230 + }, + { + "epoch": 4.442903752039152, + "grad_norm": 0.3723272681236267, + "learning_rate": 3.401831227050728e-05, + "loss": 0.0615, + "num_input_tokens_seen": 58811760, + "step": 27235 + }, + { + "epoch": 4.443719412724307, + "grad_norm": 0.4622465968132019, + "learning_rate": 3.401167318134406e-05, + "loss": 0.1811, + "num_input_tokens_seen": 58821808, + "step": 27240 + }, + { + "epoch": 4.444535073409462, + "grad_norm": 0.5726256966590881, + "learning_rate": 3.400503336164888e-05, + "loss": 0.1616, + "num_input_tokens_seen": 58832080, + "step": 27245 + }, + { + "epoch": 4.445350734094617, + "grad_norm": 0.3024379014968872, + "learning_rate": 3.3998392811960024e-05, + "loss": 0.1917, + "num_input_tokens_seen": 58842480, + "step": 27250 + }, + { + "epoch": 4.446166394779771, + "grad_norm": 2.164541721343994, + "learning_rate": 3.399175153281578e-05, + "loss": 0.3343, + "num_input_tokens_seen": 58852752, + "step": 27255 + }, + { + "epoch": 4.446982055464926, + "grad_norm": 0.1484709531068802, + "learning_rate": 3.3985109524754535e-05, + "loss": 0.0949, + "num_input_tokens_seen": 58863312, + "step": 27260 + }, + { + "epoch": 4.447797716150082, + "grad_norm": 0.8146542310714722, + "learning_rate": 3.397846678831472e-05, + "loss": 0.1133, + "num_input_tokens_seen": 58873104, + "step": 27265 + }, + { + "epoch": 4.448613376835237, + "grad_norm": 1.0596978664398193, + "learning_rate": 3.397182332403482e-05, + "loss": 0.0647, + "num_input_tokens_seen": 58884624, + "step": 27270 + }, + { + "epoch": 4.4494290375203915, + "grad_norm": 0.5097349882125854, + "learning_rate": 3.3965179132453416e-05, + "loss": 0.0614, + "num_input_tokens_seen": 58894864, + "step": 27275 + }, + { + "epoch": 4.450244698205546, + "grad_norm": 2.0162887573242188, + "learning_rate": 3.3958534214109095e-05, + "loss": 0.209, + "num_input_tokens_seen": 58906512, + "step": 27280 + }, + { + "epoch": 4.451060358890701, + "grad_norm": 0.0755825787782669, + "learning_rate": 3.395188856954054e-05, + "loss": 0.1233, + "num_input_tokens_seen": 58917904, + "step": 27285 + }, + { + "epoch": 4.451876019575856, + "grad_norm": 0.9538062214851379, + "learning_rate": 3.394524219928647e-05, + "loss": 0.1269, + "num_input_tokens_seen": 58928304, + "step": 27290 + }, + { + "epoch": 4.452691680261012, + "grad_norm": 0.11138822883367538, + "learning_rate": 3.3938595103885684e-05, + "loss": 0.0454, + "num_input_tokens_seen": 58939056, + "step": 27295 + }, + { + "epoch": 4.4535073409461665, + "grad_norm": 0.3853479325771332, + "learning_rate": 3.393194728387702e-05, + "loss": 0.1623, + "num_input_tokens_seen": 58950384, + "step": 27300 + }, + { + "epoch": 4.454323001631321, + "grad_norm": 0.04868115112185478, + "learning_rate": 3.39252987397994e-05, + "loss": 0.0805, + "num_input_tokens_seen": 58959952, + "step": 27305 + }, + { + "epoch": 4.455138662316476, + "grad_norm": 1.0670331716537476, + "learning_rate": 3.391864947219177e-05, + "loss": 0.0765, + "num_input_tokens_seen": 58970288, + "step": 27310 + }, + { + "epoch": 4.455954323001631, + "grad_norm": 0.311469703912735, + "learning_rate": 3.391199948159315e-05, + "loss": 0.1295, + "num_input_tokens_seen": 58981936, + "step": 27315 + }, + { + "epoch": 4.456769983686787, + "grad_norm": 0.028117846697568893, + "learning_rate": 3.390534876854265e-05, + "loss": 0.0602, + "num_input_tokens_seen": 58992432, + "step": 27320 + }, + { + "epoch": 4.4575856443719415, + "grad_norm": 0.19743657112121582, + "learning_rate": 3.389869733357939e-05, + "loss": 0.0196, + "num_input_tokens_seen": 59002832, + "step": 27325 + }, + { + "epoch": 4.458401305057096, + "grad_norm": 0.5645349025726318, + "learning_rate": 3.389204517724256e-05, + "loss": 0.0518, + "num_input_tokens_seen": 59013936, + "step": 27330 + }, + { + "epoch": 4.459216965742251, + "grad_norm": 0.20682115852832794, + "learning_rate": 3.388539230007145e-05, + "loss": 0.0698, + "num_input_tokens_seen": 59023440, + "step": 27335 + }, + { + "epoch": 4.460032626427406, + "grad_norm": 1.0605493783950806, + "learning_rate": 3.387873870260534e-05, + "loss": 0.1101, + "num_input_tokens_seen": 59034224, + "step": 27340 + }, + { + "epoch": 4.460848287112561, + "grad_norm": 0.4958038330078125, + "learning_rate": 3.3872084385383626e-05, + "loss": 0.12, + "num_input_tokens_seen": 59044016, + "step": 27345 + }, + { + "epoch": 4.4616639477977165, + "grad_norm": 0.05505606532096863, + "learning_rate": 3.3865429348945735e-05, + "loss": 0.0648, + "num_input_tokens_seen": 59053648, + "step": 27350 + }, + { + "epoch": 4.462479608482871, + "grad_norm": 0.22912542521953583, + "learning_rate": 3.385877359383116e-05, + "loss": 0.1384, + "num_input_tokens_seen": 59064080, + "step": 27355 + }, + { + "epoch": 4.463295269168026, + "grad_norm": 1.0505529642105103, + "learning_rate": 3.385211712057945e-05, + "loss": 0.2588, + "num_input_tokens_seen": 59074672, + "step": 27360 + }, + { + "epoch": 4.464110929853181, + "grad_norm": 1.390404462814331, + "learning_rate": 3.384545992973021e-05, + "loss": 0.2052, + "num_input_tokens_seen": 59084784, + "step": 27365 + }, + { + "epoch": 4.464926590538336, + "grad_norm": 0.9572387337684631, + "learning_rate": 3.383880202182311e-05, + "loss": 0.1103, + "num_input_tokens_seen": 59095504, + "step": 27370 + }, + { + "epoch": 4.465742251223491, + "grad_norm": 0.08853822946548462, + "learning_rate": 3.3832143397397855e-05, + "loss": 0.0381, + "num_input_tokens_seen": 59105616, + "step": 27375 + }, + { + "epoch": 4.466557911908646, + "grad_norm": 6.104022026062012, + "learning_rate": 3.382548405699426e-05, + "loss": 0.2123, + "num_input_tokens_seen": 59114576, + "step": 27380 + }, + { + "epoch": 4.467373572593801, + "grad_norm": 0.07109697163105011, + "learning_rate": 3.3818824001152135e-05, + "loss": 0.1642, + "num_input_tokens_seen": 59124976, + "step": 27385 + }, + { + "epoch": 4.468189233278956, + "grad_norm": 0.1874253749847412, + "learning_rate": 3.38121632304114e-05, + "loss": 0.0707, + "num_input_tokens_seen": 59136336, + "step": 27390 + }, + { + "epoch": 4.469004893964111, + "grad_norm": 0.3087076246738434, + "learning_rate": 3.3805501745312e-05, + "loss": 0.1213, + "num_input_tokens_seen": 59146960, + "step": 27395 + }, + { + "epoch": 4.4698205546492655, + "grad_norm": 0.8427400588989258, + "learning_rate": 3.379883954639394e-05, + "loss": 0.049, + "num_input_tokens_seen": 59158096, + "step": 27400 + }, + { + "epoch": 4.470636215334421, + "grad_norm": 0.07492333650588989, + "learning_rate": 3.379217663419731e-05, + "loss": 0.0908, + "num_input_tokens_seen": 59168912, + "step": 27405 + }, + { + "epoch": 4.471451876019576, + "grad_norm": 0.09221696108579636, + "learning_rate": 3.378551300926222e-05, + "loss": 0.0488, + "num_input_tokens_seen": 59179728, + "step": 27410 + }, + { + "epoch": 4.472267536704731, + "grad_norm": 1.7877833843231201, + "learning_rate": 3.3778848672128884e-05, + "loss": 0.2264, + "num_input_tokens_seen": 59191312, + "step": 27415 + }, + { + "epoch": 4.473083197389886, + "grad_norm": 1.8451799154281616, + "learning_rate": 3.3772183623337524e-05, + "loss": 0.066, + "num_input_tokens_seen": 59202352, + "step": 27420 + }, + { + "epoch": 4.4738988580750405, + "grad_norm": 0.15486983954906464, + "learning_rate": 3.3765517863428456e-05, + "loss": 0.0779, + "num_input_tokens_seen": 59212560, + "step": 27425 + }, + { + "epoch": 4.474714518760196, + "grad_norm": 0.5240895748138428, + "learning_rate": 3.375885139294202e-05, + "loss": 0.1435, + "num_input_tokens_seen": 59221968, + "step": 27430 + }, + { + "epoch": 4.475530179445351, + "grad_norm": 0.6805318593978882, + "learning_rate": 3.375218421241866e-05, + "loss": 0.1763, + "num_input_tokens_seen": 59232848, + "step": 27435 + }, + { + "epoch": 4.476345840130506, + "grad_norm": 0.09250953048467636, + "learning_rate": 3.3745516322398834e-05, + "loss": 0.137, + "num_input_tokens_seen": 59243984, + "step": 27440 + }, + { + "epoch": 4.477161500815661, + "grad_norm": 0.2321421056985855, + "learning_rate": 3.373884772342308e-05, + "loss": 0.1219, + "num_input_tokens_seen": 59253840, + "step": 27445 + }, + { + "epoch": 4.4779771615008155, + "grad_norm": 0.10636953264474869, + "learning_rate": 3.3732178416032e-05, + "loss": 0.0905, + "num_input_tokens_seen": 59264400, + "step": 27450 + }, + { + "epoch": 4.47879282218597, + "grad_norm": 1.4315314292907715, + "learning_rate": 3.372550840076622e-05, + "loss": 0.23, + "num_input_tokens_seen": 59275728, + "step": 27455 + }, + { + "epoch": 4.479608482871126, + "grad_norm": 0.09869086742401123, + "learning_rate": 3.371883767816646e-05, + "loss": 0.129, + "num_input_tokens_seen": 59286896, + "step": 27460 + }, + { + "epoch": 4.480424143556281, + "grad_norm": 1.534591555595398, + "learning_rate": 3.371216624877348e-05, + "loss": 0.1295, + "num_input_tokens_seen": 59297808, + "step": 27465 + }, + { + "epoch": 4.481239804241436, + "grad_norm": 0.33429425954818726, + "learning_rate": 3.370549411312809e-05, + "loss": 0.0993, + "num_input_tokens_seen": 59308752, + "step": 27470 + }, + { + "epoch": 4.4820554649265905, + "grad_norm": 1.1591269969940186, + "learning_rate": 3.3698821271771186e-05, + "loss": 0.1189, + "num_input_tokens_seen": 59318320, + "step": 27475 + }, + { + "epoch": 4.482871125611745, + "grad_norm": 0.9904152154922485, + "learning_rate": 3.369214772524369e-05, + "loss": 0.1044, + "num_input_tokens_seen": 59328464, + "step": 27480 + }, + { + "epoch": 4.4836867862969, + "grad_norm": 0.7803117632865906, + "learning_rate": 3.3685473474086584e-05, + "loss": 0.1413, + "num_input_tokens_seen": 59338000, + "step": 27485 + }, + { + "epoch": 4.484502446982056, + "grad_norm": 0.5399446487426758, + "learning_rate": 3.3678798518840946e-05, + "loss": 0.1378, + "num_input_tokens_seen": 59347792, + "step": 27490 + }, + { + "epoch": 4.485318107667211, + "grad_norm": 0.2603670060634613, + "learning_rate": 3.367212286004786e-05, + "loss": 0.041, + "num_input_tokens_seen": 59357648, + "step": 27495 + }, + { + "epoch": 4.486133768352365, + "grad_norm": 0.0646490827202797, + "learning_rate": 3.366544649824849e-05, + "loss": 0.0431, + "num_input_tokens_seen": 59367760, + "step": 27500 + }, + { + "epoch": 4.48694942903752, + "grad_norm": 0.3719237744808197, + "learning_rate": 3.365876943398406e-05, + "loss": 0.0951, + "num_input_tokens_seen": 59378416, + "step": 27505 + }, + { + "epoch": 4.487765089722675, + "grad_norm": 0.5248892307281494, + "learning_rate": 3.365209166779585e-05, + "loss": 0.0852, + "num_input_tokens_seen": 59388944, + "step": 27510 + }, + { + "epoch": 4.488580750407831, + "grad_norm": 0.13497211039066315, + "learning_rate": 3.3645413200225175e-05, + "loss": 0.1171, + "num_input_tokens_seen": 59397776, + "step": 27515 + }, + { + "epoch": 4.489396411092986, + "grad_norm": 0.31091538071632385, + "learning_rate": 3.363873403181346e-05, + "loss": 0.0125, + "num_input_tokens_seen": 59408464, + "step": 27520 + }, + { + "epoch": 4.49021207177814, + "grad_norm": 0.10311544686555862, + "learning_rate": 3.363205416310212e-05, + "loss": 0.2333, + "num_input_tokens_seen": 59419888, + "step": 27525 + }, + { + "epoch": 4.491027732463295, + "grad_norm": 0.025290582329034805, + "learning_rate": 3.362537359463267e-05, + "loss": 0.128, + "num_input_tokens_seen": 59429712, + "step": 27530 + }, + { + "epoch": 4.49184339314845, + "grad_norm": 0.34522148966789246, + "learning_rate": 3.361869232694666e-05, + "loss": 0.0912, + "num_input_tokens_seen": 59440688, + "step": 27535 + }, + { + "epoch": 4.492659053833605, + "grad_norm": 0.06391511857509613, + "learning_rate": 3.3612010360585744e-05, + "loss": 0.08, + "num_input_tokens_seen": 59451120, + "step": 27540 + }, + { + "epoch": 4.493474714518761, + "grad_norm": 0.05280837044119835, + "learning_rate": 3.360532769609156e-05, + "loss": 0.0911, + "num_input_tokens_seen": 59461776, + "step": 27545 + }, + { + "epoch": 4.494290375203915, + "grad_norm": 0.15489310026168823, + "learning_rate": 3.359864433400585e-05, + "loss": 0.2163, + "num_input_tokens_seen": 59472464, + "step": 27550 + }, + { + "epoch": 4.49510603588907, + "grad_norm": 0.5672360062599182, + "learning_rate": 3.3591960274870394e-05, + "loss": 0.2574, + "num_input_tokens_seen": 59482704, + "step": 27555 + }, + { + "epoch": 4.495921696574225, + "grad_norm": 0.8707929849624634, + "learning_rate": 3.3585275519227046e-05, + "loss": 0.1706, + "num_input_tokens_seen": 59492976, + "step": 27560 + }, + { + "epoch": 4.49673735725938, + "grad_norm": 0.9699409008026123, + "learning_rate": 3.357859006761771e-05, + "loss": 0.1247, + "num_input_tokens_seen": 59503344, + "step": 27565 + }, + { + "epoch": 4.497553017944535, + "grad_norm": 0.4842979311943054, + "learning_rate": 3.357190392058433e-05, + "loss": 0.0291, + "num_input_tokens_seen": 59514896, + "step": 27570 + }, + { + "epoch": 4.49836867862969, + "grad_norm": 0.8894584774971008, + "learning_rate": 3.356521707866893e-05, + "loss": 0.2601, + "num_input_tokens_seen": 59527408, + "step": 27575 + }, + { + "epoch": 4.499184339314845, + "grad_norm": 1.2285726070404053, + "learning_rate": 3.3558529542413574e-05, + "loss": 0.1905, + "num_input_tokens_seen": 59537520, + "step": 27580 + }, + { + "epoch": 4.5, + "grad_norm": 0.7183090448379517, + "learning_rate": 3.3551841312360386e-05, + "loss": 0.3164, + "num_input_tokens_seen": 59549072, + "step": 27585 + }, + { + "epoch": 4.5, + "eval_loss": 0.13744661211967468, + "eval_runtime": 131.9034, + "eval_samples_per_second": 20.659, + "eval_steps_per_second": 5.17, + "num_input_tokens_seen": 59549072, + "step": 27585 + }, + { + "epoch": 4.500815660685155, + "grad_norm": 0.1749202013015747, + "learning_rate": 3.354515238905155e-05, + "loss": 0.0695, + "num_input_tokens_seen": 59559824, + "step": 27590 + }, + { + "epoch": 4.50163132137031, + "grad_norm": 0.5458588600158691, + "learning_rate": 3.35384627730293e-05, + "loss": 0.1207, + "num_input_tokens_seen": 59569776, + "step": 27595 + }, + { + "epoch": 4.502446982055465, + "grad_norm": 0.7018977403640747, + "learning_rate": 3.353177246483594e-05, + "loss": 0.1095, + "num_input_tokens_seen": 59579184, + "step": 27600 + }, + { + "epoch": 4.50326264274062, + "grad_norm": 0.20476390421390533, + "learning_rate": 3.352508146501381e-05, + "loss": 0.1016, + "num_input_tokens_seen": 59590416, + "step": 27605 + }, + { + "epoch": 4.504078303425775, + "grad_norm": 0.12445519864559174, + "learning_rate": 3.3518389774105326e-05, + "loss": 0.1048, + "num_input_tokens_seen": 59601136, + "step": 27610 + }, + { + "epoch": 4.50489396411093, + "grad_norm": 0.9597125053405762, + "learning_rate": 3.351169739265294e-05, + "loss": 0.1687, + "num_input_tokens_seen": 59612816, + "step": 27615 + }, + { + "epoch": 4.505709624796085, + "grad_norm": 0.22041702270507812, + "learning_rate": 3.350500432119917e-05, + "loss": 0.09, + "num_input_tokens_seen": 59624080, + "step": 27620 + }, + { + "epoch": 4.506525285481239, + "grad_norm": 0.491672158241272, + "learning_rate": 3.3498310560286604e-05, + "loss": 0.0465, + "num_input_tokens_seen": 59634608, + "step": 27625 + }, + { + "epoch": 4.507340946166395, + "grad_norm": 1.3196773529052734, + "learning_rate": 3.349161611045786e-05, + "loss": 0.0876, + "num_input_tokens_seen": 59645200, + "step": 27630 + }, + { + "epoch": 4.50815660685155, + "grad_norm": 0.25631749629974365, + "learning_rate": 3.348492097225563e-05, + "loss": 0.1476, + "num_input_tokens_seen": 59654736, + "step": 27635 + }, + { + "epoch": 4.508972267536705, + "grad_norm": 0.061140622943639755, + "learning_rate": 3.347822514622265e-05, + "loss": 0.1037, + "num_input_tokens_seen": 59664624, + "step": 27640 + }, + { + "epoch": 4.50978792822186, + "grad_norm": 0.06419696658849716, + "learning_rate": 3.347152863290173e-05, + "loss": 0.2061, + "num_input_tokens_seen": 59674480, + "step": 27645 + }, + { + "epoch": 4.510603588907014, + "grad_norm": 0.6134275794029236, + "learning_rate": 3.346483143283571e-05, + "loss": 0.0522, + "num_input_tokens_seen": 59685392, + "step": 27650 + }, + { + "epoch": 4.511419249592169, + "grad_norm": 0.6463528871536255, + "learning_rate": 3.3458133546567506e-05, + "loss": 0.0548, + "num_input_tokens_seen": 59695280, + "step": 27655 + }, + { + "epoch": 4.512234910277325, + "grad_norm": 0.42874494194984436, + "learning_rate": 3.345143497464007e-05, + "loss": 0.1318, + "num_input_tokens_seen": 59705872, + "step": 27660 + }, + { + "epoch": 4.51305057096248, + "grad_norm": 0.09881189465522766, + "learning_rate": 3.344473571759645e-05, + "loss": 0.0509, + "num_input_tokens_seen": 59717936, + "step": 27665 + }, + { + "epoch": 4.513866231647635, + "grad_norm": 2.6111817359924316, + "learning_rate": 3.343803577597969e-05, + "loss": 0.0984, + "num_input_tokens_seen": 59728144, + "step": 27670 + }, + { + "epoch": 4.514681892332789, + "grad_norm": 0.7452704310417175, + "learning_rate": 3.343133515033295e-05, + "loss": 0.1179, + "num_input_tokens_seen": 59739888, + "step": 27675 + }, + { + "epoch": 4.515497553017944, + "grad_norm": 0.018707316368818283, + "learning_rate": 3.342463384119939e-05, + "loss": 0.0308, + "num_input_tokens_seen": 59750320, + "step": 27680 + }, + { + "epoch": 4.5163132137031, + "grad_norm": 0.2588053047657013, + "learning_rate": 3.3417931849122275e-05, + "loss": 0.116, + "num_input_tokens_seen": 59761520, + "step": 27685 + }, + { + "epoch": 4.517128874388255, + "grad_norm": 0.7174596786499023, + "learning_rate": 3.341122917464489e-05, + "loss": 0.1136, + "num_input_tokens_seen": 59771632, + "step": 27690 + }, + { + "epoch": 4.5179445350734095, + "grad_norm": 0.9163461923599243, + "learning_rate": 3.340452581831057e-05, + "loss": 0.1107, + "num_input_tokens_seen": 59781744, + "step": 27695 + }, + { + "epoch": 4.518760195758564, + "grad_norm": 1.0487594604492188, + "learning_rate": 3.3397821780662764e-05, + "loss": 0.1001, + "num_input_tokens_seen": 59791760, + "step": 27700 + }, + { + "epoch": 4.519575856443719, + "grad_norm": 0.2954835891723633, + "learning_rate": 3.3391117062244913e-05, + "loss": 0.1065, + "num_input_tokens_seen": 59801872, + "step": 27705 + }, + { + "epoch": 4.520391517128875, + "grad_norm": 0.3776164650917053, + "learning_rate": 3.338441166360054e-05, + "loss": 0.1586, + "num_input_tokens_seen": 59811152, + "step": 27710 + }, + { + "epoch": 4.52120717781403, + "grad_norm": 0.2662873864173889, + "learning_rate": 3.33777055852732e-05, + "loss": 0.0495, + "num_input_tokens_seen": 59821264, + "step": 27715 + }, + { + "epoch": 4.5220228384991845, + "grad_norm": 1.1023070812225342, + "learning_rate": 3.3370998827806543e-05, + "loss": 0.1407, + "num_input_tokens_seen": 59832336, + "step": 27720 + }, + { + "epoch": 4.522838499184339, + "grad_norm": 0.7153282761573792, + "learning_rate": 3.336429139174425e-05, + "loss": 0.0582, + "num_input_tokens_seen": 59842288, + "step": 27725 + }, + { + "epoch": 4.523654159869494, + "grad_norm": 1.792145848274231, + "learning_rate": 3.335758327763006e-05, + "loss": 0.1358, + "num_input_tokens_seen": 59852912, + "step": 27730 + }, + { + "epoch": 4.524469820554649, + "grad_norm": 0.1738814413547516, + "learning_rate": 3.335087448600776e-05, + "loss": 0.0795, + "num_input_tokens_seen": 59863280, + "step": 27735 + }, + { + "epoch": 4.525285481239804, + "grad_norm": 0.22113284468650818, + "learning_rate": 3.33441650174212e-05, + "loss": 0.0759, + "num_input_tokens_seen": 59874288, + "step": 27740 + }, + { + "epoch": 4.5261011419249595, + "grad_norm": 0.9211616516113281, + "learning_rate": 3.3337454872414294e-05, + "loss": 0.0934, + "num_input_tokens_seen": 59886000, + "step": 27745 + }, + { + "epoch": 4.526916802610114, + "grad_norm": 0.7618114948272705, + "learning_rate": 3.333074405153098e-05, + "loss": 0.0571, + "num_input_tokens_seen": 59896432, + "step": 27750 + }, + { + "epoch": 4.527732463295269, + "grad_norm": 0.5754401087760925, + "learning_rate": 3.332403255531529e-05, + "loss": 0.1998, + "num_input_tokens_seen": 59906704, + "step": 27755 + }, + { + "epoch": 4.528548123980424, + "grad_norm": 0.9458719491958618, + "learning_rate": 3.331732038431129e-05, + "loss": 0.0735, + "num_input_tokens_seen": 59917392, + "step": 27760 + }, + { + "epoch": 4.529363784665579, + "grad_norm": 0.4538721740245819, + "learning_rate": 3.3310607539063096e-05, + "loss": 0.1825, + "num_input_tokens_seen": 59927984, + "step": 27765 + }, + { + "epoch": 4.5301794453507345, + "grad_norm": 0.3901679515838623, + "learning_rate": 3.3303894020114886e-05, + "loss": 0.1327, + "num_input_tokens_seen": 59938160, + "step": 27770 + }, + { + "epoch": 4.530995106035889, + "grad_norm": 0.43520236015319824, + "learning_rate": 3.329717982801089e-05, + "loss": 0.0496, + "num_input_tokens_seen": 59950000, + "step": 27775 + }, + { + "epoch": 4.531810766721044, + "grad_norm": 0.5135898590087891, + "learning_rate": 3.32904649632954e-05, + "loss": 0.0505, + "num_input_tokens_seen": 59960720, + "step": 27780 + }, + { + "epoch": 4.532626427406199, + "grad_norm": 0.05132293701171875, + "learning_rate": 3.328374942651275e-05, + "loss": 0.0133, + "num_input_tokens_seen": 59970864, + "step": 27785 + }, + { + "epoch": 4.533442088091354, + "grad_norm": 1.7838002443313599, + "learning_rate": 3.3277033218207346e-05, + "loss": 0.1619, + "num_input_tokens_seen": 59982256, + "step": 27790 + }, + { + "epoch": 4.5342577487765094, + "grad_norm": 0.06523383408784866, + "learning_rate": 3.327031633892363e-05, + "loss": 0.2061, + "num_input_tokens_seen": 59993584, + "step": 27795 + }, + { + "epoch": 4.535073409461664, + "grad_norm": 0.2009391337633133, + "learning_rate": 3.32635987892061e-05, + "loss": 0.0562, + "num_input_tokens_seen": 60004720, + "step": 27800 + }, + { + "epoch": 4.535889070146819, + "grad_norm": 0.5938424468040466, + "learning_rate": 3.3256880569599335e-05, + "loss": 0.1996, + "num_input_tokens_seen": 60015120, + "step": 27805 + }, + { + "epoch": 4.536704730831974, + "grad_norm": 0.0976007804274559, + "learning_rate": 3.325016168064794e-05, + "loss": 0.0326, + "num_input_tokens_seen": 60024752, + "step": 27810 + }, + { + "epoch": 4.537520391517129, + "grad_norm": 1.135041356086731, + "learning_rate": 3.324344212289657e-05, + "loss": 0.1215, + "num_input_tokens_seen": 60037072, + "step": 27815 + }, + { + "epoch": 4.5383360522022835, + "grad_norm": 0.12968742847442627, + "learning_rate": 3.3236721896889954e-05, + "loss": 0.1112, + "num_input_tokens_seen": 60048144, + "step": 27820 + }, + { + "epoch": 4.539151712887438, + "grad_norm": 0.23278973996639252, + "learning_rate": 3.323000100317287e-05, + "loss": 0.1638, + "num_input_tokens_seen": 60058544, + "step": 27825 + }, + { + "epoch": 4.539967373572594, + "grad_norm": 0.7902299761772156, + "learning_rate": 3.3223279442290146e-05, + "loss": 0.0493, + "num_input_tokens_seen": 60068688, + "step": 27830 + }, + { + "epoch": 4.540783034257749, + "grad_norm": 0.4219397008419037, + "learning_rate": 3.321655721478667e-05, + "loss": 0.0573, + "num_input_tokens_seen": 60079664, + "step": 27835 + }, + { + "epoch": 4.541598694942904, + "grad_norm": 0.8390363454818726, + "learning_rate": 3.320983432120737e-05, + "loss": 0.1802, + "num_input_tokens_seen": 60089296, + "step": 27840 + }, + { + "epoch": 4.5424143556280585, + "grad_norm": 0.04833041504025459, + "learning_rate": 3.320311076209724e-05, + "loss": 0.1341, + "num_input_tokens_seen": 60099568, + "step": 27845 + }, + { + "epoch": 4.543230016313213, + "grad_norm": 0.5419262051582336, + "learning_rate": 3.3196386538001346e-05, + "loss": 0.092, + "num_input_tokens_seen": 60110160, + "step": 27850 + }, + { + "epoch": 4.544045676998369, + "grad_norm": 0.9166667461395264, + "learning_rate": 3.3189661649464754e-05, + "loss": 0.0473, + "num_input_tokens_seen": 60120880, + "step": 27855 + }, + { + "epoch": 4.544861337683524, + "grad_norm": 0.06209676340222359, + "learning_rate": 3.318293609703264e-05, + "loss": 0.0591, + "num_input_tokens_seen": 60132336, + "step": 27860 + }, + { + "epoch": 4.545676998368679, + "grad_norm": 0.26240047812461853, + "learning_rate": 3.3176209881250206e-05, + "loss": 0.0331, + "num_input_tokens_seen": 60142608, + "step": 27865 + }, + { + "epoch": 4.5464926590538335, + "grad_norm": 1.0854209661483765, + "learning_rate": 3.3169483002662714e-05, + "loss": 0.0653, + "num_input_tokens_seen": 60153744, + "step": 27870 + }, + { + "epoch": 4.547308319738988, + "grad_norm": 0.06623554974794388, + "learning_rate": 3.316275546181548e-05, + "loss": 0.0645, + "num_input_tokens_seen": 60166288, + "step": 27875 + }, + { + "epoch": 4.548123980424144, + "grad_norm": 1.9918785095214844, + "learning_rate": 3.315602725925387e-05, + "loss": 0.2114, + "num_input_tokens_seen": 60176208, + "step": 27880 + }, + { + "epoch": 4.548939641109299, + "grad_norm": 1.0009732246398926, + "learning_rate": 3.314929839552331e-05, + "loss": 0.0817, + "num_input_tokens_seen": 60187088, + "step": 27885 + }, + { + "epoch": 4.549755301794454, + "grad_norm": 0.3322462737560272, + "learning_rate": 3.314256887116927e-05, + "loss": 0.2491, + "num_input_tokens_seen": 60197488, + "step": 27890 + }, + { + "epoch": 4.5505709624796085, + "grad_norm": 0.7896372079849243, + "learning_rate": 3.313583868673728e-05, + "loss": 0.0595, + "num_input_tokens_seen": 60208816, + "step": 27895 + }, + { + "epoch": 4.551386623164763, + "grad_norm": 0.7047167420387268, + "learning_rate": 3.312910784277293e-05, + "loss": 0.0953, + "num_input_tokens_seen": 60219728, + "step": 27900 + }, + { + "epoch": 4.552202283849918, + "grad_norm": 0.7169449329376221, + "learning_rate": 3.312237633982185e-05, + "loss": 0.099, + "num_input_tokens_seen": 60230288, + "step": 27905 + }, + { + "epoch": 4.553017944535073, + "grad_norm": 0.0638008639216423, + "learning_rate": 3.3115644178429725e-05, + "loss": 0.0319, + "num_input_tokens_seen": 60242384, + "step": 27910 + }, + { + "epoch": 4.553833605220229, + "grad_norm": 0.030663613229990005, + "learning_rate": 3.310891135914231e-05, + "loss": 0.1345, + "num_input_tokens_seen": 60253360, + "step": 27915 + }, + { + "epoch": 4.554649265905383, + "grad_norm": 0.14774952828884125, + "learning_rate": 3.31021778825054e-05, + "loss": 0.2389, + "num_input_tokens_seen": 60263440, + "step": 27920 + }, + { + "epoch": 4.555464926590538, + "grad_norm": 1.2427139282226562, + "learning_rate": 3.309544374906484e-05, + "loss": 0.0542, + "num_input_tokens_seen": 60273392, + "step": 27925 + }, + { + "epoch": 4.556280587275693, + "grad_norm": 0.7691762447357178, + "learning_rate": 3.308870895936652e-05, + "loss": 0.2047, + "num_input_tokens_seen": 60285424, + "step": 27930 + }, + { + "epoch": 4.557096247960848, + "grad_norm": 1.2741413116455078, + "learning_rate": 3.308197351395643e-05, + "loss": 0.0951, + "num_input_tokens_seen": 60295248, + "step": 27935 + }, + { + "epoch": 4.557911908646004, + "grad_norm": 0.08047308027744293, + "learning_rate": 3.3075237413380545e-05, + "loss": 0.2071, + "num_input_tokens_seen": 60306320, + "step": 27940 + }, + { + "epoch": 4.558727569331158, + "grad_norm": 0.11959271132946014, + "learning_rate": 3.306850065818494e-05, + "loss": 0.0802, + "num_input_tokens_seen": 60315600, + "step": 27945 + }, + { + "epoch": 4.559543230016313, + "grad_norm": 0.041997626423835754, + "learning_rate": 3.3061763248915744e-05, + "loss": 0.2009, + "num_input_tokens_seen": 60326608, + "step": 27950 + }, + { + "epoch": 4.560358890701468, + "grad_norm": 0.12778837978839874, + "learning_rate": 3.305502518611911e-05, + "loss": 0.2546, + "num_input_tokens_seen": 60336944, + "step": 27955 + }, + { + "epoch": 4.561174551386623, + "grad_norm": 0.3559116721153259, + "learning_rate": 3.304828647034126e-05, + "loss": 0.1711, + "num_input_tokens_seen": 60348656, + "step": 27960 + }, + { + "epoch": 4.561990212071779, + "grad_norm": 1.110945701599121, + "learning_rate": 3.304154710212847e-05, + "loss": 0.0853, + "num_input_tokens_seen": 60359568, + "step": 27965 + }, + { + "epoch": 4.562805872756933, + "grad_norm": 1.093151569366455, + "learning_rate": 3.303480708202708e-05, + "loss": 0.1291, + "num_input_tokens_seen": 60370192, + "step": 27970 + }, + { + "epoch": 4.563621533442088, + "grad_norm": 0.543356716632843, + "learning_rate": 3.3028066410583456e-05, + "loss": 0.0925, + "num_input_tokens_seen": 60381136, + "step": 27975 + }, + { + "epoch": 4.564437194127243, + "grad_norm": 0.26652777194976807, + "learning_rate": 3.3021325088344036e-05, + "loss": 0.0896, + "num_input_tokens_seen": 60392304, + "step": 27980 + }, + { + "epoch": 4.565252854812398, + "grad_norm": 0.07132405042648315, + "learning_rate": 3.3014583115855304e-05, + "loss": 0.2285, + "num_input_tokens_seen": 60402544, + "step": 27985 + }, + { + "epoch": 4.566068515497553, + "grad_norm": 3.4920849800109863, + "learning_rate": 3.3007840493663794e-05, + "loss": 0.0792, + "num_input_tokens_seen": 60413488, + "step": 27990 + }, + { + "epoch": 4.566884176182708, + "grad_norm": 0.5244823694229126, + "learning_rate": 3.30010972223161e-05, + "loss": 0.1191, + "num_input_tokens_seen": 60423696, + "step": 27995 + }, + { + "epoch": 4.567699836867863, + "grad_norm": 0.5142679214477539, + "learning_rate": 3.2994353302358875e-05, + "loss": 0.0638, + "num_input_tokens_seen": 60434576, + "step": 28000 + }, + { + "epoch": 4.568515497553018, + "grad_norm": 1.760802149772644, + "learning_rate": 3.298760873433881e-05, + "loss": 0.2865, + "num_input_tokens_seen": 60443856, + "step": 28005 + }, + { + "epoch": 4.569331158238173, + "grad_norm": 1.5676339864730835, + "learning_rate": 3.298086351880265e-05, + "loss": 0.1295, + "num_input_tokens_seen": 60454864, + "step": 28010 + }, + { + "epoch": 4.570146818923328, + "grad_norm": 0.9332104325294495, + "learning_rate": 3.2974117656297194e-05, + "loss": 0.1664, + "num_input_tokens_seen": 60465456, + "step": 28015 + }, + { + "epoch": 4.5709624796084825, + "grad_norm": 0.39851951599121094, + "learning_rate": 3.2967371147369306e-05, + "loss": 0.1168, + "num_input_tokens_seen": 60476816, + "step": 28020 + }, + { + "epoch": 4.571778140293638, + "grad_norm": 0.688158392906189, + "learning_rate": 3.296062399256587e-05, + "loss": 0.1724, + "num_input_tokens_seen": 60488784, + "step": 28025 + }, + { + "epoch": 4.572593800978793, + "grad_norm": 0.6545007824897766, + "learning_rate": 3.295387619243389e-05, + "loss": 0.0605, + "num_input_tokens_seen": 60500656, + "step": 28030 + }, + { + "epoch": 4.573409461663948, + "grad_norm": 0.35214531421661377, + "learning_rate": 3.294712774752033e-05, + "loss": 0.049, + "num_input_tokens_seen": 60511664, + "step": 28035 + }, + { + "epoch": 4.574225122349103, + "grad_norm": 2.11287522315979, + "learning_rate": 3.2940378658372276e-05, + "loss": 0.2179, + "num_input_tokens_seen": 60522000, + "step": 28040 + }, + { + "epoch": 4.575040783034257, + "grad_norm": 0.4524390697479248, + "learning_rate": 3.293362892553684e-05, + "loss": 0.1006, + "num_input_tokens_seen": 60532688, + "step": 28045 + }, + { + "epoch": 4.575856443719413, + "grad_norm": 0.17525345087051392, + "learning_rate": 3.292687854956119e-05, + "loss": 0.1839, + "num_input_tokens_seen": 60543152, + "step": 28050 + }, + { + "epoch": 4.576672104404568, + "grad_norm": 0.41497501730918884, + "learning_rate": 3.292012753099254e-05, + "loss": 0.1304, + "num_input_tokens_seen": 60552272, + "step": 28055 + }, + { + "epoch": 4.577487765089723, + "grad_norm": 0.12479083985090256, + "learning_rate": 3.2913375870378165e-05, + "loss": 0.1428, + "num_input_tokens_seen": 60563888, + "step": 28060 + }, + { + "epoch": 4.578303425774878, + "grad_norm": 0.1557879000902176, + "learning_rate": 3.2906623568265396e-05, + "loss": 0.3302, + "num_input_tokens_seen": 60574576, + "step": 28065 + }, + { + "epoch": 4.579119086460032, + "grad_norm": 0.9482901096343994, + "learning_rate": 3.289987062520159e-05, + "loss": 0.0714, + "num_input_tokens_seen": 60584848, + "step": 28070 + }, + { + "epoch": 4.579934747145187, + "grad_norm": 1.1416925191879272, + "learning_rate": 3.28931170417342e-05, + "loss": 0.1295, + "num_input_tokens_seen": 60595312, + "step": 28075 + }, + { + "epoch": 4.580750407830343, + "grad_norm": 0.8453258872032166, + "learning_rate": 3.288636281841069e-05, + "loss": 0.133, + "num_input_tokens_seen": 60604848, + "step": 28080 + }, + { + "epoch": 4.581566068515498, + "grad_norm": 0.1357371062040329, + "learning_rate": 3.287960795577859e-05, + "loss": 0.1282, + "num_input_tokens_seen": 60614640, + "step": 28085 + }, + { + "epoch": 4.582381729200653, + "grad_norm": 2.0028440952301025, + "learning_rate": 3.2872852454385495e-05, + "loss": 0.1029, + "num_input_tokens_seen": 60624912, + "step": 28090 + }, + { + "epoch": 4.583197389885807, + "grad_norm": 0.7436177730560303, + "learning_rate": 3.2866096314779035e-05, + "loss": 0.033, + "num_input_tokens_seen": 60637968, + "step": 28095 + }, + { + "epoch": 4.584013050570962, + "grad_norm": 0.02785624749958515, + "learning_rate": 3.285933953750689e-05, + "loss": 0.1228, + "num_input_tokens_seen": 60648816, + "step": 28100 + }, + { + "epoch": 4.584828711256117, + "grad_norm": 0.498106986284256, + "learning_rate": 3.28525821231168e-05, + "loss": 0.1257, + "num_input_tokens_seen": 60659888, + "step": 28105 + }, + { + "epoch": 4.585644371941273, + "grad_norm": 1.2704261541366577, + "learning_rate": 3.284582407215657e-05, + "loss": 0.1526, + "num_input_tokens_seen": 60671472, + "step": 28110 + }, + { + "epoch": 4.5864600326264275, + "grad_norm": 1.4731998443603516, + "learning_rate": 3.283906538517403e-05, + "loss": 0.0964, + "num_input_tokens_seen": 60683696, + "step": 28115 + }, + { + "epoch": 4.587275693311582, + "grad_norm": 0.4558843672275543, + "learning_rate": 3.283230606271707e-05, + "loss": 0.0945, + "num_input_tokens_seen": 60694832, + "step": 28120 + }, + { + "epoch": 4.588091353996737, + "grad_norm": 0.3350542485713959, + "learning_rate": 3.2825546105333634e-05, + "loss": 0.1099, + "num_input_tokens_seen": 60706192, + "step": 28125 + }, + { + "epoch": 4.588907014681892, + "grad_norm": 0.10364656150341034, + "learning_rate": 3.281878551357174e-05, + "loss": 0.033, + "num_input_tokens_seen": 60716880, + "step": 28130 + }, + { + "epoch": 4.589722675367048, + "grad_norm": 1.8262451887130737, + "learning_rate": 3.281202428797941e-05, + "loss": 0.3445, + "num_input_tokens_seen": 60727792, + "step": 28135 + }, + { + "epoch": 4.5905383360522025, + "grad_norm": 0.5630049705505371, + "learning_rate": 3.2805262429104755e-05, + "loss": 0.1475, + "num_input_tokens_seen": 60738800, + "step": 28140 + }, + { + "epoch": 4.591353996737357, + "grad_norm": 1.1416552066802979, + "learning_rate": 3.279849993749593e-05, + "loss": 0.1645, + "num_input_tokens_seen": 60748336, + "step": 28145 + }, + { + "epoch": 4.592169657422512, + "grad_norm": 0.11095064878463745, + "learning_rate": 3.279173681370112e-05, + "loss": 0.1666, + "num_input_tokens_seen": 60759728, + "step": 28150 + }, + { + "epoch": 4.592985318107667, + "grad_norm": 1.262987494468689, + "learning_rate": 3.27849730582686e-05, + "loss": 0.1356, + "num_input_tokens_seen": 60771184, + "step": 28155 + }, + { + "epoch": 4.593800978792823, + "grad_norm": 0.10241828113794327, + "learning_rate": 3.2778208671746654e-05, + "loss": 0.0402, + "num_input_tokens_seen": 60782864, + "step": 28160 + }, + { + "epoch": 4.5946166394779775, + "grad_norm": 2.2916011810302734, + "learning_rate": 3.277144365468365e-05, + "loss": 0.2177, + "num_input_tokens_seen": 60794800, + "step": 28165 + }, + { + "epoch": 4.595432300163132, + "grad_norm": 0.04092436656355858, + "learning_rate": 3.2764678007627994e-05, + "loss": 0.0324, + "num_input_tokens_seen": 60806224, + "step": 28170 + }, + { + "epoch": 4.596247960848287, + "grad_norm": 0.019521571695804596, + "learning_rate": 3.275791173112814e-05, + "loss": 0.138, + "num_input_tokens_seen": 60817232, + "step": 28175 + }, + { + "epoch": 4.597063621533442, + "grad_norm": 0.38479185104370117, + "learning_rate": 3.2751144825732595e-05, + "loss": 0.1535, + "num_input_tokens_seen": 60828400, + "step": 28180 + }, + { + "epoch": 4.597879282218597, + "grad_norm": 0.3421945869922638, + "learning_rate": 3.274437729198992e-05, + "loss": 0.3309, + "num_input_tokens_seen": 60840080, + "step": 28185 + }, + { + "epoch": 4.598694942903752, + "grad_norm": 0.7071083188056946, + "learning_rate": 3.273760913044873e-05, + "loss": 0.0727, + "num_input_tokens_seen": 60851088, + "step": 28190 + }, + { + "epoch": 4.599510603588907, + "grad_norm": 0.34697964787483215, + "learning_rate": 3.273084034165769e-05, + "loss": 0.1047, + "num_input_tokens_seen": 60862096, + "step": 28195 + }, + { + "epoch": 4.600326264274062, + "grad_norm": 0.07816991209983826, + "learning_rate": 3.2724070926165495e-05, + "loss": 0.2031, + "num_input_tokens_seen": 60872848, + "step": 28200 + }, + { + "epoch": 4.601141924959217, + "grad_norm": 0.3034125864505768, + "learning_rate": 3.271730088452093e-05, + "loss": 0.0134, + "num_input_tokens_seen": 60883760, + "step": 28205 + }, + { + "epoch": 4.601957585644372, + "grad_norm": 1.7227879762649536, + "learning_rate": 3.2710530217272794e-05, + "loss": 0.152, + "num_input_tokens_seen": 60893456, + "step": 28210 + }, + { + "epoch": 4.602773246329527, + "grad_norm": 1.091416597366333, + "learning_rate": 3.270375892496995e-05, + "loss": 0.0887, + "num_input_tokens_seen": 60905136, + "step": 28215 + }, + { + "epoch": 4.603588907014682, + "grad_norm": 0.8031155467033386, + "learning_rate": 3.2696987008161325e-05, + "loss": 0.2074, + "num_input_tokens_seen": 60915088, + "step": 28220 + }, + { + "epoch": 4.604404567699837, + "grad_norm": 0.4350029528141022, + "learning_rate": 3.269021446739588e-05, + "loss": 0.0862, + "num_input_tokens_seen": 60926704, + "step": 28225 + }, + { + "epoch": 4.605220228384992, + "grad_norm": 2.1760308742523193, + "learning_rate": 3.268344130322262e-05, + "loss": 0.0789, + "num_input_tokens_seen": 60937808, + "step": 28230 + }, + { + "epoch": 4.606035889070147, + "grad_norm": 0.7521554827690125, + "learning_rate": 3.2676667516190634e-05, + "loss": 0.0717, + "num_input_tokens_seen": 60948112, + "step": 28235 + }, + { + "epoch": 4.6068515497553015, + "grad_norm": 0.42239710688591003, + "learning_rate": 3.266989310684902e-05, + "loss": 0.3502, + "num_input_tokens_seen": 60959312, + "step": 28240 + }, + { + "epoch": 4.607667210440457, + "grad_norm": 0.9908577799797058, + "learning_rate": 3.266311807574697e-05, + "loss": 0.0885, + "num_input_tokens_seen": 60970256, + "step": 28245 + }, + { + "epoch": 4.608482871125612, + "grad_norm": 0.040701545774936676, + "learning_rate": 3.265634242343367e-05, + "loss": 0.0461, + "num_input_tokens_seen": 60980752, + "step": 28250 + }, + { + "epoch": 4.609298531810767, + "grad_norm": 0.09981203079223633, + "learning_rate": 3.264956615045841e-05, + "loss": 0.0534, + "num_input_tokens_seen": 60991344, + "step": 28255 + }, + { + "epoch": 4.610114192495922, + "grad_norm": 0.3176724910736084, + "learning_rate": 3.26427892573705e-05, + "loss": 0.0387, + "num_input_tokens_seen": 61001264, + "step": 28260 + }, + { + "epoch": 4.6109298531810765, + "grad_norm": 0.054240498691797256, + "learning_rate": 3.263601174471932e-05, + "loss": 0.0151, + "num_input_tokens_seen": 61011408, + "step": 28265 + }, + { + "epoch": 4.611745513866231, + "grad_norm": 0.5754056572914124, + "learning_rate": 3.262923361305429e-05, + "loss": 0.0644, + "num_input_tokens_seen": 61020944, + "step": 28270 + }, + { + "epoch": 4.612561174551386, + "grad_norm": 0.6585829854011536, + "learning_rate": 3.262245486292486e-05, + "loss": 0.0827, + "num_input_tokens_seen": 61031248, + "step": 28275 + }, + { + "epoch": 4.613376835236542, + "grad_norm": 1.8633842468261719, + "learning_rate": 3.261567549488056e-05, + "loss": 0.234, + "num_input_tokens_seen": 61042320, + "step": 28280 + }, + { + "epoch": 4.614192495921697, + "grad_norm": 1.0077927112579346, + "learning_rate": 3.260889550947098e-05, + "loss": 0.2495, + "num_input_tokens_seen": 61053520, + "step": 28285 + }, + { + "epoch": 4.6150081566068515, + "grad_norm": 0.2986800968647003, + "learning_rate": 3.260211490724571e-05, + "loss": 0.0755, + "num_input_tokens_seen": 61062960, + "step": 28290 + }, + { + "epoch": 4.615823817292006, + "grad_norm": 1.072527527809143, + "learning_rate": 3.259533368875444e-05, + "loss": 0.0463, + "num_input_tokens_seen": 61073392, + "step": 28295 + }, + { + "epoch": 4.616639477977161, + "grad_norm": 0.034682661294937134, + "learning_rate": 3.2588551854546876e-05, + "loss": 0.023, + "num_input_tokens_seen": 61083984, + "step": 28300 + }, + { + "epoch": 4.617455138662317, + "grad_norm": 1.8818720579147339, + "learning_rate": 3.2581769405172805e-05, + "loss": 0.097, + "num_input_tokens_seen": 61095280, + "step": 28305 + }, + { + "epoch": 4.618270799347472, + "grad_norm": 0.12688353657722473, + "learning_rate": 3.2574986341182026e-05, + "loss": 0.0423, + "num_input_tokens_seen": 61107024, + "step": 28310 + }, + { + "epoch": 4.6190864600326265, + "grad_norm": 0.1703336089849472, + "learning_rate": 3.256820266312442e-05, + "loss": 0.0689, + "num_input_tokens_seen": 61117552, + "step": 28315 + }, + { + "epoch": 4.619902120717781, + "grad_norm": 1.0406697988510132, + "learning_rate": 3.256141837154991e-05, + "loss": 0.0886, + "num_input_tokens_seen": 61129040, + "step": 28320 + }, + { + "epoch": 4.620717781402936, + "grad_norm": 0.2787533104419708, + "learning_rate": 3.255463346700846e-05, + "loss": 0.0584, + "num_input_tokens_seen": 61139664, + "step": 28325 + }, + { + "epoch": 4.621533442088092, + "grad_norm": 2.075836181640625, + "learning_rate": 3.254784795005008e-05, + "loss": 0.3671, + "num_input_tokens_seen": 61151728, + "step": 28330 + }, + { + "epoch": 4.622349102773247, + "grad_norm": 1.7105157375335693, + "learning_rate": 3.254106182122486e-05, + "loss": 0.277, + "num_input_tokens_seen": 61162448, + "step": 28335 + }, + { + "epoch": 4.623164763458401, + "grad_norm": 1.8954601287841797, + "learning_rate": 3.2534275081082896e-05, + "loss": 0.1919, + "num_input_tokens_seen": 61173136, + "step": 28340 + }, + { + "epoch": 4.623980424143556, + "grad_norm": 1.9087681770324707, + "learning_rate": 3.252748773017437e-05, + "loss": 0.1848, + "num_input_tokens_seen": 61183120, + "step": 28345 + }, + { + "epoch": 4.624796084828711, + "grad_norm": 0.12842100858688354, + "learning_rate": 3.2520699769049496e-05, + "loss": 0.0547, + "num_input_tokens_seen": 61194000, + "step": 28350 + }, + { + "epoch": 4.625611745513866, + "grad_norm": 1.1408346891403198, + "learning_rate": 3.251391119825854e-05, + "loss": 0.0857, + "num_input_tokens_seen": 61204688, + "step": 28355 + }, + { + "epoch": 4.626427406199021, + "grad_norm": 1.4275144338607788, + "learning_rate": 3.2507122018351815e-05, + "loss": 0.1909, + "num_input_tokens_seen": 61215376, + "step": 28360 + }, + { + "epoch": 4.627243066884176, + "grad_norm": 0.5976679921150208, + "learning_rate": 3.250033222987969e-05, + "loss": 0.0581, + "num_input_tokens_seen": 61226256, + "step": 28365 + }, + { + "epoch": 4.628058727569331, + "grad_norm": 0.5630949139595032, + "learning_rate": 3.2493541833392575e-05, + "loss": 0.0561, + "num_input_tokens_seen": 61238192, + "step": 28370 + }, + { + "epoch": 4.628874388254486, + "grad_norm": 0.10408467054367065, + "learning_rate": 3.2486750829440946e-05, + "loss": 0.1378, + "num_input_tokens_seen": 61248944, + "step": 28375 + }, + { + "epoch": 4.629690048939641, + "grad_norm": 0.07081914693117142, + "learning_rate": 3.2479959218575295e-05, + "loss": 0.1034, + "num_input_tokens_seen": 61260752, + "step": 28380 + }, + { + "epoch": 4.630505709624796, + "grad_norm": 1.4308704137802124, + "learning_rate": 3.24731670013462e-05, + "loss": 0.3154, + "num_input_tokens_seen": 61271984, + "step": 28385 + }, + { + "epoch": 4.631321370309951, + "grad_norm": 1.1556546688079834, + "learning_rate": 3.246637417830427e-05, + "loss": 0.0857, + "num_input_tokens_seen": 61283248, + "step": 28390 + }, + { + "epoch": 4.632137030995106, + "grad_norm": 0.2695733606815338, + "learning_rate": 3.245958075000017e-05, + "loss": 0.0771, + "num_input_tokens_seen": 61294512, + "step": 28395 + }, + { + "epoch": 4.632952691680261, + "grad_norm": 1.6350932121276855, + "learning_rate": 3.24527867169846e-05, + "loss": 0.1254, + "num_input_tokens_seen": 61304944, + "step": 28400 + }, + { + "epoch": 4.633768352365416, + "grad_norm": 0.12445788830518723, + "learning_rate": 3.244599207980833e-05, + "loss": 0.0226, + "num_input_tokens_seen": 61315408, + "step": 28405 + }, + { + "epoch": 4.634584013050571, + "grad_norm": 0.9411349296569824, + "learning_rate": 3.243919683902216e-05, + "loss": 0.1187, + "num_input_tokens_seen": 61326000, + "step": 28410 + }, + { + "epoch": 4.635399673735726, + "grad_norm": 0.1020992323756218, + "learning_rate": 3.2432400995176934e-05, + "loss": 0.1565, + "num_input_tokens_seen": 61335984, + "step": 28415 + }, + { + "epoch": 4.636215334420881, + "grad_norm": 0.22649428248405457, + "learning_rate": 3.242560454882359e-05, + "loss": 0.0637, + "num_input_tokens_seen": 61347344, + "step": 28420 + }, + { + "epoch": 4.637030995106036, + "grad_norm": 1.2538992166519165, + "learning_rate": 3.241880750051306e-05, + "loss": 0.2198, + "num_input_tokens_seen": 61358352, + "step": 28425 + }, + { + "epoch": 4.637846655791191, + "grad_norm": 0.10841718316078186, + "learning_rate": 3.241200985079634e-05, + "loss": 0.0488, + "num_input_tokens_seen": 61369168, + "step": 28430 + }, + { + "epoch": 4.638662316476346, + "grad_norm": 1.709912657737732, + "learning_rate": 3.2405211600224503e-05, + "loss": 0.2314, + "num_input_tokens_seen": 61379984, + "step": 28435 + }, + { + "epoch": 4.6394779771615005, + "grad_norm": 1.621129035949707, + "learning_rate": 3.239841274934863e-05, + "loss": 0.2433, + "num_input_tokens_seen": 61391760, + "step": 28440 + }, + { + "epoch": 4.640293637846656, + "grad_norm": 0.503489077091217, + "learning_rate": 3.239161329871989e-05, + "loss": 0.1898, + "num_input_tokens_seen": 61401680, + "step": 28445 + }, + { + "epoch": 4.641109298531811, + "grad_norm": 0.061672698706388474, + "learning_rate": 3.2384813248889475e-05, + "loss": 0.065, + "num_input_tokens_seen": 61411696, + "step": 28450 + }, + { + "epoch": 4.641924959216966, + "grad_norm": 0.26011011004447937, + "learning_rate": 3.2378012600408625e-05, + "loss": 0.1513, + "num_input_tokens_seen": 61421968, + "step": 28455 + }, + { + "epoch": 4.642740619902121, + "grad_norm": 0.8748125433921814, + "learning_rate": 3.2371211353828636e-05, + "loss": 0.1387, + "num_input_tokens_seen": 61430960, + "step": 28460 + }, + { + "epoch": 4.643556280587275, + "grad_norm": 0.9568736553192139, + "learning_rate": 3.236440950970085e-05, + "loss": 0.2941, + "num_input_tokens_seen": 61442032, + "step": 28465 + }, + { + "epoch": 4.64437194127243, + "grad_norm": 0.8938939571380615, + "learning_rate": 3.2357607068576664e-05, + "loss": 0.17, + "num_input_tokens_seen": 61453136, + "step": 28470 + }, + { + "epoch": 4.645187601957586, + "grad_norm": 0.6542612910270691, + "learning_rate": 3.2350804031007524e-05, + "loss": 0.141, + "num_input_tokens_seen": 61464048, + "step": 28475 + }, + { + "epoch": 4.646003262642741, + "grad_norm": 0.0610966682434082, + "learning_rate": 3.234400039754491e-05, + "loss": 0.1677, + "num_input_tokens_seen": 61475344, + "step": 28480 + }, + { + "epoch": 4.646818923327896, + "grad_norm": 0.4462844729423523, + "learning_rate": 3.2337196168740356e-05, + "loss": 0.0474, + "num_input_tokens_seen": 61485168, + "step": 28485 + }, + { + "epoch": 4.64763458401305, + "grad_norm": 1.1381635665893555, + "learning_rate": 3.233039134514545e-05, + "loss": 0.1686, + "num_input_tokens_seen": 61496304, + "step": 28490 + }, + { + "epoch": 4.648450244698205, + "grad_norm": 0.5392566323280334, + "learning_rate": 3.2323585927311825e-05, + "loss": 0.0553, + "num_input_tokens_seen": 61507088, + "step": 28495 + }, + { + "epoch": 4.649265905383361, + "grad_norm": 1.124061942100525, + "learning_rate": 3.231677991579118e-05, + "loss": 0.1515, + "num_input_tokens_seen": 61518032, + "step": 28500 + }, + { + "epoch": 4.650081566068516, + "grad_norm": 0.5740441083908081, + "learning_rate": 3.230997331113521e-05, + "loss": 0.0968, + "num_input_tokens_seen": 61529424, + "step": 28505 + }, + { + "epoch": 4.650897226753671, + "grad_norm": 0.6482673287391663, + "learning_rate": 3.230316611389573e-05, + "loss": 0.053, + "num_input_tokens_seen": 61540976, + "step": 28510 + }, + { + "epoch": 4.651712887438825, + "grad_norm": 0.1950196623802185, + "learning_rate": 3.229635832462454e-05, + "loss": 0.0473, + "num_input_tokens_seen": 61552368, + "step": 28515 + }, + { + "epoch": 4.65252854812398, + "grad_norm": 0.11606240272521973, + "learning_rate": 3.228954994387352e-05, + "loss": 0.0481, + "num_input_tokens_seen": 61563184, + "step": 28520 + }, + { + "epoch": 4.653344208809135, + "grad_norm": 2.2352166175842285, + "learning_rate": 3.2282740972194606e-05, + "loss": 0.2498, + "num_input_tokens_seen": 61572720, + "step": 28525 + }, + { + "epoch": 4.654159869494291, + "grad_norm": 0.09474208205938339, + "learning_rate": 3.2275931410139755e-05, + "loss": 0.0945, + "num_input_tokens_seen": 61584176, + "step": 28530 + }, + { + "epoch": 4.6549755301794455, + "grad_norm": 0.20557904243469238, + "learning_rate": 3.226912125826098e-05, + "loss": 0.056, + "num_input_tokens_seen": 61595152, + "step": 28535 + }, + { + "epoch": 4.6557911908646, + "grad_norm": 0.9384939670562744, + "learning_rate": 3.226231051711035e-05, + "loss": 0.1249, + "num_input_tokens_seen": 61606032, + "step": 28540 + }, + { + "epoch": 4.656606851549755, + "grad_norm": 0.14732283353805542, + "learning_rate": 3.225549918723999e-05, + "loss": 0.0939, + "num_input_tokens_seen": 61617680, + "step": 28545 + }, + { + "epoch": 4.65742251223491, + "grad_norm": 0.28902360796928406, + "learning_rate": 3.224868726920205e-05, + "loss": 0.1086, + "num_input_tokens_seen": 61628080, + "step": 28550 + }, + { + "epoch": 4.658238172920065, + "grad_norm": 0.9967536330223083, + "learning_rate": 3.224187476354873e-05, + "loss": 0.2371, + "num_input_tokens_seen": 61639440, + "step": 28555 + }, + { + "epoch": 4.6590538336052205, + "grad_norm": 0.9070225358009338, + "learning_rate": 3.223506167083231e-05, + "loss": 0.136, + "num_input_tokens_seen": 61650544, + "step": 28560 + }, + { + "epoch": 4.659869494290375, + "grad_norm": 0.6540431380271912, + "learning_rate": 3.222824799160508e-05, + "loss": 0.1933, + "num_input_tokens_seen": 61661232, + "step": 28565 + }, + { + "epoch": 4.66068515497553, + "grad_norm": 1.331835389137268, + "learning_rate": 3.222143372641938e-05, + "loss": 0.0381, + "num_input_tokens_seen": 61671984, + "step": 28570 + }, + { + "epoch": 4.661500815660685, + "grad_norm": 0.733400821685791, + "learning_rate": 3.2214618875827626e-05, + "loss": 0.0783, + "num_input_tokens_seen": 61682160, + "step": 28575 + }, + { + "epoch": 4.66231647634584, + "grad_norm": 0.2217773199081421, + "learning_rate": 3.220780344038227e-05, + "loss": 0.1203, + "num_input_tokens_seen": 61692112, + "step": 28580 + }, + { + "epoch": 4.6631321370309955, + "grad_norm": 0.04294593632221222, + "learning_rate": 3.220098742063578e-05, + "loss": 0.17, + "num_input_tokens_seen": 61703312, + "step": 28585 + }, + { + "epoch": 4.66394779771615, + "grad_norm": 0.5706005692481995, + "learning_rate": 3.219417081714072e-05, + "loss": 0.0413, + "num_input_tokens_seen": 61713616, + "step": 28590 + }, + { + "epoch": 4.664763458401305, + "grad_norm": 0.08831708878278732, + "learning_rate": 3.218735363044967e-05, + "loss": 0.2332, + "num_input_tokens_seen": 61724080, + "step": 28595 + }, + { + "epoch": 4.66557911908646, + "grad_norm": 0.7087748646736145, + "learning_rate": 3.218053586111526e-05, + "loss": 0.1616, + "num_input_tokens_seen": 61735216, + "step": 28600 + }, + { + "epoch": 4.666394779771615, + "grad_norm": 1.5853861570358276, + "learning_rate": 3.217371750969019e-05, + "loss": 0.0817, + "num_input_tokens_seen": 61746736, + "step": 28605 + }, + { + "epoch": 4.6672104404567705, + "grad_norm": 1.6287124156951904, + "learning_rate": 3.2166898576727176e-05, + "loss": 0.1649, + "num_input_tokens_seen": 61758032, + "step": 28610 + }, + { + "epoch": 4.668026101141925, + "grad_norm": 0.4395933151245117, + "learning_rate": 3.2160079062779005e-05, + "loss": 0.2533, + "num_input_tokens_seen": 61768624, + "step": 28615 + }, + { + "epoch": 4.66884176182708, + "grad_norm": 0.6998699903488159, + "learning_rate": 3.215325896839848e-05, + "loss": 0.0516, + "num_input_tokens_seen": 61779408, + "step": 28620 + }, + { + "epoch": 4.669657422512235, + "grad_norm": 0.4985121786594391, + "learning_rate": 3.2146438294138505e-05, + "loss": 0.046, + "num_input_tokens_seen": 61790320, + "step": 28625 + }, + { + "epoch": 4.67047308319739, + "grad_norm": 0.21287117898464203, + "learning_rate": 3.2139617040551966e-05, + "loss": 0.0589, + "num_input_tokens_seen": 61801968, + "step": 28630 + }, + { + "epoch": 4.671288743882545, + "grad_norm": 0.26676884293556213, + "learning_rate": 3.2132795208191853e-05, + "loss": 0.1857, + "num_input_tokens_seen": 61812944, + "step": 28635 + }, + { + "epoch": 4.672104404567699, + "grad_norm": 1.2941328287124634, + "learning_rate": 3.212597279761116e-05, + "loss": 0.1162, + "num_input_tokens_seen": 61824912, + "step": 28640 + }, + { + "epoch": 4.672920065252855, + "grad_norm": 0.036512214690446854, + "learning_rate": 3.211914980936296e-05, + "loss": 0.1733, + "num_input_tokens_seen": 61835120, + "step": 28645 + }, + { + "epoch": 4.67373572593801, + "grad_norm": 0.10471077263355255, + "learning_rate": 3.2112326244000355e-05, + "loss": 0.0855, + "num_input_tokens_seen": 61845680, + "step": 28650 + }, + { + "epoch": 4.674551386623165, + "grad_norm": 1.0601836442947388, + "learning_rate": 3.2105502102076494e-05, + "loss": 0.1393, + "num_input_tokens_seen": 61856368, + "step": 28655 + }, + { + "epoch": 4.6753670473083195, + "grad_norm": 0.4370889663696289, + "learning_rate": 3.209867738414459e-05, + "loss": 0.1103, + "num_input_tokens_seen": 61866448, + "step": 28660 + }, + { + "epoch": 4.676182707993474, + "grad_norm": 0.5675359964370728, + "learning_rate": 3.2091852090757865e-05, + "loss": 0.1478, + "num_input_tokens_seen": 61877040, + "step": 28665 + }, + { + "epoch": 4.67699836867863, + "grad_norm": 1.3275635242462158, + "learning_rate": 3.208502622246964e-05, + "loss": 0.1711, + "num_input_tokens_seen": 61887472, + "step": 28670 + }, + { + "epoch": 4.677814029363785, + "grad_norm": 1.1144992113113403, + "learning_rate": 3.207819977983323e-05, + "loss": 0.0641, + "num_input_tokens_seen": 61898864, + "step": 28675 + }, + { + "epoch": 4.67862969004894, + "grad_norm": 0.24717970192432404, + "learning_rate": 3.207137276340203e-05, + "loss": 0.0381, + "num_input_tokens_seen": 61910160, + "step": 28680 + }, + { + "epoch": 4.6794453507340945, + "grad_norm": 1.3548861742019653, + "learning_rate": 3.206454517372949e-05, + "loss": 0.3584, + "num_input_tokens_seen": 61920176, + "step": 28685 + }, + { + "epoch": 4.680261011419249, + "grad_norm": 1.6306753158569336, + "learning_rate": 3.205771701136906e-05, + "loss": 0.1958, + "num_input_tokens_seen": 61930992, + "step": 28690 + }, + { + "epoch": 4.681076672104405, + "grad_norm": 0.32892927527427673, + "learning_rate": 3.205088827687428e-05, + "loss": 0.1373, + "num_input_tokens_seen": 61941744, + "step": 28695 + }, + { + "epoch": 4.68189233278956, + "grad_norm": 0.19401444494724274, + "learning_rate": 3.204405897079872e-05, + "loss": 0.1287, + "num_input_tokens_seen": 61951760, + "step": 28700 + }, + { + "epoch": 4.682707993474715, + "grad_norm": 1.2984799146652222, + "learning_rate": 3.2037229093696e-05, + "loss": 0.362, + "num_input_tokens_seen": 61961040, + "step": 28705 + }, + { + "epoch": 4.6835236541598695, + "grad_norm": 0.09523936361074448, + "learning_rate": 3.203039864611978e-05, + "loss": 0.0457, + "num_input_tokens_seen": 61971856, + "step": 28710 + }, + { + "epoch": 4.684339314845024, + "grad_norm": 0.18014055490493774, + "learning_rate": 3.202356762862377e-05, + "loss": 0.0267, + "num_input_tokens_seen": 61983760, + "step": 28715 + }, + { + "epoch": 4.685154975530179, + "grad_norm": 0.6007795333862305, + "learning_rate": 3.201673604176174e-05, + "loss": 0.0457, + "num_input_tokens_seen": 61994480, + "step": 28720 + }, + { + "epoch": 4.685970636215334, + "grad_norm": 1.7613146305084229, + "learning_rate": 3.2009903886087476e-05, + "loss": 0.1706, + "num_input_tokens_seen": 62005360, + "step": 28725 + }, + { + "epoch": 4.68678629690049, + "grad_norm": 0.3049047291278839, + "learning_rate": 3.200307116215485e-05, + "loss": 0.0831, + "num_input_tokens_seen": 62016976, + "step": 28730 + }, + { + "epoch": 4.6876019575856445, + "grad_norm": 1.407517671585083, + "learning_rate": 3.1996237870517734e-05, + "loss": 0.1253, + "num_input_tokens_seen": 62028176, + "step": 28735 + }, + { + "epoch": 4.688417618270799, + "grad_norm": 0.18415847420692444, + "learning_rate": 3.198940401173007e-05, + "loss": 0.1191, + "num_input_tokens_seen": 62038672, + "step": 28740 + }, + { + "epoch": 4.689233278955954, + "grad_norm": 0.10298551619052887, + "learning_rate": 3.198256958634586e-05, + "loss": 0.0262, + "num_input_tokens_seen": 62049424, + "step": 28745 + }, + { + "epoch": 4.690048939641109, + "grad_norm": 0.4208112359046936, + "learning_rate": 3.197573459491913e-05, + "loss": 0.0952, + "num_input_tokens_seen": 62060112, + "step": 28750 + }, + { + "epoch": 4.690864600326265, + "grad_norm": 0.19282126426696777, + "learning_rate": 3.1968899038003965e-05, + "loss": 0.0985, + "num_input_tokens_seen": 62071728, + "step": 28755 + }, + { + "epoch": 4.691680261011419, + "grad_norm": 0.149068683385849, + "learning_rate": 3.196206291615447e-05, + "loss": 0.269, + "num_input_tokens_seen": 62081680, + "step": 28760 + }, + { + "epoch": 4.692495921696574, + "grad_norm": 0.9717332124710083, + "learning_rate": 3.195522622992484e-05, + "loss": 0.1023, + "num_input_tokens_seen": 62091984, + "step": 28765 + }, + { + "epoch": 4.693311582381729, + "grad_norm": 0.042737673968076706, + "learning_rate": 3.1948388979869277e-05, + "loss": 0.082, + "num_input_tokens_seen": 62102864, + "step": 28770 + }, + { + "epoch": 4.694127243066884, + "grad_norm": 1.16884446144104, + "learning_rate": 3.194155116654205e-05, + "loss": 0.2367, + "num_input_tokens_seen": 62114256, + "step": 28775 + }, + { + "epoch": 4.69494290375204, + "grad_norm": 0.5421172976493835, + "learning_rate": 3.193471279049746e-05, + "loss": 0.0381, + "num_input_tokens_seen": 62124816, + "step": 28780 + }, + { + "epoch": 4.695758564437194, + "grad_norm": 0.16998200118541718, + "learning_rate": 3.192787385228987e-05, + "loss": 0.0568, + "num_input_tokens_seen": 62136368, + "step": 28785 + }, + { + "epoch": 4.696574225122349, + "grad_norm": 0.31973719596862793, + "learning_rate": 3.192103435247368e-05, + "loss": 0.2218, + "num_input_tokens_seen": 62147024, + "step": 28790 + }, + { + "epoch": 4.697389885807504, + "grad_norm": 0.8680514693260193, + "learning_rate": 3.1914194291603313e-05, + "loss": 0.0625, + "num_input_tokens_seen": 62156848, + "step": 28795 + }, + { + "epoch": 4.698205546492659, + "grad_norm": 0.32929056882858276, + "learning_rate": 3.190735367023328e-05, + "loss": 0.0705, + "num_input_tokens_seen": 62168496, + "step": 28800 + }, + { + "epoch": 4.699021207177814, + "grad_norm": 0.36527249217033386, + "learning_rate": 3.1900512488918114e-05, + "loss": 0.1482, + "num_input_tokens_seen": 62178800, + "step": 28805 + }, + { + "epoch": 4.699836867862969, + "grad_norm": 0.11662361025810242, + "learning_rate": 3.189367074821239e-05, + "loss": 0.0998, + "num_input_tokens_seen": 62190768, + "step": 28810 + }, + { + "epoch": 4.700652528548124, + "grad_norm": 0.07164676487445831, + "learning_rate": 3.1886828448670734e-05, + "loss": 0.0269, + "num_input_tokens_seen": 62201648, + "step": 28815 + }, + { + "epoch": 4.701468189233279, + "grad_norm": 0.34035253524780273, + "learning_rate": 3.1879985590847824e-05, + "loss": 0.1861, + "num_input_tokens_seen": 62212816, + "step": 28820 + }, + { + "epoch": 4.702283849918434, + "grad_norm": 0.09780071675777435, + "learning_rate": 3.187314217529838e-05, + "loss": 0.1051, + "num_input_tokens_seen": 62222992, + "step": 28825 + }, + { + "epoch": 4.703099510603589, + "grad_norm": 1.3342393636703491, + "learning_rate": 3.1866298202577157e-05, + "loss": 0.157, + "num_input_tokens_seen": 62234192, + "step": 28830 + }, + { + "epoch": 4.7039151712887435, + "grad_norm": 0.024050630629062653, + "learning_rate": 3.185945367323895e-05, + "loss": 0.196, + "num_input_tokens_seen": 62244848, + "step": 28835 + }, + { + "epoch": 4.704730831973899, + "grad_norm": 1.5173563957214355, + "learning_rate": 3.185260858783864e-05, + "loss": 0.1502, + "num_input_tokens_seen": 62255376, + "step": 28840 + }, + { + "epoch": 4.705546492659054, + "grad_norm": 0.5343908667564392, + "learning_rate": 3.1845762946931093e-05, + "loss": 0.0732, + "num_input_tokens_seen": 62266192, + "step": 28845 + }, + { + "epoch": 4.706362153344209, + "grad_norm": 0.08200320601463318, + "learning_rate": 3.183891675107128e-05, + "loss": 0.0408, + "num_input_tokens_seen": 62277232, + "step": 28850 + }, + { + "epoch": 4.707177814029364, + "grad_norm": 0.0990060567855835, + "learning_rate": 3.183207000081416e-05, + "loss": 0.0392, + "num_input_tokens_seen": 62288368, + "step": 28855 + }, + { + "epoch": 4.7079934747145185, + "grad_norm": 1.01786208152771, + "learning_rate": 3.1825222696714796e-05, + "loss": 0.2141, + "num_input_tokens_seen": 62299024, + "step": 28860 + }, + { + "epoch": 4.708809135399674, + "grad_norm": 1.1389751434326172, + "learning_rate": 3.1818374839328236e-05, + "loss": 0.21, + "num_input_tokens_seen": 62309136, + "step": 28865 + }, + { + "epoch": 4.709624796084829, + "grad_norm": 1.8310836553573608, + "learning_rate": 3.181152642920962e-05, + "loss": 0.1874, + "num_input_tokens_seen": 62320656, + "step": 28870 + }, + { + "epoch": 4.710440456769984, + "grad_norm": 0.09890327602624893, + "learning_rate": 3.180467746691411e-05, + "loss": 0.0293, + "num_input_tokens_seen": 62330896, + "step": 28875 + }, + { + "epoch": 4.711256117455139, + "grad_norm": 0.1292860507965088, + "learning_rate": 3.1797827952996914e-05, + "loss": 0.0923, + "num_input_tokens_seen": 62342032, + "step": 28880 + }, + { + "epoch": 4.712071778140293, + "grad_norm": 0.06437575817108154, + "learning_rate": 3.1790977888013294e-05, + "loss": 0.0996, + "num_input_tokens_seen": 62351984, + "step": 28885 + }, + { + "epoch": 4.712887438825448, + "grad_norm": 1.1184226274490356, + "learning_rate": 3.178412727251856e-05, + "loss": 0.0831, + "num_input_tokens_seen": 62362256, + "step": 28890 + }, + { + "epoch": 4.713703099510604, + "grad_norm": 1.4105007648468018, + "learning_rate": 3.177727610706804e-05, + "loss": 0.2028, + "num_input_tokens_seen": 62373040, + "step": 28895 + }, + { + "epoch": 4.714518760195759, + "grad_norm": 0.042892057448625565, + "learning_rate": 3.177042439221713e-05, + "loss": 0.0824, + "num_input_tokens_seen": 62384304, + "step": 28900 + }, + { + "epoch": 4.715334420880914, + "grad_norm": 1.7406820058822632, + "learning_rate": 3.176357212852127e-05, + "loss": 0.1013, + "num_input_tokens_seen": 62394864, + "step": 28905 + }, + { + "epoch": 4.716150081566068, + "grad_norm": 0.7112303376197815, + "learning_rate": 3.175671931653593e-05, + "loss": 0.1194, + "num_input_tokens_seen": 62405040, + "step": 28910 + }, + { + "epoch": 4.716965742251223, + "grad_norm": 0.030896715819835663, + "learning_rate": 3.174986595681664e-05, + "loss": 0.1039, + "num_input_tokens_seen": 62417040, + "step": 28915 + }, + { + "epoch": 4.717781402936378, + "grad_norm": 0.9543591141700745, + "learning_rate": 3.174301204991896e-05, + "loss": 0.2131, + "num_input_tokens_seen": 62427088, + "step": 28920 + }, + { + "epoch": 4.718597063621534, + "grad_norm": 0.296215295791626, + "learning_rate": 3.173615759639852e-05, + "loss": 0.1744, + "num_input_tokens_seen": 62436816, + "step": 28925 + }, + { + "epoch": 4.719412724306689, + "grad_norm": 0.1053757593035698, + "learning_rate": 3.1729302596810965e-05, + "loss": 0.0294, + "num_input_tokens_seen": 62447248, + "step": 28930 + }, + { + "epoch": 4.720228384991843, + "grad_norm": 0.43667805194854736, + "learning_rate": 3.172244705171199e-05, + "loss": 0.1836, + "num_input_tokens_seen": 62456368, + "step": 28935 + }, + { + "epoch": 4.721044045676998, + "grad_norm": 0.42491528391838074, + "learning_rate": 3.171559096165736e-05, + "loss": 0.1006, + "num_input_tokens_seen": 62467088, + "step": 28940 + }, + { + "epoch": 4.721859706362153, + "grad_norm": 1.5101956129074097, + "learning_rate": 3.170873432720285e-05, + "loss": 0.2686, + "num_input_tokens_seen": 62476752, + "step": 28945 + }, + { + "epoch": 4.722675367047309, + "grad_norm": 0.03543290123343468, + "learning_rate": 3.170187714890429e-05, + "loss": 0.0935, + "num_input_tokens_seen": 62487152, + "step": 28950 + }, + { + "epoch": 4.7234910277324635, + "grad_norm": 0.14720787107944489, + "learning_rate": 3.1695019427317564e-05, + "loss": 0.041, + "num_input_tokens_seen": 62498544, + "step": 28955 + }, + { + "epoch": 4.724306688417618, + "grad_norm": 0.019663648679852486, + "learning_rate": 3.1688161162998595e-05, + "loss": 0.0729, + "num_input_tokens_seen": 62509488, + "step": 28960 + }, + { + "epoch": 4.725122349102773, + "grad_norm": 0.2534136474132538, + "learning_rate": 3.1681302356503337e-05, + "loss": 0.0985, + "num_input_tokens_seen": 62520464, + "step": 28965 + }, + { + "epoch": 4.725938009787928, + "grad_norm": 0.8010448217391968, + "learning_rate": 3.167444300838782e-05, + "loss": 0.3174, + "num_input_tokens_seen": 62531536, + "step": 28970 + }, + { + "epoch": 4.726753670473083, + "grad_norm": 2.480090618133545, + "learning_rate": 3.1667583119208085e-05, + "loss": 0.146, + "num_input_tokens_seen": 62542320, + "step": 28975 + }, + { + "epoch": 4.7275693311582385, + "grad_norm": 0.4579830765724182, + "learning_rate": 3.1660722689520225e-05, + "loss": 0.0909, + "num_input_tokens_seen": 62551760, + "step": 28980 + }, + { + "epoch": 4.728384991843393, + "grad_norm": 0.6611957550048828, + "learning_rate": 3.16538617198804e-05, + "loss": 0.0729, + "num_input_tokens_seen": 62562512, + "step": 28985 + }, + { + "epoch": 4.729200652528548, + "grad_norm": 0.11296696960926056, + "learning_rate": 3.164700021084478e-05, + "loss": 0.126, + "num_input_tokens_seen": 62573648, + "step": 28990 + }, + { + "epoch": 4.730016313213703, + "grad_norm": 0.3038175702095032, + "learning_rate": 3.164013816296959e-05, + "loss": 0.2156, + "num_input_tokens_seen": 62584592, + "step": 28995 + }, + { + "epoch": 4.730831973898858, + "grad_norm": 0.0356559120118618, + "learning_rate": 3.163327557681111e-05, + "loss": 0.0536, + "num_input_tokens_seen": 62596496, + "step": 29000 + }, + { + "epoch": 4.731647634584013, + "grad_norm": 0.1901496797800064, + "learning_rate": 3.162641245292566e-05, + "loss": 0.103, + "num_input_tokens_seen": 62607312, + "step": 29005 + }, + { + "epoch": 4.732463295269168, + "grad_norm": 0.8596254587173462, + "learning_rate": 3.161954879186959e-05, + "loss": 0.0772, + "num_input_tokens_seen": 62617680, + "step": 29010 + }, + { + "epoch": 4.733278955954323, + "grad_norm": 0.4523984491825104, + "learning_rate": 3.161268459419931e-05, + "loss": 0.0881, + "num_input_tokens_seen": 62627664, + "step": 29015 + }, + { + "epoch": 4.734094616639478, + "grad_norm": 1.4262962341308594, + "learning_rate": 3.160581986047127e-05, + "loss": 0.2772, + "num_input_tokens_seen": 62638352, + "step": 29020 + }, + { + "epoch": 4.734910277324633, + "grad_norm": 0.2836189270019531, + "learning_rate": 3.1598954591241934e-05, + "loss": 0.1668, + "num_input_tokens_seen": 62649520, + "step": 29025 + }, + { + "epoch": 4.735725938009788, + "grad_norm": 0.09149888902902603, + "learning_rate": 3.159208878706787e-05, + "loss": 0.0345, + "num_input_tokens_seen": 62660080, + "step": 29030 + }, + { + "epoch": 4.736541598694943, + "grad_norm": 0.3856813609600067, + "learning_rate": 3.1585222448505644e-05, + "loss": 0.2825, + "num_input_tokens_seen": 62669808, + "step": 29035 + }, + { + "epoch": 4.737357259380098, + "grad_norm": 0.4459984600543976, + "learning_rate": 3.1578355576111864e-05, + "loss": 0.0775, + "num_input_tokens_seen": 62681136, + "step": 29040 + }, + { + "epoch": 4.738172920065253, + "grad_norm": 0.20251597464084625, + "learning_rate": 3.157148817044321e-05, + "loss": 0.2205, + "num_input_tokens_seen": 62692144, + "step": 29045 + }, + { + "epoch": 4.738988580750408, + "grad_norm": 0.17032290995121002, + "learning_rate": 3.156462023205638e-05, + "loss": 0.1526, + "num_input_tokens_seen": 62701360, + "step": 29050 + }, + { + "epoch": 4.739804241435563, + "grad_norm": 1.8142008781433105, + "learning_rate": 3.155775176150812e-05, + "loss": 0.1228, + "num_input_tokens_seen": 62713488, + "step": 29055 + }, + { + "epoch": 4.740619902120718, + "grad_norm": 0.15112879872322083, + "learning_rate": 3.1550882759355246e-05, + "loss": 0.1713, + "num_input_tokens_seen": 62725264, + "step": 29060 + }, + { + "epoch": 4.741435562805873, + "grad_norm": 1.0661237239837646, + "learning_rate": 3.154401322615456e-05, + "loss": 0.0846, + "num_input_tokens_seen": 62735312, + "step": 29065 + }, + { + "epoch": 4.742251223491028, + "grad_norm": 0.20968744158744812, + "learning_rate": 3.153714316246297e-05, + "loss": 0.1673, + "num_input_tokens_seen": 62745744, + "step": 29070 + }, + { + "epoch": 4.743066884176183, + "grad_norm": 0.6078944206237793, + "learning_rate": 3.153027256883737e-05, + "loss": 0.157, + "num_input_tokens_seen": 62754960, + "step": 29075 + }, + { + "epoch": 4.7438825448613375, + "grad_norm": 0.6986880898475647, + "learning_rate": 3.152340144583475e-05, + "loss": 0.1507, + "num_input_tokens_seen": 62765712, + "step": 29080 + }, + { + "epoch": 4.744698205546492, + "grad_norm": 0.7138107419013977, + "learning_rate": 3.151652979401211e-05, + "loss": 0.0943, + "num_input_tokens_seen": 62776784, + "step": 29085 + }, + { + "epoch": 4.745513866231647, + "grad_norm": 0.1673237383365631, + "learning_rate": 3.15096576139265e-05, + "loss": 0.2732, + "num_input_tokens_seen": 62787984, + "step": 29090 + }, + { + "epoch": 4.746329526916803, + "grad_norm": 0.13312944769859314, + "learning_rate": 3.150278490613501e-05, + "loss": 0.1053, + "num_input_tokens_seen": 62797680, + "step": 29095 + }, + { + "epoch": 4.747145187601958, + "grad_norm": 0.082826629281044, + "learning_rate": 3.149591167119479e-05, + "loss": 0.0748, + "num_input_tokens_seen": 62808112, + "step": 29100 + }, + { + "epoch": 4.7479608482871125, + "grad_norm": 0.3943411409854889, + "learning_rate": 3.148903790966301e-05, + "loss": 0.2761, + "num_input_tokens_seen": 62818928, + "step": 29105 + }, + { + "epoch": 4.748776508972267, + "grad_norm": 0.5550259351730347, + "learning_rate": 3.148216362209688e-05, + "loss": 0.1322, + "num_input_tokens_seen": 62829872, + "step": 29110 + }, + { + "epoch": 4.749592169657422, + "grad_norm": 0.2862483561038971, + "learning_rate": 3.1475288809053684e-05, + "loss": 0.1015, + "num_input_tokens_seen": 62840144, + "step": 29115 + }, + { + "epoch": 4.750407830342578, + "grad_norm": 0.30869150161743164, + "learning_rate": 3.146841347109072e-05, + "loss": 0.117, + "num_input_tokens_seen": 62851856, + "step": 29120 + }, + { + "epoch": 4.751223491027733, + "grad_norm": 0.5811659693717957, + "learning_rate": 3.146153760876534e-05, + "loss": 0.1949, + "num_input_tokens_seen": 62863056, + "step": 29125 + }, + { + "epoch": 4.7520391517128875, + "grad_norm": 1.4709901809692383, + "learning_rate": 3.145466122263494e-05, + "loss": 0.0861, + "num_input_tokens_seen": 62872592, + "step": 29130 + }, + { + "epoch": 4.752854812398042, + "grad_norm": 1.929608702659607, + "learning_rate": 3.144778431325694e-05, + "loss": 0.1352, + "num_input_tokens_seen": 62883600, + "step": 29135 + }, + { + "epoch": 4.753670473083197, + "grad_norm": 2.3003194332122803, + "learning_rate": 3.1440906881188835e-05, + "loss": 0.1903, + "num_input_tokens_seen": 62894160, + "step": 29140 + }, + { + "epoch": 4.754486133768353, + "grad_norm": 0.5869313478469849, + "learning_rate": 3.143402892698814e-05, + "loss": 0.1175, + "num_input_tokens_seen": 62905648, + "step": 29145 + }, + { + "epoch": 4.755301794453508, + "grad_norm": 0.12926341593265533, + "learning_rate": 3.142715045121241e-05, + "loss": 0.0587, + "num_input_tokens_seen": 62918256, + "step": 29150 + }, + { + "epoch": 4.7561174551386625, + "grad_norm": 1.6476423740386963, + "learning_rate": 3.142027145441926e-05, + "loss": 0.2337, + "num_input_tokens_seen": 62929200, + "step": 29155 + }, + { + "epoch": 4.756933115823817, + "grad_norm": 0.7038540244102478, + "learning_rate": 3.141339193716633e-05, + "loss": 0.0722, + "num_input_tokens_seen": 62940016, + "step": 29160 + }, + { + "epoch": 4.757748776508972, + "grad_norm": 0.6043410897254944, + "learning_rate": 3.1406511900011295e-05, + "loss": 0.0383, + "num_input_tokens_seen": 62949744, + "step": 29165 + }, + { + "epoch": 4.758564437194127, + "grad_norm": 0.09132350236177444, + "learning_rate": 3.139963134351191e-05, + "loss": 0.1169, + "num_input_tokens_seen": 62960848, + "step": 29170 + }, + { + "epoch": 4.759380097879282, + "grad_norm": 0.1589629203081131, + "learning_rate": 3.139275026822594e-05, + "loss": 0.2463, + "num_input_tokens_seen": 62971984, + "step": 29175 + }, + { + "epoch": 4.760195758564437, + "grad_norm": 0.41063636541366577, + "learning_rate": 3.138586867471118e-05, + "loss": 0.0684, + "num_input_tokens_seen": 62982224, + "step": 29180 + }, + { + "epoch": 4.761011419249592, + "grad_norm": 1.3316296339035034, + "learning_rate": 3.137898656352551e-05, + "loss": 0.1827, + "num_input_tokens_seen": 62992464, + "step": 29185 + }, + { + "epoch": 4.761827079934747, + "grad_norm": 0.7656028866767883, + "learning_rate": 3.137210393522683e-05, + "loss": 0.1074, + "num_input_tokens_seen": 63002960, + "step": 29190 + }, + { + "epoch": 4.762642740619902, + "grad_norm": 0.3045111894607544, + "learning_rate": 3.136522079037307e-05, + "loss": 0.1754, + "num_input_tokens_seen": 63013520, + "step": 29195 + }, + { + "epoch": 4.763458401305057, + "grad_norm": 1.8018311262130737, + "learning_rate": 3.135833712952222e-05, + "loss": 0.1843, + "num_input_tokens_seen": 63023760, + "step": 29200 + }, + { + "epoch": 4.764274061990212, + "grad_norm": 0.06452036648988724, + "learning_rate": 3.135145295323229e-05, + "loss": 0.0974, + "num_input_tokens_seen": 63035248, + "step": 29205 + }, + { + "epoch": 4.765089722675367, + "grad_norm": 0.40617066621780396, + "learning_rate": 3.1344568262061366e-05, + "loss": 0.1497, + "num_input_tokens_seen": 63045904, + "step": 29210 + }, + { + "epoch": 4.765905383360522, + "grad_norm": 1.1776576042175293, + "learning_rate": 3.133768305656755e-05, + "loss": 0.213, + "num_input_tokens_seen": 63056336, + "step": 29215 + }, + { + "epoch": 4.766721044045677, + "grad_norm": 1.1827329397201538, + "learning_rate": 3.1330797337308984e-05, + "loss": 0.2036, + "num_input_tokens_seen": 63067376, + "step": 29220 + }, + { + "epoch": 4.767536704730832, + "grad_norm": 0.1710384488105774, + "learning_rate": 3.1323911104843865e-05, + "loss": 0.2384, + "num_input_tokens_seen": 63077456, + "step": 29225 + }, + { + "epoch": 4.768352365415987, + "grad_norm": 0.08730197697877884, + "learning_rate": 3.131702435973042e-05, + "loss": 0.0311, + "num_input_tokens_seen": 63087024, + "step": 29230 + }, + { + "epoch": 4.769168026101142, + "grad_norm": 0.22756651043891907, + "learning_rate": 3.1310137102526926e-05, + "loss": 0.1098, + "num_input_tokens_seen": 63098608, + "step": 29235 + }, + { + "epoch": 4.769983686786297, + "grad_norm": 0.03942583128809929, + "learning_rate": 3.13032493337917e-05, + "loss": 0.0912, + "num_input_tokens_seen": 63108144, + "step": 29240 + }, + { + "epoch": 4.770799347471452, + "grad_norm": 0.6011594533920288, + "learning_rate": 3.129636105408311e-05, + "loss": 0.2764, + "num_input_tokens_seen": 63118736, + "step": 29245 + }, + { + "epoch": 4.771615008156607, + "grad_norm": 0.27442798018455505, + "learning_rate": 3.128947226395954e-05, + "loss": 0.022, + "num_input_tokens_seen": 63128656, + "step": 29250 + }, + { + "epoch": 4.7724306688417615, + "grad_norm": 0.08253444731235504, + "learning_rate": 3.1282582963979434e-05, + "loss": 0.1625, + "num_input_tokens_seen": 63139632, + "step": 29255 + }, + { + "epoch": 4.773246329526917, + "grad_norm": 0.549888014793396, + "learning_rate": 3.127569315470128e-05, + "loss": 0.0613, + "num_input_tokens_seen": 63150128, + "step": 29260 + }, + { + "epoch": 4.774061990212072, + "grad_norm": 0.2669948637485504, + "learning_rate": 3.12688028366836e-05, + "loss": 0.1117, + "num_input_tokens_seen": 63160112, + "step": 29265 + }, + { + "epoch": 4.774877650897227, + "grad_norm": 0.5292145013809204, + "learning_rate": 3.126191201048494e-05, + "loss": 0.1353, + "num_input_tokens_seen": 63172592, + "step": 29270 + }, + { + "epoch": 4.775693311582382, + "grad_norm": 1.0613421201705933, + "learning_rate": 3.125502067666393e-05, + "loss": 0.133, + "num_input_tokens_seen": 63184304, + "step": 29275 + }, + { + "epoch": 4.7765089722675365, + "grad_norm": 0.036837756633758545, + "learning_rate": 3.1248128835779206e-05, + "loss": 0.0276, + "num_input_tokens_seen": 63195184, + "step": 29280 + }, + { + "epoch": 4.777324632952691, + "grad_norm": 0.13515576720237732, + "learning_rate": 3.124123648838946e-05, + "loss": 0.0846, + "num_input_tokens_seen": 63206064, + "step": 29285 + }, + { + "epoch": 4.778140293637847, + "grad_norm": 0.19384650886058807, + "learning_rate": 3.123434363505341e-05, + "loss": 0.0388, + "num_input_tokens_seen": 63215536, + "step": 29290 + }, + { + "epoch": 4.778955954323002, + "grad_norm": 0.523151695728302, + "learning_rate": 3.122745027632983e-05, + "loss": 0.1267, + "num_input_tokens_seen": 63226480, + "step": 29295 + }, + { + "epoch": 4.779771615008157, + "grad_norm": 0.503781795501709, + "learning_rate": 3.1220556412777536e-05, + "loss": 0.1396, + "num_input_tokens_seen": 63237200, + "step": 29300 + }, + { + "epoch": 4.780587275693311, + "grad_norm": 0.5483595728874207, + "learning_rate": 3.121366204495538e-05, + "loss": 0.2338, + "num_input_tokens_seen": 63247632, + "step": 29305 + }, + { + "epoch": 4.781402936378466, + "grad_norm": 0.2788871228694916, + "learning_rate": 3.120676717342225e-05, + "loss": 0.1647, + "num_input_tokens_seen": 63258864, + "step": 29310 + }, + { + "epoch": 4.782218597063622, + "grad_norm": 0.8601438403129578, + "learning_rate": 3.119987179873707e-05, + "loss": 0.1091, + "num_input_tokens_seen": 63270928, + "step": 29315 + }, + { + "epoch": 4.783034257748777, + "grad_norm": 1.4617716073989868, + "learning_rate": 3.119297592145884e-05, + "loss": 0.2289, + "num_input_tokens_seen": 63281200, + "step": 29320 + }, + { + "epoch": 4.783849918433932, + "grad_norm": 0.6485883593559265, + "learning_rate": 3.1186079542146554e-05, + "loss": 0.101, + "num_input_tokens_seen": 63291184, + "step": 29325 + }, + { + "epoch": 4.784665579119086, + "grad_norm": 0.709026038646698, + "learning_rate": 3.117918266135927e-05, + "loss": 0.1144, + "num_input_tokens_seen": 63302544, + "step": 29330 + }, + { + "epoch": 4.785481239804241, + "grad_norm": 0.2803540825843811, + "learning_rate": 3.1172285279656085e-05, + "loss": 0.1376, + "num_input_tokens_seen": 63312464, + "step": 29335 + }, + { + "epoch": 4.786296900489396, + "grad_norm": 0.26614904403686523, + "learning_rate": 3.1165387397596136e-05, + "loss": 0.3013, + "num_input_tokens_seen": 63323856, + "step": 29340 + }, + { + "epoch": 4.787112561174552, + "grad_norm": 0.0762321799993515, + "learning_rate": 3.1158489015738604e-05, + "loss": 0.0539, + "num_input_tokens_seen": 63335664, + "step": 29345 + }, + { + "epoch": 4.787928221859707, + "grad_norm": 0.14766846597194672, + "learning_rate": 3.1151590134642705e-05, + "loss": 0.0308, + "num_input_tokens_seen": 63347632, + "step": 29350 + }, + { + "epoch": 4.788743882544861, + "grad_norm": 1.3014819622039795, + "learning_rate": 3.114469075486769e-05, + "loss": 0.1293, + "num_input_tokens_seen": 63359152, + "step": 29355 + }, + { + "epoch": 4.789559543230016, + "grad_norm": 0.11126188933849335, + "learning_rate": 3.113779087697287e-05, + "loss": 0.0438, + "num_input_tokens_seen": 63370224, + "step": 29360 + }, + { + "epoch": 4.790375203915171, + "grad_norm": 0.6595664024353027, + "learning_rate": 3.1130890501517586e-05, + "loss": 0.096, + "num_input_tokens_seen": 63380784, + "step": 29365 + }, + { + "epoch": 4.791190864600326, + "grad_norm": 2.100224733352661, + "learning_rate": 3.11239896290612e-05, + "loss": 0.2771, + "num_input_tokens_seen": 63392656, + "step": 29370 + }, + { + "epoch": 4.7920065252854815, + "grad_norm": 0.3184239864349365, + "learning_rate": 3.111708826016315e-05, + "loss": 0.1478, + "num_input_tokens_seen": 63403440, + "step": 29375 + }, + { + "epoch": 4.792822185970636, + "grad_norm": 1.2802703380584717, + "learning_rate": 3.1110186395382885e-05, + "loss": 0.1113, + "num_input_tokens_seen": 63413968, + "step": 29380 + }, + { + "epoch": 4.793637846655791, + "grad_norm": 1.7191417217254639, + "learning_rate": 3.1103284035279905e-05, + "loss": 0.1414, + "num_input_tokens_seen": 63424752, + "step": 29385 + }, + { + "epoch": 4.794453507340946, + "grad_norm": 0.05970386788249016, + "learning_rate": 3.109638118041376e-05, + "loss": 0.0365, + "num_input_tokens_seen": 63435792, + "step": 29390 + }, + { + "epoch": 4.795269168026101, + "grad_norm": 0.5205230712890625, + "learning_rate": 3.108947783134402e-05, + "loss": 0.0459, + "num_input_tokens_seen": 63448016, + "step": 29395 + }, + { + "epoch": 4.7960848287112565, + "grad_norm": 0.28026896715164185, + "learning_rate": 3.10825739886303e-05, + "loss": 0.0283, + "num_input_tokens_seen": 63456880, + "step": 29400 + }, + { + "epoch": 4.796900489396411, + "grad_norm": 1.706835389137268, + "learning_rate": 3.107566965283228e-05, + "loss": 0.1415, + "num_input_tokens_seen": 63467504, + "step": 29405 + }, + { + "epoch": 4.797716150081566, + "grad_norm": 0.5086063146591187, + "learning_rate": 3.106876482450964e-05, + "loss": 0.1139, + "num_input_tokens_seen": 63478288, + "step": 29410 + }, + { + "epoch": 4.798531810766721, + "grad_norm": 0.088511161506176, + "learning_rate": 3.106185950422215e-05, + "loss": 0.0451, + "num_input_tokens_seen": 63487888, + "step": 29415 + }, + { + "epoch": 4.799347471451876, + "grad_norm": 0.14584606885910034, + "learning_rate": 3.105495369252956e-05, + "loss": 0.0845, + "num_input_tokens_seen": 63498576, + "step": 29420 + }, + { + "epoch": 4.800163132137031, + "grad_norm": 0.03162181004881859, + "learning_rate": 3.104804738999169e-05, + "loss": 0.108, + "num_input_tokens_seen": 63509872, + "step": 29425 + }, + { + "epoch": 4.800978792822186, + "grad_norm": 0.2903023660182953, + "learning_rate": 3.1041140597168425e-05, + "loss": 0.0323, + "num_input_tokens_seen": 63521328, + "step": 29430 + }, + { + "epoch": 4.801794453507341, + "grad_norm": 1.163330316543579, + "learning_rate": 3.1034233314619647e-05, + "loss": 0.1397, + "num_input_tokens_seen": 63531248, + "step": 29435 + }, + { + "epoch": 4.802610114192496, + "grad_norm": 2.675530433654785, + "learning_rate": 3.102732554290531e-05, + "loss": 0.2598, + "num_input_tokens_seen": 63541648, + "step": 29440 + }, + { + "epoch": 4.803425774877651, + "grad_norm": 0.619760274887085, + "learning_rate": 3.102041728258537e-05, + "loss": 0.0863, + "num_input_tokens_seen": 63551408, + "step": 29445 + }, + { + "epoch": 4.804241435562806, + "grad_norm": 0.23420119285583496, + "learning_rate": 3.101350853421986e-05, + "loss": 0.0344, + "num_input_tokens_seen": 63562160, + "step": 29450 + }, + { + "epoch": 4.80505709624796, + "grad_norm": 0.05053994432091713, + "learning_rate": 3.1006599298368826e-05, + "loss": 0.0898, + "num_input_tokens_seen": 63573648, + "step": 29455 + }, + { + "epoch": 4.805872756933116, + "grad_norm": 0.044055283069610596, + "learning_rate": 3.099968957559239e-05, + "loss": 0.1203, + "num_input_tokens_seen": 63584048, + "step": 29460 + }, + { + "epoch": 4.806688417618271, + "grad_norm": 0.48202669620513916, + "learning_rate": 3.0992779366450666e-05, + "loss": 0.0509, + "num_input_tokens_seen": 63595344, + "step": 29465 + }, + { + "epoch": 4.807504078303426, + "grad_norm": 1.5036213397979736, + "learning_rate": 3.098586867150385e-05, + "loss": 0.1463, + "num_input_tokens_seen": 63606800, + "step": 29470 + }, + { + "epoch": 4.808319738988581, + "grad_norm": 1.6419960260391235, + "learning_rate": 3.097895749131214e-05, + "loss": 0.1886, + "num_input_tokens_seen": 63617456, + "step": 29475 + }, + { + "epoch": 4.809135399673735, + "grad_norm": 0.31495824456214905, + "learning_rate": 3.09720458264358e-05, + "loss": 0.2881, + "num_input_tokens_seen": 63628336, + "step": 29480 + }, + { + "epoch": 4.809951060358891, + "grad_norm": 0.056317877024412155, + "learning_rate": 3.096513367743513e-05, + "loss": 0.0553, + "num_input_tokens_seen": 63640720, + "step": 29485 + }, + { + "epoch": 4.810766721044046, + "grad_norm": 0.9567562341690063, + "learning_rate": 3.095822104487045e-05, + "loss": 0.0613, + "num_input_tokens_seen": 63650960, + "step": 29490 + }, + { + "epoch": 4.811582381729201, + "grad_norm": 1.729486346244812, + "learning_rate": 3.0951307929302136e-05, + "loss": 0.1338, + "num_input_tokens_seen": 63662192, + "step": 29495 + }, + { + "epoch": 4.8123980424143555, + "grad_norm": 1.5991438627243042, + "learning_rate": 3.094439433129061e-05, + "loss": 0.1113, + "num_input_tokens_seen": 63673232, + "step": 29500 + }, + { + "epoch": 4.81321370309951, + "grad_norm": 0.04654611274600029, + "learning_rate": 3.093748025139632e-05, + "loss": 0.0239, + "num_input_tokens_seen": 63683728, + "step": 29505 + }, + { + "epoch": 4.814029363784666, + "grad_norm": 1.098647952079773, + "learning_rate": 3.093056569017975e-05, + "loss": 0.0808, + "num_input_tokens_seen": 63694768, + "step": 29510 + }, + { + "epoch": 4.814845024469821, + "grad_norm": 0.11613412201404572, + "learning_rate": 3.0923650648201436e-05, + "loss": 0.1538, + "num_input_tokens_seen": 63705488, + "step": 29515 + }, + { + "epoch": 4.815660685154976, + "grad_norm": 0.9862400889396667, + "learning_rate": 3.0916735126021945e-05, + "loss": 0.2528, + "num_input_tokens_seen": 63715600, + "step": 29520 + }, + { + "epoch": 4.8164763458401305, + "grad_norm": 1.4751365184783936, + "learning_rate": 3.090981912420188e-05, + "loss": 0.2008, + "num_input_tokens_seen": 63726000, + "step": 29525 + }, + { + "epoch": 4.817292006525285, + "grad_norm": 0.6519401669502258, + "learning_rate": 3.09029026433019e-05, + "loss": 0.1591, + "num_input_tokens_seen": 63736400, + "step": 29530 + }, + { + "epoch": 4.81810766721044, + "grad_norm": 1.5798346996307373, + "learning_rate": 3.0895985683882675e-05, + "loss": 0.42, + "num_input_tokens_seen": 63747600, + "step": 29535 + }, + { + "epoch": 4.818923327895595, + "grad_norm": 0.20431646704673767, + "learning_rate": 3.088906824650493e-05, + "loss": 0.0355, + "num_input_tokens_seen": 63758416, + "step": 29540 + }, + { + "epoch": 4.819738988580751, + "grad_norm": 0.19323883950710297, + "learning_rate": 3.088215033172944e-05, + "loss": 0.1731, + "num_input_tokens_seen": 63768976, + "step": 29545 + }, + { + "epoch": 4.8205546492659055, + "grad_norm": 0.18188121914863586, + "learning_rate": 3.087523194011699e-05, + "loss": 0.0959, + "num_input_tokens_seen": 63778896, + "step": 29550 + }, + { + "epoch": 4.82137030995106, + "grad_norm": 0.575604259967804, + "learning_rate": 3.086831307222844e-05, + "loss": 0.1089, + "num_input_tokens_seen": 63790384, + "step": 29555 + }, + { + "epoch": 4.822185970636215, + "grad_norm": 0.1612223982810974, + "learning_rate": 3.086139372862464e-05, + "loss": 0.0328, + "num_input_tokens_seen": 63800688, + "step": 29560 + }, + { + "epoch": 4.82300163132137, + "grad_norm": 0.07010478526353836, + "learning_rate": 3.085447390986653e-05, + "loss": 0.0508, + "num_input_tokens_seen": 63811856, + "step": 29565 + }, + { + "epoch": 4.823817292006526, + "grad_norm": 0.2426881641149521, + "learning_rate": 3.084755361651507e-05, + "loss": 0.1312, + "num_input_tokens_seen": 63822576, + "step": 29570 + }, + { + "epoch": 4.8246329526916805, + "grad_norm": 0.4278726875782013, + "learning_rate": 3.0840632849131236e-05, + "loss": 0.037, + "num_input_tokens_seen": 63833168, + "step": 29575 + }, + { + "epoch": 4.825448613376835, + "grad_norm": 0.3289976119995117, + "learning_rate": 3.083371160827606e-05, + "loss": 0.0371, + "num_input_tokens_seen": 63843312, + "step": 29580 + }, + { + "epoch": 4.82626427406199, + "grad_norm": 0.7106301784515381, + "learning_rate": 3.082678989451063e-05, + "loss": 0.2606, + "num_input_tokens_seen": 63853904, + "step": 29585 + }, + { + "epoch": 4.827079934747145, + "grad_norm": 2.023566484451294, + "learning_rate": 3.081986770839605e-05, + "loss": 0.431, + "num_input_tokens_seen": 63865072, + "step": 29590 + }, + { + "epoch": 4.827895595432301, + "grad_norm": 0.7260615229606628, + "learning_rate": 3.0812945050493464e-05, + "loss": 0.0624, + "num_input_tokens_seen": 63875760, + "step": 29595 + }, + { + "epoch": 4.828711256117455, + "grad_norm": 0.1606421023607254, + "learning_rate": 3.080602192136405e-05, + "loss": 0.0465, + "num_input_tokens_seen": 63887216, + "step": 29600 + }, + { + "epoch": 4.82952691680261, + "grad_norm": 0.05963525548577309, + "learning_rate": 3.079909832156905e-05, + "loss": 0.1096, + "num_input_tokens_seen": 63897104, + "step": 29605 + }, + { + "epoch": 4.830342577487765, + "grad_norm": 0.9887569546699524, + "learning_rate": 3.0792174251669706e-05, + "loss": 0.193, + "num_input_tokens_seen": 63907984, + "step": 29610 + }, + { + "epoch": 4.83115823817292, + "grad_norm": 0.7576796412467957, + "learning_rate": 3.078524971222733e-05, + "loss": 0.1099, + "num_input_tokens_seen": 63918640, + "step": 29615 + }, + { + "epoch": 4.831973898858075, + "grad_norm": 1.5732847452163696, + "learning_rate": 3.0778324703803256e-05, + "loss": 0.1694, + "num_input_tokens_seen": 63929136, + "step": 29620 + }, + { + "epoch": 4.8327895595432295, + "grad_norm": 0.42717090249061584, + "learning_rate": 3.0771399226958865e-05, + "loss": 0.2654, + "num_input_tokens_seen": 63940464, + "step": 29625 + }, + { + "epoch": 4.833605220228385, + "grad_norm": 1.4015424251556396, + "learning_rate": 3.076447328225557e-05, + "loss": 0.1796, + "num_input_tokens_seen": 63950512, + "step": 29630 + }, + { + "epoch": 4.83442088091354, + "grad_norm": 1.820035696029663, + "learning_rate": 3.075754687025482e-05, + "loss": 0.2912, + "num_input_tokens_seen": 63961840, + "step": 29635 + }, + { + "epoch": 4.835236541598695, + "grad_norm": 0.6336179971694946, + "learning_rate": 3.0750619991518115e-05, + "loss": 0.1011, + "num_input_tokens_seen": 63971440, + "step": 29640 + }, + { + "epoch": 4.83605220228385, + "grad_norm": 0.751527726650238, + "learning_rate": 3.074369264660697e-05, + "loss": 0.109, + "num_input_tokens_seen": 63982608, + "step": 29645 + }, + { + "epoch": 4.8368678629690045, + "grad_norm": 0.099815733730793, + "learning_rate": 3.0736764836082954e-05, + "loss": 0.2025, + "num_input_tokens_seen": 63994000, + "step": 29650 + }, + { + "epoch": 4.83768352365416, + "grad_norm": 0.7557456493377686, + "learning_rate": 3.072983656050767e-05, + "loss": 0.1271, + "num_input_tokens_seen": 64005008, + "step": 29655 + }, + { + "epoch": 4.838499184339315, + "grad_norm": 0.6797901391983032, + "learning_rate": 3.072290782044276e-05, + "loss": 0.1139, + "num_input_tokens_seen": 64014448, + "step": 29660 + }, + { + "epoch": 4.83931484502447, + "grad_norm": 0.3730597496032715, + "learning_rate": 3.0715978616449906e-05, + "loss": 0.0368, + "num_input_tokens_seen": 64026672, + "step": 29665 + }, + { + "epoch": 4.840130505709625, + "grad_norm": 0.16916845738887787, + "learning_rate": 3.070904894909083e-05, + "loss": 0.1421, + "num_input_tokens_seen": 64036752, + "step": 29670 + }, + { + "epoch": 4.8409461663947795, + "grad_norm": 0.27607303857803345, + "learning_rate": 3.070211881892727e-05, + "loss": 0.1875, + "num_input_tokens_seen": 64047760, + "step": 29675 + }, + { + "epoch": 4.841761827079935, + "grad_norm": 0.09065970033407211, + "learning_rate": 3.069518822652103e-05, + "loss": 0.2085, + "num_input_tokens_seen": 64059664, + "step": 29680 + }, + { + "epoch": 4.84257748776509, + "grad_norm": 1.7212302684783936, + "learning_rate": 3.0688257172433944e-05, + "loss": 0.13, + "num_input_tokens_seen": 64071408, + "step": 29685 + }, + { + "epoch": 4.843393148450245, + "grad_norm": 0.408094584941864, + "learning_rate": 3.068132565722786e-05, + "loss": 0.1056, + "num_input_tokens_seen": 64082448, + "step": 29690 + }, + { + "epoch": 4.8442088091354, + "grad_norm": 1.0667849779129028, + "learning_rate": 3.06743936814647e-05, + "loss": 0.108, + "num_input_tokens_seen": 64094320, + "step": 29695 + }, + { + "epoch": 4.8450244698205545, + "grad_norm": 0.3523818254470825, + "learning_rate": 3.0667461245706386e-05, + "loss": 0.157, + "num_input_tokens_seen": 64105392, + "step": 29700 + }, + { + "epoch": 4.845840130505709, + "grad_norm": 0.40902161598205566, + "learning_rate": 3.066052835051491e-05, + "loss": 0.1697, + "num_input_tokens_seen": 64117008, + "step": 29705 + }, + { + "epoch": 4.846655791190865, + "grad_norm": 1.077429175376892, + "learning_rate": 3.065359499645228e-05, + "loss": 0.4585, + "num_input_tokens_seen": 64128016, + "step": 29710 + }, + { + "epoch": 4.84747145187602, + "grad_norm": 0.9659509062767029, + "learning_rate": 3.064666118408057e-05, + "loss": 0.1412, + "num_input_tokens_seen": 64138448, + "step": 29715 + }, + { + "epoch": 4.848287112561175, + "grad_norm": 0.22391922771930695, + "learning_rate": 3.0639726913961833e-05, + "loss": 0.0308, + "num_input_tokens_seen": 64148848, + "step": 29720 + }, + { + "epoch": 4.849102773246329, + "grad_norm": 1.089530110359192, + "learning_rate": 3.0632792186658225e-05, + "loss": 0.1869, + "num_input_tokens_seen": 64159280, + "step": 29725 + }, + { + "epoch": 4.849918433931484, + "grad_norm": 0.1461264044046402, + "learning_rate": 3.062585700273191e-05, + "loss": 0.1295, + "num_input_tokens_seen": 64170384, + "step": 29730 + }, + { + "epoch": 4.850734094616639, + "grad_norm": 0.06906203180551529, + "learning_rate": 3.0618921362745075e-05, + "loss": 0.2113, + "num_input_tokens_seen": 64180112, + "step": 29735 + }, + { + "epoch": 4.851549755301795, + "grad_norm": 0.7489222884178162, + "learning_rate": 3.061198526725996e-05, + "loss": 0.0943, + "num_input_tokens_seen": 64190032, + "step": 29740 + }, + { + "epoch": 4.85236541598695, + "grad_norm": 1.145715594291687, + "learning_rate": 3.060504871683885e-05, + "loss": 0.1368, + "num_input_tokens_seen": 64201936, + "step": 29745 + }, + { + "epoch": 4.853181076672104, + "grad_norm": 0.8044369220733643, + "learning_rate": 3.059811171204404e-05, + "loss": 0.0706, + "num_input_tokens_seen": 64212784, + "step": 29750 + }, + { + "epoch": 4.853996737357259, + "grad_norm": 0.06835240870714188, + "learning_rate": 3.0591174253437904e-05, + "loss": 0.0868, + "num_input_tokens_seen": 64224208, + "step": 29755 + }, + { + "epoch": 4.854812398042414, + "grad_norm": 0.33019015192985535, + "learning_rate": 3.05842363415828e-05, + "loss": 0.1096, + "num_input_tokens_seen": 64235792, + "step": 29760 + }, + { + "epoch": 4.85562805872757, + "grad_norm": 0.07190854847431183, + "learning_rate": 3.057729797704118e-05, + "loss": 0.133, + "num_input_tokens_seen": 64246512, + "step": 29765 + }, + { + "epoch": 4.856443719412725, + "grad_norm": 0.20802675187587738, + "learning_rate": 3.057035916037548e-05, + "loss": 0.074, + "num_input_tokens_seen": 64256624, + "step": 29770 + }, + { + "epoch": 4.857259380097879, + "grad_norm": 0.4216321110725403, + "learning_rate": 3.05634198921482e-05, + "loss": 0.1596, + "num_input_tokens_seen": 64266544, + "step": 29775 + }, + { + "epoch": 4.858075040783034, + "grad_norm": 1.1335259675979614, + "learning_rate": 3.055648017292188e-05, + "loss": 0.2361, + "num_input_tokens_seen": 64277904, + "step": 29780 + }, + { + "epoch": 4.858890701468189, + "grad_norm": 0.17350880801677704, + "learning_rate": 3.0549540003259084e-05, + "loss": 0.0781, + "num_input_tokens_seen": 64288464, + "step": 29785 + }, + { + "epoch": 4.859706362153344, + "grad_norm": 0.17786061763763428, + "learning_rate": 3.054259938372242e-05, + "loss": 0.2554, + "num_input_tokens_seen": 64299408, + "step": 29790 + }, + { + "epoch": 4.8605220228384995, + "grad_norm": 2.4030632972717285, + "learning_rate": 3.0535658314874515e-05, + "loss": 0.2761, + "num_input_tokens_seen": 64310224, + "step": 29795 + }, + { + "epoch": 4.861337683523654, + "grad_norm": 0.9891718029975891, + "learning_rate": 3.0528716797278064e-05, + "loss": 0.0879, + "num_input_tokens_seen": 64321520, + "step": 29800 + }, + { + "epoch": 4.862153344208809, + "grad_norm": 2.1309545040130615, + "learning_rate": 3.052177483149578e-05, + "loss": 0.2302, + "num_input_tokens_seen": 64332880, + "step": 29805 + }, + { + "epoch": 4.862969004893964, + "grad_norm": 0.211195170879364, + "learning_rate": 3.0514832418090406e-05, + "loss": 0.1057, + "num_input_tokens_seen": 64342576, + "step": 29810 + }, + { + "epoch": 4.863784665579119, + "grad_norm": 0.3037104308605194, + "learning_rate": 3.050788955762474e-05, + "loss": 0.0892, + "num_input_tokens_seen": 64353936, + "step": 29815 + }, + { + "epoch": 4.864600326264274, + "grad_norm": 0.020603397861123085, + "learning_rate": 3.05009462506616e-05, + "loss": 0.0411, + "num_input_tokens_seen": 64364592, + "step": 29820 + }, + { + "epoch": 4.865415986949429, + "grad_norm": 0.2755267322063446, + "learning_rate": 3.049400249776384e-05, + "loss": 0.0663, + "num_input_tokens_seen": 64375696, + "step": 29825 + }, + { + "epoch": 4.866231647634584, + "grad_norm": 0.1555478423833847, + "learning_rate": 3.0487058299494363e-05, + "loss": 0.0443, + "num_input_tokens_seen": 64387440, + "step": 29830 + }, + { + "epoch": 4.867047308319739, + "grad_norm": 0.08368054777383804, + "learning_rate": 3.0480113656416103e-05, + "loss": 0.0652, + "num_input_tokens_seen": 64399088, + "step": 29835 + }, + { + "epoch": 4.867862969004894, + "grad_norm": 0.2932908236980438, + "learning_rate": 3.047316856909202e-05, + "loss": 0.337, + "num_input_tokens_seen": 64409712, + "step": 29840 + }, + { + "epoch": 4.868678629690049, + "grad_norm": 0.2726362645626068, + "learning_rate": 3.0466223038085128e-05, + "loss": 0.2343, + "num_input_tokens_seen": 64421200, + "step": 29845 + }, + { + "epoch": 4.869494290375204, + "grad_norm": 0.25737571716308594, + "learning_rate": 3.0459277063958457e-05, + "loss": 0.199, + "num_input_tokens_seen": 64431344, + "step": 29850 + }, + { + "epoch": 4.870309951060359, + "grad_norm": 0.12706886231899261, + "learning_rate": 3.0452330647275086e-05, + "loss": 0.0448, + "num_input_tokens_seen": 64442928, + "step": 29855 + }, + { + "epoch": 4.871125611745514, + "grad_norm": 0.05100022256374359, + "learning_rate": 3.044538378859813e-05, + "loss": 0.1523, + "num_input_tokens_seen": 64453968, + "step": 29860 + }, + { + "epoch": 4.871941272430669, + "grad_norm": 0.052014727145433426, + "learning_rate": 3.0438436488490736e-05, + "loss": 0.0569, + "num_input_tokens_seen": 64465008, + "step": 29865 + }, + { + "epoch": 4.872756933115824, + "grad_norm": 0.0675124078989029, + "learning_rate": 3.0431488747516085e-05, + "loss": 0.1483, + "num_input_tokens_seen": 64474832, + "step": 29870 + }, + { + "epoch": 4.873572593800979, + "grad_norm": 0.8720448017120361, + "learning_rate": 3.0424540566237398e-05, + "loss": 0.0949, + "num_input_tokens_seen": 64485776, + "step": 29875 + }, + { + "epoch": 4.874388254486134, + "grad_norm": 0.2557765543460846, + "learning_rate": 3.041759194521792e-05, + "loss": 0.1814, + "num_input_tokens_seen": 64497392, + "step": 29880 + }, + { + "epoch": 4.875203915171289, + "grad_norm": 0.35522353649139404, + "learning_rate": 3.0410642885020957e-05, + "loss": 0.0378, + "num_input_tokens_seen": 64508176, + "step": 29885 + }, + { + "epoch": 4.876019575856444, + "grad_norm": 1.9193897247314453, + "learning_rate": 3.040369338620983e-05, + "loss": 0.2732, + "num_input_tokens_seen": 64517808, + "step": 29890 + }, + { + "epoch": 4.876835236541599, + "grad_norm": 2.0133681297302246, + "learning_rate": 3.0396743449347893e-05, + "loss": 0.1674, + "num_input_tokens_seen": 64527696, + "step": 29895 + }, + { + "epoch": 4.877650897226753, + "grad_norm": 0.48696669936180115, + "learning_rate": 3.0389793074998553e-05, + "loss": 0.0543, + "num_input_tokens_seen": 64538448, + "step": 29900 + }, + { + "epoch": 4.878466557911908, + "grad_norm": 1.0542048215866089, + "learning_rate": 3.038284226372524e-05, + "loss": 0.0908, + "num_input_tokens_seen": 64548784, + "step": 29905 + }, + { + "epoch": 4.879282218597064, + "grad_norm": 0.04650663584470749, + "learning_rate": 3.0375891016091424e-05, + "loss": 0.0791, + "num_input_tokens_seen": 64559152, + "step": 29910 + }, + { + "epoch": 4.880097879282219, + "grad_norm": 0.1529679298400879, + "learning_rate": 3.0368939332660603e-05, + "loss": 0.0637, + "num_input_tokens_seen": 64570096, + "step": 29915 + }, + { + "epoch": 4.8809135399673735, + "grad_norm": 0.03222471848130226, + "learning_rate": 3.036198721399631e-05, + "loss": 0.0877, + "num_input_tokens_seen": 64581744, + "step": 29920 + }, + { + "epoch": 4.881729200652528, + "grad_norm": 0.6553998589515686, + "learning_rate": 3.035503466066214e-05, + "loss": 0.0977, + "num_input_tokens_seen": 64591856, + "step": 29925 + }, + { + "epoch": 4.882544861337683, + "grad_norm": 0.04681103304028511, + "learning_rate": 3.0348081673221678e-05, + "loss": 0.0263, + "num_input_tokens_seen": 64602192, + "step": 29930 + }, + { + "epoch": 4.883360522022839, + "grad_norm": 0.13021394610404968, + "learning_rate": 3.034112825223858e-05, + "loss": 0.0921, + "num_input_tokens_seen": 64612976, + "step": 29935 + }, + { + "epoch": 4.884176182707994, + "grad_norm": 0.272113174200058, + "learning_rate": 3.0334174398276532e-05, + "loss": 0.0569, + "num_input_tokens_seen": 64623984, + "step": 29940 + }, + { + "epoch": 4.8849918433931485, + "grad_norm": 0.8153955340385437, + "learning_rate": 3.032722011189924e-05, + "loss": 0.0735, + "num_input_tokens_seen": 64634992, + "step": 29945 + }, + { + "epoch": 4.885807504078303, + "grad_norm": 0.40620386600494385, + "learning_rate": 3.032026539367046e-05, + "loss": 0.1614, + "num_input_tokens_seen": 64645840, + "step": 29950 + }, + { + "epoch": 4.886623164763458, + "grad_norm": 0.18715421855449677, + "learning_rate": 3.0313310244153968e-05, + "loss": 0.1139, + "num_input_tokens_seen": 64656016, + "step": 29955 + }, + { + "epoch": 4.887438825448614, + "grad_norm": 0.15265630185604095, + "learning_rate": 3.0306354663913588e-05, + "loss": 0.0709, + "num_input_tokens_seen": 64667760, + "step": 29960 + }, + { + "epoch": 4.888254486133769, + "grad_norm": 0.890955924987793, + "learning_rate": 3.029939865351317e-05, + "loss": 0.1602, + "num_input_tokens_seen": 64677872, + "step": 29965 + }, + { + "epoch": 4.8890701468189235, + "grad_norm": 0.03800920024514198, + "learning_rate": 3.0292442213516613e-05, + "loss": 0.0238, + "num_input_tokens_seen": 64688784, + "step": 29970 + }, + { + "epoch": 4.889885807504078, + "grad_norm": 0.5078579783439636, + "learning_rate": 3.0285485344487834e-05, + "loss": 0.1822, + "num_input_tokens_seen": 64698576, + "step": 29975 + }, + { + "epoch": 4.890701468189233, + "grad_norm": 0.0714603066444397, + "learning_rate": 3.02785280469908e-05, + "loss": 0.02, + "num_input_tokens_seen": 64709520, + "step": 29980 + }, + { + "epoch": 4.891517128874388, + "grad_norm": 0.5037271976470947, + "learning_rate": 3.0271570321589494e-05, + "loss": 0.073, + "num_input_tokens_seen": 64720592, + "step": 29985 + }, + { + "epoch": 4.892332789559543, + "grad_norm": 1.4138091802597046, + "learning_rate": 3.026461216884795e-05, + "loss": 0.1475, + "num_input_tokens_seen": 64731568, + "step": 29990 + }, + { + "epoch": 4.8931484502446985, + "grad_norm": 2.180875062942505, + "learning_rate": 3.025765358933024e-05, + "loss": 0.1683, + "num_input_tokens_seen": 64742256, + "step": 29995 + }, + { + "epoch": 4.893964110929853, + "grad_norm": 0.2247498780488968, + "learning_rate": 3.0250694583600448e-05, + "loss": 0.0362, + "num_input_tokens_seen": 64752208, + "step": 30000 + }, + { + "epoch": 4.894779771615008, + "grad_norm": 0.14722786843776703, + "learning_rate": 3.024373515222271e-05, + "loss": 0.1496, + "num_input_tokens_seen": 64761168, + "step": 30005 + }, + { + "epoch": 4.895595432300163, + "grad_norm": 0.09746509045362473, + "learning_rate": 3.0236775295761194e-05, + "loss": 0.0811, + "num_input_tokens_seen": 64773040, + "step": 30010 + }, + { + "epoch": 4.896411092985318, + "grad_norm": 0.37557053565979004, + "learning_rate": 3.02298150147801e-05, + "loss": 0.2454, + "num_input_tokens_seen": 64785584, + "step": 30015 + }, + { + "epoch": 4.897226753670473, + "grad_norm": 0.12243838608264923, + "learning_rate": 3.022285430984367e-05, + "loss": 0.1035, + "num_input_tokens_seen": 64797648, + "step": 30020 + }, + { + "epoch": 4.898042414355628, + "grad_norm": 2.277400016784668, + "learning_rate": 3.0215893181516163e-05, + "loss": 0.081, + "num_input_tokens_seen": 64808912, + "step": 30025 + }, + { + "epoch": 4.898858075040783, + "grad_norm": 0.17473246157169342, + "learning_rate": 3.02089316303619e-05, + "loss": 0.23, + "num_input_tokens_seen": 64820336, + "step": 30030 + }, + { + "epoch": 4.899673735725938, + "grad_norm": 0.034842632710933685, + "learning_rate": 3.0201969656945196e-05, + "loss": 0.0639, + "num_input_tokens_seen": 64831600, + "step": 30035 + }, + { + "epoch": 4.900489396411093, + "grad_norm": 0.4822215735912323, + "learning_rate": 3.0195007261830438e-05, + "loss": 0.0509, + "num_input_tokens_seen": 64842000, + "step": 30040 + }, + { + "epoch": 4.901305057096248, + "grad_norm": 0.44661039113998413, + "learning_rate": 3.018804444558204e-05, + "loss": 0.1368, + "num_input_tokens_seen": 64853456, + "step": 30045 + }, + { + "epoch": 4.902120717781403, + "grad_norm": 0.39672183990478516, + "learning_rate": 3.018108120876443e-05, + "loss": 0.0863, + "num_input_tokens_seen": 64863952, + "step": 30050 + }, + { + "epoch": 4.902936378466558, + "grad_norm": 0.7208009958267212, + "learning_rate": 3.0174117551942087e-05, + "loss": 0.1797, + "num_input_tokens_seen": 64874128, + "step": 30055 + }, + { + "epoch": 4.903752039151713, + "grad_norm": 0.1213861033320427, + "learning_rate": 3.0167153475679527e-05, + "loss": 0.0554, + "num_input_tokens_seen": 64883856, + "step": 30060 + }, + { + "epoch": 4.904567699836868, + "grad_norm": 1.3992159366607666, + "learning_rate": 3.0160188980541288e-05, + "loss": 0.1358, + "num_input_tokens_seen": 64894736, + "step": 30065 + }, + { + "epoch": 4.9053833605220225, + "grad_norm": 0.12332815676927567, + "learning_rate": 3.0153224067091952e-05, + "loss": 0.0769, + "num_input_tokens_seen": 64905808, + "step": 30070 + }, + { + "epoch": 4.906199021207177, + "grad_norm": 0.8598760366439819, + "learning_rate": 3.0146258735896117e-05, + "loss": 0.2366, + "num_input_tokens_seen": 64917104, + "step": 30075 + }, + { + "epoch": 4.907014681892333, + "grad_norm": 0.2603754997253418, + "learning_rate": 3.0139292987518443e-05, + "loss": 0.0329, + "num_input_tokens_seen": 64926800, + "step": 30080 + }, + { + "epoch": 4.907830342577488, + "grad_norm": 0.062192875891923904, + "learning_rate": 3.0132326822523606e-05, + "loss": 0.1152, + "num_input_tokens_seen": 64937904, + "step": 30085 + }, + { + "epoch": 4.908646003262643, + "grad_norm": 0.019849726930260658, + "learning_rate": 3.01253602414763e-05, + "loss": 0.2487, + "num_input_tokens_seen": 64948592, + "step": 30090 + }, + { + "epoch": 4.9094616639477975, + "grad_norm": 0.3418304920196533, + "learning_rate": 3.0118393244941302e-05, + "loss": 0.0702, + "num_input_tokens_seen": 64959152, + "step": 30095 + }, + { + "epoch": 4.910277324632952, + "grad_norm": 0.059327930212020874, + "learning_rate": 3.011142583348337e-05, + "loss": 0.3623, + "num_input_tokens_seen": 64970736, + "step": 30100 + }, + { + "epoch": 4.911092985318108, + "grad_norm": 0.9177201390266418, + "learning_rate": 3.010445800766733e-05, + "loss": 0.1454, + "num_input_tokens_seen": 64982032, + "step": 30105 + }, + { + "epoch": 4.911908646003263, + "grad_norm": 0.22226189076900482, + "learning_rate": 3.0097489768058022e-05, + "loss": 0.0702, + "num_input_tokens_seen": 64994448, + "step": 30110 + }, + { + "epoch": 4.912724306688418, + "grad_norm": 0.2005605548620224, + "learning_rate": 3.0090521115220327e-05, + "loss": 0.1523, + "num_input_tokens_seen": 65005744, + "step": 30115 + }, + { + "epoch": 4.9135399673735725, + "grad_norm": 0.4192376136779785, + "learning_rate": 3.0083552049719167e-05, + "loss": 0.0425, + "num_input_tokens_seen": 65016848, + "step": 30120 + }, + { + "epoch": 4.914355628058727, + "grad_norm": 0.6007254123687744, + "learning_rate": 3.0076582572119473e-05, + "loss": 0.174, + "num_input_tokens_seen": 65027600, + "step": 30125 + }, + { + "epoch": 4.915171288743883, + "grad_norm": 0.39774954319000244, + "learning_rate": 3.0069612682986247e-05, + "loss": 0.1007, + "num_input_tokens_seen": 65039120, + "step": 30130 + }, + { + "epoch": 4.915986949429038, + "grad_norm": 1.1897732019424438, + "learning_rate": 3.0062642382884494e-05, + "loss": 0.0733, + "num_input_tokens_seen": 65050512, + "step": 30135 + }, + { + "epoch": 4.916802610114193, + "grad_norm": 1.7043700218200684, + "learning_rate": 3.005567167237926e-05, + "loss": 0.1967, + "num_input_tokens_seen": 65059888, + "step": 30140 + }, + { + "epoch": 4.917618270799347, + "grad_norm": 0.26112696528434753, + "learning_rate": 3.004870055203562e-05, + "loss": 0.078, + "num_input_tokens_seen": 65071120, + "step": 30145 + }, + { + "epoch": 4.918433931484502, + "grad_norm": 0.15263231098651886, + "learning_rate": 3.0041729022418702e-05, + "loss": 0.0944, + "num_input_tokens_seen": 65081744, + "step": 30150 + }, + { + "epoch": 4.919249592169657, + "grad_norm": 0.14855341613292694, + "learning_rate": 3.003475708409365e-05, + "loss": 0.0879, + "num_input_tokens_seen": 65093040, + "step": 30155 + }, + { + "epoch": 4.920065252854813, + "grad_norm": 0.036632757633924484, + "learning_rate": 3.0027784737625646e-05, + "loss": 0.2129, + "num_input_tokens_seen": 65103056, + "step": 30160 + }, + { + "epoch": 4.920880913539968, + "grad_norm": 0.29426074028015137, + "learning_rate": 3.00208119835799e-05, + "loss": 0.0362, + "num_input_tokens_seen": 65113904, + "step": 30165 + }, + { + "epoch": 4.921696574225122, + "grad_norm": 1.1760820150375366, + "learning_rate": 3.0013838822521655e-05, + "loss": 0.1895, + "num_input_tokens_seen": 65124016, + "step": 30170 + }, + { + "epoch": 4.922512234910277, + "grad_norm": 0.14327462017536163, + "learning_rate": 3.0006865255016192e-05, + "loss": 0.0193, + "num_input_tokens_seen": 65135120, + "step": 30175 + }, + { + "epoch": 4.923327895595432, + "grad_norm": 0.383309930562973, + "learning_rate": 2.9999891281628832e-05, + "loss": 0.1685, + "num_input_tokens_seen": 65144784, + "step": 30180 + }, + { + "epoch": 4.924143556280587, + "grad_norm": 0.7071207165718079, + "learning_rate": 2.9992916902924917e-05, + "loss": 0.1481, + "num_input_tokens_seen": 65154928, + "step": 30185 + }, + { + "epoch": 4.924959216965743, + "grad_norm": 0.2624954879283905, + "learning_rate": 2.998594211946982e-05, + "loss": 0.0378, + "num_input_tokens_seen": 65165808, + "step": 30190 + }, + { + "epoch": 4.925774877650897, + "grad_norm": 0.44247570633888245, + "learning_rate": 2.9978966931828957e-05, + "loss": 0.1487, + "num_input_tokens_seen": 65175952, + "step": 30195 + }, + { + "epoch": 4.926590538336052, + "grad_norm": 0.309957891702652, + "learning_rate": 2.9971991340567773e-05, + "loss": 0.1295, + "num_input_tokens_seen": 65185904, + "step": 30200 + }, + { + "epoch": 4.927406199021207, + "grad_norm": 0.1204834133386612, + "learning_rate": 2.9965015346251747e-05, + "loss": 0.0853, + "num_input_tokens_seen": 65196848, + "step": 30205 + }, + { + "epoch": 4.928221859706362, + "grad_norm": 0.20536422729492188, + "learning_rate": 2.995803894944637e-05, + "loss": 0.2135, + "num_input_tokens_seen": 65207440, + "step": 30210 + }, + { + "epoch": 4.9290375203915175, + "grad_norm": 0.11078263819217682, + "learning_rate": 2.9951062150717212e-05, + "loss": 0.1175, + "num_input_tokens_seen": 65217904, + "step": 30215 + }, + { + "epoch": 4.929853181076672, + "grad_norm": 0.2862643897533417, + "learning_rate": 2.994408495062983e-05, + "loss": 0.0486, + "num_input_tokens_seen": 65228464, + "step": 30220 + }, + { + "epoch": 4.930668841761827, + "grad_norm": 1.9985431432724, + "learning_rate": 2.9937107349749842e-05, + "loss": 0.2383, + "num_input_tokens_seen": 65239024, + "step": 30225 + }, + { + "epoch": 4.931484502446982, + "grad_norm": 2.0456202030181885, + "learning_rate": 2.9930129348642877e-05, + "loss": 0.1024, + "num_input_tokens_seen": 65249872, + "step": 30230 + }, + { + "epoch": 4.932300163132137, + "grad_norm": 0.14065244793891907, + "learning_rate": 2.9923150947874613e-05, + "loss": 0.2327, + "num_input_tokens_seen": 65260560, + "step": 30235 + }, + { + "epoch": 4.933115823817292, + "grad_norm": 0.48798397183418274, + "learning_rate": 2.991617214801075e-05, + "loss": 0.0503, + "num_input_tokens_seen": 65272016, + "step": 30240 + }, + { + "epoch": 4.933931484502447, + "grad_norm": 0.25700968503952026, + "learning_rate": 2.9909192949617036e-05, + "loss": 0.0336, + "num_input_tokens_seen": 65281136, + "step": 30245 + }, + { + "epoch": 4.934747145187602, + "grad_norm": 1.455586314201355, + "learning_rate": 2.9902213353259223e-05, + "loss": 0.1182, + "num_input_tokens_seen": 65290640, + "step": 30250 + }, + { + "epoch": 4.935562805872757, + "grad_norm": 1.790833592414856, + "learning_rate": 2.989523335950313e-05, + "loss": 0.321, + "num_input_tokens_seen": 65302032, + "step": 30255 + }, + { + "epoch": 4.936378466557912, + "grad_norm": 0.24753662943840027, + "learning_rate": 2.9888252968914576e-05, + "loss": 0.2569, + "num_input_tokens_seen": 65313168, + "step": 30260 + }, + { + "epoch": 4.937194127243067, + "grad_norm": 0.27628281712532043, + "learning_rate": 2.988127218205944e-05, + "loss": 0.0213, + "num_input_tokens_seen": 65324528, + "step": 30265 + }, + { + "epoch": 4.938009787928221, + "grad_norm": 0.9563420414924622, + "learning_rate": 2.9874290999503606e-05, + "loss": 0.064, + "num_input_tokens_seen": 65335568, + "step": 30270 + }, + { + "epoch": 4.938825448613377, + "grad_norm": 0.6594898104667664, + "learning_rate": 2.9867309421813018e-05, + "loss": 0.2238, + "num_input_tokens_seen": 65346032, + "step": 30275 + }, + { + "epoch": 4.939641109298532, + "grad_norm": 0.12434316426515579, + "learning_rate": 2.9860327449553626e-05, + "loss": 0.0975, + "num_input_tokens_seen": 65357712, + "step": 30280 + }, + { + "epoch": 4.940456769983687, + "grad_norm": 0.032138511538505554, + "learning_rate": 2.9853345083291434e-05, + "loss": 0.0671, + "num_input_tokens_seen": 65369552, + "step": 30285 + }, + { + "epoch": 4.941272430668842, + "grad_norm": 0.8250702023506165, + "learning_rate": 2.9846362323592463e-05, + "loss": 0.1784, + "num_input_tokens_seen": 65380912, + "step": 30290 + }, + { + "epoch": 4.942088091353996, + "grad_norm": 0.890328586101532, + "learning_rate": 2.9839379171022776e-05, + "loss": 0.3446, + "num_input_tokens_seen": 65392080, + "step": 30295 + }, + { + "epoch": 4.942903752039152, + "grad_norm": 0.053008195012807846, + "learning_rate": 2.983239562614845e-05, + "loss": 0.0228, + "num_input_tokens_seen": 65402256, + "step": 30300 + }, + { + "epoch": 4.943719412724307, + "grad_norm": 1.5324891805648804, + "learning_rate": 2.982541168953562e-05, + "loss": 0.2103, + "num_input_tokens_seen": 65413456, + "step": 30305 + }, + { + "epoch": 4.944535073409462, + "grad_norm": 0.7249002456665039, + "learning_rate": 2.9818427361750434e-05, + "loss": 0.1275, + "num_input_tokens_seen": 65424912, + "step": 30310 + }, + { + "epoch": 4.945350734094617, + "grad_norm": 0.1453290581703186, + "learning_rate": 2.9811442643359076e-05, + "loss": 0.0189, + "num_input_tokens_seen": 65434896, + "step": 30315 + }, + { + "epoch": 4.946166394779771, + "grad_norm": 0.3903196454048157, + "learning_rate": 2.9804457534927772e-05, + "loss": 0.0776, + "num_input_tokens_seen": 65445424, + "step": 30320 + }, + { + "epoch": 4.946982055464927, + "grad_norm": 0.17624586820602417, + "learning_rate": 2.9797472037022757e-05, + "loss": 0.132, + "num_input_tokens_seen": 65454672, + "step": 30325 + }, + { + "epoch": 4.947797716150082, + "grad_norm": 0.0656774640083313, + "learning_rate": 2.9790486150210316e-05, + "loss": 0.0907, + "num_input_tokens_seen": 65465776, + "step": 30330 + }, + { + "epoch": 4.948613376835237, + "grad_norm": 1.8595796823501587, + "learning_rate": 2.9783499875056766e-05, + "loss": 0.3258, + "num_input_tokens_seen": 65477168, + "step": 30335 + }, + { + "epoch": 4.9494290375203915, + "grad_norm": 1.377127766609192, + "learning_rate": 2.9776513212128442e-05, + "loss": 0.0494, + "num_input_tokens_seen": 65487248, + "step": 30340 + }, + { + "epoch": 4.950244698205546, + "grad_norm": 0.289554238319397, + "learning_rate": 2.976952616199172e-05, + "loss": 0.0263, + "num_input_tokens_seen": 65497360, + "step": 30345 + }, + { + "epoch": 4.951060358890701, + "grad_norm": 0.5253307819366455, + "learning_rate": 2.9762538725213007e-05, + "loss": 0.1182, + "num_input_tokens_seen": 65507728, + "step": 30350 + }, + { + "epoch": 4.951876019575856, + "grad_norm": 0.36026281118392944, + "learning_rate": 2.9755550902358737e-05, + "loss": 0.108, + "num_input_tokens_seen": 65517104, + "step": 30355 + }, + { + "epoch": 4.952691680261012, + "grad_norm": 1.3345162868499756, + "learning_rate": 2.9748562693995386e-05, + "loss": 0.1784, + "num_input_tokens_seen": 65529168, + "step": 30360 + }, + { + "epoch": 4.9535073409461665, + "grad_norm": 1.3442256450653076, + "learning_rate": 2.974157410068944e-05, + "loss": 0.1081, + "num_input_tokens_seen": 65538960, + "step": 30365 + }, + { + "epoch": 4.954323001631321, + "grad_norm": 0.02193724736571312, + "learning_rate": 2.9734585123007446e-05, + "loss": 0.0422, + "num_input_tokens_seen": 65549392, + "step": 30370 + }, + { + "epoch": 4.955138662316476, + "grad_norm": 1.449938178062439, + "learning_rate": 2.9727595761515958e-05, + "loss": 0.1146, + "num_input_tokens_seen": 65559824, + "step": 30375 + }, + { + "epoch": 4.955954323001631, + "grad_norm": 1.0855340957641602, + "learning_rate": 2.9720606016781577e-05, + "loss": 0.1529, + "num_input_tokens_seen": 65570896, + "step": 30380 + }, + { + "epoch": 4.956769983686787, + "grad_norm": 0.10187073051929474, + "learning_rate": 2.9713615889370917e-05, + "loss": 0.1193, + "num_input_tokens_seen": 65582384, + "step": 30385 + }, + { + "epoch": 4.9575856443719415, + "grad_norm": 1.8449482917785645, + "learning_rate": 2.9706625379850627e-05, + "loss": 0.3079, + "num_input_tokens_seen": 65593264, + "step": 30390 + }, + { + "epoch": 4.958401305057096, + "grad_norm": 0.4539341628551483, + "learning_rate": 2.9699634488787415e-05, + "loss": 0.2716, + "num_input_tokens_seen": 65604016, + "step": 30395 + }, + { + "epoch": 4.959216965742251, + "grad_norm": 0.916366696357727, + "learning_rate": 2.9692643216747978e-05, + "loss": 0.0915, + "num_input_tokens_seen": 65613616, + "step": 30400 + }, + { + "epoch": 4.960032626427406, + "grad_norm": 0.6873366236686707, + "learning_rate": 2.9685651564299077e-05, + "loss": 0.1992, + "num_input_tokens_seen": 65624816, + "step": 30405 + }, + { + "epoch": 4.960848287112562, + "grad_norm": 0.7760663032531738, + "learning_rate": 2.9678659532007475e-05, + "loss": 0.213, + "num_input_tokens_seen": 65635792, + "step": 30410 + }, + { + "epoch": 4.9616639477977165, + "grad_norm": 0.43969231843948364, + "learning_rate": 2.967166712044e-05, + "loss": 0.2024, + "num_input_tokens_seen": 65646512, + "step": 30415 + }, + { + "epoch": 4.962479608482871, + "grad_norm": 0.712770402431488, + "learning_rate": 2.9664674330163485e-05, + "loss": 0.1208, + "num_input_tokens_seen": 65656976, + "step": 30420 + }, + { + "epoch": 4.963295269168026, + "grad_norm": 1.4512652158737183, + "learning_rate": 2.96576811617448e-05, + "loss": 0.1329, + "num_input_tokens_seen": 65667792, + "step": 30425 + }, + { + "epoch": 4.964110929853181, + "grad_norm": 1.2038450241088867, + "learning_rate": 2.9650687615750843e-05, + "loss": 0.1286, + "num_input_tokens_seen": 65679088, + "step": 30430 + }, + { + "epoch": 4.964926590538336, + "grad_norm": 1.144261360168457, + "learning_rate": 2.964369369274856e-05, + "loss": 0.096, + "num_input_tokens_seen": 65689840, + "step": 30435 + }, + { + "epoch": 4.9657422512234906, + "grad_norm": 0.8356012105941772, + "learning_rate": 2.963669939330489e-05, + "loss": 0.1571, + "num_input_tokens_seen": 65700528, + "step": 30440 + }, + { + "epoch": 4.966557911908646, + "grad_norm": 0.16905619204044342, + "learning_rate": 2.962970471798685e-05, + "loss": 0.0592, + "num_input_tokens_seen": 65710128, + "step": 30445 + }, + { + "epoch": 4.967373572593801, + "grad_norm": 0.23513437807559967, + "learning_rate": 2.9622709667361455e-05, + "loss": 0.2329, + "num_input_tokens_seen": 65722064, + "step": 30450 + }, + { + "epoch": 4.968189233278956, + "grad_norm": 0.4741469621658325, + "learning_rate": 2.9615714241995758e-05, + "loss": 0.0629, + "num_input_tokens_seen": 65732336, + "step": 30455 + }, + { + "epoch": 4.969004893964111, + "grad_norm": 0.16741029918193817, + "learning_rate": 2.9608718442456844e-05, + "loss": 0.0755, + "num_input_tokens_seen": 65743504, + "step": 30460 + }, + { + "epoch": 4.9698205546492655, + "grad_norm": 0.5669999718666077, + "learning_rate": 2.9601722269311827e-05, + "loss": 0.1324, + "num_input_tokens_seen": 65754608, + "step": 30465 + }, + { + "epoch": 4.970636215334421, + "grad_norm": 0.8082267642021179, + "learning_rate": 2.9594725723127855e-05, + "loss": 0.1507, + "num_input_tokens_seen": 65765584, + "step": 30470 + }, + { + "epoch": 4.971451876019576, + "grad_norm": 0.09119617193937302, + "learning_rate": 2.9587728804472104e-05, + "loss": 0.252, + "num_input_tokens_seen": 65776816, + "step": 30475 + }, + { + "epoch": 4.972267536704731, + "grad_norm": 1.4475390911102295, + "learning_rate": 2.9580731513911773e-05, + "loss": 0.2011, + "num_input_tokens_seen": 65788336, + "step": 30480 + }, + { + "epoch": 4.973083197389886, + "grad_norm": 0.03726387023925781, + "learning_rate": 2.9573733852014112e-05, + "loss": 0.2435, + "num_input_tokens_seen": 65798704, + "step": 30485 + }, + { + "epoch": 4.9738988580750405, + "grad_norm": 0.5195260643959045, + "learning_rate": 2.9566735819346376e-05, + "loss": 0.1118, + "num_input_tokens_seen": 65808816, + "step": 30490 + }, + { + "epoch": 4.974714518760196, + "grad_norm": 0.6714190244674683, + "learning_rate": 2.9559737416475863e-05, + "loss": 0.1004, + "num_input_tokens_seen": 65819088, + "step": 30495 + }, + { + "epoch": 4.975530179445351, + "grad_norm": 2.1143596172332764, + "learning_rate": 2.9552738643969896e-05, + "loss": 0.2644, + "num_input_tokens_seen": 65830256, + "step": 30500 + }, + { + "epoch": 4.976345840130506, + "grad_norm": 0.031756166368722916, + "learning_rate": 2.9545739502395835e-05, + "loss": 0.1119, + "num_input_tokens_seen": 65839792, + "step": 30505 + }, + { + "epoch": 4.977161500815661, + "grad_norm": 0.4315786063671112, + "learning_rate": 2.9538739992321062e-05, + "loss": 0.0466, + "num_input_tokens_seen": 65850512, + "step": 30510 + }, + { + "epoch": 4.9779771615008155, + "grad_norm": 0.2136828899383545, + "learning_rate": 2.9531740114313e-05, + "loss": 0.1153, + "num_input_tokens_seen": 65861584, + "step": 30515 + }, + { + "epoch": 4.97879282218597, + "grad_norm": 0.6362653374671936, + "learning_rate": 2.9524739868939088e-05, + "loss": 0.3221, + "num_input_tokens_seen": 65872880, + "step": 30520 + }, + { + "epoch": 4.979608482871125, + "grad_norm": 0.1377904862165451, + "learning_rate": 2.9517739256766803e-05, + "loss": 0.1287, + "num_input_tokens_seen": 65882544, + "step": 30525 + }, + { + "epoch": 4.980424143556281, + "grad_norm": 0.12595771253108978, + "learning_rate": 2.9510738278363652e-05, + "loss": 0.037, + "num_input_tokens_seen": 65893456, + "step": 30530 + }, + { + "epoch": 4.981239804241436, + "grad_norm": 0.375393807888031, + "learning_rate": 2.950373693429717e-05, + "loss": 0.0859, + "num_input_tokens_seen": 65905264, + "step": 30535 + }, + { + "epoch": 4.9820554649265905, + "grad_norm": 1.8494465351104736, + "learning_rate": 2.949673522513492e-05, + "loss": 0.2844, + "num_input_tokens_seen": 65916400, + "step": 30540 + }, + { + "epoch": 4.982871125611745, + "grad_norm": 0.24733656644821167, + "learning_rate": 2.9489733151444497e-05, + "loss": 0.1162, + "num_input_tokens_seen": 65928272, + "step": 30545 + }, + { + "epoch": 4.9836867862969, + "grad_norm": 0.7175878286361694, + "learning_rate": 2.9482730713793526e-05, + "loss": 0.0625, + "num_input_tokens_seen": 65939792, + "step": 30550 + }, + { + "epoch": 4.984502446982056, + "grad_norm": 0.15073053538799286, + "learning_rate": 2.9475727912749656e-05, + "loss": 0.1082, + "num_input_tokens_seen": 65950576, + "step": 30555 + }, + { + "epoch": 4.985318107667211, + "grad_norm": 1.4865745306015015, + "learning_rate": 2.946872474888058e-05, + "loss": 0.2095, + "num_input_tokens_seen": 65960240, + "step": 30560 + }, + { + "epoch": 4.986133768352365, + "grad_norm": 0.7713416814804077, + "learning_rate": 2.9461721222753992e-05, + "loss": 0.141, + "num_input_tokens_seen": 65970544, + "step": 30565 + }, + { + "epoch": 4.98694942903752, + "grad_norm": 0.11139209568500519, + "learning_rate": 2.9454717334937638e-05, + "loss": 0.0232, + "num_input_tokens_seen": 65982768, + "step": 30570 + }, + { + "epoch": 4.987765089722675, + "grad_norm": 0.11969856172800064, + "learning_rate": 2.94477130859993e-05, + "loss": 0.0699, + "num_input_tokens_seen": 65993680, + "step": 30575 + }, + { + "epoch": 4.988580750407831, + "grad_norm": 1.034498691558838, + "learning_rate": 2.9440708476506773e-05, + "loss": 0.2004, + "num_input_tokens_seen": 66005136, + "step": 30580 + }, + { + "epoch": 4.989396411092986, + "grad_norm": 1.093869924545288, + "learning_rate": 2.943370350702789e-05, + "loss": 0.1508, + "num_input_tokens_seen": 66015376, + "step": 30585 + }, + { + "epoch": 4.99021207177814, + "grad_norm": 0.32599014043807983, + "learning_rate": 2.9426698178130495e-05, + "loss": 0.0416, + "num_input_tokens_seen": 66025904, + "step": 30590 + }, + { + "epoch": 4.991027732463295, + "grad_norm": 0.19211184978485107, + "learning_rate": 2.9419692490382488e-05, + "loss": 0.1568, + "num_input_tokens_seen": 66036560, + "step": 30595 + }, + { + "epoch": 4.99184339314845, + "grad_norm": 0.7873234748840332, + "learning_rate": 2.9412686444351782e-05, + "loss": 0.1897, + "num_input_tokens_seen": 66047856, + "step": 30600 + }, + { + "epoch": 4.992659053833605, + "grad_norm": 0.1156252846121788, + "learning_rate": 2.9405680040606326e-05, + "loss": 0.1887, + "num_input_tokens_seen": 66058736, + "step": 30605 + }, + { + "epoch": 4.993474714518761, + "grad_norm": 0.9220502376556396, + "learning_rate": 2.939867327971409e-05, + "loss": 0.0899, + "num_input_tokens_seen": 66068848, + "step": 30610 + }, + { + "epoch": 4.994290375203915, + "grad_norm": 1.8980004787445068, + "learning_rate": 2.939166616224308e-05, + "loss": 0.1147, + "num_input_tokens_seen": 66079760, + "step": 30615 + }, + { + "epoch": 4.99510603588907, + "grad_norm": 0.06531383097171783, + "learning_rate": 2.938465868876133e-05, + "loss": 0.1008, + "num_input_tokens_seen": 66090128, + "step": 30620 + }, + { + "epoch": 4.995921696574225, + "grad_norm": 1.7395237684249878, + "learning_rate": 2.9377650859836892e-05, + "loss": 0.1449, + "num_input_tokens_seen": 66100304, + "step": 30625 + }, + { + "epoch": 4.99673735725938, + "grad_norm": 1.0468535423278809, + "learning_rate": 2.9370642676037867e-05, + "loss": 0.2664, + "num_input_tokens_seen": 66111696, + "step": 30630 + }, + { + "epoch": 4.997553017944535, + "grad_norm": 1.3942664861679077, + "learning_rate": 2.936363413793237e-05, + "loss": 0.2453, + "num_input_tokens_seen": 66122064, + "step": 30635 + }, + { + "epoch": 4.99836867862969, + "grad_norm": 0.8007043600082397, + "learning_rate": 2.9356625246088554e-05, + "loss": 0.0402, + "num_input_tokens_seen": 66131376, + "step": 30640 + }, + { + "epoch": 4.999184339314845, + "grad_norm": 0.15810838341712952, + "learning_rate": 2.9349616001074588e-05, + "loss": 0.2031, + "num_input_tokens_seen": 66142800, + "step": 30645 + }, + { + "epoch": 5.0, + "grad_norm": 0.14717867970466614, + "learning_rate": 2.934260640345867e-05, + "loss": 0.1342, + "num_input_tokens_seen": 66152480, + "step": 30650 + }, + { + "epoch": 5.0, + "eval_loss": 0.13616938889026642, + "eval_runtime": 131.9355, + "eval_samples_per_second": 20.654, + "eval_steps_per_second": 5.169, + "num_input_tokens_seen": 66152480, + "step": 30650 + }, + { + "epoch": 5.000815660685155, + "grad_norm": 0.5758910179138184, + "learning_rate": 2.9335596453809055e-05, + "loss": 0.0454, + "num_input_tokens_seen": 66162912, + "step": 30655 + }, + { + "epoch": 5.00163132137031, + "grad_norm": 0.8695580363273621, + "learning_rate": 2.9328586152693986e-05, + "loss": 0.1273, + "num_input_tokens_seen": 66173312, + "step": 30660 + }, + { + "epoch": 5.002446982055465, + "grad_norm": 0.2505077123641968, + "learning_rate": 2.9321575500681757e-05, + "loss": 0.1145, + "num_input_tokens_seen": 66184832, + "step": 30665 + }, + { + "epoch": 5.00326264274062, + "grad_norm": 0.5564342737197876, + "learning_rate": 2.93145644983407e-05, + "loss": 0.068, + "num_input_tokens_seen": 66196544, + "step": 30670 + }, + { + "epoch": 5.004078303425775, + "grad_norm": 0.07125292718410492, + "learning_rate": 2.9307553146239146e-05, + "loss": 0.043, + "num_input_tokens_seen": 66207904, + "step": 30675 + }, + { + "epoch": 5.00489396411093, + "grad_norm": 0.18770147860050201, + "learning_rate": 2.930054144494548e-05, + "loss": 0.103, + "num_input_tokens_seen": 66218752, + "step": 30680 + }, + { + "epoch": 5.005709624796085, + "grad_norm": 0.14990520477294922, + "learning_rate": 2.9293529395028102e-05, + "loss": 0.0739, + "num_input_tokens_seen": 66230528, + "step": 30685 + }, + { + "epoch": 5.006525285481239, + "grad_norm": 1.371207594871521, + "learning_rate": 2.928651699705545e-05, + "loss": 0.1023, + "num_input_tokens_seen": 66240768, + "step": 30690 + }, + { + "epoch": 5.007340946166395, + "grad_norm": 1.9894987344741821, + "learning_rate": 2.927950425159598e-05, + "loss": 0.1767, + "num_input_tokens_seen": 66252672, + "step": 30695 + }, + { + "epoch": 5.00815660685155, + "grad_norm": 0.4733501374721527, + "learning_rate": 2.927249115921818e-05, + "loss": 0.1021, + "num_input_tokens_seen": 66263520, + "step": 30700 + }, + { + "epoch": 5.008972267536705, + "grad_norm": 0.05010391026735306, + "learning_rate": 2.926547772049057e-05, + "loss": 0.0304, + "num_input_tokens_seen": 66274752, + "step": 30705 + }, + { + "epoch": 5.00978792822186, + "grad_norm": 0.5702261924743652, + "learning_rate": 2.9258463935981696e-05, + "loss": 0.3365, + "num_input_tokens_seen": 66285440, + "step": 30710 + }, + { + "epoch": 5.010603588907014, + "grad_norm": 0.3784456253051758, + "learning_rate": 2.9251449806260122e-05, + "loss": 0.0254, + "num_input_tokens_seen": 66295040, + "step": 30715 + }, + { + "epoch": 5.011419249592169, + "grad_norm": 0.10189798474311829, + "learning_rate": 2.9244435331894454e-05, + "loss": 0.0298, + "num_input_tokens_seen": 66304384, + "step": 30720 + }, + { + "epoch": 5.012234910277325, + "grad_norm": 0.22660477459430695, + "learning_rate": 2.9237420513453328e-05, + "loss": 0.1488, + "num_input_tokens_seen": 66315648, + "step": 30725 + }, + { + "epoch": 5.01305057096248, + "grad_norm": 0.14918887615203857, + "learning_rate": 2.9230405351505386e-05, + "loss": 0.0806, + "num_input_tokens_seen": 66325664, + "step": 30730 + }, + { + "epoch": 5.013866231647635, + "grad_norm": 0.5915656089782715, + "learning_rate": 2.9223389846619326e-05, + "loss": 0.0616, + "num_input_tokens_seen": 66336544, + "step": 30735 + }, + { + "epoch": 5.014681892332789, + "grad_norm": 0.5369629263877869, + "learning_rate": 2.921637399936386e-05, + "loss": 0.0469, + "num_input_tokens_seen": 66347360, + "step": 30740 + }, + { + "epoch": 5.015497553017944, + "grad_norm": 0.31393659114837646, + "learning_rate": 2.920935781030772e-05, + "loss": 0.0548, + "num_input_tokens_seen": 66356992, + "step": 30745 + }, + { + "epoch": 5.0163132137031, + "grad_norm": 0.5530638098716736, + "learning_rate": 2.9202341280019675e-05, + "loss": 0.0528, + "num_input_tokens_seen": 66368032, + "step": 30750 + }, + { + "epoch": 5.017128874388255, + "grad_norm": 0.4163587987422943, + "learning_rate": 2.9195324409068525e-05, + "loss": 0.2258, + "num_input_tokens_seen": 66379424, + "step": 30755 + }, + { + "epoch": 5.0179445350734095, + "grad_norm": 0.08504832535982132, + "learning_rate": 2.9188307198023095e-05, + "loss": 0.0522, + "num_input_tokens_seen": 66390752, + "step": 30760 + }, + { + "epoch": 5.018760195758564, + "grad_norm": 0.687720775604248, + "learning_rate": 2.918128964745223e-05, + "loss": 0.0989, + "num_input_tokens_seen": 66403200, + "step": 30765 + }, + { + "epoch": 5.019575856443719, + "grad_norm": 0.03814522549510002, + "learning_rate": 2.9174271757924814e-05, + "loss": 0.0806, + "num_input_tokens_seen": 66412768, + "step": 30770 + }, + { + "epoch": 5.020391517128874, + "grad_norm": 1.538287878036499, + "learning_rate": 2.9167253530009748e-05, + "loss": 0.1525, + "num_input_tokens_seen": 66424448, + "step": 30775 + }, + { + "epoch": 5.02120717781403, + "grad_norm": 0.17071488499641418, + "learning_rate": 2.9160234964275963e-05, + "loss": 0.2018, + "num_input_tokens_seen": 66434400, + "step": 30780 + }, + { + "epoch": 5.0220228384991845, + "grad_norm": 0.12022291123867035, + "learning_rate": 2.915321606129242e-05, + "loss": 0.3244, + "num_input_tokens_seen": 66445440, + "step": 30785 + }, + { + "epoch": 5.022838499184339, + "grad_norm": 1.5709545612335205, + "learning_rate": 2.9146196821628113e-05, + "loss": 0.2173, + "num_input_tokens_seen": 66455776, + "step": 30790 + }, + { + "epoch": 5.023654159869494, + "grad_norm": 0.09674385190010071, + "learning_rate": 2.9139177245852056e-05, + "loss": 0.0135, + "num_input_tokens_seen": 66467616, + "step": 30795 + }, + { + "epoch": 5.024469820554649, + "grad_norm": 0.1725759655237198, + "learning_rate": 2.9132157334533295e-05, + "loss": 0.1007, + "num_input_tokens_seen": 66477056, + "step": 30800 + }, + { + "epoch": 5.025285481239805, + "grad_norm": 0.1760103404521942, + "learning_rate": 2.9125137088240885e-05, + "loss": 0.1555, + "num_input_tokens_seen": 66488352, + "step": 30805 + }, + { + "epoch": 5.0261011419249595, + "grad_norm": 0.5517750978469849, + "learning_rate": 2.9118116507543936e-05, + "loss": 0.1416, + "num_input_tokens_seen": 66497824, + "step": 30810 + }, + { + "epoch": 5.026916802610114, + "grad_norm": 0.37891921401023865, + "learning_rate": 2.9111095593011567e-05, + "loss": 0.1085, + "num_input_tokens_seen": 66509920, + "step": 30815 + }, + { + "epoch": 5.027732463295269, + "grad_norm": 0.5608354210853577, + "learning_rate": 2.9104074345212933e-05, + "loss": 0.0549, + "num_input_tokens_seen": 66519520, + "step": 30820 + }, + { + "epoch": 5.028548123980424, + "grad_norm": 0.11770410090684891, + "learning_rate": 2.9097052764717196e-05, + "loss": 0.0612, + "num_input_tokens_seen": 66529792, + "step": 30825 + }, + { + "epoch": 5.029363784665579, + "grad_norm": 0.6649871468544006, + "learning_rate": 2.9090030852093586e-05, + "loss": 0.1763, + "num_input_tokens_seen": 66540416, + "step": 30830 + }, + { + "epoch": 5.0301794453507345, + "grad_norm": 0.7023178935050964, + "learning_rate": 2.9083008607911322e-05, + "loss": 0.0534, + "num_input_tokens_seen": 66549280, + "step": 30835 + }, + { + "epoch": 5.030995106035889, + "grad_norm": 0.22835296392440796, + "learning_rate": 2.9075986032739656e-05, + "loss": 0.0377, + "num_input_tokens_seen": 66560640, + "step": 30840 + }, + { + "epoch": 5.031810766721044, + "grad_norm": 0.34461089968681335, + "learning_rate": 2.9068963127147886e-05, + "loss": 0.1582, + "num_input_tokens_seen": 66571808, + "step": 30845 + }, + { + "epoch": 5.032626427406199, + "grad_norm": 2.240480422973633, + "learning_rate": 2.906193989170532e-05, + "loss": 0.2587, + "num_input_tokens_seen": 66583424, + "step": 30850 + }, + { + "epoch": 5.033442088091354, + "grad_norm": 1.185805320739746, + "learning_rate": 2.9054916326981297e-05, + "loss": 0.1938, + "num_input_tokens_seen": 66593984, + "step": 30855 + }, + { + "epoch": 5.034257748776509, + "grad_norm": 0.3774120509624481, + "learning_rate": 2.9047892433545176e-05, + "loss": 0.1219, + "num_input_tokens_seen": 66604704, + "step": 30860 + }, + { + "epoch": 5.035073409461664, + "grad_norm": 1.3609302043914795, + "learning_rate": 2.9040868211966364e-05, + "loss": 0.2157, + "num_input_tokens_seen": 66615808, + "step": 30865 + }, + { + "epoch": 5.035889070146819, + "grad_norm": 1.0047729015350342, + "learning_rate": 2.903384366281427e-05, + "loss": 0.0584, + "num_input_tokens_seen": 66627008, + "step": 30870 + }, + { + "epoch": 5.036704730831974, + "grad_norm": 0.1720990687608719, + "learning_rate": 2.902681878665834e-05, + "loss": 0.115, + "num_input_tokens_seen": 66638464, + "step": 30875 + }, + { + "epoch": 5.037520391517129, + "grad_norm": 1.257200002670288, + "learning_rate": 2.9019793584068046e-05, + "loss": 0.1085, + "num_input_tokens_seen": 66648544, + "step": 30880 + }, + { + "epoch": 5.0383360522022835, + "grad_norm": 0.0909833088517189, + "learning_rate": 2.9012768055612887e-05, + "loss": 0.1276, + "num_input_tokens_seen": 66660384, + "step": 30885 + }, + { + "epoch": 5.039151712887439, + "grad_norm": 0.63874751329422, + "learning_rate": 2.9005742201862385e-05, + "loss": 0.1733, + "num_input_tokens_seen": 66671136, + "step": 30890 + }, + { + "epoch": 5.039967373572594, + "grad_norm": 1.3079160451889038, + "learning_rate": 2.8998716023386096e-05, + "loss": 0.1746, + "num_input_tokens_seen": 66681664, + "step": 30895 + }, + { + "epoch": 5.040783034257749, + "grad_norm": 0.10502729564905167, + "learning_rate": 2.8991689520753605e-05, + "loss": 0.0489, + "num_input_tokens_seen": 66691968, + "step": 30900 + }, + { + "epoch": 5.041598694942904, + "grad_norm": 0.685741662979126, + "learning_rate": 2.8984662694534504e-05, + "loss": 0.0863, + "num_input_tokens_seen": 66703488, + "step": 30905 + }, + { + "epoch": 5.0424143556280585, + "grad_norm": 0.11285900324583054, + "learning_rate": 2.897763554529842e-05, + "loss": 0.0949, + "num_input_tokens_seen": 66714304, + "step": 30910 + }, + { + "epoch": 5.043230016313213, + "grad_norm": 0.43470498919487, + "learning_rate": 2.8970608073615026e-05, + "loss": 0.1568, + "num_input_tokens_seen": 66724864, + "step": 30915 + }, + { + "epoch": 5.044045676998369, + "grad_norm": 0.15843810141086578, + "learning_rate": 2.8963580280053992e-05, + "loss": 0.144, + "num_input_tokens_seen": 66736736, + "step": 30920 + }, + { + "epoch": 5.044861337683524, + "grad_norm": 0.4321405589580536, + "learning_rate": 2.8956552165185023e-05, + "loss": 0.1962, + "num_input_tokens_seen": 66748128, + "step": 30925 + }, + { + "epoch": 5.045676998368679, + "grad_norm": 0.1092374324798584, + "learning_rate": 2.894952372957787e-05, + "loss": 0.0338, + "num_input_tokens_seen": 66759200, + "step": 30930 + }, + { + "epoch": 5.0464926590538335, + "grad_norm": 0.28613293170928955, + "learning_rate": 2.894249497380228e-05, + "loss": 0.1961, + "num_input_tokens_seen": 66770720, + "step": 30935 + }, + { + "epoch": 5.047308319738988, + "grad_norm": 1.0271399021148682, + "learning_rate": 2.893546589842805e-05, + "loss": 0.1664, + "num_input_tokens_seen": 66781312, + "step": 30940 + }, + { + "epoch": 5.048123980424143, + "grad_norm": 0.5183497071266174, + "learning_rate": 2.892843650402497e-05, + "loss": 0.0808, + "num_input_tokens_seen": 66792864, + "step": 30945 + }, + { + "epoch": 5.048939641109299, + "grad_norm": 1.1089658737182617, + "learning_rate": 2.8921406791162902e-05, + "loss": 0.2126, + "num_input_tokens_seen": 66802976, + "step": 30950 + }, + { + "epoch": 5.049755301794454, + "grad_norm": 0.8908772468566895, + "learning_rate": 2.891437676041171e-05, + "loss": 0.0752, + "num_input_tokens_seen": 66812768, + "step": 30955 + }, + { + "epoch": 5.0505709624796085, + "grad_norm": 0.2382565140724182, + "learning_rate": 2.890734641234127e-05, + "loss": 0.0956, + "num_input_tokens_seen": 66823072, + "step": 30960 + }, + { + "epoch": 5.051386623164763, + "grad_norm": 1.6848177909851074, + "learning_rate": 2.8900315747521507e-05, + "loss": 0.1169, + "num_input_tokens_seen": 66835200, + "step": 30965 + }, + { + "epoch": 5.052202283849918, + "grad_norm": 0.29619187116622925, + "learning_rate": 2.8893284766522353e-05, + "loss": 0.0437, + "num_input_tokens_seen": 66846528, + "step": 30970 + }, + { + "epoch": 5.053017944535074, + "grad_norm": 0.7946823239326477, + "learning_rate": 2.8886253469913787e-05, + "loss": 0.1562, + "num_input_tokens_seen": 66858080, + "step": 30975 + }, + { + "epoch": 5.053833605220229, + "grad_norm": 0.6143460273742676, + "learning_rate": 2.8879221858265794e-05, + "loss": 0.1362, + "num_input_tokens_seen": 66869888, + "step": 30980 + }, + { + "epoch": 5.054649265905383, + "grad_norm": 1.9376676082611084, + "learning_rate": 2.8872189932148392e-05, + "loss": 0.1945, + "num_input_tokens_seen": 66880256, + "step": 30985 + }, + { + "epoch": 5.055464926590538, + "grad_norm": 0.6851410269737244, + "learning_rate": 2.8865157692131633e-05, + "loss": 0.1285, + "num_input_tokens_seen": 66890784, + "step": 30990 + }, + { + "epoch": 5.056280587275693, + "grad_norm": 0.30911117792129517, + "learning_rate": 2.8858125138785568e-05, + "loss": 0.045, + "num_input_tokens_seen": 66901920, + "step": 30995 + }, + { + "epoch": 5.057096247960848, + "grad_norm": 0.5380886793136597, + "learning_rate": 2.8851092272680313e-05, + "loss": 0.159, + "num_input_tokens_seen": 66912608, + "step": 31000 + }, + { + "epoch": 5.057911908646004, + "grad_norm": 0.1348826289176941, + "learning_rate": 2.8844059094385977e-05, + "loss": 0.1412, + "num_input_tokens_seen": 66922336, + "step": 31005 + }, + { + "epoch": 5.058727569331158, + "grad_norm": 0.7376143336296082, + "learning_rate": 2.883702560447271e-05, + "loss": 0.2569, + "num_input_tokens_seen": 66932960, + "step": 31010 + }, + { + "epoch": 5.059543230016313, + "grad_norm": 0.4566114842891693, + "learning_rate": 2.8829991803510675e-05, + "loss": 0.1632, + "num_input_tokens_seen": 66944448, + "step": 31015 + }, + { + "epoch": 5.060358890701468, + "grad_norm": 0.2596755623817444, + "learning_rate": 2.8822957692070073e-05, + "loss": 0.0496, + "num_input_tokens_seen": 66955264, + "step": 31020 + }, + { + "epoch": 5.061174551386623, + "grad_norm": 0.18773813545703888, + "learning_rate": 2.8815923270721124e-05, + "loss": 0.0456, + "num_input_tokens_seen": 66965024, + "step": 31025 + }, + { + "epoch": 5.061990212071779, + "grad_norm": 0.13461995124816895, + "learning_rate": 2.8808888540034067e-05, + "loss": 0.1415, + "num_input_tokens_seen": 66975072, + "step": 31030 + }, + { + "epoch": 5.062805872756933, + "grad_norm": 0.054163847118616104, + "learning_rate": 2.8801853500579183e-05, + "loss": 0.1423, + "num_input_tokens_seen": 66987232, + "step": 31035 + }, + { + "epoch": 5.063621533442088, + "grad_norm": 0.9835292100906372, + "learning_rate": 2.879481815292676e-05, + "loss": 0.212, + "num_input_tokens_seen": 66995968, + "step": 31040 + }, + { + "epoch": 5.064437194127243, + "grad_norm": 2.3429088592529297, + "learning_rate": 2.878778249764713e-05, + "loss": 0.2186, + "num_input_tokens_seen": 67006912, + "step": 31045 + }, + { + "epoch": 5.065252854812398, + "grad_norm": 0.4998287856578827, + "learning_rate": 2.8780746535310623e-05, + "loss": 0.1894, + "num_input_tokens_seen": 67017824, + "step": 31050 + }, + { + "epoch": 5.066068515497553, + "grad_norm": 0.11504344642162323, + "learning_rate": 2.8773710266487623e-05, + "loss": 0.019, + "num_input_tokens_seen": 67028736, + "step": 31055 + }, + { + "epoch": 5.066884176182708, + "grad_norm": 0.43984708189964294, + "learning_rate": 2.8766673691748524e-05, + "loss": 0.0872, + "num_input_tokens_seen": 67039936, + "step": 31060 + }, + { + "epoch": 5.067699836867863, + "grad_norm": 0.15531794726848602, + "learning_rate": 2.875963681166373e-05, + "loss": 0.0464, + "num_input_tokens_seen": 67051616, + "step": 31065 + }, + { + "epoch": 5.068515497553018, + "grad_norm": 0.12827053666114807, + "learning_rate": 2.8752599626803717e-05, + "loss": 0.059, + "num_input_tokens_seen": 67062240, + "step": 31070 + }, + { + "epoch": 5.069331158238173, + "grad_norm": 0.21322360634803772, + "learning_rate": 2.874556213773893e-05, + "loss": 0.0826, + "num_input_tokens_seen": 67073408, + "step": 31075 + }, + { + "epoch": 5.070146818923328, + "grad_norm": 0.2064371407032013, + "learning_rate": 2.8738524345039876e-05, + "loss": 0.0793, + "num_input_tokens_seen": 67084896, + "step": 31080 + }, + { + "epoch": 5.0709624796084825, + "grad_norm": 0.13057412207126617, + "learning_rate": 2.8731486249277062e-05, + "loss": 0.0265, + "num_input_tokens_seen": 67095040, + "step": 31085 + }, + { + "epoch": 5.071778140293638, + "grad_norm": 0.04665058106184006, + "learning_rate": 2.8724447851021047e-05, + "loss": 0.0754, + "num_input_tokens_seen": 67106336, + "step": 31090 + }, + { + "epoch": 5.072593800978793, + "grad_norm": 0.11459007114171982, + "learning_rate": 2.871740915084239e-05, + "loss": 0.2417, + "num_input_tokens_seen": 67116832, + "step": 31095 + }, + { + "epoch": 5.073409461663948, + "grad_norm": 1.9979851245880127, + "learning_rate": 2.871037014931168e-05, + "loss": 0.168, + "num_input_tokens_seen": 67128736, + "step": 31100 + }, + { + "epoch": 5.074225122349103, + "grad_norm": 0.949614942073822, + "learning_rate": 2.870333084699954e-05, + "loss": 0.1778, + "num_input_tokens_seen": 67140128, + "step": 31105 + }, + { + "epoch": 5.075040783034257, + "grad_norm": 0.2014104425907135, + "learning_rate": 2.8696291244476613e-05, + "loss": 0.0964, + "num_input_tokens_seen": 67151232, + "step": 31110 + }, + { + "epoch": 5.075856443719413, + "grad_norm": 0.3233135938644409, + "learning_rate": 2.8689251342313562e-05, + "loss": 0.1314, + "num_input_tokens_seen": 67161984, + "step": 31115 + }, + { + "epoch": 5.076672104404568, + "grad_norm": 1.5371689796447754, + "learning_rate": 2.8682211141081084e-05, + "loss": 0.1304, + "num_input_tokens_seen": 67172224, + "step": 31120 + }, + { + "epoch": 5.077487765089723, + "grad_norm": 0.11071723699569702, + "learning_rate": 2.867517064134988e-05, + "loss": 0.0281, + "num_input_tokens_seen": 67182560, + "step": 31125 + }, + { + "epoch": 5.078303425774878, + "grad_norm": 0.6816234588623047, + "learning_rate": 2.8668129843690704e-05, + "loss": 0.0436, + "num_input_tokens_seen": 67193024, + "step": 31130 + }, + { + "epoch": 5.079119086460032, + "grad_norm": 0.057593438774347305, + "learning_rate": 2.86610887486743e-05, + "loss": 0.2013, + "num_input_tokens_seen": 67204096, + "step": 31135 + }, + { + "epoch": 5.079934747145187, + "grad_norm": 0.20219261944293976, + "learning_rate": 2.8654047356871473e-05, + "loss": 0.1155, + "num_input_tokens_seen": 67214432, + "step": 31140 + }, + { + "epoch": 5.080750407830343, + "grad_norm": 0.7337654829025269, + "learning_rate": 2.8647005668853027e-05, + "loss": 0.0512, + "num_input_tokens_seen": 67225792, + "step": 31145 + }, + { + "epoch": 5.081566068515498, + "grad_norm": 0.6815458536148071, + "learning_rate": 2.8639963685189795e-05, + "loss": 0.1645, + "num_input_tokens_seen": 67236128, + "step": 31150 + }, + { + "epoch": 5.082381729200653, + "grad_norm": 1.0305111408233643, + "learning_rate": 2.8632921406452635e-05, + "loss": 0.1293, + "num_input_tokens_seen": 67247072, + "step": 31155 + }, + { + "epoch": 5.083197389885807, + "grad_norm": 0.7417222261428833, + "learning_rate": 2.862587883321244e-05, + "loss": 0.1529, + "num_input_tokens_seen": 67257344, + "step": 31160 + }, + { + "epoch": 5.084013050570962, + "grad_norm": 0.5551039576530457, + "learning_rate": 2.8618835966040104e-05, + "loss": 0.1098, + "num_input_tokens_seen": 67266848, + "step": 31165 + }, + { + "epoch": 5.084828711256117, + "grad_norm": 1.441504955291748, + "learning_rate": 2.8611792805506565e-05, + "loss": 0.2681, + "num_input_tokens_seen": 67277056, + "step": 31170 + }, + { + "epoch": 5.085644371941273, + "grad_norm": 0.19704709947109222, + "learning_rate": 2.8604749352182776e-05, + "loss": 0.054, + "num_input_tokens_seen": 67288672, + "step": 31175 + }, + { + "epoch": 5.0864600326264275, + "grad_norm": 0.06052444130182266, + "learning_rate": 2.8597705606639707e-05, + "loss": 0.106, + "num_input_tokens_seen": 67299424, + "step": 31180 + }, + { + "epoch": 5.087275693311582, + "grad_norm": 0.40498459339141846, + "learning_rate": 2.8590661569448372e-05, + "loss": 0.0637, + "num_input_tokens_seen": 67309056, + "step": 31185 + }, + { + "epoch": 5.088091353996737, + "grad_norm": 0.3320674002170563, + "learning_rate": 2.8583617241179796e-05, + "loss": 0.0671, + "num_input_tokens_seen": 67320320, + "step": 31190 + }, + { + "epoch": 5.088907014681892, + "grad_norm": 0.1642572581768036, + "learning_rate": 2.8576572622405017e-05, + "loss": 0.2466, + "num_input_tokens_seen": 67330944, + "step": 31195 + }, + { + "epoch": 5.089722675367048, + "grad_norm": 0.09887607395648956, + "learning_rate": 2.856952771369512e-05, + "loss": 0.036, + "num_input_tokens_seen": 67342720, + "step": 31200 + }, + { + "epoch": 5.0905383360522025, + "grad_norm": 1.3803266286849976, + "learning_rate": 2.856248251562119e-05, + "loss": 0.1604, + "num_input_tokens_seen": 67353856, + "step": 31205 + }, + { + "epoch": 5.091353996737357, + "grad_norm": 0.04991486296057701, + "learning_rate": 2.8555437028754352e-05, + "loss": 0.232, + "num_input_tokens_seen": 67364384, + "step": 31210 + }, + { + "epoch": 5.092169657422512, + "grad_norm": 0.2314082384109497, + "learning_rate": 2.8548391253665746e-05, + "loss": 0.2305, + "num_input_tokens_seen": 67375328, + "step": 31215 + }, + { + "epoch": 5.092985318107667, + "grad_norm": 0.2849508821964264, + "learning_rate": 2.854134519092654e-05, + "loss": 0.0916, + "num_input_tokens_seen": 67386240, + "step": 31220 + }, + { + "epoch": 5.093800978792822, + "grad_norm": 0.1557760089635849, + "learning_rate": 2.8534298841107925e-05, + "loss": 0.1146, + "num_input_tokens_seen": 67397760, + "step": 31225 + }, + { + "epoch": 5.0946166394779775, + "grad_norm": 0.11636760085821152, + "learning_rate": 2.8527252204781117e-05, + "loss": 0.0684, + "num_input_tokens_seen": 67408768, + "step": 31230 + }, + { + "epoch": 5.095432300163132, + "grad_norm": 1.2101463079452515, + "learning_rate": 2.852020528251735e-05, + "loss": 0.1627, + "num_input_tokens_seen": 67419456, + "step": 31235 + }, + { + "epoch": 5.096247960848287, + "grad_norm": 0.871988832950592, + "learning_rate": 2.8513158074887875e-05, + "loss": 0.1097, + "num_input_tokens_seen": 67430528, + "step": 31240 + }, + { + "epoch": 5.097063621533442, + "grad_norm": 0.7329961657524109, + "learning_rate": 2.8506110582463984e-05, + "loss": 0.0981, + "num_input_tokens_seen": 67441472, + "step": 31245 + }, + { + "epoch": 5.097879282218597, + "grad_norm": 1.1473565101623535, + "learning_rate": 2.8499062805816984e-05, + "loss": 0.1527, + "num_input_tokens_seen": 67451808, + "step": 31250 + }, + { + "epoch": 5.0986949429037525, + "grad_norm": 0.37051501870155334, + "learning_rate": 2.8492014745518192e-05, + "loss": 0.1188, + "num_input_tokens_seen": 67463104, + "step": 31255 + }, + { + "epoch": 5.099510603588907, + "grad_norm": 0.10107944905757904, + "learning_rate": 2.8484966402138968e-05, + "loss": 0.0644, + "num_input_tokens_seen": 67474144, + "step": 31260 + }, + { + "epoch": 5.100326264274062, + "grad_norm": 0.8080949783325195, + "learning_rate": 2.8477917776250683e-05, + "loss": 0.0847, + "num_input_tokens_seen": 67484096, + "step": 31265 + }, + { + "epoch": 5.101141924959217, + "grad_norm": 0.057649970054626465, + "learning_rate": 2.847086886842474e-05, + "loss": 0.1329, + "num_input_tokens_seen": 67493664, + "step": 31270 + }, + { + "epoch": 5.101957585644372, + "grad_norm": 3.1244051456451416, + "learning_rate": 2.8463819679232555e-05, + "loss": 0.2565, + "num_input_tokens_seen": 67504768, + "step": 31275 + }, + { + "epoch": 5.102773246329527, + "grad_norm": 1.2160165309906006, + "learning_rate": 2.845677020924557e-05, + "loss": 0.1166, + "num_input_tokens_seen": 67514848, + "step": 31280 + }, + { + "epoch": 5.103588907014682, + "grad_norm": 0.07304582744836807, + "learning_rate": 2.8449720459035256e-05, + "loss": 0.1246, + "num_input_tokens_seen": 67525248, + "step": 31285 + }, + { + "epoch": 5.104404567699837, + "grad_norm": 0.8757428526878357, + "learning_rate": 2.8442670429173098e-05, + "loss": 0.0774, + "num_input_tokens_seen": 67534656, + "step": 31290 + }, + { + "epoch": 5.105220228384992, + "grad_norm": 2.0004518032073975, + "learning_rate": 2.8435620120230595e-05, + "loss": 0.2461, + "num_input_tokens_seen": 67545024, + "step": 31295 + }, + { + "epoch": 5.106035889070147, + "grad_norm": 1.780617356300354, + "learning_rate": 2.84285695327793e-05, + "loss": 0.2403, + "num_input_tokens_seen": 67556640, + "step": 31300 + }, + { + "epoch": 5.1068515497553015, + "grad_norm": 0.20335178077220917, + "learning_rate": 2.842151866739077e-05, + "loss": 0.0334, + "num_input_tokens_seen": 67568032, + "step": 31305 + }, + { + "epoch": 5.107667210440456, + "grad_norm": 0.10867176204919815, + "learning_rate": 2.8414467524636568e-05, + "loss": 0.087, + "num_input_tokens_seen": 67578688, + "step": 31310 + }, + { + "epoch": 5.108482871125612, + "grad_norm": 0.1851479858160019, + "learning_rate": 2.8407416105088304e-05, + "loss": 0.0369, + "num_input_tokens_seen": 67589664, + "step": 31315 + }, + { + "epoch": 5.109298531810767, + "grad_norm": 0.9145223498344421, + "learning_rate": 2.8400364409317604e-05, + "loss": 0.151, + "num_input_tokens_seen": 67600736, + "step": 31320 + }, + { + "epoch": 5.110114192495922, + "grad_norm": 0.34022143483161926, + "learning_rate": 2.8393312437896112e-05, + "loss": 0.2161, + "num_input_tokens_seen": 67612288, + "step": 31325 + }, + { + "epoch": 5.1109298531810765, + "grad_norm": 1.486560583114624, + "learning_rate": 2.8386260191395497e-05, + "loss": 0.237, + "num_input_tokens_seen": 67622592, + "step": 31330 + }, + { + "epoch": 5.111745513866231, + "grad_norm": 0.2870905101299286, + "learning_rate": 2.837920767038745e-05, + "loss": 0.1734, + "num_input_tokens_seen": 67633568, + "step": 31335 + }, + { + "epoch": 5.112561174551387, + "grad_norm": 0.11720023304224014, + "learning_rate": 2.837215487544368e-05, + "loss": 0.1528, + "num_input_tokens_seen": 67644000, + "step": 31340 + }, + { + "epoch": 5.113376835236542, + "grad_norm": 0.8533234000205994, + "learning_rate": 2.836510180713593e-05, + "loss": 0.1607, + "num_input_tokens_seen": 67654816, + "step": 31345 + }, + { + "epoch": 5.114192495921697, + "grad_norm": 1.347430944442749, + "learning_rate": 2.835804846603595e-05, + "loss": 0.2962, + "num_input_tokens_seen": 67666304, + "step": 31350 + }, + { + "epoch": 5.1150081566068515, + "grad_norm": 1.3393418788909912, + "learning_rate": 2.8350994852715522e-05, + "loss": 0.0924, + "num_input_tokens_seen": 67677536, + "step": 31355 + }, + { + "epoch": 5.115823817292006, + "grad_norm": 0.2886262536048889, + "learning_rate": 2.8343940967746453e-05, + "loss": 0.2336, + "num_input_tokens_seen": 67688608, + "step": 31360 + }, + { + "epoch": 5.116639477977161, + "grad_norm": 2.821655750274658, + "learning_rate": 2.8336886811700548e-05, + "loss": 0.2311, + "num_input_tokens_seen": 67699200, + "step": 31365 + }, + { + "epoch": 5.117455138662317, + "grad_norm": 0.15756982564926147, + "learning_rate": 2.8329832385149678e-05, + "loss": 0.072, + "num_input_tokens_seen": 67709568, + "step": 31370 + }, + { + "epoch": 5.118270799347472, + "grad_norm": 0.5028086304664612, + "learning_rate": 2.8322777688665704e-05, + "loss": 0.1435, + "num_input_tokens_seen": 67719360, + "step": 31375 + }, + { + "epoch": 5.1190864600326265, + "grad_norm": 0.42932289838790894, + "learning_rate": 2.83157227228205e-05, + "loss": 0.103, + "num_input_tokens_seen": 67729472, + "step": 31380 + }, + { + "epoch": 5.119902120717781, + "grad_norm": 0.7541675567626953, + "learning_rate": 2.830866748818599e-05, + "loss": 0.1395, + "num_input_tokens_seen": 67739424, + "step": 31385 + }, + { + "epoch": 5.120717781402936, + "grad_norm": 1.7179908752441406, + "learning_rate": 2.8301611985334115e-05, + "loss": 0.2094, + "num_input_tokens_seen": 67750624, + "step": 31390 + }, + { + "epoch": 5.121533442088092, + "grad_norm": 0.2632579505443573, + "learning_rate": 2.8294556214836813e-05, + "loss": 0.0871, + "num_input_tokens_seen": 67760960, + "step": 31395 + }, + { + "epoch": 5.122349102773247, + "grad_norm": 0.46029651165008545, + "learning_rate": 2.828750017726607e-05, + "loss": 0.0681, + "num_input_tokens_seen": 67771232, + "step": 31400 + }, + { + "epoch": 5.123164763458401, + "grad_norm": 0.6499857902526855, + "learning_rate": 2.8280443873193884e-05, + "loss": 0.0296, + "num_input_tokens_seen": 67781408, + "step": 31405 + }, + { + "epoch": 5.123980424143556, + "grad_norm": 0.09541068971157074, + "learning_rate": 2.8273387303192266e-05, + "loss": 0.2114, + "num_input_tokens_seen": 67793472, + "step": 31410 + }, + { + "epoch": 5.124796084828711, + "grad_norm": 0.2795756757259369, + "learning_rate": 2.8266330467833274e-05, + "loss": 0.0314, + "num_input_tokens_seen": 67804704, + "step": 31415 + }, + { + "epoch": 5.125611745513866, + "grad_norm": 0.06110858544707298, + "learning_rate": 2.8259273367688954e-05, + "loss": 0.0994, + "num_input_tokens_seen": 67815840, + "step": 31420 + }, + { + "epoch": 5.126427406199022, + "grad_norm": 0.08294708281755447, + "learning_rate": 2.8252216003331395e-05, + "loss": 0.0457, + "num_input_tokens_seen": 67827008, + "step": 31425 + }, + { + "epoch": 5.127243066884176, + "grad_norm": 1.4577921628952026, + "learning_rate": 2.824515837533271e-05, + "loss": 0.2982, + "num_input_tokens_seen": 67837024, + "step": 31430 + }, + { + "epoch": 5.128058727569331, + "grad_norm": 0.19412653148174286, + "learning_rate": 2.823810048426502e-05, + "loss": 0.0686, + "num_input_tokens_seen": 67847360, + "step": 31435 + }, + { + "epoch": 5.128874388254486, + "grad_norm": 0.05199446156620979, + "learning_rate": 2.8231042330700473e-05, + "loss": 0.2852, + "num_input_tokens_seen": 67857152, + "step": 31440 + }, + { + "epoch": 5.129690048939641, + "grad_norm": 0.9946163296699524, + "learning_rate": 2.822398391521125e-05, + "loss": 0.1466, + "num_input_tokens_seen": 67867680, + "step": 31445 + }, + { + "epoch": 5.130505709624796, + "grad_norm": 0.17058317363262177, + "learning_rate": 2.8216925238369518e-05, + "loss": 0.037, + "num_input_tokens_seen": 67877632, + "step": 31450 + }, + { + "epoch": 5.131321370309951, + "grad_norm": 0.5046305656433105, + "learning_rate": 2.820986630074751e-05, + "loss": 0.2242, + "num_input_tokens_seen": 67886912, + "step": 31455 + }, + { + "epoch": 5.132137030995106, + "grad_norm": 0.2384406328201294, + "learning_rate": 2.8202807102917457e-05, + "loss": 0.0503, + "num_input_tokens_seen": 67895040, + "step": 31460 + }, + { + "epoch": 5.132952691680261, + "grad_norm": 0.15545634925365448, + "learning_rate": 2.8195747645451605e-05, + "loss": 0.1234, + "num_input_tokens_seen": 67906336, + "step": 31465 + }, + { + "epoch": 5.133768352365416, + "grad_norm": 0.02819395810365677, + "learning_rate": 2.8188687928922237e-05, + "loss": 0.1468, + "num_input_tokens_seen": 67916992, + "step": 31470 + }, + { + "epoch": 5.134584013050571, + "grad_norm": 0.1371302753686905, + "learning_rate": 2.818162795390164e-05, + "loss": 0.1373, + "num_input_tokens_seen": 67928352, + "step": 31475 + }, + { + "epoch": 5.135399673735726, + "grad_norm": 0.25026318430900574, + "learning_rate": 2.817456772096214e-05, + "loss": 0.2253, + "num_input_tokens_seen": 67940160, + "step": 31480 + }, + { + "epoch": 5.136215334420881, + "grad_norm": 0.21966339647769928, + "learning_rate": 2.8167507230676077e-05, + "loss": 0.1084, + "num_input_tokens_seen": 67950304, + "step": 31485 + }, + { + "epoch": 5.137030995106036, + "grad_norm": 0.9881267547607422, + "learning_rate": 2.8160446483615804e-05, + "loss": 0.2012, + "num_input_tokens_seen": 67960224, + "step": 31490 + }, + { + "epoch": 5.137846655791191, + "grad_norm": 1.5114860534667969, + "learning_rate": 2.8153385480353705e-05, + "loss": 0.1459, + "num_input_tokens_seen": 67971616, + "step": 31495 + }, + { + "epoch": 5.138662316476346, + "grad_norm": 0.5961607694625854, + "learning_rate": 2.814632422146218e-05, + "loss": 0.1865, + "num_input_tokens_seen": 67982048, + "step": 31500 + }, + { + "epoch": 5.1394779771615005, + "grad_norm": 0.4973187744617462, + "learning_rate": 2.8139262707513647e-05, + "loss": 0.1413, + "num_input_tokens_seen": 67991584, + "step": 31505 + }, + { + "epoch": 5.140293637846656, + "grad_norm": 1.3512592315673828, + "learning_rate": 2.813220093908055e-05, + "loss": 0.1749, + "num_input_tokens_seen": 68002208, + "step": 31510 + }, + { + "epoch": 5.141109298531811, + "grad_norm": 0.5511339902877808, + "learning_rate": 2.812513891673535e-05, + "loss": 0.0374, + "num_input_tokens_seen": 68013344, + "step": 31515 + }, + { + "epoch": 5.141924959216966, + "grad_norm": 0.7657870054244995, + "learning_rate": 2.8118076641050535e-05, + "loss": 0.2446, + "num_input_tokens_seen": 68025056, + "step": 31520 + }, + { + "epoch": 5.142740619902121, + "grad_norm": 0.30343884229660034, + "learning_rate": 2.81110141125986e-05, + "loss": 0.0413, + "num_input_tokens_seen": 68035296, + "step": 31525 + }, + { + "epoch": 5.143556280587275, + "grad_norm": 1.0123003721237183, + "learning_rate": 2.8103951331952083e-05, + "loss": 0.1225, + "num_input_tokens_seen": 68046080, + "step": 31530 + }, + { + "epoch": 5.14437194127243, + "grad_norm": 0.15243308246135712, + "learning_rate": 2.8096888299683515e-05, + "loss": 0.1483, + "num_input_tokens_seen": 68057344, + "step": 31535 + }, + { + "epoch": 5.145187601957586, + "grad_norm": 0.12911519408226013, + "learning_rate": 2.8089825016365478e-05, + "loss": 0.1402, + "num_input_tokens_seen": 68067488, + "step": 31540 + }, + { + "epoch": 5.146003262642741, + "grad_norm": 1.6224327087402344, + "learning_rate": 2.808276148257054e-05, + "loss": 0.2155, + "num_input_tokens_seen": 68079584, + "step": 31545 + }, + { + "epoch": 5.146818923327896, + "grad_norm": 0.11471255123615265, + "learning_rate": 2.807569769887132e-05, + "loss": 0.2315, + "num_input_tokens_seen": 68090816, + "step": 31550 + }, + { + "epoch": 5.14763458401305, + "grad_norm": 0.21378003060817719, + "learning_rate": 2.8068633665840438e-05, + "loss": 0.1138, + "num_input_tokens_seen": 68102592, + "step": 31555 + }, + { + "epoch": 5.148450244698205, + "grad_norm": 0.4573446214199066, + "learning_rate": 2.806156938405054e-05, + "loss": 0.0264, + "num_input_tokens_seen": 68113664, + "step": 31560 + }, + { + "epoch": 5.149265905383361, + "grad_norm": 0.33342307806015015, + "learning_rate": 2.8054504854074293e-05, + "loss": 0.1132, + "num_input_tokens_seen": 68124480, + "step": 31565 + }, + { + "epoch": 5.150081566068516, + "grad_norm": 0.10408451408147812, + "learning_rate": 2.8047440076484383e-05, + "loss": 0.1075, + "num_input_tokens_seen": 68135424, + "step": 31570 + }, + { + "epoch": 5.150897226753671, + "grad_norm": 0.5066759586334229, + "learning_rate": 2.8040375051853522e-05, + "loss": 0.1767, + "num_input_tokens_seen": 68146784, + "step": 31575 + }, + { + "epoch": 5.151712887438825, + "grad_norm": 0.5509450435638428, + "learning_rate": 2.803330978075443e-05, + "loss": 0.1688, + "num_input_tokens_seen": 68155744, + "step": 31580 + }, + { + "epoch": 5.15252854812398, + "grad_norm": 0.2266324758529663, + "learning_rate": 2.802624426375985e-05, + "loss": 0.018, + "num_input_tokens_seen": 68167168, + "step": 31585 + }, + { + "epoch": 5.153344208809135, + "grad_norm": 1.1486326456069946, + "learning_rate": 2.801917850144256e-05, + "loss": 0.1542, + "num_input_tokens_seen": 68178496, + "step": 31590 + }, + { + "epoch": 5.154159869494291, + "grad_norm": 0.27946892380714417, + "learning_rate": 2.8012112494375342e-05, + "loss": 0.1247, + "num_input_tokens_seen": 68189568, + "step": 31595 + }, + { + "epoch": 5.1549755301794455, + "grad_norm": 0.24547308683395386, + "learning_rate": 2.8005046243131005e-05, + "loss": 0.0984, + "num_input_tokens_seen": 68198624, + "step": 31600 + }, + { + "epoch": 5.1557911908646, + "grad_norm": 0.2628032863140106, + "learning_rate": 2.7997979748282364e-05, + "loss": 0.0302, + "num_input_tokens_seen": 68210528, + "step": 31605 + }, + { + "epoch": 5.156606851549755, + "grad_norm": 0.11396227777004242, + "learning_rate": 2.7990913010402282e-05, + "loss": 0.0792, + "num_input_tokens_seen": 68220384, + "step": 31610 + }, + { + "epoch": 5.15742251223491, + "grad_norm": 0.10848227143287659, + "learning_rate": 2.798384603006361e-05, + "loss": 0.2329, + "num_input_tokens_seen": 68230464, + "step": 31615 + }, + { + "epoch": 5.158238172920065, + "grad_norm": 0.17693181335926056, + "learning_rate": 2.7976778807839245e-05, + "loss": 0.1639, + "num_input_tokens_seen": 68240384, + "step": 31620 + }, + { + "epoch": 5.1590538336052205, + "grad_norm": 0.2174263298511505, + "learning_rate": 2.796971134430208e-05, + "loss": 0.0908, + "num_input_tokens_seen": 68251840, + "step": 31625 + }, + { + "epoch": 5.159869494290375, + "grad_norm": 0.48760634660720825, + "learning_rate": 2.7962643640025044e-05, + "loss": 0.1966, + "num_input_tokens_seen": 68263008, + "step": 31630 + }, + { + "epoch": 5.16068515497553, + "grad_norm": 0.2639845609664917, + "learning_rate": 2.7955575695581083e-05, + "loss": 0.1207, + "num_input_tokens_seen": 68273920, + "step": 31635 + }, + { + "epoch": 5.161500815660685, + "grad_norm": 0.7271772623062134, + "learning_rate": 2.794850751154316e-05, + "loss": 0.1575, + "num_input_tokens_seen": 68284256, + "step": 31640 + }, + { + "epoch": 5.16231647634584, + "grad_norm": 0.30370572209358215, + "learning_rate": 2.794143908848426e-05, + "loss": 0.1977, + "num_input_tokens_seen": 68296000, + "step": 31645 + }, + { + "epoch": 5.1631321370309955, + "grad_norm": 0.41518494486808777, + "learning_rate": 2.7934370426977385e-05, + "loss": 0.1331, + "num_input_tokens_seen": 68308224, + "step": 31650 + }, + { + "epoch": 5.16394779771615, + "grad_norm": 0.9806423187255859, + "learning_rate": 2.792730152759555e-05, + "loss": 0.118, + "num_input_tokens_seen": 68318560, + "step": 31655 + }, + { + "epoch": 5.164763458401305, + "grad_norm": 0.09957236796617508, + "learning_rate": 2.7920232390911805e-05, + "loss": 0.104, + "num_input_tokens_seen": 68328256, + "step": 31660 + }, + { + "epoch": 5.16557911908646, + "grad_norm": 0.08985765278339386, + "learning_rate": 2.79131630174992e-05, + "loss": 0.1399, + "num_input_tokens_seen": 68339648, + "step": 31665 + }, + { + "epoch": 5.166394779771615, + "grad_norm": 0.6127741932868958, + "learning_rate": 2.790609340793082e-05, + "loss": 0.0892, + "num_input_tokens_seen": 68351072, + "step": 31670 + }, + { + "epoch": 5.16721044045677, + "grad_norm": 0.7855762243270874, + "learning_rate": 2.789902356277977e-05, + "loss": 0.0999, + "num_input_tokens_seen": 68360832, + "step": 31675 + }, + { + "epoch": 5.168026101141925, + "grad_norm": 0.918478786945343, + "learning_rate": 2.7891953482619148e-05, + "loss": 0.1121, + "num_input_tokens_seen": 68370016, + "step": 31680 + }, + { + "epoch": 5.16884176182708, + "grad_norm": 0.1943194568157196, + "learning_rate": 2.788488316802211e-05, + "loss": 0.0811, + "num_input_tokens_seen": 68380768, + "step": 31685 + }, + { + "epoch": 5.169657422512235, + "grad_norm": 0.4142906963825226, + "learning_rate": 2.78778126195618e-05, + "loss": 0.0327, + "num_input_tokens_seen": 68392448, + "step": 31690 + }, + { + "epoch": 5.17047308319739, + "grad_norm": 0.40185263752937317, + "learning_rate": 2.7870741837811404e-05, + "loss": 0.1514, + "num_input_tokens_seen": 68402016, + "step": 31695 + }, + { + "epoch": 5.171288743882545, + "grad_norm": 0.6606400609016418, + "learning_rate": 2.7863670823344106e-05, + "loss": 0.114, + "num_input_tokens_seen": 68412704, + "step": 31700 + }, + { + "epoch": 5.1721044045677, + "grad_norm": 0.4363783895969391, + "learning_rate": 2.7856599576733124e-05, + "loss": 0.1482, + "num_input_tokens_seen": 68424448, + "step": 31705 + }, + { + "epoch": 5.172920065252855, + "grad_norm": 0.8665124773979187, + "learning_rate": 2.7849528098551682e-05, + "loss": 0.1667, + "num_input_tokens_seen": 68435168, + "step": 31710 + }, + { + "epoch": 5.17373572593801, + "grad_norm": 0.3525066673755646, + "learning_rate": 2.7842456389373032e-05, + "loss": 0.1206, + "num_input_tokens_seen": 68445696, + "step": 31715 + }, + { + "epoch": 5.174551386623165, + "grad_norm": 1.4800982475280762, + "learning_rate": 2.783538444977045e-05, + "loss": 0.1557, + "num_input_tokens_seen": 68455936, + "step": 31720 + }, + { + "epoch": 5.1753670473083195, + "grad_norm": 0.40539830923080444, + "learning_rate": 2.7828312280317214e-05, + "loss": 0.0551, + "num_input_tokens_seen": 68465344, + "step": 31725 + }, + { + "epoch": 5.176182707993474, + "grad_norm": 1.0006362199783325, + "learning_rate": 2.782123988158664e-05, + "loss": 0.0583, + "num_input_tokens_seen": 68475936, + "step": 31730 + }, + { + "epoch": 5.17699836867863, + "grad_norm": 0.6081488728523254, + "learning_rate": 2.781416725415204e-05, + "loss": 0.0607, + "num_input_tokens_seen": 68486496, + "step": 31735 + }, + { + "epoch": 5.177814029363785, + "grad_norm": 0.4052134156227112, + "learning_rate": 2.780709439858677e-05, + "loss": 0.059, + "num_input_tokens_seen": 68497408, + "step": 31740 + }, + { + "epoch": 5.17862969004894, + "grad_norm": 0.38420602679252625, + "learning_rate": 2.7800021315464176e-05, + "loss": 0.0871, + "num_input_tokens_seen": 68506720, + "step": 31745 + }, + { + "epoch": 5.1794453507340945, + "grad_norm": 1.2670568227767944, + "learning_rate": 2.779294800535765e-05, + "loss": 0.2504, + "num_input_tokens_seen": 68516800, + "step": 31750 + }, + { + "epoch": 5.180261011419249, + "grad_norm": 0.7592873573303223, + "learning_rate": 2.778587446884059e-05, + "loss": 0.0565, + "num_input_tokens_seen": 68525120, + "step": 31755 + }, + { + "epoch": 5.181076672104404, + "grad_norm": 0.1342897117137909, + "learning_rate": 2.777880070648641e-05, + "loss": 0.1655, + "num_input_tokens_seen": 68535872, + "step": 31760 + }, + { + "epoch": 5.18189233278956, + "grad_norm": 0.6944959163665771, + "learning_rate": 2.777172671886854e-05, + "loss": 0.107, + "num_input_tokens_seen": 68545760, + "step": 31765 + }, + { + "epoch": 5.182707993474715, + "grad_norm": 0.3133276104927063, + "learning_rate": 2.776465250656044e-05, + "loss": 0.0753, + "num_input_tokens_seen": 68555552, + "step": 31770 + }, + { + "epoch": 5.1835236541598695, + "grad_norm": 1.3245728015899658, + "learning_rate": 2.7757578070135588e-05, + "loss": 0.0429, + "num_input_tokens_seen": 68565760, + "step": 31775 + }, + { + "epoch": 5.184339314845024, + "grad_norm": 1.7633616924285889, + "learning_rate": 2.775050341016746e-05, + "loss": 0.0622, + "num_input_tokens_seen": 68576160, + "step": 31780 + }, + { + "epoch": 5.185154975530179, + "grad_norm": 1.8059039115905762, + "learning_rate": 2.774342852722957e-05, + "loss": 0.2424, + "num_input_tokens_seen": 68587360, + "step": 31785 + }, + { + "epoch": 5.185970636215335, + "grad_norm": 0.3840816915035248, + "learning_rate": 2.7736353421895445e-05, + "loss": 0.0317, + "num_input_tokens_seen": 68598336, + "step": 31790 + }, + { + "epoch": 5.18678629690049, + "grad_norm": 0.3170669972896576, + "learning_rate": 2.772927809473862e-05, + "loss": 0.1708, + "num_input_tokens_seen": 68610560, + "step": 31795 + }, + { + "epoch": 5.1876019575856445, + "grad_norm": 0.11404617875814438, + "learning_rate": 2.7722202546332676e-05, + "loss": 0.0691, + "num_input_tokens_seen": 68621472, + "step": 31800 + }, + { + "epoch": 5.188417618270799, + "grad_norm": 1.8305658102035522, + "learning_rate": 2.7715126777251177e-05, + "loss": 0.2399, + "num_input_tokens_seen": 68632064, + "step": 31805 + }, + { + "epoch": 5.189233278955954, + "grad_norm": 0.7111310362815857, + "learning_rate": 2.7708050788067724e-05, + "loss": 0.2104, + "num_input_tokens_seen": 68642784, + "step": 31810 + }, + { + "epoch": 5.190048939641109, + "grad_norm": 0.6489546298980713, + "learning_rate": 2.7700974579355933e-05, + "loss": 0.1543, + "num_input_tokens_seen": 68652448, + "step": 31815 + }, + { + "epoch": 5.190864600326265, + "grad_norm": 0.7322481274604797, + "learning_rate": 2.769389815168944e-05, + "loss": 0.1621, + "num_input_tokens_seen": 68662656, + "step": 31820 + }, + { + "epoch": 5.191680261011419, + "grad_norm": 0.35210010409355164, + "learning_rate": 2.7686821505641893e-05, + "loss": 0.0415, + "num_input_tokens_seen": 68673248, + "step": 31825 + }, + { + "epoch": 5.192495921696574, + "grad_norm": 0.8265831470489502, + "learning_rate": 2.7679744641786963e-05, + "loss": 0.1051, + "num_input_tokens_seen": 68683776, + "step": 31830 + }, + { + "epoch": 5.193311582381729, + "grad_norm": 0.12210773676633835, + "learning_rate": 2.7672667560698328e-05, + "loss": 0.1092, + "num_input_tokens_seen": 68694112, + "step": 31835 + }, + { + "epoch": 5.194127243066884, + "grad_norm": 1.3703761100769043, + "learning_rate": 2.7665590262949707e-05, + "loss": 0.2637, + "num_input_tokens_seen": 68704512, + "step": 31840 + }, + { + "epoch": 5.19494290375204, + "grad_norm": 0.1190643236041069, + "learning_rate": 2.7658512749114816e-05, + "loss": 0.1125, + "num_input_tokens_seen": 68715776, + "step": 31845 + }, + { + "epoch": 5.195758564437194, + "grad_norm": 0.22993601858615875, + "learning_rate": 2.7651435019767384e-05, + "loss": 0.1792, + "num_input_tokens_seen": 68725888, + "step": 31850 + }, + { + "epoch": 5.196574225122349, + "grad_norm": 0.08158719539642334, + "learning_rate": 2.764435707548118e-05, + "loss": 0.0953, + "num_input_tokens_seen": 68736864, + "step": 31855 + }, + { + "epoch": 5.197389885807504, + "grad_norm": 0.2580280005931854, + "learning_rate": 2.7637278916829977e-05, + "loss": 0.0969, + "num_input_tokens_seen": 68747264, + "step": 31860 + }, + { + "epoch": 5.198205546492659, + "grad_norm": 0.16134226322174072, + "learning_rate": 2.7630200544387562e-05, + "loss": 0.2264, + "num_input_tokens_seen": 68757792, + "step": 31865 + }, + { + "epoch": 5.199021207177814, + "grad_norm": 0.6080136895179749, + "learning_rate": 2.762312195872775e-05, + "loss": 0.2657, + "num_input_tokens_seen": 68769440, + "step": 31870 + }, + { + "epoch": 5.199836867862969, + "grad_norm": 0.9957655668258667, + "learning_rate": 2.761604316042436e-05, + "loss": 0.1234, + "num_input_tokens_seen": 68780224, + "step": 31875 + }, + { + "epoch": 5.200652528548124, + "grad_norm": 0.6834363341331482, + "learning_rate": 2.760896415005123e-05, + "loss": 0.0726, + "num_input_tokens_seen": 68791008, + "step": 31880 + }, + { + "epoch": 5.201468189233279, + "grad_norm": 0.08492528647184372, + "learning_rate": 2.7601884928182238e-05, + "loss": 0.086, + "num_input_tokens_seen": 68801088, + "step": 31885 + }, + { + "epoch": 5.202283849918434, + "grad_norm": 0.3485061824321747, + "learning_rate": 2.759480549539125e-05, + "loss": 0.0632, + "num_input_tokens_seen": 68812128, + "step": 31890 + }, + { + "epoch": 5.203099510603589, + "grad_norm": 0.8797106146812439, + "learning_rate": 2.758772585225216e-05, + "loss": 0.0804, + "num_input_tokens_seen": 68822688, + "step": 31895 + }, + { + "epoch": 5.2039151712887435, + "grad_norm": 1.9435482025146484, + "learning_rate": 2.7580645999338885e-05, + "loss": 0.2774, + "num_input_tokens_seen": 68832896, + "step": 31900 + }, + { + "epoch": 5.204730831973899, + "grad_norm": 0.6081010699272156, + "learning_rate": 2.757356593722534e-05, + "loss": 0.0525, + "num_input_tokens_seen": 68844224, + "step": 31905 + }, + { + "epoch": 5.205546492659054, + "grad_norm": 0.23902736604213715, + "learning_rate": 2.7566485666485496e-05, + "loss": 0.1447, + "num_input_tokens_seen": 68856512, + "step": 31910 + }, + { + "epoch": 5.206362153344209, + "grad_norm": 0.43897294998168945, + "learning_rate": 2.75594051876933e-05, + "loss": 0.0969, + "num_input_tokens_seen": 68867552, + "step": 31915 + }, + { + "epoch": 5.207177814029364, + "grad_norm": 0.8752716183662415, + "learning_rate": 2.755232450142272e-05, + "loss": 0.1636, + "num_input_tokens_seen": 68877536, + "step": 31920 + }, + { + "epoch": 5.2079934747145185, + "grad_norm": 1.78202486038208, + "learning_rate": 2.754524360824778e-05, + "loss": 0.348, + "num_input_tokens_seen": 68887360, + "step": 31925 + }, + { + "epoch": 5.208809135399674, + "grad_norm": 0.5885811448097229, + "learning_rate": 2.7538162508742472e-05, + "loss": 0.1743, + "num_input_tokens_seen": 68899328, + "step": 31930 + }, + { + "epoch": 5.209624796084829, + "grad_norm": 0.21337798237800598, + "learning_rate": 2.7531081203480834e-05, + "loss": 0.0949, + "num_input_tokens_seen": 68909888, + "step": 31935 + }, + { + "epoch": 5.210440456769984, + "grad_norm": 1.7742751836776733, + "learning_rate": 2.7523999693036916e-05, + "loss": 0.2304, + "num_input_tokens_seen": 68920416, + "step": 31940 + }, + { + "epoch": 5.211256117455139, + "grad_norm": 0.35617586970329285, + "learning_rate": 2.7516917977984773e-05, + "loss": 0.0946, + "num_input_tokens_seen": 68931936, + "step": 31945 + }, + { + "epoch": 5.212071778140293, + "grad_norm": 1.8491452932357788, + "learning_rate": 2.7509836058898487e-05, + "loss": 0.1791, + "num_input_tokens_seen": 68942912, + "step": 31950 + }, + { + "epoch": 5.212887438825448, + "grad_norm": 0.3081819713115692, + "learning_rate": 2.750275393635215e-05, + "loss": 0.3377, + "num_input_tokens_seen": 68954400, + "step": 31955 + }, + { + "epoch": 5.213703099510604, + "grad_norm": 1.0540478229522705, + "learning_rate": 2.7495671610919886e-05, + "loss": 0.0846, + "num_input_tokens_seen": 68964032, + "step": 31960 + }, + { + "epoch": 5.214518760195759, + "grad_norm": 0.28353071212768555, + "learning_rate": 2.748858908317582e-05, + "loss": 0.0625, + "num_input_tokens_seen": 68974944, + "step": 31965 + }, + { + "epoch": 5.215334420880914, + "grad_norm": 0.21728989481925964, + "learning_rate": 2.7481506353694092e-05, + "loss": 0.0815, + "num_input_tokens_seen": 68985856, + "step": 31970 + }, + { + "epoch": 5.216150081566068, + "grad_norm": 1.1983259916305542, + "learning_rate": 2.7474423423048873e-05, + "loss": 0.104, + "num_input_tokens_seen": 68996768, + "step": 31975 + }, + { + "epoch": 5.216965742251223, + "grad_norm": 1.1201660633087158, + "learning_rate": 2.746734029181433e-05, + "loss": 0.1077, + "num_input_tokens_seen": 69008064, + "step": 31980 + }, + { + "epoch": 5.217781402936378, + "grad_norm": 0.19003356993198395, + "learning_rate": 2.7460256960564668e-05, + "loss": 0.165, + "num_input_tokens_seen": 69018240, + "step": 31985 + }, + { + "epoch": 5.218597063621534, + "grad_norm": 0.8017317652702332, + "learning_rate": 2.7453173429874096e-05, + "loss": 0.1643, + "num_input_tokens_seen": 69028128, + "step": 31990 + }, + { + "epoch": 5.219412724306689, + "grad_norm": 1.29346764087677, + "learning_rate": 2.744608970031683e-05, + "loss": 0.1313, + "num_input_tokens_seen": 69038720, + "step": 31995 + }, + { + "epoch": 5.220228384991843, + "grad_norm": 0.62893146276474, + "learning_rate": 2.7439005772467126e-05, + "loss": 0.0234, + "num_input_tokens_seen": 69050368, + "step": 32000 + }, + { + "epoch": 5.221044045676998, + "grad_norm": 0.060869913548231125, + "learning_rate": 2.743192164689924e-05, + "loss": 0.0235, + "num_input_tokens_seen": 69060672, + "step": 32005 + }, + { + "epoch": 5.221859706362153, + "grad_norm": 0.5458961725234985, + "learning_rate": 2.742483732418744e-05, + "loss": 0.2075, + "num_input_tokens_seen": 69072480, + "step": 32010 + }, + { + "epoch": 5.222675367047309, + "grad_norm": 0.1872738003730774, + "learning_rate": 2.7417752804906027e-05, + "loss": 0.0684, + "num_input_tokens_seen": 69083232, + "step": 32015 + }, + { + "epoch": 5.2234910277324635, + "grad_norm": 0.7728226780891418, + "learning_rate": 2.7410668089629304e-05, + "loss": 0.0717, + "num_input_tokens_seen": 69094912, + "step": 32020 + }, + { + "epoch": 5.224306688417618, + "grad_norm": 0.45959919691085815, + "learning_rate": 2.7403583178931597e-05, + "loss": 0.0424, + "num_input_tokens_seen": 69106432, + "step": 32025 + }, + { + "epoch": 5.225122349102773, + "grad_norm": 1.094530701637268, + "learning_rate": 2.7396498073387245e-05, + "loss": 0.0872, + "num_input_tokens_seen": 69117184, + "step": 32030 + }, + { + "epoch": 5.225938009787928, + "grad_norm": 0.2915984094142914, + "learning_rate": 2.7389412773570595e-05, + "loss": 0.0559, + "num_input_tokens_seen": 69127520, + "step": 32035 + }, + { + "epoch": 5.226753670473083, + "grad_norm": 0.0593239851295948, + "learning_rate": 2.738232728005602e-05, + "loss": 0.1351, + "num_input_tokens_seen": 69139104, + "step": 32040 + }, + { + "epoch": 5.2275693311582385, + "grad_norm": 1.3653908967971802, + "learning_rate": 2.737524159341791e-05, + "loss": 0.0674, + "num_input_tokens_seen": 69149312, + "step": 32045 + }, + { + "epoch": 5.228384991843393, + "grad_norm": 0.10092675685882568, + "learning_rate": 2.7368155714230663e-05, + "loss": 0.132, + "num_input_tokens_seen": 69160576, + "step": 32050 + }, + { + "epoch": 5.229200652528548, + "grad_norm": 0.38964179158210754, + "learning_rate": 2.7361069643068698e-05, + "loss": 0.04, + "num_input_tokens_seen": 69171360, + "step": 32055 + }, + { + "epoch": 5.230016313213703, + "grad_norm": 1.4117554426193237, + "learning_rate": 2.7353983380506444e-05, + "loss": 0.386, + "num_input_tokens_seen": 69183360, + "step": 32060 + }, + { + "epoch": 5.230831973898858, + "grad_norm": 0.21871192753314972, + "learning_rate": 2.734689692711836e-05, + "loss": 0.0944, + "num_input_tokens_seen": 69195392, + "step": 32065 + }, + { + "epoch": 5.231647634584013, + "grad_norm": 0.6834312081336975, + "learning_rate": 2.73398102834789e-05, + "loss": 0.0269, + "num_input_tokens_seen": 69206816, + "step": 32070 + }, + { + "epoch": 5.232463295269168, + "grad_norm": 0.6874644756317139, + "learning_rate": 2.7332723450162544e-05, + "loss": 0.0345, + "num_input_tokens_seen": 69217248, + "step": 32075 + }, + { + "epoch": 5.233278955954323, + "grad_norm": 1.039467692375183, + "learning_rate": 2.7325636427743788e-05, + "loss": 0.1816, + "num_input_tokens_seen": 69228480, + "step": 32080 + }, + { + "epoch": 5.234094616639478, + "grad_norm": 0.9328274726867676, + "learning_rate": 2.731854921679715e-05, + "loss": 0.1822, + "num_input_tokens_seen": 69240832, + "step": 32085 + }, + { + "epoch": 5.234910277324633, + "grad_norm": 1.1466262340545654, + "learning_rate": 2.7311461817897143e-05, + "loss": 0.0651, + "num_input_tokens_seen": 69250976, + "step": 32090 + }, + { + "epoch": 5.235725938009788, + "grad_norm": 0.08023878931999207, + "learning_rate": 2.7304374231618318e-05, + "loss": 0.0803, + "num_input_tokens_seen": 69261184, + "step": 32095 + }, + { + "epoch": 5.236541598694943, + "grad_norm": 0.7207009792327881, + "learning_rate": 2.729728645853522e-05, + "loss": 0.2312, + "num_input_tokens_seen": 69273024, + "step": 32100 + }, + { + "epoch": 5.237357259380098, + "grad_norm": 0.21625261008739471, + "learning_rate": 2.729019849922243e-05, + "loss": 0.0521, + "num_input_tokens_seen": 69285280, + "step": 32105 + }, + { + "epoch": 5.238172920065253, + "grad_norm": 0.17811504006385803, + "learning_rate": 2.7283110354254526e-05, + "loss": 0.0811, + "num_input_tokens_seen": 69295680, + "step": 32110 + }, + { + "epoch": 5.238988580750408, + "grad_norm": 1.2762020826339722, + "learning_rate": 2.727602202420611e-05, + "loss": 0.1856, + "num_input_tokens_seen": 69305888, + "step": 32115 + }, + { + "epoch": 5.239804241435563, + "grad_norm": 0.7115421891212463, + "learning_rate": 2.7268933509651806e-05, + "loss": 0.0964, + "num_input_tokens_seen": 69317856, + "step": 32120 + }, + { + "epoch": 5.240619902120717, + "grad_norm": 1.7073214054107666, + "learning_rate": 2.7261844811166236e-05, + "loss": 0.3271, + "num_input_tokens_seen": 69329088, + "step": 32125 + }, + { + "epoch": 5.241435562805873, + "grad_norm": 1.318413496017456, + "learning_rate": 2.725475592932405e-05, + "loss": 0.1377, + "num_input_tokens_seen": 69340544, + "step": 32130 + }, + { + "epoch": 5.242251223491028, + "grad_norm": 1.3532123565673828, + "learning_rate": 2.724766686469991e-05, + "loss": 0.2582, + "num_input_tokens_seen": 69351104, + "step": 32135 + }, + { + "epoch": 5.243066884176183, + "grad_norm": 0.03839603066444397, + "learning_rate": 2.724057761786849e-05, + "loss": 0.1042, + "num_input_tokens_seen": 69362848, + "step": 32140 + }, + { + "epoch": 5.2438825448613375, + "grad_norm": 1.0577501058578491, + "learning_rate": 2.7233488189404478e-05, + "loss": 0.1363, + "num_input_tokens_seen": 69373504, + "step": 32145 + }, + { + "epoch": 5.244698205546492, + "grad_norm": 0.10017810016870499, + "learning_rate": 2.7226398579882573e-05, + "loss": 0.0594, + "num_input_tokens_seen": 69384256, + "step": 32150 + }, + { + "epoch": 5.245513866231648, + "grad_norm": 0.40099966526031494, + "learning_rate": 2.7219308789877513e-05, + "loss": 0.1144, + "num_input_tokens_seen": 69394880, + "step": 32155 + }, + { + "epoch": 5.246329526916803, + "grad_norm": 0.36648550629615784, + "learning_rate": 2.7212218819964013e-05, + "loss": 0.0846, + "num_input_tokens_seen": 69405120, + "step": 32160 + }, + { + "epoch": 5.247145187601958, + "grad_norm": 0.035507749766111374, + "learning_rate": 2.720512867071684e-05, + "loss": 0.0716, + "num_input_tokens_seen": 69416640, + "step": 32165 + }, + { + "epoch": 5.2479608482871125, + "grad_norm": 0.4799066185951233, + "learning_rate": 2.719803834271074e-05, + "loss": 0.1754, + "num_input_tokens_seen": 69426912, + "step": 32170 + }, + { + "epoch": 5.248776508972267, + "grad_norm": 0.061930108815431595, + "learning_rate": 2.7190947836520502e-05, + "loss": 0.193, + "num_input_tokens_seen": 69436512, + "step": 32175 + }, + { + "epoch": 5.249592169657422, + "grad_norm": 0.7084671258926392, + "learning_rate": 2.718385715272092e-05, + "loss": 0.3761, + "num_input_tokens_seen": 69447424, + "step": 32180 + }, + { + "epoch": 5.250407830342578, + "grad_norm": 0.734576404094696, + "learning_rate": 2.7176766291886792e-05, + "loss": 0.2088, + "num_input_tokens_seen": 69459136, + "step": 32185 + }, + { + "epoch": 5.251223491027733, + "grad_norm": 1.2771655321121216, + "learning_rate": 2.7169675254592947e-05, + "loss": 0.168, + "num_input_tokens_seen": 69470048, + "step": 32190 + }, + { + "epoch": 5.2520391517128875, + "grad_norm": 0.580244779586792, + "learning_rate": 2.716258404141421e-05, + "loss": 0.1561, + "num_input_tokens_seen": 69480992, + "step": 32195 + }, + { + "epoch": 5.252854812398042, + "grad_norm": 2.5051915645599365, + "learning_rate": 2.7155492652925446e-05, + "loss": 0.1824, + "num_input_tokens_seen": 69491936, + "step": 32200 + }, + { + "epoch": 5.253670473083197, + "grad_norm": 0.9106834530830383, + "learning_rate": 2.714840108970151e-05, + "loss": 0.2234, + "num_input_tokens_seen": 69500896, + "step": 32205 + }, + { + "epoch": 5.254486133768353, + "grad_norm": 0.6579012274742126, + "learning_rate": 2.7141309352317278e-05, + "loss": 0.0944, + "num_input_tokens_seen": 69509888, + "step": 32210 + }, + { + "epoch": 5.255301794453508, + "grad_norm": 0.2011554092168808, + "learning_rate": 2.7134217441347647e-05, + "loss": 0.081, + "num_input_tokens_seen": 69519840, + "step": 32215 + }, + { + "epoch": 5.2561174551386625, + "grad_norm": 1.6986922025680542, + "learning_rate": 2.7127125357367515e-05, + "loss": 0.0968, + "num_input_tokens_seen": 69530400, + "step": 32220 + }, + { + "epoch": 5.256933115823817, + "grad_norm": 0.04942469298839569, + "learning_rate": 2.7120033100951814e-05, + "loss": 0.0624, + "num_input_tokens_seen": 69540992, + "step": 32225 + }, + { + "epoch": 5.257748776508972, + "grad_norm": 0.4370053708553314, + "learning_rate": 2.7112940672675473e-05, + "loss": 0.1506, + "num_input_tokens_seen": 69552704, + "step": 32230 + }, + { + "epoch": 5.258564437194127, + "grad_norm": 0.9967560768127441, + "learning_rate": 2.7105848073113433e-05, + "loss": 0.0581, + "num_input_tokens_seen": 69563648, + "step": 32235 + }, + { + "epoch": 5.259380097879283, + "grad_norm": 2.1827542781829834, + "learning_rate": 2.709875530284067e-05, + "loss": 0.1325, + "num_input_tokens_seen": 69574880, + "step": 32240 + }, + { + "epoch": 5.260195758564437, + "grad_norm": 0.9553539752960205, + "learning_rate": 2.7091662362432153e-05, + "loss": 0.2396, + "num_input_tokens_seen": 69586496, + "step": 32245 + }, + { + "epoch": 5.261011419249592, + "grad_norm": 1.220571756362915, + "learning_rate": 2.7084569252462873e-05, + "loss": 0.1304, + "num_input_tokens_seen": 69596704, + "step": 32250 + }, + { + "epoch": 5.261827079934747, + "grad_norm": 2.1674768924713135, + "learning_rate": 2.7077475973507832e-05, + "loss": 0.2313, + "num_input_tokens_seen": 69606912, + "step": 32255 + }, + { + "epoch": 5.262642740619902, + "grad_norm": 1.254448413848877, + "learning_rate": 2.7070382526142045e-05, + "loss": 0.3018, + "num_input_tokens_seen": 69618880, + "step": 32260 + }, + { + "epoch": 5.263458401305057, + "grad_norm": 0.9256694912910461, + "learning_rate": 2.706328891094055e-05, + "loss": 0.0863, + "num_input_tokens_seen": 69629440, + "step": 32265 + }, + { + "epoch": 5.264274061990212, + "grad_norm": 0.25340592861175537, + "learning_rate": 2.7056195128478384e-05, + "loss": 0.0999, + "num_input_tokens_seen": 69640128, + "step": 32270 + }, + { + "epoch": 5.265089722675367, + "grad_norm": 0.8832587003707886, + "learning_rate": 2.7049101179330605e-05, + "loss": 0.1269, + "num_input_tokens_seen": 69650816, + "step": 32275 + }, + { + "epoch": 5.265905383360522, + "grad_norm": 0.3262135088443756, + "learning_rate": 2.7042007064072288e-05, + "loss": 0.1242, + "num_input_tokens_seen": 69661376, + "step": 32280 + }, + { + "epoch": 5.266721044045677, + "grad_norm": 1.2730554342269897, + "learning_rate": 2.703491278327852e-05, + "loss": 0.0883, + "num_input_tokens_seen": 69671200, + "step": 32285 + }, + { + "epoch": 5.267536704730832, + "grad_norm": 1.1479891538619995, + "learning_rate": 2.7027818337524396e-05, + "loss": 0.2041, + "num_input_tokens_seen": 69682048, + "step": 32290 + }, + { + "epoch": 5.268352365415987, + "grad_norm": 0.1435851901769638, + "learning_rate": 2.7020723727385027e-05, + "loss": 0.07, + "num_input_tokens_seen": 69694208, + "step": 32295 + }, + { + "epoch": 5.269168026101142, + "grad_norm": 0.2321045696735382, + "learning_rate": 2.7013628953435544e-05, + "loss": 0.0901, + "num_input_tokens_seen": 69702848, + "step": 32300 + }, + { + "epoch": 5.269983686786297, + "grad_norm": 0.37368208169937134, + "learning_rate": 2.7006534016251072e-05, + "loss": 0.0493, + "num_input_tokens_seen": 69714080, + "step": 32305 + }, + { + "epoch": 5.270799347471452, + "grad_norm": 0.04321548715233803, + "learning_rate": 2.6999438916406777e-05, + "loss": 0.0385, + "num_input_tokens_seen": 69725408, + "step": 32310 + }, + { + "epoch": 5.271615008156607, + "grad_norm": 0.40996718406677246, + "learning_rate": 2.6992343654477825e-05, + "loss": 0.0331, + "num_input_tokens_seen": 69736704, + "step": 32315 + }, + { + "epoch": 5.2724306688417615, + "grad_norm": 0.47859853506088257, + "learning_rate": 2.6985248231039378e-05, + "loss": 0.059, + "num_input_tokens_seen": 69747264, + "step": 32320 + }, + { + "epoch": 5.273246329526917, + "grad_norm": 0.8511914610862732, + "learning_rate": 2.6978152646666644e-05, + "loss": 0.0781, + "num_input_tokens_seen": 69758592, + "step": 32325 + }, + { + "epoch": 5.274061990212072, + "grad_norm": 0.4381433129310608, + "learning_rate": 2.697105690193481e-05, + "loss": 0.0827, + "num_input_tokens_seen": 69769088, + "step": 32330 + }, + { + "epoch": 5.274877650897227, + "grad_norm": 0.5958948731422424, + "learning_rate": 2.696396099741911e-05, + "loss": 0.123, + "num_input_tokens_seen": 69780064, + "step": 32335 + }, + { + "epoch": 5.275693311582382, + "grad_norm": 0.17133241891860962, + "learning_rate": 2.695686493369476e-05, + "loss": 0.1423, + "num_input_tokens_seen": 69792064, + "step": 32340 + }, + { + "epoch": 5.2765089722675365, + "grad_norm": 0.8682675361633301, + "learning_rate": 2.6949768711337015e-05, + "loss": 0.2336, + "num_input_tokens_seen": 69802304, + "step": 32345 + }, + { + "epoch": 5.277324632952691, + "grad_norm": 0.18695512413978577, + "learning_rate": 2.6942672330921124e-05, + "loss": 0.0643, + "num_input_tokens_seen": 69811104, + "step": 32350 + }, + { + "epoch": 5.278140293637847, + "grad_norm": 0.07860486954450607, + "learning_rate": 2.693557579302236e-05, + "loss": 0.0566, + "num_input_tokens_seen": 69823072, + "step": 32355 + }, + { + "epoch": 5.278955954323002, + "grad_norm": 1.9765772819519043, + "learning_rate": 2.6928479098216e-05, + "loss": 0.2334, + "num_input_tokens_seen": 69835104, + "step": 32360 + }, + { + "epoch": 5.279771615008157, + "grad_norm": 0.8001852035522461, + "learning_rate": 2.6921382247077336e-05, + "loss": 0.1281, + "num_input_tokens_seen": 69846784, + "step": 32365 + }, + { + "epoch": 5.280587275693311, + "grad_norm": 0.4027155935764313, + "learning_rate": 2.691428524018168e-05, + "loss": 0.0704, + "num_input_tokens_seen": 69857024, + "step": 32370 + }, + { + "epoch": 5.281402936378466, + "grad_norm": 0.2207874208688736, + "learning_rate": 2.6907188078104352e-05, + "loss": 0.0221, + "num_input_tokens_seen": 69866624, + "step": 32375 + }, + { + "epoch": 5.282218597063622, + "grad_norm": 0.9720665216445923, + "learning_rate": 2.690009076142067e-05, + "loss": 0.1335, + "num_input_tokens_seen": 69878816, + "step": 32380 + }, + { + "epoch": 5.283034257748777, + "grad_norm": 0.182729572057724, + "learning_rate": 2.6892993290706e-05, + "loss": 0.1067, + "num_input_tokens_seen": 69889184, + "step": 32385 + }, + { + "epoch": 5.283849918433932, + "grad_norm": 0.3508009910583496, + "learning_rate": 2.6885895666535684e-05, + "loss": 0.2114, + "num_input_tokens_seen": 69900160, + "step": 32390 + }, + { + "epoch": 5.284665579119086, + "grad_norm": 0.038284774869680405, + "learning_rate": 2.6878797889485096e-05, + "loss": 0.1319, + "num_input_tokens_seen": 69910848, + "step": 32395 + }, + { + "epoch": 5.285481239804241, + "grad_norm": 0.09873811900615692, + "learning_rate": 2.687169996012962e-05, + "loss": 0.0255, + "num_input_tokens_seen": 69921856, + "step": 32400 + }, + { + "epoch": 5.286296900489396, + "grad_norm": 0.14854714274406433, + "learning_rate": 2.6864601879044653e-05, + "loss": 0.0599, + "num_input_tokens_seen": 69932768, + "step": 32405 + }, + { + "epoch": 5.287112561174552, + "grad_norm": 1.3320354223251343, + "learning_rate": 2.6857503646805593e-05, + "loss": 0.1516, + "num_input_tokens_seen": 69943488, + "step": 32410 + }, + { + "epoch": 5.287928221859707, + "grad_norm": 1.7986290454864502, + "learning_rate": 2.6850405263987867e-05, + "loss": 0.0963, + "num_input_tokens_seen": 69953312, + "step": 32415 + }, + { + "epoch": 5.288743882544861, + "grad_norm": 0.9065678119659424, + "learning_rate": 2.6843306731166894e-05, + "loss": 0.1141, + "num_input_tokens_seen": 69963072, + "step": 32420 + }, + { + "epoch": 5.289559543230016, + "grad_norm": 0.14670032262802124, + "learning_rate": 2.6836208048918132e-05, + "loss": 0.1037, + "num_input_tokens_seen": 69973600, + "step": 32425 + }, + { + "epoch": 5.290375203915171, + "grad_norm": 0.11385879665613174, + "learning_rate": 2.682910921781702e-05, + "loss": 0.104, + "num_input_tokens_seen": 69984384, + "step": 32430 + }, + { + "epoch": 5.291190864600326, + "grad_norm": 0.2959834635257721, + "learning_rate": 2.682201023843904e-05, + "loss": 0.0714, + "num_input_tokens_seen": 69995616, + "step": 32435 + }, + { + "epoch": 5.2920065252854815, + "grad_norm": 1.356833577156067, + "learning_rate": 2.6814911111359665e-05, + "loss": 0.1557, + "num_input_tokens_seen": 70006432, + "step": 32440 + }, + { + "epoch": 5.292822185970636, + "grad_norm": 1.560719609260559, + "learning_rate": 2.6807811837154383e-05, + "loss": 0.2201, + "num_input_tokens_seen": 70017120, + "step": 32445 + }, + { + "epoch": 5.293637846655791, + "grad_norm": 0.5568475723266602, + "learning_rate": 2.6800712416398705e-05, + "loss": 0.127, + "num_input_tokens_seen": 70027744, + "step": 32450 + }, + { + "epoch": 5.294453507340946, + "grad_norm": 1.1908375024795532, + "learning_rate": 2.6793612849668138e-05, + "loss": 0.1422, + "num_input_tokens_seen": 70039360, + "step": 32455 + }, + { + "epoch": 5.295269168026101, + "grad_norm": 0.057870857417583466, + "learning_rate": 2.6786513137538216e-05, + "loss": 0.2331, + "num_input_tokens_seen": 70049952, + "step": 32460 + }, + { + "epoch": 5.2960848287112565, + "grad_norm": 0.24213384091854095, + "learning_rate": 2.677941328058447e-05, + "loss": 0.1602, + "num_input_tokens_seen": 70059904, + "step": 32465 + }, + { + "epoch": 5.296900489396411, + "grad_norm": 0.2150900661945343, + "learning_rate": 2.677231327938246e-05, + "loss": 0.2262, + "num_input_tokens_seen": 70070464, + "step": 32470 + }, + { + "epoch": 5.297716150081566, + "grad_norm": 0.9027369022369385, + "learning_rate": 2.676521313450774e-05, + "loss": 0.0633, + "num_input_tokens_seen": 70081472, + "step": 32475 + }, + { + "epoch": 5.298531810766721, + "grad_norm": 0.30362507700920105, + "learning_rate": 2.6758112846535888e-05, + "loss": 0.0882, + "num_input_tokens_seen": 70091360, + "step": 32480 + }, + { + "epoch": 5.299347471451876, + "grad_norm": 0.7668594121932983, + "learning_rate": 2.6751012416042487e-05, + "loss": 0.1062, + "num_input_tokens_seen": 70101088, + "step": 32485 + }, + { + "epoch": 5.300163132137031, + "grad_norm": 0.5197054147720337, + "learning_rate": 2.674391184360313e-05, + "loss": 0.0756, + "num_input_tokens_seen": 70112608, + "step": 32490 + }, + { + "epoch": 5.300978792822186, + "grad_norm": 0.03211485967040062, + "learning_rate": 2.6736811129793438e-05, + "loss": 0.0363, + "num_input_tokens_seen": 70124128, + "step": 32495 + }, + { + "epoch": 5.301794453507341, + "grad_norm": 2.5017478466033936, + "learning_rate": 2.6729710275189024e-05, + "loss": 0.3071, + "num_input_tokens_seen": 70136736, + "step": 32500 + }, + { + "epoch": 5.302610114192496, + "grad_norm": 0.3426743149757385, + "learning_rate": 2.672260928036552e-05, + "loss": 0.1093, + "num_input_tokens_seen": 70148768, + "step": 32505 + }, + { + "epoch": 5.303425774877651, + "grad_norm": 1.3173344135284424, + "learning_rate": 2.671550814589856e-05, + "loss": 0.1686, + "num_input_tokens_seen": 70158592, + "step": 32510 + }, + { + "epoch": 5.304241435562806, + "grad_norm": 0.08895739167928696, + "learning_rate": 2.6708406872363813e-05, + "loss": 0.0903, + "num_input_tokens_seen": 70169856, + "step": 32515 + }, + { + "epoch": 5.30505709624796, + "grad_norm": 0.6723769307136536, + "learning_rate": 2.670130546033693e-05, + "loss": 0.0333, + "num_input_tokens_seen": 70179680, + "step": 32520 + }, + { + "epoch": 5.305872756933116, + "grad_norm": 0.4175232946872711, + "learning_rate": 2.6694203910393594e-05, + "loss": 0.1823, + "num_input_tokens_seen": 70191104, + "step": 32525 + }, + { + "epoch": 5.306688417618271, + "grad_norm": 0.12389029562473297, + "learning_rate": 2.66871022231095e-05, + "loss": 0.1477, + "num_input_tokens_seen": 70201728, + "step": 32530 + }, + { + "epoch": 5.307504078303426, + "grad_norm": 0.27309438586235046, + "learning_rate": 2.6680000399060327e-05, + "loss": 0.0227, + "num_input_tokens_seen": 70213088, + "step": 32535 + }, + { + "epoch": 5.308319738988581, + "grad_norm": 0.3897963762283325, + "learning_rate": 2.6672898438821808e-05, + "loss": 0.039, + "num_input_tokens_seen": 70223648, + "step": 32540 + }, + { + "epoch": 5.309135399673735, + "grad_norm": 0.5153766870498657, + "learning_rate": 2.666579634296965e-05, + "loss": 0.125, + "num_input_tokens_seen": 70234272, + "step": 32545 + }, + { + "epoch": 5.309951060358891, + "grad_norm": 0.47751858830451965, + "learning_rate": 2.6658694112079586e-05, + "loss": 0.0941, + "num_input_tokens_seen": 70244800, + "step": 32550 + }, + { + "epoch": 5.310766721044046, + "grad_norm": 0.06826390326023102, + "learning_rate": 2.6651591746727363e-05, + "loss": 0.1531, + "num_input_tokens_seen": 70255360, + "step": 32555 + }, + { + "epoch": 5.311582381729201, + "grad_norm": 0.09438413381576538, + "learning_rate": 2.6644489247488735e-05, + "loss": 0.1048, + "num_input_tokens_seen": 70265888, + "step": 32560 + }, + { + "epoch": 5.3123980424143555, + "grad_norm": 0.13774384558200836, + "learning_rate": 2.6637386614939464e-05, + "loss": 0.1943, + "num_input_tokens_seen": 70276544, + "step": 32565 + }, + { + "epoch": 5.31321370309951, + "grad_norm": 0.14108823239803314, + "learning_rate": 2.6630283849655326e-05, + "loss": 0.0759, + "num_input_tokens_seen": 70287904, + "step": 32570 + }, + { + "epoch": 5.314029363784665, + "grad_norm": 0.2956930994987488, + "learning_rate": 2.6623180952212106e-05, + "loss": 0.0751, + "num_input_tokens_seen": 70299872, + "step": 32575 + }, + { + "epoch": 5.314845024469821, + "grad_norm": 0.10097391158342361, + "learning_rate": 2.66160779231856e-05, + "loss": 0.1135, + "num_input_tokens_seen": 70310624, + "step": 32580 + }, + { + "epoch": 5.315660685154976, + "grad_norm": 0.2313680499792099, + "learning_rate": 2.660897476315162e-05, + "loss": 0.0393, + "num_input_tokens_seen": 70321312, + "step": 32585 + }, + { + "epoch": 5.3164763458401305, + "grad_norm": 0.12124349176883698, + "learning_rate": 2.6601871472685985e-05, + "loss": 0.0635, + "num_input_tokens_seen": 70332384, + "step": 32590 + }, + { + "epoch": 5.317292006525285, + "grad_norm": 0.78958660364151, + "learning_rate": 2.659476805236451e-05, + "loss": 0.1445, + "num_input_tokens_seen": 70343232, + "step": 32595 + }, + { + "epoch": 5.31810766721044, + "grad_norm": 1.60438072681427, + "learning_rate": 2.6587664502763054e-05, + "loss": 0.3474, + "num_input_tokens_seen": 70355104, + "step": 32600 + }, + { + "epoch": 5.318923327895595, + "grad_norm": 2.016028881072998, + "learning_rate": 2.6580560824457457e-05, + "loss": 0.2091, + "num_input_tokens_seen": 70366400, + "step": 32605 + }, + { + "epoch": 5.319738988580751, + "grad_norm": 1.0816799402236938, + "learning_rate": 2.657345701802358e-05, + "loss": 0.1663, + "num_input_tokens_seen": 70378400, + "step": 32610 + }, + { + "epoch": 5.3205546492659055, + "grad_norm": 0.27306821942329407, + "learning_rate": 2.6566353084037295e-05, + "loss": 0.0565, + "num_input_tokens_seen": 70389952, + "step": 32615 + }, + { + "epoch": 5.32137030995106, + "grad_norm": 0.4015747904777527, + "learning_rate": 2.6559249023074474e-05, + "loss": 0.0901, + "num_input_tokens_seen": 70401408, + "step": 32620 + }, + { + "epoch": 5.322185970636215, + "grad_norm": 0.16250860691070557, + "learning_rate": 2.6552144835711017e-05, + "loss": 0.1348, + "num_input_tokens_seen": 70412640, + "step": 32625 + }, + { + "epoch": 5.32300163132137, + "grad_norm": 0.5013503432273865, + "learning_rate": 2.6545040522522828e-05, + "loss": 0.1063, + "num_input_tokens_seen": 70422592, + "step": 32630 + }, + { + "epoch": 5.323817292006526, + "grad_norm": 0.3327314257621765, + "learning_rate": 2.653793608408582e-05, + "loss": 0.1336, + "num_input_tokens_seen": 70433856, + "step": 32635 + }, + { + "epoch": 5.3246329526916805, + "grad_norm": 2.1867520809173584, + "learning_rate": 2.6530831520975903e-05, + "loss": 0.2053, + "num_input_tokens_seen": 70445376, + "step": 32640 + }, + { + "epoch": 5.325448613376835, + "grad_norm": 1.8716121912002563, + "learning_rate": 2.652372683376902e-05, + "loss": 0.2343, + "num_input_tokens_seen": 70456384, + "step": 32645 + }, + { + "epoch": 5.32626427406199, + "grad_norm": 0.14179889857769012, + "learning_rate": 2.65166220230411e-05, + "loss": 0.182, + "num_input_tokens_seen": 70467040, + "step": 32650 + }, + { + "epoch": 5.327079934747145, + "grad_norm": 0.4325231909751892, + "learning_rate": 2.650951708936811e-05, + "loss": 0.1357, + "num_input_tokens_seen": 70477248, + "step": 32655 + }, + { + "epoch": 5.327895595432301, + "grad_norm": 0.12798385322093964, + "learning_rate": 2.6502412033326e-05, + "loss": 0.237, + "num_input_tokens_seen": 70487648, + "step": 32660 + }, + { + "epoch": 5.328711256117455, + "grad_norm": 0.5768354535102844, + "learning_rate": 2.6495306855490754e-05, + "loss": 0.1794, + "num_input_tokens_seen": 70499296, + "step": 32665 + }, + { + "epoch": 5.32952691680261, + "grad_norm": 0.979622483253479, + "learning_rate": 2.6488201556438346e-05, + "loss": 0.1102, + "num_input_tokens_seen": 70509664, + "step": 32670 + }, + { + "epoch": 5.330342577487765, + "grad_norm": 0.26595765352249146, + "learning_rate": 2.648109613674477e-05, + "loss": 0.0483, + "num_input_tokens_seen": 70520064, + "step": 32675 + }, + { + "epoch": 5.33115823817292, + "grad_norm": 1.2309209108352661, + "learning_rate": 2.647399059698602e-05, + "loss": 0.1453, + "num_input_tokens_seen": 70531104, + "step": 32680 + }, + { + "epoch": 5.331973898858075, + "grad_norm": 0.05812413990497589, + "learning_rate": 2.646688493773812e-05, + "loss": 0.092, + "num_input_tokens_seen": 70542528, + "step": 32685 + }, + { + "epoch": 5.33278955954323, + "grad_norm": 0.9786520600318909, + "learning_rate": 2.6459779159577077e-05, + "loss": 0.1281, + "num_input_tokens_seen": 70553216, + "step": 32690 + }, + { + "epoch": 5.333605220228385, + "grad_norm": 1.8560127019882202, + "learning_rate": 2.645267326307893e-05, + "loss": 0.2416, + "num_input_tokens_seen": 70565504, + "step": 32695 + }, + { + "epoch": 5.33442088091354, + "grad_norm": 0.6732898950576782, + "learning_rate": 2.6445567248819726e-05, + "loss": 0.1272, + "num_input_tokens_seen": 70577376, + "step": 32700 + }, + { + "epoch": 5.335236541598695, + "grad_norm": 0.4406517744064331, + "learning_rate": 2.643846111737549e-05, + "loss": 0.1609, + "num_input_tokens_seen": 70588960, + "step": 32705 + }, + { + "epoch": 5.33605220228385, + "grad_norm": 0.6710542440414429, + "learning_rate": 2.643135486932231e-05, + "loss": 0.0689, + "num_input_tokens_seen": 70600192, + "step": 32710 + }, + { + "epoch": 5.3368678629690045, + "grad_norm": 0.09686751663684845, + "learning_rate": 2.642424850523624e-05, + "loss": 0.1054, + "num_input_tokens_seen": 70610816, + "step": 32715 + }, + { + "epoch": 5.33768352365416, + "grad_norm": 1.316989541053772, + "learning_rate": 2.641714202569336e-05, + "loss": 0.3231, + "num_input_tokens_seen": 70621280, + "step": 32720 + }, + { + "epoch": 5.338499184339315, + "grad_norm": 0.7181122303009033, + "learning_rate": 2.6410035431269754e-05, + "loss": 0.1251, + "num_input_tokens_seen": 70631968, + "step": 32725 + }, + { + "epoch": 5.33931484502447, + "grad_norm": 0.08911708742380142, + "learning_rate": 2.6402928722541524e-05, + "loss": 0.122, + "num_input_tokens_seen": 70643296, + "step": 32730 + }, + { + "epoch": 5.340130505709625, + "grad_norm": 0.34748294949531555, + "learning_rate": 2.6395821900084772e-05, + "loss": 0.0486, + "num_input_tokens_seen": 70652576, + "step": 32735 + }, + { + "epoch": 5.3409461663947795, + "grad_norm": 0.22814546525478363, + "learning_rate": 2.638871496447562e-05, + "loss": 0.0799, + "num_input_tokens_seen": 70663424, + "step": 32740 + }, + { + "epoch": 5.341761827079935, + "grad_norm": 1.4161123037338257, + "learning_rate": 2.638160791629018e-05, + "loss": 0.1564, + "num_input_tokens_seen": 70673536, + "step": 32745 + }, + { + "epoch": 5.34257748776509, + "grad_norm": 1.3367749452590942, + "learning_rate": 2.6374500756104594e-05, + "loss": 0.1207, + "num_input_tokens_seen": 70683712, + "step": 32750 + }, + { + "epoch": 5.343393148450245, + "grad_norm": 1.344509482383728, + "learning_rate": 2.6367393484494994e-05, + "loss": 0.1725, + "num_input_tokens_seen": 70693856, + "step": 32755 + }, + { + "epoch": 5.3442088091354, + "grad_norm": 0.4230455160140991, + "learning_rate": 2.636028610203755e-05, + "loss": 0.0718, + "num_input_tokens_seen": 70705024, + "step": 32760 + }, + { + "epoch": 5.3450244698205545, + "grad_norm": 0.36728593707084656, + "learning_rate": 2.635317860930841e-05, + "loss": 0.0522, + "num_input_tokens_seen": 70715040, + "step": 32765 + }, + { + "epoch": 5.345840130505709, + "grad_norm": 1.2678040266036987, + "learning_rate": 2.6346071006883748e-05, + "loss": 0.063, + "num_input_tokens_seen": 70726080, + "step": 32770 + }, + { + "epoch": 5.346655791190865, + "grad_norm": 0.22598996758460999, + "learning_rate": 2.6338963295339737e-05, + "loss": 0.0736, + "num_input_tokens_seen": 70736800, + "step": 32775 + }, + { + "epoch": 5.34747145187602, + "grad_norm": 1.9069526195526123, + "learning_rate": 2.633185547525257e-05, + "loss": 0.1073, + "num_input_tokens_seen": 70748928, + "step": 32780 + }, + { + "epoch": 5.348287112561175, + "grad_norm": 0.5262249112129211, + "learning_rate": 2.6324747547198443e-05, + "loss": 0.1506, + "num_input_tokens_seen": 70760736, + "step": 32785 + }, + { + "epoch": 5.349102773246329, + "grad_norm": 1.1524248123168945, + "learning_rate": 2.631763951175355e-05, + "loss": 0.1195, + "num_input_tokens_seen": 70771552, + "step": 32790 + }, + { + "epoch": 5.349918433931484, + "grad_norm": 0.7824480533599854, + "learning_rate": 2.6310531369494118e-05, + "loss": 0.0892, + "num_input_tokens_seen": 70782656, + "step": 32795 + }, + { + "epoch": 5.350734094616639, + "grad_norm": 1.0246188640594482, + "learning_rate": 2.630342312099637e-05, + "loss": 0.1286, + "num_input_tokens_seen": 70792736, + "step": 32800 + }, + { + "epoch": 5.351549755301795, + "grad_norm": 0.28646135330200195, + "learning_rate": 2.629631476683652e-05, + "loss": 0.1074, + "num_input_tokens_seen": 70804032, + "step": 32805 + }, + { + "epoch": 5.35236541598695, + "grad_norm": 0.9804304838180542, + "learning_rate": 2.6289206307590815e-05, + "loss": 0.0664, + "num_input_tokens_seen": 70814208, + "step": 32810 + }, + { + "epoch": 5.353181076672104, + "grad_norm": 1.0818814039230347, + "learning_rate": 2.6282097743835517e-05, + "loss": 0.181, + "num_input_tokens_seen": 70825312, + "step": 32815 + }, + { + "epoch": 5.353996737357259, + "grad_norm": 0.9424179196357727, + "learning_rate": 2.627498907614686e-05, + "loss": 0.1284, + "num_input_tokens_seen": 70835680, + "step": 32820 + }, + { + "epoch": 5.354812398042414, + "grad_norm": 1.2159817218780518, + "learning_rate": 2.6267880305101127e-05, + "loss": 0.1388, + "num_input_tokens_seen": 70846336, + "step": 32825 + }, + { + "epoch": 5.35562805872757, + "grad_norm": 0.78123539686203, + "learning_rate": 2.626077143127458e-05, + "loss": 0.1071, + "num_input_tokens_seen": 70858016, + "step": 32830 + }, + { + "epoch": 5.356443719412725, + "grad_norm": 1.0209163427352905, + "learning_rate": 2.6253662455243504e-05, + "loss": 0.2432, + "num_input_tokens_seen": 70868704, + "step": 32835 + }, + { + "epoch": 5.357259380097879, + "grad_norm": 0.24893786013126373, + "learning_rate": 2.6246553377584186e-05, + "loss": 0.1181, + "num_input_tokens_seen": 70879392, + "step": 32840 + }, + { + "epoch": 5.358075040783034, + "grad_norm": 1.2298864126205444, + "learning_rate": 2.623944419887293e-05, + "loss": 0.1918, + "num_input_tokens_seen": 70890848, + "step": 32845 + }, + { + "epoch": 5.358890701468189, + "grad_norm": 0.9829404354095459, + "learning_rate": 2.6232334919686035e-05, + "loss": 0.0927, + "num_input_tokens_seen": 70901472, + "step": 32850 + }, + { + "epoch": 5.359706362153344, + "grad_norm": 1.426815152168274, + "learning_rate": 2.6225225540599825e-05, + "loss": 0.2002, + "num_input_tokens_seen": 70911904, + "step": 32855 + }, + { + "epoch": 5.3605220228384995, + "grad_norm": 0.22255435585975647, + "learning_rate": 2.6218116062190605e-05, + "loss": 0.0367, + "num_input_tokens_seen": 70922880, + "step": 32860 + }, + { + "epoch": 5.361337683523654, + "grad_norm": 0.13856905698776245, + "learning_rate": 2.621100648503472e-05, + "loss": 0.0745, + "num_input_tokens_seen": 70933472, + "step": 32865 + }, + { + "epoch": 5.362153344208809, + "grad_norm": 0.07629656046628952, + "learning_rate": 2.6203896809708512e-05, + "loss": 0.1115, + "num_input_tokens_seen": 70944864, + "step": 32870 + }, + { + "epoch": 5.362969004893964, + "grad_norm": 1.4775617122650146, + "learning_rate": 2.619678703678832e-05, + "loss": 0.0991, + "num_input_tokens_seen": 70956512, + "step": 32875 + }, + { + "epoch": 5.363784665579119, + "grad_norm": 0.2830376923084259, + "learning_rate": 2.618967716685049e-05, + "loss": 0.0321, + "num_input_tokens_seen": 70966400, + "step": 32880 + }, + { + "epoch": 5.364600326264274, + "grad_norm": 1.8415154218673706, + "learning_rate": 2.6182567200471396e-05, + "loss": 0.0816, + "num_input_tokens_seen": 70977120, + "step": 32885 + }, + { + "epoch": 5.365415986949429, + "grad_norm": 0.928907573223114, + "learning_rate": 2.6175457138227404e-05, + "loss": 0.1461, + "num_input_tokens_seen": 70987712, + "step": 32890 + }, + { + "epoch": 5.366231647634584, + "grad_norm": 0.6499874591827393, + "learning_rate": 2.6168346980694896e-05, + "loss": 0.0618, + "num_input_tokens_seen": 70998080, + "step": 32895 + }, + { + "epoch": 5.367047308319739, + "grad_norm": 0.6055964827537537, + "learning_rate": 2.6161236728450257e-05, + "loss": 0.1815, + "num_input_tokens_seen": 71009568, + "step": 32900 + }, + { + "epoch": 5.367862969004894, + "grad_norm": 1.6646360158920288, + "learning_rate": 2.6154126382069866e-05, + "loss": 0.2061, + "num_input_tokens_seen": 71021728, + "step": 32905 + }, + { + "epoch": 5.368678629690049, + "grad_norm": 0.30759310722351074, + "learning_rate": 2.6147015942130143e-05, + "loss": 0.0565, + "num_input_tokens_seen": 71032704, + "step": 32910 + }, + { + "epoch": 5.369494290375204, + "grad_norm": 0.1831441968679428, + "learning_rate": 2.6139905409207475e-05, + "loss": 0.0481, + "num_input_tokens_seen": 71043808, + "step": 32915 + }, + { + "epoch": 5.370309951060359, + "grad_norm": 0.16012047231197357, + "learning_rate": 2.61327947838783e-05, + "loss": 0.1529, + "num_input_tokens_seen": 71054752, + "step": 32920 + }, + { + "epoch": 5.371125611745514, + "grad_norm": 0.49410805106163025, + "learning_rate": 2.6125684066719036e-05, + "loss": 0.1515, + "num_input_tokens_seen": 71065216, + "step": 32925 + }, + { + "epoch": 5.371941272430669, + "grad_norm": 0.21318818628787994, + "learning_rate": 2.6118573258306106e-05, + "loss": 0.1823, + "num_input_tokens_seen": 71074752, + "step": 32930 + }, + { + "epoch": 5.372756933115824, + "grad_norm": 0.06402728706598282, + "learning_rate": 2.6111462359215944e-05, + "loss": 0.0993, + "num_input_tokens_seen": 71084864, + "step": 32935 + }, + { + "epoch": 5.373572593800978, + "grad_norm": 0.5962974429130554, + "learning_rate": 2.6104351370025014e-05, + "loss": 0.1614, + "num_input_tokens_seen": 71095392, + "step": 32940 + }, + { + "epoch": 5.374388254486134, + "grad_norm": 0.07998337596654892, + "learning_rate": 2.6097240291309756e-05, + "loss": 0.1275, + "num_input_tokens_seen": 71105472, + "step": 32945 + }, + { + "epoch": 5.375203915171289, + "grad_norm": 0.11067869514226913, + "learning_rate": 2.6090129123646633e-05, + "loss": 0.1885, + "num_input_tokens_seen": 71116448, + "step": 32950 + }, + { + "epoch": 5.376019575856444, + "grad_norm": 0.32002657651901245, + "learning_rate": 2.6083017867612115e-05, + "loss": 0.0469, + "num_input_tokens_seen": 71128032, + "step": 32955 + }, + { + "epoch": 5.376835236541599, + "grad_norm": 0.16989165544509888, + "learning_rate": 2.6075906523782666e-05, + "loss": 0.0791, + "num_input_tokens_seen": 71140448, + "step": 32960 + }, + { + "epoch": 5.377650897226753, + "grad_norm": 0.797529935836792, + "learning_rate": 2.6068795092734783e-05, + "loss": 0.0454, + "num_input_tokens_seen": 71150720, + "step": 32965 + }, + { + "epoch": 5.378466557911908, + "grad_norm": 0.3075840473175049, + "learning_rate": 2.6061683575044937e-05, + "loss": 0.1669, + "num_input_tokens_seen": 71161920, + "step": 32970 + }, + { + "epoch": 5.379282218597064, + "grad_norm": 0.38357776403427124, + "learning_rate": 2.605457197128964e-05, + "loss": 0.0551, + "num_input_tokens_seen": 71173888, + "step": 32975 + }, + { + "epoch": 5.380097879282219, + "grad_norm": 0.05986737832427025, + "learning_rate": 2.6047460282045388e-05, + "loss": 0.0412, + "num_input_tokens_seen": 71185984, + "step": 32980 + }, + { + "epoch": 5.3809135399673735, + "grad_norm": 0.3224155008792877, + "learning_rate": 2.604034850788869e-05, + "loss": 0.0354, + "num_input_tokens_seen": 71196640, + "step": 32985 + }, + { + "epoch": 5.381729200652528, + "grad_norm": 0.09212839603424072, + "learning_rate": 2.6033236649396063e-05, + "loss": 0.0194, + "num_input_tokens_seen": 71208000, + "step": 32990 + }, + { + "epoch": 5.382544861337683, + "grad_norm": 0.3418068289756775, + "learning_rate": 2.6026124707144033e-05, + "loss": 0.0555, + "num_input_tokens_seen": 71219616, + "step": 32995 + }, + { + "epoch": 5.383360522022839, + "grad_norm": 0.6734557747840881, + "learning_rate": 2.6019012681709127e-05, + "loss": 0.1309, + "num_input_tokens_seen": 71230880, + "step": 33000 + }, + { + "epoch": 5.384176182707994, + "grad_norm": 1.8327505588531494, + "learning_rate": 2.601190057366788e-05, + "loss": 0.2139, + "num_input_tokens_seen": 71240800, + "step": 33005 + }, + { + "epoch": 5.3849918433931485, + "grad_norm": 0.2776627242565155, + "learning_rate": 2.600478838359684e-05, + "loss": 0.1205, + "num_input_tokens_seen": 71250784, + "step": 33010 + }, + { + "epoch": 5.385807504078303, + "grad_norm": 0.25524285435676575, + "learning_rate": 2.5997676112072557e-05, + "loss": 0.0799, + "num_input_tokens_seen": 71260576, + "step": 33015 + }, + { + "epoch": 5.386623164763458, + "grad_norm": 1.1821907758712769, + "learning_rate": 2.5990563759671575e-05, + "loss": 0.0566, + "num_input_tokens_seen": 71270976, + "step": 33020 + }, + { + "epoch": 5.387438825448613, + "grad_norm": 0.10980208963155746, + "learning_rate": 2.598345132697048e-05, + "loss": 0.1626, + "num_input_tokens_seen": 71281216, + "step": 33025 + }, + { + "epoch": 5.388254486133769, + "grad_norm": 0.5035930275917053, + "learning_rate": 2.597633881454583e-05, + "loss": 0.1625, + "num_input_tokens_seen": 71292064, + "step": 33030 + }, + { + "epoch": 5.3890701468189235, + "grad_norm": 0.4076540172100067, + "learning_rate": 2.5969226222974196e-05, + "loss": 0.1542, + "num_input_tokens_seen": 71303392, + "step": 33035 + }, + { + "epoch": 5.389885807504078, + "grad_norm": 0.05759120732545853, + "learning_rate": 2.5962113552832173e-05, + "loss": 0.0585, + "num_input_tokens_seen": 71314016, + "step": 33040 + }, + { + "epoch": 5.390701468189233, + "grad_norm": 0.2703090310096741, + "learning_rate": 2.5955000804696345e-05, + "loss": 0.1447, + "num_input_tokens_seen": 71326016, + "step": 33045 + }, + { + "epoch": 5.391517128874388, + "grad_norm": 0.49615153670310974, + "learning_rate": 2.5947887979143304e-05, + "loss": 0.1345, + "num_input_tokens_seen": 71335840, + "step": 33050 + }, + { + "epoch": 5.392332789559543, + "grad_norm": 0.15089276432991028, + "learning_rate": 2.594077507674965e-05, + "loss": 0.1271, + "num_input_tokens_seen": 71347072, + "step": 33055 + }, + { + "epoch": 5.3931484502446985, + "grad_norm": 0.9335330724716187, + "learning_rate": 2.5933662098091997e-05, + "loss": 0.0668, + "num_input_tokens_seen": 71358464, + "step": 33060 + }, + { + "epoch": 5.393964110929853, + "grad_norm": 0.6251490712165833, + "learning_rate": 2.5926549043746962e-05, + "loss": 0.0833, + "num_input_tokens_seen": 71367776, + "step": 33065 + }, + { + "epoch": 5.394779771615008, + "grad_norm": 0.5371342301368713, + "learning_rate": 2.591943591429115e-05, + "loss": 0.0389, + "num_input_tokens_seen": 71378368, + "step": 33070 + }, + { + "epoch": 5.395595432300163, + "grad_norm": 1.8749233484268188, + "learning_rate": 2.5912322710301202e-05, + "loss": 0.1518, + "num_input_tokens_seen": 71389632, + "step": 33075 + }, + { + "epoch": 5.396411092985318, + "grad_norm": 0.3144783675670624, + "learning_rate": 2.590520943235375e-05, + "loss": 0.0265, + "num_input_tokens_seen": 71400064, + "step": 33080 + }, + { + "epoch": 5.397226753670473, + "grad_norm": 0.39402276277542114, + "learning_rate": 2.5898096081025424e-05, + "loss": 0.0688, + "num_input_tokens_seen": 71410784, + "step": 33085 + }, + { + "epoch": 5.398042414355628, + "grad_norm": 1.952614426612854, + "learning_rate": 2.589098265689287e-05, + "loss": 0.2322, + "num_input_tokens_seen": 71421632, + "step": 33090 + }, + { + "epoch": 5.398858075040783, + "grad_norm": 0.11696138978004456, + "learning_rate": 2.5883869160532743e-05, + "loss": 0.2238, + "num_input_tokens_seen": 71432640, + "step": 33095 + }, + { + "epoch": 5.399673735725938, + "grad_norm": 0.4042474627494812, + "learning_rate": 2.58767555925217e-05, + "loss": 0.0285, + "num_input_tokens_seen": 71442816, + "step": 33100 + }, + { + "epoch": 5.400489396411093, + "grad_norm": 0.35978254675865173, + "learning_rate": 2.5869641953436402e-05, + "loss": 0.0529, + "num_input_tokens_seen": 71453568, + "step": 33105 + }, + { + "epoch": 5.401305057096248, + "grad_norm": 0.14386510848999023, + "learning_rate": 2.5862528243853513e-05, + "loss": 0.0949, + "num_input_tokens_seen": 71464448, + "step": 33110 + }, + { + "epoch": 5.402120717781403, + "grad_norm": 0.05822209268808365, + "learning_rate": 2.5855414464349707e-05, + "loss": 0.1064, + "num_input_tokens_seen": 71475264, + "step": 33115 + }, + { + "epoch": 5.402936378466558, + "grad_norm": 0.9655589461326599, + "learning_rate": 2.5848300615501663e-05, + "loss": 0.108, + "num_input_tokens_seen": 71484704, + "step": 33120 + }, + { + "epoch": 5.403752039151713, + "grad_norm": 1.0961092710494995, + "learning_rate": 2.5841186697886065e-05, + "loss": 0.0966, + "num_input_tokens_seen": 71494592, + "step": 33125 + }, + { + "epoch": 5.404567699836868, + "grad_norm": 3.1062636375427246, + "learning_rate": 2.583407271207961e-05, + "loss": 0.2855, + "num_input_tokens_seen": 71505568, + "step": 33130 + }, + { + "epoch": 5.4053833605220225, + "grad_norm": 0.03059573657810688, + "learning_rate": 2.582695865865899e-05, + "loss": 0.0819, + "num_input_tokens_seen": 71515872, + "step": 33135 + }, + { + "epoch": 5.406199021207178, + "grad_norm": 0.13387155532836914, + "learning_rate": 2.5819844538200906e-05, + "loss": 0.1153, + "num_input_tokens_seen": 71526720, + "step": 33140 + }, + { + "epoch": 5.407014681892333, + "grad_norm": 0.06607962399721146, + "learning_rate": 2.5812730351282056e-05, + "loss": 0.1033, + "num_input_tokens_seen": 71537856, + "step": 33145 + }, + { + "epoch": 5.407830342577488, + "grad_norm": 0.4317404627799988, + "learning_rate": 2.5805616098479167e-05, + "loss": 0.114, + "num_input_tokens_seen": 71546624, + "step": 33150 + }, + { + "epoch": 5.408646003262643, + "grad_norm": 0.052899762988090515, + "learning_rate": 2.5798501780368944e-05, + "loss": 0.2489, + "num_input_tokens_seen": 71556864, + "step": 33155 + }, + { + "epoch": 5.4094616639477975, + "grad_norm": 0.7092944979667664, + "learning_rate": 2.5791387397528123e-05, + "loss": 0.122, + "num_input_tokens_seen": 71566848, + "step": 33160 + }, + { + "epoch": 5.410277324632952, + "grad_norm": 0.8679232597351074, + "learning_rate": 2.578427295053341e-05, + "loss": 0.0849, + "num_input_tokens_seen": 71576544, + "step": 33165 + }, + { + "epoch": 5.411092985318108, + "grad_norm": 0.15377336740493774, + "learning_rate": 2.5777158439961564e-05, + "loss": 0.1547, + "num_input_tokens_seen": 71586816, + "step": 33170 + }, + { + "epoch": 5.411908646003263, + "grad_norm": 0.6793223023414612, + "learning_rate": 2.577004386638931e-05, + "loss": 0.0471, + "num_input_tokens_seen": 71596480, + "step": 33175 + }, + { + "epoch": 5.412724306688418, + "grad_norm": 0.22455202043056488, + "learning_rate": 2.576292923039339e-05, + "loss": 0.0911, + "num_input_tokens_seen": 71607968, + "step": 33180 + }, + { + "epoch": 5.4135399673735725, + "grad_norm": 1.665393590927124, + "learning_rate": 2.5755814532550553e-05, + "loss": 0.1692, + "num_input_tokens_seen": 71618560, + "step": 33185 + }, + { + "epoch": 5.414355628058727, + "grad_norm": 0.06284108012914658, + "learning_rate": 2.574869977343756e-05, + "loss": 0.0388, + "num_input_tokens_seen": 71629952, + "step": 33190 + }, + { + "epoch": 5.415171288743883, + "grad_norm": 0.7149102687835693, + "learning_rate": 2.574158495363117e-05, + "loss": 0.1059, + "num_input_tokens_seen": 71639296, + "step": 33195 + }, + { + "epoch": 5.415986949429038, + "grad_norm": 0.6928590536117554, + "learning_rate": 2.5734470073708133e-05, + "loss": 0.1137, + "num_input_tokens_seen": 71649760, + "step": 33200 + }, + { + "epoch": 5.416802610114193, + "grad_norm": 0.6785303950309753, + "learning_rate": 2.572735513424523e-05, + "loss": 0.076, + "num_input_tokens_seen": 71660800, + "step": 33205 + }, + { + "epoch": 5.417618270799347, + "grad_norm": 0.4337913393974304, + "learning_rate": 2.5720240135819223e-05, + "loss": 0.0603, + "num_input_tokens_seen": 71671104, + "step": 33210 + }, + { + "epoch": 5.418433931484502, + "grad_norm": 1.543366551399231, + "learning_rate": 2.57131250790069e-05, + "loss": 0.1652, + "num_input_tokens_seen": 71681568, + "step": 33215 + }, + { + "epoch": 5.419249592169657, + "grad_norm": 0.41521987318992615, + "learning_rate": 2.570600996438504e-05, + "loss": 0.2313, + "num_input_tokens_seen": 71691008, + "step": 33220 + }, + { + "epoch": 5.420065252854813, + "grad_norm": 0.4584099352359772, + "learning_rate": 2.5698894792530432e-05, + "loss": 0.0558, + "num_input_tokens_seen": 71702112, + "step": 33225 + }, + { + "epoch": 5.420880913539968, + "grad_norm": 0.4541694223880768, + "learning_rate": 2.5691779564019862e-05, + "loss": 0.0875, + "num_input_tokens_seen": 71713696, + "step": 33230 + }, + { + "epoch": 5.421696574225122, + "grad_norm": 1.3273344039916992, + "learning_rate": 2.5684664279430125e-05, + "loss": 0.2778, + "num_input_tokens_seen": 71724800, + "step": 33235 + }, + { + "epoch": 5.422512234910277, + "grad_norm": 0.642754316329956, + "learning_rate": 2.5677548939338035e-05, + "loss": 0.0546, + "num_input_tokens_seen": 71736128, + "step": 33240 + }, + { + "epoch": 5.423327895595432, + "grad_norm": 0.5210699439048767, + "learning_rate": 2.5670433544320388e-05, + "loss": 0.1084, + "num_input_tokens_seen": 71746912, + "step": 33245 + }, + { + "epoch": 5.424143556280587, + "grad_norm": 0.9327448606491089, + "learning_rate": 2.5663318094953997e-05, + "loss": 0.2143, + "num_input_tokens_seen": 71757888, + "step": 33250 + }, + { + "epoch": 5.424959216965743, + "grad_norm": 0.1126241534948349, + "learning_rate": 2.5656202591815675e-05, + "loss": 0.0737, + "num_input_tokens_seen": 71769504, + "step": 33255 + }, + { + "epoch": 5.425774877650897, + "grad_norm": 1.4397854804992676, + "learning_rate": 2.5649087035482243e-05, + "loss": 0.1191, + "num_input_tokens_seen": 71780352, + "step": 33260 + }, + { + "epoch": 5.426590538336052, + "grad_norm": 1.411329746246338, + "learning_rate": 2.5641971426530525e-05, + "loss": 0.1502, + "num_input_tokens_seen": 71791488, + "step": 33265 + }, + { + "epoch": 5.427406199021207, + "grad_norm": 0.9133142232894897, + "learning_rate": 2.5634855765537347e-05, + "loss": 0.0992, + "num_input_tokens_seen": 71801728, + "step": 33270 + }, + { + "epoch": 5.428221859706362, + "grad_norm": 1.8517613410949707, + "learning_rate": 2.5627740053079534e-05, + "loss": 0.124, + "num_input_tokens_seen": 71812096, + "step": 33275 + }, + { + "epoch": 5.4290375203915175, + "grad_norm": 0.4363913834095001, + "learning_rate": 2.562062428973393e-05, + "loss": 0.0439, + "num_input_tokens_seen": 71822752, + "step": 33280 + }, + { + "epoch": 5.429853181076672, + "grad_norm": 0.11947920173406601, + "learning_rate": 2.5613508476077365e-05, + "loss": 0.059, + "num_input_tokens_seen": 71833056, + "step": 33285 + }, + { + "epoch": 5.430668841761827, + "grad_norm": 0.05347622558474541, + "learning_rate": 2.5606392612686697e-05, + "loss": 0.1385, + "num_input_tokens_seen": 71842528, + "step": 33290 + }, + { + "epoch": 5.431484502446982, + "grad_norm": 0.7026997208595276, + "learning_rate": 2.5599276700138764e-05, + "loss": 0.0873, + "num_input_tokens_seen": 71853440, + "step": 33295 + }, + { + "epoch": 5.432300163132137, + "grad_norm": 0.15601125359535217, + "learning_rate": 2.5592160739010425e-05, + "loss": 0.1011, + "num_input_tokens_seen": 71863040, + "step": 33300 + }, + { + "epoch": 5.433115823817292, + "grad_norm": 0.04944843798875809, + "learning_rate": 2.5585044729878526e-05, + "loss": 0.1222, + "num_input_tokens_seen": 71873792, + "step": 33305 + }, + { + "epoch": 5.433931484502447, + "grad_norm": 1.1231180429458618, + "learning_rate": 2.557792867331994e-05, + "loss": 0.1285, + "num_input_tokens_seen": 71885856, + "step": 33310 + }, + { + "epoch": 5.434747145187602, + "grad_norm": 0.7335122227668762, + "learning_rate": 2.5570812569911518e-05, + "loss": 0.0565, + "num_input_tokens_seen": 71896416, + "step": 33315 + }, + { + "epoch": 5.435562805872757, + "grad_norm": 0.14499059319496155, + "learning_rate": 2.556369642023013e-05, + "loss": 0.0791, + "num_input_tokens_seen": 71906976, + "step": 33320 + }, + { + "epoch": 5.436378466557912, + "grad_norm": 0.08466537296772003, + "learning_rate": 2.5556580224852655e-05, + "loss": 0.096, + "num_input_tokens_seen": 71916864, + "step": 33325 + }, + { + "epoch": 5.437194127243067, + "grad_norm": 0.23324784636497498, + "learning_rate": 2.5549463984355964e-05, + "loss": 0.0353, + "num_input_tokens_seen": 71928288, + "step": 33330 + }, + { + "epoch": 5.438009787928221, + "grad_norm": 0.1635974645614624, + "learning_rate": 2.5542347699316933e-05, + "loss": 0.0881, + "num_input_tokens_seen": 71939104, + "step": 33335 + }, + { + "epoch": 5.438825448613377, + "grad_norm": 0.35775065422058105, + "learning_rate": 2.553523137031244e-05, + "loss": 0.0472, + "num_input_tokens_seen": 71950208, + "step": 33340 + }, + { + "epoch": 5.439641109298532, + "grad_norm": 1.415647029876709, + "learning_rate": 2.5528114997919384e-05, + "loss": 0.3089, + "num_input_tokens_seen": 71961312, + "step": 33345 + }, + { + "epoch": 5.440456769983687, + "grad_norm": 1.4096287488937378, + "learning_rate": 2.5520998582714645e-05, + "loss": 0.2896, + "num_input_tokens_seen": 71973248, + "step": 33350 + }, + { + "epoch": 5.441272430668842, + "grad_norm": 0.060523271560668945, + "learning_rate": 2.5513882125275113e-05, + "loss": 0.0346, + "num_input_tokens_seen": 71983648, + "step": 33355 + }, + { + "epoch": 5.442088091353996, + "grad_norm": 0.08715082705020905, + "learning_rate": 2.5506765626177697e-05, + "loss": 0.0684, + "num_input_tokens_seen": 71995072, + "step": 33360 + }, + { + "epoch": 5.442903752039152, + "grad_norm": 1.527961254119873, + "learning_rate": 2.5499649085999282e-05, + "loss": 0.1719, + "num_input_tokens_seen": 72006112, + "step": 33365 + }, + { + "epoch": 5.443719412724307, + "grad_norm": 0.09341130405664444, + "learning_rate": 2.549253250531678e-05, + "loss": 0.0387, + "num_input_tokens_seen": 72017184, + "step": 33370 + }, + { + "epoch": 5.444535073409462, + "grad_norm": 0.6595378518104553, + "learning_rate": 2.548541588470709e-05, + "loss": 0.1019, + "num_input_tokens_seen": 72028352, + "step": 33375 + }, + { + "epoch": 5.445350734094617, + "grad_norm": 0.06843893975019455, + "learning_rate": 2.547829922474713e-05, + "loss": 0.1189, + "num_input_tokens_seen": 72037664, + "step": 33380 + }, + { + "epoch": 5.446166394779771, + "grad_norm": 0.1423833966255188, + "learning_rate": 2.5471182526013805e-05, + "loss": 0.0659, + "num_input_tokens_seen": 72049120, + "step": 33385 + }, + { + "epoch": 5.446982055464926, + "grad_norm": 0.9286152124404907, + "learning_rate": 2.546406578908403e-05, + "loss": 0.1918, + "num_input_tokens_seen": 72059104, + "step": 33390 + }, + { + "epoch": 5.447797716150082, + "grad_norm": 1.8825623989105225, + "learning_rate": 2.545694901453473e-05, + "loss": 0.1121, + "num_input_tokens_seen": 72070048, + "step": 33395 + }, + { + "epoch": 5.448613376835237, + "grad_norm": 0.24566085636615753, + "learning_rate": 2.5449832202942832e-05, + "loss": 0.2327, + "num_input_tokens_seen": 72080928, + "step": 33400 + }, + { + "epoch": 5.4494290375203915, + "grad_norm": 0.08398690074682236, + "learning_rate": 2.5442715354885237e-05, + "loss": 0.0643, + "num_input_tokens_seen": 72091488, + "step": 33405 + }, + { + "epoch": 5.450244698205546, + "grad_norm": 0.596794605255127, + "learning_rate": 2.5435598470938903e-05, + "loss": 0.0541, + "num_input_tokens_seen": 72102528, + "step": 33410 + }, + { + "epoch": 5.451060358890701, + "grad_norm": 0.731020450592041, + "learning_rate": 2.5428481551680745e-05, + "loss": 0.0813, + "num_input_tokens_seen": 72112992, + "step": 33415 + }, + { + "epoch": 5.451876019575856, + "grad_norm": 0.5819640755653381, + "learning_rate": 2.5421364597687696e-05, + "loss": 0.0703, + "num_input_tokens_seen": 72124096, + "step": 33420 + }, + { + "epoch": 5.452691680261012, + "grad_norm": 0.31213757395744324, + "learning_rate": 2.5414247609536696e-05, + "loss": 0.0686, + "num_input_tokens_seen": 72134624, + "step": 33425 + }, + { + "epoch": 5.4535073409461665, + "grad_norm": 1.0890876054763794, + "learning_rate": 2.5407130587804685e-05, + "loss": 0.2022, + "num_input_tokens_seen": 72145344, + "step": 33430 + }, + { + "epoch": 5.454323001631321, + "grad_norm": 0.36570850014686584, + "learning_rate": 2.5400013533068594e-05, + "loss": 0.0753, + "num_input_tokens_seen": 72157312, + "step": 33435 + }, + { + "epoch": 5.455138662316476, + "grad_norm": 1.4240550994873047, + "learning_rate": 2.5392896445905385e-05, + "loss": 0.223, + "num_input_tokens_seen": 72168032, + "step": 33440 + }, + { + "epoch": 5.455954323001631, + "grad_norm": 0.4986121952533722, + "learning_rate": 2.538577932689199e-05, + "loss": 0.0775, + "num_input_tokens_seen": 72178944, + "step": 33445 + }, + { + "epoch": 5.456769983686787, + "grad_norm": 1.1091175079345703, + "learning_rate": 2.537866217660537e-05, + "loss": 0.1015, + "num_input_tokens_seen": 72188640, + "step": 33450 + }, + { + "epoch": 5.4575856443719415, + "grad_norm": 1.1750309467315674, + "learning_rate": 2.5371544995622472e-05, + "loss": 0.1978, + "num_input_tokens_seen": 72198720, + "step": 33455 + }, + { + "epoch": 5.458401305057096, + "grad_norm": 0.04448351263999939, + "learning_rate": 2.536442778452025e-05, + "loss": 0.1146, + "num_input_tokens_seen": 72208704, + "step": 33460 + }, + { + "epoch": 5.459216965742251, + "grad_norm": 0.13202178478240967, + "learning_rate": 2.5357310543875667e-05, + "loss": 0.0888, + "num_input_tokens_seen": 72218688, + "step": 33465 + }, + { + "epoch": 5.460032626427406, + "grad_norm": 1.0137995481491089, + "learning_rate": 2.5350193274265678e-05, + "loss": 0.105, + "num_input_tokens_seen": 72230336, + "step": 33470 + }, + { + "epoch": 5.460848287112561, + "grad_norm": 0.15469439327716827, + "learning_rate": 2.5343075976267234e-05, + "loss": 0.0272, + "num_input_tokens_seen": 72241024, + "step": 33475 + }, + { + "epoch": 5.4616639477977165, + "grad_norm": 0.6689593195915222, + "learning_rate": 2.533595865045732e-05, + "loss": 0.1012, + "num_input_tokens_seen": 72252064, + "step": 33480 + }, + { + "epoch": 5.462479608482871, + "grad_norm": 1.4603744745254517, + "learning_rate": 2.532884129741289e-05, + "loss": 0.2382, + "num_input_tokens_seen": 72263616, + "step": 33485 + }, + { + "epoch": 5.463295269168026, + "grad_norm": 1.1756455898284912, + "learning_rate": 2.5321723917710923e-05, + "loss": 0.2562, + "num_input_tokens_seen": 72274784, + "step": 33490 + }, + { + "epoch": 5.464110929853181, + "grad_norm": 0.08495238423347473, + "learning_rate": 2.531460651192838e-05, + "loss": 0.0952, + "num_input_tokens_seen": 72284064, + "step": 33495 + }, + { + "epoch": 5.464926590538336, + "grad_norm": 1.212573766708374, + "learning_rate": 2.5307489080642227e-05, + "loss": 0.1805, + "num_input_tokens_seen": 72295616, + "step": 33500 + }, + { + "epoch": 5.465742251223491, + "grad_norm": 0.4000867009162903, + "learning_rate": 2.530037162442946e-05, + "loss": 0.2221, + "num_input_tokens_seen": 72305952, + "step": 33505 + }, + { + "epoch": 5.466557911908646, + "grad_norm": 0.30234313011169434, + "learning_rate": 2.529325414386704e-05, + "loss": 0.1959, + "num_input_tokens_seen": 72316512, + "step": 33510 + }, + { + "epoch": 5.467373572593801, + "grad_norm": 0.12931102514266968, + "learning_rate": 2.5286136639531956e-05, + "loss": 0.1823, + "num_input_tokens_seen": 72327328, + "step": 33515 + }, + { + "epoch": 5.468189233278956, + "grad_norm": 1.0303187370300293, + "learning_rate": 2.527901911200118e-05, + "loss": 0.0841, + "num_input_tokens_seen": 72338560, + "step": 33520 + }, + { + "epoch": 5.469004893964111, + "grad_norm": 0.8110071420669556, + "learning_rate": 2.5271901561851703e-05, + "loss": 0.1127, + "num_input_tokens_seen": 72349312, + "step": 33525 + }, + { + "epoch": 5.4698205546492655, + "grad_norm": 0.7760509848594666, + "learning_rate": 2.52647839896605e-05, + "loss": 0.0678, + "num_input_tokens_seen": 72361024, + "step": 33530 + }, + { + "epoch": 5.470636215334421, + "grad_norm": 0.25321927666664124, + "learning_rate": 2.525766639600457e-05, + "loss": 0.2418, + "num_input_tokens_seen": 72372352, + "step": 33535 + }, + { + "epoch": 5.471451876019576, + "grad_norm": 0.16550378501415253, + "learning_rate": 2.525054878146089e-05, + "loss": 0.0734, + "num_input_tokens_seen": 72383136, + "step": 33540 + }, + { + "epoch": 5.472267536704731, + "grad_norm": 0.19652751088142395, + "learning_rate": 2.5243431146606456e-05, + "loss": 0.017, + "num_input_tokens_seen": 72392224, + "step": 33545 + }, + { + "epoch": 5.473083197389886, + "grad_norm": 0.1535763442516327, + "learning_rate": 2.5236313492018254e-05, + "loss": 0.0344, + "num_input_tokens_seen": 72402176, + "step": 33550 + }, + { + "epoch": 5.4738988580750405, + "grad_norm": 1.3598815202713013, + "learning_rate": 2.5229195818273284e-05, + "loss": 0.1952, + "num_input_tokens_seen": 72413792, + "step": 33555 + }, + { + "epoch": 5.474714518760196, + "grad_norm": 1.1835277080535889, + "learning_rate": 2.5222078125948534e-05, + "loss": 0.1843, + "num_input_tokens_seen": 72425792, + "step": 33560 + }, + { + "epoch": 5.475530179445351, + "grad_norm": 0.06823533028364182, + "learning_rate": 2.5214960415621007e-05, + "loss": 0.0359, + "num_input_tokens_seen": 72436864, + "step": 33565 + }, + { + "epoch": 5.476345840130506, + "grad_norm": 0.40672850608825684, + "learning_rate": 2.5207842687867705e-05, + "loss": 0.0411, + "num_input_tokens_seen": 72448096, + "step": 33570 + }, + { + "epoch": 5.477161500815661, + "grad_norm": 0.6308793425559998, + "learning_rate": 2.5200724943265614e-05, + "loss": 0.0529, + "num_input_tokens_seen": 72459776, + "step": 33575 + }, + { + "epoch": 5.4779771615008155, + "grad_norm": 2.7459938526153564, + "learning_rate": 2.519360718239174e-05, + "loss": 0.2566, + "num_input_tokens_seen": 72470528, + "step": 33580 + }, + { + "epoch": 5.47879282218597, + "grad_norm": 1.7168710231781006, + "learning_rate": 2.5186489405823087e-05, + "loss": 0.3503, + "num_input_tokens_seen": 72482144, + "step": 33585 + }, + { + "epoch": 5.479608482871126, + "grad_norm": 1.1908953189849854, + "learning_rate": 2.517937161413666e-05, + "loss": 0.1557, + "num_input_tokens_seen": 72493504, + "step": 33590 + }, + { + "epoch": 5.480424143556281, + "grad_norm": 0.038211409002542496, + "learning_rate": 2.517225380790946e-05, + "loss": 0.1017, + "num_input_tokens_seen": 72503904, + "step": 33595 + }, + { + "epoch": 5.481239804241436, + "grad_norm": 1.3332563638687134, + "learning_rate": 2.5165135987718486e-05, + "loss": 0.2588, + "num_input_tokens_seen": 72514560, + "step": 33600 + }, + { + "epoch": 5.4820554649265905, + "grad_norm": 0.3832308351993561, + "learning_rate": 2.515801815414075e-05, + "loss": 0.1162, + "num_input_tokens_seen": 72526080, + "step": 33605 + }, + { + "epoch": 5.482871125611745, + "grad_norm": 0.10727083683013916, + "learning_rate": 2.5150900307753267e-05, + "loss": 0.1386, + "num_input_tokens_seen": 72536992, + "step": 33610 + }, + { + "epoch": 5.4836867862969, + "grad_norm": 0.32799169421195984, + "learning_rate": 2.5143782449133036e-05, + "loss": 0.1251, + "num_input_tokens_seen": 72548224, + "step": 33615 + }, + { + "epoch": 5.484502446982056, + "grad_norm": 0.14705663919448853, + "learning_rate": 2.5136664578857072e-05, + "loss": 0.0723, + "num_input_tokens_seen": 72559744, + "step": 33620 + }, + { + "epoch": 5.485318107667211, + "grad_norm": 0.511358916759491, + "learning_rate": 2.5129546697502382e-05, + "loss": 0.0747, + "num_input_tokens_seen": 72570528, + "step": 33625 + }, + { + "epoch": 5.486133768352365, + "grad_norm": 0.6802677512168884, + "learning_rate": 2.512242880564598e-05, + "loss": 0.1729, + "num_input_tokens_seen": 72582112, + "step": 33630 + }, + { + "epoch": 5.48694942903752, + "grad_norm": 0.12909795343875885, + "learning_rate": 2.5115310903864874e-05, + "loss": 0.1401, + "num_input_tokens_seen": 72592736, + "step": 33635 + }, + { + "epoch": 5.487765089722675, + "grad_norm": 0.09353944659233093, + "learning_rate": 2.510819299273609e-05, + "loss": 0.0637, + "num_input_tokens_seen": 72604096, + "step": 33640 + }, + { + "epoch": 5.488580750407831, + "grad_norm": 0.8127201199531555, + "learning_rate": 2.510107507283663e-05, + "loss": 0.0911, + "num_input_tokens_seen": 72614560, + "step": 33645 + }, + { + "epoch": 5.489396411092986, + "grad_norm": 1.1064848899841309, + "learning_rate": 2.5093957144743507e-05, + "loss": 0.2183, + "num_input_tokens_seen": 72625536, + "step": 33650 + }, + { + "epoch": 5.49021207177814, + "grad_norm": 0.26609623432159424, + "learning_rate": 2.5086839209033747e-05, + "loss": 0.2519, + "num_input_tokens_seen": 72636384, + "step": 33655 + }, + { + "epoch": 5.491027732463295, + "grad_norm": 1.6713817119598389, + "learning_rate": 2.507972126628435e-05, + "loss": 0.231, + "num_input_tokens_seen": 72646272, + "step": 33660 + }, + { + "epoch": 5.49184339314845, + "grad_norm": 0.06328891962766647, + "learning_rate": 2.5072603317072353e-05, + "loss": 0.1506, + "num_input_tokens_seen": 72657184, + "step": 33665 + }, + { + "epoch": 5.492659053833605, + "grad_norm": 0.5628859996795654, + "learning_rate": 2.5065485361974754e-05, + "loss": 0.1499, + "num_input_tokens_seen": 72668544, + "step": 33670 + }, + { + "epoch": 5.493474714518761, + "grad_norm": 0.0631362721323967, + "learning_rate": 2.505836740156859e-05, + "loss": 0.0642, + "num_input_tokens_seen": 72679104, + "step": 33675 + }, + { + "epoch": 5.494290375203915, + "grad_norm": 0.8975023031234741, + "learning_rate": 2.5051249436430862e-05, + "loss": 0.2144, + "num_input_tokens_seen": 72689856, + "step": 33680 + }, + { + "epoch": 5.49510603588907, + "grad_norm": 0.8430812358856201, + "learning_rate": 2.5044131467138597e-05, + "loss": 0.0874, + "num_input_tokens_seen": 72701312, + "step": 33685 + }, + { + "epoch": 5.495921696574225, + "grad_norm": 0.10496621578931808, + "learning_rate": 2.5037013494268814e-05, + "loss": 0.0189, + "num_input_tokens_seen": 72711808, + "step": 33690 + }, + { + "epoch": 5.49673735725938, + "grad_norm": 0.3913768529891968, + "learning_rate": 2.502989551839852e-05, + "loss": 0.0699, + "num_input_tokens_seen": 72722528, + "step": 33695 + }, + { + "epoch": 5.497553017944535, + "grad_norm": 1.6516674757003784, + "learning_rate": 2.5022777540104752e-05, + "loss": 0.1807, + "num_input_tokens_seen": 72733408, + "step": 33700 + }, + { + "epoch": 5.49836867862969, + "grad_norm": 1.1062999963760376, + "learning_rate": 2.5015659559964516e-05, + "loss": 0.1788, + "num_input_tokens_seen": 72744544, + "step": 33705 + }, + { + "epoch": 5.499184339314845, + "grad_norm": 0.16313856840133667, + "learning_rate": 2.5008541578554838e-05, + "loss": 0.068, + "num_input_tokens_seen": 72755712, + "step": 33710 + }, + { + "epoch": 5.5, + "grad_norm": 2.4327638149261475, + "learning_rate": 2.5001423596452738e-05, + "loss": 0.1577, + "num_input_tokens_seen": 72765696, + "step": 33715 + }, + { + "epoch": 5.5, + "eval_loss": 0.13759736716747284, + "eval_runtime": 131.7726, + "eval_samples_per_second": 20.68, + "eval_steps_per_second": 5.176, + "num_input_tokens_seen": 72765696, + "step": 33715 + }, + { + "epoch": 5.500815660685155, + "grad_norm": 0.6229026913642883, + "learning_rate": 2.4994305614235228e-05, + "loss": 0.0722, + "num_input_tokens_seen": 72776576, + "step": 33720 + }, + { + "epoch": 5.50163132137031, + "grad_norm": 1.0697559118270874, + "learning_rate": 2.498718763247934e-05, + "loss": 0.054, + "num_input_tokens_seen": 72786112, + "step": 33725 + }, + { + "epoch": 5.502446982055465, + "grad_norm": 0.021476686000823975, + "learning_rate": 2.4980069651762085e-05, + "loss": 0.0635, + "num_input_tokens_seen": 72796384, + "step": 33730 + }, + { + "epoch": 5.50326264274062, + "grad_norm": 0.22370800375938416, + "learning_rate": 2.4972951672660487e-05, + "loss": 0.0705, + "num_input_tokens_seen": 72806720, + "step": 33735 + }, + { + "epoch": 5.504078303425775, + "grad_norm": 1.59718918800354, + "learning_rate": 2.4965833695751563e-05, + "loss": 0.1712, + "num_input_tokens_seen": 72817408, + "step": 33740 + }, + { + "epoch": 5.50489396411093, + "grad_norm": 0.9414946436882019, + "learning_rate": 2.4958715721612335e-05, + "loss": 0.1836, + "num_input_tokens_seen": 72828096, + "step": 33745 + }, + { + "epoch": 5.505709624796085, + "grad_norm": 0.13373306393623352, + "learning_rate": 2.495159775081982e-05, + "loss": 0.033, + "num_input_tokens_seen": 72839040, + "step": 33750 + }, + { + "epoch": 5.506525285481239, + "grad_norm": 0.6756805181503296, + "learning_rate": 2.4944479783951037e-05, + "loss": 0.1072, + "num_input_tokens_seen": 72849728, + "step": 33755 + }, + { + "epoch": 5.507340946166395, + "grad_norm": 0.10357353091239929, + "learning_rate": 2.4937361821583e-05, + "loss": 0.1426, + "num_input_tokens_seen": 72859872, + "step": 33760 + }, + { + "epoch": 5.50815660685155, + "grad_norm": 1.826023817062378, + "learning_rate": 2.4930243864292736e-05, + "loss": 0.3352, + "num_input_tokens_seen": 72871648, + "step": 33765 + }, + { + "epoch": 5.508972267536705, + "grad_norm": 0.18868030607700348, + "learning_rate": 2.492312591265726e-05, + "loss": 0.0505, + "num_input_tokens_seen": 72883136, + "step": 33770 + }, + { + "epoch": 5.50978792822186, + "grad_norm": 1.166529655456543, + "learning_rate": 2.4916007967253576e-05, + "loss": 0.0667, + "num_input_tokens_seen": 72892896, + "step": 33775 + }, + { + "epoch": 5.510603588907014, + "grad_norm": 0.04603581130504608, + "learning_rate": 2.490889002865872e-05, + "loss": 0.1391, + "num_input_tokens_seen": 72904576, + "step": 33780 + }, + { + "epoch": 5.511419249592169, + "grad_norm": 0.07419361174106598, + "learning_rate": 2.4901772097449703e-05, + "loss": 0.113, + "num_input_tokens_seen": 72914016, + "step": 33785 + }, + { + "epoch": 5.512234910277325, + "grad_norm": 0.12880775332450867, + "learning_rate": 2.4894654174203535e-05, + "loss": 0.0686, + "num_input_tokens_seen": 72925152, + "step": 33790 + }, + { + "epoch": 5.51305057096248, + "grad_norm": 0.5021657943725586, + "learning_rate": 2.488753625949723e-05, + "loss": 0.0525, + "num_input_tokens_seen": 72936320, + "step": 33795 + }, + { + "epoch": 5.513866231647635, + "grad_norm": 0.05625078082084656, + "learning_rate": 2.488041835390781e-05, + "loss": 0.0223, + "num_input_tokens_seen": 72946848, + "step": 33800 + }, + { + "epoch": 5.514681892332789, + "grad_norm": 0.1433025747537613, + "learning_rate": 2.4873300458012285e-05, + "loss": 0.0168, + "num_input_tokens_seen": 72958208, + "step": 33805 + }, + { + "epoch": 5.515497553017944, + "grad_norm": 0.6785904765129089, + "learning_rate": 2.486618257238767e-05, + "loss": 0.0442, + "num_input_tokens_seen": 72969152, + "step": 33810 + }, + { + "epoch": 5.5163132137031, + "grad_norm": 0.11947550624608994, + "learning_rate": 2.4859064697610977e-05, + "loss": 0.2639, + "num_input_tokens_seen": 72981664, + "step": 33815 + }, + { + "epoch": 5.517128874388255, + "grad_norm": 0.5220280289649963, + "learning_rate": 2.485194683425921e-05, + "loss": 0.0468, + "num_input_tokens_seen": 72992224, + "step": 33820 + }, + { + "epoch": 5.5179445350734095, + "grad_norm": 0.25924181938171387, + "learning_rate": 2.4844828982909388e-05, + "loss": 0.0833, + "num_input_tokens_seen": 73003712, + "step": 33825 + }, + { + "epoch": 5.518760195758564, + "grad_norm": 0.6291234493255615, + "learning_rate": 2.4837711144138514e-05, + "loss": 0.2957, + "num_input_tokens_seen": 73015328, + "step": 33830 + }, + { + "epoch": 5.519575856443719, + "grad_norm": 1.220653772354126, + "learning_rate": 2.48305933185236e-05, + "loss": 0.183, + "num_input_tokens_seen": 73026304, + "step": 33835 + }, + { + "epoch": 5.520391517128875, + "grad_norm": 0.36711016297340393, + "learning_rate": 2.4823475506641646e-05, + "loss": 0.3052, + "num_input_tokens_seen": 73038688, + "step": 33840 + }, + { + "epoch": 5.52120717781403, + "grad_norm": 1.0639771223068237, + "learning_rate": 2.481635770906967e-05, + "loss": 0.0757, + "num_input_tokens_seen": 73049824, + "step": 33845 + }, + { + "epoch": 5.5220228384991845, + "grad_norm": 0.11110055446624756, + "learning_rate": 2.4809239926384664e-05, + "loss": 0.1661, + "num_input_tokens_seen": 73061760, + "step": 33850 + }, + { + "epoch": 5.522838499184339, + "grad_norm": 0.07689385116100311, + "learning_rate": 2.480212215916364e-05, + "loss": 0.1261, + "num_input_tokens_seen": 73072704, + "step": 33855 + }, + { + "epoch": 5.523654159869494, + "grad_norm": 0.8870680928230286, + "learning_rate": 2.4795004407983593e-05, + "loss": 0.0859, + "num_input_tokens_seen": 73084032, + "step": 33860 + }, + { + "epoch": 5.524469820554649, + "grad_norm": 0.2513519823551178, + "learning_rate": 2.4787886673421536e-05, + "loss": 0.1266, + "num_input_tokens_seen": 73094592, + "step": 33865 + }, + { + "epoch": 5.525285481239804, + "grad_norm": 0.6530385613441467, + "learning_rate": 2.4780768956054457e-05, + "loss": 0.1226, + "num_input_tokens_seen": 73106304, + "step": 33870 + }, + { + "epoch": 5.5261011419249595, + "grad_norm": 0.08056148886680603, + "learning_rate": 2.477365125645936e-05, + "loss": 0.1072, + "num_input_tokens_seen": 73117088, + "step": 33875 + }, + { + "epoch": 5.526916802610114, + "grad_norm": 0.2694742977619171, + "learning_rate": 2.4766533575213242e-05, + "loss": 0.0569, + "num_input_tokens_seen": 73127776, + "step": 33880 + }, + { + "epoch": 5.527732463295269, + "grad_norm": 0.7621837854385376, + "learning_rate": 2.4759415912893096e-05, + "loss": 0.0386, + "num_input_tokens_seen": 73139904, + "step": 33885 + }, + { + "epoch": 5.528548123980424, + "grad_norm": 0.15608613193035126, + "learning_rate": 2.4752298270075918e-05, + "loss": 0.0228, + "num_input_tokens_seen": 73150688, + "step": 33890 + }, + { + "epoch": 5.529363784665579, + "grad_norm": 0.2243753969669342, + "learning_rate": 2.47451806473387e-05, + "loss": 0.1526, + "num_input_tokens_seen": 73160576, + "step": 33895 + }, + { + "epoch": 5.5301794453507345, + "grad_norm": 0.26516106724739075, + "learning_rate": 2.4738063045258415e-05, + "loss": 0.1361, + "num_input_tokens_seen": 73170272, + "step": 33900 + }, + { + "epoch": 5.530995106035889, + "grad_norm": 1.0799710750579834, + "learning_rate": 2.4730945464412085e-05, + "loss": 0.2118, + "num_input_tokens_seen": 73181760, + "step": 33905 + }, + { + "epoch": 5.531810766721044, + "grad_norm": 0.03487993776798248, + "learning_rate": 2.472382790537668e-05, + "loss": 0.0689, + "num_input_tokens_seen": 73192096, + "step": 33910 + }, + { + "epoch": 5.532626427406199, + "grad_norm": 1.4954665899276733, + "learning_rate": 2.4716710368729187e-05, + "loss": 0.1312, + "num_input_tokens_seen": 73201408, + "step": 33915 + }, + { + "epoch": 5.533442088091354, + "grad_norm": 1.4355415105819702, + "learning_rate": 2.4709592855046587e-05, + "loss": 0.1289, + "num_input_tokens_seen": 73211936, + "step": 33920 + }, + { + "epoch": 5.5342577487765094, + "grad_norm": 1.5141838788986206, + "learning_rate": 2.4702475364905864e-05, + "loss": 0.1655, + "num_input_tokens_seen": 73223392, + "step": 33925 + }, + { + "epoch": 5.535073409461664, + "grad_norm": 1.538330078125, + "learning_rate": 2.4695357898883998e-05, + "loss": 0.159, + "num_input_tokens_seen": 73233920, + "step": 33930 + }, + { + "epoch": 5.535889070146819, + "grad_norm": 0.48461493849754333, + "learning_rate": 2.4688240457557967e-05, + "loss": 0.0421, + "num_input_tokens_seen": 73244288, + "step": 33935 + }, + { + "epoch": 5.536704730831974, + "grad_norm": 0.8885757327079773, + "learning_rate": 2.4681123041504746e-05, + "loss": 0.219, + "num_input_tokens_seen": 73256256, + "step": 33940 + }, + { + "epoch": 5.537520391517129, + "grad_norm": 0.7745918035507202, + "learning_rate": 2.46740056513013e-05, + "loss": 0.2354, + "num_input_tokens_seen": 73267264, + "step": 33945 + }, + { + "epoch": 5.5383360522022835, + "grad_norm": 0.7361733317375183, + "learning_rate": 2.466688828752462e-05, + "loss": 0.2654, + "num_input_tokens_seen": 73279168, + "step": 33950 + }, + { + "epoch": 5.539151712887438, + "grad_norm": 1.042506217956543, + "learning_rate": 2.4659770950751666e-05, + "loss": 0.082, + "num_input_tokens_seen": 73288128, + "step": 33955 + }, + { + "epoch": 5.539967373572594, + "grad_norm": 1.3183423280715942, + "learning_rate": 2.4652653641559404e-05, + "loss": 0.2585, + "num_input_tokens_seen": 73297472, + "step": 33960 + }, + { + "epoch": 5.540783034257749, + "grad_norm": 0.29733794927597046, + "learning_rate": 2.46455363605248e-05, + "loss": 0.0985, + "num_input_tokens_seen": 73309504, + "step": 33965 + }, + { + "epoch": 5.541598694942904, + "grad_norm": 0.17515775561332703, + "learning_rate": 2.4638419108224817e-05, + "loss": 0.1214, + "num_input_tokens_seen": 73320192, + "step": 33970 + }, + { + "epoch": 5.5424143556280585, + "grad_norm": 0.07302232086658478, + "learning_rate": 2.4631301885236415e-05, + "loss": 0.194, + "num_input_tokens_seen": 73331168, + "step": 33975 + }, + { + "epoch": 5.543230016313213, + "grad_norm": 1.5229003429412842, + "learning_rate": 2.4624184692136554e-05, + "loss": 0.105, + "num_input_tokens_seen": 73342944, + "step": 33980 + }, + { + "epoch": 5.544045676998369, + "grad_norm": 0.07922688871622086, + "learning_rate": 2.4617067529502188e-05, + "loss": 0.0496, + "num_input_tokens_seen": 73353920, + "step": 33985 + }, + { + "epoch": 5.544861337683524, + "grad_norm": 1.6471694707870483, + "learning_rate": 2.460995039791027e-05, + "loss": 0.3382, + "num_input_tokens_seen": 73366432, + "step": 33990 + }, + { + "epoch": 5.545676998368679, + "grad_norm": 0.11130040138959885, + "learning_rate": 2.4602833297937755e-05, + "loss": 0.0982, + "num_input_tokens_seen": 73377824, + "step": 33995 + }, + { + "epoch": 5.5464926590538335, + "grad_norm": 0.5256956219673157, + "learning_rate": 2.4595716230161586e-05, + "loss": 0.0436, + "num_input_tokens_seen": 73387840, + "step": 34000 + }, + { + "epoch": 5.547308319738988, + "grad_norm": 0.6887928247451782, + "learning_rate": 2.45885991951587e-05, + "loss": 0.0916, + "num_input_tokens_seen": 73398304, + "step": 34005 + }, + { + "epoch": 5.548123980424144, + "grad_norm": 2.3785130977630615, + "learning_rate": 2.458148219350606e-05, + "loss": 0.2024, + "num_input_tokens_seen": 73410016, + "step": 34010 + }, + { + "epoch": 5.548939641109299, + "grad_norm": 1.1446316242218018, + "learning_rate": 2.45743652257806e-05, + "loss": 0.0871, + "num_input_tokens_seen": 73420960, + "step": 34015 + }, + { + "epoch": 5.549755301794454, + "grad_norm": 1.0986348390579224, + "learning_rate": 2.4567248292559253e-05, + "loss": 0.0821, + "num_input_tokens_seen": 73431712, + "step": 34020 + }, + { + "epoch": 5.5505709624796085, + "grad_norm": 1.583878993988037, + "learning_rate": 2.4560131394418958e-05, + "loss": 0.1875, + "num_input_tokens_seen": 73441376, + "step": 34025 + }, + { + "epoch": 5.551386623164763, + "grad_norm": 0.2926316261291504, + "learning_rate": 2.4553014531936632e-05, + "loss": 0.117, + "num_input_tokens_seen": 73450752, + "step": 34030 + }, + { + "epoch": 5.552202283849918, + "grad_norm": 1.5859044790267944, + "learning_rate": 2.4545897705689223e-05, + "loss": 0.1365, + "num_input_tokens_seen": 73461952, + "step": 34035 + }, + { + "epoch": 5.553017944535073, + "grad_norm": 0.584766685962677, + "learning_rate": 2.4538780916253657e-05, + "loss": 0.1294, + "num_input_tokens_seen": 73472064, + "step": 34040 + }, + { + "epoch": 5.553833605220229, + "grad_norm": 0.10464676469564438, + "learning_rate": 2.4531664164206843e-05, + "loss": 0.0275, + "num_input_tokens_seen": 73483200, + "step": 34045 + }, + { + "epoch": 5.554649265905383, + "grad_norm": 0.6264121532440186, + "learning_rate": 2.4524547450125713e-05, + "loss": 0.035, + "num_input_tokens_seen": 73494080, + "step": 34050 + }, + { + "epoch": 5.555464926590538, + "grad_norm": 0.3899288773536682, + "learning_rate": 2.4517430774587174e-05, + "loss": 0.1942, + "num_input_tokens_seen": 73503616, + "step": 34055 + }, + { + "epoch": 5.556280587275693, + "grad_norm": 0.6224768161773682, + "learning_rate": 2.4510314138168146e-05, + "loss": 0.1284, + "num_input_tokens_seen": 73515328, + "step": 34060 + }, + { + "epoch": 5.557096247960848, + "grad_norm": 0.09890153259038925, + "learning_rate": 2.4503197541445545e-05, + "loss": 0.1712, + "num_input_tokens_seen": 73527520, + "step": 34065 + }, + { + "epoch": 5.557911908646004, + "grad_norm": 0.903719425201416, + "learning_rate": 2.4496080984996264e-05, + "loss": 0.1953, + "num_input_tokens_seen": 73538688, + "step": 34070 + }, + { + "epoch": 5.558727569331158, + "grad_norm": 0.5276505351066589, + "learning_rate": 2.448896446939722e-05, + "loss": 0.1336, + "num_input_tokens_seen": 73548544, + "step": 34075 + }, + { + "epoch": 5.559543230016313, + "grad_norm": 0.44410911202430725, + "learning_rate": 2.4481847995225307e-05, + "loss": 0.0662, + "num_input_tokens_seen": 73559712, + "step": 34080 + }, + { + "epoch": 5.560358890701468, + "grad_norm": 0.4711245596408844, + "learning_rate": 2.4474731563057426e-05, + "loss": 0.0326, + "num_input_tokens_seen": 73570304, + "step": 34085 + }, + { + "epoch": 5.561174551386623, + "grad_norm": 0.8677898645401001, + "learning_rate": 2.446761517347046e-05, + "loss": 0.1508, + "num_input_tokens_seen": 73581056, + "step": 34090 + }, + { + "epoch": 5.561990212071779, + "grad_norm": 0.2630501091480255, + "learning_rate": 2.446049882704132e-05, + "loss": 0.0422, + "num_input_tokens_seen": 73592064, + "step": 34095 + }, + { + "epoch": 5.562805872756933, + "grad_norm": 0.19160860776901245, + "learning_rate": 2.4453382524346882e-05, + "loss": 0.0501, + "num_input_tokens_seen": 73603296, + "step": 34100 + }, + { + "epoch": 5.563621533442088, + "grad_norm": 0.9147894382476807, + "learning_rate": 2.444626626596403e-05, + "loss": 0.0774, + "num_input_tokens_seen": 73612704, + "step": 34105 + }, + { + "epoch": 5.564437194127243, + "grad_norm": 1.3476645946502686, + "learning_rate": 2.4439150052469644e-05, + "loss": 0.1116, + "num_input_tokens_seen": 73623168, + "step": 34110 + }, + { + "epoch": 5.565252854812398, + "grad_norm": 0.23332683742046356, + "learning_rate": 2.4432033884440585e-05, + "loss": 0.2493, + "num_input_tokens_seen": 73633088, + "step": 34115 + }, + { + "epoch": 5.566068515497553, + "grad_norm": 1.5037603378295898, + "learning_rate": 2.4424917762453757e-05, + "loss": 0.0702, + "num_input_tokens_seen": 73644512, + "step": 34120 + }, + { + "epoch": 5.566884176182708, + "grad_norm": 0.04761352017521858, + "learning_rate": 2.4417801687086013e-05, + "loss": 0.1099, + "num_input_tokens_seen": 73655008, + "step": 34125 + }, + { + "epoch": 5.567699836867863, + "grad_norm": 0.8476315140724182, + "learning_rate": 2.4410685658914213e-05, + "loss": 0.0977, + "num_input_tokens_seen": 73665216, + "step": 34130 + }, + { + "epoch": 5.568515497553018, + "grad_norm": 0.3169078528881073, + "learning_rate": 2.4403569678515227e-05, + "loss": 0.0412, + "num_input_tokens_seen": 73676384, + "step": 34135 + }, + { + "epoch": 5.569331158238173, + "grad_norm": 0.20494414865970612, + "learning_rate": 2.4396453746465912e-05, + "loss": 0.1847, + "num_input_tokens_seen": 73687456, + "step": 34140 + }, + { + "epoch": 5.570146818923328, + "grad_norm": 0.3425231873989105, + "learning_rate": 2.4389337863343117e-05, + "loss": 0.1225, + "num_input_tokens_seen": 73699424, + "step": 34145 + }, + { + "epoch": 5.5709624796084825, + "grad_norm": 0.3071475923061371, + "learning_rate": 2.4382222029723693e-05, + "loss": 0.0647, + "num_input_tokens_seen": 73710400, + "step": 34150 + }, + { + "epoch": 5.571778140293638, + "grad_norm": 0.2939091622829437, + "learning_rate": 2.4375106246184484e-05, + "loss": 0.2143, + "num_input_tokens_seen": 73721184, + "step": 34155 + }, + { + "epoch": 5.572593800978793, + "grad_norm": 0.48443788290023804, + "learning_rate": 2.4367990513302336e-05, + "loss": 0.1065, + "num_input_tokens_seen": 73732032, + "step": 34160 + }, + { + "epoch": 5.573409461663948, + "grad_norm": 1.656798243522644, + "learning_rate": 2.4360874831654083e-05, + "loss": 0.1787, + "num_input_tokens_seen": 73742944, + "step": 34165 + }, + { + "epoch": 5.574225122349103, + "grad_norm": 0.09948846697807312, + "learning_rate": 2.4353759201816555e-05, + "loss": 0.0647, + "num_input_tokens_seen": 73753312, + "step": 34170 + }, + { + "epoch": 5.575040783034257, + "grad_norm": 0.5985463261604309, + "learning_rate": 2.4346643624366586e-05, + "loss": 0.0955, + "num_input_tokens_seen": 73764480, + "step": 34175 + }, + { + "epoch": 5.575856443719413, + "grad_norm": 0.4561882019042969, + "learning_rate": 2.4339528099881e-05, + "loss": 0.1292, + "num_input_tokens_seen": 73774496, + "step": 34180 + }, + { + "epoch": 5.576672104404568, + "grad_norm": 0.9075701236724854, + "learning_rate": 2.433241262893662e-05, + "loss": 0.1202, + "num_input_tokens_seen": 73786688, + "step": 34185 + }, + { + "epoch": 5.577487765089723, + "grad_norm": 0.792144775390625, + "learning_rate": 2.432529721211026e-05, + "loss": 0.1795, + "num_input_tokens_seen": 73798080, + "step": 34190 + }, + { + "epoch": 5.578303425774878, + "grad_norm": 0.9097509384155273, + "learning_rate": 2.4318181849978733e-05, + "loss": 0.043, + "num_input_tokens_seen": 73808736, + "step": 34195 + }, + { + "epoch": 5.579119086460032, + "grad_norm": 0.47060051560401917, + "learning_rate": 2.4311066543118842e-05, + "loss": 0.1286, + "num_input_tokens_seen": 73819232, + "step": 34200 + }, + { + "epoch": 5.579934747145187, + "grad_norm": 0.33399948477745056, + "learning_rate": 2.4303951292107395e-05, + "loss": 0.0487, + "num_input_tokens_seen": 73829312, + "step": 34205 + }, + { + "epoch": 5.580750407830343, + "grad_norm": 0.31313207745552063, + "learning_rate": 2.4296836097521186e-05, + "loss": 0.035, + "num_input_tokens_seen": 73840320, + "step": 34210 + }, + { + "epoch": 5.581566068515498, + "grad_norm": 0.27508455514907837, + "learning_rate": 2.4289720959937008e-05, + "loss": 0.0571, + "num_input_tokens_seen": 73850464, + "step": 34215 + }, + { + "epoch": 5.582381729200653, + "grad_norm": 0.10854875296354294, + "learning_rate": 2.4282605879931647e-05, + "loss": 0.0939, + "num_input_tokens_seen": 73861152, + "step": 34220 + }, + { + "epoch": 5.583197389885807, + "grad_norm": 0.1684761643409729, + "learning_rate": 2.4275490858081903e-05, + "loss": 0.0139, + "num_input_tokens_seen": 73872288, + "step": 34225 + }, + { + "epoch": 5.584013050570962, + "grad_norm": 0.2679985463619232, + "learning_rate": 2.4268375894964544e-05, + "loss": 0.1167, + "num_input_tokens_seen": 73882720, + "step": 34230 + }, + { + "epoch": 5.584828711256117, + "grad_norm": 0.23798803985118866, + "learning_rate": 2.426126099115635e-05, + "loss": 0.1336, + "num_input_tokens_seen": 73892448, + "step": 34235 + }, + { + "epoch": 5.585644371941273, + "grad_norm": 2.0507800579071045, + "learning_rate": 2.4254146147234087e-05, + "loss": 0.2476, + "num_input_tokens_seen": 73904224, + "step": 34240 + }, + { + "epoch": 5.5864600326264275, + "grad_norm": 1.245173454284668, + "learning_rate": 2.4247031363774523e-05, + "loss": 0.1268, + "num_input_tokens_seen": 73915424, + "step": 34245 + }, + { + "epoch": 5.587275693311582, + "grad_norm": 0.2712286114692688, + "learning_rate": 2.4239916641354417e-05, + "loss": 0.1503, + "num_input_tokens_seen": 73925344, + "step": 34250 + }, + { + "epoch": 5.588091353996737, + "grad_norm": 0.2528771758079529, + "learning_rate": 2.4232801980550523e-05, + "loss": 0.1386, + "num_input_tokens_seen": 73935904, + "step": 34255 + }, + { + "epoch": 5.588907014681892, + "grad_norm": 0.09751628339290619, + "learning_rate": 2.422568738193959e-05, + "loss": 0.0163, + "num_input_tokens_seen": 73947296, + "step": 34260 + }, + { + "epoch": 5.589722675367048, + "grad_norm": 0.5488587021827698, + "learning_rate": 2.421857284609837e-05, + "loss": 0.0388, + "num_input_tokens_seen": 73955680, + "step": 34265 + }, + { + "epoch": 5.5905383360522025, + "grad_norm": 0.9247393012046814, + "learning_rate": 2.42114583736036e-05, + "loss": 0.0613, + "num_input_tokens_seen": 73966304, + "step": 34270 + }, + { + "epoch": 5.591353996737357, + "grad_norm": 0.09721387922763824, + "learning_rate": 2.4204343965032015e-05, + "loss": 0.032, + "num_input_tokens_seen": 73977120, + "step": 34275 + }, + { + "epoch": 5.592169657422512, + "grad_norm": 0.12973524630069733, + "learning_rate": 2.4197229620960347e-05, + "loss": 0.0623, + "num_input_tokens_seen": 73987648, + "step": 34280 + }, + { + "epoch": 5.592985318107667, + "grad_norm": 0.8082906007766724, + "learning_rate": 2.4190115341965316e-05, + "loss": 0.1296, + "num_input_tokens_seen": 73997312, + "step": 34285 + }, + { + "epoch": 5.593800978792823, + "grad_norm": 0.20732882618904114, + "learning_rate": 2.418300112862365e-05, + "loss": 0.2433, + "num_input_tokens_seen": 74009056, + "step": 34290 + }, + { + "epoch": 5.5946166394779775, + "grad_norm": 0.677845299243927, + "learning_rate": 2.4175886981512054e-05, + "loss": 0.0518, + "num_input_tokens_seen": 74019488, + "step": 34295 + }, + { + "epoch": 5.595432300163132, + "grad_norm": 0.37370046973228455, + "learning_rate": 2.416877290120724e-05, + "loss": 0.0714, + "num_input_tokens_seen": 74030880, + "step": 34300 + }, + { + "epoch": 5.596247960848287, + "grad_norm": 0.4275214374065399, + "learning_rate": 2.4161658888285916e-05, + "loss": 0.1607, + "num_input_tokens_seen": 74042080, + "step": 34305 + }, + { + "epoch": 5.597063621533442, + "grad_norm": 0.2759245038032532, + "learning_rate": 2.4154544943324772e-05, + "loss": 0.0665, + "num_input_tokens_seen": 74052448, + "step": 34310 + }, + { + "epoch": 5.597879282218597, + "grad_norm": 1.3812904357910156, + "learning_rate": 2.414743106690051e-05, + "loss": 0.1494, + "num_input_tokens_seen": 74063392, + "step": 34315 + }, + { + "epoch": 5.598694942903752, + "grad_norm": 1.5786231756210327, + "learning_rate": 2.41403172595898e-05, + "loss": 0.1055, + "num_input_tokens_seen": 74074720, + "step": 34320 + }, + { + "epoch": 5.599510603588907, + "grad_norm": 1.312446117401123, + "learning_rate": 2.413320352196934e-05, + "loss": 0.1359, + "num_input_tokens_seen": 74084512, + "step": 34325 + }, + { + "epoch": 5.600326264274062, + "grad_norm": 0.1688050478696823, + "learning_rate": 2.4126089854615802e-05, + "loss": 0.1091, + "num_input_tokens_seen": 74095520, + "step": 34330 + }, + { + "epoch": 5.601141924959217, + "grad_norm": 0.44533148407936096, + "learning_rate": 2.411897625810586e-05, + "loss": 0.1906, + "num_input_tokens_seen": 74105792, + "step": 34335 + }, + { + "epoch": 5.601957585644372, + "grad_norm": 0.0741853415966034, + "learning_rate": 2.4111862733016164e-05, + "loss": 0.0662, + "num_input_tokens_seen": 74116224, + "step": 34340 + }, + { + "epoch": 5.602773246329527, + "grad_norm": 1.099631667137146, + "learning_rate": 2.4104749279923383e-05, + "loss": 0.0838, + "num_input_tokens_seen": 74127232, + "step": 34345 + }, + { + "epoch": 5.603588907014682, + "grad_norm": 1.5300298929214478, + "learning_rate": 2.409763589940417e-05, + "loss": 0.1151, + "num_input_tokens_seen": 74137088, + "step": 34350 + }, + { + "epoch": 5.604404567699837, + "grad_norm": 0.05026852712035179, + "learning_rate": 2.4090522592035172e-05, + "loss": 0.0788, + "num_input_tokens_seen": 74148128, + "step": 34355 + }, + { + "epoch": 5.605220228384992, + "grad_norm": 0.23165574669837952, + "learning_rate": 2.408340935839303e-05, + "loss": 0.0457, + "num_input_tokens_seen": 74158432, + "step": 34360 + }, + { + "epoch": 5.606035889070147, + "grad_norm": 1.6267240047454834, + "learning_rate": 2.407629619905437e-05, + "loss": 0.1763, + "num_input_tokens_seen": 74169376, + "step": 34365 + }, + { + "epoch": 5.6068515497553015, + "grad_norm": 0.9381352663040161, + "learning_rate": 2.406918311459583e-05, + "loss": 0.1118, + "num_input_tokens_seen": 74180352, + "step": 34370 + }, + { + "epoch": 5.607667210440457, + "grad_norm": 0.743543267250061, + "learning_rate": 2.406207010559403e-05, + "loss": 0.0668, + "num_input_tokens_seen": 74190304, + "step": 34375 + }, + { + "epoch": 5.608482871125612, + "grad_norm": 1.6062792539596558, + "learning_rate": 2.4054957172625584e-05, + "loss": 0.1536, + "num_input_tokens_seen": 74201504, + "step": 34380 + }, + { + "epoch": 5.609298531810767, + "grad_norm": 1.335512399673462, + "learning_rate": 2.4047844316267104e-05, + "loss": 0.283, + "num_input_tokens_seen": 74211968, + "step": 34385 + }, + { + "epoch": 5.610114192495922, + "grad_norm": 0.024723855778574944, + "learning_rate": 2.40407315370952e-05, + "loss": 0.14, + "num_input_tokens_seen": 74223232, + "step": 34390 + }, + { + "epoch": 5.6109298531810765, + "grad_norm": 1.4230507612228394, + "learning_rate": 2.4033618835686462e-05, + "loss": 0.1222, + "num_input_tokens_seen": 74234272, + "step": 34395 + }, + { + "epoch": 5.611745513866231, + "grad_norm": 0.027699271216988564, + "learning_rate": 2.4026506212617485e-05, + "loss": 0.0634, + "num_input_tokens_seen": 74244608, + "step": 34400 + }, + { + "epoch": 5.612561174551386, + "grad_norm": 1.0704874992370605, + "learning_rate": 2.4019393668464846e-05, + "loss": 0.1198, + "num_input_tokens_seen": 74255104, + "step": 34405 + }, + { + "epoch": 5.613376835236542, + "grad_norm": 0.13765081763267517, + "learning_rate": 2.4012281203805138e-05, + "loss": 0.0521, + "num_input_tokens_seen": 74265792, + "step": 34410 + }, + { + "epoch": 5.614192495921697, + "grad_norm": 0.3117351531982422, + "learning_rate": 2.4005168819214926e-05, + "loss": 0.2139, + "num_input_tokens_seen": 74277120, + "step": 34415 + }, + { + "epoch": 5.6150081566068515, + "grad_norm": 0.27440208196640015, + "learning_rate": 2.3998056515270782e-05, + "loss": 0.06, + "num_input_tokens_seen": 74286848, + "step": 34420 + }, + { + "epoch": 5.615823817292006, + "grad_norm": 0.8035458922386169, + "learning_rate": 2.3990944292549257e-05, + "loss": 0.0517, + "num_input_tokens_seen": 74296992, + "step": 34425 + }, + { + "epoch": 5.616639477977161, + "grad_norm": 0.32434457540512085, + "learning_rate": 2.3983832151626897e-05, + "loss": 0.1009, + "num_input_tokens_seen": 74307680, + "step": 34430 + }, + { + "epoch": 5.617455138662317, + "grad_norm": 2.186988353729248, + "learning_rate": 2.397672009308027e-05, + "loss": 0.2657, + "num_input_tokens_seen": 74318368, + "step": 34435 + }, + { + "epoch": 5.618270799347472, + "grad_norm": 0.03940096125006676, + "learning_rate": 2.3969608117485906e-05, + "loss": 0.0624, + "num_input_tokens_seen": 74328544, + "step": 34440 + }, + { + "epoch": 5.6190864600326265, + "grad_norm": 0.6382993459701538, + "learning_rate": 2.3962496225420335e-05, + "loss": 0.0781, + "num_input_tokens_seen": 74339904, + "step": 34445 + }, + { + "epoch": 5.619902120717781, + "grad_norm": 0.5472183227539062, + "learning_rate": 2.3955384417460084e-05, + "loss": 0.0461, + "num_input_tokens_seen": 74350784, + "step": 34450 + }, + { + "epoch": 5.620717781402936, + "grad_norm": 0.08891753107309341, + "learning_rate": 2.3948272694181673e-05, + "loss": 0.1177, + "num_input_tokens_seen": 74361408, + "step": 34455 + }, + { + "epoch": 5.621533442088092, + "grad_norm": 0.7192165851593018, + "learning_rate": 2.3941161056161612e-05, + "loss": 0.2589, + "num_input_tokens_seen": 74371904, + "step": 34460 + }, + { + "epoch": 5.622349102773247, + "grad_norm": 0.10705941915512085, + "learning_rate": 2.393404950397641e-05, + "loss": 0.3497, + "num_input_tokens_seen": 74383232, + "step": 34465 + }, + { + "epoch": 5.623164763458401, + "grad_norm": 0.7647888660430908, + "learning_rate": 2.3926938038202565e-05, + "loss": 0.1016, + "num_input_tokens_seen": 74394240, + "step": 34470 + }, + { + "epoch": 5.623980424143556, + "grad_norm": 0.05268000811338425, + "learning_rate": 2.3919826659416564e-05, + "loss": 0.0201, + "num_input_tokens_seen": 74404160, + "step": 34475 + }, + { + "epoch": 5.624796084828711, + "grad_norm": 1.0506339073181152, + "learning_rate": 2.3912715368194895e-05, + "loss": 0.2965, + "num_input_tokens_seen": 74415904, + "step": 34480 + }, + { + "epoch": 5.625611745513866, + "grad_norm": 0.5055717825889587, + "learning_rate": 2.3905604165114038e-05, + "loss": 0.0812, + "num_input_tokens_seen": 74426336, + "step": 34485 + }, + { + "epoch": 5.626427406199021, + "grad_norm": 2.5514719486236572, + "learning_rate": 2.3898493050750453e-05, + "loss": 0.1405, + "num_input_tokens_seen": 74434592, + "step": 34490 + }, + { + "epoch": 5.627243066884176, + "grad_norm": 0.15258082747459412, + "learning_rate": 2.3891382025680616e-05, + "loss": 0.1013, + "num_input_tokens_seen": 74446464, + "step": 34495 + }, + { + "epoch": 5.628058727569331, + "grad_norm": 0.06443482637405396, + "learning_rate": 2.388427109048098e-05, + "loss": 0.1122, + "num_input_tokens_seen": 74457056, + "step": 34500 + }, + { + "epoch": 5.628874388254486, + "grad_norm": 1.474653959274292, + "learning_rate": 2.3877160245727988e-05, + "loss": 0.0442, + "num_input_tokens_seen": 74467584, + "step": 34505 + }, + { + "epoch": 5.629690048939641, + "grad_norm": 0.2614332139492035, + "learning_rate": 2.3870049491998082e-05, + "loss": 0.0723, + "num_input_tokens_seen": 74477376, + "step": 34510 + }, + { + "epoch": 5.630505709624796, + "grad_norm": 0.6452293992042542, + "learning_rate": 2.3862938829867698e-05, + "loss": 0.256, + "num_input_tokens_seen": 74487648, + "step": 34515 + }, + { + "epoch": 5.631321370309951, + "grad_norm": 1.2320345640182495, + "learning_rate": 2.3855828259913262e-05, + "loss": 0.1065, + "num_input_tokens_seen": 74498496, + "step": 34520 + }, + { + "epoch": 5.632137030995106, + "grad_norm": 1.5507426261901855, + "learning_rate": 2.3848717782711194e-05, + "loss": 0.1722, + "num_input_tokens_seen": 74509664, + "step": 34525 + }, + { + "epoch": 5.632952691680261, + "grad_norm": 0.21751387417316437, + "learning_rate": 2.3841607398837902e-05, + "loss": 0.0893, + "num_input_tokens_seen": 74519808, + "step": 34530 + }, + { + "epoch": 5.633768352365416, + "grad_norm": 1.0748143196105957, + "learning_rate": 2.3834497108869797e-05, + "loss": 0.0858, + "num_input_tokens_seen": 74531840, + "step": 34535 + }, + { + "epoch": 5.634584013050571, + "grad_norm": 0.41573330760002136, + "learning_rate": 2.3827386913383254e-05, + "loss": 0.1926, + "num_input_tokens_seen": 74543168, + "step": 34540 + }, + { + "epoch": 5.635399673735726, + "grad_norm": 0.7571199536323547, + "learning_rate": 2.3820276812954688e-05, + "loss": 0.0529, + "num_input_tokens_seen": 74553792, + "step": 34545 + }, + { + "epoch": 5.636215334420881, + "grad_norm": 0.1432759165763855, + "learning_rate": 2.3813166808160472e-05, + "loss": 0.0917, + "num_input_tokens_seen": 74565312, + "step": 34550 + }, + { + "epoch": 5.637030995106036, + "grad_norm": 0.0659380555152893, + "learning_rate": 2.3806056899576978e-05, + "loss": 0.3423, + "num_input_tokens_seen": 74576256, + "step": 34555 + }, + { + "epoch": 5.637846655791191, + "grad_norm": 0.15110820531845093, + "learning_rate": 2.3798947087780567e-05, + "loss": 0.1579, + "num_input_tokens_seen": 74586560, + "step": 34560 + }, + { + "epoch": 5.638662316476346, + "grad_norm": 1.6646732091903687, + "learning_rate": 2.37918373733476e-05, + "loss": 0.0567, + "num_input_tokens_seen": 74597792, + "step": 34565 + }, + { + "epoch": 5.6394779771615005, + "grad_norm": 1.743380069732666, + "learning_rate": 2.3784727756854425e-05, + "loss": 0.2997, + "num_input_tokens_seen": 74609504, + "step": 34570 + }, + { + "epoch": 5.640293637846656, + "grad_norm": 0.06715751439332962, + "learning_rate": 2.377761823887738e-05, + "loss": 0.1618, + "num_input_tokens_seen": 74620160, + "step": 34575 + }, + { + "epoch": 5.641109298531811, + "grad_norm": 2.3716201782226562, + "learning_rate": 2.3770508819992807e-05, + "loss": 0.149, + "num_input_tokens_seen": 74631072, + "step": 34580 + }, + { + "epoch": 5.641924959216966, + "grad_norm": 0.04521693289279938, + "learning_rate": 2.376339950077703e-05, + "loss": 0.032, + "num_input_tokens_seen": 74641088, + "step": 34585 + }, + { + "epoch": 5.642740619902121, + "grad_norm": 0.7887351512908936, + "learning_rate": 2.3756290281806358e-05, + "loss": 0.0814, + "num_input_tokens_seen": 74650560, + "step": 34590 + }, + { + "epoch": 5.643556280587275, + "grad_norm": 0.10442844778299332, + "learning_rate": 2.3749181163657114e-05, + "loss": 0.0969, + "num_input_tokens_seen": 74660608, + "step": 34595 + }, + { + "epoch": 5.64437194127243, + "grad_norm": 0.3399212956428528, + "learning_rate": 2.3742072146905587e-05, + "loss": 0.0546, + "num_input_tokens_seen": 74672064, + "step": 34600 + }, + { + "epoch": 5.645187601957586, + "grad_norm": 0.5312784314155579, + "learning_rate": 2.3734963232128072e-05, + "loss": 0.0549, + "num_input_tokens_seen": 74683200, + "step": 34605 + }, + { + "epoch": 5.646003262642741, + "grad_norm": 0.20033112168312073, + "learning_rate": 2.372785441990086e-05, + "loss": 0.147, + "num_input_tokens_seen": 74694016, + "step": 34610 + }, + { + "epoch": 5.646818923327896, + "grad_norm": 0.9566420912742615, + "learning_rate": 2.3720745710800225e-05, + "loss": 0.3817, + "num_input_tokens_seen": 74704096, + "step": 34615 + }, + { + "epoch": 5.64763458401305, + "grad_norm": 0.18431200087070465, + "learning_rate": 2.371363710540243e-05, + "loss": 0.158, + "num_input_tokens_seen": 74714528, + "step": 34620 + }, + { + "epoch": 5.648450244698205, + "grad_norm": 0.30366337299346924, + "learning_rate": 2.370652860428374e-05, + "loss": 0.2091, + "num_input_tokens_seen": 74725888, + "step": 34625 + }, + { + "epoch": 5.649265905383361, + "grad_norm": 0.7919727563858032, + "learning_rate": 2.3699420208020403e-05, + "loss": 0.0797, + "num_input_tokens_seen": 74736384, + "step": 34630 + }, + { + "epoch": 5.650081566068516, + "grad_norm": 1.2413642406463623, + "learning_rate": 2.3692311917188658e-05, + "loss": 0.2418, + "num_input_tokens_seen": 74748064, + "step": 34635 + }, + { + "epoch": 5.650897226753671, + "grad_norm": 0.05497109889984131, + "learning_rate": 2.3685203732364754e-05, + "loss": 0.068, + "num_input_tokens_seen": 74758720, + "step": 34640 + }, + { + "epoch": 5.651712887438825, + "grad_norm": 0.35737377405166626, + "learning_rate": 2.3678095654124893e-05, + "loss": 0.0365, + "num_input_tokens_seen": 74770208, + "step": 34645 + }, + { + "epoch": 5.65252854812398, + "grad_norm": 0.10966935008764267, + "learning_rate": 2.3670987683045317e-05, + "loss": 0.0219, + "num_input_tokens_seen": 74781568, + "step": 34650 + }, + { + "epoch": 5.653344208809135, + "grad_norm": 0.029028993099927902, + "learning_rate": 2.366387981970222e-05, + "loss": 0.0892, + "num_input_tokens_seen": 74792608, + "step": 34655 + }, + { + "epoch": 5.654159869494291, + "grad_norm": 0.5783670544624329, + "learning_rate": 2.36567720646718e-05, + "loss": 0.0431, + "num_input_tokens_seen": 74801760, + "step": 34660 + }, + { + "epoch": 5.6549755301794455, + "grad_norm": 0.3389531970024109, + "learning_rate": 2.3649664418530258e-05, + "loss": 0.0737, + "num_input_tokens_seen": 74812256, + "step": 34665 + }, + { + "epoch": 5.6557911908646, + "grad_norm": 0.16787628829479218, + "learning_rate": 2.364255688185377e-05, + "loss": 0.0712, + "num_input_tokens_seen": 74822848, + "step": 34670 + }, + { + "epoch": 5.656606851549755, + "grad_norm": 0.1286398470401764, + "learning_rate": 2.3635449455218506e-05, + "loss": 0.0998, + "num_input_tokens_seen": 74832992, + "step": 34675 + }, + { + "epoch": 5.65742251223491, + "grad_norm": 0.08293907344341278, + "learning_rate": 2.3628342139200636e-05, + "loss": 0.0846, + "num_input_tokens_seen": 74843136, + "step": 34680 + }, + { + "epoch": 5.658238172920065, + "grad_norm": 0.04126607999205589, + "learning_rate": 2.362123493437631e-05, + "loss": 0.0528, + "num_input_tokens_seen": 74852800, + "step": 34685 + }, + { + "epoch": 5.6590538336052205, + "grad_norm": 0.6634520292282104, + "learning_rate": 2.3614127841321677e-05, + "loss": 0.1033, + "num_input_tokens_seen": 74864512, + "step": 34690 + }, + { + "epoch": 5.659869494290375, + "grad_norm": 0.41665711998939514, + "learning_rate": 2.3607020860612872e-05, + "loss": 0.0261, + "num_input_tokens_seen": 74875936, + "step": 34695 + }, + { + "epoch": 5.66068515497553, + "grad_norm": 0.09761860966682434, + "learning_rate": 2.3599913992826023e-05, + "loss": 0.1705, + "num_input_tokens_seen": 74885504, + "step": 34700 + }, + { + "epoch": 5.661500815660685, + "grad_norm": 2.0916833877563477, + "learning_rate": 2.3592807238537253e-05, + "loss": 0.0977, + "num_input_tokens_seen": 74897632, + "step": 34705 + }, + { + "epoch": 5.66231647634584, + "grad_norm": 1.0291258096694946, + "learning_rate": 2.3585700598322665e-05, + "loss": 0.0546, + "num_input_tokens_seen": 74908672, + "step": 34710 + }, + { + "epoch": 5.6631321370309955, + "grad_norm": 1.3698513507843018, + "learning_rate": 2.3578594072758363e-05, + "loss": 0.1281, + "num_input_tokens_seen": 74919104, + "step": 34715 + }, + { + "epoch": 5.66394779771615, + "grad_norm": 0.23834523558616638, + "learning_rate": 2.3571487662420433e-05, + "loss": 0.0567, + "num_input_tokens_seen": 74929952, + "step": 34720 + }, + { + "epoch": 5.664763458401305, + "grad_norm": 1.2088772058486938, + "learning_rate": 2.3564381367884965e-05, + "loss": 0.0991, + "num_input_tokens_seen": 74941760, + "step": 34725 + }, + { + "epoch": 5.66557911908646, + "grad_norm": 0.5487915873527527, + "learning_rate": 2.3557275189728032e-05, + "loss": 0.0947, + "num_input_tokens_seen": 74952000, + "step": 34730 + }, + { + "epoch": 5.666394779771615, + "grad_norm": 0.352211594581604, + "learning_rate": 2.3550169128525688e-05, + "loss": 0.0275, + "num_input_tokens_seen": 74963008, + "step": 34735 + }, + { + "epoch": 5.6672104404567705, + "grad_norm": 0.5492860674858093, + "learning_rate": 2.3543063184853994e-05, + "loss": 0.043, + "num_input_tokens_seen": 74973408, + "step": 34740 + }, + { + "epoch": 5.668026101141925, + "grad_norm": 1.271528959274292, + "learning_rate": 2.353595735928899e-05, + "loss": 0.3656, + "num_input_tokens_seen": 74983744, + "step": 34745 + }, + { + "epoch": 5.66884176182708, + "grad_norm": 0.17715172469615936, + "learning_rate": 2.3528851652406697e-05, + "loss": 0.11, + "num_input_tokens_seen": 74993952, + "step": 34750 + }, + { + "epoch": 5.669657422512235, + "grad_norm": 0.36880648136138916, + "learning_rate": 2.3521746064783168e-05, + "loss": 0.0751, + "num_input_tokens_seen": 75004128, + "step": 34755 + }, + { + "epoch": 5.67047308319739, + "grad_norm": 1.8883578777313232, + "learning_rate": 2.3514640596994404e-05, + "loss": 0.1268, + "num_input_tokens_seen": 75014496, + "step": 34760 + }, + { + "epoch": 5.671288743882545, + "grad_norm": 0.6966503262519836, + "learning_rate": 2.350753524961641e-05, + "loss": 0.3832, + "num_input_tokens_seen": 75024704, + "step": 34765 + }, + { + "epoch": 5.672104404567699, + "grad_norm": 0.09297385066747665, + "learning_rate": 2.3500430023225174e-05, + "loss": 0.1087, + "num_input_tokens_seen": 75035488, + "step": 34770 + }, + { + "epoch": 5.672920065252855, + "grad_norm": 0.049702342599630356, + "learning_rate": 2.3493324918396696e-05, + "loss": 0.1008, + "num_input_tokens_seen": 75048192, + "step": 34775 + }, + { + "epoch": 5.67373572593801, + "grad_norm": 0.053669609129428864, + "learning_rate": 2.3486219935706944e-05, + "loss": 0.1218, + "num_input_tokens_seen": 75059680, + "step": 34780 + }, + { + "epoch": 5.674551386623165, + "grad_norm": 1.4690881967544556, + "learning_rate": 2.3479115075731886e-05, + "loss": 0.1383, + "num_input_tokens_seen": 75069696, + "step": 34785 + }, + { + "epoch": 5.6753670473083195, + "grad_norm": 1.2512744665145874, + "learning_rate": 2.3472010339047474e-05, + "loss": 0.2666, + "num_input_tokens_seen": 75080512, + "step": 34790 + }, + { + "epoch": 5.676182707993474, + "grad_norm": 1.4784785509109497, + "learning_rate": 2.3464905726229657e-05, + "loss": 0.1503, + "num_input_tokens_seen": 75092000, + "step": 34795 + }, + { + "epoch": 5.67699836867863, + "grad_norm": 0.07367252558469772, + "learning_rate": 2.3457801237854367e-05, + "loss": 0.1709, + "num_input_tokens_seen": 75102208, + "step": 34800 + }, + { + "epoch": 5.677814029363785, + "grad_norm": 0.12372208386659622, + "learning_rate": 2.345069687449754e-05, + "loss": 0.1027, + "num_input_tokens_seen": 75112960, + "step": 34805 + }, + { + "epoch": 5.67862969004894, + "grad_norm": 0.3000190556049347, + "learning_rate": 2.3443592636735085e-05, + "loss": 0.0621, + "num_input_tokens_seen": 75125824, + "step": 34810 + }, + { + "epoch": 5.6794453507340945, + "grad_norm": 0.1609734296798706, + "learning_rate": 2.3436488525142906e-05, + "loss": 0.1421, + "num_input_tokens_seen": 75136896, + "step": 34815 + }, + { + "epoch": 5.680261011419249, + "grad_norm": 1.014256238937378, + "learning_rate": 2.3429384540296902e-05, + "loss": 0.0549, + "num_input_tokens_seen": 75146528, + "step": 34820 + }, + { + "epoch": 5.681076672104405, + "grad_norm": 0.37118247151374817, + "learning_rate": 2.3422280682772953e-05, + "loss": 0.095, + "num_input_tokens_seen": 75157184, + "step": 34825 + }, + { + "epoch": 5.68189233278956, + "grad_norm": 1.5495882034301758, + "learning_rate": 2.341517695314694e-05, + "loss": 0.0865, + "num_input_tokens_seen": 75166016, + "step": 34830 + }, + { + "epoch": 5.682707993474715, + "grad_norm": 0.10932526737451553, + "learning_rate": 2.3408073351994726e-05, + "loss": 0.2613, + "num_input_tokens_seen": 75177824, + "step": 34835 + }, + { + "epoch": 5.6835236541598695, + "grad_norm": 0.5255666375160217, + "learning_rate": 2.340096987989216e-05, + "loss": 0.1605, + "num_input_tokens_seen": 75188128, + "step": 34840 + }, + { + "epoch": 5.684339314845024, + "grad_norm": 0.7176936864852905, + "learning_rate": 2.3393866537415093e-05, + "loss": 0.1374, + "num_input_tokens_seen": 75198016, + "step": 34845 + }, + { + "epoch": 5.685154975530179, + "grad_norm": 0.052010428160429, + "learning_rate": 2.3386763325139353e-05, + "loss": 0.0343, + "num_input_tokens_seen": 75208768, + "step": 34850 + }, + { + "epoch": 5.685970636215334, + "grad_norm": 0.2720937728881836, + "learning_rate": 2.337966024364076e-05, + "loss": 0.0677, + "num_input_tokens_seen": 75220064, + "step": 34855 + }, + { + "epoch": 5.68678629690049, + "grad_norm": 0.7925336360931396, + "learning_rate": 2.337255729349512e-05, + "loss": 0.0916, + "num_input_tokens_seen": 75231552, + "step": 34860 + }, + { + "epoch": 5.6876019575856445, + "grad_norm": 0.6195081472396851, + "learning_rate": 2.3365454475278257e-05, + "loss": 0.0802, + "num_input_tokens_seen": 75242080, + "step": 34865 + }, + { + "epoch": 5.688417618270799, + "grad_norm": 0.01774914562702179, + "learning_rate": 2.3358351789565945e-05, + "loss": 0.0477, + "num_input_tokens_seen": 75253312, + "step": 34870 + }, + { + "epoch": 5.689233278955954, + "grad_norm": 0.28734323382377625, + "learning_rate": 2.335124923693397e-05, + "loss": 0.0603, + "num_input_tokens_seen": 75265376, + "step": 34875 + }, + { + "epoch": 5.690048939641109, + "grad_norm": 0.3998967111110687, + "learning_rate": 2.334414681795809e-05, + "loss": 0.1868, + "num_input_tokens_seen": 75275872, + "step": 34880 + }, + { + "epoch": 5.690864600326265, + "grad_norm": 0.18342562019824982, + "learning_rate": 2.3337044533214068e-05, + "loss": 0.0377, + "num_input_tokens_seen": 75288096, + "step": 34885 + }, + { + "epoch": 5.691680261011419, + "grad_norm": 1.3679125308990479, + "learning_rate": 2.3329942383277665e-05, + "loss": 0.07, + "num_input_tokens_seen": 75298112, + "step": 34890 + }, + { + "epoch": 5.692495921696574, + "grad_norm": 0.32627835869789124, + "learning_rate": 2.3322840368724598e-05, + "loss": 0.1809, + "num_input_tokens_seen": 75308992, + "step": 34895 + }, + { + "epoch": 5.693311582381729, + "grad_norm": 0.20442666113376617, + "learning_rate": 2.3315738490130606e-05, + "loss": 0.1751, + "num_input_tokens_seen": 75320352, + "step": 34900 + }, + { + "epoch": 5.694127243066884, + "grad_norm": 0.13446763157844543, + "learning_rate": 2.3308636748071395e-05, + "loss": 0.0496, + "num_input_tokens_seen": 75333056, + "step": 34905 + }, + { + "epoch": 5.69494290375204, + "grad_norm": 0.14151953160762787, + "learning_rate": 2.3301535143122675e-05, + "loss": 0.1175, + "num_input_tokens_seen": 75345248, + "step": 34910 + }, + { + "epoch": 5.695758564437194, + "grad_norm": 2.1667795181274414, + "learning_rate": 2.3294433675860134e-05, + "loss": 0.2368, + "num_input_tokens_seen": 75356576, + "step": 34915 + }, + { + "epoch": 5.696574225122349, + "grad_norm": 0.13180553913116455, + "learning_rate": 2.328733234685945e-05, + "loss": 0.0621, + "num_input_tokens_seen": 75368000, + "step": 34920 + }, + { + "epoch": 5.697389885807504, + "grad_norm": 1.1849123239517212, + "learning_rate": 2.3280231156696297e-05, + "loss": 0.0676, + "num_input_tokens_seen": 75378208, + "step": 34925 + }, + { + "epoch": 5.698205546492659, + "grad_norm": 1.659267783164978, + "learning_rate": 2.3273130105946333e-05, + "loss": 0.2076, + "num_input_tokens_seen": 75390112, + "step": 34930 + }, + { + "epoch": 5.699021207177814, + "grad_norm": 0.06583758443593979, + "learning_rate": 2.3266029195185204e-05, + "loss": 0.0392, + "num_input_tokens_seen": 75401824, + "step": 34935 + }, + { + "epoch": 5.699836867862969, + "grad_norm": 1.1402586698532104, + "learning_rate": 2.3258928424988548e-05, + "loss": 0.1827, + "num_input_tokens_seen": 75412384, + "step": 34940 + }, + { + "epoch": 5.700652528548124, + "grad_norm": 0.6852033734321594, + "learning_rate": 2.325182779593198e-05, + "loss": 0.0522, + "num_input_tokens_seen": 75423648, + "step": 34945 + }, + { + "epoch": 5.701468189233279, + "grad_norm": 1.4051307439804077, + "learning_rate": 2.3244727308591126e-05, + "loss": 0.2468, + "num_input_tokens_seen": 75435232, + "step": 34950 + }, + { + "epoch": 5.702283849918434, + "grad_norm": 1.1383956670761108, + "learning_rate": 2.3237626963541588e-05, + "loss": 0.0541, + "num_input_tokens_seen": 75445952, + "step": 34955 + }, + { + "epoch": 5.703099510603589, + "grad_norm": 0.09476876258850098, + "learning_rate": 2.3230526761358944e-05, + "loss": 0.0282, + "num_input_tokens_seen": 75457696, + "step": 34960 + }, + { + "epoch": 5.7039151712887435, + "grad_norm": 0.8265910744667053, + "learning_rate": 2.3223426702618776e-05, + "loss": 0.1039, + "num_input_tokens_seen": 75467424, + "step": 34965 + }, + { + "epoch": 5.704730831973899, + "grad_norm": 0.07946164160966873, + "learning_rate": 2.3216326787896652e-05, + "loss": 0.1266, + "num_input_tokens_seen": 75476928, + "step": 34970 + }, + { + "epoch": 5.705546492659054, + "grad_norm": 1.7165470123291016, + "learning_rate": 2.3209227017768137e-05, + "loss": 0.0885, + "num_input_tokens_seen": 75486624, + "step": 34975 + }, + { + "epoch": 5.706362153344209, + "grad_norm": 0.19744281470775604, + "learning_rate": 2.3202127392808768e-05, + "loss": 0.156, + "num_input_tokens_seen": 75497888, + "step": 34980 + }, + { + "epoch": 5.707177814029364, + "grad_norm": 0.14444983005523682, + "learning_rate": 2.319502791359407e-05, + "loss": 0.0672, + "num_input_tokens_seen": 75508352, + "step": 34985 + }, + { + "epoch": 5.7079934747145185, + "grad_norm": 0.1496565043926239, + "learning_rate": 2.3187928580699573e-05, + "loss": 0.0741, + "num_input_tokens_seen": 75520416, + "step": 34990 + }, + { + "epoch": 5.708809135399674, + "grad_norm": 0.29905062913894653, + "learning_rate": 2.3180829394700775e-05, + "loss": 0.1943, + "num_input_tokens_seen": 75531168, + "step": 34995 + }, + { + "epoch": 5.709624796084829, + "grad_norm": 0.027989136055111885, + "learning_rate": 2.317373035617318e-05, + "loss": 0.0761, + "num_input_tokens_seen": 75541632, + "step": 35000 + }, + { + "epoch": 5.710440456769984, + "grad_norm": 0.21603041887283325, + "learning_rate": 2.3166631465692264e-05, + "loss": 0.0678, + "num_input_tokens_seen": 75552224, + "step": 35005 + }, + { + "epoch": 5.711256117455139, + "grad_norm": 0.04801922291517258, + "learning_rate": 2.3159532723833508e-05, + "loss": 0.0128, + "num_input_tokens_seen": 75562080, + "step": 35010 + }, + { + "epoch": 5.712071778140293, + "grad_norm": 0.42546045780181885, + "learning_rate": 2.3152434131172368e-05, + "loss": 0.1238, + "num_input_tokens_seen": 75571616, + "step": 35015 + }, + { + "epoch": 5.712887438825448, + "grad_norm": 0.31379836797714233, + "learning_rate": 2.3145335688284288e-05, + "loss": 0.0358, + "num_input_tokens_seen": 75581952, + "step": 35020 + }, + { + "epoch": 5.713703099510604, + "grad_norm": 0.07527698576450348, + "learning_rate": 2.3138237395744712e-05, + "loss": 0.0505, + "num_input_tokens_seen": 75592448, + "step": 35025 + }, + { + "epoch": 5.714518760195759, + "grad_norm": 0.957847535610199, + "learning_rate": 2.313113925412905e-05, + "loss": 0.0842, + "num_input_tokens_seen": 75603872, + "step": 35030 + }, + { + "epoch": 5.715334420880914, + "grad_norm": 0.3895184397697449, + "learning_rate": 2.312404126401273e-05, + "loss": 0.0956, + "num_input_tokens_seen": 75614752, + "step": 35035 + }, + { + "epoch": 5.716150081566068, + "grad_norm": 0.1580435186624527, + "learning_rate": 2.3116943425971144e-05, + "loss": 0.0228, + "num_input_tokens_seen": 75625600, + "step": 35040 + }, + { + "epoch": 5.716965742251223, + "grad_norm": 1.2983827590942383, + "learning_rate": 2.3109845740579676e-05, + "loss": 0.2021, + "num_input_tokens_seen": 75635648, + "step": 35045 + }, + { + "epoch": 5.717781402936378, + "grad_norm": 0.9512764811515808, + "learning_rate": 2.3102748208413706e-05, + "loss": 0.1685, + "num_input_tokens_seen": 75647680, + "step": 35050 + }, + { + "epoch": 5.718597063621534, + "grad_norm": 0.16244132816791534, + "learning_rate": 2.3095650830048595e-05, + "loss": 0.0741, + "num_input_tokens_seen": 75658080, + "step": 35055 + }, + { + "epoch": 5.719412724306689, + "grad_norm": 0.17050884664058685, + "learning_rate": 2.3088553606059686e-05, + "loss": 0.1574, + "num_input_tokens_seen": 75669920, + "step": 35060 + }, + { + "epoch": 5.720228384991843, + "grad_norm": 0.04701370373368263, + "learning_rate": 2.308145653702232e-05, + "loss": 0.0643, + "num_input_tokens_seen": 75681792, + "step": 35065 + }, + { + "epoch": 5.721044045676998, + "grad_norm": 0.07871877402067184, + "learning_rate": 2.307435962351181e-05, + "loss": 0.0193, + "num_input_tokens_seen": 75692992, + "step": 35070 + }, + { + "epoch": 5.721859706362153, + "grad_norm": 0.12638044357299805, + "learning_rate": 2.3067262866103492e-05, + "loss": 0.0492, + "num_input_tokens_seen": 75704608, + "step": 35075 + }, + { + "epoch": 5.722675367047309, + "grad_norm": 0.9110086560249329, + "learning_rate": 2.3060166265372654e-05, + "loss": 0.0616, + "num_input_tokens_seen": 75715328, + "step": 35080 + }, + { + "epoch": 5.7234910277324635, + "grad_norm": 1.9571293592453003, + "learning_rate": 2.3053069821894578e-05, + "loss": 0.1803, + "num_input_tokens_seen": 75725376, + "step": 35085 + }, + { + "epoch": 5.724306688417618, + "grad_norm": 0.04180498048663139, + "learning_rate": 2.3045973536244543e-05, + "loss": 0.0895, + "num_input_tokens_seen": 75736512, + "step": 35090 + }, + { + "epoch": 5.725122349102773, + "grad_norm": 0.029427675530314445, + "learning_rate": 2.303887740899781e-05, + "loss": 0.0267, + "num_input_tokens_seen": 75747552, + "step": 35095 + }, + { + "epoch": 5.725938009787928, + "grad_norm": 1.0187705755233765, + "learning_rate": 2.3031781440729623e-05, + "loss": 0.1645, + "num_input_tokens_seen": 75759136, + "step": 35100 + }, + { + "epoch": 5.726753670473083, + "grad_norm": 0.10032349079847336, + "learning_rate": 2.3024685632015218e-05, + "loss": 0.1258, + "num_input_tokens_seen": 75770208, + "step": 35105 + }, + { + "epoch": 5.7275693311582385, + "grad_norm": 0.1286563277244568, + "learning_rate": 2.3017589983429817e-05, + "loss": 0.0484, + "num_input_tokens_seen": 75780800, + "step": 35110 + }, + { + "epoch": 5.728384991843393, + "grad_norm": 0.52266925573349, + "learning_rate": 2.301049449554863e-05, + "loss": 0.0366, + "num_input_tokens_seen": 75791328, + "step": 35115 + }, + { + "epoch": 5.729200652528548, + "grad_norm": 0.21610905230045319, + "learning_rate": 2.3003399168946855e-05, + "loss": 0.2453, + "num_input_tokens_seen": 75802688, + "step": 35120 + }, + { + "epoch": 5.730016313213703, + "grad_norm": 0.12675096094608307, + "learning_rate": 2.2996304004199677e-05, + "loss": 0.2033, + "num_input_tokens_seen": 75812640, + "step": 35125 + }, + { + "epoch": 5.730831973898858, + "grad_norm": 0.637650191783905, + "learning_rate": 2.298920900188226e-05, + "loss": 0.0859, + "num_input_tokens_seen": 75823680, + "step": 35130 + }, + { + "epoch": 5.731647634584013, + "grad_norm": 0.14479686319828033, + "learning_rate": 2.2982114162569766e-05, + "loss": 0.053, + "num_input_tokens_seen": 75834208, + "step": 35135 + }, + { + "epoch": 5.732463295269168, + "grad_norm": 0.02536420337855816, + "learning_rate": 2.2975019486837334e-05, + "loss": 0.0729, + "num_input_tokens_seen": 75846432, + "step": 35140 + }, + { + "epoch": 5.733278955954323, + "grad_norm": 0.3909951150417328, + "learning_rate": 2.29679249752601e-05, + "loss": 0.2031, + "num_input_tokens_seen": 75856960, + "step": 35145 + }, + { + "epoch": 5.734094616639478, + "grad_norm": 0.807005763053894, + "learning_rate": 2.2960830628413175e-05, + "loss": 0.047, + "num_input_tokens_seen": 75868256, + "step": 35150 + }, + { + "epoch": 5.734910277324633, + "grad_norm": 0.04462399333715439, + "learning_rate": 2.295373644687167e-05, + "loss": 0.0671, + "num_input_tokens_seen": 75879808, + "step": 35155 + }, + { + "epoch": 5.735725938009788, + "grad_norm": 3.0646090507507324, + "learning_rate": 2.294664243121067e-05, + "loss": 0.2904, + "num_input_tokens_seen": 75891168, + "step": 35160 + }, + { + "epoch": 5.736541598694943, + "grad_norm": 0.05339455232024193, + "learning_rate": 2.2939548582005253e-05, + "loss": 0.2902, + "num_input_tokens_seen": 75902080, + "step": 35165 + }, + { + "epoch": 5.737357259380098, + "grad_norm": 0.8545902967453003, + "learning_rate": 2.293245489983048e-05, + "loss": 0.1491, + "num_input_tokens_seen": 75913568, + "step": 35170 + }, + { + "epoch": 5.738172920065253, + "grad_norm": 0.09468505531549454, + "learning_rate": 2.2925361385261402e-05, + "loss": 0.1521, + "num_input_tokens_seen": 75923552, + "step": 35175 + }, + { + "epoch": 5.738988580750408, + "grad_norm": 1.3513710498809814, + "learning_rate": 2.2918268038873055e-05, + "loss": 0.1916, + "num_input_tokens_seen": 75933856, + "step": 35180 + }, + { + "epoch": 5.739804241435563, + "grad_norm": 0.1282830834388733, + "learning_rate": 2.291117486124047e-05, + "loss": 0.1871, + "num_input_tokens_seen": 75944896, + "step": 35185 + }, + { + "epoch": 5.740619902120718, + "grad_norm": 1.815136194229126, + "learning_rate": 2.290408185293865e-05, + "loss": 0.1921, + "num_input_tokens_seen": 75954880, + "step": 35190 + }, + { + "epoch": 5.741435562805873, + "grad_norm": 0.0984594002366066, + "learning_rate": 2.2896989014542584e-05, + "loss": 0.0837, + "num_input_tokens_seen": 75964800, + "step": 35195 + }, + { + "epoch": 5.742251223491028, + "grad_norm": 1.3900820016860962, + "learning_rate": 2.2889896346627256e-05, + "loss": 0.2138, + "num_input_tokens_seen": 75975264, + "step": 35200 + }, + { + "epoch": 5.743066884176183, + "grad_norm": 0.4990847110748291, + "learning_rate": 2.2882803849767646e-05, + "loss": 0.059, + "num_input_tokens_seen": 75986976, + "step": 35205 + }, + { + "epoch": 5.7438825448613375, + "grad_norm": 0.12128788977861404, + "learning_rate": 2.2875711524538697e-05, + "loss": 0.097, + "num_input_tokens_seen": 75998752, + "step": 35210 + }, + { + "epoch": 5.744698205546492, + "grad_norm": 0.3258254826068878, + "learning_rate": 2.2868619371515348e-05, + "loss": 0.1535, + "num_input_tokens_seen": 76010688, + "step": 35215 + }, + { + "epoch": 5.745513866231647, + "grad_norm": 1.1281287670135498, + "learning_rate": 2.2861527391272526e-05, + "loss": 0.2856, + "num_input_tokens_seen": 76021408, + "step": 35220 + }, + { + "epoch": 5.746329526916803, + "grad_norm": 0.06528589874505997, + "learning_rate": 2.285443558438515e-05, + "loss": 0.2153, + "num_input_tokens_seen": 76032480, + "step": 35225 + }, + { + "epoch": 5.747145187601958, + "grad_norm": 1.530265212059021, + "learning_rate": 2.2847343951428106e-05, + "loss": 0.1067, + "num_input_tokens_seen": 76043072, + "step": 35230 + }, + { + "epoch": 5.7479608482871125, + "grad_norm": 0.10315248370170593, + "learning_rate": 2.284025249297629e-05, + "loss": 0.0843, + "num_input_tokens_seen": 76054208, + "step": 35235 + }, + { + "epoch": 5.748776508972267, + "grad_norm": 0.28354737162590027, + "learning_rate": 2.2833161209604557e-05, + "loss": 0.1018, + "num_input_tokens_seen": 76066080, + "step": 35240 + }, + { + "epoch": 5.749592169657422, + "grad_norm": 0.871299684047699, + "learning_rate": 2.2826070101887777e-05, + "loss": 0.1614, + "num_input_tokens_seen": 76077088, + "step": 35245 + }, + { + "epoch": 5.750407830342578, + "grad_norm": 0.11529336124658585, + "learning_rate": 2.2818979170400785e-05, + "loss": 0.1294, + "num_input_tokens_seen": 76087104, + "step": 35250 + }, + { + "epoch": 5.751223491027733, + "grad_norm": 0.7382805943489075, + "learning_rate": 2.2811888415718405e-05, + "loss": 0.0642, + "num_input_tokens_seen": 76098560, + "step": 35255 + }, + { + "epoch": 5.7520391517128875, + "grad_norm": 0.6586386561393738, + "learning_rate": 2.2804797838415448e-05, + "loss": 0.0642, + "num_input_tokens_seen": 76109472, + "step": 35260 + }, + { + "epoch": 5.752854812398042, + "grad_norm": 0.8721778392791748, + "learning_rate": 2.2797707439066724e-05, + "loss": 0.1583, + "num_input_tokens_seen": 76119744, + "step": 35265 + }, + { + "epoch": 5.753670473083197, + "grad_norm": 0.05330934748053551, + "learning_rate": 2.2790617218247005e-05, + "loss": 0.0508, + "num_input_tokens_seen": 76129664, + "step": 35270 + }, + { + "epoch": 5.754486133768353, + "grad_norm": 0.17257095873355865, + "learning_rate": 2.278352717653107e-05, + "loss": 0.056, + "num_input_tokens_seen": 76140384, + "step": 35275 + }, + { + "epoch": 5.755301794453508, + "grad_norm": 1.2921454906463623, + "learning_rate": 2.2776437314493666e-05, + "loss": 0.1464, + "num_input_tokens_seen": 76150624, + "step": 35280 + }, + { + "epoch": 5.7561174551386625, + "grad_norm": 1.4359643459320068, + "learning_rate": 2.2769347632709523e-05, + "loss": 0.0992, + "num_input_tokens_seen": 76161952, + "step": 35285 + }, + { + "epoch": 5.756933115823817, + "grad_norm": 0.10733385384082794, + "learning_rate": 2.276225813175339e-05, + "loss": 0.1318, + "num_input_tokens_seen": 76173600, + "step": 35290 + }, + { + "epoch": 5.757748776508972, + "grad_norm": 0.09496019035577774, + "learning_rate": 2.275516881219997e-05, + "loss": 0.056, + "num_input_tokens_seen": 76183808, + "step": 35295 + }, + { + "epoch": 5.758564437194127, + "grad_norm": 1.8561654090881348, + "learning_rate": 2.2748079674623954e-05, + "loss": 0.1806, + "num_input_tokens_seen": 76194752, + "step": 35300 + }, + { + "epoch": 5.759380097879282, + "grad_norm": 1.6735090017318726, + "learning_rate": 2.2740990719600026e-05, + "loss": 0.119, + "num_input_tokens_seen": 76205696, + "step": 35305 + }, + { + "epoch": 5.760195758564437, + "grad_norm": 0.3426590859889984, + "learning_rate": 2.2733901947702852e-05, + "loss": 0.1482, + "num_input_tokens_seen": 76215072, + "step": 35310 + }, + { + "epoch": 5.761011419249592, + "grad_norm": 0.9131085872650146, + "learning_rate": 2.2726813359507084e-05, + "loss": 0.2032, + "num_input_tokens_seen": 76225248, + "step": 35315 + }, + { + "epoch": 5.761827079934747, + "grad_norm": 1.1801096200942993, + "learning_rate": 2.271972495558736e-05, + "loss": 0.1223, + "num_input_tokens_seen": 76235552, + "step": 35320 + }, + { + "epoch": 5.762642740619902, + "grad_norm": 1.2221084833145142, + "learning_rate": 2.27126367365183e-05, + "loss": 0.1784, + "num_input_tokens_seen": 76246624, + "step": 35325 + }, + { + "epoch": 5.763458401305057, + "grad_norm": 1.3893240690231323, + "learning_rate": 2.2705548702874512e-05, + "loss": 0.0921, + "num_input_tokens_seen": 76256352, + "step": 35330 + }, + { + "epoch": 5.764274061990212, + "grad_norm": 0.17188148200511932, + "learning_rate": 2.269846085523059e-05, + "loss": 0.1372, + "num_input_tokens_seen": 76266912, + "step": 35335 + }, + { + "epoch": 5.765089722675367, + "grad_norm": 0.329385906457901, + "learning_rate": 2.2691373194161107e-05, + "loss": 0.217, + "num_input_tokens_seen": 76277344, + "step": 35340 + }, + { + "epoch": 5.765905383360522, + "grad_norm": 2.2937424182891846, + "learning_rate": 2.2684285720240624e-05, + "loss": 0.2031, + "num_input_tokens_seen": 76288544, + "step": 35345 + }, + { + "epoch": 5.766721044045677, + "grad_norm": 1.8719924688339233, + "learning_rate": 2.2677198434043695e-05, + "loss": 0.1835, + "num_input_tokens_seen": 76299744, + "step": 35350 + }, + { + "epoch": 5.767536704730832, + "grad_norm": 0.5595280528068542, + "learning_rate": 2.2670111336144844e-05, + "loss": 0.1819, + "num_input_tokens_seen": 76309280, + "step": 35355 + }, + { + "epoch": 5.768352365415987, + "grad_norm": 0.6324966549873352, + "learning_rate": 2.2663024427118592e-05, + "loss": 0.0931, + "num_input_tokens_seen": 76319072, + "step": 35360 + }, + { + "epoch": 5.769168026101142, + "grad_norm": 0.6825203895568848, + "learning_rate": 2.2655937707539437e-05, + "loss": 0.1, + "num_input_tokens_seen": 76329696, + "step": 35365 + }, + { + "epoch": 5.769983686786297, + "grad_norm": 0.19529519975185394, + "learning_rate": 2.2648851177981868e-05, + "loss": 0.2042, + "num_input_tokens_seen": 76339904, + "step": 35370 + }, + { + "epoch": 5.770799347471452, + "grad_norm": 0.5741446018218994, + "learning_rate": 2.264176483902035e-05, + "loss": 0.0645, + "num_input_tokens_seen": 76351776, + "step": 35375 + }, + { + "epoch": 5.771615008156607, + "grad_norm": 0.13455399870872498, + "learning_rate": 2.263467869122934e-05, + "loss": 0.1511, + "num_input_tokens_seen": 76362208, + "step": 35380 + }, + { + "epoch": 5.7724306688417615, + "grad_norm": 0.4356161057949066, + "learning_rate": 2.262759273518327e-05, + "loss": 0.0605, + "num_input_tokens_seen": 76374624, + "step": 35385 + }, + { + "epoch": 5.773246329526917, + "grad_norm": 0.16513119637966156, + "learning_rate": 2.262050697145657e-05, + "loss": 0.0838, + "num_input_tokens_seen": 76384064, + "step": 35390 + }, + { + "epoch": 5.774061990212072, + "grad_norm": 1.1841497421264648, + "learning_rate": 2.2613421400623653e-05, + "loss": 0.0693, + "num_input_tokens_seen": 76394976, + "step": 35395 + }, + { + "epoch": 5.774877650897227, + "grad_norm": 0.44148150086402893, + "learning_rate": 2.2606336023258907e-05, + "loss": 0.2271, + "num_input_tokens_seen": 76407136, + "step": 35400 + }, + { + "epoch": 5.775693311582382, + "grad_norm": 0.817480206489563, + "learning_rate": 2.259925083993671e-05, + "loss": 0.0954, + "num_input_tokens_seen": 76417536, + "step": 35405 + }, + { + "epoch": 5.7765089722675365, + "grad_norm": 0.057790692895650864, + "learning_rate": 2.2592165851231423e-05, + "loss": 0.0259, + "num_input_tokens_seen": 76428576, + "step": 35410 + }, + { + "epoch": 5.777324632952691, + "grad_norm": 1.4110851287841797, + "learning_rate": 2.2585081057717387e-05, + "loss": 0.1295, + "num_input_tokens_seen": 76439040, + "step": 35415 + }, + { + "epoch": 5.778140293637847, + "grad_norm": 0.4191972315311432, + "learning_rate": 2.2577996459968935e-05, + "loss": 0.2766, + "num_input_tokens_seen": 76450624, + "step": 35420 + }, + { + "epoch": 5.778955954323002, + "grad_norm": 0.07142219692468643, + "learning_rate": 2.2570912058560375e-05, + "loss": 0.1411, + "num_input_tokens_seen": 76459840, + "step": 35425 + }, + { + "epoch": 5.779771615008157, + "grad_norm": 1.3175666332244873, + "learning_rate": 2.2563827854066007e-05, + "loss": 0.2116, + "num_input_tokens_seen": 76470688, + "step": 35430 + }, + { + "epoch": 5.780587275693311, + "grad_norm": 0.20303930342197418, + "learning_rate": 2.2556743847060118e-05, + "loss": 0.2016, + "num_input_tokens_seen": 76481728, + "step": 35435 + }, + { + "epoch": 5.781402936378466, + "grad_norm": 2.3511593341827393, + "learning_rate": 2.254966003811697e-05, + "loss": 0.1479, + "num_input_tokens_seen": 76492032, + "step": 35440 + }, + { + "epoch": 5.782218597063622, + "grad_norm": 1.1023997068405151, + "learning_rate": 2.2542576427810813e-05, + "loss": 0.0645, + "num_input_tokens_seen": 76502784, + "step": 35445 + }, + { + "epoch": 5.783034257748777, + "grad_norm": 0.17986084520816803, + "learning_rate": 2.2535493016715882e-05, + "loss": 0.052, + "num_input_tokens_seen": 76513856, + "step": 35450 + }, + { + "epoch": 5.783849918433932, + "grad_norm": 0.29729312658309937, + "learning_rate": 2.2528409805406388e-05, + "loss": 0.0429, + "num_input_tokens_seen": 76525056, + "step": 35455 + }, + { + "epoch": 5.784665579119086, + "grad_norm": 1.4007925987243652, + "learning_rate": 2.2521326794456537e-05, + "loss": 0.1258, + "num_input_tokens_seen": 76535584, + "step": 35460 + }, + { + "epoch": 5.785481239804241, + "grad_norm": 0.750909149646759, + "learning_rate": 2.2514243984440512e-05, + "loss": 0.0975, + "num_input_tokens_seen": 76547168, + "step": 35465 + }, + { + "epoch": 5.786296900489396, + "grad_norm": 0.14322616159915924, + "learning_rate": 2.2507161375932484e-05, + "loss": 0.0467, + "num_input_tokens_seen": 76557888, + "step": 35470 + }, + { + "epoch": 5.787112561174552, + "grad_norm": 0.7631579041481018, + "learning_rate": 2.25000789695066e-05, + "loss": 0.076, + "num_input_tokens_seen": 76568864, + "step": 35475 + }, + { + "epoch": 5.787928221859707, + "grad_norm": 0.34894630312919617, + "learning_rate": 2.2492996765737004e-05, + "loss": 0.1217, + "num_input_tokens_seen": 76579296, + "step": 35480 + }, + { + "epoch": 5.788743882544861, + "grad_norm": 0.1100822165608406, + "learning_rate": 2.2485914765197807e-05, + "loss": 0.0339, + "num_input_tokens_seen": 76589824, + "step": 35485 + }, + { + "epoch": 5.789559543230016, + "grad_norm": 0.602289617061615, + "learning_rate": 2.247883296846311e-05, + "loss": 0.2052, + "num_input_tokens_seen": 76599744, + "step": 35490 + }, + { + "epoch": 5.790375203915171, + "grad_norm": 0.31225913763046265, + "learning_rate": 2.2471751376107006e-05, + "loss": 0.0911, + "num_input_tokens_seen": 76612000, + "step": 35495 + }, + { + "epoch": 5.791190864600326, + "grad_norm": 0.42806610465049744, + "learning_rate": 2.246466998870357e-05, + "loss": 0.0728, + "num_input_tokens_seen": 76623936, + "step": 35500 + }, + { + "epoch": 5.7920065252854815, + "grad_norm": 0.13211219012737274, + "learning_rate": 2.245758880682685e-05, + "loss": 0.0712, + "num_input_tokens_seen": 76634240, + "step": 35505 + }, + { + "epoch": 5.792822185970636, + "grad_norm": 0.2439313381910324, + "learning_rate": 2.2450507831050876e-05, + "loss": 0.1611, + "num_input_tokens_seen": 76645344, + "step": 35510 + }, + { + "epoch": 5.793637846655791, + "grad_norm": 0.13011877238750458, + "learning_rate": 2.2443427061949672e-05, + "loss": 0.018, + "num_input_tokens_seen": 76656000, + "step": 35515 + }, + { + "epoch": 5.794453507340946, + "grad_norm": 1.568411946296692, + "learning_rate": 2.2436346500097247e-05, + "loss": 0.11, + "num_input_tokens_seen": 76666976, + "step": 35520 + }, + { + "epoch": 5.795269168026101, + "grad_norm": 3.708010673522949, + "learning_rate": 2.2429266146067582e-05, + "loss": 0.237, + "num_input_tokens_seen": 76676192, + "step": 35525 + }, + { + "epoch": 5.7960848287112565, + "grad_norm": 0.33181092143058777, + "learning_rate": 2.242218600043465e-05, + "loss": 0.1399, + "num_input_tokens_seen": 76687104, + "step": 35530 + }, + { + "epoch": 5.796900489396411, + "grad_norm": 1.541534662246704, + "learning_rate": 2.2415106063772394e-05, + "loss": 0.1248, + "num_input_tokens_seen": 76699104, + "step": 35535 + }, + { + "epoch": 5.797716150081566, + "grad_norm": 0.06079941615462303, + "learning_rate": 2.240802633665476e-05, + "loss": 0.2088, + "num_input_tokens_seen": 76710464, + "step": 35540 + }, + { + "epoch": 5.798531810766721, + "grad_norm": 0.307283490896225, + "learning_rate": 2.2400946819655663e-05, + "loss": 0.0523, + "num_input_tokens_seen": 76720864, + "step": 35545 + }, + { + "epoch": 5.799347471451876, + "grad_norm": 0.06545855849981308, + "learning_rate": 2.2393867513349002e-05, + "loss": 0.1079, + "num_input_tokens_seen": 76731520, + "step": 35550 + }, + { + "epoch": 5.800163132137031, + "grad_norm": 2.3016021251678467, + "learning_rate": 2.238678841830867e-05, + "loss": 0.2078, + "num_input_tokens_seen": 76741536, + "step": 35555 + }, + { + "epoch": 5.800978792822186, + "grad_norm": 0.08474590629339218, + "learning_rate": 2.2379709535108524e-05, + "loss": 0.0544, + "num_input_tokens_seen": 76751904, + "step": 35560 + }, + { + "epoch": 5.801794453507341, + "grad_norm": 1.5767675638198853, + "learning_rate": 2.2372630864322416e-05, + "loss": 0.2947, + "num_input_tokens_seen": 76761504, + "step": 35565 + }, + { + "epoch": 5.802610114192496, + "grad_norm": 0.42994242906570435, + "learning_rate": 2.2365552406524183e-05, + "loss": 0.0356, + "num_input_tokens_seen": 76771968, + "step": 35570 + }, + { + "epoch": 5.803425774877651, + "grad_norm": 1.5422430038452148, + "learning_rate": 2.2358474162287635e-05, + "loss": 0.1426, + "num_input_tokens_seen": 76782432, + "step": 35575 + }, + { + "epoch": 5.804241435562806, + "grad_norm": 1.1938533782958984, + "learning_rate": 2.235139613218658e-05, + "loss": 0.1616, + "num_input_tokens_seen": 76793152, + "step": 35580 + }, + { + "epoch": 5.80505709624796, + "grad_norm": 0.9243000149726868, + "learning_rate": 2.234431831679479e-05, + "loss": 0.1019, + "num_input_tokens_seen": 76804000, + "step": 35585 + }, + { + "epoch": 5.805872756933116, + "grad_norm": 0.36149558424949646, + "learning_rate": 2.2337240716686035e-05, + "loss": 0.0637, + "num_input_tokens_seen": 76814432, + "step": 35590 + }, + { + "epoch": 5.806688417618271, + "grad_norm": 0.4774553179740906, + "learning_rate": 2.2330163332434056e-05, + "loss": 0.1502, + "num_input_tokens_seen": 76825472, + "step": 35595 + }, + { + "epoch": 5.807504078303426, + "grad_norm": 0.17207126319408417, + "learning_rate": 2.2323086164612584e-05, + "loss": 0.0179, + "num_input_tokens_seen": 76837568, + "step": 35600 + }, + { + "epoch": 5.808319738988581, + "grad_norm": 0.2748345732688904, + "learning_rate": 2.2316009213795323e-05, + "loss": 0.0378, + "num_input_tokens_seen": 76848768, + "step": 35605 + }, + { + "epoch": 5.809135399673735, + "grad_norm": 1.665005087852478, + "learning_rate": 2.230893248055598e-05, + "loss": 0.0922, + "num_input_tokens_seen": 76860000, + "step": 35610 + }, + { + "epoch": 5.809951060358891, + "grad_norm": 0.10731276124715805, + "learning_rate": 2.2301855965468226e-05, + "loss": 0.0858, + "num_input_tokens_seen": 76871904, + "step": 35615 + }, + { + "epoch": 5.810766721044046, + "grad_norm": 0.7904080152511597, + "learning_rate": 2.2294779669105716e-05, + "loss": 0.0957, + "num_input_tokens_seen": 76881504, + "step": 35620 + }, + { + "epoch": 5.811582381729201, + "grad_norm": 2.586336851119995, + "learning_rate": 2.2287703592042096e-05, + "loss": 0.2371, + "num_input_tokens_seen": 76894080, + "step": 35625 + }, + { + "epoch": 5.8123980424143555, + "grad_norm": 0.8000916838645935, + "learning_rate": 2.2280627734850984e-05, + "loss": 0.103, + "num_input_tokens_seen": 76904672, + "step": 35630 + }, + { + "epoch": 5.81321370309951, + "grad_norm": 1.424573302268982, + "learning_rate": 2.2273552098105983e-05, + "loss": 0.0803, + "num_input_tokens_seen": 76914976, + "step": 35635 + }, + { + "epoch": 5.814029363784666, + "grad_norm": 1.5610053539276123, + "learning_rate": 2.2266476682380685e-05, + "loss": 0.2271, + "num_input_tokens_seen": 76925824, + "step": 35640 + }, + { + "epoch": 5.814845024469821, + "grad_norm": 0.4056338369846344, + "learning_rate": 2.2259401488248658e-05, + "loss": 0.0655, + "num_input_tokens_seen": 76936160, + "step": 35645 + }, + { + "epoch": 5.815660685154976, + "grad_norm": 1.040636658668518, + "learning_rate": 2.225232651628345e-05, + "loss": 0.1065, + "num_input_tokens_seen": 76946016, + "step": 35650 + }, + { + "epoch": 5.8164763458401305, + "grad_norm": 1.772929072380066, + "learning_rate": 2.2245251767058595e-05, + "loss": 0.1513, + "num_input_tokens_seen": 76956704, + "step": 35655 + }, + { + "epoch": 5.817292006525285, + "grad_norm": 0.9087326526641846, + "learning_rate": 2.2238177241147607e-05, + "loss": 0.0555, + "num_input_tokens_seen": 76967360, + "step": 35660 + }, + { + "epoch": 5.81810766721044, + "grad_norm": 0.5664535760879517, + "learning_rate": 2.223110293912399e-05, + "loss": 0.1105, + "num_input_tokens_seen": 76979008, + "step": 35665 + }, + { + "epoch": 5.818923327895595, + "grad_norm": 0.04775217920541763, + "learning_rate": 2.2224028861561215e-05, + "loss": 0.1634, + "num_input_tokens_seen": 76988800, + "step": 35670 + }, + { + "epoch": 5.819738988580751, + "grad_norm": 1.9678328037261963, + "learning_rate": 2.2216955009032747e-05, + "loss": 0.3235, + "num_input_tokens_seen": 76999616, + "step": 35675 + }, + { + "epoch": 5.8205546492659055, + "grad_norm": 0.1731863170862198, + "learning_rate": 2.2209881382112026e-05, + "loss": 0.0232, + "num_input_tokens_seen": 77010656, + "step": 35680 + }, + { + "epoch": 5.82137030995106, + "grad_norm": 0.039809584617614746, + "learning_rate": 2.220280798137248e-05, + "loss": 0.1414, + "num_input_tokens_seen": 77021920, + "step": 35685 + }, + { + "epoch": 5.822185970636215, + "grad_norm": 1.7238339185714722, + "learning_rate": 2.219573480738751e-05, + "loss": 0.0688, + "num_input_tokens_seen": 77032416, + "step": 35690 + }, + { + "epoch": 5.82300163132137, + "grad_norm": 1.6232447624206543, + "learning_rate": 2.2188661860730507e-05, + "loss": 0.1193, + "num_input_tokens_seen": 77042944, + "step": 35695 + }, + { + "epoch": 5.823817292006526, + "grad_norm": 0.9126656651496887, + "learning_rate": 2.2181589141974836e-05, + "loss": 0.0506, + "num_input_tokens_seen": 77051584, + "step": 35700 + }, + { + "epoch": 5.8246329526916805, + "grad_norm": 1.6667828559875488, + "learning_rate": 2.217451665169385e-05, + "loss": 0.1709, + "num_input_tokens_seen": 77062048, + "step": 35705 + }, + { + "epoch": 5.825448613376835, + "grad_norm": 0.3192729353904724, + "learning_rate": 2.216744439046087e-05, + "loss": 0.1978, + "num_input_tokens_seen": 77074080, + "step": 35710 + }, + { + "epoch": 5.82626427406199, + "grad_norm": 0.386771559715271, + "learning_rate": 2.2160372358849234e-05, + "loss": 0.2959, + "num_input_tokens_seen": 77085632, + "step": 35715 + }, + { + "epoch": 5.827079934747145, + "grad_norm": 0.2889782786369324, + "learning_rate": 2.215330055743222e-05, + "loss": 0.018, + "num_input_tokens_seen": 77097664, + "step": 35720 + }, + { + "epoch": 5.827895595432301, + "grad_norm": 0.1934361755847931, + "learning_rate": 2.2146228986783105e-05, + "loss": 0.0465, + "num_input_tokens_seen": 77109568, + "step": 35725 + }, + { + "epoch": 5.828711256117455, + "grad_norm": 1.520974040031433, + "learning_rate": 2.213915764747515e-05, + "loss": 0.1598, + "num_input_tokens_seen": 77119648, + "step": 35730 + }, + { + "epoch": 5.82952691680261, + "grad_norm": 1.4342284202575684, + "learning_rate": 2.2132086540081593e-05, + "loss": 0.0605, + "num_input_tokens_seen": 77132064, + "step": 35735 + }, + { + "epoch": 5.830342577487765, + "grad_norm": 0.4122923016548157, + "learning_rate": 2.212501566517565e-05, + "loss": 0.0417, + "num_input_tokens_seen": 77142880, + "step": 35740 + }, + { + "epoch": 5.83115823817292, + "grad_norm": 0.9696914553642273, + "learning_rate": 2.211794502333052e-05, + "loss": 0.0821, + "num_input_tokens_seen": 77154272, + "step": 35745 + }, + { + "epoch": 5.831973898858075, + "grad_norm": 1.7119977474212646, + "learning_rate": 2.2110874615119396e-05, + "loss": 0.2933, + "num_input_tokens_seen": 77165280, + "step": 35750 + }, + { + "epoch": 5.8327895595432295, + "grad_norm": 1.3630074262619019, + "learning_rate": 2.2103804441115434e-05, + "loss": 0.0669, + "num_input_tokens_seen": 77176032, + "step": 35755 + }, + { + "epoch": 5.833605220228385, + "grad_norm": 0.13490362465381622, + "learning_rate": 2.209673450189178e-05, + "loss": 0.1265, + "num_input_tokens_seen": 77186880, + "step": 35760 + }, + { + "epoch": 5.83442088091354, + "grad_norm": 1.0787986516952515, + "learning_rate": 2.208966479802156e-05, + "loss": 0.1587, + "num_input_tokens_seen": 77196704, + "step": 35765 + }, + { + "epoch": 5.835236541598695, + "grad_norm": 1.1688522100448608, + "learning_rate": 2.2082595330077878e-05, + "loss": 0.0879, + "num_input_tokens_seen": 77208064, + "step": 35770 + }, + { + "epoch": 5.83605220228385, + "grad_norm": 0.685676634311676, + "learning_rate": 2.2075526098633816e-05, + "loss": 0.0895, + "num_input_tokens_seen": 77219328, + "step": 35775 + }, + { + "epoch": 5.8368678629690045, + "grad_norm": 0.1621507853269577, + "learning_rate": 2.206845710426245e-05, + "loss": 0.0471, + "num_input_tokens_seen": 77230592, + "step": 35780 + }, + { + "epoch": 5.83768352365416, + "grad_norm": 0.6451820731163025, + "learning_rate": 2.2061388347536828e-05, + "loss": 0.1247, + "num_input_tokens_seen": 77240992, + "step": 35785 + }, + { + "epoch": 5.838499184339315, + "grad_norm": 0.9332337379455566, + "learning_rate": 2.2054319829029975e-05, + "loss": 0.1492, + "num_input_tokens_seen": 77250144, + "step": 35790 + }, + { + "epoch": 5.83931484502447, + "grad_norm": 0.04069116711616516, + "learning_rate": 2.2047251549314907e-05, + "loss": 0.0945, + "num_input_tokens_seen": 77262464, + "step": 35795 + }, + { + "epoch": 5.840130505709625, + "grad_norm": 1.6503252983093262, + "learning_rate": 2.20401835089646e-05, + "loss": 0.2365, + "num_input_tokens_seen": 77273248, + "step": 35800 + }, + { + "epoch": 5.8409461663947795, + "grad_norm": 1.3946945667266846, + "learning_rate": 2.2033115708552044e-05, + "loss": 0.2143, + "num_input_tokens_seen": 77284032, + "step": 35805 + }, + { + "epoch": 5.841761827079935, + "grad_norm": 0.6170226335525513, + "learning_rate": 2.202604814865018e-05, + "loss": 0.0699, + "num_input_tokens_seen": 77295072, + "step": 35810 + }, + { + "epoch": 5.84257748776509, + "grad_norm": 2.0482466220855713, + "learning_rate": 2.201898082983194e-05, + "loss": 0.2622, + "num_input_tokens_seen": 77306208, + "step": 35815 + }, + { + "epoch": 5.843393148450245, + "grad_norm": 0.20397228002548218, + "learning_rate": 2.2011913752670242e-05, + "loss": 0.0504, + "num_input_tokens_seen": 77317024, + "step": 35820 + }, + { + "epoch": 5.8442088091354, + "grad_norm": 1.798384666442871, + "learning_rate": 2.2004846917737978e-05, + "loss": 0.1892, + "num_input_tokens_seen": 77327648, + "step": 35825 + }, + { + "epoch": 5.8450244698205545, + "grad_norm": 0.5982685089111328, + "learning_rate": 2.1997780325608013e-05, + "loss": 0.0564, + "num_input_tokens_seen": 77338304, + "step": 35830 + }, + { + "epoch": 5.845840130505709, + "grad_norm": 0.2976589500904083, + "learning_rate": 2.1990713976853216e-05, + "loss": 0.085, + "num_input_tokens_seen": 77348768, + "step": 35835 + }, + { + "epoch": 5.846655791190865, + "grad_norm": 0.22083783149719238, + "learning_rate": 2.1983647872046412e-05, + "loss": 0.1248, + "num_input_tokens_seen": 77358208, + "step": 35840 + }, + { + "epoch": 5.84747145187602, + "grad_norm": 0.05233761668205261, + "learning_rate": 2.1976582011760415e-05, + "loss": 0.058, + "num_input_tokens_seen": 77368896, + "step": 35845 + }, + { + "epoch": 5.848287112561175, + "grad_norm": 0.25483331084251404, + "learning_rate": 2.1969516396568023e-05, + "loss": 0.0864, + "num_input_tokens_seen": 77379616, + "step": 35850 + }, + { + "epoch": 5.849102773246329, + "grad_norm": 0.9229902625083923, + "learning_rate": 2.196245102704201e-05, + "loss": 0.0785, + "num_input_tokens_seen": 77390400, + "step": 35855 + }, + { + "epoch": 5.849918433931484, + "grad_norm": 1.0879507064819336, + "learning_rate": 2.1955385903755127e-05, + "loss": 0.142, + "num_input_tokens_seen": 77400544, + "step": 35860 + }, + { + "epoch": 5.850734094616639, + "grad_norm": 0.4752790331840515, + "learning_rate": 2.1948321027280108e-05, + "loss": 0.0688, + "num_input_tokens_seen": 77411136, + "step": 35865 + }, + { + "epoch": 5.851549755301795, + "grad_norm": 0.22942009568214417, + "learning_rate": 2.1941256398189676e-05, + "loss": 0.2013, + "num_input_tokens_seen": 77422240, + "step": 35870 + }, + { + "epoch": 5.85236541598695, + "grad_norm": 0.27538853883743286, + "learning_rate": 2.1934192017056515e-05, + "loss": 0.0742, + "num_input_tokens_seen": 77431648, + "step": 35875 + }, + { + "epoch": 5.853181076672104, + "grad_norm": 0.4424043893814087, + "learning_rate": 2.1927127884453307e-05, + "loss": 0.0629, + "num_input_tokens_seen": 77443680, + "step": 35880 + }, + { + "epoch": 5.853996737357259, + "grad_norm": 0.3454330861568451, + "learning_rate": 2.19200640009527e-05, + "loss": 0.0772, + "num_input_tokens_seen": 77453824, + "step": 35885 + }, + { + "epoch": 5.854812398042414, + "grad_norm": 0.0801226943731308, + "learning_rate": 2.1913000367127337e-05, + "loss": 0.1864, + "num_input_tokens_seen": 77464512, + "step": 35890 + }, + { + "epoch": 5.85562805872757, + "grad_norm": 1.369702935218811, + "learning_rate": 2.190593698354983e-05, + "loss": 0.2427, + "num_input_tokens_seen": 77475584, + "step": 35895 + }, + { + "epoch": 5.856443719412725, + "grad_norm": 1.2049825191497803, + "learning_rate": 2.1898873850792768e-05, + "loss": 0.1446, + "num_input_tokens_seen": 77486464, + "step": 35900 + }, + { + "epoch": 5.857259380097879, + "grad_norm": 1.8446846008300781, + "learning_rate": 2.1891810969428724e-05, + "loss": 0.1289, + "num_input_tokens_seen": 77495168, + "step": 35905 + }, + { + "epoch": 5.858075040783034, + "grad_norm": 2.2439215183258057, + "learning_rate": 2.1884748340030255e-05, + "loss": 0.144, + "num_input_tokens_seen": 77505632, + "step": 35910 + }, + { + "epoch": 5.858890701468189, + "grad_norm": 0.25688305497169495, + "learning_rate": 2.1877685963169893e-05, + "loss": 0.1025, + "num_input_tokens_seen": 77516512, + "step": 35915 + }, + { + "epoch": 5.859706362153344, + "grad_norm": 1.4145053625106812, + "learning_rate": 2.187062383942015e-05, + "loss": 0.2051, + "num_input_tokens_seen": 77528352, + "step": 35920 + }, + { + "epoch": 5.8605220228384995, + "grad_norm": 0.6181437969207764, + "learning_rate": 2.186356196935351e-05, + "loss": 0.1794, + "num_input_tokens_seen": 77537728, + "step": 35925 + }, + { + "epoch": 5.861337683523654, + "grad_norm": 0.06667118519544601, + "learning_rate": 2.185650035354245e-05, + "loss": 0.1729, + "num_input_tokens_seen": 77549216, + "step": 35930 + }, + { + "epoch": 5.862153344208809, + "grad_norm": 1.5411523580551147, + "learning_rate": 2.184943899255943e-05, + "loss": 0.2419, + "num_input_tokens_seen": 77560288, + "step": 35935 + }, + { + "epoch": 5.862969004893964, + "grad_norm": 2.0659618377685547, + "learning_rate": 2.1842377886976873e-05, + "loss": 0.1839, + "num_input_tokens_seen": 77571968, + "step": 35940 + }, + { + "epoch": 5.863784665579119, + "grad_norm": 0.09531906247138977, + "learning_rate": 2.183531703736718e-05, + "loss": 0.0376, + "num_input_tokens_seen": 77583872, + "step": 35945 + }, + { + "epoch": 5.864600326264274, + "grad_norm": 0.11095280200242996, + "learning_rate": 2.182825644430275e-05, + "loss": 0.1269, + "num_input_tokens_seen": 77595232, + "step": 35950 + }, + { + "epoch": 5.865415986949429, + "grad_norm": 0.08216061443090439, + "learning_rate": 2.1821196108355944e-05, + "loss": 0.0239, + "num_input_tokens_seen": 77605696, + "step": 35955 + }, + { + "epoch": 5.866231647634584, + "grad_norm": 0.17441889643669128, + "learning_rate": 2.181413603009911e-05, + "loss": 0.0841, + "num_input_tokens_seen": 77616192, + "step": 35960 + }, + { + "epoch": 5.867047308319739, + "grad_norm": 0.8034014701843262, + "learning_rate": 2.1807076210104575e-05, + "loss": 0.0539, + "num_input_tokens_seen": 77626688, + "step": 35965 + }, + { + "epoch": 5.867862969004894, + "grad_norm": 0.19544656574726105, + "learning_rate": 2.1800016648944638e-05, + "loss": 0.0713, + "num_input_tokens_seen": 77637152, + "step": 35970 + }, + { + "epoch": 5.868678629690049, + "grad_norm": 1.018566608428955, + "learning_rate": 2.1792957347191594e-05, + "loss": 0.0624, + "num_input_tokens_seen": 77647328, + "step": 35975 + }, + { + "epoch": 5.869494290375204, + "grad_norm": 0.32629159092903137, + "learning_rate": 2.1785898305417698e-05, + "loss": 0.0347, + "num_input_tokens_seen": 77657824, + "step": 35980 + }, + { + "epoch": 5.870309951060359, + "grad_norm": 0.08203940093517303, + "learning_rate": 2.1778839524195195e-05, + "loss": 0.0539, + "num_input_tokens_seen": 77668768, + "step": 35985 + }, + { + "epoch": 5.871125611745514, + "grad_norm": 2.328958034515381, + "learning_rate": 2.1771781004096304e-05, + "loss": 0.1305, + "num_input_tokens_seen": 77679040, + "step": 35990 + }, + { + "epoch": 5.871941272430669, + "grad_norm": 0.07304451614618301, + "learning_rate": 2.1764722745693223e-05, + "loss": 0.1365, + "num_input_tokens_seen": 77690304, + "step": 35995 + }, + { + "epoch": 5.872756933115824, + "grad_norm": 1.7445805072784424, + "learning_rate": 2.1757664749558132e-05, + "loss": 0.0833, + "num_input_tokens_seen": 77700480, + "step": 36000 + }, + { + "epoch": 5.873572593800979, + "grad_norm": 0.507717490196228, + "learning_rate": 2.1750607016263192e-05, + "loss": 0.0941, + "num_input_tokens_seen": 77711552, + "step": 36005 + }, + { + "epoch": 5.874388254486134, + "grad_norm": 0.5241026282310486, + "learning_rate": 2.1743549546380527e-05, + "loss": 0.1786, + "num_input_tokens_seen": 77722944, + "step": 36010 + }, + { + "epoch": 5.875203915171289, + "grad_norm": 0.08339208364486694, + "learning_rate": 2.1736492340482267e-05, + "loss": 0.0443, + "num_input_tokens_seen": 77735072, + "step": 36015 + }, + { + "epoch": 5.876019575856444, + "grad_norm": 0.048925772309303284, + "learning_rate": 2.172943539914049e-05, + "loss": 0.0293, + "num_input_tokens_seen": 77746848, + "step": 36020 + }, + { + "epoch": 5.876835236541599, + "grad_norm": 0.17727376520633698, + "learning_rate": 2.172237872292728e-05, + "loss": 0.0952, + "num_input_tokens_seen": 77757664, + "step": 36025 + }, + { + "epoch": 5.877650897226753, + "grad_norm": 0.10976528376340866, + "learning_rate": 2.1715322312414664e-05, + "loss": 0.0761, + "num_input_tokens_seen": 77768288, + "step": 36030 + }, + { + "epoch": 5.878466557911908, + "grad_norm": 0.04055725038051605, + "learning_rate": 2.1708266168174703e-05, + "loss": 0.1173, + "num_input_tokens_seen": 77778080, + "step": 36035 + }, + { + "epoch": 5.879282218597064, + "grad_norm": 1.5486962795257568, + "learning_rate": 2.170121029077939e-05, + "loss": 0.1805, + "num_input_tokens_seen": 77790784, + "step": 36040 + }, + { + "epoch": 5.880097879282219, + "grad_norm": 0.6597937345504761, + "learning_rate": 2.1694154680800706e-05, + "loss": 0.1391, + "num_input_tokens_seen": 77801344, + "step": 36045 + }, + { + "epoch": 5.8809135399673735, + "grad_norm": 0.5852967500686646, + "learning_rate": 2.168709933881062e-05, + "loss": 0.0788, + "num_input_tokens_seen": 77811392, + "step": 36050 + }, + { + "epoch": 5.881729200652528, + "grad_norm": 1.7837048768997192, + "learning_rate": 2.168004426538106e-05, + "loss": 0.1178, + "num_input_tokens_seen": 77821312, + "step": 36055 + }, + { + "epoch": 5.882544861337683, + "grad_norm": 0.2123912125825882, + "learning_rate": 2.1672989461083964e-05, + "loss": 0.2354, + "num_input_tokens_seen": 77831872, + "step": 36060 + }, + { + "epoch": 5.883360522022839, + "grad_norm": 0.42070403695106506, + "learning_rate": 2.1665934926491226e-05, + "loss": 0.158, + "num_input_tokens_seen": 77842752, + "step": 36065 + }, + { + "epoch": 5.884176182707994, + "grad_norm": 0.375935822725296, + "learning_rate": 2.1658880662174717e-05, + "loss": 0.0835, + "num_input_tokens_seen": 77853696, + "step": 36070 + }, + { + "epoch": 5.8849918433931485, + "grad_norm": 1.4452121257781982, + "learning_rate": 2.1651826668706297e-05, + "loss": 0.1891, + "num_input_tokens_seen": 77864288, + "step": 36075 + }, + { + "epoch": 5.885807504078303, + "grad_norm": 0.20285755395889282, + "learning_rate": 2.1644772946657795e-05, + "loss": 0.0959, + "num_input_tokens_seen": 77874880, + "step": 36080 + }, + { + "epoch": 5.886623164763458, + "grad_norm": 0.32670965790748596, + "learning_rate": 2.163771949660102e-05, + "loss": 0.0598, + "num_input_tokens_seen": 77886688, + "step": 36085 + }, + { + "epoch": 5.887438825448614, + "grad_norm": 0.6877618432044983, + "learning_rate": 2.1630666319107767e-05, + "loss": 0.1142, + "num_input_tokens_seen": 77897376, + "step": 36090 + }, + { + "epoch": 5.888254486133769, + "grad_norm": 1.0914161205291748, + "learning_rate": 2.1623613414749797e-05, + "loss": 0.0755, + "num_input_tokens_seen": 77908736, + "step": 36095 + }, + { + "epoch": 5.8890701468189235, + "grad_norm": 0.08395864069461823, + "learning_rate": 2.1616560784098856e-05, + "loss": 0.2577, + "num_input_tokens_seen": 77920352, + "step": 36100 + }, + { + "epoch": 5.889885807504078, + "grad_norm": 0.4012202024459839, + "learning_rate": 2.160950842772666e-05, + "loss": 0.133, + "num_input_tokens_seen": 77931360, + "step": 36105 + }, + { + "epoch": 5.890701468189233, + "grad_norm": 0.5247132778167725, + "learning_rate": 2.160245634620492e-05, + "loss": 0.0931, + "num_input_tokens_seen": 77942208, + "step": 36110 + }, + { + "epoch": 5.891517128874388, + "grad_norm": 0.028093451634049416, + "learning_rate": 2.1595404540105295e-05, + "loss": 0.0298, + "num_input_tokens_seen": 77953248, + "step": 36115 + }, + { + "epoch": 5.892332789559543, + "grad_norm": 0.21639838814735413, + "learning_rate": 2.1588353009999464e-05, + "loss": 0.0286, + "num_input_tokens_seen": 77963584, + "step": 36120 + }, + { + "epoch": 5.8931484502446985, + "grad_norm": 1.17374587059021, + "learning_rate": 2.158130175645905e-05, + "loss": 0.0746, + "num_input_tokens_seen": 77973920, + "step": 36125 + }, + { + "epoch": 5.893964110929853, + "grad_norm": 0.08162448555231094, + "learning_rate": 2.1574250780055654e-05, + "loss": 0.0564, + "num_input_tokens_seen": 77984736, + "step": 36130 + }, + { + "epoch": 5.894779771615008, + "grad_norm": 2.169862985610962, + "learning_rate": 2.156720008136087e-05, + "loss": 0.2796, + "num_input_tokens_seen": 77996160, + "step": 36135 + }, + { + "epoch": 5.895595432300163, + "grad_norm": 1.1835252046585083, + "learning_rate": 2.156014966094627e-05, + "loss": 0.194, + "num_input_tokens_seen": 78006592, + "step": 36140 + }, + { + "epoch": 5.896411092985318, + "grad_norm": 0.03459487482905388, + "learning_rate": 2.1553099519383394e-05, + "loss": 0.0981, + "num_input_tokens_seen": 78015936, + "step": 36145 + }, + { + "epoch": 5.897226753670473, + "grad_norm": 1.6962909698486328, + "learning_rate": 2.154604965724376e-05, + "loss": 0.2926, + "num_input_tokens_seen": 78028160, + "step": 36150 + }, + { + "epoch": 5.898042414355628, + "grad_norm": 0.049649935215711594, + "learning_rate": 2.1539000075098868e-05, + "loss": 0.0515, + "num_input_tokens_seen": 78039712, + "step": 36155 + }, + { + "epoch": 5.898858075040783, + "grad_norm": 0.1744745969772339, + "learning_rate": 2.1531950773520187e-05, + "loss": 0.0687, + "num_input_tokens_seen": 78051936, + "step": 36160 + }, + { + "epoch": 5.899673735725938, + "grad_norm": 0.9241953492164612, + "learning_rate": 2.1524901753079176e-05, + "loss": 0.2714, + "num_input_tokens_seen": 78062464, + "step": 36165 + }, + { + "epoch": 5.900489396411093, + "grad_norm": 0.4071631133556366, + "learning_rate": 2.1517853014347262e-05, + "loss": 0.059, + "num_input_tokens_seen": 78072352, + "step": 36170 + }, + { + "epoch": 5.901305057096248, + "grad_norm": 0.05377605929970741, + "learning_rate": 2.1510804557895847e-05, + "loss": 0.1079, + "num_input_tokens_seen": 78082816, + "step": 36175 + }, + { + "epoch": 5.902120717781403, + "grad_norm": 1.9559311866760254, + "learning_rate": 2.1503756384296323e-05, + "loss": 0.241, + "num_input_tokens_seen": 78092288, + "step": 36180 + }, + { + "epoch": 5.902936378466558, + "grad_norm": 0.520461916923523, + "learning_rate": 2.1496708494120043e-05, + "loss": 0.0972, + "num_input_tokens_seen": 78104608, + "step": 36185 + }, + { + "epoch": 5.903752039151713, + "grad_norm": 0.06770973652601242, + "learning_rate": 2.148966088793835e-05, + "loss": 0.0827, + "num_input_tokens_seen": 78116160, + "step": 36190 + }, + { + "epoch": 5.904567699836868, + "grad_norm": 0.35834822058677673, + "learning_rate": 2.1482613566322558e-05, + "loss": 0.2184, + "num_input_tokens_seen": 78127200, + "step": 36195 + }, + { + "epoch": 5.9053833605220225, + "grad_norm": 0.7860990762710571, + "learning_rate": 2.147556652984395e-05, + "loss": 0.163, + "num_input_tokens_seen": 78136928, + "step": 36200 + }, + { + "epoch": 5.906199021207177, + "grad_norm": 0.06807223707437515, + "learning_rate": 2.1468519779073805e-05, + "loss": 0.0696, + "num_input_tokens_seen": 78148704, + "step": 36205 + }, + { + "epoch": 5.907014681892333, + "grad_norm": 1.2382292747497559, + "learning_rate": 2.146147331458337e-05, + "loss": 0.1259, + "num_input_tokens_seen": 78158752, + "step": 36210 + }, + { + "epoch": 5.907830342577488, + "grad_norm": 0.8064499497413635, + "learning_rate": 2.1454427136943858e-05, + "loss": 0.3648, + "num_input_tokens_seen": 78167968, + "step": 36215 + }, + { + "epoch": 5.908646003262643, + "grad_norm": 1.7454198598861694, + "learning_rate": 2.1447381246726473e-05, + "loss": 0.1183, + "num_input_tokens_seen": 78178688, + "step": 36220 + }, + { + "epoch": 5.9094616639477975, + "grad_norm": 1.783477544784546, + "learning_rate": 2.144033564450239e-05, + "loss": 0.1456, + "num_input_tokens_seen": 78189376, + "step": 36225 + }, + { + "epoch": 5.910277324632952, + "grad_norm": 0.07744349539279938, + "learning_rate": 2.1433290330842764e-05, + "loss": 0.077, + "num_input_tokens_seen": 78200000, + "step": 36230 + }, + { + "epoch": 5.911092985318108, + "grad_norm": 1.231650471687317, + "learning_rate": 2.142624530631872e-05, + "loss": 0.0971, + "num_input_tokens_seen": 78210592, + "step": 36235 + }, + { + "epoch": 5.911908646003263, + "grad_norm": 0.7984469532966614, + "learning_rate": 2.1419200571501363e-05, + "loss": 0.1088, + "num_input_tokens_seen": 78220896, + "step": 36240 + }, + { + "epoch": 5.912724306688418, + "grad_norm": 0.997605562210083, + "learning_rate": 2.141215612696177e-05, + "loss": 0.059, + "num_input_tokens_seen": 78233088, + "step": 36245 + }, + { + "epoch": 5.9135399673735725, + "grad_norm": 0.9988604784011841, + "learning_rate": 2.140511197327101e-05, + "loss": 0.2988, + "num_input_tokens_seen": 78243680, + "step": 36250 + }, + { + "epoch": 5.914355628058727, + "grad_norm": 0.02668026275932789, + "learning_rate": 2.139806811100012e-05, + "loss": 0.0943, + "num_input_tokens_seen": 78253344, + "step": 36255 + }, + { + "epoch": 5.915171288743883, + "grad_norm": 0.06173250824213028, + "learning_rate": 2.1391024540720102e-05, + "loss": 0.1408, + "num_input_tokens_seen": 78264864, + "step": 36260 + }, + { + "epoch": 5.915986949429038, + "grad_norm": 0.2508784532546997, + "learning_rate": 2.1383981263001947e-05, + "loss": 0.0573, + "num_input_tokens_seen": 78276000, + "step": 36265 + }, + { + "epoch": 5.916802610114193, + "grad_norm": 0.6955304741859436, + "learning_rate": 2.1376938278416615e-05, + "loss": 0.1941, + "num_input_tokens_seen": 78287360, + "step": 36270 + }, + { + "epoch": 5.917618270799347, + "grad_norm": 0.09223299473524094, + "learning_rate": 2.1369895587535052e-05, + "loss": 0.096, + "num_input_tokens_seen": 78296608, + "step": 36275 + }, + { + "epoch": 5.918433931484502, + "grad_norm": 0.5154429078102112, + "learning_rate": 2.1362853190928172e-05, + "loss": 0.0835, + "num_input_tokens_seen": 78306848, + "step": 36280 + }, + { + "epoch": 5.919249592169657, + "grad_norm": 0.5667573809623718, + "learning_rate": 2.135581108916686e-05, + "loss": 0.029, + "num_input_tokens_seen": 78317504, + "step": 36285 + }, + { + "epoch": 5.920065252854813, + "grad_norm": 1.2974648475646973, + "learning_rate": 2.1348769282822e-05, + "loss": 0.1746, + "num_input_tokens_seen": 78327360, + "step": 36290 + }, + { + "epoch": 5.920880913539968, + "grad_norm": 0.07408157736063004, + "learning_rate": 2.1341727772464425e-05, + "loss": 0.1237, + "num_input_tokens_seen": 78338656, + "step": 36295 + }, + { + "epoch": 5.921696574225122, + "grad_norm": 0.5621784329414368, + "learning_rate": 2.1334686558664964e-05, + "loss": 0.2077, + "num_input_tokens_seen": 78350464, + "step": 36300 + }, + { + "epoch": 5.922512234910277, + "grad_norm": 1.7449541091918945, + "learning_rate": 2.1327645641994404e-05, + "loss": 0.1444, + "num_input_tokens_seen": 78360480, + "step": 36305 + }, + { + "epoch": 5.923327895595432, + "grad_norm": 1.4478201866149902, + "learning_rate": 2.1320605023023522e-05, + "loss": 0.0712, + "num_input_tokens_seen": 78371264, + "step": 36310 + }, + { + "epoch": 5.924143556280587, + "grad_norm": 1.545745611190796, + "learning_rate": 2.1313564702323064e-05, + "loss": 0.1925, + "num_input_tokens_seen": 78382176, + "step": 36315 + }, + { + "epoch": 5.924959216965743, + "grad_norm": 0.3494517207145691, + "learning_rate": 2.1306524680463758e-05, + "loss": 0.2699, + "num_input_tokens_seen": 78392704, + "step": 36320 + }, + { + "epoch": 5.925774877650897, + "grad_norm": 0.5785122513771057, + "learning_rate": 2.12994849580163e-05, + "loss": 0.3208, + "num_input_tokens_seen": 78404544, + "step": 36325 + }, + { + "epoch": 5.926590538336052, + "grad_norm": 0.13500815629959106, + "learning_rate": 2.129244553555137e-05, + "loss": 0.0954, + "num_input_tokens_seen": 78414400, + "step": 36330 + }, + { + "epoch": 5.927406199021207, + "grad_norm": 0.13741059601306915, + "learning_rate": 2.1285406413639616e-05, + "loss": 0.0961, + "num_input_tokens_seen": 78425664, + "step": 36335 + }, + { + "epoch": 5.928221859706362, + "grad_norm": 1.390944480895996, + "learning_rate": 2.1278367592851668e-05, + "loss": 0.0666, + "num_input_tokens_seen": 78437120, + "step": 36340 + }, + { + "epoch": 5.9290375203915175, + "grad_norm": 0.5312197804450989, + "learning_rate": 2.1271329073758118e-05, + "loss": 0.0963, + "num_input_tokens_seen": 78447456, + "step": 36345 + }, + { + "epoch": 5.929853181076672, + "grad_norm": 0.23515424132347107, + "learning_rate": 2.1264290856929553e-05, + "loss": 0.085, + "num_input_tokens_seen": 78457760, + "step": 36350 + }, + { + "epoch": 5.930668841761827, + "grad_norm": 0.1589874029159546, + "learning_rate": 2.125725294293653e-05, + "loss": 0.0366, + "num_input_tokens_seen": 78469248, + "step": 36355 + }, + { + "epoch": 5.931484502446982, + "grad_norm": 0.053969718515872955, + "learning_rate": 2.1250215332349575e-05, + "loss": 0.183, + "num_input_tokens_seen": 78480864, + "step": 36360 + }, + { + "epoch": 5.932300163132137, + "grad_norm": 1.6440715789794922, + "learning_rate": 2.1243178025739193e-05, + "loss": 0.1637, + "num_input_tokens_seen": 78492224, + "step": 36365 + }, + { + "epoch": 5.933115823817292, + "grad_norm": 0.6522055864334106, + "learning_rate": 2.1236141023675855e-05, + "loss": 0.0615, + "num_input_tokens_seen": 78503072, + "step": 36370 + }, + { + "epoch": 5.933931484502447, + "grad_norm": 0.3612748086452484, + "learning_rate": 2.122910432673003e-05, + "loss": 0.1915, + "num_input_tokens_seen": 78511744, + "step": 36375 + }, + { + "epoch": 5.934747145187602, + "grad_norm": 0.8383288383483887, + "learning_rate": 2.122206793547214e-05, + "loss": 0.1551, + "num_input_tokens_seen": 78522176, + "step": 36380 + }, + { + "epoch": 5.935562805872757, + "grad_norm": 0.22092437744140625, + "learning_rate": 2.1215031850472593e-05, + "loss": 0.0632, + "num_input_tokens_seen": 78532928, + "step": 36385 + }, + { + "epoch": 5.936378466557912, + "grad_norm": 0.4652670919895172, + "learning_rate": 2.120799607230177e-05, + "loss": 0.1324, + "num_input_tokens_seen": 78543552, + "step": 36390 + }, + { + "epoch": 5.937194127243067, + "grad_norm": 0.6669734716415405, + "learning_rate": 2.1200960601530022e-05, + "loss": 0.1117, + "num_input_tokens_seen": 78553824, + "step": 36395 + }, + { + "epoch": 5.938009787928221, + "grad_norm": 1.4828910827636719, + "learning_rate": 2.119392543872769e-05, + "loss": 0.1264, + "num_input_tokens_seen": 78565248, + "step": 36400 + }, + { + "epoch": 5.938825448613377, + "grad_norm": 0.7355093955993652, + "learning_rate": 2.1186890584465068e-05, + "loss": 0.1163, + "num_input_tokens_seen": 78577440, + "step": 36405 + }, + { + "epoch": 5.939641109298532, + "grad_norm": 1.696367621421814, + "learning_rate": 2.1179856039312446e-05, + "loss": 0.1921, + "num_input_tokens_seen": 78588160, + "step": 36410 + }, + { + "epoch": 5.940456769983687, + "grad_norm": 1.2901116609573364, + "learning_rate": 2.1172821803840077e-05, + "loss": 0.1319, + "num_input_tokens_seen": 78598976, + "step": 36415 + }, + { + "epoch": 5.941272430668842, + "grad_norm": 0.9677536487579346, + "learning_rate": 2.116578787861819e-05, + "loss": 0.2652, + "num_input_tokens_seen": 78608768, + "step": 36420 + }, + { + "epoch": 5.942088091353996, + "grad_norm": 0.225405752658844, + "learning_rate": 2.1158754264216992e-05, + "loss": 0.0908, + "num_input_tokens_seen": 78620736, + "step": 36425 + }, + { + "epoch": 5.942903752039152, + "grad_norm": 0.15411673486232758, + "learning_rate": 2.1151720961206657e-05, + "loss": 0.1481, + "num_input_tokens_seen": 78629216, + "step": 36430 + }, + { + "epoch": 5.943719412724307, + "grad_norm": 0.2160162478685379, + "learning_rate": 2.1144687970157357e-05, + "loss": 0.0535, + "num_input_tokens_seen": 78639040, + "step": 36435 + }, + { + "epoch": 5.944535073409462, + "grad_norm": 0.12948185205459595, + "learning_rate": 2.1137655291639206e-05, + "loss": 0.1257, + "num_input_tokens_seen": 78650624, + "step": 36440 + }, + { + "epoch": 5.945350734094617, + "grad_norm": 1.192501425743103, + "learning_rate": 2.113062292622232e-05, + "loss": 0.0788, + "num_input_tokens_seen": 78662368, + "step": 36445 + }, + { + "epoch": 5.946166394779771, + "grad_norm": 0.9106379151344299, + "learning_rate": 2.112359087447677e-05, + "loss": 0.147, + "num_input_tokens_seen": 78673984, + "step": 36450 + }, + { + "epoch": 5.946982055464927, + "grad_norm": 0.3759186267852783, + "learning_rate": 2.11165591369726e-05, + "loss": 0.041, + "num_input_tokens_seen": 78683328, + "step": 36455 + }, + { + "epoch": 5.947797716150082, + "grad_norm": 1.5580641031265259, + "learning_rate": 2.110952771427986e-05, + "loss": 0.1473, + "num_input_tokens_seen": 78694496, + "step": 36460 + }, + { + "epoch": 5.948613376835237, + "grad_norm": 0.8585734963417053, + "learning_rate": 2.110249660696855e-05, + "loss": 0.0687, + "num_input_tokens_seen": 78704832, + "step": 36465 + }, + { + "epoch": 5.9494290375203915, + "grad_norm": 2.2798309326171875, + "learning_rate": 2.1095465815608637e-05, + "loss": 0.1345, + "num_input_tokens_seen": 78715840, + "step": 36470 + }, + { + "epoch": 5.950244698205546, + "grad_norm": 0.23111610114574432, + "learning_rate": 2.1088435340770074e-05, + "loss": 0.0552, + "num_input_tokens_seen": 78726592, + "step": 36475 + }, + { + "epoch": 5.951060358890701, + "grad_norm": 0.3295735716819763, + "learning_rate": 2.108140518302279e-05, + "loss": 0.0959, + "num_input_tokens_seen": 78738048, + "step": 36480 + }, + { + "epoch": 5.951876019575856, + "grad_norm": 0.2134305089712143, + "learning_rate": 2.107437534293669e-05, + "loss": 0.0902, + "num_input_tokens_seen": 78749312, + "step": 36485 + }, + { + "epoch": 5.952691680261012, + "grad_norm": 0.10599318146705627, + "learning_rate": 2.106734582108164e-05, + "loss": 0.2207, + "num_input_tokens_seen": 78760512, + "step": 36490 + }, + { + "epoch": 5.9535073409461665, + "grad_norm": 1.102748990058899, + "learning_rate": 2.1060316618027493e-05, + "loss": 0.1151, + "num_input_tokens_seen": 78771072, + "step": 36495 + }, + { + "epoch": 5.954323001631321, + "grad_norm": 0.026224562898278236, + "learning_rate": 2.105328773434407e-05, + "loss": 0.0478, + "num_input_tokens_seen": 78781376, + "step": 36500 + }, + { + "epoch": 5.955138662316476, + "grad_norm": 0.43597325682640076, + "learning_rate": 2.1046259170601167e-05, + "loss": 0.1742, + "num_input_tokens_seen": 78792640, + "step": 36505 + }, + { + "epoch": 5.955954323001631, + "grad_norm": 1.1503396034240723, + "learning_rate": 2.1039230927368556e-05, + "loss": 0.11, + "num_input_tokens_seen": 78803776, + "step": 36510 + }, + { + "epoch": 5.956769983686787, + "grad_norm": 2.003201484680176, + "learning_rate": 2.1032203005215978e-05, + "loss": 0.1284, + "num_input_tokens_seen": 78813440, + "step": 36515 + }, + { + "epoch": 5.9575856443719415, + "grad_norm": 0.20198193192481995, + "learning_rate": 2.1025175404713167e-05, + "loss": 0.2026, + "num_input_tokens_seen": 78824736, + "step": 36520 + }, + { + "epoch": 5.958401305057096, + "grad_norm": 0.09352589398622513, + "learning_rate": 2.1018148126429797e-05, + "loss": 0.0967, + "num_input_tokens_seen": 78833920, + "step": 36525 + }, + { + "epoch": 5.959216965742251, + "grad_norm": 0.09614044427871704, + "learning_rate": 2.101112117093555e-05, + "loss": 0.0798, + "num_input_tokens_seen": 78845376, + "step": 36530 + }, + { + "epoch": 5.960032626427406, + "grad_norm": 0.9030516147613525, + "learning_rate": 2.1004094538800058e-05, + "loss": 0.0813, + "num_input_tokens_seen": 78858016, + "step": 36535 + }, + { + "epoch": 5.960848287112562, + "grad_norm": 1.0056709051132202, + "learning_rate": 2.099706823059294e-05, + "loss": 0.091, + "num_input_tokens_seen": 78869216, + "step": 36540 + }, + { + "epoch": 5.9616639477977165, + "grad_norm": 0.30634215474128723, + "learning_rate": 2.0990042246883777e-05, + "loss": 0.1147, + "num_input_tokens_seen": 78880096, + "step": 36545 + }, + { + "epoch": 5.962479608482871, + "grad_norm": 0.9440407156944275, + "learning_rate": 2.0983016588242145e-05, + "loss": 0.2113, + "num_input_tokens_seen": 78889664, + "step": 36550 + }, + { + "epoch": 5.963295269168026, + "grad_norm": 0.05915823578834534, + "learning_rate": 2.0975991255237562e-05, + "loss": 0.0626, + "num_input_tokens_seen": 78902240, + "step": 36555 + }, + { + "epoch": 5.964110929853181, + "grad_norm": 0.9726567268371582, + "learning_rate": 2.0968966248439536e-05, + "loss": 0.0758, + "num_input_tokens_seen": 78911872, + "step": 36560 + }, + { + "epoch": 5.964926590538336, + "grad_norm": 1.0373398065567017, + "learning_rate": 2.096194156841757e-05, + "loss": 0.0384, + "num_input_tokens_seen": 78922304, + "step": 36565 + }, + { + "epoch": 5.9657422512234906, + "grad_norm": 0.18789711594581604, + "learning_rate": 2.0954917215741113e-05, + "loss": 0.0646, + "num_input_tokens_seen": 78933408, + "step": 36570 + }, + { + "epoch": 5.966557911908646, + "grad_norm": 0.5063246488571167, + "learning_rate": 2.0947893190979588e-05, + "loss": 0.1148, + "num_input_tokens_seen": 78944704, + "step": 36575 + }, + { + "epoch": 5.967373572593801, + "grad_norm": 0.11106864362955093, + "learning_rate": 2.09408694947024e-05, + "loss": 0.0643, + "num_input_tokens_seen": 78955584, + "step": 36580 + }, + { + "epoch": 5.968189233278956, + "grad_norm": 0.592289388179779, + "learning_rate": 2.0933846127478928e-05, + "loss": 0.1361, + "num_input_tokens_seen": 78966272, + "step": 36585 + }, + { + "epoch": 5.969004893964111, + "grad_norm": 0.17345793545246124, + "learning_rate": 2.092682308987852e-05, + "loss": 0.1667, + "num_input_tokens_seen": 78976608, + "step": 36590 + }, + { + "epoch": 5.9698205546492655, + "grad_norm": 0.10420658439397812, + "learning_rate": 2.0919800382470503e-05, + "loss": 0.0582, + "num_input_tokens_seen": 78987360, + "step": 36595 + }, + { + "epoch": 5.970636215334421, + "grad_norm": 0.061308085918426514, + "learning_rate": 2.091277800582416e-05, + "loss": 0.0736, + "num_input_tokens_seen": 78999040, + "step": 36600 + }, + { + "epoch": 5.971451876019576, + "grad_norm": 0.9275151491165161, + "learning_rate": 2.0905755960508778e-05, + "loss": 0.0293, + "num_input_tokens_seen": 79010112, + "step": 36605 + }, + { + "epoch": 5.972267536704731, + "grad_norm": 0.3520929515361786, + "learning_rate": 2.0898734247093592e-05, + "loss": 0.0561, + "num_input_tokens_seen": 79020416, + "step": 36610 + }, + { + "epoch": 5.973083197389886, + "grad_norm": 0.8955913186073303, + "learning_rate": 2.0891712866147812e-05, + "loss": 0.22, + "num_input_tokens_seen": 79031648, + "step": 36615 + }, + { + "epoch": 5.9738988580750405, + "grad_norm": 0.15769003331661224, + "learning_rate": 2.088469181824064e-05, + "loss": 0.0783, + "num_input_tokens_seen": 79042560, + "step": 36620 + }, + { + "epoch": 5.974714518760196, + "grad_norm": 0.587742805480957, + "learning_rate": 2.0877671103941228e-05, + "loss": 0.1441, + "num_input_tokens_seen": 79052672, + "step": 36625 + }, + { + "epoch": 5.975530179445351, + "grad_norm": 1.5978223085403442, + "learning_rate": 2.0870650723818706e-05, + "loss": 0.3334, + "num_input_tokens_seen": 79063744, + "step": 36630 + }, + { + "epoch": 5.976345840130506, + "grad_norm": 1.5600570440292358, + "learning_rate": 2.0863630678442196e-05, + "loss": 0.063, + "num_input_tokens_seen": 79075424, + "step": 36635 + }, + { + "epoch": 5.977161500815661, + "grad_norm": 0.16905611753463745, + "learning_rate": 2.0856610968380768e-05, + "loss": 0.1189, + "num_input_tokens_seen": 79085792, + "step": 36640 + }, + { + "epoch": 5.9779771615008155, + "grad_norm": 2.155029773712158, + "learning_rate": 2.0849591594203482e-05, + "loss": 0.1279, + "num_input_tokens_seen": 79096704, + "step": 36645 + }, + { + "epoch": 5.97879282218597, + "grad_norm": 1.2202547788619995, + "learning_rate": 2.0842572556479355e-05, + "loss": 0.1511, + "num_input_tokens_seen": 79107328, + "step": 36650 + }, + { + "epoch": 5.979608482871125, + "grad_norm": 0.4367958903312683, + "learning_rate": 2.0835553855777396e-05, + "loss": 0.0852, + "num_input_tokens_seen": 79118144, + "step": 36655 + }, + { + "epoch": 5.980424143556281, + "grad_norm": 0.030520331114530563, + "learning_rate": 2.082853549266656e-05, + "loss": 0.2852, + "num_input_tokens_seen": 79129568, + "step": 36660 + }, + { + "epoch": 5.981239804241436, + "grad_norm": 2.4893834590911865, + "learning_rate": 2.0821517467715815e-05, + "loss": 0.1946, + "num_input_tokens_seen": 79139936, + "step": 36665 + }, + { + "epoch": 5.9820554649265905, + "grad_norm": 1.3846659660339355, + "learning_rate": 2.0814499781494057e-05, + "loss": 0.1172, + "num_input_tokens_seen": 79149664, + "step": 36670 + }, + { + "epoch": 5.982871125611745, + "grad_norm": 0.09702694416046143, + "learning_rate": 2.0807482434570187e-05, + "loss": 0.1077, + "num_input_tokens_seen": 79161280, + "step": 36675 + }, + { + "epoch": 5.9836867862969, + "grad_norm": 0.12053288519382477, + "learning_rate": 2.080046542751307e-05, + "loss": 0.1034, + "num_input_tokens_seen": 79171488, + "step": 36680 + }, + { + "epoch": 5.984502446982056, + "grad_norm": 1.1765824556350708, + "learning_rate": 2.079344876089152e-05, + "loss": 0.0849, + "num_input_tokens_seen": 79182656, + "step": 36685 + }, + { + "epoch": 5.985318107667211, + "grad_norm": 0.22869789600372314, + "learning_rate": 2.078643243527437e-05, + "loss": 0.2381, + "num_input_tokens_seen": 79194016, + "step": 36690 + }, + { + "epoch": 5.986133768352365, + "grad_norm": 1.5129129886627197, + "learning_rate": 2.0779416451230382e-05, + "loss": 0.2239, + "num_input_tokens_seen": 79205472, + "step": 36695 + }, + { + "epoch": 5.98694942903752, + "grad_norm": 0.15388360619544983, + "learning_rate": 2.0772400809328314e-05, + "loss": 0.0968, + "num_input_tokens_seen": 79215904, + "step": 36700 + }, + { + "epoch": 5.987765089722675, + "grad_norm": 0.2996169328689575, + "learning_rate": 2.0765385510136884e-05, + "loss": 0.1258, + "num_input_tokens_seen": 79227776, + "step": 36705 + }, + { + "epoch": 5.988580750407831, + "grad_norm": 0.0454501248896122, + "learning_rate": 2.0758370554224793e-05, + "loss": 0.1521, + "num_input_tokens_seen": 79239488, + "step": 36710 + }, + { + "epoch": 5.989396411092986, + "grad_norm": 1.502669334411621, + "learning_rate": 2.0751355942160706e-05, + "loss": 0.2672, + "num_input_tokens_seen": 79249600, + "step": 36715 + }, + { + "epoch": 5.99021207177814, + "grad_norm": 0.6053801774978638, + "learning_rate": 2.0744341674513264e-05, + "loss": 0.1866, + "num_input_tokens_seen": 79259840, + "step": 36720 + }, + { + "epoch": 5.991027732463295, + "grad_norm": 1.190796136856079, + "learning_rate": 2.0737327751851075e-05, + "loss": 0.1428, + "num_input_tokens_seen": 79271488, + "step": 36725 + }, + { + "epoch": 5.99184339314845, + "grad_norm": 0.16675561666488647, + "learning_rate": 2.0730314174742733e-05, + "loss": 0.0693, + "num_input_tokens_seen": 79282752, + "step": 36730 + }, + { + "epoch": 5.992659053833605, + "grad_norm": 0.8094229102134705, + "learning_rate": 2.0723300943756783e-05, + "loss": 0.0797, + "num_input_tokens_seen": 79294336, + "step": 36735 + }, + { + "epoch": 5.993474714518761, + "grad_norm": 0.8730387091636658, + "learning_rate": 2.0716288059461764e-05, + "loss": 0.2663, + "num_input_tokens_seen": 79306272, + "step": 36740 + }, + { + "epoch": 5.994290375203915, + "grad_norm": 0.4239517152309418, + "learning_rate": 2.0709275522426158e-05, + "loss": 0.0795, + "num_input_tokens_seen": 79317120, + "step": 36745 + }, + { + "epoch": 5.99510603588907, + "grad_norm": 0.07016343623399734, + "learning_rate": 2.070226333321846e-05, + "loss": 0.0212, + "num_input_tokens_seen": 79327200, + "step": 36750 + }, + { + "epoch": 5.995921696574225, + "grad_norm": 0.8132364153862, + "learning_rate": 2.06952514924071e-05, + "loss": 0.0969, + "num_input_tokens_seen": 79337888, + "step": 36755 + }, + { + "epoch": 5.99673735725938, + "grad_norm": 0.32091468572616577, + "learning_rate": 2.0688240000560498e-05, + "loss": 0.1059, + "num_input_tokens_seen": 79348480, + "step": 36760 + }, + { + "epoch": 5.997553017944535, + "grad_norm": 0.29365962743759155, + "learning_rate": 2.0681228858247038e-05, + "loss": 0.1417, + "num_input_tokens_seen": 79360480, + "step": 36765 + }, + { + "epoch": 5.99836867862969, + "grad_norm": 1.2733268737792969, + "learning_rate": 2.067421806603508e-05, + "loss": 0.128, + "num_input_tokens_seen": 79370912, + "step": 36770 + }, + { + "epoch": 5.999184339314845, + "grad_norm": 0.46356871724128723, + "learning_rate": 2.0667207624492943e-05, + "loss": 0.0569, + "num_input_tokens_seen": 79380288, + "step": 36775 + }, + { + "epoch": 6.0, + "grad_norm": 0.03394840285181999, + "learning_rate": 2.0660197534188952e-05, + "loss": 0.0909, + "num_input_tokens_seen": 79389648, + "step": 36780 + }, + { + "epoch": 6.0, + "eval_loss": 0.13736946880817413, + "eval_runtime": 131.8093, + "eval_samples_per_second": 20.674, + "eval_steps_per_second": 5.174, + "num_input_tokens_seen": 79389648, + "step": 36780 + }, + { + "epoch": 6.000815660685155, + "grad_norm": 0.3548504114151001, + "learning_rate": 2.065318779569137e-05, + "loss": 0.0403, + "num_input_tokens_seen": 79400656, + "step": 36785 + }, + { + "epoch": 6.00163132137031, + "grad_norm": 1.8793834447860718, + "learning_rate": 2.064617840956844e-05, + "loss": 0.1181, + "num_input_tokens_seen": 79412656, + "step": 36790 + }, + { + "epoch": 6.002446982055465, + "grad_norm": 0.09908688068389893, + "learning_rate": 2.063916937638838e-05, + "loss": 0.056, + "num_input_tokens_seen": 79424048, + "step": 36795 + }, + { + "epoch": 6.00326264274062, + "grad_norm": 0.542762815952301, + "learning_rate": 2.063216069671937e-05, + "loss": 0.0857, + "num_input_tokens_seen": 79434896, + "step": 36800 + }, + { + "epoch": 6.004078303425775, + "grad_norm": 0.09355419129133224, + "learning_rate": 2.0625152371129585e-05, + "loss": 0.1948, + "num_input_tokens_seen": 79445680, + "step": 36805 + }, + { + "epoch": 6.00489396411093, + "grad_norm": 0.6499553322792053, + "learning_rate": 2.0618144400187142e-05, + "loss": 0.0764, + "num_input_tokens_seen": 79456592, + "step": 36810 + }, + { + "epoch": 6.005709624796085, + "grad_norm": 0.9768366813659668, + "learning_rate": 2.061113678446015e-05, + "loss": 0.1456, + "num_input_tokens_seen": 79466992, + "step": 36815 + }, + { + "epoch": 6.006525285481239, + "grad_norm": 0.25254783034324646, + "learning_rate": 2.0604129524516676e-05, + "loss": 0.0296, + "num_input_tokens_seen": 79477168, + "step": 36820 + }, + { + "epoch": 6.007340946166395, + "grad_norm": 1.6313750743865967, + "learning_rate": 2.059712262092477e-05, + "loss": 0.1318, + "num_input_tokens_seen": 79488368, + "step": 36825 + }, + { + "epoch": 6.00815660685155, + "grad_norm": 0.10681536048650742, + "learning_rate": 2.0590116074252438e-05, + "loss": 0.1164, + "num_input_tokens_seen": 79498736, + "step": 36830 + }, + { + "epoch": 6.008972267536705, + "grad_norm": 0.26533401012420654, + "learning_rate": 2.058310988506768e-05, + "loss": 0.2765, + "num_input_tokens_seen": 79508656, + "step": 36835 + }, + { + "epoch": 6.00978792822186, + "grad_norm": 0.9492351412773132, + "learning_rate": 2.057610405393844e-05, + "loss": 0.284, + "num_input_tokens_seen": 79519696, + "step": 36840 + }, + { + "epoch": 6.010603588907014, + "grad_norm": 1.1795973777770996, + "learning_rate": 2.0569098581432655e-05, + "loss": 0.3041, + "num_input_tokens_seen": 79529776, + "step": 36845 + }, + { + "epoch": 6.011419249592169, + "grad_norm": 1.6361383199691772, + "learning_rate": 2.0562093468118225e-05, + "loss": 0.1911, + "num_input_tokens_seen": 79541392, + "step": 36850 + }, + { + "epoch": 6.012234910277325, + "grad_norm": 0.7947347164154053, + "learning_rate": 2.055508871456301e-05, + "loss": 0.0665, + "num_input_tokens_seen": 79552720, + "step": 36855 + }, + { + "epoch": 6.01305057096248, + "grad_norm": 0.9406610131263733, + "learning_rate": 2.054808432133486e-05, + "loss": 0.2431, + "num_input_tokens_seen": 79563888, + "step": 36860 + }, + { + "epoch": 6.013866231647635, + "grad_norm": 0.08179469406604767, + "learning_rate": 2.0541080289001584e-05, + "loss": 0.1232, + "num_input_tokens_seen": 79575760, + "step": 36865 + }, + { + "epoch": 6.014681892332789, + "grad_norm": 0.43494778871536255, + "learning_rate": 2.0534076618130965e-05, + "loss": 0.0716, + "num_input_tokens_seen": 79587408, + "step": 36870 + }, + { + "epoch": 6.015497553017944, + "grad_norm": 1.0935986042022705, + "learning_rate": 2.0527073309290755e-05, + "loss": 0.0812, + "num_input_tokens_seen": 79598352, + "step": 36875 + }, + { + "epoch": 6.0163132137031, + "grad_norm": 2.3222243785858154, + "learning_rate": 2.0520070363048667e-05, + "loss": 0.263, + "num_input_tokens_seen": 79609520, + "step": 36880 + }, + { + "epoch": 6.017128874388255, + "grad_norm": 0.2949793338775635, + "learning_rate": 2.0513067779972415e-05, + "loss": 0.0749, + "num_input_tokens_seen": 79621712, + "step": 36885 + }, + { + "epoch": 6.0179445350734095, + "grad_norm": 0.3799355626106262, + "learning_rate": 2.0506065560629655e-05, + "loss": 0.1056, + "num_input_tokens_seen": 79630576, + "step": 36890 + }, + { + "epoch": 6.018760195758564, + "grad_norm": 0.18886154890060425, + "learning_rate": 2.0499063705588024e-05, + "loss": 0.089, + "num_input_tokens_seen": 79641360, + "step": 36895 + }, + { + "epoch": 6.019575856443719, + "grad_norm": 0.28480619192123413, + "learning_rate": 2.0492062215415125e-05, + "loss": 0.0568, + "num_input_tokens_seen": 79651792, + "step": 36900 + }, + { + "epoch": 6.020391517128874, + "grad_norm": 0.08191132545471191, + "learning_rate": 2.048506109067854e-05, + "loss": 0.0932, + "num_input_tokens_seen": 79661488, + "step": 36905 + }, + { + "epoch": 6.02120717781403, + "grad_norm": 1.248218059539795, + "learning_rate": 2.047806033194581e-05, + "loss": 0.2634, + "num_input_tokens_seen": 79671856, + "step": 36910 + }, + { + "epoch": 6.0220228384991845, + "grad_norm": 0.06515499204397202, + "learning_rate": 2.0471059939784447e-05, + "loss": 0.0797, + "num_input_tokens_seen": 79681232, + "step": 36915 + }, + { + "epoch": 6.022838499184339, + "grad_norm": 1.9572089910507202, + "learning_rate": 2.046405991476195e-05, + "loss": 0.317, + "num_input_tokens_seen": 79691728, + "step": 36920 + }, + { + "epoch": 6.023654159869494, + "grad_norm": 0.09668673574924469, + "learning_rate": 2.045706025744577e-05, + "loss": 0.0653, + "num_input_tokens_seen": 79702672, + "step": 36925 + }, + { + "epoch": 6.024469820554649, + "grad_norm": 0.3033691644668579, + "learning_rate": 2.045006096840334e-05, + "loss": 0.0726, + "num_input_tokens_seen": 79713552, + "step": 36930 + }, + { + "epoch": 6.025285481239805, + "grad_norm": 0.11264218389987946, + "learning_rate": 2.044306204820205e-05, + "loss": 0.0521, + "num_input_tokens_seen": 79725296, + "step": 36935 + }, + { + "epoch": 6.0261011419249595, + "grad_norm": 1.63962721824646, + "learning_rate": 2.0436063497409274e-05, + "loss": 0.0785, + "num_input_tokens_seen": 79735792, + "step": 36940 + }, + { + "epoch": 6.026916802610114, + "grad_norm": 0.2450704723596573, + "learning_rate": 2.042906531659235e-05, + "loss": 0.046, + "num_input_tokens_seen": 79746576, + "step": 36945 + }, + { + "epoch": 6.027732463295269, + "grad_norm": 1.3058305978775024, + "learning_rate": 2.042206750631858e-05, + "loss": 0.0937, + "num_input_tokens_seen": 79756496, + "step": 36950 + }, + { + "epoch": 6.028548123980424, + "grad_norm": 0.6722590923309326, + "learning_rate": 2.041507006715525e-05, + "loss": 0.0893, + "num_input_tokens_seen": 79767600, + "step": 36955 + }, + { + "epoch": 6.029363784665579, + "grad_norm": 0.04352841153740883, + "learning_rate": 2.0408072999669604e-05, + "loss": 0.0971, + "num_input_tokens_seen": 79779056, + "step": 36960 + }, + { + "epoch": 6.0301794453507345, + "grad_norm": 0.7519928216934204, + "learning_rate": 2.0401076304428857e-05, + "loss": 0.2641, + "num_input_tokens_seen": 79790672, + "step": 36965 + }, + { + "epoch": 6.030995106035889, + "grad_norm": 1.1231536865234375, + "learning_rate": 2.0394079982000198e-05, + "loss": 0.1667, + "num_input_tokens_seen": 79800144, + "step": 36970 + }, + { + "epoch": 6.031810766721044, + "grad_norm": 1.2992480993270874, + "learning_rate": 2.0387084032950787e-05, + "loss": 0.1709, + "num_input_tokens_seen": 79811056, + "step": 36975 + }, + { + "epoch": 6.032626427406199, + "grad_norm": 0.9446027278900146, + "learning_rate": 2.038008845784775e-05, + "loss": 0.17, + "num_input_tokens_seen": 79821616, + "step": 36980 + }, + { + "epoch": 6.033442088091354, + "grad_norm": 0.6704817414283752, + "learning_rate": 2.0373093257258184e-05, + "loss": 0.1241, + "num_input_tokens_seen": 79833104, + "step": 36985 + }, + { + "epoch": 6.034257748776509, + "grad_norm": 0.2387690544128418, + "learning_rate": 2.0366098431749152e-05, + "loss": 0.0332, + "num_input_tokens_seen": 79842832, + "step": 36990 + }, + { + "epoch": 6.035073409461664, + "grad_norm": 0.12274579703807831, + "learning_rate": 2.0359103981887695e-05, + "loss": 0.044, + "num_input_tokens_seen": 79853840, + "step": 36995 + }, + { + "epoch": 6.035889070146819, + "grad_norm": 0.916332483291626, + "learning_rate": 2.035210990824082e-05, + "loss": 0.1662, + "num_input_tokens_seen": 79865488, + "step": 37000 + }, + { + "epoch": 6.036704730831974, + "grad_norm": 1.1935508251190186, + "learning_rate": 2.0345116211375496e-05, + "loss": 0.2764, + "num_input_tokens_seen": 79874640, + "step": 37005 + }, + { + "epoch": 6.037520391517129, + "grad_norm": 0.49243950843811035, + "learning_rate": 2.0338122891858677e-05, + "loss": 0.0404, + "num_input_tokens_seen": 79885552, + "step": 37010 + }, + { + "epoch": 6.0383360522022835, + "grad_norm": 0.09472338855266571, + "learning_rate": 2.0331129950257266e-05, + "loss": 0.0143, + "num_input_tokens_seen": 79896336, + "step": 37015 + }, + { + "epoch": 6.039151712887439, + "grad_norm": 0.6970955729484558, + "learning_rate": 2.0324137387138152e-05, + "loss": 0.1446, + "num_input_tokens_seen": 79907184, + "step": 37020 + }, + { + "epoch": 6.039967373572594, + "grad_norm": 1.0032442808151245, + "learning_rate": 2.031714520306819e-05, + "loss": 0.1468, + "num_input_tokens_seen": 79917936, + "step": 37025 + }, + { + "epoch": 6.040783034257749, + "grad_norm": 2.5711050033569336, + "learning_rate": 2.0310153398614192e-05, + "loss": 0.2225, + "num_input_tokens_seen": 79930000, + "step": 37030 + }, + { + "epoch": 6.041598694942904, + "grad_norm": 0.4610103964805603, + "learning_rate": 2.030316197434296e-05, + "loss": 0.0581, + "num_input_tokens_seen": 79941456, + "step": 37035 + }, + { + "epoch": 6.0424143556280585, + "grad_norm": 0.8451020121574402, + "learning_rate": 2.0296170930821245e-05, + "loss": 0.068, + "num_input_tokens_seen": 79951440, + "step": 37040 + }, + { + "epoch": 6.043230016313213, + "grad_norm": 0.12362402677536011, + "learning_rate": 2.028918026861579e-05, + "loss": 0.0357, + "num_input_tokens_seen": 79962064, + "step": 37045 + }, + { + "epoch": 6.044045676998369, + "grad_norm": 0.03492611274123192, + "learning_rate": 2.0282189988293276e-05, + "loss": 0.0366, + "num_input_tokens_seen": 79974160, + "step": 37050 + }, + { + "epoch": 6.044861337683524, + "grad_norm": 0.08145938068628311, + "learning_rate": 2.0275200090420376e-05, + "loss": 0.0213, + "num_input_tokens_seen": 79985072, + "step": 37055 + }, + { + "epoch": 6.045676998368679, + "grad_norm": 1.0868322849273682, + "learning_rate": 2.026821057556374e-05, + "loss": 0.1176, + "num_input_tokens_seen": 79996784, + "step": 37060 + }, + { + "epoch": 6.0464926590538335, + "grad_norm": 0.23795321583747864, + "learning_rate": 2.026122144428996e-05, + "loss": 0.0485, + "num_input_tokens_seen": 80008496, + "step": 37065 + }, + { + "epoch": 6.047308319738988, + "grad_norm": 1.582801342010498, + "learning_rate": 2.0254232697165616e-05, + "loss": 0.1654, + "num_input_tokens_seen": 80019600, + "step": 37070 + }, + { + "epoch": 6.048123980424143, + "grad_norm": 0.22323176264762878, + "learning_rate": 2.0247244334757248e-05, + "loss": 0.0374, + "num_input_tokens_seen": 80030864, + "step": 37075 + }, + { + "epoch": 6.048939641109299, + "grad_norm": 0.7268773913383484, + "learning_rate": 2.0240256357631367e-05, + "loss": 0.0475, + "num_input_tokens_seen": 80041936, + "step": 37080 + }, + { + "epoch": 6.049755301794454, + "grad_norm": 0.6161600351333618, + "learning_rate": 2.023326876635446e-05, + "loss": 0.0551, + "num_input_tokens_seen": 80053552, + "step": 37085 + }, + { + "epoch": 6.0505709624796085, + "grad_norm": 1.263363003730774, + "learning_rate": 2.022628156149297e-05, + "loss": 0.2165, + "num_input_tokens_seen": 80064688, + "step": 37090 + }, + { + "epoch": 6.051386623164763, + "grad_norm": 1.0067670345306396, + "learning_rate": 2.021929474361331e-05, + "loss": 0.0358, + "num_input_tokens_seen": 80076176, + "step": 37095 + }, + { + "epoch": 6.052202283849918, + "grad_norm": 0.07773994654417038, + "learning_rate": 2.0212308313281886e-05, + "loss": 0.0377, + "num_input_tokens_seen": 80085968, + "step": 37100 + }, + { + "epoch": 6.053017944535074, + "grad_norm": 0.7837408781051636, + "learning_rate": 2.0205322271065042e-05, + "loss": 0.2674, + "num_input_tokens_seen": 80096048, + "step": 37105 + }, + { + "epoch": 6.053833605220229, + "grad_norm": 0.5510076880455017, + "learning_rate": 2.01983366175291e-05, + "loss": 0.0809, + "num_input_tokens_seen": 80106608, + "step": 37110 + }, + { + "epoch": 6.054649265905383, + "grad_norm": 0.041368819773197174, + "learning_rate": 2.0191351353240363e-05, + "loss": 0.14, + "num_input_tokens_seen": 80116400, + "step": 37115 + }, + { + "epoch": 6.055464926590538, + "grad_norm": 1.2590614557266235, + "learning_rate": 2.0184366478765078e-05, + "loss": 0.1562, + "num_input_tokens_seen": 80127024, + "step": 37120 + }, + { + "epoch": 6.056280587275693, + "grad_norm": 0.21675696969032288, + "learning_rate": 2.017738199466948e-05, + "loss": 0.1171, + "num_input_tokens_seen": 80137936, + "step": 37125 + }, + { + "epoch": 6.057096247960848, + "grad_norm": 1.3059369325637817, + "learning_rate": 2.0170397901519766e-05, + "loss": 0.0774, + "num_input_tokens_seen": 80148752, + "step": 37130 + }, + { + "epoch": 6.057911908646004, + "grad_norm": 0.4119716286659241, + "learning_rate": 2.016341419988211e-05, + "loss": 0.1434, + "num_input_tokens_seen": 80159856, + "step": 37135 + }, + { + "epoch": 6.058727569331158, + "grad_norm": 1.4489063024520874, + "learning_rate": 2.0156430890322627e-05, + "loss": 0.1267, + "num_input_tokens_seen": 80172656, + "step": 37140 + }, + { + "epoch": 6.059543230016313, + "grad_norm": 0.10104052722454071, + "learning_rate": 2.0149447973407443e-05, + "loss": 0.1013, + "num_input_tokens_seen": 80184240, + "step": 37145 + }, + { + "epoch": 6.060358890701468, + "grad_norm": 0.2790704071521759, + "learning_rate": 2.0142465449702612e-05, + "loss": 0.0512, + "num_input_tokens_seen": 80195184, + "step": 37150 + }, + { + "epoch": 6.061174551386623, + "grad_norm": 0.37179529666900635, + "learning_rate": 2.0135483319774183e-05, + "loss": 0.0457, + "num_input_tokens_seen": 80205296, + "step": 37155 + }, + { + "epoch": 6.061990212071779, + "grad_norm": 0.03723519295454025, + "learning_rate": 2.012850158418816e-05, + "loss": 0.0708, + "num_input_tokens_seen": 80216496, + "step": 37160 + }, + { + "epoch": 6.062805872756933, + "grad_norm": 0.6903476715087891, + "learning_rate": 2.0121520243510512e-05, + "loss": 0.0589, + "num_input_tokens_seen": 80227280, + "step": 37165 + }, + { + "epoch": 6.063621533442088, + "grad_norm": 1.2477331161499023, + "learning_rate": 2.0114539298307188e-05, + "loss": 0.1649, + "num_input_tokens_seen": 80238704, + "step": 37170 + }, + { + "epoch": 6.064437194127243, + "grad_norm": 1.5219900608062744, + "learning_rate": 2.0107558749144096e-05, + "loss": 0.1129, + "num_input_tokens_seen": 80248560, + "step": 37175 + }, + { + "epoch": 6.065252854812398, + "grad_norm": 0.04282096400856972, + "learning_rate": 2.0100578596587116e-05, + "loss": 0.1017, + "num_input_tokens_seen": 80258448, + "step": 37180 + }, + { + "epoch": 6.066068515497553, + "grad_norm": 0.21572862565517426, + "learning_rate": 2.0093598841202092e-05, + "loss": 0.029, + "num_input_tokens_seen": 80269168, + "step": 37185 + }, + { + "epoch": 6.066884176182708, + "grad_norm": 0.3542579412460327, + "learning_rate": 2.0086619483554847e-05, + "loss": 0.0409, + "num_input_tokens_seen": 80278672, + "step": 37190 + }, + { + "epoch": 6.067699836867863, + "grad_norm": 0.15018631517887115, + "learning_rate": 2.0079640524211153e-05, + "loss": 0.0582, + "num_input_tokens_seen": 80290128, + "step": 37195 + }, + { + "epoch": 6.068515497553018, + "grad_norm": 0.0800928995013237, + "learning_rate": 2.0072661963736752e-05, + "loss": 0.1849, + "num_input_tokens_seen": 80300816, + "step": 37200 + }, + { + "epoch": 6.069331158238173, + "grad_norm": 0.1467091143131256, + "learning_rate": 2.006568380269739e-05, + "loss": 0.0209, + "num_input_tokens_seen": 80311120, + "step": 37205 + }, + { + "epoch": 6.070146818923328, + "grad_norm": 1.322386622428894, + "learning_rate": 2.005870604165873e-05, + "loss": 0.0714, + "num_input_tokens_seen": 80321968, + "step": 37210 + }, + { + "epoch": 6.0709624796084825, + "grad_norm": 0.4637303054332733, + "learning_rate": 2.005172868118643e-05, + "loss": 0.1656, + "num_input_tokens_seen": 80332176, + "step": 37215 + }, + { + "epoch": 6.071778140293638, + "grad_norm": 1.7898385524749756, + "learning_rate": 2.004475172184611e-05, + "loss": 0.0805, + "num_input_tokens_seen": 80343376, + "step": 37220 + }, + { + "epoch": 6.072593800978793, + "grad_norm": 0.0470711886882782, + "learning_rate": 2.0037775164203356e-05, + "loss": 0.0317, + "num_input_tokens_seen": 80354032, + "step": 37225 + }, + { + "epoch": 6.073409461663948, + "grad_norm": 0.5151978135108948, + "learning_rate": 2.0030799008823727e-05, + "loss": 0.043, + "num_input_tokens_seen": 80365072, + "step": 37230 + }, + { + "epoch": 6.074225122349103, + "grad_norm": 0.1475774347782135, + "learning_rate": 2.0023823256272748e-05, + "loss": 0.0467, + "num_input_tokens_seen": 80374512, + "step": 37235 + }, + { + "epoch": 6.075040783034257, + "grad_norm": 0.15191799402236938, + "learning_rate": 2.00168479071159e-05, + "loss": 0.0824, + "num_input_tokens_seen": 80384368, + "step": 37240 + }, + { + "epoch": 6.075856443719413, + "grad_norm": 0.3686923682689667, + "learning_rate": 2.0009872961918648e-05, + "loss": 0.0603, + "num_input_tokens_seen": 80395152, + "step": 37245 + }, + { + "epoch": 6.076672104404568, + "grad_norm": 0.7799150347709656, + "learning_rate": 2.0002898421246414e-05, + "loss": 0.1341, + "num_input_tokens_seen": 80406288, + "step": 37250 + }, + { + "epoch": 6.077487765089723, + "grad_norm": 0.056045521050691605, + "learning_rate": 1.9995924285664587e-05, + "loss": 0.087, + "num_input_tokens_seen": 80417040, + "step": 37255 + }, + { + "epoch": 6.078303425774878, + "grad_norm": 0.460558146238327, + "learning_rate": 1.9988950555738528e-05, + "loss": 0.1179, + "num_input_tokens_seen": 80428112, + "step": 37260 + }, + { + "epoch": 6.079119086460032, + "grad_norm": 0.06287221610546112, + "learning_rate": 1.9981977232033563e-05, + "loss": 0.1001, + "num_input_tokens_seen": 80438288, + "step": 37265 + }, + { + "epoch": 6.079934747145187, + "grad_norm": 0.25936359167099, + "learning_rate": 1.9975004315114988e-05, + "loss": 0.5109, + "num_input_tokens_seen": 80449360, + "step": 37270 + }, + { + "epoch": 6.080750407830343, + "grad_norm": 1.3424193859100342, + "learning_rate": 1.9968031805548056e-05, + "loss": 0.0733, + "num_input_tokens_seen": 80460752, + "step": 37275 + }, + { + "epoch": 6.081566068515498, + "grad_norm": 0.5981659293174744, + "learning_rate": 1.9961059703898e-05, + "loss": 0.1241, + "num_input_tokens_seen": 80472048, + "step": 37280 + }, + { + "epoch": 6.082381729200653, + "grad_norm": 0.8516066074371338, + "learning_rate": 1.9954088010730003e-05, + "loss": 0.0799, + "num_input_tokens_seen": 80481840, + "step": 37285 + }, + { + "epoch": 6.083197389885807, + "grad_norm": 0.5177723169326782, + "learning_rate": 1.994711672660924e-05, + "loss": 0.0176, + "num_input_tokens_seen": 80493648, + "step": 37290 + }, + { + "epoch": 6.084013050570962, + "grad_norm": 0.2276274561882019, + "learning_rate": 1.9940145852100836e-05, + "loss": 0.1365, + "num_input_tokens_seen": 80503024, + "step": 37295 + }, + { + "epoch": 6.084828711256117, + "grad_norm": 0.2755437195301056, + "learning_rate": 1.993317538776988e-05, + "loss": 0.1544, + "num_input_tokens_seen": 80513808, + "step": 37300 + }, + { + "epoch": 6.085644371941273, + "grad_norm": 0.6525851488113403, + "learning_rate": 1.9926205334181443e-05, + "loss": 0.2671, + "num_input_tokens_seen": 80524176, + "step": 37305 + }, + { + "epoch": 6.0864600326264275, + "grad_norm": 0.09336105734109879, + "learning_rate": 1.9919235691900526e-05, + "loss": 0.1258, + "num_input_tokens_seen": 80536176, + "step": 37310 + }, + { + "epoch": 6.087275693311582, + "grad_norm": 0.07979799062013626, + "learning_rate": 1.991226646149216e-05, + "loss": 0.1166, + "num_input_tokens_seen": 80546960, + "step": 37315 + }, + { + "epoch": 6.088091353996737, + "grad_norm": 1.2321258783340454, + "learning_rate": 1.9905297643521287e-05, + "loss": 0.2253, + "num_input_tokens_seen": 80557936, + "step": 37320 + }, + { + "epoch": 6.088907014681892, + "grad_norm": 0.56537926197052, + "learning_rate": 1.9898329238552838e-05, + "loss": 0.156, + "num_input_tokens_seen": 80567728, + "step": 37325 + }, + { + "epoch": 6.089722675367048, + "grad_norm": 0.047077156603336334, + "learning_rate": 1.9891361247151706e-05, + "loss": 0.1283, + "num_input_tokens_seen": 80579184, + "step": 37330 + }, + { + "epoch": 6.0905383360522025, + "grad_norm": 0.7265591621398926, + "learning_rate": 1.9884393669882752e-05, + "loss": 0.066, + "num_input_tokens_seen": 80589872, + "step": 37335 + }, + { + "epoch": 6.091353996737357, + "grad_norm": 0.27355268597602844, + "learning_rate": 1.9877426507310802e-05, + "loss": 0.0577, + "num_input_tokens_seen": 80599888, + "step": 37340 + }, + { + "epoch": 6.092169657422512, + "grad_norm": 2.2992732524871826, + "learning_rate": 1.9870459760000654e-05, + "loss": 0.1754, + "num_input_tokens_seen": 80610384, + "step": 37345 + }, + { + "epoch": 6.092985318107667, + "grad_norm": 0.2976953983306885, + "learning_rate": 1.9863493428517066e-05, + "loss": 0.1962, + "num_input_tokens_seen": 80621456, + "step": 37350 + }, + { + "epoch": 6.093800978792822, + "grad_norm": 0.17599287629127502, + "learning_rate": 1.985652751342476e-05, + "loss": 0.0984, + "num_input_tokens_seen": 80632208, + "step": 37355 + }, + { + "epoch": 6.0946166394779775, + "grad_norm": 0.9587820172309875, + "learning_rate": 1.984956201528843e-05, + "loss": 0.0617, + "num_input_tokens_seen": 80642736, + "step": 37360 + }, + { + "epoch": 6.095432300163132, + "grad_norm": 1.3817583322525024, + "learning_rate": 1.984259693467274e-05, + "loss": 0.1941, + "num_input_tokens_seen": 80654064, + "step": 37365 + }, + { + "epoch": 6.096247960848287, + "grad_norm": 1.7975013256072998, + "learning_rate": 1.9835632272142305e-05, + "loss": 0.1244, + "num_input_tokens_seen": 80664880, + "step": 37370 + }, + { + "epoch": 6.097063621533442, + "grad_norm": 1.980268120765686, + "learning_rate": 1.9828668028261726e-05, + "loss": 0.0728, + "num_input_tokens_seen": 80676720, + "step": 37375 + }, + { + "epoch": 6.097879282218597, + "grad_norm": 0.9389541745185852, + "learning_rate": 1.9821704203595554e-05, + "loss": 0.1571, + "num_input_tokens_seen": 80688240, + "step": 37380 + }, + { + "epoch": 6.0986949429037525, + "grad_norm": 0.312199205160141, + "learning_rate": 1.9814740798708316e-05, + "loss": 0.1006, + "num_input_tokens_seen": 80699216, + "step": 37385 + }, + { + "epoch": 6.099510603588907, + "grad_norm": 0.2595294117927551, + "learning_rate": 1.98077778141645e-05, + "loss": 0.2286, + "num_input_tokens_seen": 80710192, + "step": 37390 + }, + { + "epoch": 6.100326264274062, + "grad_norm": 1.6973011493682861, + "learning_rate": 1.9800815250528557e-05, + "loss": 0.2326, + "num_input_tokens_seen": 80720880, + "step": 37395 + }, + { + "epoch": 6.101141924959217, + "grad_norm": 2.2239439487457275, + "learning_rate": 1.979385310836491e-05, + "loss": 0.0557, + "num_input_tokens_seen": 80731760, + "step": 37400 + }, + { + "epoch": 6.101957585644372, + "grad_norm": 0.15243497490882874, + "learning_rate": 1.9786891388237945e-05, + "loss": 0.0672, + "num_input_tokens_seen": 80741712, + "step": 37405 + }, + { + "epoch": 6.102773246329527, + "grad_norm": 0.7098251581192017, + "learning_rate": 1.9779930090712017e-05, + "loss": 0.2172, + "num_input_tokens_seen": 80753264, + "step": 37410 + }, + { + "epoch": 6.103588907014682, + "grad_norm": 0.7933482527732849, + "learning_rate": 1.9772969216351433e-05, + "loss": 0.0892, + "num_input_tokens_seen": 80764816, + "step": 37415 + }, + { + "epoch": 6.104404567699837, + "grad_norm": 1.1987318992614746, + "learning_rate": 1.9766008765720493e-05, + "loss": 0.3037, + "num_input_tokens_seen": 80774320, + "step": 37420 + }, + { + "epoch": 6.105220228384992, + "grad_norm": 0.020115366205573082, + "learning_rate": 1.975904873938344e-05, + "loss": 0.0812, + "num_input_tokens_seen": 80784976, + "step": 37425 + }, + { + "epoch": 6.106035889070147, + "grad_norm": 0.1901809275150299, + "learning_rate": 1.9752089137904492e-05, + "loss": 0.11, + "num_input_tokens_seen": 80794672, + "step": 37430 + }, + { + "epoch": 6.1068515497553015, + "grad_norm": 0.23970019817352295, + "learning_rate": 1.9745129961847824e-05, + "loss": 0.041, + "num_input_tokens_seen": 80805424, + "step": 37435 + }, + { + "epoch": 6.107667210440456, + "grad_norm": 0.38505232334136963, + "learning_rate": 1.9738171211777584e-05, + "loss": 0.0492, + "num_input_tokens_seen": 80816240, + "step": 37440 + }, + { + "epoch": 6.108482871125612, + "grad_norm": 0.05080033838748932, + "learning_rate": 1.9731212888257883e-05, + "loss": 0.206, + "num_input_tokens_seen": 80825744, + "step": 37445 + }, + { + "epoch": 6.109298531810767, + "grad_norm": 1.2595866918563843, + "learning_rate": 1.97242549918528e-05, + "loss": 0.1355, + "num_input_tokens_seen": 80835984, + "step": 37450 + }, + { + "epoch": 6.110114192495922, + "grad_norm": 1.052344560623169, + "learning_rate": 1.9717297523126373e-05, + "loss": 0.1, + "num_input_tokens_seen": 80847184, + "step": 37455 + }, + { + "epoch": 6.1109298531810765, + "grad_norm": 0.16609787940979004, + "learning_rate": 1.9710340482642615e-05, + "loss": 0.0796, + "num_input_tokens_seen": 80858608, + "step": 37460 + }, + { + "epoch": 6.111745513866231, + "grad_norm": 0.08822865039110184, + "learning_rate": 1.9703383870965496e-05, + "loss": 0.15, + "num_input_tokens_seen": 80869584, + "step": 37465 + }, + { + "epoch": 6.112561174551387, + "grad_norm": 1.583421230316162, + "learning_rate": 1.969642768865896e-05, + "loss": 0.1195, + "num_input_tokens_seen": 80879408, + "step": 37470 + }, + { + "epoch": 6.113376835236542, + "grad_norm": 0.12025380879640579, + "learning_rate": 1.9689471936286902e-05, + "loss": 0.1382, + "num_input_tokens_seen": 80890032, + "step": 37475 + }, + { + "epoch": 6.114192495921697, + "grad_norm": 0.24867355823516846, + "learning_rate": 1.9682516614413194e-05, + "loss": 0.1612, + "num_input_tokens_seen": 80902160, + "step": 37480 + }, + { + "epoch": 6.1150081566068515, + "grad_norm": 0.06056251376867294, + "learning_rate": 1.967556172360167e-05, + "loss": 0.2809, + "num_input_tokens_seen": 80914032, + "step": 37485 + }, + { + "epoch": 6.115823817292006, + "grad_norm": 0.1298357993364334, + "learning_rate": 1.966860726441613e-05, + "loss": 0.0792, + "num_input_tokens_seen": 80924912, + "step": 37490 + }, + { + "epoch": 6.116639477977161, + "grad_norm": 0.0901600569486618, + "learning_rate": 1.9661653237420337e-05, + "loss": 0.1622, + "num_input_tokens_seen": 80934544, + "step": 37495 + }, + { + "epoch": 6.117455138662317, + "grad_norm": 1.2291409969329834, + "learning_rate": 1.9654699643178016e-05, + "loss": 0.1677, + "num_input_tokens_seen": 80945456, + "step": 37500 + }, + { + "epoch": 6.118270799347472, + "grad_norm": 0.4729701578617096, + "learning_rate": 1.9647746482252866e-05, + "loss": 0.0832, + "num_input_tokens_seen": 80956368, + "step": 37505 + }, + { + "epoch": 6.1190864600326265, + "grad_norm": 0.036452509462833405, + "learning_rate": 1.9640793755208542e-05, + "loss": 0.0392, + "num_input_tokens_seen": 80967280, + "step": 37510 + }, + { + "epoch": 6.119902120717781, + "grad_norm": 0.9883771538734436, + "learning_rate": 1.9633841462608664e-05, + "loss": 0.1063, + "num_input_tokens_seen": 80978320, + "step": 37515 + }, + { + "epoch": 6.120717781402936, + "grad_norm": 1.4593206644058228, + "learning_rate": 1.9626889605016827e-05, + "loss": 0.1642, + "num_input_tokens_seen": 80988624, + "step": 37520 + }, + { + "epoch": 6.121533442088092, + "grad_norm": 0.7261061668395996, + "learning_rate": 1.9619938182996585e-05, + "loss": 0.2367, + "num_input_tokens_seen": 81000560, + "step": 37525 + }, + { + "epoch": 6.122349102773247, + "grad_norm": 0.33667197823524475, + "learning_rate": 1.961298719711145e-05, + "loss": 0.0171, + "num_input_tokens_seen": 81011408, + "step": 37530 + }, + { + "epoch": 6.123164763458401, + "grad_norm": 0.0414847694337368, + "learning_rate": 1.9606036647924907e-05, + "loss": 0.1361, + "num_input_tokens_seen": 81023472, + "step": 37535 + }, + { + "epoch": 6.123980424143556, + "grad_norm": 2.2268688678741455, + "learning_rate": 1.95990865360004e-05, + "loss": 0.2452, + "num_input_tokens_seen": 81033968, + "step": 37540 + }, + { + "epoch": 6.124796084828711, + "grad_norm": 0.9434770345687866, + "learning_rate": 1.9592136861901344e-05, + "loss": 0.0925, + "num_input_tokens_seen": 81044304, + "step": 37545 + }, + { + "epoch": 6.125611745513866, + "grad_norm": 0.09693703800439835, + "learning_rate": 1.9585187626191113e-05, + "loss": 0.0559, + "num_input_tokens_seen": 81055056, + "step": 37550 + }, + { + "epoch": 6.126427406199022, + "grad_norm": 1.2802937030792236, + "learning_rate": 1.9578238829433048e-05, + "loss": 0.1244, + "num_input_tokens_seen": 81066544, + "step": 37555 + }, + { + "epoch": 6.127243066884176, + "grad_norm": 0.13514459133148193, + "learning_rate": 1.9571290472190456e-05, + "loss": 0.061, + "num_input_tokens_seen": 81077360, + "step": 37560 + }, + { + "epoch": 6.128058727569331, + "grad_norm": 1.1010595560073853, + "learning_rate": 1.95643425550266e-05, + "loss": 0.0551, + "num_input_tokens_seen": 81088816, + "step": 37565 + }, + { + "epoch": 6.128874388254486, + "grad_norm": 0.11481825262308121, + "learning_rate": 1.955739507850472e-05, + "loss": 0.1523, + "num_input_tokens_seen": 81101488, + "step": 37570 + }, + { + "epoch": 6.129690048939641, + "grad_norm": 0.3698502779006958, + "learning_rate": 1.9550448043188007e-05, + "loss": 0.0692, + "num_input_tokens_seen": 81111568, + "step": 37575 + }, + { + "epoch": 6.130505709624796, + "grad_norm": 0.1630636751651764, + "learning_rate": 1.954350144963963e-05, + "loss": 0.0523, + "num_input_tokens_seen": 81121840, + "step": 37580 + }, + { + "epoch": 6.131321370309951, + "grad_norm": 0.6908400654792786, + "learning_rate": 1.9536555298422706e-05, + "loss": 0.0517, + "num_input_tokens_seen": 81132720, + "step": 37585 + }, + { + "epoch": 6.132137030995106, + "grad_norm": 1.5822930335998535, + "learning_rate": 1.9529609590100337e-05, + "loss": 0.0938, + "num_input_tokens_seen": 81143536, + "step": 37590 + }, + { + "epoch": 6.132952691680261, + "grad_norm": 1.616736888885498, + "learning_rate": 1.952266432523557e-05, + "loss": 0.2995, + "num_input_tokens_seen": 81154096, + "step": 37595 + }, + { + "epoch": 6.133768352365416, + "grad_norm": 0.5477954149246216, + "learning_rate": 1.9515719504391415e-05, + "loss": 0.0349, + "num_input_tokens_seen": 81165104, + "step": 37600 + }, + { + "epoch": 6.134584013050571, + "grad_norm": 1.4384320974349976, + "learning_rate": 1.950877512813087e-05, + "loss": 0.1844, + "num_input_tokens_seen": 81175952, + "step": 37605 + }, + { + "epoch": 6.135399673735726, + "grad_norm": 1.340421438217163, + "learning_rate": 1.950183119701688e-05, + "loss": 0.1535, + "num_input_tokens_seen": 81186608, + "step": 37610 + }, + { + "epoch": 6.136215334420881, + "grad_norm": 1.2593897581100464, + "learning_rate": 1.949488771161235e-05, + "loss": 0.1117, + "num_input_tokens_seen": 81198128, + "step": 37615 + }, + { + "epoch": 6.137030995106036, + "grad_norm": 0.9293044209480286, + "learning_rate": 1.948794467248015e-05, + "loss": 0.1143, + "num_input_tokens_seen": 81207792, + "step": 37620 + }, + { + "epoch": 6.137846655791191, + "grad_norm": 0.5867303609848022, + "learning_rate": 1.9481002080183114e-05, + "loss": 0.0522, + "num_input_tokens_seen": 81219152, + "step": 37625 + }, + { + "epoch": 6.138662316476346, + "grad_norm": 0.38122567534446716, + "learning_rate": 1.947405993528406e-05, + "loss": 0.0286, + "num_input_tokens_seen": 81230800, + "step": 37630 + }, + { + "epoch": 6.1394779771615005, + "grad_norm": 0.25760385394096375, + "learning_rate": 1.9467118238345752e-05, + "loss": 0.1538, + "num_input_tokens_seen": 81243536, + "step": 37635 + }, + { + "epoch": 6.140293637846656, + "grad_norm": 0.04456019401550293, + "learning_rate": 1.946017698993091e-05, + "loss": 0.0272, + "num_input_tokens_seen": 81255344, + "step": 37640 + }, + { + "epoch": 6.141109298531811, + "grad_norm": 0.0904713124036789, + "learning_rate": 1.945323619060223e-05, + "loss": 0.0301, + "num_input_tokens_seen": 81266544, + "step": 37645 + }, + { + "epoch": 6.141924959216966, + "grad_norm": 1.943692684173584, + "learning_rate": 1.944629584092237e-05, + "loss": 0.1787, + "num_input_tokens_seen": 81276976, + "step": 37650 + }, + { + "epoch": 6.142740619902121, + "grad_norm": 0.18819433450698853, + "learning_rate": 1.943935594145395e-05, + "loss": 0.0764, + "num_input_tokens_seen": 81288656, + "step": 37655 + }, + { + "epoch": 6.143556280587275, + "grad_norm": 2.4094033241271973, + "learning_rate": 1.9432416492759548e-05, + "loss": 0.256, + "num_input_tokens_seen": 81298928, + "step": 37660 + }, + { + "epoch": 6.14437194127243, + "grad_norm": 3.1942267417907715, + "learning_rate": 1.9425477495401716e-05, + "loss": 0.2215, + "num_input_tokens_seen": 81310224, + "step": 37665 + }, + { + "epoch": 6.145187601957586, + "grad_norm": 0.28428924083709717, + "learning_rate": 1.9418538949942962e-05, + "loss": 0.0482, + "num_input_tokens_seen": 81321616, + "step": 37670 + }, + { + "epoch": 6.146003262642741, + "grad_norm": 0.5270897150039673, + "learning_rate": 1.9411600856945763e-05, + "loss": 0.1565, + "num_input_tokens_seen": 81332656, + "step": 37675 + }, + { + "epoch": 6.146818923327896, + "grad_norm": 0.5083853602409363, + "learning_rate": 1.940466321697255e-05, + "loss": 0.1161, + "num_input_tokens_seen": 81343440, + "step": 37680 + }, + { + "epoch": 6.14763458401305, + "grad_norm": 0.4155411124229431, + "learning_rate": 1.9397726030585726e-05, + "loss": 0.1861, + "num_input_tokens_seen": 81354672, + "step": 37685 + }, + { + "epoch": 6.148450244698205, + "grad_norm": 0.8621602654457092, + "learning_rate": 1.939078929834766e-05, + "loss": 0.1145, + "num_input_tokens_seen": 81365808, + "step": 37690 + }, + { + "epoch": 6.149265905383361, + "grad_norm": 1.10995614528656, + "learning_rate": 1.9383853020820674e-05, + "loss": 0.0921, + "num_input_tokens_seen": 81375856, + "step": 37695 + }, + { + "epoch": 6.150081566068516, + "grad_norm": 1.2853572368621826, + "learning_rate": 1.9376917198567058e-05, + "loss": 0.056, + "num_input_tokens_seen": 81387184, + "step": 37700 + }, + { + "epoch": 6.150897226753671, + "grad_norm": 0.08852958679199219, + "learning_rate": 1.9369981832149064e-05, + "loss": 0.1032, + "num_input_tokens_seen": 81397712, + "step": 37705 + }, + { + "epoch": 6.151712887438825, + "grad_norm": 0.3179425299167633, + "learning_rate": 1.936304692212891e-05, + "loss": 0.1228, + "num_input_tokens_seen": 81408592, + "step": 37710 + }, + { + "epoch": 6.15252854812398, + "grad_norm": 2.3498713970184326, + "learning_rate": 1.9356112469068776e-05, + "loss": 0.2546, + "num_input_tokens_seen": 81420176, + "step": 37715 + }, + { + "epoch": 6.153344208809135, + "grad_norm": 1.5032501220703125, + "learning_rate": 1.93491784735308e-05, + "loss": 0.158, + "num_input_tokens_seen": 81431920, + "step": 37720 + }, + { + "epoch": 6.154159869494291, + "grad_norm": 0.4690800905227661, + "learning_rate": 1.934224493607709e-05, + "loss": 0.2206, + "num_input_tokens_seen": 81443312, + "step": 37725 + }, + { + "epoch": 6.1549755301794455, + "grad_norm": 0.8098165392875671, + "learning_rate": 1.933531185726971e-05, + "loss": 0.0956, + "num_input_tokens_seen": 81454736, + "step": 37730 + }, + { + "epoch": 6.1557911908646, + "grad_norm": 1.2583338022232056, + "learning_rate": 1.9328379237670684e-05, + "loss": 0.38, + "num_input_tokens_seen": 81465968, + "step": 37735 + }, + { + "epoch": 6.156606851549755, + "grad_norm": 0.38823747634887695, + "learning_rate": 1.9321447077842026e-05, + "loss": 0.09, + "num_input_tokens_seen": 81477264, + "step": 37740 + }, + { + "epoch": 6.15742251223491, + "grad_norm": 1.305921196937561, + "learning_rate": 1.931451537834568e-05, + "loss": 0.1133, + "num_input_tokens_seen": 81488080, + "step": 37745 + }, + { + "epoch": 6.158238172920065, + "grad_norm": 0.6341311931610107, + "learning_rate": 1.9307584139743564e-05, + "loss": 0.1905, + "num_input_tokens_seen": 81499536, + "step": 37750 + }, + { + "epoch": 6.1590538336052205, + "grad_norm": 0.09116479009389877, + "learning_rate": 1.9300653362597564e-05, + "loss": 0.0747, + "num_input_tokens_seen": 81511184, + "step": 37755 + }, + { + "epoch": 6.159869494290375, + "grad_norm": 0.09355226904153824, + "learning_rate": 1.929372304746952e-05, + "loss": 0.3754, + "num_input_tokens_seen": 81521648, + "step": 37760 + }, + { + "epoch": 6.16068515497553, + "grad_norm": 0.14583759009838104, + "learning_rate": 1.928679319492124e-05, + "loss": 0.0928, + "num_input_tokens_seen": 81532016, + "step": 37765 + }, + { + "epoch": 6.161500815660685, + "grad_norm": 1.8186074495315552, + "learning_rate": 1.9279863805514482e-05, + "loss": 0.1252, + "num_input_tokens_seen": 81543440, + "step": 37770 + }, + { + "epoch": 6.16231647634584, + "grad_norm": 1.0784066915512085, + "learning_rate": 1.9272934879810994e-05, + "loss": 0.1668, + "num_input_tokens_seen": 81553680, + "step": 37775 + }, + { + "epoch": 6.1631321370309955, + "grad_norm": 0.655130922794342, + "learning_rate": 1.9266006418372464e-05, + "loss": 0.0381, + "num_input_tokens_seen": 81564944, + "step": 37780 + }, + { + "epoch": 6.16394779771615, + "grad_norm": 0.48264598846435547, + "learning_rate": 1.925907842176055e-05, + "loss": 0.1128, + "num_input_tokens_seen": 81575376, + "step": 37785 + }, + { + "epoch": 6.164763458401305, + "grad_norm": 1.4524091482162476, + "learning_rate": 1.925215089053687e-05, + "loss": 0.1838, + "num_input_tokens_seen": 81585840, + "step": 37790 + }, + { + "epoch": 6.16557911908646, + "grad_norm": 1.326046347618103, + "learning_rate": 1.9245223825262997e-05, + "loss": 0.1046, + "num_input_tokens_seen": 81597072, + "step": 37795 + }, + { + "epoch": 6.166394779771615, + "grad_norm": 0.1233883872628212, + "learning_rate": 1.9238297226500483e-05, + "loss": 0.2139, + "num_input_tokens_seen": 81608336, + "step": 37800 + }, + { + "epoch": 6.16721044045677, + "grad_norm": 0.28992077708244324, + "learning_rate": 1.923137109481083e-05, + "loss": 0.123, + "num_input_tokens_seen": 81619440, + "step": 37805 + }, + { + "epoch": 6.168026101141925, + "grad_norm": 1.1832324266433716, + "learning_rate": 1.9224445430755507e-05, + "loss": 0.1144, + "num_input_tokens_seen": 81629648, + "step": 37810 + }, + { + "epoch": 6.16884176182708, + "grad_norm": 0.07100293785333633, + "learning_rate": 1.9217520234895943e-05, + "loss": 0.2464, + "num_input_tokens_seen": 81639408, + "step": 37815 + }, + { + "epoch": 6.169657422512235, + "grad_norm": 0.4030412435531616, + "learning_rate": 1.9210595507793526e-05, + "loss": 0.1136, + "num_input_tokens_seen": 81650704, + "step": 37820 + }, + { + "epoch": 6.17047308319739, + "grad_norm": 0.061928603798151016, + "learning_rate": 1.9203671250009612e-05, + "loss": 0.0322, + "num_input_tokens_seen": 81661328, + "step": 37825 + }, + { + "epoch": 6.171288743882545, + "grad_norm": 0.3157247006893158, + "learning_rate": 1.9196747462105517e-05, + "loss": 0.0599, + "num_input_tokens_seen": 81671728, + "step": 37830 + }, + { + "epoch": 6.1721044045677, + "grad_norm": 0.6579276323318481, + "learning_rate": 1.918982414464252e-05, + "loss": 0.0728, + "num_input_tokens_seen": 81681936, + "step": 37835 + }, + { + "epoch": 6.172920065252855, + "grad_norm": 1.1576712131500244, + "learning_rate": 1.918290129818185e-05, + "loss": 0.1561, + "num_input_tokens_seen": 81692592, + "step": 37840 + }, + { + "epoch": 6.17373572593801, + "grad_norm": 0.7833705544471741, + "learning_rate": 1.9175978923284727e-05, + "loss": 0.1143, + "num_input_tokens_seen": 81703696, + "step": 37845 + }, + { + "epoch": 6.174551386623165, + "grad_norm": 0.5265875458717346, + "learning_rate": 1.91690570205123e-05, + "loss": 0.0755, + "num_input_tokens_seen": 81714448, + "step": 37850 + }, + { + "epoch": 6.1753670473083195, + "grad_norm": 0.07639346271753311, + "learning_rate": 1.916213559042569e-05, + "loss": 0.1681, + "num_input_tokens_seen": 81724560, + "step": 37855 + }, + { + "epoch": 6.176182707993474, + "grad_norm": 0.057384561747312546, + "learning_rate": 1.9155214633586e-05, + "loss": 0.0133, + "num_input_tokens_seen": 81734384, + "step": 37860 + }, + { + "epoch": 6.17699836867863, + "grad_norm": 0.4263990521430969, + "learning_rate": 1.9148294150554266e-05, + "loss": 0.0646, + "num_input_tokens_seen": 81745552, + "step": 37865 + }, + { + "epoch": 6.177814029363785, + "grad_norm": 0.5079492330551147, + "learning_rate": 1.9141374141891498e-05, + "loss": 0.1261, + "num_input_tokens_seen": 81756144, + "step": 37870 + }, + { + "epoch": 6.17862969004894, + "grad_norm": 1.397243618965149, + "learning_rate": 1.913445460815867e-05, + "loss": 0.1725, + "num_input_tokens_seen": 81767024, + "step": 37875 + }, + { + "epoch": 6.1794453507340945, + "grad_norm": 2.929673910140991, + "learning_rate": 1.9127535549916715e-05, + "loss": 0.2618, + "num_input_tokens_seen": 81777488, + "step": 37880 + }, + { + "epoch": 6.180261011419249, + "grad_norm": 0.33935460448265076, + "learning_rate": 1.912061696772652e-05, + "loss": 0.2495, + "num_input_tokens_seen": 81788624, + "step": 37885 + }, + { + "epoch": 6.181076672104404, + "grad_norm": 0.2385120540857315, + "learning_rate": 1.911369886214895e-05, + "loss": 0.0692, + "num_input_tokens_seen": 81798224, + "step": 37890 + }, + { + "epoch": 6.18189233278956, + "grad_norm": 0.21372321248054504, + "learning_rate": 1.9106781233744813e-05, + "loss": 0.0438, + "num_input_tokens_seen": 81808976, + "step": 37895 + }, + { + "epoch": 6.182707993474715, + "grad_norm": 2.145467758178711, + "learning_rate": 1.9099864083074892e-05, + "loss": 0.1585, + "num_input_tokens_seen": 81819760, + "step": 37900 + }, + { + "epoch": 6.1835236541598695, + "grad_norm": 0.1120305061340332, + "learning_rate": 1.9092947410699927e-05, + "loss": 0.2096, + "num_input_tokens_seen": 81830416, + "step": 37905 + }, + { + "epoch": 6.184339314845024, + "grad_norm": 1.4638912677764893, + "learning_rate": 1.9086031217180618e-05, + "loss": 0.267, + "num_input_tokens_seen": 81842416, + "step": 37910 + }, + { + "epoch": 6.185154975530179, + "grad_norm": 0.21343189477920532, + "learning_rate": 1.9079115503077617e-05, + "loss": 0.0752, + "num_input_tokens_seen": 81852080, + "step": 37915 + }, + { + "epoch": 6.185970636215335, + "grad_norm": 0.579294741153717, + "learning_rate": 1.9072200268951562e-05, + "loss": 0.1258, + "num_input_tokens_seen": 81862096, + "step": 37920 + }, + { + "epoch": 6.18678629690049, + "grad_norm": 1.602110743522644, + "learning_rate": 1.906528551536303e-05, + "loss": 0.0848, + "num_input_tokens_seen": 81872816, + "step": 37925 + }, + { + "epoch": 6.1876019575856445, + "grad_norm": 1.6496732234954834, + "learning_rate": 1.905837124287257e-05, + "loss": 0.1577, + "num_input_tokens_seen": 81883120, + "step": 37930 + }, + { + "epoch": 6.188417618270799, + "grad_norm": 0.43160343170166016, + "learning_rate": 1.905145745204068e-05, + "loss": 0.102, + "num_input_tokens_seen": 81893904, + "step": 37935 + }, + { + "epoch": 6.189233278955954, + "grad_norm": 0.06898845732212067, + "learning_rate": 1.9044544143427832e-05, + "loss": 0.144, + "num_input_tokens_seen": 81904368, + "step": 37940 + }, + { + "epoch": 6.190048939641109, + "grad_norm": 0.2113545835018158, + "learning_rate": 1.9037631317594445e-05, + "loss": 0.0988, + "num_input_tokens_seen": 81915664, + "step": 37945 + }, + { + "epoch": 6.190864600326265, + "grad_norm": 0.16898232698440552, + "learning_rate": 1.9030718975100927e-05, + "loss": 0.0227, + "num_input_tokens_seen": 81926736, + "step": 37950 + }, + { + "epoch": 6.191680261011419, + "grad_norm": 0.04708011820912361, + "learning_rate": 1.9023807116507615e-05, + "loss": 0.0324, + "num_input_tokens_seen": 81937552, + "step": 37955 + }, + { + "epoch": 6.192495921696574, + "grad_norm": 0.24296963214874268, + "learning_rate": 1.9016895742374824e-05, + "loss": 0.2787, + "num_input_tokens_seen": 81947120, + "step": 37960 + }, + { + "epoch": 6.193311582381729, + "grad_norm": 1.4290884733200073, + "learning_rate": 1.900998485326282e-05, + "loss": 0.1558, + "num_input_tokens_seen": 81955216, + "step": 37965 + }, + { + "epoch": 6.194127243066884, + "grad_norm": 0.5316212773323059, + "learning_rate": 1.9003074449731835e-05, + "loss": 0.173, + "num_input_tokens_seen": 81965360, + "step": 37970 + }, + { + "epoch": 6.19494290375204, + "grad_norm": 1.7918905019760132, + "learning_rate": 1.8996164532342065e-05, + "loss": 0.3234, + "num_input_tokens_seen": 81976112, + "step": 37975 + }, + { + "epoch": 6.195758564437194, + "grad_norm": 1.7267645597457886, + "learning_rate": 1.8989255101653662e-05, + "loss": 0.1431, + "num_input_tokens_seen": 81987568, + "step": 37980 + }, + { + "epoch": 6.196574225122349, + "grad_norm": 1.3252222537994385, + "learning_rate": 1.898234615822674e-05, + "loss": 0.1666, + "num_input_tokens_seen": 81998096, + "step": 37985 + }, + { + "epoch": 6.197389885807504, + "grad_norm": 0.12323681265115738, + "learning_rate": 1.8975437702621368e-05, + "loss": 0.0211, + "num_input_tokens_seen": 82009872, + "step": 37990 + }, + { + "epoch": 6.198205546492659, + "grad_norm": 0.13478881120681763, + "learning_rate": 1.8968529735397582e-05, + "loss": 0.0932, + "num_input_tokens_seen": 82020080, + "step": 37995 + }, + { + "epoch": 6.199021207177814, + "grad_norm": 0.08421572297811508, + "learning_rate": 1.896162225711538e-05, + "loss": 0.0624, + "num_input_tokens_seen": 82030416, + "step": 38000 + }, + { + "epoch": 6.199836867862969, + "grad_norm": 0.7266654372215271, + "learning_rate": 1.895471526833472e-05, + "loss": 0.0645, + "num_input_tokens_seen": 82041296, + "step": 38005 + }, + { + "epoch": 6.200652528548124, + "grad_norm": 0.12099231779575348, + "learning_rate": 1.8947808769615512e-05, + "loss": 0.0876, + "num_input_tokens_seen": 82051024, + "step": 38010 + }, + { + "epoch": 6.201468189233279, + "grad_norm": 0.3940577805042267, + "learning_rate": 1.8940902761517638e-05, + "loss": 0.2693, + "num_input_tokens_seen": 82062288, + "step": 38015 + }, + { + "epoch": 6.202283849918434, + "grad_norm": 0.07157322019338608, + "learning_rate": 1.8933997244600923e-05, + "loss": 0.093, + "num_input_tokens_seen": 82073808, + "step": 38020 + }, + { + "epoch": 6.203099510603589, + "grad_norm": 0.23391515016555786, + "learning_rate": 1.8927092219425174e-05, + "loss": 0.1162, + "num_input_tokens_seen": 82083728, + "step": 38025 + }, + { + "epoch": 6.2039151712887435, + "grad_norm": 2.730612277984619, + "learning_rate": 1.892018768655014e-05, + "loss": 0.1582, + "num_input_tokens_seen": 82095600, + "step": 38030 + }, + { + "epoch": 6.204730831973899, + "grad_norm": 0.2251482456922531, + "learning_rate": 1.8913283646535547e-05, + "loss": 0.096, + "num_input_tokens_seen": 82105136, + "step": 38035 + }, + { + "epoch": 6.205546492659054, + "grad_norm": 0.06162997707724571, + "learning_rate": 1.890638009994106e-05, + "loss": 0.0426, + "num_input_tokens_seen": 82115504, + "step": 38040 + }, + { + "epoch": 6.206362153344209, + "grad_norm": 0.8598052859306335, + "learning_rate": 1.889947704732632e-05, + "loss": 0.2407, + "num_input_tokens_seen": 82126864, + "step": 38045 + }, + { + "epoch": 6.207177814029364, + "grad_norm": 0.3058079779148102, + "learning_rate": 1.889257448925093e-05, + "loss": 0.1945, + "num_input_tokens_seen": 82137520, + "step": 38050 + }, + { + "epoch": 6.2079934747145185, + "grad_norm": 0.3898675739765167, + "learning_rate": 1.8885672426274424e-05, + "loss": 0.1118, + "num_input_tokens_seen": 82148976, + "step": 38055 + }, + { + "epoch": 6.208809135399674, + "grad_norm": 0.4696159362792969, + "learning_rate": 1.8878770858956353e-05, + "loss": 0.0817, + "num_input_tokens_seen": 82160496, + "step": 38060 + }, + { + "epoch": 6.209624796084829, + "grad_norm": 1.5819545984268188, + "learning_rate": 1.8871869787856166e-05, + "loss": 0.099, + "num_input_tokens_seen": 82171440, + "step": 38065 + }, + { + "epoch": 6.210440456769984, + "grad_norm": 0.6235008835792542, + "learning_rate": 1.886496921353331e-05, + "loss": 0.2032, + "num_input_tokens_seen": 82182704, + "step": 38070 + }, + { + "epoch": 6.211256117455139, + "grad_norm": 0.1925574541091919, + "learning_rate": 1.8858069136547186e-05, + "loss": 0.144, + "num_input_tokens_seen": 82193392, + "step": 38075 + }, + { + "epoch": 6.212071778140293, + "grad_norm": 1.2430566549301147, + "learning_rate": 1.8851169557457128e-05, + "loss": 0.1677, + "num_input_tokens_seen": 82204080, + "step": 38080 + }, + { + "epoch": 6.212887438825448, + "grad_norm": 1.1717830896377563, + "learning_rate": 1.8844270476822473e-05, + "loss": 0.0726, + "num_input_tokens_seen": 82215440, + "step": 38085 + }, + { + "epoch": 6.213703099510604, + "grad_norm": 1.3006101846694946, + "learning_rate": 1.883737189520249e-05, + "loss": 0.109, + "num_input_tokens_seen": 82226832, + "step": 38090 + }, + { + "epoch": 6.214518760195759, + "grad_norm": 0.04921629652380943, + "learning_rate": 1.88304738131564e-05, + "loss": 0.0419, + "num_input_tokens_seen": 82237296, + "step": 38095 + }, + { + "epoch": 6.215334420880914, + "grad_norm": 0.9212018847465515, + "learning_rate": 1.8823576231243418e-05, + "loss": 0.1411, + "num_input_tokens_seen": 82248400, + "step": 38100 + }, + { + "epoch": 6.216150081566068, + "grad_norm": 0.2744962275028229, + "learning_rate": 1.881667915002268e-05, + "loss": 0.1234, + "num_input_tokens_seen": 82259088, + "step": 38105 + }, + { + "epoch": 6.216965742251223, + "grad_norm": 0.08988190442323685, + "learning_rate": 1.8809782570053304e-05, + "loss": 0.2305, + "num_input_tokens_seen": 82269424, + "step": 38110 + }, + { + "epoch": 6.217781402936378, + "grad_norm": 2.44431734085083, + "learning_rate": 1.880288649189436e-05, + "loss": 0.3185, + "num_input_tokens_seen": 82280144, + "step": 38115 + }, + { + "epoch": 6.218597063621534, + "grad_norm": 0.07340530306100845, + "learning_rate": 1.8795990916104886e-05, + "loss": 0.1107, + "num_input_tokens_seen": 82290672, + "step": 38120 + }, + { + "epoch": 6.219412724306689, + "grad_norm": 0.5772780179977417, + "learning_rate": 1.8789095843243863e-05, + "loss": 0.1177, + "num_input_tokens_seen": 82301456, + "step": 38125 + }, + { + "epoch": 6.220228384991843, + "grad_norm": 1.8559118509292603, + "learning_rate": 1.878220127387025e-05, + "loss": 0.1304, + "num_input_tokens_seen": 82311696, + "step": 38130 + }, + { + "epoch": 6.221044045676998, + "grad_norm": 1.65354323387146, + "learning_rate": 1.8775307208542946e-05, + "loss": 0.1028, + "num_input_tokens_seen": 82322320, + "step": 38135 + }, + { + "epoch": 6.221859706362153, + "grad_norm": 1.1038422584533691, + "learning_rate": 1.8768413647820817e-05, + "loss": 0.1479, + "num_input_tokens_seen": 82332976, + "step": 38140 + }, + { + "epoch": 6.222675367047309, + "grad_norm": 3.105799436569214, + "learning_rate": 1.8761520592262704e-05, + "loss": 0.2367, + "num_input_tokens_seen": 82343920, + "step": 38145 + }, + { + "epoch": 6.2234910277324635, + "grad_norm": 0.2445458173751831, + "learning_rate": 1.8754628042427387e-05, + "loss": 0.0864, + "num_input_tokens_seen": 82354928, + "step": 38150 + }, + { + "epoch": 6.224306688417618, + "grad_norm": 0.4313603341579437, + "learning_rate": 1.8747735998873604e-05, + "loss": 0.1006, + "num_input_tokens_seen": 82366544, + "step": 38155 + }, + { + "epoch": 6.225122349102773, + "grad_norm": 0.6973852515220642, + "learning_rate": 1.8740844462160064e-05, + "loss": 0.0233, + "num_input_tokens_seen": 82377520, + "step": 38160 + }, + { + "epoch": 6.225938009787928, + "grad_norm": 0.12195112556219101, + "learning_rate": 1.873395343284543e-05, + "loss": 0.0462, + "num_input_tokens_seen": 82388240, + "step": 38165 + }, + { + "epoch": 6.226753670473083, + "grad_norm": 0.16192711889743805, + "learning_rate": 1.872706291148833e-05, + "loss": 0.1905, + "num_input_tokens_seen": 82399984, + "step": 38170 + }, + { + "epoch": 6.2275693311582385, + "grad_norm": 0.13061641156673431, + "learning_rate": 1.8720172898647338e-05, + "loss": 0.1238, + "num_input_tokens_seen": 82410992, + "step": 38175 + }, + { + "epoch": 6.228384991843393, + "grad_norm": 0.16985392570495605, + "learning_rate": 1.8713283394880993e-05, + "loss": 0.0541, + "num_input_tokens_seen": 82422224, + "step": 38180 + }, + { + "epoch": 6.229200652528548, + "grad_norm": 1.3945404291152954, + "learning_rate": 1.8706394400747796e-05, + "loss": 0.181, + "num_input_tokens_seen": 82432624, + "step": 38185 + }, + { + "epoch": 6.230016313213703, + "grad_norm": 0.45256587862968445, + "learning_rate": 1.8699505916806205e-05, + "loss": 0.1851, + "num_input_tokens_seen": 82443696, + "step": 38190 + }, + { + "epoch": 6.230831973898858, + "grad_norm": 0.5792465209960938, + "learning_rate": 1.869261794361463e-05, + "loss": 0.1233, + "num_input_tokens_seen": 82454736, + "step": 38195 + }, + { + "epoch": 6.231647634584013, + "grad_norm": 1.128084659576416, + "learning_rate": 1.8685730481731444e-05, + "loss": 0.2736, + "num_input_tokens_seen": 82465072, + "step": 38200 + }, + { + "epoch": 6.232463295269168, + "grad_norm": 0.3472203314304352, + "learning_rate": 1.867884353171499e-05, + "loss": 0.0872, + "num_input_tokens_seen": 82475760, + "step": 38205 + }, + { + "epoch": 6.233278955954323, + "grad_norm": 0.1519133597612381, + "learning_rate": 1.867195709412355e-05, + "loss": 0.1384, + "num_input_tokens_seen": 82487056, + "step": 38210 + }, + { + "epoch": 6.234094616639478, + "grad_norm": 0.06282222270965576, + "learning_rate": 1.8665071169515375e-05, + "loss": 0.2076, + "num_input_tokens_seen": 82498544, + "step": 38215 + }, + { + "epoch": 6.234910277324633, + "grad_norm": 0.05076301842927933, + "learning_rate": 1.8658185758448676e-05, + "loss": 0.1925, + "num_input_tokens_seen": 82510096, + "step": 38220 + }, + { + "epoch": 6.235725938009788, + "grad_norm": 0.6093469858169556, + "learning_rate": 1.8651300861481614e-05, + "loss": 0.0424, + "num_input_tokens_seen": 82520752, + "step": 38225 + }, + { + "epoch": 6.236541598694943, + "grad_norm": 0.6091616153717041, + "learning_rate": 1.8644416479172316e-05, + "loss": 0.1319, + "num_input_tokens_seen": 82531760, + "step": 38230 + }, + { + "epoch": 6.237357259380098, + "grad_norm": 0.16844359040260315, + "learning_rate": 1.8637532612078872e-05, + "loss": 0.0609, + "num_input_tokens_seen": 82542576, + "step": 38235 + }, + { + "epoch": 6.238172920065253, + "grad_norm": 0.9415881037712097, + "learning_rate": 1.8630649260759315e-05, + "loss": 0.1614, + "num_input_tokens_seen": 82553168, + "step": 38240 + }, + { + "epoch": 6.238988580750408, + "grad_norm": 0.15008537471294403, + "learning_rate": 1.8623766425771648e-05, + "loss": 0.1213, + "num_input_tokens_seen": 82563280, + "step": 38245 + }, + { + "epoch": 6.239804241435563, + "grad_norm": 0.041583236306905746, + "learning_rate": 1.8616884107673823e-05, + "loss": 0.1096, + "num_input_tokens_seen": 82575120, + "step": 38250 + }, + { + "epoch": 6.240619902120717, + "grad_norm": 2.0050904750823975, + "learning_rate": 1.8610002307023767e-05, + "loss": 0.1999, + "num_input_tokens_seen": 82586032, + "step": 38255 + }, + { + "epoch": 6.241435562805873, + "grad_norm": 2.8533425331115723, + "learning_rate": 1.860312102437934e-05, + "loss": 0.1971, + "num_input_tokens_seen": 82597040, + "step": 38260 + }, + { + "epoch": 6.242251223491028, + "grad_norm": 0.06929420679807663, + "learning_rate": 1.859624026029837e-05, + "loss": 0.0344, + "num_input_tokens_seen": 82608400, + "step": 38265 + }, + { + "epoch": 6.243066884176183, + "grad_norm": 1.585404872894287, + "learning_rate": 1.8589360015338668e-05, + "loss": 0.1309, + "num_input_tokens_seen": 82618000, + "step": 38270 + }, + { + "epoch": 6.2438825448613375, + "grad_norm": 0.8127843737602234, + "learning_rate": 1.8582480290057975e-05, + "loss": 0.1599, + "num_input_tokens_seen": 82628880, + "step": 38275 + }, + { + "epoch": 6.244698205546492, + "grad_norm": 0.06787487119436264, + "learning_rate": 1.8575601085013988e-05, + "loss": 0.0955, + "num_input_tokens_seen": 82639216, + "step": 38280 + }, + { + "epoch": 6.245513866231648, + "grad_norm": 0.3458750247955322, + "learning_rate": 1.8568722400764377e-05, + "loss": 0.0668, + "num_input_tokens_seen": 82649264, + "step": 38285 + }, + { + "epoch": 6.246329526916803, + "grad_norm": 0.09428279101848602, + "learning_rate": 1.8561844237866756e-05, + "loss": 0.1588, + "num_input_tokens_seen": 82660112, + "step": 38290 + }, + { + "epoch": 6.247145187601958, + "grad_norm": 0.18826758861541748, + "learning_rate": 1.855496659687871e-05, + "loss": 0.0747, + "num_input_tokens_seen": 82671408, + "step": 38295 + }, + { + "epoch": 6.2479608482871125, + "grad_norm": 0.3293280601501465, + "learning_rate": 1.8548089478357774e-05, + "loss": 0.0786, + "num_input_tokens_seen": 82682928, + "step": 38300 + }, + { + "epoch": 6.248776508972267, + "grad_norm": 0.6067017912864685, + "learning_rate": 1.8541212882861442e-05, + "loss": 0.0822, + "num_input_tokens_seen": 82694864, + "step": 38305 + }, + { + "epoch": 6.249592169657422, + "grad_norm": 0.39619114995002747, + "learning_rate": 1.853433681094716e-05, + "loss": 0.0325, + "num_input_tokens_seen": 82705296, + "step": 38310 + }, + { + "epoch": 6.250407830342578, + "grad_norm": 0.3921799063682556, + "learning_rate": 1.8527461263172346e-05, + "loss": 0.1989, + "num_input_tokens_seen": 82716304, + "step": 38315 + }, + { + "epoch": 6.251223491027733, + "grad_norm": 0.0909445658326149, + "learning_rate": 1.852058624009436e-05, + "loss": 0.0825, + "num_input_tokens_seen": 82728272, + "step": 38320 + }, + { + "epoch": 6.2520391517128875, + "grad_norm": 1.3079830408096313, + "learning_rate": 1.8513711742270535e-05, + "loss": 0.1517, + "num_input_tokens_seen": 82738896, + "step": 38325 + }, + { + "epoch": 6.252854812398042, + "grad_norm": 0.34147194027900696, + "learning_rate": 1.8506837770258147e-05, + "loss": 0.0477, + "num_input_tokens_seen": 82749712, + "step": 38330 + }, + { + "epoch": 6.253670473083197, + "grad_norm": 0.1973235160112381, + "learning_rate": 1.8499964324614434e-05, + "loss": 0.03, + "num_input_tokens_seen": 82758608, + "step": 38335 + }, + { + "epoch": 6.254486133768353, + "grad_norm": 0.5900463461875916, + "learning_rate": 1.8493091405896595e-05, + "loss": 0.0483, + "num_input_tokens_seen": 82770288, + "step": 38340 + }, + { + "epoch": 6.255301794453508, + "grad_norm": 1.7357701063156128, + "learning_rate": 1.8486219014661782e-05, + "loss": 0.1872, + "num_input_tokens_seen": 82782448, + "step": 38345 + }, + { + "epoch": 6.2561174551386625, + "grad_norm": 0.44334131479263306, + "learning_rate": 1.8479347151467106e-05, + "loss": 0.0472, + "num_input_tokens_seen": 82793264, + "step": 38350 + }, + { + "epoch": 6.256933115823817, + "grad_norm": 0.09228410571813583, + "learning_rate": 1.8472475816869634e-05, + "loss": 0.0605, + "num_input_tokens_seen": 82804688, + "step": 38355 + }, + { + "epoch": 6.257748776508972, + "grad_norm": 0.4074892997741699, + "learning_rate": 1.8465605011426395e-05, + "loss": 0.0345, + "num_input_tokens_seen": 82816336, + "step": 38360 + }, + { + "epoch": 6.258564437194127, + "grad_norm": 0.1134694293141365, + "learning_rate": 1.8458734735694366e-05, + "loss": 0.0225, + "num_input_tokens_seen": 82827408, + "step": 38365 + }, + { + "epoch": 6.259380097879283, + "grad_norm": 1.9750169515609741, + "learning_rate": 1.8451864990230488e-05, + "loss": 0.1733, + "num_input_tokens_seen": 82838128, + "step": 38370 + }, + { + "epoch": 6.260195758564437, + "grad_norm": 0.17419013381004333, + "learning_rate": 1.8444995775591654e-05, + "loss": 0.07, + "num_input_tokens_seen": 82849072, + "step": 38375 + }, + { + "epoch": 6.261011419249592, + "grad_norm": 1.8781309127807617, + "learning_rate": 1.8438127092334732e-05, + "loss": 0.1019, + "num_input_tokens_seen": 82859088, + "step": 38380 + }, + { + "epoch": 6.261827079934747, + "grad_norm": 0.24991951882839203, + "learning_rate": 1.843125894101652e-05, + "loss": 0.0415, + "num_input_tokens_seen": 82870480, + "step": 38385 + }, + { + "epoch": 6.262642740619902, + "grad_norm": 1.5005748271942139, + "learning_rate": 1.8424391322193787e-05, + "loss": 0.1247, + "num_input_tokens_seen": 82880720, + "step": 38390 + }, + { + "epoch": 6.263458401305057, + "grad_norm": 0.24082915484905243, + "learning_rate": 1.8417524236423257e-05, + "loss": 0.0818, + "num_input_tokens_seen": 82891536, + "step": 38395 + }, + { + "epoch": 6.264274061990212, + "grad_norm": 0.6897857785224915, + "learning_rate": 1.8410657684261613e-05, + "loss": 0.2388, + "num_input_tokens_seen": 82903056, + "step": 38400 + }, + { + "epoch": 6.265089722675367, + "grad_norm": 1.4158753156661987, + "learning_rate": 1.840379166626549e-05, + "loss": 0.1478, + "num_input_tokens_seen": 82912912, + "step": 38405 + }, + { + "epoch": 6.265905383360522, + "grad_norm": 1.0224100351333618, + "learning_rate": 1.8396926182991485e-05, + "loss": 0.1475, + "num_input_tokens_seen": 82923664, + "step": 38410 + }, + { + "epoch": 6.266721044045677, + "grad_norm": 0.17481079697608948, + "learning_rate": 1.8390061234996147e-05, + "loss": 0.0297, + "num_input_tokens_seen": 82934256, + "step": 38415 + }, + { + "epoch": 6.267536704730832, + "grad_norm": 1.6420542001724243, + "learning_rate": 1.8383196822835984e-05, + "loss": 0.2166, + "num_input_tokens_seen": 82945712, + "step": 38420 + }, + { + "epoch": 6.268352365415987, + "grad_norm": 0.8055577278137207, + "learning_rate": 1.837633294706746e-05, + "loss": 0.0608, + "num_input_tokens_seen": 82956720, + "step": 38425 + }, + { + "epoch": 6.269168026101142, + "grad_norm": 0.6668297648429871, + "learning_rate": 1.8369469608246993e-05, + "loss": 0.122, + "num_input_tokens_seen": 82967856, + "step": 38430 + }, + { + "epoch": 6.269983686786297, + "grad_norm": 1.003434181213379, + "learning_rate": 1.8362606806930964e-05, + "loss": 0.1381, + "num_input_tokens_seen": 82979472, + "step": 38435 + }, + { + "epoch": 6.270799347471452, + "grad_norm": 1.0121899843215942, + "learning_rate": 1.835574454367571e-05, + "loss": 0.0523, + "num_input_tokens_seen": 82989264, + "step": 38440 + }, + { + "epoch": 6.271615008156607, + "grad_norm": 1.3940902948379517, + "learning_rate": 1.834888281903751e-05, + "loss": 0.1738, + "num_input_tokens_seen": 83000368, + "step": 38445 + }, + { + "epoch": 6.2724306688417615, + "grad_norm": 0.04603279009461403, + "learning_rate": 1.8342021633572617e-05, + "loss": 0.0805, + "num_input_tokens_seen": 83011376, + "step": 38450 + }, + { + "epoch": 6.273246329526917, + "grad_norm": 0.20990420877933502, + "learning_rate": 1.833516098783723e-05, + "loss": 0.1213, + "num_input_tokens_seen": 83021008, + "step": 38455 + }, + { + "epoch": 6.274061990212072, + "grad_norm": 0.33278489112854004, + "learning_rate": 1.832830088238751e-05, + "loss": 0.2066, + "num_input_tokens_seen": 83031888, + "step": 38460 + }, + { + "epoch": 6.274877650897227, + "grad_norm": 0.18618284165859222, + "learning_rate": 1.832144131777958e-05, + "loss": 0.2239, + "num_input_tokens_seen": 83042032, + "step": 38465 + }, + { + "epoch": 6.275693311582382, + "grad_norm": 0.34871163964271545, + "learning_rate": 1.83145822945695e-05, + "loss": 0.2265, + "num_input_tokens_seen": 83051920, + "step": 38470 + }, + { + "epoch": 6.2765089722675365, + "grad_norm": 0.2911875545978546, + "learning_rate": 1.8307723813313298e-05, + "loss": 0.3023, + "num_input_tokens_seen": 83063280, + "step": 38475 + }, + { + "epoch": 6.277324632952691, + "grad_norm": 0.7013974189758301, + "learning_rate": 1.8300865874566953e-05, + "loss": 0.0896, + "num_input_tokens_seen": 83072112, + "step": 38480 + }, + { + "epoch": 6.278140293637847, + "grad_norm": 0.27515771985054016, + "learning_rate": 1.829400847888642e-05, + "loss": 0.0609, + "num_input_tokens_seen": 83083728, + "step": 38485 + }, + { + "epoch": 6.278955954323002, + "grad_norm": 2.438035249710083, + "learning_rate": 1.8287151626827586e-05, + "loss": 0.2944, + "num_input_tokens_seen": 83094256, + "step": 38490 + }, + { + "epoch": 6.279771615008157, + "grad_norm": 0.44357040524482727, + "learning_rate": 1.8280295318946304e-05, + "loss": 0.1682, + "num_input_tokens_seen": 83105552, + "step": 38495 + }, + { + "epoch": 6.280587275693311, + "grad_norm": 0.8918850421905518, + "learning_rate": 1.827343955579838e-05, + "loss": 0.0633, + "num_input_tokens_seen": 83117296, + "step": 38500 + }, + { + "epoch": 6.281402936378466, + "grad_norm": 0.4298608601093292, + "learning_rate": 1.8266584337939568e-05, + "loss": 0.1298, + "num_input_tokens_seen": 83127824, + "step": 38505 + }, + { + "epoch": 6.282218597063622, + "grad_norm": 0.8997576832771301, + "learning_rate": 1.82597296659256e-05, + "loss": 0.1533, + "num_input_tokens_seen": 83138608, + "step": 38510 + }, + { + "epoch": 6.283034257748777, + "grad_norm": 1.2740041017532349, + "learning_rate": 1.8252875540312143e-05, + "loss": 0.1279, + "num_input_tokens_seen": 83149936, + "step": 38515 + }, + { + "epoch": 6.283849918433932, + "grad_norm": 2.230847120285034, + "learning_rate": 1.824602196165483e-05, + "loss": 0.1626, + "num_input_tokens_seen": 83160528, + "step": 38520 + }, + { + "epoch": 6.284665579119086, + "grad_norm": 1.1060634851455688, + "learning_rate": 1.823916893050925e-05, + "loss": 0.1319, + "num_input_tokens_seen": 83170640, + "step": 38525 + }, + { + "epoch": 6.285481239804241, + "grad_norm": 0.1261487603187561, + "learning_rate": 1.8232316447430936e-05, + "loss": 0.0507, + "num_input_tokens_seen": 83182128, + "step": 38530 + }, + { + "epoch": 6.286296900489396, + "grad_norm": 0.7608110308647156, + "learning_rate": 1.822546451297539e-05, + "loss": 0.232, + "num_input_tokens_seen": 83192368, + "step": 38535 + }, + { + "epoch": 6.287112561174552, + "grad_norm": 0.9296764135360718, + "learning_rate": 1.8218613127698058e-05, + "loss": 0.1389, + "num_input_tokens_seen": 83203024, + "step": 38540 + }, + { + "epoch": 6.287928221859707, + "grad_norm": 1.2047080993652344, + "learning_rate": 1.8211762292154362e-05, + "loss": 0.2235, + "num_input_tokens_seen": 83214160, + "step": 38545 + }, + { + "epoch": 6.288743882544861, + "grad_norm": 0.04854205250740051, + "learning_rate": 1.820491200689966e-05, + "loss": 0.2408, + "num_input_tokens_seen": 83224784, + "step": 38550 + }, + { + "epoch": 6.289559543230016, + "grad_norm": 0.3196149170398712, + "learning_rate": 1.8198062272489263e-05, + "loss": 0.0584, + "num_input_tokens_seen": 83235344, + "step": 38555 + }, + { + "epoch": 6.290375203915171, + "grad_norm": 0.3402091860771179, + "learning_rate": 1.8191213089478455e-05, + "loss": 0.1839, + "num_input_tokens_seen": 83246384, + "step": 38560 + }, + { + "epoch": 6.291190864600326, + "grad_norm": 1.4562629461288452, + "learning_rate": 1.818436445842246e-05, + "loss": 0.2233, + "num_input_tokens_seen": 83257520, + "step": 38565 + }, + { + "epoch": 6.2920065252854815, + "grad_norm": 0.22364631295204163, + "learning_rate": 1.8177516379876463e-05, + "loss": 0.1387, + "num_input_tokens_seen": 83267824, + "step": 38570 + }, + { + "epoch": 6.292822185970636, + "grad_norm": 0.18184177577495575, + "learning_rate": 1.817066885439561e-05, + "loss": 0.0681, + "num_input_tokens_seen": 83277904, + "step": 38575 + }, + { + "epoch": 6.293637846655791, + "grad_norm": 0.563838005065918, + "learning_rate": 1.8163821882534986e-05, + "loss": 0.0135, + "num_input_tokens_seen": 83287280, + "step": 38580 + }, + { + "epoch": 6.294453507340946, + "grad_norm": 1.4078260660171509, + "learning_rate": 1.815697546484964e-05, + "loss": 0.2524, + "num_input_tokens_seen": 83297520, + "step": 38585 + }, + { + "epoch": 6.295269168026101, + "grad_norm": 0.029773715883493423, + "learning_rate": 1.8150129601894592e-05, + "loss": 0.0529, + "num_input_tokens_seen": 83308400, + "step": 38590 + }, + { + "epoch": 6.2960848287112565, + "grad_norm": 0.8573611378669739, + "learning_rate": 1.8143284294224794e-05, + "loss": 0.2749, + "num_input_tokens_seen": 83321008, + "step": 38595 + }, + { + "epoch": 6.296900489396411, + "grad_norm": 0.19959206879138947, + "learning_rate": 1.813643954239516e-05, + "loss": 0.1123, + "num_input_tokens_seen": 83333264, + "step": 38600 + }, + { + "epoch": 6.297716150081566, + "grad_norm": 0.46034571528434753, + "learning_rate": 1.8129595346960568e-05, + "loss": 0.141, + "num_input_tokens_seen": 83343856, + "step": 38605 + }, + { + "epoch": 6.298531810766721, + "grad_norm": 0.5078598260879517, + "learning_rate": 1.812275170847583e-05, + "loss": 0.0968, + "num_input_tokens_seen": 83354832, + "step": 38610 + }, + { + "epoch": 6.299347471451876, + "grad_norm": 0.03727874904870987, + "learning_rate": 1.8115908627495742e-05, + "loss": 0.158, + "num_input_tokens_seen": 83365488, + "step": 38615 + }, + { + "epoch": 6.300163132137031, + "grad_norm": 1.4347292184829712, + "learning_rate": 1.8109066104575023e-05, + "loss": 0.1746, + "num_input_tokens_seen": 83376688, + "step": 38620 + }, + { + "epoch": 6.300978792822186, + "grad_norm": 1.9739798307418823, + "learning_rate": 1.810222414026837e-05, + "loss": 0.1311, + "num_input_tokens_seen": 83387056, + "step": 38625 + }, + { + "epoch": 6.301794453507341, + "grad_norm": 0.9054266214370728, + "learning_rate": 1.809538273513043e-05, + "loss": 0.1492, + "num_input_tokens_seen": 83398224, + "step": 38630 + }, + { + "epoch": 6.302610114192496, + "grad_norm": 0.36991485953330994, + "learning_rate": 1.8088541889715795e-05, + "loss": 0.1258, + "num_input_tokens_seen": 83408880, + "step": 38635 + }, + { + "epoch": 6.303425774877651, + "grad_norm": 0.27158093452453613, + "learning_rate": 1.8081701604579025e-05, + "loss": 0.1957, + "num_input_tokens_seen": 83419504, + "step": 38640 + }, + { + "epoch": 6.304241435562806, + "grad_norm": 0.271409273147583, + "learning_rate": 1.807486188027463e-05, + "loss": 0.0647, + "num_input_tokens_seen": 83429296, + "step": 38645 + }, + { + "epoch": 6.30505709624796, + "grad_norm": 0.032146718353033066, + "learning_rate": 1.8068022717357066e-05, + "loss": 0.0428, + "num_input_tokens_seen": 83440912, + "step": 38650 + }, + { + "epoch": 6.305872756933116, + "grad_norm": 0.24195373058319092, + "learning_rate": 1.8061184116380754e-05, + "loss": 0.0989, + "num_input_tokens_seen": 83451568, + "step": 38655 + }, + { + "epoch": 6.306688417618271, + "grad_norm": 0.1702585071325302, + "learning_rate": 1.8054346077900065e-05, + "loss": 0.0413, + "num_input_tokens_seen": 83461200, + "step": 38660 + }, + { + "epoch": 6.307504078303426, + "grad_norm": 0.11178510636091232, + "learning_rate": 1.8047508602469322e-05, + "loss": 0.0563, + "num_input_tokens_seen": 83472016, + "step": 38665 + }, + { + "epoch": 6.308319738988581, + "grad_norm": 0.3890901803970337, + "learning_rate": 1.804067169064281e-05, + "loss": 0.1049, + "num_input_tokens_seen": 83482896, + "step": 38670 + }, + { + "epoch": 6.309135399673735, + "grad_norm": 2.7426509857177734, + "learning_rate": 1.8033835342974763e-05, + "loss": 0.3271, + "num_input_tokens_seen": 83494384, + "step": 38675 + }, + { + "epoch": 6.309951060358891, + "grad_norm": 0.4880353510379791, + "learning_rate": 1.8026999560019366e-05, + "loss": 0.0452, + "num_input_tokens_seen": 83504848, + "step": 38680 + }, + { + "epoch": 6.310766721044046, + "grad_norm": 0.06776443868875504, + "learning_rate": 1.8020164342330763e-05, + "loss": 0.1092, + "num_input_tokens_seen": 83516208, + "step": 38685 + }, + { + "epoch": 6.311582381729201, + "grad_norm": 2.9177005290985107, + "learning_rate": 1.8013329690463056e-05, + "loss": 0.1441, + "num_input_tokens_seen": 83527056, + "step": 38690 + }, + { + "epoch": 6.3123980424143555, + "grad_norm": 0.9228275418281555, + "learning_rate": 1.8006495604970295e-05, + "loss": 0.1542, + "num_input_tokens_seen": 83537392, + "step": 38695 + }, + { + "epoch": 6.31321370309951, + "grad_norm": 0.1532001495361328, + "learning_rate": 1.7999662086406484e-05, + "loss": 0.1042, + "num_input_tokens_seen": 83548944, + "step": 38700 + }, + { + "epoch": 6.314029363784665, + "grad_norm": 0.4222249686717987, + "learning_rate": 1.799282913532559e-05, + "loss": 0.0597, + "num_input_tokens_seen": 83558832, + "step": 38705 + }, + { + "epoch": 6.314845024469821, + "grad_norm": 0.6128891706466675, + "learning_rate": 1.798599675228151e-05, + "loss": 0.0974, + "num_input_tokens_seen": 83567696, + "step": 38710 + }, + { + "epoch": 6.315660685154976, + "grad_norm": 0.05832507088780403, + "learning_rate": 1.7979164937828127e-05, + "loss": 0.0571, + "num_input_tokens_seen": 83578480, + "step": 38715 + }, + { + "epoch": 6.3164763458401305, + "grad_norm": 0.04364491254091263, + "learning_rate": 1.797233369251926e-05, + "loss": 0.1401, + "num_input_tokens_seen": 83588496, + "step": 38720 + }, + { + "epoch": 6.317292006525285, + "grad_norm": 0.30718228220939636, + "learning_rate": 1.796550301690868e-05, + "loss": 0.0815, + "num_input_tokens_seen": 83599376, + "step": 38725 + }, + { + "epoch": 6.31810766721044, + "grad_norm": 2.1075313091278076, + "learning_rate": 1.7958672911550117e-05, + "loss": 0.1334, + "num_input_tokens_seen": 83610032, + "step": 38730 + }, + { + "epoch": 6.318923327895595, + "grad_norm": 0.7151750326156616, + "learning_rate": 1.7951843376997256e-05, + "loss": 0.0311, + "num_input_tokens_seen": 83621392, + "step": 38735 + }, + { + "epoch": 6.319738988580751, + "grad_norm": 0.43862852454185486, + "learning_rate": 1.7945014413803737e-05, + "loss": 0.0552, + "num_input_tokens_seen": 83632368, + "step": 38740 + }, + { + "epoch": 6.3205546492659055, + "grad_norm": 0.3198293149471283, + "learning_rate": 1.7938186022523144e-05, + "loss": 0.0226, + "num_input_tokens_seen": 83642800, + "step": 38745 + }, + { + "epoch": 6.32137030995106, + "grad_norm": 1.1280794143676758, + "learning_rate": 1.793135820370902e-05, + "loss": 0.1614, + "num_input_tokens_seen": 83653552, + "step": 38750 + }, + { + "epoch": 6.322185970636215, + "grad_norm": 0.8618946671485901, + "learning_rate": 1.792453095791487e-05, + "loss": 0.0883, + "num_input_tokens_seen": 83664624, + "step": 38755 + }, + { + "epoch": 6.32300163132137, + "grad_norm": 0.4377816617488861, + "learning_rate": 1.791770428569414e-05, + "loss": 0.0684, + "num_input_tokens_seen": 83675536, + "step": 38760 + }, + { + "epoch": 6.323817292006526, + "grad_norm": 0.05808074772357941, + "learning_rate": 1.7910878187600232e-05, + "loss": 0.0352, + "num_input_tokens_seen": 83685712, + "step": 38765 + }, + { + "epoch": 6.3246329526916805, + "grad_norm": 0.16077755391597748, + "learning_rate": 1.790405266418651e-05, + "loss": 0.0916, + "num_input_tokens_seen": 83696112, + "step": 38770 + }, + { + "epoch": 6.325448613376835, + "grad_norm": 0.4904874265193939, + "learning_rate": 1.789722771600628e-05, + "loss": 0.0852, + "num_input_tokens_seen": 83706608, + "step": 38775 + }, + { + "epoch": 6.32626427406199, + "grad_norm": 0.13309891521930695, + "learning_rate": 1.789040334361282e-05, + "loss": 0.1597, + "num_input_tokens_seen": 83718288, + "step": 38780 + }, + { + "epoch": 6.327079934747145, + "grad_norm": 0.7118430137634277, + "learning_rate": 1.788357954755933e-05, + "loss": 0.052, + "num_input_tokens_seen": 83729328, + "step": 38785 + }, + { + "epoch": 6.327895595432301, + "grad_norm": 1.4329004287719727, + "learning_rate": 1.7876756328398998e-05, + "loss": 0.2197, + "num_input_tokens_seen": 83740016, + "step": 38790 + }, + { + "epoch": 6.328711256117455, + "grad_norm": 1.6837587356567383, + "learning_rate": 1.7869933686684938e-05, + "loss": 0.196, + "num_input_tokens_seen": 83751120, + "step": 38795 + }, + { + "epoch": 6.32952691680261, + "grad_norm": 0.26476916670799255, + "learning_rate": 1.786311162297022e-05, + "loss": 0.1662, + "num_input_tokens_seen": 83760880, + "step": 38800 + }, + { + "epoch": 6.330342577487765, + "grad_norm": 1.4332151412963867, + "learning_rate": 1.7856290137807893e-05, + "loss": 0.1385, + "num_input_tokens_seen": 83771440, + "step": 38805 + }, + { + "epoch": 6.33115823817292, + "grad_norm": 0.049738720059394836, + "learning_rate": 1.7849469231750936e-05, + "loss": 0.1396, + "num_input_tokens_seen": 83781424, + "step": 38810 + }, + { + "epoch": 6.331973898858075, + "grad_norm": 0.10562161356210709, + "learning_rate": 1.784264890535229e-05, + "loss": 0.1013, + "num_input_tokens_seen": 83792848, + "step": 38815 + }, + { + "epoch": 6.33278955954323, + "grad_norm": 0.6746407151222229, + "learning_rate": 1.7835829159164835e-05, + "loss": 0.1606, + "num_input_tokens_seen": 83803152, + "step": 38820 + }, + { + "epoch": 6.333605220228385, + "grad_norm": 0.34170496463775635, + "learning_rate": 1.7829009993741418e-05, + "loss": 0.0898, + "num_input_tokens_seen": 83814672, + "step": 38825 + }, + { + "epoch": 6.33442088091354, + "grad_norm": 0.4389704167842865, + "learning_rate": 1.782219140963484e-05, + "loss": 0.13, + "num_input_tokens_seen": 83825904, + "step": 38830 + }, + { + "epoch": 6.335236541598695, + "grad_norm": 0.2891416847705841, + "learning_rate": 1.781537340739784e-05, + "loss": 0.0211, + "num_input_tokens_seen": 83837936, + "step": 38835 + }, + { + "epoch": 6.33605220228385, + "grad_norm": 0.7705093622207642, + "learning_rate": 1.780855598758313e-05, + "loss": 0.1485, + "num_input_tokens_seen": 83848944, + "step": 38840 + }, + { + "epoch": 6.3368678629690045, + "grad_norm": 1.4121755361557007, + "learning_rate": 1.7801739150743363e-05, + "loss": 0.1341, + "num_input_tokens_seen": 83861136, + "step": 38845 + }, + { + "epoch": 6.33768352365416, + "grad_norm": 1.3189966678619385, + "learning_rate": 1.7794922897431145e-05, + "loss": 0.1893, + "num_input_tokens_seen": 83873136, + "step": 38850 + }, + { + "epoch": 6.338499184339315, + "grad_norm": 0.288097620010376, + "learning_rate": 1.7788107228199023e-05, + "loss": 0.1367, + "num_input_tokens_seen": 83884496, + "step": 38855 + }, + { + "epoch": 6.33931484502447, + "grad_norm": 0.18899677693843842, + "learning_rate": 1.7781292143599532e-05, + "loss": 0.0211, + "num_input_tokens_seen": 83895664, + "step": 38860 + }, + { + "epoch": 6.340130505709625, + "grad_norm": 1.3749500513076782, + "learning_rate": 1.7774477644185125e-05, + "loss": 0.2076, + "num_input_tokens_seen": 83907664, + "step": 38865 + }, + { + "epoch": 6.3409461663947795, + "grad_norm": 0.19682933390140533, + "learning_rate": 1.7767663730508222e-05, + "loss": 0.0724, + "num_input_tokens_seen": 83918896, + "step": 38870 + }, + { + "epoch": 6.341761827079935, + "grad_norm": 0.8987295031547546, + "learning_rate": 1.7760850403121195e-05, + "loss": 0.1213, + "num_input_tokens_seen": 83929008, + "step": 38875 + }, + { + "epoch": 6.34257748776509, + "grad_norm": 0.11094821244478226, + "learning_rate": 1.7754037662576365e-05, + "loss": 0.0625, + "num_input_tokens_seen": 83937776, + "step": 38880 + }, + { + "epoch": 6.343393148450245, + "grad_norm": 0.19480125606060028, + "learning_rate": 1.7747225509426008e-05, + "loss": 0.0264, + "num_input_tokens_seen": 83948816, + "step": 38885 + }, + { + "epoch": 6.3442088091354, + "grad_norm": 0.6843348145484924, + "learning_rate": 1.774041394422235e-05, + "loss": 0.0812, + "num_input_tokens_seen": 83959440, + "step": 38890 + }, + { + "epoch": 6.3450244698205545, + "grad_norm": 0.3148607313632965, + "learning_rate": 1.7733602967517578e-05, + "loss": 0.0562, + "num_input_tokens_seen": 83971248, + "step": 38895 + }, + { + "epoch": 6.345840130505709, + "grad_norm": 0.09067147225141525, + "learning_rate": 1.772679257986381e-05, + "loss": 0.0554, + "num_input_tokens_seen": 83981584, + "step": 38900 + }, + { + "epoch": 6.346655791190865, + "grad_norm": 0.740907609462738, + "learning_rate": 1.7719982781813135e-05, + "loss": 0.1384, + "num_input_tokens_seen": 83992112, + "step": 38905 + }, + { + "epoch": 6.34747145187602, + "grad_norm": 0.5271180868148804, + "learning_rate": 1.7713173573917603e-05, + "loss": 0.116, + "num_input_tokens_seen": 84003472, + "step": 38910 + }, + { + "epoch": 6.348287112561175, + "grad_norm": 0.1254560351371765, + "learning_rate": 1.7706364956729195e-05, + "loss": 0.0828, + "num_input_tokens_seen": 84013840, + "step": 38915 + }, + { + "epoch": 6.349102773246329, + "grad_norm": 0.019892381504178047, + "learning_rate": 1.769955693079985e-05, + "loss": 0.066, + "num_input_tokens_seen": 84024912, + "step": 38920 + }, + { + "epoch": 6.349918433931484, + "grad_norm": 0.026367055252194405, + "learning_rate": 1.769274949668146e-05, + "loss": 0.1241, + "num_input_tokens_seen": 84035632, + "step": 38925 + }, + { + "epoch": 6.350734094616639, + "grad_norm": 1.248103380203247, + "learning_rate": 1.7685942654925876e-05, + "loss": 0.1246, + "num_input_tokens_seen": 84046448, + "step": 38930 + }, + { + "epoch": 6.351549755301795, + "grad_norm": 0.30926838517189026, + "learning_rate": 1.767913640608489e-05, + "loss": 0.1536, + "num_input_tokens_seen": 84057648, + "step": 38935 + }, + { + "epoch": 6.35236541598695, + "grad_norm": 2.6549155712127686, + "learning_rate": 1.7672330750710247e-05, + "loss": 0.0924, + "num_input_tokens_seen": 84068880, + "step": 38940 + }, + { + "epoch": 6.353181076672104, + "grad_norm": 0.7297632694244385, + "learning_rate": 1.766552568935366e-05, + "loss": 0.0718, + "num_input_tokens_seen": 84079696, + "step": 38945 + }, + { + "epoch": 6.353996737357259, + "grad_norm": 1.9025839567184448, + "learning_rate": 1.7658721222566775e-05, + "loss": 0.2094, + "num_input_tokens_seen": 84089456, + "step": 38950 + }, + { + "epoch": 6.354812398042414, + "grad_norm": 0.22776402533054352, + "learning_rate": 1.76519173509012e-05, + "loss": 0.0371, + "num_input_tokens_seen": 84099440, + "step": 38955 + }, + { + "epoch": 6.35562805872757, + "grad_norm": 0.8956218957901001, + "learning_rate": 1.764511407490848e-05, + "loss": 0.2556, + "num_input_tokens_seen": 84109232, + "step": 38960 + }, + { + "epoch": 6.356443719412725, + "grad_norm": 0.21686340868473053, + "learning_rate": 1.763831139514014e-05, + "loss": 0.1223, + "num_input_tokens_seen": 84121712, + "step": 38965 + }, + { + "epoch": 6.357259380097879, + "grad_norm": 0.2932572662830353, + "learning_rate": 1.7631509312147626e-05, + "loss": 0.0284, + "num_input_tokens_seen": 84132688, + "step": 38970 + }, + { + "epoch": 6.358075040783034, + "grad_norm": 1.5296295881271362, + "learning_rate": 1.7624707826482356e-05, + "loss": 0.0869, + "num_input_tokens_seen": 84142544, + "step": 38975 + }, + { + "epoch": 6.358890701468189, + "grad_norm": 2.387542963027954, + "learning_rate": 1.7617906938695694e-05, + "loss": 0.2713, + "num_input_tokens_seen": 84151312, + "step": 38980 + }, + { + "epoch": 6.359706362153344, + "grad_norm": 0.9537858963012695, + "learning_rate": 1.761110664933895e-05, + "loss": 0.1, + "num_input_tokens_seen": 84162512, + "step": 38985 + }, + { + "epoch": 6.3605220228384995, + "grad_norm": 1.370864748954773, + "learning_rate": 1.760430695896339e-05, + "loss": 0.1138, + "num_input_tokens_seen": 84174480, + "step": 38990 + }, + { + "epoch": 6.361337683523654, + "grad_norm": 0.08421911299228668, + "learning_rate": 1.7597507868120227e-05, + "loss": 0.105, + "num_input_tokens_seen": 84185712, + "step": 38995 + }, + { + "epoch": 6.362153344208809, + "grad_norm": 0.10924197733402252, + "learning_rate": 1.7590709377360648e-05, + "loss": 0.1926, + "num_input_tokens_seen": 84195824, + "step": 39000 + }, + { + "epoch": 6.362969004893964, + "grad_norm": 1.5095335245132446, + "learning_rate": 1.7583911487235753e-05, + "loss": 0.2281, + "num_input_tokens_seen": 84207344, + "step": 39005 + }, + { + "epoch": 6.363784665579119, + "grad_norm": 0.07251433283090591, + "learning_rate": 1.7577114198296623e-05, + "loss": 0.0291, + "num_input_tokens_seen": 84218512, + "step": 39010 + }, + { + "epoch": 6.364600326264274, + "grad_norm": 0.5512619614601135, + "learning_rate": 1.757031751109428e-05, + "loss": 0.0323, + "num_input_tokens_seen": 84229520, + "step": 39015 + }, + { + "epoch": 6.365415986949429, + "grad_norm": 0.033506762236356735, + "learning_rate": 1.75635214261797e-05, + "loss": 0.1579, + "num_input_tokens_seen": 84240752, + "step": 39020 + }, + { + "epoch": 6.366231647634584, + "grad_norm": 0.22897212207317352, + "learning_rate": 1.7556725944103803e-05, + "loss": 0.1296, + "num_input_tokens_seen": 84250224, + "step": 39025 + }, + { + "epoch": 6.367047308319739, + "grad_norm": 0.15073169767856598, + "learning_rate": 1.754993106541747e-05, + "loss": 0.0211, + "num_input_tokens_seen": 84260784, + "step": 39030 + }, + { + "epoch": 6.367862969004894, + "grad_norm": 0.3290570080280304, + "learning_rate": 1.7543136790671524e-05, + "loss": 0.0384, + "num_input_tokens_seen": 84272560, + "step": 39035 + }, + { + "epoch": 6.368678629690049, + "grad_norm": 0.25426191091537476, + "learning_rate": 1.753634312041675e-05, + "loss": 0.0713, + "num_input_tokens_seen": 84282640, + "step": 39040 + }, + { + "epoch": 6.369494290375204, + "grad_norm": 1.0959497690200806, + "learning_rate": 1.752955005520387e-05, + "loss": 0.2223, + "num_input_tokens_seen": 84293936, + "step": 39045 + }, + { + "epoch": 6.370309951060359, + "grad_norm": 0.9064390659332275, + "learning_rate": 1.7522757595583567e-05, + "loss": 0.1869, + "num_input_tokens_seen": 84303376, + "step": 39050 + }, + { + "epoch": 6.371125611745514, + "grad_norm": 1.784211277961731, + "learning_rate": 1.751596574210647e-05, + "loss": 0.1262, + "num_input_tokens_seen": 84313968, + "step": 39055 + }, + { + "epoch": 6.371941272430669, + "grad_norm": 0.24962975084781647, + "learning_rate": 1.750917449532317e-05, + "loss": 0.1425, + "num_input_tokens_seen": 84325520, + "step": 39060 + }, + { + "epoch": 6.372756933115824, + "grad_norm": 0.25909510254859924, + "learning_rate": 1.7502383855784187e-05, + "loss": 0.0388, + "num_input_tokens_seen": 84336496, + "step": 39065 + }, + { + "epoch": 6.373572593800978, + "grad_norm": 0.12299077957868576, + "learning_rate": 1.7495593824040014e-05, + "loss": 0.1181, + "num_input_tokens_seen": 84346928, + "step": 39070 + }, + { + "epoch": 6.374388254486134, + "grad_norm": 0.08700103312730789, + "learning_rate": 1.7488804400641084e-05, + "loss": 0.0597, + "num_input_tokens_seen": 84358032, + "step": 39075 + }, + { + "epoch": 6.375203915171289, + "grad_norm": 0.1160474345088005, + "learning_rate": 1.7482015586137774e-05, + "loss": 0.029, + "num_input_tokens_seen": 84368816, + "step": 39080 + }, + { + "epoch": 6.376019575856444, + "grad_norm": 0.09778963774442673, + "learning_rate": 1.7475227381080434e-05, + "loss": 0.0535, + "num_input_tokens_seen": 84378800, + "step": 39085 + }, + { + "epoch": 6.376835236541599, + "grad_norm": 0.0663478672504425, + "learning_rate": 1.746843978601934e-05, + "loss": 0.0421, + "num_input_tokens_seen": 84389584, + "step": 39090 + }, + { + "epoch": 6.377650897226753, + "grad_norm": 0.34301382303237915, + "learning_rate": 1.746165280150473e-05, + "loss": 0.2222, + "num_input_tokens_seen": 84398768, + "step": 39095 + }, + { + "epoch": 6.378466557911908, + "grad_norm": 0.26669156551361084, + "learning_rate": 1.7454866428086797e-05, + "loss": 0.0283, + "num_input_tokens_seen": 84408464, + "step": 39100 + }, + { + "epoch": 6.379282218597064, + "grad_norm": 0.0919429361820221, + "learning_rate": 1.7448080666315675e-05, + "loss": 0.0309, + "num_input_tokens_seen": 84419600, + "step": 39105 + }, + { + "epoch": 6.380097879282219, + "grad_norm": 0.7204341888427734, + "learning_rate": 1.744129551674145e-05, + "loss": 0.0604, + "num_input_tokens_seen": 84428784, + "step": 39110 + }, + { + "epoch": 6.3809135399673735, + "grad_norm": 0.10078734159469604, + "learning_rate": 1.7434510979914166e-05, + "loss": 0.0551, + "num_input_tokens_seen": 84439696, + "step": 39115 + }, + { + "epoch": 6.381729200652528, + "grad_norm": 0.45009684562683105, + "learning_rate": 1.7427727056383795e-05, + "loss": 0.102, + "num_input_tokens_seen": 84451280, + "step": 39120 + }, + { + "epoch": 6.382544861337683, + "grad_norm": 1.7385809421539307, + "learning_rate": 1.74209437467003e-05, + "loss": 0.2285, + "num_input_tokens_seen": 84462512, + "step": 39125 + }, + { + "epoch": 6.383360522022839, + "grad_norm": 0.16643677651882172, + "learning_rate": 1.7414161051413565e-05, + "loss": 0.0459, + "num_input_tokens_seen": 84473616, + "step": 39130 + }, + { + "epoch": 6.384176182707994, + "grad_norm": 0.6678984761238098, + "learning_rate": 1.740737897107342e-05, + "loss": 0.0998, + "num_input_tokens_seen": 84485776, + "step": 39135 + }, + { + "epoch": 6.3849918433931485, + "grad_norm": 0.22337354719638824, + "learning_rate": 1.7400597506229667e-05, + "loss": 0.0708, + "num_input_tokens_seen": 84498064, + "step": 39140 + }, + { + "epoch": 6.385807504078303, + "grad_norm": 0.05846715718507767, + "learning_rate": 1.739381665743203e-05, + "loss": 0.2574, + "num_input_tokens_seen": 84509296, + "step": 39145 + }, + { + "epoch": 6.386623164763458, + "grad_norm": 1.0920716524124146, + "learning_rate": 1.7387036425230214e-05, + "loss": 0.0505, + "num_input_tokens_seen": 84519888, + "step": 39150 + }, + { + "epoch": 6.387438825448613, + "grad_norm": 0.8707415461540222, + "learning_rate": 1.7380256810173854e-05, + "loss": 0.1776, + "num_input_tokens_seen": 84530352, + "step": 39155 + }, + { + "epoch": 6.388254486133769, + "grad_norm": 0.061034683138132095, + "learning_rate": 1.7373477812812538e-05, + "loss": 0.1765, + "num_input_tokens_seen": 84540912, + "step": 39160 + }, + { + "epoch": 6.3890701468189235, + "grad_norm": 0.841430127620697, + "learning_rate": 1.73666994336958e-05, + "loss": 0.0524, + "num_input_tokens_seen": 84551760, + "step": 39165 + }, + { + "epoch": 6.389885807504078, + "grad_norm": 0.4790148437023163, + "learning_rate": 1.735992167337314e-05, + "loss": 0.0874, + "num_input_tokens_seen": 84562960, + "step": 39170 + }, + { + "epoch": 6.390701468189233, + "grad_norm": 0.8710991740226746, + "learning_rate": 1.7353144532394e-05, + "loss": 0.1482, + "num_input_tokens_seen": 84573264, + "step": 39175 + }, + { + "epoch": 6.391517128874388, + "grad_norm": 1.9767282009124756, + "learning_rate": 1.734636801130776e-05, + "loss": 0.0571, + "num_input_tokens_seen": 84583632, + "step": 39180 + }, + { + "epoch": 6.392332789559543, + "grad_norm": 1.384634256362915, + "learning_rate": 1.7339592110663768e-05, + "loss": 0.3796, + "num_input_tokens_seen": 84594960, + "step": 39185 + }, + { + "epoch": 6.3931484502446985, + "grad_norm": 1.5981922149658203, + "learning_rate": 1.7332816831011307e-05, + "loss": 0.1378, + "num_input_tokens_seen": 84604912, + "step": 39190 + }, + { + "epoch": 6.393964110929853, + "grad_norm": 0.2229405641555786, + "learning_rate": 1.7326042172899616e-05, + "loss": 0.1763, + "num_input_tokens_seen": 84615344, + "step": 39195 + }, + { + "epoch": 6.394779771615008, + "grad_norm": 0.7626932263374329, + "learning_rate": 1.731926813687788e-05, + "loss": 0.0926, + "num_input_tokens_seen": 84625744, + "step": 39200 + }, + { + "epoch": 6.395595432300163, + "grad_norm": 1.6154446601867676, + "learning_rate": 1.7312494723495243e-05, + "loss": 0.0788, + "num_input_tokens_seen": 84636784, + "step": 39205 + }, + { + "epoch": 6.396411092985318, + "grad_norm": 1.0625877380371094, + "learning_rate": 1.730572193330079e-05, + "loss": 0.1346, + "num_input_tokens_seen": 84648656, + "step": 39210 + }, + { + "epoch": 6.397226753670473, + "grad_norm": 0.05313669890165329, + "learning_rate": 1.7298949766843558e-05, + "loss": 0.0628, + "num_input_tokens_seen": 84659728, + "step": 39215 + }, + { + "epoch": 6.398042414355628, + "grad_norm": 2.000444173812866, + "learning_rate": 1.7292178224672528e-05, + "loss": 0.1061, + "num_input_tokens_seen": 84670672, + "step": 39220 + }, + { + "epoch": 6.398858075040783, + "grad_norm": 0.8750671148300171, + "learning_rate": 1.7285407307336636e-05, + "loss": 0.1871, + "num_input_tokens_seen": 84679952, + "step": 39225 + }, + { + "epoch": 6.399673735725938, + "grad_norm": 0.21820072829723358, + "learning_rate": 1.727863701538478e-05, + "loss": 0.0526, + "num_input_tokens_seen": 84691024, + "step": 39230 + }, + { + "epoch": 6.400489396411093, + "grad_norm": 0.6370114684104919, + "learning_rate": 1.7271867349365782e-05, + "loss": 0.172, + "num_input_tokens_seen": 84700976, + "step": 39235 + }, + { + "epoch": 6.401305057096248, + "grad_norm": 0.24081918597221375, + "learning_rate": 1.7265098309828433e-05, + "loss": 0.0291, + "num_input_tokens_seen": 84710896, + "step": 39240 + }, + { + "epoch": 6.402120717781403, + "grad_norm": 0.7086563110351562, + "learning_rate": 1.725832989732146e-05, + "loss": 0.1828, + "num_input_tokens_seen": 84722448, + "step": 39245 + }, + { + "epoch": 6.402936378466558, + "grad_norm": 2.147712469100952, + "learning_rate": 1.725156211239354e-05, + "loss": 0.2575, + "num_input_tokens_seen": 84733168, + "step": 39250 + }, + { + "epoch": 6.403752039151713, + "grad_norm": 1.6323869228363037, + "learning_rate": 1.7244794955593316e-05, + "loss": 0.1344, + "num_input_tokens_seen": 84744176, + "step": 39255 + }, + { + "epoch": 6.404567699836868, + "grad_norm": 0.3564353585243225, + "learning_rate": 1.7238028427469363e-05, + "loss": 0.0236, + "num_input_tokens_seen": 84754704, + "step": 39260 + }, + { + "epoch": 6.4053833605220225, + "grad_norm": 1.7293891906738281, + "learning_rate": 1.7231262528570207e-05, + "loss": 0.1863, + "num_input_tokens_seen": 84765584, + "step": 39265 + }, + { + "epoch": 6.406199021207178, + "grad_norm": 0.8506243824958801, + "learning_rate": 1.7224497259444334e-05, + "loss": 0.2023, + "num_input_tokens_seen": 84776144, + "step": 39270 + }, + { + "epoch": 6.407014681892333, + "grad_norm": 0.08186260610818863, + "learning_rate": 1.7217732620640163e-05, + "loss": 0.1484, + "num_input_tokens_seen": 84785488, + "step": 39275 + }, + { + "epoch": 6.407830342577488, + "grad_norm": 1.464577317237854, + "learning_rate": 1.721096861270607e-05, + "loss": 0.1994, + "num_input_tokens_seen": 84796080, + "step": 39280 + }, + { + "epoch": 6.408646003262643, + "grad_norm": 0.19154199957847595, + "learning_rate": 1.7204205236190385e-05, + "loss": 0.1343, + "num_input_tokens_seen": 84805520, + "step": 39285 + }, + { + "epoch": 6.4094616639477975, + "grad_norm": 0.1287555694580078, + "learning_rate": 1.719744249164138e-05, + "loss": 0.061, + "num_input_tokens_seen": 84816464, + "step": 39290 + }, + { + "epoch": 6.410277324632952, + "grad_norm": 0.6821826696395874, + "learning_rate": 1.7190680379607278e-05, + "loss": 0.1856, + "num_input_tokens_seen": 84826640, + "step": 39295 + }, + { + "epoch": 6.411092985318108, + "grad_norm": 1.2578270435333252, + "learning_rate": 1.718391890063624e-05, + "loss": 0.241, + "num_input_tokens_seen": 84836304, + "step": 39300 + }, + { + "epoch": 6.411908646003263, + "grad_norm": 1.835018277168274, + "learning_rate": 1.7177158055276405e-05, + "loss": 0.0743, + "num_input_tokens_seen": 84847280, + "step": 39305 + }, + { + "epoch": 6.412724306688418, + "grad_norm": 0.18001821637153625, + "learning_rate": 1.717039784407582e-05, + "loss": 0.2227, + "num_input_tokens_seen": 84859696, + "step": 39310 + }, + { + "epoch": 6.4135399673735725, + "grad_norm": 0.3027726113796234, + "learning_rate": 1.7163638267582516e-05, + "loss": 0.0587, + "num_input_tokens_seen": 84870576, + "step": 39315 + }, + { + "epoch": 6.414355628058727, + "grad_norm": 0.04115438088774681, + "learning_rate": 1.715687932634446e-05, + "loss": 0.067, + "num_input_tokens_seen": 84882224, + "step": 39320 + }, + { + "epoch": 6.415171288743883, + "grad_norm": 0.9745197892189026, + "learning_rate": 1.715012102090956e-05, + "loss": 0.2004, + "num_input_tokens_seen": 84892496, + "step": 39325 + }, + { + "epoch": 6.415986949429038, + "grad_norm": 0.6595913171768188, + "learning_rate": 1.714336335182567e-05, + "loss": 0.2464, + "num_input_tokens_seen": 84903440, + "step": 39330 + }, + { + "epoch": 6.416802610114193, + "grad_norm": 0.9623132348060608, + "learning_rate": 1.7136606319640616e-05, + "loss": 0.0943, + "num_input_tokens_seen": 84914608, + "step": 39335 + }, + { + "epoch": 6.417618270799347, + "grad_norm": 0.4644443690776825, + "learning_rate": 1.7129849924902157e-05, + "loss": 0.0544, + "num_input_tokens_seen": 84926768, + "step": 39340 + }, + { + "epoch": 6.418433931484502, + "grad_norm": 1.98554527759552, + "learning_rate": 1.7123094168157994e-05, + "loss": 0.1274, + "num_input_tokens_seen": 84937488, + "step": 39345 + }, + { + "epoch": 6.419249592169657, + "grad_norm": 0.0660291388630867, + "learning_rate": 1.7116339049955788e-05, + "loss": 0.0987, + "num_input_tokens_seen": 84949840, + "step": 39350 + }, + { + "epoch": 6.420065252854813, + "grad_norm": 1.29940664768219, + "learning_rate": 1.7109584570843136e-05, + "loss": 0.2585, + "num_input_tokens_seen": 84959088, + "step": 39355 + }, + { + "epoch": 6.420880913539968, + "grad_norm": 0.09726675599813461, + "learning_rate": 1.7102830731367593e-05, + "loss": 0.0819, + "num_input_tokens_seen": 84969232, + "step": 39360 + }, + { + "epoch": 6.421696574225122, + "grad_norm": 0.211343452334404, + "learning_rate": 1.7096077532076666e-05, + "loss": 0.1566, + "num_input_tokens_seen": 84981488, + "step": 39365 + }, + { + "epoch": 6.422512234910277, + "grad_norm": 0.6166918277740479, + "learning_rate": 1.7089324973517794e-05, + "loss": 0.1369, + "num_input_tokens_seen": 84992688, + "step": 39370 + }, + { + "epoch": 6.423327895595432, + "grad_norm": 0.09353423863649368, + "learning_rate": 1.708257305623838e-05, + "loss": 0.044, + "num_input_tokens_seen": 85004144, + "step": 39375 + }, + { + "epoch": 6.424143556280587, + "grad_norm": 1.5935109853744507, + "learning_rate": 1.7075821780785766e-05, + "loss": 0.1229, + "num_input_tokens_seen": 85015408, + "step": 39380 + }, + { + "epoch": 6.424959216965743, + "grad_norm": 0.4252547025680542, + "learning_rate": 1.7069071147707248e-05, + "loss": 0.0474, + "num_input_tokens_seen": 85026064, + "step": 39385 + }, + { + "epoch": 6.425774877650897, + "grad_norm": 0.39797234535217285, + "learning_rate": 1.706232115755006e-05, + "loss": 0.1141, + "num_input_tokens_seen": 85037232, + "step": 39390 + }, + { + "epoch": 6.426590538336052, + "grad_norm": 2.1355817317962646, + "learning_rate": 1.705557181086139e-05, + "loss": 0.1814, + "num_input_tokens_seen": 85047888, + "step": 39395 + }, + { + "epoch": 6.427406199021207, + "grad_norm": 0.40882670879364014, + "learning_rate": 1.704882310818839e-05, + "loss": 0.1243, + "num_input_tokens_seen": 85058768, + "step": 39400 + }, + { + "epoch": 6.428221859706362, + "grad_norm": 1.0569075345993042, + "learning_rate": 1.704207505007813e-05, + "loss": 0.1068, + "num_input_tokens_seen": 85069296, + "step": 39405 + }, + { + "epoch": 6.4290375203915175, + "grad_norm": 0.2341408133506775, + "learning_rate": 1.703532763707764e-05, + "loss": 0.1125, + "num_input_tokens_seen": 85080528, + "step": 39410 + }, + { + "epoch": 6.429853181076672, + "grad_norm": 0.060390837490558624, + "learning_rate": 1.7028580869733905e-05, + "loss": 0.137, + "num_input_tokens_seen": 85091024, + "step": 39415 + }, + { + "epoch": 6.430668841761827, + "grad_norm": 0.7260276079177856, + "learning_rate": 1.702183474859385e-05, + "loss": 0.0824, + "num_input_tokens_seen": 85100656, + "step": 39420 + }, + { + "epoch": 6.431484502446982, + "grad_norm": 0.27556073665618896, + "learning_rate": 1.7015089274204354e-05, + "loss": 0.0335, + "num_input_tokens_seen": 85111856, + "step": 39425 + }, + { + "epoch": 6.432300163132137, + "grad_norm": 0.5753117799758911, + "learning_rate": 1.7008344447112238e-05, + "loss": 0.1584, + "num_input_tokens_seen": 85120944, + "step": 39430 + }, + { + "epoch": 6.433115823817292, + "grad_norm": 0.8795137405395508, + "learning_rate": 1.7001600267864266e-05, + "loss": 0.09, + "num_input_tokens_seen": 85131440, + "step": 39435 + }, + { + "epoch": 6.433931484502447, + "grad_norm": 0.30175086855888367, + "learning_rate": 1.6994856737007154e-05, + "loss": 0.1106, + "num_input_tokens_seen": 85140720, + "step": 39440 + }, + { + "epoch": 6.434747145187602, + "grad_norm": 0.8620628714561462, + "learning_rate": 1.698811385508758e-05, + "loss": 0.2266, + "num_input_tokens_seen": 85151152, + "step": 39445 + }, + { + "epoch": 6.435562805872757, + "grad_norm": 1.1206591129302979, + "learning_rate": 1.698137162265215e-05, + "loss": 0.069, + "num_input_tokens_seen": 85161392, + "step": 39450 + }, + { + "epoch": 6.436378466557912, + "grad_norm": 0.7689232230186462, + "learning_rate": 1.6974630040247425e-05, + "loss": 0.1229, + "num_input_tokens_seen": 85172656, + "step": 39455 + }, + { + "epoch": 6.437194127243067, + "grad_norm": 0.17004242539405823, + "learning_rate": 1.6967889108419903e-05, + "loss": 0.0601, + "num_input_tokens_seen": 85184784, + "step": 39460 + }, + { + "epoch": 6.438009787928221, + "grad_norm": 2.0586283206939697, + "learning_rate": 1.696114882771605e-05, + "loss": 0.1185, + "num_input_tokens_seen": 85195248, + "step": 39465 + }, + { + "epoch": 6.438825448613377, + "grad_norm": 0.4549744725227356, + "learning_rate": 1.695440919868226e-05, + "loss": 0.0925, + "num_input_tokens_seen": 85205040, + "step": 39470 + }, + { + "epoch": 6.439641109298532, + "grad_norm": 0.3905419409275055, + "learning_rate": 1.694767022186488e-05, + "loss": 0.0494, + "num_input_tokens_seen": 85215344, + "step": 39475 + }, + { + "epoch": 6.440456769983687, + "grad_norm": 3.3006391525268555, + "learning_rate": 1.6940931897810208e-05, + "loss": 0.1731, + "num_input_tokens_seen": 85226352, + "step": 39480 + }, + { + "epoch": 6.441272430668842, + "grad_norm": 0.18463042378425598, + "learning_rate": 1.693419422706449e-05, + "loss": 0.0375, + "num_input_tokens_seen": 85235408, + "step": 39485 + }, + { + "epoch": 6.442088091353996, + "grad_norm": 0.13662466406822205, + "learning_rate": 1.6927457210173915e-05, + "loss": 0.1316, + "num_input_tokens_seen": 85246352, + "step": 39490 + }, + { + "epoch": 6.442903752039152, + "grad_norm": 1.190600872039795, + "learning_rate": 1.6920720847684617e-05, + "loss": 0.1902, + "num_input_tokens_seen": 85256368, + "step": 39495 + }, + { + "epoch": 6.443719412724307, + "grad_norm": 0.05564573407173157, + "learning_rate": 1.6913985140142682e-05, + "loss": 0.0478, + "num_input_tokens_seen": 85267856, + "step": 39500 + }, + { + "epoch": 6.444535073409462, + "grad_norm": 0.21816283464431763, + "learning_rate": 1.690725008809414e-05, + "loss": 0.1547, + "num_input_tokens_seen": 85280112, + "step": 39505 + }, + { + "epoch": 6.445350734094617, + "grad_norm": 0.4561656415462494, + "learning_rate": 1.6900515692084966e-05, + "loss": 0.0428, + "num_input_tokens_seen": 85288688, + "step": 39510 + }, + { + "epoch": 6.446166394779771, + "grad_norm": 0.3662914037704468, + "learning_rate": 1.689378195266109e-05, + "loss": 0.3017, + "num_input_tokens_seen": 85299888, + "step": 39515 + }, + { + "epoch": 6.446982055464926, + "grad_norm": 0.09530775994062424, + "learning_rate": 1.6887048870368377e-05, + "loss": 0.0684, + "num_input_tokens_seen": 85310128, + "step": 39520 + }, + { + "epoch": 6.447797716150082, + "grad_norm": 2.0203850269317627, + "learning_rate": 1.688031644575265e-05, + "loss": 0.0859, + "num_input_tokens_seen": 85321072, + "step": 39525 + }, + { + "epoch": 6.448613376835237, + "grad_norm": 0.6739980578422546, + "learning_rate": 1.6873584679359665e-05, + "loss": 0.0982, + "num_input_tokens_seen": 85333008, + "step": 39530 + }, + { + "epoch": 6.4494290375203915, + "grad_norm": 0.7638341784477234, + "learning_rate": 1.686685357173514e-05, + "loss": 0.0795, + "num_input_tokens_seen": 85344048, + "step": 39535 + }, + { + "epoch": 6.450244698205546, + "grad_norm": 0.16217699646949768, + "learning_rate": 1.6860123123424733e-05, + "loss": 0.0869, + "num_input_tokens_seen": 85353136, + "step": 39540 + }, + { + "epoch": 6.451060358890701, + "grad_norm": 1.224043369293213, + "learning_rate": 1.6853393334974044e-05, + "loss": 0.1395, + "num_input_tokens_seen": 85364016, + "step": 39545 + }, + { + "epoch": 6.451876019575856, + "grad_norm": 1.1090847253799438, + "learning_rate": 1.684666420692863e-05, + "loss": 0.1322, + "num_input_tokens_seen": 85374928, + "step": 39550 + }, + { + "epoch": 6.452691680261012, + "grad_norm": 2.0020737648010254, + "learning_rate": 1.6839935739833986e-05, + "loss": 0.1975, + "num_input_tokens_seen": 85383888, + "step": 39555 + }, + { + "epoch": 6.4535073409461665, + "grad_norm": 2.813281297683716, + "learning_rate": 1.683320793423555e-05, + "loss": 0.2243, + "num_input_tokens_seen": 85394736, + "step": 39560 + }, + { + "epoch": 6.454323001631321, + "grad_norm": 0.2916308343410492, + "learning_rate": 1.6826480790678718e-05, + "loss": 0.1744, + "num_input_tokens_seen": 85405232, + "step": 39565 + }, + { + "epoch": 6.455138662316476, + "grad_norm": 0.4870706796646118, + "learning_rate": 1.681975430970883e-05, + "loss": 0.1384, + "num_input_tokens_seen": 85416176, + "step": 39570 + }, + { + "epoch": 6.455954323001631, + "grad_norm": 0.7945186495780945, + "learning_rate": 1.681302849187116e-05, + "loss": 0.1, + "num_input_tokens_seen": 85426800, + "step": 39575 + }, + { + "epoch": 6.456769983686787, + "grad_norm": 0.7024726271629333, + "learning_rate": 1.6806303337710942e-05, + "loss": 0.0915, + "num_input_tokens_seen": 85437520, + "step": 39580 + }, + { + "epoch": 6.4575856443719415, + "grad_norm": 0.7683566808700562, + "learning_rate": 1.679957884777335e-05, + "loss": 0.1136, + "num_input_tokens_seen": 85447888, + "step": 39585 + }, + { + "epoch": 6.458401305057096, + "grad_norm": 0.03337701037526131, + "learning_rate": 1.6792855022603508e-05, + "loss": 0.0733, + "num_input_tokens_seen": 85459344, + "step": 39590 + }, + { + "epoch": 6.459216965742251, + "grad_norm": 0.4433768093585968, + "learning_rate": 1.678613186274648e-05, + "loss": 0.1851, + "num_input_tokens_seen": 85469744, + "step": 39595 + }, + { + "epoch": 6.460032626427406, + "grad_norm": 0.23127511143684387, + "learning_rate": 1.6779409368747274e-05, + "loss": 0.032, + "num_input_tokens_seen": 85481168, + "step": 39600 + }, + { + "epoch": 6.460848287112561, + "grad_norm": 0.11435655504465103, + "learning_rate": 1.677268754115086e-05, + "loss": 0.0642, + "num_input_tokens_seen": 85492976, + "step": 39605 + }, + { + "epoch": 6.4616639477977165, + "grad_norm": 0.3060847818851471, + "learning_rate": 1.676596638050214e-05, + "loss": 0.1188, + "num_input_tokens_seen": 85502896, + "step": 39610 + }, + { + "epoch": 6.462479608482871, + "grad_norm": 2.9347920417785645, + "learning_rate": 1.6759245887345966e-05, + "loss": 0.3007, + "num_input_tokens_seen": 85513680, + "step": 39615 + }, + { + "epoch": 6.463295269168026, + "grad_norm": 0.1197676807641983, + "learning_rate": 1.6752526062227127e-05, + "loss": 0.1854, + "num_input_tokens_seen": 85524784, + "step": 39620 + }, + { + "epoch": 6.464110929853181, + "grad_norm": 1.4622687101364136, + "learning_rate": 1.674580690569037e-05, + "loss": 0.1808, + "num_input_tokens_seen": 85536208, + "step": 39625 + }, + { + "epoch": 6.464926590538336, + "grad_norm": 0.22002846002578735, + "learning_rate": 1.6739088418280395e-05, + "loss": 0.0569, + "num_input_tokens_seen": 85545936, + "step": 39630 + }, + { + "epoch": 6.465742251223491, + "grad_norm": 0.09837743639945984, + "learning_rate": 1.6732370600541823e-05, + "loss": 0.0702, + "num_input_tokens_seen": 85556080, + "step": 39635 + }, + { + "epoch": 6.466557911908646, + "grad_norm": 2.05002760887146, + "learning_rate": 1.6725653453019244e-05, + "loss": 0.206, + "num_input_tokens_seen": 85565840, + "step": 39640 + }, + { + "epoch": 6.467373572593801, + "grad_norm": 0.32461023330688477, + "learning_rate": 1.6718936976257177e-05, + "loss": 0.2114, + "num_input_tokens_seen": 85576912, + "step": 39645 + }, + { + "epoch": 6.468189233278956, + "grad_norm": 0.1558709740638733, + "learning_rate": 1.6712221170800087e-05, + "loss": 0.0893, + "num_input_tokens_seen": 85587760, + "step": 39650 + }, + { + "epoch": 6.469004893964111, + "grad_norm": 0.022685881704092026, + "learning_rate": 1.670550603719241e-05, + "loss": 0.0543, + "num_input_tokens_seen": 85597552, + "step": 39655 + }, + { + "epoch": 6.4698205546492655, + "grad_norm": 1.3334351778030396, + "learning_rate": 1.66987915759785e-05, + "loss": 0.1544, + "num_input_tokens_seen": 85608368, + "step": 39660 + }, + { + "epoch": 6.470636215334421, + "grad_norm": 1.6813585758209229, + "learning_rate": 1.6692077787702666e-05, + "loss": 0.0813, + "num_input_tokens_seen": 85618096, + "step": 39665 + }, + { + "epoch": 6.471451876019576, + "grad_norm": 0.26489657163619995, + "learning_rate": 1.6685364672909163e-05, + "loss": 0.0478, + "num_input_tokens_seen": 85628656, + "step": 39670 + }, + { + "epoch": 6.472267536704731, + "grad_norm": 0.6976056098937988, + "learning_rate": 1.6678652232142185e-05, + "loss": 0.0502, + "num_input_tokens_seen": 85639632, + "step": 39675 + }, + { + "epoch": 6.473083197389886, + "grad_norm": 0.13981172442436218, + "learning_rate": 1.667194046594588e-05, + "loss": 0.1528, + "num_input_tokens_seen": 85650832, + "step": 39680 + }, + { + "epoch": 6.4738988580750405, + "grad_norm": 0.24093325436115265, + "learning_rate": 1.666522937486433e-05, + "loss": 0.0732, + "num_input_tokens_seen": 85661296, + "step": 39685 + }, + { + "epoch": 6.474714518760196, + "grad_norm": 1.356424331665039, + "learning_rate": 1.6658518959441584e-05, + "loss": 0.0987, + "num_input_tokens_seen": 85672368, + "step": 39690 + }, + { + "epoch": 6.475530179445351, + "grad_norm": 0.06606244295835495, + "learning_rate": 1.6651809220221614e-05, + "loss": 0.067, + "num_input_tokens_seen": 85682160, + "step": 39695 + }, + { + "epoch": 6.476345840130506, + "grad_norm": 0.5908883810043335, + "learning_rate": 1.664510015774835e-05, + "loss": 0.0542, + "num_input_tokens_seen": 85692656, + "step": 39700 + }, + { + "epoch": 6.477161500815661, + "grad_norm": 2.393296241760254, + "learning_rate": 1.6638391772565658e-05, + "loss": 0.1411, + "num_input_tokens_seen": 85703600, + "step": 39705 + }, + { + "epoch": 6.4779771615008155, + "grad_norm": 0.9064767956733704, + "learning_rate": 1.6631684065217344e-05, + "loss": 0.111, + "num_input_tokens_seen": 85714512, + "step": 39710 + }, + { + "epoch": 6.47879282218597, + "grad_norm": 0.6285703182220459, + "learning_rate": 1.662497703624719e-05, + "loss": 0.197, + "num_input_tokens_seen": 85723888, + "step": 39715 + }, + { + "epoch": 6.479608482871126, + "grad_norm": 0.6937698721885681, + "learning_rate": 1.6618270686198895e-05, + "loss": 0.055, + "num_input_tokens_seen": 85734992, + "step": 39720 + }, + { + "epoch": 6.480424143556281, + "grad_norm": 0.4674282968044281, + "learning_rate": 1.6611565015616106e-05, + "loss": 0.0445, + "num_input_tokens_seen": 85746768, + "step": 39725 + }, + { + "epoch": 6.481239804241436, + "grad_norm": 0.40033429861068726, + "learning_rate": 1.6604860025042412e-05, + "loss": 0.0591, + "num_input_tokens_seen": 85758352, + "step": 39730 + }, + { + "epoch": 6.4820554649265905, + "grad_norm": 0.1271461695432663, + "learning_rate": 1.6598155715021368e-05, + "loss": 0.0312, + "num_input_tokens_seen": 85769424, + "step": 39735 + }, + { + "epoch": 6.482871125611745, + "grad_norm": 0.9505890011787415, + "learning_rate": 1.6591452086096448e-05, + "loss": 0.1908, + "num_input_tokens_seen": 85780816, + "step": 39740 + }, + { + "epoch": 6.4836867862969, + "grad_norm": 0.05427936092019081, + "learning_rate": 1.658474913881109e-05, + "loss": 0.2221, + "num_input_tokens_seen": 85792528, + "step": 39745 + }, + { + "epoch": 6.484502446982056, + "grad_norm": 0.09231724590063095, + "learning_rate": 1.6578046873708663e-05, + "loss": 0.0501, + "num_input_tokens_seen": 85802448, + "step": 39750 + }, + { + "epoch": 6.485318107667211, + "grad_norm": 0.47154390811920166, + "learning_rate": 1.657134529133248e-05, + "loss": 0.0402, + "num_input_tokens_seen": 85813104, + "step": 39755 + }, + { + "epoch": 6.486133768352365, + "grad_norm": 0.2084825187921524, + "learning_rate": 1.6564644392225824e-05, + "loss": 0.176, + "num_input_tokens_seen": 85823248, + "step": 39760 + }, + { + "epoch": 6.48694942903752, + "grad_norm": 0.06642814725637436, + "learning_rate": 1.6557944176931894e-05, + "loss": 0.3232, + "num_input_tokens_seen": 85834384, + "step": 39765 + }, + { + "epoch": 6.487765089722675, + "grad_norm": 0.9170184135437012, + "learning_rate": 1.6551244645993847e-05, + "loss": 0.1346, + "num_input_tokens_seen": 85844848, + "step": 39770 + }, + { + "epoch": 6.488580750407831, + "grad_norm": 1.7533137798309326, + "learning_rate": 1.654454579995477e-05, + "loss": 0.235, + "num_input_tokens_seen": 85856528, + "step": 39775 + }, + { + "epoch": 6.489396411092986, + "grad_norm": 0.7487239837646484, + "learning_rate": 1.653784763935772e-05, + "loss": 0.0223, + "num_input_tokens_seen": 85868112, + "step": 39780 + }, + { + "epoch": 6.49021207177814, + "grad_norm": 0.34897640347480774, + "learning_rate": 1.6531150164745674e-05, + "loss": 0.1163, + "num_input_tokens_seen": 85878256, + "step": 39785 + }, + { + "epoch": 6.491027732463295, + "grad_norm": 0.05499323457479477, + "learning_rate": 1.6524453376661568e-05, + "loss": 0.0361, + "num_input_tokens_seen": 85887440, + "step": 39790 + }, + { + "epoch": 6.49184339314845, + "grad_norm": 1.569040298461914, + "learning_rate": 1.6517757275648267e-05, + "loss": 0.0935, + "num_input_tokens_seen": 85897872, + "step": 39795 + }, + { + "epoch": 6.492659053833605, + "grad_norm": 0.12530091404914856, + "learning_rate": 1.6511061862248605e-05, + "loss": 0.0758, + "num_input_tokens_seen": 85909776, + "step": 39800 + }, + { + "epoch": 6.493474714518761, + "grad_norm": 1.476891040802002, + "learning_rate": 1.6504367137005344e-05, + "loss": 0.2571, + "num_input_tokens_seen": 85919984, + "step": 39805 + }, + { + "epoch": 6.494290375203915, + "grad_norm": 0.3149351179599762, + "learning_rate": 1.649767310046119e-05, + "loss": 0.0617, + "num_input_tokens_seen": 85931632, + "step": 39810 + }, + { + "epoch": 6.49510603588907, + "grad_norm": 0.43069615960121155, + "learning_rate": 1.649097975315879e-05, + "loss": 0.0435, + "num_input_tokens_seen": 85944176, + "step": 39815 + }, + { + "epoch": 6.495921696574225, + "grad_norm": 0.2850281000137329, + "learning_rate": 1.648428709564075e-05, + "loss": 0.0388, + "num_input_tokens_seen": 85956016, + "step": 39820 + }, + { + "epoch": 6.49673735725938, + "grad_norm": 0.38005152344703674, + "learning_rate": 1.6477595128449605e-05, + "loss": 0.0803, + "num_input_tokens_seen": 85966704, + "step": 39825 + }, + { + "epoch": 6.497553017944535, + "grad_norm": 0.032065846025943756, + "learning_rate": 1.647090385212784e-05, + "loss": 0.2371, + "num_input_tokens_seen": 85976048, + "step": 39830 + }, + { + "epoch": 6.49836867862969, + "grad_norm": 0.44354167580604553, + "learning_rate": 1.6464213267217888e-05, + "loss": 0.156, + "num_input_tokens_seen": 85985424, + "step": 39835 + }, + { + "epoch": 6.499184339314845, + "grad_norm": 0.11801769584417343, + "learning_rate": 1.6457523374262117e-05, + "loss": 0.1064, + "num_input_tokens_seen": 85996976, + "step": 39840 + }, + { + "epoch": 6.5, + "grad_norm": 0.8773611187934875, + "learning_rate": 1.645083417380284e-05, + "loss": 0.0616, + "num_input_tokens_seen": 86008784, + "step": 39845 + }, + { + "epoch": 6.5, + "eval_loss": 0.14059261977672577, + "eval_runtime": 131.8581, + "eval_samples_per_second": 20.666, + "eval_steps_per_second": 5.172, + "num_input_tokens_seen": 86008784, + "step": 39845 + }, + { + "epoch": 6.500815660685155, + "grad_norm": 0.9975789785385132, + "learning_rate": 1.644414566638233e-05, + "loss": 0.1614, + "num_input_tokens_seen": 86019568, + "step": 39850 + }, + { + "epoch": 6.50163132137031, + "grad_norm": 0.34793180227279663, + "learning_rate": 1.643745785254278e-05, + "loss": 0.0524, + "num_input_tokens_seen": 86030448, + "step": 39855 + }, + { + "epoch": 6.502446982055465, + "grad_norm": 1.4131386280059814, + "learning_rate": 1.6430770732826346e-05, + "loss": 0.1947, + "num_input_tokens_seen": 86041040, + "step": 39860 + }, + { + "epoch": 6.50326264274062, + "grad_norm": 0.3387939929962158, + "learning_rate": 1.6424084307775107e-05, + "loss": 0.0201, + "num_input_tokens_seen": 86052624, + "step": 39865 + }, + { + "epoch": 6.504078303425775, + "grad_norm": 0.040602561086416245, + "learning_rate": 1.6417398577931116e-05, + "loss": 0.1266, + "num_input_tokens_seen": 86063472, + "step": 39870 + }, + { + "epoch": 6.50489396411093, + "grad_norm": 0.0992257297039032, + "learning_rate": 1.6410713543836342e-05, + "loss": 0.0546, + "num_input_tokens_seen": 86074320, + "step": 39875 + }, + { + "epoch": 6.505709624796085, + "grad_norm": 0.16317354142665863, + "learning_rate": 1.6404029206032708e-05, + "loss": 0.1733, + "num_input_tokens_seen": 86086320, + "step": 39880 + }, + { + "epoch": 6.506525285481239, + "grad_norm": 0.047430653125047684, + "learning_rate": 1.6397345565062082e-05, + "loss": 0.1836, + "num_input_tokens_seen": 86098000, + "step": 39885 + }, + { + "epoch": 6.507340946166395, + "grad_norm": 0.16042454540729523, + "learning_rate": 1.639066262146628e-05, + "loss": 0.0912, + "num_input_tokens_seen": 86107088, + "step": 39890 + }, + { + "epoch": 6.50815660685155, + "grad_norm": 0.012026172131299973, + "learning_rate": 1.6383980375787044e-05, + "loss": 0.2195, + "num_input_tokens_seen": 86118096, + "step": 39895 + }, + { + "epoch": 6.508972267536705, + "grad_norm": 1.868530035018921, + "learning_rate": 1.637729882856608e-05, + "loss": 0.1067, + "num_input_tokens_seen": 86129136, + "step": 39900 + }, + { + "epoch": 6.50978792822186, + "grad_norm": 1.8572497367858887, + "learning_rate": 1.6370617980345022e-05, + "loss": 0.2045, + "num_input_tokens_seen": 86140816, + "step": 39905 + }, + { + "epoch": 6.510603588907014, + "grad_norm": 0.1910811960697174, + "learning_rate": 1.6363937831665458e-05, + "loss": 0.0681, + "num_input_tokens_seen": 86151664, + "step": 39910 + }, + { + "epoch": 6.511419249592169, + "grad_norm": 0.2810184061527252, + "learning_rate": 1.635725838306891e-05, + "loss": 0.0873, + "num_input_tokens_seen": 86161744, + "step": 39915 + }, + { + "epoch": 6.512234910277325, + "grad_norm": 1.0128904581069946, + "learning_rate": 1.6350579635096852e-05, + "loss": 0.1009, + "num_input_tokens_seen": 86171312, + "step": 39920 + }, + { + "epoch": 6.51305057096248, + "grad_norm": 0.18553489446640015, + "learning_rate": 1.6343901588290695e-05, + "loss": 0.0695, + "num_input_tokens_seen": 86181392, + "step": 39925 + }, + { + "epoch": 6.513866231647635, + "grad_norm": 0.630767285823822, + "learning_rate": 1.633722424319179e-05, + "loss": 0.0891, + "num_input_tokens_seen": 86191728, + "step": 39930 + }, + { + "epoch": 6.514681892332789, + "grad_norm": 0.09296154230833054, + "learning_rate": 1.633054760034145e-05, + "loss": 0.0211, + "num_input_tokens_seen": 86201712, + "step": 39935 + }, + { + "epoch": 6.515497553017944, + "grad_norm": 0.2633383572101593, + "learning_rate": 1.6323871660280904e-05, + "loss": 0.2425, + "num_input_tokens_seen": 86212464, + "step": 39940 + }, + { + "epoch": 6.5163132137031, + "grad_norm": 1.0235114097595215, + "learning_rate": 1.6317196423551347e-05, + "loss": 0.1311, + "num_input_tokens_seen": 86224208, + "step": 39945 + }, + { + "epoch": 6.517128874388255, + "grad_norm": 1.4633392095565796, + "learning_rate": 1.6310521890693904e-05, + "loss": 0.1526, + "num_input_tokens_seen": 86234544, + "step": 39950 + }, + { + "epoch": 6.5179445350734095, + "grad_norm": 0.9319020509719849, + "learning_rate": 1.6303848062249643e-05, + "loss": 0.157, + "num_input_tokens_seen": 86245776, + "step": 39955 + }, + { + "epoch": 6.518760195758564, + "grad_norm": 0.1220674142241478, + "learning_rate": 1.6297174938759584e-05, + "loss": 0.1583, + "num_input_tokens_seen": 86256976, + "step": 39960 + }, + { + "epoch": 6.519575856443719, + "grad_norm": 1.5552667379379272, + "learning_rate": 1.6290502520764687e-05, + "loss": 0.2868, + "num_input_tokens_seen": 86267568, + "step": 39965 + }, + { + "epoch": 6.520391517128875, + "grad_norm": 0.7207322120666504, + "learning_rate": 1.6283830808805832e-05, + "loss": 0.0478, + "num_input_tokens_seen": 86278768, + "step": 39970 + }, + { + "epoch": 6.52120717781403, + "grad_norm": 0.5574894547462463, + "learning_rate": 1.6277159803423888e-05, + "loss": 0.057, + "num_input_tokens_seen": 86288080, + "step": 39975 + }, + { + "epoch": 6.5220228384991845, + "grad_norm": 0.46290844678878784, + "learning_rate": 1.627048950515963e-05, + "loss": 0.0616, + "num_input_tokens_seen": 86299120, + "step": 39980 + }, + { + "epoch": 6.522838499184339, + "grad_norm": 0.21465997397899628, + "learning_rate": 1.6263819914553786e-05, + "loss": 0.0631, + "num_input_tokens_seen": 86311056, + "step": 39985 + }, + { + "epoch": 6.523654159869494, + "grad_norm": 2.3563995361328125, + "learning_rate": 1.6257151032147028e-05, + "loss": 0.2337, + "num_input_tokens_seen": 86321904, + "step": 39990 + }, + { + "epoch": 6.524469820554649, + "grad_norm": 0.1100926548242569, + "learning_rate": 1.6250482858479964e-05, + "loss": 0.2145, + "num_input_tokens_seen": 86332880, + "step": 39995 + }, + { + "epoch": 6.525285481239804, + "grad_norm": 0.39294660091400146, + "learning_rate": 1.624381539409316e-05, + "loss": 0.051, + "num_input_tokens_seen": 86344080, + "step": 40000 + }, + { + "epoch": 6.5261011419249595, + "grad_norm": 0.463060200214386, + "learning_rate": 1.6237148639527106e-05, + "loss": 0.1021, + "num_input_tokens_seen": 86356240, + "step": 40005 + }, + { + "epoch": 6.526916802610114, + "grad_norm": 0.7206583619117737, + "learning_rate": 1.6230482595322244e-05, + "loss": 0.0667, + "num_input_tokens_seen": 86367824, + "step": 40010 + }, + { + "epoch": 6.527732463295269, + "grad_norm": 0.6142520308494568, + "learning_rate": 1.6223817262018958e-05, + "loss": 0.1908, + "num_input_tokens_seen": 86379632, + "step": 40015 + }, + { + "epoch": 6.528548123980424, + "grad_norm": 0.15992958843708038, + "learning_rate": 1.6217152640157577e-05, + "loss": 0.1277, + "num_input_tokens_seen": 86391248, + "step": 40020 + }, + { + "epoch": 6.529363784665579, + "grad_norm": 0.1806604266166687, + "learning_rate": 1.621048873027836e-05, + "loss": 0.1574, + "num_input_tokens_seen": 86401104, + "step": 40025 + }, + { + "epoch": 6.5301794453507345, + "grad_norm": 1.5267677307128906, + "learning_rate": 1.6203825532921533e-05, + "loss": 0.2308, + "num_input_tokens_seen": 86412528, + "step": 40030 + }, + { + "epoch": 6.530995106035889, + "grad_norm": 0.1579625904560089, + "learning_rate": 1.6197163048627237e-05, + "loss": 0.1437, + "num_input_tokens_seen": 86422800, + "step": 40035 + }, + { + "epoch": 6.531810766721044, + "grad_norm": 0.08387274295091629, + "learning_rate": 1.619050127793557e-05, + "loss": 0.0299, + "num_input_tokens_seen": 86434448, + "step": 40040 + }, + { + "epoch": 6.532626427406199, + "grad_norm": 0.24747316539287567, + "learning_rate": 1.6183840221386567e-05, + "loss": 0.0712, + "num_input_tokens_seen": 86447312, + "step": 40045 + }, + { + "epoch": 6.533442088091354, + "grad_norm": 0.9293643236160278, + "learning_rate": 1.617717987952021e-05, + "loss": 0.0881, + "num_input_tokens_seen": 86458800, + "step": 40050 + }, + { + "epoch": 6.5342577487765094, + "grad_norm": 0.974072277545929, + "learning_rate": 1.6170520252876416e-05, + "loss": 0.0763, + "num_input_tokens_seen": 86470928, + "step": 40055 + }, + { + "epoch": 6.535073409461664, + "grad_norm": 0.1730601042509079, + "learning_rate": 1.616386134199505e-05, + "loss": 0.1214, + "num_input_tokens_seen": 86481520, + "step": 40060 + }, + { + "epoch": 6.535889070146819, + "grad_norm": 0.3764227032661438, + "learning_rate": 1.6157203147415923e-05, + "loss": 0.091, + "num_input_tokens_seen": 86492080, + "step": 40065 + }, + { + "epoch": 6.536704730831974, + "grad_norm": 0.062112219631671906, + "learning_rate": 1.6150545669678773e-05, + "loss": 0.0594, + "num_input_tokens_seen": 86503280, + "step": 40070 + }, + { + "epoch": 6.537520391517129, + "grad_norm": 2.2749154567718506, + "learning_rate": 1.6143888909323286e-05, + "loss": 0.1445, + "num_input_tokens_seen": 86513808, + "step": 40075 + }, + { + "epoch": 6.5383360522022835, + "grad_norm": 0.13816608488559723, + "learning_rate": 1.6137232866889107e-05, + "loss": 0.0665, + "num_input_tokens_seen": 86524944, + "step": 40080 + }, + { + "epoch": 6.539151712887438, + "grad_norm": 0.08039267361164093, + "learning_rate": 1.6130577542915798e-05, + "loss": 0.2869, + "num_input_tokens_seen": 86535824, + "step": 40085 + }, + { + "epoch": 6.539967373572594, + "grad_norm": 1.3747013807296753, + "learning_rate": 1.6123922937942883e-05, + "loss": 0.0851, + "num_input_tokens_seen": 86546992, + "step": 40090 + }, + { + "epoch": 6.540783034257749, + "grad_norm": 0.14117412269115448, + "learning_rate": 1.6117269052509803e-05, + "loss": 0.0582, + "num_input_tokens_seen": 86558352, + "step": 40095 + }, + { + "epoch": 6.541598694942904, + "grad_norm": 0.08306996524333954, + "learning_rate": 1.6110615887155972e-05, + "loss": 0.1117, + "num_input_tokens_seen": 86569424, + "step": 40100 + }, + { + "epoch": 6.5424143556280585, + "grad_norm": 0.3774689733982086, + "learning_rate": 1.6103963442420717e-05, + "loss": 0.1039, + "num_input_tokens_seen": 86579024, + "step": 40105 + }, + { + "epoch": 6.543230016313213, + "grad_norm": 0.07838490605354309, + "learning_rate": 1.6097311718843322e-05, + "loss": 0.1438, + "num_input_tokens_seen": 86590832, + "step": 40110 + }, + { + "epoch": 6.544045676998369, + "grad_norm": 1.1370596885681152, + "learning_rate": 1.6090660716963014e-05, + "loss": 0.0772, + "num_input_tokens_seen": 86602736, + "step": 40115 + }, + { + "epoch": 6.544861337683524, + "grad_norm": 1.0693862438201904, + "learning_rate": 1.608401043731895e-05, + "loss": 0.0512, + "num_input_tokens_seen": 86613712, + "step": 40120 + }, + { + "epoch": 6.545676998368679, + "grad_norm": 0.04522940143942833, + "learning_rate": 1.6077360880450244e-05, + "loss": 0.1811, + "num_input_tokens_seen": 86624112, + "step": 40125 + }, + { + "epoch": 6.5464926590538335, + "grad_norm": 0.03727174550294876, + "learning_rate": 1.6070712046895936e-05, + "loss": 0.0351, + "num_input_tokens_seen": 86634000, + "step": 40130 + }, + { + "epoch": 6.547308319738988, + "grad_norm": 1.8636474609375, + "learning_rate": 1.6064063937195017e-05, + "loss": 0.2191, + "num_input_tokens_seen": 86644816, + "step": 40135 + }, + { + "epoch": 6.548123980424144, + "grad_norm": 1.3826600313186646, + "learning_rate": 1.6057416551886418e-05, + "loss": 0.2815, + "num_input_tokens_seen": 86656496, + "step": 40140 + }, + { + "epoch": 6.548939641109299, + "grad_norm": 0.037934985011816025, + "learning_rate": 1.6050769891509005e-05, + "loss": 0.1172, + "num_input_tokens_seen": 86668400, + "step": 40145 + }, + { + "epoch": 6.549755301794454, + "grad_norm": 0.2202804684638977, + "learning_rate": 1.6044123956601593e-05, + "loss": 0.0208, + "num_input_tokens_seen": 86678832, + "step": 40150 + }, + { + "epoch": 6.5505709624796085, + "grad_norm": 1.6019104719161987, + "learning_rate": 1.6037478747702932e-05, + "loss": 0.1371, + "num_input_tokens_seen": 86689040, + "step": 40155 + }, + { + "epoch": 6.551386623164763, + "grad_norm": 0.07661747932434082, + "learning_rate": 1.6030834265351724e-05, + "loss": 0.2301, + "num_input_tokens_seen": 86699952, + "step": 40160 + }, + { + "epoch": 6.552202283849918, + "grad_norm": 1.3001748323440552, + "learning_rate": 1.602419051008659e-05, + "loss": 0.1135, + "num_input_tokens_seen": 86710320, + "step": 40165 + }, + { + "epoch": 6.553017944535073, + "grad_norm": 0.09937907010316849, + "learning_rate": 1.6017547482446127e-05, + "loss": 0.0509, + "num_input_tokens_seen": 86721648, + "step": 40170 + }, + { + "epoch": 6.553833605220229, + "grad_norm": 0.47360506653785706, + "learning_rate": 1.6010905182968837e-05, + "loss": 0.1354, + "num_input_tokens_seen": 86732560, + "step": 40175 + }, + { + "epoch": 6.554649265905383, + "grad_norm": 0.05952024087309837, + "learning_rate": 1.6004263612193182e-05, + "loss": 0.0249, + "num_input_tokens_seen": 86743408, + "step": 40180 + }, + { + "epoch": 6.555464926590538, + "grad_norm": 0.5428647994995117, + "learning_rate": 1.599762277065756e-05, + "loss": 0.1321, + "num_input_tokens_seen": 86754384, + "step": 40185 + }, + { + "epoch": 6.556280587275693, + "grad_norm": 1.1010702848434448, + "learning_rate": 1.599098265890031e-05, + "loss": 0.0359, + "num_input_tokens_seen": 86764592, + "step": 40190 + }, + { + "epoch": 6.557096247960848, + "grad_norm": 0.1244596466422081, + "learning_rate": 1.598434327745973e-05, + "loss": 0.0871, + "num_input_tokens_seen": 86776176, + "step": 40195 + }, + { + "epoch": 6.557911908646004, + "grad_norm": 1.6185665130615234, + "learning_rate": 1.5977704626874023e-05, + "loss": 0.104, + "num_input_tokens_seen": 86786320, + "step": 40200 + }, + { + "epoch": 6.558727569331158, + "grad_norm": 0.7490752339363098, + "learning_rate": 1.597106670768136e-05, + "loss": 0.0308, + "num_input_tokens_seen": 86798288, + "step": 40205 + }, + { + "epoch": 6.559543230016313, + "grad_norm": 0.15519112348556519, + "learning_rate": 1.5964429520419836e-05, + "loss": 0.1287, + "num_input_tokens_seen": 86809456, + "step": 40210 + }, + { + "epoch": 6.560358890701468, + "grad_norm": 0.512347400188446, + "learning_rate": 1.595779306562751e-05, + "loss": 0.104, + "num_input_tokens_seen": 86819824, + "step": 40215 + }, + { + "epoch": 6.561174551386623, + "grad_norm": 1.4103317260742188, + "learning_rate": 1.5951157343842352e-05, + "loss": 0.1884, + "num_input_tokens_seen": 86829872, + "step": 40220 + }, + { + "epoch": 6.561990212071779, + "grad_norm": 0.05970706418156624, + "learning_rate": 1.5944522355602297e-05, + "loss": 0.1089, + "num_input_tokens_seen": 86839472, + "step": 40225 + }, + { + "epoch": 6.562805872756933, + "grad_norm": 0.37653136253356934, + "learning_rate": 1.59378881014452e-05, + "loss": 0.0721, + "num_input_tokens_seen": 86851184, + "step": 40230 + }, + { + "epoch": 6.563621533442088, + "grad_norm": 0.10773669928312302, + "learning_rate": 1.5931254581908882e-05, + "loss": 0.0787, + "num_input_tokens_seen": 86861136, + "step": 40235 + }, + { + "epoch": 6.564437194127243, + "grad_norm": 0.08153286576271057, + "learning_rate": 1.592462179753108e-05, + "loss": 0.1175, + "num_input_tokens_seen": 86871344, + "step": 40240 + }, + { + "epoch": 6.565252854812398, + "grad_norm": 1.2713665962219238, + "learning_rate": 1.591798974884948e-05, + "loss": 0.0449, + "num_input_tokens_seen": 86881488, + "step": 40245 + }, + { + "epoch": 6.566068515497553, + "grad_norm": 0.7585523724555969, + "learning_rate": 1.5911358436401708e-05, + "loss": 0.1301, + "num_input_tokens_seen": 86893264, + "step": 40250 + }, + { + "epoch": 6.566884176182708, + "grad_norm": 0.14365580677986145, + "learning_rate": 1.5904727860725344e-05, + "loss": 0.0692, + "num_input_tokens_seen": 86904208, + "step": 40255 + }, + { + "epoch": 6.567699836867863, + "grad_norm": 0.3055436313152313, + "learning_rate": 1.589809802235789e-05, + "loss": 0.1584, + "num_input_tokens_seen": 86916080, + "step": 40260 + }, + { + "epoch": 6.568515497553018, + "grad_norm": 0.17009444534778595, + "learning_rate": 1.589146892183679e-05, + "loss": 0.0347, + "num_input_tokens_seen": 86926640, + "step": 40265 + }, + { + "epoch": 6.569331158238173, + "grad_norm": 0.05290939286351204, + "learning_rate": 1.5884840559699436e-05, + "loss": 0.1079, + "num_input_tokens_seen": 86936688, + "step": 40270 + }, + { + "epoch": 6.570146818923328, + "grad_norm": 0.06997165083885193, + "learning_rate": 1.5878212936483156e-05, + "loss": 0.064, + "num_input_tokens_seen": 86948272, + "step": 40275 + }, + { + "epoch": 6.5709624796084825, + "grad_norm": 6.426630020141602, + "learning_rate": 1.5871586052725216e-05, + "loss": 0.1714, + "num_input_tokens_seen": 86959184, + "step": 40280 + }, + { + "epoch": 6.571778140293638, + "grad_norm": 1.3801244497299194, + "learning_rate": 1.5864959908962832e-05, + "loss": 0.0791, + "num_input_tokens_seen": 86970032, + "step": 40285 + }, + { + "epoch": 6.572593800978793, + "grad_norm": 0.10702414810657501, + "learning_rate": 1.5858334505733137e-05, + "loss": 0.1122, + "num_input_tokens_seen": 86980048, + "step": 40290 + }, + { + "epoch": 6.573409461663948, + "grad_norm": 0.8299185633659363, + "learning_rate": 1.585170984357324e-05, + "loss": 0.0852, + "num_input_tokens_seen": 86991888, + "step": 40295 + }, + { + "epoch": 6.574225122349103, + "grad_norm": 0.2231588065624237, + "learning_rate": 1.5845085923020165e-05, + "loss": 0.0342, + "num_input_tokens_seen": 87003056, + "step": 40300 + }, + { + "epoch": 6.575040783034257, + "grad_norm": 2.0986878871917725, + "learning_rate": 1.5838462744610872e-05, + "loss": 0.0611, + "num_input_tokens_seen": 87014192, + "step": 40305 + }, + { + "epoch": 6.575856443719413, + "grad_norm": 0.3158802390098572, + "learning_rate": 1.5831840308882276e-05, + "loss": 0.2605, + "num_input_tokens_seen": 87024112, + "step": 40310 + }, + { + "epoch": 6.576672104404568, + "grad_norm": 0.09691125899553299, + "learning_rate": 1.5825218616371224e-05, + "loss": 0.1122, + "num_input_tokens_seen": 87034896, + "step": 40315 + }, + { + "epoch": 6.577487765089723, + "grad_norm": 0.11358889937400818, + "learning_rate": 1.5818597667614503e-05, + "loss": 0.1234, + "num_input_tokens_seen": 87045392, + "step": 40320 + }, + { + "epoch": 6.578303425774878, + "grad_norm": 2.486448049545288, + "learning_rate": 1.581197746314884e-05, + "loss": 0.1171, + "num_input_tokens_seen": 87056432, + "step": 40325 + }, + { + "epoch": 6.579119086460032, + "grad_norm": 0.16641297936439514, + "learning_rate": 1.5805358003510902e-05, + "loss": 0.1072, + "num_input_tokens_seen": 87067440, + "step": 40330 + }, + { + "epoch": 6.579934747145187, + "grad_norm": 0.06437788903713226, + "learning_rate": 1.5798739289237298e-05, + "loss": 0.0791, + "num_input_tokens_seen": 87078064, + "step": 40335 + }, + { + "epoch": 6.580750407830343, + "grad_norm": 0.18505960702896118, + "learning_rate": 1.5792121320864573e-05, + "loss": 0.0308, + "num_input_tokens_seen": 87087920, + "step": 40340 + }, + { + "epoch": 6.581566068515498, + "grad_norm": 0.16736316680908203, + "learning_rate": 1.5785504098929217e-05, + "loss": 0.0815, + "num_input_tokens_seen": 87098032, + "step": 40345 + }, + { + "epoch": 6.582381729200653, + "grad_norm": 0.1328691840171814, + "learning_rate": 1.5778887623967654e-05, + "loss": 0.1761, + "num_input_tokens_seen": 87108304, + "step": 40350 + }, + { + "epoch": 6.583197389885807, + "grad_norm": 0.29645320773124695, + "learning_rate": 1.5772271896516245e-05, + "loss": 0.0733, + "num_input_tokens_seen": 87119088, + "step": 40355 + }, + { + "epoch": 6.584013050570962, + "grad_norm": 0.6547529697418213, + "learning_rate": 1.57656569171113e-05, + "loss": 0.2731, + "num_input_tokens_seen": 87130032, + "step": 40360 + }, + { + "epoch": 6.584828711256117, + "grad_norm": 0.250652939081192, + "learning_rate": 1.5759042686289056e-05, + "loss": 0.2866, + "num_input_tokens_seen": 87141104, + "step": 40365 + }, + { + "epoch": 6.585644371941273, + "grad_norm": 0.24125702679157257, + "learning_rate": 1.5752429204585702e-05, + "loss": 0.1215, + "num_input_tokens_seen": 87151472, + "step": 40370 + }, + { + "epoch": 6.5864600326264275, + "grad_norm": 1.5317480564117432, + "learning_rate": 1.5745816472537355e-05, + "loss": 0.0754, + "num_input_tokens_seen": 87161680, + "step": 40375 + }, + { + "epoch": 6.587275693311582, + "grad_norm": 0.3308475911617279, + "learning_rate": 1.5739204490680085e-05, + "loss": 0.0517, + "num_input_tokens_seen": 87172080, + "step": 40380 + }, + { + "epoch": 6.588091353996737, + "grad_norm": 0.5685145258903503, + "learning_rate": 1.5732593259549885e-05, + "loss": 0.0642, + "num_input_tokens_seen": 87181168, + "step": 40385 + }, + { + "epoch": 6.588907014681892, + "grad_norm": 0.055734165012836456, + "learning_rate": 1.57259827796827e-05, + "loss": 0.1318, + "num_input_tokens_seen": 87192144, + "step": 40390 + }, + { + "epoch": 6.589722675367048, + "grad_norm": 0.718437671661377, + "learning_rate": 1.5719373051614393e-05, + "loss": 0.1465, + "num_input_tokens_seen": 87203344, + "step": 40395 + }, + { + "epoch": 6.5905383360522025, + "grad_norm": 0.7224676609039307, + "learning_rate": 1.571276407588081e-05, + "loss": 0.0644, + "num_input_tokens_seen": 87214416, + "step": 40400 + }, + { + "epoch": 6.591353996737357, + "grad_norm": 0.23479150235652924, + "learning_rate": 1.570615585301769e-05, + "loss": 0.0849, + "num_input_tokens_seen": 87225168, + "step": 40405 + }, + { + "epoch": 6.592169657422512, + "grad_norm": 0.41619807481765747, + "learning_rate": 1.5699548383560736e-05, + "loss": 0.2115, + "num_input_tokens_seen": 87237232, + "step": 40410 + }, + { + "epoch": 6.592985318107667, + "grad_norm": 2.385324001312256, + "learning_rate": 1.569294166804558e-05, + "loss": 0.3116, + "num_input_tokens_seen": 87247696, + "step": 40415 + }, + { + "epoch": 6.593800978792823, + "grad_norm": 1.7540605068206787, + "learning_rate": 1.5686335707007794e-05, + "loss": 0.2557, + "num_input_tokens_seen": 87258448, + "step": 40420 + }, + { + "epoch": 6.5946166394779775, + "grad_norm": 0.3498998284339905, + "learning_rate": 1.5679730500982892e-05, + "loss": 0.0276, + "num_input_tokens_seen": 87268880, + "step": 40425 + }, + { + "epoch": 6.595432300163132, + "grad_norm": 0.5554607510566711, + "learning_rate": 1.5673126050506327e-05, + "loss": 0.1136, + "num_input_tokens_seen": 87279568, + "step": 40430 + }, + { + "epoch": 6.596247960848287, + "grad_norm": 0.16495537757873535, + "learning_rate": 1.5666522356113488e-05, + "loss": 0.0979, + "num_input_tokens_seen": 87289776, + "step": 40435 + }, + { + "epoch": 6.597063621533442, + "grad_norm": 0.01851119101047516, + "learning_rate": 1.5659919418339707e-05, + "loss": 0.164, + "num_input_tokens_seen": 87301488, + "step": 40440 + }, + { + "epoch": 6.597879282218597, + "grad_norm": 0.026403142139315605, + "learning_rate": 1.565331723772025e-05, + "loss": 0.04, + "num_input_tokens_seen": 87312112, + "step": 40445 + }, + { + "epoch": 6.598694942903752, + "grad_norm": 1.2029110193252563, + "learning_rate": 1.5646715814790318e-05, + "loss": 0.095, + "num_input_tokens_seen": 87322416, + "step": 40450 + }, + { + "epoch": 6.599510603588907, + "grad_norm": 0.16799312829971313, + "learning_rate": 1.5640115150085067e-05, + "loss": 0.0531, + "num_input_tokens_seen": 87333040, + "step": 40455 + }, + { + "epoch": 6.600326264274062, + "grad_norm": 0.041904717683792114, + "learning_rate": 1.5633515244139567e-05, + "loss": 0.0951, + "num_input_tokens_seen": 87344112, + "step": 40460 + }, + { + "epoch": 6.601141924959217, + "grad_norm": 0.4087704122066498, + "learning_rate": 1.562691609748885e-05, + "loss": 0.065, + "num_input_tokens_seen": 87353680, + "step": 40465 + }, + { + "epoch": 6.601957585644372, + "grad_norm": 0.2745386064052582, + "learning_rate": 1.562031771066787e-05, + "loss": 0.0956, + "num_input_tokens_seen": 87363536, + "step": 40470 + }, + { + "epoch": 6.602773246329527, + "grad_norm": 0.3046417236328125, + "learning_rate": 1.561372008421153e-05, + "loss": 0.0793, + "num_input_tokens_seen": 87374992, + "step": 40475 + }, + { + "epoch": 6.603588907014682, + "grad_norm": 0.040712907910346985, + "learning_rate": 1.560712321865466e-05, + "loss": 0.0729, + "num_input_tokens_seen": 87386448, + "step": 40480 + }, + { + "epoch": 6.604404567699837, + "grad_norm": 0.7552571296691895, + "learning_rate": 1.5600527114532042e-05, + "loss": 0.2293, + "num_input_tokens_seen": 87395760, + "step": 40485 + }, + { + "epoch": 6.605220228384992, + "grad_norm": 0.6252642869949341, + "learning_rate": 1.5593931772378395e-05, + "loss": 0.0581, + "num_input_tokens_seen": 87407088, + "step": 40490 + }, + { + "epoch": 6.606035889070147, + "grad_norm": 1.3083847761154175, + "learning_rate": 1.5587337192728365e-05, + "loss": 0.0428, + "num_input_tokens_seen": 87418256, + "step": 40495 + }, + { + "epoch": 6.6068515497553015, + "grad_norm": 0.31780707836151123, + "learning_rate": 1.5580743376116536e-05, + "loss": 0.0969, + "num_input_tokens_seen": 87427984, + "step": 40500 + }, + { + "epoch": 6.607667210440457, + "grad_norm": 0.03160381689667702, + "learning_rate": 1.5574150323077432e-05, + "loss": 0.241, + "num_input_tokens_seen": 87437424, + "step": 40505 + }, + { + "epoch": 6.608482871125612, + "grad_norm": 0.2899286448955536, + "learning_rate": 1.556755803414554e-05, + "loss": 0.1482, + "num_input_tokens_seen": 87447504, + "step": 40510 + }, + { + "epoch": 6.609298531810767, + "grad_norm": 0.4563083052635193, + "learning_rate": 1.5560966509855256e-05, + "loss": 0.1103, + "num_input_tokens_seen": 87457456, + "step": 40515 + }, + { + "epoch": 6.610114192495922, + "grad_norm": 0.6109724044799805, + "learning_rate": 1.5554375750740917e-05, + "loss": 0.0392, + "num_input_tokens_seen": 87468464, + "step": 40520 + }, + { + "epoch": 6.6109298531810765, + "grad_norm": 0.09066237509250641, + "learning_rate": 1.554778575733681e-05, + "loss": 0.0719, + "num_input_tokens_seen": 87479504, + "step": 40525 + }, + { + "epoch": 6.611745513866231, + "grad_norm": 0.6568520665168762, + "learning_rate": 1.5541196530177148e-05, + "loss": 0.0676, + "num_input_tokens_seen": 87489840, + "step": 40530 + }, + { + "epoch": 6.612561174551386, + "grad_norm": 0.3194960355758667, + "learning_rate": 1.5534608069796085e-05, + "loss": 0.0316, + "num_input_tokens_seen": 87501424, + "step": 40535 + }, + { + "epoch": 6.613376835236542, + "grad_norm": 0.12253714352846146, + "learning_rate": 1.5528020376727725e-05, + "loss": 0.0587, + "num_input_tokens_seen": 87512688, + "step": 40540 + }, + { + "epoch": 6.614192495921697, + "grad_norm": 0.344744473695755, + "learning_rate": 1.5521433451506088e-05, + "loss": 0.1078, + "num_input_tokens_seen": 87523056, + "step": 40545 + }, + { + "epoch": 6.6150081566068515, + "grad_norm": 0.5905764698982239, + "learning_rate": 1.5514847294665152e-05, + "loss": 0.1005, + "num_input_tokens_seen": 87532784, + "step": 40550 + }, + { + "epoch": 6.615823817292006, + "grad_norm": 0.42959144711494446, + "learning_rate": 1.5508261906738824e-05, + "loss": 0.0634, + "num_input_tokens_seen": 87542224, + "step": 40555 + }, + { + "epoch": 6.616639477977161, + "grad_norm": 1.868306279182434, + "learning_rate": 1.5501677288260943e-05, + "loss": 0.0904, + "num_input_tokens_seen": 87554224, + "step": 40560 + }, + { + "epoch": 6.617455138662317, + "grad_norm": 1.105971097946167, + "learning_rate": 1.549509343976529e-05, + "loss": 0.0317, + "num_input_tokens_seen": 87564944, + "step": 40565 + }, + { + "epoch": 6.618270799347472, + "grad_norm": 0.0670371726155281, + "learning_rate": 1.5488510361785597e-05, + "loss": 0.0629, + "num_input_tokens_seen": 87575696, + "step": 40570 + }, + { + "epoch": 6.6190864600326265, + "grad_norm": 0.041098084300756454, + "learning_rate": 1.5481928054855512e-05, + "loss": 0.1233, + "num_input_tokens_seen": 87587216, + "step": 40575 + }, + { + "epoch": 6.619902120717781, + "grad_norm": 0.10849422216415405, + "learning_rate": 1.5475346519508637e-05, + "loss": 0.0897, + "num_input_tokens_seen": 87597872, + "step": 40580 + }, + { + "epoch": 6.620717781402936, + "grad_norm": 0.5332399606704712, + "learning_rate": 1.5468765756278498e-05, + "loss": 0.076, + "num_input_tokens_seen": 87608816, + "step": 40585 + }, + { + "epoch": 6.621533442088092, + "grad_norm": 0.32096970081329346, + "learning_rate": 1.5462185765698568e-05, + "loss": 0.0907, + "num_input_tokens_seen": 87619600, + "step": 40590 + }, + { + "epoch": 6.622349102773247, + "grad_norm": 1.5782692432403564, + "learning_rate": 1.5455606548302253e-05, + "loss": 0.2248, + "num_input_tokens_seen": 87629168, + "step": 40595 + }, + { + "epoch": 6.623164763458401, + "grad_norm": 0.04048575460910797, + "learning_rate": 1.5449028104622905e-05, + "loss": 0.0801, + "num_input_tokens_seen": 87640144, + "step": 40600 + }, + { + "epoch": 6.623980424143556, + "grad_norm": 0.5008759498596191, + "learning_rate": 1.5442450435193795e-05, + "loss": 0.0672, + "num_input_tokens_seen": 87650128, + "step": 40605 + }, + { + "epoch": 6.624796084828711, + "grad_norm": 1.3086535930633545, + "learning_rate": 1.5435873540548135e-05, + "loss": 0.2263, + "num_input_tokens_seen": 87661264, + "step": 40610 + }, + { + "epoch": 6.625611745513866, + "grad_norm": 0.0969243198633194, + "learning_rate": 1.5429297421219107e-05, + "loss": 0.0547, + "num_input_tokens_seen": 87672464, + "step": 40615 + }, + { + "epoch": 6.626427406199021, + "grad_norm": 0.3870543837547302, + "learning_rate": 1.5422722077739794e-05, + "loss": 0.1232, + "num_input_tokens_seen": 87684400, + "step": 40620 + }, + { + "epoch": 6.627243066884176, + "grad_norm": 0.4018827974796295, + "learning_rate": 1.541614751064322e-05, + "loss": 0.2239, + "num_input_tokens_seen": 87695728, + "step": 40625 + }, + { + "epoch": 6.628058727569331, + "grad_norm": 0.025101007893681526, + "learning_rate": 1.5409573720462357e-05, + "loss": 0.0285, + "num_input_tokens_seen": 87705456, + "step": 40630 + }, + { + "epoch": 6.628874388254486, + "grad_norm": 0.1172412633895874, + "learning_rate": 1.540300070773011e-05, + "loss": 0.0324, + "num_input_tokens_seen": 87716816, + "step": 40635 + }, + { + "epoch": 6.629690048939641, + "grad_norm": 0.187732994556427, + "learning_rate": 1.539642847297932e-05, + "loss": 0.0951, + "num_input_tokens_seen": 87727216, + "step": 40640 + }, + { + "epoch": 6.630505709624796, + "grad_norm": 0.12403104454278946, + "learning_rate": 1.5389857016742764e-05, + "loss": 0.0828, + "num_input_tokens_seen": 87737040, + "step": 40645 + }, + { + "epoch": 6.631321370309951, + "grad_norm": 1.1827656030654907, + "learning_rate": 1.538328633955316e-05, + "loss": 0.0958, + "num_input_tokens_seen": 87746928, + "step": 40650 + }, + { + "epoch": 6.632137030995106, + "grad_norm": 0.033573217689991, + "learning_rate": 1.5376716441943162e-05, + "loss": 0.1756, + "num_input_tokens_seen": 87757424, + "step": 40655 + }, + { + "epoch": 6.632952691680261, + "grad_norm": 0.7463297843933105, + "learning_rate": 1.5370147324445354e-05, + "loss": 0.1041, + "num_input_tokens_seen": 87768816, + "step": 40660 + }, + { + "epoch": 6.633768352365416, + "grad_norm": 0.08960691839456558, + "learning_rate": 1.536357898759227e-05, + "loss": 0.1128, + "num_input_tokens_seen": 87780912, + "step": 40665 + }, + { + "epoch": 6.634584013050571, + "grad_norm": 1.6989712715148926, + "learning_rate": 1.535701143191637e-05, + "loss": 0.2614, + "num_input_tokens_seen": 87791920, + "step": 40670 + }, + { + "epoch": 6.635399673735726, + "grad_norm": 0.3100084364414215, + "learning_rate": 1.535044465795005e-05, + "loss": 0.0763, + "num_input_tokens_seen": 87802352, + "step": 40675 + }, + { + "epoch": 6.636215334420881, + "grad_norm": 0.11812779307365417, + "learning_rate": 1.534387866622564e-05, + "loss": 0.0666, + "num_input_tokens_seen": 87813488, + "step": 40680 + }, + { + "epoch": 6.637030995106036, + "grad_norm": 0.5734902024269104, + "learning_rate": 1.5337313457275428e-05, + "loss": 0.2554, + "num_input_tokens_seen": 87824016, + "step": 40685 + }, + { + "epoch": 6.637846655791191, + "grad_norm": 0.7062161564826965, + "learning_rate": 1.533074903163161e-05, + "loss": 0.3096, + "num_input_tokens_seen": 87835120, + "step": 40690 + }, + { + "epoch": 6.638662316476346, + "grad_norm": 0.08380503207445145, + "learning_rate": 1.5324185389826338e-05, + "loss": 0.1654, + "num_input_tokens_seen": 87846480, + "step": 40695 + }, + { + "epoch": 6.6394779771615005, + "grad_norm": 0.333934485912323, + "learning_rate": 1.5317622532391694e-05, + "loss": 0.093, + "num_input_tokens_seen": 87856304, + "step": 40700 + }, + { + "epoch": 6.640293637846656, + "grad_norm": 1.6710364818572998, + "learning_rate": 1.53110604598597e-05, + "loss": 0.1698, + "num_input_tokens_seen": 87868080, + "step": 40705 + }, + { + "epoch": 6.641109298531811, + "grad_norm": 0.22381983697414398, + "learning_rate": 1.5304499172762293e-05, + "loss": 0.1002, + "num_input_tokens_seen": 87878416, + "step": 40710 + }, + { + "epoch": 6.641924959216966, + "grad_norm": 0.12137886136770248, + "learning_rate": 1.5297938671631386e-05, + "loss": 0.2032, + "num_input_tokens_seen": 87888720, + "step": 40715 + }, + { + "epoch": 6.642740619902121, + "grad_norm": 0.36030349135398865, + "learning_rate": 1.5291378956998793e-05, + "loss": 0.1286, + "num_input_tokens_seen": 87900528, + "step": 40720 + }, + { + "epoch": 6.643556280587275, + "grad_norm": 0.651038646697998, + "learning_rate": 1.528482002939629e-05, + "loss": 0.2201, + "num_input_tokens_seen": 87911984, + "step": 40725 + }, + { + "epoch": 6.64437194127243, + "grad_norm": 0.8465638160705566, + "learning_rate": 1.5278261889355568e-05, + "loss": 0.1594, + "num_input_tokens_seen": 87922608, + "step": 40730 + }, + { + "epoch": 6.645187601957586, + "grad_norm": 0.20262335240840912, + "learning_rate": 1.527170453740826e-05, + "loss": 0.1253, + "num_input_tokens_seen": 87933936, + "step": 40735 + }, + { + "epoch": 6.646003262642741, + "grad_norm": 0.06504341959953308, + "learning_rate": 1.5265147974085947e-05, + "loss": 0.1317, + "num_input_tokens_seen": 87945488, + "step": 40740 + }, + { + "epoch": 6.646818923327896, + "grad_norm": 0.05771587789058685, + "learning_rate": 1.5258592199920135e-05, + "loss": 0.2234, + "num_input_tokens_seen": 87956112, + "step": 40745 + }, + { + "epoch": 6.64763458401305, + "grad_norm": 1.749327301979065, + "learning_rate": 1.5252037215442266e-05, + "loss": 0.2171, + "num_input_tokens_seen": 87965584, + "step": 40750 + }, + { + "epoch": 6.648450244698205, + "grad_norm": 0.6959023475646973, + "learning_rate": 1.5245483021183722e-05, + "loss": 0.0934, + "num_input_tokens_seen": 87975056, + "step": 40755 + }, + { + "epoch": 6.649265905383361, + "grad_norm": 0.09561906009912491, + "learning_rate": 1.5238929617675817e-05, + "loss": 0.0829, + "num_input_tokens_seen": 87985712, + "step": 40760 + }, + { + "epoch": 6.650081566068516, + "grad_norm": 1.2606459856033325, + "learning_rate": 1.5232377005449805e-05, + "loss": 0.0961, + "num_input_tokens_seen": 87996336, + "step": 40765 + }, + { + "epoch": 6.650897226753671, + "grad_norm": 0.5474752187728882, + "learning_rate": 1.5225825185036874e-05, + "loss": 0.1167, + "num_input_tokens_seen": 88007184, + "step": 40770 + }, + { + "epoch": 6.651712887438825, + "grad_norm": 0.3217850625514984, + "learning_rate": 1.5219274156968143e-05, + "loss": 0.2368, + "num_input_tokens_seen": 88017008, + "step": 40775 + }, + { + "epoch": 6.65252854812398, + "grad_norm": 0.6430179476737976, + "learning_rate": 1.521272392177468e-05, + "loss": 0.0553, + "num_input_tokens_seen": 88027760, + "step": 40780 + }, + { + "epoch": 6.653344208809135, + "grad_norm": 1.6386064291000366, + "learning_rate": 1.5206174479987475e-05, + "loss": 0.2285, + "num_input_tokens_seen": 88039696, + "step": 40785 + }, + { + "epoch": 6.654159869494291, + "grad_norm": 0.07788561284542084, + "learning_rate": 1.5199625832137459e-05, + "loss": 0.0659, + "num_input_tokens_seen": 88050544, + "step": 40790 + }, + { + "epoch": 6.6549755301794455, + "grad_norm": 0.04013260081410408, + "learning_rate": 1.5193077978755499e-05, + "loss": 0.0343, + "num_input_tokens_seen": 88061840, + "step": 40795 + }, + { + "epoch": 6.6557911908646, + "grad_norm": 1.7897299528121948, + "learning_rate": 1.5186530920372399e-05, + "loss": 0.1317, + "num_input_tokens_seen": 88072816, + "step": 40800 + }, + { + "epoch": 6.656606851549755, + "grad_norm": 0.5235965251922607, + "learning_rate": 1.5179984657518895e-05, + "loss": 0.1418, + "num_input_tokens_seen": 88083536, + "step": 40805 + }, + { + "epoch": 6.65742251223491, + "grad_norm": 2.5646445751190186, + "learning_rate": 1.5173439190725663e-05, + "loss": 0.2204, + "num_input_tokens_seen": 88094704, + "step": 40810 + }, + { + "epoch": 6.658238172920065, + "grad_norm": 1.517723798751831, + "learning_rate": 1.5166894520523305e-05, + "loss": 0.1884, + "num_input_tokens_seen": 88106288, + "step": 40815 + }, + { + "epoch": 6.6590538336052205, + "grad_norm": 0.03288956731557846, + "learning_rate": 1.5160350647442367e-05, + "loss": 0.0648, + "num_input_tokens_seen": 88114960, + "step": 40820 + }, + { + "epoch": 6.659869494290375, + "grad_norm": 0.3002878725528717, + "learning_rate": 1.5153807572013338e-05, + "loss": 0.1342, + "num_input_tokens_seen": 88124944, + "step": 40825 + }, + { + "epoch": 6.66068515497553, + "grad_norm": 0.15495893359184265, + "learning_rate": 1.5147265294766624e-05, + "loss": 0.0148, + "num_input_tokens_seen": 88135056, + "step": 40830 + }, + { + "epoch": 6.661500815660685, + "grad_norm": 0.44577357172966003, + "learning_rate": 1.5140723816232583e-05, + "loss": 0.0576, + "num_input_tokens_seen": 88146768, + "step": 40835 + }, + { + "epoch": 6.66231647634584, + "grad_norm": 0.0852656289935112, + "learning_rate": 1.5134183136941487e-05, + "loss": 0.2872, + "num_input_tokens_seen": 88157968, + "step": 40840 + }, + { + "epoch": 6.6631321370309955, + "grad_norm": 0.04005548357963562, + "learning_rate": 1.5127643257423572e-05, + "loss": 0.1338, + "num_input_tokens_seen": 88168912, + "step": 40845 + }, + { + "epoch": 6.66394779771615, + "grad_norm": 0.6272658705711365, + "learning_rate": 1.5121104178208984e-05, + "loss": 0.0799, + "num_input_tokens_seen": 88179632, + "step": 40850 + }, + { + "epoch": 6.664763458401305, + "grad_norm": 0.4420868158340454, + "learning_rate": 1.5114565899827815e-05, + "loss": 0.0394, + "num_input_tokens_seen": 88191472, + "step": 40855 + }, + { + "epoch": 6.66557911908646, + "grad_norm": 0.1335001289844513, + "learning_rate": 1.5108028422810094e-05, + "loss": 0.0798, + "num_input_tokens_seen": 88202800, + "step": 40860 + }, + { + "epoch": 6.666394779771615, + "grad_norm": 0.6417424082756042, + "learning_rate": 1.510149174768578e-05, + "loss": 0.208, + "num_input_tokens_seen": 88212848, + "step": 40865 + }, + { + "epoch": 6.6672104404567705, + "grad_norm": 0.5994857549667358, + "learning_rate": 1.5094955874984767e-05, + "loss": 0.2102, + "num_input_tokens_seen": 88224112, + "step": 40870 + }, + { + "epoch": 6.668026101141925, + "grad_norm": 0.11276267468929291, + "learning_rate": 1.5088420805236892e-05, + "loss": 0.1703, + "num_input_tokens_seen": 88234704, + "step": 40875 + }, + { + "epoch": 6.66884176182708, + "grad_norm": 0.4232877790927887, + "learning_rate": 1.5081886538971911e-05, + "loss": 0.0566, + "num_input_tokens_seen": 88247216, + "step": 40880 + }, + { + "epoch": 6.669657422512235, + "grad_norm": 0.4392523169517517, + "learning_rate": 1.5075353076719536e-05, + "loss": 0.1415, + "num_input_tokens_seen": 88259664, + "step": 40885 + }, + { + "epoch": 6.67047308319739, + "grad_norm": 0.5615129470825195, + "learning_rate": 1.50688204190094e-05, + "loss": 0.117, + "num_input_tokens_seen": 88270128, + "step": 40890 + }, + { + "epoch": 6.671288743882545, + "grad_norm": 0.12008053809404373, + "learning_rate": 1.5062288566371069e-05, + "loss": 0.0368, + "num_input_tokens_seen": 88280176, + "step": 40895 + }, + { + "epoch": 6.672104404567699, + "grad_norm": 1.7624731063842773, + "learning_rate": 1.5055757519334048e-05, + "loss": 0.1564, + "num_input_tokens_seen": 88290288, + "step": 40900 + }, + { + "epoch": 6.672920065252855, + "grad_norm": 0.06425990164279938, + "learning_rate": 1.5049227278427782e-05, + "loss": 0.1199, + "num_input_tokens_seen": 88301296, + "step": 40905 + }, + { + "epoch": 6.67373572593801, + "grad_norm": 1.095747470855713, + "learning_rate": 1.504269784418164e-05, + "loss": 0.0808, + "num_input_tokens_seen": 88312752, + "step": 40910 + }, + { + "epoch": 6.674551386623165, + "grad_norm": 0.2130989134311676, + "learning_rate": 1.5036169217124938e-05, + "loss": 0.0325, + "num_input_tokens_seen": 88324144, + "step": 40915 + }, + { + "epoch": 6.6753670473083195, + "grad_norm": 0.17120704054832458, + "learning_rate": 1.5029641397786912e-05, + "loss": 0.1236, + "num_input_tokens_seen": 88335088, + "step": 40920 + }, + { + "epoch": 6.676182707993474, + "grad_norm": 2.7445452213287354, + "learning_rate": 1.5023114386696746e-05, + "loss": 0.0893, + "num_input_tokens_seen": 88345360, + "step": 40925 + }, + { + "epoch": 6.67699836867863, + "grad_norm": 1.382515788078308, + "learning_rate": 1.5016588184383536e-05, + "loss": 0.2131, + "num_input_tokens_seen": 88355888, + "step": 40930 + }, + { + "epoch": 6.677814029363785, + "grad_norm": 0.04811516031622887, + "learning_rate": 1.5010062791376355e-05, + "loss": 0.0913, + "num_input_tokens_seen": 88366832, + "step": 40935 + }, + { + "epoch": 6.67862969004894, + "grad_norm": 1.2838526964187622, + "learning_rate": 1.5003538208204173e-05, + "loss": 0.2408, + "num_input_tokens_seen": 88378320, + "step": 40940 + }, + { + "epoch": 6.6794453507340945, + "grad_norm": 0.30030274391174316, + "learning_rate": 1.4997014435395906e-05, + "loss": 0.1392, + "num_input_tokens_seen": 88388656, + "step": 40945 + }, + { + "epoch": 6.680261011419249, + "grad_norm": 1.8680000305175781, + "learning_rate": 1.4990491473480403e-05, + "loss": 0.3516, + "num_input_tokens_seen": 88399632, + "step": 40950 + }, + { + "epoch": 6.681076672104405, + "grad_norm": 1.5184423923492432, + "learning_rate": 1.4983969322986446e-05, + "loss": 0.0576, + "num_input_tokens_seen": 88410032, + "step": 40955 + }, + { + "epoch": 6.68189233278956, + "grad_norm": 0.2360112965106964, + "learning_rate": 1.497744798444276e-05, + "loss": 0.0808, + "num_input_tokens_seen": 88421008, + "step": 40960 + }, + { + "epoch": 6.682707993474715, + "grad_norm": 0.22211392223834991, + "learning_rate": 1.497092745837799e-05, + "loss": 0.0967, + "num_input_tokens_seen": 88432144, + "step": 40965 + }, + { + "epoch": 6.6835236541598695, + "grad_norm": 0.5283361077308655, + "learning_rate": 1.496440774532073e-05, + "loss": 0.1287, + "num_input_tokens_seen": 88442576, + "step": 40970 + }, + { + "epoch": 6.684339314845024, + "grad_norm": 0.6950168013572693, + "learning_rate": 1.49578888457995e-05, + "loss": 0.1524, + "num_input_tokens_seen": 88452048, + "step": 40975 + }, + { + "epoch": 6.685154975530179, + "grad_norm": 0.054601334035396576, + "learning_rate": 1.4951370760342754e-05, + "loss": 0.3297, + "num_input_tokens_seen": 88463280, + "step": 40980 + }, + { + "epoch": 6.685970636215334, + "grad_norm": 0.29519644379615784, + "learning_rate": 1.4944853489478878e-05, + "loss": 0.1602, + "num_input_tokens_seen": 88473488, + "step": 40985 + }, + { + "epoch": 6.68678629690049, + "grad_norm": 0.06688065826892853, + "learning_rate": 1.4938337033736196e-05, + "loss": 0.0126, + "num_input_tokens_seen": 88484304, + "step": 40990 + }, + { + "epoch": 6.6876019575856445, + "grad_norm": 0.07900120317935944, + "learning_rate": 1.4931821393642969e-05, + "loss": 0.1812, + "num_input_tokens_seen": 88495088, + "step": 40995 + }, + { + "epoch": 6.688417618270799, + "grad_norm": 0.10752258449792862, + "learning_rate": 1.4925306569727385e-05, + "loss": 0.159, + "num_input_tokens_seen": 88506512, + "step": 41000 + }, + { + "epoch": 6.689233278955954, + "grad_norm": 1.3085347414016724, + "learning_rate": 1.491879256251757e-05, + "loss": 0.0445, + "num_input_tokens_seen": 88518192, + "step": 41005 + }, + { + "epoch": 6.690048939641109, + "grad_norm": 0.1823146641254425, + "learning_rate": 1.4912279372541577e-05, + "loss": 0.0585, + "num_input_tokens_seen": 88526928, + "step": 41010 + }, + { + "epoch": 6.690864600326265, + "grad_norm": 0.06104184314608574, + "learning_rate": 1.4905767000327409e-05, + "loss": 0.1088, + "num_input_tokens_seen": 88537616, + "step": 41015 + }, + { + "epoch": 6.691680261011419, + "grad_norm": 0.16000205278396606, + "learning_rate": 1.4899255446402982e-05, + "loss": 0.1346, + "num_input_tokens_seen": 88547408, + "step": 41020 + }, + { + "epoch": 6.692495921696574, + "grad_norm": 0.026509514078497887, + "learning_rate": 1.4892744711296152e-05, + "loss": 0.03, + "num_input_tokens_seen": 88557232, + "step": 41025 + }, + { + "epoch": 6.693311582381729, + "grad_norm": 0.3958442807197571, + "learning_rate": 1.488623479553473e-05, + "loss": 0.1389, + "num_input_tokens_seen": 88567632, + "step": 41030 + }, + { + "epoch": 6.694127243066884, + "grad_norm": 0.5070660710334778, + "learning_rate": 1.4879725699646424e-05, + "loss": 0.1351, + "num_input_tokens_seen": 88579312, + "step": 41035 + }, + { + "epoch": 6.69494290375204, + "grad_norm": 1.4027509689331055, + "learning_rate": 1.4873217424158906e-05, + "loss": 0.0638, + "num_input_tokens_seen": 88590608, + "step": 41040 + }, + { + "epoch": 6.695758564437194, + "grad_norm": 0.48785921931266785, + "learning_rate": 1.4866709969599767e-05, + "loss": 0.0684, + "num_input_tokens_seen": 88601904, + "step": 41045 + }, + { + "epoch": 6.696574225122349, + "grad_norm": 0.013206214644014835, + "learning_rate": 1.486020333649653e-05, + "loss": 0.0209, + "num_input_tokens_seen": 88611088, + "step": 41050 + }, + { + "epoch": 6.697389885807504, + "grad_norm": 1.4933032989501953, + "learning_rate": 1.4853697525376665e-05, + "loss": 0.1597, + "num_input_tokens_seen": 88623120, + "step": 41055 + }, + { + "epoch": 6.698205546492659, + "grad_norm": 0.7586897611618042, + "learning_rate": 1.484719253676756e-05, + "loss": 0.1987, + "num_input_tokens_seen": 88634064, + "step": 41060 + }, + { + "epoch": 6.699021207177814, + "grad_norm": 1.3936363458633423, + "learning_rate": 1.4840688371196543e-05, + "loss": 0.0476, + "num_input_tokens_seen": 88645392, + "step": 41065 + }, + { + "epoch": 6.699836867862969, + "grad_norm": 0.18788082897663116, + "learning_rate": 1.4834185029190873e-05, + "loss": 0.0296, + "num_input_tokens_seen": 88657808, + "step": 41070 + }, + { + "epoch": 6.700652528548124, + "grad_norm": 0.2696627676486969, + "learning_rate": 1.4827682511277746e-05, + "loss": 0.035, + "num_input_tokens_seen": 88668048, + "step": 41075 + }, + { + "epoch": 6.701468189233279, + "grad_norm": 0.15397608280181885, + "learning_rate": 1.4821180817984288e-05, + "loss": 0.0704, + "num_input_tokens_seen": 88677456, + "step": 41080 + }, + { + "epoch": 6.702283849918434, + "grad_norm": 1.8295373916625977, + "learning_rate": 1.4814679949837563e-05, + "loss": 0.2819, + "num_input_tokens_seen": 88688592, + "step": 41085 + }, + { + "epoch": 6.703099510603589, + "grad_norm": 0.18628595769405365, + "learning_rate": 1.4808179907364555e-05, + "loss": 0.0717, + "num_input_tokens_seen": 88699152, + "step": 41090 + }, + { + "epoch": 6.7039151712887435, + "grad_norm": 0.06498292833566666, + "learning_rate": 1.48016806910922e-05, + "loss": 0.0817, + "num_input_tokens_seen": 88709776, + "step": 41095 + }, + { + "epoch": 6.704730831973899, + "grad_norm": 1.7655137777328491, + "learning_rate": 1.4795182301547356e-05, + "loss": 0.2272, + "num_input_tokens_seen": 88721328, + "step": 41100 + }, + { + "epoch": 6.705546492659054, + "grad_norm": 1.3744027614593506, + "learning_rate": 1.4788684739256808e-05, + "loss": 0.2769, + "num_input_tokens_seen": 88732432, + "step": 41105 + }, + { + "epoch": 6.706362153344209, + "grad_norm": 0.11301099509000778, + "learning_rate": 1.4782188004747289e-05, + "loss": 0.0626, + "num_input_tokens_seen": 88743312, + "step": 41110 + }, + { + "epoch": 6.707177814029364, + "grad_norm": 1.0187163352966309, + "learning_rate": 1.4775692098545451e-05, + "loss": 0.069, + "num_input_tokens_seen": 88754096, + "step": 41115 + }, + { + "epoch": 6.7079934747145185, + "grad_norm": 0.8714965581893921, + "learning_rate": 1.4769197021177896e-05, + "loss": 0.1068, + "num_input_tokens_seen": 88765200, + "step": 41120 + }, + { + "epoch": 6.708809135399674, + "grad_norm": 0.1316436380147934, + "learning_rate": 1.476270277317114e-05, + "loss": 0.1247, + "num_input_tokens_seen": 88777136, + "step": 41125 + }, + { + "epoch": 6.709624796084829, + "grad_norm": 1.0095044374465942, + "learning_rate": 1.475620935505164e-05, + "loss": 0.1634, + "num_input_tokens_seen": 88789040, + "step": 41130 + }, + { + "epoch": 6.710440456769984, + "grad_norm": 0.20141276717185974, + "learning_rate": 1.4749716767345784e-05, + "loss": 0.2931, + "num_input_tokens_seen": 88799792, + "step": 41135 + }, + { + "epoch": 6.711256117455139, + "grad_norm": 1.8397914171218872, + "learning_rate": 1.4743225010579889e-05, + "loss": 0.1513, + "num_input_tokens_seen": 88810224, + "step": 41140 + }, + { + "epoch": 6.712071778140293, + "grad_norm": 0.12461846321821213, + "learning_rate": 1.4736734085280226e-05, + "loss": 0.1697, + "num_input_tokens_seen": 88820112, + "step": 41145 + }, + { + "epoch": 6.712887438825448, + "grad_norm": 1.3688398599624634, + "learning_rate": 1.4730243991972976e-05, + "loss": 0.1284, + "num_input_tokens_seen": 88830864, + "step": 41150 + }, + { + "epoch": 6.713703099510604, + "grad_norm": 0.8252411484718323, + "learning_rate": 1.4723754731184253e-05, + "loss": 0.0523, + "num_input_tokens_seen": 88842096, + "step": 41155 + }, + { + "epoch": 6.714518760195759, + "grad_norm": 1.244638442993164, + "learning_rate": 1.4717266303440113e-05, + "loss": 0.1098, + "num_input_tokens_seen": 88853232, + "step": 41160 + }, + { + "epoch": 6.715334420880914, + "grad_norm": 0.3863266706466675, + "learning_rate": 1.471077870926654e-05, + "loss": 0.1182, + "num_input_tokens_seen": 88863408, + "step": 41165 + }, + { + "epoch": 6.716150081566068, + "grad_norm": 1.0179718732833862, + "learning_rate": 1.4704291949189452e-05, + "loss": 0.1355, + "num_input_tokens_seen": 88873648, + "step": 41170 + }, + { + "epoch": 6.716965742251223, + "grad_norm": 0.04188407212495804, + "learning_rate": 1.46978060237347e-05, + "loss": 0.0574, + "num_input_tokens_seen": 88884720, + "step": 41175 + }, + { + "epoch": 6.717781402936378, + "grad_norm": 0.20554763078689575, + "learning_rate": 1.4691320933428066e-05, + "loss": 0.0427, + "num_input_tokens_seen": 88895632, + "step": 41180 + }, + { + "epoch": 6.718597063621534, + "grad_norm": 0.3847046196460724, + "learning_rate": 1.4684836678795259e-05, + "loss": 0.2063, + "num_input_tokens_seen": 88905456, + "step": 41185 + }, + { + "epoch": 6.719412724306689, + "grad_norm": 0.7722967267036438, + "learning_rate": 1.4678353260361927e-05, + "loss": 0.1429, + "num_input_tokens_seen": 88916848, + "step": 41190 + }, + { + "epoch": 6.720228384991843, + "grad_norm": 0.5429057478904724, + "learning_rate": 1.4671870678653653e-05, + "loss": 0.0636, + "num_input_tokens_seen": 88928240, + "step": 41195 + }, + { + "epoch": 6.721044045676998, + "grad_norm": 0.39141789078712463, + "learning_rate": 1.466538893419595e-05, + "loss": 0.0595, + "num_input_tokens_seen": 88940016, + "step": 41200 + }, + { + "epoch": 6.721859706362153, + "grad_norm": 0.905299961566925, + "learning_rate": 1.4658908027514256e-05, + "loss": 0.2313, + "num_input_tokens_seen": 88950544, + "step": 41205 + }, + { + "epoch": 6.722675367047309, + "grad_norm": 0.7858887910842896, + "learning_rate": 1.4652427959133947e-05, + "loss": 0.1231, + "num_input_tokens_seen": 88961520, + "step": 41210 + }, + { + "epoch": 6.7234910277324635, + "grad_norm": 0.6687313914299011, + "learning_rate": 1.4645948729580331e-05, + "loss": 0.191, + "num_input_tokens_seen": 88971952, + "step": 41215 + }, + { + "epoch": 6.724306688417618, + "grad_norm": 1.0598692893981934, + "learning_rate": 1.4639470339378647e-05, + "loss": 0.1737, + "num_input_tokens_seen": 88981840, + "step": 41220 + }, + { + "epoch": 6.725122349102773, + "grad_norm": 1.20963454246521, + "learning_rate": 1.4632992789054064e-05, + "loss": 0.1517, + "num_input_tokens_seen": 88992656, + "step": 41225 + }, + { + "epoch": 6.725938009787928, + "grad_norm": 1.326286792755127, + "learning_rate": 1.4626516079131692e-05, + "loss": 0.2422, + "num_input_tokens_seen": 89004432, + "step": 41230 + }, + { + "epoch": 6.726753670473083, + "grad_norm": 0.42538654804229736, + "learning_rate": 1.4620040210136557e-05, + "loss": 0.1195, + "num_input_tokens_seen": 89014736, + "step": 41235 + }, + { + "epoch": 6.7275693311582385, + "grad_norm": 0.05522443726658821, + "learning_rate": 1.461356518259363e-05, + "loss": 0.1553, + "num_input_tokens_seen": 89024976, + "step": 41240 + }, + { + "epoch": 6.728384991843393, + "grad_norm": 0.11293748021125793, + "learning_rate": 1.4607090997027812e-05, + "loss": 0.0141, + "num_input_tokens_seen": 89035440, + "step": 41245 + }, + { + "epoch": 6.729200652528548, + "grad_norm": 1.9483203887939453, + "learning_rate": 1.4600617653963918e-05, + "loss": 0.1028, + "num_input_tokens_seen": 89046608, + "step": 41250 + }, + { + "epoch": 6.730016313213703, + "grad_norm": 1.1261625289916992, + "learning_rate": 1.4594145153926737e-05, + "loss": 0.1574, + "num_input_tokens_seen": 89056912, + "step": 41255 + }, + { + "epoch": 6.730831973898858, + "grad_norm": 0.21142062544822693, + "learning_rate": 1.4587673497440946e-05, + "loss": 0.1104, + "num_input_tokens_seen": 89068048, + "step": 41260 + }, + { + "epoch": 6.731647634584013, + "grad_norm": 0.36517786979675293, + "learning_rate": 1.458120268503117e-05, + "loss": 0.1268, + "num_input_tokens_seen": 89078544, + "step": 41265 + }, + { + "epoch": 6.732463295269168, + "grad_norm": 1.7117011547088623, + "learning_rate": 1.4574732717221972e-05, + "loss": 0.1406, + "num_input_tokens_seen": 89090768, + "step": 41270 + }, + { + "epoch": 6.733278955954323, + "grad_norm": 1.1101882457733154, + "learning_rate": 1.456826359453784e-05, + "loss": 0.2189, + "num_input_tokens_seen": 89101040, + "step": 41275 + }, + { + "epoch": 6.734094616639478, + "grad_norm": 0.5816317796707153, + "learning_rate": 1.4561795317503185e-05, + "loss": 0.1141, + "num_input_tokens_seen": 89112112, + "step": 41280 + }, + { + "epoch": 6.734910277324633, + "grad_norm": 0.14495745301246643, + "learning_rate": 1.455532788664237e-05, + "loss": 0.1879, + "num_input_tokens_seen": 89123056, + "step": 41285 + }, + { + "epoch": 6.735725938009788, + "grad_norm": 1.3629939556121826, + "learning_rate": 1.4548861302479672e-05, + "loss": 0.1559, + "num_input_tokens_seen": 89135120, + "step": 41290 + }, + { + "epoch": 6.736541598694943, + "grad_norm": 0.3276247978210449, + "learning_rate": 1.4542395565539302e-05, + "loss": 0.183, + "num_input_tokens_seen": 89147184, + "step": 41295 + }, + { + "epoch": 6.737357259380098, + "grad_norm": 0.5624023675918579, + "learning_rate": 1.453593067634541e-05, + "loss": 0.0834, + "num_input_tokens_seen": 89156912, + "step": 41300 + }, + { + "epoch": 6.738172920065253, + "grad_norm": 0.08098630607128143, + "learning_rate": 1.4529466635422063e-05, + "loss": 0.0462, + "num_input_tokens_seen": 89167184, + "step": 41305 + }, + { + "epoch": 6.738988580750408, + "grad_norm": 0.5440002083778381, + "learning_rate": 1.4523003443293285e-05, + "loss": 0.0871, + "num_input_tokens_seen": 89178448, + "step": 41310 + }, + { + "epoch": 6.739804241435563, + "grad_norm": 0.7725046873092651, + "learning_rate": 1.4516541100483008e-05, + "loss": 0.1854, + "num_input_tokens_seen": 89189072, + "step": 41315 + }, + { + "epoch": 6.740619902120718, + "grad_norm": 0.029194016009569168, + "learning_rate": 1.4510079607515104e-05, + "loss": 0.0793, + "num_input_tokens_seen": 89200496, + "step": 41320 + }, + { + "epoch": 6.741435562805873, + "grad_norm": 0.2794092297554016, + "learning_rate": 1.4503618964913368e-05, + "loss": 0.0584, + "num_input_tokens_seen": 89209840, + "step": 41325 + }, + { + "epoch": 6.742251223491028, + "grad_norm": 0.9811379313468933, + "learning_rate": 1.4497159173201541e-05, + "loss": 0.1802, + "num_input_tokens_seen": 89220240, + "step": 41330 + }, + { + "epoch": 6.743066884176183, + "grad_norm": 0.15792767703533173, + "learning_rate": 1.4490700232903281e-05, + "loss": 0.0898, + "num_input_tokens_seen": 89232400, + "step": 41335 + }, + { + "epoch": 6.7438825448613375, + "grad_norm": 3.0549261569976807, + "learning_rate": 1.4484242144542184e-05, + "loss": 0.2689, + "num_input_tokens_seen": 89242608, + "step": 41340 + }, + { + "epoch": 6.744698205546492, + "grad_norm": 0.20603740215301514, + "learning_rate": 1.4477784908641775e-05, + "loss": 0.0496, + "num_input_tokens_seen": 89253328, + "step": 41345 + }, + { + "epoch": 6.745513866231647, + "grad_norm": 0.09781156480312347, + "learning_rate": 1.4471328525725512e-05, + "loss": 0.0762, + "num_input_tokens_seen": 89263984, + "step": 41350 + }, + { + "epoch": 6.746329526916803, + "grad_norm": 0.89781254529953, + "learning_rate": 1.446487299631677e-05, + "loss": 0.0391, + "num_input_tokens_seen": 89275856, + "step": 41355 + }, + { + "epoch": 6.747145187601958, + "grad_norm": 1.2425040006637573, + "learning_rate": 1.4458418320938886e-05, + "loss": 0.1209, + "num_input_tokens_seen": 89287024, + "step": 41360 + }, + { + "epoch": 6.7479608482871125, + "grad_norm": 0.1673307716846466, + "learning_rate": 1.4451964500115101e-05, + "loss": 0.1941, + "num_input_tokens_seen": 89298032, + "step": 41365 + }, + { + "epoch": 6.748776508972267, + "grad_norm": 0.8001136779785156, + "learning_rate": 1.4445511534368595e-05, + "loss": 0.0304, + "num_input_tokens_seen": 89309424, + "step": 41370 + }, + { + "epoch": 6.749592169657422, + "grad_norm": 1.282899022102356, + "learning_rate": 1.4439059424222474e-05, + "loss": 0.0806, + "num_input_tokens_seen": 89320880, + "step": 41375 + }, + { + "epoch": 6.750407830342578, + "grad_norm": 0.5207896828651428, + "learning_rate": 1.4432608170199785e-05, + "loss": 0.2023, + "num_input_tokens_seen": 89332624, + "step": 41380 + }, + { + "epoch": 6.751223491027733, + "grad_norm": 0.6657674908638, + "learning_rate": 1.4426157772823495e-05, + "loss": 0.1516, + "num_input_tokens_seen": 89343344, + "step": 41385 + }, + { + "epoch": 6.7520391517128875, + "grad_norm": 0.9963172078132629, + "learning_rate": 1.4419708232616508e-05, + "loss": 0.1206, + "num_input_tokens_seen": 89355344, + "step": 41390 + }, + { + "epoch": 6.752854812398042, + "grad_norm": 0.6785929203033447, + "learning_rate": 1.4413259550101654e-05, + "loss": 0.0963, + "num_input_tokens_seen": 89366160, + "step": 41395 + }, + { + "epoch": 6.753670473083197, + "grad_norm": 0.4346177577972412, + "learning_rate": 1.4406811725801696e-05, + "loss": 0.1175, + "num_input_tokens_seen": 89376976, + "step": 41400 + }, + { + "epoch": 6.754486133768353, + "grad_norm": 1.432991623878479, + "learning_rate": 1.4400364760239333e-05, + "loss": 0.2354, + "num_input_tokens_seen": 89388176, + "step": 41405 + }, + { + "epoch": 6.755301794453508, + "grad_norm": 1.3082832098007202, + "learning_rate": 1.4393918653937183e-05, + "loss": 0.1262, + "num_input_tokens_seen": 89399152, + "step": 41410 + }, + { + "epoch": 6.7561174551386625, + "grad_norm": 1.3171734809875488, + "learning_rate": 1.4387473407417801e-05, + "loss": 0.2325, + "num_input_tokens_seen": 89409968, + "step": 41415 + }, + { + "epoch": 6.756933115823817, + "grad_norm": 0.0747123584151268, + "learning_rate": 1.438102902120367e-05, + "loss": 0.0465, + "num_input_tokens_seen": 89420560, + "step": 41420 + }, + { + "epoch": 6.757748776508972, + "grad_norm": 1.0879179239273071, + "learning_rate": 1.437458549581721e-05, + "loss": 0.1693, + "num_input_tokens_seen": 89431952, + "step": 41425 + }, + { + "epoch": 6.758564437194127, + "grad_norm": 0.14940160512924194, + "learning_rate": 1.4368142831780763e-05, + "loss": 0.0749, + "num_input_tokens_seen": 89443152, + "step": 41430 + }, + { + "epoch": 6.759380097879282, + "grad_norm": 2.597946882247925, + "learning_rate": 1.4361701029616598e-05, + "loss": 0.1509, + "num_input_tokens_seen": 89454576, + "step": 41435 + }, + { + "epoch": 6.760195758564437, + "grad_norm": 0.110374815762043, + "learning_rate": 1.4355260089846931e-05, + "loss": 0.032, + "num_input_tokens_seen": 89465840, + "step": 41440 + }, + { + "epoch": 6.761011419249592, + "grad_norm": 0.07796543836593628, + "learning_rate": 1.434882001299389e-05, + "loss": 0.0306, + "num_input_tokens_seen": 89476400, + "step": 41445 + }, + { + "epoch": 6.761827079934747, + "grad_norm": 1.6744979619979858, + "learning_rate": 1.4342380799579533e-05, + "loss": 0.2183, + "num_input_tokens_seen": 89487568, + "step": 41450 + }, + { + "epoch": 6.762642740619902, + "grad_norm": 1.156364917755127, + "learning_rate": 1.4335942450125872e-05, + "loss": 0.0748, + "num_input_tokens_seen": 89496400, + "step": 41455 + }, + { + "epoch": 6.763458401305057, + "grad_norm": 0.146433487534523, + "learning_rate": 1.4329504965154827e-05, + "loss": 0.0704, + "num_input_tokens_seen": 89507440, + "step": 41460 + }, + { + "epoch": 6.764274061990212, + "grad_norm": 0.4593377113342285, + "learning_rate": 1.4323068345188253e-05, + "loss": 0.1344, + "num_input_tokens_seen": 89516624, + "step": 41465 + }, + { + "epoch": 6.765089722675367, + "grad_norm": 0.0947418361902237, + "learning_rate": 1.431663259074793e-05, + "loss": 0.1513, + "num_input_tokens_seen": 89526352, + "step": 41470 + }, + { + "epoch": 6.765905383360522, + "grad_norm": 0.10713279247283936, + "learning_rate": 1.4310197702355572e-05, + "loss": 0.1373, + "num_input_tokens_seen": 89538128, + "step": 41475 + }, + { + "epoch": 6.766721044045677, + "grad_norm": 0.49388831853866577, + "learning_rate": 1.430376368053283e-05, + "loss": 0.1175, + "num_input_tokens_seen": 89549136, + "step": 41480 + }, + { + "epoch": 6.767536704730832, + "grad_norm": 0.09658953547477722, + "learning_rate": 1.429733052580128e-05, + "loss": 0.0599, + "num_input_tokens_seen": 89559728, + "step": 41485 + }, + { + "epoch": 6.768352365415987, + "grad_norm": 0.18450599908828735, + "learning_rate": 1.4290898238682421e-05, + "loss": 0.0676, + "num_input_tokens_seen": 89569904, + "step": 41490 + }, + { + "epoch": 6.769168026101142, + "grad_norm": 0.027618931606411934, + "learning_rate": 1.428446681969769e-05, + "loss": 0.1673, + "num_input_tokens_seen": 89581008, + "step": 41495 + }, + { + "epoch": 6.769983686786297, + "grad_norm": 1.102447271347046, + "learning_rate": 1.427803626936845e-05, + "loss": 0.3614, + "num_input_tokens_seen": 89591728, + "step": 41500 + }, + { + "epoch": 6.770799347471452, + "grad_norm": 0.09193006157875061, + "learning_rate": 1.4271606588215988e-05, + "loss": 0.0181, + "num_input_tokens_seen": 89601456, + "step": 41505 + }, + { + "epoch": 6.771615008156607, + "grad_norm": 0.7347052097320557, + "learning_rate": 1.4265177776761534e-05, + "loss": 0.0771, + "num_input_tokens_seen": 89612368, + "step": 41510 + }, + { + "epoch": 6.7724306688417615, + "grad_norm": 0.6955236196517944, + "learning_rate": 1.4258749835526235e-05, + "loss": 0.0712, + "num_input_tokens_seen": 89623312, + "step": 41515 + }, + { + "epoch": 6.773246329526917, + "grad_norm": 1.9819200038909912, + "learning_rate": 1.4252322765031179e-05, + "loss": 0.1392, + "num_input_tokens_seen": 89633936, + "step": 41520 + }, + { + "epoch": 6.774061990212072, + "grad_norm": 0.15832723677158356, + "learning_rate": 1.4245896565797373e-05, + "loss": 0.137, + "num_input_tokens_seen": 89643696, + "step": 41525 + }, + { + "epoch": 6.774877650897227, + "grad_norm": 0.29025018215179443, + "learning_rate": 1.4239471238345753e-05, + "loss": 0.0512, + "num_input_tokens_seen": 89653936, + "step": 41530 + }, + { + "epoch": 6.775693311582382, + "grad_norm": 0.42746251821517944, + "learning_rate": 1.4233046783197195e-05, + "loss": 0.0552, + "num_input_tokens_seen": 89664752, + "step": 41535 + }, + { + "epoch": 6.7765089722675365, + "grad_norm": 0.1110294759273529, + "learning_rate": 1.4226623200872496e-05, + "loss": 0.0513, + "num_input_tokens_seen": 89675984, + "step": 41540 + }, + { + "epoch": 6.777324632952691, + "grad_norm": 0.29023870825767517, + "learning_rate": 1.4220200491892383e-05, + "loss": 0.0166, + "num_input_tokens_seen": 89686704, + "step": 41545 + }, + { + "epoch": 6.778140293637847, + "grad_norm": 0.3369961380958557, + "learning_rate": 1.4213778656777515e-05, + "loss": 0.1729, + "num_input_tokens_seen": 89697040, + "step": 41550 + }, + { + "epoch": 6.778955954323002, + "grad_norm": 0.06932645291090012, + "learning_rate": 1.4207357696048479e-05, + "loss": 0.0964, + "num_input_tokens_seen": 89708208, + "step": 41555 + }, + { + "epoch": 6.779771615008157, + "grad_norm": 0.38983073830604553, + "learning_rate": 1.4200937610225787e-05, + "loss": 0.0659, + "num_input_tokens_seen": 89719120, + "step": 41560 + }, + { + "epoch": 6.780587275693311, + "grad_norm": 0.13520169258117676, + "learning_rate": 1.4194518399829887e-05, + "loss": 0.0713, + "num_input_tokens_seen": 89730384, + "step": 41565 + }, + { + "epoch": 6.781402936378466, + "grad_norm": 0.10333213210105896, + "learning_rate": 1.4188100065381144e-05, + "loss": 0.0152, + "num_input_tokens_seen": 89742128, + "step": 41570 + }, + { + "epoch": 6.782218597063622, + "grad_norm": 0.037483684718608856, + "learning_rate": 1.4181682607399877e-05, + "loss": 0.119, + "num_input_tokens_seen": 89752944, + "step": 41575 + }, + { + "epoch": 6.783034257748777, + "grad_norm": 0.05231236666440964, + "learning_rate": 1.4175266026406308e-05, + "loss": 0.2843, + "num_input_tokens_seen": 89762448, + "step": 41580 + }, + { + "epoch": 6.783849918433932, + "grad_norm": 0.16707095503807068, + "learning_rate": 1.4168850322920602e-05, + "loss": 0.1877, + "num_input_tokens_seen": 89773200, + "step": 41585 + }, + { + "epoch": 6.784665579119086, + "grad_norm": 0.3392045199871063, + "learning_rate": 1.4162435497462842e-05, + "loss": 0.0711, + "num_input_tokens_seen": 89783952, + "step": 41590 + }, + { + "epoch": 6.785481239804241, + "grad_norm": 0.20206181704998016, + "learning_rate": 1.415602155055305e-05, + "loss": 0.0721, + "num_input_tokens_seen": 89796528, + "step": 41595 + }, + { + "epoch": 6.786296900489396, + "grad_norm": 0.1490764021873474, + "learning_rate": 1.4149608482711177e-05, + "loss": 0.0147, + "num_input_tokens_seen": 89807664, + "step": 41600 + }, + { + "epoch": 6.787112561174552, + "grad_norm": 0.6999619603157043, + "learning_rate": 1.4143196294457092e-05, + "loss": 0.148, + "num_input_tokens_seen": 89817552, + "step": 41605 + }, + { + "epoch": 6.787928221859707, + "grad_norm": 0.1922542303800583, + "learning_rate": 1.4136784986310603e-05, + "loss": 0.1022, + "num_input_tokens_seen": 89829008, + "step": 41610 + }, + { + "epoch": 6.788743882544861, + "grad_norm": 0.0754895806312561, + "learning_rate": 1.4130374558791442e-05, + "loss": 0.107, + "num_input_tokens_seen": 89839856, + "step": 41615 + }, + { + "epoch": 6.789559543230016, + "grad_norm": 0.16194146871566772, + "learning_rate": 1.412396501241926e-05, + "loss": 0.078, + "num_input_tokens_seen": 89851312, + "step": 41620 + }, + { + "epoch": 6.790375203915171, + "grad_norm": 0.29804185032844543, + "learning_rate": 1.411755634771367e-05, + "loss": 0.1487, + "num_input_tokens_seen": 89863248, + "step": 41625 + }, + { + "epoch": 6.791190864600326, + "grad_norm": 0.9781695008277893, + "learning_rate": 1.411114856519418e-05, + "loss": 0.1516, + "num_input_tokens_seen": 89875344, + "step": 41630 + }, + { + "epoch": 6.7920065252854815, + "grad_norm": 0.05921708792448044, + "learning_rate": 1.4104741665380236e-05, + "loss": 0.0264, + "num_input_tokens_seen": 89887056, + "step": 41635 + }, + { + "epoch": 6.792822185970636, + "grad_norm": 0.09756490588188171, + "learning_rate": 1.4098335648791216e-05, + "loss": 0.1462, + "num_input_tokens_seen": 89898064, + "step": 41640 + }, + { + "epoch": 6.793637846655791, + "grad_norm": 0.4092906415462494, + "learning_rate": 1.4091930515946422e-05, + "loss": 0.1536, + "num_input_tokens_seen": 89908816, + "step": 41645 + }, + { + "epoch": 6.794453507340946, + "grad_norm": 1.3258819580078125, + "learning_rate": 1.4085526267365084e-05, + "loss": 0.1361, + "num_input_tokens_seen": 89918896, + "step": 41650 + }, + { + "epoch": 6.795269168026101, + "grad_norm": 0.6176199316978455, + "learning_rate": 1.4079122903566371e-05, + "loss": 0.0647, + "num_input_tokens_seen": 89928880, + "step": 41655 + }, + { + "epoch": 6.7960848287112565, + "grad_norm": 0.6011408567428589, + "learning_rate": 1.4072720425069364e-05, + "loss": 0.0448, + "num_input_tokens_seen": 89939952, + "step": 41660 + }, + { + "epoch": 6.796900489396411, + "grad_norm": 0.8552467823028564, + "learning_rate": 1.4066318832393086e-05, + "loss": 0.3028, + "num_input_tokens_seen": 89949360, + "step": 41665 + }, + { + "epoch": 6.797716150081566, + "grad_norm": 0.21547524631023407, + "learning_rate": 1.4059918126056478e-05, + "loss": 0.1, + "num_input_tokens_seen": 89960144, + "step": 41670 + }, + { + "epoch": 6.798531810766721, + "grad_norm": 1.8539447784423828, + "learning_rate": 1.405351830657841e-05, + "loss": 0.243, + "num_input_tokens_seen": 89971184, + "step": 41675 + }, + { + "epoch": 6.799347471451876, + "grad_norm": 0.09657890349626541, + "learning_rate": 1.4047119374477696e-05, + "loss": 0.0879, + "num_input_tokens_seen": 89982288, + "step": 41680 + }, + { + "epoch": 6.800163132137031, + "grad_norm": 0.043274346739053726, + "learning_rate": 1.4040721330273062e-05, + "loss": 0.0565, + "num_input_tokens_seen": 89993744, + "step": 41685 + }, + { + "epoch": 6.800978792822186, + "grad_norm": 0.6742495894432068, + "learning_rate": 1.4034324174483166e-05, + "loss": 0.0594, + "num_input_tokens_seen": 90005232, + "step": 41690 + }, + { + "epoch": 6.801794453507341, + "grad_norm": 0.8683315515518188, + "learning_rate": 1.4027927907626586e-05, + "loss": 0.1218, + "num_input_tokens_seen": 90016240, + "step": 41695 + }, + { + "epoch": 6.802610114192496, + "grad_norm": 0.09083940088748932, + "learning_rate": 1.4021532530221846e-05, + "loss": 0.0354, + "num_input_tokens_seen": 90026736, + "step": 41700 + }, + { + "epoch": 6.803425774877651, + "grad_norm": 0.721216082572937, + "learning_rate": 1.4015138042787381e-05, + "loss": 0.1696, + "num_input_tokens_seen": 90037296, + "step": 41705 + }, + { + "epoch": 6.804241435562806, + "grad_norm": 0.19738058745861053, + "learning_rate": 1.4008744445841566e-05, + "loss": 0.1039, + "num_input_tokens_seen": 90048176, + "step": 41710 + }, + { + "epoch": 6.80505709624796, + "grad_norm": 0.040248043835163116, + "learning_rate": 1.4002351739902691e-05, + "loss": 0.04, + "num_input_tokens_seen": 90059056, + "step": 41715 + }, + { + "epoch": 6.805872756933116, + "grad_norm": 0.19977092742919922, + "learning_rate": 1.3995959925488988e-05, + "loss": 0.0186, + "num_input_tokens_seen": 90069456, + "step": 41720 + }, + { + "epoch": 6.806688417618271, + "grad_norm": 0.08077409863471985, + "learning_rate": 1.3989569003118609e-05, + "loss": 0.0245, + "num_input_tokens_seen": 90081840, + "step": 41725 + }, + { + "epoch": 6.807504078303426, + "grad_norm": 1.7226002216339111, + "learning_rate": 1.398317897330963e-05, + "loss": 0.1679, + "num_input_tokens_seen": 90091600, + "step": 41730 + }, + { + "epoch": 6.808319738988581, + "grad_norm": 0.9770488739013672, + "learning_rate": 1.3976789836580062e-05, + "loss": 0.0504, + "num_input_tokens_seen": 90103344, + "step": 41735 + }, + { + "epoch": 6.809135399673735, + "grad_norm": 1.4511475563049316, + "learning_rate": 1.3970401593447843e-05, + "loss": 0.0734, + "num_input_tokens_seen": 90114352, + "step": 41740 + }, + { + "epoch": 6.809951060358891, + "grad_norm": 1.4693145751953125, + "learning_rate": 1.3964014244430834e-05, + "loss": 0.0663, + "num_input_tokens_seen": 90125584, + "step": 41745 + }, + { + "epoch": 6.810766721044046, + "grad_norm": 0.10828816890716553, + "learning_rate": 1.3957627790046826e-05, + "loss": 0.1349, + "num_input_tokens_seen": 90136304, + "step": 41750 + }, + { + "epoch": 6.811582381729201, + "grad_norm": 0.259795606136322, + "learning_rate": 1.3951242230813538e-05, + "loss": 0.0782, + "num_input_tokens_seen": 90147088, + "step": 41755 + }, + { + "epoch": 6.8123980424143555, + "grad_norm": 0.15904934704303741, + "learning_rate": 1.3944857567248615e-05, + "loss": 0.2322, + "num_input_tokens_seen": 90158512, + "step": 41760 + }, + { + "epoch": 6.81321370309951, + "grad_norm": 0.3405247926712036, + "learning_rate": 1.3938473799869622e-05, + "loss": 0.1149, + "num_input_tokens_seen": 90168880, + "step": 41765 + }, + { + "epoch": 6.814029363784666, + "grad_norm": 0.33581486344337463, + "learning_rate": 1.3932090929194075e-05, + "loss": 0.0422, + "num_input_tokens_seen": 90179696, + "step": 41770 + }, + { + "epoch": 6.814845024469821, + "grad_norm": 1.1211789846420288, + "learning_rate": 1.39257089557394e-05, + "loss": 0.13, + "num_input_tokens_seen": 90190960, + "step": 41775 + }, + { + "epoch": 6.815660685154976, + "grad_norm": 1.344489336013794, + "learning_rate": 1.3919327880022945e-05, + "loss": 0.1502, + "num_input_tokens_seen": 90202288, + "step": 41780 + }, + { + "epoch": 6.8164763458401305, + "grad_norm": 0.18413734436035156, + "learning_rate": 1.3912947702561995e-05, + "loss": 0.0312, + "num_input_tokens_seen": 90214288, + "step": 41785 + }, + { + "epoch": 6.817292006525285, + "grad_norm": 0.9149455428123474, + "learning_rate": 1.390656842387375e-05, + "loss": 0.1109, + "num_input_tokens_seen": 90225168, + "step": 41790 + }, + { + "epoch": 6.81810766721044, + "grad_norm": 0.03227173909544945, + "learning_rate": 1.3900190044475364e-05, + "loss": 0.0785, + "num_input_tokens_seen": 90235760, + "step": 41795 + }, + { + "epoch": 6.818923327895595, + "grad_norm": 0.034901853650808334, + "learning_rate": 1.3893812564883896e-05, + "loss": 0.2667, + "num_input_tokens_seen": 90246864, + "step": 41800 + }, + { + "epoch": 6.819738988580751, + "grad_norm": 0.08100741356611252, + "learning_rate": 1.3887435985616332e-05, + "loss": 0.2003, + "num_input_tokens_seen": 90257680, + "step": 41805 + }, + { + "epoch": 6.8205546492659055, + "grad_norm": 0.093824103474617, + "learning_rate": 1.3881060307189592e-05, + "loss": 0.1781, + "num_input_tokens_seen": 90268976, + "step": 41810 + }, + { + "epoch": 6.82137030995106, + "grad_norm": 0.43546268343925476, + "learning_rate": 1.387468553012052e-05, + "loss": 0.0853, + "num_input_tokens_seen": 90279696, + "step": 41815 + }, + { + "epoch": 6.822185970636215, + "grad_norm": 0.10256228595972061, + "learning_rate": 1.386831165492589e-05, + "loss": 0.171, + "num_input_tokens_seen": 90288624, + "step": 41820 + }, + { + "epoch": 6.82300163132137, + "grad_norm": 0.049173593521118164, + "learning_rate": 1.3861938682122396e-05, + "loss": 0.0942, + "num_input_tokens_seen": 90299984, + "step": 41825 + }, + { + "epoch": 6.823817292006526, + "grad_norm": 0.4126286506652832, + "learning_rate": 1.3855566612226666e-05, + "loss": 0.021, + "num_input_tokens_seen": 90309264, + "step": 41830 + }, + { + "epoch": 6.8246329526916805, + "grad_norm": 1.2892982959747314, + "learning_rate": 1.3849195445755258e-05, + "loss": 0.162, + "num_input_tokens_seen": 90320528, + "step": 41835 + }, + { + "epoch": 6.825448613376835, + "grad_norm": 0.0596906878054142, + "learning_rate": 1.3842825183224642e-05, + "loss": 0.06, + "num_input_tokens_seen": 90331184, + "step": 41840 + }, + { + "epoch": 6.82626427406199, + "grad_norm": 0.046671535819768906, + "learning_rate": 1.3836455825151229e-05, + "loss": 0.0362, + "num_input_tokens_seen": 90341200, + "step": 41845 + }, + { + "epoch": 6.827079934747145, + "grad_norm": 0.17064256966114044, + "learning_rate": 1.3830087372051347e-05, + "loss": 0.1331, + "num_input_tokens_seen": 90350992, + "step": 41850 + }, + { + "epoch": 6.827895595432301, + "grad_norm": 1.0883194208145142, + "learning_rate": 1.3823719824441262e-05, + "loss": 0.169, + "num_input_tokens_seen": 90361520, + "step": 41855 + }, + { + "epoch": 6.828711256117455, + "grad_norm": 0.721582293510437, + "learning_rate": 1.3817353182837151e-05, + "loss": 0.1996, + "num_input_tokens_seen": 90372112, + "step": 41860 + }, + { + "epoch": 6.82952691680261, + "grad_norm": 0.5719067454338074, + "learning_rate": 1.3810987447755136e-05, + "loss": 0.3253, + "num_input_tokens_seen": 90383856, + "step": 41865 + }, + { + "epoch": 6.830342577487765, + "grad_norm": 0.3186333179473877, + "learning_rate": 1.380462261971125e-05, + "loss": 0.1028, + "num_input_tokens_seen": 90393552, + "step": 41870 + }, + { + "epoch": 6.83115823817292, + "grad_norm": 2.139834403991699, + "learning_rate": 1.3798258699221456e-05, + "loss": 0.2302, + "num_input_tokens_seen": 90403056, + "step": 41875 + }, + { + "epoch": 6.831973898858075, + "grad_norm": 0.2818989157676697, + "learning_rate": 1.3791895686801653e-05, + "loss": 0.2046, + "num_input_tokens_seen": 90414160, + "step": 41880 + }, + { + "epoch": 6.8327895595432295, + "grad_norm": 0.7957868576049805, + "learning_rate": 1.3785533582967642e-05, + "loss": 0.1217, + "num_input_tokens_seen": 90426160, + "step": 41885 + }, + { + "epoch": 6.833605220228385, + "grad_norm": 2.335994243621826, + "learning_rate": 1.3779172388235192e-05, + "loss": 0.3362, + "num_input_tokens_seen": 90436816, + "step": 41890 + }, + { + "epoch": 6.83442088091354, + "grad_norm": 0.2380416840314865, + "learning_rate": 1.3772812103119964e-05, + "loss": 0.1061, + "num_input_tokens_seen": 90448144, + "step": 41895 + }, + { + "epoch": 6.835236541598695, + "grad_norm": 0.46907129883766174, + "learning_rate": 1.376645272813755e-05, + "loss": 0.0385, + "num_input_tokens_seen": 90458352, + "step": 41900 + }, + { + "epoch": 6.83605220228385, + "grad_norm": 1.8631188869476318, + "learning_rate": 1.3760094263803474e-05, + "loss": 0.2209, + "num_input_tokens_seen": 90469264, + "step": 41905 + }, + { + "epoch": 6.8368678629690045, + "grad_norm": 0.2997377812862396, + "learning_rate": 1.3753736710633192e-05, + "loss": 0.1456, + "num_input_tokens_seen": 90478832, + "step": 41910 + }, + { + "epoch": 6.83768352365416, + "grad_norm": 0.04588686674833298, + "learning_rate": 1.3747380069142073e-05, + "loss": 0.0304, + "num_input_tokens_seen": 90489840, + "step": 41915 + }, + { + "epoch": 6.838499184339315, + "grad_norm": 1.004978060722351, + "learning_rate": 1.3741024339845426e-05, + "loss": 0.0693, + "num_input_tokens_seen": 90501232, + "step": 41920 + }, + { + "epoch": 6.83931484502447, + "grad_norm": 0.29040512442588806, + "learning_rate": 1.373466952325847e-05, + "loss": 0.0157, + "num_input_tokens_seen": 90512560, + "step": 41925 + }, + { + "epoch": 6.840130505709625, + "grad_norm": 0.10021024942398071, + "learning_rate": 1.372831561989636e-05, + "loss": 0.0558, + "num_input_tokens_seen": 90523856, + "step": 41930 + }, + { + "epoch": 6.8409461663947795, + "grad_norm": 0.0835769921541214, + "learning_rate": 1.3721962630274171e-05, + "loss": 0.1635, + "num_input_tokens_seen": 90532720, + "step": 41935 + }, + { + "epoch": 6.841761827079935, + "grad_norm": 0.25311121344566345, + "learning_rate": 1.3715610554906922e-05, + "loss": 0.2375, + "num_input_tokens_seen": 90543888, + "step": 41940 + }, + { + "epoch": 6.84257748776509, + "grad_norm": 1.4774302244186401, + "learning_rate": 1.370925939430954e-05, + "loss": 0.25, + "num_input_tokens_seen": 90553712, + "step": 41945 + }, + { + "epoch": 6.843393148450245, + "grad_norm": 1.6008719205856323, + "learning_rate": 1.3702909148996878e-05, + "loss": 0.1139, + "num_input_tokens_seen": 90563920, + "step": 41950 + }, + { + "epoch": 6.8442088091354, + "grad_norm": 0.1234937310218811, + "learning_rate": 1.3696559819483722e-05, + "loss": 0.0329, + "num_input_tokens_seen": 90574544, + "step": 41955 + }, + { + "epoch": 6.8450244698205545, + "grad_norm": 0.22015871107578278, + "learning_rate": 1.3690211406284784e-05, + "loss": 0.0673, + "num_input_tokens_seen": 90586384, + "step": 41960 + }, + { + "epoch": 6.845840130505709, + "grad_norm": 0.33907127380371094, + "learning_rate": 1.368386390991469e-05, + "loss": 0.1606, + "num_input_tokens_seen": 90597776, + "step": 41965 + }, + { + "epoch": 6.846655791190865, + "grad_norm": 0.491997629404068, + "learning_rate": 1.3677517330888007e-05, + "loss": 0.0437, + "num_input_tokens_seen": 90610128, + "step": 41970 + }, + { + "epoch": 6.84747145187602, + "grad_norm": 0.1321546733379364, + "learning_rate": 1.3671171669719218e-05, + "loss": 0.1597, + "num_input_tokens_seen": 90621168, + "step": 41975 + }, + { + "epoch": 6.848287112561175, + "grad_norm": 0.2031065672636032, + "learning_rate": 1.3664826926922736e-05, + "loss": 0.0764, + "num_input_tokens_seen": 90631536, + "step": 41980 + }, + { + "epoch": 6.849102773246329, + "grad_norm": 1.3819838762283325, + "learning_rate": 1.3658483103012898e-05, + "loss": 0.1516, + "num_input_tokens_seen": 90641168, + "step": 41985 + }, + { + "epoch": 6.849918433931484, + "grad_norm": 0.0467129722237587, + "learning_rate": 1.3652140198503966e-05, + "loss": 0.0906, + "num_input_tokens_seen": 90651632, + "step": 41990 + }, + { + "epoch": 6.850734094616639, + "grad_norm": 0.08374538272619247, + "learning_rate": 1.364579821391012e-05, + "loss": 0.0467, + "num_input_tokens_seen": 90662128, + "step": 41995 + }, + { + "epoch": 6.851549755301795, + "grad_norm": 0.1462072730064392, + "learning_rate": 1.3639457149745489e-05, + "loss": 0.0705, + "num_input_tokens_seen": 90673808, + "step": 42000 + }, + { + "epoch": 6.85236541598695, + "grad_norm": 0.9360509514808655, + "learning_rate": 1.3633117006524102e-05, + "loss": 0.0802, + "num_input_tokens_seen": 90684752, + "step": 42005 + }, + { + "epoch": 6.853181076672104, + "grad_norm": 0.11792770773172379, + "learning_rate": 1.3626777784759925e-05, + "loss": 0.0892, + "num_input_tokens_seen": 90695312, + "step": 42010 + }, + { + "epoch": 6.853996737357259, + "grad_norm": 1.5252859592437744, + "learning_rate": 1.3620439484966851e-05, + "loss": 0.0896, + "num_input_tokens_seen": 90705520, + "step": 42015 + }, + { + "epoch": 6.854812398042414, + "grad_norm": 0.2888747751712799, + "learning_rate": 1.3614102107658693e-05, + "loss": 0.0783, + "num_input_tokens_seen": 90717584, + "step": 42020 + }, + { + "epoch": 6.85562805872757, + "grad_norm": 0.6367222666740417, + "learning_rate": 1.3607765653349185e-05, + "loss": 0.0672, + "num_input_tokens_seen": 90728496, + "step": 42025 + }, + { + "epoch": 6.856443719412725, + "grad_norm": 1.5500949621200562, + "learning_rate": 1.3601430122552e-05, + "loss": 0.1414, + "num_input_tokens_seen": 90738960, + "step": 42030 + }, + { + "epoch": 6.857259380097879, + "grad_norm": 0.05169842019677162, + "learning_rate": 1.3595095515780726e-05, + "loss": 0.0717, + "num_input_tokens_seen": 90750352, + "step": 42035 + }, + { + "epoch": 6.858075040783034, + "grad_norm": 0.14070795476436615, + "learning_rate": 1.3588761833548875e-05, + "loss": 0.1421, + "num_input_tokens_seen": 90761040, + "step": 42040 + }, + { + "epoch": 6.858890701468189, + "grad_norm": 0.944865345954895, + "learning_rate": 1.358242907636989e-05, + "loss": 0.1228, + "num_input_tokens_seen": 90770992, + "step": 42045 + }, + { + "epoch": 6.859706362153344, + "grad_norm": 0.4509557783603668, + "learning_rate": 1.3576097244757138e-05, + "loss": 0.0888, + "num_input_tokens_seen": 90779088, + "step": 42050 + }, + { + "epoch": 6.8605220228384995, + "grad_norm": 0.1629170924425125, + "learning_rate": 1.3569766339223907e-05, + "loss": 0.2382, + "num_input_tokens_seen": 90789616, + "step": 42055 + }, + { + "epoch": 6.861337683523654, + "grad_norm": 2.3492677211761475, + "learning_rate": 1.3563436360283412e-05, + "loss": 0.1423, + "num_input_tokens_seen": 90800496, + "step": 42060 + }, + { + "epoch": 6.862153344208809, + "grad_norm": 1.9209667444229126, + "learning_rate": 1.3557107308448796e-05, + "loss": 0.2274, + "num_input_tokens_seen": 90809808, + "step": 42065 + }, + { + "epoch": 6.862969004893964, + "grad_norm": 0.328256756067276, + "learning_rate": 1.355077918423312e-05, + "loss": 0.2282, + "num_input_tokens_seen": 90820880, + "step": 42070 + }, + { + "epoch": 6.863784665579119, + "grad_norm": 0.37291550636291504, + "learning_rate": 1.3544451988149376e-05, + "loss": 0.0775, + "num_input_tokens_seen": 90831696, + "step": 42075 + }, + { + "epoch": 6.864600326264274, + "grad_norm": 0.5611517429351807, + "learning_rate": 1.353812572071047e-05, + "loss": 0.036, + "num_input_tokens_seen": 90842512, + "step": 42080 + }, + { + "epoch": 6.865415986949429, + "grad_norm": 1.0058684349060059, + "learning_rate": 1.353180038242926e-05, + "loss": 0.0829, + "num_input_tokens_seen": 90852784, + "step": 42085 + }, + { + "epoch": 6.866231647634584, + "grad_norm": 0.5713176727294922, + "learning_rate": 1.3525475973818502e-05, + "loss": 0.2293, + "num_input_tokens_seen": 90863280, + "step": 42090 + }, + { + "epoch": 6.867047308319739, + "grad_norm": 0.2721981704235077, + "learning_rate": 1.351915249539088e-05, + "loss": 0.2136, + "num_input_tokens_seen": 90874160, + "step": 42095 + }, + { + "epoch": 6.867862969004894, + "grad_norm": 0.4537580609321594, + "learning_rate": 1.3512829947659011e-05, + "loss": 0.2215, + "num_input_tokens_seen": 90884752, + "step": 42100 + }, + { + "epoch": 6.868678629690049, + "grad_norm": 0.08835228532552719, + "learning_rate": 1.350650833113542e-05, + "loss": 0.1466, + "num_input_tokens_seen": 90894704, + "step": 42105 + }, + { + "epoch": 6.869494290375204, + "grad_norm": 0.0654202476143837, + "learning_rate": 1.3500187646332593e-05, + "loss": 0.1761, + "num_input_tokens_seen": 90904528, + "step": 42110 + }, + { + "epoch": 6.870309951060359, + "grad_norm": 0.7610039710998535, + "learning_rate": 1.3493867893762904e-05, + "loss": 0.0467, + "num_input_tokens_seen": 90915696, + "step": 42115 + }, + { + "epoch": 6.871125611745514, + "grad_norm": 0.44919532537460327, + "learning_rate": 1.3487549073938666e-05, + "loss": 0.0626, + "num_input_tokens_seen": 90925776, + "step": 42120 + }, + { + "epoch": 6.871941272430669, + "grad_norm": 1.2861765623092651, + "learning_rate": 1.3481231187372111e-05, + "loss": 0.0429, + "num_input_tokens_seen": 90936144, + "step": 42125 + }, + { + "epoch": 6.872756933115824, + "grad_norm": 0.9947662353515625, + "learning_rate": 1.3474914234575406e-05, + "loss": 0.1917, + "num_input_tokens_seen": 90946480, + "step": 42130 + }, + { + "epoch": 6.873572593800979, + "grad_norm": 1.4666508436203003, + "learning_rate": 1.346859821606063e-05, + "loss": 0.0735, + "num_input_tokens_seen": 90956720, + "step": 42135 + }, + { + "epoch": 6.874388254486134, + "grad_norm": 0.2441418170928955, + "learning_rate": 1.3462283132339787e-05, + "loss": 0.1227, + "num_input_tokens_seen": 90967312, + "step": 42140 + }, + { + "epoch": 6.875203915171289, + "grad_norm": 0.7206056118011475, + "learning_rate": 1.3455968983924822e-05, + "loss": 0.2432, + "num_input_tokens_seen": 90978000, + "step": 42145 + }, + { + "epoch": 6.876019575856444, + "grad_norm": 0.844956636428833, + "learning_rate": 1.344965577132758e-05, + "loss": 0.078, + "num_input_tokens_seen": 90988304, + "step": 42150 + }, + { + "epoch": 6.876835236541599, + "grad_norm": 0.960738480091095, + "learning_rate": 1.344334349505985e-05, + "loss": 0.1263, + "num_input_tokens_seen": 90998288, + "step": 42155 + }, + { + "epoch": 6.877650897226753, + "grad_norm": 0.05621453374624252, + "learning_rate": 1.3437032155633333e-05, + "loss": 0.0763, + "num_input_tokens_seen": 91009936, + "step": 42160 + }, + { + "epoch": 6.878466557911908, + "grad_norm": 1.056253433227539, + "learning_rate": 1.343072175355966e-05, + "loss": 0.0888, + "num_input_tokens_seen": 91021456, + "step": 42165 + }, + { + "epoch": 6.879282218597064, + "grad_norm": 0.24932460486888885, + "learning_rate": 1.342441228935038e-05, + "loss": 0.1865, + "num_input_tokens_seen": 91032784, + "step": 42170 + }, + { + "epoch": 6.880097879282219, + "grad_norm": 1.0610382556915283, + "learning_rate": 1.3418103763516979e-05, + "loss": 0.1293, + "num_input_tokens_seen": 91043952, + "step": 42175 + }, + { + "epoch": 6.8809135399673735, + "grad_norm": 1.0988423824310303, + "learning_rate": 1.3411796176570852e-05, + "loss": 0.142, + "num_input_tokens_seen": 91054704, + "step": 42180 + }, + { + "epoch": 6.881729200652528, + "grad_norm": 0.31195324659347534, + "learning_rate": 1.3405489529023322e-05, + "loss": 0.1809, + "num_input_tokens_seen": 91064176, + "step": 42185 + }, + { + "epoch": 6.882544861337683, + "grad_norm": 1.2736576795578003, + "learning_rate": 1.339918382138564e-05, + "loss": 0.1075, + "num_input_tokens_seen": 91074768, + "step": 42190 + }, + { + "epoch": 6.883360522022839, + "grad_norm": 1.2702244520187378, + "learning_rate": 1.3392879054168983e-05, + "loss": 0.2004, + "num_input_tokens_seen": 91085520, + "step": 42195 + }, + { + "epoch": 6.884176182707994, + "grad_norm": 1.2357860803604126, + "learning_rate": 1.3386575227884443e-05, + "loss": 0.27, + "num_input_tokens_seen": 91094800, + "step": 42200 + }, + { + "epoch": 6.8849918433931485, + "grad_norm": 0.5456552505493164, + "learning_rate": 1.3380272343043032e-05, + "loss": 0.0863, + "num_input_tokens_seen": 91105840, + "step": 42205 + }, + { + "epoch": 6.885807504078303, + "grad_norm": 0.07232990860939026, + "learning_rate": 1.337397040015571e-05, + "loss": 0.2077, + "num_input_tokens_seen": 91116624, + "step": 42210 + }, + { + "epoch": 6.886623164763458, + "grad_norm": 1.2114051580429077, + "learning_rate": 1.336766939973334e-05, + "loss": 0.194, + "num_input_tokens_seen": 91128176, + "step": 42215 + }, + { + "epoch": 6.887438825448614, + "grad_norm": 0.4516364634037018, + "learning_rate": 1.3361369342286706e-05, + "loss": 0.1697, + "num_input_tokens_seen": 91138544, + "step": 42220 + }, + { + "epoch": 6.888254486133769, + "grad_norm": 0.1728890836238861, + "learning_rate": 1.3355070228326533e-05, + "loss": 0.1297, + "num_input_tokens_seen": 91148624, + "step": 42225 + }, + { + "epoch": 6.8890701468189235, + "grad_norm": 0.7646316885948181, + "learning_rate": 1.3348772058363448e-05, + "loss": 0.1549, + "num_input_tokens_seen": 91158672, + "step": 42230 + }, + { + "epoch": 6.889885807504078, + "grad_norm": 0.6690514087677002, + "learning_rate": 1.334247483290802e-05, + "loss": 0.0365, + "num_input_tokens_seen": 91169648, + "step": 42235 + }, + { + "epoch": 6.890701468189233, + "grad_norm": 0.44985848665237427, + "learning_rate": 1.3336178552470729e-05, + "loss": 0.2203, + "num_input_tokens_seen": 91180496, + "step": 42240 + }, + { + "epoch": 6.891517128874388, + "grad_norm": 1.7271960973739624, + "learning_rate": 1.332988321756198e-05, + "loss": 0.1419, + "num_input_tokens_seen": 91191440, + "step": 42245 + }, + { + "epoch": 6.892332789559543, + "grad_norm": 0.44644010066986084, + "learning_rate": 1.3323588828692119e-05, + "loss": 0.1349, + "num_input_tokens_seen": 91201584, + "step": 42250 + }, + { + "epoch": 6.8931484502446985, + "grad_norm": 0.33833152055740356, + "learning_rate": 1.3317295386371396e-05, + "loss": 0.0324, + "num_input_tokens_seen": 91213104, + "step": 42255 + }, + { + "epoch": 6.893964110929853, + "grad_norm": 0.253280907869339, + "learning_rate": 1.3311002891109981e-05, + "loss": 0.0637, + "num_input_tokens_seen": 91224560, + "step": 42260 + }, + { + "epoch": 6.894779771615008, + "grad_norm": 0.08677760511636734, + "learning_rate": 1.3304711343417985e-05, + "loss": 0.0972, + "num_input_tokens_seen": 91235824, + "step": 42265 + }, + { + "epoch": 6.895595432300163, + "grad_norm": 0.5767030119895935, + "learning_rate": 1.329842074380543e-05, + "loss": 0.0652, + "num_input_tokens_seen": 91247600, + "step": 42270 + }, + { + "epoch": 6.896411092985318, + "grad_norm": 0.1880982518196106, + "learning_rate": 1.3292131092782259e-05, + "loss": 0.1305, + "num_input_tokens_seen": 91259120, + "step": 42275 + }, + { + "epoch": 6.897226753670473, + "grad_norm": 0.2207702398300171, + "learning_rate": 1.328584239085835e-05, + "loss": 0.0369, + "num_input_tokens_seen": 91269040, + "step": 42280 + }, + { + "epoch": 6.898042414355628, + "grad_norm": 1.8528586626052856, + "learning_rate": 1.327955463854349e-05, + "loss": 0.0833, + "num_input_tokens_seen": 91280208, + "step": 42285 + }, + { + "epoch": 6.898858075040783, + "grad_norm": 0.13356901705265045, + "learning_rate": 1.3273267836347403e-05, + "loss": 0.0372, + "num_input_tokens_seen": 91291088, + "step": 42290 + }, + { + "epoch": 6.899673735725938, + "grad_norm": 0.17497427761554718, + "learning_rate": 1.3266981984779725e-05, + "loss": 0.0535, + "num_input_tokens_seen": 91302032, + "step": 42295 + }, + { + "epoch": 6.900489396411093, + "grad_norm": 0.0876205787062645, + "learning_rate": 1.3260697084350018e-05, + "loss": 0.0573, + "num_input_tokens_seen": 91313552, + "step": 42300 + }, + { + "epoch": 6.901305057096248, + "grad_norm": 0.3715498149394989, + "learning_rate": 1.3254413135567773e-05, + "loss": 0.0531, + "num_input_tokens_seen": 91323728, + "step": 42305 + }, + { + "epoch": 6.902120717781403, + "grad_norm": 0.2731800079345703, + "learning_rate": 1.3248130138942394e-05, + "loss": 0.1821, + "num_input_tokens_seen": 91334960, + "step": 42310 + }, + { + "epoch": 6.902936378466558, + "grad_norm": 0.0650930404663086, + "learning_rate": 1.3241848094983206e-05, + "loss": 0.0403, + "num_input_tokens_seen": 91346896, + "step": 42315 + }, + { + "epoch": 6.903752039151713, + "grad_norm": 0.09640514850616455, + "learning_rate": 1.3235567004199481e-05, + "loss": 0.1292, + "num_input_tokens_seen": 91357168, + "step": 42320 + }, + { + "epoch": 6.904567699836868, + "grad_norm": 0.787994384765625, + "learning_rate": 1.3229286867100388e-05, + "loss": 0.137, + "num_input_tokens_seen": 91368688, + "step": 42325 + }, + { + "epoch": 6.9053833605220225, + "grad_norm": 0.44825279712677, + "learning_rate": 1.3223007684195021e-05, + "loss": 0.081, + "num_input_tokens_seen": 91379600, + "step": 42330 + }, + { + "epoch": 6.906199021207177, + "grad_norm": 0.7117521166801453, + "learning_rate": 1.321672945599241e-05, + "loss": 0.0863, + "num_input_tokens_seen": 91389872, + "step": 42335 + }, + { + "epoch": 6.907014681892333, + "grad_norm": 1.3061370849609375, + "learning_rate": 1.3210452183001497e-05, + "loss": 0.1432, + "num_input_tokens_seen": 91400944, + "step": 42340 + }, + { + "epoch": 6.907830342577488, + "grad_norm": 1.1326236724853516, + "learning_rate": 1.320417586573115e-05, + "loss": 0.2397, + "num_input_tokens_seen": 91412080, + "step": 42345 + }, + { + "epoch": 6.908646003262643, + "grad_norm": 0.20075754821300507, + "learning_rate": 1.3197900504690161e-05, + "loss": 0.0708, + "num_input_tokens_seen": 91423952, + "step": 42350 + }, + { + "epoch": 6.9094616639477975, + "grad_norm": 0.5841072797775269, + "learning_rate": 1.3191626100387238e-05, + "loss": 0.1841, + "num_input_tokens_seen": 91434480, + "step": 42355 + }, + { + "epoch": 6.910277324632952, + "grad_norm": 2.0119566917419434, + "learning_rate": 1.3185352653331018e-05, + "loss": 0.2426, + "num_input_tokens_seen": 91444464, + "step": 42360 + }, + { + "epoch": 6.911092985318108, + "grad_norm": 0.3064592480659485, + "learning_rate": 1.3179080164030059e-05, + "loss": 0.157, + "num_input_tokens_seen": 91453104, + "step": 42365 + }, + { + "epoch": 6.911908646003263, + "grad_norm": 0.04096401482820511, + "learning_rate": 1.317280863299284e-05, + "loss": 0.0444, + "num_input_tokens_seen": 91463120, + "step": 42370 + }, + { + "epoch": 6.912724306688418, + "grad_norm": 1.846092939376831, + "learning_rate": 1.3166538060727765e-05, + "loss": 0.0685, + "num_input_tokens_seen": 91473104, + "step": 42375 + }, + { + "epoch": 6.9135399673735725, + "grad_norm": 0.16418392956256866, + "learning_rate": 1.3160268447743157e-05, + "loss": 0.2359, + "num_input_tokens_seen": 91484304, + "step": 42380 + }, + { + "epoch": 6.914355628058727, + "grad_norm": 0.12802951037883759, + "learning_rate": 1.3153999794547261e-05, + "loss": 0.0853, + "num_input_tokens_seen": 91495536, + "step": 42385 + }, + { + "epoch": 6.915171288743883, + "grad_norm": 0.04008619114756584, + "learning_rate": 1.3147732101648242e-05, + "loss": 0.0614, + "num_input_tokens_seen": 91505392, + "step": 42390 + }, + { + "epoch": 6.915986949429038, + "grad_norm": 0.0707477405667305, + "learning_rate": 1.31414653695542e-05, + "loss": 0.0415, + "num_input_tokens_seen": 91517200, + "step": 42395 + }, + { + "epoch": 6.916802610114193, + "grad_norm": 0.09512804448604584, + "learning_rate": 1.3135199598773152e-05, + "loss": 0.0565, + "num_input_tokens_seen": 91528592, + "step": 42400 + }, + { + "epoch": 6.917618270799347, + "grad_norm": 0.3014563024044037, + "learning_rate": 1.3128934789813021e-05, + "loss": 0.0434, + "num_input_tokens_seen": 91539216, + "step": 42405 + }, + { + "epoch": 6.918433931484502, + "grad_norm": 0.15657001733779907, + "learning_rate": 1.312267094318167e-05, + "loss": 0.2335, + "num_input_tokens_seen": 91549616, + "step": 42410 + }, + { + "epoch": 6.919249592169657, + "grad_norm": 0.12244506180286407, + "learning_rate": 1.3116408059386881e-05, + "loss": 0.0968, + "num_input_tokens_seen": 91560496, + "step": 42415 + }, + { + "epoch": 6.920065252854813, + "grad_norm": 0.31801390647888184, + "learning_rate": 1.3110146138936335e-05, + "loss": 0.1264, + "num_input_tokens_seen": 91571408, + "step": 42420 + }, + { + "epoch": 6.920880913539968, + "grad_norm": 0.8898562788963318, + "learning_rate": 1.3103885182337688e-05, + "loss": 0.08, + "num_input_tokens_seen": 91580496, + "step": 42425 + }, + { + "epoch": 6.921696574225122, + "grad_norm": 0.5923973321914673, + "learning_rate": 1.3097625190098464e-05, + "loss": 0.1754, + "num_input_tokens_seen": 91591984, + "step": 42430 + }, + { + "epoch": 6.922512234910277, + "grad_norm": 1.1500144004821777, + "learning_rate": 1.3091366162726136e-05, + "loss": 0.1027, + "num_input_tokens_seen": 91602672, + "step": 42435 + }, + { + "epoch": 6.923327895595432, + "grad_norm": 2.168994188308716, + "learning_rate": 1.3085108100728089e-05, + "loss": 0.072, + "num_input_tokens_seen": 91612752, + "step": 42440 + }, + { + "epoch": 6.924143556280587, + "grad_norm": 0.21804171800613403, + "learning_rate": 1.3078851004611636e-05, + "loss": 0.0564, + "num_input_tokens_seen": 91623184, + "step": 42445 + }, + { + "epoch": 6.924959216965743, + "grad_norm": 0.1488335132598877, + "learning_rate": 1.3072594874884008e-05, + "loss": 0.0657, + "num_input_tokens_seen": 91633488, + "step": 42450 + }, + { + "epoch": 6.925774877650897, + "grad_norm": 1.22910737991333, + "learning_rate": 1.3066339712052359e-05, + "loss": 0.2689, + "num_input_tokens_seen": 91645456, + "step": 42455 + }, + { + "epoch": 6.926590538336052, + "grad_norm": 0.19699357450008392, + "learning_rate": 1.3060085516623763e-05, + "loss": 0.2026, + "num_input_tokens_seen": 91656016, + "step": 42460 + }, + { + "epoch": 6.927406199021207, + "grad_norm": 0.679253339767456, + "learning_rate": 1.3053832289105216e-05, + "loss": 0.1594, + "num_input_tokens_seen": 91667728, + "step": 42465 + }, + { + "epoch": 6.928221859706362, + "grad_norm": 0.1958068311214447, + "learning_rate": 1.3047580030003642e-05, + "loss": 0.1142, + "num_input_tokens_seen": 91679280, + "step": 42470 + }, + { + "epoch": 6.9290375203915175, + "grad_norm": 0.5956099033355713, + "learning_rate": 1.3041328739825873e-05, + "loss": 0.0726, + "num_input_tokens_seen": 91690032, + "step": 42475 + }, + { + "epoch": 6.929853181076672, + "grad_norm": 0.6359840035438538, + "learning_rate": 1.3035078419078675e-05, + "loss": 0.1481, + "num_input_tokens_seen": 91700944, + "step": 42480 + }, + { + "epoch": 6.930668841761827, + "grad_norm": 0.39001190662384033, + "learning_rate": 1.302882906826873e-05, + "loss": 0.1046, + "num_input_tokens_seen": 91710096, + "step": 42485 + }, + { + "epoch": 6.931484502446982, + "grad_norm": 0.5873562097549438, + "learning_rate": 1.3022580687902641e-05, + "loss": 0.2004, + "num_input_tokens_seen": 91719856, + "step": 42490 + }, + { + "epoch": 6.932300163132137, + "grad_norm": 2.0966317653656006, + "learning_rate": 1.3016333278486936e-05, + "loss": 0.2291, + "num_input_tokens_seen": 91730448, + "step": 42495 + }, + { + "epoch": 6.933115823817292, + "grad_norm": 1.8772826194763184, + "learning_rate": 1.301008684052806e-05, + "loss": 0.1693, + "num_input_tokens_seen": 91742256, + "step": 42500 + }, + { + "epoch": 6.933931484502447, + "grad_norm": 1.7236248254776, + "learning_rate": 1.300384137453238e-05, + "loss": 0.1575, + "num_input_tokens_seen": 91752848, + "step": 42505 + }, + { + "epoch": 6.934747145187602, + "grad_norm": 1.0145195722579956, + "learning_rate": 1.2997596881006185e-05, + "loss": 0.1605, + "num_input_tokens_seen": 91764240, + "step": 42510 + }, + { + "epoch": 6.935562805872757, + "grad_norm": 0.5102953910827637, + "learning_rate": 1.2991353360455688e-05, + "loss": 0.1318, + "num_input_tokens_seen": 91775696, + "step": 42515 + }, + { + "epoch": 6.936378466557912, + "grad_norm": 0.1723252385854721, + "learning_rate": 1.298511081338702e-05, + "loss": 0.0364, + "num_input_tokens_seen": 91786576, + "step": 42520 + }, + { + "epoch": 6.937194127243067, + "grad_norm": 0.3373810946941376, + "learning_rate": 1.2978869240306219e-05, + "loss": 0.0672, + "num_input_tokens_seen": 91796880, + "step": 42525 + }, + { + "epoch": 6.938009787928221, + "grad_norm": 0.22910307347774506, + "learning_rate": 1.2972628641719286e-05, + "loss": 0.1864, + "num_input_tokens_seen": 91807344, + "step": 42530 + }, + { + "epoch": 6.938825448613377, + "grad_norm": 0.12457340210676193, + "learning_rate": 1.2966389018132097e-05, + "loss": 0.0277, + "num_input_tokens_seen": 91818640, + "step": 42535 + }, + { + "epoch": 6.939641109298532, + "grad_norm": 0.25950831174850464, + "learning_rate": 1.2960150370050475e-05, + "loss": 0.0575, + "num_input_tokens_seen": 91829616, + "step": 42540 + }, + { + "epoch": 6.940456769983687, + "grad_norm": 2.185061454772949, + "learning_rate": 1.2953912697980152e-05, + "loss": 0.0878, + "num_input_tokens_seen": 91839376, + "step": 42545 + }, + { + "epoch": 6.941272430668842, + "grad_norm": 0.09717545658349991, + "learning_rate": 1.2947676002426789e-05, + "loss": 0.1462, + "num_input_tokens_seen": 91849776, + "step": 42550 + }, + { + "epoch": 6.942088091353996, + "grad_norm": 1.4445070028305054, + "learning_rate": 1.2941440283895961e-05, + "loss": 0.3235, + "num_input_tokens_seen": 91861520, + "step": 42555 + }, + { + "epoch": 6.942903752039152, + "grad_norm": 0.27097034454345703, + "learning_rate": 1.2935205542893158e-05, + "loss": 0.0976, + "num_input_tokens_seen": 91872880, + "step": 42560 + }, + { + "epoch": 6.943719412724307, + "grad_norm": 0.7977014780044556, + "learning_rate": 1.2928971779923821e-05, + "loss": 0.2124, + "num_input_tokens_seen": 91884016, + "step": 42565 + }, + { + "epoch": 6.944535073409462, + "grad_norm": 2.751530408859253, + "learning_rate": 1.2922738995493277e-05, + "loss": 0.3285, + "num_input_tokens_seen": 91895504, + "step": 42570 + }, + { + "epoch": 6.945350734094617, + "grad_norm": 0.34596434235572815, + "learning_rate": 1.2916507190106792e-05, + "loss": 0.0792, + "num_input_tokens_seen": 91906288, + "step": 42575 + }, + { + "epoch": 6.946166394779771, + "grad_norm": 1.45978581905365, + "learning_rate": 1.2910276364269546e-05, + "loss": 0.1607, + "num_input_tokens_seen": 91916752, + "step": 42580 + }, + { + "epoch": 6.946982055464927, + "grad_norm": 1.2905007600784302, + "learning_rate": 1.2904046518486637e-05, + "loss": 0.1982, + "num_input_tokens_seen": 91926992, + "step": 42585 + }, + { + "epoch": 6.947797716150082, + "grad_norm": 0.1285146176815033, + "learning_rate": 1.2897817653263095e-05, + "loss": 0.1066, + "num_input_tokens_seen": 91935504, + "step": 42590 + }, + { + "epoch": 6.948613376835237, + "grad_norm": 2.040053606033325, + "learning_rate": 1.2891589769103856e-05, + "loss": 0.3036, + "num_input_tokens_seen": 91946320, + "step": 42595 + }, + { + "epoch": 6.9494290375203915, + "grad_norm": 0.6688337922096252, + "learning_rate": 1.288536286651379e-05, + "loss": 0.1006, + "num_input_tokens_seen": 91957008, + "step": 42600 + }, + { + "epoch": 6.950244698205546, + "grad_norm": 1.3063920736312866, + "learning_rate": 1.2879136945997677e-05, + "loss": 0.0747, + "num_input_tokens_seen": 91967824, + "step": 42605 + }, + { + "epoch": 6.951060358890701, + "grad_norm": 1.571235179901123, + "learning_rate": 1.2872912008060228e-05, + "loss": 0.08, + "num_input_tokens_seen": 91979024, + "step": 42610 + }, + { + "epoch": 6.951876019575856, + "grad_norm": 0.0410933755338192, + "learning_rate": 1.286668805320606e-05, + "loss": 0.185, + "num_input_tokens_seen": 91990288, + "step": 42615 + }, + { + "epoch": 6.952691680261012, + "grad_norm": 0.3299761116504669, + "learning_rate": 1.2860465081939727e-05, + "loss": 0.1106, + "num_input_tokens_seen": 92000592, + "step": 42620 + }, + { + "epoch": 6.9535073409461665, + "grad_norm": 0.0571131557226181, + "learning_rate": 1.2854243094765683e-05, + "loss": 0.1036, + "num_input_tokens_seen": 92009520, + "step": 42625 + }, + { + "epoch": 6.954323001631321, + "grad_norm": 0.09825420379638672, + "learning_rate": 1.2848022092188328e-05, + "loss": 0.0515, + "num_input_tokens_seen": 92020848, + "step": 42630 + }, + { + "epoch": 6.955138662316476, + "grad_norm": 2.2553465366363525, + "learning_rate": 1.2841802074711945e-05, + "loss": 0.131, + "num_input_tokens_seen": 92030384, + "step": 42635 + }, + { + "epoch": 6.955954323001631, + "grad_norm": 1.3592135906219482, + "learning_rate": 1.2835583042840788e-05, + "loss": 0.1645, + "num_input_tokens_seen": 92040528, + "step": 42640 + }, + { + "epoch": 6.956769983686787, + "grad_norm": 0.4033094346523285, + "learning_rate": 1.282936499707899e-05, + "loss": 0.0862, + "num_input_tokens_seen": 92052368, + "step": 42645 + }, + { + "epoch": 6.9575856443719415, + "grad_norm": 0.4028262197971344, + "learning_rate": 1.282314793793062e-05, + "loss": 0.1321, + "num_input_tokens_seen": 92063152, + "step": 42650 + }, + { + "epoch": 6.958401305057096, + "grad_norm": 0.49665313959121704, + "learning_rate": 1.2816931865899662e-05, + "loss": 0.0256, + "num_input_tokens_seen": 92073296, + "step": 42655 + }, + { + "epoch": 6.959216965742251, + "grad_norm": 0.9867948889732361, + "learning_rate": 1.2810716781490024e-05, + "loss": 0.1818, + "num_input_tokens_seen": 92084176, + "step": 42660 + }, + { + "epoch": 6.960032626427406, + "grad_norm": 1.8067654371261597, + "learning_rate": 1.2804502685205532e-05, + "loss": 0.1301, + "num_input_tokens_seen": 92095344, + "step": 42665 + }, + { + "epoch": 6.960848287112562, + "grad_norm": 0.9402809739112854, + "learning_rate": 1.2798289577549932e-05, + "loss": 0.0531, + "num_input_tokens_seen": 92106384, + "step": 42670 + }, + { + "epoch": 6.9616639477977165, + "grad_norm": 0.22139814496040344, + "learning_rate": 1.2792077459026886e-05, + "loss": 0.0454, + "num_input_tokens_seen": 92117104, + "step": 42675 + }, + { + "epoch": 6.962479608482871, + "grad_norm": 0.5394898653030396, + "learning_rate": 1.278586633013999e-05, + "loss": 0.1107, + "num_input_tokens_seen": 92127600, + "step": 42680 + }, + { + "epoch": 6.963295269168026, + "grad_norm": 0.3883289694786072, + "learning_rate": 1.2779656191392736e-05, + "loss": 0.1093, + "num_input_tokens_seen": 92138448, + "step": 42685 + }, + { + "epoch": 6.964110929853181, + "grad_norm": 0.37110090255737305, + "learning_rate": 1.2773447043288561e-05, + "loss": 0.0169, + "num_input_tokens_seen": 92148144, + "step": 42690 + }, + { + "epoch": 6.964926590538336, + "grad_norm": 1.4944911003112793, + "learning_rate": 1.2767238886330805e-05, + "loss": 0.3754, + "num_input_tokens_seen": 92158640, + "step": 42695 + }, + { + "epoch": 6.9657422512234906, + "grad_norm": 0.7446387410163879, + "learning_rate": 1.2761031721022732e-05, + "loss": 0.2179, + "num_input_tokens_seen": 92169008, + "step": 42700 + }, + { + "epoch": 6.966557911908646, + "grad_norm": 0.4771750271320343, + "learning_rate": 1.2754825547867519e-05, + "loss": 0.0991, + "num_input_tokens_seen": 92180432, + "step": 42705 + }, + { + "epoch": 6.967373572593801, + "grad_norm": 0.5785711407661438, + "learning_rate": 1.2748620367368286e-05, + "loss": 0.1615, + "num_input_tokens_seen": 92191376, + "step": 42710 + }, + { + "epoch": 6.968189233278956, + "grad_norm": 0.29439249634742737, + "learning_rate": 1.2742416180028053e-05, + "loss": 0.0738, + "num_input_tokens_seen": 92202256, + "step": 42715 + }, + { + "epoch": 6.969004893964111, + "grad_norm": 0.7612929940223694, + "learning_rate": 1.2736212986349755e-05, + "loss": 0.0195, + "num_input_tokens_seen": 92212400, + "step": 42720 + }, + { + "epoch": 6.9698205546492655, + "grad_norm": 1.3324565887451172, + "learning_rate": 1.2730010786836261e-05, + "loss": 0.4253, + "num_input_tokens_seen": 92223216, + "step": 42725 + }, + { + "epoch": 6.970636215334421, + "grad_norm": 0.5311837196350098, + "learning_rate": 1.2723809581990348e-05, + "loss": 0.0458, + "num_input_tokens_seen": 92234928, + "step": 42730 + }, + { + "epoch": 6.971451876019576, + "grad_norm": 0.6147965788841248, + "learning_rate": 1.271760937231472e-05, + "loss": 0.1513, + "num_input_tokens_seen": 92245392, + "step": 42735 + }, + { + "epoch": 6.972267536704731, + "grad_norm": 1.3729957342147827, + "learning_rate": 1.2711410158311987e-05, + "loss": 0.1586, + "num_input_tokens_seen": 92255824, + "step": 42740 + }, + { + "epoch": 6.973083197389886, + "grad_norm": 0.21498750150203705, + "learning_rate": 1.2705211940484707e-05, + "loss": 0.1258, + "num_input_tokens_seen": 92265904, + "step": 42745 + }, + { + "epoch": 6.9738988580750405, + "grad_norm": 0.06503775715827942, + "learning_rate": 1.2699014719335329e-05, + "loss": 0.112, + "num_input_tokens_seen": 92276176, + "step": 42750 + }, + { + "epoch": 6.974714518760196, + "grad_norm": 0.15373721718788147, + "learning_rate": 1.2692818495366236e-05, + "loss": 0.0784, + "num_input_tokens_seen": 92285744, + "step": 42755 + }, + { + "epoch": 6.975530179445351, + "grad_norm": 0.4419247806072235, + "learning_rate": 1.2686623269079717e-05, + "loss": 0.0492, + "num_input_tokens_seen": 92298128, + "step": 42760 + }, + { + "epoch": 6.976345840130506, + "grad_norm": 0.03671379014849663, + "learning_rate": 1.2680429040977998e-05, + "loss": 0.045, + "num_input_tokens_seen": 92308752, + "step": 42765 + }, + { + "epoch": 6.977161500815661, + "grad_norm": 0.526970624923706, + "learning_rate": 1.2674235811563206e-05, + "loss": 0.2007, + "num_input_tokens_seen": 92319632, + "step": 42770 + }, + { + "epoch": 6.9779771615008155, + "grad_norm": 0.3195130228996277, + "learning_rate": 1.2668043581337401e-05, + "loss": 0.0653, + "num_input_tokens_seen": 92330576, + "step": 42775 + }, + { + "epoch": 6.97879282218597, + "grad_norm": 1.26130211353302, + "learning_rate": 1.2661852350802556e-05, + "loss": 0.0943, + "num_input_tokens_seen": 92340304, + "step": 42780 + }, + { + "epoch": 6.979608482871125, + "grad_norm": 0.3356194496154785, + "learning_rate": 1.2655662120460564e-05, + "loss": 0.1979, + "num_input_tokens_seen": 92350576, + "step": 42785 + }, + { + "epoch": 6.980424143556281, + "grad_norm": 1.0855339765548706, + "learning_rate": 1.2649472890813232e-05, + "loss": 0.0758, + "num_input_tokens_seen": 92359664, + "step": 42790 + }, + { + "epoch": 6.981239804241436, + "grad_norm": 0.1743355393409729, + "learning_rate": 1.2643284662362295e-05, + "loss": 0.1024, + "num_input_tokens_seen": 92370032, + "step": 42795 + }, + { + "epoch": 6.9820554649265905, + "grad_norm": 1.3280683755874634, + "learning_rate": 1.2637097435609402e-05, + "loss": 0.1767, + "num_input_tokens_seen": 92380976, + "step": 42800 + }, + { + "epoch": 6.982871125611745, + "grad_norm": 0.3560836613178253, + "learning_rate": 1.2630911211056116e-05, + "loss": 0.1198, + "num_input_tokens_seen": 92393136, + "step": 42805 + }, + { + "epoch": 6.9836867862969, + "grad_norm": 0.08560378104448318, + "learning_rate": 1.2624725989203929e-05, + "loss": 0.1371, + "num_input_tokens_seen": 92404208, + "step": 42810 + }, + { + "epoch": 6.984502446982056, + "grad_norm": 0.06789915263652802, + "learning_rate": 1.2618541770554243e-05, + "loss": 0.1505, + "num_input_tokens_seen": 92415152, + "step": 42815 + }, + { + "epoch": 6.985318107667211, + "grad_norm": 0.1525154560804367, + "learning_rate": 1.2612358555608388e-05, + "loss": 0.0255, + "num_input_tokens_seen": 92426192, + "step": 42820 + }, + { + "epoch": 6.986133768352365, + "grad_norm": 0.21540388464927673, + "learning_rate": 1.2606176344867598e-05, + "loss": 0.0913, + "num_input_tokens_seen": 92438064, + "step": 42825 + }, + { + "epoch": 6.98694942903752, + "grad_norm": 0.9003202319145203, + "learning_rate": 1.2599995138833043e-05, + "loss": 0.0894, + "num_input_tokens_seen": 92447280, + "step": 42830 + }, + { + "epoch": 6.987765089722675, + "grad_norm": 0.21866236627101898, + "learning_rate": 1.25938149380058e-05, + "loss": 0.0566, + "num_input_tokens_seen": 92458416, + "step": 42835 + }, + { + "epoch": 6.988580750407831, + "grad_norm": 0.18182969093322754, + "learning_rate": 1.258763574288686e-05, + "loss": 0.0858, + "num_input_tokens_seen": 92469680, + "step": 42840 + }, + { + "epoch": 6.989396411092986, + "grad_norm": 0.7893223166465759, + "learning_rate": 1.2581457553977144e-05, + "loss": 0.0702, + "num_input_tokens_seen": 92480592, + "step": 42845 + }, + { + "epoch": 6.99021207177814, + "grad_norm": 0.14826340973377228, + "learning_rate": 1.2575280371777496e-05, + "loss": 0.0892, + "num_input_tokens_seen": 92492336, + "step": 42850 + }, + { + "epoch": 6.991027732463295, + "grad_norm": 0.5450990200042725, + "learning_rate": 1.2569104196788665e-05, + "loss": 0.0675, + "num_input_tokens_seen": 92502608, + "step": 42855 + }, + { + "epoch": 6.99184339314845, + "grad_norm": 0.7676130533218384, + "learning_rate": 1.256292902951132e-05, + "loss": 0.0577, + "num_input_tokens_seen": 92513520, + "step": 42860 + }, + { + "epoch": 6.992659053833605, + "grad_norm": 1.141149640083313, + "learning_rate": 1.2556754870446053e-05, + "loss": 0.1034, + "num_input_tokens_seen": 92524656, + "step": 42865 + }, + { + "epoch": 6.993474714518761, + "grad_norm": 0.617643415927887, + "learning_rate": 1.255058172009337e-05, + "loss": 0.0543, + "num_input_tokens_seen": 92535408, + "step": 42870 + }, + { + "epoch": 6.994290375203915, + "grad_norm": 1.2355281114578247, + "learning_rate": 1.2544409578953697e-05, + "loss": 0.0697, + "num_input_tokens_seen": 92545744, + "step": 42875 + }, + { + "epoch": 6.99510603588907, + "grad_norm": 1.7911415100097656, + "learning_rate": 1.253823844752739e-05, + "loss": 0.0971, + "num_input_tokens_seen": 92556432, + "step": 42880 + }, + { + "epoch": 6.995921696574225, + "grad_norm": 1.4810765981674194, + "learning_rate": 1.25320683263147e-05, + "loss": 0.1058, + "num_input_tokens_seen": 92568592, + "step": 42885 + }, + { + "epoch": 6.99673735725938, + "grad_norm": 0.4161183536052704, + "learning_rate": 1.2525899215815818e-05, + "loss": 0.0524, + "num_input_tokens_seen": 92579632, + "step": 42890 + }, + { + "epoch": 6.997553017944535, + "grad_norm": 0.07790544629096985, + "learning_rate": 1.2519731116530837e-05, + "loss": 0.0421, + "num_input_tokens_seen": 92591344, + "step": 42895 + }, + { + "epoch": 6.99836867862969, + "grad_norm": 0.05763031542301178, + "learning_rate": 1.2513564028959777e-05, + "loss": 0.1403, + "num_input_tokens_seen": 92601264, + "step": 42900 + }, + { + "epoch": 6.999184339314845, + "grad_norm": 0.6145094633102417, + "learning_rate": 1.2507397953602574e-05, + "loss": 0.2, + "num_input_tokens_seen": 92612464, + "step": 42905 + }, + { + "epoch": 7.0, + "grad_norm": 0.687351644039154, + "learning_rate": 1.2501232890959075e-05, + "loss": 0.0944, + "num_input_tokens_seen": 92621824, + "step": 42910 + }, + { + "epoch": 7.0, + "eval_loss": 0.13890694081783295, + "eval_runtime": 131.8074, + "eval_samples_per_second": 20.674, + "eval_steps_per_second": 5.174, + "num_input_tokens_seen": 92621824, + "step": 42910 + }, + { + "epoch": 7.000815660685155, + "grad_norm": 0.30172446370124817, + "learning_rate": 1.2495068841529058e-05, + "loss": 0.1204, + "num_input_tokens_seen": 92633024, + "step": 42915 + }, + { + "epoch": 7.00163132137031, + "grad_norm": 0.20368990302085876, + "learning_rate": 1.248890580581221e-05, + "loss": 0.1182, + "num_input_tokens_seen": 92644768, + "step": 42920 + }, + { + "epoch": 7.002446982055465, + "grad_norm": 0.06347157061100006, + "learning_rate": 1.248274378430814e-05, + "loss": 0.0305, + "num_input_tokens_seen": 92654240, + "step": 42925 + }, + { + "epoch": 7.00326264274062, + "grad_norm": 0.544950008392334, + "learning_rate": 1.2476582777516368e-05, + "loss": 0.0276, + "num_input_tokens_seen": 92664288, + "step": 42930 + }, + { + "epoch": 7.004078303425775, + "grad_norm": 0.025849448516964912, + "learning_rate": 1.2470422785936339e-05, + "loss": 0.1807, + "num_input_tokens_seen": 92675104, + "step": 42935 + }, + { + "epoch": 7.00489396411093, + "grad_norm": 0.6000098586082458, + "learning_rate": 1.2464263810067417e-05, + "loss": 0.046, + "num_input_tokens_seen": 92685056, + "step": 42940 + }, + { + "epoch": 7.005709624796085, + "grad_norm": 1.2154836654663086, + "learning_rate": 1.2458105850408874e-05, + "loss": 0.1426, + "num_input_tokens_seen": 92696000, + "step": 42945 + }, + { + "epoch": 7.006525285481239, + "grad_norm": 0.0670337975025177, + "learning_rate": 1.2451948907459907e-05, + "loss": 0.0831, + "num_input_tokens_seen": 92706272, + "step": 42950 + }, + { + "epoch": 7.007340946166395, + "grad_norm": 0.02401762269437313, + "learning_rate": 1.2445792981719622e-05, + "loss": 0.22, + "num_input_tokens_seen": 92717120, + "step": 42955 + }, + { + "epoch": 7.00815660685155, + "grad_norm": 0.04728851467370987, + "learning_rate": 1.2439638073687065e-05, + "loss": 0.0257, + "num_input_tokens_seen": 92726432, + "step": 42960 + }, + { + "epoch": 7.008972267536705, + "grad_norm": 0.4205707311630249, + "learning_rate": 1.2433484183861178e-05, + "loss": 0.1668, + "num_input_tokens_seen": 92736896, + "step": 42965 + }, + { + "epoch": 7.00978792822186, + "grad_norm": 0.351908802986145, + "learning_rate": 1.2427331312740822e-05, + "loss": 0.0881, + "num_input_tokens_seen": 92747904, + "step": 42970 + }, + { + "epoch": 7.010603588907014, + "grad_norm": 0.16453558206558228, + "learning_rate": 1.2421179460824787e-05, + "loss": 0.2038, + "num_input_tokens_seen": 92757920, + "step": 42975 + }, + { + "epoch": 7.011419249592169, + "grad_norm": 0.028769589960575104, + "learning_rate": 1.2415028628611767e-05, + "loss": 0.0693, + "num_input_tokens_seen": 92769664, + "step": 42980 + }, + { + "epoch": 7.012234910277325, + "grad_norm": 0.31584927439689636, + "learning_rate": 1.2408878816600384e-05, + "loss": 0.0581, + "num_input_tokens_seen": 92780256, + "step": 42985 + }, + { + "epoch": 7.01305057096248, + "grad_norm": 1.2007449865341187, + "learning_rate": 1.2402730025289166e-05, + "loss": 0.1662, + "num_input_tokens_seen": 92791744, + "step": 42990 + }, + { + "epoch": 7.013866231647635, + "grad_norm": 1.9432954788208008, + "learning_rate": 1.2396582255176575e-05, + "loss": 0.0665, + "num_input_tokens_seen": 92800928, + "step": 42995 + }, + { + "epoch": 7.014681892332789, + "grad_norm": 0.20111152529716492, + "learning_rate": 1.2390435506760973e-05, + "loss": 0.071, + "num_input_tokens_seen": 92812352, + "step": 43000 + }, + { + "epoch": 7.015497553017944, + "grad_norm": 0.2590343654155731, + "learning_rate": 1.238428978054065e-05, + "loss": 0.0704, + "num_input_tokens_seen": 92822944, + "step": 43005 + }, + { + "epoch": 7.0163132137031, + "grad_norm": 1.0643070936203003, + "learning_rate": 1.2378145077013808e-05, + "loss": 0.1536, + "num_input_tokens_seen": 92832960, + "step": 43010 + }, + { + "epoch": 7.017128874388255, + "grad_norm": 0.18007348477840424, + "learning_rate": 1.237200139667857e-05, + "loss": 0.2099, + "num_input_tokens_seen": 92843584, + "step": 43015 + }, + { + "epoch": 7.0179445350734095, + "grad_norm": 0.1535462886095047, + "learning_rate": 1.2365858740032962e-05, + "loss": 0.0415, + "num_input_tokens_seen": 92853440, + "step": 43020 + }, + { + "epoch": 7.018760195758564, + "grad_norm": 0.7421181797981262, + "learning_rate": 1.2359717107574959e-05, + "loss": 0.1662, + "num_input_tokens_seen": 92862112, + "step": 43025 + }, + { + "epoch": 7.019575856443719, + "grad_norm": 0.052963949739933014, + "learning_rate": 1.2353576499802425e-05, + "loss": 0.0957, + "num_input_tokens_seen": 92872000, + "step": 43030 + }, + { + "epoch": 7.020391517128874, + "grad_norm": 0.28444549441337585, + "learning_rate": 1.2347436917213145e-05, + "loss": 0.0494, + "num_input_tokens_seen": 92881856, + "step": 43035 + }, + { + "epoch": 7.02120717781403, + "grad_norm": 0.2557075023651123, + "learning_rate": 1.2341298360304828e-05, + "loss": 0.0478, + "num_input_tokens_seen": 92892768, + "step": 43040 + }, + { + "epoch": 7.0220228384991845, + "grad_norm": 0.4948076009750366, + "learning_rate": 1.2335160829575096e-05, + "loss": 0.1966, + "num_input_tokens_seen": 92904384, + "step": 43045 + }, + { + "epoch": 7.022838499184339, + "grad_norm": 0.44939708709716797, + "learning_rate": 1.2329024325521488e-05, + "loss": 0.1116, + "num_input_tokens_seen": 92916160, + "step": 43050 + }, + { + "epoch": 7.023654159869494, + "grad_norm": 1.8413771390914917, + "learning_rate": 1.2322888848641458e-05, + "loss": 0.1597, + "num_input_tokens_seen": 92924640, + "step": 43055 + }, + { + "epoch": 7.024469820554649, + "grad_norm": 0.04251477122306824, + "learning_rate": 1.2316754399432374e-05, + "loss": 0.0361, + "num_input_tokens_seen": 92935264, + "step": 43060 + }, + { + "epoch": 7.025285481239805, + "grad_norm": 0.07370211184024811, + "learning_rate": 1.231062097839154e-05, + "loss": 0.0385, + "num_input_tokens_seen": 92947232, + "step": 43065 + }, + { + "epoch": 7.0261011419249595, + "grad_norm": 0.09625212848186493, + "learning_rate": 1.2304488586016156e-05, + "loss": 0.1555, + "num_input_tokens_seen": 92958816, + "step": 43070 + }, + { + "epoch": 7.026916802610114, + "grad_norm": 1.3434367179870605, + "learning_rate": 1.2298357222803341e-05, + "loss": 0.2796, + "num_input_tokens_seen": 92969824, + "step": 43075 + }, + { + "epoch": 7.027732463295269, + "grad_norm": 0.3533608317375183, + "learning_rate": 1.2292226889250142e-05, + "loss": 0.1619, + "num_input_tokens_seen": 92980928, + "step": 43080 + }, + { + "epoch": 7.028548123980424, + "grad_norm": 0.8979074954986572, + "learning_rate": 1.2286097585853507e-05, + "loss": 0.181, + "num_input_tokens_seen": 92992480, + "step": 43085 + }, + { + "epoch": 7.029363784665579, + "grad_norm": 0.11029212921857834, + "learning_rate": 1.2279969313110313e-05, + "loss": 0.1257, + "num_input_tokens_seen": 93003232, + "step": 43090 + }, + { + "epoch": 7.0301794453507345, + "grad_norm": 1.0796918869018555, + "learning_rate": 1.2273842071517344e-05, + "loss": 0.1175, + "num_input_tokens_seen": 93013952, + "step": 43095 + }, + { + "epoch": 7.030995106035889, + "grad_norm": 1.6343796253204346, + "learning_rate": 1.2267715861571311e-05, + "loss": 0.1648, + "num_input_tokens_seen": 93024064, + "step": 43100 + }, + { + "epoch": 7.031810766721044, + "grad_norm": 1.3527694940567017, + "learning_rate": 1.2261590683768831e-05, + "loss": 0.0648, + "num_input_tokens_seen": 93036128, + "step": 43105 + }, + { + "epoch": 7.032626427406199, + "grad_norm": 1.2403696775436401, + "learning_rate": 1.2255466538606447e-05, + "loss": 0.0903, + "num_input_tokens_seen": 93047904, + "step": 43110 + }, + { + "epoch": 7.033442088091354, + "grad_norm": 0.8752784132957458, + "learning_rate": 1.224934342658061e-05, + "loss": 0.2216, + "num_input_tokens_seen": 93057568, + "step": 43115 + }, + { + "epoch": 7.034257748776509, + "grad_norm": 0.45822077989578247, + "learning_rate": 1.224322134818769e-05, + "loss": 0.076, + "num_input_tokens_seen": 93068192, + "step": 43120 + }, + { + "epoch": 7.035073409461664, + "grad_norm": 0.06094043329358101, + "learning_rate": 1.2237100303923977e-05, + "loss": 0.0553, + "num_input_tokens_seen": 93080032, + "step": 43125 + }, + { + "epoch": 7.035889070146819, + "grad_norm": 0.4186946153640747, + "learning_rate": 1.2230980294285669e-05, + "loss": 0.0838, + "num_input_tokens_seen": 93090784, + "step": 43130 + }, + { + "epoch": 7.036704730831974, + "grad_norm": 0.1897902637720108, + "learning_rate": 1.2224861319768887e-05, + "loss": 0.0187, + "num_input_tokens_seen": 93102176, + "step": 43135 + }, + { + "epoch": 7.037520391517129, + "grad_norm": 0.3836055099964142, + "learning_rate": 1.2218743380869669e-05, + "loss": 0.0518, + "num_input_tokens_seen": 93112928, + "step": 43140 + }, + { + "epoch": 7.0383360522022835, + "grad_norm": 0.17735405266284943, + "learning_rate": 1.2212626478083964e-05, + "loss": 0.053, + "num_input_tokens_seen": 93123872, + "step": 43145 + }, + { + "epoch": 7.039151712887439, + "grad_norm": 0.09428989142179489, + "learning_rate": 1.2206510611907638e-05, + "loss": 0.128, + "num_input_tokens_seen": 93134368, + "step": 43150 + }, + { + "epoch": 7.039967373572594, + "grad_norm": 1.3463586568832397, + "learning_rate": 1.2200395782836477e-05, + "loss": 0.1249, + "num_input_tokens_seen": 93146720, + "step": 43155 + }, + { + "epoch": 7.040783034257749, + "grad_norm": 1.4154990911483765, + "learning_rate": 1.2194281991366176e-05, + "loss": 0.096, + "num_input_tokens_seen": 93157632, + "step": 43160 + }, + { + "epoch": 7.041598694942904, + "grad_norm": 0.19105331599712372, + "learning_rate": 1.2188169237992345e-05, + "loss": 0.061, + "num_input_tokens_seen": 93167264, + "step": 43165 + }, + { + "epoch": 7.0424143556280585, + "grad_norm": 1.392549753189087, + "learning_rate": 1.218205752321053e-05, + "loss": 0.1662, + "num_input_tokens_seen": 93179136, + "step": 43170 + }, + { + "epoch": 7.043230016313213, + "grad_norm": 0.10710964351892471, + "learning_rate": 1.217594684751617e-05, + "loss": 0.1397, + "num_input_tokens_seen": 93189536, + "step": 43175 + }, + { + "epoch": 7.044045676998369, + "grad_norm": 0.06999605894088745, + "learning_rate": 1.2169837211404627e-05, + "loss": 0.0103, + "num_input_tokens_seen": 93201568, + "step": 43180 + }, + { + "epoch": 7.044861337683524, + "grad_norm": 0.05549885332584381, + "learning_rate": 1.2163728615371181e-05, + "loss": 0.1031, + "num_input_tokens_seen": 93212128, + "step": 43185 + }, + { + "epoch": 7.045676998368679, + "grad_norm": 1.3402695655822754, + "learning_rate": 1.2157621059911014e-05, + "loss": 0.3236, + "num_input_tokens_seen": 93222784, + "step": 43190 + }, + { + "epoch": 7.0464926590538335, + "grad_norm": 0.1182926744222641, + "learning_rate": 1.2151514545519254e-05, + "loss": 0.0502, + "num_input_tokens_seen": 93234016, + "step": 43195 + }, + { + "epoch": 7.047308319738988, + "grad_norm": 1.9286155700683594, + "learning_rate": 1.214540907269092e-05, + "loss": 0.1453, + "num_input_tokens_seen": 93243200, + "step": 43200 + }, + { + "epoch": 7.048123980424143, + "grad_norm": 0.1642090380191803, + "learning_rate": 1.2139304641920946e-05, + "loss": 0.017, + "num_input_tokens_seen": 93254432, + "step": 43205 + }, + { + "epoch": 7.048939641109299, + "grad_norm": 0.19246169924736023, + "learning_rate": 1.2133201253704196e-05, + "loss": 0.0393, + "num_input_tokens_seen": 93263360, + "step": 43210 + }, + { + "epoch": 7.049755301794454, + "grad_norm": 0.06965053826570511, + "learning_rate": 1.2127098908535434e-05, + "loss": 0.0836, + "num_input_tokens_seen": 93273376, + "step": 43215 + }, + { + "epoch": 7.0505709624796085, + "grad_norm": 1.3098580837249756, + "learning_rate": 1.2120997606909354e-05, + "loss": 0.1171, + "num_input_tokens_seen": 93284416, + "step": 43220 + }, + { + "epoch": 7.051386623164763, + "grad_norm": 1.120794653892517, + "learning_rate": 1.2114897349320553e-05, + "loss": 0.1127, + "num_input_tokens_seen": 93295776, + "step": 43225 + }, + { + "epoch": 7.052202283849918, + "grad_norm": 0.5372159481048584, + "learning_rate": 1.2108798136263555e-05, + "loss": 0.1249, + "num_input_tokens_seen": 93305408, + "step": 43230 + }, + { + "epoch": 7.053017944535074, + "grad_norm": 2.454467535018921, + "learning_rate": 1.2102699968232787e-05, + "loss": 0.3481, + "num_input_tokens_seen": 93316192, + "step": 43235 + }, + { + "epoch": 7.053833605220229, + "grad_norm": 0.4827018082141876, + "learning_rate": 1.2096602845722598e-05, + "loss": 0.1515, + "num_input_tokens_seen": 93327072, + "step": 43240 + }, + { + "epoch": 7.054649265905383, + "grad_norm": 0.19632606208324432, + "learning_rate": 1.2090506769227256e-05, + "loss": 0.029, + "num_input_tokens_seen": 93338368, + "step": 43245 + }, + { + "epoch": 7.055464926590538, + "grad_norm": 0.4960024654865265, + "learning_rate": 1.2084411739240936e-05, + "loss": 0.074, + "num_input_tokens_seen": 93349952, + "step": 43250 + }, + { + "epoch": 7.056280587275693, + "grad_norm": 1.5153740644454956, + "learning_rate": 1.2078317756257735e-05, + "loss": 0.1069, + "num_input_tokens_seen": 93360480, + "step": 43255 + }, + { + "epoch": 7.057096247960848, + "grad_norm": 0.9647220969200134, + "learning_rate": 1.207222482077166e-05, + "loss": 0.0744, + "num_input_tokens_seen": 93371104, + "step": 43260 + }, + { + "epoch": 7.057911908646004, + "grad_norm": 0.4893571436405182, + "learning_rate": 1.2066132933276636e-05, + "loss": 0.2336, + "num_input_tokens_seen": 93381792, + "step": 43265 + }, + { + "epoch": 7.058727569331158, + "grad_norm": 0.07052499800920486, + "learning_rate": 1.2060042094266495e-05, + "loss": 0.0174, + "num_input_tokens_seen": 93393024, + "step": 43270 + }, + { + "epoch": 7.059543230016313, + "grad_norm": 0.06043890863656998, + "learning_rate": 1.2053952304235002e-05, + "loss": 0.1032, + "num_input_tokens_seen": 93403968, + "step": 43275 + }, + { + "epoch": 7.060358890701468, + "grad_norm": 0.09427468478679657, + "learning_rate": 1.2047863563675826e-05, + "loss": 0.0412, + "num_input_tokens_seen": 93414336, + "step": 43280 + }, + { + "epoch": 7.061174551386623, + "grad_norm": 0.2018786519765854, + "learning_rate": 1.204177587308255e-05, + "loss": 0.0326, + "num_input_tokens_seen": 93425312, + "step": 43285 + }, + { + "epoch": 7.061990212071779, + "grad_norm": 1.4853837490081787, + "learning_rate": 1.2035689232948669e-05, + "loss": 0.177, + "num_input_tokens_seen": 93434368, + "step": 43290 + }, + { + "epoch": 7.062805872756933, + "grad_norm": 1.795634388923645, + "learning_rate": 1.20296036437676e-05, + "loss": 0.1188, + "num_input_tokens_seen": 93445696, + "step": 43295 + }, + { + "epoch": 7.063621533442088, + "grad_norm": 0.8794215321540833, + "learning_rate": 1.2023519106032672e-05, + "loss": 0.0814, + "num_input_tokens_seen": 93456768, + "step": 43300 + }, + { + "epoch": 7.064437194127243, + "grad_norm": 0.07432935386896133, + "learning_rate": 1.2017435620237125e-05, + "loss": 0.0935, + "num_input_tokens_seen": 93468320, + "step": 43305 + }, + { + "epoch": 7.065252854812398, + "grad_norm": 0.07413256913423538, + "learning_rate": 1.201135318687412e-05, + "loss": 0.1322, + "num_input_tokens_seen": 93480832, + "step": 43310 + }, + { + "epoch": 7.066068515497553, + "grad_norm": 0.08947181701660156, + "learning_rate": 1.2005271806436727e-05, + "loss": 0.0325, + "num_input_tokens_seen": 93490880, + "step": 43315 + }, + { + "epoch": 7.066884176182708, + "grad_norm": 0.038528602570295334, + "learning_rate": 1.199919147941794e-05, + "loss": 0.0127, + "num_input_tokens_seen": 93501600, + "step": 43320 + }, + { + "epoch": 7.067699836867863, + "grad_norm": 1.2617707252502441, + "learning_rate": 1.1993112206310656e-05, + "loss": 0.1448, + "num_input_tokens_seen": 93512832, + "step": 43325 + }, + { + "epoch": 7.068515497553018, + "grad_norm": 0.9598096013069153, + "learning_rate": 1.1987033987607681e-05, + "loss": 0.0603, + "num_input_tokens_seen": 93524416, + "step": 43330 + }, + { + "epoch": 7.069331158238173, + "grad_norm": 0.07148575037717819, + "learning_rate": 1.198095682380177e-05, + "loss": 0.056, + "num_input_tokens_seen": 93535744, + "step": 43335 + }, + { + "epoch": 7.070146818923328, + "grad_norm": 0.185064435005188, + "learning_rate": 1.1974880715385557e-05, + "loss": 0.037, + "num_input_tokens_seen": 93546016, + "step": 43340 + }, + { + "epoch": 7.0709624796084825, + "grad_norm": 1.1656848192214966, + "learning_rate": 1.1968805662851601e-05, + "loss": 0.0902, + "num_input_tokens_seen": 93555040, + "step": 43345 + }, + { + "epoch": 7.071778140293638, + "grad_norm": 1.8415180444717407, + "learning_rate": 1.1962731666692378e-05, + "loss": 0.0968, + "num_input_tokens_seen": 93565184, + "step": 43350 + }, + { + "epoch": 7.072593800978793, + "grad_norm": 0.0747024416923523, + "learning_rate": 1.1956658727400277e-05, + "loss": 0.0469, + "num_input_tokens_seen": 93576352, + "step": 43355 + }, + { + "epoch": 7.073409461663948, + "grad_norm": 0.08197686821222305, + "learning_rate": 1.1950586845467602e-05, + "loss": 0.1187, + "num_input_tokens_seen": 93586880, + "step": 43360 + }, + { + "epoch": 7.074225122349103, + "grad_norm": 1.4488219022750854, + "learning_rate": 1.1944516021386565e-05, + "loss": 0.1403, + "num_input_tokens_seen": 93598560, + "step": 43365 + }, + { + "epoch": 7.075040783034257, + "grad_norm": 0.8918521404266357, + "learning_rate": 1.1938446255649305e-05, + "loss": 0.2153, + "num_input_tokens_seen": 93608928, + "step": 43370 + }, + { + "epoch": 7.075856443719413, + "grad_norm": 0.2198517918586731, + "learning_rate": 1.1932377548747867e-05, + "loss": 0.2544, + "num_input_tokens_seen": 93619552, + "step": 43375 + }, + { + "epoch": 7.076672104404568, + "grad_norm": 1.1921428442001343, + "learning_rate": 1.1926309901174196e-05, + "loss": 0.0484, + "num_input_tokens_seen": 93631264, + "step": 43380 + }, + { + "epoch": 7.077487765089723, + "grad_norm": 0.9639406800270081, + "learning_rate": 1.192024331342019e-05, + "loss": 0.1761, + "num_input_tokens_seen": 93641792, + "step": 43385 + }, + { + "epoch": 7.078303425774878, + "grad_norm": 0.08272960782051086, + "learning_rate": 1.1914177785977629e-05, + "loss": 0.0705, + "num_input_tokens_seen": 93652992, + "step": 43390 + }, + { + "epoch": 7.079119086460032, + "grad_norm": 0.2586735188961029, + "learning_rate": 1.1908113319338212e-05, + "loss": 0.0507, + "num_input_tokens_seen": 93663552, + "step": 43395 + }, + { + "epoch": 7.079934747145187, + "grad_norm": 0.6195389628410339, + "learning_rate": 1.1902049913993558e-05, + "loss": 0.1166, + "num_input_tokens_seen": 93674272, + "step": 43400 + }, + { + "epoch": 7.080750407830343, + "grad_norm": 0.11993087828159332, + "learning_rate": 1.1895987570435196e-05, + "loss": 0.0156, + "num_input_tokens_seen": 93685536, + "step": 43405 + }, + { + "epoch": 7.081566068515498, + "grad_norm": 0.47249525785446167, + "learning_rate": 1.188992628915457e-05, + "loss": 0.092, + "num_input_tokens_seen": 93696640, + "step": 43410 + }, + { + "epoch": 7.082381729200653, + "grad_norm": 1.7892115116119385, + "learning_rate": 1.1883866070643041e-05, + "loss": 0.1054, + "num_input_tokens_seen": 93707008, + "step": 43415 + }, + { + "epoch": 7.083197389885807, + "grad_norm": 0.42850303649902344, + "learning_rate": 1.1877806915391875e-05, + "loss": 0.1045, + "num_input_tokens_seen": 93717376, + "step": 43420 + }, + { + "epoch": 7.084013050570962, + "grad_norm": 2.301645517349243, + "learning_rate": 1.1871748823892264e-05, + "loss": 0.1169, + "num_input_tokens_seen": 93728032, + "step": 43425 + }, + { + "epoch": 7.084828711256117, + "grad_norm": 0.050609201192855835, + "learning_rate": 1.1865691796635306e-05, + "loss": 0.1246, + "num_input_tokens_seen": 93737792, + "step": 43430 + }, + { + "epoch": 7.085644371941273, + "grad_norm": 0.33267953991889954, + "learning_rate": 1.1859635834112012e-05, + "loss": 0.0415, + "num_input_tokens_seen": 93749088, + "step": 43435 + }, + { + "epoch": 7.0864600326264275, + "grad_norm": 0.5135351419448853, + "learning_rate": 1.1853580936813313e-05, + "loss": 0.0392, + "num_input_tokens_seen": 93760512, + "step": 43440 + }, + { + "epoch": 7.087275693311582, + "grad_norm": 0.7883628606796265, + "learning_rate": 1.1847527105230047e-05, + "loss": 0.1556, + "num_input_tokens_seen": 93771712, + "step": 43445 + }, + { + "epoch": 7.088091353996737, + "grad_norm": 0.12630808353424072, + "learning_rate": 1.1841474339852968e-05, + "loss": 0.0434, + "num_input_tokens_seen": 93783008, + "step": 43450 + }, + { + "epoch": 7.088907014681892, + "grad_norm": 0.7241953015327454, + "learning_rate": 1.1835422641172744e-05, + "loss": 0.1346, + "num_input_tokens_seen": 93792928, + "step": 43455 + }, + { + "epoch": 7.089722675367048, + "grad_norm": 0.14356383681297302, + "learning_rate": 1.182937200967996e-05, + "loss": 0.029, + "num_input_tokens_seen": 93803872, + "step": 43460 + }, + { + "epoch": 7.0905383360522025, + "grad_norm": 0.5348718166351318, + "learning_rate": 1.1823322445865103e-05, + "loss": 0.1016, + "num_input_tokens_seen": 93815584, + "step": 43465 + }, + { + "epoch": 7.091353996737357, + "grad_norm": 0.8256776928901672, + "learning_rate": 1.1817273950218591e-05, + "loss": 0.1232, + "num_input_tokens_seen": 93827040, + "step": 43470 + }, + { + "epoch": 7.092169657422512, + "grad_norm": 0.09723673015832901, + "learning_rate": 1.1811226523230731e-05, + "loss": 0.0879, + "num_input_tokens_seen": 93836800, + "step": 43475 + }, + { + "epoch": 7.092985318107667, + "grad_norm": 1.1457345485687256, + "learning_rate": 1.1805180165391774e-05, + "loss": 0.1164, + "num_input_tokens_seen": 93848032, + "step": 43480 + }, + { + "epoch": 7.093800978792822, + "grad_norm": 1.5020318031311035, + "learning_rate": 1.1799134877191867e-05, + "loss": 0.2172, + "num_input_tokens_seen": 93858720, + "step": 43485 + }, + { + "epoch": 7.0946166394779775, + "grad_norm": 0.0897250771522522, + "learning_rate": 1.1793090659121065e-05, + "loss": 0.0556, + "num_input_tokens_seen": 93869696, + "step": 43490 + }, + { + "epoch": 7.095432300163132, + "grad_norm": 0.1721135675907135, + "learning_rate": 1.1787047511669347e-05, + "loss": 0.1289, + "num_input_tokens_seen": 93880672, + "step": 43495 + }, + { + "epoch": 7.096247960848287, + "grad_norm": 1.1130088567733765, + "learning_rate": 1.178100543532659e-05, + "loss": 0.1037, + "num_input_tokens_seen": 93891840, + "step": 43500 + }, + { + "epoch": 7.097063621533442, + "grad_norm": 0.07620468735694885, + "learning_rate": 1.1774964430582614e-05, + "loss": 0.1517, + "num_input_tokens_seen": 93900928, + "step": 43505 + }, + { + "epoch": 7.097879282218597, + "grad_norm": 1.0148735046386719, + "learning_rate": 1.1768924497927123e-05, + "loss": 0.113, + "num_input_tokens_seen": 93912480, + "step": 43510 + }, + { + "epoch": 7.0986949429037525, + "grad_norm": 0.32859304547309875, + "learning_rate": 1.1762885637849746e-05, + "loss": 0.0792, + "num_input_tokens_seen": 93924192, + "step": 43515 + }, + { + "epoch": 7.099510603588907, + "grad_norm": 0.9570627212524414, + "learning_rate": 1.1756847850840024e-05, + "loss": 0.0537, + "num_input_tokens_seen": 93934304, + "step": 43520 + }, + { + "epoch": 7.100326264274062, + "grad_norm": 0.5069044232368469, + "learning_rate": 1.1750811137387414e-05, + "loss": 0.2267, + "num_input_tokens_seen": 93944032, + "step": 43525 + }, + { + "epoch": 7.101141924959217, + "grad_norm": 0.6783347129821777, + "learning_rate": 1.1744775497981273e-05, + "loss": 0.0523, + "num_input_tokens_seen": 93954304, + "step": 43530 + }, + { + "epoch": 7.101957585644372, + "grad_norm": 0.0569581463932991, + "learning_rate": 1.173874093311089e-05, + "loss": 0.0742, + "num_input_tokens_seen": 93964864, + "step": 43535 + }, + { + "epoch": 7.102773246329527, + "grad_norm": 1.4545023441314697, + "learning_rate": 1.1732707443265453e-05, + "loss": 0.1495, + "num_input_tokens_seen": 93976448, + "step": 43540 + }, + { + "epoch": 7.103588907014682, + "grad_norm": 0.5992228984832764, + "learning_rate": 1.172667502893407e-05, + "loss": 0.1006, + "num_input_tokens_seen": 93986848, + "step": 43545 + }, + { + "epoch": 7.104404567699837, + "grad_norm": 0.5459602475166321, + "learning_rate": 1.1720643690605754e-05, + "loss": 0.113, + "num_input_tokens_seen": 93997472, + "step": 43550 + }, + { + "epoch": 7.105220228384992, + "grad_norm": 0.04968167841434479, + "learning_rate": 1.1714613428769442e-05, + "loss": 0.0282, + "num_input_tokens_seen": 94007744, + "step": 43555 + }, + { + "epoch": 7.106035889070147, + "grad_norm": 1.5377347469329834, + "learning_rate": 1.1708584243913972e-05, + "loss": 0.3569, + "num_input_tokens_seen": 94018880, + "step": 43560 + }, + { + "epoch": 7.1068515497553015, + "grad_norm": 0.058406930416822433, + "learning_rate": 1.1702556136528106e-05, + "loss": 0.0548, + "num_input_tokens_seen": 94030112, + "step": 43565 + }, + { + "epoch": 7.107667210440456, + "grad_norm": 0.5400405526161194, + "learning_rate": 1.169652910710051e-05, + "loss": 0.0967, + "num_input_tokens_seen": 94041120, + "step": 43570 + }, + { + "epoch": 7.108482871125612, + "grad_norm": 0.4645627737045288, + "learning_rate": 1.1690503156119764e-05, + "loss": 0.0684, + "num_input_tokens_seen": 94052928, + "step": 43575 + }, + { + "epoch": 7.109298531810767, + "grad_norm": 0.0698034018278122, + "learning_rate": 1.1684478284074365e-05, + "loss": 0.0637, + "num_input_tokens_seen": 94064480, + "step": 43580 + }, + { + "epoch": 7.110114192495922, + "grad_norm": 0.07647210359573364, + "learning_rate": 1.1678454491452717e-05, + "loss": 0.099, + "num_input_tokens_seen": 94076064, + "step": 43585 + }, + { + "epoch": 7.1109298531810765, + "grad_norm": 2.2529516220092773, + "learning_rate": 1.1672431778743133e-05, + "loss": 0.1069, + "num_input_tokens_seen": 94085216, + "step": 43590 + }, + { + "epoch": 7.111745513866231, + "grad_norm": 3.0240368843078613, + "learning_rate": 1.1666410146433861e-05, + "loss": 0.2068, + "num_input_tokens_seen": 94095904, + "step": 43595 + }, + { + "epoch": 7.112561174551387, + "grad_norm": 1.0509105920791626, + "learning_rate": 1.1660389595013038e-05, + "loss": 0.1266, + "num_input_tokens_seen": 94108288, + "step": 43600 + }, + { + "epoch": 7.113376835236542, + "grad_norm": 0.06457561254501343, + "learning_rate": 1.165437012496872e-05, + "loss": 0.0525, + "num_input_tokens_seen": 94117440, + "step": 43605 + }, + { + "epoch": 7.114192495921697, + "grad_norm": 0.04075410217046738, + "learning_rate": 1.1648351736788871e-05, + "loss": 0.086, + "num_input_tokens_seen": 94127488, + "step": 43610 + }, + { + "epoch": 7.1150081566068515, + "grad_norm": 0.6355909705162048, + "learning_rate": 1.1642334430961377e-05, + "loss": 0.0759, + "num_input_tokens_seen": 94137824, + "step": 43615 + }, + { + "epoch": 7.115823817292006, + "grad_norm": 0.7475014925003052, + "learning_rate": 1.163631820797403e-05, + "loss": 0.2718, + "num_input_tokens_seen": 94149632, + "step": 43620 + }, + { + "epoch": 7.116639477977161, + "grad_norm": 1.034792423248291, + "learning_rate": 1.1630303068314538e-05, + "loss": 0.2224, + "num_input_tokens_seen": 94161184, + "step": 43625 + }, + { + "epoch": 7.117455138662317, + "grad_norm": 0.07988115400075912, + "learning_rate": 1.1624289012470513e-05, + "loss": 0.0594, + "num_input_tokens_seen": 94170656, + "step": 43630 + }, + { + "epoch": 7.118270799347472, + "grad_norm": 0.04811939224600792, + "learning_rate": 1.161827604092949e-05, + "loss": 0.064, + "num_input_tokens_seen": 94180384, + "step": 43635 + }, + { + "epoch": 7.1190864600326265, + "grad_norm": 0.046138688921928406, + "learning_rate": 1.161226415417891e-05, + "loss": 0.0233, + "num_input_tokens_seen": 94191456, + "step": 43640 + }, + { + "epoch": 7.119902120717781, + "grad_norm": 0.1430777758359909, + "learning_rate": 1.1606253352706118e-05, + "loss": 0.1406, + "num_input_tokens_seen": 94202112, + "step": 43645 + }, + { + "epoch": 7.120717781402936, + "grad_norm": 1.8314709663391113, + "learning_rate": 1.1600243636998396e-05, + "loss": 0.202, + "num_input_tokens_seen": 94213312, + "step": 43650 + }, + { + "epoch": 7.121533442088092, + "grad_norm": 0.09517992287874222, + "learning_rate": 1.1594235007542914e-05, + "loss": 0.1176, + "num_input_tokens_seen": 94225248, + "step": 43655 + }, + { + "epoch": 7.122349102773247, + "grad_norm": 1.8900543451309204, + "learning_rate": 1.1588227464826763e-05, + "loss": 0.2114, + "num_input_tokens_seen": 94236416, + "step": 43660 + }, + { + "epoch": 7.123164763458401, + "grad_norm": 0.22674961388111115, + "learning_rate": 1.1582221009336944e-05, + "loss": 0.0957, + "num_input_tokens_seen": 94247040, + "step": 43665 + }, + { + "epoch": 7.123980424143556, + "grad_norm": 0.724075198173523, + "learning_rate": 1.157621564156037e-05, + "loss": 0.1205, + "num_input_tokens_seen": 94257472, + "step": 43670 + }, + { + "epoch": 7.124796084828711, + "grad_norm": 1.4566503763198853, + "learning_rate": 1.157021136198387e-05, + "loss": 0.1668, + "num_input_tokens_seen": 94267776, + "step": 43675 + }, + { + "epoch": 7.125611745513866, + "grad_norm": 1.4935842752456665, + "learning_rate": 1.156420817109418e-05, + "loss": 0.1205, + "num_input_tokens_seen": 94279552, + "step": 43680 + }, + { + "epoch": 7.126427406199022, + "grad_norm": 0.05477697402238846, + "learning_rate": 1.1558206069377945e-05, + "loss": 0.1332, + "num_input_tokens_seen": 94291424, + "step": 43685 + }, + { + "epoch": 7.127243066884176, + "grad_norm": 0.7386852502822876, + "learning_rate": 1.155220505732173e-05, + "loss": 0.0924, + "num_input_tokens_seen": 94302080, + "step": 43690 + }, + { + "epoch": 7.128058727569331, + "grad_norm": 0.9121895432472229, + "learning_rate": 1.1546205135412008e-05, + "loss": 0.1444, + "num_input_tokens_seen": 94313440, + "step": 43695 + }, + { + "epoch": 7.128874388254486, + "grad_norm": 0.5823668241500854, + "learning_rate": 1.1540206304135152e-05, + "loss": 0.0557, + "num_input_tokens_seen": 94322784, + "step": 43700 + }, + { + "epoch": 7.129690048939641, + "grad_norm": 1.4820971488952637, + "learning_rate": 1.1534208563977475e-05, + "loss": 0.1347, + "num_input_tokens_seen": 94333280, + "step": 43705 + }, + { + "epoch": 7.130505709624796, + "grad_norm": 1.3515021800994873, + "learning_rate": 1.1528211915425177e-05, + "loss": 0.099, + "num_input_tokens_seen": 94344032, + "step": 43710 + }, + { + "epoch": 7.131321370309951, + "grad_norm": 0.4585278332233429, + "learning_rate": 1.1522216358964377e-05, + "loss": 0.0987, + "num_input_tokens_seen": 94354816, + "step": 43715 + }, + { + "epoch": 7.132137030995106, + "grad_norm": 0.2807989716529846, + "learning_rate": 1.1516221895081104e-05, + "loss": 0.2339, + "num_input_tokens_seen": 94364896, + "step": 43720 + }, + { + "epoch": 7.132952691680261, + "grad_norm": 0.2682253122329712, + "learning_rate": 1.1510228524261302e-05, + "loss": 0.2936, + "num_input_tokens_seen": 94375648, + "step": 43725 + }, + { + "epoch": 7.133768352365416, + "grad_norm": 0.03510168567299843, + "learning_rate": 1.1504236246990819e-05, + "loss": 0.1986, + "num_input_tokens_seen": 94386816, + "step": 43730 + }, + { + "epoch": 7.134584013050571, + "grad_norm": 0.5013941526412964, + "learning_rate": 1.1498245063755425e-05, + "loss": 0.0344, + "num_input_tokens_seen": 94397856, + "step": 43735 + }, + { + "epoch": 7.135399673735726, + "grad_norm": 1.9259153604507446, + "learning_rate": 1.1492254975040792e-05, + "loss": 0.2321, + "num_input_tokens_seen": 94408128, + "step": 43740 + }, + { + "epoch": 7.136215334420881, + "grad_norm": 0.06313171982765198, + "learning_rate": 1.148626598133251e-05, + "loss": 0.1509, + "num_input_tokens_seen": 94418912, + "step": 43745 + }, + { + "epoch": 7.137030995106036, + "grad_norm": 0.3923463523387909, + "learning_rate": 1.1480278083116074e-05, + "loss": 0.0377, + "num_input_tokens_seen": 94428576, + "step": 43750 + }, + { + "epoch": 7.137846655791191, + "grad_norm": 0.025309910997748375, + "learning_rate": 1.1474291280876894e-05, + "loss": 0.3103, + "num_input_tokens_seen": 94439232, + "step": 43755 + }, + { + "epoch": 7.138662316476346, + "grad_norm": 0.7942724823951721, + "learning_rate": 1.1468305575100294e-05, + "loss": 0.0638, + "num_input_tokens_seen": 94450144, + "step": 43760 + }, + { + "epoch": 7.1394779771615005, + "grad_norm": 0.07212179154157639, + "learning_rate": 1.1462320966271503e-05, + "loss": 0.0814, + "num_input_tokens_seen": 94461664, + "step": 43765 + }, + { + "epoch": 7.140293637846656, + "grad_norm": 0.3908741772174835, + "learning_rate": 1.1456337454875663e-05, + "loss": 0.0151, + "num_input_tokens_seen": 94471392, + "step": 43770 + }, + { + "epoch": 7.141109298531811, + "grad_norm": 0.4054188132286072, + "learning_rate": 1.1450355041397829e-05, + "loss": 0.2488, + "num_input_tokens_seen": 94482848, + "step": 43775 + }, + { + "epoch": 7.141924959216966, + "grad_norm": 0.4695701599121094, + "learning_rate": 1.1444373726322966e-05, + "loss": 0.0852, + "num_input_tokens_seen": 94493792, + "step": 43780 + }, + { + "epoch": 7.142740619902121, + "grad_norm": 0.6680193543434143, + "learning_rate": 1.143839351013595e-05, + "loss": 0.0374, + "num_input_tokens_seen": 94504448, + "step": 43785 + }, + { + "epoch": 7.143556280587275, + "grad_norm": 0.3393155634403229, + "learning_rate": 1.1432414393321556e-05, + "loss": 0.1045, + "num_input_tokens_seen": 94515296, + "step": 43790 + }, + { + "epoch": 7.14437194127243, + "grad_norm": 1.0172944068908691, + "learning_rate": 1.1426436376364502e-05, + "loss": 0.1234, + "num_input_tokens_seen": 94525984, + "step": 43795 + }, + { + "epoch": 7.145187601957586, + "grad_norm": 0.26111164689064026, + "learning_rate": 1.1420459459749385e-05, + "loss": 0.1434, + "num_input_tokens_seen": 94536512, + "step": 43800 + }, + { + "epoch": 7.146003262642741, + "grad_norm": 0.09720335900783539, + "learning_rate": 1.141448364396073e-05, + "loss": 0.0586, + "num_input_tokens_seen": 94546208, + "step": 43805 + }, + { + "epoch": 7.146818923327896, + "grad_norm": 0.1997198462486267, + "learning_rate": 1.1408508929482961e-05, + "loss": 0.0343, + "num_input_tokens_seen": 94558656, + "step": 43810 + }, + { + "epoch": 7.14763458401305, + "grad_norm": 0.470443457365036, + "learning_rate": 1.1402535316800414e-05, + "loss": 0.04, + "num_input_tokens_seen": 94568832, + "step": 43815 + }, + { + "epoch": 7.148450244698205, + "grad_norm": 1.0941243171691895, + "learning_rate": 1.1396562806397354e-05, + "loss": 0.1723, + "num_input_tokens_seen": 94580512, + "step": 43820 + }, + { + "epoch": 7.149265905383361, + "grad_norm": 0.8565495610237122, + "learning_rate": 1.1390591398757935e-05, + "loss": 0.1682, + "num_input_tokens_seen": 94592608, + "step": 43825 + }, + { + "epoch": 7.150081566068516, + "grad_norm": 1.3844496011734009, + "learning_rate": 1.1384621094366232e-05, + "loss": 0.1742, + "num_input_tokens_seen": 94603200, + "step": 43830 + }, + { + "epoch": 7.150897226753671, + "grad_norm": 0.5400999784469604, + "learning_rate": 1.1378651893706227e-05, + "loss": 0.0754, + "num_input_tokens_seen": 94614656, + "step": 43835 + }, + { + "epoch": 7.151712887438825, + "grad_norm": 1.150897741317749, + "learning_rate": 1.1372683797261814e-05, + "loss": 0.0866, + "num_input_tokens_seen": 94624512, + "step": 43840 + }, + { + "epoch": 7.15252854812398, + "grad_norm": 0.09292787313461304, + "learning_rate": 1.1366716805516794e-05, + "loss": 0.1258, + "num_input_tokens_seen": 94635360, + "step": 43845 + }, + { + "epoch": 7.153344208809135, + "grad_norm": 0.04799726605415344, + "learning_rate": 1.1360750918954887e-05, + "loss": 0.0209, + "num_input_tokens_seen": 94645440, + "step": 43850 + }, + { + "epoch": 7.154159869494291, + "grad_norm": 0.04382926970720291, + "learning_rate": 1.1354786138059715e-05, + "loss": 0.1706, + "num_input_tokens_seen": 94656832, + "step": 43855 + }, + { + "epoch": 7.1549755301794455, + "grad_norm": 0.14911532402038574, + "learning_rate": 1.1348822463314815e-05, + "loss": 0.0231, + "num_input_tokens_seen": 94666816, + "step": 43860 + }, + { + "epoch": 7.1557911908646, + "grad_norm": 0.7569922208786011, + "learning_rate": 1.1342859895203629e-05, + "loss": 0.1999, + "num_input_tokens_seen": 94677888, + "step": 43865 + }, + { + "epoch": 7.156606851549755, + "grad_norm": 0.8676208853721619, + "learning_rate": 1.1336898434209517e-05, + "loss": 0.0726, + "num_input_tokens_seen": 94689088, + "step": 43870 + }, + { + "epoch": 7.15742251223491, + "grad_norm": 0.1024833396077156, + "learning_rate": 1.1330938080815743e-05, + "loss": 0.161, + "num_input_tokens_seen": 94700992, + "step": 43875 + }, + { + "epoch": 7.158238172920065, + "grad_norm": 0.16696979105472565, + "learning_rate": 1.1324978835505483e-05, + "loss": 0.0286, + "num_input_tokens_seen": 94710112, + "step": 43880 + }, + { + "epoch": 7.1590538336052205, + "grad_norm": 0.1251891702413559, + "learning_rate": 1.1319020698761828e-05, + "loss": 0.1454, + "num_input_tokens_seen": 94720544, + "step": 43885 + }, + { + "epoch": 7.159869494290375, + "grad_norm": 0.11861784011125565, + "learning_rate": 1.1313063671067769e-05, + "loss": 0.0569, + "num_input_tokens_seen": 94730496, + "step": 43890 + }, + { + "epoch": 7.16068515497553, + "grad_norm": 0.29811903834342957, + "learning_rate": 1.1307107752906218e-05, + "loss": 0.1938, + "num_input_tokens_seen": 94741696, + "step": 43895 + }, + { + "epoch": 7.161500815660685, + "grad_norm": 0.2100759893655777, + "learning_rate": 1.1301152944759988e-05, + "loss": 0.0854, + "num_input_tokens_seen": 94752000, + "step": 43900 + }, + { + "epoch": 7.16231647634584, + "grad_norm": 0.46032240986824036, + "learning_rate": 1.129519924711181e-05, + "loss": 0.0996, + "num_input_tokens_seen": 94763264, + "step": 43905 + }, + { + "epoch": 7.1631321370309955, + "grad_norm": 2.490309476852417, + "learning_rate": 1.1289246660444306e-05, + "loss": 0.2507, + "num_input_tokens_seen": 94773504, + "step": 43910 + }, + { + "epoch": 7.16394779771615, + "grad_norm": 0.35766226053237915, + "learning_rate": 1.1283295185240048e-05, + "loss": 0.2078, + "num_input_tokens_seen": 94783328, + "step": 43915 + }, + { + "epoch": 7.164763458401305, + "grad_norm": 0.23201867938041687, + "learning_rate": 1.1277344821981475e-05, + "loss": 0.0142, + "num_input_tokens_seen": 94795584, + "step": 43920 + }, + { + "epoch": 7.16557911908646, + "grad_norm": 0.04962139576673508, + "learning_rate": 1.1271395571150964e-05, + "loss": 0.1281, + "num_input_tokens_seen": 94805408, + "step": 43925 + }, + { + "epoch": 7.166394779771615, + "grad_norm": 0.9493265748023987, + "learning_rate": 1.1265447433230784e-05, + "loss": 0.1002, + "num_input_tokens_seen": 94817024, + "step": 43930 + }, + { + "epoch": 7.16721044045677, + "grad_norm": 1.4053759574890137, + "learning_rate": 1.1259500408703124e-05, + "loss": 0.212, + "num_input_tokens_seen": 94828480, + "step": 43935 + }, + { + "epoch": 7.168026101141925, + "grad_norm": 1.4531376361846924, + "learning_rate": 1.1253554498050078e-05, + "loss": 0.209, + "num_input_tokens_seen": 94839552, + "step": 43940 + }, + { + "epoch": 7.16884176182708, + "grad_norm": 0.10345472395420074, + "learning_rate": 1.1247609701753656e-05, + "loss": 0.1058, + "num_input_tokens_seen": 94851200, + "step": 43945 + }, + { + "epoch": 7.169657422512235, + "grad_norm": 0.08316493034362793, + "learning_rate": 1.1241666020295768e-05, + "loss": 0.1995, + "num_input_tokens_seen": 94863744, + "step": 43950 + }, + { + "epoch": 7.17047308319739, + "grad_norm": 0.2300613671541214, + "learning_rate": 1.1235723454158242e-05, + "loss": 0.1022, + "num_input_tokens_seen": 94873504, + "step": 43955 + }, + { + "epoch": 7.171288743882545, + "grad_norm": 0.01852325350046158, + "learning_rate": 1.1229782003822803e-05, + "loss": 0.0894, + "num_input_tokens_seen": 94883232, + "step": 43960 + }, + { + "epoch": 7.1721044045677, + "grad_norm": 0.4098668694496155, + "learning_rate": 1.1223841669771113e-05, + "loss": 0.0908, + "num_input_tokens_seen": 94893984, + "step": 43965 + }, + { + "epoch": 7.172920065252855, + "grad_norm": 0.13278281688690186, + "learning_rate": 1.121790245248472e-05, + "loss": 0.1586, + "num_input_tokens_seen": 94904960, + "step": 43970 + }, + { + "epoch": 7.17373572593801, + "grad_norm": 1.7536135911941528, + "learning_rate": 1.1211964352445078e-05, + "loss": 0.1637, + "num_input_tokens_seen": 94914656, + "step": 43975 + }, + { + "epoch": 7.174551386623165, + "grad_norm": 0.380927175283432, + "learning_rate": 1.120602737013357e-05, + "loss": 0.0895, + "num_input_tokens_seen": 94925632, + "step": 43980 + }, + { + "epoch": 7.1753670473083195, + "grad_norm": 0.38170111179351807, + "learning_rate": 1.120009150603147e-05, + "loss": 0.1036, + "num_input_tokens_seen": 94937376, + "step": 43985 + }, + { + "epoch": 7.176182707993474, + "grad_norm": 2.0042827129364014, + "learning_rate": 1.1194156760619976e-05, + "loss": 0.1123, + "num_input_tokens_seen": 94948608, + "step": 43990 + }, + { + "epoch": 7.17699836867863, + "grad_norm": 0.07771027833223343, + "learning_rate": 1.1188223134380183e-05, + "loss": 0.0589, + "num_input_tokens_seen": 94958368, + "step": 43995 + }, + { + "epoch": 7.177814029363785, + "grad_norm": 0.2125042974948883, + "learning_rate": 1.1182290627793105e-05, + "loss": 0.0235, + "num_input_tokens_seen": 94968576, + "step": 44000 + }, + { + "epoch": 7.17862969004894, + "grad_norm": 1.3435723781585693, + "learning_rate": 1.1176359241339656e-05, + "loss": 0.1783, + "num_input_tokens_seen": 94979552, + "step": 44005 + }, + { + "epoch": 7.1794453507340945, + "grad_norm": 1.219958782196045, + "learning_rate": 1.1170428975500668e-05, + "loss": 0.0488, + "num_input_tokens_seen": 94990656, + "step": 44010 + }, + { + "epoch": 7.180261011419249, + "grad_norm": 0.32448816299438477, + "learning_rate": 1.116449983075688e-05, + "loss": 0.1195, + "num_input_tokens_seen": 95000480, + "step": 44015 + }, + { + "epoch": 7.181076672104404, + "grad_norm": 0.18430666625499725, + "learning_rate": 1.1158571807588924e-05, + "loss": 0.0582, + "num_input_tokens_seen": 95011584, + "step": 44020 + }, + { + "epoch": 7.18189233278956, + "grad_norm": 0.05868301913142204, + "learning_rate": 1.115264490647738e-05, + "loss": 0.1406, + "num_input_tokens_seen": 95021824, + "step": 44025 + }, + { + "epoch": 7.182707993474715, + "grad_norm": 0.6292427778244019, + "learning_rate": 1.11467191279027e-05, + "loss": 0.0545, + "num_input_tokens_seen": 95033696, + "step": 44030 + }, + { + "epoch": 7.1835236541598695, + "grad_norm": 0.8049329519271851, + "learning_rate": 1.1140794472345259e-05, + "loss": 0.1097, + "num_input_tokens_seen": 95043936, + "step": 44035 + }, + { + "epoch": 7.184339314845024, + "grad_norm": 3.3577682971954346, + "learning_rate": 1.1134870940285339e-05, + "loss": 0.0717, + "num_input_tokens_seen": 95055040, + "step": 44040 + }, + { + "epoch": 7.185154975530179, + "grad_norm": 0.9915413856506348, + "learning_rate": 1.112894853220313e-05, + "loss": 0.1475, + "num_input_tokens_seen": 95066368, + "step": 44045 + }, + { + "epoch": 7.185970636215335, + "grad_norm": 1.2418285608291626, + "learning_rate": 1.1123027248578736e-05, + "loss": 0.2251, + "num_input_tokens_seen": 95077728, + "step": 44050 + }, + { + "epoch": 7.18678629690049, + "grad_norm": 0.31324154138565063, + "learning_rate": 1.1117107089892162e-05, + "loss": 0.1825, + "num_input_tokens_seen": 95088160, + "step": 44055 + }, + { + "epoch": 7.1876019575856445, + "grad_norm": 1.4987879991531372, + "learning_rate": 1.1111188056623328e-05, + "loss": 0.1644, + "num_input_tokens_seen": 95099424, + "step": 44060 + }, + { + "epoch": 7.188417618270799, + "grad_norm": 0.7614249587059021, + "learning_rate": 1.1105270149252062e-05, + "loss": 0.1447, + "num_input_tokens_seen": 95109600, + "step": 44065 + }, + { + "epoch": 7.189233278955954, + "grad_norm": 0.17971669137477875, + "learning_rate": 1.10993533682581e-05, + "loss": 0.183, + "num_input_tokens_seen": 95119424, + "step": 44070 + }, + { + "epoch": 7.190048939641109, + "grad_norm": 1.2091612815856934, + "learning_rate": 1.1093437714121085e-05, + "loss": 0.0999, + "num_input_tokens_seen": 95130368, + "step": 44075 + }, + { + "epoch": 7.190864600326265, + "grad_norm": 0.9040985107421875, + "learning_rate": 1.108752318732057e-05, + "loss": 0.0457, + "num_input_tokens_seen": 95142400, + "step": 44080 + }, + { + "epoch": 7.191680261011419, + "grad_norm": 0.11549914628267288, + "learning_rate": 1.1081609788336014e-05, + "loss": 0.2894, + "num_input_tokens_seen": 95153472, + "step": 44085 + }, + { + "epoch": 7.192495921696574, + "grad_norm": 0.9327459335327148, + "learning_rate": 1.1075697517646794e-05, + "loss": 0.2341, + "num_input_tokens_seen": 95163072, + "step": 44090 + }, + { + "epoch": 7.193311582381729, + "grad_norm": 0.17795923352241516, + "learning_rate": 1.1069786375732181e-05, + "loss": 0.0352, + "num_input_tokens_seen": 95174656, + "step": 44095 + }, + { + "epoch": 7.194127243066884, + "grad_norm": 0.6681300401687622, + "learning_rate": 1.1063876363071368e-05, + "loss": 0.0518, + "num_input_tokens_seen": 95184960, + "step": 44100 + }, + { + "epoch": 7.19494290375204, + "grad_norm": 1.6751807928085327, + "learning_rate": 1.1057967480143438e-05, + "loss": 0.1326, + "num_input_tokens_seen": 95194848, + "step": 44105 + }, + { + "epoch": 7.195758564437194, + "grad_norm": 1.3131769895553589, + "learning_rate": 1.1052059727427414e-05, + "loss": 0.0998, + "num_input_tokens_seen": 95207072, + "step": 44110 + }, + { + "epoch": 7.196574225122349, + "grad_norm": 0.042680736631155014, + "learning_rate": 1.1046153105402199e-05, + "loss": 0.1628, + "num_input_tokens_seen": 95217920, + "step": 44115 + }, + { + "epoch": 7.197389885807504, + "grad_norm": 2.276479959487915, + "learning_rate": 1.1040247614546617e-05, + "loss": 0.1355, + "num_input_tokens_seen": 95228224, + "step": 44120 + }, + { + "epoch": 7.198205546492659, + "grad_norm": 0.2741931974887848, + "learning_rate": 1.1034343255339391e-05, + "loss": 0.1025, + "num_input_tokens_seen": 95239008, + "step": 44125 + }, + { + "epoch": 7.199021207177814, + "grad_norm": 1.2851805686950684, + "learning_rate": 1.1028440028259154e-05, + "loss": 0.3504, + "num_input_tokens_seen": 95248800, + "step": 44130 + }, + { + "epoch": 7.199836867862969, + "grad_norm": 0.10282298922538757, + "learning_rate": 1.1022537933784472e-05, + "loss": 0.1715, + "num_input_tokens_seen": 95259840, + "step": 44135 + }, + { + "epoch": 7.200652528548124, + "grad_norm": 1.732798457145691, + "learning_rate": 1.1016636972393782e-05, + "loss": 0.1647, + "num_input_tokens_seen": 95270848, + "step": 44140 + }, + { + "epoch": 7.201468189233279, + "grad_norm": 0.08323367685079575, + "learning_rate": 1.101073714456545e-05, + "loss": 0.0546, + "num_input_tokens_seen": 95281472, + "step": 44145 + }, + { + "epoch": 7.202283849918434, + "grad_norm": 0.25295355916023254, + "learning_rate": 1.1004838450777747e-05, + "loss": 0.1853, + "num_input_tokens_seen": 95292160, + "step": 44150 + }, + { + "epoch": 7.203099510603589, + "grad_norm": 0.9402579665184021, + "learning_rate": 1.099894089150885e-05, + "loss": 0.0622, + "num_input_tokens_seen": 95302208, + "step": 44155 + }, + { + "epoch": 7.2039151712887435, + "grad_norm": 0.22970260679721832, + "learning_rate": 1.0993044467236843e-05, + "loss": 0.2354, + "num_input_tokens_seen": 95312032, + "step": 44160 + }, + { + "epoch": 7.204730831973899, + "grad_norm": 0.7863478660583496, + "learning_rate": 1.0987149178439726e-05, + "loss": 0.1159, + "num_input_tokens_seen": 95323744, + "step": 44165 + }, + { + "epoch": 7.205546492659054, + "grad_norm": 0.9612582325935364, + "learning_rate": 1.0981255025595394e-05, + "loss": 0.172, + "num_input_tokens_seen": 95333856, + "step": 44170 + }, + { + "epoch": 7.206362153344209, + "grad_norm": 0.027494894340634346, + "learning_rate": 1.097536200918166e-05, + "loss": 0.0695, + "num_input_tokens_seen": 95344864, + "step": 44175 + }, + { + "epoch": 7.207177814029364, + "grad_norm": 0.45491012930870056, + "learning_rate": 1.0969470129676243e-05, + "loss": 0.1083, + "num_input_tokens_seen": 95355520, + "step": 44180 + }, + { + "epoch": 7.2079934747145185, + "grad_norm": 0.2796274721622467, + "learning_rate": 1.096357938755677e-05, + "loss": 0.0293, + "num_input_tokens_seen": 95366752, + "step": 44185 + }, + { + "epoch": 7.208809135399674, + "grad_norm": 0.05549515038728714, + "learning_rate": 1.0957689783300767e-05, + "loss": 0.0656, + "num_input_tokens_seen": 95378304, + "step": 44190 + }, + { + "epoch": 7.209624796084829, + "grad_norm": 1.5469039678573608, + "learning_rate": 1.0951801317385682e-05, + "loss": 0.2241, + "num_input_tokens_seen": 95389024, + "step": 44195 + }, + { + "epoch": 7.210440456769984, + "grad_norm": 0.17994342744350433, + "learning_rate": 1.0945913990288862e-05, + "loss": 0.0218, + "num_input_tokens_seen": 95398656, + "step": 44200 + }, + { + "epoch": 7.211256117455139, + "grad_norm": 1.1340211629867554, + "learning_rate": 1.0940027802487565e-05, + "loss": 0.11, + "num_input_tokens_seen": 95408160, + "step": 44205 + }, + { + "epoch": 7.212071778140293, + "grad_norm": 0.6047415733337402, + "learning_rate": 1.0934142754458954e-05, + "loss": 0.0554, + "num_input_tokens_seen": 95419936, + "step": 44210 + }, + { + "epoch": 7.212887438825448, + "grad_norm": 1.6925512552261353, + "learning_rate": 1.0928258846680097e-05, + "loss": 0.1792, + "num_input_tokens_seen": 95429952, + "step": 44215 + }, + { + "epoch": 7.213703099510604, + "grad_norm": 0.11457721889019012, + "learning_rate": 1.092237607962798e-05, + "loss": 0.0487, + "num_input_tokens_seen": 95441024, + "step": 44220 + }, + { + "epoch": 7.214518760195759, + "grad_norm": 0.10256681591272354, + "learning_rate": 1.0916494453779489e-05, + "loss": 0.036, + "num_input_tokens_seen": 95452448, + "step": 44225 + }, + { + "epoch": 7.215334420880914, + "grad_norm": 0.08105997741222382, + "learning_rate": 1.0910613969611406e-05, + "loss": 0.0423, + "num_input_tokens_seen": 95462432, + "step": 44230 + }, + { + "epoch": 7.216150081566068, + "grad_norm": 0.3361245095729828, + "learning_rate": 1.0904734627600448e-05, + "loss": 0.0538, + "num_input_tokens_seen": 95473312, + "step": 44235 + }, + { + "epoch": 7.216965742251223, + "grad_norm": 2.045866012573242, + "learning_rate": 1.0898856428223225e-05, + "loss": 0.1504, + "num_input_tokens_seen": 95482624, + "step": 44240 + }, + { + "epoch": 7.217781402936378, + "grad_norm": 0.09988845884799957, + "learning_rate": 1.0892979371956246e-05, + "loss": 0.0202, + "num_input_tokens_seen": 95494304, + "step": 44245 + }, + { + "epoch": 7.218597063621534, + "grad_norm": 0.09387572109699249, + "learning_rate": 1.088710345927594e-05, + "loss": 0.0332, + "num_input_tokens_seen": 95505024, + "step": 44250 + }, + { + "epoch": 7.219412724306689, + "grad_norm": 0.3678262233734131, + "learning_rate": 1.0881228690658634e-05, + "loss": 0.0393, + "num_input_tokens_seen": 95516224, + "step": 44255 + }, + { + "epoch": 7.220228384991843, + "grad_norm": 0.12083901464939117, + "learning_rate": 1.087535506658057e-05, + "loss": 0.0294, + "num_input_tokens_seen": 95527168, + "step": 44260 + }, + { + "epoch": 7.221044045676998, + "grad_norm": 3.335677146911621, + "learning_rate": 1.086948258751789e-05, + "loss": 0.1286, + "num_input_tokens_seen": 95537152, + "step": 44265 + }, + { + "epoch": 7.221859706362153, + "grad_norm": 1.730900526046753, + "learning_rate": 1.0863611253946651e-05, + "loss": 0.1607, + "num_input_tokens_seen": 95549248, + "step": 44270 + }, + { + "epoch": 7.222675367047309, + "grad_norm": 0.8714982271194458, + "learning_rate": 1.08577410663428e-05, + "loss": 0.193, + "num_input_tokens_seen": 95560704, + "step": 44275 + }, + { + "epoch": 7.2234910277324635, + "grad_norm": 0.046242643147706985, + "learning_rate": 1.0851872025182225e-05, + "loss": 0.118, + "num_input_tokens_seen": 95572864, + "step": 44280 + }, + { + "epoch": 7.224306688417618, + "grad_norm": 1.32993483543396, + "learning_rate": 1.084600413094069e-05, + "loss": 0.1345, + "num_input_tokens_seen": 95582944, + "step": 44285 + }, + { + "epoch": 7.225122349102773, + "grad_norm": 0.04222944378852844, + "learning_rate": 1.0840137384093876e-05, + "loss": 0.073, + "num_input_tokens_seen": 95593472, + "step": 44290 + }, + { + "epoch": 7.225938009787928, + "grad_norm": 0.49419260025024414, + "learning_rate": 1.0834271785117376e-05, + "loss": 0.18, + "num_input_tokens_seen": 95604064, + "step": 44295 + }, + { + "epoch": 7.226753670473083, + "grad_norm": 0.0761498436331749, + "learning_rate": 1.0828407334486676e-05, + "loss": 0.0364, + "num_input_tokens_seen": 95616192, + "step": 44300 + }, + { + "epoch": 7.2275693311582385, + "grad_norm": 0.6528759002685547, + "learning_rate": 1.0822544032677187e-05, + "loss": 0.084, + "num_input_tokens_seen": 95626976, + "step": 44305 + }, + { + "epoch": 7.228384991843393, + "grad_norm": 0.17088532447814941, + "learning_rate": 1.0816681880164215e-05, + "loss": 0.0445, + "num_input_tokens_seen": 95635328, + "step": 44310 + }, + { + "epoch": 7.229200652528548, + "grad_norm": 1.019085168838501, + "learning_rate": 1.0810820877422973e-05, + "loss": 0.1727, + "num_input_tokens_seen": 95645248, + "step": 44315 + }, + { + "epoch": 7.230016313213703, + "grad_norm": 0.643899142742157, + "learning_rate": 1.0804961024928587e-05, + "loss": 0.1809, + "num_input_tokens_seen": 95656928, + "step": 44320 + }, + { + "epoch": 7.230831973898858, + "grad_norm": 0.10614824295043945, + "learning_rate": 1.0799102323156082e-05, + "loss": 0.0085, + "num_input_tokens_seen": 95667200, + "step": 44325 + }, + { + "epoch": 7.231647634584013, + "grad_norm": 0.36036843061447144, + "learning_rate": 1.0793244772580402e-05, + "loss": 0.1341, + "num_input_tokens_seen": 95677248, + "step": 44330 + }, + { + "epoch": 7.232463295269168, + "grad_norm": 0.1067449152469635, + "learning_rate": 1.0787388373676374e-05, + "loss": 0.1791, + "num_input_tokens_seen": 95687104, + "step": 44335 + }, + { + "epoch": 7.233278955954323, + "grad_norm": 1.7663993835449219, + "learning_rate": 1.0781533126918767e-05, + "loss": 0.1731, + "num_input_tokens_seen": 95697792, + "step": 44340 + }, + { + "epoch": 7.234094616639478, + "grad_norm": 0.14181207120418549, + "learning_rate": 1.077567903278223e-05, + "loss": 0.11, + "num_input_tokens_seen": 95708800, + "step": 44345 + }, + { + "epoch": 7.234910277324633, + "grad_norm": 0.045374177396297455, + "learning_rate": 1.0769826091741323e-05, + "loss": 0.0563, + "num_input_tokens_seen": 95719296, + "step": 44350 + }, + { + "epoch": 7.235725938009788, + "grad_norm": 0.86130690574646, + "learning_rate": 1.0763974304270516e-05, + "loss": 0.0759, + "num_input_tokens_seen": 95730336, + "step": 44355 + }, + { + "epoch": 7.236541598694943, + "grad_norm": 0.15446144342422485, + "learning_rate": 1.0758123670844186e-05, + "loss": 0.2171, + "num_input_tokens_seen": 95741760, + "step": 44360 + }, + { + "epoch": 7.237357259380098, + "grad_norm": 0.08399948477745056, + "learning_rate": 1.0752274191936611e-05, + "loss": 0.1895, + "num_input_tokens_seen": 95752896, + "step": 44365 + }, + { + "epoch": 7.238172920065253, + "grad_norm": 1.7247850894927979, + "learning_rate": 1.0746425868021986e-05, + "loss": 0.2833, + "num_input_tokens_seen": 95763968, + "step": 44370 + }, + { + "epoch": 7.238988580750408, + "grad_norm": 0.3722257614135742, + "learning_rate": 1.07405786995744e-05, + "loss": 0.0668, + "num_input_tokens_seen": 95774528, + "step": 44375 + }, + { + "epoch": 7.239804241435563, + "grad_norm": 0.863458514213562, + "learning_rate": 1.0734732687067856e-05, + "loss": 0.0759, + "num_input_tokens_seen": 95785600, + "step": 44380 + }, + { + "epoch": 7.240619902120717, + "grad_norm": 0.9641510248184204, + "learning_rate": 1.0728887830976261e-05, + "loss": 0.1637, + "num_input_tokens_seen": 95795104, + "step": 44385 + }, + { + "epoch": 7.241435562805873, + "grad_norm": 0.132420152425766, + "learning_rate": 1.0723044131773433e-05, + "loss": 0.0721, + "num_input_tokens_seen": 95807104, + "step": 44390 + }, + { + "epoch": 7.242251223491028, + "grad_norm": 0.19197338819503784, + "learning_rate": 1.0717201589933085e-05, + "loss": 0.0872, + "num_input_tokens_seen": 95816832, + "step": 44395 + }, + { + "epoch": 7.243066884176183, + "grad_norm": 0.4269435703754425, + "learning_rate": 1.0711360205928847e-05, + "loss": 0.0631, + "num_input_tokens_seen": 95826688, + "step": 44400 + }, + { + "epoch": 7.2438825448613375, + "grad_norm": 0.11257819831371307, + "learning_rate": 1.070551998023425e-05, + "loss": 0.055, + "num_input_tokens_seen": 95835904, + "step": 44405 + }, + { + "epoch": 7.244698205546492, + "grad_norm": 1.3356163501739502, + "learning_rate": 1.0699680913322736e-05, + "loss": 0.2251, + "num_input_tokens_seen": 95847136, + "step": 44410 + }, + { + "epoch": 7.245513866231648, + "grad_norm": 0.19598014652729034, + "learning_rate": 1.0693843005667633e-05, + "loss": 0.1759, + "num_input_tokens_seen": 95858368, + "step": 44415 + }, + { + "epoch": 7.246329526916803, + "grad_norm": 0.0687854140996933, + "learning_rate": 1.0688006257742214e-05, + "loss": 0.1579, + "num_input_tokens_seen": 95868544, + "step": 44420 + }, + { + "epoch": 7.247145187601958, + "grad_norm": 0.08795636892318726, + "learning_rate": 1.0682170670019628e-05, + "loss": 0.0579, + "num_input_tokens_seen": 95878944, + "step": 44425 + }, + { + "epoch": 7.2479608482871125, + "grad_norm": 1.4766241312026978, + "learning_rate": 1.0676336242972934e-05, + "loss": 0.0762, + "num_input_tokens_seen": 95889792, + "step": 44430 + }, + { + "epoch": 7.248776508972267, + "grad_norm": 0.17209450900554657, + "learning_rate": 1.0670502977075103e-05, + "loss": 0.1048, + "num_input_tokens_seen": 95900896, + "step": 44435 + }, + { + "epoch": 7.249592169657422, + "grad_norm": 1.5335052013397217, + "learning_rate": 1.0664670872799006e-05, + "loss": 0.1878, + "num_input_tokens_seen": 95911808, + "step": 44440 + }, + { + "epoch": 7.250407830342578, + "grad_norm": 0.31868577003479004, + "learning_rate": 1.065883993061742e-05, + "loss": 0.0878, + "num_input_tokens_seen": 95921696, + "step": 44445 + }, + { + "epoch": 7.251223491027733, + "grad_norm": 0.2904200851917267, + "learning_rate": 1.065301015100304e-05, + "loss": 0.0486, + "num_input_tokens_seen": 95931104, + "step": 44450 + }, + { + "epoch": 7.2520391517128875, + "grad_norm": 0.2635079324245453, + "learning_rate": 1.0647181534428455e-05, + "loss": 0.2282, + "num_input_tokens_seen": 95941376, + "step": 44455 + }, + { + "epoch": 7.252854812398042, + "grad_norm": 0.5536256432533264, + "learning_rate": 1.0641354081366161e-05, + "loss": 0.0519, + "num_input_tokens_seen": 95950496, + "step": 44460 + }, + { + "epoch": 7.253670473083197, + "grad_norm": 1.5794439315795898, + "learning_rate": 1.0635527792288558e-05, + "loss": 0.1155, + "num_input_tokens_seen": 95961824, + "step": 44465 + }, + { + "epoch": 7.254486133768353, + "grad_norm": 0.7085497379302979, + "learning_rate": 1.0629702667667959e-05, + "loss": 0.1128, + "num_input_tokens_seen": 95972768, + "step": 44470 + }, + { + "epoch": 7.255301794453508, + "grad_norm": 1.138819932937622, + "learning_rate": 1.0623878707976575e-05, + "loss": 0.0842, + "num_input_tokens_seen": 95985024, + "step": 44475 + }, + { + "epoch": 7.2561174551386625, + "grad_norm": 0.17807446420192719, + "learning_rate": 1.0618055913686525e-05, + "loss": 0.0734, + "num_input_tokens_seen": 95996768, + "step": 44480 + }, + { + "epoch": 7.256933115823817, + "grad_norm": 0.961108386516571, + "learning_rate": 1.061223428526984e-05, + "loss": 0.13, + "num_input_tokens_seen": 96007328, + "step": 44485 + }, + { + "epoch": 7.257748776508972, + "grad_norm": 0.17782479524612427, + "learning_rate": 1.0606413823198444e-05, + "loss": 0.0578, + "num_input_tokens_seen": 96017472, + "step": 44490 + }, + { + "epoch": 7.258564437194127, + "grad_norm": 1.6045676469802856, + "learning_rate": 1.0600594527944174e-05, + "loss": 0.0776, + "num_input_tokens_seen": 96026048, + "step": 44495 + }, + { + "epoch": 7.259380097879283, + "grad_norm": 0.14779388904571533, + "learning_rate": 1.0594776399978776e-05, + "loss": 0.0826, + "num_input_tokens_seen": 96037024, + "step": 44500 + }, + { + "epoch": 7.260195758564437, + "grad_norm": 0.5510703325271606, + "learning_rate": 1.0588959439773893e-05, + "loss": 0.0853, + "num_input_tokens_seen": 96048640, + "step": 44505 + }, + { + "epoch": 7.261011419249592, + "grad_norm": 1.1203118562698364, + "learning_rate": 1.058314364780108e-05, + "loss": 0.1138, + "num_input_tokens_seen": 96059968, + "step": 44510 + }, + { + "epoch": 7.261827079934747, + "grad_norm": 0.3198342025279999, + "learning_rate": 1.0577329024531792e-05, + "loss": 0.1765, + "num_input_tokens_seen": 96070592, + "step": 44515 + }, + { + "epoch": 7.262642740619902, + "grad_norm": 0.9311063885688782, + "learning_rate": 1.0571515570437396e-05, + "loss": 0.0797, + "num_input_tokens_seen": 96080768, + "step": 44520 + }, + { + "epoch": 7.263458401305057, + "grad_norm": 0.27480533719062805, + "learning_rate": 1.0565703285989154e-05, + "loss": 0.1346, + "num_input_tokens_seen": 96091584, + "step": 44525 + }, + { + "epoch": 7.264274061990212, + "grad_norm": 0.9074100852012634, + "learning_rate": 1.0559892171658245e-05, + "loss": 0.0411, + "num_input_tokens_seen": 96103264, + "step": 44530 + }, + { + "epoch": 7.265089722675367, + "grad_norm": 0.14495117962360382, + "learning_rate": 1.0554082227915743e-05, + "loss": 0.2971, + "num_input_tokens_seen": 96112992, + "step": 44535 + }, + { + "epoch": 7.265905383360522, + "grad_norm": 0.7742032408714294, + "learning_rate": 1.0548273455232634e-05, + "loss": 0.11, + "num_input_tokens_seen": 96124480, + "step": 44540 + }, + { + "epoch": 7.266721044045677, + "grad_norm": 1.3586745262145996, + "learning_rate": 1.0542465854079806e-05, + "loss": 0.1285, + "num_input_tokens_seen": 96134848, + "step": 44545 + }, + { + "epoch": 7.267536704730832, + "grad_norm": 0.40636417269706726, + "learning_rate": 1.0536659424928044e-05, + "loss": 0.0558, + "num_input_tokens_seen": 96146176, + "step": 44550 + }, + { + "epoch": 7.268352365415987, + "grad_norm": 0.19834011793136597, + "learning_rate": 1.0530854168248064e-05, + "loss": 0.0898, + "num_input_tokens_seen": 96157824, + "step": 44555 + }, + { + "epoch": 7.269168026101142, + "grad_norm": 0.03543546423316002, + "learning_rate": 1.052505008451046e-05, + "loss": 0.0544, + "num_input_tokens_seen": 96168992, + "step": 44560 + }, + { + "epoch": 7.269983686786297, + "grad_norm": 1.1817041635513306, + "learning_rate": 1.0519247174185742e-05, + "loss": 0.0754, + "num_input_tokens_seen": 96179712, + "step": 44565 + }, + { + "epoch": 7.270799347471452, + "grad_norm": 1.1074374914169312, + "learning_rate": 1.0513445437744323e-05, + "loss": 0.1333, + "num_input_tokens_seen": 96190656, + "step": 44570 + }, + { + "epoch": 7.271615008156607, + "grad_norm": 1.430189847946167, + "learning_rate": 1.0507644875656523e-05, + "loss": 0.0588, + "num_input_tokens_seen": 96201120, + "step": 44575 + }, + { + "epoch": 7.2724306688417615, + "grad_norm": 0.19974879920482635, + "learning_rate": 1.0501845488392558e-05, + "loss": 0.0421, + "num_input_tokens_seen": 96210304, + "step": 44580 + }, + { + "epoch": 7.273246329526917, + "grad_norm": 0.27018916606903076, + "learning_rate": 1.0496047276422554e-05, + "loss": 0.2232, + "num_input_tokens_seen": 96221472, + "step": 44585 + }, + { + "epoch": 7.274061990212072, + "grad_norm": 0.03980802744626999, + "learning_rate": 1.0490250240216562e-05, + "loss": 0.039, + "num_input_tokens_seen": 96232736, + "step": 44590 + }, + { + "epoch": 7.274877650897227, + "grad_norm": 0.510287344455719, + "learning_rate": 1.0484454380244505e-05, + "loss": 0.16, + "num_input_tokens_seen": 96244352, + "step": 44595 + }, + { + "epoch": 7.275693311582382, + "grad_norm": 0.08168850094079971, + "learning_rate": 1.0478659696976225e-05, + "loss": 0.074, + "num_input_tokens_seen": 96254240, + "step": 44600 + }, + { + "epoch": 7.2765089722675365, + "grad_norm": 0.5314196348190308, + "learning_rate": 1.0472866190881473e-05, + "loss": 0.0368, + "num_input_tokens_seen": 96265696, + "step": 44605 + }, + { + "epoch": 7.277324632952691, + "grad_norm": 1.7611305713653564, + "learning_rate": 1.0467073862429897e-05, + "loss": 0.2361, + "num_input_tokens_seen": 96277664, + "step": 44610 + }, + { + "epoch": 7.278140293637847, + "grad_norm": 1.2032753229141235, + "learning_rate": 1.0461282712091053e-05, + "loss": 0.0715, + "num_input_tokens_seen": 96288448, + "step": 44615 + }, + { + "epoch": 7.278955954323002, + "grad_norm": 0.950891375541687, + "learning_rate": 1.0455492740334399e-05, + "loss": 0.0722, + "num_input_tokens_seen": 96298432, + "step": 44620 + }, + { + "epoch": 7.279771615008157, + "grad_norm": 0.0162261500954628, + "learning_rate": 1.0449703947629305e-05, + "loss": 0.0734, + "num_input_tokens_seen": 96309568, + "step": 44625 + }, + { + "epoch": 7.280587275693311, + "grad_norm": 0.14753590524196625, + "learning_rate": 1.0443916334445034e-05, + "loss": 0.0139, + "num_input_tokens_seen": 96319968, + "step": 44630 + }, + { + "epoch": 7.281402936378466, + "grad_norm": 0.297393798828125, + "learning_rate": 1.0438129901250762e-05, + "loss": 0.0521, + "num_input_tokens_seen": 96331232, + "step": 44635 + }, + { + "epoch": 7.282218597063622, + "grad_norm": 0.8894394636154175, + "learning_rate": 1.0432344648515569e-05, + "loss": 0.126, + "num_input_tokens_seen": 96341696, + "step": 44640 + }, + { + "epoch": 7.283034257748777, + "grad_norm": 0.12617194652557373, + "learning_rate": 1.042656057670843e-05, + "loss": 0.1271, + "num_input_tokens_seen": 96352160, + "step": 44645 + }, + { + "epoch": 7.283849918433932, + "grad_norm": 0.5684544444084167, + "learning_rate": 1.042077768629824e-05, + "loss": 0.0729, + "num_input_tokens_seen": 96362592, + "step": 44650 + }, + { + "epoch": 7.284665579119086, + "grad_norm": 0.8520516753196716, + "learning_rate": 1.0414995977753772e-05, + "loss": 0.0835, + "num_input_tokens_seen": 96375424, + "step": 44655 + }, + { + "epoch": 7.285481239804241, + "grad_norm": 0.7414054274559021, + "learning_rate": 1.0409215451543746e-05, + "loss": 0.1965, + "num_input_tokens_seen": 96387296, + "step": 44660 + }, + { + "epoch": 7.286296900489396, + "grad_norm": 1.1861891746520996, + "learning_rate": 1.0403436108136747e-05, + "loss": 0.135, + "num_input_tokens_seen": 96397632, + "step": 44665 + }, + { + "epoch": 7.287112561174552, + "grad_norm": 0.12566271424293518, + "learning_rate": 1.039765794800128e-05, + "loss": 0.0469, + "num_input_tokens_seen": 96408096, + "step": 44670 + }, + { + "epoch": 7.287928221859707, + "grad_norm": 0.027580417692661285, + "learning_rate": 1.0391880971605749e-05, + "loss": 0.0226, + "num_input_tokens_seen": 96417408, + "step": 44675 + }, + { + "epoch": 7.288743882544861, + "grad_norm": 0.6310897469520569, + "learning_rate": 1.0386105179418467e-05, + "loss": 0.1622, + "num_input_tokens_seen": 96427328, + "step": 44680 + }, + { + "epoch": 7.289559543230016, + "grad_norm": 0.36482688784599304, + "learning_rate": 1.0380330571907654e-05, + "loss": 0.0405, + "num_input_tokens_seen": 96437920, + "step": 44685 + }, + { + "epoch": 7.290375203915171, + "grad_norm": 0.3312911093235016, + "learning_rate": 1.037455714954142e-05, + "loss": 0.0468, + "num_input_tokens_seen": 96447872, + "step": 44690 + }, + { + "epoch": 7.291190864600326, + "grad_norm": 0.03387775644659996, + "learning_rate": 1.0368784912787794e-05, + "loss": 0.0278, + "num_input_tokens_seen": 96458592, + "step": 44695 + }, + { + "epoch": 7.2920065252854815, + "grad_norm": 0.23754163086414337, + "learning_rate": 1.03630138621147e-05, + "loss": 0.1015, + "num_input_tokens_seen": 96469312, + "step": 44700 + }, + { + "epoch": 7.292822185970636, + "grad_norm": 0.12229427695274353, + "learning_rate": 1.035724399798997e-05, + "loss": 0.0366, + "num_input_tokens_seen": 96478976, + "step": 44705 + }, + { + "epoch": 7.293637846655791, + "grad_norm": 0.285706490278244, + "learning_rate": 1.035147532088134e-05, + "loss": 0.0789, + "num_input_tokens_seen": 96489952, + "step": 44710 + }, + { + "epoch": 7.294453507340946, + "grad_norm": 0.033868491649627686, + "learning_rate": 1.0345707831256443e-05, + "loss": 0.1151, + "num_input_tokens_seen": 96500672, + "step": 44715 + }, + { + "epoch": 7.295269168026101, + "grad_norm": 2.147719383239746, + "learning_rate": 1.0339941529582828e-05, + "loss": 0.2964, + "num_input_tokens_seen": 96511488, + "step": 44720 + }, + { + "epoch": 7.2960848287112565, + "grad_norm": 0.17306168377399445, + "learning_rate": 1.0334176416327935e-05, + "loss": 0.1764, + "num_input_tokens_seen": 96522336, + "step": 44725 + }, + { + "epoch": 7.296900489396411, + "grad_norm": 0.2741304039955139, + "learning_rate": 1.0328412491959104e-05, + "loss": 0.3118, + "num_input_tokens_seen": 96533088, + "step": 44730 + }, + { + "epoch": 7.297716150081566, + "grad_norm": 0.4762522876262665, + "learning_rate": 1.0322649756943611e-05, + "loss": 0.0386, + "num_input_tokens_seen": 96543552, + "step": 44735 + }, + { + "epoch": 7.298531810766721, + "grad_norm": 0.08004552125930786, + "learning_rate": 1.0316888211748601e-05, + "loss": 0.2245, + "num_input_tokens_seen": 96554432, + "step": 44740 + }, + { + "epoch": 7.299347471451876, + "grad_norm": 1.4097814559936523, + "learning_rate": 1.0311127856841136e-05, + "loss": 0.1131, + "num_input_tokens_seen": 96565536, + "step": 44745 + }, + { + "epoch": 7.300163132137031, + "grad_norm": 0.7782304286956787, + "learning_rate": 1.0305368692688174e-05, + "loss": 0.1044, + "num_input_tokens_seen": 96576576, + "step": 44750 + }, + { + "epoch": 7.300978792822186, + "grad_norm": 1.2607035636901855, + "learning_rate": 1.0299610719756587e-05, + "loss": 0.066, + "num_input_tokens_seen": 96588640, + "step": 44755 + }, + { + "epoch": 7.301794453507341, + "grad_norm": 0.07093389332294464, + "learning_rate": 1.0293853938513142e-05, + "loss": 0.1131, + "num_input_tokens_seen": 96600160, + "step": 44760 + }, + { + "epoch": 7.302610114192496, + "grad_norm": 0.3791845142841339, + "learning_rate": 1.028809834942451e-05, + "loss": 0.2806, + "num_input_tokens_seen": 96610944, + "step": 44765 + }, + { + "epoch": 7.303425774877651, + "grad_norm": 1.4246569871902466, + "learning_rate": 1.028234395295728e-05, + "loss": 0.1753, + "num_input_tokens_seen": 96620960, + "step": 44770 + }, + { + "epoch": 7.304241435562806, + "grad_norm": 0.0878949835896492, + "learning_rate": 1.0276590749577924e-05, + "loss": 0.2241, + "num_input_tokens_seen": 96630592, + "step": 44775 + }, + { + "epoch": 7.30505709624796, + "grad_norm": 0.03264287859201431, + "learning_rate": 1.027083873975283e-05, + "loss": 0.1329, + "num_input_tokens_seen": 96642368, + "step": 44780 + }, + { + "epoch": 7.305872756933116, + "grad_norm": 0.4078168272972107, + "learning_rate": 1.0265087923948283e-05, + "loss": 0.0338, + "num_input_tokens_seen": 96654464, + "step": 44785 + }, + { + "epoch": 7.306688417618271, + "grad_norm": 1.063652515411377, + "learning_rate": 1.0259338302630472e-05, + "loss": 0.1956, + "num_input_tokens_seen": 96664992, + "step": 44790 + }, + { + "epoch": 7.307504078303426, + "grad_norm": 0.2662647068500519, + "learning_rate": 1.025358987626549e-05, + "loss": 0.1132, + "num_input_tokens_seen": 96675168, + "step": 44795 + }, + { + "epoch": 7.308319738988581, + "grad_norm": 2.5863513946533203, + "learning_rate": 1.0247842645319339e-05, + "loss": 0.1398, + "num_input_tokens_seen": 96684992, + "step": 44800 + }, + { + "epoch": 7.309135399673735, + "grad_norm": 0.5660115480422974, + "learning_rate": 1.0242096610257911e-05, + "loss": 0.0252, + "num_input_tokens_seen": 96695808, + "step": 44805 + }, + { + "epoch": 7.309951060358891, + "grad_norm": 0.860813319683075, + "learning_rate": 1.0236351771547014e-05, + "loss": 0.2539, + "num_input_tokens_seen": 96707840, + "step": 44810 + }, + { + "epoch": 7.310766721044046, + "grad_norm": 1.7156153917312622, + "learning_rate": 1.0230608129652355e-05, + "loss": 0.1445, + "num_input_tokens_seen": 96718784, + "step": 44815 + }, + { + "epoch": 7.311582381729201, + "grad_norm": 0.11828848719596863, + "learning_rate": 1.022486568503954e-05, + "loss": 0.0265, + "num_input_tokens_seen": 96729376, + "step": 44820 + }, + { + "epoch": 7.3123980424143555, + "grad_norm": 0.2722533643245697, + "learning_rate": 1.0219124438174076e-05, + "loss": 0.0594, + "num_input_tokens_seen": 96740640, + "step": 44825 + }, + { + "epoch": 7.31321370309951, + "grad_norm": 0.12132245302200317, + "learning_rate": 1.0213384389521385e-05, + "loss": 0.0724, + "num_input_tokens_seen": 96751104, + "step": 44830 + }, + { + "epoch": 7.314029363784665, + "grad_norm": 0.6458984613418579, + "learning_rate": 1.0207645539546784e-05, + "loss": 0.2081, + "num_input_tokens_seen": 96760672, + "step": 44835 + }, + { + "epoch": 7.314845024469821, + "grad_norm": 0.3117106258869171, + "learning_rate": 1.020190788871549e-05, + "loss": 0.0541, + "num_input_tokens_seen": 96771840, + "step": 44840 + }, + { + "epoch": 7.315660685154976, + "grad_norm": 0.09859402477741241, + "learning_rate": 1.0196171437492627e-05, + "loss": 0.1596, + "num_input_tokens_seen": 96782592, + "step": 44845 + }, + { + "epoch": 7.3164763458401305, + "grad_norm": 0.050697073340415955, + "learning_rate": 1.0190436186343218e-05, + "loss": 0.1797, + "num_input_tokens_seen": 96794016, + "step": 44850 + }, + { + "epoch": 7.317292006525285, + "grad_norm": 0.09669524431228638, + "learning_rate": 1.01847021357322e-05, + "loss": 0.204, + "num_input_tokens_seen": 96803968, + "step": 44855 + }, + { + "epoch": 7.31810766721044, + "grad_norm": 0.849837601184845, + "learning_rate": 1.0178969286124396e-05, + "loss": 0.1663, + "num_input_tokens_seen": 96814304, + "step": 44860 + }, + { + "epoch": 7.318923327895595, + "grad_norm": 0.09781338274478912, + "learning_rate": 1.0173237637984542e-05, + "loss": 0.0286, + "num_input_tokens_seen": 96824128, + "step": 44865 + }, + { + "epoch": 7.319738988580751, + "grad_norm": 0.03912004828453064, + "learning_rate": 1.016750719177727e-05, + "loss": 0.1421, + "num_input_tokens_seen": 96835712, + "step": 44870 + }, + { + "epoch": 7.3205546492659055, + "grad_norm": 0.4215553104877472, + "learning_rate": 1.016177794796713e-05, + "loss": 0.0252, + "num_input_tokens_seen": 96846272, + "step": 44875 + }, + { + "epoch": 7.32137030995106, + "grad_norm": 0.10814689844846725, + "learning_rate": 1.0156049907018562e-05, + "loss": 0.0771, + "num_input_tokens_seen": 96857408, + "step": 44880 + }, + { + "epoch": 7.322185970636215, + "grad_norm": 0.38242873549461365, + "learning_rate": 1.0150323069395901e-05, + "loss": 0.1763, + "num_input_tokens_seen": 96868032, + "step": 44885 + }, + { + "epoch": 7.32300163132137, + "grad_norm": 0.06121502071619034, + "learning_rate": 1.01445974355634e-05, + "loss": 0.1755, + "num_input_tokens_seen": 96879136, + "step": 44890 + }, + { + "epoch": 7.323817292006526, + "grad_norm": 0.07446341216564178, + "learning_rate": 1.0138873005985208e-05, + "loss": 0.0882, + "num_input_tokens_seen": 96890016, + "step": 44895 + }, + { + "epoch": 7.3246329526916805, + "grad_norm": 0.30517980456352234, + "learning_rate": 1.0133149781125365e-05, + "loss": 0.074, + "num_input_tokens_seen": 96900160, + "step": 44900 + }, + { + "epoch": 7.325448613376835, + "grad_norm": 0.9006150364875793, + "learning_rate": 1.0127427761447842e-05, + "loss": 0.1105, + "num_input_tokens_seen": 96909856, + "step": 44905 + }, + { + "epoch": 7.32626427406199, + "grad_norm": 0.29655569791793823, + "learning_rate": 1.012170694741649e-05, + "loss": 0.0534, + "num_input_tokens_seen": 96921280, + "step": 44910 + }, + { + "epoch": 7.327079934747145, + "grad_norm": 1.703857421875, + "learning_rate": 1.0115987339495061e-05, + "loss": 0.1113, + "num_input_tokens_seen": 96931136, + "step": 44915 + }, + { + "epoch": 7.327895595432301, + "grad_norm": 0.04934275150299072, + "learning_rate": 1.0110268938147222e-05, + "loss": 0.0329, + "num_input_tokens_seen": 96942112, + "step": 44920 + }, + { + "epoch": 7.328711256117455, + "grad_norm": 0.6826406717300415, + "learning_rate": 1.0104551743836532e-05, + "loss": 0.0851, + "num_input_tokens_seen": 96953664, + "step": 44925 + }, + { + "epoch": 7.32952691680261, + "grad_norm": 0.35228320956230164, + "learning_rate": 1.0098835757026457e-05, + "loss": 0.1321, + "num_input_tokens_seen": 96964800, + "step": 44930 + }, + { + "epoch": 7.330342577487765, + "grad_norm": 0.34419867396354675, + "learning_rate": 1.009312097818036e-05, + "loss": 0.0737, + "num_input_tokens_seen": 96975680, + "step": 44935 + }, + { + "epoch": 7.33115823817292, + "grad_norm": 0.43533793091773987, + "learning_rate": 1.0087407407761515e-05, + "loss": 0.1253, + "num_input_tokens_seen": 96986496, + "step": 44940 + }, + { + "epoch": 7.331973898858075, + "grad_norm": 0.3952029049396515, + "learning_rate": 1.0081695046233091e-05, + "loss": 0.032, + "num_input_tokens_seen": 96997184, + "step": 44945 + }, + { + "epoch": 7.33278955954323, + "grad_norm": 0.41898313164711, + "learning_rate": 1.0075983894058163e-05, + "loss": 0.0268, + "num_input_tokens_seen": 97008416, + "step": 44950 + }, + { + "epoch": 7.333605220228385, + "grad_norm": 0.1297818273305893, + "learning_rate": 1.0070273951699704e-05, + "loss": 0.0586, + "num_input_tokens_seen": 97020000, + "step": 44955 + }, + { + "epoch": 7.33442088091354, + "grad_norm": 0.3383154273033142, + "learning_rate": 1.0064565219620593e-05, + "loss": 0.086, + "num_input_tokens_seen": 97030752, + "step": 44960 + }, + { + "epoch": 7.335236541598695, + "grad_norm": 0.25986239314079285, + "learning_rate": 1.0058857698283603e-05, + "loss": 0.3129, + "num_input_tokens_seen": 97040416, + "step": 44965 + }, + { + "epoch": 7.33605220228385, + "grad_norm": 0.33601900935173035, + "learning_rate": 1.0053151388151418e-05, + "loss": 0.1929, + "num_input_tokens_seen": 97051936, + "step": 44970 + }, + { + "epoch": 7.3368678629690045, + "grad_norm": 1.002075433731079, + "learning_rate": 1.0047446289686615e-05, + "loss": 0.1561, + "num_input_tokens_seen": 97060864, + "step": 44975 + }, + { + "epoch": 7.33768352365416, + "grad_norm": 1.0403401851654053, + "learning_rate": 1.0041742403351693e-05, + "loss": 0.1212, + "num_input_tokens_seen": 97073376, + "step": 44980 + }, + { + "epoch": 7.338499184339315, + "grad_norm": 1.5448806285858154, + "learning_rate": 1.0036039729609029e-05, + "loss": 0.0756, + "num_input_tokens_seen": 97085696, + "step": 44985 + }, + { + "epoch": 7.33931484502447, + "grad_norm": 0.06956669688224792, + "learning_rate": 1.0030338268920911e-05, + "loss": 0.0775, + "num_input_tokens_seen": 97097856, + "step": 44990 + }, + { + "epoch": 7.340130505709625, + "grad_norm": 0.0861060619354248, + "learning_rate": 1.0024638021749527e-05, + "loss": 0.1207, + "num_input_tokens_seen": 97109280, + "step": 44995 + }, + { + "epoch": 7.3409461663947795, + "grad_norm": 0.15157632529735565, + "learning_rate": 1.001893898855697e-05, + "loss": 0.1776, + "num_input_tokens_seen": 97121728, + "step": 45000 + }, + { + "epoch": 7.341761827079935, + "grad_norm": 0.0996582880616188, + "learning_rate": 1.0013241169805232e-05, + "loss": 0.0443, + "num_input_tokens_seen": 97132480, + "step": 45005 + }, + { + "epoch": 7.34257748776509, + "grad_norm": 0.18203049898147583, + "learning_rate": 1.0007544565956206e-05, + "loss": 0.0235, + "num_input_tokens_seen": 97143904, + "step": 45010 + }, + { + "epoch": 7.343393148450245, + "grad_norm": 0.13284796476364136, + "learning_rate": 1.0001849177471687e-05, + "loss": 0.0927, + "num_input_tokens_seen": 97154816, + "step": 45015 + }, + { + "epoch": 7.3442088091354, + "grad_norm": 1.2465025186538696, + "learning_rate": 9.996155004813376e-06, + "loss": 0.0664, + "num_input_tokens_seen": 97165664, + "step": 45020 + }, + { + "epoch": 7.3450244698205545, + "grad_norm": 0.16438955068588257, + "learning_rate": 9.99046204844287e-06, + "loss": 0.0979, + "num_input_tokens_seen": 97174976, + "step": 45025 + }, + { + "epoch": 7.345840130505709, + "grad_norm": 0.9645077586174011, + "learning_rate": 9.984770308821664e-06, + "loss": 0.0794, + "num_input_tokens_seen": 97186720, + "step": 45030 + }, + { + "epoch": 7.346655791190865, + "grad_norm": 0.9146004319190979, + "learning_rate": 9.979079786411167e-06, + "loss": 0.0824, + "num_input_tokens_seen": 97196608, + "step": 45035 + }, + { + "epoch": 7.34747145187602, + "grad_norm": 0.6165507435798645, + "learning_rate": 9.973390481672676e-06, + "loss": 0.046, + "num_input_tokens_seen": 97208352, + "step": 45040 + }, + { + "epoch": 7.348287112561175, + "grad_norm": 0.43515709042549133, + "learning_rate": 9.967702395067388e-06, + "loss": 0.0647, + "num_input_tokens_seen": 97218048, + "step": 45045 + }, + { + "epoch": 7.349102773246329, + "grad_norm": 0.3579888641834259, + "learning_rate": 9.962015527056429e-06, + "loss": 0.1167, + "num_input_tokens_seen": 97229920, + "step": 45050 + }, + { + "epoch": 7.349918433931484, + "grad_norm": 0.9096333980560303, + "learning_rate": 9.95632987810079e-06, + "loss": 0.0687, + "num_input_tokens_seen": 97242816, + "step": 45055 + }, + { + "epoch": 7.350734094616639, + "grad_norm": 0.37219271063804626, + "learning_rate": 9.950645448661381e-06, + "loss": 0.0566, + "num_input_tokens_seen": 97254656, + "step": 45060 + }, + { + "epoch": 7.351549755301795, + "grad_norm": 0.08238190412521362, + "learning_rate": 9.944962239199013e-06, + "loss": 0.1477, + "num_input_tokens_seen": 97266432, + "step": 45065 + }, + { + "epoch": 7.35236541598695, + "grad_norm": 1.5360547304153442, + "learning_rate": 9.939280250174396e-06, + "loss": 0.1726, + "num_input_tokens_seen": 97278080, + "step": 45070 + }, + { + "epoch": 7.353181076672104, + "grad_norm": 1.8914918899536133, + "learning_rate": 9.933599482048136e-06, + "loss": 0.128, + "num_input_tokens_seen": 97288672, + "step": 45075 + }, + { + "epoch": 7.353996737357259, + "grad_norm": 0.09657787531614304, + "learning_rate": 9.927919935280752e-06, + "loss": 0.1382, + "num_input_tokens_seen": 97300736, + "step": 45080 + }, + { + "epoch": 7.354812398042414, + "grad_norm": 0.382625013589859, + "learning_rate": 9.922241610332641e-06, + "loss": 0.1428, + "num_input_tokens_seen": 97311808, + "step": 45085 + }, + { + "epoch": 7.35562805872757, + "grad_norm": 0.1223972737789154, + "learning_rate": 9.91656450766414e-06, + "loss": 0.1437, + "num_input_tokens_seen": 97321120, + "step": 45090 + }, + { + "epoch": 7.356443719412725, + "grad_norm": 1.0568969249725342, + "learning_rate": 9.91088862773545e-06, + "loss": 0.1592, + "num_input_tokens_seen": 97331712, + "step": 45095 + }, + { + "epoch": 7.357259380097879, + "grad_norm": 0.05824369937181473, + "learning_rate": 9.90521397100669e-06, + "loss": 0.0272, + "num_input_tokens_seen": 97342720, + "step": 45100 + }, + { + "epoch": 7.358075040783034, + "grad_norm": 0.4684850573539734, + "learning_rate": 9.899540537937879e-06, + "loss": 0.0685, + "num_input_tokens_seen": 97353920, + "step": 45105 + }, + { + "epoch": 7.358890701468189, + "grad_norm": 0.5024588108062744, + "learning_rate": 9.893868328988928e-06, + "loss": 0.0662, + "num_input_tokens_seen": 97364320, + "step": 45110 + }, + { + "epoch": 7.359706362153344, + "grad_norm": 0.0648832619190216, + "learning_rate": 9.888197344619657e-06, + "loss": 0.0695, + "num_input_tokens_seen": 97374656, + "step": 45115 + }, + { + "epoch": 7.3605220228384995, + "grad_norm": 0.5465717911720276, + "learning_rate": 9.882527585289788e-06, + "loss": 0.1042, + "num_input_tokens_seen": 97386272, + "step": 45120 + }, + { + "epoch": 7.361337683523654, + "grad_norm": 1.1469045877456665, + "learning_rate": 9.876859051458937e-06, + "loss": 0.1032, + "num_input_tokens_seen": 97396992, + "step": 45125 + }, + { + "epoch": 7.362153344208809, + "grad_norm": 0.13844865560531616, + "learning_rate": 9.871191743586624e-06, + "loss": 0.1113, + "num_input_tokens_seen": 97407296, + "step": 45130 + }, + { + "epoch": 7.362969004893964, + "grad_norm": 0.18844187259674072, + "learning_rate": 9.865525662132274e-06, + "loss": 0.2419, + "num_input_tokens_seen": 97417824, + "step": 45135 + }, + { + "epoch": 7.363784665579119, + "grad_norm": 0.035008762031793594, + "learning_rate": 9.859860807555204e-06, + "loss": 0.0133, + "num_input_tokens_seen": 97428096, + "step": 45140 + }, + { + "epoch": 7.364600326264274, + "grad_norm": 1.1555299758911133, + "learning_rate": 9.854197180314639e-06, + "loss": 0.1337, + "num_input_tokens_seen": 97439680, + "step": 45145 + }, + { + "epoch": 7.365415986949429, + "grad_norm": 0.6123772263526917, + "learning_rate": 9.848534780869698e-06, + "loss": 0.2147, + "num_input_tokens_seen": 97450880, + "step": 45150 + }, + { + "epoch": 7.366231647634584, + "grad_norm": 2.711750030517578, + "learning_rate": 9.842873609679404e-06, + "loss": 0.4118, + "num_input_tokens_seen": 97461568, + "step": 45155 + }, + { + "epoch": 7.367047308319739, + "grad_norm": 0.04658128321170807, + "learning_rate": 9.837213667202682e-06, + "loss": 0.2346, + "num_input_tokens_seen": 97472416, + "step": 45160 + }, + { + "epoch": 7.367862969004894, + "grad_norm": 0.09357502311468124, + "learning_rate": 9.83155495389836e-06, + "loss": 0.1258, + "num_input_tokens_seen": 97483744, + "step": 45165 + }, + { + "epoch": 7.368678629690049, + "grad_norm": 1.1962177753448486, + "learning_rate": 9.82589747022515e-06, + "loss": 0.0861, + "num_input_tokens_seen": 97494752, + "step": 45170 + }, + { + "epoch": 7.369494290375204, + "grad_norm": 0.06170763075351715, + "learning_rate": 9.82024121664169e-06, + "loss": 0.036, + "num_input_tokens_seen": 97505056, + "step": 45175 + }, + { + "epoch": 7.370309951060359, + "grad_norm": 2.515483856201172, + "learning_rate": 9.814586193606496e-06, + "loss": 0.2003, + "num_input_tokens_seen": 97516352, + "step": 45180 + }, + { + "epoch": 7.371125611745514, + "grad_norm": 0.031318988651037216, + "learning_rate": 9.808932401577994e-06, + "loss": 0.1848, + "num_input_tokens_seen": 97526784, + "step": 45185 + }, + { + "epoch": 7.371941272430669, + "grad_norm": 1.5051664113998413, + "learning_rate": 9.803279841014501e-06, + "loss": 0.11, + "num_input_tokens_seen": 97538368, + "step": 45190 + }, + { + "epoch": 7.372756933115824, + "grad_norm": 3.2714552879333496, + "learning_rate": 9.797628512374262e-06, + "loss": 0.0923, + "num_input_tokens_seen": 97549440, + "step": 45195 + }, + { + "epoch": 7.373572593800978, + "grad_norm": 0.23221814632415771, + "learning_rate": 9.791978416115393e-06, + "loss": 0.0724, + "num_input_tokens_seen": 97559264, + "step": 45200 + }, + { + "epoch": 7.374388254486134, + "grad_norm": 1.329418659210205, + "learning_rate": 9.786329552695916e-06, + "loss": 0.072, + "num_input_tokens_seen": 97569312, + "step": 45205 + }, + { + "epoch": 7.375203915171289, + "grad_norm": 0.21417859196662903, + "learning_rate": 9.780681922573759e-06, + "loss": 0.0236, + "num_input_tokens_seen": 97579712, + "step": 45210 + }, + { + "epoch": 7.376019575856444, + "grad_norm": 0.10683242976665497, + "learning_rate": 9.775035526206741e-06, + "loss": 0.0411, + "num_input_tokens_seen": 97590912, + "step": 45215 + }, + { + "epoch": 7.376835236541599, + "grad_norm": 0.49235114455223083, + "learning_rate": 9.7693903640526e-06, + "loss": 0.0732, + "num_input_tokens_seen": 97602176, + "step": 45220 + }, + { + "epoch": 7.377650897226753, + "grad_norm": 1.4150547981262207, + "learning_rate": 9.763746436568957e-06, + "loss": 0.2459, + "num_input_tokens_seen": 97612128, + "step": 45225 + }, + { + "epoch": 7.378466557911908, + "grad_norm": 1.1099653244018555, + "learning_rate": 9.758103744213334e-06, + "loss": 0.1195, + "num_input_tokens_seen": 97622528, + "step": 45230 + }, + { + "epoch": 7.379282218597064, + "grad_norm": 0.2753485441207886, + "learning_rate": 9.752462287443163e-06, + "loss": 0.1357, + "num_input_tokens_seen": 97634304, + "step": 45235 + }, + { + "epoch": 7.380097879282219, + "grad_norm": 0.5167863368988037, + "learning_rate": 9.746822066715757e-06, + "loss": 0.1304, + "num_input_tokens_seen": 97644864, + "step": 45240 + }, + { + "epoch": 7.3809135399673735, + "grad_norm": 0.44140398502349854, + "learning_rate": 9.741183082488354e-06, + "loss": 0.0656, + "num_input_tokens_seen": 97656768, + "step": 45245 + }, + { + "epoch": 7.381729200652528, + "grad_norm": 3.074732542037964, + "learning_rate": 9.73554533521807e-06, + "loss": 0.2293, + "num_input_tokens_seen": 97668416, + "step": 45250 + }, + { + "epoch": 7.382544861337683, + "grad_norm": 0.521699070930481, + "learning_rate": 9.729908825361933e-06, + "loss": 0.0375, + "num_input_tokens_seen": 97679392, + "step": 45255 + }, + { + "epoch": 7.383360522022839, + "grad_norm": 0.6748572587966919, + "learning_rate": 9.724273553376864e-06, + "loss": 0.2476, + "num_input_tokens_seen": 97689312, + "step": 45260 + }, + { + "epoch": 7.384176182707994, + "grad_norm": 0.5058345794677734, + "learning_rate": 9.718639519719695e-06, + "loss": 0.0512, + "num_input_tokens_seen": 97700448, + "step": 45265 + }, + { + "epoch": 7.3849918433931485, + "grad_norm": 0.8827280402183533, + "learning_rate": 9.713006724847137e-06, + "loss": 0.0509, + "num_input_tokens_seen": 97710528, + "step": 45270 + }, + { + "epoch": 7.385807504078303, + "grad_norm": 0.13362976908683777, + "learning_rate": 9.70737516921582e-06, + "loss": 0.0814, + "num_input_tokens_seen": 97718720, + "step": 45275 + }, + { + "epoch": 7.386623164763458, + "grad_norm": 0.04662924259901047, + "learning_rate": 9.70174485328227e-06, + "loss": 0.117, + "num_input_tokens_seen": 97727744, + "step": 45280 + }, + { + "epoch": 7.387438825448613, + "grad_norm": 0.06993778049945831, + "learning_rate": 9.6961157775029e-06, + "loss": 0.0199, + "num_input_tokens_seen": 97738112, + "step": 45285 + }, + { + "epoch": 7.388254486133769, + "grad_norm": 0.08162124454975128, + "learning_rate": 9.69048794233404e-06, + "loss": 0.0559, + "num_input_tokens_seen": 97748320, + "step": 45290 + }, + { + "epoch": 7.3890701468189235, + "grad_norm": 0.4312511384487152, + "learning_rate": 9.684861348231897e-06, + "loss": 0.0919, + "num_input_tokens_seen": 97760480, + "step": 45295 + }, + { + "epoch": 7.389885807504078, + "grad_norm": 0.9068806767463684, + "learning_rate": 9.67923599565261e-06, + "loss": 0.1605, + "num_input_tokens_seen": 97770912, + "step": 45300 + }, + { + "epoch": 7.390701468189233, + "grad_norm": 2.013495922088623, + "learning_rate": 9.673611885052189e-06, + "loss": 0.3598, + "num_input_tokens_seen": 97781984, + "step": 45305 + }, + { + "epoch": 7.391517128874388, + "grad_norm": 1.995097279548645, + "learning_rate": 9.667989016886555e-06, + "loss": 0.1537, + "num_input_tokens_seen": 97793152, + "step": 45310 + }, + { + "epoch": 7.392332789559543, + "grad_norm": 0.09866296499967575, + "learning_rate": 9.662367391611526e-06, + "loss": 0.127, + "num_input_tokens_seen": 97804896, + "step": 45315 + }, + { + "epoch": 7.3931484502446985, + "grad_norm": 0.7046131491661072, + "learning_rate": 9.656747009682817e-06, + "loss": 0.1336, + "num_input_tokens_seen": 97815008, + "step": 45320 + }, + { + "epoch": 7.393964110929853, + "grad_norm": 1.008348822593689, + "learning_rate": 9.651127871556049e-06, + "loss": 0.1282, + "num_input_tokens_seen": 97826400, + "step": 45325 + }, + { + "epoch": 7.394779771615008, + "grad_norm": 0.692890465259552, + "learning_rate": 9.645509977686731e-06, + "loss": 0.1567, + "num_input_tokens_seen": 97837024, + "step": 45330 + }, + { + "epoch": 7.395595432300163, + "grad_norm": 0.08388428390026093, + "learning_rate": 9.639893328530283e-06, + "loss": 0.1432, + "num_input_tokens_seen": 97847616, + "step": 45335 + }, + { + "epoch": 7.396411092985318, + "grad_norm": 0.030730051919817924, + "learning_rate": 9.63427792454202e-06, + "loss": 0.0662, + "num_input_tokens_seen": 97858464, + "step": 45340 + }, + { + "epoch": 7.397226753670473, + "grad_norm": 1.635595440864563, + "learning_rate": 9.628663766177154e-06, + "loss": 0.0863, + "num_input_tokens_seen": 97868224, + "step": 45345 + }, + { + "epoch": 7.398042414355628, + "grad_norm": 0.28422486782073975, + "learning_rate": 9.623050853890795e-06, + "loss": 0.1414, + "num_input_tokens_seen": 97879520, + "step": 45350 + }, + { + "epoch": 7.398858075040783, + "grad_norm": 0.915559709072113, + "learning_rate": 9.617439188137956e-06, + "loss": 0.0782, + "num_input_tokens_seen": 97889568, + "step": 45355 + }, + { + "epoch": 7.399673735725938, + "grad_norm": 1.1593899726867676, + "learning_rate": 9.611828769373538e-06, + "loss": 0.0735, + "num_input_tokens_seen": 97899840, + "step": 45360 + }, + { + "epoch": 7.400489396411093, + "grad_norm": 1.729609727859497, + "learning_rate": 9.606219598052366e-06, + "loss": 0.1757, + "num_input_tokens_seen": 97910112, + "step": 45365 + }, + { + "epoch": 7.401305057096248, + "grad_norm": 1.3048534393310547, + "learning_rate": 9.600611674629143e-06, + "loss": 0.0642, + "num_input_tokens_seen": 97921664, + "step": 45370 + }, + { + "epoch": 7.402120717781403, + "grad_norm": 0.14294742047786713, + "learning_rate": 9.595004999558471e-06, + "loss": 0.1027, + "num_input_tokens_seen": 97934112, + "step": 45375 + }, + { + "epoch": 7.402936378466558, + "grad_norm": 0.7445150017738342, + "learning_rate": 9.58939957329486e-06, + "loss": 0.0943, + "num_input_tokens_seen": 97945760, + "step": 45380 + }, + { + "epoch": 7.403752039151713, + "grad_norm": 0.30163243412971497, + "learning_rate": 9.58379539629271e-06, + "loss": 0.2891, + "num_input_tokens_seen": 97956320, + "step": 45385 + }, + { + "epoch": 7.404567699836868, + "grad_norm": 2.669139862060547, + "learning_rate": 9.578192469006328e-06, + "loss": 0.1877, + "num_input_tokens_seen": 97966368, + "step": 45390 + }, + { + "epoch": 7.4053833605220225, + "grad_norm": 0.4619734585285187, + "learning_rate": 9.572590791889916e-06, + "loss": 0.0848, + "num_input_tokens_seen": 97976736, + "step": 45395 + }, + { + "epoch": 7.406199021207178, + "grad_norm": 0.16541290283203125, + "learning_rate": 9.56699036539756e-06, + "loss": 0.0969, + "num_input_tokens_seen": 97987616, + "step": 45400 + }, + { + "epoch": 7.407014681892333, + "grad_norm": 0.12025855481624603, + "learning_rate": 9.561391189983281e-06, + "loss": 0.1025, + "num_input_tokens_seen": 97999424, + "step": 45405 + }, + { + "epoch": 7.407830342577488, + "grad_norm": 0.6264259815216064, + "learning_rate": 9.555793266100969e-06, + "loss": 0.1892, + "num_input_tokens_seen": 98010688, + "step": 45410 + }, + { + "epoch": 7.408646003262643, + "grad_norm": 0.6859762668609619, + "learning_rate": 9.550196594204413e-06, + "loss": 0.0976, + "num_input_tokens_seen": 98022304, + "step": 45415 + }, + { + "epoch": 7.4094616639477975, + "grad_norm": 0.5940676331520081, + "learning_rate": 9.544601174747316e-06, + "loss": 0.1107, + "num_input_tokens_seen": 98032448, + "step": 45420 + }, + { + "epoch": 7.410277324632952, + "grad_norm": 1.7430213689804077, + "learning_rate": 9.539007008183267e-06, + "loss": 0.1354, + "num_input_tokens_seen": 98043936, + "step": 45425 + }, + { + "epoch": 7.411092985318108, + "grad_norm": 0.8433747887611389, + "learning_rate": 9.533414094965759e-06, + "loss": 0.2142, + "num_input_tokens_seen": 98054880, + "step": 45430 + }, + { + "epoch": 7.411908646003263, + "grad_norm": 0.4047684669494629, + "learning_rate": 9.527822435548181e-06, + "loss": 0.0528, + "num_input_tokens_seen": 98065440, + "step": 45435 + }, + { + "epoch": 7.412724306688418, + "grad_norm": 0.033906202763319016, + "learning_rate": 9.522232030383822e-06, + "loss": 0.0316, + "num_input_tokens_seen": 98076256, + "step": 45440 + }, + { + "epoch": 7.4135399673735725, + "grad_norm": 2.1549360752105713, + "learning_rate": 9.516642879925865e-06, + "loss": 0.1304, + "num_input_tokens_seen": 98087648, + "step": 45445 + }, + { + "epoch": 7.414355628058727, + "grad_norm": 0.14890752732753754, + "learning_rate": 9.511054984627402e-06, + "loss": 0.0495, + "num_input_tokens_seen": 98097952, + "step": 45450 + }, + { + "epoch": 7.415171288743883, + "grad_norm": 0.19968195259571075, + "learning_rate": 9.50546834494141e-06, + "loss": 0.0703, + "num_input_tokens_seen": 98109440, + "step": 45455 + }, + { + "epoch": 7.415986949429038, + "grad_norm": 0.8773323893547058, + "learning_rate": 9.499882961320771e-06, + "loss": 0.048, + "num_input_tokens_seen": 98120160, + "step": 45460 + }, + { + "epoch": 7.416802610114193, + "grad_norm": 0.9250600934028625, + "learning_rate": 9.494298834218268e-06, + "loss": 0.0869, + "num_input_tokens_seen": 98131488, + "step": 45465 + }, + { + "epoch": 7.417618270799347, + "grad_norm": 0.030669018626213074, + "learning_rate": 9.488715964086575e-06, + "loss": 0.1015, + "num_input_tokens_seen": 98142528, + "step": 45470 + }, + { + "epoch": 7.418433931484502, + "grad_norm": 0.21164049208164215, + "learning_rate": 9.48313435137827e-06, + "loss": 0.1333, + "num_input_tokens_seen": 98153472, + "step": 45475 + }, + { + "epoch": 7.419249592169657, + "grad_norm": 0.6698386073112488, + "learning_rate": 9.47755399654583e-06, + "loss": 0.2318, + "num_input_tokens_seen": 98162784, + "step": 45480 + }, + { + "epoch": 7.420065252854813, + "grad_norm": 0.0967346802353859, + "learning_rate": 9.47197490004162e-06, + "loss": 0.1692, + "num_input_tokens_seen": 98173376, + "step": 45485 + }, + { + "epoch": 7.420880913539968, + "grad_norm": 1.2762373685836792, + "learning_rate": 9.466397062317911e-06, + "loss": 0.1537, + "num_input_tokens_seen": 98183872, + "step": 45490 + }, + { + "epoch": 7.421696574225122, + "grad_norm": 0.7246487736701965, + "learning_rate": 9.460820483826874e-06, + "loss": 0.1228, + "num_input_tokens_seen": 98195200, + "step": 45495 + }, + { + "epoch": 7.422512234910277, + "grad_norm": 0.6216683387756348, + "learning_rate": 9.455245165020565e-06, + "loss": 0.1295, + "num_input_tokens_seen": 98205344, + "step": 45500 + }, + { + "epoch": 7.423327895595432, + "grad_norm": 0.377227246761322, + "learning_rate": 9.449671106350966e-06, + "loss": 0.1109, + "num_input_tokens_seen": 98216896, + "step": 45505 + }, + { + "epoch": 7.424143556280587, + "grad_norm": 0.6808533668518066, + "learning_rate": 9.444098308269931e-06, + "loss": 0.1335, + "num_input_tokens_seen": 98228896, + "step": 45510 + }, + { + "epoch": 7.424959216965743, + "grad_norm": 1.6035178899765015, + "learning_rate": 9.438526771229212e-06, + "loss": 0.1624, + "num_input_tokens_seen": 98239488, + "step": 45515 + }, + { + "epoch": 7.425774877650897, + "grad_norm": 0.2538485527038574, + "learning_rate": 9.432956495680473e-06, + "loss": 0.0872, + "num_input_tokens_seen": 98251232, + "step": 45520 + }, + { + "epoch": 7.426590538336052, + "grad_norm": 0.21514911949634552, + "learning_rate": 9.42738748207526e-06, + "loss": 0.0865, + "num_input_tokens_seen": 98262432, + "step": 45525 + }, + { + "epoch": 7.427406199021207, + "grad_norm": 0.031095298007130623, + "learning_rate": 9.421819730865039e-06, + "loss": 0.182, + "num_input_tokens_seen": 98272320, + "step": 45530 + }, + { + "epoch": 7.428221859706362, + "grad_norm": 0.324659138917923, + "learning_rate": 9.416253242501155e-06, + "loss": 0.165, + "num_input_tokens_seen": 98283904, + "step": 45535 + }, + { + "epoch": 7.4290375203915175, + "grad_norm": 2.3519482612609863, + "learning_rate": 9.410688017434852e-06, + "loss": 0.1921, + "num_input_tokens_seen": 98295840, + "step": 45540 + }, + { + "epoch": 7.429853181076672, + "grad_norm": 0.06453260034322739, + "learning_rate": 9.40512405611728e-06, + "loss": 0.0769, + "num_input_tokens_seen": 98307072, + "step": 45545 + }, + { + "epoch": 7.430668841761827, + "grad_norm": 1.1549261808395386, + "learning_rate": 9.399561358999479e-06, + "loss": 0.0647, + "num_input_tokens_seen": 98318144, + "step": 45550 + }, + { + "epoch": 7.431484502446982, + "grad_norm": 1.0181951522827148, + "learning_rate": 9.393999926532387e-06, + "loss": 0.1732, + "num_input_tokens_seen": 98329728, + "step": 45555 + }, + { + "epoch": 7.432300163132137, + "grad_norm": 0.3055320978164673, + "learning_rate": 9.38843975916685e-06, + "loss": 0.018, + "num_input_tokens_seen": 98340256, + "step": 45560 + }, + { + "epoch": 7.433115823817292, + "grad_norm": 0.0249970480799675, + "learning_rate": 9.382880857353596e-06, + "loss": 0.0653, + "num_input_tokens_seen": 98350976, + "step": 45565 + }, + { + "epoch": 7.433931484502447, + "grad_norm": 1.9585374593734741, + "learning_rate": 9.37732322154326e-06, + "loss": 0.1809, + "num_input_tokens_seen": 98361536, + "step": 45570 + }, + { + "epoch": 7.434747145187602, + "grad_norm": 0.38214758038520813, + "learning_rate": 9.371766852186373e-06, + "loss": 0.0288, + "num_input_tokens_seen": 98372064, + "step": 45575 + }, + { + "epoch": 7.435562805872757, + "grad_norm": 2.2270989418029785, + "learning_rate": 9.366211749733361e-06, + "loss": 0.2447, + "num_input_tokens_seen": 98382688, + "step": 45580 + }, + { + "epoch": 7.436378466557912, + "grad_norm": 0.3565800189971924, + "learning_rate": 9.36065791463455e-06, + "loss": 0.0859, + "num_input_tokens_seen": 98393312, + "step": 45585 + }, + { + "epoch": 7.437194127243067, + "grad_norm": 0.2514936923980713, + "learning_rate": 9.355105347340163e-06, + "loss": 0.1904, + "num_input_tokens_seen": 98403136, + "step": 45590 + }, + { + "epoch": 7.438009787928221, + "grad_norm": 0.04712165892124176, + "learning_rate": 9.349554048300316e-06, + "loss": 0.0984, + "num_input_tokens_seen": 98414752, + "step": 45595 + }, + { + "epoch": 7.438825448613377, + "grad_norm": 0.014724222011864185, + "learning_rate": 9.344004017965027e-06, + "loss": 0.0762, + "num_input_tokens_seen": 98426496, + "step": 45600 + }, + { + "epoch": 7.439641109298532, + "grad_norm": 0.28535544872283936, + "learning_rate": 9.338455256784212e-06, + "loss": 0.1203, + "num_input_tokens_seen": 98437696, + "step": 45605 + }, + { + "epoch": 7.440456769983687, + "grad_norm": 1.082069993019104, + "learning_rate": 9.33290776520768e-06, + "loss": 0.2094, + "num_input_tokens_seen": 98448640, + "step": 45610 + }, + { + "epoch": 7.441272430668842, + "grad_norm": 0.2249133288860321, + "learning_rate": 9.32736154368513e-06, + "loss": 0.0375, + "num_input_tokens_seen": 98459360, + "step": 45615 + }, + { + "epoch": 7.442088091353996, + "grad_norm": 0.28407514095306396, + "learning_rate": 9.32181659266618e-06, + "loss": 0.1411, + "num_input_tokens_seen": 98470560, + "step": 45620 + }, + { + "epoch": 7.442903752039152, + "grad_norm": 0.11119700223207474, + "learning_rate": 9.316272912600332e-06, + "loss": 0.0793, + "num_input_tokens_seen": 98482432, + "step": 45625 + }, + { + "epoch": 7.443719412724307, + "grad_norm": 1.2035638093948364, + "learning_rate": 9.310730503936976e-06, + "loss": 0.0865, + "num_input_tokens_seen": 98493056, + "step": 45630 + }, + { + "epoch": 7.444535073409462, + "grad_norm": 0.12884700298309326, + "learning_rate": 9.305189367125416e-06, + "loss": 0.2069, + "num_input_tokens_seen": 98503904, + "step": 45635 + }, + { + "epoch": 7.445350734094617, + "grad_norm": 0.07826084643602371, + "learning_rate": 9.299649502614838e-06, + "loss": 0.0806, + "num_input_tokens_seen": 98514848, + "step": 45640 + }, + { + "epoch": 7.446166394779771, + "grad_norm": 0.36882928013801575, + "learning_rate": 9.294110910854337e-06, + "loss": 0.0289, + "num_input_tokens_seen": 98525696, + "step": 45645 + }, + { + "epoch": 7.446982055464926, + "grad_norm": 0.030608657747507095, + "learning_rate": 9.288573592292893e-06, + "loss": 0.1022, + "num_input_tokens_seen": 98536672, + "step": 45650 + }, + { + "epoch": 7.447797716150082, + "grad_norm": 0.40133029222488403, + "learning_rate": 9.283037547379394e-06, + "loss": 0.0564, + "num_input_tokens_seen": 98546912, + "step": 45655 + }, + { + "epoch": 7.448613376835237, + "grad_norm": 2.7309930324554443, + "learning_rate": 9.27750277656262e-06, + "loss": 0.5074, + "num_input_tokens_seen": 98558336, + "step": 45660 + }, + { + "epoch": 7.4494290375203915, + "grad_norm": 0.1396404653787613, + "learning_rate": 9.271969280291243e-06, + "loss": 0.0462, + "num_input_tokens_seen": 98568096, + "step": 45665 + }, + { + "epoch": 7.450244698205546, + "grad_norm": 0.113664411008358, + "learning_rate": 9.266437059013834e-06, + "loss": 0.1026, + "num_input_tokens_seen": 98579072, + "step": 45670 + }, + { + "epoch": 7.451060358890701, + "grad_norm": 0.7622182965278625, + "learning_rate": 9.260906113178875e-06, + "loss": 0.2232, + "num_input_tokens_seen": 98589600, + "step": 45675 + }, + { + "epoch": 7.451876019575856, + "grad_norm": 1.5453075170516968, + "learning_rate": 9.255376443234725e-06, + "loss": 0.1918, + "num_input_tokens_seen": 98600864, + "step": 45680 + }, + { + "epoch": 7.452691680261012, + "grad_norm": 0.08349907398223877, + "learning_rate": 9.24984804962965e-06, + "loss": 0.0807, + "num_input_tokens_seen": 98611968, + "step": 45685 + }, + { + "epoch": 7.4535073409461665, + "grad_norm": 1.5883126258850098, + "learning_rate": 9.244320932811806e-06, + "loss": 0.1537, + "num_input_tokens_seen": 98622912, + "step": 45690 + }, + { + "epoch": 7.454323001631321, + "grad_norm": 1.4092153310775757, + "learning_rate": 9.238795093229252e-06, + "loss": 0.403, + "num_input_tokens_seen": 98634592, + "step": 45695 + }, + { + "epoch": 7.455138662316476, + "grad_norm": 0.04160241037607193, + "learning_rate": 9.233270531329937e-06, + "loss": 0.1444, + "num_input_tokens_seen": 98644096, + "step": 45700 + }, + { + "epoch": 7.455954323001631, + "grad_norm": 0.4289228320121765, + "learning_rate": 9.227747247561713e-06, + "loss": 0.1656, + "num_input_tokens_seen": 98654400, + "step": 45705 + }, + { + "epoch": 7.456769983686787, + "grad_norm": 0.06219099462032318, + "learning_rate": 9.222225242372326e-06, + "loss": 0.0188, + "num_input_tokens_seen": 98665568, + "step": 45710 + }, + { + "epoch": 7.4575856443719415, + "grad_norm": 0.17552000284194946, + "learning_rate": 9.216704516209417e-06, + "loss": 0.1041, + "num_input_tokens_seen": 98674624, + "step": 45715 + }, + { + "epoch": 7.458401305057096, + "grad_norm": 2.026787281036377, + "learning_rate": 9.211185069520514e-06, + "loss": 0.2356, + "num_input_tokens_seen": 98685536, + "step": 45720 + }, + { + "epoch": 7.459216965742251, + "grad_norm": 0.9160785675048828, + "learning_rate": 9.205666902753071e-06, + "loss": 0.1604, + "num_input_tokens_seen": 98696352, + "step": 45725 + }, + { + "epoch": 7.460032626427406, + "grad_norm": 0.06679566949605942, + "learning_rate": 9.200150016354406e-06, + "loss": 0.0794, + "num_input_tokens_seen": 98707808, + "step": 45730 + }, + { + "epoch": 7.460848287112561, + "grad_norm": 0.8775147199630737, + "learning_rate": 9.19463441077175e-06, + "loss": 0.1604, + "num_input_tokens_seen": 98718112, + "step": 45735 + }, + { + "epoch": 7.4616639477977165, + "grad_norm": 1.422157883644104, + "learning_rate": 9.189120086452224e-06, + "loss": 0.1184, + "num_input_tokens_seen": 98729312, + "step": 45740 + }, + { + "epoch": 7.462479608482871, + "grad_norm": 0.12413492798805237, + "learning_rate": 9.183607043842846e-06, + "loss": 0.043, + "num_input_tokens_seen": 98739168, + "step": 45745 + }, + { + "epoch": 7.463295269168026, + "grad_norm": 2.4667410850524902, + "learning_rate": 9.178095283390533e-06, + "loss": 0.4953, + "num_input_tokens_seen": 98749376, + "step": 45750 + }, + { + "epoch": 7.464110929853181, + "grad_norm": 0.0734008327126503, + "learning_rate": 9.172584805542098e-06, + "loss": 0.0505, + "num_input_tokens_seen": 98760928, + "step": 45755 + }, + { + "epoch": 7.464926590538336, + "grad_norm": 0.15244236588478088, + "learning_rate": 9.167075610744244e-06, + "loss": 0.1489, + "num_input_tokens_seen": 98770560, + "step": 45760 + }, + { + "epoch": 7.465742251223491, + "grad_norm": 0.1628851741552353, + "learning_rate": 9.161567699443577e-06, + "loss": 0.1302, + "num_input_tokens_seen": 98780512, + "step": 45765 + }, + { + "epoch": 7.466557911908646, + "grad_norm": 0.485361248254776, + "learning_rate": 9.156061072086597e-06, + "loss": 0.0718, + "num_input_tokens_seen": 98792192, + "step": 45770 + }, + { + "epoch": 7.467373572593801, + "grad_norm": 1.8183879852294922, + "learning_rate": 9.150555729119697e-06, + "loss": 0.1454, + "num_input_tokens_seen": 98802752, + "step": 45775 + }, + { + "epoch": 7.468189233278956, + "grad_norm": 0.3141419589519501, + "learning_rate": 9.14505167098917e-06, + "loss": 0.035, + "num_input_tokens_seen": 98813600, + "step": 45780 + }, + { + "epoch": 7.469004893964111, + "grad_norm": 0.03563522920012474, + "learning_rate": 9.139548898141198e-06, + "loss": 0.024, + "num_input_tokens_seen": 98825280, + "step": 45785 + }, + { + "epoch": 7.4698205546492655, + "grad_norm": 1.590699553489685, + "learning_rate": 9.13404741102187e-06, + "loss": 0.1826, + "num_input_tokens_seen": 98836480, + "step": 45790 + }, + { + "epoch": 7.470636215334421, + "grad_norm": 1.4436962604522705, + "learning_rate": 9.128547210077162e-06, + "loss": 0.1267, + "num_input_tokens_seen": 98847424, + "step": 45795 + }, + { + "epoch": 7.471451876019576, + "grad_norm": 1.543317437171936, + "learning_rate": 9.123048295752948e-06, + "loss": 0.1571, + "num_input_tokens_seen": 98858016, + "step": 45800 + }, + { + "epoch": 7.472267536704731, + "grad_norm": 0.16045744717121124, + "learning_rate": 9.117550668494998e-06, + "loss": 0.0204, + "num_input_tokens_seen": 98868864, + "step": 45805 + }, + { + "epoch": 7.473083197389886, + "grad_norm": 2.292302370071411, + "learning_rate": 9.112054328748975e-06, + "loss": 0.2817, + "num_input_tokens_seen": 98879936, + "step": 45810 + }, + { + "epoch": 7.4738988580750405, + "grad_norm": 0.18128997087478638, + "learning_rate": 9.106559276960439e-06, + "loss": 0.0434, + "num_input_tokens_seen": 98891328, + "step": 45815 + }, + { + "epoch": 7.474714518760196, + "grad_norm": 0.7883239388465881, + "learning_rate": 9.101065513574856e-06, + "loss": 0.1129, + "num_input_tokens_seen": 98902592, + "step": 45820 + }, + { + "epoch": 7.475530179445351, + "grad_norm": 0.17698900401592255, + "learning_rate": 9.095573039037574e-06, + "loss": 0.0874, + "num_input_tokens_seen": 98913440, + "step": 45825 + }, + { + "epoch": 7.476345840130506, + "grad_norm": 0.5576223134994507, + "learning_rate": 9.090081853793838e-06, + "loss": 0.0823, + "num_input_tokens_seen": 98923808, + "step": 45830 + }, + { + "epoch": 7.477161500815661, + "grad_norm": 1.5299072265625, + "learning_rate": 9.084591958288797e-06, + "loss": 0.0659, + "num_input_tokens_seen": 98933856, + "step": 45835 + }, + { + "epoch": 7.4779771615008155, + "grad_norm": 1.9359135627746582, + "learning_rate": 9.079103352967471e-06, + "loss": 0.0712, + "num_input_tokens_seen": 98944960, + "step": 45840 + }, + { + "epoch": 7.47879282218597, + "grad_norm": 0.9135372638702393, + "learning_rate": 9.073616038274823e-06, + "loss": 0.0772, + "num_input_tokens_seen": 98956512, + "step": 45845 + }, + { + "epoch": 7.479608482871126, + "grad_norm": 0.8569888472557068, + "learning_rate": 9.068130014655665e-06, + "loss": 0.1944, + "num_input_tokens_seen": 98967520, + "step": 45850 + }, + { + "epoch": 7.480424143556281, + "grad_norm": 0.8189762830734253, + "learning_rate": 9.06264528255473e-06, + "loss": 0.2158, + "num_input_tokens_seen": 98977920, + "step": 45855 + }, + { + "epoch": 7.481239804241436, + "grad_norm": 0.05331939086318016, + "learning_rate": 9.057161842416628e-06, + "loss": 0.0565, + "num_input_tokens_seen": 98987392, + "step": 45860 + }, + { + "epoch": 7.4820554649265905, + "grad_norm": 0.31488558650016785, + "learning_rate": 9.051679694685885e-06, + "loss": 0.0311, + "num_input_tokens_seen": 98998304, + "step": 45865 + }, + { + "epoch": 7.482871125611745, + "grad_norm": 0.3325585722923279, + "learning_rate": 9.046198839806905e-06, + "loss": 0.1493, + "num_input_tokens_seen": 99008480, + "step": 45870 + }, + { + "epoch": 7.4836867862969, + "grad_norm": 0.7225121259689331, + "learning_rate": 9.040719278223997e-06, + "loss": 0.1958, + "num_input_tokens_seen": 99019584, + "step": 45875 + }, + { + "epoch": 7.484502446982056, + "grad_norm": 1.289097547531128, + "learning_rate": 9.03524101038136e-06, + "loss": 0.1856, + "num_input_tokens_seen": 99029920, + "step": 45880 + }, + { + "epoch": 7.485318107667211, + "grad_norm": 1.8541837930679321, + "learning_rate": 9.02976403672309e-06, + "loss": 0.3468, + "num_input_tokens_seen": 99040288, + "step": 45885 + }, + { + "epoch": 7.486133768352365, + "grad_norm": 0.4768669903278351, + "learning_rate": 9.02428835769318e-06, + "loss": 0.0792, + "num_input_tokens_seen": 99053024, + "step": 45890 + }, + { + "epoch": 7.48694942903752, + "grad_norm": 0.01757897436618805, + "learning_rate": 9.018813973735515e-06, + "loss": 0.0833, + "num_input_tokens_seen": 99062720, + "step": 45895 + }, + { + "epoch": 7.487765089722675, + "grad_norm": 0.4451879560947418, + "learning_rate": 9.013340885293878e-06, + "loss": 0.0438, + "num_input_tokens_seen": 99073728, + "step": 45900 + }, + { + "epoch": 7.488580750407831, + "grad_norm": 0.39969462156295776, + "learning_rate": 9.007869092811944e-06, + "loss": 0.3734, + "num_input_tokens_seen": 99084704, + "step": 45905 + }, + { + "epoch": 7.489396411092986, + "grad_norm": 2.107736825942993, + "learning_rate": 9.002398596733287e-06, + "loss": 0.4618, + "num_input_tokens_seen": 99095744, + "step": 45910 + }, + { + "epoch": 7.49021207177814, + "grad_norm": 0.24505989253520966, + "learning_rate": 8.996929397501366e-06, + "loss": 0.2116, + "num_input_tokens_seen": 99107168, + "step": 45915 + }, + { + "epoch": 7.491027732463295, + "grad_norm": 0.06736737489700317, + "learning_rate": 8.99146149555955e-06, + "loss": 0.1318, + "num_input_tokens_seen": 99118208, + "step": 45920 + }, + { + "epoch": 7.49184339314845, + "grad_norm": 0.16255341470241547, + "learning_rate": 8.98599489135109e-06, + "loss": 0.0271, + "num_input_tokens_seen": 99129152, + "step": 45925 + }, + { + "epoch": 7.492659053833605, + "grad_norm": 0.2621261179447174, + "learning_rate": 8.980529585319142e-06, + "loss": 0.0443, + "num_input_tokens_seen": 99140192, + "step": 45930 + }, + { + "epoch": 7.493474714518761, + "grad_norm": 1.9350792169570923, + "learning_rate": 8.975065577906735e-06, + "loss": 0.2212, + "num_input_tokens_seen": 99150368, + "step": 45935 + }, + { + "epoch": 7.494290375203915, + "grad_norm": 0.1252831220626831, + "learning_rate": 8.969602869556834e-06, + "loss": 0.128, + "num_input_tokens_seen": 99161728, + "step": 45940 + }, + { + "epoch": 7.49510603588907, + "grad_norm": 1.260060429573059, + "learning_rate": 8.964141460712258e-06, + "loss": 0.1583, + "num_input_tokens_seen": 99173024, + "step": 45945 + }, + { + "epoch": 7.495921696574225, + "grad_norm": 0.48726731538772583, + "learning_rate": 8.958681351815742e-06, + "loss": 0.1082, + "num_input_tokens_seen": 99183008, + "step": 45950 + }, + { + "epoch": 7.49673735725938, + "grad_norm": 0.0843067467212677, + "learning_rate": 8.95322254330991e-06, + "loss": 0.044, + "num_input_tokens_seen": 99194976, + "step": 45955 + }, + { + "epoch": 7.497553017944535, + "grad_norm": 1.67208993434906, + "learning_rate": 8.947765035637278e-06, + "loss": 0.134, + "num_input_tokens_seen": 99205760, + "step": 45960 + }, + { + "epoch": 7.49836867862969, + "grad_norm": 0.1647586226463318, + "learning_rate": 8.942308829240262e-06, + "loss": 0.0692, + "num_input_tokens_seen": 99215808, + "step": 45965 + }, + { + "epoch": 7.499184339314845, + "grad_norm": 0.32698506116867065, + "learning_rate": 8.936853924561167e-06, + "loss": 0.2036, + "num_input_tokens_seen": 99226848, + "step": 45970 + }, + { + "epoch": 7.5, + "grad_norm": 0.2649887800216675, + "learning_rate": 8.931400322042193e-06, + "loss": 0.1514, + "num_input_tokens_seen": 99237152, + "step": 45975 + }, + { + "epoch": 7.5, + "eval_loss": 0.13809734582901, + "eval_runtime": 131.8357, + "eval_samples_per_second": 20.67, + "eval_steps_per_second": 5.173, + "num_input_tokens_seen": 99237152, + "step": 45975 + }, + { + "epoch": 7.500815660685155, + "grad_norm": 1.4920328855514526, + "learning_rate": 8.925948022125446e-06, + "loss": 0.1984, + "num_input_tokens_seen": 99247936, + "step": 45980 + }, + { + "epoch": 7.50163132137031, + "grad_norm": 0.8355021476745605, + "learning_rate": 8.9204970252529e-06, + "loss": 0.0793, + "num_input_tokens_seen": 99259456, + "step": 45985 + }, + { + "epoch": 7.502446982055465, + "grad_norm": 0.12568634748458862, + "learning_rate": 8.91504733186646e-06, + "loss": 0.0309, + "num_input_tokens_seen": 99269760, + "step": 45990 + }, + { + "epoch": 7.50326264274062, + "grad_norm": 0.5867286920547485, + "learning_rate": 8.909598942407898e-06, + "loss": 0.0972, + "num_input_tokens_seen": 99280256, + "step": 45995 + }, + { + "epoch": 7.504078303425775, + "grad_norm": 0.29033464193344116, + "learning_rate": 8.904151857318888e-06, + "loss": 0.0875, + "num_input_tokens_seen": 99289600, + "step": 46000 + }, + { + "epoch": 7.50489396411093, + "grad_norm": 0.15746362507343292, + "learning_rate": 8.898706077040997e-06, + "loss": 0.2073, + "num_input_tokens_seen": 99300128, + "step": 46005 + }, + { + "epoch": 7.505709624796085, + "grad_norm": 1.0478769540786743, + "learning_rate": 8.893261602015687e-06, + "loss": 0.2408, + "num_input_tokens_seen": 99311680, + "step": 46010 + }, + { + "epoch": 7.506525285481239, + "grad_norm": 1.1233406066894531, + "learning_rate": 8.88781843268432e-06, + "loss": 0.2522, + "num_input_tokens_seen": 99322976, + "step": 46015 + }, + { + "epoch": 7.507340946166395, + "grad_norm": 0.3275778889656067, + "learning_rate": 8.88237656948814e-06, + "loss": 0.105, + "num_input_tokens_seen": 99335584, + "step": 46020 + }, + { + "epoch": 7.50815660685155, + "grad_norm": 0.5204315781593323, + "learning_rate": 8.876936012868297e-06, + "loss": 0.0513, + "num_input_tokens_seen": 99346528, + "step": 46025 + }, + { + "epoch": 7.508972267536705, + "grad_norm": 0.11750644445419312, + "learning_rate": 8.87149676326583e-06, + "loss": 0.0572, + "num_input_tokens_seen": 99357280, + "step": 46030 + }, + { + "epoch": 7.50978792822186, + "grad_norm": 0.2278793454170227, + "learning_rate": 8.866058821121667e-06, + "loss": 0.1147, + "num_input_tokens_seen": 99368192, + "step": 46035 + }, + { + "epoch": 7.510603588907014, + "grad_norm": 0.858636736869812, + "learning_rate": 8.860622186876632e-06, + "loss": 0.068, + "num_input_tokens_seen": 99379104, + "step": 46040 + }, + { + "epoch": 7.511419249592169, + "grad_norm": 1.4140982627868652, + "learning_rate": 8.855186860971462e-06, + "loss": 0.1956, + "num_input_tokens_seen": 99391200, + "step": 46045 + }, + { + "epoch": 7.512234910277325, + "grad_norm": 0.11296907812356949, + "learning_rate": 8.849752843846762e-06, + "loss": 0.0747, + "num_input_tokens_seen": 99402272, + "step": 46050 + }, + { + "epoch": 7.51305057096248, + "grad_norm": 2.127206563949585, + "learning_rate": 8.844320135943042e-06, + "loss": 0.108, + "num_input_tokens_seen": 99414016, + "step": 46055 + }, + { + "epoch": 7.513866231647635, + "grad_norm": 0.24093104898929596, + "learning_rate": 8.838888737700707e-06, + "loss": 0.1541, + "num_input_tokens_seen": 99423424, + "step": 46060 + }, + { + "epoch": 7.514681892332789, + "grad_norm": 1.6121189594268799, + "learning_rate": 8.833458649560051e-06, + "loss": 0.1477, + "num_input_tokens_seen": 99434592, + "step": 46065 + }, + { + "epoch": 7.515497553017944, + "grad_norm": 1.4315311908721924, + "learning_rate": 8.828029871961263e-06, + "loss": 0.0841, + "num_input_tokens_seen": 99444096, + "step": 46070 + }, + { + "epoch": 7.5163132137031, + "grad_norm": 0.20621904730796814, + "learning_rate": 8.82260240534443e-06, + "loss": 0.2134, + "num_input_tokens_seen": 99455328, + "step": 46075 + }, + { + "epoch": 7.517128874388255, + "grad_norm": 1.2829145193099976, + "learning_rate": 8.817176250149528e-06, + "loss": 0.2401, + "num_input_tokens_seen": 99465152, + "step": 46080 + }, + { + "epoch": 7.5179445350734095, + "grad_norm": 0.03961535170674324, + "learning_rate": 8.811751406816432e-06, + "loss": 0.0315, + "num_input_tokens_seen": 99475072, + "step": 46085 + }, + { + "epoch": 7.518760195758564, + "grad_norm": 1.5126948356628418, + "learning_rate": 8.806327875784906e-06, + "loss": 0.1497, + "num_input_tokens_seen": 99485344, + "step": 46090 + }, + { + "epoch": 7.519575856443719, + "grad_norm": 0.2562922239303589, + "learning_rate": 8.800905657494607e-06, + "loss": 0.1439, + "num_input_tokens_seen": 99496864, + "step": 46095 + }, + { + "epoch": 7.520391517128875, + "grad_norm": 0.28950026631355286, + "learning_rate": 8.795484752385088e-06, + "loss": 0.0566, + "num_input_tokens_seen": 99507520, + "step": 46100 + }, + { + "epoch": 7.52120717781403, + "grad_norm": 0.9753855466842651, + "learning_rate": 8.790065160895797e-06, + "loss": 0.1724, + "num_input_tokens_seen": 99517664, + "step": 46105 + }, + { + "epoch": 7.5220228384991845, + "grad_norm": 0.2799129784107208, + "learning_rate": 8.784646883466072e-06, + "loss": 0.071, + "num_input_tokens_seen": 99527392, + "step": 46110 + }, + { + "epoch": 7.522838499184339, + "grad_norm": 0.22417905926704407, + "learning_rate": 8.779229920535148e-06, + "loss": 0.1576, + "num_input_tokens_seen": 99538784, + "step": 46115 + }, + { + "epoch": 7.523654159869494, + "grad_norm": 1.9735370874404907, + "learning_rate": 8.77381427254215e-06, + "loss": 0.1717, + "num_input_tokens_seen": 99549024, + "step": 46120 + }, + { + "epoch": 7.524469820554649, + "grad_norm": 1.0602480173110962, + "learning_rate": 8.768399939926095e-06, + "loss": 0.1465, + "num_input_tokens_seen": 99560480, + "step": 46125 + }, + { + "epoch": 7.525285481239804, + "grad_norm": 1.304552674293518, + "learning_rate": 8.762986923125894e-06, + "loss": 0.1264, + "num_input_tokens_seen": 99571968, + "step": 46130 + }, + { + "epoch": 7.5261011419249595, + "grad_norm": 0.5220471620559692, + "learning_rate": 8.757575222580364e-06, + "loss": 0.1814, + "num_input_tokens_seen": 99582112, + "step": 46135 + }, + { + "epoch": 7.526916802610114, + "grad_norm": 0.2665942907333374, + "learning_rate": 8.752164838728203e-06, + "loss": 0.17, + "num_input_tokens_seen": 99591360, + "step": 46140 + }, + { + "epoch": 7.527732463295269, + "grad_norm": 1.3136950731277466, + "learning_rate": 8.746755772007998e-06, + "loss": 0.0897, + "num_input_tokens_seen": 99601696, + "step": 46145 + }, + { + "epoch": 7.528548123980424, + "grad_norm": 1.2421537637710571, + "learning_rate": 8.74134802285824e-06, + "loss": 0.2234, + "num_input_tokens_seen": 99612928, + "step": 46150 + }, + { + "epoch": 7.529363784665579, + "grad_norm": 0.9103395938873291, + "learning_rate": 8.735941591717297e-06, + "loss": 0.0942, + "num_input_tokens_seen": 99622720, + "step": 46155 + }, + { + "epoch": 7.5301794453507345, + "grad_norm": 1.0795632600784302, + "learning_rate": 8.730536479023463e-06, + "loss": 0.344, + "num_input_tokens_seen": 99633888, + "step": 46160 + }, + { + "epoch": 7.530995106035889, + "grad_norm": 1.5909829139709473, + "learning_rate": 8.72513268521489e-06, + "loss": 0.1159, + "num_input_tokens_seen": 99644960, + "step": 46165 + }, + { + "epoch": 7.531810766721044, + "grad_norm": 0.03683358430862427, + "learning_rate": 8.719730210729638e-06, + "loss": 0.1243, + "num_input_tokens_seen": 99656096, + "step": 46170 + }, + { + "epoch": 7.532626427406199, + "grad_norm": 2.4134154319763184, + "learning_rate": 8.714329056005663e-06, + "loss": 0.1473, + "num_input_tokens_seen": 99666752, + "step": 46175 + }, + { + "epoch": 7.533442088091354, + "grad_norm": 0.9764096140861511, + "learning_rate": 8.708929221480808e-06, + "loss": 0.1053, + "num_input_tokens_seen": 99678016, + "step": 46180 + }, + { + "epoch": 7.5342577487765094, + "grad_norm": 0.3184782564640045, + "learning_rate": 8.703530707592807e-06, + "loss": 0.0572, + "num_input_tokens_seen": 99687456, + "step": 46185 + }, + { + "epoch": 7.535073409461664, + "grad_norm": 0.8260094523429871, + "learning_rate": 8.698133514779297e-06, + "loss": 0.0666, + "num_input_tokens_seen": 99699584, + "step": 46190 + }, + { + "epoch": 7.535889070146819, + "grad_norm": 0.18512392044067383, + "learning_rate": 8.692737643477796e-06, + "loss": 0.0179, + "num_input_tokens_seen": 99710624, + "step": 46195 + }, + { + "epoch": 7.536704730831974, + "grad_norm": 2.0600924491882324, + "learning_rate": 8.687343094125726e-06, + "loss": 0.1066, + "num_input_tokens_seen": 99720704, + "step": 46200 + }, + { + "epoch": 7.537520391517129, + "grad_norm": 0.108528271317482, + "learning_rate": 8.681949867160396e-06, + "loss": 0.1663, + "num_input_tokens_seen": 99729984, + "step": 46205 + }, + { + "epoch": 7.5383360522022835, + "grad_norm": 0.06787717342376709, + "learning_rate": 8.676557963019005e-06, + "loss": 0.086, + "num_input_tokens_seen": 99741632, + "step": 46210 + }, + { + "epoch": 7.539151712887438, + "grad_norm": 0.07620181888341904, + "learning_rate": 8.67116738213865e-06, + "loss": 0.0865, + "num_input_tokens_seen": 99750688, + "step": 46215 + }, + { + "epoch": 7.539967373572594, + "grad_norm": 0.39011892676353455, + "learning_rate": 8.66577812495632e-06, + "loss": 0.1346, + "num_input_tokens_seen": 99761440, + "step": 46220 + }, + { + "epoch": 7.540783034257749, + "grad_norm": 0.8436107039451599, + "learning_rate": 8.660390191908892e-06, + "loss": 0.1298, + "num_input_tokens_seen": 99770912, + "step": 46225 + }, + { + "epoch": 7.541598694942904, + "grad_norm": 1.589601993560791, + "learning_rate": 8.655003583433144e-06, + "loss": 0.1088, + "num_input_tokens_seen": 99782368, + "step": 46230 + }, + { + "epoch": 7.5424143556280585, + "grad_norm": 0.878372073173523, + "learning_rate": 8.649618299965736e-06, + "loss": 0.085, + "num_input_tokens_seen": 99794560, + "step": 46235 + }, + { + "epoch": 7.543230016313213, + "grad_norm": 0.29169735312461853, + "learning_rate": 8.644234341943232e-06, + "loss": 0.0959, + "num_input_tokens_seen": 99805184, + "step": 46240 + }, + { + "epoch": 7.544045676998369, + "grad_norm": 0.3868658244609833, + "learning_rate": 8.638851709802082e-06, + "loss": 0.136, + "num_input_tokens_seen": 99816160, + "step": 46245 + }, + { + "epoch": 7.544861337683524, + "grad_norm": 0.3129299283027649, + "learning_rate": 8.633470403978625e-06, + "loss": 0.0642, + "num_input_tokens_seen": 99826784, + "step": 46250 + }, + { + "epoch": 7.545676998368679, + "grad_norm": 0.05447586998343468, + "learning_rate": 8.628090424909091e-06, + "loss": 0.019, + "num_input_tokens_seen": 99837920, + "step": 46255 + }, + { + "epoch": 7.5464926590538335, + "grad_norm": 1.360102891921997, + "learning_rate": 8.62271177302963e-06, + "loss": 0.1934, + "num_input_tokens_seen": 99848672, + "step": 46260 + }, + { + "epoch": 7.547308319738988, + "grad_norm": 0.22760915756225586, + "learning_rate": 8.617334448776246e-06, + "loss": 0.2301, + "num_input_tokens_seen": 99861216, + "step": 46265 + }, + { + "epoch": 7.548123980424144, + "grad_norm": 0.44699880480766296, + "learning_rate": 8.611958452584859e-06, + "loss": 0.1634, + "num_input_tokens_seen": 99870784, + "step": 46270 + }, + { + "epoch": 7.548939641109299, + "grad_norm": 0.5119531750679016, + "learning_rate": 8.60658378489127e-06, + "loss": 0.1245, + "num_input_tokens_seen": 99882272, + "step": 46275 + }, + { + "epoch": 7.549755301794454, + "grad_norm": 0.9036450386047363, + "learning_rate": 8.60121044613118e-06, + "loss": 0.1414, + "num_input_tokens_seen": 99892992, + "step": 46280 + }, + { + "epoch": 7.5505709624796085, + "grad_norm": 0.1372038871049881, + "learning_rate": 8.595838436740178e-06, + "loss": 0.0387, + "num_input_tokens_seen": 99903744, + "step": 46285 + }, + { + "epoch": 7.551386623164763, + "grad_norm": 0.19062724709510803, + "learning_rate": 8.590467757153744e-06, + "loss": 0.1336, + "num_input_tokens_seen": 99914688, + "step": 46290 + }, + { + "epoch": 7.552202283849918, + "grad_norm": 0.38412347435951233, + "learning_rate": 8.585098407807258e-06, + "loss": 0.0685, + "num_input_tokens_seen": 99925632, + "step": 46295 + }, + { + "epoch": 7.553017944535073, + "grad_norm": 1.1582070589065552, + "learning_rate": 8.579730389135973e-06, + "loss": 0.1662, + "num_input_tokens_seen": 99935552, + "step": 46300 + }, + { + "epoch": 7.553833605220229, + "grad_norm": 0.5458048582077026, + "learning_rate": 8.574363701575067e-06, + "loss": 0.0243, + "num_input_tokens_seen": 99946880, + "step": 46305 + }, + { + "epoch": 7.554649265905383, + "grad_norm": 1.5401179790496826, + "learning_rate": 8.568998345559581e-06, + "loss": 0.1674, + "num_input_tokens_seen": 99957792, + "step": 46310 + }, + { + "epoch": 7.555464926590538, + "grad_norm": 0.15892446041107178, + "learning_rate": 8.56363432152446e-06, + "loss": 0.2044, + "num_input_tokens_seen": 99967168, + "step": 46315 + }, + { + "epoch": 7.556280587275693, + "grad_norm": 0.5923933982849121, + "learning_rate": 8.55827162990454e-06, + "loss": 0.0704, + "num_input_tokens_seen": 99978784, + "step": 46320 + }, + { + "epoch": 7.557096247960848, + "grad_norm": 0.4311680197715759, + "learning_rate": 8.552910271134545e-06, + "loss": 0.1723, + "num_input_tokens_seen": 99990144, + "step": 46325 + }, + { + "epoch": 7.557911908646004, + "grad_norm": 0.23747500777244568, + "learning_rate": 8.547550245649095e-06, + "loss": 0.0175, + "num_input_tokens_seen": 100001568, + "step": 46330 + }, + { + "epoch": 7.558727569331158, + "grad_norm": 1.8785724639892578, + "learning_rate": 8.542191553882701e-06, + "loss": 0.1931, + "num_input_tokens_seen": 100012128, + "step": 46335 + }, + { + "epoch": 7.559543230016313, + "grad_norm": 0.09658266603946686, + "learning_rate": 8.536834196269766e-06, + "loss": 0.1562, + "num_input_tokens_seen": 100023936, + "step": 46340 + }, + { + "epoch": 7.560358890701468, + "grad_norm": 1.4577556848526, + "learning_rate": 8.531478173244583e-06, + "loss": 0.1014, + "num_input_tokens_seen": 100035552, + "step": 46345 + }, + { + "epoch": 7.561174551386623, + "grad_norm": 0.25668632984161377, + "learning_rate": 8.52612348524134e-06, + "loss": 0.0166, + "num_input_tokens_seen": 100045408, + "step": 46350 + }, + { + "epoch": 7.561990212071779, + "grad_norm": 0.25192809104919434, + "learning_rate": 8.520770132694118e-06, + "loss": 0.1814, + "num_input_tokens_seen": 100057184, + "step": 46355 + }, + { + "epoch": 7.562805872756933, + "grad_norm": 0.34579265117645264, + "learning_rate": 8.515418116036872e-06, + "loss": 0.0289, + "num_input_tokens_seen": 100068384, + "step": 46360 + }, + { + "epoch": 7.563621533442088, + "grad_norm": 0.8077057003974915, + "learning_rate": 8.510067435703484e-06, + "loss": 0.1919, + "num_input_tokens_seen": 100079488, + "step": 46365 + }, + { + "epoch": 7.564437194127243, + "grad_norm": 0.34325432777404785, + "learning_rate": 8.5047180921277e-06, + "loss": 0.1833, + "num_input_tokens_seen": 100090720, + "step": 46370 + }, + { + "epoch": 7.565252854812398, + "grad_norm": 0.9641119241714478, + "learning_rate": 8.499370085743163e-06, + "loss": 0.1041, + "num_input_tokens_seen": 100101248, + "step": 46375 + }, + { + "epoch": 7.566068515497553, + "grad_norm": 0.5151918530464172, + "learning_rate": 8.49402341698341e-06, + "loss": 0.0571, + "num_input_tokens_seen": 100112224, + "step": 46380 + }, + { + "epoch": 7.566884176182708, + "grad_norm": 0.21674425899982452, + "learning_rate": 8.48867808628187e-06, + "loss": 0.0928, + "num_input_tokens_seen": 100121632, + "step": 46385 + }, + { + "epoch": 7.567699836867863, + "grad_norm": 1.4788211584091187, + "learning_rate": 8.483334094071862e-06, + "loss": 0.0904, + "num_input_tokens_seen": 100133024, + "step": 46390 + }, + { + "epoch": 7.568515497553018, + "grad_norm": 0.4813409149646759, + "learning_rate": 8.477991440786597e-06, + "loss": 0.2175, + "num_input_tokens_seen": 100143008, + "step": 46395 + }, + { + "epoch": 7.569331158238173, + "grad_norm": 0.03813975676894188, + "learning_rate": 8.472650126859177e-06, + "loss": 0.1719, + "num_input_tokens_seen": 100153408, + "step": 46400 + }, + { + "epoch": 7.570146818923328, + "grad_norm": 1.126928448677063, + "learning_rate": 8.467310152722599e-06, + "loss": 0.2142, + "num_input_tokens_seen": 100163744, + "step": 46405 + }, + { + "epoch": 7.5709624796084825, + "grad_norm": 0.3292653560638428, + "learning_rate": 8.461971518809744e-06, + "loss": 0.0831, + "num_input_tokens_seen": 100172608, + "step": 46410 + }, + { + "epoch": 7.571778140293638, + "grad_norm": 0.16511443257331848, + "learning_rate": 8.456634225553389e-06, + "loss": 0.0503, + "num_input_tokens_seen": 100182400, + "step": 46415 + }, + { + "epoch": 7.572593800978793, + "grad_norm": 1.498277187347412, + "learning_rate": 8.451298273386207e-06, + "loss": 0.0747, + "num_input_tokens_seen": 100191936, + "step": 46420 + }, + { + "epoch": 7.573409461663948, + "grad_norm": 0.08042709529399872, + "learning_rate": 8.445963662740752e-06, + "loss": 0.1087, + "num_input_tokens_seen": 100202048, + "step": 46425 + }, + { + "epoch": 7.574225122349103, + "grad_norm": 0.26554739475250244, + "learning_rate": 8.440630394049479e-06, + "loss": 0.1237, + "num_input_tokens_seen": 100213632, + "step": 46430 + }, + { + "epoch": 7.575040783034257, + "grad_norm": 1.2171297073364258, + "learning_rate": 8.435298467744726e-06, + "loss": 0.22, + "num_input_tokens_seen": 100224096, + "step": 46435 + }, + { + "epoch": 7.575856443719413, + "grad_norm": 0.4434095323085785, + "learning_rate": 8.429967884258721e-06, + "loss": 0.1529, + "num_input_tokens_seen": 100234368, + "step": 46440 + }, + { + "epoch": 7.576672104404568, + "grad_norm": 0.04890967160463333, + "learning_rate": 8.424638644023603e-06, + "loss": 0.0581, + "num_input_tokens_seen": 100244256, + "step": 46445 + }, + { + "epoch": 7.577487765089723, + "grad_norm": 1.648612380027771, + "learning_rate": 8.419310747471377e-06, + "loss": 0.1085, + "num_input_tokens_seen": 100255360, + "step": 46450 + }, + { + "epoch": 7.578303425774878, + "grad_norm": 0.043325405567884445, + "learning_rate": 8.413984195033953e-06, + "loss": 0.1247, + "num_input_tokens_seen": 100265824, + "step": 46455 + }, + { + "epoch": 7.579119086460032, + "grad_norm": 0.17798182368278503, + "learning_rate": 8.408658987143125e-06, + "loss": 0.2373, + "num_input_tokens_seen": 100276512, + "step": 46460 + }, + { + "epoch": 7.579934747145187, + "grad_norm": 0.047117773443460464, + "learning_rate": 8.403335124230586e-06, + "loss": 0.0986, + "num_input_tokens_seen": 100286560, + "step": 46465 + }, + { + "epoch": 7.580750407830343, + "grad_norm": 1.0769424438476562, + "learning_rate": 8.3980126067279e-06, + "loss": 0.1231, + "num_input_tokens_seen": 100297856, + "step": 46470 + }, + { + "epoch": 7.581566068515498, + "grad_norm": 0.39586836099624634, + "learning_rate": 8.392691435066563e-06, + "loss": 0.1975, + "num_input_tokens_seen": 100308352, + "step": 46475 + }, + { + "epoch": 7.582381729200653, + "grad_norm": 1.208876609802246, + "learning_rate": 8.387371609677921e-06, + "loss": 0.0931, + "num_input_tokens_seen": 100317536, + "step": 46480 + }, + { + "epoch": 7.583197389885807, + "grad_norm": 0.0957469716668129, + "learning_rate": 8.382053130993226e-06, + "loss": 0.0265, + "num_input_tokens_seen": 100327808, + "step": 46485 + }, + { + "epoch": 7.584013050570962, + "grad_norm": 0.041857123374938965, + "learning_rate": 8.376735999443624e-06, + "loss": 0.0971, + "num_input_tokens_seen": 100339712, + "step": 46490 + }, + { + "epoch": 7.584828711256117, + "grad_norm": 1.1568468809127808, + "learning_rate": 8.371420215460149e-06, + "loss": 0.222, + "num_input_tokens_seen": 100349696, + "step": 46495 + }, + { + "epoch": 7.585644371941273, + "grad_norm": 0.7524036765098572, + "learning_rate": 8.366105779473723e-06, + "loss": 0.3112, + "num_input_tokens_seen": 100360160, + "step": 46500 + }, + { + "epoch": 7.5864600326264275, + "grad_norm": 0.7731418013572693, + "learning_rate": 8.360792691915163e-06, + "loss": 0.0546, + "num_input_tokens_seen": 100370848, + "step": 46505 + }, + { + "epoch": 7.587275693311582, + "grad_norm": 0.5715504884719849, + "learning_rate": 8.35548095321517e-06, + "loss": 0.0317, + "num_input_tokens_seen": 100381248, + "step": 46510 + }, + { + "epoch": 7.588091353996737, + "grad_norm": 2.0701725482940674, + "learning_rate": 8.350170563804349e-06, + "loss": 0.1096, + "num_input_tokens_seen": 100389888, + "step": 46515 + }, + { + "epoch": 7.588907014681892, + "grad_norm": 0.22756262123584747, + "learning_rate": 8.344861524113178e-06, + "loss": 0.0618, + "num_input_tokens_seen": 100399840, + "step": 46520 + }, + { + "epoch": 7.589722675367048, + "grad_norm": 0.843681812286377, + "learning_rate": 8.339553834572043e-06, + "loss": 0.1141, + "num_input_tokens_seen": 100410848, + "step": 46525 + }, + { + "epoch": 7.5905383360522025, + "grad_norm": 1.2214593887329102, + "learning_rate": 8.334247495611208e-06, + "loss": 0.2965, + "num_input_tokens_seen": 100422368, + "step": 46530 + }, + { + "epoch": 7.591353996737357, + "grad_norm": 0.1153625100851059, + "learning_rate": 8.32894250766083e-06, + "loss": 0.1115, + "num_input_tokens_seen": 100433504, + "step": 46535 + }, + { + "epoch": 7.592169657422512, + "grad_norm": 0.23628367483615875, + "learning_rate": 8.323638871150962e-06, + "loss": 0.0872, + "num_input_tokens_seen": 100443584, + "step": 46540 + }, + { + "epoch": 7.592985318107667, + "grad_norm": 1.4175723791122437, + "learning_rate": 8.31833658651154e-06, + "loss": 0.2408, + "num_input_tokens_seen": 100453216, + "step": 46545 + }, + { + "epoch": 7.593800978792823, + "grad_norm": 0.3972209095954895, + "learning_rate": 8.313035654172399e-06, + "loss": 0.058, + "num_input_tokens_seen": 100463296, + "step": 46550 + }, + { + "epoch": 7.5946166394779775, + "grad_norm": 1.0007461309432983, + "learning_rate": 8.307736074563257e-06, + "loss": 0.2921, + "num_input_tokens_seen": 100473728, + "step": 46555 + }, + { + "epoch": 7.595432300163132, + "grad_norm": 2.555741310119629, + "learning_rate": 8.302437848113722e-06, + "loss": 0.1166, + "num_input_tokens_seen": 100485344, + "step": 46560 + }, + { + "epoch": 7.596247960848287, + "grad_norm": 1.985058307647705, + "learning_rate": 8.297140975253302e-06, + "loss": 0.1498, + "num_input_tokens_seen": 100495552, + "step": 46565 + }, + { + "epoch": 7.597063621533442, + "grad_norm": 0.11035454273223877, + "learning_rate": 8.291845456411378e-06, + "loss": 0.0946, + "num_input_tokens_seen": 100506944, + "step": 46570 + }, + { + "epoch": 7.597879282218597, + "grad_norm": 0.8760508894920349, + "learning_rate": 8.286551292017233e-06, + "loss": 0.1227, + "num_input_tokens_seen": 100517760, + "step": 46575 + }, + { + "epoch": 7.598694942903752, + "grad_norm": 0.46874237060546875, + "learning_rate": 8.281258482500052e-06, + "loss": 0.0936, + "num_input_tokens_seen": 100528768, + "step": 46580 + }, + { + "epoch": 7.599510603588907, + "grad_norm": 0.5673197507858276, + "learning_rate": 8.275967028288886e-06, + "loss": 0.1, + "num_input_tokens_seen": 100539392, + "step": 46585 + }, + { + "epoch": 7.600326264274062, + "grad_norm": 0.3750561773777008, + "learning_rate": 8.270676929812692e-06, + "loss": 0.1043, + "num_input_tokens_seen": 100550176, + "step": 46590 + }, + { + "epoch": 7.601141924959217, + "grad_norm": 1.6376276016235352, + "learning_rate": 8.265388187500309e-06, + "loss": 0.1292, + "num_input_tokens_seen": 100561216, + "step": 46595 + }, + { + "epoch": 7.601957585644372, + "grad_norm": 0.19764047861099243, + "learning_rate": 8.26010080178047e-06, + "loss": 0.0355, + "num_input_tokens_seen": 100571648, + "step": 46600 + }, + { + "epoch": 7.602773246329527, + "grad_norm": 0.9700854420661926, + "learning_rate": 8.254814773081798e-06, + "loss": 0.1821, + "num_input_tokens_seen": 100583776, + "step": 46605 + }, + { + "epoch": 7.603588907014682, + "grad_norm": 0.3649517297744751, + "learning_rate": 8.249530101832795e-06, + "loss": 0.0463, + "num_input_tokens_seen": 100592992, + "step": 46610 + }, + { + "epoch": 7.604404567699837, + "grad_norm": 0.5723237991333008, + "learning_rate": 8.244246788461882e-06, + "loss": 0.1737, + "num_input_tokens_seen": 100603680, + "step": 46615 + }, + { + "epoch": 7.605220228384992, + "grad_norm": 2.153599977493286, + "learning_rate": 8.238964833397341e-06, + "loss": 0.2636, + "num_input_tokens_seen": 100615808, + "step": 46620 + }, + { + "epoch": 7.606035889070147, + "grad_norm": 1.969622015953064, + "learning_rate": 8.233684237067358e-06, + "loss": 0.2112, + "num_input_tokens_seen": 100625696, + "step": 46625 + }, + { + "epoch": 7.6068515497553015, + "grad_norm": 0.30095937848091125, + "learning_rate": 8.2284049999e-06, + "loss": 0.0694, + "num_input_tokens_seen": 100635968, + "step": 46630 + }, + { + "epoch": 7.607667210440457, + "grad_norm": 0.24662645161151886, + "learning_rate": 8.223127122323231e-06, + "loss": 0.1176, + "num_input_tokens_seen": 100647104, + "step": 46635 + }, + { + "epoch": 7.608482871125612, + "grad_norm": 0.11114316433668137, + "learning_rate": 8.217850604764903e-06, + "loss": 0.0157, + "num_input_tokens_seen": 100656992, + "step": 46640 + }, + { + "epoch": 7.609298531810767, + "grad_norm": 1.362558126449585, + "learning_rate": 8.212575447652757e-06, + "loss": 0.1329, + "num_input_tokens_seen": 100668288, + "step": 46645 + }, + { + "epoch": 7.610114192495922, + "grad_norm": 0.5582205057144165, + "learning_rate": 8.207301651414423e-06, + "loss": 0.1811, + "num_input_tokens_seen": 100680736, + "step": 46650 + }, + { + "epoch": 7.6109298531810765, + "grad_norm": 0.35586783289909363, + "learning_rate": 8.202029216477425e-06, + "loss": 0.0613, + "num_input_tokens_seen": 100692000, + "step": 46655 + }, + { + "epoch": 7.611745513866231, + "grad_norm": 1.5106310844421387, + "learning_rate": 8.196758143269168e-06, + "loss": 0.2732, + "num_input_tokens_seen": 100701440, + "step": 46660 + }, + { + "epoch": 7.612561174551386, + "grad_norm": 0.13264694809913635, + "learning_rate": 8.191488432216957e-06, + "loss": 0.1229, + "num_input_tokens_seen": 100712224, + "step": 46665 + }, + { + "epoch": 7.613376835236542, + "grad_norm": 0.8734698295593262, + "learning_rate": 8.186220083747975e-06, + "loss": 0.0752, + "num_input_tokens_seen": 100722912, + "step": 46670 + }, + { + "epoch": 7.614192495921697, + "grad_norm": 0.08990608155727386, + "learning_rate": 8.18095309828931e-06, + "loss": 0.103, + "num_input_tokens_seen": 100732384, + "step": 46675 + }, + { + "epoch": 7.6150081566068515, + "grad_norm": 1.2942273616790771, + "learning_rate": 8.175687476267915e-06, + "loss": 0.2193, + "num_input_tokens_seen": 100743328, + "step": 46680 + }, + { + "epoch": 7.615823817292006, + "grad_norm": 0.5781928300857544, + "learning_rate": 8.170423218110667e-06, + "loss": 0.0967, + "num_input_tokens_seen": 100754944, + "step": 46685 + }, + { + "epoch": 7.616639477977161, + "grad_norm": 0.7560697197914124, + "learning_rate": 8.165160324244305e-06, + "loss": 0.2754, + "num_input_tokens_seen": 100767200, + "step": 46690 + }, + { + "epoch": 7.617455138662317, + "grad_norm": 1.3477163314819336, + "learning_rate": 8.15989879509547e-06, + "loss": 0.1555, + "num_input_tokens_seen": 100777408, + "step": 46695 + }, + { + "epoch": 7.618270799347472, + "grad_norm": 0.132105752825737, + "learning_rate": 8.15463863109068e-06, + "loss": 0.2114, + "num_input_tokens_seen": 100787584, + "step": 46700 + }, + { + "epoch": 7.6190864600326265, + "grad_norm": 0.19837351143360138, + "learning_rate": 8.149379832656356e-06, + "loss": 0.0859, + "num_input_tokens_seen": 100796800, + "step": 46705 + }, + { + "epoch": 7.619902120717781, + "grad_norm": 0.21389202773571014, + "learning_rate": 8.144122400218804e-06, + "loss": 0.0248, + "num_input_tokens_seen": 100806528, + "step": 46710 + }, + { + "epoch": 7.620717781402936, + "grad_norm": 1.1308808326721191, + "learning_rate": 8.138866334204215e-06, + "loss": 0.0917, + "num_input_tokens_seen": 100817120, + "step": 46715 + }, + { + "epoch": 7.621533442088092, + "grad_norm": 2.085279703140259, + "learning_rate": 8.133611635038674e-06, + "loss": 0.2883, + "num_input_tokens_seen": 100828000, + "step": 46720 + }, + { + "epoch": 7.622349102773247, + "grad_norm": 0.10313961654901505, + "learning_rate": 8.12835830314815e-06, + "loss": 0.0223, + "num_input_tokens_seen": 100839424, + "step": 46725 + }, + { + "epoch": 7.623164763458401, + "grad_norm": 0.7369300723075867, + "learning_rate": 8.123106338958511e-06, + "loss": 0.092, + "num_input_tokens_seen": 100849472, + "step": 46730 + }, + { + "epoch": 7.623980424143556, + "grad_norm": 1.2866710424423218, + "learning_rate": 8.117855742895506e-06, + "loss": 0.0719, + "num_input_tokens_seen": 100859232, + "step": 46735 + }, + { + "epoch": 7.624796084828711, + "grad_norm": 0.5231333374977112, + "learning_rate": 8.112606515384772e-06, + "loss": 0.0833, + "num_input_tokens_seen": 100869888, + "step": 46740 + }, + { + "epoch": 7.625611745513866, + "grad_norm": 0.35658320784568787, + "learning_rate": 8.107358656851838e-06, + "loss": 0.2098, + "num_input_tokens_seen": 100881440, + "step": 46745 + }, + { + "epoch": 7.626427406199021, + "grad_norm": 0.8316605687141418, + "learning_rate": 8.102112167722125e-06, + "loss": 0.0971, + "num_input_tokens_seen": 100892160, + "step": 46750 + }, + { + "epoch": 7.627243066884176, + "grad_norm": 0.3301803767681122, + "learning_rate": 8.096867048420932e-06, + "loss": 0.0967, + "num_input_tokens_seen": 100901280, + "step": 46755 + }, + { + "epoch": 7.628058727569331, + "grad_norm": 1.8661962747573853, + "learning_rate": 8.091623299373467e-06, + "loss": 0.2086, + "num_input_tokens_seen": 100913440, + "step": 46760 + }, + { + "epoch": 7.628874388254486, + "grad_norm": 0.062074366956949234, + "learning_rate": 8.08638092100481e-06, + "loss": 0.1986, + "num_input_tokens_seen": 100923840, + "step": 46765 + }, + { + "epoch": 7.629690048939641, + "grad_norm": 0.5463224053382874, + "learning_rate": 8.081139913739936e-06, + "loss": 0.0959, + "num_input_tokens_seen": 100933824, + "step": 46770 + }, + { + "epoch": 7.630505709624796, + "grad_norm": 0.23783616721630096, + "learning_rate": 8.075900278003703e-06, + "loss": 0.0535, + "num_input_tokens_seen": 100943424, + "step": 46775 + }, + { + "epoch": 7.631321370309951, + "grad_norm": 0.3520021438598633, + "learning_rate": 8.07066201422087e-06, + "loss": 0.1535, + "num_input_tokens_seen": 100955040, + "step": 46780 + }, + { + "epoch": 7.632137030995106, + "grad_norm": 1.652519702911377, + "learning_rate": 8.065425122816061e-06, + "loss": 0.0991, + "num_input_tokens_seen": 100966848, + "step": 46785 + }, + { + "epoch": 7.632952691680261, + "grad_norm": 0.1303957998752594, + "learning_rate": 8.060189604213827e-06, + "loss": 0.0948, + "num_input_tokens_seen": 100977888, + "step": 46790 + }, + { + "epoch": 7.633768352365416, + "grad_norm": 0.17571304738521576, + "learning_rate": 8.054955458838576e-06, + "loss": 0.0382, + "num_input_tokens_seen": 100988736, + "step": 46795 + }, + { + "epoch": 7.634584013050571, + "grad_norm": 0.7385642528533936, + "learning_rate": 8.049722687114611e-06, + "loss": 0.1132, + "num_input_tokens_seen": 100999360, + "step": 46800 + }, + { + "epoch": 7.635399673735726, + "grad_norm": 0.47059640288352966, + "learning_rate": 8.044491289466133e-06, + "loss": 0.0365, + "num_input_tokens_seen": 101011104, + "step": 46805 + }, + { + "epoch": 7.636215334420881, + "grad_norm": 0.3823755085468292, + "learning_rate": 8.039261266317219e-06, + "loss": 0.0319, + "num_input_tokens_seen": 101021664, + "step": 46810 + }, + { + "epoch": 7.637030995106036, + "grad_norm": 0.15938326716423035, + "learning_rate": 8.034032618091846e-06, + "loss": 0.1127, + "num_input_tokens_seen": 101032576, + "step": 46815 + }, + { + "epoch": 7.637846655791191, + "grad_norm": 0.5109442472457886, + "learning_rate": 8.028805345213875e-06, + "loss": 0.0964, + "num_input_tokens_seen": 101043040, + "step": 46820 + }, + { + "epoch": 7.638662316476346, + "grad_norm": 0.1814226508140564, + "learning_rate": 8.023579448107053e-06, + "loss": 0.2567, + "num_input_tokens_seen": 101054400, + "step": 46825 + }, + { + "epoch": 7.6394779771615005, + "grad_norm": 0.4146941006183624, + "learning_rate": 8.018354927195017e-06, + "loss": 0.0351, + "num_input_tokens_seen": 101064896, + "step": 46830 + }, + { + "epoch": 7.640293637846656, + "grad_norm": 0.9155624508857727, + "learning_rate": 8.013131782901295e-06, + "loss": 0.0748, + "num_input_tokens_seen": 101074848, + "step": 46835 + }, + { + "epoch": 7.641109298531811, + "grad_norm": 0.14113570749759674, + "learning_rate": 8.007910015649304e-06, + "loss": 0.1388, + "num_input_tokens_seen": 101085312, + "step": 46840 + }, + { + "epoch": 7.641924959216966, + "grad_norm": 0.3883732557296753, + "learning_rate": 8.002689625862342e-06, + "loss": 0.0169, + "num_input_tokens_seen": 101095584, + "step": 46845 + }, + { + "epoch": 7.642740619902121, + "grad_norm": 0.0954550951719284, + "learning_rate": 7.997470613963601e-06, + "loss": 0.0097, + "num_input_tokens_seen": 101105760, + "step": 46850 + }, + { + "epoch": 7.643556280587275, + "grad_norm": 0.0895189419388771, + "learning_rate": 7.992252980376164e-06, + "loss": 0.0776, + "num_input_tokens_seen": 101116576, + "step": 46855 + }, + { + "epoch": 7.64437194127243, + "grad_norm": 0.9067518711090088, + "learning_rate": 7.987036725522995e-06, + "loss": 0.0742, + "num_input_tokens_seen": 101127840, + "step": 46860 + }, + { + "epoch": 7.645187601957586, + "grad_norm": 0.3612946569919586, + "learning_rate": 7.981821849826954e-06, + "loss": 0.146, + "num_input_tokens_seen": 101140192, + "step": 46865 + }, + { + "epoch": 7.646003262642741, + "grad_norm": 1.2164055109024048, + "learning_rate": 7.976608353710782e-06, + "loss": 0.1921, + "num_input_tokens_seen": 101150336, + "step": 46870 + }, + { + "epoch": 7.646818923327896, + "grad_norm": 0.13758468627929688, + "learning_rate": 7.971396237597114e-06, + "loss": 0.0724, + "num_input_tokens_seen": 101161088, + "step": 46875 + }, + { + "epoch": 7.64763458401305, + "grad_norm": 1.027360439300537, + "learning_rate": 7.966185501908469e-06, + "loss": 0.0682, + "num_input_tokens_seen": 101171776, + "step": 46880 + }, + { + "epoch": 7.648450244698205, + "grad_norm": 0.09612219035625458, + "learning_rate": 7.960976147067254e-06, + "loss": 0.0287, + "num_input_tokens_seen": 101180544, + "step": 46885 + }, + { + "epoch": 7.649265905383361, + "grad_norm": 0.6849896907806396, + "learning_rate": 7.95576817349577e-06, + "loss": 0.0466, + "num_input_tokens_seen": 101192448, + "step": 46890 + }, + { + "epoch": 7.650081566068516, + "grad_norm": 0.35581421852111816, + "learning_rate": 7.95056158161619e-06, + "loss": 0.0944, + "num_input_tokens_seen": 101203168, + "step": 46895 + }, + { + "epoch": 7.650897226753671, + "grad_norm": 1.4403477907180786, + "learning_rate": 7.945356371850604e-06, + "loss": 0.1041, + "num_input_tokens_seen": 101213216, + "step": 46900 + }, + { + "epoch": 7.651712887438825, + "grad_norm": 0.6876615881919861, + "learning_rate": 7.940152544620966e-06, + "loss": 0.1219, + "num_input_tokens_seen": 101224960, + "step": 46905 + }, + { + "epoch": 7.65252854812398, + "grad_norm": 1.518269658088684, + "learning_rate": 7.934950100349123e-06, + "loss": 0.1167, + "num_input_tokens_seen": 101235296, + "step": 46910 + }, + { + "epoch": 7.653344208809135, + "grad_norm": 1.50341796875, + "learning_rate": 7.929749039456813e-06, + "loss": 0.1685, + "num_input_tokens_seen": 101246784, + "step": 46915 + }, + { + "epoch": 7.654159869494291, + "grad_norm": 0.18854659795761108, + "learning_rate": 7.924549362365658e-06, + "loss": 0.125, + "num_input_tokens_seen": 101257440, + "step": 46920 + }, + { + "epoch": 7.6549755301794455, + "grad_norm": 0.09270556271076202, + "learning_rate": 7.919351069497163e-06, + "loss": 0.0527, + "num_input_tokens_seen": 101268672, + "step": 46925 + }, + { + "epoch": 7.6557911908646, + "grad_norm": 0.04600043222308159, + "learning_rate": 7.914154161272746e-06, + "loss": 0.1032, + "num_input_tokens_seen": 101279168, + "step": 46930 + }, + { + "epoch": 7.656606851549755, + "grad_norm": 3.094905138015747, + "learning_rate": 7.908958638113687e-06, + "loss": 0.2992, + "num_input_tokens_seen": 101290336, + "step": 46935 + }, + { + "epoch": 7.65742251223491, + "grad_norm": 0.0710255354642868, + "learning_rate": 7.903764500441157e-06, + "loss": 0.2248, + "num_input_tokens_seen": 101301344, + "step": 46940 + }, + { + "epoch": 7.658238172920065, + "grad_norm": 0.102028988301754, + "learning_rate": 7.898571748676223e-06, + "loss": 0.2459, + "num_input_tokens_seen": 101312160, + "step": 46945 + }, + { + "epoch": 7.6590538336052205, + "grad_norm": 0.3593699038028717, + "learning_rate": 7.893380383239835e-06, + "loss": 0.1006, + "num_input_tokens_seen": 101322720, + "step": 46950 + }, + { + "epoch": 7.659869494290375, + "grad_norm": 0.33454087376594543, + "learning_rate": 7.888190404552832e-06, + "loss": 0.0802, + "num_input_tokens_seen": 101334048, + "step": 46955 + }, + { + "epoch": 7.66068515497553, + "grad_norm": 1.5570753812789917, + "learning_rate": 7.883001813035937e-06, + "loss": 0.2389, + "num_input_tokens_seen": 101345760, + "step": 46960 + }, + { + "epoch": 7.661500815660685, + "grad_norm": 1.8494471311569214, + "learning_rate": 7.877814609109769e-06, + "loss": 0.1389, + "num_input_tokens_seen": 101356416, + "step": 46965 + }, + { + "epoch": 7.66231647634584, + "grad_norm": 1.3086097240447998, + "learning_rate": 7.872628793194823e-06, + "loss": 0.2226, + "num_input_tokens_seen": 101367360, + "step": 46970 + }, + { + "epoch": 7.6631321370309955, + "grad_norm": 0.29184749722480774, + "learning_rate": 7.86744436571149e-06, + "loss": 0.0344, + "num_input_tokens_seen": 101378848, + "step": 46975 + }, + { + "epoch": 7.66394779771615, + "grad_norm": 0.10088274627923965, + "learning_rate": 7.86226132708005e-06, + "loss": 0.1098, + "num_input_tokens_seen": 101389184, + "step": 46980 + }, + { + "epoch": 7.664763458401305, + "grad_norm": 1.5318970680236816, + "learning_rate": 7.85707967772066e-06, + "loss": 0.2131, + "num_input_tokens_seen": 101399840, + "step": 46985 + }, + { + "epoch": 7.66557911908646, + "grad_norm": 0.43490245938301086, + "learning_rate": 7.851899418053374e-06, + "loss": 0.2201, + "num_input_tokens_seen": 101410112, + "step": 46990 + }, + { + "epoch": 7.666394779771615, + "grad_norm": 0.5858300924301147, + "learning_rate": 7.846720548498132e-06, + "loss": 0.1214, + "num_input_tokens_seen": 101420224, + "step": 46995 + }, + { + "epoch": 7.6672104404567705, + "grad_norm": 0.6748706102371216, + "learning_rate": 7.841543069474747e-06, + "loss": 0.0415, + "num_input_tokens_seen": 101430080, + "step": 47000 + }, + { + "epoch": 7.668026101141925, + "grad_norm": 0.169643372297287, + "learning_rate": 7.836366981402951e-06, + "loss": 0.0756, + "num_input_tokens_seen": 101441856, + "step": 47005 + }, + { + "epoch": 7.66884176182708, + "grad_norm": 2.053367853164673, + "learning_rate": 7.831192284702334e-06, + "loss": 0.2308, + "num_input_tokens_seen": 101451232, + "step": 47010 + }, + { + "epoch": 7.669657422512235, + "grad_norm": 1.0488523244857788, + "learning_rate": 7.826018979792385e-06, + "loss": 0.1496, + "num_input_tokens_seen": 101461792, + "step": 47015 + }, + { + "epoch": 7.67047308319739, + "grad_norm": 1.5837363004684448, + "learning_rate": 7.820847067092477e-06, + "loss": 0.2774, + "num_input_tokens_seen": 101472096, + "step": 47020 + }, + { + "epoch": 7.671288743882545, + "grad_norm": 1.3263866901397705, + "learning_rate": 7.815676547021871e-06, + "loss": 0.1565, + "num_input_tokens_seen": 101482784, + "step": 47025 + }, + { + "epoch": 7.672104404567699, + "grad_norm": 0.47558850049972534, + "learning_rate": 7.810507419999716e-06, + "loss": 0.1067, + "num_input_tokens_seen": 101493248, + "step": 47030 + }, + { + "epoch": 7.672920065252855, + "grad_norm": 0.10663735866546631, + "learning_rate": 7.805339686445051e-06, + "loss": 0.0322, + "num_input_tokens_seen": 101504128, + "step": 47035 + }, + { + "epoch": 7.67373572593801, + "grad_norm": 0.6767993569374084, + "learning_rate": 7.800173346776793e-06, + "loss": 0.0713, + "num_input_tokens_seen": 101515808, + "step": 47040 + }, + { + "epoch": 7.674551386623165, + "grad_norm": 0.07184144854545593, + "learning_rate": 7.795008401413756e-06, + "loss": 0.151, + "num_input_tokens_seen": 101526976, + "step": 47045 + }, + { + "epoch": 7.6753670473083195, + "grad_norm": 0.5002629160881042, + "learning_rate": 7.789844850774636e-06, + "loss": 0.0472, + "num_input_tokens_seen": 101537440, + "step": 47050 + }, + { + "epoch": 7.676182707993474, + "grad_norm": 0.2892155349254608, + "learning_rate": 7.784682695278014e-06, + "loss": 0.2567, + "num_input_tokens_seen": 101549280, + "step": 47055 + }, + { + "epoch": 7.67699836867863, + "grad_norm": 0.177652508020401, + "learning_rate": 7.779521935342363e-06, + "loss": 0.0271, + "num_input_tokens_seen": 101558720, + "step": 47060 + }, + { + "epoch": 7.677814029363785, + "grad_norm": 0.09191443026065826, + "learning_rate": 7.77436257138604e-06, + "loss": 0.1941, + "num_input_tokens_seen": 101570400, + "step": 47065 + }, + { + "epoch": 7.67862969004894, + "grad_norm": 1.389002799987793, + "learning_rate": 7.769204603827282e-06, + "loss": 0.1318, + "num_input_tokens_seen": 101581472, + "step": 47070 + }, + { + "epoch": 7.6794453507340945, + "grad_norm": 0.037289489060640335, + "learning_rate": 7.764048033084235e-06, + "loss": 0.0523, + "num_input_tokens_seen": 101592480, + "step": 47075 + }, + { + "epoch": 7.680261011419249, + "grad_norm": 0.9277165532112122, + "learning_rate": 7.758892859574906e-06, + "loss": 0.1093, + "num_input_tokens_seen": 101601952, + "step": 47080 + }, + { + "epoch": 7.681076672104405, + "grad_norm": 1.293256402015686, + "learning_rate": 7.753739083717204e-06, + "loss": 0.1503, + "num_input_tokens_seen": 101612384, + "step": 47085 + }, + { + "epoch": 7.68189233278956, + "grad_norm": 0.6359534859657288, + "learning_rate": 7.748586705928917e-06, + "loss": 0.0772, + "num_input_tokens_seen": 101623872, + "step": 47090 + }, + { + "epoch": 7.682707993474715, + "grad_norm": 0.23076240718364716, + "learning_rate": 7.743435726627726e-06, + "loss": 0.135, + "num_input_tokens_seen": 101635072, + "step": 47095 + }, + { + "epoch": 7.6835236541598695, + "grad_norm": 0.2837105989456177, + "learning_rate": 7.738286146231194e-06, + "loss": 0.1172, + "num_input_tokens_seen": 101644896, + "step": 47100 + }, + { + "epoch": 7.684339314845024, + "grad_norm": 0.06649543344974518, + "learning_rate": 7.733137965156764e-06, + "loss": 0.0477, + "num_input_tokens_seen": 101655584, + "step": 47105 + }, + { + "epoch": 7.685154975530179, + "grad_norm": 0.46846479177474976, + "learning_rate": 7.727991183821792e-06, + "loss": 0.0558, + "num_input_tokens_seen": 101666912, + "step": 47110 + }, + { + "epoch": 7.685970636215334, + "grad_norm": 0.04389543831348419, + "learning_rate": 7.722845802643489e-06, + "loss": 0.0239, + "num_input_tokens_seen": 101676768, + "step": 47115 + }, + { + "epoch": 7.68678629690049, + "grad_norm": 0.24085885286331177, + "learning_rate": 7.71770182203897e-06, + "loss": 0.0592, + "num_input_tokens_seen": 101687936, + "step": 47120 + }, + { + "epoch": 7.6876019575856445, + "grad_norm": 1.7252795696258545, + "learning_rate": 7.71255924242523e-06, + "loss": 0.3063, + "num_input_tokens_seen": 101698592, + "step": 47125 + }, + { + "epoch": 7.688417618270799, + "grad_norm": 2.502321243286133, + "learning_rate": 7.707418064219152e-06, + "loss": 0.1583, + "num_input_tokens_seen": 101709312, + "step": 47130 + }, + { + "epoch": 7.689233278955954, + "grad_norm": 0.09914538264274597, + "learning_rate": 7.702278287837509e-06, + "loss": 0.173, + "num_input_tokens_seen": 101720512, + "step": 47135 + }, + { + "epoch": 7.690048939641109, + "grad_norm": 0.3995407223701477, + "learning_rate": 7.697139913696955e-06, + "loss": 0.1612, + "num_input_tokens_seen": 101732640, + "step": 47140 + }, + { + "epoch": 7.690864600326265, + "grad_norm": 0.23168981075286865, + "learning_rate": 7.692002942214035e-06, + "loss": 0.0944, + "num_input_tokens_seen": 101742400, + "step": 47145 + }, + { + "epoch": 7.691680261011419, + "grad_norm": 0.18768586218357086, + "learning_rate": 7.686867373805176e-06, + "loss": 0.2894, + "num_input_tokens_seen": 101751936, + "step": 47150 + }, + { + "epoch": 7.692495921696574, + "grad_norm": 1.5907188653945923, + "learning_rate": 7.681733208886693e-06, + "loss": 0.0781, + "num_input_tokens_seen": 101763136, + "step": 47155 + }, + { + "epoch": 7.693311582381729, + "grad_norm": 0.5353593826293945, + "learning_rate": 7.676600447874788e-06, + "loss": 0.0839, + "num_input_tokens_seen": 101773440, + "step": 47160 + }, + { + "epoch": 7.694127243066884, + "grad_norm": 0.24889563024044037, + "learning_rate": 7.67146909118555e-06, + "loss": 0.1652, + "num_input_tokens_seen": 101783264, + "step": 47165 + }, + { + "epoch": 7.69494290375204, + "grad_norm": 0.2768930494785309, + "learning_rate": 7.666339139234949e-06, + "loss": 0.0669, + "num_input_tokens_seen": 101793856, + "step": 47170 + }, + { + "epoch": 7.695758564437194, + "grad_norm": 0.1455915868282318, + "learning_rate": 7.66121059243885e-06, + "loss": 0.1529, + "num_input_tokens_seen": 101804832, + "step": 47175 + }, + { + "epoch": 7.696574225122349, + "grad_norm": 0.3356066346168518, + "learning_rate": 7.656083451212995e-06, + "loss": 0.1083, + "num_input_tokens_seen": 101815808, + "step": 47180 + }, + { + "epoch": 7.697389885807504, + "grad_norm": 0.10774309933185577, + "learning_rate": 7.650957715973017e-06, + "loss": 0.1235, + "num_input_tokens_seen": 101825120, + "step": 47185 + }, + { + "epoch": 7.698205546492659, + "grad_norm": 0.7152142524719238, + "learning_rate": 7.645833387134437e-06, + "loss": 0.1369, + "num_input_tokens_seen": 101836416, + "step": 47190 + }, + { + "epoch": 7.699021207177814, + "grad_norm": 0.3090754449367523, + "learning_rate": 7.640710465112654e-06, + "loss": 0.1017, + "num_input_tokens_seen": 101846240, + "step": 47195 + }, + { + "epoch": 7.699836867862969, + "grad_norm": 0.22152604162693024, + "learning_rate": 7.635588950322964e-06, + "loss": 0.121, + "num_input_tokens_seen": 101857152, + "step": 47200 + }, + { + "epoch": 7.700652528548124, + "grad_norm": 0.3910209834575653, + "learning_rate": 7.630468843180538e-06, + "loss": 0.1005, + "num_input_tokens_seen": 101867296, + "step": 47205 + }, + { + "epoch": 7.701468189233279, + "grad_norm": 0.5700939893722534, + "learning_rate": 7.625350144100441e-06, + "loss": 0.0983, + "num_input_tokens_seen": 101876000, + "step": 47210 + }, + { + "epoch": 7.702283849918434, + "grad_norm": 0.8956632018089294, + "learning_rate": 7.620232853497611e-06, + "loss": 0.0376, + "num_input_tokens_seen": 101886080, + "step": 47215 + }, + { + "epoch": 7.703099510603589, + "grad_norm": 0.5885664224624634, + "learning_rate": 7.615116971786895e-06, + "loss": 0.1635, + "num_input_tokens_seen": 101896480, + "step": 47220 + }, + { + "epoch": 7.7039151712887435, + "grad_norm": 0.037790317088365555, + "learning_rate": 7.610002499383012e-06, + "loss": 0.0381, + "num_input_tokens_seen": 101907136, + "step": 47225 + }, + { + "epoch": 7.704730831973899, + "grad_norm": 0.14260315895080566, + "learning_rate": 7.60488943670056e-06, + "loss": 0.1096, + "num_input_tokens_seen": 101918624, + "step": 47230 + }, + { + "epoch": 7.705546492659054, + "grad_norm": 0.14199811220169067, + "learning_rate": 7.59977778415403e-06, + "loss": 0.0567, + "num_input_tokens_seen": 101929408, + "step": 47235 + }, + { + "epoch": 7.706362153344209, + "grad_norm": 0.08122411370277405, + "learning_rate": 7.594667542157796e-06, + "loss": 0.1268, + "num_input_tokens_seen": 101940736, + "step": 47240 + }, + { + "epoch": 7.707177814029364, + "grad_norm": 0.6955691576004028, + "learning_rate": 7.5895587111261325e-06, + "loss": 0.1221, + "num_input_tokens_seen": 101950848, + "step": 47245 + }, + { + "epoch": 7.7079934747145185, + "grad_norm": 0.1297200471162796, + "learning_rate": 7.584451291473177e-06, + "loss": 0.07, + "num_input_tokens_seen": 101961376, + "step": 47250 + }, + { + "epoch": 7.708809135399674, + "grad_norm": 0.16632871329784393, + "learning_rate": 7.579345283612968e-06, + "loss": 0.1023, + "num_input_tokens_seen": 101972448, + "step": 47255 + }, + { + "epoch": 7.709624796084829, + "grad_norm": 0.2864798903465271, + "learning_rate": 7.574240687959422e-06, + "loss": 0.0285, + "num_input_tokens_seen": 101983712, + "step": 47260 + }, + { + "epoch": 7.710440456769984, + "grad_norm": 0.6627798080444336, + "learning_rate": 7.56913750492634e-06, + "loss": 0.1304, + "num_input_tokens_seen": 101993792, + "step": 47265 + }, + { + "epoch": 7.711256117455139, + "grad_norm": 0.41795042157173157, + "learning_rate": 7.564035734927419e-06, + "loss": 0.0621, + "num_input_tokens_seen": 102004800, + "step": 47270 + }, + { + "epoch": 7.712071778140293, + "grad_norm": 0.7342434525489807, + "learning_rate": 7.558935378376228e-06, + "loss": 0.175, + "num_input_tokens_seen": 102015936, + "step": 47275 + }, + { + "epoch": 7.712887438825448, + "grad_norm": 0.32929569482803345, + "learning_rate": 7.553836435686232e-06, + "loss": 0.2224, + "num_input_tokens_seen": 102026592, + "step": 47280 + }, + { + "epoch": 7.713703099510604, + "grad_norm": 1.0595299005508423, + "learning_rate": 7.5487389072707744e-06, + "loss": 0.0628, + "num_input_tokens_seen": 102036448, + "step": 47285 + }, + { + "epoch": 7.714518760195759, + "grad_norm": 1.4902387857437134, + "learning_rate": 7.543642793543088e-06, + "loss": 0.2005, + "num_input_tokens_seen": 102048800, + "step": 47290 + }, + { + "epoch": 7.715334420880914, + "grad_norm": 0.5227441787719727, + "learning_rate": 7.53854809491629e-06, + "loss": 0.053, + "num_input_tokens_seen": 102060000, + "step": 47295 + }, + { + "epoch": 7.716150081566068, + "grad_norm": 0.12323256582021713, + "learning_rate": 7.533454811803381e-06, + "loss": 0.0327, + "num_input_tokens_seen": 102071104, + "step": 47300 + }, + { + "epoch": 7.716965742251223, + "grad_norm": 1.1839922666549683, + "learning_rate": 7.528362944617251e-06, + "loss": 0.1747, + "num_input_tokens_seen": 102082880, + "step": 47305 + }, + { + "epoch": 7.717781402936378, + "grad_norm": 0.06292026489973068, + "learning_rate": 7.523272493770669e-06, + "loss": 0.0844, + "num_input_tokens_seen": 102094816, + "step": 47310 + }, + { + "epoch": 7.718597063621534, + "grad_norm": 0.32961881160736084, + "learning_rate": 7.518183459676295e-06, + "loss": 0.1378, + "num_input_tokens_seen": 102105632, + "step": 47315 + }, + { + "epoch": 7.719412724306689, + "grad_norm": 1.8490698337554932, + "learning_rate": 7.513095842746665e-06, + "loss": 0.1737, + "num_input_tokens_seen": 102115808, + "step": 47320 + }, + { + "epoch": 7.720228384991843, + "grad_norm": 1.0315146446228027, + "learning_rate": 7.5080096433942204e-06, + "loss": 0.1011, + "num_input_tokens_seen": 102127392, + "step": 47325 + }, + { + "epoch": 7.721044045676998, + "grad_norm": 0.9688011407852173, + "learning_rate": 7.502924862031269e-06, + "loss": 0.0694, + "num_input_tokens_seen": 102139136, + "step": 47330 + }, + { + "epoch": 7.721859706362153, + "grad_norm": 0.04519866406917572, + "learning_rate": 7.497841499070005e-06, + "loss": 0.2321, + "num_input_tokens_seen": 102150688, + "step": 47335 + }, + { + "epoch": 7.722675367047309, + "grad_norm": 1.1805208921432495, + "learning_rate": 7.4927595549225155e-06, + "loss": 0.4057, + "num_input_tokens_seen": 102161568, + "step": 47340 + }, + { + "epoch": 7.7234910277324635, + "grad_norm": 0.2329874187707901, + "learning_rate": 7.487679030000769e-06, + "loss": 0.0223, + "num_input_tokens_seen": 102173440, + "step": 47345 + }, + { + "epoch": 7.724306688417618, + "grad_norm": 0.06121927499771118, + "learning_rate": 7.482599924716613e-06, + "loss": 0.0251, + "num_input_tokens_seen": 102185376, + "step": 47350 + }, + { + "epoch": 7.725122349102773, + "grad_norm": 0.4383679926395416, + "learning_rate": 7.477522239481793e-06, + "loss": 0.0406, + "num_input_tokens_seen": 102195456, + "step": 47355 + }, + { + "epoch": 7.725938009787928, + "grad_norm": 1.5280767679214478, + "learning_rate": 7.472445974707928e-06, + "loss": 0.1305, + "num_input_tokens_seen": 102206080, + "step": 47360 + }, + { + "epoch": 7.726753670473083, + "grad_norm": 2.070587635040283, + "learning_rate": 7.467371130806524e-06, + "loss": 0.1328, + "num_input_tokens_seen": 102216736, + "step": 47365 + }, + { + "epoch": 7.7275693311582385, + "grad_norm": 0.15585832297801971, + "learning_rate": 7.462297708188978e-06, + "loss": 0.0415, + "num_input_tokens_seen": 102227904, + "step": 47370 + }, + { + "epoch": 7.728384991843393, + "grad_norm": 1.1108205318450928, + "learning_rate": 7.457225707266566e-06, + "loss": 0.0666, + "num_input_tokens_seen": 102239200, + "step": 47375 + }, + { + "epoch": 7.729200652528548, + "grad_norm": 0.2760826051235199, + "learning_rate": 7.452155128450447e-06, + "loss": 0.024, + "num_input_tokens_seen": 102251264, + "step": 47380 + }, + { + "epoch": 7.730016313213703, + "grad_norm": 0.3586565852165222, + "learning_rate": 7.447085972151663e-06, + "loss": 0.104, + "num_input_tokens_seen": 102262432, + "step": 47385 + }, + { + "epoch": 7.730831973898858, + "grad_norm": 2.131859302520752, + "learning_rate": 7.4420182387811596e-06, + "loss": 0.2186, + "num_input_tokens_seen": 102274912, + "step": 47390 + }, + { + "epoch": 7.731647634584013, + "grad_norm": 0.5656769275665283, + "learning_rate": 7.436951928749747e-06, + "loss": 0.1166, + "num_input_tokens_seen": 102285696, + "step": 47395 + }, + { + "epoch": 7.732463295269168, + "grad_norm": 0.888919472694397, + "learning_rate": 7.431887042468125e-06, + "loss": 0.0644, + "num_input_tokens_seen": 102296160, + "step": 47400 + }, + { + "epoch": 7.733278955954323, + "grad_norm": 1.1358191967010498, + "learning_rate": 7.426823580346881e-06, + "loss": 0.103, + "num_input_tokens_seen": 102307712, + "step": 47405 + }, + { + "epoch": 7.734094616639478, + "grad_norm": 0.1653074324131012, + "learning_rate": 7.421761542796479e-06, + "loss": 0.1047, + "num_input_tokens_seen": 102320032, + "step": 47410 + }, + { + "epoch": 7.734910277324633, + "grad_norm": 0.1265440434217453, + "learning_rate": 7.41670093022728e-06, + "loss": 0.1892, + "num_input_tokens_seen": 102330656, + "step": 47415 + }, + { + "epoch": 7.735725938009788, + "grad_norm": 0.42464640736579895, + "learning_rate": 7.411641743049522e-06, + "loss": 0.1155, + "num_input_tokens_seen": 102340704, + "step": 47420 + }, + { + "epoch": 7.736541598694943, + "grad_norm": 1.297457218170166, + "learning_rate": 7.406583981673315e-06, + "loss": 0.0673, + "num_input_tokens_seen": 102350752, + "step": 47425 + }, + { + "epoch": 7.737357259380098, + "grad_norm": 0.1932586282491684, + "learning_rate": 7.401527646508691e-06, + "loss": 0.0856, + "num_input_tokens_seen": 102360800, + "step": 47430 + }, + { + "epoch": 7.738172920065253, + "grad_norm": 1.7300117015838623, + "learning_rate": 7.396472737965526e-06, + "loss": 0.1737, + "num_input_tokens_seen": 102372096, + "step": 47435 + }, + { + "epoch": 7.738988580750408, + "grad_norm": 0.07759217917919159, + "learning_rate": 7.391419256453602e-06, + "loss": 0.1736, + "num_input_tokens_seen": 102382112, + "step": 47440 + }, + { + "epoch": 7.739804241435563, + "grad_norm": 1.1682559251785278, + "learning_rate": 7.386367202382577e-06, + "loss": 0.0881, + "num_input_tokens_seen": 102391392, + "step": 47445 + }, + { + "epoch": 7.740619902120718, + "grad_norm": 0.9797688126564026, + "learning_rate": 7.3813165761619975e-06, + "loss": 0.0799, + "num_input_tokens_seen": 102402720, + "step": 47450 + }, + { + "epoch": 7.741435562805873, + "grad_norm": 0.04950185865163803, + "learning_rate": 7.376267378201293e-06, + "loss": 0.3096, + "num_input_tokens_seen": 102412992, + "step": 47455 + }, + { + "epoch": 7.742251223491028, + "grad_norm": 0.7353560328483582, + "learning_rate": 7.371219608909777e-06, + "loss": 0.2609, + "num_input_tokens_seen": 102424416, + "step": 47460 + }, + { + "epoch": 7.743066884176183, + "grad_norm": 0.15029454231262207, + "learning_rate": 7.366173268696646e-06, + "loss": 0.1825, + "num_input_tokens_seen": 102435296, + "step": 47465 + }, + { + "epoch": 7.7438825448613375, + "grad_norm": 1.0972864627838135, + "learning_rate": 7.3611283579709835e-06, + "loss": 0.1249, + "num_input_tokens_seen": 102445856, + "step": 47470 + }, + { + "epoch": 7.744698205546492, + "grad_norm": 0.3136577308177948, + "learning_rate": 7.356084877141756e-06, + "loss": 0.0166, + "num_input_tokens_seen": 102457024, + "step": 47475 + }, + { + "epoch": 7.745513866231647, + "grad_norm": 1.6096700429916382, + "learning_rate": 7.35104282661781e-06, + "loss": 0.1563, + "num_input_tokens_seen": 102467744, + "step": 47480 + }, + { + "epoch": 7.746329526916803, + "grad_norm": 1.6480414867401123, + "learning_rate": 7.346002206807887e-06, + "loss": 0.1768, + "num_input_tokens_seen": 102477408, + "step": 47485 + }, + { + "epoch": 7.747145187601958, + "grad_norm": 1.1417900323867798, + "learning_rate": 7.340963018120597e-06, + "loss": 0.0794, + "num_input_tokens_seen": 102487840, + "step": 47490 + }, + { + "epoch": 7.7479608482871125, + "grad_norm": 0.3325313627719879, + "learning_rate": 7.335925260964446e-06, + "loss": 0.0887, + "num_input_tokens_seen": 102498624, + "step": 47495 + }, + { + "epoch": 7.748776508972267, + "grad_norm": 0.3144506514072418, + "learning_rate": 7.330888935747821e-06, + "loss": 0.0524, + "num_input_tokens_seen": 102509952, + "step": 47500 + }, + { + "epoch": 7.749592169657422, + "grad_norm": 0.08210043609142303, + "learning_rate": 7.325854042878991e-06, + "loss": 0.0158, + "num_input_tokens_seen": 102521120, + "step": 47505 + }, + { + "epoch": 7.750407830342578, + "grad_norm": 0.251278817653656, + "learning_rate": 7.320820582766108e-06, + "loss": 0.1265, + "num_input_tokens_seen": 102532832, + "step": 47510 + }, + { + "epoch": 7.751223491027733, + "grad_norm": 0.9715589880943298, + "learning_rate": 7.315788555817215e-06, + "loss": 0.0879, + "num_input_tokens_seen": 102542752, + "step": 47515 + }, + { + "epoch": 7.7520391517128875, + "grad_norm": 0.1808689385652542, + "learning_rate": 7.3107579624402286e-06, + "loss": 0.0623, + "num_input_tokens_seen": 102553920, + "step": 47520 + }, + { + "epoch": 7.752854812398042, + "grad_norm": 1.3775742053985596, + "learning_rate": 7.305728803042949e-06, + "loss": 0.1462, + "num_input_tokens_seen": 102565088, + "step": 47525 + }, + { + "epoch": 7.753670473083197, + "grad_norm": 0.6382964849472046, + "learning_rate": 7.3007010780330785e-06, + "loss": 0.0437, + "num_input_tokens_seen": 102576704, + "step": 47530 + }, + { + "epoch": 7.754486133768353, + "grad_norm": 0.5070104598999023, + "learning_rate": 7.295674787818188e-06, + "loss": 0.2869, + "num_input_tokens_seen": 102588000, + "step": 47535 + }, + { + "epoch": 7.755301794453508, + "grad_norm": 1.2530776262283325, + "learning_rate": 7.290649932805726e-06, + "loss": 0.199, + "num_input_tokens_seen": 102598144, + "step": 47540 + }, + { + "epoch": 7.7561174551386625, + "grad_norm": 0.28944191336631775, + "learning_rate": 7.285626513403038e-06, + "loss": 0.1142, + "num_input_tokens_seen": 102608256, + "step": 47545 + }, + { + "epoch": 7.756933115823817, + "grad_norm": 0.8963600397109985, + "learning_rate": 7.2806045300173484e-06, + "loss": 0.0815, + "num_input_tokens_seen": 102618528, + "step": 47550 + }, + { + "epoch": 7.757748776508972, + "grad_norm": 0.8675836324691772, + "learning_rate": 7.275583983055753e-06, + "loss": 0.1863, + "num_input_tokens_seen": 102629632, + "step": 47555 + }, + { + "epoch": 7.758564437194127, + "grad_norm": 0.30800503492355347, + "learning_rate": 7.2705648729252615e-06, + "loss": 0.0497, + "num_input_tokens_seen": 102639712, + "step": 47560 + }, + { + "epoch": 7.759380097879282, + "grad_norm": 1.149319052696228, + "learning_rate": 7.265547200032738e-06, + "loss": 0.128, + "num_input_tokens_seen": 102650272, + "step": 47565 + }, + { + "epoch": 7.760195758564437, + "grad_norm": 0.3470228910446167, + "learning_rate": 7.260530964784945e-06, + "loss": 0.1618, + "num_input_tokens_seen": 102661216, + "step": 47570 + }, + { + "epoch": 7.761011419249592, + "grad_norm": 1.0066901445388794, + "learning_rate": 7.2555161675885195e-06, + "loss": 0.0631, + "num_input_tokens_seen": 102671200, + "step": 47575 + }, + { + "epoch": 7.761827079934747, + "grad_norm": 0.7567046284675598, + "learning_rate": 7.250502808849988e-06, + "loss": 0.0961, + "num_input_tokens_seen": 102682144, + "step": 47580 + }, + { + "epoch": 7.762642740619902, + "grad_norm": 1.0013995170593262, + "learning_rate": 7.2454908889757586e-06, + "loss": 0.1697, + "num_input_tokens_seen": 102693376, + "step": 47585 + }, + { + "epoch": 7.763458401305057, + "grad_norm": 0.09157619625329971, + "learning_rate": 7.240480408372125e-06, + "loss": 0.0481, + "num_input_tokens_seen": 102704800, + "step": 47590 + }, + { + "epoch": 7.764274061990212, + "grad_norm": 1.006701946258545, + "learning_rate": 7.235471367445257e-06, + "loss": 0.1904, + "num_input_tokens_seen": 102715776, + "step": 47595 + }, + { + "epoch": 7.765089722675367, + "grad_norm": 1.762146234512329, + "learning_rate": 7.2304637666012195e-06, + "loss": 0.2016, + "num_input_tokens_seen": 102727232, + "step": 47600 + }, + { + "epoch": 7.765905383360522, + "grad_norm": 0.41849464178085327, + "learning_rate": 7.22545760624595e-06, + "loss": 0.1576, + "num_input_tokens_seen": 102737568, + "step": 47605 + }, + { + "epoch": 7.766721044045677, + "grad_norm": 0.15724293887615204, + "learning_rate": 7.2204528867852725e-06, + "loss": 0.1802, + "num_input_tokens_seen": 102749728, + "step": 47610 + }, + { + "epoch": 7.767536704730832, + "grad_norm": 0.6917646527290344, + "learning_rate": 7.215449608624899e-06, + "loss": 0.0495, + "num_input_tokens_seen": 102760256, + "step": 47615 + }, + { + "epoch": 7.768352365415987, + "grad_norm": 0.2011084407567978, + "learning_rate": 7.210447772170418e-06, + "loss": 0.043, + "num_input_tokens_seen": 102771648, + "step": 47620 + }, + { + "epoch": 7.769168026101142, + "grad_norm": 0.29202625155448914, + "learning_rate": 7.205447377827301e-06, + "loss": 0.1471, + "num_input_tokens_seen": 102782912, + "step": 47625 + }, + { + "epoch": 7.769983686786297, + "grad_norm": 0.1520087569952011, + "learning_rate": 7.200448426000911e-06, + "loss": 0.1163, + "num_input_tokens_seen": 102795200, + "step": 47630 + }, + { + "epoch": 7.770799347471452, + "grad_norm": 0.4532295763492584, + "learning_rate": 7.195450917096483e-06, + "loss": 0.1708, + "num_input_tokens_seen": 102805600, + "step": 47635 + }, + { + "epoch": 7.771615008156607, + "grad_norm": 1.8141812086105347, + "learning_rate": 7.190454851519138e-06, + "loss": 0.1291, + "num_input_tokens_seen": 102816800, + "step": 47640 + }, + { + "epoch": 7.7724306688417615, + "grad_norm": 0.24489079415798187, + "learning_rate": 7.185460229673893e-06, + "loss": 0.1263, + "num_input_tokens_seen": 102827968, + "step": 47645 + }, + { + "epoch": 7.773246329526917, + "grad_norm": 0.022840773686766624, + "learning_rate": 7.180467051965634e-06, + "loss": 0.0616, + "num_input_tokens_seen": 102838400, + "step": 47650 + }, + { + "epoch": 7.774061990212072, + "grad_norm": 0.4106594920158386, + "learning_rate": 7.17547531879913e-06, + "loss": 0.0544, + "num_input_tokens_seen": 102848704, + "step": 47655 + }, + { + "epoch": 7.774877650897227, + "grad_norm": 0.3935069143772125, + "learning_rate": 7.170485030579038e-06, + "loss": 0.1139, + "num_input_tokens_seen": 102859296, + "step": 47660 + }, + { + "epoch": 7.775693311582382, + "grad_norm": 0.1510777771472931, + "learning_rate": 7.165496187709894e-06, + "loss": 0.0401, + "num_input_tokens_seen": 102870592, + "step": 47665 + }, + { + "epoch": 7.7765089722675365, + "grad_norm": 1.6097686290740967, + "learning_rate": 7.160508790596121e-06, + "loss": 0.1258, + "num_input_tokens_seen": 102881568, + "step": 47670 + }, + { + "epoch": 7.777324632952691, + "grad_norm": 1.3434542417526245, + "learning_rate": 7.155522839642023e-06, + "loss": 0.1647, + "num_input_tokens_seen": 102892320, + "step": 47675 + }, + { + "epoch": 7.778140293637847, + "grad_norm": 0.42471548914909363, + "learning_rate": 7.150538335251786e-06, + "loss": 0.259, + "num_input_tokens_seen": 102903456, + "step": 47680 + }, + { + "epoch": 7.778955954323002, + "grad_norm": 0.1586197465658188, + "learning_rate": 7.1455552778294775e-06, + "loss": 0.1428, + "num_input_tokens_seen": 102914208, + "step": 47685 + }, + { + "epoch": 7.779771615008157, + "grad_norm": 1.8728445768356323, + "learning_rate": 7.140573667779052e-06, + "loss": 0.141, + "num_input_tokens_seen": 102925152, + "step": 47690 + }, + { + "epoch": 7.780587275693311, + "grad_norm": 0.9692871570587158, + "learning_rate": 7.1355935055043314e-06, + "loss": 0.1916, + "num_input_tokens_seen": 102936064, + "step": 47695 + }, + { + "epoch": 7.781402936378466, + "grad_norm": 0.41678252816200256, + "learning_rate": 7.130614791409057e-06, + "loss": 0.1419, + "num_input_tokens_seen": 102947200, + "step": 47700 + }, + { + "epoch": 7.782218597063622, + "grad_norm": 1.1137980222702026, + "learning_rate": 7.125637525896814e-06, + "loss": 0.1607, + "num_input_tokens_seen": 102957216, + "step": 47705 + }, + { + "epoch": 7.783034257748777, + "grad_norm": 0.4045230448246002, + "learning_rate": 7.1206617093710845e-06, + "loss": 0.052, + "num_input_tokens_seen": 102968160, + "step": 47710 + }, + { + "epoch": 7.783849918433932, + "grad_norm": 0.2768282890319824, + "learning_rate": 7.115687342235239e-06, + "loss": 0.0929, + "num_input_tokens_seen": 102978912, + "step": 47715 + }, + { + "epoch": 7.784665579119086, + "grad_norm": 1.6303119659423828, + "learning_rate": 7.11071442489252e-06, + "loss": 0.3121, + "num_input_tokens_seen": 102989568, + "step": 47720 + }, + { + "epoch": 7.785481239804241, + "grad_norm": 0.1627223640680313, + "learning_rate": 7.1057429577460584e-06, + "loss": 0.0302, + "num_input_tokens_seen": 103000512, + "step": 47725 + }, + { + "epoch": 7.786296900489396, + "grad_norm": 0.12466471642255783, + "learning_rate": 7.100772941198869e-06, + "loss": 0.0817, + "num_input_tokens_seen": 103012320, + "step": 47730 + }, + { + "epoch": 7.787112561174552, + "grad_norm": 0.02498667873442173, + "learning_rate": 7.095804375653844e-06, + "loss": 0.0277, + "num_input_tokens_seen": 103023936, + "step": 47735 + }, + { + "epoch": 7.787928221859707, + "grad_norm": 1.4380911588668823, + "learning_rate": 7.090837261513764e-06, + "loss": 0.1, + "num_input_tokens_seen": 103033696, + "step": 47740 + }, + { + "epoch": 7.788743882544861, + "grad_norm": 1.7073339223861694, + "learning_rate": 7.085871599181274e-06, + "loss": 0.1641, + "num_input_tokens_seen": 103045280, + "step": 47745 + }, + { + "epoch": 7.789559543230016, + "grad_norm": 0.10785792022943497, + "learning_rate": 7.0809073890589356e-06, + "loss": 0.0134, + "num_input_tokens_seen": 103055584, + "step": 47750 + }, + { + "epoch": 7.790375203915171, + "grad_norm": 0.9157541990280151, + "learning_rate": 7.075944631549167e-06, + "loss": 0.1435, + "num_input_tokens_seen": 103066880, + "step": 47755 + }, + { + "epoch": 7.791190864600326, + "grad_norm": 0.14434711635112762, + "learning_rate": 7.07098332705427e-06, + "loss": 0.0479, + "num_input_tokens_seen": 103077568, + "step": 47760 + }, + { + "epoch": 7.7920065252854815, + "grad_norm": 0.041723527014255524, + "learning_rate": 7.066023475976438e-06, + "loss": 0.0831, + "num_input_tokens_seen": 103088672, + "step": 47765 + }, + { + "epoch": 7.792822185970636, + "grad_norm": 0.8591808676719666, + "learning_rate": 7.061065078717738e-06, + "loss": 0.0527, + "num_input_tokens_seen": 103098496, + "step": 47770 + }, + { + "epoch": 7.793637846655791, + "grad_norm": 1.4401750564575195, + "learning_rate": 7.056108135680123e-06, + "loss": 0.2237, + "num_input_tokens_seen": 103109856, + "step": 47775 + }, + { + "epoch": 7.794453507340946, + "grad_norm": 0.4796815812587738, + "learning_rate": 7.05115264726543e-06, + "loss": 0.1231, + "num_input_tokens_seen": 103120032, + "step": 47780 + }, + { + "epoch": 7.795269168026101, + "grad_norm": 1.339709997177124, + "learning_rate": 7.046198613875374e-06, + "loss": 0.1874, + "num_input_tokens_seen": 103131072, + "step": 47785 + }, + { + "epoch": 7.7960848287112565, + "grad_norm": 1.3943697214126587, + "learning_rate": 7.0412460359115555e-06, + "loss": 0.1554, + "num_input_tokens_seen": 103142240, + "step": 47790 + }, + { + "epoch": 7.796900489396411, + "grad_norm": 1.347598910331726, + "learning_rate": 7.0362949137754565e-06, + "loss": 0.1613, + "num_input_tokens_seen": 103152960, + "step": 47795 + }, + { + "epoch": 7.797716150081566, + "grad_norm": 0.3033713400363922, + "learning_rate": 7.031345247868437e-06, + "loss": 0.1718, + "num_input_tokens_seen": 103164352, + "step": 47800 + }, + { + "epoch": 7.798531810766721, + "grad_norm": 2.883371114730835, + "learning_rate": 7.026397038591745e-06, + "loss": 0.2079, + "num_input_tokens_seen": 103175680, + "step": 47805 + }, + { + "epoch": 7.799347471451876, + "grad_norm": 0.7296438813209534, + "learning_rate": 7.021450286346503e-06, + "loss": 0.0496, + "num_input_tokens_seen": 103185952, + "step": 47810 + }, + { + "epoch": 7.800163132137031, + "grad_norm": 1.808886170387268, + "learning_rate": 7.016504991533726e-06, + "loss": 0.2674, + "num_input_tokens_seen": 103195840, + "step": 47815 + }, + { + "epoch": 7.800978792822186, + "grad_norm": 2.829709768295288, + "learning_rate": 7.011561154554303e-06, + "loss": 0.1272, + "num_input_tokens_seen": 103207104, + "step": 47820 + }, + { + "epoch": 7.801794453507341, + "grad_norm": 0.3835920989513397, + "learning_rate": 7.006618775809001e-06, + "loss": 0.1178, + "num_input_tokens_seen": 103218624, + "step": 47825 + }, + { + "epoch": 7.802610114192496, + "grad_norm": 0.1781388521194458, + "learning_rate": 7.001677855698482e-06, + "loss": 0.0673, + "num_input_tokens_seen": 103230592, + "step": 47830 + }, + { + "epoch": 7.803425774877651, + "grad_norm": 0.5180509090423584, + "learning_rate": 6.996738394623279e-06, + "loss": 0.1919, + "num_input_tokens_seen": 103241984, + "step": 47835 + }, + { + "epoch": 7.804241435562806, + "grad_norm": 2.3421037197113037, + "learning_rate": 6.991800392983799e-06, + "loss": 0.2537, + "num_input_tokens_seen": 103252800, + "step": 47840 + }, + { + "epoch": 7.80505709624796, + "grad_norm": 0.04079962149262428, + "learning_rate": 6.9868638511803615e-06, + "loss": 0.0515, + "num_input_tokens_seen": 103265440, + "step": 47845 + }, + { + "epoch": 7.805872756933116, + "grad_norm": 0.4411807954311371, + "learning_rate": 6.9819287696131355e-06, + "loss": 0.0666, + "num_input_tokens_seen": 103274976, + "step": 47850 + }, + { + "epoch": 7.806688417618271, + "grad_norm": 0.21569198369979858, + "learning_rate": 6.9769951486821885e-06, + "loss": 0.0715, + "num_input_tokens_seen": 103285344, + "step": 47855 + }, + { + "epoch": 7.807504078303426, + "grad_norm": 0.6597882509231567, + "learning_rate": 6.972062988787462e-06, + "loss": 0.0444, + "num_input_tokens_seen": 103296576, + "step": 47860 + }, + { + "epoch": 7.808319738988581, + "grad_norm": 0.22995272278785706, + "learning_rate": 6.9671322903287765e-06, + "loss": 0.1372, + "num_input_tokens_seen": 103306880, + "step": 47865 + }, + { + "epoch": 7.809135399673735, + "grad_norm": 0.4645429849624634, + "learning_rate": 6.962203053705851e-06, + "loss": 0.1468, + "num_input_tokens_seen": 103317760, + "step": 47870 + }, + { + "epoch": 7.809951060358891, + "grad_norm": 0.5243979692459106, + "learning_rate": 6.957275279318268e-06, + "loss": 0.108, + "num_input_tokens_seen": 103328096, + "step": 47875 + }, + { + "epoch": 7.810766721044046, + "grad_norm": 1.6325185298919678, + "learning_rate": 6.9523489675655e-06, + "loss": 0.2915, + "num_input_tokens_seen": 103338560, + "step": 47880 + }, + { + "epoch": 7.811582381729201, + "grad_norm": 1.1501160860061646, + "learning_rate": 6.9474241188468985e-06, + "loss": 0.1801, + "num_input_tokens_seen": 103349824, + "step": 47885 + }, + { + "epoch": 7.8123980424143555, + "grad_norm": 1.0967410802841187, + "learning_rate": 6.942500733561694e-06, + "loss": 0.037, + "num_input_tokens_seen": 103361056, + "step": 47890 + }, + { + "epoch": 7.81321370309951, + "grad_norm": 0.14922812581062317, + "learning_rate": 6.937578812109005e-06, + "loss": 0.045, + "num_input_tokens_seen": 103371520, + "step": 47895 + }, + { + "epoch": 7.814029363784666, + "grad_norm": 1.0502548217773438, + "learning_rate": 6.932658354887825e-06, + "loss": 0.0741, + "num_input_tokens_seen": 103382784, + "step": 47900 + }, + { + "epoch": 7.814845024469821, + "grad_norm": 0.16636432707309723, + "learning_rate": 6.927739362297028e-06, + "loss": 0.2399, + "num_input_tokens_seen": 103393376, + "step": 47905 + }, + { + "epoch": 7.815660685154976, + "grad_norm": 0.045724742114543915, + "learning_rate": 6.92282183473538e-06, + "loss": 0.0738, + "num_input_tokens_seen": 103404000, + "step": 47910 + }, + { + "epoch": 7.8164763458401305, + "grad_norm": 0.009802354499697685, + "learning_rate": 6.917905772601516e-06, + "loss": 0.1123, + "num_input_tokens_seen": 103414848, + "step": 47915 + }, + { + "epoch": 7.817292006525285, + "grad_norm": 0.0531751774251461, + "learning_rate": 6.912991176293957e-06, + "loss": 0.1179, + "num_input_tokens_seen": 103426144, + "step": 47920 + }, + { + "epoch": 7.81810766721044, + "grad_norm": 0.2212803065776825, + "learning_rate": 6.908078046211105e-06, + "loss": 0.1084, + "num_input_tokens_seen": 103435584, + "step": 47925 + }, + { + "epoch": 7.818923327895595, + "grad_norm": 0.30844688415527344, + "learning_rate": 6.903166382751244e-06, + "loss": 0.0689, + "num_input_tokens_seen": 103446464, + "step": 47930 + }, + { + "epoch": 7.819738988580751, + "grad_norm": 1.720177412033081, + "learning_rate": 6.8982561863125405e-06, + "loss": 0.1898, + "num_input_tokens_seen": 103457248, + "step": 47935 + }, + { + "epoch": 7.8205546492659055, + "grad_norm": 0.3223625421524048, + "learning_rate": 6.893347457293036e-06, + "loss": 0.1425, + "num_input_tokens_seen": 103468128, + "step": 47940 + }, + { + "epoch": 7.82137030995106, + "grad_norm": 0.19631583988666534, + "learning_rate": 6.888440196090659e-06, + "loss": 0.1873, + "num_input_tokens_seen": 103478656, + "step": 47945 + }, + { + "epoch": 7.822185970636215, + "grad_norm": 0.2718484103679657, + "learning_rate": 6.8835344031032175e-06, + "loss": 0.1729, + "num_input_tokens_seen": 103489472, + "step": 47950 + }, + { + "epoch": 7.82300163132137, + "grad_norm": 1.7035030126571655, + "learning_rate": 6.878630078728399e-06, + "loss": 0.2819, + "num_input_tokens_seen": 103500384, + "step": 47955 + }, + { + "epoch": 7.823817292006526, + "grad_norm": 1.0730199813842773, + "learning_rate": 6.873727223363766e-06, + "loss": 0.3418, + "num_input_tokens_seen": 103510432, + "step": 47960 + }, + { + "epoch": 7.8246329526916805, + "grad_norm": 0.1241966187953949, + "learning_rate": 6.868825837406784e-06, + "loss": 0.1239, + "num_input_tokens_seen": 103520544, + "step": 47965 + }, + { + "epoch": 7.825448613376835, + "grad_norm": 1.3119144439697266, + "learning_rate": 6.8639259212547764e-06, + "loss": 0.1114, + "num_input_tokens_seen": 103531264, + "step": 47970 + }, + { + "epoch": 7.82626427406199, + "grad_norm": 0.8795883059501648, + "learning_rate": 6.859027475304955e-06, + "loss": 0.0759, + "num_input_tokens_seen": 103542176, + "step": 47975 + }, + { + "epoch": 7.827079934747145, + "grad_norm": 0.1424064189195633, + "learning_rate": 6.854130499954411e-06, + "loss": 0.0418, + "num_input_tokens_seen": 103552608, + "step": 47980 + }, + { + "epoch": 7.827895595432301, + "grad_norm": 0.37717387080192566, + "learning_rate": 6.849234995600121e-06, + "loss": 0.1147, + "num_input_tokens_seen": 103563552, + "step": 47985 + }, + { + "epoch": 7.828711256117455, + "grad_norm": 0.17529833316802979, + "learning_rate": 6.84434096263894e-06, + "loss": 0.1475, + "num_input_tokens_seen": 103575296, + "step": 47990 + }, + { + "epoch": 7.82952691680261, + "grad_norm": 0.41142281889915466, + "learning_rate": 6.839448401467599e-06, + "loss": 0.0385, + "num_input_tokens_seen": 103586304, + "step": 47995 + }, + { + "epoch": 7.830342577487765, + "grad_norm": 1.7834244966506958, + "learning_rate": 6.834557312482717e-06, + "loss": 0.1333, + "num_input_tokens_seen": 103596768, + "step": 48000 + }, + { + "epoch": 7.83115823817292, + "grad_norm": 1.3197710514068604, + "learning_rate": 6.8296676960807906e-06, + "loss": 0.0957, + "num_input_tokens_seen": 103607808, + "step": 48005 + }, + { + "epoch": 7.831973898858075, + "grad_norm": 0.5481244921684265, + "learning_rate": 6.824779552658189e-06, + "loss": 0.0998, + "num_input_tokens_seen": 103618848, + "step": 48010 + }, + { + "epoch": 7.8327895595432295, + "grad_norm": 1.2992308139801025, + "learning_rate": 6.819892882611184e-06, + "loss": 0.1067, + "num_input_tokens_seen": 103629600, + "step": 48015 + }, + { + "epoch": 7.833605220228385, + "grad_norm": 0.21026477217674255, + "learning_rate": 6.8150076863359054e-06, + "loss": 0.0236, + "num_input_tokens_seen": 103639744, + "step": 48020 + }, + { + "epoch": 7.83442088091354, + "grad_norm": 0.4823703169822693, + "learning_rate": 6.810123964228374e-06, + "loss": 0.1442, + "num_input_tokens_seen": 103649472, + "step": 48025 + }, + { + "epoch": 7.835236541598695, + "grad_norm": 0.9196645617485046, + "learning_rate": 6.8052417166844905e-06, + "loss": 0.1116, + "num_input_tokens_seen": 103659968, + "step": 48030 + }, + { + "epoch": 7.83605220228385, + "grad_norm": 0.6355348229408264, + "learning_rate": 6.800360944100031e-06, + "loss": 0.0371, + "num_input_tokens_seen": 103671584, + "step": 48035 + }, + { + "epoch": 7.8368678629690045, + "grad_norm": 1.4551727771759033, + "learning_rate": 6.795481646870658e-06, + "loss": 0.1719, + "num_input_tokens_seen": 103681984, + "step": 48040 + }, + { + "epoch": 7.83768352365416, + "grad_norm": 0.21873502433300018, + "learning_rate": 6.790603825391912e-06, + "loss": 0.2016, + "num_input_tokens_seen": 103692544, + "step": 48045 + }, + { + "epoch": 7.838499184339315, + "grad_norm": 0.10470444709062576, + "learning_rate": 6.785727480059212e-06, + "loss": 0.0333, + "num_input_tokens_seen": 103703488, + "step": 48050 + }, + { + "epoch": 7.83931484502447, + "grad_norm": 0.03661126643419266, + "learning_rate": 6.78085261126786e-06, + "loss": 0.068, + "num_input_tokens_seen": 103714464, + "step": 48055 + }, + { + "epoch": 7.840130505709625, + "grad_norm": 1.1003698110580444, + "learning_rate": 6.775979219413042e-06, + "loss": 0.0661, + "num_input_tokens_seen": 103725184, + "step": 48060 + }, + { + "epoch": 7.8409461663947795, + "grad_norm": 0.6075243949890137, + "learning_rate": 6.771107304889807e-06, + "loss": 0.0925, + "num_input_tokens_seen": 103735520, + "step": 48065 + }, + { + "epoch": 7.841761827079935, + "grad_norm": 2.2721524238586426, + "learning_rate": 6.766236868093112e-06, + "loss": 0.1508, + "num_input_tokens_seen": 103747072, + "step": 48070 + }, + { + "epoch": 7.84257748776509, + "grad_norm": 0.2780798077583313, + "learning_rate": 6.761367909417776e-06, + "loss": 0.1054, + "num_input_tokens_seen": 103757632, + "step": 48075 + }, + { + "epoch": 7.843393148450245, + "grad_norm": 0.558259129524231, + "learning_rate": 6.756500429258497e-06, + "loss": 0.1076, + "num_input_tokens_seen": 103768576, + "step": 48080 + }, + { + "epoch": 7.8442088091354, + "grad_norm": 0.05826694890856743, + "learning_rate": 6.751634428009862e-06, + "loss": 0.1526, + "num_input_tokens_seen": 103779840, + "step": 48085 + }, + { + "epoch": 7.8450244698205545, + "grad_norm": 0.7215844392776489, + "learning_rate": 6.7467699060663305e-06, + "loss": 0.1011, + "num_input_tokens_seen": 103789664, + "step": 48090 + }, + { + "epoch": 7.845840130505709, + "grad_norm": 0.19809521734714508, + "learning_rate": 6.741906863822248e-06, + "loss": 0.1671, + "num_input_tokens_seen": 103800832, + "step": 48095 + }, + { + "epoch": 7.846655791190865, + "grad_norm": 1.251131534576416, + "learning_rate": 6.737045301671832e-06, + "loss": 0.1219, + "num_input_tokens_seen": 103811616, + "step": 48100 + }, + { + "epoch": 7.84747145187602, + "grad_norm": 0.9018994569778442, + "learning_rate": 6.7321852200091935e-06, + "loss": 0.291, + "num_input_tokens_seen": 103822464, + "step": 48105 + }, + { + "epoch": 7.848287112561175, + "grad_norm": 0.4249764382839203, + "learning_rate": 6.727326619228308e-06, + "loss": 0.0659, + "num_input_tokens_seen": 103833984, + "step": 48110 + }, + { + "epoch": 7.849102773246329, + "grad_norm": 0.20743130147457123, + "learning_rate": 6.722469499723042e-06, + "loss": 0.0466, + "num_input_tokens_seen": 103845664, + "step": 48115 + }, + { + "epoch": 7.849918433931484, + "grad_norm": 0.22087469696998596, + "learning_rate": 6.717613861887137e-06, + "loss": 0.3719, + "num_input_tokens_seen": 103855936, + "step": 48120 + }, + { + "epoch": 7.850734094616639, + "grad_norm": 0.8135210275650024, + "learning_rate": 6.712759706114219e-06, + "loss": 0.0539, + "num_input_tokens_seen": 103866304, + "step": 48125 + }, + { + "epoch": 7.851549755301795, + "grad_norm": 0.14629073441028595, + "learning_rate": 6.707907032797786e-06, + "loss": 0.129, + "num_input_tokens_seen": 103877280, + "step": 48130 + }, + { + "epoch": 7.85236541598695, + "grad_norm": 1.0176825523376465, + "learning_rate": 6.703055842331221e-06, + "loss": 0.1745, + "num_input_tokens_seen": 103888192, + "step": 48135 + }, + { + "epoch": 7.853181076672104, + "grad_norm": 1.2243247032165527, + "learning_rate": 6.698206135107787e-06, + "loss": 0.1255, + "num_input_tokens_seen": 103897568, + "step": 48140 + }, + { + "epoch": 7.853996737357259, + "grad_norm": 1.105273723602295, + "learning_rate": 6.6933579115206284e-06, + "loss": 0.0694, + "num_input_tokens_seen": 103908448, + "step": 48145 + }, + { + "epoch": 7.854812398042414, + "grad_norm": 1.3806887865066528, + "learning_rate": 6.6885111719627635e-06, + "loss": 0.0792, + "num_input_tokens_seen": 103917856, + "step": 48150 + }, + { + "epoch": 7.85562805872757, + "grad_norm": 0.1895894855260849, + "learning_rate": 6.683665916827087e-06, + "loss": 0.076, + "num_input_tokens_seen": 103929792, + "step": 48155 + }, + { + "epoch": 7.856443719412725, + "grad_norm": 3.2976298332214355, + "learning_rate": 6.678822146506394e-06, + "loss": 0.1784, + "num_input_tokens_seen": 103939488, + "step": 48160 + }, + { + "epoch": 7.857259380097879, + "grad_norm": 0.6948957443237305, + "learning_rate": 6.67397986139334e-06, + "loss": 0.0729, + "num_input_tokens_seen": 103949600, + "step": 48165 + }, + { + "epoch": 7.858075040783034, + "grad_norm": 1.431870460510254, + "learning_rate": 6.669139061880464e-06, + "loss": 0.2939, + "num_input_tokens_seen": 103959936, + "step": 48170 + }, + { + "epoch": 7.858890701468189, + "grad_norm": 0.7937386631965637, + "learning_rate": 6.664299748360184e-06, + "loss": 0.0762, + "num_input_tokens_seen": 103971136, + "step": 48175 + }, + { + "epoch": 7.859706362153344, + "grad_norm": 0.47237899899482727, + "learning_rate": 6.659461921224794e-06, + "loss": 0.1534, + "num_input_tokens_seen": 103981760, + "step": 48180 + }, + { + "epoch": 7.8605220228384995, + "grad_norm": 0.6659680604934692, + "learning_rate": 6.654625580866486e-06, + "loss": 0.0874, + "num_input_tokens_seen": 103992480, + "step": 48185 + }, + { + "epoch": 7.861337683523654, + "grad_norm": 1.5391424894332886, + "learning_rate": 6.649790727677313e-06, + "loss": 0.1181, + "num_input_tokens_seen": 104002720, + "step": 48190 + }, + { + "epoch": 7.862153344208809, + "grad_norm": 1.7853925228118896, + "learning_rate": 6.644957362049212e-06, + "loss": 0.1226, + "num_input_tokens_seen": 104011968, + "step": 48195 + }, + { + "epoch": 7.862969004893964, + "grad_norm": 0.30389904975891113, + "learning_rate": 6.640125484373999e-06, + "loss": 0.1383, + "num_input_tokens_seen": 104023328, + "step": 48200 + }, + { + "epoch": 7.863784665579119, + "grad_norm": 1.53744637966156, + "learning_rate": 6.635295095043373e-06, + "loss": 0.1962, + "num_input_tokens_seen": 104032896, + "step": 48205 + }, + { + "epoch": 7.864600326264274, + "grad_norm": 2.4802441596984863, + "learning_rate": 6.630466194448906e-06, + "loss": 0.1093, + "num_input_tokens_seen": 104044128, + "step": 48210 + }, + { + "epoch": 7.865415986949429, + "grad_norm": 1.3263916969299316, + "learning_rate": 6.625638782982058e-06, + "loss": 0.0902, + "num_input_tokens_seen": 104055808, + "step": 48215 + }, + { + "epoch": 7.866231647634584, + "grad_norm": 0.06001826003193855, + "learning_rate": 6.620812861034159e-06, + "loss": 0.2127, + "num_input_tokens_seen": 104066816, + "step": 48220 + }, + { + "epoch": 7.867047308319739, + "grad_norm": 1.490178108215332, + "learning_rate": 6.615988428996426e-06, + "loss": 0.1126, + "num_input_tokens_seen": 104078624, + "step": 48225 + }, + { + "epoch": 7.867862969004894, + "grad_norm": 0.12791989743709564, + "learning_rate": 6.611165487259946e-06, + "loss": 0.0311, + "num_input_tokens_seen": 104090592, + "step": 48230 + }, + { + "epoch": 7.868678629690049, + "grad_norm": 0.5636804103851318, + "learning_rate": 6.6063440362157e-06, + "loss": 0.1655, + "num_input_tokens_seen": 104101376, + "step": 48235 + }, + { + "epoch": 7.869494290375204, + "grad_norm": 1.6263099908828735, + "learning_rate": 6.601524076254534e-06, + "loss": 0.1083, + "num_input_tokens_seen": 104111968, + "step": 48240 + }, + { + "epoch": 7.870309951060359, + "grad_norm": 0.6658073663711548, + "learning_rate": 6.5967056077671785e-06, + "loss": 0.0434, + "num_input_tokens_seen": 104123616, + "step": 48245 + }, + { + "epoch": 7.871125611745514, + "grad_norm": 1.1549265384674072, + "learning_rate": 6.591888631144244e-06, + "loss": 0.0652, + "num_input_tokens_seen": 104134176, + "step": 48250 + }, + { + "epoch": 7.871941272430669, + "grad_norm": 1.0179375410079956, + "learning_rate": 6.587073146776221e-06, + "loss": 0.0737, + "num_input_tokens_seen": 104144704, + "step": 48255 + }, + { + "epoch": 7.872756933115824, + "grad_norm": 0.07754390686750412, + "learning_rate": 6.582259155053472e-06, + "loss": 0.0158, + "num_input_tokens_seen": 104155648, + "step": 48260 + }, + { + "epoch": 7.873572593800979, + "grad_norm": 1.685037612915039, + "learning_rate": 6.577446656366248e-06, + "loss": 0.103, + "num_input_tokens_seen": 104166080, + "step": 48265 + }, + { + "epoch": 7.874388254486134, + "grad_norm": 0.3054228723049164, + "learning_rate": 6.572635651104672e-06, + "loss": 0.1528, + "num_input_tokens_seen": 104177152, + "step": 48270 + }, + { + "epoch": 7.875203915171289, + "grad_norm": 0.7683823108673096, + "learning_rate": 6.56782613965875e-06, + "loss": 0.1459, + "num_input_tokens_seen": 104187104, + "step": 48275 + }, + { + "epoch": 7.876019575856444, + "grad_norm": 0.7813172340393066, + "learning_rate": 6.56301812241836e-06, + "loss": 0.2325, + "num_input_tokens_seen": 104198176, + "step": 48280 + }, + { + "epoch": 7.876835236541599, + "grad_norm": 1.027434229850769, + "learning_rate": 6.558211599773273e-06, + "loss": 0.1429, + "num_input_tokens_seen": 104208896, + "step": 48285 + }, + { + "epoch": 7.877650897226753, + "grad_norm": 0.6085355281829834, + "learning_rate": 6.55340657211313e-06, + "loss": 0.0858, + "num_input_tokens_seen": 104220512, + "step": 48290 + }, + { + "epoch": 7.878466557911908, + "grad_norm": 1.1650068759918213, + "learning_rate": 6.5486030398274444e-06, + "loss": 0.2282, + "num_input_tokens_seen": 104230816, + "step": 48295 + }, + { + "epoch": 7.879282218597064, + "grad_norm": 0.4950099587440491, + "learning_rate": 6.543801003305619e-06, + "loss": 0.1457, + "num_input_tokens_seen": 104242240, + "step": 48300 + }, + { + "epoch": 7.880097879282219, + "grad_norm": 0.12481776624917984, + "learning_rate": 6.539000462936931e-06, + "loss": 0.0599, + "num_input_tokens_seen": 104253376, + "step": 48305 + }, + { + "epoch": 7.8809135399673735, + "grad_norm": 0.1670331507921219, + "learning_rate": 6.534201419110536e-06, + "loss": 0.0986, + "num_input_tokens_seen": 104264864, + "step": 48310 + }, + { + "epoch": 7.881729200652528, + "grad_norm": 1.0294983386993408, + "learning_rate": 6.529403872215467e-06, + "loss": 0.0732, + "num_input_tokens_seen": 104275712, + "step": 48315 + }, + { + "epoch": 7.882544861337683, + "grad_norm": 0.4043427109718323, + "learning_rate": 6.524607822640638e-06, + "loss": 0.0455, + "num_input_tokens_seen": 104287328, + "step": 48320 + }, + { + "epoch": 7.883360522022839, + "grad_norm": 0.768217921257019, + "learning_rate": 6.519813270774835e-06, + "loss": 0.0367, + "num_input_tokens_seen": 104297344, + "step": 48325 + }, + { + "epoch": 7.884176182707994, + "grad_norm": 0.11567620187997818, + "learning_rate": 6.515020217006745e-06, + "loss": 0.0278, + "num_input_tokens_seen": 104308992, + "step": 48330 + }, + { + "epoch": 7.8849918433931485, + "grad_norm": 0.6898236274719238, + "learning_rate": 6.510228661724907e-06, + "loss": 0.1426, + "num_input_tokens_seen": 104319616, + "step": 48335 + }, + { + "epoch": 7.885807504078303, + "grad_norm": 0.12367790192365646, + "learning_rate": 6.5054386053177515e-06, + "loss": 0.1099, + "num_input_tokens_seen": 104331520, + "step": 48340 + }, + { + "epoch": 7.886623164763458, + "grad_norm": 1.8218592405319214, + "learning_rate": 6.500650048173582e-06, + "loss": 0.1376, + "num_input_tokens_seen": 104341664, + "step": 48345 + }, + { + "epoch": 7.887438825448614, + "grad_norm": 0.638119101524353, + "learning_rate": 6.495862990680585e-06, + "loss": 0.1404, + "num_input_tokens_seen": 104352640, + "step": 48350 + }, + { + "epoch": 7.888254486133769, + "grad_norm": 2.406665563583374, + "learning_rate": 6.4910774332268195e-06, + "loss": 0.1967, + "num_input_tokens_seen": 104362496, + "step": 48355 + }, + { + "epoch": 7.8890701468189235, + "grad_norm": 0.060982923954725266, + "learning_rate": 6.486293376200234e-06, + "loss": 0.1654, + "num_input_tokens_seen": 104372192, + "step": 48360 + }, + { + "epoch": 7.889885807504078, + "grad_norm": 0.06467942148447037, + "learning_rate": 6.481510819988645e-06, + "loss": 0.0538, + "num_input_tokens_seen": 104383168, + "step": 48365 + }, + { + "epoch": 7.890701468189233, + "grad_norm": 0.09307076036930084, + "learning_rate": 6.47672976497975e-06, + "loss": 0.027, + "num_input_tokens_seen": 104393632, + "step": 48370 + }, + { + "epoch": 7.891517128874388, + "grad_norm": 1.1839696168899536, + "learning_rate": 6.471950211561125e-06, + "loss": 0.2154, + "num_input_tokens_seen": 104404096, + "step": 48375 + }, + { + "epoch": 7.892332789559543, + "grad_norm": 0.13906240463256836, + "learning_rate": 6.46717216012023e-06, + "loss": 0.0816, + "num_input_tokens_seen": 104414592, + "step": 48380 + }, + { + "epoch": 7.8931484502446985, + "grad_norm": 0.6881563067436218, + "learning_rate": 6.462395611044383e-06, + "loss": 0.0599, + "num_input_tokens_seen": 104424736, + "step": 48385 + }, + { + "epoch": 7.893964110929853, + "grad_norm": 0.6334581971168518, + "learning_rate": 6.457620564720815e-06, + "loss": 0.0566, + "num_input_tokens_seen": 104434976, + "step": 48390 + }, + { + "epoch": 7.894779771615008, + "grad_norm": 0.12161920219659805, + "learning_rate": 6.452847021536609e-06, + "loss": 0.1793, + "num_input_tokens_seen": 104445920, + "step": 48395 + }, + { + "epoch": 7.895595432300163, + "grad_norm": 0.6572726964950562, + "learning_rate": 6.44807498187873e-06, + "loss": 0.2094, + "num_input_tokens_seen": 104455584, + "step": 48400 + }, + { + "epoch": 7.896411092985318, + "grad_norm": 0.25682201981544495, + "learning_rate": 6.443304446134024e-06, + "loss": 0.0248, + "num_input_tokens_seen": 104466304, + "step": 48405 + }, + { + "epoch": 7.897226753670473, + "grad_norm": 0.036128029227256775, + "learning_rate": 6.438535414689215e-06, + "loss": 0.0417, + "num_input_tokens_seen": 104476096, + "step": 48410 + }, + { + "epoch": 7.898042414355628, + "grad_norm": 0.757016658782959, + "learning_rate": 6.4337678879309055e-06, + "loss": 0.1414, + "num_input_tokens_seen": 104486656, + "step": 48415 + }, + { + "epoch": 7.898858075040783, + "grad_norm": 2.1387765407562256, + "learning_rate": 6.4290018662455764e-06, + "loss": 0.1924, + "num_input_tokens_seen": 104497600, + "step": 48420 + }, + { + "epoch": 7.899673735725938, + "grad_norm": 0.0911668911576271, + "learning_rate": 6.424237350019582e-06, + "loss": 0.1171, + "num_input_tokens_seen": 104508384, + "step": 48425 + }, + { + "epoch": 7.900489396411093, + "grad_norm": 0.10076640546321869, + "learning_rate": 6.419474339639161e-06, + "loss": 0.0868, + "num_input_tokens_seen": 104519136, + "step": 48430 + }, + { + "epoch": 7.901305057096248, + "grad_norm": 0.7176297903060913, + "learning_rate": 6.414712835490428e-06, + "loss": 0.0335, + "num_input_tokens_seen": 104529728, + "step": 48435 + }, + { + "epoch": 7.902120717781403, + "grad_norm": 0.11643949896097183, + "learning_rate": 6.409952837959374e-06, + "loss": 0.0668, + "num_input_tokens_seen": 104541888, + "step": 48440 + }, + { + "epoch": 7.902936378466558, + "grad_norm": 0.15054281055927277, + "learning_rate": 6.405194347431864e-06, + "loss": 0.0226, + "num_input_tokens_seen": 104552736, + "step": 48445 + }, + { + "epoch": 7.903752039151713, + "grad_norm": 1.7446258068084717, + "learning_rate": 6.400437364293655e-06, + "loss": 0.1725, + "num_input_tokens_seen": 104562464, + "step": 48450 + }, + { + "epoch": 7.904567699836868, + "grad_norm": 0.3577515482902527, + "learning_rate": 6.395681888930361e-06, + "loss": 0.0984, + "num_input_tokens_seen": 104573600, + "step": 48455 + }, + { + "epoch": 7.9053833605220225, + "grad_norm": 0.2025967836380005, + "learning_rate": 6.390927921727494e-06, + "loss": 0.0694, + "num_input_tokens_seen": 104584800, + "step": 48460 + }, + { + "epoch": 7.906199021207177, + "grad_norm": 2.072265863418579, + "learning_rate": 6.386175463070429e-06, + "loss": 0.2046, + "num_input_tokens_seen": 104596192, + "step": 48465 + }, + { + "epoch": 7.907014681892333, + "grad_norm": 2.129331350326538, + "learning_rate": 6.3814245133444196e-06, + "loss": 0.2418, + "num_input_tokens_seen": 104606976, + "step": 48470 + }, + { + "epoch": 7.907830342577488, + "grad_norm": 0.30992671847343445, + "learning_rate": 6.376675072934618e-06, + "loss": 0.1516, + "num_input_tokens_seen": 104618304, + "step": 48475 + }, + { + "epoch": 7.908646003262643, + "grad_norm": 0.3024470806121826, + "learning_rate": 6.371927142226028e-06, + "loss": 0.0287, + "num_input_tokens_seen": 104627840, + "step": 48480 + }, + { + "epoch": 7.9094616639477975, + "grad_norm": 0.7103501558303833, + "learning_rate": 6.367180721603541e-06, + "loss": 0.0865, + "num_input_tokens_seen": 104639296, + "step": 48485 + }, + { + "epoch": 7.910277324632952, + "grad_norm": 0.8961671590805054, + "learning_rate": 6.3624358114519275e-06, + "loss": 0.112, + "num_input_tokens_seen": 104648992, + "step": 48490 + }, + { + "epoch": 7.911092985318108, + "grad_norm": 0.1556129902601242, + "learning_rate": 6.3576924121558246e-06, + "loss": 0.0552, + "num_input_tokens_seen": 104659680, + "step": 48495 + }, + { + "epoch": 7.911908646003263, + "grad_norm": 0.04014259949326515, + "learning_rate": 6.352950524099774e-06, + "loss": 0.1389, + "num_input_tokens_seen": 104670688, + "step": 48500 + }, + { + "epoch": 7.912724306688418, + "grad_norm": 0.3957141041755676, + "learning_rate": 6.348210147668165e-06, + "loss": 0.1247, + "num_input_tokens_seen": 104681600, + "step": 48505 + }, + { + "epoch": 7.9135399673735725, + "grad_norm": 0.7248523235321045, + "learning_rate": 6.343471283245283e-06, + "loss": 0.1688, + "num_input_tokens_seen": 104692672, + "step": 48510 + }, + { + "epoch": 7.914355628058727, + "grad_norm": 0.18802300095558167, + "learning_rate": 6.33873393121528e-06, + "loss": 0.0491, + "num_input_tokens_seen": 104703296, + "step": 48515 + }, + { + "epoch": 7.915171288743883, + "grad_norm": 0.1525079309940338, + "learning_rate": 6.33399809196219e-06, + "loss": 0.0984, + "num_input_tokens_seen": 104715648, + "step": 48520 + }, + { + "epoch": 7.915986949429038, + "grad_norm": 1.2895901203155518, + "learning_rate": 6.329263765869925e-06, + "loss": 0.1239, + "num_input_tokens_seen": 104724864, + "step": 48525 + }, + { + "epoch": 7.916802610114193, + "grad_norm": 1.3362904787063599, + "learning_rate": 6.324530953322275e-06, + "loss": 0.117, + "num_input_tokens_seen": 104734592, + "step": 48530 + }, + { + "epoch": 7.917618270799347, + "grad_norm": 1.4832990169525146, + "learning_rate": 6.319799654702904e-06, + "loss": 0.2114, + "num_input_tokens_seen": 104744992, + "step": 48535 + }, + { + "epoch": 7.918433931484502, + "grad_norm": 0.028463153168559074, + "learning_rate": 6.315069870395354e-06, + "loss": 0.1362, + "num_input_tokens_seen": 104756512, + "step": 48540 + }, + { + "epoch": 7.919249592169657, + "grad_norm": 3.0053720474243164, + "learning_rate": 6.310341600783049e-06, + "loss": 0.1728, + "num_input_tokens_seen": 104767840, + "step": 48545 + }, + { + "epoch": 7.920065252854813, + "grad_norm": 1.1866981983184814, + "learning_rate": 6.305614846249283e-06, + "loss": 0.2345, + "num_input_tokens_seen": 104778944, + "step": 48550 + }, + { + "epoch": 7.920880913539968, + "grad_norm": 0.06975584477186203, + "learning_rate": 6.300889607177229e-06, + "loss": 0.0753, + "num_input_tokens_seen": 104789088, + "step": 48555 + }, + { + "epoch": 7.921696574225122, + "grad_norm": 0.17992334067821503, + "learning_rate": 6.296165883949947e-06, + "loss": 0.2084, + "num_input_tokens_seen": 104800032, + "step": 48560 + }, + { + "epoch": 7.922512234910277, + "grad_norm": 0.222028449177742, + "learning_rate": 6.291443676950357e-06, + "loss": 0.1383, + "num_input_tokens_seen": 104810720, + "step": 48565 + }, + { + "epoch": 7.923327895595432, + "grad_norm": 0.07976670563220978, + "learning_rate": 6.286722986561272e-06, + "loss": 0.0365, + "num_input_tokens_seen": 104820896, + "step": 48570 + }, + { + "epoch": 7.924143556280587, + "grad_norm": 0.3424583673477173, + "learning_rate": 6.282003813165368e-06, + "loss": 0.2054, + "num_input_tokens_seen": 104831808, + "step": 48575 + }, + { + "epoch": 7.924959216965743, + "grad_norm": 0.053988609462976456, + "learning_rate": 6.2772861571452125e-06, + "loss": 0.0274, + "num_input_tokens_seen": 104841920, + "step": 48580 + }, + { + "epoch": 7.925774877650897, + "grad_norm": 0.14500801265239716, + "learning_rate": 6.272570018883236e-06, + "loss": 0.041, + "num_input_tokens_seen": 104853408, + "step": 48585 + }, + { + "epoch": 7.926590538336052, + "grad_norm": 0.07908301055431366, + "learning_rate": 6.26785539876176e-06, + "loss": 0.1622, + "num_input_tokens_seen": 104862880, + "step": 48590 + }, + { + "epoch": 7.927406199021207, + "grad_norm": 1.6170755624771118, + "learning_rate": 6.2631422971629605e-06, + "loss": 0.3047, + "num_input_tokens_seen": 104873952, + "step": 48595 + }, + { + "epoch": 7.928221859706362, + "grad_norm": 0.24178631603717804, + "learning_rate": 6.2584307144689245e-06, + "loss": 0.0223, + "num_input_tokens_seen": 104884128, + "step": 48600 + }, + { + "epoch": 7.9290375203915175, + "grad_norm": 1.3242095708847046, + "learning_rate": 6.25372065106159e-06, + "loss": 0.1264, + "num_input_tokens_seen": 104895296, + "step": 48605 + }, + { + "epoch": 7.929853181076672, + "grad_norm": 0.12294887006282806, + "learning_rate": 6.249012107322774e-06, + "loss": 0.106, + "num_input_tokens_seen": 104905760, + "step": 48610 + }, + { + "epoch": 7.930668841761827, + "grad_norm": 1.0383307933807373, + "learning_rate": 6.244305083634181e-06, + "loss": 0.1157, + "num_input_tokens_seen": 104916864, + "step": 48615 + }, + { + "epoch": 7.931484502446982, + "grad_norm": 0.1177026629447937, + "learning_rate": 6.239599580377381e-06, + "loss": 0.1176, + "num_input_tokens_seen": 104927104, + "step": 48620 + }, + { + "epoch": 7.932300163132137, + "grad_norm": 1.8257189989089966, + "learning_rate": 6.234895597933832e-06, + "loss": 0.1316, + "num_input_tokens_seen": 104937632, + "step": 48625 + }, + { + "epoch": 7.933115823817292, + "grad_norm": 1.705233097076416, + "learning_rate": 6.2301931366848555e-06, + "loss": 0.2749, + "num_input_tokens_seen": 104948448, + "step": 48630 + }, + { + "epoch": 7.933931484502447, + "grad_norm": 0.1868111938238144, + "learning_rate": 6.225492197011654e-06, + "loss": 0.0283, + "num_input_tokens_seen": 104958688, + "step": 48635 + }, + { + "epoch": 7.934747145187602, + "grad_norm": 0.2070310413837433, + "learning_rate": 6.220792779295326e-06, + "loss": 0.0348, + "num_input_tokens_seen": 104969408, + "step": 48640 + }, + { + "epoch": 7.935562805872757, + "grad_norm": 0.05478796362876892, + "learning_rate": 6.216094883916815e-06, + "loss": 0.0461, + "num_input_tokens_seen": 104979968, + "step": 48645 + }, + { + "epoch": 7.936378466557912, + "grad_norm": 0.5736666917800903, + "learning_rate": 6.211398511256966e-06, + "loss": 0.1759, + "num_input_tokens_seen": 104990688, + "step": 48650 + }, + { + "epoch": 7.937194127243067, + "grad_norm": 0.8677161931991577, + "learning_rate": 6.206703661696484e-06, + "loss": 0.1007, + "num_input_tokens_seen": 105002048, + "step": 48655 + }, + { + "epoch": 7.938009787928221, + "grad_norm": 0.9613008499145508, + "learning_rate": 6.20201033561596e-06, + "loss": 0.0796, + "num_input_tokens_seen": 105013056, + "step": 48660 + }, + { + "epoch": 7.938825448613377, + "grad_norm": 3.25480318069458, + "learning_rate": 6.197318533395858e-06, + "loss": 0.3958, + "num_input_tokens_seen": 105021856, + "step": 48665 + }, + { + "epoch": 7.939641109298532, + "grad_norm": 1.0873991250991821, + "learning_rate": 6.192628255416519e-06, + "loss": 0.1986, + "num_input_tokens_seen": 105031936, + "step": 48670 + }, + { + "epoch": 7.940456769983687, + "grad_norm": 1.7788277864456177, + "learning_rate": 6.18793950205816e-06, + "loss": 0.2707, + "num_input_tokens_seen": 105044000, + "step": 48675 + }, + { + "epoch": 7.941272430668842, + "grad_norm": 1.437423586845398, + "learning_rate": 6.183252273700879e-06, + "loss": 0.1084, + "num_input_tokens_seen": 105054272, + "step": 48680 + }, + { + "epoch": 7.942088091353996, + "grad_norm": 0.1953783482313156, + "learning_rate": 6.178566570724642e-06, + "loss": 0.0477, + "num_input_tokens_seen": 105065984, + "step": 48685 + }, + { + "epoch": 7.942903752039152, + "grad_norm": 1.7387306690216064, + "learning_rate": 6.1738823935092975e-06, + "loss": 0.1173, + "num_input_tokens_seen": 105077440, + "step": 48690 + }, + { + "epoch": 7.943719412724307, + "grad_norm": 1.8572040796279907, + "learning_rate": 6.16919974243457e-06, + "loss": 0.1911, + "num_input_tokens_seen": 105087104, + "step": 48695 + }, + { + "epoch": 7.944535073409462, + "grad_norm": 1.0368303060531616, + "learning_rate": 6.164518617880058e-06, + "loss": 0.0979, + "num_input_tokens_seen": 105097824, + "step": 48700 + }, + { + "epoch": 7.945350734094617, + "grad_norm": 0.37757986783981323, + "learning_rate": 6.159839020225231e-06, + "loss": 0.0357, + "num_input_tokens_seen": 105108064, + "step": 48705 + }, + { + "epoch": 7.946166394779771, + "grad_norm": 0.7374359369277954, + "learning_rate": 6.155160949849453e-06, + "loss": 0.0313, + "num_input_tokens_seen": 105118784, + "step": 48710 + }, + { + "epoch": 7.946982055464927, + "grad_norm": 0.15988826751708984, + "learning_rate": 6.150484407131945e-06, + "loss": 0.0893, + "num_input_tokens_seen": 105128608, + "step": 48715 + }, + { + "epoch": 7.947797716150082, + "grad_norm": 0.6537628769874573, + "learning_rate": 6.145809392451815e-06, + "loss": 0.0632, + "num_input_tokens_seen": 105138720, + "step": 48720 + }, + { + "epoch": 7.948613376835237, + "grad_norm": 0.7915763854980469, + "learning_rate": 6.141135906188039e-06, + "loss": 0.1122, + "num_input_tokens_seen": 105150368, + "step": 48725 + }, + { + "epoch": 7.9494290375203915, + "grad_norm": 0.7185448408126831, + "learning_rate": 6.136463948719475e-06, + "loss": 0.0936, + "num_input_tokens_seen": 105160544, + "step": 48730 + }, + { + "epoch": 7.950244698205546, + "grad_norm": 1.4327082633972168, + "learning_rate": 6.131793520424859e-06, + "loss": 0.1648, + "num_input_tokens_seen": 105170784, + "step": 48735 + }, + { + "epoch": 7.951060358890701, + "grad_norm": 0.43283793330192566, + "learning_rate": 6.1271246216827945e-06, + "loss": 0.1396, + "num_input_tokens_seen": 105180960, + "step": 48740 + }, + { + "epoch": 7.951876019575856, + "grad_norm": 0.6850867867469788, + "learning_rate": 6.122457252871769e-06, + "loss": 0.1378, + "num_input_tokens_seen": 105191200, + "step": 48745 + }, + { + "epoch": 7.952691680261012, + "grad_norm": 1.4384677410125732, + "learning_rate": 6.117791414370141e-06, + "loss": 0.1539, + "num_input_tokens_seen": 105202176, + "step": 48750 + }, + { + "epoch": 7.9535073409461665, + "grad_norm": 0.879757285118103, + "learning_rate": 6.113127106556149e-06, + "loss": 0.2321, + "num_input_tokens_seen": 105212352, + "step": 48755 + }, + { + "epoch": 7.954323001631321, + "grad_norm": 0.9733136296272278, + "learning_rate": 6.108464329807903e-06, + "loss": 0.169, + "num_input_tokens_seen": 105222464, + "step": 48760 + }, + { + "epoch": 7.955138662316476, + "grad_norm": 0.7974148392677307, + "learning_rate": 6.1038030845033956e-06, + "loss": 0.0416, + "num_input_tokens_seen": 105233984, + "step": 48765 + }, + { + "epoch": 7.955954323001631, + "grad_norm": 0.13865315914154053, + "learning_rate": 6.0991433710204885e-06, + "loss": 0.0327, + "num_input_tokens_seen": 105244768, + "step": 48770 + }, + { + "epoch": 7.956769983686787, + "grad_norm": 0.1854076236486435, + "learning_rate": 6.0944851897369206e-06, + "loss": 0.203, + "num_input_tokens_seen": 105255680, + "step": 48775 + }, + { + "epoch": 7.9575856443719415, + "grad_norm": 0.02571571059525013, + "learning_rate": 6.0898285410303015e-06, + "loss": 0.0209, + "num_input_tokens_seen": 105268608, + "step": 48780 + }, + { + "epoch": 7.958401305057096, + "grad_norm": 1.8278616666793823, + "learning_rate": 6.085173425278137e-06, + "loss": 0.1294, + "num_input_tokens_seen": 105278464, + "step": 48785 + }, + { + "epoch": 7.959216965742251, + "grad_norm": 0.5248691439628601, + "learning_rate": 6.080519842857787e-06, + "loss": 0.0613, + "num_input_tokens_seen": 105289856, + "step": 48790 + }, + { + "epoch": 7.960032626427406, + "grad_norm": 0.08239180594682693, + "learning_rate": 6.075867794146497e-06, + "loss": 0.1129, + "num_input_tokens_seen": 105300160, + "step": 48795 + }, + { + "epoch": 7.960848287112562, + "grad_norm": 0.09987831115722656, + "learning_rate": 6.071217279521382e-06, + "loss": 0.1393, + "num_input_tokens_seen": 105312352, + "step": 48800 + }, + { + "epoch": 7.9616639477977165, + "grad_norm": 0.39759793877601624, + "learning_rate": 6.06656829935944e-06, + "loss": 0.0487, + "num_input_tokens_seen": 105324544, + "step": 48805 + }, + { + "epoch": 7.962479608482871, + "grad_norm": 1.8903241157531738, + "learning_rate": 6.061920854037531e-06, + "loss": 0.1199, + "num_input_tokens_seen": 105335072, + "step": 48810 + }, + { + "epoch": 7.963295269168026, + "grad_norm": 0.3594090938568115, + "learning_rate": 6.0572749439324146e-06, + "loss": 0.0239, + "num_input_tokens_seen": 105344960, + "step": 48815 + }, + { + "epoch": 7.964110929853181, + "grad_norm": 0.8277249336242676, + "learning_rate": 6.052630569420706e-06, + "loss": 0.0311, + "num_input_tokens_seen": 105355264, + "step": 48820 + }, + { + "epoch": 7.964926590538336, + "grad_norm": 0.03504682332277298, + "learning_rate": 6.047987730878904e-06, + "loss": 0.0602, + "num_input_tokens_seen": 105366624, + "step": 48825 + }, + { + "epoch": 7.9657422512234906, + "grad_norm": 0.07746176421642303, + "learning_rate": 6.043346428683375e-06, + "loss": 0.1721, + "num_input_tokens_seen": 105377888, + "step": 48830 + }, + { + "epoch": 7.966557911908646, + "grad_norm": 0.09458627551794052, + "learning_rate": 6.0387066632103695e-06, + "loss": 0.0571, + "num_input_tokens_seen": 105388192, + "step": 48835 + }, + { + "epoch": 7.967373572593801, + "grad_norm": 1.795005440711975, + "learning_rate": 6.034068434836013e-06, + "loss": 0.3193, + "num_input_tokens_seen": 105400256, + "step": 48840 + }, + { + "epoch": 7.968189233278956, + "grad_norm": 0.7816019058227539, + "learning_rate": 6.029431743936298e-06, + "loss": 0.1836, + "num_input_tokens_seen": 105411328, + "step": 48845 + }, + { + "epoch": 7.969004893964111, + "grad_norm": 0.44528496265411377, + "learning_rate": 6.024796590887105e-06, + "loss": 0.0918, + "num_input_tokens_seen": 105421408, + "step": 48850 + }, + { + "epoch": 7.9698205546492655, + "grad_norm": 0.05215562507510185, + "learning_rate": 6.020162976064178e-06, + "loss": 0.0353, + "num_input_tokens_seen": 105431872, + "step": 48855 + }, + { + "epoch": 7.970636215334421, + "grad_norm": 0.3392889201641083, + "learning_rate": 6.0155308998431415e-06, + "loss": 0.1142, + "num_input_tokens_seen": 105443776, + "step": 48860 + }, + { + "epoch": 7.971451876019576, + "grad_norm": 3.4686267375946045, + "learning_rate": 6.0109003625994975e-06, + "loss": 0.1041, + "num_input_tokens_seen": 105453088, + "step": 48865 + }, + { + "epoch": 7.972267536704731, + "grad_norm": 0.12923471629619598, + "learning_rate": 6.006271364708621e-06, + "loss": 0.0537, + "num_input_tokens_seen": 105463744, + "step": 48870 + }, + { + "epoch": 7.973083197389886, + "grad_norm": 0.06443211436271667, + "learning_rate": 6.0016439065457595e-06, + "loss": 0.0456, + "num_input_tokens_seen": 105474752, + "step": 48875 + }, + { + "epoch": 7.9738988580750405, + "grad_norm": 0.15833698213100433, + "learning_rate": 5.997017988486039e-06, + "loss": 0.083, + "num_input_tokens_seen": 105486016, + "step": 48880 + }, + { + "epoch": 7.974714518760196, + "grad_norm": 0.04708987474441528, + "learning_rate": 5.992393610904465e-06, + "loss": 0.0814, + "num_input_tokens_seen": 105496800, + "step": 48885 + }, + { + "epoch": 7.975530179445351, + "grad_norm": 0.03345724567770958, + "learning_rate": 5.987770774175905e-06, + "loss": 0.1415, + "num_input_tokens_seen": 105506528, + "step": 48890 + }, + { + "epoch": 7.976345840130506, + "grad_norm": 1.3144856691360474, + "learning_rate": 5.983149478675113e-06, + "loss": 0.1293, + "num_input_tokens_seen": 105517312, + "step": 48895 + }, + { + "epoch": 7.977161500815661, + "grad_norm": 0.04998178407549858, + "learning_rate": 5.978529724776713e-06, + "loss": 0.1685, + "num_input_tokens_seen": 105526976, + "step": 48900 + }, + { + "epoch": 7.9779771615008155, + "grad_norm": 0.38340845704078674, + "learning_rate": 5.97391151285521e-06, + "loss": 0.0356, + "num_input_tokens_seen": 105537408, + "step": 48905 + }, + { + "epoch": 7.97879282218597, + "grad_norm": 0.529281497001648, + "learning_rate": 5.969294843284978e-06, + "loss": 0.0874, + "num_input_tokens_seen": 105547712, + "step": 48910 + }, + { + "epoch": 7.979608482871125, + "grad_norm": 0.45598381757736206, + "learning_rate": 5.964679716440258e-06, + "loss": 0.2193, + "num_input_tokens_seen": 105557760, + "step": 48915 + }, + { + "epoch": 7.980424143556281, + "grad_norm": 1.1052435636520386, + "learning_rate": 5.9600661326951916e-06, + "loss": 0.1947, + "num_input_tokens_seen": 105568800, + "step": 48920 + }, + { + "epoch": 7.981239804241436, + "grad_norm": 0.05069243162870407, + "learning_rate": 5.955454092423773e-06, + "loss": 0.0195, + "num_input_tokens_seen": 105578944, + "step": 48925 + }, + { + "epoch": 7.9820554649265905, + "grad_norm": 1.1721428632736206, + "learning_rate": 5.950843595999877e-06, + "loss": 0.1209, + "num_input_tokens_seen": 105589664, + "step": 48930 + }, + { + "epoch": 7.982871125611745, + "grad_norm": 0.18678005039691925, + "learning_rate": 5.946234643797252e-06, + "loss": 0.0387, + "num_input_tokens_seen": 105601056, + "step": 48935 + }, + { + "epoch": 7.9836867862969, + "grad_norm": 0.34050941467285156, + "learning_rate": 5.941627236189526e-06, + "loss": 0.1123, + "num_input_tokens_seen": 105611808, + "step": 48940 + }, + { + "epoch": 7.984502446982056, + "grad_norm": 0.0468323640525341, + "learning_rate": 5.9370213735501974e-06, + "loss": 0.1628, + "num_input_tokens_seen": 105622944, + "step": 48945 + }, + { + "epoch": 7.985318107667211, + "grad_norm": 0.7147238254547119, + "learning_rate": 5.9324170562526345e-06, + "loss": 0.203, + "num_input_tokens_seen": 105633920, + "step": 48950 + }, + { + "epoch": 7.986133768352365, + "grad_norm": 0.6743090152740479, + "learning_rate": 5.927814284670097e-06, + "loss": 0.2068, + "num_input_tokens_seen": 105645280, + "step": 48955 + }, + { + "epoch": 7.98694942903752, + "grad_norm": 0.6524879932403564, + "learning_rate": 5.923213059175709e-06, + "loss": 0.0475, + "num_input_tokens_seen": 105656032, + "step": 48960 + }, + { + "epoch": 7.987765089722675, + "grad_norm": 0.1407753974199295, + "learning_rate": 5.918613380142463e-06, + "loss": 0.0346, + "num_input_tokens_seen": 105666208, + "step": 48965 + }, + { + "epoch": 7.988580750407831, + "grad_norm": 2.439143419265747, + "learning_rate": 5.914015247943233e-06, + "loss": 0.2708, + "num_input_tokens_seen": 105677600, + "step": 48970 + }, + { + "epoch": 7.989396411092986, + "grad_norm": 1.1312880516052246, + "learning_rate": 5.909418662950769e-06, + "loss": 0.1188, + "num_input_tokens_seen": 105688256, + "step": 48975 + }, + { + "epoch": 7.99021207177814, + "grad_norm": 0.13223159313201904, + "learning_rate": 5.904823625537695e-06, + "loss": 0.0671, + "num_input_tokens_seen": 105698560, + "step": 48980 + }, + { + "epoch": 7.991027732463295, + "grad_norm": 0.3069782555103302, + "learning_rate": 5.900230136076504e-06, + "loss": 0.1103, + "num_input_tokens_seen": 105709792, + "step": 48985 + }, + { + "epoch": 7.99184339314845, + "grad_norm": 0.29400020837783813, + "learning_rate": 5.895638194939568e-06, + "loss": 0.172, + "num_input_tokens_seen": 105720608, + "step": 48990 + }, + { + "epoch": 7.992659053833605, + "grad_norm": 0.29429078102111816, + "learning_rate": 5.891047802499136e-06, + "loss": 0.0204, + "num_input_tokens_seen": 105732192, + "step": 48995 + }, + { + "epoch": 7.993474714518761, + "grad_norm": 0.5421388745307922, + "learning_rate": 5.886458959127328e-06, + "loss": 0.234, + "num_input_tokens_seen": 105742688, + "step": 49000 + }, + { + "epoch": 7.994290375203915, + "grad_norm": 0.6797766089439392, + "learning_rate": 5.881871665196137e-06, + "loss": 0.1284, + "num_input_tokens_seen": 105754208, + "step": 49005 + }, + { + "epoch": 7.99510603588907, + "grad_norm": 0.08149369060993195, + "learning_rate": 5.877285921077433e-06, + "loss": 0.032, + "num_input_tokens_seen": 105765920, + "step": 49010 + }, + { + "epoch": 7.995921696574225, + "grad_norm": 0.6201289296150208, + "learning_rate": 5.872701727142963e-06, + "loss": 0.1556, + "num_input_tokens_seen": 105776992, + "step": 49015 + }, + { + "epoch": 7.99673735725938, + "grad_norm": 0.04288557916879654, + "learning_rate": 5.868119083764337e-06, + "loss": 0.132, + "num_input_tokens_seen": 105788576, + "step": 49020 + }, + { + "epoch": 7.997553017944535, + "grad_norm": 0.08801853656768799, + "learning_rate": 5.863537991313047e-06, + "loss": 0.037, + "num_input_tokens_seen": 105800384, + "step": 49025 + }, + { + "epoch": 7.99836867862969, + "grad_norm": 1.5008357763290405, + "learning_rate": 5.858958450160473e-06, + "loss": 0.1284, + "num_input_tokens_seen": 105810528, + "step": 49030 + }, + { + "epoch": 7.999184339314845, + "grad_norm": 0.04885650798678398, + "learning_rate": 5.854380460677847e-06, + "loss": 0.0414, + "num_input_tokens_seen": 105821440, + "step": 49035 + }, + { + "epoch": 8.0, + "grad_norm": 0.362807035446167, + "learning_rate": 5.849804023236285e-06, + "loss": 0.0194, + "num_input_tokens_seen": 105830544, + "step": 49040 + }, + { + "epoch": 8.0, + "eval_loss": 0.1356717050075531, + "eval_runtime": 131.9092, + "eval_samples_per_second": 20.658, + "eval_steps_per_second": 5.17, + "num_input_tokens_seen": 105830544, + "step": 49040 + }, + { + "epoch": 8.000815660685156, + "grad_norm": 0.4509699046611786, + "learning_rate": 5.845229138206776e-06, + "loss": 0.082, + "num_input_tokens_seen": 105843312, + "step": 49045 + }, + { + "epoch": 8.00163132137031, + "grad_norm": 0.35298967361450195, + "learning_rate": 5.8406558059601825e-06, + "loss": 0.0218, + "num_input_tokens_seen": 105851248, + "step": 49050 + }, + { + "epoch": 8.002446982055465, + "grad_norm": 1.9501819610595703, + "learning_rate": 5.836084026867244e-06, + "loss": 0.1189, + "num_input_tokens_seen": 105862352, + "step": 49055 + }, + { + "epoch": 8.00326264274062, + "grad_norm": 0.29763051867485046, + "learning_rate": 5.831513801298572e-06, + "loss": 0.1157, + "num_input_tokens_seen": 105873264, + "step": 49060 + }, + { + "epoch": 8.004078303425775, + "grad_norm": 0.044553935527801514, + "learning_rate": 5.82694512962465e-06, + "loss": 0.0353, + "num_input_tokens_seen": 105884272, + "step": 49065 + }, + { + "epoch": 8.00489396411093, + "grad_norm": 1.1357083320617676, + "learning_rate": 5.822378012215837e-06, + "loss": 0.1867, + "num_input_tokens_seen": 105894832, + "step": 49070 + }, + { + "epoch": 8.005709624796085, + "grad_norm": 0.048781730234622955, + "learning_rate": 5.81781244944237e-06, + "loss": 0.0078, + "num_input_tokens_seen": 105905520, + "step": 49075 + }, + { + "epoch": 8.00652528548124, + "grad_norm": 0.2396901398897171, + "learning_rate": 5.813248441674357e-06, + "loss": 0.0964, + "num_input_tokens_seen": 105917424, + "step": 49080 + }, + { + "epoch": 8.007340946166394, + "grad_norm": 1.0309923887252808, + "learning_rate": 5.8086859892817755e-06, + "loss": 0.0601, + "num_input_tokens_seen": 105928560, + "step": 49085 + }, + { + "epoch": 8.00815660685155, + "grad_norm": 0.29990488290786743, + "learning_rate": 5.804125092634485e-06, + "loss": 0.0864, + "num_input_tokens_seen": 105939120, + "step": 49090 + }, + { + "epoch": 8.008972267536704, + "grad_norm": 0.4514934718608856, + "learning_rate": 5.799565752102207e-06, + "loss": 0.0567, + "num_input_tokens_seen": 105949424, + "step": 49095 + }, + { + "epoch": 8.00978792822186, + "grad_norm": 2.320667028427124, + "learning_rate": 5.795007968054555e-06, + "loss": 0.1365, + "num_input_tokens_seen": 105959280, + "step": 49100 + }, + { + "epoch": 8.010603588907015, + "grad_norm": 0.5956774353981018, + "learning_rate": 5.790451740861005e-06, + "loss": 0.0908, + "num_input_tokens_seen": 105968272, + "step": 49105 + }, + { + "epoch": 8.01141924959217, + "grad_norm": 0.9750688076019287, + "learning_rate": 5.7858970708909056e-06, + "loss": 0.2045, + "num_input_tokens_seen": 105978736, + "step": 49110 + }, + { + "epoch": 8.012234910277325, + "grad_norm": 0.17963483929634094, + "learning_rate": 5.78134395851348e-06, + "loss": 0.0515, + "num_input_tokens_seen": 105989872, + "step": 49115 + }, + { + "epoch": 8.013050570962479, + "grad_norm": 0.10998575389385223, + "learning_rate": 5.7767924040978275e-06, + "loss": 0.0575, + "num_input_tokens_seen": 105999792, + "step": 49120 + }, + { + "epoch": 8.013866231647635, + "grad_norm": 0.04605740308761597, + "learning_rate": 5.772242408012921e-06, + "loss": 0.0852, + "num_input_tokens_seen": 106010256, + "step": 49125 + }, + { + "epoch": 8.01468189233279, + "grad_norm": 2.5586042404174805, + "learning_rate": 5.767693970627597e-06, + "loss": 0.273, + "num_input_tokens_seen": 106020496, + "step": 49130 + }, + { + "epoch": 8.015497553017944, + "grad_norm": 0.10431528836488724, + "learning_rate": 5.763147092310592e-06, + "loss": 0.0932, + "num_input_tokens_seen": 106031248, + "step": 49135 + }, + { + "epoch": 8.0163132137031, + "grad_norm": 0.044413886964321136, + "learning_rate": 5.758601773430489e-06, + "loss": 0.0077, + "num_input_tokens_seen": 106041232, + "step": 49140 + }, + { + "epoch": 8.017128874388254, + "grad_norm": 0.2886601686477661, + "learning_rate": 5.7540580143557564e-06, + "loss": 0.1259, + "num_input_tokens_seen": 106051696, + "step": 49145 + }, + { + "epoch": 8.01794453507341, + "grad_norm": 0.7764661908149719, + "learning_rate": 5.7495158154547365e-06, + "loss": 0.0984, + "num_input_tokens_seen": 106062672, + "step": 49150 + }, + { + "epoch": 8.018760195758565, + "grad_norm": 0.6796066164970398, + "learning_rate": 5.744975177095638e-06, + "loss": 0.0325, + "num_input_tokens_seen": 106072784, + "step": 49155 + }, + { + "epoch": 8.01957585644372, + "grad_norm": 0.03901611641049385, + "learning_rate": 5.740436099646551e-06, + "loss": 0.0255, + "num_input_tokens_seen": 106083952, + "step": 49160 + }, + { + "epoch": 8.020391517128875, + "grad_norm": 0.81586754322052, + "learning_rate": 5.735898583475438e-06, + "loss": 0.1511, + "num_input_tokens_seen": 106095632, + "step": 49165 + }, + { + "epoch": 8.021207177814029, + "grad_norm": 0.22459834814071655, + "learning_rate": 5.731362628950129e-06, + "loss": 0.0219, + "num_input_tokens_seen": 106106160, + "step": 49170 + }, + { + "epoch": 8.022022838499185, + "grad_norm": 1.7962474822998047, + "learning_rate": 5.726828236438334e-06, + "loss": 0.1963, + "num_input_tokens_seen": 106116784, + "step": 49175 + }, + { + "epoch": 8.022838499184338, + "grad_norm": 0.21737368404865265, + "learning_rate": 5.722295406307632e-06, + "loss": 0.1008, + "num_input_tokens_seen": 106127088, + "step": 49180 + }, + { + "epoch": 8.023654159869494, + "grad_norm": 1.090805172920227, + "learning_rate": 5.71776413892548e-06, + "loss": 0.143, + "num_input_tokens_seen": 106136912, + "step": 49185 + }, + { + "epoch": 8.02446982055465, + "grad_norm": 0.5538457036018372, + "learning_rate": 5.713234434659203e-06, + "loss": 0.0881, + "num_input_tokens_seen": 106148816, + "step": 49190 + }, + { + "epoch": 8.025285481239804, + "grad_norm": 0.08106415718793869, + "learning_rate": 5.708706293876004e-06, + "loss": 0.026, + "num_input_tokens_seen": 106158672, + "step": 49195 + }, + { + "epoch": 8.02610114192496, + "grad_norm": 1.1599112749099731, + "learning_rate": 5.7041797169429536e-06, + "loss": 0.0891, + "num_input_tokens_seen": 106169744, + "step": 49200 + }, + { + "epoch": 8.026916802610113, + "grad_norm": 0.057352904230356216, + "learning_rate": 5.699654704227003e-06, + "loss": 0.1924, + "num_input_tokens_seen": 106179824, + "step": 49205 + }, + { + "epoch": 8.02773246329527, + "grad_norm": 0.07718788087368011, + "learning_rate": 5.695131256094971e-06, + "loss": 0.1331, + "num_input_tokens_seen": 106190640, + "step": 49210 + }, + { + "epoch": 8.028548123980425, + "grad_norm": 0.5997774600982666, + "learning_rate": 5.6906093729135495e-06, + "loss": 0.045, + "num_input_tokens_seen": 106200112, + "step": 49215 + }, + { + "epoch": 8.029363784665579, + "grad_norm": 0.3521515130996704, + "learning_rate": 5.6860890550493095e-06, + "loss": 0.0909, + "num_input_tokens_seen": 106211440, + "step": 49220 + }, + { + "epoch": 8.030179445350734, + "grad_norm": 0.2145918607711792, + "learning_rate": 5.681570302868688e-06, + "loss": 0.1596, + "num_input_tokens_seen": 106222224, + "step": 49225 + }, + { + "epoch": 8.030995106035888, + "grad_norm": 0.07206902652978897, + "learning_rate": 5.677053116737999e-06, + "loss": 0.1427, + "num_input_tokens_seen": 106233968, + "step": 49230 + }, + { + "epoch": 8.031810766721044, + "grad_norm": 0.6204495429992676, + "learning_rate": 5.67253749702342e-06, + "loss": 0.118, + "num_input_tokens_seen": 106244944, + "step": 49235 + }, + { + "epoch": 8.0326264274062, + "grad_norm": 1.220038890838623, + "learning_rate": 5.668023444091025e-06, + "loss": 0.1462, + "num_input_tokens_seen": 106255664, + "step": 49240 + }, + { + "epoch": 8.033442088091354, + "grad_norm": 1.203064203262329, + "learning_rate": 5.663510958306739e-06, + "loss": 0.0886, + "num_input_tokens_seen": 106266672, + "step": 49245 + }, + { + "epoch": 8.03425774877651, + "grad_norm": 0.5341182947158813, + "learning_rate": 5.659000040036366e-06, + "loss": 0.2183, + "num_input_tokens_seen": 106278288, + "step": 49250 + }, + { + "epoch": 8.035073409461663, + "grad_norm": 1.3311084508895874, + "learning_rate": 5.654490689645589e-06, + "loss": 0.0672, + "num_input_tokens_seen": 106289744, + "step": 49255 + }, + { + "epoch": 8.035889070146819, + "grad_norm": 0.4761621356010437, + "learning_rate": 5.649982907499951e-06, + "loss": 0.0859, + "num_input_tokens_seen": 106298480, + "step": 49260 + }, + { + "epoch": 8.036704730831975, + "grad_norm": 0.0653066337108612, + "learning_rate": 5.645476693964874e-06, + "loss": 0.0573, + "num_input_tokens_seen": 106310800, + "step": 49265 + }, + { + "epoch": 8.037520391517129, + "grad_norm": 1.4576798677444458, + "learning_rate": 5.640972049405666e-06, + "loss": 0.1458, + "num_input_tokens_seen": 106322000, + "step": 49270 + }, + { + "epoch": 8.038336052202284, + "grad_norm": 0.06671801209449768, + "learning_rate": 5.636468974187492e-06, + "loss": 0.1317, + "num_input_tokens_seen": 106331632, + "step": 49275 + }, + { + "epoch": 8.039151712887438, + "grad_norm": 0.931138277053833, + "learning_rate": 5.631967468675392e-06, + "loss": 0.0409, + "num_input_tokens_seen": 106343344, + "step": 49280 + }, + { + "epoch": 8.039967373572594, + "grad_norm": 0.07258225232362747, + "learning_rate": 5.627467533234282e-06, + "loss": 0.1314, + "num_input_tokens_seen": 106354288, + "step": 49285 + }, + { + "epoch": 8.040783034257748, + "grad_norm": 0.047940921038389206, + "learning_rate": 5.622969168228947e-06, + "loss": 0.0399, + "num_input_tokens_seen": 106365168, + "step": 49290 + }, + { + "epoch": 8.041598694942904, + "grad_norm": 0.26293084025382996, + "learning_rate": 5.61847237402405e-06, + "loss": 0.0768, + "num_input_tokens_seen": 106375440, + "step": 49295 + }, + { + "epoch": 8.04241435562806, + "grad_norm": 1.6879507303237915, + "learning_rate": 5.613977150984123e-06, + "loss": 0.1155, + "num_input_tokens_seen": 106386032, + "step": 49300 + }, + { + "epoch": 8.043230016313213, + "grad_norm": 0.3108619451522827, + "learning_rate": 5.609483499473575e-06, + "loss": 0.0455, + "num_input_tokens_seen": 106398160, + "step": 49305 + }, + { + "epoch": 8.044045676998369, + "grad_norm": 0.3206223249435425, + "learning_rate": 5.604991419856678e-06, + "loss": 0.1118, + "num_input_tokens_seen": 106408432, + "step": 49310 + }, + { + "epoch": 8.044861337683523, + "grad_norm": 0.23963668942451477, + "learning_rate": 5.600500912497586e-06, + "loss": 0.2897, + "num_input_tokens_seen": 106419440, + "step": 49315 + }, + { + "epoch": 8.045676998368679, + "grad_norm": 0.22295014560222626, + "learning_rate": 5.596011977760324e-06, + "loss": 0.0463, + "num_input_tokens_seen": 106430000, + "step": 49320 + }, + { + "epoch": 8.046492659053834, + "grad_norm": 0.053941380232572556, + "learning_rate": 5.591524616008784e-06, + "loss": 0.0261, + "num_input_tokens_seen": 106441392, + "step": 49325 + }, + { + "epoch": 8.047308319738988, + "grad_norm": 1.4478299617767334, + "learning_rate": 5.587038827606736e-06, + "loss": 0.0563, + "num_input_tokens_seen": 106452688, + "step": 49330 + }, + { + "epoch": 8.048123980424144, + "grad_norm": 2.3247830867767334, + "learning_rate": 5.582554612917823e-06, + "loss": 0.1369, + "num_input_tokens_seen": 106462352, + "step": 49335 + }, + { + "epoch": 8.048939641109298, + "grad_norm": 1.3680570125579834, + "learning_rate": 5.578071972305554e-06, + "loss": 0.2246, + "num_input_tokens_seen": 106472208, + "step": 49340 + }, + { + "epoch": 8.049755301794454, + "grad_norm": 1.196207880973816, + "learning_rate": 5.57359090613331e-06, + "loss": 0.3719, + "num_input_tokens_seen": 106481840, + "step": 49345 + }, + { + "epoch": 8.05057096247961, + "grad_norm": 0.09773958474397659, + "learning_rate": 5.569111414764363e-06, + "loss": 0.2061, + "num_input_tokens_seen": 106492912, + "step": 49350 + }, + { + "epoch": 8.051386623164763, + "grad_norm": 0.09012816101312637, + "learning_rate": 5.564633498561839e-06, + "loss": 0.2935, + "num_input_tokens_seen": 106503440, + "step": 49355 + }, + { + "epoch": 8.052202283849919, + "grad_norm": 0.38713598251342773, + "learning_rate": 5.560157157888735e-06, + "loss": 0.1025, + "num_input_tokens_seen": 106512880, + "step": 49360 + }, + { + "epoch": 8.053017944535073, + "grad_norm": 0.07260233908891678, + "learning_rate": 5.555682393107928e-06, + "loss": 0.0812, + "num_input_tokens_seen": 106525296, + "step": 49365 + }, + { + "epoch": 8.053833605220229, + "grad_norm": 2.279515266418457, + "learning_rate": 5.551209204582167e-06, + "loss": 0.2231, + "num_input_tokens_seen": 106535536, + "step": 49370 + }, + { + "epoch": 8.054649265905383, + "grad_norm": 0.32851529121398926, + "learning_rate": 5.54673759267407e-06, + "loss": 0.1362, + "num_input_tokens_seen": 106546160, + "step": 49375 + }, + { + "epoch": 8.055464926590538, + "grad_norm": 0.3892855644226074, + "learning_rate": 5.542267557746128e-06, + "loss": 0.0915, + "num_input_tokens_seen": 106558064, + "step": 49380 + }, + { + "epoch": 8.056280587275694, + "grad_norm": 0.20288996398448944, + "learning_rate": 5.537799100160704e-06, + "loss": 0.0775, + "num_input_tokens_seen": 106568400, + "step": 49385 + }, + { + "epoch": 8.057096247960848, + "grad_norm": 0.8071020245552063, + "learning_rate": 5.533332220280038e-06, + "loss": 0.1191, + "num_input_tokens_seen": 106578192, + "step": 49390 + }, + { + "epoch": 8.057911908646004, + "grad_norm": 0.05942778289318085, + "learning_rate": 5.5288669184662325e-06, + "loss": 0.063, + "num_input_tokens_seen": 106587312, + "step": 49395 + }, + { + "epoch": 8.058727569331158, + "grad_norm": 0.10087130963802338, + "learning_rate": 5.524403195081271e-06, + "loss": 0.1327, + "num_input_tokens_seen": 106598672, + "step": 49400 + }, + { + "epoch": 8.059543230016313, + "grad_norm": 2.2120985984802246, + "learning_rate": 5.519941050487007e-06, + "loss": 0.0806, + "num_input_tokens_seen": 106608208, + "step": 49405 + }, + { + "epoch": 8.060358890701469, + "grad_norm": 0.03366122022271156, + "learning_rate": 5.515480485045152e-06, + "loss": 0.0383, + "num_input_tokens_seen": 106619984, + "step": 49410 + }, + { + "epoch": 8.061174551386623, + "grad_norm": 1.7727546691894531, + "learning_rate": 5.511021499117322e-06, + "loss": 0.1314, + "num_input_tokens_seen": 106631792, + "step": 49415 + }, + { + "epoch": 8.061990212071779, + "grad_norm": 0.025015683844685555, + "learning_rate": 5.5065640930649725e-06, + "loss": 0.1029, + "num_input_tokens_seen": 106643600, + "step": 49420 + }, + { + "epoch": 8.062805872756933, + "grad_norm": 1.7555135488510132, + "learning_rate": 5.502108267249448e-06, + "loss": 0.123, + "num_input_tokens_seen": 106654672, + "step": 49425 + }, + { + "epoch": 8.063621533442088, + "grad_norm": 1.42885160446167, + "learning_rate": 5.497654022031959e-06, + "loss": 0.1285, + "num_input_tokens_seen": 106665456, + "step": 49430 + }, + { + "epoch": 8.064437194127244, + "grad_norm": 0.6520128846168518, + "learning_rate": 5.493201357773589e-06, + "loss": 0.1773, + "num_input_tokens_seen": 106676880, + "step": 49435 + }, + { + "epoch": 8.065252854812398, + "grad_norm": 0.08842756599187851, + "learning_rate": 5.488750274835291e-06, + "loss": 0.1314, + "num_input_tokens_seen": 106688528, + "step": 49440 + }, + { + "epoch": 8.066068515497554, + "grad_norm": 0.2845861613750458, + "learning_rate": 5.4843007735778996e-06, + "loss": 0.1468, + "num_input_tokens_seen": 106698992, + "step": 49445 + }, + { + "epoch": 8.066884176182707, + "grad_norm": 0.26208195090293884, + "learning_rate": 5.4798528543620965e-06, + "loss": 0.1064, + "num_input_tokens_seen": 106709168, + "step": 49450 + }, + { + "epoch": 8.067699836867863, + "grad_norm": 0.47582077980041504, + "learning_rate": 5.475406517548476e-06, + "loss": 0.1792, + "num_input_tokens_seen": 106719920, + "step": 49455 + }, + { + "epoch": 8.068515497553017, + "grad_norm": 0.07069459557533264, + "learning_rate": 5.47096176349747e-06, + "loss": 0.077, + "num_input_tokens_seen": 106729872, + "step": 49460 + }, + { + "epoch": 8.069331158238173, + "grad_norm": 2.2455685138702393, + "learning_rate": 5.466518592569391e-06, + "loss": 0.135, + "num_input_tokens_seen": 106741168, + "step": 49465 + }, + { + "epoch": 8.070146818923329, + "grad_norm": 0.35985836386680603, + "learning_rate": 5.4620770051244275e-06, + "loss": 0.0509, + "num_input_tokens_seen": 106751376, + "step": 49470 + }, + { + "epoch": 8.070962479608482, + "grad_norm": 0.782514214515686, + "learning_rate": 5.457637001522636e-06, + "loss": 0.1835, + "num_input_tokens_seen": 106761904, + "step": 49475 + }, + { + "epoch": 8.071778140293638, + "grad_norm": 0.8538064360618591, + "learning_rate": 5.453198582123947e-06, + "loss": 0.1893, + "num_input_tokens_seen": 106772240, + "step": 49480 + }, + { + "epoch": 8.072593800978792, + "grad_norm": 0.2380029261112213, + "learning_rate": 5.448761747288161e-06, + "loss": 0.0562, + "num_input_tokens_seen": 106783216, + "step": 49485 + }, + { + "epoch": 8.073409461663948, + "grad_norm": 0.5149921178817749, + "learning_rate": 5.444326497374949e-06, + "loss": 0.1192, + "num_input_tokens_seen": 106794896, + "step": 49490 + }, + { + "epoch": 8.074225122349104, + "grad_norm": 0.7372229099273682, + "learning_rate": 5.439892832743856e-06, + "loss": 0.1657, + "num_input_tokens_seen": 106805360, + "step": 49495 + }, + { + "epoch": 8.075040783034257, + "grad_norm": 2.1987462043762207, + "learning_rate": 5.435460753754296e-06, + "loss": 0.1539, + "num_input_tokens_seen": 106816144, + "step": 49500 + }, + { + "epoch": 8.075856443719413, + "grad_norm": 0.33785000443458557, + "learning_rate": 5.431030260765557e-06, + "loss": 0.0347, + "num_input_tokens_seen": 106827088, + "step": 49505 + }, + { + "epoch": 8.076672104404567, + "grad_norm": 0.3203345239162445, + "learning_rate": 5.426601354136799e-06, + "loss": 0.0586, + "num_input_tokens_seen": 106837488, + "step": 49510 + }, + { + "epoch": 8.077487765089723, + "grad_norm": 0.0807085633277893, + "learning_rate": 5.4221740342270455e-06, + "loss": 0.0279, + "num_input_tokens_seen": 106849520, + "step": 49515 + }, + { + "epoch": 8.078303425774878, + "grad_norm": 0.03727695345878601, + "learning_rate": 5.4177483013952065e-06, + "loss": 0.0486, + "num_input_tokens_seen": 106860528, + "step": 49520 + }, + { + "epoch": 8.079119086460032, + "grad_norm": 1.1625919342041016, + "learning_rate": 5.413324156000046e-06, + "loss": 0.1069, + "num_input_tokens_seen": 106869936, + "step": 49525 + }, + { + "epoch": 8.079934747145188, + "grad_norm": 1.376995325088501, + "learning_rate": 5.408901598400212e-06, + "loss": 0.0837, + "num_input_tokens_seen": 106881936, + "step": 49530 + }, + { + "epoch": 8.080750407830342, + "grad_norm": 0.5018541216850281, + "learning_rate": 5.40448062895422e-06, + "loss": 0.0423, + "num_input_tokens_seen": 106891376, + "step": 49535 + }, + { + "epoch": 8.081566068515498, + "grad_norm": 0.6059653759002686, + "learning_rate": 5.400061248020452e-06, + "loss": 0.1023, + "num_input_tokens_seen": 106901168, + "step": 49540 + }, + { + "epoch": 8.082381729200652, + "grad_norm": 1.2775737047195435, + "learning_rate": 5.395643455957172e-06, + "loss": 0.259, + "num_input_tokens_seen": 106912432, + "step": 49545 + }, + { + "epoch": 8.083197389885807, + "grad_norm": 1.5817985534667969, + "learning_rate": 5.391227253122502e-06, + "loss": 0.0425, + "num_input_tokens_seen": 106924592, + "step": 49550 + }, + { + "epoch": 8.084013050570963, + "grad_norm": 0.09551537036895752, + "learning_rate": 5.386812639874439e-06, + "loss": 0.1054, + "num_input_tokens_seen": 106934704, + "step": 49555 + }, + { + "epoch": 8.084828711256117, + "grad_norm": 0.7275633215904236, + "learning_rate": 5.382399616570869e-06, + "loss": 0.116, + "num_input_tokens_seen": 106945936, + "step": 49560 + }, + { + "epoch": 8.085644371941273, + "grad_norm": 1.1517655849456787, + "learning_rate": 5.377988183569521e-06, + "loss": 0.1153, + "num_input_tokens_seen": 106957264, + "step": 49565 + }, + { + "epoch": 8.086460032626427, + "grad_norm": 0.4705158472061157, + "learning_rate": 5.3735783412280134e-06, + "loss": 0.2647, + "num_input_tokens_seen": 106968816, + "step": 49570 + }, + { + "epoch": 8.087275693311582, + "grad_norm": 0.057907894253730774, + "learning_rate": 5.36917008990383e-06, + "loss": 0.1666, + "num_input_tokens_seen": 106981680, + "step": 49575 + }, + { + "epoch": 8.088091353996738, + "grad_norm": 0.5493987202644348, + "learning_rate": 5.364763429954317e-06, + "loss": 0.1072, + "num_input_tokens_seen": 106992048, + "step": 49580 + }, + { + "epoch": 8.088907014681892, + "grad_norm": 0.417367160320282, + "learning_rate": 5.360358361736714e-06, + "loss": 0.0338, + "num_input_tokens_seen": 107004016, + "step": 49585 + }, + { + "epoch": 8.089722675367048, + "grad_norm": 1.5700583457946777, + "learning_rate": 5.3559548856081135e-06, + "loss": 0.1095, + "num_input_tokens_seen": 107015024, + "step": 49590 + }, + { + "epoch": 8.090538336052202, + "grad_norm": 0.42736729979515076, + "learning_rate": 5.351553001925486e-06, + "loss": 0.0199, + "num_input_tokens_seen": 107025008, + "step": 49595 + }, + { + "epoch": 8.091353996737357, + "grad_norm": 0.8245664238929749, + "learning_rate": 5.347152711045664e-06, + "loss": 0.0656, + "num_input_tokens_seen": 107035408, + "step": 49600 + }, + { + "epoch": 8.092169657422513, + "grad_norm": 0.7347253561019897, + "learning_rate": 5.342754013325363e-06, + "loss": 0.0986, + "num_input_tokens_seen": 107046960, + "step": 49605 + }, + { + "epoch": 8.092985318107667, + "grad_norm": 1.0357515811920166, + "learning_rate": 5.338356909121159e-06, + "loss": 0.2495, + "num_input_tokens_seen": 107058736, + "step": 49610 + }, + { + "epoch": 8.093800978792823, + "grad_norm": 0.30424338579177856, + "learning_rate": 5.3339613987895084e-06, + "loss": 0.1193, + "num_input_tokens_seen": 107069872, + "step": 49615 + }, + { + "epoch": 8.094616639477977, + "grad_norm": 0.09138703346252441, + "learning_rate": 5.329567482686729e-06, + "loss": 0.0437, + "num_input_tokens_seen": 107080400, + "step": 49620 + }, + { + "epoch": 8.095432300163132, + "grad_norm": 1.0167434215545654, + "learning_rate": 5.325175161169019e-06, + "loss": 0.0706, + "num_input_tokens_seen": 107091696, + "step": 49625 + }, + { + "epoch": 8.096247960848286, + "grad_norm": 2.2476255893707275, + "learning_rate": 5.320784434592438e-06, + "loss": 0.1911, + "num_input_tokens_seen": 107101168, + "step": 49630 + }, + { + "epoch": 8.097063621533442, + "grad_norm": 0.5623724460601807, + "learning_rate": 5.316395303312921e-06, + "loss": 0.0959, + "num_input_tokens_seen": 107112464, + "step": 49635 + }, + { + "epoch": 8.097879282218598, + "grad_norm": 1.9982528686523438, + "learning_rate": 5.3120077676862754e-06, + "loss": 0.1608, + "num_input_tokens_seen": 107123280, + "step": 49640 + }, + { + "epoch": 8.098694942903752, + "grad_norm": 1.8952523469924927, + "learning_rate": 5.307621828068177e-06, + "loss": 0.1626, + "num_input_tokens_seen": 107134800, + "step": 49645 + }, + { + "epoch": 8.099510603588907, + "grad_norm": 0.10074829310178757, + "learning_rate": 5.303237484814169e-06, + "loss": 0.0883, + "num_input_tokens_seen": 107145904, + "step": 49650 + }, + { + "epoch": 8.100326264274061, + "grad_norm": 1.1681482791900635, + "learning_rate": 5.2988547382796735e-06, + "loss": 0.1522, + "num_input_tokens_seen": 107156464, + "step": 49655 + }, + { + "epoch": 8.101141924959217, + "grad_norm": 0.4685456454753876, + "learning_rate": 5.294473588819968e-06, + "loss": 0.1081, + "num_input_tokens_seen": 107167824, + "step": 49660 + }, + { + "epoch": 8.101957585644373, + "grad_norm": 1.5799741744995117, + "learning_rate": 5.2900940367902245e-06, + "loss": 0.0793, + "num_input_tokens_seen": 107178928, + "step": 49665 + }, + { + "epoch": 8.102773246329527, + "grad_norm": 0.1459600329399109, + "learning_rate": 5.2857160825454635e-06, + "loss": 0.1855, + "num_input_tokens_seen": 107191120, + "step": 49670 + }, + { + "epoch": 8.103588907014682, + "grad_norm": 0.3291301429271698, + "learning_rate": 5.28133972644059e-06, + "loss": 0.1085, + "num_input_tokens_seen": 107201872, + "step": 49675 + }, + { + "epoch": 8.104404567699836, + "grad_norm": 1.4186580181121826, + "learning_rate": 5.276964968830367e-06, + "loss": 0.1667, + "num_input_tokens_seen": 107214192, + "step": 49680 + }, + { + "epoch": 8.105220228384992, + "grad_norm": 0.6041234135627747, + "learning_rate": 5.272591810069438e-06, + "loss": 0.076, + "num_input_tokens_seen": 107225616, + "step": 49685 + }, + { + "epoch": 8.106035889070148, + "grad_norm": 0.0983622744679451, + "learning_rate": 5.268220250512315e-06, + "loss": 0.1408, + "num_input_tokens_seen": 107236208, + "step": 49690 + }, + { + "epoch": 8.106851549755302, + "grad_norm": 0.08380964398384094, + "learning_rate": 5.263850290513373e-06, + "loss": 0.042, + "num_input_tokens_seen": 107247632, + "step": 49695 + }, + { + "epoch": 8.107667210440457, + "grad_norm": 0.13798944652080536, + "learning_rate": 5.259481930426869e-06, + "loss": 0.0075, + "num_input_tokens_seen": 107258288, + "step": 49700 + }, + { + "epoch": 8.108482871125611, + "grad_norm": 0.06305728852748871, + "learning_rate": 5.2551151706069225e-06, + "loss": 0.0179, + "num_input_tokens_seen": 107268944, + "step": 49705 + }, + { + "epoch": 8.109298531810767, + "grad_norm": 0.10897544771432877, + "learning_rate": 5.250750011407521e-06, + "loss": 0.1134, + "num_input_tokens_seen": 107279472, + "step": 49710 + }, + { + "epoch": 8.11011419249592, + "grad_norm": 1.6675080060958862, + "learning_rate": 5.246386453182533e-06, + "loss": 0.125, + "num_input_tokens_seen": 107290064, + "step": 49715 + }, + { + "epoch": 8.110929853181077, + "grad_norm": 0.06437622010707855, + "learning_rate": 5.242024496285683e-06, + "loss": 0.0112, + "num_input_tokens_seen": 107300560, + "step": 49720 + }, + { + "epoch": 8.111745513866232, + "grad_norm": 0.2130148857831955, + "learning_rate": 5.237664141070583e-06, + "loss": 0.0403, + "num_input_tokens_seen": 107310512, + "step": 49725 + }, + { + "epoch": 8.112561174551386, + "grad_norm": 1.4085643291473389, + "learning_rate": 5.2333053878907e-06, + "loss": 0.0545, + "num_input_tokens_seen": 107321296, + "step": 49730 + }, + { + "epoch": 8.113376835236542, + "grad_norm": 0.2623315751552582, + "learning_rate": 5.228948237099379e-06, + "loss": 0.1206, + "num_input_tokens_seen": 107331920, + "step": 49735 + }, + { + "epoch": 8.114192495921696, + "grad_norm": 1.1012908220291138, + "learning_rate": 5.224592689049832e-06, + "loss": 0.1183, + "num_input_tokens_seen": 107342768, + "step": 49740 + }, + { + "epoch": 8.115008156606851, + "grad_norm": 0.40885111689567566, + "learning_rate": 5.220238744095137e-06, + "loss": 0.1105, + "num_input_tokens_seen": 107352304, + "step": 49745 + }, + { + "epoch": 8.115823817292007, + "grad_norm": 0.020881539210677147, + "learning_rate": 5.215886402588255e-06, + "loss": 0.0213, + "num_input_tokens_seen": 107362448, + "step": 49750 + }, + { + "epoch": 8.116639477977161, + "grad_norm": 0.0828927680850029, + "learning_rate": 5.211535664882003e-06, + "loss": 0.0318, + "num_input_tokens_seen": 107372368, + "step": 49755 + }, + { + "epoch": 8.117455138662317, + "grad_norm": 0.5445172786712646, + "learning_rate": 5.207186531329075e-06, + "loss": 0.0724, + "num_input_tokens_seen": 107383120, + "step": 49760 + }, + { + "epoch": 8.11827079934747, + "grad_norm": 1.1807955503463745, + "learning_rate": 5.202839002282037e-06, + "loss": 0.0716, + "num_input_tokens_seen": 107394576, + "step": 49765 + }, + { + "epoch": 8.119086460032626, + "grad_norm": 0.10731343924999237, + "learning_rate": 5.198493078093311e-06, + "loss": 0.2074, + "num_input_tokens_seen": 107404752, + "step": 49770 + }, + { + "epoch": 8.119902120717782, + "grad_norm": 0.39898836612701416, + "learning_rate": 5.194148759115214e-06, + "loss": 0.1192, + "num_input_tokens_seen": 107414288, + "step": 49775 + }, + { + "epoch": 8.120717781402936, + "grad_norm": 1.5429530143737793, + "learning_rate": 5.189806045699913e-06, + "loss": 0.1665, + "num_input_tokens_seen": 107425648, + "step": 49780 + }, + { + "epoch": 8.121533442088092, + "grad_norm": 0.4442732632160187, + "learning_rate": 5.185464938199449e-06, + "loss": 0.136, + "num_input_tokens_seen": 107437936, + "step": 49785 + }, + { + "epoch": 8.122349102773246, + "grad_norm": 1.499144196510315, + "learning_rate": 5.181125436965739e-06, + "loss": 0.1435, + "num_input_tokens_seen": 107448336, + "step": 49790 + }, + { + "epoch": 8.123164763458401, + "grad_norm": 0.5827367901802063, + "learning_rate": 5.176787542350558e-06, + "loss": 0.096, + "num_input_tokens_seen": 107458416, + "step": 49795 + }, + { + "epoch": 8.123980424143557, + "grad_norm": 0.10558357834815979, + "learning_rate": 5.172451254705559e-06, + "loss": 0.0874, + "num_input_tokens_seen": 107469872, + "step": 49800 + }, + { + "epoch": 8.124796084828711, + "grad_norm": 1.0223647356033325, + "learning_rate": 5.1681165743822676e-06, + "loss": 0.1774, + "num_input_tokens_seen": 107479760, + "step": 49805 + }, + { + "epoch": 8.125611745513867, + "grad_norm": 0.7829576134681702, + "learning_rate": 5.1637835017320726e-06, + "loss": 0.1164, + "num_input_tokens_seen": 107490096, + "step": 49810 + }, + { + "epoch": 8.12642740619902, + "grad_norm": 0.25755712389945984, + "learning_rate": 5.159452037106236e-06, + "loss": 0.152, + "num_input_tokens_seen": 107500368, + "step": 49815 + }, + { + "epoch": 8.127243066884176, + "grad_norm": 0.505631148815155, + "learning_rate": 5.155122180855884e-06, + "loss": 0.0741, + "num_input_tokens_seen": 107511600, + "step": 49820 + }, + { + "epoch": 8.12805872756933, + "grad_norm": 0.2771296203136444, + "learning_rate": 5.150793933332024e-06, + "loss": 0.137, + "num_input_tokens_seen": 107522672, + "step": 49825 + }, + { + "epoch": 8.128874388254486, + "grad_norm": 1.1553218364715576, + "learning_rate": 5.146467294885518e-06, + "loss": 0.0419, + "num_input_tokens_seen": 107534096, + "step": 49830 + }, + { + "epoch": 8.129690048939642, + "grad_norm": 2.012634515762329, + "learning_rate": 5.142142265867112e-06, + "loss": 0.2374, + "num_input_tokens_seen": 107543472, + "step": 49835 + }, + { + "epoch": 8.130505709624796, + "grad_norm": 1.356930136680603, + "learning_rate": 5.137818846627409e-06, + "loss": 0.2588, + "num_input_tokens_seen": 107554224, + "step": 49840 + }, + { + "epoch": 8.131321370309951, + "grad_norm": 0.17152048647403717, + "learning_rate": 5.13349703751689e-06, + "loss": 0.1221, + "num_input_tokens_seen": 107564240, + "step": 49845 + }, + { + "epoch": 8.132137030995105, + "grad_norm": 0.6064700484275818, + "learning_rate": 5.129176838885905e-06, + "loss": 0.0898, + "num_input_tokens_seen": 107574800, + "step": 49850 + }, + { + "epoch": 8.132952691680261, + "grad_norm": 0.4454590082168579, + "learning_rate": 5.124858251084666e-06, + "loss": 0.2438, + "num_input_tokens_seen": 107585520, + "step": 49855 + }, + { + "epoch": 8.133768352365417, + "grad_norm": 0.19075508415699005, + "learning_rate": 5.120541274463264e-06, + "loss": 0.1397, + "num_input_tokens_seen": 107596112, + "step": 49860 + }, + { + "epoch": 8.13458401305057, + "grad_norm": 0.6578219532966614, + "learning_rate": 5.116225909371649e-06, + "loss": 0.0355, + "num_input_tokens_seen": 107607120, + "step": 49865 + }, + { + "epoch": 8.135399673735726, + "grad_norm": 0.24798716604709625, + "learning_rate": 5.111912156159657e-06, + "loss": 0.1909, + "num_input_tokens_seen": 107618000, + "step": 49870 + }, + { + "epoch": 8.13621533442088, + "grad_norm": 0.6189521551132202, + "learning_rate": 5.107600015176975e-06, + "loss": 0.1269, + "num_input_tokens_seen": 107629232, + "step": 49875 + }, + { + "epoch": 8.137030995106036, + "grad_norm": 1.1848697662353516, + "learning_rate": 5.103289486773169e-06, + "loss": 0.0879, + "num_input_tokens_seen": 107640784, + "step": 49880 + }, + { + "epoch": 8.137846655791192, + "grad_norm": 1.660826325416565, + "learning_rate": 5.098980571297673e-06, + "loss": 0.2295, + "num_input_tokens_seen": 107650576, + "step": 49885 + }, + { + "epoch": 8.138662316476346, + "grad_norm": 0.35509806871414185, + "learning_rate": 5.094673269099781e-06, + "loss": 0.0289, + "num_input_tokens_seen": 107660368, + "step": 49890 + }, + { + "epoch": 8.139477977161501, + "grad_norm": 0.08466221392154694, + "learning_rate": 5.090367580528679e-06, + "loss": 0.0935, + "num_input_tokens_seen": 107670096, + "step": 49895 + }, + { + "epoch": 8.140293637846655, + "grad_norm": 1.9366835355758667, + "learning_rate": 5.086063505933403e-06, + "loss": 0.2155, + "num_input_tokens_seen": 107680240, + "step": 49900 + }, + { + "epoch": 8.141109298531811, + "grad_norm": 0.040282242000103, + "learning_rate": 5.081761045662861e-06, + "loss": 0.0627, + "num_input_tokens_seen": 107691504, + "step": 49905 + }, + { + "epoch": 8.141924959216965, + "grad_norm": 1.959486484527588, + "learning_rate": 5.077460200065834e-06, + "loss": 0.2516, + "num_input_tokens_seen": 107701968, + "step": 49910 + }, + { + "epoch": 8.14274061990212, + "grad_norm": 1.102821707725525, + "learning_rate": 5.073160969490967e-06, + "loss": 0.2094, + "num_input_tokens_seen": 107713136, + "step": 49915 + }, + { + "epoch": 8.143556280587276, + "grad_norm": 0.862926721572876, + "learning_rate": 5.068863354286779e-06, + "loss": 0.1134, + "num_input_tokens_seen": 107724080, + "step": 49920 + }, + { + "epoch": 8.14437194127243, + "grad_norm": 0.503875195980072, + "learning_rate": 5.064567354801658e-06, + "loss": 0.0256, + "num_input_tokens_seen": 107735792, + "step": 49925 + }, + { + "epoch": 8.145187601957586, + "grad_norm": 0.39262568950653076, + "learning_rate": 5.060272971383862e-06, + "loss": 0.1148, + "num_input_tokens_seen": 107747280, + "step": 49930 + }, + { + "epoch": 8.14600326264274, + "grad_norm": 1.0558342933654785, + "learning_rate": 5.055980204381508e-06, + "loss": 0.1715, + "num_input_tokens_seen": 107758448, + "step": 49935 + }, + { + "epoch": 8.146818923327896, + "grad_norm": 0.3709533214569092, + "learning_rate": 5.051689054142594e-06, + "loss": 0.059, + "num_input_tokens_seen": 107769712, + "step": 49940 + }, + { + "epoch": 8.147634584013051, + "grad_norm": 0.038735780864953995, + "learning_rate": 5.047399521014984e-06, + "loss": 0.1199, + "num_input_tokens_seen": 107780624, + "step": 49945 + }, + { + "epoch": 8.148450244698205, + "grad_norm": 0.19963178038597107, + "learning_rate": 5.043111605346404e-06, + "loss": 0.0559, + "num_input_tokens_seen": 107790800, + "step": 49950 + }, + { + "epoch": 8.149265905383361, + "grad_norm": 0.17385411262512207, + "learning_rate": 5.03882530748446e-06, + "loss": 0.1263, + "num_input_tokens_seen": 107802736, + "step": 49955 + }, + { + "epoch": 8.150081566068515, + "grad_norm": 0.6690751910209656, + "learning_rate": 5.034540627776618e-06, + "loss": 0.0445, + "num_input_tokens_seen": 107813552, + "step": 49960 + }, + { + "epoch": 8.15089722675367, + "grad_norm": 0.13226382434368134, + "learning_rate": 5.030257566570215e-06, + "loss": 0.1842, + "num_input_tokens_seen": 107825264, + "step": 49965 + }, + { + "epoch": 8.151712887438826, + "grad_norm": 0.8906847238540649, + "learning_rate": 5.025976124212461e-06, + "loss": 0.0495, + "num_input_tokens_seen": 107836880, + "step": 49970 + }, + { + "epoch": 8.15252854812398, + "grad_norm": 0.21268922090530396, + "learning_rate": 5.0216963010504295e-06, + "loss": 0.0451, + "num_input_tokens_seen": 107849392, + "step": 49975 + }, + { + "epoch": 8.153344208809136, + "grad_norm": 1.373909831047058, + "learning_rate": 5.017418097431059e-06, + "loss": 0.1069, + "num_input_tokens_seen": 107860144, + "step": 49980 + }, + { + "epoch": 8.15415986949429, + "grad_norm": 0.10281497240066528, + "learning_rate": 5.013141513701173e-06, + "loss": 0.1954, + "num_input_tokens_seen": 107869520, + "step": 49985 + }, + { + "epoch": 8.154975530179446, + "grad_norm": 0.3922017812728882, + "learning_rate": 5.008866550207447e-06, + "loss": 0.1388, + "num_input_tokens_seen": 107879792, + "step": 49990 + }, + { + "epoch": 8.1557911908646, + "grad_norm": 0.9518253803253174, + "learning_rate": 5.004593207296434e-06, + "loss": 0.0648, + "num_input_tokens_seen": 107890672, + "step": 49995 + }, + { + "epoch": 8.156606851549755, + "grad_norm": 0.07317779213190079, + "learning_rate": 5.000321485314552e-06, + "loss": 0.1203, + "num_input_tokens_seen": 107900528, + "step": 50000 + }, + { + "epoch": 8.15742251223491, + "grad_norm": 1.4227790832519531, + "learning_rate": 4.9960513846080885e-06, + "loss": 0.3037, + "num_input_tokens_seen": 107911248, + "step": 50005 + }, + { + "epoch": 8.158238172920065, + "grad_norm": 0.4063059985637665, + "learning_rate": 4.991782905523196e-06, + "loss": 0.0248, + "num_input_tokens_seen": 107922800, + "step": 50010 + }, + { + "epoch": 8.15905383360522, + "grad_norm": 0.7754814624786377, + "learning_rate": 4.987516048405905e-06, + "loss": 0.101, + "num_input_tokens_seen": 107934032, + "step": 50015 + }, + { + "epoch": 8.159869494290374, + "grad_norm": 0.056396905332803726, + "learning_rate": 4.983250813602103e-06, + "loss": 0.0907, + "num_input_tokens_seen": 107946128, + "step": 50020 + }, + { + "epoch": 8.16068515497553, + "grad_norm": 0.05135722458362579, + "learning_rate": 4.978987201457555e-06, + "loss": 0.069, + "num_input_tokens_seen": 107956912, + "step": 50025 + }, + { + "epoch": 8.161500815660686, + "grad_norm": 1.4512377977371216, + "learning_rate": 4.97472521231789e-06, + "loss": 0.1379, + "num_input_tokens_seen": 107968112, + "step": 50030 + }, + { + "epoch": 8.16231647634584, + "grad_norm": 0.09425906091928482, + "learning_rate": 4.9704648465286e-06, + "loss": 0.0701, + "num_input_tokens_seen": 107978352, + "step": 50035 + }, + { + "epoch": 8.163132137030995, + "grad_norm": 1.0611740350723267, + "learning_rate": 4.966206104435064e-06, + "loss": 0.0632, + "num_input_tokens_seen": 107989264, + "step": 50040 + }, + { + "epoch": 8.16394779771615, + "grad_norm": 0.4156206250190735, + "learning_rate": 4.961948986382511e-06, + "loss": 0.0561, + "num_input_tokens_seen": 107999888, + "step": 50045 + }, + { + "epoch": 8.164763458401305, + "grad_norm": 0.08089230954647064, + "learning_rate": 4.957693492716048e-06, + "loss": 0.0915, + "num_input_tokens_seen": 108009680, + "step": 50050 + }, + { + "epoch": 8.16557911908646, + "grad_norm": 0.8638477325439453, + "learning_rate": 4.953439623780643e-06, + "loss": 0.1359, + "num_input_tokens_seen": 108020368, + "step": 50055 + }, + { + "epoch": 8.166394779771615, + "grad_norm": 0.2824704647064209, + "learning_rate": 4.949187379921136e-06, + "loss": 0.1507, + "num_input_tokens_seen": 108030736, + "step": 50060 + }, + { + "epoch": 8.16721044045677, + "grad_norm": 0.06698982417583466, + "learning_rate": 4.9449367614822384e-06, + "loss": 0.0249, + "num_input_tokens_seen": 108041232, + "step": 50065 + }, + { + "epoch": 8.168026101141924, + "grad_norm": 0.11195465177297592, + "learning_rate": 4.940687768808525e-06, + "loss": 0.1088, + "num_input_tokens_seen": 108053264, + "step": 50070 + }, + { + "epoch": 8.16884176182708, + "grad_norm": 6.513386249542236, + "learning_rate": 4.936440402244441e-06, + "loss": 0.1169, + "num_input_tokens_seen": 108063088, + "step": 50075 + }, + { + "epoch": 8.169657422512234, + "grad_norm": 0.7191120982170105, + "learning_rate": 4.932194662134298e-06, + "loss": 0.0549, + "num_input_tokens_seen": 108072688, + "step": 50080 + }, + { + "epoch": 8.17047308319739, + "grad_norm": 0.9932898879051208, + "learning_rate": 4.92795054882228e-06, + "loss": 0.0672, + "num_input_tokens_seen": 108085296, + "step": 50085 + }, + { + "epoch": 8.171288743882545, + "grad_norm": 0.8601304888725281, + "learning_rate": 4.9237080626524294e-06, + "loss": 0.0762, + "num_input_tokens_seen": 108096816, + "step": 50090 + }, + { + "epoch": 8.1721044045677, + "grad_norm": 0.10668136179447174, + "learning_rate": 4.919467203968675e-06, + "loss": 0.1845, + "num_input_tokens_seen": 108107280, + "step": 50095 + }, + { + "epoch": 8.172920065252855, + "grad_norm": 0.06969480961561203, + "learning_rate": 4.915227973114797e-06, + "loss": 0.1112, + "num_input_tokens_seen": 108117552, + "step": 50100 + }, + { + "epoch": 8.173735725938009, + "grad_norm": 0.8772156238555908, + "learning_rate": 4.910990370434449e-06, + "loss": 0.0722, + "num_input_tokens_seen": 108129264, + "step": 50105 + }, + { + "epoch": 8.174551386623165, + "grad_norm": 1.2582988739013672, + "learning_rate": 4.906754396271152e-06, + "loss": 0.0718, + "num_input_tokens_seen": 108139472, + "step": 50110 + }, + { + "epoch": 8.17536704730832, + "grad_norm": 0.8412271738052368, + "learning_rate": 4.902520050968293e-06, + "loss": 0.1182, + "num_input_tokens_seen": 108149712, + "step": 50115 + }, + { + "epoch": 8.176182707993474, + "grad_norm": 0.10386312007904053, + "learning_rate": 4.898287334869134e-06, + "loss": 0.1025, + "num_input_tokens_seen": 108160464, + "step": 50120 + }, + { + "epoch": 8.17699836867863, + "grad_norm": 1.861067533493042, + "learning_rate": 4.8940562483168005e-06, + "loss": 0.1872, + "num_input_tokens_seen": 108171312, + "step": 50125 + }, + { + "epoch": 8.177814029363784, + "grad_norm": 0.656967043876648, + "learning_rate": 4.889826791654281e-06, + "loss": 0.1882, + "num_input_tokens_seen": 108181616, + "step": 50130 + }, + { + "epoch": 8.17862969004894, + "grad_norm": 0.048105981200933456, + "learning_rate": 4.8855989652244415e-06, + "loss": 0.1905, + "num_input_tokens_seen": 108191792, + "step": 50135 + }, + { + "epoch": 8.179445350734095, + "grad_norm": 0.6630014181137085, + "learning_rate": 4.8813727693700104e-06, + "loss": 0.2274, + "num_input_tokens_seen": 108203216, + "step": 50140 + }, + { + "epoch": 8.18026101141925, + "grad_norm": 0.18551583588123322, + "learning_rate": 4.877148204433582e-06, + "loss": 0.0652, + "num_input_tokens_seen": 108214000, + "step": 50145 + }, + { + "epoch": 8.181076672104405, + "grad_norm": 1.0558626651763916, + "learning_rate": 4.872925270757623e-06, + "loss": 0.0991, + "num_input_tokens_seen": 108224528, + "step": 50150 + }, + { + "epoch": 8.181892332789559, + "grad_norm": 0.6764323115348816, + "learning_rate": 4.868703968684466e-06, + "loss": 0.0294, + "num_input_tokens_seen": 108235088, + "step": 50155 + }, + { + "epoch": 8.182707993474715, + "grad_norm": 0.9559144973754883, + "learning_rate": 4.86448429855631e-06, + "loss": 0.1071, + "num_input_tokens_seen": 108245616, + "step": 50160 + }, + { + "epoch": 8.18352365415987, + "grad_norm": 1.5327619314193726, + "learning_rate": 4.860266260715221e-06, + "loss": 0.1648, + "num_input_tokens_seen": 108256144, + "step": 50165 + }, + { + "epoch": 8.184339314845024, + "grad_norm": 0.6028555035591125, + "learning_rate": 4.856049855503139e-06, + "loss": 0.1188, + "num_input_tokens_seen": 108267248, + "step": 50170 + }, + { + "epoch": 8.18515497553018, + "grad_norm": 2.571467638015747, + "learning_rate": 4.8518350832618655e-06, + "loss": 0.1732, + "num_input_tokens_seen": 108278128, + "step": 50175 + }, + { + "epoch": 8.185970636215334, + "grad_norm": 0.5186923742294312, + "learning_rate": 4.847621944333064e-06, + "loss": 0.219, + "num_input_tokens_seen": 108290352, + "step": 50180 + }, + { + "epoch": 8.18678629690049, + "grad_norm": 0.4284508526325226, + "learning_rate": 4.8434104390582855e-06, + "loss": 0.0416, + "num_input_tokens_seen": 108301776, + "step": 50185 + }, + { + "epoch": 8.187601957585644, + "grad_norm": 1.7609421014785767, + "learning_rate": 4.839200567778932e-06, + "loss": 0.0853, + "num_input_tokens_seen": 108312208, + "step": 50190 + }, + { + "epoch": 8.1884176182708, + "grad_norm": 1.3681172132492065, + "learning_rate": 4.834992330836274e-06, + "loss": 0.1786, + "num_input_tokens_seen": 108322480, + "step": 50195 + }, + { + "epoch": 8.189233278955955, + "grad_norm": 0.3969310224056244, + "learning_rate": 4.8307857285714545e-06, + "loss": 0.05, + "num_input_tokens_seen": 108333392, + "step": 50200 + }, + { + "epoch": 8.190048939641109, + "grad_norm": 0.0752984881401062, + "learning_rate": 4.826580761325475e-06, + "loss": 0.0331, + "num_input_tokens_seen": 108343856, + "step": 50205 + }, + { + "epoch": 8.190864600326265, + "grad_norm": 0.08413950353860855, + "learning_rate": 4.822377429439223e-06, + "loss": 0.0914, + "num_input_tokens_seen": 108355728, + "step": 50210 + }, + { + "epoch": 8.191680261011419, + "grad_norm": 0.09019526094198227, + "learning_rate": 4.818175733253438e-06, + "loss": 0.2548, + "num_input_tokens_seen": 108366736, + "step": 50215 + }, + { + "epoch": 8.192495921696574, + "grad_norm": 0.45593392848968506, + "learning_rate": 4.813975673108731e-06, + "loss": 0.0395, + "num_input_tokens_seen": 108376912, + "step": 50220 + }, + { + "epoch": 8.19331158238173, + "grad_norm": 0.21802419424057007, + "learning_rate": 4.809777249345576e-06, + "loss": 0.2402, + "num_input_tokens_seen": 108387440, + "step": 50225 + }, + { + "epoch": 8.194127243066884, + "grad_norm": 0.15356940031051636, + "learning_rate": 4.8055804623043235e-06, + "loss": 0.2664, + "num_input_tokens_seen": 108397232, + "step": 50230 + }, + { + "epoch": 8.19494290375204, + "grad_norm": 0.8041834235191345, + "learning_rate": 4.801385312325182e-06, + "loss": 0.0764, + "num_input_tokens_seen": 108408016, + "step": 50235 + }, + { + "epoch": 8.195758564437194, + "grad_norm": 1.5068293809890747, + "learning_rate": 4.7971917997482376e-06, + "loss": 0.1103, + "num_input_tokens_seen": 108419024, + "step": 50240 + }, + { + "epoch": 8.19657422512235, + "grad_norm": 0.12052559852600098, + "learning_rate": 4.79299992491343e-06, + "loss": 0.0398, + "num_input_tokens_seen": 108430320, + "step": 50245 + }, + { + "epoch": 8.197389885807505, + "grad_norm": 0.05534833297133446, + "learning_rate": 4.788809688160581e-06, + "loss": 0.0197, + "num_input_tokens_seen": 108441104, + "step": 50250 + }, + { + "epoch": 8.198205546492659, + "grad_norm": 0.19815191626548767, + "learning_rate": 4.784621089829366e-06, + "loss": 0.1854, + "num_input_tokens_seen": 108452656, + "step": 50255 + }, + { + "epoch": 8.199021207177815, + "grad_norm": 1.3801740407943726, + "learning_rate": 4.780434130259339e-06, + "loss": 0.172, + "num_input_tokens_seen": 108462992, + "step": 50260 + }, + { + "epoch": 8.199836867862969, + "grad_norm": 0.048685673624277115, + "learning_rate": 4.7762488097899154e-06, + "loss": 0.0862, + "num_input_tokens_seen": 108473744, + "step": 50265 + }, + { + "epoch": 8.200652528548124, + "grad_norm": 0.30513978004455566, + "learning_rate": 4.772065128760375e-06, + "loss": 0.1126, + "num_input_tokens_seen": 108485136, + "step": 50270 + }, + { + "epoch": 8.201468189233278, + "grad_norm": 1.2770030498504639, + "learning_rate": 4.767883087509872e-06, + "loss": 0.1408, + "num_input_tokens_seen": 108496720, + "step": 50275 + }, + { + "epoch": 8.202283849918434, + "grad_norm": 1.6288734674453735, + "learning_rate": 4.763702686377425e-06, + "loss": 0.1066, + "num_input_tokens_seen": 108508016, + "step": 50280 + }, + { + "epoch": 8.20309951060359, + "grad_norm": 1.1603511571884155, + "learning_rate": 4.759523925701914e-06, + "loss": 0.2003, + "num_input_tokens_seen": 108517872, + "step": 50285 + }, + { + "epoch": 8.203915171288743, + "grad_norm": 0.9824179410934448, + "learning_rate": 4.7553468058220915e-06, + "loss": 0.1708, + "num_input_tokens_seen": 108527280, + "step": 50290 + }, + { + "epoch": 8.2047308319739, + "grad_norm": 0.17351555824279785, + "learning_rate": 4.751171327076579e-06, + "loss": 0.1652, + "num_input_tokens_seen": 108538192, + "step": 50295 + }, + { + "epoch": 8.205546492659053, + "grad_norm": 1.345658779144287, + "learning_rate": 4.746997489803853e-06, + "loss": 0.189, + "num_input_tokens_seen": 108546960, + "step": 50300 + }, + { + "epoch": 8.206362153344209, + "grad_norm": 0.5681415796279907, + "learning_rate": 4.7428252943422794e-06, + "loss": 0.083, + "num_input_tokens_seen": 108556912, + "step": 50305 + }, + { + "epoch": 8.207177814029365, + "grad_norm": 0.14759688079357147, + "learning_rate": 4.738654741030074e-06, + "loss": 0.0602, + "num_input_tokens_seen": 108568144, + "step": 50310 + }, + { + "epoch": 8.207993474714518, + "grad_norm": 0.051384832710027695, + "learning_rate": 4.734485830205318e-06, + "loss": 0.0104, + "num_input_tokens_seen": 108578160, + "step": 50315 + }, + { + "epoch": 8.208809135399674, + "grad_norm": 0.11295311152935028, + "learning_rate": 4.730318562205965e-06, + "loss": 0.1281, + "num_input_tokens_seen": 108589296, + "step": 50320 + }, + { + "epoch": 8.209624796084828, + "grad_norm": 0.39331695437431335, + "learning_rate": 4.7261529373698404e-06, + "loss": 0.0466, + "num_input_tokens_seen": 108600240, + "step": 50325 + }, + { + "epoch": 8.210440456769984, + "grad_norm": 1.568416714668274, + "learning_rate": 4.721988956034626e-06, + "loss": 0.1199, + "num_input_tokens_seen": 108611056, + "step": 50330 + }, + { + "epoch": 8.21125611745514, + "grad_norm": 0.40343165397644043, + "learning_rate": 4.717826618537874e-06, + "loss": 0.1791, + "num_input_tokens_seen": 108620400, + "step": 50335 + }, + { + "epoch": 8.212071778140293, + "grad_norm": 0.4711487889289856, + "learning_rate": 4.713665925217009e-06, + "loss": 0.0343, + "num_input_tokens_seen": 108632432, + "step": 50340 + }, + { + "epoch": 8.21288743882545, + "grad_norm": 0.9445536136627197, + "learning_rate": 4.709506876409317e-06, + "loss": 0.2373, + "num_input_tokens_seen": 108642896, + "step": 50345 + }, + { + "epoch": 8.213703099510603, + "grad_norm": 1.3254739046096802, + "learning_rate": 4.705349472451942e-06, + "loss": 0.1454, + "num_input_tokens_seen": 108652816, + "step": 50350 + }, + { + "epoch": 8.214518760195759, + "grad_norm": 0.351024866104126, + "learning_rate": 4.70119371368192e-06, + "loss": 0.206, + "num_input_tokens_seen": 108663792, + "step": 50355 + }, + { + "epoch": 8.215334420880913, + "grad_norm": 0.15197882056236267, + "learning_rate": 4.697039600436132e-06, + "loss": 0.028, + "num_input_tokens_seen": 108674896, + "step": 50360 + }, + { + "epoch": 8.216150081566068, + "grad_norm": 1.1862223148345947, + "learning_rate": 4.6928871330513296e-06, + "loss": 0.1657, + "num_input_tokens_seen": 108685584, + "step": 50365 + }, + { + "epoch": 8.216965742251224, + "grad_norm": 1.2145887613296509, + "learning_rate": 4.6887363118641335e-06, + "loss": 0.1617, + "num_input_tokens_seen": 108696560, + "step": 50370 + }, + { + "epoch": 8.217781402936378, + "grad_norm": 0.33000755310058594, + "learning_rate": 4.68458713721103e-06, + "loss": 0.0379, + "num_input_tokens_seen": 108706512, + "step": 50375 + }, + { + "epoch": 8.218597063621534, + "grad_norm": 1.0776296854019165, + "learning_rate": 4.680439609428372e-06, + "loss": 0.2006, + "num_input_tokens_seen": 108717808, + "step": 50380 + }, + { + "epoch": 8.219412724306688, + "grad_norm": 0.48731479048728943, + "learning_rate": 4.676293728852379e-06, + "loss": 0.0732, + "num_input_tokens_seen": 108729040, + "step": 50385 + }, + { + "epoch": 8.220228384991843, + "grad_norm": 0.09229531139135361, + "learning_rate": 4.6721494958191395e-06, + "loss": 0.1058, + "num_input_tokens_seen": 108739536, + "step": 50390 + }, + { + "epoch": 8.221044045676999, + "grad_norm": 0.7163644433021545, + "learning_rate": 4.6680069106646014e-06, + "loss": 0.0463, + "num_input_tokens_seen": 108750640, + "step": 50395 + }, + { + "epoch": 8.221859706362153, + "grad_norm": 0.6367756128311157, + "learning_rate": 4.663865973724591e-06, + "loss": 0.1021, + "num_input_tokens_seen": 108760368, + "step": 50400 + }, + { + "epoch": 8.222675367047309, + "grad_norm": 0.4257257282733917, + "learning_rate": 4.659726685334786e-06, + "loss": 0.0775, + "num_input_tokens_seen": 108772336, + "step": 50405 + }, + { + "epoch": 8.223491027732463, + "grad_norm": 1.8445682525634766, + "learning_rate": 4.655589045830735e-06, + "loss": 0.0987, + "num_input_tokens_seen": 108782800, + "step": 50410 + }, + { + "epoch": 8.224306688417618, + "grad_norm": 0.03136774152517319, + "learning_rate": 4.651453055547872e-06, + "loss": 0.0614, + "num_input_tokens_seen": 108794000, + "step": 50415 + }, + { + "epoch": 8.225122349102774, + "grad_norm": 0.7046485543251038, + "learning_rate": 4.647318714821469e-06, + "loss": 0.0338, + "num_input_tokens_seen": 108805392, + "step": 50420 + }, + { + "epoch": 8.225938009787928, + "grad_norm": 1.0490460395812988, + "learning_rate": 4.643186023986681e-06, + "loss": 0.0778, + "num_input_tokens_seen": 108817616, + "step": 50425 + }, + { + "epoch": 8.226753670473084, + "grad_norm": 0.16333556175231934, + "learning_rate": 4.639054983378521e-06, + "loss": 0.0573, + "num_input_tokens_seen": 108829360, + "step": 50430 + }, + { + "epoch": 8.227569331158238, + "grad_norm": 0.827948808670044, + "learning_rate": 4.634925593331876e-06, + "loss": 0.201, + "num_input_tokens_seen": 108839696, + "step": 50435 + }, + { + "epoch": 8.228384991843393, + "grad_norm": 0.10511752963066101, + "learning_rate": 4.630797854181495e-06, + "loss": 0.0531, + "num_input_tokens_seen": 108851184, + "step": 50440 + }, + { + "epoch": 8.229200652528547, + "grad_norm": 0.34275415539741516, + "learning_rate": 4.626671766261992e-06, + "loss": 0.2091, + "num_input_tokens_seen": 108861744, + "step": 50445 + }, + { + "epoch": 8.230016313213703, + "grad_norm": 2.0755112171173096, + "learning_rate": 4.622547329907848e-06, + "loss": 0.0837, + "num_input_tokens_seen": 108872816, + "step": 50450 + }, + { + "epoch": 8.230831973898859, + "grad_norm": 0.14841414988040924, + "learning_rate": 4.618424545453409e-06, + "loss": 0.0544, + "num_input_tokens_seen": 108884688, + "step": 50455 + }, + { + "epoch": 8.231647634584013, + "grad_norm": 0.90597003698349, + "learning_rate": 4.6143034132328955e-06, + "loss": 0.159, + "num_input_tokens_seen": 108896208, + "step": 50460 + }, + { + "epoch": 8.232463295269168, + "grad_norm": 1.1954562664031982, + "learning_rate": 4.610183933580381e-06, + "loss": 0.1497, + "num_input_tokens_seen": 108907536, + "step": 50465 + }, + { + "epoch": 8.233278955954322, + "grad_norm": 0.027749301865696907, + "learning_rate": 4.606066106829815e-06, + "loss": 0.0828, + "num_input_tokens_seen": 108917360, + "step": 50470 + }, + { + "epoch": 8.234094616639478, + "grad_norm": 0.04727540165185928, + "learning_rate": 4.601949933315009e-06, + "loss": 0.1357, + "num_input_tokens_seen": 108927952, + "step": 50475 + }, + { + "epoch": 8.234910277324634, + "grad_norm": 1.0404704809188843, + "learning_rate": 4.597835413369639e-06, + "loss": 0.0443, + "num_input_tokens_seen": 108939248, + "step": 50480 + }, + { + "epoch": 8.235725938009788, + "grad_norm": 1.5434926748275757, + "learning_rate": 4.593722547327248e-06, + "loss": 0.2542, + "num_input_tokens_seen": 108950832, + "step": 50485 + }, + { + "epoch": 8.236541598694943, + "grad_norm": 0.8972826600074768, + "learning_rate": 4.589611335521249e-06, + "loss": 0.1455, + "num_input_tokens_seen": 108961488, + "step": 50490 + }, + { + "epoch": 8.237357259380097, + "grad_norm": 0.7989068627357483, + "learning_rate": 4.585501778284912e-06, + "loss": 0.1612, + "num_input_tokens_seen": 108971888, + "step": 50495 + }, + { + "epoch": 8.238172920065253, + "grad_norm": 1.1510189771652222, + "learning_rate": 4.581393875951387e-06, + "loss": 0.106, + "num_input_tokens_seen": 108983856, + "step": 50500 + }, + { + "epoch": 8.238988580750409, + "grad_norm": 0.08332093805074692, + "learning_rate": 4.577287628853677e-06, + "loss": 0.2708, + "num_input_tokens_seen": 108995408, + "step": 50505 + }, + { + "epoch": 8.239804241435563, + "grad_norm": 0.03612089529633522, + "learning_rate": 4.5731830373246574e-06, + "loss": 0.1607, + "num_input_tokens_seen": 109006736, + "step": 50510 + }, + { + "epoch": 8.240619902120718, + "grad_norm": 0.3223406970500946, + "learning_rate": 4.5690801016970655e-06, + "loss": 0.0335, + "num_input_tokens_seen": 109016528, + "step": 50515 + }, + { + "epoch": 8.241435562805872, + "grad_norm": 0.9369385242462158, + "learning_rate": 4.564978822303498e-06, + "loss": 0.1569, + "num_input_tokens_seen": 109028688, + "step": 50520 + }, + { + "epoch": 8.242251223491028, + "grad_norm": 0.07333335280418396, + "learning_rate": 4.560879199476442e-06, + "loss": 0.2168, + "num_input_tokens_seen": 109039792, + "step": 50525 + }, + { + "epoch": 8.243066884176184, + "grad_norm": 0.14685934782028198, + "learning_rate": 4.5567812335482244e-06, + "loss": 0.023, + "num_input_tokens_seen": 109050416, + "step": 50530 + }, + { + "epoch": 8.243882544861338, + "grad_norm": 0.1465315818786621, + "learning_rate": 4.5526849248510475e-06, + "loss": 0.1468, + "num_input_tokens_seen": 109060816, + "step": 50535 + }, + { + "epoch": 8.244698205546493, + "grad_norm": 0.0922340676188469, + "learning_rate": 4.548590273716979e-06, + "loss": 0.0241, + "num_input_tokens_seen": 109071920, + "step": 50540 + }, + { + "epoch": 8.245513866231647, + "grad_norm": 0.25240659713745117, + "learning_rate": 4.5444972804779525e-06, + "loss": 0.1755, + "num_input_tokens_seen": 109082512, + "step": 50545 + }, + { + "epoch": 8.246329526916803, + "grad_norm": 0.07542598247528076, + "learning_rate": 4.540405945465767e-06, + "loss": 0.169, + "num_input_tokens_seen": 109093744, + "step": 50550 + }, + { + "epoch": 8.247145187601957, + "grad_norm": 1.105970859527588, + "learning_rate": 4.536316269012086e-06, + "loss": 0.0466, + "num_input_tokens_seen": 109104368, + "step": 50555 + }, + { + "epoch": 8.247960848287113, + "grad_norm": 1.163034200668335, + "learning_rate": 4.532228251448439e-06, + "loss": 0.2092, + "num_input_tokens_seen": 109115920, + "step": 50560 + }, + { + "epoch": 8.248776508972268, + "grad_norm": 0.6025656461715698, + "learning_rate": 4.528141893106225e-06, + "loss": 0.0652, + "num_input_tokens_seen": 109127088, + "step": 50565 + }, + { + "epoch": 8.249592169657422, + "grad_norm": 0.058778293430805206, + "learning_rate": 4.5240571943167e-06, + "loss": 0.0911, + "num_input_tokens_seen": 109138736, + "step": 50570 + }, + { + "epoch": 8.250407830342578, + "grad_norm": 0.06957313418388367, + "learning_rate": 4.519974155410992e-06, + "loss": 0.1485, + "num_input_tokens_seen": 109149488, + "step": 50575 + }, + { + "epoch": 8.251223491027732, + "grad_norm": 0.8092506527900696, + "learning_rate": 4.515892776720096e-06, + "loss": 0.0537, + "num_input_tokens_seen": 109160656, + "step": 50580 + }, + { + "epoch": 8.252039151712887, + "grad_norm": 0.4475059509277344, + "learning_rate": 4.5118130585748655e-06, + "loss": 0.0299, + "num_input_tokens_seen": 109171312, + "step": 50585 + }, + { + "epoch": 8.252854812398043, + "grad_norm": 0.6041321158409119, + "learning_rate": 4.507735001306024e-06, + "loss": 0.0201, + "num_input_tokens_seen": 109182384, + "step": 50590 + }, + { + "epoch": 8.253670473083197, + "grad_norm": 0.046230707317590714, + "learning_rate": 4.503658605244163e-06, + "loss": 0.0459, + "num_input_tokens_seen": 109191792, + "step": 50595 + }, + { + "epoch": 8.254486133768353, + "grad_norm": 0.4636770784854889, + "learning_rate": 4.499583870719728e-06, + "loss": 0.1014, + "num_input_tokens_seen": 109202960, + "step": 50600 + }, + { + "epoch": 8.255301794453507, + "grad_norm": 1.5469251871109009, + "learning_rate": 4.495510798063046e-06, + "loss": 0.1157, + "num_input_tokens_seen": 109214288, + "step": 50605 + }, + { + "epoch": 8.256117455138662, + "grad_norm": 0.09517119079828262, + "learning_rate": 4.4914393876042984e-06, + "loss": 0.095, + "num_input_tokens_seen": 109225328, + "step": 50610 + }, + { + "epoch": 8.256933115823816, + "grad_norm": 0.3114635944366455, + "learning_rate": 4.48736963967353e-06, + "loss": 0.1138, + "num_input_tokens_seen": 109236016, + "step": 50615 + }, + { + "epoch": 8.257748776508972, + "grad_norm": 0.33411091566085815, + "learning_rate": 4.483301554600655e-06, + "loss": 0.2948, + "num_input_tokens_seen": 109247344, + "step": 50620 + }, + { + "epoch": 8.258564437194128, + "grad_norm": 0.5273005366325378, + "learning_rate": 4.479235132715462e-06, + "loss": 0.1784, + "num_input_tokens_seen": 109258864, + "step": 50625 + }, + { + "epoch": 8.259380097879282, + "grad_norm": 0.0711941123008728, + "learning_rate": 4.4751703743475895e-06, + "loss": 0.0651, + "num_input_tokens_seen": 109270192, + "step": 50630 + }, + { + "epoch": 8.260195758564437, + "grad_norm": 0.36883997917175293, + "learning_rate": 4.47110727982655e-06, + "loss": 0.0275, + "num_input_tokens_seen": 109280016, + "step": 50635 + }, + { + "epoch": 8.261011419249591, + "grad_norm": 0.12241154909133911, + "learning_rate": 4.467045849481716e-06, + "loss": 0.0681, + "num_input_tokens_seen": 109290256, + "step": 50640 + }, + { + "epoch": 8.261827079934747, + "grad_norm": 0.5538758039474487, + "learning_rate": 4.462986083642329e-06, + "loss": 0.098, + "num_input_tokens_seen": 109300208, + "step": 50645 + }, + { + "epoch": 8.262642740619903, + "grad_norm": 0.6576360464096069, + "learning_rate": 4.4589279826374955e-06, + "loss": 0.1669, + "num_input_tokens_seen": 109310960, + "step": 50650 + }, + { + "epoch": 8.263458401305057, + "grad_norm": 0.4032260477542877, + "learning_rate": 4.454871546796182e-06, + "loss": 0.1664, + "num_input_tokens_seen": 109321040, + "step": 50655 + }, + { + "epoch": 8.264274061990212, + "grad_norm": 0.1894511580467224, + "learning_rate": 4.4508167764472254e-06, + "loss": 0.2733, + "num_input_tokens_seen": 109331056, + "step": 50660 + }, + { + "epoch": 8.265089722675366, + "grad_norm": 0.11306533217430115, + "learning_rate": 4.446763671919321e-06, + "loss": 0.1523, + "num_input_tokens_seen": 109340816, + "step": 50665 + }, + { + "epoch": 8.265905383360522, + "grad_norm": 0.25433245301246643, + "learning_rate": 4.442712233541046e-06, + "loss": 0.0844, + "num_input_tokens_seen": 109351984, + "step": 50670 + }, + { + "epoch": 8.266721044045678, + "grad_norm": 2.049995183944702, + "learning_rate": 4.438662461640825e-06, + "loss": 0.1133, + "num_input_tokens_seen": 109361616, + "step": 50675 + }, + { + "epoch": 8.267536704730832, + "grad_norm": 1.5324056148529053, + "learning_rate": 4.4346143565469485e-06, + "loss": 0.0592, + "num_input_tokens_seen": 109372240, + "step": 50680 + }, + { + "epoch": 8.268352365415987, + "grad_norm": 0.5207135081291199, + "learning_rate": 4.430567918587583e-06, + "loss": 0.0474, + "num_input_tokens_seen": 109383120, + "step": 50685 + }, + { + "epoch": 8.269168026101141, + "grad_norm": 0.37831422686576843, + "learning_rate": 4.42652314809075e-06, + "loss": 0.0543, + "num_input_tokens_seen": 109391952, + "step": 50690 + }, + { + "epoch": 8.269983686786297, + "grad_norm": 0.5895746350288391, + "learning_rate": 4.4224800453843394e-06, + "loss": 0.0773, + "num_input_tokens_seen": 109402576, + "step": 50695 + }, + { + "epoch": 8.270799347471453, + "grad_norm": 1.5194065570831299, + "learning_rate": 4.418438610796105e-06, + "loss": 0.1933, + "num_input_tokens_seen": 109412976, + "step": 50700 + }, + { + "epoch": 8.271615008156607, + "grad_norm": 0.6544106602668762, + "learning_rate": 4.414398844653666e-06, + "loss": 0.1412, + "num_input_tokens_seen": 109425040, + "step": 50705 + }, + { + "epoch": 8.272430668841762, + "grad_norm": 0.30549857020378113, + "learning_rate": 4.410360747284508e-06, + "loss": 0.052, + "num_input_tokens_seen": 109436592, + "step": 50710 + }, + { + "epoch": 8.273246329526916, + "grad_norm": 1.6692824363708496, + "learning_rate": 4.406324319015978e-06, + "loss": 0.1447, + "num_input_tokens_seen": 109447184, + "step": 50715 + }, + { + "epoch": 8.274061990212072, + "grad_norm": 0.46565523743629456, + "learning_rate": 4.4022895601752905e-06, + "loss": 0.0596, + "num_input_tokens_seen": 109457616, + "step": 50720 + }, + { + "epoch": 8.274877650897226, + "grad_norm": 0.14168405532836914, + "learning_rate": 4.398256471089518e-06, + "loss": 0.0079, + "num_input_tokens_seen": 109466896, + "step": 50725 + }, + { + "epoch": 8.275693311582382, + "grad_norm": 0.6381645202636719, + "learning_rate": 4.394225052085613e-06, + "loss": 0.0534, + "num_input_tokens_seen": 109478800, + "step": 50730 + }, + { + "epoch": 8.276508972267537, + "grad_norm": 0.9756405353546143, + "learning_rate": 4.390195303490377e-06, + "loss": 0.0653, + "num_input_tokens_seen": 109488272, + "step": 50735 + }, + { + "epoch": 8.277324632952691, + "grad_norm": 0.7020530104637146, + "learning_rate": 4.3861672256304835e-06, + "loss": 0.0258, + "num_input_tokens_seen": 109499248, + "step": 50740 + }, + { + "epoch": 8.278140293637847, + "grad_norm": 0.607020378112793, + "learning_rate": 4.382140818832467e-06, + "loss": 0.262, + "num_input_tokens_seen": 109509584, + "step": 50745 + }, + { + "epoch": 8.278955954323001, + "grad_norm": 1.067522406578064, + "learning_rate": 4.378116083422732e-06, + "loss": 0.2027, + "num_input_tokens_seen": 109520656, + "step": 50750 + }, + { + "epoch": 8.279771615008157, + "grad_norm": 0.12666578590869904, + "learning_rate": 4.374093019727541e-06, + "loss": 0.0786, + "num_input_tokens_seen": 109531600, + "step": 50755 + }, + { + "epoch": 8.280587275693312, + "grad_norm": 0.8080277442932129, + "learning_rate": 4.370071628073025e-06, + "loss": 0.1852, + "num_input_tokens_seen": 109541936, + "step": 50760 + }, + { + "epoch": 8.281402936378466, + "grad_norm": 1.3502800464630127, + "learning_rate": 4.366051908785177e-06, + "loss": 0.0559, + "num_input_tokens_seen": 109552336, + "step": 50765 + }, + { + "epoch": 8.282218597063622, + "grad_norm": 0.5376309156417847, + "learning_rate": 4.3620338621898575e-06, + "loss": 0.1642, + "num_input_tokens_seen": 109563376, + "step": 50770 + }, + { + "epoch": 8.283034257748776, + "grad_norm": 1.373502492904663, + "learning_rate": 4.35801748861279e-06, + "loss": 0.1787, + "num_input_tokens_seen": 109575216, + "step": 50775 + }, + { + "epoch": 8.283849918433932, + "grad_norm": 0.7225462198257446, + "learning_rate": 4.354002788379558e-06, + "loss": 0.0931, + "num_input_tokens_seen": 109585904, + "step": 50780 + }, + { + "epoch": 8.284665579119087, + "grad_norm": 0.5748491287231445, + "learning_rate": 4.34998976181562e-06, + "loss": 0.1727, + "num_input_tokens_seen": 109597936, + "step": 50785 + }, + { + "epoch": 8.285481239804241, + "grad_norm": 0.09617829322814941, + "learning_rate": 4.345978409246287e-06, + "loss": 0.114, + "num_input_tokens_seen": 109609424, + "step": 50790 + }, + { + "epoch": 8.286296900489397, + "grad_norm": 0.08116726577281952, + "learning_rate": 4.341968730996743e-06, + "loss": 0.0546, + "num_input_tokens_seen": 109621488, + "step": 50795 + }, + { + "epoch": 8.28711256117455, + "grad_norm": 1.0851242542266846, + "learning_rate": 4.337960727392032e-06, + "loss": 0.1258, + "num_input_tokens_seen": 109631696, + "step": 50800 + }, + { + "epoch": 8.287928221859707, + "grad_norm": 1.429978609085083, + "learning_rate": 4.333954398757054e-06, + "loss": 0.2671, + "num_input_tokens_seen": 109642544, + "step": 50805 + }, + { + "epoch": 8.28874388254486, + "grad_norm": 0.7009502649307251, + "learning_rate": 4.329949745416598e-06, + "loss": 0.0608, + "num_input_tokens_seen": 109652752, + "step": 50810 + }, + { + "epoch": 8.289559543230016, + "grad_norm": 1.2198392152786255, + "learning_rate": 4.325946767695297e-06, + "loss": 0.1056, + "num_input_tokens_seen": 109664496, + "step": 50815 + }, + { + "epoch": 8.290375203915172, + "grad_norm": 0.3571048974990845, + "learning_rate": 4.321945465917646e-06, + "loss": 0.1402, + "num_input_tokens_seen": 109676752, + "step": 50820 + }, + { + "epoch": 8.291190864600326, + "grad_norm": 0.264790415763855, + "learning_rate": 4.317945840408019e-06, + "loss": 0.0594, + "num_input_tokens_seen": 109687792, + "step": 50825 + }, + { + "epoch": 8.292006525285482, + "grad_norm": 0.2130548357963562, + "learning_rate": 4.313947891490638e-06, + "loss": 0.0794, + "num_input_tokens_seen": 109698416, + "step": 50830 + }, + { + "epoch": 8.292822185970635, + "grad_norm": 0.054836053401231766, + "learning_rate": 4.309951619489597e-06, + "loss": 0.1792, + "num_input_tokens_seen": 109709488, + "step": 50835 + }, + { + "epoch": 8.293637846655791, + "grad_norm": 0.2931985557079315, + "learning_rate": 4.3059570247288624e-06, + "loss": 0.1515, + "num_input_tokens_seen": 109720912, + "step": 50840 + }, + { + "epoch": 8.294453507340947, + "grad_norm": 0.145307257771492, + "learning_rate": 4.301964107532255e-06, + "loss": 0.0518, + "num_input_tokens_seen": 109731600, + "step": 50845 + }, + { + "epoch": 8.2952691680261, + "grad_norm": 0.2746720612049103, + "learning_rate": 4.297972868223457e-06, + "loss": 0.2025, + "num_input_tokens_seen": 109742064, + "step": 50850 + }, + { + "epoch": 8.296084828711257, + "grad_norm": 0.3317073881626129, + "learning_rate": 4.293983307126018e-06, + "loss": 0.0664, + "num_input_tokens_seen": 109752848, + "step": 50855 + }, + { + "epoch": 8.29690048939641, + "grad_norm": 0.6859831809997559, + "learning_rate": 4.289995424563353e-06, + "loss": 0.0671, + "num_input_tokens_seen": 109763728, + "step": 50860 + }, + { + "epoch": 8.297716150081566, + "grad_norm": 1.5070444345474243, + "learning_rate": 4.286009220858742e-06, + "loss": 0.3923, + "num_input_tokens_seen": 109774864, + "step": 50865 + }, + { + "epoch": 8.298531810766722, + "grad_norm": 1.0536588430404663, + "learning_rate": 4.282024696335324e-06, + "loss": 0.128, + "num_input_tokens_seen": 109785296, + "step": 50870 + }, + { + "epoch": 8.299347471451876, + "grad_norm": 0.8607791662216187, + "learning_rate": 4.278041851316106e-06, + "loss": 0.2004, + "num_input_tokens_seen": 109795856, + "step": 50875 + }, + { + "epoch": 8.300163132137031, + "grad_norm": 1.2719907760620117, + "learning_rate": 4.274060686123959e-06, + "loss": 0.1638, + "num_input_tokens_seen": 109806480, + "step": 50880 + }, + { + "epoch": 8.300978792822185, + "grad_norm": 1.9027378559112549, + "learning_rate": 4.270081201081613e-06, + "loss": 0.0543, + "num_input_tokens_seen": 109817680, + "step": 50885 + }, + { + "epoch": 8.301794453507341, + "grad_norm": 0.11308127641677856, + "learning_rate": 4.2661033965116695e-06, + "loss": 0.0247, + "num_input_tokens_seen": 109829616, + "step": 50890 + }, + { + "epoch": 8.302610114192497, + "grad_norm": 0.0618082657456398, + "learning_rate": 4.2621272727365875e-06, + "loss": 0.0759, + "num_input_tokens_seen": 109839632, + "step": 50895 + }, + { + "epoch": 8.30342577487765, + "grad_norm": 0.9541622996330261, + "learning_rate": 4.2581528300786906e-06, + "loss": 0.071, + "num_input_tokens_seen": 109851344, + "step": 50900 + }, + { + "epoch": 8.304241435562806, + "grad_norm": 0.14104540646076202, + "learning_rate": 4.2541800688601696e-06, + "loss": 0.07, + "num_input_tokens_seen": 109861072, + "step": 50905 + }, + { + "epoch": 8.30505709624796, + "grad_norm": 0.37922343611717224, + "learning_rate": 4.250208989403073e-06, + "loss": 0.0934, + "num_input_tokens_seen": 109872144, + "step": 50910 + }, + { + "epoch": 8.305872756933116, + "grad_norm": 0.9319424033164978, + "learning_rate": 4.2462395920293215e-06, + "loss": 0.0994, + "num_input_tokens_seen": 109883152, + "step": 50915 + }, + { + "epoch": 8.30668841761827, + "grad_norm": 0.07266443967819214, + "learning_rate": 4.242271877060691e-06, + "loss": 0.0708, + "num_input_tokens_seen": 109894672, + "step": 50920 + }, + { + "epoch": 8.307504078303426, + "grad_norm": 1.2847687005996704, + "learning_rate": 4.238305844818827e-06, + "loss": 0.1418, + "num_input_tokens_seen": 109905456, + "step": 50925 + }, + { + "epoch": 8.308319738988581, + "grad_norm": 1.8007533550262451, + "learning_rate": 4.234341495625233e-06, + "loss": 0.2619, + "num_input_tokens_seen": 109915024, + "step": 50930 + }, + { + "epoch": 8.309135399673735, + "grad_norm": 0.4396660625934601, + "learning_rate": 4.230378829801282e-06, + "loss": 0.1631, + "num_input_tokens_seen": 109923728, + "step": 50935 + }, + { + "epoch": 8.309951060358891, + "grad_norm": 0.5640154480934143, + "learning_rate": 4.226417847668201e-06, + "loss": 0.135, + "num_input_tokens_seen": 109934128, + "step": 50940 + }, + { + "epoch": 8.310766721044045, + "grad_norm": 0.07717965543270111, + "learning_rate": 4.222458549547101e-06, + "loss": 0.0361, + "num_input_tokens_seen": 109945072, + "step": 50945 + }, + { + "epoch": 8.3115823817292, + "grad_norm": 1.3058527708053589, + "learning_rate": 4.218500935758935e-06, + "loss": 0.174, + "num_input_tokens_seen": 109955184, + "step": 50950 + }, + { + "epoch": 8.312398042414356, + "grad_norm": 1.1587975025177002, + "learning_rate": 4.214545006624526e-06, + "loss": 0.1716, + "num_input_tokens_seen": 109965232, + "step": 50955 + }, + { + "epoch": 8.31321370309951, + "grad_norm": 0.15461541712284088, + "learning_rate": 4.210590762464564e-06, + "loss": 0.0317, + "num_input_tokens_seen": 109976944, + "step": 50960 + }, + { + "epoch": 8.314029363784666, + "grad_norm": 0.03763512894511223, + "learning_rate": 4.206638203599597e-06, + "loss": 0.0835, + "num_input_tokens_seen": 109987504, + "step": 50965 + }, + { + "epoch": 8.31484502446982, + "grad_norm": 0.13119588792324066, + "learning_rate": 4.202687330350044e-06, + "loss": 0.0919, + "num_input_tokens_seen": 109998160, + "step": 50970 + }, + { + "epoch": 8.315660685154976, + "grad_norm": 0.25142326951026917, + "learning_rate": 4.1987381430361735e-06, + "loss": 0.1134, + "num_input_tokens_seen": 110008464, + "step": 50975 + }, + { + "epoch": 8.31647634584013, + "grad_norm": 1.9795559644699097, + "learning_rate": 4.194790641978141e-06, + "loss": 0.2894, + "num_input_tokens_seen": 110020080, + "step": 50980 + }, + { + "epoch": 8.317292006525285, + "grad_norm": 0.1397644281387329, + "learning_rate": 4.1908448274959436e-06, + "loss": 0.0595, + "num_input_tokens_seen": 110029104, + "step": 50985 + }, + { + "epoch": 8.318107667210441, + "grad_norm": 0.567512035369873, + "learning_rate": 4.186900699909446e-06, + "loss": 0.0619, + "num_input_tokens_seen": 110040528, + "step": 50990 + }, + { + "epoch": 8.318923327895595, + "grad_norm": 0.10818272083997726, + "learning_rate": 4.182958259538386e-06, + "loss": 0.1141, + "num_input_tokens_seen": 110049744, + "step": 50995 + }, + { + "epoch": 8.31973898858075, + "grad_norm": 0.41266998648643494, + "learning_rate": 4.179017506702351e-06, + "loss": 0.0593, + "num_input_tokens_seen": 110061680, + "step": 51000 + }, + { + "epoch": 8.320554649265905, + "grad_norm": 0.12437081336975098, + "learning_rate": 4.1750784417208065e-06, + "loss": 0.0909, + "num_input_tokens_seen": 110072880, + "step": 51005 + }, + { + "epoch": 8.32137030995106, + "grad_norm": 0.146027609705925, + "learning_rate": 4.171141064913061e-06, + "loss": 0.1933, + "num_input_tokens_seen": 110084464, + "step": 51010 + }, + { + "epoch": 8.322185970636216, + "grad_norm": 0.19198612868785858, + "learning_rate": 4.16720537659831e-06, + "loss": 0.133, + "num_input_tokens_seen": 110095536, + "step": 51015 + }, + { + "epoch": 8.32300163132137, + "grad_norm": 1.1096620559692383, + "learning_rate": 4.1632713770955956e-06, + "loss": 0.1708, + "num_input_tokens_seen": 110104496, + "step": 51020 + }, + { + "epoch": 8.323817292006526, + "grad_norm": 0.04443960264325142, + "learning_rate": 4.159339066723827e-06, + "loss": 0.0841, + "num_input_tokens_seen": 110115888, + "step": 51025 + }, + { + "epoch": 8.32463295269168, + "grad_norm": 0.07259441167116165, + "learning_rate": 4.155408445801779e-06, + "loss": 0.1159, + "num_input_tokens_seen": 110125104, + "step": 51030 + }, + { + "epoch": 8.325448613376835, + "grad_norm": 0.12388480454683304, + "learning_rate": 4.151479514648085e-06, + "loss": 0.0553, + "num_input_tokens_seen": 110135728, + "step": 51035 + }, + { + "epoch": 8.326264274061991, + "grad_norm": 0.5868048071861267, + "learning_rate": 4.147552273581248e-06, + "loss": 0.138, + "num_input_tokens_seen": 110146832, + "step": 51040 + }, + { + "epoch": 8.327079934747145, + "grad_norm": 0.6381786465644836, + "learning_rate": 4.143626722919619e-06, + "loss": 0.1741, + "num_input_tokens_seen": 110157520, + "step": 51045 + }, + { + "epoch": 8.3278955954323, + "grad_norm": 0.5960982441902161, + "learning_rate": 4.139702862981443e-06, + "loss": 0.1072, + "num_input_tokens_seen": 110168688, + "step": 51050 + }, + { + "epoch": 8.328711256117455, + "grad_norm": 0.5321128964424133, + "learning_rate": 4.135780694084793e-06, + "loss": 0.0518, + "num_input_tokens_seen": 110179056, + "step": 51055 + }, + { + "epoch": 8.32952691680261, + "grad_norm": 0.5543876886367798, + "learning_rate": 4.131860216547623e-06, + "loss": 0.1494, + "num_input_tokens_seen": 110190160, + "step": 51060 + }, + { + "epoch": 8.330342577487766, + "grad_norm": 0.03792250156402588, + "learning_rate": 4.127941430687751e-06, + "loss": 0.046, + "num_input_tokens_seen": 110201424, + "step": 51065 + }, + { + "epoch": 8.33115823817292, + "grad_norm": 0.15218901634216309, + "learning_rate": 4.1240243368228485e-06, + "loss": 0.1136, + "num_input_tokens_seen": 110212304, + "step": 51070 + }, + { + "epoch": 8.331973898858076, + "grad_norm": 0.7378392815589905, + "learning_rate": 4.120108935270459e-06, + "loss": 0.0747, + "num_input_tokens_seen": 110223216, + "step": 51075 + }, + { + "epoch": 8.33278955954323, + "grad_norm": 0.39099305868148804, + "learning_rate": 4.11619522634798e-06, + "loss": 0.3097, + "num_input_tokens_seen": 110234160, + "step": 51080 + }, + { + "epoch": 8.333605220228385, + "grad_norm": 1.6532909870147705, + "learning_rate": 4.11228321037268e-06, + "loss": 0.1361, + "num_input_tokens_seen": 110245264, + "step": 51085 + }, + { + "epoch": 8.33442088091354, + "grad_norm": 0.4761243164539337, + "learning_rate": 4.108372887661688e-06, + "loss": 0.0321, + "num_input_tokens_seen": 110256560, + "step": 51090 + }, + { + "epoch": 8.335236541598695, + "grad_norm": 0.7459836006164551, + "learning_rate": 4.10446425853199e-06, + "loss": 0.2488, + "num_input_tokens_seen": 110267920, + "step": 51095 + }, + { + "epoch": 8.33605220228385, + "grad_norm": 0.33826178312301636, + "learning_rate": 4.100557323300444e-06, + "loss": 0.3255, + "num_input_tokens_seen": 110278032, + "step": 51100 + }, + { + "epoch": 8.336867862969005, + "grad_norm": 0.4684329628944397, + "learning_rate": 4.096652082283764e-06, + "loss": 0.0437, + "num_input_tokens_seen": 110289232, + "step": 51105 + }, + { + "epoch": 8.33768352365416, + "grad_norm": 0.6991896629333496, + "learning_rate": 4.092748535798527e-06, + "loss": 0.1253, + "num_input_tokens_seen": 110299664, + "step": 51110 + }, + { + "epoch": 8.338499184339314, + "grad_norm": 0.18337330222129822, + "learning_rate": 4.088846684161177e-06, + "loss": 0.1119, + "num_input_tokens_seen": 110310480, + "step": 51115 + }, + { + "epoch": 8.33931484502447, + "grad_norm": 0.2147013247013092, + "learning_rate": 4.0849465276880105e-06, + "loss": 0.0265, + "num_input_tokens_seen": 110322640, + "step": 51120 + }, + { + "epoch": 8.340130505709626, + "grad_norm": 0.3361497223377228, + "learning_rate": 4.081048066695209e-06, + "loss": 0.0341, + "num_input_tokens_seen": 110334992, + "step": 51125 + }, + { + "epoch": 8.34094616639478, + "grad_norm": 0.3599073588848114, + "learning_rate": 4.077151301498791e-06, + "loss": 0.052, + "num_input_tokens_seen": 110345616, + "step": 51130 + }, + { + "epoch": 8.341761827079935, + "grad_norm": 0.55901700258255, + "learning_rate": 4.073256232414649e-06, + "loss": 0.1226, + "num_input_tokens_seen": 110356752, + "step": 51135 + }, + { + "epoch": 8.34257748776509, + "grad_norm": 0.2702045440673828, + "learning_rate": 4.069362859758541e-06, + "loss": 0.1065, + "num_input_tokens_seen": 110367568, + "step": 51140 + }, + { + "epoch": 8.343393148450245, + "grad_norm": 0.46054232120513916, + "learning_rate": 4.065471183846079e-06, + "loss": 0.1814, + "num_input_tokens_seen": 110378576, + "step": 51145 + }, + { + "epoch": 8.3442088091354, + "grad_norm": 0.41364818811416626, + "learning_rate": 4.061581204992742e-06, + "loss": 0.0559, + "num_input_tokens_seen": 110389072, + "step": 51150 + }, + { + "epoch": 8.345024469820554, + "grad_norm": 0.5456515550613403, + "learning_rate": 4.057692923513867e-06, + "loss": 0.2473, + "num_input_tokens_seen": 110400464, + "step": 51155 + }, + { + "epoch": 8.34584013050571, + "grad_norm": 0.25863832235336304, + "learning_rate": 4.0538063397246725e-06, + "loss": 0.1568, + "num_input_tokens_seen": 110411728, + "step": 51160 + }, + { + "epoch": 8.346655791190864, + "grad_norm": 0.06769958883523941, + "learning_rate": 4.049921453940214e-06, + "loss": 0.0357, + "num_input_tokens_seen": 110422480, + "step": 51165 + }, + { + "epoch": 8.34747145187602, + "grad_norm": 0.22704941034317017, + "learning_rate": 4.046038266475421e-06, + "loss": 0.0561, + "num_input_tokens_seen": 110433680, + "step": 51170 + }, + { + "epoch": 8.348287112561174, + "grad_norm": 0.5290148854255676, + "learning_rate": 4.0421567776450895e-06, + "loss": 0.1406, + "num_input_tokens_seen": 110442512, + "step": 51175 + }, + { + "epoch": 8.34910277324633, + "grad_norm": 0.6466386914253235, + "learning_rate": 4.038276987763864e-06, + "loss": 0.0587, + "num_input_tokens_seen": 110454032, + "step": 51180 + }, + { + "epoch": 8.349918433931485, + "grad_norm": 2.5694785118103027, + "learning_rate": 4.034398897146269e-06, + "loss": 0.1212, + "num_input_tokens_seen": 110464432, + "step": 51185 + }, + { + "epoch": 8.350734094616639, + "grad_norm": 0.7181152701377869, + "learning_rate": 4.0305225061066735e-06, + "loss": 0.1855, + "num_input_tokens_seen": 110476176, + "step": 51190 + }, + { + "epoch": 8.351549755301795, + "grad_norm": 0.13017800450325012, + "learning_rate": 4.026647814959325e-06, + "loss": 0.1924, + "num_input_tokens_seen": 110486800, + "step": 51195 + }, + { + "epoch": 8.352365415986949, + "grad_norm": 0.16304269433021545, + "learning_rate": 4.022774824018321e-06, + "loss": 0.1538, + "num_input_tokens_seen": 110498448, + "step": 51200 + }, + { + "epoch": 8.353181076672104, + "grad_norm": 0.05061040073633194, + "learning_rate": 4.018903533597629e-06, + "loss": 0.0487, + "num_input_tokens_seen": 110510064, + "step": 51205 + }, + { + "epoch": 8.35399673735726, + "grad_norm": 2.217787504196167, + "learning_rate": 4.015033944011071e-06, + "loss": 0.0942, + "num_input_tokens_seen": 110522352, + "step": 51210 + }, + { + "epoch": 8.354812398042414, + "grad_norm": 1.2849758863449097, + "learning_rate": 4.011166055572338e-06, + "loss": 0.1193, + "num_input_tokens_seen": 110534640, + "step": 51215 + }, + { + "epoch": 8.35562805872757, + "grad_norm": 0.2467225193977356, + "learning_rate": 4.007299868594983e-06, + "loss": 0.0964, + "num_input_tokens_seen": 110546608, + "step": 51220 + }, + { + "epoch": 8.356443719412724, + "grad_norm": 0.4754684567451477, + "learning_rate": 4.003435383392415e-06, + "loss": 0.0744, + "num_input_tokens_seen": 110556656, + "step": 51225 + }, + { + "epoch": 8.35725938009788, + "grad_norm": 0.47548380494117737, + "learning_rate": 3.999572600277912e-06, + "loss": 0.1316, + "num_input_tokens_seen": 110567184, + "step": 51230 + }, + { + "epoch": 8.358075040783035, + "grad_norm": 0.6301913857460022, + "learning_rate": 3.995711519564607e-06, + "loss": 0.0361, + "num_input_tokens_seen": 110576848, + "step": 51235 + }, + { + "epoch": 8.358890701468189, + "grad_norm": 0.38923853635787964, + "learning_rate": 3.991852141565503e-06, + "loss": 0.1431, + "num_input_tokens_seen": 110587920, + "step": 51240 + }, + { + "epoch": 8.359706362153345, + "grad_norm": 0.172250896692276, + "learning_rate": 3.987994466593456e-06, + "loss": 0.1329, + "num_input_tokens_seen": 110599184, + "step": 51245 + }, + { + "epoch": 8.360522022838499, + "grad_norm": 0.6014869213104248, + "learning_rate": 3.9841384949611924e-06, + "loss": 0.1013, + "num_input_tokens_seen": 110610768, + "step": 51250 + }, + { + "epoch": 8.361337683523654, + "grad_norm": 0.9361535906791687, + "learning_rate": 3.980284226981299e-06, + "loss": 0.2581, + "num_input_tokens_seen": 110621136, + "step": 51255 + }, + { + "epoch": 8.362153344208808, + "grad_norm": 0.06955485790967941, + "learning_rate": 3.976431662966209e-06, + "loss": 0.1015, + "num_input_tokens_seen": 110631088, + "step": 51260 + }, + { + "epoch": 8.362969004893964, + "grad_norm": 0.7861031889915466, + "learning_rate": 3.972580803228249e-06, + "loss": 0.0988, + "num_input_tokens_seen": 110643184, + "step": 51265 + }, + { + "epoch": 8.36378466557912, + "grad_norm": 2.0664916038513184, + "learning_rate": 3.96873164807958e-06, + "loss": 0.1443, + "num_input_tokens_seen": 110653712, + "step": 51270 + }, + { + "epoch": 8.364600326264274, + "grad_norm": 0.2254316359758377, + "learning_rate": 3.964884197832236e-06, + "loss": 0.173, + "num_input_tokens_seen": 110663600, + "step": 51275 + }, + { + "epoch": 8.36541598694943, + "grad_norm": 1.6529754400253296, + "learning_rate": 3.96103845279811e-06, + "loss": 0.1251, + "num_input_tokens_seen": 110674192, + "step": 51280 + }, + { + "epoch": 8.366231647634583, + "grad_norm": 0.3009243309497833, + "learning_rate": 3.957194413288956e-06, + "loss": 0.139, + "num_input_tokens_seen": 110684880, + "step": 51285 + }, + { + "epoch": 8.367047308319739, + "grad_norm": 0.19453752040863037, + "learning_rate": 3.953352079616387e-06, + "loss": 0.0355, + "num_input_tokens_seen": 110695120, + "step": 51290 + }, + { + "epoch": 8.367862969004895, + "grad_norm": 0.23968909680843353, + "learning_rate": 3.949511452091898e-06, + "loss": 0.0494, + "num_input_tokens_seen": 110704912, + "step": 51295 + }, + { + "epoch": 8.368678629690049, + "grad_norm": 0.7441388368606567, + "learning_rate": 3.945672531026817e-06, + "loss": 0.0641, + "num_input_tokens_seen": 110715952, + "step": 51300 + }, + { + "epoch": 8.369494290375204, + "grad_norm": 0.3061037063598633, + "learning_rate": 3.941835316732348e-06, + "loss": 0.1341, + "num_input_tokens_seen": 110727312, + "step": 51305 + }, + { + "epoch": 8.370309951060358, + "grad_norm": 1.4315301179885864, + "learning_rate": 3.9379998095195606e-06, + "loss": 0.0888, + "num_input_tokens_seen": 110738448, + "step": 51310 + }, + { + "epoch": 8.371125611745514, + "grad_norm": 1.3477935791015625, + "learning_rate": 3.9341660096993725e-06, + "loss": 0.0953, + "num_input_tokens_seen": 110750960, + "step": 51315 + }, + { + "epoch": 8.37194127243067, + "grad_norm": 0.11731075495481491, + "learning_rate": 3.9303339175825736e-06, + "loss": 0.0668, + "num_input_tokens_seen": 110761584, + "step": 51320 + }, + { + "epoch": 8.372756933115824, + "grad_norm": 0.20853191614151, + "learning_rate": 3.926503533479817e-06, + "loss": 0.02, + "num_input_tokens_seen": 110771792, + "step": 51325 + }, + { + "epoch": 8.37357259380098, + "grad_norm": 1.1274714469909668, + "learning_rate": 3.922674857701608e-06, + "loss": 0.1038, + "num_input_tokens_seen": 110783472, + "step": 51330 + }, + { + "epoch": 8.374388254486133, + "grad_norm": 0.3597312569618225, + "learning_rate": 3.918847890558322e-06, + "loss": 0.1286, + "num_input_tokens_seen": 110793584, + "step": 51335 + }, + { + "epoch": 8.375203915171289, + "grad_norm": 1.2398676872253418, + "learning_rate": 3.915022632360188e-06, + "loss": 0.12, + "num_input_tokens_seen": 110803440, + "step": 51340 + }, + { + "epoch": 8.376019575856443, + "grad_norm": 0.3926544785499573, + "learning_rate": 3.911199083417305e-06, + "loss": 0.0501, + "num_input_tokens_seen": 110813808, + "step": 51345 + }, + { + "epoch": 8.376835236541599, + "grad_norm": 1.9806065559387207, + "learning_rate": 3.9073772440396285e-06, + "loss": 0.2071, + "num_input_tokens_seen": 110824016, + "step": 51350 + }, + { + "epoch": 8.377650897226754, + "grad_norm": 0.4991952180862427, + "learning_rate": 3.903557114536973e-06, + "loss": 0.149, + "num_input_tokens_seen": 110834672, + "step": 51355 + }, + { + "epoch": 8.378466557911908, + "grad_norm": 0.14613644778728485, + "learning_rate": 3.899738695219024e-06, + "loss": 0.0213, + "num_input_tokens_seen": 110845456, + "step": 51360 + }, + { + "epoch": 8.379282218597064, + "grad_norm": 0.25043535232543945, + "learning_rate": 3.89592198639531e-06, + "loss": 0.1852, + "num_input_tokens_seen": 110856080, + "step": 51365 + }, + { + "epoch": 8.380097879282218, + "grad_norm": 0.42838433384895325, + "learning_rate": 3.8921069883752465e-06, + "loss": 0.154, + "num_input_tokens_seen": 110867056, + "step": 51370 + }, + { + "epoch": 8.380913539967374, + "grad_norm": 0.9102512001991272, + "learning_rate": 3.88829370146809e-06, + "loss": 0.0783, + "num_input_tokens_seen": 110878352, + "step": 51375 + }, + { + "epoch": 8.38172920065253, + "grad_norm": 0.8571275472640991, + "learning_rate": 3.884482125982969e-06, + "loss": 0.0502, + "num_input_tokens_seen": 110889584, + "step": 51380 + }, + { + "epoch": 8.382544861337683, + "grad_norm": 1.8241491317749023, + "learning_rate": 3.880672262228863e-06, + "loss": 0.1134, + "num_input_tokens_seen": 110900464, + "step": 51385 + }, + { + "epoch": 8.383360522022839, + "grad_norm": 0.40542280673980713, + "learning_rate": 3.876864110514622e-06, + "loss": 0.0837, + "num_input_tokens_seen": 110911568, + "step": 51390 + }, + { + "epoch": 8.384176182707993, + "grad_norm": 0.8562924861907959, + "learning_rate": 3.8730576711489555e-06, + "loss": 0.1602, + "num_input_tokens_seen": 110922640, + "step": 51395 + }, + { + "epoch": 8.384991843393149, + "grad_norm": 0.2351100891828537, + "learning_rate": 3.86925294444043e-06, + "loss": 0.125, + "num_input_tokens_seen": 110933872, + "step": 51400 + }, + { + "epoch": 8.385807504078304, + "grad_norm": 0.046325813978910446, + "learning_rate": 3.8654499306974765e-06, + "loss": 0.09, + "num_input_tokens_seen": 110945648, + "step": 51405 + }, + { + "epoch": 8.386623164763458, + "grad_norm": 0.06804926693439484, + "learning_rate": 3.86164863022839e-06, + "loss": 0.0253, + "num_input_tokens_seen": 110956528, + "step": 51410 + }, + { + "epoch": 8.387438825448614, + "grad_norm": 1.2156366109848022, + "learning_rate": 3.857849043341316e-06, + "loss": 0.0927, + "num_input_tokens_seen": 110966224, + "step": 51415 + }, + { + "epoch": 8.388254486133768, + "grad_norm": 0.18594229221343994, + "learning_rate": 3.854051170344278e-06, + "loss": 0.0794, + "num_input_tokens_seen": 110976912, + "step": 51420 + }, + { + "epoch": 8.389070146818923, + "grad_norm": 0.22757510840892792, + "learning_rate": 3.8502550115451425e-06, + "loss": 0.0673, + "num_input_tokens_seen": 110988080, + "step": 51425 + }, + { + "epoch": 8.38988580750408, + "grad_norm": 0.15273739397525787, + "learning_rate": 3.846460567251648e-06, + "loss": 0.0662, + "num_input_tokens_seen": 110999472, + "step": 51430 + }, + { + "epoch": 8.390701468189233, + "grad_norm": 0.3892327845096588, + "learning_rate": 3.8426678377713884e-06, + "loss": 0.072, + "num_input_tokens_seen": 111011184, + "step": 51435 + }, + { + "epoch": 8.391517128874389, + "grad_norm": 0.17407241463661194, + "learning_rate": 3.8388768234118275e-06, + "loss": 0.0294, + "num_input_tokens_seen": 111022480, + "step": 51440 + }, + { + "epoch": 8.392332789559543, + "grad_norm": 0.8765694499015808, + "learning_rate": 3.8350875244802855e-06, + "loss": 0.2139, + "num_input_tokens_seen": 111032912, + "step": 51445 + }, + { + "epoch": 8.393148450244698, + "grad_norm": 1.0560474395751953, + "learning_rate": 3.831299941283936e-06, + "loss": 0.1163, + "num_input_tokens_seen": 111044272, + "step": 51450 + }, + { + "epoch": 8.393964110929852, + "grad_norm": 0.8362095355987549, + "learning_rate": 3.827514074129823e-06, + "loss": 0.1009, + "num_input_tokens_seen": 111054704, + "step": 51455 + }, + { + "epoch": 8.394779771615008, + "grad_norm": 0.25575700402259827, + "learning_rate": 3.823729923324848e-06, + "loss": 0.0294, + "num_input_tokens_seen": 111065040, + "step": 51460 + }, + { + "epoch": 8.395595432300164, + "grad_norm": 0.21590080857276917, + "learning_rate": 3.819947489175771e-06, + "loss": 0.0417, + "num_input_tokens_seen": 111077200, + "step": 51465 + }, + { + "epoch": 8.396411092985318, + "grad_norm": 0.5741310119628906, + "learning_rate": 3.816166771989218e-06, + "loss": 0.1045, + "num_input_tokens_seen": 111088016, + "step": 51470 + }, + { + "epoch": 8.397226753670473, + "grad_norm": 0.6412959098815918, + "learning_rate": 3.812387772071668e-06, + "loss": 0.0754, + "num_input_tokens_seen": 111098320, + "step": 51475 + }, + { + "epoch": 8.398042414355627, + "grad_norm": 0.04435781016945839, + "learning_rate": 3.808610489729472e-06, + "loss": 0.0324, + "num_input_tokens_seen": 111108528, + "step": 51480 + }, + { + "epoch": 8.398858075040783, + "grad_norm": 1.6924227476119995, + "learning_rate": 3.804834925268838e-06, + "loss": 0.1967, + "num_input_tokens_seen": 111120080, + "step": 51485 + }, + { + "epoch": 8.399673735725939, + "grad_norm": 0.3541743755340576, + "learning_rate": 3.801061078995827e-06, + "loss": 0.0706, + "num_input_tokens_seen": 111130512, + "step": 51490 + }, + { + "epoch": 8.400489396411093, + "grad_norm": 0.2162810117006302, + "learning_rate": 3.7972889512163656e-06, + "loss": 0.1946, + "num_input_tokens_seen": 111141520, + "step": 51495 + }, + { + "epoch": 8.401305057096248, + "grad_norm": 0.3769185543060303, + "learning_rate": 3.7935185422362433e-06, + "loss": 0.0794, + "num_input_tokens_seen": 111152112, + "step": 51500 + }, + { + "epoch": 8.402120717781402, + "grad_norm": 1.4744395017623901, + "learning_rate": 3.7897498523611104e-06, + "loss": 0.156, + "num_input_tokens_seen": 111162896, + "step": 51505 + }, + { + "epoch": 8.402936378466558, + "grad_norm": 0.427181214094162, + "learning_rate": 3.7859828818964716e-06, + "loss": 0.2047, + "num_input_tokens_seen": 111173616, + "step": 51510 + }, + { + "epoch": 8.403752039151712, + "grad_norm": 0.19257605075836182, + "learning_rate": 3.7822176311477027e-06, + "loss": 0.1297, + "num_input_tokens_seen": 111183856, + "step": 51515 + }, + { + "epoch": 8.404567699836868, + "grad_norm": 0.2504178285598755, + "learning_rate": 3.7784541004200287e-06, + "loss": 0.1491, + "num_input_tokens_seen": 111193936, + "step": 51520 + }, + { + "epoch": 8.405383360522023, + "grad_norm": 0.1003989428281784, + "learning_rate": 3.774692290018542e-06, + "loss": 0.1741, + "num_input_tokens_seen": 111204912, + "step": 51525 + }, + { + "epoch": 8.406199021207177, + "grad_norm": 0.4986816346645355, + "learning_rate": 3.770932200248195e-06, + "loss": 0.1041, + "num_input_tokens_seen": 111215408, + "step": 51530 + }, + { + "epoch": 8.407014681892333, + "grad_norm": 0.09005533158779144, + "learning_rate": 3.7671738314137978e-06, + "loss": 0.103, + "num_input_tokens_seen": 111227056, + "step": 51535 + }, + { + "epoch": 8.407830342577487, + "grad_norm": 0.6738113164901733, + "learning_rate": 3.7634171838200253e-06, + "loss": 0.2651, + "num_input_tokens_seen": 111238512, + "step": 51540 + }, + { + "epoch": 8.408646003262643, + "grad_norm": 0.19893407821655273, + "learning_rate": 3.75966225777141e-06, + "loss": 0.0498, + "num_input_tokens_seen": 111249392, + "step": 51545 + }, + { + "epoch": 8.409461663947798, + "grad_norm": 1.4417383670806885, + "learning_rate": 3.7559090535723427e-06, + "loss": 0.1678, + "num_input_tokens_seen": 111259600, + "step": 51550 + }, + { + "epoch": 8.410277324632952, + "grad_norm": 0.11753057688474655, + "learning_rate": 3.7521575715270817e-06, + "loss": 0.065, + "num_input_tokens_seen": 111270032, + "step": 51555 + }, + { + "epoch": 8.411092985318108, + "grad_norm": 0.08553394675254822, + "learning_rate": 3.748407811939736e-06, + "loss": 0.1491, + "num_input_tokens_seen": 111279504, + "step": 51560 + }, + { + "epoch": 8.411908646003262, + "grad_norm": 1.0488470792770386, + "learning_rate": 3.7446597751142844e-06, + "loss": 0.1523, + "num_input_tokens_seen": 111291504, + "step": 51565 + }, + { + "epoch": 8.412724306688418, + "grad_norm": 0.14673052728176117, + "learning_rate": 3.7409134613545587e-06, + "loss": 0.1648, + "num_input_tokens_seen": 111303152, + "step": 51570 + }, + { + "epoch": 8.413539967373573, + "grad_norm": 1.1273155212402344, + "learning_rate": 3.7371688709642555e-06, + "loss": 0.0687, + "num_input_tokens_seen": 111313232, + "step": 51575 + }, + { + "epoch": 8.414355628058727, + "grad_norm": 0.039151325821876526, + "learning_rate": 3.7334260042469232e-06, + "loss": 0.0566, + "num_input_tokens_seen": 111324240, + "step": 51580 + }, + { + "epoch": 8.415171288743883, + "grad_norm": 0.1502545177936554, + "learning_rate": 3.7296848615059913e-06, + "loss": 0.1296, + "num_input_tokens_seen": 111335056, + "step": 51585 + }, + { + "epoch": 8.415986949429037, + "grad_norm": 0.3975012004375458, + "learning_rate": 3.725945443044729e-06, + "loss": 0.0689, + "num_input_tokens_seen": 111345200, + "step": 51590 + }, + { + "epoch": 8.416802610114193, + "grad_norm": 0.3702761232852936, + "learning_rate": 3.722207749166273e-06, + "loss": 0.1209, + "num_input_tokens_seen": 111354640, + "step": 51595 + }, + { + "epoch": 8.417618270799348, + "grad_norm": 0.15947097539901733, + "learning_rate": 3.7184717801736186e-06, + "loss": 0.0322, + "num_input_tokens_seen": 111365008, + "step": 51600 + }, + { + "epoch": 8.418433931484502, + "grad_norm": 0.22708693146705627, + "learning_rate": 3.7147375363696168e-06, + "loss": 0.2158, + "num_input_tokens_seen": 111376080, + "step": 51605 + }, + { + "epoch": 8.419249592169658, + "grad_norm": 0.7418307662010193, + "learning_rate": 3.7110050180569985e-06, + "loss": 0.1594, + "num_input_tokens_seen": 111387376, + "step": 51610 + }, + { + "epoch": 8.420065252854812, + "grad_norm": 0.3293086886405945, + "learning_rate": 3.707274225538332e-06, + "loss": 0.0583, + "num_input_tokens_seen": 111398224, + "step": 51615 + }, + { + "epoch": 8.420880913539968, + "grad_norm": 1.123587965965271, + "learning_rate": 3.7035451591160535e-06, + "loss": 0.2058, + "num_input_tokens_seen": 111408624, + "step": 51620 + }, + { + "epoch": 8.421696574225122, + "grad_norm": 0.31434401869773865, + "learning_rate": 3.699817819092463e-06, + "loss": 0.0305, + "num_input_tokens_seen": 111419664, + "step": 51625 + }, + { + "epoch": 8.422512234910277, + "grad_norm": 1.4455478191375732, + "learning_rate": 3.6960922057697163e-06, + "loss": 0.1123, + "num_input_tokens_seen": 111430480, + "step": 51630 + }, + { + "epoch": 8.423327895595433, + "grad_norm": 0.03202994167804718, + "learning_rate": 3.6923683194498295e-06, + "loss": 0.1864, + "num_input_tokens_seen": 111439984, + "step": 51635 + }, + { + "epoch": 8.424143556280587, + "grad_norm": 0.6427009105682373, + "learning_rate": 3.6886461604346807e-06, + "loss": 0.0475, + "num_input_tokens_seen": 111451472, + "step": 51640 + }, + { + "epoch": 8.424959216965743, + "grad_norm": 0.7184913754463196, + "learning_rate": 3.6849257290260066e-06, + "loss": 0.0868, + "num_input_tokens_seen": 111462544, + "step": 51645 + }, + { + "epoch": 8.425774877650896, + "grad_norm": 1.9913402795791626, + "learning_rate": 3.6812070255254043e-06, + "loss": 0.0493, + "num_input_tokens_seen": 111474800, + "step": 51650 + }, + { + "epoch": 8.426590538336052, + "grad_norm": 0.39741426706314087, + "learning_rate": 3.677490050234331e-06, + "loss": 0.0331, + "num_input_tokens_seen": 111485776, + "step": 51655 + }, + { + "epoch": 8.427406199021208, + "grad_norm": 0.05035189166665077, + "learning_rate": 3.6737748034541054e-06, + "loss": 0.2341, + "num_input_tokens_seen": 111497616, + "step": 51660 + }, + { + "epoch": 8.428221859706362, + "grad_norm": 1.3243638277053833, + "learning_rate": 3.670061285485901e-06, + "loss": 0.2278, + "num_input_tokens_seen": 111508848, + "step": 51665 + }, + { + "epoch": 8.429037520391518, + "grad_norm": 0.1388929933309555, + "learning_rate": 3.6663494966307553e-06, + "loss": 0.2104, + "num_input_tokens_seen": 111520816, + "step": 51670 + }, + { + "epoch": 8.429853181076671, + "grad_norm": 1.8328639268875122, + "learning_rate": 3.662639437189566e-06, + "loss": 0.1703, + "num_input_tokens_seen": 111531984, + "step": 51675 + }, + { + "epoch": 8.430668841761827, + "grad_norm": 1.8287619352340698, + "learning_rate": 3.65893110746309e-06, + "loss": 0.149, + "num_input_tokens_seen": 111542800, + "step": 51680 + }, + { + "epoch": 8.431484502446983, + "grad_norm": 0.13221842050552368, + "learning_rate": 3.655224507751934e-06, + "loss": 0.1691, + "num_input_tokens_seen": 111553040, + "step": 51685 + }, + { + "epoch": 8.432300163132137, + "grad_norm": 0.20390474796295166, + "learning_rate": 3.6515196383565873e-06, + "loss": 0.0245, + "num_input_tokens_seen": 111563120, + "step": 51690 + }, + { + "epoch": 8.433115823817293, + "grad_norm": 0.09404496848583221, + "learning_rate": 3.6478164995773807e-06, + "loss": 0.0327, + "num_input_tokens_seen": 111573776, + "step": 51695 + }, + { + "epoch": 8.433931484502446, + "grad_norm": 0.305917888879776, + "learning_rate": 3.644115091714509e-06, + "loss": 0.191, + "num_input_tokens_seen": 111584080, + "step": 51700 + }, + { + "epoch": 8.434747145187602, + "grad_norm": 1.4630982875823975, + "learning_rate": 3.640415415068027e-06, + "loss": 0.1598, + "num_input_tokens_seen": 111594480, + "step": 51705 + }, + { + "epoch": 8.435562805872756, + "grad_norm": 1.6020405292510986, + "learning_rate": 3.6367174699378476e-06, + "loss": 0.1668, + "num_input_tokens_seen": 111604656, + "step": 51710 + }, + { + "epoch": 8.436378466557912, + "grad_norm": 0.08026937395334244, + "learning_rate": 3.6330212566237477e-06, + "loss": 0.0734, + "num_input_tokens_seen": 111616304, + "step": 51715 + }, + { + "epoch": 8.437194127243067, + "grad_norm": 0.5464500784873962, + "learning_rate": 3.6293267754253566e-06, + "loss": 0.1213, + "num_input_tokens_seen": 111627920, + "step": 51720 + }, + { + "epoch": 8.438009787928221, + "grad_norm": 0.3233949840068817, + "learning_rate": 3.6256340266421747e-06, + "loss": 0.0769, + "num_input_tokens_seen": 111638864, + "step": 51725 + }, + { + "epoch": 8.438825448613377, + "grad_norm": 0.5720484852790833, + "learning_rate": 3.6219430105735476e-06, + "loss": 0.0771, + "num_input_tokens_seen": 111649296, + "step": 51730 + }, + { + "epoch": 8.439641109298531, + "grad_norm": 0.813998281955719, + "learning_rate": 3.6182537275186947e-06, + "loss": 0.0682, + "num_input_tokens_seen": 111659152, + "step": 51735 + }, + { + "epoch": 8.440456769983687, + "grad_norm": 0.6815459728240967, + "learning_rate": 3.614566177776682e-06, + "loss": 0.0573, + "num_input_tokens_seen": 111669744, + "step": 51740 + }, + { + "epoch": 8.441272430668842, + "grad_norm": 0.22623679041862488, + "learning_rate": 3.6108803616464376e-06, + "loss": 0.1376, + "num_input_tokens_seen": 111680304, + "step": 51745 + }, + { + "epoch": 8.442088091353996, + "grad_norm": 1.0137273073196411, + "learning_rate": 3.6071962794267667e-06, + "loss": 0.1139, + "num_input_tokens_seen": 111692400, + "step": 51750 + }, + { + "epoch": 8.442903752039152, + "grad_norm": 0.1522027850151062, + "learning_rate": 3.603513931416311e-06, + "loss": 0.0903, + "num_input_tokens_seen": 111702160, + "step": 51755 + }, + { + "epoch": 8.443719412724306, + "grad_norm": 0.8516485691070557, + "learning_rate": 3.5998333179135783e-06, + "loss": 0.1452, + "num_input_tokens_seen": 111713712, + "step": 51760 + }, + { + "epoch": 8.444535073409462, + "grad_norm": 0.023396551609039307, + "learning_rate": 3.596154439216942e-06, + "loss": 0.0846, + "num_input_tokens_seen": 111724112, + "step": 51765 + }, + { + "epoch": 8.445350734094617, + "grad_norm": 0.6332190632820129, + "learning_rate": 3.5924772956246273e-06, + "loss": 0.0548, + "num_input_tokens_seen": 111735504, + "step": 51770 + }, + { + "epoch": 8.446166394779771, + "grad_norm": 0.07130077481269836, + "learning_rate": 3.5888018874347257e-06, + "loss": 0.0251, + "num_input_tokens_seen": 111744912, + "step": 51775 + }, + { + "epoch": 8.446982055464927, + "grad_norm": 0.5784686803817749, + "learning_rate": 3.5851282149451798e-06, + "loss": 0.0579, + "num_input_tokens_seen": 111756080, + "step": 51780 + }, + { + "epoch": 8.447797716150081, + "grad_norm": 0.32774847745895386, + "learning_rate": 3.5814562784538012e-06, + "loss": 0.0223, + "num_input_tokens_seen": 111766928, + "step": 51785 + }, + { + "epoch": 8.448613376835237, + "grad_norm": 0.1276811808347702, + "learning_rate": 3.5777860782582523e-06, + "loss": 0.2235, + "num_input_tokens_seen": 111775760, + "step": 51790 + }, + { + "epoch": 8.449429037520392, + "grad_norm": 0.34591084718704224, + "learning_rate": 3.5741176146560558e-06, + "loss": 0.1408, + "num_input_tokens_seen": 111787184, + "step": 51795 + }, + { + "epoch": 8.450244698205546, + "grad_norm": 0.23688393831253052, + "learning_rate": 3.570450887944601e-06, + "loss": 0.1378, + "num_input_tokens_seen": 111798064, + "step": 51800 + }, + { + "epoch": 8.451060358890702, + "grad_norm": 1.3135614395141602, + "learning_rate": 3.5667858984211323e-06, + "loss": 0.147, + "num_input_tokens_seen": 111808016, + "step": 51805 + }, + { + "epoch": 8.451876019575856, + "grad_norm": 0.45226746797561646, + "learning_rate": 3.5631226463827492e-06, + "loss": 0.036, + "num_input_tokens_seen": 111819568, + "step": 51810 + }, + { + "epoch": 8.452691680261012, + "grad_norm": 1.1148838996887207, + "learning_rate": 3.5594611321264125e-06, + "loss": 0.0646, + "num_input_tokens_seen": 111829840, + "step": 51815 + }, + { + "epoch": 8.453507340946166, + "grad_norm": 0.13128003478050232, + "learning_rate": 3.5558013559489457e-06, + "loss": 0.2329, + "num_input_tokens_seen": 111841136, + "step": 51820 + }, + { + "epoch": 8.454323001631321, + "grad_norm": 0.024890517815947533, + "learning_rate": 3.5521433181470306e-06, + "loss": 0.1931, + "num_input_tokens_seen": 111852208, + "step": 51825 + }, + { + "epoch": 8.455138662316477, + "grad_norm": 0.13824039697647095, + "learning_rate": 3.5484870190171994e-06, + "loss": 0.0902, + "num_input_tokens_seen": 111862608, + "step": 51830 + }, + { + "epoch": 8.455954323001631, + "grad_norm": 0.0625186413526535, + "learning_rate": 3.5448324588558566e-06, + "loss": 0.1197, + "num_input_tokens_seen": 111873424, + "step": 51835 + }, + { + "epoch": 8.456769983686787, + "grad_norm": 3.2054524421691895, + "learning_rate": 3.541179637959255e-06, + "loss": 0.3528, + "num_input_tokens_seen": 111884016, + "step": 51840 + }, + { + "epoch": 8.45758564437194, + "grad_norm": 0.4226118326187134, + "learning_rate": 3.537528556623515e-06, + "loss": 0.0632, + "num_input_tokens_seen": 111895440, + "step": 51845 + }, + { + "epoch": 8.458401305057096, + "grad_norm": 1.1501069068908691, + "learning_rate": 3.5338792151446087e-06, + "loss": 0.05, + "num_input_tokens_seen": 111907120, + "step": 51850 + }, + { + "epoch": 8.459216965742252, + "grad_norm": 0.28733572363853455, + "learning_rate": 3.530231613818372e-06, + "loss": 0.1915, + "num_input_tokens_seen": 111916464, + "step": 51855 + }, + { + "epoch": 8.460032626427406, + "grad_norm": 1.7976232767105103, + "learning_rate": 3.526585752940495e-06, + "loss": 0.136, + "num_input_tokens_seen": 111927376, + "step": 51860 + }, + { + "epoch": 8.460848287112562, + "grad_norm": 1.581836223602295, + "learning_rate": 3.522941632806534e-06, + "loss": 0.1962, + "num_input_tokens_seen": 111937392, + "step": 51865 + }, + { + "epoch": 8.461663947797716, + "grad_norm": 0.3661552965641022, + "learning_rate": 3.519299253711897e-06, + "loss": 0.0816, + "num_input_tokens_seen": 111948848, + "step": 51870 + }, + { + "epoch": 8.462479608482871, + "grad_norm": 0.05888834968209267, + "learning_rate": 3.515658615951856e-06, + "loss": 0.0136, + "num_input_tokens_seen": 111959664, + "step": 51875 + }, + { + "epoch": 8.463295269168025, + "grad_norm": 0.4386577010154724, + "learning_rate": 3.5120197198215356e-06, + "loss": 0.216, + "num_input_tokens_seen": 111969712, + "step": 51880 + }, + { + "epoch": 8.464110929853181, + "grad_norm": 0.7374330759048462, + "learning_rate": 3.508382565615928e-06, + "loss": 0.074, + "num_input_tokens_seen": 111981168, + "step": 51885 + }, + { + "epoch": 8.464926590538337, + "grad_norm": 1.0830618143081665, + "learning_rate": 3.5047471536298697e-06, + "loss": 0.1295, + "num_input_tokens_seen": 111991952, + "step": 51890 + }, + { + "epoch": 8.46574225122349, + "grad_norm": 1.590228796005249, + "learning_rate": 3.5011134841580805e-06, + "loss": 0.0785, + "num_input_tokens_seen": 112003024, + "step": 51895 + }, + { + "epoch": 8.466557911908646, + "grad_norm": 1.4729604721069336, + "learning_rate": 3.4974815574951135e-06, + "loss": 0.1552, + "num_input_tokens_seen": 112013360, + "step": 51900 + }, + { + "epoch": 8.4673735725938, + "grad_norm": 0.9723947048187256, + "learning_rate": 3.4938513739353973e-06, + "loss": 0.2171, + "num_input_tokens_seen": 112023664, + "step": 51905 + }, + { + "epoch": 8.468189233278956, + "grad_norm": 1.1635133028030396, + "learning_rate": 3.4902229337732074e-06, + "loss": 0.1112, + "num_input_tokens_seen": 112034928, + "step": 51910 + }, + { + "epoch": 8.469004893964112, + "grad_norm": 2.040271043777466, + "learning_rate": 3.4865962373026805e-06, + "loss": 0.224, + "num_input_tokens_seen": 112046128, + "step": 51915 + }, + { + "epoch": 8.469820554649266, + "grad_norm": 0.02159188874065876, + "learning_rate": 3.4829712848178293e-06, + "loss": 0.0121, + "num_input_tokens_seen": 112056944, + "step": 51920 + }, + { + "epoch": 8.470636215334421, + "grad_norm": 0.04666445404291153, + "learning_rate": 3.4793480766124986e-06, + "loss": 0.3086, + "num_input_tokens_seen": 112067472, + "step": 51925 + }, + { + "epoch": 8.471451876019575, + "grad_norm": 0.07601912319660187, + "learning_rate": 3.4757266129804093e-06, + "loss": 0.0261, + "num_input_tokens_seen": 112076400, + "step": 51930 + }, + { + "epoch": 8.47226753670473, + "grad_norm": 0.9782636761665344, + "learning_rate": 3.4721068942151324e-06, + "loss": 0.2338, + "num_input_tokens_seen": 112087312, + "step": 51935 + }, + { + "epoch": 8.473083197389887, + "grad_norm": 0.5792878270149231, + "learning_rate": 3.4684889206101025e-06, + "loss": 0.2671, + "num_input_tokens_seen": 112098544, + "step": 51940 + }, + { + "epoch": 8.47389885807504, + "grad_norm": 0.17506007850170135, + "learning_rate": 3.464872692458612e-06, + "loss": 0.1349, + "num_input_tokens_seen": 112108304, + "step": 51945 + }, + { + "epoch": 8.474714518760196, + "grad_norm": 0.16921840608119965, + "learning_rate": 3.4612582100538082e-06, + "loss": 0.0108, + "num_input_tokens_seen": 112119600, + "step": 51950 + }, + { + "epoch": 8.47553017944535, + "grad_norm": 0.3192669749259949, + "learning_rate": 3.4576454736887003e-06, + "loss": 0.024, + "num_input_tokens_seen": 112130192, + "step": 51955 + }, + { + "epoch": 8.476345840130506, + "grad_norm": 0.05869949981570244, + "learning_rate": 3.4540344836561546e-06, + "loss": 0.1937, + "num_input_tokens_seen": 112142288, + "step": 51960 + }, + { + "epoch": 8.477161500815662, + "grad_norm": 0.25752395391464233, + "learning_rate": 3.4504252402488974e-06, + "loss": 0.1307, + "num_input_tokens_seen": 112154032, + "step": 51965 + }, + { + "epoch": 8.477977161500815, + "grad_norm": 0.7192663550376892, + "learning_rate": 3.446817743759512e-06, + "loss": 0.0294, + "num_input_tokens_seen": 112165520, + "step": 51970 + }, + { + "epoch": 8.478792822185971, + "grad_norm": 1.352360486984253, + "learning_rate": 3.443211994480439e-06, + "loss": 0.1698, + "num_input_tokens_seen": 112176560, + "step": 51975 + }, + { + "epoch": 8.479608482871125, + "grad_norm": 0.24142509698867798, + "learning_rate": 3.4396079927039804e-06, + "loss": 0.0137, + "num_input_tokens_seen": 112187248, + "step": 51980 + }, + { + "epoch": 8.48042414355628, + "grad_norm": 0.15978406369686127, + "learning_rate": 3.436005738722292e-06, + "loss": 0.0902, + "num_input_tokens_seen": 112198128, + "step": 51985 + }, + { + "epoch": 8.481239804241435, + "grad_norm": 1.679732084274292, + "learning_rate": 3.432405232827396e-06, + "loss": 0.1935, + "num_input_tokens_seen": 112209680, + "step": 51990 + }, + { + "epoch": 8.48205546492659, + "grad_norm": 0.2809467911720276, + "learning_rate": 3.428806475311164e-06, + "loss": 0.0381, + "num_input_tokens_seen": 112220688, + "step": 51995 + }, + { + "epoch": 8.482871125611746, + "grad_norm": 3.652738332748413, + "learning_rate": 3.4252094664653316e-06, + "loss": 0.2268, + "num_input_tokens_seen": 112231632, + "step": 52000 + }, + { + "epoch": 8.4836867862969, + "grad_norm": 0.08355129510164261, + "learning_rate": 3.4216142065814806e-06, + "loss": 0.2276, + "num_input_tokens_seen": 112240592, + "step": 52005 + }, + { + "epoch": 8.484502446982056, + "grad_norm": 0.1358146071434021, + "learning_rate": 3.418020695951077e-06, + "loss": 0.0314, + "num_input_tokens_seen": 112251440, + "step": 52010 + }, + { + "epoch": 8.48531810766721, + "grad_norm": 0.9984641075134277, + "learning_rate": 3.414428934865421e-06, + "loss": 0.1468, + "num_input_tokens_seen": 112262192, + "step": 52015 + }, + { + "epoch": 8.486133768352365, + "grad_norm": 0.3141207695007324, + "learning_rate": 3.4108389236156806e-06, + "loss": 0.0248, + "num_input_tokens_seen": 112273808, + "step": 52020 + }, + { + "epoch": 8.486949429037521, + "grad_norm": 1.3101071119308472, + "learning_rate": 3.4072506624928808e-06, + "loss": 0.0772, + "num_input_tokens_seen": 112284304, + "step": 52025 + }, + { + "epoch": 8.487765089722675, + "grad_norm": 1.7345452308654785, + "learning_rate": 3.4036641517878997e-06, + "loss": 0.2295, + "num_input_tokens_seen": 112295152, + "step": 52030 + }, + { + "epoch": 8.48858075040783, + "grad_norm": 0.44113877415657043, + "learning_rate": 3.400079391791483e-06, + "loss": 0.0992, + "num_input_tokens_seen": 112305648, + "step": 52035 + }, + { + "epoch": 8.489396411092985, + "grad_norm": 1.3204889297485352, + "learning_rate": 3.3964963827942257e-06, + "loss": 0.1245, + "num_input_tokens_seen": 112316144, + "step": 52040 + }, + { + "epoch": 8.49021207177814, + "grad_norm": 0.2967292070388794, + "learning_rate": 3.3929151250865903e-06, + "loss": 0.0734, + "num_input_tokens_seen": 112328560, + "step": 52045 + }, + { + "epoch": 8.491027732463296, + "grad_norm": 0.30849069356918335, + "learning_rate": 3.389335618958886e-06, + "loss": 0.085, + "num_input_tokens_seen": 112338640, + "step": 52050 + }, + { + "epoch": 8.49184339314845, + "grad_norm": 0.7096772193908691, + "learning_rate": 3.385757864701286e-06, + "loss": 0.0899, + "num_input_tokens_seen": 112349232, + "step": 52055 + }, + { + "epoch": 8.492659053833606, + "grad_norm": 0.18698714673519135, + "learning_rate": 3.3821818626038198e-06, + "loss": 0.0864, + "num_input_tokens_seen": 112360400, + "step": 52060 + }, + { + "epoch": 8.49347471451876, + "grad_norm": 0.0723661258816719, + "learning_rate": 3.378607612956386e-06, + "loss": 0.0615, + "num_input_tokens_seen": 112371408, + "step": 52065 + }, + { + "epoch": 8.494290375203915, + "grad_norm": 1.2426378726959229, + "learning_rate": 3.375035116048722e-06, + "loss": 0.1498, + "num_input_tokens_seen": 112382544, + "step": 52070 + }, + { + "epoch": 8.49510603588907, + "grad_norm": 1.9386016130447388, + "learning_rate": 3.371464372170438e-06, + "loss": 0.1939, + "num_input_tokens_seen": 112392688, + "step": 52075 + }, + { + "epoch": 8.495921696574225, + "grad_norm": 0.32762107253074646, + "learning_rate": 3.3678953816109916e-06, + "loss": 0.1223, + "num_input_tokens_seen": 112404240, + "step": 52080 + }, + { + "epoch": 8.49673735725938, + "grad_norm": 0.40536460280418396, + "learning_rate": 3.3643281446597092e-06, + "loss": 0.1202, + "num_input_tokens_seen": 112414768, + "step": 52085 + }, + { + "epoch": 8.497553017944535, + "grad_norm": 0.07895764708518982, + "learning_rate": 3.3607626616057624e-06, + "loss": 0.1753, + "num_input_tokens_seen": 112425424, + "step": 52090 + }, + { + "epoch": 8.49836867862969, + "grad_norm": 0.21950703859329224, + "learning_rate": 3.3571989327381923e-06, + "loss": 0.0891, + "num_input_tokens_seen": 112436720, + "step": 52095 + }, + { + "epoch": 8.499184339314844, + "grad_norm": 0.4222082793712616, + "learning_rate": 3.3536369583458905e-06, + "loss": 0.2334, + "num_input_tokens_seen": 112446896, + "step": 52100 + }, + { + "epoch": 8.5, + "grad_norm": 0.06166878715157509, + "learning_rate": 3.3500767387176114e-06, + "loss": 0.0666, + "num_input_tokens_seen": 112458064, + "step": 52105 + }, + { + "epoch": 8.5, + "eval_loss": 0.13894924521446228, + "eval_runtime": 131.8371, + "eval_samples_per_second": 20.669, + "eval_steps_per_second": 5.173, + "num_input_tokens_seen": 112458064, + "step": 52105 + }, + { + "epoch": 8.500815660685156, + "grad_norm": 0.5304344296455383, + "learning_rate": 3.3465182741419547e-06, + "loss": 0.0743, + "num_input_tokens_seen": 112469200, + "step": 52110 + }, + { + "epoch": 8.50163132137031, + "grad_norm": 3.2155141830444336, + "learning_rate": 3.3429615649074013e-06, + "loss": 0.3232, + "num_input_tokens_seen": 112479888, + "step": 52115 + }, + { + "epoch": 8.502446982055465, + "grad_norm": 0.5862310528755188, + "learning_rate": 3.3394066113022706e-06, + "loss": 0.3664, + "num_input_tokens_seen": 112490896, + "step": 52120 + }, + { + "epoch": 8.50326264274062, + "grad_norm": 0.1469126045703888, + "learning_rate": 3.335853413614745e-06, + "loss": 0.1117, + "num_input_tokens_seen": 112503152, + "step": 52125 + }, + { + "epoch": 8.504078303425775, + "grad_norm": 0.3629530370235443, + "learning_rate": 3.332301972132862e-06, + "loss": 0.1647, + "num_input_tokens_seen": 112513936, + "step": 52130 + }, + { + "epoch": 8.50489396411093, + "grad_norm": 1.5482136011123657, + "learning_rate": 3.3287522871445263e-06, + "loss": 0.1749, + "num_input_tokens_seen": 112523536, + "step": 52135 + }, + { + "epoch": 8.505709624796085, + "grad_norm": 0.08448302745819092, + "learning_rate": 3.3252043589374866e-06, + "loss": 0.1633, + "num_input_tokens_seen": 112534512, + "step": 52140 + }, + { + "epoch": 8.50652528548124, + "grad_norm": 0.6574211120605469, + "learning_rate": 3.3216581877993564e-06, + "loss": 0.2488, + "num_input_tokens_seen": 112546032, + "step": 52145 + }, + { + "epoch": 8.507340946166394, + "grad_norm": 1.2649428844451904, + "learning_rate": 3.3181137740176118e-06, + "loss": 0.2278, + "num_input_tokens_seen": 112556016, + "step": 52150 + }, + { + "epoch": 8.50815660685155, + "grad_norm": 1.0931663513183594, + "learning_rate": 3.3145711178795753e-06, + "loss": 0.1032, + "num_input_tokens_seen": 112565520, + "step": 52155 + }, + { + "epoch": 8.508972267536706, + "grad_norm": 0.10509290546178818, + "learning_rate": 3.3110302196724368e-06, + "loss": 0.0327, + "num_input_tokens_seen": 112575824, + "step": 52160 + }, + { + "epoch": 8.50978792822186, + "grad_norm": 1.8113428354263306, + "learning_rate": 3.3074910796832363e-06, + "loss": 0.1005, + "num_input_tokens_seen": 112587056, + "step": 52165 + }, + { + "epoch": 8.510603588907015, + "grad_norm": 0.22423064708709717, + "learning_rate": 3.303953698198875e-06, + "loss": 0.0139, + "num_input_tokens_seen": 112598192, + "step": 52170 + }, + { + "epoch": 8.51141924959217, + "grad_norm": 0.5099257826805115, + "learning_rate": 3.300418075506112e-06, + "loss": 0.0285, + "num_input_tokens_seen": 112609584, + "step": 52175 + }, + { + "epoch": 8.512234910277325, + "grad_norm": 1.2476074695587158, + "learning_rate": 3.296884211891563e-06, + "loss": 0.1254, + "num_input_tokens_seen": 112620720, + "step": 52180 + }, + { + "epoch": 8.513050570962479, + "grad_norm": 0.13075698912143707, + "learning_rate": 3.293352107641698e-06, + "loss": 0.0606, + "num_input_tokens_seen": 112631696, + "step": 52185 + }, + { + "epoch": 8.513866231647635, + "grad_norm": 0.2438461035490036, + "learning_rate": 3.2898217630428523e-06, + "loss": 0.1759, + "num_input_tokens_seen": 112641872, + "step": 52190 + }, + { + "epoch": 8.51468189233279, + "grad_norm": 0.2068079710006714, + "learning_rate": 3.2862931783812083e-06, + "loss": 0.0378, + "num_input_tokens_seen": 112652240, + "step": 52195 + }, + { + "epoch": 8.515497553017944, + "grad_norm": 0.1402578055858612, + "learning_rate": 3.282766353942815e-06, + "loss": 0.0277, + "num_input_tokens_seen": 112663824, + "step": 52200 + }, + { + "epoch": 8.5163132137031, + "grad_norm": 0.49284863471984863, + "learning_rate": 3.279241290013568e-06, + "loss": 0.0612, + "num_input_tokens_seen": 112675216, + "step": 52205 + }, + { + "epoch": 8.517128874388254, + "grad_norm": 0.5502614974975586, + "learning_rate": 3.275717986879237e-06, + "loss": 0.0313, + "num_input_tokens_seen": 112684464, + "step": 52210 + }, + { + "epoch": 8.51794453507341, + "grad_norm": 0.22330661118030548, + "learning_rate": 3.2721964448254345e-06, + "loss": 0.0507, + "num_input_tokens_seen": 112695120, + "step": 52215 + }, + { + "epoch": 8.518760195758565, + "grad_norm": 0.11130800098180771, + "learning_rate": 3.268676664137635e-06, + "loss": 0.0381, + "num_input_tokens_seen": 112705776, + "step": 52220 + }, + { + "epoch": 8.51957585644372, + "grad_norm": 0.15646399557590485, + "learning_rate": 3.2651586451011657e-06, + "loss": 0.2317, + "num_input_tokens_seen": 112716368, + "step": 52225 + }, + { + "epoch": 8.520391517128875, + "grad_norm": 0.4524112045764923, + "learning_rate": 3.2616423880012153e-06, + "loss": 0.0378, + "num_input_tokens_seen": 112727440, + "step": 52230 + }, + { + "epoch": 8.521207177814029, + "grad_norm": 2.1564719676971436, + "learning_rate": 3.2581278931228363e-06, + "loss": 0.2928, + "num_input_tokens_seen": 112736336, + "step": 52235 + }, + { + "epoch": 8.522022838499185, + "grad_norm": 1.1869664192199707, + "learning_rate": 3.254615160750926e-06, + "loss": 0.1486, + "num_input_tokens_seen": 112747312, + "step": 52240 + }, + { + "epoch": 8.522838499184338, + "grad_norm": 0.5580085515975952, + "learning_rate": 3.2511041911702483e-06, + "loss": 0.3119, + "num_input_tokens_seen": 112758160, + "step": 52245 + }, + { + "epoch": 8.523654159869494, + "grad_norm": 0.7612715363502502, + "learning_rate": 3.247594984665417e-06, + "loss": 0.1051, + "num_input_tokens_seen": 112768080, + "step": 52250 + }, + { + "epoch": 8.52446982055465, + "grad_norm": 0.174855038523674, + "learning_rate": 3.244087541520907e-06, + "loss": 0.1327, + "num_input_tokens_seen": 112778416, + "step": 52255 + }, + { + "epoch": 8.525285481239804, + "grad_norm": 1.7078816890716553, + "learning_rate": 3.24058186202105e-06, + "loss": 0.1175, + "num_input_tokens_seen": 112788944, + "step": 52260 + }, + { + "epoch": 8.52610114192496, + "grad_norm": 0.050862863659858704, + "learning_rate": 3.2370779464500317e-06, + "loss": 0.0346, + "num_input_tokens_seen": 112799664, + "step": 52265 + }, + { + "epoch": 8.526916802610113, + "grad_norm": 0.2345835566520691, + "learning_rate": 3.2335757950919003e-06, + "loss": 0.3092, + "num_input_tokens_seen": 112809840, + "step": 52270 + }, + { + "epoch": 8.52773246329527, + "grad_norm": 0.6063271760940552, + "learning_rate": 3.230075408230557e-06, + "loss": 0.1385, + "num_input_tokens_seen": 112820240, + "step": 52275 + }, + { + "epoch": 8.528548123980425, + "grad_norm": 0.09482656419277191, + "learning_rate": 3.2265767861497597e-06, + "loss": 0.119, + "num_input_tokens_seen": 112830704, + "step": 52280 + }, + { + "epoch": 8.529363784665579, + "grad_norm": 1.0516760349273682, + "learning_rate": 3.2230799291331244e-06, + "loss": 0.083, + "num_input_tokens_seen": 112842352, + "step": 52285 + }, + { + "epoch": 8.530179445350734, + "grad_norm": 0.6471627950668335, + "learning_rate": 3.219584837464126e-06, + "loss": 0.1452, + "num_input_tokens_seen": 112853904, + "step": 52290 + }, + { + "epoch": 8.530995106035888, + "grad_norm": 0.9402866959571838, + "learning_rate": 3.2160915114260947e-06, + "loss": 0.0988, + "num_input_tokens_seen": 112863504, + "step": 52295 + }, + { + "epoch": 8.531810766721044, + "grad_norm": 0.20046430826187134, + "learning_rate": 3.212599951302214e-06, + "loss": 0.0494, + "num_input_tokens_seen": 112873808, + "step": 52300 + }, + { + "epoch": 8.5326264274062, + "grad_norm": 1.5563350915908813, + "learning_rate": 3.2091101573755306e-06, + "loss": 0.1691, + "num_input_tokens_seen": 112883696, + "step": 52305 + }, + { + "epoch": 8.533442088091354, + "grad_norm": 0.1295468658208847, + "learning_rate": 3.2056221299289423e-06, + "loss": 0.0389, + "num_input_tokens_seen": 112893424, + "step": 52310 + }, + { + "epoch": 8.53425774877651, + "grad_norm": 0.2061646580696106, + "learning_rate": 3.20213586924521e-06, + "loss": 0.1168, + "num_input_tokens_seen": 112904016, + "step": 52315 + }, + { + "epoch": 8.535073409461663, + "grad_norm": 1.0506218671798706, + "learning_rate": 3.1986513756069426e-06, + "loss": 0.1046, + "num_input_tokens_seen": 112915376, + "step": 52320 + }, + { + "epoch": 8.535889070146819, + "grad_norm": 0.059418100863695145, + "learning_rate": 3.1951686492966094e-06, + "loss": 0.1003, + "num_input_tokens_seen": 112925424, + "step": 52325 + }, + { + "epoch": 8.536704730831975, + "grad_norm": 0.09359080344438553, + "learning_rate": 3.1916876905965483e-06, + "loss": 0.1671, + "num_input_tokens_seen": 112936656, + "step": 52330 + }, + { + "epoch": 8.537520391517129, + "grad_norm": 0.07560225576162338, + "learning_rate": 3.188208499788936e-06, + "loss": 0.0843, + "num_input_tokens_seen": 112947600, + "step": 52335 + }, + { + "epoch": 8.538336052202284, + "grad_norm": 0.02972748875617981, + "learning_rate": 3.184731077155817e-06, + "loss": 0.0232, + "num_input_tokens_seen": 112957840, + "step": 52340 + }, + { + "epoch": 8.539151712887438, + "grad_norm": 0.7501899003982544, + "learning_rate": 3.1812554229790848e-06, + "loss": 0.0619, + "num_input_tokens_seen": 112969488, + "step": 52345 + }, + { + "epoch": 8.539967373572594, + "grad_norm": 0.5979722142219543, + "learning_rate": 3.1777815375404944e-06, + "loss": 0.0386, + "num_input_tokens_seen": 112979216, + "step": 52350 + }, + { + "epoch": 8.540783034257748, + "grad_norm": 0.37767651677131653, + "learning_rate": 3.17430942112166e-06, + "loss": 0.0291, + "num_input_tokens_seen": 112990736, + "step": 52355 + }, + { + "epoch": 8.541598694942904, + "grad_norm": 1.125956654548645, + "learning_rate": 3.170839074004045e-06, + "loss": 0.2426, + "num_input_tokens_seen": 113001296, + "step": 52360 + }, + { + "epoch": 8.54241435562806, + "grad_norm": 0.051118023693561554, + "learning_rate": 3.1673704964689743e-06, + "loss": 0.0997, + "num_input_tokens_seen": 113011312, + "step": 52365 + }, + { + "epoch": 8.543230016313213, + "grad_norm": 0.27044036984443665, + "learning_rate": 3.1639036887976286e-06, + "loss": 0.0703, + "num_input_tokens_seen": 113020688, + "step": 52370 + }, + { + "epoch": 8.544045676998369, + "grad_norm": 0.38875794410705566, + "learning_rate": 3.1604386512710387e-06, + "loss": 0.0599, + "num_input_tokens_seen": 113031888, + "step": 52375 + }, + { + "epoch": 8.544861337683523, + "grad_norm": 1.2605162858963013, + "learning_rate": 3.1569753841701106e-06, + "loss": 0.1577, + "num_input_tokens_seen": 113042800, + "step": 52380 + }, + { + "epoch": 8.545676998368679, + "grad_norm": 0.2370639592409134, + "learning_rate": 3.1535138877755887e-06, + "loss": 0.0243, + "num_input_tokens_seen": 113052176, + "step": 52385 + }, + { + "epoch": 8.546492659053834, + "grad_norm": 0.022931218147277832, + "learning_rate": 3.1500541623680795e-06, + "loss": 0.1826, + "num_input_tokens_seen": 113062608, + "step": 52390 + }, + { + "epoch": 8.547308319738988, + "grad_norm": 0.7654633522033691, + "learning_rate": 3.1465962082280474e-06, + "loss": 0.1278, + "num_input_tokens_seen": 113075248, + "step": 52395 + }, + { + "epoch": 8.548123980424144, + "grad_norm": 0.09503885358572006, + "learning_rate": 3.1431400256358073e-06, + "loss": 0.1108, + "num_input_tokens_seen": 113086256, + "step": 52400 + }, + { + "epoch": 8.548939641109298, + "grad_norm": 1.1328749656677246, + "learning_rate": 3.1396856148715375e-06, + "loss": 0.0679, + "num_input_tokens_seen": 113096752, + "step": 52405 + }, + { + "epoch": 8.549755301794454, + "grad_norm": 0.028697090223431587, + "learning_rate": 3.13623297621527e-06, + "loss": 0.0837, + "num_input_tokens_seen": 113106480, + "step": 52410 + }, + { + "epoch": 8.550570962479608, + "grad_norm": 0.0757938101887703, + "learning_rate": 3.1327821099468915e-06, + "loss": 0.1304, + "num_input_tokens_seen": 113117424, + "step": 52415 + }, + { + "epoch": 8.551386623164763, + "grad_norm": 2.051435708999634, + "learning_rate": 3.1293330163461503e-06, + "loss": 0.167, + "num_input_tokens_seen": 113127728, + "step": 52420 + }, + { + "epoch": 8.552202283849919, + "grad_norm": 1.3637478351593018, + "learning_rate": 3.125885695692646e-06, + "loss": 0.1183, + "num_input_tokens_seen": 113138000, + "step": 52425 + }, + { + "epoch": 8.553017944535073, + "grad_norm": 0.24820183217525482, + "learning_rate": 3.122440148265829e-06, + "loss": 0.1339, + "num_input_tokens_seen": 113148752, + "step": 52430 + }, + { + "epoch": 8.553833605220229, + "grad_norm": 0.0765695795416832, + "learning_rate": 3.1189963743450235e-06, + "loss": 0.0589, + "num_input_tokens_seen": 113159600, + "step": 52435 + }, + { + "epoch": 8.554649265905383, + "grad_norm": 1.3098163604736328, + "learning_rate": 3.115554374209395e-06, + "loss": 0.084, + "num_input_tokens_seen": 113170416, + "step": 52440 + }, + { + "epoch": 8.555464926590538, + "grad_norm": 1.2991279363632202, + "learning_rate": 3.1121141481379735e-06, + "loss": 0.1456, + "num_input_tokens_seen": 113179984, + "step": 52445 + }, + { + "epoch": 8.556280587275694, + "grad_norm": 0.9799571633338928, + "learning_rate": 3.1086756964096327e-06, + "loss": 0.1227, + "num_input_tokens_seen": 113189680, + "step": 52450 + }, + { + "epoch": 8.557096247960848, + "grad_norm": 0.610051691532135, + "learning_rate": 3.105239019303116e-06, + "loss": 0.0897, + "num_input_tokens_seen": 113200112, + "step": 52455 + }, + { + "epoch": 8.557911908646004, + "grad_norm": 0.4796420931816101, + "learning_rate": 3.101804117097018e-06, + "loss": 0.0828, + "num_input_tokens_seen": 113210416, + "step": 52460 + }, + { + "epoch": 8.558727569331158, + "grad_norm": 0.070195272564888, + "learning_rate": 3.0983709900697903e-06, + "loss": 0.01, + "num_input_tokens_seen": 113221392, + "step": 52465 + }, + { + "epoch": 8.559543230016313, + "grad_norm": 0.2101294845342636, + "learning_rate": 3.0949396384997357e-06, + "loss": 0.3246, + "num_input_tokens_seen": 113232528, + "step": 52470 + }, + { + "epoch": 8.560358890701469, + "grad_norm": 0.39978134632110596, + "learning_rate": 3.0915100626650206e-06, + "loss": 0.1513, + "num_input_tokens_seen": 113243056, + "step": 52475 + }, + { + "epoch": 8.561174551386623, + "grad_norm": 2.006371259689331, + "learning_rate": 3.0880822628436613e-06, + "loss": 0.2345, + "num_input_tokens_seen": 113254576, + "step": 52480 + }, + { + "epoch": 8.561990212071779, + "grad_norm": 0.09436669945716858, + "learning_rate": 3.0846562393135352e-06, + "loss": 0.1163, + "num_input_tokens_seen": 113263472, + "step": 52485 + }, + { + "epoch": 8.562805872756933, + "grad_norm": 0.8137069344520569, + "learning_rate": 3.0812319923523706e-06, + "loss": 0.047, + "num_input_tokens_seen": 113276112, + "step": 52490 + }, + { + "epoch": 8.563621533442088, + "grad_norm": 0.35095372796058655, + "learning_rate": 3.077809522237754e-06, + "loss": 0.0398, + "num_input_tokens_seen": 113285520, + "step": 52495 + }, + { + "epoch": 8.564437194127244, + "grad_norm": 1.1252297163009644, + "learning_rate": 3.0743888292471322e-06, + "loss": 0.1123, + "num_input_tokens_seen": 113296304, + "step": 52500 + }, + { + "epoch": 8.565252854812398, + "grad_norm": 0.0648542195558548, + "learning_rate": 3.0709699136578006e-06, + "loss": 0.1039, + "num_input_tokens_seen": 113307632, + "step": 52505 + }, + { + "epoch": 8.566068515497554, + "grad_norm": 1.5299254655838013, + "learning_rate": 3.0675527757469124e-06, + "loss": 0.119, + "num_input_tokens_seen": 113318352, + "step": 52510 + }, + { + "epoch": 8.566884176182707, + "grad_norm": 0.062116045504808426, + "learning_rate": 3.064137415791485e-06, + "loss": 0.1998, + "num_input_tokens_seen": 113329552, + "step": 52515 + }, + { + "epoch": 8.567699836867863, + "grad_norm": 0.577437698841095, + "learning_rate": 3.0607238340683713e-06, + "loss": 0.0897, + "num_input_tokens_seen": 113339216, + "step": 52520 + }, + { + "epoch": 8.568515497553017, + "grad_norm": 1.1465462446212769, + "learning_rate": 3.057312030854306e-06, + "loss": 0.2199, + "num_input_tokens_seen": 113349904, + "step": 52525 + }, + { + "epoch": 8.569331158238173, + "grad_norm": 0.9464571475982666, + "learning_rate": 3.0539020064258682e-06, + "loss": 0.0895, + "num_input_tokens_seen": 113358832, + "step": 52530 + }, + { + "epoch": 8.570146818923329, + "grad_norm": 0.16618067026138306, + "learning_rate": 3.0504937610594837e-06, + "loss": 0.1557, + "num_input_tokens_seen": 113370480, + "step": 52535 + }, + { + "epoch": 8.570962479608482, + "grad_norm": 0.07771164178848267, + "learning_rate": 3.0470872950314476e-06, + "loss": 0.0277, + "num_input_tokens_seen": 113380592, + "step": 52540 + }, + { + "epoch": 8.571778140293638, + "grad_norm": 0.6178666949272156, + "learning_rate": 3.043682608617898e-06, + "loss": 0.1341, + "num_input_tokens_seen": 113392112, + "step": 52545 + }, + { + "epoch": 8.572593800978792, + "grad_norm": 0.07715046405792236, + "learning_rate": 3.0402797020948446e-06, + "loss": 0.1031, + "num_input_tokens_seen": 113403152, + "step": 52550 + }, + { + "epoch": 8.573409461663948, + "grad_norm": 0.18739329278469086, + "learning_rate": 3.0368785757381418e-06, + "loss": 0.0248, + "num_input_tokens_seen": 113413424, + "step": 52555 + }, + { + "epoch": 8.574225122349104, + "grad_norm": 1.7632646560668945, + "learning_rate": 3.033479229823502e-06, + "loss": 0.1614, + "num_input_tokens_seen": 113424848, + "step": 52560 + }, + { + "epoch": 8.575040783034257, + "grad_norm": 1.6881829500198364, + "learning_rate": 3.030081664626494e-06, + "loss": 0.4196, + "num_input_tokens_seen": 113434608, + "step": 52565 + }, + { + "epoch": 8.575856443719413, + "grad_norm": 0.05792311206459999, + "learning_rate": 3.0266858804225388e-06, + "loss": 0.1628, + "num_input_tokens_seen": 113446224, + "step": 52570 + }, + { + "epoch": 8.576672104404567, + "grad_norm": 0.07818284630775452, + "learning_rate": 3.0232918774869194e-06, + "loss": 0.2568, + "num_input_tokens_seen": 113457296, + "step": 52575 + }, + { + "epoch": 8.577487765089723, + "grad_norm": 0.030526209622621536, + "learning_rate": 3.0198996560947657e-06, + "loss": 0.0373, + "num_input_tokens_seen": 113468720, + "step": 52580 + }, + { + "epoch": 8.578303425774878, + "grad_norm": 1.0556334257125854, + "learning_rate": 3.016509216521074e-06, + "loss": 0.1879, + "num_input_tokens_seen": 113479152, + "step": 52585 + }, + { + "epoch": 8.579119086460032, + "grad_norm": 1.6634182929992676, + "learning_rate": 3.0131205590406886e-06, + "loss": 0.1747, + "num_input_tokens_seen": 113490608, + "step": 52590 + }, + { + "epoch": 8.579934747145188, + "grad_norm": 1.2551792860031128, + "learning_rate": 3.0097336839283118e-06, + "loss": 0.049, + "num_input_tokens_seen": 113501968, + "step": 52595 + }, + { + "epoch": 8.580750407830342, + "grad_norm": 0.3023640513420105, + "learning_rate": 3.0063485914584995e-06, + "loss": 0.0637, + "num_input_tokens_seen": 113512944, + "step": 52600 + }, + { + "epoch": 8.581566068515498, + "grad_norm": 0.11705157905817032, + "learning_rate": 3.0029652819056646e-06, + "loss": 0.0338, + "num_input_tokens_seen": 113524304, + "step": 52605 + }, + { + "epoch": 8.582381729200652, + "grad_norm": 1.6001521348953247, + "learning_rate": 2.9995837555440748e-06, + "loss": 0.1719, + "num_input_tokens_seen": 113536400, + "step": 52610 + }, + { + "epoch": 8.583197389885807, + "grad_norm": 1.3096264600753784, + "learning_rate": 2.9962040126478548e-06, + "loss": 0.2358, + "num_input_tokens_seen": 113546992, + "step": 52615 + }, + { + "epoch": 8.584013050570963, + "grad_norm": 0.7460181713104248, + "learning_rate": 2.992826053490985e-06, + "loss": 0.0512, + "num_input_tokens_seen": 113557808, + "step": 52620 + }, + { + "epoch": 8.584828711256117, + "grad_norm": 0.16230231523513794, + "learning_rate": 2.9894498783473e-06, + "loss": 0.1113, + "num_input_tokens_seen": 113568336, + "step": 52625 + }, + { + "epoch": 8.585644371941273, + "grad_norm": 0.49021467566490173, + "learning_rate": 2.986075487490486e-06, + "loss": 0.0828, + "num_input_tokens_seen": 113579280, + "step": 52630 + }, + { + "epoch": 8.586460032626427, + "grad_norm": 0.3599066436290741, + "learning_rate": 2.982702881194091e-06, + "loss": 0.0504, + "num_input_tokens_seen": 113591024, + "step": 52635 + }, + { + "epoch": 8.587275693311582, + "grad_norm": 0.5298332571983337, + "learning_rate": 2.9793320597315154e-06, + "loss": 0.1606, + "num_input_tokens_seen": 113602032, + "step": 52640 + }, + { + "epoch": 8.588091353996738, + "grad_norm": 0.21833805739879608, + "learning_rate": 2.975963023376008e-06, + "loss": 0.0491, + "num_input_tokens_seen": 113612688, + "step": 52645 + }, + { + "epoch": 8.588907014681892, + "grad_norm": 1.4480880498886108, + "learning_rate": 2.9725957724006936e-06, + "loss": 0.101, + "num_input_tokens_seen": 113622224, + "step": 52650 + }, + { + "epoch": 8.589722675367048, + "grad_norm": 1.7689820528030396, + "learning_rate": 2.9692303070785325e-06, + "loss": 0.1441, + "num_input_tokens_seen": 113631920, + "step": 52655 + }, + { + "epoch": 8.590538336052202, + "grad_norm": 0.4739533066749573, + "learning_rate": 2.9658666276823427e-06, + "loss": 0.0427, + "num_input_tokens_seen": 113642672, + "step": 52660 + }, + { + "epoch": 8.591353996737357, + "grad_norm": 0.10593804717063904, + "learning_rate": 2.9625047344848082e-06, + "loss": 0.1883, + "num_input_tokens_seen": 113653456, + "step": 52665 + }, + { + "epoch": 8.592169657422513, + "grad_norm": 0.3213776648044586, + "learning_rate": 2.959144627758453e-06, + "loss": 0.1167, + "num_input_tokens_seen": 113664304, + "step": 52670 + }, + { + "epoch": 8.592985318107667, + "grad_norm": 2.619401216506958, + "learning_rate": 2.955786307775671e-06, + "loss": 0.181, + "num_input_tokens_seen": 113675024, + "step": 52675 + }, + { + "epoch": 8.593800978792823, + "grad_norm": 0.03337228298187256, + "learning_rate": 2.9524297748087014e-06, + "loss": 0.214, + "num_input_tokens_seen": 113685200, + "step": 52680 + }, + { + "epoch": 8.594616639477977, + "grad_norm": 0.5404746532440186, + "learning_rate": 2.949075029129644e-06, + "loss": 0.0773, + "num_input_tokens_seen": 113696528, + "step": 52685 + }, + { + "epoch": 8.595432300163132, + "grad_norm": 0.049870576709508896, + "learning_rate": 2.945722071010443e-06, + "loss": 0.1669, + "num_input_tokens_seen": 113707568, + "step": 52690 + }, + { + "epoch": 8.596247960848288, + "grad_norm": 0.11618579179048538, + "learning_rate": 2.9423709007229184e-06, + "loss": 0.0178, + "num_input_tokens_seen": 113719280, + "step": 52695 + }, + { + "epoch": 8.597063621533442, + "grad_norm": 2.622437000274658, + "learning_rate": 2.9390215185387287e-06, + "loss": 0.2731, + "num_input_tokens_seen": 113729072, + "step": 52700 + }, + { + "epoch": 8.597879282218598, + "grad_norm": 1.0182908773422241, + "learning_rate": 2.93567392472939e-06, + "loss": 0.0709, + "num_input_tokens_seen": 113740784, + "step": 52705 + }, + { + "epoch": 8.598694942903752, + "grad_norm": 0.08351012319326401, + "learning_rate": 2.932328119566277e-06, + "loss": 0.1342, + "num_input_tokens_seen": 113751088, + "step": 52710 + }, + { + "epoch": 8.599510603588907, + "grad_norm": 1.3068711757659912, + "learning_rate": 2.928984103320617e-06, + "loss": 0.3106, + "num_input_tokens_seen": 113762672, + "step": 52715 + }, + { + "epoch": 8.600326264274061, + "grad_norm": 0.15515518188476562, + "learning_rate": 2.9256418762634936e-06, + "loss": 0.0794, + "num_input_tokens_seen": 113773072, + "step": 52720 + }, + { + "epoch": 8.601141924959217, + "grad_norm": 0.07428939640522003, + "learning_rate": 2.922301438665842e-06, + "loss": 0.0399, + "num_input_tokens_seen": 113784080, + "step": 52725 + }, + { + "epoch": 8.601957585644373, + "grad_norm": 0.49779772758483887, + "learning_rate": 2.9189627907984576e-06, + "loss": 0.0483, + "num_input_tokens_seen": 113794704, + "step": 52730 + }, + { + "epoch": 8.602773246329527, + "grad_norm": 0.31317272782325745, + "learning_rate": 2.9156259329319867e-06, + "loss": 0.0389, + "num_input_tokens_seen": 113806512, + "step": 52735 + }, + { + "epoch": 8.603588907014682, + "grad_norm": 1.3151118755340576, + "learning_rate": 2.9122908653369335e-06, + "loss": 0.2086, + "num_input_tokens_seen": 113817872, + "step": 52740 + }, + { + "epoch": 8.604404567699836, + "grad_norm": 0.7250694632530212, + "learning_rate": 2.908957588283656e-06, + "loss": 0.0745, + "num_input_tokens_seen": 113829808, + "step": 52745 + }, + { + "epoch": 8.605220228384992, + "grad_norm": 0.021447768434882164, + "learning_rate": 2.9056261020423582e-06, + "loss": 0.0265, + "num_input_tokens_seen": 113840656, + "step": 52750 + }, + { + "epoch": 8.606035889070148, + "grad_norm": 0.32205256819725037, + "learning_rate": 2.9022964068831204e-06, + "loss": 0.0962, + "num_input_tokens_seen": 113851664, + "step": 52755 + }, + { + "epoch": 8.606851549755302, + "grad_norm": 0.5026395916938782, + "learning_rate": 2.898968503075858e-06, + "loss": 0.0845, + "num_input_tokens_seen": 113862288, + "step": 52760 + }, + { + "epoch": 8.607667210440457, + "grad_norm": 1.4450671672821045, + "learning_rate": 2.895642390890349e-06, + "loss": 0.0889, + "num_input_tokens_seen": 113873456, + "step": 52765 + }, + { + "epoch": 8.608482871125611, + "grad_norm": 0.039071694016456604, + "learning_rate": 2.8923180705962226e-06, + "loss": 0.2039, + "num_input_tokens_seen": 113883536, + "step": 52770 + }, + { + "epoch": 8.609298531810767, + "grad_norm": 0.3051919937133789, + "learning_rate": 2.888995542462969e-06, + "loss": 0.0492, + "num_input_tokens_seen": 113893520, + "step": 52775 + }, + { + "epoch": 8.61011419249592, + "grad_norm": 0.13717685639858246, + "learning_rate": 2.885674806759925e-06, + "loss": 0.1823, + "num_input_tokens_seen": 113904592, + "step": 52780 + }, + { + "epoch": 8.610929853181077, + "grad_norm": 0.24603791534900665, + "learning_rate": 2.8823558637562893e-06, + "loss": 0.0745, + "num_input_tokens_seen": 113916272, + "step": 52785 + }, + { + "epoch": 8.611745513866232, + "grad_norm": 0.13063745200634003, + "learning_rate": 2.8790387137211105e-06, + "loss": 0.0368, + "num_input_tokens_seen": 113928336, + "step": 52790 + }, + { + "epoch": 8.612561174551386, + "grad_norm": 0.5238015055656433, + "learning_rate": 2.8757233569232933e-06, + "loss": 0.1674, + "num_input_tokens_seen": 113939920, + "step": 52795 + }, + { + "epoch": 8.613376835236542, + "grad_norm": 1.6543503999710083, + "learning_rate": 2.8724097936316004e-06, + "loss": 0.1196, + "num_input_tokens_seen": 113949616, + "step": 52800 + }, + { + "epoch": 8.614192495921696, + "grad_norm": 0.48371046781539917, + "learning_rate": 2.8690980241146415e-06, + "loss": 0.0517, + "num_input_tokens_seen": 113959408, + "step": 52805 + }, + { + "epoch": 8.615008156606851, + "grad_norm": 0.16601872444152832, + "learning_rate": 2.8657880486408884e-06, + "loss": 0.2979, + "num_input_tokens_seen": 113970736, + "step": 52810 + }, + { + "epoch": 8.615823817292007, + "grad_norm": 0.3921493589878082, + "learning_rate": 2.862479867478665e-06, + "loss": 0.1266, + "num_input_tokens_seen": 113981008, + "step": 52815 + }, + { + "epoch": 8.616639477977161, + "grad_norm": 0.22150687873363495, + "learning_rate": 2.859173480896149e-06, + "loss": 0.086, + "num_input_tokens_seen": 113992816, + "step": 52820 + }, + { + "epoch": 8.617455138662317, + "grad_norm": 0.9350141882896423, + "learning_rate": 2.85586888916137e-06, + "loss": 0.0583, + "num_input_tokens_seen": 114005392, + "step": 52825 + }, + { + "epoch": 8.61827079934747, + "grad_norm": 0.42017918825149536, + "learning_rate": 2.852566092542211e-06, + "loss": 0.0606, + "num_input_tokens_seen": 114015600, + "step": 52830 + }, + { + "epoch": 8.619086460032626, + "grad_norm": 1.8026763200759888, + "learning_rate": 2.8492650913064274e-06, + "loss": 0.2904, + "num_input_tokens_seen": 114026448, + "step": 52835 + }, + { + "epoch": 8.619902120717782, + "grad_norm": 1.0971359014511108, + "learning_rate": 2.8459658857216074e-06, + "loss": 0.1125, + "num_input_tokens_seen": 114037552, + "step": 52840 + }, + { + "epoch": 8.620717781402936, + "grad_norm": 0.11579275876283646, + "learning_rate": 2.8426684760551993e-06, + "loss": 0.1577, + "num_input_tokens_seen": 114049072, + "step": 52845 + }, + { + "epoch": 8.621533442088092, + "grad_norm": 1.5937447547912598, + "learning_rate": 2.83937286257451e-06, + "loss": 0.1387, + "num_input_tokens_seen": 114059824, + "step": 52850 + }, + { + "epoch": 8.622349102773246, + "grad_norm": 0.22455863654613495, + "learning_rate": 2.8360790455466996e-06, + "loss": 0.1678, + "num_input_tokens_seen": 114070960, + "step": 52855 + }, + { + "epoch": 8.623164763458401, + "grad_norm": 0.1995495706796646, + "learning_rate": 2.8327870252387727e-06, + "loss": 0.0638, + "num_input_tokens_seen": 114082224, + "step": 52860 + }, + { + "epoch": 8.623980424143557, + "grad_norm": 0.09466761350631714, + "learning_rate": 2.829496801917611e-06, + "loss": 0.0718, + "num_input_tokens_seen": 114091760, + "step": 52865 + }, + { + "epoch": 8.624796084828711, + "grad_norm": 1.5015623569488525, + "learning_rate": 2.826208375849931e-06, + "loss": 0.1209, + "num_input_tokens_seen": 114100912, + "step": 52870 + }, + { + "epoch": 8.625611745513867, + "grad_norm": 0.07469386607408524, + "learning_rate": 2.8229217473023094e-06, + "loss": 0.13, + "num_input_tokens_seen": 114111856, + "step": 52875 + }, + { + "epoch": 8.62642740619902, + "grad_norm": 1.5887863636016846, + "learning_rate": 2.8196369165411767e-06, + "loss": 0.2452, + "num_input_tokens_seen": 114121872, + "step": 52880 + }, + { + "epoch": 8.627243066884176, + "grad_norm": 1.0654709339141846, + "learning_rate": 2.8163538838328176e-06, + "loss": 0.1432, + "num_input_tokens_seen": 114132304, + "step": 52885 + }, + { + "epoch": 8.62805872756933, + "grad_norm": 1.5709645748138428, + "learning_rate": 2.8130726494433684e-06, + "loss": 0.2067, + "num_input_tokens_seen": 114142160, + "step": 52890 + }, + { + "epoch": 8.628874388254486, + "grad_norm": 0.7382910847663879, + "learning_rate": 2.8097932136388285e-06, + "loss": 0.1586, + "num_input_tokens_seen": 114153392, + "step": 52895 + }, + { + "epoch": 8.629690048939642, + "grad_norm": 0.20567677915096283, + "learning_rate": 2.8065155766850425e-06, + "loss": 0.0689, + "num_input_tokens_seen": 114165648, + "step": 52900 + }, + { + "epoch": 8.630505709624796, + "grad_norm": 0.4811864197254181, + "learning_rate": 2.8032397388477098e-06, + "loss": 0.1061, + "num_input_tokens_seen": 114175248, + "step": 52905 + }, + { + "epoch": 8.631321370309951, + "grad_norm": 0.8193597793579102, + "learning_rate": 2.799965700392393e-06, + "loss": 0.2718, + "num_input_tokens_seen": 114186256, + "step": 52910 + }, + { + "epoch": 8.632137030995105, + "grad_norm": 1.447900414466858, + "learning_rate": 2.7966934615844957e-06, + "loss": 0.0513, + "num_input_tokens_seen": 114197264, + "step": 52915 + }, + { + "epoch": 8.632952691680261, + "grad_norm": 1.00506591796875, + "learning_rate": 2.793423022689284e-06, + "loss": 0.0344, + "num_input_tokens_seen": 114208944, + "step": 52920 + }, + { + "epoch": 8.633768352365417, + "grad_norm": 0.11975077539682388, + "learning_rate": 2.7901543839718795e-06, + "loss": 0.1287, + "num_input_tokens_seen": 114219504, + "step": 52925 + }, + { + "epoch": 8.63458401305057, + "grad_norm": 0.16645459830760956, + "learning_rate": 2.7868875456972534e-06, + "loss": 0.0238, + "num_input_tokens_seen": 114229072, + "step": 52930 + }, + { + "epoch": 8.635399673735726, + "grad_norm": 0.6986544728279114, + "learning_rate": 2.783622508130229e-06, + "loss": 0.0974, + "num_input_tokens_seen": 114238768, + "step": 52935 + }, + { + "epoch": 8.63621533442088, + "grad_norm": 0.036802809685468674, + "learning_rate": 2.7803592715354877e-06, + "loss": 0.0116, + "num_input_tokens_seen": 114250256, + "step": 52940 + }, + { + "epoch": 8.637030995106036, + "grad_norm": 1.5074115991592407, + "learning_rate": 2.7770978361775667e-06, + "loss": 0.1615, + "num_input_tokens_seen": 114259696, + "step": 52945 + }, + { + "epoch": 8.63784665579119, + "grad_norm": 0.19832772016525269, + "learning_rate": 2.7738382023208526e-06, + "loss": 0.2279, + "num_input_tokens_seen": 114270288, + "step": 52950 + }, + { + "epoch": 8.638662316476346, + "grad_norm": 0.07972156256437302, + "learning_rate": 2.770580370229589e-06, + "loss": 0.0744, + "num_input_tokens_seen": 114280784, + "step": 52955 + }, + { + "epoch": 8.639477977161501, + "grad_norm": 0.9932041764259338, + "learning_rate": 2.7673243401678704e-06, + "loss": 0.0953, + "num_input_tokens_seen": 114290832, + "step": 52960 + }, + { + "epoch": 8.640293637846655, + "grad_norm": 0.784183919429779, + "learning_rate": 2.7640701123996445e-06, + "loss": 0.1179, + "num_input_tokens_seen": 114301648, + "step": 52965 + }, + { + "epoch": 8.641109298531811, + "grad_norm": 0.9033508896827698, + "learning_rate": 2.7608176871887242e-06, + "loss": 0.0975, + "num_input_tokens_seen": 114311728, + "step": 52970 + }, + { + "epoch": 8.641924959216965, + "grad_norm": 0.4668687880039215, + "learning_rate": 2.7575670647987606e-06, + "loss": 0.1928, + "num_input_tokens_seen": 114322512, + "step": 52975 + }, + { + "epoch": 8.64274061990212, + "grad_norm": 0.566303551197052, + "learning_rate": 2.7543182454932705e-06, + "loss": 0.1504, + "num_input_tokens_seen": 114332592, + "step": 52980 + }, + { + "epoch": 8.643556280587276, + "grad_norm": 1.3572642803192139, + "learning_rate": 2.751071229535615e-06, + "loss": 0.2335, + "num_input_tokens_seen": 114343696, + "step": 52985 + }, + { + "epoch": 8.64437194127243, + "grad_norm": 0.14990243315696716, + "learning_rate": 2.7478260171890175e-06, + "loss": 0.1551, + "num_input_tokens_seen": 114354928, + "step": 52990 + }, + { + "epoch": 8.645187601957586, + "grad_norm": 0.8234283328056335, + "learning_rate": 2.744582608716548e-06, + "loss": 0.0627, + "num_input_tokens_seen": 114365392, + "step": 52995 + }, + { + "epoch": 8.64600326264274, + "grad_norm": 0.6338955163955688, + "learning_rate": 2.741341004381129e-06, + "loss": 0.1067, + "num_input_tokens_seen": 114374992, + "step": 53000 + }, + { + "epoch": 8.646818923327896, + "grad_norm": 1.4306365251541138, + "learning_rate": 2.7381012044455535e-06, + "loss": 0.0499, + "num_input_tokens_seen": 114385808, + "step": 53005 + }, + { + "epoch": 8.647634584013051, + "grad_norm": 0.7121531367301941, + "learning_rate": 2.73486320917245e-06, + "loss": 0.0905, + "num_input_tokens_seen": 114397072, + "step": 53010 + }, + { + "epoch": 8.648450244698205, + "grad_norm": 0.9801326394081116, + "learning_rate": 2.7316270188243064e-06, + "loss": 0.0709, + "num_input_tokens_seen": 114408624, + "step": 53015 + }, + { + "epoch": 8.649265905383361, + "grad_norm": 0.12916554510593414, + "learning_rate": 2.728392633663468e-06, + "loss": 0.228, + "num_input_tokens_seen": 114418512, + "step": 53020 + }, + { + "epoch": 8.650081566068515, + "grad_norm": 1.480373740196228, + "learning_rate": 2.7251600539521248e-06, + "loss": 0.0626, + "num_input_tokens_seen": 114427792, + "step": 53025 + }, + { + "epoch": 8.65089722675367, + "grad_norm": 0.17165900766849518, + "learning_rate": 2.7219292799523316e-06, + "loss": 0.3602, + "num_input_tokens_seen": 114439888, + "step": 53030 + }, + { + "epoch": 8.651712887438826, + "grad_norm": 0.38494476675987244, + "learning_rate": 2.718700311925987e-06, + "loss": 0.0633, + "num_input_tokens_seen": 114451280, + "step": 53035 + }, + { + "epoch": 8.65252854812398, + "grad_norm": 0.594310998916626, + "learning_rate": 2.715473150134848e-06, + "loss": 0.0976, + "num_input_tokens_seen": 114463376, + "step": 53040 + }, + { + "epoch": 8.653344208809136, + "grad_norm": 0.2046806663274765, + "learning_rate": 2.7122477948405277e-06, + "loss": 0.0794, + "num_input_tokens_seen": 114474896, + "step": 53045 + }, + { + "epoch": 8.65415986949429, + "grad_norm": 0.06330850720405579, + "learning_rate": 2.7090242463044896e-06, + "loss": 0.131, + "num_input_tokens_seen": 114486000, + "step": 53050 + }, + { + "epoch": 8.654975530179446, + "grad_norm": 0.04402468726038933, + "learning_rate": 2.7058025047880466e-06, + "loss": 0.0796, + "num_input_tokens_seen": 114497456, + "step": 53055 + }, + { + "epoch": 8.655791190864601, + "grad_norm": 0.4111977517604828, + "learning_rate": 2.702582570552373e-06, + "loss": 0.0296, + "num_input_tokens_seen": 114508368, + "step": 53060 + }, + { + "epoch": 8.656606851549755, + "grad_norm": 0.7463168501853943, + "learning_rate": 2.699364443858493e-06, + "loss": 0.1657, + "num_input_tokens_seen": 114519888, + "step": 53065 + }, + { + "epoch": 8.65742251223491, + "grad_norm": 0.096220463514328, + "learning_rate": 2.6961481249672765e-06, + "loss": 0.1663, + "num_input_tokens_seen": 114529712, + "step": 53070 + }, + { + "epoch": 8.658238172920065, + "grad_norm": 0.5814322829246521, + "learning_rate": 2.69293361413947e-06, + "loss": 0.1671, + "num_input_tokens_seen": 114540528, + "step": 53075 + }, + { + "epoch": 8.65905383360522, + "grad_norm": 0.5565184354782104, + "learning_rate": 2.6897209116356457e-06, + "loss": 0.1911, + "num_input_tokens_seen": 114552016, + "step": 53080 + }, + { + "epoch": 8.659869494290374, + "grad_norm": 0.8637263774871826, + "learning_rate": 2.6865100177162484e-06, + "loss": 0.0353, + "num_input_tokens_seen": 114562672, + "step": 53085 + }, + { + "epoch": 8.66068515497553, + "grad_norm": 0.0872642993927002, + "learning_rate": 2.6833009326415663e-06, + "loss": 0.0494, + "num_input_tokens_seen": 114574320, + "step": 53090 + }, + { + "epoch": 8.661500815660686, + "grad_norm": 0.26574015617370605, + "learning_rate": 2.680093656671745e-06, + "loss": 0.1897, + "num_input_tokens_seen": 114585456, + "step": 53095 + }, + { + "epoch": 8.66231647634584, + "grad_norm": 0.043000686913728714, + "learning_rate": 2.6768881900667787e-06, + "loss": 0.1139, + "num_input_tokens_seen": 114596400, + "step": 53100 + }, + { + "epoch": 8.663132137030995, + "grad_norm": 1.1962493658065796, + "learning_rate": 2.673684533086526e-06, + "loss": 0.072, + "num_input_tokens_seen": 114607120, + "step": 53105 + }, + { + "epoch": 8.66394779771615, + "grad_norm": 0.7600804567337036, + "learning_rate": 2.6704826859906858e-06, + "loss": 0.0635, + "num_input_tokens_seen": 114617808, + "step": 53110 + }, + { + "epoch": 8.664763458401305, + "grad_norm": 0.6240618228912354, + "learning_rate": 2.667282649038816e-06, + "loss": 0.07, + "num_input_tokens_seen": 114627664, + "step": 53115 + }, + { + "epoch": 8.66557911908646, + "grad_norm": 0.04096299037337303, + "learning_rate": 2.6640844224903318e-06, + "loss": 0.0916, + "num_input_tokens_seen": 114638064, + "step": 53120 + }, + { + "epoch": 8.666394779771615, + "grad_norm": 0.04987310245633125, + "learning_rate": 2.660888006604498e-06, + "loss": 0.1943, + "num_input_tokens_seen": 114647760, + "step": 53125 + }, + { + "epoch": 8.66721044045677, + "grad_norm": 1.0766115188598633, + "learning_rate": 2.6576934016404264e-06, + "loss": 0.14, + "num_input_tokens_seen": 114658192, + "step": 53130 + }, + { + "epoch": 8.668026101141924, + "grad_norm": 0.0791340172290802, + "learning_rate": 2.654500607857091e-06, + "loss": 0.0127, + "num_input_tokens_seen": 114669392, + "step": 53135 + }, + { + "epoch": 8.66884176182708, + "grad_norm": 1.284918189048767, + "learning_rate": 2.651309625513318e-06, + "loss": 0.168, + "num_input_tokens_seen": 114681168, + "step": 53140 + }, + { + "epoch": 8.669657422512234, + "grad_norm": 1.7157994508743286, + "learning_rate": 2.648120454867778e-06, + "loss": 0.0878, + "num_input_tokens_seen": 114692688, + "step": 53145 + }, + { + "epoch": 8.67047308319739, + "grad_norm": 1.9531432390213013, + "learning_rate": 2.6449330961790116e-06, + "loss": 0.1256, + "num_input_tokens_seen": 114703472, + "step": 53150 + }, + { + "epoch": 8.671288743882545, + "grad_norm": 0.053308065980672836, + "learning_rate": 2.641747549705395e-06, + "loss": 0.1005, + "num_input_tokens_seen": 114714704, + "step": 53155 + }, + { + "epoch": 8.6721044045677, + "grad_norm": 0.6297826170921326, + "learning_rate": 2.638563815705167e-06, + "loss": 0.0366, + "num_input_tokens_seen": 114725200, + "step": 53160 + }, + { + "epoch": 8.672920065252855, + "grad_norm": 0.42495012283325195, + "learning_rate": 2.635381894436417e-06, + "loss": 0.0788, + "num_input_tokens_seen": 114735568, + "step": 53165 + }, + { + "epoch": 8.673735725938009, + "grad_norm": 0.05451592803001404, + "learning_rate": 2.63220178615709e-06, + "loss": 0.0422, + "num_input_tokens_seen": 114746448, + "step": 53170 + }, + { + "epoch": 8.674551386623165, + "grad_norm": 0.7795166373252869, + "learning_rate": 2.629023491124971e-06, + "loss": 0.1336, + "num_input_tokens_seen": 114758704, + "step": 53175 + }, + { + "epoch": 8.67536704730832, + "grad_norm": 0.37831220030784607, + "learning_rate": 2.6258470095977262e-06, + "loss": 0.1283, + "num_input_tokens_seen": 114769744, + "step": 53180 + }, + { + "epoch": 8.676182707993474, + "grad_norm": 1.3120646476745605, + "learning_rate": 2.6226723418328437e-06, + "loss": 0.0765, + "num_input_tokens_seen": 114780112, + "step": 53185 + }, + { + "epoch": 8.67699836867863, + "grad_norm": 1.616089940071106, + "learning_rate": 2.6194994880876843e-06, + "loss": 0.1375, + "num_input_tokens_seen": 114790384, + "step": 53190 + }, + { + "epoch": 8.677814029363784, + "grad_norm": 0.36766770482063293, + "learning_rate": 2.616328448619454e-06, + "loss": 0.026, + "num_input_tokens_seen": 114801168, + "step": 53195 + }, + { + "epoch": 8.67862969004894, + "grad_norm": 1.2545886039733887, + "learning_rate": 2.613159223685213e-06, + "loss": 0.1824, + "num_input_tokens_seen": 114812560, + "step": 53200 + }, + { + "epoch": 8.679445350734095, + "grad_norm": 0.999679446220398, + "learning_rate": 2.609991813541876e-06, + "loss": 0.1101, + "num_input_tokens_seen": 114823664, + "step": 53205 + }, + { + "epoch": 8.68026101141925, + "grad_norm": 0.758983314037323, + "learning_rate": 2.6068262184462066e-06, + "loss": 0.0448, + "num_input_tokens_seen": 114835056, + "step": 53210 + }, + { + "epoch": 8.681076672104405, + "grad_norm": 0.16514311730861664, + "learning_rate": 2.6036624386548277e-06, + "loss": 0.2383, + "num_input_tokens_seen": 114845744, + "step": 53215 + }, + { + "epoch": 8.681892332789559, + "grad_norm": 0.5035471320152283, + "learning_rate": 2.6005004744242082e-06, + "loss": 0.1569, + "num_input_tokens_seen": 114857072, + "step": 53220 + }, + { + "epoch": 8.682707993474715, + "grad_norm": 0.5858597755432129, + "learning_rate": 2.597340326010675e-06, + "loss": 0.0828, + "num_input_tokens_seen": 114868304, + "step": 53225 + }, + { + "epoch": 8.68352365415987, + "grad_norm": 1.3059396743774414, + "learning_rate": 2.5941819936704053e-06, + "loss": 0.1288, + "num_input_tokens_seen": 114877360, + "step": 53230 + }, + { + "epoch": 8.684339314845024, + "grad_norm": 0.056514922529459, + "learning_rate": 2.5910254776594256e-06, + "loss": 0.0729, + "num_input_tokens_seen": 114888496, + "step": 53235 + }, + { + "epoch": 8.68515497553018, + "grad_norm": 1.4017808437347412, + "learning_rate": 2.587870778233625e-06, + "loss": 0.1968, + "num_input_tokens_seen": 114900656, + "step": 53240 + }, + { + "epoch": 8.685970636215334, + "grad_norm": 0.08449652791023254, + "learning_rate": 2.584717895648739e-06, + "loss": 0.0425, + "num_input_tokens_seen": 114912208, + "step": 53245 + }, + { + "epoch": 8.68678629690049, + "grad_norm": 2.871241807937622, + "learning_rate": 2.5815668301603537e-06, + "loss": 0.212, + "num_input_tokens_seen": 114924144, + "step": 53250 + }, + { + "epoch": 8.687601957585644, + "grad_norm": 0.4201962351799011, + "learning_rate": 2.5784175820239094e-06, + "loss": 0.1253, + "num_input_tokens_seen": 114936496, + "step": 53255 + }, + { + "epoch": 8.6884176182708, + "grad_norm": 0.3619503676891327, + "learning_rate": 2.575270151494702e-06, + "loss": 0.2282, + "num_input_tokens_seen": 114946928, + "step": 53260 + }, + { + "epoch": 8.689233278955955, + "grad_norm": 1.1953823566436768, + "learning_rate": 2.5721245388278805e-06, + "loss": 0.0842, + "num_input_tokens_seen": 114957392, + "step": 53265 + }, + { + "epoch": 8.690048939641109, + "grad_norm": 0.16350014507770538, + "learning_rate": 2.5689807442784404e-06, + "loss": 0.1036, + "num_input_tokens_seen": 114968816, + "step": 53270 + }, + { + "epoch": 8.690864600326265, + "grad_norm": 1.0026743412017822, + "learning_rate": 2.5658387681012337e-06, + "loss": 0.0826, + "num_input_tokens_seen": 114979408, + "step": 53275 + }, + { + "epoch": 8.691680261011419, + "grad_norm": 1.2731012105941772, + "learning_rate": 2.5626986105509677e-06, + "loss": 0.1346, + "num_input_tokens_seen": 114989776, + "step": 53280 + }, + { + "epoch": 8.692495921696574, + "grad_norm": 0.4595658481121063, + "learning_rate": 2.5595602718821916e-06, + "loss": 0.1737, + "num_input_tokens_seen": 115000976, + "step": 53285 + }, + { + "epoch": 8.69331158238173, + "grad_norm": 0.22089940309524536, + "learning_rate": 2.5564237523493295e-06, + "loss": 0.149, + "num_input_tokens_seen": 115010608, + "step": 53290 + }, + { + "epoch": 8.694127243066884, + "grad_norm": 1.7821305990219116, + "learning_rate": 2.553289052206634e-06, + "loss": 0.1482, + "num_input_tokens_seen": 115021904, + "step": 53295 + }, + { + "epoch": 8.69494290375204, + "grad_norm": 1.5566684007644653, + "learning_rate": 2.5501561717082204e-06, + "loss": 0.0907, + "num_input_tokens_seen": 115032240, + "step": 53300 + }, + { + "epoch": 8.695758564437194, + "grad_norm": 0.15035445988178253, + "learning_rate": 2.547025111108056e-06, + "loss": 0.1184, + "num_input_tokens_seen": 115043024, + "step": 53305 + }, + { + "epoch": 8.69657422512235, + "grad_norm": 2.0598928928375244, + "learning_rate": 2.5438958706599623e-06, + "loss": 0.2544, + "num_input_tokens_seen": 115053584, + "step": 53310 + }, + { + "epoch": 8.697389885807503, + "grad_norm": 0.18820098042488098, + "learning_rate": 2.540768450617609e-06, + "loss": 0.1695, + "num_input_tokens_seen": 115064368, + "step": 53315 + }, + { + "epoch": 8.698205546492659, + "grad_norm": 1.0812782049179077, + "learning_rate": 2.537642851234523e-06, + "loss": 0.0952, + "num_input_tokens_seen": 115073712, + "step": 53320 + }, + { + "epoch": 8.699021207177815, + "grad_norm": 1.344388484954834, + "learning_rate": 2.5345190727640828e-06, + "loss": 0.1219, + "num_input_tokens_seen": 115085616, + "step": 53325 + }, + { + "epoch": 8.699836867862969, + "grad_norm": 1.4210807085037231, + "learning_rate": 2.5313971154595135e-06, + "loss": 0.19, + "num_input_tokens_seen": 115096080, + "step": 53330 + }, + { + "epoch": 8.700652528548124, + "grad_norm": 0.1463891565799713, + "learning_rate": 2.5282769795738987e-06, + "loss": 0.0834, + "num_input_tokens_seen": 115107344, + "step": 53335 + }, + { + "epoch": 8.701468189233278, + "grad_norm": 1.1039332151412964, + "learning_rate": 2.5251586653601722e-06, + "loss": 0.1493, + "num_input_tokens_seen": 115117712, + "step": 53340 + }, + { + "epoch": 8.702283849918434, + "grad_norm": 1.475583553314209, + "learning_rate": 2.522042173071121e-06, + "loss": 0.2312, + "num_input_tokens_seen": 115128048, + "step": 53345 + }, + { + "epoch": 8.70309951060359, + "grad_norm": 0.07941869646310806, + "learning_rate": 2.518927502959384e-06, + "loss": 0.0724, + "num_input_tokens_seen": 115139152, + "step": 53350 + }, + { + "epoch": 8.703915171288743, + "grad_norm": 1.6535165309906006, + "learning_rate": 2.5158146552774486e-06, + "loss": 0.1776, + "num_input_tokens_seen": 115150032, + "step": 53355 + }, + { + "epoch": 8.7047308319739, + "grad_norm": 0.1567421853542328, + "learning_rate": 2.512703630277663e-06, + "loss": 0.0164, + "num_input_tokens_seen": 115159536, + "step": 53360 + }, + { + "epoch": 8.705546492659053, + "grad_norm": 1.0619796514511108, + "learning_rate": 2.5095944282122226e-06, + "loss": 0.168, + "num_input_tokens_seen": 115170576, + "step": 53365 + }, + { + "epoch": 8.706362153344209, + "grad_norm": 0.05233163759112358, + "learning_rate": 2.50648704933317e-06, + "loss": 0.1882, + "num_input_tokens_seen": 115182160, + "step": 53370 + }, + { + "epoch": 8.707177814029365, + "grad_norm": 0.2206847071647644, + "learning_rate": 2.5033814938924095e-06, + "loss": 0.0676, + "num_input_tokens_seen": 115190960, + "step": 53375 + }, + { + "epoch": 8.707993474714518, + "grad_norm": 0.1061377003788948, + "learning_rate": 2.500277762141692e-06, + "loss": 0.032, + "num_input_tokens_seen": 115201680, + "step": 53380 + }, + { + "epoch": 8.708809135399674, + "grad_norm": 1.0410444736480713, + "learning_rate": 2.49717585433262e-06, + "loss": 0.1269, + "num_input_tokens_seen": 115212496, + "step": 53385 + }, + { + "epoch": 8.709624796084828, + "grad_norm": 3.707493305206299, + "learning_rate": 2.4940757707166474e-06, + "loss": 0.2253, + "num_input_tokens_seen": 115223888, + "step": 53390 + }, + { + "epoch": 8.710440456769984, + "grad_norm": 0.15944641828536987, + "learning_rate": 2.490977511545092e-06, + "loss": 0.1122, + "num_input_tokens_seen": 115233616, + "step": 53395 + }, + { + "epoch": 8.71125611745514, + "grad_norm": 0.7454949021339417, + "learning_rate": 2.4878810770691096e-06, + "loss": 0.0815, + "num_input_tokens_seen": 115244496, + "step": 53400 + }, + { + "epoch": 8.712071778140293, + "grad_norm": 1.0083587169647217, + "learning_rate": 2.48478646753971e-06, + "loss": 0.1501, + "num_input_tokens_seen": 115255216, + "step": 53405 + }, + { + "epoch": 8.71288743882545, + "grad_norm": 0.38035282492637634, + "learning_rate": 2.4816936832077615e-06, + "loss": 0.0413, + "num_input_tokens_seen": 115264720, + "step": 53410 + }, + { + "epoch": 8.713703099510603, + "grad_norm": 0.6182409524917603, + "learning_rate": 2.478602724323981e-06, + "loss": 0.2822, + "num_input_tokens_seen": 115276080, + "step": 53415 + }, + { + "epoch": 8.714518760195759, + "grad_norm": 0.158734530210495, + "learning_rate": 2.4755135911389364e-06, + "loss": 0.1066, + "num_input_tokens_seen": 115286576, + "step": 53420 + }, + { + "epoch": 8.715334420880914, + "grad_norm": 0.40942272543907166, + "learning_rate": 2.472426283903048e-06, + "loss": 0.1039, + "num_input_tokens_seen": 115297200, + "step": 53425 + }, + { + "epoch": 8.716150081566068, + "grad_norm": 0.10592272877693176, + "learning_rate": 2.4693408028665878e-06, + "loss": 0.0331, + "num_input_tokens_seen": 115308240, + "step": 53430 + }, + { + "epoch": 8.716965742251224, + "grad_norm": 0.21725155413150787, + "learning_rate": 2.4662571482796797e-06, + "loss": 0.0336, + "num_input_tokens_seen": 115319600, + "step": 53435 + }, + { + "epoch": 8.717781402936378, + "grad_norm": 0.7215107083320618, + "learning_rate": 2.4631753203923052e-06, + "loss": 0.0607, + "num_input_tokens_seen": 115330736, + "step": 53440 + }, + { + "epoch": 8.718597063621534, + "grad_norm": 0.04647787660360336, + "learning_rate": 2.460095319454289e-06, + "loss": 0.3449, + "num_input_tokens_seen": 115342672, + "step": 53445 + }, + { + "epoch": 8.719412724306688, + "grad_norm": 0.0410722978413105, + "learning_rate": 2.4570171457153123e-06, + "loss": 0.2964, + "num_input_tokens_seen": 115354320, + "step": 53450 + }, + { + "epoch": 8.720228384991843, + "grad_norm": 1.1738767623901367, + "learning_rate": 2.4539407994249088e-06, + "loss": 0.3308, + "num_input_tokens_seen": 115364880, + "step": 53455 + }, + { + "epoch": 8.721044045676999, + "grad_norm": 0.24747391045093536, + "learning_rate": 2.450866280832456e-06, + "loss": 0.1207, + "num_input_tokens_seen": 115375824, + "step": 53460 + }, + { + "epoch": 8.721859706362153, + "grad_norm": 0.2782943844795227, + "learning_rate": 2.4477935901872e-06, + "loss": 0.1562, + "num_input_tokens_seen": 115386992, + "step": 53465 + }, + { + "epoch": 8.722675367047309, + "grad_norm": 0.10715095698833466, + "learning_rate": 2.4447227277382244e-06, + "loss": 0.114, + "num_input_tokens_seen": 115397008, + "step": 53470 + }, + { + "epoch": 8.723491027732463, + "grad_norm": 0.1677139699459076, + "learning_rate": 2.441653693734472e-06, + "loss": 0.1838, + "num_input_tokens_seen": 115407888, + "step": 53475 + }, + { + "epoch": 8.724306688417618, + "grad_norm": 0.887319803237915, + "learning_rate": 2.438586488424727e-06, + "loss": 0.1613, + "num_input_tokens_seen": 115417936, + "step": 53480 + }, + { + "epoch": 8.725122349102774, + "grad_norm": 0.40163660049438477, + "learning_rate": 2.435521112057637e-06, + "loss": 0.0885, + "num_input_tokens_seen": 115429200, + "step": 53485 + }, + { + "epoch": 8.725938009787928, + "grad_norm": 0.09424107521772385, + "learning_rate": 2.432457564881699e-06, + "loss": 0.098, + "num_input_tokens_seen": 115439920, + "step": 53490 + }, + { + "epoch": 8.726753670473084, + "grad_norm": 0.5115509629249573, + "learning_rate": 2.429395847145252e-06, + "loss": 0.1152, + "num_input_tokens_seen": 115450608, + "step": 53495 + }, + { + "epoch": 8.727569331158238, + "grad_norm": 0.46444177627563477, + "learning_rate": 2.4263359590965042e-06, + "loss": 0.1248, + "num_input_tokens_seen": 115461680, + "step": 53500 + }, + { + "epoch": 8.728384991843393, + "grad_norm": 0.9053292870521545, + "learning_rate": 2.4232779009835006e-06, + "loss": 0.033, + "num_input_tokens_seen": 115472752, + "step": 53505 + }, + { + "epoch": 8.729200652528547, + "grad_norm": 1.067772626876831, + "learning_rate": 2.420221673054143e-06, + "loss": 0.2328, + "num_input_tokens_seen": 115483536, + "step": 53510 + }, + { + "epoch": 8.730016313213703, + "grad_norm": 0.5185569524765015, + "learning_rate": 2.417167275556187e-06, + "loss": 0.0498, + "num_input_tokens_seen": 115494736, + "step": 53515 + }, + { + "epoch": 8.730831973898859, + "grad_norm": 0.03891690820455551, + "learning_rate": 2.4141147087372336e-06, + "loss": 0.1345, + "num_input_tokens_seen": 115505552, + "step": 53520 + }, + { + "epoch": 8.731647634584013, + "grad_norm": 0.5866448283195496, + "learning_rate": 2.4110639728447433e-06, + "loss": 0.2133, + "num_input_tokens_seen": 115515920, + "step": 53525 + }, + { + "epoch": 8.732463295269168, + "grad_norm": 1.0881894826889038, + "learning_rate": 2.4080150681260212e-06, + "loss": 0.2325, + "num_input_tokens_seen": 115526896, + "step": 53530 + }, + { + "epoch": 8.733278955954322, + "grad_norm": 0.7996687889099121, + "learning_rate": 2.4049679948282305e-06, + "loss": 0.1705, + "num_input_tokens_seen": 115536912, + "step": 53535 + }, + { + "epoch": 8.734094616639478, + "grad_norm": 0.45565545558929443, + "learning_rate": 2.401922753198377e-06, + "loss": 0.0577, + "num_input_tokens_seen": 115547728, + "step": 53540 + }, + { + "epoch": 8.734910277324634, + "grad_norm": 0.5291571021080017, + "learning_rate": 2.398879343483329e-06, + "loss": 0.0412, + "num_input_tokens_seen": 115559184, + "step": 53545 + }, + { + "epoch": 8.735725938009788, + "grad_norm": 0.06101206690073013, + "learning_rate": 2.3958377659297983e-06, + "loss": 0.1196, + "num_input_tokens_seen": 115569552, + "step": 53550 + }, + { + "epoch": 8.736541598694943, + "grad_norm": 1.821320652961731, + "learning_rate": 2.392798020784348e-06, + "loss": 0.1946, + "num_input_tokens_seen": 115578928, + "step": 53555 + }, + { + "epoch": 8.737357259380097, + "grad_norm": 0.6122424006462097, + "learning_rate": 2.3897601082934013e-06, + "loss": 0.0922, + "num_input_tokens_seen": 115589904, + "step": 53560 + }, + { + "epoch": 8.738172920065253, + "grad_norm": 0.39287638664245605, + "learning_rate": 2.3867240287032214e-06, + "loss": 0.0847, + "num_input_tokens_seen": 115599440, + "step": 53565 + }, + { + "epoch": 8.738988580750409, + "grad_norm": 0.25171756744384766, + "learning_rate": 2.3836897822599317e-06, + "loss": 0.1084, + "num_input_tokens_seen": 115610064, + "step": 53570 + }, + { + "epoch": 8.739804241435563, + "grad_norm": 1.9051768779754639, + "learning_rate": 2.380657369209502e-06, + "loss": 0.2357, + "num_input_tokens_seen": 115621360, + "step": 53575 + }, + { + "epoch": 8.740619902120718, + "grad_norm": 0.3359629511833191, + "learning_rate": 2.3776267897977543e-06, + "loss": 0.2021, + "num_input_tokens_seen": 115632272, + "step": 53580 + }, + { + "epoch": 8.741435562805872, + "grad_norm": 0.3470095098018646, + "learning_rate": 2.374598044270365e-06, + "loss": 0.054, + "num_input_tokens_seen": 115643280, + "step": 53585 + }, + { + "epoch": 8.742251223491028, + "grad_norm": 0.07509925216436386, + "learning_rate": 2.3715711328728575e-06, + "loss": 0.1127, + "num_input_tokens_seen": 115654128, + "step": 53590 + }, + { + "epoch": 8.743066884176184, + "grad_norm": 0.38587522506713867, + "learning_rate": 2.3685460558506097e-06, + "loss": 0.0723, + "num_input_tokens_seen": 115664528, + "step": 53595 + }, + { + "epoch": 8.743882544861338, + "grad_norm": 1.4486533403396606, + "learning_rate": 2.3655228134488505e-06, + "loss": 0.1988, + "num_input_tokens_seen": 115675024, + "step": 53600 + }, + { + "epoch": 8.744698205546493, + "grad_norm": 0.22526241838932037, + "learning_rate": 2.362501405912651e-06, + "loss": 0.0939, + "num_input_tokens_seen": 115687056, + "step": 53605 + }, + { + "epoch": 8.745513866231647, + "grad_norm": 0.26714271306991577, + "learning_rate": 2.3594818334869568e-06, + "loss": 0.1441, + "num_input_tokens_seen": 115697840, + "step": 53610 + }, + { + "epoch": 8.746329526916803, + "grad_norm": 0.5713862776756287, + "learning_rate": 2.3564640964165386e-06, + "loss": 0.0302, + "num_input_tokens_seen": 115708848, + "step": 53615 + }, + { + "epoch": 8.747145187601957, + "grad_norm": 0.6749308705329895, + "learning_rate": 2.353448194946037e-06, + "loss": 0.2939, + "num_input_tokens_seen": 115719568, + "step": 53620 + }, + { + "epoch": 8.747960848287113, + "grad_norm": 0.20281775295734406, + "learning_rate": 2.350434129319928e-06, + "loss": 0.1028, + "num_input_tokens_seen": 115730768, + "step": 53625 + }, + { + "epoch": 8.748776508972268, + "grad_norm": 1.1449824571609497, + "learning_rate": 2.347421899782551e-06, + "loss": 0.0394, + "num_input_tokens_seen": 115741840, + "step": 53630 + }, + { + "epoch": 8.749592169657422, + "grad_norm": 0.03921917825937271, + "learning_rate": 2.3444115065780953e-06, + "loss": 0.1157, + "num_input_tokens_seen": 115752208, + "step": 53635 + }, + { + "epoch": 8.750407830342578, + "grad_norm": 0.34067925810813904, + "learning_rate": 2.341402949950594e-06, + "loss": 0.0405, + "num_input_tokens_seen": 115762992, + "step": 53640 + }, + { + "epoch": 8.751223491027732, + "grad_norm": 0.7033089399337769, + "learning_rate": 2.338396230143941e-06, + "loss": 0.1624, + "num_input_tokens_seen": 115774224, + "step": 53645 + }, + { + "epoch": 8.752039151712887, + "grad_norm": 0.8135300278663635, + "learning_rate": 2.335391347401872e-06, + "loss": 0.0427, + "num_input_tokens_seen": 115785776, + "step": 53650 + }, + { + "epoch": 8.752854812398043, + "grad_norm": 2.479393243789673, + "learning_rate": 2.3323883019679805e-06, + "loss": 0.2024, + "num_input_tokens_seen": 115796656, + "step": 53655 + }, + { + "epoch": 8.753670473083197, + "grad_norm": 1.877139687538147, + "learning_rate": 2.3293870940857084e-06, + "loss": 0.1324, + "num_input_tokens_seen": 115808272, + "step": 53660 + }, + { + "epoch": 8.754486133768353, + "grad_norm": 0.5668944716453552, + "learning_rate": 2.326387723998347e-06, + "loss": 0.0882, + "num_input_tokens_seen": 115820144, + "step": 53665 + }, + { + "epoch": 8.755301794453507, + "grad_norm": 0.4127062261104584, + "learning_rate": 2.3233901919490404e-06, + "loss": 0.2738, + "num_input_tokens_seen": 115831888, + "step": 53670 + }, + { + "epoch": 8.756117455138662, + "grad_norm": 0.8055358529090881, + "learning_rate": 2.3203944981807835e-06, + "loss": 0.0803, + "num_input_tokens_seen": 115843024, + "step": 53675 + }, + { + "epoch": 8.756933115823816, + "grad_norm": 1.0925915241241455, + "learning_rate": 2.3174006429364263e-06, + "loss": 0.1509, + "num_input_tokens_seen": 115854288, + "step": 53680 + }, + { + "epoch": 8.757748776508972, + "grad_norm": 0.9758503437042236, + "learning_rate": 2.314408626458664e-06, + "loss": 0.1052, + "num_input_tokens_seen": 115865296, + "step": 53685 + }, + { + "epoch": 8.758564437194128, + "grad_norm": 0.6875287890434265, + "learning_rate": 2.311418448990041e-06, + "loss": 0.075, + "num_input_tokens_seen": 115876304, + "step": 53690 + }, + { + "epoch": 8.759380097879282, + "grad_norm": 0.9202595353126526, + "learning_rate": 2.3084301107729633e-06, + "loss": 0.0856, + "num_input_tokens_seen": 115887632, + "step": 53695 + }, + { + "epoch": 8.760195758564437, + "grad_norm": 0.5518066883087158, + "learning_rate": 2.3054436120496736e-06, + "loss": 0.1583, + "num_input_tokens_seen": 115898640, + "step": 53700 + }, + { + "epoch": 8.761011419249591, + "grad_norm": 0.7494092583656311, + "learning_rate": 2.302458953062275e-06, + "loss": 0.0631, + "num_input_tokens_seen": 115909392, + "step": 53705 + }, + { + "epoch": 8.761827079934747, + "grad_norm": 0.17080530524253845, + "learning_rate": 2.2994761340527195e-06, + "loss": 0.0718, + "num_input_tokens_seen": 115920080, + "step": 53710 + }, + { + "epoch": 8.762642740619903, + "grad_norm": 0.5469005107879639, + "learning_rate": 2.2964951552628096e-06, + "loss": 0.0528, + "num_input_tokens_seen": 115930640, + "step": 53715 + }, + { + "epoch": 8.763458401305057, + "grad_norm": 0.8480032682418823, + "learning_rate": 2.293516016934202e-06, + "loss": 0.0786, + "num_input_tokens_seen": 115941392, + "step": 53720 + }, + { + "epoch": 8.764274061990212, + "grad_norm": 0.43026790022850037, + "learning_rate": 2.2905387193083965e-06, + "loss": 0.0833, + "num_input_tokens_seen": 115952176, + "step": 53725 + }, + { + "epoch": 8.765089722675366, + "grad_norm": 0.7630295157432556, + "learning_rate": 2.287563262626749e-06, + "loss": 0.132, + "num_input_tokens_seen": 115962896, + "step": 53730 + }, + { + "epoch": 8.765905383360522, + "grad_norm": 1.3966418504714966, + "learning_rate": 2.2845896471304667e-06, + "loss": 0.0771, + "num_input_tokens_seen": 115972944, + "step": 53735 + }, + { + "epoch": 8.766721044045678, + "grad_norm": 0.22578050196170807, + "learning_rate": 2.2816178730606012e-06, + "loss": 0.1069, + "num_input_tokens_seen": 115983600, + "step": 53740 + }, + { + "epoch": 8.767536704730832, + "grad_norm": 0.09750419110059738, + "learning_rate": 2.2786479406580658e-06, + "loss": 0.1903, + "num_input_tokens_seen": 115994832, + "step": 53745 + }, + { + "epoch": 8.768352365415987, + "grad_norm": 0.04400048404932022, + "learning_rate": 2.2756798501636146e-06, + "loss": 0.0291, + "num_input_tokens_seen": 116004848, + "step": 53750 + }, + { + "epoch": 8.769168026101141, + "grad_norm": 0.5450733304023743, + "learning_rate": 2.272713601817855e-06, + "loss": 0.072, + "num_input_tokens_seen": 116016016, + "step": 53755 + }, + { + "epoch": 8.769983686786297, + "grad_norm": 0.15839974582195282, + "learning_rate": 2.269749195861251e-06, + "loss": 0.0739, + "num_input_tokens_seen": 116027088, + "step": 53760 + }, + { + "epoch": 8.770799347471453, + "grad_norm": 0.6500880122184753, + "learning_rate": 2.266786632534107e-06, + "loss": 0.1674, + "num_input_tokens_seen": 116038192, + "step": 53765 + }, + { + "epoch": 8.771615008156607, + "grad_norm": 0.10043756663799286, + "learning_rate": 2.2638259120765864e-06, + "loss": 0.164, + "num_input_tokens_seen": 116048272, + "step": 53770 + }, + { + "epoch": 8.772430668841762, + "grad_norm": 0.8751306533813477, + "learning_rate": 2.2608670347286947e-06, + "loss": 0.1021, + "num_input_tokens_seen": 116060112, + "step": 53775 + }, + { + "epoch": 8.773246329526916, + "grad_norm": 0.10599298775196075, + "learning_rate": 2.257910000730304e-06, + "loss": 0.0737, + "num_input_tokens_seen": 116071216, + "step": 53780 + }, + { + "epoch": 8.774061990212072, + "grad_norm": 0.4309757351875305, + "learning_rate": 2.2549548103211222e-06, + "loss": 0.1965, + "num_input_tokens_seen": 116081360, + "step": 53785 + }, + { + "epoch": 8.774877650897226, + "grad_norm": 0.43536490201950073, + "learning_rate": 2.2520014637407076e-06, + "loss": 0.1037, + "num_input_tokens_seen": 116091216, + "step": 53790 + }, + { + "epoch": 8.775693311582382, + "grad_norm": 0.14380602538585663, + "learning_rate": 2.2490499612284777e-06, + "loss": 0.0607, + "num_input_tokens_seen": 116101168, + "step": 53795 + }, + { + "epoch": 8.776508972267537, + "grad_norm": 0.36512812972068787, + "learning_rate": 2.246100303023696e-06, + "loss": 0.0614, + "num_input_tokens_seen": 116112272, + "step": 53800 + }, + { + "epoch": 8.777324632952691, + "grad_norm": 1.6068415641784668, + "learning_rate": 2.2431524893654743e-06, + "loss": 0.202, + "num_input_tokens_seen": 116123184, + "step": 53805 + }, + { + "epoch": 8.778140293637847, + "grad_norm": 0.34696638584136963, + "learning_rate": 2.2402065204927797e-06, + "loss": 0.0466, + "num_input_tokens_seen": 116133776, + "step": 53810 + }, + { + "epoch": 8.778955954323001, + "grad_norm": 1.4786937236785889, + "learning_rate": 2.237262396644421e-06, + "loss": 0.1153, + "num_input_tokens_seen": 116144688, + "step": 53815 + }, + { + "epoch": 8.779771615008157, + "grad_norm": 0.19852186739444733, + "learning_rate": 2.2343201180590745e-06, + "loss": 0.1189, + "num_input_tokens_seen": 116154288, + "step": 53820 + }, + { + "epoch": 8.780587275693312, + "grad_norm": 0.3283979594707489, + "learning_rate": 2.2313796849752516e-06, + "loss": 0.0636, + "num_input_tokens_seen": 116165584, + "step": 53825 + }, + { + "epoch": 8.781402936378466, + "grad_norm": 0.19599652290344238, + "learning_rate": 2.2284410976313174e-06, + "loss": 0.044, + "num_input_tokens_seen": 116176848, + "step": 53830 + }, + { + "epoch": 8.782218597063622, + "grad_norm": 0.1036338210105896, + "learning_rate": 2.2255043562654926e-06, + "loss": 0.0858, + "num_input_tokens_seen": 116186800, + "step": 53835 + }, + { + "epoch": 8.783034257748776, + "grad_norm": 0.07139308750629425, + "learning_rate": 2.2225694611158366e-06, + "loss": 0.1261, + "num_input_tokens_seen": 116196656, + "step": 53840 + }, + { + "epoch": 8.783849918433932, + "grad_norm": 1.0183452367782593, + "learning_rate": 2.2196364124202756e-06, + "loss": 0.0698, + "num_input_tokens_seen": 116207760, + "step": 53845 + }, + { + "epoch": 8.784665579119086, + "grad_norm": 1.687727928161621, + "learning_rate": 2.2167052104165724e-06, + "loss": 0.1285, + "num_input_tokens_seen": 116216752, + "step": 53850 + }, + { + "epoch": 8.785481239804241, + "grad_norm": 2.201244354248047, + "learning_rate": 2.2137758553423454e-06, + "loss": 0.1819, + "num_input_tokens_seen": 116227568, + "step": 53855 + }, + { + "epoch": 8.786296900489397, + "grad_norm": 0.7767635583877563, + "learning_rate": 2.2108483474350626e-06, + "loss": 0.052, + "num_input_tokens_seen": 116238736, + "step": 53860 + }, + { + "epoch": 8.78711256117455, + "grad_norm": 2.183389186859131, + "learning_rate": 2.207922686932046e-06, + "loss": 0.3061, + "num_input_tokens_seen": 116249584, + "step": 53865 + }, + { + "epoch": 8.787928221859707, + "grad_norm": 0.13285605609416962, + "learning_rate": 2.2049988740704604e-06, + "loss": 0.069, + "num_input_tokens_seen": 116259664, + "step": 53870 + }, + { + "epoch": 8.78874388254486, + "grad_norm": 1.4534207582473755, + "learning_rate": 2.202076909087328e-06, + "loss": 0.1362, + "num_input_tokens_seen": 116270352, + "step": 53875 + }, + { + "epoch": 8.789559543230016, + "grad_norm": 1.129799246788025, + "learning_rate": 2.199156792219517e-06, + "loss": 0.0829, + "num_input_tokens_seen": 116281232, + "step": 53880 + }, + { + "epoch": 8.790375203915172, + "grad_norm": 0.3439168632030487, + "learning_rate": 2.1962385237037445e-06, + "loss": 0.0382, + "num_input_tokens_seen": 116291728, + "step": 53885 + }, + { + "epoch": 8.791190864600326, + "grad_norm": 0.723141074180603, + "learning_rate": 2.193322103776585e-06, + "loss": 0.1641, + "num_input_tokens_seen": 116302480, + "step": 53890 + }, + { + "epoch": 8.792006525285482, + "grad_norm": 1.7896579504013062, + "learning_rate": 2.1904075326744543e-06, + "loss": 0.2134, + "num_input_tokens_seen": 116314192, + "step": 53895 + }, + { + "epoch": 8.792822185970635, + "grad_norm": 0.10195622593164444, + "learning_rate": 2.1874948106336254e-06, + "loss": 0.1875, + "num_input_tokens_seen": 116325616, + "step": 53900 + }, + { + "epoch": 8.793637846655791, + "grad_norm": 0.04858094081282616, + "learning_rate": 2.1845839378902167e-06, + "loss": 0.0147, + "num_input_tokens_seen": 116335344, + "step": 53905 + }, + { + "epoch": 8.794453507340947, + "grad_norm": 0.18769830465316772, + "learning_rate": 2.181674914680196e-06, + "loss": 0.0271, + "num_input_tokens_seen": 116346480, + "step": 53910 + }, + { + "epoch": 8.7952691680261, + "grad_norm": 0.12264283746480942, + "learning_rate": 2.178767741239382e-06, + "loss": 0.1939, + "num_input_tokens_seen": 116358576, + "step": 53915 + }, + { + "epoch": 8.796084828711257, + "grad_norm": 0.7430048584938049, + "learning_rate": 2.1758624178034537e-06, + "loss": 0.1578, + "num_input_tokens_seen": 116369872, + "step": 53920 + }, + { + "epoch": 8.79690048939641, + "grad_norm": 0.4293675720691681, + "learning_rate": 2.1729589446079252e-06, + "loss": 0.0388, + "num_input_tokens_seen": 116380432, + "step": 53925 + }, + { + "epoch": 8.797716150081566, + "grad_norm": 0.14342762529850006, + "learning_rate": 2.1700573218881694e-06, + "loss": 0.0827, + "num_input_tokens_seen": 116390544, + "step": 53930 + }, + { + "epoch": 8.798531810766722, + "grad_norm": 0.11487726867198944, + "learning_rate": 2.167157549879406e-06, + "loss": 0.0509, + "num_input_tokens_seen": 116402512, + "step": 53935 + }, + { + "epoch": 8.799347471451876, + "grad_norm": 0.18652187287807465, + "learning_rate": 2.1642596288166976e-06, + "loss": 0.1308, + "num_input_tokens_seen": 116413488, + "step": 53940 + }, + { + "epoch": 8.800163132137031, + "grad_norm": 0.1428796350955963, + "learning_rate": 2.1613635589349756e-06, + "loss": 0.045, + "num_input_tokens_seen": 116424976, + "step": 53945 + }, + { + "epoch": 8.800978792822185, + "grad_norm": 0.32994818687438965, + "learning_rate": 2.1584693404690076e-06, + "loss": 0.1492, + "num_input_tokens_seen": 116435856, + "step": 53950 + }, + { + "epoch": 8.801794453507341, + "grad_norm": 0.6790923476219177, + "learning_rate": 2.155576973653409e-06, + "loss": 0.2654, + "num_input_tokens_seen": 116447472, + "step": 53955 + }, + { + "epoch": 8.802610114192497, + "grad_norm": 0.960172176361084, + "learning_rate": 2.1526864587226525e-06, + "loss": 0.1931, + "num_input_tokens_seen": 116458064, + "step": 53960 + }, + { + "epoch": 8.80342577487765, + "grad_norm": 0.7148797512054443, + "learning_rate": 2.14979779591106e-06, + "loss": 0.0596, + "num_input_tokens_seen": 116468464, + "step": 53965 + }, + { + "epoch": 8.804241435562806, + "grad_norm": 0.02047475054860115, + "learning_rate": 2.1469109854527993e-06, + "loss": 0.1645, + "num_input_tokens_seen": 116477488, + "step": 53970 + }, + { + "epoch": 8.80505709624796, + "grad_norm": 0.5492742657661438, + "learning_rate": 2.1440260275818856e-06, + "loss": 0.1104, + "num_input_tokens_seen": 116488368, + "step": 53975 + }, + { + "epoch": 8.805872756933116, + "grad_norm": 0.7058826684951782, + "learning_rate": 2.1411429225321965e-06, + "loss": 0.1356, + "num_input_tokens_seen": 116498704, + "step": 53980 + }, + { + "epoch": 8.80668841761827, + "grad_norm": 0.470954567193985, + "learning_rate": 2.138261670537445e-06, + "loss": 0.058, + "num_input_tokens_seen": 116510000, + "step": 53985 + }, + { + "epoch": 8.807504078303426, + "grad_norm": 0.3272072970867157, + "learning_rate": 2.1353822718312016e-06, + "loss": 0.1737, + "num_input_tokens_seen": 116521552, + "step": 53990 + }, + { + "epoch": 8.808319738988581, + "grad_norm": 0.3702596127986908, + "learning_rate": 2.132504726646883e-06, + "loss": 0.0522, + "num_input_tokens_seen": 116532208, + "step": 53995 + }, + { + "epoch": 8.809135399673735, + "grad_norm": 0.19730214774608612, + "learning_rate": 2.1296290352177644e-06, + "loss": 0.097, + "num_input_tokens_seen": 116542608, + "step": 54000 + }, + { + "epoch": 8.809951060358891, + "grad_norm": 0.2812879681587219, + "learning_rate": 2.1267551977769555e-06, + "loss": 0.2583, + "num_input_tokens_seen": 116552656, + "step": 54005 + }, + { + "epoch": 8.810766721044045, + "grad_norm": 0.15403160452842712, + "learning_rate": 2.123883214557429e-06, + "loss": 0.0931, + "num_input_tokens_seen": 116565008, + "step": 54010 + }, + { + "epoch": 8.8115823817292, + "grad_norm": 0.23262035846710205, + "learning_rate": 2.1210130857920034e-06, + "loss": 0.1009, + "num_input_tokens_seen": 116574576, + "step": 54015 + }, + { + "epoch": 8.812398042414356, + "grad_norm": 0.06092694774270058, + "learning_rate": 2.1181448117133408e-06, + "loss": 0.0433, + "num_input_tokens_seen": 116584944, + "step": 54020 + }, + { + "epoch": 8.81321370309951, + "grad_norm": 1.3101671934127808, + "learning_rate": 2.115278392553963e-06, + "loss": 0.1125, + "num_input_tokens_seen": 116595920, + "step": 54025 + }, + { + "epoch": 8.814029363784666, + "grad_norm": 0.86967533826828, + "learning_rate": 2.112413828546231e-06, + "loss": 0.0577, + "num_input_tokens_seen": 116606704, + "step": 54030 + }, + { + "epoch": 8.81484502446982, + "grad_norm": 0.2941896617412567, + "learning_rate": 2.1095511199223676e-06, + "loss": 0.0497, + "num_input_tokens_seen": 116618032, + "step": 54035 + }, + { + "epoch": 8.815660685154976, + "grad_norm": 0.1724940687417984, + "learning_rate": 2.106690266914435e-06, + "loss": 0.1319, + "num_input_tokens_seen": 116628464, + "step": 54040 + }, + { + "epoch": 8.81647634584013, + "grad_norm": 1.6511499881744385, + "learning_rate": 2.10383126975435e-06, + "loss": 0.0847, + "num_input_tokens_seen": 116639408, + "step": 54045 + }, + { + "epoch": 8.817292006525285, + "grad_norm": 0.06884144991636276, + "learning_rate": 2.1009741286738745e-06, + "loss": 0.3243, + "num_input_tokens_seen": 116649808, + "step": 54050 + }, + { + "epoch": 8.818107667210441, + "grad_norm": 0.26130416989326477, + "learning_rate": 2.098118843904626e-06, + "loss": 0.1586, + "num_input_tokens_seen": 116661584, + "step": 54055 + }, + { + "epoch": 8.818923327895595, + "grad_norm": 0.054395951330661774, + "learning_rate": 2.0952654156780686e-06, + "loss": 0.0417, + "num_input_tokens_seen": 116671664, + "step": 54060 + }, + { + "epoch": 8.81973898858075, + "grad_norm": 0.19810165464878082, + "learning_rate": 2.09241384422551e-06, + "loss": 0.1147, + "num_input_tokens_seen": 116683248, + "step": 54065 + }, + { + "epoch": 8.820554649265905, + "grad_norm": 1.042321801185608, + "learning_rate": 2.089564129778121e-06, + "loss": 0.1798, + "num_input_tokens_seen": 116693936, + "step": 54070 + }, + { + "epoch": 8.82137030995106, + "grad_norm": 0.06922542303800583, + "learning_rate": 2.0867162725669077e-06, + "loss": 0.0917, + "num_input_tokens_seen": 116703856, + "step": 54075 + }, + { + "epoch": 8.822185970636216, + "grad_norm": 0.06602854281663895, + "learning_rate": 2.0838702728227356e-06, + "loss": 0.0386, + "num_input_tokens_seen": 116713136, + "step": 54080 + }, + { + "epoch": 8.82300163132137, + "grad_norm": 0.07651295512914658, + "learning_rate": 2.081026130776309e-06, + "loss": 0.0544, + "num_input_tokens_seen": 116723696, + "step": 54085 + }, + { + "epoch": 8.823817292006526, + "grad_norm": 0.8865509629249573, + "learning_rate": 2.078183846658199e-06, + "loss": 0.2283, + "num_input_tokens_seen": 116735280, + "step": 54090 + }, + { + "epoch": 8.82463295269168, + "grad_norm": 1.803879976272583, + "learning_rate": 2.075343420698811e-06, + "loss": 0.1249, + "num_input_tokens_seen": 116746416, + "step": 54095 + }, + { + "epoch": 8.825448613376835, + "grad_norm": 0.7199145555496216, + "learning_rate": 2.0725048531284015e-06, + "loss": 0.1473, + "num_input_tokens_seen": 116757232, + "step": 54100 + }, + { + "epoch": 8.826264274061991, + "grad_norm": 0.09479790180921555, + "learning_rate": 2.0696681441770836e-06, + "loss": 0.197, + "num_input_tokens_seen": 116768304, + "step": 54105 + }, + { + "epoch": 8.827079934747145, + "grad_norm": 0.6083109378814697, + "learning_rate": 2.066833294074813e-06, + "loss": 0.1861, + "num_input_tokens_seen": 116779216, + "step": 54110 + }, + { + "epoch": 8.8278955954323, + "grad_norm": 1.2999101877212524, + "learning_rate": 2.064000303051397e-06, + "loss": 0.0375, + "num_input_tokens_seen": 116790160, + "step": 54115 + }, + { + "epoch": 8.828711256117455, + "grad_norm": 0.24728530645370483, + "learning_rate": 2.061169171336494e-06, + "loss": 0.0308, + "num_input_tokens_seen": 116801936, + "step": 54120 + }, + { + "epoch": 8.82952691680261, + "grad_norm": 0.35062018036842346, + "learning_rate": 2.058339899159606e-06, + "loss": 0.1646, + "num_input_tokens_seen": 116812816, + "step": 54125 + }, + { + "epoch": 8.830342577487766, + "grad_norm": 0.785084068775177, + "learning_rate": 2.0555124867500915e-06, + "loss": 0.1727, + "num_input_tokens_seen": 116821840, + "step": 54130 + }, + { + "epoch": 8.83115823817292, + "grad_norm": 0.14508110284805298, + "learning_rate": 2.05268693433715e-06, + "loss": 0.0867, + "num_input_tokens_seen": 116832912, + "step": 54135 + }, + { + "epoch": 8.831973898858076, + "grad_norm": 0.42737528681755066, + "learning_rate": 2.049863242149844e-06, + "loss": 0.0276, + "num_input_tokens_seen": 116842896, + "step": 54140 + }, + { + "epoch": 8.83278955954323, + "grad_norm": 1.5145574808120728, + "learning_rate": 2.0470414104170694e-06, + "loss": 0.1538, + "num_input_tokens_seen": 116853744, + "step": 54145 + }, + { + "epoch": 8.833605220228385, + "grad_norm": 1.0481406450271606, + "learning_rate": 2.044221439367583e-06, + "loss": 0.0865, + "num_input_tokens_seen": 116863824, + "step": 54150 + }, + { + "epoch": 8.83442088091354, + "grad_norm": 0.017714202404022217, + "learning_rate": 2.0414033292299823e-06, + "loss": 0.0454, + "num_input_tokens_seen": 116875056, + "step": 54155 + }, + { + "epoch": 8.835236541598695, + "grad_norm": 0.15648293495178223, + "learning_rate": 2.0385870802327176e-06, + "loss": 0.1534, + "num_input_tokens_seen": 116886768, + "step": 54160 + }, + { + "epoch": 8.83605220228385, + "grad_norm": 0.9062206149101257, + "learning_rate": 2.0357726926040875e-06, + "loss": 0.0848, + "num_input_tokens_seen": 116898032, + "step": 54165 + }, + { + "epoch": 8.836867862969005, + "grad_norm": 0.32465264201164246, + "learning_rate": 2.0329601665722453e-06, + "loss": 0.0358, + "num_input_tokens_seen": 116909008, + "step": 54170 + }, + { + "epoch": 8.83768352365416, + "grad_norm": 0.5528138875961304, + "learning_rate": 2.030149502365186e-06, + "loss": 0.033, + "num_input_tokens_seen": 116920464, + "step": 54175 + }, + { + "epoch": 8.838499184339314, + "grad_norm": 0.343910813331604, + "learning_rate": 2.027340700210753e-06, + "loss": 0.0934, + "num_input_tokens_seen": 116931568, + "step": 54180 + }, + { + "epoch": 8.83931484502447, + "grad_norm": 1.2557252645492554, + "learning_rate": 2.0245337603366472e-06, + "loss": 0.0762, + "num_input_tokens_seen": 116943312, + "step": 54185 + }, + { + "epoch": 8.840130505709626, + "grad_norm": 0.15011948347091675, + "learning_rate": 2.0217286829704115e-06, + "loss": 0.0358, + "num_input_tokens_seen": 116954480, + "step": 54190 + }, + { + "epoch": 8.84094616639478, + "grad_norm": 0.3706777095794678, + "learning_rate": 2.018925468339436e-06, + "loss": 0.1355, + "num_input_tokens_seen": 116964880, + "step": 54195 + }, + { + "epoch": 8.841761827079935, + "grad_norm": 0.5678054690361023, + "learning_rate": 2.01612411667097e-06, + "loss": 0.1605, + "num_input_tokens_seen": 116974896, + "step": 54200 + }, + { + "epoch": 8.84257748776509, + "grad_norm": 0.4639805853366852, + "learning_rate": 2.013324628192101e-06, + "loss": 0.0554, + "num_input_tokens_seen": 116985072, + "step": 54205 + }, + { + "epoch": 8.843393148450245, + "grad_norm": 2.028367042541504, + "learning_rate": 2.0105270031297725e-06, + "loss": 0.0928, + "num_input_tokens_seen": 116995856, + "step": 54210 + }, + { + "epoch": 8.844208809135399, + "grad_norm": 1.4484184980392456, + "learning_rate": 2.0077312417107695e-06, + "loss": 0.2494, + "num_input_tokens_seen": 117007152, + "step": 54215 + }, + { + "epoch": 8.845024469820554, + "grad_norm": 1.1898887157440186, + "learning_rate": 2.0049373441617363e-06, + "loss": 0.1022, + "num_input_tokens_seen": 117017040, + "step": 54220 + }, + { + "epoch": 8.84584013050571, + "grad_norm": 0.808806300163269, + "learning_rate": 2.002145310709155e-06, + "loss": 0.2339, + "num_input_tokens_seen": 117028304, + "step": 54225 + }, + { + "epoch": 8.846655791190864, + "grad_norm": 0.05749046057462692, + "learning_rate": 1.9993551415793647e-06, + "loss": 0.1845, + "num_input_tokens_seen": 117039120, + "step": 54230 + }, + { + "epoch": 8.84747145187602, + "grad_norm": 0.026771873235702515, + "learning_rate": 1.9965668369985507e-06, + "loss": 0.0513, + "num_input_tokens_seen": 117050032, + "step": 54235 + }, + { + "epoch": 8.848287112561174, + "grad_norm": 1.2598202228546143, + "learning_rate": 1.993780397192749e-06, + "loss": 0.2721, + "num_input_tokens_seen": 117058448, + "step": 54240 + }, + { + "epoch": 8.84910277324633, + "grad_norm": 0.5965679287910461, + "learning_rate": 1.9909958223878424e-06, + "loss": 0.0619, + "num_input_tokens_seen": 117068880, + "step": 54245 + }, + { + "epoch": 8.849918433931485, + "grad_norm": 1.3638652563095093, + "learning_rate": 1.988213112809559e-06, + "loss": 0.2159, + "num_input_tokens_seen": 117079312, + "step": 54250 + }, + { + "epoch": 8.850734094616639, + "grad_norm": 3.6705965995788574, + "learning_rate": 1.9854322686834766e-06, + "loss": 0.2669, + "num_input_tokens_seen": 117089776, + "step": 54255 + }, + { + "epoch": 8.851549755301795, + "grad_norm": 0.048929937183856964, + "learning_rate": 1.982653290235034e-06, + "loss": 0.2287, + "num_input_tokens_seen": 117100304, + "step": 54260 + }, + { + "epoch": 8.852365415986949, + "grad_norm": 0.109520323574543, + "learning_rate": 1.979876177689505e-06, + "loss": 0.1661, + "num_input_tokens_seen": 117110064, + "step": 54265 + }, + { + "epoch": 8.853181076672104, + "grad_norm": 1.037641167640686, + "learning_rate": 1.9771009312720164e-06, + "loss": 0.1622, + "num_input_tokens_seen": 117121872, + "step": 54270 + }, + { + "epoch": 8.85399673735726, + "grad_norm": 0.16071178019046783, + "learning_rate": 1.9743275512075417e-06, + "loss": 0.0485, + "num_input_tokens_seen": 117133040, + "step": 54275 + }, + { + "epoch": 8.854812398042414, + "grad_norm": 1.0375304222106934, + "learning_rate": 1.9715560377209093e-06, + "loss": 0.1808, + "num_input_tokens_seen": 117145488, + "step": 54280 + }, + { + "epoch": 8.85562805872757, + "grad_norm": 0.11279469728469849, + "learning_rate": 1.9687863910367866e-06, + "loss": 0.1717, + "num_input_tokens_seen": 117156464, + "step": 54285 + }, + { + "epoch": 8.856443719412724, + "grad_norm": 0.35861924290657043, + "learning_rate": 1.9660186113796996e-06, + "loss": 0.0894, + "num_input_tokens_seen": 117166128, + "step": 54290 + }, + { + "epoch": 8.85725938009788, + "grad_norm": 2.0598206520080566, + "learning_rate": 1.963252698974019e-06, + "loss": 0.1196, + "num_input_tokens_seen": 117176272, + "step": 54295 + }, + { + "epoch": 8.858075040783035, + "grad_norm": 1.1596457958221436, + "learning_rate": 1.96048865404396e-06, + "loss": 0.2805, + "num_input_tokens_seen": 117187440, + "step": 54300 + }, + { + "epoch": 8.858890701468189, + "grad_norm": 0.19426383078098297, + "learning_rate": 1.9577264768135927e-06, + "loss": 0.0498, + "num_input_tokens_seen": 117198416, + "step": 54305 + }, + { + "epoch": 8.859706362153345, + "grad_norm": 0.03759706765413284, + "learning_rate": 1.9549661675068303e-06, + "loss": 0.2136, + "num_input_tokens_seen": 117209104, + "step": 54310 + }, + { + "epoch": 8.860522022838499, + "grad_norm": 0.037564996629953384, + "learning_rate": 1.9522077263474436e-06, + "loss": 0.0231, + "num_input_tokens_seen": 117220592, + "step": 54315 + }, + { + "epoch": 8.861337683523654, + "grad_norm": 0.1804472953081131, + "learning_rate": 1.94945115355904e-06, + "loss": 0.0724, + "num_input_tokens_seen": 117231280, + "step": 54320 + }, + { + "epoch": 8.86215334420881, + "grad_norm": 1.1725832223892212, + "learning_rate": 1.946696449365082e-06, + "loss": 0.1602, + "num_input_tokens_seen": 117243216, + "step": 54325 + }, + { + "epoch": 8.862969004893964, + "grad_norm": 0.18135598301887512, + "learning_rate": 1.943943613988883e-06, + "loss": 0.0296, + "num_input_tokens_seen": 117252976, + "step": 54330 + }, + { + "epoch": 8.86378466557912, + "grad_norm": 0.486937940120697, + "learning_rate": 1.9411926476535976e-06, + "loss": 0.1399, + "num_input_tokens_seen": 117264016, + "step": 54335 + }, + { + "epoch": 8.864600326264274, + "grad_norm": 1.068914532661438, + "learning_rate": 1.938443550582239e-06, + "loss": 0.3114, + "num_input_tokens_seen": 117274864, + "step": 54340 + }, + { + "epoch": 8.86541598694943, + "grad_norm": 0.5840505361557007, + "learning_rate": 1.935696322997657e-06, + "loss": 0.0276, + "num_input_tokens_seen": 117285552, + "step": 54345 + }, + { + "epoch": 8.866231647634583, + "grad_norm": 1.0648581981658936, + "learning_rate": 1.932950965122554e-06, + "loss": 0.1091, + "num_input_tokens_seen": 117295728, + "step": 54350 + }, + { + "epoch": 8.867047308319739, + "grad_norm": 0.43104732036590576, + "learning_rate": 1.930207477179491e-06, + "loss": 0.1904, + "num_input_tokens_seen": 117307408, + "step": 54355 + }, + { + "epoch": 8.867862969004895, + "grad_norm": 1.5470991134643555, + "learning_rate": 1.9274658593908647e-06, + "loss": 0.243, + "num_input_tokens_seen": 117318832, + "step": 54360 + }, + { + "epoch": 8.868678629690049, + "grad_norm": 0.7179315090179443, + "learning_rate": 1.9247261119789252e-06, + "loss": 0.1467, + "num_input_tokens_seen": 117329712, + "step": 54365 + }, + { + "epoch": 8.869494290375204, + "grad_norm": 0.6066888570785522, + "learning_rate": 1.9219882351657696e-06, + "loss": 0.1665, + "num_input_tokens_seen": 117339728, + "step": 54370 + }, + { + "epoch": 8.870309951060358, + "grad_norm": 0.25912782549858093, + "learning_rate": 1.9192522291733434e-06, + "loss": 0.0715, + "num_input_tokens_seen": 117351408, + "step": 54375 + }, + { + "epoch": 8.871125611745514, + "grad_norm": 2.104736566543579, + "learning_rate": 1.9165180942234435e-06, + "loss": 0.3704, + "num_input_tokens_seen": 117361904, + "step": 54380 + }, + { + "epoch": 8.87194127243067, + "grad_norm": 0.04612252861261368, + "learning_rate": 1.913785830537712e-06, + "loss": 0.0661, + "num_input_tokens_seen": 117371536, + "step": 54385 + }, + { + "epoch": 8.872756933115824, + "grad_norm": 0.24912051856517792, + "learning_rate": 1.911055438337639e-06, + "loss": 0.28, + "num_input_tokens_seen": 117379984, + "step": 54390 + }, + { + "epoch": 8.87357259380098, + "grad_norm": 0.029039453715085983, + "learning_rate": 1.9083269178445636e-06, + "loss": 0.0355, + "num_input_tokens_seen": 117390416, + "step": 54395 + }, + { + "epoch": 8.874388254486133, + "grad_norm": 1.2893595695495605, + "learning_rate": 1.9056002692796698e-06, + "loss": 0.1068, + "num_input_tokens_seen": 117401136, + "step": 54400 + }, + { + "epoch": 8.875203915171289, + "grad_norm": 0.6751818060874939, + "learning_rate": 1.9028754928640008e-06, + "loss": 0.14, + "num_input_tokens_seen": 117411344, + "step": 54405 + }, + { + "epoch": 8.876019575856443, + "grad_norm": 0.9813580513000488, + "learning_rate": 1.9001525888184407e-06, + "loss": 0.0725, + "num_input_tokens_seen": 117421680, + "step": 54410 + }, + { + "epoch": 8.876835236541599, + "grad_norm": 0.18368667364120483, + "learning_rate": 1.8974315573637185e-06, + "loss": 0.0323, + "num_input_tokens_seen": 117432048, + "step": 54415 + }, + { + "epoch": 8.877650897226754, + "grad_norm": 0.16499629616737366, + "learning_rate": 1.8947123987204135e-06, + "loss": 0.1036, + "num_input_tokens_seen": 117444080, + "step": 54420 + }, + { + "epoch": 8.878466557911908, + "grad_norm": 0.8047398924827576, + "learning_rate": 1.8919951131089602e-06, + "loss": 0.1458, + "num_input_tokens_seen": 117454864, + "step": 54425 + }, + { + "epoch": 8.879282218597064, + "grad_norm": 0.216077983379364, + "learning_rate": 1.88927970074963e-06, + "loss": 0.1554, + "num_input_tokens_seen": 117464944, + "step": 54430 + }, + { + "epoch": 8.880097879282218, + "grad_norm": 0.6538369059562683, + "learning_rate": 1.8865661618625491e-06, + "loss": 0.0487, + "num_input_tokens_seen": 117475344, + "step": 54435 + }, + { + "epoch": 8.880913539967374, + "grad_norm": 2.017178535461426, + "learning_rate": 1.883854496667692e-06, + "loss": 0.2191, + "num_input_tokens_seen": 117486288, + "step": 54440 + }, + { + "epoch": 8.88172920065253, + "grad_norm": 0.35687586665153503, + "learning_rate": 1.8811447053848796e-06, + "loss": 0.0511, + "num_input_tokens_seen": 117496432, + "step": 54445 + }, + { + "epoch": 8.882544861337683, + "grad_norm": 0.3838431239128113, + "learning_rate": 1.878436788233781e-06, + "loss": 0.0893, + "num_input_tokens_seen": 117506960, + "step": 54450 + }, + { + "epoch": 8.883360522022839, + "grad_norm": 0.6084454655647278, + "learning_rate": 1.8757307454339095e-06, + "loss": 0.2117, + "num_input_tokens_seen": 117517360, + "step": 54455 + }, + { + "epoch": 8.884176182707993, + "grad_norm": 0.47634097933769226, + "learning_rate": 1.8730265772046396e-06, + "loss": 0.1543, + "num_input_tokens_seen": 117527376, + "step": 54460 + }, + { + "epoch": 8.884991843393149, + "grad_norm": 0.08371138572692871, + "learning_rate": 1.8703242837651818e-06, + "loss": 0.0505, + "num_input_tokens_seen": 117537104, + "step": 54465 + }, + { + "epoch": 8.885807504078304, + "grad_norm": 0.9505768418312073, + "learning_rate": 1.8676238653345945e-06, + "loss": 0.1604, + "num_input_tokens_seen": 117546896, + "step": 54470 + }, + { + "epoch": 8.886623164763458, + "grad_norm": 0.3835790455341339, + "learning_rate": 1.8649253221317914e-06, + "loss": 0.0864, + "num_input_tokens_seen": 117557328, + "step": 54475 + }, + { + "epoch": 8.887438825448614, + "grad_norm": 0.2980920672416687, + "learning_rate": 1.8622286543755252e-06, + "loss": 0.0902, + "num_input_tokens_seen": 117567568, + "step": 54480 + }, + { + "epoch": 8.888254486133768, + "grad_norm": 0.4328116178512573, + "learning_rate": 1.8595338622844072e-06, + "loss": 0.0432, + "num_input_tokens_seen": 117578032, + "step": 54485 + }, + { + "epoch": 8.889070146818923, + "grad_norm": 0.37138548493385315, + "learning_rate": 1.8568409460768848e-06, + "loss": 0.1615, + "num_input_tokens_seen": 117589456, + "step": 54490 + }, + { + "epoch": 8.88988580750408, + "grad_norm": 1.4665098190307617, + "learning_rate": 1.8541499059712641e-06, + "loss": 0.0999, + "num_input_tokens_seen": 117599952, + "step": 54495 + }, + { + "epoch": 8.890701468189233, + "grad_norm": 0.10356757789850235, + "learning_rate": 1.8514607421856928e-06, + "loss": 0.14, + "num_input_tokens_seen": 117610288, + "step": 54500 + }, + { + "epoch": 8.891517128874389, + "grad_norm": 0.13220179080963135, + "learning_rate": 1.8487734549381686e-06, + "loss": 0.025, + "num_input_tokens_seen": 117620528, + "step": 54505 + }, + { + "epoch": 8.892332789559543, + "grad_norm": 0.14974744617938995, + "learning_rate": 1.8460880444465367e-06, + "loss": 0.0411, + "num_input_tokens_seen": 117631984, + "step": 54510 + }, + { + "epoch": 8.893148450244698, + "grad_norm": 0.045112695544958115, + "learning_rate": 1.843404510928487e-06, + "loss": 0.0536, + "num_input_tokens_seen": 117642800, + "step": 54515 + }, + { + "epoch": 8.893964110929852, + "grad_norm": 0.4912354350090027, + "learning_rate": 1.8407228546015648e-06, + "loss": 0.0506, + "num_input_tokens_seen": 117654544, + "step": 54520 + }, + { + "epoch": 8.894779771615008, + "grad_norm": 0.3630676865577698, + "learning_rate": 1.8380430756831574e-06, + "loss": 0.0594, + "num_input_tokens_seen": 117664688, + "step": 54525 + }, + { + "epoch": 8.895595432300164, + "grad_norm": 0.062306810170412064, + "learning_rate": 1.835365174390502e-06, + "loss": 0.0244, + "num_input_tokens_seen": 117675664, + "step": 54530 + }, + { + "epoch": 8.896411092985318, + "grad_norm": 0.22014203667640686, + "learning_rate": 1.8326891509406808e-06, + "loss": 0.0315, + "num_input_tokens_seen": 117685744, + "step": 54535 + }, + { + "epoch": 8.897226753670473, + "grad_norm": 1.104995608329773, + "learning_rate": 1.8300150055506254e-06, + "loss": 0.1202, + "num_input_tokens_seen": 117697104, + "step": 54540 + }, + { + "epoch": 8.898042414355627, + "grad_norm": 1.7553918361663818, + "learning_rate": 1.8273427384371127e-06, + "loss": 0.1228, + "num_input_tokens_seen": 117708272, + "step": 54545 + }, + { + "epoch": 8.898858075040783, + "grad_norm": 1.3383511304855347, + "learning_rate": 1.8246723498167806e-06, + "loss": 0.0627, + "num_input_tokens_seen": 117718832, + "step": 54550 + }, + { + "epoch": 8.899673735725939, + "grad_norm": 0.37367600202560425, + "learning_rate": 1.8220038399060973e-06, + "loss": 0.1249, + "num_input_tokens_seen": 117729744, + "step": 54555 + }, + { + "epoch": 8.900489396411093, + "grad_norm": 1.0186505317687988, + "learning_rate": 1.8193372089213872e-06, + "loss": 0.1307, + "num_input_tokens_seen": 117741104, + "step": 54560 + }, + { + "epoch": 8.901305057096248, + "grad_norm": 0.15310825407505035, + "learning_rate": 1.816672457078819e-06, + "loss": 0.024, + "num_input_tokens_seen": 117751280, + "step": 54565 + }, + { + "epoch": 8.902120717781402, + "grad_norm": 0.1928969919681549, + "learning_rate": 1.8140095845944117e-06, + "loss": 0.0976, + "num_input_tokens_seen": 117761200, + "step": 54570 + }, + { + "epoch": 8.902936378466558, + "grad_norm": 0.5427776575088501, + "learning_rate": 1.811348591684031e-06, + "loss": 0.0904, + "num_input_tokens_seen": 117771536, + "step": 54575 + }, + { + "epoch": 8.903752039151712, + "grad_norm": 1.1321346759796143, + "learning_rate": 1.8086894785633969e-06, + "loss": 0.0585, + "num_input_tokens_seen": 117781808, + "step": 54580 + }, + { + "epoch": 8.904567699836868, + "grad_norm": 0.04929861053824425, + "learning_rate": 1.806032245448061e-06, + "loss": 0.0897, + "num_input_tokens_seen": 117793296, + "step": 54585 + }, + { + "epoch": 8.905383360522023, + "grad_norm": 0.07118646055459976, + "learning_rate": 1.8033768925534378e-06, + "loss": 0.232, + "num_input_tokens_seen": 117803568, + "step": 54590 + }, + { + "epoch": 8.906199021207177, + "grad_norm": 0.7120736837387085, + "learning_rate": 1.8007234200947826e-06, + "loss": 0.1171, + "num_input_tokens_seen": 117813648, + "step": 54595 + }, + { + "epoch": 8.907014681892333, + "grad_norm": 0.48524805903434753, + "learning_rate": 1.7980718282871982e-06, + "loss": 0.0394, + "num_input_tokens_seen": 117824816, + "step": 54600 + }, + { + "epoch": 8.907830342577487, + "grad_norm": 0.04902864620089531, + "learning_rate": 1.7954221173456382e-06, + "loss": 0.1682, + "num_input_tokens_seen": 117835312, + "step": 54605 + }, + { + "epoch": 8.908646003262643, + "grad_norm": 0.08716239035129547, + "learning_rate": 1.7927742874848997e-06, + "loss": 0.0963, + "num_input_tokens_seen": 117845616, + "step": 54610 + }, + { + "epoch": 8.909461663947798, + "grad_norm": 0.4858724772930145, + "learning_rate": 1.7901283389196278e-06, + "loss": 0.0804, + "num_input_tokens_seen": 117856656, + "step": 54615 + }, + { + "epoch": 8.910277324632952, + "grad_norm": 2.2557671070098877, + "learning_rate": 1.7874842718643203e-06, + "loss": 0.3931, + "num_input_tokens_seen": 117867056, + "step": 54620 + }, + { + "epoch": 8.911092985318108, + "grad_norm": 1.5382047891616821, + "learning_rate": 1.7848420865333172e-06, + "loss": 0.2156, + "num_input_tokens_seen": 117877552, + "step": 54625 + }, + { + "epoch": 8.911908646003262, + "grad_norm": 0.24126176536083221, + "learning_rate": 1.782201783140805e-06, + "loss": 0.1592, + "num_input_tokens_seen": 117888624, + "step": 54630 + }, + { + "epoch": 8.912724306688418, + "grad_norm": 1.841051697731018, + "learning_rate": 1.779563361900824e-06, + "loss": 0.2089, + "num_input_tokens_seen": 117897520, + "step": 54635 + }, + { + "epoch": 8.913539967373573, + "grad_norm": 0.881098747253418, + "learning_rate": 1.7769268230272557e-06, + "loss": 0.0946, + "num_input_tokens_seen": 117907920, + "step": 54640 + }, + { + "epoch": 8.914355628058727, + "grad_norm": 0.8914873600006104, + "learning_rate": 1.774292166733832e-06, + "loss": 0.1643, + "num_input_tokens_seen": 117919088, + "step": 54645 + }, + { + "epoch": 8.915171288743883, + "grad_norm": 1.538214087486267, + "learning_rate": 1.7716593932341319e-06, + "loss": 0.0606, + "num_input_tokens_seen": 117928848, + "step": 54650 + }, + { + "epoch": 8.915986949429037, + "grad_norm": 0.12411472946405411, + "learning_rate": 1.7690285027415792e-06, + "loss": 0.1177, + "num_input_tokens_seen": 117939984, + "step": 54655 + }, + { + "epoch": 8.916802610114193, + "grad_norm": 0.09456514567136765, + "learning_rate": 1.7663994954694508e-06, + "loss": 0.1025, + "num_input_tokens_seen": 117952784, + "step": 54660 + }, + { + "epoch": 8.917618270799348, + "grad_norm": 0.09985782206058502, + "learning_rate": 1.7637723716308646e-06, + "loss": 0.1991, + "num_input_tokens_seen": 117963504, + "step": 54665 + }, + { + "epoch": 8.918433931484502, + "grad_norm": 0.4231787323951721, + "learning_rate": 1.7611471314387867e-06, + "loss": 0.1002, + "num_input_tokens_seen": 117973808, + "step": 54670 + }, + { + "epoch": 8.919249592169658, + "grad_norm": 0.03392881900072098, + "learning_rate": 1.7585237751060357e-06, + "loss": 0.0955, + "num_input_tokens_seen": 117984464, + "step": 54675 + }, + { + "epoch": 8.920065252854812, + "grad_norm": 0.306450754404068, + "learning_rate": 1.7559023028452748e-06, + "loss": 0.0282, + "num_input_tokens_seen": 117994928, + "step": 54680 + }, + { + "epoch": 8.920880913539968, + "grad_norm": 0.7057309150695801, + "learning_rate": 1.7532827148690145e-06, + "loss": 0.1008, + "num_input_tokens_seen": 118004304, + "step": 54685 + }, + { + "epoch": 8.921696574225122, + "grad_norm": 1.4076290130615234, + "learning_rate": 1.75066501138961e-06, + "loss": 0.1046, + "num_input_tokens_seen": 118015152, + "step": 54690 + }, + { + "epoch": 8.922512234910277, + "grad_norm": 1.188358187675476, + "learning_rate": 1.7480491926192638e-06, + "loss": 0.0681, + "num_input_tokens_seen": 118027344, + "step": 54695 + }, + { + "epoch": 8.923327895595433, + "grad_norm": 1.5254638195037842, + "learning_rate": 1.7454352587700284e-06, + "loss": 0.2154, + "num_input_tokens_seen": 118038672, + "step": 54700 + }, + { + "epoch": 8.924143556280587, + "grad_norm": 1.110748291015625, + "learning_rate": 1.7428232100538067e-06, + "loss": 0.1069, + "num_input_tokens_seen": 118050576, + "step": 54705 + }, + { + "epoch": 8.924959216965743, + "grad_norm": 0.08805881440639496, + "learning_rate": 1.7402130466823373e-06, + "loss": 0.1632, + "num_input_tokens_seen": 118061520, + "step": 54710 + }, + { + "epoch": 8.925774877650896, + "grad_norm": 0.2349122315645218, + "learning_rate": 1.7376047688672182e-06, + "loss": 0.2122, + "num_input_tokens_seen": 118072080, + "step": 54715 + }, + { + "epoch": 8.926590538336052, + "grad_norm": 0.9658210277557373, + "learning_rate": 1.734998376819888e-06, + "loss": 0.1024, + "num_input_tokens_seen": 118082416, + "step": 54720 + }, + { + "epoch": 8.927406199021208, + "grad_norm": 0.3515530526638031, + "learning_rate": 1.732393870751639e-06, + "loss": 0.2013, + "num_input_tokens_seen": 118092880, + "step": 54725 + }, + { + "epoch": 8.928221859706362, + "grad_norm": 0.408856600522995, + "learning_rate": 1.729791250873597e-06, + "loss": 0.0529, + "num_input_tokens_seen": 118105136, + "step": 54730 + }, + { + "epoch": 8.929037520391518, + "grad_norm": 0.23199188709259033, + "learning_rate": 1.7271905173967513e-06, + "loss": 0.1688, + "num_input_tokens_seen": 118114736, + "step": 54735 + }, + { + "epoch": 8.929853181076671, + "grad_norm": 0.918359100818634, + "learning_rate": 1.7245916705319276e-06, + "loss": 0.0546, + "num_input_tokens_seen": 118126064, + "step": 54740 + }, + { + "epoch": 8.930668841761827, + "grad_norm": 0.08039906620979309, + "learning_rate": 1.7219947104897994e-06, + "loss": 0.0976, + "num_input_tokens_seen": 118137456, + "step": 54745 + }, + { + "epoch": 8.931484502446983, + "grad_norm": 0.18119695782661438, + "learning_rate": 1.7193996374808924e-06, + "loss": 0.1222, + "num_input_tokens_seen": 118147376, + "step": 54750 + }, + { + "epoch": 8.932300163132137, + "grad_norm": 0.036657724529504776, + "learning_rate": 1.7168064517155747e-06, + "loss": 0.0592, + "num_input_tokens_seen": 118158096, + "step": 54755 + }, + { + "epoch": 8.933115823817293, + "grad_norm": 0.12479238212108612, + "learning_rate": 1.714215153404064e-06, + "loss": 0.2571, + "num_input_tokens_seen": 118168688, + "step": 54760 + }, + { + "epoch": 8.933931484502446, + "grad_norm": 0.15066291391849518, + "learning_rate": 1.7116257427564259e-06, + "loss": 0.2905, + "num_input_tokens_seen": 118179792, + "step": 54765 + }, + { + "epoch": 8.934747145187602, + "grad_norm": 0.22283132374286652, + "learning_rate": 1.7090382199825672e-06, + "loss": 0.039, + "num_input_tokens_seen": 118190736, + "step": 54770 + }, + { + "epoch": 8.935562805872756, + "grad_norm": 0.3691507875919342, + "learning_rate": 1.7064525852922424e-06, + "loss": 0.1281, + "num_input_tokens_seen": 118201584, + "step": 54775 + }, + { + "epoch": 8.936378466557912, + "grad_norm": 0.18285097181797028, + "learning_rate": 1.7038688388950675e-06, + "loss": 0.0743, + "num_input_tokens_seen": 118212624, + "step": 54780 + }, + { + "epoch": 8.937194127243067, + "grad_norm": 0.5766268372535706, + "learning_rate": 1.7012869810004856e-06, + "loss": 0.1748, + "num_input_tokens_seen": 118222800, + "step": 54785 + }, + { + "epoch": 8.938009787928221, + "grad_norm": 0.2653287649154663, + "learning_rate": 1.698707011817799e-06, + "loss": 0.1005, + "num_input_tokens_seen": 118232592, + "step": 54790 + }, + { + "epoch": 8.938825448613377, + "grad_norm": 1.181424856185913, + "learning_rate": 1.6961289315561512e-06, + "loss": 0.0525, + "num_input_tokens_seen": 118244464, + "step": 54795 + }, + { + "epoch": 8.939641109298531, + "grad_norm": 0.8731667399406433, + "learning_rate": 1.6935527404245366e-06, + "loss": 0.0614, + "num_input_tokens_seen": 118253552, + "step": 54800 + }, + { + "epoch": 8.940456769983687, + "grad_norm": 0.04625741392374039, + "learning_rate": 1.6909784386317906e-06, + "loss": 0.1339, + "num_input_tokens_seen": 118263824, + "step": 54805 + }, + { + "epoch": 8.941272430668842, + "grad_norm": 2.266110897064209, + "learning_rate": 1.688406026386602e-06, + "loss": 0.2263, + "num_input_tokens_seen": 118275216, + "step": 54810 + }, + { + "epoch": 8.942088091353996, + "grad_norm": 1.3058921098709106, + "learning_rate": 1.6858355038975038e-06, + "loss": 0.0659, + "num_input_tokens_seen": 118286224, + "step": 54815 + }, + { + "epoch": 8.942903752039152, + "grad_norm": 0.23169739544391632, + "learning_rate": 1.6832668713728711e-06, + "loss": 0.2019, + "num_input_tokens_seen": 118296240, + "step": 54820 + }, + { + "epoch": 8.943719412724306, + "grad_norm": 0.08462823927402496, + "learning_rate": 1.6807001290209374e-06, + "loss": 0.0527, + "num_input_tokens_seen": 118305680, + "step": 54825 + }, + { + "epoch": 8.944535073409462, + "grad_norm": 0.08078451454639435, + "learning_rate": 1.6781352770497694e-06, + "loss": 0.1467, + "num_input_tokens_seen": 118316880, + "step": 54830 + }, + { + "epoch": 8.945350734094617, + "grad_norm": 0.3398537039756775, + "learning_rate": 1.6755723156672925e-06, + "loss": 0.1115, + "num_input_tokens_seen": 118328304, + "step": 54835 + }, + { + "epoch": 8.946166394779771, + "grad_norm": 0.22310027480125427, + "learning_rate": 1.6730112450812685e-06, + "loss": 0.0643, + "num_input_tokens_seen": 118339728, + "step": 54840 + }, + { + "epoch": 8.946982055464927, + "grad_norm": 0.38289642333984375, + "learning_rate": 1.6704520654993145e-06, + "loss": 0.0996, + "num_input_tokens_seen": 118349872, + "step": 54845 + }, + { + "epoch": 8.947797716150081, + "grad_norm": 0.28302377462387085, + "learning_rate": 1.6678947771288866e-06, + "loss": 0.2435, + "num_input_tokens_seen": 118359888, + "step": 54850 + }, + { + "epoch": 8.948613376835237, + "grad_norm": 0.061875853687524796, + "learning_rate": 1.665339380177297e-06, + "loss": 0.0562, + "num_input_tokens_seen": 118370544, + "step": 54855 + }, + { + "epoch": 8.949429037520392, + "grad_norm": 0.25207677483558655, + "learning_rate": 1.6627858748516912e-06, + "loss": 0.0415, + "num_input_tokens_seen": 118381584, + "step": 54860 + }, + { + "epoch": 8.950244698205546, + "grad_norm": 0.3887329697608948, + "learning_rate": 1.6602342613590754e-06, + "loss": 0.0734, + "num_input_tokens_seen": 118392624, + "step": 54865 + }, + { + "epoch": 8.951060358890702, + "grad_norm": 0.10069441050291061, + "learning_rate": 1.6576845399062985e-06, + "loss": 0.074, + "num_input_tokens_seen": 118402768, + "step": 54870 + }, + { + "epoch": 8.951876019575856, + "grad_norm": 1.869842767715454, + "learning_rate": 1.6551367107000503e-06, + "loss": 0.0969, + "num_input_tokens_seen": 118413584, + "step": 54875 + }, + { + "epoch": 8.952691680261012, + "grad_norm": 0.35205188393592834, + "learning_rate": 1.6525907739468689e-06, + "loss": 0.1706, + "num_input_tokens_seen": 118424400, + "step": 54880 + }, + { + "epoch": 8.953507340946166, + "grad_norm": 0.08588550984859467, + "learning_rate": 1.6500467298531414e-06, + "loss": 0.0465, + "num_input_tokens_seen": 118435696, + "step": 54885 + }, + { + "epoch": 8.954323001631321, + "grad_norm": 0.7372103929519653, + "learning_rate": 1.6475045786251059e-06, + "loss": 0.2103, + "num_input_tokens_seen": 118446544, + "step": 54890 + }, + { + "epoch": 8.955138662316477, + "grad_norm": 1.641211748123169, + "learning_rate": 1.6449643204688364e-06, + "loss": 0.2122, + "num_input_tokens_seen": 118457616, + "step": 54895 + }, + { + "epoch": 8.955954323001631, + "grad_norm": 0.4364010989665985, + "learning_rate": 1.6424259555902627e-06, + "loss": 0.0973, + "num_input_tokens_seen": 118468816, + "step": 54900 + }, + { + "epoch": 8.956769983686787, + "grad_norm": 0.01956833526492119, + "learning_rate": 1.6398894841951561e-06, + "loss": 0.1061, + "num_input_tokens_seen": 118480176, + "step": 54905 + }, + { + "epoch": 8.95758564437194, + "grad_norm": 0.1413564682006836, + "learning_rate": 1.6373549064891358e-06, + "loss": 0.0268, + "num_input_tokens_seen": 118491024, + "step": 54910 + }, + { + "epoch": 8.958401305057096, + "grad_norm": 0.9996376037597656, + "learning_rate": 1.6348222226776705e-06, + "loss": 0.0442, + "num_input_tokens_seen": 118502096, + "step": 54915 + }, + { + "epoch": 8.959216965742252, + "grad_norm": 0.7716521620750427, + "learning_rate": 1.6322914329660655e-06, + "loss": 0.0712, + "num_input_tokens_seen": 118512464, + "step": 54920 + }, + { + "epoch": 8.960032626427406, + "grad_norm": 0.08850716054439545, + "learning_rate": 1.629762537559487e-06, + "loss": 0.0837, + "num_input_tokens_seen": 118523728, + "step": 54925 + }, + { + "epoch": 8.960848287112562, + "grad_norm": 0.3587326407432556, + "learning_rate": 1.6272355366629327e-06, + "loss": 0.1697, + "num_input_tokens_seen": 118533680, + "step": 54930 + }, + { + "epoch": 8.961663947797716, + "grad_norm": 0.16890794038772583, + "learning_rate": 1.6247104304812604e-06, + "loss": 0.1146, + "num_input_tokens_seen": 118544176, + "step": 54935 + }, + { + "epoch": 8.962479608482871, + "grad_norm": 0.24985533952713013, + "learning_rate": 1.6221872192191651e-06, + "loss": 0.0574, + "num_input_tokens_seen": 118554384, + "step": 54940 + }, + { + "epoch": 8.963295269168025, + "grad_norm": 0.8751465678215027, + "learning_rate": 1.6196659030811938e-06, + "loss": 0.1629, + "num_input_tokens_seen": 118566064, + "step": 54945 + }, + { + "epoch": 8.964110929853181, + "grad_norm": 0.21941430866718292, + "learning_rate": 1.6171464822717337e-06, + "loss": 0.1862, + "num_input_tokens_seen": 118575952, + "step": 54950 + }, + { + "epoch": 8.964926590538337, + "grad_norm": 0.37886252999305725, + "learning_rate": 1.6146289569950208e-06, + "loss": 0.0714, + "num_input_tokens_seen": 118587408, + "step": 54955 + }, + { + "epoch": 8.96574225122349, + "grad_norm": 0.07480712980031967, + "learning_rate": 1.612113327455142e-06, + "loss": 0.161, + "num_input_tokens_seen": 118598320, + "step": 54960 + }, + { + "epoch": 8.966557911908646, + "grad_norm": 0.7938443422317505, + "learning_rate": 1.6095995938560288e-06, + "loss": 0.1639, + "num_input_tokens_seen": 118609488, + "step": 54965 + }, + { + "epoch": 8.9673735725938, + "grad_norm": 0.26430103182792664, + "learning_rate": 1.6070877564014514e-06, + "loss": 0.2111, + "num_input_tokens_seen": 118621040, + "step": 54970 + }, + { + "epoch": 8.968189233278956, + "grad_norm": 0.49534037709236145, + "learning_rate": 1.6045778152950357e-06, + "loss": 0.2017, + "num_input_tokens_seen": 118632720, + "step": 54975 + }, + { + "epoch": 8.969004893964112, + "grad_norm": 0.05024228245019913, + "learning_rate": 1.6020697707402472e-06, + "loss": 0.0163, + "num_input_tokens_seen": 118643024, + "step": 54980 + }, + { + "epoch": 8.969820554649266, + "grad_norm": 2.4238147735595703, + "learning_rate": 1.599563622940406e-06, + "loss": 0.2812, + "num_input_tokens_seen": 118653712, + "step": 54985 + }, + { + "epoch": 8.970636215334421, + "grad_norm": 1.9471914768218994, + "learning_rate": 1.5970593720986666e-06, + "loss": 0.265, + "num_input_tokens_seen": 118664400, + "step": 54990 + }, + { + "epoch": 8.971451876019575, + "grad_norm": 0.15898959338665009, + "learning_rate": 1.5945570184180413e-06, + "loss": 0.092, + "num_input_tokens_seen": 118675504, + "step": 54995 + }, + { + "epoch": 8.97226753670473, + "grad_norm": 0.5157021284103394, + "learning_rate": 1.5920565621013822e-06, + "loss": 0.0864, + "num_input_tokens_seen": 118686192, + "step": 55000 + }, + { + "epoch": 8.973083197389887, + "grad_norm": 0.08126424252986908, + "learning_rate": 1.5895580033513908e-06, + "loss": 0.0523, + "num_input_tokens_seen": 118697776, + "step": 55005 + }, + { + "epoch": 8.97389885807504, + "grad_norm": 0.7955025434494019, + "learning_rate": 1.5870613423706103e-06, + "loss": 0.0586, + "num_input_tokens_seen": 118708720, + "step": 55010 + }, + { + "epoch": 8.974714518760196, + "grad_norm": 1.8810887336730957, + "learning_rate": 1.5845665793614322e-06, + "loss": 0.0624, + "num_input_tokens_seen": 118720112, + "step": 55015 + }, + { + "epoch": 8.97553017944535, + "grad_norm": 0.49155426025390625, + "learning_rate": 1.5820737145260972e-06, + "loss": 0.3437, + "num_input_tokens_seen": 118731184, + "step": 55020 + }, + { + "epoch": 8.976345840130506, + "grad_norm": 0.9811478853225708, + "learning_rate": 1.5795827480666852e-06, + "loss": 0.2406, + "num_input_tokens_seen": 118741392, + "step": 55025 + }, + { + "epoch": 8.977161500815662, + "grad_norm": 1.2456873655319214, + "learning_rate": 1.5770936801851322e-06, + "loss": 0.1332, + "num_input_tokens_seen": 118751984, + "step": 55030 + }, + { + "epoch": 8.977977161500815, + "grad_norm": 0.9636381268501282, + "learning_rate": 1.5746065110832124e-06, + "loss": 0.115, + "num_input_tokens_seen": 118763600, + "step": 55035 + }, + { + "epoch": 8.978792822185971, + "grad_norm": 0.21393923461437225, + "learning_rate": 1.5721212409625485e-06, + "loss": 0.0878, + "num_input_tokens_seen": 118774512, + "step": 55040 + }, + { + "epoch": 8.979608482871125, + "grad_norm": 0.14424996078014374, + "learning_rate": 1.5696378700246094e-06, + "loss": 0.0309, + "num_input_tokens_seen": 118785072, + "step": 55045 + }, + { + "epoch": 8.98042414355628, + "grad_norm": 0.2393578737974167, + "learning_rate": 1.5671563984707066e-06, + "loss": 0.0517, + "num_input_tokens_seen": 118795984, + "step": 55050 + }, + { + "epoch": 8.981239804241435, + "grad_norm": 0.23017707467079163, + "learning_rate": 1.564676826502004e-06, + "loss": 0.0695, + "num_input_tokens_seen": 118804656, + "step": 55055 + }, + { + "epoch": 8.98205546492659, + "grad_norm": 1.4429186582565308, + "learning_rate": 1.5621991543195103e-06, + "loss": 0.0813, + "num_input_tokens_seen": 118814224, + "step": 55060 + }, + { + "epoch": 8.982871125611746, + "grad_norm": 0.08575435727834702, + "learning_rate": 1.5597233821240732e-06, + "loss": 0.0614, + "num_input_tokens_seen": 118825552, + "step": 55065 + }, + { + "epoch": 8.9836867862969, + "grad_norm": 0.13073912262916565, + "learning_rate": 1.557249510116393e-06, + "loss": 0.1512, + "num_input_tokens_seen": 118836752, + "step": 55070 + }, + { + "epoch": 8.984502446982056, + "grad_norm": 0.099863201379776, + "learning_rate": 1.5547775384970154e-06, + "loss": 0.1232, + "num_input_tokens_seen": 118846160, + "step": 55075 + }, + { + "epoch": 8.98531810766721, + "grad_norm": 0.08535605669021606, + "learning_rate": 1.5523074674663296e-06, + "loss": 0.0161, + "num_input_tokens_seen": 118857168, + "step": 55080 + }, + { + "epoch": 8.986133768352365, + "grad_norm": 0.2849293351173401, + "learning_rate": 1.549839297224573e-06, + "loss": 0.0747, + "num_input_tokens_seen": 118868432, + "step": 55085 + }, + { + "epoch": 8.986949429037521, + "grad_norm": 0.044623930007219315, + "learning_rate": 1.5473730279718296e-06, + "loss": 0.1802, + "num_input_tokens_seen": 118879120, + "step": 55090 + }, + { + "epoch": 8.987765089722675, + "grad_norm": 0.03758779913187027, + "learning_rate": 1.5449086599080204e-06, + "loss": 0.1113, + "num_input_tokens_seen": 118889040, + "step": 55095 + }, + { + "epoch": 8.98858075040783, + "grad_norm": 0.02371419034898281, + "learning_rate": 1.5424461932329298e-06, + "loss": 0.0221, + "num_input_tokens_seen": 118898608, + "step": 55100 + }, + { + "epoch": 8.989396411092985, + "grad_norm": 1.1576858758926392, + "learning_rate": 1.5399856281461734e-06, + "loss": 0.0538, + "num_input_tokens_seen": 118909264, + "step": 55105 + }, + { + "epoch": 8.99021207177814, + "grad_norm": 0.2184494137763977, + "learning_rate": 1.5375269648472162e-06, + "loss": 0.1923, + "num_input_tokens_seen": 118920048, + "step": 55110 + }, + { + "epoch": 8.991027732463294, + "grad_norm": 0.3261617422103882, + "learning_rate": 1.5350702035353716e-06, + "loss": 0.1063, + "num_input_tokens_seen": 118931120, + "step": 55115 + }, + { + "epoch": 8.99184339314845, + "grad_norm": 0.8547483086585999, + "learning_rate": 1.5326153444097934e-06, + "loss": 0.1145, + "num_input_tokens_seen": 118940464, + "step": 55120 + }, + { + "epoch": 8.992659053833606, + "grad_norm": 0.8926696181297302, + "learning_rate": 1.5301623876694898e-06, + "loss": 0.1892, + "num_input_tokens_seen": 118951120, + "step": 55125 + }, + { + "epoch": 8.99347471451876, + "grad_norm": 1.1082295179367065, + "learning_rate": 1.5277113335133097e-06, + "loss": 0.216, + "num_input_tokens_seen": 118961616, + "step": 55130 + }, + { + "epoch": 8.994290375203915, + "grad_norm": 0.16903094947338104, + "learning_rate": 1.5252621821399443e-06, + "loss": 0.1447, + "num_input_tokens_seen": 118973296, + "step": 55135 + }, + { + "epoch": 8.99510603588907, + "grad_norm": 2.2317216396331787, + "learning_rate": 1.5228149337479347e-06, + "loss": 0.2483, + "num_input_tokens_seen": 118983344, + "step": 55140 + }, + { + "epoch": 8.995921696574225, + "grad_norm": 0.08516483753919601, + "learning_rate": 1.5203695885356694e-06, + "loss": 0.0722, + "num_input_tokens_seen": 118994544, + "step": 55145 + }, + { + "epoch": 8.99673735725938, + "grad_norm": 0.7158263921737671, + "learning_rate": 1.5179261467013817e-06, + "loss": 0.1373, + "num_input_tokens_seen": 119005264, + "step": 55150 + }, + { + "epoch": 8.997553017944535, + "grad_norm": 1.6402103900909424, + "learning_rate": 1.5154846084431463e-06, + "loss": 0.2874, + "num_input_tokens_seen": 119016688, + "step": 55155 + }, + { + "epoch": 8.99836867862969, + "grad_norm": 1.3802820444107056, + "learning_rate": 1.513044973958888e-06, + "loss": 0.1526, + "num_input_tokens_seen": 119027856, + "step": 55160 + }, + { + "epoch": 8.999184339314844, + "grad_norm": 1.3076727390289307, + "learning_rate": 1.5106072434463742e-06, + "loss": 0.1945, + "num_input_tokens_seen": 119038000, + "step": 55165 + }, + { + "epoch": 9.0, + "grad_norm": 0.027284245938062668, + "learning_rate": 1.5081714171032186e-06, + "loss": 0.0326, + "num_input_tokens_seen": 119047920, + "step": 55170 + }, + { + "epoch": 9.0, + "eval_loss": 0.13717259466648102, + "eval_runtime": 131.7455, + "eval_samples_per_second": 20.684, + "eval_steps_per_second": 5.177, + "num_input_tokens_seen": 119047920, + "step": 55170 + }, + { + "epoch": 9.000815660685156, + "grad_norm": 0.8821390271186829, + "learning_rate": 1.5057374951268883e-06, + "loss": 0.1649, + "num_input_tokens_seen": 119058096, + "step": 55175 + }, + { + "epoch": 9.00163132137031, + "grad_norm": 1.5876213312149048, + "learning_rate": 1.5033054777146838e-06, + "loss": 0.1455, + "num_input_tokens_seen": 119068080, + "step": 55180 + }, + { + "epoch": 9.002446982055465, + "grad_norm": 2.08413028717041, + "learning_rate": 1.5008753650637585e-06, + "loss": 0.2092, + "num_input_tokens_seen": 119078576, + "step": 55185 + }, + { + "epoch": 9.00326264274062, + "grad_norm": 1.4062060117721558, + "learning_rate": 1.4984471573711105e-06, + "loss": 0.2228, + "num_input_tokens_seen": 119089808, + "step": 55190 + }, + { + "epoch": 9.004078303425775, + "grad_norm": 0.783659815788269, + "learning_rate": 1.4960208548335825e-06, + "loss": 0.1411, + "num_input_tokens_seen": 119100208, + "step": 55195 + }, + { + "epoch": 9.00489396411093, + "grad_norm": 0.07543804496526718, + "learning_rate": 1.4935964576478584e-06, + "loss": 0.0957, + "num_input_tokens_seen": 119111632, + "step": 55200 + }, + { + "epoch": 9.005709624796085, + "grad_norm": 0.39765772223472595, + "learning_rate": 1.4911739660104785e-06, + "loss": 0.0574, + "num_input_tokens_seen": 119122608, + "step": 55205 + }, + { + "epoch": 9.00652528548124, + "grad_norm": 0.1901390552520752, + "learning_rate": 1.4887533801178188e-06, + "loss": 0.0211, + "num_input_tokens_seen": 119132976, + "step": 55210 + }, + { + "epoch": 9.007340946166394, + "grad_norm": 1.9774962663650513, + "learning_rate": 1.486334700166106e-06, + "loss": 0.12, + "num_input_tokens_seen": 119143984, + "step": 55215 + }, + { + "epoch": 9.00815660685155, + "grad_norm": 2.216543674468994, + "learning_rate": 1.483917926351408e-06, + "loss": 0.1128, + "num_input_tokens_seen": 119154256, + "step": 55220 + }, + { + "epoch": 9.008972267536704, + "grad_norm": 0.40745317935943604, + "learning_rate": 1.4815030588696432e-06, + "loss": 0.173, + "num_input_tokens_seen": 119165712, + "step": 55225 + }, + { + "epoch": 9.00978792822186, + "grad_norm": 0.29424548149108887, + "learning_rate": 1.4790900979165717e-06, + "loss": 0.1129, + "num_input_tokens_seen": 119176496, + "step": 55230 + }, + { + "epoch": 9.010603588907015, + "grad_norm": 0.4499587416648865, + "learning_rate": 1.4766790436878035e-06, + "loss": 0.1256, + "num_input_tokens_seen": 119186640, + "step": 55235 + }, + { + "epoch": 9.01141924959217, + "grad_norm": 1.3626172542572021, + "learning_rate": 1.4742698963787854e-06, + "loss": 0.213, + "num_input_tokens_seen": 119197680, + "step": 55240 + }, + { + "epoch": 9.012234910277325, + "grad_norm": 0.24851930141448975, + "learning_rate": 1.4718626561848193e-06, + "loss": 0.0433, + "num_input_tokens_seen": 119208080, + "step": 55245 + }, + { + "epoch": 9.013050570962479, + "grad_norm": 1.7049161195755005, + "learning_rate": 1.469457323301046e-06, + "loss": 0.1495, + "num_input_tokens_seen": 119218896, + "step": 55250 + }, + { + "epoch": 9.013866231647635, + "grad_norm": 1.3961981534957886, + "learning_rate": 1.4670538979224547e-06, + "loss": 0.2916, + "num_input_tokens_seen": 119230032, + "step": 55255 + }, + { + "epoch": 9.01468189233279, + "grad_norm": 0.12481894344091415, + "learning_rate": 1.4646523802438805e-06, + "loss": 0.0428, + "num_input_tokens_seen": 119240112, + "step": 55260 + }, + { + "epoch": 9.015497553017944, + "grad_norm": 0.06117425113916397, + "learning_rate": 1.4622527704599986e-06, + "loss": 0.2967, + "num_input_tokens_seen": 119251184, + "step": 55265 + }, + { + "epoch": 9.0163132137031, + "grad_norm": 0.1815699338912964, + "learning_rate": 1.4598550687653394e-06, + "loss": 0.0317, + "num_input_tokens_seen": 119263216, + "step": 55270 + }, + { + "epoch": 9.017128874388254, + "grad_norm": 0.32242265343666077, + "learning_rate": 1.4574592753542698e-06, + "loss": 0.0952, + "num_input_tokens_seen": 119272112, + "step": 55275 + }, + { + "epoch": 9.01794453507341, + "grad_norm": 0.7713669538497925, + "learning_rate": 1.4550653904210038e-06, + "loss": 0.1972, + "num_input_tokens_seen": 119282352, + "step": 55280 + }, + { + "epoch": 9.018760195758565, + "grad_norm": 0.9028674960136414, + "learning_rate": 1.4526734141596026e-06, + "loss": 0.133, + "num_input_tokens_seen": 119293488, + "step": 55285 + }, + { + "epoch": 9.01957585644372, + "grad_norm": 2.0867831707000732, + "learning_rate": 1.4502833467639725e-06, + "loss": 0.0945, + "num_input_tokens_seen": 119304592, + "step": 55290 + }, + { + "epoch": 9.020391517128875, + "grad_norm": 0.9453493356704712, + "learning_rate": 1.4478951884278669e-06, + "loss": 0.2057, + "num_input_tokens_seen": 119315088, + "step": 55295 + }, + { + "epoch": 9.021207177814029, + "grad_norm": 1.058403730392456, + "learning_rate": 1.4455089393448778e-06, + "loss": 0.1455, + "num_input_tokens_seen": 119326416, + "step": 55300 + }, + { + "epoch": 9.022022838499185, + "grad_norm": 0.6336503624916077, + "learning_rate": 1.4431245997084425e-06, + "loss": 0.0857, + "num_input_tokens_seen": 119337296, + "step": 55305 + }, + { + "epoch": 9.022838499184338, + "grad_norm": 0.815473198890686, + "learning_rate": 1.4407421697118617e-06, + "loss": 0.1608, + "num_input_tokens_seen": 119347824, + "step": 55310 + }, + { + "epoch": 9.023654159869494, + "grad_norm": 0.22295431792736053, + "learning_rate": 1.438361649548256e-06, + "loss": 0.0676, + "num_input_tokens_seen": 119359216, + "step": 55315 + }, + { + "epoch": 9.02446982055465, + "grad_norm": 0.12202347815036774, + "learning_rate": 1.4359830394106071e-06, + "loss": 0.1102, + "num_input_tokens_seen": 119370800, + "step": 55320 + }, + { + "epoch": 9.025285481239804, + "grad_norm": 1.5599523782730103, + "learning_rate": 1.4336063394917333e-06, + "loss": 0.1846, + "num_input_tokens_seen": 119380944, + "step": 55325 + }, + { + "epoch": 9.02610114192496, + "grad_norm": 0.10876854509115219, + "learning_rate": 1.4312315499843077e-06, + "loss": 0.1055, + "num_input_tokens_seen": 119390608, + "step": 55330 + }, + { + "epoch": 9.026916802610113, + "grad_norm": 0.10631924867630005, + "learning_rate": 1.428858671080835e-06, + "loss": 0.0817, + "num_input_tokens_seen": 119401328, + "step": 55335 + }, + { + "epoch": 9.02773246329527, + "grad_norm": 1.1524336338043213, + "learning_rate": 1.4264877029736778e-06, + "loss": 0.0541, + "num_input_tokens_seen": 119411472, + "step": 55340 + }, + { + "epoch": 9.028548123980425, + "grad_norm": 0.059466652572155, + "learning_rate": 1.424118645855041e-06, + "loss": 0.0487, + "num_input_tokens_seen": 119423152, + "step": 55345 + }, + { + "epoch": 9.029363784665579, + "grad_norm": 1.4444092512130737, + "learning_rate": 1.4217514999169678e-06, + "loss": 0.1991, + "num_input_tokens_seen": 119434992, + "step": 55350 + }, + { + "epoch": 9.030179445350734, + "grad_norm": 0.14334996044635773, + "learning_rate": 1.4193862653513524e-06, + "loss": 0.2243, + "num_input_tokens_seen": 119445520, + "step": 55355 + }, + { + "epoch": 9.030995106035888, + "grad_norm": 0.7326728701591492, + "learning_rate": 1.4170229423499353e-06, + "loss": 0.1383, + "num_input_tokens_seen": 119456080, + "step": 55360 + }, + { + "epoch": 9.031810766721044, + "grad_norm": 0.21037764847278595, + "learning_rate": 1.4146615311042972e-06, + "loss": 0.0783, + "num_input_tokens_seen": 119467504, + "step": 55365 + }, + { + "epoch": 9.0326264274062, + "grad_norm": 0.7210637331008911, + "learning_rate": 1.4123020318058649e-06, + "loss": 0.184, + "num_input_tokens_seen": 119478896, + "step": 55370 + }, + { + "epoch": 9.033442088091354, + "grad_norm": 0.4622207283973694, + "learning_rate": 1.4099444446459138e-06, + "loss": 0.1113, + "num_input_tokens_seen": 119489840, + "step": 55375 + }, + { + "epoch": 9.03425774877651, + "grad_norm": 0.15429483354091644, + "learning_rate": 1.4075887698155599e-06, + "loss": 0.0793, + "num_input_tokens_seen": 119499472, + "step": 55380 + }, + { + "epoch": 9.035073409461663, + "grad_norm": 1.10573148727417, + "learning_rate": 1.4052350075057673e-06, + "loss": 0.1669, + "num_input_tokens_seen": 119510768, + "step": 55385 + }, + { + "epoch": 9.035889070146819, + "grad_norm": 0.23439574241638184, + "learning_rate": 1.4028831579073448e-06, + "loss": 0.2027, + "num_input_tokens_seen": 119521968, + "step": 55390 + }, + { + "epoch": 9.036704730831975, + "grad_norm": 0.013650942593812943, + "learning_rate": 1.4005332212109424e-06, + "loss": 0.13, + "num_input_tokens_seen": 119532912, + "step": 55395 + }, + { + "epoch": 9.037520391517129, + "grad_norm": 0.24013642966747284, + "learning_rate": 1.3981851976070603e-06, + "loss": 0.0321, + "num_input_tokens_seen": 119544208, + "step": 55400 + }, + { + "epoch": 9.038336052202284, + "grad_norm": 0.18756221234798431, + "learning_rate": 1.395839087286041e-06, + "loss": 0.0847, + "num_input_tokens_seen": 119554960, + "step": 55405 + }, + { + "epoch": 9.039151712887438, + "grad_norm": 0.06763814389705658, + "learning_rate": 1.3934948904380712e-06, + "loss": 0.1467, + "num_input_tokens_seen": 119566224, + "step": 55410 + }, + { + "epoch": 9.039967373572594, + "grad_norm": 0.25255176424980164, + "learning_rate": 1.3911526072531795e-06, + "loss": 0.1295, + "num_input_tokens_seen": 119577424, + "step": 55415 + }, + { + "epoch": 9.040783034257748, + "grad_norm": 0.13037313520908356, + "learning_rate": 1.3888122379212527e-06, + "loss": 0.1641, + "num_input_tokens_seen": 119588752, + "step": 55420 + }, + { + "epoch": 9.041598694942904, + "grad_norm": 0.17097066342830658, + "learning_rate": 1.3864737826320058e-06, + "loss": 0.0415, + "num_input_tokens_seen": 119600048, + "step": 55425 + }, + { + "epoch": 9.04241435562806, + "grad_norm": 1.5998867750167847, + "learning_rate": 1.3841372415750093e-06, + "loss": 0.2388, + "num_input_tokens_seen": 119608944, + "step": 55430 + }, + { + "epoch": 9.043230016313213, + "grad_norm": 0.07266071438789368, + "learning_rate": 1.381802614939673e-06, + "loss": 0.1232, + "num_input_tokens_seen": 119618512, + "step": 55435 + }, + { + "epoch": 9.044045676998369, + "grad_norm": 0.2362988442182541, + "learning_rate": 1.3794699029152563e-06, + "loss": 0.0341, + "num_input_tokens_seen": 119629040, + "step": 55440 + }, + { + "epoch": 9.044861337683523, + "grad_norm": 0.16838675737380981, + "learning_rate": 1.3771391056908555e-06, + "loss": 0.0643, + "num_input_tokens_seen": 119639248, + "step": 55445 + }, + { + "epoch": 9.045676998368679, + "grad_norm": 0.18277065455913544, + "learning_rate": 1.3748102234554222e-06, + "loss": 0.1442, + "num_input_tokens_seen": 119648848, + "step": 55450 + }, + { + "epoch": 9.046492659053834, + "grad_norm": 1.1582754850387573, + "learning_rate": 1.372483256397744e-06, + "loss": 0.0607, + "num_input_tokens_seen": 119660624, + "step": 55455 + }, + { + "epoch": 9.047308319738988, + "grad_norm": 0.608393132686615, + "learning_rate": 1.3701582047064592e-06, + "loss": 0.0799, + "num_input_tokens_seen": 119671120, + "step": 55460 + }, + { + "epoch": 9.048123980424144, + "grad_norm": 0.1412830799818039, + "learning_rate": 1.3678350685700447e-06, + "loss": 0.2685, + "num_input_tokens_seen": 119680720, + "step": 55465 + }, + { + "epoch": 9.048939641109298, + "grad_norm": 0.2998616099357605, + "learning_rate": 1.3655138481768303e-06, + "loss": 0.1878, + "num_input_tokens_seen": 119690896, + "step": 55470 + }, + { + "epoch": 9.049755301794454, + "grad_norm": 0.4124338626861572, + "learning_rate": 1.3631945437149823e-06, + "loss": 0.0811, + "num_input_tokens_seen": 119699824, + "step": 55475 + }, + { + "epoch": 9.05057096247961, + "grad_norm": 1.312759280204773, + "learning_rate": 1.3608771553725168e-06, + "loss": 0.2555, + "num_input_tokens_seen": 119709552, + "step": 55480 + }, + { + "epoch": 9.051386623164763, + "grad_norm": 0.4410901963710785, + "learning_rate": 1.3585616833372894e-06, + "loss": 0.0944, + "num_input_tokens_seen": 119721136, + "step": 55485 + }, + { + "epoch": 9.052202283849919, + "grad_norm": 0.2690143287181854, + "learning_rate": 1.3562481277970108e-06, + "loss": 0.1186, + "num_input_tokens_seen": 119731408, + "step": 55490 + }, + { + "epoch": 9.053017944535073, + "grad_norm": 0.032467957586050034, + "learning_rate": 1.3539364889392281e-06, + "loss": 0.2007, + "num_input_tokens_seen": 119743728, + "step": 55495 + }, + { + "epoch": 9.053833605220229, + "grad_norm": 1.7675975561141968, + "learning_rate": 1.3516267669513305e-06, + "loss": 0.2047, + "num_input_tokens_seen": 119754992, + "step": 55500 + }, + { + "epoch": 9.054649265905383, + "grad_norm": 0.67827308177948, + "learning_rate": 1.3493189620205572e-06, + "loss": 0.066, + "num_input_tokens_seen": 119765936, + "step": 55505 + }, + { + "epoch": 9.055464926590538, + "grad_norm": 0.02177887223660946, + "learning_rate": 1.3470130743339914e-06, + "loss": 0.0058, + "num_input_tokens_seen": 119776688, + "step": 55510 + }, + { + "epoch": 9.056280587275694, + "grad_norm": 1.0764429569244385, + "learning_rate": 1.3447091040785619e-06, + "loss": 0.0771, + "num_input_tokens_seen": 119787088, + "step": 55515 + }, + { + "epoch": 9.057096247960848, + "grad_norm": 0.07361742854118347, + "learning_rate": 1.342407051441033e-06, + "loss": 0.0688, + "num_input_tokens_seen": 119798576, + "step": 55520 + }, + { + "epoch": 9.057911908646004, + "grad_norm": 1.1726254224777222, + "learning_rate": 1.3401069166080278e-06, + "loss": 0.1881, + "num_input_tokens_seen": 119809424, + "step": 55525 + }, + { + "epoch": 9.058727569331158, + "grad_norm": 0.5475149750709534, + "learning_rate": 1.3378086997660077e-06, + "loss": 0.1047, + "num_input_tokens_seen": 119821232, + "step": 55530 + }, + { + "epoch": 9.059543230016313, + "grad_norm": 0.08910398930311203, + "learning_rate": 1.3355124011012744e-06, + "loss": 0.2054, + "num_input_tokens_seen": 119832688, + "step": 55535 + }, + { + "epoch": 9.060358890701469, + "grad_norm": 2.0887439250946045, + "learning_rate": 1.3332180207999783e-06, + "loss": 0.2823, + "num_input_tokens_seen": 119842736, + "step": 55540 + }, + { + "epoch": 9.061174551386623, + "grad_norm": 1.9292566776275635, + "learning_rate": 1.3309255590481129e-06, + "loss": 0.0656, + "num_input_tokens_seen": 119853808, + "step": 55545 + }, + { + "epoch": 9.061990212071779, + "grad_norm": 0.9731523394584656, + "learning_rate": 1.3286350160315181e-06, + "loss": 0.2631, + "num_input_tokens_seen": 119864976, + "step": 55550 + }, + { + "epoch": 9.062805872756933, + "grad_norm": 0.1081073209643364, + "learning_rate": 1.3263463919358759e-06, + "loss": 0.0734, + "num_input_tokens_seen": 119875312, + "step": 55555 + }, + { + "epoch": 9.063621533442088, + "grad_norm": 0.3739194869995117, + "learning_rate": 1.3240596869467158e-06, + "loss": 0.1427, + "num_input_tokens_seen": 119886704, + "step": 55560 + }, + { + "epoch": 9.064437194127244, + "grad_norm": 0.45904624462127686, + "learning_rate": 1.3217749012494062e-06, + "loss": 0.0267, + "num_input_tokens_seen": 119897424, + "step": 55565 + }, + { + "epoch": 9.065252854812398, + "grad_norm": 0.7881055474281311, + "learning_rate": 1.3194920350291657e-06, + "loss": 0.0952, + "num_input_tokens_seen": 119909520, + "step": 55570 + }, + { + "epoch": 9.066068515497554, + "grad_norm": 0.11125031113624573, + "learning_rate": 1.3172110884710541e-06, + "loss": 0.1308, + "num_input_tokens_seen": 119919056, + "step": 55575 + }, + { + "epoch": 9.066884176182707, + "grad_norm": 1.288626790046692, + "learning_rate": 1.314932061759977e-06, + "loss": 0.165, + "num_input_tokens_seen": 119930448, + "step": 55580 + }, + { + "epoch": 9.067699836867863, + "grad_norm": 0.2282382994890213, + "learning_rate": 1.3126549550806832e-06, + "loss": 0.0368, + "num_input_tokens_seen": 119939824, + "step": 55585 + }, + { + "epoch": 9.068515497553017, + "grad_norm": 1.3123608827590942, + "learning_rate": 1.310379768617767e-06, + "loss": 0.1471, + "num_input_tokens_seen": 119951152, + "step": 55590 + }, + { + "epoch": 9.069331158238173, + "grad_norm": 0.9705364108085632, + "learning_rate": 1.308106502555667e-06, + "loss": 0.0902, + "num_input_tokens_seen": 119961104, + "step": 55595 + }, + { + "epoch": 9.070146818923329, + "grad_norm": 0.8544520735740662, + "learning_rate": 1.3058351570786665e-06, + "loss": 0.1266, + "num_input_tokens_seen": 119971920, + "step": 55600 + }, + { + "epoch": 9.070962479608482, + "grad_norm": 0.05606049671769142, + "learning_rate": 1.3035657323708927e-06, + "loss": 0.1602, + "num_input_tokens_seen": 119980944, + "step": 55605 + }, + { + "epoch": 9.071778140293638, + "grad_norm": 0.45078328251838684, + "learning_rate": 1.3012982286163129e-06, + "loss": 0.209, + "num_input_tokens_seen": 119991792, + "step": 55610 + }, + { + "epoch": 9.072593800978792, + "grad_norm": 0.8267039656639099, + "learning_rate": 1.2990326459987434e-06, + "loss": 0.0697, + "num_input_tokens_seen": 120002064, + "step": 55615 + }, + { + "epoch": 9.073409461663948, + "grad_norm": 0.8613330721855164, + "learning_rate": 1.296768984701846e-06, + "loss": 0.2126, + "num_input_tokens_seen": 120012560, + "step": 55620 + }, + { + "epoch": 9.074225122349104, + "grad_norm": 0.0809059888124466, + "learning_rate": 1.2945072449091212e-06, + "loss": 0.1571, + "num_input_tokens_seen": 120023568, + "step": 55625 + }, + { + "epoch": 9.075040783034257, + "grad_norm": 0.2890501916408539, + "learning_rate": 1.292247426803922e-06, + "loss": 0.2263, + "num_input_tokens_seen": 120034448, + "step": 55630 + }, + { + "epoch": 9.075856443719413, + "grad_norm": 0.4074615240097046, + "learning_rate": 1.2899895305694408e-06, + "loss": 0.1672, + "num_input_tokens_seen": 120046096, + "step": 55635 + }, + { + "epoch": 9.076672104404567, + "grad_norm": 0.5070456862449646, + "learning_rate": 1.2877335563887095e-06, + "loss": 0.0542, + "num_input_tokens_seen": 120055344, + "step": 55640 + }, + { + "epoch": 9.077487765089723, + "grad_norm": 0.3149697184562683, + "learning_rate": 1.2854795044446116e-06, + "loss": 0.0831, + "num_input_tokens_seen": 120065808, + "step": 55645 + }, + { + "epoch": 9.078303425774878, + "grad_norm": 0.10469359159469604, + "learning_rate": 1.2832273749198708e-06, + "loss": 0.0517, + "num_input_tokens_seen": 120075472, + "step": 55650 + }, + { + "epoch": 9.079119086460032, + "grad_norm": 0.10680131614208221, + "learning_rate": 1.2809771679970522e-06, + "loss": 0.112, + "num_input_tokens_seen": 120085072, + "step": 55655 + }, + { + "epoch": 9.079934747145188, + "grad_norm": 0.16860367357730865, + "learning_rate": 1.2787288838585793e-06, + "loss": 0.1303, + "num_input_tokens_seen": 120096304, + "step": 55660 + }, + { + "epoch": 9.080750407830342, + "grad_norm": 0.15713386237621307, + "learning_rate": 1.2764825226867005e-06, + "loss": 0.0186, + "num_input_tokens_seen": 120105968, + "step": 55665 + }, + { + "epoch": 9.081566068515498, + "grad_norm": 0.04607308283448219, + "learning_rate": 1.2742380846635231e-06, + "loss": 0.1144, + "num_input_tokens_seen": 120117360, + "step": 55670 + }, + { + "epoch": 9.082381729200652, + "grad_norm": 1.189503788948059, + "learning_rate": 1.2719955699709907e-06, + "loss": 0.1252, + "num_input_tokens_seen": 120128464, + "step": 55675 + }, + { + "epoch": 9.083197389885807, + "grad_norm": 0.25766411423683167, + "learning_rate": 1.2697549787908908e-06, + "loss": 0.0839, + "num_input_tokens_seen": 120139440, + "step": 55680 + }, + { + "epoch": 9.084013050570963, + "grad_norm": 0.851627767086029, + "learning_rate": 1.267516311304856e-06, + "loss": 0.1043, + "num_input_tokens_seen": 120149392, + "step": 55685 + }, + { + "epoch": 9.084828711256117, + "grad_norm": 1.7683348655700684, + "learning_rate": 1.265279567694369e-06, + "loss": 0.255, + "num_input_tokens_seen": 120160848, + "step": 55690 + }, + { + "epoch": 9.085644371941273, + "grad_norm": 0.20737676322460175, + "learning_rate": 1.2630447481407486e-06, + "loss": 0.0229, + "num_input_tokens_seen": 120172368, + "step": 55695 + }, + { + "epoch": 9.086460032626427, + "grad_norm": 0.09393700957298279, + "learning_rate": 1.2608118528251611e-06, + "loss": 0.0873, + "num_input_tokens_seen": 120183184, + "step": 55700 + }, + { + "epoch": 9.087275693311582, + "grad_norm": 0.2825249135494232, + "learning_rate": 1.2585808819286172e-06, + "loss": 0.1042, + "num_input_tokens_seen": 120194480, + "step": 55705 + }, + { + "epoch": 9.088091353996738, + "grad_norm": 0.13539622724056244, + "learning_rate": 1.2563518356319664e-06, + "loss": 0.1789, + "num_input_tokens_seen": 120205616, + "step": 55710 + }, + { + "epoch": 9.088907014681892, + "grad_norm": 0.5594211220741272, + "learning_rate": 1.2541247141159119e-06, + "loss": 0.1379, + "num_input_tokens_seen": 120216496, + "step": 55715 + }, + { + "epoch": 9.089722675367048, + "grad_norm": 1.9805740118026733, + "learning_rate": 1.2518995175609949e-06, + "loss": 0.2826, + "num_input_tokens_seen": 120226896, + "step": 55720 + }, + { + "epoch": 9.090538336052202, + "grad_norm": 0.539701521396637, + "learning_rate": 1.2496762461475992e-06, + "loss": 0.1278, + "num_input_tokens_seen": 120236400, + "step": 55725 + }, + { + "epoch": 9.091353996737357, + "grad_norm": 0.4544152319431305, + "learning_rate": 1.2474549000559527e-06, + "loss": 0.036, + "num_input_tokens_seen": 120247920, + "step": 55730 + }, + { + "epoch": 9.092169657422513, + "grad_norm": 0.07925290614366531, + "learning_rate": 1.245235479466131e-06, + "loss": 0.0682, + "num_input_tokens_seen": 120258512, + "step": 55735 + }, + { + "epoch": 9.092985318107667, + "grad_norm": 1.5355122089385986, + "learning_rate": 1.2430179845580537e-06, + "loss": 0.0793, + "num_input_tokens_seen": 120270032, + "step": 55740 + }, + { + "epoch": 9.093800978792823, + "grad_norm": 0.14366364479064941, + "learning_rate": 1.24080241551148e-06, + "loss": 0.0221, + "num_input_tokens_seen": 120280816, + "step": 55745 + }, + { + "epoch": 9.094616639477977, + "grad_norm": 0.2779012620449066, + "learning_rate": 1.2385887725060135e-06, + "loss": 0.1398, + "num_input_tokens_seen": 120291664, + "step": 55750 + }, + { + "epoch": 9.095432300163132, + "grad_norm": 1.1971906423568726, + "learning_rate": 1.236377055721108e-06, + "loss": 0.1299, + "num_input_tokens_seen": 120302320, + "step": 55755 + }, + { + "epoch": 9.096247960848286, + "grad_norm": 0.33632466197013855, + "learning_rate": 1.234167265336053e-06, + "loss": 0.0863, + "num_input_tokens_seen": 120312688, + "step": 55760 + }, + { + "epoch": 9.097063621533442, + "grad_norm": 0.6790722608566284, + "learning_rate": 1.2319594015299862e-06, + "loss": 0.0818, + "num_input_tokens_seen": 120322928, + "step": 55765 + }, + { + "epoch": 9.097879282218598, + "grad_norm": 0.15036360919475555, + "learning_rate": 1.2297534644818891e-06, + "loss": 0.1075, + "num_input_tokens_seen": 120332080, + "step": 55770 + }, + { + "epoch": 9.098694942903752, + "grad_norm": 0.09894292801618576, + "learning_rate": 1.227549454370583e-06, + "loss": 0.1629, + "num_input_tokens_seen": 120344400, + "step": 55775 + }, + { + "epoch": 9.099510603588907, + "grad_norm": 1.6577095985412598, + "learning_rate": 1.225347371374741e-06, + "loss": 0.161, + "num_input_tokens_seen": 120355344, + "step": 55780 + }, + { + "epoch": 9.100326264274061, + "grad_norm": 0.34476450085639954, + "learning_rate": 1.2231472156728707e-06, + "loss": 0.1047, + "num_input_tokens_seen": 120366640, + "step": 55785 + }, + { + "epoch": 9.101141924959217, + "grad_norm": 0.06182524561882019, + "learning_rate": 1.2209489874433294e-06, + "loss": 0.1148, + "num_input_tokens_seen": 120377584, + "step": 55790 + }, + { + "epoch": 9.101957585644373, + "grad_norm": 0.07558123767375946, + "learning_rate": 1.2187526868643162e-06, + "loss": 0.022, + "num_input_tokens_seen": 120388368, + "step": 55795 + }, + { + "epoch": 9.102773246329527, + "grad_norm": 0.09640809148550034, + "learning_rate": 1.2165583141138748e-06, + "loss": 0.0632, + "num_input_tokens_seen": 120399312, + "step": 55800 + }, + { + "epoch": 9.103588907014682, + "grad_norm": 1.623610019683838, + "learning_rate": 1.2143658693698933e-06, + "loss": 0.2367, + "num_input_tokens_seen": 120409616, + "step": 55805 + }, + { + "epoch": 9.104404567699836, + "grad_norm": 0.8023285269737244, + "learning_rate": 1.212175352810102e-06, + "loss": 0.2464, + "num_input_tokens_seen": 120421424, + "step": 55810 + }, + { + "epoch": 9.105220228384992, + "grad_norm": 0.8180760145187378, + "learning_rate": 1.2099867646120754e-06, + "loss": 0.1083, + "num_input_tokens_seen": 120431088, + "step": 55815 + }, + { + "epoch": 9.106035889070148, + "grad_norm": 0.23242220282554626, + "learning_rate": 1.20780010495323e-06, + "loss": 0.018, + "num_input_tokens_seen": 120442320, + "step": 55820 + }, + { + "epoch": 9.106851549755302, + "grad_norm": 1.4026901721954346, + "learning_rate": 1.2056153740108295e-06, + "loss": 0.1423, + "num_input_tokens_seen": 120453104, + "step": 55825 + }, + { + "epoch": 9.107667210440457, + "grad_norm": 0.03979835659265518, + "learning_rate": 1.2034325719619794e-06, + "loss": 0.0762, + "num_input_tokens_seen": 120463536, + "step": 55830 + }, + { + "epoch": 9.108482871125611, + "grad_norm": 0.5419539213180542, + "learning_rate": 1.2012516989836242e-06, + "loss": 0.1301, + "num_input_tokens_seen": 120474672, + "step": 55835 + }, + { + "epoch": 9.109298531810767, + "grad_norm": 1.330557107925415, + "learning_rate": 1.1990727552525588e-06, + "loss": 0.0639, + "num_input_tokens_seen": 120484208, + "step": 55840 + }, + { + "epoch": 9.11011419249592, + "grad_norm": 1.4435955286026, + "learning_rate": 1.196895740945425e-06, + "loss": 0.1553, + "num_input_tokens_seen": 120494896, + "step": 55845 + }, + { + "epoch": 9.110929853181077, + "grad_norm": 1.2285466194152832, + "learning_rate": 1.194720656238696e-06, + "loss": 0.0774, + "num_input_tokens_seen": 120505936, + "step": 55850 + }, + { + "epoch": 9.111745513866232, + "grad_norm": 0.35326939821243286, + "learning_rate": 1.1925475013086968e-06, + "loss": 0.0208, + "num_input_tokens_seen": 120516944, + "step": 55855 + }, + { + "epoch": 9.112561174551386, + "grad_norm": 2.261167049407959, + "learning_rate": 1.190376276331598e-06, + "loss": 0.0671, + "num_input_tokens_seen": 120527344, + "step": 55860 + }, + { + "epoch": 9.113376835236542, + "grad_norm": 0.576062798500061, + "learning_rate": 1.1882069814834057e-06, + "loss": 0.0524, + "num_input_tokens_seen": 120537520, + "step": 55865 + }, + { + "epoch": 9.114192495921696, + "grad_norm": 0.1632242053747177, + "learning_rate": 1.186039616939974e-06, + "loss": 0.1088, + "num_input_tokens_seen": 120549072, + "step": 55870 + }, + { + "epoch": 9.115008156606851, + "grad_norm": 1.8507659435272217, + "learning_rate": 1.1838741828770039e-06, + "loss": 0.1444, + "num_input_tokens_seen": 120559888, + "step": 55875 + }, + { + "epoch": 9.115823817292007, + "grad_norm": 0.15604501962661743, + "learning_rate": 1.1817106794700327e-06, + "loss": 0.1168, + "num_input_tokens_seen": 120571504, + "step": 55880 + }, + { + "epoch": 9.116639477977161, + "grad_norm": 1.056790828704834, + "learning_rate": 1.1795491068944453e-06, + "loss": 0.1915, + "num_input_tokens_seen": 120581456, + "step": 55885 + }, + { + "epoch": 9.117455138662317, + "grad_norm": 0.403936505317688, + "learning_rate": 1.1773894653254736e-06, + "loss": 0.1199, + "num_input_tokens_seen": 120592080, + "step": 55890 + }, + { + "epoch": 9.11827079934747, + "grad_norm": 0.1703948676586151, + "learning_rate": 1.1752317549381857e-06, + "loss": 0.0606, + "num_input_tokens_seen": 120602128, + "step": 55895 + }, + { + "epoch": 9.119086460032626, + "grad_norm": 0.4984230399131775, + "learning_rate": 1.1730759759074978e-06, + "loss": 0.0501, + "num_input_tokens_seen": 120614352, + "step": 55900 + }, + { + "epoch": 9.119902120717782, + "grad_norm": 0.756306529045105, + "learning_rate": 1.1709221284081666e-06, + "loss": 0.0461, + "num_input_tokens_seen": 120625424, + "step": 55905 + }, + { + "epoch": 9.120717781402936, + "grad_norm": 1.2447696924209595, + "learning_rate": 1.1687702126147976e-06, + "loss": 0.1351, + "num_input_tokens_seen": 120635536, + "step": 55910 + }, + { + "epoch": 9.121533442088092, + "grad_norm": 0.07124149054288864, + "learning_rate": 1.1666202287018313e-06, + "loss": 0.1659, + "num_input_tokens_seen": 120646064, + "step": 55915 + }, + { + "epoch": 9.122349102773246, + "grad_norm": 0.17857323586940765, + "learning_rate": 1.1644721768435617e-06, + "loss": 0.0148, + "num_input_tokens_seen": 120657072, + "step": 55920 + }, + { + "epoch": 9.123164763458401, + "grad_norm": 0.6256363987922668, + "learning_rate": 1.1623260572141137e-06, + "loss": 0.1433, + "num_input_tokens_seen": 120668816, + "step": 55925 + }, + { + "epoch": 9.123980424143557, + "grad_norm": 0.34643787145614624, + "learning_rate": 1.16018186998747e-06, + "loss": 0.0849, + "num_input_tokens_seen": 120679248, + "step": 55930 + }, + { + "epoch": 9.124796084828711, + "grad_norm": 0.3880656063556671, + "learning_rate": 1.1580396153374446e-06, + "loss": 0.0584, + "num_input_tokens_seen": 120690096, + "step": 55935 + }, + { + "epoch": 9.125611745513867, + "grad_norm": 0.9512143731117249, + "learning_rate": 1.1558992934376982e-06, + "loss": 0.0693, + "num_input_tokens_seen": 120702352, + "step": 55940 + }, + { + "epoch": 9.12642740619902, + "grad_norm": 0.20574769377708435, + "learning_rate": 1.1537609044617398e-06, + "loss": 0.1245, + "num_input_tokens_seen": 120713072, + "step": 55945 + }, + { + "epoch": 9.127243066884176, + "grad_norm": 0.09637373685836792, + "learning_rate": 1.1516244485829193e-06, + "loss": 0.0756, + "num_input_tokens_seen": 120725200, + "step": 55950 + }, + { + "epoch": 9.12805872756933, + "grad_norm": 2.12496018409729, + "learning_rate": 1.1494899259744258e-06, + "loss": 0.1577, + "num_input_tokens_seen": 120735696, + "step": 55955 + }, + { + "epoch": 9.128874388254486, + "grad_norm": 1.174201250076294, + "learning_rate": 1.147357336809296e-06, + "loss": 0.2282, + "num_input_tokens_seen": 120747344, + "step": 55960 + }, + { + "epoch": 9.129690048939642, + "grad_norm": 1.659515380859375, + "learning_rate": 1.1452266812604056e-06, + "loss": 0.1864, + "num_input_tokens_seen": 120758416, + "step": 55965 + }, + { + "epoch": 9.130505709624796, + "grad_norm": 1.8332897424697876, + "learning_rate": 1.1430979595004777e-06, + "loss": 0.2091, + "num_input_tokens_seen": 120769552, + "step": 55970 + }, + { + "epoch": 9.131321370309951, + "grad_norm": 0.4731445014476776, + "learning_rate": 1.1409711717020794e-06, + "loss": 0.2072, + "num_input_tokens_seen": 120780656, + "step": 55975 + }, + { + "epoch": 9.132137030995105, + "grad_norm": 0.5381813049316406, + "learning_rate": 1.1388463180376175e-06, + "loss": 0.1142, + "num_input_tokens_seen": 120792048, + "step": 55980 + }, + { + "epoch": 9.132952691680261, + "grad_norm": 0.3569954037666321, + "learning_rate": 1.1367233986793429e-06, + "loss": 0.1876, + "num_input_tokens_seen": 120802224, + "step": 55985 + }, + { + "epoch": 9.133768352365417, + "grad_norm": 0.3689451813697815, + "learning_rate": 1.1346024137993516e-06, + "loss": 0.0915, + "num_input_tokens_seen": 120812720, + "step": 55990 + }, + { + "epoch": 9.13458401305057, + "grad_norm": 1.4047397375106812, + "learning_rate": 1.1324833635695808e-06, + "loss": 0.1673, + "num_input_tokens_seen": 120824880, + "step": 55995 + }, + { + "epoch": 9.135399673735726, + "grad_norm": 0.15661588311195374, + "learning_rate": 1.13036624816181e-06, + "loss": 0.0858, + "num_input_tokens_seen": 120835632, + "step": 56000 + }, + { + "epoch": 9.13621533442088, + "grad_norm": 0.09750670194625854, + "learning_rate": 1.1282510677476655e-06, + "loss": 0.0362, + "num_input_tokens_seen": 120847984, + "step": 56005 + }, + { + "epoch": 9.137030995106036, + "grad_norm": 0.45589718222618103, + "learning_rate": 1.126137822498613e-06, + "loss": 0.2551, + "num_input_tokens_seen": 120858832, + "step": 56010 + }, + { + "epoch": 9.137846655791192, + "grad_norm": 0.5648294687271118, + "learning_rate": 1.1240265125859628e-06, + "loss": 0.1039, + "num_input_tokens_seen": 120869872, + "step": 56015 + }, + { + "epoch": 9.138662316476346, + "grad_norm": 1.1323901414871216, + "learning_rate": 1.1219171381808696e-06, + "loss": 0.1544, + "num_input_tokens_seen": 120879920, + "step": 56020 + }, + { + "epoch": 9.139477977161501, + "grad_norm": 0.040440067648887634, + "learning_rate": 1.11980969945433e-06, + "loss": 0.1767, + "num_input_tokens_seen": 120891088, + "step": 56025 + }, + { + "epoch": 9.140293637846655, + "grad_norm": 0.6602320075035095, + "learning_rate": 1.1177041965771823e-06, + "loss": 0.0857, + "num_input_tokens_seen": 120901968, + "step": 56030 + }, + { + "epoch": 9.141109298531811, + "grad_norm": 2.2766456604003906, + "learning_rate": 1.1156006297201093e-06, + "loss": 0.17, + "num_input_tokens_seen": 120912176, + "step": 56035 + }, + { + "epoch": 9.141924959216965, + "grad_norm": 0.5805887579917908, + "learning_rate": 1.1134989990536387e-06, + "loss": 0.0377, + "num_input_tokens_seen": 120923216, + "step": 56040 + }, + { + "epoch": 9.14274061990212, + "grad_norm": 0.7021986842155457, + "learning_rate": 1.1113993047481369e-06, + "loss": 0.1979, + "num_input_tokens_seen": 120934448, + "step": 56045 + }, + { + "epoch": 9.143556280587276, + "grad_norm": 0.62708580493927, + "learning_rate": 1.1093015469738177e-06, + "loss": 0.0698, + "num_input_tokens_seen": 120944880, + "step": 56050 + }, + { + "epoch": 9.14437194127243, + "grad_norm": 1.6721562147140503, + "learning_rate": 1.107205725900734e-06, + "loss": 0.1856, + "num_input_tokens_seen": 120956304, + "step": 56055 + }, + { + "epoch": 9.145187601957586, + "grad_norm": 0.8806193470954895, + "learning_rate": 1.105111841698786e-06, + "loss": 0.0834, + "num_input_tokens_seen": 120966320, + "step": 56060 + }, + { + "epoch": 9.14600326264274, + "grad_norm": 0.8328976631164551, + "learning_rate": 1.1030198945377128e-06, + "loss": 0.074, + "num_input_tokens_seen": 120977648, + "step": 56065 + }, + { + "epoch": 9.146818923327896, + "grad_norm": 1.3944063186645508, + "learning_rate": 1.1009298845871013e-06, + "loss": 0.1113, + "num_input_tokens_seen": 120987728, + "step": 56070 + }, + { + "epoch": 9.147634584013051, + "grad_norm": 1.1594120264053345, + "learning_rate": 1.098841812016374e-06, + "loss": 0.0967, + "num_input_tokens_seen": 120998640, + "step": 56075 + }, + { + "epoch": 9.148450244698205, + "grad_norm": 0.08187129348516464, + "learning_rate": 1.096755676994804e-06, + "loss": 0.101, + "num_input_tokens_seen": 121008912, + "step": 56080 + }, + { + "epoch": 9.149265905383361, + "grad_norm": 1.9698723554611206, + "learning_rate": 1.0946714796915032e-06, + "loss": 0.1874, + "num_input_tokens_seen": 121019184, + "step": 56085 + }, + { + "epoch": 9.150081566068515, + "grad_norm": 1.1204243898391724, + "learning_rate": 1.092589220275425e-06, + "loss": 0.1526, + "num_input_tokens_seen": 121031184, + "step": 56090 + }, + { + "epoch": 9.15089722675367, + "grad_norm": 2.209731340408325, + "learning_rate": 1.0905088989153712e-06, + "loss": 0.3394, + "num_input_tokens_seen": 121042672, + "step": 56095 + }, + { + "epoch": 9.151712887438826, + "grad_norm": 1.2366057634353638, + "learning_rate": 1.0884305157799785e-06, + "loss": 0.2634, + "num_input_tokens_seen": 121053520, + "step": 56100 + }, + { + "epoch": 9.15252854812398, + "grad_norm": 0.2566447854042053, + "learning_rate": 1.0863540710377373e-06, + "loss": 0.074, + "num_input_tokens_seen": 121064688, + "step": 56105 + }, + { + "epoch": 9.153344208809136, + "grad_norm": 0.9382874965667725, + "learning_rate": 1.0842795648569688e-06, + "loss": 0.3079, + "num_input_tokens_seen": 121075088, + "step": 56110 + }, + { + "epoch": 9.15415986949429, + "grad_norm": 0.20569920539855957, + "learning_rate": 1.0822069974058464e-06, + "loss": 0.1198, + "num_input_tokens_seen": 121085424, + "step": 56115 + }, + { + "epoch": 9.154975530179446, + "grad_norm": 0.37257805466651917, + "learning_rate": 1.0801363688523858e-06, + "loss": 0.0999, + "num_input_tokens_seen": 121097072, + "step": 56120 + }, + { + "epoch": 9.1557911908646, + "grad_norm": 0.9185793399810791, + "learning_rate": 1.0780676793644362e-06, + "loss": 0.0865, + "num_input_tokens_seen": 121108880, + "step": 56125 + }, + { + "epoch": 9.156606851549755, + "grad_norm": 0.4443438947200775, + "learning_rate": 1.0760009291097022e-06, + "loss": 0.1295, + "num_input_tokens_seen": 121120432, + "step": 56130 + }, + { + "epoch": 9.15742251223491, + "grad_norm": 1.9627124071121216, + "learning_rate": 1.0739361182557194e-06, + "loss": 0.2847, + "num_input_tokens_seen": 121131952, + "step": 56135 + }, + { + "epoch": 9.158238172920065, + "grad_norm": 0.0887700617313385, + "learning_rate": 1.071873246969876e-06, + "loss": 0.0579, + "num_input_tokens_seen": 121140304, + "step": 56140 + }, + { + "epoch": 9.15905383360522, + "grad_norm": 0.09492816776037216, + "learning_rate": 1.0698123154193967e-06, + "loss": 0.1142, + "num_input_tokens_seen": 121150608, + "step": 56145 + }, + { + "epoch": 9.159869494290374, + "grad_norm": 0.08051688969135284, + "learning_rate": 1.0677533237713533e-06, + "loss": 0.0884, + "num_input_tokens_seen": 121161296, + "step": 56150 + }, + { + "epoch": 9.16068515497553, + "grad_norm": 0.09504478424787521, + "learning_rate": 1.0656962721926539e-06, + "loss": 0.1378, + "num_input_tokens_seen": 121172976, + "step": 56155 + }, + { + "epoch": 9.161500815660686, + "grad_norm": 0.25473299622535706, + "learning_rate": 1.063641160850054e-06, + "loss": 0.0416, + "num_input_tokens_seen": 121183024, + "step": 56160 + }, + { + "epoch": 9.16231647634584, + "grad_norm": 1.6258293390274048, + "learning_rate": 1.0615879899101567e-06, + "loss": 0.1565, + "num_input_tokens_seen": 121194032, + "step": 56165 + }, + { + "epoch": 9.163132137030995, + "grad_norm": 0.5978771448135376, + "learning_rate": 1.0595367595393978e-06, + "loss": 0.061, + "num_input_tokens_seen": 121205296, + "step": 56170 + }, + { + "epoch": 9.16394779771615, + "grad_norm": 0.12101197987794876, + "learning_rate": 1.0574874699040643e-06, + "loss": 0.0485, + "num_input_tokens_seen": 121217168, + "step": 56175 + }, + { + "epoch": 9.164763458401305, + "grad_norm": 2.027078866958618, + "learning_rate": 1.0554401211702787e-06, + "loss": 0.3768, + "num_input_tokens_seen": 121228176, + "step": 56180 + }, + { + "epoch": 9.16557911908646, + "grad_norm": 0.045387350022792816, + "learning_rate": 1.0533947135040106e-06, + "loss": 0.0401, + "num_input_tokens_seen": 121237968, + "step": 56185 + }, + { + "epoch": 9.166394779771615, + "grad_norm": 0.15678735077381134, + "learning_rate": 1.0513512470710695e-06, + "loss": 0.0887, + "num_input_tokens_seen": 121248048, + "step": 56190 + }, + { + "epoch": 9.16721044045677, + "grad_norm": 0.2806219160556793, + "learning_rate": 1.0493097220371117e-06, + "loss": 0.0746, + "num_input_tokens_seen": 121259088, + "step": 56195 + }, + { + "epoch": 9.168026101141924, + "grad_norm": 0.12474717944860458, + "learning_rate": 1.0472701385676326e-06, + "loss": 0.2538, + "num_input_tokens_seen": 121270416, + "step": 56200 + }, + { + "epoch": 9.16884176182708, + "grad_norm": 0.34797757863998413, + "learning_rate": 1.045232496827972e-06, + "loss": 0.0904, + "num_input_tokens_seen": 121281424, + "step": 56205 + }, + { + "epoch": 9.169657422512234, + "grad_norm": 1.6000549793243408, + "learning_rate": 1.043196796983309e-06, + "loss": 0.1133, + "num_input_tokens_seen": 121292496, + "step": 56210 + }, + { + "epoch": 9.17047308319739, + "grad_norm": 1.0061273574829102, + "learning_rate": 1.0411630391986698e-06, + "loss": 0.0582, + "num_input_tokens_seen": 121303280, + "step": 56215 + }, + { + "epoch": 9.171288743882545, + "grad_norm": 0.3543042242527008, + "learning_rate": 1.03913122363892e-06, + "loss": 0.0643, + "num_input_tokens_seen": 121315248, + "step": 56220 + }, + { + "epoch": 9.1721044045677, + "grad_norm": 0.6341392397880554, + "learning_rate": 1.0371013504687692e-06, + "loss": 0.1314, + "num_input_tokens_seen": 121326256, + "step": 56225 + }, + { + "epoch": 9.172920065252855, + "grad_norm": 0.1929095983505249, + "learning_rate": 1.0350734198527696e-06, + "loss": 0.2337, + "num_input_tokens_seen": 121337264, + "step": 56230 + }, + { + "epoch": 9.173735725938009, + "grad_norm": 0.3662034571170807, + "learning_rate": 1.033047431955317e-06, + "loss": 0.1401, + "num_input_tokens_seen": 121347600, + "step": 56235 + }, + { + "epoch": 9.174551386623165, + "grad_norm": 0.14751145243644714, + "learning_rate": 1.0310233869406437e-06, + "loss": 0.1984, + "num_input_tokens_seen": 121358800, + "step": 56240 + }, + { + "epoch": 9.17536704730832, + "grad_norm": 0.25707221031188965, + "learning_rate": 1.0290012849728358e-06, + "loss": 0.0323, + "num_input_tokens_seen": 121369712, + "step": 56245 + }, + { + "epoch": 9.176182707993474, + "grad_norm": 0.3136082887649536, + "learning_rate": 1.0269811262158092e-06, + "loss": 0.1179, + "num_input_tokens_seen": 121380784, + "step": 56250 + }, + { + "epoch": 9.17699836867863, + "grad_norm": 0.2915656268596649, + "learning_rate": 1.024962910833327e-06, + "loss": 0.0623, + "num_input_tokens_seen": 121392752, + "step": 56255 + }, + { + "epoch": 9.177814029363784, + "grad_norm": 0.09292411059141159, + "learning_rate": 1.022946638989003e-06, + "loss": 0.1398, + "num_input_tokens_seen": 121402928, + "step": 56260 + }, + { + "epoch": 9.17862969004894, + "grad_norm": 0.0576816163957119, + "learning_rate": 1.0209323108462816e-06, + "loss": 0.0325, + "num_input_tokens_seen": 121413008, + "step": 56265 + }, + { + "epoch": 9.179445350734095, + "grad_norm": 0.1853405237197876, + "learning_rate": 1.018919926568457e-06, + "loss": 0.1134, + "num_input_tokens_seen": 121422896, + "step": 56270 + }, + { + "epoch": 9.18026101141925, + "grad_norm": 0.34472212195396423, + "learning_rate": 1.0169094863186623e-06, + "loss": 0.1719, + "num_input_tokens_seen": 121434512, + "step": 56275 + }, + { + "epoch": 9.181076672104405, + "grad_norm": 0.6352803707122803, + "learning_rate": 1.0149009902598706e-06, + "loss": 0.2569, + "num_input_tokens_seen": 121445840, + "step": 56280 + }, + { + "epoch": 9.181892332789559, + "grad_norm": 0.3028910458087921, + "learning_rate": 1.0128944385549038e-06, + "loss": 0.1185, + "num_input_tokens_seen": 121457264, + "step": 56285 + }, + { + "epoch": 9.182707993474715, + "grad_norm": 0.19224010407924652, + "learning_rate": 1.0108898313664267e-06, + "loss": 0.0139, + "num_input_tokens_seen": 121467728, + "step": 56290 + }, + { + "epoch": 9.18352365415987, + "grad_norm": 0.0701630562543869, + "learning_rate": 1.0088871688569397e-06, + "loss": 0.1975, + "num_input_tokens_seen": 121478608, + "step": 56295 + }, + { + "epoch": 9.184339314845024, + "grad_norm": 0.6319813132286072, + "learning_rate": 1.006886451188785e-06, + "loss": 0.0744, + "num_input_tokens_seen": 121489488, + "step": 56300 + }, + { + "epoch": 9.18515497553018, + "grad_norm": 0.1711645871400833, + "learning_rate": 1.0048876785241578e-06, + "loss": 0.1609, + "num_input_tokens_seen": 121500400, + "step": 56305 + }, + { + "epoch": 9.185970636215334, + "grad_norm": 0.16338583827018738, + "learning_rate": 1.0028908510250846e-06, + "loss": 0.1156, + "num_input_tokens_seen": 121511088, + "step": 56310 + }, + { + "epoch": 9.18678629690049, + "grad_norm": 0.32292330265045166, + "learning_rate": 1.000895968853438e-06, + "loss": 0.1431, + "num_input_tokens_seen": 121523088, + "step": 56315 + }, + { + "epoch": 9.187601957585644, + "grad_norm": 0.0853412076830864, + "learning_rate": 9.989030321709336e-07, + "loss": 0.1131, + "num_input_tokens_seen": 121533808, + "step": 56320 + }, + { + "epoch": 9.1884176182708, + "grad_norm": 0.6783162355422974, + "learning_rate": 9.969120411391308e-07, + "loss": 0.1327, + "num_input_tokens_seen": 121544240, + "step": 56325 + }, + { + "epoch": 9.189233278955955, + "grad_norm": 1.3202210664749146, + "learning_rate": 9.949229959194313e-07, + "loss": 0.2658, + "num_input_tokens_seen": 121554992, + "step": 56330 + }, + { + "epoch": 9.190048939641109, + "grad_norm": 0.06663151830434799, + "learning_rate": 9.929358966730696e-07, + "loss": 0.1533, + "num_input_tokens_seen": 121565424, + "step": 56335 + }, + { + "epoch": 9.190864600326265, + "grad_norm": 0.37977203726768494, + "learning_rate": 9.909507435611365e-07, + "loss": 0.1744, + "num_input_tokens_seen": 121575504, + "step": 56340 + }, + { + "epoch": 9.191680261011419, + "grad_norm": 0.7171401381492615, + "learning_rate": 9.889675367445589e-07, + "loss": 0.0314, + "num_input_tokens_seen": 121586320, + "step": 56345 + }, + { + "epoch": 9.192495921696574, + "grad_norm": 0.26332512497901917, + "learning_rate": 9.869862763841026e-07, + "loss": 0.1856, + "num_input_tokens_seen": 121596976, + "step": 56350 + }, + { + "epoch": 9.19331158238173, + "grad_norm": 0.1317400336265564, + "learning_rate": 9.85006962640378e-07, + "loss": 0.0895, + "num_input_tokens_seen": 121607344, + "step": 56355 + }, + { + "epoch": 9.194127243066884, + "grad_norm": 0.17191168665885925, + "learning_rate": 9.8302959567384e-07, + "loss": 0.1282, + "num_input_tokens_seen": 121617424, + "step": 56360 + }, + { + "epoch": 9.19494290375204, + "grad_norm": 1.8013371229171753, + "learning_rate": 9.810541756447855e-07, + "loss": 0.1187, + "num_input_tokens_seen": 121629744, + "step": 56365 + }, + { + "epoch": 9.195758564437194, + "grad_norm": 0.9505177140235901, + "learning_rate": 9.790807027133446e-07, + "loss": 0.2253, + "num_input_tokens_seen": 121641008, + "step": 56370 + }, + { + "epoch": 9.19657422512235, + "grad_norm": 0.6237088441848755, + "learning_rate": 9.77109177039509e-07, + "loss": 0.0786, + "num_input_tokens_seen": 121652528, + "step": 56375 + }, + { + "epoch": 9.197389885807505, + "grad_norm": 1.1827000379562378, + "learning_rate": 9.751395987830924e-07, + "loss": 0.1112, + "num_input_tokens_seen": 121663856, + "step": 56380 + }, + { + "epoch": 9.198205546492659, + "grad_norm": 1.7145776748657227, + "learning_rate": 9.731719681037616e-07, + "loss": 0.1232, + "num_input_tokens_seen": 121676112, + "step": 56385 + }, + { + "epoch": 9.199021207177815, + "grad_norm": 0.1260250359773636, + "learning_rate": 9.712062851610222e-07, + "loss": 0.1594, + "num_input_tokens_seen": 121688400, + "step": 56390 + }, + { + "epoch": 9.199836867862969, + "grad_norm": 1.8734116554260254, + "learning_rate": 9.692425501142217e-07, + "loss": 0.1447, + "num_input_tokens_seen": 121698928, + "step": 56395 + }, + { + "epoch": 9.200652528548124, + "grad_norm": 1.6102721691131592, + "learning_rate": 9.672807631225521e-07, + "loss": 0.2315, + "num_input_tokens_seen": 121708496, + "step": 56400 + }, + { + "epoch": 9.201468189233278, + "grad_norm": 0.9443143010139465, + "learning_rate": 9.65320924345045e-07, + "loss": 0.0877, + "num_input_tokens_seen": 121720240, + "step": 56405 + }, + { + "epoch": 9.202283849918434, + "grad_norm": 0.3517104983329773, + "learning_rate": 9.633630339405731e-07, + "loss": 0.0763, + "num_input_tokens_seen": 121732080, + "step": 56410 + }, + { + "epoch": 9.20309951060359, + "grad_norm": 0.4655802249908447, + "learning_rate": 9.614070920678536e-07, + "loss": 0.1238, + "num_input_tokens_seen": 121742096, + "step": 56415 + }, + { + "epoch": 9.203915171288743, + "grad_norm": 0.20169074833393097, + "learning_rate": 9.59453098885446e-07, + "loss": 0.0442, + "num_input_tokens_seen": 121752496, + "step": 56420 + }, + { + "epoch": 9.2047308319739, + "grad_norm": 0.6992882490158081, + "learning_rate": 9.575010545517487e-07, + "loss": 0.1856, + "num_input_tokens_seen": 121763024, + "step": 56425 + }, + { + "epoch": 9.205546492659053, + "grad_norm": 0.06846418231725693, + "learning_rate": 9.5555095922501e-07, + "loss": 0.1201, + "num_input_tokens_seen": 121774352, + "step": 56430 + }, + { + "epoch": 9.206362153344209, + "grad_norm": 0.02828872948884964, + "learning_rate": 9.53602813063309e-07, + "loss": 0.0723, + "num_input_tokens_seen": 121785296, + "step": 56435 + }, + { + "epoch": 9.207177814029365, + "grad_norm": 0.430849552154541, + "learning_rate": 9.516566162245749e-07, + "loss": 0.1972, + "num_input_tokens_seen": 121796752, + "step": 56440 + }, + { + "epoch": 9.207993474714518, + "grad_norm": 1.2432314157485962, + "learning_rate": 9.49712368866576e-07, + "loss": 0.1005, + "num_input_tokens_seen": 121807440, + "step": 56445 + }, + { + "epoch": 9.208809135399674, + "grad_norm": 0.04401199519634247, + "learning_rate": 9.477700711469223e-07, + "loss": 0.0389, + "num_input_tokens_seen": 121818640, + "step": 56450 + }, + { + "epoch": 9.209624796084828, + "grad_norm": 0.035353437066078186, + "learning_rate": 9.458297232230684e-07, + "loss": 0.0843, + "num_input_tokens_seen": 121828560, + "step": 56455 + }, + { + "epoch": 9.210440456769984, + "grad_norm": 1.0406627655029297, + "learning_rate": 9.438913252523024e-07, + "loss": 0.1195, + "num_input_tokens_seen": 121839088, + "step": 56460 + }, + { + "epoch": 9.21125611745514, + "grad_norm": 0.5200211405754089, + "learning_rate": 9.41954877391768e-07, + "loss": 0.0275, + "num_input_tokens_seen": 121849744, + "step": 56465 + }, + { + "epoch": 9.212071778140293, + "grad_norm": 0.05020681396126747, + "learning_rate": 9.400203797984397e-07, + "loss": 0.2069, + "num_input_tokens_seen": 121859696, + "step": 56470 + }, + { + "epoch": 9.21288743882545, + "grad_norm": 1.035629153251648, + "learning_rate": 9.380878326291392e-07, + "loss": 0.0472, + "num_input_tokens_seen": 121869840, + "step": 56475 + }, + { + "epoch": 9.213703099510603, + "grad_norm": 0.8958094120025635, + "learning_rate": 9.361572360405246e-07, + "loss": 0.043, + "num_input_tokens_seen": 121880432, + "step": 56480 + }, + { + "epoch": 9.214518760195759, + "grad_norm": 0.5357049107551575, + "learning_rate": 9.342285901891068e-07, + "loss": 0.0951, + "num_input_tokens_seen": 121889264, + "step": 56485 + }, + { + "epoch": 9.215334420880913, + "grad_norm": 1.0445661544799805, + "learning_rate": 9.323018952312273e-07, + "loss": 0.0809, + "num_input_tokens_seen": 121900016, + "step": 56490 + }, + { + "epoch": 9.216150081566068, + "grad_norm": 0.7998911142349243, + "learning_rate": 9.303771513230752e-07, + "loss": 0.0905, + "num_input_tokens_seen": 121911312, + "step": 56495 + }, + { + "epoch": 9.216965742251224, + "grad_norm": 0.29759472608566284, + "learning_rate": 9.284543586206784e-07, + "loss": 0.0394, + "num_input_tokens_seen": 121923088, + "step": 56500 + }, + { + "epoch": 9.217781402936378, + "grad_norm": 1.2748000621795654, + "learning_rate": 9.265335172799094e-07, + "loss": 0.1629, + "num_input_tokens_seen": 121932720, + "step": 56505 + }, + { + "epoch": 9.218597063621534, + "grad_norm": 1.316698431968689, + "learning_rate": 9.246146274564798e-07, + "loss": 0.0848, + "num_input_tokens_seen": 121943248, + "step": 56510 + }, + { + "epoch": 9.219412724306688, + "grad_norm": 1.0078197717666626, + "learning_rate": 9.226976893059458e-07, + "loss": 0.1731, + "num_input_tokens_seen": 121953712, + "step": 56515 + }, + { + "epoch": 9.220228384991843, + "grad_norm": 0.30109941959381104, + "learning_rate": 9.207827029837052e-07, + "loss": 0.0265, + "num_input_tokens_seen": 121965456, + "step": 56520 + }, + { + "epoch": 9.221044045676999, + "grad_norm": 0.18508414924144745, + "learning_rate": 9.188696686449949e-07, + "loss": 0.0577, + "num_input_tokens_seen": 121975120, + "step": 56525 + }, + { + "epoch": 9.221859706362153, + "grad_norm": 1.0908573865890503, + "learning_rate": 9.169585864448965e-07, + "loss": 0.148, + "num_input_tokens_seen": 121987344, + "step": 56530 + }, + { + "epoch": 9.222675367047309, + "grad_norm": 0.6874753832817078, + "learning_rate": 9.150494565383305e-07, + "loss": 0.0679, + "num_input_tokens_seen": 121998448, + "step": 56535 + }, + { + "epoch": 9.223491027732463, + "grad_norm": 0.06550386548042297, + "learning_rate": 9.13142279080062e-07, + "loss": 0.2516, + "num_input_tokens_seen": 122009104, + "step": 56540 + }, + { + "epoch": 9.224306688417618, + "grad_norm": 1.820776104927063, + "learning_rate": 9.112370542246978e-07, + "loss": 0.1794, + "num_input_tokens_seen": 122019952, + "step": 56545 + }, + { + "epoch": 9.225122349102774, + "grad_norm": 0.2286478877067566, + "learning_rate": 9.093337821266784e-07, + "loss": 0.1072, + "num_input_tokens_seen": 122031280, + "step": 56550 + }, + { + "epoch": 9.225938009787928, + "grad_norm": 0.09974148869514465, + "learning_rate": 9.074324629403025e-07, + "loss": 0.0176, + "num_input_tokens_seen": 122041584, + "step": 56555 + }, + { + "epoch": 9.226753670473084, + "grad_norm": 1.0286060571670532, + "learning_rate": 9.055330968196912e-07, + "loss": 0.0853, + "num_input_tokens_seen": 122051312, + "step": 56560 + }, + { + "epoch": 9.227569331158238, + "grad_norm": 1.771204948425293, + "learning_rate": 9.036356839188243e-07, + "loss": 0.2505, + "num_input_tokens_seen": 122061552, + "step": 56565 + }, + { + "epoch": 9.228384991843393, + "grad_norm": 0.13871510326862335, + "learning_rate": 9.017402243915091e-07, + "loss": 0.1374, + "num_input_tokens_seen": 122071184, + "step": 56570 + }, + { + "epoch": 9.229200652528547, + "grad_norm": 1.0021116733551025, + "learning_rate": 8.998467183914061e-07, + "loss": 0.1038, + "num_input_tokens_seen": 122083344, + "step": 56575 + }, + { + "epoch": 9.230016313213703, + "grad_norm": 0.6647049784660339, + "learning_rate": 8.97955166072012e-07, + "loss": 0.023, + "num_input_tokens_seen": 122094832, + "step": 56580 + }, + { + "epoch": 9.230831973898859, + "grad_norm": 0.07334180921316147, + "learning_rate": 8.960655675866653e-07, + "loss": 0.1177, + "num_input_tokens_seen": 122106032, + "step": 56585 + }, + { + "epoch": 9.231647634584013, + "grad_norm": 1.8432265520095825, + "learning_rate": 8.941779230885433e-07, + "loss": 0.1474, + "num_input_tokens_seen": 122116304, + "step": 56590 + }, + { + "epoch": 9.232463295269168, + "grad_norm": 0.32835257053375244, + "learning_rate": 8.92292232730671e-07, + "loss": 0.0699, + "num_input_tokens_seen": 122128176, + "step": 56595 + }, + { + "epoch": 9.233278955954322, + "grad_norm": 0.03781943768262863, + "learning_rate": 8.904084966659121e-07, + "loss": 0.1347, + "num_input_tokens_seen": 122139696, + "step": 56600 + }, + { + "epoch": 9.234094616639478, + "grad_norm": 0.955280065536499, + "learning_rate": 8.885267150469723e-07, + "loss": 0.3576, + "num_input_tokens_seen": 122151024, + "step": 56605 + }, + { + "epoch": 9.234910277324634, + "grad_norm": 0.04297899454832077, + "learning_rate": 8.866468880263961e-07, + "loss": 0.1282, + "num_input_tokens_seen": 122161648, + "step": 56610 + }, + { + "epoch": 9.235725938009788, + "grad_norm": 0.08829467743635178, + "learning_rate": 8.847690157565758e-07, + "loss": 0.1025, + "num_input_tokens_seen": 122172656, + "step": 56615 + }, + { + "epoch": 9.236541598694943, + "grad_norm": 0.10152264684438705, + "learning_rate": 8.828930983897366e-07, + "loss": 0.1706, + "num_input_tokens_seen": 122183056, + "step": 56620 + }, + { + "epoch": 9.237357259380097, + "grad_norm": 0.6059719324111938, + "learning_rate": 8.810191360779513e-07, + "loss": 0.0617, + "num_input_tokens_seen": 122194224, + "step": 56625 + }, + { + "epoch": 9.238172920065253, + "grad_norm": 0.03971165046095848, + "learning_rate": 8.791471289731346e-07, + "loss": 0.0863, + "num_input_tokens_seen": 122204784, + "step": 56630 + }, + { + "epoch": 9.238988580750409, + "grad_norm": 0.047940757125616074, + "learning_rate": 8.7727707722704e-07, + "loss": 0.0183, + "num_input_tokens_seen": 122215280, + "step": 56635 + }, + { + "epoch": 9.239804241435563, + "grad_norm": 1.2201337814331055, + "learning_rate": 8.75408980991263e-07, + "loss": 0.1703, + "num_input_tokens_seen": 122226128, + "step": 56640 + }, + { + "epoch": 9.240619902120718, + "grad_norm": 1.9613066911697388, + "learning_rate": 8.735428404172408e-07, + "loss": 0.1837, + "num_input_tokens_seen": 122237552, + "step": 56645 + }, + { + "epoch": 9.241435562805872, + "grad_norm": 0.7117242217063904, + "learning_rate": 8.716786556562495e-07, + "loss": 0.2166, + "num_input_tokens_seen": 122248880, + "step": 56650 + }, + { + "epoch": 9.242251223491028, + "grad_norm": 1.4862103462219238, + "learning_rate": 8.698164268594155e-07, + "loss": 0.1991, + "num_input_tokens_seen": 122259760, + "step": 56655 + }, + { + "epoch": 9.243066884176184, + "grad_norm": 1.415723443031311, + "learning_rate": 8.679561541776959e-07, + "loss": 0.1209, + "num_input_tokens_seen": 122271120, + "step": 56660 + }, + { + "epoch": 9.243882544861338, + "grad_norm": 0.40370211005210876, + "learning_rate": 8.660978377618951e-07, + "loss": 0.1024, + "num_input_tokens_seen": 122281424, + "step": 56665 + }, + { + "epoch": 9.244698205546493, + "grad_norm": 0.8672224879264832, + "learning_rate": 8.64241477762659e-07, + "loss": 0.0501, + "num_input_tokens_seen": 122292784, + "step": 56670 + }, + { + "epoch": 9.245513866231647, + "grad_norm": 0.20627334713935852, + "learning_rate": 8.62387074330473e-07, + "loss": 0.0928, + "num_input_tokens_seen": 122302992, + "step": 56675 + }, + { + "epoch": 9.246329526916803, + "grad_norm": 1.410881519317627, + "learning_rate": 8.605346276156611e-07, + "loss": 0.0687, + "num_input_tokens_seen": 122314320, + "step": 56680 + }, + { + "epoch": 9.247145187601957, + "grad_norm": 0.11292549967765808, + "learning_rate": 8.586841377683951e-07, + "loss": 0.0087, + "num_input_tokens_seen": 122324848, + "step": 56685 + }, + { + "epoch": 9.247960848287113, + "grad_norm": 0.04995838552713394, + "learning_rate": 8.568356049386827e-07, + "loss": 0.0568, + "num_input_tokens_seen": 122335760, + "step": 56690 + }, + { + "epoch": 9.248776508972268, + "grad_norm": 0.518585741519928, + "learning_rate": 8.549890292763819e-07, + "loss": 0.1531, + "num_input_tokens_seen": 122346416, + "step": 56695 + }, + { + "epoch": 9.249592169657422, + "grad_norm": 0.17531588673591614, + "learning_rate": 8.531444109311781e-07, + "loss": 0.1364, + "num_input_tokens_seen": 122358608, + "step": 56700 + }, + { + "epoch": 9.250407830342578, + "grad_norm": 1.857659101486206, + "learning_rate": 8.513017500526105e-07, + "loss": 0.0679, + "num_input_tokens_seen": 122369264, + "step": 56705 + }, + { + "epoch": 9.251223491027732, + "grad_norm": 1.8880634307861328, + "learning_rate": 8.49461046790051e-07, + "loss": 0.3793, + "num_input_tokens_seen": 122380528, + "step": 56710 + }, + { + "epoch": 9.252039151712887, + "grad_norm": 0.4009200632572174, + "learning_rate": 8.476223012927193e-07, + "loss": 0.0275, + "num_input_tokens_seen": 122391856, + "step": 56715 + }, + { + "epoch": 9.252854812398043, + "grad_norm": 0.25654831528663635, + "learning_rate": 8.457855137096682e-07, + "loss": 0.0506, + "num_input_tokens_seen": 122402672, + "step": 56720 + }, + { + "epoch": 9.253670473083197, + "grad_norm": 2.174776077270508, + "learning_rate": 8.439506841898037e-07, + "loss": 0.2158, + "num_input_tokens_seen": 122412368, + "step": 56725 + }, + { + "epoch": 9.254486133768353, + "grad_norm": 0.6365659236907959, + "learning_rate": 8.421178128818624e-07, + "loss": 0.0943, + "num_input_tokens_seen": 122423248, + "step": 56730 + }, + { + "epoch": 9.255301794453507, + "grad_norm": 0.5042528510093689, + "learning_rate": 8.402868999344283e-07, + "loss": 0.0817, + "num_input_tokens_seen": 122433040, + "step": 56735 + }, + { + "epoch": 9.256117455138662, + "grad_norm": 0.08716078102588654, + "learning_rate": 8.384579454959185e-07, + "loss": 0.0583, + "num_input_tokens_seen": 122442928, + "step": 56740 + }, + { + "epoch": 9.256933115823816, + "grad_norm": 0.27867528796195984, + "learning_rate": 8.366309497146063e-07, + "loss": 0.069, + "num_input_tokens_seen": 122453488, + "step": 56745 + }, + { + "epoch": 9.257748776508972, + "grad_norm": 0.0892874151468277, + "learning_rate": 8.348059127385926e-07, + "loss": 0.0215, + "num_input_tokens_seen": 122463632, + "step": 56750 + }, + { + "epoch": 9.258564437194128, + "grad_norm": 0.5529887080192566, + "learning_rate": 8.329828347158231e-07, + "loss": 0.0352, + "num_input_tokens_seen": 122474736, + "step": 56755 + }, + { + "epoch": 9.259380097879282, + "grad_norm": 0.060408733785152435, + "learning_rate": 8.311617157940904e-07, + "loss": 0.042, + "num_input_tokens_seen": 122485776, + "step": 56760 + }, + { + "epoch": 9.260195758564437, + "grad_norm": 0.07941409945487976, + "learning_rate": 8.293425561210183e-07, + "loss": 0.0205, + "num_input_tokens_seen": 122496176, + "step": 56765 + }, + { + "epoch": 9.261011419249591, + "grad_norm": 0.017761442810297012, + "learning_rate": 8.275253558440776e-07, + "loss": 0.0292, + "num_input_tokens_seen": 122507760, + "step": 56770 + }, + { + "epoch": 9.261827079934747, + "grad_norm": 1.243690013885498, + "learning_rate": 8.257101151105839e-07, + "loss": 0.116, + "num_input_tokens_seen": 122518768, + "step": 56775 + }, + { + "epoch": 9.262642740619903, + "grad_norm": 1.8627972602844238, + "learning_rate": 8.23896834067689e-07, + "loss": 0.0496, + "num_input_tokens_seen": 122531216, + "step": 56780 + }, + { + "epoch": 9.263458401305057, + "grad_norm": 1.0245965719223022, + "learning_rate": 8.220855128623805e-07, + "loss": 0.1025, + "num_input_tokens_seen": 122541552, + "step": 56785 + }, + { + "epoch": 9.264274061990212, + "grad_norm": 0.45831015706062317, + "learning_rate": 8.202761516415025e-07, + "loss": 0.1014, + "num_input_tokens_seen": 122553712, + "step": 56790 + }, + { + "epoch": 9.265089722675366, + "grad_norm": 0.42695003747940063, + "learning_rate": 8.184687505517236e-07, + "loss": 0.1107, + "num_input_tokens_seen": 122564976, + "step": 56795 + }, + { + "epoch": 9.265905383360522, + "grad_norm": 0.37871864438056946, + "learning_rate": 8.166633097395626e-07, + "loss": 0.2343, + "num_input_tokens_seen": 122576272, + "step": 56800 + }, + { + "epoch": 9.266721044045678, + "grad_norm": 1.6830147504806519, + "learning_rate": 8.148598293513804e-07, + "loss": 0.163, + "num_input_tokens_seen": 122586576, + "step": 56805 + }, + { + "epoch": 9.267536704730832, + "grad_norm": 0.3603500425815582, + "learning_rate": 8.130583095333739e-07, + "loss": 0.0408, + "num_input_tokens_seen": 122598000, + "step": 56810 + }, + { + "epoch": 9.268352365415987, + "grad_norm": 0.8764966130256653, + "learning_rate": 8.112587504315844e-07, + "loss": 0.1236, + "num_input_tokens_seen": 122606896, + "step": 56815 + }, + { + "epoch": 9.269168026101141, + "grad_norm": 1.098997712135315, + "learning_rate": 8.094611521918927e-07, + "loss": 0.049, + "num_input_tokens_seen": 122617200, + "step": 56820 + }, + { + "epoch": 9.269983686786297, + "grad_norm": 1.371434211730957, + "learning_rate": 8.076655149600237e-07, + "loss": 0.3012, + "num_input_tokens_seen": 122626512, + "step": 56825 + }, + { + "epoch": 9.270799347471453, + "grad_norm": 0.3060854971408844, + "learning_rate": 8.058718388815362e-07, + "loss": 0.184, + "num_input_tokens_seen": 122636848, + "step": 56830 + }, + { + "epoch": 9.271615008156607, + "grad_norm": 1.194293737411499, + "learning_rate": 8.040801241018386e-07, + "loss": 0.18, + "num_input_tokens_seen": 122646928, + "step": 56835 + }, + { + "epoch": 9.272430668841762, + "grad_norm": 1.2698547840118408, + "learning_rate": 8.022903707661761e-07, + "loss": 0.163, + "num_input_tokens_seen": 122656752, + "step": 56840 + }, + { + "epoch": 9.273246329526916, + "grad_norm": 1.488785743713379, + "learning_rate": 8.005025790196325e-07, + "loss": 0.1515, + "num_input_tokens_seen": 122668656, + "step": 56845 + }, + { + "epoch": 9.274061990212072, + "grad_norm": 0.3290431797504425, + "learning_rate": 7.987167490071362e-07, + "loss": 0.1108, + "num_input_tokens_seen": 122679728, + "step": 56850 + }, + { + "epoch": 9.274877650897226, + "grad_norm": 1.3182320594787598, + "learning_rate": 7.969328808734577e-07, + "loss": 0.1536, + "num_input_tokens_seen": 122691536, + "step": 56855 + }, + { + "epoch": 9.275693311582382, + "grad_norm": 0.16737774014472961, + "learning_rate": 7.951509747632063e-07, + "loss": 0.0755, + "num_input_tokens_seen": 122701488, + "step": 56860 + }, + { + "epoch": 9.276508972267537, + "grad_norm": 1.5771586894989014, + "learning_rate": 7.933710308208275e-07, + "loss": 0.0427, + "num_input_tokens_seen": 122712784, + "step": 56865 + }, + { + "epoch": 9.277324632952691, + "grad_norm": 0.13524581491947174, + "learning_rate": 7.915930491906198e-07, + "loss": 0.2185, + "num_input_tokens_seen": 122723184, + "step": 56870 + }, + { + "epoch": 9.278140293637847, + "grad_norm": 0.5042086243629456, + "learning_rate": 7.898170300167096e-07, + "loss": 0.1303, + "num_input_tokens_seen": 122733808, + "step": 56875 + }, + { + "epoch": 9.278955954323001, + "grad_norm": 0.7766228318214417, + "learning_rate": 7.880429734430706e-07, + "loss": 0.102, + "num_input_tokens_seen": 122745552, + "step": 56880 + }, + { + "epoch": 9.279771615008157, + "grad_norm": 0.565697193145752, + "learning_rate": 7.862708796135182e-07, + "loss": 0.0551, + "num_input_tokens_seen": 122755632, + "step": 56885 + }, + { + "epoch": 9.280587275693312, + "grad_norm": 0.7166928648948669, + "learning_rate": 7.845007486717099e-07, + "loss": 0.2348, + "num_input_tokens_seen": 122767696, + "step": 56890 + }, + { + "epoch": 9.281402936378466, + "grad_norm": 1.0221455097198486, + "learning_rate": 7.827325807611391e-07, + "loss": 0.0759, + "num_input_tokens_seen": 122779376, + "step": 56895 + }, + { + "epoch": 9.282218597063622, + "grad_norm": 0.27116304636001587, + "learning_rate": 7.80966376025144e-07, + "loss": 0.1035, + "num_input_tokens_seen": 122790832, + "step": 56900 + }, + { + "epoch": 9.283034257748776, + "grad_norm": 0.22842901945114136, + "learning_rate": 7.792021346068989e-07, + "loss": 0.2321, + "num_input_tokens_seen": 122801456, + "step": 56905 + }, + { + "epoch": 9.283849918433932, + "grad_norm": 0.916963517665863, + "learning_rate": 7.774398566494201e-07, + "loss": 0.1031, + "num_input_tokens_seen": 122813520, + "step": 56910 + }, + { + "epoch": 9.284665579119087, + "grad_norm": 0.7013892531394958, + "learning_rate": 7.756795422955737e-07, + "loss": 0.0794, + "num_input_tokens_seen": 122823344, + "step": 56915 + }, + { + "epoch": 9.285481239804241, + "grad_norm": 0.2673870325088501, + "learning_rate": 7.739211916880595e-07, + "loss": 0.2257, + "num_input_tokens_seen": 122834128, + "step": 56920 + }, + { + "epoch": 9.286296900489397, + "grad_norm": 0.07409888505935669, + "learning_rate": 7.721648049694108e-07, + "loss": 0.0194, + "num_input_tokens_seen": 122845456, + "step": 56925 + }, + { + "epoch": 9.28711256117455, + "grad_norm": 0.7084934711456299, + "learning_rate": 7.704103822820164e-07, + "loss": 0.1086, + "num_input_tokens_seen": 122856272, + "step": 56930 + }, + { + "epoch": 9.287928221859707, + "grad_norm": 0.02014182321727276, + "learning_rate": 7.686579237680957e-07, + "loss": 0.1197, + "num_input_tokens_seen": 122866544, + "step": 56935 + }, + { + "epoch": 9.28874388254486, + "grad_norm": 0.7773504853248596, + "learning_rate": 7.669074295697132e-07, + "loss": 0.1317, + "num_input_tokens_seen": 122876432, + "step": 56940 + }, + { + "epoch": 9.289559543230016, + "grad_norm": 0.9284977316856384, + "learning_rate": 7.651588998287717e-07, + "loss": 0.0673, + "num_input_tokens_seen": 122888432, + "step": 56945 + }, + { + "epoch": 9.290375203915172, + "grad_norm": 0.947437584400177, + "learning_rate": 7.634123346870165e-07, + "loss": 0.1985, + "num_input_tokens_seen": 122900016, + "step": 56950 + }, + { + "epoch": 9.291190864600326, + "grad_norm": 0.13243651390075684, + "learning_rate": 7.616677342860312e-07, + "loss": 0.0956, + "num_input_tokens_seen": 122910096, + "step": 56955 + }, + { + "epoch": 9.292006525285482, + "grad_norm": 0.10301971435546875, + "learning_rate": 7.599250987672446e-07, + "loss": 0.0266, + "num_input_tokens_seen": 122920336, + "step": 56960 + }, + { + "epoch": 9.292822185970635, + "grad_norm": 0.46571996808052063, + "learning_rate": 7.581844282719213e-07, + "loss": 0.2263, + "num_input_tokens_seen": 122931632, + "step": 56965 + }, + { + "epoch": 9.293637846655791, + "grad_norm": 0.05264337360858917, + "learning_rate": 7.564457229411709e-07, + "loss": 0.0839, + "num_input_tokens_seen": 122943728, + "step": 56970 + }, + { + "epoch": 9.294453507340947, + "grad_norm": 1.535711646080017, + "learning_rate": 7.547089829159415e-07, + "loss": 0.0988, + "num_input_tokens_seen": 122953264, + "step": 56975 + }, + { + "epoch": 9.2952691680261, + "grad_norm": 1.367103099822998, + "learning_rate": 7.529742083370206e-07, + "loss": 0.0959, + "num_input_tokens_seen": 122965296, + "step": 56980 + }, + { + "epoch": 9.296084828711257, + "grad_norm": 2.282876968383789, + "learning_rate": 7.512413993450373e-07, + "loss": 0.2653, + "num_input_tokens_seen": 122974384, + "step": 56985 + }, + { + "epoch": 9.29690048939641, + "grad_norm": 1.6951284408569336, + "learning_rate": 7.495105560804627e-07, + "loss": 0.1108, + "num_input_tokens_seen": 122985296, + "step": 56990 + }, + { + "epoch": 9.297716150081566, + "grad_norm": 0.32771116495132446, + "learning_rate": 7.477816786836122e-07, + "loss": 0.0325, + "num_input_tokens_seen": 122996272, + "step": 56995 + }, + { + "epoch": 9.298531810766722, + "grad_norm": 0.16197124123573303, + "learning_rate": 7.460547672946294e-07, + "loss": 0.1127, + "num_input_tokens_seen": 123007440, + "step": 57000 + }, + { + "epoch": 9.299347471451876, + "grad_norm": 1.6775721311569214, + "learning_rate": 7.443298220535106e-07, + "loss": 0.3211, + "num_input_tokens_seen": 123018096, + "step": 57005 + }, + { + "epoch": 9.300163132137031, + "grad_norm": 1.0994304418563843, + "learning_rate": 7.426068431000882e-07, + "loss": 0.2676, + "num_input_tokens_seen": 123029072, + "step": 57010 + }, + { + "epoch": 9.300978792822185, + "grad_norm": 0.5925520062446594, + "learning_rate": 7.408858305740368e-07, + "loss": 0.1839, + "num_input_tokens_seen": 123040816, + "step": 57015 + }, + { + "epoch": 9.301794453507341, + "grad_norm": 0.04296664521098137, + "learning_rate": 7.391667846148697e-07, + "loss": 0.0441, + "num_input_tokens_seen": 123051376, + "step": 57020 + }, + { + "epoch": 9.302610114192497, + "grad_norm": 1.4392980337142944, + "learning_rate": 7.374497053619423e-07, + "loss": 0.1284, + "num_input_tokens_seen": 123061232, + "step": 57025 + }, + { + "epoch": 9.30342577487765, + "grad_norm": 0.2499386966228485, + "learning_rate": 7.357345929544485e-07, + "loss": 0.3058, + "num_input_tokens_seen": 123071632, + "step": 57030 + }, + { + "epoch": 9.304241435562806, + "grad_norm": 1.2331730127334595, + "learning_rate": 7.340214475314244e-07, + "loss": 0.1187, + "num_input_tokens_seen": 123083600, + "step": 57035 + }, + { + "epoch": 9.30505709624796, + "grad_norm": 0.17460237443447113, + "learning_rate": 7.323102692317452e-07, + "loss": 0.1476, + "num_input_tokens_seen": 123093392, + "step": 57040 + }, + { + "epoch": 9.305872756933116, + "grad_norm": 2.145357370376587, + "learning_rate": 7.306010581941275e-07, + "loss": 0.0817, + "num_input_tokens_seen": 123104784, + "step": 57045 + }, + { + "epoch": 9.30668841761827, + "grad_norm": 1.011248230934143, + "learning_rate": 7.288938145571328e-07, + "loss": 0.2555, + "num_input_tokens_seen": 123115344, + "step": 57050 + }, + { + "epoch": 9.307504078303426, + "grad_norm": 2.109616279602051, + "learning_rate": 7.271885384591503e-07, + "loss": 0.1606, + "num_input_tokens_seen": 123125520, + "step": 57055 + }, + { + "epoch": 9.308319738988581, + "grad_norm": 1.0350942611694336, + "learning_rate": 7.25485230038428e-07, + "loss": 0.0496, + "num_input_tokens_seen": 123136016, + "step": 57060 + }, + { + "epoch": 9.309135399673735, + "grad_norm": 1.0409984588623047, + "learning_rate": 7.237838894330412e-07, + "loss": 0.1953, + "num_input_tokens_seen": 123147536, + "step": 57065 + }, + { + "epoch": 9.309951060358891, + "grad_norm": 0.09051968157291412, + "learning_rate": 7.220845167809076e-07, + "loss": 0.065, + "num_input_tokens_seen": 123157904, + "step": 57070 + }, + { + "epoch": 9.310766721044045, + "grad_norm": 0.12773706018924713, + "learning_rate": 7.203871122197891e-07, + "loss": 0.0581, + "num_input_tokens_seen": 123168720, + "step": 57075 + }, + { + "epoch": 9.3115823817292, + "grad_norm": 0.936832070350647, + "learning_rate": 7.186916758872841e-07, + "loss": 0.1119, + "num_input_tokens_seen": 123180144, + "step": 57080 + }, + { + "epoch": 9.312398042414356, + "grad_norm": 0.331302285194397, + "learning_rate": 7.169982079208326e-07, + "loss": 0.0703, + "num_input_tokens_seen": 123190896, + "step": 57085 + }, + { + "epoch": 9.31321370309951, + "grad_norm": 0.1707058548927307, + "learning_rate": 7.153067084577192e-07, + "loss": 0.0584, + "num_input_tokens_seen": 123201424, + "step": 57090 + }, + { + "epoch": 9.314029363784666, + "grad_norm": 0.5153762698173523, + "learning_rate": 7.13617177635062e-07, + "loss": 0.1392, + "num_input_tokens_seen": 123211536, + "step": 57095 + }, + { + "epoch": 9.31484502446982, + "grad_norm": 0.045752786099910736, + "learning_rate": 7.119296155898236e-07, + "loss": 0.0939, + "num_input_tokens_seen": 123222896, + "step": 57100 + }, + { + "epoch": 9.315660685154976, + "grad_norm": 1.0829625129699707, + "learning_rate": 7.102440224588086e-07, + "loss": 0.1178, + "num_input_tokens_seen": 123235280, + "step": 57105 + }, + { + "epoch": 9.31647634584013, + "grad_norm": 0.39440467953681946, + "learning_rate": 7.085603983786576e-07, + "loss": 0.1762, + "num_input_tokens_seen": 123246416, + "step": 57110 + }, + { + "epoch": 9.317292006525285, + "grad_norm": 1.120316982269287, + "learning_rate": 7.068787434858532e-07, + "loss": 0.0667, + "num_input_tokens_seen": 123257200, + "step": 57115 + }, + { + "epoch": 9.318107667210441, + "grad_norm": 0.31023356318473816, + "learning_rate": 7.051990579167195e-07, + "loss": 0.2211, + "num_input_tokens_seen": 123268816, + "step": 57120 + }, + { + "epoch": 9.318923327895595, + "grad_norm": 0.3987223505973816, + "learning_rate": 7.035213418074227e-07, + "loss": 0.1621, + "num_input_tokens_seen": 123279408, + "step": 57125 + }, + { + "epoch": 9.31973898858075, + "grad_norm": 0.8757058382034302, + "learning_rate": 7.018455952939651e-07, + "loss": 0.161, + "num_input_tokens_seen": 123289232, + "step": 57130 + }, + { + "epoch": 9.320554649265905, + "grad_norm": 0.8962607383728027, + "learning_rate": 7.001718185121908e-07, + "loss": 0.1808, + "num_input_tokens_seen": 123299984, + "step": 57135 + }, + { + "epoch": 9.32137030995106, + "grad_norm": 0.7221666574478149, + "learning_rate": 6.98500011597783e-07, + "loss": 0.1591, + "num_input_tokens_seen": 123310768, + "step": 57140 + }, + { + "epoch": 9.322185970636216, + "grad_norm": 1.3005914688110352, + "learning_rate": 6.96830174686272e-07, + "loss": 0.0432, + "num_input_tokens_seen": 123321232, + "step": 57145 + }, + { + "epoch": 9.32300163132137, + "grad_norm": 0.07151403278112411, + "learning_rate": 6.951623079130192e-07, + "loss": 0.1206, + "num_input_tokens_seen": 123332432, + "step": 57150 + }, + { + "epoch": 9.323817292006526, + "grad_norm": 0.23298832774162292, + "learning_rate": 6.934964114132303e-07, + "loss": 0.1226, + "num_input_tokens_seen": 123344240, + "step": 57155 + }, + { + "epoch": 9.32463295269168, + "grad_norm": 1.925539493560791, + "learning_rate": 6.918324853219527e-07, + "loss": 0.278, + "num_input_tokens_seen": 123354672, + "step": 57160 + }, + { + "epoch": 9.325448613376835, + "grad_norm": 0.05781927704811096, + "learning_rate": 6.901705297740729e-07, + "loss": 0.2267, + "num_input_tokens_seen": 123365168, + "step": 57165 + }, + { + "epoch": 9.326264274061991, + "grad_norm": 0.22160674631595612, + "learning_rate": 6.885105449043138e-07, + "loss": 0.0611, + "num_input_tokens_seen": 123375696, + "step": 57170 + }, + { + "epoch": 9.327079934747145, + "grad_norm": 0.22415219247341156, + "learning_rate": 6.868525308472484e-07, + "loss": 0.248, + "num_input_tokens_seen": 123387344, + "step": 57175 + }, + { + "epoch": 9.3278955954323, + "grad_norm": 0.29652199149131775, + "learning_rate": 6.851964877372802e-07, + "loss": 0.0632, + "num_input_tokens_seen": 123398992, + "step": 57180 + }, + { + "epoch": 9.328711256117455, + "grad_norm": 1.0452977418899536, + "learning_rate": 6.835424157086573e-07, + "loss": 0.2232, + "num_input_tokens_seen": 123409168, + "step": 57185 + }, + { + "epoch": 9.32952691680261, + "grad_norm": 1.3161437511444092, + "learning_rate": 6.818903148954642e-07, + "loss": 0.0708, + "num_input_tokens_seen": 123421488, + "step": 57190 + }, + { + "epoch": 9.330342577487766, + "grad_norm": 0.3972456455230713, + "learning_rate": 6.802401854316298e-07, + "loss": 0.0722, + "num_input_tokens_seen": 123432816, + "step": 57195 + }, + { + "epoch": 9.33115823817292, + "grad_norm": 0.5231635570526123, + "learning_rate": 6.78592027450925e-07, + "loss": 0.1093, + "num_input_tokens_seen": 123444048, + "step": 57200 + }, + { + "epoch": 9.331973898858076, + "grad_norm": 0.04752068594098091, + "learning_rate": 6.769458410869595e-07, + "loss": 0.1181, + "num_input_tokens_seen": 123454320, + "step": 57205 + }, + { + "epoch": 9.33278955954323, + "grad_norm": 0.10275918990373611, + "learning_rate": 6.753016264731738e-07, + "loss": 0.0167, + "num_input_tokens_seen": 123465424, + "step": 57210 + }, + { + "epoch": 9.333605220228385, + "grad_norm": 0.5369459986686707, + "learning_rate": 6.736593837428639e-07, + "loss": 0.1026, + "num_input_tokens_seen": 123475856, + "step": 57215 + }, + { + "epoch": 9.33442088091354, + "grad_norm": 0.18376296758651733, + "learning_rate": 6.720191130291514e-07, + "loss": 0.3596, + "num_input_tokens_seen": 123484816, + "step": 57220 + }, + { + "epoch": 9.335236541598695, + "grad_norm": 0.04048871994018555, + "learning_rate": 6.703808144650076e-07, + "loss": 0.1151, + "num_input_tokens_seen": 123495920, + "step": 57225 + }, + { + "epoch": 9.33605220228385, + "grad_norm": 0.07308179140090942, + "learning_rate": 6.687444881832455e-07, + "loss": 0.2442, + "num_input_tokens_seen": 123507344, + "step": 57230 + }, + { + "epoch": 9.336867862969005, + "grad_norm": 0.39970675110816956, + "learning_rate": 6.67110134316512e-07, + "loss": 0.1279, + "num_input_tokens_seen": 123517808, + "step": 57235 + }, + { + "epoch": 9.33768352365416, + "grad_norm": 0.061313509941101074, + "learning_rate": 6.654777529972928e-07, + "loss": 0.2454, + "num_input_tokens_seen": 123529456, + "step": 57240 + }, + { + "epoch": 9.338499184339314, + "grad_norm": 1.2036579847335815, + "learning_rate": 6.638473443579179e-07, + "loss": 0.1017, + "num_input_tokens_seen": 123538960, + "step": 57245 + }, + { + "epoch": 9.33931484502447, + "grad_norm": 1.5458769798278809, + "learning_rate": 6.622189085305597e-07, + "loss": 0.2309, + "num_input_tokens_seen": 123549200, + "step": 57250 + }, + { + "epoch": 9.340130505709626, + "grad_norm": 1.124949336051941, + "learning_rate": 6.605924456472262e-07, + "loss": 0.138, + "num_input_tokens_seen": 123560176, + "step": 57255 + }, + { + "epoch": 9.34094616639478, + "grad_norm": 0.07035364955663681, + "learning_rate": 6.589679558397648e-07, + "loss": 0.1819, + "num_input_tokens_seen": 123571280, + "step": 57260 + }, + { + "epoch": 9.341761827079935, + "grad_norm": 0.20373615622520447, + "learning_rate": 6.573454392398648e-07, + "loss": 0.2121, + "num_input_tokens_seen": 123581296, + "step": 57265 + }, + { + "epoch": 9.34257748776509, + "grad_norm": 0.17242056131362915, + "learning_rate": 6.557248959790596e-07, + "loss": 0.1542, + "num_input_tokens_seen": 123592016, + "step": 57270 + }, + { + "epoch": 9.343393148450245, + "grad_norm": 0.09417847543954849, + "learning_rate": 6.541063261887137e-07, + "loss": 0.1198, + "num_input_tokens_seen": 123602992, + "step": 57275 + }, + { + "epoch": 9.3442088091354, + "grad_norm": 0.5961000323295593, + "learning_rate": 6.524897300000388e-07, + "loss": 0.0822, + "num_input_tokens_seen": 123614384, + "step": 57280 + }, + { + "epoch": 9.345024469820554, + "grad_norm": 0.6544556617736816, + "learning_rate": 6.508751075440856e-07, + "loss": 0.064, + "num_input_tokens_seen": 123625744, + "step": 57285 + }, + { + "epoch": 9.34584013050571, + "grad_norm": 1.9896256923675537, + "learning_rate": 6.49262458951741e-07, + "loss": 0.2717, + "num_input_tokens_seen": 123636816, + "step": 57290 + }, + { + "epoch": 9.346655791190864, + "grad_norm": 1.3411660194396973, + "learning_rate": 6.476517843537395e-07, + "loss": 0.1772, + "num_input_tokens_seen": 123648656, + "step": 57295 + }, + { + "epoch": 9.34747145187602, + "grad_norm": 1.311308741569519, + "learning_rate": 6.46043083880643e-07, + "loss": 0.039, + "num_input_tokens_seen": 123658672, + "step": 57300 + }, + { + "epoch": 9.348287112561174, + "grad_norm": 1.623583436012268, + "learning_rate": 6.44436357662867e-07, + "loss": 0.2506, + "num_input_tokens_seen": 123669968, + "step": 57305 + }, + { + "epoch": 9.34910277324633, + "grad_norm": 1.7193747758865356, + "learning_rate": 6.428316058306571e-07, + "loss": 0.0854, + "num_input_tokens_seen": 123680880, + "step": 57310 + }, + { + "epoch": 9.349918433931485, + "grad_norm": 1.9049994945526123, + "learning_rate": 6.412288285141066e-07, + "loss": 0.1575, + "num_input_tokens_seen": 123691536, + "step": 57315 + }, + { + "epoch": 9.350734094616639, + "grad_norm": 0.6351649761199951, + "learning_rate": 6.396280258431391e-07, + "loss": 0.1865, + "num_input_tokens_seen": 123701904, + "step": 57320 + }, + { + "epoch": 9.351549755301795, + "grad_norm": 0.06398110836744308, + "learning_rate": 6.38029197947529e-07, + "loss": 0.0493, + "num_input_tokens_seen": 123712528, + "step": 57325 + }, + { + "epoch": 9.352365415986949, + "grad_norm": 1.0341938734054565, + "learning_rate": 6.364323449568804e-07, + "loss": 0.137, + "num_input_tokens_seen": 123723184, + "step": 57330 + }, + { + "epoch": 9.353181076672104, + "grad_norm": 0.34141573309898376, + "learning_rate": 6.348374670006485e-07, + "loss": 0.054, + "num_input_tokens_seen": 123733968, + "step": 57335 + }, + { + "epoch": 9.35399673735726, + "grad_norm": 0.7594392895698547, + "learning_rate": 6.332445642081214e-07, + "loss": 0.0832, + "num_input_tokens_seen": 123743120, + "step": 57340 + }, + { + "epoch": 9.354812398042414, + "grad_norm": 0.18915705382823944, + "learning_rate": 6.316536367084236e-07, + "loss": 0.065, + "num_input_tokens_seen": 123754992, + "step": 57345 + }, + { + "epoch": 9.35562805872757, + "grad_norm": 0.035806842148303986, + "learning_rate": 6.300646846305241e-07, + "loss": 0.1344, + "num_input_tokens_seen": 123765488, + "step": 57350 + }, + { + "epoch": 9.356443719412724, + "grad_norm": 1.222888708114624, + "learning_rate": 6.28477708103234e-07, + "loss": 0.1143, + "num_input_tokens_seen": 123776944, + "step": 57355 + }, + { + "epoch": 9.35725938009788, + "grad_norm": 0.9000673890113831, + "learning_rate": 6.268927072552028e-07, + "loss": 0.1789, + "num_input_tokens_seen": 123789168, + "step": 57360 + }, + { + "epoch": 9.358075040783035, + "grad_norm": 0.07418310642242432, + "learning_rate": 6.253096822149113e-07, + "loss": 0.1793, + "num_input_tokens_seen": 123799536, + "step": 57365 + }, + { + "epoch": 9.358890701468189, + "grad_norm": 1.5472590923309326, + "learning_rate": 6.237286331106984e-07, + "loss": 0.1764, + "num_input_tokens_seen": 123811536, + "step": 57370 + }, + { + "epoch": 9.359706362153345, + "grad_norm": 2.139141321182251, + "learning_rate": 6.221495600707227e-07, + "loss": 0.1189, + "num_input_tokens_seen": 123820848, + "step": 57375 + }, + { + "epoch": 9.360522022838499, + "grad_norm": 0.24453617632389069, + "learning_rate": 6.205724632229987e-07, + "loss": 0.0875, + "num_input_tokens_seen": 123832112, + "step": 57380 + }, + { + "epoch": 9.361337683523654, + "grad_norm": 0.0639224648475647, + "learning_rate": 6.18997342695371e-07, + "loss": 0.1347, + "num_input_tokens_seen": 123842544, + "step": 57385 + }, + { + "epoch": 9.362153344208808, + "grad_norm": 1.2800410985946655, + "learning_rate": 6.174241986155238e-07, + "loss": 0.2009, + "num_input_tokens_seen": 123852816, + "step": 57390 + }, + { + "epoch": 9.362969004893964, + "grad_norm": 0.5563880205154419, + "learning_rate": 6.158530311109884e-07, + "loss": 0.253, + "num_input_tokens_seen": 123863312, + "step": 57395 + }, + { + "epoch": 9.36378466557912, + "grad_norm": 0.11355114728212357, + "learning_rate": 6.142838403091322e-07, + "loss": 0.0495, + "num_input_tokens_seen": 123873936, + "step": 57400 + }, + { + "epoch": 9.364600326264274, + "grad_norm": 2.7218637466430664, + "learning_rate": 6.127166263371592e-07, + "loss": 0.2147, + "num_input_tokens_seen": 123885648, + "step": 57405 + }, + { + "epoch": 9.36541598694943, + "grad_norm": 0.3215228319168091, + "learning_rate": 6.111513893221149e-07, + "loss": 0.1049, + "num_input_tokens_seen": 123896528, + "step": 57410 + }, + { + "epoch": 9.366231647634583, + "grad_norm": 0.14058297872543335, + "learning_rate": 6.095881293908867e-07, + "loss": 0.1073, + "num_input_tokens_seen": 123907728, + "step": 57415 + }, + { + "epoch": 9.367047308319739, + "grad_norm": 0.16122983396053314, + "learning_rate": 6.080268466702011e-07, + "loss": 0.0404, + "num_input_tokens_seen": 123918128, + "step": 57420 + }, + { + "epoch": 9.367862969004895, + "grad_norm": 0.6064205169677734, + "learning_rate": 6.064675412866233e-07, + "loss": 0.034, + "num_input_tokens_seen": 123929392, + "step": 57425 + }, + { + "epoch": 9.368678629690049, + "grad_norm": 0.852821946144104, + "learning_rate": 6.049102133665552e-07, + "loss": 0.2126, + "num_input_tokens_seen": 123940688, + "step": 57430 + }, + { + "epoch": 9.369494290375204, + "grad_norm": 0.07331441342830658, + "learning_rate": 6.033548630362457e-07, + "loss": 0.1101, + "num_input_tokens_seen": 123950704, + "step": 57435 + }, + { + "epoch": 9.370309951060358, + "grad_norm": 0.5586536526679993, + "learning_rate": 6.018014904217801e-07, + "loss": 0.0757, + "num_input_tokens_seen": 123961552, + "step": 57440 + }, + { + "epoch": 9.371125611745514, + "grad_norm": 0.03508526086807251, + "learning_rate": 6.002500956490798e-07, + "loss": 0.0739, + "num_input_tokens_seen": 123971408, + "step": 57445 + }, + { + "epoch": 9.37194127243067, + "grad_norm": 0.1717071533203125, + "learning_rate": 5.987006788439109e-07, + "loss": 0.05, + "num_input_tokens_seen": 123981264, + "step": 57450 + }, + { + "epoch": 9.372756933115824, + "grad_norm": 0.30389973521232605, + "learning_rate": 5.971532401318758e-07, + "loss": 0.1879, + "num_input_tokens_seen": 123992944, + "step": 57455 + }, + { + "epoch": 9.37357259380098, + "grad_norm": 0.42007818818092346, + "learning_rate": 5.956077796384185e-07, + "loss": 0.1185, + "num_input_tokens_seen": 124004464, + "step": 57460 + }, + { + "epoch": 9.374388254486133, + "grad_norm": 1.0016660690307617, + "learning_rate": 5.940642974888195e-07, + "loss": 0.1485, + "num_input_tokens_seen": 124017072, + "step": 57465 + }, + { + "epoch": 9.375203915171289, + "grad_norm": 0.17798662185668945, + "learning_rate": 5.925227938082034e-07, + "loss": 0.1445, + "num_input_tokens_seen": 124027792, + "step": 57470 + }, + { + "epoch": 9.376019575856443, + "grad_norm": 1.6794778108596802, + "learning_rate": 5.909832687215317e-07, + "loss": 0.1873, + "num_input_tokens_seen": 124039440, + "step": 57475 + }, + { + "epoch": 9.376835236541599, + "grad_norm": 1.0828273296356201, + "learning_rate": 5.894457223536071e-07, + "loss": 0.1482, + "num_input_tokens_seen": 124050608, + "step": 57480 + }, + { + "epoch": 9.377650897226754, + "grad_norm": 0.247518852353096, + "learning_rate": 5.879101548290716e-07, + "loss": 0.0867, + "num_input_tokens_seen": 124061296, + "step": 57485 + }, + { + "epoch": 9.378466557911908, + "grad_norm": 0.8965079188346863, + "learning_rate": 5.863765662724036e-07, + "loss": 0.2466, + "num_input_tokens_seen": 124071920, + "step": 57490 + }, + { + "epoch": 9.379282218597064, + "grad_norm": 0.7475562691688538, + "learning_rate": 5.848449568079228e-07, + "loss": 0.1609, + "num_input_tokens_seen": 124082608, + "step": 57495 + }, + { + "epoch": 9.380097879282218, + "grad_norm": 0.4703998565673828, + "learning_rate": 5.83315326559794e-07, + "loss": 0.1778, + "num_input_tokens_seen": 124094416, + "step": 57500 + }, + { + "epoch": 9.380913539967374, + "grad_norm": 1.0601595640182495, + "learning_rate": 5.817876756520125e-07, + "loss": 0.0863, + "num_input_tokens_seen": 124105168, + "step": 57505 + }, + { + "epoch": 9.38172920065253, + "grad_norm": 0.029636139050126076, + "learning_rate": 5.80262004208415e-07, + "loss": 0.0987, + "num_input_tokens_seen": 124114992, + "step": 57510 + }, + { + "epoch": 9.382544861337683, + "grad_norm": 1.1367892026901245, + "learning_rate": 5.787383123526891e-07, + "loss": 0.054, + "num_input_tokens_seen": 124125936, + "step": 57515 + }, + { + "epoch": 9.383360522022839, + "grad_norm": 0.11951825022697449, + "learning_rate": 5.772166002083467e-07, + "loss": 0.1214, + "num_input_tokens_seen": 124137616, + "step": 57520 + }, + { + "epoch": 9.384176182707993, + "grad_norm": 1.6824004650115967, + "learning_rate": 5.756968678987451e-07, + "loss": 0.1231, + "num_input_tokens_seen": 124147856, + "step": 57525 + }, + { + "epoch": 9.384991843393149, + "grad_norm": 1.0429649353027344, + "learning_rate": 5.741791155470854e-07, + "loss": 0.1258, + "num_input_tokens_seen": 124159824, + "step": 57530 + }, + { + "epoch": 9.385807504078304, + "grad_norm": 0.1848270148038864, + "learning_rate": 5.726633432764e-07, + "loss": 0.0369, + "num_input_tokens_seen": 124169168, + "step": 57535 + }, + { + "epoch": 9.386623164763458, + "grad_norm": 0.6306769251823425, + "learning_rate": 5.711495512095682e-07, + "loss": 0.0624, + "num_input_tokens_seen": 124179952, + "step": 57540 + }, + { + "epoch": 9.387438825448614, + "grad_norm": 0.7765772938728333, + "learning_rate": 5.696377394693003e-07, + "loss": 0.1885, + "num_input_tokens_seen": 124189648, + "step": 57545 + }, + { + "epoch": 9.388254486133768, + "grad_norm": 0.22774578630924225, + "learning_rate": 5.681279081781593e-07, + "loss": 0.1093, + "num_input_tokens_seen": 124200336, + "step": 57550 + }, + { + "epoch": 9.389070146818923, + "grad_norm": 1.2244913578033447, + "learning_rate": 5.66620057458539e-07, + "loss": 0.2218, + "num_input_tokens_seen": 124209264, + "step": 57555 + }, + { + "epoch": 9.38988580750408, + "grad_norm": 0.03331032395362854, + "learning_rate": 5.651141874326666e-07, + "loss": 0.0277, + "num_input_tokens_seen": 124219184, + "step": 57560 + }, + { + "epoch": 9.390701468189233, + "grad_norm": 0.1736881136894226, + "learning_rate": 5.636102982226221e-07, + "loss": 0.0386, + "num_input_tokens_seen": 124230800, + "step": 57565 + }, + { + "epoch": 9.391517128874389, + "grad_norm": 0.08476623892784119, + "learning_rate": 5.621083899503138e-07, + "loss": 0.0244, + "num_input_tokens_seen": 124242224, + "step": 57570 + }, + { + "epoch": 9.392332789559543, + "grad_norm": 1.7214083671569824, + "learning_rate": 5.606084627374969e-07, + "loss": 0.1396, + "num_input_tokens_seen": 124252592, + "step": 57575 + }, + { + "epoch": 9.393148450244698, + "grad_norm": 0.46961599588394165, + "learning_rate": 5.591105167057631e-07, + "loss": 0.0842, + "num_input_tokens_seen": 124263760, + "step": 57580 + }, + { + "epoch": 9.393964110929852, + "grad_norm": 0.9939093589782715, + "learning_rate": 5.576145519765402e-07, + "loss": 0.0497, + "num_input_tokens_seen": 124274704, + "step": 57585 + }, + { + "epoch": 9.394779771615008, + "grad_norm": 0.19388064742088318, + "learning_rate": 5.561205686711035e-07, + "loss": 0.0269, + "num_input_tokens_seen": 124284784, + "step": 57590 + }, + { + "epoch": 9.395595432300164, + "grad_norm": 0.3901722729206085, + "learning_rate": 5.546285669105589e-07, + "loss": 0.0726, + "num_input_tokens_seen": 124294512, + "step": 57595 + }, + { + "epoch": 9.396411092985318, + "grad_norm": 0.3279361128807068, + "learning_rate": 5.531385468158595e-07, + "loss": 0.0915, + "num_input_tokens_seen": 124306096, + "step": 57600 + }, + { + "epoch": 9.397226753670473, + "grad_norm": 1.323546290397644, + "learning_rate": 5.516505085077895e-07, + "loss": 0.1599, + "num_input_tokens_seen": 124317104, + "step": 57605 + }, + { + "epoch": 9.398042414355627, + "grad_norm": 1.5741510391235352, + "learning_rate": 5.501644521069799e-07, + "loss": 0.1791, + "num_input_tokens_seen": 124327792, + "step": 57610 + }, + { + "epoch": 9.398858075040783, + "grad_norm": 0.08537635207176208, + "learning_rate": 5.486803777338956e-07, + "loss": 0.1263, + "num_input_tokens_seen": 124340112, + "step": 57615 + }, + { + "epoch": 9.399673735725939, + "grad_norm": 1.4467803239822388, + "learning_rate": 5.471982855088459e-07, + "loss": 0.1904, + "num_input_tokens_seen": 124351184, + "step": 57620 + }, + { + "epoch": 9.400489396411093, + "grad_norm": 0.0739954486489296, + "learning_rate": 5.457181755519763e-07, + "loss": 0.1382, + "num_input_tokens_seen": 124361456, + "step": 57625 + }, + { + "epoch": 9.401305057096248, + "grad_norm": 2.3104488849639893, + "learning_rate": 5.442400479832715e-07, + "loss": 0.1687, + "num_input_tokens_seen": 124372848, + "step": 57630 + }, + { + "epoch": 9.402120717781402, + "grad_norm": 1.0758391618728638, + "learning_rate": 5.427639029225551e-07, + "loss": 0.0941, + "num_input_tokens_seen": 124383664, + "step": 57635 + }, + { + "epoch": 9.402936378466558, + "grad_norm": 0.3296918272972107, + "learning_rate": 5.412897404894896e-07, + "loss": 0.143, + "num_input_tokens_seen": 124394160, + "step": 57640 + }, + { + "epoch": 9.403752039151712, + "grad_norm": 0.23635171353816986, + "learning_rate": 5.398175608035821e-07, + "loss": 0.1016, + "num_input_tokens_seen": 124405584, + "step": 57645 + }, + { + "epoch": 9.404567699836868, + "grad_norm": 1.1642242670059204, + "learning_rate": 5.38347363984168e-07, + "loss": 0.0674, + "num_input_tokens_seen": 124416560, + "step": 57650 + }, + { + "epoch": 9.405383360522023, + "grad_norm": 0.3060440123081207, + "learning_rate": 5.368791501504378e-07, + "loss": 0.0416, + "num_input_tokens_seen": 124426864, + "step": 57655 + }, + { + "epoch": 9.406199021207177, + "grad_norm": 0.8945664167404175, + "learning_rate": 5.354129194214103e-07, + "loss": 0.0853, + "num_input_tokens_seen": 124438384, + "step": 57660 + }, + { + "epoch": 9.407014681892333, + "grad_norm": 0.902513325214386, + "learning_rate": 5.339486719159404e-07, + "loss": 0.1097, + "num_input_tokens_seen": 124449936, + "step": 57665 + }, + { + "epoch": 9.407830342577487, + "grad_norm": 0.050701804459095, + "learning_rate": 5.324864077527331e-07, + "loss": 0.2172, + "num_input_tokens_seen": 124461040, + "step": 57670 + }, + { + "epoch": 9.408646003262643, + "grad_norm": 1.6161041259765625, + "learning_rate": 5.310261270503214e-07, + "loss": 0.1908, + "num_input_tokens_seen": 124472144, + "step": 57675 + }, + { + "epoch": 9.409461663947798, + "grad_norm": 0.18010787665843964, + "learning_rate": 5.295678299270884e-07, + "loss": 0.0833, + "num_input_tokens_seen": 124483792, + "step": 57680 + }, + { + "epoch": 9.410277324632952, + "grad_norm": 0.1452299952507019, + "learning_rate": 5.281115165012479e-07, + "loss": 0.2446, + "num_input_tokens_seen": 124495152, + "step": 57685 + }, + { + "epoch": 9.411092985318108, + "grad_norm": 1.3490296602249146, + "learning_rate": 5.266571868908582e-07, + "loss": 0.078, + "num_input_tokens_seen": 124505360, + "step": 57690 + }, + { + "epoch": 9.411908646003262, + "grad_norm": 1.2887574434280396, + "learning_rate": 5.252048412138111e-07, + "loss": 0.2937, + "num_input_tokens_seen": 124515184, + "step": 57695 + }, + { + "epoch": 9.412724306688418, + "grad_norm": 0.37604036927223206, + "learning_rate": 5.237544795878457e-07, + "loss": 0.1837, + "num_input_tokens_seen": 124526416, + "step": 57700 + }, + { + "epoch": 9.413539967373573, + "grad_norm": 1.0534629821777344, + "learning_rate": 5.22306102130532e-07, + "loss": 0.1637, + "num_input_tokens_seen": 124537648, + "step": 57705 + }, + { + "epoch": 9.414355628058727, + "grad_norm": 0.11503340303897858, + "learning_rate": 5.208597089592871e-07, + "loss": 0.229, + "num_input_tokens_seen": 124547920, + "step": 57710 + }, + { + "epoch": 9.415171288743883, + "grad_norm": 0.7802032828330994, + "learning_rate": 5.194153001913588e-07, + "loss": 0.0704, + "num_input_tokens_seen": 124556624, + "step": 57715 + }, + { + "epoch": 9.415986949429037, + "grad_norm": 1.4660327434539795, + "learning_rate": 5.179728759438368e-07, + "loss": 0.2661, + "num_input_tokens_seen": 124566448, + "step": 57720 + }, + { + "epoch": 9.416802610114193, + "grad_norm": 0.13174644112586975, + "learning_rate": 5.165324363336582e-07, + "loss": 0.0991, + "num_input_tokens_seen": 124576624, + "step": 57725 + }, + { + "epoch": 9.417618270799348, + "grad_norm": 0.2586694061756134, + "learning_rate": 5.150939814775852e-07, + "loss": 0.1277, + "num_input_tokens_seen": 124587184, + "step": 57730 + }, + { + "epoch": 9.418433931484502, + "grad_norm": 1.590429663658142, + "learning_rate": 5.136575114922299e-07, + "loss": 0.1198, + "num_input_tokens_seen": 124598448, + "step": 57735 + }, + { + "epoch": 9.419249592169658, + "grad_norm": 0.4974924921989441, + "learning_rate": 5.122230264940409e-07, + "loss": 0.1767, + "num_input_tokens_seen": 124610576, + "step": 57740 + }, + { + "epoch": 9.420065252854812, + "grad_norm": 0.28781449794769287, + "learning_rate": 5.107905265993001e-07, + "loss": 0.0772, + "num_input_tokens_seen": 124620656, + "step": 57745 + }, + { + "epoch": 9.420880913539968, + "grad_norm": 0.13003045320510864, + "learning_rate": 5.09360011924137e-07, + "loss": 0.0812, + "num_input_tokens_seen": 124631344, + "step": 57750 + }, + { + "epoch": 9.421696574225122, + "grad_norm": 0.4189459979534149, + "learning_rate": 5.079314825845144e-07, + "loss": 0.0189, + "num_input_tokens_seen": 124641872, + "step": 57755 + }, + { + "epoch": 9.422512234910277, + "grad_norm": 0.11479084938764572, + "learning_rate": 5.065049386962395e-07, + "loss": 0.2005, + "num_input_tokens_seen": 124652464, + "step": 57760 + }, + { + "epoch": 9.423327895595433, + "grad_norm": 0.0666000172495842, + "learning_rate": 5.050803803749532e-07, + "loss": 0.0524, + "num_input_tokens_seen": 124662896, + "step": 57765 + }, + { + "epoch": 9.424143556280587, + "grad_norm": 0.2219913899898529, + "learning_rate": 5.036578077361381e-07, + "loss": 0.1004, + "num_input_tokens_seen": 124674480, + "step": 57770 + }, + { + "epoch": 9.424959216965743, + "grad_norm": 0.365998238325119, + "learning_rate": 5.022372208951131e-07, + "loss": 0.139, + "num_input_tokens_seen": 124684240, + "step": 57775 + }, + { + "epoch": 9.425774877650896, + "grad_norm": 1.1406774520874023, + "learning_rate": 5.008186199670389e-07, + "loss": 0.0396, + "num_input_tokens_seen": 124696336, + "step": 57780 + }, + { + "epoch": 9.426590538336052, + "grad_norm": 0.7751963138580322, + "learning_rate": 4.994020050669152e-07, + "loss": 0.2127, + "num_input_tokens_seen": 124708432, + "step": 57785 + }, + { + "epoch": 9.427406199021208, + "grad_norm": 0.6485244035720825, + "learning_rate": 4.979873763095805e-07, + "loss": 0.0677, + "num_input_tokens_seen": 124721072, + "step": 57790 + }, + { + "epoch": 9.428221859706362, + "grad_norm": 0.16039729118347168, + "learning_rate": 4.965747338097099e-07, + "loss": 0.1359, + "num_input_tokens_seen": 124731696, + "step": 57795 + }, + { + "epoch": 9.429037520391518, + "grad_norm": 0.15072108805179596, + "learning_rate": 4.951640776818228e-07, + "loss": 0.2031, + "num_input_tokens_seen": 124742416, + "step": 57800 + }, + { + "epoch": 9.429853181076671, + "grad_norm": 0.6212952733039856, + "learning_rate": 4.937554080402695e-07, + "loss": 0.0919, + "num_input_tokens_seen": 124752784, + "step": 57805 + }, + { + "epoch": 9.430668841761827, + "grad_norm": 1.5672343969345093, + "learning_rate": 4.923487249992476e-07, + "loss": 0.2671, + "num_input_tokens_seen": 124763248, + "step": 57810 + }, + { + "epoch": 9.431484502446983, + "grad_norm": 0.2292066365480423, + "learning_rate": 4.909440286727879e-07, + "loss": 0.3241, + "num_input_tokens_seen": 124774576, + "step": 57815 + }, + { + "epoch": 9.432300163132137, + "grad_norm": 1.611603856086731, + "learning_rate": 4.895413191747633e-07, + "loss": 0.1271, + "num_input_tokens_seen": 124784368, + "step": 57820 + }, + { + "epoch": 9.433115823817293, + "grad_norm": 0.034131042659282684, + "learning_rate": 4.881405966188801e-07, + "loss": 0.1512, + "num_input_tokens_seen": 124793904, + "step": 57825 + }, + { + "epoch": 9.433931484502446, + "grad_norm": 0.7329967617988586, + "learning_rate": 4.867418611186974e-07, + "loss": 0.0398, + "num_input_tokens_seen": 124804848, + "step": 57830 + }, + { + "epoch": 9.434747145187602, + "grad_norm": 1.1151880025863647, + "learning_rate": 4.853451127875968e-07, + "loss": 0.1477, + "num_input_tokens_seen": 124817840, + "step": 57835 + }, + { + "epoch": 9.435562805872756, + "grad_norm": 0.7077749371528625, + "learning_rate": 4.839503517388072e-07, + "loss": 0.0707, + "num_input_tokens_seen": 124828080, + "step": 57840 + }, + { + "epoch": 9.436378466557912, + "grad_norm": 0.26570048928260803, + "learning_rate": 4.825575780853964e-07, + "loss": 0.0375, + "num_input_tokens_seen": 124839632, + "step": 57845 + }, + { + "epoch": 9.437194127243067, + "grad_norm": 0.7607939839363098, + "learning_rate": 4.811667919402685e-07, + "loss": 0.1036, + "num_input_tokens_seen": 124852176, + "step": 57850 + }, + { + "epoch": 9.438009787928221, + "grad_norm": 0.8772774934768677, + "learning_rate": 4.797779934161667e-07, + "loss": 0.1348, + "num_input_tokens_seen": 124862768, + "step": 57855 + }, + { + "epoch": 9.438825448613377, + "grad_norm": 0.7427741289138794, + "learning_rate": 4.78391182625676e-07, + "loss": 0.1462, + "num_input_tokens_seen": 124873936, + "step": 57860 + }, + { + "epoch": 9.439641109298531, + "grad_norm": 0.25677990913391113, + "learning_rate": 4.770063596812146e-07, + "loss": 0.1365, + "num_input_tokens_seen": 124884912, + "step": 57865 + }, + { + "epoch": 9.440456769983687, + "grad_norm": 0.2517551779747009, + "learning_rate": 4.7562352469504855e-07, + "loss": 0.0305, + "num_input_tokens_seen": 124895632, + "step": 57870 + }, + { + "epoch": 9.441272430668842, + "grad_norm": 0.4349351227283478, + "learning_rate": 4.7424267777927414e-07, + "loss": 0.0293, + "num_input_tokens_seen": 124906640, + "step": 57875 + }, + { + "epoch": 9.442088091353996, + "grad_norm": 0.039428483694791794, + "learning_rate": 4.728638190458323e-07, + "loss": 0.0227, + "num_input_tokens_seen": 124917392, + "step": 57880 + }, + { + "epoch": 9.442903752039152, + "grad_norm": 0.4104333519935608, + "learning_rate": 4.7148694860649765e-07, + "loss": 0.1069, + "num_input_tokens_seen": 124927408, + "step": 57885 + }, + { + "epoch": 9.443719412724306, + "grad_norm": 0.2763499617576599, + "learning_rate": 4.701120665728892e-07, + "loss": 0.1005, + "num_input_tokens_seen": 124938160, + "step": 57890 + }, + { + "epoch": 9.444535073409462, + "grad_norm": 0.09605910629034042, + "learning_rate": 4.687391730564594e-07, + "loss": 0.0646, + "num_input_tokens_seen": 124947376, + "step": 57895 + }, + { + "epoch": 9.445350734094617, + "grad_norm": 0.15352095663547516, + "learning_rate": 4.673682681684999e-07, + "loss": 0.1169, + "num_input_tokens_seen": 124958576, + "step": 57900 + }, + { + "epoch": 9.446166394779771, + "grad_norm": 0.2022220939397812, + "learning_rate": 4.6599935202014943e-07, + "loss": 0.2972, + "num_input_tokens_seen": 124968944, + "step": 57905 + }, + { + "epoch": 9.446982055464927, + "grad_norm": 1.9214590787887573, + "learning_rate": 4.646324247223749e-07, + "loss": 0.1216, + "num_input_tokens_seen": 124980496, + "step": 57910 + }, + { + "epoch": 9.447797716150081, + "grad_norm": 3.429558277130127, + "learning_rate": 4.6326748638598485e-07, + "loss": 0.2197, + "num_input_tokens_seen": 124990256, + "step": 57915 + }, + { + "epoch": 9.448613376835237, + "grad_norm": 0.05645591765642166, + "learning_rate": 4.619045371216324e-07, + "loss": 0.1126, + "num_input_tokens_seen": 125001840, + "step": 57920 + }, + { + "epoch": 9.449429037520392, + "grad_norm": 0.04633009433746338, + "learning_rate": 4.605435770398042e-07, + "loss": 0.1342, + "num_input_tokens_seen": 125012272, + "step": 57925 + }, + { + "epoch": 9.450244698205546, + "grad_norm": 1.1759320497512817, + "learning_rate": 4.591846062508232e-07, + "loss": 0.1498, + "num_input_tokens_seen": 125023024, + "step": 57930 + }, + { + "epoch": 9.451060358890702, + "grad_norm": 0.7767384648323059, + "learning_rate": 4.578276248648594e-07, + "loss": 0.1014, + "num_input_tokens_seen": 125035184, + "step": 57935 + }, + { + "epoch": 9.451876019575856, + "grad_norm": 0.46168503165245056, + "learning_rate": 4.5647263299191113e-07, + "loss": 0.035, + "num_input_tokens_seen": 125045424, + "step": 57940 + }, + { + "epoch": 9.452691680261012, + "grad_norm": 1.8695601224899292, + "learning_rate": 4.5511963074182653e-07, + "loss": 0.2743, + "num_input_tokens_seen": 125055504, + "step": 57945 + }, + { + "epoch": 9.453507340946166, + "grad_norm": 0.10741767287254333, + "learning_rate": 4.5376861822428176e-07, + "loss": 0.1493, + "num_input_tokens_seen": 125065744, + "step": 57950 + }, + { + "epoch": 9.454323001631321, + "grad_norm": 1.4785020351409912, + "learning_rate": 4.524195955488031e-07, + "loss": 0.2051, + "num_input_tokens_seen": 125076720, + "step": 57955 + }, + { + "epoch": 9.455138662316477, + "grad_norm": 0.7151503562927246, + "learning_rate": 4.5107256282474196e-07, + "loss": 0.3212, + "num_input_tokens_seen": 125087504, + "step": 57960 + }, + { + "epoch": 9.455954323001631, + "grad_norm": 1.0050569772720337, + "learning_rate": 4.4972752016129995e-07, + "loss": 0.0717, + "num_input_tokens_seen": 125099408, + "step": 57965 + }, + { + "epoch": 9.456769983686787, + "grad_norm": 1.2024574279785156, + "learning_rate": 4.483844676675092e-07, + "loss": 0.1437, + "num_input_tokens_seen": 125109904, + "step": 57970 + }, + { + "epoch": 9.45758564437194, + "grad_norm": 0.24352259933948517, + "learning_rate": 4.4704340545224934e-07, + "loss": 0.0604, + "num_input_tokens_seen": 125120144, + "step": 57975 + }, + { + "epoch": 9.458401305057096, + "grad_norm": 1.1315070390701294, + "learning_rate": 4.457043336242306e-07, + "loss": 0.1598, + "num_input_tokens_seen": 125131056, + "step": 57980 + }, + { + "epoch": 9.459216965742252, + "grad_norm": 0.6214264631271362, + "learning_rate": 4.443672522920078e-07, + "loss": 0.1415, + "num_input_tokens_seen": 125141072, + "step": 57985 + }, + { + "epoch": 9.460032626427406, + "grad_norm": 0.05194659158587456, + "learning_rate": 4.4303216156396933e-07, + "loss": 0.0499, + "num_input_tokens_seen": 125152656, + "step": 57990 + }, + { + "epoch": 9.460848287112562, + "grad_norm": 0.18822476267814636, + "learning_rate": 4.416990615483396e-07, + "loss": 0.1012, + "num_input_tokens_seen": 125164880, + "step": 57995 + }, + { + "epoch": 9.461663947797716, + "grad_norm": 0.1982693076133728, + "learning_rate": 4.4036795235319617e-07, + "loss": 0.0651, + "num_input_tokens_seen": 125175792, + "step": 58000 + }, + { + "epoch": 9.462479608482871, + "grad_norm": 0.577401876449585, + "learning_rate": 4.390388340864415e-07, + "loss": 0.119, + "num_input_tokens_seen": 125187152, + "step": 58005 + }, + { + "epoch": 9.463295269168025, + "grad_norm": 0.1374232918024063, + "learning_rate": 4.377117068558201e-07, + "loss": 0.0253, + "num_input_tokens_seen": 125197072, + "step": 58010 + }, + { + "epoch": 9.464110929853181, + "grad_norm": 1.7269560098648071, + "learning_rate": 4.363865707689152e-07, + "loss": 0.1278, + "num_input_tokens_seen": 125208144, + "step": 58015 + }, + { + "epoch": 9.464926590538337, + "grad_norm": 0.6429465413093567, + "learning_rate": 4.350634259331465e-07, + "loss": 0.0914, + "num_input_tokens_seen": 125218448, + "step": 58020 + }, + { + "epoch": 9.46574225122349, + "grad_norm": 1.3933109045028687, + "learning_rate": 4.33742272455781e-07, + "loss": 0.2492, + "num_input_tokens_seen": 125229168, + "step": 58025 + }, + { + "epoch": 9.466557911908646, + "grad_norm": 0.7638145685195923, + "learning_rate": 4.324231104439136e-07, + "loss": 0.2067, + "num_input_tokens_seen": 125240496, + "step": 58030 + }, + { + "epoch": 9.4673735725938, + "grad_norm": 0.48519444465637207, + "learning_rate": 4.3110594000448365e-07, + "loss": 0.1148, + "num_input_tokens_seen": 125251408, + "step": 58035 + }, + { + "epoch": 9.468189233278956, + "grad_norm": 0.08342535048723221, + "learning_rate": 4.29790761244267e-07, + "loss": 0.1324, + "num_input_tokens_seen": 125262128, + "step": 58040 + }, + { + "epoch": 9.469004893964112, + "grad_norm": 0.7370414137840271, + "learning_rate": 4.2847757426988097e-07, + "loss": 0.0404, + "num_input_tokens_seen": 125272816, + "step": 58045 + }, + { + "epoch": 9.469820554649266, + "grad_norm": 0.5626121163368225, + "learning_rate": 4.271663791877767e-07, + "loss": 0.1391, + "num_input_tokens_seen": 125283792, + "step": 58050 + }, + { + "epoch": 9.470636215334421, + "grad_norm": 2.153029441833496, + "learning_rate": 4.258571761042468e-07, + "loss": 0.1707, + "num_input_tokens_seen": 125294960, + "step": 58055 + }, + { + "epoch": 9.471451876019575, + "grad_norm": 0.03596990928053856, + "learning_rate": 4.2454996512542033e-07, + "loss": 0.0518, + "num_input_tokens_seen": 125304816, + "step": 58060 + }, + { + "epoch": 9.47226753670473, + "grad_norm": 0.20277510583400726, + "learning_rate": 4.2324474635727085e-07, + "loss": 0.0919, + "num_input_tokens_seen": 125314448, + "step": 58065 + }, + { + "epoch": 9.473083197389887, + "grad_norm": 0.22452221810817719, + "learning_rate": 4.219415199056026e-07, + "loss": 0.024, + "num_input_tokens_seen": 125325584, + "step": 58070 + }, + { + "epoch": 9.47389885807504, + "grad_norm": 0.4400140941143036, + "learning_rate": 4.2064028587606163e-07, + "loss": 0.0228, + "num_input_tokens_seen": 125337328, + "step": 58075 + }, + { + "epoch": 9.474714518760196, + "grad_norm": 1.0713000297546387, + "learning_rate": 4.19341044374133e-07, + "loss": 0.0808, + "num_input_tokens_seen": 125348336, + "step": 58080 + }, + { + "epoch": 9.47553017944535, + "grad_norm": 0.44762417674064636, + "learning_rate": 4.180437955051436e-07, + "loss": 0.1657, + "num_input_tokens_seen": 125359152, + "step": 58085 + }, + { + "epoch": 9.476345840130506, + "grad_norm": 0.07304930686950684, + "learning_rate": 4.167485393742482e-07, + "loss": 0.0402, + "num_input_tokens_seen": 125371792, + "step": 58090 + }, + { + "epoch": 9.477161500815662, + "grad_norm": 2.4549214839935303, + "learning_rate": 4.1545527608645163e-07, + "loss": 0.1589, + "num_input_tokens_seen": 125382960, + "step": 58095 + }, + { + "epoch": 9.477977161500815, + "grad_norm": 1.7350486516952515, + "learning_rate": 4.1416400574659233e-07, + "loss": 0.3645, + "num_input_tokens_seen": 125392752, + "step": 58100 + }, + { + "epoch": 9.478792822185971, + "grad_norm": 0.307706743478775, + "learning_rate": 4.128747284593448e-07, + "loss": 0.1482, + "num_input_tokens_seen": 125404080, + "step": 58105 + }, + { + "epoch": 9.479608482871125, + "grad_norm": 1.4911600351333618, + "learning_rate": 4.1158744432922835e-07, + "loss": 0.1492, + "num_input_tokens_seen": 125415568, + "step": 58110 + }, + { + "epoch": 9.48042414355628, + "grad_norm": 1.4807225465774536, + "learning_rate": 4.1030215346059e-07, + "loss": 0.0813, + "num_input_tokens_seen": 125426832, + "step": 58115 + }, + { + "epoch": 9.481239804241435, + "grad_norm": 1.8319571018218994, + "learning_rate": 4.090188559576269e-07, + "loss": 0.2061, + "num_input_tokens_seen": 125436656, + "step": 58120 + }, + { + "epoch": 9.48205546492659, + "grad_norm": 0.7243747711181641, + "learning_rate": 4.077375519243698e-07, + "loss": 0.0848, + "num_input_tokens_seen": 125447440, + "step": 58125 + }, + { + "epoch": 9.482871125611746, + "grad_norm": 1.6368327140808105, + "learning_rate": 4.0645824146468834e-07, + "loss": 0.1106, + "num_input_tokens_seen": 125458928, + "step": 58130 + }, + { + "epoch": 9.4836867862969, + "grad_norm": 0.10268411785364151, + "learning_rate": 4.0518092468228297e-07, + "loss": 0.0879, + "num_input_tokens_seen": 125469520, + "step": 58135 + }, + { + "epoch": 9.484502446982056, + "grad_norm": 0.11095776408910751, + "learning_rate": 4.0390560168070966e-07, + "loss": 0.0254, + "num_input_tokens_seen": 125480016, + "step": 58140 + }, + { + "epoch": 9.48531810766721, + "grad_norm": 0.13475921750068665, + "learning_rate": 4.026322725633441e-07, + "loss": 0.0233, + "num_input_tokens_seen": 125490608, + "step": 58145 + }, + { + "epoch": 9.486133768352365, + "grad_norm": 0.579900324344635, + "learning_rate": 4.0136093743341485e-07, + "loss": 0.1311, + "num_input_tokens_seen": 125501392, + "step": 58150 + }, + { + "epoch": 9.486949429037521, + "grad_norm": 0.2069631963968277, + "learning_rate": 4.000915963939783e-07, + "loss": 0.0929, + "num_input_tokens_seen": 125510448, + "step": 58155 + }, + { + "epoch": 9.487765089722675, + "grad_norm": 0.5862755179405212, + "learning_rate": 3.988242495479383e-07, + "loss": 0.2225, + "num_input_tokens_seen": 125521968, + "step": 58160 + }, + { + "epoch": 9.48858075040783, + "grad_norm": 0.14692391455173492, + "learning_rate": 3.9755889699802926e-07, + "loss": 0.1929, + "num_input_tokens_seen": 125533360, + "step": 58165 + }, + { + "epoch": 9.489396411092985, + "grad_norm": 0.24870868027210236, + "learning_rate": 3.962955388468248e-07, + "loss": 0.024, + "num_input_tokens_seen": 125544400, + "step": 58170 + }, + { + "epoch": 9.49021207177814, + "grad_norm": 0.40811219811439514, + "learning_rate": 3.950341751967457e-07, + "loss": 0.0467, + "num_input_tokens_seen": 125555568, + "step": 58175 + }, + { + "epoch": 9.491027732463296, + "grad_norm": 0.24734750390052795, + "learning_rate": 3.9377480615003794e-07, + "loss": 0.1743, + "num_input_tokens_seen": 125566224, + "step": 58180 + }, + { + "epoch": 9.49184339314845, + "grad_norm": 0.13352225720882416, + "learning_rate": 3.9251743180879483e-07, + "loss": 0.1399, + "num_input_tokens_seen": 125576976, + "step": 58185 + }, + { + "epoch": 9.492659053833606, + "grad_norm": 1.1997109651565552, + "learning_rate": 3.9126205227494605e-07, + "loss": 0.1129, + "num_input_tokens_seen": 125587856, + "step": 58190 + }, + { + "epoch": 9.49347471451876, + "grad_norm": 2.237215280532837, + "learning_rate": 3.900086676502601e-07, + "loss": 0.194, + "num_input_tokens_seen": 125598832, + "step": 58195 + }, + { + "epoch": 9.494290375203915, + "grad_norm": 0.9479215145111084, + "learning_rate": 3.8875727803634186e-07, + "loss": 0.1109, + "num_input_tokens_seen": 125610832, + "step": 58200 + }, + { + "epoch": 9.49510603588907, + "grad_norm": 1.1702090501785278, + "learning_rate": 3.8750788353463243e-07, + "loss": 0.0859, + "num_input_tokens_seen": 125621104, + "step": 58205 + }, + { + "epoch": 9.495921696574225, + "grad_norm": 0.031888458877801895, + "learning_rate": 3.862604842464201e-07, + "loss": 0.083, + "num_input_tokens_seen": 125632752, + "step": 58210 + }, + { + "epoch": 9.49673735725938, + "grad_norm": 0.21284697949886322, + "learning_rate": 3.8501508027281865e-07, + "loss": 0.0241, + "num_input_tokens_seen": 125642832, + "step": 58215 + }, + { + "epoch": 9.497553017944535, + "grad_norm": 0.06304167211055756, + "learning_rate": 3.8377167171479154e-07, + "loss": 0.0488, + "num_input_tokens_seen": 125653840, + "step": 58220 + }, + { + "epoch": 9.49836867862969, + "grad_norm": 1.5710159540176392, + "learning_rate": 3.825302586731333e-07, + "loss": 0.1308, + "num_input_tokens_seen": 125665104, + "step": 58225 + }, + { + "epoch": 9.499184339314844, + "grad_norm": 0.2163393795490265, + "learning_rate": 3.8129084124848e-07, + "loss": 0.0223, + "num_input_tokens_seen": 125675824, + "step": 58230 + }, + { + "epoch": 9.5, + "grad_norm": 1.0804686546325684, + "learning_rate": 3.800534195413069e-07, + "loss": 0.1196, + "num_input_tokens_seen": 125686064, + "step": 58235 + }, + { + "epoch": 9.5, + "eval_loss": 0.1370878517627716, + "eval_runtime": 131.8201, + "eval_samples_per_second": 20.672, + "eval_steps_per_second": 5.174, + "num_input_tokens_seen": 125686064, + "step": 58235 + }, + { + "epoch": 9.500815660685156, + "grad_norm": 1.3066459894180298, + "learning_rate": 3.788179936519226e-07, + "loss": 0.2428, + "num_input_tokens_seen": 125697296, + "step": 58240 + }, + { + "epoch": 9.50163132137031, + "grad_norm": 0.6762257814407349, + "learning_rate": 3.775845636804776e-07, + "loss": 0.0903, + "num_input_tokens_seen": 125708688, + "step": 58245 + }, + { + "epoch": 9.502446982055465, + "grad_norm": 0.0858454629778862, + "learning_rate": 3.7635312972696404e-07, + "loss": 0.0693, + "num_input_tokens_seen": 125719792, + "step": 58250 + }, + { + "epoch": 9.50326264274062, + "grad_norm": 0.17704619467258453, + "learning_rate": 3.751236918912021e-07, + "loss": 0.0267, + "num_input_tokens_seen": 125729680, + "step": 58255 + }, + { + "epoch": 9.504078303425775, + "grad_norm": 0.3844340741634369, + "learning_rate": 3.7389625027285936e-07, + "loss": 0.2357, + "num_input_tokens_seen": 125739472, + "step": 58260 + }, + { + "epoch": 9.50489396411093, + "grad_norm": 1.5978639125823975, + "learning_rate": 3.726708049714367e-07, + "loss": 0.0679, + "num_input_tokens_seen": 125751952, + "step": 58265 + }, + { + "epoch": 9.505709624796085, + "grad_norm": 0.6401198506355286, + "learning_rate": 3.714473560862797e-07, + "loss": 0.0759, + "num_input_tokens_seen": 125762992, + "step": 58270 + }, + { + "epoch": 9.50652528548124, + "grad_norm": 0.13474994897842407, + "learning_rate": 3.702259037165617e-07, + "loss": 0.223, + "num_input_tokens_seen": 125774480, + "step": 58275 + }, + { + "epoch": 9.507340946166394, + "grad_norm": 1.5954400300979614, + "learning_rate": 3.690064479613009e-07, + "loss": 0.1365, + "num_input_tokens_seen": 125784944, + "step": 58280 + }, + { + "epoch": 9.50815660685155, + "grad_norm": 2.253873348236084, + "learning_rate": 3.67788988919357e-07, + "loss": 0.1628, + "num_input_tokens_seen": 125795120, + "step": 58285 + }, + { + "epoch": 9.508972267536706, + "grad_norm": 0.6925262808799744, + "learning_rate": 3.665735266894177e-07, + "loss": 0.1345, + "num_input_tokens_seen": 125806640, + "step": 58290 + }, + { + "epoch": 9.50978792822186, + "grad_norm": 0.522045373916626, + "learning_rate": 3.653600613700209e-07, + "loss": 0.1422, + "num_input_tokens_seen": 125817712, + "step": 58295 + }, + { + "epoch": 9.510603588907015, + "grad_norm": 1.0474458932876587, + "learning_rate": 3.6414859305952955e-07, + "loss": 0.2771, + "num_input_tokens_seen": 125829008, + "step": 58300 + }, + { + "epoch": 9.51141924959217, + "grad_norm": 1.1764123439788818, + "learning_rate": 3.629391218561512e-07, + "loss": 0.143, + "num_input_tokens_seen": 125840016, + "step": 58305 + }, + { + "epoch": 9.512234910277325, + "grad_norm": 0.05863611772656441, + "learning_rate": 3.6173164785794076e-07, + "loss": 0.0341, + "num_input_tokens_seen": 125850608, + "step": 58310 + }, + { + "epoch": 9.513050570962479, + "grad_norm": 1.2838588953018188, + "learning_rate": 3.605261711627728e-07, + "loss": 0.0792, + "num_input_tokens_seen": 125863312, + "step": 58315 + }, + { + "epoch": 9.513866231647635, + "grad_norm": 0.2506711483001709, + "learning_rate": 3.593226918683745e-07, + "loss": 0.0362, + "num_input_tokens_seen": 125874960, + "step": 58320 + }, + { + "epoch": 9.51468189233279, + "grad_norm": 0.8255561590194702, + "learning_rate": 3.5812121007230414e-07, + "loss": 0.0681, + "num_input_tokens_seen": 125885264, + "step": 58325 + }, + { + "epoch": 9.515497553017944, + "grad_norm": 0.08361409604549408, + "learning_rate": 3.569217258719587e-07, + "loss": 0.1658, + "num_input_tokens_seen": 125896624, + "step": 58330 + }, + { + "epoch": 9.5163132137031, + "grad_norm": 1.7575064897537231, + "learning_rate": 3.557242393645771e-07, + "loss": 0.1863, + "num_input_tokens_seen": 125907056, + "step": 58335 + }, + { + "epoch": 9.517128874388254, + "grad_norm": 0.31547561287879944, + "learning_rate": 3.5452875064723445e-07, + "loss": 0.313, + "num_input_tokens_seen": 125917616, + "step": 58340 + }, + { + "epoch": 9.51794453507341, + "grad_norm": 1.142470359802246, + "learning_rate": 3.5333525981683937e-07, + "loss": 0.1556, + "num_input_tokens_seen": 125927984, + "step": 58345 + }, + { + "epoch": 9.518760195758565, + "grad_norm": 1.8744893074035645, + "learning_rate": 3.521437669701422e-07, + "loss": 0.1357, + "num_input_tokens_seen": 125938736, + "step": 58350 + }, + { + "epoch": 9.51957585644372, + "grad_norm": 0.8966554999351501, + "learning_rate": 3.5095427220373513e-07, + "loss": 0.2116, + "num_input_tokens_seen": 125948816, + "step": 58355 + }, + { + "epoch": 9.520391517128875, + "grad_norm": 0.1146811917424202, + "learning_rate": 3.497667756140438e-07, + "loss": 0.0459, + "num_input_tokens_seen": 125959504, + "step": 58360 + }, + { + "epoch": 9.521207177814029, + "grad_norm": 0.34878548979759216, + "learning_rate": 3.4858127729733015e-07, + "loss": 0.094, + "num_input_tokens_seen": 125971504, + "step": 58365 + }, + { + "epoch": 9.522022838499185, + "grad_norm": 0.23213954269886017, + "learning_rate": 3.473977773496978e-07, + "loss": 0.1124, + "num_input_tokens_seen": 125981840, + "step": 58370 + }, + { + "epoch": 9.522838499184338, + "grad_norm": 0.06631080061197281, + "learning_rate": 3.462162758670895e-07, + "loss": 0.1495, + "num_input_tokens_seen": 125992432, + "step": 58375 + }, + { + "epoch": 9.523654159869494, + "grad_norm": 1.9297678470611572, + "learning_rate": 3.4503677294527857e-07, + "loss": 0.2476, + "num_input_tokens_seen": 126002576, + "step": 58380 + }, + { + "epoch": 9.52446982055465, + "grad_norm": 0.2190595269203186, + "learning_rate": 3.438592686798886e-07, + "loss": 0.0873, + "num_input_tokens_seen": 126011920, + "step": 58385 + }, + { + "epoch": 9.525285481239804, + "grad_norm": 0.1602068543434143, + "learning_rate": 3.4268376316636816e-07, + "loss": 0.0373, + "num_input_tokens_seen": 126022864, + "step": 58390 + }, + { + "epoch": 9.52610114192496, + "grad_norm": 0.187246173620224, + "learning_rate": 3.4151025650001056e-07, + "loss": 0.0165, + "num_input_tokens_seen": 126034960, + "step": 58395 + }, + { + "epoch": 9.526916802610113, + "grad_norm": 0.3820634186267853, + "learning_rate": 3.4033874877595074e-07, + "loss": 0.0482, + "num_input_tokens_seen": 126046672, + "step": 58400 + }, + { + "epoch": 9.52773246329527, + "grad_norm": 0.25481411814689636, + "learning_rate": 3.3916924008915163e-07, + "loss": 0.0883, + "num_input_tokens_seen": 126058000, + "step": 58405 + }, + { + "epoch": 9.528548123980425, + "grad_norm": 0.20575512945652008, + "learning_rate": 3.3800173053442354e-07, + "loss": 0.022, + "num_input_tokens_seen": 126068496, + "step": 58410 + }, + { + "epoch": 9.529363784665579, + "grad_norm": 0.460985004901886, + "learning_rate": 3.3683622020640736e-07, + "loss": 0.0664, + "num_input_tokens_seen": 126078544, + "step": 58415 + }, + { + "epoch": 9.530179445350734, + "grad_norm": 1.2559611797332764, + "learning_rate": 3.356727091995859e-07, + "loss": 0.2756, + "num_input_tokens_seen": 126089712, + "step": 58420 + }, + { + "epoch": 9.530995106035888, + "grad_norm": 0.054479438811540604, + "learning_rate": 3.3451119760828374e-07, + "loss": 0.1341, + "num_input_tokens_seen": 126100656, + "step": 58425 + }, + { + "epoch": 9.531810766721044, + "grad_norm": 0.32801616191864014, + "learning_rate": 3.333516855266533e-07, + "loss": 0.2072, + "num_input_tokens_seen": 126111376, + "step": 58430 + }, + { + "epoch": 9.5326264274062, + "grad_norm": 0.2340172380208969, + "learning_rate": 3.321941730486916e-07, + "loss": 0.1207, + "num_input_tokens_seen": 126122640, + "step": 58435 + }, + { + "epoch": 9.533442088091354, + "grad_norm": 0.20330066978931427, + "learning_rate": 3.3103866026823473e-07, + "loss": 0.0916, + "num_input_tokens_seen": 126132432, + "step": 58440 + }, + { + "epoch": 9.53425774877651, + "grad_norm": 0.3213180899620056, + "learning_rate": 3.2988514727895217e-07, + "loss": 0.0376, + "num_input_tokens_seen": 126142576, + "step": 58445 + }, + { + "epoch": 9.535073409461663, + "grad_norm": 0.13617219030857086, + "learning_rate": 3.287336341743524e-07, + "loss": 0.0344, + "num_input_tokens_seen": 126153552, + "step": 58450 + }, + { + "epoch": 9.535889070146819, + "grad_norm": 0.17003437876701355, + "learning_rate": 3.275841210477887e-07, + "loss": 0.1723, + "num_input_tokens_seen": 126165072, + "step": 58455 + }, + { + "epoch": 9.536704730831975, + "grad_norm": 0.22063560783863068, + "learning_rate": 3.264366079924419e-07, + "loss": 0.1179, + "num_input_tokens_seen": 126175696, + "step": 58460 + }, + { + "epoch": 9.537520391517129, + "grad_norm": 0.1787348836660385, + "learning_rate": 3.252910951013349e-07, + "loss": 0.1053, + "num_input_tokens_seen": 126187088, + "step": 58465 + }, + { + "epoch": 9.538336052202284, + "grad_norm": 1.200135350227356, + "learning_rate": 3.2414758246733234e-07, + "loss": 0.1518, + "num_input_tokens_seen": 126198480, + "step": 58470 + }, + { + "epoch": 9.539151712887438, + "grad_norm": 2.2001709938049316, + "learning_rate": 3.2300607018312944e-07, + "loss": 0.124, + "num_input_tokens_seen": 126209904, + "step": 58475 + }, + { + "epoch": 9.539967373572594, + "grad_norm": 0.03839574009180069, + "learning_rate": 3.2186655834126335e-07, + "loss": 0.0306, + "num_input_tokens_seen": 126219792, + "step": 58480 + }, + { + "epoch": 9.540783034257748, + "grad_norm": 0.17751020193099976, + "learning_rate": 3.207290470341101e-07, + "loss": 0.1001, + "num_input_tokens_seen": 126230256, + "step": 58485 + }, + { + "epoch": 9.541598694942904, + "grad_norm": 0.10896424949169159, + "learning_rate": 3.1959353635388214e-07, + "loss": 0.1779, + "num_input_tokens_seen": 126241136, + "step": 58490 + }, + { + "epoch": 9.54241435562806, + "grad_norm": 0.8789805173873901, + "learning_rate": 3.1846002639263074e-07, + "loss": 0.0622, + "num_input_tokens_seen": 126252048, + "step": 58495 + }, + { + "epoch": 9.543230016313213, + "grad_norm": 0.6125621795654297, + "learning_rate": 3.17328517242238e-07, + "loss": 0.1194, + "num_input_tokens_seen": 126262320, + "step": 58500 + }, + { + "epoch": 9.544045676998369, + "grad_norm": 1.8033124208450317, + "learning_rate": 3.161990089944389e-07, + "loss": 0.2793, + "num_input_tokens_seen": 126272464, + "step": 58505 + }, + { + "epoch": 9.544861337683523, + "grad_norm": 0.3125329315662384, + "learning_rate": 3.150715017407907e-07, + "loss": 0.0357, + "num_input_tokens_seen": 126283984, + "step": 58510 + }, + { + "epoch": 9.545676998368679, + "grad_norm": 0.5055629014968872, + "learning_rate": 3.1394599557269534e-07, + "loss": 0.0766, + "num_input_tokens_seen": 126295024, + "step": 58515 + }, + { + "epoch": 9.546492659053834, + "grad_norm": 0.06834249943494797, + "learning_rate": 3.128224905813965e-07, + "loss": 0.1603, + "num_input_tokens_seen": 126305296, + "step": 58520 + }, + { + "epoch": 9.547308319738988, + "grad_norm": 0.07821431756019592, + "learning_rate": 3.1170098685796565e-07, + "loss": 0.1336, + "num_input_tokens_seen": 126317264, + "step": 58525 + }, + { + "epoch": 9.548123980424144, + "grad_norm": 0.39453375339508057, + "learning_rate": 3.1058148449331914e-07, + "loss": 0.1397, + "num_input_tokens_seen": 126326640, + "step": 58530 + }, + { + "epoch": 9.548939641109298, + "grad_norm": 0.7962789535522461, + "learning_rate": 3.09463983578212e-07, + "loss": 0.0774, + "num_input_tokens_seen": 126337840, + "step": 58535 + }, + { + "epoch": 9.549755301794454, + "grad_norm": 0.7817016839981079, + "learning_rate": 3.0834848420323305e-07, + "loss": 0.2105, + "num_input_tokens_seen": 126347792, + "step": 58540 + }, + { + "epoch": 9.550570962479608, + "grad_norm": 0.5145885944366455, + "learning_rate": 3.0723498645880976e-07, + "loss": 0.1222, + "num_input_tokens_seen": 126359344, + "step": 58545 + }, + { + "epoch": 9.551386623164763, + "grad_norm": 1.5794398784637451, + "learning_rate": 3.061234904352089e-07, + "loss": 0.2253, + "num_input_tokens_seen": 126370064, + "step": 58550 + }, + { + "epoch": 9.552202283849919, + "grad_norm": 0.30722013115882874, + "learning_rate": 3.0501399622253344e-07, + "loss": 0.0231, + "num_input_tokens_seen": 126380080, + "step": 58555 + }, + { + "epoch": 9.553017944535073, + "grad_norm": 1.019628643989563, + "learning_rate": 3.0390650391072527e-07, + "loss": 0.0909, + "num_input_tokens_seen": 126390864, + "step": 58560 + }, + { + "epoch": 9.553833605220229, + "grad_norm": 0.05778788775205612, + "learning_rate": 3.028010135895598e-07, + "loss": 0.1334, + "num_input_tokens_seen": 126402224, + "step": 58565 + }, + { + "epoch": 9.554649265905383, + "grad_norm": 0.09439906477928162, + "learning_rate": 3.016975253486598e-07, + "loss": 0.0161, + "num_input_tokens_seen": 126412464, + "step": 58570 + }, + { + "epoch": 9.555464926590538, + "grad_norm": 0.6377114653587341, + "learning_rate": 3.0059603927747313e-07, + "loss": 0.0994, + "num_input_tokens_seen": 126423600, + "step": 58575 + }, + { + "epoch": 9.556280587275694, + "grad_norm": 0.6713332533836365, + "learning_rate": 2.9949655546529785e-07, + "loss": 0.1411, + "num_input_tokens_seen": 126434928, + "step": 58580 + }, + { + "epoch": 9.557096247960848, + "grad_norm": 1.0873801708221436, + "learning_rate": 2.9839907400125986e-07, + "loss": 0.1064, + "num_input_tokens_seen": 126444944, + "step": 58585 + }, + { + "epoch": 9.557911908646004, + "grad_norm": 0.07090858370065689, + "learning_rate": 2.973035949743269e-07, + "loss": 0.1268, + "num_input_tokens_seen": 126456208, + "step": 58590 + }, + { + "epoch": 9.558727569331158, + "grad_norm": 0.09495173394680023, + "learning_rate": 2.9621011847330293e-07, + "loss": 0.0397, + "num_input_tokens_seen": 126467312, + "step": 58595 + }, + { + "epoch": 9.559543230016313, + "grad_norm": 0.7490937113761902, + "learning_rate": 2.951186445868337e-07, + "loss": 0.144, + "num_input_tokens_seen": 126476656, + "step": 58600 + }, + { + "epoch": 9.560358890701469, + "grad_norm": 0.42767471075057983, + "learning_rate": 2.940291734034012e-07, + "loss": 0.2282, + "num_input_tokens_seen": 126487920, + "step": 58605 + }, + { + "epoch": 9.561174551386623, + "grad_norm": 0.06160451099276543, + "learning_rate": 2.929417050113181e-07, + "loss": 0.0262, + "num_input_tokens_seen": 126499216, + "step": 58610 + }, + { + "epoch": 9.561990212071779, + "grad_norm": 0.09681235998868942, + "learning_rate": 2.918562394987445e-07, + "loss": 0.0252, + "num_input_tokens_seen": 126509936, + "step": 58615 + }, + { + "epoch": 9.562805872756933, + "grad_norm": 2.1366350650787354, + "learning_rate": 2.907727769536683e-07, + "loss": 0.1811, + "num_input_tokens_seen": 126521776, + "step": 58620 + }, + { + "epoch": 9.563621533442088, + "grad_norm": 1.5031416416168213, + "learning_rate": 2.8969131746392763e-07, + "loss": 0.0813, + "num_input_tokens_seen": 126532464, + "step": 58625 + }, + { + "epoch": 9.564437194127244, + "grad_norm": 0.12723496556282043, + "learning_rate": 2.886118611171884e-07, + "loss": 0.0883, + "num_input_tokens_seen": 126542992, + "step": 58630 + }, + { + "epoch": 9.565252854812398, + "grad_norm": 1.9969114065170288, + "learning_rate": 2.875344080009529e-07, + "loss": 0.231, + "num_input_tokens_seen": 126553680, + "step": 58635 + }, + { + "epoch": 9.566068515497554, + "grad_norm": 0.3879339396953583, + "learning_rate": 2.8645895820257065e-07, + "loss": 0.0407, + "num_input_tokens_seen": 126563440, + "step": 58640 + }, + { + "epoch": 9.566884176182707, + "grad_norm": 0.35782474279403687, + "learning_rate": 2.8538551180921913e-07, + "loss": 0.1961, + "num_input_tokens_seen": 126573904, + "step": 58645 + }, + { + "epoch": 9.567699836867863, + "grad_norm": 1.877886414527893, + "learning_rate": 2.8431406890792045e-07, + "loss": 0.254, + "num_input_tokens_seen": 126584560, + "step": 58650 + }, + { + "epoch": 9.568515497553017, + "grad_norm": 0.10818041861057281, + "learning_rate": 2.8324462958552735e-07, + "loss": 0.049, + "num_input_tokens_seen": 126595632, + "step": 58655 + }, + { + "epoch": 9.569331158238173, + "grad_norm": 0.1382674127817154, + "learning_rate": 2.821771939287371e-07, + "loss": 0.0621, + "num_input_tokens_seen": 126606800, + "step": 58660 + }, + { + "epoch": 9.570146818923329, + "grad_norm": 0.9063627123832703, + "learning_rate": 2.811117620240833e-07, + "loss": 0.1023, + "num_input_tokens_seen": 126617808, + "step": 58665 + }, + { + "epoch": 9.570962479608482, + "grad_norm": 0.2340969443321228, + "learning_rate": 2.800483339579274e-07, + "loss": 0.2406, + "num_input_tokens_seen": 126628720, + "step": 58670 + }, + { + "epoch": 9.571778140293638, + "grad_norm": 0.2854406535625458, + "learning_rate": 2.789869098164838e-07, + "loss": 0.0939, + "num_input_tokens_seen": 126639664, + "step": 58675 + }, + { + "epoch": 9.572593800978792, + "grad_norm": 0.06630799919366837, + "learning_rate": 2.779274896857947e-07, + "loss": 0.0189, + "num_input_tokens_seen": 126650736, + "step": 58680 + }, + { + "epoch": 9.573409461663948, + "grad_norm": 2.1239616870880127, + "learning_rate": 2.768700736517416e-07, + "loss": 0.1563, + "num_input_tokens_seen": 126662224, + "step": 58685 + }, + { + "epoch": 9.574225122349104, + "grad_norm": 0.524193525314331, + "learning_rate": 2.7581466180004454e-07, + "loss": 0.0351, + "num_input_tokens_seen": 126673040, + "step": 58690 + }, + { + "epoch": 9.575040783034257, + "grad_norm": 0.4001074433326721, + "learning_rate": 2.747612542162603e-07, + "loss": 0.1124, + "num_input_tokens_seen": 126683824, + "step": 58695 + }, + { + "epoch": 9.575856443719413, + "grad_norm": 0.09200292080640793, + "learning_rate": 2.737098509857816e-07, + "loss": 0.0605, + "num_input_tokens_seen": 126693488, + "step": 58700 + }, + { + "epoch": 9.576672104404567, + "grad_norm": 1.6239614486694336, + "learning_rate": 2.726604521938458e-07, + "loss": 0.2465, + "num_input_tokens_seen": 126703888, + "step": 58705 + }, + { + "epoch": 9.577487765089723, + "grad_norm": 0.08846530318260193, + "learning_rate": 2.716130579255155e-07, + "loss": 0.2111, + "num_input_tokens_seen": 126715440, + "step": 58710 + }, + { + "epoch": 9.578303425774878, + "grad_norm": 0.7329774498939514, + "learning_rate": 2.7056766826570045e-07, + "loss": 0.2849, + "num_input_tokens_seen": 126725680, + "step": 58715 + }, + { + "epoch": 9.579119086460032, + "grad_norm": 1.3725229501724243, + "learning_rate": 2.6952428329914956e-07, + "loss": 0.063, + "num_input_tokens_seen": 126736144, + "step": 58720 + }, + { + "epoch": 9.579934747145188, + "grad_norm": 0.5967692732810974, + "learning_rate": 2.684829031104397e-07, + "loss": 0.1586, + "num_input_tokens_seen": 126747504, + "step": 58725 + }, + { + "epoch": 9.580750407830342, + "grad_norm": 0.06760026514530182, + "learning_rate": 2.6744352778399204e-07, + "loss": 0.1187, + "num_input_tokens_seen": 126757904, + "step": 58730 + }, + { + "epoch": 9.581566068515498, + "grad_norm": 0.7837621569633484, + "learning_rate": 2.6640615740406436e-07, + "loss": 0.0616, + "num_input_tokens_seen": 126768400, + "step": 58735 + }, + { + "epoch": 9.582381729200652, + "grad_norm": 1.357526421546936, + "learning_rate": 2.6537079205475323e-07, + "loss": 0.1971, + "num_input_tokens_seen": 126779440, + "step": 58740 + }, + { + "epoch": 9.583197389885807, + "grad_norm": 1.330133318901062, + "learning_rate": 2.6433743181998316e-07, + "loss": 0.1482, + "num_input_tokens_seen": 126790672, + "step": 58745 + }, + { + "epoch": 9.584013050570963, + "grad_norm": 0.7420561909675598, + "learning_rate": 2.633060767835316e-07, + "loss": 0.0956, + "num_input_tokens_seen": 126799824, + "step": 58750 + }, + { + "epoch": 9.584828711256117, + "grad_norm": 0.5674729347229004, + "learning_rate": 2.6227672702900106e-07, + "loss": 0.0485, + "num_input_tokens_seen": 126811248, + "step": 58755 + }, + { + "epoch": 9.585644371941273, + "grad_norm": 1.418708324432373, + "learning_rate": 2.61249382639836e-07, + "loss": 0.3309, + "num_input_tokens_seen": 126822256, + "step": 58760 + }, + { + "epoch": 9.586460032626427, + "grad_norm": 0.5379208326339722, + "learning_rate": 2.6022404369931976e-07, + "loss": 0.1735, + "num_input_tokens_seen": 126833840, + "step": 58765 + }, + { + "epoch": 9.587275693311582, + "grad_norm": 0.48695212602615356, + "learning_rate": 2.592007102905719e-07, + "loss": 0.0228, + "num_input_tokens_seen": 126846160, + "step": 58770 + }, + { + "epoch": 9.588091353996738, + "grad_norm": 1.97133469581604, + "learning_rate": 2.581793824965484e-07, + "loss": 0.0873, + "num_input_tokens_seen": 126857904, + "step": 58775 + }, + { + "epoch": 9.588907014681892, + "grad_norm": 0.3981563150882721, + "learning_rate": 2.5716006040004123e-07, + "loss": 0.1544, + "num_input_tokens_seen": 126869232, + "step": 58780 + }, + { + "epoch": 9.589722675367048, + "grad_norm": 0.033214252442121506, + "learning_rate": 2.5614274408368444e-07, + "loss": 0.0725, + "num_input_tokens_seen": 126880784, + "step": 58785 + }, + { + "epoch": 9.590538336052202, + "grad_norm": 1.3154807090759277, + "learning_rate": 2.5512743362994527e-07, + "loss": 0.0921, + "num_input_tokens_seen": 126891600, + "step": 58790 + }, + { + "epoch": 9.591353996737357, + "grad_norm": 0.8926984071731567, + "learning_rate": 2.541141291211302e-07, + "loss": 0.143, + "num_input_tokens_seen": 126903056, + "step": 58795 + }, + { + "epoch": 9.592169657422513, + "grad_norm": 0.28274044394493103, + "learning_rate": 2.5310283063938457e-07, + "loss": 0.1519, + "num_input_tokens_seen": 126914032, + "step": 58800 + }, + { + "epoch": 9.592985318107667, + "grad_norm": 0.0821286216378212, + "learning_rate": 2.5209353826668726e-07, + "loss": 0.297, + "num_input_tokens_seen": 126923920, + "step": 58805 + }, + { + "epoch": 9.593800978792823, + "grad_norm": 0.37202179431915283, + "learning_rate": 2.510862520848589e-07, + "loss": 0.0417, + "num_input_tokens_seen": 126935216, + "step": 58810 + }, + { + "epoch": 9.594616639477977, + "grad_norm": 0.1484791785478592, + "learning_rate": 2.500809721755509e-07, + "loss": 0.0589, + "num_input_tokens_seen": 126944752, + "step": 58815 + }, + { + "epoch": 9.595432300163132, + "grad_norm": 1.5112674236297607, + "learning_rate": 2.490776986202592e-07, + "loss": 0.1784, + "num_input_tokens_seen": 126955856, + "step": 58820 + }, + { + "epoch": 9.596247960848288, + "grad_norm": 1.3147807121276855, + "learning_rate": 2.480764315003159e-07, + "loss": 0.096, + "num_input_tokens_seen": 126966064, + "step": 58825 + }, + { + "epoch": 9.597063621533442, + "grad_norm": 0.03753288462758064, + "learning_rate": 2.470771708968866e-07, + "loss": 0.1125, + "num_input_tokens_seen": 126977424, + "step": 58830 + }, + { + "epoch": 9.597879282218598, + "grad_norm": 1.4761757850646973, + "learning_rate": 2.4607991689097607e-07, + "loss": 0.1563, + "num_input_tokens_seen": 126987728, + "step": 58835 + }, + { + "epoch": 9.598694942903752, + "grad_norm": 0.12523235380649567, + "learning_rate": 2.4508466956343066e-07, + "loss": 0.1816, + "num_input_tokens_seen": 126999536, + "step": 58840 + }, + { + "epoch": 9.599510603588907, + "grad_norm": 0.3203262686729431, + "learning_rate": 2.4409142899492474e-07, + "loss": 0.1583, + "num_input_tokens_seen": 127010384, + "step": 58845 + }, + { + "epoch": 9.600326264274061, + "grad_norm": 0.8678780198097229, + "learning_rate": 2.4310019526597726e-07, + "loss": 0.0342, + "num_input_tokens_seen": 127020176, + "step": 58850 + }, + { + "epoch": 9.601141924959217, + "grad_norm": 1.9295153617858887, + "learning_rate": 2.4211096845694336e-07, + "loss": 0.2036, + "num_input_tokens_seen": 127030192, + "step": 58855 + }, + { + "epoch": 9.601957585644373, + "grad_norm": 0.7878510355949402, + "learning_rate": 2.411237486480145e-07, + "loss": 0.0808, + "num_input_tokens_seen": 127041808, + "step": 58860 + }, + { + "epoch": 9.602773246329527, + "grad_norm": 0.0850197821855545, + "learning_rate": 2.4013853591922097e-07, + "loss": 0.0883, + "num_input_tokens_seen": 127052400, + "step": 58865 + }, + { + "epoch": 9.603588907014682, + "grad_norm": 0.047383468598127365, + "learning_rate": 2.391553303504296e-07, + "loss": 0.1151, + "num_input_tokens_seen": 127063056, + "step": 58870 + }, + { + "epoch": 9.604404567699836, + "grad_norm": 0.13096512854099274, + "learning_rate": 2.3817413202134041e-07, + "loss": 0.1349, + "num_input_tokens_seen": 127073840, + "step": 58875 + }, + { + "epoch": 9.605220228384992, + "grad_norm": 0.054293178021907806, + "learning_rate": 2.3719494101149543e-07, + "loss": 0.0828, + "num_input_tokens_seen": 127084656, + "step": 58880 + }, + { + "epoch": 9.606035889070148, + "grad_norm": 0.11467405408620834, + "learning_rate": 2.3621775740027553e-07, + "loss": 0.0538, + "num_input_tokens_seen": 127095984, + "step": 58885 + }, + { + "epoch": 9.606851549755302, + "grad_norm": 0.06921184062957764, + "learning_rate": 2.3524258126689235e-07, + "loss": 0.0797, + "num_input_tokens_seen": 127105776, + "step": 58890 + }, + { + "epoch": 9.607667210440457, + "grad_norm": 0.6248638033866882, + "learning_rate": 2.3426941269040213e-07, + "loss": 0.1431, + "num_input_tokens_seen": 127117424, + "step": 58895 + }, + { + "epoch": 9.608482871125611, + "grad_norm": 1.404083490371704, + "learning_rate": 2.3329825174969455e-07, + "loss": 0.2112, + "num_input_tokens_seen": 127128240, + "step": 58900 + }, + { + "epoch": 9.609298531810767, + "grad_norm": 0.7190012335777283, + "learning_rate": 2.3232909852349273e-07, + "loss": 0.0602, + "num_input_tokens_seen": 127140176, + "step": 58905 + }, + { + "epoch": 9.61011419249592, + "grad_norm": 0.3880137801170349, + "learning_rate": 2.3136195309036435e-07, + "loss": 0.0972, + "num_input_tokens_seen": 127149392, + "step": 58910 + }, + { + "epoch": 9.610929853181077, + "grad_norm": 0.11447454988956451, + "learning_rate": 2.303968155287134e-07, + "loss": 0.0512, + "num_input_tokens_seen": 127160496, + "step": 58915 + }, + { + "epoch": 9.611745513866232, + "grad_norm": 2.2581615447998047, + "learning_rate": 2.294336859167745e-07, + "loss": 0.1164, + "num_input_tokens_seen": 127170544, + "step": 58920 + }, + { + "epoch": 9.612561174551386, + "grad_norm": 0.10777644068002701, + "learning_rate": 2.2847256433262686e-07, + "loss": 0.0755, + "num_input_tokens_seen": 127182992, + "step": 58925 + }, + { + "epoch": 9.613376835236542, + "grad_norm": 0.6402192711830139, + "learning_rate": 2.2751345085418042e-07, + "loss": 0.0722, + "num_input_tokens_seen": 127194320, + "step": 58930 + }, + { + "epoch": 9.614192495921696, + "grad_norm": 0.12566886842250824, + "learning_rate": 2.265563455591896e-07, + "loss": 0.0897, + "num_input_tokens_seen": 127205776, + "step": 58935 + }, + { + "epoch": 9.615008156606851, + "grad_norm": 1.3291486501693726, + "learning_rate": 2.2560124852523955e-07, + "loss": 0.1472, + "num_input_tokens_seen": 127217264, + "step": 58940 + }, + { + "epoch": 9.615823817292007, + "grad_norm": 0.3998117446899414, + "learning_rate": 2.246481598297573e-07, + "loss": 0.1289, + "num_input_tokens_seen": 127229584, + "step": 58945 + }, + { + "epoch": 9.616639477977161, + "grad_norm": 0.2852746248245239, + "learning_rate": 2.2369707955000318e-07, + "loss": 0.1112, + "num_input_tokens_seen": 127239824, + "step": 58950 + }, + { + "epoch": 9.617455138662317, + "grad_norm": 0.2554086744785309, + "learning_rate": 2.2274800776307946e-07, + "loss": 0.0784, + "num_input_tokens_seen": 127250672, + "step": 58955 + }, + { + "epoch": 9.61827079934747, + "grad_norm": 0.617014467716217, + "learning_rate": 2.2180094454591903e-07, + "loss": 0.184, + "num_input_tokens_seen": 127260496, + "step": 58960 + }, + { + "epoch": 9.619086460032626, + "grad_norm": 0.5217033624649048, + "learning_rate": 2.2085588997529938e-07, + "loss": 0.172, + "num_input_tokens_seen": 127272336, + "step": 58965 + }, + { + "epoch": 9.619902120717782, + "grad_norm": 1.0626221895217896, + "learning_rate": 2.1991284412782864e-07, + "loss": 0.0969, + "num_input_tokens_seen": 127284368, + "step": 58970 + }, + { + "epoch": 9.620717781402936, + "grad_norm": 0.05967749282717705, + "learning_rate": 2.18971807079954e-07, + "loss": 0.1709, + "num_input_tokens_seen": 127294032, + "step": 58975 + }, + { + "epoch": 9.621533442088092, + "grad_norm": 0.06534300744533539, + "learning_rate": 2.1803277890796447e-07, + "loss": 0.1709, + "num_input_tokens_seen": 127304080, + "step": 58980 + }, + { + "epoch": 9.622349102773246, + "grad_norm": 0.6560854315757751, + "learning_rate": 2.170957596879797e-07, + "loss": 0.0614, + "num_input_tokens_seen": 127315888, + "step": 58985 + }, + { + "epoch": 9.623164763458401, + "grad_norm": 1.3650636672973633, + "learning_rate": 2.1616074949595832e-07, + "loss": 0.1034, + "num_input_tokens_seen": 127326864, + "step": 58990 + }, + { + "epoch": 9.623980424143557, + "grad_norm": 0.24395589530467987, + "learning_rate": 2.1522774840770087e-07, + "loss": 0.0779, + "num_input_tokens_seen": 127336368, + "step": 58995 + }, + { + "epoch": 9.624796084828711, + "grad_norm": 0.29466867446899414, + "learning_rate": 2.1429675649883575e-07, + "loss": 0.0659, + "num_input_tokens_seen": 127347824, + "step": 59000 + }, + { + "epoch": 9.625611745513867, + "grad_norm": 1.59531831741333, + "learning_rate": 2.1336777384484141e-07, + "loss": 0.1529, + "num_input_tokens_seen": 127356656, + "step": 59005 + }, + { + "epoch": 9.62642740619902, + "grad_norm": 0.14354895055294037, + "learning_rate": 2.1244080052101879e-07, + "loss": 0.2323, + "num_input_tokens_seen": 127367120, + "step": 59010 + }, + { + "epoch": 9.627243066884176, + "grad_norm": 0.7929092645645142, + "learning_rate": 2.115158366025133e-07, + "loss": 0.1469, + "num_input_tokens_seen": 127378800, + "step": 59015 + }, + { + "epoch": 9.62805872756933, + "grad_norm": 1.1574995517730713, + "learning_rate": 2.1059288216431217e-07, + "loss": 0.1373, + "num_input_tokens_seen": 127390000, + "step": 59020 + }, + { + "epoch": 9.628874388254486, + "grad_norm": 0.056243497878313065, + "learning_rate": 2.0967193728123334e-07, + "loss": 0.0458, + "num_input_tokens_seen": 127400144, + "step": 59025 + }, + { + "epoch": 9.629690048939642, + "grad_norm": 0.06640125811100006, + "learning_rate": 2.0875300202793101e-07, + "loss": 0.1223, + "num_input_tokens_seen": 127411568, + "step": 59030 + }, + { + "epoch": 9.630505709624796, + "grad_norm": 0.174809530377388, + "learning_rate": 2.0783607647889837e-07, + "loss": 0.1527, + "num_input_tokens_seen": 127421520, + "step": 59035 + }, + { + "epoch": 9.631321370309951, + "grad_norm": 0.32965558767318726, + "learning_rate": 2.0692116070847035e-07, + "loss": 0.1196, + "num_input_tokens_seen": 127433424, + "step": 59040 + }, + { + "epoch": 9.632137030995105, + "grad_norm": 1.089026689529419, + "learning_rate": 2.0600825479080986e-07, + "loss": 0.0813, + "num_input_tokens_seen": 127443408, + "step": 59045 + }, + { + "epoch": 9.632952691680261, + "grad_norm": 1.4127638339996338, + "learning_rate": 2.0509735879992442e-07, + "loss": 0.1182, + "num_input_tokens_seen": 127454256, + "step": 59050 + }, + { + "epoch": 9.633768352365417, + "grad_norm": 0.05491199716925621, + "learning_rate": 2.041884728096549e-07, + "loss": 0.1637, + "num_input_tokens_seen": 127464624, + "step": 59055 + }, + { + "epoch": 9.63458401305057, + "grad_norm": 0.27766233682632446, + "learning_rate": 2.0328159689368133e-07, + "loss": 0.0909, + "num_input_tokens_seen": 127474192, + "step": 59060 + }, + { + "epoch": 9.635399673735726, + "grad_norm": 2.0195810794830322, + "learning_rate": 2.0237673112551704e-07, + "loss": 0.1299, + "num_input_tokens_seen": 127483056, + "step": 59065 + }, + { + "epoch": 9.63621533442088, + "grad_norm": 2.196974515914917, + "learning_rate": 2.0147387557851727e-07, + "loss": 0.1392, + "num_input_tokens_seen": 127493360, + "step": 59070 + }, + { + "epoch": 9.637030995106036, + "grad_norm": 0.8527790307998657, + "learning_rate": 2.005730303258735e-07, + "loss": 0.1375, + "num_input_tokens_seen": 127503728, + "step": 59075 + }, + { + "epoch": 9.63784665579119, + "grad_norm": 0.04289667308330536, + "learning_rate": 1.9967419544060784e-07, + "loss": 0.0565, + "num_input_tokens_seen": 127514672, + "step": 59080 + }, + { + "epoch": 9.638662316476346, + "grad_norm": 0.12937946617603302, + "learning_rate": 1.987773709955898e-07, + "loss": 0.0571, + "num_input_tokens_seen": 127525200, + "step": 59085 + }, + { + "epoch": 9.639477977161501, + "grad_norm": 0.19430461525917053, + "learning_rate": 1.9788255706351678e-07, + "loss": 0.1069, + "num_input_tokens_seen": 127535824, + "step": 59090 + }, + { + "epoch": 9.640293637846655, + "grad_norm": 0.07089689373970032, + "learning_rate": 1.9698975371693075e-07, + "loss": 0.0781, + "num_input_tokens_seen": 127546960, + "step": 59095 + }, + { + "epoch": 9.641109298531811, + "grad_norm": 1.8465418815612793, + "learning_rate": 1.9609896102820157e-07, + "loss": 0.2024, + "num_input_tokens_seen": 127557680, + "step": 59100 + }, + { + "epoch": 9.641924959216965, + "grad_norm": 0.4027233123779297, + "learning_rate": 1.9521017906954654e-07, + "loss": 0.0679, + "num_input_tokens_seen": 127567696, + "step": 59105 + }, + { + "epoch": 9.64274061990212, + "grad_norm": 0.18666411936283112, + "learning_rate": 1.9432340791301073e-07, + "loss": 0.2595, + "num_input_tokens_seen": 127578736, + "step": 59110 + }, + { + "epoch": 9.643556280587276, + "grad_norm": 0.9968124032020569, + "learning_rate": 1.9343864763048392e-07, + "loss": 0.0569, + "num_input_tokens_seen": 127590224, + "step": 59115 + }, + { + "epoch": 9.64437194127243, + "grad_norm": 0.11648619174957275, + "learning_rate": 1.925558982936865e-07, + "loss": 0.0515, + "num_input_tokens_seen": 127601328, + "step": 59120 + }, + { + "epoch": 9.645187601957586, + "grad_norm": 0.6405937671661377, + "learning_rate": 1.916751599741806e-07, + "loss": 0.0518, + "num_input_tokens_seen": 127612464, + "step": 59125 + }, + { + "epoch": 9.64600326264274, + "grad_norm": 1.2578669786453247, + "learning_rate": 1.90796432743362e-07, + "loss": 0.0914, + "num_input_tokens_seen": 127622864, + "step": 59130 + }, + { + "epoch": 9.646818923327896, + "grad_norm": 1.0724509954452515, + "learning_rate": 1.8991971667246533e-07, + "loss": 0.0884, + "num_input_tokens_seen": 127634000, + "step": 59135 + }, + { + "epoch": 9.647634584013051, + "grad_norm": 1.7725491523742676, + "learning_rate": 1.8904501183256152e-07, + "loss": 0.0594, + "num_input_tokens_seen": 127645264, + "step": 59140 + }, + { + "epoch": 9.648450244698205, + "grad_norm": 1.4115257263183594, + "learning_rate": 1.8817231829455773e-07, + "loss": 0.0383, + "num_input_tokens_seen": 127655376, + "step": 59145 + }, + { + "epoch": 9.649265905383361, + "grad_norm": 0.16820044815540314, + "learning_rate": 1.8730163612920015e-07, + "loss": 0.2458, + "num_input_tokens_seen": 127665584, + "step": 59150 + }, + { + "epoch": 9.650081566068515, + "grad_norm": 0.9547114968299866, + "learning_rate": 1.8643296540707121e-07, + "loss": 0.1486, + "num_input_tokens_seen": 127675920, + "step": 59155 + }, + { + "epoch": 9.65089722675367, + "grad_norm": 0.5727272629737854, + "learning_rate": 1.855663061985896e-07, + "loss": 0.049, + "num_input_tokens_seen": 127686928, + "step": 59160 + }, + { + "epoch": 9.651712887438826, + "grad_norm": 0.24793057143688202, + "learning_rate": 1.8470165857401023e-07, + "loss": 0.0251, + "num_input_tokens_seen": 127697296, + "step": 59165 + }, + { + "epoch": 9.65252854812398, + "grad_norm": 0.11319496482610703, + "learning_rate": 1.8383902260342422e-07, + "loss": 0.163, + "num_input_tokens_seen": 127707312, + "step": 59170 + }, + { + "epoch": 9.653344208809136, + "grad_norm": 0.14892293512821198, + "learning_rate": 1.8297839835676456e-07, + "loss": 0.08, + "num_input_tokens_seen": 127719056, + "step": 59175 + }, + { + "epoch": 9.65415986949429, + "grad_norm": 0.2891312837600708, + "learning_rate": 1.8211978590379486e-07, + "loss": 0.0994, + "num_input_tokens_seen": 127730864, + "step": 59180 + }, + { + "epoch": 9.654975530179446, + "grad_norm": 1.0486253499984741, + "learning_rate": 1.8126318531412056e-07, + "loss": 0.0858, + "num_input_tokens_seen": 127740944, + "step": 59185 + }, + { + "epoch": 9.655791190864601, + "grad_norm": 0.04171081632375717, + "learning_rate": 1.8040859665718057e-07, + "loss": 0.0545, + "num_input_tokens_seen": 127752016, + "step": 59190 + }, + { + "epoch": 9.656606851549755, + "grad_norm": 0.18775391578674316, + "learning_rate": 1.795560200022528e-07, + "loss": 0.0243, + "num_input_tokens_seen": 127762480, + "step": 59195 + }, + { + "epoch": 9.65742251223491, + "grad_norm": 0.2110985517501831, + "learning_rate": 1.7870545541845418e-07, + "loss": 0.1149, + "num_input_tokens_seen": 127773232, + "step": 59200 + }, + { + "epoch": 9.658238172920065, + "grad_norm": 0.23936668038368225, + "learning_rate": 1.7785690297473234e-07, + "loss": 0.0718, + "num_input_tokens_seen": 127784112, + "step": 59205 + }, + { + "epoch": 9.65905383360522, + "grad_norm": 0.12913621962070465, + "learning_rate": 1.770103627398767e-07, + "loss": 0.1244, + "num_input_tokens_seen": 127795600, + "step": 59210 + }, + { + "epoch": 9.659869494290374, + "grad_norm": 1.1181679964065552, + "learning_rate": 1.7616583478251013e-07, + "loss": 0.152, + "num_input_tokens_seen": 127805648, + "step": 59215 + }, + { + "epoch": 9.66068515497553, + "grad_norm": 0.7979565262794495, + "learning_rate": 1.7532331917109457e-07, + "loss": 0.0548, + "num_input_tokens_seen": 127816560, + "step": 59220 + }, + { + "epoch": 9.661500815660686, + "grad_norm": 0.41302141547203064, + "learning_rate": 1.7448281597393368e-07, + "loss": 0.2091, + "num_input_tokens_seen": 127828784, + "step": 59225 + }, + { + "epoch": 9.66231647634584, + "grad_norm": 1.8799833059310913, + "learning_rate": 1.736443252591563e-07, + "loss": 0.119, + "num_input_tokens_seen": 127839088, + "step": 59230 + }, + { + "epoch": 9.663132137030995, + "grad_norm": 0.7927495837211609, + "learning_rate": 1.7280784709473862e-07, + "loss": 0.1695, + "num_input_tokens_seen": 127848016, + "step": 59235 + }, + { + "epoch": 9.66394779771615, + "grad_norm": 0.0769738107919693, + "learning_rate": 1.719733815484903e-07, + "loss": 0.0851, + "num_input_tokens_seen": 127858320, + "step": 59240 + }, + { + "epoch": 9.664763458401305, + "grad_norm": 0.6704502701759338, + "learning_rate": 1.7114092868805443e-07, + "loss": 0.2248, + "num_input_tokens_seen": 127869936, + "step": 59245 + }, + { + "epoch": 9.66557911908646, + "grad_norm": 0.4188607931137085, + "learning_rate": 1.7031048858091313e-07, + "loss": 0.0887, + "num_input_tokens_seen": 127881040, + "step": 59250 + }, + { + "epoch": 9.666394779771615, + "grad_norm": 0.4489705264568329, + "learning_rate": 1.6948206129439037e-07, + "loss": 0.1932, + "num_input_tokens_seen": 127891792, + "step": 59255 + }, + { + "epoch": 9.66721044045677, + "grad_norm": 2.6023764610290527, + "learning_rate": 1.6865564689564074e-07, + "loss": 0.3449, + "num_input_tokens_seen": 127903376, + "step": 59260 + }, + { + "epoch": 9.668026101141924, + "grad_norm": 0.28623345494270325, + "learning_rate": 1.6783124545165785e-07, + "loss": 0.0494, + "num_input_tokens_seen": 127914576, + "step": 59265 + }, + { + "epoch": 9.66884176182708, + "grad_norm": 0.22031381726264954, + "learning_rate": 1.6700885702926882e-07, + "loss": 0.0415, + "num_input_tokens_seen": 127924784, + "step": 59270 + }, + { + "epoch": 9.669657422512234, + "grad_norm": 0.46648120880126953, + "learning_rate": 1.6618848169514533e-07, + "loss": 0.1094, + "num_input_tokens_seen": 127934800, + "step": 59275 + }, + { + "epoch": 9.67047308319739, + "grad_norm": 0.524192214012146, + "learning_rate": 1.6537011951578974e-07, + "loss": 0.1415, + "num_input_tokens_seen": 127945584, + "step": 59280 + }, + { + "epoch": 9.671288743882545, + "grad_norm": 0.9201868176460266, + "learning_rate": 1.645537705575406e-07, + "loss": 0.1896, + "num_input_tokens_seen": 127956272, + "step": 59285 + }, + { + "epoch": 9.6721044045677, + "grad_norm": 0.5749293565750122, + "learning_rate": 1.6373943488657562e-07, + "loss": 0.0879, + "num_input_tokens_seen": 127966672, + "step": 59290 + }, + { + "epoch": 9.672920065252855, + "grad_norm": 2.4099080562591553, + "learning_rate": 1.6292711256891134e-07, + "loss": 0.3464, + "num_input_tokens_seen": 127978032, + "step": 59295 + }, + { + "epoch": 9.673735725938009, + "grad_norm": 0.3012791872024536, + "learning_rate": 1.6211680367039793e-07, + "loss": 0.113, + "num_input_tokens_seen": 127989072, + "step": 59300 + }, + { + "epoch": 9.674551386623165, + "grad_norm": 0.07914276421070099, + "learning_rate": 1.6130850825672173e-07, + "loss": 0.0123, + "num_input_tokens_seen": 127999568, + "step": 59305 + }, + { + "epoch": 9.67536704730832, + "grad_norm": 0.6962605714797974, + "learning_rate": 1.6050222639340807e-07, + "loss": 0.0903, + "num_input_tokens_seen": 128010256, + "step": 59310 + }, + { + "epoch": 9.676182707993474, + "grad_norm": 0.08777402341365814, + "learning_rate": 1.5969795814581856e-07, + "loss": 0.0363, + "num_input_tokens_seen": 128020176, + "step": 59315 + }, + { + "epoch": 9.67699836867863, + "grad_norm": 0.1660272479057312, + "learning_rate": 1.5889570357915108e-07, + "loss": 0.1085, + "num_input_tokens_seen": 128032496, + "step": 59320 + }, + { + "epoch": 9.677814029363784, + "grad_norm": 0.09718434512615204, + "learning_rate": 1.5809546275843968e-07, + "loss": 0.027, + "num_input_tokens_seen": 128043632, + "step": 59325 + }, + { + "epoch": 9.67862969004894, + "grad_norm": 0.04072815179824829, + "learning_rate": 1.572972357485575e-07, + "loss": 0.1735, + "num_input_tokens_seen": 128054608, + "step": 59330 + }, + { + "epoch": 9.679445350734095, + "grad_norm": 0.1613386571407318, + "learning_rate": 1.5650102261421107e-07, + "loss": 0.0332, + "num_input_tokens_seen": 128065360, + "step": 59335 + }, + { + "epoch": 9.68026101141925, + "grad_norm": 0.11803191900253296, + "learning_rate": 1.557068234199488e-07, + "loss": 0.0662, + "num_input_tokens_seen": 128076464, + "step": 59340 + }, + { + "epoch": 9.681076672104405, + "grad_norm": 0.41114792227745056, + "learning_rate": 1.5491463823014697e-07, + "loss": 0.0412, + "num_input_tokens_seen": 128086320, + "step": 59345 + }, + { + "epoch": 9.681892332789559, + "grad_norm": 0.1449984908103943, + "learning_rate": 1.5412446710902917e-07, + "loss": 0.0428, + "num_input_tokens_seen": 128096208, + "step": 59350 + }, + { + "epoch": 9.682707993474715, + "grad_norm": 0.7733399868011475, + "learning_rate": 1.5333631012064698e-07, + "loss": 0.1686, + "num_input_tokens_seen": 128107376, + "step": 59355 + }, + { + "epoch": 9.68352365415987, + "grad_norm": 0.1595248281955719, + "learning_rate": 1.5255016732889648e-07, + "loss": 0.0462, + "num_input_tokens_seen": 128118800, + "step": 59360 + }, + { + "epoch": 9.684339314845024, + "grad_norm": 1.6142711639404297, + "learning_rate": 1.5176603879750173e-07, + "loss": 0.2566, + "num_input_tokens_seen": 128131152, + "step": 59365 + }, + { + "epoch": 9.68515497553018, + "grad_norm": 0.2557036280632019, + "learning_rate": 1.509839245900313e-07, + "loss": 0.0659, + "num_input_tokens_seen": 128142480, + "step": 59370 + }, + { + "epoch": 9.685970636215334, + "grad_norm": 0.6258809566497803, + "learning_rate": 1.5020382476988726e-07, + "loss": 0.0526, + "num_input_tokens_seen": 128152816, + "step": 59375 + }, + { + "epoch": 9.68678629690049, + "grad_norm": 1.317581295967102, + "learning_rate": 1.4942573940030791e-07, + "loss": 0.0451, + "num_input_tokens_seen": 128163632, + "step": 59380 + }, + { + "epoch": 9.687601957585644, + "grad_norm": 0.11481855064630508, + "learning_rate": 1.4864966854437056e-07, + "loss": 0.0149, + "num_input_tokens_seen": 128174640, + "step": 59385 + }, + { + "epoch": 9.6884176182708, + "grad_norm": 0.6520689129829407, + "learning_rate": 1.4787561226498048e-07, + "loss": 0.1712, + "num_input_tokens_seen": 128184144, + "step": 59390 + }, + { + "epoch": 9.689233278955955, + "grad_norm": 0.05219221115112305, + "learning_rate": 1.4710357062489577e-07, + "loss": 0.0784, + "num_input_tokens_seen": 128192976, + "step": 59395 + }, + { + "epoch": 9.690048939641109, + "grad_norm": 0.032202091068029404, + "learning_rate": 1.4633354368669694e-07, + "loss": 0.0963, + "num_input_tokens_seen": 128204144, + "step": 59400 + }, + { + "epoch": 9.690864600326265, + "grad_norm": 0.995476245880127, + "learning_rate": 1.4556553151280628e-07, + "loss": 0.0907, + "num_input_tokens_seen": 128215600, + "step": 59405 + }, + { + "epoch": 9.691680261011419, + "grad_norm": 0.21344399452209473, + "learning_rate": 1.447995341654851e-07, + "loss": 0.1405, + "num_input_tokens_seen": 128226672, + "step": 59410 + }, + { + "epoch": 9.692495921696574, + "grad_norm": 1.8088786602020264, + "learning_rate": 1.4403555170682816e-07, + "loss": 0.1584, + "num_input_tokens_seen": 128237936, + "step": 59415 + }, + { + "epoch": 9.69331158238173, + "grad_norm": 1.0639861822128296, + "learning_rate": 1.4327358419876646e-07, + "loss": 0.0688, + "num_input_tokens_seen": 128248688, + "step": 59420 + }, + { + "epoch": 9.694127243066884, + "grad_norm": 0.8008614182472229, + "learning_rate": 1.4251363170307008e-07, + "loss": 0.0899, + "num_input_tokens_seen": 128259440, + "step": 59425 + }, + { + "epoch": 9.69494290375204, + "grad_norm": 0.1272534728050232, + "learning_rate": 1.4175569428134527e-07, + "loss": 0.0848, + "num_input_tokens_seen": 128268880, + "step": 59430 + }, + { + "epoch": 9.695758564437194, + "grad_norm": 0.07721015810966492, + "learning_rate": 1.4099977199503178e-07, + "loss": 0.0242, + "num_input_tokens_seen": 128277744, + "step": 59435 + }, + { + "epoch": 9.69657422512235, + "grad_norm": 0.126919686794281, + "learning_rate": 1.4024586490540837e-07, + "loss": 0.0973, + "num_input_tokens_seen": 128288176, + "step": 59440 + }, + { + "epoch": 9.697389885807503, + "grad_norm": 0.9800803661346436, + "learning_rate": 1.3949397307359557e-07, + "loss": 0.0417, + "num_input_tokens_seen": 128298672, + "step": 59445 + }, + { + "epoch": 9.698205546492659, + "grad_norm": 0.07782689481973648, + "learning_rate": 1.3874409656054189e-07, + "loss": 0.0405, + "num_input_tokens_seen": 128309264, + "step": 59450 + }, + { + "epoch": 9.699021207177815, + "grad_norm": 2.0487890243530273, + "learning_rate": 1.3799623542703478e-07, + "loss": 0.214, + "num_input_tokens_seen": 128320656, + "step": 59455 + }, + { + "epoch": 9.699836867862969, + "grad_norm": 0.48275241255760193, + "learning_rate": 1.3725038973370076e-07, + "loss": 0.0535, + "num_input_tokens_seen": 128332016, + "step": 59460 + }, + { + "epoch": 9.700652528548124, + "grad_norm": 2.106083631515503, + "learning_rate": 1.3650655954100532e-07, + "loss": 0.0851, + "num_input_tokens_seen": 128342448, + "step": 59465 + }, + { + "epoch": 9.701468189233278, + "grad_norm": 0.5959587693214417, + "learning_rate": 1.3576474490924195e-07, + "loss": 0.0895, + "num_input_tokens_seen": 128352368, + "step": 59470 + }, + { + "epoch": 9.702283849918434, + "grad_norm": 1.118240475654602, + "learning_rate": 1.3502494589855142e-07, + "loss": 0.3368, + "num_input_tokens_seen": 128361488, + "step": 59475 + }, + { + "epoch": 9.70309951060359, + "grad_norm": 0.09370999038219452, + "learning_rate": 1.3428716256889962e-07, + "loss": 0.1778, + "num_input_tokens_seen": 128371632, + "step": 59480 + }, + { + "epoch": 9.703915171288743, + "grad_norm": 0.3574564456939697, + "learning_rate": 1.3355139498009706e-07, + "loss": 0.237, + "num_input_tokens_seen": 128380688, + "step": 59485 + }, + { + "epoch": 9.7047308319739, + "grad_norm": 0.18019089102745056, + "learning_rate": 1.3281764319179046e-07, + "loss": 0.1836, + "num_input_tokens_seen": 128390480, + "step": 59490 + }, + { + "epoch": 9.705546492659053, + "grad_norm": 0.4371049404144287, + "learning_rate": 1.320859072634628e-07, + "loss": 0.2205, + "num_input_tokens_seen": 128401712, + "step": 59495 + }, + { + "epoch": 9.706362153344209, + "grad_norm": 0.044553689658641815, + "learning_rate": 1.3135618725442778e-07, + "loss": 0.0617, + "num_input_tokens_seen": 128413040, + "step": 59500 + }, + { + "epoch": 9.707177814029365, + "grad_norm": 0.13505397737026215, + "learning_rate": 1.3062848322384357e-07, + "loss": 0.1367, + "num_input_tokens_seen": 128421744, + "step": 59505 + }, + { + "epoch": 9.707993474714518, + "grad_norm": 0.539421021938324, + "learning_rate": 1.2990279523069916e-07, + "loss": 0.1696, + "num_input_tokens_seen": 128432528, + "step": 59510 + }, + { + "epoch": 9.708809135399674, + "grad_norm": 0.05682966858148575, + "learning_rate": 1.291791233338252e-07, + "loss": 0.0414, + "num_input_tokens_seen": 128443664, + "step": 59515 + }, + { + "epoch": 9.709624796084828, + "grad_norm": 0.03099283203482628, + "learning_rate": 1.2845746759188314e-07, + "loss": 0.0362, + "num_input_tokens_seen": 128454256, + "step": 59520 + }, + { + "epoch": 9.710440456769984, + "grad_norm": 0.08129376173019409, + "learning_rate": 1.277378280633762e-07, + "loss": 0.0419, + "num_input_tokens_seen": 128466768, + "step": 59525 + }, + { + "epoch": 9.71125611745514, + "grad_norm": 0.5030641555786133, + "learning_rate": 1.27020204806641e-07, + "loss": 0.1249, + "num_input_tokens_seen": 128477264, + "step": 59530 + }, + { + "epoch": 9.712071778140293, + "grad_norm": 0.9552064538002014, + "learning_rate": 1.2630459787985326e-07, + "loss": 0.1826, + "num_input_tokens_seen": 128489040, + "step": 59535 + }, + { + "epoch": 9.71288743882545, + "grad_norm": 0.17080475389957428, + "learning_rate": 1.2559100734102214e-07, + "loss": 0.033, + "num_input_tokens_seen": 128500176, + "step": 59540 + }, + { + "epoch": 9.713703099510603, + "grad_norm": 0.29954636096954346, + "learning_rate": 1.248794332479958e-07, + "loss": 0.1749, + "num_input_tokens_seen": 128510096, + "step": 59545 + }, + { + "epoch": 9.714518760195759, + "grad_norm": 0.5838660001754761, + "learning_rate": 1.2416987565845861e-07, + "loss": 0.0897, + "num_input_tokens_seen": 128520976, + "step": 59550 + }, + { + "epoch": 9.715334420880914, + "grad_norm": 0.12597966194152832, + "learning_rate": 1.2346233462992852e-07, + "loss": 0.1457, + "num_input_tokens_seen": 128532080, + "step": 59555 + }, + { + "epoch": 9.716150081566068, + "grad_norm": 1.0820947885513306, + "learning_rate": 1.2275681021976515e-07, + "loss": 0.1816, + "num_input_tokens_seen": 128544176, + "step": 59560 + }, + { + "epoch": 9.716965742251224, + "grad_norm": 0.2494688630104065, + "learning_rate": 1.220533024851589e-07, + "loss": 0.117, + "num_input_tokens_seen": 128555472, + "step": 59565 + }, + { + "epoch": 9.717781402936378, + "grad_norm": 0.36687996983528137, + "learning_rate": 1.213518114831419e-07, + "loss": 0.0581, + "num_input_tokens_seen": 128565968, + "step": 59570 + }, + { + "epoch": 9.718597063621534, + "grad_norm": 1.3643035888671875, + "learning_rate": 1.206523372705798e-07, + "loss": 0.3137, + "num_input_tokens_seen": 128576528, + "step": 59575 + }, + { + "epoch": 9.719412724306688, + "grad_norm": 0.7748696804046631, + "learning_rate": 1.199548799041772e-07, + "loss": 0.0681, + "num_input_tokens_seen": 128586928, + "step": 59580 + }, + { + "epoch": 9.720228384991843, + "grad_norm": 1.78489089012146, + "learning_rate": 1.1925943944047225e-07, + "loss": 0.1186, + "num_input_tokens_seen": 128599280, + "step": 59585 + }, + { + "epoch": 9.721044045676999, + "grad_norm": 0.138916015625, + "learning_rate": 1.1856601593583928e-07, + "loss": 0.0406, + "num_input_tokens_seen": 128609424, + "step": 59590 + }, + { + "epoch": 9.721859706362153, + "grad_norm": 0.2008531093597412, + "learning_rate": 1.1787460944649443e-07, + "loss": 0.0199, + "num_input_tokens_seen": 128620368, + "step": 59595 + }, + { + "epoch": 9.722675367047309, + "grad_norm": 0.36602115631103516, + "learning_rate": 1.1718522002848175e-07, + "loss": 0.1002, + "num_input_tokens_seen": 128631792, + "step": 59600 + }, + { + "epoch": 9.723491027732463, + "grad_norm": 0.28681400418281555, + "learning_rate": 1.164978477376899e-07, + "loss": 0.0896, + "num_input_tokens_seen": 128641584, + "step": 59605 + }, + { + "epoch": 9.724306688417618, + "grad_norm": 0.1014384999871254, + "learning_rate": 1.1581249262984096e-07, + "loss": 0.3105, + "num_input_tokens_seen": 128651824, + "step": 59610 + }, + { + "epoch": 9.725122349102774, + "grad_norm": 0.37503311038017273, + "learning_rate": 1.1512915476049325e-07, + "loss": 0.1924, + "num_input_tokens_seen": 128661616, + "step": 59615 + }, + { + "epoch": 9.725938009787928, + "grad_norm": 0.4460737407207489, + "learning_rate": 1.1444783418503857e-07, + "loss": 0.0472, + "num_input_tokens_seen": 128671952, + "step": 59620 + }, + { + "epoch": 9.726753670473084, + "grad_norm": 2.3387975692749023, + "learning_rate": 1.1376853095871332e-07, + "loss": 0.1002, + "num_input_tokens_seen": 128682928, + "step": 59625 + }, + { + "epoch": 9.727569331158238, + "grad_norm": 1.1626105308532715, + "learning_rate": 1.1309124513657899e-07, + "loss": 0.2454, + "num_input_tokens_seen": 128692592, + "step": 59630 + }, + { + "epoch": 9.728384991843393, + "grad_norm": 0.8823956847190857, + "learning_rate": 1.124159767735472e-07, + "loss": 0.0378, + "num_input_tokens_seen": 128703280, + "step": 59635 + }, + { + "epoch": 9.729200652528547, + "grad_norm": 0.19627554714679718, + "learning_rate": 1.1174272592435197e-07, + "loss": 0.0133, + "num_input_tokens_seen": 128713904, + "step": 59640 + }, + { + "epoch": 9.730016313213703, + "grad_norm": 0.5694146156311035, + "learning_rate": 1.1107149264357186e-07, + "loss": 0.0678, + "num_input_tokens_seen": 128723696, + "step": 59645 + }, + { + "epoch": 9.730831973898859, + "grad_norm": 1.5079737901687622, + "learning_rate": 1.1040227698562445e-07, + "loss": 0.1289, + "num_input_tokens_seen": 128735120, + "step": 59650 + }, + { + "epoch": 9.731647634584013, + "grad_norm": 0.061481624841690063, + "learning_rate": 1.0973507900475521e-07, + "loss": 0.1309, + "num_input_tokens_seen": 128745232, + "step": 59655 + }, + { + "epoch": 9.732463295269168, + "grad_norm": 0.843548059463501, + "learning_rate": 1.0906989875505425e-07, + "loss": 0.0698, + "num_input_tokens_seen": 128755856, + "step": 59660 + }, + { + "epoch": 9.733278955954322, + "grad_norm": 0.48104748129844666, + "learning_rate": 1.0840673629044228e-07, + "loss": 0.0366, + "num_input_tokens_seen": 128766768, + "step": 59665 + }, + { + "epoch": 9.734094616639478, + "grad_norm": 2.133633852005005, + "learning_rate": 1.0774559166467912e-07, + "loss": 0.092, + "num_input_tokens_seen": 128778448, + "step": 59670 + }, + { + "epoch": 9.734910277324634, + "grad_norm": 0.5782970786094666, + "learning_rate": 1.0708646493135799e-07, + "loss": 0.1383, + "num_input_tokens_seen": 128788176, + "step": 59675 + }, + { + "epoch": 9.735725938009788, + "grad_norm": 0.7257633805274963, + "learning_rate": 1.0642935614391392e-07, + "loss": 0.0687, + "num_input_tokens_seen": 128799280, + "step": 59680 + }, + { + "epoch": 9.736541598694943, + "grad_norm": 0.07894296199083328, + "learning_rate": 1.0577426535561541e-07, + "loss": 0.0256, + "num_input_tokens_seen": 128808752, + "step": 59685 + }, + { + "epoch": 9.737357259380097, + "grad_norm": 0.4926058053970337, + "learning_rate": 1.0512119261956999e-07, + "loss": 0.0288, + "num_input_tokens_seen": 128819728, + "step": 59690 + }, + { + "epoch": 9.738172920065253, + "grad_norm": 0.9913473129272461, + "learning_rate": 1.0447013798871308e-07, + "loss": 0.1025, + "num_input_tokens_seen": 128831440, + "step": 59695 + }, + { + "epoch": 9.738988580750409, + "grad_norm": 0.9411574006080627, + "learning_rate": 1.0382110151582469e-07, + "loss": 0.0607, + "num_input_tokens_seen": 128841200, + "step": 59700 + }, + { + "epoch": 9.739804241435563, + "grad_norm": 0.0331546925008297, + "learning_rate": 1.0317408325352107e-07, + "loss": 0.0389, + "num_input_tokens_seen": 128852112, + "step": 59705 + }, + { + "epoch": 9.740619902120718, + "grad_norm": 0.8599108457565308, + "learning_rate": 1.0252908325425192e-07, + "loss": 0.0407, + "num_input_tokens_seen": 128864592, + "step": 59710 + }, + { + "epoch": 9.741435562805872, + "grad_norm": 0.019739892333745956, + "learning_rate": 1.018861015703032e-07, + "loss": 0.0333, + "num_input_tokens_seen": 128876272, + "step": 59715 + }, + { + "epoch": 9.742251223491028, + "grad_norm": 0.8159624338150024, + "learning_rate": 1.0124513825379989e-07, + "loss": 0.058, + "num_input_tokens_seen": 128887056, + "step": 59720 + }, + { + "epoch": 9.743066884176184, + "grad_norm": 1.693829894065857, + "learning_rate": 1.0060619335669764e-07, + "loss": 0.2153, + "num_input_tokens_seen": 128898128, + "step": 59725 + }, + { + "epoch": 9.743882544861338, + "grad_norm": 0.34195223450660706, + "learning_rate": 9.996926693079945e-08, + "loss": 0.0536, + "num_input_tokens_seen": 128909040, + "step": 59730 + }, + { + "epoch": 9.744698205546493, + "grad_norm": 0.04717979207634926, + "learning_rate": 9.93343590277307e-08, + "loss": 0.162, + "num_input_tokens_seen": 128918288, + "step": 59735 + }, + { + "epoch": 9.745513866231647, + "grad_norm": 2.3477535247802734, + "learning_rate": 9.870146969896688e-08, + "loss": 0.1729, + "num_input_tokens_seen": 128929584, + "step": 59740 + }, + { + "epoch": 9.746329526916803, + "grad_norm": 0.25960585474967957, + "learning_rate": 9.807059899580861e-08, + "loss": 0.0624, + "num_input_tokens_seen": 128940208, + "step": 59745 + }, + { + "epoch": 9.747145187601957, + "grad_norm": 1.0273243188858032, + "learning_rate": 9.744174696939834e-08, + "loss": 0.218, + "num_input_tokens_seen": 128950224, + "step": 59750 + }, + { + "epoch": 9.747960848287113, + "grad_norm": 0.6692569255828857, + "learning_rate": 9.681491367071193e-08, + "loss": 0.044, + "num_input_tokens_seen": 128961072, + "step": 59755 + }, + { + "epoch": 9.748776508972268, + "grad_norm": 0.35715457797050476, + "learning_rate": 9.619009915056987e-08, + "loss": 0.0436, + "num_input_tokens_seen": 128972528, + "step": 59760 + }, + { + "epoch": 9.749592169657422, + "grad_norm": 0.07282152026891708, + "learning_rate": 9.556730345961773e-08, + "loss": 0.2753, + "num_input_tokens_seen": 128982480, + "step": 59765 + }, + { + "epoch": 9.750407830342578, + "grad_norm": 2.4533398151397705, + "learning_rate": 9.494652664834292e-08, + "loss": 0.1298, + "num_input_tokens_seen": 128992304, + "step": 59770 + }, + { + "epoch": 9.751223491027732, + "grad_norm": 1.79752779006958, + "learning_rate": 9.432776876707183e-08, + "loss": 0.2198, + "num_input_tokens_seen": 129003344, + "step": 59775 + }, + { + "epoch": 9.752039151712887, + "grad_norm": 1.0798481702804565, + "learning_rate": 9.371102986595881e-08, + "loss": 0.0895, + "num_input_tokens_seen": 129014224, + "step": 59780 + }, + { + "epoch": 9.752854812398043, + "grad_norm": 0.2878611981868744, + "learning_rate": 9.309630999500551e-08, + "loss": 0.1241, + "num_input_tokens_seen": 129024016, + "step": 59785 + }, + { + "epoch": 9.753670473083197, + "grad_norm": 0.6028711795806885, + "learning_rate": 9.248360920404154e-08, + "loss": 0.045, + "num_input_tokens_seen": 129033840, + "step": 59790 + }, + { + "epoch": 9.754486133768353, + "grad_norm": 0.398578405380249, + "learning_rate": 9.187292754273269e-08, + "loss": 0.0901, + "num_input_tokens_seen": 129044848, + "step": 59795 + }, + { + "epoch": 9.755301794453507, + "grad_norm": 0.6703835129737854, + "learning_rate": 9.126426506058938e-08, + "loss": 0.0772, + "num_input_tokens_seen": 129055664, + "step": 59800 + }, + { + "epoch": 9.756117455138662, + "grad_norm": 0.07417997717857361, + "learning_rate": 9.06576218069527e-08, + "loss": 0.2883, + "num_input_tokens_seen": 129066288, + "step": 59805 + }, + { + "epoch": 9.756933115823816, + "grad_norm": 0.24396660923957825, + "learning_rate": 9.005299783099441e-08, + "loss": 0.1005, + "num_input_tokens_seen": 129076016, + "step": 59810 + }, + { + "epoch": 9.757748776508972, + "grad_norm": 0.11312716454267502, + "learning_rate": 8.945039318173365e-08, + "loss": 0.0111, + "num_input_tokens_seen": 129086704, + "step": 59815 + }, + { + "epoch": 9.758564437194128, + "grad_norm": 0.06790996342897415, + "learning_rate": 8.884980790801745e-08, + "loss": 0.0864, + "num_input_tokens_seen": 129096560, + "step": 59820 + }, + { + "epoch": 9.759380097879282, + "grad_norm": 1.294219970703125, + "learning_rate": 8.825124205853463e-08, + "loss": 0.1123, + "num_input_tokens_seen": 129107984, + "step": 59825 + }, + { + "epoch": 9.760195758564437, + "grad_norm": 0.09908480942249298, + "learning_rate": 8.76546956818075e-08, + "loss": 0.1063, + "num_input_tokens_seen": 129118352, + "step": 59830 + }, + { + "epoch": 9.761011419249591, + "grad_norm": 0.5089851021766663, + "learning_rate": 8.706016882619461e-08, + "loss": 0.1595, + "num_input_tokens_seen": 129129072, + "step": 59835 + }, + { + "epoch": 9.761827079934747, + "grad_norm": 0.40677154064178467, + "learning_rate": 8.646766153989072e-08, + "loss": 0.1226, + "num_input_tokens_seen": 129140496, + "step": 59840 + }, + { + "epoch": 9.762642740619903, + "grad_norm": 0.2535157799720764, + "learning_rate": 8.587717387092686e-08, + "loss": 0.039, + "num_input_tokens_seen": 129151952, + "step": 59845 + }, + { + "epoch": 9.763458401305057, + "grad_norm": 2.498164415359497, + "learning_rate": 8.528870586717308e-08, + "loss": 0.2538, + "num_input_tokens_seen": 129165008, + "step": 59850 + }, + { + "epoch": 9.764274061990212, + "grad_norm": 1.499139666557312, + "learning_rate": 8.470225757633565e-08, + "loss": 0.2471, + "num_input_tokens_seen": 129176272, + "step": 59855 + }, + { + "epoch": 9.765089722675366, + "grad_norm": 0.22403894364833832, + "learning_rate": 8.411782904594879e-08, + "loss": 0.0819, + "num_input_tokens_seen": 129187504, + "step": 59860 + }, + { + "epoch": 9.765905383360522, + "grad_norm": 0.07421083748340607, + "learning_rate": 8.3535420323394e-08, + "loss": 0.0925, + "num_input_tokens_seen": 129198928, + "step": 59865 + }, + { + "epoch": 9.766721044045678, + "grad_norm": 0.879295289516449, + "learning_rate": 8.295503145588357e-08, + "loss": 0.0428, + "num_input_tokens_seen": 129211024, + "step": 59870 + }, + { + "epoch": 9.767536704730832, + "grad_norm": 0.6952290534973145, + "learning_rate": 8.237666249046593e-08, + "loss": 0.0776, + "num_input_tokens_seen": 129222096, + "step": 59875 + }, + { + "epoch": 9.768352365415987, + "grad_norm": 1.6058207750320435, + "learning_rate": 8.180031347402583e-08, + "loss": 0.2277, + "num_input_tokens_seen": 129232880, + "step": 59880 + }, + { + "epoch": 9.769168026101141, + "grad_norm": 0.2863946557044983, + "learning_rate": 8.122598445328699e-08, + "loss": 0.0413, + "num_input_tokens_seen": 129244560, + "step": 59885 + }, + { + "epoch": 9.769983686786297, + "grad_norm": 0.9035354256629944, + "learning_rate": 8.065367547480384e-08, + "loss": 0.0959, + "num_input_tokens_seen": 129255824, + "step": 59890 + }, + { + "epoch": 9.770799347471453, + "grad_norm": 0.537465512752533, + "learning_rate": 8.008338658497538e-08, + "loss": 0.1468, + "num_input_tokens_seen": 129267248, + "step": 59895 + }, + { + "epoch": 9.771615008156607, + "grad_norm": 0.943017303943634, + "learning_rate": 7.95151178300313e-08, + "loss": 0.0799, + "num_input_tokens_seen": 129277616, + "step": 59900 + }, + { + "epoch": 9.772430668841762, + "grad_norm": 0.9513600468635559, + "learning_rate": 7.894886925603473e-08, + "loss": 0.0853, + "num_input_tokens_seen": 129288112, + "step": 59905 + }, + { + "epoch": 9.773246329526916, + "grad_norm": 0.3104327917098999, + "learning_rate": 7.838464090889342e-08, + "loss": 0.2236, + "num_input_tokens_seen": 129298064, + "step": 59910 + }, + { + "epoch": 9.774061990212072, + "grad_norm": 0.16820557415485382, + "learning_rate": 7.782243283434299e-08, + "loss": 0.011, + "num_input_tokens_seen": 129307408, + "step": 59915 + }, + { + "epoch": 9.774877650897226, + "grad_norm": 1.1962965726852417, + "learning_rate": 7.726224507795809e-08, + "loss": 0.1514, + "num_input_tokens_seen": 129318128, + "step": 59920 + }, + { + "epoch": 9.775693311582382, + "grad_norm": 0.20740889012813568, + "learning_rate": 7.67040776851552e-08, + "loss": 0.0648, + "num_input_tokens_seen": 129328080, + "step": 59925 + }, + { + "epoch": 9.776508972267537, + "grad_norm": 0.6688018441200256, + "learning_rate": 7.614793070117865e-08, + "loss": 0.1114, + "num_input_tokens_seen": 129339120, + "step": 59930 + }, + { + "epoch": 9.777324632952691, + "grad_norm": 1.3553582429885864, + "learning_rate": 7.559380417111184e-08, + "loss": 0.2123, + "num_input_tokens_seen": 129349648, + "step": 59935 + }, + { + "epoch": 9.778140293637847, + "grad_norm": 0.9414141178131104, + "learning_rate": 7.504169813987716e-08, + "loss": 0.1455, + "num_input_tokens_seen": 129360400, + "step": 59940 + }, + { + "epoch": 9.778955954323001, + "grad_norm": 1.8336820602416992, + "learning_rate": 7.449161265223048e-08, + "loss": 0.1972, + "num_input_tokens_seen": 129373168, + "step": 59945 + }, + { + "epoch": 9.779771615008157, + "grad_norm": 0.42128685116767883, + "learning_rate": 7.394354775276391e-08, + "loss": 0.056, + "num_input_tokens_seen": 129382736, + "step": 59950 + }, + { + "epoch": 9.780587275693312, + "grad_norm": 0.15918558835983276, + "learning_rate": 7.339750348590857e-08, + "loss": 0.1561, + "num_input_tokens_seen": 129394160, + "step": 59955 + }, + { + "epoch": 9.781402936378466, + "grad_norm": 0.1330304741859436, + "learning_rate": 7.285347989592628e-08, + "loss": 0.0659, + "num_input_tokens_seen": 129405744, + "step": 59960 + }, + { + "epoch": 9.782218597063622, + "grad_norm": 0.7260505557060242, + "learning_rate": 7.231147702692065e-08, + "loss": 0.0495, + "num_input_tokens_seen": 129416368, + "step": 59965 + }, + { + "epoch": 9.783034257748776, + "grad_norm": 0.1715434044599533, + "learning_rate": 7.177149492282876e-08, + "loss": 0.0279, + "num_input_tokens_seen": 129426928, + "step": 59970 + }, + { + "epoch": 9.783849918433932, + "grad_norm": 1.675563931465149, + "learning_rate": 7.123353362742391e-08, + "loss": 0.2019, + "num_input_tokens_seen": 129438896, + "step": 59975 + }, + { + "epoch": 9.784665579119086, + "grad_norm": 0.030608616769313812, + "learning_rate": 7.069759318431567e-08, + "loss": 0.1134, + "num_input_tokens_seen": 129449808, + "step": 59980 + }, + { + "epoch": 9.785481239804241, + "grad_norm": 0.18065159022808075, + "learning_rate": 7.016367363694986e-08, + "loss": 0.1029, + "num_input_tokens_seen": 129461232, + "step": 59985 + }, + { + "epoch": 9.786296900489397, + "grad_norm": 1.2773264646530151, + "learning_rate": 6.963177502861129e-08, + "loss": 0.234, + "num_input_tokens_seen": 129471984, + "step": 59990 + }, + { + "epoch": 9.78711256117455, + "grad_norm": 0.36455103754997253, + "learning_rate": 6.910189740241269e-08, + "loss": 0.0346, + "num_input_tokens_seen": 129483088, + "step": 59995 + }, + { + "epoch": 9.787928221859707, + "grad_norm": 0.328251451253891, + "learning_rate": 6.857404080131691e-08, + "loss": 0.0669, + "num_input_tokens_seen": 129492144, + "step": 60000 + }, + { + "epoch": 9.78874388254486, + "grad_norm": 0.041986458003520966, + "learning_rate": 6.804820526810917e-08, + "loss": 0.0939, + "num_input_tokens_seen": 129502128, + "step": 60005 + }, + { + "epoch": 9.789559543230016, + "grad_norm": 0.43750736117362976, + "learning_rate": 6.75243908454165e-08, + "loss": 0.1479, + "num_input_tokens_seen": 129512848, + "step": 60010 + }, + { + "epoch": 9.790375203915172, + "grad_norm": 0.8324832320213318, + "learning_rate": 6.700259757570216e-08, + "loss": 0.0653, + "num_input_tokens_seen": 129522992, + "step": 60015 + }, + { + "epoch": 9.791190864600326, + "grad_norm": 0.29890012741088867, + "learning_rate": 6.648282550126562e-08, + "loss": 0.066, + "num_input_tokens_seen": 129533328, + "step": 60020 + }, + { + "epoch": 9.792006525285482, + "grad_norm": 0.2892267405986786, + "learning_rate": 6.59650746642454e-08, + "loss": 0.0133, + "num_input_tokens_seen": 129542768, + "step": 60025 + }, + { + "epoch": 9.792822185970635, + "grad_norm": 1.006208062171936, + "learning_rate": 6.544934510660794e-08, + "loss": 0.2585, + "num_input_tokens_seen": 129554448, + "step": 60030 + }, + { + "epoch": 9.793637846655791, + "grad_norm": 0.4177664816379547, + "learning_rate": 6.493563687016424e-08, + "loss": 0.0612, + "num_input_tokens_seen": 129566736, + "step": 60035 + }, + { + "epoch": 9.794453507340947, + "grad_norm": 0.7075027227401733, + "learning_rate": 6.442394999655599e-08, + "loss": 0.1059, + "num_input_tokens_seen": 129577392, + "step": 60040 + }, + { + "epoch": 9.7952691680261, + "grad_norm": 1.4304773807525635, + "learning_rate": 6.391428452726389e-08, + "loss": 0.0758, + "num_input_tokens_seen": 129588080, + "step": 60045 + }, + { + "epoch": 9.796084828711257, + "grad_norm": 0.15896563231945038, + "learning_rate": 6.340664050360767e-08, + "loss": 0.0666, + "num_input_tokens_seen": 129599312, + "step": 60050 + }, + { + "epoch": 9.79690048939641, + "grad_norm": 1.8543850183486938, + "learning_rate": 6.29010179667322e-08, + "loss": 0.1934, + "num_input_tokens_seen": 129609488, + "step": 60055 + }, + { + "epoch": 9.797716150081566, + "grad_norm": 1.4763749837875366, + "learning_rate": 6.239741695763246e-08, + "loss": 0.1553, + "num_input_tokens_seen": 129620944, + "step": 60060 + }, + { + "epoch": 9.798531810766722, + "grad_norm": 0.23634538054466248, + "learning_rate": 6.189583751712857e-08, + "loss": 0.0639, + "num_input_tokens_seen": 129631920, + "step": 60065 + }, + { + "epoch": 9.799347471451876, + "grad_norm": 0.08248966187238693, + "learning_rate": 6.139627968588524e-08, + "loss": 0.0143, + "num_input_tokens_seen": 129643696, + "step": 60070 + }, + { + "epoch": 9.800163132137031, + "grad_norm": 1.15908944606781, + "learning_rate": 6.089874350439506e-08, + "loss": 0.0877, + "num_input_tokens_seen": 129655184, + "step": 60075 + }, + { + "epoch": 9.800978792822185, + "grad_norm": 0.1849144548177719, + "learning_rate": 6.040322901299245e-08, + "loss": 0.0125, + "num_input_tokens_seen": 129665008, + "step": 60080 + }, + { + "epoch": 9.801794453507341, + "grad_norm": 0.05942748859524727, + "learning_rate": 5.990973625184526e-08, + "loss": 0.0551, + "num_input_tokens_seen": 129675344, + "step": 60085 + }, + { + "epoch": 9.802610114192497, + "grad_norm": 1.3717041015625, + "learning_rate": 5.9418265260960394e-08, + "loss": 0.182, + "num_input_tokens_seen": 129687120, + "step": 60090 + }, + { + "epoch": 9.80342577487765, + "grad_norm": 0.4064551591873169, + "learning_rate": 5.892881608017819e-08, + "loss": 0.0819, + "num_input_tokens_seen": 129697936, + "step": 60095 + }, + { + "epoch": 9.804241435562806, + "grad_norm": 0.7940883636474609, + "learning_rate": 5.844138874917526e-08, + "loss": 0.0759, + "num_input_tokens_seen": 129707312, + "step": 60100 + }, + { + "epoch": 9.80505709624796, + "grad_norm": 0.9274303913116455, + "learning_rate": 5.795598330746721e-08, + "loss": 0.1133, + "num_input_tokens_seen": 129718256, + "step": 60105 + }, + { + "epoch": 9.805872756933116, + "grad_norm": 1.116366982460022, + "learning_rate": 5.747259979440034e-08, + "loss": 0.0614, + "num_input_tokens_seen": 129728336, + "step": 60110 + }, + { + "epoch": 9.80668841761827, + "grad_norm": 0.8146196007728577, + "learning_rate": 5.699123824916275e-08, + "loss": 0.1216, + "num_input_tokens_seen": 129738960, + "step": 60115 + }, + { + "epoch": 9.807504078303426, + "grad_norm": 1.3873181343078613, + "learning_rate": 5.6511898710776e-08, + "loss": 0.1955, + "num_input_tokens_seen": 129748752, + "step": 60120 + }, + { + "epoch": 9.808319738988581, + "grad_norm": 0.18303796648979187, + "learning_rate": 5.603458121809513e-08, + "loss": 0.079, + "num_input_tokens_seen": 129758864, + "step": 60125 + }, + { + "epoch": 9.809135399673735, + "grad_norm": 1.232314944267273, + "learning_rate": 5.555928580981418e-08, + "loss": 0.2414, + "num_input_tokens_seen": 129768112, + "step": 60130 + }, + { + "epoch": 9.809951060358891, + "grad_norm": 0.3858652412891388, + "learning_rate": 5.5086012524466216e-08, + "loss": 0.033, + "num_input_tokens_seen": 129779440, + "step": 60135 + }, + { + "epoch": 9.810766721044045, + "grad_norm": 0.21373096108436584, + "learning_rate": 5.4614761400414996e-08, + "loss": 0.1081, + "num_input_tokens_seen": 129790320, + "step": 60140 + }, + { + "epoch": 9.8115823817292, + "grad_norm": 0.13426658511161804, + "learning_rate": 5.414553247586329e-08, + "loss": 0.1104, + "num_input_tokens_seen": 129799792, + "step": 60145 + }, + { + "epoch": 9.812398042414356, + "grad_norm": 0.8965666890144348, + "learning_rate": 5.367832578884735e-08, + "loss": 0.2163, + "num_input_tokens_seen": 129811472, + "step": 60150 + }, + { + "epoch": 9.81321370309951, + "grad_norm": 0.10823033004999161, + "learning_rate": 5.3213141377245205e-08, + "loss": 0.0505, + "num_input_tokens_seen": 129822512, + "step": 60155 + }, + { + "epoch": 9.814029363784666, + "grad_norm": 0.6866380572319031, + "learning_rate": 5.2749979278762794e-08, + "loss": 0.0668, + "num_input_tokens_seen": 129834608, + "step": 60160 + }, + { + "epoch": 9.81484502446982, + "grad_norm": 0.7314977049827576, + "learning_rate": 5.228883953094788e-08, + "loss": 0.173, + "num_input_tokens_seen": 129845008, + "step": 60165 + }, + { + "epoch": 9.815660685154976, + "grad_norm": 1.8029122352600098, + "learning_rate": 5.182972217118165e-08, + "loss": 0.1316, + "num_input_tokens_seen": 129854928, + "step": 60170 + }, + { + "epoch": 9.81647634584013, + "grad_norm": 0.45456480979919434, + "learning_rate": 5.137262723668712e-08, + "loss": 0.0904, + "num_input_tokens_seen": 129865904, + "step": 60175 + }, + { + "epoch": 9.817292006525285, + "grad_norm": 0.13055209815502167, + "learning_rate": 5.0917554764515206e-08, + "loss": 0.1509, + "num_input_tokens_seen": 129878288, + "step": 60180 + }, + { + "epoch": 9.818107667210441, + "grad_norm": 0.04333345964550972, + "learning_rate": 5.0464504791553066e-08, + "loss": 0.1007, + "num_input_tokens_seen": 129890512, + "step": 60185 + }, + { + "epoch": 9.818923327895595, + "grad_norm": 0.3097197115421295, + "learning_rate": 5.001347735453521e-08, + "loss": 0.1554, + "num_input_tokens_seen": 129900560, + "step": 60190 + }, + { + "epoch": 9.81973898858075, + "grad_norm": 0.07732637226581573, + "learning_rate": 4.95644724900185e-08, + "loss": 0.0189, + "num_input_tokens_seen": 129912272, + "step": 60195 + }, + { + "epoch": 9.820554649265905, + "grad_norm": 0.09843353927135468, + "learning_rate": 4.91174902344016e-08, + "loss": 0.1328, + "num_input_tokens_seen": 129923120, + "step": 60200 + }, + { + "epoch": 9.82137030995106, + "grad_norm": 0.11173179000616074, + "learning_rate": 4.867253062391941e-08, + "loss": 0.0679, + "num_input_tokens_seen": 129935280, + "step": 60205 + }, + { + "epoch": 9.822185970636216, + "grad_norm": 0.3161013424396515, + "learning_rate": 4.822959369464586e-08, + "loss": 0.0538, + "num_input_tokens_seen": 129946032, + "step": 60210 + }, + { + "epoch": 9.82300163132137, + "grad_norm": 0.5828753113746643, + "learning_rate": 4.7788679482485556e-08, + "loss": 0.0592, + "num_input_tokens_seen": 129956784, + "step": 60215 + }, + { + "epoch": 9.823817292006526, + "grad_norm": 0.6662574410438538, + "learning_rate": 4.734978802318213e-08, + "loss": 0.1108, + "num_input_tokens_seen": 129967504, + "step": 60220 + }, + { + "epoch": 9.82463295269168, + "grad_norm": 0.4459304213523865, + "learning_rate": 4.69129193523099e-08, + "loss": 0.1331, + "num_input_tokens_seen": 129977840, + "step": 60225 + }, + { + "epoch": 9.825448613376835, + "grad_norm": 0.24907831847667694, + "learning_rate": 4.6478073505290544e-08, + "loss": 0.065, + "num_input_tokens_seen": 129988080, + "step": 60230 + }, + { + "epoch": 9.826264274061991, + "grad_norm": 0.3414382338523865, + "learning_rate": 4.6045250517370854e-08, + "loss": 0.0278, + "num_input_tokens_seen": 129999728, + "step": 60235 + }, + { + "epoch": 9.827079934747145, + "grad_norm": 0.6336281299591064, + "learning_rate": 4.561445042363666e-08, + "loss": 0.207, + "num_input_tokens_seen": 130010352, + "step": 60240 + }, + { + "epoch": 9.8278955954323, + "grad_norm": 0.434227854013443, + "learning_rate": 4.518567325901279e-08, + "loss": 0.1713, + "num_input_tokens_seen": 130021776, + "step": 60245 + }, + { + "epoch": 9.828711256117455, + "grad_norm": 0.3436277210712433, + "learning_rate": 4.475891905825758e-08, + "loss": 0.0442, + "num_input_tokens_seen": 130032304, + "step": 60250 + }, + { + "epoch": 9.82952691680261, + "grad_norm": 0.5311912894248962, + "learning_rate": 4.4334187855968326e-08, + "loss": 0.1823, + "num_input_tokens_seen": 130041840, + "step": 60255 + }, + { + "epoch": 9.830342577487766, + "grad_norm": 0.16381004452705383, + "learning_rate": 4.391147968657028e-08, + "loss": 0.033, + "num_input_tokens_seen": 130051216, + "step": 60260 + }, + { + "epoch": 9.83115823817292, + "grad_norm": 1.5619893074035645, + "learning_rate": 4.3490794584336024e-08, + "loss": 0.1698, + "num_input_tokens_seen": 130061872, + "step": 60265 + }, + { + "epoch": 9.831973898858076, + "grad_norm": 0.12030669301748276, + "learning_rate": 4.307213258336606e-08, + "loss": 0.0277, + "num_input_tokens_seen": 130072944, + "step": 60270 + }, + { + "epoch": 9.83278955954323, + "grad_norm": 1.210178017616272, + "learning_rate": 4.2655493717597137e-08, + "loss": 0.1368, + "num_input_tokens_seen": 130082704, + "step": 60275 + }, + { + "epoch": 9.833605220228385, + "grad_norm": 0.5539122819900513, + "learning_rate": 4.224087802080778e-08, + "loss": 0.0822, + "num_input_tokens_seen": 130093936, + "step": 60280 + }, + { + "epoch": 9.83442088091354, + "grad_norm": 0.5598983764648438, + "learning_rate": 4.182828552660722e-08, + "loss": 0.1353, + "num_input_tokens_seen": 130103888, + "step": 60285 + }, + { + "epoch": 9.835236541598695, + "grad_norm": 0.08794641494750977, + "learning_rate": 4.141771626844093e-08, + "loss": 0.027, + "num_input_tokens_seen": 130114864, + "step": 60290 + }, + { + "epoch": 9.83605220228385, + "grad_norm": 0.4468124508857727, + "learning_rate": 4.100917027959617e-08, + "loss": 0.1279, + "num_input_tokens_seen": 130125136, + "step": 60295 + }, + { + "epoch": 9.836867862969005, + "grad_norm": 0.6010929346084595, + "learning_rate": 4.0602647593185325e-08, + "loss": 0.091, + "num_input_tokens_seen": 130136240, + "step": 60300 + }, + { + "epoch": 9.83768352365416, + "grad_norm": 0.17568963766098022, + "learning_rate": 4.0198148242168163e-08, + "loss": 0.2175, + "num_input_tokens_seen": 130146864, + "step": 60305 + }, + { + "epoch": 9.838499184339314, + "grad_norm": 0.26038607954978943, + "learning_rate": 3.979567225933234e-08, + "loss": 0.0447, + "num_input_tokens_seen": 130156560, + "step": 60310 + }, + { + "epoch": 9.83931484502447, + "grad_norm": 0.05728593096137047, + "learning_rate": 3.939521967730731e-08, + "loss": 0.207, + "num_input_tokens_seen": 130167664, + "step": 60315 + }, + { + "epoch": 9.840130505709626, + "grad_norm": 0.06283362209796906, + "learning_rate": 3.8996790528555985e-08, + "loss": 0.1469, + "num_input_tokens_seen": 130178992, + "step": 60320 + }, + { + "epoch": 9.84094616639478, + "grad_norm": 0.04544258490204811, + "learning_rate": 3.860038484537476e-08, + "loss": 0.1134, + "num_input_tokens_seen": 130188944, + "step": 60325 + }, + { + "epoch": 9.841761827079935, + "grad_norm": 0.4675486385822296, + "learning_rate": 3.820600265989904e-08, + "loss": 0.1129, + "num_input_tokens_seen": 130199408, + "step": 60330 + }, + { + "epoch": 9.84257748776509, + "grad_norm": 1.0837852954864502, + "learning_rate": 3.78136440040977e-08, + "loss": 0.083, + "num_input_tokens_seen": 130211152, + "step": 60335 + }, + { + "epoch": 9.843393148450245, + "grad_norm": 0.28516992926597595, + "learning_rate": 3.742330890978141e-08, + "loss": 0.1015, + "num_input_tokens_seen": 130222192, + "step": 60340 + }, + { + "epoch": 9.844208809135399, + "grad_norm": 0.4966212809085846, + "learning_rate": 3.703499740859151e-08, + "loss": 0.0694, + "num_input_tokens_seen": 130232912, + "step": 60345 + }, + { + "epoch": 9.845024469820554, + "grad_norm": 0.7214843034744263, + "learning_rate": 3.6648709532002835e-08, + "loss": 0.0736, + "num_input_tokens_seen": 130242704, + "step": 60350 + }, + { + "epoch": 9.84584013050571, + "grad_norm": 0.840776264667511, + "learning_rate": 3.6264445311334774e-08, + "loss": 0.0772, + "num_input_tokens_seen": 130254224, + "step": 60355 + }, + { + "epoch": 9.846655791190864, + "grad_norm": 1.2847782373428345, + "learning_rate": 3.588220477773463e-08, + "loss": 0.2115, + "num_input_tokens_seen": 130265392, + "step": 60360 + }, + { + "epoch": 9.84747145187602, + "grad_norm": 0.2593439817428589, + "learning_rate": 3.5501987962191505e-08, + "loss": 0.0318, + "num_input_tokens_seen": 130276048, + "step": 60365 + }, + { + "epoch": 9.848287112561174, + "grad_norm": 0.09909996390342712, + "learning_rate": 3.5123794895522425e-08, + "loss": 0.048, + "num_input_tokens_seen": 130287472, + "step": 60370 + }, + { + "epoch": 9.84910277324633, + "grad_norm": 0.8957041501998901, + "learning_rate": 3.4747625608391735e-08, + "loss": 0.0487, + "num_input_tokens_seen": 130298832, + "step": 60375 + }, + { + "epoch": 9.849918433931485, + "grad_norm": 0.5533308982849121, + "learning_rate": 3.4373480131288936e-08, + "loss": 0.0218, + "num_input_tokens_seen": 130308432, + "step": 60380 + }, + { + "epoch": 9.850734094616639, + "grad_norm": 1.845125436782837, + "learning_rate": 3.400135849454811e-08, + "loss": 0.14, + "num_input_tokens_seen": 130318544, + "step": 60385 + }, + { + "epoch": 9.851549755301795, + "grad_norm": 0.40550780296325684, + "learning_rate": 3.363126072833123e-08, + "loss": 0.2418, + "num_input_tokens_seen": 130328176, + "step": 60390 + }, + { + "epoch": 9.852365415986949, + "grad_norm": 1.922080159187317, + "learning_rate": 3.326318686264485e-08, + "loss": 0.2355, + "num_input_tokens_seen": 130338896, + "step": 60395 + }, + { + "epoch": 9.853181076672104, + "grad_norm": 0.038508299738168716, + "learning_rate": 3.2897136927323436e-08, + "loss": 0.0481, + "num_input_tokens_seen": 130350160, + "step": 60400 + }, + { + "epoch": 9.85399673735726, + "grad_norm": 0.2541324496269226, + "learning_rate": 3.253311095204048e-08, + "loss": 0.0512, + "num_input_tokens_seen": 130360976, + "step": 60405 + }, + { + "epoch": 9.854812398042414, + "grad_norm": 0.7238049507141113, + "learning_rate": 3.2171108966308486e-08, + "loss": 0.035, + "num_input_tokens_seen": 130371184, + "step": 60410 + }, + { + "epoch": 9.85562805872757, + "grad_norm": 0.6900864243507385, + "learning_rate": 3.1811130999473415e-08, + "loss": 0.0386, + "num_input_tokens_seen": 130381168, + "step": 60415 + }, + { + "epoch": 9.856443719412724, + "grad_norm": 0.8679578900337219, + "learning_rate": 3.145317708071194e-08, + "loss": 0.1047, + "num_input_tokens_seen": 130390544, + "step": 60420 + }, + { + "epoch": 9.85725938009788, + "grad_norm": 0.0705682560801506, + "learning_rate": 3.1097247239048057e-08, + "loss": 0.0309, + "num_input_tokens_seen": 130401232, + "step": 60425 + }, + { + "epoch": 9.858075040783035, + "grad_norm": 0.07589536905288696, + "learning_rate": 3.074334150333091e-08, + "loss": 0.0265, + "num_input_tokens_seen": 130412560, + "step": 60430 + }, + { + "epoch": 9.858890701468189, + "grad_norm": 0.09698832035064697, + "learning_rate": 3.039145990225145e-08, + "loss": 0.1259, + "num_input_tokens_seen": 130423472, + "step": 60435 + }, + { + "epoch": 9.859706362153345, + "grad_norm": 1.5236247777938843, + "learning_rate": 3.0041602464334076e-08, + "loss": 0.0702, + "num_input_tokens_seen": 130433008, + "step": 60440 + }, + { + "epoch": 9.860522022838499, + "grad_norm": 1.2929900884628296, + "learning_rate": 2.9693769217942203e-08, + "loss": 0.1479, + "num_input_tokens_seen": 130445776, + "step": 60445 + }, + { + "epoch": 9.861337683523654, + "grad_norm": 1.6703959703445435, + "learning_rate": 2.9347960191269952e-08, + "loss": 0.1735, + "num_input_tokens_seen": 130456720, + "step": 60450 + }, + { + "epoch": 9.86215334420881, + "grad_norm": 0.21385742723941803, + "learning_rate": 2.900417541235323e-08, + "loss": 0.0531, + "num_input_tokens_seen": 130468176, + "step": 60455 + }, + { + "epoch": 9.862969004893964, + "grad_norm": 0.07109559327363968, + "learning_rate": 2.8662414909058634e-08, + "loss": 0.0464, + "num_input_tokens_seen": 130479344, + "step": 60460 + }, + { + "epoch": 9.86378466557912, + "grad_norm": 0.2358773648738861, + "learning_rate": 2.8322678709094553e-08, + "loss": 0.0475, + "num_input_tokens_seen": 130490672, + "step": 60465 + }, + { + "epoch": 9.864600326264274, + "grad_norm": 0.0835823118686676, + "learning_rate": 2.7984966839997294e-08, + "loss": 0.0352, + "num_input_tokens_seen": 130502160, + "step": 60470 + }, + { + "epoch": 9.86541598694943, + "grad_norm": 0.19384996592998505, + "learning_rate": 2.7649279329142185e-08, + "loss": 0.0158, + "num_input_tokens_seen": 130514160, + "step": 60475 + }, + { + "epoch": 9.866231647634583, + "grad_norm": 0.03799856826663017, + "learning_rate": 2.7315616203749118e-08, + "loss": 0.1113, + "num_input_tokens_seen": 130524336, + "step": 60480 + }, + { + "epoch": 9.867047308319739, + "grad_norm": 0.08535922318696976, + "learning_rate": 2.6983977490860345e-08, + "loss": 0.1366, + "num_input_tokens_seen": 130535440, + "step": 60485 + }, + { + "epoch": 9.867862969004895, + "grad_norm": 0.09752354025840759, + "learning_rate": 2.6654363217362698e-08, + "loss": 0.1388, + "num_input_tokens_seen": 130546832, + "step": 60490 + }, + { + "epoch": 9.868678629690049, + "grad_norm": 0.4327438175678253, + "learning_rate": 2.632677340997647e-08, + "loss": 0.0536, + "num_input_tokens_seen": 130557168, + "step": 60495 + }, + { + "epoch": 9.869494290375204, + "grad_norm": 0.7532936930656433, + "learning_rate": 2.6001208095258188e-08, + "loss": 0.084, + "num_input_tokens_seen": 130567664, + "step": 60500 + }, + { + "epoch": 9.870309951060358, + "grad_norm": 1.0724714994430542, + "learning_rate": 2.5677667299597863e-08, + "loss": 0.0854, + "num_input_tokens_seen": 130577520, + "step": 60505 + }, + { + "epoch": 9.871125611745514, + "grad_norm": 0.33751508593559265, + "learning_rate": 2.5356151049221734e-08, + "loss": 0.0611, + "num_input_tokens_seen": 130589328, + "step": 60510 + }, + { + "epoch": 9.87194127243067, + "grad_norm": 1.9912911653518677, + "learning_rate": 2.5036659370197836e-08, + "loss": 0.0585, + "num_input_tokens_seen": 130599760, + "step": 60515 + }, + { + "epoch": 9.872756933115824, + "grad_norm": 0.6996232867240906, + "learning_rate": 2.4719192288424896e-08, + "loss": 0.0788, + "num_input_tokens_seen": 130610576, + "step": 60520 + }, + { + "epoch": 9.87357259380098, + "grad_norm": 1.0828760862350464, + "learning_rate": 2.440374982963789e-08, + "loss": 0.1347, + "num_input_tokens_seen": 130621168, + "step": 60525 + }, + { + "epoch": 9.874388254486133, + "grad_norm": 0.025998879224061966, + "learning_rate": 2.409033201940525e-08, + "loss": 0.0433, + "num_input_tokens_seen": 130630736, + "step": 60530 + }, + { + "epoch": 9.875203915171289, + "grad_norm": 0.6303337216377258, + "learning_rate": 2.3778938883139977e-08, + "loss": 0.1115, + "num_input_tokens_seen": 130641840, + "step": 60535 + }, + { + "epoch": 9.876019575856443, + "grad_norm": 0.03806828707456589, + "learning_rate": 2.3469570446080223e-08, + "loss": 0.0415, + "num_input_tokens_seen": 130652816, + "step": 60540 + }, + { + "epoch": 9.876835236541599, + "grad_norm": 0.34084388613700867, + "learning_rate": 2.3162226733305925e-08, + "loss": 0.0949, + "num_input_tokens_seen": 130663248, + "step": 60545 + }, + { + "epoch": 9.877650897226754, + "grad_norm": 0.10089428722858429, + "learning_rate": 2.2856907769736037e-08, + "loss": 0.0609, + "num_input_tokens_seen": 130675184, + "step": 60550 + }, + { + "epoch": 9.878466557911908, + "grad_norm": 1.2766313552856445, + "learning_rate": 2.255361358011465e-08, + "loss": 0.0713, + "num_input_tokens_seen": 130686576, + "step": 60555 + }, + { + "epoch": 9.879282218597064, + "grad_norm": 0.48859861493110657, + "learning_rate": 2.2252344189033213e-08, + "loss": 0.0759, + "num_input_tokens_seen": 130696240, + "step": 60560 + }, + { + "epoch": 9.880097879282218, + "grad_norm": 0.886803925037384, + "learning_rate": 2.1953099620911076e-08, + "loss": 0.118, + "num_input_tokens_seen": 130706064, + "step": 60565 + }, + { + "epoch": 9.880913539967374, + "grad_norm": 0.2066485732793808, + "learning_rate": 2.165587990000939e-08, + "loss": 0.0495, + "num_input_tokens_seen": 130716720, + "step": 60570 + }, + { + "epoch": 9.88172920065253, + "grad_norm": 0.4094139039516449, + "learning_rate": 2.1360685050419994e-08, + "loss": 0.1235, + "num_input_tokens_seen": 130726288, + "step": 60575 + }, + { + "epoch": 9.882544861337683, + "grad_norm": 0.12372993677854538, + "learning_rate": 2.106751509607374e-08, + "loss": 0.1208, + "num_input_tokens_seen": 130736368, + "step": 60580 + }, + { + "epoch": 9.883360522022839, + "grad_norm": 0.15551476180553436, + "learning_rate": 2.0776370060737737e-08, + "loss": 0.205, + "num_input_tokens_seen": 130746160, + "step": 60585 + }, + { + "epoch": 9.884176182707993, + "grad_norm": 0.09443182498216629, + "learning_rate": 2.0487249968012546e-08, + "loss": 0.1263, + "num_input_tokens_seen": 130755728, + "step": 60590 + }, + { + "epoch": 9.884991843393149, + "grad_norm": 0.22864508628845215, + "learning_rate": 2.020015484133497e-08, + "loss": 0.0554, + "num_input_tokens_seen": 130767088, + "step": 60595 + }, + { + "epoch": 9.885807504078304, + "grad_norm": 0.10961823910474777, + "learning_rate": 1.9915084703980845e-08, + "loss": 0.0697, + "num_input_tokens_seen": 130776944, + "step": 60600 + }, + { + "epoch": 9.886623164763458, + "grad_norm": 1.38397216796875, + "learning_rate": 1.9632039579053907e-08, + "loss": 0.0775, + "num_input_tokens_seen": 130786480, + "step": 60605 + }, + { + "epoch": 9.887438825448614, + "grad_norm": 0.032372549176216125, + "learning_rate": 1.935101948950524e-08, + "loss": 0.0293, + "num_input_tokens_seen": 130797968, + "step": 60610 + }, + { + "epoch": 9.888254486133768, + "grad_norm": 2.720259189605713, + "learning_rate": 1.9072024458113847e-08, + "loss": 0.2113, + "num_input_tokens_seen": 130808272, + "step": 60615 + }, + { + "epoch": 9.889070146818923, + "grad_norm": 0.08872219920158386, + "learning_rate": 1.8795054507494967e-08, + "loss": 0.1765, + "num_input_tokens_seen": 130817488, + "step": 60620 + }, + { + "epoch": 9.88988580750408, + "grad_norm": 0.08916054666042328, + "learning_rate": 1.852010966010287e-08, + "loss": 0.0127, + "num_input_tokens_seen": 130828432, + "step": 60625 + }, + { + "epoch": 9.890701468189233, + "grad_norm": 0.12110397964715958, + "learning_rate": 1.8247189938225274e-08, + "loss": 0.0946, + "num_input_tokens_seen": 130838416, + "step": 60630 + }, + { + "epoch": 9.891517128874389, + "grad_norm": 0.766689121723175, + "learning_rate": 1.7976295363988927e-08, + "loss": 0.0271, + "num_input_tokens_seen": 130847920, + "step": 60635 + }, + { + "epoch": 9.892332789559543, + "grad_norm": 1.4350377321243286, + "learning_rate": 1.7707425959348488e-08, + "loss": 0.2669, + "num_input_tokens_seen": 130859184, + "step": 60640 + }, + { + "epoch": 9.893148450244698, + "grad_norm": 0.0512918122112751, + "learning_rate": 1.744058174610319e-08, + "loss": 0.0851, + "num_input_tokens_seen": 130869808, + "step": 60645 + }, + { + "epoch": 9.893964110929852, + "grad_norm": 0.2971656918525696, + "learning_rate": 1.7175762745885727e-08, + "loss": 0.0643, + "num_input_tokens_seen": 130880912, + "step": 60650 + }, + { + "epoch": 9.894779771615008, + "grad_norm": 2.3093202114105225, + "learning_rate": 1.6912968980162257e-08, + "loss": 0.0962, + "num_input_tokens_seen": 130892336, + "step": 60655 + }, + { + "epoch": 9.895595432300164, + "grad_norm": 0.38605746626853943, + "learning_rate": 1.665220047023519e-08, + "loss": 0.1096, + "num_input_tokens_seen": 130900752, + "step": 60660 + }, + { + "epoch": 9.896411092985318, + "grad_norm": 0.10794999450445175, + "learning_rate": 1.639345723724872e-08, + "loss": 0.1384, + "num_input_tokens_seen": 130911600, + "step": 60665 + }, + { + "epoch": 9.897226753670473, + "grad_norm": 0.15154238045215607, + "learning_rate": 1.6136739302169412e-08, + "loss": 0.0336, + "num_input_tokens_seen": 130922640, + "step": 60670 + }, + { + "epoch": 9.898042414355627, + "grad_norm": 0.7391504645347595, + "learning_rate": 1.588204668581672e-08, + "loss": 0.0902, + "num_input_tokens_seen": 130933136, + "step": 60675 + }, + { + "epoch": 9.898858075040783, + "grad_norm": 0.251347154378891, + "learning_rate": 1.5629379408832468e-08, + "loss": 0.1046, + "num_input_tokens_seen": 130944336, + "step": 60680 + }, + { + "epoch": 9.899673735725939, + "grad_norm": 0.5382744669914246, + "learning_rate": 1.537873749169749e-08, + "loss": 0.1033, + "num_input_tokens_seen": 130952848, + "step": 60685 + }, + { + "epoch": 9.900489396411093, + "grad_norm": 0.09825175255537033, + "learning_rate": 1.513012095473443e-08, + "loss": 0.1138, + "num_input_tokens_seen": 130963536, + "step": 60690 + }, + { + "epoch": 9.901305057096248, + "grad_norm": 0.6000102758407593, + "learning_rate": 1.4883529818096598e-08, + "loss": 0.2297, + "num_input_tokens_seen": 130973840, + "step": 60695 + }, + { + "epoch": 9.902120717781402, + "grad_norm": 0.10964076966047287, + "learning_rate": 1.4638964101773568e-08, + "loss": 0.0698, + "num_input_tokens_seen": 130984304, + "step": 60700 + }, + { + "epoch": 9.902936378466558, + "grad_norm": 0.39939531683921814, + "learning_rate": 1.4396423825588367e-08, + "loss": 0.1084, + "num_input_tokens_seen": 130995504, + "step": 60705 + }, + { + "epoch": 9.903752039151712, + "grad_norm": 2.089472532272339, + "learning_rate": 1.4155909009205826e-08, + "loss": 0.1944, + "num_input_tokens_seen": 131005968, + "step": 60710 + }, + { + "epoch": 9.904567699836868, + "grad_norm": 0.3247097432613373, + "learning_rate": 1.3917419672124233e-08, + "loss": 0.0333, + "num_input_tokens_seen": 131018160, + "step": 60715 + }, + { + "epoch": 9.905383360522023, + "grad_norm": 0.21796289086341858, + "learning_rate": 1.368095583367257e-08, + "loss": 0.1051, + "num_input_tokens_seen": 131029008, + "step": 60720 + }, + { + "epoch": 9.906199021207177, + "grad_norm": 1.331068515777588, + "learning_rate": 1.3446517513021617e-08, + "loss": 0.1498, + "num_input_tokens_seen": 131038960, + "step": 60725 + }, + { + "epoch": 9.907014681892333, + "grad_norm": 0.1166331022977829, + "learning_rate": 1.321410472917839e-08, + "loss": 0.0539, + "num_input_tokens_seen": 131050160, + "step": 60730 + }, + { + "epoch": 9.907830342577487, + "grad_norm": 0.6199157238006592, + "learning_rate": 1.2983717500977822e-08, + "loss": 0.0653, + "num_input_tokens_seen": 131060048, + "step": 60735 + }, + { + "epoch": 9.908646003262643, + "grad_norm": 2.3776750564575195, + "learning_rate": 1.275535584710219e-08, + "loss": 0.1472, + "num_input_tokens_seen": 131069360, + "step": 60740 + }, + { + "epoch": 9.909461663947798, + "grad_norm": 0.40564456582069397, + "learning_rate": 1.2529019786061691e-08, + "loss": 0.1709, + "num_input_tokens_seen": 131079888, + "step": 60745 + }, + { + "epoch": 9.910277324632952, + "grad_norm": 0.8215348124504089, + "learning_rate": 1.2304709336205533e-08, + "loss": 0.0493, + "num_input_tokens_seen": 131090512, + "step": 60750 + }, + { + "epoch": 9.911092985318108, + "grad_norm": 0.3610813021659851, + "learning_rate": 1.2082424515713619e-08, + "loss": 0.1442, + "num_input_tokens_seen": 131101680, + "step": 60755 + }, + { + "epoch": 9.911908646003262, + "grad_norm": 0.4276776909828186, + "learning_rate": 1.1862165342607645e-08, + "loss": 0.1365, + "num_input_tokens_seen": 131112336, + "step": 60760 + }, + { + "epoch": 9.912724306688418, + "grad_norm": 0.45619457960128784, + "learning_rate": 1.1643931834745548e-08, + "loss": 0.0661, + "num_input_tokens_seen": 131123088, + "step": 60765 + }, + { + "epoch": 9.913539967373573, + "grad_norm": 1.032450556755066, + "learning_rate": 1.1427724009813179e-08, + "loss": 0.1312, + "num_input_tokens_seen": 131132368, + "step": 60770 + }, + { + "epoch": 9.914355628058727, + "grad_norm": 0.2613178789615631, + "learning_rate": 1.1213541885340962e-08, + "loss": 0.0679, + "num_input_tokens_seen": 131143600, + "step": 60775 + }, + { + "epoch": 9.915171288743883, + "grad_norm": 0.29900801181793213, + "learning_rate": 1.1001385478692783e-08, + "loss": 0.028, + "num_input_tokens_seen": 131153872, + "step": 60780 + }, + { + "epoch": 9.915986949429037, + "grad_norm": 0.3229835033416748, + "learning_rate": 1.0791254807063223e-08, + "loss": 0.0701, + "num_input_tokens_seen": 131164848, + "step": 60785 + }, + { + "epoch": 9.916802610114193, + "grad_norm": 1.0704872608184814, + "learning_rate": 1.0583149887488653e-08, + "loss": 0.2202, + "num_input_tokens_seen": 131175280, + "step": 60790 + }, + { + "epoch": 9.917618270799348, + "grad_norm": 0.900990903377533, + "learning_rate": 1.0377070736838912e-08, + "loss": 0.0785, + "num_input_tokens_seen": 131185776, + "step": 60795 + }, + { + "epoch": 9.918433931484502, + "grad_norm": 2.2556674480438232, + "learning_rate": 1.017301737182008e-08, + "loss": 0.1421, + "num_input_tokens_seen": 131196944, + "step": 60800 + }, + { + "epoch": 9.919249592169658, + "grad_norm": 0.05243485048413277, + "learning_rate": 9.970989808974485e-09, + "loss": 0.0503, + "num_input_tokens_seen": 131208752, + "step": 60805 + }, + { + "epoch": 9.920065252854812, + "grad_norm": 0.37581488490104675, + "learning_rate": 9.77098806467791e-09, + "loss": 0.1024, + "num_input_tokens_seen": 131217360, + "step": 60810 + }, + { + "epoch": 9.920880913539968, + "grad_norm": 0.1582769751548767, + "learning_rate": 9.573012155145166e-09, + "loss": 0.0255, + "num_input_tokens_seen": 131228304, + "step": 60815 + }, + { + "epoch": 9.921696574225122, + "grad_norm": 0.05382923781871796, + "learning_rate": 9.37706209642175e-09, + "loss": 0.0735, + "num_input_tokens_seen": 131239568, + "step": 60820 + }, + { + "epoch": 9.922512234910277, + "grad_norm": 1.4000935554504395, + "learning_rate": 9.183137904397732e-09, + "loss": 0.1901, + "num_input_tokens_seen": 131250576, + "step": 60825 + }, + { + "epoch": 9.923327895595433, + "grad_norm": 0.23853084444999695, + "learning_rate": 8.991239594788315e-09, + "loss": 0.089, + "num_input_tokens_seen": 131261616, + "step": 60830 + }, + { + "epoch": 9.924143556280587, + "grad_norm": 0.2041265070438385, + "learning_rate": 8.801367183153276e-09, + "loss": 0.141, + "num_input_tokens_seen": 131272400, + "step": 60835 + }, + { + "epoch": 9.924959216965743, + "grad_norm": 0.7052086591720581, + "learning_rate": 8.61352068488308e-09, + "loss": 0.1992, + "num_input_tokens_seen": 131283280, + "step": 60840 + }, + { + "epoch": 9.925774877650896, + "grad_norm": 0.5031229853630066, + "learning_rate": 8.427700115207216e-09, + "loss": 0.058, + "num_input_tokens_seen": 131293808, + "step": 60845 + }, + { + "epoch": 9.926590538336052, + "grad_norm": 0.11865177750587463, + "learning_rate": 8.243905489185855e-09, + "loss": 0.0442, + "num_input_tokens_seen": 131304016, + "step": 60850 + }, + { + "epoch": 9.927406199021208, + "grad_norm": 0.22925367951393127, + "learning_rate": 8.062136821723742e-09, + "loss": 0.1841, + "num_input_tokens_seen": 131314032, + "step": 60855 + }, + { + "epoch": 9.928221859706362, + "grad_norm": 0.7639937400817871, + "learning_rate": 7.882394127550763e-09, + "loss": 0.1382, + "num_input_tokens_seen": 131324112, + "step": 60860 + }, + { + "epoch": 9.929037520391518, + "grad_norm": 0.28040674328804016, + "learning_rate": 7.704677421238593e-09, + "loss": 0.1373, + "num_input_tokens_seen": 131334352, + "step": 60865 + }, + { + "epoch": 9.929853181076671, + "grad_norm": 0.6466479897499084, + "learning_rate": 7.528986717195152e-09, + "loss": 0.3011, + "num_input_tokens_seen": 131344656, + "step": 60870 + }, + { + "epoch": 9.930668841761827, + "grad_norm": 0.4012351632118225, + "learning_rate": 7.355322029661826e-09, + "loss": 0.0609, + "num_input_tokens_seen": 131355984, + "step": 60875 + }, + { + "epoch": 9.931484502446983, + "grad_norm": 0.6722109913825989, + "learning_rate": 7.183683372719019e-09, + "loss": 0.0576, + "num_input_tokens_seen": 131367056, + "step": 60880 + }, + { + "epoch": 9.932300163132137, + "grad_norm": 1.1015123128890991, + "learning_rate": 7.0140707602805995e-09, + "loss": 0.0916, + "num_input_tokens_seen": 131378192, + "step": 60885 + }, + { + "epoch": 9.933115823817293, + "grad_norm": 0.09952694922685623, + "learning_rate": 6.846484206091131e-09, + "loss": 0.2188, + "num_input_tokens_seen": 131389040, + "step": 60890 + }, + { + "epoch": 9.933931484502446, + "grad_norm": 0.8001317977905273, + "learning_rate": 6.6809237237425156e-09, + "loss": 0.0565, + "num_input_tokens_seen": 131399664, + "step": 60895 + }, + { + "epoch": 9.934747145187602, + "grad_norm": 0.9058520793914795, + "learning_rate": 6.517389326651801e-09, + "loss": 0.104, + "num_input_tokens_seen": 131409680, + "step": 60900 + }, + { + "epoch": 9.935562805872756, + "grad_norm": 1.2568798065185547, + "learning_rate": 6.3558810280778254e-09, + "loss": 0.091, + "num_input_tokens_seen": 131420240, + "step": 60905 + }, + { + "epoch": 9.936378466557912, + "grad_norm": 1.121145248413086, + "learning_rate": 6.196398841112893e-09, + "loss": 0.076, + "num_input_tokens_seen": 131429488, + "step": 60910 + }, + { + "epoch": 9.937194127243067, + "grad_norm": 0.14027339220046997, + "learning_rate": 6.038942778685553e-09, + "loss": 0.1303, + "num_input_tokens_seen": 131439792, + "step": 60915 + }, + { + "epoch": 9.938009787928221, + "grad_norm": 0.04226572439074516, + "learning_rate": 5.883512853557816e-09, + "loss": 0.1013, + "num_input_tokens_seen": 131448976, + "step": 60920 + }, + { + "epoch": 9.938825448613377, + "grad_norm": 0.15958139300346375, + "learning_rate": 5.730109078330714e-09, + "loss": 0.1507, + "num_input_tokens_seen": 131459088, + "step": 60925 + }, + { + "epoch": 9.939641109298531, + "grad_norm": 0.3867116868495941, + "learning_rate": 5.578731465444298e-09, + "loss": 0.0772, + "num_input_tokens_seen": 131470096, + "step": 60930 + }, + { + "epoch": 9.940456769983687, + "grad_norm": 1.5240671634674072, + "learning_rate": 5.429380027163755e-09, + "loss": 0.1767, + "num_input_tokens_seen": 131479376, + "step": 60935 + }, + { + "epoch": 9.941272430668842, + "grad_norm": 0.2996193766593933, + "learning_rate": 5.2820547755988434e-09, + "loss": 0.0974, + "num_input_tokens_seen": 131491792, + "step": 60940 + }, + { + "epoch": 9.942088091353996, + "grad_norm": 0.09192058444023132, + "learning_rate": 5.1367557226927875e-09, + "loss": 0.0081, + "num_input_tokens_seen": 131502512, + "step": 60945 + }, + { + "epoch": 9.942903752039152, + "grad_norm": 1.1035370826721191, + "learning_rate": 4.9934828802250535e-09, + "loss": 0.2103, + "num_input_tokens_seen": 131514224, + "step": 60950 + }, + { + "epoch": 9.943719412724306, + "grad_norm": 0.5145384073257446, + "learning_rate": 4.852236259805798e-09, + "loss": 0.1219, + "num_input_tokens_seen": 131525840, + "step": 60955 + }, + { + "epoch": 9.944535073409462, + "grad_norm": 0.07911545783281326, + "learning_rate": 4.7130158728925236e-09, + "loss": 0.0412, + "num_input_tokens_seen": 131536208, + "step": 60960 + }, + { + "epoch": 9.945350734094617, + "grad_norm": 2.092543840408325, + "learning_rate": 4.575821730765095e-09, + "loss": 0.1099, + "num_input_tokens_seen": 131547344, + "step": 60965 + }, + { + "epoch": 9.946166394779771, + "grad_norm": 2.521390676498413, + "learning_rate": 4.440653844545173e-09, + "loss": 0.1263, + "num_input_tokens_seen": 131558544, + "step": 60970 + }, + { + "epoch": 9.946982055464927, + "grad_norm": 0.21749073266983032, + "learning_rate": 4.307512225196209e-09, + "loss": 0.1853, + "num_input_tokens_seen": 131569136, + "step": 60975 + }, + { + "epoch": 9.947797716150081, + "grad_norm": 0.36385878920555115, + "learning_rate": 4.176396883504019e-09, + "loss": 0.1298, + "num_input_tokens_seen": 131580848, + "step": 60980 + }, + { + "epoch": 9.948613376835237, + "grad_norm": 0.0878366082906723, + "learning_rate": 4.047307830101765e-09, + "loss": 0.0163, + "num_input_tokens_seen": 131591664, + "step": 60985 + }, + { + "epoch": 9.949429037520392, + "grad_norm": 0.33480018377304077, + "learning_rate": 3.9202450754533e-09, + "loss": 0.1014, + "num_input_tokens_seen": 131602032, + "step": 60990 + }, + { + "epoch": 9.950244698205546, + "grad_norm": 1.6440725326538086, + "learning_rate": 3.79520862985594e-09, + "loss": 0.1443, + "num_input_tokens_seen": 131613520, + "step": 60995 + }, + { + "epoch": 9.951060358890702, + "grad_norm": 0.5772284865379333, + "learning_rate": 3.6721985034515738e-09, + "loss": 0.077, + "num_input_tokens_seen": 131624624, + "step": 61000 + }, + { + "epoch": 9.951876019575856, + "grad_norm": 0.20617149770259857, + "learning_rate": 3.5512147062072287e-09, + "loss": 0.0698, + "num_input_tokens_seen": 131634416, + "step": 61005 + }, + { + "epoch": 9.952691680261012, + "grad_norm": 0.8018596768379211, + "learning_rate": 3.4322572479345005e-09, + "loss": 0.0326, + "num_input_tokens_seen": 131645232, + "step": 61010 + }, + { + "epoch": 9.953507340946166, + "grad_norm": 1.1893234252929688, + "learning_rate": 3.3153261382729008e-09, + "loss": 0.2056, + "num_input_tokens_seen": 131655824, + "step": 61015 + }, + { + "epoch": 9.954323001631321, + "grad_norm": 1.0798609256744385, + "learning_rate": 3.2004213867009582e-09, + "loss": 0.0604, + "num_input_tokens_seen": 131666640, + "step": 61020 + }, + { + "epoch": 9.955138662316477, + "grad_norm": 0.19304655492305756, + "learning_rate": 3.0875430025362197e-09, + "loss": 0.072, + "num_input_tokens_seen": 131675120, + "step": 61025 + }, + { + "epoch": 9.955954323001631, + "grad_norm": 0.20073023438453674, + "learning_rate": 2.9766909949296983e-09, + "loss": 0.1148, + "num_input_tokens_seen": 131686096, + "step": 61030 + }, + { + "epoch": 9.956769983686787, + "grad_norm": 0.21722692251205444, + "learning_rate": 2.8678653728658746e-09, + "loss": 0.0297, + "num_input_tokens_seen": 131697968, + "step": 61035 + }, + { + "epoch": 9.95758564437194, + "grad_norm": 0.0813223347067833, + "learning_rate": 2.761066145168245e-09, + "loss": 0.0948, + "num_input_tokens_seen": 131709168, + "step": 61040 + }, + { + "epoch": 9.958401305057096, + "grad_norm": 0.1980203241109848, + "learning_rate": 2.656293320490999e-09, + "loss": 0.1931, + "num_input_tokens_seen": 131720656, + "step": 61045 + }, + { + "epoch": 9.959216965742252, + "grad_norm": 0.6853441596031189, + "learning_rate": 2.5535469073301176e-09, + "loss": 0.0503, + "num_input_tokens_seen": 131731536, + "step": 61050 + }, + { + "epoch": 9.960032626427406, + "grad_norm": 0.6942384839057922, + "learning_rate": 2.4528269140150497e-09, + "loss": 0.042, + "num_input_tokens_seen": 131742928, + "step": 61055 + }, + { + "epoch": 9.960848287112562, + "grad_norm": 1.4624426364898682, + "learning_rate": 2.354133348711485e-09, + "loss": 0.1461, + "num_input_tokens_seen": 131752496, + "step": 61060 + }, + { + "epoch": 9.961663947797716, + "grad_norm": 0.05512118712067604, + "learning_rate": 2.2574662194158047e-09, + "loss": 0.075, + "num_input_tokens_seen": 131764048, + "step": 61065 + }, + { + "epoch": 9.962479608482871, + "grad_norm": 1.0332849025726318, + "learning_rate": 2.1628255339689596e-09, + "loss": 0.1111, + "num_input_tokens_seen": 131774352, + "step": 61070 + }, + { + "epoch": 9.963295269168025, + "grad_norm": 0.9794241189956665, + "learning_rate": 2.0702113000425903e-09, + "loss": 0.0598, + "num_input_tokens_seen": 131783696, + "step": 61075 + }, + { + "epoch": 9.964110929853181, + "grad_norm": 0.10865426063537598, + "learning_rate": 1.979623525141805e-09, + "loss": 0.0621, + "num_input_tokens_seen": 131795696, + "step": 61080 + }, + { + "epoch": 9.964926590538337, + "grad_norm": 0.7038489580154419, + "learning_rate": 1.891062216610728e-09, + "loss": 0.0434, + "num_input_tokens_seen": 131805616, + "step": 61085 + }, + { + "epoch": 9.96574225122349, + "grad_norm": 0.812766969203949, + "learning_rate": 1.804527381629728e-09, + "loss": 0.2073, + "num_input_tokens_seen": 131815056, + "step": 61090 + }, + { + "epoch": 9.966557911908646, + "grad_norm": 0.4233407974243164, + "learning_rate": 1.7200190272126382e-09, + "loss": 0.0338, + "num_input_tokens_seen": 131825392, + "step": 61095 + }, + { + "epoch": 9.9673735725938, + "grad_norm": 1.5421538352966309, + "learning_rate": 1.6375371602123103e-09, + "loss": 0.1291, + "num_input_tokens_seen": 131836144, + "step": 61100 + }, + { + "epoch": 9.968189233278956, + "grad_norm": 0.7180038690567017, + "learning_rate": 1.5570817873122868e-09, + "loss": 0.0851, + "num_input_tokens_seen": 131846672, + "step": 61105 + }, + { + "epoch": 9.969004893964112, + "grad_norm": 1.4333051443099976, + "learning_rate": 1.4786529150379036e-09, + "loss": 0.2938, + "num_input_tokens_seen": 131858544, + "step": 61110 + }, + { + "epoch": 9.969820554649266, + "grad_norm": 0.08984183520078659, + "learning_rate": 1.4022505497424122e-09, + "loss": 0.1164, + "num_input_tokens_seen": 131869296, + "step": 61115 + }, + { + "epoch": 9.970636215334421, + "grad_norm": 0.16206775605678558, + "learning_rate": 1.3278746976236322e-09, + "loss": 0.1018, + "num_input_tokens_seen": 131879376, + "step": 61120 + }, + { + "epoch": 9.971451876019575, + "grad_norm": 0.22377245128154755, + "learning_rate": 1.255525364710075e-09, + "loss": 0.0394, + "num_input_tokens_seen": 131890768, + "step": 61125 + }, + { + "epoch": 9.97226753670473, + "grad_norm": 0.33986184000968933, + "learning_rate": 1.1852025568637183e-09, + "loss": 0.0522, + "num_input_tokens_seen": 131901104, + "step": 61130 + }, + { + "epoch": 9.973083197389887, + "grad_norm": 0.04135807231068611, + "learning_rate": 1.116906279791108e-09, + "loss": 0.2249, + "num_input_tokens_seen": 131912208, + "step": 61135 + }, + { + "epoch": 9.97389885807504, + "grad_norm": 2.50321888923645, + "learning_rate": 1.0506365390211547e-09, + "loss": 0.2434, + "num_input_tokens_seen": 131923472, + "step": 61140 + }, + { + "epoch": 9.974714518760196, + "grad_norm": 1.9483519792556763, + "learning_rate": 9.863933399328895e-10, + "loss": 0.312, + "num_input_tokens_seen": 131934384, + "step": 61145 + }, + { + "epoch": 9.97553017944535, + "grad_norm": 0.19242602586746216, + "learning_rate": 9.241766877304825e-10, + "loss": 0.1477, + "num_input_tokens_seen": 131944784, + "step": 61150 + }, + { + "epoch": 9.976345840130506, + "grad_norm": 1.0734996795654297, + "learning_rate": 8.639865874571218e-10, + "loss": 0.1051, + "num_input_tokens_seen": 131955088, + "step": 61155 + }, + { + "epoch": 9.977161500815662, + "grad_norm": 0.08836201578378677, + "learning_rate": 8.058230439950132e-10, + "loss": 0.1377, + "num_input_tokens_seen": 131966384, + "step": 61160 + }, + { + "epoch": 9.977977161500815, + "grad_norm": 0.1768026500940323, + "learning_rate": 7.496860620570534e-10, + "loss": 0.1454, + "num_input_tokens_seen": 131977200, + "step": 61165 + }, + { + "epoch": 9.978792822185971, + "grad_norm": 0.7439782619476318, + "learning_rate": 6.955756461951568e-10, + "loss": 0.1555, + "num_input_tokens_seen": 131988848, + "step": 61170 + }, + { + "epoch": 9.979608482871125, + "grad_norm": 1.466532826423645, + "learning_rate": 6.434918007947044e-10, + "loss": 0.126, + "num_input_tokens_seen": 132000752, + "step": 61175 + }, + { + "epoch": 9.98042414355628, + "grad_norm": 0.690096914768219, + "learning_rate": 5.934345300773192e-10, + "loss": 0.0354, + "num_input_tokens_seen": 132010544, + "step": 61180 + }, + { + "epoch": 9.981239804241435, + "grad_norm": 0.2265433967113495, + "learning_rate": 5.454038381008664e-10, + "loss": 0.1249, + "num_input_tokens_seen": 132021552, + "step": 61185 + }, + { + "epoch": 9.98205546492659, + "grad_norm": 0.17778335511684418, + "learning_rate": 4.993997287622287e-10, + "loss": 0.2129, + "num_input_tokens_seen": 132032752, + "step": 61190 + }, + { + "epoch": 9.982871125611746, + "grad_norm": 0.09469524025917053, + "learning_rate": 4.554222057889801e-10, + "loss": 0.0188, + "num_input_tokens_seen": 132043536, + "step": 61195 + }, + { + "epoch": 9.9836867862969, + "grad_norm": 0.5308228731155396, + "learning_rate": 4.1347127274493635e-10, + "loss": 0.0796, + "num_input_tokens_seen": 132053840, + "step": 61200 + }, + { + "epoch": 9.984502446982056, + "grad_norm": 0.17310892045497894, + "learning_rate": 3.735469330301555e-10, + "loss": 0.0708, + "num_input_tokens_seen": 132064400, + "step": 61205 + }, + { + "epoch": 9.98531810766721, + "grad_norm": 0.08749160915613174, + "learning_rate": 3.356491898837133e-10, + "loss": 0.0999, + "num_input_tokens_seen": 132073552, + "step": 61210 + }, + { + "epoch": 9.986133768352365, + "grad_norm": 0.33518481254577637, + "learning_rate": 2.997780463753763e-10, + "loss": 0.0655, + "num_input_tokens_seen": 132084400, + "step": 61215 + }, + { + "epoch": 9.986949429037521, + "grad_norm": 0.600452184677124, + "learning_rate": 2.659335054139289e-10, + "loss": 0.1141, + "num_input_tokens_seen": 132095856, + "step": 61220 + }, + { + "epoch": 9.987765089722675, + "grad_norm": 0.18527080118656158, + "learning_rate": 2.341155697471731e-10, + "loss": 0.1055, + "num_input_tokens_seen": 132108368, + "step": 61225 + }, + { + "epoch": 9.98858075040783, + "grad_norm": 0.1943722665309906, + "learning_rate": 2.043242419452751e-10, + "loss": 0.1935, + "num_input_tokens_seen": 132120432, + "step": 61230 + }, + { + "epoch": 9.989396411092985, + "grad_norm": 0.0777931660413742, + "learning_rate": 1.7655952443129675e-10, + "loss": 0.1797, + "num_input_tokens_seen": 132131568, + "step": 61235 + }, + { + "epoch": 9.99021207177814, + "grad_norm": 0.3111189007759094, + "learning_rate": 1.5082141945343963e-10, + "loss": 0.1287, + "num_input_tokens_seen": 132142992, + "step": 61240 + }, + { + "epoch": 9.991027732463294, + "grad_norm": 0.04403910040855408, + "learning_rate": 1.2710992909892306e-10, + "loss": 0.0247, + "num_input_tokens_seen": 132153520, + "step": 61245 + }, + { + "epoch": 9.99184339314845, + "grad_norm": 0.7367790937423706, + "learning_rate": 1.0542505528565727e-10, + "loss": 0.0722, + "num_input_tokens_seen": 132164368, + "step": 61250 + }, + { + "epoch": 9.992659053833606, + "grad_norm": 0.07988906651735306, + "learning_rate": 8.57667997788969e-11, + "loss": 0.0198, + "num_input_tokens_seen": 132174992, + "step": 61255 + }, + { + "epoch": 9.99347471451876, + "grad_norm": 1.0740950107574463, + "learning_rate": 6.813516416626087e-11, + "loss": 0.0763, + "num_input_tokens_seen": 132186320, + "step": 61260 + }, + { + "epoch": 9.994290375203915, + "grad_norm": 1.2808928489685059, + "learning_rate": 5.2530149877161315e-11, + "loss": 0.0857, + "num_input_tokens_seen": 132198000, + "step": 61265 + }, + { + "epoch": 9.99510603588907, + "grad_norm": 0.10808717459440231, + "learning_rate": 3.895175818002805e-11, + "loss": 0.1386, + "num_input_tokens_seen": 132209840, + "step": 61270 + }, + { + "epoch": 9.995921696574225, + "grad_norm": 0.1226329430937767, + "learning_rate": 2.7399990173981872e-11, + "loss": 0.0416, + "num_input_tokens_seen": 132220848, + "step": 61275 + }, + { + "epoch": 9.99673735725938, + "grad_norm": 1.7819468975067139, + "learning_rate": 1.7874846797161228e-11, + "loss": 0.1454, + "num_input_tokens_seen": 132230384, + "step": 61280 + }, + { + "epoch": 9.997553017944535, + "grad_norm": 0.048548053950071335, + "learning_rate": 1.0376328818395564e-11, + "loss": 0.17, + "num_input_tokens_seen": 132241200, + "step": 61285 + }, + { + "epoch": 9.99836867862969, + "grad_norm": 1.990525245666504, + "learning_rate": 4.90443684553199e-12, + "loss": 0.256, + "num_input_tokens_seen": 132251664, + "step": 61290 + }, + { + "epoch": 9.999184339314844, + "grad_norm": 0.07148000597953796, + "learning_rate": 1.4591713254352712e-12, + "loss": 0.0902, + "num_input_tokens_seen": 132262576, + "step": 61295 + }, + { + "epoch": 10.0, + "grad_norm": 0.9128270745277405, + "learning_rate": 4.053253843672167e-14, + "loss": 0.1177, + "num_input_tokens_seen": 132272272, + "step": 61300 + }, + { + "epoch": 10.0, + "eval_loss": 0.13692216575145721, + "eval_runtime": 131.8211, + "eval_samples_per_second": 20.672, + "eval_steps_per_second": 5.174, + "num_input_tokens_seen": 132272272, + "step": 61300 + }, + { + "epoch": 10.0, + "num_input_tokens_seen": 132272272, + "step": 61300, + "total_flos": 5.956316660046889e+18, + "train_loss": 0.1471858382194651, + "train_runtime": 40893.5226, + "train_samples_per_second": 5.996, + "train_steps_per_second": 1.499 + } + ], + "logging_steps": 5, + "max_steps": 61300, + "num_input_tokens_seen": 132272272, + "num_train_epochs": 10, + "save_steps": 3065, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.956316660046889e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}