{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 6710, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007451564828614009, "grad_norm": 26.2592177101648, "learning_rate": 2.4999998629955372e-06, "loss": 2.4544, "num_input_tokens_seen": 4096, "step": 1 }, { "epoch": 0.0014903129657228018, "grad_norm": 22.096361461004154, "learning_rate": 2.4999994519821783e-06, "loss": 2.1705, "num_input_tokens_seen": 8192, "step": 2 }, { "epoch": 0.0022354694485842027, "grad_norm": 42.084298324040915, "learning_rate": 2.499998766960014e-06, "loss": 2.1284, "num_input_tokens_seen": 12288, "step": 3 }, { "epoch": 0.0029806259314456036, "grad_norm": 15.55887220892588, "learning_rate": 2.4999978079291947e-06, "loss": 2.0113, "num_input_tokens_seen": 16384, "step": 4 }, { "epoch": 0.0037257824143070045, "grad_norm": 10.606157399433982, "learning_rate": 2.4999965748899295e-06, "loss": 1.8607, "num_input_tokens_seen": 20480, "step": 5 }, { "epoch": 0.004470938897168405, "grad_norm": 13.769665979577232, "learning_rate": 2.4999950678424892e-06, "loss": 1.7343, "num_input_tokens_seen": 24576, "step": 6 }, { "epoch": 0.005216095380029807, "grad_norm": 12.187025572099772, "learning_rate": 2.4999932867872047e-06, "loss": 2.1515, "num_input_tokens_seen": 28672, "step": 7 }, { "epoch": 0.005961251862891207, "grad_norm": 18.928271576552245, "learning_rate": 2.499991231724466e-06, "loss": 2.0501, "num_input_tokens_seen": 32768, "step": 8 }, { "epoch": 0.0067064083457526085, "grad_norm": 21.65347217050473, "learning_rate": 2.499988902654723e-06, "loss": 1.7586, "num_input_tokens_seen": 36864, "step": 9 }, { "epoch": 0.007451564828614009, "grad_norm": 20.707317974050305, "learning_rate": 2.499986299578487e-06, "loss": 2.0887, "num_input_tokens_seen": 40960, "step": 10 }, { "epoch": 0.00819672131147541, "grad_norm": 15.479510529891956, "learning_rate": 2.4999834224963287e-06, "loss": 1.7723, "num_input_tokens_seen": 45056, "step": 11 }, { "epoch": 0.00894187779433681, "grad_norm": 11.686052548485618, "learning_rate": 2.4999802714088784e-06, "loss": 1.96, "num_input_tokens_seen": 49152, "step": 12 }, { "epoch": 0.009687034277198211, "grad_norm": 8.881105305726823, "learning_rate": 2.4999768463168266e-06, "loss": 1.9163, "num_input_tokens_seen": 53248, "step": 13 }, { "epoch": 0.010432190760059613, "grad_norm": 11.413567961455172, "learning_rate": 2.4999731472209247e-06, "loss": 1.8526, "num_input_tokens_seen": 57344, "step": 14 }, { "epoch": 0.011177347242921014, "grad_norm": 8.475736578618825, "learning_rate": 2.4999691741219837e-06, "loss": 1.9305, "num_input_tokens_seen": 61440, "step": 15 }, { "epoch": 0.011922503725782414, "grad_norm": 8.406566352047948, "learning_rate": 2.4999649270208736e-06, "loss": 1.9454, "num_input_tokens_seen": 65536, "step": 16 }, { "epoch": 0.012667660208643815, "grad_norm": 20.687170861939844, "learning_rate": 2.4999604059185264e-06, "loss": 2.0263, "num_input_tokens_seen": 69632, "step": 17 }, { "epoch": 0.013412816691505217, "grad_norm": 10.07280812246135, "learning_rate": 2.4999556108159328e-06, "loss": 2.1609, "num_input_tokens_seen": 73728, "step": 18 }, { "epoch": 0.014157973174366617, "grad_norm": 16.843768898798537, "learning_rate": 2.4999505417141435e-06, "loss": 2.074, "num_input_tokens_seen": 77824, "step": 19 }, { "epoch": 0.014903129657228018, "grad_norm": 7.236448789789108, "learning_rate": 2.49994519861427e-06, "loss": 1.9962, "num_input_tokens_seen": 81920, "step": 20 }, { "epoch": 0.01564828614008942, "grad_norm": 10.738715331175193, "learning_rate": 2.4999395815174837e-06, "loss": 1.4408, "num_input_tokens_seen": 86016, "step": 21 }, { "epoch": 0.01639344262295082, "grad_norm": 10.915747892016858, "learning_rate": 2.4999336904250153e-06, "loss": 1.4921, "num_input_tokens_seen": 90112, "step": 22 }, { "epoch": 0.01713859910581222, "grad_norm": 8.966909728535994, "learning_rate": 2.4999275253381574e-06, "loss": 2.2125, "num_input_tokens_seen": 94208, "step": 23 }, { "epoch": 0.01788375558867362, "grad_norm": 11.532040739201362, "learning_rate": 2.49992108625826e-06, "loss": 1.5723, "num_input_tokens_seen": 98304, "step": 24 }, { "epoch": 0.018628912071535022, "grad_norm": 11.401390345917982, "learning_rate": 2.4999143731867355e-06, "loss": 1.6901, "num_input_tokens_seen": 102400, "step": 25 }, { "epoch": 0.019374068554396422, "grad_norm": 11.17350779681402, "learning_rate": 2.4999073861250554e-06, "loss": 1.7441, "num_input_tokens_seen": 106496, "step": 26 }, { "epoch": 0.020119225037257823, "grad_norm": 7.49121154350846, "learning_rate": 2.499900125074751e-06, "loss": 2.096, "num_input_tokens_seen": 110592, "step": 27 }, { "epoch": 0.020864381520119227, "grad_norm": 7.372382501855928, "learning_rate": 2.4998925900374143e-06, "loss": 1.8458, "num_input_tokens_seen": 114688, "step": 28 }, { "epoch": 0.021609538002980627, "grad_norm": 7.879041740902333, "learning_rate": 2.4998847810146966e-06, "loss": 1.7789, "num_input_tokens_seen": 118784, "step": 29 }, { "epoch": 0.022354694485842028, "grad_norm": 11.459426823857282, "learning_rate": 2.4998766980083098e-06, "loss": 1.6771, "num_input_tokens_seen": 122880, "step": 30 }, { "epoch": 0.023099850968703428, "grad_norm": 8.77467682892962, "learning_rate": 2.4998683410200258e-06, "loss": 1.9485, "num_input_tokens_seen": 126976, "step": 31 }, { "epoch": 0.02384500745156483, "grad_norm": 8.287367356428092, "learning_rate": 2.499859710051677e-06, "loss": 1.661, "num_input_tokens_seen": 131072, "step": 32 }, { "epoch": 0.02459016393442623, "grad_norm": 8.012791958458756, "learning_rate": 2.4998508051051546e-06, "loss": 1.5875, "num_input_tokens_seen": 135168, "step": 33 }, { "epoch": 0.02533532041728763, "grad_norm": 9.358178959840982, "learning_rate": 2.4998416261824113e-06, "loss": 1.5721, "num_input_tokens_seen": 139264, "step": 34 }, { "epoch": 0.02608047690014903, "grad_norm": 49.512750370014004, "learning_rate": 2.4998321732854585e-06, "loss": 2.3301, "num_input_tokens_seen": 143360, "step": 35 }, { "epoch": 0.026825633383010434, "grad_norm": 19.37834793243416, "learning_rate": 2.4998224464163693e-06, "loss": 2.0368, "num_input_tokens_seen": 147456, "step": 36 }, { "epoch": 0.027570789865871834, "grad_norm": 9.620704650070914, "learning_rate": 2.499812445577275e-06, "loss": 1.5724, "num_input_tokens_seen": 151552, "step": 37 }, { "epoch": 0.028315946348733235, "grad_norm": 8.612431964024728, "learning_rate": 2.499802170770368e-06, "loss": 1.2955, "num_input_tokens_seen": 155648, "step": 38 }, { "epoch": 0.029061102831594635, "grad_norm": 8.105303530798848, "learning_rate": 2.4997916219979007e-06, "loss": 1.9262, "num_input_tokens_seen": 159744, "step": 39 }, { "epoch": 0.029806259314456036, "grad_norm": 8.565400917054392, "learning_rate": 2.499780799262186e-06, "loss": 1.5556, "num_input_tokens_seen": 163840, "step": 40 }, { "epoch": 0.030551415797317436, "grad_norm": 12.880432050205359, "learning_rate": 2.499769702565596e-06, "loss": 1.6539, "num_input_tokens_seen": 167936, "step": 41 }, { "epoch": 0.03129657228017884, "grad_norm": 9.419348792719035, "learning_rate": 2.4997583319105624e-06, "loss": 1.3057, "num_input_tokens_seen": 172032, "step": 42 }, { "epoch": 0.03204172876304024, "grad_norm": 9.178114317653232, "learning_rate": 2.4997466872995786e-06, "loss": 1.7122, "num_input_tokens_seen": 176128, "step": 43 }, { "epoch": 0.03278688524590164, "grad_norm": 9.711384339318698, "learning_rate": 2.499734768735197e-06, "loss": 1.7401, "num_input_tokens_seen": 180224, "step": 44 }, { "epoch": 0.03353204172876304, "grad_norm": 8.816599893067004, "learning_rate": 2.49972257622003e-06, "loss": 1.7047, "num_input_tokens_seen": 184320, "step": 45 }, { "epoch": 0.03427719821162444, "grad_norm": 9.746736944823795, "learning_rate": 2.4997101097567508e-06, "loss": 1.567, "num_input_tokens_seen": 188416, "step": 46 }, { "epoch": 0.03502235469448584, "grad_norm": 8.51250387855355, "learning_rate": 2.499697369348092e-06, "loss": 1.6003, "num_input_tokens_seen": 192512, "step": 47 }, { "epoch": 0.03576751117734724, "grad_norm": 8.358458354529713, "learning_rate": 2.4996843549968457e-06, "loss": 1.4636, "num_input_tokens_seen": 196608, "step": 48 }, { "epoch": 0.03651266766020864, "grad_norm": 7.522612227453343, "learning_rate": 2.4996710667058654e-06, "loss": 2.0578, "num_input_tokens_seen": 200704, "step": 49 }, { "epoch": 0.037257824143070044, "grad_norm": 10.74100186969735, "learning_rate": 2.4996575044780637e-06, "loss": 1.3995, "num_input_tokens_seen": 204800, "step": 50 }, { "epoch": 0.038002980625931444, "grad_norm": 7.265803137479842, "learning_rate": 2.499643668316414e-06, "loss": 1.7972, "num_input_tokens_seen": 208896, "step": 51 }, { "epoch": 0.038748137108792845, "grad_norm": 8.48240248777722, "learning_rate": 2.4996295582239488e-06, "loss": 1.3429, "num_input_tokens_seen": 212992, "step": 52 }, { "epoch": 0.039493293591654245, "grad_norm": 6.3511796171244095, "learning_rate": 2.499615174203761e-06, "loss": 1.9008, "num_input_tokens_seen": 217088, "step": 53 }, { "epoch": 0.040238450074515646, "grad_norm": 8.570779072118688, "learning_rate": 2.4996005162590044e-06, "loss": 1.8574, "num_input_tokens_seen": 221184, "step": 54 }, { "epoch": 0.040983606557377046, "grad_norm": 8.295629157914748, "learning_rate": 2.4995855843928915e-06, "loss": 1.6347, "num_input_tokens_seen": 225280, "step": 55 }, { "epoch": 0.041728763040238454, "grad_norm": 8.840490119397352, "learning_rate": 2.4995703786086957e-06, "loss": 1.8412, "num_input_tokens_seen": 229376, "step": 56 }, { "epoch": 0.042473919523099854, "grad_norm": 7.1912986851205405, "learning_rate": 2.49955489890975e-06, "loss": 1.9378, "num_input_tokens_seen": 233472, "step": 57 }, { "epoch": 0.043219076005961254, "grad_norm": 7.925564843355136, "learning_rate": 2.499539145299448e-06, "loss": 1.6359, "num_input_tokens_seen": 237568, "step": 58 }, { "epoch": 0.043964232488822655, "grad_norm": 8.18981851900655, "learning_rate": 2.499523117781243e-06, "loss": 1.2167, "num_input_tokens_seen": 241664, "step": 59 }, { "epoch": 0.044709388971684055, "grad_norm": 9.440558749282143, "learning_rate": 2.4995068163586483e-06, "loss": 1.6408, "num_input_tokens_seen": 245760, "step": 60 }, { "epoch": 0.045454545454545456, "grad_norm": 7.810296577475019, "learning_rate": 2.4994902410352366e-06, "loss": 1.7217, "num_input_tokens_seen": 249856, "step": 61 }, { "epoch": 0.046199701937406856, "grad_norm": 11.768240771952748, "learning_rate": 2.4994733918146425e-06, "loss": 1.8552, "num_input_tokens_seen": 253952, "step": 62 }, { "epoch": 0.04694485842026826, "grad_norm": 14.13030482909867, "learning_rate": 2.4994562687005586e-06, "loss": 1.3624, "num_input_tokens_seen": 258048, "step": 63 }, { "epoch": 0.04769001490312966, "grad_norm": 6.8625783770105215, "learning_rate": 2.4994388716967386e-06, "loss": 2.0331, "num_input_tokens_seen": 262144, "step": 64 }, { "epoch": 0.04843517138599106, "grad_norm": 9.560315301093212, "learning_rate": 2.4994212008069962e-06, "loss": 1.2969, "num_input_tokens_seen": 266240, "step": 65 }, { "epoch": 0.04918032786885246, "grad_norm": 11.45413404996738, "learning_rate": 2.499403256035205e-06, "loss": 1.5188, "num_input_tokens_seen": 270336, "step": 66 }, { "epoch": 0.04992548435171386, "grad_norm": 7.105487155562134, "learning_rate": 2.4993850373852983e-06, "loss": 1.8453, "num_input_tokens_seen": 274432, "step": 67 }, { "epoch": 0.05067064083457526, "grad_norm": 14.254505023709651, "learning_rate": 2.4993665448612702e-06, "loss": 1.6573, "num_input_tokens_seen": 278528, "step": 68 }, { "epoch": 0.05141579731743666, "grad_norm": 7.696342686853101, "learning_rate": 2.4993477784671745e-06, "loss": 1.7681, "num_input_tokens_seen": 282624, "step": 69 }, { "epoch": 0.05216095380029806, "grad_norm": 8.740510535480826, "learning_rate": 2.499328738207124e-06, "loss": 1.4704, "num_input_tokens_seen": 286720, "step": 70 }, { "epoch": 0.05290611028315946, "grad_norm": 7.268669001877877, "learning_rate": 2.4993094240852935e-06, "loss": 1.9907, "num_input_tokens_seen": 290816, "step": 71 }, { "epoch": 0.05365126676602087, "grad_norm": 15.761363214554578, "learning_rate": 2.4992898361059163e-06, "loss": 1.7665, "num_input_tokens_seen": 294912, "step": 72 }, { "epoch": 0.05439642324888227, "grad_norm": 7.101160027338893, "learning_rate": 2.4992699742732864e-06, "loss": 1.6887, "num_input_tokens_seen": 299008, "step": 73 }, { "epoch": 0.05514157973174367, "grad_norm": 10.546660754092542, "learning_rate": 2.499249838591757e-06, "loss": 1.5901, "num_input_tokens_seen": 303104, "step": 74 }, { "epoch": 0.05588673621460507, "grad_norm": 6.66140205412926, "learning_rate": 2.4992294290657434e-06, "loss": 1.6904, "num_input_tokens_seen": 307200, "step": 75 }, { "epoch": 0.05663189269746647, "grad_norm": 8.693894061879814, "learning_rate": 2.499208745699718e-06, "loss": 1.6694, "num_input_tokens_seen": 311296, "step": 76 }, { "epoch": 0.05737704918032787, "grad_norm": 6.611435698062055, "learning_rate": 2.4991877884982154e-06, "loss": 1.7089, "num_input_tokens_seen": 315392, "step": 77 }, { "epoch": 0.05812220566318927, "grad_norm": 11.222642744404961, "learning_rate": 2.49916655746583e-06, "loss": 1.5323, "num_input_tokens_seen": 319488, "step": 78 }, { "epoch": 0.05886736214605067, "grad_norm": 7.042417228708208, "learning_rate": 2.499145052607215e-06, "loss": 1.6363, "num_input_tokens_seen": 323584, "step": 79 }, { "epoch": 0.05961251862891207, "grad_norm": 7.836278656629967, "learning_rate": 2.4991232739270847e-06, "loss": 1.8472, "num_input_tokens_seen": 327680, "step": 80 }, { "epoch": 0.06035767511177347, "grad_norm": 7.720300161152069, "learning_rate": 2.4991012214302136e-06, "loss": 1.675, "num_input_tokens_seen": 331776, "step": 81 }, { "epoch": 0.06110283159463487, "grad_norm": 8.530280503674181, "learning_rate": 2.499078895121435e-06, "loss": 1.6056, "num_input_tokens_seen": 335872, "step": 82 }, { "epoch": 0.06184798807749627, "grad_norm": 7.094195004993349, "learning_rate": 2.4990562950056437e-06, "loss": 1.7192, "num_input_tokens_seen": 339968, "step": 83 }, { "epoch": 0.06259314456035768, "grad_norm": 6.831533273326996, "learning_rate": 2.499033421087793e-06, "loss": 1.8451, "num_input_tokens_seen": 344064, "step": 84 }, { "epoch": 0.06333830104321908, "grad_norm": 9.328053056998593, "learning_rate": 2.4990102733728982e-06, "loss": 1.4133, "num_input_tokens_seen": 348160, "step": 85 }, { "epoch": 0.06408345752608048, "grad_norm": 6.728099055362108, "learning_rate": 2.4989868518660323e-06, "loss": 1.8014, "num_input_tokens_seen": 352256, "step": 86 }, { "epoch": 0.06482861400894188, "grad_norm": 10.3296814807208, "learning_rate": 2.49896315657233e-06, "loss": 1.4308, "num_input_tokens_seen": 356352, "step": 87 }, { "epoch": 0.06557377049180328, "grad_norm": 8.523012203547854, "learning_rate": 2.4989391874969854e-06, "loss": 1.4928, "num_input_tokens_seen": 360448, "step": 88 }, { "epoch": 0.06631892697466468, "grad_norm": 8.039701049552244, "learning_rate": 2.4989149446452527e-06, "loss": 1.5195, "num_input_tokens_seen": 364544, "step": 89 }, { "epoch": 0.06706408345752608, "grad_norm": 13.302888614075684, "learning_rate": 2.498890428022446e-06, "loss": 1.9749, "num_input_tokens_seen": 368640, "step": 90 }, { "epoch": 0.06780923994038748, "grad_norm": 9.46508868820543, "learning_rate": 2.4988656376339395e-06, "loss": 1.5448, "num_input_tokens_seen": 372736, "step": 91 }, { "epoch": 0.06855439642324888, "grad_norm": 7.385919440283184, "learning_rate": 2.498840573485168e-06, "loss": 2.0974, "num_input_tokens_seen": 376832, "step": 92 }, { "epoch": 0.06929955290611028, "grad_norm": 8.67791757050507, "learning_rate": 2.498815235581625e-06, "loss": 1.1699, "num_input_tokens_seen": 380928, "step": 93 }, { "epoch": 0.07004470938897168, "grad_norm": 10.026488279762964, "learning_rate": 2.4987896239288646e-06, "loss": 1.8106, "num_input_tokens_seen": 385024, "step": 94 }, { "epoch": 0.07078986587183309, "grad_norm": 8.804827779998126, "learning_rate": 2.498763738532502e-06, "loss": 1.8658, "num_input_tokens_seen": 389120, "step": 95 }, { "epoch": 0.07153502235469449, "grad_norm": 6.156989229677774, "learning_rate": 2.498737579398211e-06, "loss": 1.6157, "num_input_tokens_seen": 393216, "step": 96 }, { "epoch": 0.07228017883755589, "grad_norm": 8.735045713729635, "learning_rate": 2.4987111465317252e-06, "loss": 1.4651, "num_input_tokens_seen": 397312, "step": 97 }, { "epoch": 0.07302533532041729, "grad_norm": 6.833533797668797, "learning_rate": 2.4986844399388398e-06, "loss": 1.7253, "num_input_tokens_seen": 401408, "step": 98 }, { "epoch": 0.07377049180327869, "grad_norm": 11.486072006500864, "learning_rate": 2.498657459625409e-06, "loss": 1.8181, "num_input_tokens_seen": 405504, "step": 99 }, { "epoch": 0.07451564828614009, "grad_norm": 8.123142602068071, "learning_rate": 2.4986302055973466e-06, "loss": 1.7382, "num_input_tokens_seen": 409600, "step": 100 }, { "epoch": 0.07526080476900149, "grad_norm": 7.325321124100236, "learning_rate": 2.498602677860628e-06, "loss": 1.6686, "num_input_tokens_seen": 413696, "step": 101 }, { "epoch": 0.07600596125186289, "grad_norm": 8.48411470235438, "learning_rate": 2.4985748764212854e-06, "loss": 1.6356, "num_input_tokens_seen": 417792, "step": 102 }, { "epoch": 0.07675111773472429, "grad_norm": 7.383976017295508, "learning_rate": 2.4985468012854142e-06, "loss": 1.4051, "num_input_tokens_seen": 421888, "step": 103 }, { "epoch": 0.07749627421758569, "grad_norm": 7.762648783604647, "learning_rate": 2.4985184524591693e-06, "loss": 1.9119, "num_input_tokens_seen": 425984, "step": 104 }, { "epoch": 0.07824143070044709, "grad_norm": 8.558608244648465, "learning_rate": 2.4984898299487643e-06, "loss": 1.6375, "num_input_tokens_seen": 430080, "step": 105 }, { "epoch": 0.07898658718330849, "grad_norm": 14.9892187484667, "learning_rate": 2.4984609337604737e-06, "loss": 1.8567, "num_input_tokens_seen": 434176, "step": 106 }, { "epoch": 0.07973174366616989, "grad_norm": 10.823016690413843, "learning_rate": 2.4984317639006317e-06, "loss": 1.5457, "num_input_tokens_seen": 438272, "step": 107 }, { "epoch": 0.08047690014903129, "grad_norm": 7.279941675347542, "learning_rate": 2.498402320375632e-06, "loss": 1.6477, "num_input_tokens_seen": 442368, "step": 108 }, { "epoch": 0.08122205663189269, "grad_norm": 8.463783455397643, "learning_rate": 2.49837260319193e-06, "loss": 1.4253, "num_input_tokens_seen": 446464, "step": 109 }, { "epoch": 0.08196721311475409, "grad_norm": 7.94225823948356, "learning_rate": 2.4983426123560383e-06, "loss": 1.4538, "num_input_tokens_seen": 450560, "step": 110 }, { "epoch": 0.08271236959761549, "grad_norm": 9.233388562515422, "learning_rate": 2.4983123478745326e-06, "loss": 1.1376, "num_input_tokens_seen": 454656, "step": 111 }, { "epoch": 0.08345752608047691, "grad_norm": 7.641797603067413, "learning_rate": 2.4982818097540464e-06, "loss": 1.8518, "num_input_tokens_seen": 458752, "step": 112 }, { "epoch": 0.08420268256333831, "grad_norm": 11.029070574144072, "learning_rate": 2.4982509980012737e-06, "loss": 1.5002, "num_input_tokens_seen": 462848, "step": 113 }, { "epoch": 0.08494783904619971, "grad_norm": 7.5107108003207825, "learning_rate": 2.4982199126229693e-06, "loss": 1.8247, "num_input_tokens_seen": 466944, "step": 114 }, { "epoch": 0.08569299552906111, "grad_norm": 8.729817633145506, "learning_rate": 2.4981885536259463e-06, "loss": 1.6219, "num_input_tokens_seen": 471040, "step": 115 }, { "epoch": 0.08643815201192251, "grad_norm": 7.802316698015169, "learning_rate": 2.4981569210170804e-06, "loss": 1.8122, "num_input_tokens_seen": 475136, "step": 116 }, { "epoch": 0.08718330849478391, "grad_norm": 8.398438405642693, "learning_rate": 2.498125014803304e-06, "loss": 1.6108, "num_input_tokens_seen": 479232, "step": 117 }, { "epoch": 0.08792846497764531, "grad_norm": 9.294973699055676, "learning_rate": 2.498092834991613e-06, "loss": 1.3979, "num_input_tokens_seen": 483328, "step": 118 }, { "epoch": 0.08867362146050671, "grad_norm": 5.860888425536614, "learning_rate": 2.4980603815890593e-06, "loss": 1.4653, "num_input_tokens_seen": 487424, "step": 119 }, { "epoch": 0.08941877794336811, "grad_norm": 8.686058506666415, "learning_rate": 2.4980276546027586e-06, "loss": 1.7755, "num_input_tokens_seen": 491520, "step": 120 }, { "epoch": 0.09016393442622951, "grad_norm": 9.804580462589273, "learning_rate": 2.4979946540398837e-06, "loss": 1.6424, "num_input_tokens_seen": 495616, "step": 121 }, { "epoch": 0.09090909090909091, "grad_norm": 13.841843624554816, "learning_rate": 2.4979613799076697e-06, "loss": 1.6357, "num_input_tokens_seen": 499712, "step": 122 }, { "epoch": 0.09165424739195231, "grad_norm": 6.833767695364805, "learning_rate": 2.49792783221341e-06, "loss": 1.8447, "num_input_tokens_seen": 503808, "step": 123 }, { "epoch": 0.09239940387481371, "grad_norm": 7.217590430635042, "learning_rate": 2.4978940109644583e-06, "loss": 1.9269, "num_input_tokens_seen": 507904, "step": 124 }, { "epoch": 0.09314456035767511, "grad_norm": 8.486047589504965, "learning_rate": 2.497859916168229e-06, "loss": 1.7432, "num_input_tokens_seen": 512000, "step": 125 }, { "epoch": 0.09388971684053651, "grad_norm": 12.723775976899269, "learning_rate": 2.497825547832195e-06, "loss": 1.7555, "num_input_tokens_seen": 516096, "step": 126 }, { "epoch": 0.09463487332339791, "grad_norm": 10.798546550978926, "learning_rate": 2.497790905963891e-06, "loss": 1.9104, "num_input_tokens_seen": 520192, "step": 127 }, { "epoch": 0.09538002980625931, "grad_norm": 8.640613672636164, "learning_rate": 2.49775599057091e-06, "loss": 1.2189, "num_input_tokens_seen": 524288, "step": 128 }, { "epoch": 0.09612518628912071, "grad_norm": 8.96159234572974, "learning_rate": 2.497720801660906e-06, "loss": 1.4676, "num_input_tokens_seen": 528384, "step": 129 }, { "epoch": 0.09687034277198212, "grad_norm": 7.082376551029883, "learning_rate": 2.4976853392415934e-06, "loss": 1.4556, "num_input_tokens_seen": 532480, "step": 130 }, { "epoch": 0.09761549925484352, "grad_norm": 8.089417717505059, "learning_rate": 2.4976496033207448e-06, "loss": 1.4825, "num_input_tokens_seen": 536576, "step": 131 }, { "epoch": 0.09836065573770492, "grad_norm": 7.180234880099847, "learning_rate": 2.4976135939061945e-06, "loss": 1.5276, "num_input_tokens_seen": 540672, "step": 132 }, { "epoch": 0.09910581222056632, "grad_norm": 7.110556083798744, "learning_rate": 2.497577311005835e-06, "loss": 1.5871, "num_input_tokens_seen": 544768, "step": 133 }, { "epoch": 0.09985096870342772, "grad_norm": 8.038686245746069, "learning_rate": 2.497540754627621e-06, "loss": 2.0733, "num_input_tokens_seen": 548864, "step": 134 }, { "epoch": 0.10059612518628912, "grad_norm": 10.593379984751268, "learning_rate": 2.497503924779565e-06, "loss": 1.5911, "num_input_tokens_seen": 552960, "step": 135 }, { "epoch": 0.10134128166915052, "grad_norm": 7.7566012259745705, "learning_rate": 2.4974668214697412e-06, "loss": 1.3958, "num_input_tokens_seen": 557056, "step": 136 }, { "epoch": 0.10208643815201192, "grad_norm": 8.651602573444642, "learning_rate": 2.497429444706282e-06, "loss": 1.6137, "num_input_tokens_seen": 561152, "step": 137 }, { "epoch": 0.10283159463487332, "grad_norm": 8.973790268978703, "learning_rate": 2.4973917944973812e-06, "loss": 1.5397, "num_input_tokens_seen": 565248, "step": 138 }, { "epoch": 0.10357675111773472, "grad_norm": 9.529711617540084, "learning_rate": 2.4973538708512916e-06, "loss": 1.7236, "num_input_tokens_seen": 569344, "step": 139 }, { "epoch": 0.10432190760059612, "grad_norm": 9.832654003155726, "learning_rate": 2.497315673776327e-06, "loss": 1.5369, "num_input_tokens_seen": 573440, "step": 140 }, { "epoch": 0.10506706408345752, "grad_norm": 7.4536677323041705, "learning_rate": 2.49727720328086e-06, "loss": 1.8095, "num_input_tokens_seen": 577536, "step": 141 }, { "epoch": 0.10581222056631892, "grad_norm": 7.428409393100336, "learning_rate": 2.4972384593733233e-06, "loss": 1.6887, "num_input_tokens_seen": 581632, "step": 142 }, { "epoch": 0.10655737704918032, "grad_norm": 9.48651170993786, "learning_rate": 2.497199442062211e-06, "loss": 1.6321, "num_input_tokens_seen": 585728, "step": 143 }, { "epoch": 0.10730253353204174, "grad_norm": 6.463619443367253, "learning_rate": 2.4971601513560744e-06, "loss": 1.6307, "num_input_tokens_seen": 589824, "step": 144 }, { "epoch": 0.10804769001490314, "grad_norm": 11.819848966274531, "learning_rate": 2.4971205872635272e-06, "loss": 1.5312, "num_input_tokens_seen": 593920, "step": 145 }, { "epoch": 0.10879284649776454, "grad_norm": 6.967170114147165, "learning_rate": 2.4970807497932427e-06, "loss": 1.7228, "num_input_tokens_seen": 598016, "step": 146 }, { "epoch": 0.10953800298062594, "grad_norm": 8.68891027355326, "learning_rate": 2.4970406389539524e-06, "loss": 1.8044, "num_input_tokens_seen": 602112, "step": 147 }, { "epoch": 0.11028315946348734, "grad_norm": 7.45306540531903, "learning_rate": 2.497000254754449e-06, "loss": 1.6499, "num_input_tokens_seen": 606208, "step": 148 }, { "epoch": 0.11102831594634874, "grad_norm": 29.09166686323287, "learning_rate": 2.4969595972035863e-06, "loss": 2.1141, "num_input_tokens_seen": 610304, "step": 149 }, { "epoch": 0.11177347242921014, "grad_norm": 6.883376931914625, "learning_rate": 2.4969186663102753e-06, "loss": 1.7277, "num_input_tokens_seen": 614400, "step": 150 }, { "epoch": 0.11251862891207154, "grad_norm": 7.54735031423351, "learning_rate": 2.4968774620834886e-06, "loss": 1.4498, "num_input_tokens_seen": 618496, "step": 151 }, { "epoch": 0.11326378539493294, "grad_norm": 7.742435679687007, "learning_rate": 2.4968359845322595e-06, "loss": 1.1958, "num_input_tokens_seen": 622592, "step": 152 }, { "epoch": 0.11400894187779434, "grad_norm": 9.122505799280225, "learning_rate": 2.4967942336656786e-06, "loss": 1.5152, "num_input_tokens_seen": 626688, "step": 153 }, { "epoch": 0.11475409836065574, "grad_norm": 7.798035951563657, "learning_rate": 2.4967522094928988e-06, "loss": 1.706, "num_input_tokens_seen": 630784, "step": 154 }, { "epoch": 0.11549925484351714, "grad_norm": 10.628128508566592, "learning_rate": 2.4967099120231326e-06, "loss": 1.2176, "num_input_tokens_seen": 634880, "step": 155 }, { "epoch": 0.11624441132637854, "grad_norm": 9.544196195630795, "learning_rate": 2.4966673412656513e-06, "loss": 1.8425, "num_input_tokens_seen": 638976, "step": 156 }, { "epoch": 0.11698956780923994, "grad_norm": 7.788402370952136, "learning_rate": 2.4966244972297867e-06, "loss": 1.6235, "num_input_tokens_seen": 643072, "step": 157 }, { "epoch": 0.11773472429210134, "grad_norm": 6.311117085710485, "learning_rate": 2.49658137992493e-06, "loss": 1.4302, "num_input_tokens_seen": 647168, "step": 158 }, { "epoch": 0.11847988077496274, "grad_norm": 7.454361494986267, "learning_rate": 2.496537989360534e-06, "loss": 1.4197, "num_input_tokens_seen": 651264, "step": 159 }, { "epoch": 0.11922503725782414, "grad_norm": 6.986257715452968, "learning_rate": 2.4964943255461095e-06, "loss": 1.6116, "num_input_tokens_seen": 655360, "step": 160 }, { "epoch": 0.11997019374068554, "grad_norm": 8.795566666513922, "learning_rate": 2.4964503884912285e-06, "loss": 1.5379, "num_input_tokens_seen": 659456, "step": 161 }, { "epoch": 0.12071535022354694, "grad_norm": 7.158241561313038, "learning_rate": 2.4964061782055216e-06, "loss": 1.8032, "num_input_tokens_seen": 663552, "step": 162 }, { "epoch": 0.12146050670640834, "grad_norm": 10.716990885474155, "learning_rate": 2.4963616946986803e-06, "loss": 1.1715, "num_input_tokens_seen": 667648, "step": 163 }, { "epoch": 0.12220566318926974, "grad_norm": 11.54729211604458, "learning_rate": 2.496316937980455e-06, "loss": 1.7125, "num_input_tokens_seen": 671744, "step": 164 }, { "epoch": 0.12295081967213115, "grad_norm": 7.531900933759593, "learning_rate": 2.4962719080606584e-06, "loss": 1.3531, "num_input_tokens_seen": 675840, "step": 165 }, { "epoch": 0.12369597615499255, "grad_norm": 7.29382658277802, "learning_rate": 2.49622660494916e-06, "loss": 1.5327, "num_input_tokens_seen": 679936, "step": 166 }, { "epoch": 0.12444113263785395, "grad_norm": 9.957921203477792, "learning_rate": 2.496181028655891e-06, "loss": 1.5877, "num_input_tokens_seen": 684032, "step": 167 }, { "epoch": 0.12518628912071536, "grad_norm": 7.356819639671935, "learning_rate": 2.4961351791908424e-06, "loss": 1.3713, "num_input_tokens_seen": 688128, "step": 168 }, { "epoch": 0.12593144560357675, "grad_norm": 6.942971958241856, "learning_rate": 2.496089056564064e-06, "loss": 1.7287, "num_input_tokens_seen": 692224, "step": 169 }, { "epoch": 0.12667660208643816, "grad_norm": 10.962651644259884, "learning_rate": 2.496042660785666e-06, "loss": 1.4257, "num_input_tokens_seen": 696320, "step": 170 }, { "epoch": 0.12742175856929955, "grad_norm": 7.232939820956084, "learning_rate": 2.4959959918658196e-06, "loss": 1.7821, "num_input_tokens_seen": 700416, "step": 171 }, { "epoch": 0.12816691505216096, "grad_norm": 9.344209490082678, "learning_rate": 2.4959490498147547e-06, "loss": 1.6067, "num_input_tokens_seen": 704512, "step": 172 }, { "epoch": 0.12891207153502235, "grad_norm": 18.601643687230112, "learning_rate": 2.495901834642761e-06, "loss": 1.6112, "num_input_tokens_seen": 708608, "step": 173 }, { "epoch": 0.12965722801788376, "grad_norm": 7.793688939996316, "learning_rate": 2.4958543463601888e-06, "loss": 1.7217, "num_input_tokens_seen": 712704, "step": 174 }, { "epoch": 0.13040238450074515, "grad_norm": 6.825138990205076, "learning_rate": 2.4958065849774477e-06, "loss": 1.6066, "num_input_tokens_seen": 716800, "step": 175 }, { "epoch": 0.13114754098360656, "grad_norm": 7.293457468054337, "learning_rate": 2.4957585505050075e-06, "loss": 1.7766, "num_input_tokens_seen": 720896, "step": 176 }, { "epoch": 0.13189269746646795, "grad_norm": 10.834244606309243, "learning_rate": 2.4957102429533968e-06, "loss": 1.7861, "num_input_tokens_seen": 724992, "step": 177 }, { "epoch": 0.13263785394932937, "grad_norm": 7.271049708830555, "learning_rate": 2.4956616623332064e-06, "loss": 1.7738, "num_input_tokens_seen": 729088, "step": 178 }, { "epoch": 0.13338301043219075, "grad_norm": 7.923659068046659, "learning_rate": 2.4956128086550844e-06, "loss": 1.6766, "num_input_tokens_seen": 733184, "step": 179 }, { "epoch": 0.13412816691505217, "grad_norm": 9.01094403640712, "learning_rate": 2.495563681929741e-06, "loss": 1.3119, "num_input_tokens_seen": 737280, "step": 180 }, { "epoch": 0.13487332339791355, "grad_norm": 8.16591793484469, "learning_rate": 2.4955142821679433e-06, "loss": 1.4729, "num_input_tokens_seen": 741376, "step": 181 }, { "epoch": 0.13561847988077497, "grad_norm": 10.423765281518424, "learning_rate": 2.495464609380522e-06, "loss": 1.3914, "num_input_tokens_seen": 745472, "step": 182 }, { "epoch": 0.13636363636363635, "grad_norm": 7.424722714568153, "learning_rate": 2.4954146635783645e-06, "loss": 1.6569, "num_input_tokens_seen": 749568, "step": 183 }, { "epoch": 0.13710879284649777, "grad_norm": 8.045726618791438, "learning_rate": 2.4953644447724196e-06, "loss": 1.5013, "num_input_tokens_seen": 753664, "step": 184 }, { "epoch": 0.13785394932935915, "grad_norm": 15.903449436682694, "learning_rate": 2.495313952973696e-06, "loss": 1.5253, "num_input_tokens_seen": 757760, "step": 185 }, { "epoch": 0.13859910581222057, "grad_norm": 7.184884872986108, "learning_rate": 2.4952631881932615e-06, "loss": 1.2854, "num_input_tokens_seen": 761856, "step": 186 }, { "epoch": 0.13934426229508196, "grad_norm": 8.294423314491231, "learning_rate": 2.495212150442244e-06, "loss": 1.2934, "num_input_tokens_seen": 765952, "step": 187 }, { "epoch": 0.14008941877794337, "grad_norm": 7.680259139545721, "learning_rate": 2.4951608397318316e-06, "loss": 1.4835, "num_input_tokens_seen": 770048, "step": 188 }, { "epoch": 0.14083457526080476, "grad_norm": 6.584660600700139, "learning_rate": 2.495109256073272e-06, "loss": 1.5125, "num_input_tokens_seen": 774144, "step": 189 }, { "epoch": 0.14157973174366617, "grad_norm": 8.721309358449936, "learning_rate": 2.4950573994778725e-06, "loss": 1.332, "num_input_tokens_seen": 778240, "step": 190 }, { "epoch": 0.14232488822652756, "grad_norm": 8.211693044696228, "learning_rate": 2.495005269957001e-06, "loss": 1.505, "num_input_tokens_seen": 782336, "step": 191 }, { "epoch": 0.14307004470938897, "grad_norm": 7.332015229578448, "learning_rate": 2.4949528675220836e-06, "loss": 1.4962, "num_input_tokens_seen": 786432, "step": 192 }, { "epoch": 0.14381520119225039, "grad_norm": 8.063396300455713, "learning_rate": 2.494900192184608e-06, "loss": 1.4783, "num_input_tokens_seen": 790528, "step": 193 }, { "epoch": 0.14456035767511177, "grad_norm": 8.333240564972348, "learning_rate": 2.4948472439561212e-06, "loss": 1.578, "num_input_tokens_seen": 794624, "step": 194 }, { "epoch": 0.1453055141579732, "grad_norm": 12.370808108185944, "learning_rate": 2.4947940228482293e-06, "loss": 1.6124, "num_input_tokens_seen": 798720, "step": 195 }, { "epoch": 0.14605067064083457, "grad_norm": 14.586506242404822, "learning_rate": 2.4947405288725986e-06, "loss": 1.7206, "num_input_tokens_seen": 802816, "step": 196 }, { "epoch": 0.146795827123696, "grad_norm": 7.542391366941157, "learning_rate": 2.4946867620409562e-06, "loss": 1.8828, "num_input_tokens_seen": 806912, "step": 197 }, { "epoch": 0.14754098360655737, "grad_norm": 8.277568137964783, "learning_rate": 2.494632722365088e-06, "loss": 1.6511, "num_input_tokens_seen": 811008, "step": 198 }, { "epoch": 0.1482861400894188, "grad_norm": 8.705772296374711, "learning_rate": 2.4945784098568388e-06, "loss": 1.5511, "num_input_tokens_seen": 815104, "step": 199 }, { "epoch": 0.14903129657228018, "grad_norm": 5.8073161369163495, "learning_rate": 2.4945238245281153e-06, "loss": 1.5366, "num_input_tokens_seen": 819200, "step": 200 }, { "epoch": 0.1497764530551416, "grad_norm": 6.663853752980503, "learning_rate": 2.494468966390883e-06, "loss": 1.5477, "num_input_tokens_seen": 823296, "step": 201 }, { "epoch": 0.15052160953800298, "grad_norm": 13.606570630717384, "learning_rate": 2.4944138354571667e-06, "loss": 1.7468, "num_input_tokens_seen": 827392, "step": 202 }, { "epoch": 0.1512667660208644, "grad_norm": 8.748518627464817, "learning_rate": 2.494358431739052e-06, "loss": 1.3275, "num_input_tokens_seen": 831488, "step": 203 }, { "epoch": 0.15201192250372578, "grad_norm": 8.081209900510014, "learning_rate": 2.4943027552486837e-06, "loss": 1.2918, "num_input_tokens_seen": 835584, "step": 204 }, { "epoch": 0.1527570789865872, "grad_norm": 6.785929474190173, "learning_rate": 2.494246805998266e-06, "loss": 1.544, "num_input_tokens_seen": 839680, "step": 205 }, { "epoch": 0.15350223546944858, "grad_norm": 7.5104773413610495, "learning_rate": 2.494190584000064e-06, "loss": 1.6492, "num_input_tokens_seen": 843776, "step": 206 }, { "epoch": 0.15424739195231, "grad_norm": 7.39664400668688, "learning_rate": 2.494134089266401e-06, "loss": 1.5733, "num_input_tokens_seen": 847872, "step": 207 }, { "epoch": 0.15499254843517138, "grad_norm": 9.34086463310229, "learning_rate": 2.4940773218096625e-06, "loss": 1.5674, "num_input_tokens_seen": 851968, "step": 208 }, { "epoch": 0.1557377049180328, "grad_norm": 8.376871588539117, "learning_rate": 2.494020281642291e-06, "loss": 1.7445, "num_input_tokens_seen": 856064, "step": 209 }, { "epoch": 0.15648286140089418, "grad_norm": 7.870138155009764, "learning_rate": 2.493962968776791e-06, "loss": 1.5282, "num_input_tokens_seen": 860160, "step": 210 }, { "epoch": 0.1572280178837556, "grad_norm": 8.276909111229886, "learning_rate": 2.4939053832257255e-06, "loss": 1.5178, "num_input_tokens_seen": 864256, "step": 211 }, { "epoch": 0.15797317436661698, "grad_norm": 12.25501920677344, "learning_rate": 2.493847525001718e-06, "loss": 1.5608, "num_input_tokens_seen": 868352, "step": 212 }, { "epoch": 0.1587183308494784, "grad_norm": 10.803605636664724, "learning_rate": 2.493789394117451e-06, "loss": 1.6601, "num_input_tokens_seen": 872448, "step": 213 }, { "epoch": 0.15946348733233978, "grad_norm": 8.681073681334047, "learning_rate": 2.4937309905856673e-06, "loss": 1.3614, "num_input_tokens_seen": 876544, "step": 214 }, { "epoch": 0.1602086438152012, "grad_norm": 7.766852380710654, "learning_rate": 2.493672314419169e-06, "loss": 1.5145, "num_input_tokens_seen": 880640, "step": 215 }, { "epoch": 0.16095380029806258, "grad_norm": 7.779285681328385, "learning_rate": 2.493613365630819e-06, "loss": 1.5232, "num_input_tokens_seen": 884736, "step": 216 }, { "epoch": 0.161698956780924, "grad_norm": 7.718716888235315, "learning_rate": 2.493554144233539e-06, "loss": 1.5342, "num_input_tokens_seen": 888832, "step": 217 }, { "epoch": 0.16244411326378538, "grad_norm": 12.805603248498695, "learning_rate": 2.493494650240311e-06, "loss": 1.4846, "num_input_tokens_seen": 892928, "step": 218 }, { "epoch": 0.1631892697466468, "grad_norm": 12.725097775795875, "learning_rate": 2.4934348836641765e-06, "loss": 1.5216, "num_input_tokens_seen": 897024, "step": 219 }, { "epoch": 0.16393442622950818, "grad_norm": 7.024119667044165, "learning_rate": 2.4933748445182367e-06, "loss": 1.6706, "num_input_tokens_seen": 901120, "step": 220 }, { "epoch": 0.1646795827123696, "grad_norm": 7.82586013606585, "learning_rate": 2.493314532815652e-06, "loss": 1.7307, "num_input_tokens_seen": 905216, "step": 221 }, { "epoch": 0.16542473919523099, "grad_norm": 6.6449718041980494, "learning_rate": 2.4932539485696438e-06, "loss": 1.606, "num_input_tokens_seen": 909312, "step": 222 }, { "epoch": 0.1661698956780924, "grad_norm": 8.09914568624943, "learning_rate": 2.4931930917934926e-06, "loss": 1.6733, "num_input_tokens_seen": 913408, "step": 223 }, { "epoch": 0.16691505216095381, "grad_norm": 7.297259110142969, "learning_rate": 2.4931319625005385e-06, "loss": 1.5496, "num_input_tokens_seen": 917504, "step": 224 }, { "epoch": 0.1676602086438152, "grad_norm": 7.688340047797646, "learning_rate": 2.493070560704181e-06, "loss": 1.6427, "num_input_tokens_seen": 921600, "step": 225 }, { "epoch": 0.16840536512667661, "grad_norm": 7.239044040624734, "learning_rate": 2.4930088864178807e-06, "loss": 1.4676, "num_input_tokens_seen": 925696, "step": 226 }, { "epoch": 0.169150521609538, "grad_norm": 8.933692195084062, "learning_rate": 2.4929469396551563e-06, "loss": 1.7631, "num_input_tokens_seen": 929792, "step": 227 }, { "epoch": 0.16989567809239942, "grad_norm": 12.008626828365415, "learning_rate": 2.492884720429588e-06, "loss": 1.3236, "num_input_tokens_seen": 933888, "step": 228 }, { "epoch": 0.1706408345752608, "grad_norm": 7.490539194336787, "learning_rate": 2.4928222287548133e-06, "loss": 1.3123, "num_input_tokens_seen": 937984, "step": 229 }, { "epoch": 0.17138599105812222, "grad_norm": 6.896228113515029, "learning_rate": 2.492759464644532e-06, "loss": 1.702, "num_input_tokens_seen": 942080, "step": 230 }, { "epoch": 0.1721311475409836, "grad_norm": 6.9190505305287395, "learning_rate": 2.4926964281125017e-06, "loss": 1.5967, "num_input_tokens_seen": 946176, "step": 231 }, { "epoch": 0.17287630402384502, "grad_norm": 8.17102161039853, "learning_rate": 2.4926331191725406e-06, "loss": 1.3976, "num_input_tokens_seen": 950272, "step": 232 }, { "epoch": 0.1736214605067064, "grad_norm": 7.986997722934815, "learning_rate": 2.4925695378385266e-06, "loss": 1.0019, "num_input_tokens_seen": 954368, "step": 233 }, { "epoch": 0.17436661698956782, "grad_norm": 8.820227837035757, "learning_rate": 2.492505684124397e-06, "loss": 1.2974, "num_input_tokens_seen": 958464, "step": 234 }, { "epoch": 0.1751117734724292, "grad_norm": 12.39835371951438, "learning_rate": 2.4924415580441497e-06, "loss": 1.2614, "num_input_tokens_seen": 962560, "step": 235 }, { "epoch": 0.17585692995529062, "grad_norm": 7.673828353618831, "learning_rate": 2.4923771596118406e-06, "loss": 1.4689, "num_input_tokens_seen": 966656, "step": 236 }, { "epoch": 0.176602086438152, "grad_norm": 10.976146237003118, "learning_rate": 2.492312488841587e-06, "loss": 1.6646, "num_input_tokens_seen": 970752, "step": 237 }, { "epoch": 0.17734724292101342, "grad_norm": 9.620620180407197, "learning_rate": 2.4922475457475646e-06, "loss": 1.4991, "num_input_tokens_seen": 974848, "step": 238 }, { "epoch": 0.1780923994038748, "grad_norm": 8.492054235407682, "learning_rate": 2.4921823303440103e-06, "loss": 1.5118, "num_input_tokens_seen": 978944, "step": 239 }, { "epoch": 0.17883755588673622, "grad_norm": 8.421534482939563, "learning_rate": 2.4921168426452186e-06, "loss": 1.6395, "num_input_tokens_seen": 983040, "step": 240 }, { "epoch": 0.1795827123695976, "grad_norm": 6.786203490875633, "learning_rate": 2.492051082665546e-06, "loss": 1.6805, "num_input_tokens_seen": 987136, "step": 241 }, { "epoch": 0.18032786885245902, "grad_norm": 6.5695989243150725, "learning_rate": 2.491985050419407e-06, "loss": 1.4206, "num_input_tokens_seen": 991232, "step": 242 }, { "epoch": 0.1810730253353204, "grad_norm": 9.230043033203195, "learning_rate": 2.491918745921276e-06, "loss": 1.2479, "num_input_tokens_seen": 995328, "step": 243 }, { "epoch": 0.18181818181818182, "grad_norm": 8.372870656060616, "learning_rate": 2.4918521691856877e-06, "loss": 1.3356, "num_input_tokens_seen": 999424, "step": 244 }, { "epoch": 0.1825633383010432, "grad_norm": 7.127049673852751, "learning_rate": 2.491785320227237e-06, "loss": 1.6335, "num_input_tokens_seen": 1003520, "step": 245 }, { "epoch": 0.18330849478390462, "grad_norm": 14.133153876248928, "learning_rate": 2.4917181990605767e-06, "loss": 1.738, "num_input_tokens_seen": 1007616, "step": 246 }, { "epoch": 0.184053651266766, "grad_norm": 11.89681013211735, "learning_rate": 2.49165080570042e-06, "loss": 1.5037, "num_input_tokens_seen": 1011712, "step": 247 }, { "epoch": 0.18479880774962743, "grad_norm": 7.422742925433924, "learning_rate": 2.491583140161541e-06, "loss": 1.7546, "num_input_tokens_seen": 1015808, "step": 248 }, { "epoch": 0.1855439642324888, "grad_norm": 8.795346188953673, "learning_rate": 2.4915152024587718e-06, "loss": 1.438, "num_input_tokens_seen": 1019904, "step": 249 }, { "epoch": 0.18628912071535023, "grad_norm": 9.186243455774916, "learning_rate": 2.491446992607005e-06, "loss": 1.2943, "num_input_tokens_seen": 1024000, "step": 250 }, { "epoch": 0.1870342771982116, "grad_norm": 6.904991200905438, "learning_rate": 2.491378510621193e-06, "loss": 1.557, "num_input_tokens_seen": 1028096, "step": 251 }, { "epoch": 0.18777943368107303, "grad_norm": 8.276922415010644, "learning_rate": 2.491309756516347e-06, "loss": 1.3775, "num_input_tokens_seen": 1032192, "step": 252 }, { "epoch": 0.1885245901639344, "grad_norm": 11.967478893280857, "learning_rate": 2.4912407303075387e-06, "loss": 1.4781, "num_input_tokens_seen": 1036288, "step": 253 }, { "epoch": 0.18926974664679583, "grad_norm": 9.854318510958349, "learning_rate": 2.491171432009899e-06, "loss": 1.4287, "num_input_tokens_seen": 1040384, "step": 254 }, { "epoch": 0.19001490312965721, "grad_norm": 7.824504425835976, "learning_rate": 2.4911018616386184e-06, "loss": 1.6187, "num_input_tokens_seen": 1044480, "step": 255 }, { "epoch": 0.19076005961251863, "grad_norm": 7.315739076430665, "learning_rate": 2.4910320192089475e-06, "loss": 1.1897, "num_input_tokens_seen": 1048576, "step": 256 }, { "epoch": 0.19150521609538004, "grad_norm": 9.896444698629088, "learning_rate": 2.490961904736197e-06, "loss": 1.5343, "num_input_tokens_seen": 1052672, "step": 257 }, { "epoch": 0.19225037257824143, "grad_norm": 11.938016399853185, "learning_rate": 2.4908915182357353e-06, "loss": 1.2896, "num_input_tokens_seen": 1056768, "step": 258 }, { "epoch": 0.19299552906110284, "grad_norm": 6.339494262697079, "learning_rate": 2.4908208597229916e-06, "loss": 1.548, "num_input_tokens_seen": 1060864, "step": 259 }, { "epoch": 0.19374068554396423, "grad_norm": 7.608398456439894, "learning_rate": 2.4907499292134556e-06, "loss": 1.5618, "num_input_tokens_seen": 1064960, "step": 260 }, { "epoch": 0.19448584202682564, "grad_norm": 10.56841382629605, "learning_rate": 2.4906787267226752e-06, "loss": 1.2519, "num_input_tokens_seen": 1069056, "step": 261 }, { "epoch": 0.19523099850968703, "grad_norm": 7.443214059821784, "learning_rate": 2.490607252266259e-06, "loss": 1.5084, "num_input_tokens_seen": 1073152, "step": 262 }, { "epoch": 0.19597615499254845, "grad_norm": 7.328166277137504, "learning_rate": 2.490535505859874e-06, "loss": 1.7471, "num_input_tokens_seen": 1077248, "step": 263 }, { "epoch": 0.19672131147540983, "grad_norm": 14.308542042893215, "learning_rate": 2.490463487519248e-06, "loss": 1.4647, "num_input_tokens_seen": 1081344, "step": 264 }, { "epoch": 0.19746646795827125, "grad_norm": 8.316200966959025, "learning_rate": 2.490391197260168e-06, "loss": 0.9944, "num_input_tokens_seen": 1085440, "step": 265 }, { "epoch": 0.19821162444113263, "grad_norm": 13.635147629873172, "learning_rate": 2.49031863509848e-06, "loss": 1.4426, "num_input_tokens_seen": 1089536, "step": 266 }, { "epoch": 0.19895678092399405, "grad_norm": 7.982654147155753, "learning_rate": 2.490245801050091e-06, "loss": 1.4764, "num_input_tokens_seen": 1093632, "step": 267 }, { "epoch": 0.19970193740685543, "grad_norm": 8.580294742296367, "learning_rate": 2.4901726951309657e-06, "loss": 1.3711, "num_input_tokens_seen": 1097728, "step": 268 }, { "epoch": 0.20044709388971685, "grad_norm": 11.022908239860254, "learning_rate": 2.4900993173571303e-06, "loss": 1.6872, "num_input_tokens_seen": 1101824, "step": 269 }, { "epoch": 0.20119225037257824, "grad_norm": 7.966470507879765, "learning_rate": 2.4900256677446698e-06, "loss": 1.8193, "num_input_tokens_seen": 1105920, "step": 270 }, { "epoch": 0.20193740685543965, "grad_norm": 7.03264307776241, "learning_rate": 2.4899517463097277e-06, "loss": 1.3367, "num_input_tokens_seen": 1110016, "step": 271 }, { "epoch": 0.20268256333830104, "grad_norm": 6.9181456593112935, "learning_rate": 2.489877553068509e-06, "loss": 1.7942, "num_input_tokens_seen": 1114112, "step": 272 }, { "epoch": 0.20342771982116245, "grad_norm": 9.335411572886224, "learning_rate": 2.4898030880372775e-06, "loss": 1.1166, "num_input_tokens_seen": 1118208, "step": 273 }, { "epoch": 0.20417287630402384, "grad_norm": 11.032016365674766, "learning_rate": 2.4897283512323556e-06, "loss": 1.1182, "num_input_tokens_seen": 1122304, "step": 274 }, { "epoch": 0.20491803278688525, "grad_norm": 13.817866756291009, "learning_rate": 2.4896533426701267e-06, "loss": 1.6443, "num_input_tokens_seen": 1126400, "step": 275 }, { "epoch": 0.20566318926974664, "grad_norm": 9.275522593708047, "learning_rate": 2.489578062367034e-06, "loss": 1.503, "num_input_tokens_seen": 1130496, "step": 276 }, { "epoch": 0.20640834575260805, "grad_norm": 9.126913375397294, "learning_rate": 2.4895025103395777e-06, "loss": 1.7784, "num_input_tokens_seen": 1134592, "step": 277 }, { "epoch": 0.20715350223546944, "grad_norm": 9.33410321616298, "learning_rate": 2.4894266866043206e-06, "loss": 1.2738, "num_input_tokens_seen": 1138688, "step": 278 }, { "epoch": 0.20789865871833085, "grad_norm": 10.946341890158866, "learning_rate": 2.4893505911778835e-06, "loss": 1.5221, "num_input_tokens_seen": 1142784, "step": 279 }, { "epoch": 0.20864381520119224, "grad_norm": 7.4930150152670105, "learning_rate": 2.4892742240769476e-06, "loss": 1.6493, "num_input_tokens_seen": 1146880, "step": 280 }, { "epoch": 0.20938897168405365, "grad_norm": 8.851339302851088, "learning_rate": 2.4891975853182513e-06, "loss": 1.2188, "num_input_tokens_seen": 1150976, "step": 281 }, { "epoch": 0.21013412816691504, "grad_norm": 9.233311153475158, "learning_rate": 2.489120674918597e-06, "loss": 1.3961, "num_input_tokens_seen": 1155072, "step": 282 }, { "epoch": 0.21087928464977646, "grad_norm": 7.863005085263938, "learning_rate": 2.4890434928948416e-06, "loss": 1.4742, "num_input_tokens_seen": 1159168, "step": 283 }, { "epoch": 0.21162444113263784, "grad_norm": 16.749900117528174, "learning_rate": 2.4889660392639058e-06, "loss": 1.3562, "num_input_tokens_seen": 1163264, "step": 284 }, { "epoch": 0.21236959761549926, "grad_norm": 11.135557966882219, "learning_rate": 2.4888883140427664e-06, "loss": 1.3346, "num_input_tokens_seen": 1167360, "step": 285 }, { "epoch": 0.21311475409836064, "grad_norm": 8.521090947644629, "learning_rate": 2.488810317248463e-06, "loss": 1.3977, "num_input_tokens_seen": 1171456, "step": 286 }, { "epoch": 0.21385991058122206, "grad_norm": 6.902153022976627, "learning_rate": 2.4887320488980914e-06, "loss": 1.5056, "num_input_tokens_seen": 1175552, "step": 287 }, { "epoch": 0.21460506706408347, "grad_norm": 9.892864051862897, "learning_rate": 2.48865350900881e-06, "loss": 1.5831, "num_input_tokens_seen": 1179648, "step": 288 }, { "epoch": 0.21535022354694486, "grad_norm": 9.26061917974438, "learning_rate": 2.4885746975978344e-06, "loss": 1.5339, "num_input_tokens_seen": 1183744, "step": 289 }, { "epoch": 0.21609538002980627, "grad_norm": 7.569492840892105, "learning_rate": 2.48849561468244e-06, "loss": 1.125, "num_input_tokens_seen": 1187840, "step": 290 }, { "epoch": 0.21684053651266766, "grad_norm": 7.023225090455169, "learning_rate": 2.4884162602799646e-06, "loss": 1.573, "num_input_tokens_seen": 1191936, "step": 291 }, { "epoch": 0.21758569299552907, "grad_norm": 8.934111948575401, "learning_rate": 2.4883366344078007e-06, "loss": 1.0401, "num_input_tokens_seen": 1196032, "step": 292 }, { "epoch": 0.21833084947839046, "grad_norm": 7.638068540581907, "learning_rate": 2.488256737083405e-06, "loss": 1.2717, "num_input_tokens_seen": 1200128, "step": 293 }, { "epoch": 0.21907600596125187, "grad_norm": 10.3016105044919, "learning_rate": 2.48817656832429e-06, "loss": 1.334, "num_input_tokens_seen": 1204224, "step": 294 }, { "epoch": 0.21982116244411326, "grad_norm": 8.987091045543854, "learning_rate": 2.48809612814803e-06, "loss": 1.1574, "num_input_tokens_seen": 1208320, "step": 295 }, { "epoch": 0.22056631892697467, "grad_norm": 7.46324069318026, "learning_rate": 2.488015416572258e-06, "loss": 1.4306, "num_input_tokens_seen": 1212416, "step": 296 }, { "epoch": 0.22131147540983606, "grad_norm": 10.230462476448658, "learning_rate": 2.487934433614666e-06, "loss": 1.5369, "num_input_tokens_seen": 1216512, "step": 297 }, { "epoch": 0.22205663189269748, "grad_norm": 9.583923428302073, "learning_rate": 2.4878531792930074e-06, "loss": 1.3197, "num_input_tokens_seen": 1220608, "step": 298 }, { "epoch": 0.22280178837555886, "grad_norm": 7.711192879863928, "learning_rate": 2.4877716536250922e-06, "loss": 1.2565, "num_input_tokens_seen": 1224704, "step": 299 }, { "epoch": 0.22354694485842028, "grad_norm": 9.352483850608843, "learning_rate": 2.4876898566287923e-06, "loss": 1.5078, "num_input_tokens_seen": 1228800, "step": 300 }, { "epoch": 0.22429210134128166, "grad_norm": 25.224714692754166, "learning_rate": 2.4876077883220377e-06, "loss": 1.5352, "num_input_tokens_seen": 1232896, "step": 301 }, { "epoch": 0.22503725782414308, "grad_norm": 10.323945701515207, "learning_rate": 2.4875254487228184e-06, "loss": 1.7399, "num_input_tokens_seen": 1236992, "step": 302 }, { "epoch": 0.22578241430700446, "grad_norm": 6.726687282526841, "learning_rate": 2.4874428378491844e-06, "loss": 1.4718, "num_input_tokens_seen": 1241088, "step": 303 }, { "epoch": 0.22652757078986588, "grad_norm": 10.74996640090697, "learning_rate": 2.487359955719244e-06, "loss": 1.6293, "num_input_tokens_seen": 1245184, "step": 304 }, { "epoch": 0.22727272727272727, "grad_norm": 10.42151225730984, "learning_rate": 2.487276802351166e-06, "loss": 1.0553, "num_input_tokens_seen": 1249280, "step": 305 }, { "epoch": 0.22801788375558868, "grad_norm": 8.245578046625896, "learning_rate": 2.487193377763178e-06, "loss": 1.5699, "num_input_tokens_seen": 1253376, "step": 306 }, { "epoch": 0.22876304023845007, "grad_norm": 7.743145867112127, "learning_rate": 2.487109681973567e-06, "loss": 1.4865, "num_input_tokens_seen": 1257472, "step": 307 }, { "epoch": 0.22950819672131148, "grad_norm": 7.55465802347297, "learning_rate": 2.48702571500068e-06, "loss": 1.6482, "num_input_tokens_seen": 1261568, "step": 308 }, { "epoch": 0.23025335320417287, "grad_norm": 11.424004984621222, "learning_rate": 2.4869414768629237e-06, "loss": 1.1673, "num_input_tokens_seen": 1265664, "step": 309 }, { "epoch": 0.23099850968703428, "grad_norm": 7.382458079616633, "learning_rate": 2.4868569675787625e-06, "loss": 1.6454, "num_input_tokens_seen": 1269760, "step": 310 }, { "epoch": 0.23174366616989567, "grad_norm": 11.489012204115111, "learning_rate": 2.486772187166722e-06, "loss": 1.3966, "num_input_tokens_seen": 1273856, "step": 311 }, { "epoch": 0.23248882265275708, "grad_norm": 10.234873183097795, "learning_rate": 2.486687135645387e-06, "loss": 1.4258, "num_input_tokens_seen": 1277952, "step": 312 }, { "epoch": 0.23323397913561847, "grad_norm": 7.2425771683510165, "learning_rate": 2.486601813033401e-06, "loss": 1.3126, "num_input_tokens_seen": 1282048, "step": 313 }, { "epoch": 0.23397913561847988, "grad_norm": 9.712013499662069, "learning_rate": 2.4865162193494675e-06, "loss": 0.9145, "num_input_tokens_seen": 1286144, "step": 314 }, { "epoch": 0.23472429210134127, "grad_norm": 7.05839715544482, "learning_rate": 2.486430354612349e-06, "loss": 1.514, "num_input_tokens_seen": 1290240, "step": 315 }, { "epoch": 0.23546944858420268, "grad_norm": 8.283986619376204, "learning_rate": 2.486344218840868e-06, "loss": 1.1735, "num_input_tokens_seen": 1294336, "step": 316 }, { "epoch": 0.23621460506706407, "grad_norm": 6.927542958461139, "learning_rate": 2.486257812053906e-06, "loss": 1.5333, "num_input_tokens_seen": 1298432, "step": 317 }, { "epoch": 0.23695976154992549, "grad_norm": 13.821452158453605, "learning_rate": 2.4861711342704044e-06, "loss": 1.4778, "num_input_tokens_seen": 1302528, "step": 318 }, { "epoch": 0.23770491803278687, "grad_norm": 24.825034778911576, "learning_rate": 2.4860841855093628e-06, "loss": 1.7696, "num_input_tokens_seen": 1306624, "step": 319 }, { "epoch": 0.23845007451564829, "grad_norm": 6.733414905348623, "learning_rate": 2.4859969657898408e-06, "loss": 1.6006, "num_input_tokens_seen": 1310720, "step": 320 }, { "epoch": 0.2391952309985097, "grad_norm": 7.905861960597136, "learning_rate": 2.4859094751309584e-06, "loss": 1.5358, "num_input_tokens_seen": 1314816, "step": 321 }, { "epoch": 0.2399403874813711, "grad_norm": 8.068244187879031, "learning_rate": 2.485821713551894e-06, "loss": 1.687, "num_input_tokens_seen": 1318912, "step": 322 }, { "epoch": 0.2406855439642325, "grad_norm": 7.867463994324491, "learning_rate": 2.485733681071885e-06, "loss": 1.6788, "num_input_tokens_seen": 1323008, "step": 323 }, { "epoch": 0.2414307004470939, "grad_norm": 7.829259376840234, "learning_rate": 2.4856453777102296e-06, "loss": 1.483, "num_input_tokens_seen": 1327104, "step": 324 }, { "epoch": 0.2421758569299553, "grad_norm": 7.059881543361948, "learning_rate": 2.4855568034862842e-06, "loss": 1.2704, "num_input_tokens_seen": 1331200, "step": 325 }, { "epoch": 0.2429210134128167, "grad_norm": 14.569644472970428, "learning_rate": 2.4854679584194645e-06, "loss": 1.8306, "num_input_tokens_seen": 1335296, "step": 326 }, { "epoch": 0.2436661698956781, "grad_norm": 9.892678596017578, "learning_rate": 2.485378842529247e-06, "loss": 1.1652, "num_input_tokens_seen": 1339392, "step": 327 }, { "epoch": 0.2444113263785395, "grad_norm": 9.312209934184352, "learning_rate": 2.485289455835165e-06, "loss": 1.482, "num_input_tokens_seen": 1343488, "step": 328 }, { "epoch": 0.2451564828614009, "grad_norm": 14.29305472673219, "learning_rate": 2.4851997983568137e-06, "loss": 1.57, "num_input_tokens_seen": 1347584, "step": 329 }, { "epoch": 0.2459016393442623, "grad_norm": 8.859452594688351, "learning_rate": 2.4851098701138465e-06, "loss": 1.5237, "num_input_tokens_seen": 1351680, "step": 330 }, { "epoch": 0.2466467958271237, "grad_norm": 9.413494845915434, "learning_rate": 2.4850196711259765e-06, "loss": 1.3897, "num_input_tokens_seen": 1355776, "step": 331 }, { "epoch": 0.2473919523099851, "grad_norm": 7.048157837457402, "learning_rate": 2.4849292014129756e-06, "loss": 1.5734, "num_input_tokens_seen": 1359872, "step": 332 }, { "epoch": 0.2481371087928465, "grad_norm": 8.401540592581778, "learning_rate": 2.4848384609946755e-06, "loss": 1.1313, "num_input_tokens_seen": 1363968, "step": 333 }, { "epoch": 0.2488822652757079, "grad_norm": 8.64436178694505, "learning_rate": 2.4847474498909675e-06, "loss": 1.1235, "num_input_tokens_seen": 1368064, "step": 334 }, { "epoch": 0.2496274217585693, "grad_norm": 9.198498554728829, "learning_rate": 2.4846561681218016e-06, "loss": 1.1531, "num_input_tokens_seen": 1372160, "step": 335 }, { "epoch": 0.2503725782414307, "grad_norm": 10.927757765765678, "learning_rate": 2.484564615707187e-06, "loss": 1.7077, "num_input_tokens_seen": 1376256, "step": 336 }, { "epoch": 0.2511177347242921, "grad_norm": 7.033162858836843, "learning_rate": 2.4844727926671935e-06, "loss": 1.5824, "num_input_tokens_seen": 1380352, "step": 337 }, { "epoch": 0.2518628912071535, "grad_norm": 7.685349127327716, "learning_rate": 2.4843806990219486e-06, "loss": 1.2452, "num_input_tokens_seen": 1384448, "step": 338 }, { "epoch": 0.2526080476900149, "grad_norm": 9.679591251890448, "learning_rate": 2.4842883347916403e-06, "loss": 1.223, "num_input_tokens_seen": 1388544, "step": 339 }, { "epoch": 0.2533532041728763, "grad_norm": 7.721919086413783, "learning_rate": 2.4841956999965157e-06, "loss": 1.4545, "num_input_tokens_seen": 1392640, "step": 340 }, { "epoch": 0.2540983606557377, "grad_norm": 6.997826334733301, "learning_rate": 2.4841027946568802e-06, "loss": 1.8993, "num_input_tokens_seen": 1396736, "step": 341 }, { "epoch": 0.2548435171385991, "grad_norm": 8.60396372833882, "learning_rate": 2.4840096187931e-06, "loss": 1.5224, "num_input_tokens_seen": 1400832, "step": 342 }, { "epoch": 0.2555886736214605, "grad_norm": 11.937509991350838, "learning_rate": 2.4839161724256e-06, "loss": 1.5111, "num_input_tokens_seen": 1404928, "step": 343 }, { "epoch": 0.2563338301043219, "grad_norm": 14.252262443984232, "learning_rate": 2.483822455574864e-06, "loss": 1.0978, "num_input_tokens_seen": 1409024, "step": 344 }, { "epoch": 0.2570789865871833, "grad_norm": 8.081609016211711, "learning_rate": 2.483728468261435e-06, "loss": 1.5529, "num_input_tokens_seen": 1413120, "step": 345 }, { "epoch": 0.2578241430700447, "grad_norm": 9.295443132654942, "learning_rate": 2.4836342105059167e-06, "loss": 1.3758, "num_input_tokens_seen": 1417216, "step": 346 }, { "epoch": 0.2585692995529061, "grad_norm": 7.148528960594102, "learning_rate": 2.4835396823289704e-06, "loss": 1.6892, "num_input_tokens_seen": 1421312, "step": 347 }, { "epoch": 0.2593144560357675, "grad_norm": 14.442381112171365, "learning_rate": 2.4834448837513175e-06, "loss": 1.6127, "num_input_tokens_seen": 1425408, "step": 348 }, { "epoch": 0.2600596125186289, "grad_norm": 6.911028149259434, "learning_rate": 2.483349814793738e-06, "loss": 1.3349, "num_input_tokens_seen": 1429504, "step": 349 }, { "epoch": 0.2608047690014903, "grad_norm": 9.33934042193373, "learning_rate": 2.483254475477073e-06, "loss": 1.3887, "num_input_tokens_seen": 1433600, "step": 350 }, { "epoch": 0.2615499254843517, "grad_norm": 5.783903487011262, "learning_rate": 2.48315886582222e-06, "loss": 1.0432, "num_input_tokens_seen": 1437696, "step": 351 }, { "epoch": 0.26229508196721313, "grad_norm": 9.084423276663472, "learning_rate": 2.4830629858501385e-06, "loss": 1.5336, "num_input_tokens_seen": 1441792, "step": 352 }, { "epoch": 0.2630402384500745, "grad_norm": 7.429435418985458, "learning_rate": 2.4829668355818454e-06, "loss": 1.1921, "num_input_tokens_seen": 1445888, "step": 353 }, { "epoch": 0.2637853949329359, "grad_norm": 8.670190784445474, "learning_rate": 2.482870415038418e-06, "loss": 1.4285, "num_input_tokens_seen": 1449984, "step": 354 }, { "epoch": 0.26453055141579734, "grad_norm": 14.16859574020646, "learning_rate": 2.482773724240992e-06, "loss": 1.4776, "num_input_tokens_seen": 1454080, "step": 355 }, { "epoch": 0.26527570789865873, "grad_norm": 7.266353641851579, "learning_rate": 2.482676763210763e-06, "loss": 1.6028, "num_input_tokens_seen": 1458176, "step": 356 }, { "epoch": 0.2660208643815201, "grad_norm": 9.118252262877593, "learning_rate": 2.482579531968985e-06, "loss": 1.5723, "num_input_tokens_seen": 1462272, "step": 357 }, { "epoch": 0.2667660208643815, "grad_norm": 9.041463970575792, "learning_rate": 2.482482030536973e-06, "loss": 1.3897, "num_input_tokens_seen": 1466368, "step": 358 }, { "epoch": 0.26751117734724295, "grad_norm": 8.02937558227738, "learning_rate": 2.4823842589360985e-06, "loss": 1.6181, "num_input_tokens_seen": 1470464, "step": 359 }, { "epoch": 0.26825633383010433, "grad_norm": 15.54268087311806, "learning_rate": 2.4822862171877946e-06, "loss": 1.6111, "num_input_tokens_seen": 1474560, "step": 360 }, { "epoch": 0.2690014903129657, "grad_norm": 8.003235651153588, "learning_rate": 2.4821879053135527e-06, "loss": 1.7431, "num_input_tokens_seen": 1478656, "step": 361 }, { "epoch": 0.2697466467958271, "grad_norm": 7.888716707952832, "learning_rate": 2.482089323334923e-06, "loss": 1.347, "num_input_tokens_seen": 1482752, "step": 362 }, { "epoch": 0.27049180327868855, "grad_norm": 11.706253162297381, "learning_rate": 2.4819904712735162e-06, "loss": 1.3662, "num_input_tokens_seen": 1486848, "step": 363 }, { "epoch": 0.27123695976154993, "grad_norm": 10.202654963172908, "learning_rate": 2.4818913491510003e-06, "loss": 1.6084, "num_input_tokens_seen": 1490944, "step": 364 }, { "epoch": 0.2719821162444113, "grad_norm": 6.646709801293761, "learning_rate": 2.481791956989105e-06, "loss": 1.4882, "num_input_tokens_seen": 1495040, "step": 365 }, { "epoch": 0.2727272727272727, "grad_norm": 9.813025414424159, "learning_rate": 2.4816922948096166e-06, "loss": 1.2649, "num_input_tokens_seen": 1499136, "step": 366 }, { "epoch": 0.27347242921013415, "grad_norm": 9.934905505843947, "learning_rate": 2.481592362634382e-06, "loss": 1.4349, "num_input_tokens_seen": 1503232, "step": 367 }, { "epoch": 0.27421758569299554, "grad_norm": 9.682661767313817, "learning_rate": 2.481492160485307e-06, "loss": 1.5792, "num_input_tokens_seen": 1507328, "step": 368 }, { "epoch": 0.2749627421758569, "grad_norm": 9.703504711124982, "learning_rate": 2.481391688384357e-06, "loss": 1.5104, "num_input_tokens_seen": 1511424, "step": 369 }, { "epoch": 0.2757078986587183, "grad_norm": 8.120083456269077, "learning_rate": 2.481290946353556e-06, "loss": 1.3875, "num_input_tokens_seen": 1515520, "step": 370 }, { "epoch": 0.27645305514157975, "grad_norm": 9.328823907815714, "learning_rate": 2.4811899344149875e-06, "loss": 1.5366, "num_input_tokens_seen": 1519616, "step": 371 }, { "epoch": 0.27719821162444114, "grad_norm": 9.256087628132573, "learning_rate": 2.4810886525907934e-06, "loss": 1.4399, "num_input_tokens_seen": 1523712, "step": 372 }, { "epoch": 0.2779433681073025, "grad_norm": 7.451202566288217, "learning_rate": 2.4809871009031762e-06, "loss": 1.3209, "num_input_tokens_seen": 1527808, "step": 373 }, { "epoch": 0.2786885245901639, "grad_norm": 7.538426204172583, "learning_rate": 2.4808852793743965e-06, "loss": 1.5141, "num_input_tokens_seen": 1531904, "step": 374 }, { "epoch": 0.27943368107302535, "grad_norm": 7.108557091613386, "learning_rate": 2.480783188026774e-06, "loss": 1.4588, "num_input_tokens_seen": 1536000, "step": 375 }, { "epoch": 0.28017883755588674, "grad_norm": 9.833087476476566, "learning_rate": 2.4806808268826883e-06, "loss": 1.2374, "num_input_tokens_seen": 1540096, "step": 376 }, { "epoch": 0.2809239940387481, "grad_norm": 8.640117936371105, "learning_rate": 2.4805781959645776e-06, "loss": 1.3717, "num_input_tokens_seen": 1544192, "step": 377 }, { "epoch": 0.2816691505216095, "grad_norm": 7.377205586767024, "learning_rate": 2.480475295294939e-06, "loss": 1.3189, "num_input_tokens_seen": 1548288, "step": 378 }, { "epoch": 0.28241430700447095, "grad_norm": 20.97210837284922, "learning_rate": 2.4803721248963295e-06, "loss": 1.1976, "num_input_tokens_seen": 1552384, "step": 379 }, { "epoch": 0.28315946348733234, "grad_norm": 13.48622916487958, "learning_rate": 2.4802686847913644e-06, "loss": 1.4341, "num_input_tokens_seen": 1556480, "step": 380 }, { "epoch": 0.28390461997019373, "grad_norm": 16.20343698082542, "learning_rate": 2.480164975002719e-06, "loss": 1.523, "num_input_tokens_seen": 1560576, "step": 381 }, { "epoch": 0.2846497764530551, "grad_norm": 9.300452279884096, "learning_rate": 2.4800609955531268e-06, "loss": 1.3181, "num_input_tokens_seen": 1564672, "step": 382 }, { "epoch": 0.28539493293591656, "grad_norm": 8.780163453867107, "learning_rate": 2.4799567464653806e-06, "loss": 1.2529, "num_input_tokens_seen": 1568768, "step": 383 }, { "epoch": 0.28614008941877794, "grad_norm": 6.82657974128535, "learning_rate": 2.4798522277623332e-06, "loss": 1.6835, "num_input_tokens_seen": 1572864, "step": 384 }, { "epoch": 0.28688524590163933, "grad_norm": 7.6501045096341596, "learning_rate": 2.479747439466896e-06, "loss": 1.337, "num_input_tokens_seen": 1576960, "step": 385 }, { "epoch": 0.28763040238450077, "grad_norm": 14.806901359694683, "learning_rate": 2.4796423816020382e-06, "loss": 1.6153, "num_input_tokens_seen": 1581056, "step": 386 }, { "epoch": 0.28837555886736216, "grad_norm": 10.637935513704626, "learning_rate": 2.47953705419079e-06, "loss": 1.5914, "num_input_tokens_seen": 1585152, "step": 387 }, { "epoch": 0.28912071535022354, "grad_norm": 9.098125859773623, "learning_rate": 2.47943145725624e-06, "loss": 1.2155, "num_input_tokens_seen": 1589248, "step": 388 }, { "epoch": 0.28986587183308493, "grad_norm": 7.650450142341019, "learning_rate": 2.4793255908215356e-06, "loss": 1.16, "num_input_tokens_seen": 1593344, "step": 389 }, { "epoch": 0.2906110283159464, "grad_norm": 11.32622077324635, "learning_rate": 2.4792194549098836e-06, "loss": 1.0841, "num_input_tokens_seen": 1597440, "step": 390 }, { "epoch": 0.29135618479880776, "grad_norm": 7.365286012400752, "learning_rate": 2.4791130495445493e-06, "loss": 1.5789, "num_input_tokens_seen": 1601536, "step": 391 }, { "epoch": 0.29210134128166915, "grad_norm": 9.407663945216722, "learning_rate": 2.4790063747488584e-06, "loss": 1.3685, "num_input_tokens_seen": 1605632, "step": 392 }, { "epoch": 0.29284649776453053, "grad_norm": 7.85836824072835, "learning_rate": 2.478899430546194e-06, "loss": 1.0257, "num_input_tokens_seen": 1609728, "step": 393 }, { "epoch": 0.293591654247392, "grad_norm": 7.310992353174697, "learning_rate": 2.4787922169599992e-06, "loss": 1.4347, "num_input_tokens_seen": 1613824, "step": 394 }, { "epoch": 0.29433681073025336, "grad_norm": 7.727127279384815, "learning_rate": 2.4786847340137765e-06, "loss": 1.4672, "num_input_tokens_seen": 1617920, "step": 395 }, { "epoch": 0.29508196721311475, "grad_norm": 30.071621566778738, "learning_rate": 2.478576981731086e-06, "loss": 1.4601, "num_input_tokens_seen": 1622016, "step": 396 }, { "epoch": 0.29582712369597614, "grad_norm": 11.112720444437086, "learning_rate": 2.4784689601355487e-06, "loss": 1.5117, "num_input_tokens_seen": 1626112, "step": 397 }, { "epoch": 0.2965722801788376, "grad_norm": 9.976616648038606, "learning_rate": 2.478360669250843e-06, "loss": 1.9308, "num_input_tokens_seen": 1630208, "step": 398 }, { "epoch": 0.29731743666169896, "grad_norm": 12.613501621761328, "learning_rate": 2.4782521091007074e-06, "loss": 1.2112, "num_input_tokens_seen": 1634304, "step": 399 }, { "epoch": 0.29806259314456035, "grad_norm": 8.489757706801482, "learning_rate": 2.478143279708939e-06, "loss": 1.2532, "num_input_tokens_seen": 1638400, "step": 400 }, { "epoch": 0.29880774962742174, "grad_norm": 7.576493584510732, "learning_rate": 2.4780341810993943e-06, "loss": 1.4957, "num_input_tokens_seen": 1642496, "step": 401 }, { "epoch": 0.2995529061102832, "grad_norm": 8.810984926234946, "learning_rate": 2.4779248132959878e-06, "loss": 1.1833, "num_input_tokens_seen": 1646592, "step": 402 }, { "epoch": 0.30029806259314457, "grad_norm": 10.279311796470662, "learning_rate": 2.477815176322694e-06, "loss": 1.6559, "num_input_tokens_seen": 1650688, "step": 403 }, { "epoch": 0.30104321907600595, "grad_norm": 10.619037100796358, "learning_rate": 2.477705270203546e-06, "loss": 1.4503, "num_input_tokens_seen": 1654784, "step": 404 }, { "epoch": 0.30178837555886734, "grad_norm": 6.9882589391945515, "learning_rate": 2.4775950949626364e-06, "loss": 1.5521, "num_input_tokens_seen": 1658880, "step": 405 }, { "epoch": 0.3025335320417288, "grad_norm": 21.726032725764142, "learning_rate": 2.4774846506241163e-06, "loss": 1.0846, "num_input_tokens_seen": 1662976, "step": 406 }, { "epoch": 0.30327868852459017, "grad_norm": 6.680322190394293, "learning_rate": 2.4773739372121957e-06, "loss": 1.5374, "num_input_tokens_seen": 1667072, "step": 407 }, { "epoch": 0.30402384500745155, "grad_norm": 10.092448411230231, "learning_rate": 2.477262954751144e-06, "loss": 1.4517, "num_input_tokens_seen": 1671168, "step": 408 }, { "epoch": 0.30476900149031294, "grad_norm": 7.383998770766328, "learning_rate": 2.4771517032652886e-06, "loss": 1.3962, "num_input_tokens_seen": 1675264, "step": 409 }, { "epoch": 0.3055141579731744, "grad_norm": 10.336966545828155, "learning_rate": 2.477040182779018e-06, "loss": 1.3347, "num_input_tokens_seen": 1679360, "step": 410 }, { "epoch": 0.30625931445603577, "grad_norm": 7.140726784597332, "learning_rate": 2.476928393316777e-06, "loss": 1.7301, "num_input_tokens_seen": 1683456, "step": 411 }, { "epoch": 0.30700447093889716, "grad_norm": 8.767472085049203, "learning_rate": 2.476816334903071e-06, "loss": 1.2193, "num_input_tokens_seen": 1687552, "step": 412 }, { "epoch": 0.30774962742175854, "grad_norm": 8.85311109412997, "learning_rate": 2.4767040075624644e-06, "loss": 1.2628, "num_input_tokens_seen": 1691648, "step": 413 }, { "epoch": 0.30849478390462, "grad_norm": 7.303454138368299, "learning_rate": 2.47659141131958e-06, "loss": 1.5691, "num_input_tokens_seen": 1695744, "step": 414 }, { "epoch": 0.30923994038748137, "grad_norm": 7.084335722807951, "learning_rate": 2.476478546199099e-06, "loss": 1.4451, "num_input_tokens_seen": 1699840, "step": 415 }, { "epoch": 0.30998509687034276, "grad_norm": 6.6270597728194955, "learning_rate": 2.4763654122257635e-06, "loss": 1.5741, "num_input_tokens_seen": 1703936, "step": 416 }, { "epoch": 0.3107302533532042, "grad_norm": 7.926343211495248, "learning_rate": 2.476252009424372e-06, "loss": 1.3575, "num_input_tokens_seen": 1708032, "step": 417 }, { "epoch": 0.3114754098360656, "grad_norm": 15.772203634621393, "learning_rate": 2.4761383378197847e-06, "loss": 1.9621, "num_input_tokens_seen": 1712128, "step": 418 }, { "epoch": 0.312220566318927, "grad_norm": 8.728327863059183, "learning_rate": 2.4760243974369174e-06, "loss": 1.5874, "num_input_tokens_seen": 1716224, "step": 419 }, { "epoch": 0.31296572280178836, "grad_norm": 7.335293147629756, "learning_rate": 2.475910188300748e-06, "loss": 1.794, "num_input_tokens_seen": 1720320, "step": 420 }, { "epoch": 0.3137108792846498, "grad_norm": 7.031170970883745, "learning_rate": 2.4757957104363117e-06, "loss": 1.3028, "num_input_tokens_seen": 1724416, "step": 421 }, { "epoch": 0.3144560357675112, "grad_norm": 6.739958064156768, "learning_rate": 2.4756809638687025e-06, "loss": 1.587, "num_input_tokens_seen": 1728512, "step": 422 }, { "epoch": 0.3152011922503726, "grad_norm": 8.689268914218848, "learning_rate": 2.475565948623074e-06, "loss": 1.3168, "num_input_tokens_seen": 1732608, "step": 423 }, { "epoch": 0.31594634873323396, "grad_norm": 7.829601453492681, "learning_rate": 2.475450664724638e-06, "loss": 1.021, "num_input_tokens_seen": 1736704, "step": 424 }, { "epoch": 0.3166915052160954, "grad_norm": 8.21709874943885, "learning_rate": 2.4753351121986662e-06, "loss": 1.5652, "num_input_tokens_seen": 1740800, "step": 425 }, { "epoch": 0.3174366616989568, "grad_norm": 11.047264475340505, "learning_rate": 2.4752192910704874e-06, "loss": 0.9005, "num_input_tokens_seen": 1744896, "step": 426 }, { "epoch": 0.3181818181818182, "grad_norm": 7.999686943365952, "learning_rate": 2.475103201365492e-06, "loss": 1.1475, "num_input_tokens_seen": 1748992, "step": 427 }, { "epoch": 0.31892697466467956, "grad_norm": 8.958177934558103, "learning_rate": 2.4749868431091265e-06, "loss": 1.3521, "num_input_tokens_seen": 1753088, "step": 428 }, { "epoch": 0.319672131147541, "grad_norm": 11.052252781817405, "learning_rate": 2.4748702163268977e-06, "loss": 1.123, "num_input_tokens_seen": 1757184, "step": 429 }, { "epoch": 0.3204172876304024, "grad_norm": 7.5562341890160365, "learning_rate": 2.4747533210443714e-06, "loss": 1.3884, "num_input_tokens_seen": 1761280, "step": 430 }, { "epoch": 0.3211624441132638, "grad_norm": 7.261925888964919, "learning_rate": 2.4746361572871715e-06, "loss": 1.3675, "num_input_tokens_seen": 1765376, "step": 431 }, { "epoch": 0.32190760059612517, "grad_norm": 7.415190868267144, "learning_rate": 2.474518725080981e-06, "loss": 1.1608, "num_input_tokens_seen": 1769472, "step": 432 }, { "epoch": 0.3226527570789866, "grad_norm": 7.233514183378593, "learning_rate": 2.474401024451542e-06, "loss": 1.3447, "num_input_tokens_seen": 1773568, "step": 433 }, { "epoch": 0.323397913561848, "grad_norm": 9.667270498899489, "learning_rate": 2.474283055424656e-06, "loss": 1.323, "num_input_tokens_seen": 1777664, "step": 434 }, { "epoch": 0.3241430700447094, "grad_norm": 15.409719199220065, "learning_rate": 2.474164818026182e-06, "loss": 1.698, "num_input_tokens_seen": 1781760, "step": 435 }, { "epoch": 0.32488822652757077, "grad_norm": 14.483795213118288, "learning_rate": 2.474046312282038e-06, "loss": 1.8131, "num_input_tokens_seen": 1785856, "step": 436 }, { "epoch": 0.3256333830104322, "grad_norm": 7.930199769061791, "learning_rate": 2.473927538218202e-06, "loss": 1.4422, "num_input_tokens_seen": 1789952, "step": 437 }, { "epoch": 0.3263785394932936, "grad_norm": 8.669933575648892, "learning_rate": 2.4738084958607108e-06, "loss": 1.6463, "num_input_tokens_seen": 1794048, "step": 438 }, { "epoch": 0.327123695976155, "grad_norm": 8.829354054347286, "learning_rate": 2.4736891852356575e-06, "loss": 1.3007, "num_input_tokens_seen": 1798144, "step": 439 }, { "epoch": 0.32786885245901637, "grad_norm": 6.823103801401154, "learning_rate": 2.4735696063691975e-06, "loss": 1.5711, "num_input_tokens_seen": 1802240, "step": 440 }, { "epoch": 0.3286140089418778, "grad_norm": 7.685950110698454, "learning_rate": 2.4734497592875424e-06, "loss": 1.6552, "num_input_tokens_seen": 1806336, "step": 441 }, { "epoch": 0.3293591654247392, "grad_norm": 10.458211661784205, "learning_rate": 2.473329644016964e-06, "loss": 1.4643, "num_input_tokens_seen": 1810432, "step": 442 }, { "epoch": 0.3301043219076006, "grad_norm": 6.999902607494102, "learning_rate": 2.4732092605837925e-06, "loss": 1.4656, "num_input_tokens_seen": 1814528, "step": 443 }, { "epoch": 0.33084947839046197, "grad_norm": 7.595274959397586, "learning_rate": 2.4730886090144162e-06, "loss": 1.247, "num_input_tokens_seen": 1818624, "step": 444 }, { "epoch": 0.3315946348733234, "grad_norm": 8.031976517878022, "learning_rate": 2.4729676893352835e-06, "loss": 1.6404, "num_input_tokens_seen": 1822720, "step": 445 }, { "epoch": 0.3323397913561848, "grad_norm": 9.559904265191367, "learning_rate": 2.4728465015729003e-06, "loss": 1.5393, "num_input_tokens_seen": 1826816, "step": 446 }, { "epoch": 0.3330849478390462, "grad_norm": 9.316835900026838, "learning_rate": 2.472725045753832e-06, "loss": 1.3715, "num_input_tokens_seen": 1830912, "step": 447 }, { "epoch": 0.33383010432190763, "grad_norm": 10.926489526852674, "learning_rate": 2.472603321904703e-06, "loss": 1.5113, "num_input_tokens_seen": 1835008, "step": 448 }, { "epoch": 0.334575260804769, "grad_norm": 6.601267503596517, "learning_rate": 2.4724813300521953e-06, "loss": 1.5703, "num_input_tokens_seen": 1839104, "step": 449 }, { "epoch": 0.3353204172876304, "grad_norm": 7.62119201452793, "learning_rate": 2.472359070223051e-06, "loss": 1.3687, "num_input_tokens_seen": 1843200, "step": 450 }, { "epoch": 0.3360655737704918, "grad_norm": 6.940793980541602, "learning_rate": 2.47223654244407e-06, "loss": 1.4196, "num_input_tokens_seen": 1847296, "step": 451 }, { "epoch": 0.33681073025335323, "grad_norm": 7.635918122222517, "learning_rate": 2.4721137467421115e-06, "loss": 1.4345, "num_input_tokens_seen": 1851392, "step": 452 }, { "epoch": 0.3375558867362146, "grad_norm": 9.040907736079456, "learning_rate": 2.4719906831440926e-06, "loss": 1.2513, "num_input_tokens_seen": 1855488, "step": 453 }, { "epoch": 0.338301043219076, "grad_norm": 9.055386682807804, "learning_rate": 2.4718673516769905e-06, "loss": 1.1784, "num_input_tokens_seen": 1859584, "step": 454 }, { "epoch": 0.3390461997019374, "grad_norm": 9.001473265933146, "learning_rate": 2.47174375236784e-06, "loss": 0.9471, "num_input_tokens_seen": 1863680, "step": 455 }, { "epoch": 0.33979135618479883, "grad_norm": 7.639045030602643, "learning_rate": 2.471619885243735e-06, "loss": 1.5449, "num_input_tokens_seen": 1867776, "step": 456 }, { "epoch": 0.3405365126676602, "grad_norm": 7.779111851719721, "learning_rate": 2.4714957503318277e-06, "loss": 1.3811, "num_input_tokens_seen": 1871872, "step": 457 }, { "epoch": 0.3412816691505216, "grad_norm": 7.512622492671024, "learning_rate": 2.4713713476593297e-06, "loss": 1.5398, "num_input_tokens_seen": 1875968, "step": 458 }, { "epoch": 0.342026825633383, "grad_norm": 8.797594003302939, "learning_rate": 2.4712466772535114e-06, "loss": 1.3297, "num_input_tokens_seen": 1880064, "step": 459 }, { "epoch": 0.34277198211624443, "grad_norm": 28.039167742241183, "learning_rate": 2.4711217391417003e-06, "loss": 2.1771, "num_input_tokens_seen": 1884160, "step": 460 }, { "epoch": 0.3435171385991058, "grad_norm": 10.473297923744093, "learning_rate": 2.4709965333512846e-06, "loss": 1.4106, "num_input_tokens_seen": 1888256, "step": 461 }, { "epoch": 0.3442622950819672, "grad_norm": 8.157300820640312, "learning_rate": 2.4708710599097105e-06, "loss": 1.2218, "num_input_tokens_seen": 1892352, "step": 462 }, { "epoch": 0.3450074515648286, "grad_norm": 8.444935083643289, "learning_rate": 2.470745318844482e-06, "loss": 1.2774, "num_input_tokens_seen": 1896448, "step": 463 }, { "epoch": 0.34575260804769004, "grad_norm": 7.012467599741839, "learning_rate": 2.470619310183163e-06, "loss": 1.4884, "num_input_tokens_seen": 1900544, "step": 464 }, { "epoch": 0.3464977645305514, "grad_norm": 7.232354264169012, "learning_rate": 2.4704930339533745e-06, "loss": 1.2903, "num_input_tokens_seen": 1904640, "step": 465 }, { "epoch": 0.3472429210134128, "grad_norm": 8.789486909683745, "learning_rate": 2.4703664901827985e-06, "loss": 1.3325, "num_input_tokens_seen": 1908736, "step": 466 }, { "epoch": 0.3479880774962742, "grad_norm": 7.171782508565415, "learning_rate": 2.4702396788991736e-06, "loss": 1.401, "num_input_tokens_seen": 1912832, "step": 467 }, { "epoch": 0.34873323397913564, "grad_norm": 7.400452792087253, "learning_rate": 2.4701126001302972e-06, "loss": 1.421, "num_input_tokens_seen": 1916928, "step": 468 }, { "epoch": 0.349478390461997, "grad_norm": 10.018029857159522, "learning_rate": 2.469985253904027e-06, "loss": 1.3461, "num_input_tokens_seen": 1921024, "step": 469 }, { "epoch": 0.3502235469448584, "grad_norm": 7.836386892517182, "learning_rate": 2.4698576402482776e-06, "loss": 1.3228, "num_input_tokens_seen": 1925120, "step": 470 }, { "epoch": 0.3509687034277198, "grad_norm": 7.348749524037933, "learning_rate": 2.469729759191023e-06, "loss": 1.4832, "num_input_tokens_seen": 1929216, "step": 471 }, { "epoch": 0.35171385991058124, "grad_norm": 10.12062906233747, "learning_rate": 2.4696016107602957e-06, "loss": 1.5615, "num_input_tokens_seen": 1933312, "step": 472 }, { "epoch": 0.3524590163934426, "grad_norm": 11.912287790586502, "learning_rate": 2.469473194984186e-06, "loss": 1.2009, "num_input_tokens_seen": 1937408, "step": 473 }, { "epoch": 0.353204172876304, "grad_norm": 13.239050498093619, "learning_rate": 2.4693445118908445e-06, "loss": 1.4042, "num_input_tokens_seen": 1941504, "step": 474 }, { "epoch": 0.3539493293591654, "grad_norm": 9.551944966450451, "learning_rate": 2.469215561508479e-06, "loss": 1.4482, "num_input_tokens_seen": 1945600, "step": 475 }, { "epoch": 0.35469448584202684, "grad_norm": 9.113573870129224, "learning_rate": 2.4690863438653563e-06, "loss": 1.3643, "num_input_tokens_seen": 1949696, "step": 476 }, { "epoch": 0.3554396423248882, "grad_norm": 8.775050139017264, "learning_rate": 2.468956858989802e-06, "loss": 1.6095, "num_input_tokens_seen": 1953792, "step": 477 }, { "epoch": 0.3561847988077496, "grad_norm": 7.615717892037721, "learning_rate": 2.4688271069102003e-06, "loss": 1.5309, "num_input_tokens_seen": 1957888, "step": 478 }, { "epoch": 0.356929955290611, "grad_norm": 8.795731462615585, "learning_rate": 2.4686970876549935e-06, "loss": 1.4163, "num_input_tokens_seen": 1961984, "step": 479 }, { "epoch": 0.35767511177347244, "grad_norm": 6.68381037069729, "learning_rate": 2.4685668012526828e-06, "loss": 1.6612, "num_input_tokens_seen": 1966080, "step": 480 }, { "epoch": 0.35842026825633383, "grad_norm": 10.844721436205548, "learning_rate": 2.468436247731828e-06, "loss": 1.225, "num_input_tokens_seen": 1970176, "step": 481 }, { "epoch": 0.3591654247391952, "grad_norm": 9.255065307065307, "learning_rate": 2.468305427121047e-06, "loss": 1.4525, "num_input_tokens_seen": 1974272, "step": 482 }, { "epoch": 0.35991058122205666, "grad_norm": 6.282830498459162, "learning_rate": 2.468174339449017e-06, "loss": 1.4807, "num_input_tokens_seen": 1978368, "step": 483 }, { "epoch": 0.36065573770491804, "grad_norm": 7.9424615403385, "learning_rate": 2.4680429847444738e-06, "loss": 1.3893, "num_input_tokens_seen": 1982464, "step": 484 }, { "epoch": 0.36140089418777943, "grad_norm": 8.269786877652475, "learning_rate": 2.46791136303621e-06, "loss": 1.6122, "num_input_tokens_seen": 1986560, "step": 485 }, { "epoch": 0.3621460506706408, "grad_norm": 6.879840878500671, "learning_rate": 2.467779474353079e-06, "loss": 1.6938, "num_input_tokens_seen": 1990656, "step": 486 }, { "epoch": 0.36289120715350226, "grad_norm": 7.525368766923978, "learning_rate": 2.4676473187239915e-06, "loss": 1.249, "num_input_tokens_seen": 1994752, "step": 487 }, { "epoch": 0.36363636363636365, "grad_norm": 7.1912249169323275, "learning_rate": 2.467514896177917e-06, "loss": 1.3806, "num_input_tokens_seen": 1998848, "step": 488 }, { "epoch": 0.36438152011922503, "grad_norm": 7.445181997094034, "learning_rate": 2.4673822067438833e-06, "loss": 1.5156, "num_input_tokens_seen": 2002944, "step": 489 }, { "epoch": 0.3651266766020864, "grad_norm": 7.557494883477814, "learning_rate": 2.4672492504509773e-06, "loss": 1.3477, "num_input_tokens_seen": 2007040, "step": 490 }, { "epoch": 0.36587183308494786, "grad_norm": 7.5517055495739225, "learning_rate": 2.4671160273283433e-06, "loss": 1.6865, "num_input_tokens_seen": 2011136, "step": 491 }, { "epoch": 0.36661698956780925, "grad_norm": 10.338609551327918, "learning_rate": 2.4669825374051853e-06, "loss": 1.2314, "num_input_tokens_seen": 2015232, "step": 492 }, { "epoch": 0.36736214605067063, "grad_norm": 7.953621895388539, "learning_rate": 2.4668487807107654e-06, "loss": 1.4593, "num_input_tokens_seen": 2019328, "step": 493 }, { "epoch": 0.368107302533532, "grad_norm": 7.372457044492564, "learning_rate": 2.466714757274403e-06, "loss": 1.4163, "num_input_tokens_seen": 2023424, "step": 494 }, { "epoch": 0.36885245901639346, "grad_norm": 9.016839522982066, "learning_rate": 2.4665804671254784e-06, "loss": 1.3205, "num_input_tokens_seen": 2027520, "step": 495 }, { "epoch": 0.36959761549925485, "grad_norm": 9.35394683172798, "learning_rate": 2.466445910293428e-06, "loss": 1.4007, "num_input_tokens_seen": 2031616, "step": 496 }, { "epoch": 0.37034277198211624, "grad_norm": 9.409286280167091, "learning_rate": 2.4663110868077478e-06, "loss": 1.523, "num_input_tokens_seen": 2035712, "step": 497 }, { "epoch": 0.3710879284649776, "grad_norm": 8.16902794535255, "learning_rate": 2.466175996697992e-06, "loss": 1.4062, "num_input_tokens_seen": 2039808, "step": 498 }, { "epoch": 0.37183308494783907, "grad_norm": 8.562389285248473, "learning_rate": 2.4660406399937737e-06, "loss": 1.1439, "num_input_tokens_seen": 2043904, "step": 499 }, { "epoch": 0.37257824143070045, "grad_norm": 10.683942583455513, "learning_rate": 2.4659050167247633e-06, "loss": 1.1165, "num_input_tokens_seen": 2048000, "step": 500 }, { "epoch": 0.37332339791356184, "grad_norm": 8.967043384576446, "learning_rate": 2.465769126920691e-06, "loss": 1.2341, "num_input_tokens_seen": 2052096, "step": 501 }, { "epoch": 0.3740685543964232, "grad_norm": 9.010787040316805, "learning_rate": 2.4656329706113453e-06, "loss": 1.3726, "num_input_tokens_seen": 2056192, "step": 502 }, { "epoch": 0.37481371087928467, "grad_norm": 8.007183443067687, "learning_rate": 2.465496547826572e-06, "loss": 1.3128, "num_input_tokens_seen": 2060288, "step": 503 }, { "epoch": 0.37555886736214605, "grad_norm": 8.763078424238616, "learning_rate": 2.465359858596275e-06, "loss": 1.4912, "num_input_tokens_seen": 2064384, "step": 504 }, { "epoch": 0.37630402384500744, "grad_norm": 7.894086797899413, "learning_rate": 2.46522290295042e-06, "loss": 1.2561, "num_input_tokens_seen": 2068480, "step": 505 }, { "epoch": 0.3770491803278688, "grad_norm": 6.965913344166032, "learning_rate": 2.465085680919026e-06, "loss": 1.5411, "num_input_tokens_seen": 2072576, "step": 506 }, { "epoch": 0.37779433681073027, "grad_norm": 7.73723961658756, "learning_rate": 2.464948192532175e-06, "loss": 1.2335, "num_input_tokens_seen": 2076672, "step": 507 }, { "epoch": 0.37853949329359166, "grad_norm": 10.793604268589089, "learning_rate": 2.4648104378200043e-06, "loss": 1.375, "num_input_tokens_seen": 2080768, "step": 508 }, { "epoch": 0.37928464977645304, "grad_norm": 8.68374276630473, "learning_rate": 2.4646724168127114e-06, "loss": 1.2086, "num_input_tokens_seen": 2084864, "step": 509 }, { "epoch": 0.38002980625931443, "grad_norm": 7.89289166006621, "learning_rate": 2.464534129540551e-06, "loss": 1.3022, "num_input_tokens_seen": 2088960, "step": 510 }, { "epoch": 0.38077496274217587, "grad_norm": 6.765044683416528, "learning_rate": 2.4643955760338365e-06, "loss": 1.3279, "num_input_tokens_seen": 2093056, "step": 511 }, { "epoch": 0.38152011922503726, "grad_norm": 7.530741176465452, "learning_rate": 2.4642567563229407e-06, "loss": 1.3924, "num_input_tokens_seen": 2097152, "step": 512 }, { "epoch": 0.38226527570789864, "grad_norm": 7.218111876080964, "learning_rate": 2.464117670438293e-06, "loss": 1.5393, "num_input_tokens_seen": 2101248, "step": 513 }, { "epoch": 0.3830104321907601, "grad_norm": 7.973468661890096, "learning_rate": 2.4639783184103828e-06, "loss": 1.3122, "num_input_tokens_seen": 2105344, "step": 514 }, { "epoch": 0.3837555886736215, "grad_norm": 7.982976877542911, "learning_rate": 2.4638387002697565e-06, "loss": 1.7126, "num_input_tokens_seen": 2109440, "step": 515 }, { "epoch": 0.38450074515648286, "grad_norm": 8.387393987078436, "learning_rate": 2.4636988160470194e-06, "loss": 1.3359, "num_input_tokens_seen": 2113536, "step": 516 }, { "epoch": 0.38524590163934425, "grad_norm": 8.14089819288552, "learning_rate": 2.463558665772835e-06, "loss": 1.5919, "num_input_tokens_seen": 2117632, "step": 517 }, { "epoch": 0.3859910581222057, "grad_norm": 7.090923708221393, "learning_rate": 2.4634182494779257e-06, "loss": 1.4738, "num_input_tokens_seen": 2121728, "step": 518 }, { "epoch": 0.3867362146050671, "grad_norm": 11.629979099368443, "learning_rate": 2.4632775671930717e-06, "loss": 1.5325, "num_input_tokens_seen": 2125824, "step": 519 }, { "epoch": 0.38748137108792846, "grad_norm": 8.602111746516304, "learning_rate": 2.4631366189491112e-06, "loss": 1.511, "num_input_tokens_seen": 2129920, "step": 520 }, { "epoch": 0.38822652757078985, "grad_norm": 8.261344148841797, "learning_rate": 2.462995404776941e-06, "loss": 1.3882, "num_input_tokens_seen": 2134016, "step": 521 }, { "epoch": 0.3889716840536513, "grad_norm": 7.305292892785254, "learning_rate": 2.462853924707517e-06, "loss": 1.4353, "num_input_tokens_seen": 2138112, "step": 522 }, { "epoch": 0.3897168405365127, "grad_norm": 9.464223558774558, "learning_rate": 2.462712178771852e-06, "loss": 1.6179, "num_input_tokens_seen": 2142208, "step": 523 }, { "epoch": 0.39046199701937406, "grad_norm": 8.737695584060821, "learning_rate": 2.462570167001018e-06, "loss": 1.3083, "num_input_tokens_seen": 2146304, "step": 524 }, { "epoch": 0.39120715350223545, "grad_norm": 7.68419932349026, "learning_rate": 2.4624278894261446e-06, "loss": 1.2218, "num_input_tokens_seen": 2150400, "step": 525 }, { "epoch": 0.3919523099850969, "grad_norm": 7.220594910778691, "learning_rate": 2.4622853460784205e-06, "loss": 1.2524, "num_input_tokens_seen": 2154496, "step": 526 }, { "epoch": 0.3926974664679583, "grad_norm": 23.829562639068833, "learning_rate": 2.462142536989092e-06, "loss": 1.3151, "num_input_tokens_seen": 2158592, "step": 527 }, { "epoch": 0.39344262295081966, "grad_norm": 7.43913701316008, "learning_rate": 2.4619994621894638e-06, "loss": 1.3512, "num_input_tokens_seen": 2162688, "step": 528 }, { "epoch": 0.39418777943368105, "grad_norm": 10.0503614345354, "learning_rate": 2.4618561217108996e-06, "loss": 1.3919, "num_input_tokens_seen": 2166784, "step": 529 }, { "epoch": 0.3949329359165425, "grad_norm": 8.642006712832671, "learning_rate": 2.4617125155848193e-06, "loss": 1.2302, "num_input_tokens_seen": 2170880, "step": 530 }, { "epoch": 0.3956780923994039, "grad_norm": 13.178222735533243, "learning_rate": 2.4615686438427035e-06, "loss": 1.3854, "num_input_tokens_seen": 2174976, "step": 531 }, { "epoch": 0.39642324888226527, "grad_norm": 8.213246155860466, "learning_rate": 2.4614245065160896e-06, "loss": 1.4646, "num_input_tokens_seen": 2179072, "step": 532 }, { "epoch": 0.39716840536512665, "grad_norm": 6.787212968566636, "learning_rate": 2.4612801036365736e-06, "loss": 1.5803, "num_input_tokens_seen": 2183168, "step": 533 }, { "epoch": 0.3979135618479881, "grad_norm": 7.9461933336433805, "learning_rate": 2.4611354352358097e-06, "loss": 1.3156, "num_input_tokens_seen": 2187264, "step": 534 }, { "epoch": 0.3986587183308495, "grad_norm": 14.35076783177048, "learning_rate": 2.46099050134551e-06, "loss": 1.5557, "num_input_tokens_seen": 2191360, "step": 535 }, { "epoch": 0.39940387481371087, "grad_norm": 9.396024183487974, "learning_rate": 2.460845301997445e-06, "loss": 1.7731, "num_input_tokens_seen": 2195456, "step": 536 }, { "epoch": 0.40014903129657226, "grad_norm": 8.134878211595948, "learning_rate": 2.4606998372234436e-06, "loss": 1.0355, "num_input_tokens_seen": 2199552, "step": 537 }, { "epoch": 0.4008941877794337, "grad_norm": 7.475579541256085, "learning_rate": 2.460554107055393e-06, "loss": 1.1422, "num_input_tokens_seen": 2203648, "step": 538 }, { "epoch": 0.4016393442622951, "grad_norm": 7.840091723972886, "learning_rate": 2.4604081115252376e-06, "loss": 1.2464, "num_input_tokens_seen": 2207744, "step": 539 }, { "epoch": 0.40238450074515647, "grad_norm": 8.232393565849973, "learning_rate": 2.4602618506649813e-06, "loss": 1.3075, "num_input_tokens_seen": 2211840, "step": 540 }, { "epoch": 0.40312965722801786, "grad_norm": 8.995911311954508, "learning_rate": 2.4601153245066854e-06, "loss": 1.1862, "num_input_tokens_seen": 2215936, "step": 541 }, { "epoch": 0.4038748137108793, "grad_norm": 7.824167911539379, "learning_rate": 2.459968533082469e-06, "loss": 1.1412, "num_input_tokens_seen": 2220032, "step": 542 }, { "epoch": 0.4046199701937407, "grad_norm": 13.564462866575429, "learning_rate": 2.4598214764245106e-06, "loss": 1.1007, "num_input_tokens_seen": 2224128, "step": 543 }, { "epoch": 0.40536512667660207, "grad_norm": 6.99311814030726, "learning_rate": 2.4596741545650455e-06, "loss": 1.5582, "num_input_tokens_seen": 2228224, "step": 544 }, { "epoch": 0.4061102831594635, "grad_norm": 7.842509766854893, "learning_rate": 2.4595265675363678e-06, "loss": 1.4454, "num_input_tokens_seen": 2232320, "step": 545 }, { "epoch": 0.4068554396423249, "grad_norm": 7.908359507701302, "learning_rate": 2.45937871537083e-06, "loss": 0.9828, "num_input_tokens_seen": 2236416, "step": 546 }, { "epoch": 0.4076005961251863, "grad_norm": 7.494504910775969, "learning_rate": 2.4592305981008417e-06, "loss": 1.3441, "num_input_tokens_seen": 2240512, "step": 547 }, { "epoch": 0.4083457526080477, "grad_norm": 8.505572035171138, "learning_rate": 2.459082215758872e-06, "loss": 1.4857, "num_input_tokens_seen": 2244608, "step": 548 }, { "epoch": 0.4090909090909091, "grad_norm": 9.1871804250207, "learning_rate": 2.458933568377447e-06, "loss": 1.5338, "num_input_tokens_seen": 2248704, "step": 549 }, { "epoch": 0.4098360655737705, "grad_norm": 8.563137798512106, "learning_rate": 2.4587846559891507e-06, "loss": 1.4245, "num_input_tokens_seen": 2252800, "step": 550 }, { "epoch": 0.4105812220566319, "grad_norm": 8.693701234271272, "learning_rate": 2.4586354786266263e-06, "loss": 1.1714, "num_input_tokens_seen": 2256896, "step": 551 }, { "epoch": 0.4113263785394933, "grad_norm": 7.871739765082665, "learning_rate": 2.458486036322575e-06, "loss": 1.1984, "num_input_tokens_seen": 2260992, "step": 552 }, { "epoch": 0.4120715350223547, "grad_norm": 7.492111886944862, "learning_rate": 2.458336329109755e-06, "loss": 1.3793, "num_input_tokens_seen": 2265088, "step": 553 }, { "epoch": 0.4128166915052161, "grad_norm": 8.606774096491543, "learning_rate": 2.4581863570209835e-06, "loss": 1.236, "num_input_tokens_seen": 2269184, "step": 554 }, { "epoch": 0.4135618479880775, "grad_norm": 10.574104490376058, "learning_rate": 2.458036120089135e-06, "loss": 1.4198, "num_input_tokens_seen": 2273280, "step": 555 }, { "epoch": 0.4143070044709389, "grad_norm": 8.536638180319633, "learning_rate": 2.4578856183471433e-06, "loss": 1.5225, "num_input_tokens_seen": 2277376, "step": 556 }, { "epoch": 0.4150521609538003, "grad_norm": 8.311453971309833, "learning_rate": 2.457734851827999e-06, "loss": 1.125, "num_input_tokens_seen": 2281472, "step": 557 }, { "epoch": 0.4157973174366617, "grad_norm": 9.897953624517886, "learning_rate": 2.4575838205647505e-06, "loss": 1.1746, "num_input_tokens_seen": 2285568, "step": 558 }, { "epoch": 0.4165424739195231, "grad_norm": 7.6224319088628265, "learning_rate": 2.4574325245905063e-06, "loss": 1.4708, "num_input_tokens_seen": 2289664, "step": 559 }, { "epoch": 0.4172876304023845, "grad_norm": 8.509484976921119, "learning_rate": 2.4572809639384306e-06, "loss": 1.4515, "num_input_tokens_seen": 2293760, "step": 560 }, { "epoch": 0.4180327868852459, "grad_norm": 67.08261013816256, "learning_rate": 2.457129138641747e-06, "loss": 2.2347, "num_input_tokens_seen": 2297856, "step": 561 }, { "epoch": 0.4187779433681073, "grad_norm": 13.174085875539259, "learning_rate": 2.4569770487337365e-06, "loss": 1.5675, "num_input_tokens_seen": 2301952, "step": 562 }, { "epoch": 0.4195230998509687, "grad_norm": 9.445961155989636, "learning_rate": 2.456824694247738e-06, "loss": 1.5482, "num_input_tokens_seen": 2306048, "step": 563 }, { "epoch": 0.4202682563338301, "grad_norm": 8.805076089002155, "learning_rate": 2.4566720752171493e-06, "loss": 1.3036, "num_input_tokens_seen": 2310144, "step": 564 }, { "epoch": 0.4210134128166915, "grad_norm": 7.5892532514250615, "learning_rate": 2.4565191916754257e-06, "loss": 1.3679, "num_input_tokens_seen": 2314240, "step": 565 }, { "epoch": 0.4217585692995529, "grad_norm": 7.3248589762817815, "learning_rate": 2.456366043656079e-06, "loss": 1.3813, "num_input_tokens_seen": 2318336, "step": 566 }, { "epoch": 0.4225037257824143, "grad_norm": 8.067729577471637, "learning_rate": 2.456212631192682e-06, "loss": 1.2034, "num_input_tokens_seen": 2322432, "step": 567 }, { "epoch": 0.4232488822652757, "grad_norm": 7.113002282519793, "learning_rate": 2.456058954318863e-06, "loss": 1.5088, "num_input_tokens_seen": 2326528, "step": 568 }, { "epoch": 0.4239940387481371, "grad_norm": 7.325050416019469, "learning_rate": 2.4559050130683093e-06, "loss": 1.2647, "num_input_tokens_seen": 2330624, "step": 569 }, { "epoch": 0.4247391952309985, "grad_norm": 8.08973505880137, "learning_rate": 2.4557508074747656e-06, "loss": 1.3038, "num_input_tokens_seen": 2334720, "step": 570 }, { "epoch": 0.4254843517138599, "grad_norm": 6.772451944394128, "learning_rate": 2.455596337572035e-06, "loss": 1.3381, "num_input_tokens_seen": 2338816, "step": 571 }, { "epoch": 0.4262295081967213, "grad_norm": 8.334269974342535, "learning_rate": 2.455441603393979e-06, "loss": 1.5287, "num_input_tokens_seen": 2342912, "step": 572 }, { "epoch": 0.4269746646795827, "grad_norm": 7.949920783896797, "learning_rate": 2.455286604974515e-06, "loss": 1.5204, "num_input_tokens_seen": 2347008, "step": 573 }, { "epoch": 0.4277198211624441, "grad_norm": 8.215780720392313, "learning_rate": 2.4551313423476215e-06, "loss": 1.797, "num_input_tokens_seen": 2351104, "step": 574 }, { "epoch": 0.4284649776453055, "grad_norm": 7.606563448986705, "learning_rate": 2.4549758155473318e-06, "loss": 1.4357, "num_input_tokens_seen": 2355200, "step": 575 }, { "epoch": 0.42921013412816694, "grad_norm": 7.395672099090088, "learning_rate": 2.4548200246077394e-06, "loss": 1.1528, "num_input_tokens_seen": 2359296, "step": 576 }, { "epoch": 0.42995529061102833, "grad_norm": 7.305547867464772, "learning_rate": 2.454663969562994e-06, "loss": 1.4596, "num_input_tokens_seen": 2363392, "step": 577 }, { "epoch": 0.4307004470938897, "grad_norm": 10.722347430692162, "learning_rate": 2.454507650447305e-06, "loss": 1.3362, "num_input_tokens_seen": 2367488, "step": 578 }, { "epoch": 0.4314456035767511, "grad_norm": 7.161295529364825, "learning_rate": 2.454351067294938e-06, "loss": 1.4502, "num_input_tokens_seen": 2371584, "step": 579 }, { "epoch": 0.43219076005961254, "grad_norm": 6.996113320166536, "learning_rate": 2.4541942201402165e-06, "loss": 1.5816, "num_input_tokens_seen": 2375680, "step": 580 }, { "epoch": 0.43293591654247393, "grad_norm": 14.059796276382365, "learning_rate": 2.454037109017524e-06, "loss": 1.3506, "num_input_tokens_seen": 2379776, "step": 581 }, { "epoch": 0.4336810730253353, "grad_norm": 6.693357667294417, "learning_rate": 2.453879733961299e-06, "loss": 1.5274, "num_input_tokens_seen": 2383872, "step": 582 }, { "epoch": 0.4344262295081967, "grad_norm": 8.211336439149406, "learning_rate": 2.4537220950060403e-06, "loss": 1.3867, "num_input_tokens_seen": 2387968, "step": 583 }, { "epoch": 0.43517138599105815, "grad_norm": 6.804383751907642, "learning_rate": 2.453564192186303e-06, "loss": 1.5162, "num_input_tokens_seen": 2392064, "step": 584 }, { "epoch": 0.43591654247391953, "grad_norm": 7.566991451949417, "learning_rate": 2.4534060255367002e-06, "loss": 1.2941, "num_input_tokens_seen": 2396160, "step": 585 }, { "epoch": 0.4366616989567809, "grad_norm": 6.93711225895078, "learning_rate": 2.453247595091904e-06, "loss": 1.4745, "num_input_tokens_seen": 2400256, "step": 586 }, { "epoch": 0.4374068554396423, "grad_norm": 8.545578949200605, "learning_rate": 2.4530889008866426e-06, "loss": 1.2825, "num_input_tokens_seen": 2404352, "step": 587 }, { "epoch": 0.43815201192250375, "grad_norm": 8.2703656939707, "learning_rate": 2.4529299429557037e-06, "loss": 1.2442, "num_input_tokens_seen": 2408448, "step": 588 }, { "epoch": 0.43889716840536513, "grad_norm": 10.032620378724886, "learning_rate": 2.4527707213339313e-06, "loss": 1.1523, "num_input_tokens_seen": 2412544, "step": 589 }, { "epoch": 0.4396423248882265, "grad_norm": 10.325920268831238, "learning_rate": 2.4526112360562283e-06, "loss": 1.427, "num_input_tokens_seen": 2416640, "step": 590 }, { "epoch": 0.4403874813710879, "grad_norm": 8.042501529194205, "learning_rate": 2.452451487157555e-06, "loss": 1.2343, "num_input_tokens_seen": 2420736, "step": 591 }, { "epoch": 0.44113263785394935, "grad_norm": 8.542077369280959, "learning_rate": 2.452291474672929e-06, "loss": 0.9506, "num_input_tokens_seen": 2424832, "step": 592 }, { "epoch": 0.44187779433681074, "grad_norm": 7.023567807072994, "learning_rate": 2.4521311986374275e-06, "loss": 1.2496, "num_input_tokens_seen": 2428928, "step": 593 }, { "epoch": 0.4426229508196721, "grad_norm": 14.607385551311062, "learning_rate": 2.4519706590861826e-06, "loss": 1.344, "num_input_tokens_seen": 2433024, "step": 594 }, { "epoch": 0.4433681073025335, "grad_norm": 6.9886849357145975, "learning_rate": 2.451809856054387e-06, "loss": 1.6058, "num_input_tokens_seen": 2437120, "step": 595 }, { "epoch": 0.44411326378539495, "grad_norm": 7.341425365512906, "learning_rate": 2.4516487895772888e-06, "loss": 1.3307, "num_input_tokens_seen": 2441216, "step": 596 }, { "epoch": 0.44485842026825634, "grad_norm": 7.573298877481773, "learning_rate": 2.451487459690195e-06, "loss": 1.3125, "num_input_tokens_seen": 2445312, "step": 597 }, { "epoch": 0.4456035767511177, "grad_norm": 10.654071798710058, "learning_rate": 2.451325866428471e-06, "loss": 1.1923, "num_input_tokens_seen": 2449408, "step": 598 }, { "epoch": 0.4463487332339791, "grad_norm": 7.50336960720299, "learning_rate": 2.451164009827539e-06, "loss": 1.435, "num_input_tokens_seen": 2453504, "step": 599 }, { "epoch": 0.44709388971684055, "grad_norm": 8.905130802030651, "learning_rate": 2.4510018899228795e-06, "loss": 1.0631, "num_input_tokens_seen": 2457600, "step": 600 }, { "epoch": 0.44783904619970194, "grad_norm": 8.1421967319771, "learning_rate": 2.450839506750029e-06, "loss": 1.192, "num_input_tokens_seen": 2461696, "step": 601 }, { "epoch": 0.4485842026825633, "grad_norm": 9.984765704454214, "learning_rate": 2.450676860344584e-06, "loss": 1.2563, "num_input_tokens_seen": 2465792, "step": 602 }, { "epoch": 0.4493293591654247, "grad_norm": 8.57935255456879, "learning_rate": 2.450513950742198e-06, "loss": 1.0509, "num_input_tokens_seen": 2469888, "step": 603 }, { "epoch": 0.45007451564828616, "grad_norm": 7.928636637286051, "learning_rate": 2.450350777978581e-06, "loss": 1.4135, "num_input_tokens_seen": 2473984, "step": 604 }, { "epoch": 0.45081967213114754, "grad_norm": 7.253401865767692, "learning_rate": 2.4501873420895025e-06, "loss": 1.3968, "num_input_tokens_seen": 2478080, "step": 605 }, { "epoch": 0.45156482861400893, "grad_norm": 13.059038884215322, "learning_rate": 2.4500236431107883e-06, "loss": 1.3819, "num_input_tokens_seen": 2482176, "step": 606 }, { "epoch": 0.4523099850968703, "grad_norm": 7.909569152398115, "learning_rate": 2.449859681078323e-06, "loss": 1.1977, "num_input_tokens_seen": 2486272, "step": 607 }, { "epoch": 0.45305514157973176, "grad_norm": 8.950244063320092, "learning_rate": 2.4496954560280474e-06, "loss": 1.1876, "num_input_tokens_seen": 2490368, "step": 608 }, { "epoch": 0.45380029806259314, "grad_norm": 7.271840552965769, "learning_rate": 2.449530967995962e-06, "loss": 1.4114, "num_input_tokens_seen": 2494464, "step": 609 }, { "epoch": 0.45454545454545453, "grad_norm": 7.8952303066050025, "learning_rate": 2.4493662170181222e-06, "loss": 1.4661, "num_input_tokens_seen": 2498560, "step": 610 }, { "epoch": 0.455290611028316, "grad_norm": 11.131586509893785, "learning_rate": 2.4492012031306434e-06, "loss": 1.2853, "num_input_tokens_seen": 2502656, "step": 611 }, { "epoch": 0.45603576751117736, "grad_norm": 8.892633629032378, "learning_rate": 2.449035926369698e-06, "loss": 1.1033, "num_input_tokens_seen": 2506752, "step": 612 }, { "epoch": 0.45678092399403875, "grad_norm": 10.852618596985103, "learning_rate": 2.4488703867715156e-06, "loss": 1.259, "num_input_tokens_seen": 2510848, "step": 613 }, { "epoch": 0.45752608047690013, "grad_norm": 7.95105553949311, "learning_rate": 2.4487045843723837e-06, "loss": 1.1026, "num_input_tokens_seen": 2514944, "step": 614 }, { "epoch": 0.4582712369597616, "grad_norm": 8.969377867542438, "learning_rate": 2.448538519208648e-06, "loss": 1.3927, "num_input_tokens_seen": 2519040, "step": 615 }, { "epoch": 0.45901639344262296, "grad_norm": 10.611170853469087, "learning_rate": 2.4483721913167096e-06, "loss": 1.5729, "num_input_tokens_seen": 2523136, "step": 616 }, { "epoch": 0.45976154992548435, "grad_norm": 8.950176893812952, "learning_rate": 2.4482056007330305e-06, "loss": 1.5655, "num_input_tokens_seen": 2527232, "step": 617 }, { "epoch": 0.46050670640834573, "grad_norm": 9.367533798124157, "learning_rate": 2.4480387474941274e-06, "loss": 1.4684, "num_input_tokens_seen": 2531328, "step": 618 }, { "epoch": 0.4612518628912072, "grad_norm": 8.180540687658747, "learning_rate": 2.447871631636576e-06, "loss": 1.3556, "num_input_tokens_seen": 2535424, "step": 619 }, { "epoch": 0.46199701937406856, "grad_norm": 10.539956184788908, "learning_rate": 2.4477042531970096e-06, "loss": 1.2794, "num_input_tokens_seen": 2539520, "step": 620 }, { "epoch": 0.46274217585692995, "grad_norm": 9.957506867084398, "learning_rate": 2.4475366122121188e-06, "loss": 1.1589, "num_input_tokens_seen": 2543616, "step": 621 }, { "epoch": 0.46348733233979134, "grad_norm": 6.862996835740397, "learning_rate": 2.4473687087186508e-06, "loss": 1.411, "num_input_tokens_seen": 2547712, "step": 622 }, { "epoch": 0.4642324888226528, "grad_norm": 7.026393880188151, "learning_rate": 2.447200542753412e-06, "loss": 1.3693, "num_input_tokens_seen": 2551808, "step": 623 }, { "epoch": 0.46497764530551416, "grad_norm": 9.501775288618305, "learning_rate": 2.447032114353266e-06, "loss": 1.1206, "num_input_tokens_seen": 2555904, "step": 624 }, { "epoch": 0.46572280178837555, "grad_norm": 9.120691512020985, "learning_rate": 2.4468634235551325e-06, "loss": 1.2339, "num_input_tokens_seen": 2560000, "step": 625 }, { "epoch": 0.46646795827123694, "grad_norm": 8.614441228625523, "learning_rate": 2.4466944703959903e-06, "loss": 1.5603, "num_input_tokens_seen": 2564096, "step": 626 }, { "epoch": 0.4672131147540984, "grad_norm": 7.278559672398382, "learning_rate": 2.4465252549128746e-06, "loss": 1.213, "num_input_tokens_seen": 2568192, "step": 627 }, { "epoch": 0.46795827123695977, "grad_norm": 9.544621841303895, "learning_rate": 2.4463557771428796e-06, "loss": 1.1963, "num_input_tokens_seen": 2572288, "step": 628 }, { "epoch": 0.46870342771982115, "grad_norm": 17.560508524100488, "learning_rate": 2.4461860371231557e-06, "loss": 1.3465, "num_input_tokens_seen": 2576384, "step": 629 }, { "epoch": 0.46944858420268254, "grad_norm": 8.997502728775093, "learning_rate": 2.4460160348909104e-06, "loss": 1.2459, "num_input_tokens_seen": 2580480, "step": 630 }, { "epoch": 0.470193740685544, "grad_norm": 8.169711030966075, "learning_rate": 2.4458457704834103e-06, "loss": 1.3675, "num_input_tokens_seen": 2584576, "step": 631 }, { "epoch": 0.47093889716840537, "grad_norm": 7.824726312756808, "learning_rate": 2.445675243937978e-06, "loss": 1.1732, "num_input_tokens_seen": 2588672, "step": 632 }, { "epoch": 0.47168405365126675, "grad_norm": 11.411529889689481, "learning_rate": 2.445504455291994e-06, "loss": 1.2998, "num_input_tokens_seen": 2592768, "step": 633 }, { "epoch": 0.47242921013412814, "grad_norm": 7.045205663806833, "learning_rate": 2.4453334045828973e-06, "loss": 0.9837, "num_input_tokens_seen": 2596864, "step": 634 }, { "epoch": 0.4731743666169896, "grad_norm": 13.192350127687238, "learning_rate": 2.445162091848183e-06, "loss": 1.3605, "num_input_tokens_seen": 2600960, "step": 635 }, { "epoch": 0.47391952309985097, "grad_norm": 6.655518690905773, "learning_rate": 2.4449905171254033e-06, "loss": 1.3417, "num_input_tokens_seen": 2605056, "step": 636 }, { "epoch": 0.47466467958271236, "grad_norm": 8.793606050170283, "learning_rate": 2.4448186804521698e-06, "loss": 1.1621, "num_input_tokens_seen": 2609152, "step": 637 }, { "epoch": 0.47540983606557374, "grad_norm": 8.240720936044308, "learning_rate": 2.44464658186615e-06, "loss": 1.1752, "num_input_tokens_seen": 2613248, "step": 638 }, { "epoch": 0.4761549925484352, "grad_norm": 6.791827802668844, "learning_rate": 2.444474221405068e-06, "loss": 1.0429, "num_input_tokens_seen": 2617344, "step": 639 }, { "epoch": 0.47690014903129657, "grad_norm": 12.208536057253449, "learning_rate": 2.444301599106708e-06, "loss": 1.1841, "num_input_tokens_seen": 2621440, "step": 640 }, { "epoch": 0.47764530551415796, "grad_norm": 9.156726419548212, "learning_rate": 2.444128715008909e-06, "loss": 1.2, "num_input_tokens_seen": 2625536, "step": 641 }, { "epoch": 0.4783904619970194, "grad_norm": 13.335211719943032, "learning_rate": 2.4439555691495694e-06, "loss": 1.533, "num_input_tokens_seen": 2629632, "step": 642 }, { "epoch": 0.4791356184798808, "grad_norm": 7.648413806339191, "learning_rate": 2.443782161566643e-06, "loss": 1.0186, "num_input_tokens_seen": 2633728, "step": 643 }, { "epoch": 0.4798807749627422, "grad_norm": 7.734769789010372, "learning_rate": 2.443608492298142e-06, "loss": 1.3541, "num_input_tokens_seen": 2637824, "step": 644 }, { "epoch": 0.48062593144560356, "grad_norm": 9.023243099771564, "learning_rate": 2.4434345613821373e-06, "loss": 1.0941, "num_input_tokens_seen": 2641920, "step": 645 }, { "epoch": 0.481371087928465, "grad_norm": 8.105607379221185, "learning_rate": 2.4432603688567545e-06, "loss": 1.1553, "num_input_tokens_seen": 2646016, "step": 646 }, { "epoch": 0.4821162444113264, "grad_norm": 7.1621270396426375, "learning_rate": 2.443085914760178e-06, "loss": 1.183, "num_input_tokens_seen": 2650112, "step": 647 }, { "epoch": 0.4828614008941878, "grad_norm": 9.138127242392365, "learning_rate": 2.4429111991306498e-06, "loss": 1.3903, "num_input_tokens_seen": 2654208, "step": 648 }, { "epoch": 0.48360655737704916, "grad_norm": 10.311951438413145, "learning_rate": 2.4427362220064686e-06, "loss": 1.2329, "num_input_tokens_seen": 2658304, "step": 649 }, { "epoch": 0.4843517138599106, "grad_norm": 7.938988928645603, "learning_rate": 2.4425609834259904e-06, "loss": 1.3614, "num_input_tokens_seen": 2662400, "step": 650 }, { "epoch": 0.485096870342772, "grad_norm": 8.00588624092202, "learning_rate": 2.4423854834276293e-06, "loss": 1.1476, "num_input_tokens_seen": 2666496, "step": 651 }, { "epoch": 0.4858420268256334, "grad_norm": 7.452980307854229, "learning_rate": 2.442209722049856e-06, "loss": 1.2758, "num_input_tokens_seen": 2670592, "step": 652 }, { "epoch": 0.48658718330849476, "grad_norm": 7.591489478952313, "learning_rate": 2.442033699331198e-06, "loss": 1.3325, "num_input_tokens_seen": 2674688, "step": 653 }, { "epoch": 0.4873323397913562, "grad_norm": 8.170495438338802, "learning_rate": 2.4418574153102416e-06, "loss": 0.9667, "num_input_tokens_seen": 2678784, "step": 654 }, { "epoch": 0.4880774962742176, "grad_norm": 8.050227645296566, "learning_rate": 2.441680870025629e-06, "loss": 1.1164, "num_input_tokens_seen": 2682880, "step": 655 }, { "epoch": 0.488822652757079, "grad_norm": 13.106357657960782, "learning_rate": 2.44150406351606e-06, "loss": 1.5123, "num_input_tokens_seen": 2686976, "step": 656 }, { "epoch": 0.48956780923994037, "grad_norm": 8.544677286108397, "learning_rate": 2.4413269958202926e-06, "loss": 1.2855, "num_input_tokens_seen": 2691072, "step": 657 }, { "epoch": 0.4903129657228018, "grad_norm": 9.374404690611705, "learning_rate": 2.4411496669771408e-06, "loss": 1.2835, "num_input_tokens_seen": 2695168, "step": 658 }, { "epoch": 0.4910581222056632, "grad_norm": 12.88965301598933, "learning_rate": 2.4409720770254765e-06, "loss": 1.3573, "num_input_tokens_seen": 2699264, "step": 659 }, { "epoch": 0.4918032786885246, "grad_norm": 7.970502637974845, "learning_rate": 2.4407942260042284e-06, "loss": 1.2715, "num_input_tokens_seen": 2703360, "step": 660 }, { "epoch": 0.49254843517138597, "grad_norm": 10.83848665980372, "learning_rate": 2.4406161139523834e-06, "loss": 1.0092, "num_input_tokens_seen": 2707456, "step": 661 }, { "epoch": 0.4932935916542474, "grad_norm": 8.683329682361167, "learning_rate": 2.4404377409089837e-06, "loss": 1.1991, "num_input_tokens_seen": 2711552, "step": 662 }, { "epoch": 0.4940387481371088, "grad_norm": 7.680888936074596, "learning_rate": 2.440259106913131e-06, "loss": 1.4246, "num_input_tokens_seen": 2715648, "step": 663 }, { "epoch": 0.4947839046199702, "grad_norm": 38.44048378298633, "learning_rate": 2.440080212003983e-06, "loss": 1.3441, "num_input_tokens_seen": 2719744, "step": 664 }, { "epoch": 0.49552906110283157, "grad_norm": 16.597811090513403, "learning_rate": 2.4399010562207547e-06, "loss": 1.16, "num_input_tokens_seen": 2723840, "step": 665 }, { "epoch": 0.496274217585693, "grad_norm": 8.134865747243012, "learning_rate": 2.439721639602718e-06, "loss": 1.4045, "num_input_tokens_seen": 2727936, "step": 666 }, { "epoch": 0.4970193740685544, "grad_norm": 8.220619743560592, "learning_rate": 2.4395419621892027e-06, "loss": 1.3165, "num_input_tokens_seen": 2732032, "step": 667 }, { "epoch": 0.4977645305514158, "grad_norm": 16.97293240620079, "learning_rate": 2.439362024019595e-06, "loss": 1.6292, "num_input_tokens_seen": 2736128, "step": 668 }, { "epoch": 0.49850968703427717, "grad_norm": 8.146497592990714, "learning_rate": 2.439181825133339e-06, "loss": 1.1544, "num_input_tokens_seen": 2740224, "step": 669 }, { "epoch": 0.4992548435171386, "grad_norm": 7.647563742040859, "learning_rate": 2.4390013655699355e-06, "loss": 1.3072, "num_input_tokens_seen": 2744320, "step": 670 }, { "epoch": 0.5, "grad_norm": 10.70836655335748, "learning_rate": 2.4388206453689422e-06, "loss": 1.3539, "num_input_tokens_seen": 2748416, "step": 671 }, { "epoch": 0.5007451564828614, "grad_norm": 6.909874636614722, "learning_rate": 2.4386396645699745e-06, "loss": 1.606, "num_input_tokens_seen": 2752512, "step": 672 }, { "epoch": 0.5014903129657228, "grad_norm": 7.932803815338134, "learning_rate": 2.438458423212705e-06, "loss": 1.2512, "num_input_tokens_seen": 2756608, "step": 673 }, { "epoch": 0.5022354694485842, "grad_norm": 8.300115345552353, "learning_rate": 2.4382769213368625e-06, "loss": 1.1623, "num_input_tokens_seen": 2760704, "step": 674 }, { "epoch": 0.5029806259314457, "grad_norm": 10.909228439374958, "learning_rate": 2.438095158982234e-06, "loss": 1.1248, "num_input_tokens_seen": 2764800, "step": 675 }, { "epoch": 0.503725782414307, "grad_norm": 6.692747778665486, "learning_rate": 2.437913136188663e-06, "loss": 1.5054, "num_input_tokens_seen": 2768896, "step": 676 }, { "epoch": 0.5044709388971684, "grad_norm": 12.891068288112706, "learning_rate": 2.437730852996049e-06, "loss": 1.2641, "num_input_tokens_seen": 2772992, "step": 677 }, { "epoch": 0.5052160953800298, "grad_norm": 7.527528248737113, "learning_rate": 2.437548309444352e-06, "loss": 1.2964, "num_input_tokens_seen": 2777088, "step": 678 }, { "epoch": 0.5059612518628912, "grad_norm": 6.97985810932308, "learning_rate": 2.437365505573585e-06, "loss": 1.3964, "num_input_tokens_seen": 2781184, "step": 679 }, { "epoch": 0.5067064083457526, "grad_norm": 6.849122521612009, "learning_rate": 2.4371824414238217e-06, "loss": 1.2854, "num_input_tokens_seen": 2785280, "step": 680 }, { "epoch": 0.507451564828614, "grad_norm": 17.872906766867207, "learning_rate": 2.4369991170351893e-06, "loss": 1.2426, "num_input_tokens_seen": 2789376, "step": 681 }, { "epoch": 0.5081967213114754, "grad_norm": 7.153069463374355, "learning_rate": 2.4368155324478743e-06, "loss": 1.1428, "num_input_tokens_seen": 2793472, "step": 682 }, { "epoch": 0.5089418777943369, "grad_norm": 6.558999825317635, "learning_rate": 2.43663168770212e-06, "loss": 1.1919, "num_input_tokens_seen": 2797568, "step": 683 }, { "epoch": 0.5096870342771982, "grad_norm": 9.345981423909034, "learning_rate": 2.436447582838227e-06, "loss": 1.0981, "num_input_tokens_seen": 2801664, "step": 684 }, { "epoch": 0.5104321907600596, "grad_norm": 7.44162226504401, "learning_rate": 2.4362632178965515e-06, "loss": 1.3656, "num_input_tokens_seen": 2805760, "step": 685 }, { "epoch": 0.511177347242921, "grad_norm": 7.793380523542967, "learning_rate": 2.436078592917508e-06, "loss": 1.5296, "num_input_tokens_seen": 2809856, "step": 686 }, { "epoch": 0.5119225037257824, "grad_norm": 7.427699019122299, "learning_rate": 2.4358937079415673e-06, "loss": 1.2666, "num_input_tokens_seen": 2813952, "step": 687 }, { "epoch": 0.5126676602086438, "grad_norm": 8.231437748937797, "learning_rate": 2.435708563009258e-06, "loss": 1.4138, "num_input_tokens_seen": 2818048, "step": 688 }, { "epoch": 0.5134128166915052, "grad_norm": 13.475782024684642, "learning_rate": 2.435523158161165e-06, "loss": 1.3956, "num_input_tokens_seen": 2822144, "step": 689 }, { "epoch": 0.5141579731743666, "grad_norm": 8.151470484591675, "learning_rate": 2.4353374934379306e-06, "loss": 1.2367, "num_input_tokens_seen": 2826240, "step": 690 }, { "epoch": 0.5149031296572281, "grad_norm": 7.9091229638113445, "learning_rate": 2.4351515688802532e-06, "loss": 1.2604, "num_input_tokens_seen": 2830336, "step": 691 }, { "epoch": 0.5156482861400894, "grad_norm": 7.419951843543008, "learning_rate": 2.4349653845288897e-06, "loss": 1.5445, "num_input_tokens_seen": 2834432, "step": 692 }, { "epoch": 0.5163934426229508, "grad_norm": 8.946272499009154, "learning_rate": 2.4347789404246517e-06, "loss": 1.2387, "num_input_tokens_seen": 2838528, "step": 693 }, { "epoch": 0.5171385991058122, "grad_norm": 11.737896824075381, "learning_rate": 2.43459223660841e-06, "loss": 1.2528, "num_input_tokens_seen": 2842624, "step": 694 }, { "epoch": 0.5178837555886736, "grad_norm": 9.547711649757492, "learning_rate": 2.434405273121092e-06, "loss": 1.2713, "num_input_tokens_seen": 2846720, "step": 695 }, { "epoch": 0.518628912071535, "grad_norm": 9.817208847844931, "learning_rate": 2.4342180500036803e-06, "loss": 1.2254, "num_input_tokens_seen": 2850816, "step": 696 }, { "epoch": 0.5193740685543964, "grad_norm": 14.025270096368592, "learning_rate": 2.4340305672972157e-06, "loss": 1.3186, "num_input_tokens_seen": 2854912, "step": 697 }, { "epoch": 0.5201192250372578, "grad_norm": 9.466160262755023, "learning_rate": 2.4338428250427965e-06, "loss": 1.5027, "num_input_tokens_seen": 2859008, "step": 698 }, { "epoch": 0.5208643815201193, "grad_norm": 7.767796828232229, "learning_rate": 2.4336548232815764e-06, "loss": 1.461, "num_input_tokens_seen": 2863104, "step": 699 }, { "epoch": 0.5216095380029806, "grad_norm": 9.248012505705818, "learning_rate": 2.433466562054767e-06, "loss": 1.4051, "num_input_tokens_seen": 2867200, "step": 700 }, { "epoch": 0.522354694485842, "grad_norm": 7.8523070572335225, "learning_rate": 2.4332780414036365e-06, "loss": 1.4564, "num_input_tokens_seen": 2871296, "step": 701 }, { "epoch": 0.5230998509687034, "grad_norm": 8.208494604430113, "learning_rate": 2.43308926136951e-06, "loss": 1.3472, "num_input_tokens_seen": 2875392, "step": 702 }, { "epoch": 0.5238450074515648, "grad_norm": 7.227134813538323, "learning_rate": 2.432900221993769e-06, "loss": 1.1996, "num_input_tokens_seen": 2879488, "step": 703 }, { "epoch": 0.5245901639344263, "grad_norm": 7.654563651147866, "learning_rate": 2.4327109233178533e-06, "loss": 1.3172, "num_input_tokens_seen": 2883584, "step": 704 }, { "epoch": 0.5253353204172876, "grad_norm": 6.365187246570353, "learning_rate": 2.4325213653832573e-06, "loss": 1.4348, "num_input_tokens_seen": 2887680, "step": 705 }, { "epoch": 0.526080476900149, "grad_norm": 10.93576414764661, "learning_rate": 2.4323315482315344e-06, "loss": 1.249, "num_input_tokens_seen": 2891776, "step": 706 }, { "epoch": 0.5268256333830105, "grad_norm": 11.361258834902559, "learning_rate": 2.432141471904294e-06, "loss": 0.8963, "num_input_tokens_seen": 2895872, "step": 707 }, { "epoch": 0.5275707898658718, "grad_norm": 8.081512946081556, "learning_rate": 2.431951136443201e-06, "loss": 1.4897, "num_input_tokens_seen": 2899968, "step": 708 }, { "epoch": 0.5283159463487332, "grad_norm": 7.897180537126288, "learning_rate": 2.4317605418899793e-06, "loss": 1.1677, "num_input_tokens_seen": 2904064, "step": 709 }, { "epoch": 0.5290611028315947, "grad_norm": 7.334143288248464, "learning_rate": 2.431569688286408e-06, "loss": 1.4971, "num_input_tokens_seen": 2908160, "step": 710 }, { "epoch": 0.529806259314456, "grad_norm": 8.559030914821037, "learning_rate": 2.4313785756743242e-06, "loss": 0.9813, "num_input_tokens_seen": 2912256, "step": 711 }, { "epoch": 0.5305514157973175, "grad_norm": 7.7451274589531796, "learning_rate": 2.431187204095621e-06, "loss": 1.0746, "num_input_tokens_seen": 2916352, "step": 712 }, { "epoch": 0.5312965722801788, "grad_norm": 8.844515173762787, "learning_rate": 2.4309955735922476e-06, "loss": 1.3755, "num_input_tokens_seen": 2920448, "step": 713 }, { "epoch": 0.5320417287630402, "grad_norm": 13.731449203941601, "learning_rate": 2.4308036842062118e-06, "loss": 1.2146, "num_input_tokens_seen": 2924544, "step": 714 }, { "epoch": 0.5327868852459017, "grad_norm": 9.367662538468675, "learning_rate": 2.4306115359795768e-06, "loss": 1.1652, "num_input_tokens_seen": 2928640, "step": 715 }, { "epoch": 0.533532041728763, "grad_norm": 8.764953234069145, "learning_rate": 2.4304191289544625e-06, "loss": 1.2002, "num_input_tokens_seen": 2932736, "step": 716 }, { "epoch": 0.5342771982116244, "grad_norm": 11.623315800079082, "learning_rate": 2.4302264631730467e-06, "loss": 1.1663, "num_input_tokens_seen": 2936832, "step": 717 }, { "epoch": 0.5350223546944859, "grad_norm": 7.847199737741046, "learning_rate": 2.430033538677562e-06, "loss": 1.4108, "num_input_tokens_seen": 2940928, "step": 718 }, { "epoch": 0.5357675111773472, "grad_norm": 7.72081590484379, "learning_rate": 2.4298403555102996e-06, "loss": 1.3065, "num_input_tokens_seen": 2945024, "step": 719 }, { "epoch": 0.5365126676602087, "grad_norm": 8.529432048609696, "learning_rate": 2.429646913713607e-06, "loss": 1.2607, "num_input_tokens_seen": 2949120, "step": 720 }, { "epoch": 0.53725782414307, "grad_norm": 8.301575454893008, "learning_rate": 2.429453213329887e-06, "loss": 1.2119, "num_input_tokens_seen": 2953216, "step": 721 }, { "epoch": 0.5380029806259314, "grad_norm": 7.868426632442431, "learning_rate": 2.4292592544016003e-06, "loss": 1.149, "num_input_tokens_seen": 2957312, "step": 722 }, { "epoch": 0.5387481371087929, "grad_norm": 7.053488827131085, "learning_rate": 2.4290650369712647e-06, "loss": 1.1457, "num_input_tokens_seen": 2961408, "step": 723 }, { "epoch": 0.5394932935916542, "grad_norm": 6.118012386854239, "learning_rate": 2.428870561081454e-06, "loss": 1.3743, "num_input_tokens_seen": 2965504, "step": 724 }, { "epoch": 0.5402384500745157, "grad_norm": 7.487576255921088, "learning_rate": 2.428675826774798e-06, "loss": 1.4058, "num_input_tokens_seen": 2969600, "step": 725 }, { "epoch": 0.5409836065573771, "grad_norm": 7.43100420907016, "learning_rate": 2.428480834093984e-06, "loss": 1.5599, "num_input_tokens_seen": 2973696, "step": 726 }, { "epoch": 0.5417287630402384, "grad_norm": 8.64391171568042, "learning_rate": 2.428285583081757e-06, "loss": 1.2899, "num_input_tokens_seen": 2977792, "step": 727 }, { "epoch": 0.5424739195230999, "grad_norm": 8.729575627844108, "learning_rate": 2.4280900737809157e-06, "loss": 1.2234, "num_input_tokens_seen": 2981888, "step": 728 }, { "epoch": 0.5432190760059612, "grad_norm": 6.928552849587398, "learning_rate": 2.427894306234318e-06, "loss": 1.3479, "num_input_tokens_seen": 2985984, "step": 729 }, { "epoch": 0.5439642324888226, "grad_norm": 7.388258838497082, "learning_rate": 2.427698280484878e-06, "loss": 1.2431, "num_input_tokens_seen": 2990080, "step": 730 }, { "epoch": 0.5447093889716841, "grad_norm": 10.883596093131235, "learning_rate": 2.427501996575565e-06, "loss": 1.3404, "num_input_tokens_seen": 2994176, "step": 731 }, { "epoch": 0.5454545454545454, "grad_norm": 8.823970238857685, "learning_rate": 2.4273054545494064e-06, "loss": 0.95, "num_input_tokens_seen": 2998272, "step": 732 }, { "epoch": 0.5461997019374069, "grad_norm": 8.094667192058706, "learning_rate": 2.4271086544494847e-06, "loss": 1.5866, "num_input_tokens_seen": 3002368, "step": 733 }, { "epoch": 0.5469448584202683, "grad_norm": 9.822707578759708, "learning_rate": 2.4269115963189415e-06, "loss": 1.2454, "num_input_tokens_seen": 3006464, "step": 734 }, { "epoch": 0.5476900149031296, "grad_norm": 8.19368693175821, "learning_rate": 2.4267142802009722e-06, "loss": 1.2163, "num_input_tokens_seen": 3010560, "step": 735 }, { "epoch": 0.5484351713859911, "grad_norm": 8.804681225143929, "learning_rate": 2.42651670613883e-06, "loss": 1.0999, "num_input_tokens_seen": 3014656, "step": 736 }, { "epoch": 0.5491803278688525, "grad_norm": 7.133076762583086, "learning_rate": 2.4263188741758254e-06, "loss": 1.294, "num_input_tokens_seen": 3018752, "step": 737 }, { "epoch": 0.5499254843517138, "grad_norm": 9.019689525118999, "learning_rate": 2.4261207843553235e-06, "loss": 1.5013, "num_input_tokens_seen": 3022848, "step": 738 }, { "epoch": 0.5506706408345753, "grad_norm": 7.9722124246094666, "learning_rate": 2.4259224367207475e-06, "loss": 1.3703, "num_input_tokens_seen": 3026944, "step": 739 }, { "epoch": 0.5514157973174366, "grad_norm": 7.281631837826099, "learning_rate": 2.425723831315576e-06, "loss": 1.4053, "num_input_tokens_seen": 3031040, "step": 740 }, { "epoch": 0.5521609538002981, "grad_norm": 10.31140178264083, "learning_rate": 2.425524968183346e-06, "loss": 1.3927, "num_input_tokens_seen": 3035136, "step": 741 }, { "epoch": 0.5529061102831595, "grad_norm": 8.068208930062001, "learning_rate": 2.4253258473676485e-06, "loss": 0.9611, "num_input_tokens_seen": 3039232, "step": 742 }, { "epoch": 0.5536512667660208, "grad_norm": 10.699841153739568, "learning_rate": 2.4251264689121327e-06, "loss": 1.2406, "num_input_tokens_seen": 3043328, "step": 743 }, { "epoch": 0.5543964232488823, "grad_norm": 10.144815518225204, "learning_rate": 2.424926832860504e-06, "loss": 1.1565, "num_input_tokens_seen": 3047424, "step": 744 }, { "epoch": 0.5551415797317437, "grad_norm": 7.3253524102671665, "learning_rate": 2.4247269392565238e-06, "loss": 1.2893, "num_input_tokens_seen": 3051520, "step": 745 }, { "epoch": 0.555886736214605, "grad_norm": 6.879279705618943, "learning_rate": 2.42452678814401e-06, "loss": 1.3199, "num_input_tokens_seen": 3055616, "step": 746 }, { "epoch": 0.5566318926974665, "grad_norm": 8.910091554777308, "learning_rate": 2.4243263795668377e-06, "loss": 1.5452, "num_input_tokens_seen": 3059712, "step": 747 }, { "epoch": 0.5573770491803278, "grad_norm": 7.616328534088989, "learning_rate": 2.4241257135689374e-06, "loss": 1.1645, "num_input_tokens_seen": 3063808, "step": 748 }, { "epoch": 0.5581222056631893, "grad_norm": 8.413825936138887, "learning_rate": 2.4239247901942964e-06, "loss": 1.5502, "num_input_tokens_seen": 3067904, "step": 749 }, { "epoch": 0.5588673621460507, "grad_norm": 10.451028900089744, "learning_rate": 2.423723609486959e-06, "loss": 1.2269, "num_input_tokens_seen": 3072000, "step": 750 }, { "epoch": 0.559612518628912, "grad_norm": 9.178507281515033, "learning_rate": 2.4235221714910252e-06, "loss": 1.3736, "num_input_tokens_seen": 3076096, "step": 751 }, { "epoch": 0.5603576751117735, "grad_norm": 10.174598918254553, "learning_rate": 2.4233204762506518e-06, "loss": 1.1629, "num_input_tokens_seen": 3080192, "step": 752 }, { "epoch": 0.5611028315946349, "grad_norm": 8.163093447931455, "learning_rate": 2.423118523810052e-06, "loss": 1.2592, "num_input_tokens_seen": 3084288, "step": 753 }, { "epoch": 0.5618479880774963, "grad_norm": 8.115832241812559, "learning_rate": 2.4229163142134945e-06, "loss": 1.0912, "num_input_tokens_seen": 3088384, "step": 754 }, { "epoch": 0.5625931445603577, "grad_norm": 6.814619324128837, "learning_rate": 2.4227138475053056e-06, "loss": 1.1611, "num_input_tokens_seen": 3092480, "step": 755 }, { "epoch": 0.563338301043219, "grad_norm": 10.271028806416206, "learning_rate": 2.422511123729868e-06, "loss": 0.9464, "num_input_tokens_seen": 3096576, "step": 756 }, { "epoch": 0.5640834575260805, "grad_norm": 6.930283749628199, "learning_rate": 2.422308142931619e-06, "loss": 1.2653, "num_input_tokens_seen": 3100672, "step": 757 }, { "epoch": 0.5648286140089419, "grad_norm": 7.0645889458009785, "learning_rate": 2.4221049051550544e-06, "loss": 1.3637, "num_input_tokens_seen": 3104768, "step": 758 }, { "epoch": 0.5655737704918032, "grad_norm": 9.639527546743702, "learning_rate": 2.4219014104447248e-06, "loss": 1.0185, "num_input_tokens_seen": 3108864, "step": 759 }, { "epoch": 0.5663189269746647, "grad_norm": 7.221398762932507, "learning_rate": 2.421697658845238e-06, "loss": 1.5107, "num_input_tokens_seen": 3112960, "step": 760 }, { "epoch": 0.5670640834575261, "grad_norm": 15.717859777088522, "learning_rate": 2.421493650401258e-06, "loss": 1.3374, "num_input_tokens_seen": 3117056, "step": 761 }, { "epoch": 0.5678092399403875, "grad_norm": 14.663901583060639, "learning_rate": 2.4212893851575036e-06, "loss": 1.661, "num_input_tokens_seen": 3121152, "step": 762 }, { "epoch": 0.5685543964232489, "grad_norm": 10.561405386850137, "learning_rate": 2.421084863158753e-06, "loss": 1.0845, "num_input_tokens_seen": 3125248, "step": 763 }, { "epoch": 0.5692995529061102, "grad_norm": 10.011054224495465, "learning_rate": 2.420880084449838e-06, "loss": 1.21, "num_input_tokens_seen": 3129344, "step": 764 }, { "epoch": 0.5700447093889717, "grad_norm": 11.014253377622817, "learning_rate": 2.4206750490756476e-06, "loss": 1.2289, "num_input_tokens_seen": 3133440, "step": 765 }, { "epoch": 0.5707898658718331, "grad_norm": 9.265206602775041, "learning_rate": 2.4204697570811274e-06, "loss": 1.0426, "num_input_tokens_seen": 3137536, "step": 766 }, { "epoch": 0.5715350223546944, "grad_norm": 8.575488641785022, "learning_rate": 2.420264208511278e-06, "loss": 1.3052, "num_input_tokens_seen": 3141632, "step": 767 }, { "epoch": 0.5722801788375559, "grad_norm": 7.773986942142005, "learning_rate": 2.420058403411158e-06, "loss": 1.3107, "num_input_tokens_seen": 3145728, "step": 768 }, { "epoch": 0.5730253353204173, "grad_norm": 9.359061034216776, "learning_rate": 2.4198523418258814e-06, "loss": 1.0538, "num_input_tokens_seen": 3149824, "step": 769 }, { "epoch": 0.5737704918032787, "grad_norm": 8.285703750573326, "learning_rate": 2.419646023800617e-06, "loss": 1.2781, "num_input_tokens_seen": 3153920, "step": 770 }, { "epoch": 0.5745156482861401, "grad_norm": 8.53345527966589, "learning_rate": 2.419439449380593e-06, "loss": 1.3699, "num_input_tokens_seen": 3158016, "step": 771 }, { "epoch": 0.5752608047690015, "grad_norm": 7.927411884881437, "learning_rate": 2.419232618611091e-06, "loss": 1.429, "num_input_tokens_seen": 3162112, "step": 772 }, { "epoch": 0.5760059612518629, "grad_norm": 8.591662512928119, "learning_rate": 2.4190255315374496e-06, "loss": 1.1685, "num_input_tokens_seen": 3166208, "step": 773 }, { "epoch": 0.5767511177347243, "grad_norm": 7.24038471584989, "learning_rate": 2.4188181882050645e-06, "loss": 1.1626, "num_input_tokens_seen": 3170304, "step": 774 }, { "epoch": 0.5774962742175856, "grad_norm": 9.42300051739606, "learning_rate": 2.4186105886593862e-06, "loss": 1.2217, "num_input_tokens_seen": 3174400, "step": 775 }, { "epoch": 0.5782414307004471, "grad_norm": 9.667987489535996, "learning_rate": 2.418402732945922e-06, "loss": 1.2758, "num_input_tokens_seen": 3178496, "step": 776 }, { "epoch": 0.5789865871833085, "grad_norm": 8.726440380089828, "learning_rate": 2.4181946211102357e-06, "loss": 1.3064, "num_input_tokens_seen": 3182592, "step": 777 }, { "epoch": 0.5797317436661699, "grad_norm": 7.6413522934814955, "learning_rate": 2.417986253197947e-06, "loss": 1.2424, "num_input_tokens_seen": 3186688, "step": 778 }, { "epoch": 0.5804769001490313, "grad_norm": 7.999662131671186, "learning_rate": 2.4177776292547316e-06, "loss": 1.428, "num_input_tokens_seen": 3190784, "step": 779 }, { "epoch": 0.5812220566318927, "grad_norm": 7.726847159280827, "learning_rate": 2.417568749326321e-06, "loss": 1.3414, "num_input_tokens_seen": 3194880, "step": 780 }, { "epoch": 0.5819672131147541, "grad_norm": 8.17448383643462, "learning_rate": 2.4173596134585034e-06, "loss": 1.0248, "num_input_tokens_seen": 3198976, "step": 781 }, { "epoch": 0.5827123695976155, "grad_norm": 8.30345947813171, "learning_rate": 2.4171502216971226e-06, "loss": 0.9261, "num_input_tokens_seen": 3203072, "step": 782 }, { "epoch": 0.5834575260804769, "grad_norm": 12.094332718284596, "learning_rate": 2.4169405740880793e-06, "loss": 1.3307, "num_input_tokens_seen": 3207168, "step": 783 }, { "epoch": 0.5842026825633383, "grad_norm": 6.454334284549076, "learning_rate": 2.4167306706773292e-06, "loss": 1.2326, "num_input_tokens_seen": 3211264, "step": 784 }, { "epoch": 0.5849478390461997, "grad_norm": 9.366773916303528, "learning_rate": 2.4165205115108854e-06, "loss": 0.9757, "num_input_tokens_seen": 3215360, "step": 785 }, { "epoch": 0.5856929955290611, "grad_norm": 9.055453602402164, "learning_rate": 2.416310096634815e-06, "loss": 1.2443, "num_input_tokens_seen": 3219456, "step": 786 }, { "epoch": 0.5864381520119225, "grad_norm": 8.96131637591768, "learning_rate": 2.4160994260952436e-06, "loss": 1.1751, "num_input_tokens_seen": 3223552, "step": 787 }, { "epoch": 0.587183308494784, "grad_norm": 7.37942352183592, "learning_rate": 2.4158884999383515e-06, "loss": 1.3353, "num_input_tokens_seen": 3227648, "step": 788 }, { "epoch": 0.5879284649776453, "grad_norm": 6.630200823614139, "learning_rate": 2.4156773182103747e-06, "loss": 0.9832, "num_input_tokens_seen": 3231744, "step": 789 }, { "epoch": 0.5886736214605067, "grad_norm": 7.777015634141862, "learning_rate": 2.4154658809576066e-06, "loss": 1.2529, "num_input_tokens_seen": 3235840, "step": 790 }, { "epoch": 0.589418777943368, "grad_norm": 14.46489930574269, "learning_rate": 2.4152541882263948e-06, "loss": 1.7215, "num_input_tokens_seen": 3239936, "step": 791 }, { "epoch": 0.5901639344262295, "grad_norm": 8.484047556106558, "learning_rate": 2.415042240063144e-06, "loss": 1.2363, "num_input_tokens_seen": 3244032, "step": 792 }, { "epoch": 0.5909090909090909, "grad_norm": 11.407302752356564, "learning_rate": 2.4148300365143157e-06, "loss": 1.3795, "num_input_tokens_seen": 3248128, "step": 793 }, { "epoch": 0.5916542473919523, "grad_norm": 8.082070397518743, "learning_rate": 2.414617577626425e-06, "loss": 1.5385, "num_input_tokens_seen": 3252224, "step": 794 }, { "epoch": 0.5923994038748137, "grad_norm": 10.068525232711401, "learning_rate": 2.414404863446046e-06, "loss": 1.3681, "num_input_tokens_seen": 3256320, "step": 795 }, { "epoch": 0.5931445603576752, "grad_norm": 7.5229388375327355, "learning_rate": 2.414191894019806e-06, "loss": 1.4527, "num_input_tokens_seen": 3260416, "step": 796 }, { "epoch": 0.5938897168405365, "grad_norm": 8.150265532566472, "learning_rate": 2.4139786693943897e-06, "loss": 1.3565, "num_input_tokens_seen": 3264512, "step": 797 }, { "epoch": 0.5946348733233979, "grad_norm": 7.373517274217931, "learning_rate": 2.4137651896165374e-06, "loss": 1.046, "num_input_tokens_seen": 3268608, "step": 798 }, { "epoch": 0.5953800298062594, "grad_norm": 7.423033021259694, "learning_rate": 2.4135514547330455e-06, "loss": 1.4138, "num_input_tokens_seen": 3272704, "step": 799 }, { "epoch": 0.5961251862891207, "grad_norm": 8.256385595515013, "learning_rate": 2.4133374647907664e-06, "loss": 1.4083, "num_input_tokens_seen": 3276800, "step": 800 }, { "epoch": 0.5968703427719821, "grad_norm": 8.104737297932656, "learning_rate": 2.413123219836608e-06, "loss": 1.0277, "num_input_tokens_seen": 3280896, "step": 801 }, { "epoch": 0.5976154992548435, "grad_norm": 7.6803016235153025, "learning_rate": 2.4129087199175342e-06, "loss": 1.3456, "num_input_tokens_seen": 3284992, "step": 802 }, { "epoch": 0.5983606557377049, "grad_norm": 8.663846857978633, "learning_rate": 2.4126939650805658e-06, "loss": 1.3122, "num_input_tokens_seen": 3289088, "step": 803 }, { "epoch": 0.5991058122205664, "grad_norm": 7.737736516759461, "learning_rate": 2.412478955372777e-06, "loss": 1.0205, "num_input_tokens_seen": 3293184, "step": 804 }, { "epoch": 0.5998509687034277, "grad_norm": 7.98085397385934, "learning_rate": 2.4122636908413006e-06, "loss": 1.0479, "num_input_tokens_seen": 3297280, "step": 805 }, { "epoch": 0.6005961251862891, "grad_norm": 8.329794580572706, "learning_rate": 2.412048171533324e-06, "loss": 1.0664, "num_input_tokens_seen": 3301376, "step": 806 }, { "epoch": 0.6013412816691506, "grad_norm": 8.097610754367546, "learning_rate": 2.4118323974960907e-06, "loss": 1.2646, "num_input_tokens_seen": 3305472, "step": 807 }, { "epoch": 0.6020864381520119, "grad_norm": 10.453572903978277, "learning_rate": 2.411616368776899e-06, "loss": 0.8616, "num_input_tokens_seen": 3309568, "step": 808 }, { "epoch": 0.6028315946348733, "grad_norm": 10.229559615856548, "learning_rate": 2.411400085423105e-06, "loss": 1.0925, "num_input_tokens_seen": 3313664, "step": 809 }, { "epoch": 0.6035767511177347, "grad_norm": 8.449207580904245, "learning_rate": 2.411183547482119e-06, "loss": 1.0549, "num_input_tokens_seen": 3317760, "step": 810 }, { "epoch": 0.6043219076005961, "grad_norm": 9.004001626066996, "learning_rate": 2.410966755001408e-06, "loss": 1.3051, "num_input_tokens_seen": 3321856, "step": 811 }, { "epoch": 0.6050670640834576, "grad_norm": 9.346449649022095, "learning_rate": 2.4107497080284937e-06, "loss": 1.3321, "num_input_tokens_seen": 3325952, "step": 812 }, { "epoch": 0.6058122205663189, "grad_norm": 9.585336936029453, "learning_rate": 2.4105324066109556e-06, "loss": 1.0523, "num_input_tokens_seen": 3330048, "step": 813 }, { "epoch": 0.6065573770491803, "grad_norm": 7.393143487178342, "learning_rate": 2.4103148507964264e-06, "loss": 1.1885, "num_input_tokens_seen": 3334144, "step": 814 }, { "epoch": 0.6073025335320418, "grad_norm": 7.748466980673851, "learning_rate": 2.410097040632597e-06, "loss": 1.5439, "num_input_tokens_seen": 3338240, "step": 815 }, { "epoch": 0.6080476900149031, "grad_norm": 12.810468271327707, "learning_rate": 2.4098789761672127e-06, "loss": 1.3573, "num_input_tokens_seen": 3342336, "step": 816 }, { "epoch": 0.6087928464977646, "grad_norm": 9.436804918213898, "learning_rate": 2.409660657448074e-06, "loss": 1.3817, "num_input_tokens_seen": 3346432, "step": 817 }, { "epoch": 0.6095380029806259, "grad_norm": 8.561875746102803, "learning_rate": 2.4094420845230386e-06, "loss": 1.0828, "num_input_tokens_seen": 3350528, "step": 818 }, { "epoch": 0.6102831594634873, "grad_norm": 10.01936789624186, "learning_rate": 2.409223257440019e-06, "loss": 1.2243, "num_input_tokens_seen": 3354624, "step": 819 }, { "epoch": 0.6110283159463488, "grad_norm": 6.661521385730516, "learning_rate": 2.409004176246984e-06, "loss": 1.3524, "num_input_tokens_seen": 3358720, "step": 820 }, { "epoch": 0.6117734724292101, "grad_norm": 9.293837735351175, "learning_rate": 2.4087848409919575e-06, "loss": 1.1358, "num_input_tokens_seen": 3362816, "step": 821 }, { "epoch": 0.6125186289120715, "grad_norm": 8.685529888267228, "learning_rate": 2.4085652517230192e-06, "loss": 1.3048, "num_input_tokens_seen": 3366912, "step": 822 }, { "epoch": 0.613263785394933, "grad_norm": 9.424099515173399, "learning_rate": 2.408345408488305e-06, "loss": 1.4542, "num_input_tokens_seen": 3371008, "step": 823 }, { "epoch": 0.6140089418777943, "grad_norm": 10.866916584067104, "learning_rate": 2.408125311336006e-06, "loss": 1.0713, "num_input_tokens_seen": 3375104, "step": 824 }, { "epoch": 0.6147540983606558, "grad_norm": 8.528721035945052, "learning_rate": 2.407904960314369e-06, "loss": 1.5038, "num_input_tokens_seen": 3379200, "step": 825 }, { "epoch": 0.6154992548435171, "grad_norm": 7.4071761969378205, "learning_rate": 2.4076843554716965e-06, "loss": 1.1769, "num_input_tokens_seen": 3383296, "step": 826 }, { "epoch": 0.6162444113263785, "grad_norm": 7.9923974037336825, "learning_rate": 2.4074634968563465e-06, "loss": 1.0399, "num_input_tokens_seen": 3387392, "step": 827 }, { "epoch": 0.61698956780924, "grad_norm": 12.203011158907104, "learning_rate": 2.407242384516733e-06, "loss": 1.3285, "num_input_tokens_seen": 3391488, "step": 828 }, { "epoch": 0.6177347242921013, "grad_norm": 12.066179520512625, "learning_rate": 2.4070210185013256e-06, "loss": 1.397, "num_input_tokens_seen": 3395584, "step": 829 }, { "epoch": 0.6184798807749627, "grad_norm": 7.99979684956724, "learning_rate": 2.406799398858649e-06, "loss": 1.2062, "num_input_tokens_seen": 3399680, "step": 830 }, { "epoch": 0.6192250372578242, "grad_norm": 43.27269426872899, "learning_rate": 2.4065775256372835e-06, "loss": 1.4808, "num_input_tokens_seen": 3403776, "step": 831 }, { "epoch": 0.6199701937406855, "grad_norm": 9.25456079082192, "learning_rate": 2.406355398885866e-06, "loss": 1.2355, "num_input_tokens_seen": 3407872, "step": 832 }, { "epoch": 0.620715350223547, "grad_norm": 8.815847811192304, "learning_rate": 2.406133018653088e-06, "loss": 1.3289, "num_input_tokens_seen": 3411968, "step": 833 }, { "epoch": 0.6214605067064084, "grad_norm": 11.19995551152509, "learning_rate": 2.405910384987697e-06, "loss": 1.247, "num_input_tokens_seen": 3416064, "step": 834 }, { "epoch": 0.6222056631892697, "grad_norm": 8.115483623456946, "learning_rate": 2.405687497938495e-06, "loss": 1.3754, "num_input_tokens_seen": 3420160, "step": 835 }, { "epoch": 0.6229508196721312, "grad_norm": 10.693755167360994, "learning_rate": 2.4054643575543414e-06, "loss": 0.8989, "num_input_tokens_seen": 3424256, "step": 836 }, { "epoch": 0.6236959761549925, "grad_norm": 7.968589476209927, "learning_rate": 2.4052409638841503e-06, "loss": 1.2362, "num_input_tokens_seen": 3428352, "step": 837 }, { "epoch": 0.624441132637854, "grad_norm": 8.960144038780973, "learning_rate": 2.4050173169768903e-06, "loss": 1.2858, "num_input_tokens_seen": 3432448, "step": 838 }, { "epoch": 0.6251862891207154, "grad_norm": 7.6077146301774, "learning_rate": 2.404793416881587e-06, "loss": 1.3605, "num_input_tokens_seen": 3436544, "step": 839 }, { "epoch": 0.6259314456035767, "grad_norm": 8.696465411339856, "learning_rate": 2.4045692636473206e-06, "loss": 0.9805, "num_input_tokens_seen": 3440640, "step": 840 }, { "epoch": 0.6266766020864382, "grad_norm": 9.25215694380985, "learning_rate": 2.4043448573232273e-06, "loss": 1.2875, "num_input_tokens_seen": 3444736, "step": 841 }, { "epoch": 0.6274217585692996, "grad_norm": 8.445891181474625, "learning_rate": 2.404120197958499e-06, "loss": 1.18, "num_input_tokens_seen": 3448832, "step": 842 }, { "epoch": 0.6281669150521609, "grad_norm": 7.937558841479126, "learning_rate": 2.4038952856023816e-06, "loss": 1.0681, "num_input_tokens_seen": 3452928, "step": 843 }, { "epoch": 0.6289120715350224, "grad_norm": 12.9319814974292, "learning_rate": 2.403670120304178e-06, "loss": 1.157, "num_input_tokens_seen": 3457024, "step": 844 }, { "epoch": 0.6296572280178837, "grad_norm": 13.147647760535868, "learning_rate": 2.403444702113246e-06, "loss": 1.3828, "num_input_tokens_seen": 3461120, "step": 845 }, { "epoch": 0.6304023845007451, "grad_norm": 9.896358755065346, "learning_rate": 2.4032190310789996e-06, "loss": 1.6265, "num_input_tokens_seen": 3465216, "step": 846 }, { "epoch": 0.6311475409836066, "grad_norm": 7.500320238852226, "learning_rate": 2.4029931072509066e-06, "loss": 0.9738, "num_input_tokens_seen": 3469312, "step": 847 }, { "epoch": 0.6318926974664679, "grad_norm": 6.368409270894809, "learning_rate": 2.4027669306784914e-06, "loss": 1.4362, "num_input_tokens_seen": 3473408, "step": 848 }, { "epoch": 0.6326378539493294, "grad_norm": 34.29633272834881, "learning_rate": 2.4025405014113333e-06, "loss": 1.9489, "num_input_tokens_seen": 3477504, "step": 849 }, { "epoch": 0.6333830104321908, "grad_norm": 9.018115699620685, "learning_rate": 2.4023138194990677e-06, "loss": 1.211, "num_input_tokens_seen": 3481600, "step": 850 }, { "epoch": 0.6341281669150521, "grad_norm": 36.43290031845286, "learning_rate": 2.402086884991384e-06, "loss": 1.6616, "num_input_tokens_seen": 3485696, "step": 851 }, { "epoch": 0.6348733233979136, "grad_norm": 9.048623311132234, "learning_rate": 2.401859697938029e-06, "loss": 1.2117, "num_input_tokens_seen": 3489792, "step": 852 }, { "epoch": 0.6356184798807749, "grad_norm": 8.630096611981266, "learning_rate": 2.401632258388803e-06, "loss": 1.2683, "num_input_tokens_seen": 3493888, "step": 853 }, { "epoch": 0.6363636363636364, "grad_norm": 7.1566476492262785, "learning_rate": 2.4014045663935627e-06, "loss": 1.4729, "num_input_tokens_seen": 3497984, "step": 854 }, { "epoch": 0.6371087928464978, "grad_norm": 10.448564278895063, "learning_rate": 2.4011766220022197e-06, "loss": 1.0317, "num_input_tokens_seen": 3502080, "step": 855 }, { "epoch": 0.6378539493293591, "grad_norm": 9.968834823425928, "learning_rate": 2.400948425264741e-06, "loss": 1.3454, "num_input_tokens_seen": 3506176, "step": 856 }, { "epoch": 0.6385991058122206, "grad_norm": 8.184231574933637, "learning_rate": 2.400719976231149e-06, "loss": 1.4704, "num_input_tokens_seen": 3510272, "step": 857 }, { "epoch": 0.639344262295082, "grad_norm": 8.499059777943755, "learning_rate": 2.4004912749515213e-06, "loss": 1.4298, "num_input_tokens_seen": 3514368, "step": 858 }, { "epoch": 0.6400894187779433, "grad_norm": 7.677260552079338, "learning_rate": 2.400262321475991e-06, "loss": 1.129, "num_input_tokens_seen": 3518464, "step": 859 }, { "epoch": 0.6408345752608048, "grad_norm": 8.899240300998207, "learning_rate": 2.400033115854746e-06, "loss": 1.337, "num_input_tokens_seen": 3522560, "step": 860 }, { "epoch": 0.6415797317436661, "grad_norm": 9.93222925944489, "learning_rate": 2.39980365813803e-06, "loss": 1.027, "num_input_tokens_seen": 3526656, "step": 861 }, { "epoch": 0.6423248882265276, "grad_norm": 8.301889216445073, "learning_rate": 2.399573948376142e-06, "loss": 1.116, "num_input_tokens_seen": 3530752, "step": 862 }, { "epoch": 0.643070044709389, "grad_norm": 7.454314699812229, "learning_rate": 2.3993439866194358e-06, "loss": 1.1201, "num_input_tokens_seen": 3534848, "step": 863 }, { "epoch": 0.6438152011922503, "grad_norm": 9.59816944752326, "learning_rate": 2.3991137729183205e-06, "loss": 1.3722, "num_input_tokens_seen": 3538944, "step": 864 }, { "epoch": 0.6445603576751118, "grad_norm": 7.724052036175854, "learning_rate": 2.3988833073232614e-06, "loss": 1.1371, "num_input_tokens_seen": 3543040, "step": 865 }, { "epoch": 0.6453055141579732, "grad_norm": 7.3316151202514614, "learning_rate": 2.3986525898847766e-06, "loss": 1.1571, "num_input_tokens_seen": 3547136, "step": 866 }, { "epoch": 0.6460506706408345, "grad_norm": 17.006274808532908, "learning_rate": 2.3984216206534426e-06, "loss": 1.5149, "num_input_tokens_seen": 3551232, "step": 867 }, { "epoch": 0.646795827123696, "grad_norm": 9.163018369596333, "learning_rate": 2.398190399679889e-06, "loss": 1.1092, "num_input_tokens_seen": 3555328, "step": 868 }, { "epoch": 0.6475409836065574, "grad_norm": 9.434013533806743, "learning_rate": 2.3979589270148006e-06, "loss": 0.9001, "num_input_tokens_seen": 3559424, "step": 869 }, { "epoch": 0.6482861400894188, "grad_norm": 10.777458163262649, "learning_rate": 2.3977272027089184e-06, "loss": 1.3023, "num_input_tokens_seen": 3563520, "step": 870 }, { "epoch": 0.6490312965722802, "grad_norm": 17.474000771227857, "learning_rate": 2.397495226813038e-06, "loss": 1.3731, "num_input_tokens_seen": 3567616, "step": 871 }, { "epoch": 0.6497764530551415, "grad_norm": 6.7392269848172, "learning_rate": 2.39726299937801e-06, "loss": 1.08, "num_input_tokens_seen": 3571712, "step": 872 }, { "epoch": 0.650521609538003, "grad_norm": 18.456126169575235, "learning_rate": 2.39703052045474e-06, "loss": 1.35, "num_input_tokens_seen": 3575808, "step": 873 }, { "epoch": 0.6512667660208644, "grad_norm": 7.558387725095291, "learning_rate": 2.3967977900941895e-06, "loss": 1.6071, "num_input_tokens_seen": 3579904, "step": 874 }, { "epoch": 0.6520119225037257, "grad_norm": 7.235645478251111, "learning_rate": 2.396564808347375e-06, "loss": 1.6211, "num_input_tokens_seen": 3584000, "step": 875 }, { "epoch": 0.6527570789865872, "grad_norm": 7.522034917282678, "learning_rate": 2.3963315752653666e-06, "loss": 1.3252, "num_input_tokens_seen": 3588096, "step": 876 }, { "epoch": 0.6535022354694486, "grad_norm": 10.372937952424993, "learning_rate": 2.396098090899292e-06, "loss": 1.256, "num_input_tokens_seen": 3592192, "step": 877 }, { "epoch": 0.65424739195231, "grad_norm": 7.110599491566196, "learning_rate": 2.395864355300332e-06, "loss": 1.4644, "num_input_tokens_seen": 3596288, "step": 878 }, { "epoch": 0.6549925484351714, "grad_norm": 7.919589135816411, "learning_rate": 2.3956303685197225e-06, "loss": 1.3268, "num_input_tokens_seen": 3600384, "step": 879 }, { "epoch": 0.6557377049180327, "grad_norm": 8.094101117191062, "learning_rate": 2.3953961306087563e-06, "loss": 1.3351, "num_input_tokens_seen": 3604480, "step": 880 }, { "epoch": 0.6564828614008942, "grad_norm": 7.552416463666009, "learning_rate": 2.3951616416187788e-06, "loss": 1.4455, "num_input_tokens_seen": 3608576, "step": 881 }, { "epoch": 0.6572280178837556, "grad_norm": 7.923295475384383, "learning_rate": 2.394926901601193e-06, "loss": 1.5285, "num_input_tokens_seen": 3612672, "step": 882 }, { "epoch": 0.657973174366617, "grad_norm": 6.629990672015069, "learning_rate": 2.394691910607454e-06, "loss": 0.969, "num_input_tokens_seen": 3616768, "step": 883 }, { "epoch": 0.6587183308494784, "grad_norm": 8.108014364300661, "learning_rate": 2.3944566686890753e-06, "loss": 1.3137, "num_input_tokens_seen": 3620864, "step": 884 }, { "epoch": 0.6594634873323398, "grad_norm": 10.889231514605148, "learning_rate": 2.394221175897622e-06, "loss": 1.1338, "num_input_tokens_seen": 3624960, "step": 885 }, { "epoch": 0.6602086438152012, "grad_norm": 7.749037455556691, "learning_rate": 2.3939854322847165e-06, "loss": 1.3286, "num_input_tokens_seen": 3629056, "step": 886 }, { "epoch": 0.6609538002980626, "grad_norm": 7.194735423155945, "learning_rate": 2.393749437902036e-06, "loss": 1.4376, "num_input_tokens_seen": 3633152, "step": 887 }, { "epoch": 0.6616989567809239, "grad_norm": 6.722275833787233, "learning_rate": 2.393513192801311e-06, "loss": 1.4718, "num_input_tokens_seen": 3637248, "step": 888 }, { "epoch": 0.6624441132637854, "grad_norm": 7.510416534991, "learning_rate": 2.393276697034329e-06, "loss": 1.4197, "num_input_tokens_seen": 3641344, "step": 889 }, { "epoch": 0.6631892697466468, "grad_norm": 6.667114948784582, "learning_rate": 2.3930399506529316e-06, "loss": 1.4711, "num_input_tokens_seen": 3645440, "step": 890 }, { "epoch": 0.6639344262295082, "grad_norm": 9.132287126614644, "learning_rate": 2.3928029537090146e-06, "loss": 1.0811, "num_input_tokens_seen": 3649536, "step": 891 }, { "epoch": 0.6646795827123696, "grad_norm": 8.153245349449438, "learning_rate": 2.39256570625453e-06, "loss": 1.3585, "num_input_tokens_seen": 3653632, "step": 892 }, { "epoch": 0.665424739195231, "grad_norm": 7.9715680909185735, "learning_rate": 2.3923282083414837e-06, "loss": 1.08, "num_input_tokens_seen": 3657728, "step": 893 }, { "epoch": 0.6661698956780924, "grad_norm": 7.669213718273323, "learning_rate": 2.3920904600219374e-06, "loss": 1.5732, "num_input_tokens_seen": 3661824, "step": 894 }, { "epoch": 0.6669150521609538, "grad_norm": 8.87470635456334, "learning_rate": 2.3918524613480066e-06, "loss": 1.4815, "num_input_tokens_seen": 3665920, "step": 895 }, { "epoch": 0.6676602086438153, "grad_norm": 7.024475137093173, "learning_rate": 2.391614212371863e-06, "loss": 1.4483, "num_input_tokens_seen": 3670016, "step": 896 }, { "epoch": 0.6684053651266766, "grad_norm": 8.165344543356836, "learning_rate": 2.391375713145732e-06, "loss": 1.2528, "num_input_tokens_seen": 3674112, "step": 897 }, { "epoch": 0.669150521609538, "grad_norm": 6.835677510792445, "learning_rate": 2.391136963721895e-06, "loss": 1.3198, "num_input_tokens_seen": 3678208, "step": 898 }, { "epoch": 0.6698956780923994, "grad_norm": 9.447580708657178, "learning_rate": 2.390897964152687e-06, "loss": 1.0639, "num_input_tokens_seen": 3682304, "step": 899 }, { "epoch": 0.6706408345752608, "grad_norm": 9.406003313234496, "learning_rate": 2.390658714490498e-06, "loss": 1.1768, "num_input_tokens_seen": 3686400, "step": 900 }, { "epoch": 0.6713859910581222, "grad_norm": 7.536966353096618, "learning_rate": 2.390419214787774e-06, "loss": 1.5266, "num_input_tokens_seen": 3690496, "step": 901 }, { "epoch": 0.6721311475409836, "grad_norm": 10.245312375959912, "learning_rate": 2.3901794650970153e-06, "loss": 1.4232, "num_input_tokens_seen": 3694592, "step": 902 }, { "epoch": 0.672876304023845, "grad_norm": 7.736669010999861, "learning_rate": 2.389939465470776e-06, "loss": 1.4031, "num_input_tokens_seen": 3698688, "step": 903 }, { "epoch": 0.6736214605067065, "grad_norm": 10.486506969999375, "learning_rate": 2.3896992159616657e-06, "loss": 1.1351, "num_input_tokens_seen": 3702784, "step": 904 }, { "epoch": 0.6743666169895678, "grad_norm": 7.14948819389433, "learning_rate": 2.3894587166223493e-06, "loss": 1.0706, "num_input_tokens_seen": 3706880, "step": 905 }, { "epoch": 0.6751117734724292, "grad_norm": 7.852902389024853, "learning_rate": 2.3892179675055457e-06, "loss": 1.3011, "num_input_tokens_seen": 3710976, "step": 906 }, { "epoch": 0.6758569299552906, "grad_norm": 6.99071760227588, "learning_rate": 2.3889769686640283e-06, "loss": 1.1733, "num_input_tokens_seen": 3715072, "step": 907 }, { "epoch": 0.676602086438152, "grad_norm": 6.957698184806204, "learning_rate": 2.388735720150627e-06, "loss": 1.1269, "num_input_tokens_seen": 3719168, "step": 908 }, { "epoch": 0.6773472429210134, "grad_norm": 6.529853930631925, "learning_rate": 2.3884942220182244e-06, "loss": 1.3924, "num_input_tokens_seen": 3723264, "step": 909 }, { "epoch": 0.6780923994038748, "grad_norm": 7.880209891411649, "learning_rate": 2.388252474319759e-06, "loss": 1.4298, "num_input_tokens_seen": 3727360, "step": 910 }, { "epoch": 0.6788375558867362, "grad_norm": 7.877298906040307, "learning_rate": 2.388010477108223e-06, "loss": 1.0164, "num_input_tokens_seen": 3731456, "step": 911 }, { "epoch": 0.6795827123695977, "grad_norm": 8.161919576604292, "learning_rate": 2.3877682304366645e-06, "loss": 1.2606, "num_input_tokens_seen": 3735552, "step": 912 }, { "epoch": 0.680327868852459, "grad_norm": 8.071943963506747, "learning_rate": 2.3875257343581856e-06, "loss": 1.3982, "num_input_tokens_seen": 3739648, "step": 913 }, { "epoch": 0.6810730253353204, "grad_norm": 9.71768149129406, "learning_rate": 2.387282988925943e-06, "loss": 1.2133, "num_input_tokens_seen": 3743744, "step": 914 }, { "epoch": 0.6818181818181818, "grad_norm": 7.973837662034686, "learning_rate": 2.3870399941931483e-06, "loss": 1.6279, "num_input_tokens_seen": 3747840, "step": 915 }, { "epoch": 0.6825633383010432, "grad_norm": 18.228324049889707, "learning_rate": 2.3867967502130674e-06, "loss": 1.4502, "num_input_tokens_seen": 3751936, "step": 916 }, { "epoch": 0.6833084947839047, "grad_norm": 8.497616989885005, "learning_rate": 2.3865532570390214e-06, "loss": 1.2153, "num_input_tokens_seen": 3756032, "step": 917 }, { "epoch": 0.684053651266766, "grad_norm": 8.668634359535769, "learning_rate": 2.3863095147243863e-06, "loss": 1.2165, "num_input_tokens_seen": 3760128, "step": 918 }, { "epoch": 0.6847988077496274, "grad_norm": 9.390167714394034, "learning_rate": 2.386065523322591e-06, "loss": 1.13, "num_input_tokens_seen": 3764224, "step": 919 }, { "epoch": 0.6855439642324889, "grad_norm": 11.170126556391336, "learning_rate": 2.3858212828871213e-06, "loss": 1.3527, "num_input_tokens_seen": 3768320, "step": 920 }, { "epoch": 0.6862891207153502, "grad_norm": 7.081376473704773, "learning_rate": 2.3855767934715154e-06, "loss": 1.3477, "num_input_tokens_seen": 3772416, "step": 921 }, { "epoch": 0.6870342771982116, "grad_norm": 9.06765615527657, "learning_rate": 2.385332055129368e-06, "loss": 1.2423, "num_input_tokens_seen": 3776512, "step": 922 }, { "epoch": 0.687779433681073, "grad_norm": 8.395241879784905, "learning_rate": 2.3850870679143268e-06, "loss": 1.1732, "num_input_tokens_seen": 3780608, "step": 923 }, { "epoch": 0.6885245901639344, "grad_norm": 6.97825187673511, "learning_rate": 2.3848418318800954e-06, "loss": 1.2838, "num_input_tokens_seen": 3784704, "step": 924 }, { "epoch": 0.6892697466467959, "grad_norm": 11.09717578619764, "learning_rate": 2.384596347080431e-06, "loss": 1.463, "num_input_tokens_seen": 3788800, "step": 925 }, { "epoch": 0.6900149031296572, "grad_norm": 8.085866219827496, "learning_rate": 2.384350613569145e-06, "loss": 1.1807, "num_input_tokens_seen": 3792896, "step": 926 }, { "epoch": 0.6907600596125186, "grad_norm": 10.970244398448774, "learning_rate": 2.384104631400105e-06, "loss": 1.1445, "num_input_tokens_seen": 3796992, "step": 927 }, { "epoch": 0.6915052160953801, "grad_norm": 6.239108820505788, "learning_rate": 2.3838584006272313e-06, "loss": 0.9957, "num_input_tokens_seen": 3801088, "step": 928 }, { "epoch": 0.6922503725782414, "grad_norm": 7.059372074554579, "learning_rate": 2.3836119213045e-06, "loss": 1.0374, "num_input_tokens_seen": 3805184, "step": 929 }, { "epoch": 0.6929955290611028, "grad_norm": 8.462434481256834, "learning_rate": 2.383365193485941e-06, "loss": 0.9511, "num_input_tokens_seen": 3809280, "step": 930 }, { "epoch": 0.6937406855439643, "grad_norm": 8.482635698078644, "learning_rate": 2.3831182172256377e-06, "loss": 1.1104, "num_input_tokens_seen": 3813376, "step": 931 }, { "epoch": 0.6944858420268256, "grad_norm": 7.861771801389365, "learning_rate": 2.382870992577731e-06, "loss": 1.3981, "num_input_tokens_seen": 3817472, "step": 932 }, { "epoch": 0.6952309985096871, "grad_norm": 20.275725096260825, "learning_rate": 2.382623519596413e-06, "loss": 1.299, "num_input_tokens_seen": 3821568, "step": 933 }, { "epoch": 0.6959761549925484, "grad_norm": 8.695417670157841, "learning_rate": 2.3823757983359315e-06, "loss": 1.4596, "num_input_tokens_seen": 3825664, "step": 934 }, { "epoch": 0.6967213114754098, "grad_norm": 6.311349633671244, "learning_rate": 2.3821278288505894e-06, "loss": 1.4286, "num_input_tokens_seen": 3829760, "step": 935 }, { "epoch": 0.6974664679582713, "grad_norm": 9.068090365426444, "learning_rate": 2.3818796111947433e-06, "loss": 1.2375, "num_input_tokens_seen": 3833856, "step": 936 }, { "epoch": 0.6982116244411326, "grad_norm": 7.989581809491096, "learning_rate": 2.3816311454228037e-06, "loss": 1.1216, "num_input_tokens_seen": 3837952, "step": 937 }, { "epoch": 0.698956780923994, "grad_norm": 6.591071306153274, "learning_rate": 2.381382431589237e-06, "loss": 1.353, "num_input_tokens_seen": 3842048, "step": 938 }, { "epoch": 0.6997019374068555, "grad_norm": 6.7864093703736055, "learning_rate": 2.381133469748562e-06, "loss": 1.3062, "num_input_tokens_seen": 3846144, "step": 939 }, { "epoch": 0.7004470938897168, "grad_norm": 6.941799811905004, "learning_rate": 2.3808842599553533e-06, "loss": 1.4833, "num_input_tokens_seen": 3850240, "step": 940 }, { "epoch": 0.7011922503725783, "grad_norm": 7.9127100634407395, "learning_rate": 2.38063480226424e-06, "loss": 1.0806, "num_input_tokens_seen": 3854336, "step": 941 }, { "epoch": 0.7019374068554396, "grad_norm": 9.126348879459288, "learning_rate": 2.3803850967299046e-06, "loss": 1.516, "num_input_tokens_seen": 3858432, "step": 942 }, { "epoch": 0.702682563338301, "grad_norm": 11.188853897811699, "learning_rate": 2.380135143407084e-06, "loss": 1.4738, "num_input_tokens_seen": 3862528, "step": 943 }, { "epoch": 0.7034277198211625, "grad_norm": 7.92569859648005, "learning_rate": 2.3798849423505705e-06, "loss": 1.2424, "num_input_tokens_seen": 3866624, "step": 944 }, { "epoch": 0.7041728763040238, "grad_norm": 6.356078735245055, "learning_rate": 2.379634493615209e-06, "loss": 1.5676, "num_input_tokens_seen": 3870720, "step": 945 }, { "epoch": 0.7049180327868853, "grad_norm": 7.595499882601043, "learning_rate": 2.3793837972559005e-06, "loss": 1.1493, "num_input_tokens_seen": 3874816, "step": 946 }, { "epoch": 0.7056631892697467, "grad_norm": 9.172668560338002, "learning_rate": 2.379132853327599e-06, "loss": 1.2209, "num_input_tokens_seen": 3878912, "step": 947 }, { "epoch": 0.706408345752608, "grad_norm": 8.123198249663519, "learning_rate": 2.3788816618853134e-06, "loss": 1.2003, "num_input_tokens_seen": 3883008, "step": 948 }, { "epoch": 0.7071535022354695, "grad_norm": 10.373596095300632, "learning_rate": 2.3786302229841067e-06, "loss": 1.3086, "num_input_tokens_seen": 3887104, "step": 949 }, { "epoch": 0.7078986587183308, "grad_norm": 6.878441812780804, "learning_rate": 2.378378536679096e-06, "loss": 1.3345, "num_input_tokens_seen": 3891200, "step": 950 }, { "epoch": 0.7086438152011922, "grad_norm": 8.198502746379983, "learning_rate": 2.3781266030254523e-06, "loss": 1.4225, "num_input_tokens_seen": 3895296, "step": 951 }, { "epoch": 0.7093889716840537, "grad_norm": 8.688069643197114, "learning_rate": 2.377874422078402e-06, "loss": 1.0829, "num_input_tokens_seen": 3899392, "step": 952 }, { "epoch": 0.710134128166915, "grad_norm": 7.532496835697536, "learning_rate": 2.377621993893225e-06, "loss": 1.3673, "num_input_tokens_seen": 3903488, "step": 953 }, { "epoch": 0.7108792846497765, "grad_norm": 7.07150909604024, "learning_rate": 2.3773693185252546e-06, "loss": 1.4038, "num_input_tokens_seen": 3907584, "step": 954 }, { "epoch": 0.7116244411326379, "grad_norm": 8.06174785712578, "learning_rate": 2.3771163960298793e-06, "loss": 1.353, "num_input_tokens_seen": 3911680, "step": 955 }, { "epoch": 0.7123695976154992, "grad_norm": 8.330174381550842, "learning_rate": 2.3768632264625417e-06, "loss": 1.3524, "num_input_tokens_seen": 3915776, "step": 956 }, { "epoch": 0.7131147540983607, "grad_norm": 9.419679574296067, "learning_rate": 2.3766098098787384e-06, "loss": 1.2701, "num_input_tokens_seen": 3919872, "step": 957 }, { "epoch": 0.713859910581222, "grad_norm": 10.79568126978829, "learning_rate": 2.37635614633402e-06, "loss": 1.3607, "num_input_tokens_seen": 3923968, "step": 958 }, { "epoch": 0.7146050670640834, "grad_norm": 7.770259536739307, "learning_rate": 2.3761022358839917e-06, "loss": 1.3686, "num_input_tokens_seen": 3928064, "step": 959 }, { "epoch": 0.7153502235469449, "grad_norm": 8.185556565912558, "learning_rate": 2.375848078584312e-06, "loss": 1.498, "num_input_tokens_seen": 3932160, "step": 960 }, { "epoch": 0.7160953800298062, "grad_norm": 8.599401208553656, "learning_rate": 2.3755936744906945e-06, "loss": 1.3957, "num_input_tokens_seen": 3936256, "step": 961 }, { "epoch": 0.7168405365126677, "grad_norm": 8.528445984790315, "learning_rate": 2.3753390236589054e-06, "loss": 1.4242, "num_input_tokens_seen": 3940352, "step": 962 }, { "epoch": 0.7175856929955291, "grad_norm": 8.145150880809696, "learning_rate": 2.375084126144767e-06, "loss": 1.0752, "num_input_tokens_seen": 3944448, "step": 963 }, { "epoch": 0.7183308494783904, "grad_norm": 6.961716356869234, "learning_rate": 2.3748289820041547e-06, "loss": 1.2123, "num_input_tokens_seen": 3948544, "step": 964 }, { "epoch": 0.7190760059612519, "grad_norm": 24.929313692649785, "learning_rate": 2.3745735912929973e-06, "loss": 1.3737, "num_input_tokens_seen": 3952640, "step": 965 }, { "epoch": 0.7198211624441133, "grad_norm": 9.798611333650015, "learning_rate": 2.3743179540672785e-06, "loss": 0.8661, "num_input_tokens_seen": 3956736, "step": 966 }, { "epoch": 0.7205663189269746, "grad_norm": 9.240211118302732, "learning_rate": 2.3740620703830356e-06, "loss": 1.4366, "num_input_tokens_seen": 3960832, "step": 967 }, { "epoch": 0.7213114754098361, "grad_norm": 13.122842855136698, "learning_rate": 2.3738059402963607e-06, "loss": 1.4412, "num_input_tokens_seen": 3964928, "step": 968 }, { "epoch": 0.7220566318926974, "grad_norm": 12.58243221178168, "learning_rate": 2.3735495638633986e-06, "loss": 1.5885, "num_input_tokens_seen": 3969024, "step": 969 }, { "epoch": 0.7228017883755589, "grad_norm": 8.64624403738673, "learning_rate": 2.37329294114035e-06, "loss": 1.2252, "num_input_tokens_seen": 3973120, "step": 970 }, { "epoch": 0.7235469448584203, "grad_norm": 8.700726716233897, "learning_rate": 2.373036072183467e-06, "loss": 1.0892, "num_input_tokens_seen": 3977216, "step": 971 }, { "epoch": 0.7242921013412816, "grad_norm": 9.001842259147125, "learning_rate": 2.372778957049058e-06, "loss": 1.1057, "num_input_tokens_seen": 3981312, "step": 972 }, { "epoch": 0.7250372578241431, "grad_norm": 6.552586098350316, "learning_rate": 2.372521595793484e-06, "loss": 1.3325, "num_input_tokens_seen": 3985408, "step": 973 }, { "epoch": 0.7257824143070045, "grad_norm": 7.681548969559242, "learning_rate": 2.372263988473161e-06, "loss": 1.25, "num_input_tokens_seen": 3989504, "step": 974 }, { "epoch": 0.7265275707898659, "grad_norm": 8.847542267852107, "learning_rate": 2.372006135144558e-06, "loss": 1.2783, "num_input_tokens_seen": 3993600, "step": 975 }, { "epoch": 0.7272727272727273, "grad_norm": 6.746341503215298, "learning_rate": 2.371748035864198e-06, "loss": 1.2621, "num_input_tokens_seen": 3997696, "step": 976 }, { "epoch": 0.7280178837555886, "grad_norm": 10.40879836127752, "learning_rate": 2.371489690688659e-06, "loss": 1.2575, "num_input_tokens_seen": 4001792, "step": 977 }, { "epoch": 0.7287630402384501, "grad_norm": 7.955721818573852, "learning_rate": 2.3712310996745712e-06, "loss": 0.9802, "num_input_tokens_seen": 4005888, "step": 978 }, { "epoch": 0.7295081967213115, "grad_norm": 7.2184792281993895, "learning_rate": 2.3709722628786207e-06, "loss": 1.227, "num_input_tokens_seen": 4009984, "step": 979 }, { "epoch": 0.7302533532041728, "grad_norm": 9.30831506438803, "learning_rate": 2.370713180357545e-06, "loss": 1.0719, "num_input_tokens_seen": 4014080, "step": 980 }, { "epoch": 0.7309985096870343, "grad_norm": 8.13741777653473, "learning_rate": 2.370453852168138e-06, "loss": 1.1913, "num_input_tokens_seen": 4018176, "step": 981 }, { "epoch": 0.7317436661698957, "grad_norm": 9.41226498375942, "learning_rate": 2.370194278367246e-06, "loss": 1.1926, "num_input_tokens_seen": 4022272, "step": 982 }, { "epoch": 0.732488822652757, "grad_norm": 7.763918870817493, "learning_rate": 2.369934459011769e-06, "loss": 1.3796, "num_input_tokens_seen": 4026368, "step": 983 }, { "epoch": 0.7332339791356185, "grad_norm": 8.061583110518821, "learning_rate": 2.3696743941586616e-06, "loss": 1.3733, "num_input_tokens_seen": 4030464, "step": 984 }, { "epoch": 0.7339791356184798, "grad_norm": 8.577143970428882, "learning_rate": 2.3694140838649317e-06, "loss": 1.0258, "num_input_tokens_seen": 4034560, "step": 985 }, { "epoch": 0.7347242921013413, "grad_norm": 8.499805432090632, "learning_rate": 2.369153528187641e-06, "loss": 1.0486, "num_input_tokens_seen": 4038656, "step": 986 }, { "epoch": 0.7354694485842027, "grad_norm": 7.275008827454474, "learning_rate": 2.3688927271839064e-06, "loss": 1.1026, "num_input_tokens_seen": 4042752, "step": 987 }, { "epoch": 0.736214605067064, "grad_norm": 8.823241870286546, "learning_rate": 2.3686316809108956e-06, "loss": 0.9371, "num_input_tokens_seen": 4046848, "step": 988 }, { "epoch": 0.7369597615499255, "grad_norm": 7.155786418541277, "learning_rate": 2.368370389425833e-06, "loss": 1.5479, "num_input_tokens_seen": 4050944, "step": 989 }, { "epoch": 0.7377049180327869, "grad_norm": 6.3345812120437275, "learning_rate": 2.3681088527859947e-06, "loss": 1.3941, "num_input_tokens_seen": 4055040, "step": 990 }, { "epoch": 0.7384500745156483, "grad_norm": 8.892333899356412, "learning_rate": 2.367847071048712e-06, "loss": 1.3487, "num_input_tokens_seen": 4059136, "step": 991 }, { "epoch": 0.7391952309985097, "grad_norm": 9.383409965057032, "learning_rate": 2.3675850442713694e-06, "loss": 0.97, "num_input_tokens_seen": 4063232, "step": 992 }, { "epoch": 0.7399403874813711, "grad_norm": 14.24199151203603, "learning_rate": 2.367322772511405e-06, "loss": 1.2549, "num_input_tokens_seen": 4067328, "step": 993 }, { "epoch": 0.7406855439642325, "grad_norm": 7.476787775140919, "learning_rate": 2.36706025582631e-06, "loss": 1.1393, "num_input_tokens_seen": 4071424, "step": 994 }, { "epoch": 0.7414307004470939, "grad_norm": 7.594345964068869, "learning_rate": 2.3667974942736306e-06, "loss": 1.3605, "num_input_tokens_seen": 4075520, "step": 995 }, { "epoch": 0.7421758569299552, "grad_norm": 7.158968275346345, "learning_rate": 2.3665344879109657e-06, "loss": 1.265, "num_input_tokens_seen": 4079616, "step": 996 }, { "epoch": 0.7429210134128167, "grad_norm": 7.9405913962878705, "learning_rate": 2.3662712367959683e-06, "loss": 0.9492, "num_input_tokens_seen": 4083712, "step": 997 }, { "epoch": 0.7436661698956781, "grad_norm": 8.26993286012098, "learning_rate": 2.3660077409863453e-06, "loss": 1.0021, "num_input_tokens_seen": 4087808, "step": 998 }, { "epoch": 0.7444113263785395, "grad_norm": 8.893161784722881, "learning_rate": 2.365744000539856e-06, "loss": 1.1417, "num_input_tokens_seen": 4091904, "step": 999 }, { "epoch": 0.7451564828614009, "grad_norm": 13.905613006767325, "learning_rate": 2.3654800155143147e-06, "loss": 1.305, "num_input_tokens_seen": 4096000, "step": 1000 }, { "epoch": 0.7459016393442623, "grad_norm": 7.271066079615159, "learning_rate": 2.365215785967589e-06, "loss": 1.2033, "num_input_tokens_seen": 4100096, "step": 1001 }, { "epoch": 0.7466467958271237, "grad_norm": 10.103236318014167, "learning_rate": 2.3649513119575994e-06, "loss": 1.2773, "num_input_tokens_seen": 4104192, "step": 1002 }, { "epoch": 0.7473919523099851, "grad_norm": 10.329235800586767, "learning_rate": 2.364686593542321e-06, "loss": 0.9841, "num_input_tokens_seen": 4108288, "step": 1003 }, { "epoch": 0.7481371087928465, "grad_norm": 9.190845206455581, "learning_rate": 2.364421630779782e-06, "loss": 1.2493, "num_input_tokens_seen": 4112384, "step": 1004 }, { "epoch": 0.7488822652757079, "grad_norm": 9.969001460858216, "learning_rate": 2.364156423728063e-06, "loss": 1.1815, "num_input_tokens_seen": 4116480, "step": 1005 }, { "epoch": 0.7496274217585693, "grad_norm": 9.023831336936663, "learning_rate": 2.3638909724453007e-06, "loss": 1.012, "num_input_tokens_seen": 4120576, "step": 1006 }, { "epoch": 0.7503725782414307, "grad_norm": 27.22887850479459, "learning_rate": 2.3636252769896834e-06, "loss": 1.6547, "num_input_tokens_seen": 4124672, "step": 1007 }, { "epoch": 0.7511177347242921, "grad_norm": 10.450406129445883, "learning_rate": 2.363359337419453e-06, "loss": 0.9154, "num_input_tokens_seen": 4128768, "step": 1008 }, { "epoch": 0.7518628912071535, "grad_norm": 9.371965650897115, "learning_rate": 2.363093153792906e-06, "loss": 1.181, "num_input_tokens_seen": 4132864, "step": 1009 }, { "epoch": 0.7526080476900149, "grad_norm": 7.8766537031956165, "learning_rate": 2.3628267261683917e-06, "loss": 0.8272, "num_input_tokens_seen": 4136960, "step": 1010 }, { "epoch": 0.7533532041728763, "grad_norm": 10.596879977019693, "learning_rate": 2.3625600546043125e-06, "loss": 0.9569, "num_input_tokens_seen": 4141056, "step": 1011 }, { "epoch": 0.7540983606557377, "grad_norm": 8.223749201536384, "learning_rate": 2.362293139159125e-06, "loss": 1.218, "num_input_tokens_seen": 4145152, "step": 1012 }, { "epoch": 0.7548435171385991, "grad_norm": 8.257984117856308, "learning_rate": 2.3620259798913386e-06, "loss": 1.1766, "num_input_tokens_seen": 4149248, "step": 1013 }, { "epoch": 0.7555886736214605, "grad_norm": 17.01802873499766, "learning_rate": 2.361758576859517e-06, "loss": 1.3807, "num_input_tokens_seen": 4153344, "step": 1014 }, { "epoch": 0.7563338301043219, "grad_norm": 7.430679723271501, "learning_rate": 2.3614909301222773e-06, "loss": 1.0177, "num_input_tokens_seen": 4157440, "step": 1015 }, { "epoch": 0.7570789865871833, "grad_norm": 9.241662627166649, "learning_rate": 2.361223039738288e-06, "loss": 1.4113, "num_input_tokens_seen": 4161536, "step": 1016 }, { "epoch": 0.7578241430700448, "grad_norm": 7.144611034830377, "learning_rate": 2.3609549057662744e-06, "loss": 1.3528, "num_input_tokens_seen": 4165632, "step": 1017 }, { "epoch": 0.7585692995529061, "grad_norm": 7.200347685722224, "learning_rate": 2.360686528265012e-06, "loss": 1.2692, "num_input_tokens_seen": 4169728, "step": 1018 }, { "epoch": 0.7593144560357675, "grad_norm": 7.9837075039320915, "learning_rate": 2.360417907293332e-06, "loss": 1.1127, "num_input_tokens_seen": 4173824, "step": 1019 }, { "epoch": 0.7600596125186289, "grad_norm": 7.46387780028903, "learning_rate": 2.360149042910117e-06, "loss": 0.9913, "num_input_tokens_seen": 4177920, "step": 1020 }, { "epoch": 0.7608047690014903, "grad_norm": 7.562607525089172, "learning_rate": 2.359879935174305e-06, "loss": 1.1392, "num_input_tokens_seen": 4182016, "step": 1021 }, { "epoch": 0.7615499254843517, "grad_norm": 9.889468997946512, "learning_rate": 2.359610584144886e-06, "loss": 1.1858, "num_input_tokens_seen": 4186112, "step": 1022 }, { "epoch": 0.7622950819672131, "grad_norm": 6.862511856706836, "learning_rate": 2.359340989880903e-06, "loss": 0.9929, "num_input_tokens_seen": 4190208, "step": 1023 }, { "epoch": 0.7630402384500745, "grad_norm": 6.396166150667925, "learning_rate": 2.3590711524414545e-06, "loss": 1.2821, "num_input_tokens_seen": 4194304, "step": 1024 }, { "epoch": 0.763785394932936, "grad_norm": 8.155545597476145, "learning_rate": 2.3588010718856898e-06, "loss": 0.9257, "num_input_tokens_seen": 4198400, "step": 1025 }, { "epoch": 0.7645305514157973, "grad_norm": 21.296962908683962, "learning_rate": 2.3585307482728125e-06, "loss": 1.2367, "num_input_tokens_seen": 4202496, "step": 1026 }, { "epoch": 0.7652757078986587, "grad_norm": 8.48519662324862, "learning_rate": 2.3582601816620793e-06, "loss": 1.2578, "num_input_tokens_seen": 4206592, "step": 1027 }, { "epoch": 0.7660208643815202, "grad_norm": 22.12470744126561, "learning_rate": 2.3579893721128003e-06, "loss": 1.3088, "num_input_tokens_seen": 4210688, "step": 1028 }, { "epoch": 0.7667660208643815, "grad_norm": 9.60146202592085, "learning_rate": 2.35771831968434e-06, "loss": 1.2169, "num_input_tokens_seen": 4214784, "step": 1029 }, { "epoch": 0.767511177347243, "grad_norm": 22.643630872204263, "learning_rate": 2.3574470244361135e-06, "loss": 1.2802, "num_input_tokens_seen": 4218880, "step": 1030 }, { "epoch": 0.7682563338301043, "grad_norm": 6.7511122467403295, "learning_rate": 2.357175486427592e-06, "loss": 1.2986, "num_input_tokens_seen": 4222976, "step": 1031 }, { "epoch": 0.7690014903129657, "grad_norm": 9.347595104668446, "learning_rate": 2.3569037057182975e-06, "loss": 1.3282, "num_input_tokens_seen": 4227072, "step": 1032 }, { "epoch": 0.7697466467958272, "grad_norm": 8.074562685209775, "learning_rate": 2.356631682367807e-06, "loss": 1.3905, "num_input_tokens_seen": 4231168, "step": 1033 }, { "epoch": 0.7704918032786885, "grad_norm": 8.676290286994885, "learning_rate": 2.3563594164357495e-06, "loss": 1.3325, "num_input_tokens_seen": 4235264, "step": 1034 }, { "epoch": 0.7712369597615499, "grad_norm": 7.459526627212569, "learning_rate": 2.3560869079818076e-06, "loss": 1.492, "num_input_tokens_seen": 4239360, "step": 1035 }, { "epoch": 0.7719821162444114, "grad_norm": 7.434924284882449, "learning_rate": 2.355814157065718e-06, "loss": 1.1564, "num_input_tokens_seen": 4243456, "step": 1036 }, { "epoch": 0.7727272727272727, "grad_norm": 6.69259979989534, "learning_rate": 2.355541163747268e-06, "loss": 1.143, "num_input_tokens_seen": 4247552, "step": 1037 }, { "epoch": 0.7734724292101341, "grad_norm": 6.4838843138974225, "learning_rate": 2.355267928086301e-06, "loss": 1.2915, "num_input_tokens_seen": 4251648, "step": 1038 }, { "epoch": 0.7742175856929955, "grad_norm": 7.128833153560263, "learning_rate": 2.3549944501427122e-06, "loss": 1.5708, "num_input_tokens_seen": 4255744, "step": 1039 }, { "epoch": 0.7749627421758569, "grad_norm": 10.583660608958521, "learning_rate": 2.354720729976449e-06, "loss": 1.1881, "num_input_tokens_seen": 4259840, "step": 1040 }, { "epoch": 0.7757078986587184, "grad_norm": 8.110033100881337, "learning_rate": 2.354446767647514e-06, "loss": 1.3994, "num_input_tokens_seen": 4263936, "step": 1041 }, { "epoch": 0.7764530551415797, "grad_norm": 8.407287703414145, "learning_rate": 2.3541725632159607e-06, "loss": 1.101, "num_input_tokens_seen": 4268032, "step": 1042 }, { "epoch": 0.7771982116244411, "grad_norm": 7.023750258195454, "learning_rate": 2.353898116741897e-06, "loss": 1.3357, "num_input_tokens_seen": 4272128, "step": 1043 }, { "epoch": 0.7779433681073026, "grad_norm": 6.791215779482601, "learning_rate": 2.353623428285484e-06, "loss": 1.278, "num_input_tokens_seen": 4276224, "step": 1044 }, { "epoch": 0.7786885245901639, "grad_norm": 7.506056987956274, "learning_rate": 2.3533484979069343e-06, "loss": 1.322, "num_input_tokens_seen": 4280320, "step": 1045 }, { "epoch": 0.7794336810730254, "grad_norm": 7.710321653307786, "learning_rate": 2.3530733256665155e-06, "loss": 1.2637, "num_input_tokens_seen": 4284416, "step": 1046 }, { "epoch": 0.7801788375558867, "grad_norm": 13.27468324216932, "learning_rate": 2.3527979116245473e-06, "loss": 1.0619, "num_input_tokens_seen": 4288512, "step": 1047 }, { "epoch": 0.7809239940387481, "grad_norm": 8.400745762277484, "learning_rate": 2.352522255841402e-06, "loss": 1.1219, "num_input_tokens_seen": 4292608, "step": 1048 }, { "epoch": 0.7816691505216096, "grad_norm": 6.991231019312229, "learning_rate": 2.3522463583775056e-06, "loss": 1.0783, "num_input_tokens_seen": 4296704, "step": 1049 }, { "epoch": 0.7824143070044709, "grad_norm": 8.120259946124103, "learning_rate": 2.351970219293337e-06, "loss": 1.0869, "num_input_tokens_seen": 4300800, "step": 1050 }, { "epoch": 0.7831594634873323, "grad_norm": 9.10648660644641, "learning_rate": 2.351693838649427e-06, "loss": 1.0302, "num_input_tokens_seen": 4304896, "step": 1051 }, { "epoch": 0.7839046199701938, "grad_norm": 7.116663237164971, "learning_rate": 2.3514172165063608e-06, "loss": 1.155, "num_input_tokens_seen": 4308992, "step": 1052 }, { "epoch": 0.7846497764530551, "grad_norm": 6.625366939573677, "learning_rate": 2.3511403529247763e-06, "loss": 1.43, "num_input_tokens_seen": 4313088, "step": 1053 }, { "epoch": 0.7853949329359166, "grad_norm": 7.476278468504159, "learning_rate": 2.350863247965363e-06, "loss": 1.2485, "num_input_tokens_seen": 4317184, "step": 1054 }, { "epoch": 0.786140089418778, "grad_norm": 9.521583128141723, "learning_rate": 2.3505859016888656e-06, "loss": 1.2869, "num_input_tokens_seen": 4321280, "step": 1055 }, { "epoch": 0.7868852459016393, "grad_norm": 7.625541342426579, "learning_rate": 2.3503083141560794e-06, "loss": 1.3825, "num_input_tokens_seen": 4325376, "step": 1056 }, { "epoch": 0.7876304023845008, "grad_norm": 7.031731446746545, "learning_rate": 2.350030485427854e-06, "loss": 1.1774, "num_input_tokens_seen": 4329472, "step": 1057 }, { "epoch": 0.7883755588673621, "grad_norm": 8.160074307142203, "learning_rate": 2.3497524155650906e-06, "loss": 1.2702, "num_input_tokens_seen": 4333568, "step": 1058 }, { "epoch": 0.7891207153502235, "grad_norm": 6.888534672420567, "learning_rate": 2.349474104628745e-06, "loss": 1.2545, "num_input_tokens_seen": 4337664, "step": 1059 }, { "epoch": 0.789865871833085, "grad_norm": 8.15084632683891, "learning_rate": 2.3491955526798255e-06, "loss": 1.1897, "num_input_tokens_seen": 4341760, "step": 1060 }, { "epoch": 0.7906110283159463, "grad_norm": 7.242721414855804, "learning_rate": 2.348916759779391e-06, "loss": 1.1419, "num_input_tokens_seen": 4345856, "step": 1061 }, { "epoch": 0.7913561847988078, "grad_norm": 7.5642087858701, "learning_rate": 2.3486377259885563e-06, "loss": 1.1993, "num_input_tokens_seen": 4349952, "step": 1062 }, { "epoch": 0.7921013412816692, "grad_norm": 8.48428483394245, "learning_rate": 2.348358451368487e-06, "loss": 1.3785, "num_input_tokens_seen": 4354048, "step": 1063 }, { "epoch": 0.7928464977645305, "grad_norm": 9.315273917556523, "learning_rate": 2.3480789359804016e-06, "loss": 1.0656, "num_input_tokens_seen": 4358144, "step": 1064 }, { "epoch": 0.793591654247392, "grad_norm": 8.925203212332141, "learning_rate": 2.3477991798855732e-06, "loss": 1.0763, "num_input_tokens_seen": 4362240, "step": 1065 }, { "epoch": 0.7943368107302533, "grad_norm": 8.026865923435967, "learning_rate": 2.3475191831453252e-06, "loss": 1.1535, "num_input_tokens_seen": 4366336, "step": 1066 }, { "epoch": 0.7950819672131147, "grad_norm": 8.915302590711034, "learning_rate": 2.3472389458210353e-06, "loss": 1.2282, "num_input_tokens_seen": 4370432, "step": 1067 }, { "epoch": 0.7958271236959762, "grad_norm": 8.17173831328693, "learning_rate": 2.3469584679741336e-06, "loss": 1.077, "num_input_tokens_seen": 4374528, "step": 1068 }, { "epoch": 0.7965722801788375, "grad_norm": 7.871370806139562, "learning_rate": 2.346677749666103e-06, "loss": 1.299, "num_input_tokens_seen": 4378624, "step": 1069 }, { "epoch": 0.797317436661699, "grad_norm": 8.842265025822927, "learning_rate": 2.3463967909584784e-06, "loss": 1.243, "num_input_tokens_seen": 4382720, "step": 1070 }, { "epoch": 0.7980625931445604, "grad_norm": 7.817443570017509, "learning_rate": 2.346115591912848e-06, "loss": 1.4846, "num_input_tokens_seen": 4386816, "step": 1071 }, { "epoch": 0.7988077496274217, "grad_norm": 7.660206912930411, "learning_rate": 2.3458341525908536e-06, "loss": 1.0485, "num_input_tokens_seen": 4390912, "step": 1072 }, { "epoch": 0.7995529061102832, "grad_norm": 17.75607987423347, "learning_rate": 2.345552473054187e-06, "loss": 1.5474, "num_input_tokens_seen": 4395008, "step": 1073 }, { "epoch": 0.8002980625931445, "grad_norm": 8.524380633887287, "learning_rate": 2.345270553364596e-06, "loss": 1.2135, "num_input_tokens_seen": 4399104, "step": 1074 }, { "epoch": 0.801043219076006, "grad_norm": 8.48946360992408, "learning_rate": 2.344988393583879e-06, "loss": 1.2684, "num_input_tokens_seen": 4403200, "step": 1075 }, { "epoch": 0.8017883755588674, "grad_norm": 8.677904888095451, "learning_rate": 2.3447059937738868e-06, "loss": 1.3523, "num_input_tokens_seen": 4407296, "step": 1076 }, { "epoch": 0.8025335320417287, "grad_norm": 10.017146781068451, "learning_rate": 2.344423353996524e-06, "loss": 0.9602, "num_input_tokens_seen": 4411392, "step": 1077 }, { "epoch": 0.8032786885245902, "grad_norm": 7.9606468731186855, "learning_rate": 2.3441404743137467e-06, "loss": 0.9933, "num_input_tokens_seen": 4415488, "step": 1078 }, { "epoch": 0.8040238450074516, "grad_norm": 8.122338361036043, "learning_rate": 2.3438573547875655e-06, "loss": 1.2585, "num_input_tokens_seen": 4419584, "step": 1079 }, { "epoch": 0.8047690014903129, "grad_norm": 8.625025697726784, "learning_rate": 2.3435739954800404e-06, "loss": 1.0014, "num_input_tokens_seen": 4423680, "step": 1080 }, { "epoch": 0.8055141579731744, "grad_norm": 7.137806013365651, "learning_rate": 2.343290396453287e-06, "loss": 1.3195, "num_input_tokens_seen": 4427776, "step": 1081 }, { "epoch": 0.8062593144560357, "grad_norm": 9.764738289860567, "learning_rate": 2.343006557769472e-06, "loss": 0.9785, "num_input_tokens_seen": 4431872, "step": 1082 }, { "epoch": 0.8070044709388972, "grad_norm": 15.697138820691752, "learning_rate": 2.342722479490815e-06, "loss": 1.3129, "num_input_tokens_seen": 4435968, "step": 1083 }, { "epoch": 0.8077496274217586, "grad_norm": 7.338904512045073, "learning_rate": 2.3424381616795873e-06, "loss": 1.1725, "num_input_tokens_seen": 4440064, "step": 1084 }, { "epoch": 0.8084947839046199, "grad_norm": 9.263330271365579, "learning_rate": 2.342153604398114e-06, "loss": 1.1465, "num_input_tokens_seen": 4444160, "step": 1085 }, { "epoch": 0.8092399403874814, "grad_norm": 9.328693018400767, "learning_rate": 2.341868807708772e-06, "loss": 1.0685, "num_input_tokens_seen": 4448256, "step": 1086 }, { "epoch": 0.8099850968703428, "grad_norm": 7.462393148160834, "learning_rate": 2.3415837716739903e-06, "loss": 1.4202, "num_input_tokens_seen": 4452352, "step": 1087 }, { "epoch": 0.8107302533532041, "grad_norm": 8.176152980502266, "learning_rate": 2.341298496356252e-06, "loss": 1.1176, "num_input_tokens_seen": 4456448, "step": 1088 }, { "epoch": 0.8114754098360656, "grad_norm": 7.762585117610929, "learning_rate": 2.34101298181809e-06, "loss": 1.3241, "num_input_tokens_seen": 4460544, "step": 1089 }, { "epoch": 0.812220566318927, "grad_norm": 11.38247821431127, "learning_rate": 2.340727228122092e-06, "loss": 1.1413, "num_input_tokens_seen": 4464640, "step": 1090 }, { "epoch": 0.8129657228017884, "grad_norm": 7.377468022145676, "learning_rate": 2.3404412353308975e-06, "loss": 1.06, "num_input_tokens_seen": 4468736, "step": 1091 }, { "epoch": 0.8137108792846498, "grad_norm": 7.4792254850128765, "learning_rate": 2.340155003507198e-06, "loss": 1.0281, "num_input_tokens_seen": 4472832, "step": 1092 }, { "epoch": 0.8144560357675111, "grad_norm": 7.030636679935852, "learning_rate": 2.3398685327137367e-06, "loss": 1.2282, "num_input_tokens_seen": 4476928, "step": 1093 }, { "epoch": 0.8152011922503726, "grad_norm": 7.5679998392932895, "learning_rate": 2.339581823013311e-06, "loss": 1.298, "num_input_tokens_seen": 4481024, "step": 1094 }, { "epoch": 0.815946348733234, "grad_norm": 7.26831098067504, "learning_rate": 2.3392948744687692e-06, "loss": 0.9998, "num_input_tokens_seen": 4485120, "step": 1095 }, { "epoch": 0.8166915052160953, "grad_norm": 7.811142559669213, "learning_rate": 2.3390076871430126e-06, "loss": 1.1488, "num_input_tokens_seen": 4489216, "step": 1096 }, { "epoch": 0.8174366616989568, "grad_norm": 8.204172751547206, "learning_rate": 2.3387202610989947e-06, "loss": 0.888, "num_input_tokens_seen": 4493312, "step": 1097 }, { "epoch": 0.8181818181818182, "grad_norm": 7.928767083112601, "learning_rate": 2.3384325963997216e-06, "loss": 1.3033, "num_input_tokens_seen": 4497408, "step": 1098 }, { "epoch": 0.8189269746646796, "grad_norm": 7.676645796206396, "learning_rate": 2.338144693108251e-06, "loss": 1.4606, "num_input_tokens_seen": 4501504, "step": 1099 }, { "epoch": 0.819672131147541, "grad_norm": 9.13975653106205, "learning_rate": 2.3378565512876945e-06, "loss": 1.332, "num_input_tokens_seen": 4505600, "step": 1100 }, { "epoch": 0.8204172876304023, "grad_norm": 9.473519714582066, "learning_rate": 2.337568171001213e-06, "loss": 1.0153, "num_input_tokens_seen": 4509696, "step": 1101 }, { "epoch": 0.8211624441132638, "grad_norm": 7.68980469194112, "learning_rate": 2.337279552312023e-06, "loss": 1.2477, "num_input_tokens_seen": 4513792, "step": 1102 }, { "epoch": 0.8219076005961252, "grad_norm": 8.76060705056834, "learning_rate": 2.336990695283391e-06, "loss": 0.8414, "num_input_tokens_seen": 4517888, "step": 1103 }, { "epoch": 0.8226527570789866, "grad_norm": 7.400864640755904, "learning_rate": 2.336701599978637e-06, "loss": 1.0334, "num_input_tokens_seen": 4521984, "step": 1104 }, { "epoch": 0.823397913561848, "grad_norm": 8.778952144837508, "learning_rate": 2.3364122664611324e-06, "loss": 1.1444, "num_input_tokens_seen": 4526080, "step": 1105 }, { "epoch": 0.8241430700447094, "grad_norm": 7.3072512815768995, "learning_rate": 2.3361226947943012e-06, "loss": 1.1888, "num_input_tokens_seen": 4530176, "step": 1106 }, { "epoch": 0.8248882265275708, "grad_norm": 8.21593481388853, "learning_rate": 2.33583288504162e-06, "loss": 1.3883, "num_input_tokens_seen": 4534272, "step": 1107 }, { "epoch": 0.8256333830104322, "grad_norm": 8.286223104368613, "learning_rate": 2.3355428372666165e-06, "loss": 1.5025, "num_input_tokens_seen": 4538368, "step": 1108 }, { "epoch": 0.8263785394932935, "grad_norm": 13.961646563757903, "learning_rate": 2.335252551532872e-06, "loss": 1.3131, "num_input_tokens_seen": 4542464, "step": 1109 }, { "epoch": 0.827123695976155, "grad_norm": 7.98097769233269, "learning_rate": 2.3349620279040185e-06, "loss": 1.1515, "num_input_tokens_seen": 4546560, "step": 1110 }, { "epoch": 0.8278688524590164, "grad_norm": 9.086781990047562, "learning_rate": 2.334671266443741e-06, "loss": 1.4146, "num_input_tokens_seen": 4550656, "step": 1111 }, { "epoch": 0.8286140089418778, "grad_norm": 7.733606347067945, "learning_rate": 2.334380267215777e-06, "loss": 1.2018, "num_input_tokens_seen": 4554752, "step": 1112 }, { "epoch": 0.8293591654247392, "grad_norm": 7.988439626762016, "learning_rate": 2.3340890302839153e-06, "loss": 1.5003, "num_input_tokens_seen": 4558848, "step": 1113 }, { "epoch": 0.8301043219076006, "grad_norm": 7.427042297534495, "learning_rate": 2.333797555711997e-06, "loss": 1.095, "num_input_tokens_seen": 4562944, "step": 1114 }, { "epoch": 0.830849478390462, "grad_norm": 7.250237000434249, "learning_rate": 2.3335058435639155e-06, "loss": 1.1946, "num_input_tokens_seen": 4567040, "step": 1115 }, { "epoch": 0.8315946348733234, "grad_norm": 7.202921754071178, "learning_rate": 2.333213893903616e-06, "loss": 1.2968, "num_input_tokens_seen": 4571136, "step": 1116 }, { "epoch": 0.8323397913561847, "grad_norm": 7.186008017957113, "learning_rate": 2.332921706795096e-06, "loss": 0.8539, "num_input_tokens_seen": 4575232, "step": 1117 }, { "epoch": 0.8330849478390462, "grad_norm": 7.31802760679714, "learning_rate": 2.3326292823024056e-06, "loss": 0.9755, "num_input_tokens_seen": 4579328, "step": 1118 }, { "epoch": 0.8338301043219076, "grad_norm": 8.361544568786933, "learning_rate": 2.3323366204896456e-06, "loss": 1.1895, "num_input_tokens_seen": 4583424, "step": 1119 }, { "epoch": 0.834575260804769, "grad_norm": 7.24144566079348, "learning_rate": 2.3320437214209694e-06, "loss": 1.441, "num_input_tokens_seen": 4587520, "step": 1120 }, { "epoch": 0.8353204172876304, "grad_norm": 7.285847777040302, "learning_rate": 2.3317505851605835e-06, "loss": 1.1101, "num_input_tokens_seen": 4591616, "step": 1121 }, { "epoch": 0.8360655737704918, "grad_norm": 6.800791917066886, "learning_rate": 2.331457211772745e-06, "loss": 0.8749, "num_input_tokens_seen": 4595712, "step": 1122 }, { "epoch": 0.8368107302533532, "grad_norm": 7.715522984121879, "learning_rate": 2.3311636013217635e-06, "loss": 1.4112, "num_input_tokens_seen": 4599808, "step": 1123 }, { "epoch": 0.8375558867362146, "grad_norm": 9.579117941680117, "learning_rate": 2.330869753872e-06, "loss": 1.1541, "num_input_tokens_seen": 4603904, "step": 1124 }, { "epoch": 0.8383010432190761, "grad_norm": 7.3985338066147195, "learning_rate": 2.3305756694878684e-06, "loss": 1.1595, "num_input_tokens_seen": 4608000, "step": 1125 }, { "epoch": 0.8390461997019374, "grad_norm": 9.57594990644221, "learning_rate": 2.330281348233834e-06, "loss": 1.161, "num_input_tokens_seen": 4612096, "step": 1126 }, { "epoch": 0.8397913561847988, "grad_norm": 8.693526495582951, "learning_rate": 2.3299867901744145e-06, "loss": 1.3913, "num_input_tokens_seen": 4616192, "step": 1127 }, { "epoch": 0.8405365126676602, "grad_norm": 8.235895852390179, "learning_rate": 2.3296919953741787e-06, "loss": 1.0658, "num_input_tokens_seen": 4620288, "step": 1128 }, { "epoch": 0.8412816691505216, "grad_norm": 7.35250613900587, "learning_rate": 2.3293969638977475e-06, "loss": 1.2459, "num_input_tokens_seen": 4624384, "step": 1129 }, { "epoch": 0.842026825633383, "grad_norm": 7.206647892572089, "learning_rate": 2.3291016958097946e-06, "loss": 1.2693, "num_input_tokens_seen": 4628480, "step": 1130 }, { "epoch": 0.8427719821162444, "grad_norm": 6.940388817347345, "learning_rate": 2.328806191175044e-06, "loss": 1.2205, "num_input_tokens_seen": 4632576, "step": 1131 }, { "epoch": 0.8435171385991058, "grad_norm": 7.5228443720389295, "learning_rate": 2.3285104500582736e-06, "loss": 0.9117, "num_input_tokens_seen": 4636672, "step": 1132 }, { "epoch": 0.8442622950819673, "grad_norm": 9.655435646160928, "learning_rate": 2.328214472524311e-06, "loss": 1.1843, "num_input_tokens_seen": 4640768, "step": 1133 }, { "epoch": 0.8450074515648286, "grad_norm": 10.142868037259992, "learning_rate": 2.3279182586380366e-06, "loss": 1.2124, "num_input_tokens_seen": 4644864, "step": 1134 }, { "epoch": 0.84575260804769, "grad_norm": 10.73150255542747, "learning_rate": 2.327621808464383e-06, "loss": 1.1707, "num_input_tokens_seen": 4648960, "step": 1135 }, { "epoch": 0.8464977645305514, "grad_norm": 8.554552416724013, "learning_rate": 2.3273251220683344e-06, "loss": 1.1088, "num_input_tokens_seen": 4653056, "step": 1136 }, { "epoch": 0.8472429210134128, "grad_norm": 9.433793161028769, "learning_rate": 2.327028199514926e-06, "loss": 1.3592, "num_input_tokens_seen": 4657152, "step": 1137 }, { "epoch": 0.8479880774962743, "grad_norm": 7.940480790050419, "learning_rate": 2.3267310408692456e-06, "loss": 1.237, "num_input_tokens_seen": 4661248, "step": 1138 }, { "epoch": 0.8487332339791356, "grad_norm": 8.985835777724063, "learning_rate": 2.3264336461964326e-06, "loss": 1.4257, "num_input_tokens_seen": 4665344, "step": 1139 }, { "epoch": 0.849478390461997, "grad_norm": 29.82841260196988, "learning_rate": 2.326136015561678e-06, "loss": 1.3648, "num_input_tokens_seen": 4669440, "step": 1140 }, { "epoch": 0.8502235469448585, "grad_norm": 11.895551347221357, "learning_rate": 2.3258381490302247e-06, "loss": 1.3011, "num_input_tokens_seen": 4673536, "step": 1141 }, { "epoch": 0.8509687034277198, "grad_norm": 8.171668219776322, "learning_rate": 2.325540046667366e-06, "loss": 1.2322, "num_input_tokens_seen": 4677632, "step": 1142 }, { "epoch": 0.8517138599105812, "grad_norm": 16.44578472055888, "learning_rate": 2.32524170853845e-06, "loss": 1.0347, "num_input_tokens_seen": 4681728, "step": 1143 }, { "epoch": 0.8524590163934426, "grad_norm": 9.594804590177496, "learning_rate": 2.324943134708873e-06, "loss": 1.1598, "num_input_tokens_seen": 4685824, "step": 1144 }, { "epoch": 0.853204172876304, "grad_norm": 10.41749601085576, "learning_rate": 2.324644325244085e-06, "loss": 1.323, "num_input_tokens_seen": 4689920, "step": 1145 }, { "epoch": 0.8539493293591655, "grad_norm": 11.441879780355082, "learning_rate": 2.324345280209588e-06, "loss": 0.9736, "num_input_tokens_seen": 4694016, "step": 1146 }, { "epoch": 0.8546944858420268, "grad_norm": 9.232426419040001, "learning_rate": 2.3240459996709337e-06, "loss": 1.3485, "num_input_tokens_seen": 4698112, "step": 1147 }, { "epoch": 0.8554396423248882, "grad_norm": 6.699819880657923, "learning_rate": 2.323746483693727e-06, "loss": 1.3009, "num_input_tokens_seen": 4702208, "step": 1148 }, { "epoch": 0.8561847988077497, "grad_norm": 9.837816267094954, "learning_rate": 2.3234467323436237e-06, "loss": 1.2576, "num_input_tokens_seen": 4706304, "step": 1149 }, { "epoch": 0.856929955290611, "grad_norm": 8.999458883166767, "learning_rate": 2.3231467456863316e-06, "loss": 1.0699, "num_input_tokens_seen": 4710400, "step": 1150 }, { "epoch": 0.8576751117734724, "grad_norm": 8.600243148791394, "learning_rate": 2.3228465237876097e-06, "loss": 1.0016, "num_input_tokens_seen": 4714496, "step": 1151 }, { "epoch": 0.8584202682563339, "grad_norm": 7.469827623207663, "learning_rate": 2.3225460667132693e-06, "loss": 1.0671, "num_input_tokens_seen": 4718592, "step": 1152 }, { "epoch": 0.8591654247391952, "grad_norm": 8.892497016825573, "learning_rate": 2.322245374529172e-06, "loss": 0.8633, "num_input_tokens_seen": 4722688, "step": 1153 }, { "epoch": 0.8599105812220567, "grad_norm": 9.121329964525717, "learning_rate": 2.3219444473012326e-06, "loss": 1.2143, "num_input_tokens_seen": 4726784, "step": 1154 }, { "epoch": 0.860655737704918, "grad_norm": 7.205691504742169, "learning_rate": 2.321643285095416e-06, "loss": 1.4448, "num_input_tokens_seen": 4730880, "step": 1155 }, { "epoch": 0.8614008941877794, "grad_norm": 7.798016500967604, "learning_rate": 2.3213418879777383e-06, "loss": 0.8918, "num_input_tokens_seen": 4734976, "step": 1156 }, { "epoch": 0.8621460506706409, "grad_norm": 12.489750337551468, "learning_rate": 2.3210402560142693e-06, "loss": 1.0215, "num_input_tokens_seen": 4739072, "step": 1157 }, { "epoch": 0.8628912071535022, "grad_norm": 7.067767352699725, "learning_rate": 2.3207383892711284e-06, "loss": 1.1583, "num_input_tokens_seen": 4743168, "step": 1158 }, { "epoch": 0.8636363636363636, "grad_norm": 6.555028371635972, "learning_rate": 2.3204362878144864e-06, "loss": 1.2182, "num_input_tokens_seen": 4747264, "step": 1159 }, { "epoch": 0.8643815201192251, "grad_norm": 7.59787586815505, "learning_rate": 2.3201339517105667e-06, "loss": 1.4664, "num_input_tokens_seen": 4751360, "step": 1160 }, { "epoch": 0.8651266766020864, "grad_norm": 6.564014919675487, "learning_rate": 2.3198313810256435e-06, "loss": 0.8433, "num_input_tokens_seen": 4755456, "step": 1161 }, { "epoch": 0.8658718330849479, "grad_norm": 7.639858651243529, "learning_rate": 2.319528575826042e-06, "loss": 0.8912, "num_input_tokens_seen": 4759552, "step": 1162 }, { "epoch": 0.8666169895678092, "grad_norm": 27.86994545510867, "learning_rate": 2.3192255361781396e-06, "loss": 1.453, "num_input_tokens_seen": 4763648, "step": 1163 }, { "epoch": 0.8673621460506706, "grad_norm": 6.891741367919079, "learning_rate": 2.318922262148365e-06, "loss": 1.3584, "num_input_tokens_seen": 4767744, "step": 1164 }, { "epoch": 0.8681073025335321, "grad_norm": 8.31803568182438, "learning_rate": 2.3186187538031976e-06, "loss": 1.3873, "num_input_tokens_seen": 4771840, "step": 1165 }, { "epoch": 0.8688524590163934, "grad_norm": 12.85501199158661, "learning_rate": 2.3183150112091687e-06, "loss": 1.0278, "num_input_tokens_seen": 4775936, "step": 1166 }, { "epoch": 0.8695976154992549, "grad_norm": 7.2123583836846175, "learning_rate": 2.318011034432861e-06, "loss": 1.238, "num_input_tokens_seen": 4780032, "step": 1167 }, { "epoch": 0.8703427719821163, "grad_norm": 9.869593168101852, "learning_rate": 2.317706823540908e-06, "loss": 1.3372, "num_input_tokens_seen": 4784128, "step": 1168 }, { "epoch": 0.8710879284649776, "grad_norm": 7.510183748344654, "learning_rate": 2.3174023785999953e-06, "loss": 1.4581, "num_input_tokens_seen": 4788224, "step": 1169 }, { "epoch": 0.8718330849478391, "grad_norm": 8.763371617402928, "learning_rate": 2.3170976996768594e-06, "loss": 1.0156, "num_input_tokens_seen": 4792320, "step": 1170 }, { "epoch": 0.8725782414307004, "grad_norm": 8.53117828612695, "learning_rate": 2.316792786838288e-06, "loss": 1.07, "num_input_tokens_seen": 4796416, "step": 1171 }, { "epoch": 0.8733233979135618, "grad_norm": 11.31306862165704, "learning_rate": 2.31648764015112e-06, "loss": 1.2689, "num_input_tokens_seen": 4800512, "step": 1172 }, { "epoch": 0.8740685543964233, "grad_norm": 9.026838047322402, "learning_rate": 2.316182259682246e-06, "loss": 1.2357, "num_input_tokens_seen": 4804608, "step": 1173 }, { "epoch": 0.8748137108792846, "grad_norm": 13.712948045763467, "learning_rate": 2.3158766454986077e-06, "loss": 1.2576, "num_input_tokens_seen": 4808704, "step": 1174 }, { "epoch": 0.875558867362146, "grad_norm": 18.653247664203604, "learning_rate": 2.315570797667197e-06, "loss": 1.5547, "num_input_tokens_seen": 4812800, "step": 1175 }, { "epoch": 0.8763040238450075, "grad_norm": 9.04922056089527, "learning_rate": 2.315264716255059e-06, "loss": 1.2329, "num_input_tokens_seen": 4816896, "step": 1176 }, { "epoch": 0.8770491803278688, "grad_norm": 7.892185853714937, "learning_rate": 2.3149584013292888e-06, "loss": 1.1299, "num_input_tokens_seen": 4820992, "step": 1177 }, { "epoch": 0.8777943368107303, "grad_norm": 9.966206059169318, "learning_rate": 2.3146518529570323e-06, "loss": 1.1472, "num_input_tokens_seen": 4825088, "step": 1178 }, { "epoch": 0.8785394932935916, "grad_norm": 10.668232519870479, "learning_rate": 2.314345071205487e-06, "loss": 1.2971, "num_input_tokens_seen": 4829184, "step": 1179 }, { "epoch": 0.879284649776453, "grad_norm": 6.696776674142735, "learning_rate": 2.3140380561419023e-06, "loss": 0.9226, "num_input_tokens_seen": 4833280, "step": 1180 }, { "epoch": 0.8800298062593145, "grad_norm": 9.033040495382815, "learning_rate": 2.313730807833578e-06, "loss": 0.8857, "num_input_tokens_seen": 4837376, "step": 1181 }, { "epoch": 0.8807749627421758, "grad_norm": 7.295289514093259, "learning_rate": 2.3134233263478644e-06, "loss": 1.1453, "num_input_tokens_seen": 4841472, "step": 1182 }, { "epoch": 0.8815201192250373, "grad_norm": 7.9001204239002405, "learning_rate": 2.3131156117521643e-06, "loss": 1.2844, "num_input_tokens_seen": 4845568, "step": 1183 }, { "epoch": 0.8822652757078987, "grad_norm": 6.880055613289566, "learning_rate": 2.3128076641139306e-06, "loss": 1.0443, "num_input_tokens_seen": 4849664, "step": 1184 }, { "epoch": 0.88301043219076, "grad_norm": 7.945725821765207, "learning_rate": 2.3124994835006683e-06, "loss": 1.1532, "num_input_tokens_seen": 4853760, "step": 1185 }, { "epoch": 0.8837555886736215, "grad_norm": 7.676242352257293, "learning_rate": 2.312191069979932e-06, "loss": 1.0056, "num_input_tokens_seen": 4857856, "step": 1186 }, { "epoch": 0.8845007451564829, "grad_norm": 14.159795681775629, "learning_rate": 2.3118824236193286e-06, "loss": 1.1566, "num_input_tokens_seen": 4861952, "step": 1187 }, { "epoch": 0.8852459016393442, "grad_norm": 8.169320135362282, "learning_rate": 2.3115735444865156e-06, "loss": 1.2417, "num_input_tokens_seen": 4866048, "step": 1188 }, { "epoch": 0.8859910581222057, "grad_norm": 7.238327649378144, "learning_rate": 2.3112644326492007e-06, "loss": 1.2299, "num_input_tokens_seen": 4870144, "step": 1189 }, { "epoch": 0.886736214605067, "grad_norm": 11.255815738257564, "learning_rate": 2.3109550881751445e-06, "loss": 1.2333, "num_input_tokens_seen": 4874240, "step": 1190 }, { "epoch": 0.8874813710879285, "grad_norm": 7.014920616826597, "learning_rate": 2.310645511132157e-06, "loss": 0.9251, "num_input_tokens_seen": 4878336, "step": 1191 }, { "epoch": 0.8882265275707899, "grad_norm": 10.1034785201888, "learning_rate": 2.3103357015881e-06, "loss": 1.1659, "num_input_tokens_seen": 4882432, "step": 1192 }, { "epoch": 0.8889716840536512, "grad_norm": 8.381231758133517, "learning_rate": 2.3100256596108856e-06, "loss": 0.9696, "num_input_tokens_seen": 4886528, "step": 1193 }, { "epoch": 0.8897168405365127, "grad_norm": 10.737659110070958, "learning_rate": 2.309715385268477e-06, "loss": 0.9451, "num_input_tokens_seen": 4890624, "step": 1194 }, { "epoch": 0.8904619970193741, "grad_norm": 7.801133533000082, "learning_rate": 2.3094048786288893e-06, "loss": 1.1588, "num_input_tokens_seen": 4894720, "step": 1195 }, { "epoch": 0.8912071535022354, "grad_norm": 9.683089464101299, "learning_rate": 2.3090941397601876e-06, "loss": 1.258, "num_input_tokens_seen": 4898816, "step": 1196 }, { "epoch": 0.8919523099850969, "grad_norm": 7.71213967689268, "learning_rate": 2.308783168730487e-06, "loss": 1.3441, "num_input_tokens_seen": 4902912, "step": 1197 }, { "epoch": 0.8926974664679582, "grad_norm": 8.719147452146656, "learning_rate": 2.308471965607956e-06, "loss": 1.2108, "num_input_tokens_seen": 4907008, "step": 1198 }, { "epoch": 0.8934426229508197, "grad_norm": 15.244315611246757, "learning_rate": 2.3081605304608117e-06, "loss": 1.3057, "num_input_tokens_seen": 4911104, "step": 1199 }, { "epoch": 0.8941877794336811, "grad_norm": 8.744170448749651, "learning_rate": 2.307848863357324e-06, "loss": 1.2104, "num_input_tokens_seen": 4915200, "step": 1200 }, { "epoch": 0.8949329359165424, "grad_norm": 9.191912222704934, "learning_rate": 2.307536964365811e-06, "loss": 1.1334, "num_input_tokens_seen": 4919296, "step": 1201 }, { "epoch": 0.8956780923994039, "grad_norm": 11.456013961600277, "learning_rate": 2.307224833554644e-06, "loss": 1.0975, "num_input_tokens_seen": 4923392, "step": 1202 }, { "epoch": 0.8964232488822653, "grad_norm": 6.834414918978392, "learning_rate": 2.306912470992244e-06, "loss": 1.2066, "num_input_tokens_seen": 4927488, "step": 1203 }, { "epoch": 0.8971684053651267, "grad_norm": 9.737830361844477, "learning_rate": 2.306599876747084e-06, "loss": 1.324, "num_input_tokens_seen": 4931584, "step": 1204 }, { "epoch": 0.8979135618479881, "grad_norm": 10.581469918138623, "learning_rate": 2.3062870508876855e-06, "loss": 1.3097, "num_input_tokens_seen": 4935680, "step": 1205 }, { "epoch": 0.8986587183308494, "grad_norm": 8.64619025117923, "learning_rate": 2.305973993482623e-06, "loss": 1.1886, "num_input_tokens_seen": 4939776, "step": 1206 }, { "epoch": 0.8994038748137109, "grad_norm": 10.335701115782074, "learning_rate": 2.3056607046005213e-06, "loss": 0.9159, "num_input_tokens_seen": 4943872, "step": 1207 }, { "epoch": 0.9001490312965723, "grad_norm": 10.804627346820899, "learning_rate": 2.3053471843100545e-06, "loss": 1.0504, "num_input_tokens_seen": 4947968, "step": 1208 }, { "epoch": 0.9008941877794336, "grad_norm": 10.618985424372195, "learning_rate": 2.3050334326799494e-06, "loss": 1.0112, "num_input_tokens_seen": 4952064, "step": 1209 }, { "epoch": 0.9016393442622951, "grad_norm": 12.598657821822052, "learning_rate": 2.304719449778982e-06, "loss": 1.4603, "num_input_tokens_seen": 4956160, "step": 1210 }, { "epoch": 0.9023845007451565, "grad_norm": 8.958880170797302, "learning_rate": 2.30440523567598e-06, "loss": 1.3825, "num_input_tokens_seen": 4960256, "step": 1211 }, { "epoch": 0.9031296572280179, "grad_norm": 7.097275081550916, "learning_rate": 2.3040907904398214e-06, "loss": 1.5298, "num_input_tokens_seen": 4964352, "step": 1212 }, { "epoch": 0.9038748137108793, "grad_norm": 9.128687898709634, "learning_rate": 2.303776114139434e-06, "loss": 1.1537, "num_input_tokens_seen": 4968448, "step": 1213 }, { "epoch": 0.9046199701937406, "grad_norm": 7.859872649157166, "learning_rate": 2.303461206843799e-06, "loss": 1.1767, "num_input_tokens_seen": 4972544, "step": 1214 }, { "epoch": 0.9053651266766021, "grad_norm": 6.660982983262677, "learning_rate": 2.3031460686219444e-06, "loss": 1.0499, "num_input_tokens_seen": 4976640, "step": 1215 }, { "epoch": 0.9061102831594635, "grad_norm": 7.441862434155451, "learning_rate": 2.302830699542951e-06, "loss": 1.2986, "num_input_tokens_seen": 4980736, "step": 1216 }, { "epoch": 0.9068554396423248, "grad_norm": 9.311897034346806, "learning_rate": 2.3025150996759503e-06, "loss": 1.063, "num_input_tokens_seen": 4984832, "step": 1217 }, { "epoch": 0.9076005961251863, "grad_norm": 8.519432427396366, "learning_rate": 2.3021992690901247e-06, "loss": 1.4517, "num_input_tokens_seen": 4988928, "step": 1218 }, { "epoch": 0.9083457526080477, "grad_norm": 8.273676962029413, "learning_rate": 2.301883207854706e-06, "loss": 1.099, "num_input_tokens_seen": 4993024, "step": 1219 }, { "epoch": 0.9090909090909091, "grad_norm": 8.482024168115638, "learning_rate": 2.3015669160389766e-06, "loss": 1.2502, "num_input_tokens_seen": 4997120, "step": 1220 }, { "epoch": 0.9098360655737705, "grad_norm": 6.617394854290031, "learning_rate": 2.3012503937122703e-06, "loss": 1.3209, "num_input_tokens_seen": 5001216, "step": 1221 }, { "epoch": 0.910581222056632, "grad_norm": 8.111969177062612, "learning_rate": 2.3009336409439715e-06, "loss": 1.1675, "num_input_tokens_seen": 5005312, "step": 1222 }, { "epoch": 0.9113263785394933, "grad_norm": 8.786423092235989, "learning_rate": 2.3006166578035143e-06, "loss": 1.0914, "num_input_tokens_seen": 5009408, "step": 1223 }, { "epoch": 0.9120715350223547, "grad_norm": 13.837450124996023, "learning_rate": 2.300299444360383e-06, "loss": 1.3941, "num_input_tokens_seen": 5013504, "step": 1224 }, { "epoch": 0.912816691505216, "grad_norm": 12.307227971667283, "learning_rate": 2.2999820006841146e-06, "loss": 1.428, "num_input_tokens_seen": 5017600, "step": 1225 }, { "epoch": 0.9135618479880775, "grad_norm": 9.16769576586407, "learning_rate": 2.2996643268442934e-06, "loss": 0.9285, "num_input_tokens_seen": 5021696, "step": 1226 }, { "epoch": 0.9143070044709389, "grad_norm": 6.571278286623304, "learning_rate": 2.299346422910557e-06, "loss": 1.1296, "num_input_tokens_seen": 5025792, "step": 1227 }, { "epoch": 0.9150521609538003, "grad_norm": 10.053183865435722, "learning_rate": 2.299028288952591e-06, "loss": 1.1678, "num_input_tokens_seen": 5029888, "step": 1228 }, { "epoch": 0.9157973174366617, "grad_norm": 10.55031948573907, "learning_rate": 2.298709925040134e-06, "loss": 0.8149, "num_input_tokens_seen": 5033984, "step": 1229 }, { "epoch": 0.9165424739195231, "grad_norm": 10.750390977188024, "learning_rate": 2.2983913312429726e-06, "loss": 1.2315, "num_input_tokens_seen": 5038080, "step": 1230 }, { "epoch": 0.9172876304023845, "grad_norm": 8.755914276983466, "learning_rate": 2.298072507630945e-06, "loss": 1.164, "num_input_tokens_seen": 5042176, "step": 1231 }, { "epoch": 0.9180327868852459, "grad_norm": 7.43855635810907, "learning_rate": 2.2977534542739404e-06, "loss": 1.1128, "num_input_tokens_seen": 5046272, "step": 1232 }, { "epoch": 0.9187779433681073, "grad_norm": 10.360849476436494, "learning_rate": 2.2974341712418967e-06, "loss": 1.1526, "num_input_tokens_seen": 5050368, "step": 1233 }, { "epoch": 0.9195230998509687, "grad_norm": 8.302662243598295, "learning_rate": 2.297114658604803e-06, "loss": 1.2583, "num_input_tokens_seen": 5054464, "step": 1234 }, { "epoch": 0.9202682563338301, "grad_norm": 10.046337545325464, "learning_rate": 2.2967949164326995e-06, "loss": 0.9214, "num_input_tokens_seen": 5058560, "step": 1235 }, { "epoch": 0.9210134128166915, "grad_norm": 7.35358266838795, "learning_rate": 2.2964749447956758e-06, "loss": 1.1788, "num_input_tokens_seen": 5062656, "step": 1236 }, { "epoch": 0.9217585692995529, "grad_norm": 6.993369647864047, "learning_rate": 2.2961547437638712e-06, "loss": 1.4192, "num_input_tokens_seen": 5066752, "step": 1237 }, { "epoch": 0.9225037257824144, "grad_norm": 7.806477663701671, "learning_rate": 2.2958343134074766e-06, "loss": 1.1698, "num_input_tokens_seen": 5070848, "step": 1238 }, { "epoch": 0.9232488822652757, "grad_norm": 9.590951934162428, "learning_rate": 2.2955136537967325e-06, "loss": 1.1774, "num_input_tokens_seen": 5074944, "step": 1239 }, { "epoch": 0.9239940387481371, "grad_norm": 7.733208870458625, "learning_rate": 2.2951927650019304e-06, "loss": 1.2984, "num_input_tokens_seen": 5079040, "step": 1240 }, { "epoch": 0.9247391952309985, "grad_norm": 9.134876910454276, "learning_rate": 2.2948716470934104e-06, "loss": 0.8935, "num_input_tokens_seen": 5083136, "step": 1241 }, { "epoch": 0.9254843517138599, "grad_norm": 7.339116914865171, "learning_rate": 2.2945503001415643e-06, "loss": 0.9927, "num_input_tokens_seen": 5087232, "step": 1242 }, { "epoch": 0.9262295081967213, "grad_norm": 7.716741530097474, "learning_rate": 2.2942287242168337e-06, "loss": 1.4336, "num_input_tokens_seen": 5091328, "step": 1243 }, { "epoch": 0.9269746646795827, "grad_norm": 9.103100458217378, "learning_rate": 2.2939069193897105e-06, "loss": 1.1284, "num_input_tokens_seen": 5095424, "step": 1244 }, { "epoch": 0.9277198211624441, "grad_norm": 8.19839628580738, "learning_rate": 2.2935848857307362e-06, "loss": 1.4079, "num_input_tokens_seen": 5099520, "step": 1245 }, { "epoch": 0.9284649776453056, "grad_norm": 7.071013830152722, "learning_rate": 2.2932626233105034e-06, "loss": 1.2626, "num_input_tokens_seen": 5103616, "step": 1246 }, { "epoch": 0.9292101341281669, "grad_norm": 8.078215155586078, "learning_rate": 2.2929401321996537e-06, "loss": 1.0845, "num_input_tokens_seen": 5107712, "step": 1247 }, { "epoch": 0.9299552906110283, "grad_norm": 7.7789747632587245, "learning_rate": 2.2926174124688797e-06, "loss": 1.0761, "num_input_tokens_seen": 5111808, "step": 1248 }, { "epoch": 0.9307004470938898, "grad_norm": 7.581689910723496, "learning_rate": 2.292294464188924e-06, "loss": 1.1594, "num_input_tokens_seen": 5115904, "step": 1249 }, { "epoch": 0.9314456035767511, "grad_norm": 8.442729569156628, "learning_rate": 2.2919712874305794e-06, "loss": 1.2593, "num_input_tokens_seen": 5120000, "step": 1250 }, { "epoch": 0.9321907600596125, "grad_norm": 8.535132940189678, "learning_rate": 2.2916478822646877e-06, "loss": 1.1582, "num_input_tokens_seen": 5124096, "step": 1251 }, { "epoch": 0.9329359165424739, "grad_norm": 6.7104441937674535, "learning_rate": 2.2913242487621427e-06, "loss": 1.3744, "num_input_tokens_seen": 5128192, "step": 1252 }, { "epoch": 0.9336810730253353, "grad_norm": 6.455845044835539, "learning_rate": 2.291000386993886e-06, "loss": 1.2049, "num_input_tokens_seen": 5132288, "step": 1253 }, { "epoch": 0.9344262295081968, "grad_norm": 8.284103821501642, "learning_rate": 2.290676297030912e-06, "loss": 1.1112, "num_input_tokens_seen": 5136384, "step": 1254 }, { "epoch": 0.9351713859910581, "grad_norm": 7.935034114290869, "learning_rate": 2.290351978944262e-06, "loss": 1.2231, "num_input_tokens_seen": 5140480, "step": 1255 }, { "epoch": 0.9359165424739195, "grad_norm": 10.993313515747566, "learning_rate": 2.2900274328050295e-06, "loss": 1.0871, "num_input_tokens_seen": 5144576, "step": 1256 }, { "epoch": 0.936661698956781, "grad_norm": 14.747591855004698, "learning_rate": 2.2897026586843573e-06, "loss": 1.4123, "num_input_tokens_seen": 5148672, "step": 1257 }, { "epoch": 0.9374068554396423, "grad_norm": 8.237626696047425, "learning_rate": 2.289377656653438e-06, "loss": 1.0333, "num_input_tokens_seen": 5152768, "step": 1258 }, { "epoch": 0.9381520119225037, "grad_norm": 8.622648754580187, "learning_rate": 2.289052426783515e-06, "loss": 1.2072, "num_input_tokens_seen": 5156864, "step": 1259 }, { "epoch": 0.9388971684053651, "grad_norm": 11.86977864730357, "learning_rate": 2.2887269691458804e-06, "loss": 1.249, "num_input_tokens_seen": 5160960, "step": 1260 }, { "epoch": 0.9396423248882265, "grad_norm": 9.918208391705473, "learning_rate": 2.2884012838118765e-06, "loss": 0.9638, "num_input_tokens_seen": 5165056, "step": 1261 }, { "epoch": 0.940387481371088, "grad_norm": 9.70315188206784, "learning_rate": 2.288075370852897e-06, "loss": 0.9925, "num_input_tokens_seen": 5169152, "step": 1262 }, { "epoch": 0.9411326378539493, "grad_norm": 8.059696159923176, "learning_rate": 2.287749230340383e-06, "loss": 1.0942, "num_input_tokens_seen": 5173248, "step": 1263 }, { "epoch": 0.9418777943368107, "grad_norm": 11.703891764948729, "learning_rate": 2.2874228623458283e-06, "loss": 1.2773, "num_input_tokens_seen": 5177344, "step": 1264 }, { "epoch": 0.9426229508196722, "grad_norm": 7.309508414597323, "learning_rate": 2.2870962669407735e-06, "loss": 1.4217, "num_input_tokens_seen": 5181440, "step": 1265 }, { "epoch": 0.9433681073025335, "grad_norm": 8.42781686122746, "learning_rate": 2.2867694441968123e-06, "loss": 1.2464, "num_input_tokens_seen": 5185536, "step": 1266 }, { "epoch": 0.944113263785395, "grad_norm": 8.07918914308989, "learning_rate": 2.286442394185585e-06, "loss": 1.3176, "num_input_tokens_seen": 5189632, "step": 1267 }, { "epoch": 0.9448584202682563, "grad_norm": 7.774419454976433, "learning_rate": 2.2861151169787845e-06, "loss": 1.2914, "num_input_tokens_seen": 5193728, "step": 1268 }, { "epoch": 0.9456035767511177, "grad_norm": 8.478907131904492, "learning_rate": 2.285787612648152e-06, "loss": 1.2796, "num_input_tokens_seen": 5197824, "step": 1269 }, { "epoch": 0.9463487332339792, "grad_norm": 8.488272212039373, "learning_rate": 2.2854598812654784e-06, "loss": 1.1236, "num_input_tokens_seen": 5201920, "step": 1270 }, { "epoch": 0.9470938897168405, "grad_norm": 7.216317446659815, "learning_rate": 2.2851319229026046e-06, "loss": 1.0794, "num_input_tokens_seen": 5206016, "step": 1271 }, { "epoch": 0.9478390461997019, "grad_norm": 26.53545514583285, "learning_rate": 2.284803737631422e-06, "loss": 1.0616, "num_input_tokens_seen": 5210112, "step": 1272 }, { "epoch": 0.9485842026825634, "grad_norm": 7.144651443545499, "learning_rate": 2.284475325523871e-06, "loss": 1.0694, "num_input_tokens_seen": 5214208, "step": 1273 }, { "epoch": 0.9493293591654247, "grad_norm": 9.07936845032501, "learning_rate": 2.284146686651942e-06, "loss": 0.8849, "num_input_tokens_seen": 5218304, "step": 1274 }, { "epoch": 0.9500745156482862, "grad_norm": 11.538603269409286, "learning_rate": 2.283817821087675e-06, "loss": 1.1473, "num_input_tokens_seen": 5222400, "step": 1275 }, { "epoch": 0.9508196721311475, "grad_norm": 7.61062242302232, "learning_rate": 2.2834887289031586e-06, "loss": 1.2546, "num_input_tokens_seen": 5226496, "step": 1276 }, { "epoch": 0.9515648286140089, "grad_norm": 8.654301267634775, "learning_rate": 2.283159410170534e-06, "loss": 1.4144, "num_input_tokens_seen": 5230592, "step": 1277 }, { "epoch": 0.9523099850968704, "grad_norm": 7.789675831595845, "learning_rate": 2.282829864961989e-06, "loss": 1.2992, "num_input_tokens_seen": 5234688, "step": 1278 }, { "epoch": 0.9530551415797317, "grad_norm": 8.77734389840339, "learning_rate": 2.2825000933497623e-06, "loss": 0.8906, "num_input_tokens_seen": 5238784, "step": 1279 }, { "epoch": 0.9538002980625931, "grad_norm": 8.638690777376812, "learning_rate": 2.2821700954061425e-06, "loss": 1.2396, "num_input_tokens_seen": 5242880, "step": 1280 }, { "epoch": 0.9545454545454546, "grad_norm": 8.675550998376373, "learning_rate": 2.2818398712034677e-06, "loss": 1.206, "num_input_tokens_seen": 5246976, "step": 1281 }, { "epoch": 0.9552906110283159, "grad_norm": 9.119791605991532, "learning_rate": 2.281509420814125e-06, "loss": 1.2336, "num_input_tokens_seen": 5251072, "step": 1282 }, { "epoch": 0.9560357675111774, "grad_norm": 6.620672646924315, "learning_rate": 2.2811787443105513e-06, "loss": 1.2145, "num_input_tokens_seen": 5255168, "step": 1283 }, { "epoch": 0.9567809239940388, "grad_norm": 8.478624312940493, "learning_rate": 2.280847841765234e-06, "loss": 1.2463, "num_input_tokens_seen": 5259264, "step": 1284 }, { "epoch": 0.9575260804769001, "grad_norm": 8.352125742916074, "learning_rate": 2.280516713250709e-06, "loss": 0.9038, "num_input_tokens_seen": 5263360, "step": 1285 }, { "epoch": 0.9582712369597616, "grad_norm": 7.434548624955863, "learning_rate": 2.2801853588395615e-06, "loss": 1.3812, "num_input_tokens_seen": 5267456, "step": 1286 }, { "epoch": 0.9590163934426229, "grad_norm": 6.854439577570997, "learning_rate": 2.2798537786044273e-06, "loss": 1.5859, "num_input_tokens_seen": 5271552, "step": 1287 }, { "epoch": 0.9597615499254843, "grad_norm": 7.126147501615068, "learning_rate": 2.279521972617991e-06, "loss": 1.2467, "num_input_tokens_seen": 5275648, "step": 1288 }, { "epoch": 0.9605067064083458, "grad_norm": 7.650274925428415, "learning_rate": 2.279189940952987e-06, "loss": 1.1051, "num_input_tokens_seen": 5279744, "step": 1289 }, { "epoch": 0.9612518628912071, "grad_norm": 8.946799964446532, "learning_rate": 2.2788576836821985e-06, "loss": 1.0335, "num_input_tokens_seen": 5283840, "step": 1290 }, { "epoch": 0.9619970193740686, "grad_norm": 8.082917247663234, "learning_rate": 2.2785252008784594e-06, "loss": 1.3206, "num_input_tokens_seen": 5287936, "step": 1291 }, { "epoch": 0.96274217585693, "grad_norm": 8.909738945778198, "learning_rate": 2.278192492614652e-06, "loss": 1.1775, "num_input_tokens_seen": 5292032, "step": 1292 }, { "epoch": 0.9634873323397913, "grad_norm": 8.166144587352969, "learning_rate": 2.277859558963708e-06, "loss": 1.2737, "num_input_tokens_seen": 5296128, "step": 1293 }, { "epoch": 0.9642324888226528, "grad_norm": 10.65546831757371, "learning_rate": 2.277526399998609e-06, "loss": 1.2061, "num_input_tokens_seen": 5300224, "step": 1294 }, { "epoch": 0.9649776453055141, "grad_norm": 12.721787632373532, "learning_rate": 2.277193015792386e-06, "loss": 1.1353, "num_input_tokens_seen": 5304320, "step": 1295 }, { "epoch": 0.9657228017883756, "grad_norm": 7.263264544943508, "learning_rate": 2.2768594064181193e-06, "loss": 1.3654, "num_input_tokens_seen": 5308416, "step": 1296 }, { "epoch": 0.966467958271237, "grad_norm": 8.171793809169147, "learning_rate": 2.276525571948938e-06, "loss": 1.1877, "num_input_tokens_seen": 5312512, "step": 1297 }, { "epoch": 0.9672131147540983, "grad_norm": 9.560757682616277, "learning_rate": 2.276191512458022e-06, "loss": 1.3979, "num_input_tokens_seen": 5316608, "step": 1298 }, { "epoch": 0.9679582712369598, "grad_norm": 7.049928778351161, "learning_rate": 2.2758572280185983e-06, "loss": 1.0945, "num_input_tokens_seen": 5320704, "step": 1299 }, { "epoch": 0.9687034277198212, "grad_norm": 8.275577129765242, "learning_rate": 2.2755227187039446e-06, "loss": 1.0659, "num_input_tokens_seen": 5324800, "step": 1300 }, { "epoch": 0.9694485842026825, "grad_norm": 8.550906355248975, "learning_rate": 2.275187984587389e-06, "loss": 1.0792, "num_input_tokens_seen": 5328896, "step": 1301 }, { "epoch": 0.970193740685544, "grad_norm": 9.493039245468117, "learning_rate": 2.2748530257423056e-06, "loss": 1.065, "num_input_tokens_seen": 5332992, "step": 1302 }, { "epoch": 0.9709388971684053, "grad_norm": 8.537228700333195, "learning_rate": 2.2745178422421215e-06, "loss": 0.9912, "num_input_tokens_seen": 5337088, "step": 1303 }, { "epoch": 0.9716840536512668, "grad_norm": 10.782611196588, "learning_rate": 2.2741824341603105e-06, "loss": 1.0781, "num_input_tokens_seen": 5341184, "step": 1304 }, { "epoch": 0.9724292101341282, "grad_norm": 6.550610492983025, "learning_rate": 2.2738468015703966e-06, "loss": 1.2993, "num_input_tokens_seen": 5345280, "step": 1305 }, { "epoch": 0.9731743666169895, "grad_norm": 8.750379144819261, "learning_rate": 2.273510944545953e-06, "loss": 1.3574, "num_input_tokens_seen": 5349376, "step": 1306 }, { "epoch": 0.973919523099851, "grad_norm": 6.939046668835119, "learning_rate": 2.2731748631606018e-06, "loss": 1.1263, "num_input_tokens_seen": 5353472, "step": 1307 }, { "epoch": 0.9746646795827124, "grad_norm": 7.8148614337904485, "learning_rate": 2.2728385574880147e-06, "loss": 1.4476, "num_input_tokens_seen": 5357568, "step": 1308 }, { "epoch": 0.9754098360655737, "grad_norm": 11.76690903761927, "learning_rate": 2.2725020276019115e-06, "loss": 0.9841, "num_input_tokens_seen": 5361664, "step": 1309 }, { "epoch": 0.9761549925484352, "grad_norm": 10.192772660681026, "learning_rate": 2.272165273576063e-06, "loss": 1.2425, "num_input_tokens_seen": 5365760, "step": 1310 }, { "epoch": 0.9769001490312966, "grad_norm": 7.37376235412694, "learning_rate": 2.2718282954842875e-06, "loss": 1.2612, "num_input_tokens_seen": 5369856, "step": 1311 }, { "epoch": 0.977645305514158, "grad_norm": 7.276047302331616, "learning_rate": 2.271491093400453e-06, "loss": 0.957, "num_input_tokens_seen": 5373952, "step": 1312 }, { "epoch": 0.9783904619970194, "grad_norm": 10.546663366632242, "learning_rate": 2.271153667398477e-06, "loss": 1.0855, "num_input_tokens_seen": 5378048, "step": 1313 }, { "epoch": 0.9791356184798807, "grad_norm": 9.394113928970555, "learning_rate": 2.270816017552325e-06, "loss": 1.1942, "num_input_tokens_seen": 5382144, "step": 1314 }, { "epoch": 0.9798807749627422, "grad_norm": 6.519547260398689, "learning_rate": 2.270478143936013e-06, "loss": 1.1862, "num_input_tokens_seen": 5386240, "step": 1315 }, { "epoch": 0.9806259314456036, "grad_norm": 7.8225813022607, "learning_rate": 2.270140046623605e-06, "loss": 0.97, "num_input_tokens_seen": 5390336, "step": 1316 }, { "epoch": 0.981371087928465, "grad_norm": 7.508249513532412, "learning_rate": 2.2698017256892143e-06, "loss": 0.8966, "num_input_tokens_seen": 5394432, "step": 1317 }, { "epoch": 0.9821162444113264, "grad_norm": 7.849609870255619, "learning_rate": 2.2694631812070028e-06, "loss": 1.0896, "num_input_tokens_seen": 5398528, "step": 1318 }, { "epoch": 0.9828614008941878, "grad_norm": 8.082712023874087, "learning_rate": 2.269124413251183e-06, "loss": 1.4936, "num_input_tokens_seen": 5402624, "step": 1319 }, { "epoch": 0.9836065573770492, "grad_norm": 6.771938646660693, "learning_rate": 2.268785421896014e-06, "loss": 1.0596, "num_input_tokens_seen": 5406720, "step": 1320 }, { "epoch": 0.9843517138599106, "grad_norm": 8.131657932738422, "learning_rate": 2.2684462072158064e-06, "loss": 1.0785, "num_input_tokens_seen": 5410816, "step": 1321 }, { "epoch": 0.9850968703427719, "grad_norm": 9.702368741965985, "learning_rate": 2.2681067692849176e-06, "loss": 1.2295, "num_input_tokens_seen": 5414912, "step": 1322 }, { "epoch": 0.9858420268256334, "grad_norm": 8.716771854671977, "learning_rate": 2.267767108177755e-06, "loss": 1.0792, "num_input_tokens_seen": 5419008, "step": 1323 }, { "epoch": 0.9865871833084948, "grad_norm": 7.728965657234804, "learning_rate": 2.267427223968775e-06, "loss": 1.2822, "num_input_tokens_seen": 5423104, "step": 1324 }, { "epoch": 0.9873323397913562, "grad_norm": 10.381752392113267, "learning_rate": 2.267087116732482e-06, "loss": 1.0748, "num_input_tokens_seen": 5427200, "step": 1325 }, { "epoch": 0.9880774962742176, "grad_norm": 8.415616884551103, "learning_rate": 2.2667467865434304e-06, "loss": 1.2333, "num_input_tokens_seen": 5431296, "step": 1326 }, { "epoch": 0.988822652757079, "grad_norm": 11.132328767314865, "learning_rate": 2.266406233476223e-06, "loss": 1.2005, "num_input_tokens_seen": 5435392, "step": 1327 }, { "epoch": 0.9895678092399404, "grad_norm": 7.137460385268431, "learning_rate": 2.266065457605512e-06, "loss": 1.0831, "num_input_tokens_seen": 5439488, "step": 1328 }, { "epoch": 0.9903129657228018, "grad_norm": 9.88592443055109, "learning_rate": 2.2657244590059966e-06, "loss": 1.3163, "num_input_tokens_seen": 5443584, "step": 1329 }, { "epoch": 0.9910581222056631, "grad_norm": 8.782816669866095, "learning_rate": 2.2653832377524274e-06, "loss": 0.8155, "num_input_tokens_seen": 5447680, "step": 1330 }, { "epoch": 0.9918032786885246, "grad_norm": 6.848954355994219, "learning_rate": 2.265041793919602e-06, "loss": 1.129, "num_input_tokens_seen": 5451776, "step": 1331 }, { "epoch": 0.992548435171386, "grad_norm": 8.06204473462217, "learning_rate": 2.264700127582367e-06, "loss": 1.2989, "num_input_tokens_seen": 5455872, "step": 1332 }, { "epoch": 0.9932935916542474, "grad_norm": 12.438297014555065, "learning_rate": 2.264358238815619e-06, "loss": 1.1894, "num_input_tokens_seen": 5459968, "step": 1333 }, { "epoch": 0.9940387481371088, "grad_norm": 7.003248819146497, "learning_rate": 2.264016127694301e-06, "loss": 1.4266, "num_input_tokens_seen": 5464064, "step": 1334 }, { "epoch": 0.9947839046199702, "grad_norm": 8.273531356634525, "learning_rate": 2.263673794293407e-06, "loss": 0.9122, "num_input_tokens_seen": 5468160, "step": 1335 }, { "epoch": 0.9955290611028316, "grad_norm": 7.398159481473916, "learning_rate": 2.26333123868798e-06, "loss": 1.1205, "num_input_tokens_seen": 5472256, "step": 1336 }, { "epoch": 0.996274217585693, "grad_norm": 7.237367933400858, "learning_rate": 2.2629884609531093e-06, "loss": 0.876, "num_input_tokens_seen": 5476352, "step": 1337 }, { "epoch": 0.9970193740685543, "grad_norm": 7.375186534663657, "learning_rate": 2.262645461163934e-06, "loss": 1.0428, "num_input_tokens_seen": 5480448, "step": 1338 }, { "epoch": 0.9977645305514158, "grad_norm": 7.729102383143776, "learning_rate": 2.2623022393956433e-06, "loss": 1.2635, "num_input_tokens_seen": 5484544, "step": 1339 }, { "epoch": 0.9985096870342772, "grad_norm": 7.0567599822942215, "learning_rate": 2.261958795723473e-06, "loss": 0.8619, "num_input_tokens_seen": 5488640, "step": 1340 }, { "epoch": 0.9992548435171386, "grad_norm": 7.666673204731755, "learning_rate": 2.261615130222709e-06, "loss": 1.376, "num_input_tokens_seen": 5492736, "step": 1341 }, { "epoch": 1.0, "grad_norm": 6.167173501109273, "learning_rate": 2.2612712429686846e-06, "loss": 1.0728, "num_input_tokens_seen": 5496832, "step": 1342 }, { "epoch": 1.0007451564828613, "grad_norm": 7.605644847048881, "learning_rate": 2.2609271340367826e-06, "loss": 0.7283, "num_input_tokens_seen": 5500928, "step": 1343 }, { "epoch": 1.0014903129657229, "grad_norm": 7.631919972537247, "learning_rate": 2.260582803502434e-06, "loss": 1.0523, "num_input_tokens_seen": 5505024, "step": 1344 }, { "epoch": 1.0022354694485842, "grad_norm": 7.191402690228092, "learning_rate": 2.2602382514411186e-06, "loss": 0.9486, "num_input_tokens_seen": 5509120, "step": 1345 }, { "epoch": 1.0029806259314455, "grad_norm": 10.378594835770846, "learning_rate": 2.2598934779283656e-06, "loss": 0.927, "num_input_tokens_seen": 5513216, "step": 1346 }, { "epoch": 1.003725782414307, "grad_norm": 9.21043992205189, "learning_rate": 2.2595484830397503e-06, "loss": 0.7872, "num_input_tokens_seen": 5517312, "step": 1347 }, { "epoch": 1.0044709388971684, "grad_norm": 7.750653990565111, "learning_rate": 2.2592032668508985e-06, "loss": 0.9156, "num_input_tokens_seen": 5521408, "step": 1348 }, { "epoch": 1.0052160953800298, "grad_norm": 7.2985226782129455, "learning_rate": 2.258857829437484e-06, "loss": 0.9037, "num_input_tokens_seen": 5525504, "step": 1349 }, { "epoch": 1.0059612518628913, "grad_norm": 6.263194330651766, "learning_rate": 2.25851217087523e-06, "loss": 0.5464, "num_input_tokens_seen": 5529600, "step": 1350 }, { "epoch": 1.0067064083457526, "grad_norm": 6.555988876281326, "learning_rate": 2.258166291239907e-06, "loss": 0.9373, "num_input_tokens_seen": 5533696, "step": 1351 }, { "epoch": 1.007451564828614, "grad_norm": 9.998695825674233, "learning_rate": 2.257820190607333e-06, "loss": 0.6238, "num_input_tokens_seen": 5537792, "step": 1352 }, { "epoch": 1.0081967213114753, "grad_norm": 9.427471466960966, "learning_rate": 2.2574738690533774e-06, "loss": 0.934, "num_input_tokens_seen": 5541888, "step": 1353 }, { "epoch": 1.0089418777943369, "grad_norm": 7.494905023393865, "learning_rate": 2.257127326653956e-06, "loss": 0.7284, "num_input_tokens_seen": 5545984, "step": 1354 }, { "epoch": 1.0096870342771982, "grad_norm": 8.573930347210915, "learning_rate": 2.2567805634850327e-06, "loss": 1.0416, "num_input_tokens_seen": 5550080, "step": 1355 }, { "epoch": 1.0104321907600595, "grad_norm": 8.580870517179642, "learning_rate": 2.256433579622621e-06, "loss": 0.6216, "num_input_tokens_seen": 5554176, "step": 1356 }, { "epoch": 1.011177347242921, "grad_norm": 10.874546395850436, "learning_rate": 2.256086375142782e-06, "loss": 0.6755, "num_input_tokens_seen": 5558272, "step": 1357 }, { "epoch": 1.0119225037257824, "grad_norm": 11.034164372416862, "learning_rate": 2.255738950121625e-06, "loss": 0.4222, "num_input_tokens_seen": 5562368, "step": 1358 }, { "epoch": 1.0126676602086437, "grad_norm": 10.800304467234213, "learning_rate": 2.255391304635309e-06, "loss": 1.0672, "num_input_tokens_seen": 5566464, "step": 1359 }, { "epoch": 1.0134128166915053, "grad_norm": 14.382321963521148, "learning_rate": 2.25504343876004e-06, "loss": 0.6409, "num_input_tokens_seen": 5570560, "step": 1360 }, { "epoch": 1.0141579731743666, "grad_norm": 9.580098197084956, "learning_rate": 2.2546953525720724e-06, "loss": 0.7976, "num_input_tokens_seen": 5574656, "step": 1361 }, { "epoch": 1.014903129657228, "grad_norm": 8.699414886015418, "learning_rate": 2.254347046147709e-06, "loss": 0.996, "num_input_tokens_seen": 5578752, "step": 1362 }, { "epoch": 1.0156482861400895, "grad_norm": 9.151724259059032, "learning_rate": 2.2539985195633016e-06, "loss": 0.4849, "num_input_tokens_seen": 5582848, "step": 1363 }, { "epoch": 1.0163934426229508, "grad_norm": 8.730545496225847, "learning_rate": 2.25364977289525e-06, "loss": 1.0111, "num_input_tokens_seen": 5586944, "step": 1364 }, { "epoch": 1.0171385991058122, "grad_norm": 11.18118236213413, "learning_rate": 2.2533008062200006e-06, "loss": 0.6742, "num_input_tokens_seen": 5591040, "step": 1365 }, { "epoch": 1.0178837555886737, "grad_norm": 14.494020088049117, "learning_rate": 2.2529516196140513e-06, "loss": 1.1177, "num_input_tokens_seen": 5595136, "step": 1366 }, { "epoch": 1.018628912071535, "grad_norm": 6.799442084507097, "learning_rate": 2.252602213153944e-06, "loss": 0.395, "num_input_tokens_seen": 5599232, "step": 1367 }, { "epoch": 1.0193740685543964, "grad_norm": 7.509496807646831, "learning_rate": 2.252252586916274e-06, "loss": 0.7613, "num_input_tokens_seen": 5603328, "step": 1368 }, { "epoch": 1.0201192250372577, "grad_norm": 8.110060707199581, "learning_rate": 2.251902740977679e-06, "loss": 0.8923, "num_input_tokens_seen": 5607424, "step": 1369 }, { "epoch": 1.0208643815201193, "grad_norm": 8.131636586521052, "learning_rate": 2.2515526754148493e-06, "loss": 0.5857, "num_input_tokens_seen": 5611520, "step": 1370 }, { "epoch": 1.0216095380029806, "grad_norm": 7.876875215251119, "learning_rate": 2.251202390304521e-06, "loss": 0.7844, "num_input_tokens_seen": 5615616, "step": 1371 }, { "epoch": 1.022354694485842, "grad_norm": 12.944660831210319, "learning_rate": 2.2508518857234802e-06, "loss": 0.6654, "num_input_tokens_seen": 5619712, "step": 1372 }, { "epoch": 1.0230998509687035, "grad_norm": 7.913864017202185, "learning_rate": 2.2505011617485588e-06, "loss": 0.8404, "num_input_tokens_seen": 5623808, "step": 1373 }, { "epoch": 1.0238450074515648, "grad_norm": 8.66736534734557, "learning_rate": 2.2501502184566394e-06, "loss": 0.452, "num_input_tokens_seen": 5627904, "step": 1374 }, { "epoch": 1.0245901639344261, "grad_norm": 8.912440439640948, "learning_rate": 2.2497990559246496e-06, "loss": 0.6388, "num_input_tokens_seen": 5632000, "step": 1375 }, { "epoch": 1.0253353204172877, "grad_norm": 9.021522389899893, "learning_rate": 2.2494476742295675e-06, "loss": 0.9449, "num_input_tokens_seen": 5636096, "step": 1376 }, { "epoch": 1.026080476900149, "grad_norm": 9.306484632541483, "learning_rate": 2.2490960734484186e-06, "loss": 0.7754, "num_input_tokens_seen": 5640192, "step": 1377 }, { "epoch": 1.0268256333830104, "grad_norm": 9.163326530985943, "learning_rate": 2.2487442536582764e-06, "loss": 0.8899, "num_input_tokens_seen": 5644288, "step": 1378 }, { "epoch": 1.027570789865872, "grad_norm": 7.804853682036171, "learning_rate": 2.2483922149362622e-06, "loss": 0.5382, "num_input_tokens_seen": 5648384, "step": 1379 }, { "epoch": 1.0283159463487332, "grad_norm": 8.587544565918346, "learning_rate": 2.248039957359545e-06, "loss": 0.7545, "num_input_tokens_seen": 5652480, "step": 1380 }, { "epoch": 1.0290611028315946, "grad_norm": 10.898933369178398, "learning_rate": 2.247687481005343e-06, "loss": 0.7258, "num_input_tokens_seen": 5656576, "step": 1381 }, { "epoch": 1.0298062593144561, "grad_norm": 14.612180555266047, "learning_rate": 2.2473347859509206e-06, "loss": 0.4328, "num_input_tokens_seen": 5660672, "step": 1382 }, { "epoch": 1.0305514157973175, "grad_norm": 7.695095058301363, "learning_rate": 2.2469818722735918e-06, "loss": 0.9117, "num_input_tokens_seen": 5664768, "step": 1383 }, { "epoch": 1.0312965722801788, "grad_norm": 9.199060257950586, "learning_rate": 2.2466287400507177e-06, "loss": 1.1401, "num_input_tokens_seen": 5668864, "step": 1384 }, { "epoch": 1.0320417287630403, "grad_norm": 8.24186591283817, "learning_rate": 2.2462753893597065e-06, "loss": 0.6311, "num_input_tokens_seen": 5672960, "step": 1385 }, { "epoch": 1.0327868852459017, "grad_norm": 8.235009982309595, "learning_rate": 2.2459218202780165e-06, "loss": 0.9778, "num_input_tokens_seen": 5677056, "step": 1386 }, { "epoch": 1.033532041728763, "grad_norm": 7.355273490550299, "learning_rate": 2.2455680328831515e-06, "loss": 1.0053, "num_input_tokens_seen": 5681152, "step": 1387 }, { "epoch": 1.0342771982116243, "grad_norm": 13.462423735914463, "learning_rate": 2.245214027252665e-06, "loss": 0.8007, "num_input_tokens_seen": 5685248, "step": 1388 }, { "epoch": 1.035022354694486, "grad_norm": 8.235532903480713, "learning_rate": 2.2448598034641574e-06, "loss": 0.6788, "num_input_tokens_seen": 5689344, "step": 1389 }, { "epoch": 1.0357675111773472, "grad_norm": 8.151770214519473, "learning_rate": 2.2445053615952767e-06, "loss": 1.0024, "num_input_tokens_seen": 5693440, "step": 1390 }, { "epoch": 1.0365126676602086, "grad_norm": 6.777357353647815, "learning_rate": 2.2441507017237196e-06, "loss": 0.466, "num_input_tokens_seen": 5697536, "step": 1391 }, { "epoch": 1.03725782414307, "grad_norm": 8.282498121186258, "learning_rate": 2.2437958239272294e-06, "loss": 1.0094, "num_input_tokens_seen": 5701632, "step": 1392 }, { "epoch": 1.0380029806259314, "grad_norm": 7.523271319721218, "learning_rate": 2.2434407282835984e-06, "loss": 0.9878, "num_input_tokens_seen": 5705728, "step": 1393 }, { "epoch": 1.0387481371087928, "grad_norm": 8.395473846313802, "learning_rate": 2.2430854148706664e-06, "loss": 0.5129, "num_input_tokens_seen": 5709824, "step": 1394 }, { "epoch": 1.0394932935916543, "grad_norm": 8.26392156667665, "learning_rate": 2.24272988376632e-06, "loss": 0.5691, "num_input_tokens_seen": 5713920, "step": 1395 }, { "epoch": 1.0402384500745157, "grad_norm": 8.855545919091913, "learning_rate": 2.242374135048494e-06, "loss": 0.8821, "num_input_tokens_seen": 5718016, "step": 1396 }, { "epoch": 1.040983606557377, "grad_norm": 6.636343138152219, "learning_rate": 2.2420181687951714e-06, "loss": 0.9621, "num_input_tokens_seen": 5722112, "step": 1397 }, { "epoch": 1.0417287630402385, "grad_norm": 9.377070206358514, "learning_rate": 2.241661985084383e-06, "loss": 0.813, "num_input_tokens_seen": 5726208, "step": 1398 }, { "epoch": 1.0424739195230999, "grad_norm": 7.3201338142607, "learning_rate": 2.2413055839942063e-06, "loss": 0.7249, "num_input_tokens_seen": 5730304, "step": 1399 }, { "epoch": 1.0432190760059612, "grad_norm": 9.550146331933075, "learning_rate": 2.2409489656027665e-06, "loss": 0.8146, "num_input_tokens_seen": 5734400, "step": 1400 }, { "epoch": 1.0439642324888228, "grad_norm": 8.901713949866567, "learning_rate": 2.240592129988238e-06, "loss": 0.9992, "num_input_tokens_seen": 5738496, "step": 1401 }, { "epoch": 1.044709388971684, "grad_norm": 8.118459269007966, "learning_rate": 2.240235077228841e-06, "loss": 0.9069, "num_input_tokens_seen": 5742592, "step": 1402 }, { "epoch": 1.0454545454545454, "grad_norm": 10.547289754436557, "learning_rate": 2.2398778074028442e-06, "loss": 1.1653, "num_input_tokens_seen": 5746688, "step": 1403 }, { "epoch": 1.046199701937407, "grad_norm": 6.452473904803629, "learning_rate": 2.2395203205885633e-06, "loss": 1.0047, "num_input_tokens_seen": 5750784, "step": 1404 }, { "epoch": 1.0469448584202683, "grad_norm": 9.370917112544815, "learning_rate": 2.239162616864363e-06, "loss": 0.8445, "num_input_tokens_seen": 5754880, "step": 1405 }, { "epoch": 1.0476900149031296, "grad_norm": 8.20101594717181, "learning_rate": 2.238804696308653e-06, "loss": 1.149, "num_input_tokens_seen": 5758976, "step": 1406 }, { "epoch": 1.048435171385991, "grad_norm": 8.14754666274824, "learning_rate": 2.2384465589998937e-06, "loss": 0.8877, "num_input_tokens_seen": 5763072, "step": 1407 }, { "epoch": 1.0491803278688525, "grad_norm": 8.837006386543786, "learning_rate": 2.2380882050165897e-06, "loss": 0.7374, "num_input_tokens_seen": 5767168, "step": 1408 }, { "epoch": 1.0499254843517138, "grad_norm": 9.554647538809299, "learning_rate": 2.2377296344372963e-06, "loss": 0.8589, "num_input_tokens_seen": 5771264, "step": 1409 }, { "epoch": 1.0506706408345752, "grad_norm": 8.68576791817589, "learning_rate": 2.2373708473406137e-06, "loss": 0.8586, "num_input_tokens_seen": 5775360, "step": 1410 }, { "epoch": 1.0514157973174367, "grad_norm": 8.760472210150937, "learning_rate": 2.2370118438051906e-06, "loss": 1.0529, "num_input_tokens_seen": 5779456, "step": 1411 }, { "epoch": 1.052160953800298, "grad_norm": 7.032695255334956, "learning_rate": 2.2366526239097243e-06, "loss": 1.0071, "num_input_tokens_seen": 5783552, "step": 1412 }, { "epoch": 1.0529061102831594, "grad_norm": 7.620891475749697, "learning_rate": 2.2362931877329567e-06, "loss": 0.6733, "num_input_tokens_seen": 5787648, "step": 1413 }, { "epoch": 1.053651266766021, "grad_norm": 6.880168290369376, "learning_rate": 2.2359335353536797e-06, "loss": 0.6588, "num_input_tokens_seen": 5791744, "step": 1414 }, { "epoch": 1.0543964232488823, "grad_norm": 8.547881331173793, "learning_rate": 2.235573666850732e-06, "loss": 1.0424, "num_input_tokens_seen": 5795840, "step": 1415 }, { "epoch": 1.0551415797317436, "grad_norm": 8.984957529865138, "learning_rate": 2.2352135823029987e-06, "loss": 0.6086, "num_input_tokens_seen": 5799936, "step": 1416 }, { "epoch": 1.0558867362146052, "grad_norm": 7.224630184740276, "learning_rate": 2.234853281789413e-06, "loss": 0.726, "num_input_tokens_seen": 5804032, "step": 1417 }, { "epoch": 1.0566318926974665, "grad_norm": 10.07854525462653, "learning_rate": 2.2344927653889553e-06, "loss": 0.4419, "num_input_tokens_seen": 5808128, "step": 1418 }, { "epoch": 1.0573770491803278, "grad_norm": 9.949056181490057, "learning_rate": 2.234132033180654e-06, "loss": 0.8642, "num_input_tokens_seen": 5812224, "step": 1419 }, { "epoch": 1.0581222056631894, "grad_norm": 7.785080262723329, "learning_rate": 2.2337710852435835e-06, "loss": 0.7513, "num_input_tokens_seen": 5816320, "step": 1420 }, { "epoch": 1.0588673621460507, "grad_norm": 8.683142654726757, "learning_rate": 2.233409921656866e-06, "loss": 0.6049, "num_input_tokens_seen": 5820416, "step": 1421 }, { "epoch": 1.059612518628912, "grad_norm": 11.585482659648898, "learning_rate": 2.2330485424996717e-06, "loss": 0.7998, "num_input_tokens_seen": 5824512, "step": 1422 }, { "epoch": 1.0603576751117734, "grad_norm": 8.273952197477044, "learning_rate": 2.2326869478512177e-06, "loss": 0.4963, "num_input_tokens_seen": 5828608, "step": 1423 }, { "epoch": 1.061102831594635, "grad_norm": 8.120132555750848, "learning_rate": 2.232325137790767e-06, "loss": 0.91, "num_input_tokens_seen": 5832704, "step": 1424 }, { "epoch": 1.0618479880774963, "grad_norm": 7.380879313573961, "learning_rate": 2.231963112397632e-06, "loss": 0.7668, "num_input_tokens_seen": 5836800, "step": 1425 }, { "epoch": 1.0625931445603576, "grad_norm": 9.02142534887218, "learning_rate": 2.231600871751171e-06, "loss": 1.0563, "num_input_tokens_seen": 5840896, "step": 1426 }, { "epoch": 1.0633383010432191, "grad_norm": 7.275124710202016, "learning_rate": 2.2312384159307897e-06, "loss": 1.0332, "num_input_tokens_seen": 5844992, "step": 1427 }, { "epoch": 1.0640834575260805, "grad_norm": 7.16961294092526, "learning_rate": 2.230875745015941e-06, "loss": 0.7653, "num_input_tokens_seen": 5849088, "step": 1428 }, { "epoch": 1.0648286140089418, "grad_norm": 8.689459237196711, "learning_rate": 2.230512859086125e-06, "loss": 0.7781, "num_input_tokens_seen": 5853184, "step": 1429 }, { "epoch": 1.0655737704918034, "grad_norm": 7.926341513976065, "learning_rate": 2.2301497582208883e-06, "loss": 0.68, "num_input_tokens_seen": 5857280, "step": 1430 }, { "epoch": 1.0663189269746647, "grad_norm": 7.7796229503673375, "learning_rate": 2.229786442499826e-06, "loss": 0.647, "num_input_tokens_seen": 5861376, "step": 1431 }, { "epoch": 1.067064083457526, "grad_norm": 6.825822276456713, "learning_rate": 2.229422912002579e-06, "loss": 0.862, "num_input_tokens_seen": 5865472, "step": 1432 }, { "epoch": 1.0678092399403876, "grad_norm": 7.880655906476627, "learning_rate": 2.2290591668088363e-06, "loss": 0.6868, "num_input_tokens_seen": 5869568, "step": 1433 }, { "epoch": 1.068554396423249, "grad_norm": 8.10671941961462, "learning_rate": 2.228695206998333e-06, "loss": 0.5191, "num_input_tokens_seen": 5873664, "step": 1434 }, { "epoch": 1.0692995529061102, "grad_norm": 9.14688731639475, "learning_rate": 2.228331032650852e-06, "loss": 0.5623, "num_input_tokens_seen": 5877760, "step": 1435 }, { "epoch": 1.0700447093889718, "grad_norm": 8.472970655161646, "learning_rate": 2.2279666438462222e-06, "loss": 0.6721, "num_input_tokens_seen": 5881856, "step": 1436 }, { "epoch": 1.0707898658718331, "grad_norm": 8.619858345215299, "learning_rate": 2.227602040664321e-06, "loss": 0.5542, "num_input_tokens_seen": 5885952, "step": 1437 }, { "epoch": 1.0715350223546944, "grad_norm": 9.539802588837397, "learning_rate": 2.227237223185072e-06, "loss": 0.6836, "num_input_tokens_seen": 5890048, "step": 1438 }, { "epoch": 1.072280178837556, "grad_norm": 11.614127321257511, "learning_rate": 2.2268721914884456e-06, "loss": 0.7088, "num_input_tokens_seen": 5894144, "step": 1439 }, { "epoch": 1.0730253353204173, "grad_norm": 10.82915081758927, "learning_rate": 2.226506945654459e-06, "loss": 0.5817, "num_input_tokens_seen": 5898240, "step": 1440 }, { "epoch": 1.0737704918032787, "grad_norm": 12.243939920358777, "learning_rate": 2.2261414857631773e-06, "loss": 1.1852, "num_input_tokens_seen": 5902336, "step": 1441 }, { "epoch": 1.07451564828614, "grad_norm": 14.939257786331938, "learning_rate": 2.225775811894712e-06, "loss": 0.7594, "num_input_tokens_seen": 5906432, "step": 1442 }, { "epoch": 1.0752608047690015, "grad_norm": 7.813461944122728, "learning_rate": 2.2254099241292203e-06, "loss": 0.6414, "num_input_tokens_seen": 5910528, "step": 1443 }, { "epoch": 1.0760059612518629, "grad_norm": 10.207336447140355, "learning_rate": 2.225043822546909e-06, "loss": 0.5979, "num_input_tokens_seen": 5914624, "step": 1444 }, { "epoch": 1.0767511177347242, "grad_norm": 13.215763761015035, "learning_rate": 2.2246775072280284e-06, "loss": 0.8946, "num_input_tokens_seen": 5918720, "step": 1445 }, { "epoch": 1.0774962742175858, "grad_norm": 10.02042370495779, "learning_rate": 2.2243109782528794e-06, "loss": 0.975, "num_input_tokens_seen": 5922816, "step": 1446 }, { "epoch": 1.078241430700447, "grad_norm": 9.895331846243305, "learning_rate": 2.223944235701806e-06, "loss": 0.9852, "num_input_tokens_seen": 5926912, "step": 1447 }, { "epoch": 1.0789865871833084, "grad_norm": 7.254920922541859, "learning_rate": 2.223577279655202e-06, "loss": 0.8953, "num_input_tokens_seen": 5931008, "step": 1448 }, { "epoch": 1.07973174366617, "grad_norm": 8.245239428618971, "learning_rate": 2.223210110193506e-06, "loss": 1.0809, "num_input_tokens_seen": 5935104, "step": 1449 }, { "epoch": 1.0804769001490313, "grad_norm": 8.317216675132816, "learning_rate": 2.222842727397205e-06, "loss": 0.8595, "num_input_tokens_seen": 5939200, "step": 1450 }, { "epoch": 1.0812220566318926, "grad_norm": 8.454427007338458, "learning_rate": 2.2224751313468308e-06, "loss": 0.9901, "num_input_tokens_seen": 5943296, "step": 1451 }, { "epoch": 1.0819672131147542, "grad_norm": 7.3168976713479115, "learning_rate": 2.222107322122964e-06, "loss": 0.8226, "num_input_tokens_seen": 5947392, "step": 1452 }, { "epoch": 1.0827123695976155, "grad_norm": 8.171201746221318, "learning_rate": 2.2217392998062307e-06, "loss": 0.5297, "num_input_tokens_seen": 5951488, "step": 1453 }, { "epoch": 1.0834575260804769, "grad_norm": 8.23425448179384, "learning_rate": 2.2213710644773044e-06, "loss": 0.7703, "num_input_tokens_seen": 5955584, "step": 1454 }, { "epoch": 1.0842026825633384, "grad_norm": 8.207008834342204, "learning_rate": 2.221002616216904e-06, "loss": 0.9134, "num_input_tokens_seen": 5959680, "step": 1455 }, { "epoch": 1.0849478390461997, "grad_norm": 7.653867708612245, "learning_rate": 2.220633955105797e-06, "loss": 0.646, "num_input_tokens_seen": 5963776, "step": 1456 }, { "epoch": 1.085692995529061, "grad_norm": 10.024404877776895, "learning_rate": 2.2202650812247958e-06, "loss": 0.5858, "num_input_tokens_seen": 5967872, "step": 1457 }, { "epoch": 1.0864381520119224, "grad_norm": 7.8483947926099304, "learning_rate": 2.2198959946547605e-06, "loss": 0.842, "num_input_tokens_seen": 5971968, "step": 1458 }, { "epoch": 1.087183308494784, "grad_norm": 9.137623166166367, "learning_rate": 2.2195266954765975e-06, "loss": 0.7724, "num_input_tokens_seen": 5976064, "step": 1459 }, { "epoch": 1.0879284649776453, "grad_norm": 8.356347993710834, "learning_rate": 2.21915718377126e-06, "loss": 0.7467, "num_input_tokens_seen": 5980160, "step": 1460 }, { "epoch": 1.0886736214605066, "grad_norm": 7.836704973914551, "learning_rate": 2.218787459619747e-06, "loss": 0.7992, "num_input_tokens_seen": 5984256, "step": 1461 }, { "epoch": 1.0894187779433682, "grad_norm": 11.122058826473198, "learning_rate": 2.218417523103106e-06, "loss": 0.6548, "num_input_tokens_seen": 5988352, "step": 1462 }, { "epoch": 1.0901639344262295, "grad_norm": 9.548953263478444, "learning_rate": 2.218047374302428e-06, "loss": 0.8524, "num_input_tokens_seen": 5992448, "step": 1463 }, { "epoch": 1.0909090909090908, "grad_norm": 8.11416329471628, "learning_rate": 2.2176770132988535e-06, "loss": 0.7067, "num_input_tokens_seen": 5996544, "step": 1464 }, { "epoch": 1.0916542473919524, "grad_norm": 8.426053547450941, "learning_rate": 2.2173064401735672e-06, "loss": 0.8302, "num_input_tokens_seen": 6000640, "step": 1465 }, { "epoch": 1.0923994038748137, "grad_norm": 8.495107626572985, "learning_rate": 2.2169356550078026e-06, "loss": 0.7855, "num_input_tokens_seen": 6004736, "step": 1466 }, { "epoch": 1.093144560357675, "grad_norm": 9.982291210063739, "learning_rate": 2.216564657882838e-06, "loss": 0.8276, "num_input_tokens_seen": 6008832, "step": 1467 }, { "epoch": 1.0938897168405366, "grad_norm": 9.610111992226019, "learning_rate": 2.2161934488799978e-06, "loss": 0.8417, "num_input_tokens_seen": 6012928, "step": 1468 }, { "epoch": 1.094634873323398, "grad_norm": 10.05379758228981, "learning_rate": 2.215822028080655e-06, "loss": 0.6189, "num_input_tokens_seen": 6017024, "step": 1469 }, { "epoch": 1.0953800298062593, "grad_norm": 10.04865594225294, "learning_rate": 2.215450395566227e-06, "loss": 0.8005, "num_input_tokens_seen": 6021120, "step": 1470 }, { "epoch": 1.0961251862891208, "grad_norm": 8.517184496712353, "learning_rate": 2.2150785514181785e-06, "loss": 1.0426, "num_input_tokens_seen": 6025216, "step": 1471 }, { "epoch": 1.0968703427719821, "grad_norm": 14.88348596983682, "learning_rate": 2.2147064957180198e-06, "loss": 0.7231, "num_input_tokens_seen": 6029312, "step": 1472 }, { "epoch": 1.0976154992548435, "grad_norm": 9.808237570036175, "learning_rate": 2.2143342285473084e-06, "loss": 0.7686, "num_input_tokens_seen": 6033408, "step": 1473 }, { "epoch": 1.098360655737705, "grad_norm": 7.441880965377453, "learning_rate": 2.213961749987649e-06, "loss": 1.1421, "num_input_tokens_seen": 6037504, "step": 1474 }, { "epoch": 1.0991058122205664, "grad_norm": 8.498046696136962, "learning_rate": 2.2135890601206903e-06, "loss": 0.7509, "num_input_tokens_seen": 6041600, "step": 1475 }, { "epoch": 1.0998509687034277, "grad_norm": 8.545385454479778, "learning_rate": 2.213216159028129e-06, "loss": 0.7996, "num_input_tokens_seen": 6045696, "step": 1476 }, { "epoch": 1.100596125186289, "grad_norm": 9.17619971814809, "learning_rate": 2.212843046791707e-06, "loss": 0.7698, "num_input_tokens_seen": 6049792, "step": 1477 }, { "epoch": 1.1013412816691506, "grad_norm": 11.752956575316496, "learning_rate": 2.2124697234932145e-06, "loss": 0.8504, "num_input_tokens_seen": 6053888, "step": 1478 }, { "epoch": 1.102086438152012, "grad_norm": 9.009808408642602, "learning_rate": 2.212096189214486e-06, "loss": 1.182, "num_input_tokens_seen": 6057984, "step": 1479 }, { "epoch": 1.1028315946348732, "grad_norm": 8.371449242012654, "learning_rate": 2.211722444037403e-06, "loss": 0.9932, "num_input_tokens_seen": 6062080, "step": 1480 }, { "epoch": 1.1035767511177348, "grad_norm": 6.832294072282332, "learning_rate": 2.2113484880438924e-06, "loss": 0.8632, "num_input_tokens_seen": 6066176, "step": 1481 }, { "epoch": 1.1043219076005961, "grad_norm": 12.810556732960444, "learning_rate": 2.210974321315929e-06, "loss": 0.9739, "num_input_tokens_seen": 6070272, "step": 1482 }, { "epoch": 1.1050670640834575, "grad_norm": 7.57172393887436, "learning_rate": 2.2105999439355326e-06, "loss": 0.8261, "num_input_tokens_seen": 6074368, "step": 1483 }, { "epoch": 1.105812220566319, "grad_norm": 13.321119248330865, "learning_rate": 2.2102253559847688e-06, "loss": 0.7689, "num_input_tokens_seen": 6078464, "step": 1484 }, { "epoch": 1.1065573770491803, "grad_norm": 8.566209581805115, "learning_rate": 2.20985055754575e-06, "loss": 0.8814, "num_input_tokens_seen": 6082560, "step": 1485 }, { "epoch": 1.1073025335320417, "grad_norm": 7.846540863049064, "learning_rate": 2.209475548700636e-06, "loss": 0.6127, "num_input_tokens_seen": 6086656, "step": 1486 }, { "epoch": 1.1080476900149032, "grad_norm": 7.86885297157642, "learning_rate": 2.209100329531629e-06, "loss": 0.9052, "num_input_tokens_seen": 6090752, "step": 1487 }, { "epoch": 1.1087928464977646, "grad_norm": 8.32553899831655, "learning_rate": 2.208724900120983e-06, "loss": 1.0809, "num_input_tokens_seen": 6094848, "step": 1488 }, { "epoch": 1.1095380029806259, "grad_norm": 11.581313862048692, "learning_rate": 2.2083492605509916e-06, "loss": 0.9315, "num_input_tokens_seen": 6098944, "step": 1489 }, { "epoch": 1.1102831594634874, "grad_norm": 8.26125521768464, "learning_rate": 2.2079734109039995e-06, "loss": 0.6088, "num_input_tokens_seen": 6103040, "step": 1490 }, { "epoch": 1.1110283159463488, "grad_norm": 8.019653061299032, "learning_rate": 2.2075973512623956e-06, "loss": 0.8192, "num_input_tokens_seen": 6107136, "step": 1491 }, { "epoch": 1.11177347242921, "grad_norm": 9.293690280662126, "learning_rate": 2.2072210817086137e-06, "loss": 1.1068, "num_input_tokens_seen": 6111232, "step": 1492 }, { "epoch": 1.1125186289120714, "grad_norm": 9.065242116500167, "learning_rate": 2.206844602325136e-06, "loss": 0.8353, "num_input_tokens_seen": 6115328, "step": 1493 }, { "epoch": 1.113263785394933, "grad_norm": 7.70519960882646, "learning_rate": 2.206467913194489e-06, "loss": 0.8322, "num_input_tokens_seen": 6119424, "step": 1494 }, { "epoch": 1.1140089418777943, "grad_norm": 7.4160532166446895, "learning_rate": 2.2060910143992453e-06, "loss": 0.9355, "num_input_tokens_seen": 6123520, "step": 1495 }, { "epoch": 1.1147540983606556, "grad_norm": 7.384753124331813, "learning_rate": 2.2057139060220247e-06, "loss": 0.8158, "num_input_tokens_seen": 6127616, "step": 1496 }, { "epoch": 1.1154992548435172, "grad_norm": 8.559442829577506, "learning_rate": 2.2053365881454907e-06, "loss": 0.7862, "num_input_tokens_seen": 6131712, "step": 1497 }, { "epoch": 1.1162444113263785, "grad_norm": 12.822715291317174, "learning_rate": 2.204959060852356e-06, "loss": 0.6756, "num_input_tokens_seen": 6135808, "step": 1498 }, { "epoch": 1.1169895678092399, "grad_norm": 8.377461757923584, "learning_rate": 2.2045813242253748e-06, "loss": 0.7433, "num_input_tokens_seen": 6139904, "step": 1499 }, { "epoch": 1.1177347242921014, "grad_norm": 9.097386000591227, "learning_rate": 2.2042033783473523e-06, "loss": 0.6119, "num_input_tokens_seen": 6144000, "step": 1500 }, { "epoch": 1.1184798807749627, "grad_norm": 8.702042297994028, "learning_rate": 2.2038252233011347e-06, "loss": 1.0115, "num_input_tokens_seen": 6148096, "step": 1501 }, { "epoch": 1.119225037257824, "grad_norm": 11.165439987459791, "learning_rate": 2.2034468591696177e-06, "loss": 0.6371, "num_input_tokens_seen": 6152192, "step": 1502 }, { "epoch": 1.1199701937406856, "grad_norm": 13.669984296050975, "learning_rate": 2.2030682860357406e-06, "loss": 0.7969, "num_input_tokens_seen": 6156288, "step": 1503 }, { "epoch": 1.120715350223547, "grad_norm": 8.612175354952726, "learning_rate": 2.20268950398249e-06, "loss": 0.6474, "num_input_tokens_seen": 6160384, "step": 1504 }, { "epoch": 1.1214605067064083, "grad_norm": 7.3359304187347, "learning_rate": 2.202310513092897e-06, "loss": 0.8641, "num_input_tokens_seen": 6164480, "step": 1505 }, { "epoch": 1.1222056631892698, "grad_norm": 8.585992457053532, "learning_rate": 2.2019313134500396e-06, "loss": 0.7603, "num_input_tokens_seen": 6168576, "step": 1506 }, { "epoch": 1.1229508196721312, "grad_norm": 14.431619711734005, "learning_rate": 2.201551905137041e-06, "loss": 0.7949, "num_input_tokens_seen": 6172672, "step": 1507 }, { "epoch": 1.1236959761549925, "grad_norm": 8.521459990228335, "learning_rate": 2.20117228823707e-06, "loss": 0.8071, "num_input_tokens_seen": 6176768, "step": 1508 }, { "epoch": 1.124441132637854, "grad_norm": 7.315741956250795, "learning_rate": 2.2007924628333417e-06, "loss": 1.0614, "num_input_tokens_seen": 6180864, "step": 1509 }, { "epoch": 1.1251862891207154, "grad_norm": 8.216635491306445, "learning_rate": 2.200412429009116e-06, "loss": 0.653, "num_input_tokens_seen": 6184960, "step": 1510 }, { "epoch": 1.1259314456035767, "grad_norm": 8.651729544928209, "learning_rate": 2.2000321868476993e-06, "loss": 0.8789, "num_input_tokens_seen": 6189056, "step": 1511 }, { "epoch": 1.1266766020864383, "grad_norm": 9.17739255208959, "learning_rate": 2.1996517364324435e-06, "loss": 0.7755, "num_input_tokens_seen": 6193152, "step": 1512 }, { "epoch": 1.1274217585692996, "grad_norm": 9.573539026470447, "learning_rate": 2.1992710778467457e-06, "loss": 0.7414, "num_input_tokens_seen": 6197248, "step": 1513 }, { "epoch": 1.128166915052161, "grad_norm": 6.706952782815637, "learning_rate": 2.1988902111740496e-06, "loss": 1.0021, "num_input_tokens_seen": 6201344, "step": 1514 }, { "epoch": 1.1289120715350223, "grad_norm": 7.034008948270064, "learning_rate": 2.1985091364978435e-06, "loss": 0.8841, "num_input_tokens_seen": 6205440, "step": 1515 }, { "epoch": 1.1296572280178838, "grad_norm": 7.185937993074329, "learning_rate": 2.1981278539016614e-06, "loss": 0.5987, "num_input_tokens_seen": 6209536, "step": 1516 }, { "epoch": 1.1304023845007451, "grad_norm": 9.896070700048794, "learning_rate": 2.1977463634690838e-06, "loss": 0.7728, "num_input_tokens_seen": 6213632, "step": 1517 }, { "epoch": 1.1311475409836065, "grad_norm": 9.564616836467309, "learning_rate": 2.1973646652837357e-06, "loss": 0.6559, "num_input_tokens_seen": 6217728, "step": 1518 }, { "epoch": 1.131892697466468, "grad_norm": 8.33719713776463, "learning_rate": 2.196982759429288e-06, "loss": 0.8777, "num_input_tokens_seen": 6221824, "step": 1519 }, { "epoch": 1.1326378539493294, "grad_norm": 11.8375658163302, "learning_rate": 2.1966006459894577e-06, "loss": 0.8797, "num_input_tokens_seen": 6225920, "step": 1520 }, { "epoch": 1.1333830104321907, "grad_norm": 10.100037627100432, "learning_rate": 2.196218325048006e-06, "loss": 0.9191, "num_input_tokens_seen": 6230016, "step": 1521 }, { "epoch": 1.1341281669150522, "grad_norm": 11.525783855563695, "learning_rate": 2.1958357966887416e-06, "loss": 0.6395, "num_input_tokens_seen": 6234112, "step": 1522 }, { "epoch": 1.1348733233979136, "grad_norm": 14.699714702529295, "learning_rate": 2.195453060995516e-06, "loss": 0.8271, "num_input_tokens_seen": 6238208, "step": 1523 }, { "epoch": 1.135618479880775, "grad_norm": 7.920531469658895, "learning_rate": 2.1950701180522287e-06, "loss": 0.8018, "num_input_tokens_seen": 6242304, "step": 1524 }, { "epoch": 1.1363636363636362, "grad_norm": 9.255913867917059, "learning_rate": 2.1946869679428232e-06, "loss": 0.5855, "num_input_tokens_seen": 6246400, "step": 1525 }, { "epoch": 1.1371087928464978, "grad_norm": 8.764695519562814, "learning_rate": 2.1943036107512882e-06, "loss": 0.6912, "num_input_tokens_seen": 6250496, "step": 1526 }, { "epoch": 1.1378539493293591, "grad_norm": 8.127661556742446, "learning_rate": 2.193920046561659e-06, "loss": 0.6584, "num_input_tokens_seen": 6254592, "step": 1527 }, { "epoch": 1.1385991058122205, "grad_norm": 8.021179724058982, "learning_rate": 2.1935362754580155e-06, "loss": 0.8829, "num_input_tokens_seen": 6258688, "step": 1528 }, { "epoch": 1.139344262295082, "grad_norm": 8.595798250877538, "learning_rate": 2.193152297524483e-06, "loss": 0.6881, "num_input_tokens_seen": 6262784, "step": 1529 }, { "epoch": 1.1400894187779433, "grad_norm": 8.02886257457849, "learning_rate": 2.192768112845232e-06, "loss": 0.8213, "num_input_tokens_seen": 6266880, "step": 1530 }, { "epoch": 1.1408345752608047, "grad_norm": 7.723097090056169, "learning_rate": 2.192383721504479e-06, "loss": 0.8868, "num_input_tokens_seen": 6270976, "step": 1531 }, { "epoch": 1.1415797317436662, "grad_norm": 7.760417016877144, "learning_rate": 2.1919991235864846e-06, "loss": 1.097, "num_input_tokens_seen": 6275072, "step": 1532 }, { "epoch": 1.1423248882265276, "grad_norm": 7.65626275059276, "learning_rate": 2.1916143191755563e-06, "loss": 0.9098, "num_input_tokens_seen": 6279168, "step": 1533 }, { "epoch": 1.1430700447093889, "grad_norm": 7.97496448964596, "learning_rate": 2.1912293083560447e-06, "loss": 0.6419, "num_input_tokens_seen": 6283264, "step": 1534 }, { "epoch": 1.1438152011922504, "grad_norm": 7.132464427164378, "learning_rate": 2.1908440912123484e-06, "loss": 0.548, "num_input_tokens_seen": 6287360, "step": 1535 }, { "epoch": 1.1445603576751118, "grad_norm": 8.670691543063356, "learning_rate": 2.190458667828909e-06, "loss": 0.7824, "num_input_tokens_seen": 6291456, "step": 1536 }, { "epoch": 1.145305514157973, "grad_norm": 10.57447399675988, "learning_rate": 2.190073038290214e-06, "loss": 0.6707, "num_input_tokens_seen": 6295552, "step": 1537 }, { "epoch": 1.1460506706408347, "grad_norm": 7.760744375115854, "learning_rate": 2.1896872026807967e-06, "loss": 0.6804, "num_input_tokens_seen": 6299648, "step": 1538 }, { "epoch": 1.146795827123696, "grad_norm": 7.9572542940321025, "learning_rate": 2.189301161085234e-06, "loss": 0.6622, "num_input_tokens_seen": 6303744, "step": 1539 }, { "epoch": 1.1475409836065573, "grad_norm": 10.49268935259069, "learning_rate": 2.1889149135881497e-06, "loss": 0.4927, "num_input_tokens_seen": 6307840, "step": 1540 }, { "epoch": 1.1482861400894189, "grad_norm": 8.417474364559208, "learning_rate": 2.1885284602742123e-06, "loss": 0.8156, "num_input_tokens_seen": 6311936, "step": 1541 }, { "epoch": 1.1490312965722802, "grad_norm": 10.68465106502487, "learning_rate": 2.1881418012281345e-06, "loss": 1.002, "num_input_tokens_seen": 6316032, "step": 1542 }, { "epoch": 1.1497764530551415, "grad_norm": 8.202635180259607, "learning_rate": 2.187754936534675e-06, "loss": 1.1582, "num_input_tokens_seen": 6320128, "step": 1543 }, { "epoch": 1.150521609538003, "grad_norm": 10.330249798490438, "learning_rate": 2.187367866278637e-06, "loss": 0.8551, "num_input_tokens_seen": 6324224, "step": 1544 }, { "epoch": 1.1512667660208644, "grad_norm": 10.508072894692072, "learning_rate": 2.1869805905448697e-06, "loss": 0.6966, "num_input_tokens_seen": 6328320, "step": 1545 }, { "epoch": 1.1520119225037257, "grad_norm": 8.123376618933202, "learning_rate": 2.186593109418266e-06, "loss": 0.7812, "num_input_tokens_seen": 6332416, "step": 1546 }, { "epoch": 1.1527570789865873, "grad_norm": 10.96327684812965, "learning_rate": 2.1862054229837647e-06, "loss": 0.7356, "num_input_tokens_seen": 6336512, "step": 1547 }, { "epoch": 1.1535022354694486, "grad_norm": 6.820393372259299, "learning_rate": 2.1858175313263504e-06, "loss": 0.9918, "num_input_tokens_seen": 6340608, "step": 1548 }, { "epoch": 1.15424739195231, "grad_norm": 7.497459914285773, "learning_rate": 2.18542943453105e-06, "loss": 0.8448, "num_input_tokens_seen": 6344704, "step": 1549 }, { "epoch": 1.1549925484351713, "grad_norm": 7.786780603637529, "learning_rate": 2.185041132682938e-06, "loss": 0.8573, "num_input_tokens_seen": 6348800, "step": 1550 }, { "epoch": 1.1557377049180328, "grad_norm": 8.492351471479283, "learning_rate": 2.1846526258671335e-06, "loss": 0.8355, "num_input_tokens_seen": 6352896, "step": 1551 }, { "epoch": 1.1564828614008942, "grad_norm": 9.232917527498007, "learning_rate": 2.1842639141687986e-06, "loss": 0.9341, "num_input_tokens_seen": 6356992, "step": 1552 }, { "epoch": 1.1572280178837555, "grad_norm": 10.698538782563892, "learning_rate": 2.1838749976731432e-06, "loss": 0.6795, "num_input_tokens_seen": 6361088, "step": 1553 }, { "epoch": 1.157973174366617, "grad_norm": 8.637751731875658, "learning_rate": 2.18348587646542e-06, "loss": 0.9029, "num_input_tokens_seen": 6365184, "step": 1554 }, { "epoch": 1.1587183308494784, "grad_norm": 8.596612892617873, "learning_rate": 2.1830965506309263e-06, "loss": 0.7619, "num_input_tokens_seen": 6369280, "step": 1555 }, { "epoch": 1.1594634873323397, "grad_norm": 8.241566074575065, "learning_rate": 2.182707020255006e-06, "loss": 1.0137, "num_input_tokens_seen": 6373376, "step": 1556 }, { "epoch": 1.1602086438152013, "grad_norm": 11.237032931756001, "learning_rate": 2.182317285423047e-06, "loss": 0.9585, "num_input_tokens_seen": 6377472, "step": 1557 }, { "epoch": 1.1609538002980626, "grad_norm": 9.318156309175087, "learning_rate": 2.1819273462204815e-06, "loss": 0.8468, "num_input_tokens_seen": 6381568, "step": 1558 }, { "epoch": 1.161698956780924, "grad_norm": 9.162282207886626, "learning_rate": 2.1815372027327875e-06, "loss": 0.6678, "num_input_tokens_seen": 6385664, "step": 1559 }, { "epoch": 1.1624441132637853, "grad_norm": 8.083157516129882, "learning_rate": 2.181146855045486e-06, "loss": 1.0819, "num_input_tokens_seen": 6389760, "step": 1560 }, { "epoch": 1.1631892697466468, "grad_norm": 11.525323004708332, "learning_rate": 2.1807563032441457e-06, "loss": 0.5922, "num_input_tokens_seen": 6393856, "step": 1561 }, { "epoch": 1.1639344262295082, "grad_norm": 7.917533148980427, "learning_rate": 2.1803655474143774e-06, "loss": 0.8629, "num_input_tokens_seen": 6397952, "step": 1562 }, { "epoch": 1.1646795827123695, "grad_norm": 9.326105674813212, "learning_rate": 2.1799745876418376e-06, "loss": 1.0499, "num_input_tokens_seen": 6402048, "step": 1563 }, { "epoch": 1.165424739195231, "grad_norm": 8.45273757506192, "learning_rate": 2.1795834240122276e-06, "loss": 0.4961, "num_input_tokens_seen": 6406144, "step": 1564 }, { "epoch": 1.1661698956780924, "grad_norm": 8.495689975964689, "learning_rate": 2.1791920566112933e-06, "loss": 0.9088, "num_input_tokens_seen": 6410240, "step": 1565 }, { "epoch": 1.1669150521609537, "grad_norm": 9.318259305562885, "learning_rate": 2.178800485524825e-06, "loss": 0.5755, "num_input_tokens_seen": 6414336, "step": 1566 }, { "epoch": 1.1676602086438153, "grad_norm": 7.66941501107661, "learning_rate": 2.178408710838658e-06, "loss": 0.3341, "num_input_tokens_seen": 6418432, "step": 1567 }, { "epoch": 1.1684053651266766, "grad_norm": 11.204471657281426, "learning_rate": 2.1780167326386724e-06, "loss": 0.8375, "num_input_tokens_seen": 6422528, "step": 1568 }, { "epoch": 1.169150521609538, "grad_norm": 6.677614485235771, "learning_rate": 2.1776245510107924e-06, "loss": 0.8837, "num_input_tokens_seen": 6426624, "step": 1569 }, { "epoch": 1.1698956780923995, "grad_norm": 10.445605057605212, "learning_rate": 2.1772321660409868e-06, "loss": 0.9372, "num_input_tokens_seen": 6430720, "step": 1570 }, { "epoch": 1.1706408345752608, "grad_norm": 7.688967417244842, "learning_rate": 2.1768395778152696e-06, "loss": 0.5798, "num_input_tokens_seen": 6434816, "step": 1571 }, { "epoch": 1.1713859910581221, "grad_norm": 7.299754716790883, "learning_rate": 2.1764467864196986e-06, "loss": 0.8164, "num_input_tokens_seen": 6438912, "step": 1572 }, { "epoch": 1.1721311475409837, "grad_norm": 8.173919389125517, "learning_rate": 2.1760537919403764e-06, "loss": 0.7502, "num_input_tokens_seen": 6443008, "step": 1573 }, { "epoch": 1.172876304023845, "grad_norm": 8.27930868179828, "learning_rate": 2.175660594463451e-06, "loss": 0.6984, "num_input_tokens_seen": 6447104, "step": 1574 }, { "epoch": 1.1736214605067063, "grad_norm": 10.002048189284606, "learning_rate": 2.1752671940751123e-06, "loss": 0.7538, "num_input_tokens_seen": 6451200, "step": 1575 }, { "epoch": 1.174366616989568, "grad_norm": 11.385578623662546, "learning_rate": 2.1748735908615988e-06, "loss": 0.6063, "num_input_tokens_seen": 6455296, "step": 1576 }, { "epoch": 1.1751117734724292, "grad_norm": 8.639816333931224, "learning_rate": 2.1744797849091895e-06, "loss": 0.8235, "num_input_tokens_seen": 6459392, "step": 1577 }, { "epoch": 1.1758569299552906, "grad_norm": 9.087946882622166, "learning_rate": 2.17408577630421e-06, "loss": 1.1454, "num_input_tokens_seen": 6463488, "step": 1578 }, { "epoch": 1.1766020864381521, "grad_norm": 9.303138579937293, "learning_rate": 2.1736915651330297e-06, "loss": 0.7536, "num_input_tokens_seen": 6467584, "step": 1579 }, { "epoch": 1.1773472429210134, "grad_norm": 9.701756388953324, "learning_rate": 2.173297151482063e-06, "loss": 0.6921, "num_input_tokens_seen": 6471680, "step": 1580 }, { "epoch": 1.1780923994038748, "grad_norm": 11.89883685855556, "learning_rate": 2.1729025354377673e-06, "loss": 0.6272, "num_input_tokens_seen": 6475776, "step": 1581 }, { "epoch": 1.1788375558867363, "grad_norm": 7.002589894016962, "learning_rate": 2.172507717086646e-06, "loss": 0.9309, "num_input_tokens_seen": 6479872, "step": 1582 }, { "epoch": 1.1795827123695977, "grad_norm": 8.535726376243307, "learning_rate": 2.1721126965152457e-06, "loss": 1.0797, "num_input_tokens_seen": 6483968, "step": 1583 }, { "epoch": 1.180327868852459, "grad_norm": 6.3910638547047265, "learning_rate": 2.171717473810158e-06, "loss": 0.4872, "num_input_tokens_seen": 6488064, "step": 1584 }, { "epoch": 1.1810730253353203, "grad_norm": 8.957750872426502, "learning_rate": 2.171322049058018e-06, "loss": 0.8957, "num_input_tokens_seen": 6492160, "step": 1585 }, { "epoch": 1.1818181818181819, "grad_norm": 8.200145574036174, "learning_rate": 2.170926422345506e-06, "loss": 0.9181, "num_input_tokens_seen": 6496256, "step": 1586 }, { "epoch": 1.1825633383010432, "grad_norm": 8.27123194146695, "learning_rate": 2.170530593759347e-06, "loss": 0.509, "num_input_tokens_seen": 6500352, "step": 1587 }, { "epoch": 1.1833084947839045, "grad_norm": 12.021166026153676, "learning_rate": 2.170134563386308e-06, "loss": 0.7087, "num_input_tokens_seen": 6504448, "step": 1588 }, { "epoch": 1.184053651266766, "grad_norm": 8.967720646394877, "learning_rate": 2.1697383313132027e-06, "loss": 0.6914, "num_input_tokens_seen": 6508544, "step": 1589 }, { "epoch": 1.1847988077496274, "grad_norm": 8.31828496764436, "learning_rate": 2.1693418976268874e-06, "loss": 0.9289, "num_input_tokens_seen": 6512640, "step": 1590 }, { "epoch": 1.1855439642324888, "grad_norm": 9.76302262840185, "learning_rate": 2.1689452624142632e-06, "loss": 0.7461, "num_input_tokens_seen": 6516736, "step": 1591 }, { "epoch": 1.1862891207153503, "grad_norm": 8.08738884711314, "learning_rate": 2.168548425762276e-06, "loss": 0.5903, "num_input_tokens_seen": 6520832, "step": 1592 }, { "epoch": 1.1870342771982116, "grad_norm": 11.86798933019287, "learning_rate": 2.168151387757915e-06, "loss": 0.9188, "num_input_tokens_seen": 6524928, "step": 1593 }, { "epoch": 1.187779433681073, "grad_norm": 17.07024503150389, "learning_rate": 2.167754148488213e-06, "loss": 0.842, "num_input_tokens_seen": 6529024, "step": 1594 }, { "epoch": 1.1885245901639343, "grad_norm": 10.515906205070909, "learning_rate": 2.167356708040249e-06, "loss": 0.9741, "num_input_tokens_seen": 6533120, "step": 1595 }, { "epoch": 1.1892697466467959, "grad_norm": 7.200844023803586, "learning_rate": 2.1669590665011438e-06, "loss": 0.9358, "num_input_tokens_seen": 6537216, "step": 1596 }, { "epoch": 1.1900149031296572, "grad_norm": 12.259751997558467, "learning_rate": 2.1665612239580635e-06, "loss": 0.7475, "num_input_tokens_seen": 6541312, "step": 1597 }, { "epoch": 1.1907600596125185, "grad_norm": 7.613293834223781, "learning_rate": 2.1661631804982184e-06, "loss": 0.719, "num_input_tokens_seen": 6545408, "step": 1598 }, { "epoch": 1.19150521609538, "grad_norm": 8.40771263183126, "learning_rate": 2.1657649362088618e-06, "loss": 0.7886, "num_input_tokens_seen": 6549504, "step": 1599 }, { "epoch": 1.1922503725782414, "grad_norm": 8.193220656137864, "learning_rate": 2.165366491177292e-06, "loss": 0.6116, "num_input_tokens_seen": 6553600, "step": 1600 }, { "epoch": 1.1929955290611027, "grad_norm": 8.295514694041486, "learning_rate": 2.164967845490851e-06, "loss": 0.9469, "num_input_tokens_seen": 6557696, "step": 1601 }, { "epoch": 1.1937406855439643, "grad_norm": 9.308508709024995, "learning_rate": 2.164568999236925e-06, "loss": 0.6396, "num_input_tokens_seen": 6561792, "step": 1602 }, { "epoch": 1.1944858420268256, "grad_norm": 7.954905356875922, "learning_rate": 2.1641699525029443e-06, "loss": 0.9009, "num_input_tokens_seen": 6565888, "step": 1603 }, { "epoch": 1.195230998509687, "grad_norm": 8.369385246249854, "learning_rate": 2.163770705376381e-06, "loss": 0.6602, "num_input_tokens_seen": 6569984, "step": 1604 }, { "epoch": 1.1959761549925485, "grad_norm": 11.189084651199034, "learning_rate": 2.1633712579447553e-06, "loss": 0.5451, "num_input_tokens_seen": 6574080, "step": 1605 }, { "epoch": 1.1967213114754098, "grad_norm": 9.549110103424045, "learning_rate": 2.1629716102956272e-06, "loss": 0.9573, "num_input_tokens_seen": 6578176, "step": 1606 }, { "epoch": 1.1974664679582712, "grad_norm": 7.931735986444886, "learning_rate": 2.162571762516603e-06, "loss": 0.8637, "num_input_tokens_seen": 6582272, "step": 1607 }, { "epoch": 1.1982116244411327, "grad_norm": 8.556729631635088, "learning_rate": 2.1621717146953322e-06, "loss": 0.7868, "num_input_tokens_seen": 6586368, "step": 1608 }, { "epoch": 1.198956780923994, "grad_norm": 8.415929292845792, "learning_rate": 2.1617714669195083e-06, "loss": 0.5366, "num_input_tokens_seen": 6590464, "step": 1609 }, { "epoch": 1.1997019374068554, "grad_norm": 6.8526327122025625, "learning_rate": 2.1613710192768677e-06, "loss": 0.6688, "num_input_tokens_seen": 6594560, "step": 1610 }, { "epoch": 1.200447093889717, "grad_norm": 10.836428432344185, "learning_rate": 2.1609703718551923e-06, "loss": 0.6758, "num_input_tokens_seen": 6598656, "step": 1611 }, { "epoch": 1.2011922503725783, "grad_norm": 7.472553651566683, "learning_rate": 2.1605695247423063e-06, "loss": 0.8504, "num_input_tokens_seen": 6602752, "step": 1612 }, { "epoch": 1.2019374068554396, "grad_norm": 8.690512752660652, "learning_rate": 2.1601684780260783e-06, "loss": 0.9736, "num_input_tokens_seen": 6606848, "step": 1613 }, { "epoch": 1.2026825633383011, "grad_norm": 8.82649696366452, "learning_rate": 2.159767231794421e-06, "loss": 0.8768, "num_input_tokens_seen": 6610944, "step": 1614 }, { "epoch": 1.2034277198211625, "grad_norm": 9.003190676024166, "learning_rate": 2.15936578613529e-06, "loss": 0.6426, "num_input_tokens_seen": 6615040, "step": 1615 }, { "epoch": 1.2041728763040238, "grad_norm": 9.083250681993334, "learning_rate": 2.1589641411366855e-06, "loss": 0.5461, "num_input_tokens_seen": 6619136, "step": 1616 }, { "epoch": 1.2049180327868854, "grad_norm": 10.791383307518425, "learning_rate": 2.1585622968866504e-06, "loss": 0.9702, "num_input_tokens_seen": 6623232, "step": 1617 }, { "epoch": 1.2056631892697467, "grad_norm": 8.218916748559513, "learning_rate": 2.158160253473272e-06, "loss": 1.0523, "num_input_tokens_seen": 6627328, "step": 1618 }, { "epoch": 1.206408345752608, "grad_norm": 8.8200376208298, "learning_rate": 2.157758010984682e-06, "loss": 0.8628, "num_input_tokens_seen": 6631424, "step": 1619 }, { "epoch": 1.2071535022354694, "grad_norm": 7.427456179058342, "learning_rate": 2.157355569509053e-06, "loss": 0.9243, "num_input_tokens_seen": 6635520, "step": 1620 }, { "epoch": 1.207898658718331, "grad_norm": 6.678537764226988, "learning_rate": 2.1569529291346046e-06, "loss": 0.8747, "num_input_tokens_seen": 6639616, "step": 1621 }, { "epoch": 1.2086438152011922, "grad_norm": 6.9769286303650695, "learning_rate": 2.156550089949598e-06, "loss": 0.9788, "num_input_tokens_seen": 6643712, "step": 1622 }, { "epoch": 1.2093889716840536, "grad_norm": 8.654485845110884, "learning_rate": 2.1561470520423377e-06, "loss": 1.0361, "num_input_tokens_seen": 6647808, "step": 1623 }, { "epoch": 1.2101341281669151, "grad_norm": 7.579170770503461, "learning_rate": 2.1557438155011735e-06, "loss": 0.7785, "num_input_tokens_seen": 6651904, "step": 1624 }, { "epoch": 1.2108792846497765, "grad_norm": 9.661161389798664, "learning_rate": 2.1553403804144976e-06, "loss": 0.8129, "num_input_tokens_seen": 6656000, "step": 1625 }, { "epoch": 1.2116244411326378, "grad_norm": 7.560360539087186, "learning_rate": 2.1549367468707456e-06, "loss": 0.9142, "num_input_tokens_seen": 6660096, "step": 1626 }, { "epoch": 1.2123695976154993, "grad_norm": 13.025655616958108, "learning_rate": 2.1545329149583966e-06, "loss": 0.6924, "num_input_tokens_seen": 6664192, "step": 1627 }, { "epoch": 1.2131147540983607, "grad_norm": 7.480910471398331, "learning_rate": 2.1541288847659736e-06, "loss": 0.8504, "num_input_tokens_seen": 6668288, "step": 1628 }, { "epoch": 1.213859910581222, "grad_norm": 9.17475723572352, "learning_rate": 2.153724656382043e-06, "loss": 0.7109, "num_input_tokens_seen": 6672384, "step": 1629 }, { "epoch": 1.2146050670640836, "grad_norm": 8.927799478341614, "learning_rate": 2.153320229895215e-06, "loss": 0.6959, "num_input_tokens_seen": 6676480, "step": 1630 }, { "epoch": 1.2153502235469449, "grad_norm": 7.804214919314973, "learning_rate": 2.1529156053941417e-06, "loss": 0.7278, "num_input_tokens_seen": 6680576, "step": 1631 }, { "epoch": 1.2160953800298062, "grad_norm": 7.942211616992183, "learning_rate": 2.152510782967521e-06, "loss": 0.7945, "num_input_tokens_seen": 6684672, "step": 1632 }, { "epoch": 1.2168405365126675, "grad_norm": 7.785603992688064, "learning_rate": 2.152105762704092e-06, "loss": 0.8687, "num_input_tokens_seen": 6688768, "step": 1633 }, { "epoch": 1.217585692995529, "grad_norm": 9.100358090766761, "learning_rate": 2.151700544692638e-06, "loss": 0.6837, "num_input_tokens_seen": 6692864, "step": 1634 }, { "epoch": 1.2183308494783904, "grad_norm": 7.891397318869083, "learning_rate": 2.1512951290219857e-06, "loss": 0.6959, "num_input_tokens_seen": 6696960, "step": 1635 }, { "epoch": 1.2190760059612518, "grad_norm": 12.161539949856532, "learning_rate": 2.1508895157810057e-06, "loss": 0.7076, "num_input_tokens_seen": 6701056, "step": 1636 }, { "epoch": 1.2198211624441133, "grad_norm": 7.243705642002197, "learning_rate": 2.1504837050586105e-06, "loss": 0.8535, "num_input_tokens_seen": 6705152, "step": 1637 }, { "epoch": 1.2205663189269746, "grad_norm": 9.074119343330366, "learning_rate": 2.1500776969437577e-06, "loss": 0.7408, "num_input_tokens_seen": 6709248, "step": 1638 }, { "epoch": 1.221311475409836, "grad_norm": 7.3616630592150525, "learning_rate": 2.149671491525446e-06, "loss": 0.5969, "num_input_tokens_seen": 6713344, "step": 1639 }, { "epoch": 1.2220566318926975, "grad_norm": 8.011212720250171, "learning_rate": 2.1492650888927193e-06, "loss": 0.7405, "num_input_tokens_seen": 6717440, "step": 1640 }, { "epoch": 1.2228017883755589, "grad_norm": 10.071367052707428, "learning_rate": 2.148858489134664e-06, "loss": 0.9589, "num_input_tokens_seen": 6721536, "step": 1641 }, { "epoch": 1.2235469448584202, "grad_norm": 9.22077651378028, "learning_rate": 2.1484516923404094e-06, "loss": 0.9518, "num_input_tokens_seen": 6725632, "step": 1642 }, { "epoch": 1.2242921013412817, "grad_norm": 9.233331246270351, "learning_rate": 2.148044698599128e-06, "loss": 1.0332, "num_input_tokens_seen": 6729728, "step": 1643 }, { "epoch": 1.225037257824143, "grad_norm": 7.769022039084347, "learning_rate": 2.1476375080000365e-06, "loss": 0.8794, "num_input_tokens_seen": 6733824, "step": 1644 }, { "epoch": 1.2257824143070044, "grad_norm": 7.702450452547642, "learning_rate": 2.147230120632393e-06, "loss": 0.7255, "num_input_tokens_seen": 6737920, "step": 1645 }, { "epoch": 1.226527570789866, "grad_norm": 8.679930290211091, "learning_rate": 2.146822536585501e-06, "loss": 0.7532, "num_input_tokens_seen": 6742016, "step": 1646 }, { "epoch": 1.2272727272727273, "grad_norm": 9.714064547145979, "learning_rate": 2.1464147559487046e-06, "loss": 0.5866, "num_input_tokens_seen": 6746112, "step": 1647 }, { "epoch": 1.2280178837555886, "grad_norm": 8.933466985402411, "learning_rate": 2.146006778811393e-06, "loss": 0.6818, "num_input_tokens_seen": 6750208, "step": 1648 }, { "epoch": 1.2287630402384502, "grad_norm": 6.816772113361643, "learning_rate": 2.145598605262997e-06, "loss": 0.6344, "num_input_tokens_seen": 6754304, "step": 1649 }, { "epoch": 1.2295081967213115, "grad_norm": 9.47995553739008, "learning_rate": 2.145190235392992e-06, "loss": 0.8675, "num_input_tokens_seen": 6758400, "step": 1650 }, { "epoch": 1.2302533532041728, "grad_norm": 9.051313314286535, "learning_rate": 2.144781669290895e-06, "loss": 0.7736, "num_input_tokens_seen": 6762496, "step": 1651 }, { "epoch": 1.2309985096870344, "grad_norm": 11.779896684818771, "learning_rate": 2.1443729070462665e-06, "loss": 0.8095, "num_input_tokens_seen": 6766592, "step": 1652 }, { "epoch": 1.2317436661698957, "grad_norm": 10.701059488797389, "learning_rate": 2.143963948748711e-06, "loss": 0.905, "num_input_tokens_seen": 6770688, "step": 1653 }, { "epoch": 1.232488822652757, "grad_norm": 7.729010407034054, "learning_rate": 2.143554794487874e-06, "loss": 0.8251, "num_input_tokens_seen": 6774784, "step": 1654 }, { "epoch": 1.2332339791356184, "grad_norm": 7.840560213464443, "learning_rate": 2.1431454443534456e-06, "loss": 1.0111, "num_input_tokens_seen": 6778880, "step": 1655 }, { "epoch": 1.23397913561848, "grad_norm": 9.788024034492993, "learning_rate": 2.1427358984351583e-06, "loss": 0.8582, "num_input_tokens_seen": 6782976, "step": 1656 }, { "epoch": 1.2347242921013413, "grad_norm": 6.929429392169119, "learning_rate": 2.1423261568227873e-06, "loss": 0.7202, "num_input_tokens_seen": 6787072, "step": 1657 }, { "epoch": 1.2354694485842026, "grad_norm": 8.174341037637209, "learning_rate": 2.1419162196061506e-06, "loss": 0.7817, "num_input_tokens_seen": 6791168, "step": 1658 }, { "epoch": 1.2362146050670642, "grad_norm": 9.5872108896686, "learning_rate": 2.1415060868751104e-06, "loss": 0.9006, "num_input_tokens_seen": 6795264, "step": 1659 }, { "epoch": 1.2369597615499255, "grad_norm": 7.539796429642222, "learning_rate": 2.1410957587195695e-06, "loss": 0.6204, "num_input_tokens_seen": 6799360, "step": 1660 }, { "epoch": 1.2377049180327868, "grad_norm": 8.95416600313891, "learning_rate": 2.1406852352294752e-06, "loss": 1.1719, "num_input_tokens_seen": 6803456, "step": 1661 }, { "epoch": 1.2384500745156484, "grad_norm": 7.721883557994124, "learning_rate": 2.1402745164948175e-06, "loss": 0.755, "num_input_tokens_seen": 6807552, "step": 1662 }, { "epoch": 1.2391952309985097, "grad_norm": 11.17539159908475, "learning_rate": 2.139863602605629e-06, "loss": 0.7038, "num_input_tokens_seen": 6811648, "step": 1663 }, { "epoch": 1.239940387481371, "grad_norm": 7.2598392189078655, "learning_rate": 2.139452493651984e-06, "loss": 0.8825, "num_input_tokens_seen": 6815744, "step": 1664 }, { "epoch": 1.2406855439642326, "grad_norm": 8.268135028484107, "learning_rate": 2.1390411897240014e-06, "loss": 0.8254, "num_input_tokens_seen": 6819840, "step": 1665 }, { "epoch": 1.241430700447094, "grad_norm": 11.17827868701084, "learning_rate": 2.1386296909118416e-06, "loss": 0.5245, "num_input_tokens_seen": 6823936, "step": 1666 }, { "epoch": 1.2421758569299552, "grad_norm": 8.316475032720414, "learning_rate": 2.1382179973057087e-06, "loss": 0.7573, "num_input_tokens_seen": 6828032, "step": 1667 }, { "epoch": 1.2429210134128166, "grad_norm": 8.423798428480648, "learning_rate": 2.1378061089958476e-06, "loss": 1.1089, "num_input_tokens_seen": 6832128, "step": 1668 }, { "epoch": 1.2436661698956781, "grad_norm": 8.728910770725209, "learning_rate": 2.1373940260725486e-06, "loss": 1.2109, "num_input_tokens_seen": 6836224, "step": 1669 }, { "epoch": 1.2444113263785395, "grad_norm": 8.151859625992682, "learning_rate": 2.136981748626142e-06, "loss": 0.9001, "num_input_tokens_seen": 6840320, "step": 1670 }, { "epoch": 1.2451564828614008, "grad_norm": 10.381352115719956, "learning_rate": 2.1365692767470025e-06, "loss": 0.7827, "num_input_tokens_seen": 6844416, "step": 1671 }, { "epoch": 1.2459016393442623, "grad_norm": 8.42870686332381, "learning_rate": 2.136156610525547e-06, "loss": 0.5272, "num_input_tokens_seen": 6848512, "step": 1672 }, { "epoch": 1.2466467958271237, "grad_norm": 8.114158650135607, "learning_rate": 2.135743750052235e-06, "loss": 0.7636, "num_input_tokens_seen": 6852608, "step": 1673 }, { "epoch": 1.247391952309985, "grad_norm": 7.774731788247447, "learning_rate": 2.135330695417568e-06, "loss": 0.8569, "num_input_tokens_seen": 6856704, "step": 1674 }, { "epoch": 1.2481371087928466, "grad_norm": 9.366528909072342, "learning_rate": 2.1349174467120905e-06, "loss": 0.775, "num_input_tokens_seen": 6860800, "step": 1675 }, { "epoch": 1.248882265275708, "grad_norm": 8.477777385244043, "learning_rate": 2.13450400402639e-06, "loss": 0.9679, "num_input_tokens_seen": 6864896, "step": 1676 }, { "epoch": 1.2496274217585692, "grad_norm": 9.5156529546, "learning_rate": 2.134090367451096e-06, "loss": 0.9434, "num_input_tokens_seen": 6868992, "step": 1677 }, { "epoch": 1.2503725782414308, "grad_norm": 7.977740555628138, "learning_rate": 2.1336765370768804e-06, "loss": 0.74, "num_input_tokens_seen": 6873088, "step": 1678 }, { "epoch": 1.251117734724292, "grad_norm": 8.560343597456379, "learning_rate": 2.1332625129944577e-06, "loss": 0.8013, "num_input_tokens_seen": 6877184, "step": 1679 }, { "epoch": 1.2518628912071534, "grad_norm": 8.213037637896702, "learning_rate": 2.1328482952945855e-06, "loss": 0.8076, "num_input_tokens_seen": 6881280, "step": 1680 }, { "epoch": 1.252608047690015, "grad_norm": 8.833771266573477, "learning_rate": 2.1324338840680627e-06, "loss": 0.7281, "num_input_tokens_seen": 6885376, "step": 1681 }, { "epoch": 1.2533532041728763, "grad_norm": 8.84717419173631, "learning_rate": 2.132019279405731e-06, "loss": 0.6707, "num_input_tokens_seen": 6889472, "step": 1682 }, { "epoch": 1.2540983606557377, "grad_norm": 7.776636647057546, "learning_rate": 2.1316044813984753e-06, "loss": 0.7168, "num_input_tokens_seen": 6893568, "step": 1683 }, { "epoch": 1.2548435171385992, "grad_norm": 9.923807091804745, "learning_rate": 2.131189490137222e-06, "loss": 0.7621, "num_input_tokens_seen": 6897664, "step": 1684 }, { "epoch": 1.2555886736214605, "grad_norm": 7.845738728126416, "learning_rate": 2.1307743057129403e-06, "loss": 0.889, "num_input_tokens_seen": 6901760, "step": 1685 }, { "epoch": 1.2563338301043219, "grad_norm": 10.525937523160414, "learning_rate": 2.130358928216642e-06, "loss": 0.7806, "num_input_tokens_seen": 6905856, "step": 1686 }, { "epoch": 1.2570789865871834, "grad_norm": 8.014943149949127, "learning_rate": 2.1299433577393795e-06, "loss": 0.8145, "num_input_tokens_seen": 6909952, "step": 1687 }, { "epoch": 1.2578241430700448, "grad_norm": 8.237382540551712, "learning_rate": 2.1295275943722503e-06, "loss": 1.0746, "num_input_tokens_seen": 6914048, "step": 1688 }, { "epoch": 1.258569299552906, "grad_norm": 9.407351308076102, "learning_rate": 2.1291116382063916e-06, "loss": 0.9881, "num_input_tokens_seen": 6918144, "step": 1689 }, { "epoch": 1.2593144560357676, "grad_norm": 11.072586134148043, "learning_rate": 2.128695489332984e-06, "loss": 0.6757, "num_input_tokens_seen": 6922240, "step": 1690 }, { "epoch": 1.260059612518629, "grad_norm": 9.181282824971493, "learning_rate": 2.1282791478432517e-06, "loss": 0.8283, "num_input_tokens_seen": 6926336, "step": 1691 }, { "epoch": 1.2608047690014903, "grad_norm": 9.007157024937522, "learning_rate": 2.127862613828458e-06, "loss": 0.7841, "num_input_tokens_seen": 6930432, "step": 1692 }, { "epoch": 1.2615499254843516, "grad_norm": 9.78475546276838, "learning_rate": 2.127445887379911e-06, "loss": 0.9732, "num_input_tokens_seen": 6934528, "step": 1693 }, { "epoch": 1.2622950819672132, "grad_norm": 6.8636215950981665, "learning_rate": 2.12702896858896e-06, "loss": 0.7823, "num_input_tokens_seen": 6938624, "step": 1694 }, { "epoch": 1.2630402384500745, "grad_norm": 7.301531681493745, "learning_rate": 2.1266118575469967e-06, "loss": 1.0662, "num_input_tokens_seen": 6942720, "step": 1695 }, { "epoch": 1.2637853949329358, "grad_norm": 7.335419209106375, "learning_rate": 2.1261945543454544e-06, "loss": 0.8921, "num_input_tokens_seen": 6946816, "step": 1696 }, { "epoch": 1.2645305514157974, "grad_norm": 7.345120186569054, "learning_rate": 2.1257770590758094e-06, "loss": 0.9767, "num_input_tokens_seen": 6950912, "step": 1697 }, { "epoch": 1.2652757078986587, "grad_norm": 12.123211890354591, "learning_rate": 2.125359371829579e-06, "loss": 0.8518, "num_input_tokens_seen": 6955008, "step": 1698 }, { "epoch": 1.26602086438152, "grad_norm": 7.86964782264189, "learning_rate": 2.1249414926983244e-06, "loss": 0.9527, "num_input_tokens_seen": 6959104, "step": 1699 }, { "epoch": 1.2667660208643814, "grad_norm": 8.260123499084301, "learning_rate": 2.1245234217736463e-06, "loss": 0.9239, "num_input_tokens_seen": 6963200, "step": 1700 }, { "epoch": 1.267511177347243, "grad_norm": 7.710393616139399, "learning_rate": 2.1241051591471897e-06, "loss": 0.6991, "num_input_tokens_seen": 6967296, "step": 1701 }, { "epoch": 1.2682563338301043, "grad_norm": 7.573981125653754, "learning_rate": 2.1236867049106406e-06, "loss": 0.8355, "num_input_tokens_seen": 6971392, "step": 1702 }, { "epoch": 1.2690014903129656, "grad_norm": 8.389154070652708, "learning_rate": 2.1232680591557274e-06, "loss": 0.8313, "num_input_tokens_seen": 6975488, "step": 1703 }, { "epoch": 1.2697466467958272, "grad_norm": 7.398497387271976, "learning_rate": 2.1228492219742196e-06, "loss": 0.9172, "num_input_tokens_seen": 6979584, "step": 1704 }, { "epoch": 1.2704918032786885, "grad_norm": 14.361489359206693, "learning_rate": 2.1224301934579292e-06, "loss": 0.77, "num_input_tokens_seen": 6983680, "step": 1705 }, { "epoch": 1.2712369597615498, "grad_norm": 9.858652031714014, "learning_rate": 2.1220109736987116e-06, "loss": 0.7985, "num_input_tokens_seen": 6987776, "step": 1706 }, { "epoch": 1.2719821162444114, "grad_norm": 8.470744586634819, "learning_rate": 2.1215915627884613e-06, "loss": 0.9528, "num_input_tokens_seen": 6991872, "step": 1707 }, { "epoch": 1.2727272727272727, "grad_norm": 8.904252795044764, "learning_rate": 2.1211719608191173e-06, "loss": 0.7585, "num_input_tokens_seen": 6995968, "step": 1708 }, { "epoch": 1.273472429210134, "grad_norm": 7.81185144152058, "learning_rate": 2.120752167882658e-06, "loss": 0.7919, "num_input_tokens_seen": 7000064, "step": 1709 }, { "epoch": 1.2742175856929956, "grad_norm": 9.1143548141078, "learning_rate": 2.120332184071106e-06, "loss": 0.6678, "num_input_tokens_seen": 7004160, "step": 1710 }, { "epoch": 1.274962742175857, "grad_norm": 7.848438470723808, "learning_rate": 2.1199120094765247e-06, "loss": 0.8287, "num_input_tokens_seen": 7008256, "step": 1711 }, { "epoch": 1.2757078986587183, "grad_norm": 9.440482911411715, "learning_rate": 2.119491644191019e-06, "loss": 0.6469, "num_input_tokens_seen": 7012352, "step": 1712 }, { "epoch": 1.2764530551415798, "grad_norm": 7.751716224716329, "learning_rate": 2.1190710883067364e-06, "loss": 0.6906, "num_input_tokens_seen": 7016448, "step": 1713 }, { "epoch": 1.2771982116244411, "grad_norm": 8.605452624654301, "learning_rate": 2.1186503419158657e-06, "loss": 0.7775, "num_input_tokens_seen": 7020544, "step": 1714 }, { "epoch": 1.2779433681073025, "grad_norm": 8.046136632699712, "learning_rate": 2.1182294051106366e-06, "loss": 0.6812, "num_input_tokens_seen": 7024640, "step": 1715 }, { "epoch": 1.278688524590164, "grad_norm": 10.288033034906404, "learning_rate": 2.117808277983323e-06, "loss": 0.7998, "num_input_tokens_seen": 7028736, "step": 1716 }, { "epoch": 1.2794336810730254, "grad_norm": 7.726685214416242, "learning_rate": 2.1173869606262377e-06, "loss": 0.6957, "num_input_tokens_seen": 7032832, "step": 1717 }, { "epoch": 1.2801788375558867, "grad_norm": 8.973226101237715, "learning_rate": 2.1169654531317367e-06, "loss": 0.5105, "num_input_tokens_seen": 7036928, "step": 1718 }, { "epoch": 1.2809239940387482, "grad_norm": 8.572763110744644, "learning_rate": 2.116543755592218e-06, "loss": 0.8779, "num_input_tokens_seen": 7041024, "step": 1719 }, { "epoch": 1.2816691505216096, "grad_norm": 18.9332653185162, "learning_rate": 2.1161218681001206e-06, "loss": 0.9793, "num_input_tokens_seen": 7045120, "step": 1720 }, { "epoch": 1.282414307004471, "grad_norm": 8.35216878690974, "learning_rate": 2.115699790747925e-06, "loss": 0.6913, "num_input_tokens_seen": 7049216, "step": 1721 }, { "epoch": 1.2831594634873325, "grad_norm": 9.03134241373371, "learning_rate": 2.1152775236281535e-06, "loss": 0.6254, "num_input_tokens_seen": 7053312, "step": 1722 }, { "epoch": 1.2839046199701938, "grad_norm": 9.360091210757961, "learning_rate": 2.11485506683337e-06, "loss": 0.7801, "num_input_tokens_seen": 7057408, "step": 1723 }, { "epoch": 1.2846497764530551, "grad_norm": 9.085862492952158, "learning_rate": 2.1144324204561805e-06, "loss": 0.6497, "num_input_tokens_seen": 7061504, "step": 1724 }, { "epoch": 1.2853949329359167, "grad_norm": 10.176445591680082, "learning_rate": 2.1140095845892317e-06, "loss": 0.7983, "num_input_tokens_seen": 7065600, "step": 1725 }, { "epoch": 1.286140089418778, "grad_norm": 8.851501370427304, "learning_rate": 2.1135865593252124e-06, "loss": 0.9725, "num_input_tokens_seen": 7069696, "step": 1726 }, { "epoch": 1.2868852459016393, "grad_norm": 11.238384536564457, "learning_rate": 2.1131633447568527e-06, "loss": 0.4756, "num_input_tokens_seen": 7073792, "step": 1727 }, { "epoch": 1.2876304023845009, "grad_norm": 7.912415229824704, "learning_rate": 2.1127399409769243e-06, "loss": 0.7014, "num_input_tokens_seen": 7077888, "step": 1728 }, { "epoch": 1.2883755588673622, "grad_norm": 11.856506689089866, "learning_rate": 2.1123163480782405e-06, "loss": 0.7319, "num_input_tokens_seen": 7081984, "step": 1729 }, { "epoch": 1.2891207153502235, "grad_norm": 8.643836908321124, "learning_rate": 2.1118925661536556e-06, "loss": 0.79, "num_input_tokens_seen": 7086080, "step": 1730 }, { "epoch": 1.2898658718330849, "grad_norm": 9.543716394060132, "learning_rate": 2.1114685952960657e-06, "loss": 0.7949, "num_input_tokens_seen": 7090176, "step": 1731 }, { "epoch": 1.2906110283159464, "grad_norm": 9.115820414558918, "learning_rate": 2.111044435598408e-06, "loss": 0.7241, "num_input_tokens_seen": 7094272, "step": 1732 }, { "epoch": 1.2913561847988078, "grad_norm": 7.357994360159699, "learning_rate": 2.110620087153662e-06, "loss": 0.6321, "num_input_tokens_seen": 7098368, "step": 1733 }, { "epoch": 1.292101341281669, "grad_norm": 12.101725882158823, "learning_rate": 2.110195550054848e-06, "loss": 0.6636, "num_input_tokens_seen": 7102464, "step": 1734 }, { "epoch": 1.2928464977645304, "grad_norm": 14.986036593863213, "learning_rate": 2.109770824395026e-06, "loss": 0.7383, "num_input_tokens_seen": 7106560, "step": 1735 }, { "epoch": 1.293591654247392, "grad_norm": 8.07817504166668, "learning_rate": 2.1093459102673005e-06, "loss": 0.8434, "num_input_tokens_seen": 7110656, "step": 1736 }, { "epoch": 1.2943368107302533, "grad_norm": 9.417550563385795, "learning_rate": 2.1089208077648153e-06, "loss": 0.5901, "num_input_tokens_seen": 7114752, "step": 1737 }, { "epoch": 1.2950819672131146, "grad_norm": 7.206528824588306, "learning_rate": 2.1084955169807554e-06, "loss": 0.9642, "num_input_tokens_seen": 7118848, "step": 1738 }, { "epoch": 1.2958271236959762, "grad_norm": 8.708423672887186, "learning_rate": 2.1080700380083485e-06, "loss": 0.9967, "num_input_tokens_seen": 7122944, "step": 1739 }, { "epoch": 1.2965722801788375, "grad_norm": 9.202577065241043, "learning_rate": 2.107644370940862e-06, "loss": 0.9752, "num_input_tokens_seen": 7127040, "step": 1740 }, { "epoch": 1.2973174366616989, "grad_norm": 7.240294724788961, "learning_rate": 2.107218515871605e-06, "loss": 0.9458, "num_input_tokens_seen": 7131136, "step": 1741 }, { "epoch": 1.2980625931445604, "grad_norm": 8.942106826941393, "learning_rate": 2.106792472893928e-06, "loss": 0.8479, "num_input_tokens_seen": 7135232, "step": 1742 }, { "epoch": 1.2988077496274217, "grad_norm": 9.549267487667576, "learning_rate": 2.1063662421012234e-06, "loss": 0.6524, "num_input_tokens_seen": 7139328, "step": 1743 }, { "epoch": 1.299552906110283, "grad_norm": 8.728021337289947, "learning_rate": 2.1059398235869233e-06, "loss": 0.8687, "num_input_tokens_seen": 7143424, "step": 1744 }, { "epoch": 1.3002980625931446, "grad_norm": 7.136810057381832, "learning_rate": 2.105513217444502e-06, "loss": 1.0127, "num_input_tokens_seen": 7147520, "step": 1745 }, { "epoch": 1.301043219076006, "grad_norm": 7.922875265328587, "learning_rate": 2.1050864237674745e-06, "loss": 0.7312, "num_input_tokens_seen": 7151616, "step": 1746 }, { "epoch": 1.3017883755588673, "grad_norm": 7.922472965102806, "learning_rate": 2.1046594426493973e-06, "loss": 0.6583, "num_input_tokens_seen": 7155712, "step": 1747 }, { "epoch": 1.3025335320417288, "grad_norm": 9.003209670355835, "learning_rate": 2.104232274183867e-06, "loss": 0.5958, "num_input_tokens_seen": 7159808, "step": 1748 }, { "epoch": 1.3032786885245902, "grad_norm": 9.47332824552297, "learning_rate": 2.1038049184645225e-06, "loss": 0.7951, "num_input_tokens_seen": 7163904, "step": 1749 }, { "epoch": 1.3040238450074515, "grad_norm": 8.714168225927958, "learning_rate": 2.1033773755850434e-06, "loss": 0.8481, "num_input_tokens_seen": 7168000, "step": 1750 }, { "epoch": 1.304769001490313, "grad_norm": 19.31055073096112, "learning_rate": 2.1029496456391497e-06, "loss": 0.9515, "num_input_tokens_seen": 7172096, "step": 1751 }, { "epoch": 1.3055141579731744, "grad_norm": 11.168468758906153, "learning_rate": 2.1025217287206034e-06, "loss": 0.6205, "num_input_tokens_seen": 7176192, "step": 1752 }, { "epoch": 1.3062593144560357, "grad_norm": 8.908283034963322, "learning_rate": 2.102093624923206e-06, "loss": 0.8949, "num_input_tokens_seen": 7180288, "step": 1753 }, { "epoch": 1.3070044709388973, "grad_norm": 10.800613383384079, "learning_rate": 2.1016653343408024e-06, "loss": 0.8827, "num_input_tokens_seen": 7184384, "step": 1754 }, { "epoch": 1.3077496274217586, "grad_norm": 9.003498263938983, "learning_rate": 2.1012368570672757e-06, "loss": 0.7502, "num_input_tokens_seen": 7188480, "step": 1755 }, { "epoch": 1.30849478390462, "grad_norm": 8.563236511187034, "learning_rate": 2.1008081931965513e-06, "loss": 0.896, "num_input_tokens_seen": 7192576, "step": 1756 }, { "epoch": 1.3092399403874815, "grad_norm": 9.479540327779766, "learning_rate": 2.1003793428225957e-06, "loss": 0.816, "num_input_tokens_seen": 7196672, "step": 1757 }, { "epoch": 1.3099850968703428, "grad_norm": 8.21189606400867, "learning_rate": 2.099950306039416e-06, "loss": 0.8379, "num_input_tokens_seen": 7200768, "step": 1758 }, { "epoch": 1.3107302533532041, "grad_norm": 9.036860222184446, "learning_rate": 2.09952108294106e-06, "loss": 0.6935, "num_input_tokens_seen": 7204864, "step": 1759 }, { "epoch": 1.3114754098360657, "grad_norm": 8.31367391388072, "learning_rate": 2.0990916736216164e-06, "loss": 0.7534, "num_input_tokens_seen": 7208960, "step": 1760 }, { "epoch": 1.312220566318927, "grad_norm": 8.27977396368019, "learning_rate": 2.098662078175215e-06, "loss": 0.9513, "num_input_tokens_seen": 7213056, "step": 1761 }, { "epoch": 1.3129657228017884, "grad_norm": 9.291346948798306, "learning_rate": 2.0982322966960264e-06, "loss": 0.556, "num_input_tokens_seen": 7217152, "step": 1762 }, { "epoch": 1.31371087928465, "grad_norm": 6.827733151581438, "learning_rate": 2.0978023292782613e-06, "loss": 0.6825, "num_input_tokens_seen": 7221248, "step": 1763 }, { "epoch": 1.3144560357675112, "grad_norm": 8.66875820029603, "learning_rate": 2.0973721760161714e-06, "loss": 0.5707, "num_input_tokens_seen": 7225344, "step": 1764 }, { "epoch": 1.3152011922503726, "grad_norm": 9.997480152427926, "learning_rate": 2.09694183700405e-06, "loss": 0.6204, "num_input_tokens_seen": 7229440, "step": 1765 }, { "epoch": 1.315946348733234, "grad_norm": 10.40726144206117, "learning_rate": 2.0965113123362303e-06, "loss": 0.6249, "num_input_tokens_seen": 7233536, "step": 1766 }, { "epoch": 1.3166915052160955, "grad_norm": 8.511373570495048, "learning_rate": 2.096080602107086e-06, "loss": 0.5253, "num_input_tokens_seen": 7237632, "step": 1767 }, { "epoch": 1.3174366616989568, "grad_norm": 11.138011494382667, "learning_rate": 2.095649706411032e-06, "loss": 0.5797, "num_input_tokens_seen": 7241728, "step": 1768 }, { "epoch": 1.3181818181818181, "grad_norm": 8.451019112685504, "learning_rate": 2.0952186253425244e-06, "loss": 1.0169, "num_input_tokens_seen": 7245824, "step": 1769 }, { "epoch": 1.3189269746646795, "grad_norm": 8.845129299765974, "learning_rate": 2.0947873589960586e-06, "loss": 0.8783, "num_input_tokens_seen": 7249920, "step": 1770 }, { "epoch": 1.319672131147541, "grad_norm": 8.790215690544178, "learning_rate": 2.0943559074661714e-06, "loss": 0.8094, "num_input_tokens_seen": 7254016, "step": 1771 }, { "epoch": 1.3204172876304023, "grad_norm": 8.920669745226775, "learning_rate": 2.09392427084744e-06, "loss": 1.0997, "num_input_tokens_seen": 7258112, "step": 1772 }, { "epoch": 1.3211624441132637, "grad_norm": 10.142572681172464, "learning_rate": 2.093492449234482e-06, "loss": 0.7451, "num_input_tokens_seen": 7262208, "step": 1773 }, { "epoch": 1.3219076005961252, "grad_norm": 7.533436155043117, "learning_rate": 2.0930604427219565e-06, "loss": 1.1393, "num_input_tokens_seen": 7266304, "step": 1774 }, { "epoch": 1.3226527570789866, "grad_norm": 7.685449371264109, "learning_rate": 2.0926282514045618e-06, "loss": 0.8294, "num_input_tokens_seen": 7270400, "step": 1775 }, { "epoch": 1.3233979135618479, "grad_norm": 7.934444313793877, "learning_rate": 2.0921958753770373e-06, "loss": 0.584, "num_input_tokens_seen": 7274496, "step": 1776 }, { "epoch": 1.3241430700447094, "grad_norm": 8.399833489867643, "learning_rate": 2.0917633147341634e-06, "loss": 0.7814, "num_input_tokens_seen": 7278592, "step": 1777 }, { "epoch": 1.3248882265275708, "grad_norm": 8.33695188906038, "learning_rate": 2.09133056957076e-06, "loss": 0.9582, "num_input_tokens_seen": 7282688, "step": 1778 }, { "epoch": 1.325633383010432, "grad_norm": 9.262902882901406, "learning_rate": 2.090897639981688e-06, "loss": 0.7912, "num_input_tokens_seen": 7286784, "step": 1779 }, { "epoch": 1.3263785394932937, "grad_norm": 8.48390626515685, "learning_rate": 2.090464526061849e-06, "loss": 0.873, "num_input_tokens_seen": 7290880, "step": 1780 }, { "epoch": 1.327123695976155, "grad_norm": 8.007138893596936, "learning_rate": 2.0900312279061844e-06, "loss": 0.6401, "num_input_tokens_seen": 7294976, "step": 1781 }, { "epoch": 1.3278688524590163, "grad_norm": 8.01435257237651, "learning_rate": 2.0895977456096766e-06, "loss": 0.7722, "num_input_tokens_seen": 7299072, "step": 1782 }, { "epoch": 1.3286140089418779, "grad_norm": 8.4661787653197, "learning_rate": 2.089164079267347e-06, "loss": 0.7975, "num_input_tokens_seen": 7303168, "step": 1783 }, { "epoch": 1.3293591654247392, "grad_norm": 7.5655707948910385, "learning_rate": 2.0887302289742594e-06, "loss": 0.8267, "num_input_tokens_seen": 7307264, "step": 1784 }, { "epoch": 1.3301043219076005, "grad_norm": 13.137471341203055, "learning_rate": 2.0882961948255166e-06, "loss": 0.8456, "num_input_tokens_seen": 7311360, "step": 1785 }, { "epoch": 1.330849478390462, "grad_norm": 7.303154286062542, "learning_rate": 2.087861976916262e-06, "loss": 0.9644, "num_input_tokens_seen": 7315456, "step": 1786 }, { "epoch": 1.3315946348733234, "grad_norm": 8.402789513886884, "learning_rate": 2.087427575341679e-06, "loss": 0.7009, "num_input_tokens_seen": 7319552, "step": 1787 }, { "epoch": 1.3323397913561847, "grad_norm": 10.297297149573609, "learning_rate": 2.0869929901969914e-06, "loss": 0.6899, "num_input_tokens_seen": 7323648, "step": 1788 }, { "epoch": 1.3330849478390463, "grad_norm": 10.998523275863421, "learning_rate": 2.0865582215774643e-06, "loss": 0.8562, "num_input_tokens_seen": 7327744, "step": 1789 }, { "epoch": 1.3338301043219076, "grad_norm": 9.017598593170364, "learning_rate": 2.0861232695784014e-06, "loss": 0.8324, "num_input_tokens_seen": 7331840, "step": 1790 }, { "epoch": 1.334575260804769, "grad_norm": 9.659479361681571, "learning_rate": 2.0856881342951467e-06, "loss": 0.8884, "num_input_tokens_seen": 7335936, "step": 1791 }, { "epoch": 1.3353204172876305, "grad_norm": 8.991333936101434, "learning_rate": 2.0852528158230863e-06, "loss": 0.7484, "num_input_tokens_seen": 7340032, "step": 1792 }, { "epoch": 1.3360655737704918, "grad_norm": 8.044652132600092, "learning_rate": 2.084817314257644e-06, "loss": 0.7244, "num_input_tokens_seen": 7344128, "step": 1793 }, { "epoch": 1.3368107302533532, "grad_norm": 9.133949565666986, "learning_rate": 2.084381629694286e-06, "loss": 0.7846, "num_input_tokens_seen": 7348224, "step": 1794 }, { "epoch": 1.3375558867362147, "grad_norm": 8.935991633586934, "learning_rate": 2.0839457622285158e-06, "loss": 1.0558, "num_input_tokens_seen": 7352320, "step": 1795 }, { "epoch": 1.338301043219076, "grad_norm": 9.40455788407042, "learning_rate": 2.0835097119558807e-06, "loss": 0.8539, "num_input_tokens_seen": 7356416, "step": 1796 }, { "epoch": 1.3390461997019374, "grad_norm": 8.980040317971163, "learning_rate": 2.083073478971964e-06, "loss": 0.6881, "num_input_tokens_seen": 7360512, "step": 1797 }, { "epoch": 1.339791356184799, "grad_norm": 21.31118293102572, "learning_rate": 2.0826370633723926e-06, "loss": 1.0265, "num_input_tokens_seen": 7364608, "step": 1798 }, { "epoch": 1.3405365126676603, "grad_norm": 7.862096452374398, "learning_rate": 2.082200465252831e-06, "loss": 0.8868, "num_input_tokens_seen": 7368704, "step": 1799 }, { "epoch": 1.3412816691505216, "grad_norm": 9.344441109463649, "learning_rate": 2.081763684708985e-06, "loss": 0.7198, "num_input_tokens_seen": 7372800, "step": 1800 }, { "epoch": 1.342026825633383, "grad_norm": 9.519971213606187, "learning_rate": 2.0813267218366e-06, "loss": 0.6231, "num_input_tokens_seen": 7376896, "step": 1801 }, { "epoch": 1.3427719821162445, "grad_norm": 11.2529838468248, "learning_rate": 2.0808895767314614e-06, "loss": 0.8079, "num_input_tokens_seen": 7380992, "step": 1802 }, { "epoch": 1.3435171385991058, "grad_norm": 9.04671916060878, "learning_rate": 2.0804522494893946e-06, "loss": 0.7523, "num_input_tokens_seen": 7385088, "step": 1803 }, { "epoch": 1.3442622950819672, "grad_norm": 9.329582699205016, "learning_rate": 2.080014740206265e-06, "loss": 0.6134, "num_input_tokens_seen": 7389184, "step": 1804 }, { "epoch": 1.3450074515648285, "grad_norm": 8.229622932927066, "learning_rate": 2.079577048977977e-06, "loss": 0.7481, "num_input_tokens_seen": 7393280, "step": 1805 }, { "epoch": 1.34575260804769, "grad_norm": 7.724406467160062, "learning_rate": 2.0791391759004765e-06, "loss": 0.6915, "num_input_tokens_seen": 7397376, "step": 1806 }, { "epoch": 1.3464977645305514, "grad_norm": 9.324273001265613, "learning_rate": 2.078701121069748e-06, "loss": 0.7923, "num_input_tokens_seen": 7401472, "step": 1807 }, { "epoch": 1.3472429210134127, "grad_norm": 8.336783830352617, "learning_rate": 2.078262884581816e-06, "loss": 0.9189, "num_input_tokens_seen": 7405568, "step": 1808 }, { "epoch": 1.3479880774962743, "grad_norm": 8.297952065330254, "learning_rate": 2.077824466532746e-06, "loss": 0.8063, "num_input_tokens_seen": 7409664, "step": 1809 }, { "epoch": 1.3487332339791356, "grad_norm": 9.794058897850332, "learning_rate": 2.077385867018641e-06, "loss": 0.7168, "num_input_tokens_seen": 7413760, "step": 1810 }, { "epoch": 1.349478390461997, "grad_norm": 8.365451901305029, "learning_rate": 2.0769470861356463e-06, "loss": 0.9537, "num_input_tokens_seen": 7417856, "step": 1811 }, { "epoch": 1.3502235469448585, "grad_norm": 9.749888147067265, "learning_rate": 2.076508123979945e-06, "loss": 0.7761, "num_input_tokens_seen": 7421952, "step": 1812 }, { "epoch": 1.3509687034277198, "grad_norm": 8.976149731023158, "learning_rate": 2.0760689806477617e-06, "loss": 0.9399, "num_input_tokens_seen": 7426048, "step": 1813 }, { "epoch": 1.3517138599105811, "grad_norm": 20.63668449369138, "learning_rate": 2.0756296562353588e-06, "loss": 0.7927, "num_input_tokens_seen": 7430144, "step": 1814 }, { "epoch": 1.3524590163934427, "grad_norm": 11.523428847721787, "learning_rate": 2.07519015083904e-06, "loss": 0.6576, "num_input_tokens_seen": 7434240, "step": 1815 }, { "epoch": 1.353204172876304, "grad_norm": 8.037217502394913, "learning_rate": 2.0747504645551474e-06, "loss": 0.674, "num_input_tokens_seen": 7438336, "step": 1816 }, { "epoch": 1.3539493293591653, "grad_norm": 8.754814963535365, "learning_rate": 2.0743105974800644e-06, "loss": 0.8149, "num_input_tokens_seen": 7442432, "step": 1817 }, { "epoch": 1.354694485842027, "grad_norm": 10.232316883232173, "learning_rate": 2.0738705497102117e-06, "loss": 0.5288, "num_input_tokens_seen": 7446528, "step": 1818 }, { "epoch": 1.3554396423248882, "grad_norm": 7.887829654582026, "learning_rate": 2.0734303213420515e-06, "loss": 0.8162, "num_input_tokens_seen": 7450624, "step": 1819 }, { "epoch": 1.3561847988077496, "grad_norm": 7.756515555338833, "learning_rate": 2.0729899124720855e-06, "loss": 0.8337, "num_input_tokens_seen": 7454720, "step": 1820 }, { "epoch": 1.3569299552906111, "grad_norm": 8.19342501060791, "learning_rate": 2.0725493231968534e-06, "loss": 1.0083, "num_input_tokens_seen": 7458816, "step": 1821 }, { "epoch": 1.3576751117734724, "grad_norm": 7.781198785392106, "learning_rate": 2.0721085536129364e-06, "loss": 0.8933, "num_input_tokens_seen": 7462912, "step": 1822 }, { "epoch": 1.3584202682563338, "grad_norm": 11.025467454896843, "learning_rate": 2.0716676038169543e-06, "loss": 0.7055, "num_input_tokens_seen": 7467008, "step": 1823 }, { "epoch": 1.3591654247391953, "grad_norm": 15.266112587765587, "learning_rate": 2.0712264739055663e-06, "loss": 0.8861, "num_input_tokens_seen": 7471104, "step": 1824 }, { "epoch": 1.3599105812220567, "grad_norm": 10.180959091252427, "learning_rate": 2.070785163975471e-06, "loss": 0.7286, "num_input_tokens_seen": 7475200, "step": 1825 }, { "epoch": 1.360655737704918, "grad_norm": 9.750167677559906, "learning_rate": 2.070343674123407e-06, "loss": 0.9796, "num_input_tokens_seen": 7479296, "step": 1826 }, { "epoch": 1.3614008941877795, "grad_norm": 7.553490113098455, "learning_rate": 2.069902004446151e-06, "loss": 0.9288, "num_input_tokens_seen": 7483392, "step": 1827 }, { "epoch": 1.3621460506706409, "grad_norm": 7.669349806633131, "learning_rate": 2.069460155040522e-06, "loss": 1.0561, "num_input_tokens_seen": 7487488, "step": 1828 }, { "epoch": 1.3628912071535022, "grad_norm": 8.02923571363827, "learning_rate": 2.069018126003375e-06, "loss": 0.7046, "num_input_tokens_seen": 7491584, "step": 1829 }, { "epoch": 1.3636363636363638, "grad_norm": 11.856135769505132, "learning_rate": 2.0685759174316067e-06, "loss": 0.952, "num_input_tokens_seen": 7495680, "step": 1830 }, { "epoch": 1.364381520119225, "grad_norm": 11.444971380279402, "learning_rate": 2.068133529422152e-06, "loss": 0.6904, "num_input_tokens_seen": 7499776, "step": 1831 }, { "epoch": 1.3651266766020864, "grad_norm": 8.611544277066681, "learning_rate": 2.0676909620719857e-06, "loss": 0.9648, "num_input_tokens_seen": 7503872, "step": 1832 }, { "epoch": 1.365871833084948, "grad_norm": 9.102446307749625, "learning_rate": 2.0672482154781217e-06, "loss": 0.9323, "num_input_tokens_seen": 7507968, "step": 1833 }, { "epoch": 1.3666169895678093, "grad_norm": 9.57441044199658, "learning_rate": 2.0668052897376127e-06, "loss": 0.7238, "num_input_tokens_seen": 7512064, "step": 1834 }, { "epoch": 1.3673621460506706, "grad_norm": 7.791369294668993, "learning_rate": 2.0663621849475523e-06, "loss": 0.8859, "num_input_tokens_seen": 7516160, "step": 1835 }, { "epoch": 1.368107302533532, "grad_norm": 13.453884505243078, "learning_rate": 2.0659189012050716e-06, "loss": 0.6447, "num_input_tokens_seen": 7520256, "step": 1836 }, { "epoch": 1.3688524590163935, "grad_norm": 7.805216573645159, "learning_rate": 2.0654754386073417e-06, "loss": 0.7783, "num_input_tokens_seen": 7524352, "step": 1837 }, { "epoch": 1.3695976154992549, "grad_norm": 16.286170483950986, "learning_rate": 2.0650317972515723e-06, "loss": 0.7708, "num_input_tokens_seen": 7528448, "step": 1838 }, { "epoch": 1.3703427719821162, "grad_norm": 7.2654080406053785, "learning_rate": 2.064587977235013e-06, "loss": 0.9122, "num_input_tokens_seen": 7532544, "step": 1839 }, { "epoch": 1.3710879284649775, "grad_norm": 7.833104073219139, "learning_rate": 2.064143978654953e-06, "loss": 0.961, "num_input_tokens_seen": 7536640, "step": 1840 }, { "epoch": 1.371833084947839, "grad_norm": 9.212473744080613, "learning_rate": 2.063699801608719e-06, "loss": 0.8772, "num_input_tokens_seen": 7540736, "step": 1841 }, { "epoch": 1.3725782414307004, "grad_norm": 9.515987461131353, "learning_rate": 2.0632554461936776e-06, "loss": 0.9022, "num_input_tokens_seen": 7544832, "step": 1842 }, { "epoch": 1.3733233979135617, "grad_norm": 8.000604861499006, "learning_rate": 2.062810912507236e-06, "loss": 0.7817, "num_input_tokens_seen": 7548928, "step": 1843 }, { "epoch": 1.3740685543964233, "grad_norm": 9.779403730100857, "learning_rate": 2.062366200646838e-06, "loss": 0.8723, "num_input_tokens_seen": 7553024, "step": 1844 }, { "epoch": 1.3748137108792846, "grad_norm": 7.395067032960343, "learning_rate": 2.0619213107099683e-06, "loss": 0.6757, "num_input_tokens_seen": 7557120, "step": 1845 }, { "epoch": 1.375558867362146, "grad_norm": 9.217562444174849, "learning_rate": 2.0614762427941494e-06, "loss": 0.9755, "num_input_tokens_seen": 7561216, "step": 1846 }, { "epoch": 1.3763040238450075, "grad_norm": 8.795528141801439, "learning_rate": 2.061030996996944e-06, "loss": 0.7674, "num_input_tokens_seen": 7565312, "step": 1847 }, { "epoch": 1.3770491803278688, "grad_norm": 8.778691572400351, "learning_rate": 2.0605855734159523e-06, "loss": 0.7393, "num_input_tokens_seen": 7569408, "step": 1848 }, { "epoch": 1.3777943368107302, "grad_norm": 9.9442875589647, "learning_rate": 2.0601399721488154e-06, "loss": 0.7606, "num_input_tokens_seen": 7573504, "step": 1849 }, { "epoch": 1.3785394932935917, "grad_norm": 9.386486311590383, "learning_rate": 2.0596941932932114e-06, "loss": 0.825, "num_input_tokens_seen": 7577600, "step": 1850 }, { "epoch": 1.379284649776453, "grad_norm": 13.717999627844272, "learning_rate": 2.059248236946858e-06, "loss": 0.8372, "num_input_tokens_seen": 7581696, "step": 1851 }, { "epoch": 1.3800298062593144, "grad_norm": 12.31430594294874, "learning_rate": 2.0588021032075127e-06, "loss": 0.6764, "num_input_tokens_seen": 7585792, "step": 1852 }, { "epoch": 1.380774962742176, "grad_norm": 8.064764494652483, "learning_rate": 2.058355792172971e-06, "loss": 0.8432, "num_input_tokens_seen": 7589888, "step": 1853 }, { "epoch": 1.3815201192250373, "grad_norm": 9.429845957623186, "learning_rate": 2.057909303941068e-06, "loss": 0.6539, "num_input_tokens_seen": 7593984, "step": 1854 }, { "epoch": 1.3822652757078986, "grad_norm": 8.171003117676506, "learning_rate": 2.0574626386096765e-06, "loss": 0.5617, "num_input_tokens_seen": 7598080, "step": 1855 }, { "epoch": 1.3830104321907601, "grad_norm": 9.520757186631892, "learning_rate": 2.0570157962767084e-06, "loss": 0.7351, "num_input_tokens_seen": 7602176, "step": 1856 }, { "epoch": 1.3837555886736215, "grad_norm": 15.635738126628674, "learning_rate": 2.0565687770401155e-06, "loss": 1.0504, "num_input_tokens_seen": 7606272, "step": 1857 }, { "epoch": 1.3845007451564828, "grad_norm": 11.723576505879844, "learning_rate": 2.0561215809978874e-06, "loss": 0.64, "num_input_tokens_seen": 7610368, "step": 1858 }, { "epoch": 1.3852459016393444, "grad_norm": 6.924535475909383, "learning_rate": 2.055674208248052e-06, "loss": 1.0681, "num_input_tokens_seen": 7614464, "step": 1859 }, { "epoch": 1.3859910581222057, "grad_norm": 10.888514374313232, "learning_rate": 2.0552266588886775e-06, "loss": 0.7455, "num_input_tokens_seen": 7618560, "step": 1860 }, { "epoch": 1.386736214605067, "grad_norm": 9.216960911728084, "learning_rate": 2.054778933017869e-06, "loss": 0.7302, "num_input_tokens_seen": 7622656, "step": 1861 }, { "epoch": 1.3874813710879286, "grad_norm": 9.557914831622686, "learning_rate": 2.0543310307337724e-06, "loss": 0.9751, "num_input_tokens_seen": 7626752, "step": 1862 }, { "epoch": 1.38822652757079, "grad_norm": 8.181104763694506, "learning_rate": 2.0538829521345706e-06, "loss": 0.9034, "num_input_tokens_seen": 7630848, "step": 1863 }, { "epoch": 1.3889716840536512, "grad_norm": 8.87390580673763, "learning_rate": 2.053434697318485e-06, "loss": 0.6213, "num_input_tokens_seen": 7634944, "step": 1864 }, { "epoch": 1.3897168405365128, "grad_norm": 24.499466827648305, "learning_rate": 2.0529862663837767e-06, "loss": 0.9241, "num_input_tokens_seen": 7639040, "step": 1865 }, { "epoch": 1.3904619970193741, "grad_norm": 8.297835327587267, "learning_rate": 2.052537659428745e-06, "loss": 0.779, "num_input_tokens_seen": 7643136, "step": 1866 }, { "epoch": 1.3912071535022354, "grad_norm": 8.684300474828936, "learning_rate": 2.0520888765517277e-06, "loss": 0.8896, "num_input_tokens_seen": 7647232, "step": 1867 }, { "epoch": 1.391952309985097, "grad_norm": 9.453646764159192, "learning_rate": 2.051639917851101e-06, "loss": 0.7807, "num_input_tokens_seen": 7651328, "step": 1868 }, { "epoch": 1.3926974664679583, "grad_norm": 11.314203080316691, "learning_rate": 2.0511907834252805e-06, "loss": 0.5771, "num_input_tokens_seen": 7655424, "step": 1869 }, { "epoch": 1.3934426229508197, "grad_norm": 8.476471674482436, "learning_rate": 2.050741473372719e-06, "loss": 0.8021, "num_input_tokens_seen": 7659520, "step": 1870 }, { "epoch": 1.394187779433681, "grad_norm": 8.578940712307139, "learning_rate": 2.0502919877919092e-06, "loss": 0.9464, "num_input_tokens_seen": 7663616, "step": 1871 }, { "epoch": 1.3949329359165425, "grad_norm": 8.082770398632872, "learning_rate": 2.0498423267813805e-06, "loss": 0.642, "num_input_tokens_seen": 7667712, "step": 1872 }, { "epoch": 1.3956780923994039, "grad_norm": 8.450045301544638, "learning_rate": 2.0493924904397027e-06, "loss": 0.8505, "num_input_tokens_seen": 7671808, "step": 1873 }, { "epoch": 1.3964232488822652, "grad_norm": 8.713862912975324, "learning_rate": 2.0489424788654827e-06, "loss": 0.8541, "num_input_tokens_seen": 7675904, "step": 1874 }, { "epoch": 1.3971684053651265, "grad_norm": 8.289633867840188, "learning_rate": 2.0484922921573664e-06, "loss": 0.9523, "num_input_tokens_seen": 7680000, "step": 1875 }, { "epoch": 1.397913561847988, "grad_norm": 8.292748911961468, "learning_rate": 2.048041930414038e-06, "loss": 0.9877, "num_input_tokens_seen": 7684096, "step": 1876 }, { "epoch": 1.3986587183308494, "grad_norm": 8.12099848618059, "learning_rate": 2.0475913937342197e-06, "loss": 0.7729, "num_input_tokens_seen": 7688192, "step": 1877 }, { "epoch": 1.3994038748137108, "grad_norm": 10.3178745750302, "learning_rate": 2.047140682216673e-06, "loss": 0.64, "num_input_tokens_seen": 7692288, "step": 1878 }, { "epoch": 1.4001490312965723, "grad_norm": 7.426193051670394, "learning_rate": 2.0466897959601963e-06, "loss": 0.6047, "num_input_tokens_seen": 7696384, "step": 1879 }, { "epoch": 1.4008941877794336, "grad_norm": 7.093087238705649, "learning_rate": 2.0462387350636274e-06, "loss": 0.7812, "num_input_tokens_seen": 7700480, "step": 1880 }, { "epoch": 1.401639344262295, "grad_norm": 7.962849937840654, "learning_rate": 2.0457874996258426e-06, "loss": 0.6199, "num_input_tokens_seen": 7704576, "step": 1881 }, { "epoch": 1.4023845007451565, "grad_norm": 15.224237590743881, "learning_rate": 2.045336089745755e-06, "loss": 0.6233, "num_input_tokens_seen": 7708672, "step": 1882 }, { "epoch": 1.4031296572280179, "grad_norm": 6.5878569705270085, "learning_rate": 2.0448845055223175e-06, "loss": 0.7564, "num_input_tokens_seen": 7712768, "step": 1883 }, { "epoch": 1.4038748137108792, "grad_norm": 7.343474671249659, "learning_rate": 2.0444327470545207e-06, "loss": 0.622, "num_input_tokens_seen": 7716864, "step": 1884 }, { "epoch": 1.4046199701937407, "grad_norm": 8.563620013125057, "learning_rate": 2.043980814441392e-06, "loss": 0.7925, "num_input_tokens_seen": 7720960, "step": 1885 }, { "epoch": 1.405365126676602, "grad_norm": 8.782866966449493, "learning_rate": 2.043528707782e-06, "loss": 0.7553, "num_input_tokens_seen": 7725056, "step": 1886 }, { "epoch": 1.4061102831594634, "grad_norm": 8.151443083619604, "learning_rate": 2.043076427175449e-06, "loss": 0.6119, "num_input_tokens_seen": 7729152, "step": 1887 }, { "epoch": 1.406855439642325, "grad_norm": 9.175811527811872, "learning_rate": 2.042623972720882e-06, "loss": 0.8733, "num_input_tokens_seen": 7733248, "step": 1888 }, { "epoch": 1.4076005961251863, "grad_norm": 8.761215601394742, "learning_rate": 2.0421713445174803e-06, "loss": 0.6438, "num_input_tokens_seen": 7737344, "step": 1889 }, { "epoch": 1.4083457526080476, "grad_norm": 7.836627397412094, "learning_rate": 2.041718542664463e-06, "loss": 0.7542, "num_input_tokens_seen": 7741440, "step": 1890 }, { "epoch": 1.4090909090909092, "grad_norm": 9.241971287834758, "learning_rate": 2.041265567261088e-06, "loss": 0.825, "num_input_tokens_seen": 7745536, "step": 1891 }, { "epoch": 1.4098360655737705, "grad_norm": 9.108980323706456, "learning_rate": 2.0408124184066507e-06, "loss": 0.699, "num_input_tokens_seen": 7749632, "step": 1892 }, { "epoch": 1.4105812220566318, "grad_norm": 8.74055488738711, "learning_rate": 2.0403590962004845e-06, "loss": 0.9673, "num_input_tokens_seen": 7753728, "step": 1893 }, { "epoch": 1.4113263785394934, "grad_norm": 13.1366268826796, "learning_rate": 2.0399056007419603e-06, "loss": 0.9729, "num_input_tokens_seen": 7757824, "step": 1894 }, { "epoch": 1.4120715350223547, "grad_norm": 10.174766791806526, "learning_rate": 2.039451932130488e-06, "loss": 0.6775, "num_input_tokens_seen": 7761920, "step": 1895 }, { "epoch": 1.412816691505216, "grad_norm": 8.37752839209166, "learning_rate": 2.0389980904655153e-06, "loss": 0.7267, "num_input_tokens_seen": 7766016, "step": 1896 }, { "epoch": 1.4135618479880776, "grad_norm": 8.03227878742459, "learning_rate": 2.038544075846527e-06, "loss": 0.5634, "num_input_tokens_seen": 7770112, "step": 1897 }, { "epoch": 1.414307004470939, "grad_norm": 8.874905370037968, "learning_rate": 2.0380898883730466e-06, "loss": 0.836, "num_input_tokens_seen": 7774208, "step": 1898 }, { "epoch": 1.4150521609538003, "grad_norm": 7.050962630943054, "learning_rate": 2.0376355281446354e-06, "loss": 0.6671, "num_input_tokens_seen": 7778304, "step": 1899 }, { "epoch": 1.4157973174366618, "grad_norm": 8.852397336624362, "learning_rate": 2.0371809952608916e-06, "loss": 0.4541, "num_input_tokens_seen": 7782400, "step": 1900 }, { "epoch": 1.4165424739195231, "grad_norm": 8.235121745309394, "learning_rate": 2.0367262898214533e-06, "loss": 0.7147, "num_input_tokens_seen": 7786496, "step": 1901 }, { "epoch": 1.4172876304023845, "grad_norm": 9.83120145464095, "learning_rate": 2.0362714119259943e-06, "loss": 0.7761, "num_input_tokens_seen": 7790592, "step": 1902 }, { "epoch": 1.418032786885246, "grad_norm": 8.168258966948294, "learning_rate": 2.0358163616742272e-06, "loss": 0.8911, "num_input_tokens_seen": 7794688, "step": 1903 }, { "epoch": 1.4187779433681074, "grad_norm": 7.723756210694702, "learning_rate": 2.035361139165903e-06, "loss": 0.7347, "num_input_tokens_seen": 7798784, "step": 1904 }, { "epoch": 1.4195230998509687, "grad_norm": 8.974828020923727, "learning_rate": 2.0349057445008085e-06, "loss": 0.7215, "num_input_tokens_seen": 7802880, "step": 1905 }, { "epoch": 1.42026825633383, "grad_norm": 7.257227259676293, "learning_rate": 2.0344501777787703e-06, "loss": 0.6971, "num_input_tokens_seen": 7806976, "step": 1906 }, { "epoch": 1.4210134128166916, "grad_norm": 10.196889663615783, "learning_rate": 2.0339944390996515e-06, "loss": 0.8762, "num_input_tokens_seen": 7811072, "step": 1907 }, { "epoch": 1.421758569299553, "grad_norm": 9.494763848791061, "learning_rate": 2.0335385285633537e-06, "loss": 0.5777, "num_input_tokens_seen": 7815168, "step": 1908 }, { "epoch": 1.4225037257824142, "grad_norm": 9.148971388230928, "learning_rate": 2.0330824462698153e-06, "loss": 0.7393, "num_input_tokens_seen": 7819264, "step": 1909 }, { "epoch": 1.4232488822652756, "grad_norm": 7.329974688517739, "learning_rate": 2.0326261923190133e-06, "loss": 1.0289, "num_input_tokens_seen": 7823360, "step": 1910 }, { "epoch": 1.4239940387481371, "grad_norm": 8.626776834078234, "learning_rate": 2.032169766810961e-06, "loss": 0.8112, "num_input_tokens_seen": 7827456, "step": 1911 }, { "epoch": 1.4247391952309985, "grad_norm": 7.43591989083736, "learning_rate": 2.0317131698457105e-06, "loss": 0.8357, "num_input_tokens_seen": 7831552, "step": 1912 }, { "epoch": 1.4254843517138598, "grad_norm": 8.205956596219895, "learning_rate": 2.0312564015233517e-06, "loss": 0.8064, "num_input_tokens_seen": 7835648, "step": 1913 }, { "epoch": 1.4262295081967213, "grad_norm": 7.657471144682773, "learning_rate": 2.0307994619440108e-06, "loss": 0.7143, "num_input_tokens_seen": 7839744, "step": 1914 }, { "epoch": 1.4269746646795827, "grad_norm": 10.31304878266363, "learning_rate": 2.0303423512078524e-06, "loss": 0.6194, "num_input_tokens_seen": 7843840, "step": 1915 }, { "epoch": 1.427719821162444, "grad_norm": 7.89916643217124, "learning_rate": 2.0298850694150785e-06, "loss": 0.7912, "num_input_tokens_seen": 7847936, "step": 1916 }, { "epoch": 1.4284649776453056, "grad_norm": 7.586076734795293, "learning_rate": 2.0294276166659286e-06, "loss": 0.7058, "num_input_tokens_seen": 7852032, "step": 1917 }, { "epoch": 1.4292101341281669, "grad_norm": 8.345570281305328, "learning_rate": 2.0289699930606796e-06, "loss": 0.8517, "num_input_tokens_seen": 7856128, "step": 1918 }, { "epoch": 1.4299552906110282, "grad_norm": 6.807127360157714, "learning_rate": 2.028512198699646e-06, "loss": 0.7135, "num_input_tokens_seen": 7860224, "step": 1919 }, { "epoch": 1.4307004470938898, "grad_norm": 10.434324619905908, "learning_rate": 2.0280542336831787e-06, "loss": 0.6601, "num_input_tokens_seen": 7864320, "step": 1920 }, { "epoch": 1.431445603576751, "grad_norm": 7.905740974869358, "learning_rate": 2.027596098111668e-06, "loss": 0.5379, "num_input_tokens_seen": 7868416, "step": 1921 }, { "epoch": 1.4321907600596124, "grad_norm": 7.863697199829122, "learning_rate": 2.02713779208554e-06, "loss": 0.9298, "num_input_tokens_seen": 7872512, "step": 1922 }, { "epoch": 1.432935916542474, "grad_norm": 9.912257385891106, "learning_rate": 2.0266793157052587e-06, "loss": 0.5326, "num_input_tokens_seen": 7876608, "step": 1923 }, { "epoch": 1.4336810730253353, "grad_norm": 8.877763431818021, "learning_rate": 2.0262206690713253e-06, "loss": 0.8675, "num_input_tokens_seen": 7880704, "step": 1924 }, { "epoch": 1.4344262295081966, "grad_norm": 10.179035740990118, "learning_rate": 2.0257618522842785e-06, "loss": 0.988, "num_input_tokens_seen": 7884800, "step": 1925 }, { "epoch": 1.4351713859910582, "grad_norm": 9.002716913775556, "learning_rate": 2.0253028654446944e-06, "loss": 0.867, "num_input_tokens_seen": 7888896, "step": 1926 }, { "epoch": 1.4359165424739195, "grad_norm": 9.615297369807395, "learning_rate": 2.0248437086531863e-06, "loss": 0.6626, "num_input_tokens_seen": 7892992, "step": 1927 }, { "epoch": 1.4366616989567809, "grad_norm": 9.17838478512578, "learning_rate": 2.024384382010404e-06, "loss": 0.68, "num_input_tokens_seen": 7897088, "step": 1928 }, { "epoch": 1.4374068554396424, "grad_norm": 9.266726603707783, "learning_rate": 2.0239248856170357e-06, "loss": 1.0023, "num_input_tokens_seen": 7901184, "step": 1929 }, { "epoch": 1.4381520119225037, "grad_norm": 8.51099308282797, "learning_rate": 2.023465219573806e-06, "loss": 0.5118, "num_input_tokens_seen": 7905280, "step": 1930 }, { "epoch": 1.438897168405365, "grad_norm": 7.523161522724961, "learning_rate": 2.023005383981477e-06, "loss": 0.6836, "num_input_tokens_seen": 7909376, "step": 1931 }, { "epoch": 1.4396423248882266, "grad_norm": 9.614438063794784, "learning_rate": 2.022545378940849e-06, "loss": 0.5789, "num_input_tokens_seen": 7913472, "step": 1932 }, { "epoch": 1.440387481371088, "grad_norm": 9.493873758949764, "learning_rate": 2.0220852045527565e-06, "loss": 0.7936, "num_input_tokens_seen": 7917568, "step": 1933 }, { "epoch": 1.4411326378539493, "grad_norm": 8.004739254385926, "learning_rate": 2.0216248609180743e-06, "loss": 0.7127, "num_input_tokens_seen": 7921664, "step": 1934 }, { "epoch": 1.4418777943368108, "grad_norm": 9.014799515527224, "learning_rate": 2.0211643481377124e-06, "loss": 0.8341, "num_input_tokens_seen": 7925760, "step": 1935 }, { "epoch": 1.4426229508196722, "grad_norm": 8.384462301237692, "learning_rate": 2.020703666312619e-06, "loss": 0.6438, "num_input_tokens_seen": 7929856, "step": 1936 }, { "epoch": 1.4433681073025335, "grad_norm": 7.667855546019243, "learning_rate": 2.020242815543779e-06, "loss": 0.8816, "num_input_tokens_seen": 7933952, "step": 1937 }, { "epoch": 1.444113263785395, "grad_norm": 9.923875085524983, "learning_rate": 2.0197817959322134e-06, "loss": 0.8635, "num_input_tokens_seen": 7938048, "step": 1938 }, { "epoch": 1.4448584202682564, "grad_norm": 13.387112415403486, "learning_rate": 2.019320607578982e-06, "loss": 1.0744, "num_input_tokens_seen": 7942144, "step": 1939 }, { "epoch": 1.4456035767511177, "grad_norm": 7.99919653773888, "learning_rate": 2.018859250585179e-06, "loss": 0.9076, "num_input_tokens_seen": 7946240, "step": 1940 }, { "epoch": 1.446348733233979, "grad_norm": 9.963378081374556, "learning_rate": 2.018397725051939e-06, "loss": 0.7192, "num_input_tokens_seen": 7950336, "step": 1941 }, { "epoch": 1.4470938897168406, "grad_norm": 8.827947612402, "learning_rate": 2.01793603108043e-06, "loss": 0.6166, "num_input_tokens_seen": 7954432, "step": 1942 }, { "epoch": 1.447839046199702, "grad_norm": 8.068625995283899, "learning_rate": 2.0174741687718597e-06, "loss": 0.9141, "num_input_tokens_seen": 7958528, "step": 1943 }, { "epoch": 1.4485842026825633, "grad_norm": 10.361630146314408, "learning_rate": 2.0170121382274717e-06, "loss": 0.9312, "num_input_tokens_seen": 7962624, "step": 1944 }, { "epoch": 1.4493293591654246, "grad_norm": 8.48599854566603, "learning_rate": 2.016549939548546e-06, "loss": 0.7388, "num_input_tokens_seen": 7966720, "step": 1945 }, { "epoch": 1.4500745156482862, "grad_norm": 7.836278435175169, "learning_rate": 2.0160875728363996e-06, "loss": 0.716, "num_input_tokens_seen": 7970816, "step": 1946 }, { "epoch": 1.4508196721311475, "grad_norm": 9.843432711669154, "learning_rate": 2.015625038192387e-06, "loss": 0.6628, "num_input_tokens_seen": 7974912, "step": 1947 }, { "epoch": 1.4515648286140088, "grad_norm": 9.23433287870354, "learning_rate": 2.015162335717899e-06, "loss": 0.7116, "num_input_tokens_seen": 7979008, "step": 1948 }, { "epoch": 1.4523099850968704, "grad_norm": 8.682555713605245, "learning_rate": 2.0146994655143633e-06, "loss": 0.6629, "num_input_tokens_seen": 7983104, "step": 1949 }, { "epoch": 1.4530551415797317, "grad_norm": 9.94035059342054, "learning_rate": 2.0142364276832445e-06, "loss": 0.6824, "num_input_tokens_seen": 7987200, "step": 1950 }, { "epoch": 1.453800298062593, "grad_norm": 8.866447672710514, "learning_rate": 2.013773222326043e-06, "loss": 0.5549, "num_input_tokens_seen": 7991296, "step": 1951 }, { "epoch": 1.4545454545454546, "grad_norm": 9.695885076880806, "learning_rate": 2.0133098495442978e-06, "loss": 0.938, "num_input_tokens_seen": 7995392, "step": 1952 }, { "epoch": 1.455290611028316, "grad_norm": 7.868593305779994, "learning_rate": 2.012846309439583e-06, "loss": 0.8168, "num_input_tokens_seen": 7999488, "step": 1953 }, { "epoch": 1.4560357675111772, "grad_norm": 8.520776214775168, "learning_rate": 2.0123826021135102e-06, "loss": 0.6229, "num_input_tokens_seen": 8003584, "step": 1954 }, { "epoch": 1.4567809239940388, "grad_norm": 8.343270859742631, "learning_rate": 2.0119187276677267e-06, "loss": 0.9812, "num_input_tokens_seen": 8007680, "step": 1955 }, { "epoch": 1.4575260804769001, "grad_norm": 8.192699590641958, "learning_rate": 2.0114546862039176e-06, "loss": 0.7793, "num_input_tokens_seen": 8011776, "step": 1956 }, { "epoch": 1.4582712369597615, "grad_norm": 9.088451566184142, "learning_rate": 2.010990477823804e-06, "loss": 0.9691, "num_input_tokens_seen": 8015872, "step": 1957 }, { "epoch": 1.459016393442623, "grad_norm": 9.098541886737141, "learning_rate": 2.010526102629144e-06, "loss": 0.961, "num_input_tokens_seen": 8019968, "step": 1958 }, { "epoch": 1.4597615499254843, "grad_norm": 7.019166888097006, "learning_rate": 2.0100615607217313e-06, "loss": 1.0833, "num_input_tokens_seen": 8024064, "step": 1959 }, { "epoch": 1.4605067064083457, "grad_norm": 10.65424070439475, "learning_rate": 2.009596852203398e-06, "loss": 0.6639, "num_input_tokens_seen": 8028160, "step": 1960 }, { "epoch": 1.4612518628912072, "grad_norm": 10.696569520588362, "learning_rate": 2.0091319771760094e-06, "loss": 0.6732, "num_input_tokens_seen": 8032256, "step": 1961 }, { "epoch": 1.4619970193740686, "grad_norm": 11.925416086280363, "learning_rate": 2.0086669357414714e-06, "loss": 1.0387, "num_input_tokens_seen": 8036352, "step": 1962 }, { "epoch": 1.46274217585693, "grad_norm": 8.919717835308077, "learning_rate": 2.0082017280017237e-06, "loss": 0.8233, "num_input_tokens_seen": 8040448, "step": 1963 }, { "epoch": 1.4634873323397914, "grad_norm": 10.693361321539099, "learning_rate": 2.007736354058743e-06, "loss": 0.7006, "num_input_tokens_seen": 8044544, "step": 1964 }, { "epoch": 1.4642324888226528, "grad_norm": 11.10916434006451, "learning_rate": 2.007270814014543e-06, "loss": 0.7369, "num_input_tokens_seen": 8048640, "step": 1965 }, { "epoch": 1.464977645305514, "grad_norm": 9.064037211729627, "learning_rate": 2.0068051079711732e-06, "loss": 0.7927, "num_input_tokens_seen": 8052736, "step": 1966 }, { "epoch": 1.4657228017883757, "grad_norm": 9.26599153706001, "learning_rate": 2.0063392360307195e-06, "loss": 0.8511, "num_input_tokens_seen": 8056832, "step": 1967 }, { "epoch": 1.466467958271237, "grad_norm": 7.450651637024586, "learning_rate": 2.0058731982953046e-06, "loss": 0.7079, "num_input_tokens_seen": 8060928, "step": 1968 }, { "epoch": 1.4672131147540983, "grad_norm": 10.388361377386126, "learning_rate": 2.005406994867087e-06, "loss": 0.6043, "num_input_tokens_seen": 8065024, "step": 1969 }, { "epoch": 1.4679582712369599, "grad_norm": 8.528254996212537, "learning_rate": 2.0049406258482624e-06, "loss": 0.83, "num_input_tokens_seen": 8069120, "step": 1970 }, { "epoch": 1.4687034277198212, "grad_norm": 7.750763769789148, "learning_rate": 2.0044740913410614e-06, "loss": 0.7314, "num_input_tokens_seen": 8073216, "step": 1971 }, { "epoch": 1.4694485842026825, "grad_norm": 12.016582740855196, "learning_rate": 2.0040073914477526e-06, "loss": 0.6839, "num_input_tokens_seen": 8077312, "step": 1972 }, { "epoch": 1.470193740685544, "grad_norm": 10.055731179927204, "learning_rate": 2.003540526270639e-06, "loss": 0.7545, "num_input_tokens_seen": 8081408, "step": 1973 }, { "epoch": 1.4709388971684054, "grad_norm": 9.393472311213866, "learning_rate": 2.003073495912062e-06, "loss": 1.0183, "num_input_tokens_seen": 8085504, "step": 1974 }, { "epoch": 1.4716840536512668, "grad_norm": 8.704082733337163, "learning_rate": 2.0026063004743966e-06, "loss": 0.6806, "num_input_tokens_seen": 8089600, "step": 1975 }, { "epoch": 1.472429210134128, "grad_norm": 8.250298272041542, "learning_rate": 2.0021389400600564e-06, "loss": 1.0329, "num_input_tokens_seen": 8093696, "step": 1976 }, { "epoch": 1.4731743666169896, "grad_norm": 9.201813185608284, "learning_rate": 2.0016714147714896e-06, "loss": 0.968, "num_input_tokens_seen": 8097792, "step": 1977 }, { "epoch": 1.473919523099851, "grad_norm": 14.456453398665378, "learning_rate": 2.0012037247111815e-06, "loss": 0.8327, "num_input_tokens_seen": 8101888, "step": 1978 }, { "epoch": 1.4746646795827123, "grad_norm": 10.234431267192063, "learning_rate": 2.0007358699816525e-06, "loss": 0.6685, "num_input_tokens_seen": 8105984, "step": 1979 }, { "epoch": 1.4754098360655736, "grad_norm": 10.857832795686845, "learning_rate": 2.0002678506854608e-06, "loss": 0.8311, "num_input_tokens_seen": 8110080, "step": 1980 }, { "epoch": 1.4761549925484352, "grad_norm": 11.574633748123945, "learning_rate": 1.9997996669251977e-06, "loss": 0.9739, "num_input_tokens_seen": 8114176, "step": 1981 }, { "epoch": 1.4769001490312965, "grad_norm": 9.967702114824887, "learning_rate": 1.9993313188034946e-06, "loss": 0.6609, "num_input_tokens_seen": 8118272, "step": 1982 }, { "epoch": 1.4776453055141578, "grad_norm": 8.880448475687091, "learning_rate": 1.9988628064230155e-06, "loss": 0.7391, "num_input_tokens_seen": 8122368, "step": 1983 }, { "epoch": 1.4783904619970194, "grad_norm": 6.665048952489725, "learning_rate": 1.998394129886461e-06, "loss": 0.6768, "num_input_tokens_seen": 8126464, "step": 1984 }, { "epoch": 1.4791356184798807, "grad_norm": 8.104387784855847, "learning_rate": 1.9979252892965705e-06, "loss": 0.9577, "num_input_tokens_seen": 8130560, "step": 1985 }, { "epoch": 1.479880774962742, "grad_norm": 8.77506899531447, "learning_rate": 1.997456284756115e-06, "loss": 0.6312, "num_input_tokens_seen": 8134656, "step": 1986 }, { "epoch": 1.4806259314456036, "grad_norm": 7.924530826354291, "learning_rate": 1.996987116367905e-06, "loss": 0.682, "num_input_tokens_seen": 8138752, "step": 1987 }, { "epoch": 1.481371087928465, "grad_norm": 7.211599651173624, "learning_rate": 1.996517784234785e-06, "loss": 0.7615, "num_input_tokens_seen": 8142848, "step": 1988 }, { "epoch": 1.4821162444113263, "grad_norm": 10.149826880998027, "learning_rate": 1.996048288459636e-06, "loss": 0.8965, "num_input_tokens_seen": 8146944, "step": 1989 }, { "epoch": 1.4828614008941878, "grad_norm": 7.323005258817422, "learning_rate": 1.9955786291453753e-06, "loss": 0.8896, "num_input_tokens_seen": 8151040, "step": 1990 }, { "epoch": 1.4836065573770492, "grad_norm": 10.464427429406637, "learning_rate": 1.9951088063949547e-06, "loss": 0.941, "num_input_tokens_seen": 8155136, "step": 1991 }, { "epoch": 1.4843517138599105, "grad_norm": 7.398511047412084, "learning_rate": 1.9946388203113635e-06, "loss": 0.9704, "num_input_tokens_seen": 8159232, "step": 1992 }, { "epoch": 1.485096870342772, "grad_norm": 7.6856396144147565, "learning_rate": 1.9941686709976256e-06, "loss": 0.8226, "num_input_tokens_seen": 8163328, "step": 1993 }, { "epoch": 1.4858420268256334, "grad_norm": 8.604342584969707, "learning_rate": 1.9936983585568008e-06, "loss": 0.5011, "num_input_tokens_seen": 8167424, "step": 1994 }, { "epoch": 1.4865871833084947, "grad_norm": 7.203343874221998, "learning_rate": 1.993227883091986e-06, "loss": 0.5904, "num_input_tokens_seen": 8171520, "step": 1995 }, { "epoch": 1.4873323397913563, "grad_norm": 8.213454663235968, "learning_rate": 1.9927572447063117e-06, "loss": 1.0457, "num_input_tokens_seen": 8175616, "step": 1996 }, { "epoch": 1.4880774962742176, "grad_norm": 7.340795201771691, "learning_rate": 1.9922864435029457e-06, "loss": 1.004, "num_input_tokens_seen": 8179712, "step": 1997 }, { "epoch": 1.488822652757079, "grad_norm": 8.369678989153055, "learning_rate": 1.991815479585091e-06, "loss": 0.6677, "num_input_tokens_seen": 8183808, "step": 1998 }, { "epoch": 1.4895678092399405, "grad_norm": 10.121713962475994, "learning_rate": 1.991344353055986e-06, "loss": 0.5498, "num_input_tokens_seen": 8187904, "step": 1999 }, { "epoch": 1.4903129657228018, "grad_norm": 7.271620893907643, "learning_rate": 1.9908730640189053e-06, "loss": 0.5411, "num_input_tokens_seen": 8192000, "step": 2000 }, { "epoch": 1.4910581222056631, "grad_norm": 17.833938139168556, "learning_rate": 1.9904016125771585e-06, "loss": 0.8725, "num_input_tokens_seen": 8196096, "step": 2001 }, { "epoch": 1.4918032786885247, "grad_norm": 7.951668525028, "learning_rate": 1.989929998834091e-06, "loss": 0.6239, "num_input_tokens_seen": 8200192, "step": 2002 }, { "epoch": 1.492548435171386, "grad_norm": 15.332231207159328, "learning_rate": 1.9894582228930845e-06, "loss": 0.8744, "num_input_tokens_seen": 8204288, "step": 2003 }, { "epoch": 1.4932935916542474, "grad_norm": 10.831772080294817, "learning_rate": 1.9889862848575554e-06, "loss": 0.7452, "num_input_tokens_seen": 8208384, "step": 2004 }, { "epoch": 1.494038748137109, "grad_norm": 8.603960893171209, "learning_rate": 1.9885141848309557e-06, "loss": 0.6563, "num_input_tokens_seen": 8212480, "step": 2005 }, { "epoch": 1.4947839046199702, "grad_norm": 9.850529509162715, "learning_rate": 1.9880419229167735e-06, "loss": 0.6544, "num_input_tokens_seen": 8216576, "step": 2006 }, { "epoch": 1.4955290611028316, "grad_norm": 8.72417532866282, "learning_rate": 1.9875694992185314e-06, "loss": 0.6499, "num_input_tokens_seen": 8220672, "step": 2007 }, { "epoch": 1.4962742175856931, "grad_norm": 8.574828984459478, "learning_rate": 1.9870969138397884e-06, "loss": 0.8445, "num_input_tokens_seen": 8224768, "step": 2008 }, { "epoch": 1.4970193740685545, "grad_norm": 9.398069137712143, "learning_rate": 1.9866241668841385e-06, "loss": 0.4635, "num_input_tokens_seen": 8228864, "step": 2009 }, { "epoch": 1.4977645305514158, "grad_norm": 9.032127714240488, "learning_rate": 1.9861512584552113e-06, "loss": 0.9121, "num_input_tokens_seen": 8232960, "step": 2010 }, { "epoch": 1.4985096870342771, "grad_norm": 7.8625365982140805, "learning_rate": 1.9856781886566716e-06, "loss": 0.7387, "num_input_tokens_seen": 8237056, "step": 2011 }, { "epoch": 1.4992548435171387, "grad_norm": 8.894167585340764, "learning_rate": 1.98520495759222e-06, "loss": 0.4775, "num_input_tokens_seen": 8241152, "step": 2012 }, { "epoch": 1.5, "grad_norm": 9.273061127680002, "learning_rate": 1.9847315653655916e-06, "loss": 0.8879, "num_input_tokens_seen": 8245248, "step": 2013 }, { "epoch": 1.5007451564828616, "grad_norm": 10.044215659887742, "learning_rate": 1.984258012080558e-06, "loss": 0.872, "num_input_tokens_seen": 8249344, "step": 2014 }, { "epoch": 1.5014903129657227, "grad_norm": 7.688066591661628, "learning_rate": 1.9837842978409242e-06, "loss": 0.9248, "num_input_tokens_seen": 8253440, "step": 2015 }, { "epoch": 1.5022354694485842, "grad_norm": 15.378506979424952, "learning_rate": 1.9833104227505335e-06, "loss": 0.6801, "num_input_tokens_seen": 8257536, "step": 2016 }, { "epoch": 1.5029806259314458, "grad_norm": 8.15371989225425, "learning_rate": 1.9828363869132616e-06, "loss": 1.0082, "num_input_tokens_seen": 8261632, "step": 2017 }, { "epoch": 1.5037257824143069, "grad_norm": 9.757763351504066, "learning_rate": 1.9823621904330207e-06, "loss": 0.813, "num_input_tokens_seen": 8265728, "step": 2018 }, { "epoch": 1.5044709388971684, "grad_norm": 8.741872623517553, "learning_rate": 1.981887833413758e-06, "loss": 0.7706, "num_input_tokens_seen": 8269824, "step": 2019 }, { "epoch": 1.5052160953800298, "grad_norm": 9.748755025409208, "learning_rate": 1.981413315959456e-06, "loss": 0.4791, "num_input_tokens_seen": 8273920, "step": 2020 }, { "epoch": 1.505961251862891, "grad_norm": 7.854091107178942, "learning_rate": 1.980938638174133e-06, "loss": 0.8152, "num_input_tokens_seen": 8278016, "step": 2021 }, { "epoch": 1.5067064083457526, "grad_norm": 14.555288412631164, "learning_rate": 1.9804638001618404e-06, "loss": 0.7121, "num_input_tokens_seen": 8282112, "step": 2022 }, { "epoch": 1.507451564828614, "grad_norm": 8.902816166232798, "learning_rate": 1.979988802026667e-06, "loss": 0.7817, "num_input_tokens_seen": 8286208, "step": 2023 }, { "epoch": 1.5081967213114753, "grad_norm": 10.65146988881764, "learning_rate": 1.9795136438727356e-06, "loss": 0.9041, "num_input_tokens_seen": 8290304, "step": 2024 }, { "epoch": 1.5089418777943369, "grad_norm": 6.648620255488169, "learning_rate": 1.979038325804205e-06, "loss": 0.9788, "num_input_tokens_seen": 8294400, "step": 2025 }, { "epoch": 1.5096870342771982, "grad_norm": 7.779762507417387, "learning_rate": 1.978562847925267e-06, "loss": 0.9794, "num_input_tokens_seen": 8298496, "step": 2026 }, { "epoch": 1.5104321907600595, "grad_norm": 8.726033812832362, "learning_rate": 1.9780872103401504e-06, "loss": 0.6799, "num_input_tokens_seen": 8302592, "step": 2027 }, { "epoch": 1.511177347242921, "grad_norm": 9.895054703732463, "learning_rate": 1.9776114131531183e-06, "loss": 0.5488, "num_input_tokens_seen": 8306688, "step": 2028 }, { "epoch": 1.5119225037257824, "grad_norm": 8.66164793770068, "learning_rate": 1.9771354564684685e-06, "loss": 1.0917, "num_input_tokens_seen": 8310784, "step": 2029 }, { "epoch": 1.5126676602086437, "grad_norm": 9.490914269006726, "learning_rate": 1.9766593403905347e-06, "loss": 0.5443, "num_input_tokens_seen": 8314880, "step": 2030 }, { "epoch": 1.5134128166915053, "grad_norm": 10.315293611368748, "learning_rate": 1.9761830650236844e-06, "loss": 0.7051, "num_input_tokens_seen": 8318976, "step": 2031 }, { "epoch": 1.5141579731743666, "grad_norm": 7.7473033442727415, "learning_rate": 1.9757066304723218e-06, "loss": 0.7641, "num_input_tokens_seen": 8323072, "step": 2032 }, { "epoch": 1.514903129657228, "grad_norm": 17.406737152692433, "learning_rate": 1.975230036840883e-06, "loss": 0.5923, "num_input_tokens_seen": 8327168, "step": 2033 }, { "epoch": 1.5156482861400895, "grad_norm": 8.030305948402612, "learning_rate": 1.9747532842338413e-06, "loss": 0.8794, "num_input_tokens_seen": 8331264, "step": 2034 }, { "epoch": 1.5163934426229508, "grad_norm": 6.807867076251768, "learning_rate": 1.9742763727557047e-06, "loss": 0.8095, "num_input_tokens_seen": 8335360, "step": 2035 }, { "epoch": 1.5171385991058122, "grad_norm": 11.030244014376256, "learning_rate": 1.9737993025110157e-06, "loss": 0.8827, "num_input_tokens_seen": 8339456, "step": 2036 }, { "epoch": 1.5178837555886737, "grad_norm": 9.029217605951152, "learning_rate": 1.9733220736043505e-06, "loss": 0.7378, "num_input_tokens_seen": 8343552, "step": 2037 }, { "epoch": 1.518628912071535, "grad_norm": 10.523784096119394, "learning_rate": 1.972844686140323e-06, "loss": 0.573, "num_input_tokens_seen": 8347648, "step": 2038 }, { "epoch": 1.5193740685543964, "grad_norm": 7.525166070358594, "learning_rate": 1.972367140223578e-06, "loss": 0.8084, "num_input_tokens_seen": 8351744, "step": 2039 }, { "epoch": 1.520119225037258, "grad_norm": 8.769608671758158, "learning_rate": 1.9718894359587977e-06, "loss": 0.7734, "num_input_tokens_seen": 8355840, "step": 2040 }, { "epoch": 1.5208643815201193, "grad_norm": 7.541414368714144, "learning_rate": 1.971411573450698e-06, "loss": 0.6374, "num_input_tokens_seen": 8359936, "step": 2041 }, { "epoch": 1.5216095380029806, "grad_norm": 8.942188020849471, "learning_rate": 1.9709335528040306e-06, "loss": 0.683, "num_input_tokens_seen": 8364032, "step": 2042 }, { "epoch": 1.5223546944858422, "grad_norm": 8.929945301914113, "learning_rate": 1.9704553741235797e-06, "loss": 0.7245, "num_input_tokens_seen": 8368128, "step": 2043 }, { "epoch": 1.5230998509687033, "grad_norm": 7.6626406332906996, "learning_rate": 1.969977037514167e-06, "loss": 0.8661, "num_input_tokens_seen": 8372224, "step": 2044 }, { "epoch": 1.5238450074515648, "grad_norm": 7.299532869472143, "learning_rate": 1.9694985430806467e-06, "loss": 0.8827, "num_input_tokens_seen": 8376320, "step": 2045 }, { "epoch": 1.5245901639344264, "grad_norm": 13.704625442360491, "learning_rate": 1.9690198909279077e-06, "loss": 0.7172, "num_input_tokens_seen": 8380416, "step": 2046 }, { "epoch": 1.5253353204172875, "grad_norm": 8.105714668569027, "learning_rate": 1.9685410811608746e-06, "loss": 0.8031, "num_input_tokens_seen": 8384512, "step": 2047 }, { "epoch": 1.526080476900149, "grad_norm": 8.505753677449322, "learning_rate": 1.9680621138845056e-06, "loss": 0.6411, "num_input_tokens_seen": 8388608, "step": 2048 }, { "epoch": 1.5268256333830106, "grad_norm": 13.003412054858408, "learning_rate": 1.9675829892037936e-06, "loss": 0.7383, "num_input_tokens_seen": 8392704, "step": 2049 }, { "epoch": 1.5275707898658717, "grad_norm": 8.149748978546219, "learning_rate": 1.9671037072237664e-06, "loss": 0.766, "num_input_tokens_seen": 8396800, "step": 2050 }, { "epoch": 1.5283159463487332, "grad_norm": 9.79012294104727, "learning_rate": 1.966624268049486e-06, "loss": 0.9248, "num_input_tokens_seen": 8400896, "step": 2051 }, { "epoch": 1.5290611028315948, "grad_norm": 7.801177131374099, "learning_rate": 1.9661446717860496e-06, "loss": 0.5944, "num_input_tokens_seen": 8404992, "step": 2052 }, { "epoch": 1.529806259314456, "grad_norm": 10.856917218903131, "learning_rate": 1.9656649185385864e-06, "loss": 0.7988, "num_input_tokens_seen": 8409088, "step": 2053 }, { "epoch": 1.5305514157973175, "grad_norm": 8.925418386608552, "learning_rate": 1.9651850084122633e-06, "loss": 0.5616, "num_input_tokens_seen": 8413184, "step": 2054 }, { "epoch": 1.5312965722801788, "grad_norm": 9.461861021670632, "learning_rate": 1.9647049415122792e-06, "loss": 0.9779, "num_input_tokens_seen": 8417280, "step": 2055 }, { "epoch": 1.5320417287630401, "grad_norm": 9.430331348279644, "learning_rate": 1.9642247179438682e-06, "loss": 0.7533, "num_input_tokens_seen": 8421376, "step": 2056 }, { "epoch": 1.5327868852459017, "grad_norm": 9.098741690420347, "learning_rate": 1.9637443378122997e-06, "loss": 0.6336, "num_input_tokens_seen": 8425472, "step": 2057 }, { "epoch": 1.533532041728763, "grad_norm": 8.397409820777561, "learning_rate": 1.9632638012228753e-06, "loss": 0.6061, "num_input_tokens_seen": 8429568, "step": 2058 }, { "epoch": 1.5342771982116243, "grad_norm": 7.619584958013396, "learning_rate": 1.962783108280933e-06, "loss": 0.9124, "num_input_tokens_seen": 8433664, "step": 2059 }, { "epoch": 1.535022354694486, "grad_norm": 8.53047326909768, "learning_rate": 1.9623022590918434e-06, "loss": 0.5938, "num_input_tokens_seen": 8437760, "step": 2060 }, { "epoch": 1.5357675111773472, "grad_norm": 8.50808628449463, "learning_rate": 1.961821253761012e-06, "loss": 0.7769, "num_input_tokens_seen": 8441856, "step": 2061 }, { "epoch": 1.5365126676602086, "grad_norm": 9.408962923822411, "learning_rate": 1.961340092393879e-06, "loss": 0.8616, "num_input_tokens_seen": 8445952, "step": 2062 }, { "epoch": 1.53725782414307, "grad_norm": 10.937191372132443, "learning_rate": 1.9608587750959186e-06, "loss": 0.7325, "num_input_tokens_seen": 8450048, "step": 2063 }, { "epoch": 1.5380029806259314, "grad_norm": 8.138605052113444, "learning_rate": 1.9603773019726387e-06, "loss": 0.9506, "num_input_tokens_seen": 8454144, "step": 2064 }, { "epoch": 1.5387481371087928, "grad_norm": 8.387450581853962, "learning_rate": 1.9598956731295816e-06, "loss": 0.5519, "num_input_tokens_seen": 8458240, "step": 2065 }, { "epoch": 1.5394932935916543, "grad_norm": 8.502761694823764, "learning_rate": 1.959413888672324e-06, "loss": 0.7527, "num_input_tokens_seen": 8462336, "step": 2066 }, { "epoch": 1.5402384500745157, "grad_norm": 7.758189166192837, "learning_rate": 1.9589319487064757e-06, "loss": 1.015, "num_input_tokens_seen": 8466432, "step": 2067 }, { "epoch": 1.540983606557377, "grad_norm": 6.9797012657313875, "learning_rate": 1.958449853337683e-06, "loss": 0.7763, "num_input_tokens_seen": 8470528, "step": 2068 }, { "epoch": 1.5417287630402385, "grad_norm": 8.869751327825693, "learning_rate": 1.9579676026716227e-06, "loss": 0.8995, "num_input_tokens_seen": 8474624, "step": 2069 }, { "epoch": 1.5424739195230999, "grad_norm": 8.511955972724463, "learning_rate": 1.9574851968140092e-06, "loss": 0.8029, "num_input_tokens_seen": 8478720, "step": 2070 }, { "epoch": 1.5432190760059612, "grad_norm": 10.120520509810067, "learning_rate": 1.957002635870588e-06, "loss": 0.7278, "num_input_tokens_seen": 8482816, "step": 2071 }, { "epoch": 1.5439642324888228, "grad_norm": 9.189320342442802, "learning_rate": 1.9565199199471414e-06, "loss": 0.3949, "num_input_tokens_seen": 8486912, "step": 2072 }, { "epoch": 1.544709388971684, "grad_norm": 10.463726422411678, "learning_rate": 1.9560370491494827e-06, "loss": 0.8169, "num_input_tokens_seen": 8491008, "step": 2073 }, { "epoch": 1.5454545454545454, "grad_norm": 7.512537365945123, "learning_rate": 1.955554023583462e-06, "loss": 0.6117, "num_input_tokens_seen": 8495104, "step": 2074 }, { "epoch": 1.546199701937407, "grad_norm": 9.357005248190703, "learning_rate": 1.9550708433549605e-06, "loss": 1.1341, "num_input_tokens_seen": 8499200, "step": 2075 }, { "epoch": 1.5469448584202683, "grad_norm": 10.57351880453605, "learning_rate": 1.9545875085698955e-06, "loss": 0.5256, "num_input_tokens_seen": 8503296, "step": 2076 }, { "epoch": 1.5476900149031296, "grad_norm": 9.009351808605425, "learning_rate": 1.954104019334218e-06, "loss": 0.6214, "num_input_tokens_seen": 8507392, "step": 2077 }, { "epoch": 1.5484351713859912, "grad_norm": 8.45512596395144, "learning_rate": 1.953620375753911e-06, "loss": 0.8861, "num_input_tokens_seen": 8511488, "step": 2078 }, { "epoch": 1.5491803278688525, "grad_norm": 9.345114958540861, "learning_rate": 1.953136577934994e-06, "loss": 0.7485, "num_input_tokens_seen": 8515584, "step": 2079 }, { "epoch": 1.5499254843517138, "grad_norm": 9.707435984524821, "learning_rate": 1.952652625983518e-06, "loss": 0.7887, "num_input_tokens_seen": 8519680, "step": 2080 }, { "epoch": 1.5506706408345754, "grad_norm": 9.176868788517295, "learning_rate": 1.9521685200055687e-06, "loss": 0.8405, "num_input_tokens_seen": 8523776, "step": 2081 }, { "epoch": 1.5514157973174365, "grad_norm": 7.692443483702514, "learning_rate": 1.951684260107266e-06, "loss": 0.7363, "num_input_tokens_seen": 8527872, "step": 2082 }, { "epoch": 1.552160953800298, "grad_norm": 8.330989585029034, "learning_rate": 1.9511998463947636e-06, "loss": 0.8393, "num_input_tokens_seen": 8531968, "step": 2083 }, { "epoch": 1.5529061102831596, "grad_norm": 8.380215529978797, "learning_rate": 1.950715278974247e-06, "loss": 0.5989, "num_input_tokens_seen": 8536064, "step": 2084 }, { "epoch": 1.5536512667660207, "grad_norm": 8.796619110641803, "learning_rate": 1.950230557951938e-06, "loss": 0.9128, "num_input_tokens_seen": 8540160, "step": 2085 }, { "epoch": 1.5543964232488823, "grad_norm": 9.291441904354505, "learning_rate": 1.9497456834340912e-06, "loss": 0.4605, "num_input_tokens_seen": 8544256, "step": 2086 }, { "epoch": 1.5551415797317438, "grad_norm": 10.41467196537089, "learning_rate": 1.9492606555269932e-06, "loss": 1.0317, "num_input_tokens_seen": 8548352, "step": 2087 }, { "epoch": 1.555886736214605, "grad_norm": 8.855327867561199, "learning_rate": 1.9487754743369667e-06, "loss": 0.6327, "num_input_tokens_seen": 8552448, "step": 2088 }, { "epoch": 1.5566318926974665, "grad_norm": 13.124416313677557, "learning_rate": 1.9482901399703664e-06, "loss": 0.4872, "num_input_tokens_seen": 8556544, "step": 2089 }, { "epoch": 1.5573770491803278, "grad_norm": 13.480032091196165, "learning_rate": 1.947804652533581e-06, "loss": 0.602, "num_input_tokens_seen": 8560640, "step": 2090 }, { "epoch": 1.5581222056631892, "grad_norm": 8.569605363522196, "learning_rate": 1.947319012133033e-06, "loss": 0.6024, "num_input_tokens_seen": 8564736, "step": 2091 }, { "epoch": 1.5588673621460507, "grad_norm": 10.232907274115782, "learning_rate": 1.9468332188751787e-06, "loss": 0.7804, "num_input_tokens_seen": 8568832, "step": 2092 }, { "epoch": 1.559612518628912, "grad_norm": 9.988661987993918, "learning_rate": 1.9463472728665065e-06, "loss": 0.7384, "num_input_tokens_seen": 8572928, "step": 2093 }, { "epoch": 1.5603576751117734, "grad_norm": 12.174482818356857, "learning_rate": 1.9458611742135402e-06, "loss": 0.5744, "num_input_tokens_seen": 8577024, "step": 2094 }, { "epoch": 1.561102831594635, "grad_norm": 7.539924912557428, "learning_rate": 1.9453749230228354e-06, "loss": 0.6146, "num_input_tokens_seen": 8581120, "step": 2095 }, { "epoch": 1.5618479880774963, "grad_norm": 9.329088953960154, "learning_rate": 1.944888519400982e-06, "loss": 1.0057, "num_input_tokens_seen": 8585216, "step": 2096 }, { "epoch": 1.5625931445603576, "grad_norm": 10.622960159332806, "learning_rate": 1.9444019634546034e-06, "loss": 0.6682, "num_input_tokens_seen": 8589312, "step": 2097 }, { "epoch": 1.5633383010432191, "grad_norm": 9.047767253140508, "learning_rate": 1.943915255290356e-06, "loss": 0.4519, "num_input_tokens_seen": 8593408, "step": 2098 }, { "epoch": 1.5640834575260805, "grad_norm": 8.937038107253601, "learning_rate": 1.943428395014929e-06, "loss": 0.8564, "num_input_tokens_seen": 8597504, "step": 2099 }, { "epoch": 1.5648286140089418, "grad_norm": 10.375447678550223, "learning_rate": 1.9429413827350468e-06, "loss": 0.6908, "num_input_tokens_seen": 8601600, "step": 2100 }, { "epoch": 1.5655737704918034, "grad_norm": 7.218981099182287, "learning_rate": 1.9424542185574656e-06, "loss": 0.827, "num_input_tokens_seen": 8605696, "step": 2101 }, { "epoch": 1.5663189269746647, "grad_norm": 8.57136425689753, "learning_rate": 1.941966902588975e-06, "loss": 0.4904, "num_input_tokens_seen": 8609792, "step": 2102 }, { "epoch": 1.567064083457526, "grad_norm": 6.761686225985415, "learning_rate": 1.9414794349363976e-06, "loss": 0.7761, "num_input_tokens_seen": 8613888, "step": 2103 }, { "epoch": 1.5678092399403876, "grad_norm": 11.078967157739267, "learning_rate": 1.940991815706591e-06, "loss": 0.9227, "num_input_tokens_seen": 8617984, "step": 2104 }, { "epoch": 1.568554396423249, "grad_norm": 7.212329774842022, "learning_rate": 1.9405040450064443e-06, "loss": 0.7205, "num_input_tokens_seen": 8622080, "step": 2105 }, { "epoch": 1.5692995529061102, "grad_norm": 8.059201186975788, "learning_rate": 1.9400161229428805e-06, "loss": 0.7615, "num_input_tokens_seen": 8626176, "step": 2106 }, { "epoch": 1.5700447093889718, "grad_norm": 8.725099032165149, "learning_rate": 1.9395280496228553e-06, "loss": 0.8497, "num_input_tokens_seen": 8630272, "step": 2107 }, { "epoch": 1.5707898658718331, "grad_norm": 10.376223566262963, "learning_rate": 1.9390398251533578e-06, "loss": 0.768, "num_input_tokens_seen": 8634368, "step": 2108 }, { "epoch": 1.5715350223546944, "grad_norm": 10.717324641112054, "learning_rate": 1.9385514496414105e-06, "loss": 0.7872, "num_input_tokens_seen": 8638464, "step": 2109 }, { "epoch": 1.572280178837556, "grad_norm": 10.219514714483424, "learning_rate": 1.9380629231940684e-06, "loss": 0.654, "num_input_tokens_seen": 8642560, "step": 2110 }, { "epoch": 1.5730253353204173, "grad_norm": 8.532288065367403, "learning_rate": 1.937574245918421e-06, "loss": 0.8486, "num_input_tokens_seen": 8646656, "step": 2111 }, { "epoch": 1.5737704918032787, "grad_norm": 8.541155904912266, "learning_rate": 1.9370854179215888e-06, "loss": 0.8393, "num_input_tokens_seen": 8650752, "step": 2112 }, { "epoch": 1.5745156482861402, "grad_norm": 7.822615432898432, "learning_rate": 1.9365964393107265e-06, "loss": 0.9178, "num_input_tokens_seen": 8654848, "step": 2113 }, { "epoch": 1.5752608047690015, "grad_norm": 9.5846560670992, "learning_rate": 1.9361073101930223e-06, "loss": 0.8375, "num_input_tokens_seen": 8658944, "step": 2114 }, { "epoch": 1.5760059612518629, "grad_norm": 8.161249611039766, "learning_rate": 1.9356180306756965e-06, "loss": 0.9243, "num_input_tokens_seen": 8663040, "step": 2115 }, { "epoch": 1.5767511177347244, "grad_norm": 7.809676626222226, "learning_rate": 1.9351286008660026e-06, "loss": 0.764, "num_input_tokens_seen": 8667136, "step": 2116 }, { "epoch": 1.5774962742175855, "grad_norm": 8.766928750157522, "learning_rate": 1.934639020871227e-06, "loss": 0.9405, "num_input_tokens_seen": 8671232, "step": 2117 }, { "epoch": 1.578241430700447, "grad_norm": 8.099835470654838, "learning_rate": 1.934149290798689e-06, "loss": 0.5942, "num_input_tokens_seen": 8675328, "step": 2118 }, { "epoch": 1.5789865871833086, "grad_norm": 8.795684634281479, "learning_rate": 1.933659410755742e-06, "loss": 1.039, "num_input_tokens_seen": 8679424, "step": 2119 }, { "epoch": 1.5797317436661698, "grad_norm": 8.617742020242673, "learning_rate": 1.93316938084977e-06, "loss": 0.5888, "num_input_tokens_seen": 8683520, "step": 2120 }, { "epoch": 1.5804769001490313, "grad_norm": 9.245425779064306, "learning_rate": 1.9326792011881912e-06, "loss": 0.7347, "num_input_tokens_seen": 8687616, "step": 2121 }, { "epoch": 1.5812220566318929, "grad_norm": 8.005321511737026, "learning_rate": 1.932188871878457e-06, "loss": 0.7622, "num_input_tokens_seen": 8691712, "step": 2122 }, { "epoch": 1.581967213114754, "grad_norm": 8.490837997047787, "learning_rate": 1.931698393028051e-06, "loss": 0.6037, "num_input_tokens_seen": 8695808, "step": 2123 }, { "epoch": 1.5827123695976155, "grad_norm": 7.823921948434276, "learning_rate": 1.9312077647444886e-06, "loss": 0.8124, "num_input_tokens_seen": 8699904, "step": 2124 }, { "epoch": 1.5834575260804769, "grad_norm": 8.083286701016794, "learning_rate": 1.9307169871353206e-06, "loss": 0.898, "num_input_tokens_seen": 8704000, "step": 2125 }, { "epoch": 1.5842026825633382, "grad_norm": 8.44902232390074, "learning_rate": 1.9302260603081284e-06, "loss": 0.9343, "num_input_tokens_seen": 8708096, "step": 2126 }, { "epoch": 1.5849478390461997, "grad_norm": 7.921199201169486, "learning_rate": 1.9297349843705267e-06, "loss": 0.8778, "num_input_tokens_seen": 8712192, "step": 2127 }, { "epoch": 1.585692995529061, "grad_norm": 12.054005358220296, "learning_rate": 1.929243759430162e-06, "loss": 0.8074, "num_input_tokens_seen": 8716288, "step": 2128 }, { "epoch": 1.5864381520119224, "grad_norm": 12.658575385109177, "learning_rate": 1.9287523855947154e-06, "loss": 0.6313, "num_input_tokens_seen": 8720384, "step": 2129 }, { "epoch": 1.587183308494784, "grad_norm": 7.347248393063659, "learning_rate": 1.928260862971899e-06, "loss": 0.8059, "num_input_tokens_seen": 8724480, "step": 2130 }, { "epoch": 1.5879284649776453, "grad_norm": 9.148832637337273, "learning_rate": 1.927769191669458e-06, "loss": 0.8395, "num_input_tokens_seen": 8728576, "step": 2131 }, { "epoch": 1.5886736214605066, "grad_norm": 7.526083234208958, "learning_rate": 1.9272773717951707e-06, "loss": 0.6044, "num_input_tokens_seen": 8732672, "step": 2132 }, { "epoch": 1.5894187779433682, "grad_norm": 7.545033213725763, "learning_rate": 1.9267854034568477e-06, "loss": 0.753, "num_input_tokens_seen": 8736768, "step": 2133 }, { "epoch": 1.5901639344262295, "grad_norm": 7.131275215528278, "learning_rate": 1.9262932867623314e-06, "loss": 0.9087, "num_input_tokens_seen": 8740864, "step": 2134 }, { "epoch": 1.5909090909090908, "grad_norm": 7.671253126199876, "learning_rate": 1.9258010218194972e-06, "loss": 0.822, "num_input_tokens_seen": 8744960, "step": 2135 }, { "epoch": 1.5916542473919524, "grad_norm": 9.3511433274637, "learning_rate": 1.9253086087362536e-06, "loss": 0.5674, "num_input_tokens_seen": 8749056, "step": 2136 }, { "epoch": 1.5923994038748137, "grad_norm": 7.788397755475116, "learning_rate": 1.9248160476205406e-06, "loss": 0.9132, "num_input_tokens_seen": 8753152, "step": 2137 }, { "epoch": 1.593144560357675, "grad_norm": 7.6200827932274064, "learning_rate": 1.9243233385803316e-06, "loss": 0.7409, "num_input_tokens_seen": 8757248, "step": 2138 }, { "epoch": 1.5938897168405366, "grad_norm": 9.146523355329482, "learning_rate": 1.9238304817236313e-06, "loss": 0.8176, "num_input_tokens_seen": 8761344, "step": 2139 }, { "epoch": 1.594634873323398, "grad_norm": 9.766644342759838, "learning_rate": 1.9233374771584784e-06, "loss": 0.721, "num_input_tokens_seen": 8765440, "step": 2140 }, { "epoch": 1.5953800298062593, "grad_norm": 8.722576295139161, "learning_rate": 1.922844324992942e-06, "loss": 0.9456, "num_input_tokens_seen": 8769536, "step": 2141 }, { "epoch": 1.5961251862891208, "grad_norm": 7.144098033442286, "learning_rate": 1.9223510253351253e-06, "loss": 0.8984, "num_input_tokens_seen": 8773632, "step": 2142 }, { "epoch": 1.5968703427719821, "grad_norm": 15.436454108518022, "learning_rate": 1.9218575782931626e-06, "loss": 0.993, "num_input_tokens_seen": 8777728, "step": 2143 }, { "epoch": 1.5976154992548435, "grad_norm": 7.994804645859062, "learning_rate": 1.9213639839752215e-06, "loss": 0.8204, "num_input_tokens_seen": 8781824, "step": 2144 }, { "epoch": 1.598360655737705, "grad_norm": 8.202521712089986, "learning_rate": 1.920870242489501e-06, "loss": 0.8726, "num_input_tokens_seen": 8785920, "step": 2145 }, { "epoch": 1.5991058122205664, "grad_norm": 8.024029103049388, "learning_rate": 1.920376353944233e-06, "loss": 0.7083, "num_input_tokens_seen": 8790016, "step": 2146 }, { "epoch": 1.5998509687034277, "grad_norm": 11.572469475893712, "learning_rate": 1.919882318447681e-06, "loss": 0.937, "num_input_tokens_seen": 8794112, "step": 2147 }, { "epoch": 1.6005961251862892, "grad_norm": 7.958663661739841, "learning_rate": 1.9193881361081415e-06, "loss": 0.8217, "num_input_tokens_seen": 8798208, "step": 2148 }, { "epoch": 1.6013412816691506, "grad_norm": 8.394390930855932, "learning_rate": 1.9188938070339428e-06, "loss": 0.7174, "num_input_tokens_seen": 8802304, "step": 2149 }, { "epoch": 1.602086438152012, "grad_norm": 10.65570915981855, "learning_rate": 1.9183993313334455e-06, "loss": 0.9726, "num_input_tokens_seen": 8806400, "step": 2150 }, { "epoch": 1.6028315946348735, "grad_norm": 7.858084937924269, "learning_rate": 1.9179047091150414e-06, "loss": 0.9086, "num_input_tokens_seen": 8810496, "step": 2151 }, { "epoch": 1.6035767511177346, "grad_norm": 9.179558018244752, "learning_rate": 1.917409940487156e-06, "loss": 0.9792, "num_input_tokens_seen": 8814592, "step": 2152 }, { "epoch": 1.6043219076005961, "grad_norm": 12.048135256181322, "learning_rate": 1.9169150255582464e-06, "loss": 0.5756, "num_input_tokens_seen": 8818688, "step": 2153 }, { "epoch": 1.6050670640834577, "grad_norm": 7.871979015666335, "learning_rate": 1.916419964436801e-06, "loss": 0.8135, "num_input_tokens_seen": 8822784, "step": 2154 }, { "epoch": 1.6058122205663188, "grad_norm": 9.72542844673163, "learning_rate": 1.91592475723134e-06, "loss": 0.7177, "num_input_tokens_seen": 8826880, "step": 2155 }, { "epoch": 1.6065573770491803, "grad_norm": 15.38370199052369, "learning_rate": 1.9154294040504174e-06, "loss": 0.8452, "num_input_tokens_seen": 8830976, "step": 2156 }, { "epoch": 1.6073025335320419, "grad_norm": 7.792287064716724, "learning_rate": 1.914933905002618e-06, "loss": 0.8262, "num_input_tokens_seen": 8835072, "step": 2157 }, { "epoch": 1.608047690014903, "grad_norm": 8.529029945834392, "learning_rate": 1.9144382601965584e-06, "loss": 0.7745, "num_input_tokens_seen": 8839168, "step": 2158 }, { "epoch": 1.6087928464977646, "grad_norm": 8.234757885992792, "learning_rate": 1.9139424697408877e-06, "loss": 0.8278, "num_input_tokens_seen": 8843264, "step": 2159 }, { "epoch": 1.6095380029806259, "grad_norm": 26.20059358056862, "learning_rate": 1.913446533744287e-06, "loss": 0.9915, "num_input_tokens_seen": 8847360, "step": 2160 }, { "epoch": 1.6102831594634872, "grad_norm": 9.88239972844787, "learning_rate": 1.912950452315468e-06, "loss": 0.7092, "num_input_tokens_seen": 8851456, "step": 2161 }, { "epoch": 1.6110283159463488, "grad_norm": 8.211532729495472, "learning_rate": 1.9124542255631763e-06, "loss": 0.9986, "num_input_tokens_seen": 8855552, "step": 2162 }, { "epoch": 1.61177347242921, "grad_norm": 7.79139256037522, "learning_rate": 1.9119578535961888e-06, "loss": 0.6872, "num_input_tokens_seen": 8859648, "step": 2163 }, { "epoch": 1.6125186289120714, "grad_norm": 9.139634309503647, "learning_rate": 1.9114613365233115e-06, "loss": 0.9358, "num_input_tokens_seen": 8863744, "step": 2164 }, { "epoch": 1.613263785394933, "grad_norm": 9.563572150289875, "learning_rate": 1.910964674453387e-06, "loss": 0.7972, "num_input_tokens_seen": 8867840, "step": 2165 }, { "epoch": 1.6140089418777943, "grad_norm": 8.513455701140273, "learning_rate": 1.9104678674952858e-06, "loss": 0.9061, "num_input_tokens_seen": 8871936, "step": 2166 }, { "epoch": 1.6147540983606556, "grad_norm": 7.623650578373522, "learning_rate": 1.9099709157579124e-06, "loss": 0.9333, "num_input_tokens_seen": 8876032, "step": 2167 }, { "epoch": 1.6154992548435172, "grad_norm": 11.23881823027282, "learning_rate": 1.909473819350201e-06, "loss": 0.7366, "num_input_tokens_seen": 8880128, "step": 2168 }, { "epoch": 1.6162444113263785, "grad_norm": 8.008758206318157, "learning_rate": 1.9089765783811193e-06, "loss": 0.7859, "num_input_tokens_seen": 8884224, "step": 2169 }, { "epoch": 1.6169895678092399, "grad_norm": 11.613926837137782, "learning_rate": 1.9084791929596662e-06, "loss": 0.8399, "num_input_tokens_seen": 8888320, "step": 2170 }, { "epoch": 1.6177347242921014, "grad_norm": 8.216213532134134, "learning_rate": 1.907981663194872e-06, "loss": 0.8492, "num_input_tokens_seen": 8892416, "step": 2171 }, { "epoch": 1.6184798807749627, "grad_norm": 9.851784713546044, "learning_rate": 1.9074839891957995e-06, "loss": 0.7083, "num_input_tokens_seen": 8896512, "step": 2172 }, { "epoch": 1.619225037257824, "grad_norm": 8.794532124139383, "learning_rate": 1.9069861710715413e-06, "loss": 0.8047, "num_input_tokens_seen": 8900608, "step": 2173 }, { "epoch": 1.6199701937406856, "grad_norm": 9.287845313489074, "learning_rate": 1.906488208931223e-06, "loss": 0.9092, "num_input_tokens_seen": 8904704, "step": 2174 }, { "epoch": 1.620715350223547, "grad_norm": 8.070371701383067, "learning_rate": 1.9059901028840014e-06, "loss": 0.7106, "num_input_tokens_seen": 8908800, "step": 2175 }, { "epoch": 1.6214605067064083, "grad_norm": 8.28273228435692, "learning_rate": 1.9054918530390654e-06, "loss": 0.6825, "num_input_tokens_seen": 8912896, "step": 2176 }, { "epoch": 1.6222056631892698, "grad_norm": 9.505172103958136, "learning_rate": 1.904993459505634e-06, "loss": 0.6509, "num_input_tokens_seen": 8916992, "step": 2177 }, { "epoch": 1.6229508196721312, "grad_norm": 11.44812277718759, "learning_rate": 1.90449492239296e-06, "loss": 0.8192, "num_input_tokens_seen": 8921088, "step": 2178 }, { "epoch": 1.6236959761549925, "grad_norm": 10.330339958154871, "learning_rate": 1.9039962418103248e-06, "loss": 1.0025, "num_input_tokens_seen": 8925184, "step": 2179 }, { "epoch": 1.624441132637854, "grad_norm": 9.057515667997743, "learning_rate": 1.903497417867044e-06, "loss": 0.7238, "num_input_tokens_seen": 8929280, "step": 2180 }, { "epoch": 1.6251862891207154, "grad_norm": 8.291190399042204, "learning_rate": 1.9029984506724628e-06, "loss": 0.7912, "num_input_tokens_seen": 8933376, "step": 2181 }, { "epoch": 1.6259314456035767, "grad_norm": 8.915977505716032, "learning_rate": 1.9024993403359582e-06, "loss": 0.6922, "num_input_tokens_seen": 8937472, "step": 2182 }, { "epoch": 1.6266766020864383, "grad_norm": 11.535372069406726, "learning_rate": 1.9020000869669386e-06, "loss": 0.8642, "num_input_tokens_seen": 8941568, "step": 2183 }, { "epoch": 1.6274217585692996, "grad_norm": 8.614837520957652, "learning_rate": 1.9015006906748446e-06, "loss": 0.5313, "num_input_tokens_seen": 8945664, "step": 2184 }, { "epoch": 1.628166915052161, "grad_norm": 7.782105794860148, "learning_rate": 1.9010011515691468e-06, "loss": 0.6639, "num_input_tokens_seen": 8949760, "step": 2185 }, { "epoch": 1.6289120715350225, "grad_norm": 11.207906454652196, "learning_rate": 1.9005014697593483e-06, "loss": 1.0073, "num_input_tokens_seen": 8953856, "step": 2186 }, { "epoch": 1.6296572280178836, "grad_norm": 7.799387102737545, "learning_rate": 1.9000016453549828e-06, "loss": 0.8097, "num_input_tokens_seen": 8957952, "step": 2187 }, { "epoch": 1.6304023845007451, "grad_norm": 8.24703532475951, "learning_rate": 1.899501678465615e-06, "loss": 0.887, "num_input_tokens_seen": 8962048, "step": 2188 }, { "epoch": 1.6311475409836067, "grad_norm": 9.105274340944783, "learning_rate": 1.8990015692008413e-06, "loss": 0.9171, "num_input_tokens_seen": 8966144, "step": 2189 }, { "epoch": 1.6318926974664678, "grad_norm": 7.82404930295901, "learning_rate": 1.8985013176702893e-06, "loss": 0.5415, "num_input_tokens_seen": 8970240, "step": 2190 }, { "epoch": 1.6326378539493294, "grad_norm": 8.442728519481339, "learning_rate": 1.898000923983618e-06, "loss": 0.6207, "num_input_tokens_seen": 8974336, "step": 2191 }, { "epoch": 1.633383010432191, "grad_norm": 8.874024168973046, "learning_rate": 1.897500388250517e-06, "loss": 1.0076, "num_input_tokens_seen": 8978432, "step": 2192 }, { "epoch": 1.634128166915052, "grad_norm": 8.43175920140979, "learning_rate": 1.8969997105807076e-06, "loss": 0.9423, "num_input_tokens_seen": 8982528, "step": 2193 }, { "epoch": 1.6348733233979136, "grad_norm": 8.325294772625094, "learning_rate": 1.896498891083941e-06, "loss": 0.7315, "num_input_tokens_seen": 8986624, "step": 2194 }, { "epoch": 1.635618479880775, "grad_norm": 10.438676081421605, "learning_rate": 1.8959979298700012e-06, "loss": 0.9464, "num_input_tokens_seen": 8990720, "step": 2195 }, { "epoch": 1.6363636363636362, "grad_norm": 10.019720278145698, "learning_rate": 1.8954968270487026e-06, "loss": 0.63, "num_input_tokens_seen": 8994816, "step": 2196 }, { "epoch": 1.6371087928464978, "grad_norm": 8.019465890352434, "learning_rate": 1.89499558272989e-06, "loss": 0.707, "num_input_tokens_seen": 8998912, "step": 2197 }, { "epoch": 1.6378539493293591, "grad_norm": 8.238944289878141, "learning_rate": 1.8944941970234402e-06, "loss": 0.7296, "num_input_tokens_seen": 9003008, "step": 2198 }, { "epoch": 1.6385991058122205, "grad_norm": 7.608939674587876, "learning_rate": 1.8939926700392603e-06, "loss": 0.7671, "num_input_tokens_seen": 9007104, "step": 2199 }, { "epoch": 1.639344262295082, "grad_norm": 8.538696846524397, "learning_rate": 1.8934910018872886e-06, "loss": 0.9156, "num_input_tokens_seen": 9011200, "step": 2200 }, { "epoch": 1.6400894187779433, "grad_norm": 8.520511622642362, "learning_rate": 1.8929891926774945e-06, "loss": 0.8891, "num_input_tokens_seen": 9015296, "step": 2201 }, { "epoch": 1.6408345752608047, "grad_norm": 8.855521240299383, "learning_rate": 1.8924872425198777e-06, "loss": 0.8984, "num_input_tokens_seen": 9019392, "step": 2202 }, { "epoch": 1.6415797317436662, "grad_norm": 8.84007319691643, "learning_rate": 1.8919851515244698e-06, "loss": 0.5693, "num_input_tokens_seen": 9023488, "step": 2203 }, { "epoch": 1.6423248882265276, "grad_norm": 8.00232253482481, "learning_rate": 1.8914829198013323e-06, "loss": 0.6143, "num_input_tokens_seen": 9027584, "step": 2204 }, { "epoch": 1.6430700447093889, "grad_norm": 7.731394451111798, "learning_rate": 1.8909805474605581e-06, "loss": 0.7863, "num_input_tokens_seen": 9031680, "step": 2205 }, { "epoch": 1.6438152011922504, "grad_norm": 9.1756382004893, "learning_rate": 1.8904780346122709e-06, "loss": 0.7408, "num_input_tokens_seen": 9035776, "step": 2206 }, { "epoch": 1.6445603576751118, "grad_norm": 10.122390290162471, "learning_rate": 1.8899753813666256e-06, "loss": 0.9875, "num_input_tokens_seen": 9039872, "step": 2207 }, { "epoch": 1.645305514157973, "grad_norm": 8.920941548808544, "learning_rate": 1.8894725878338064e-06, "loss": 0.7876, "num_input_tokens_seen": 9043968, "step": 2208 }, { "epoch": 1.6460506706408347, "grad_norm": 9.308484054374508, "learning_rate": 1.8889696541240298e-06, "loss": 0.9592, "num_input_tokens_seen": 9048064, "step": 2209 }, { "epoch": 1.646795827123696, "grad_norm": 9.552886547360188, "learning_rate": 1.8884665803475428e-06, "loss": 0.9166, "num_input_tokens_seen": 9052160, "step": 2210 }, { "epoch": 1.6475409836065573, "grad_norm": 9.520113268852544, "learning_rate": 1.8879633666146216e-06, "loss": 0.8387, "num_input_tokens_seen": 9056256, "step": 2211 }, { "epoch": 1.6482861400894189, "grad_norm": 8.198228006901237, "learning_rate": 1.8874600130355752e-06, "loss": 0.6236, "num_input_tokens_seen": 9060352, "step": 2212 }, { "epoch": 1.6490312965722802, "grad_norm": 7.796332251541497, "learning_rate": 1.8869565197207423e-06, "loss": 0.8283, "num_input_tokens_seen": 9064448, "step": 2213 }, { "epoch": 1.6497764530551415, "grad_norm": 8.479740042886995, "learning_rate": 1.8864528867804922e-06, "loss": 0.8317, "num_input_tokens_seen": 9068544, "step": 2214 }, { "epoch": 1.650521609538003, "grad_norm": 8.200121395595554, "learning_rate": 1.885949114325224e-06, "loss": 0.8288, "num_input_tokens_seen": 9072640, "step": 2215 }, { "epoch": 1.6512667660208644, "grad_norm": 7.964288685859538, "learning_rate": 1.885445202465369e-06, "loss": 0.8772, "num_input_tokens_seen": 9076736, "step": 2216 }, { "epoch": 1.6520119225037257, "grad_norm": 7.880538536260826, "learning_rate": 1.8849411513113886e-06, "loss": 0.7784, "num_input_tokens_seen": 9080832, "step": 2217 }, { "epoch": 1.6527570789865873, "grad_norm": 15.627687883557787, "learning_rate": 1.8844369609737734e-06, "loss": 0.6874, "num_input_tokens_seen": 9084928, "step": 2218 }, { "epoch": 1.6535022354694486, "grad_norm": 11.005424192885595, "learning_rate": 1.8839326315630463e-06, "loss": 0.831, "num_input_tokens_seen": 9089024, "step": 2219 }, { "epoch": 1.65424739195231, "grad_norm": 8.255528468096005, "learning_rate": 1.88342816318976e-06, "loss": 0.6357, "num_input_tokens_seen": 9093120, "step": 2220 }, { "epoch": 1.6549925484351715, "grad_norm": 9.947709730270454, "learning_rate": 1.8829235559644967e-06, "loss": 0.5853, "num_input_tokens_seen": 9097216, "step": 2221 }, { "epoch": 1.6557377049180326, "grad_norm": 9.592638413661437, "learning_rate": 1.8824188099978708e-06, "loss": 0.9527, "num_input_tokens_seen": 9101312, "step": 2222 }, { "epoch": 1.6564828614008942, "grad_norm": 7.367968039342473, "learning_rate": 1.8819139254005258e-06, "loss": 0.6133, "num_input_tokens_seen": 9105408, "step": 2223 }, { "epoch": 1.6572280178837557, "grad_norm": 8.631514465784523, "learning_rate": 1.8814089022831358e-06, "loss": 0.6729, "num_input_tokens_seen": 9109504, "step": 2224 }, { "epoch": 1.6579731743666168, "grad_norm": 8.18598590203017, "learning_rate": 1.8809037407564062e-06, "loss": 0.7084, "num_input_tokens_seen": 9113600, "step": 2225 }, { "epoch": 1.6587183308494784, "grad_norm": 10.49790688387346, "learning_rate": 1.8803984409310716e-06, "loss": 0.6407, "num_input_tokens_seen": 9117696, "step": 2226 }, { "epoch": 1.65946348733234, "grad_norm": 8.384707278077446, "learning_rate": 1.8798930029178968e-06, "loss": 0.8668, "num_input_tokens_seen": 9121792, "step": 2227 }, { "epoch": 1.660208643815201, "grad_norm": 9.680476205272193, "learning_rate": 1.8793874268276785e-06, "loss": 0.6095, "num_input_tokens_seen": 9125888, "step": 2228 }, { "epoch": 1.6609538002980626, "grad_norm": 8.00802622369522, "learning_rate": 1.8788817127712416e-06, "loss": 0.9043, "num_input_tokens_seen": 9129984, "step": 2229 }, { "epoch": 1.661698956780924, "grad_norm": 8.003784605931667, "learning_rate": 1.8783758608594426e-06, "loss": 0.9768, "num_input_tokens_seen": 9134080, "step": 2230 }, { "epoch": 1.6624441132637853, "grad_norm": 7.746509533675139, "learning_rate": 1.877869871203168e-06, "loss": 0.5945, "num_input_tokens_seen": 9138176, "step": 2231 }, { "epoch": 1.6631892697466468, "grad_norm": 7.280822922188494, "learning_rate": 1.8773637439133343e-06, "loss": 0.4582, "num_input_tokens_seen": 9142272, "step": 2232 }, { "epoch": 1.6639344262295082, "grad_norm": 8.551921949301235, "learning_rate": 1.8768574791008882e-06, "loss": 0.9218, "num_input_tokens_seen": 9146368, "step": 2233 }, { "epoch": 1.6646795827123695, "grad_norm": 7.926942281888369, "learning_rate": 1.8763510768768067e-06, "loss": 1.0306, "num_input_tokens_seen": 9150464, "step": 2234 }, { "epoch": 1.665424739195231, "grad_norm": 8.558601706129846, "learning_rate": 1.8758445373520961e-06, "loss": 0.8921, "num_input_tokens_seen": 9154560, "step": 2235 }, { "epoch": 1.6661698956780924, "grad_norm": 8.128492469936587, "learning_rate": 1.8753378606377944e-06, "loss": 0.7571, "num_input_tokens_seen": 9158656, "step": 2236 }, { "epoch": 1.6669150521609537, "grad_norm": 11.18574924755413, "learning_rate": 1.874831046844968e-06, "loss": 0.6254, "num_input_tokens_seen": 9162752, "step": 2237 }, { "epoch": 1.6676602086438153, "grad_norm": 8.83341182609522, "learning_rate": 1.8743240960847147e-06, "loss": 0.5998, "num_input_tokens_seen": 9166848, "step": 2238 }, { "epoch": 1.6684053651266766, "grad_norm": 8.520704869431563, "learning_rate": 1.8738170084681615e-06, "loss": 0.7507, "num_input_tokens_seen": 9170944, "step": 2239 }, { "epoch": 1.669150521609538, "grad_norm": 8.117117298178567, "learning_rate": 1.8733097841064653e-06, "loss": 0.9042, "num_input_tokens_seen": 9175040, "step": 2240 }, { "epoch": 1.6698956780923995, "grad_norm": 9.508766209923785, "learning_rate": 1.872802423110814e-06, "loss": 0.7864, "num_input_tokens_seen": 9179136, "step": 2241 }, { "epoch": 1.6706408345752608, "grad_norm": 9.103922881675874, "learning_rate": 1.8722949255924241e-06, "loss": 0.5395, "num_input_tokens_seen": 9183232, "step": 2242 }, { "epoch": 1.6713859910581221, "grad_norm": 9.351780782188971, "learning_rate": 1.871787291662543e-06, "loss": 0.6697, "num_input_tokens_seen": 9187328, "step": 2243 }, { "epoch": 1.6721311475409837, "grad_norm": 9.077896142356064, "learning_rate": 1.8712795214324475e-06, "loss": 0.6458, "num_input_tokens_seen": 9191424, "step": 2244 }, { "epoch": 1.672876304023845, "grad_norm": 10.723444035549495, "learning_rate": 1.8707716150134447e-06, "loss": 0.6309, "num_input_tokens_seen": 9195520, "step": 2245 }, { "epoch": 1.6736214605067063, "grad_norm": 8.17702944182211, "learning_rate": 1.870263572516871e-06, "loss": 0.6988, "num_input_tokens_seen": 9199616, "step": 2246 }, { "epoch": 1.674366616989568, "grad_norm": 10.837972162352633, "learning_rate": 1.8697553940540935e-06, "loss": 0.7052, "num_input_tokens_seen": 9203712, "step": 2247 }, { "epoch": 1.6751117734724292, "grad_norm": 10.774734128864756, "learning_rate": 1.869247079736508e-06, "loss": 0.5135, "num_input_tokens_seen": 9207808, "step": 2248 }, { "epoch": 1.6758569299552906, "grad_norm": 10.209386368178418, "learning_rate": 1.8687386296755406e-06, "loss": 0.7878, "num_input_tokens_seen": 9211904, "step": 2249 }, { "epoch": 1.6766020864381521, "grad_norm": 7.740850211644982, "learning_rate": 1.8682300439826476e-06, "loss": 0.8366, "num_input_tokens_seen": 9216000, "step": 2250 }, { "epoch": 1.6773472429210134, "grad_norm": 15.05220790767672, "learning_rate": 1.8677213227693143e-06, "loss": 0.9015, "num_input_tokens_seen": 9220096, "step": 2251 }, { "epoch": 1.6780923994038748, "grad_norm": 7.799746492161615, "learning_rate": 1.867212466147056e-06, "loss": 0.9677, "num_input_tokens_seen": 9224192, "step": 2252 }, { "epoch": 1.6788375558867363, "grad_norm": 7.8884804216617015, "learning_rate": 1.8667034742274181e-06, "loss": 0.8318, "num_input_tokens_seen": 9228288, "step": 2253 }, { "epoch": 1.6795827123695977, "grad_norm": 10.453995921346458, "learning_rate": 1.8661943471219748e-06, "loss": 0.9643, "num_input_tokens_seen": 9232384, "step": 2254 }, { "epoch": 1.680327868852459, "grad_norm": 10.537921571770028, "learning_rate": 1.8656850849423308e-06, "loss": 0.8523, "num_input_tokens_seen": 9236480, "step": 2255 }, { "epoch": 1.6810730253353205, "grad_norm": 8.507560626829305, "learning_rate": 1.8651756878001198e-06, "loss": 0.6501, "num_input_tokens_seen": 9240576, "step": 2256 }, { "epoch": 1.6818181818181817, "grad_norm": 9.624742336218642, "learning_rate": 1.864666155807005e-06, "loss": 0.9621, "num_input_tokens_seen": 9244672, "step": 2257 }, { "epoch": 1.6825633383010432, "grad_norm": 9.2620995366065, "learning_rate": 1.8641564890746796e-06, "loss": 0.7479, "num_input_tokens_seen": 9248768, "step": 2258 }, { "epoch": 1.6833084947839048, "grad_norm": 8.656147636852712, "learning_rate": 1.8636466877148669e-06, "loss": 0.5757, "num_input_tokens_seen": 9252864, "step": 2259 }, { "epoch": 1.6840536512667659, "grad_norm": 8.554493825216237, "learning_rate": 1.8631367518393178e-06, "loss": 0.7505, "num_input_tokens_seen": 9256960, "step": 2260 }, { "epoch": 1.6847988077496274, "grad_norm": 7.575195880614963, "learning_rate": 1.8626266815598152e-06, "loss": 0.747, "num_input_tokens_seen": 9261056, "step": 2261 }, { "epoch": 1.685543964232489, "grad_norm": 8.16475015195428, "learning_rate": 1.862116476988169e-06, "loss": 0.7668, "num_input_tokens_seen": 9265152, "step": 2262 }, { "epoch": 1.68628912071535, "grad_norm": 7.003758177537879, "learning_rate": 1.8616061382362205e-06, "loss": 0.6812, "num_input_tokens_seen": 9269248, "step": 2263 }, { "epoch": 1.6870342771982116, "grad_norm": 8.185265032737863, "learning_rate": 1.8610956654158389e-06, "loss": 0.8488, "num_input_tokens_seen": 9273344, "step": 2264 }, { "epoch": 1.687779433681073, "grad_norm": 7.943706737212765, "learning_rate": 1.8605850586389238e-06, "loss": 0.7937, "num_input_tokens_seen": 9277440, "step": 2265 }, { "epoch": 1.6885245901639343, "grad_norm": 10.296766220470802, "learning_rate": 1.8600743180174038e-06, "loss": 0.7379, "num_input_tokens_seen": 9281536, "step": 2266 }, { "epoch": 1.6892697466467959, "grad_norm": 8.76503711336752, "learning_rate": 1.8595634436632374e-06, "loss": 0.7815, "num_input_tokens_seen": 9285632, "step": 2267 }, { "epoch": 1.6900149031296572, "grad_norm": 8.87669302233529, "learning_rate": 1.859052435688411e-06, "loss": 0.5522, "num_input_tokens_seen": 9289728, "step": 2268 }, { "epoch": 1.6907600596125185, "grad_norm": 8.386011149108755, "learning_rate": 1.858541294204942e-06, "loss": 0.6393, "num_input_tokens_seen": 9293824, "step": 2269 }, { "epoch": 1.69150521609538, "grad_norm": 9.11795682732551, "learning_rate": 1.8580300193248757e-06, "loss": 0.9517, "num_input_tokens_seen": 9297920, "step": 2270 }, { "epoch": 1.6922503725782414, "grad_norm": 7.787987285012954, "learning_rate": 1.8575186111602877e-06, "loss": 0.6983, "num_input_tokens_seen": 9302016, "step": 2271 }, { "epoch": 1.6929955290611027, "grad_norm": 8.998976762612145, "learning_rate": 1.857007069823282e-06, "loss": 0.8444, "num_input_tokens_seen": 9306112, "step": 2272 }, { "epoch": 1.6937406855439643, "grad_norm": 9.233031785092333, "learning_rate": 1.856495395425992e-06, "loss": 0.7624, "num_input_tokens_seen": 9310208, "step": 2273 }, { "epoch": 1.6944858420268256, "grad_norm": 10.484879081312013, "learning_rate": 1.855983588080581e-06, "loss": 0.9802, "num_input_tokens_seen": 9314304, "step": 2274 }, { "epoch": 1.695230998509687, "grad_norm": 7.372369620466204, "learning_rate": 1.8554716478992398e-06, "loss": 0.6955, "num_input_tokens_seen": 9318400, "step": 2275 }, { "epoch": 1.6959761549925485, "grad_norm": 8.344819198466446, "learning_rate": 1.8549595749941904e-06, "loss": 0.862, "num_input_tokens_seen": 9322496, "step": 2276 }, { "epoch": 1.6967213114754098, "grad_norm": 8.042422620943558, "learning_rate": 1.854447369477682e-06, "loss": 0.7899, "num_input_tokens_seen": 9326592, "step": 2277 }, { "epoch": 1.6974664679582712, "grad_norm": 8.736732675151284, "learning_rate": 1.853935031461994e-06, "loss": 0.9078, "num_input_tokens_seen": 9330688, "step": 2278 }, { "epoch": 1.6982116244411327, "grad_norm": 8.608960267887223, "learning_rate": 1.853422561059435e-06, "loss": 0.6915, "num_input_tokens_seen": 9334784, "step": 2279 }, { "epoch": 1.698956780923994, "grad_norm": 9.80883028795717, "learning_rate": 1.8529099583823416e-06, "loss": 0.5629, "num_input_tokens_seen": 9338880, "step": 2280 }, { "epoch": 1.6997019374068554, "grad_norm": 8.50765394415782, "learning_rate": 1.8523972235430803e-06, "loss": 0.5394, "num_input_tokens_seen": 9342976, "step": 2281 }, { "epoch": 1.700447093889717, "grad_norm": 8.848708686403414, "learning_rate": 1.8518843566540462e-06, "loss": 0.5814, "num_input_tokens_seen": 9347072, "step": 2282 }, { "epoch": 1.7011922503725783, "grad_norm": 7.755032704095298, "learning_rate": 1.8513713578276631e-06, "loss": 0.9215, "num_input_tokens_seen": 9351168, "step": 2283 }, { "epoch": 1.7019374068554396, "grad_norm": 8.297678973672777, "learning_rate": 1.8508582271763841e-06, "loss": 0.4942, "num_input_tokens_seen": 9355264, "step": 2284 }, { "epoch": 1.7026825633383011, "grad_norm": 8.18611983108025, "learning_rate": 1.8503449648126912e-06, "loss": 0.765, "num_input_tokens_seen": 9359360, "step": 2285 }, { "epoch": 1.7034277198211625, "grad_norm": 8.139143875961203, "learning_rate": 1.8498315708490953e-06, "loss": 0.6078, "num_input_tokens_seen": 9363456, "step": 2286 }, { "epoch": 1.7041728763040238, "grad_norm": 8.097445449216734, "learning_rate": 1.8493180453981357e-06, "loss": 0.8763, "num_input_tokens_seen": 9367552, "step": 2287 }, { "epoch": 1.7049180327868854, "grad_norm": 12.112313753384033, "learning_rate": 1.848804388572381e-06, "loss": 0.7654, "num_input_tokens_seen": 9371648, "step": 2288 }, { "epoch": 1.7056631892697467, "grad_norm": 10.622975207935157, "learning_rate": 1.8482906004844289e-06, "loss": 0.8881, "num_input_tokens_seen": 9375744, "step": 2289 }, { "epoch": 1.706408345752608, "grad_norm": 7.338802120715555, "learning_rate": 1.8477766812469045e-06, "loss": 0.8071, "num_input_tokens_seen": 9379840, "step": 2290 }, { "epoch": 1.7071535022354696, "grad_norm": 10.046816443291181, "learning_rate": 1.8472626309724638e-06, "loss": 0.6771, "num_input_tokens_seen": 9383936, "step": 2291 }, { "epoch": 1.7078986587183307, "grad_norm": 7.751133874280362, "learning_rate": 1.8467484497737887e-06, "loss": 0.7961, "num_input_tokens_seen": 9388032, "step": 2292 }, { "epoch": 1.7086438152011922, "grad_norm": 16.003653313724104, "learning_rate": 1.8462341377635926e-06, "loss": 0.6324, "num_input_tokens_seen": 9392128, "step": 2293 }, { "epoch": 1.7093889716840538, "grad_norm": 9.241700526930371, "learning_rate": 1.8457196950546164e-06, "loss": 0.8893, "num_input_tokens_seen": 9396224, "step": 2294 }, { "epoch": 1.710134128166915, "grad_norm": 8.154101073166684, "learning_rate": 1.8452051217596287e-06, "loss": 0.742, "num_input_tokens_seen": 9400320, "step": 2295 }, { "epoch": 1.7108792846497765, "grad_norm": 8.843779041125245, "learning_rate": 1.8446904179914283e-06, "loss": 0.7978, "num_input_tokens_seen": 9404416, "step": 2296 }, { "epoch": 1.711624441132638, "grad_norm": 11.64101818366343, "learning_rate": 1.8441755838628417e-06, "loss": 0.4678, "num_input_tokens_seen": 9408512, "step": 2297 }, { "epoch": 1.7123695976154991, "grad_norm": 8.611934449300326, "learning_rate": 1.8436606194867244e-06, "loss": 0.2787, "num_input_tokens_seen": 9412608, "step": 2298 }, { "epoch": 1.7131147540983607, "grad_norm": 9.013300777224702, "learning_rate": 1.8431455249759602e-06, "loss": 0.8856, "num_input_tokens_seen": 9416704, "step": 2299 }, { "epoch": 1.713859910581222, "grad_norm": 8.100434438493032, "learning_rate": 1.8426303004434615e-06, "loss": 0.7759, "num_input_tokens_seen": 9420800, "step": 2300 }, { "epoch": 1.7146050670640833, "grad_norm": 9.665760590787011, "learning_rate": 1.8421149460021696e-06, "loss": 0.6387, "num_input_tokens_seen": 9424896, "step": 2301 }, { "epoch": 1.7153502235469449, "grad_norm": 23.370607974055382, "learning_rate": 1.8415994617650528e-06, "loss": 1.1064, "num_input_tokens_seen": 9428992, "step": 2302 }, { "epoch": 1.7160953800298062, "grad_norm": 7.727299415232526, "learning_rate": 1.8410838478451098e-06, "loss": 0.6034, "num_input_tokens_seen": 9433088, "step": 2303 }, { "epoch": 1.7168405365126675, "grad_norm": 12.428530204709476, "learning_rate": 1.8405681043553667e-06, "loss": 0.5015, "num_input_tokens_seen": 9437184, "step": 2304 }, { "epoch": 1.717585692995529, "grad_norm": 8.272018454134738, "learning_rate": 1.8400522314088778e-06, "loss": 0.8543, "num_input_tokens_seen": 9441280, "step": 2305 }, { "epoch": 1.7183308494783904, "grad_norm": 9.778384517267424, "learning_rate": 1.8395362291187269e-06, "loss": 0.8591, "num_input_tokens_seen": 9445376, "step": 2306 }, { "epoch": 1.7190760059612518, "grad_norm": 8.545936158475575, "learning_rate": 1.8390200975980244e-06, "loss": 0.5112, "num_input_tokens_seen": 9449472, "step": 2307 }, { "epoch": 1.7198211624441133, "grad_norm": 8.477452831817542, "learning_rate": 1.838503836959911e-06, "loss": 0.7991, "num_input_tokens_seen": 9453568, "step": 2308 }, { "epoch": 1.7205663189269746, "grad_norm": 10.32453915017995, "learning_rate": 1.8379874473175543e-06, "loss": 0.5431, "num_input_tokens_seen": 9457664, "step": 2309 }, { "epoch": 1.721311475409836, "grad_norm": 7.145090860514174, "learning_rate": 1.8374709287841503e-06, "loss": 0.843, "num_input_tokens_seen": 9461760, "step": 2310 }, { "epoch": 1.7220566318926975, "grad_norm": 9.412816591937943, "learning_rate": 1.836954281472924e-06, "loss": 0.8532, "num_input_tokens_seen": 9465856, "step": 2311 }, { "epoch": 1.7228017883755589, "grad_norm": 9.598761665667444, "learning_rate": 1.8364375054971276e-06, "loss": 0.8601, "num_input_tokens_seen": 9469952, "step": 2312 }, { "epoch": 1.7235469448584202, "grad_norm": 8.308977203336884, "learning_rate": 1.8359206009700425e-06, "loss": 0.7313, "num_input_tokens_seen": 9474048, "step": 2313 }, { "epoch": 1.7242921013412817, "grad_norm": 10.494608078421246, "learning_rate": 1.835403568004978e-06, "loss": 0.759, "num_input_tokens_seen": 9478144, "step": 2314 }, { "epoch": 1.725037257824143, "grad_norm": 6.714086091479885, "learning_rate": 1.8348864067152713e-06, "loss": 0.6122, "num_input_tokens_seen": 9482240, "step": 2315 }, { "epoch": 1.7257824143070044, "grad_norm": 8.800028088730919, "learning_rate": 1.8343691172142877e-06, "loss": 0.8862, "num_input_tokens_seen": 9486336, "step": 2316 }, { "epoch": 1.726527570789866, "grad_norm": 10.837121625961496, "learning_rate": 1.8338516996154206e-06, "loss": 0.8742, "num_input_tokens_seen": 9490432, "step": 2317 }, { "epoch": 1.7272727272727273, "grad_norm": 7.427954645792868, "learning_rate": 1.8333341540320923e-06, "loss": 0.8891, "num_input_tokens_seen": 9494528, "step": 2318 }, { "epoch": 1.7280178837555886, "grad_norm": 9.225854306887738, "learning_rate": 1.832816480577752e-06, "loss": 0.8102, "num_input_tokens_seen": 9498624, "step": 2319 }, { "epoch": 1.7287630402384502, "grad_norm": 8.836927628331834, "learning_rate": 1.832298679365877e-06, "loss": 0.8622, "num_input_tokens_seen": 9502720, "step": 2320 }, { "epoch": 1.7295081967213115, "grad_norm": 8.061572815692978, "learning_rate": 1.8317807505099742e-06, "loss": 0.6037, "num_input_tokens_seen": 9506816, "step": 2321 }, { "epoch": 1.7302533532041728, "grad_norm": 7.994009380675791, "learning_rate": 1.831262694123576e-06, "loss": 0.9767, "num_input_tokens_seen": 9510912, "step": 2322 }, { "epoch": 1.7309985096870344, "grad_norm": 10.537917183651002, "learning_rate": 1.8307445103202452e-06, "loss": 0.5912, "num_input_tokens_seen": 9515008, "step": 2323 }, { "epoch": 1.7317436661698957, "grad_norm": 8.843240463695896, "learning_rate": 1.8302261992135704e-06, "loss": 0.8564, "num_input_tokens_seen": 9519104, "step": 2324 }, { "epoch": 1.732488822652757, "grad_norm": 8.861288179408156, "learning_rate": 1.8297077609171696e-06, "loss": 0.9171, "num_input_tokens_seen": 9523200, "step": 2325 }, { "epoch": 1.7332339791356186, "grad_norm": 9.231948639086907, "learning_rate": 1.829189195544688e-06, "loss": 0.6807, "num_input_tokens_seen": 9527296, "step": 2326 }, { "epoch": 1.7339791356184797, "grad_norm": 11.386908264299368, "learning_rate": 1.8286705032097995e-06, "loss": 0.6739, "num_input_tokens_seen": 9531392, "step": 2327 }, { "epoch": 1.7347242921013413, "grad_norm": 7.297571148757346, "learning_rate": 1.8281516840262044e-06, "loss": 0.7216, "num_input_tokens_seen": 9535488, "step": 2328 }, { "epoch": 1.7354694485842028, "grad_norm": 8.747028053997823, "learning_rate": 1.8276327381076317e-06, "loss": 0.8508, "num_input_tokens_seen": 9539584, "step": 2329 }, { "epoch": 1.736214605067064, "grad_norm": 7.69911163026827, "learning_rate": 1.827113665567838e-06, "loss": 0.5607, "num_input_tokens_seen": 9543680, "step": 2330 }, { "epoch": 1.7369597615499255, "grad_norm": 8.275040614088988, "learning_rate": 1.8265944665206079e-06, "loss": 0.7823, "num_input_tokens_seen": 9547776, "step": 2331 }, { "epoch": 1.737704918032787, "grad_norm": 9.815895562612097, "learning_rate": 1.8260751410797533e-06, "loss": 0.383, "num_input_tokens_seen": 9551872, "step": 2332 }, { "epoch": 1.7384500745156481, "grad_norm": 8.021810707615307, "learning_rate": 1.8255556893591147e-06, "loss": 0.8413, "num_input_tokens_seen": 9555968, "step": 2333 }, { "epoch": 1.7391952309985097, "grad_norm": 8.833206771375405, "learning_rate": 1.825036111472559e-06, "loss": 0.8302, "num_input_tokens_seen": 9560064, "step": 2334 }, { "epoch": 1.7399403874813713, "grad_norm": 7.718258688075683, "learning_rate": 1.8245164075339816e-06, "loss": 0.7792, "num_input_tokens_seen": 9564160, "step": 2335 }, { "epoch": 1.7406855439642324, "grad_norm": 8.294095890701648, "learning_rate": 1.8239965776573048e-06, "loss": 0.8532, "num_input_tokens_seen": 9568256, "step": 2336 }, { "epoch": 1.741430700447094, "grad_norm": 9.381624871562675, "learning_rate": 1.8234766219564804e-06, "loss": 0.8345, "num_input_tokens_seen": 9572352, "step": 2337 }, { "epoch": 1.7421758569299552, "grad_norm": 9.148750611810641, "learning_rate": 1.8229565405454846e-06, "loss": 0.6299, "num_input_tokens_seen": 9576448, "step": 2338 }, { "epoch": 1.7429210134128166, "grad_norm": 8.60010894170353, "learning_rate": 1.822436333538324e-06, "loss": 0.7637, "num_input_tokens_seen": 9580544, "step": 2339 }, { "epoch": 1.7436661698956781, "grad_norm": 8.91182333882864, "learning_rate": 1.8219160010490316e-06, "loss": 0.7656, "num_input_tokens_seen": 9584640, "step": 2340 }, { "epoch": 1.7444113263785395, "grad_norm": 10.514150177280014, "learning_rate": 1.821395543191668e-06, "loss": 0.7814, "num_input_tokens_seen": 9588736, "step": 2341 }, { "epoch": 1.7451564828614008, "grad_norm": 21.154189991331116, "learning_rate": 1.8208749600803214e-06, "loss": 0.5757, "num_input_tokens_seen": 9592832, "step": 2342 }, { "epoch": 1.7459016393442623, "grad_norm": 8.416830811980596, "learning_rate": 1.8203542518291067e-06, "loss": 0.7427, "num_input_tokens_seen": 9596928, "step": 2343 }, { "epoch": 1.7466467958271237, "grad_norm": 8.347458127230686, "learning_rate": 1.8198334185521677e-06, "loss": 1.0191, "num_input_tokens_seen": 9601024, "step": 2344 }, { "epoch": 1.747391952309985, "grad_norm": 7.293680567723095, "learning_rate": 1.8193124603636744e-06, "loss": 0.7398, "num_input_tokens_seen": 9605120, "step": 2345 }, { "epoch": 1.7481371087928466, "grad_norm": 7.820492279378751, "learning_rate": 1.8187913773778244e-06, "loss": 0.7138, "num_input_tokens_seen": 9609216, "step": 2346 }, { "epoch": 1.748882265275708, "grad_norm": 8.00778134155774, "learning_rate": 1.8182701697088428e-06, "loss": 0.806, "num_input_tokens_seen": 9613312, "step": 2347 }, { "epoch": 1.7496274217585692, "grad_norm": 8.912524689283552, "learning_rate": 1.8177488374709823e-06, "loss": 0.5165, "num_input_tokens_seen": 9617408, "step": 2348 }, { "epoch": 1.7503725782414308, "grad_norm": 8.46395004083072, "learning_rate": 1.8172273807785225e-06, "loss": 0.9544, "num_input_tokens_seen": 9621504, "step": 2349 }, { "epoch": 1.751117734724292, "grad_norm": 8.203259599355519, "learning_rate": 1.8167057997457705e-06, "loss": 0.9882, "num_input_tokens_seen": 9625600, "step": 2350 }, { "epoch": 1.7518628912071534, "grad_norm": 7.814484278378462, "learning_rate": 1.8161840944870604e-06, "loss": 0.4561, "num_input_tokens_seen": 9629696, "step": 2351 }, { "epoch": 1.752608047690015, "grad_norm": 7.5704600729131135, "learning_rate": 1.815662265116754e-06, "loss": 0.6001, "num_input_tokens_seen": 9633792, "step": 2352 }, { "epoch": 1.7533532041728763, "grad_norm": 8.25312069250954, "learning_rate": 1.81514031174924e-06, "loss": 1.0491, "num_input_tokens_seen": 9637888, "step": 2353 }, { "epoch": 1.7540983606557377, "grad_norm": 7.295764915493271, "learning_rate": 1.814618234498934e-06, "loss": 0.766, "num_input_tokens_seen": 9641984, "step": 2354 }, { "epoch": 1.7548435171385992, "grad_norm": 9.26945901827419, "learning_rate": 1.8140960334802798e-06, "loss": 0.8135, "num_input_tokens_seen": 9646080, "step": 2355 }, { "epoch": 1.7555886736214605, "grad_norm": 8.475458413634435, "learning_rate": 1.8135737088077465e-06, "loss": 0.5521, "num_input_tokens_seen": 9650176, "step": 2356 }, { "epoch": 1.7563338301043219, "grad_norm": 10.301419431767837, "learning_rate": 1.813051260595832e-06, "loss": 0.8375, "num_input_tokens_seen": 9654272, "step": 2357 }, { "epoch": 1.7570789865871834, "grad_norm": 9.261251518721503, "learning_rate": 1.8125286889590608e-06, "loss": 0.7759, "num_input_tokens_seen": 9658368, "step": 2358 }, { "epoch": 1.7578241430700448, "grad_norm": 8.441948681656074, "learning_rate": 1.812005994011984e-06, "loss": 0.6984, "num_input_tokens_seen": 9662464, "step": 2359 }, { "epoch": 1.758569299552906, "grad_norm": 13.740294854868147, "learning_rate": 1.8114831758691803e-06, "loss": 0.7767, "num_input_tokens_seen": 9666560, "step": 2360 }, { "epoch": 1.7593144560357676, "grad_norm": 11.50556348396478, "learning_rate": 1.8109602346452547e-06, "loss": 0.5785, "num_input_tokens_seen": 9670656, "step": 2361 }, { "epoch": 1.7600596125186287, "grad_norm": 8.32743774185525, "learning_rate": 1.810437170454841e-06, "loss": 0.5541, "num_input_tokens_seen": 9674752, "step": 2362 }, { "epoch": 1.7608047690014903, "grad_norm": 9.180289257861563, "learning_rate": 1.809913983412597e-06, "loss": 0.7538, "num_input_tokens_seen": 9678848, "step": 2363 }, { "epoch": 1.7615499254843519, "grad_norm": 11.71025433897607, "learning_rate": 1.80939067363321e-06, "loss": 0.9452, "num_input_tokens_seen": 9682944, "step": 2364 }, { "epoch": 1.762295081967213, "grad_norm": 9.124129473117126, "learning_rate": 1.8088672412313927e-06, "loss": 0.893, "num_input_tokens_seen": 9687040, "step": 2365 }, { "epoch": 1.7630402384500745, "grad_norm": 8.844565544382377, "learning_rate": 1.8083436863218854e-06, "loss": 0.6094, "num_input_tokens_seen": 9691136, "step": 2366 }, { "epoch": 1.763785394932936, "grad_norm": 8.310024979707773, "learning_rate": 1.8078200090194552e-06, "loss": 0.6885, "num_input_tokens_seen": 9695232, "step": 2367 }, { "epoch": 1.7645305514157972, "grad_norm": 14.442107236443215, "learning_rate": 1.8072962094388958e-06, "loss": 0.7585, "num_input_tokens_seen": 9699328, "step": 2368 }, { "epoch": 1.7652757078986587, "grad_norm": 9.420515409029731, "learning_rate": 1.806772287695028e-06, "loss": 0.7558, "num_input_tokens_seen": 9703424, "step": 2369 }, { "epoch": 1.7660208643815203, "grad_norm": 8.324285195212928, "learning_rate": 1.806248243902699e-06, "loss": 0.6304, "num_input_tokens_seen": 9707520, "step": 2370 }, { "epoch": 1.7667660208643814, "grad_norm": 8.892426457589803, "learning_rate": 1.8057240781767826e-06, "loss": 0.5273, "num_input_tokens_seen": 9711616, "step": 2371 }, { "epoch": 1.767511177347243, "grad_norm": 9.662432878392186, "learning_rate": 1.8051997906321805e-06, "loss": 0.7844, "num_input_tokens_seen": 9715712, "step": 2372 }, { "epoch": 1.7682563338301043, "grad_norm": 14.90721024678128, "learning_rate": 1.8046753813838197e-06, "loss": 0.5586, "num_input_tokens_seen": 9719808, "step": 2373 }, { "epoch": 1.7690014903129656, "grad_norm": 11.977130604534105, "learning_rate": 1.8041508505466546e-06, "loss": 0.9281, "num_input_tokens_seen": 9723904, "step": 2374 }, { "epoch": 1.7697466467958272, "grad_norm": 8.54015086292218, "learning_rate": 1.8036261982356663e-06, "loss": 0.7286, "num_input_tokens_seen": 9728000, "step": 2375 }, { "epoch": 1.7704918032786885, "grad_norm": 8.900640173356859, "learning_rate": 1.8031014245658614e-06, "loss": 0.7782, "num_input_tokens_seen": 9732096, "step": 2376 }, { "epoch": 1.7712369597615498, "grad_norm": 9.90082594730072, "learning_rate": 1.802576529652275e-06, "loss": 0.8235, "num_input_tokens_seen": 9736192, "step": 2377 }, { "epoch": 1.7719821162444114, "grad_norm": 8.86700207776786, "learning_rate": 1.8020515136099678e-06, "loss": 0.9386, "num_input_tokens_seen": 9740288, "step": 2378 }, { "epoch": 1.7727272727272727, "grad_norm": 9.731797018831218, "learning_rate": 1.8015263765540267e-06, "loss": 0.5928, "num_input_tokens_seen": 9744384, "step": 2379 }, { "epoch": 1.773472429210134, "grad_norm": 8.528167044306393, "learning_rate": 1.8010011185995657e-06, "loss": 0.7419, "num_input_tokens_seen": 9748480, "step": 2380 }, { "epoch": 1.7742175856929956, "grad_norm": 7.713186697616506, "learning_rate": 1.800475739861725e-06, "loss": 0.8327, "num_input_tokens_seen": 9752576, "step": 2381 }, { "epoch": 1.774962742175857, "grad_norm": 7.5998537805838, "learning_rate": 1.7999502404556712e-06, "loss": 0.7862, "num_input_tokens_seen": 9756672, "step": 2382 }, { "epoch": 1.7757078986587183, "grad_norm": 7.7332804320127195, "learning_rate": 1.7994246204965979e-06, "loss": 0.805, "num_input_tokens_seen": 9760768, "step": 2383 }, { "epoch": 1.7764530551415798, "grad_norm": 7.335789610512906, "learning_rate": 1.7988988800997246e-06, "loss": 0.8415, "num_input_tokens_seen": 9764864, "step": 2384 }, { "epoch": 1.7771982116244411, "grad_norm": 7.6644258092372946, "learning_rate": 1.7983730193802967e-06, "loss": 0.7795, "num_input_tokens_seen": 9768960, "step": 2385 }, { "epoch": 1.7779433681073025, "grad_norm": 8.284551972391863, "learning_rate": 1.797847038453588e-06, "loss": 0.7664, "num_input_tokens_seen": 9773056, "step": 2386 }, { "epoch": 1.778688524590164, "grad_norm": 8.512277664872641, "learning_rate": 1.797320937434896e-06, "loss": 0.5822, "num_input_tokens_seen": 9777152, "step": 2387 }, { "epoch": 1.7794336810730254, "grad_norm": 8.073488797584416, "learning_rate": 1.7967947164395461e-06, "loss": 0.896, "num_input_tokens_seen": 9781248, "step": 2388 }, { "epoch": 1.7801788375558867, "grad_norm": 8.89662061146189, "learning_rate": 1.7962683755828906e-06, "loss": 0.6895, "num_input_tokens_seen": 9785344, "step": 2389 }, { "epoch": 1.7809239940387482, "grad_norm": 8.205241773682843, "learning_rate": 1.795741914980306e-06, "loss": 0.8229, "num_input_tokens_seen": 9789440, "step": 2390 }, { "epoch": 1.7816691505216096, "grad_norm": 7.847681741912531, "learning_rate": 1.7952153347471967e-06, "loss": 0.8198, "num_input_tokens_seen": 9793536, "step": 2391 }, { "epoch": 1.782414307004471, "grad_norm": 7.815129063608094, "learning_rate": 1.794688634998993e-06, "loss": 0.8972, "num_input_tokens_seen": 9797632, "step": 2392 }, { "epoch": 1.7831594634873325, "grad_norm": 8.633896759539928, "learning_rate": 1.794161815851151e-06, "loss": 0.9238, "num_input_tokens_seen": 9801728, "step": 2393 }, { "epoch": 1.7839046199701938, "grad_norm": 8.943394518919003, "learning_rate": 1.793634877419153e-06, "loss": 0.6836, "num_input_tokens_seen": 9805824, "step": 2394 }, { "epoch": 1.7846497764530551, "grad_norm": 9.757024281510388, "learning_rate": 1.7931078198185086e-06, "loss": 0.6351, "num_input_tokens_seen": 9809920, "step": 2395 }, { "epoch": 1.7853949329359167, "grad_norm": 8.954390689233136, "learning_rate": 1.7925806431647517e-06, "loss": 1.0468, "num_input_tokens_seen": 9814016, "step": 2396 }, { "epoch": 1.786140089418778, "grad_norm": 8.340797555549566, "learning_rate": 1.7920533475734435e-06, "loss": 0.807, "num_input_tokens_seen": 9818112, "step": 2397 }, { "epoch": 1.7868852459016393, "grad_norm": 8.738238555090508, "learning_rate": 1.7915259331601706e-06, "loss": 0.7353, "num_input_tokens_seen": 9822208, "step": 2398 }, { "epoch": 1.7876304023845009, "grad_norm": 8.709903287066567, "learning_rate": 1.7909984000405464e-06, "loss": 0.8917, "num_input_tokens_seen": 9826304, "step": 2399 }, { "epoch": 1.788375558867362, "grad_norm": 8.441799843855883, "learning_rate": 1.7904707483302101e-06, "loss": 0.6224, "num_input_tokens_seen": 9830400, "step": 2400 }, { "epoch": 1.7891207153502235, "grad_norm": 8.598078976814792, "learning_rate": 1.7899429781448264e-06, "loss": 0.5491, "num_input_tokens_seen": 9834496, "step": 2401 }, { "epoch": 1.789865871833085, "grad_norm": 9.33557575470883, "learning_rate": 1.7894150896000862e-06, "loss": 0.7425, "num_input_tokens_seen": 9838592, "step": 2402 }, { "epoch": 1.7906110283159462, "grad_norm": 9.263978074542226, "learning_rate": 1.7888870828117067e-06, "loss": 0.9035, "num_input_tokens_seen": 9842688, "step": 2403 }, { "epoch": 1.7913561847988078, "grad_norm": 7.872392238386693, "learning_rate": 1.7883589578954305e-06, "loss": 0.815, "num_input_tokens_seen": 9846784, "step": 2404 }, { "epoch": 1.7921013412816693, "grad_norm": 8.524420843009068, "learning_rate": 1.7878307149670265e-06, "loss": 0.8222, "num_input_tokens_seen": 9850880, "step": 2405 }, { "epoch": 1.7928464977645304, "grad_norm": 9.87840993641222, "learning_rate": 1.7873023541422893e-06, "loss": 0.629, "num_input_tokens_seen": 9854976, "step": 2406 }, { "epoch": 1.793591654247392, "grad_norm": 7.13459696096052, "learning_rate": 1.7867738755370396e-06, "loss": 0.6531, "num_input_tokens_seen": 9859072, "step": 2407 }, { "epoch": 1.7943368107302533, "grad_norm": 7.732266012763201, "learning_rate": 1.7862452792671235e-06, "loss": 0.8068, "num_input_tokens_seen": 9863168, "step": 2408 }, { "epoch": 1.7950819672131146, "grad_norm": 8.907054305092517, "learning_rate": 1.7857165654484128e-06, "loss": 0.678, "num_input_tokens_seen": 9867264, "step": 2409 }, { "epoch": 1.7958271236959762, "grad_norm": 8.55492132951122, "learning_rate": 1.785187734196806e-06, "loss": 0.7056, "num_input_tokens_seen": 9871360, "step": 2410 }, { "epoch": 1.7965722801788375, "grad_norm": 7.420155289116418, "learning_rate": 1.784658785628226e-06, "loss": 0.8072, "num_input_tokens_seen": 9875456, "step": 2411 }, { "epoch": 1.7973174366616989, "grad_norm": 8.046119830948129, "learning_rate": 1.7841297198586222e-06, "loss": 0.5477, "num_input_tokens_seen": 9879552, "step": 2412 }, { "epoch": 1.7980625931445604, "grad_norm": 9.297186985407754, "learning_rate": 1.78360053700397e-06, "loss": 0.8398, "num_input_tokens_seen": 9883648, "step": 2413 }, { "epoch": 1.7988077496274217, "grad_norm": 7.99630448321521, "learning_rate": 1.7830712371802697e-06, "loss": 1.0525, "num_input_tokens_seen": 9887744, "step": 2414 }, { "epoch": 1.799552906110283, "grad_norm": 9.001507500526335, "learning_rate": 1.7825418205035477e-06, "loss": 1.0611, "num_input_tokens_seen": 9891840, "step": 2415 }, { "epoch": 1.8002980625931446, "grad_norm": 8.677130050791076, "learning_rate": 1.7820122870898565e-06, "loss": 0.7547, "num_input_tokens_seen": 9895936, "step": 2416 }, { "epoch": 1.801043219076006, "grad_norm": 7.4212418091004855, "learning_rate": 1.7814826370552727e-06, "loss": 0.6974, "num_input_tokens_seen": 9900032, "step": 2417 }, { "epoch": 1.8017883755588673, "grad_norm": 10.843121137961587, "learning_rate": 1.7809528705158995e-06, "loss": 0.758, "num_input_tokens_seen": 9904128, "step": 2418 }, { "epoch": 1.8025335320417288, "grad_norm": 7.905184215720326, "learning_rate": 1.7804229875878663e-06, "loss": 0.796, "num_input_tokens_seen": 9908224, "step": 2419 }, { "epoch": 1.8032786885245902, "grad_norm": 7.051708973432077, "learning_rate": 1.7798929883873262e-06, "loss": 0.9959, "num_input_tokens_seen": 9912320, "step": 2420 }, { "epoch": 1.8040238450074515, "grad_norm": 8.042029576705318, "learning_rate": 1.779362873030459e-06, "loss": 0.663, "num_input_tokens_seen": 9916416, "step": 2421 }, { "epoch": 1.804769001490313, "grad_norm": 8.693228615908083, "learning_rate": 1.778832641633471e-06, "loss": 0.9598, "num_input_tokens_seen": 9920512, "step": 2422 }, { "epoch": 1.8055141579731744, "grad_norm": 7.450408391416711, "learning_rate": 1.778302294312591e-06, "loss": 0.9081, "num_input_tokens_seen": 9924608, "step": 2423 }, { "epoch": 1.8062593144560357, "grad_norm": 8.248239109257163, "learning_rate": 1.7777718311840759e-06, "loss": 0.5507, "num_input_tokens_seen": 9928704, "step": 2424 }, { "epoch": 1.8070044709388973, "grad_norm": 11.489098577607285, "learning_rate": 1.7772412523642066e-06, "loss": 0.7036, "num_input_tokens_seen": 9932800, "step": 2425 }, { "epoch": 1.8077496274217586, "grad_norm": 13.106010967784599, "learning_rate": 1.77671055796929e-06, "loss": 0.8037, "num_input_tokens_seen": 9936896, "step": 2426 }, { "epoch": 1.80849478390462, "grad_norm": 7.52809321750742, "learning_rate": 1.7761797481156582e-06, "loss": 0.5273, "num_input_tokens_seen": 9940992, "step": 2427 }, { "epoch": 1.8092399403874815, "grad_norm": 9.927189792905184, "learning_rate": 1.7756488229196684e-06, "loss": 0.8206, "num_input_tokens_seen": 9945088, "step": 2428 }, { "epoch": 1.8099850968703428, "grad_norm": 8.173010404667746, "learning_rate": 1.7751177824977029e-06, "loss": 0.6955, "num_input_tokens_seen": 9949184, "step": 2429 }, { "epoch": 1.8107302533532041, "grad_norm": 9.33004047688684, "learning_rate": 1.77458662696617e-06, "loss": 1.0713, "num_input_tokens_seen": 9953280, "step": 2430 }, { "epoch": 1.8114754098360657, "grad_norm": 7.589780314348917, "learning_rate": 1.7740553564415025e-06, "loss": 0.656, "num_input_tokens_seen": 9957376, "step": 2431 }, { "epoch": 1.812220566318927, "grad_norm": 8.155988670271267, "learning_rate": 1.7735239710401585e-06, "loss": 0.5698, "num_input_tokens_seen": 9961472, "step": 2432 }, { "epoch": 1.8129657228017884, "grad_norm": 9.989610125299546, "learning_rate": 1.7729924708786222e-06, "loss": 0.8074, "num_input_tokens_seen": 9965568, "step": 2433 }, { "epoch": 1.81371087928465, "grad_norm": 7.7852002125947495, "learning_rate": 1.7724608560734014e-06, "loss": 0.5885, "num_input_tokens_seen": 9969664, "step": 2434 }, { "epoch": 1.814456035767511, "grad_norm": 8.525620685217552, "learning_rate": 1.7719291267410305e-06, "loss": 0.7707, "num_input_tokens_seen": 9973760, "step": 2435 }, { "epoch": 1.8152011922503726, "grad_norm": 7.7737492063286675, "learning_rate": 1.7713972829980682e-06, "loss": 0.975, "num_input_tokens_seen": 9977856, "step": 2436 }, { "epoch": 1.8159463487332341, "grad_norm": 7.950141262315794, "learning_rate": 1.7708653249610985e-06, "loss": 0.5786, "num_input_tokens_seen": 9981952, "step": 2437 }, { "epoch": 1.8166915052160952, "grad_norm": 8.042212875268715, "learning_rate": 1.7703332527467298e-06, "loss": 0.8454, "num_input_tokens_seen": 9986048, "step": 2438 }, { "epoch": 1.8174366616989568, "grad_norm": 8.134710291676068, "learning_rate": 1.7698010664715966e-06, "loss": 0.7454, "num_input_tokens_seen": 9990144, "step": 2439 }, { "epoch": 1.8181818181818183, "grad_norm": 11.88614474935251, "learning_rate": 1.7692687662523584e-06, "loss": 0.8091, "num_input_tokens_seen": 9994240, "step": 2440 }, { "epoch": 1.8189269746646795, "grad_norm": 9.5622606165243, "learning_rate": 1.7687363522056982e-06, "loss": 1.0315, "num_input_tokens_seen": 9998336, "step": 2441 }, { "epoch": 1.819672131147541, "grad_norm": 7.6859647781590095, "learning_rate": 1.7682038244483258e-06, "loss": 0.7675, "num_input_tokens_seen": 10002432, "step": 2442 }, { "epoch": 1.8204172876304023, "grad_norm": 9.264219736104378, "learning_rate": 1.7676711830969747e-06, "loss": 0.5989, "num_input_tokens_seen": 10006528, "step": 2443 }, { "epoch": 1.8211624441132637, "grad_norm": 8.510270518251138, "learning_rate": 1.7671384282684036e-06, "loss": 0.756, "num_input_tokens_seen": 10010624, "step": 2444 }, { "epoch": 1.8219076005961252, "grad_norm": 13.875468408321849, "learning_rate": 1.7666055600793964e-06, "loss": 0.7149, "num_input_tokens_seen": 10014720, "step": 2445 }, { "epoch": 1.8226527570789866, "grad_norm": 9.950735995776311, "learning_rate": 1.766072578646762e-06, "loss": 0.6809, "num_input_tokens_seen": 10018816, "step": 2446 }, { "epoch": 1.8233979135618479, "grad_norm": 10.272900389352584, "learning_rate": 1.7655394840873326e-06, "loss": 0.9233, "num_input_tokens_seen": 10022912, "step": 2447 }, { "epoch": 1.8241430700447094, "grad_norm": 8.064709585897042, "learning_rate": 1.7650062765179674e-06, "loss": 1.144, "num_input_tokens_seen": 10027008, "step": 2448 }, { "epoch": 1.8248882265275708, "grad_norm": 8.084998198398454, "learning_rate": 1.764472956055549e-06, "loss": 0.6004, "num_input_tokens_seen": 10031104, "step": 2449 }, { "epoch": 1.825633383010432, "grad_norm": 8.436460203767586, "learning_rate": 1.7639395228169848e-06, "loss": 0.7301, "num_input_tokens_seen": 10035200, "step": 2450 }, { "epoch": 1.8263785394932937, "grad_norm": 9.26658859551443, "learning_rate": 1.7634059769192071e-06, "loss": 0.7768, "num_input_tokens_seen": 10039296, "step": 2451 }, { "epoch": 1.827123695976155, "grad_norm": 9.431642112251053, "learning_rate": 1.7628723184791736e-06, "loss": 0.7108, "num_input_tokens_seen": 10043392, "step": 2452 }, { "epoch": 1.8278688524590163, "grad_norm": 7.527531652613582, "learning_rate": 1.7623385476138651e-06, "loss": 0.8036, "num_input_tokens_seen": 10047488, "step": 2453 }, { "epoch": 1.8286140089418779, "grad_norm": 8.92138788916326, "learning_rate": 1.761804664440289e-06, "loss": 0.9896, "num_input_tokens_seen": 10051584, "step": 2454 }, { "epoch": 1.8293591654247392, "grad_norm": 6.513378132982973, "learning_rate": 1.7612706690754757e-06, "loss": 0.7696, "num_input_tokens_seen": 10055680, "step": 2455 }, { "epoch": 1.8301043219076005, "grad_norm": 8.873881809706916, "learning_rate": 1.7607365616364814e-06, "loss": 0.5282, "num_input_tokens_seen": 10059776, "step": 2456 }, { "epoch": 1.830849478390462, "grad_norm": 9.125218705583782, "learning_rate": 1.7602023422403851e-06, "loss": 0.8774, "num_input_tokens_seen": 10063872, "step": 2457 }, { "epoch": 1.8315946348733234, "grad_norm": 8.070783928192013, "learning_rate": 1.7596680110042927e-06, "loss": 0.9739, "num_input_tokens_seen": 10067968, "step": 2458 }, { "epoch": 1.8323397913561847, "grad_norm": 6.755364662145685, "learning_rate": 1.7591335680453326e-06, "loss": 0.7715, "num_input_tokens_seen": 10072064, "step": 2459 }, { "epoch": 1.8330849478390463, "grad_norm": 8.715903819086746, "learning_rate": 1.7585990134806591e-06, "loss": 0.9166, "num_input_tokens_seen": 10076160, "step": 2460 }, { "epoch": 1.8338301043219076, "grad_norm": 7.399957049223643, "learning_rate": 1.7580643474274498e-06, "loss": 0.6351, "num_input_tokens_seen": 10080256, "step": 2461 }, { "epoch": 1.834575260804769, "grad_norm": 10.600404632934884, "learning_rate": 1.7575295700029077e-06, "loss": 0.7865, "num_input_tokens_seen": 10084352, "step": 2462 }, { "epoch": 1.8353204172876305, "grad_norm": 7.746696261420446, "learning_rate": 1.7569946813242603e-06, "loss": 0.9527, "num_input_tokens_seen": 10088448, "step": 2463 }, { "epoch": 1.8360655737704918, "grad_norm": 9.511525305969432, "learning_rate": 1.756459681508758e-06, "loss": 0.71, "num_input_tokens_seen": 10092544, "step": 2464 }, { "epoch": 1.8368107302533532, "grad_norm": 16.909213817250695, "learning_rate": 1.7559245706736775e-06, "loss": 0.6768, "num_input_tokens_seen": 10096640, "step": 2465 }, { "epoch": 1.8375558867362147, "grad_norm": 8.19277751426186, "learning_rate": 1.755389348936318e-06, "loss": 0.5816, "num_input_tokens_seen": 10100736, "step": 2466 }, { "epoch": 1.838301043219076, "grad_norm": 8.618561745530087, "learning_rate": 1.7548540164140047e-06, "loss": 0.5604, "num_input_tokens_seen": 10104832, "step": 2467 }, { "epoch": 1.8390461997019374, "grad_norm": 20.215785730166754, "learning_rate": 1.7543185732240858e-06, "loss": 1.0002, "num_input_tokens_seen": 10108928, "step": 2468 }, { "epoch": 1.839791356184799, "grad_norm": 7.6526821607488875, "learning_rate": 1.7537830194839345e-06, "loss": 0.6521, "num_input_tokens_seen": 10113024, "step": 2469 }, { "epoch": 1.84053651266766, "grad_norm": 8.783671906942196, "learning_rate": 1.753247355310948e-06, "loss": 0.6385, "num_input_tokens_seen": 10117120, "step": 2470 }, { "epoch": 1.8412816691505216, "grad_norm": 9.73616664196826, "learning_rate": 1.7527115808225478e-06, "loss": 0.8598, "num_input_tokens_seen": 10121216, "step": 2471 }, { "epoch": 1.8420268256333832, "grad_norm": 10.405658414050885, "learning_rate": 1.7521756961361795e-06, "loss": 0.5823, "num_input_tokens_seen": 10125312, "step": 2472 }, { "epoch": 1.8427719821162443, "grad_norm": 9.995848160301067, "learning_rate": 1.751639701369313e-06, "loss": 0.7765, "num_input_tokens_seen": 10129408, "step": 2473 }, { "epoch": 1.8435171385991058, "grad_norm": 7.921530155496748, "learning_rate": 1.7511035966394412e-06, "loss": 0.4741, "num_input_tokens_seen": 10133504, "step": 2474 }, { "epoch": 1.8442622950819674, "grad_norm": 9.545687904414807, "learning_rate": 1.7505673820640834e-06, "loss": 0.7034, "num_input_tokens_seen": 10137600, "step": 2475 }, { "epoch": 1.8450074515648285, "grad_norm": 13.73041594841809, "learning_rate": 1.750031057760781e-06, "loss": 0.6634, "num_input_tokens_seen": 10141696, "step": 2476 }, { "epoch": 1.84575260804769, "grad_norm": 8.981628400459186, "learning_rate": 1.7494946238471e-06, "loss": 0.5948, "num_input_tokens_seen": 10145792, "step": 2477 }, { "epoch": 1.8464977645305514, "grad_norm": 9.300351535536018, "learning_rate": 1.7489580804406309e-06, "loss": 0.8269, "num_input_tokens_seen": 10149888, "step": 2478 }, { "epoch": 1.8472429210134127, "grad_norm": 8.507716034447748, "learning_rate": 1.7484214276589872e-06, "loss": 0.9313, "num_input_tokens_seen": 10153984, "step": 2479 }, { "epoch": 1.8479880774962743, "grad_norm": 8.22710640502189, "learning_rate": 1.747884665619808e-06, "loss": 0.7831, "num_input_tokens_seen": 10158080, "step": 2480 }, { "epoch": 1.8487332339791356, "grad_norm": 9.266292969727962, "learning_rate": 1.7473477944407546e-06, "loss": 0.6754, "num_input_tokens_seen": 10162176, "step": 2481 }, { "epoch": 1.849478390461997, "grad_norm": 44.232568815747456, "learning_rate": 1.746810814239514e-06, "loss": 0.8317, "num_input_tokens_seen": 10166272, "step": 2482 }, { "epoch": 1.8502235469448585, "grad_norm": 7.8803123353262166, "learning_rate": 1.746273725133795e-06, "loss": 0.8968, "num_input_tokens_seen": 10170368, "step": 2483 }, { "epoch": 1.8509687034277198, "grad_norm": 8.977036334661609, "learning_rate": 1.7457365272413315e-06, "loss": 0.7683, "num_input_tokens_seen": 10174464, "step": 2484 }, { "epoch": 1.8517138599105811, "grad_norm": 8.608208918242882, "learning_rate": 1.7451992206798813e-06, "loss": 0.8262, "num_input_tokens_seen": 10178560, "step": 2485 }, { "epoch": 1.8524590163934427, "grad_norm": 7.593193377643233, "learning_rate": 1.744661805567226e-06, "loss": 0.6326, "num_input_tokens_seen": 10182656, "step": 2486 }, { "epoch": 1.853204172876304, "grad_norm": 9.791147061755812, "learning_rate": 1.7441242820211707e-06, "loss": 0.7167, "num_input_tokens_seen": 10186752, "step": 2487 }, { "epoch": 1.8539493293591653, "grad_norm": 8.40649241595147, "learning_rate": 1.7435866501595444e-06, "loss": 0.8554, "num_input_tokens_seen": 10190848, "step": 2488 }, { "epoch": 1.854694485842027, "grad_norm": 8.23022296438201, "learning_rate": 1.7430489101001998e-06, "loss": 0.7951, "num_input_tokens_seen": 10194944, "step": 2489 }, { "epoch": 1.8554396423248882, "grad_norm": 10.814401055850848, "learning_rate": 1.742511061961014e-06, "loss": 0.6484, "num_input_tokens_seen": 10199040, "step": 2490 }, { "epoch": 1.8561847988077496, "grad_norm": 8.576999243959486, "learning_rate": 1.741973105859886e-06, "loss": 0.8186, "num_input_tokens_seen": 10203136, "step": 2491 }, { "epoch": 1.8569299552906111, "grad_norm": 8.718060187571538, "learning_rate": 1.7414350419147404e-06, "loss": 0.6619, "num_input_tokens_seen": 10207232, "step": 2492 }, { "epoch": 1.8576751117734724, "grad_norm": 8.914534977484807, "learning_rate": 1.7408968702435245e-06, "loss": 0.8044, "num_input_tokens_seen": 10211328, "step": 2493 }, { "epoch": 1.8584202682563338, "grad_norm": 8.718821212338451, "learning_rate": 1.740358590964209e-06, "loss": 0.7771, "num_input_tokens_seen": 10215424, "step": 2494 }, { "epoch": 1.8591654247391953, "grad_norm": 35.69859809060875, "learning_rate": 1.739820204194789e-06, "loss": 1.0977, "num_input_tokens_seen": 10219520, "step": 2495 }, { "epoch": 1.8599105812220567, "grad_norm": 6.591496368287761, "learning_rate": 1.7392817100532828e-06, "loss": 0.7698, "num_input_tokens_seen": 10223616, "step": 2496 }, { "epoch": 1.860655737704918, "grad_norm": 8.394870566388121, "learning_rate": 1.738743108657732e-06, "loss": 0.8416, "num_input_tokens_seen": 10227712, "step": 2497 }, { "epoch": 1.8614008941877795, "grad_norm": 8.333418689152948, "learning_rate": 1.7382044001262016e-06, "loss": 1.0281, "num_input_tokens_seen": 10231808, "step": 2498 }, { "epoch": 1.8621460506706409, "grad_norm": 9.232141803494963, "learning_rate": 1.7376655845767807e-06, "loss": 0.4612, "num_input_tokens_seen": 10235904, "step": 2499 }, { "epoch": 1.8628912071535022, "grad_norm": 10.233882489797331, "learning_rate": 1.7371266621275814e-06, "loss": 0.5586, "num_input_tokens_seen": 10240000, "step": 2500 }, { "epoch": 1.8636363636363638, "grad_norm": 8.362701032911671, "learning_rate": 1.7365876328967396e-06, "loss": 0.7373, "num_input_tokens_seen": 10244096, "step": 2501 }, { "epoch": 1.864381520119225, "grad_norm": 7.998233059290453, "learning_rate": 1.7360484970024139e-06, "loss": 0.8375, "num_input_tokens_seen": 10248192, "step": 2502 }, { "epoch": 1.8651266766020864, "grad_norm": 10.365091808138162, "learning_rate": 1.7355092545627872e-06, "loss": 0.5943, "num_input_tokens_seen": 10252288, "step": 2503 }, { "epoch": 1.865871833084948, "grad_norm": 8.154474359125865, "learning_rate": 1.7349699056960647e-06, "loss": 0.849, "num_input_tokens_seen": 10256384, "step": 2504 }, { "epoch": 1.866616989567809, "grad_norm": 13.404888267545955, "learning_rate": 1.734430450520476e-06, "loss": 0.6991, "num_input_tokens_seen": 10260480, "step": 2505 }, { "epoch": 1.8673621460506706, "grad_norm": 9.15471126515433, "learning_rate": 1.7338908891542732e-06, "loss": 0.8384, "num_input_tokens_seen": 10264576, "step": 2506 }, { "epoch": 1.8681073025335322, "grad_norm": 8.451214435846998, "learning_rate": 1.7333512217157324e-06, "loss": 0.7302, "num_input_tokens_seen": 10268672, "step": 2507 }, { "epoch": 1.8688524590163933, "grad_norm": 8.431585484294486, "learning_rate": 1.7328114483231526e-06, "loss": 0.7857, "num_input_tokens_seen": 10272768, "step": 2508 }, { "epoch": 1.8695976154992549, "grad_norm": 8.3502785349582, "learning_rate": 1.7322715690948554e-06, "loss": 0.5283, "num_input_tokens_seen": 10276864, "step": 2509 }, { "epoch": 1.8703427719821164, "grad_norm": 7.80691319026273, "learning_rate": 1.731731584149187e-06, "loss": 0.6622, "num_input_tokens_seen": 10280960, "step": 2510 }, { "epoch": 1.8710879284649775, "grad_norm": 12.205876199830257, "learning_rate": 1.7311914936045151e-06, "loss": 0.9909, "num_input_tokens_seen": 10285056, "step": 2511 }, { "epoch": 1.871833084947839, "grad_norm": 7.429016218966401, "learning_rate": 1.7306512975792319e-06, "loss": 0.8214, "num_input_tokens_seen": 10289152, "step": 2512 }, { "epoch": 1.8725782414307004, "grad_norm": 11.713308333590117, "learning_rate": 1.730110996191752e-06, "loss": 0.9594, "num_input_tokens_seen": 10293248, "step": 2513 }, { "epoch": 1.8733233979135617, "grad_norm": 7.916065707376575, "learning_rate": 1.7295705895605133e-06, "loss": 0.6738, "num_input_tokens_seen": 10297344, "step": 2514 }, { "epoch": 1.8740685543964233, "grad_norm": 11.683995388600653, "learning_rate": 1.729030077803977e-06, "loss": 0.7987, "num_input_tokens_seen": 10301440, "step": 2515 }, { "epoch": 1.8748137108792846, "grad_norm": 9.078003734747346, "learning_rate": 1.7284894610406271e-06, "loss": 0.7214, "num_input_tokens_seen": 10305536, "step": 2516 }, { "epoch": 1.875558867362146, "grad_norm": 7.586755244855073, "learning_rate": 1.727948739388971e-06, "loss": 0.6798, "num_input_tokens_seen": 10309632, "step": 2517 }, { "epoch": 1.8763040238450075, "grad_norm": 8.467105300731044, "learning_rate": 1.727407912967538e-06, "loss": 0.953, "num_input_tokens_seen": 10313728, "step": 2518 }, { "epoch": 1.8770491803278688, "grad_norm": 8.391168701496818, "learning_rate": 1.726866981894882e-06, "loss": 0.6018, "num_input_tokens_seen": 10317824, "step": 2519 }, { "epoch": 1.8777943368107302, "grad_norm": 8.593849962117318, "learning_rate": 1.7263259462895777e-06, "loss": 0.7063, "num_input_tokens_seen": 10321920, "step": 2520 }, { "epoch": 1.8785394932935917, "grad_norm": 8.933233588407868, "learning_rate": 1.725784806270225e-06, "loss": 0.6741, "num_input_tokens_seen": 10326016, "step": 2521 }, { "epoch": 1.879284649776453, "grad_norm": 7.174015825154257, "learning_rate": 1.7252435619554453e-06, "loss": 0.7672, "num_input_tokens_seen": 10330112, "step": 2522 }, { "epoch": 1.8800298062593144, "grad_norm": 6.953000648371039, "learning_rate": 1.7247022134638836e-06, "loss": 0.6816, "num_input_tokens_seen": 10334208, "step": 2523 }, { "epoch": 1.880774962742176, "grad_norm": 8.650352942557104, "learning_rate": 1.7241607609142066e-06, "loss": 0.6718, "num_input_tokens_seen": 10338304, "step": 2524 }, { "epoch": 1.8815201192250373, "grad_norm": 8.404108161737733, "learning_rate": 1.7236192044251052e-06, "loss": 0.8233, "num_input_tokens_seen": 10342400, "step": 2525 }, { "epoch": 1.8822652757078986, "grad_norm": 8.334943383543758, "learning_rate": 1.723077544115292e-06, "loss": 0.7441, "num_input_tokens_seen": 10346496, "step": 2526 }, { "epoch": 1.8830104321907601, "grad_norm": 8.503064896367258, "learning_rate": 1.722535780103503e-06, "loss": 0.8961, "num_input_tokens_seen": 10350592, "step": 2527 }, { "epoch": 1.8837555886736215, "grad_norm": 10.245683685368546, "learning_rate": 1.7219939125084975e-06, "loss": 0.7049, "num_input_tokens_seen": 10354688, "step": 2528 }, { "epoch": 1.8845007451564828, "grad_norm": 9.977373518645877, "learning_rate": 1.7214519414490556e-06, "loss": 0.7811, "num_input_tokens_seen": 10358784, "step": 2529 }, { "epoch": 1.8852459016393444, "grad_norm": 7.777130727833805, "learning_rate": 1.7209098670439816e-06, "loss": 0.595, "num_input_tokens_seen": 10362880, "step": 2530 }, { "epoch": 1.8859910581222057, "grad_norm": 8.382459026986368, "learning_rate": 1.7203676894121024e-06, "loss": 0.6474, "num_input_tokens_seen": 10366976, "step": 2531 }, { "epoch": 1.886736214605067, "grad_norm": 8.550710852027667, "learning_rate": 1.7198254086722666e-06, "loss": 0.7649, "num_input_tokens_seen": 10371072, "step": 2532 }, { "epoch": 1.8874813710879286, "grad_norm": 8.896588818302794, "learning_rate": 1.7192830249433466e-06, "loss": 0.9173, "num_input_tokens_seen": 10375168, "step": 2533 }, { "epoch": 1.88822652757079, "grad_norm": 8.80813610699366, "learning_rate": 1.7187405383442364e-06, "loss": 0.8516, "num_input_tokens_seen": 10379264, "step": 2534 }, { "epoch": 1.8889716840536512, "grad_norm": 7.633870202775202, "learning_rate": 1.718197948993853e-06, "loss": 0.7537, "num_input_tokens_seen": 10383360, "step": 2535 }, { "epoch": 1.8897168405365128, "grad_norm": 9.414113235952422, "learning_rate": 1.7176552570111362e-06, "loss": 0.7049, "num_input_tokens_seen": 10387456, "step": 2536 }, { "epoch": 1.8904619970193741, "grad_norm": 10.876841662099963, "learning_rate": 1.7171124625150476e-06, "loss": 0.8191, "num_input_tokens_seen": 10391552, "step": 2537 }, { "epoch": 1.8912071535022354, "grad_norm": 9.515415941898011, "learning_rate": 1.716569565624572e-06, "loss": 0.6971, "num_input_tokens_seen": 10395648, "step": 2538 }, { "epoch": 1.891952309985097, "grad_norm": 10.766193771887812, "learning_rate": 1.7160265664587155e-06, "loss": 0.8354, "num_input_tokens_seen": 10399744, "step": 2539 }, { "epoch": 1.8926974664679581, "grad_norm": 11.41102081663355, "learning_rate": 1.7154834651365082e-06, "loss": 0.7834, "num_input_tokens_seen": 10403840, "step": 2540 }, { "epoch": 1.8934426229508197, "grad_norm": 8.788985122820344, "learning_rate": 1.7149402617770017e-06, "loss": 0.7051, "num_input_tokens_seen": 10407936, "step": 2541 }, { "epoch": 1.8941877794336812, "grad_norm": 8.291400651387548, "learning_rate": 1.7143969564992694e-06, "loss": 0.8822, "num_input_tokens_seen": 10412032, "step": 2542 }, { "epoch": 1.8949329359165423, "grad_norm": 8.178138139036028, "learning_rate": 1.713853549422408e-06, "loss": 0.9149, "num_input_tokens_seen": 10416128, "step": 2543 }, { "epoch": 1.8956780923994039, "grad_norm": 8.182784400694892, "learning_rate": 1.7133100406655373e-06, "loss": 0.8371, "num_input_tokens_seen": 10420224, "step": 2544 }, { "epoch": 1.8964232488822654, "grad_norm": 7.722985528465512, "learning_rate": 1.7127664303477965e-06, "loss": 0.7452, "num_input_tokens_seen": 10424320, "step": 2545 }, { "epoch": 1.8971684053651265, "grad_norm": 9.499270714453903, "learning_rate": 1.71222271858835e-06, "loss": 0.5569, "num_input_tokens_seen": 10428416, "step": 2546 }, { "epoch": 1.897913561847988, "grad_norm": 7.495144208891004, "learning_rate": 1.711678905506383e-06, "loss": 0.8786, "num_input_tokens_seen": 10432512, "step": 2547 }, { "epoch": 1.8986587183308494, "grad_norm": 9.241172218399834, "learning_rate": 1.7111349912211033e-06, "loss": 0.839, "num_input_tokens_seen": 10436608, "step": 2548 }, { "epoch": 1.8994038748137108, "grad_norm": 9.716968791150691, "learning_rate": 1.710590975851741e-06, "loss": 0.8588, "num_input_tokens_seen": 10440704, "step": 2549 }, { "epoch": 1.9001490312965723, "grad_norm": 9.729508593840505, "learning_rate": 1.7100468595175473e-06, "loss": 0.4206, "num_input_tokens_seen": 10444800, "step": 2550 }, { "epoch": 1.9008941877794336, "grad_norm": 6.748897391840241, "learning_rate": 1.7095026423377973e-06, "loss": 0.6992, "num_input_tokens_seen": 10448896, "step": 2551 }, { "epoch": 1.901639344262295, "grad_norm": 6.848520430261073, "learning_rate": 1.708958324431787e-06, "loss": 0.9966, "num_input_tokens_seen": 10452992, "step": 2552 }, { "epoch": 1.9023845007451565, "grad_norm": 8.734574580577773, "learning_rate": 1.7084139059188342e-06, "loss": 0.6082, "num_input_tokens_seen": 10457088, "step": 2553 }, { "epoch": 1.9031296572280179, "grad_norm": 8.474998940768627, "learning_rate": 1.7078693869182806e-06, "loss": 0.9604, "num_input_tokens_seen": 10461184, "step": 2554 }, { "epoch": 1.9038748137108792, "grad_norm": 8.630844272617626, "learning_rate": 1.7073247675494875e-06, "loss": 0.862, "num_input_tokens_seen": 10465280, "step": 2555 }, { "epoch": 1.9046199701937407, "grad_norm": 8.55113910193652, "learning_rate": 1.7067800479318396e-06, "loss": 0.7241, "num_input_tokens_seen": 10469376, "step": 2556 }, { "epoch": 1.905365126676602, "grad_norm": 10.907974356461212, "learning_rate": 1.7062352281847442e-06, "loss": 0.731, "num_input_tokens_seen": 10473472, "step": 2557 }, { "epoch": 1.9061102831594634, "grad_norm": 7.676861817409554, "learning_rate": 1.7056903084276282e-06, "loss": 0.8691, "num_input_tokens_seen": 10477568, "step": 2558 }, { "epoch": 1.906855439642325, "grad_norm": 7.638446391412875, "learning_rate": 1.705145288779943e-06, "loss": 0.9733, "num_input_tokens_seen": 10481664, "step": 2559 }, { "epoch": 1.9076005961251863, "grad_norm": 7.491975507887233, "learning_rate": 1.7046001693611605e-06, "loss": 0.4061, "num_input_tokens_seen": 10485760, "step": 2560 }, { "epoch": 1.9083457526080476, "grad_norm": 9.437835033943104, "learning_rate": 1.7040549502907743e-06, "loss": 0.6979, "num_input_tokens_seen": 10489856, "step": 2561 }, { "epoch": 1.9090909090909092, "grad_norm": 7.575093941640316, "learning_rate": 1.7035096316883013e-06, "loss": 0.8704, "num_input_tokens_seen": 10493952, "step": 2562 }, { "epoch": 1.9098360655737705, "grad_norm": 9.628087729692345, "learning_rate": 1.7029642136732785e-06, "loss": 0.8867, "num_input_tokens_seen": 10498048, "step": 2563 }, { "epoch": 1.9105812220566318, "grad_norm": 10.965627257176914, "learning_rate": 1.7024186963652659e-06, "loss": 0.6748, "num_input_tokens_seen": 10502144, "step": 2564 }, { "epoch": 1.9113263785394934, "grad_norm": 8.751208490507716, "learning_rate": 1.7018730798838442e-06, "loss": 0.6985, "num_input_tokens_seen": 10506240, "step": 2565 }, { "epoch": 1.9120715350223547, "grad_norm": 7.0368158239224705, "learning_rate": 1.7013273643486166e-06, "loss": 0.6425, "num_input_tokens_seen": 10510336, "step": 2566 }, { "epoch": 1.912816691505216, "grad_norm": 8.802480649316516, "learning_rate": 1.7007815498792083e-06, "loss": 1.0475, "num_input_tokens_seen": 10514432, "step": 2567 }, { "epoch": 1.9135618479880776, "grad_norm": 7.139534127737788, "learning_rate": 1.7002356365952652e-06, "loss": 0.8989, "num_input_tokens_seen": 10518528, "step": 2568 }, { "epoch": 1.914307004470939, "grad_norm": 9.043476818385278, "learning_rate": 1.6996896246164554e-06, "loss": 0.7157, "num_input_tokens_seen": 10522624, "step": 2569 }, { "epoch": 1.9150521609538003, "grad_norm": 7.9337972938227574, "learning_rate": 1.699143514062469e-06, "loss": 0.887, "num_input_tokens_seen": 10526720, "step": 2570 }, { "epoch": 1.9157973174366618, "grad_norm": 7.813568998843606, "learning_rate": 1.6985973050530175e-06, "loss": 0.8139, "num_input_tokens_seen": 10530816, "step": 2571 }, { "epoch": 1.9165424739195231, "grad_norm": 9.561178365959668, "learning_rate": 1.698050997707833e-06, "loss": 0.6019, "num_input_tokens_seen": 10534912, "step": 2572 }, { "epoch": 1.9172876304023845, "grad_norm": 7.039735530616557, "learning_rate": 1.6975045921466703e-06, "loss": 0.697, "num_input_tokens_seen": 10539008, "step": 2573 }, { "epoch": 1.918032786885246, "grad_norm": 7.662622799758364, "learning_rate": 1.6969580884893062e-06, "loss": 0.8472, "num_input_tokens_seen": 10543104, "step": 2574 }, { "epoch": 1.9187779433681071, "grad_norm": 10.06558166975616, "learning_rate": 1.696411486855537e-06, "loss": 0.7733, "num_input_tokens_seen": 10547200, "step": 2575 }, { "epoch": 1.9195230998509687, "grad_norm": 8.972098290761068, "learning_rate": 1.6958647873651824e-06, "loss": 0.9701, "num_input_tokens_seen": 10551296, "step": 2576 }, { "epoch": 1.9202682563338302, "grad_norm": 11.189231371297364, "learning_rate": 1.6953179901380828e-06, "loss": 0.5165, "num_input_tokens_seen": 10555392, "step": 2577 }, { "epoch": 1.9210134128166914, "grad_norm": 8.756645864843849, "learning_rate": 1.6947710952940997e-06, "loss": 0.7931, "num_input_tokens_seen": 10559488, "step": 2578 }, { "epoch": 1.921758569299553, "grad_norm": 11.138993555297066, "learning_rate": 1.6942241029531164e-06, "loss": 0.7213, "num_input_tokens_seen": 10563584, "step": 2579 }, { "epoch": 1.9225037257824145, "grad_norm": 12.327186147055937, "learning_rate": 1.6936770132350378e-06, "loss": 0.5577, "num_input_tokens_seen": 10567680, "step": 2580 }, { "epoch": 1.9232488822652756, "grad_norm": 8.534912834039398, "learning_rate": 1.6931298262597898e-06, "loss": 1.039, "num_input_tokens_seen": 10571776, "step": 2581 }, { "epoch": 1.9239940387481371, "grad_norm": 10.722813013732173, "learning_rate": 1.6925825421473197e-06, "loss": 0.7151, "num_input_tokens_seen": 10575872, "step": 2582 }, { "epoch": 1.9247391952309985, "grad_norm": 9.07562064222056, "learning_rate": 1.692035161017596e-06, "loss": 0.6451, "num_input_tokens_seen": 10579968, "step": 2583 }, { "epoch": 1.9254843517138598, "grad_norm": 9.26124020428363, "learning_rate": 1.6914876829906089e-06, "loss": 0.9173, "num_input_tokens_seen": 10584064, "step": 2584 }, { "epoch": 1.9262295081967213, "grad_norm": 7.218536048469788, "learning_rate": 1.6909401081863688e-06, "loss": 1.0676, "num_input_tokens_seen": 10588160, "step": 2585 }, { "epoch": 1.9269746646795827, "grad_norm": 7.664428660573414, "learning_rate": 1.6903924367249084e-06, "loss": 0.6482, "num_input_tokens_seen": 10592256, "step": 2586 }, { "epoch": 1.927719821162444, "grad_norm": 8.138173810288265, "learning_rate": 1.6898446687262811e-06, "loss": 0.6732, "num_input_tokens_seen": 10596352, "step": 2587 }, { "epoch": 1.9284649776453056, "grad_norm": 7.092494535278011, "learning_rate": 1.6892968043105618e-06, "loss": 0.8144, "num_input_tokens_seen": 10600448, "step": 2588 }, { "epoch": 1.9292101341281669, "grad_norm": 8.308801909799389, "learning_rate": 1.688748843597846e-06, "loss": 0.7649, "num_input_tokens_seen": 10604544, "step": 2589 }, { "epoch": 1.9299552906110282, "grad_norm": 8.262892967623536, "learning_rate": 1.6882007867082511e-06, "loss": 0.5524, "num_input_tokens_seen": 10608640, "step": 2590 }, { "epoch": 1.9307004470938898, "grad_norm": 11.143370215788453, "learning_rate": 1.6876526337619146e-06, "loss": 0.8416, "num_input_tokens_seen": 10612736, "step": 2591 }, { "epoch": 1.931445603576751, "grad_norm": 16.196034828462942, "learning_rate": 1.6871043848789957e-06, "loss": 0.5957, "num_input_tokens_seen": 10616832, "step": 2592 }, { "epoch": 1.9321907600596124, "grad_norm": 12.39686315985044, "learning_rate": 1.6865560401796744e-06, "loss": 0.8509, "num_input_tokens_seen": 10620928, "step": 2593 }, { "epoch": 1.932935916542474, "grad_norm": 7.752111681388714, "learning_rate": 1.6860075997841515e-06, "loss": 0.5539, "num_input_tokens_seen": 10625024, "step": 2594 }, { "epoch": 1.9336810730253353, "grad_norm": 8.142616352139987, "learning_rate": 1.6854590638126494e-06, "loss": 0.8815, "num_input_tokens_seen": 10629120, "step": 2595 }, { "epoch": 1.9344262295081966, "grad_norm": 8.64997587005586, "learning_rate": 1.6849104323854113e-06, "loss": 0.9296, "num_input_tokens_seen": 10633216, "step": 2596 }, { "epoch": 1.9351713859910582, "grad_norm": 8.759866622551774, "learning_rate": 1.6843617056227007e-06, "loss": 0.9187, "num_input_tokens_seen": 10637312, "step": 2597 }, { "epoch": 1.9359165424739195, "grad_norm": 7.433560455072433, "learning_rate": 1.6838128836448031e-06, "loss": 0.7589, "num_input_tokens_seen": 10641408, "step": 2598 }, { "epoch": 1.9366616989567809, "grad_norm": 18.401280730085816, "learning_rate": 1.683263966572023e-06, "loss": 0.8522, "num_input_tokens_seen": 10645504, "step": 2599 }, { "epoch": 1.9374068554396424, "grad_norm": 7.160498891408159, "learning_rate": 1.6827149545246878e-06, "loss": 0.9223, "num_input_tokens_seen": 10649600, "step": 2600 }, { "epoch": 1.9381520119225037, "grad_norm": 7.658585549686166, "learning_rate": 1.6821658476231453e-06, "loss": 0.8379, "num_input_tokens_seen": 10653696, "step": 2601 }, { "epoch": 1.938897168405365, "grad_norm": 11.652187586075193, "learning_rate": 1.6816166459877624e-06, "loss": 0.6177, "num_input_tokens_seen": 10657792, "step": 2602 }, { "epoch": 1.9396423248882266, "grad_norm": 8.809242463714488, "learning_rate": 1.681067349738929e-06, "loss": 0.7283, "num_input_tokens_seen": 10661888, "step": 2603 }, { "epoch": 1.940387481371088, "grad_norm": 11.456405623953563, "learning_rate": 1.6805179589970543e-06, "loss": 0.6375, "num_input_tokens_seen": 10665984, "step": 2604 }, { "epoch": 1.9411326378539493, "grad_norm": 10.32156809305669, "learning_rate": 1.6799684738825688e-06, "loss": 0.6783, "num_input_tokens_seen": 10670080, "step": 2605 }, { "epoch": 1.9418777943368108, "grad_norm": 11.121255756743333, "learning_rate": 1.6794188945159236e-06, "loss": 0.7247, "num_input_tokens_seen": 10674176, "step": 2606 }, { "epoch": 1.9426229508196722, "grad_norm": 9.799045162806399, "learning_rate": 1.6788692210175905e-06, "loss": 0.7035, "num_input_tokens_seen": 10678272, "step": 2607 }, { "epoch": 1.9433681073025335, "grad_norm": 8.317968246752711, "learning_rate": 1.6783194535080616e-06, "loss": 0.8991, "num_input_tokens_seen": 10682368, "step": 2608 }, { "epoch": 1.944113263785395, "grad_norm": 8.795012357016825, "learning_rate": 1.6777695921078502e-06, "loss": 0.656, "num_input_tokens_seen": 10686464, "step": 2609 }, { "epoch": 1.9448584202682562, "grad_norm": 7.558681465011546, "learning_rate": 1.6772196369374896e-06, "loss": 0.7294, "num_input_tokens_seen": 10690560, "step": 2610 }, { "epoch": 1.9456035767511177, "grad_norm": 8.649879154408245, "learning_rate": 1.6766695881175343e-06, "loss": 0.8296, "num_input_tokens_seen": 10694656, "step": 2611 }, { "epoch": 1.9463487332339793, "grad_norm": 11.24093971700903, "learning_rate": 1.6761194457685582e-06, "loss": 0.5654, "num_input_tokens_seen": 10698752, "step": 2612 }, { "epoch": 1.9470938897168404, "grad_norm": 7.8347507437019575, "learning_rate": 1.6755692100111567e-06, "loss": 0.8306, "num_input_tokens_seen": 10702848, "step": 2613 }, { "epoch": 1.947839046199702, "grad_norm": 8.157412871775575, "learning_rate": 1.6750188809659457e-06, "loss": 0.7825, "num_input_tokens_seen": 10706944, "step": 2614 }, { "epoch": 1.9485842026825635, "grad_norm": 9.006856709611098, "learning_rate": 1.674468458753561e-06, "loss": 1.044, "num_input_tokens_seen": 10711040, "step": 2615 }, { "epoch": 1.9493293591654246, "grad_norm": 7.466317049442969, "learning_rate": 1.673917943494659e-06, "loss": 0.8419, "num_input_tokens_seen": 10715136, "step": 2616 }, { "epoch": 1.9500745156482862, "grad_norm": 9.35013245766895, "learning_rate": 1.673367335309917e-06, "loss": 0.746, "num_input_tokens_seen": 10719232, "step": 2617 }, { "epoch": 1.9508196721311475, "grad_norm": 10.314448459000257, "learning_rate": 1.672816634320032e-06, "loss": 0.7629, "num_input_tokens_seen": 10723328, "step": 2618 }, { "epoch": 1.9515648286140088, "grad_norm": 14.534267288950673, "learning_rate": 1.6722658406457214e-06, "loss": 0.6461, "num_input_tokens_seen": 10727424, "step": 2619 }, { "epoch": 1.9523099850968704, "grad_norm": 8.856049862862083, "learning_rate": 1.6717149544077238e-06, "loss": 0.4727, "num_input_tokens_seen": 10731520, "step": 2620 }, { "epoch": 1.9530551415797317, "grad_norm": 11.40571910315297, "learning_rate": 1.6711639757267965e-06, "loss": 0.8153, "num_input_tokens_seen": 10735616, "step": 2621 }, { "epoch": 1.953800298062593, "grad_norm": 7.051072301261578, "learning_rate": 1.6706129047237182e-06, "loss": 0.8163, "num_input_tokens_seen": 10739712, "step": 2622 }, { "epoch": 1.9545454545454546, "grad_norm": 8.700533546290334, "learning_rate": 1.6700617415192877e-06, "loss": 0.4761, "num_input_tokens_seen": 10743808, "step": 2623 }, { "epoch": 1.955290611028316, "grad_norm": 7.957539618954179, "learning_rate": 1.6695104862343242e-06, "loss": 0.6517, "num_input_tokens_seen": 10747904, "step": 2624 }, { "epoch": 1.9560357675111772, "grad_norm": 9.66254744850675, "learning_rate": 1.6689591389896663e-06, "loss": 0.4896, "num_input_tokens_seen": 10752000, "step": 2625 }, { "epoch": 1.9567809239940388, "grad_norm": 8.53567378884486, "learning_rate": 1.6684076999061737e-06, "loss": 0.8779, "num_input_tokens_seen": 10756096, "step": 2626 }, { "epoch": 1.9575260804769001, "grad_norm": 9.136416565750586, "learning_rate": 1.6678561691047258e-06, "loss": 0.8013, "num_input_tokens_seen": 10760192, "step": 2627 }, { "epoch": 1.9582712369597615, "grad_norm": 8.352100925837178, "learning_rate": 1.6673045467062216e-06, "loss": 0.4834, "num_input_tokens_seen": 10764288, "step": 2628 }, { "epoch": 1.959016393442623, "grad_norm": 9.175611402906375, "learning_rate": 1.666752832831581e-06, "loss": 0.8306, "num_input_tokens_seen": 10768384, "step": 2629 }, { "epoch": 1.9597615499254843, "grad_norm": 11.423530915701015, "learning_rate": 1.6662010276017435e-06, "loss": 0.6936, "num_input_tokens_seen": 10772480, "step": 2630 }, { "epoch": 1.9605067064083457, "grad_norm": 7.619977531215626, "learning_rate": 1.6656491311376691e-06, "loss": 0.6988, "num_input_tokens_seen": 10776576, "step": 2631 }, { "epoch": 1.9612518628912072, "grad_norm": 7.929547530720288, "learning_rate": 1.665097143560337e-06, "loss": 0.8685, "num_input_tokens_seen": 10780672, "step": 2632 }, { "epoch": 1.9619970193740686, "grad_norm": 8.896507599182812, "learning_rate": 1.6645450649907466e-06, "loss": 0.6517, "num_input_tokens_seen": 10784768, "step": 2633 }, { "epoch": 1.96274217585693, "grad_norm": 8.853501508812052, "learning_rate": 1.663992895549918e-06, "loss": 0.684, "num_input_tokens_seen": 10788864, "step": 2634 }, { "epoch": 1.9634873323397914, "grad_norm": 7.245313459007876, "learning_rate": 1.6634406353588906e-06, "loss": 1.0455, "num_input_tokens_seen": 10792960, "step": 2635 }, { "epoch": 1.9642324888226528, "grad_norm": 7.842261412958471, "learning_rate": 1.6628882845387236e-06, "loss": 0.9604, "num_input_tokens_seen": 10797056, "step": 2636 }, { "epoch": 1.964977645305514, "grad_norm": 9.409325202531221, "learning_rate": 1.6623358432104966e-06, "loss": 0.6552, "num_input_tokens_seen": 10801152, "step": 2637 }, { "epoch": 1.9657228017883757, "grad_norm": 8.399527505232586, "learning_rate": 1.6617833114953083e-06, "loss": 0.6323, "num_input_tokens_seen": 10805248, "step": 2638 }, { "epoch": 1.966467958271237, "grad_norm": 9.461552058707568, "learning_rate": 1.6612306895142779e-06, "loss": 0.7723, "num_input_tokens_seen": 10809344, "step": 2639 }, { "epoch": 1.9672131147540983, "grad_norm": 9.396830995722338, "learning_rate": 1.6606779773885436e-06, "loss": 0.7332, "num_input_tokens_seen": 10813440, "step": 2640 }, { "epoch": 1.9679582712369599, "grad_norm": 7.898464576013635, "learning_rate": 1.6601251752392645e-06, "loss": 0.6956, "num_input_tokens_seen": 10817536, "step": 2641 }, { "epoch": 1.9687034277198212, "grad_norm": 8.184279266592451, "learning_rate": 1.659572283187618e-06, "loss": 1.0454, "num_input_tokens_seen": 10821632, "step": 2642 }, { "epoch": 1.9694485842026825, "grad_norm": 8.294165794600394, "learning_rate": 1.6590193013548026e-06, "loss": 0.6749, "num_input_tokens_seen": 10825728, "step": 2643 }, { "epoch": 1.970193740685544, "grad_norm": 8.376233935388617, "learning_rate": 1.6584662298620357e-06, "loss": 0.7033, "num_input_tokens_seen": 10829824, "step": 2644 }, { "epoch": 1.9709388971684052, "grad_norm": 12.827595348447696, "learning_rate": 1.6579130688305546e-06, "loss": 0.6798, "num_input_tokens_seen": 10833920, "step": 2645 }, { "epoch": 1.9716840536512668, "grad_norm": 24.237005253515832, "learning_rate": 1.6573598183816158e-06, "loss": 0.6724, "num_input_tokens_seen": 10838016, "step": 2646 }, { "epoch": 1.9724292101341283, "grad_norm": 10.903919985886315, "learning_rate": 1.6568064786364965e-06, "loss": 0.7161, "num_input_tokens_seen": 10842112, "step": 2647 }, { "epoch": 1.9731743666169894, "grad_norm": 7.895004578461574, "learning_rate": 1.656253049716492e-06, "loss": 1.0085, "num_input_tokens_seen": 10846208, "step": 2648 }, { "epoch": 1.973919523099851, "grad_norm": 8.175667718062021, "learning_rate": 1.6556995317429178e-06, "loss": 0.4326, "num_input_tokens_seen": 10850304, "step": 2649 }, { "epoch": 1.9746646795827125, "grad_norm": 8.71907299579669, "learning_rate": 1.6551459248371093e-06, "loss": 0.6909, "num_input_tokens_seen": 10854400, "step": 2650 }, { "epoch": 1.9754098360655736, "grad_norm": 7.504548339563761, "learning_rate": 1.6545922291204215e-06, "loss": 0.8459, "num_input_tokens_seen": 10858496, "step": 2651 }, { "epoch": 1.9761549925484352, "grad_norm": 9.862490512884085, "learning_rate": 1.6540384447142278e-06, "loss": 0.7103, "num_input_tokens_seen": 10862592, "step": 2652 }, { "epoch": 1.9769001490312967, "grad_norm": 12.614982378331197, "learning_rate": 1.6534845717399218e-06, "loss": 0.9879, "num_input_tokens_seen": 10866688, "step": 2653 }, { "epoch": 1.9776453055141578, "grad_norm": 9.823127933581278, "learning_rate": 1.6529306103189165e-06, "loss": 0.6545, "num_input_tokens_seen": 10870784, "step": 2654 }, { "epoch": 1.9783904619970194, "grad_norm": 7.389033328652459, "learning_rate": 1.6523765605726444e-06, "loss": 0.9201, "num_input_tokens_seen": 10874880, "step": 2655 }, { "epoch": 1.9791356184798807, "grad_norm": 8.743673927549024, "learning_rate": 1.6518224226225572e-06, "loss": 0.7125, "num_input_tokens_seen": 10878976, "step": 2656 }, { "epoch": 1.979880774962742, "grad_norm": 10.626106336561271, "learning_rate": 1.6512681965901255e-06, "loss": 0.5274, "num_input_tokens_seen": 10883072, "step": 2657 }, { "epoch": 1.9806259314456036, "grad_norm": 8.020688484560312, "learning_rate": 1.65071388259684e-06, "loss": 0.6972, "num_input_tokens_seen": 10887168, "step": 2658 }, { "epoch": 1.981371087928465, "grad_norm": 9.47786814388324, "learning_rate": 1.6501594807642097e-06, "loss": 0.6862, "num_input_tokens_seen": 10891264, "step": 2659 }, { "epoch": 1.9821162444113263, "grad_norm": 9.65427317803532, "learning_rate": 1.649604991213764e-06, "loss": 0.7729, "num_input_tokens_seen": 10895360, "step": 2660 }, { "epoch": 1.9828614008941878, "grad_norm": 11.987181499625244, "learning_rate": 1.6490504140670508e-06, "loss": 0.673, "num_input_tokens_seen": 10899456, "step": 2661 }, { "epoch": 1.9836065573770492, "grad_norm": 13.603779011788939, "learning_rate": 1.6484957494456373e-06, "loss": 0.9319, "num_input_tokens_seen": 10903552, "step": 2662 }, { "epoch": 1.9843517138599105, "grad_norm": 8.108004377081684, "learning_rate": 1.6479409974711097e-06, "loss": 0.5617, "num_input_tokens_seen": 10907648, "step": 2663 }, { "epoch": 1.985096870342772, "grad_norm": 8.934966552206742, "learning_rate": 1.6473861582650746e-06, "loss": 0.7447, "num_input_tokens_seen": 10911744, "step": 2664 }, { "epoch": 1.9858420268256334, "grad_norm": 7.413622723679762, "learning_rate": 1.6468312319491556e-06, "loss": 0.9365, "num_input_tokens_seen": 10915840, "step": 2665 }, { "epoch": 1.9865871833084947, "grad_norm": 7.38020810266186, "learning_rate": 1.646276218644997e-06, "loss": 0.6593, "num_input_tokens_seen": 10919936, "step": 2666 }, { "epoch": 1.9873323397913563, "grad_norm": 11.12362133207148, "learning_rate": 1.6457211184742617e-06, "loss": 0.6863, "num_input_tokens_seen": 10924032, "step": 2667 }, { "epoch": 1.9880774962742176, "grad_norm": 8.730685775909873, "learning_rate": 1.6451659315586313e-06, "loss": 0.777, "num_input_tokens_seen": 10928128, "step": 2668 }, { "epoch": 1.988822652757079, "grad_norm": 8.525261118676392, "learning_rate": 1.644610658019807e-06, "loss": 0.6995, "num_input_tokens_seen": 10932224, "step": 2669 }, { "epoch": 1.9895678092399405, "grad_norm": 13.792240978751614, "learning_rate": 1.644055297979509e-06, "loss": 0.8632, "num_input_tokens_seen": 10936320, "step": 2670 }, { "epoch": 1.9903129657228018, "grad_norm": 9.536657856069993, "learning_rate": 1.6434998515594752e-06, "loss": 0.5669, "num_input_tokens_seen": 10940416, "step": 2671 }, { "epoch": 1.9910581222056631, "grad_norm": 11.139236191676751, "learning_rate": 1.6429443188814648e-06, "loss": 0.5922, "num_input_tokens_seen": 10944512, "step": 2672 }, { "epoch": 1.9918032786885247, "grad_norm": 8.02760379005128, "learning_rate": 1.6423887000672536e-06, "loss": 0.6794, "num_input_tokens_seen": 10948608, "step": 2673 }, { "epoch": 1.992548435171386, "grad_norm": 8.237572811504128, "learning_rate": 1.6418329952386374e-06, "loss": 0.681, "num_input_tokens_seen": 10952704, "step": 2674 }, { "epoch": 1.9932935916542474, "grad_norm": 9.403155955046914, "learning_rate": 1.6412772045174308e-06, "loss": 0.6777, "num_input_tokens_seen": 10956800, "step": 2675 }, { "epoch": 1.994038748137109, "grad_norm": 8.484506057255116, "learning_rate": 1.640721328025467e-06, "loss": 0.6353, "num_input_tokens_seen": 10960896, "step": 2676 }, { "epoch": 1.9947839046199702, "grad_norm": 7.404676937826306, "learning_rate": 1.6401653658845978e-06, "loss": 0.9433, "num_input_tokens_seen": 10964992, "step": 2677 }, { "epoch": 1.9955290611028316, "grad_norm": 9.826755223602303, "learning_rate": 1.639609318216695e-06, "loss": 0.7561, "num_input_tokens_seen": 10969088, "step": 2678 }, { "epoch": 1.9962742175856931, "grad_norm": 13.157245135418435, "learning_rate": 1.639053185143647e-06, "loss": 0.6235, "num_input_tokens_seen": 10973184, "step": 2679 }, { "epoch": 1.9970193740685542, "grad_norm": 10.615659937517345, "learning_rate": 1.638496966787363e-06, "loss": 0.7934, "num_input_tokens_seen": 10977280, "step": 2680 }, { "epoch": 1.9977645305514158, "grad_norm": 8.896182416452039, "learning_rate": 1.6379406632697694e-06, "loss": 0.7215, "num_input_tokens_seen": 10981376, "step": 2681 }, { "epoch": 1.9985096870342773, "grad_norm": 8.212407580320686, "learning_rate": 1.6373842747128126e-06, "loss": 0.6235, "num_input_tokens_seen": 10985472, "step": 2682 }, { "epoch": 1.9992548435171384, "grad_norm": 9.083002262672926, "learning_rate": 1.636827801238457e-06, "loss": 0.9071, "num_input_tokens_seen": 10989568, "step": 2683 }, { "epoch": 2.0, "grad_norm": 8.382553541302398, "learning_rate": 1.6362712429686844e-06, "loss": 0.6732, "num_input_tokens_seen": 10993664, "step": 2684 }, { "epoch": 2.0007451564828616, "grad_norm": 5.427489029806101, "learning_rate": 1.635714600025498e-06, "loss": 0.3061, "num_input_tokens_seen": 10997760, "step": 2685 }, { "epoch": 2.0014903129657227, "grad_norm": 7.441957075314551, "learning_rate": 1.6351578725309161e-06, "loss": 0.3507, "num_input_tokens_seen": 11001856, "step": 2686 }, { "epoch": 2.002235469448584, "grad_norm": 7.228874506382443, "learning_rate": 1.6346010606069787e-06, "loss": 0.5314, "num_input_tokens_seen": 11005952, "step": 2687 }, { "epoch": 2.0029806259314458, "grad_norm": 7.202871827715018, "learning_rate": 1.6340441643757424e-06, "loss": 0.3147, "num_input_tokens_seen": 11010048, "step": 2688 }, { "epoch": 2.003725782414307, "grad_norm": 7.744411253552605, "learning_rate": 1.6334871839592827e-06, "loss": 0.4407, "num_input_tokens_seen": 11014144, "step": 2689 }, { "epoch": 2.0044709388971684, "grad_norm": 7.800021463764896, "learning_rate": 1.6329301194796942e-06, "loss": 0.4112, "num_input_tokens_seen": 11018240, "step": 2690 }, { "epoch": 2.00521609538003, "grad_norm": 7.774129910359598, "learning_rate": 1.6323729710590892e-06, "loss": 0.5016, "num_input_tokens_seen": 11022336, "step": 2691 }, { "epoch": 2.005961251862891, "grad_norm": 7.466670897776169, "learning_rate": 1.6318157388195987e-06, "loss": 0.4642, "num_input_tokens_seen": 11026432, "step": 2692 }, { "epoch": 2.0067064083457526, "grad_norm": 7.438414616197496, "learning_rate": 1.631258422883372e-06, "loss": 0.5168, "num_input_tokens_seen": 11030528, "step": 2693 }, { "epoch": 2.007451564828614, "grad_norm": 8.963544679898991, "learning_rate": 1.6307010233725762e-06, "loss": 0.4503, "num_input_tokens_seen": 11034624, "step": 2694 }, { "epoch": 2.0081967213114753, "grad_norm": 7.528119483776621, "learning_rate": 1.6301435404093977e-06, "loss": 0.4077, "num_input_tokens_seen": 11038720, "step": 2695 }, { "epoch": 2.008941877794337, "grad_norm": 8.179042090380719, "learning_rate": 1.6295859741160405e-06, "loss": 0.416, "num_input_tokens_seen": 11042816, "step": 2696 }, { "epoch": 2.0096870342771984, "grad_norm": 7.990966685915561, "learning_rate": 1.6290283246147274e-06, "loss": 0.2201, "num_input_tokens_seen": 11046912, "step": 2697 }, { "epoch": 2.0104321907600595, "grad_norm": 12.395594237462916, "learning_rate": 1.6284705920276988e-06, "loss": 0.5057, "num_input_tokens_seen": 11051008, "step": 2698 }, { "epoch": 2.011177347242921, "grad_norm": 10.308367793861892, "learning_rate": 1.6279127764772146e-06, "loss": 0.4316, "num_input_tokens_seen": 11055104, "step": 2699 }, { "epoch": 2.0119225037257826, "grad_norm": 10.953742152119714, "learning_rate": 1.6273548780855508e-06, "loss": 0.4176, "num_input_tokens_seen": 11059200, "step": 2700 }, { "epoch": 2.0126676602086437, "grad_norm": 9.0676488137896, "learning_rate": 1.6267968969750031e-06, "loss": 0.2473, "num_input_tokens_seen": 11063296, "step": 2701 }, { "epoch": 2.0134128166915053, "grad_norm": 9.19862792954722, "learning_rate": 1.6262388332678855e-06, "loss": 0.3498, "num_input_tokens_seen": 11067392, "step": 2702 }, { "epoch": 2.0141579731743664, "grad_norm": 11.059579813503365, "learning_rate": 1.6256806870865285e-06, "loss": 0.486, "num_input_tokens_seen": 11071488, "step": 2703 }, { "epoch": 2.014903129657228, "grad_norm": 15.101379169586684, "learning_rate": 1.6251224585532824e-06, "loss": 0.6937, "num_input_tokens_seen": 11075584, "step": 2704 }, { "epoch": 2.0156482861400895, "grad_norm": 12.379291383890948, "learning_rate": 1.6245641477905152e-06, "loss": 0.4116, "num_input_tokens_seen": 11079680, "step": 2705 }, { "epoch": 2.0163934426229506, "grad_norm": 9.541266595934237, "learning_rate": 1.6240057549206117e-06, "loss": 0.3832, "num_input_tokens_seen": 11083776, "step": 2706 }, { "epoch": 2.017138599105812, "grad_norm": 10.713978081792126, "learning_rate": 1.6234472800659762e-06, "loss": 0.2631, "num_input_tokens_seen": 11087872, "step": 2707 }, { "epoch": 2.0178837555886737, "grad_norm": 13.4527952079295, "learning_rate": 1.6228887233490303e-06, "loss": 0.2366, "num_input_tokens_seen": 11091968, "step": 2708 }, { "epoch": 2.018628912071535, "grad_norm": 13.47639566870534, "learning_rate": 1.6223300848922135e-06, "loss": 0.426, "num_input_tokens_seen": 11096064, "step": 2709 }, { "epoch": 2.0193740685543964, "grad_norm": 9.427005369270725, "learning_rate": 1.6217713648179833e-06, "loss": 0.3948, "num_input_tokens_seen": 11100160, "step": 2710 }, { "epoch": 2.020119225037258, "grad_norm": 9.724621748116334, "learning_rate": 1.6212125632488157e-06, "loss": 0.329, "num_input_tokens_seen": 11104256, "step": 2711 }, { "epoch": 2.020864381520119, "grad_norm": 8.00594798840265, "learning_rate": 1.6206536803072035e-06, "loss": 0.389, "num_input_tokens_seen": 11108352, "step": 2712 }, { "epoch": 2.0216095380029806, "grad_norm": 7.642470811947633, "learning_rate": 1.6200947161156575e-06, "loss": 0.2559, "num_input_tokens_seen": 11112448, "step": 2713 }, { "epoch": 2.022354694485842, "grad_norm": 8.340951808629205, "learning_rate": 1.6195356707967068e-06, "loss": 0.6652, "num_input_tokens_seen": 11116544, "step": 2714 }, { "epoch": 2.0230998509687033, "grad_norm": 6.776875510177948, "learning_rate": 1.6189765444728988e-06, "loss": 0.2475, "num_input_tokens_seen": 11120640, "step": 2715 }, { "epoch": 2.023845007451565, "grad_norm": 7.84084467481329, "learning_rate": 1.6184173372667975e-06, "loss": 0.4209, "num_input_tokens_seen": 11124736, "step": 2716 }, { "epoch": 2.0245901639344264, "grad_norm": 7.899190139226269, "learning_rate": 1.6178580493009849e-06, "loss": 0.2503, "num_input_tokens_seen": 11128832, "step": 2717 }, { "epoch": 2.0253353204172875, "grad_norm": 7.946240536602946, "learning_rate": 1.6172986806980613e-06, "loss": 0.4661, "num_input_tokens_seen": 11132928, "step": 2718 }, { "epoch": 2.026080476900149, "grad_norm": 7.608778857728604, "learning_rate": 1.6167392315806443e-06, "loss": 0.4247, "num_input_tokens_seen": 11137024, "step": 2719 }, { "epoch": 2.0268256333830106, "grad_norm": 8.133890023150322, "learning_rate": 1.6161797020713688e-06, "loss": 0.3609, "num_input_tokens_seen": 11141120, "step": 2720 }, { "epoch": 2.0275707898658717, "grad_norm": 12.126529598168181, "learning_rate": 1.615620092292888e-06, "loss": 0.3843, "num_input_tokens_seen": 11145216, "step": 2721 }, { "epoch": 2.0283159463487332, "grad_norm": 11.113910764337259, "learning_rate": 1.615060402367872e-06, "loss": 0.2619, "num_input_tokens_seen": 11149312, "step": 2722 }, { "epoch": 2.029061102831595, "grad_norm": 8.09138253559834, "learning_rate": 1.6145006324190093e-06, "loss": 0.284, "num_input_tokens_seen": 11153408, "step": 2723 }, { "epoch": 2.029806259314456, "grad_norm": 10.866509938087145, "learning_rate": 1.6139407825690048e-06, "loss": 0.4266, "num_input_tokens_seen": 11157504, "step": 2724 }, { "epoch": 2.0305514157973175, "grad_norm": 9.151221063459992, "learning_rate": 1.6133808529405823e-06, "loss": 0.4134, "num_input_tokens_seen": 11161600, "step": 2725 }, { "epoch": 2.031296572280179, "grad_norm": 12.146112097352038, "learning_rate": 1.612820843656482e-06, "loss": 0.4224, "num_input_tokens_seen": 11165696, "step": 2726 }, { "epoch": 2.03204172876304, "grad_norm": 12.146002239442499, "learning_rate": 1.612260754839462e-06, "loss": 0.3673, "num_input_tokens_seen": 11169792, "step": 2727 }, { "epoch": 2.0327868852459017, "grad_norm": 7.813876169779595, "learning_rate": 1.6117005866122976e-06, "loss": 0.334, "num_input_tokens_seen": 11173888, "step": 2728 }, { "epoch": 2.0335320417287632, "grad_norm": 9.187083838496017, "learning_rate": 1.6111403390977824e-06, "loss": 0.3824, "num_input_tokens_seen": 11177984, "step": 2729 }, { "epoch": 2.0342771982116243, "grad_norm": 9.942743633173247, "learning_rate": 1.6105800124187254e-06, "loss": 0.1766, "num_input_tokens_seen": 11182080, "step": 2730 }, { "epoch": 2.035022354694486, "grad_norm": 19.27100926642217, "learning_rate": 1.6100196066979554e-06, "loss": 0.4043, "num_input_tokens_seen": 11186176, "step": 2731 }, { "epoch": 2.0357675111773474, "grad_norm": 8.73480101971519, "learning_rate": 1.6094591220583167e-06, "loss": 0.5028, "num_input_tokens_seen": 11190272, "step": 2732 }, { "epoch": 2.0365126676602086, "grad_norm": 9.76281226646735, "learning_rate": 1.6088985586226715e-06, "loss": 0.4971, "num_input_tokens_seen": 11194368, "step": 2733 }, { "epoch": 2.03725782414307, "grad_norm": 11.057032839805935, "learning_rate": 1.6083379165138996e-06, "loss": 0.3338, "num_input_tokens_seen": 11198464, "step": 2734 }, { "epoch": 2.0380029806259317, "grad_norm": 9.819121540373667, "learning_rate": 1.6077771958548977e-06, "loss": 0.3952, "num_input_tokens_seen": 11202560, "step": 2735 }, { "epoch": 2.0387481371087928, "grad_norm": 10.891442787024411, "learning_rate": 1.6072163967685795e-06, "loss": 0.3486, "num_input_tokens_seen": 11206656, "step": 2736 }, { "epoch": 2.0394932935916543, "grad_norm": 8.1272645684142, "learning_rate": 1.6066555193778766e-06, "loss": 0.304, "num_input_tokens_seen": 11210752, "step": 2737 }, { "epoch": 2.0402384500745154, "grad_norm": 8.344309699106319, "learning_rate": 1.6060945638057372e-06, "loss": 0.4441, "num_input_tokens_seen": 11214848, "step": 2738 }, { "epoch": 2.040983606557377, "grad_norm": 9.093004821440276, "learning_rate": 1.6055335301751266e-06, "loss": 0.3805, "num_input_tokens_seen": 11218944, "step": 2739 }, { "epoch": 2.0417287630402385, "grad_norm": 10.037117334663908, "learning_rate": 1.6049724186090273e-06, "loss": 0.5175, "num_input_tokens_seen": 11223040, "step": 2740 }, { "epoch": 2.0424739195230996, "grad_norm": 8.768958005036211, "learning_rate": 1.6044112292304391e-06, "loss": 0.766, "num_input_tokens_seen": 11227136, "step": 2741 }, { "epoch": 2.043219076005961, "grad_norm": 10.101735390946136, "learning_rate": 1.6038499621623787e-06, "loss": 0.2235, "num_input_tokens_seen": 11231232, "step": 2742 }, { "epoch": 2.0439642324888228, "grad_norm": 8.322344962822736, "learning_rate": 1.6032886175278801e-06, "loss": 0.2392, "num_input_tokens_seen": 11235328, "step": 2743 }, { "epoch": 2.044709388971684, "grad_norm": 8.799035707460902, "learning_rate": 1.6027271954499934e-06, "loss": 0.5272, "num_input_tokens_seen": 11239424, "step": 2744 }, { "epoch": 2.0454545454545454, "grad_norm": 8.187407527425684, "learning_rate": 1.6021656960517873e-06, "loss": 0.2898, "num_input_tokens_seen": 11243520, "step": 2745 }, { "epoch": 2.046199701937407, "grad_norm": 16.60317061727119, "learning_rate": 1.6016041194563458e-06, "loss": 0.4285, "num_input_tokens_seen": 11247616, "step": 2746 }, { "epoch": 2.046944858420268, "grad_norm": 9.939593416286504, "learning_rate": 1.6010424657867704e-06, "loss": 0.4264, "num_input_tokens_seen": 11251712, "step": 2747 }, { "epoch": 2.0476900149031296, "grad_norm": 8.242545517699837, "learning_rate": 1.6004807351661803e-06, "loss": 0.5424, "num_input_tokens_seen": 11255808, "step": 2748 }, { "epoch": 2.048435171385991, "grad_norm": 6.399872488256143, "learning_rate": 1.59991892771771e-06, "loss": 0.3185, "num_input_tokens_seen": 11259904, "step": 2749 }, { "epoch": 2.0491803278688523, "grad_norm": 10.927522597636896, "learning_rate": 1.599357043564512e-06, "loss": 0.4703, "num_input_tokens_seen": 11264000, "step": 2750 }, { "epoch": 2.049925484351714, "grad_norm": 19.323948641147634, "learning_rate": 1.5987950828297556e-06, "loss": 0.4219, "num_input_tokens_seen": 11268096, "step": 2751 }, { "epoch": 2.0506706408345754, "grad_norm": 10.040475248230834, "learning_rate": 1.5982330456366263e-06, "loss": 0.2799, "num_input_tokens_seen": 11272192, "step": 2752 }, { "epoch": 2.0514157973174365, "grad_norm": 9.181123615179349, "learning_rate": 1.5976709321083272e-06, "loss": 0.3161, "num_input_tokens_seen": 11276288, "step": 2753 }, { "epoch": 2.052160953800298, "grad_norm": 8.221145672348499, "learning_rate": 1.5971087423680768e-06, "loss": 0.6422, "num_input_tokens_seen": 11280384, "step": 2754 }, { "epoch": 2.0529061102831596, "grad_norm": 8.622570333073156, "learning_rate": 1.5965464765391114e-06, "loss": 0.335, "num_input_tokens_seen": 11284480, "step": 2755 }, { "epoch": 2.0536512667660207, "grad_norm": 8.96671029364275, "learning_rate": 1.595984134744684e-06, "loss": 0.4738, "num_input_tokens_seen": 11288576, "step": 2756 }, { "epoch": 2.0543964232488823, "grad_norm": 7.861628727666814, "learning_rate": 1.5954217171080637e-06, "loss": 0.4752, "num_input_tokens_seen": 11292672, "step": 2757 }, { "epoch": 2.055141579731744, "grad_norm": 10.369950904713276, "learning_rate": 1.5948592237525364e-06, "loss": 0.5173, "num_input_tokens_seen": 11296768, "step": 2758 }, { "epoch": 2.055886736214605, "grad_norm": 8.693942044161359, "learning_rate": 1.5942966548014053e-06, "loss": 0.5173, "num_input_tokens_seen": 11300864, "step": 2759 }, { "epoch": 2.0566318926974665, "grad_norm": 8.912393619978456, "learning_rate": 1.5937340103779882e-06, "loss": 0.5662, "num_input_tokens_seen": 11304960, "step": 2760 }, { "epoch": 2.057377049180328, "grad_norm": 8.026168778704475, "learning_rate": 1.5931712906056219e-06, "loss": 0.5471, "num_input_tokens_seen": 11309056, "step": 2761 }, { "epoch": 2.058122205663189, "grad_norm": 9.547551133224143, "learning_rate": 1.5926084956076583e-06, "loss": 0.3139, "num_input_tokens_seen": 11313152, "step": 2762 }, { "epoch": 2.0588673621460507, "grad_norm": 8.291428410954628, "learning_rate": 1.5920456255074657e-06, "loss": 0.3528, "num_input_tokens_seen": 11317248, "step": 2763 }, { "epoch": 2.0596125186289123, "grad_norm": 8.786720065734478, "learning_rate": 1.5914826804284295e-06, "loss": 0.3929, "num_input_tokens_seen": 11321344, "step": 2764 }, { "epoch": 2.0603576751117734, "grad_norm": 6.401530855604015, "learning_rate": 1.590919660493952e-06, "loss": 0.1518, "num_input_tokens_seen": 11325440, "step": 2765 }, { "epoch": 2.061102831594635, "grad_norm": 8.907272543171349, "learning_rate": 1.5903565658274501e-06, "loss": 0.3578, "num_input_tokens_seen": 11329536, "step": 2766 }, { "epoch": 2.0618479880774965, "grad_norm": 8.869955268544803, "learning_rate": 1.5897933965523584e-06, "loss": 0.3016, "num_input_tokens_seen": 11333632, "step": 2767 }, { "epoch": 2.0625931445603576, "grad_norm": 9.641535320355237, "learning_rate": 1.5892301527921283e-06, "loss": 0.5065, "num_input_tokens_seen": 11337728, "step": 2768 }, { "epoch": 2.063338301043219, "grad_norm": 9.745424034638278, "learning_rate": 1.5886668346702261e-06, "loss": 0.2405, "num_input_tokens_seen": 11341824, "step": 2769 }, { "epoch": 2.0640834575260807, "grad_norm": 8.999899693497335, "learning_rate": 1.5881034423101355e-06, "loss": 0.5252, "num_input_tokens_seen": 11345920, "step": 2770 }, { "epoch": 2.064828614008942, "grad_norm": 12.579076840836604, "learning_rate": 1.5875399758353556e-06, "loss": 0.3742, "num_input_tokens_seen": 11350016, "step": 2771 }, { "epoch": 2.0655737704918034, "grad_norm": 9.377980135475307, "learning_rate": 1.5869764353694033e-06, "loss": 0.5077, "num_input_tokens_seen": 11354112, "step": 2772 }, { "epoch": 2.066318926974665, "grad_norm": 8.87000053955925, "learning_rate": 1.5864128210358104e-06, "loss": 0.5829, "num_input_tokens_seen": 11358208, "step": 2773 }, { "epoch": 2.067064083457526, "grad_norm": 9.740106668360815, "learning_rate": 1.5858491329581243e-06, "loss": 0.2311, "num_input_tokens_seen": 11362304, "step": 2774 }, { "epoch": 2.0678092399403876, "grad_norm": 10.860656321006655, "learning_rate": 1.5852853712599107e-06, "loss": 0.4408, "num_input_tokens_seen": 11366400, "step": 2775 }, { "epoch": 2.0685543964232487, "grad_norm": 9.445453761330263, "learning_rate": 1.5847215360647491e-06, "loss": 0.555, "num_input_tokens_seen": 11370496, "step": 2776 }, { "epoch": 2.0692995529061102, "grad_norm": 11.189556583949782, "learning_rate": 1.5841576274962367e-06, "loss": 0.3865, "num_input_tokens_seen": 11374592, "step": 2777 }, { "epoch": 2.070044709388972, "grad_norm": 11.104631773578724, "learning_rate": 1.5835936456779862e-06, "loss": 0.509, "num_input_tokens_seen": 11378688, "step": 2778 }, { "epoch": 2.070789865871833, "grad_norm": 8.767722062660884, "learning_rate": 1.5830295907336272e-06, "loss": 0.4251, "num_input_tokens_seen": 11382784, "step": 2779 }, { "epoch": 2.0715350223546944, "grad_norm": 8.601719956088049, "learning_rate": 1.582465462786803e-06, "loss": 0.3993, "num_input_tokens_seen": 11386880, "step": 2780 }, { "epoch": 2.072280178837556, "grad_norm": 6.536409507624077, "learning_rate": 1.5819012619611756e-06, "loss": 0.176, "num_input_tokens_seen": 11390976, "step": 2781 }, { "epoch": 2.073025335320417, "grad_norm": 9.162325337145933, "learning_rate": 1.5813369883804217e-06, "loss": 0.3882, "num_input_tokens_seen": 11395072, "step": 2782 }, { "epoch": 2.0737704918032787, "grad_norm": 8.58629919859409, "learning_rate": 1.5807726421682337e-06, "loss": 0.5884, "num_input_tokens_seen": 11399168, "step": 2783 }, { "epoch": 2.07451564828614, "grad_norm": 8.590614766877506, "learning_rate": 1.5802082234483212e-06, "loss": 0.3777, "num_input_tokens_seen": 11403264, "step": 2784 }, { "epoch": 2.0752608047690013, "grad_norm": 9.066316897666978, "learning_rate": 1.5796437323444077e-06, "loss": 0.4458, "num_input_tokens_seen": 11407360, "step": 2785 }, { "epoch": 2.076005961251863, "grad_norm": 8.31543841972946, "learning_rate": 1.5790791689802345e-06, "loss": 0.212, "num_input_tokens_seen": 11411456, "step": 2786 }, { "epoch": 2.0767511177347244, "grad_norm": 11.28917352026084, "learning_rate": 1.5785145334795575e-06, "loss": 0.4058, "num_input_tokens_seen": 11415552, "step": 2787 }, { "epoch": 2.0774962742175855, "grad_norm": 9.498432652539753, "learning_rate": 1.577949825966149e-06, "loss": 0.4544, "num_input_tokens_seen": 11419648, "step": 2788 }, { "epoch": 2.078241430700447, "grad_norm": 8.879273438474119, "learning_rate": 1.5773850465637969e-06, "loss": 0.4831, "num_input_tokens_seen": 11423744, "step": 2789 }, { "epoch": 2.0789865871833086, "grad_norm": 8.343865781636698, "learning_rate": 1.5768201953963047e-06, "loss": 0.3531, "num_input_tokens_seen": 11427840, "step": 2790 }, { "epoch": 2.0797317436661698, "grad_norm": 8.436759269260076, "learning_rate": 1.576255272587492e-06, "loss": 0.3724, "num_input_tokens_seen": 11431936, "step": 2791 }, { "epoch": 2.0804769001490313, "grad_norm": 7.760687186882364, "learning_rate": 1.5756902782611937e-06, "loss": 0.2955, "num_input_tokens_seen": 11436032, "step": 2792 }, { "epoch": 2.081222056631893, "grad_norm": 10.362745058709931, "learning_rate": 1.5751252125412612e-06, "loss": 0.2901, "num_input_tokens_seen": 11440128, "step": 2793 }, { "epoch": 2.081967213114754, "grad_norm": 8.840646915593627, "learning_rate": 1.5745600755515606e-06, "loss": 0.4373, "num_input_tokens_seen": 11444224, "step": 2794 }, { "epoch": 2.0827123695976155, "grad_norm": 11.298589709234312, "learning_rate": 1.5739948674159732e-06, "loss": 0.6305, "num_input_tokens_seen": 11448320, "step": 2795 }, { "epoch": 2.083457526080477, "grad_norm": 10.252044755563738, "learning_rate": 1.5734295882583978e-06, "loss": 0.1947, "num_input_tokens_seen": 11452416, "step": 2796 }, { "epoch": 2.084202682563338, "grad_norm": 11.253465252740185, "learning_rate": 1.572864238202747e-06, "loss": 0.4697, "num_input_tokens_seen": 11456512, "step": 2797 }, { "epoch": 2.0849478390461997, "grad_norm": 11.372378419230397, "learning_rate": 1.5722988173729497e-06, "loss": 0.644, "num_input_tokens_seen": 11460608, "step": 2798 }, { "epoch": 2.0856929955290613, "grad_norm": 9.17579697954202, "learning_rate": 1.5717333258929504e-06, "loss": 0.5074, "num_input_tokens_seen": 11464704, "step": 2799 }, { "epoch": 2.0864381520119224, "grad_norm": 9.513886980181331, "learning_rate": 1.5711677638867088e-06, "loss": 0.216, "num_input_tokens_seen": 11468800, "step": 2800 }, { "epoch": 2.087183308494784, "grad_norm": 10.616075109217679, "learning_rate": 1.5706021314781997e-06, "loss": 0.3164, "num_input_tokens_seen": 11472896, "step": 2801 }, { "epoch": 2.0879284649776455, "grad_norm": 10.428373510780295, "learning_rate": 1.570036428791414e-06, "loss": 0.4065, "num_input_tokens_seen": 11476992, "step": 2802 }, { "epoch": 2.0886736214605066, "grad_norm": 7.384387484625405, "learning_rate": 1.5694706559503584e-06, "loss": 0.4132, "num_input_tokens_seen": 11481088, "step": 2803 }, { "epoch": 2.089418777943368, "grad_norm": 9.182157825211444, "learning_rate": 1.5689048130790533e-06, "loss": 0.5662, "num_input_tokens_seen": 11485184, "step": 2804 }, { "epoch": 2.0901639344262297, "grad_norm": 11.188840151404573, "learning_rate": 1.5683389003015358e-06, "loss": 0.4178, "num_input_tokens_seen": 11489280, "step": 2805 }, { "epoch": 2.090909090909091, "grad_norm": 7.754008259577093, "learning_rate": 1.5677729177418592e-06, "loss": 0.2604, "num_input_tokens_seen": 11493376, "step": 2806 }, { "epoch": 2.0916542473919524, "grad_norm": 7.72152424248237, "learning_rate": 1.5672068655240892e-06, "loss": 0.506, "num_input_tokens_seen": 11497472, "step": 2807 }, { "epoch": 2.092399403874814, "grad_norm": 8.978816466273146, "learning_rate": 1.566640743772309e-06, "loss": 0.5232, "num_input_tokens_seen": 11501568, "step": 2808 }, { "epoch": 2.093144560357675, "grad_norm": 10.399930922269537, "learning_rate": 1.5660745526106169e-06, "loss": 0.3914, "num_input_tokens_seen": 11505664, "step": 2809 }, { "epoch": 2.0938897168405366, "grad_norm": 14.644202587363424, "learning_rate": 1.565508292163126e-06, "loss": 0.2953, "num_input_tokens_seen": 11509760, "step": 2810 }, { "epoch": 2.0946348733233977, "grad_norm": 9.64240439975004, "learning_rate": 1.5649419625539646e-06, "loss": 0.4823, "num_input_tokens_seen": 11513856, "step": 2811 }, { "epoch": 2.0953800298062593, "grad_norm": 8.960323512025877, "learning_rate": 1.564375563907276e-06, "loss": 0.5895, "num_input_tokens_seen": 11517952, "step": 2812 }, { "epoch": 2.096125186289121, "grad_norm": 12.095851426057058, "learning_rate": 1.5638090963472195e-06, "loss": 0.4524, "num_input_tokens_seen": 11522048, "step": 2813 }, { "epoch": 2.096870342771982, "grad_norm": 9.557802539735459, "learning_rate": 1.5632425599979675e-06, "loss": 0.4906, "num_input_tokens_seen": 11526144, "step": 2814 }, { "epoch": 2.0976154992548435, "grad_norm": 10.958799854355906, "learning_rate": 1.56267595498371e-06, "loss": 0.3687, "num_input_tokens_seen": 11530240, "step": 2815 }, { "epoch": 2.098360655737705, "grad_norm": 8.853317925864278, "learning_rate": 1.56210928142865e-06, "loss": 0.3802, "num_input_tokens_seen": 11534336, "step": 2816 }, { "epoch": 2.099105812220566, "grad_norm": 8.953617841542993, "learning_rate": 1.5615425394570074e-06, "loss": 0.3171, "num_input_tokens_seen": 11538432, "step": 2817 }, { "epoch": 2.0998509687034277, "grad_norm": 8.092271720767902, "learning_rate": 1.5609757291930152e-06, "loss": 0.4325, "num_input_tokens_seen": 11542528, "step": 2818 }, { "epoch": 2.1005961251862892, "grad_norm": 9.857804399288941, "learning_rate": 1.5604088507609228e-06, "loss": 0.3426, "num_input_tokens_seen": 11546624, "step": 2819 }, { "epoch": 2.1013412816691504, "grad_norm": 7.145057265383573, "learning_rate": 1.5598419042849938e-06, "loss": 0.2365, "num_input_tokens_seen": 11550720, "step": 2820 }, { "epoch": 2.102086438152012, "grad_norm": 8.928115705607675, "learning_rate": 1.5592748898895071e-06, "loss": 0.4685, "num_input_tokens_seen": 11554816, "step": 2821 }, { "epoch": 2.1028315946348735, "grad_norm": 9.585350808736191, "learning_rate": 1.5587078076987559e-06, "loss": 0.2973, "num_input_tokens_seen": 11558912, "step": 2822 }, { "epoch": 2.1035767511177346, "grad_norm": 8.401132811233502, "learning_rate": 1.5581406578370487e-06, "loss": 0.5264, "num_input_tokens_seen": 11563008, "step": 2823 }, { "epoch": 2.104321907600596, "grad_norm": 8.42923858230266, "learning_rate": 1.5575734404287091e-06, "loss": 0.3356, "num_input_tokens_seen": 11567104, "step": 2824 }, { "epoch": 2.1050670640834577, "grad_norm": 11.031969469608914, "learning_rate": 1.557006155598075e-06, "loss": 0.5804, "num_input_tokens_seen": 11571200, "step": 2825 }, { "epoch": 2.105812220566319, "grad_norm": 9.80387854016673, "learning_rate": 1.5564388034694994e-06, "loss": 0.4639, "num_input_tokens_seen": 11575296, "step": 2826 }, { "epoch": 2.1065573770491803, "grad_norm": 10.022807123079934, "learning_rate": 1.5558713841673502e-06, "loss": 0.3367, "num_input_tokens_seen": 11579392, "step": 2827 }, { "epoch": 2.107302533532042, "grad_norm": 7.792810703060559, "learning_rate": 1.5553038978160093e-06, "loss": 0.5619, "num_input_tokens_seen": 11583488, "step": 2828 }, { "epoch": 2.108047690014903, "grad_norm": 9.675823701540637, "learning_rate": 1.5547363445398738e-06, "loss": 0.3859, "num_input_tokens_seen": 11587584, "step": 2829 }, { "epoch": 2.1087928464977646, "grad_norm": 9.842785574191936, "learning_rate": 1.5541687244633557e-06, "loss": 0.3712, "num_input_tokens_seen": 11591680, "step": 2830 }, { "epoch": 2.109538002980626, "grad_norm": 9.164004416057034, "learning_rate": 1.5536010377108812e-06, "loss": 0.6298, "num_input_tokens_seen": 11595776, "step": 2831 }, { "epoch": 2.110283159463487, "grad_norm": 8.3432404758709, "learning_rate": 1.5530332844068914e-06, "loss": 0.4157, "num_input_tokens_seen": 11599872, "step": 2832 }, { "epoch": 2.1110283159463488, "grad_norm": 14.69433995404094, "learning_rate": 1.552465464675842e-06, "loss": 0.3885, "num_input_tokens_seen": 11603968, "step": 2833 }, { "epoch": 2.1117734724292103, "grad_norm": 12.614123788432346, "learning_rate": 1.5518975786422024e-06, "loss": 0.3571, "num_input_tokens_seen": 11608064, "step": 2834 }, { "epoch": 2.1125186289120714, "grad_norm": 12.393627743242229, "learning_rate": 1.551329626430458e-06, "loss": 0.4153, "num_input_tokens_seen": 11612160, "step": 2835 }, { "epoch": 2.113263785394933, "grad_norm": 9.145679528582034, "learning_rate": 1.550761608165108e-06, "loss": 0.3135, "num_input_tokens_seen": 11616256, "step": 2836 }, { "epoch": 2.1140089418777945, "grad_norm": 8.73634099648992, "learning_rate": 1.5501935239706656e-06, "loss": 0.4389, "num_input_tokens_seen": 11620352, "step": 2837 }, { "epoch": 2.1147540983606556, "grad_norm": 14.400598075811363, "learning_rate": 1.5496253739716596e-06, "loss": 0.415, "num_input_tokens_seen": 11624448, "step": 2838 }, { "epoch": 2.115499254843517, "grad_norm": 8.985683508180605, "learning_rate": 1.5490571582926316e-06, "loss": 0.4284, "num_input_tokens_seen": 11628544, "step": 2839 }, { "epoch": 2.1162444113263787, "grad_norm": 9.206790013971455, "learning_rate": 1.5484888770581396e-06, "loss": 0.2738, "num_input_tokens_seen": 11632640, "step": 2840 }, { "epoch": 2.11698956780924, "grad_norm": 9.242618984448946, "learning_rate": 1.5479205303927541e-06, "loss": 0.4983, "num_input_tokens_seen": 11636736, "step": 2841 }, { "epoch": 2.1177347242921014, "grad_norm": 9.432245419007907, "learning_rate": 1.547352118421061e-06, "loss": 0.1542, "num_input_tokens_seen": 11640832, "step": 2842 }, { "epoch": 2.118479880774963, "grad_norm": 12.22926991749634, "learning_rate": 1.54678364126766e-06, "loss": 0.2851, "num_input_tokens_seen": 11644928, "step": 2843 }, { "epoch": 2.119225037257824, "grad_norm": 9.115785269952823, "learning_rate": 1.546215099057166e-06, "loss": 0.3389, "num_input_tokens_seen": 11649024, "step": 2844 }, { "epoch": 2.1199701937406856, "grad_norm": 7.371981562574154, "learning_rate": 1.5456464919142067e-06, "loss": 0.3968, "num_input_tokens_seen": 11653120, "step": 2845 }, { "epoch": 2.1207153502235467, "grad_norm": 10.405611795771378, "learning_rate": 1.5450778199634254e-06, "loss": 0.3809, "num_input_tokens_seen": 11657216, "step": 2846 }, { "epoch": 2.1214605067064083, "grad_norm": 13.228363653404516, "learning_rate": 1.5445090833294793e-06, "loss": 0.5735, "num_input_tokens_seen": 11661312, "step": 2847 }, { "epoch": 2.12220566318927, "grad_norm": 10.960859840204206, "learning_rate": 1.5439402821370387e-06, "loss": 0.4265, "num_input_tokens_seen": 11665408, "step": 2848 }, { "epoch": 2.122950819672131, "grad_norm": 8.31649524961081, "learning_rate": 1.5433714165107897e-06, "loss": 0.3274, "num_input_tokens_seen": 11669504, "step": 2849 }, { "epoch": 2.1236959761549925, "grad_norm": 9.49743044317169, "learning_rate": 1.542802486575431e-06, "loss": 0.3373, "num_input_tokens_seen": 11673600, "step": 2850 }, { "epoch": 2.124441132637854, "grad_norm": 8.939269314003484, "learning_rate": 1.5422334924556768e-06, "loss": 0.3044, "num_input_tokens_seen": 11677696, "step": 2851 }, { "epoch": 2.125186289120715, "grad_norm": 8.049928640356972, "learning_rate": 1.5416644342762538e-06, "loss": 0.5766, "num_input_tokens_seen": 11681792, "step": 2852 }, { "epoch": 2.1259314456035767, "grad_norm": 10.808496467328665, "learning_rate": 1.5410953121619043e-06, "loss": 0.2952, "num_input_tokens_seen": 11685888, "step": 2853 }, { "epoch": 2.1266766020864383, "grad_norm": 6.808939273107842, "learning_rate": 1.5405261262373844e-06, "loss": 0.2679, "num_input_tokens_seen": 11689984, "step": 2854 }, { "epoch": 2.1274217585692994, "grad_norm": 8.725751248093195, "learning_rate": 1.5399568766274624e-06, "loss": 0.5521, "num_input_tokens_seen": 11694080, "step": 2855 }, { "epoch": 2.128166915052161, "grad_norm": 10.502776364463672, "learning_rate": 1.5393875634569227e-06, "loss": 0.297, "num_input_tokens_seen": 11698176, "step": 2856 }, { "epoch": 2.1289120715350225, "grad_norm": 9.627471016740166, "learning_rate": 1.538818186850563e-06, "loss": 0.5034, "num_input_tokens_seen": 11702272, "step": 2857 }, { "epoch": 2.1296572280178836, "grad_norm": 14.67740076580768, "learning_rate": 1.5382487469331941e-06, "loss": 0.447, "num_input_tokens_seen": 11706368, "step": 2858 }, { "epoch": 2.130402384500745, "grad_norm": 8.24846614107412, "learning_rate": 1.5376792438296416e-06, "loss": 0.3753, "num_input_tokens_seen": 11710464, "step": 2859 }, { "epoch": 2.1311475409836067, "grad_norm": 9.746310857807872, "learning_rate": 1.5371096776647448e-06, "loss": 0.577, "num_input_tokens_seen": 11714560, "step": 2860 }, { "epoch": 2.131892697466468, "grad_norm": 10.020765307914635, "learning_rate": 1.5365400485633564e-06, "loss": 0.3646, "num_input_tokens_seen": 11718656, "step": 2861 }, { "epoch": 2.1326378539493294, "grad_norm": 9.54029567242735, "learning_rate": 1.535970356650343e-06, "loss": 0.5789, "num_input_tokens_seen": 11722752, "step": 2862 }, { "epoch": 2.133383010432191, "grad_norm": 9.723126417991843, "learning_rate": 1.5354006020505857e-06, "loss": 0.6561, "num_input_tokens_seen": 11726848, "step": 2863 }, { "epoch": 2.134128166915052, "grad_norm": 9.324321256303696, "learning_rate": 1.5348307848889784e-06, "loss": 0.553, "num_input_tokens_seen": 11730944, "step": 2864 }, { "epoch": 2.1348733233979136, "grad_norm": 10.359063054567208, "learning_rate": 1.5342609052904292e-06, "loss": 0.4025, "num_input_tokens_seen": 11735040, "step": 2865 }, { "epoch": 2.135618479880775, "grad_norm": 8.911470976245258, "learning_rate": 1.53369096337986e-06, "loss": 0.3439, "num_input_tokens_seen": 11739136, "step": 2866 }, { "epoch": 2.1363636363636362, "grad_norm": 11.057771267353933, "learning_rate": 1.5331209592822059e-06, "loss": 0.3059, "num_input_tokens_seen": 11743232, "step": 2867 }, { "epoch": 2.137108792846498, "grad_norm": 9.844777141570459, "learning_rate": 1.5325508931224155e-06, "loss": 0.2961, "num_input_tokens_seen": 11747328, "step": 2868 }, { "epoch": 2.1378539493293593, "grad_norm": 7.862313450834146, "learning_rate": 1.531980765025452e-06, "loss": 0.3533, "num_input_tokens_seen": 11751424, "step": 2869 }, { "epoch": 2.1385991058122205, "grad_norm": 7.988104648244719, "learning_rate": 1.531410575116291e-06, "loss": 0.4546, "num_input_tokens_seen": 11755520, "step": 2870 }, { "epoch": 2.139344262295082, "grad_norm": 8.227014054224481, "learning_rate": 1.5308403235199227e-06, "loss": 0.2776, "num_input_tokens_seen": 11759616, "step": 2871 }, { "epoch": 2.1400894187779436, "grad_norm": 8.329023928526388, "learning_rate": 1.53027001036135e-06, "loss": 0.3002, "num_input_tokens_seen": 11763712, "step": 2872 }, { "epoch": 2.1408345752608047, "grad_norm": 10.252776783047766, "learning_rate": 1.5296996357655897e-06, "loss": 0.2503, "num_input_tokens_seen": 11767808, "step": 2873 }, { "epoch": 2.1415797317436662, "grad_norm": 10.579432624034236, "learning_rate": 1.5291291998576721e-06, "loss": 0.3578, "num_input_tokens_seen": 11771904, "step": 2874 }, { "epoch": 2.1423248882265273, "grad_norm": 7.874094618479128, "learning_rate": 1.528558702762641e-06, "loss": 0.2244, "num_input_tokens_seen": 11776000, "step": 2875 }, { "epoch": 2.143070044709389, "grad_norm": 12.926360492356572, "learning_rate": 1.5279881446055528e-06, "loss": 0.3864, "num_input_tokens_seen": 11780096, "step": 2876 }, { "epoch": 2.1438152011922504, "grad_norm": 8.43864849090925, "learning_rate": 1.5274175255114784e-06, "loss": 0.4362, "num_input_tokens_seen": 11784192, "step": 2877 }, { "epoch": 2.144560357675112, "grad_norm": 9.903401193504688, "learning_rate": 1.5268468456055014e-06, "loss": 0.372, "num_input_tokens_seen": 11788288, "step": 2878 }, { "epoch": 2.145305514157973, "grad_norm": 9.553628953958324, "learning_rate": 1.526276105012719e-06, "loss": 0.577, "num_input_tokens_seen": 11792384, "step": 2879 }, { "epoch": 2.1460506706408347, "grad_norm": 9.312382469283415, "learning_rate": 1.5257053038582414e-06, "loss": 0.4113, "num_input_tokens_seen": 11796480, "step": 2880 }, { "epoch": 2.1467958271236958, "grad_norm": 9.534168014509433, "learning_rate": 1.525134442267193e-06, "loss": 0.3681, "num_input_tokens_seen": 11800576, "step": 2881 }, { "epoch": 2.1475409836065573, "grad_norm": 10.543269795584422, "learning_rate": 1.5245635203647097e-06, "loss": 0.2374, "num_input_tokens_seen": 11804672, "step": 2882 }, { "epoch": 2.148286140089419, "grad_norm": 16.609468494768986, "learning_rate": 1.5239925382759423e-06, "loss": 0.3389, "num_input_tokens_seen": 11808768, "step": 2883 }, { "epoch": 2.14903129657228, "grad_norm": 12.544500409100436, "learning_rate": 1.5234214961260541e-06, "loss": 0.2766, "num_input_tokens_seen": 11812864, "step": 2884 }, { "epoch": 2.1497764530551415, "grad_norm": 9.842082264326118, "learning_rate": 1.5228503940402217e-06, "loss": 0.3455, "num_input_tokens_seen": 11816960, "step": 2885 }, { "epoch": 2.150521609538003, "grad_norm": 8.652890501249413, "learning_rate": 1.5222792321436344e-06, "loss": 0.1862, "num_input_tokens_seen": 11821056, "step": 2886 }, { "epoch": 2.151266766020864, "grad_norm": 9.209834967208208, "learning_rate": 1.5217080105614956e-06, "loss": 0.2918, "num_input_tokens_seen": 11825152, "step": 2887 }, { "epoch": 2.1520119225037257, "grad_norm": 8.790838277707673, "learning_rate": 1.5211367294190203e-06, "loss": 0.3844, "num_input_tokens_seen": 11829248, "step": 2888 }, { "epoch": 2.1527570789865873, "grad_norm": 10.803633148547721, "learning_rate": 1.520565388841438e-06, "loss": 0.4214, "num_input_tokens_seen": 11833344, "step": 2889 }, { "epoch": 2.1535022354694484, "grad_norm": 10.08397104474012, "learning_rate": 1.51999398895399e-06, "loss": 0.5161, "num_input_tokens_seen": 11837440, "step": 2890 }, { "epoch": 2.15424739195231, "grad_norm": 12.54273518053468, "learning_rate": 1.519422529881932e-06, "loss": 0.5021, "num_input_tokens_seen": 11841536, "step": 2891 }, { "epoch": 2.1549925484351715, "grad_norm": 10.53786707226241, "learning_rate": 1.5188510117505314e-06, "loss": 0.4678, "num_input_tokens_seen": 11845632, "step": 2892 }, { "epoch": 2.1557377049180326, "grad_norm": 8.677747666151943, "learning_rate": 1.51827943468507e-06, "loss": 0.4419, "num_input_tokens_seen": 11849728, "step": 2893 }, { "epoch": 2.156482861400894, "grad_norm": 8.415879208998415, "learning_rate": 1.51770779881084e-06, "loss": 0.3976, "num_input_tokens_seen": 11853824, "step": 2894 }, { "epoch": 2.1572280178837557, "grad_norm": 8.468965244712326, "learning_rate": 1.5171361042531497e-06, "loss": 0.2924, "num_input_tokens_seen": 11857920, "step": 2895 }, { "epoch": 2.157973174366617, "grad_norm": 8.769920497610066, "learning_rate": 1.5165643511373171e-06, "loss": 0.4336, "num_input_tokens_seen": 11862016, "step": 2896 }, { "epoch": 2.1587183308494784, "grad_norm": 8.56531651155629, "learning_rate": 1.5159925395886755e-06, "loss": 0.385, "num_input_tokens_seen": 11866112, "step": 2897 }, { "epoch": 2.15946348733234, "grad_norm": 8.323460212643322, "learning_rate": 1.5154206697325696e-06, "loss": 0.3809, "num_input_tokens_seen": 11870208, "step": 2898 }, { "epoch": 2.160208643815201, "grad_norm": 9.714574779513383, "learning_rate": 1.514848741694358e-06, "loss": 0.3857, "num_input_tokens_seen": 11874304, "step": 2899 }, { "epoch": 2.1609538002980626, "grad_norm": 9.665275273612696, "learning_rate": 1.5142767555994108e-06, "loss": 0.65, "num_input_tokens_seen": 11878400, "step": 2900 }, { "epoch": 2.161698956780924, "grad_norm": 7.883754949476981, "learning_rate": 1.513704711573112e-06, "loss": 0.3255, "num_input_tokens_seen": 11882496, "step": 2901 }, { "epoch": 2.1624441132637853, "grad_norm": 8.221564331049587, "learning_rate": 1.5131326097408571e-06, "loss": 0.3606, "num_input_tokens_seen": 11886592, "step": 2902 }, { "epoch": 2.163189269746647, "grad_norm": 9.595135222722407, "learning_rate": 1.5125604502280556e-06, "loss": 0.528, "num_input_tokens_seen": 11890688, "step": 2903 }, { "epoch": 2.1639344262295084, "grad_norm": 8.952605738677146, "learning_rate": 1.5119882331601283e-06, "loss": 0.2675, "num_input_tokens_seen": 11894784, "step": 2904 }, { "epoch": 2.1646795827123695, "grad_norm": 9.657159661904823, "learning_rate": 1.5114159586625093e-06, "loss": 0.5281, "num_input_tokens_seen": 11898880, "step": 2905 }, { "epoch": 2.165424739195231, "grad_norm": 10.569803249797339, "learning_rate": 1.5108436268606457e-06, "loss": 0.3231, "num_input_tokens_seen": 11902976, "step": 2906 }, { "epoch": 2.1661698956780926, "grad_norm": 8.083919050768992, "learning_rate": 1.5102712378799966e-06, "loss": 0.2027, "num_input_tokens_seen": 11907072, "step": 2907 }, { "epoch": 2.1669150521609537, "grad_norm": 8.827759751160768, "learning_rate": 1.5096987918460337e-06, "loss": 0.3883, "num_input_tokens_seen": 11911168, "step": 2908 }, { "epoch": 2.1676602086438153, "grad_norm": 9.515893429186638, "learning_rate": 1.509126288884241e-06, "loss": 0.5236, "num_input_tokens_seen": 11915264, "step": 2909 }, { "epoch": 2.168405365126677, "grad_norm": 9.01148935972699, "learning_rate": 1.5085537291201154e-06, "loss": 0.4717, "num_input_tokens_seen": 11919360, "step": 2910 }, { "epoch": 2.169150521609538, "grad_norm": 9.83849230780593, "learning_rate": 1.5079811126791664e-06, "loss": 0.3493, "num_input_tokens_seen": 11923456, "step": 2911 }, { "epoch": 2.1698956780923995, "grad_norm": 7.169558911561559, "learning_rate": 1.507408439686915e-06, "loss": 0.2228, "num_input_tokens_seen": 11927552, "step": 2912 }, { "epoch": 2.170640834575261, "grad_norm": 37.396941666161354, "learning_rate": 1.506835710268896e-06, "loss": 0.4876, "num_input_tokens_seen": 11931648, "step": 2913 }, { "epoch": 2.171385991058122, "grad_norm": 7.439529445901002, "learning_rate": 1.5062629245506551e-06, "loss": 0.2116, "num_input_tokens_seen": 11935744, "step": 2914 }, { "epoch": 2.1721311475409837, "grad_norm": 10.120156363973203, "learning_rate": 1.5056900826577514e-06, "loss": 0.2326, "num_input_tokens_seen": 11939840, "step": 2915 }, { "epoch": 2.172876304023845, "grad_norm": 8.930949947369676, "learning_rate": 1.5051171847157556e-06, "loss": 0.4084, "num_input_tokens_seen": 11943936, "step": 2916 }, { "epoch": 2.1736214605067063, "grad_norm": 9.11711163231625, "learning_rate": 1.5045442308502511e-06, "loss": 0.4902, "num_input_tokens_seen": 11948032, "step": 2917 }, { "epoch": 2.174366616989568, "grad_norm": 10.76354953598596, "learning_rate": 1.5039712211868337e-06, "loss": 0.3805, "num_input_tokens_seen": 11952128, "step": 2918 }, { "epoch": 2.175111773472429, "grad_norm": 8.416436907423929, "learning_rate": 1.503398155851111e-06, "loss": 0.3713, "num_input_tokens_seen": 11956224, "step": 2919 }, { "epoch": 2.1758569299552906, "grad_norm": 10.290730267601147, "learning_rate": 1.5028250349687035e-06, "loss": 0.3683, "num_input_tokens_seen": 11960320, "step": 2920 }, { "epoch": 2.176602086438152, "grad_norm": 8.574841326937413, "learning_rate": 1.502251858665243e-06, "loss": 0.3923, "num_input_tokens_seen": 11964416, "step": 2921 }, { "epoch": 2.1773472429210132, "grad_norm": 10.094322646897938, "learning_rate": 1.5016786270663735e-06, "loss": 0.4093, "num_input_tokens_seen": 11968512, "step": 2922 }, { "epoch": 2.178092399403875, "grad_norm": 10.534129531908265, "learning_rate": 1.5011053402977518e-06, "loss": 0.4167, "num_input_tokens_seen": 11972608, "step": 2923 }, { "epoch": 2.1788375558867363, "grad_norm": 8.430014154156309, "learning_rate": 1.5005319984850467e-06, "loss": 0.3657, "num_input_tokens_seen": 11976704, "step": 2924 }, { "epoch": 2.1795827123695974, "grad_norm": 13.062211882783613, "learning_rate": 1.4999586017539384e-06, "loss": 0.399, "num_input_tokens_seen": 11980800, "step": 2925 }, { "epoch": 2.180327868852459, "grad_norm": 12.876891956254738, "learning_rate": 1.4993851502301197e-06, "loss": 0.3606, "num_input_tokens_seen": 11984896, "step": 2926 }, { "epoch": 2.1810730253353205, "grad_norm": 8.653093042171909, "learning_rate": 1.4988116440392952e-06, "loss": 0.4108, "num_input_tokens_seen": 11988992, "step": 2927 }, { "epoch": 2.1818181818181817, "grad_norm": 9.34523874367588, "learning_rate": 1.4982380833071819e-06, "loss": 0.522, "num_input_tokens_seen": 11993088, "step": 2928 }, { "epoch": 2.182563338301043, "grad_norm": 9.822197277050726, "learning_rate": 1.497664468159508e-06, "loss": 0.3299, "num_input_tokens_seen": 11997184, "step": 2929 }, { "epoch": 2.1833084947839048, "grad_norm": 7.545055941222429, "learning_rate": 1.4970907987220144e-06, "loss": 0.2721, "num_input_tokens_seen": 12001280, "step": 2930 }, { "epoch": 2.184053651266766, "grad_norm": 10.306150776085042, "learning_rate": 1.4965170751204534e-06, "loss": 0.7498, "num_input_tokens_seen": 12005376, "step": 2931 }, { "epoch": 2.1847988077496274, "grad_norm": 8.187768084286262, "learning_rate": 1.4959432974805891e-06, "loss": 0.3671, "num_input_tokens_seen": 12009472, "step": 2932 }, { "epoch": 2.185543964232489, "grad_norm": 10.984217685293729, "learning_rate": 1.4953694659281975e-06, "loss": 0.3561, "num_input_tokens_seen": 12013568, "step": 2933 }, { "epoch": 2.18628912071535, "grad_norm": 12.749093121231365, "learning_rate": 1.4947955805890673e-06, "loss": 0.3969, "num_input_tokens_seen": 12017664, "step": 2934 }, { "epoch": 2.1870342771982116, "grad_norm": 8.789711202009162, "learning_rate": 1.4942216415889976e-06, "loss": 0.3874, "num_input_tokens_seen": 12021760, "step": 2935 }, { "epoch": 2.187779433681073, "grad_norm": 9.341265847081692, "learning_rate": 1.4936476490538005e-06, "loss": 0.4428, "num_input_tokens_seen": 12025856, "step": 2936 }, { "epoch": 2.1885245901639343, "grad_norm": 9.366979987762496, "learning_rate": 1.4930736031092986e-06, "loss": 0.4871, "num_input_tokens_seen": 12029952, "step": 2937 }, { "epoch": 2.189269746646796, "grad_norm": 8.745118676218487, "learning_rate": 1.4924995038813269e-06, "loss": 0.2403, "num_input_tokens_seen": 12034048, "step": 2938 }, { "epoch": 2.1900149031296574, "grad_norm": 6.721913634554704, "learning_rate": 1.491925351495733e-06, "loss": 0.2015, "num_input_tokens_seen": 12038144, "step": 2939 }, { "epoch": 2.1907600596125185, "grad_norm": 12.17310161055944, "learning_rate": 1.491351146078374e-06, "loss": 0.3163, "num_input_tokens_seen": 12042240, "step": 2940 }, { "epoch": 2.19150521609538, "grad_norm": 11.2551770527977, "learning_rate": 1.4907768877551209e-06, "loss": 0.5122, "num_input_tokens_seen": 12046336, "step": 2941 }, { "epoch": 2.1922503725782416, "grad_norm": 10.111912483440598, "learning_rate": 1.4902025766518547e-06, "loss": 0.4209, "num_input_tokens_seen": 12050432, "step": 2942 }, { "epoch": 2.1929955290611027, "grad_norm": 10.297867661821623, "learning_rate": 1.489628212894468e-06, "loss": 0.5446, "num_input_tokens_seen": 12054528, "step": 2943 }, { "epoch": 2.1937406855439643, "grad_norm": 9.857771859902172, "learning_rate": 1.489053796608866e-06, "loss": 0.464, "num_input_tokens_seen": 12058624, "step": 2944 }, { "epoch": 2.194485842026826, "grad_norm": 9.147546976102321, "learning_rate": 1.488479327920965e-06, "loss": 0.5114, "num_input_tokens_seen": 12062720, "step": 2945 }, { "epoch": 2.195230998509687, "grad_norm": 9.18984486549373, "learning_rate": 1.4879048069566924e-06, "loss": 0.4866, "num_input_tokens_seen": 12066816, "step": 2946 }, { "epoch": 2.1959761549925485, "grad_norm": 9.598576172479124, "learning_rate": 1.4873302338419874e-06, "loss": 0.3957, "num_input_tokens_seen": 12070912, "step": 2947 }, { "epoch": 2.19672131147541, "grad_norm": 7.769479888142862, "learning_rate": 1.4867556087028004e-06, "loss": 0.3186, "num_input_tokens_seen": 12075008, "step": 2948 }, { "epoch": 2.197466467958271, "grad_norm": 11.07067823520032, "learning_rate": 1.4861809316650935e-06, "loss": 0.5842, "num_input_tokens_seen": 12079104, "step": 2949 }, { "epoch": 2.1982116244411327, "grad_norm": 7.7738841617283425, "learning_rate": 1.4856062028548395e-06, "loss": 0.2813, "num_input_tokens_seen": 12083200, "step": 2950 }, { "epoch": 2.198956780923994, "grad_norm": 10.611122737228609, "learning_rate": 1.485031422398024e-06, "loss": 0.1757, "num_input_tokens_seen": 12087296, "step": 2951 }, { "epoch": 2.1997019374068554, "grad_norm": 8.998294947924544, "learning_rate": 1.484456590420642e-06, "loss": 0.5214, "num_input_tokens_seen": 12091392, "step": 2952 }, { "epoch": 2.200447093889717, "grad_norm": 9.307362853150092, "learning_rate": 1.4838817070487013e-06, "loss": 0.5578, "num_input_tokens_seen": 12095488, "step": 2953 }, { "epoch": 2.201192250372578, "grad_norm": 9.36506515212713, "learning_rate": 1.4833067724082204e-06, "loss": 0.3284, "num_input_tokens_seen": 12099584, "step": 2954 }, { "epoch": 2.2019374068554396, "grad_norm": 23.350470471902913, "learning_rate": 1.4827317866252292e-06, "loss": 0.6187, "num_input_tokens_seen": 12103680, "step": 2955 }, { "epoch": 2.202682563338301, "grad_norm": 8.48876309931353, "learning_rate": 1.4821567498257683e-06, "loss": 0.656, "num_input_tokens_seen": 12107776, "step": 2956 }, { "epoch": 2.2034277198211623, "grad_norm": 8.711583580808878, "learning_rate": 1.48158166213589e-06, "loss": 0.3065, "num_input_tokens_seen": 12111872, "step": 2957 }, { "epoch": 2.204172876304024, "grad_norm": 13.203144760935778, "learning_rate": 1.481006523681658e-06, "loss": 0.191, "num_input_tokens_seen": 12115968, "step": 2958 }, { "epoch": 2.2049180327868854, "grad_norm": 9.590225565237125, "learning_rate": 1.4804313345891464e-06, "loss": 0.3173, "num_input_tokens_seen": 12120064, "step": 2959 }, { "epoch": 2.2056631892697465, "grad_norm": 11.31533437018266, "learning_rate": 1.4798560949844407e-06, "loss": 0.41, "num_input_tokens_seen": 12124160, "step": 2960 }, { "epoch": 2.206408345752608, "grad_norm": 9.872804787661629, "learning_rate": 1.4792808049936378e-06, "loss": 0.2662, "num_input_tokens_seen": 12128256, "step": 2961 }, { "epoch": 2.2071535022354696, "grad_norm": 8.368052370018637, "learning_rate": 1.478705464742845e-06, "loss": 0.402, "num_input_tokens_seen": 12132352, "step": 2962 }, { "epoch": 2.2078986587183307, "grad_norm": 8.802733895416475, "learning_rate": 1.4781300743581813e-06, "loss": 0.2727, "num_input_tokens_seen": 12136448, "step": 2963 }, { "epoch": 2.2086438152011922, "grad_norm": 8.144200305170198, "learning_rate": 1.4775546339657765e-06, "loss": 0.2287, "num_input_tokens_seen": 12140544, "step": 2964 }, { "epoch": 2.209388971684054, "grad_norm": 8.942014368042118, "learning_rate": 1.4769791436917708e-06, "loss": 0.6115, "num_input_tokens_seen": 12144640, "step": 2965 }, { "epoch": 2.210134128166915, "grad_norm": 8.74010238104123, "learning_rate": 1.4764036036623165e-06, "loss": 0.3564, "num_input_tokens_seen": 12148736, "step": 2966 }, { "epoch": 2.2108792846497765, "grad_norm": 8.708938790114878, "learning_rate": 1.4758280140035752e-06, "loss": 0.232, "num_input_tokens_seen": 12152832, "step": 2967 }, { "epoch": 2.211624441132638, "grad_norm": 11.038514512168673, "learning_rate": 1.4752523748417207e-06, "loss": 0.3098, "num_input_tokens_seen": 12156928, "step": 2968 }, { "epoch": 2.212369597615499, "grad_norm": 9.926561987294647, "learning_rate": 1.4746766863029375e-06, "loss": 0.3678, "num_input_tokens_seen": 12161024, "step": 2969 }, { "epoch": 2.2131147540983607, "grad_norm": 11.842979244897176, "learning_rate": 1.4741009485134198e-06, "loss": 0.4757, "num_input_tokens_seen": 12165120, "step": 2970 }, { "epoch": 2.2138599105812222, "grad_norm": 10.054912182374304, "learning_rate": 1.4735251615993743e-06, "loss": 0.521, "num_input_tokens_seen": 12169216, "step": 2971 }, { "epoch": 2.2146050670640833, "grad_norm": 14.414718297452184, "learning_rate": 1.472949325687017e-06, "loss": 0.5305, "num_input_tokens_seen": 12173312, "step": 2972 }, { "epoch": 2.215350223546945, "grad_norm": 8.039594441785644, "learning_rate": 1.4723734409025755e-06, "loss": 0.215, "num_input_tokens_seen": 12177408, "step": 2973 }, { "epoch": 2.2160953800298064, "grad_norm": 8.64158480366841, "learning_rate": 1.4717975073722881e-06, "loss": 0.3233, "num_input_tokens_seen": 12181504, "step": 2974 }, { "epoch": 2.2168405365126675, "grad_norm": 8.766580285617612, "learning_rate": 1.4712215252224036e-06, "loss": 0.3932, "num_input_tokens_seen": 12185600, "step": 2975 }, { "epoch": 2.217585692995529, "grad_norm": 9.233015706393326, "learning_rate": 1.4706454945791806e-06, "loss": 0.348, "num_input_tokens_seen": 12189696, "step": 2976 }, { "epoch": 2.2183308494783907, "grad_norm": 7.560789212261153, "learning_rate": 1.4700694155688902e-06, "loss": 0.1446, "num_input_tokens_seen": 12193792, "step": 2977 }, { "epoch": 2.2190760059612518, "grad_norm": 8.04545923418257, "learning_rate": 1.469493288317812e-06, "loss": 0.4464, "num_input_tokens_seen": 12197888, "step": 2978 }, { "epoch": 2.2198211624441133, "grad_norm": 9.178834023000444, "learning_rate": 1.4689171129522375e-06, "loss": 0.6049, "num_input_tokens_seen": 12201984, "step": 2979 }, { "epoch": 2.220566318926975, "grad_norm": 8.409094631817089, "learning_rate": 1.4683408895984692e-06, "loss": 0.4506, "num_input_tokens_seen": 12206080, "step": 2980 }, { "epoch": 2.221311475409836, "grad_norm": 9.12975145624692, "learning_rate": 1.4677646183828182e-06, "loss": 0.4816, "num_input_tokens_seen": 12210176, "step": 2981 }, { "epoch": 2.2220566318926975, "grad_norm": 9.752773009949763, "learning_rate": 1.4671882994316083e-06, "loss": 0.4619, "num_input_tokens_seen": 12214272, "step": 2982 }, { "epoch": 2.222801788375559, "grad_norm": 7.7518095492020445, "learning_rate": 1.4666119328711723e-06, "loss": 0.3135, "num_input_tokens_seen": 12218368, "step": 2983 }, { "epoch": 2.22354694485842, "grad_norm": 7.459094948161748, "learning_rate": 1.4660355188278535e-06, "loss": 0.5205, "num_input_tokens_seen": 12222464, "step": 2984 }, { "epoch": 2.2242921013412817, "grad_norm": 9.641146151627973, "learning_rate": 1.465459057428007e-06, "loss": 0.4373, "num_input_tokens_seen": 12226560, "step": 2985 }, { "epoch": 2.225037257824143, "grad_norm": 9.937749924223354, "learning_rate": 1.4648825487979963e-06, "loss": 0.3545, "num_input_tokens_seen": 12230656, "step": 2986 }, { "epoch": 2.2257824143070044, "grad_norm": 11.625218545852238, "learning_rate": 1.4643059930641961e-06, "loss": 0.405, "num_input_tokens_seen": 12234752, "step": 2987 }, { "epoch": 2.226527570789866, "grad_norm": 9.262355638768039, "learning_rate": 1.4637293903529925e-06, "loss": 0.6535, "num_input_tokens_seen": 12238848, "step": 2988 }, { "epoch": 2.227272727272727, "grad_norm": 16.959667047795747, "learning_rate": 1.46315274079078e-06, "loss": 0.2777, "num_input_tokens_seen": 12242944, "step": 2989 }, { "epoch": 2.2280178837555886, "grad_norm": 8.59886971599426, "learning_rate": 1.4625760445039646e-06, "loss": 0.4808, "num_input_tokens_seen": 12247040, "step": 2990 }, { "epoch": 2.22876304023845, "grad_norm": 9.940579104595763, "learning_rate": 1.4619993016189626e-06, "loss": 0.2245, "num_input_tokens_seen": 12251136, "step": 2991 }, { "epoch": 2.2295081967213113, "grad_norm": 9.191849956006468, "learning_rate": 1.4614225122621997e-06, "loss": 0.3872, "num_input_tokens_seen": 12255232, "step": 2992 }, { "epoch": 2.230253353204173, "grad_norm": 10.076904163511108, "learning_rate": 1.4608456765601123e-06, "loss": 0.4147, "num_input_tokens_seen": 12259328, "step": 2993 }, { "epoch": 2.2309985096870344, "grad_norm": 8.42583364350881, "learning_rate": 1.4602687946391475e-06, "loss": 0.1431, "num_input_tokens_seen": 12263424, "step": 2994 }, { "epoch": 2.2317436661698955, "grad_norm": 7.396395705489502, "learning_rate": 1.4596918666257614e-06, "loss": 0.4753, "num_input_tokens_seen": 12267520, "step": 2995 }, { "epoch": 2.232488822652757, "grad_norm": 9.242540994865184, "learning_rate": 1.4591148926464206e-06, "loss": 0.3872, "num_input_tokens_seen": 12271616, "step": 2996 }, { "epoch": 2.2332339791356186, "grad_norm": 8.005753529235616, "learning_rate": 1.458537872827602e-06, "loss": 0.428, "num_input_tokens_seen": 12275712, "step": 2997 }, { "epoch": 2.2339791356184797, "grad_norm": 8.889524379354876, "learning_rate": 1.4579608072957927e-06, "loss": 0.2906, "num_input_tokens_seen": 12279808, "step": 2998 }, { "epoch": 2.2347242921013413, "grad_norm": 10.602866526587624, "learning_rate": 1.4573836961774894e-06, "loss": 0.2576, "num_input_tokens_seen": 12283904, "step": 2999 }, { "epoch": 2.235469448584203, "grad_norm": 9.375215615557245, "learning_rate": 1.456806539599199e-06, "loss": 0.4738, "num_input_tokens_seen": 12288000, "step": 3000 }, { "epoch": 2.236214605067064, "grad_norm": 18.512165316882456, "learning_rate": 1.4562293376874383e-06, "loss": 0.4262, "num_input_tokens_seen": 12292096, "step": 3001 }, { "epoch": 2.2369597615499255, "grad_norm": 9.647392670041164, "learning_rate": 1.4556520905687347e-06, "loss": 0.6103, "num_input_tokens_seen": 12296192, "step": 3002 }, { "epoch": 2.237704918032787, "grad_norm": 12.522970705574272, "learning_rate": 1.455074798369624e-06, "loss": 0.2693, "num_input_tokens_seen": 12300288, "step": 3003 }, { "epoch": 2.238450074515648, "grad_norm": 12.31502478802709, "learning_rate": 1.4544974612166532e-06, "loss": 0.2797, "num_input_tokens_seen": 12304384, "step": 3004 }, { "epoch": 2.2391952309985097, "grad_norm": 8.393168133033548, "learning_rate": 1.4539200792363786e-06, "loss": 0.3159, "num_input_tokens_seen": 12308480, "step": 3005 }, { "epoch": 2.2399403874813713, "grad_norm": 9.039357110005247, "learning_rate": 1.4533426525553663e-06, "loss": 0.5437, "num_input_tokens_seen": 12312576, "step": 3006 }, { "epoch": 2.2406855439642324, "grad_norm": 27.831204435739608, "learning_rate": 1.4527651813001928e-06, "loss": 0.1908, "num_input_tokens_seen": 12316672, "step": 3007 }, { "epoch": 2.241430700447094, "grad_norm": 8.963823797098492, "learning_rate": 1.4521876655974438e-06, "loss": 0.394, "num_input_tokens_seen": 12320768, "step": 3008 }, { "epoch": 2.2421758569299555, "grad_norm": 10.193616168652232, "learning_rate": 1.4516101055737148e-06, "loss": 0.5488, "num_input_tokens_seen": 12324864, "step": 3009 }, { "epoch": 2.2429210134128166, "grad_norm": 7.9589083822127735, "learning_rate": 1.4510325013556107e-06, "loss": 0.2477, "num_input_tokens_seen": 12328960, "step": 3010 }, { "epoch": 2.243666169895678, "grad_norm": 17.240499981479548, "learning_rate": 1.4504548530697468e-06, "loss": 0.5269, "num_input_tokens_seen": 12333056, "step": 3011 }, { "epoch": 2.2444113263785397, "grad_norm": 8.425274761637635, "learning_rate": 1.4498771608427478e-06, "loss": 0.3195, "num_input_tokens_seen": 12337152, "step": 3012 }, { "epoch": 2.245156482861401, "grad_norm": 7.992222956991637, "learning_rate": 1.4492994248012481e-06, "loss": 0.2579, "num_input_tokens_seen": 12341248, "step": 3013 }, { "epoch": 2.2459016393442623, "grad_norm": 7.45851990069447, "learning_rate": 1.4487216450718913e-06, "loss": 0.3826, "num_input_tokens_seen": 12345344, "step": 3014 }, { "epoch": 2.246646795827124, "grad_norm": 13.723474870717931, "learning_rate": 1.448143821781331e-06, "loss": 0.1891, "num_input_tokens_seen": 12349440, "step": 3015 }, { "epoch": 2.247391952309985, "grad_norm": 9.00006287696559, "learning_rate": 1.4475659550562298e-06, "loss": 0.2339, "num_input_tokens_seen": 12353536, "step": 3016 }, { "epoch": 2.2481371087928466, "grad_norm": 8.364663570578568, "learning_rate": 1.4469880450232606e-06, "loss": 0.4826, "num_input_tokens_seen": 12357632, "step": 3017 }, { "epoch": 2.248882265275708, "grad_norm": 7.982727053682381, "learning_rate": 1.4464100918091053e-06, "loss": 0.3406, "num_input_tokens_seen": 12361728, "step": 3018 }, { "epoch": 2.2496274217585692, "grad_norm": 7.804063565411053, "learning_rate": 1.4458320955404554e-06, "loss": 0.3919, "num_input_tokens_seen": 12365824, "step": 3019 }, { "epoch": 2.2503725782414308, "grad_norm": 18.152973786384905, "learning_rate": 1.4452540563440118e-06, "loss": 0.397, "num_input_tokens_seen": 12369920, "step": 3020 }, { "epoch": 2.251117734724292, "grad_norm": 8.885032070746728, "learning_rate": 1.444675974346485e-06, "loss": 0.431, "num_input_tokens_seen": 12374016, "step": 3021 }, { "epoch": 2.2518628912071534, "grad_norm": 7.369321463422459, "learning_rate": 1.4440978496745944e-06, "loss": 0.3393, "num_input_tokens_seen": 12378112, "step": 3022 }, { "epoch": 2.252608047690015, "grad_norm": 8.132986970028423, "learning_rate": 1.4435196824550694e-06, "loss": 0.523, "num_input_tokens_seen": 12382208, "step": 3023 }, { "epoch": 2.2533532041728765, "grad_norm": 11.713175319952935, "learning_rate": 1.4429414728146476e-06, "loss": 0.196, "num_input_tokens_seen": 12386304, "step": 3024 }, { "epoch": 2.2540983606557377, "grad_norm": 9.121047381934043, "learning_rate": 1.4423632208800775e-06, "loss": 0.5833, "num_input_tokens_seen": 12390400, "step": 3025 }, { "epoch": 2.254843517138599, "grad_norm": 12.372023055660804, "learning_rate": 1.4417849267781157e-06, "loss": 0.3832, "num_input_tokens_seen": 12394496, "step": 3026 }, { "epoch": 2.2555886736214603, "grad_norm": 7.519651057661377, "learning_rate": 1.4412065906355284e-06, "loss": 0.2309, "num_input_tokens_seen": 12398592, "step": 3027 }, { "epoch": 2.256333830104322, "grad_norm": 46.451244850624306, "learning_rate": 1.4406282125790913e-06, "loss": 0.4436, "num_input_tokens_seen": 12402688, "step": 3028 }, { "epoch": 2.2570789865871834, "grad_norm": 14.01389784112564, "learning_rate": 1.4400497927355888e-06, "loss": 0.5134, "num_input_tokens_seen": 12406784, "step": 3029 }, { "epoch": 2.2578241430700445, "grad_norm": 8.83200395562495, "learning_rate": 1.4394713312318147e-06, "loss": 0.5512, "num_input_tokens_seen": 12410880, "step": 3030 }, { "epoch": 2.258569299552906, "grad_norm": 8.699688139174258, "learning_rate": 1.4388928281945719e-06, "loss": 0.4545, "num_input_tokens_seen": 12414976, "step": 3031 }, { "epoch": 2.2593144560357676, "grad_norm": 9.145530274229953, "learning_rate": 1.4383142837506722e-06, "loss": 0.4957, "num_input_tokens_seen": 12419072, "step": 3032 }, { "epoch": 2.2600596125186287, "grad_norm": 8.619287148710248, "learning_rate": 1.4377356980269368e-06, "loss": 0.4584, "num_input_tokens_seen": 12423168, "step": 3033 }, { "epoch": 2.2608047690014903, "grad_norm": 9.048781612220388, "learning_rate": 1.4371570711501959e-06, "loss": 0.6032, "num_input_tokens_seen": 12427264, "step": 3034 }, { "epoch": 2.261549925484352, "grad_norm": 8.855975040337764, "learning_rate": 1.4365784032472886e-06, "loss": 0.4283, "num_input_tokens_seen": 12431360, "step": 3035 }, { "epoch": 2.262295081967213, "grad_norm": 7.947514385167568, "learning_rate": 1.435999694445063e-06, "loss": 0.3765, "num_input_tokens_seen": 12435456, "step": 3036 }, { "epoch": 2.2630402384500745, "grad_norm": 11.419063381588778, "learning_rate": 1.435420944870376e-06, "loss": 0.3062, "num_input_tokens_seen": 12439552, "step": 3037 }, { "epoch": 2.263785394932936, "grad_norm": 10.114911638999798, "learning_rate": 1.434842154650094e-06, "loss": 0.3711, "num_input_tokens_seen": 12443648, "step": 3038 }, { "epoch": 2.264530551415797, "grad_norm": 8.656606254111407, "learning_rate": 1.4342633239110917e-06, "loss": 0.3327, "num_input_tokens_seen": 12447744, "step": 3039 }, { "epoch": 2.2652757078986587, "grad_norm": 9.623355242492584, "learning_rate": 1.4336844527802536e-06, "loss": 0.5049, "num_input_tokens_seen": 12451840, "step": 3040 }, { "epoch": 2.2660208643815203, "grad_norm": 8.046186447965038, "learning_rate": 1.4331055413844714e-06, "loss": 0.517, "num_input_tokens_seen": 12455936, "step": 3041 }, { "epoch": 2.2667660208643814, "grad_norm": 9.362893783394076, "learning_rate": 1.432526589850647e-06, "loss": 0.5063, "num_input_tokens_seen": 12460032, "step": 3042 }, { "epoch": 2.267511177347243, "grad_norm": 13.613639516562888, "learning_rate": 1.431947598305691e-06, "loss": 0.533, "num_input_tokens_seen": 12464128, "step": 3043 }, { "epoch": 2.2682563338301045, "grad_norm": 9.923831220276243, "learning_rate": 1.431368566876522e-06, "loss": 0.4727, "num_input_tokens_seen": 12468224, "step": 3044 }, { "epoch": 2.2690014903129656, "grad_norm": 9.672332354654905, "learning_rate": 1.430789495690068e-06, "loss": 0.4546, "num_input_tokens_seen": 12472320, "step": 3045 }, { "epoch": 2.269746646795827, "grad_norm": 16.77320747196373, "learning_rate": 1.4302103848732656e-06, "loss": 0.6237, "num_input_tokens_seen": 12476416, "step": 3046 }, { "epoch": 2.2704918032786887, "grad_norm": 12.720243102887832, "learning_rate": 1.42963123455306e-06, "loss": 0.2744, "num_input_tokens_seen": 12480512, "step": 3047 }, { "epoch": 2.27123695976155, "grad_norm": 10.431616089119155, "learning_rate": 1.4290520448564055e-06, "loss": 0.4956, "num_input_tokens_seen": 12484608, "step": 3048 }, { "epoch": 2.2719821162444114, "grad_norm": 10.0097465567592, "learning_rate": 1.4284728159102637e-06, "loss": 0.398, "num_input_tokens_seen": 12488704, "step": 3049 }, { "epoch": 2.2727272727272725, "grad_norm": 9.811449726401447, "learning_rate": 1.4278935478416068e-06, "loss": 0.4017, "num_input_tokens_seen": 12492800, "step": 3050 }, { "epoch": 2.273472429210134, "grad_norm": 8.601169898094396, "learning_rate": 1.4273142407774135e-06, "loss": 0.5062, "num_input_tokens_seen": 12496896, "step": 3051 }, { "epoch": 2.2742175856929956, "grad_norm": 8.893830048676875, "learning_rate": 1.4267348948446725e-06, "loss": 0.2203, "num_input_tokens_seen": 12500992, "step": 3052 }, { "epoch": 2.274962742175857, "grad_norm": 8.037718444415049, "learning_rate": 1.4261555101703806e-06, "loss": 0.4127, "num_input_tokens_seen": 12505088, "step": 3053 }, { "epoch": 2.2757078986587183, "grad_norm": 9.769633546559266, "learning_rate": 1.425576086881543e-06, "loss": 0.5122, "num_input_tokens_seen": 12509184, "step": 3054 }, { "epoch": 2.27645305514158, "grad_norm": 9.737718091693917, "learning_rate": 1.4249966251051734e-06, "loss": 0.5266, "num_input_tokens_seen": 12513280, "step": 3055 }, { "epoch": 2.277198211624441, "grad_norm": 7.327321397076641, "learning_rate": 1.4244171249682945e-06, "loss": 0.4766, "num_input_tokens_seen": 12517376, "step": 3056 }, { "epoch": 2.2779433681073025, "grad_norm": 8.489914971275349, "learning_rate": 1.4238375865979356e-06, "loss": 0.4141, "num_input_tokens_seen": 12521472, "step": 3057 }, { "epoch": 2.278688524590164, "grad_norm": 7.4759094588147565, "learning_rate": 1.4232580101211369e-06, "loss": 0.423, "num_input_tokens_seen": 12525568, "step": 3058 }, { "epoch": 2.2794336810730256, "grad_norm": 10.091549288640016, "learning_rate": 1.422678395664945e-06, "loss": 0.3788, "num_input_tokens_seen": 12529664, "step": 3059 }, { "epoch": 2.2801788375558867, "grad_norm": 8.18087166476019, "learning_rate": 1.422098743356416e-06, "loss": 0.2648, "num_input_tokens_seen": 12533760, "step": 3060 }, { "epoch": 2.2809239940387482, "grad_norm": 9.165125032601814, "learning_rate": 1.421519053322613e-06, "loss": 0.4008, "num_input_tokens_seen": 12537856, "step": 3061 }, { "epoch": 2.2816691505216093, "grad_norm": 18.805796796230442, "learning_rate": 1.4209393256906095e-06, "loss": 0.5209, "num_input_tokens_seen": 12541952, "step": 3062 }, { "epoch": 2.282414307004471, "grad_norm": 9.18194624962541, "learning_rate": 1.4203595605874847e-06, "loss": 0.4565, "num_input_tokens_seen": 12546048, "step": 3063 }, { "epoch": 2.2831594634873325, "grad_norm": 10.097908952803078, "learning_rate": 1.4197797581403277e-06, "loss": 0.4115, "num_input_tokens_seen": 12550144, "step": 3064 }, { "epoch": 2.2839046199701936, "grad_norm": 9.924795150983762, "learning_rate": 1.4191999184762356e-06, "loss": 0.4202, "num_input_tokens_seen": 12554240, "step": 3065 }, { "epoch": 2.284649776453055, "grad_norm": 8.006057590114454, "learning_rate": 1.418620041722313e-06, "loss": 0.459, "num_input_tokens_seen": 12558336, "step": 3066 }, { "epoch": 2.2853949329359167, "grad_norm": 11.308175385401395, "learning_rate": 1.4180401280056735e-06, "loss": 0.3759, "num_input_tokens_seen": 12562432, "step": 3067 }, { "epoch": 2.2861400894187778, "grad_norm": 9.123020881148234, "learning_rate": 1.4174601774534377e-06, "loss": 0.4738, "num_input_tokens_seen": 12566528, "step": 3068 }, { "epoch": 2.2868852459016393, "grad_norm": 10.540713294729244, "learning_rate": 1.4168801901927356e-06, "loss": 0.7308, "num_input_tokens_seen": 12570624, "step": 3069 }, { "epoch": 2.287630402384501, "grad_norm": 9.550263284478296, "learning_rate": 1.416300166350704e-06, "loss": 0.4127, "num_input_tokens_seen": 12574720, "step": 3070 }, { "epoch": 2.288375558867362, "grad_norm": 11.664123155971744, "learning_rate": 1.415720106054488e-06, "loss": 0.2445, "num_input_tokens_seen": 12578816, "step": 3071 }, { "epoch": 2.2891207153502235, "grad_norm": 10.052194385918975, "learning_rate": 1.415140009431242e-06, "loss": 0.4603, "num_input_tokens_seen": 12582912, "step": 3072 }, { "epoch": 2.289865871833085, "grad_norm": 9.879709205093286, "learning_rate": 1.4145598766081264e-06, "loss": 0.2091, "num_input_tokens_seen": 12587008, "step": 3073 }, { "epoch": 2.290611028315946, "grad_norm": 8.701221872112534, "learning_rate": 1.4139797077123108e-06, "loss": 0.1509, "num_input_tokens_seen": 12591104, "step": 3074 }, { "epoch": 2.2913561847988078, "grad_norm": 14.772617786082794, "learning_rate": 1.4133995028709724e-06, "loss": 0.314, "num_input_tokens_seen": 12595200, "step": 3075 }, { "epoch": 2.2921013412816693, "grad_norm": 9.070463503511768, "learning_rate": 1.4128192622112962e-06, "loss": 0.3463, "num_input_tokens_seen": 12599296, "step": 3076 }, { "epoch": 2.2928464977645304, "grad_norm": 12.195549854808371, "learning_rate": 1.4122389858604756e-06, "loss": 0.4648, "num_input_tokens_seen": 12603392, "step": 3077 }, { "epoch": 2.293591654247392, "grad_norm": 8.282606324293305, "learning_rate": 1.4116586739457103e-06, "loss": 0.2547, "num_input_tokens_seen": 12607488, "step": 3078 }, { "epoch": 2.2943368107302535, "grad_norm": 9.034656826084966, "learning_rate": 1.4110783265942094e-06, "loss": 0.1662, "num_input_tokens_seen": 12611584, "step": 3079 }, { "epoch": 2.2950819672131146, "grad_norm": 10.992947070013722, "learning_rate": 1.410497943933189e-06, "loss": 0.3104, "num_input_tokens_seen": 12615680, "step": 3080 }, { "epoch": 2.295827123695976, "grad_norm": 10.790628345382554, "learning_rate": 1.409917526089873e-06, "loss": 0.4838, "num_input_tokens_seen": 12619776, "step": 3081 }, { "epoch": 2.2965722801788377, "grad_norm": 6.72757707991526, "learning_rate": 1.4093370731914937e-06, "loss": 0.3412, "num_input_tokens_seen": 12623872, "step": 3082 }, { "epoch": 2.297317436661699, "grad_norm": 10.007761749074254, "learning_rate": 1.4087565853652905e-06, "loss": 0.5774, "num_input_tokens_seen": 12627968, "step": 3083 }, { "epoch": 2.2980625931445604, "grad_norm": 8.630326332076853, "learning_rate": 1.4081760627385097e-06, "loss": 0.3416, "num_input_tokens_seen": 12632064, "step": 3084 }, { "epoch": 2.2988077496274215, "grad_norm": 9.458485749550459, "learning_rate": 1.4075955054384064e-06, "loss": 0.3756, "num_input_tokens_seen": 12636160, "step": 3085 }, { "epoch": 2.299552906110283, "grad_norm": 8.659082210697594, "learning_rate": 1.4070149135922434e-06, "loss": 0.308, "num_input_tokens_seen": 12640256, "step": 3086 }, { "epoch": 2.3002980625931446, "grad_norm": 11.577900317554795, "learning_rate": 1.4064342873272899e-06, "loss": 0.4044, "num_input_tokens_seen": 12644352, "step": 3087 }, { "epoch": 2.301043219076006, "grad_norm": 10.756409462921765, "learning_rate": 1.4058536267708236e-06, "loss": 0.5874, "num_input_tokens_seen": 12648448, "step": 3088 }, { "epoch": 2.3017883755588673, "grad_norm": 9.925451176238559, "learning_rate": 1.4052729320501295e-06, "loss": 0.2474, "num_input_tokens_seen": 12652544, "step": 3089 }, { "epoch": 2.302533532041729, "grad_norm": 10.523719792369167, "learning_rate": 1.4046922032924998e-06, "loss": 0.3157, "num_input_tokens_seen": 12656640, "step": 3090 }, { "epoch": 2.30327868852459, "grad_norm": 8.832571303059408, "learning_rate": 1.4041114406252346e-06, "loss": 0.4241, "num_input_tokens_seen": 12660736, "step": 3091 }, { "epoch": 2.3040238450074515, "grad_norm": 8.461262281758572, "learning_rate": 1.4035306441756412e-06, "loss": 0.5309, "num_input_tokens_seen": 12664832, "step": 3092 }, { "epoch": 2.304769001490313, "grad_norm": 9.429506229205538, "learning_rate": 1.402949814071034e-06, "loss": 0.4897, "num_input_tokens_seen": 12668928, "step": 3093 }, { "epoch": 2.3055141579731746, "grad_norm": 8.165267012565543, "learning_rate": 1.4023689504387359e-06, "loss": 0.3127, "num_input_tokens_seen": 12673024, "step": 3094 }, { "epoch": 2.3062593144560357, "grad_norm": 8.70469173712193, "learning_rate": 1.4017880534060757e-06, "loss": 0.3728, "num_input_tokens_seen": 12677120, "step": 3095 }, { "epoch": 2.3070044709388973, "grad_norm": 11.085066764867436, "learning_rate": 1.4012071231003905e-06, "loss": 0.2115, "num_input_tokens_seen": 12681216, "step": 3096 }, { "epoch": 2.3077496274217584, "grad_norm": 8.682378888339379, "learning_rate": 1.4006261596490236e-06, "loss": 0.3914, "num_input_tokens_seen": 12685312, "step": 3097 }, { "epoch": 2.30849478390462, "grad_norm": 7.866240935566413, "learning_rate": 1.4000451631793274e-06, "loss": 0.1664, "num_input_tokens_seen": 12689408, "step": 3098 }, { "epoch": 2.3092399403874815, "grad_norm": 9.639200725091957, "learning_rate": 1.3994641338186598e-06, "loss": 0.2612, "num_input_tokens_seen": 12693504, "step": 3099 }, { "epoch": 2.3099850968703426, "grad_norm": 11.601952006867819, "learning_rate": 1.3988830716943868e-06, "loss": 0.5036, "num_input_tokens_seen": 12697600, "step": 3100 }, { "epoch": 2.310730253353204, "grad_norm": 10.596231556514429, "learning_rate": 1.3983019769338813e-06, "loss": 0.3834, "num_input_tokens_seen": 12701696, "step": 3101 }, { "epoch": 2.3114754098360657, "grad_norm": 12.248217587187677, "learning_rate": 1.3977208496645237e-06, "loss": 0.3692, "num_input_tokens_seen": 12705792, "step": 3102 }, { "epoch": 2.312220566318927, "grad_norm": 9.883342034871639, "learning_rate": 1.397139690013701e-06, "loss": 0.4381, "num_input_tokens_seen": 12709888, "step": 3103 }, { "epoch": 2.3129657228017884, "grad_norm": 9.897135847296145, "learning_rate": 1.3965584981088076e-06, "loss": 0.2705, "num_input_tokens_seen": 12713984, "step": 3104 }, { "epoch": 2.31371087928465, "grad_norm": 8.930869041076745, "learning_rate": 1.3959772740772452e-06, "loss": 0.2593, "num_input_tokens_seen": 12718080, "step": 3105 }, { "epoch": 2.314456035767511, "grad_norm": 10.458199847326135, "learning_rate": 1.3953960180464212e-06, "loss": 0.6583, "num_input_tokens_seen": 12722176, "step": 3106 }, { "epoch": 2.3152011922503726, "grad_norm": 9.126587111750375, "learning_rate": 1.3948147301437523e-06, "loss": 0.4185, "num_input_tokens_seen": 12726272, "step": 3107 }, { "epoch": 2.315946348733234, "grad_norm": 9.744445892209178, "learning_rate": 1.3942334104966603e-06, "loss": 0.3559, "num_input_tokens_seen": 12730368, "step": 3108 }, { "epoch": 2.3166915052160952, "grad_norm": 11.103458457808477, "learning_rate": 1.3936520592325746e-06, "loss": 0.6587, "num_input_tokens_seen": 12734464, "step": 3109 }, { "epoch": 2.317436661698957, "grad_norm": 8.408418269365173, "learning_rate": 1.3930706764789325e-06, "loss": 0.4883, "num_input_tokens_seen": 12738560, "step": 3110 }, { "epoch": 2.3181818181818183, "grad_norm": 10.940436018391239, "learning_rate": 1.3924892623631758e-06, "loss": 0.4098, "num_input_tokens_seen": 12742656, "step": 3111 }, { "epoch": 2.3189269746646795, "grad_norm": 11.204219938999136, "learning_rate": 1.3919078170127555e-06, "loss": 0.3641, "num_input_tokens_seen": 12746752, "step": 3112 }, { "epoch": 2.319672131147541, "grad_norm": 8.666344071282506, "learning_rate": 1.3913263405551288e-06, "loss": 0.3532, "num_input_tokens_seen": 12750848, "step": 3113 }, { "epoch": 2.3204172876304026, "grad_norm": 11.114612717446596, "learning_rate": 1.3907448331177587e-06, "loss": 0.3681, "num_input_tokens_seen": 12754944, "step": 3114 }, { "epoch": 2.3211624441132637, "grad_norm": 11.563956315312229, "learning_rate": 1.3901632948281163e-06, "loss": 0.4868, "num_input_tokens_seen": 12759040, "step": 3115 }, { "epoch": 2.321907600596125, "grad_norm": 8.249360849532314, "learning_rate": 1.3895817258136788e-06, "loss": 0.5319, "num_input_tokens_seen": 12763136, "step": 3116 }, { "epoch": 2.3226527570789868, "grad_norm": 9.297068662648602, "learning_rate": 1.3890001262019303e-06, "loss": 0.5422, "num_input_tokens_seen": 12767232, "step": 3117 }, { "epoch": 2.323397913561848, "grad_norm": 8.96766374998429, "learning_rate": 1.3884184961203614e-06, "loss": 0.7065, "num_input_tokens_seen": 12771328, "step": 3118 }, { "epoch": 2.3241430700447094, "grad_norm": 8.685887992107784, "learning_rate": 1.3878368356964696e-06, "loss": 0.3173, "num_input_tokens_seen": 12775424, "step": 3119 }, { "epoch": 2.3248882265275705, "grad_norm": 9.09001976958286, "learning_rate": 1.3872551450577595e-06, "loss": 0.4693, "num_input_tokens_seen": 12779520, "step": 3120 }, { "epoch": 2.325633383010432, "grad_norm": 9.182042660569468, "learning_rate": 1.3866734243317415e-06, "loss": 0.4157, "num_input_tokens_seen": 12783616, "step": 3121 }, { "epoch": 2.3263785394932937, "grad_norm": 8.626600560676362, "learning_rate": 1.3860916736459333e-06, "loss": 0.2775, "num_input_tokens_seen": 12787712, "step": 3122 }, { "epoch": 2.327123695976155, "grad_norm": 8.835093638714161, "learning_rate": 1.3855098931278583e-06, "loss": 0.1326, "num_input_tokens_seen": 12791808, "step": 3123 }, { "epoch": 2.3278688524590163, "grad_norm": 8.277233625865208, "learning_rate": 1.3849280829050465e-06, "loss": 0.4771, "num_input_tokens_seen": 12795904, "step": 3124 }, { "epoch": 2.328614008941878, "grad_norm": 9.40596575780333, "learning_rate": 1.3843462431050359e-06, "loss": 0.4869, "num_input_tokens_seen": 12800000, "step": 3125 }, { "epoch": 2.329359165424739, "grad_norm": 11.610754381216033, "learning_rate": 1.3837643738553695e-06, "loss": 0.2273, "num_input_tokens_seen": 12804096, "step": 3126 }, { "epoch": 2.3301043219076005, "grad_norm": 11.068757778486228, "learning_rate": 1.383182475283597e-06, "loss": 0.5341, "num_input_tokens_seen": 12808192, "step": 3127 }, { "epoch": 2.330849478390462, "grad_norm": 8.251818489940879, "learning_rate": 1.3826005475172748e-06, "loss": 0.247, "num_input_tokens_seen": 12812288, "step": 3128 }, { "epoch": 2.3315946348733236, "grad_norm": 8.482501766053744, "learning_rate": 1.382018590683966e-06, "loss": 0.3658, "num_input_tokens_seen": 12816384, "step": 3129 }, { "epoch": 2.3323397913561847, "grad_norm": 11.356724311911721, "learning_rate": 1.3814366049112396e-06, "loss": 0.4468, "num_input_tokens_seen": 12820480, "step": 3130 }, { "epoch": 2.3330849478390463, "grad_norm": 10.658511368647817, "learning_rate": 1.3808545903266708e-06, "loss": 0.2607, "num_input_tokens_seen": 12824576, "step": 3131 }, { "epoch": 2.3338301043219074, "grad_norm": 8.472359950816218, "learning_rate": 1.3802725470578413e-06, "loss": 0.4223, "num_input_tokens_seen": 12828672, "step": 3132 }, { "epoch": 2.334575260804769, "grad_norm": 8.850976762350372, "learning_rate": 1.3796904752323392e-06, "loss": 0.2823, "num_input_tokens_seen": 12832768, "step": 3133 }, { "epoch": 2.3353204172876305, "grad_norm": 9.957707621702967, "learning_rate": 1.3791083749777587e-06, "loss": 0.5312, "num_input_tokens_seen": 12836864, "step": 3134 }, { "epoch": 2.3360655737704916, "grad_norm": 10.19617699202895, "learning_rate": 1.3785262464217006e-06, "loss": 0.4057, "num_input_tokens_seen": 12840960, "step": 3135 }, { "epoch": 2.336810730253353, "grad_norm": 10.642031655784846, "learning_rate": 1.3779440896917715e-06, "loss": 0.4281, "num_input_tokens_seen": 12845056, "step": 3136 }, { "epoch": 2.3375558867362147, "grad_norm": 8.921824917101512, "learning_rate": 1.3773619049155846e-06, "loss": 0.3196, "num_input_tokens_seen": 12849152, "step": 3137 }, { "epoch": 2.338301043219076, "grad_norm": 8.304293928312433, "learning_rate": 1.3767796922207585e-06, "loss": 0.3497, "num_input_tokens_seen": 12853248, "step": 3138 }, { "epoch": 2.3390461997019374, "grad_norm": 9.350208975906556, "learning_rate": 1.3761974517349186e-06, "loss": 0.7392, "num_input_tokens_seen": 12857344, "step": 3139 }, { "epoch": 2.339791356184799, "grad_norm": 8.757453778653847, "learning_rate": 1.3756151835856959e-06, "loss": 0.425, "num_input_tokens_seen": 12861440, "step": 3140 }, { "epoch": 2.34053651266766, "grad_norm": 11.07049934314105, "learning_rate": 1.3750328879007286e-06, "loss": 0.531, "num_input_tokens_seen": 12865536, "step": 3141 }, { "epoch": 2.3412816691505216, "grad_norm": 8.401646459527239, "learning_rate": 1.374450564807659e-06, "loss": 0.4377, "num_input_tokens_seen": 12869632, "step": 3142 }, { "epoch": 2.342026825633383, "grad_norm": 8.823218483286789, "learning_rate": 1.3738682144341372e-06, "loss": 0.3386, "num_input_tokens_seen": 12873728, "step": 3143 }, { "epoch": 2.3427719821162443, "grad_norm": 7.551373175404996, "learning_rate": 1.373285836907818e-06, "loss": 0.3569, "num_input_tokens_seen": 12877824, "step": 3144 }, { "epoch": 2.343517138599106, "grad_norm": 8.553303295937427, "learning_rate": 1.3727034323563632e-06, "loss": 0.2613, "num_input_tokens_seen": 12881920, "step": 3145 }, { "epoch": 2.3442622950819674, "grad_norm": 8.718239059725924, "learning_rate": 1.3721210009074399e-06, "loss": 0.4936, "num_input_tokens_seen": 12886016, "step": 3146 }, { "epoch": 2.3450074515648285, "grad_norm": 9.347546788938462, "learning_rate": 1.3715385426887208e-06, "loss": 0.4816, "num_input_tokens_seen": 12890112, "step": 3147 }, { "epoch": 2.34575260804769, "grad_norm": 24.052830423779373, "learning_rate": 1.3709560578278856e-06, "loss": 0.5208, "num_input_tokens_seen": 12894208, "step": 3148 }, { "epoch": 2.3464977645305516, "grad_norm": 9.418574994444096, "learning_rate": 1.3703735464526188e-06, "loss": 0.2994, "num_input_tokens_seen": 12898304, "step": 3149 }, { "epoch": 2.3472429210134127, "grad_norm": 9.362118647945575, "learning_rate": 1.3697910086906112e-06, "loss": 0.3681, "num_input_tokens_seen": 12902400, "step": 3150 }, { "epoch": 2.3479880774962743, "grad_norm": 8.272407001116836, "learning_rate": 1.3692084446695589e-06, "loss": 0.2217, "num_input_tokens_seen": 12906496, "step": 3151 }, { "epoch": 2.348733233979136, "grad_norm": 9.777099960186678, "learning_rate": 1.3686258545171643e-06, "loss": 0.4265, "num_input_tokens_seen": 12910592, "step": 3152 }, { "epoch": 2.349478390461997, "grad_norm": 9.376070056996264, "learning_rate": 1.3680432383611353e-06, "loss": 0.3955, "num_input_tokens_seen": 12914688, "step": 3153 }, { "epoch": 2.3502235469448585, "grad_norm": 10.476516650883962, "learning_rate": 1.3674605963291856e-06, "loss": 0.49, "num_input_tokens_seen": 12918784, "step": 3154 }, { "epoch": 2.3509687034277196, "grad_norm": 8.624155603604473, "learning_rate": 1.3668779285490344e-06, "loss": 0.3849, "num_input_tokens_seen": 12922880, "step": 3155 }, { "epoch": 2.351713859910581, "grad_norm": 10.21291288130137, "learning_rate": 1.3662952351484069e-06, "loss": 0.2853, "num_input_tokens_seen": 12926976, "step": 3156 }, { "epoch": 2.3524590163934427, "grad_norm": 9.995872056414642, "learning_rate": 1.3657125162550333e-06, "loss": 0.5103, "num_input_tokens_seen": 12931072, "step": 3157 }, { "epoch": 2.3532041728763042, "grad_norm": 10.724647477813813, "learning_rate": 1.3651297719966496e-06, "loss": 0.4166, "num_input_tokens_seen": 12935168, "step": 3158 }, { "epoch": 2.3539493293591653, "grad_norm": 8.514590878538806, "learning_rate": 1.3645470025009984e-06, "loss": 0.2278, "num_input_tokens_seen": 12939264, "step": 3159 }, { "epoch": 2.354694485842027, "grad_norm": 19.913282604461138, "learning_rate": 1.363964207895826e-06, "loss": 0.4586, "num_input_tokens_seen": 12943360, "step": 3160 }, { "epoch": 2.355439642324888, "grad_norm": 7.755207100318981, "learning_rate": 1.3633813883088854e-06, "loss": 0.2853, "num_input_tokens_seen": 12947456, "step": 3161 }, { "epoch": 2.3561847988077496, "grad_norm": 8.18680593626515, "learning_rate": 1.3627985438679348e-06, "loss": 0.492, "num_input_tokens_seen": 12951552, "step": 3162 }, { "epoch": 2.356929955290611, "grad_norm": 9.579564219037925, "learning_rate": 1.362215674700738e-06, "loss": 0.5406, "num_input_tokens_seen": 12955648, "step": 3163 }, { "epoch": 2.3576751117734727, "grad_norm": 11.934937961742738, "learning_rate": 1.3616327809350642e-06, "loss": 0.4802, "num_input_tokens_seen": 12959744, "step": 3164 }, { "epoch": 2.3584202682563338, "grad_norm": 22.12931361799688, "learning_rate": 1.3610498626986874e-06, "loss": 0.2746, "num_input_tokens_seen": 12963840, "step": 3165 }, { "epoch": 2.3591654247391953, "grad_norm": 8.822001251946968, "learning_rate": 1.3604669201193876e-06, "loss": 0.3594, "num_input_tokens_seen": 12967936, "step": 3166 }, { "epoch": 2.3599105812220564, "grad_norm": 11.59068086139999, "learning_rate": 1.3598839533249502e-06, "loss": 0.4439, "num_input_tokens_seen": 12972032, "step": 3167 }, { "epoch": 2.360655737704918, "grad_norm": 8.62703300328786, "learning_rate": 1.359300962443166e-06, "loss": 0.3499, "num_input_tokens_seen": 12976128, "step": 3168 }, { "epoch": 2.3614008941877795, "grad_norm": 11.139726269082304, "learning_rate": 1.35871794760183e-06, "loss": 0.325, "num_input_tokens_seen": 12980224, "step": 3169 }, { "epoch": 2.3621460506706407, "grad_norm": 8.643112416261626, "learning_rate": 1.3581349089287438e-06, "loss": 0.3942, "num_input_tokens_seen": 12984320, "step": 3170 }, { "epoch": 2.362891207153502, "grad_norm": 8.583174758881663, "learning_rate": 1.3575518465517132e-06, "loss": 0.2676, "num_input_tokens_seen": 12988416, "step": 3171 }, { "epoch": 2.3636363636363638, "grad_norm": 9.898274496423994, "learning_rate": 1.35696876059855e-06, "loss": 0.5905, "num_input_tokens_seen": 12992512, "step": 3172 }, { "epoch": 2.364381520119225, "grad_norm": 9.065349052061933, "learning_rate": 1.3563856511970705e-06, "loss": 0.3601, "num_input_tokens_seen": 12996608, "step": 3173 }, { "epoch": 2.3651266766020864, "grad_norm": 9.806065157291108, "learning_rate": 1.3558025184750964e-06, "loss": 0.5748, "num_input_tokens_seen": 13000704, "step": 3174 }, { "epoch": 2.365871833084948, "grad_norm": 8.243856527114119, "learning_rate": 1.3552193625604551e-06, "loss": 0.5193, "num_input_tokens_seen": 13004800, "step": 3175 }, { "epoch": 2.366616989567809, "grad_norm": 9.402196216978181, "learning_rate": 1.3546361835809782e-06, "loss": 0.3043, "num_input_tokens_seen": 13008896, "step": 3176 }, { "epoch": 2.3673621460506706, "grad_norm": 10.229476135023116, "learning_rate": 1.3540529816645025e-06, "loss": 0.4603, "num_input_tokens_seen": 13012992, "step": 3177 }, { "epoch": 2.368107302533532, "grad_norm": 8.373028218166228, "learning_rate": 1.3534697569388704e-06, "loss": 0.5477, "num_input_tokens_seen": 13017088, "step": 3178 }, { "epoch": 2.3688524590163933, "grad_norm": 12.305063187212266, "learning_rate": 1.3528865095319287e-06, "loss": 0.263, "num_input_tokens_seen": 13021184, "step": 3179 }, { "epoch": 2.369597615499255, "grad_norm": 15.411338282030519, "learning_rate": 1.3523032395715293e-06, "loss": 0.3961, "num_input_tokens_seen": 13025280, "step": 3180 }, { "epoch": 2.3703427719821164, "grad_norm": 7.944124749439036, "learning_rate": 1.3517199471855294e-06, "loss": 0.4476, "num_input_tokens_seen": 13029376, "step": 3181 }, { "epoch": 2.3710879284649775, "grad_norm": 8.792660168070572, "learning_rate": 1.3511366325017905e-06, "loss": 0.6291, "num_input_tokens_seen": 13033472, "step": 3182 }, { "epoch": 2.371833084947839, "grad_norm": 8.696311137671806, "learning_rate": 1.3505532956481798e-06, "loss": 0.3292, "num_input_tokens_seen": 13037568, "step": 3183 }, { "epoch": 2.3725782414307006, "grad_norm": 8.429488906789171, "learning_rate": 1.3499699367525686e-06, "loss": 0.2436, "num_input_tokens_seen": 13041664, "step": 3184 }, { "epoch": 2.3733233979135617, "grad_norm": 8.557070858588874, "learning_rate": 1.3493865559428338e-06, "loss": 0.593, "num_input_tokens_seen": 13045760, "step": 3185 }, { "epoch": 2.3740685543964233, "grad_norm": 11.442541922121876, "learning_rate": 1.3488031533468558e-06, "loss": 0.2313, "num_input_tokens_seen": 13049856, "step": 3186 }, { "epoch": 2.374813710879285, "grad_norm": 8.892198086478988, "learning_rate": 1.3482197290925212e-06, "loss": 0.5581, "num_input_tokens_seen": 13053952, "step": 3187 }, { "epoch": 2.375558867362146, "grad_norm": 8.11469892187768, "learning_rate": 1.3476362833077205e-06, "loss": 0.4513, "num_input_tokens_seen": 13058048, "step": 3188 }, { "epoch": 2.3763040238450075, "grad_norm": 7.812198612642774, "learning_rate": 1.3470528161203493e-06, "loss": 0.4965, "num_input_tokens_seen": 13062144, "step": 3189 }, { "epoch": 2.3770491803278686, "grad_norm": 10.264788137005198, "learning_rate": 1.3464693276583082e-06, "loss": 0.5795, "num_input_tokens_seen": 13066240, "step": 3190 }, { "epoch": 2.37779433681073, "grad_norm": 9.202713575851215, "learning_rate": 1.3458858180495014e-06, "loss": 0.2668, "num_input_tokens_seen": 13070336, "step": 3191 }, { "epoch": 2.3785394932935917, "grad_norm": 8.366259822397287, "learning_rate": 1.3453022874218385e-06, "loss": 0.264, "num_input_tokens_seen": 13074432, "step": 3192 }, { "epoch": 2.3792846497764533, "grad_norm": 8.554028462880556, "learning_rate": 1.3447187359032335e-06, "loss": 0.4246, "num_input_tokens_seen": 13078528, "step": 3193 }, { "epoch": 2.3800298062593144, "grad_norm": 9.233292875007306, "learning_rate": 1.3441351636216054e-06, "loss": 0.2187, "num_input_tokens_seen": 13082624, "step": 3194 }, { "epoch": 2.380774962742176, "grad_norm": 9.38793014504804, "learning_rate": 1.3435515707048773e-06, "loss": 0.4262, "num_input_tokens_seen": 13086720, "step": 3195 }, { "epoch": 2.381520119225037, "grad_norm": 8.485723246661623, "learning_rate": 1.342967957280977e-06, "loss": 0.3607, "num_input_tokens_seen": 13090816, "step": 3196 }, { "epoch": 2.3822652757078986, "grad_norm": 9.263138330688777, "learning_rate": 1.3423843234778367e-06, "loss": 0.2279, "num_input_tokens_seen": 13094912, "step": 3197 }, { "epoch": 2.38301043219076, "grad_norm": 10.224608537799343, "learning_rate": 1.3418006694233928e-06, "loss": 0.3399, "num_input_tokens_seen": 13099008, "step": 3198 }, { "epoch": 2.3837555886736217, "grad_norm": 10.581815530765235, "learning_rate": 1.3412169952455867e-06, "loss": 0.7625, "num_input_tokens_seen": 13103104, "step": 3199 }, { "epoch": 2.384500745156483, "grad_norm": 7.918566961362681, "learning_rate": 1.3406333010723638e-06, "loss": 0.3541, "num_input_tokens_seen": 13107200, "step": 3200 }, { "epoch": 2.3852459016393444, "grad_norm": 9.336725406802723, "learning_rate": 1.340049587031674e-06, "loss": 0.4626, "num_input_tokens_seen": 13111296, "step": 3201 }, { "epoch": 2.3859910581222055, "grad_norm": 18.210226516118183, "learning_rate": 1.3394658532514718e-06, "loss": 0.3186, "num_input_tokens_seen": 13115392, "step": 3202 }, { "epoch": 2.386736214605067, "grad_norm": 8.782339886443479, "learning_rate": 1.3388820998597158e-06, "loss": 0.2772, "num_input_tokens_seen": 13119488, "step": 3203 }, { "epoch": 2.3874813710879286, "grad_norm": 8.961443650919376, "learning_rate": 1.3382983269843692e-06, "loss": 0.2838, "num_input_tokens_seen": 13123584, "step": 3204 }, { "epoch": 2.3882265275707897, "grad_norm": 14.307293215423032, "learning_rate": 1.3377145347533984e-06, "loss": 0.328, "num_input_tokens_seen": 13127680, "step": 3205 }, { "epoch": 2.3889716840536512, "grad_norm": 11.110837357742245, "learning_rate": 1.3371307232947753e-06, "loss": 0.2418, "num_input_tokens_seen": 13131776, "step": 3206 }, { "epoch": 2.389716840536513, "grad_norm": 11.477696365177902, "learning_rate": 1.3365468927364755e-06, "loss": 0.2957, "num_input_tokens_seen": 13135872, "step": 3207 }, { "epoch": 2.390461997019374, "grad_norm": 10.996538711107663, "learning_rate": 1.3359630432064788e-06, "loss": 0.466, "num_input_tokens_seen": 13139968, "step": 3208 }, { "epoch": 2.3912071535022354, "grad_norm": 9.580661906092455, "learning_rate": 1.3353791748327691e-06, "loss": 0.3179, "num_input_tokens_seen": 13144064, "step": 3209 }, { "epoch": 2.391952309985097, "grad_norm": 8.801038293387935, "learning_rate": 1.3347952877433346e-06, "loss": 0.3614, "num_input_tokens_seen": 13148160, "step": 3210 }, { "epoch": 2.392697466467958, "grad_norm": 10.308798726072451, "learning_rate": 1.3342113820661677e-06, "loss": 0.4377, "num_input_tokens_seen": 13152256, "step": 3211 }, { "epoch": 2.3934426229508197, "grad_norm": 6.690301935612252, "learning_rate": 1.3336274579292646e-06, "loss": 0.2243, "num_input_tokens_seen": 13156352, "step": 3212 }, { "epoch": 2.394187779433681, "grad_norm": 9.47800942953177, "learning_rate": 1.3330435154606252e-06, "loss": 0.6157, "num_input_tokens_seen": 13160448, "step": 3213 }, { "epoch": 2.3949329359165423, "grad_norm": 11.135476960420872, "learning_rate": 1.332459554788255e-06, "loss": 0.4609, "num_input_tokens_seen": 13164544, "step": 3214 }, { "epoch": 2.395678092399404, "grad_norm": 7.876504754824605, "learning_rate": 1.3318755760401608e-06, "loss": 0.3736, "num_input_tokens_seen": 13168640, "step": 3215 }, { "epoch": 2.3964232488822654, "grad_norm": 9.424830889944218, "learning_rate": 1.3312915793443559e-06, "loss": 0.6219, "num_input_tokens_seen": 13172736, "step": 3216 }, { "epoch": 2.3971684053651265, "grad_norm": 9.877905197965546, "learning_rate": 1.3307075648288564e-06, "loss": 0.4362, "num_input_tokens_seen": 13176832, "step": 3217 }, { "epoch": 2.397913561847988, "grad_norm": 9.675367269990163, "learning_rate": 1.3301235326216824e-06, "loss": 0.3646, "num_input_tokens_seen": 13180928, "step": 3218 }, { "epoch": 2.3986587183308496, "grad_norm": 10.590111608310869, "learning_rate": 1.329539482850858e-06, "loss": 0.5394, "num_input_tokens_seen": 13185024, "step": 3219 }, { "epoch": 2.3994038748137108, "grad_norm": 9.55673271432384, "learning_rate": 1.3289554156444106e-06, "loss": 0.2907, "num_input_tokens_seen": 13189120, "step": 3220 }, { "epoch": 2.4001490312965723, "grad_norm": 9.4986999747362, "learning_rate": 1.3283713311303725e-06, "loss": 0.3105, "num_input_tokens_seen": 13193216, "step": 3221 }, { "epoch": 2.400894187779434, "grad_norm": 7.812865081345063, "learning_rate": 1.3277872294367795e-06, "loss": 0.605, "num_input_tokens_seen": 13197312, "step": 3222 }, { "epoch": 2.401639344262295, "grad_norm": 7.688101887799089, "learning_rate": 1.3272031106916698e-06, "loss": 0.3406, "num_input_tokens_seen": 13201408, "step": 3223 }, { "epoch": 2.4023845007451565, "grad_norm": 9.069851972353685, "learning_rate": 1.326618975023087e-06, "loss": 0.3305, "num_input_tokens_seen": 13205504, "step": 3224 }, { "epoch": 2.4031296572280176, "grad_norm": 14.310673856969668, "learning_rate": 1.326034822559078e-06, "loss": 0.3749, "num_input_tokens_seen": 13209600, "step": 3225 }, { "epoch": 2.403874813710879, "grad_norm": 7.510622974725747, "learning_rate": 1.3254506534276927e-06, "loss": 0.4594, "num_input_tokens_seen": 13213696, "step": 3226 }, { "epoch": 2.4046199701937407, "grad_norm": 7.723875905175828, "learning_rate": 1.3248664677569853e-06, "loss": 0.0833, "num_input_tokens_seen": 13217792, "step": 3227 }, { "epoch": 2.4053651266766023, "grad_norm": 10.030413533844811, "learning_rate": 1.3242822656750138e-06, "loss": 0.2093, "num_input_tokens_seen": 13221888, "step": 3228 }, { "epoch": 2.4061102831594634, "grad_norm": 8.987486212025857, "learning_rate": 1.3236980473098393e-06, "loss": 0.6045, "num_input_tokens_seen": 13225984, "step": 3229 }, { "epoch": 2.406855439642325, "grad_norm": 8.541490785070836, "learning_rate": 1.3231138127895262e-06, "loss": 0.2727, "num_input_tokens_seen": 13230080, "step": 3230 }, { "epoch": 2.407600596125186, "grad_norm": 9.615627421584099, "learning_rate": 1.3225295622421438e-06, "loss": 0.2966, "num_input_tokens_seen": 13234176, "step": 3231 }, { "epoch": 2.4083457526080476, "grad_norm": 14.367294044429906, "learning_rate": 1.3219452957957632e-06, "loss": 0.4964, "num_input_tokens_seen": 13238272, "step": 3232 }, { "epoch": 2.409090909090909, "grad_norm": 12.487198096347777, "learning_rate": 1.3213610135784603e-06, "loss": 0.348, "num_input_tokens_seen": 13242368, "step": 3233 }, { "epoch": 2.4098360655737707, "grad_norm": 9.196692226285217, "learning_rate": 1.3207767157183132e-06, "loss": 0.4552, "num_input_tokens_seen": 13246464, "step": 3234 }, { "epoch": 2.410581222056632, "grad_norm": 10.724178818492181, "learning_rate": 1.3201924023434048e-06, "loss": 0.3017, "num_input_tokens_seen": 13250560, "step": 3235 }, { "epoch": 2.4113263785394934, "grad_norm": 9.012811643167659, "learning_rate": 1.3196080735818206e-06, "loss": 0.4062, "num_input_tokens_seen": 13254656, "step": 3236 }, { "epoch": 2.4120715350223545, "grad_norm": 8.25059665288634, "learning_rate": 1.3190237295616495e-06, "loss": 0.3222, "num_input_tokens_seen": 13258752, "step": 3237 }, { "epoch": 2.412816691505216, "grad_norm": 9.86121716579436, "learning_rate": 1.3184393704109844e-06, "loss": 0.1969, "num_input_tokens_seen": 13262848, "step": 3238 }, { "epoch": 2.4135618479880776, "grad_norm": 12.065073876397994, "learning_rate": 1.3178549962579204e-06, "loss": 0.5332, "num_input_tokens_seen": 13266944, "step": 3239 }, { "epoch": 2.4143070044709387, "grad_norm": 8.922862721933189, "learning_rate": 1.3172706072305563e-06, "loss": 0.2303, "num_input_tokens_seen": 13271040, "step": 3240 }, { "epoch": 2.4150521609538003, "grad_norm": 9.731478444109998, "learning_rate": 1.3166862034569955e-06, "loss": 0.3537, "num_input_tokens_seen": 13275136, "step": 3241 }, { "epoch": 2.415797317436662, "grad_norm": 10.5798305251109, "learning_rate": 1.316101785065342e-06, "loss": 0.1919, "num_input_tokens_seen": 13279232, "step": 3242 }, { "epoch": 2.416542473919523, "grad_norm": 11.523177430790302, "learning_rate": 1.3155173521837056e-06, "loss": 0.4534, "num_input_tokens_seen": 13283328, "step": 3243 }, { "epoch": 2.4172876304023845, "grad_norm": 8.157393974831477, "learning_rate": 1.314932904940198e-06, "loss": 0.3452, "num_input_tokens_seen": 13287424, "step": 3244 }, { "epoch": 2.418032786885246, "grad_norm": 9.545974772394313, "learning_rate": 1.3143484434629333e-06, "loss": 0.3315, "num_input_tokens_seen": 13291520, "step": 3245 }, { "epoch": 2.418777943368107, "grad_norm": 11.777449141438003, "learning_rate": 1.3137639678800307e-06, "loss": 0.4886, "num_input_tokens_seen": 13295616, "step": 3246 }, { "epoch": 2.4195230998509687, "grad_norm": 9.829829255951656, "learning_rate": 1.3131794783196108e-06, "loss": 0.2585, "num_input_tokens_seen": 13299712, "step": 3247 }, { "epoch": 2.4202682563338302, "grad_norm": 11.009978851818994, "learning_rate": 1.3125949749097982e-06, "loss": 0.5271, "num_input_tokens_seen": 13303808, "step": 3248 }, { "epoch": 2.4210134128166914, "grad_norm": 8.974433223075957, "learning_rate": 1.3120104577787202e-06, "loss": 0.2492, "num_input_tokens_seen": 13307904, "step": 3249 }, { "epoch": 2.421758569299553, "grad_norm": 7.775492686385837, "learning_rate": 1.3114259270545074e-06, "loss": 0.3467, "num_input_tokens_seen": 13312000, "step": 3250 }, { "epoch": 2.4225037257824145, "grad_norm": 9.48663429092318, "learning_rate": 1.3108413828652926e-06, "loss": 0.4668, "num_input_tokens_seen": 13316096, "step": 3251 }, { "epoch": 2.4232488822652756, "grad_norm": 10.586435115089126, "learning_rate": 1.310256825339212e-06, "loss": 0.2925, "num_input_tokens_seen": 13320192, "step": 3252 }, { "epoch": 2.423994038748137, "grad_norm": 7.774925883098737, "learning_rate": 1.3096722546044049e-06, "loss": 0.1664, "num_input_tokens_seen": 13324288, "step": 3253 }, { "epoch": 2.4247391952309987, "grad_norm": 10.006200341727293, "learning_rate": 1.3090876707890138e-06, "loss": 0.4095, "num_input_tokens_seen": 13328384, "step": 3254 }, { "epoch": 2.42548435171386, "grad_norm": 8.961036545931139, "learning_rate": 1.3085030740211832e-06, "loss": 0.345, "num_input_tokens_seen": 13332480, "step": 3255 }, { "epoch": 2.4262295081967213, "grad_norm": 9.898609226664362, "learning_rate": 1.307918464429061e-06, "loss": 0.7363, "num_input_tokens_seen": 13336576, "step": 3256 }, { "epoch": 2.426974664679583, "grad_norm": 10.334640718112341, "learning_rate": 1.3073338421407977e-06, "loss": 0.2426, "num_input_tokens_seen": 13340672, "step": 3257 }, { "epoch": 2.427719821162444, "grad_norm": 10.311786254284266, "learning_rate": 1.3067492072845472e-06, "loss": 0.3555, "num_input_tokens_seen": 13344768, "step": 3258 }, { "epoch": 2.4284649776453056, "grad_norm": 14.03060615639998, "learning_rate": 1.3061645599884648e-06, "loss": 0.3606, "num_input_tokens_seen": 13348864, "step": 3259 }, { "epoch": 2.429210134128167, "grad_norm": 9.652000779677717, "learning_rate": 1.3055799003807104e-06, "loss": 0.3475, "num_input_tokens_seen": 13352960, "step": 3260 }, { "epoch": 2.429955290611028, "grad_norm": 8.15496786727451, "learning_rate": 1.3049952285894444e-06, "loss": 0.3438, "num_input_tokens_seen": 13357056, "step": 3261 }, { "epoch": 2.4307004470938898, "grad_norm": 7.995275220963563, "learning_rate": 1.3044105447428319e-06, "loss": 0.2783, "num_input_tokens_seen": 13361152, "step": 3262 }, { "epoch": 2.4314456035767513, "grad_norm": 9.406978163594278, "learning_rate": 1.303825848969039e-06, "loss": 0.3821, "num_input_tokens_seen": 13365248, "step": 3263 }, { "epoch": 2.4321907600596124, "grad_norm": 7.621906473854574, "learning_rate": 1.303241141396236e-06, "loss": 0.1717, "num_input_tokens_seen": 13369344, "step": 3264 }, { "epoch": 2.432935916542474, "grad_norm": 12.027871965255821, "learning_rate": 1.3026564221525948e-06, "loss": 0.416, "num_input_tokens_seen": 13373440, "step": 3265 }, { "epoch": 2.433681073025335, "grad_norm": 11.723913305519872, "learning_rate": 1.3020716913662896e-06, "loss": 0.4053, "num_input_tokens_seen": 13377536, "step": 3266 }, { "epoch": 2.4344262295081966, "grad_norm": 8.707862153157215, "learning_rate": 1.3014869491654977e-06, "loss": 0.664, "num_input_tokens_seen": 13381632, "step": 3267 }, { "epoch": 2.435171385991058, "grad_norm": 7.9197061634326555, "learning_rate": 1.3009021956783994e-06, "loss": 0.3354, "num_input_tokens_seen": 13385728, "step": 3268 }, { "epoch": 2.4359165424739198, "grad_norm": 9.316546512966456, "learning_rate": 1.3003174310331758e-06, "loss": 0.3743, "num_input_tokens_seen": 13389824, "step": 3269 }, { "epoch": 2.436661698956781, "grad_norm": 9.700557748007613, "learning_rate": 1.2997326553580122e-06, "loss": 0.4002, "num_input_tokens_seen": 13393920, "step": 3270 }, { "epoch": 2.4374068554396424, "grad_norm": 9.048588237218956, "learning_rate": 1.2991478687810956e-06, "loss": 0.395, "num_input_tokens_seen": 13398016, "step": 3271 }, { "epoch": 2.4381520119225035, "grad_norm": 11.877656463700239, "learning_rate": 1.2985630714306152e-06, "loss": 0.3988, "num_input_tokens_seen": 13402112, "step": 3272 }, { "epoch": 2.438897168405365, "grad_norm": 11.682433903389548, "learning_rate": 1.2979782634347627e-06, "loss": 0.3688, "num_input_tokens_seen": 13406208, "step": 3273 }, { "epoch": 2.4396423248882266, "grad_norm": 11.44342199831286, "learning_rate": 1.2973934449217321e-06, "loss": 0.4492, "num_input_tokens_seen": 13410304, "step": 3274 }, { "epoch": 2.4403874813710877, "grad_norm": 13.589849442892023, "learning_rate": 1.29680861601972e-06, "loss": 0.5288, "num_input_tokens_seen": 13414400, "step": 3275 }, { "epoch": 2.4411326378539493, "grad_norm": 9.1887808796808, "learning_rate": 1.296223776856925e-06, "loss": 0.3587, "num_input_tokens_seen": 13418496, "step": 3276 }, { "epoch": 2.441877794336811, "grad_norm": 9.740598038099556, "learning_rate": 1.2956389275615483e-06, "loss": 0.2278, "num_input_tokens_seen": 13422592, "step": 3277 }, { "epoch": 2.442622950819672, "grad_norm": 8.991013438800739, "learning_rate": 1.2950540682617926e-06, "loss": 0.53, "num_input_tokens_seen": 13426688, "step": 3278 }, { "epoch": 2.4433681073025335, "grad_norm": 8.881309386501982, "learning_rate": 1.2944691990858638e-06, "loss": 0.4296, "num_input_tokens_seen": 13430784, "step": 3279 }, { "epoch": 2.444113263785395, "grad_norm": 9.987574261222738, "learning_rate": 1.2938843201619687e-06, "loss": 0.6245, "num_input_tokens_seen": 13434880, "step": 3280 }, { "epoch": 2.444858420268256, "grad_norm": 8.055882425334374, "learning_rate": 1.293299431618317e-06, "loss": 0.4452, "num_input_tokens_seen": 13438976, "step": 3281 }, { "epoch": 2.4456035767511177, "grad_norm": 8.741688522562063, "learning_rate": 1.292714533583121e-06, "loss": 0.6708, "num_input_tokens_seen": 13443072, "step": 3282 }, { "epoch": 2.4463487332339793, "grad_norm": 8.410123123487136, "learning_rate": 1.2921296261845939e-06, "loss": 0.2072, "num_input_tokens_seen": 13447168, "step": 3283 }, { "epoch": 2.4470938897168404, "grad_norm": 15.112145206368712, "learning_rate": 1.291544709550952e-06, "loss": 0.2924, "num_input_tokens_seen": 13451264, "step": 3284 }, { "epoch": 2.447839046199702, "grad_norm": 9.232633652456018, "learning_rate": 1.2909597838104135e-06, "loss": 0.4232, "num_input_tokens_seen": 13455360, "step": 3285 }, { "epoch": 2.4485842026825635, "grad_norm": 11.000651011038068, "learning_rate": 1.2903748490911976e-06, "loss": 0.2544, "num_input_tokens_seen": 13459456, "step": 3286 }, { "epoch": 2.4493293591654246, "grad_norm": 8.544246355775243, "learning_rate": 1.289789905521527e-06, "loss": 0.3442, "num_input_tokens_seen": 13463552, "step": 3287 }, { "epoch": 2.450074515648286, "grad_norm": 14.262471850794793, "learning_rate": 1.2892049532296244e-06, "loss": 0.3876, "num_input_tokens_seen": 13467648, "step": 3288 }, { "epoch": 2.4508196721311477, "grad_norm": 14.286016778151087, "learning_rate": 1.2886199923437162e-06, "loss": 0.4853, "num_input_tokens_seen": 13471744, "step": 3289 }, { "epoch": 2.451564828614009, "grad_norm": 8.993582216512044, "learning_rate": 1.28803502299203e-06, "loss": 0.285, "num_input_tokens_seen": 13475840, "step": 3290 }, { "epoch": 2.4523099850968704, "grad_norm": 10.0970443945333, "learning_rate": 1.2874500453027951e-06, "loss": 0.3445, "num_input_tokens_seen": 13479936, "step": 3291 }, { "epoch": 2.453055141579732, "grad_norm": 10.299031597361315, "learning_rate": 1.2868650594042428e-06, "loss": 0.3988, "num_input_tokens_seen": 13484032, "step": 3292 }, { "epoch": 2.453800298062593, "grad_norm": 8.004958649631737, "learning_rate": 1.2862800654246063e-06, "loss": 0.4382, "num_input_tokens_seen": 13488128, "step": 3293 }, { "epoch": 2.4545454545454546, "grad_norm": 8.728451142859083, "learning_rate": 1.2856950634921202e-06, "loss": 0.2164, "num_input_tokens_seen": 13492224, "step": 3294 }, { "epoch": 2.455290611028316, "grad_norm": 9.17647161753853, "learning_rate": 1.2851100537350217e-06, "loss": 0.3825, "num_input_tokens_seen": 13496320, "step": 3295 }, { "epoch": 2.4560357675111772, "grad_norm": 8.751951452618513, "learning_rate": 1.2845250362815486e-06, "loss": 0.2407, "num_input_tokens_seen": 13500416, "step": 3296 }, { "epoch": 2.456780923994039, "grad_norm": 10.959442278596564, "learning_rate": 1.283940011259941e-06, "loss": 0.5335, "num_input_tokens_seen": 13504512, "step": 3297 }, { "epoch": 2.4575260804769004, "grad_norm": 9.361927094276288, "learning_rate": 1.2833549787984406e-06, "loss": 0.571, "num_input_tokens_seen": 13508608, "step": 3298 }, { "epoch": 2.4582712369597615, "grad_norm": 8.01495617253409, "learning_rate": 1.2827699390252906e-06, "loss": 0.0913, "num_input_tokens_seen": 13512704, "step": 3299 }, { "epoch": 2.459016393442623, "grad_norm": 11.667144272509011, "learning_rate": 1.2821848920687361e-06, "loss": 0.5557, "num_input_tokens_seen": 13516800, "step": 3300 }, { "epoch": 2.459761549925484, "grad_norm": 8.856053402228666, "learning_rate": 1.281599838057023e-06, "loss": 0.4178, "num_input_tokens_seen": 13520896, "step": 3301 }, { "epoch": 2.4605067064083457, "grad_norm": 11.01842085260585, "learning_rate": 1.2810147771184e-06, "loss": 0.4041, "num_input_tokens_seen": 13524992, "step": 3302 }, { "epoch": 2.4612518628912072, "grad_norm": 9.959626916713487, "learning_rate": 1.2804297093811163e-06, "loss": 0.218, "num_input_tokens_seen": 13529088, "step": 3303 }, { "epoch": 2.461997019374069, "grad_norm": 10.65884625909483, "learning_rate": 1.2798446349734233e-06, "loss": 0.4688, "num_input_tokens_seen": 13533184, "step": 3304 }, { "epoch": 2.46274217585693, "grad_norm": 10.546471469716481, "learning_rate": 1.2792595540235731e-06, "loss": 0.3762, "num_input_tokens_seen": 13537280, "step": 3305 }, { "epoch": 2.4634873323397914, "grad_norm": 12.093368698647746, "learning_rate": 1.2786744666598199e-06, "loss": 0.5364, "num_input_tokens_seen": 13541376, "step": 3306 }, { "epoch": 2.4642324888226526, "grad_norm": 9.45847671818336, "learning_rate": 1.2780893730104185e-06, "loss": 0.43, "num_input_tokens_seen": 13545472, "step": 3307 }, { "epoch": 2.464977645305514, "grad_norm": 9.593600982442023, "learning_rate": 1.277504273203626e-06, "loss": 0.5577, "num_input_tokens_seen": 13549568, "step": 3308 }, { "epoch": 2.4657228017883757, "grad_norm": 8.039626833686027, "learning_rate": 1.2769191673677006e-06, "loss": 0.488, "num_input_tokens_seen": 13553664, "step": 3309 }, { "epoch": 2.4664679582712368, "grad_norm": 14.986500166364412, "learning_rate": 1.2763340556309014e-06, "loss": 0.3031, "num_input_tokens_seen": 13557760, "step": 3310 }, { "epoch": 2.4672131147540983, "grad_norm": 9.356215367677123, "learning_rate": 1.275748938121489e-06, "loss": 0.4377, "num_input_tokens_seen": 13561856, "step": 3311 }, { "epoch": 2.46795827123696, "grad_norm": 10.243994152252503, "learning_rate": 1.275163814967726e-06, "loss": 0.5657, "num_input_tokens_seen": 13565952, "step": 3312 }, { "epoch": 2.468703427719821, "grad_norm": 11.416398913338707, "learning_rate": 1.2745786862978746e-06, "loss": 0.3958, "num_input_tokens_seen": 13570048, "step": 3313 }, { "epoch": 2.4694485842026825, "grad_norm": 8.228631686686741, "learning_rate": 1.2739935522401998e-06, "loss": 0.3296, "num_input_tokens_seen": 13574144, "step": 3314 }, { "epoch": 2.470193740685544, "grad_norm": 8.532646339201655, "learning_rate": 1.2734084129229673e-06, "loss": 0.3841, "num_input_tokens_seen": 13578240, "step": 3315 }, { "epoch": 2.470938897168405, "grad_norm": 9.391583954621254, "learning_rate": 1.2728232684744433e-06, "loss": 0.5648, "num_input_tokens_seen": 13582336, "step": 3316 }, { "epoch": 2.4716840536512668, "grad_norm": 12.081461622666689, "learning_rate": 1.2722381190228955e-06, "loss": 0.658, "num_input_tokens_seen": 13586432, "step": 3317 }, { "epoch": 2.4724292101341283, "grad_norm": 8.639423299322955, "learning_rate": 1.2716529646965938e-06, "loss": 0.3127, "num_input_tokens_seen": 13590528, "step": 3318 }, { "epoch": 2.4731743666169894, "grad_norm": 7.510136042578072, "learning_rate": 1.2710678056238074e-06, "loss": 0.2885, "num_input_tokens_seen": 13594624, "step": 3319 }, { "epoch": 2.473919523099851, "grad_norm": 10.146629456563584, "learning_rate": 1.2704826419328075e-06, "loss": 0.487, "num_input_tokens_seen": 13598720, "step": 3320 }, { "epoch": 2.4746646795827125, "grad_norm": 10.12230347203876, "learning_rate": 1.2698974737518662e-06, "loss": 0.7054, "num_input_tokens_seen": 13602816, "step": 3321 }, { "epoch": 2.4754098360655736, "grad_norm": 7.960957979691698, "learning_rate": 1.2693123012092564e-06, "loss": 0.3633, "num_input_tokens_seen": 13606912, "step": 3322 }, { "epoch": 2.476154992548435, "grad_norm": 9.154676826498669, "learning_rate": 1.2687271244332526e-06, "loss": 0.6508, "num_input_tokens_seen": 13611008, "step": 3323 }, { "epoch": 2.4769001490312967, "grad_norm": 8.328099492420256, "learning_rate": 1.2681419435521295e-06, "loss": 0.4514, "num_input_tokens_seen": 13615104, "step": 3324 }, { "epoch": 2.477645305514158, "grad_norm": 9.244810543805013, "learning_rate": 1.2675567586941628e-06, "loss": 0.5054, "num_input_tokens_seen": 13619200, "step": 3325 }, { "epoch": 2.4783904619970194, "grad_norm": 8.322818485430856, "learning_rate": 1.2669715699876292e-06, "loss": 0.5148, "num_input_tokens_seen": 13623296, "step": 3326 }, { "epoch": 2.479135618479881, "grad_norm": 9.039488330493642, "learning_rate": 1.266386377560806e-06, "loss": 0.271, "num_input_tokens_seen": 13627392, "step": 3327 }, { "epoch": 2.479880774962742, "grad_norm": 10.28849256081117, "learning_rate": 1.2658011815419719e-06, "loss": 0.3666, "num_input_tokens_seen": 13631488, "step": 3328 }, { "epoch": 2.4806259314456036, "grad_norm": 12.323746364253788, "learning_rate": 1.265215982059406e-06, "loss": 0.4862, "num_input_tokens_seen": 13635584, "step": 3329 }, { "epoch": 2.481371087928465, "grad_norm": 10.395413815129846, "learning_rate": 1.2646307792413882e-06, "loss": 0.3073, "num_input_tokens_seen": 13639680, "step": 3330 }, { "epoch": 2.4821162444113263, "grad_norm": 9.554216407610726, "learning_rate": 1.2640455732161992e-06, "loss": 0.5788, "num_input_tokens_seen": 13643776, "step": 3331 }, { "epoch": 2.482861400894188, "grad_norm": 13.339394310084668, "learning_rate": 1.2634603641121201e-06, "loss": 0.3611, "num_input_tokens_seen": 13647872, "step": 3332 }, { "epoch": 2.4836065573770494, "grad_norm": 10.911018746641215, "learning_rate": 1.2628751520574336e-06, "loss": 0.3443, "num_input_tokens_seen": 13651968, "step": 3333 }, { "epoch": 2.4843517138599105, "grad_norm": 10.07219773945354, "learning_rate": 1.2622899371804212e-06, "loss": 0.6179, "num_input_tokens_seen": 13656064, "step": 3334 }, { "epoch": 2.485096870342772, "grad_norm": 10.364018914767911, "learning_rate": 1.2617047196093671e-06, "loss": 0.7325, "num_input_tokens_seen": 13660160, "step": 3335 }, { "epoch": 2.485842026825633, "grad_norm": 8.66025828939682, "learning_rate": 1.2611194994725548e-06, "loss": 0.3844, "num_input_tokens_seen": 13664256, "step": 3336 }, { "epoch": 2.4865871833084947, "grad_norm": 7.070575729315941, "learning_rate": 1.2605342768982688e-06, "loss": 0.2519, "num_input_tokens_seen": 13668352, "step": 3337 }, { "epoch": 2.4873323397913563, "grad_norm": 9.738866505773354, "learning_rate": 1.259949052014794e-06, "loss": 0.4418, "num_input_tokens_seen": 13672448, "step": 3338 }, { "epoch": 2.488077496274218, "grad_norm": 11.861604975774966, "learning_rate": 1.259363824950416e-06, "loss": 0.3993, "num_input_tokens_seen": 13676544, "step": 3339 }, { "epoch": 2.488822652757079, "grad_norm": 8.11949246622202, "learning_rate": 1.2587785958334208e-06, "loss": 0.3014, "num_input_tokens_seen": 13680640, "step": 3340 }, { "epoch": 2.4895678092399405, "grad_norm": 8.951588450955622, "learning_rate": 1.2581933647920945e-06, "loss": 0.4106, "num_input_tokens_seen": 13684736, "step": 3341 }, { "epoch": 2.4903129657228016, "grad_norm": 8.397164709211347, "learning_rate": 1.2576081319547247e-06, "loss": 0.3953, "num_input_tokens_seen": 13688832, "step": 3342 }, { "epoch": 2.491058122205663, "grad_norm": 10.251350010882899, "learning_rate": 1.2570228974495976e-06, "loss": 0.2827, "num_input_tokens_seen": 13692928, "step": 3343 }, { "epoch": 2.4918032786885247, "grad_norm": 7.918457705576386, "learning_rate": 1.256437661405001e-06, "loss": 0.6276, "num_input_tokens_seen": 13697024, "step": 3344 }, { "epoch": 2.492548435171386, "grad_norm": 8.916900192390832, "learning_rate": 1.2558524239492237e-06, "loss": 0.4185, "num_input_tokens_seen": 13701120, "step": 3345 }, { "epoch": 2.4932935916542474, "grad_norm": 9.71903432546496, "learning_rate": 1.255267185210553e-06, "loss": 0.3374, "num_input_tokens_seen": 13705216, "step": 3346 }, { "epoch": 2.494038748137109, "grad_norm": 13.172935536858319, "learning_rate": 1.2546819453172772e-06, "loss": 0.4391, "num_input_tokens_seen": 13709312, "step": 3347 }, { "epoch": 2.49478390461997, "grad_norm": 8.719942477985773, "learning_rate": 1.2540967043976863e-06, "loss": 0.6904, "num_input_tokens_seen": 13713408, "step": 3348 }, { "epoch": 2.4955290611028316, "grad_norm": 9.53739641234806, "learning_rate": 1.2535114625800679e-06, "loss": 0.2237, "num_input_tokens_seen": 13717504, "step": 3349 }, { "epoch": 2.496274217585693, "grad_norm": 9.518396342234405, "learning_rate": 1.2529262199927125e-06, "loss": 0.2834, "num_input_tokens_seen": 13721600, "step": 3350 }, { "epoch": 2.4970193740685542, "grad_norm": 9.207809632773452, "learning_rate": 1.2523409767639084e-06, "loss": 0.2134, "num_input_tokens_seen": 13725696, "step": 3351 }, { "epoch": 2.497764530551416, "grad_norm": 10.661651036891978, "learning_rate": 1.2517557330219456e-06, "loss": 0.4353, "num_input_tokens_seen": 13729792, "step": 3352 }, { "epoch": 2.4985096870342773, "grad_norm": 9.874700654156179, "learning_rate": 1.2511704888951134e-06, "loss": 0.2914, "num_input_tokens_seen": 13733888, "step": 3353 }, { "epoch": 2.4992548435171384, "grad_norm": 9.001768378800897, "learning_rate": 1.2505852445117017e-06, "loss": 0.316, "num_input_tokens_seen": 13737984, "step": 3354 }, { "epoch": 2.5, "grad_norm": 9.398188179772568, "learning_rate": 1.25e-06, "loss": 0.3543, "num_input_tokens_seen": 13742080, "step": 3355 }, { "epoch": 2.5007451564828616, "grad_norm": 8.464401835838762, "learning_rate": 1.2494147554882987e-06, "loss": 0.2797, "num_input_tokens_seen": 13746176, "step": 3356 }, { "epoch": 2.5014903129657227, "grad_norm": 8.481151207401563, "learning_rate": 1.248829511104887e-06, "loss": 0.544, "num_input_tokens_seen": 13750272, "step": 3357 }, { "epoch": 2.502235469448584, "grad_norm": 8.325915976072723, "learning_rate": 1.2482442669780548e-06, "loss": 0.4357, "num_input_tokens_seen": 13754368, "step": 3358 }, { "epoch": 2.5029806259314458, "grad_norm": 10.941122356607373, "learning_rate": 1.247659023236092e-06, "loss": 0.3914, "num_input_tokens_seen": 13758464, "step": 3359 }, { "epoch": 2.503725782414307, "grad_norm": 9.434251830979923, "learning_rate": 1.247073780007288e-06, "loss": 0.5298, "num_input_tokens_seen": 13762560, "step": 3360 }, { "epoch": 2.5044709388971684, "grad_norm": 10.670725795602877, "learning_rate": 1.2464885374199321e-06, "loss": 0.191, "num_input_tokens_seen": 13766656, "step": 3361 }, { "epoch": 2.50521609538003, "grad_norm": 11.14824437368891, "learning_rate": 1.2459032956023145e-06, "loss": 0.5687, "num_input_tokens_seen": 13770752, "step": 3362 }, { "epoch": 2.505961251862891, "grad_norm": 12.040521487420213, "learning_rate": 1.2453180546827232e-06, "loss": 0.5974, "num_input_tokens_seen": 13774848, "step": 3363 }, { "epoch": 2.5067064083457526, "grad_norm": 9.369862322632999, "learning_rate": 1.2447328147894478e-06, "loss": 0.306, "num_input_tokens_seen": 13778944, "step": 3364 }, { "epoch": 2.5074515648286138, "grad_norm": 10.387964528649087, "learning_rate": 1.244147576050777e-06, "loss": 0.5289, "num_input_tokens_seen": 13783040, "step": 3365 }, { "epoch": 2.5081967213114753, "grad_norm": 8.311948327109448, "learning_rate": 1.2435623385949992e-06, "loss": 0.2106, "num_input_tokens_seen": 13787136, "step": 3366 }, { "epoch": 2.508941877794337, "grad_norm": 8.98689562978459, "learning_rate": 1.2429771025504028e-06, "loss": 0.3404, "num_input_tokens_seen": 13791232, "step": 3367 }, { "epoch": 2.5096870342771984, "grad_norm": 11.195099615225315, "learning_rate": 1.2423918680452757e-06, "loss": 0.4793, "num_input_tokens_seen": 13795328, "step": 3368 }, { "epoch": 2.5104321907600595, "grad_norm": 8.594010502381362, "learning_rate": 1.2418066352079057e-06, "loss": 0.313, "num_input_tokens_seen": 13799424, "step": 3369 }, { "epoch": 2.511177347242921, "grad_norm": 8.081547269245698, "learning_rate": 1.2412214041665794e-06, "loss": 0.2538, "num_input_tokens_seen": 13803520, "step": 3370 }, { "epoch": 2.511922503725782, "grad_norm": 9.727952863366163, "learning_rate": 1.2406361750495841e-06, "loss": 0.334, "num_input_tokens_seen": 13807616, "step": 3371 }, { "epoch": 2.5126676602086437, "grad_norm": 9.059356216137624, "learning_rate": 1.240050947985206e-06, "loss": 0.2669, "num_input_tokens_seen": 13811712, "step": 3372 }, { "epoch": 2.5134128166915053, "grad_norm": 10.617965196076087, "learning_rate": 1.2394657231017314e-06, "loss": 0.5589, "num_input_tokens_seen": 13815808, "step": 3373 }, { "epoch": 2.514157973174367, "grad_norm": 9.921779582876473, "learning_rate": 1.2388805005274454e-06, "loss": 0.4447, "num_input_tokens_seen": 13819904, "step": 3374 }, { "epoch": 2.514903129657228, "grad_norm": 10.567042746115247, "learning_rate": 1.238295280390633e-06, "loss": 0.41, "num_input_tokens_seen": 13824000, "step": 3375 }, { "epoch": 2.5156482861400895, "grad_norm": 8.498661654846503, "learning_rate": 1.2377100628195792e-06, "loss": 0.2345, "num_input_tokens_seen": 13828096, "step": 3376 }, { "epoch": 2.5163934426229506, "grad_norm": 8.583266264036268, "learning_rate": 1.237124847942567e-06, "loss": 0.235, "num_input_tokens_seen": 13832192, "step": 3377 }, { "epoch": 2.517138599105812, "grad_norm": 9.575155087628499, "learning_rate": 1.23653963588788e-06, "loss": 0.412, "num_input_tokens_seen": 13836288, "step": 3378 }, { "epoch": 2.5178837555886737, "grad_norm": 9.493996751618395, "learning_rate": 1.2359544267838013e-06, "loss": 0.4421, "num_input_tokens_seen": 13840384, "step": 3379 }, { "epoch": 2.5186289120715353, "grad_norm": 10.17555396206246, "learning_rate": 1.2353692207586122e-06, "loss": 0.5525, "num_input_tokens_seen": 13844480, "step": 3380 }, { "epoch": 2.5193740685543964, "grad_norm": 7.538824052977788, "learning_rate": 1.2347840179405945e-06, "loss": 0.168, "num_input_tokens_seen": 13848576, "step": 3381 }, { "epoch": 2.520119225037258, "grad_norm": 9.35184300845888, "learning_rate": 1.2341988184580285e-06, "loss": 0.4576, "num_input_tokens_seen": 13852672, "step": 3382 }, { "epoch": 2.520864381520119, "grad_norm": 10.718501330299494, "learning_rate": 1.2336136224391944e-06, "loss": 0.2815, "num_input_tokens_seen": 13856768, "step": 3383 }, { "epoch": 2.5216095380029806, "grad_norm": 11.517908848672176, "learning_rate": 1.2330284300123712e-06, "loss": 0.2526, "num_input_tokens_seen": 13860864, "step": 3384 }, { "epoch": 2.522354694485842, "grad_norm": 10.023046408431716, "learning_rate": 1.2324432413058374e-06, "loss": 0.4147, "num_input_tokens_seen": 13864960, "step": 3385 }, { "epoch": 2.5230998509687033, "grad_norm": 9.481581756798947, "learning_rate": 1.2318580564478707e-06, "loss": 0.4174, "num_input_tokens_seen": 13869056, "step": 3386 }, { "epoch": 2.523845007451565, "grad_norm": 8.47304469852623, "learning_rate": 1.2312728755667476e-06, "loss": 0.4381, "num_input_tokens_seen": 13873152, "step": 3387 }, { "epoch": 2.5245901639344264, "grad_norm": 21.89302279824212, "learning_rate": 1.2306876987907436e-06, "loss": 0.2613, "num_input_tokens_seen": 13877248, "step": 3388 }, { "epoch": 2.5253353204172875, "grad_norm": 8.892742845787883, "learning_rate": 1.2301025262481344e-06, "loss": 0.3783, "num_input_tokens_seen": 13881344, "step": 3389 }, { "epoch": 2.526080476900149, "grad_norm": 9.073062672525898, "learning_rate": 1.2295173580671931e-06, "loss": 0.2685, "num_input_tokens_seen": 13885440, "step": 3390 }, { "epoch": 2.5268256333830106, "grad_norm": 9.074832108645788, "learning_rate": 1.2289321943761933e-06, "loss": 0.325, "num_input_tokens_seen": 13889536, "step": 3391 }, { "epoch": 2.5275707898658717, "grad_norm": 12.442668176213795, "learning_rate": 1.2283470353034068e-06, "loss": 0.2603, "num_input_tokens_seen": 13893632, "step": 3392 }, { "epoch": 2.5283159463487332, "grad_norm": 13.034700239472958, "learning_rate": 1.2277618809771047e-06, "loss": 0.4098, "num_input_tokens_seen": 13897728, "step": 3393 }, { "epoch": 2.529061102831595, "grad_norm": 9.314949306538058, "learning_rate": 1.2271767315255573e-06, "loss": 0.4225, "num_input_tokens_seen": 13901824, "step": 3394 }, { "epoch": 2.529806259314456, "grad_norm": 9.615820973350782, "learning_rate": 1.2265915870770331e-06, "loss": 0.4468, "num_input_tokens_seen": 13905920, "step": 3395 }, { "epoch": 2.5305514157973175, "grad_norm": 8.744225316204858, "learning_rate": 1.2260064477598004e-06, "loss": 0.5304, "num_input_tokens_seen": 13910016, "step": 3396 }, { "epoch": 2.531296572280179, "grad_norm": 8.131826163648807, "learning_rate": 1.2254213137021256e-06, "loss": 0.4223, "num_input_tokens_seen": 13914112, "step": 3397 }, { "epoch": 2.53204172876304, "grad_norm": 9.94709665791353, "learning_rate": 1.2248361850322743e-06, "loss": 0.4959, "num_input_tokens_seen": 13918208, "step": 3398 }, { "epoch": 2.5327868852459017, "grad_norm": 10.048140632775445, "learning_rate": 1.224251061878511e-06, "loss": 0.4156, "num_input_tokens_seen": 13922304, "step": 3399 }, { "epoch": 2.533532041728763, "grad_norm": 9.24483244685435, "learning_rate": 1.2236659443690988e-06, "loss": 0.1398, "num_input_tokens_seen": 13926400, "step": 3400 }, { "epoch": 2.5342771982116243, "grad_norm": 8.540783045333676, "learning_rate": 1.2230808326322994e-06, "loss": 0.2606, "num_input_tokens_seen": 13930496, "step": 3401 }, { "epoch": 2.535022354694486, "grad_norm": 8.46569823548794, "learning_rate": 1.222495726796374e-06, "loss": 0.4289, "num_input_tokens_seen": 13934592, "step": 3402 }, { "epoch": 2.5357675111773474, "grad_norm": 8.82014462735095, "learning_rate": 1.221910626989582e-06, "loss": 0.6357, "num_input_tokens_seen": 13938688, "step": 3403 }, { "epoch": 2.5365126676602086, "grad_norm": 9.517850438779522, "learning_rate": 1.2213255333401805e-06, "loss": 0.4609, "num_input_tokens_seen": 13942784, "step": 3404 }, { "epoch": 2.53725782414307, "grad_norm": 8.826512772478774, "learning_rate": 1.220740445976427e-06, "loss": 0.332, "num_input_tokens_seen": 13946880, "step": 3405 }, { "epoch": 2.538002980625931, "grad_norm": 10.12936991428337, "learning_rate": 1.2201553650265769e-06, "loss": 0.2976, "num_input_tokens_seen": 13950976, "step": 3406 }, { "epoch": 2.5387481371087928, "grad_norm": 8.854432254516155, "learning_rate": 1.219570290618884e-06, "loss": 0.4376, "num_input_tokens_seen": 13955072, "step": 3407 }, { "epoch": 2.5394932935916543, "grad_norm": 8.757343297819132, "learning_rate": 1.2189852228816002e-06, "loss": 0.4329, "num_input_tokens_seen": 13959168, "step": 3408 }, { "epoch": 2.540238450074516, "grad_norm": 8.626797799671046, "learning_rate": 1.2184001619429773e-06, "loss": 0.3158, "num_input_tokens_seen": 13963264, "step": 3409 }, { "epoch": 2.540983606557377, "grad_norm": 7.809883680170605, "learning_rate": 1.2178151079312645e-06, "loss": 0.2281, "num_input_tokens_seen": 13967360, "step": 3410 }, { "epoch": 2.5417287630402385, "grad_norm": 7.021073612356451, "learning_rate": 1.2172300609747096e-06, "loss": 0.1622, "num_input_tokens_seen": 13971456, "step": 3411 }, { "epoch": 2.5424739195230996, "grad_norm": 9.92777825448064, "learning_rate": 1.2166450212015596e-06, "loss": 0.3686, "num_input_tokens_seen": 13975552, "step": 3412 }, { "epoch": 2.543219076005961, "grad_norm": 9.573137861099545, "learning_rate": 1.216059988740059e-06, "loss": 0.3913, "num_input_tokens_seen": 13979648, "step": 3413 }, { "epoch": 2.5439642324888228, "grad_norm": 10.44296400883092, "learning_rate": 1.2154749637184516e-06, "loss": 0.5461, "num_input_tokens_seen": 13983744, "step": 3414 }, { "epoch": 2.5447093889716843, "grad_norm": 10.440283417115607, "learning_rate": 1.2148899462649783e-06, "loss": 0.3236, "num_input_tokens_seen": 13987840, "step": 3415 }, { "epoch": 2.5454545454545454, "grad_norm": 12.613236773382116, "learning_rate": 1.21430493650788e-06, "loss": 0.3437, "num_input_tokens_seen": 13991936, "step": 3416 }, { "epoch": 2.546199701937407, "grad_norm": 8.492236335220811, "learning_rate": 1.2137199345753944e-06, "loss": 0.5926, "num_input_tokens_seen": 13996032, "step": 3417 }, { "epoch": 2.546944858420268, "grad_norm": 7.579296218512801, "learning_rate": 1.2131349405957576e-06, "loss": 0.2362, "num_input_tokens_seen": 14000128, "step": 3418 }, { "epoch": 2.5476900149031296, "grad_norm": 8.15203532840405, "learning_rate": 1.2125499546972055e-06, "loss": 0.4551, "num_input_tokens_seen": 14004224, "step": 3419 }, { "epoch": 2.548435171385991, "grad_norm": 9.594070687896426, "learning_rate": 1.2119649770079705e-06, "loss": 0.3987, "num_input_tokens_seen": 14008320, "step": 3420 }, { "epoch": 2.5491803278688527, "grad_norm": 10.6741199493037, "learning_rate": 1.211380007656284e-06, "loss": 0.4355, "num_input_tokens_seen": 14012416, "step": 3421 }, { "epoch": 2.549925484351714, "grad_norm": 11.665529443968621, "learning_rate": 1.210795046770376e-06, "loss": 0.4036, "num_input_tokens_seen": 14016512, "step": 3422 }, { "epoch": 2.5506706408345754, "grad_norm": 7.24942280551381, "learning_rate": 1.2102100944784735e-06, "loss": 0.2797, "num_input_tokens_seen": 14020608, "step": 3423 }, { "epoch": 2.5514157973174365, "grad_norm": 8.93994738235958, "learning_rate": 1.2096251509088026e-06, "loss": 0.3075, "num_input_tokens_seen": 14024704, "step": 3424 }, { "epoch": 2.552160953800298, "grad_norm": 8.397924916787236, "learning_rate": 1.2090402161895867e-06, "loss": 0.3832, "num_input_tokens_seen": 14028800, "step": 3425 }, { "epoch": 2.5529061102831596, "grad_norm": 8.201695913961458, "learning_rate": 1.208455290449048e-06, "loss": 0.2296, "num_input_tokens_seen": 14032896, "step": 3426 }, { "epoch": 2.5536512667660207, "grad_norm": 8.822411715209029, "learning_rate": 1.2078703738154061e-06, "loss": 0.5366, "num_input_tokens_seen": 14036992, "step": 3427 }, { "epoch": 2.5543964232488823, "grad_norm": 8.206844350325452, "learning_rate": 1.2072854664168792e-06, "loss": 0.4134, "num_input_tokens_seen": 14041088, "step": 3428 }, { "epoch": 2.555141579731744, "grad_norm": 11.62593981173491, "learning_rate": 1.2067005683816832e-06, "loss": 0.3741, "num_input_tokens_seen": 14045184, "step": 3429 }, { "epoch": 2.555886736214605, "grad_norm": 9.525273019248745, "learning_rate": 1.2061156798380322e-06, "loss": 0.2155, "num_input_tokens_seen": 14049280, "step": 3430 }, { "epoch": 2.5566318926974665, "grad_norm": 9.844570761986194, "learning_rate": 1.205530800914137e-06, "loss": 0.3548, "num_input_tokens_seen": 14053376, "step": 3431 }, { "epoch": 2.557377049180328, "grad_norm": 9.313516654623, "learning_rate": 1.2049459317382078e-06, "loss": 0.4427, "num_input_tokens_seen": 14057472, "step": 3432 }, { "epoch": 2.558122205663189, "grad_norm": 10.320103302156214, "learning_rate": 1.204361072438452e-06, "loss": 0.3189, "num_input_tokens_seen": 14061568, "step": 3433 }, { "epoch": 2.5588673621460507, "grad_norm": 10.521241775956902, "learning_rate": 1.2037762231430754e-06, "loss": 0.4334, "num_input_tokens_seen": 14065664, "step": 3434 }, { "epoch": 2.559612518628912, "grad_norm": 8.803818451123147, "learning_rate": 1.2031913839802805e-06, "loss": 0.4868, "num_input_tokens_seen": 14069760, "step": 3435 }, { "epoch": 2.5603576751117734, "grad_norm": 8.5966312879258, "learning_rate": 1.2026065550782683e-06, "loss": 0.5275, "num_input_tokens_seen": 14073856, "step": 3436 }, { "epoch": 2.561102831594635, "grad_norm": 7.051492890147007, "learning_rate": 1.2020217365652377e-06, "loss": 0.2561, "num_input_tokens_seen": 14077952, "step": 3437 }, { "epoch": 2.5618479880774965, "grad_norm": 10.322581649618643, "learning_rate": 1.201436928569385e-06, "loss": 0.2281, "num_input_tokens_seen": 14082048, "step": 3438 }, { "epoch": 2.5625931445603576, "grad_norm": 10.232390242629483, "learning_rate": 1.2008521312189046e-06, "loss": 0.3498, "num_input_tokens_seen": 14086144, "step": 3439 }, { "epoch": 2.563338301043219, "grad_norm": 11.328697683816772, "learning_rate": 1.2002673446419878e-06, "loss": 0.4865, "num_input_tokens_seen": 14090240, "step": 3440 }, { "epoch": 2.5640834575260802, "grad_norm": 11.73079822056642, "learning_rate": 1.1996825689668244e-06, "loss": 0.4352, "num_input_tokens_seen": 14094336, "step": 3441 }, { "epoch": 2.564828614008942, "grad_norm": 10.729325968032988, "learning_rate": 1.1990978043216008e-06, "loss": 0.3421, "num_input_tokens_seen": 14098432, "step": 3442 }, { "epoch": 2.5655737704918034, "grad_norm": 16.380087455530617, "learning_rate": 1.1985130508345025e-06, "loss": 0.3333, "num_input_tokens_seen": 14102528, "step": 3443 }, { "epoch": 2.566318926974665, "grad_norm": 10.051651672497547, "learning_rate": 1.197928308633711e-06, "loss": 0.3008, "num_input_tokens_seen": 14106624, "step": 3444 }, { "epoch": 2.567064083457526, "grad_norm": 8.973389283663186, "learning_rate": 1.1973435778474058e-06, "loss": 0.4145, "num_input_tokens_seen": 14110720, "step": 3445 }, { "epoch": 2.5678092399403876, "grad_norm": 9.538265097511829, "learning_rate": 1.1967588586037644e-06, "loss": 0.1899, "num_input_tokens_seen": 14114816, "step": 3446 }, { "epoch": 2.5685543964232487, "grad_norm": 8.872472475749282, "learning_rate": 1.1961741510309612e-06, "loss": 0.2911, "num_input_tokens_seen": 14118912, "step": 3447 }, { "epoch": 2.5692995529061102, "grad_norm": 12.245883393537811, "learning_rate": 1.1955894552571687e-06, "loss": 0.3056, "num_input_tokens_seen": 14123008, "step": 3448 }, { "epoch": 2.570044709388972, "grad_norm": 9.449824294468295, "learning_rate": 1.195004771410556e-06, "loss": 0.4178, "num_input_tokens_seen": 14127104, "step": 3449 }, { "epoch": 2.5707898658718333, "grad_norm": 8.570134642830164, "learning_rate": 1.19442009961929e-06, "loss": 0.3342, "num_input_tokens_seen": 14131200, "step": 3450 }, { "epoch": 2.5715350223546944, "grad_norm": 8.426581589234774, "learning_rate": 1.1938354400115355e-06, "loss": 0.445, "num_input_tokens_seen": 14135296, "step": 3451 }, { "epoch": 2.572280178837556, "grad_norm": 9.591045654988331, "learning_rate": 1.1932507927154532e-06, "loss": 0.2197, "num_input_tokens_seen": 14139392, "step": 3452 }, { "epoch": 2.573025335320417, "grad_norm": 18.93911561082937, "learning_rate": 1.1926661578592025e-06, "loss": 0.4243, "num_input_tokens_seen": 14143488, "step": 3453 }, { "epoch": 2.5737704918032787, "grad_norm": 9.309191464035516, "learning_rate": 1.1920815355709392e-06, "loss": 0.3532, "num_input_tokens_seen": 14147584, "step": 3454 }, { "epoch": 2.57451564828614, "grad_norm": 9.880778989511612, "learning_rate": 1.191496925978817e-06, "loss": 0.3987, "num_input_tokens_seen": 14151680, "step": 3455 }, { "epoch": 2.5752608047690018, "grad_norm": 11.481684716605129, "learning_rate": 1.1909123292109862e-06, "loss": 0.2947, "num_input_tokens_seen": 14155776, "step": 3456 }, { "epoch": 2.576005961251863, "grad_norm": 12.426515979499685, "learning_rate": 1.1903277453955955e-06, "loss": 0.2443, "num_input_tokens_seen": 14159872, "step": 3457 }, { "epoch": 2.5767511177347244, "grad_norm": 9.723173766821652, "learning_rate": 1.1897431746607885e-06, "loss": 0.5147, "num_input_tokens_seen": 14163968, "step": 3458 }, { "epoch": 2.5774962742175855, "grad_norm": 14.691457470338838, "learning_rate": 1.189158617134708e-06, "loss": 0.4321, "num_input_tokens_seen": 14168064, "step": 3459 }, { "epoch": 2.578241430700447, "grad_norm": 8.886836306395432, "learning_rate": 1.188574072945493e-06, "loss": 0.2632, "num_input_tokens_seen": 14172160, "step": 3460 }, { "epoch": 2.5789865871833086, "grad_norm": 9.550302192950086, "learning_rate": 1.18798954222128e-06, "loss": 0.3273, "num_input_tokens_seen": 14176256, "step": 3461 }, { "epoch": 2.5797317436661698, "grad_norm": 9.633377989407814, "learning_rate": 1.187405025090202e-06, "loss": 0.3446, "num_input_tokens_seen": 14180352, "step": 3462 }, { "epoch": 2.5804769001490313, "grad_norm": 14.38502944001511, "learning_rate": 1.1868205216803894e-06, "loss": 0.4213, "num_input_tokens_seen": 14184448, "step": 3463 }, { "epoch": 2.581222056631893, "grad_norm": 10.459754849076607, "learning_rate": 1.1862360321199697e-06, "loss": 0.2405, "num_input_tokens_seen": 14188544, "step": 3464 }, { "epoch": 2.581967213114754, "grad_norm": 9.049986266109652, "learning_rate": 1.185651556537067e-06, "loss": 0.6244, "num_input_tokens_seen": 14192640, "step": 3465 }, { "epoch": 2.5827123695976155, "grad_norm": 9.450521027462292, "learning_rate": 1.1850670950598025e-06, "loss": 0.5554, "num_input_tokens_seen": 14196736, "step": 3466 }, { "epoch": 2.583457526080477, "grad_norm": 9.132188836074492, "learning_rate": 1.1844826478162944e-06, "loss": 0.2827, "num_input_tokens_seen": 14200832, "step": 3467 }, { "epoch": 2.584202682563338, "grad_norm": 9.566830293115938, "learning_rate": 1.1838982149346582e-06, "loss": 0.7028, "num_input_tokens_seen": 14204928, "step": 3468 }, { "epoch": 2.5849478390461997, "grad_norm": 6.792136062522676, "learning_rate": 1.183313796543005e-06, "loss": 0.2263, "num_input_tokens_seen": 14209024, "step": 3469 }, { "epoch": 2.585692995529061, "grad_norm": 12.264506979817039, "learning_rate": 1.182729392769444e-06, "loss": 0.4647, "num_input_tokens_seen": 14213120, "step": 3470 }, { "epoch": 2.5864381520119224, "grad_norm": 10.906209027097567, "learning_rate": 1.1821450037420804e-06, "loss": 0.3584, "num_input_tokens_seen": 14217216, "step": 3471 }, { "epoch": 2.587183308494784, "grad_norm": 8.586052291395033, "learning_rate": 1.1815606295890162e-06, "loss": 0.4094, "num_input_tokens_seen": 14221312, "step": 3472 }, { "epoch": 2.5879284649776455, "grad_norm": 8.543514119374873, "learning_rate": 1.1809762704383509e-06, "loss": 0.7428, "num_input_tokens_seen": 14225408, "step": 3473 }, { "epoch": 2.5886736214605066, "grad_norm": 15.793611980691267, "learning_rate": 1.1803919264181796e-06, "loss": 0.2473, "num_input_tokens_seen": 14229504, "step": 3474 }, { "epoch": 2.589418777943368, "grad_norm": 9.86985578896769, "learning_rate": 1.1798075976565954e-06, "loss": 0.628, "num_input_tokens_seen": 14233600, "step": 3475 }, { "epoch": 2.5901639344262293, "grad_norm": 9.40126571233922, "learning_rate": 1.179223284281687e-06, "loss": 0.6466, "num_input_tokens_seen": 14237696, "step": 3476 }, { "epoch": 2.590909090909091, "grad_norm": 15.29506123344479, "learning_rate": 1.17863898642154e-06, "loss": 0.4867, "num_input_tokens_seen": 14241792, "step": 3477 }, { "epoch": 2.5916542473919524, "grad_norm": 8.417273878153695, "learning_rate": 1.1780547042042372e-06, "loss": 0.2797, "num_input_tokens_seen": 14245888, "step": 3478 }, { "epoch": 2.592399403874814, "grad_norm": 9.54109599131107, "learning_rate": 1.1774704377578564e-06, "loss": 0.3814, "num_input_tokens_seen": 14249984, "step": 3479 }, { "epoch": 2.593144560357675, "grad_norm": 10.481901742139275, "learning_rate": 1.1768861872104738e-06, "loss": 0.4292, "num_input_tokens_seen": 14254080, "step": 3480 }, { "epoch": 2.5938897168405366, "grad_norm": 9.001199840684828, "learning_rate": 1.176301952690161e-06, "loss": 0.3057, "num_input_tokens_seen": 14258176, "step": 3481 }, { "epoch": 2.5946348733233977, "grad_norm": 9.555675397022139, "learning_rate": 1.1757177343249862e-06, "loss": 0.306, "num_input_tokens_seen": 14262272, "step": 3482 }, { "epoch": 2.5953800298062593, "grad_norm": 8.815229397604298, "learning_rate": 1.1751335322430147e-06, "loss": 0.3711, "num_input_tokens_seen": 14266368, "step": 3483 }, { "epoch": 2.596125186289121, "grad_norm": 7.564555117048533, "learning_rate": 1.174549346572308e-06, "loss": 0.6082, "num_input_tokens_seen": 14270464, "step": 3484 }, { "epoch": 2.5968703427719824, "grad_norm": 8.568123958792407, "learning_rate": 1.1739651774409227e-06, "loss": 0.4426, "num_input_tokens_seen": 14274560, "step": 3485 }, { "epoch": 2.5976154992548435, "grad_norm": 7.689204099911271, "learning_rate": 1.1733810249769134e-06, "loss": 0.3139, "num_input_tokens_seen": 14278656, "step": 3486 }, { "epoch": 2.598360655737705, "grad_norm": 10.081519425216802, "learning_rate": 1.1727968893083306e-06, "loss": 0.4105, "num_input_tokens_seen": 14282752, "step": 3487 }, { "epoch": 2.599105812220566, "grad_norm": 9.68555588066791, "learning_rate": 1.1722127705632211e-06, "loss": 0.4766, "num_input_tokens_seen": 14286848, "step": 3488 }, { "epoch": 2.5998509687034277, "grad_norm": 9.548086441371684, "learning_rate": 1.1716286688696277e-06, "loss": 0.4836, "num_input_tokens_seen": 14290944, "step": 3489 }, { "epoch": 2.6005961251862892, "grad_norm": 11.61300342709771, "learning_rate": 1.1710445843555898e-06, "loss": 0.4369, "num_input_tokens_seen": 14295040, "step": 3490 }, { "epoch": 2.601341281669151, "grad_norm": 10.389767781036658, "learning_rate": 1.1704605171491425e-06, "loss": 0.5515, "num_input_tokens_seen": 14299136, "step": 3491 }, { "epoch": 2.602086438152012, "grad_norm": 8.445177392847432, "learning_rate": 1.1698764673783178e-06, "loss": 0.3006, "num_input_tokens_seen": 14303232, "step": 3492 }, { "epoch": 2.6028315946348735, "grad_norm": 9.577713691354512, "learning_rate": 1.1692924351711438e-06, "loss": 0.4438, "num_input_tokens_seen": 14307328, "step": 3493 }, { "epoch": 2.6035767511177346, "grad_norm": 8.02542452315051, "learning_rate": 1.1687084206556443e-06, "loss": 0.2042, "num_input_tokens_seen": 14311424, "step": 3494 }, { "epoch": 2.604321907600596, "grad_norm": 10.420347321302478, "learning_rate": 1.1681244239598392e-06, "loss": 0.2168, "num_input_tokens_seen": 14315520, "step": 3495 }, { "epoch": 2.6050670640834577, "grad_norm": 10.00789090694049, "learning_rate": 1.1675404452117456e-06, "loss": 0.3757, "num_input_tokens_seen": 14319616, "step": 3496 }, { "epoch": 2.605812220566319, "grad_norm": 8.569779382246033, "learning_rate": 1.166956484539375e-06, "loss": 0.2571, "num_input_tokens_seen": 14323712, "step": 3497 }, { "epoch": 2.6065573770491803, "grad_norm": 9.374236536658875, "learning_rate": 1.1663725420707358e-06, "loss": 0.444, "num_input_tokens_seen": 14327808, "step": 3498 }, { "epoch": 2.607302533532042, "grad_norm": 14.94943401421354, "learning_rate": 1.1657886179338327e-06, "loss": 0.4145, "num_input_tokens_seen": 14331904, "step": 3499 }, { "epoch": 2.608047690014903, "grad_norm": 10.968225141662453, "learning_rate": 1.1652047122566658e-06, "loss": 0.4051, "num_input_tokens_seen": 14336000, "step": 3500 }, { "epoch": 2.6087928464977646, "grad_norm": 9.936202935515517, "learning_rate": 1.1646208251672315e-06, "loss": 0.4226, "num_input_tokens_seen": 14340096, "step": 3501 }, { "epoch": 2.609538002980626, "grad_norm": 9.985849635083218, "learning_rate": 1.1640369567935216e-06, "loss": 0.5332, "num_input_tokens_seen": 14344192, "step": 3502 }, { "epoch": 2.610283159463487, "grad_norm": 9.765297792771888, "learning_rate": 1.1634531072635249e-06, "loss": 0.5139, "num_input_tokens_seen": 14348288, "step": 3503 }, { "epoch": 2.6110283159463488, "grad_norm": 7.366221715341968, "learning_rate": 1.162869276705225e-06, "loss": 0.4021, "num_input_tokens_seen": 14352384, "step": 3504 }, { "epoch": 2.61177347242921, "grad_norm": 11.555922230022059, "learning_rate": 1.162285465246602e-06, "loss": 0.6017, "num_input_tokens_seen": 14356480, "step": 3505 }, { "epoch": 2.6125186289120714, "grad_norm": 10.528625623102318, "learning_rate": 1.1617016730156314e-06, "loss": 0.2348, "num_input_tokens_seen": 14360576, "step": 3506 }, { "epoch": 2.613263785394933, "grad_norm": 10.470648024291265, "learning_rate": 1.1611179001402842e-06, "loss": 0.6743, "num_input_tokens_seen": 14364672, "step": 3507 }, { "epoch": 2.6140089418777945, "grad_norm": 12.19114500260635, "learning_rate": 1.1605341467485282e-06, "loss": 0.1933, "num_input_tokens_seen": 14368768, "step": 3508 }, { "epoch": 2.6147540983606556, "grad_norm": 10.476769644295794, "learning_rate": 1.1599504129683262e-06, "loss": 0.4021, "num_input_tokens_seen": 14372864, "step": 3509 }, { "epoch": 2.615499254843517, "grad_norm": 9.671813260553474, "learning_rate": 1.159366698927637e-06, "loss": 0.3059, "num_input_tokens_seen": 14376960, "step": 3510 }, { "epoch": 2.6162444113263783, "grad_norm": 8.580478650872372, "learning_rate": 1.1587830047544141e-06, "loss": 0.3, "num_input_tokens_seen": 14381056, "step": 3511 }, { "epoch": 2.61698956780924, "grad_norm": 11.650069286928463, "learning_rate": 1.1581993305766078e-06, "loss": 0.4086, "num_input_tokens_seen": 14385152, "step": 3512 }, { "epoch": 2.6177347242921014, "grad_norm": 8.815433499046062, "learning_rate": 1.1576156765221639e-06, "loss": 0.5888, "num_input_tokens_seen": 14389248, "step": 3513 }, { "epoch": 2.618479880774963, "grad_norm": 9.738974760163527, "learning_rate": 1.1570320427190233e-06, "loss": 0.2163, "num_input_tokens_seen": 14393344, "step": 3514 }, { "epoch": 2.619225037257824, "grad_norm": 12.347407963823912, "learning_rate": 1.1564484292951229e-06, "loss": 0.4061, "num_input_tokens_seen": 14397440, "step": 3515 }, { "epoch": 2.6199701937406856, "grad_norm": 9.281616564841862, "learning_rate": 1.155864836378395e-06, "loss": 0.4301, "num_input_tokens_seen": 14401536, "step": 3516 }, { "epoch": 2.6207153502235467, "grad_norm": 9.233610826066267, "learning_rate": 1.155281264096767e-06, "loss": 0.4474, "num_input_tokens_seen": 14405632, "step": 3517 }, { "epoch": 2.6214605067064083, "grad_norm": 8.953884842713531, "learning_rate": 1.154697712578162e-06, "loss": 0.3086, "num_input_tokens_seen": 14409728, "step": 3518 }, { "epoch": 2.62220566318927, "grad_norm": 9.963134122749, "learning_rate": 1.154114181950499e-06, "loss": 0.267, "num_input_tokens_seen": 14413824, "step": 3519 }, { "epoch": 2.6229508196721314, "grad_norm": 7.3655764406792645, "learning_rate": 1.1535306723416922e-06, "loss": 0.3475, "num_input_tokens_seen": 14417920, "step": 3520 }, { "epoch": 2.6236959761549925, "grad_norm": 9.310151652545107, "learning_rate": 1.1529471838796507e-06, "loss": 0.3978, "num_input_tokens_seen": 14422016, "step": 3521 }, { "epoch": 2.624441132637854, "grad_norm": 8.40275840442208, "learning_rate": 1.1523637166922795e-06, "loss": 0.4115, "num_input_tokens_seen": 14426112, "step": 3522 }, { "epoch": 2.625186289120715, "grad_norm": 9.338441622100072, "learning_rate": 1.151780270907479e-06, "loss": 0.1262, "num_input_tokens_seen": 14430208, "step": 3523 }, { "epoch": 2.6259314456035767, "grad_norm": 10.423343482395046, "learning_rate": 1.1511968466531446e-06, "loss": 0.4811, "num_input_tokens_seen": 14434304, "step": 3524 }, { "epoch": 2.6266766020864383, "grad_norm": 9.878069287267842, "learning_rate": 1.1506134440571668e-06, "loss": 0.4714, "num_input_tokens_seen": 14438400, "step": 3525 }, { "epoch": 2.6274217585693, "grad_norm": 9.648466135178564, "learning_rate": 1.1500300632474318e-06, "loss": 0.5017, "num_input_tokens_seen": 14442496, "step": 3526 }, { "epoch": 2.628166915052161, "grad_norm": 8.838490104899376, "learning_rate": 1.1494467043518206e-06, "loss": 0.2242, "num_input_tokens_seen": 14446592, "step": 3527 }, { "epoch": 2.6289120715350225, "grad_norm": 7.547758568093761, "learning_rate": 1.1488633674982097e-06, "loss": 0.3743, "num_input_tokens_seen": 14450688, "step": 3528 }, { "epoch": 2.6296572280178836, "grad_norm": 9.478857809002132, "learning_rate": 1.148280052814471e-06, "loss": 0.3964, "num_input_tokens_seen": 14454784, "step": 3529 }, { "epoch": 2.630402384500745, "grad_norm": 9.378963840544705, "learning_rate": 1.147696760428471e-06, "loss": 0.334, "num_input_tokens_seen": 14458880, "step": 3530 }, { "epoch": 2.6311475409836067, "grad_norm": 9.11390809365318, "learning_rate": 1.1471134904680715e-06, "loss": 0.3026, "num_input_tokens_seen": 14462976, "step": 3531 }, { "epoch": 2.631892697466468, "grad_norm": 10.441415806876718, "learning_rate": 1.1465302430611298e-06, "loss": 0.4094, "num_input_tokens_seen": 14467072, "step": 3532 }, { "epoch": 2.6326378539493294, "grad_norm": 8.393390170097627, "learning_rate": 1.1459470183354977e-06, "loss": 0.464, "num_input_tokens_seen": 14471168, "step": 3533 }, { "epoch": 2.633383010432191, "grad_norm": 10.399606233255948, "learning_rate": 1.1453638164190222e-06, "loss": 0.3522, "num_input_tokens_seen": 14475264, "step": 3534 }, { "epoch": 2.634128166915052, "grad_norm": 9.067844201973626, "learning_rate": 1.144780637439545e-06, "loss": 0.2746, "num_input_tokens_seen": 14479360, "step": 3535 }, { "epoch": 2.6348733233979136, "grad_norm": 10.057197037919623, "learning_rate": 1.1441974815249036e-06, "loss": 0.3148, "num_input_tokens_seen": 14483456, "step": 3536 }, { "epoch": 2.635618479880775, "grad_norm": 10.1917008410948, "learning_rate": 1.1436143488029302e-06, "loss": 0.2396, "num_input_tokens_seen": 14487552, "step": 3537 }, { "epoch": 2.6363636363636362, "grad_norm": 11.692071861716496, "learning_rate": 1.1430312394014507e-06, "loss": 0.393, "num_input_tokens_seen": 14491648, "step": 3538 }, { "epoch": 2.637108792846498, "grad_norm": 9.327668051888663, "learning_rate": 1.1424481534482873e-06, "loss": 0.3525, "num_input_tokens_seen": 14495744, "step": 3539 }, { "epoch": 2.637853949329359, "grad_norm": 9.6901918462668, "learning_rate": 1.1418650910712568e-06, "loss": 0.3514, "num_input_tokens_seen": 14499840, "step": 3540 }, { "epoch": 2.6385991058122205, "grad_norm": 9.130322175091834, "learning_rate": 1.1412820523981704e-06, "loss": 0.2351, "num_input_tokens_seen": 14503936, "step": 3541 }, { "epoch": 2.639344262295082, "grad_norm": 10.143188228326053, "learning_rate": 1.1406990375568343e-06, "loss": 0.5326, "num_input_tokens_seen": 14508032, "step": 3542 }, { "epoch": 2.6400894187779436, "grad_norm": 11.941152095899215, "learning_rate": 1.14011604667505e-06, "loss": 0.3324, "num_input_tokens_seen": 14512128, "step": 3543 }, { "epoch": 2.6408345752608047, "grad_norm": 8.453975717755249, "learning_rate": 1.1395330798806128e-06, "loss": 0.2965, "num_input_tokens_seen": 14516224, "step": 3544 }, { "epoch": 2.6415797317436662, "grad_norm": 10.034706724020515, "learning_rate": 1.1389501373013132e-06, "loss": 0.6146, "num_input_tokens_seen": 14520320, "step": 3545 }, { "epoch": 2.6423248882265273, "grad_norm": 8.405456327325574, "learning_rate": 1.1383672190649365e-06, "loss": 0.5798, "num_input_tokens_seen": 14524416, "step": 3546 }, { "epoch": 2.643070044709389, "grad_norm": 9.188726585462277, "learning_rate": 1.1377843252992624e-06, "loss": 0.5623, "num_input_tokens_seen": 14528512, "step": 3547 }, { "epoch": 2.6438152011922504, "grad_norm": 9.489610615974742, "learning_rate": 1.1372014561320654e-06, "loss": 0.551, "num_input_tokens_seen": 14532608, "step": 3548 }, { "epoch": 2.644560357675112, "grad_norm": 9.28741689240064, "learning_rate": 1.1366186116911148e-06, "loss": 0.3512, "num_input_tokens_seen": 14536704, "step": 3549 }, { "epoch": 2.645305514157973, "grad_norm": 8.648139436015176, "learning_rate": 1.1360357921041743e-06, "loss": 0.1728, "num_input_tokens_seen": 14540800, "step": 3550 }, { "epoch": 2.6460506706408347, "grad_norm": 9.671809361295544, "learning_rate": 1.1354529974990022e-06, "loss": 0.6809, "num_input_tokens_seen": 14544896, "step": 3551 }, { "epoch": 2.6467958271236958, "grad_norm": 8.704036926503179, "learning_rate": 1.1348702280033506e-06, "loss": 0.2593, "num_input_tokens_seen": 14548992, "step": 3552 }, { "epoch": 2.6475409836065573, "grad_norm": 8.465435913456517, "learning_rate": 1.1342874837449673e-06, "loss": 0.6094, "num_input_tokens_seen": 14553088, "step": 3553 }, { "epoch": 2.648286140089419, "grad_norm": 8.22109164340037, "learning_rate": 1.1337047648515935e-06, "loss": 0.3515, "num_input_tokens_seen": 14557184, "step": 3554 }, { "epoch": 2.6490312965722804, "grad_norm": 8.276492909427969, "learning_rate": 1.133122071450966e-06, "loss": 0.4677, "num_input_tokens_seen": 14561280, "step": 3555 }, { "epoch": 2.6497764530551415, "grad_norm": 9.056463651969977, "learning_rate": 1.1325394036708148e-06, "loss": 0.4201, "num_input_tokens_seen": 14565376, "step": 3556 }, { "epoch": 2.650521609538003, "grad_norm": 8.752107122522347, "learning_rate": 1.131956761638865e-06, "loss": 0.3379, "num_input_tokens_seen": 14569472, "step": 3557 }, { "epoch": 2.651266766020864, "grad_norm": 8.262391170672947, "learning_rate": 1.131374145482836e-06, "loss": 0.3547, "num_input_tokens_seen": 14573568, "step": 3558 }, { "epoch": 2.6520119225037257, "grad_norm": 11.298731300074731, "learning_rate": 1.1307915553304413e-06, "loss": 0.4682, "num_input_tokens_seen": 14577664, "step": 3559 }, { "epoch": 2.6527570789865873, "grad_norm": 10.655216177310008, "learning_rate": 1.1302089913093893e-06, "loss": 0.4024, "num_input_tokens_seen": 14581760, "step": 3560 }, { "epoch": 2.653502235469449, "grad_norm": 8.990080381466338, "learning_rate": 1.1296264535473816e-06, "loss": 0.4452, "num_input_tokens_seen": 14585856, "step": 3561 }, { "epoch": 2.65424739195231, "grad_norm": 15.340127703389467, "learning_rate": 1.1290439421721146e-06, "loss": 0.3435, "num_input_tokens_seen": 14589952, "step": 3562 }, { "epoch": 2.6549925484351715, "grad_norm": 9.591058898642917, "learning_rate": 1.1284614573112792e-06, "loss": 0.5881, "num_input_tokens_seen": 14594048, "step": 3563 }, { "epoch": 2.6557377049180326, "grad_norm": 7.905174472897865, "learning_rate": 1.127878999092561e-06, "loss": 0.4911, "num_input_tokens_seen": 14598144, "step": 3564 }, { "epoch": 2.656482861400894, "grad_norm": 8.720680099206813, "learning_rate": 1.1272965676436374e-06, "loss": 0.3728, "num_input_tokens_seen": 14602240, "step": 3565 }, { "epoch": 2.6572280178837557, "grad_norm": 8.563264059194607, "learning_rate": 1.1267141630921826e-06, "loss": 0.5463, "num_input_tokens_seen": 14606336, "step": 3566 }, { "epoch": 2.657973174366617, "grad_norm": 7.469046381019521, "learning_rate": 1.1261317855658634e-06, "loss": 0.298, "num_input_tokens_seen": 14610432, "step": 3567 }, { "epoch": 2.6587183308494784, "grad_norm": 10.094415409227766, "learning_rate": 1.1255494351923412e-06, "loss": 0.5725, "num_input_tokens_seen": 14614528, "step": 3568 }, { "epoch": 2.65946348733234, "grad_norm": 9.924852378794457, "learning_rate": 1.1249671120992718e-06, "loss": 0.4199, "num_input_tokens_seen": 14618624, "step": 3569 }, { "epoch": 2.660208643815201, "grad_norm": 9.461322042300816, "learning_rate": 1.1243848164143043e-06, "loss": 0.3499, "num_input_tokens_seen": 14622720, "step": 3570 }, { "epoch": 2.6609538002980626, "grad_norm": 9.087758418683862, "learning_rate": 1.1238025482650818e-06, "loss": 0.5526, "num_input_tokens_seen": 14626816, "step": 3571 }, { "epoch": 2.661698956780924, "grad_norm": 11.458129967528377, "learning_rate": 1.123220307779242e-06, "loss": 0.4634, "num_input_tokens_seen": 14630912, "step": 3572 }, { "epoch": 2.6624441132637853, "grad_norm": 9.338914999559004, "learning_rate": 1.1226380950844156e-06, "loss": 0.4727, "num_input_tokens_seen": 14635008, "step": 3573 }, { "epoch": 2.663189269746647, "grad_norm": 10.549724086605298, "learning_rate": 1.1220559103082287e-06, "loss": 0.4721, "num_input_tokens_seen": 14639104, "step": 3574 }, { "epoch": 2.663934426229508, "grad_norm": 9.147400737269745, "learning_rate": 1.1214737535782994e-06, "loss": 0.5573, "num_input_tokens_seen": 14643200, "step": 3575 }, { "epoch": 2.6646795827123695, "grad_norm": 9.537422293591089, "learning_rate": 1.1208916250222413e-06, "loss": 0.5803, "num_input_tokens_seen": 14647296, "step": 3576 }, { "epoch": 2.665424739195231, "grad_norm": 10.946894457200196, "learning_rate": 1.120309524767661e-06, "loss": 0.5272, "num_input_tokens_seen": 14651392, "step": 3577 }, { "epoch": 2.6661698956780926, "grad_norm": 8.810998860800593, "learning_rate": 1.1197274529421593e-06, "loss": 0.4701, "num_input_tokens_seen": 14655488, "step": 3578 }, { "epoch": 2.6669150521609537, "grad_norm": 9.118226118507453, "learning_rate": 1.1191454096733299e-06, "loss": 0.3019, "num_input_tokens_seen": 14659584, "step": 3579 }, { "epoch": 2.6676602086438153, "grad_norm": 8.539364455179998, "learning_rate": 1.1185633950887606e-06, "loss": 0.5384, "num_input_tokens_seen": 14663680, "step": 3580 }, { "epoch": 2.6684053651266764, "grad_norm": 17.17203731361863, "learning_rate": 1.1179814093160343e-06, "loss": 0.4015, "num_input_tokens_seen": 14667776, "step": 3581 }, { "epoch": 2.669150521609538, "grad_norm": 9.37961556169834, "learning_rate": 1.1173994524827254e-06, "loss": 0.6746, "num_input_tokens_seen": 14671872, "step": 3582 }, { "epoch": 2.6698956780923995, "grad_norm": 14.40386281505869, "learning_rate": 1.1168175247164034e-06, "loss": 0.3615, "num_input_tokens_seen": 14675968, "step": 3583 }, { "epoch": 2.670640834575261, "grad_norm": 8.406749710042712, "learning_rate": 1.116235626144631e-06, "loss": 0.4114, "num_input_tokens_seen": 14680064, "step": 3584 }, { "epoch": 2.671385991058122, "grad_norm": 8.113408026306976, "learning_rate": 1.1156537568949644e-06, "loss": 0.2063, "num_input_tokens_seen": 14684160, "step": 3585 }, { "epoch": 2.6721311475409837, "grad_norm": 8.353712005065733, "learning_rate": 1.1150719170949537e-06, "loss": 0.353, "num_input_tokens_seen": 14688256, "step": 3586 }, { "epoch": 2.672876304023845, "grad_norm": 9.290005308474278, "learning_rate": 1.1144901068721422e-06, "loss": 0.2947, "num_input_tokens_seen": 14692352, "step": 3587 }, { "epoch": 2.6736214605067063, "grad_norm": 9.022851132942373, "learning_rate": 1.1139083263540671e-06, "loss": 0.4089, "num_input_tokens_seen": 14696448, "step": 3588 }, { "epoch": 2.674366616989568, "grad_norm": 9.048515650127591, "learning_rate": 1.1133265756682585e-06, "loss": 0.1855, "num_input_tokens_seen": 14700544, "step": 3589 }, { "epoch": 2.6751117734724295, "grad_norm": 11.153179650188754, "learning_rate": 1.1127448549422405e-06, "loss": 0.4792, "num_input_tokens_seen": 14704640, "step": 3590 }, { "epoch": 2.6758569299552906, "grad_norm": 11.945907134152808, "learning_rate": 1.1121631643035308e-06, "loss": 0.3907, "num_input_tokens_seen": 14708736, "step": 3591 }, { "epoch": 2.676602086438152, "grad_norm": 11.868979833320653, "learning_rate": 1.1115815038796393e-06, "loss": 0.4785, "num_input_tokens_seen": 14712832, "step": 3592 }, { "epoch": 2.6773472429210132, "grad_norm": 10.341628777998558, "learning_rate": 1.1109998737980703e-06, "loss": 0.5395, "num_input_tokens_seen": 14716928, "step": 3593 }, { "epoch": 2.678092399403875, "grad_norm": 7.8935833393770825, "learning_rate": 1.1104182741863218e-06, "loss": 0.3811, "num_input_tokens_seen": 14721024, "step": 3594 }, { "epoch": 2.6788375558867363, "grad_norm": 9.083720737131319, "learning_rate": 1.1098367051718842e-06, "loss": 0.6048, "num_input_tokens_seen": 14725120, "step": 3595 }, { "epoch": 2.679582712369598, "grad_norm": 11.248113640728128, "learning_rate": 1.1092551668822417e-06, "loss": 0.4313, "num_input_tokens_seen": 14729216, "step": 3596 }, { "epoch": 2.680327868852459, "grad_norm": 9.5796952868568, "learning_rate": 1.1086736594448716e-06, "loss": 0.3067, "num_input_tokens_seen": 14733312, "step": 3597 }, { "epoch": 2.6810730253353205, "grad_norm": 9.82037953383564, "learning_rate": 1.1080921829872447e-06, "loss": 0.5244, "num_input_tokens_seen": 14737408, "step": 3598 }, { "epoch": 2.6818181818181817, "grad_norm": 9.737163837455538, "learning_rate": 1.1075107376368244e-06, "loss": 0.4404, "num_input_tokens_seen": 14741504, "step": 3599 }, { "epoch": 2.682563338301043, "grad_norm": 10.860265064504548, "learning_rate": 1.1069293235210679e-06, "loss": 0.2816, "num_input_tokens_seen": 14745600, "step": 3600 }, { "epoch": 2.6833084947839048, "grad_norm": 8.981248839204037, "learning_rate": 1.1063479407674253e-06, "loss": 0.417, "num_input_tokens_seen": 14749696, "step": 3601 }, { "epoch": 2.684053651266766, "grad_norm": 10.07697798133254, "learning_rate": 1.10576658950334e-06, "loss": 0.4393, "num_input_tokens_seen": 14753792, "step": 3602 }, { "epoch": 2.6847988077496274, "grad_norm": 10.869059014866705, "learning_rate": 1.1051852698562479e-06, "loss": 0.3313, "num_input_tokens_seen": 14757888, "step": 3603 }, { "epoch": 2.685543964232489, "grad_norm": 9.503311764761909, "learning_rate": 1.1046039819535788e-06, "loss": 0.2667, "num_input_tokens_seen": 14761984, "step": 3604 }, { "epoch": 2.68628912071535, "grad_norm": 9.231884260858251, "learning_rate": 1.1040227259227557e-06, "loss": 0.2549, "num_input_tokens_seen": 14766080, "step": 3605 }, { "epoch": 2.6870342771982116, "grad_norm": 7.634985855921757, "learning_rate": 1.1034415018911928e-06, "loss": 0.348, "num_input_tokens_seen": 14770176, "step": 3606 }, { "epoch": 2.687779433681073, "grad_norm": 10.60774942023603, "learning_rate": 1.1028603099862993e-06, "loss": 0.534, "num_input_tokens_seen": 14774272, "step": 3607 }, { "epoch": 2.6885245901639343, "grad_norm": 8.203266858129727, "learning_rate": 1.102279150335477e-06, "loss": 0.3289, "num_input_tokens_seen": 14778368, "step": 3608 }, { "epoch": 2.689269746646796, "grad_norm": 8.321920234180414, "learning_rate": 1.101698023066119e-06, "loss": 0.4078, "num_input_tokens_seen": 14782464, "step": 3609 }, { "epoch": 2.690014903129657, "grad_norm": 12.344688611092732, "learning_rate": 1.1011169283056136e-06, "loss": 0.4499, "num_input_tokens_seen": 14786560, "step": 3610 }, { "epoch": 2.6907600596125185, "grad_norm": 11.651323236761128, "learning_rate": 1.1005358661813406e-06, "loss": 0.2369, "num_input_tokens_seen": 14790656, "step": 3611 }, { "epoch": 2.69150521609538, "grad_norm": 10.95525621095142, "learning_rate": 1.099954836820673e-06, "loss": 0.4457, "num_input_tokens_seen": 14794752, "step": 3612 }, { "epoch": 2.6922503725782416, "grad_norm": 10.623794527645645, "learning_rate": 1.0993738403509768e-06, "loss": 0.4135, "num_input_tokens_seen": 14798848, "step": 3613 }, { "epoch": 2.6929955290611027, "grad_norm": 9.912129478597315, "learning_rate": 1.0987928768996101e-06, "loss": 0.2774, "num_input_tokens_seen": 14802944, "step": 3614 }, { "epoch": 2.6937406855439643, "grad_norm": 10.506457301941271, "learning_rate": 1.0982119465939247e-06, "loss": 0.5572, "num_input_tokens_seen": 14807040, "step": 3615 }, { "epoch": 2.6944858420268254, "grad_norm": 11.262767553667487, "learning_rate": 1.0976310495612643e-06, "loss": 0.3271, "num_input_tokens_seen": 14811136, "step": 3616 }, { "epoch": 2.695230998509687, "grad_norm": 10.08832775836953, "learning_rate": 1.097050185928966e-06, "loss": 0.4542, "num_input_tokens_seen": 14815232, "step": 3617 }, { "epoch": 2.6959761549925485, "grad_norm": 8.87815502656561, "learning_rate": 1.0964693558243595e-06, "loss": 0.1502, "num_input_tokens_seen": 14819328, "step": 3618 }, { "epoch": 2.69672131147541, "grad_norm": 9.178372525501079, "learning_rate": 1.095888559374766e-06, "loss": 0.5128, "num_input_tokens_seen": 14823424, "step": 3619 }, { "epoch": 2.697466467958271, "grad_norm": 9.857633574963673, "learning_rate": 1.0953077967075006e-06, "loss": 0.2421, "num_input_tokens_seen": 14827520, "step": 3620 }, { "epoch": 2.6982116244411327, "grad_norm": 9.85960718879346, "learning_rate": 1.0947270679498711e-06, "loss": 0.4019, "num_input_tokens_seen": 14831616, "step": 3621 }, { "epoch": 2.698956780923994, "grad_norm": 8.550672620470356, "learning_rate": 1.0941463732291768e-06, "loss": 0.2458, "num_input_tokens_seen": 14835712, "step": 3622 }, { "epoch": 2.6997019374068554, "grad_norm": 8.509051751704218, "learning_rate": 1.0935657126727105e-06, "loss": 0.2759, "num_input_tokens_seen": 14839808, "step": 3623 }, { "epoch": 2.700447093889717, "grad_norm": 12.383079265046176, "learning_rate": 1.0929850864077569e-06, "loss": 0.5087, "num_input_tokens_seen": 14843904, "step": 3624 }, { "epoch": 2.7011922503725785, "grad_norm": 10.262458098768906, "learning_rate": 1.0924044945615938e-06, "loss": 0.4488, "num_input_tokens_seen": 14848000, "step": 3625 }, { "epoch": 2.7019374068554396, "grad_norm": 8.143883864126922, "learning_rate": 1.0918239372614907e-06, "loss": 0.1715, "num_input_tokens_seen": 14852096, "step": 3626 }, { "epoch": 2.702682563338301, "grad_norm": 8.83250429687107, "learning_rate": 1.0912434146347101e-06, "loss": 0.6072, "num_input_tokens_seen": 14856192, "step": 3627 }, { "epoch": 2.7034277198211623, "grad_norm": 9.974854293117094, "learning_rate": 1.0906629268085065e-06, "loss": 0.4129, "num_input_tokens_seen": 14860288, "step": 3628 }, { "epoch": 2.704172876304024, "grad_norm": 10.031081161708347, "learning_rate": 1.0900824739101272e-06, "loss": 0.3469, "num_input_tokens_seen": 14864384, "step": 3629 }, { "epoch": 2.7049180327868854, "grad_norm": 9.255185558578678, "learning_rate": 1.0895020560668112e-06, "loss": 0.5107, "num_input_tokens_seen": 14868480, "step": 3630 }, { "epoch": 2.705663189269747, "grad_norm": 8.109886725514972, "learning_rate": 1.0889216734057908e-06, "loss": 0.4332, "num_input_tokens_seen": 14872576, "step": 3631 }, { "epoch": 2.706408345752608, "grad_norm": 10.828491884412669, "learning_rate": 1.0883413260542904e-06, "loss": 0.3315, "num_input_tokens_seen": 14876672, "step": 3632 }, { "epoch": 2.7071535022354696, "grad_norm": 10.345917953272359, "learning_rate": 1.087761014139525e-06, "loss": 0.3177, "num_input_tokens_seen": 14880768, "step": 3633 }, { "epoch": 2.7078986587183307, "grad_norm": 10.003231168432542, "learning_rate": 1.087180737788704e-06, "loss": 0.4397, "num_input_tokens_seen": 14884864, "step": 3634 }, { "epoch": 2.7086438152011922, "grad_norm": 10.854926861514059, "learning_rate": 1.086600497129028e-06, "loss": 0.31, "num_input_tokens_seen": 14888960, "step": 3635 }, { "epoch": 2.709388971684054, "grad_norm": 8.74776624890384, "learning_rate": 1.0860202922876896e-06, "loss": 0.359, "num_input_tokens_seen": 14893056, "step": 3636 }, { "epoch": 2.710134128166915, "grad_norm": 12.169438657800209, "learning_rate": 1.085440123391874e-06, "loss": 0.4167, "num_input_tokens_seen": 14897152, "step": 3637 }, { "epoch": 2.7108792846497765, "grad_norm": 7.590931683878406, "learning_rate": 1.0848599905687584e-06, "loss": 0.1896, "num_input_tokens_seen": 14901248, "step": 3638 }, { "epoch": 2.711624441132638, "grad_norm": 8.369368180034314, "learning_rate": 1.0842798939455121e-06, "loss": 0.369, "num_input_tokens_seen": 14905344, "step": 3639 }, { "epoch": 2.712369597615499, "grad_norm": 9.81400029505616, "learning_rate": 1.0836998336492965e-06, "loss": 0.7195, "num_input_tokens_seen": 14909440, "step": 3640 }, { "epoch": 2.7131147540983607, "grad_norm": 10.387206819032992, "learning_rate": 1.0831198098072648e-06, "loss": 0.5649, "num_input_tokens_seen": 14913536, "step": 3641 }, { "epoch": 2.7138599105812222, "grad_norm": 8.615861652318374, "learning_rate": 1.0825398225465627e-06, "loss": 0.3876, "num_input_tokens_seen": 14917632, "step": 3642 }, { "epoch": 2.7146050670640833, "grad_norm": 9.219945190462834, "learning_rate": 1.0819598719943269e-06, "loss": 0.3816, "num_input_tokens_seen": 14921728, "step": 3643 }, { "epoch": 2.715350223546945, "grad_norm": 7.575826849665021, "learning_rate": 1.0813799582776871e-06, "loss": 0.1772, "num_input_tokens_seen": 14925824, "step": 3644 }, { "epoch": 2.716095380029806, "grad_norm": 10.609009704292733, "learning_rate": 1.0808000815237652e-06, "loss": 0.5746, "num_input_tokens_seen": 14929920, "step": 3645 }, { "epoch": 2.7168405365126675, "grad_norm": 12.855412097362551, "learning_rate": 1.080220241859673e-06, "loss": 0.3335, "num_input_tokens_seen": 14934016, "step": 3646 }, { "epoch": 2.717585692995529, "grad_norm": 12.022977128291911, "learning_rate": 1.079640439412516e-06, "loss": 0.3026, "num_input_tokens_seen": 14938112, "step": 3647 }, { "epoch": 2.7183308494783907, "grad_norm": 9.890764879989812, "learning_rate": 1.079060674309391e-06, "loss": 0.5475, "num_input_tokens_seen": 14942208, "step": 3648 }, { "epoch": 2.7190760059612518, "grad_norm": 9.445087827946995, "learning_rate": 1.0784809466773871e-06, "loss": 0.4865, "num_input_tokens_seen": 14946304, "step": 3649 }, { "epoch": 2.7198211624441133, "grad_norm": 7.514825634191807, "learning_rate": 1.0779012566435845e-06, "loss": 0.4842, "num_input_tokens_seen": 14950400, "step": 3650 }, { "epoch": 2.7205663189269744, "grad_norm": 9.39110023881178, "learning_rate": 1.0773216043350552e-06, "loss": 0.3947, "num_input_tokens_seen": 14954496, "step": 3651 }, { "epoch": 2.721311475409836, "grad_norm": 10.523847798946878, "learning_rate": 1.0767419898788636e-06, "loss": 0.3828, "num_input_tokens_seen": 14958592, "step": 3652 }, { "epoch": 2.7220566318926975, "grad_norm": 9.517053675166032, "learning_rate": 1.0761624134020646e-06, "loss": 0.3838, "num_input_tokens_seen": 14962688, "step": 3653 }, { "epoch": 2.722801788375559, "grad_norm": 8.82883673700943, "learning_rate": 1.075582875031706e-06, "loss": 0.4215, "num_input_tokens_seen": 14966784, "step": 3654 }, { "epoch": 2.72354694485842, "grad_norm": 9.017423931371658, "learning_rate": 1.0750033748948266e-06, "loss": 0.2165, "num_input_tokens_seen": 14970880, "step": 3655 }, { "epoch": 2.7242921013412817, "grad_norm": 8.652226115720742, "learning_rate": 1.074423913118457e-06, "loss": 0.2452, "num_input_tokens_seen": 14974976, "step": 3656 }, { "epoch": 2.725037257824143, "grad_norm": 9.66691468491505, "learning_rate": 1.0738444898296193e-06, "loss": 0.3188, "num_input_tokens_seen": 14979072, "step": 3657 }, { "epoch": 2.7257824143070044, "grad_norm": 9.236255349666187, "learning_rate": 1.0732651051553275e-06, "loss": 0.4675, "num_input_tokens_seen": 14983168, "step": 3658 }, { "epoch": 2.726527570789866, "grad_norm": 8.69783555264837, "learning_rate": 1.0726857592225872e-06, "loss": 0.3714, "num_input_tokens_seen": 14987264, "step": 3659 }, { "epoch": 2.7272727272727275, "grad_norm": 8.863070393944799, "learning_rate": 1.0721064521583939e-06, "loss": 0.5777, "num_input_tokens_seen": 14991360, "step": 3660 }, { "epoch": 2.7280178837555886, "grad_norm": 9.272971949937569, "learning_rate": 1.0715271840897365e-06, "loss": 0.4071, "num_input_tokens_seen": 14995456, "step": 3661 }, { "epoch": 2.72876304023845, "grad_norm": 9.998641553915494, "learning_rate": 1.070947955143595e-06, "loss": 0.3918, "num_input_tokens_seen": 14999552, "step": 3662 }, { "epoch": 2.7295081967213113, "grad_norm": 12.22826262546763, "learning_rate": 1.0703687654469404e-06, "loss": 0.3456, "num_input_tokens_seen": 15003648, "step": 3663 }, { "epoch": 2.730253353204173, "grad_norm": 12.542967172757109, "learning_rate": 1.0697896151267348e-06, "loss": 0.4295, "num_input_tokens_seen": 15007744, "step": 3664 }, { "epoch": 2.7309985096870344, "grad_norm": 9.298146932788237, "learning_rate": 1.0692105043099324e-06, "loss": 0.422, "num_input_tokens_seen": 15011840, "step": 3665 }, { "epoch": 2.731743666169896, "grad_norm": 7.6943844826150425, "learning_rate": 1.0686314331234784e-06, "loss": 0.3713, "num_input_tokens_seen": 15015936, "step": 3666 }, { "epoch": 2.732488822652757, "grad_norm": 8.830697089905904, "learning_rate": 1.0680524016943095e-06, "loss": 0.4, "num_input_tokens_seen": 15020032, "step": 3667 }, { "epoch": 2.7332339791356186, "grad_norm": 10.881767089667791, "learning_rate": 1.0674734101493531e-06, "loss": 0.3998, "num_input_tokens_seen": 15024128, "step": 3668 }, { "epoch": 2.7339791356184797, "grad_norm": 8.641446400628759, "learning_rate": 1.0668944586155288e-06, "loss": 0.4227, "num_input_tokens_seen": 15028224, "step": 3669 }, { "epoch": 2.7347242921013413, "grad_norm": 9.011637064370255, "learning_rate": 1.0663155472197466e-06, "loss": 0.3303, "num_input_tokens_seen": 15032320, "step": 3670 }, { "epoch": 2.735469448584203, "grad_norm": 7.863314368251982, "learning_rate": 1.0657366760889083e-06, "loss": 0.3732, "num_input_tokens_seen": 15036416, "step": 3671 }, { "epoch": 2.736214605067064, "grad_norm": 9.261086857260631, "learning_rate": 1.0651578453499064e-06, "loss": 0.1829, "num_input_tokens_seen": 15040512, "step": 3672 }, { "epoch": 2.7369597615499255, "grad_norm": 9.684066784418048, "learning_rate": 1.0645790551296245e-06, "loss": 0.4336, "num_input_tokens_seen": 15044608, "step": 3673 }, { "epoch": 2.737704918032787, "grad_norm": 10.279487773304574, "learning_rate": 1.0640003055549377e-06, "loss": 0.4512, "num_input_tokens_seen": 15048704, "step": 3674 }, { "epoch": 2.738450074515648, "grad_norm": 10.409735997559006, "learning_rate": 1.063421596752712e-06, "loss": 0.3657, "num_input_tokens_seen": 15052800, "step": 3675 }, { "epoch": 2.7391952309985097, "grad_norm": 10.778344274634575, "learning_rate": 1.0628429288498045e-06, "loss": 0.3709, "num_input_tokens_seen": 15056896, "step": 3676 }, { "epoch": 2.7399403874813713, "grad_norm": 9.806314873274296, "learning_rate": 1.0622643019730636e-06, "loss": 0.4512, "num_input_tokens_seen": 15060992, "step": 3677 }, { "epoch": 2.7406855439642324, "grad_norm": 9.368559855958654, "learning_rate": 1.0616857162493282e-06, "loss": 0.393, "num_input_tokens_seen": 15065088, "step": 3678 }, { "epoch": 2.741430700447094, "grad_norm": 10.53660476074744, "learning_rate": 1.0611071718054283e-06, "loss": 0.4627, "num_input_tokens_seen": 15069184, "step": 3679 }, { "epoch": 2.742175856929955, "grad_norm": 10.49424708900277, "learning_rate": 1.0605286687681857e-06, "loss": 0.4651, "num_input_tokens_seen": 15073280, "step": 3680 }, { "epoch": 2.7429210134128166, "grad_norm": 10.545176688921332, "learning_rate": 1.0599502072644114e-06, "loss": 0.2185, "num_input_tokens_seen": 15077376, "step": 3681 }, { "epoch": 2.743666169895678, "grad_norm": 7.681502310330052, "learning_rate": 1.0593717874209087e-06, "loss": 0.3048, "num_input_tokens_seen": 15081472, "step": 3682 }, { "epoch": 2.7444113263785397, "grad_norm": 9.826172435262446, "learning_rate": 1.0587934093644718e-06, "loss": 0.2833, "num_input_tokens_seen": 15085568, "step": 3683 }, { "epoch": 2.745156482861401, "grad_norm": 8.849879652672774, "learning_rate": 1.0582150732218843e-06, "loss": 0.4278, "num_input_tokens_seen": 15089664, "step": 3684 }, { "epoch": 2.7459016393442623, "grad_norm": 10.41871739539764, "learning_rate": 1.0576367791199227e-06, "loss": 0.2376, "num_input_tokens_seen": 15093760, "step": 3685 }, { "epoch": 2.7466467958271235, "grad_norm": 10.405895085786652, "learning_rate": 1.057058527185353e-06, "loss": 0.2274, "num_input_tokens_seen": 15097856, "step": 3686 }, { "epoch": 2.747391952309985, "grad_norm": 11.827032783526818, "learning_rate": 1.0564803175449315e-06, "loss": 0.3191, "num_input_tokens_seen": 15101952, "step": 3687 }, { "epoch": 2.7481371087928466, "grad_norm": 8.1400497468824, "learning_rate": 1.055902150325406e-06, "loss": 0.631, "num_input_tokens_seen": 15106048, "step": 3688 }, { "epoch": 2.748882265275708, "grad_norm": 8.108366432204, "learning_rate": 1.0553240256535155e-06, "loss": 0.2641, "num_input_tokens_seen": 15110144, "step": 3689 }, { "epoch": 2.7496274217585692, "grad_norm": 11.158010126186541, "learning_rate": 1.0547459436559886e-06, "loss": 0.4577, "num_input_tokens_seen": 15114240, "step": 3690 }, { "epoch": 2.7503725782414308, "grad_norm": 12.534553690733189, "learning_rate": 1.0541679044595449e-06, "loss": 0.2907, "num_input_tokens_seen": 15118336, "step": 3691 }, { "epoch": 2.751117734724292, "grad_norm": 8.983652990354855, "learning_rate": 1.053589908190895e-06, "loss": 0.375, "num_input_tokens_seen": 15122432, "step": 3692 }, { "epoch": 2.7518628912071534, "grad_norm": 8.530012584922279, "learning_rate": 1.0530119549767396e-06, "loss": 0.4219, "num_input_tokens_seen": 15126528, "step": 3693 }, { "epoch": 2.752608047690015, "grad_norm": 10.350905492227216, "learning_rate": 1.0524340449437704e-06, "loss": 0.2478, "num_input_tokens_seen": 15130624, "step": 3694 }, { "epoch": 2.7533532041728765, "grad_norm": 10.612833367269234, "learning_rate": 1.0518561782186693e-06, "loss": 0.4692, "num_input_tokens_seen": 15134720, "step": 3695 }, { "epoch": 2.7540983606557377, "grad_norm": 8.506318239434092, "learning_rate": 1.0512783549281089e-06, "loss": 0.4034, "num_input_tokens_seen": 15138816, "step": 3696 }, { "epoch": 2.754843517138599, "grad_norm": 10.570686715560223, "learning_rate": 1.050700575198752e-06, "loss": 0.2869, "num_input_tokens_seen": 15142912, "step": 3697 }, { "epoch": 2.7555886736214603, "grad_norm": 8.720768144545753, "learning_rate": 1.0501228391572522e-06, "loss": 0.3275, "num_input_tokens_seen": 15147008, "step": 3698 }, { "epoch": 2.756333830104322, "grad_norm": 7.8201798461759715, "learning_rate": 1.0495451469302535e-06, "loss": 0.3776, "num_input_tokens_seen": 15151104, "step": 3699 }, { "epoch": 2.7570789865871834, "grad_norm": 7.679775535268652, "learning_rate": 1.04896749864439e-06, "loss": 0.2907, "num_input_tokens_seen": 15155200, "step": 3700 }, { "epoch": 2.757824143070045, "grad_norm": 9.596239485364622, "learning_rate": 1.048389894426286e-06, "loss": 0.4951, "num_input_tokens_seen": 15159296, "step": 3701 }, { "epoch": 2.758569299552906, "grad_norm": 9.384984468892268, "learning_rate": 1.0478123344025566e-06, "loss": 0.3894, "num_input_tokens_seen": 15163392, "step": 3702 }, { "epoch": 2.7593144560357676, "grad_norm": 10.203475388739443, "learning_rate": 1.0472348186998076e-06, "loss": 0.44, "num_input_tokens_seen": 15167488, "step": 3703 }, { "epoch": 2.7600596125186287, "grad_norm": 13.810500121354305, "learning_rate": 1.0466573474446339e-06, "loss": 0.2329, "num_input_tokens_seen": 15171584, "step": 3704 }, { "epoch": 2.7608047690014903, "grad_norm": 9.740807312489535, "learning_rate": 1.0460799207636218e-06, "loss": 0.4538, "num_input_tokens_seen": 15175680, "step": 3705 }, { "epoch": 2.761549925484352, "grad_norm": 9.790242965963635, "learning_rate": 1.0455025387833472e-06, "loss": 0.5946, "num_input_tokens_seen": 15179776, "step": 3706 }, { "epoch": 2.762295081967213, "grad_norm": 11.924514442749848, "learning_rate": 1.0449252016303765e-06, "loss": 0.5334, "num_input_tokens_seen": 15183872, "step": 3707 }, { "epoch": 2.7630402384500745, "grad_norm": 9.847640385491383, "learning_rate": 1.044347909431266e-06, "loss": 0.5862, "num_input_tokens_seen": 15187968, "step": 3708 }, { "epoch": 2.763785394932936, "grad_norm": 11.764229107281286, "learning_rate": 1.0437706623125619e-06, "loss": 0.3146, "num_input_tokens_seen": 15192064, "step": 3709 }, { "epoch": 2.764530551415797, "grad_norm": 9.917406017778807, "learning_rate": 1.0431934604008012e-06, "loss": 0.243, "num_input_tokens_seen": 15196160, "step": 3710 }, { "epoch": 2.7652757078986587, "grad_norm": 8.219424425703007, "learning_rate": 1.0426163038225109e-06, "loss": 0.3128, "num_input_tokens_seen": 15200256, "step": 3711 }, { "epoch": 2.7660208643815203, "grad_norm": 12.356484408544725, "learning_rate": 1.0420391927042075e-06, "loss": 0.6257, "num_input_tokens_seen": 15204352, "step": 3712 }, { "epoch": 2.7667660208643814, "grad_norm": 9.265986294629242, "learning_rate": 1.0414621271723988e-06, "loss": 0.448, "num_input_tokens_seen": 15208448, "step": 3713 }, { "epoch": 2.767511177347243, "grad_norm": 26.956811151600107, "learning_rate": 1.04088510735358e-06, "loss": 0.333, "num_input_tokens_seen": 15212544, "step": 3714 }, { "epoch": 2.768256333830104, "grad_norm": 8.836249220840562, "learning_rate": 1.0403081333742393e-06, "loss": 0.4218, "num_input_tokens_seen": 15216640, "step": 3715 }, { "epoch": 2.7690014903129656, "grad_norm": 9.132068331578239, "learning_rate": 1.0397312053608527e-06, "loss": 0.4167, "num_input_tokens_seen": 15220736, "step": 3716 }, { "epoch": 2.769746646795827, "grad_norm": 9.51923297109291, "learning_rate": 1.0391543234398879e-06, "loss": 0.6147, "num_input_tokens_seen": 15224832, "step": 3717 }, { "epoch": 2.7704918032786887, "grad_norm": 11.886003094279165, "learning_rate": 1.0385774877378007e-06, "loss": 0.3341, "num_input_tokens_seen": 15228928, "step": 3718 }, { "epoch": 2.77123695976155, "grad_norm": 10.628781166430521, "learning_rate": 1.0380006983810378e-06, "loss": 0.5805, "num_input_tokens_seen": 15233024, "step": 3719 }, { "epoch": 2.7719821162444114, "grad_norm": 9.475782616209429, "learning_rate": 1.0374239554960356e-06, "loss": 0.636, "num_input_tokens_seen": 15237120, "step": 3720 }, { "epoch": 2.7727272727272725, "grad_norm": 9.886786576595409, "learning_rate": 1.0368472592092203e-06, "loss": 0.5232, "num_input_tokens_seen": 15241216, "step": 3721 }, { "epoch": 2.773472429210134, "grad_norm": 8.240494483747948, "learning_rate": 1.036270609647008e-06, "loss": 0.3785, "num_input_tokens_seen": 15245312, "step": 3722 }, { "epoch": 2.7742175856929956, "grad_norm": 8.537996464569117, "learning_rate": 1.035694006935804e-06, "loss": 0.6391, "num_input_tokens_seen": 15249408, "step": 3723 }, { "epoch": 2.774962742175857, "grad_norm": 10.083178099272088, "learning_rate": 1.0351174512020044e-06, "loss": 0.3755, "num_input_tokens_seen": 15253504, "step": 3724 }, { "epoch": 2.7757078986587183, "grad_norm": 8.86191028312424, "learning_rate": 1.0345409425719934e-06, "loss": 0.4288, "num_input_tokens_seen": 15257600, "step": 3725 }, { "epoch": 2.77645305514158, "grad_norm": 9.80684167354436, "learning_rate": 1.0339644811721467e-06, "loss": 0.4494, "num_input_tokens_seen": 15261696, "step": 3726 }, { "epoch": 2.777198211624441, "grad_norm": 8.832371949051192, "learning_rate": 1.0333880671288284e-06, "loss": 0.289, "num_input_tokens_seen": 15265792, "step": 3727 }, { "epoch": 2.7779433681073025, "grad_norm": 9.023940623401513, "learning_rate": 1.0328117005683921e-06, "loss": 0.5102, "num_input_tokens_seen": 15269888, "step": 3728 }, { "epoch": 2.778688524590164, "grad_norm": 10.860734869147553, "learning_rate": 1.032235381617182e-06, "loss": 0.2461, "num_input_tokens_seen": 15273984, "step": 3729 }, { "epoch": 2.7794336810730256, "grad_norm": 9.947094596234425, "learning_rate": 1.0316591104015314e-06, "loss": 0.4598, "num_input_tokens_seen": 15278080, "step": 3730 }, { "epoch": 2.7801788375558867, "grad_norm": 7.627916488083654, "learning_rate": 1.0310828870477627e-06, "loss": 0.3797, "num_input_tokens_seen": 15282176, "step": 3731 }, { "epoch": 2.7809239940387482, "grad_norm": 8.679506566346125, "learning_rate": 1.0305067116821885e-06, "loss": 0.3435, "num_input_tokens_seen": 15286272, "step": 3732 }, { "epoch": 2.7816691505216093, "grad_norm": 8.822299482634133, "learning_rate": 1.0299305844311102e-06, "loss": 0.2807, "num_input_tokens_seen": 15290368, "step": 3733 }, { "epoch": 2.782414307004471, "grad_norm": 8.729808352434686, "learning_rate": 1.0293545054208196e-06, "loss": 0.4355, "num_input_tokens_seen": 15294464, "step": 3734 }, { "epoch": 2.7831594634873325, "grad_norm": 11.61126820568835, "learning_rate": 1.0287784747775968e-06, "loss": 0.3442, "num_input_tokens_seen": 15298560, "step": 3735 }, { "epoch": 2.783904619970194, "grad_norm": 10.772485650428147, "learning_rate": 1.0282024926277119e-06, "loss": 0.3611, "num_input_tokens_seen": 15302656, "step": 3736 }, { "epoch": 2.784649776453055, "grad_norm": 9.50595423027191, "learning_rate": 1.0276265590974245e-06, "loss": 0.1518, "num_input_tokens_seen": 15306752, "step": 3737 }, { "epoch": 2.7853949329359167, "grad_norm": 10.915622418952845, "learning_rate": 1.027050674312983e-06, "loss": 0.7174, "num_input_tokens_seen": 15310848, "step": 3738 }, { "epoch": 2.7861400894187778, "grad_norm": 10.200563489998938, "learning_rate": 1.0264748384006259e-06, "loss": 0.305, "num_input_tokens_seen": 15314944, "step": 3739 }, { "epoch": 2.7868852459016393, "grad_norm": 9.357071186923736, "learning_rate": 1.0258990514865807e-06, "loss": 0.5918, "num_input_tokens_seen": 15319040, "step": 3740 }, { "epoch": 2.787630402384501, "grad_norm": 9.908767157114337, "learning_rate": 1.0253233136970634e-06, "loss": 0.3517, "num_input_tokens_seen": 15323136, "step": 3741 }, { "epoch": 2.788375558867362, "grad_norm": 9.81792812296011, "learning_rate": 1.0247476251582797e-06, "loss": 0.3903, "num_input_tokens_seen": 15327232, "step": 3742 }, { "epoch": 2.7891207153502235, "grad_norm": 7.281375402008903, "learning_rate": 1.0241719859964252e-06, "loss": 0.4806, "num_input_tokens_seen": 15331328, "step": 3743 }, { "epoch": 2.789865871833085, "grad_norm": 9.827763135444053, "learning_rate": 1.023596396337684e-06, "loss": 0.2539, "num_input_tokens_seen": 15335424, "step": 3744 }, { "epoch": 2.790611028315946, "grad_norm": 19.959433377623572, "learning_rate": 1.0230208563082296e-06, "loss": 0.5189, "num_input_tokens_seen": 15339520, "step": 3745 }, { "epoch": 2.7913561847988078, "grad_norm": 9.235307160432518, "learning_rate": 1.022445366034224e-06, "loss": 0.4467, "num_input_tokens_seen": 15343616, "step": 3746 }, { "epoch": 2.7921013412816693, "grad_norm": 9.439967478386922, "learning_rate": 1.0218699256418189e-06, "loss": 0.2958, "num_input_tokens_seen": 15347712, "step": 3747 }, { "epoch": 2.7928464977645304, "grad_norm": 10.010685450633387, "learning_rate": 1.0212945352571552e-06, "loss": 0.3731, "num_input_tokens_seen": 15351808, "step": 3748 }, { "epoch": 2.793591654247392, "grad_norm": 8.096785040705019, "learning_rate": 1.0207191950063624e-06, "loss": 0.2077, "num_input_tokens_seen": 15355904, "step": 3749 }, { "epoch": 2.794336810730253, "grad_norm": 12.535939442362604, "learning_rate": 1.0201439050155595e-06, "loss": 0.4626, "num_input_tokens_seen": 15360000, "step": 3750 }, { "epoch": 2.7950819672131146, "grad_norm": 8.787483363162796, "learning_rate": 1.0195686654108539e-06, "loss": 0.3136, "num_input_tokens_seen": 15364096, "step": 3751 }, { "epoch": 2.795827123695976, "grad_norm": 9.796583709689596, "learning_rate": 1.0189934763183422e-06, "loss": 0.3001, "num_input_tokens_seen": 15368192, "step": 3752 }, { "epoch": 2.7965722801788377, "grad_norm": 8.002260782091014, "learning_rate": 1.0184183378641104e-06, "loss": 0.3619, "num_input_tokens_seen": 15372288, "step": 3753 }, { "epoch": 2.797317436661699, "grad_norm": 8.50678565520147, "learning_rate": 1.0178432501742321e-06, "loss": 0.35, "num_input_tokens_seen": 15376384, "step": 3754 }, { "epoch": 2.7980625931445604, "grad_norm": 9.682247081251697, "learning_rate": 1.0172682133747716e-06, "loss": 0.5892, "num_input_tokens_seen": 15380480, "step": 3755 }, { "epoch": 2.7988077496274215, "grad_norm": 11.413998868824395, "learning_rate": 1.01669322759178e-06, "loss": 0.3075, "num_input_tokens_seen": 15384576, "step": 3756 }, { "epoch": 2.799552906110283, "grad_norm": 9.14920739810487, "learning_rate": 1.016118292951299e-06, "loss": 0.4422, "num_input_tokens_seen": 15388672, "step": 3757 }, { "epoch": 2.8002980625931446, "grad_norm": 7.689148209398066, "learning_rate": 1.0155434095793583e-06, "loss": 0.2474, "num_input_tokens_seen": 15392768, "step": 3758 }, { "epoch": 2.801043219076006, "grad_norm": 9.444315310046157, "learning_rate": 1.0149685776019764e-06, "loss": 0.5926, "num_input_tokens_seen": 15396864, "step": 3759 }, { "epoch": 2.8017883755588673, "grad_norm": 13.9245652582864, "learning_rate": 1.0143937971451607e-06, "loss": 0.3991, "num_input_tokens_seen": 15400960, "step": 3760 }, { "epoch": 2.802533532041729, "grad_norm": 9.527035683751665, "learning_rate": 1.0138190683349067e-06, "loss": 0.4812, "num_input_tokens_seen": 15405056, "step": 3761 }, { "epoch": 2.80327868852459, "grad_norm": 8.499757424840855, "learning_rate": 1.0132443912972e-06, "loss": 0.1837, "num_input_tokens_seen": 15409152, "step": 3762 }, { "epoch": 2.8040238450074515, "grad_norm": 11.529956077671523, "learning_rate": 1.0126697661580128e-06, "loss": 0.353, "num_input_tokens_seen": 15413248, "step": 3763 }, { "epoch": 2.804769001490313, "grad_norm": 8.156844143869687, "learning_rate": 1.0120951930433078e-06, "loss": 0.2509, "num_input_tokens_seen": 15417344, "step": 3764 }, { "epoch": 2.8055141579731746, "grad_norm": 9.087986155327789, "learning_rate": 1.0115206720790352e-06, "loss": 0.6222, "num_input_tokens_seen": 15421440, "step": 3765 }, { "epoch": 2.8062593144560357, "grad_norm": 11.320185938721211, "learning_rate": 1.0109462033911345e-06, "loss": 0.5037, "num_input_tokens_seen": 15425536, "step": 3766 }, { "epoch": 2.8070044709388973, "grad_norm": 8.927327065445324, "learning_rate": 1.0103717871055326e-06, "loss": 0.4223, "num_input_tokens_seen": 15429632, "step": 3767 }, { "epoch": 2.8077496274217584, "grad_norm": 9.1508271707353, "learning_rate": 1.0097974233481461e-06, "loss": 0.3531, "num_input_tokens_seen": 15433728, "step": 3768 }, { "epoch": 2.80849478390462, "grad_norm": 9.876940595854279, "learning_rate": 1.0092231122448795e-06, "loss": 0.3874, "num_input_tokens_seen": 15437824, "step": 3769 }, { "epoch": 2.8092399403874815, "grad_norm": 10.80686071645011, "learning_rate": 1.008648853921626e-06, "loss": 0.5173, "num_input_tokens_seen": 15441920, "step": 3770 }, { "epoch": 2.809985096870343, "grad_norm": 10.871131579460343, "learning_rate": 1.0080746485042674e-06, "loss": 0.4224, "num_input_tokens_seen": 15446016, "step": 3771 }, { "epoch": 2.810730253353204, "grad_norm": 11.038455028005524, "learning_rate": 1.0075004961186733e-06, "loss": 0.3462, "num_input_tokens_seen": 15450112, "step": 3772 }, { "epoch": 2.8114754098360657, "grad_norm": 10.354562514125558, "learning_rate": 1.006926396890702e-06, "loss": 0.3825, "num_input_tokens_seen": 15454208, "step": 3773 }, { "epoch": 2.812220566318927, "grad_norm": 8.819879368173236, "learning_rate": 1.0063523509462001e-06, "loss": 0.4234, "num_input_tokens_seen": 15458304, "step": 3774 }, { "epoch": 2.8129657228017884, "grad_norm": 8.764876908763735, "learning_rate": 1.0057783584110026e-06, "loss": 0.49, "num_input_tokens_seen": 15462400, "step": 3775 }, { "epoch": 2.81371087928465, "grad_norm": 11.026679664416054, "learning_rate": 1.005204419410933e-06, "loss": 0.5334, "num_input_tokens_seen": 15466496, "step": 3776 }, { "epoch": 2.814456035767511, "grad_norm": 11.080926499104605, "learning_rate": 1.0046305340718024e-06, "loss": 0.3545, "num_input_tokens_seen": 15470592, "step": 3777 }, { "epoch": 2.8152011922503726, "grad_norm": 9.40568941202112, "learning_rate": 1.004056702519411e-06, "loss": 0.2042, "num_input_tokens_seen": 15474688, "step": 3778 }, { "epoch": 2.815946348733234, "grad_norm": 11.75208175172063, "learning_rate": 1.0034829248795468e-06, "loss": 0.4588, "num_input_tokens_seen": 15478784, "step": 3779 }, { "epoch": 2.8166915052160952, "grad_norm": 8.907189553805482, "learning_rate": 1.0029092012779858e-06, "loss": 0.214, "num_input_tokens_seen": 15482880, "step": 3780 }, { "epoch": 2.817436661698957, "grad_norm": 29.486890004917132, "learning_rate": 1.0023355318404921e-06, "loss": 0.701, "num_input_tokens_seen": 15486976, "step": 3781 }, { "epoch": 2.8181818181818183, "grad_norm": 9.497184554422391, "learning_rate": 1.0017619166928185e-06, "loss": 0.4733, "num_input_tokens_seen": 15491072, "step": 3782 }, { "epoch": 2.8189269746646795, "grad_norm": 9.984362514069048, "learning_rate": 1.001188355960705e-06, "loss": 0.3842, "num_input_tokens_seen": 15495168, "step": 3783 }, { "epoch": 2.819672131147541, "grad_norm": 10.563127068169155, "learning_rate": 1.0006148497698807e-06, "loss": 0.2516, "num_input_tokens_seen": 15499264, "step": 3784 }, { "epoch": 2.820417287630402, "grad_norm": 10.67879274191511, "learning_rate": 1.000041398246062e-06, "loss": 0.2967, "num_input_tokens_seen": 15503360, "step": 3785 }, { "epoch": 2.8211624441132637, "grad_norm": 10.032401169049159, "learning_rate": 9.994680015149537e-07, "loss": 0.4289, "num_input_tokens_seen": 15507456, "step": 3786 }, { "epoch": 2.821907600596125, "grad_norm": 10.146378998706064, "learning_rate": 9.988946597022484e-07, "loss": 0.6655, "num_input_tokens_seen": 15511552, "step": 3787 }, { "epoch": 2.8226527570789868, "grad_norm": 9.727630850219029, "learning_rate": 9.983213729336267e-07, "loss": 0.4919, "num_input_tokens_seen": 15515648, "step": 3788 }, { "epoch": 2.823397913561848, "grad_norm": 9.699908181468942, "learning_rate": 9.977481413347576e-07, "loss": 0.5285, "num_input_tokens_seen": 15519744, "step": 3789 }, { "epoch": 2.8241430700447094, "grad_norm": 8.957303646489283, "learning_rate": 9.971749650312967e-07, "loss": 0.4007, "num_input_tokens_seen": 15523840, "step": 3790 }, { "epoch": 2.8248882265275705, "grad_norm": 10.592531991393669, "learning_rate": 9.96601844148889e-07, "loss": 0.3283, "num_input_tokens_seen": 15527936, "step": 3791 }, { "epoch": 2.825633383010432, "grad_norm": 8.936894151226321, "learning_rate": 9.960287788131665e-07, "loss": 0.4471, "num_input_tokens_seen": 15532032, "step": 3792 }, { "epoch": 2.8263785394932937, "grad_norm": 10.457982335041839, "learning_rate": 9.954557691497495e-07, "loss": 0.6394, "num_input_tokens_seen": 15536128, "step": 3793 }, { "epoch": 2.827123695976155, "grad_norm": 10.16052609797119, "learning_rate": 9.948828152842452e-07, "loss": 0.3189, "num_input_tokens_seen": 15540224, "step": 3794 }, { "epoch": 2.8278688524590163, "grad_norm": 15.290402084052518, "learning_rate": 9.943099173422494e-07, "loss": 0.2219, "num_input_tokens_seen": 15544320, "step": 3795 }, { "epoch": 2.828614008941878, "grad_norm": 10.189831482435276, "learning_rate": 9.937370754493455e-07, "loss": 0.7483, "num_input_tokens_seen": 15548416, "step": 3796 }, { "epoch": 2.829359165424739, "grad_norm": 11.212386403245052, "learning_rate": 9.931642897311044e-07, "loss": 0.3301, "num_input_tokens_seen": 15552512, "step": 3797 }, { "epoch": 2.8301043219076005, "grad_norm": 8.280215833139458, "learning_rate": 9.92591560313085e-07, "loss": 0.3583, "num_input_tokens_seen": 15556608, "step": 3798 }, { "epoch": 2.830849478390462, "grad_norm": 10.092707604396674, "learning_rate": 9.920188873208343e-07, "loss": 0.4338, "num_input_tokens_seen": 15560704, "step": 3799 }, { "epoch": 2.8315946348733236, "grad_norm": 11.669276816665452, "learning_rate": 9.91446270879885e-07, "loss": 0.3258, "num_input_tokens_seen": 15564800, "step": 3800 }, { "epoch": 2.8323397913561847, "grad_norm": 11.727952957281712, "learning_rate": 9.908737111157593e-07, "loss": 0.5587, "num_input_tokens_seen": 15568896, "step": 3801 }, { "epoch": 2.8330849478390463, "grad_norm": 10.394528778910152, "learning_rate": 9.903012081539667e-07, "loss": 0.605, "num_input_tokens_seen": 15572992, "step": 3802 }, { "epoch": 2.8338301043219074, "grad_norm": 10.527332699878757, "learning_rate": 9.897287621200036e-07, "loss": 0.3693, "num_input_tokens_seen": 15577088, "step": 3803 }, { "epoch": 2.834575260804769, "grad_norm": 8.294858785089929, "learning_rate": 9.891563731393543e-07, "loss": 0.7225, "num_input_tokens_seen": 15581184, "step": 3804 }, { "epoch": 2.8353204172876305, "grad_norm": 10.355742327027817, "learning_rate": 9.885840413374905e-07, "loss": 0.5215, "num_input_tokens_seen": 15585280, "step": 3805 }, { "epoch": 2.836065573770492, "grad_norm": 7.842935297682698, "learning_rate": 9.88011766839872e-07, "loss": 0.1976, "num_input_tokens_seen": 15589376, "step": 3806 }, { "epoch": 2.836810730253353, "grad_norm": 7.735861671038723, "learning_rate": 9.87439549771945e-07, "loss": 0.2457, "num_input_tokens_seen": 15593472, "step": 3807 }, { "epoch": 2.8375558867362147, "grad_norm": 11.589802155730965, "learning_rate": 9.86867390259143e-07, "loss": 0.2075, "num_input_tokens_seen": 15597568, "step": 3808 }, { "epoch": 2.838301043219076, "grad_norm": 10.032086314682974, "learning_rate": 9.862952884268886e-07, "loss": 0.3211, "num_input_tokens_seen": 15601664, "step": 3809 }, { "epoch": 2.8390461997019374, "grad_norm": 8.959824072055511, "learning_rate": 9.857232444005894e-07, "loss": 0.4181, "num_input_tokens_seen": 15605760, "step": 3810 }, { "epoch": 2.839791356184799, "grad_norm": 9.755936666912694, "learning_rate": 9.851512583056422e-07, "loss": 0.4443, "num_input_tokens_seen": 15609856, "step": 3811 }, { "epoch": 2.84053651266766, "grad_norm": 8.649545403752827, "learning_rate": 9.845793302674306e-07, "loss": 0.4768, "num_input_tokens_seen": 15613952, "step": 3812 }, { "epoch": 2.8412816691505216, "grad_norm": 7.71384664639158, "learning_rate": 9.84007460411325e-07, "loss": 0.3111, "num_input_tokens_seen": 15618048, "step": 3813 }, { "epoch": 2.842026825633383, "grad_norm": 9.443962151746325, "learning_rate": 9.834356488626833e-07, "loss": 0.3865, "num_input_tokens_seen": 15622144, "step": 3814 }, { "epoch": 2.8427719821162443, "grad_norm": 10.44651525925538, "learning_rate": 9.82863895746851e-07, "loss": 0.5931, "num_input_tokens_seen": 15626240, "step": 3815 }, { "epoch": 2.843517138599106, "grad_norm": 8.364485770304992, "learning_rate": 9.822922011891603e-07, "loss": 0.3641, "num_input_tokens_seen": 15630336, "step": 3816 }, { "epoch": 2.8442622950819674, "grad_norm": 10.39989133739295, "learning_rate": 9.817205653149307e-07, "loss": 0.2832, "num_input_tokens_seen": 15634432, "step": 3817 }, { "epoch": 2.8450074515648285, "grad_norm": 9.542385523114195, "learning_rate": 9.811489882494684e-07, "loss": 0.4036, "num_input_tokens_seen": 15638528, "step": 3818 }, { "epoch": 2.84575260804769, "grad_norm": 8.370175962948931, "learning_rate": 9.805774701180681e-07, "loss": 0.57, "num_input_tokens_seen": 15642624, "step": 3819 }, { "epoch": 2.846497764530551, "grad_norm": 10.336354265021921, "learning_rate": 9.800060110460104e-07, "loss": 0.3099, "num_input_tokens_seen": 15646720, "step": 3820 }, { "epoch": 2.8472429210134127, "grad_norm": 9.251028983381515, "learning_rate": 9.794346111585628e-07, "loss": 0.4878, "num_input_tokens_seen": 15650816, "step": 3821 }, { "epoch": 2.8479880774962743, "grad_norm": 9.412411360931143, "learning_rate": 9.788632705809803e-07, "loss": 0.3675, "num_input_tokens_seen": 15654912, "step": 3822 }, { "epoch": 2.848733233979136, "grad_norm": 11.689796959750428, "learning_rate": 9.782919894385052e-07, "loss": 0.3497, "num_input_tokens_seen": 15659008, "step": 3823 }, { "epoch": 2.849478390461997, "grad_norm": 8.170825501934742, "learning_rate": 9.777207678563658e-07, "loss": 0.3822, "num_input_tokens_seen": 15663104, "step": 3824 }, { "epoch": 2.8502235469448585, "grad_norm": 10.402256705112556, "learning_rate": 9.771496059597785e-07, "loss": 0.504, "num_input_tokens_seen": 15667200, "step": 3825 }, { "epoch": 2.8509687034277196, "grad_norm": 10.266523723620384, "learning_rate": 9.76578503873946e-07, "loss": 0.2688, "num_input_tokens_seen": 15671296, "step": 3826 }, { "epoch": 2.851713859910581, "grad_norm": 10.44760564348208, "learning_rate": 9.760074617240579e-07, "loss": 0.2319, "num_input_tokens_seen": 15675392, "step": 3827 }, { "epoch": 2.8524590163934427, "grad_norm": 9.617654420161767, "learning_rate": 9.754364796352905e-07, "loss": 0.6592, "num_input_tokens_seen": 15679488, "step": 3828 }, { "epoch": 2.8532041728763042, "grad_norm": 10.077107273019813, "learning_rate": 9.748655577328075e-07, "loss": 0.5338, "num_input_tokens_seen": 15683584, "step": 3829 }, { "epoch": 2.8539493293591653, "grad_norm": 9.132818104114575, "learning_rate": 9.742946961417588e-07, "loss": 0.4517, "num_input_tokens_seen": 15687680, "step": 3830 }, { "epoch": 2.854694485842027, "grad_norm": 9.471924874918082, "learning_rate": 9.737238949872812e-07, "loss": 0.3264, "num_input_tokens_seen": 15691776, "step": 3831 }, { "epoch": 2.855439642324888, "grad_norm": 10.22810094331058, "learning_rate": 9.731531543944988e-07, "loss": 0.7114, "num_input_tokens_seen": 15695872, "step": 3832 }, { "epoch": 2.8561847988077496, "grad_norm": 9.151350115659838, "learning_rate": 9.725824744885218e-07, "loss": 0.3614, "num_input_tokens_seen": 15699968, "step": 3833 }, { "epoch": 2.856929955290611, "grad_norm": 9.908207612387445, "learning_rate": 9.720118553944479e-07, "loss": 0.1757, "num_input_tokens_seen": 15704064, "step": 3834 }, { "epoch": 2.8576751117734727, "grad_norm": 12.689679105287562, "learning_rate": 9.714412972373597e-07, "loss": 0.52, "num_input_tokens_seen": 15708160, "step": 3835 }, { "epoch": 2.8584202682563338, "grad_norm": 7.982922725528489, "learning_rate": 9.70870800142328e-07, "loss": 0.3471, "num_input_tokens_seen": 15712256, "step": 3836 }, { "epoch": 2.8591654247391953, "grad_norm": 11.892034740517655, "learning_rate": 9.703003642344108e-07, "loss": 0.3156, "num_input_tokens_seen": 15716352, "step": 3837 }, { "epoch": 2.8599105812220564, "grad_norm": 7.567617129171708, "learning_rate": 9.697299896386506e-07, "loss": 0.4504, "num_input_tokens_seen": 15720448, "step": 3838 }, { "epoch": 2.860655737704918, "grad_norm": 9.662412626512317, "learning_rate": 9.69159676480078e-07, "loss": 0.5673, "num_input_tokens_seen": 15724544, "step": 3839 }, { "epoch": 2.8614008941877795, "grad_norm": 10.676863611340478, "learning_rate": 9.685894248837094e-07, "loss": 0.6748, "num_input_tokens_seen": 15728640, "step": 3840 }, { "epoch": 2.862146050670641, "grad_norm": 8.324157565953502, "learning_rate": 9.680192349745484e-07, "loss": 0.4101, "num_input_tokens_seen": 15732736, "step": 3841 }, { "epoch": 2.862891207153502, "grad_norm": 10.260135849237884, "learning_rate": 9.674491068775849e-07, "loss": 0.514, "num_input_tokens_seen": 15736832, "step": 3842 }, { "epoch": 2.8636363636363638, "grad_norm": 14.396843402489047, "learning_rate": 9.668790407177944e-07, "loss": 0.5623, "num_input_tokens_seen": 15740928, "step": 3843 }, { "epoch": 2.864381520119225, "grad_norm": 9.66657543644361, "learning_rate": 9.663090366201405e-07, "loss": 0.5269, "num_input_tokens_seen": 15745024, "step": 3844 }, { "epoch": 2.8651266766020864, "grad_norm": 11.321816501842232, "learning_rate": 9.657390947095708e-07, "loss": 0.5656, "num_input_tokens_seen": 15749120, "step": 3845 }, { "epoch": 2.865871833084948, "grad_norm": 11.024737304108335, "learning_rate": 9.651692151110218e-07, "loss": 0.4875, "num_input_tokens_seen": 15753216, "step": 3846 }, { "epoch": 2.866616989567809, "grad_norm": 9.13718428968831, "learning_rate": 9.64599397949415e-07, "loss": 0.2065, "num_input_tokens_seen": 15757312, "step": 3847 }, { "epoch": 2.8673621460506706, "grad_norm": 8.169058127040516, "learning_rate": 9.640296433496576e-07, "loss": 0.2014, "num_input_tokens_seen": 15761408, "step": 3848 }, { "epoch": 2.868107302533532, "grad_norm": 11.903821795574112, "learning_rate": 9.634599514366445e-07, "loss": 0.212, "num_input_tokens_seen": 15765504, "step": 3849 }, { "epoch": 2.8688524590163933, "grad_norm": 8.874699084040044, "learning_rate": 9.628903223352558e-07, "loss": 0.438, "num_input_tokens_seen": 15769600, "step": 3850 }, { "epoch": 2.869597615499255, "grad_norm": 8.34539208656286, "learning_rate": 9.623207561703589e-07, "loss": 0.3717, "num_input_tokens_seen": 15773696, "step": 3851 }, { "epoch": 2.8703427719821164, "grad_norm": 7.60346219100427, "learning_rate": 9.617512530668063e-07, "loss": 0.4552, "num_input_tokens_seen": 15777792, "step": 3852 }, { "epoch": 2.8710879284649775, "grad_norm": 9.923216776479215, "learning_rate": 9.611818131494374e-07, "loss": 0.3054, "num_input_tokens_seen": 15781888, "step": 3853 }, { "epoch": 2.871833084947839, "grad_norm": 10.219925290795377, "learning_rate": 9.606124365430777e-07, "loss": 0.2579, "num_input_tokens_seen": 15785984, "step": 3854 }, { "epoch": 2.8725782414307, "grad_norm": 9.141250939444607, "learning_rate": 9.60043123372538e-07, "loss": 0.4851, "num_input_tokens_seen": 15790080, "step": 3855 }, { "epoch": 2.8733233979135617, "grad_norm": 9.678436627222208, "learning_rate": 9.59473873762616e-07, "loss": 0.3642, "num_input_tokens_seen": 15794176, "step": 3856 }, { "epoch": 2.8740685543964233, "grad_norm": 8.604058357836756, "learning_rate": 9.589046878380957e-07, "loss": 0.3599, "num_input_tokens_seen": 15798272, "step": 3857 }, { "epoch": 2.874813710879285, "grad_norm": 9.955185608533853, "learning_rate": 9.583355657237462e-07, "loss": 0.356, "num_input_tokens_seen": 15802368, "step": 3858 }, { "epoch": 2.875558867362146, "grad_norm": 9.571214019681308, "learning_rate": 9.577665075443234e-07, "loss": 0.4969, "num_input_tokens_seen": 15806464, "step": 3859 }, { "epoch": 2.8763040238450075, "grad_norm": 12.150381068115399, "learning_rate": 9.57197513424569e-07, "loss": 0.5609, "num_input_tokens_seen": 15810560, "step": 3860 }, { "epoch": 2.8770491803278686, "grad_norm": 10.116715808688632, "learning_rate": 9.566285834892108e-07, "loss": 0.5185, "num_input_tokens_seen": 15814656, "step": 3861 }, { "epoch": 2.87779433681073, "grad_norm": 9.549960343989381, "learning_rate": 9.560597178629615e-07, "loss": 0.4096, "num_input_tokens_seen": 15818752, "step": 3862 }, { "epoch": 2.8785394932935917, "grad_norm": 9.944761859609057, "learning_rate": 9.55490916670521e-07, "loss": 0.439, "num_input_tokens_seen": 15822848, "step": 3863 }, { "epoch": 2.8792846497764533, "grad_norm": 10.714165334936839, "learning_rate": 9.54922180036575e-07, "loss": 0.3444, "num_input_tokens_seen": 15826944, "step": 3864 }, { "epoch": 2.8800298062593144, "grad_norm": 9.477970491989394, "learning_rate": 9.543535080857937e-07, "loss": 0.4767, "num_input_tokens_seen": 15831040, "step": 3865 }, { "epoch": 2.880774962742176, "grad_norm": 9.57907740810019, "learning_rate": 9.537849009428346e-07, "loss": 0.4757, "num_input_tokens_seen": 15835136, "step": 3866 }, { "epoch": 2.881520119225037, "grad_norm": 11.90289000770306, "learning_rate": 9.532163587323403e-07, "loss": 0.313, "num_input_tokens_seen": 15839232, "step": 3867 }, { "epoch": 2.8822652757078986, "grad_norm": 9.403769833274374, "learning_rate": 9.526478815789394e-07, "loss": 0.328, "num_input_tokens_seen": 15843328, "step": 3868 }, { "epoch": 2.88301043219076, "grad_norm": 7.264841687548105, "learning_rate": 9.520794696072462e-07, "loss": 0.1241, "num_input_tokens_seen": 15847424, "step": 3869 }, { "epoch": 2.8837555886736217, "grad_norm": 9.706053539333196, "learning_rate": 9.515111229418605e-07, "loss": 0.1963, "num_input_tokens_seen": 15851520, "step": 3870 }, { "epoch": 2.884500745156483, "grad_norm": 10.389147697883331, "learning_rate": 9.509428417073685e-07, "loss": 0.4773, "num_input_tokens_seen": 15855616, "step": 3871 }, { "epoch": 2.8852459016393444, "grad_norm": 8.90816856536227, "learning_rate": 9.503746260283407e-07, "loss": 0.4135, "num_input_tokens_seen": 15859712, "step": 3872 }, { "epoch": 2.8859910581222055, "grad_norm": 7.092478867415193, "learning_rate": 9.498064760293343e-07, "loss": 0.1512, "num_input_tokens_seen": 15863808, "step": 3873 }, { "epoch": 2.886736214605067, "grad_norm": 9.537129407302391, "learning_rate": 9.492383918348927e-07, "loss": 0.5241, "num_input_tokens_seen": 15867904, "step": 3874 }, { "epoch": 2.8874813710879286, "grad_norm": 8.585198533119264, "learning_rate": 9.486703735695424e-07, "loss": 0.2842, "num_input_tokens_seen": 15872000, "step": 3875 }, { "epoch": 2.88822652757079, "grad_norm": 7.631277688915567, "learning_rate": 9.481024213577981e-07, "loss": 0.1196, "num_input_tokens_seen": 15876096, "step": 3876 }, { "epoch": 2.8889716840536512, "grad_norm": 9.776676016712823, "learning_rate": 9.475345353241587e-07, "loss": 0.4437, "num_input_tokens_seen": 15880192, "step": 3877 }, { "epoch": 2.889716840536513, "grad_norm": 9.777820291187778, "learning_rate": 9.46966715593109e-07, "loss": 0.3483, "num_input_tokens_seen": 15884288, "step": 3878 }, { "epoch": 2.890461997019374, "grad_norm": 9.902261556462541, "learning_rate": 9.463989622891191e-07, "loss": 0.5052, "num_input_tokens_seen": 15888384, "step": 3879 }, { "epoch": 2.8912071535022354, "grad_norm": 9.906635863388518, "learning_rate": 9.458312755366446e-07, "loss": 0.2268, "num_input_tokens_seen": 15892480, "step": 3880 }, { "epoch": 2.891952309985097, "grad_norm": 8.895132963384109, "learning_rate": 9.452636554601266e-07, "loss": 0.2962, "num_input_tokens_seen": 15896576, "step": 3881 }, { "epoch": 2.892697466467958, "grad_norm": 9.695453802173613, "learning_rate": 9.446961021839912e-07, "loss": 0.433, "num_input_tokens_seen": 15900672, "step": 3882 }, { "epoch": 2.8934426229508197, "grad_norm": 9.324123716186795, "learning_rate": 9.441286158326501e-07, "loss": 0.5564, "num_input_tokens_seen": 15904768, "step": 3883 }, { "epoch": 2.894187779433681, "grad_norm": 8.21310415563538, "learning_rate": 9.435611965305007e-07, "loss": 0.2152, "num_input_tokens_seen": 15908864, "step": 3884 }, { "epoch": 2.8949329359165423, "grad_norm": 13.426867377792462, "learning_rate": 9.429938444019252e-07, "loss": 0.3137, "num_input_tokens_seen": 15912960, "step": 3885 }, { "epoch": 2.895678092399404, "grad_norm": 9.711450520528762, "learning_rate": 9.424265595712912e-07, "loss": 0.447, "num_input_tokens_seen": 15917056, "step": 3886 }, { "epoch": 2.8964232488822654, "grad_norm": 8.898918325825473, "learning_rate": 9.418593421629514e-07, "loss": 0.4268, "num_input_tokens_seen": 15921152, "step": 3887 }, { "epoch": 2.8971684053651265, "grad_norm": 10.263934119920107, "learning_rate": 9.412921923012449e-07, "loss": 0.4339, "num_input_tokens_seen": 15925248, "step": 3888 }, { "epoch": 2.897913561847988, "grad_norm": 8.78321913250721, "learning_rate": 9.407251101104936e-07, "loss": 0.5682, "num_input_tokens_seen": 15929344, "step": 3889 }, { "epoch": 2.898658718330849, "grad_norm": 14.196525453023767, "learning_rate": 9.401580957150065e-07, "loss": 0.3352, "num_input_tokens_seen": 15933440, "step": 3890 }, { "epoch": 2.8994038748137108, "grad_norm": 14.060003103443439, "learning_rate": 9.395911492390777e-07, "loss": 0.563, "num_input_tokens_seen": 15937536, "step": 3891 }, { "epoch": 2.9001490312965723, "grad_norm": 10.189106612744807, "learning_rate": 9.390242708069852e-07, "loss": 0.512, "num_input_tokens_seen": 15941632, "step": 3892 }, { "epoch": 2.900894187779434, "grad_norm": 7.9182910801159245, "learning_rate": 9.384574605429931e-07, "loss": 0.2136, "num_input_tokens_seen": 15945728, "step": 3893 }, { "epoch": 2.901639344262295, "grad_norm": 9.391904120315925, "learning_rate": 9.378907185713501e-07, "loss": 0.3915, "num_input_tokens_seen": 15949824, "step": 3894 }, { "epoch": 2.9023845007451565, "grad_norm": 10.567206088025074, "learning_rate": 9.373240450162904e-07, "loss": 0.324, "num_input_tokens_seen": 15953920, "step": 3895 }, { "epoch": 2.9031296572280176, "grad_norm": 10.071619860760268, "learning_rate": 9.367574400020326e-07, "loss": 0.4175, "num_input_tokens_seen": 15958016, "step": 3896 }, { "epoch": 2.903874813710879, "grad_norm": 10.38126324250699, "learning_rate": 9.36190903652781e-07, "loss": 0.3929, "num_input_tokens_seen": 15962112, "step": 3897 }, { "epoch": 2.9046199701937407, "grad_norm": 12.658668616993713, "learning_rate": 9.356244360927242e-07, "loss": 0.559, "num_input_tokens_seen": 15966208, "step": 3898 }, { "epoch": 2.9053651266766023, "grad_norm": 10.927795997637586, "learning_rate": 9.350580374460356e-07, "loss": 0.5031, "num_input_tokens_seen": 15970304, "step": 3899 }, { "epoch": 2.9061102831594634, "grad_norm": 7.870470688220351, "learning_rate": 9.344917078368742e-07, "loss": 0.2498, "num_input_tokens_seen": 15974400, "step": 3900 }, { "epoch": 2.906855439642325, "grad_norm": 9.07151405691147, "learning_rate": 9.339254473893837e-07, "loss": 0.3397, "num_input_tokens_seen": 15978496, "step": 3901 }, { "epoch": 2.907600596125186, "grad_norm": 10.13419771260807, "learning_rate": 9.333592562276916e-07, "loss": 0.4586, "num_input_tokens_seen": 15982592, "step": 3902 }, { "epoch": 2.9083457526080476, "grad_norm": 10.750218659496314, "learning_rate": 9.327931344759117e-07, "loss": 0.2969, "num_input_tokens_seen": 15986688, "step": 3903 }, { "epoch": 2.909090909090909, "grad_norm": 11.130865324041494, "learning_rate": 9.322270822581417e-07, "loss": 0.4999, "num_input_tokens_seen": 15990784, "step": 3904 }, { "epoch": 2.9098360655737707, "grad_norm": 8.852139498107093, "learning_rate": 9.316610996984643e-07, "loss": 0.3503, "num_input_tokens_seen": 15994880, "step": 3905 }, { "epoch": 2.910581222056632, "grad_norm": 9.964588501015175, "learning_rate": 9.310951869209469e-07, "loss": 0.2671, "num_input_tokens_seen": 15998976, "step": 3906 }, { "epoch": 2.9113263785394934, "grad_norm": 8.882730489960476, "learning_rate": 9.30529344049642e-07, "loss": 0.3342, "num_input_tokens_seen": 16003072, "step": 3907 }, { "epoch": 2.9120715350223545, "grad_norm": 8.43716883831679, "learning_rate": 9.299635712085863e-07, "loss": 0.3142, "num_input_tokens_seen": 16007168, "step": 3908 }, { "epoch": 2.912816691505216, "grad_norm": 10.29401189091128, "learning_rate": 9.293978685218007e-07, "loss": 0.2498, "num_input_tokens_seen": 16011264, "step": 3909 }, { "epoch": 2.9135618479880776, "grad_norm": 7.66831632025198, "learning_rate": 9.288322361132917e-07, "loss": 0.5198, "num_input_tokens_seen": 16015360, "step": 3910 }, { "epoch": 2.914307004470939, "grad_norm": 9.94207817015539, "learning_rate": 9.282666741070498e-07, "loss": 0.3844, "num_input_tokens_seen": 16019456, "step": 3911 }, { "epoch": 2.9150521609538003, "grad_norm": 9.170045688061432, "learning_rate": 9.277011826270503e-07, "loss": 0.269, "num_input_tokens_seen": 16023552, "step": 3912 }, { "epoch": 2.915797317436662, "grad_norm": 9.858579777086167, "learning_rate": 9.27135761797253e-07, "loss": 0.3105, "num_input_tokens_seen": 16027648, "step": 3913 }, { "epoch": 2.916542473919523, "grad_norm": 10.870758997571418, "learning_rate": 9.265704117416022e-07, "loss": 0.4605, "num_input_tokens_seen": 16031744, "step": 3914 }, { "epoch": 2.9172876304023845, "grad_norm": 9.815760466276648, "learning_rate": 9.260051325840272e-07, "loss": 0.4565, "num_input_tokens_seen": 16035840, "step": 3915 }, { "epoch": 2.918032786885246, "grad_norm": 9.550303837389027, "learning_rate": 9.254399244484403e-07, "loss": 0.5665, "num_input_tokens_seen": 16039936, "step": 3916 }, { "epoch": 2.918777943368107, "grad_norm": 7.1922174441591515, "learning_rate": 9.248747874587391e-07, "loss": 0.5082, "num_input_tokens_seen": 16044032, "step": 3917 }, { "epoch": 2.9195230998509687, "grad_norm": 9.045933641618513, "learning_rate": 9.243097217388064e-07, "loss": 0.4132, "num_input_tokens_seen": 16048128, "step": 3918 }, { "epoch": 2.9202682563338302, "grad_norm": 10.45291828526171, "learning_rate": 9.237447274125086e-07, "loss": 0.4195, "num_input_tokens_seen": 16052224, "step": 3919 }, { "epoch": 2.9210134128166914, "grad_norm": 8.787646202447695, "learning_rate": 9.231798046036957e-07, "loss": 0.3878, "num_input_tokens_seen": 16056320, "step": 3920 }, { "epoch": 2.921758569299553, "grad_norm": 8.819566061427214, "learning_rate": 9.226149534362036e-07, "loss": 0.2668, "num_input_tokens_seen": 16060416, "step": 3921 }, { "epoch": 2.9225037257824145, "grad_norm": 10.199821190810436, "learning_rate": 9.220501740338514e-07, "loss": 0.3606, "num_input_tokens_seen": 16064512, "step": 3922 }, { "epoch": 2.9232488822652756, "grad_norm": 10.236834267175409, "learning_rate": 9.214854665204429e-07, "loss": 0.5495, "num_input_tokens_seen": 16068608, "step": 3923 }, { "epoch": 2.923994038748137, "grad_norm": 7.791999719391614, "learning_rate": 9.209208310197656e-07, "loss": 0.5886, "num_input_tokens_seen": 16072704, "step": 3924 }, { "epoch": 2.9247391952309982, "grad_norm": 9.249458224148325, "learning_rate": 9.203562676555923e-07, "loss": 0.5362, "num_input_tokens_seen": 16076800, "step": 3925 }, { "epoch": 2.92548435171386, "grad_norm": 9.238474721126748, "learning_rate": 9.197917765516792e-07, "loss": 0.4718, "num_input_tokens_seen": 16080896, "step": 3926 }, { "epoch": 2.9262295081967213, "grad_norm": 10.41060330491192, "learning_rate": 9.192273578317662e-07, "loss": 0.2384, "num_input_tokens_seen": 16084992, "step": 3927 }, { "epoch": 2.926974664679583, "grad_norm": 8.740963742399856, "learning_rate": 9.186630116195787e-07, "loss": 0.501, "num_input_tokens_seen": 16089088, "step": 3928 }, { "epoch": 2.927719821162444, "grad_norm": 9.067757551176195, "learning_rate": 9.180987380388249e-07, "loss": 0.4351, "num_input_tokens_seen": 16093184, "step": 3929 }, { "epoch": 2.9284649776453056, "grad_norm": 10.699817427838992, "learning_rate": 9.175345372131975e-07, "loss": 0.4049, "num_input_tokens_seen": 16097280, "step": 3930 }, { "epoch": 2.9292101341281667, "grad_norm": 9.60275187691339, "learning_rate": 9.169704092663736e-07, "loss": 0.262, "num_input_tokens_seen": 16101376, "step": 3931 }, { "epoch": 2.929955290611028, "grad_norm": 9.823917823178126, "learning_rate": 9.164063543220139e-07, "loss": 0.44, "num_input_tokens_seen": 16105472, "step": 3932 }, { "epoch": 2.9307004470938898, "grad_norm": 7.928604840367778, "learning_rate": 9.158423725037635e-07, "loss": 0.3806, "num_input_tokens_seen": 16109568, "step": 3933 }, { "epoch": 2.9314456035767513, "grad_norm": 8.46368741983292, "learning_rate": 9.152784639352512e-07, "loss": 0.4524, "num_input_tokens_seen": 16113664, "step": 3934 }, { "epoch": 2.9321907600596124, "grad_norm": 8.062362996194148, "learning_rate": 9.147146287400896e-07, "loss": 0.2032, "num_input_tokens_seen": 16117760, "step": 3935 }, { "epoch": 2.932935916542474, "grad_norm": 7.584372145171778, "learning_rate": 9.141508670418759e-07, "loss": 0.1923, "num_input_tokens_seen": 16121856, "step": 3936 }, { "epoch": 2.933681073025335, "grad_norm": 8.45451770866937, "learning_rate": 9.135871789641901e-07, "loss": 0.37, "num_input_tokens_seen": 16125952, "step": 3937 }, { "epoch": 2.9344262295081966, "grad_norm": 8.220884924976565, "learning_rate": 9.130235646305968e-07, "loss": 0.3849, "num_input_tokens_seen": 16130048, "step": 3938 }, { "epoch": 2.935171385991058, "grad_norm": 8.995176133124323, "learning_rate": 9.124600241646442e-07, "loss": 0.4036, "num_input_tokens_seen": 16134144, "step": 3939 }, { "epoch": 2.9359165424739198, "grad_norm": 9.158039505048727, "learning_rate": 9.118965576898647e-07, "loss": 0.2986, "num_input_tokens_seen": 16138240, "step": 3940 }, { "epoch": 2.936661698956781, "grad_norm": 7.453167973743124, "learning_rate": 9.113331653297742e-07, "loss": 0.2816, "num_input_tokens_seen": 16142336, "step": 3941 }, { "epoch": 2.9374068554396424, "grad_norm": 12.815099052883292, "learning_rate": 9.107698472078724e-07, "loss": 0.3673, "num_input_tokens_seen": 16146432, "step": 3942 }, { "epoch": 2.9381520119225035, "grad_norm": 11.562855139277648, "learning_rate": 9.10206603447642e-07, "loss": 0.1599, "num_input_tokens_seen": 16150528, "step": 3943 }, { "epoch": 2.938897168405365, "grad_norm": 9.831065027732317, "learning_rate": 9.096434341725505e-07, "loss": 0.592, "num_input_tokens_seen": 16154624, "step": 3944 }, { "epoch": 2.9396423248882266, "grad_norm": 9.163773264387864, "learning_rate": 9.090803395060486e-07, "loss": 0.3724, "num_input_tokens_seen": 16158720, "step": 3945 }, { "epoch": 2.940387481371088, "grad_norm": 10.522264854812455, "learning_rate": 9.085173195715707e-07, "loss": 0.5156, "num_input_tokens_seen": 16162816, "step": 3946 }, { "epoch": 2.9411326378539493, "grad_norm": 8.672460419624493, "learning_rate": 9.079543744925348e-07, "loss": 0.1285, "num_input_tokens_seen": 16166912, "step": 3947 }, { "epoch": 2.941877794336811, "grad_norm": 7.971854051895903, "learning_rate": 9.073915043923424e-07, "loss": 0.4928, "num_input_tokens_seen": 16171008, "step": 3948 }, { "epoch": 2.942622950819672, "grad_norm": 10.471293797042275, "learning_rate": 9.068287093943785e-07, "loss": 0.4775, "num_input_tokens_seen": 16175104, "step": 3949 }, { "epoch": 2.9433681073025335, "grad_norm": 10.01266832245155, "learning_rate": 9.062659896220122e-07, "loss": 0.4667, "num_input_tokens_seen": 16179200, "step": 3950 }, { "epoch": 2.944113263785395, "grad_norm": 8.762609286745064, "learning_rate": 9.057033451985953e-07, "loss": 0.3463, "num_input_tokens_seen": 16183296, "step": 3951 }, { "epoch": 2.944858420268256, "grad_norm": 11.013835823773258, "learning_rate": 9.051407762474635e-07, "loss": 0.4096, "num_input_tokens_seen": 16187392, "step": 3952 }, { "epoch": 2.9456035767511177, "grad_norm": 8.752430580528973, "learning_rate": 9.045782828919366e-07, "loss": 0.4773, "num_input_tokens_seen": 16191488, "step": 3953 }, { "epoch": 2.9463487332339793, "grad_norm": 8.727329674003833, "learning_rate": 9.040158652553161e-07, "loss": 0.543, "num_input_tokens_seen": 16195584, "step": 3954 }, { "epoch": 2.9470938897168404, "grad_norm": 9.69840670466995, "learning_rate": 9.034535234608889e-07, "loss": 0.5208, "num_input_tokens_seen": 16199680, "step": 3955 }, { "epoch": 2.947839046199702, "grad_norm": 8.26724975615755, "learning_rate": 9.028912576319239e-07, "loss": 0.5226, "num_input_tokens_seen": 16203776, "step": 3956 }, { "epoch": 2.9485842026825635, "grad_norm": 14.411612841534081, "learning_rate": 9.023290678916735e-07, "loss": 0.2735, "num_input_tokens_seen": 16207872, "step": 3957 }, { "epoch": 2.9493293591654246, "grad_norm": 10.631071572537442, "learning_rate": 9.01766954363374e-07, "loss": 0.4931, "num_input_tokens_seen": 16211968, "step": 3958 }, { "epoch": 2.950074515648286, "grad_norm": 9.510516405993663, "learning_rate": 9.012049171702447e-07, "loss": 0.5592, "num_input_tokens_seen": 16216064, "step": 3959 }, { "epoch": 2.9508196721311473, "grad_norm": 8.86495726000363, "learning_rate": 9.006429564354883e-07, "loss": 0.4669, "num_input_tokens_seen": 16220160, "step": 3960 }, { "epoch": 2.951564828614009, "grad_norm": 11.424009440381276, "learning_rate": 9.000810722822905e-07, "loss": 0.4667, "num_input_tokens_seen": 16224256, "step": 3961 }, { "epoch": 2.9523099850968704, "grad_norm": 8.788551952935725, "learning_rate": 8.995192648338202e-07, "loss": 0.5006, "num_input_tokens_seen": 16228352, "step": 3962 }, { "epoch": 2.953055141579732, "grad_norm": 7.354645963700075, "learning_rate": 8.9895753421323e-07, "loss": 0.3576, "num_input_tokens_seen": 16232448, "step": 3963 }, { "epoch": 2.953800298062593, "grad_norm": 10.976589072121781, "learning_rate": 8.983958805436546e-07, "loss": 0.2671, "num_input_tokens_seen": 16236544, "step": 3964 }, { "epoch": 2.9545454545454546, "grad_norm": 8.750819634483433, "learning_rate": 8.978343039482129e-07, "loss": 0.2295, "num_input_tokens_seen": 16240640, "step": 3965 }, { "epoch": 2.9552906110283157, "grad_norm": 10.350266179499219, "learning_rate": 8.972728045500064e-07, "loss": 0.6312, "num_input_tokens_seen": 16244736, "step": 3966 }, { "epoch": 2.9560357675111772, "grad_norm": 9.556634318653856, "learning_rate": 8.967113824721202e-07, "loss": 0.3439, "num_input_tokens_seen": 16248832, "step": 3967 }, { "epoch": 2.956780923994039, "grad_norm": 10.461053076271753, "learning_rate": 8.961500378376213e-07, "loss": 0.354, "num_input_tokens_seen": 16252928, "step": 3968 }, { "epoch": 2.9575260804769004, "grad_norm": 9.768753655588895, "learning_rate": 8.955887707695615e-07, "loss": 0.2701, "num_input_tokens_seen": 16257024, "step": 3969 }, { "epoch": 2.9582712369597615, "grad_norm": 8.593743552114502, "learning_rate": 8.950275813909734e-07, "loss": 0.4754, "num_input_tokens_seen": 16261120, "step": 3970 }, { "epoch": 2.959016393442623, "grad_norm": 7.325743174866252, "learning_rate": 8.944664698248739e-07, "loss": 0.1761, "num_input_tokens_seen": 16265216, "step": 3971 }, { "epoch": 2.959761549925484, "grad_norm": 9.063337157523904, "learning_rate": 8.939054361942633e-07, "loss": 0.407, "num_input_tokens_seen": 16269312, "step": 3972 }, { "epoch": 2.9605067064083457, "grad_norm": 9.453208341763306, "learning_rate": 8.93344480622124e-07, "loss": 0.294, "num_input_tokens_seen": 16273408, "step": 3973 }, { "epoch": 2.9612518628912072, "grad_norm": 11.204718859019122, "learning_rate": 8.927836032314208e-07, "loss": 0.1487, "num_input_tokens_seen": 16277504, "step": 3974 }, { "epoch": 2.961997019374069, "grad_norm": 9.830381181996325, "learning_rate": 8.922228041451028e-07, "loss": 0.2354, "num_input_tokens_seen": 16281600, "step": 3975 }, { "epoch": 2.96274217585693, "grad_norm": 9.627115987194996, "learning_rate": 8.916620834861007e-07, "loss": 0.3393, "num_input_tokens_seen": 16285696, "step": 3976 }, { "epoch": 2.9634873323397914, "grad_norm": 8.150544832667817, "learning_rate": 8.911014413773288e-07, "loss": 0.4763, "num_input_tokens_seen": 16289792, "step": 3977 }, { "epoch": 2.9642324888226526, "grad_norm": 7.681891240541458, "learning_rate": 8.905408779416835e-07, "loss": 0.3827, "num_input_tokens_seen": 16293888, "step": 3978 }, { "epoch": 2.964977645305514, "grad_norm": 7.745609992905947, "learning_rate": 8.899803933020448e-07, "loss": 0.2753, "num_input_tokens_seen": 16297984, "step": 3979 }, { "epoch": 2.9657228017883757, "grad_norm": 10.687737472073252, "learning_rate": 8.894199875812748e-07, "loss": 0.3226, "num_input_tokens_seen": 16302080, "step": 3980 }, { "epoch": 2.966467958271237, "grad_norm": 7.812167208692196, "learning_rate": 8.888596609022181e-07, "loss": 0.3183, "num_input_tokens_seen": 16306176, "step": 3981 }, { "epoch": 2.9672131147540983, "grad_norm": 9.750401487552194, "learning_rate": 8.882994133877027e-07, "loss": 0.1772, "num_input_tokens_seen": 16310272, "step": 3982 }, { "epoch": 2.96795827123696, "grad_norm": 10.702500020961594, "learning_rate": 8.877392451605385e-07, "loss": 0.4759, "num_input_tokens_seen": 16314368, "step": 3983 }, { "epoch": 2.968703427719821, "grad_norm": 12.872054575970926, "learning_rate": 8.871791563435185e-07, "loss": 0.4347, "num_input_tokens_seen": 16318464, "step": 3984 }, { "epoch": 2.9694485842026825, "grad_norm": 10.21980685653229, "learning_rate": 8.866191470594182e-07, "loss": 0.5091, "num_input_tokens_seen": 16322560, "step": 3985 }, { "epoch": 2.970193740685544, "grad_norm": 8.421686785917204, "learning_rate": 8.860592174309955e-07, "loss": 0.3681, "num_input_tokens_seen": 16326656, "step": 3986 }, { "epoch": 2.970938897168405, "grad_norm": 11.859732430333565, "learning_rate": 8.854993675809912e-07, "loss": 0.314, "num_input_tokens_seen": 16330752, "step": 3987 }, { "epoch": 2.9716840536512668, "grad_norm": 10.096590700345521, "learning_rate": 8.849395976321281e-07, "loss": 0.3608, "num_input_tokens_seen": 16334848, "step": 3988 }, { "epoch": 2.9724292101341283, "grad_norm": 8.709550717685493, "learning_rate": 8.843799077071122e-07, "loss": 0.3462, "num_input_tokens_seen": 16338944, "step": 3989 }, { "epoch": 2.9731743666169894, "grad_norm": 12.876725994714754, "learning_rate": 8.838202979286316e-07, "loss": 0.3561, "num_input_tokens_seen": 16343040, "step": 3990 }, { "epoch": 2.973919523099851, "grad_norm": 8.802066877819117, "learning_rate": 8.83260768419356e-07, "loss": 0.1837, "num_input_tokens_seen": 16347136, "step": 3991 }, { "epoch": 2.9746646795827125, "grad_norm": 8.21710120115154, "learning_rate": 8.827013193019387e-07, "loss": 0.4759, "num_input_tokens_seen": 16351232, "step": 3992 }, { "epoch": 2.9754098360655736, "grad_norm": 10.150197472003846, "learning_rate": 8.821419506990152e-07, "loss": 0.5475, "num_input_tokens_seen": 16355328, "step": 3993 }, { "epoch": 2.976154992548435, "grad_norm": 9.386455526982687, "learning_rate": 8.815826627332026e-07, "loss": 0.3347, "num_input_tokens_seen": 16359424, "step": 3994 }, { "epoch": 2.9769001490312967, "grad_norm": 8.608506476179757, "learning_rate": 8.810234555271012e-07, "loss": 0.5609, "num_input_tokens_seen": 16363520, "step": 3995 }, { "epoch": 2.977645305514158, "grad_norm": 9.11439224872478, "learning_rate": 8.804643292032936e-07, "loss": 0.4451, "num_input_tokens_seen": 16367616, "step": 3996 }, { "epoch": 2.9783904619970194, "grad_norm": 9.575633899041913, "learning_rate": 8.799052838843432e-07, "loss": 0.4103, "num_input_tokens_seen": 16371712, "step": 3997 }, { "epoch": 2.979135618479881, "grad_norm": 11.009754795230117, "learning_rate": 8.793463196927973e-07, "loss": 0.3953, "num_input_tokens_seen": 16375808, "step": 3998 }, { "epoch": 2.979880774962742, "grad_norm": 8.788277722163702, "learning_rate": 8.787874367511847e-07, "loss": 0.4151, "num_input_tokens_seen": 16379904, "step": 3999 }, { "epoch": 2.9806259314456036, "grad_norm": 9.496983281528886, "learning_rate": 8.782286351820171e-07, "loss": 0.3057, "num_input_tokens_seen": 16384000, "step": 4000 }, { "epoch": 2.9813710879284647, "grad_norm": 9.79398514995617, "learning_rate": 8.77669915107787e-07, "loss": 0.3105, "num_input_tokens_seen": 16388096, "step": 4001 }, { "epoch": 2.9821162444113263, "grad_norm": 9.703997739891545, "learning_rate": 8.771112766509701e-07, "loss": 0.5657, "num_input_tokens_seen": 16392192, "step": 4002 }, { "epoch": 2.982861400894188, "grad_norm": 11.362198449205106, "learning_rate": 8.765527199340242e-07, "loss": 0.4172, "num_input_tokens_seen": 16396288, "step": 4003 }, { "epoch": 2.9836065573770494, "grad_norm": 8.366476326999358, "learning_rate": 8.759942450793886e-07, "loss": 0.4203, "num_input_tokens_seen": 16400384, "step": 4004 }, { "epoch": 2.9843517138599105, "grad_norm": 9.64065261646101, "learning_rate": 8.754358522094853e-07, "loss": 0.2512, "num_input_tokens_seen": 16404480, "step": 4005 }, { "epoch": 2.985096870342772, "grad_norm": 10.649582714519433, "learning_rate": 8.748775414467176e-07, "loss": 0.2672, "num_input_tokens_seen": 16408576, "step": 4006 }, { "epoch": 2.985842026825633, "grad_norm": 9.374944244284203, "learning_rate": 8.743193129134716e-07, "loss": 0.5343, "num_input_tokens_seen": 16412672, "step": 4007 }, { "epoch": 2.9865871833084947, "grad_norm": 8.797111480534639, "learning_rate": 8.73761166732115e-07, "loss": 0.2833, "num_input_tokens_seen": 16416768, "step": 4008 }, { "epoch": 2.9873323397913563, "grad_norm": 10.167193762251989, "learning_rate": 8.732031030249971e-07, "loss": 0.4046, "num_input_tokens_seen": 16420864, "step": 4009 }, { "epoch": 2.988077496274218, "grad_norm": 9.074322889963007, "learning_rate": 8.726451219144496e-07, "loss": 0.4044, "num_input_tokens_seen": 16424960, "step": 4010 }, { "epoch": 2.988822652757079, "grad_norm": 9.19737844749607, "learning_rate": 8.720872235227859e-07, "loss": 0.402, "num_input_tokens_seen": 16429056, "step": 4011 }, { "epoch": 2.9895678092399405, "grad_norm": 9.058668038147582, "learning_rate": 8.715294079723014e-07, "loss": 0.6353, "num_input_tokens_seen": 16433152, "step": 4012 }, { "epoch": 2.9903129657228016, "grad_norm": 9.677416746012955, "learning_rate": 8.709716753852729e-07, "loss": 0.362, "num_input_tokens_seen": 16437248, "step": 4013 }, { "epoch": 2.991058122205663, "grad_norm": 10.023211109268242, "learning_rate": 8.704140258839598e-07, "loss": 0.5468, "num_input_tokens_seen": 16441344, "step": 4014 }, { "epoch": 2.9918032786885247, "grad_norm": 9.139502692667545, "learning_rate": 8.698564595906029e-07, "loss": 0.5285, "num_input_tokens_seen": 16445440, "step": 4015 }, { "epoch": 2.9925484351713862, "grad_norm": 9.672750829649027, "learning_rate": 8.692989766274242e-07, "loss": 0.4048, "num_input_tokens_seen": 16449536, "step": 4016 }, { "epoch": 2.9932935916542474, "grad_norm": 11.135317763395065, "learning_rate": 8.687415771166284e-07, "loss": 0.3305, "num_input_tokens_seen": 16453632, "step": 4017 }, { "epoch": 2.994038748137109, "grad_norm": 10.139867383885102, "learning_rate": 8.681842611804016e-07, "loss": 0.5371, "num_input_tokens_seen": 16457728, "step": 4018 }, { "epoch": 2.99478390461997, "grad_norm": 8.37727965604506, "learning_rate": 8.676270289409108e-07, "loss": 0.3375, "num_input_tokens_seen": 16461824, "step": 4019 }, { "epoch": 2.9955290611028316, "grad_norm": 8.975055349816344, "learning_rate": 8.670698805203057e-07, "loss": 0.6045, "num_input_tokens_seen": 16465920, "step": 4020 }, { "epoch": 2.996274217585693, "grad_norm": 8.71110908575398, "learning_rate": 8.665128160407173e-07, "loss": 0.3041, "num_input_tokens_seen": 16470016, "step": 4021 }, { "epoch": 2.9970193740685542, "grad_norm": 8.558561930791601, "learning_rate": 8.659558356242583e-07, "loss": 0.2505, "num_input_tokens_seen": 16474112, "step": 4022 }, { "epoch": 2.997764530551416, "grad_norm": 8.83541289118216, "learning_rate": 8.653989393930221e-07, "loss": 0.178, "num_input_tokens_seen": 16478208, "step": 4023 }, { "epoch": 2.9985096870342773, "grad_norm": 9.427640122001934, "learning_rate": 8.648421274690845e-07, "loss": 0.2724, "num_input_tokens_seen": 16482304, "step": 4024 }, { "epoch": 2.9992548435171384, "grad_norm": 8.754968084657122, "learning_rate": 8.642853999745029e-07, "loss": 0.4506, "num_input_tokens_seen": 16486400, "step": 4025 }, { "epoch": 3.0, "grad_norm": 10.014070435909794, "learning_rate": 8.637287570313159e-07, "loss": 0.2749, "num_input_tokens_seen": 16490496, "step": 4026 }, { "epoch": 3.0007451564828616, "grad_norm": 5.593861257957256, "learning_rate": 8.631721987615435e-07, "loss": 0.135, "num_input_tokens_seen": 16494592, "step": 4027 }, { "epoch": 3.0014903129657227, "grad_norm": 6.443453236275027, "learning_rate": 8.626157252871878e-07, "loss": 0.2175, "num_input_tokens_seen": 16498688, "step": 4028 }, { "epoch": 3.002235469448584, "grad_norm": 5.676730564195854, "learning_rate": 8.620593367302308e-07, "loss": 0.1457, "num_input_tokens_seen": 16502784, "step": 4029 }, { "epoch": 3.0029806259314458, "grad_norm": 7.7406647939898185, "learning_rate": 8.615030332126374e-07, "loss": 0.2917, "num_input_tokens_seen": 16506880, "step": 4030 }, { "epoch": 3.003725782414307, "grad_norm": 6.498470012758513, "learning_rate": 8.609468148563532e-07, "loss": 0.1429, "num_input_tokens_seen": 16510976, "step": 4031 }, { "epoch": 3.0044709388971684, "grad_norm": 7.517611347238377, "learning_rate": 8.603906817833054e-07, "loss": 0.3231, "num_input_tokens_seen": 16515072, "step": 4032 }, { "epoch": 3.00521609538003, "grad_norm": 6.392141222196002, "learning_rate": 8.598346341154021e-07, "loss": 0.1416, "num_input_tokens_seen": 16519168, "step": 4033 }, { "epoch": 3.005961251862891, "grad_norm": 5.828418425171157, "learning_rate": 8.59278671974533e-07, "loss": 0.1296, "num_input_tokens_seen": 16523264, "step": 4034 }, { "epoch": 3.0067064083457526, "grad_norm": 6.574498433671757, "learning_rate": 8.587227954825694e-07, "loss": 0.1979, "num_input_tokens_seen": 16527360, "step": 4035 }, { "epoch": 3.007451564828614, "grad_norm": 6.767060806659719, "learning_rate": 8.58167004761363e-07, "loss": 0.1263, "num_input_tokens_seen": 16531456, "step": 4036 }, { "epoch": 3.0081967213114753, "grad_norm": 8.056532892724144, "learning_rate": 8.576112999327467e-07, "loss": 0.0475, "num_input_tokens_seen": 16535552, "step": 4037 }, { "epoch": 3.008941877794337, "grad_norm": 6.521019587787807, "learning_rate": 8.570556811185357e-07, "loss": 0.2469, "num_input_tokens_seen": 16539648, "step": 4038 }, { "epoch": 3.0096870342771984, "grad_norm": 8.549581272622204, "learning_rate": 8.56500148440525e-07, "loss": 0.3672, "num_input_tokens_seen": 16543744, "step": 4039 }, { "epoch": 3.0104321907600595, "grad_norm": 9.165724135179204, "learning_rate": 8.559447020204917e-07, "loss": 0.133, "num_input_tokens_seen": 16547840, "step": 4040 }, { "epoch": 3.011177347242921, "grad_norm": 8.4960880958403, "learning_rate": 8.553893419801933e-07, "loss": 0.146, "num_input_tokens_seen": 16551936, "step": 4041 }, { "epoch": 3.0119225037257826, "grad_norm": 9.663922177819014, "learning_rate": 8.54834068441369e-07, "loss": 0.087, "num_input_tokens_seen": 16556032, "step": 4042 }, { "epoch": 3.0126676602086437, "grad_norm": 7.911704349558054, "learning_rate": 8.542788815257387e-07, "loss": 0.0929, "num_input_tokens_seen": 16560128, "step": 4043 }, { "epoch": 3.0134128166915053, "grad_norm": 8.336451557833179, "learning_rate": 8.537237813550032e-07, "loss": 0.1187, "num_input_tokens_seen": 16564224, "step": 4044 }, { "epoch": 3.0141579731743664, "grad_norm": 12.683588072622271, "learning_rate": 8.531687680508448e-07, "loss": 0.2024, "num_input_tokens_seen": 16568320, "step": 4045 }, { "epoch": 3.014903129657228, "grad_norm": 9.405446214589958, "learning_rate": 8.526138417349258e-07, "loss": 0.1773, "num_input_tokens_seen": 16572416, "step": 4046 }, { "epoch": 3.0156482861400895, "grad_norm": 9.754278128347737, "learning_rate": 8.520590025288902e-07, "loss": 0.1526, "num_input_tokens_seen": 16576512, "step": 4047 }, { "epoch": 3.0163934426229506, "grad_norm": 13.66732805279877, "learning_rate": 8.51504250554363e-07, "loss": 0.166, "num_input_tokens_seen": 16580608, "step": 4048 }, { "epoch": 3.017138599105812, "grad_norm": 13.341322714938123, "learning_rate": 8.5094958593295e-07, "loss": 0.2102, "num_input_tokens_seen": 16584704, "step": 4049 }, { "epoch": 3.0178837555886737, "grad_norm": 13.47508381082571, "learning_rate": 8.503950087862367e-07, "loss": 0.153, "num_input_tokens_seen": 16588800, "step": 4050 }, { "epoch": 3.018628912071535, "grad_norm": 11.077161028639763, "learning_rate": 8.498405192357909e-07, "loss": 0.2685, "num_input_tokens_seen": 16592896, "step": 4051 }, { "epoch": 3.0193740685543964, "grad_norm": 10.268527312913566, "learning_rate": 8.492861174031607e-07, "loss": 0.1963, "num_input_tokens_seen": 16596992, "step": 4052 }, { "epoch": 3.020119225037258, "grad_norm": 11.020389548818278, "learning_rate": 8.487318034098749e-07, "loss": 0.2668, "num_input_tokens_seen": 16601088, "step": 4053 }, { "epoch": 3.020864381520119, "grad_norm": 6.6549957173098075, "learning_rate": 8.481775773774431e-07, "loss": 0.1093, "num_input_tokens_seen": 16605184, "step": 4054 }, { "epoch": 3.0216095380029806, "grad_norm": 7.546777478237039, "learning_rate": 8.476234394273558e-07, "loss": 0.0888, "num_input_tokens_seen": 16609280, "step": 4055 }, { "epoch": 3.022354694485842, "grad_norm": 6.836987326364009, "learning_rate": 8.470693896810838e-07, "loss": 0.0818, "num_input_tokens_seen": 16613376, "step": 4056 }, { "epoch": 3.0230998509687033, "grad_norm": 8.88693980649234, "learning_rate": 8.465154282600785e-07, "loss": 0.1423, "num_input_tokens_seen": 16617472, "step": 4057 }, { "epoch": 3.023845007451565, "grad_norm": 9.045836228138352, "learning_rate": 8.459615552857725e-07, "loss": 0.1961, "num_input_tokens_seen": 16621568, "step": 4058 }, { "epoch": 3.0245901639344264, "grad_norm": 8.70867435074785, "learning_rate": 8.454077708795788e-07, "loss": 0.2037, "num_input_tokens_seen": 16625664, "step": 4059 }, { "epoch": 3.0253353204172875, "grad_norm": 13.419934224406466, "learning_rate": 8.448540751628907e-07, "loss": 0.2207, "num_input_tokens_seen": 16629760, "step": 4060 }, { "epoch": 3.026080476900149, "grad_norm": 7.603363146067038, "learning_rate": 8.443004682570823e-07, "loss": 0.2099, "num_input_tokens_seen": 16633856, "step": 4061 }, { "epoch": 3.0268256333830106, "grad_norm": 8.653630756250472, "learning_rate": 8.437469502835085e-07, "loss": 0.1712, "num_input_tokens_seen": 16637952, "step": 4062 }, { "epoch": 3.0275707898658717, "grad_norm": 7.970793634400148, "learning_rate": 8.431935213635041e-07, "loss": 0.1372, "num_input_tokens_seen": 16642048, "step": 4063 }, { "epoch": 3.0283159463487332, "grad_norm": 7.570262101021281, "learning_rate": 8.426401816183843e-07, "loss": 0.1396, "num_input_tokens_seen": 16646144, "step": 4064 }, { "epoch": 3.029061102831595, "grad_norm": 7.546679600491537, "learning_rate": 8.420869311694461e-07, "loss": 0.1552, "num_input_tokens_seen": 16650240, "step": 4065 }, { "epoch": 3.029806259314456, "grad_norm": 7.891969043991308, "learning_rate": 8.415337701379647e-07, "loss": 0.3261, "num_input_tokens_seen": 16654336, "step": 4066 }, { "epoch": 3.0305514157973175, "grad_norm": 5.452814880505049, "learning_rate": 8.409806986451977e-07, "loss": 0.1085, "num_input_tokens_seen": 16658432, "step": 4067 }, { "epoch": 3.031296572280179, "grad_norm": 8.165740120002438, "learning_rate": 8.404277168123824e-07, "loss": 0.0935, "num_input_tokens_seen": 16662528, "step": 4068 }, { "epoch": 3.03204172876304, "grad_norm": 7.372417466656724, "learning_rate": 8.398748247607361e-07, "loss": 0.1348, "num_input_tokens_seen": 16666624, "step": 4069 }, { "epoch": 3.0327868852459017, "grad_norm": 11.384393547150044, "learning_rate": 8.393220226114567e-07, "loss": 0.1586, "num_input_tokens_seen": 16670720, "step": 4070 }, { "epoch": 3.0335320417287632, "grad_norm": 13.245137751638586, "learning_rate": 8.387693104857223e-07, "loss": 0.1911, "num_input_tokens_seen": 16674816, "step": 4071 }, { "epoch": 3.0342771982116243, "grad_norm": 7.029896925226886, "learning_rate": 8.38216688504692e-07, "loss": 0.1974, "num_input_tokens_seen": 16678912, "step": 4072 }, { "epoch": 3.035022354694486, "grad_norm": 9.566014960183672, "learning_rate": 8.376641567895036e-07, "loss": 0.2769, "num_input_tokens_seen": 16683008, "step": 4073 }, { "epoch": 3.0357675111773474, "grad_norm": 9.66327238211232, "learning_rate": 8.371117154612765e-07, "loss": 0.2799, "num_input_tokens_seen": 16687104, "step": 4074 }, { "epoch": 3.0365126676602086, "grad_norm": 6.5713184378388085, "learning_rate": 8.365593646411095e-07, "loss": 0.0585, "num_input_tokens_seen": 16691200, "step": 4075 }, { "epoch": 3.03725782414307, "grad_norm": 7.558789043477128, "learning_rate": 8.360071044500826e-07, "loss": 0.1711, "num_input_tokens_seen": 16695296, "step": 4076 }, { "epoch": 3.0380029806259317, "grad_norm": 14.35331539221159, "learning_rate": 8.35454935009254e-07, "loss": 0.2193, "num_input_tokens_seen": 16699392, "step": 4077 }, { "epoch": 3.0387481371087928, "grad_norm": 7.884279792963685, "learning_rate": 8.349028564396638e-07, "loss": 0.1547, "num_input_tokens_seen": 16703488, "step": 4078 }, { "epoch": 3.0394932935916543, "grad_norm": 7.399947574679084, "learning_rate": 8.343508688623315e-07, "loss": 0.1077, "num_input_tokens_seen": 16707584, "step": 4079 }, { "epoch": 3.0402384500745154, "grad_norm": 7.195918275898732, "learning_rate": 8.337989723982568e-07, "loss": 0.0763, "num_input_tokens_seen": 16711680, "step": 4080 }, { "epoch": 3.040983606557377, "grad_norm": 9.451461219871595, "learning_rate": 8.332471671684193e-07, "loss": 0.1279, "num_input_tokens_seen": 16715776, "step": 4081 }, { "epoch": 3.0417287630402385, "grad_norm": 9.909092608396206, "learning_rate": 8.326954532937789e-07, "loss": 0.1108, "num_input_tokens_seen": 16719872, "step": 4082 }, { "epoch": 3.0424739195230996, "grad_norm": 8.983232652860304, "learning_rate": 8.321438308952747e-07, "loss": 0.1192, "num_input_tokens_seen": 16723968, "step": 4083 }, { "epoch": 3.043219076005961, "grad_norm": 7.880995049249585, "learning_rate": 8.315923000938264e-07, "loss": 0.1204, "num_input_tokens_seen": 16728064, "step": 4084 }, { "epoch": 3.0439642324888228, "grad_norm": 8.037857482956605, "learning_rate": 8.310408610103338e-07, "loss": 0.176, "num_input_tokens_seen": 16732160, "step": 4085 }, { "epoch": 3.044709388971684, "grad_norm": 9.08960170711045, "learning_rate": 8.30489513765676e-07, "loss": 0.1353, "num_input_tokens_seen": 16736256, "step": 4086 }, { "epoch": 3.0454545454545454, "grad_norm": 13.301507463583665, "learning_rate": 8.299382584807122e-07, "loss": 0.1407, "num_input_tokens_seen": 16740352, "step": 4087 }, { "epoch": 3.046199701937407, "grad_norm": 6.772846902425701, "learning_rate": 8.29387095276282e-07, "loss": 0.0518, "num_input_tokens_seen": 16744448, "step": 4088 }, { "epoch": 3.046944858420268, "grad_norm": 9.701288835569688, "learning_rate": 8.288360242732036e-07, "loss": 0.2693, "num_input_tokens_seen": 16748544, "step": 4089 }, { "epoch": 3.0476900149031296, "grad_norm": 10.76694472676467, "learning_rate": 8.282850455922768e-07, "loss": 0.1561, "num_input_tokens_seen": 16752640, "step": 4090 }, { "epoch": 3.048435171385991, "grad_norm": 10.721760386250104, "learning_rate": 8.277341593542787e-07, "loss": 0.2194, "num_input_tokens_seen": 16756736, "step": 4091 }, { "epoch": 3.0491803278688523, "grad_norm": 7.803115582879737, "learning_rate": 8.271833656799683e-07, "loss": 0.1689, "num_input_tokens_seen": 16760832, "step": 4092 }, { "epoch": 3.049925484351714, "grad_norm": 8.872562053225217, "learning_rate": 8.266326646900835e-07, "loss": 0.2655, "num_input_tokens_seen": 16764928, "step": 4093 }, { "epoch": 3.0506706408345754, "grad_norm": 8.829057532939668, "learning_rate": 8.260820565053413e-07, "loss": 0.1919, "num_input_tokens_seen": 16769024, "step": 4094 }, { "epoch": 3.0514157973174365, "grad_norm": 14.68827121235391, "learning_rate": 8.255315412464396e-07, "loss": 0.21, "num_input_tokens_seen": 16773120, "step": 4095 }, { "epoch": 3.052160953800298, "grad_norm": 9.719899783802907, "learning_rate": 8.249811190340548e-07, "loss": 0.3414, "num_input_tokens_seen": 16777216, "step": 4096 }, { "epoch": 3.0529061102831596, "grad_norm": 5.927418458835889, "learning_rate": 8.244307899888437e-07, "loss": 0.0898, "num_input_tokens_seen": 16781312, "step": 4097 }, { "epoch": 3.0536512667660207, "grad_norm": 10.247415870319404, "learning_rate": 8.238805542314424e-07, "loss": 0.1696, "num_input_tokens_seen": 16785408, "step": 4098 }, { "epoch": 3.0543964232488823, "grad_norm": 8.038667335701659, "learning_rate": 8.233304118824661e-07, "loss": 0.1369, "num_input_tokens_seen": 16789504, "step": 4099 }, { "epoch": 3.055141579731744, "grad_norm": 8.936367534004203, "learning_rate": 8.227803630625106e-07, "loss": 0.2774, "num_input_tokens_seen": 16793600, "step": 4100 }, { "epoch": 3.055886736214605, "grad_norm": 8.584871893935498, "learning_rate": 8.222304078921501e-07, "loss": 0.2379, "num_input_tokens_seen": 16797696, "step": 4101 }, { "epoch": 3.0566318926974665, "grad_norm": 8.731834277146874, "learning_rate": 8.216805464919384e-07, "loss": 0.3326, "num_input_tokens_seen": 16801792, "step": 4102 }, { "epoch": 3.057377049180328, "grad_norm": 8.340915230052627, "learning_rate": 8.211307789824101e-07, "loss": 0.1411, "num_input_tokens_seen": 16805888, "step": 4103 }, { "epoch": 3.058122205663189, "grad_norm": 8.775983370354842, "learning_rate": 8.205811054840768e-07, "loss": 0.1192, "num_input_tokens_seen": 16809984, "step": 4104 }, { "epoch": 3.0588673621460507, "grad_norm": 11.377920180083386, "learning_rate": 8.200315261174316e-07, "loss": 0.1726, "num_input_tokens_seen": 16814080, "step": 4105 }, { "epoch": 3.0596125186289123, "grad_norm": 9.2419943957242, "learning_rate": 8.194820410029461e-07, "loss": 0.1314, "num_input_tokens_seen": 16818176, "step": 4106 }, { "epoch": 3.0603576751117734, "grad_norm": 5.71924681235842, "learning_rate": 8.189326502610714e-07, "loss": 0.1247, "num_input_tokens_seen": 16822272, "step": 4107 }, { "epoch": 3.061102831594635, "grad_norm": 8.245396056330577, "learning_rate": 8.183833540122378e-07, "loss": 0.1379, "num_input_tokens_seen": 16826368, "step": 4108 }, { "epoch": 3.0618479880774965, "grad_norm": 8.778457054038176, "learning_rate": 8.178341523768552e-07, "loss": 0.2069, "num_input_tokens_seen": 16830464, "step": 4109 }, { "epoch": 3.0625931445603576, "grad_norm": 7.808490461067736, "learning_rate": 8.172850454753123e-07, "loss": 0.1355, "num_input_tokens_seen": 16834560, "step": 4110 }, { "epoch": 3.063338301043219, "grad_norm": 7.977136575808869, "learning_rate": 8.167360334279773e-07, "loss": 0.2288, "num_input_tokens_seen": 16838656, "step": 4111 }, { "epoch": 3.0640834575260807, "grad_norm": 8.721260091184842, "learning_rate": 8.161871163551974e-07, "loss": 0.373, "num_input_tokens_seen": 16842752, "step": 4112 }, { "epoch": 3.064828614008942, "grad_norm": 8.780316636074678, "learning_rate": 8.156382943772994e-07, "loss": 0.1746, "num_input_tokens_seen": 16846848, "step": 4113 }, { "epoch": 3.0655737704918034, "grad_norm": 8.147857143012097, "learning_rate": 8.150895676145887e-07, "loss": 0.1441, "num_input_tokens_seen": 16850944, "step": 4114 }, { "epoch": 3.066318926974665, "grad_norm": 7.238110446743728, "learning_rate": 8.145409361873505e-07, "loss": 0.1455, "num_input_tokens_seen": 16855040, "step": 4115 }, { "epoch": 3.067064083457526, "grad_norm": 9.761007607295426, "learning_rate": 8.139924002158486e-07, "loss": 0.0711, "num_input_tokens_seen": 16859136, "step": 4116 }, { "epoch": 3.0678092399403876, "grad_norm": 10.448682529505142, "learning_rate": 8.134439598203262e-07, "loss": 0.222, "num_input_tokens_seen": 16863232, "step": 4117 }, { "epoch": 3.0685543964232487, "grad_norm": 20.9668267225008, "learning_rate": 8.128956151210049e-07, "loss": 0.1932, "num_input_tokens_seen": 16867328, "step": 4118 }, { "epoch": 3.0692995529061102, "grad_norm": 9.185208584225702, "learning_rate": 8.123473662380857e-07, "loss": 0.1913, "num_input_tokens_seen": 16871424, "step": 4119 }, { "epoch": 3.070044709388972, "grad_norm": 11.122378447176073, "learning_rate": 8.117992132917493e-07, "loss": 0.3916, "num_input_tokens_seen": 16875520, "step": 4120 }, { "epoch": 3.070789865871833, "grad_norm": 10.188608702855241, "learning_rate": 8.112511564021541e-07, "loss": 0.2593, "num_input_tokens_seen": 16879616, "step": 4121 }, { "epoch": 3.0715350223546944, "grad_norm": 12.901760675575286, "learning_rate": 8.107031956894384e-07, "loss": 0.2791, "num_input_tokens_seen": 16883712, "step": 4122 }, { "epoch": 3.072280178837556, "grad_norm": 9.913950591475666, "learning_rate": 8.101553312737191e-07, "loss": 0.1712, "num_input_tokens_seen": 16887808, "step": 4123 }, { "epoch": 3.073025335320417, "grad_norm": 6.970698627672201, "learning_rate": 8.096075632750919e-07, "loss": 0.1126, "num_input_tokens_seen": 16891904, "step": 4124 }, { "epoch": 3.0737704918032787, "grad_norm": 9.729718214036618, "learning_rate": 8.090598918136315e-07, "loss": 0.2557, "num_input_tokens_seen": 16896000, "step": 4125 }, { "epoch": 3.07451564828614, "grad_norm": 8.015437434373478, "learning_rate": 8.085123170093914e-07, "loss": 0.19, "num_input_tokens_seen": 16900096, "step": 4126 }, { "epoch": 3.0752608047690013, "grad_norm": 11.021318152124687, "learning_rate": 8.079648389824043e-07, "loss": 0.1585, "num_input_tokens_seen": 16904192, "step": 4127 }, { "epoch": 3.076005961251863, "grad_norm": 8.711924820810097, "learning_rate": 8.074174578526805e-07, "loss": 0.2054, "num_input_tokens_seen": 16908288, "step": 4128 }, { "epoch": 3.0767511177347244, "grad_norm": 6.754327085372717, "learning_rate": 8.068701737402104e-07, "loss": 0.1267, "num_input_tokens_seen": 16912384, "step": 4129 }, { "epoch": 3.0774962742175855, "grad_norm": 6.914428922888777, "learning_rate": 8.063229867649627e-07, "loss": 0.1152, "num_input_tokens_seen": 16916480, "step": 4130 }, { "epoch": 3.078241430700447, "grad_norm": 9.196159092706703, "learning_rate": 8.057758970468843e-07, "loss": 0.2474, "num_input_tokens_seen": 16920576, "step": 4131 }, { "epoch": 3.0789865871833086, "grad_norm": 7.70337463199647, "learning_rate": 8.05228904705901e-07, "loss": 0.2076, "num_input_tokens_seen": 16924672, "step": 4132 }, { "epoch": 3.0797317436661698, "grad_norm": 8.698435984781032, "learning_rate": 8.046820098619179e-07, "loss": 0.1574, "num_input_tokens_seen": 16928768, "step": 4133 }, { "epoch": 3.0804769001490313, "grad_norm": 9.731684211697818, "learning_rate": 8.04135212634818e-07, "loss": 0.2466, "num_input_tokens_seen": 16932864, "step": 4134 }, { "epoch": 3.081222056631893, "grad_norm": 7.631388628405074, "learning_rate": 8.035885131444633e-07, "loss": 0.1062, "num_input_tokens_seen": 16936960, "step": 4135 }, { "epoch": 3.081967213114754, "grad_norm": 9.804978744598424, "learning_rate": 8.030419115106941e-07, "loss": 0.2359, "num_input_tokens_seen": 16941056, "step": 4136 }, { "epoch": 3.0827123695976155, "grad_norm": 10.539667273809473, "learning_rate": 8.024954078533298e-07, "loss": 0.2437, "num_input_tokens_seen": 16945152, "step": 4137 }, { "epoch": 3.083457526080477, "grad_norm": 10.884449797477544, "learning_rate": 8.019490022921674e-07, "loss": 0.1327, "num_input_tokens_seen": 16949248, "step": 4138 }, { "epoch": 3.084202682563338, "grad_norm": 7.908077288726561, "learning_rate": 8.01402694946983e-07, "loss": 0.1784, "num_input_tokens_seen": 16953344, "step": 4139 }, { "epoch": 3.0849478390461997, "grad_norm": 8.782569246792756, "learning_rate": 8.00856485937531e-07, "loss": 0.2275, "num_input_tokens_seen": 16957440, "step": 4140 }, { "epoch": 3.0856929955290613, "grad_norm": 3.645414336486539, "learning_rate": 8.003103753835446e-07, "loss": 0.0194, "num_input_tokens_seen": 16961536, "step": 4141 }, { "epoch": 3.0864381520119224, "grad_norm": 9.92629979018978, "learning_rate": 7.997643634047351e-07, "loss": 0.199, "num_input_tokens_seen": 16965632, "step": 4142 }, { "epoch": 3.087183308494784, "grad_norm": 9.886940952132791, "learning_rate": 7.992184501207918e-07, "loss": 0.178, "num_input_tokens_seen": 16969728, "step": 4143 }, { "epoch": 3.0879284649776455, "grad_norm": 11.1604499677103, "learning_rate": 7.986726356513838e-07, "loss": 0.1487, "num_input_tokens_seen": 16973824, "step": 4144 }, { "epoch": 3.0886736214605066, "grad_norm": 10.626407109077695, "learning_rate": 7.981269201161562e-07, "loss": 0.3045, "num_input_tokens_seen": 16977920, "step": 4145 }, { "epoch": 3.089418777943368, "grad_norm": 10.238048362776578, "learning_rate": 7.975813036347347e-07, "loss": 0.381, "num_input_tokens_seen": 16982016, "step": 4146 }, { "epoch": 3.0901639344262297, "grad_norm": 9.009288275546439, "learning_rate": 7.97035786326722e-07, "loss": 0.1523, "num_input_tokens_seen": 16986112, "step": 4147 }, { "epoch": 3.090909090909091, "grad_norm": 7.766497177811064, "learning_rate": 7.964903683116989e-07, "loss": 0.1934, "num_input_tokens_seen": 16990208, "step": 4148 }, { "epoch": 3.0916542473919524, "grad_norm": 12.33754124673845, "learning_rate": 7.959450497092258e-07, "loss": 0.1886, "num_input_tokens_seen": 16994304, "step": 4149 }, { "epoch": 3.092399403874814, "grad_norm": 8.62943346927499, "learning_rate": 7.953998306388399e-07, "loss": 0.2128, "num_input_tokens_seen": 16998400, "step": 4150 }, { "epoch": 3.093144560357675, "grad_norm": 8.71941250747923, "learning_rate": 7.948547112200572e-07, "loss": 0.2478, "num_input_tokens_seen": 17002496, "step": 4151 }, { "epoch": 3.0938897168405366, "grad_norm": 7.730118558954527, "learning_rate": 7.943096915723719e-07, "loss": 0.0678, "num_input_tokens_seen": 17006592, "step": 4152 }, { "epoch": 3.0946348733233977, "grad_norm": 7.895031250775975, "learning_rate": 7.937647718152562e-07, "loss": 0.1523, "num_input_tokens_seen": 17010688, "step": 4153 }, { "epoch": 3.0953800298062593, "grad_norm": 10.267324754428966, "learning_rate": 7.932199520681605e-07, "loss": 0.2964, "num_input_tokens_seen": 17014784, "step": 4154 }, { "epoch": 3.096125186289121, "grad_norm": 4.318083443817645, "learning_rate": 7.926752324505129e-07, "loss": 0.0298, "num_input_tokens_seen": 17018880, "step": 4155 }, { "epoch": 3.096870342771982, "grad_norm": 6.860451583719743, "learning_rate": 7.921306130817197e-07, "loss": 0.1723, "num_input_tokens_seen": 17022976, "step": 4156 }, { "epoch": 3.0976154992548435, "grad_norm": 8.06904784553507, "learning_rate": 7.915860940811662e-07, "loss": 0.245, "num_input_tokens_seen": 17027072, "step": 4157 }, { "epoch": 3.098360655737705, "grad_norm": 10.07261210167439, "learning_rate": 7.910416755682137e-07, "loss": 0.3018, "num_input_tokens_seen": 17031168, "step": 4158 }, { "epoch": 3.099105812220566, "grad_norm": 7.938187504907979, "learning_rate": 7.904973576622033e-07, "loss": 0.1646, "num_input_tokens_seen": 17035264, "step": 4159 }, { "epoch": 3.0998509687034277, "grad_norm": 8.855519980448143, "learning_rate": 7.899531404824531e-07, "loss": 0.1683, "num_input_tokens_seen": 17039360, "step": 4160 }, { "epoch": 3.1005961251862892, "grad_norm": 6.493172626608103, "learning_rate": 7.894090241482597e-07, "loss": 0.0213, "num_input_tokens_seen": 17043456, "step": 4161 }, { "epoch": 3.1013412816691504, "grad_norm": 12.136285571468328, "learning_rate": 7.88865008778897e-07, "loss": 0.2669, "num_input_tokens_seen": 17047552, "step": 4162 }, { "epoch": 3.102086438152012, "grad_norm": 6.73916018287132, "learning_rate": 7.88321094493617e-07, "loss": 0.0581, "num_input_tokens_seen": 17051648, "step": 4163 }, { "epoch": 3.1028315946348735, "grad_norm": 8.872069301695378, "learning_rate": 7.877772814116502e-07, "loss": 0.1658, "num_input_tokens_seen": 17055744, "step": 4164 }, { "epoch": 3.1035767511177346, "grad_norm": 8.902605572320331, "learning_rate": 7.872335696522038e-07, "loss": 0.097, "num_input_tokens_seen": 17059840, "step": 4165 }, { "epoch": 3.104321907600596, "grad_norm": 10.916390936257013, "learning_rate": 7.866899593344632e-07, "loss": 0.1136, "num_input_tokens_seen": 17063936, "step": 4166 }, { "epoch": 3.1050670640834577, "grad_norm": 11.224236829821805, "learning_rate": 7.861464505775918e-07, "loss": 0.2593, "num_input_tokens_seen": 17068032, "step": 4167 }, { "epoch": 3.105812220566319, "grad_norm": 9.018638231813016, "learning_rate": 7.856030435007307e-07, "loss": 0.296, "num_input_tokens_seen": 17072128, "step": 4168 }, { "epoch": 3.1065573770491803, "grad_norm": 10.680591403030448, "learning_rate": 7.850597382229985e-07, "loss": 0.3391, "num_input_tokens_seen": 17076224, "step": 4169 }, { "epoch": 3.107302533532042, "grad_norm": 9.729195869793537, "learning_rate": 7.845165348634918e-07, "loss": 0.1602, "num_input_tokens_seen": 17080320, "step": 4170 }, { "epoch": 3.108047690014903, "grad_norm": 13.185594833411162, "learning_rate": 7.839734335412849e-07, "loss": 0.3899, "num_input_tokens_seen": 17084416, "step": 4171 }, { "epoch": 3.1087928464977646, "grad_norm": 9.941944185121182, "learning_rate": 7.834304343754287e-07, "loss": 0.1568, "num_input_tokens_seen": 17088512, "step": 4172 }, { "epoch": 3.109538002980626, "grad_norm": 14.392157383361296, "learning_rate": 7.828875374849526e-07, "loss": 0.3643, "num_input_tokens_seen": 17092608, "step": 4173 }, { "epoch": 3.110283159463487, "grad_norm": 7.804763721070884, "learning_rate": 7.823447429888641e-07, "loss": 0.14, "num_input_tokens_seen": 17096704, "step": 4174 }, { "epoch": 3.1110283159463488, "grad_norm": 12.044163824508436, "learning_rate": 7.818020510061473e-07, "loss": 0.2393, "num_input_tokens_seen": 17100800, "step": 4175 }, { "epoch": 3.1117734724292103, "grad_norm": 8.327703232212691, "learning_rate": 7.812594616557641e-07, "loss": 0.1479, "num_input_tokens_seen": 17104896, "step": 4176 }, { "epoch": 3.1125186289120714, "grad_norm": 9.136259696392235, "learning_rate": 7.807169750566539e-07, "loss": 0.2165, "num_input_tokens_seen": 17108992, "step": 4177 }, { "epoch": 3.113263785394933, "grad_norm": 10.692080465067392, "learning_rate": 7.801745913277337e-07, "loss": 0.126, "num_input_tokens_seen": 17113088, "step": 4178 }, { "epoch": 3.1140089418777945, "grad_norm": 7.851820076168421, "learning_rate": 7.796323105878981e-07, "loss": 0.1754, "num_input_tokens_seen": 17117184, "step": 4179 }, { "epoch": 3.1147540983606556, "grad_norm": 5.902181395403144, "learning_rate": 7.790901329560185e-07, "loss": 0.0908, "num_input_tokens_seen": 17121280, "step": 4180 }, { "epoch": 3.115499254843517, "grad_norm": 8.069524604093937, "learning_rate": 7.785480585509446e-07, "loss": 0.1343, "num_input_tokens_seen": 17125376, "step": 4181 }, { "epoch": 3.1162444113263787, "grad_norm": 7.287423022251695, "learning_rate": 7.78006087491503e-07, "loss": 0.1663, "num_input_tokens_seen": 17129472, "step": 4182 }, { "epoch": 3.11698956780924, "grad_norm": 6.581648279647306, "learning_rate": 7.774642198964968e-07, "loss": 0.1701, "num_input_tokens_seen": 17133568, "step": 4183 }, { "epoch": 3.1177347242921014, "grad_norm": 10.657710140131217, "learning_rate": 7.769224558847083e-07, "loss": 0.1043, "num_input_tokens_seen": 17137664, "step": 4184 }, { "epoch": 3.118479880774963, "grad_norm": 9.478466098027305, "learning_rate": 7.763807955748955e-07, "loss": 0.3054, "num_input_tokens_seen": 17141760, "step": 4185 }, { "epoch": 3.119225037257824, "grad_norm": 7.686518136433492, "learning_rate": 7.758392390857939e-07, "loss": 0.1832, "num_input_tokens_seen": 17145856, "step": 4186 }, { "epoch": 3.1199701937406856, "grad_norm": 8.008398129198405, "learning_rate": 7.752977865361173e-07, "loss": 0.2066, "num_input_tokens_seen": 17149952, "step": 4187 }, { "epoch": 3.1207153502235467, "grad_norm": 9.125424624879189, "learning_rate": 7.74756438044555e-07, "loss": 0.2513, "num_input_tokens_seen": 17154048, "step": 4188 }, { "epoch": 3.1214605067064083, "grad_norm": 6.897072661111022, "learning_rate": 7.742151937297754e-07, "loss": 0.2331, "num_input_tokens_seen": 17158144, "step": 4189 }, { "epoch": 3.12220566318927, "grad_norm": 6.714218623954544, "learning_rate": 7.736740537104226e-07, "loss": 0.1747, "num_input_tokens_seen": 17162240, "step": 4190 }, { "epoch": 3.122950819672131, "grad_norm": 9.309329416672922, "learning_rate": 7.731330181051186e-07, "loss": 0.2028, "num_input_tokens_seen": 17166336, "step": 4191 }, { "epoch": 3.1236959761549925, "grad_norm": 10.175929564214622, "learning_rate": 7.725920870324622e-07, "loss": 0.2991, "num_input_tokens_seen": 17170432, "step": 4192 }, { "epoch": 3.124441132637854, "grad_norm": 9.030497721752035, "learning_rate": 7.720512606110292e-07, "loss": 0.2576, "num_input_tokens_seen": 17174528, "step": 4193 }, { "epoch": 3.125186289120715, "grad_norm": 10.564916392499235, "learning_rate": 7.715105389593728e-07, "loss": 0.0349, "num_input_tokens_seen": 17178624, "step": 4194 }, { "epoch": 3.1259314456035767, "grad_norm": 7.558951619610015, "learning_rate": 7.709699221960229e-07, "loss": 0.1668, "num_input_tokens_seen": 17182720, "step": 4195 }, { "epoch": 3.1266766020864383, "grad_norm": 9.065273500016017, "learning_rate": 7.704294104394866e-07, "loss": 0.2616, "num_input_tokens_seen": 17186816, "step": 4196 }, { "epoch": 3.1274217585692994, "grad_norm": 8.208685505409278, "learning_rate": 7.698890038082482e-07, "loss": 0.1545, "num_input_tokens_seen": 17190912, "step": 4197 }, { "epoch": 3.128166915052161, "grad_norm": 11.518191413067708, "learning_rate": 7.693487024207689e-07, "loss": 0.1944, "num_input_tokens_seen": 17195008, "step": 4198 }, { "epoch": 3.1289120715350225, "grad_norm": 5.96373493737634, "learning_rate": 7.688085063954853e-07, "loss": 0.0992, "num_input_tokens_seen": 17199104, "step": 4199 }, { "epoch": 3.1296572280178836, "grad_norm": 7.016107292098186, "learning_rate": 7.682684158508136e-07, "loss": 0.154, "num_input_tokens_seen": 17203200, "step": 4200 }, { "epoch": 3.130402384500745, "grad_norm": 9.734313844693412, "learning_rate": 7.677284309051447e-07, "loss": 0.152, "num_input_tokens_seen": 17207296, "step": 4201 }, { "epoch": 3.1311475409836067, "grad_norm": 9.063891725662803, "learning_rate": 7.67188551676848e-07, "loss": 0.1461, "num_input_tokens_seen": 17211392, "step": 4202 }, { "epoch": 3.131892697466468, "grad_norm": 9.177995883716324, "learning_rate": 7.666487782842677e-07, "loss": 0.2459, "num_input_tokens_seen": 17215488, "step": 4203 }, { "epoch": 3.1326378539493294, "grad_norm": 6.625535234528294, "learning_rate": 7.661091108457269e-07, "loss": 0.1234, "num_input_tokens_seen": 17219584, "step": 4204 }, { "epoch": 3.133383010432191, "grad_norm": 7.157269670748979, "learning_rate": 7.655695494795243e-07, "loss": 0.1223, "num_input_tokens_seen": 17223680, "step": 4205 }, { "epoch": 3.134128166915052, "grad_norm": 5.140234503916483, "learning_rate": 7.650300943039355e-07, "loss": 0.0309, "num_input_tokens_seen": 17227776, "step": 4206 }, { "epoch": 3.1348733233979136, "grad_norm": 6.931652650633548, "learning_rate": 7.644907454372131e-07, "loss": 0.0943, "num_input_tokens_seen": 17231872, "step": 4207 }, { "epoch": 3.135618479880775, "grad_norm": 7.418944805942748, "learning_rate": 7.639515029975862e-07, "loss": 0.1547, "num_input_tokens_seen": 17235968, "step": 4208 }, { "epoch": 3.1363636363636362, "grad_norm": 9.361114062797178, "learning_rate": 7.634123671032608e-07, "loss": 0.1743, "num_input_tokens_seen": 17240064, "step": 4209 }, { "epoch": 3.137108792846498, "grad_norm": 9.704545910310777, "learning_rate": 7.628733378724187e-07, "loss": 0.1819, "num_input_tokens_seen": 17244160, "step": 4210 }, { "epoch": 3.1378539493293593, "grad_norm": 10.27618984342055, "learning_rate": 7.623344154232196e-07, "loss": 0.0873, "num_input_tokens_seen": 17248256, "step": 4211 }, { "epoch": 3.1385991058122205, "grad_norm": 9.147866085871238, "learning_rate": 7.617955998737988e-07, "loss": 0.1714, "num_input_tokens_seen": 17252352, "step": 4212 }, { "epoch": 3.139344262295082, "grad_norm": 4.403613449782856, "learning_rate": 7.612568913422686e-07, "loss": 0.0333, "num_input_tokens_seen": 17256448, "step": 4213 }, { "epoch": 3.1400894187779436, "grad_norm": 7.595156115662085, "learning_rate": 7.607182899467176e-07, "loss": 0.0626, "num_input_tokens_seen": 17260544, "step": 4214 }, { "epoch": 3.1408345752608047, "grad_norm": 9.528928855299014, "learning_rate": 7.601797958052112e-07, "loss": 0.1614, "num_input_tokens_seen": 17264640, "step": 4215 }, { "epoch": 3.1415797317436662, "grad_norm": 9.231956041393108, "learning_rate": 7.596414090357912e-07, "loss": 0.2243, "num_input_tokens_seen": 17268736, "step": 4216 }, { "epoch": 3.1423248882265273, "grad_norm": 25.774826684054826, "learning_rate": 7.59103129756476e-07, "loss": 0.2353, "num_input_tokens_seen": 17272832, "step": 4217 }, { "epoch": 3.143070044709389, "grad_norm": 7.4252925346675545, "learning_rate": 7.585649580852599e-07, "loss": 0.0925, "num_input_tokens_seen": 17276928, "step": 4218 }, { "epoch": 3.1438152011922504, "grad_norm": 10.606716006203278, "learning_rate": 7.580268941401145e-07, "loss": 0.108, "num_input_tokens_seen": 17281024, "step": 4219 }, { "epoch": 3.144560357675112, "grad_norm": 8.745914470658317, "learning_rate": 7.574889380389864e-07, "loss": 0.1947, "num_input_tokens_seen": 17285120, "step": 4220 }, { "epoch": 3.145305514157973, "grad_norm": 10.396511487300682, "learning_rate": 7.569510898998002e-07, "loss": 0.3975, "num_input_tokens_seen": 17289216, "step": 4221 }, { "epoch": 3.1460506706408347, "grad_norm": 7.356590880539959, "learning_rate": 7.564133498404556e-07, "loss": 0.1267, "num_input_tokens_seen": 17293312, "step": 4222 }, { "epoch": 3.1467958271236958, "grad_norm": 9.99953830216545, "learning_rate": 7.558757179788294e-07, "loss": 0.1794, "num_input_tokens_seen": 17297408, "step": 4223 }, { "epoch": 3.1475409836065573, "grad_norm": 10.240776705256371, "learning_rate": 7.55338194432774e-07, "loss": 0.2392, "num_input_tokens_seen": 17301504, "step": 4224 }, { "epoch": 3.148286140089419, "grad_norm": 7.911458030032286, "learning_rate": 7.548007793201193e-07, "loss": 0.2005, "num_input_tokens_seen": 17305600, "step": 4225 }, { "epoch": 3.14903129657228, "grad_norm": 8.553167000363777, "learning_rate": 7.542634727586691e-07, "loss": 0.2811, "num_input_tokens_seen": 17309696, "step": 4226 }, { "epoch": 3.1497764530551415, "grad_norm": 8.476130194882748, "learning_rate": 7.537262748662056e-07, "loss": 0.1711, "num_input_tokens_seen": 17313792, "step": 4227 }, { "epoch": 3.150521609538003, "grad_norm": 6.66070360589709, "learning_rate": 7.531891857604864e-07, "loss": 0.0526, "num_input_tokens_seen": 17317888, "step": 4228 }, { "epoch": 3.151266766020864, "grad_norm": 7.6202897851805265, "learning_rate": 7.526522055592455e-07, "loss": 0.2559, "num_input_tokens_seen": 17321984, "step": 4229 }, { "epoch": 3.1520119225037257, "grad_norm": 7.343602903749743, "learning_rate": 7.521153343801923e-07, "loss": 0.2256, "num_input_tokens_seen": 17326080, "step": 4230 }, { "epoch": 3.1527570789865873, "grad_norm": 7.60134511102121, "learning_rate": 7.515785723410129e-07, "loss": 0.1021, "num_input_tokens_seen": 17330176, "step": 4231 }, { "epoch": 3.1535022354694484, "grad_norm": 8.53755544044615, "learning_rate": 7.510419195593697e-07, "loss": 0.1736, "num_input_tokens_seen": 17334272, "step": 4232 }, { "epoch": 3.15424739195231, "grad_norm": 9.848875202280267, "learning_rate": 7.505053761529003e-07, "loss": 0.1708, "num_input_tokens_seen": 17338368, "step": 4233 }, { "epoch": 3.1549925484351715, "grad_norm": 8.55438319541434, "learning_rate": 7.499689422392194e-07, "loss": 0.3446, "num_input_tokens_seen": 17342464, "step": 4234 }, { "epoch": 3.1557377049180326, "grad_norm": 7.6825999883922576, "learning_rate": 7.494326179359168e-07, "loss": 0.1077, "num_input_tokens_seen": 17346560, "step": 4235 }, { "epoch": 3.156482861400894, "grad_norm": 11.34696320189813, "learning_rate": 7.488964033605589e-07, "loss": 0.1851, "num_input_tokens_seen": 17350656, "step": 4236 }, { "epoch": 3.1572280178837557, "grad_norm": 6.056670228030469, "learning_rate": 7.483602986306876e-07, "loss": 0.0414, "num_input_tokens_seen": 17354752, "step": 4237 }, { "epoch": 3.157973174366617, "grad_norm": 12.43321146849545, "learning_rate": 7.478243038638208e-07, "loss": 0.3151, "num_input_tokens_seen": 17358848, "step": 4238 }, { "epoch": 3.1587183308494784, "grad_norm": 9.691633390155058, "learning_rate": 7.472884191774526e-07, "loss": 0.1714, "num_input_tokens_seen": 17362944, "step": 4239 }, { "epoch": 3.15946348733234, "grad_norm": 8.198893236432498, "learning_rate": 7.467526446890525e-07, "loss": 0.1565, "num_input_tokens_seen": 17367040, "step": 4240 }, { "epoch": 3.160208643815201, "grad_norm": 10.275178862477434, "learning_rate": 7.46216980516066e-07, "loss": 0.4034, "num_input_tokens_seen": 17371136, "step": 4241 }, { "epoch": 3.1609538002980626, "grad_norm": 11.466545913632164, "learning_rate": 7.456814267759147e-07, "loss": 0.2775, "num_input_tokens_seen": 17375232, "step": 4242 }, { "epoch": 3.161698956780924, "grad_norm": 10.409741962788884, "learning_rate": 7.451459835859958e-07, "loss": 0.3386, "num_input_tokens_seen": 17379328, "step": 4243 }, { "epoch": 3.1624441132637853, "grad_norm": 8.351434900871745, "learning_rate": 7.446106510636823e-07, "loss": 0.1241, "num_input_tokens_seen": 17383424, "step": 4244 }, { "epoch": 3.163189269746647, "grad_norm": 10.385933213531079, "learning_rate": 7.440754293263231e-07, "loss": 0.2233, "num_input_tokens_seen": 17387520, "step": 4245 }, { "epoch": 3.1639344262295084, "grad_norm": 4.827251259705188, "learning_rate": 7.435403184912424e-07, "loss": 0.0521, "num_input_tokens_seen": 17391616, "step": 4246 }, { "epoch": 3.1646795827123695, "grad_norm": 4.3934254373007215, "learning_rate": 7.430053186757401e-07, "loss": 0.0503, "num_input_tokens_seen": 17395712, "step": 4247 }, { "epoch": 3.165424739195231, "grad_norm": 10.606013539759715, "learning_rate": 7.424704299970922e-07, "loss": 0.187, "num_input_tokens_seen": 17399808, "step": 4248 }, { "epoch": 3.1661698956780926, "grad_norm": 7.751487767266338, "learning_rate": 7.419356525725502e-07, "loss": 0.1466, "num_input_tokens_seen": 17403904, "step": 4249 }, { "epoch": 3.1669150521609537, "grad_norm": 7.713365350496284, "learning_rate": 7.414009865193411e-07, "loss": 0.1999, "num_input_tokens_seen": 17408000, "step": 4250 }, { "epoch": 3.1676602086438153, "grad_norm": 7.861424795380001, "learning_rate": 7.408664319546674e-07, "loss": 0.1038, "num_input_tokens_seen": 17412096, "step": 4251 }, { "epoch": 3.168405365126677, "grad_norm": 9.30427511068808, "learning_rate": 7.403319889957079e-07, "loss": 0.2292, "num_input_tokens_seen": 17416192, "step": 4252 }, { "epoch": 3.169150521609538, "grad_norm": 10.723555539863728, "learning_rate": 7.397976577596153e-07, "loss": 0.2647, "num_input_tokens_seen": 17420288, "step": 4253 }, { "epoch": 3.1698956780923995, "grad_norm": 8.830778224303891, "learning_rate": 7.392634383635191e-07, "loss": 0.1169, "num_input_tokens_seen": 17424384, "step": 4254 }, { "epoch": 3.170640834575261, "grad_norm": 9.103774191774134, "learning_rate": 7.387293309245244e-07, "loss": 0.1106, "num_input_tokens_seen": 17428480, "step": 4255 }, { "epoch": 3.171385991058122, "grad_norm": 8.048197429077318, "learning_rate": 7.381953355597114e-07, "loss": 0.1772, "num_input_tokens_seen": 17432576, "step": 4256 }, { "epoch": 3.1721311475409837, "grad_norm": 7.529915237577357, "learning_rate": 7.376614523861352e-07, "loss": 0.167, "num_input_tokens_seen": 17436672, "step": 4257 }, { "epoch": 3.172876304023845, "grad_norm": 10.67356536577827, "learning_rate": 7.371276815208269e-07, "loss": 0.0734, "num_input_tokens_seen": 17440768, "step": 4258 }, { "epoch": 3.1736214605067063, "grad_norm": 9.064591035836438, "learning_rate": 7.365940230807933e-07, "loss": 0.2127, "num_input_tokens_seen": 17444864, "step": 4259 }, { "epoch": 3.174366616989568, "grad_norm": 8.298744017749543, "learning_rate": 7.360604771830157e-07, "loss": 0.0659, "num_input_tokens_seen": 17448960, "step": 4260 }, { "epoch": 3.175111773472429, "grad_norm": 8.687145013038187, "learning_rate": 7.355270439444514e-07, "loss": 0.296, "num_input_tokens_seen": 17453056, "step": 4261 }, { "epoch": 3.1758569299552906, "grad_norm": 6.643681379780528, "learning_rate": 7.349937234820328e-07, "loss": 0.1302, "num_input_tokens_seen": 17457152, "step": 4262 }, { "epoch": 3.176602086438152, "grad_norm": 9.584091246539396, "learning_rate": 7.344605159126675e-07, "loss": 0.1931, "num_input_tokens_seen": 17461248, "step": 4263 }, { "epoch": 3.1773472429210132, "grad_norm": 9.512294787939927, "learning_rate": 7.339274213532385e-07, "loss": 0.2499, "num_input_tokens_seen": 17465344, "step": 4264 }, { "epoch": 3.178092399403875, "grad_norm": 9.46943882129879, "learning_rate": 7.333944399206039e-07, "loss": 0.1706, "num_input_tokens_seen": 17469440, "step": 4265 }, { "epoch": 3.1788375558867363, "grad_norm": 8.506863063351531, "learning_rate": 7.328615717315968e-07, "loss": 0.2093, "num_input_tokens_seen": 17473536, "step": 4266 }, { "epoch": 3.1795827123695974, "grad_norm": 11.39491296796593, "learning_rate": 7.323288169030259e-07, "loss": 0.165, "num_input_tokens_seen": 17477632, "step": 4267 }, { "epoch": 3.180327868852459, "grad_norm": 8.463721000233429, "learning_rate": 7.317961755516748e-07, "loss": 0.1533, "num_input_tokens_seen": 17481728, "step": 4268 }, { "epoch": 3.1810730253353205, "grad_norm": 7.9565551386619795, "learning_rate": 7.312636477943022e-07, "loss": 0.0737, "num_input_tokens_seen": 17485824, "step": 4269 }, { "epoch": 3.1818181818181817, "grad_norm": 9.659248381382826, "learning_rate": 7.307312337476422e-07, "loss": 0.0886, "num_input_tokens_seen": 17489920, "step": 4270 }, { "epoch": 3.182563338301043, "grad_norm": 8.10841041493851, "learning_rate": 7.301989335284035e-07, "loss": 0.103, "num_input_tokens_seen": 17494016, "step": 4271 }, { "epoch": 3.1833084947839048, "grad_norm": 10.375575906133944, "learning_rate": 7.296667472532706e-07, "loss": 0.2059, "num_input_tokens_seen": 17498112, "step": 4272 }, { "epoch": 3.184053651266766, "grad_norm": 8.813203782451561, "learning_rate": 7.291346750389019e-07, "loss": 0.1339, "num_input_tokens_seen": 17502208, "step": 4273 }, { "epoch": 3.1847988077496274, "grad_norm": 8.418303533440133, "learning_rate": 7.286027170019322e-07, "loss": 0.1836, "num_input_tokens_seen": 17506304, "step": 4274 }, { "epoch": 3.185543964232489, "grad_norm": 9.268404800453185, "learning_rate": 7.280708732589695e-07, "loss": 0.1795, "num_input_tokens_seen": 17510400, "step": 4275 }, { "epoch": 3.18628912071535, "grad_norm": 7.780523066289608, "learning_rate": 7.275391439265986e-07, "loss": 0.0808, "num_input_tokens_seen": 17514496, "step": 4276 }, { "epoch": 3.1870342771982116, "grad_norm": 15.426595749525614, "learning_rate": 7.270075291213782e-07, "loss": 0.1629, "num_input_tokens_seen": 17518592, "step": 4277 }, { "epoch": 3.187779433681073, "grad_norm": 8.600624852774528, "learning_rate": 7.264760289598419e-07, "loss": 0.2448, "num_input_tokens_seen": 17522688, "step": 4278 }, { "epoch": 3.1885245901639343, "grad_norm": 9.514586495849446, "learning_rate": 7.25944643558498e-07, "loss": 0.1676, "num_input_tokens_seen": 17526784, "step": 4279 }, { "epoch": 3.189269746646796, "grad_norm": 6.75104314314916, "learning_rate": 7.254133730338306e-07, "loss": 0.0409, "num_input_tokens_seen": 17530880, "step": 4280 }, { "epoch": 3.1900149031296574, "grad_norm": 6.961829398164519, "learning_rate": 7.24882217502298e-07, "loss": 0.1195, "num_input_tokens_seen": 17534976, "step": 4281 }, { "epoch": 3.1907600596125185, "grad_norm": 9.255183099403414, "learning_rate": 7.243511770803321e-07, "loss": 0.2234, "num_input_tokens_seen": 17539072, "step": 4282 }, { "epoch": 3.19150521609538, "grad_norm": 7.854232412855358, "learning_rate": 7.238202518843424e-07, "loss": 0.1173, "num_input_tokens_seen": 17543168, "step": 4283 }, { "epoch": 3.1922503725782416, "grad_norm": 9.580335362097259, "learning_rate": 7.232894420307102e-07, "loss": 0.2849, "num_input_tokens_seen": 17547264, "step": 4284 }, { "epoch": 3.1929955290611027, "grad_norm": 8.47224700313672, "learning_rate": 7.227587476357939e-07, "loss": 0.1773, "num_input_tokens_seen": 17551360, "step": 4285 }, { "epoch": 3.1937406855439643, "grad_norm": 8.688292870620542, "learning_rate": 7.222281688159243e-07, "loss": 0.1304, "num_input_tokens_seen": 17555456, "step": 4286 }, { "epoch": 3.194485842026826, "grad_norm": 10.767596605656887, "learning_rate": 7.216977056874094e-07, "loss": 0.2022, "num_input_tokens_seen": 17559552, "step": 4287 }, { "epoch": 3.195230998509687, "grad_norm": 10.119650262502768, "learning_rate": 7.211673583665297e-07, "loss": 0.1698, "num_input_tokens_seen": 17563648, "step": 4288 }, { "epoch": 3.1959761549925485, "grad_norm": 7.495389160703888, "learning_rate": 7.206371269695408e-07, "loss": 0.1354, "num_input_tokens_seen": 17567744, "step": 4289 }, { "epoch": 3.19672131147541, "grad_norm": 6.548731101876123, "learning_rate": 7.201070116126743e-07, "loss": 0.1313, "num_input_tokens_seen": 17571840, "step": 4290 }, { "epoch": 3.197466467958271, "grad_norm": 8.865116740596134, "learning_rate": 7.19577012412134e-07, "loss": 0.2265, "num_input_tokens_seen": 17575936, "step": 4291 }, { "epoch": 3.1982116244411327, "grad_norm": 8.847292065237031, "learning_rate": 7.190471294841011e-07, "loss": 0.2047, "num_input_tokens_seen": 17580032, "step": 4292 }, { "epoch": 3.198956780923994, "grad_norm": 10.295640211750875, "learning_rate": 7.185173629447279e-07, "loss": 0.1813, "num_input_tokens_seen": 17584128, "step": 4293 }, { "epoch": 3.1997019374068554, "grad_norm": 8.09059469008687, "learning_rate": 7.179877129101443e-07, "loss": 0.1191, "num_input_tokens_seen": 17588224, "step": 4294 }, { "epoch": 3.200447093889717, "grad_norm": 7.0158049892234065, "learning_rate": 7.174581794964522e-07, "loss": 0.165, "num_input_tokens_seen": 17592320, "step": 4295 }, { "epoch": 3.201192250372578, "grad_norm": 8.036377965101817, "learning_rate": 7.169287628197307e-07, "loss": 0.1886, "num_input_tokens_seen": 17596416, "step": 4296 }, { "epoch": 3.2019374068554396, "grad_norm": 9.453586016559038, "learning_rate": 7.163994629960307e-07, "loss": 0.1239, "num_input_tokens_seen": 17600512, "step": 4297 }, { "epoch": 3.202682563338301, "grad_norm": 9.528624732749925, "learning_rate": 7.15870280141378e-07, "loss": 0.2994, "num_input_tokens_seen": 17604608, "step": 4298 }, { "epoch": 3.2034277198211623, "grad_norm": 6.343980824656366, "learning_rate": 7.153412143717747e-07, "loss": 0.0831, "num_input_tokens_seen": 17608704, "step": 4299 }, { "epoch": 3.204172876304024, "grad_norm": 8.11830954238935, "learning_rate": 7.148122658031945e-07, "loss": 0.1448, "num_input_tokens_seen": 17612800, "step": 4300 }, { "epoch": 3.2049180327868854, "grad_norm": 6.666771200687216, "learning_rate": 7.142834345515876e-07, "loss": 0.2095, "num_input_tokens_seen": 17616896, "step": 4301 }, { "epoch": 3.2056631892697465, "grad_norm": 7.793996753397944, "learning_rate": 7.137547207328765e-07, "loss": 0.1772, "num_input_tokens_seen": 17620992, "step": 4302 }, { "epoch": 3.206408345752608, "grad_norm": 6.1779319036602525, "learning_rate": 7.132261244629607e-07, "loss": 0.052, "num_input_tokens_seen": 17625088, "step": 4303 }, { "epoch": 3.2071535022354696, "grad_norm": 10.455434259134575, "learning_rate": 7.126976458577104e-07, "loss": 0.2164, "num_input_tokens_seen": 17629184, "step": 4304 }, { "epoch": 3.2078986587183307, "grad_norm": 10.875852403615017, "learning_rate": 7.12169285032974e-07, "loss": 0.2553, "num_input_tokens_seen": 17633280, "step": 4305 }, { "epoch": 3.2086438152011922, "grad_norm": 10.655327005090236, "learning_rate": 7.116410421045699e-07, "loss": 0.3587, "num_input_tokens_seen": 17637376, "step": 4306 }, { "epoch": 3.209388971684054, "grad_norm": 5.597020797904595, "learning_rate": 7.11112917188294e-07, "loss": 0.1345, "num_input_tokens_seen": 17641472, "step": 4307 }, { "epoch": 3.210134128166915, "grad_norm": 10.070839309483947, "learning_rate": 7.105849103999147e-07, "loss": 0.3634, "num_input_tokens_seen": 17645568, "step": 4308 }, { "epoch": 3.2108792846497765, "grad_norm": 8.05471364884591, "learning_rate": 7.100570218551741e-07, "loss": 0.1955, "num_input_tokens_seen": 17649664, "step": 4309 }, { "epoch": 3.211624441132638, "grad_norm": 12.529755730270256, "learning_rate": 7.095292516697905e-07, "loss": 0.2181, "num_input_tokens_seen": 17653760, "step": 4310 }, { "epoch": 3.212369597615499, "grad_norm": 6.63593294411865, "learning_rate": 7.090015999594538e-07, "loss": 0.0752, "num_input_tokens_seen": 17657856, "step": 4311 }, { "epoch": 3.2131147540983607, "grad_norm": 8.368616980123967, "learning_rate": 7.084740668398299e-07, "loss": 0.2234, "num_input_tokens_seen": 17661952, "step": 4312 }, { "epoch": 3.2138599105812222, "grad_norm": 8.441807672219257, "learning_rate": 7.079466524265569e-07, "loss": 0.1871, "num_input_tokens_seen": 17666048, "step": 4313 }, { "epoch": 3.2146050670640833, "grad_norm": 8.95936199210585, "learning_rate": 7.074193568352486e-07, "loss": 0.1852, "num_input_tokens_seen": 17670144, "step": 4314 }, { "epoch": 3.215350223546945, "grad_norm": 8.838755427077556, "learning_rate": 7.068921801814918e-07, "loss": 0.0949, "num_input_tokens_seen": 17674240, "step": 4315 }, { "epoch": 3.2160953800298064, "grad_norm": 14.363149765361023, "learning_rate": 7.063651225808468e-07, "loss": 0.1671, "num_input_tokens_seen": 17678336, "step": 4316 }, { "epoch": 3.2168405365126675, "grad_norm": 6.873206361270975, "learning_rate": 7.058381841488493e-07, "loss": 0.059, "num_input_tokens_seen": 17682432, "step": 4317 }, { "epoch": 3.217585692995529, "grad_norm": 10.120044325360622, "learning_rate": 7.053113650010071e-07, "loss": 0.2371, "num_input_tokens_seen": 17686528, "step": 4318 }, { "epoch": 3.2183308494783907, "grad_norm": 8.03994232677237, "learning_rate": 7.047846652528039e-07, "loss": 0.0789, "num_input_tokens_seen": 17690624, "step": 4319 }, { "epoch": 3.2190760059612518, "grad_norm": 9.96296042620963, "learning_rate": 7.042580850196943e-07, "loss": 0.2915, "num_input_tokens_seen": 17694720, "step": 4320 }, { "epoch": 3.2198211624441133, "grad_norm": 6.854001856840013, "learning_rate": 7.037316244171101e-07, "loss": 0.1108, "num_input_tokens_seen": 17698816, "step": 4321 }, { "epoch": 3.220566318926975, "grad_norm": 7.446987709746264, "learning_rate": 7.032052835604539e-07, "loss": 0.1996, "num_input_tokens_seen": 17702912, "step": 4322 }, { "epoch": 3.221311475409836, "grad_norm": 8.95995158180337, "learning_rate": 7.026790625651045e-07, "loss": 0.2559, "num_input_tokens_seen": 17707008, "step": 4323 }, { "epoch": 3.2220566318926975, "grad_norm": 8.125894919440356, "learning_rate": 7.021529615464123e-07, "loss": 0.2092, "num_input_tokens_seen": 17711104, "step": 4324 }, { "epoch": 3.222801788375559, "grad_norm": 7.161969868092251, "learning_rate": 7.016269806197035e-07, "loss": 0.0963, "num_input_tokens_seen": 17715200, "step": 4325 }, { "epoch": 3.22354694485842, "grad_norm": 6.585415829501309, "learning_rate": 7.011011199002763e-07, "loss": 0.0914, "num_input_tokens_seen": 17719296, "step": 4326 }, { "epoch": 3.2242921013412817, "grad_norm": 8.02475373323911, "learning_rate": 7.005753795034023e-07, "loss": 0.276, "num_input_tokens_seen": 17723392, "step": 4327 }, { "epoch": 3.225037257824143, "grad_norm": 8.64845440948297, "learning_rate": 7.000497595443293e-07, "loss": 0.1727, "num_input_tokens_seen": 17727488, "step": 4328 }, { "epoch": 3.2257824143070044, "grad_norm": 7.872253564037367, "learning_rate": 6.995242601382751e-07, "loss": 0.151, "num_input_tokens_seen": 17731584, "step": 4329 }, { "epoch": 3.226527570789866, "grad_norm": 6.821918735808329, "learning_rate": 6.989988814004345e-07, "loss": 0.0459, "num_input_tokens_seen": 17735680, "step": 4330 }, { "epoch": 3.227272727272727, "grad_norm": 8.35661029147756, "learning_rate": 6.984736234459731e-07, "loss": 0.0752, "num_input_tokens_seen": 17739776, "step": 4331 }, { "epoch": 3.2280178837555886, "grad_norm": 6.688235879470439, "learning_rate": 6.979484863900327e-07, "loss": 0.0427, "num_input_tokens_seen": 17743872, "step": 4332 }, { "epoch": 3.22876304023845, "grad_norm": 6.265539265286159, "learning_rate": 6.97423470347725e-07, "loss": 0.0538, "num_input_tokens_seen": 17747968, "step": 4333 }, { "epoch": 3.2295081967213113, "grad_norm": 5.688678849630117, "learning_rate": 6.96898575434139e-07, "loss": 0.0552, "num_input_tokens_seen": 17752064, "step": 4334 }, { "epoch": 3.230253353204173, "grad_norm": 8.921987971350633, "learning_rate": 6.963738017643347e-07, "loss": 0.1781, "num_input_tokens_seen": 17756160, "step": 4335 }, { "epoch": 3.2309985096870344, "grad_norm": 10.04476903870131, "learning_rate": 6.958491494533458e-07, "loss": 0.3021, "num_input_tokens_seen": 17760256, "step": 4336 }, { "epoch": 3.2317436661698955, "grad_norm": 9.821033108092436, "learning_rate": 6.953246186161808e-07, "loss": 0.3497, "num_input_tokens_seen": 17764352, "step": 4337 }, { "epoch": 3.232488822652757, "grad_norm": 5.62904412151616, "learning_rate": 6.948002093678196e-07, "loss": 0.0512, "num_input_tokens_seen": 17768448, "step": 4338 }, { "epoch": 3.2332339791356186, "grad_norm": 11.467497397210286, "learning_rate": 6.942759218232175e-07, "loss": 0.3107, "num_input_tokens_seen": 17772544, "step": 4339 }, { "epoch": 3.2339791356184797, "grad_norm": 9.262557095743835, "learning_rate": 6.937517560973012e-07, "loss": 0.067, "num_input_tokens_seen": 17776640, "step": 4340 }, { "epoch": 3.2347242921013413, "grad_norm": 9.540242321616, "learning_rate": 6.932277123049723e-07, "loss": 0.1648, "num_input_tokens_seen": 17780736, "step": 4341 }, { "epoch": 3.235469448584203, "grad_norm": 11.126707361405831, "learning_rate": 6.927037905611045e-07, "loss": 0.3631, "num_input_tokens_seen": 17784832, "step": 4342 }, { "epoch": 3.236214605067064, "grad_norm": 6.050303359971336, "learning_rate": 6.921799909805449e-07, "loss": 0.0521, "num_input_tokens_seen": 17788928, "step": 4343 }, { "epoch": 3.2369597615499255, "grad_norm": 7.9794745225438515, "learning_rate": 6.916563136781149e-07, "loss": 0.1983, "num_input_tokens_seen": 17793024, "step": 4344 }, { "epoch": 3.237704918032787, "grad_norm": 9.833377407741484, "learning_rate": 6.911327587686074e-07, "loss": 0.3553, "num_input_tokens_seen": 17797120, "step": 4345 }, { "epoch": 3.238450074515648, "grad_norm": 6.687125000179422, "learning_rate": 6.906093263667909e-07, "loss": 0.0744, "num_input_tokens_seen": 17801216, "step": 4346 }, { "epoch": 3.2391952309985097, "grad_norm": 10.546777876111188, "learning_rate": 6.900860165874034e-07, "loss": 0.1371, "num_input_tokens_seen": 17805312, "step": 4347 }, { "epoch": 3.2399403874813713, "grad_norm": 5.682769190977884, "learning_rate": 6.895628295451598e-07, "loss": 0.0636, "num_input_tokens_seen": 17809408, "step": 4348 }, { "epoch": 3.2406855439642324, "grad_norm": 10.499143714872549, "learning_rate": 6.89039765354745e-07, "loss": 0.2522, "num_input_tokens_seen": 17813504, "step": 4349 }, { "epoch": 3.241430700447094, "grad_norm": 8.214127124622498, "learning_rate": 6.885168241308202e-07, "loss": 0.1268, "num_input_tokens_seen": 17817600, "step": 4350 }, { "epoch": 3.2421758569299555, "grad_norm": 9.416000223567144, "learning_rate": 6.879940059880163e-07, "loss": 0.2684, "num_input_tokens_seen": 17821696, "step": 4351 }, { "epoch": 3.2429210134128166, "grad_norm": 7.066262122271111, "learning_rate": 6.874713110409396e-07, "loss": 0.1755, "num_input_tokens_seen": 17825792, "step": 4352 }, { "epoch": 3.243666169895678, "grad_norm": 5.697246765036756, "learning_rate": 6.869487394041685e-07, "loss": 0.0596, "num_input_tokens_seen": 17829888, "step": 4353 }, { "epoch": 3.2444113263785397, "grad_norm": 6.751247178710642, "learning_rate": 6.864262911922538e-07, "loss": 0.155, "num_input_tokens_seen": 17833984, "step": 4354 }, { "epoch": 3.245156482861401, "grad_norm": 8.804352845471017, "learning_rate": 6.85903966519721e-07, "loss": 0.1462, "num_input_tokens_seen": 17838080, "step": 4355 }, { "epoch": 3.2459016393442623, "grad_norm": 7.7427245754371885, "learning_rate": 6.85381765501066e-07, "loss": 0.1708, "num_input_tokens_seen": 17842176, "step": 4356 }, { "epoch": 3.246646795827124, "grad_norm": 12.89394190095639, "learning_rate": 6.848596882507602e-07, "loss": 0.2448, "num_input_tokens_seen": 17846272, "step": 4357 }, { "epoch": 3.247391952309985, "grad_norm": 10.270482395476996, "learning_rate": 6.843377348832459e-07, "loss": 0.0475, "num_input_tokens_seen": 17850368, "step": 4358 }, { "epoch": 3.2481371087928466, "grad_norm": 6.4124200496209065, "learning_rate": 6.838159055129401e-07, "loss": 0.0409, "num_input_tokens_seen": 17854464, "step": 4359 }, { "epoch": 3.248882265275708, "grad_norm": 8.463620382763287, "learning_rate": 6.832942002542299e-07, "loss": 0.2325, "num_input_tokens_seen": 17858560, "step": 4360 }, { "epoch": 3.2496274217585692, "grad_norm": 10.126122393874436, "learning_rate": 6.827726192214781e-07, "loss": 0.2665, "num_input_tokens_seen": 17862656, "step": 4361 }, { "epoch": 3.2503725782414308, "grad_norm": 8.698384138774378, "learning_rate": 6.822511625290184e-07, "loss": 0.0743, "num_input_tokens_seen": 17866752, "step": 4362 }, { "epoch": 3.251117734724292, "grad_norm": 7.780864921682274, "learning_rate": 6.817298302911576e-07, "loss": 0.1263, "num_input_tokens_seen": 17870848, "step": 4363 }, { "epoch": 3.2518628912071534, "grad_norm": 9.138568432067407, "learning_rate": 6.812086226221763e-07, "loss": 0.2117, "num_input_tokens_seen": 17874944, "step": 4364 }, { "epoch": 3.252608047690015, "grad_norm": 7.283404271966475, "learning_rate": 6.806875396363261e-07, "loss": 0.1068, "num_input_tokens_seen": 17879040, "step": 4365 }, { "epoch": 3.2533532041728765, "grad_norm": 7.599201662867671, "learning_rate": 6.801665814478326e-07, "loss": 0.2299, "num_input_tokens_seen": 17883136, "step": 4366 }, { "epoch": 3.2540983606557377, "grad_norm": 6.4954780655659095, "learning_rate": 6.796457481708932e-07, "loss": 0.0753, "num_input_tokens_seen": 17887232, "step": 4367 }, { "epoch": 3.254843517138599, "grad_norm": 6.367577889330535, "learning_rate": 6.791250399196789e-07, "loss": 0.0725, "num_input_tokens_seen": 17891328, "step": 4368 }, { "epoch": 3.2555886736214603, "grad_norm": 6.620185073556588, "learning_rate": 6.786044568083322e-07, "loss": 0.1134, "num_input_tokens_seen": 17895424, "step": 4369 }, { "epoch": 3.256333830104322, "grad_norm": 8.597734626697243, "learning_rate": 6.780839989509682e-07, "loss": 0.2378, "num_input_tokens_seen": 17899520, "step": 4370 }, { "epoch": 3.2570789865871834, "grad_norm": 8.848511568375654, "learning_rate": 6.775636664616763e-07, "loss": 0.1157, "num_input_tokens_seen": 17903616, "step": 4371 }, { "epoch": 3.2578241430700445, "grad_norm": 4.398684448743771, "learning_rate": 6.770434594545154e-07, "loss": 0.0282, "num_input_tokens_seen": 17907712, "step": 4372 }, { "epoch": 3.258569299552906, "grad_norm": 5.653443110725307, "learning_rate": 6.765233780435207e-07, "loss": 0.029, "num_input_tokens_seen": 17911808, "step": 4373 }, { "epoch": 3.2593144560357676, "grad_norm": 7.944725762022742, "learning_rate": 6.760034223426953e-07, "loss": 0.1361, "num_input_tokens_seen": 17915904, "step": 4374 }, { "epoch": 3.2600596125186287, "grad_norm": 8.657506038579584, "learning_rate": 6.754835924660191e-07, "loss": 0.1895, "num_input_tokens_seen": 17920000, "step": 4375 }, { "epoch": 3.2608047690014903, "grad_norm": 9.444946940365394, "learning_rate": 6.749638885274412e-07, "loss": 0.2109, "num_input_tokens_seen": 17924096, "step": 4376 }, { "epoch": 3.261549925484352, "grad_norm": 5.9479854294903065, "learning_rate": 6.744443106408857e-07, "loss": 0.0371, "num_input_tokens_seen": 17928192, "step": 4377 }, { "epoch": 3.262295081967213, "grad_norm": 9.294011126386666, "learning_rate": 6.739248589202465e-07, "loss": 0.2784, "num_input_tokens_seen": 17932288, "step": 4378 }, { "epoch": 3.2630402384500745, "grad_norm": 15.260143883560394, "learning_rate": 6.734055334793923e-07, "loss": 0.2262, "num_input_tokens_seen": 17936384, "step": 4379 }, { "epoch": 3.263785394932936, "grad_norm": 7.4648849493835305, "learning_rate": 6.728863344321626e-07, "loss": 0.0594, "num_input_tokens_seen": 17940480, "step": 4380 }, { "epoch": 3.264530551415797, "grad_norm": 9.57806106526495, "learning_rate": 6.723672618923687e-07, "loss": 0.081, "num_input_tokens_seen": 17944576, "step": 4381 }, { "epoch": 3.2652757078986587, "grad_norm": 12.811682739572966, "learning_rate": 6.718483159737962e-07, "loss": 0.1629, "num_input_tokens_seen": 17948672, "step": 4382 }, { "epoch": 3.2660208643815203, "grad_norm": 9.391088863999284, "learning_rate": 6.713294967902006e-07, "loss": 0.1836, "num_input_tokens_seen": 17952768, "step": 4383 }, { "epoch": 3.2667660208643814, "grad_norm": 8.296323444783745, "learning_rate": 6.70810804455312e-07, "loss": 0.2406, "num_input_tokens_seen": 17956864, "step": 4384 }, { "epoch": 3.267511177347243, "grad_norm": 10.402004604078572, "learning_rate": 6.702922390828304e-07, "loss": 0.1637, "num_input_tokens_seen": 17960960, "step": 4385 }, { "epoch": 3.2682563338301045, "grad_norm": 7.513427381151331, "learning_rate": 6.697738007864303e-07, "loss": 0.1131, "num_input_tokens_seen": 17965056, "step": 4386 }, { "epoch": 3.2690014903129656, "grad_norm": 10.72812403077397, "learning_rate": 6.692554896797554e-07, "loss": 0.0844, "num_input_tokens_seen": 17969152, "step": 4387 }, { "epoch": 3.269746646795827, "grad_norm": 11.353599551441649, "learning_rate": 6.687373058764243e-07, "loss": 0.1386, "num_input_tokens_seen": 17973248, "step": 4388 }, { "epoch": 3.2704918032786887, "grad_norm": 14.132241303111895, "learning_rate": 6.682192494900261e-07, "loss": 0.1358, "num_input_tokens_seen": 17977344, "step": 4389 }, { "epoch": 3.27123695976155, "grad_norm": 7.255902056908551, "learning_rate": 6.677013206341232e-07, "loss": 0.2675, "num_input_tokens_seen": 17981440, "step": 4390 }, { "epoch": 3.2719821162444114, "grad_norm": 8.008305213960362, "learning_rate": 6.671835194222487e-07, "loss": 0.1232, "num_input_tokens_seen": 17985536, "step": 4391 }, { "epoch": 3.2727272727272725, "grad_norm": 9.248020427851763, "learning_rate": 6.666658459679079e-07, "loss": 0.2738, "num_input_tokens_seen": 17989632, "step": 4392 }, { "epoch": 3.273472429210134, "grad_norm": 10.207448100724472, "learning_rate": 6.661483003845797e-07, "loss": 0.2393, "num_input_tokens_seen": 17993728, "step": 4393 }, { "epoch": 3.2742175856929956, "grad_norm": 11.481683963677977, "learning_rate": 6.656308827857125e-07, "loss": 0.3609, "num_input_tokens_seen": 17997824, "step": 4394 }, { "epoch": 3.274962742175857, "grad_norm": 8.639036717447086, "learning_rate": 6.65113593284729e-07, "loss": 0.1772, "num_input_tokens_seen": 18001920, "step": 4395 }, { "epoch": 3.2757078986587183, "grad_norm": 8.998504150477183, "learning_rate": 6.645964319950218e-07, "loss": 0.2402, "num_input_tokens_seen": 18006016, "step": 4396 }, { "epoch": 3.27645305514158, "grad_norm": 8.995631593902857, "learning_rate": 6.640793990299577e-07, "loss": 0.2405, "num_input_tokens_seen": 18010112, "step": 4397 }, { "epoch": 3.277198211624441, "grad_norm": 8.76516943747079, "learning_rate": 6.635624945028728e-07, "loss": 0.1258, "num_input_tokens_seen": 18014208, "step": 4398 }, { "epoch": 3.2779433681073025, "grad_norm": 8.998699400084242, "learning_rate": 6.630457185270763e-07, "loss": 0.123, "num_input_tokens_seen": 18018304, "step": 4399 }, { "epoch": 3.278688524590164, "grad_norm": 10.808117616870067, "learning_rate": 6.625290712158506e-07, "loss": 0.3894, "num_input_tokens_seen": 18022400, "step": 4400 }, { "epoch": 3.2794336810730256, "grad_norm": 6.809460071549558, "learning_rate": 6.620125526824462e-07, "loss": 0.0513, "num_input_tokens_seen": 18026496, "step": 4401 }, { "epoch": 3.2801788375558867, "grad_norm": 5.159604276905982, "learning_rate": 6.614961630400894e-07, "loss": 0.0461, "num_input_tokens_seen": 18030592, "step": 4402 }, { "epoch": 3.2809239940387482, "grad_norm": 9.65984764540333, "learning_rate": 6.609799024019755e-07, "loss": 0.1562, "num_input_tokens_seen": 18034688, "step": 4403 }, { "epoch": 3.2816691505216093, "grad_norm": 7.174737718258086, "learning_rate": 6.604637708812737e-07, "loss": 0.083, "num_input_tokens_seen": 18038784, "step": 4404 }, { "epoch": 3.282414307004471, "grad_norm": 8.959707804587113, "learning_rate": 6.59947768591122e-07, "loss": 0.2799, "num_input_tokens_seen": 18042880, "step": 4405 }, { "epoch": 3.2831594634873325, "grad_norm": 8.905678687455877, "learning_rate": 6.594318956446336e-07, "loss": 0.245, "num_input_tokens_seen": 18046976, "step": 4406 }, { "epoch": 3.2839046199701936, "grad_norm": 9.230333839769113, "learning_rate": 6.589161521548908e-07, "loss": 0.3338, "num_input_tokens_seen": 18051072, "step": 4407 }, { "epoch": 3.284649776453055, "grad_norm": 10.466308770393221, "learning_rate": 6.584005382349476e-07, "loss": 0.2385, "num_input_tokens_seen": 18055168, "step": 4408 }, { "epoch": 3.2853949329359167, "grad_norm": 8.152081448214808, "learning_rate": 6.578850539978313e-07, "loss": 0.1997, "num_input_tokens_seen": 18059264, "step": 4409 }, { "epoch": 3.2861400894187778, "grad_norm": 10.26628073059233, "learning_rate": 6.573696995565385e-07, "loss": 0.2064, "num_input_tokens_seen": 18063360, "step": 4410 }, { "epoch": 3.2868852459016393, "grad_norm": 9.411388059312669, "learning_rate": 6.568544750240401e-07, "loss": 0.2081, "num_input_tokens_seen": 18067456, "step": 4411 }, { "epoch": 3.287630402384501, "grad_norm": 10.1808909387957, "learning_rate": 6.563393805132756e-07, "loss": 0.2087, "num_input_tokens_seen": 18071552, "step": 4412 }, { "epoch": 3.288375558867362, "grad_norm": 10.105022203862292, "learning_rate": 6.558244161371591e-07, "loss": 0.0767, "num_input_tokens_seen": 18075648, "step": 4413 }, { "epoch": 3.2891207153502235, "grad_norm": 9.373171435899152, "learning_rate": 6.553095820085722e-07, "loss": 0.2642, "num_input_tokens_seen": 18079744, "step": 4414 }, { "epoch": 3.289865871833085, "grad_norm": 8.161353182224051, "learning_rate": 6.54794878240372e-07, "loss": 0.1187, "num_input_tokens_seen": 18083840, "step": 4415 }, { "epoch": 3.290611028315946, "grad_norm": 9.692297664825382, "learning_rate": 6.542803049453841e-07, "loss": 0.226, "num_input_tokens_seen": 18087936, "step": 4416 }, { "epoch": 3.2913561847988078, "grad_norm": 5.882851047906413, "learning_rate": 6.537658622364077e-07, "loss": 0.0875, "num_input_tokens_seen": 18092032, "step": 4417 }, { "epoch": 3.2921013412816693, "grad_norm": 7.332523288856817, "learning_rate": 6.532515502262119e-07, "loss": 0.08, "num_input_tokens_seen": 18096128, "step": 4418 }, { "epoch": 3.2928464977645304, "grad_norm": 9.358181022790658, "learning_rate": 6.527373690275368e-07, "loss": 0.2456, "num_input_tokens_seen": 18100224, "step": 4419 }, { "epoch": 3.293591654247392, "grad_norm": 6.9968788953663035, "learning_rate": 6.522233187530958e-07, "loss": 0.1148, "num_input_tokens_seen": 18104320, "step": 4420 }, { "epoch": 3.2943368107302535, "grad_norm": 6.479915770059971, "learning_rate": 6.517093995155713e-07, "loss": 0.121, "num_input_tokens_seen": 18108416, "step": 4421 }, { "epoch": 3.2950819672131146, "grad_norm": 6.567010673387363, "learning_rate": 6.511956114276192e-07, "loss": 0.0726, "num_input_tokens_seen": 18112512, "step": 4422 }, { "epoch": 3.295827123695976, "grad_norm": 6.918168585196209, "learning_rate": 6.506819546018642e-07, "loss": 0.0575, "num_input_tokens_seen": 18116608, "step": 4423 }, { "epoch": 3.2965722801788377, "grad_norm": 12.853490452623582, "learning_rate": 6.50168429150905e-07, "loss": 0.3577, "num_input_tokens_seen": 18120704, "step": 4424 }, { "epoch": 3.297317436661699, "grad_norm": 8.369944427040712, "learning_rate": 6.496550351873092e-07, "loss": 0.2668, "num_input_tokens_seen": 18124800, "step": 4425 }, { "epoch": 3.2980625931445604, "grad_norm": 7.744811636506915, "learning_rate": 6.491417728236161e-07, "loss": 0.0825, "num_input_tokens_seen": 18128896, "step": 4426 }, { "epoch": 3.2988077496274215, "grad_norm": 10.172496960395975, "learning_rate": 6.486286421723378e-07, "loss": 0.14, "num_input_tokens_seen": 18132992, "step": 4427 }, { "epoch": 3.299552906110283, "grad_norm": 12.501115543907996, "learning_rate": 6.481156433459543e-07, "loss": 0.2442, "num_input_tokens_seen": 18137088, "step": 4428 }, { "epoch": 3.3002980625931446, "grad_norm": 8.708858343541161, "learning_rate": 6.476027764569202e-07, "loss": 0.2154, "num_input_tokens_seen": 18141184, "step": 4429 }, { "epoch": 3.301043219076006, "grad_norm": 8.45523604055262, "learning_rate": 6.470900416176585e-07, "loss": 0.2175, "num_input_tokens_seen": 18145280, "step": 4430 }, { "epoch": 3.3017883755588673, "grad_norm": 7.747886872968852, "learning_rate": 6.465774389405655e-07, "loss": 0.2066, "num_input_tokens_seen": 18149376, "step": 4431 }, { "epoch": 3.302533532041729, "grad_norm": 10.372814984957747, "learning_rate": 6.460649685380059e-07, "loss": 0.1393, "num_input_tokens_seen": 18153472, "step": 4432 }, { "epoch": 3.30327868852459, "grad_norm": 9.62535181484405, "learning_rate": 6.455526305223183e-07, "loss": 0.1833, "num_input_tokens_seen": 18157568, "step": 4433 }, { "epoch": 3.3040238450074515, "grad_norm": 9.405336807438026, "learning_rate": 6.450404250058104e-07, "loss": 0.328, "num_input_tokens_seen": 18161664, "step": 4434 }, { "epoch": 3.304769001490313, "grad_norm": 9.975020686335808, "learning_rate": 6.445283521007604e-07, "loss": 0.2798, "num_input_tokens_seen": 18165760, "step": 4435 }, { "epoch": 3.3055141579731746, "grad_norm": 8.60078566339963, "learning_rate": 6.440164119194197e-07, "loss": 0.1779, "num_input_tokens_seen": 18169856, "step": 4436 }, { "epoch": 3.3062593144560357, "grad_norm": 5.072935934493815, "learning_rate": 6.43504604574008e-07, "loss": 0.0396, "num_input_tokens_seen": 18173952, "step": 4437 }, { "epoch": 3.3070044709388973, "grad_norm": 6.838819255587021, "learning_rate": 6.429929301767182e-07, "loss": 0.1087, "num_input_tokens_seen": 18178048, "step": 4438 }, { "epoch": 3.3077496274217584, "grad_norm": 9.258849893779436, "learning_rate": 6.424813888397122e-07, "loss": 0.2903, "num_input_tokens_seen": 18182144, "step": 4439 }, { "epoch": 3.30849478390462, "grad_norm": 10.150428579478874, "learning_rate": 6.419699806751248e-07, "loss": 0.19, "num_input_tokens_seen": 18186240, "step": 4440 }, { "epoch": 3.3092399403874815, "grad_norm": 9.72958020254857, "learning_rate": 6.414587057950583e-07, "loss": 0.3125, "num_input_tokens_seen": 18190336, "step": 4441 }, { "epoch": 3.3099850968703426, "grad_norm": 10.18630556986551, "learning_rate": 6.409475643115895e-07, "loss": 0.1334, "num_input_tokens_seen": 18194432, "step": 4442 }, { "epoch": 3.310730253353204, "grad_norm": 8.394841975019226, "learning_rate": 6.404365563367629e-07, "loss": 0.0672, "num_input_tokens_seen": 18198528, "step": 4443 }, { "epoch": 3.3114754098360657, "grad_norm": 9.760646348720378, "learning_rate": 6.399256819825965e-07, "loss": 0.1902, "num_input_tokens_seen": 18202624, "step": 4444 }, { "epoch": 3.312220566318927, "grad_norm": 23.094168471646537, "learning_rate": 6.394149413610769e-07, "loss": 0.249, "num_input_tokens_seen": 18206720, "step": 4445 }, { "epoch": 3.3129657228017884, "grad_norm": 7.89412865845402, "learning_rate": 6.389043345841617e-07, "loss": 0.1278, "num_input_tokens_seen": 18210816, "step": 4446 }, { "epoch": 3.31371087928465, "grad_norm": 6.462661665719605, "learning_rate": 6.383938617637803e-07, "loss": 0.0687, "num_input_tokens_seen": 18214912, "step": 4447 }, { "epoch": 3.314456035767511, "grad_norm": 10.5664024265236, "learning_rate": 6.378835230118312e-07, "loss": 0.1713, "num_input_tokens_seen": 18219008, "step": 4448 }, { "epoch": 3.3152011922503726, "grad_norm": 7.295090832015725, "learning_rate": 6.373733184401854e-07, "loss": 0.1259, "num_input_tokens_seen": 18223104, "step": 4449 }, { "epoch": 3.315946348733234, "grad_norm": 10.184631156904866, "learning_rate": 6.36863248160682e-07, "loss": 0.1932, "num_input_tokens_seen": 18227200, "step": 4450 }, { "epoch": 3.3166915052160952, "grad_norm": 9.808487190021957, "learning_rate": 6.363533122851334e-07, "loss": 0.3571, "num_input_tokens_seen": 18231296, "step": 4451 }, { "epoch": 3.317436661698957, "grad_norm": 9.903684110047275, "learning_rate": 6.358435109253206e-07, "loss": 0.3134, "num_input_tokens_seen": 18235392, "step": 4452 }, { "epoch": 3.3181818181818183, "grad_norm": 9.045476380479235, "learning_rate": 6.353338441929951e-07, "loss": 0.1382, "num_input_tokens_seen": 18239488, "step": 4453 }, { "epoch": 3.3189269746646795, "grad_norm": 9.76216835935292, "learning_rate": 6.348243121998812e-07, "loss": 0.1568, "num_input_tokens_seen": 18243584, "step": 4454 }, { "epoch": 3.319672131147541, "grad_norm": 11.035708787082338, "learning_rate": 6.343149150576697e-07, "loss": 0.2043, "num_input_tokens_seen": 18247680, "step": 4455 }, { "epoch": 3.3204172876304026, "grad_norm": 9.319543924584682, "learning_rate": 6.338056528780258e-07, "loss": 0.3589, "num_input_tokens_seen": 18251776, "step": 4456 }, { "epoch": 3.3211624441132637, "grad_norm": 9.19541001461836, "learning_rate": 6.332965257725822e-07, "loss": 0.1323, "num_input_tokens_seen": 18255872, "step": 4457 }, { "epoch": 3.321907600596125, "grad_norm": 6.621941996194614, "learning_rate": 6.327875338529444e-07, "loss": 0.0958, "num_input_tokens_seen": 18259968, "step": 4458 }, { "epoch": 3.3226527570789868, "grad_norm": 7.5446756406618185, "learning_rate": 6.322786772306859e-07, "loss": 0.1696, "num_input_tokens_seen": 18264064, "step": 4459 }, { "epoch": 3.323397913561848, "grad_norm": 11.779296120176555, "learning_rate": 6.31769956017353e-07, "loss": 0.142, "num_input_tokens_seen": 18268160, "step": 4460 }, { "epoch": 3.3241430700447094, "grad_norm": 9.163675329329433, "learning_rate": 6.312613703244599e-07, "loss": 0.1476, "num_input_tokens_seen": 18272256, "step": 4461 }, { "epoch": 3.3248882265275705, "grad_norm": 9.028449979890398, "learning_rate": 6.307529202634924e-07, "loss": 0.2689, "num_input_tokens_seen": 18276352, "step": 4462 }, { "epoch": 3.325633383010432, "grad_norm": 9.097417501970275, "learning_rate": 6.302446059459069e-07, "loss": 0.2441, "num_input_tokens_seen": 18280448, "step": 4463 }, { "epoch": 3.3263785394932937, "grad_norm": 9.298711712378033, "learning_rate": 6.297364274831288e-07, "loss": 0.3234, "num_input_tokens_seen": 18284544, "step": 4464 }, { "epoch": 3.327123695976155, "grad_norm": 7.012260444468888, "learning_rate": 6.292283849865555e-07, "loss": 0.1379, "num_input_tokens_seen": 18288640, "step": 4465 }, { "epoch": 3.3278688524590163, "grad_norm": 10.700208914785014, "learning_rate": 6.287204785675524e-07, "loss": 0.2704, "num_input_tokens_seen": 18292736, "step": 4466 }, { "epoch": 3.328614008941878, "grad_norm": 9.345841143483502, "learning_rate": 6.282127083374575e-07, "loss": 0.2455, "num_input_tokens_seen": 18296832, "step": 4467 }, { "epoch": 3.329359165424739, "grad_norm": 9.82683582243619, "learning_rate": 6.277050744075762e-07, "loss": 0.1337, "num_input_tokens_seen": 18300928, "step": 4468 }, { "epoch": 3.3301043219076005, "grad_norm": 7.774606670301699, "learning_rate": 6.271975768891865e-07, "loss": 0.1038, "num_input_tokens_seen": 18305024, "step": 4469 }, { "epoch": 3.330849478390462, "grad_norm": 11.71571496847373, "learning_rate": 6.266902158935347e-07, "loss": 0.1509, "num_input_tokens_seen": 18309120, "step": 4470 }, { "epoch": 3.3315946348733236, "grad_norm": 11.31716440246229, "learning_rate": 6.261829915318387e-07, "loss": 0.209, "num_input_tokens_seen": 18313216, "step": 4471 }, { "epoch": 3.3323397913561847, "grad_norm": 6.808389443259281, "learning_rate": 6.256759039152857e-07, "loss": 0.2235, "num_input_tokens_seen": 18317312, "step": 4472 }, { "epoch": 3.3330849478390463, "grad_norm": 8.842407435150626, "learning_rate": 6.25168953155032e-07, "loss": 0.1562, "num_input_tokens_seen": 18321408, "step": 4473 }, { "epoch": 3.3338301043219074, "grad_norm": 7.945442727360668, "learning_rate": 6.24662139362206e-07, "loss": 0.1884, "num_input_tokens_seen": 18325504, "step": 4474 }, { "epoch": 3.334575260804769, "grad_norm": 8.897347849094698, "learning_rate": 6.241554626479039e-07, "loss": 0.1655, "num_input_tokens_seen": 18329600, "step": 4475 }, { "epoch": 3.3353204172876305, "grad_norm": 8.129709743022994, "learning_rate": 6.236489231231937e-07, "loss": 0.1708, "num_input_tokens_seen": 18333696, "step": 4476 }, { "epoch": 3.3360655737704916, "grad_norm": 8.961376018800962, "learning_rate": 6.231425208991117e-07, "loss": 0.1189, "num_input_tokens_seen": 18337792, "step": 4477 }, { "epoch": 3.336810730253353, "grad_norm": 10.301041952023544, "learning_rate": 6.226362560866659e-07, "loss": 0.2112, "num_input_tokens_seen": 18341888, "step": 4478 }, { "epoch": 3.3375558867362147, "grad_norm": 5.636143841866988, "learning_rate": 6.221301287968322e-07, "loss": 0.0639, "num_input_tokens_seen": 18345984, "step": 4479 }, { "epoch": 3.338301043219076, "grad_norm": 6.224094773590771, "learning_rate": 6.216241391405574e-07, "loss": 0.0611, "num_input_tokens_seen": 18350080, "step": 4480 }, { "epoch": 3.3390461997019374, "grad_norm": 7.621678268398681, "learning_rate": 6.211182872287587e-07, "loss": 0.0741, "num_input_tokens_seen": 18354176, "step": 4481 }, { "epoch": 3.339791356184799, "grad_norm": 10.51874616738598, "learning_rate": 6.206125731723221e-07, "loss": 0.2098, "num_input_tokens_seen": 18358272, "step": 4482 }, { "epoch": 3.34053651266766, "grad_norm": 7.260911206449542, "learning_rate": 6.201069970821037e-07, "loss": 0.1264, "num_input_tokens_seen": 18362368, "step": 4483 }, { "epoch": 3.3412816691505216, "grad_norm": 7.452605451280535, "learning_rate": 6.196015590689289e-07, "loss": 0.3029, "num_input_tokens_seen": 18366464, "step": 4484 }, { "epoch": 3.342026825633383, "grad_norm": 9.011896971133789, "learning_rate": 6.190962592435942e-07, "loss": 0.2296, "num_input_tokens_seen": 18370560, "step": 4485 }, { "epoch": 3.3427719821162443, "grad_norm": 7.331506320137675, "learning_rate": 6.185910977168642e-07, "loss": 0.2156, "num_input_tokens_seen": 18374656, "step": 4486 }, { "epoch": 3.343517138599106, "grad_norm": 10.055403022126505, "learning_rate": 6.180860745994746e-07, "loss": 0.4046, "num_input_tokens_seen": 18378752, "step": 4487 }, { "epoch": 3.3442622950819674, "grad_norm": 14.079932061160129, "learning_rate": 6.175811900021293e-07, "loss": 0.1756, "num_input_tokens_seen": 18382848, "step": 4488 }, { "epoch": 3.3450074515648285, "grad_norm": 18.030951480512872, "learning_rate": 6.170764440355035e-07, "loss": 0.0682, "num_input_tokens_seen": 18386944, "step": 4489 }, { "epoch": 3.34575260804769, "grad_norm": 9.697053567107593, "learning_rate": 6.165718368102407e-07, "loss": 0.2264, "num_input_tokens_seen": 18391040, "step": 4490 }, { "epoch": 3.3464977645305516, "grad_norm": 7.489165040532798, "learning_rate": 6.160673684369538e-07, "loss": 0.0618, "num_input_tokens_seen": 18395136, "step": 4491 }, { "epoch": 3.3472429210134127, "grad_norm": 6.903522700987043, "learning_rate": 6.155630390262268e-07, "loss": 0.1036, "num_input_tokens_seen": 18399232, "step": 4492 }, { "epoch": 3.3479880774962743, "grad_norm": 8.136189769785776, "learning_rate": 6.150588486886115e-07, "loss": 0.2023, "num_input_tokens_seen": 18403328, "step": 4493 }, { "epoch": 3.348733233979136, "grad_norm": 10.580810358787215, "learning_rate": 6.145547975346315e-07, "loss": 0.2334, "num_input_tokens_seen": 18407424, "step": 4494 }, { "epoch": 3.349478390461997, "grad_norm": 10.704534792789161, "learning_rate": 6.140508856747763e-07, "loss": 0.3185, "num_input_tokens_seen": 18411520, "step": 4495 }, { "epoch": 3.3502235469448585, "grad_norm": 8.06920489806614, "learning_rate": 6.135471132195086e-07, "loss": 0.1147, "num_input_tokens_seen": 18415616, "step": 4496 }, { "epoch": 3.3509687034277196, "grad_norm": 10.549989917829102, "learning_rate": 6.13043480279258e-07, "loss": 0.3803, "num_input_tokens_seen": 18419712, "step": 4497 }, { "epoch": 3.351713859910581, "grad_norm": 7.2078475306581, "learning_rate": 6.125399869644251e-07, "loss": 0.0964, "num_input_tokens_seen": 18423808, "step": 4498 }, { "epoch": 3.3524590163934427, "grad_norm": 9.980677691455003, "learning_rate": 6.120366333853791e-07, "loss": 0.4207, "num_input_tokens_seen": 18427904, "step": 4499 }, { "epoch": 3.3532041728763042, "grad_norm": 8.059429759953625, "learning_rate": 6.115334196524578e-07, "loss": 0.2439, "num_input_tokens_seen": 18432000, "step": 4500 }, { "epoch": 3.3539493293591653, "grad_norm": 10.70026535975439, "learning_rate": 6.110303458759706e-07, "loss": 0.2999, "num_input_tokens_seen": 18436096, "step": 4501 }, { "epoch": 3.354694485842027, "grad_norm": 11.204662968132503, "learning_rate": 6.105274121661937e-07, "loss": 0.0654, "num_input_tokens_seen": 18440192, "step": 4502 }, { "epoch": 3.355439642324888, "grad_norm": 10.904016010837886, "learning_rate": 6.100246186333748e-07, "loss": 0.1507, "num_input_tokens_seen": 18444288, "step": 4503 }, { "epoch": 3.3561847988077496, "grad_norm": 15.007621941978314, "learning_rate": 6.095219653877289e-07, "loss": 0.1643, "num_input_tokens_seen": 18448384, "step": 4504 }, { "epoch": 3.356929955290611, "grad_norm": 7.990262053962973, "learning_rate": 6.09019452539442e-07, "loss": 0.0892, "num_input_tokens_seen": 18452480, "step": 4505 }, { "epoch": 3.3576751117734727, "grad_norm": 7.79392670017914, "learning_rate": 6.085170801986681e-07, "loss": 0.1394, "num_input_tokens_seen": 18456576, "step": 4506 }, { "epoch": 3.3584202682563338, "grad_norm": 7.843216334741153, "learning_rate": 6.080148484755305e-07, "loss": 0.1298, "num_input_tokens_seen": 18460672, "step": 4507 }, { "epoch": 3.3591654247391953, "grad_norm": 6.617090351013574, "learning_rate": 6.075127574801226e-07, "loss": 0.0489, "num_input_tokens_seen": 18464768, "step": 4508 }, { "epoch": 3.3599105812220564, "grad_norm": 7.117060886882593, "learning_rate": 6.070108073225062e-07, "loss": 0.1093, "num_input_tokens_seen": 18468864, "step": 4509 }, { "epoch": 3.360655737704918, "grad_norm": 9.910935764431736, "learning_rate": 6.06508998112712e-07, "loss": 0.1762, "num_input_tokens_seen": 18472960, "step": 4510 }, { "epoch": 3.3614008941877795, "grad_norm": 8.0944309121133, "learning_rate": 6.060073299607399e-07, "loss": 0.3333, "num_input_tokens_seen": 18477056, "step": 4511 }, { "epoch": 3.3621460506706407, "grad_norm": 8.70614379252057, "learning_rate": 6.055058029765602e-07, "loss": 0.1146, "num_input_tokens_seen": 18481152, "step": 4512 }, { "epoch": 3.362891207153502, "grad_norm": 7.5561940890952055, "learning_rate": 6.0500441727011e-07, "loss": 0.1966, "num_input_tokens_seen": 18485248, "step": 4513 }, { "epoch": 3.3636363636363638, "grad_norm": 8.226370544090512, "learning_rate": 6.045031729512978e-07, "loss": 0.1506, "num_input_tokens_seen": 18489344, "step": 4514 }, { "epoch": 3.364381520119225, "grad_norm": 9.56137537777161, "learning_rate": 6.040020701299989e-07, "loss": 0.2168, "num_input_tokens_seen": 18493440, "step": 4515 }, { "epoch": 3.3651266766020864, "grad_norm": 6.657959891592515, "learning_rate": 6.035011089160593e-07, "loss": 0.1479, "num_input_tokens_seen": 18497536, "step": 4516 }, { "epoch": 3.365871833084948, "grad_norm": 10.01726457144529, "learning_rate": 6.030002894192932e-07, "loss": 0.1597, "num_input_tokens_seen": 18501632, "step": 4517 }, { "epoch": 3.366616989567809, "grad_norm": 8.375615830604746, "learning_rate": 6.024996117494831e-07, "loss": 0.1298, "num_input_tokens_seen": 18505728, "step": 4518 }, { "epoch": 3.3673621460506706, "grad_norm": 7.399607433846807, "learning_rate": 6.019990760163823e-07, "loss": 0.1653, "num_input_tokens_seen": 18509824, "step": 4519 }, { "epoch": 3.368107302533532, "grad_norm": 9.693446451755058, "learning_rate": 6.014986823297106e-07, "loss": 0.2201, "num_input_tokens_seen": 18513920, "step": 4520 }, { "epoch": 3.3688524590163933, "grad_norm": 8.593506088943387, "learning_rate": 6.009984307991594e-07, "loss": 0.3117, "num_input_tokens_seen": 18518016, "step": 4521 }, { "epoch": 3.369597615499255, "grad_norm": 10.279446731013179, "learning_rate": 6.004983215343854e-07, "loss": 0.2084, "num_input_tokens_seen": 18522112, "step": 4522 }, { "epoch": 3.3703427719821164, "grad_norm": 7.457259278902373, "learning_rate": 5.99998354645018e-07, "loss": 0.1479, "num_input_tokens_seen": 18526208, "step": 4523 }, { "epoch": 3.3710879284649775, "grad_norm": 8.94160460824565, "learning_rate": 5.994985302406518e-07, "loss": 0.3165, "num_input_tokens_seen": 18530304, "step": 4524 }, { "epoch": 3.371833084947839, "grad_norm": 8.271810546117862, "learning_rate": 5.989988484308534e-07, "loss": 0.0966, "num_input_tokens_seen": 18534400, "step": 4525 }, { "epoch": 3.3725782414307006, "grad_norm": 9.40408797893109, "learning_rate": 5.98499309325156e-07, "loss": 0.1641, "num_input_tokens_seen": 18538496, "step": 4526 }, { "epoch": 3.3733233979135617, "grad_norm": 10.45908932989874, "learning_rate": 5.979999130330617e-07, "loss": 0.3218, "num_input_tokens_seen": 18542592, "step": 4527 }, { "epoch": 3.3740685543964233, "grad_norm": 6.895019590504847, "learning_rate": 5.975006596640425e-07, "loss": 0.1162, "num_input_tokens_seen": 18546688, "step": 4528 }, { "epoch": 3.374813710879285, "grad_norm": 7.978917515145689, "learning_rate": 5.970015493275375e-07, "loss": 0.1477, "num_input_tokens_seen": 18550784, "step": 4529 }, { "epoch": 3.375558867362146, "grad_norm": 4.0162895426373515, "learning_rate": 5.965025821329563e-07, "loss": 0.0205, "num_input_tokens_seen": 18554880, "step": 4530 }, { "epoch": 3.3763040238450075, "grad_norm": 5.750974520602587, "learning_rate": 5.960037581896751e-07, "loss": 0.0661, "num_input_tokens_seen": 18558976, "step": 4531 }, { "epoch": 3.3770491803278686, "grad_norm": 5.936473869239344, "learning_rate": 5.955050776070404e-07, "loss": 0.103, "num_input_tokens_seen": 18563072, "step": 4532 }, { "epoch": 3.37779433681073, "grad_norm": 8.136524436793493, "learning_rate": 5.950065404943661e-07, "loss": 0.106, "num_input_tokens_seen": 18567168, "step": 4533 }, { "epoch": 3.3785394932935917, "grad_norm": 11.261260015441037, "learning_rate": 5.945081469609354e-07, "loss": 0.2297, "num_input_tokens_seen": 18571264, "step": 4534 }, { "epoch": 3.3792846497764533, "grad_norm": 7.018137851603311, "learning_rate": 5.94009897115999e-07, "loss": 0.1373, "num_input_tokens_seen": 18575360, "step": 4535 }, { "epoch": 3.3800298062593144, "grad_norm": 7.994849192782836, "learning_rate": 5.935117910687776e-07, "loss": 0.1309, "num_input_tokens_seen": 18579456, "step": 4536 }, { "epoch": 3.380774962742176, "grad_norm": 8.635406145448695, "learning_rate": 5.930138289284596e-07, "loss": 0.2036, "num_input_tokens_seen": 18583552, "step": 4537 }, { "epoch": 3.381520119225037, "grad_norm": 9.73716540256254, "learning_rate": 5.925160108042009e-07, "loss": 0.0922, "num_input_tokens_seen": 18587648, "step": 4538 }, { "epoch": 3.3822652757078986, "grad_norm": 2.4888976523105057, "learning_rate": 5.920183368051281e-07, "loss": 0.0111, "num_input_tokens_seen": 18591744, "step": 4539 }, { "epoch": 3.38301043219076, "grad_norm": 5.7874474831962175, "learning_rate": 5.915208070403339e-07, "loss": 0.0701, "num_input_tokens_seen": 18595840, "step": 4540 }, { "epoch": 3.3837555886736217, "grad_norm": 10.021414065709157, "learning_rate": 5.910234216188811e-07, "loss": 0.3225, "num_input_tokens_seen": 18599936, "step": 4541 }, { "epoch": 3.384500745156483, "grad_norm": 11.078849469024478, "learning_rate": 5.905261806497993e-07, "loss": 0.2733, "num_input_tokens_seen": 18604032, "step": 4542 }, { "epoch": 3.3852459016393444, "grad_norm": 8.172965082854077, "learning_rate": 5.900290842420883e-07, "loss": 0.048, "num_input_tokens_seen": 18608128, "step": 4543 }, { "epoch": 3.3859910581222055, "grad_norm": 8.765693919701835, "learning_rate": 5.895321325047146e-07, "loss": 0.1014, "num_input_tokens_seen": 18612224, "step": 4544 }, { "epoch": 3.386736214605067, "grad_norm": 28.995732208393846, "learning_rate": 5.890353255466131e-07, "loss": 0.1949, "num_input_tokens_seen": 18616320, "step": 4545 }, { "epoch": 3.3874813710879286, "grad_norm": 8.106966370881533, "learning_rate": 5.885386634766886e-07, "loss": 0.1081, "num_input_tokens_seen": 18620416, "step": 4546 }, { "epoch": 3.3882265275707897, "grad_norm": 13.42352246615963, "learning_rate": 5.880421464038118e-07, "loss": 0.1044, "num_input_tokens_seen": 18624512, "step": 4547 }, { "epoch": 3.3889716840536512, "grad_norm": 7.864864516776419, "learning_rate": 5.875457744368242e-07, "loss": 0.1026, "num_input_tokens_seen": 18628608, "step": 4548 }, { "epoch": 3.389716840536513, "grad_norm": 9.304227949313555, "learning_rate": 5.870495476845321e-07, "loss": 0.2331, "num_input_tokens_seen": 18632704, "step": 4549 }, { "epoch": 3.390461997019374, "grad_norm": 10.266674087375904, "learning_rate": 5.865534662557137e-07, "loss": 0.2625, "num_input_tokens_seen": 18636800, "step": 4550 }, { "epoch": 3.3912071535022354, "grad_norm": 7.998250211347251, "learning_rate": 5.860575302591124e-07, "loss": 0.189, "num_input_tokens_seen": 18640896, "step": 4551 }, { "epoch": 3.391952309985097, "grad_norm": 8.972241126129862, "learning_rate": 5.855617398034418e-07, "loss": 0.1965, "num_input_tokens_seen": 18644992, "step": 4552 }, { "epoch": 3.392697466467958, "grad_norm": 11.510543525191832, "learning_rate": 5.850660949973825e-07, "loss": 0.4157, "num_input_tokens_seen": 18649088, "step": 4553 }, { "epoch": 3.3934426229508197, "grad_norm": 8.251226111668647, "learning_rate": 5.845705959495828e-07, "loss": 0.2536, "num_input_tokens_seen": 18653184, "step": 4554 }, { "epoch": 3.394187779433681, "grad_norm": 8.345429868650188, "learning_rate": 5.840752427686605e-07, "loss": 0.267, "num_input_tokens_seen": 18657280, "step": 4555 }, { "epoch": 3.3949329359165423, "grad_norm": 12.174070167784345, "learning_rate": 5.835800355631996e-07, "loss": 0.2778, "num_input_tokens_seen": 18661376, "step": 4556 }, { "epoch": 3.395678092399404, "grad_norm": 7.6166093481385575, "learning_rate": 5.830849744417539e-07, "loss": 0.2011, "num_input_tokens_seen": 18665472, "step": 4557 }, { "epoch": 3.3964232488822654, "grad_norm": 7.126583254029026, "learning_rate": 5.825900595128438e-07, "loss": 0.2435, "num_input_tokens_seen": 18669568, "step": 4558 }, { "epoch": 3.3971684053651265, "grad_norm": 9.415700804271696, "learning_rate": 5.820952908849586e-07, "loss": 0.3691, "num_input_tokens_seen": 18673664, "step": 4559 }, { "epoch": 3.397913561847988, "grad_norm": 7.754930050437144, "learning_rate": 5.816006686665547e-07, "loss": 0.1504, "num_input_tokens_seen": 18677760, "step": 4560 }, { "epoch": 3.3986587183308496, "grad_norm": 9.793684939033145, "learning_rate": 5.811061929660577e-07, "loss": 0.1018, "num_input_tokens_seen": 18681856, "step": 4561 }, { "epoch": 3.3994038748137108, "grad_norm": 9.210160398189412, "learning_rate": 5.806118638918588e-07, "loss": 0.2208, "num_input_tokens_seen": 18685952, "step": 4562 }, { "epoch": 3.4001490312965723, "grad_norm": 7.593354396247437, "learning_rate": 5.801176815523196e-07, "loss": 0.0945, "num_input_tokens_seen": 18690048, "step": 4563 }, { "epoch": 3.400894187779434, "grad_norm": 6.903162129601965, "learning_rate": 5.79623646055768e-07, "loss": 0.1476, "num_input_tokens_seen": 18694144, "step": 4564 }, { "epoch": 3.401639344262295, "grad_norm": 9.539657142771407, "learning_rate": 5.791297575104995e-07, "loss": 0.2297, "num_input_tokens_seen": 18698240, "step": 4565 }, { "epoch": 3.4023845007451565, "grad_norm": 12.070557293336924, "learning_rate": 5.786360160247792e-07, "loss": 0.149, "num_input_tokens_seen": 18702336, "step": 4566 }, { "epoch": 3.4031296572280176, "grad_norm": 9.498030289095787, "learning_rate": 5.781424217068376e-07, "loss": 0.1841, "num_input_tokens_seen": 18706432, "step": 4567 }, { "epoch": 3.403874813710879, "grad_norm": 6.732497289346011, "learning_rate": 5.776489746648752e-07, "loss": 0.0983, "num_input_tokens_seen": 18710528, "step": 4568 }, { "epoch": 3.4046199701937407, "grad_norm": 9.30479500366776, "learning_rate": 5.771556750070581e-07, "loss": 0.1182, "num_input_tokens_seen": 18714624, "step": 4569 }, { "epoch": 3.4053651266766023, "grad_norm": 10.430005479580814, "learning_rate": 5.766625228415221e-07, "loss": 0.2394, "num_input_tokens_seen": 18718720, "step": 4570 }, { "epoch": 3.4061102831594634, "grad_norm": 7.769635934054749, "learning_rate": 5.761695182763689e-07, "loss": 0.2344, "num_input_tokens_seen": 18722816, "step": 4571 }, { "epoch": 3.406855439642325, "grad_norm": 9.438601431170277, "learning_rate": 5.756766614196686e-07, "loss": 0.1882, "num_input_tokens_seen": 18726912, "step": 4572 }, { "epoch": 3.407600596125186, "grad_norm": 10.679312446962562, "learning_rate": 5.751839523794598e-07, "loss": 0.3477, "num_input_tokens_seen": 18731008, "step": 4573 }, { "epoch": 3.4083457526080476, "grad_norm": 9.859334239315915, "learning_rate": 5.746913912637466e-07, "loss": 0.1063, "num_input_tokens_seen": 18735104, "step": 4574 }, { "epoch": 3.409090909090909, "grad_norm": 8.027279197592318, "learning_rate": 5.741989781805035e-07, "loss": 0.1532, "num_input_tokens_seen": 18739200, "step": 4575 }, { "epoch": 3.4098360655737707, "grad_norm": 7.7861544829163565, "learning_rate": 5.737067132376692e-07, "loss": 0.2176, "num_input_tokens_seen": 18743296, "step": 4576 }, { "epoch": 3.410581222056632, "grad_norm": 7.862142617954279, "learning_rate": 5.73214596543153e-07, "loss": 0.1527, "num_input_tokens_seen": 18747392, "step": 4577 }, { "epoch": 3.4113263785394934, "grad_norm": 8.394624328932009, "learning_rate": 5.727226282048293e-07, "loss": 0.2357, "num_input_tokens_seen": 18751488, "step": 4578 }, { "epoch": 3.4120715350223545, "grad_norm": 4.942720038712838, "learning_rate": 5.722308083305423e-07, "loss": 0.067, "num_input_tokens_seen": 18755584, "step": 4579 }, { "epoch": 3.412816691505216, "grad_norm": 11.749705331834813, "learning_rate": 5.717391370281012e-07, "loss": 0.2218, "num_input_tokens_seen": 18759680, "step": 4580 }, { "epoch": 3.4135618479880776, "grad_norm": 8.88595730513741, "learning_rate": 5.71247614405285e-07, "loss": 0.158, "num_input_tokens_seen": 18763776, "step": 4581 }, { "epoch": 3.4143070044709387, "grad_norm": 10.17084995316244, "learning_rate": 5.707562405698385e-07, "loss": 0.1397, "num_input_tokens_seen": 18767872, "step": 4582 }, { "epoch": 3.4150521609538003, "grad_norm": 9.127598060567289, "learning_rate": 5.702650156294738e-07, "loss": 0.1519, "num_input_tokens_seen": 18771968, "step": 4583 }, { "epoch": 3.415797317436662, "grad_norm": 9.176305852081038, "learning_rate": 5.697739396918719e-07, "loss": 0.1341, "num_input_tokens_seen": 18776064, "step": 4584 }, { "epoch": 3.416542473919523, "grad_norm": 10.322482996802949, "learning_rate": 5.692830128646793e-07, "loss": 0.1856, "num_input_tokens_seen": 18780160, "step": 4585 }, { "epoch": 3.4172876304023845, "grad_norm": 10.086508769590177, "learning_rate": 5.687922352555115e-07, "loss": 0.2283, "num_input_tokens_seen": 18784256, "step": 4586 }, { "epoch": 3.418032786885246, "grad_norm": 11.111873601797445, "learning_rate": 5.683016069719494e-07, "loss": 0.1429, "num_input_tokens_seen": 18788352, "step": 4587 }, { "epoch": 3.418777943368107, "grad_norm": 9.457323802403865, "learning_rate": 5.678111281215437e-07, "loss": 0.1543, "num_input_tokens_seen": 18792448, "step": 4588 }, { "epoch": 3.4195230998509687, "grad_norm": 9.069658584055592, "learning_rate": 5.673207988118092e-07, "loss": 0.2579, "num_input_tokens_seen": 18796544, "step": 4589 }, { "epoch": 3.4202682563338302, "grad_norm": 13.701735270558396, "learning_rate": 5.668306191502307e-07, "loss": 0.278, "num_input_tokens_seen": 18800640, "step": 4590 }, { "epoch": 3.4210134128166914, "grad_norm": 10.990960401589202, "learning_rate": 5.663405892442587e-07, "loss": 0.3, "num_input_tokens_seen": 18804736, "step": 4591 }, { "epoch": 3.421758569299553, "grad_norm": 9.249369034706419, "learning_rate": 5.65850709201311e-07, "loss": 0.217, "num_input_tokens_seen": 18808832, "step": 4592 }, { "epoch": 3.4225037257824145, "grad_norm": 9.50944281127936, "learning_rate": 5.653609791287736e-07, "loss": 0.1173, "num_input_tokens_seen": 18812928, "step": 4593 }, { "epoch": 3.4232488822652756, "grad_norm": 9.383280487956823, "learning_rate": 5.648713991339976e-07, "loss": 0.1901, "num_input_tokens_seen": 18817024, "step": 4594 }, { "epoch": 3.423994038748137, "grad_norm": 7.358689532630065, "learning_rate": 5.64381969324304e-07, "loss": 0.145, "num_input_tokens_seen": 18821120, "step": 4595 }, { "epoch": 3.4247391952309987, "grad_norm": 8.389572508493563, "learning_rate": 5.638926898069777e-07, "loss": 0.0586, "num_input_tokens_seen": 18825216, "step": 4596 }, { "epoch": 3.42548435171386, "grad_norm": 10.136842318704385, "learning_rate": 5.634035606892737e-07, "loss": 0.1108, "num_input_tokens_seen": 18829312, "step": 4597 }, { "epoch": 3.4262295081967213, "grad_norm": 10.405369052424833, "learning_rate": 5.629145820784119e-07, "loss": 0.2871, "num_input_tokens_seen": 18833408, "step": 4598 }, { "epoch": 3.426974664679583, "grad_norm": 10.456561083736059, "learning_rate": 5.624257540815793e-07, "loss": 0.2638, "num_input_tokens_seen": 18837504, "step": 4599 }, { "epoch": 3.427719821162444, "grad_norm": 6.390122890029529, "learning_rate": 5.619370768059318e-07, "loss": 0.062, "num_input_tokens_seen": 18841600, "step": 4600 }, { "epoch": 3.4284649776453056, "grad_norm": 11.704281459237649, "learning_rate": 5.614485503585897e-07, "loss": 0.1389, "num_input_tokens_seen": 18845696, "step": 4601 }, { "epoch": 3.429210134128167, "grad_norm": 8.332751038135402, "learning_rate": 5.60960174846643e-07, "loss": 0.1134, "num_input_tokens_seen": 18849792, "step": 4602 }, { "epoch": 3.429955290611028, "grad_norm": 9.910271637114837, "learning_rate": 5.604719503771452e-07, "loss": 0.2102, "num_input_tokens_seen": 18853888, "step": 4603 }, { "epoch": 3.4307004470938898, "grad_norm": 10.345793663617533, "learning_rate": 5.599838770571201e-07, "loss": 0.1512, "num_input_tokens_seen": 18857984, "step": 4604 }, { "epoch": 3.4314456035767513, "grad_norm": 8.44697467013967, "learning_rate": 5.594959549935558e-07, "loss": 0.194, "num_input_tokens_seen": 18862080, "step": 4605 }, { "epoch": 3.4321907600596124, "grad_norm": 7.433924230684241, "learning_rate": 5.590081842934091e-07, "loss": 0.1305, "num_input_tokens_seen": 18866176, "step": 4606 }, { "epoch": 3.432935916542474, "grad_norm": 8.617493033662305, "learning_rate": 5.585205650636023e-07, "loss": 0.2546, "num_input_tokens_seen": 18870272, "step": 4607 }, { "epoch": 3.433681073025335, "grad_norm": 8.926238132753166, "learning_rate": 5.580330974110257e-07, "loss": 0.1402, "num_input_tokens_seen": 18874368, "step": 4608 }, { "epoch": 3.4344262295081966, "grad_norm": 12.224932387574748, "learning_rate": 5.575457814425351e-07, "loss": 0.239, "num_input_tokens_seen": 18878464, "step": 4609 }, { "epoch": 3.435171385991058, "grad_norm": 12.186096983628918, "learning_rate": 5.570586172649533e-07, "loss": 0.1426, "num_input_tokens_seen": 18882560, "step": 4610 }, { "epoch": 3.4359165424739198, "grad_norm": 6.70257084902185, "learning_rate": 5.565716049850713e-07, "loss": 0.0861, "num_input_tokens_seen": 18886656, "step": 4611 }, { "epoch": 3.436661698956781, "grad_norm": 8.903173216264094, "learning_rate": 5.560847447096444e-07, "loss": 0.2126, "num_input_tokens_seen": 18890752, "step": 4612 }, { "epoch": 3.4374068554396424, "grad_norm": 6.837097068940149, "learning_rate": 5.55598036545397e-07, "loss": 0.0906, "num_input_tokens_seen": 18894848, "step": 4613 }, { "epoch": 3.4381520119225035, "grad_norm": 9.373617441762322, "learning_rate": 5.551114805990179e-07, "loss": 0.4006, "num_input_tokens_seen": 18898944, "step": 4614 }, { "epoch": 3.438897168405365, "grad_norm": 8.510100336556219, "learning_rate": 5.546250769771651e-07, "loss": 0.1802, "num_input_tokens_seen": 18903040, "step": 4615 }, { "epoch": 3.4396423248882266, "grad_norm": 8.960782809444892, "learning_rate": 5.541388257864601e-07, "loss": 0.2791, "num_input_tokens_seen": 18907136, "step": 4616 }, { "epoch": 3.4403874813710877, "grad_norm": 7.499860340589308, "learning_rate": 5.536527271334936e-07, "loss": 0.1532, "num_input_tokens_seen": 18911232, "step": 4617 }, { "epoch": 3.4411326378539493, "grad_norm": 9.055502343645376, "learning_rate": 5.531667811248219e-07, "loss": 0.2651, "num_input_tokens_seen": 18915328, "step": 4618 }, { "epoch": 3.441877794336811, "grad_norm": 5.897421967281506, "learning_rate": 5.52680987866967e-07, "loss": 0.0357, "num_input_tokens_seen": 18919424, "step": 4619 }, { "epoch": 3.442622950819672, "grad_norm": 10.318995506325905, "learning_rate": 5.521953474664194e-07, "loss": 0.2369, "num_input_tokens_seen": 18923520, "step": 4620 }, { "epoch": 3.4433681073025335, "grad_norm": 14.785252758406747, "learning_rate": 5.517098600296339e-07, "loss": 0.0886, "num_input_tokens_seen": 18927616, "step": 4621 }, { "epoch": 3.444113263785395, "grad_norm": 9.5233199108149, "learning_rate": 5.512245256630338e-07, "loss": 0.147, "num_input_tokens_seen": 18931712, "step": 4622 }, { "epoch": 3.444858420268256, "grad_norm": 9.130010906923527, "learning_rate": 5.507393444730069e-07, "loss": 0.2004, "num_input_tokens_seen": 18935808, "step": 4623 }, { "epoch": 3.4456035767511177, "grad_norm": 8.357880017690048, "learning_rate": 5.502543165659094e-07, "loss": 0.1985, "num_input_tokens_seen": 18939904, "step": 4624 }, { "epoch": 3.4463487332339793, "grad_norm": 8.255364595560463, "learning_rate": 5.497694420480622e-07, "loss": 0.1709, "num_input_tokens_seen": 18944000, "step": 4625 }, { "epoch": 3.4470938897168404, "grad_norm": 17.42910827546028, "learning_rate": 5.49284721025753e-07, "loss": 0.3801, "num_input_tokens_seen": 18948096, "step": 4626 }, { "epoch": 3.447839046199702, "grad_norm": 9.256403558822571, "learning_rate": 5.48800153605237e-07, "loss": 0.1317, "num_input_tokens_seen": 18952192, "step": 4627 }, { "epoch": 3.4485842026825635, "grad_norm": 9.65956408222845, "learning_rate": 5.483157398927338e-07, "loss": 0.3177, "num_input_tokens_seen": 18956288, "step": 4628 }, { "epoch": 3.4493293591654246, "grad_norm": 6.942123631551188, "learning_rate": 5.47831479994432e-07, "loss": 0.1869, "num_input_tokens_seen": 18960384, "step": 4629 }, { "epoch": 3.450074515648286, "grad_norm": 5.912379028414779, "learning_rate": 5.473473740164827e-07, "loss": 0.0997, "num_input_tokens_seen": 18964480, "step": 4630 }, { "epoch": 3.4508196721311477, "grad_norm": 7.516537574410291, "learning_rate": 5.468634220650068e-07, "loss": 0.1905, "num_input_tokens_seen": 18968576, "step": 4631 }, { "epoch": 3.451564828614009, "grad_norm": 9.22362559530944, "learning_rate": 5.463796242460892e-07, "loss": 0.1103, "num_input_tokens_seen": 18972672, "step": 4632 }, { "epoch": 3.4523099850968704, "grad_norm": 6.739158231725976, "learning_rate": 5.458959806657827e-07, "loss": 0.1038, "num_input_tokens_seen": 18976768, "step": 4633 }, { "epoch": 3.453055141579732, "grad_norm": 6.032848124205592, "learning_rate": 5.454124914301046e-07, "loss": 0.0538, "num_input_tokens_seen": 18980864, "step": 4634 }, { "epoch": 3.453800298062593, "grad_norm": 10.002139800382272, "learning_rate": 5.449291566450399e-07, "loss": 0.1592, "num_input_tokens_seen": 18984960, "step": 4635 }, { "epoch": 3.4545454545454546, "grad_norm": 8.086680272950089, "learning_rate": 5.444459764165389e-07, "loss": 0.0718, "num_input_tokens_seen": 18989056, "step": 4636 }, { "epoch": 3.455290611028316, "grad_norm": 8.769985408524215, "learning_rate": 5.439629508505173e-07, "loss": 0.1187, "num_input_tokens_seen": 18993152, "step": 4637 }, { "epoch": 3.4560357675111772, "grad_norm": 8.543246156077569, "learning_rate": 5.43480080052859e-07, "loss": 0.1422, "num_input_tokens_seen": 18997248, "step": 4638 }, { "epoch": 3.456780923994039, "grad_norm": 7.934432650589062, "learning_rate": 5.429973641294117e-07, "loss": 0.2164, "num_input_tokens_seen": 19001344, "step": 4639 }, { "epoch": 3.4575260804769004, "grad_norm": 8.048366637489973, "learning_rate": 5.425148031859912e-07, "loss": 0.1851, "num_input_tokens_seen": 19005440, "step": 4640 }, { "epoch": 3.4582712369597615, "grad_norm": 9.091999613357462, "learning_rate": 5.420323973283772e-07, "loss": 0.1603, "num_input_tokens_seen": 19009536, "step": 4641 }, { "epoch": 3.459016393442623, "grad_norm": 5.627388914118687, "learning_rate": 5.41550146662318e-07, "loss": 0.0694, "num_input_tokens_seen": 19013632, "step": 4642 }, { "epoch": 3.459761549925484, "grad_norm": 10.377786825794475, "learning_rate": 5.410680512935245e-07, "loss": 0.357, "num_input_tokens_seen": 19017728, "step": 4643 }, { "epoch": 3.4605067064083457, "grad_norm": 11.606188493583637, "learning_rate": 5.405861113276767e-07, "loss": 0.1808, "num_input_tokens_seen": 19021824, "step": 4644 }, { "epoch": 3.4612518628912072, "grad_norm": 8.337064036166478, "learning_rate": 5.401043268704192e-07, "loss": 0.0908, "num_input_tokens_seen": 19025920, "step": 4645 }, { "epoch": 3.461997019374069, "grad_norm": 8.61998693739083, "learning_rate": 5.396226980273617e-07, "loss": 0.199, "num_input_tokens_seen": 19030016, "step": 4646 }, { "epoch": 3.46274217585693, "grad_norm": 10.549817121239965, "learning_rate": 5.391412249040819e-07, "loss": 0.1216, "num_input_tokens_seen": 19034112, "step": 4647 }, { "epoch": 3.4634873323397914, "grad_norm": 6.277747544174698, "learning_rate": 5.38659907606121e-07, "loss": 0.0372, "num_input_tokens_seen": 19038208, "step": 4648 }, { "epoch": 3.4642324888226526, "grad_norm": 11.236997387409106, "learning_rate": 5.381787462389883e-07, "loss": 0.3996, "num_input_tokens_seen": 19042304, "step": 4649 }, { "epoch": 3.464977645305514, "grad_norm": 6.61173665288644, "learning_rate": 5.376977409081569e-07, "loss": 0.0871, "num_input_tokens_seen": 19046400, "step": 4650 }, { "epoch": 3.4657228017883757, "grad_norm": 8.547313706234995, "learning_rate": 5.372168917190673e-07, "loss": 0.0834, "num_input_tokens_seen": 19050496, "step": 4651 }, { "epoch": 3.4664679582712368, "grad_norm": 10.180949845608078, "learning_rate": 5.367361987771245e-07, "loss": 0.1551, "num_input_tokens_seen": 19054592, "step": 4652 }, { "epoch": 3.4672131147540983, "grad_norm": 5.4925583694851134, "learning_rate": 5.362556621877004e-07, "loss": 0.0687, "num_input_tokens_seen": 19058688, "step": 4653 }, { "epoch": 3.46795827123696, "grad_norm": 13.718995577221852, "learning_rate": 5.357752820561318e-07, "loss": 0.0935, "num_input_tokens_seen": 19062784, "step": 4654 }, { "epoch": 3.468703427719821, "grad_norm": 9.651198264062263, "learning_rate": 5.352950584877209e-07, "loss": 0.131, "num_input_tokens_seen": 19066880, "step": 4655 }, { "epoch": 3.4694485842026825, "grad_norm": 6.252702718381287, "learning_rate": 5.348149915877376e-07, "loss": 0.1069, "num_input_tokens_seen": 19070976, "step": 4656 }, { "epoch": 3.470193740685544, "grad_norm": 9.837857204085656, "learning_rate": 5.34335081461414e-07, "loss": 0.3193, "num_input_tokens_seen": 19075072, "step": 4657 }, { "epoch": 3.470938897168405, "grad_norm": 9.76290401393444, "learning_rate": 5.338553282139513e-07, "loss": 0.2273, "num_input_tokens_seen": 19079168, "step": 4658 }, { "epoch": 3.4716840536512668, "grad_norm": 6.952620748977265, "learning_rate": 5.333757319505141e-07, "loss": 0.1255, "num_input_tokens_seen": 19083264, "step": 4659 }, { "epoch": 3.4724292101341283, "grad_norm": 7.910483130989027, "learning_rate": 5.32896292776234e-07, "loss": 0.1549, "num_input_tokens_seen": 19087360, "step": 4660 }, { "epoch": 3.4731743666169894, "grad_norm": 7.483920061996636, "learning_rate": 5.324170107962067e-07, "loss": 0.1038, "num_input_tokens_seen": 19091456, "step": 4661 }, { "epoch": 3.473919523099851, "grad_norm": 16.11158393663506, "learning_rate": 5.319378861154949e-07, "loss": 0.1689, "num_input_tokens_seen": 19095552, "step": 4662 }, { "epoch": 3.4746646795827125, "grad_norm": 9.900552841636959, "learning_rate": 5.31458918839126e-07, "loss": 0.1883, "num_input_tokens_seen": 19099648, "step": 4663 }, { "epoch": 3.4754098360655736, "grad_norm": 7.827939259999653, "learning_rate": 5.309801090720925e-07, "loss": 0.0892, "num_input_tokens_seen": 19103744, "step": 4664 }, { "epoch": 3.476154992548435, "grad_norm": 7.764250866457708, "learning_rate": 5.305014569193537e-07, "loss": 0.1199, "num_input_tokens_seen": 19107840, "step": 4665 }, { "epoch": 3.4769001490312967, "grad_norm": 8.976783351810122, "learning_rate": 5.300229624858329e-07, "loss": 0.168, "num_input_tokens_seen": 19111936, "step": 4666 }, { "epoch": 3.477645305514158, "grad_norm": 9.451286047476383, "learning_rate": 5.295446258764202e-07, "loss": 0.1003, "num_input_tokens_seen": 19116032, "step": 4667 }, { "epoch": 3.4783904619970194, "grad_norm": 8.743956560118194, "learning_rate": 5.290664471959696e-07, "loss": 0.1079, "num_input_tokens_seen": 19120128, "step": 4668 }, { "epoch": 3.479135618479881, "grad_norm": 9.552509546794207, "learning_rate": 5.285884265493025e-07, "loss": 0.3514, "num_input_tokens_seen": 19124224, "step": 4669 }, { "epoch": 3.479880774962742, "grad_norm": 8.08277482405861, "learning_rate": 5.281105640412029e-07, "loss": 0.2684, "num_input_tokens_seen": 19128320, "step": 4670 }, { "epoch": 3.4806259314456036, "grad_norm": 6.687627567486922, "learning_rate": 5.276328597764227e-07, "loss": 0.0798, "num_input_tokens_seen": 19132416, "step": 4671 }, { "epoch": 3.481371087928465, "grad_norm": 7.764278511402871, "learning_rate": 5.271553138596775e-07, "loss": 0.1858, "num_input_tokens_seen": 19136512, "step": 4672 }, { "epoch": 3.4821162444113263, "grad_norm": 7.746366310191305, "learning_rate": 5.266779263956496e-07, "loss": 0.1743, "num_input_tokens_seen": 19140608, "step": 4673 }, { "epoch": 3.482861400894188, "grad_norm": 12.246869885207397, "learning_rate": 5.26200697488985e-07, "loss": 0.262, "num_input_tokens_seen": 19144704, "step": 4674 }, { "epoch": 3.4836065573770494, "grad_norm": 8.613222809749534, "learning_rate": 5.257236272442955e-07, "loss": 0.2106, "num_input_tokens_seen": 19148800, "step": 4675 }, { "epoch": 3.4843517138599105, "grad_norm": 10.9782497647538, "learning_rate": 5.252467157661591e-07, "loss": 0.1812, "num_input_tokens_seen": 19152896, "step": 4676 }, { "epoch": 3.485096870342772, "grad_norm": 8.068788839995145, "learning_rate": 5.247699631591174e-07, "loss": 0.0627, "num_input_tokens_seen": 19156992, "step": 4677 }, { "epoch": 3.485842026825633, "grad_norm": 9.5149085629526, "learning_rate": 5.242933695276789e-07, "loss": 0.1274, "num_input_tokens_seen": 19161088, "step": 4678 }, { "epoch": 3.4865871833084947, "grad_norm": 10.145003131715454, "learning_rate": 5.238169349763154e-07, "loss": 0.2799, "num_input_tokens_seen": 19165184, "step": 4679 }, { "epoch": 3.4873323397913563, "grad_norm": 7.8638703877624, "learning_rate": 5.233406596094654e-07, "loss": 0.0443, "num_input_tokens_seen": 19169280, "step": 4680 }, { "epoch": 3.488077496274218, "grad_norm": 9.944466681205117, "learning_rate": 5.228645435315318e-07, "loss": 0.232, "num_input_tokens_seen": 19173376, "step": 4681 }, { "epoch": 3.488822652757079, "grad_norm": 9.985986631326375, "learning_rate": 5.22388586846882e-07, "loss": 0.2109, "num_input_tokens_seen": 19177472, "step": 4682 }, { "epoch": 3.4895678092399405, "grad_norm": 7.375010911673733, "learning_rate": 5.219127896598504e-07, "loss": 0.1078, "num_input_tokens_seen": 19181568, "step": 4683 }, { "epoch": 3.4903129657228016, "grad_norm": 8.029293797245304, "learning_rate": 5.214371520747336e-07, "loss": 0.1097, "num_input_tokens_seen": 19185664, "step": 4684 }, { "epoch": 3.491058122205663, "grad_norm": 10.007464084903187, "learning_rate": 5.209616741957956e-07, "loss": 0.2674, "num_input_tokens_seen": 19189760, "step": 4685 }, { "epoch": 3.4918032786885247, "grad_norm": 7.570683546350049, "learning_rate": 5.204863561272643e-07, "loss": 0.1045, "num_input_tokens_seen": 19193856, "step": 4686 }, { "epoch": 3.492548435171386, "grad_norm": 7.193660387572928, "learning_rate": 5.200111979733331e-07, "loss": 0.1266, "num_input_tokens_seen": 19197952, "step": 4687 }, { "epoch": 3.4932935916542474, "grad_norm": 8.481607027117446, "learning_rate": 5.195361998381597e-07, "loss": 0.2119, "num_input_tokens_seen": 19202048, "step": 4688 }, { "epoch": 3.494038748137109, "grad_norm": 7.2055101666926475, "learning_rate": 5.190613618258675e-07, "loss": 0.1198, "num_input_tokens_seen": 19206144, "step": 4689 }, { "epoch": 3.49478390461997, "grad_norm": 18.818533583044783, "learning_rate": 5.185866840405443e-07, "loss": 0.1619, "num_input_tokens_seen": 19210240, "step": 4690 }, { "epoch": 3.4955290611028316, "grad_norm": 10.007045801026193, "learning_rate": 5.181121665862422e-07, "loss": 0.2839, "num_input_tokens_seen": 19214336, "step": 4691 }, { "epoch": 3.496274217585693, "grad_norm": 9.394532712986521, "learning_rate": 5.176378095669797e-07, "loss": 0.1924, "num_input_tokens_seen": 19218432, "step": 4692 }, { "epoch": 3.4970193740685542, "grad_norm": 9.431062129328582, "learning_rate": 5.171636130867385e-07, "loss": 0.2082, "num_input_tokens_seen": 19222528, "step": 4693 }, { "epoch": 3.497764530551416, "grad_norm": 10.244140172668144, "learning_rate": 5.166895772494668e-07, "loss": 0.3694, "num_input_tokens_seen": 19226624, "step": 4694 }, { "epoch": 3.4985096870342773, "grad_norm": 8.682341356301635, "learning_rate": 5.162157021590756e-07, "loss": 0.1471, "num_input_tokens_seen": 19230720, "step": 4695 }, { "epoch": 3.4992548435171384, "grad_norm": 7.550403655751304, "learning_rate": 5.15741987919443e-07, "loss": 0.1014, "num_input_tokens_seen": 19234816, "step": 4696 }, { "epoch": 3.5, "grad_norm": 8.782232541099718, "learning_rate": 5.152684346344087e-07, "loss": 0.1547, "num_input_tokens_seen": 19238912, "step": 4697 }, { "epoch": 3.5007451564828616, "grad_norm": 6.718158008397983, "learning_rate": 5.147950424077804e-07, "loss": 0.0738, "num_input_tokens_seen": 19243008, "step": 4698 }, { "epoch": 3.5014903129657227, "grad_norm": 8.02433810110172, "learning_rate": 5.143218113433285e-07, "loss": 0.1422, "num_input_tokens_seen": 19247104, "step": 4699 }, { "epoch": 3.502235469448584, "grad_norm": 10.33086249698013, "learning_rate": 5.13848741544789e-07, "loss": 0.1286, "num_input_tokens_seen": 19251200, "step": 4700 }, { "epoch": 3.5029806259314458, "grad_norm": 11.222945794310535, "learning_rate": 5.13375833115862e-07, "loss": 0.2259, "num_input_tokens_seen": 19255296, "step": 4701 }, { "epoch": 3.503725782414307, "grad_norm": 8.998259202265968, "learning_rate": 5.129030861602119e-07, "loss": 0.369, "num_input_tokens_seen": 19259392, "step": 4702 }, { "epoch": 3.5044709388971684, "grad_norm": 11.068359126678757, "learning_rate": 5.124305007814691e-07, "loss": 0.2039, "num_input_tokens_seen": 19263488, "step": 4703 }, { "epoch": 3.50521609538003, "grad_norm": 7.65005870058616, "learning_rate": 5.119580770832268e-07, "loss": 0.1903, "num_input_tokens_seen": 19267584, "step": 4704 }, { "epoch": 3.505961251862891, "grad_norm": 10.155679210014382, "learning_rate": 5.114858151690445e-07, "loss": 0.2363, "num_input_tokens_seen": 19271680, "step": 4705 }, { "epoch": 3.5067064083457526, "grad_norm": 10.621498010672282, "learning_rate": 5.110137151424446e-07, "loss": 0.2495, "num_input_tokens_seen": 19275776, "step": 4706 }, { "epoch": 3.5074515648286138, "grad_norm": 8.0390561197594, "learning_rate": 5.105417771069156e-07, "loss": 0.0818, "num_input_tokens_seen": 19279872, "step": 4707 }, { "epoch": 3.5081967213114753, "grad_norm": 10.002063963545986, "learning_rate": 5.100700011659092e-07, "loss": 0.2074, "num_input_tokens_seen": 19283968, "step": 4708 }, { "epoch": 3.508941877794337, "grad_norm": 5.510990577290341, "learning_rate": 5.095983874228417e-07, "loss": 0.0482, "num_input_tokens_seen": 19288064, "step": 4709 }, { "epoch": 3.5096870342771984, "grad_norm": 7.968607647517852, "learning_rate": 5.091269359810956e-07, "loss": 0.1773, "num_input_tokens_seen": 19292160, "step": 4710 }, { "epoch": 3.5104321907600595, "grad_norm": 9.093501704994102, "learning_rate": 5.086556469440144e-07, "loss": 0.1397, "num_input_tokens_seen": 19296256, "step": 4711 }, { "epoch": 3.511177347242921, "grad_norm": 8.721032287036788, "learning_rate": 5.081845204149096e-07, "loss": 0.059, "num_input_tokens_seen": 19300352, "step": 4712 }, { "epoch": 3.511922503725782, "grad_norm": 6.688087931546832, "learning_rate": 5.077135564970545e-07, "loss": 0.1305, "num_input_tokens_seen": 19304448, "step": 4713 }, { "epoch": 3.5126676602086437, "grad_norm": 8.856455315740456, "learning_rate": 5.072427552936885e-07, "loss": 0.189, "num_input_tokens_seen": 19308544, "step": 4714 }, { "epoch": 3.5134128166915053, "grad_norm": 9.003422570624453, "learning_rate": 5.067721169080142e-07, "loss": 0.32, "num_input_tokens_seen": 19312640, "step": 4715 }, { "epoch": 3.514157973174367, "grad_norm": 11.055455267061815, "learning_rate": 5.063016414431991e-07, "loss": 0.2394, "num_input_tokens_seen": 19316736, "step": 4716 }, { "epoch": 3.514903129657228, "grad_norm": 7.424718743545939, "learning_rate": 5.058313290023749e-07, "loss": 0.1695, "num_input_tokens_seen": 19320832, "step": 4717 }, { "epoch": 3.5156482861400895, "grad_norm": 11.77475192521754, "learning_rate": 5.053611796886367e-07, "loss": 0.2709, "num_input_tokens_seen": 19324928, "step": 4718 }, { "epoch": 3.5163934426229506, "grad_norm": 8.974456099133729, "learning_rate": 5.048911936050456e-07, "loss": 0.2141, "num_input_tokens_seen": 19329024, "step": 4719 }, { "epoch": 3.517138599105812, "grad_norm": 10.523780778455722, "learning_rate": 5.044213708546249e-07, "loss": 0.1784, "num_input_tokens_seen": 19333120, "step": 4720 }, { "epoch": 3.5178837555886737, "grad_norm": 12.067743766888148, "learning_rate": 5.039517115403641e-07, "loss": 0.2008, "num_input_tokens_seen": 19337216, "step": 4721 }, { "epoch": 3.5186289120715353, "grad_norm": 7.477437198000957, "learning_rate": 5.034822157652149e-07, "loss": 0.0959, "num_input_tokens_seen": 19341312, "step": 4722 }, { "epoch": 3.5193740685543964, "grad_norm": 8.380651760037214, "learning_rate": 5.030128836320955e-07, "loss": 0.1612, "num_input_tokens_seen": 19345408, "step": 4723 }, { "epoch": 3.520119225037258, "grad_norm": 11.25128834332576, "learning_rate": 5.025437152438852e-07, "loss": 0.0638, "num_input_tokens_seen": 19349504, "step": 4724 }, { "epoch": 3.520864381520119, "grad_norm": 8.861464173041945, "learning_rate": 5.020747107034301e-07, "loss": 0.0688, "num_input_tokens_seen": 19353600, "step": 4725 }, { "epoch": 3.5216095380029806, "grad_norm": 7.522457287962056, "learning_rate": 5.016058701135387e-07, "loss": 0.083, "num_input_tokens_seen": 19357696, "step": 4726 }, { "epoch": 3.522354694485842, "grad_norm": 11.239915786234333, "learning_rate": 5.011371935769852e-07, "loss": 0.2916, "num_input_tokens_seen": 19361792, "step": 4727 }, { "epoch": 3.5230998509687033, "grad_norm": 8.76433031976118, "learning_rate": 5.00668681196506e-07, "loss": 0.1869, "num_input_tokens_seen": 19365888, "step": 4728 }, { "epoch": 3.523845007451565, "grad_norm": 6.1278270113004645, "learning_rate": 5.002003330748024e-07, "loss": 0.0564, "num_input_tokens_seen": 19369984, "step": 4729 }, { "epoch": 3.5245901639344264, "grad_norm": 6.64335992734452, "learning_rate": 4.997321493145399e-07, "loss": 0.0968, "num_input_tokens_seen": 19374080, "step": 4730 }, { "epoch": 3.5253353204172875, "grad_norm": 8.851499942829589, "learning_rate": 4.992641300183475e-07, "loss": 0.1903, "num_input_tokens_seen": 19378176, "step": 4731 }, { "epoch": 3.526080476900149, "grad_norm": 7.879820700966898, "learning_rate": 4.987962752888188e-07, "loss": 0.0337, "num_input_tokens_seen": 19382272, "step": 4732 }, { "epoch": 3.5268256333830106, "grad_norm": 9.201856249716863, "learning_rate": 4.983285852285104e-07, "loss": 0.2049, "num_input_tokens_seen": 19386368, "step": 4733 }, { "epoch": 3.5275707898658717, "grad_norm": 9.284431993776412, "learning_rate": 4.978610599399439e-07, "loss": 0.1467, "num_input_tokens_seen": 19390464, "step": 4734 }, { "epoch": 3.5283159463487332, "grad_norm": 7.56896453931242, "learning_rate": 4.973936995256038e-07, "loss": 0.1386, "num_input_tokens_seen": 19394560, "step": 4735 }, { "epoch": 3.529061102831595, "grad_norm": 9.882902411915891, "learning_rate": 4.969265040879383e-07, "loss": 0.408, "num_input_tokens_seen": 19398656, "step": 4736 }, { "epoch": 3.529806259314456, "grad_norm": 8.960882769605362, "learning_rate": 4.964594737293611e-07, "loss": 0.3111, "num_input_tokens_seen": 19402752, "step": 4737 }, { "epoch": 3.5305514157973175, "grad_norm": 9.65473108381743, "learning_rate": 4.959926085522479e-07, "loss": 0.1156, "num_input_tokens_seen": 19406848, "step": 4738 }, { "epoch": 3.531296572280179, "grad_norm": 6.679515197572964, "learning_rate": 4.955259086589391e-07, "loss": 0.0942, "num_input_tokens_seen": 19410944, "step": 4739 }, { "epoch": 3.53204172876304, "grad_norm": 10.049973034847104, "learning_rate": 4.950593741517382e-07, "loss": 0.3264, "num_input_tokens_seen": 19415040, "step": 4740 }, { "epoch": 3.5327868852459017, "grad_norm": 10.57452489902392, "learning_rate": 4.945930051329134e-07, "loss": 0.2718, "num_input_tokens_seen": 19419136, "step": 4741 }, { "epoch": 3.533532041728763, "grad_norm": 8.682031279199927, "learning_rate": 4.941268017046957e-07, "loss": 0.1273, "num_input_tokens_seen": 19423232, "step": 4742 }, { "epoch": 3.5342771982116243, "grad_norm": 12.160328957379399, "learning_rate": 4.93660763969281e-07, "loss": 0.1352, "num_input_tokens_seen": 19427328, "step": 4743 }, { "epoch": 3.535022354694486, "grad_norm": 7.148829715637345, "learning_rate": 4.931948920288271e-07, "loss": 0.1429, "num_input_tokens_seen": 19431424, "step": 4744 }, { "epoch": 3.5357675111773474, "grad_norm": 8.482147671780789, "learning_rate": 4.927291859854573e-07, "loss": 0.1185, "num_input_tokens_seen": 19435520, "step": 4745 }, { "epoch": 3.5365126676602086, "grad_norm": 8.117351014596126, "learning_rate": 4.922636459412573e-07, "loss": 0.2191, "num_input_tokens_seen": 19439616, "step": 4746 }, { "epoch": 3.53725782414307, "grad_norm": 7.39586232614206, "learning_rate": 4.917982719982766e-07, "loss": 0.1689, "num_input_tokens_seen": 19443712, "step": 4747 }, { "epoch": 3.538002980625931, "grad_norm": 8.038063676061883, "learning_rate": 4.913330642585289e-07, "loss": 0.1231, "num_input_tokens_seen": 19447808, "step": 4748 }, { "epoch": 3.5387481371087928, "grad_norm": 9.237308625540628, "learning_rate": 4.908680228239906e-07, "loss": 0.2521, "num_input_tokens_seen": 19451904, "step": 4749 }, { "epoch": 3.5394932935916543, "grad_norm": 8.366131839401588, "learning_rate": 4.904031477966031e-07, "loss": 0.1053, "num_input_tokens_seen": 19456000, "step": 4750 }, { "epoch": 3.540238450074516, "grad_norm": 8.479675056561968, "learning_rate": 4.89938439278269e-07, "loss": 0.1175, "num_input_tokens_seen": 19460096, "step": 4751 }, { "epoch": 3.540983606557377, "grad_norm": 8.925389732449238, "learning_rate": 4.894738973708565e-07, "loss": 0.2999, "num_input_tokens_seen": 19464192, "step": 4752 }, { "epoch": 3.5417287630402385, "grad_norm": 8.167592650261899, "learning_rate": 4.89009522176196e-07, "loss": 0.2203, "num_input_tokens_seen": 19468288, "step": 4753 }, { "epoch": 3.5424739195230996, "grad_norm": 8.556058317395992, "learning_rate": 4.885453137960828e-07, "loss": 0.1467, "num_input_tokens_seen": 19472384, "step": 4754 }, { "epoch": 3.543219076005961, "grad_norm": 8.355789132650214, "learning_rate": 4.88081272332274e-07, "loss": 0.2189, "num_input_tokens_seen": 19476480, "step": 4755 }, { "epoch": 3.5439642324888228, "grad_norm": 7.966330912924135, "learning_rate": 4.876173978864903e-07, "loss": 0.2644, "num_input_tokens_seen": 19480576, "step": 4756 }, { "epoch": 3.5447093889716843, "grad_norm": 7.5448277457701955, "learning_rate": 4.871536905604174e-07, "loss": 0.2002, "num_input_tokens_seen": 19484672, "step": 4757 }, { "epoch": 3.5454545454545454, "grad_norm": 10.846453291918769, "learning_rate": 4.866901504557022e-07, "loss": 0.2604, "num_input_tokens_seen": 19488768, "step": 4758 }, { "epoch": 3.546199701937407, "grad_norm": 7.1532693332395025, "learning_rate": 4.862267776739573e-07, "loss": 0.0772, "num_input_tokens_seen": 19492864, "step": 4759 }, { "epoch": 3.546944858420268, "grad_norm": 7.246849025404237, "learning_rate": 4.857635723167559e-07, "loss": 0.1083, "num_input_tokens_seen": 19496960, "step": 4760 }, { "epoch": 3.5476900149031296, "grad_norm": 9.406117350643719, "learning_rate": 4.85300534485637e-07, "loss": 0.2459, "num_input_tokens_seen": 19501056, "step": 4761 }, { "epoch": 3.548435171385991, "grad_norm": 12.793737476464408, "learning_rate": 4.848376642821014e-07, "loss": 0.1779, "num_input_tokens_seen": 19505152, "step": 4762 }, { "epoch": 3.5491803278688527, "grad_norm": 7.51671789125209, "learning_rate": 4.843749618076131e-07, "loss": 0.082, "num_input_tokens_seen": 19509248, "step": 4763 }, { "epoch": 3.549925484351714, "grad_norm": 7.381238839860052, "learning_rate": 4.839124271636007e-07, "loss": 0.0735, "num_input_tokens_seen": 19513344, "step": 4764 }, { "epoch": 3.5506706408345754, "grad_norm": 11.628299166811951, "learning_rate": 4.834500604514546e-07, "loss": 0.4204, "num_input_tokens_seen": 19517440, "step": 4765 }, { "epoch": 3.5514157973174365, "grad_norm": 10.620241091159576, "learning_rate": 4.82987861772529e-07, "loss": 0.2371, "num_input_tokens_seen": 19521536, "step": 4766 }, { "epoch": 3.552160953800298, "grad_norm": 10.616293481850851, "learning_rate": 4.825258312281404e-07, "loss": 0.1705, "num_input_tokens_seen": 19525632, "step": 4767 }, { "epoch": 3.5529061102831596, "grad_norm": 22.31476199256083, "learning_rate": 4.820639689195704e-07, "loss": 0.3224, "num_input_tokens_seen": 19529728, "step": 4768 }, { "epoch": 3.5536512667660207, "grad_norm": 13.13992581751695, "learning_rate": 4.816022749480616e-07, "loss": 0.2185, "num_input_tokens_seen": 19533824, "step": 4769 }, { "epoch": 3.5543964232488823, "grad_norm": 11.367241917211123, "learning_rate": 4.811407494148214e-07, "loss": 0.1316, "num_input_tokens_seen": 19537920, "step": 4770 }, { "epoch": 3.555141579731744, "grad_norm": 9.510051488167719, "learning_rate": 4.806793924210186e-07, "loss": 0.1487, "num_input_tokens_seen": 19542016, "step": 4771 }, { "epoch": 3.555886736214605, "grad_norm": 9.49798309849507, "learning_rate": 4.802182040677868e-07, "loss": 0.1639, "num_input_tokens_seen": 19546112, "step": 4772 }, { "epoch": 3.5566318926974665, "grad_norm": 7.9461007601175, "learning_rate": 4.797571844562214e-07, "loss": 0.1886, "num_input_tokens_seen": 19550208, "step": 4773 }, { "epoch": 3.557377049180328, "grad_norm": 9.240944771482983, "learning_rate": 4.792963336873808e-07, "loss": 0.1971, "num_input_tokens_seen": 19554304, "step": 4774 }, { "epoch": 3.558122205663189, "grad_norm": 8.521167396193531, "learning_rate": 4.788356518622877e-07, "loss": 0.1745, "num_input_tokens_seen": 19558400, "step": 4775 }, { "epoch": 3.5588673621460507, "grad_norm": 7.7230734743162754, "learning_rate": 4.783751390819257e-07, "loss": 0.1072, "num_input_tokens_seen": 19562496, "step": 4776 }, { "epoch": 3.559612518628912, "grad_norm": 7.527036813507954, "learning_rate": 4.779147954472442e-07, "loss": 0.1308, "num_input_tokens_seen": 19566592, "step": 4777 }, { "epoch": 3.5603576751117734, "grad_norm": 7.001906930308481, "learning_rate": 4.774546210591517e-07, "loss": 0.0899, "num_input_tokens_seen": 19570688, "step": 4778 }, { "epoch": 3.561102831594635, "grad_norm": 10.825359755678733, "learning_rate": 4.769946160185232e-07, "loss": 0.3379, "num_input_tokens_seen": 19574784, "step": 4779 }, { "epoch": 3.5618479880774965, "grad_norm": 9.458415204838538, "learning_rate": 4.765347804261941e-07, "loss": 0.2882, "num_input_tokens_seen": 19578880, "step": 4780 }, { "epoch": 3.5625931445603576, "grad_norm": 9.671611794323692, "learning_rate": 4.760751143829648e-07, "loss": 0.1821, "num_input_tokens_seen": 19582976, "step": 4781 }, { "epoch": 3.563338301043219, "grad_norm": 10.393161373519808, "learning_rate": 4.7561561798959654e-07, "loss": 0.1443, "num_input_tokens_seen": 19587072, "step": 4782 }, { "epoch": 3.5640834575260802, "grad_norm": 8.587428852704361, "learning_rate": 4.751562913468141e-07, "loss": 0.1419, "num_input_tokens_seen": 19591168, "step": 4783 }, { "epoch": 3.564828614008942, "grad_norm": 11.43314088618084, "learning_rate": 4.7469713455530587e-07, "loss": 0.149, "num_input_tokens_seen": 19595264, "step": 4784 }, { "epoch": 3.5655737704918034, "grad_norm": 10.525575167174772, "learning_rate": 4.742381477157215e-07, "loss": 0.1525, "num_input_tokens_seen": 19599360, "step": 4785 }, { "epoch": 3.566318926974665, "grad_norm": 8.201399967342288, "learning_rate": 4.737793309286749e-07, "loss": 0.1225, "num_input_tokens_seen": 19603456, "step": 4786 }, { "epoch": 3.567064083457526, "grad_norm": 10.635285008119308, "learning_rate": 4.7332068429474147e-07, "loss": 0.1581, "num_input_tokens_seen": 19607552, "step": 4787 }, { "epoch": 3.5678092399403876, "grad_norm": 7.427404489320687, "learning_rate": 4.728622079144603e-07, "loss": 0.1492, "num_input_tokens_seen": 19611648, "step": 4788 }, { "epoch": 3.5685543964232487, "grad_norm": 9.031087800064235, "learning_rate": 4.7240390188833235e-07, "loss": 0.1292, "num_input_tokens_seen": 19615744, "step": 4789 }, { "epoch": 3.5692995529061102, "grad_norm": 7.061821836612589, "learning_rate": 4.719457663168217e-07, "loss": 0.0592, "num_input_tokens_seen": 19619840, "step": 4790 }, { "epoch": 3.570044709388972, "grad_norm": 9.968632330677686, "learning_rate": 4.714878013003546e-07, "loss": 0.1957, "num_input_tokens_seen": 19623936, "step": 4791 }, { "epoch": 3.5707898658718333, "grad_norm": 10.615520453876096, "learning_rate": 4.7103000693932076e-07, "loss": 0.1727, "num_input_tokens_seen": 19628032, "step": 4792 }, { "epoch": 3.5715350223546944, "grad_norm": 8.411634364731238, "learning_rate": 4.7057238333407183e-07, "loss": 0.2349, "num_input_tokens_seen": 19632128, "step": 4793 }, { "epoch": 3.572280178837556, "grad_norm": 11.332217673376851, "learning_rate": 4.7011493058492163e-07, "loss": 0.1506, "num_input_tokens_seen": 19636224, "step": 4794 }, { "epoch": 3.573025335320417, "grad_norm": 9.063482164793482, "learning_rate": 4.6965764879214794e-07, "loss": 0.2268, "num_input_tokens_seen": 19640320, "step": 4795 }, { "epoch": 3.5737704918032787, "grad_norm": 9.164384710819098, "learning_rate": 4.692005380559894e-07, "loss": 0.1894, "num_input_tokens_seen": 19644416, "step": 4796 }, { "epoch": 3.57451564828614, "grad_norm": 8.769565112467049, "learning_rate": 4.6874359847664866e-07, "loss": 0.1566, "num_input_tokens_seen": 19648512, "step": 4797 }, { "epoch": 3.5752608047690018, "grad_norm": 6.9112046655127095, "learning_rate": 4.682868301542895e-07, "loss": 0.1489, "num_input_tokens_seen": 19652608, "step": 4798 }, { "epoch": 3.576005961251863, "grad_norm": 11.765900808237522, "learning_rate": 4.678302331890394e-07, "loss": 0.1915, "num_input_tokens_seen": 19656704, "step": 4799 }, { "epoch": 3.5767511177347244, "grad_norm": 8.130901506778732, "learning_rate": 4.673738076809875e-07, "loss": 0.113, "num_input_tokens_seen": 19660800, "step": 4800 }, { "epoch": 3.5774962742175855, "grad_norm": 9.772354269467092, "learning_rate": 4.669175537301848e-07, "loss": 0.2356, "num_input_tokens_seen": 19664896, "step": 4801 }, { "epoch": 3.578241430700447, "grad_norm": 7.204165175318977, "learning_rate": 4.6646147143664665e-07, "loss": 0.0949, "num_input_tokens_seen": 19668992, "step": 4802 }, { "epoch": 3.5789865871833086, "grad_norm": 8.261720543089641, "learning_rate": 4.6600556090034845e-07, "loss": 0.1368, "num_input_tokens_seen": 19673088, "step": 4803 }, { "epoch": 3.5797317436661698, "grad_norm": 9.467983031931857, "learning_rate": 4.655498222212304e-07, "loss": 0.2897, "num_input_tokens_seen": 19677184, "step": 4804 }, { "epoch": 3.5804769001490313, "grad_norm": 7.977121693162526, "learning_rate": 4.6509425549919174e-07, "loss": 0.1673, "num_input_tokens_seen": 19681280, "step": 4805 }, { "epoch": 3.581222056631893, "grad_norm": 9.254561851454461, "learning_rate": 4.6463886083409765e-07, "loss": 0.2019, "num_input_tokens_seen": 19685376, "step": 4806 }, { "epoch": 3.581967213114754, "grad_norm": 13.009139760755193, "learning_rate": 4.6418363832577277e-07, "loss": 0.2626, "num_input_tokens_seen": 19689472, "step": 4807 }, { "epoch": 3.5827123695976155, "grad_norm": 7.952016296732389, "learning_rate": 4.6372858807400595e-07, "loss": 0.1083, "num_input_tokens_seen": 19693568, "step": 4808 }, { "epoch": 3.583457526080477, "grad_norm": 8.171387649165847, "learning_rate": 4.6327371017854714e-07, "loss": 0.2273, "num_input_tokens_seen": 19697664, "step": 4809 }, { "epoch": 3.584202682563338, "grad_norm": 9.024809522894264, "learning_rate": 4.6281900473910845e-07, "loss": 0.3036, "num_input_tokens_seen": 19701760, "step": 4810 }, { "epoch": 3.5849478390461997, "grad_norm": 10.043481080510972, "learning_rate": 4.623644718553652e-07, "loss": 0.3291, "num_input_tokens_seen": 19705856, "step": 4811 }, { "epoch": 3.585692995529061, "grad_norm": 9.697262224546925, "learning_rate": 4.6191011162695363e-07, "loss": 0.2582, "num_input_tokens_seen": 19709952, "step": 4812 }, { "epoch": 3.5864381520119224, "grad_norm": 7.817441876493574, "learning_rate": 4.614559241534734e-07, "loss": 0.1525, "num_input_tokens_seen": 19714048, "step": 4813 }, { "epoch": 3.587183308494784, "grad_norm": 8.301319476477977, "learning_rate": 4.610019095344849e-07, "loss": 0.1671, "num_input_tokens_seen": 19718144, "step": 4814 }, { "epoch": 3.5879284649776455, "grad_norm": 8.794675358799786, "learning_rate": 4.605480678695122e-07, "loss": 0.2237, "num_input_tokens_seen": 19722240, "step": 4815 }, { "epoch": 3.5886736214605066, "grad_norm": 12.34715047720028, "learning_rate": 4.600943992580402e-07, "loss": 0.158, "num_input_tokens_seen": 19726336, "step": 4816 }, { "epoch": 3.589418777943368, "grad_norm": 7.904945463335048, "learning_rate": 4.596409037995163e-07, "loss": 0.2168, "num_input_tokens_seen": 19730432, "step": 4817 }, { "epoch": 3.5901639344262293, "grad_norm": 9.21550215649739, "learning_rate": 4.591875815933496e-07, "loss": 0.1558, "num_input_tokens_seen": 19734528, "step": 4818 }, { "epoch": 3.590909090909091, "grad_norm": 9.581983908724444, "learning_rate": 4.5873443273891217e-07, "loss": 0.1429, "num_input_tokens_seen": 19738624, "step": 4819 }, { "epoch": 3.5916542473919524, "grad_norm": 8.638345642111458, "learning_rate": 4.582814573355375e-07, "loss": 0.2205, "num_input_tokens_seen": 19742720, "step": 4820 }, { "epoch": 3.592399403874814, "grad_norm": 6.564894342543705, "learning_rate": 4.578286554825201e-07, "loss": 0.1146, "num_input_tokens_seen": 19746816, "step": 4821 }, { "epoch": 3.593144560357675, "grad_norm": 9.928675696652574, "learning_rate": 4.573760272791186e-07, "loss": 0.1657, "num_input_tokens_seen": 19750912, "step": 4822 }, { "epoch": 3.5938897168405366, "grad_norm": 8.499972171904348, "learning_rate": 4.569235728245512e-07, "loss": 0.1815, "num_input_tokens_seen": 19755008, "step": 4823 }, { "epoch": 3.5946348733233977, "grad_norm": 9.008344831608047, "learning_rate": 4.564712922180002e-07, "loss": 0.114, "num_input_tokens_seen": 19759104, "step": 4824 }, { "epoch": 3.5953800298062593, "grad_norm": 7.829302204247588, "learning_rate": 4.5601918555860784e-07, "loss": 0.1176, "num_input_tokens_seen": 19763200, "step": 4825 }, { "epoch": 3.596125186289121, "grad_norm": 7.459964176267708, "learning_rate": 4.555672529454799e-07, "loss": 0.1419, "num_input_tokens_seen": 19767296, "step": 4826 }, { "epoch": 3.5968703427719824, "grad_norm": 13.684497160378465, "learning_rate": 4.55115494477683e-07, "loss": 0.0878, "num_input_tokens_seen": 19771392, "step": 4827 }, { "epoch": 3.5976154992548435, "grad_norm": 9.92399485866667, "learning_rate": 4.546639102542452e-07, "loss": 0.2974, "num_input_tokens_seen": 19775488, "step": 4828 }, { "epoch": 3.598360655737705, "grad_norm": 16.2162040416359, "learning_rate": 4.542125003741579e-07, "loss": 0.063, "num_input_tokens_seen": 19779584, "step": 4829 }, { "epoch": 3.599105812220566, "grad_norm": 11.983733290105096, "learning_rate": 4.5376126493637246e-07, "loss": 0.2492, "num_input_tokens_seen": 19783680, "step": 4830 }, { "epoch": 3.5998509687034277, "grad_norm": 7.1833110219846645, "learning_rate": 4.533102040398044e-07, "loss": 0.096, "num_input_tokens_seen": 19787776, "step": 4831 }, { "epoch": 3.6005961251862892, "grad_norm": 7.963352648792901, "learning_rate": 4.528593177833275e-07, "loss": 0.1015, "num_input_tokens_seen": 19791872, "step": 4832 }, { "epoch": 3.601341281669151, "grad_norm": 9.484285455540117, "learning_rate": 4.5240860626578066e-07, "loss": 0.2664, "num_input_tokens_seen": 19795968, "step": 4833 }, { "epoch": 3.602086438152012, "grad_norm": 9.533273281206878, "learning_rate": 4.5195806958596227e-07, "loss": 0.2546, "num_input_tokens_seen": 19800064, "step": 4834 }, { "epoch": 3.6028315946348735, "grad_norm": 12.2260036330745, "learning_rate": 4.5150770784263403e-07, "loss": 0.3872, "num_input_tokens_seen": 19804160, "step": 4835 }, { "epoch": 3.6035767511177346, "grad_norm": 9.193101892669338, "learning_rate": 4.510575211345174e-07, "loss": 0.3311, "num_input_tokens_seen": 19808256, "step": 4836 }, { "epoch": 3.604321907600596, "grad_norm": 12.13258096377766, "learning_rate": 4.506075095602977e-07, "loss": 0.1836, "num_input_tokens_seen": 19812352, "step": 4837 }, { "epoch": 3.6050670640834577, "grad_norm": 8.92651375577286, "learning_rate": 4.5015767321862e-07, "loss": 0.2994, "num_input_tokens_seen": 19816448, "step": 4838 }, { "epoch": 3.605812220566319, "grad_norm": 9.023481108523244, "learning_rate": 4.497080122080912e-07, "loss": 0.1382, "num_input_tokens_seen": 19820544, "step": 4839 }, { "epoch": 3.6065573770491803, "grad_norm": 5.7352986728697575, "learning_rate": 4.4925852662728114e-07, "loss": 0.076, "num_input_tokens_seen": 19824640, "step": 4840 }, { "epoch": 3.607302533532042, "grad_norm": 6.599752512903634, "learning_rate": 4.488092165747196e-07, "loss": 0.1386, "num_input_tokens_seen": 19828736, "step": 4841 }, { "epoch": 3.608047690014903, "grad_norm": 8.842680402362719, "learning_rate": 4.4836008214889913e-07, "loss": 0.1512, "num_input_tokens_seen": 19832832, "step": 4842 }, { "epoch": 3.6087928464977646, "grad_norm": 7.736122364335388, "learning_rate": 4.479111234482723e-07, "loss": 0.2796, "num_input_tokens_seen": 19836928, "step": 4843 }, { "epoch": 3.609538002980626, "grad_norm": 6.759213470168189, "learning_rate": 4.4746234057125567e-07, "loss": 0.1541, "num_input_tokens_seen": 19841024, "step": 4844 }, { "epoch": 3.610283159463487, "grad_norm": 12.511347861705431, "learning_rate": 4.470137336162238e-07, "loss": 0.2254, "num_input_tokens_seen": 19845120, "step": 4845 }, { "epoch": 3.6110283159463488, "grad_norm": 8.861819030550041, "learning_rate": 4.4656530268151573e-07, "loss": 0.0965, "num_input_tokens_seen": 19849216, "step": 4846 }, { "epoch": 3.61177347242921, "grad_norm": 8.285965671956626, "learning_rate": 4.461170478654303e-07, "loss": 0.1593, "num_input_tokens_seen": 19853312, "step": 4847 }, { "epoch": 3.6125186289120714, "grad_norm": 6.836866560440227, "learning_rate": 4.4566896926622776e-07, "loss": 0.143, "num_input_tokens_seen": 19857408, "step": 4848 }, { "epoch": 3.613263785394933, "grad_norm": 7.864624486492496, "learning_rate": 4.4522106698213106e-07, "loss": 0.1441, "num_input_tokens_seen": 19861504, "step": 4849 }, { "epoch": 3.6140089418777945, "grad_norm": 13.784516758217, "learning_rate": 4.447733411113228e-07, "loss": 0.0928, "num_input_tokens_seen": 19865600, "step": 4850 }, { "epoch": 3.6147540983606556, "grad_norm": 6.932853697919352, "learning_rate": 4.4432579175194835e-07, "loss": 0.0848, "num_input_tokens_seen": 19869696, "step": 4851 }, { "epoch": 3.615499254843517, "grad_norm": 8.810988469994488, "learning_rate": 4.4387841900211293e-07, "loss": 0.2419, "num_input_tokens_seen": 19873792, "step": 4852 }, { "epoch": 3.6162444113263783, "grad_norm": 12.472830902275895, "learning_rate": 4.434312229598847e-07, "loss": 0.2532, "num_input_tokens_seen": 19877888, "step": 4853 }, { "epoch": 3.61698956780924, "grad_norm": 10.015760699767723, "learning_rate": 4.4298420372329177e-07, "loss": 0.4199, "num_input_tokens_seen": 19881984, "step": 4854 }, { "epoch": 3.6177347242921014, "grad_norm": 7.242276800112809, "learning_rate": 4.425373613903236e-07, "loss": 0.0928, "num_input_tokens_seen": 19886080, "step": 4855 }, { "epoch": 3.618479880774963, "grad_norm": 8.560054994703533, "learning_rate": 4.4209069605893215e-07, "loss": 0.1019, "num_input_tokens_seen": 19890176, "step": 4856 }, { "epoch": 3.619225037257824, "grad_norm": 9.305235459558986, "learning_rate": 4.416442078270286e-07, "loss": 0.367, "num_input_tokens_seen": 19894272, "step": 4857 }, { "epoch": 3.6199701937406856, "grad_norm": 10.604239576036102, "learning_rate": 4.411978967924876e-07, "loss": 0.2057, "num_input_tokens_seen": 19898368, "step": 4858 }, { "epoch": 3.6207153502235467, "grad_norm": 7.308837000829238, "learning_rate": 4.407517630531423e-07, "loss": 0.1076, "num_input_tokens_seen": 19902464, "step": 4859 }, { "epoch": 3.6214605067064083, "grad_norm": 9.781285662158368, "learning_rate": 4.403058067067895e-07, "loss": 0.2341, "num_input_tokens_seen": 19906560, "step": 4860 }, { "epoch": 3.62220566318927, "grad_norm": 8.840157081610775, "learning_rate": 4.398600278511851e-07, "loss": 0.1014, "num_input_tokens_seen": 19910656, "step": 4861 }, { "epoch": 3.6229508196721314, "grad_norm": 11.661591197856264, "learning_rate": 4.39414426584048e-07, "loss": 0.3769, "num_input_tokens_seen": 19914752, "step": 4862 }, { "epoch": 3.6236959761549925, "grad_norm": 9.170351444407641, "learning_rate": 4.389690030030562e-07, "loss": 0.2174, "num_input_tokens_seen": 19918848, "step": 4863 }, { "epoch": 3.624441132637854, "grad_norm": 5.948555532463453, "learning_rate": 4.385237572058508e-07, "loss": 0.0827, "num_input_tokens_seen": 19922944, "step": 4864 }, { "epoch": 3.625186289120715, "grad_norm": 7.890974552470113, "learning_rate": 4.380786892900321e-07, "loss": 0.1636, "num_input_tokens_seen": 19927040, "step": 4865 }, { "epoch": 3.6259314456035767, "grad_norm": 9.398669071772481, "learning_rate": 4.3763379935316197e-07, "loss": 0.1284, "num_input_tokens_seen": 19931136, "step": 4866 }, { "epoch": 3.6266766020864383, "grad_norm": 6.794603606931549, "learning_rate": 4.3718908749276436e-07, "loss": 0.1992, "num_input_tokens_seen": 19935232, "step": 4867 }, { "epoch": 3.6274217585693, "grad_norm": 11.093340314601447, "learning_rate": 4.3674455380632224e-07, "loss": 0.1132, "num_input_tokens_seen": 19939328, "step": 4868 }, { "epoch": 3.628166915052161, "grad_norm": 6.923178435927628, "learning_rate": 4.363001983912815e-07, "loss": 0.0847, "num_input_tokens_seen": 19943424, "step": 4869 }, { "epoch": 3.6289120715350225, "grad_norm": 9.855574195828945, "learning_rate": 4.358560213450472e-07, "loss": 0.2441, "num_input_tokens_seen": 19947520, "step": 4870 }, { "epoch": 3.6296572280178836, "grad_norm": 8.767946865974428, "learning_rate": 4.354120227649873e-07, "loss": 0.2217, "num_input_tokens_seen": 19951616, "step": 4871 }, { "epoch": 3.630402384500745, "grad_norm": 7.862313058236455, "learning_rate": 4.3496820274842796e-07, "loss": 0.1225, "num_input_tokens_seen": 19955712, "step": 4872 }, { "epoch": 3.6311475409836067, "grad_norm": 7.9231725378591475, "learning_rate": 4.3452456139265883e-07, "loss": 0.149, "num_input_tokens_seen": 19959808, "step": 4873 }, { "epoch": 3.631892697466468, "grad_norm": 8.690076872722766, "learning_rate": 4.3408109879492887e-07, "loss": 0.1408, "num_input_tokens_seen": 19963904, "step": 4874 }, { "epoch": 3.6326378539493294, "grad_norm": 6.727131659864811, "learning_rate": 4.3363781505244783e-07, "loss": 0.1409, "num_input_tokens_seen": 19968000, "step": 4875 }, { "epoch": 3.633383010432191, "grad_norm": 9.75415365416469, "learning_rate": 4.3319471026238737e-07, "loss": 0.0955, "num_input_tokens_seen": 19972096, "step": 4876 }, { "epoch": 3.634128166915052, "grad_norm": 9.403919688603093, "learning_rate": 4.3275178452187857e-07, "loss": 0.3489, "num_input_tokens_seen": 19976192, "step": 4877 }, { "epoch": 3.6348733233979136, "grad_norm": 9.609146409903737, "learning_rate": 4.3230903792801484e-07, "loss": 0.1505, "num_input_tokens_seen": 19980288, "step": 4878 }, { "epoch": 3.635618479880775, "grad_norm": 9.374753595556921, "learning_rate": 4.318664705778482e-07, "loss": 0.111, "num_input_tokens_seen": 19984384, "step": 4879 }, { "epoch": 3.6363636363636362, "grad_norm": 9.749654425973002, "learning_rate": 4.314240825683938e-07, "loss": 0.3359, "num_input_tokens_seen": 19988480, "step": 4880 }, { "epoch": 3.637108792846498, "grad_norm": 9.507822021300031, "learning_rate": 4.309818739966255e-07, "loss": 0.1611, "num_input_tokens_seen": 19992576, "step": 4881 }, { "epoch": 3.637853949329359, "grad_norm": 9.457312629376174, "learning_rate": 4.3053984495947843e-07, "loss": 0.1562, "num_input_tokens_seen": 19996672, "step": 4882 }, { "epoch": 3.6385991058122205, "grad_norm": 6.451795052047859, "learning_rate": 4.300979955538492e-07, "loss": 0.0676, "num_input_tokens_seen": 20000768, "step": 4883 }, { "epoch": 3.639344262295082, "grad_norm": 12.109539144478676, "learning_rate": 4.2965632587659345e-07, "loss": 0.2906, "num_input_tokens_seen": 20004864, "step": 4884 }, { "epoch": 3.6400894187779436, "grad_norm": 10.264801358604927, "learning_rate": 4.2921483602452974e-07, "loss": 0.0864, "num_input_tokens_seen": 20008960, "step": 4885 }, { "epoch": 3.6408345752608047, "grad_norm": 7.7624879185864035, "learning_rate": 4.2877352609443425e-07, "loss": 0.1864, "num_input_tokens_seen": 20013056, "step": 4886 }, { "epoch": 3.6415797317436662, "grad_norm": 9.936979319662832, "learning_rate": 4.2833239618304613e-07, "loss": 0.0976, "num_input_tokens_seen": 20017152, "step": 4887 }, { "epoch": 3.6423248882265273, "grad_norm": 6.1348563070936075, "learning_rate": 4.2789144638706357e-07, "loss": 0.1131, "num_input_tokens_seen": 20021248, "step": 4888 }, { "epoch": 3.643070044709389, "grad_norm": 9.635646499982904, "learning_rate": 4.274506768031468e-07, "loss": 0.2062, "num_input_tokens_seen": 20025344, "step": 4889 }, { "epoch": 3.6438152011922504, "grad_norm": 6.611769383090107, "learning_rate": 4.2701008752791483e-07, "loss": 0.1498, "num_input_tokens_seen": 20029440, "step": 4890 }, { "epoch": 3.644560357675112, "grad_norm": 8.968613182728511, "learning_rate": 4.2656967865794876e-07, "loss": 0.2244, "num_input_tokens_seen": 20033536, "step": 4891 }, { "epoch": 3.645305514157973, "grad_norm": 8.751924325836603, "learning_rate": 4.261294502897889e-07, "loss": 0.2118, "num_input_tokens_seen": 20037632, "step": 4892 }, { "epoch": 3.6460506706408347, "grad_norm": 14.417288734764327, "learning_rate": 4.256894025199361e-07, "loss": 0.1418, "num_input_tokens_seen": 20041728, "step": 4893 }, { "epoch": 3.6467958271236958, "grad_norm": 6.7221413780854995, "learning_rate": 4.2524953544485284e-07, "loss": 0.074, "num_input_tokens_seen": 20045824, "step": 4894 }, { "epoch": 3.6475409836065573, "grad_norm": 10.085100887760984, "learning_rate": 4.2480984916096023e-07, "loss": 0.3055, "num_input_tokens_seen": 20049920, "step": 4895 }, { "epoch": 3.648286140089419, "grad_norm": 9.516006387720834, "learning_rate": 4.2437034376464137e-07, "loss": 0.1972, "num_input_tokens_seen": 20054016, "step": 4896 }, { "epoch": 3.6490312965722804, "grad_norm": 6.618304111738772, "learning_rate": 4.2393101935223834e-07, "loss": 0.111, "num_input_tokens_seen": 20058112, "step": 4897 }, { "epoch": 3.6497764530551415, "grad_norm": 9.74869031020706, "learning_rate": 4.2349187602005543e-07, "loss": 0.1372, "num_input_tokens_seen": 20062208, "step": 4898 }, { "epoch": 3.650521609538003, "grad_norm": 9.165397630399875, "learning_rate": 4.2305291386435424e-07, "loss": 0.2175, "num_input_tokens_seen": 20066304, "step": 4899 }, { "epoch": 3.651266766020864, "grad_norm": 9.4278189611082, "learning_rate": 4.226141329813596e-07, "loss": 0.2616, "num_input_tokens_seen": 20070400, "step": 4900 }, { "epoch": 3.6520119225037257, "grad_norm": 7.84607463754534, "learning_rate": 4.22175533467255e-07, "loss": 0.2073, "num_input_tokens_seen": 20074496, "step": 4901 }, { "epoch": 3.6527570789865873, "grad_norm": 10.046859710179293, "learning_rate": 4.2173711541818443e-07, "loss": 0.2428, "num_input_tokens_seen": 20078592, "step": 4902 }, { "epoch": 3.653502235469449, "grad_norm": 9.359971788735015, "learning_rate": 4.212988789302529e-07, "loss": 0.2096, "num_input_tokens_seen": 20082688, "step": 4903 }, { "epoch": 3.65424739195231, "grad_norm": 6.589723919433091, "learning_rate": 4.2086082409952393e-07, "loss": 0.1211, "num_input_tokens_seen": 20086784, "step": 4904 }, { "epoch": 3.6549925484351715, "grad_norm": 9.03832010599745, "learning_rate": 4.2042295102202356e-07, "loss": 0.1131, "num_input_tokens_seen": 20090880, "step": 4905 }, { "epoch": 3.6557377049180326, "grad_norm": 8.918883706602557, "learning_rate": 4.199852597937354e-07, "loss": 0.2246, "num_input_tokens_seen": 20094976, "step": 4906 }, { "epoch": 3.656482861400894, "grad_norm": 9.730073797307428, "learning_rate": 4.1954775051060577e-07, "loss": 0.1319, "num_input_tokens_seen": 20099072, "step": 4907 }, { "epoch": 3.6572280178837557, "grad_norm": 8.713069460457639, "learning_rate": 4.191104232685386e-07, "loss": 0.1379, "num_input_tokens_seen": 20103168, "step": 4908 }, { "epoch": 3.657973174366617, "grad_norm": 9.632757713510285, "learning_rate": 4.1867327816340023e-07, "loss": 0.1983, "num_input_tokens_seen": 20107264, "step": 4909 }, { "epoch": 3.6587183308494784, "grad_norm": 10.738249792141982, "learning_rate": 4.182363152910154e-07, "loss": 0.2582, "num_input_tokens_seen": 20111360, "step": 4910 }, { "epoch": 3.65946348733234, "grad_norm": 7.967298371540923, "learning_rate": 4.177995347471692e-07, "loss": 0.1602, "num_input_tokens_seen": 20115456, "step": 4911 }, { "epoch": 3.660208643815201, "grad_norm": 9.04355388829141, "learning_rate": 4.173629366276083e-07, "loss": 0.18, "num_input_tokens_seen": 20119552, "step": 4912 }, { "epoch": 3.6609538002980626, "grad_norm": 8.066262693507278, "learning_rate": 4.169265210280364e-07, "loss": 0.2272, "num_input_tokens_seen": 20123648, "step": 4913 }, { "epoch": 3.661698956780924, "grad_norm": 7.9789489852851005, "learning_rate": 4.164902880441202e-07, "loss": 0.1298, "num_input_tokens_seen": 20127744, "step": 4914 }, { "epoch": 3.6624441132637853, "grad_norm": 7.7938881384668335, "learning_rate": 4.160542377714842e-07, "loss": 0.2149, "num_input_tokens_seen": 20131840, "step": 4915 }, { "epoch": 3.663189269746647, "grad_norm": 4.986942103829965, "learning_rate": 4.156183703057147e-07, "loss": 0.0692, "num_input_tokens_seen": 20135936, "step": 4916 }, { "epoch": 3.663934426229508, "grad_norm": 8.55399951223972, "learning_rate": 4.151826857423559e-07, "loss": 0.0584, "num_input_tokens_seen": 20140032, "step": 4917 }, { "epoch": 3.6646795827123695, "grad_norm": 5.321745192496599, "learning_rate": 4.1474718417691393e-07, "loss": 0.0399, "num_input_tokens_seen": 20144128, "step": 4918 }, { "epoch": 3.665424739195231, "grad_norm": 9.237432400035562, "learning_rate": 4.1431186570485354e-07, "loss": 0.1403, "num_input_tokens_seen": 20148224, "step": 4919 }, { "epoch": 3.6661698956780926, "grad_norm": 6.338435562445111, "learning_rate": 4.1387673042159905e-07, "loss": 0.1251, "num_input_tokens_seen": 20152320, "step": 4920 }, { "epoch": 3.6669150521609537, "grad_norm": 9.243425845410025, "learning_rate": 4.1344177842253616e-07, "loss": 0.1774, "num_input_tokens_seen": 20156416, "step": 4921 }, { "epoch": 3.6676602086438153, "grad_norm": 10.36113850986344, "learning_rate": 4.1300700980300854e-07, "loss": 0.0902, "num_input_tokens_seen": 20160512, "step": 4922 }, { "epoch": 3.6684053651266764, "grad_norm": 8.980180436516001, "learning_rate": 4.125724246583214e-07, "loss": 0.1439, "num_input_tokens_seen": 20164608, "step": 4923 }, { "epoch": 3.669150521609538, "grad_norm": 9.083623663175535, "learning_rate": 4.121380230837382e-07, "loss": 0.1573, "num_input_tokens_seen": 20168704, "step": 4924 }, { "epoch": 3.6698956780923995, "grad_norm": 9.495055721921105, "learning_rate": 4.1170380517448397e-07, "loss": 0.2321, "num_input_tokens_seen": 20172800, "step": 4925 }, { "epoch": 3.670640834575261, "grad_norm": 9.850645366896401, "learning_rate": 4.11269771025741e-07, "loss": 0.2299, "num_input_tokens_seen": 20176896, "step": 4926 }, { "epoch": 3.671385991058122, "grad_norm": 9.946918637517244, "learning_rate": 4.1083592073265344e-07, "loss": 0.1584, "num_input_tokens_seen": 20180992, "step": 4927 }, { "epoch": 3.6721311475409837, "grad_norm": 10.412565624473931, "learning_rate": 4.1040225439032395e-07, "loss": 0.2439, "num_input_tokens_seen": 20185088, "step": 4928 }, { "epoch": 3.672876304023845, "grad_norm": 8.225774476246999, "learning_rate": 4.09968772093816e-07, "loss": 0.2788, "num_input_tokens_seen": 20189184, "step": 4929 }, { "epoch": 3.6736214605067063, "grad_norm": 9.101267294487421, "learning_rate": 4.0953547393815145e-07, "loss": 0.1815, "num_input_tokens_seen": 20193280, "step": 4930 }, { "epoch": 3.674366616989568, "grad_norm": 12.505419400127067, "learning_rate": 4.0910236001831207e-07, "loss": 0.0828, "num_input_tokens_seen": 20197376, "step": 4931 }, { "epoch": 3.6751117734724295, "grad_norm": 12.074780996387522, "learning_rate": 4.086694304292405e-07, "loss": 0.2445, "num_input_tokens_seen": 20201472, "step": 4932 }, { "epoch": 3.6758569299552906, "grad_norm": 7.755876867688183, "learning_rate": 4.0823668526583674e-07, "loss": 0.1632, "num_input_tokens_seen": 20205568, "step": 4933 }, { "epoch": 3.676602086438152, "grad_norm": 8.547094548237013, "learning_rate": 4.078041246229629e-07, "loss": 0.2141, "num_input_tokens_seen": 20209664, "step": 4934 }, { "epoch": 3.6773472429210132, "grad_norm": 6.068238370933075, "learning_rate": 4.073717485954383e-07, "loss": 0.0673, "num_input_tokens_seen": 20213760, "step": 4935 }, { "epoch": 3.678092399403875, "grad_norm": 10.301641693932671, "learning_rate": 4.0693955727804364e-07, "loss": 0.3226, "num_input_tokens_seen": 20217856, "step": 4936 }, { "epoch": 3.6788375558867363, "grad_norm": 10.280815587101182, "learning_rate": 4.065075507655182e-07, "loss": 0.2883, "num_input_tokens_seen": 20221952, "step": 4937 }, { "epoch": 3.679582712369598, "grad_norm": 9.484609815866232, "learning_rate": 4.0607572915256023e-07, "loss": 0.24, "num_input_tokens_seen": 20226048, "step": 4938 }, { "epoch": 3.680327868852459, "grad_norm": 8.268138648199145, "learning_rate": 4.0564409253382927e-07, "loss": 0.2401, "num_input_tokens_seen": 20230144, "step": 4939 }, { "epoch": 3.6810730253353205, "grad_norm": 8.281001457700908, "learning_rate": 4.052126410039417e-07, "loss": 0.1877, "num_input_tokens_seen": 20234240, "step": 4940 }, { "epoch": 3.6818181818181817, "grad_norm": 10.77451631164245, "learning_rate": 4.047813746574761e-07, "loss": 0.1701, "num_input_tokens_seen": 20238336, "step": 4941 }, { "epoch": 3.682563338301043, "grad_norm": 11.149564889960908, "learning_rate": 4.04350293588968e-07, "loss": 0.3906, "num_input_tokens_seen": 20242432, "step": 4942 }, { "epoch": 3.6833084947839048, "grad_norm": 6.693437618193779, "learning_rate": 4.039193978929144e-07, "loss": 0.12, "num_input_tokens_seen": 20246528, "step": 4943 }, { "epoch": 3.684053651266766, "grad_norm": 9.349897584922655, "learning_rate": 4.034886876637702e-07, "loss": 0.2213, "num_input_tokens_seen": 20250624, "step": 4944 }, { "epoch": 3.6847988077496274, "grad_norm": 9.213544599445793, "learning_rate": 4.030581629959504e-07, "loss": 0.2337, "num_input_tokens_seen": 20254720, "step": 4945 }, { "epoch": 3.685543964232489, "grad_norm": 8.380012979522919, "learning_rate": 4.026278239838291e-07, "loss": 0.0987, "num_input_tokens_seen": 20258816, "step": 4946 }, { "epoch": 3.68628912071535, "grad_norm": 8.892718819482623, "learning_rate": 4.021976707217391e-07, "loss": 0.1146, "num_input_tokens_seen": 20262912, "step": 4947 }, { "epoch": 3.6870342771982116, "grad_norm": 10.627656712414403, "learning_rate": 4.017677033039741e-07, "loss": 0.1981, "num_input_tokens_seen": 20267008, "step": 4948 }, { "epoch": 3.687779433681073, "grad_norm": 9.676575519521737, "learning_rate": 4.0133792182478493e-07, "loss": 0.2309, "num_input_tokens_seen": 20271104, "step": 4949 }, { "epoch": 3.6885245901639343, "grad_norm": 8.562377179994906, "learning_rate": 4.009083263783836e-07, "loss": 0.2633, "num_input_tokens_seen": 20275200, "step": 4950 }, { "epoch": 3.689269746646796, "grad_norm": 9.696190802400194, "learning_rate": 4.0047891705894003e-07, "loss": 0.2393, "num_input_tokens_seen": 20279296, "step": 4951 }, { "epoch": 3.690014903129657, "grad_norm": 7.835500541435923, "learning_rate": 4.0004969396058453e-07, "loss": 0.2093, "num_input_tokens_seen": 20283392, "step": 4952 }, { "epoch": 3.6907600596125185, "grad_norm": 17.74968967500035, "learning_rate": 3.9962065717740457e-07, "loss": 0.0964, "num_input_tokens_seen": 20287488, "step": 4953 }, { "epoch": 3.69150521609538, "grad_norm": 8.04352729262038, "learning_rate": 3.991918068034492e-07, "loss": 0.1058, "num_input_tokens_seen": 20291584, "step": 4954 }, { "epoch": 3.6922503725782416, "grad_norm": 9.868219751728327, "learning_rate": 3.9876314293272475e-07, "loss": 0.4056, "num_input_tokens_seen": 20295680, "step": 4955 }, { "epoch": 3.6929955290611027, "grad_norm": 6.045015577751399, "learning_rate": 3.983346656591981e-07, "loss": 0.0469, "num_input_tokens_seen": 20299776, "step": 4956 }, { "epoch": 3.6937406855439643, "grad_norm": 9.211505629105634, "learning_rate": 3.979063750767942e-07, "loss": 0.2449, "num_input_tokens_seen": 20303872, "step": 4957 }, { "epoch": 3.6944858420268254, "grad_norm": 10.979198005304413, "learning_rate": 3.974782712793969e-07, "loss": 0.2246, "num_input_tokens_seen": 20307968, "step": 4958 }, { "epoch": 3.695230998509687, "grad_norm": 7.36035806245981, "learning_rate": 3.970503543608506e-07, "loss": 0.259, "num_input_tokens_seen": 20312064, "step": 4959 }, { "epoch": 3.6959761549925485, "grad_norm": 10.75031652894782, "learning_rate": 3.966226244149568e-07, "loss": 0.2912, "num_input_tokens_seen": 20316160, "step": 4960 }, { "epoch": 3.69672131147541, "grad_norm": 5.895357431020825, "learning_rate": 3.961950815354777e-07, "loss": 0.0693, "num_input_tokens_seen": 20320256, "step": 4961 }, { "epoch": 3.697466467958271, "grad_norm": 9.52309800868129, "learning_rate": 3.9576772581613314e-07, "loss": 0.2284, "num_input_tokens_seen": 20324352, "step": 4962 }, { "epoch": 3.6982116244411327, "grad_norm": 8.4450645633442, "learning_rate": 3.953405573506032e-07, "loss": 0.1422, "num_input_tokens_seen": 20328448, "step": 4963 }, { "epoch": 3.698956780923994, "grad_norm": 11.275897937648027, "learning_rate": 3.949135762325258e-07, "loss": 0.1105, "num_input_tokens_seen": 20332544, "step": 4964 }, { "epoch": 3.6997019374068554, "grad_norm": 10.975564236584592, "learning_rate": 3.944867825554981e-07, "loss": 0.1371, "num_input_tokens_seen": 20336640, "step": 4965 }, { "epoch": 3.700447093889717, "grad_norm": 8.202793664206421, "learning_rate": 3.9406017641307735e-07, "loss": 0.2158, "num_input_tokens_seen": 20340736, "step": 4966 }, { "epoch": 3.7011922503725785, "grad_norm": 9.40750302292962, "learning_rate": 3.93633757898777e-07, "loss": 0.255, "num_input_tokens_seen": 20344832, "step": 4967 }, { "epoch": 3.7019374068554396, "grad_norm": 7.7231023564692505, "learning_rate": 3.9320752710607244e-07, "loss": 0.0937, "num_input_tokens_seen": 20348928, "step": 4968 }, { "epoch": 3.702682563338301, "grad_norm": 8.163356753774222, "learning_rate": 3.9278148412839557e-07, "loss": 0.1147, "num_input_tokens_seen": 20353024, "step": 4969 }, { "epoch": 3.7034277198211623, "grad_norm": 10.465224776391775, "learning_rate": 3.9235562905913876e-07, "loss": 0.2115, "num_input_tokens_seen": 20357120, "step": 4970 }, { "epoch": 3.704172876304024, "grad_norm": 8.490539128400428, "learning_rate": 3.919299619916518e-07, "loss": 0.1802, "num_input_tokens_seen": 20361216, "step": 4971 }, { "epoch": 3.7049180327868854, "grad_norm": 8.711544210003115, "learning_rate": 3.915044830192448e-07, "loss": 0.2082, "num_input_tokens_seen": 20365312, "step": 4972 }, { "epoch": 3.705663189269747, "grad_norm": 9.599780133471201, "learning_rate": 3.9107919223518533e-07, "loss": 0.2668, "num_input_tokens_seen": 20369408, "step": 4973 }, { "epoch": 3.706408345752608, "grad_norm": 9.943635214512033, "learning_rate": 3.906540897326998e-07, "loss": 0.3143, "num_input_tokens_seen": 20373504, "step": 4974 }, { "epoch": 3.7071535022354696, "grad_norm": 8.296976885477202, "learning_rate": 3.902291756049743e-07, "loss": 0.2963, "num_input_tokens_seen": 20377600, "step": 4975 }, { "epoch": 3.7078986587183307, "grad_norm": 9.952893346605192, "learning_rate": 3.8980444994515266e-07, "loss": 0.3476, "num_input_tokens_seen": 20381696, "step": 4976 }, { "epoch": 3.7086438152011922, "grad_norm": 7.8991348248436655, "learning_rate": 3.8937991284633823e-07, "loss": 0.2124, "num_input_tokens_seen": 20385792, "step": 4977 }, { "epoch": 3.709388971684054, "grad_norm": 8.474245218882855, "learning_rate": 3.889555644015919e-07, "loss": 0.0821, "num_input_tokens_seen": 20389888, "step": 4978 }, { "epoch": 3.710134128166915, "grad_norm": 8.857047795999891, "learning_rate": 3.88531404703935e-07, "loss": 0.2532, "num_input_tokens_seen": 20393984, "step": 4979 }, { "epoch": 3.7108792846497765, "grad_norm": 10.320086310478718, "learning_rate": 3.881074338463449e-07, "loss": 0.1187, "num_input_tokens_seen": 20398080, "step": 4980 }, { "epoch": 3.711624441132638, "grad_norm": 9.780859059819285, "learning_rate": 3.8768365192176015e-07, "loss": 0.2117, "num_input_tokens_seen": 20402176, "step": 4981 }, { "epoch": 3.712369597615499, "grad_norm": 8.742031705735608, "learning_rate": 3.8726005902307587e-07, "loss": 0.1564, "num_input_tokens_seen": 20406272, "step": 4982 }, { "epoch": 3.7131147540983607, "grad_norm": 8.806364413665934, "learning_rate": 3.8683665524314767e-07, "loss": 0.2454, "num_input_tokens_seen": 20410368, "step": 4983 }, { "epoch": 3.7138599105812222, "grad_norm": 7.949757656313143, "learning_rate": 3.864134406747881e-07, "loss": 0.1939, "num_input_tokens_seen": 20414464, "step": 4984 }, { "epoch": 3.7146050670640833, "grad_norm": 9.659296515137932, "learning_rate": 3.859904154107687e-07, "loss": 0.3971, "num_input_tokens_seen": 20418560, "step": 4985 }, { "epoch": 3.715350223546945, "grad_norm": 8.605920068066382, "learning_rate": 3.8556757954382e-07, "loss": 0.0546, "num_input_tokens_seen": 20422656, "step": 4986 }, { "epoch": 3.716095380029806, "grad_norm": 8.28924926731802, "learning_rate": 3.8514493316663014e-07, "loss": 0.125, "num_input_tokens_seen": 20426752, "step": 4987 }, { "epoch": 3.7168405365126675, "grad_norm": 9.74105021527456, "learning_rate": 3.84722476371847e-07, "loss": 0.0377, "num_input_tokens_seen": 20430848, "step": 4988 }, { "epoch": 3.717585692995529, "grad_norm": 4.86993894415454, "learning_rate": 3.843002092520752e-07, "loss": 0.0536, "num_input_tokens_seen": 20434944, "step": 4989 }, { "epoch": 3.7183308494783907, "grad_norm": 8.58616924130189, "learning_rate": 3.8387813189987967e-07, "loss": 0.2299, "num_input_tokens_seen": 20439040, "step": 4990 }, { "epoch": 3.7190760059612518, "grad_norm": 5.714072577784916, "learning_rate": 3.8345624440778215e-07, "loss": 0.1158, "num_input_tokens_seen": 20443136, "step": 4991 }, { "epoch": 3.7198211624441133, "grad_norm": 10.34682514345369, "learning_rate": 3.8303454686826323e-07, "loss": 0.2156, "num_input_tokens_seen": 20447232, "step": 4992 }, { "epoch": 3.7205663189269744, "grad_norm": 12.12504370707065, "learning_rate": 3.8261303937376273e-07, "loss": 0.1239, "num_input_tokens_seen": 20451328, "step": 4993 }, { "epoch": 3.721311475409836, "grad_norm": 8.795794804997577, "learning_rate": 3.8219172201667766e-07, "loss": 0.1887, "num_input_tokens_seen": 20455424, "step": 4994 }, { "epoch": 3.7220566318926975, "grad_norm": 8.481815178732147, "learning_rate": 3.817705948893638e-07, "loss": 0.2329, "num_input_tokens_seen": 20459520, "step": 4995 }, { "epoch": 3.722801788375559, "grad_norm": 9.318815512522377, "learning_rate": 3.8134965808413487e-07, "loss": 0.1306, "num_input_tokens_seen": 20463616, "step": 4996 }, { "epoch": 3.72354694485842, "grad_norm": 8.353543666396142, "learning_rate": 3.8092891169326405e-07, "loss": 0.2342, "num_input_tokens_seen": 20467712, "step": 4997 }, { "epoch": 3.7242921013412817, "grad_norm": 11.44564713114649, "learning_rate": 3.80508355808981e-07, "loss": 0.2353, "num_input_tokens_seen": 20471808, "step": 4998 }, { "epoch": 3.725037257824143, "grad_norm": 9.606482135928442, "learning_rate": 3.8008799052347555e-07, "loss": 0.1455, "num_input_tokens_seen": 20475904, "step": 4999 }, { "epoch": 3.7257824143070044, "grad_norm": 8.793151826343841, "learning_rate": 3.79667815928894e-07, "loss": 0.2434, "num_input_tokens_seen": 20480000, "step": 5000 }, { "epoch": 3.726527570789866, "grad_norm": 7.25723767540239, "learning_rate": 3.792478321173422e-07, "loss": 0.0708, "num_input_tokens_seen": 20484096, "step": 5001 }, { "epoch": 3.7272727272727275, "grad_norm": 8.265776071074336, "learning_rate": 3.7882803918088335e-07, "loss": 0.0632, "num_input_tokens_seen": 20488192, "step": 5002 }, { "epoch": 3.7280178837555886, "grad_norm": 9.243870348444162, "learning_rate": 3.784084372115386e-07, "loss": 0.1784, "num_input_tokens_seen": 20492288, "step": 5003 }, { "epoch": 3.72876304023845, "grad_norm": 7.981560612548795, "learning_rate": 3.7798902630128864e-07, "loss": 0.1385, "num_input_tokens_seen": 20496384, "step": 5004 }, { "epoch": 3.7295081967213113, "grad_norm": 6.207525583312072, "learning_rate": 3.775698065420703e-07, "loss": 0.0981, "num_input_tokens_seen": 20500480, "step": 5005 }, { "epoch": 3.730253353204173, "grad_norm": 11.276392206659429, "learning_rate": 3.7715077802578106e-07, "loss": 0.2636, "num_input_tokens_seen": 20504576, "step": 5006 }, { "epoch": 3.7309985096870344, "grad_norm": 5.393159538003346, "learning_rate": 3.767319408442731e-07, "loss": 0.1031, "num_input_tokens_seen": 20508672, "step": 5007 }, { "epoch": 3.731743666169896, "grad_norm": 6.732461714358356, "learning_rate": 3.763132950893597e-07, "loss": 0.1713, "num_input_tokens_seen": 20512768, "step": 5008 }, { "epoch": 3.732488822652757, "grad_norm": 6.788956797395565, "learning_rate": 3.758948408528104e-07, "loss": 0.2067, "num_input_tokens_seen": 20516864, "step": 5009 }, { "epoch": 3.7332339791356186, "grad_norm": 9.612254206573546, "learning_rate": 3.7547657822635394e-07, "loss": 0.0719, "num_input_tokens_seen": 20520960, "step": 5010 }, { "epoch": 3.7339791356184797, "grad_norm": 10.187252096769967, "learning_rate": 3.750585073016763e-07, "loss": 0.2725, "num_input_tokens_seen": 20525056, "step": 5011 }, { "epoch": 3.7347242921013413, "grad_norm": 9.451435598676841, "learning_rate": 3.746406281704211e-07, "loss": 0.295, "num_input_tokens_seen": 20529152, "step": 5012 }, { "epoch": 3.735469448584203, "grad_norm": 7.008081342582194, "learning_rate": 3.7422294092419113e-07, "loss": 0.1085, "num_input_tokens_seen": 20533248, "step": 5013 }, { "epoch": 3.736214605067064, "grad_norm": 5.696382762463491, "learning_rate": 3.7380544565454575e-07, "loss": 0.1433, "num_input_tokens_seen": 20537344, "step": 5014 }, { "epoch": 3.7369597615499255, "grad_norm": 8.216440755918411, "learning_rate": 3.733881424530038e-07, "loss": 0.1912, "num_input_tokens_seen": 20541440, "step": 5015 }, { "epoch": 3.737704918032787, "grad_norm": 9.022348150294816, "learning_rate": 3.7297103141104007e-07, "loss": 0.3898, "num_input_tokens_seen": 20545536, "step": 5016 }, { "epoch": 3.738450074515648, "grad_norm": 8.478053847195703, "learning_rate": 3.725541126200892e-07, "loss": 0.1447, "num_input_tokens_seen": 20549632, "step": 5017 }, { "epoch": 3.7391952309985097, "grad_norm": 9.537206068699266, "learning_rate": 3.721373861715424e-07, "loss": 0.1616, "num_input_tokens_seen": 20553728, "step": 5018 }, { "epoch": 3.7399403874813713, "grad_norm": 12.328525044457972, "learning_rate": 3.7172085215674865e-07, "loss": 0.2611, "num_input_tokens_seen": 20557824, "step": 5019 }, { "epoch": 3.7406855439642324, "grad_norm": 8.854035319780587, "learning_rate": 3.713045106670159e-07, "loss": 0.1981, "num_input_tokens_seen": 20561920, "step": 5020 }, { "epoch": 3.741430700447094, "grad_norm": 7.859213232830467, "learning_rate": 3.708883617936089e-07, "loss": 0.1355, "num_input_tokens_seen": 20566016, "step": 5021 }, { "epoch": 3.742175856929955, "grad_norm": 6.980390429979493, "learning_rate": 3.7047240562775046e-07, "loss": 0.1101, "num_input_tokens_seen": 20570112, "step": 5022 }, { "epoch": 3.7429210134128166, "grad_norm": 8.51174717804364, "learning_rate": 3.7005664226062067e-07, "loss": 0.2309, "num_input_tokens_seen": 20574208, "step": 5023 }, { "epoch": 3.743666169895678, "grad_norm": 11.008184968072577, "learning_rate": 3.696410717833587e-07, "loss": 0.0944, "num_input_tokens_seen": 20578304, "step": 5024 }, { "epoch": 3.7444113263785397, "grad_norm": 7.822298583256381, "learning_rate": 3.6922569428705965e-07, "loss": 0.2557, "num_input_tokens_seen": 20582400, "step": 5025 }, { "epoch": 3.745156482861401, "grad_norm": 6.657315750025053, "learning_rate": 3.688105098627781e-07, "loss": 0.1083, "num_input_tokens_seen": 20586496, "step": 5026 }, { "epoch": 3.7459016393442623, "grad_norm": 10.011539867901718, "learning_rate": 3.6839551860152477e-07, "loss": 0.1863, "num_input_tokens_seen": 20590592, "step": 5027 }, { "epoch": 3.7466467958271235, "grad_norm": 10.06697408199097, "learning_rate": 3.6798072059426927e-07, "loss": 0.1929, "num_input_tokens_seen": 20594688, "step": 5028 }, { "epoch": 3.747391952309985, "grad_norm": 8.729892801329028, "learning_rate": 3.67566115931938e-07, "loss": 0.099, "num_input_tokens_seen": 20598784, "step": 5029 }, { "epoch": 3.7481371087928466, "grad_norm": 9.259945611173375, "learning_rate": 3.671517047054149e-07, "loss": 0.1976, "num_input_tokens_seen": 20602880, "step": 5030 }, { "epoch": 3.748882265275708, "grad_norm": 7.616393605686193, "learning_rate": 3.667374870055425e-07, "loss": 0.1481, "num_input_tokens_seen": 20606976, "step": 5031 }, { "epoch": 3.7496274217585692, "grad_norm": 8.770722099584733, "learning_rate": 3.6632346292311977e-07, "loss": 0.3021, "num_input_tokens_seen": 20611072, "step": 5032 }, { "epoch": 3.7503725782414308, "grad_norm": 9.314182022617482, "learning_rate": 3.6590963254890463e-07, "loss": 0.1204, "num_input_tokens_seen": 20615168, "step": 5033 }, { "epoch": 3.751117734724292, "grad_norm": 8.325125109011301, "learning_rate": 3.654959959736104e-07, "loss": 0.0996, "num_input_tokens_seen": 20619264, "step": 5034 }, { "epoch": 3.7518628912071534, "grad_norm": 8.16798513100522, "learning_rate": 3.6508255328791003e-07, "loss": 0.1348, "num_input_tokens_seen": 20623360, "step": 5035 }, { "epoch": 3.752608047690015, "grad_norm": 8.58893336206658, "learning_rate": 3.6466930458243247e-07, "loss": 0.2279, "num_input_tokens_seen": 20627456, "step": 5036 }, { "epoch": 3.7533532041728765, "grad_norm": 9.018668442674796, "learning_rate": 3.642562499477656e-07, "loss": 0.2311, "num_input_tokens_seen": 20631552, "step": 5037 }, { "epoch": 3.7540983606557377, "grad_norm": 9.281304767827748, "learning_rate": 3.638433894744535e-07, "loss": 0.2357, "num_input_tokens_seen": 20635648, "step": 5038 }, { "epoch": 3.754843517138599, "grad_norm": 9.99769972034448, "learning_rate": 3.634307232529978e-07, "loss": 0.2439, "num_input_tokens_seen": 20639744, "step": 5039 }, { "epoch": 3.7555886736214603, "grad_norm": 9.262584100878147, "learning_rate": 3.630182513738585e-07, "loss": 0.2729, "num_input_tokens_seen": 20643840, "step": 5040 }, { "epoch": 3.756333830104322, "grad_norm": 10.039249830056974, "learning_rate": 3.6260597392745194e-07, "loss": 0.3049, "num_input_tokens_seen": 20647936, "step": 5041 }, { "epoch": 3.7570789865871834, "grad_norm": 8.401375853931768, "learning_rate": 3.6219389100415265e-07, "loss": 0.0486, "num_input_tokens_seen": 20652032, "step": 5042 }, { "epoch": 3.757824143070045, "grad_norm": 8.909517377559187, "learning_rate": 3.617820026942917e-07, "loss": 0.0348, "num_input_tokens_seen": 20656128, "step": 5043 }, { "epoch": 3.758569299552906, "grad_norm": 8.08958814018382, "learning_rate": 3.6137030908815847e-07, "loss": 0.2922, "num_input_tokens_seen": 20660224, "step": 5044 }, { "epoch": 3.7593144560357676, "grad_norm": 7.973223301712694, "learning_rate": 3.6095881027599896e-07, "loss": 0.1469, "num_input_tokens_seen": 20664320, "step": 5045 }, { "epoch": 3.7600596125186287, "grad_norm": 7.166610664276402, "learning_rate": 3.605475063480164e-07, "loss": 0.1296, "num_input_tokens_seen": 20668416, "step": 5046 }, { "epoch": 3.7608047690014903, "grad_norm": 9.308200024010612, "learning_rate": 3.6013639739437155e-07, "loss": 0.0676, "num_input_tokens_seen": 20672512, "step": 5047 }, { "epoch": 3.761549925484352, "grad_norm": 10.01961915359568, "learning_rate": 3.5972548350518276e-07, "loss": 0.0977, "num_input_tokens_seen": 20676608, "step": 5048 }, { "epoch": 3.762295081967213, "grad_norm": 8.243940842942008, "learning_rate": 3.593147647705253e-07, "loss": 0.1031, "num_input_tokens_seen": 20680704, "step": 5049 }, { "epoch": 3.7630402384500745, "grad_norm": 7.760197552658299, "learning_rate": 3.5890424128043095e-07, "loss": 0.1173, "num_input_tokens_seen": 20684800, "step": 5050 }, { "epoch": 3.763785394932936, "grad_norm": 7.3506081507585295, "learning_rate": 3.5849391312489033e-07, "loss": 0.1394, "num_input_tokens_seen": 20688896, "step": 5051 }, { "epoch": 3.764530551415797, "grad_norm": 11.592945718550592, "learning_rate": 3.580837803938496e-07, "loss": 0.0688, "num_input_tokens_seen": 20692992, "step": 5052 }, { "epoch": 3.7652757078986587, "grad_norm": 10.392446370419147, "learning_rate": 3.5767384317721323e-07, "loss": 0.256, "num_input_tokens_seen": 20697088, "step": 5053 }, { "epoch": 3.7660208643815203, "grad_norm": 8.822282213230828, "learning_rate": 3.57264101564842e-07, "loss": 0.2777, "num_input_tokens_seen": 20701184, "step": 5054 }, { "epoch": 3.7667660208643814, "grad_norm": 7.99342683247503, "learning_rate": 3.568545556465547e-07, "loss": 0.188, "num_input_tokens_seen": 20705280, "step": 5055 }, { "epoch": 3.767511177347243, "grad_norm": 11.118243435629312, "learning_rate": 3.5644520551212646e-07, "loss": 0.2191, "num_input_tokens_seen": 20709376, "step": 5056 }, { "epoch": 3.768256333830104, "grad_norm": 7.845612716820383, "learning_rate": 3.5603605125128934e-07, "loss": 0.2145, "num_input_tokens_seen": 20713472, "step": 5057 }, { "epoch": 3.7690014903129656, "grad_norm": 5.765437128877802, "learning_rate": 3.5562709295373364e-07, "loss": 0.0699, "num_input_tokens_seen": 20717568, "step": 5058 }, { "epoch": 3.769746646795827, "grad_norm": 9.676345182262951, "learning_rate": 3.552183307091052e-07, "loss": 0.2737, "num_input_tokens_seen": 20721664, "step": 5059 }, { "epoch": 3.7704918032786887, "grad_norm": 8.537118835252148, "learning_rate": 3.5480976460700867e-07, "loss": 0.1501, "num_input_tokens_seen": 20725760, "step": 5060 }, { "epoch": 3.77123695976155, "grad_norm": 8.43618337605527, "learning_rate": 3.544013947370033e-07, "loss": 0.1436, "num_input_tokens_seen": 20729856, "step": 5061 }, { "epoch": 3.7719821162444114, "grad_norm": 12.228711141383549, "learning_rate": 3.539932211886077e-07, "loss": 0.1336, "num_input_tokens_seen": 20733952, "step": 5062 }, { "epoch": 3.7727272727272725, "grad_norm": 7.983354578426593, "learning_rate": 3.535852440512957e-07, "loss": 0.1579, "num_input_tokens_seen": 20738048, "step": 5063 }, { "epoch": 3.773472429210134, "grad_norm": 7.9603389938754585, "learning_rate": 3.5317746341449954e-07, "loss": 0.0743, "num_input_tokens_seen": 20742144, "step": 5064 }, { "epoch": 3.7742175856929956, "grad_norm": 8.285452509644053, "learning_rate": 3.5276987936760726e-07, "loss": 0.2007, "num_input_tokens_seen": 20746240, "step": 5065 }, { "epoch": 3.774962742175857, "grad_norm": 7.770900295398296, "learning_rate": 3.523624919999638e-07, "loss": 0.2058, "num_input_tokens_seen": 20750336, "step": 5066 }, { "epoch": 3.7757078986587183, "grad_norm": 9.427073830062026, "learning_rate": 3.5195530140087225e-07, "loss": 0.0718, "num_input_tokens_seen": 20754432, "step": 5067 }, { "epoch": 3.77645305514158, "grad_norm": 8.45885340491855, "learning_rate": 3.515483076595909e-07, "loss": 0.1817, "num_input_tokens_seen": 20758528, "step": 5068 }, { "epoch": 3.777198211624441, "grad_norm": 3.92088718048873, "learning_rate": 3.511415108653364e-07, "loss": 0.0188, "num_input_tokens_seen": 20762624, "step": 5069 }, { "epoch": 3.7779433681073025, "grad_norm": 7.163670202929336, "learning_rate": 3.507349111072807e-07, "loss": 0.1338, "num_input_tokens_seen": 20766720, "step": 5070 }, { "epoch": 3.778688524590164, "grad_norm": 9.106088818257158, "learning_rate": 3.503285084745542e-07, "loss": 0.1326, "num_input_tokens_seen": 20770816, "step": 5071 }, { "epoch": 3.7794336810730256, "grad_norm": 9.051267566407379, "learning_rate": 3.499223030562429e-07, "loss": 0.1153, "num_input_tokens_seen": 20774912, "step": 5072 }, { "epoch": 3.7801788375558867, "grad_norm": 8.338529830850613, "learning_rate": 3.4951629494138995e-07, "loss": 0.1737, "num_input_tokens_seen": 20779008, "step": 5073 }, { "epoch": 3.7809239940387482, "grad_norm": 9.709205760608658, "learning_rate": 3.4911048421899464e-07, "loss": 0.1448, "num_input_tokens_seen": 20783104, "step": 5074 }, { "epoch": 3.7816691505216093, "grad_norm": 8.717586275270675, "learning_rate": 3.487048709780147e-07, "loss": 0.2543, "num_input_tokens_seen": 20787200, "step": 5075 }, { "epoch": 3.782414307004471, "grad_norm": 7.053390176894125, "learning_rate": 3.4829945530736263e-07, "loss": 0.1367, "num_input_tokens_seen": 20791296, "step": 5076 }, { "epoch": 3.7831594634873325, "grad_norm": 12.651014286449497, "learning_rate": 3.4789423729590846e-07, "loss": 0.1821, "num_input_tokens_seen": 20795392, "step": 5077 }, { "epoch": 3.783904619970194, "grad_norm": 8.77739932145871, "learning_rate": 3.474892170324795e-07, "loss": 0.1448, "num_input_tokens_seen": 20799488, "step": 5078 }, { "epoch": 3.784649776453055, "grad_norm": 9.59889687560526, "learning_rate": 3.4708439460585817e-07, "loss": 0.1989, "num_input_tokens_seen": 20803584, "step": 5079 }, { "epoch": 3.7853949329359167, "grad_norm": 7.184155842736038, "learning_rate": 3.466797701047854e-07, "loss": 0.2117, "num_input_tokens_seen": 20807680, "step": 5080 }, { "epoch": 3.7861400894187778, "grad_norm": 9.818528569323895, "learning_rate": 3.462753436179568e-07, "loss": 0.1188, "num_input_tokens_seen": 20811776, "step": 5081 }, { "epoch": 3.7868852459016393, "grad_norm": 8.335558639419729, "learning_rate": 3.458711152340266e-07, "loss": 0.2074, "num_input_tokens_seen": 20815872, "step": 5082 }, { "epoch": 3.787630402384501, "grad_norm": 7.94189159949988, "learning_rate": 3.454670850416039e-07, "loss": 0.1013, "num_input_tokens_seen": 20819968, "step": 5083 }, { "epoch": 3.788375558867362, "grad_norm": 10.905670276879421, "learning_rate": 3.4506325312925473e-07, "loss": 0.1206, "num_input_tokens_seen": 20824064, "step": 5084 }, { "epoch": 3.7891207153502235, "grad_norm": 10.329918825760211, "learning_rate": 3.446596195855026e-07, "loss": 0.1348, "num_input_tokens_seen": 20828160, "step": 5085 }, { "epoch": 3.789865871833085, "grad_norm": 8.532512719701142, "learning_rate": 3.4425618449882624e-07, "loss": 0.1852, "num_input_tokens_seen": 20832256, "step": 5086 }, { "epoch": 3.790611028315946, "grad_norm": 14.365121552769155, "learning_rate": 3.438529479576627e-07, "loss": 0.3128, "num_input_tokens_seen": 20836352, "step": 5087 }, { "epoch": 3.7913561847988078, "grad_norm": 8.728614106625438, "learning_rate": 3.4344991005040255e-07, "loss": 0.1466, "num_input_tokens_seen": 20840448, "step": 5088 }, { "epoch": 3.7921013412816693, "grad_norm": 8.809888115849237, "learning_rate": 3.430470708653959e-07, "loss": 0.1275, "num_input_tokens_seen": 20844544, "step": 5089 }, { "epoch": 3.7928464977645304, "grad_norm": 6.893634317048819, "learning_rate": 3.426444304909471e-07, "loss": 0.0718, "num_input_tokens_seen": 20848640, "step": 5090 }, { "epoch": 3.793591654247392, "grad_norm": 8.78884139182389, "learning_rate": 3.4224198901531863e-07, "loss": 0.2119, "num_input_tokens_seen": 20852736, "step": 5091 }, { "epoch": 3.794336810730253, "grad_norm": 7.2603189408466084, "learning_rate": 3.4183974652672784e-07, "loss": 0.1836, "num_input_tokens_seen": 20856832, "step": 5092 }, { "epoch": 3.7950819672131146, "grad_norm": 6.729085659848125, "learning_rate": 3.414377031133498e-07, "loss": 0.1216, "num_input_tokens_seen": 20860928, "step": 5093 }, { "epoch": 3.795827123695976, "grad_norm": 9.864178140210964, "learning_rate": 3.410358588633149e-07, "loss": 0.2032, "num_input_tokens_seen": 20865024, "step": 5094 }, { "epoch": 3.7965722801788377, "grad_norm": 9.418744198136531, "learning_rate": 3.406342138647099e-07, "loss": 0.2488, "num_input_tokens_seen": 20869120, "step": 5095 }, { "epoch": 3.797317436661699, "grad_norm": 6.247543076290684, "learning_rate": 3.402327682055792e-07, "loss": 0.105, "num_input_tokens_seen": 20873216, "step": 5096 }, { "epoch": 3.7980625931445604, "grad_norm": 10.790913969900455, "learning_rate": 3.398315219739216e-07, "loss": 0.2523, "num_input_tokens_seen": 20877312, "step": 5097 }, { "epoch": 3.7988077496274215, "grad_norm": 9.502966768661432, "learning_rate": 3.3943047525769394e-07, "loss": 0.2032, "num_input_tokens_seen": 20881408, "step": 5098 }, { "epoch": 3.799552906110283, "grad_norm": 9.073850562947065, "learning_rate": 3.390296281448077e-07, "loss": 0.2047, "num_input_tokens_seen": 20885504, "step": 5099 }, { "epoch": 3.8002980625931446, "grad_norm": 9.997538968493076, "learning_rate": 3.3862898072313274e-07, "loss": 0.4058, "num_input_tokens_seen": 20889600, "step": 5100 }, { "epoch": 3.801043219076006, "grad_norm": 8.733842482097119, "learning_rate": 3.382285330804921e-07, "loss": 0.2269, "num_input_tokens_seen": 20893696, "step": 5101 }, { "epoch": 3.8017883755588673, "grad_norm": 9.372084368387947, "learning_rate": 3.3782828530466814e-07, "loss": 0.1574, "num_input_tokens_seen": 20897792, "step": 5102 }, { "epoch": 3.802533532041729, "grad_norm": 7.189559425227419, "learning_rate": 3.3742823748339745e-07, "loss": 0.1426, "num_input_tokens_seen": 20901888, "step": 5103 }, { "epoch": 3.80327868852459, "grad_norm": 8.750489035145591, "learning_rate": 3.3702838970437314e-07, "loss": 0.1752, "num_input_tokens_seen": 20905984, "step": 5104 }, { "epoch": 3.8040238450074515, "grad_norm": 9.261006653898216, "learning_rate": 3.3662874205524524e-07, "loss": 0.1692, "num_input_tokens_seen": 20910080, "step": 5105 }, { "epoch": 3.804769001490313, "grad_norm": 6.963278969434283, "learning_rate": 3.36229294623619e-07, "loss": 0.0482, "num_input_tokens_seen": 20914176, "step": 5106 }, { "epoch": 3.8055141579731746, "grad_norm": 6.409757975197542, "learning_rate": 3.358300474970565e-07, "loss": 0.0863, "num_input_tokens_seen": 20918272, "step": 5107 }, { "epoch": 3.8062593144560357, "grad_norm": 8.534002179326412, "learning_rate": 3.35431000763075e-07, "loss": 0.1653, "num_input_tokens_seen": 20922368, "step": 5108 }, { "epoch": 3.8070044709388973, "grad_norm": 7.887640847274747, "learning_rate": 3.350321545091491e-07, "loss": 0.1839, "num_input_tokens_seen": 20926464, "step": 5109 }, { "epoch": 3.8077496274217584, "grad_norm": 7.952613114199631, "learning_rate": 3.346335088227083e-07, "loss": 0.1474, "num_input_tokens_seen": 20930560, "step": 5110 }, { "epoch": 3.80849478390462, "grad_norm": 17.57215005649573, "learning_rate": 3.3423506379113845e-07, "loss": 0.0664, "num_input_tokens_seen": 20934656, "step": 5111 }, { "epoch": 3.8092399403874815, "grad_norm": 6.199940377847398, "learning_rate": 3.3383681950178203e-07, "loss": 0.0559, "num_input_tokens_seen": 20938752, "step": 5112 }, { "epoch": 3.809985096870343, "grad_norm": 8.13093980006533, "learning_rate": 3.3343877604193634e-07, "loss": 0.1842, "num_input_tokens_seen": 20942848, "step": 5113 }, { "epoch": 3.810730253353204, "grad_norm": 6.4558153896078485, "learning_rate": 3.3304093349885674e-07, "loss": 0.0839, "num_input_tokens_seen": 20946944, "step": 5114 }, { "epoch": 3.8114754098360657, "grad_norm": 8.941616335954405, "learning_rate": 3.3264329195975124e-07, "loss": 0.2827, "num_input_tokens_seen": 20951040, "step": 5115 }, { "epoch": 3.812220566318927, "grad_norm": 9.136984550172295, "learning_rate": 3.322458515117871e-07, "loss": 0.2139, "num_input_tokens_seen": 20955136, "step": 5116 }, { "epoch": 3.8129657228017884, "grad_norm": 9.188681850475744, "learning_rate": 3.3184861224208524e-07, "loss": 0.1918, "num_input_tokens_seen": 20959232, "step": 5117 }, { "epoch": 3.81371087928465, "grad_norm": 8.00531004084413, "learning_rate": 3.3145157423772415e-07, "loss": 0.1801, "num_input_tokens_seen": 20963328, "step": 5118 }, { "epoch": 3.814456035767511, "grad_norm": 13.16647643407807, "learning_rate": 3.3105473758573675e-07, "loss": 0.1179, "num_input_tokens_seen": 20967424, "step": 5119 }, { "epoch": 3.8152011922503726, "grad_norm": 6.1443193064876915, "learning_rate": 3.30658102373113e-07, "loss": 0.1277, "num_input_tokens_seen": 20971520, "step": 5120 }, { "epoch": 3.815946348733234, "grad_norm": 8.601541619922546, "learning_rate": 3.3026166868679793e-07, "loss": 0.2126, "num_input_tokens_seen": 20975616, "step": 5121 }, { "epoch": 3.8166915052160952, "grad_norm": 9.368376484596853, "learning_rate": 3.298654366136922e-07, "loss": 0.172, "num_input_tokens_seen": 20979712, "step": 5122 }, { "epoch": 3.817436661698957, "grad_norm": 6.3668926704836295, "learning_rate": 3.294694062406535e-07, "loss": 0.0488, "num_input_tokens_seen": 20983808, "step": 5123 }, { "epoch": 3.8181818181818183, "grad_norm": 8.714036641184183, "learning_rate": 3.290735776544937e-07, "loss": 0.3035, "num_input_tokens_seen": 20987904, "step": 5124 }, { "epoch": 3.8189269746646795, "grad_norm": 8.154975528768391, "learning_rate": 3.2867795094198214e-07, "loss": 0.1833, "num_input_tokens_seen": 20992000, "step": 5125 }, { "epoch": 3.819672131147541, "grad_norm": 9.989838414840818, "learning_rate": 3.2828252618984214e-07, "loss": 0.1139, "num_input_tokens_seen": 20996096, "step": 5126 }, { "epoch": 3.820417287630402, "grad_norm": 10.510717471046405, "learning_rate": 3.278873034847549e-07, "loss": 0.1161, "num_input_tokens_seen": 21000192, "step": 5127 }, { "epoch": 3.8211624441132637, "grad_norm": 6.338139046402464, "learning_rate": 3.2749228291335435e-07, "loss": 0.1992, "num_input_tokens_seen": 21004288, "step": 5128 }, { "epoch": 3.821907600596125, "grad_norm": 8.456394128833749, "learning_rate": 3.2709746456223307e-07, "loss": 0.2023, "num_input_tokens_seen": 21008384, "step": 5129 }, { "epoch": 3.8226527570789868, "grad_norm": 9.923177591557366, "learning_rate": 3.267028485179377e-07, "loss": 0.3235, "num_input_tokens_seen": 21012480, "step": 5130 }, { "epoch": 3.823397913561848, "grad_norm": 7.01854644805526, "learning_rate": 3.263084348669704e-07, "loss": 0.088, "num_input_tokens_seen": 21016576, "step": 5131 }, { "epoch": 3.8241430700447094, "grad_norm": 9.587881101604953, "learning_rate": 3.2591422369579045e-07, "loss": 0.2573, "num_input_tokens_seen": 21020672, "step": 5132 }, { "epoch": 3.8248882265275705, "grad_norm": 6.436986938702662, "learning_rate": 3.255202150908107e-07, "loss": 0.0916, "num_input_tokens_seen": 21024768, "step": 5133 }, { "epoch": 3.825633383010432, "grad_norm": 8.205282259567221, "learning_rate": 3.251264091384017e-07, "loss": 0.2449, "num_input_tokens_seen": 21028864, "step": 5134 }, { "epoch": 3.8263785394932937, "grad_norm": 8.462035662350681, "learning_rate": 3.247328059248875e-07, "loss": 0.1346, "num_input_tokens_seen": 21032960, "step": 5135 }, { "epoch": 3.827123695976155, "grad_norm": 7.6719128902090015, "learning_rate": 3.2433940553654965e-07, "loss": 0.1421, "num_input_tokens_seen": 21037056, "step": 5136 }, { "epoch": 3.8278688524590163, "grad_norm": 7.686990080550256, "learning_rate": 3.2394620805962397e-07, "loss": 0.1914, "num_input_tokens_seen": 21041152, "step": 5137 }, { "epoch": 3.828614008941878, "grad_norm": 7.408066345603902, "learning_rate": 3.235532135803017e-07, "loss": 0.2803, "num_input_tokens_seen": 21045248, "step": 5138 }, { "epoch": 3.829359165424739, "grad_norm": 6.83114207176463, "learning_rate": 3.231604221847308e-07, "loss": 0.1143, "num_input_tokens_seen": 21049344, "step": 5139 }, { "epoch": 3.8301043219076005, "grad_norm": 9.590567245893228, "learning_rate": 3.227678339590132e-07, "loss": 0.1222, "num_input_tokens_seen": 21053440, "step": 5140 }, { "epoch": 3.830849478390462, "grad_norm": 9.392783030922054, "learning_rate": 3.2237544898920807e-07, "loss": 0.1329, "num_input_tokens_seen": 21057536, "step": 5141 }, { "epoch": 3.8315946348733236, "grad_norm": 10.126684579546296, "learning_rate": 3.2198326736132765e-07, "loss": 0.0994, "num_input_tokens_seen": 21061632, "step": 5142 }, { "epoch": 3.8323397913561847, "grad_norm": 9.693684382127023, "learning_rate": 3.2159128916134223e-07, "loss": 0.2369, "num_input_tokens_seen": 21065728, "step": 5143 }, { "epoch": 3.8330849478390463, "grad_norm": 7.383266948057696, "learning_rate": 3.211995144751751e-07, "loss": 0.0736, "num_input_tokens_seen": 21069824, "step": 5144 }, { "epoch": 3.8338301043219074, "grad_norm": 9.860887859099465, "learning_rate": 3.208079433887071e-07, "loss": 0.1019, "num_input_tokens_seen": 21073920, "step": 5145 }, { "epoch": 3.834575260804769, "grad_norm": 9.50942374696115, "learning_rate": 3.204165759877724e-07, "loss": 0.244, "num_input_tokens_seen": 21078016, "step": 5146 }, { "epoch": 3.8353204172876305, "grad_norm": 8.07253430371459, "learning_rate": 3.2002541235816256e-07, "loss": 0.2363, "num_input_tokens_seen": 21082112, "step": 5147 }, { "epoch": 3.836065573770492, "grad_norm": 7.263144078632956, "learning_rate": 3.196344525856229e-07, "loss": 0.0844, "num_input_tokens_seen": 21086208, "step": 5148 }, { "epoch": 3.836810730253353, "grad_norm": 7.764610846733054, "learning_rate": 3.192436967558543e-07, "loss": 0.1454, "num_input_tokens_seen": 21090304, "step": 5149 }, { "epoch": 3.8375558867362147, "grad_norm": 7.719032728872466, "learning_rate": 3.188531449545139e-07, "loss": 0.1105, "num_input_tokens_seen": 21094400, "step": 5150 }, { "epoch": 3.838301043219076, "grad_norm": 7.4871070644643165, "learning_rate": 3.1846279726721273e-07, "loss": 0.0635, "num_input_tokens_seen": 21098496, "step": 5151 }, { "epoch": 3.8390461997019374, "grad_norm": 8.37317292238411, "learning_rate": 3.180726537795187e-07, "loss": 0.1298, "num_input_tokens_seen": 21102592, "step": 5152 }, { "epoch": 3.839791356184799, "grad_norm": 8.343617558397048, "learning_rate": 3.1768271457695286e-07, "loss": 0.1064, "num_input_tokens_seen": 21106688, "step": 5153 }, { "epoch": 3.84053651266766, "grad_norm": 7.942791408345878, "learning_rate": 3.1729297974499434e-07, "loss": 0.131, "num_input_tokens_seen": 21110784, "step": 5154 }, { "epoch": 3.8412816691505216, "grad_norm": 7.870394656244454, "learning_rate": 3.169034493690738e-07, "loss": 0.0412, "num_input_tokens_seen": 21114880, "step": 5155 }, { "epoch": 3.842026825633383, "grad_norm": 8.092636685000427, "learning_rate": 3.165141235345806e-07, "loss": 0.1742, "num_input_tokens_seen": 21118976, "step": 5156 }, { "epoch": 3.8427719821162443, "grad_norm": 10.286810885138877, "learning_rate": 3.1612500232685724e-07, "loss": 0.2302, "num_input_tokens_seen": 21123072, "step": 5157 }, { "epoch": 3.843517138599106, "grad_norm": 5.6636946381633, "learning_rate": 3.1573608583120125e-07, "loss": 0.049, "num_input_tokens_seen": 21127168, "step": 5158 }, { "epoch": 3.8442622950819674, "grad_norm": 4.815556083156387, "learning_rate": 3.1534737413286705e-07, "loss": 0.0398, "num_input_tokens_seen": 21131264, "step": 5159 }, { "epoch": 3.8450074515648285, "grad_norm": 6.075098131678899, "learning_rate": 3.14958867317062e-07, "loss": 0.0601, "num_input_tokens_seen": 21135360, "step": 5160 }, { "epoch": 3.84575260804769, "grad_norm": 7.530302341450228, "learning_rate": 3.1457056546895045e-07, "loss": 0.0811, "num_input_tokens_seen": 21139456, "step": 5161 }, { "epoch": 3.846497764530551, "grad_norm": 9.803976170071262, "learning_rate": 3.1418246867365015e-07, "loss": 0.2419, "num_input_tokens_seen": 21143552, "step": 5162 }, { "epoch": 3.8472429210134127, "grad_norm": 7.509377863168905, "learning_rate": 3.137945770162354e-07, "loss": 0.2113, "num_input_tokens_seen": 21147648, "step": 5163 }, { "epoch": 3.8479880774962743, "grad_norm": 8.714723697542624, "learning_rate": 3.13406890581734e-07, "loss": 0.1305, "num_input_tokens_seen": 21151744, "step": 5164 }, { "epoch": 3.848733233979136, "grad_norm": 12.014281736817745, "learning_rate": 3.130194094551306e-07, "loss": 0.2488, "num_input_tokens_seen": 21155840, "step": 5165 }, { "epoch": 3.849478390461997, "grad_norm": 8.935371554786782, "learning_rate": 3.1263213372136324e-07, "loss": 0.2367, "num_input_tokens_seen": 21159936, "step": 5166 }, { "epoch": 3.8502235469448585, "grad_norm": 10.252647069226912, "learning_rate": 3.122450634653251e-07, "loss": 0.1149, "num_input_tokens_seen": 21164032, "step": 5167 }, { "epoch": 3.8509687034277196, "grad_norm": 8.66189722446165, "learning_rate": 3.1185819877186594e-07, "loss": 0.1834, "num_input_tokens_seen": 21168128, "step": 5168 }, { "epoch": 3.851713859910581, "grad_norm": 7.819462509437637, "learning_rate": 3.1147153972578796e-07, "loss": 0.079, "num_input_tokens_seen": 21172224, "step": 5169 }, { "epoch": 3.8524590163934427, "grad_norm": 7.815338935367751, "learning_rate": 3.110850864118504e-07, "loss": 0.1235, "num_input_tokens_seen": 21176320, "step": 5170 }, { "epoch": 3.8532041728763042, "grad_norm": 11.42978588470567, "learning_rate": 3.106988389147661e-07, "loss": 0.1735, "num_input_tokens_seen": 21180416, "step": 5171 }, { "epoch": 3.8539493293591653, "grad_norm": 11.540374505767153, "learning_rate": 3.1031279731920385e-07, "loss": 0.2869, "num_input_tokens_seen": 21184512, "step": 5172 }, { "epoch": 3.854694485842027, "grad_norm": 11.047250087484697, "learning_rate": 3.0992696170978593e-07, "loss": 0.1477, "num_input_tokens_seen": 21188608, "step": 5173 }, { "epoch": 3.855439642324888, "grad_norm": 7.707534096817511, "learning_rate": 3.095413321710912e-07, "loss": 0.1501, "num_input_tokens_seen": 21192704, "step": 5174 }, { "epoch": 3.8561847988077496, "grad_norm": 8.403213605023366, "learning_rate": 3.091559087876518e-07, "loss": 0.2362, "num_input_tokens_seen": 21196800, "step": 5175 }, { "epoch": 3.856929955290611, "grad_norm": 7.2298217578412, "learning_rate": 3.087706916439552e-07, "loss": 0.0446, "num_input_tokens_seen": 21200896, "step": 5176 }, { "epoch": 3.8576751117734727, "grad_norm": 11.613245769894927, "learning_rate": 3.0838568082444424e-07, "loss": 0.1218, "num_input_tokens_seen": 21204992, "step": 5177 }, { "epoch": 3.8584202682563338, "grad_norm": 7.629791698490568, "learning_rate": 3.0800087641351546e-07, "loss": 0.0322, "num_input_tokens_seen": 21209088, "step": 5178 }, { "epoch": 3.8591654247391953, "grad_norm": 9.96950489793087, "learning_rate": 3.076162784955214e-07, "loss": 0.2138, "num_input_tokens_seen": 21213184, "step": 5179 }, { "epoch": 3.8599105812220564, "grad_norm": 8.46072324243618, "learning_rate": 3.0723188715476806e-07, "loss": 0.2116, "num_input_tokens_seen": 21217280, "step": 5180 }, { "epoch": 3.860655737704918, "grad_norm": 10.270581501375887, "learning_rate": 3.0684770247551757e-07, "loss": 0.1996, "num_input_tokens_seen": 21221376, "step": 5181 }, { "epoch": 3.8614008941877795, "grad_norm": 7.745645182824834, "learning_rate": 3.064637245419848e-07, "loss": 0.0574, "num_input_tokens_seen": 21225472, "step": 5182 }, { "epoch": 3.862146050670641, "grad_norm": 7.054322186126784, "learning_rate": 3.060799534383413e-07, "loss": 0.1104, "num_input_tokens_seen": 21229568, "step": 5183 }, { "epoch": 3.862891207153502, "grad_norm": 7.5394799027509265, "learning_rate": 3.056963892487119e-07, "loss": 0.1929, "num_input_tokens_seen": 21233664, "step": 5184 }, { "epoch": 3.8636363636363638, "grad_norm": 7.992100555859877, "learning_rate": 3.0531303205717733e-07, "loss": 0.0605, "num_input_tokens_seen": 21237760, "step": 5185 }, { "epoch": 3.864381520119225, "grad_norm": 10.810945170725974, "learning_rate": 3.049298819477717e-07, "loss": 0.256, "num_input_tokens_seen": 21241856, "step": 5186 }, { "epoch": 3.8651266766020864, "grad_norm": 8.31953139743545, "learning_rate": 3.0454693900448406e-07, "loss": 0.106, "num_input_tokens_seen": 21245952, "step": 5187 }, { "epoch": 3.865871833084948, "grad_norm": 10.644757169189491, "learning_rate": 3.0416420331125877e-07, "loss": 0.4012, "num_input_tokens_seen": 21250048, "step": 5188 }, { "epoch": 3.866616989567809, "grad_norm": 7.688024750399014, "learning_rate": 3.037816749519938e-07, "loss": 0.0621, "num_input_tokens_seen": 21254144, "step": 5189 }, { "epoch": 3.8673621460506706, "grad_norm": 7.586234810922204, "learning_rate": 3.033993540105426e-07, "loss": 0.1477, "num_input_tokens_seen": 21258240, "step": 5190 }, { "epoch": 3.868107302533532, "grad_norm": 8.063284353361736, "learning_rate": 3.0301724057071186e-07, "loss": 0.1619, "num_input_tokens_seen": 21262336, "step": 5191 }, { "epoch": 3.8688524590163933, "grad_norm": 7.594014478504872, "learning_rate": 3.0263533471626443e-07, "loss": 0.1156, "num_input_tokens_seen": 21266432, "step": 5192 }, { "epoch": 3.869597615499255, "grad_norm": 6.449944822266423, "learning_rate": 3.022536365309166e-07, "loss": 0.1014, "num_input_tokens_seen": 21270528, "step": 5193 }, { "epoch": 3.8703427719821164, "grad_norm": 6.654479071813289, "learning_rate": 3.0187214609833856e-07, "loss": 0.1191, "num_input_tokens_seen": 21274624, "step": 5194 }, { "epoch": 3.8710879284649775, "grad_norm": 7.629339931314565, "learning_rate": 3.014908635021573e-07, "loss": 0.2084, "num_input_tokens_seen": 21278720, "step": 5195 }, { "epoch": 3.871833084947839, "grad_norm": 5.786101042433077, "learning_rate": 3.0110978882595077e-07, "loss": 0.1299, "num_input_tokens_seen": 21282816, "step": 5196 }, { "epoch": 3.8725782414307, "grad_norm": 8.266053461176737, "learning_rate": 3.007289221532547e-07, "loss": 0.1651, "num_input_tokens_seen": 21286912, "step": 5197 }, { "epoch": 3.8733233979135617, "grad_norm": 8.582692707184394, "learning_rate": 3.003482635675568e-07, "loss": 0.2609, "num_input_tokens_seen": 21291008, "step": 5198 }, { "epoch": 3.8740685543964233, "grad_norm": 8.778433011242631, "learning_rate": 2.9996781315230125e-07, "loss": 0.1516, "num_input_tokens_seen": 21295104, "step": 5199 }, { "epoch": 3.874813710879285, "grad_norm": 7.4835882811304, "learning_rate": 2.995875709908842e-07, "loss": 0.16, "num_input_tokens_seen": 21299200, "step": 5200 }, { "epoch": 3.875558867362146, "grad_norm": 9.567298931989484, "learning_rate": 2.9920753716665876e-07, "loss": 0.3773, "num_input_tokens_seen": 21303296, "step": 5201 }, { "epoch": 3.8763040238450075, "grad_norm": 8.874517079638412, "learning_rate": 2.988277117629304e-07, "loss": 0.1197, "num_input_tokens_seen": 21307392, "step": 5202 }, { "epoch": 3.8770491803278686, "grad_norm": 12.80020277575972, "learning_rate": 2.9844809486295923e-07, "loss": 0.2286, "num_input_tokens_seen": 21311488, "step": 5203 }, { "epoch": 3.87779433681073, "grad_norm": 7.836951538091991, "learning_rate": 2.980686865499606e-07, "loss": 0.0916, "num_input_tokens_seen": 21315584, "step": 5204 }, { "epoch": 3.8785394932935917, "grad_norm": 8.152520881706634, "learning_rate": 2.97689486907103e-07, "loss": 0.1397, "num_input_tokens_seen": 21319680, "step": 5205 }, { "epoch": 3.8792846497764533, "grad_norm": 9.372703859109262, "learning_rate": 2.9731049601751035e-07, "loss": 0.1619, "num_input_tokens_seen": 21323776, "step": 5206 }, { "epoch": 3.8800298062593144, "grad_norm": 11.114118101113764, "learning_rate": 2.969317139642594e-07, "loss": 0.1085, "num_input_tokens_seen": 21327872, "step": 5207 }, { "epoch": 3.880774962742176, "grad_norm": 12.041541143877593, "learning_rate": 2.9655314083038303e-07, "loss": 0.1919, "num_input_tokens_seen": 21331968, "step": 5208 }, { "epoch": 3.881520119225037, "grad_norm": 9.210499785870233, "learning_rate": 2.961747766988657e-07, "loss": 0.2353, "num_input_tokens_seen": 21336064, "step": 5209 }, { "epoch": 3.8822652757078986, "grad_norm": 6.913065389909078, "learning_rate": 2.9579662165264853e-07, "loss": 0.1693, "num_input_tokens_seen": 21340160, "step": 5210 }, { "epoch": 3.88301043219076, "grad_norm": 9.954652777511448, "learning_rate": 2.954186757746251e-07, "loss": 0.2483, "num_input_tokens_seen": 21344256, "step": 5211 }, { "epoch": 3.8837555886736217, "grad_norm": 8.538018044867579, "learning_rate": 2.950409391476447e-07, "loss": 0.1957, "num_input_tokens_seen": 21348352, "step": 5212 }, { "epoch": 3.884500745156483, "grad_norm": 9.159747430453724, "learning_rate": 2.9466341185450955e-07, "loss": 0.2017, "num_input_tokens_seen": 21352448, "step": 5213 }, { "epoch": 3.8852459016393444, "grad_norm": 10.861061361198589, "learning_rate": 2.942860939779757e-07, "loss": 0.1872, "num_input_tokens_seen": 21356544, "step": 5214 }, { "epoch": 3.8859910581222055, "grad_norm": 7.669444428434653, "learning_rate": 2.939089856007549e-07, "loss": 0.1437, "num_input_tokens_seen": 21360640, "step": 5215 }, { "epoch": 3.886736214605067, "grad_norm": 8.624487491196739, "learning_rate": 2.9353208680551124e-07, "loss": 0.2721, "num_input_tokens_seen": 21364736, "step": 5216 }, { "epoch": 3.8874813710879286, "grad_norm": 9.117890332636097, "learning_rate": 2.931553976748643e-07, "loss": 0.1671, "num_input_tokens_seen": 21368832, "step": 5217 }, { "epoch": 3.88822652757079, "grad_norm": 7.660041437182086, "learning_rate": 2.927789182913862e-07, "loss": 0.1406, "num_input_tokens_seen": 21372928, "step": 5218 }, { "epoch": 3.8889716840536512, "grad_norm": 8.95375281367105, "learning_rate": 2.9240264873760473e-07, "loss": 0.1163, "num_input_tokens_seen": 21377024, "step": 5219 }, { "epoch": 3.889716840536513, "grad_norm": 9.447325996094873, "learning_rate": 2.9202658909600056e-07, "loss": 0.2495, "num_input_tokens_seen": 21381120, "step": 5220 }, { "epoch": 3.890461997019374, "grad_norm": 10.41622341082462, "learning_rate": 2.916507394490084e-07, "loss": 0.2193, "num_input_tokens_seen": 21385216, "step": 5221 }, { "epoch": 3.8912071535022354, "grad_norm": 11.53315543660945, "learning_rate": 2.9127509987901787e-07, "loss": 0.2178, "num_input_tokens_seen": 21389312, "step": 5222 }, { "epoch": 3.891952309985097, "grad_norm": 7.887820360809616, "learning_rate": 2.9089967046837074e-07, "loss": 0.2654, "num_input_tokens_seen": 21393408, "step": 5223 }, { "epoch": 3.892697466467958, "grad_norm": 6.737409148163655, "learning_rate": 2.9052445129936477e-07, "loss": 0.0661, "num_input_tokens_seen": 21397504, "step": 5224 }, { "epoch": 3.8934426229508197, "grad_norm": 9.306209785240824, "learning_rate": 2.9014944245425003e-07, "loss": 0.1448, "num_input_tokens_seen": 21401600, "step": 5225 }, { "epoch": 3.894187779433681, "grad_norm": 8.631475931715393, "learning_rate": 2.897746440152317e-07, "loss": 0.1463, "num_input_tokens_seen": 21405696, "step": 5226 }, { "epoch": 3.8949329359165423, "grad_norm": 6.338201816525286, "learning_rate": 2.8940005606446776e-07, "loss": 0.05, "num_input_tokens_seen": 21409792, "step": 5227 }, { "epoch": 3.895678092399404, "grad_norm": 8.77814439535771, "learning_rate": 2.8902567868407115e-07, "loss": 0.1909, "num_input_tokens_seen": 21413888, "step": 5228 }, { "epoch": 3.8964232488822654, "grad_norm": 7.349453067196026, "learning_rate": 2.886515119561079e-07, "loss": 0.1893, "num_input_tokens_seen": 21417984, "step": 5229 }, { "epoch": 3.8971684053651265, "grad_norm": 10.166591356163984, "learning_rate": 2.882775559625972e-07, "loss": 0.1491, "num_input_tokens_seen": 21422080, "step": 5230 }, { "epoch": 3.897913561847988, "grad_norm": 8.394702889737056, "learning_rate": 2.879038107855141e-07, "loss": 0.0373, "num_input_tokens_seen": 21426176, "step": 5231 }, { "epoch": 3.898658718330849, "grad_norm": 10.231601676285486, "learning_rate": 2.875302765067853e-07, "loss": 0.1792, "num_input_tokens_seen": 21430272, "step": 5232 }, { "epoch": 3.8994038748137108, "grad_norm": 10.040870263070412, "learning_rate": 2.871569532082928e-07, "loss": 0.2626, "num_input_tokens_seen": 21434368, "step": 5233 }, { "epoch": 3.9001490312965723, "grad_norm": 9.143015520889591, "learning_rate": 2.8678384097187124e-07, "loss": 0.0881, "num_input_tokens_seen": 21438464, "step": 5234 }, { "epoch": 3.900894187779434, "grad_norm": 10.08501727316964, "learning_rate": 2.8641093987931036e-07, "loss": 0.2729, "num_input_tokens_seen": 21442560, "step": 5235 }, { "epoch": 3.901639344262295, "grad_norm": 9.024582616385384, "learning_rate": 2.8603825001235137e-07, "loss": 0.1363, "num_input_tokens_seen": 21446656, "step": 5236 }, { "epoch": 3.9023845007451565, "grad_norm": 6.377219800553473, "learning_rate": 2.856657714526917e-07, "loss": 0.1304, "num_input_tokens_seen": 21450752, "step": 5237 }, { "epoch": 3.9031296572280176, "grad_norm": 10.527081054331367, "learning_rate": 2.8529350428198043e-07, "loss": 0.2242, "num_input_tokens_seen": 21454848, "step": 5238 }, { "epoch": 3.903874813710879, "grad_norm": 9.887691769239252, "learning_rate": 2.8492144858182206e-07, "loss": 0.1766, "num_input_tokens_seen": 21458944, "step": 5239 }, { "epoch": 3.9046199701937407, "grad_norm": 6.088816740144383, "learning_rate": 2.8454960443377355e-07, "loss": 0.1496, "num_input_tokens_seen": 21463040, "step": 5240 }, { "epoch": 3.9053651266766023, "grad_norm": 16.52544703065565, "learning_rate": 2.841779719193452e-07, "loss": 0.1858, "num_input_tokens_seen": 21467136, "step": 5241 }, { "epoch": 3.9061102831594634, "grad_norm": 9.594454813503955, "learning_rate": 2.8380655112000237e-07, "loss": 0.2303, "num_input_tokens_seen": 21471232, "step": 5242 }, { "epoch": 3.906855439642325, "grad_norm": 10.151153290476065, "learning_rate": 2.8343534211716233e-07, "loss": 0.3115, "num_input_tokens_seen": 21475328, "step": 5243 }, { "epoch": 3.907600596125186, "grad_norm": 7.796724086699613, "learning_rate": 2.830643449921976e-07, "loss": 0.16, "num_input_tokens_seen": 21479424, "step": 5244 }, { "epoch": 3.9083457526080476, "grad_norm": 8.563009376546672, "learning_rate": 2.8269355982643273e-07, "loss": 0.1659, "num_input_tokens_seen": 21483520, "step": 5245 }, { "epoch": 3.909090909090909, "grad_norm": 6.99614400702289, "learning_rate": 2.823229867011469e-07, "loss": 0.1977, "num_input_tokens_seen": 21487616, "step": 5246 }, { "epoch": 3.9098360655737707, "grad_norm": 9.52892302512039, "learning_rate": 2.8195262569757245e-07, "loss": 0.3062, "num_input_tokens_seen": 21491712, "step": 5247 }, { "epoch": 3.910581222056632, "grad_norm": 13.543440891347327, "learning_rate": 2.8158247689689433e-07, "loss": 0.2225, "num_input_tokens_seen": 21495808, "step": 5248 }, { "epoch": 3.9113263785394934, "grad_norm": 7.956703928577812, "learning_rate": 2.812125403802529e-07, "loss": 0.1545, "num_input_tokens_seen": 21499904, "step": 5249 }, { "epoch": 3.9120715350223545, "grad_norm": 9.622294543462235, "learning_rate": 2.8084281622874036e-07, "loss": 0.1545, "num_input_tokens_seen": 21504000, "step": 5250 }, { "epoch": 3.912816691505216, "grad_norm": 6.801524663069386, "learning_rate": 2.804733045234029e-07, "loss": 0.1163, "num_input_tokens_seen": 21508096, "step": 5251 }, { "epoch": 3.9135618479880776, "grad_norm": 10.037000406596697, "learning_rate": 2.801040053452396e-07, "loss": 0.1452, "num_input_tokens_seen": 21512192, "step": 5252 }, { "epoch": 3.914307004470939, "grad_norm": 6.648934807939824, "learning_rate": 2.7973491877520453e-07, "loss": 0.07, "num_input_tokens_seen": 21516288, "step": 5253 }, { "epoch": 3.9150521609538003, "grad_norm": 7.981832461011363, "learning_rate": 2.793660448942033e-07, "loss": 0.1597, "num_input_tokens_seen": 21520384, "step": 5254 }, { "epoch": 3.915797317436662, "grad_norm": 9.251002915502394, "learning_rate": 2.789973837830962e-07, "loss": 0.2127, "num_input_tokens_seen": 21524480, "step": 5255 }, { "epoch": 3.916542473919523, "grad_norm": 10.451360626548084, "learning_rate": 2.786289355226958e-07, "loss": 0.0644, "num_input_tokens_seen": 21528576, "step": 5256 }, { "epoch": 3.9172876304023845, "grad_norm": 9.460203015469853, "learning_rate": 2.7826070019376927e-07, "loss": 0.3301, "num_input_tokens_seen": 21532672, "step": 5257 }, { "epoch": 3.918032786885246, "grad_norm": 8.279474579316334, "learning_rate": 2.778926778770362e-07, "loss": 0.2124, "num_input_tokens_seen": 21536768, "step": 5258 }, { "epoch": 3.918777943368107, "grad_norm": 9.042178664817904, "learning_rate": 2.775248686531692e-07, "loss": 0.1784, "num_input_tokens_seen": 21540864, "step": 5259 }, { "epoch": 3.9195230998509687, "grad_norm": 7.517279936024689, "learning_rate": 2.771572726027955e-07, "loss": 0.2092, "num_input_tokens_seen": 21544960, "step": 5260 }, { "epoch": 3.9202682563338302, "grad_norm": 8.821790936533295, "learning_rate": 2.7678988980649384e-07, "loss": 0.2987, "num_input_tokens_seen": 21549056, "step": 5261 }, { "epoch": 3.9210134128166914, "grad_norm": 8.221202660177163, "learning_rate": 2.764227203447985e-07, "loss": 0.158, "num_input_tokens_seen": 21553152, "step": 5262 }, { "epoch": 3.921758569299553, "grad_norm": 12.724170085331211, "learning_rate": 2.760557642981941e-07, "loss": 0.3821, "num_input_tokens_seen": 21557248, "step": 5263 }, { "epoch": 3.9225037257824145, "grad_norm": 8.52308825786646, "learning_rate": 2.756890217471211e-07, "loss": 0.2215, "num_input_tokens_seen": 21561344, "step": 5264 }, { "epoch": 3.9232488822652756, "grad_norm": 7.2708754583530775, "learning_rate": 2.7532249277197153e-07, "loss": 0.068, "num_input_tokens_seen": 21565440, "step": 5265 }, { "epoch": 3.923994038748137, "grad_norm": 9.238790692136371, "learning_rate": 2.749561774530915e-07, "loss": 0.1922, "num_input_tokens_seen": 21569536, "step": 5266 }, { "epoch": 3.9247391952309982, "grad_norm": 7.649886679773343, "learning_rate": 2.745900758707799e-07, "loss": 0.1229, "num_input_tokens_seen": 21573632, "step": 5267 }, { "epoch": 3.92548435171386, "grad_norm": 9.610677600885417, "learning_rate": 2.742241881052883e-07, "loss": 0.1545, "num_input_tokens_seen": 21577728, "step": 5268 }, { "epoch": 3.9262295081967213, "grad_norm": 13.003571435662302, "learning_rate": 2.7385851423682274e-07, "loss": 0.1112, "num_input_tokens_seen": 21581824, "step": 5269 }, { "epoch": 3.926974664679583, "grad_norm": 10.042458968774106, "learning_rate": 2.734930543455408e-07, "loss": 0.2239, "num_input_tokens_seen": 21585920, "step": 5270 }, { "epoch": 3.927719821162444, "grad_norm": 11.245168203216604, "learning_rate": 2.7312780851155464e-07, "loss": 0.3699, "num_input_tokens_seen": 21590016, "step": 5271 }, { "epoch": 3.9284649776453056, "grad_norm": 8.243409839472347, "learning_rate": 2.7276277681492783e-07, "loss": 0.1699, "num_input_tokens_seen": 21594112, "step": 5272 }, { "epoch": 3.9292101341281667, "grad_norm": 6.916908719685104, "learning_rate": 2.723979593356789e-07, "loss": 0.1581, "num_input_tokens_seen": 21598208, "step": 5273 }, { "epoch": 3.929955290611028, "grad_norm": 10.38053641614031, "learning_rate": 2.7203335615377786e-07, "loss": 0.2054, "num_input_tokens_seen": 21602304, "step": 5274 }, { "epoch": 3.9307004470938898, "grad_norm": 8.235674906291019, "learning_rate": 2.716689673491482e-07, "loss": 0.1813, "num_input_tokens_seen": 21606400, "step": 5275 }, { "epoch": 3.9314456035767513, "grad_norm": 8.003634990163505, "learning_rate": 2.7130479300166725e-07, "loss": 0.0625, "num_input_tokens_seen": 21610496, "step": 5276 }, { "epoch": 3.9321907600596124, "grad_norm": 7.824960211091366, "learning_rate": 2.709408331911641e-07, "loss": 0.1595, "num_input_tokens_seen": 21614592, "step": 5277 }, { "epoch": 3.932935916542474, "grad_norm": 8.157209895792086, "learning_rate": 2.7057708799742137e-07, "loss": 0.2597, "num_input_tokens_seen": 21618688, "step": 5278 }, { "epoch": 3.933681073025335, "grad_norm": 7.732924444098426, "learning_rate": 2.7021355750017435e-07, "loss": 0.0825, "num_input_tokens_seen": 21622784, "step": 5279 }, { "epoch": 3.9344262295081966, "grad_norm": 16.44398899583487, "learning_rate": 2.698502417791121e-07, "loss": 0.1629, "num_input_tokens_seen": 21626880, "step": 5280 }, { "epoch": 3.935171385991058, "grad_norm": 8.672006748179678, "learning_rate": 2.694871409138755e-07, "loss": 0.1954, "num_input_tokens_seen": 21630976, "step": 5281 }, { "epoch": 3.9359165424739198, "grad_norm": 6.118491650829617, "learning_rate": 2.6912425498405947e-07, "loss": 0.0752, "num_input_tokens_seen": 21635072, "step": 5282 }, { "epoch": 3.936661698956781, "grad_norm": 4.480573206716047, "learning_rate": 2.687615840692104e-07, "loss": 0.0414, "num_input_tokens_seen": 21639168, "step": 5283 }, { "epoch": 3.9374068554396424, "grad_norm": 8.153914342308752, "learning_rate": 2.683991282488292e-07, "loss": 0.2734, "num_input_tokens_seen": 21643264, "step": 5284 }, { "epoch": 3.9381520119225035, "grad_norm": 8.210689125784247, "learning_rate": 2.6803688760236827e-07, "loss": 0.101, "num_input_tokens_seen": 21647360, "step": 5285 }, { "epoch": 3.938897168405365, "grad_norm": 9.731878148051264, "learning_rate": 2.6767486220923304e-07, "loss": 0.2675, "num_input_tokens_seen": 21651456, "step": 5286 }, { "epoch": 3.9396423248882266, "grad_norm": 9.802544486001107, "learning_rate": 2.6731305214878296e-07, "loss": 0.326, "num_input_tokens_seen": 21655552, "step": 5287 }, { "epoch": 3.940387481371088, "grad_norm": 9.402221202842153, "learning_rate": 2.669514575003283e-07, "loss": 0.1267, "num_input_tokens_seen": 21659648, "step": 5288 }, { "epoch": 3.9411326378539493, "grad_norm": 6.934908824039143, "learning_rate": 2.665900783431345e-07, "loss": 0.0784, "num_input_tokens_seen": 21663744, "step": 5289 }, { "epoch": 3.941877794336811, "grad_norm": 7.886970897842643, "learning_rate": 2.6622891475641705e-07, "loss": 0.1231, "num_input_tokens_seen": 21667840, "step": 5290 }, { "epoch": 3.942622950819672, "grad_norm": 8.719745249524387, "learning_rate": 2.658679668193466e-07, "loss": 0.2125, "num_input_tokens_seen": 21671936, "step": 5291 }, { "epoch": 3.9433681073025335, "grad_norm": 8.268334955978942, "learning_rate": 2.655072346110449e-07, "loss": 0.2069, "num_input_tokens_seen": 21676032, "step": 5292 }, { "epoch": 3.944113263785395, "grad_norm": 10.689796136794797, "learning_rate": 2.651467182105874e-07, "loss": 0.1829, "num_input_tokens_seen": 21680128, "step": 5293 }, { "epoch": 3.944858420268256, "grad_norm": 10.97032300902012, "learning_rate": 2.647864176970019e-07, "loss": 0.3034, "num_input_tokens_seen": 21684224, "step": 5294 }, { "epoch": 3.9456035767511177, "grad_norm": 8.873681853419331, "learning_rate": 2.644263331492683e-07, "loss": 0.1746, "num_input_tokens_seen": 21688320, "step": 5295 }, { "epoch": 3.9463487332339793, "grad_norm": 8.480439998871416, "learning_rate": 2.640664646463205e-07, "loss": 0.1116, "num_input_tokens_seen": 21692416, "step": 5296 }, { "epoch": 3.9470938897168404, "grad_norm": 7.544539269472272, "learning_rate": 2.6370681226704344e-07, "loss": 0.1164, "num_input_tokens_seen": 21696512, "step": 5297 }, { "epoch": 3.947839046199702, "grad_norm": 8.979167181072409, "learning_rate": 2.633473760902762e-07, "loss": 0.108, "num_input_tokens_seen": 21700608, "step": 5298 }, { "epoch": 3.9485842026825635, "grad_norm": 8.655560782896174, "learning_rate": 2.6298815619480914e-07, "loss": 0.2341, "num_input_tokens_seen": 21704704, "step": 5299 }, { "epoch": 3.9493293591654246, "grad_norm": 8.111636255432254, "learning_rate": 2.6262915265938657e-07, "loss": 0.1148, "num_input_tokens_seen": 21708800, "step": 5300 }, { "epoch": 3.950074515648286, "grad_norm": 7.261960353609152, "learning_rate": 2.6227036556270414e-07, "loss": 0.114, "num_input_tokens_seen": 21712896, "step": 5301 }, { "epoch": 3.9508196721311473, "grad_norm": 9.663048289577544, "learning_rate": 2.619117949834106e-07, "loss": 0.2155, "num_input_tokens_seen": 21716992, "step": 5302 }, { "epoch": 3.951564828614009, "grad_norm": 8.885894511039067, "learning_rate": 2.615534410001068e-07, "loss": 0.2292, "num_input_tokens_seen": 21721088, "step": 5303 }, { "epoch": 3.9523099850968704, "grad_norm": 6.6603652202015855, "learning_rate": 2.611953036913471e-07, "loss": 0.1463, "num_input_tokens_seen": 21725184, "step": 5304 }, { "epoch": 3.953055141579732, "grad_norm": 9.918373670788279, "learning_rate": 2.608373831356377e-07, "loss": 0.3142, "num_input_tokens_seen": 21729280, "step": 5305 }, { "epoch": 3.953800298062593, "grad_norm": 10.03738294982692, "learning_rate": 2.6047967941143684e-07, "loss": 0.2542, "num_input_tokens_seen": 21733376, "step": 5306 }, { "epoch": 3.9545454545454546, "grad_norm": 9.56142027393509, "learning_rate": 2.601221925971563e-07, "loss": 0.2036, "num_input_tokens_seen": 21737472, "step": 5307 }, { "epoch": 3.9552906110283157, "grad_norm": 10.78382176095111, "learning_rate": 2.597649227711592e-07, "loss": 0.4095, "num_input_tokens_seen": 21741568, "step": 5308 }, { "epoch": 3.9560357675111772, "grad_norm": 9.502262039961742, "learning_rate": 2.5940787001176234e-07, "loss": 0.1308, "num_input_tokens_seen": 21745664, "step": 5309 }, { "epoch": 3.956780923994039, "grad_norm": 10.810952274484752, "learning_rate": 2.5905103439723344e-07, "loss": 0.3563, "num_input_tokens_seen": 21749760, "step": 5310 }, { "epoch": 3.9575260804769004, "grad_norm": 9.014895225891951, "learning_rate": 2.5869441600579416e-07, "loss": 0.1882, "num_input_tokens_seen": 21753856, "step": 5311 }, { "epoch": 3.9582712369597615, "grad_norm": 7.025930500569543, "learning_rate": 2.5833801491561743e-07, "loss": 0.0728, "num_input_tokens_seen": 21757952, "step": 5312 }, { "epoch": 3.959016393442623, "grad_norm": 7.822236779254506, "learning_rate": 2.579818312048286e-07, "loss": 0.2258, "num_input_tokens_seen": 21762048, "step": 5313 }, { "epoch": 3.959761549925484, "grad_norm": 8.57644180303642, "learning_rate": 2.5762586495150635e-07, "loss": 0.1085, "num_input_tokens_seen": 21766144, "step": 5314 }, { "epoch": 3.9605067064083457, "grad_norm": 8.66723483329154, "learning_rate": 2.572701162336803e-07, "loss": 0.2091, "num_input_tokens_seen": 21770240, "step": 5315 }, { "epoch": 3.9612518628912072, "grad_norm": 11.0931610075851, "learning_rate": 2.569145851293342e-07, "loss": 0.1901, "num_input_tokens_seen": 21774336, "step": 5316 }, { "epoch": 3.961997019374069, "grad_norm": 10.834048988628552, "learning_rate": 2.565592717164017e-07, "loss": 0.1279, "num_input_tokens_seen": 21778432, "step": 5317 }, { "epoch": 3.96274217585693, "grad_norm": 9.938775481153224, "learning_rate": 2.5620417607277097e-07, "loss": 0.1859, "num_input_tokens_seen": 21782528, "step": 5318 }, { "epoch": 3.9634873323397914, "grad_norm": 8.246734023354355, "learning_rate": 2.558492982762807e-07, "loss": 0.1012, "num_input_tokens_seen": 21786624, "step": 5319 }, { "epoch": 3.9642324888226526, "grad_norm": 6.867381644846078, "learning_rate": 2.554946384047236e-07, "loss": 0.0789, "num_input_tokens_seen": 21790720, "step": 5320 }, { "epoch": 3.964977645305514, "grad_norm": 6.626595104967314, "learning_rate": 2.551401965358431e-07, "loss": 0.0792, "num_input_tokens_seen": 21794816, "step": 5321 }, { "epoch": 3.9657228017883757, "grad_norm": 5.717105205886918, "learning_rate": 2.5478597274733523e-07, "loss": 0.0356, "num_input_tokens_seen": 21798912, "step": 5322 }, { "epoch": 3.966467958271237, "grad_norm": 8.251742038037593, "learning_rate": 2.544319671168488e-07, "loss": 0.0606, "num_input_tokens_seen": 21803008, "step": 5323 }, { "epoch": 3.9672131147540983, "grad_norm": 8.80864527279382, "learning_rate": 2.540781797219838e-07, "loss": 0.168, "num_input_tokens_seen": 21807104, "step": 5324 }, { "epoch": 3.96795827123696, "grad_norm": 8.483793938064412, "learning_rate": 2.537246106402938e-07, "loss": 0.175, "num_input_tokens_seen": 21811200, "step": 5325 }, { "epoch": 3.968703427719821, "grad_norm": 10.434237869081365, "learning_rate": 2.533712599492827e-07, "loss": 0.317, "num_input_tokens_seen": 21815296, "step": 5326 }, { "epoch": 3.9694485842026825, "grad_norm": 8.82716282806967, "learning_rate": 2.530181277264085e-07, "loss": 0.1836, "num_input_tokens_seen": 21819392, "step": 5327 }, { "epoch": 3.970193740685544, "grad_norm": 7.3875628629856065, "learning_rate": 2.526652140490797e-07, "loss": 0.1517, "num_input_tokens_seen": 21823488, "step": 5328 }, { "epoch": 3.970938897168405, "grad_norm": 6.386264610896049, "learning_rate": 2.523125189946575e-07, "loss": 0.1307, "num_input_tokens_seen": 21827584, "step": 5329 }, { "epoch": 3.9716840536512668, "grad_norm": 8.556392881320216, "learning_rate": 2.5196004264045504e-07, "loss": 0.0917, "num_input_tokens_seen": 21831680, "step": 5330 }, { "epoch": 3.9724292101341283, "grad_norm": 8.869423483094554, "learning_rate": 2.5160778506373814e-07, "loss": 0.1298, "num_input_tokens_seen": 21835776, "step": 5331 }, { "epoch": 3.9731743666169894, "grad_norm": 6.578225263554753, "learning_rate": 2.5125574634172397e-07, "loss": 0.0394, "num_input_tokens_seen": 21839872, "step": 5332 }, { "epoch": 3.973919523099851, "grad_norm": 13.574564536384958, "learning_rate": 2.5090392655158156e-07, "loss": 0.1875, "num_input_tokens_seen": 21843968, "step": 5333 }, { "epoch": 3.9746646795827125, "grad_norm": 10.614338764929348, "learning_rate": 2.505523257704329e-07, "loss": 0.1655, "num_input_tokens_seen": 21848064, "step": 5334 }, { "epoch": 3.9754098360655736, "grad_norm": 8.118675903987585, "learning_rate": 2.502009440753508e-07, "loss": 0.0786, "num_input_tokens_seen": 21852160, "step": 5335 }, { "epoch": 3.976154992548435, "grad_norm": 9.628852036810938, "learning_rate": 2.4984978154336134e-07, "loss": 0.1122, "num_input_tokens_seen": 21856256, "step": 5336 }, { "epoch": 3.9769001490312967, "grad_norm": 8.799357593486604, "learning_rate": 2.49498838251441e-07, "loss": 0.2496, "num_input_tokens_seen": 21860352, "step": 5337 }, { "epoch": 3.977645305514158, "grad_norm": 10.576514577722705, "learning_rate": 2.4914811427652007e-07, "loss": 0.397, "num_input_tokens_seen": 21864448, "step": 5338 }, { "epoch": 3.9783904619970194, "grad_norm": 9.964929803425244, "learning_rate": 2.48797609695479e-07, "loss": 0.2169, "num_input_tokens_seen": 21868544, "step": 5339 }, { "epoch": 3.979135618479881, "grad_norm": 7.092718876046934, "learning_rate": 2.4844732458515097e-07, "loss": 0.1104, "num_input_tokens_seen": 21872640, "step": 5340 }, { "epoch": 3.979880774962742, "grad_norm": 7.69480485482692, "learning_rate": 2.4809725902232145e-07, "loss": 0.1101, "num_input_tokens_seen": 21876736, "step": 5341 }, { "epoch": 3.9806259314456036, "grad_norm": 10.46676201396501, "learning_rate": 2.477474130837265e-07, "loss": 0.1667, "num_input_tokens_seen": 21880832, "step": 5342 }, { "epoch": 3.9813710879284647, "grad_norm": 10.627172043062858, "learning_rate": 2.47397786846056e-07, "loss": 0.2525, "num_input_tokens_seen": 21884928, "step": 5343 }, { "epoch": 3.9821162444113263, "grad_norm": 7.145247682512179, "learning_rate": 2.4704838038594927e-07, "loss": 0.1291, "num_input_tokens_seen": 21889024, "step": 5344 }, { "epoch": 3.982861400894188, "grad_norm": 10.268997204924078, "learning_rate": 2.466991937799995e-07, "loss": 0.2516, "num_input_tokens_seen": 21893120, "step": 5345 }, { "epoch": 3.9836065573770494, "grad_norm": 9.09553218554874, "learning_rate": 2.463502271047505e-07, "loss": 0.131, "num_input_tokens_seen": 21897216, "step": 5346 }, { "epoch": 3.9843517138599105, "grad_norm": 11.298876515048704, "learning_rate": 2.460014804366986e-07, "loss": 0.2509, "num_input_tokens_seen": 21901312, "step": 5347 }, { "epoch": 3.985096870342772, "grad_norm": 9.36469087847383, "learning_rate": 2.4565295385229117e-07, "loss": 0.3511, "num_input_tokens_seen": 21905408, "step": 5348 }, { "epoch": 3.985842026825633, "grad_norm": 8.93161068765521, "learning_rate": 2.4530464742792817e-07, "loss": 0.2292, "num_input_tokens_seen": 21909504, "step": 5349 }, { "epoch": 3.9865871833084947, "grad_norm": 8.933323282619543, "learning_rate": 2.449565612399606e-07, "loss": 0.2006, "num_input_tokens_seen": 21913600, "step": 5350 }, { "epoch": 3.9873323397913563, "grad_norm": 11.366853794152224, "learning_rate": 2.446086953646912e-07, "loss": 0.2215, "num_input_tokens_seen": 21917696, "step": 5351 }, { "epoch": 3.988077496274218, "grad_norm": 9.414129042729488, "learning_rate": 2.4426104987837527e-07, "loss": 0.1945, "num_input_tokens_seen": 21921792, "step": 5352 }, { "epoch": 3.988822652757079, "grad_norm": 11.726964327947238, "learning_rate": 2.439136248572184e-07, "loss": 0.1366, "num_input_tokens_seen": 21925888, "step": 5353 }, { "epoch": 3.9895678092399405, "grad_norm": 7.6082522979378995, "learning_rate": 2.4356642037737937e-07, "loss": 0.0903, "num_input_tokens_seen": 21929984, "step": 5354 }, { "epoch": 3.9903129657228016, "grad_norm": 9.77270620688132, "learning_rate": 2.4321943651496733e-07, "loss": 0.2491, "num_input_tokens_seen": 21934080, "step": 5355 }, { "epoch": 3.991058122205663, "grad_norm": 8.623459413255844, "learning_rate": 2.4287267334604447e-07, "loss": 0.2654, "num_input_tokens_seen": 21938176, "step": 5356 }, { "epoch": 3.9918032786885247, "grad_norm": 11.235336816479085, "learning_rate": 2.4252613094662245e-07, "loss": 0.1184, "num_input_tokens_seen": 21942272, "step": 5357 }, { "epoch": 3.9925484351713862, "grad_norm": 9.30118239078797, "learning_rate": 2.4217980939266696e-07, "loss": 0.1818, "num_input_tokens_seen": 21946368, "step": 5358 }, { "epoch": 3.9932935916542474, "grad_norm": 6.647701505309874, "learning_rate": 2.4183370876009374e-07, "loss": 0.1041, "num_input_tokens_seen": 21950464, "step": 5359 }, { "epoch": 3.994038748137109, "grad_norm": 10.779513041978358, "learning_rate": 2.4148782912477006e-07, "loss": 0.1985, "num_input_tokens_seen": 21954560, "step": 5360 }, { "epoch": 3.99478390461997, "grad_norm": 5.314290129659739, "learning_rate": 2.411421705625161e-07, "loss": 0.0903, "num_input_tokens_seen": 21958656, "step": 5361 }, { "epoch": 3.9955290611028316, "grad_norm": 8.530053293846121, "learning_rate": 2.4079673314910177e-07, "loss": 0.2721, "num_input_tokens_seen": 21962752, "step": 5362 }, { "epoch": 3.996274217585693, "grad_norm": 7.872881197512344, "learning_rate": 2.4045151696025034e-07, "loss": 0.1261, "num_input_tokens_seen": 21966848, "step": 5363 }, { "epoch": 3.9970193740685542, "grad_norm": 9.106308369217224, "learning_rate": 2.401065220716349e-07, "loss": 0.2068, "num_input_tokens_seen": 21970944, "step": 5364 }, { "epoch": 3.997764530551416, "grad_norm": 7.298054521542728, "learning_rate": 2.397617485588814e-07, "loss": 0.1061, "num_input_tokens_seen": 21975040, "step": 5365 }, { "epoch": 3.9985096870342773, "grad_norm": 9.889772479699205, "learning_rate": 2.3941719649756637e-07, "loss": 0.2057, "num_input_tokens_seen": 21979136, "step": 5366 }, { "epoch": 3.9992548435171384, "grad_norm": 8.42887976122005, "learning_rate": 2.390728659632177e-07, "loss": 0.0964, "num_input_tokens_seen": 21983232, "step": 5367 }, { "epoch": 4.0, "grad_norm": 14.531690393736525, "learning_rate": 2.387287570313158e-07, "loss": 0.1162, "num_input_tokens_seen": 21987328, "step": 5368 }, { "epoch": 4.0007451564828616, "grad_norm": 6.752089484946587, "learning_rate": 2.3838486977729114e-07, "loss": 0.1092, "num_input_tokens_seen": 21991424, "step": 5369 }, { "epoch": 4.001490312965723, "grad_norm": 7.850367029065411, "learning_rate": 2.3804120427652733e-07, "loss": 0.3052, "num_input_tokens_seen": 21995520, "step": 5370 }, { "epoch": 4.002235469448584, "grad_norm": 5.510167491938007, "learning_rate": 2.3769776060435691e-07, "loss": 0.0736, "num_input_tokens_seen": 21999616, "step": 5371 }, { "epoch": 4.002980625931445, "grad_norm": 5.003739333482864, "learning_rate": 2.373545388360661e-07, "loss": 0.0774, "num_input_tokens_seen": 22003712, "step": 5372 }, { "epoch": 4.003725782414307, "grad_norm": 6.361676913407045, "learning_rate": 2.3701153904689106e-07, "loss": 0.0771, "num_input_tokens_seen": 22007808, "step": 5373 }, { "epoch": 4.004470938897168, "grad_norm": 5.885692256017705, "learning_rate": 2.366687613120204e-07, "loss": 0.1206, "num_input_tokens_seen": 22011904, "step": 5374 }, { "epoch": 4.00521609538003, "grad_norm": 4.633972078758514, "learning_rate": 2.3632620570659266e-07, "loss": 0.0864, "num_input_tokens_seen": 22016000, "step": 5375 }, { "epoch": 4.0059612518628915, "grad_norm": 2.982458465162498, "learning_rate": 2.359838723056994e-07, "loss": 0.0224, "num_input_tokens_seen": 22020096, "step": 5376 }, { "epoch": 4.006706408345752, "grad_norm": 4.813610260484316, "learning_rate": 2.3564176118438183e-07, "loss": 0.0512, "num_input_tokens_seen": 22024192, "step": 5377 }, { "epoch": 4.007451564828614, "grad_norm": 3.405808388371742, "learning_rate": 2.3529987241763323e-07, "loss": 0.0393, "num_input_tokens_seen": 22028288, "step": 5378 }, { "epoch": 4.008196721311475, "grad_norm": 6.749070356652768, "learning_rate": 2.3495820608039842e-07, "loss": 0.1498, "num_input_tokens_seen": 22032384, "step": 5379 }, { "epoch": 4.008941877794337, "grad_norm": 6.504742784006542, "learning_rate": 2.3461676224757256e-07, "loss": 0.1503, "num_input_tokens_seen": 22036480, "step": 5380 }, { "epoch": 4.009687034277198, "grad_norm": 7.730230635549539, "learning_rate": 2.3427554099400327e-07, "loss": 0.0779, "num_input_tokens_seen": 22040576, "step": 5381 }, { "epoch": 4.01043219076006, "grad_norm": 7.643832744181351, "learning_rate": 2.3393454239448804e-07, "loss": 0.1502, "num_input_tokens_seen": 22044672, "step": 5382 }, { "epoch": 4.011177347242921, "grad_norm": 6.871535652187539, "learning_rate": 2.335937665237771e-07, "loss": 0.1081, "num_input_tokens_seen": 22048768, "step": 5383 }, { "epoch": 4.011922503725782, "grad_norm": 4.776359814055444, "learning_rate": 2.332532134565696e-07, "loss": 0.0724, "num_input_tokens_seen": 22052864, "step": 5384 }, { "epoch": 4.012667660208644, "grad_norm": 9.786876311037199, "learning_rate": 2.329128832675183e-07, "loss": 0.178, "num_input_tokens_seen": 22056960, "step": 5385 }, { "epoch": 4.013412816691505, "grad_norm": 4.476716983755826, "learning_rate": 2.3257277603122561e-07, "loss": 0.0594, "num_input_tokens_seen": 22061056, "step": 5386 }, { "epoch": 4.014157973174367, "grad_norm": 4.736900533264497, "learning_rate": 2.3223289182224523e-07, "loss": 0.0542, "num_input_tokens_seen": 22065152, "step": 5387 }, { "epoch": 4.014903129657228, "grad_norm": 7.169946202698487, "learning_rate": 2.3189323071508272e-07, "loss": 0.0522, "num_input_tokens_seen": 22069248, "step": 5388 }, { "epoch": 4.015648286140089, "grad_norm": 6.829912587072523, "learning_rate": 2.3155379278419363e-07, "loss": 0.1551, "num_input_tokens_seen": 22073344, "step": 5389 }, { "epoch": 4.016393442622951, "grad_norm": 5.450983232391889, "learning_rate": 2.3121457810398595e-07, "loss": 0.1149, "num_input_tokens_seen": 22077440, "step": 5390 }, { "epoch": 4.017138599105812, "grad_norm": 6.982700988894623, "learning_rate": 2.3087558674881716e-07, "loss": 0.0861, "num_input_tokens_seen": 22081536, "step": 5391 }, { "epoch": 4.017883755588674, "grad_norm": 6.527846537820981, "learning_rate": 2.3053681879299724e-07, "loss": 0.1355, "num_input_tokens_seen": 22085632, "step": 5392 }, { "epoch": 4.018628912071535, "grad_norm": 7.727915974905336, "learning_rate": 2.3019827431078626e-07, "loss": 0.1672, "num_input_tokens_seen": 22089728, "step": 5393 }, { "epoch": 4.019374068554397, "grad_norm": 5.057693383668341, "learning_rate": 2.2985995337639538e-07, "loss": 0.0363, "num_input_tokens_seen": 22093824, "step": 5394 }, { "epoch": 4.0201192250372575, "grad_norm": 2.405424934045811, "learning_rate": 2.295218560639874e-07, "loss": 0.0106, "num_input_tokens_seen": 22097920, "step": 5395 }, { "epoch": 4.020864381520119, "grad_norm": 6.414275053677805, "learning_rate": 2.2918398244767505e-07, "loss": 0.0458, "num_input_tokens_seen": 22102016, "step": 5396 }, { "epoch": 4.021609538002981, "grad_norm": 3.771671603589899, "learning_rate": 2.2884633260152375e-07, "loss": 0.0322, "num_input_tokens_seen": 22106112, "step": 5397 }, { "epoch": 4.022354694485842, "grad_norm": 6.156460259297963, "learning_rate": 2.285089065995473e-07, "loss": 0.0493, "num_input_tokens_seen": 22110208, "step": 5398 }, { "epoch": 4.023099850968704, "grad_norm": 8.932560519078486, "learning_rate": 2.2817170451571307e-07, "loss": 0.0958, "num_input_tokens_seen": 22114304, "step": 5399 }, { "epoch": 4.023845007451565, "grad_norm": 4.854837844368374, "learning_rate": 2.2783472642393736e-07, "loss": 0.0479, "num_input_tokens_seen": 22118400, "step": 5400 }, { "epoch": 4.024590163934426, "grad_norm": 7.337310863755202, "learning_rate": 2.2749797239808886e-07, "loss": 0.0621, "num_input_tokens_seen": 22122496, "step": 5401 }, { "epoch": 4.0253353204172875, "grad_norm": 7.26015281277669, "learning_rate": 2.2716144251198572e-07, "loss": 0.0729, "num_input_tokens_seen": 22126592, "step": 5402 }, { "epoch": 4.026080476900149, "grad_norm": 6.862967316216412, "learning_rate": 2.2682513683939852e-07, "loss": 0.093, "num_input_tokens_seen": 22130688, "step": 5403 }, { "epoch": 4.026825633383011, "grad_norm": 3.2187439774329123, "learning_rate": 2.2648905545404735e-07, "loss": 0.0193, "num_input_tokens_seen": 22134784, "step": 5404 }, { "epoch": 4.027570789865872, "grad_norm": 3.361955511077817, "learning_rate": 2.2615319842960352e-07, "loss": 0.0189, "num_input_tokens_seen": 22138880, "step": 5405 }, { "epoch": 4.028315946348733, "grad_norm": 5.516916068487833, "learning_rate": 2.258175658396898e-07, "loss": 0.0587, "num_input_tokens_seen": 22142976, "step": 5406 }, { "epoch": 4.029061102831594, "grad_norm": 7.213478297633031, "learning_rate": 2.2548215775787862e-07, "loss": 0.0813, "num_input_tokens_seen": 22147072, "step": 5407 }, { "epoch": 4.029806259314456, "grad_norm": 8.22038318565299, "learning_rate": 2.2514697425769448e-07, "loss": 0.0988, "num_input_tokens_seen": 22151168, "step": 5408 }, { "epoch": 4.0305514157973175, "grad_norm": 7.379304049659054, "learning_rate": 2.2481201541261142e-07, "loss": 0.1126, "num_input_tokens_seen": 22155264, "step": 5409 }, { "epoch": 4.031296572280179, "grad_norm": 7.6915836563270945, "learning_rate": 2.2447728129605564e-07, "loss": 0.0592, "num_input_tokens_seen": 22159360, "step": 5410 }, { "epoch": 4.032041728763041, "grad_norm": 5.929927029878535, "learning_rate": 2.241427719814021e-07, "loss": 0.1205, "num_input_tokens_seen": 22163456, "step": 5411 }, { "epoch": 4.032786885245901, "grad_norm": 5.606194951724029, "learning_rate": 2.2380848754197854e-07, "loss": 0.0662, "num_input_tokens_seen": 22167552, "step": 5412 }, { "epoch": 4.033532041728763, "grad_norm": 6.299527861891643, "learning_rate": 2.2347442805106216e-07, "loss": 0.0725, "num_input_tokens_seen": 22171648, "step": 5413 }, { "epoch": 4.034277198211624, "grad_norm": 4.606653331867826, "learning_rate": 2.2314059358188088e-07, "loss": 0.0439, "num_input_tokens_seen": 22175744, "step": 5414 }, { "epoch": 4.035022354694486, "grad_norm": 8.002696936588174, "learning_rate": 2.228069842076143e-07, "loss": 0.0809, "num_input_tokens_seen": 22179840, "step": 5415 }, { "epoch": 4.0357675111773474, "grad_norm": 4.204274000378248, "learning_rate": 2.2247360000139121e-07, "loss": 0.0298, "num_input_tokens_seen": 22183936, "step": 5416 }, { "epoch": 4.036512667660209, "grad_norm": 2.9349794014139374, "learning_rate": 2.221404410362925e-07, "loss": 0.0163, "num_input_tokens_seen": 22188032, "step": 5417 }, { "epoch": 4.03725782414307, "grad_norm": 5.879638410680908, "learning_rate": 2.2180750738534846e-07, "loss": 0.0908, "num_input_tokens_seen": 22192128, "step": 5418 }, { "epoch": 4.038002980625931, "grad_norm": 6.3150827825189895, "learning_rate": 2.214747991215409e-07, "loss": 0.0932, "num_input_tokens_seen": 22196224, "step": 5419 }, { "epoch": 4.038748137108793, "grad_norm": 4.116396375305822, "learning_rate": 2.2114231631780156e-07, "loss": 0.0324, "num_input_tokens_seen": 22200320, "step": 5420 }, { "epoch": 4.039493293591654, "grad_norm": 7.473932449238756, "learning_rate": 2.2081005904701343e-07, "loss": 0.0929, "num_input_tokens_seen": 22204416, "step": 5421 }, { "epoch": 4.040238450074516, "grad_norm": 7.76865919140672, "learning_rate": 2.204780273820094e-07, "loss": 0.0455, "num_input_tokens_seen": 22208512, "step": 5422 }, { "epoch": 4.040983606557377, "grad_norm": 8.284767413843607, "learning_rate": 2.2014622139557296e-07, "loss": 0.1512, "num_input_tokens_seen": 22212608, "step": 5423 }, { "epoch": 4.041728763040238, "grad_norm": 6.6303038856172005, "learning_rate": 2.198146411604392e-07, "loss": 0.1047, "num_input_tokens_seen": 22216704, "step": 5424 }, { "epoch": 4.0424739195231, "grad_norm": 5.45740537086583, "learning_rate": 2.1948328674929158e-07, "loss": 0.0841, "num_input_tokens_seen": 22220800, "step": 5425 }, { "epoch": 4.043219076005961, "grad_norm": 4.9887755815809625, "learning_rate": 2.1915215823476637e-07, "loss": 0.0681, "num_input_tokens_seen": 22224896, "step": 5426 }, { "epoch": 4.043964232488823, "grad_norm": 8.99136913820494, "learning_rate": 2.1882125568944882e-07, "loss": 0.2161, "num_input_tokens_seen": 22228992, "step": 5427 }, { "epoch": 4.044709388971684, "grad_norm": 5.468493947918703, "learning_rate": 2.1849057918587545e-07, "loss": 0.0724, "num_input_tokens_seen": 22233088, "step": 5428 }, { "epoch": 4.045454545454546, "grad_norm": 7.380013990696525, "learning_rate": 2.181601287965325e-07, "loss": 0.12, "num_input_tokens_seen": 22237184, "step": 5429 }, { "epoch": 4.0461997019374065, "grad_norm": 5.303465721038007, "learning_rate": 2.178299045938577e-07, "loss": 0.0846, "num_input_tokens_seen": 22241280, "step": 5430 }, { "epoch": 4.046944858420268, "grad_norm": 5.164554120282548, "learning_rate": 2.1749990665023812e-07, "loss": 0.0983, "num_input_tokens_seen": 22245376, "step": 5431 }, { "epoch": 4.04769001490313, "grad_norm": 5.012849772000503, "learning_rate": 2.1717013503801134e-07, "loss": 0.0403, "num_input_tokens_seen": 22249472, "step": 5432 }, { "epoch": 4.048435171385991, "grad_norm": 6.726637308773352, "learning_rate": 2.1684058982946635e-07, "loss": 0.1586, "num_input_tokens_seen": 22253568, "step": 5433 }, { "epoch": 4.049180327868853, "grad_norm": 8.134910884829932, "learning_rate": 2.1651127109684117e-07, "loss": 0.1643, "num_input_tokens_seen": 22257664, "step": 5434 }, { "epoch": 4.049925484351714, "grad_norm": 6.168369170175414, "learning_rate": 2.1618217891232556e-07, "loss": 0.1109, "num_input_tokens_seen": 22261760, "step": 5435 }, { "epoch": 4.050670640834575, "grad_norm": 6.065851778692026, "learning_rate": 2.1585331334805808e-07, "loss": 0.0985, "num_input_tokens_seen": 22265856, "step": 5436 }, { "epoch": 4.0514157973174365, "grad_norm": 8.503315441601938, "learning_rate": 2.155246744761294e-07, "loss": 0.1703, "num_input_tokens_seen": 22269952, "step": 5437 }, { "epoch": 4.052160953800298, "grad_norm": 5.274623306300999, "learning_rate": 2.1519626236857827e-07, "loss": 0.0541, "num_input_tokens_seen": 22274048, "step": 5438 }, { "epoch": 4.05290611028316, "grad_norm": 7.493113318027643, "learning_rate": 2.1486807709739585e-07, "loss": 0.1487, "num_input_tokens_seen": 22278144, "step": 5439 }, { "epoch": 4.053651266766021, "grad_norm": 8.83883439709968, "learning_rate": 2.145401187345221e-07, "loss": 0.1238, "num_input_tokens_seen": 22282240, "step": 5440 }, { "epoch": 4.054396423248882, "grad_norm": 4.80288832680887, "learning_rate": 2.1421238735184855e-07, "loss": 0.0305, "num_input_tokens_seen": 22286336, "step": 5441 }, { "epoch": 4.055141579731743, "grad_norm": 6.17152176733021, "learning_rate": 2.138848830212159e-07, "loss": 0.0785, "num_input_tokens_seen": 22290432, "step": 5442 }, { "epoch": 4.055886736214605, "grad_norm": 7.990692139747666, "learning_rate": 2.1355760581441496e-07, "loss": 0.1237, "num_input_tokens_seen": 22294528, "step": 5443 }, { "epoch": 4.0566318926974665, "grad_norm": 10.901162748697084, "learning_rate": 2.1323055580318815e-07, "loss": 0.1064, "num_input_tokens_seen": 22298624, "step": 5444 }, { "epoch": 4.057377049180328, "grad_norm": 6.989728766736853, "learning_rate": 2.1290373305922637e-07, "loss": 0.1668, "num_input_tokens_seen": 22302720, "step": 5445 }, { "epoch": 4.05812220566319, "grad_norm": 5.2634065777572925, "learning_rate": 2.1257713765417215e-07, "loss": 0.1343, "num_input_tokens_seen": 22306816, "step": 5446 }, { "epoch": 4.05886736214605, "grad_norm": 8.035130175546973, "learning_rate": 2.1225076965961682e-07, "loss": 0.1029, "num_input_tokens_seen": 22310912, "step": 5447 }, { "epoch": 4.059612518628912, "grad_norm": 8.517296642636994, "learning_rate": 2.1192462914710333e-07, "loss": 0.2669, "num_input_tokens_seen": 22315008, "step": 5448 }, { "epoch": 4.060357675111773, "grad_norm": 4.897810161738507, "learning_rate": 2.115987161881236e-07, "loss": 0.0393, "num_input_tokens_seen": 22319104, "step": 5449 }, { "epoch": 4.061102831594635, "grad_norm": 3.2379989660435653, "learning_rate": 2.1127303085411985e-07, "loss": 0.0251, "num_input_tokens_seen": 22323200, "step": 5450 }, { "epoch": 4.0618479880774965, "grad_norm": 5.490030745679786, "learning_rate": 2.109475732164856e-07, "loss": 0.0493, "num_input_tokens_seen": 22327296, "step": 5451 }, { "epoch": 4.062593144560358, "grad_norm": 5.465463580440426, "learning_rate": 2.106223433465622e-07, "loss": 0.0724, "num_input_tokens_seen": 22331392, "step": 5452 }, { "epoch": 4.063338301043219, "grad_norm": 6.982684769354684, "learning_rate": 2.102973413156431e-07, "loss": 0.0649, "num_input_tokens_seen": 22335488, "step": 5453 }, { "epoch": 4.06408345752608, "grad_norm": 9.501564873955502, "learning_rate": 2.099725671949708e-07, "loss": 0.0434, "num_input_tokens_seen": 22339584, "step": 5454 }, { "epoch": 4.064828614008942, "grad_norm": 4.179297851514124, "learning_rate": 2.0964802105573847e-07, "loss": 0.0501, "num_input_tokens_seen": 22343680, "step": 5455 }, { "epoch": 4.065573770491803, "grad_norm": 6.813132130988909, "learning_rate": 2.093237029690884e-07, "loss": 0.1004, "num_input_tokens_seen": 22347776, "step": 5456 }, { "epoch": 4.066318926974665, "grad_norm": 4.847883441583398, "learning_rate": 2.0899961300611394e-07, "loss": 0.0243, "num_input_tokens_seen": 22351872, "step": 5457 }, { "epoch": 4.0670640834575265, "grad_norm": 5.832170104436707, "learning_rate": 2.0867575123785774e-07, "loss": 0.0631, "num_input_tokens_seen": 22355968, "step": 5458 }, { "epoch": 4.067809239940387, "grad_norm": 4.474113063359625, "learning_rate": 2.0835211773531233e-07, "loss": 0.0234, "num_input_tokens_seen": 22360064, "step": 5459 }, { "epoch": 4.068554396423249, "grad_norm": 2.4653686758789415, "learning_rate": 2.0802871256942108e-07, "loss": 0.0102, "num_input_tokens_seen": 22364160, "step": 5460 }, { "epoch": 4.06929955290611, "grad_norm": 5.556493979070521, "learning_rate": 2.0770553581107606e-07, "loss": 0.0996, "num_input_tokens_seen": 22368256, "step": 5461 }, { "epoch": 4.070044709388972, "grad_norm": 10.362866321337028, "learning_rate": 2.0738258753112056e-07, "loss": 0.2122, "num_input_tokens_seen": 22372352, "step": 5462 }, { "epoch": 4.070789865871833, "grad_norm": 6.623478633433089, "learning_rate": 2.0705986780034645e-07, "loss": 0.0625, "num_input_tokens_seen": 22376448, "step": 5463 }, { "epoch": 4.071535022354695, "grad_norm": 7.385731018805654, "learning_rate": 2.0673737668949728e-07, "loss": 0.0827, "num_input_tokens_seen": 22380544, "step": 5464 }, { "epoch": 4.0722801788375556, "grad_norm": 8.16914674559309, "learning_rate": 2.06415114269264e-07, "loss": 0.0769, "num_input_tokens_seen": 22384640, "step": 5465 }, { "epoch": 4.073025335320417, "grad_norm": 5.538787047004282, "learning_rate": 2.0609308061028982e-07, "loss": 0.0499, "num_input_tokens_seen": 22388736, "step": 5466 }, { "epoch": 4.073770491803279, "grad_norm": 4.983957656157578, "learning_rate": 2.0577127578316633e-07, "loss": 0.0521, "num_input_tokens_seen": 22392832, "step": 5467 }, { "epoch": 4.07451564828614, "grad_norm": 7.74258556056087, "learning_rate": 2.0544969985843586e-07, "loss": 0.0418, "num_input_tokens_seen": 22396928, "step": 5468 }, { "epoch": 4.075260804769002, "grad_norm": 6.828724697376516, "learning_rate": 2.0512835290658991e-07, "loss": 0.0744, "num_input_tokens_seen": 22401024, "step": 5469 }, { "epoch": 4.076005961251863, "grad_norm": 5.5109069872333345, "learning_rate": 2.0480723499806995e-07, "loss": 0.038, "num_input_tokens_seen": 22405120, "step": 5470 }, { "epoch": 4.076751117734724, "grad_norm": 4.077105082977702, "learning_rate": 2.0448634620326756e-07, "loss": 0.0244, "num_input_tokens_seen": 22409216, "step": 5471 }, { "epoch": 4.0774962742175855, "grad_norm": 7.029155382394691, "learning_rate": 2.0416568659252345e-07, "loss": 0.1464, "num_input_tokens_seen": 22413312, "step": 5472 }, { "epoch": 4.078241430700447, "grad_norm": 15.127824466583409, "learning_rate": 2.03845256236129e-07, "loss": 0.0832, "num_input_tokens_seen": 22417408, "step": 5473 }, { "epoch": 4.078986587183309, "grad_norm": 6.627243410188929, "learning_rate": 2.0352505520432442e-07, "loss": 0.1406, "num_input_tokens_seen": 22421504, "step": 5474 }, { "epoch": 4.07973174366617, "grad_norm": 3.418713845993388, "learning_rate": 2.0320508356730038e-07, "loss": 0.0214, "num_input_tokens_seen": 22425600, "step": 5475 }, { "epoch": 4.080476900149031, "grad_norm": 9.875807839623667, "learning_rate": 2.02885341395197e-07, "loss": 0.2617, "num_input_tokens_seen": 22429696, "step": 5476 }, { "epoch": 4.081222056631892, "grad_norm": 4.981278464497279, "learning_rate": 2.0256582875810338e-07, "loss": 0.0628, "num_input_tokens_seen": 22433792, "step": 5477 }, { "epoch": 4.081967213114754, "grad_norm": 4.57599246650079, "learning_rate": 2.0224654572606014e-07, "loss": 0.0263, "num_input_tokens_seen": 22437888, "step": 5478 }, { "epoch": 4.0827123695976155, "grad_norm": 5.968127024054893, "learning_rate": 2.0192749236905508e-07, "loss": 0.0169, "num_input_tokens_seen": 22441984, "step": 5479 }, { "epoch": 4.083457526080477, "grad_norm": 7.497260989056984, "learning_rate": 2.016086687570279e-07, "loss": 0.112, "num_input_tokens_seen": 22446080, "step": 5480 }, { "epoch": 4.084202682563339, "grad_norm": 4.69048258030377, "learning_rate": 2.0129007495986647e-07, "loss": 0.0313, "num_input_tokens_seen": 22450176, "step": 5481 }, { "epoch": 4.084947839046199, "grad_norm": 9.224573379444537, "learning_rate": 2.0097171104740922e-07, "loss": 0.1986, "num_input_tokens_seen": 22454272, "step": 5482 }, { "epoch": 4.085692995529061, "grad_norm": 6.655055229682304, "learning_rate": 2.0065357708944342e-07, "loss": 0.0694, "num_input_tokens_seen": 22458368, "step": 5483 }, { "epoch": 4.086438152011922, "grad_norm": 9.887967364323895, "learning_rate": 2.0033567315570684e-07, "loss": 0.1424, "num_input_tokens_seen": 22462464, "step": 5484 }, { "epoch": 4.087183308494784, "grad_norm": 7.8796773795627075, "learning_rate": 2.00017999315886e-07, "loss": 0.0622, "num_input_tokens_seen": 22466560, "step": 5485 }, { "epoch": 4.0879284649776455, "grad_norm": 6.350217420883098, "learning_rate": 1.9970055563961686e-07, "loss": 0.0753, "num_input_tokens_seen": 22470656, "step": 5486 }, { "epoch": 4.088673621460507, "grad_norm": 7.490511775665116, "learning_rate": 1.9938334219648608e-07, "loss": 0.1298, "num_input_tokens_seen": 22474752, "step": 5487 }, { "epoch": 4.089418777943368, "grad_norm": 4.540772046929378, "learning_rate": 1.990663590560285e-07, "loss": 0.0343, "num_input_tokens_seen": 22478848, "step": 5488 }, { "epoch": 4.090163934426229, "grad_norm": 6.108249911101789, "learning_rate": 1.9874960628772963e-07, "loss": 0.0863, "num_input_tokens_seen": 22482944, "step": 5489 }, { "epoch": 4.090909090909091, "grad_norm": 9.873033427767746, "learning_rate": 1.9843308396102338e-07, "loss": 0.0975, "num_input_tokens_seen": 22487040, "step": 5490 }, { "epoch": 4.091654247391952, "grad_norm": 5.968412614966226, "learning_rate": 1.9811679214529464e-07, "loss": 0.0647, "num_input_tokens_seen": 22491136, "step": 5491 }, { "epoch": 4.092399403874814, "grad_norm": 7.1346847095233406, "learning_rate": 1.9780073090987547e-07, "loss": 0.1384, "num_input_tokens_seen": 22495232, "step": 5492 }, { "epoch": 4.0931445603576755, "grad_norm": 6.190696689855203, "learning_rate": 1.9748490032404977e-07, "loss": 0.0699, "num_input_tokens_seen": 22499328, "step": 5493 }, { "epoch": 4.093889716840536, "grad_norm": 11.527947461163986, "learning_rate": 1.9716930045704922e-07, "loss": 0.077, "num_input_tokens_seen": 22503424, "step": 5494 }, { "epoch": 4.094634873323398, "grad_norm": 3.809768678895973, "learning_rate": 1.9685393137805625e-07, "loss": 0.0174, "num_input_tokens_seen": 22507520, "step": 5495 }, { "epoch": 4.095380029806259, "grad_norm": 7.5915472473108405, "learning_rate": 1.965387931562017e-07, "loss": 0.0957, "num_input_tokens_seen": 22511616, "step": 5496 }, { "epoch": 4.096125186289121, "grad_norm": 8.726896597941984, "learning_rate": 1.962238858605657e-07, "loss": 0.0618, "num_input_tokens_seen": 22515712, "step": 5497 }, { "epoch": 4.096870342771982, "grad_norm": 7.321061944336527, "learning_rate": 1.959092095601789e-07, "loss": 0.1136, "num_input_tokens_seen": 22519808, "step": 5498 }, { "epoch": 4.097615499254844, "grad_norm": 6.041652130738151, "learning_rate": 1.9559476432401998e-07, "loss": 0.08, "num_input_tokens_seen": 22523904, "step": 5499 }, { "epoch": 4.098360655737705, "grad_norm": 8.206041198416907, "learning_rate": 1.9528055022101797e-07, "loss": 0.0926, "num_input_tokens_seen": 22528000, "step": 5500 }, { "epoch": 4.099105812220566, "grad_norm": 9.809177631162916, "learning_rate": 1.9496656732005057e-07, "loss": 0.0554, "num_input_tokens_seen": 22532096, "step": 5501 }, { "epoch": 4.099850968703428, "grad_norm": 7.695100995745454, "learning_rate": 1.9465281568994548e-07, "loss": 0.0808, "num_input_tokens_seen": 22536192, "step": 5502 }, { "epoch": 4.100596125186289, "grad_norm": 5.202256215721954, "learning_rate": 1.943392953994791e-07, "loss": 0.0599, "num_input_tokens_seen": 22540288, "step": 5503 }, { "epoch": 4.101341281669151, "grad_norm": 6.417441766003138, "learning_rate": 1.9402600651737684e-07, "loss": 0.0786, "num_input_tokens_seen": 22544384, "step": 5504 }, { "epoch": 4.102086438152012, "grad_norm": 6.653696772645487, "learning_rate": 1.9371294911231507e-07, "loss": 0.1004, "num_input_tokens_seen": 22548480, "step": 5505 }, { "epoch": 4.102831594634873, "grad_norm": 5.843024678827398, "learning_rate": 1.934001232529166e-07, "loss": 0.0475, "num_input_tokens_seen": 22552576, "step": 5506 }, { "epoch": 4.103576751117735, "grad_norm": 5.750732261328158, "learning_rate": 1.9308752900775633e-07, "loss": 0.0786, "num_input_tokens_seen": 22556672, "step": 5507 }, { "epoch": 4.104321907600596, "grad_norm": 6.914665726694715, "learning_rate": 1.9277516644535642e-07, "loss": 0.0564, "num_input_tokens_seen": 22560768, "step": 5508 }, { "epoch": 4.105067064083458, "grad_norm": 7.371154676836626, "learning_rate": 1.9246303563418957e-07, "loss": 0.1442, "num_input_tokens_seen": 22564864, "step": 5509 }, { "epoch": 4.105812220566319, "grad_norm": 6.077204182484449, "learning_rate": 1.9215113664267662e-07, "loss": 0.0707, "num_input_tokens_seen": 22568960, "step": 5510 }, { "epoch": 4.10655737704918, "grad_norm": 7.7626528719146695, "learning_rate": 1.9183946953918841e-07, "loss": 0.062, "num_input_tokens_seen": 22573056, "step": 5511 }, { "epoch": 4.1073025335320414, "grad_norm": 6.460934067314086, "learning_rate": 1.9152803439204407e-07, "loss": 0.0256, "num_input_tokens_seen": 22577152, "step": 5512 }, { "epoch": 4.108047690014903, "grad_norm": 6.2188342433376205, "learning_rate": 1.9121683126951313e-07, "loss": 0.1329, "num_input_tokens_seen": 22581248, "step": 5513 }, { "epoch": 4.1087928464977646, "grad_norm": 9.538195551842998, "learning_rate": 1.909058602398131e-07, "loss": 0.077, "num_input_tokens_seen": 22585344, "step": 5514 }, { "epoch": 4.109538002980626, "grad_norm": 9.275680229352805, "learning_rate": 1.905951213711109e-07, "loss": 0.0205, "num_input_tokens_seen": 22589440, "step": 5515 }, { "epoch": 4.110283159463488, "grad_norm": 6.241793300911737, "learning_rate": 1.902846147315232e-07, "loss": 0.053, "num_input_tokens_seen": 22593536, "step": 5516 }, { "epoch": 4.111028315946348, "grad_norm": 6.367523728462934, "learning_rate": 1.8997434038911467e-07, "loss": 0.048, "num_input_tokens_seen": 22597632, "step": 5517 }, { "epoch": 4.11177347242921, "grad_norm": 6.11391964274555, "learning_rate": 1.8966429841190057e-07, "loss": 0.1012, "num_input_tokens_seen": 22601728, "step": 5518 }, { "epoch": 4.112518628912071, "grad_norm": 5.879388147882828, "learning_rate": 1.893544888678432e-07, "loss": 0.0862, "num_input_tokens_seen": 22605824, "step": 5519 }, { "epoch": 4.113263785394933, "grad_norm": 4.079918074309008, "learning_rate": 1.890449118248558e-07, "loss": 0.0546, "num_input_tokens_seen": 22609920, "step": 5520 }, { "epoch": 4.1140089418777945, "grad_norm": 8.841027876969985, "learning_rate": 1.887355673507993e-07, "loss": 0.0824, "num_input_tokens_seen": 22614016, "step": 5521 }, { "epoch": 4.114754098360656, "grad_norm": 6.719678448702816, "learning_rate": 1.8842645551348488e-07, "loss": 0.0594, "num_input_tokens_seen": 22618112, "step": 5522 }, { "epoch": 4.115499254843517, "grad_norm": 5.949632987837751, "learning_rate": 1.8811757638067168e-07, "loss": 0.0765, "num_input_tokens_seen": 22622208, "step": 5523 }, { "epoch": 4.116244411326378, "grad_norm": 6.744347188333998, "learning_rate": 1.8780893002006806e-07, "loss": 0.03, "num_input_tokens_seen": 22626304, "step": 5524 }, { "epoch": 4.11698956780924, "grad_norm": 6.855757586451624, "learning_rate": 1.8750051649933195e-07, "loss": 0.1111, "num_input_tokens_seen": 22630400, "step": 5525 }, { "epoch": 4.117734724292101, "grad_norm": 6.25220613739564, "learning_rate": 1.871923358860693e-07, "loss": 0.0776, "num_input_tokens_seen": 22634496, "step": 5526 }, { "epoch": 4.118479880774963, "grad_norm": 7.629466154577748, "learning_rate": 1.868843882478359e-07, "loss": 0.0546, "num_input_tokens_seen": 22638592, "step": 5527 }, { "epoch": 4.1192250372578245, "grad_norm": 7.1217478803320065, "learning_rate": 1.8657667365213572e-07, "loss": 0.102, "num_input_tokens_seen": 22642688, "step": 5528 }, { "epoch": 4.119970193740685, "grad_norm": 7.481162772471038, "learning_rate": 1.8626919216642254e-07, "loss": 0.0928, "num_input_tokens_seen": 22646784, "step": 5529 }, { "epoch": 4.120715350223547, "grad_norm": 10.990988940464906, "learning_rate": 1.85961943858098e-07, "loss": 0.2572, "num_input_tokens_seen": 22650880, "step": 5530 }, { "epoch": 4.121460506706408, "grad_norm": 8.336678731546638, "learning_rate": 1.8565492879451307e-07, "loss": 0.1033, "num_input_tokens_seen": 22654976, "step": 5531 }, { "epoch": 4.12220566318927, "grad_norm": 5.208442789294504, "learning_rate": 1.8534814704296813e-07, "loss": 0.0449, "num_input_tokens_seen": 22659072, "step": 5532 }, { "epoch": 4.122950819672131, "grad_norm": 5.199572770951585, "learning_rate": 1.8504159867071173e-07, "loss": 0.0836, "num_input_tokens_seen": 22663168, "step": 5533 }, { "epoch": 4.123695976154993, "grad_norm": 4.839312470414758, "learning_rate": 1.8473528374494135e-07, "loss": 0.0346, "num_input_tokens_seen": 22667264, "step": 5534 }, { "epoch": 4.124441132637854, "grad_norm": 5.860399032375895, "learning_rate": 1.8442920233280314e-07, "loss": 0.0647, "num_input_tokens_seen": 22671360, "step": 5535 }, { "epoch": 4.125186289120715, "grad_norm": 8.050075524418318, "learning_rate": 1.841233545013929e-07, "loss": 0.1356, "num_input_tokens_seen": 22675456, "step": 5536 }, { "epoch": 4.125931445603577, "grad_norm": 8.26450065950924, "learning_rate": 1.8381774031775406e-07, "loss": 0.0931, "num_input_tokens_seen": 22679552, "step": 5537 }, { "epoch": 4.126676602086438, "grad_norm": 8.239333566250426, "learning_rate": 1.8351235984888013e-07, "loss": 0.1325, "num_input_tokens_seen": 22683648, "step": 5538 }, { "epoch": 4.1274217585693, "grad_norm": 9.148207873060315, "learning_rate": 1.83207213161712e-07, "loss": 0.1943, "num_input_tokens_seen": 22687744, "step": 5539 }, { "epoch": 4.128166915052161, "grad_norm": 6.170098985902517, "learning_rate": 1.829023003231406e-07, "loss": 0.1024, "num_input_tokens_seen": 22691840, "step": 5540 }, { "epoch": 4.128912071535022, "grad_norm": 7.539428594237577, "learning_rate": 1.8259762140000483e-07, "loss": 0.0998, "num_input_tokens_seen": 22695936, "step": 5541 }, { "epoch": 4.129657228017884, "grad_norm": 7.606723487117434, "learning_rate": 1.8229317645909197e-07, "loss": 0.1498, "num_input_tokens_seen": 22700032, "step": 5542 }, { "epoch": 4.130402384500745, "grad_norm": 5.178335143974694, "learning_rate": 1.8198896556713934e-07, "loss": 0.074, "num_input_tokens_seen": 22704128, "step": 5543 }, { "epoch": 4.131147540983607, "grad_norm": 10.133115106444489, "learning_rate": 1.8168498879083132e-07, "loss": 0.1001, "num_input_tokens_seen": 22708224, "step": 5544 }, { "epoch": 4.131892697466468, "grad_norm": 7.18426620783975, "learning_rate": 1.8138124619680286e-07, "loss": 0.1424, "num_input_tokens_seen": 22712320, "step": 5545 }, { "epoch": 4.13263785394933, "grad_norm": 5.616516513031531, "learning_rate": 1.8107773785163528e-07, "loss": 0.0704, "num_input_tokens_seen": 22716416, "step": 5546 }, { "epoch": 4.1333830104321905, "grad_norm": 7.542673874208474, "learning_rate": 1.807744638218606e-07, "loss": 0.1157, "num_input_tokens_seen": 22720512, "step": 5547 }, { "epoch": 4.134128166915052, "grad_norm": 3.355146199761067, "learning_rate": 1.8047142417395815e-07, "loss": 0.0116, "num_input_tokens_seen": 22724608, "step": 5548 }, { "epoch": 4.134873323397914, "grad_norm": 5.7384766498812985, "learning_rate": 1.8016861897435692e-07, "loss": 0.038, "num_input_tokens_seen": 22728704, "step": 5549 }, { "epoch": 4.135618479880775, "grad_norm": 6.585945875668322, "learning_rate": 1.7986604828943377e-07, "loss": 0.0742, "num_input_tokens_seen": 22732800, "step": 5550 }, { "epoch": 4.136363636363637, "grad_norm": 7.069929269552248, "learning_rate": 1.7956371218551387e-07, "loss": 0.123, "num_input_tokens_seen": 22736896, "step": 5551 }, { "epoch": 4.137108792846497, "grad_norm": 7.3413579335582755, "learning_rate": 1.7926161072887208e-07, "loss": 0.1477, "num_input_tokens_seen": 22740992, "step": 5552 }, { "epoch": 4.137853949329359, "grad_norm": 9.60288786745958, "learning_rate": 1.789597439857309e-07, "loss": 0.1877, "num_input_tokens_seen": 22745088, "step": 5553 }, { "epoch": 4.1385991058122205, "grad_norm": 7.470765693790151, "learning_rate": 1.786581120222619e-07, "loss": 0.0659, "num_input_tokens_seen": 22749184, "step": 5554 }, { "epoch": 4.139344262295082, "grad_norm": 5.728670696476447, "learning_rate": 1.783567149045845e-07, "loss": 0.0227, "num_input_tokens_seen": 22753280, "step": 5555 }, { "epoch": 4.140089418777944, "grad_norm": 5.048384761225381, "learning_rate": 1.7805555269876775e-07, "loss": 0.053, "num_input_tokens_seen": 22757376, "step": 5556 }, { "epoch": 4.140834575260805, "grad_norm": 7.872062551165302, "learning_rate": 1.7775462547082817e-07, "loss": 0.1174, "num_input_tokens_seen": 22761472, "step": 5557 }, { "epoch": 4.141579731743666, "grad_norm": 5.5671422034530655, "learning_rate": 1.774539332867313e-07, "loss": 0.055, "num_input_tokens_seen": 22765568, "step": 5558 }, { "epoch": 4.142324888226527, "grad_norm": 6.876231107688504, "learning_rate": 1.7715347621239055e-07, "loss": 0.0455, "num_input_tokens_seen": 22769664, "step": 5559 }, { "epoch": 4.143070044709389, "grad_norm": 3.4657425798053447, "learning_rate": 1.768532543136689e-07, "loss": 0.0093, "num_input_tokens_seen": 22773760, "step": 5560 }, { "epoch": 4.14381520119225, "grad_norm": 5.482791751597691, "learning_rate": 1.7655326765637692e-07, "loss": 0.0663, "num_input_tokens_seen": 22777856, "step": 5561 }, { "epoch": 4.144560357675112, "grad_norm": 5.6839478303309505, "learning_rate": 1.762535163062734e-07, "loss": 0.0743, "num_input_tokens_seen": 22781952, "step": 5562 }, { "epoch": 4.1453055141579735, "grad_norm": 7.477125716935647, "learning_rate": 1.7595400032906662e-07, "loss": 0.1028, "num_input_tokens_seen": 22786048, "step": 5563 }, { "epoch": 4.146050670640834, "grad_norm": 9.321059395132895, "learning_rate": 1.756547197904121e-07, "loss": 0.053, "num_input_tokens_seen": 22790144, "step": 5564 }, { "epoch": 4.146795827123696, "grad_norm": 4.747903146598765, "learning_rate": 1.7535567475591487e-07, "loss": 0.0037, "num_input_tokens_seen": 22794240, "step": 5565 }, { "epoch": 4.147540983606557, "grad_norm": 8.18777224846329, "learning_rate": 1.7505686529112707e-07, "loss": 0.0818, "num_input_tokens_seen": 22798336, "step": 5566 }, { "epoch": 4.148286140089419, "grad_norm": 5.4797569288529795, "learning_rate": 1.7475829146155032e-07, "loss": 0.0555, "num_input_tokens_seen": 22802432, "step": 5567 }, { "epoch": 4.14903129657228, "grad_norm": 3.7663034167698806, "learning_rate": 1.744599533326341e-07, "loss": 0.0326, "num_input_tokens_seen": 22806528, "step": 5568 }, { "epoch": 4.149776453055142, "grad_norm": 8.22511405174327, "learning_rate": 1.741618509697758e-07, "loss": 0.077, "num_input_tokens_seen": 22810624, "step": 5569 }, { "epoch": 4.150521609538003, "grad_norm": 5.927068063253356, "learning_rate": 1.7386398443832234e-07, "loss": 0.0689, "num_input_tokens_seen": 22814720, "step": 5570 }, { "epoch": 4.151266766020864, "grad_norm": 5.062572545975494, "learning_rate": 1.7356635380356742e-07, "loss": 0.0534, "num_input_tokens_seen": 22818816, "step": 5571 }, { "epoch": 4.152011922503726, "grad_norm": 5.365838847459286, "learning_rate": 1.7326895913075473e-07, "loss": 0.0297, "num_input_tokens_seen": 22822912, "step": 5572 }, { "epoch": 4.152757078986587, "grad_norm": 7.276359593055982, "learning_rate": 1.729718004850743e-07, "loss": 0.0876, "num_input_tokens_seen": 22827008, "step": 5573 }, { "epoch": 4.153502235469449, "grad_norm": 4.14082269353555, "learning_rate": 1.7267487793166602e-07, "loss": 0.0353, "num_input_tokens_seen": 22831104, "step": 5574 }, { "epoch": 4.15424739195231, "grad_norm": 10.017764745578967, "learning_rate": 1.723781915356171e-07, "loss": 0.2736, "num_input_tokens_seen": 22835200, "step": 5575 }, { "epoch": 4.154992548435171, "grad_norm": 7.604281434637547, "learning_rate": 1.7208174136196368e-07, "loss": 0.1126, "num_input_tokens_seen": 22839296, "step": 5576 }, { "epoch": 4.155737704918033, "grad_norm": 4.369748069433783, "learning_rate": 1.7178552747568966e-07, "loss": 0.0555, "num_input_tokens_seen": 22843392, "step": 5577 }, { "epoch": 4.156482861400894, "grad_norm": 7.477832769417363, "learning_rate": 1.7148954994172685e-07, "loss": 0.0834, "num_input_tokens_seen": 22847488, "step": 5578 }, { "epoch": 4.157228017883756, "grad_norm": 2.7937004225057813, "learning_rate": 1.7119380882495623e-07, "loss": 0.0161, "num_input_tokens_seen": 22851584, "step": 5579 }, { "epoch": 4.157973174366617, "grad_norm": 5.6442782704762084, "learning_rate": 1.7089830419020584e-07, "loss": 0.0606, "num_input_tokens_seen": 22855680, "step": 5580 }, { "epoch": 4.158718330849478, "grad_norm": 4.2247336633858525, "learning_rate": 1.7060303610225287e-07, "loss": 0.0362, "num_input_tokens_seen": 22859776, "step": 5581 }, { "epoch": 4.1594634873323395, "grad_norm": 5.2811696549055425, "learning_rate": 1.7030800462582167e-07, "loss": 0.045, "num_input_tokens_seen": 22863872, "step": 5582 }, { "epoch": 4.160208643815201, "grad_norm": 5.08458487868567, "learning_rate": 1.7001320982558577e-07, "loss": 0.0587, "num_input_tokens_seen": 22867968, "step": 5583 }, { "epoch": 4.160953800298063, "grad_norm": 5.923642453913958, "learning_rate": 1.6971865176616615e-07, "loss": 0.1417, "num_input_tokens_seen": 22872064, "step": 5584 }, { "epoch": 4.161698956780924, "grad_norm": 5.219395246126363, "learning_rate": 1.6942433051213208e-07, "loss": 0.0481, "num_input_tokens_seen": 22876160, "step": 5585 }, { "epoch": 4.162444113263786, "grad_norm": 4.133602147728739, "learning_rate": 1.6913024612800034e-07, "loss": 0.0159, "num_input_tokens_seen": 22880256, "step": 5586 }, { "epoch": 4.163189269746646, "grad_norm": 4.202257432018123, "learning_rate": 1.6883639867823694e-07, "loss": 0.03, "num_input_tokens_seen": 22884352, "step": 5587 }, { "epoch": 4.163934426229508, "grad_norm": 7.692972829771999, "learning_rate": 1.685427882272553e-07, "loss": 0.1144, "num_input_tokens_seen": 22888448, "step": 5588 }, { "epoch": 4.1646795827123695, "grad_norm": 6.060029888775675, "learning_rate": 1.6824941483941649e-07, "loss": 0.0993, "num_input_tokens_seen": 22892544, "step": 5589 }, { "epoch": 4.165424739195231, "grad_norm": 7.440641766919979, "learning_rate": 1.6795627857903064e-07, "loss": 0.1227, "num_input_tokens_seen": 22896640, "step": 5590 }, { "epoch": 4.166169895678093, "grad_norm": 6.700115615005603, "learning_rate": 1.6766337951035467e-07, "loss": 0.0758, "num_input_tokens_seen": 22900736, "step": 5591 }, { "epoch": 4.166915052160954, "grad_norm": 8.067654664010515, "learning_rate": 1.673707176975947e-07, "loss": 0.1833, "num_input_tokens_seen": 22904832, "step": 5592 }, { "epoch": 4.167660208643815, "grad_norm": 5.490508916297667, "learning_rate": 1.6707829320490393e-07, "loss": 0.0817, "num_input_tokens_seen": 22908928, "step": 5593 }, { "epoch": 4.168405365126676, "grad_norm": 7.081569524963757, "learning_rate": 1.667861060963842e-07, "loss": 0.0871, "num_input_tokens_seen": 22913024, "step": 5594 }, { "epoch": 4.169150521609538, "grad_norm": 3.761601810678806, "learning_rate": 1.6649415643608487e-07, "loss": 0.0223, "num_input_tokens_seen": 22917120, "step": 5595 }, { "epoch": 4.1698956780923995, "grad_norm": 5.827903773338956, "learning_rate": 1.662024442880032e-07, "loss": 0.0697, "num_input_tokens_seen": 22921216, "step": 5596 }, { "epoch": 4.170640834575261, "grad_norm": 6.514316544284433, "learning_rate": 1.6591096971608489e-07, "loss": 0.1197, "num_input_tokens_seen": 22925312, "step": 5597 }, { "epoch": 4.171385991058123, "grad_norm": 6.790748178774498, "learning_rate": 1.6561973278422294e-07, "loss": 0.0653, "num_input_tokens_seen": 22929408, "step": 5598 }, { "epoch": 4.172131147540983, "grad_norm": 7.184793350991165, "learning_rate": 1.6532873355625924e-07, "loss": 0.0922, "num_input_tokens_seen": 22933504, "step": 5599 }, { "epoch": 4.172876304023845, "grad_norm": 6.964167036324321, "learning_rate": 1.6503797209598189e-07, "loss": 0.1049, "num_input_tokens_seen": 22937600, "step": 5600 }, { "epoch": 4.173621460506706, "grad_norm": 4.655273328036381, "learning_rate": 1.6474744846712853e-07, "loss": 0.0221, "num_input_tokens_seen": 22941696, "step": 5601 }, { "epoch": 4.174366616989568, "grad_norm": 6.380007338878217, "learning_rate": 1.6445716273338366e-07, "loss": 0.0512, "num_input_tokens_seen": 22945792, "step": 5602 }, { "epoch": 4.1751117734724295, "grad_norm": 8.653324759629431, "learning_rate": 1.6416711495838038e-07, "loss": 0.0995, "num_input_tokens_seen": 22949888, "step": 5603 }, { "epoch": 4.175856929955291, "grad_norm": 6.715369290569373, "learning_rate": 1.6387730520569883e-07, "loss": 0.0925, "num_input_tokens_seen": 22953984, "step": 5604 }, { "epoch": 4.176602086438152, "grad_norm": 10.609981969825604, "learning_rate": 1.635877335388679e-07, "loss": 0.2431, "num_input_tokens_seen": 22958080, "step": 5605 }, { "epoch": 4.177347242921013, "grad_norm": 6.521839940995687, "learning_rate": 1.6329840002136347e-07, "loss": 0.0673, "num_input_tokens_seen": 22962176, "step": 5606 }, { "epoch": 4.178092399403875, "grad_norm": 4.770146011707221, "learning_rate": 1.6300930471660924e-07, "loss": 0.0512, "num_input_tokens_seen": 22966272, "step": 5607 }, { "epoch": 4.178837555886736, "grad_norm": 3.3949108680672224, "learning_rate": 1.6272044768797734e-07, "loss": 0.0124, "num_input_tokens_seen": 22970368, "step": 5608 }, { "epoch": 4.179582712369598, "grad_norm": 6.288781410645661, "learning_rate": 1.624318289987871e-07, "loss": 0.0822, "num_input_tokens_seen": 22974464, "step": 5609 }, { "epoch": 4.180327868852459, "grad_norm": 5.835549860853575, "learning_rate": 1.6214344871230593e-07, "loss": 0.0392, "num_input_tokens_seen": 22978560, "step": 5610 }, { "epoch": 4.18107302533532, "grad_norm": 8.453145974294602, "learning_rate": 1.6185530689174876e-07, "loss": 0.1737, "num_input_tokens_seen": 22982656, "step": 5611 }, { "epoch": 4.181818181818182, "grad_norm": 5.707884677351813, "learning_rate": 1.6156740360027874e-07, "loss": 0.0424, "num_input_tokens_seen": 22986752, "step": 5612 }, { "epoch": 4.182563338301043, "grad_norm": 6.95471955073254, "learning_rate": 1.612797389010054e-07, "loss": 0.0987, "num_input_tokens_seen": 22990848, "step": 5613 }, { "epoch": 4.183308494783905, "grad_norm": 4.743807338348461, "learning_rate": 1.609923128569879e-07, "loss": 0.0541, "num_input_tokens_seen": 22994944, "step": 5614 }, { "epoch": 4.184053651266766, "grad_norm": 5.4782820771642085, "learning_rate": 1.6070512553123142e-07, "loss": 0.0962, "num_input_tokens_seen": 22999040, "step": 5615 }, { "epoch": 4.184798807749628, "grad_norm": 7.739811492720885, "learning_rate": 1.6041817698668957e-07, "loss": 0.1365, "num_input_tokens_seen": 23003136, "step": 5616 }, { "epoch": 4.1855439642324885, "grad_norm": 5.902275053356631, "learning_rate": 1.6013146728626375e-07, "loss": 0.0841, "num_input_tokens_seen": 23007232, "step": 5617 }, { "epoch": 4.18628912071535, "grad_norm": 8.868339061001846, "learning_rate": 1.5984499649280255e-07, "loss": 0.153, "num_input_tokens_seen": 23011328, "step": 5618 }, { "epoch": 4.187034277198212, "grad_norm": 7.0204877934865575, "learning_rate": 1.595587646691027e-07, "loss": 0.122, "num_input_tokens_seen": 23015424, "step": 5619 }, { "epoch": 4.187779433681073, "grad_norm": 7.615635513621195, "learning_rate": 1.5927277187790787e-07, "loss": 0.0674, "num_input_tokens_seen": 23019520, "step": 5620 }, { "epoch": 4.188524590163935, "grad_norm": 6.978197387372023, "learning_rate": 1.5898701818191018e-07, "loss": 0.1316, "num_input_tokens_seen": 23023616, "step": 5621 }, { "epoch": 4.189269746646795, "grad_norm": 15.410238396710131, "learning_rate": 1.5870150364374852e-07, "loss": 0.0582, "num_input_tokens_seen": 23027712, "step": 5622 }, { "epoch": 4.190014903129657, "grad_norm": 7.589384402945915, "learning_rate": 1.5841622832600976e-07, "loss": 0.0769, "num_input_tokens_seen": 23031808, "step": 5623 }, { "epoch": 4.1907600596125185, "grad_norm": 6.481135607994568, "learning_rate": 1.5813119229122847e-07, "loss": 0.0973, "num_input_tokens_seen": 23035904, "step": 5624 }, { "epoch": 4.19150521609538, "grad_norm": 5.405163859434207, "learning_rate": 1.5784639560188623e-07, "loss": 0.0713, "num_input_tokens_seen": 23040000, "step": 5625 }, { "epoch": 4.192250372578242, "grad_norm": 5.0023868947580565, "learning_rate": 1.5756183832041333e-07, "loss": 0.0877, "num_input_tokens_seen": 23044096, "step": 5626 }, { "epoch": 4.192995529061103, "grad_norm": 5.955522066252496, "learning_rate": 1.572775205091856e-07, "loss": 0.1011, "num_input_tokens_seen": 23048192, "step": 5627 }, { "epoch": 4.193740685543964, "grad_norm": 2.917719969311506, "learning_rate": 1.5699344223052846e-07, "loss": 0.0157, "num_input_tokens_seen": 23052288, "step": 5628 }, { "epoch": 4.194485842026825, "grad_norm": 4.436278104384092, "learning_rate": 1.567096035467132e-07, "loss": 0.0446, "num_input_tokens_seen": 23056384, "step": 5629 }, { "epoch": 4.195230998509687, "grad_norm": 5.8573368009985005, "learning_rate": 1.564260045199599e-07, "loss": 0.0433, "num_input_tokens_seen": 23060480, "step": 5630 }, { "epoch": 4.1959761549925485, "grad_norm": 5.416411532767775, "learning_rate": 1.5614264521243497e-07, "loss": 0.0681, "num_input_tokens_seen": 23064576, "step": 5631 }, { "epoch": 4.19672131147541, "grad_norm": 10.972959722980633, "learning_rate": 1.558595256862533e-07, "loss": 0.0595, "num_input_tokens_seen": 23068672, "step": 5632 }, { "epoch": 4.197466467958272, "grad_norm": 3.6939560759790075, "learning_rate": 1.5557664600347647e-07, "loss": 0.0271, "num_input_tokens_seen": 23072768, "step": 5633 }, { "epoch": 4.198211624441132, "grad_norm": 6.583297355642469, "learning_rate": 1.5529400622611336e-07, "loss": 0.0938, "num_input_tokens_seen": 23076864, "step": 5634 }, { "epoch": 4.198956780923994, "grad_norm": 8.076719248352957, "learning_rate": 1.5501160641612139e-07, "loss": 0.0938, "num_input_tokens_seen": 23080960, "step": 5635 }, { "epoch": 4.199701937406855, "grad_norm": 5.276533587956095, "learning_rate": 1.5472944663540392e-07, "loss": 0.0311, "num_input_tokens_seen": 23085056, "step": 5636 }, { "epoch": 4.200447093889717, "grad_norm": 7.402796294529712, "learning_rate": 1.5444752694581288e-07, "loss": 0.1758, "num_input_tokens_seen": 23089152, "step": 5637 }, { "epoch": 4.2011922503725785, "grad_norm": 3.8784721923257788, "learning_rate": 1.5416584740914675e-07, "loss": 0.0801, "num_input_tokens_seen": 23093248, "step": 5638 }, { "epoch": 4.20193740685544, "grad_norm": 8.068593099517345, "learning_rate": 1.5388440808715225e-07, "loss": 0.1041, "num_input_tokens_seen": 23097344, "step": 5639 }, { "epoch": 4.202682563338301, "grad_norm": 4.10402159636503, "learning_rate": 1.5360320904152195e-07, "loss": 0.0346, "num_input_tokens_seen": 23101440, "step": 5640 }, { "epoch": 4.203427719821162, "grad_norm": 5.871398758704319, "learning_rate": 1.533222503338974e-07, "loss": 0.0439, "num_input_tokens_seen": 23105536, "step": 5641 }, { "epoch": 4.204172876304024, "grad_norm": 6.378294555999191, "learning_rate": 1.5304153202586672e-07, "loss": 0.1136, "num_input_tokens_seen": 23109632, "step": 5642 }, { "epoch": 4.204918032786885, "grad_norm": 10.367674923204458, "learning_rate": 1.527610541789648e-07, "loss": 0.2737, "num_input_tokens_seen": 23113728, "step": 5643 }, { "epoch": 4.205663189269747, "grad_norm": 4.993923364270393, "learning_rate": 1.52480816854675e-07, "loss": 0.0634, "num_input_tokens_seen": 23117824, "step": 5644 }, { "epoch": 4.2064083457526085, "grad_norm": 5.602545388238422, "learning_rate": 1.5220082011442704e-07, "loss": 0.046, "num_input_tokens_seen": 23121920, "step": 5645 }, { "epoch": 4.207153502235469, "grad_norm": 8.414701326675118, "learning_rate": 1.5192106401959838e-07, "loss": 0.1432, "num_input_tokens_seen": 23126016, "step": 5646 }, { "epoch": 4.207898658718331, "grad_norm": 4.886469185272763, "learning_rate": 1.516415486315134e-07, "loss": 0.0302, "num_input_tokens_seen": 23130112, "step": 5647 }, { "epoch": 4.208643815201192, "grad_norm": 9.188160756770495, "learning_rate": 1.5136227401144412e-07, "loss": 0.1181, "num_input_tokens_seen": 23134208, "step": 5648 }, { "epoch": 4.209388971684054, "grad_norm": 5.8012038926752245, "learning_rate": 1.510832402206093e-07, "loss": 0.068, "num_input_tokens_seen": 23138304, "step": 5649 }, { "epoch": 4.210134128166915, "grad_norm": 6.014824985557311, "learning_rate": 1.5080444732017502e-07, "loss": 0.0846, "num_input_tokens_seen": 23142400, "step": 5650 }, { "epoch": 4.210879284649776, "grad_norm": 5.774487115991027, "learning_rate": 1.5052589537125504e-07, "loss": 0.0484, "num_input_tokens_seen": 23146496, "step": 5651 }, { "epoch": 4.211624441132638, "grad_norm": 2.9176857204806246, "learning_rate": 1.502475844349094e-07, "loss": 0.0185, "num_input_tokens_seen": 23150592, "step": 5652 }, { "epoch": 4.212369597615499, "grad_norm": 6.47678444643844, "learning_rate": 1.499695145721468e-07, "loss": 0.0881, "num_input_tokens_seen": 23154688, "step": 5653 }, { "epoch": 4.213114754098361, "grad_norm": 9.0629781167983, "learning_rate": 1.496916858439211e-07, "loss": 0.0454, "num_input_tokens_seen": 23158784, "step": 5654 }, { "epoch": 4.213859910581222, "grad_norm": 7.321385738366828, "learning_rate": 1.4941409831113487e-07, "loss": 0.1081, "num_input_tokens_seen": 23162880, "step": 5655 }, { "epoch": 4.214605067064084, "grad_norm": 10.545259154945285, "learning_rate": 1.4913675203463692e-07, "loss": 0.1462, "num_input_tokens_seen": 23166976, "step": 5656 }, { "epoch": 4.215350223546944, "grad_norm": 7.471553761930757, "learning_rate": 1.488596470752242e-07, "loss": 0.0928, "num_input_tokens_seen": 23171072, "step": 5657 }, { "epoch": 4.216095380029806, "grad_norm": 3.4080355376436917, "learning_rate": 1.485827834936393e-07, "loss": 0.0185, "num_input_tokens_seen": 23175168, "step": 5658 }, { "epoch": 4.2168405365126675, "grad_norm": 6.132112822304894, "learning_rate": 1.483061613505733e-07, "loss": 0.06, "num_input_tokens_seen": 23179264, "step": 5659 }, { "epoch": 4.217585692995529, "grad_norm": 7.760523343898847, "learning_rate": 1.480297807066637e-07, "loss": 0.0646, "num_input_tokens_seen": 23183360, "step": 5660 }, { "epoch": 4.218330849478391, "grad_norm": 6.165795032871871, "learning_rate": 1.477536416224945e-07, "loss": 0.0642, "num_input_tokens_seen": 23187456, "step": 5661 }, { "epoch": 4.219076005961252, "grad_norm": 3.8998429830204753, "learning_rate": 1.4747774415859825e-07, "loss": 0.0229, "num_input_tokens_seen": 23191552, "step": 5662 }, { "epoch": 4.219821162444113, "grad_norm": 7.466239798507912, "learning_rate": 1.4720208837545283e-07, "loss": 0.0296, "num_input_tokens_seen": 23195648, "step": 5663 }, { "epoch": 4.220566318926974, "grad_norm": 5.520491040866966, "learning_rate": 1.4692667433348448e-07, "loss": 0.0429, "num_input_tokens_seen": 23199744, "step": 5664 }, { "epoch": 4.221311475409836, "grad_norm": 7.654403447530073, "learning_rate": 1.466515020930656e-07, "loss": 0.0943, "num_input_tokens_seen": 23203840, "step": 5665 }, { "epoch": 4.2220566318926975, "grad_norm": 5.504205079292468, "learning_rate": 1.4637657171451657e-07, "loss": 0.0522, "num_input_tokens_seen": 23207936, "step": 5666 }, { "epoch": 4.222801788375559, "grad_norm": 4.571176210779818, "learning_rate": 1.4610188325810305e-07, "loss": 0.0226, "num_input_tokens_seen": 23212032, "step": 5667 }, { "epoch": 4.223546944858421, "grad_norm": 10.581819460108424, "learning_rate": 1.4582743678403968e-07, "loss": 0.1781, "num_input_tokens_seen": 23216128, "step": 5668 }, { "epoch": 4.224292101341281, "grad_norm": 4.276714190215616, "learning_rate": 1.455532323524865e-07, "loss": 0.0283, "num_input_tokens_seen": 23220224, "step": 5669 }, { "epoch": 4.225037257824143, "grad_norm": 6.504735128044387, "learning_rate": 1.45279270023551e-07, "loss": 0.0693, "num_input_tokens_seen": 23224320, "step": 5670 }, { "epoch": 4.225782414307004, "grad_norm": 9.190728263261667, "learning_rate": 1.450055498572882e-07, "loss": 0.1989, "num_input_tokens_seen": 23228416, "step": 5671 }, { "epoch": 4.226527570789866, "grad_norm": 9.536658448153213, "learning_rate": 1.4473207191369896e-07, "loss": 0.096, "num_input_tokens_seen": 23232512, "step": 5672 }, { "epoch": 4.2272727272727275, "grad_norm": 5.264853264516338, "learning_rate": 1.4445883625273216e-07, "loss": 0.026, "num_input_tokens_seen": 23236608, "step": 5673 }, { "epoch": 4.228017883755589, "grad_norm": 6.290317498801654, "learning_rate": 1.4418584293428253e-07, "loss": 0.0819, "num_input_tokens_seen": 23240704, "step": 5674 }, { "epoch": 4.22876304023845, "grad_norm": 4.751089009015087, "learning_rate": 1.4391309201819248e-07, "loss": 0.0378, "num_input_tokens_seen": 23244800, "step": 5675 }, { "epoch": 4.229508196721311, "grad_norm": 12.385294177803882, "learning_rate": 1.4364058356425098e-07, "loss": 0.0369, "num_input_tokens_seen": 23248896, "step": 5676 }, { "epoch": 4.230253353204173, "grad_norm": 7.103358291428938, "learning_rate": 1.433683176321933e-07, "loss": 0.1008, "num_input_tokens_seen": 23252992, "step": 5677 }, { "epoch": 4.230998509687034, "grad_norm": 6.220057761624552, "learning_rate": 1.4309629428170268e-07, "loss": 0.0333, "num_input_tokens_seen": 23257088, "step": 5678 }, { "epoch": 4.231743666169896, "grad_norm": 4.573029150950568, "learning_rate": 1.4282451357240823e-07, "loss": 0.0626, "num_input_tokens_seen": 23261184, "step": 5679 }, { "epoch": 4.2324888226527575, "grad_norm": 6.18050110901169, "learning_rate": 1.4255297556388673e-07, "loss": 0.0854, "num_input_tokens_seen": 23265280, "step": 5680 }, { "epoch": 4.233233979135618, "grad_norm": 5.2366670737000005, "learning_rate": 1.4228168031566041e-07, "loss": 0.0482, "num_input_tokens_seen": 23269376, "step": 5681 }, { "epoch": 4.23397913561848, "grad_norm": 6.517757305826588, "learning_rate": 1.420106278871998e-07, "loss": 0.0608, "num_input_tokens_seen": 23273472, "step": 5682 }, { "epoch": 4.234724292101341, "grad_norm": 4.7409645138606535, "learning_rate": 1.417398183379211e-07, "loss": 0.028, "num_input_tokens_seen": 23277568, "step": 5683 }, { "epoch": 4.235469448584203, "grad_norm": 2.6444118258441587, "learning_rate": 1.4146925172718802e-07, "loss": 0.0103, "num_input_tokens_seen": 23281664, "step": 5684 }, { "epoch": 4.236214605067064, "grad_norm": 4.833343067935845, "learning_rate": 1.4119892811431059e-07, "loss": 0.0768, "num_input_tokens_seen": 23285760, "step": 5685 }, { "epoch": 4.236959761549926, "grad_norm": 6.116316170786015, "learning_rate": 1.4092884755854573e-07, "loss": 0.077, "num_input_tokens_seen": 23289856, "step": 5686 }, { "epoch": 4.237704918032787, "grad_norm": 3.550726628087425, "learning_rate": 1.4065901011909688e-07, "loss": 0.0179, "num_input_tokens_seen": 23293952, "step": 5687 }, { "epoch": 4.238450074515648, "grad_norm": 6.7185266877574135, "learning_rate": 1.403894158551143e-07, "loss": 0.0708, "num_input_tokens_seen": 23298048, "step": 5688 }, { "epoch": 4.23919523099851, "grad_norm": 8.926672758151533, "learning_rate": 1.4012006482569535e-07, "loss": 0.1781, "num_input_tokens_seen": 23302144, "step": 5689 }, { "epoch": 4.239940387481371, "grad_norm": 4.595597428934135, "learning_rate": 1.3985095708988317e-07, "loss": 0.0396, "num_input_tokens_seen": 23306240, "step": 5690 }, { "epoch": 4.240685543964233, "grad_norm": 4.62323767499093, "learning_rate": 1.3958209270666863e-07, "loss": 0.0349, "num_input_tokens_seen": 23310336, "step": 5691 }, { "epoch": 4.2414307004470935, "grad_norm": 5.954873892036531, "learning_rate": 1.3931347173498824e-07, "loss": 0.0366, "num_input_tokens_seen": 23314432, "step": 5692 }, { "epoch": 4.242175856929955, "grad_norm": 10.121505527283697, "learning_rate": 1.3904509423372626e-07, "loss": 0.2066, "num_input_tokens_seen": 23318528, "step": 5693 }, { "epoch": 4.242921013412817, "grad_norm": 5.633319309905124, "learning_rate": 1.3877696026171215e-07, "loss": 0.0607, "num_input_tokens_seen": 23322624, "step": 5694 }, { "epoch": 4.243666169895678, "grad_norm": 7.852355238093951, "learning_rate": 1.3850906987772333e-07, "loss": 0.0892, "num_input_tokens_seen": 23326720, "step": 5695 }, { "epoch": 4.24441132637854, "grad_norm": 5.8359184744951245, "learning_rate": 1.38241423140483e-07, "loss": 0.069, "num_input_tokens_seen": 23330816, "step": 5696 }, { "epoch": 4.245156482861401, "grad_norm": 4.762278671980394, "learning_rate": 1.3797402010866158e-07, "loss": 0.014, "num_input_tokens_seen": 23334912, "step": 5697 }, { "epoch": 4.245901639344262, "grad_norm": 1.7782431898011601, "learning_rate": 1.3770686084087547e-07, "loss": 0.0069, "num_input_tokens_seen": 23339008, "step": 5698 }, { "epoch": 4.2466467958271235, "grad_norm": 5.308776557948552, "learning_rate": 1.3743994539568774e-07, "loss": 0.0435, "num_input_tokens_seen": 23343104, "step": 5699 }, { "epoch": 4.247391952309985, "grad_norm": 8.988841311073745, "learning_rate": 1.3717327383160865e-07, "loss": 0.1268, "num_input_tokens_seen": 23347200, "step": 5700 }, { "epoch": 4.248137108792847, "grad_norm": 6.546727613019836, "learning_rate": 1.3690684620709396e-07, "loss": 0.0447, "num_input_tokens_seen": 23351296, "step": 5701 }, { "epoch": 4.248882265275708, "grad_norm": 4.7177779968258875, "learning_rate": 1.3664066258054707e-07, "loss": 0.0372, "num_input_tokens_seen": 23355392, "step": 5702 }, { "epoch": 4.24962742175857, "grad_norm": 9.067845074379147, "learning_rate": 1.3637472301031684e-07, "loss": 0.1508, "num_input_tokens_seen": 23359488, "step": 5703 }, { "epoch": 4.25037257824143, "grad_norm": 4.675166576232786, "learning_rate": 1.3610902755469947e-07, "loss": 0.0412, "num_input_tokens_seen": 23363584, "step": 5704 }, { "epoch": 4.251117734724292, "grad_norm": 6.9276588706055815, "learning_rate": 1.3584357627193705e-07, "loss": 0.1869, "num_input_tokens_seen": 23367680, "step": 5705 }, { "epoch": 4.251862891207153, "grad_norm": 7.425885852334372, "learning_rate": 1.355783692202184e-07, "loss": 0.0785, "num_input_tokens_seen": 23371776, "step": 5706 }, { "epoch": 4.252608047690015, "grad_norm": 6.797810779290849, "learning_rate": 1.3531340645767944e-07, "loss": 0.0689, "num_input_tokens_seen": 23375872, "step": 5707 }, { "epoch": 4.2533532041728765, "grad_norm": 4.1746802561036604, "learning_rate": 1.3504868804240082e-07, "loss": 0.0187, "num_input_tokens_seen": 23379968, "step": 5708 }, { "epoch": 4.254098360655738, "grad_norm": 8.8813117942273, "learning_rate": 1.347842140324114e-07, "loss": 0.135, "num_input_tokens_seen": 23384064, "step": 5709 }, { "epoch": 4.254843517138599, "grad_norm": 3.9397669043160994, "learning_rate": 1.3451998448568537e-07, "loss": 0.0305, "num_input_tokens_seen": 23388160, "step": 5710 }, { "epoch": 4.25558867362146, "grad_norm": 6.435733902002905, "learning_rate": 1.342559994601443e-07, "loss": 0.0722, "num_input_tokens_seen": 23392256, "step": 5711 }, { "epoch": 4.256333830104322, "grad_norm": 10.706526733635249, "learning_rate": 1.3399225901365498e-07, "loss": 0.3268, "num_input_tokens_seen": 23396352, "step": 5712 }, { "epoch": 4.257078986587183, "grad_norm": 6.562415767994476, "learning_rate": 1.3372876320403182e-07, "loss": 0.076, "num_input_tokens_seen": 23400448, "step": 5713 }, { "epoch": 4.257824143070045, "grad_norm": 4.031963348032939, "learning_rate": 1.3346551208903464e-07, "loss": 0.0237, "num_input_tokens_seen": 23404544, "step": 5714 }, { "epoch": 4.2585692995529065, "grad_norm": 1.7180046387643677, "learning_rate": 1.3320250572636973e-07, "loss": 0.005, "num_input_tokens_seen": 23408640, "step": 5715 }, { "epoch": 4.259314456035767, "grad_norm": 7.279270536080038, "learning_rate": 1.3293974417369043e-07, "loss": 0.0719, "num_input_tokens_seen": 23412736, "step": 5716 }, { "epoch": 4.260059612518629, "grad_norm": 10.392055665984385, "learning_rate": 1.326772274885954e-07, "loss": 0.0585, "num_input_tokens_seen": 23416832, "step": 5717 }, { "epoch": 4.26080476900149, "grad_norm": 2.768545355091304, "learning_rate": 1.3241495572863082e-07, "loss": 0.0232, "num_input_tokens_seen": 23420928, "step": 5718 }, { "epoch": 4.261549925484352, "grad_norm": 7.563924771739446, "learning_rate": 1.3215292895128794e-07, "loss": 0.169, "num_input_tokens_seen": 23425024, "step": 5719 }, { "epoch": 4.262295081967213, "grad_norm": 4.438165526592119, "learning_rate": 1.3189114721400582e-07, "loss": 0.039, "num_input_tokens_seen": 23429120, "step": 5720 }, { "epoch": 4.263040238450074, "grad_norm": 6.15977572424064, "learning_rate": 1.3162961057416754e-07, "loss": 0.057, "num_input_tokens_seen": 23433216, "step": 5721 }, { "epoch": 4.263785394932936, "grad_norm": 6.264567266931347, "learning_rate": 1.313683190891049e-07, "loss": 0.0316, "num_input_tokens_seen": 23437312, "step": 5722 }, { "epoch": 4.264530551415797, "grad_norm": 5.991659811787031, "learning_rate": 1.3110727281609405e-07, "loss": 0.0597, "num_input_tokens_seen": 23441408, "step": 5723 }, { "epoch": 4.265275707898659, "grad_norm": 5.929659646804255, "learning_rate": 1.3084647181235893e-07, "loss": 0.0552, "num_input_tokens_seen": 23445504, "step": 5724 }, { "epoch": 4.26602086438152, "grad_norm": 3.5675541211768502, "learning_rate": 1.305859161350688e-07, "loss": 0.0071, "num_input_tokens_seen": 23449600, "step": 5725 }, { "epoch": 4.266766020864382, "grad_norm": 7.074327309540182, "learning_rate": 1.303256058413388e-07, "loss": 0.1031, "num_input_tokens_seen": 23453696, "step": 5726 }, { "epoch": 4.267511177347243, "grad_norm": 7.354042207961501, "learning_rate": 1.3006554098823148e-07, "loss": 0.1152, "num_input_tokens_seen": 23457792, "step": 5727 }, { "epoch": 4.268256333830104, "grad_norm": 7.110966962358315, "learning_rate": 1.2980572163275432e-07, "loss": 0.0661, "num_input_tokens_seen": 23461888, "step": 5728 }, { "epoch": 4.269001490312966, "grad_norm": 8.649947494290098, "learning_rate": 1.2954614783186216e-07, "loss": 0.2261, "num_input_tokens_seen": 23465984, "step": 5729 }, { "epoch": 4.269746646795827, "grad_norm": 4.6783238285694075, "learning_rate": 1.2928681964245487e-07, "loss": 0.0478, "num_input_tokens_seen": 23470080, "step": 5730 }, { "epoch": 4.270491803278689, "grad_norm": 7.964832653375642, "learning_rate": 1.2902773712137954e-07, "loss": 0.1743, "num_input_tokens_seen": 23474176, "step": 5731 }, { "epoch": 4.27123695976155, "grad_norm": 4.046387049048281, "learning_rate": 1.2876890032542875e-07, "loss": 0.0265, "num_input_tokens_seen": 23478272, "step": 5732 }, { "epoch": 4.271982116244411, "grad_norm": 8.365005220918242, "learning_rate": 1.2851030931134113e-07, "loss": 0.0569, "num_input_tokens_seen": 23482368, "step": 5733 }, { "epoch": 4.2727272727272725, "grad_norm": 4.615028873123353, "learning_rate": 1.2825196413580223e-07, "loss": 0.037, "num_input_tokens_seen": 23486464, "step": 5734 }, { "epoch": 4.273472429210134, "grad_norm": 8.025032064102843, "learning_rate": 1.279938648554424e-07, "loss": 0.1055, "num_input_tokens_seen": 23490560, "step": 5735 }, { "epoch": 4.274217585692996, "grad_norm": 5.265260729142364, "learning_rate": 1.2773601152683945e-07, "loss": 0.0705, "num_input_tokens_seen": 23494656, "step": 5736 }, { "epoch": 4.274962742175857, "grad_norm": 8.411798861023113, "learning_rate": 1.2747840420651622e-07, "loss": 0.142, "num_input_tokens_seen": 23498752, "step": 5737 }, { "epoch": 4.275707898658719, "grad_norm": 6.428687509914629, "learning_rate": 1.2722104295094255e-07, "loss": 0.0347, "num_input_tokens_seen": 23502848, "step": 5738 }, { "epoch": 4.276453055141579, "grad_norm": 5.516893210915721, "learning_rate": 1.2696392781653327e-07, "loss": 0.0558, "num_input_tokens_seen": 23506944, "step": 5739 }, { "epoch": 4.277198211624441, "grad_norm": 4.887067548516521, "learning_rate": 1.2670705885965056e-07, "loss": 0.0282, "num_input_tokens_seen": 23511040, "step": 5740 }, { "epoch": 4.2779433681073025, "grad_norm": 10.269162807066412, "learning_rate": 1.264504361366016e-07, "loss": 0.1982, "num_input_tokens_seen": 23515136, "step": 5741 }, { "epoch": 4.278688524590164, "grad_norm": 6.445492162604763, "learning_rate": 1.2619405970363956e-07, "loss": 0.0781, "num_input_tokens_seen": 23519232, "step": 5742 }, { "epoch": 4.279433681073026, "grad_norm": 6.9861778604446, "learning_rate": 1.2593792961696467e-07, "loss": 0.0724, "num_input_tokens_seen": 23523328, "step": 5743 }, { "epoch": 4.280178837555887, "grad_norm": 4.789824430536046, "learning_rate": 1.2568204593272182e-07, "loss": 0.0404, "num_input_tokens_seen": 23527424, "step": 5744 }, { "epoch": 4.280923994038748, "grad_norm": 7.046992490423728, "learning_rate": 1.25426408707003e-07, "loss": 0.102, "num_input_tokens_seen": 23531520, "step": 5745 }, { "epoch": 4.281669150521609, "grad_norm": 4.374096215906728, "learning_rate": 1.2517101799584542e-07, "loss": 0.0524, "num_input_tokens_seen": 23535616, "step": 5746 }, { "epoch": 4.282414307004471, "grad_norm": 5.001151978539527, "learning_rate": 1.249158738552332e-07, "loss": 0.0298, "num_input_tokens_seen": 23539712, "step": 5747 }, { "epoch": 4.2831594634873325, "grad_norm": 8.855294500474557, "learning_rate": 1.2466097634109485e-07, "loss": 0.1002, "num_input_tokens_seen": 23543808, "step": 5748 }, { "epoch": 4.283904619970194, "grad_norm": 7.123867348819998, "learning_rate": 1.2440632550930614e-07, "loss": 0.0887, "num_input_tokens_seen": 23547904, "step": 5749 }, { "epoch": 4.284649776453055, "grad_norm": 5.347831239540861, "learning_rate": 1.2415192141568819e-07, "loss": 0.0418, "num_input_tokens_seen": 23552000, "step": 5750 }, { "epoch": 4.285394932935916, "grad_norm": 7.972764361402645, "learning_rate": 1.238977641160087e-07, "loss": 0.1683, "num_input_tokens_seen": 23556096, "step": 5751 }, { "epoch": 4.286140089418778, "grad_norm": 5.3535485318374105, "learning_rate": 1.2364385366598023e-07, "loss": 0.0718, "num_input_tokens_seen": 23560192, "step": 5752 }, { "epoch": 4.286885245901639, "grad_norm": 6.252045545823768, "learning_rate": 1.2339019012126182e-07, "loss": 0.0611, "num_input_tokens_seen": 23564288, "step": 5753 }, { "epoch": 4.287630402384501, "grad_norm": 6.0513432504942415, "learning_rate": 1.2313677353745863e-07, "loss": 0.0579, "num_input_tokens_seen": 23568384, "step": 5754 }, { "epoch": 4.288375558867362, "grad_norm": 4.561505851260121, "learning_rate": 1.2288360397012104e-07, "loss": 0.0274, "num_input_tokens_seen": 23572480, "step": 5755 }, { "epoch": 4.289120715350224, "grad_norm": 8.101745078776174, "learning_rate": 1.2263068147474592e-07, "loss": 0.1147, "num_input_tokens_seen": 23576576, "step": 5756 }, { "epoch": 4.289865871833085, "grad_norm": 8.206957187846799, "learning_rate": 1.2237800610677545e-07, "loss": 0.0667, "num_input_tokens_seen": 23580672, "step": 5757 }, { "epoch": 4.290611028315946, "grad_norm": 7.324088425678658, "learning_rate": 1.221255779215981e-07, "loss": 0.091, "num_input_tokens_seen": 23584768, "step": 5758 }, { "epoch": 4.291356184798808, "grad_norm": 8.080370845111705, "learning_rate": 1.218733969745478e-07, "loss": 0.1346, "num_input_tokens_seen": 23588864, "step": 5759 }, { "epoch": 4.292101341281669, "grad_norm": 5.142456727290856, "learning_rate": 1.2162146332090424e-07, "loss": 0.0313, "num_input_tokens_seen": 23592960, "step": 5760 }, { "epoch": 4.292846497764531, "grad_norm": 5.780641064365937, "learning_rate": 1.2136977701589377e-07, "loss": 0.0579, "num_input_tokens_seen": 23597056, "step": 5761 }, { "epoch": 4.2935916542473915, "grad_norm": 8.864179975362147, "learning_rate": 1.2111833811468675e-07, "loss": 0.0838, "num_input_tokens_seen": 23601152, "step": 5762 }, { "epoch": 4.294336810730253, "grad_norm": 8.507829260825321, "learning_rate": 1.2086714667240124e-07, "loss": 0.1343, "num_input_tokens_seen": 23605248, "step": 5763 }, { "epoch": 4.295081967213115, "grad_norm": 5.36867497955952, "learning_rate": 1.2061620274409977e-07, "loss": 0.0368, "num_input_tokens_seen": 23609344, "step": 5764 }, { "epoch": 4.295827123695976, "grad_norm": 9.23216049173036, "learning_rate": 1.2036550638479125e-07, "loss": 0.1502, "num_input_tokens_seen": 23613440, "step": 5765 }, { "epoch": 4.296572280178838, "grad_norm": 7.719449665895207, "learning_rate": 1.201150576494299e-07, "loss": 0.1131, "num_input_tokens_seen": 23617536, "step": 5766 }, { "epoch": 4.297317436661699, "grad_norm": 5.820094731007415, "learning_rate": 1.1986485659291626e-07, "loss": 0.0405, "num_input_tokens_seen": 23621632, "step": 5767 }, { "epoch": 4.29806259314456, "grad_norm": 8.12648016034271, "learning_rate": 1.1961490327009564e-07, "loss": 0.1654, "num_input_tokens_seen": 23625728, "step": 5768 }, { "epoch": 4.2988077496274215, "grad_norm": 2.9917950454905484, "learning_rate": 1.1936519773576017e-07, "loss": 0.0098, "num_input_tokens_seen": 23629824, "step": 5769 }, { "epoch": 4.299552906110283, "grad_norm": 7.322853761777646, "learning_rate": 1.1911574004464687e-07, "loss": 0.0267, "num_input_tokens_seen": 23633920, "step": 5770 }, { "epoch": 4.300298062593145, "grad_norm": 7.594681877162013, "learning_rate": 1.1886653025143816e-07, "loss": 0.0885, "num_input_tokens_seen": 23638016, "step": 5771 }, { "epoch": 4.301043219076006, "grad_norm": 5.721565325694691, "learning_rate": 1.186175684107635e-07, "loss": 0.0634, "num_input_tokens_seen": 23642112, "step": 5772 }, { "epoch": 4.301788375558868, "grad_norm": 6.3448104223848185, "learning_rate": 1.1836885457719618e-07, "loss": 0.0723, "num_input_tokens_seen": 23646208, "step": 5773 }, { "epoch": 4.302533532041728, "grad_norm": 7.492524726196526, "learning_rate": 1.181203888052572e-07, "loss": 0.1378, "num_input_tokens_seen": 23650304, "step": 5774 }, { "epoch": 4.30327868852459, "grad_norm": 9.216574478168136, "learning_rate": 1.1787217114941075e-07, "loss": 0.2264, "num_input_tokens_seen": 23654400, "step": 5775 }, { "epoch": 4.3040238450074515, "grad_norm": 5.3569843272671385, "learning_rate": 1.176242016640687e-07, "loss": 0.0603, "num_input_tokens_seen": 23658496, "step": 5776 }, { "epoch": 4.304769001490313, "grad_norm": 10.127224934146376, "learning_rate": 1.1737648040358735e-07, "loss": 0.11, "num_input_tokens_seen": 23662592, "step": 5777 }, { "epoch": 4.305514157973175, "grad_norm": 5.8472303176679805, "learning_rate": 1.1712900742226933e-07, "loss": 0.0642, "num_input_tokens_seen": 23666688, "step": 5778 }, { "epoch": 4.306259314456036, "grad_norm": 5.931282059526294, "learning_rate": 1.1688178277436236e-07, "loss": 0.0867, "num_input_tokens_seen": 23670784, "step": 5779 }, { "epoch": 4.307004470938897, "grad_norm": 5.920016791391644, "learning_rate": 1.1663480651405962e-07, "loss": 0.1668, "num_input_tokens_seen": 23674880, "step": 5780 }, { "epoch": 4.307749627421758, "grad_norm": 5.0361071490799745, "learning_rate": 1.1638807869550032e-07, "loss": 0.062, "num_input_tokens_seen": 23678976, "step": 5781 }, { "epoch": 4.30849478390462, "grad_norm": 8.865775527457464, "learning_rate": 1.1614159937276881e-07, "loss": 0.1565, "num_input_tokens_seen": 23683072, "step": 5782 }, { "epoch": 4.3092399403874815, "grad_norm": 5.158421458455046, "learning_rate": 1.1589536859989537e-07, "loss": 0.0396, "num_input_tokens_seen": 23687168, "step": 5783 }, { "epoch": 4.309985096870343, "grad_norm": 7.35340904834581, "learning_rate": 1.15649386430855e-07, "loss": 0.1147, "num_input_tokens_seen": 23691264, "step": 5784 }, { "epoch": 4.310730253353205, "grad_norm": 7.247301994648489, "learning_rate": 1.1540365291956942e-07, "loss": 0.0988, "num_input_tokens_seen": 23695360, "step": 5785 }, { "epoch": 4.311475409836065, "grad_norm": 7.58567738219107, "learning_rate": 1.1515816811990488e-07, "loss": 0.1019, "num_input_tokens_seen": 23699456, "step": 5786 }, { "epoch": 4.312220566318927, "grad_norm": 7.607859003274164, "learning_rate": 1.1491293208567317e-07, "loss": 0.1257, "num_input_tokens_seen": 23703552, "step": 5787 }, { "epoch": 4.312965722801788, "grad_norm": 6.494617306079506, "learning_rate": 1.1466794487063226e-07, "loss": 0.0617, "num_input_tokens_seen": 23707648, "step": 5788 }, { "epoch": 4.31371087928465, "grad_norm": 6.467419563412846, "learning_rate": 1.1442320652848476e-07, "loss": 0.0903, "num_input_tokens_seen": 23711744, "step": 5789 }, { "epoch": 4.3144560357675115, "grad_norm": 7.471761959919347, "learning_rate": 1.1417871711287914e-07, "loss": 0.0888, "num_input_tokens_seen": 23715840, "step": 5790 }, { "epoch": 4.315201192250372, "grad_norm": 7.763954534237204, "learning_rate": 1.1393447667740907e-07, "loss": 0.1925, "num_input_tokens_seen": 23719936, "step": 5791 }, { "epoch": 4.315946348733234, "grad_norm": 5.060639329311461, "learning_rate": 1.1369048527561405e-07, "loss": 0.098, "num_input_tokens_seen": 23724032, "step": 5792 }, { "epoch": 4.316691505216095, "grad_norm": 9.190017570284944, "learning_rate": 1.1344674296097855e-07, "loss": 0.0424, "num_input_tokens_seen": 23728128, "step": 5793 }, { "epoch": 4.317436661698957, "grad_norm": 5.036574390872392, "learning_rate": 1.1320324978693286e-07, "loss": 0.0441, "num_input_tokens_seen": 23732224, "step": 5794 }, { "epoch": 4.318181818181818, "grad_norm": 4.201770790932109, "learning_rate": 1.1296000580685196e-07, "loss": 0.023, "num_input_tokens_seen": 23736320, "step": 5795 }, { "epoch": 4.31892697466468, "grad_norm": 7.809086728650842, "learning_rate": 1.1271701107405732e-07, "loss": 0.0883, "num_input_tokens_seen": 23740416, "step": 5796 }, { "epoch": 4.3196721311475414, "grad_norm": 8.150506167308018, "learning_rate": 1.124742656418147e-07, "loss": 0.0584, "num_input_tokens_seen": 23744512, "step": 5797 }, { "epoch": 4.320417287630402, "grad_norm": 7.744885795968619, "learning_rate": 1.1223176956333556e-07, "loss": 0.1337, "num_input_tokens_seen": 23748608, "step": 5798 }, { "epoch": 4.321162444113264, "grad_norm": 4.484330182215892, "learning_rate": 1.1198952289177709e-07, "loss": 0.0279, "num_input_tokens_seen": 23752704, "step": 5799 }, { "epoch": 4.321907600596125, "grad_norm": 6.202481291739438, "learning_rate": 1.1174752568024111e-07, "loss": 0.0823, "num_input_tokens_seen": 23756800, "step": 5800 }, { "epoch": 4.322652757078987, "grad_norm": 4.571927931573388, "learning_rate": 1.1150577798177575e-07, "loss": 0.0555, "num_input_tokens_seen": 23760896, "step": 5801 }, { "epoch": 4.323397913561848, "grad_norm": 6.373154812248123, "learning_rate": 1.1126427984937308e-07, "loss": 0.0745, "num_input_tokens_seen": 23764992, "step": 5802 }, { "epoch": 4.324143070044709, "grad_norm": 6.159453936764878, "learning_rate": 1.110230313359717e-07, "loss": 0.0926, "num_input_tokens_seen": 23769088, "step": 5803 }, { "epoch": 4.3248882265275705, "grad_norm": 3.614203953047739, "learning_rate": 1.1078203249445474e-07, "loss": 0.0191, "num_input_tokens_seen": 23773184, "step": 5804 }, { "epoch": 4.325633383010432, "grad_norm": 6.442381031806356, "learning_rate": 1.1054128337765119e-07, "loss": 0.0487, "num_input_tokens_seen": 23777280, "step": 5805 }, { "epoch": 4.326378539493294, "grad_norm": 10.760506791343209, "learning_rate": 1.103007840383348e-07, "loss": 0.0498, "num_input_tokens_seen": 23781376, "step": 5806 }, { "epoch": 4.327123695976155, "grad_norm": 5.616004931579958, "learning_rate": 1.1006053452922452e-07, "loss": 0.0293, "num_input_tokens_seen": 23785472, "step": 5807 }, { "epoch": 4.327868852459017, "grad_norm": 4.261411000461505, "learning_rate": 1.0982053490298517e-07, "loss": 0.0105, "num_input_tokens_seen": 23789568, "step": 5808 }, { "epoch": 4.328614008941877, "grad_norm": 3.3768407500996043, "learning_rate": 1.095807852122259e-07, "loss": 0.0104, "num_input_tokens_seen": 23793664, "step": 5809 }, { "epoch": 4.329359165424739, "grad_norm": 4.6209498769152155, "learning_rate": 1.0934128550950204e-07, "loss": 0.0473, "num_input_tokens_seen": 23797760, "step": 5810 }, { "epoch": 4.3301043219076005, "grad_norm": 6.7522593313060035, "learning_rate": 1.0910203584731328e-07, "loss": 0.0565, "num_input_tokens_seen": 23801856, "step": 5811 }, { "epoch": 4.330849478390462, "grad_norm": 8.155010736793303, "learning_rate": 1.0886303627810512e-07, "loss": 0.145, "num_input_tokens_seen": 23805952, "step": 5812 }, { "epoch": 4.331594634873324, "grad_norm": 7.136012076308992, "learning_rate": 1.0862428685426806e-07, "loss": 0.1182, "num_input_tokens_seen": 23810048, "step": 5813 }, { "epoch": 4.332339791356185, "grad_norm": 2.854227558992479, "learning_rate": 1.0838578762813732e-07, "loss": 0.0086, "num_input_tokens_seen": 23814144, "step": 5814 }, { "epoch": 4.333084947839046, "grad_norm": 4.547586607593548, "learning_rate": 1.0814753865199354e-07, "loss": 0.0255, "num_input_tokens_seen": 23818240, "step": 5815 }, { "epoch": 4.333830104321907, "grad_norm": 4.449141211757319, "learning_rate": 1.0790953997806316e-07, "loss": 0.0235, "num_input_tokens_seen": 23822336, "step": 5816 }, { "epoch": 4.334575260804769, "grad_norm": 11.328298062221627, "learning_rate": 1.0767179165851683e-07, "loss": 0.0885, "num_input_tokens_seen": 23826432, "step": 5817 }, { "epoch": 4.3353204172876305, "grad_norm": 6.229702180249631, "learning_rate": 1.0743429374547045e-07, "loss": 0.0558, "num_input_tokens_seen": 23830528, "step": 5818 }, { "epoch": 4.336065573770492, "grad_norm": 9.29652524308159, "learning_rate": 1.0719704629098577e-07, "loss": 0.1669, "num_input_tokens_seen": 23834624, "step": 5819 }, { "epoch": 4.336810730253354, "grad_norm": 6.018635862419418, "learning_rate": 1.0696004934706878e-07, "loss": 0.0766, "num_input_tokens_seen": 23838720, "step": 5820 }, { "epoch": 4.337555886736214, "grad_norm": 1.7584131706641413, "learning_rate": 1.0672330296567113e-07, "loss": 0.0057, "num_input_tokens_seen": 23842816, "step": 5821 }, { "epoch": 4.338301043219076, "grad_norm": 6.351067700545628, "learning_rate": 1.0648680719868895e-07, "loss": 0.0925, "num_input_tokens_seen": 23846912, "step": 5822 }, { "epoch": 4.339046199701937, "grad_norm": 9.579990853684889, "learning_rate": 1.0625056209796425e-07, "loss": 0.0231, "num_input_tokens_seen": 23851008, "step": 5823 }, { "epoch": 4.339791356184799, "grad_norm": 4.845953862943443, "learning_rate": 1.0601456771528351e-07, "loss": 0.0461, "num_input_tokens_seen": 23855104, "step": 5824 }, { "epoch": 4.3405365126676605, "grad_norm": 6.057580504232753, "learning_rate": 1.0577882410237814e-07, "loss": 0.0504, "num_input_tokens_seen": 23859200, "step": 5825 }, { "epoch": 4.341281669150522, "grad_norm": 8.344704248710011, "learning_rate": 1.0554333131092511e-07, "loss": 0.13, "num_input_tokens_seen": 23863296, "step": 5826 }, { "epoch": 4.342026825633383, "grad_norm": 7.8877200776716805, "learning_rate": 1.0530808939254581e-07, "loss": 0.075, "num_input_tokens_seen": 23867392, "step": 5827 }, { "epoch": 4.342771982116244, "grad_norm": 6.982522181114342, "learning_rate": 1.0507309839880756e-07, "loss": 0.083, "num_input_tokens_seen": 23871488, "step": 5828 }, { "epoch": 4.343517138599106, "grad_norm": 6.9819364195419755, "learning_rate": 1.0483835838122125e-07, "loss": 0.0559, "num_input_tokens_seen": 23875584, "step": 5829 }, { "epoch": 4.344262295081967, "grad_norm": 5.658392289472649, "learning_rate": 1.046038693912442e-07, "loss": 0.0426, "num_input_tokens_seen": 23879680, "step": 5830 }, { "epoch": 4.345007451564829, "grad_norm": 4.286128230364197, "learning_rate": 1.0436963148027765e-07, "loss": 0.0302, "num_input_tokens_seen": 23883776, "step": 5831 }, { "epoch": 4.34575260804769, "grad_norm": 6.451099692512212, "learning_rate": 1.0413564469966858e-07, "loss": 0.0233, "num_input_tokens_seen": 23887872, "step": 5832 }, { "epoch": 4.346497764530551, "grad_norm": 7.348056434602623, "learning_rate": 1.0390190910070846e-07, "loss": 0.1163, "num_input_tokens_seen": 23891968, "step": 5833 }, { "epoch": 4.347242921013413, "grad_norm": 8.358436370697717, "learning_rate": 1.036684247346334e-07, "loss": 0.0244, "num_input_tokens_seen": 23896064, "step": 5834 }, { "epoch": 4.347988077496274, "grad_norm": 5.216881398445287, "learning_rate": 1.034351916526255e-07, "loss": 0.0534, "num_input_tokens_seen": 23900160, "step": 5835 }, { "epoch": 4.348733233979136, "grad_norm": 6.406542983276428, "learning_rate": 1.0320220990581054e-07, "loss": 0.0871, "num_input_tokens_seen": 23904256, "step": 5836 }, { "epoch": 4.349478390461997, "grad_norm": 6.835665539598329, "learning_rate": 1.0296947954526015e-07, "loss": 0.0542, "num_input_tokens_seen": 23908352, "step": 5837 }, { "epoch": 4.350223546944858, "grad_norm": 2.8933748212441457, "learning_rate": 1.0273700062199035e-07, "loss": 0.0078, "num_input_tokens_seen": 23912448, "step": 5838 }, { "epoch": 4.35096870342772, "grad_norm": 5.317170643148368, "learning_rate": 1.0250477318696231e-07, "loss": 0.0197, "num_input_tokens_seen": 23916544, "step": 5839 }, { "epoch": 4.351713859910581, "grad_norm": 4.026529599051946, "learning_rate": 1.0227279729108184e-07, "loss": 0.0358, "num_input_tokens_seen": 23920640, "step": 5840 }, { "epoch": 4.352459016393443, "grad_norm": 3.6563354927156393, "learning_rate": 1.0204107298519964e-07, "loss": 0.0381, "num_input_tokens_seen": 23924736, "step": 5841 }, { "epoch": 4.353204172876304, "grad_norm": 6.9757585283867165, "learning_rate": 1.0180960032011134e-07, "loss": 0.1503, "num_input_tokens_seen": 23928832, "step": 5842 }, { "epoch": 4.353949329359166, "grad_norm": 5.863737317550653, "learning_rate": 1.015783793465576e-07, "loss": 0.0309, "num_input_tokens_seen": 23932928, "step": 5843 }, { "epoch": 4.3546944858420265, "grad_norm": 8.274869610561225, "learning_rate": 1.0134741011522357e-07, "loss": 0.1143, "num_input_tokens_seen": 23937024, "step": 5844 }, { "epoch": 4.355439642324888, "grad_norm": 7.271702479718703, "learning_rate": 1.0111669267673901e-07, "loss": 0.0885, "num_input_tokens_seen": 23941120, "step": 5845 }, { "epoch": 4.35618479880775, "grad_norm": 7.598477668705093, "learning_rate": 1.0088622708167961e-07, "loss": 0.0627, "num_input_tokens_seen": 23945216, "step": 5846 }, { "epoch": 4.356929955290611, "grad_norm": 7.29790496729295, "learning_rate": 1.0065601338056425e-07, "loss": 0.1611, "num_input_tokens_seen": 23949312, "step": 5847 }, { "epoch": 4.357675111773473, "grad_norm": 8.195354147586052, "learning_rate": 1.004260516238581e-07, "loss": 0.1704, "num_input_tokens_seen": 23953408, "step": 5848 }, { "epoch": 4.358420268256334, "grad_norm": 3.331671939520442, "learning_rate": 1.0019634186197003e-07, "loss": 0.0156, "num_input_tokens_seen": 23957504, "step": 5849 }, { "epoch": 4.359165424739195, "grad_norm": 8.221263810900119, "learning_rate": 9.996688414525419e-08, "loss": 0.1878, "num_input_tokens_seen": 23961600, "step": 5850 }, { "epoch": 4.359910581222056, "grad_norm": 6.654880332493353, "learning_rate": 9.973767852400936e-08, "loss": 0.0825, "num_input_tokens_seen": 23965696, "step": 5851 }, { "epoch": 4.360655737704918, "grad_norm": 3.1384447240830924, "learning_rate": 9.950872504847886e-08, "loss": 0.0221, "num_input_tokens_seen": 23969792, "step": 5852 }, { "epoch": 4.3614008941877795, "grad_norm": 2.9565118191914928, "learning_rate": 9.928002376885127e-08, "loss": 0.0122, "num_input_tokens_seen": 23973888, "step": 5853 }, { "epoch": 4.362146050670641, "grad_norm": 12.929016624945852, "learning_rate": 9.905157473525898e-08, "loss": 0.1987, "num_input_tokens_seen": 23977984, "step": 5854 }, { "epoch": 4.362891207153503, "grad_norm": 6.997643289917573, "learning_rate": 9.882337799778055e-08, "loss": 0.0464, "num_input_tokens_seen": 23982080, "step": 5855 }, { "epoch": 4.363636363636363, "grad_norm": 5.497688973417877, "learning_rate": 9.85954336064375e-08, "loss": 0.0525, "num_input_tokens_seen": 23986176, "step": 5856 }, { "epoch": 4.364381520119225, "grad_norm": 10.228077808945528, "learning_rate": 9.836774161119721e-08, "loss": 0.2091, "num_input_tokens_seen": 23990272, "step": 5857 }, { "epoch": 4.365126676602086, "grad_norm": 5.422504643381722, "learning_rate": 9.814030206197117e-08, "loss": 0.0594, "num_input_tokens_seen": 23994368, "step": 5858 }, { "epoch": 4.365871833084948, "grad_norm": 7.03023447496538, "learning_rate": 9.791311500861614e-08, "loss": 0.0351, "num_input_tokens_seen": 23998464, "step": 5859 }, { "epoch": 4.3666169895678095, "grad_norm": 7.640315673397386, "learning_rate": 9.76861805009327e-08, "loss": 0.0826, "num_input_tokens_seen": 24002560, "step": 5860 }, { "epoch": 4.36736214605067, "grad_norm": 6.896554182654452, "learning_rate": 9.745949858866705e-08, "loss": 0.0926, "num_input_tokens_seen": 24006656, "step": 5861 }, { "epoch": 4.368107302533532, "grad_norm": 7.112214607529806, "learning_rate": 9.723306932150914e-08, "loss": 0.1162, "num_input_tokens_seen": 24010752, "step": 5862 }, { "epoch": 4.368852459016393, "grad_norm": 7.443878825502054, "learning_rate": 9.700689274909373e-08, "loss": 0.1043, "num_input_tokens_seen": 24014848, "step": 5863 }, { "epoch": 4.369597615499255, "grad_norm": 8.690915903552703, "learning_rate": 9.678096892100072e-08, "loss": 0.1337, "num_input_tokens_seen": 24018944, "step": 5864 }, { "epoch": 4.370342771982116, "grad_norm": 8.490845171788484, "learning_rate": 9.655529788675385e-08, "loss": 0.2264, "num_input_tokens_seen": 24023040, "step": 5865 }, { "epoch": 4.371087928464978, "grad_norm": 7.466844292648167, "learning_rate": 9.632987969582216e-08, "loss": 0.0878, "num_input_tokens_seen": 24027136, "step": 5866 }, { "epoch": 4.3718330849478395, "grad_norm": 8.574873853888434, "learning_rate": 9.610471439761862e-08, "loss": 0.1578, "num_input_tokens_seen": 24031232, "step": 5867 }, { "epoch": 4.3725782414307, "grad_norm": 8.8846682920328, "learning_rate": 9.587980204150166e-08, "loss": 0.0477, "num_input_tokens_seen": 24035328, "step": 5868 }, { "epoch": 4.373323397913562, "grad_norm": 5.482431809863192, "learning_rate": 9.565514267677282e-08, "loss": 0.0418, "num_input_tokens_seen": 24039424, "step": 5869 }, { "epoch": 4.374068554396423, "grad_norm": 3.7300231821205805, "learning_rate": 9.543073635267977e-08, "loss": 0.0546, "num_input_tokens_seen": 24043520, "step": 5870 }, { "epoch": 4.374813710879285, "grad_norm": 7.372646603947907, "learning_rate": 9.520658311841347e-08, "loss": 0.0875, "num_input_tokens_seen": 24047616, "step": 5871 }, { "epoch": 4.375558867362146, "grad_norm": 3.957373978794079, "learning_rate": 9.498268302311001e-08, "loss": 0.0509, "num_input_tokens_seen": 24051712, "step": 5872 }, { "epoch": 4.376304023845007, "grad_norm": 8.122375003969262, "learning_rate": 9.475903611585028e-08, "loss": 0.0952, "num_input_tokens_seen": 24055808, "step": 5873 }, { "epoch": 4.377049180327869, "grad_norm": 6.273742763350375, "learning_rate": 9.453564244565866e-08, "loss": 0.0603, "num_input_tokens_seen": 24059904, "step": 5874 }, { "epoch": 4.37779433681073, "grad_norm": 9.450702622951725, "learning_rate": 9.43125020615053e-08, "loss": 0.136, "num_input_tokens_seen": 24064000, "step": 5875 }, { "epoch": 4.378539493293592, "grad_norm": 5.960931964770383, "learning_rate": 9.408961501230343e-08, "loss": 0.0622, "num_input_tokens_seen": 24068096, "step": 5876 }, { "epoch": 4.379284649776453, "grad_norm": 8.022861089196075, "learning_rate": 9.38669813469123e-08, "loss": 0.1115, "num_input_tokens_seen": 24072192, "step": 5877 }, { "epoch": 4.380029806259315, "grad_norm": 11.967947048188986, "learning_rate": 9.364460111413426e-08, "loss": 0.115, "num_input_tokens_seen": 24076288, "step": 5878 }, { "epoch": 4.3807749627421755, "grad_norm": 7.941611357717102, "learning_rate": 9.342247436271657e-08, "loss": 0.128, "num_input_tokens_seen": 24080384, "step": 5879 }, { "epoch": 4.381520119225037, "grad_norm": 7.865710973386142, "learning_rate": 9.320060114135152e-08, "loss": 0.123, "num_input_tokens_seen": 24084480, "step": 5880 }, { "epoch": 4.382265275707899, "grad_norm": 9.166675913519928, "learning_rate": 9.297898149867466e-08, "loss": 0.1504, "num_input_tokens_seen": 24088576, "step": 5881 }, { "epoch": 4.38301043219076, "grad_norm": 10.4828926719739, "learning_rate": 9.275761548326739e-08, "loss": 0.063, "num_input_tokens_seen": 24092672, "step": 5882 }, { "epoch": 4.383755588673622, "grad_norm": 9.709056363893266, "learning_rate": 9.253650314365384e-08, "loss": 0.1991, "num_input_tokens_seen": 24096768, "step": 5883 }, { "epoch": 4.384500745156483, "grad_norm": 6.706228103702772, "learning_rate": 9.231564452830396e-08, "loss": 0.1294, "num_input_tokens_seen": 24100864, "step": 5884 }, { "epoch": 4.385245901639344, "grad_norm": 10.563966474342026, "learning_rate": 9.209503968563127e-08, "loss": 0.0544, "num_input_tokens_seen": 24104960, "step": 5885 }, { "epoch": 4.3859910581222055, "grad_norm": 9.810655964870707, "learning_rate": 9.187468866399429e-08, "loss": 0.1997, "num_input_tokens_seen": 24109056, "step": 5886 }, { "epoch": 4.386736214605067, "grad_norm": 7.15842034379837, "learning_rate": 9.165459151169509e-08, "loss": 0.1516, "num_input_tokens_seen": 24113152, "step": 5887 }, { "epoch": 4.387481371087929, "grad_norm": 13.481626095347721, "learning_rate": 9.14347482769809e-08, "loss": 0.1395, "num_input_tokens_seen": 24117248, "step": 5888 }, { "epoch": 4.38822652757079, "grad_norm": 5.328092867586837, "learning_rate": 9.12151590080429e-08, "loss": 0.0541, "num_input_tokens_seen": 24121344, "step": 5889 }, { "epoch": 4.388971684053652, "grad_norm": 8.144885782414047, "learning_rate": 9.099582375301619e-08, "loss": 0.0903, "num_input_tokens_seen": 24125440, "step": 5890 }, { "epoch": 4.389716840536512, "grad_norm": 7.216341597159059, "learning_rate": 9.077674255998121e-08, "loss": 0.0839, "num_input_tokens_seen": 24129536, "step": 5891 }, { "epoch": 4.390461997019374, "grad_norm": 2.720025118458955, "learning_rate": 9.055791547696161e-08, "loss": 0.0167, "num_input_tokens_seen": 24133632, "step": 5892 }, { "epoch": 4.3912071535022354, "grad_norm": 6.1830727419435725, "learning_rate": 9.033934255192625e-08, "loss": 0.1165, "num_input_tokens_seen": 24137728, "step": 5893 }, { "epoch": 4.391952309985097, "grad_norm": 4.852541140892734, "learning_rate": 9.012102383278765e-08, "loss": 0.0532, "num_input_tokens_seen": 24141824, "step": 5894 }, { "epoch": 4.3926974664679586, "grad_norm": 4.432601482754962, "learning_rate": 8.99029593674032e-08, "loss": 0.0353, "num_input_tokens_seen": 24145920, "step": 5895 }, { "epoch": 4.39344262295082, "grad_norm": 7.331386399557695, "learning_rate": 8.968514920357358e-08, "loss": 0.0849, "num_input_tokens_seen": 24150016, "step": 5896 }, { "epoch": 4.394187779433681, "grad_norm": 5.535261471239903, "learning_rate": 8.946759338904473e-08, "loss": 0.0402, "num_input_tokens_seen": 24154112, "step": 5897 }, { "epoch": 4.394932935916542, "grad_norm": 7.763392235556375, "learning_rate": 8.925029197150644e-08, "loss": 0.1097, "num_input_tokens_seen": 24158208, "step": 5898 }, { "epoch": 4.395678092399404, "grad_norm": 4.1207549961033285, "learning_rate": 8.903324499859242e-08, "loss": 0.0208, "num_input_tokens_seen": 24162304, "step": 5899 }, { "epoch": 4.396423248882265, "grad_norm": 5.0399649047378166, "learning_rate": 8.88164525178814e-08, "loss": 0.062, "num_input_tokens_seen": 24166400, "step": 5900 }, { "epoch": 4.397168405365127, "grad_norm": 8.268554349142201, "learning_rate": 8.859991457689523e-08, "loss": 0.1509, "num_input_tokens_seen": 24170496, "step": 5901 }, { "epoch": 4.397913561847988, "grad_norm": 7.807412016600214, "learning_rate": 8.838363122310123e-08, "loss": 0.1755, "num_input_tokens_seen": 24174592, "step": 5902 }, { "epoch": 4.398658718330849, "grad_norm": 6.73929238228446, "learning_rate": 8.816760250390965e-08, "loss": 0.0992, "num_input_tokens_seen": 24178688, "step": 5903 }, { "epoch": 4.399403874813711, "grad_norm": 6.749043825622654, "learning_rate": 8.79518284666761e-08, "loss": 0.0801, "num_input_tokens_seen": 24182784, "step": 5904 }, { "epoch": 4.400149031296572, "grad_norm": 6.075627242006233, "learning_rate": 8.773630915869955e-08, "loss": 0.0624, "num_input_tokens_seen": 24186880, "step": 5905 }, { "epoch": 4.400894187779434, "grad_norm": 7.225214246967893, "learning_rate": 8.752104462722317e-08, "loss": 0.1075, "num_input_tokens_seen": 24190976, "step": 5906 }, { "epoch": 4.401639344262295, "grad_norm": 5.675615187357334, "learning_rate": 8.730603491943476e-08, "loss": 0.0685, "num_input_tokens_seen": 24195072, "step": 5907 }, { "epoch": 4.402384500745156, "grad_norm": 6.582999714269803, "learning_rate": 8.709128008246568e-08, "loss": 0.0211, "num_input_tokens_seen": 24199168, "step": 5908 }, { "epoch": 4.403129657228018, "grad_norm": 7.107722011977562, "learning_rate": 8.687678016339227e-08, "loss": 0.0667, "num_input_tokens_seen": 24203264, "step": 5909 }, { "epoch": 4.403874813710879, "grad_norm": 6.8237206548121545, "learning_rate": 8.666253520923373e-08, "loss": 0.0628, "num_input_tokens_seen": 24207360, "step": 5910 }, { "epoch": 4.404619970193741, "grad_norm": 5.6854793737825435, "learning_rate": 8.644854526695473e-08, "loss": 0.0497, "num_input_tokens_seen": 24211456, "step": 5911 }, { "epoch": 4.405365126676602, "grad_norm": 8.114509917222177, "learning_rate": 8.623481038346287e-08, "loss": 0.1084, "num_input_tokens_seen": 24215552, "step": 5912 }, { "epoch": 4.406110283159464, "grad_norm": 6.328963998897184, "learning_rate": 8.602133060561068e-08, "loss": 0.0645, "num_input_tokens_seen": 24219648, "step": 5913 }, { "epoch": 4.4068554396423245, "grad_norm": 4.553259832477361, "learning_rate": 8.580810598019434e-08, "loss": 0.0377, "num_input_tokens_seen": 24223744, "step": 5914 }, { "epoch": 4.407600596125186, "grad_norm": 5.762341434221167, "learning_rate": 8.559513655395438e-08, "loss": 0.0962, "num_input_tokens_seen": 24227840, "step": 5915 }, { "epoch": 4.408345752608048, "grad_norm": 11.191507213339879, "learning_rate": 8.538242237357497e-08, "loss": 0.1001, "num_input_tokens_seen": 24231936, "step": 5916 }, { "epoch": 4.409090909090909, "grad_norm": 3.5692116259430056, "learning_rate": 8.516996348568465e-08, "loss": 0.0175, "num_input_tokens_seen": 24236032, "step": 5917 }, { "epoch": 4.409836065573771, "grad_norm": 7.858056719125321, "learning_rate": 8.495775993685604e-08, "loss": 0.0869, "num_input_tokens_seen": 24240128, "step": 5918 }, { "epoch": 4.410581222056632, "grad_norm": 6.380515090883273, "learning_rate": 8.474581177360553e-08, "loss": 0.0768, "num_input_tokens_seen": 24244224, "step": 5919 }, { "epoch": 4.411326378539493, "grad_norm": 11.431173393694367, "learning_rate": 8.453411904239372e-08, "loss": 0.0762, "num_input_tokens_seen": 24248320, "step": 5920 }, { "epoch": 4.4120715350223545, "grad_norm": 5.697197488991407, "learning_rate": 8.432268178962518e-08, "loss": 0.0524, "num_input_tokens_seen": 24252416, "step": 5921 }, { "epoch": 4.412816691505216, "grad_norm": 8.285180886504204, "learning_rate": 8.411150006164881e-08, "loss": 0.0861, "num_input_tokens_seen": 24256512, "step": 5922 }, { "epoch": 4.413561847988078, "grad_norm": 8.67553404710878, "learning_rate": 8.390057390475645e-08, "loss": 0.0989, "num_input_tokens_seen": 24260608, "step": 5923 }, { "epoch": 4.414307004470939, "grad_norm": 5.752343798972126, "learning_rate": 8.368990336518515e-08, "loss": 0.1095, "num_input_tokens_seen": 24264704, "step": 5924 }, { "epoch": 4.415052160953801, "grad_norm": 6.135239838415056, "learning_rate": 8.347948848911519e-08, "loss": 0.0744, "num_input_tokens_seen": 24268800, "step": 5925 }, { "epoch": 4.415797317436661, "grad_norm": 7.244210847488357, "learning_rate": 8.326932932267093e-08, "loss": 0.1134, "num_input_tokens_seen": 24272896, "step": 5926 }, { "epoch": 4.416542473919523, "grad_norm": 4.627205277226225, "learning_rate": 8.305942591192107e-08, "loss": 0.0323, "num_input_tokens_seen": 24276992, "step": 5927 }, { "epoch": 4.4172876304023845, "grad_norm": 5.784174964371456, "learning_rate": 8.284977830287755e-08, "loss": 0.0508, "num_input_tokens_seen": 24281088, "step": 5928 }, { "epoch": 4.418032786885246, "grad_norm": 14.557452661691245, "learning_rate": 8.264038654149694e-08, "loss": 0.1571, "num_input_tokens_seen": 24285184, "step": 5929 }, { "epoch": 4.418777943368108, "grad_norm": 3.8025510464962258, "learning_rate": 8.243125067367921e-08, "loss": 0.0186, "num_input_tokens_seen": 24289280, "step": 5930 }, { "epoch": 4.419523099850968, "grad_norm": 6.63272569353312, "learning_rate": 8.222237074526865e-08, "loss": 0.096, "num_input_tokens_seen": 24293376, "step": 5931 }, { "epoch": 4.42026825633383, "grad_norm": 5.404031010942439, "learning_rate": 8.201374680205306e-08, "loss": 0.0374, "num_input_tokens_seen": 24297472, "step": 5932 }, { "epoch": 4.421013412816691, "grad_norm": 14.473755750787232, "learning_rate": 8.180537888976423e-08, "loss": 0.1365, "num_input_tokens_seen": 24301568, "step": 5933 }, { "epoch": 4.421758569299553, "grad_norm": 6.584088714899753, "learning_rate": 8.159726705407822e-08, "loss": 0.0726, "num_input_tokens_seen": 24305664, "step": 5934 }, { "epoch": 4.4225037257824145, "grad_norm": 8.027138323940306, "learning_rate": 8.138941134061413e-08, "loss": 0.1825, "num_input_tokens_seen": 24309760, "step": 5935 }, { "epoch": 4.423248882265276, "grad_norm": 8.13780359205735, "learning_rate": 8.118181179493617e-08, "loss": 0.0667, "num_input_tokens_seen": 24313856, "step": 5936 }, { "epoch": 4.423994038748138, "grad_norm": 6.3187986581329865, "learning_rate": 8.097446846255061e-08, "loss": 0.1308, "num_input_tokens_seen": 24317952, "step": 5937 }, { "epoch": 4.424739195230998, "grad_norm": 7.518577253352552, "learning_rate": 8.076738138890954e-08, "loss": 0.0806, "num_input_tokens_seen": 24322048, "step": 5938 }, { "epoch": 4.42548435171386, "grad_norm": 7.690856982290581, "learning_rate": 8.056055061940724e-08, "loss": 0.1078, "num_input_tokens_seen": 24326144, "step": 5939 }, { "epoch": 4.426229508196721, "grad_norm": 7.351524061259653, "learning_rate": 8.03539761993831e-08, "loss": 0.0695, "num_input_tokens_seen": 24330240, "step": 5940 }, { "epoch": 4.426974664679583, "grad_norm": 7.8939801614450635, "learning_rate": 8.01476581741191e-08, "loss": 0.0659, "num_input_tokens_seen": 24334336, "step": 5941 }, { "epoch": 4.4277198211624444, "grad_norm": 6.561938977425524, "learning_rate": 7.994159658884213e-08, "loss": 0.0959, "num_input_tokens_seen": 24338432, "step": 5942 }, { "epoch": 4.428464977645305, "grad_norm": 6.855260612258208, "learning_rate": 7.973579148872216e-08, "loss": 0.1182, "num_input_tokens_seen": 24342528, "step": 5943 }, { "epoch": 4.429210134128167, "grad_norm": 7.293870215703833, "learning_rate": 7.953024291887282e-08, "loss": 0.1199, "num_input_tokens_seen": 24346624, "step": 5944 }, { "epoch": 4.429955290611028, "grad_norm": 7.48593659311373, "learning_rate": 7.932495092435252e-08, "loss": 0.1305, "num_input_tokens_seen": 24350720, "step": 5945 }, { "epoch": 4.43070044709389, "grad_norm": 4.994507398074723, "learning_rate": 7.911991555016208e-08, "loss": 0.0498, "num_input_tokens_seen": 24354816, "step": 5946 }, { "epoch": 4.431445603576751, "grad_norm": 6.204734817999513, "learning_rate": 7.891513684124704e-08, "loss": 0.0638, "num_input_tokens_seen": 24358912, "step": 5947 }, { "epoch": 4.432190760059613, "grad_norm": 5.478813982873852, "learning_rate": 7.871061484249626e-08, "loss": 0.054, "num_input_tokens_seen": 24363008, "step": 5948 }, { "epoch": 4.4329359165424735, "grad_norm": 4.477139200975792, "learning_rate": 7.850634959874286e-08, "loss": 0.0145, "num_input_tokens_seen": 24367104, "step": 5949 }, { "epoch": 4.433681073025335, "grad_norm": 9.09644224804067, "learning_rate": 7.830234115476232e-08, "loss": 0.168, "num_input_tokens_seen": 24371200, "step": 5950 }, { "epoch": 4.434426229508197, "grad_norm": 6.382084127386194, "learning_rate": 7.809858955527563e-08, "loss": 0.0452, "num_input_tokens_seen": 24375296, "step": 5951 }, { "epoch": 4.435171385991058, "grad_norm": 9.218897062965585, "learning_rate": 7.789509484494598e-08, "loss": 0.1961, "num_input_tokens_seen": 24379392, "step": 5952 }, { "epoch": 4.43591654247392, "grad_norm": 6.491258377113762, "learning_rate": 7.769185706838128e-08, "loss": 0.0364, "num_input_tokens_seen": 24383488, "step": 5953 }, { "epoch": 4.436661698956781, "grad_norm": 4.8371364711131735, "learning_rate": 7.748887627013244e-08, "loss": 0.0146, "num_input_tokens_seen": 24387584, "step": 5954 }, { "epoch": 4.437406855439642, "grad_norm": 7.875127527249208, "learning_rate": 7.72861524946944e-08, "loss": 0.0513, "num_input_tokens_seen": 24391680, "step": 5955 }, { "epoch": 4.4381520119225035, "grad_norm": 10.753614893694289, "learning_rate": 7.708368578650565e-08, "loss": 0.3304, "num_input_tokens_seen": 24395776, "step": 5956 }, { "epoch": 4.438897168405365, "grad_norm": 7.667995688663634, "learning_rate": 7.688147618994835e-08, "loss": 0.1604, "num_input_tokens_seen": 24399872, "step": 5957 }, { "epoch": 4.439642324888227, "grad_norm": 6.342162160664826, "learning_rate": 7.66795237493484e-08, "loss": 0.1036, "num_input_tokens_seen": 24403968, "step": 5958 }, { "epoch": 4.440387481371088, "grad_norm": 5.802959856754909, "learning_rate": 7.647782850897484e-08, "loss": 0.0371, "num_input_tokens_seen": 24408064, "step": 5959 }, { "epoch": 4.44113263785395, "grad_norm": 5.8494077865597, "learning_rate": 7.62763905130412e-08, "loss": 0.1127, "num_input_tokens_seen": 24412160, "step": 5960 }, { "epoch": 4.44187779433681, "grad_norm": 11.654724254125941, "learning_rate": 7.607520980570396e-08, "loss": 0.0918, "num_input_tokens_seen": 24416256, "step": 5961 }, { "epoch": 4.442622950819672, "grad_norm": 8.824557359251227, "learning_rate": 7.587428643106313e-08, "loss": 0.2152, "num_input_tokens_seen": 24420352, "step": 5962 }, { "epoch": 4.4433681073025335, "grad_norm": 10.952201875804546, "learning_rate": 7.567362043316303e-08, "loss": 0.1465, "num_input_tokens_seen": 24424448, "step": 5963 }, { "epoch": 4.444113263785395, "grad_norm": 6.386345816276306, "learning_rate": 7.547321185599032e-08, "loss": 0.0552, "num_input_tokens_seen": 24428544, "step": 5964 }, { "epoch": 4.444858420268257, "grad_norm": 7.455411719567932, "learning_rate": 7.527306074347678e-08, "loss": 0.0925, "num_input_tokens_seen": 24432640, "step": 5965 }, { "epoch": 4.445603576751118, "grad_norm": 6.883809348818983, "learning_rate": 7.507316713949636e-08, "loss": 0.1723, "num_input_tokens_seen": 24436736, "step": 5966 }, { "epoch": 4.446348733233979, "grad_norm": 6.789355835141325, "learning_rate": 7.48735310878676e-08, "loss": 0.0904, "num_input_tokens_seen": 24440832, "step": 5967 }, { "epoch": 4.44709388971684, "grad_norm": 5.4344068075012855, "learning_rate": 7.467415263235175e-08, "loss": 0.0274, "num_input_tokens_seen": 24444928, "step": 5968 }, { "epoch": 4.447839046199702, "grad_norm": 7.857149271859665, "learning_rate": 7.447503181665455e-08, "loss": 0.1665, "num_input_tokens_seen": 24449024, "step": 5969 }, { "epoch": 4.4485842026825635, "grad_norm": 8.829098620911264, "learning_rate": 7.427616868442427e-08, "loss": 0.1465, "num_input_tokens_seen": 24453120, "step": 5970 }, { "epoch": 4.449329359165425, "grad_norm": 8.598631600573219, "learning_rate": 7.4077563279253e-08, "loss": 0.1908, "num_input_tokens_seen": 24457216, "step": 5971 }, { "epoch": 4.450074515648286, "grad_norm": 2.10277129273592, "learning_rate": 7.387921564467701e-08, "loss": 0.0081, "num_input_tokens_seen": 24461312, "step": 5972 }, { "epoch": 4.450819672131147, "grad_norm": 9.754464512284445, "learning_rate": 7.3681125824175e-08, "loss": 0.0707, "num_input_tokens_seen": 24465408, "step": 5973 }, { "epoch": 4.451564828614009, "grad_norm": 7.7032547969425345, "learning_rate": 7.348329386117001e-08, "loss": 0.0419, "num_input_tokens_seen": 24469504, "step": 5974 }, { "epoch": 4.45230998509687, "grad_norm": 6.760468171079193, "learning_rate": 7.328571979902791e-08, "loss": 0.0839, "num_input_tokens_seen": 24473600, "step": 5975 }, { "epoch": 4.453055141579732, "grad_norm": 9.47423929147909, "learning_rate": 7.308840368105891e-08, "loss": 0.1159, "num_input_tokens_seen": 24477696, "step": 5976 }, { "epoch": 4.4538002980625935, "grad_norm": 2.290567719471889, "learning_rate": 7.289134555051522e-08, "loss": 0.0098, "num_input_tokens_seen": 24481792, "step": 5977 }, { "epoch": 4.454545454545454, "grad_norm": 8.349764832924217, "learning_rate": 7.269454545059423e-08, "loss": 0.1456, "num_input_tokens_seen": 24485888, "step": 5978 }, { "epoch": 4.455290611028316, "grad_norm": 7.127185614410668, "learning_rate": 7.249800342443534e-08, "loss": 0.0646, "num_input_tokens_seen": 24489984, "step": 5979 }, { "epoch": 4.456035767511177, "grad_norm": 6.891369555880731, "learning_rate": 7.230171951512253e-08, "loss": 0.0633, "num_input_tokens_seen": 24494080, "step": 5980 }, { "epoch": 4.456780923994039, "grad_norm": 6.475357581591888, "learning_rate": 7.210569376568222e-08, "loss": 0.1042, "num_input_tokens_seen": 24498176, "step": 5981 }, { "epoch": 4.4575260804769, "grad_norm": 6.353727346207032, "learning_rate": 7.190992621908449e-08, "loss": 0.0546, "num_input_tokens_seen": 24502272, "step": 5982 }, { "epoch": 4.458271236959762, "grad_norm": 5.017578837058832, "learning_rate": 7.171441691824349e-08, "loss": 0.0356, "num_input_tokens_seen": 24506368, "step": 5983 }, { "epoch": 4.459016393442623, "grad_norm": 6.1529477350325825, "learning_rate": 7.15191659060159e-08, "loss": 0.0682, "num_input_tokens_seen": 24510464, "step": 5984 }, { "epoch": 4.459761549925484, "grad_norm": 5.145736184669454, "learning_rate": 7.132417322520236e-08, "loss": 0.0458, "num_input_tokens_seen": 24514560, "step": 5985 }, { "epoch": 4.460506706408346, "grad_norm": 5.141496307154779, "learning_rate": 7.112943891854643e-08, "loss": 0.0557, "num_input_tokens_seen": 24518656, "step": 5986 }, { "epoch": 4.461251862891207, "grad_norm": 5.814756942831287, "learning_rate": 7.09349630287355e-08, "loss": 0.0322, "num_input_tokens_seen": 24522752, "step": 5987 }, { "epoch": 4.461997019374069, "grad_norm": 8.304462699955568, "learning_rate": 7.074074559839994e-08, "loss": 0.1317, "num_input_tokens_seen": 24526848, "step": 5988 }, { "epoch": 4.46274217585693, "grad_norm": 5.124577224870303, "learning_rate": 7.054678667011345e-08, "loss": 0.0143, "num_input_tokens_seen": 24530944, "step": 5989 }, { "epoch": 4.463487332339791, "grad_norm": 5.9214831077047725, "learning_rate": 7.035308628639367e-08, "loss": 0.1412, "num_input_tokens_seen": 24535040, "step": 5990 }, { "epoch": 4.4642324888226526, "grad_norm": 6.171840509090288, "learning_rate": 7.015964448970055e-08, "loss": 0.0897, "num_input_tokens_seen": 24539136, "step": 5991 }, { "epoch": 4.464977645305514, "grad_norm": 5.20489350904303, "learning_rate": 6.996646132243823e-08, "loss": 0.0541, "num_input_tokens_seen": 24543232, "step": 5992 }, { "epoch": 4.465722801788376, "grad_norm": 5.147344973472111, "learning_rate": 6.977353682695379e-08, "loss": 0.0539, "num_input_tokens_seen": 24547328, "step": 5993 }, { "epoch": 4.466467958271237, "grad_norm": 5.790218064283355, "learning_rate": 6.958087104553771e-08, "loss": 0.0578, "num_input_tokens_seen": 24551424, "step": 5994 }, { "epoch": 4.467213114754099, "grad_norm": 4.7850561570356005, "learning_rate": 6.938846402042343e-08, "loss": 0.0401, "num_input_tokens_seen": 24555520, "step": 5995 }, { "epoch": 4.467958271236959, "grad_norm": 6.702178523339698, "learning_rate": 6.919631579378832e-08, "loss": 0.0412, "num_input_tokens_seen": 24559616, "step": 5996 }, { "epoch": 4.468703427719821, "grad_norm": 7.4004672036702015, "learning_rate": 6.900442640775259e-08, "loss": 0.0815, "num_input_tokens_seen": 24563712, "step": 5997 }, { "epoch": 4.4694485842026825, "grad_norm": 10.661497411928256, "learning_rate": 6.881279590437939e-08, "loss": 0.0673, "num_input_tokens_seen": 24567808, "step": 5998 }, { "epoch": 4.470193740685544, "grad_norm": 4.317484543817219, "learning_rate": 6.862142432567595e-08, "loss": 0.0228, "num_input_tokens_seen": 24571904, "step": 5999 }, { "epoch": 4.470938897168406, "grad_norm": 6.0017560703569925, "learning_rate": 6.843031171359191e-08, "loss": 0.0813, "num_input_tokens_seen": 24576000, "step": 6000 }, { "epoch": 4.471684053651266, "grad_norm": 6.425927872563688, "learning_rate": 6.823945811002098e-08, "loss": 0.0673, "num_input_tokens_seen": 24580096, "step": 6001 }, { "epoch": 4.472429210134128, "grad_norm": 8.927763707333755, "learning_rate": 6.804886355679913e-08, "loss": 0.1604, "num_input_tokens_seen": 24584192, "step": 6002 }, { "epoch": 4.473174366616989, "grad_norm": 7.233788413818393, "learning_rate": 6.785852809570656e-08, "loss": 0.0616, "num_input_tokens_seen": 24588288, "step": 6003 }, { "epoch": 4.473919523099851, "grad_norm": 8.981177035256882, "learning_rate": 6.766845176846559e-08, "loss": 0.0314, "num_input_tokens_seen": 24592384, "step": 6004 }, { "epoch": 4.4746646795827125, "grad_norm": 6.32988338756229, "learning_rate": 6.747863461674276e-08, "loss": 0.0673, "num_input_tokens_seen": 24596480, "step": 6005 }, { "epoch": 4.475409836065574, "grad_norm": 7.333760989875134, "learning_rate": 6.728907668214685e-08, "loss": 0.1398, "num_input_tokens_seen": 24600576, "step": 6006 }, { "epoch": 4.476154992548436, "grad_norm": 7.276158764331843, "learning_rate": 6.709977800623103e-08, "loss": 0.0967, "num_input_tokens_seen": 24604672, "step": 6007 }, { "epoch": 4.476900149031296, "grad_norm": 7.6068660999359174, "learning_rate": 6.691073863049042e-08, "loss": 0.0867, "num_input_tokens_seen": 24608768, "step": 6008 }, { "epoch": 4.477645305514158, "grad_norm": 5.760100529240667, "learning_rate": 6.672195859636382e-08, "loss": 0.0517, "num_input_tokens_seen": 24612864, "step": 6009 }, { "epoch": 4.478390461997019, "grad_norm": 5.1582075395386555, "learning_rate": 6.653343794523342e-08, "loss": 0.0214, "num_input_tokens_seen": 24616960, "step": 6010 }, { "epoch": 4.479135618479881, "grad_norm": 7.665022681883015, "learning_rate": 6.63451767184238e-08, "loss": 0.0727, "num_input_tokens_seen": 24621056, "step": 6011 }, { "epoch": 4.4798807749627425, "grad_norm": 10.162670002669152, "learning_rate": 6.615717495720389e-08, "loss": 0.1228, "num_input_tokens_seen": 24625152, "step": 6012 }, { "epoch": 4.480625931445603, "grad_norm": 3.2415871297565158, "learning_rate": 6.596943270278433e-08, "loss": 0.0074, "num_input_tokens_seen": 24629248, "step": 6013 }, { "epoch": 4.481371087928465, "grad_norm": 8.299034866011425, "learning_rate": 6.578194999631998e-08, "loss": 0.0886, "num_input_tokens_seen": 24633344, "step": 6014 }, { "epoch": 4.482116244411326, "grad_norm": 6.528920376041184, "learning_rate": 6.559472687890838e-08, "loss": 0.0726, "num_input_tokens_seen": 24637440, "step": 6015 }, { "epoch": 4.482861400894188, "grad_norm": 5.66665256977138, "learning_rate": 6.540776339158978e-08, "loss": 0.078, "num_input_tokens_seen": 24641536, "step": 6016 }, { "epoch": 4.483606557377049, "grad_norm": 8.493501343814174, "learning_rate": 6.522105957534857e-08, "loss": 0.2231, "num_input_tokens_seen": 24645632, "step": 6017 }, { "epoch": 4.484351713859911, "grad_norm": 3.645950535631568, "learning_rate": 6.503461547111096e-08, "loss": 0.0186, "num_input_tokens_seen": 24649728, "step": 6018 }, { "epoch": 4.485096870342772, "grad_norm": 5.935869847798818, "learning_rate": 6.484843111974698e-08, "loss": 0.0539, "num_input_tokens_seen": 24653824, "step": 6019 }, { "epoch": 4.485842026825633, "grad_norm": 9.554885601014005, "learning_rate": 6.466250656206971e-08, "loss": 0.1688, "num_input_tokens_seen": 24657920, "step": 6020 }, { "epoch": 4.486587183308495, "grad_norm": 8.858430338992028, "learning_rate": 6.447684183883501e-08, "loss": 0.1793, "num_input_tokens_seen": 24662016, "step": 6021 }, { "epoch": 4.487332339791356, "grad_norm": 5.174938587578764, "learning_rate": 6.4291436990742e-08, "loss": 0.0965, "num_input_tokens_seen": 24666112, "step": 6022 }, { "epoch": 4.488077496274218, "grad_norm": 7.575217767870016, "learning_rate": 6.410629205843275e-08, "loss": 0.0791, "num_input_tokens_seen": 24670208, "step": 6023 }, { "epoch": 4.488822652757079, "grad_norm": 6.715746435084639, "learning_rate": 6.392140708249228e-08, "loss": 0.0773, "num_input_tokens_seen": 24674304, "step": 6024 }, { "epoch": 4.48956780923994, "grad_norm": 6.105479827578742, "learning_rate": 6.373678210344874e-08, "loss": 0.0538, "num_input_tokens_seen": 24678400, "step": 6025 }, { "epoch": 4.490312965722802, "grad_norm": 8.735185285380847, "learning_rate": 6.355241716177338e-08, "loss": 0.1537, "num_input_tokens_seen": 24682496, "step": 6026 }, { "epoch": 4.491058122205663, "grad_norm": 6.422995594326526, "learning_rate": 6.336831229787996e-08, "loss": 0.1315, "num_input_tokens_seen": 24686592, "step": 6027 }, { "epoch": 4.491803278688525, "grad_norm": 7.738783181671776, "learning_rate": 6.318446755212592e-08, "loss": 0.0948, "num_input_tokens_seen": 24690688, "step": 6028 }, { "epoch": 4.492548435171386, "grad_norm": 8.498907425576235, "learning_rate": 6.300088296481099e-08, "loss": 0.0801, "num_input_tokens_seen": 24694784, "step": 6029 }, { "epoch": 4.493293591654248, "grad_norm": 7.460321152403266, "learning_rate": 6.281755857617894e-08, "loss": 0.1233, "num_input_tokens_seen": 24698880, "step": 6030 }, { "epoch": 4.4940387481371085, "grad_norm": 6.557225505309291, "learning_rate": 6.263449442641484e-08, "loss": 0.0796, "num_input_tokens_seen": 24702976, "step": 6031 }, { "epoch": 4.49478390461997, "grad_norm": 6.226534955358975, "learning_rate": 6.24516905556484e-08, "loss": 0.1604, "num_input_tokens_seen": 24707072, "step": 6032 }, { "epoch": 4.495529061102832, "grad_norm": 9.034769081441745, "learning_rate": 6.22691470039509e-08, "loss": 0.1241, "num_input_tokens_seen": 24711168, "step": 6033 }, { "epoch": 4.496274217585693, "grad_norm": 5.840730105808309, "learning_rate": 6.208686381133783e-08, "loss": 0.073, "num_input_tokens_seen": 24715264, "step": 6034 }, { "epoch": 4.497019374068555, "grad_norm": 7.481817509046499, "learning_rate": 6.190484101776666e-08, "loss": 0.0812, "num_input_tokens_seen": 24719360, "step": 6035 }, { "epoch": 4.497764530551416, "grad_norm": 3.986717581664521, "learning_rate": 6.172307866313787e-08, "loss": 0.0399, "num_input_tokens_seen": 24723456, "step": 6036 }, { "epoch": 4.498509687034277, "grad_norm": 9.87921231359186, "learning_rate": 6.154157678729553e-08, "loss": 0.1668, "num_input_tokens_seen": 24727552, "step": 6037 }, { "epoch": 4.4992548435171384, "grad_norm": 6.809602821460707, "learning_rate": 6.136033543002559e-08, "loss": 0.0524, "num_input_tokens_seen": 24731648, "step": 6038 }, { "epoch": 4.5, "grad_norm": 6.218816692390529, "learning_rate": 6.11793546310581e-08, "loss": 0.078, "num_input_tokens_seen": 24735744, "step": 6039 }, { "epoch": 4.5007451564828616, "grad_norm": 7.413291315751642, "learning_rate": 6.099863443006476e-08, "loss": 0.1009, "num_input_tokens_seen": 24739840, "step": 6040 }, { "epoch": 4.501490312965723, "grad_norm": 7.537549640912596, "learning_rate": 6.081817486666114e-08, "loss": 0.1124, "num_input_tokens_seen": 24743936, "step": 6041 }, { "epoch": 4.502235469448584, "grad_norm": 8.066945799994821, "learning_rate": 6.063797598040516e-08, "loss": 0.075, "num_input_tokens_seen": 24748032, "step": 6042 }, { "epoch": 4.502980625931445, "grad_norm": 8.40330773780084, "learning_rate": 6.045803781079746e-08, "loss": 0.0717, "num_input_tokens_seen": 24752128, "step": 6043 }, { "epoch": 4.503725782414307, "grad_norm": 9.088528026196327, "learning_rate": 6.027836039728216e-08, "loss": 0.2154, "num_input_tokens_seen": 24756224, "step": 6044 }, { "epoch": 4.504470938897168, "grad_norm": 6.966842948600029, "learning_rate": 6.009894377924569e-08, "loss": 0.1024, "num_input_tokens_seen": 24760320, "step": 6045 }, { "epoch": 4.50521609538003, "grad_norm": 6.009361404225655, "learning_rate": 5.991978799601727e-08, "loss": 0.0777, "num_input_tokens_seen": 24764416, "step": 6046 }, { "epoch": 4.5059612518628915, "grad_norm": 2.638195062322554, "learning_rate": 5.974089308686912e-08, "loss": 0.0094, "num_input_tokens_seen": 24768512, "step": 6047 }, { "epoch": 4.506706408345753, "grad_norm": 6.884945875511838, "learning_rate": 5.956225909101651e-08, "loss": 0.0738, "num_input_tokens_seen": 24772608, "step": 6048 }, { "epoch": 4.507451564828614, "grad_norm": 7.785193758690588, "learning_rate": 5.938388604761716e-08, "loss": 0.125, "num_input_tokens_seen": 24776704, "step": 6049 }, { "epoch": 4.508196721311475, "grad_norm": 5.115046309189728, "learning_rate": 5.920577399577188e-08, "loss": 0.039, "num_input_tokens_seen": 24780800, "step": 6050 }, { "epoch": 4.508941877794337, "grad_norm": 2.3498527590241056, "learning_rate": 5.9027922974523727e-08, "loss": 0.0095, "num_input_tokens_seen": 24784896, "step": 6051 }, { "epoch": 4.509687034277198, "grad_norm": 5.329267191584325, "learning_rate": 5.8850333022859445e-08, "loss": 0.0553, "num_input_tokens_seen": 24788992, "step": 6052 }, { "epoch": 4.51043219076006, "grad_norm": 4.867017523247318, "learning_rate": 5.8673004179707615e-08, "loss": 0.0533, "num_input_tokens_seen": 24793088, "step": 6053 }, { "epoch": 4.511177347242921, "grad_norm": 7.80211861158793, "learning_rate": 5.8495936483940055e-08, "loss": 0.0344, "num_input_tokens_seen": 24797184, "step": 6054 }, { "epoch": 4.511922503725782, "grad_norm": 7.72885909772172, "learning_rate": 5.831912997437142e-08, "loss": 0.1614, "num_input_tokens_seen": 24801280, "step": 6055 }, { "epoch": 4.512667660208644, "grad_norm": 6.494784914277865, "learning_rate": 5.8142584689758616e-08, "loss": 0.0436, "num_input_tokens_seen": 24805376, "step": 6056 }, { "epoch": 4.513412816691505, "grad_norm": 8.820347067388658, "learning_rate": 5.796630066880235e-08, "loss": 0.2147, "num_input_tokens_seen": 24809472, "step": 6057 }, { "epoch": 4.514157973174367, "grad_norm": 6.066003181192477, "learning_rate": 5.7790277950144494e-08, "loss": 0.0733, "num_input_tokens_seen": 24813568, "step": 6058 }, { "epoch": 4.514903129657228, "grad_norm": 13.736221033285432, "learning_rate": 5.7614516572370855e-08, "loss": 0.0546, "num_input_tokens_seen": 24817664, "step": 6059 }, { "epoch": 4.515648286140089, "grad_norm": 4.864510680499927, "learning_rate": 5.743901657400963e-08, "loss": 0.0693, "num_input_tokens_seen": 24821760, "step": 6060 }, { "epoch": 4.516393442622951, "grad_norm": 8.590449738411587, "learning_rate": 5.7263777993531584e-08, "loss": 0.2132, "num_input_tokens_seen": 24825856, "step": 6061 }, { "epoch": 4.517138599105812, "grad_norm": 7.394522332240778, "learning_rate": 5.7088800869350424e-08, "loss": 0.0882, "num_input_tokens_seen": 24829952, "step": 6062 }, { "epoch": 4.517883755588674, "grad_norm": 4.497840825015247, "learning_rate": 5.691408523982214e-08, "loss": 0.047, "num_input_tokens_seen": 24834048, "step": 6063 }, { "epoch": 4.518628912071535, "grad_norm": 8.582307709140197, "learning_rate": 5.673963114324582e-08, "loss": 0.1839, "num_input_tokens_seen": 24838144, "step": 6064 }, { "epoch": 4.519374068554397, "grad_norm": 9.48989615099457, "learning_rate": 5.6565438617862815e-08, "loss": 0.1043, "num_input_tokens_seen": 24842240, "step": 6065 }, { "epoch": 4.5201192250372575, "grad_norm": 3.9338566478430272, "learning_rate": 5.6391507701857864e-08, "loss": 0.0324, "num_input_tokens_seen": 24846336, "step": 6066 }, { "epoch": 4.520864381520119, "grad_norm": 5.029611346417473, "learning_rate": 5.6217838433357265e-08, "loss": 0.006, "num_input_tokens_seen": 24850432, "step": 6067 }, { "epoch": 4.521609538002981, "grad_norm": 6.94630038159852, "learning_rate": 5.6044430850430986e-08, "loss": 0.1344, "num_input_tokens_seen": 24854528, "step": 6068 }, { "epoch": 4.522354694485842, "grad_norm": 7.882827338634618, "learning_rate": 5.5871284991091125e-08, "loss": 0.0827, "num_input_tokens_seen": 24858624, "step": 6069 }, { "epoch": 4.523099850968704, "grad_norm": 7.836029564301783, "learning_rate": 5.5698400893292456e-08, "loss": 0.1251, "num_input_tokens_seen": 24862720, "step": 6070 }, { "epoch": 4.523845007451564, "grad_norm": 4.672861749014259, "learning_rate": 5.5525778594932165e-08, "loss": 0.0523, "num_input_tokens_seen": 24866816, "step": 6071 }, { "epoch": 4.524590163934426, "grad_norm": 6.883920218018248, "learning_rate": 5.5353418133850813e-08, "loss": 0.0792, "num_input_tokens_seen": 24870912, "step": 6072 }, { "epoch": 4.5253353204172875, "grad_norm": 5.947777381258948, "learning_rate": 5.5181319547830545e-08, "loss": 0.047, "num_input_tokens_seen": 24875008, "step": 6073 }, { "epoch": 4.526080476900149, "grad_norm": 7.754460490257012, "learning_rate": 5.500948287459673e-08, "loss": 0.0997, "num_input_tokens_seen": 24879104, "step": 6074 }, { "epoch": 4.526825633383011, "grad_norm": 5.377972306763032, "learning_rate": 5.483790815181744e-08, "loss": 0.0769, "num_input_tokens_seen": 24883200, "step": 6075 }, { "epoch": 4.527570789865872, "grad_norm": 5.718938535566813, "learning_rate": 5.466659541710273e-08, "loss": 0.0763, "num_input_tokens_seen": 24887296, "step": 6076 }, { "epoch": 4.528315946348734, "grad_norm": 4.345184259865154, "learning_rate": 5.449554470800603e-08, "loss": 0.0375, "num_input_tokens_seen": 24891392, "step": 6077 }, { "epoch": 4.529061102831594, "grad_norm": 5.391302759553409, "learning_rate": 5.43247560620222e-08, "loss": 0.0475, "num_input_tokens_seen": 24895488, "step": 6078 }, { "epoch": 4.529806259314456, "grad_norm": 8.722367279903906, "learning_rate": 5.415422951659005e-08, "loss": 0.1378, "num_input_tokens_seen": 24899584, "step": 6079 }, { "epoch": 4.5305514157973175, "grad_norm": 7.957222601603683, "learning_rate": 5.398396510908982e-08, "loss": 0.0963, "num_input_tokens_seen": 24903680, "step": 6080 }, { "epoch": 4.531296572280179, "grad_norm": 6.191031595341632, "learning_rate": 5.3813962876844704e-08, "loss": 0.0851, "num_input_tokens_seen": 24907776, "step": 6081 }, { "epoch": 4.532041728763041, "grad_norm": 5.377121885691326, "learning_rate": 5.364422285712045e-08, "loss": 0.0731, "num_input_tokens_seen": 24911872, "step": 6082 }, { "epoch": 4.532786885245901, "grad_norm": 5.652338382489779, "learning_rate": 5.3474745087125216e-08, "loss": 0.0615, "num_input_tokens_seen": 24915968, "step": 6083 }, { "epoch": 4.533532041728763, "grad_norm": 7.722216405694113, "learning_rate": 5.3305529604010115e-08, "loss": 0.1153, "num_input_tokens_seen": 24920064, "step": 6084 }, { "epoch": 4.534277198211624, "grad_norm": 10.996854188115629, "learning_rate": 5.313657644486783e-08, "loss": 0.1637, "num_input_tokens_seen": 24924160, "step": 6085 }, { "epoch": 4.535022354694486, "grad_norm": 4.010399388259989, "learning_rate": 5.296788564673444e-08, "loss": 0.0213, "num_input_tokens_seen": 24928256, "step": 6086 }, { "epoch": 4.5357675111773474, "grad_norm": 3.6013271224384518, "learning_rate": 5.2799457246588e-08, "loss": 0.0118, "num_input_tokens_seen": 24932352, "step": 6087 }, { "epoch": 4.536512667660209, "grad_norm": 8.564934699716412, "learning_rate": 5.263129128134939e-08, "loss": 0.1129, "num_input_tokens_seen": 24936448, "step": 6088 }, { "epoch": 4.53725782414307, "grad_norm": 6.095179952514632, "learning_rate": 5.2463387787881764e-08, "loss": 0.0383, "num_input_tokens_seen": 24940544, "step": 6089 }, { "epoch": 4.538002980625931, "grad_norm": 2.1607553723816557, "learning_rate": 5.229574680299054e-08, "loss": 0.0077, "num_input_tokens_seen": 24944640, "step": 6090 }, { "epoch": 4.538748137108793, "grad_norm": 6.277650054226496, "learning_rate": 5.212836836342411e-08, "loss": 0.1638, "num_input_tokens_seen": 24948736, "step": 6091 }, { "epoch": 4.539493293591654, "grad_norm": 5.73047498014405, "learning_rate": 5.196125250587269e-08, "loss": 0.0812, "num_input_tokens_seen": 24952832, "step": 6092 }, { "epoch": 4.540238450074516, "grad_norm": 6.976423068355614, "learning_rate": 5.1794399266969775e-08, "loss": 0.0854, "num_input_tokens_seen": 24956928, "step": 6093 }, { "epoch": 4.540983606557377, "grad_norm": 6.494821963496932, "learning_rate": 5.162780868329026e-08, "loss": 0.1092, "num_input_tokens_seen": 24961024, "step": 6094 }, { "epoch": 4.541728763040238, "grad_norm": 6.542757641648829, "learning_rate": 5.1461480791352437e-08, "loss": 0.1084, "num_input_tokens_seen": 24965120, "step": 6095 }, { "epoch": 4.5424739195231, "grad_norm": 4.5143749502426544, "learning_rate": 5.129541562761631e-08, "loss": 0.0129, "num_input_tokens_seen": 24969216, "step": 6096 }, { "epoch": 4.543219076005961, "grad_norm": 8.9478007594349, "learning_rate": 5.112961322848456e-08, "loss": 0.1661, "num_input_tokens_seen": 24973312, "step": 6097 }, { "epoch": 4.543964232488823, "grad_norm": 9.92780312963076, "learning_rate": 5.096407363030215e-08, "loss": 0.1584, "num_input_tokens_seen": 24977408, "step": 6098 }, { "epoch": 4.544709388971684, "grad_norm": 5.38935968962723, "learning_rate": 5.079879686935685e-08, "loss": 0.0675, "num_input_tokens_seen": 24981504, "step": 6099 }, { "epoch": 4.545454545454545, "grad_norm": 8.375953142062604, "learning_rate": 5.0633782981878434e-08, "loss": 0.0315, "num_input_tokens_seen": 24985600, "step": 6100 }, { "epoch": 4.5461997019374065, "grad_norm": 11.37299437727475, "learning_rate": 5.0469032004038796e-08, "loss": 0.0413, "num_input_tokens_seen": 24989696, "step": 6101 }, { "epoch": 4.546944858420268, "grad_norm": 13.437703466879103, "learning_rate": 5.0304543971952807e-08, "loss": 0.0806, "num_input_tokens_seen": 24993792, "step": 6102 }, { "epoch": 4.54769001490313, "grad_norm": 8.718856808151164, "learning_rate": 5.014031892167731e-08, "loss": 0.17, "num_input_tokens_seen": 24997888, "step": 6103 }, { "epoch": 4.548435171385991, "grad_norm": 7.512325744341073, "learning_rate": 4.9976356889211844e-08, "loss": 0.1367, "num_input_tokens_seen": 25001984, "step": 6104 }, { "epoch": 4.549180327868853, "grad_norm": 4.689985670712961, "learning_rate": 4.98126579104978e-08, "loss": 0.0209, "num_input_tokens_seen": 25006080, "step": 6105 }, { "epoch": 4.549925484351714, "grad_norm": 5.494055735940482, "learning_rate": 4.9649222021419254e-08, "loss": 0.0187, "num_input_tokens_seen": 25010176, "step": 6106 }, { "epoch": 4.550670640834575, "grad_norm": 6.496686828981072, "learning_rate": 4.948604925780254e-08, "loss": 0.0727, "num_input_tokens_seen": 25014272, "step": 6107 }, { "epoch": 4.5514157973174365, "grad_norm": 5.823442361769239, "learning_rate": 4.932313965541613e-08, "loss": 0.0631, "num_input_tokens_seen": 25018368, "step": 6108 }, { "epoch": 4.552160953800298, "grad_norm": 6.245001411883382, "learning_rate": 4.916049324997133e-08, "loss": 0.0701, "num_input_tokens_seen": 25022464, "step": 6109 }, { "epoch": 4.55290611028316, "grad_norm": 9.13184983456899, "learning_rate": 4.899811007712099e-08, "loss": 0.1519, "num_input_tokens_seen": 25026560, "step": 6110 }, { "epoch": 4.553651266766021, "grad_norm": 5.072983599467191, "learning_rate": 4.88359901724611e-08, "loss": 0.0523, "num_input_tokens_seen": 25030656, "step": 6111 }, { "epoch": 4.554396423248882, "grad_norm": 3.7384427342834083, "learning_rate": 4.867413357152892e-08, "loss": 0.0221, "num_input_tokens_seen": 25034752, "step": 6112 }, { "epoch": 4.555141579731743, "grad_norm": 6.415329504862489, "learning_rate": 4.85125403098051e-08, "loss": 0.0434, "num_input_tokens_seen": 25038848, "step": 6113 }, { "epoch": 4.555886736214605, "grad_norm": 7.990026292706574, "learning_rate": 4.835121042271157e-08, "loss": 0.1508, "num_input_tokens_seen": 25042944, "step": 6114 }, { "epoch": 4.5566318926974665, "grad_norm": 6.013054538834708, "learning_rate": 4.8190143945613526e-08, "loss": 0.0903, "num_input_tokens_seen": 25047040, "step": 6115 }, { "epoch": 4.557377049180328, "grad_norm": 5.841117439689231, "learning_rate": 4.802934091381745e-08, "loss": 0.0337, "num_input_tokens_seen": 25051136, "step": 6116 }, { "epoch": 4.55812220566319, "grad_norm": 7.090610298519983, "learning_rate": 4.786880136257277e-08, "loss": 0.0883, "num_input_tokens_seen": 25055232, "step": 6117 }, { "epoch": 4.558867362146051, "grad_norm": 6.7572785879824515, "learning_rate": 4.770852532707093e-08, "loss": 0.1164, "num_input_tokens_seen": 25059328, "step": 6118 }, { "epoch": 4.559612518628912, "grad_norm": 3.8683864718232828, "learning_rate": 4.754851284244521e-08, "loss": 0.0236, "num_input_tokens_seen": 25063424, "step": 6119 }, { "epoch": 4.560357675111773, "grad_norm": 6.491157900692448, "learning_rate": 4.7388763943772007e-08, "loss": 0.0344, "num_input_tokens_seen": 25067520, "step": 6120 }, { "epoch": 4.561102831594635, "grad_norm": 4.249106741106495, "learning_rate": 4.722927866606886e-08, "loss": 0.0206, "num_input_tokens_seen": 25071616, "step": 6121 }, { "epoch": 4.5618479880774965, "grad_norm": 6.688968048510593, "learning_rate": 4.707005704429668e-08, "loss": 0.0981, "num_input_tokens_seen": 25075712, "step": 6122 }, { "epoch": 4.562593144560358, "grad_norm": 8.933016479419939, "learning_rate": 4.6911099113357424e-08, "loss": 0.1265, "num_input_tokens_seen": 25079808, "step": 6123 }, { "epoch": 4.563338301043219, "grad_norm": 5.670215570180973, "learning_rate": 4.67524049080964e-08, "loss": 0.0441, "num_input_tokens_seen": 25083904, "step": 6124 }, { "epoch": 4.56408345752608, "grad_norm": 5.021010246458214, "learning_rate": 4.659397446329983e-08, "loss": 0.0534, "num_input_tokens_seen": 25088000, "step": 6125 }, { "epoch": 4.564828614008942, "grad_norm": 0.8319246140173318, "learning_rate": 4.643580781369742e-08, "loss": 0.0032, "num_input_tokens_seen": 25092096, "step": 6126 }, { "epoch": 4.565573770491803, "grad_norm": 5.561465331511099, "learning_rate": 4.627790499396004e-08, "loss": 0.0145, "num_input_tokens_seen": 25096192, "step": 6127 }, { "epoch": 4.566318926974665, "grad_norm": 3.340100775642309, "learning_rate": 4.6120266038701125e-08, "loss": 0.0112, "num_input_tokens_seen": 25100288, "step": 6128 }, { "epoch": 4.5670640834575265, "grad_norm": 5.10887783043394, "learning_rate": 4.59628909824765e-08, "loss": 0.0442, "num_input_tokens_seen": 25104384, "step": 6129 }, { "epoch": 4.567809239940387, "grad_norm": 8.050770730984771, "learning_rate": 4.580577985978357e-08, "loss": 0.1162, "num_input_tokens_seen": 25108480, "step": 6130 }, { "epoch": 4.568554396423249, "grad_norm": 3.407512121823182, "learning_rate": 4.564893270506257e-08, "loss": 0.0425, "num_input_tokens_seen": 25112576, "step": 6131 }, { "epoch": 4.56929955290611, "grad_norm": 4.888410348733933, "learning_rate": 4.5492349552695295e-08, "loss": 0.0486, "num_input_tokens_seen": 25116672, "step": 6132 }, { "epoch": 4.570044709388972, "grad_norm": 6.373163498688854, "learning_rate": 4.533603043700596e-08, "loss": 0.0736, "num_input_tokens_seen": 25120768, "step": 6133 }, { "epoch": 4.570789865871833, "grad_norm": 4.578873159442608, "learning_rate": 4.5179975392260905e-08, "loss": 0.0468, "num_input_tokens_seen": 25124864, "step": 6134 }, { "epoch": 4.571535022354695, "grad_norm": 6.607880619802854, "learning_rate": 4.5024184452668316e-08, "loss": 0.076, "num_input_tokens_seen": 25128960, "step": 6135 }, { "epoch": 4.5722801788375556, "grad_norm": 5.952714820095552, "learning_rate": 4.4868657652378936e-08, "loss": 0.0555, "num_input_tokens_seen": 25133056, "step": 6136 }, { "epoch": 4.573025335320417, "grad_norm": 9.224434586474088, "learning_rate": 4.471339502548508e-08, "loss": 0.2106, "num_input_tokens_seen": 25137152, "step": 6137 }, { "epoch": 4.573770491803279, "grad_norm": 6.489310248668691, "learning_rate": 4.4558396606021734e-08, "loss": 0.1264, "num_input_tokens_seen": 25141248, "step": 6138 }, { "epoch": 4.57451564828614, "grad_norm": 7.192479030660729, "learning_rate": 4.4403662427965206e-08, "loss": 0.1163, "num_input_tokens_seen": 25145344, "step": 6139 }, { "epoch": 4.575260804769002, "grad_norm": 5.964160078056312, "learning_rate": 4.4249192525234744e-08, "loss": 0.0512, "num_input_tokens_seen": 25149440, "step": 6140 }, { "epoch": 4.576005961251862, "grad_norm": 8.065452332193857, "learning_rate": 4.409498693169104e-08, "loss": 0.1375, "num_input_tokens_seen": 25153536, "step": 6141 }, { "epoch": 4.576751117734724, "grad_norm": 6.716755208424882, "learning_rate": 4.394104568113719e-08, "loss": 0.1184, "num_input_tokens_seen": 25157632, "step": 6142 }, { "epoch": 4.5774962742175855, "grad_norm": 6.946772838297582, "learning_rate": 4.378736880731815e-08, "loss": 0.0612, "num_input_tokens_seen": 25161728, "step": 6143 }, { "epoch": 4.578241430700447, "grad_norm": 0.8228049422611906, "learning_rate": 4.363395634392101e-08, "loss": 0.0022, "num_input_tokens_seen": 25165824, "step": 6144 }, { "epoch": 4.578986587183309, "grad_norm": 6.6019961511946414, "learning_rate": 4.348080832457496e-08, "loss": 0.0689, "num_input_tokens_seen": 25169920, "step": 6145 }, { "epoch": 4.57973174366617, "grad_norm": 9.240810245808788, "learning_rate": 4.3327924782850804e-08, "loss": 0.1907, "num_input_tokens_seen": 25174016, "step": 6146 }, { "epoch": 4.580476900149032, "grad_norm": 7.034442593043898, "learning_rate": 4.317530575226214e-08, "loss": 0.0139, "num_input_tokens_seen": 25178112, "step": 6147 }, { "epoch": 4.581222056631892, "grad_norm": 5.369157908666619, "learning_rate": 4.3022951266263743e-08, "loss": 0.0618, "num_input_tokens_seen": 25182208, "step": 6148 }, { "epoch": 4.581967213114754, "grad_norm": 5.302368701390403, "learning_rate": 4.287086135825319e-08, "loss": 0.0321, "num_input_tokens_seen": 25186304, "step": 6149 }, { "epoch": 4.5827123695976155, "grad_norm": 5.025102095309868, "learning_rate": 4.271903606156938e-08, "loss": 0.0358, "num_input_tokens_seen": 25190400, "step": 6150 }, { "epoch": 4.583457526080477, "grad_norm": 10.544452727196534, "learning_rate": 4.256747540949388e-08, "loss": 0.1302, "num_input_tokens_seen": 25194496, "step": 6151 }, { "epoch": 4.584202682563339, "grad_norm": 4.129264924331483, "learning_rate": 4.2416179435249423e-08, "loss": 0.0359, "num_input_tokens_seen": 25198592, "step": 6152 }, { "epoch": 4.584947839046199, "grad_norm": 14.844442381792383, "learning_rate": 4.226514817200142e-08, "loss": 0.024, "num_input_tokens_seen": 25202688, "step": 6153 }, { "epoch": 4.585692995529061, "grad_norm": 4.4332262795666235, "learning_rate": 4.211438165285686e-08, "loss": 0.0287, "num_input_tokens_seen": 25206784, "step": 6154 }, { "epoch": 4.586438152011922, "grad_norm": 6.890660900765202, "learning_rate": 4.196387991086487e-08, "loss": 0.0711, "num_input_tokens_seen": 25210880, "step": 6155 }, { "epoch": 4.587183308494784, "grad_norm": 6.191602737613791, "learning_rate": 4.1813642979016686e-08, "loss": 0.0767, "num_input_tokens_seen": 25214976, "step": 6156 }, { "epoch": 4.5879284649776455, "grad_norm": 5.869799460609735, "learning_rate": 4.1663670890245004e-08, "loss": 0.0806, "num_input_tokens_seen": 25219072, "step": 6157 }, { "epoch": 4.588673621460507, "grad_norm": 5.665036059204045, "learning_rate": 4.151396367742505e-08, "loss": 0.0655, "num_input_tokens_seen": 25223168, "step": 6158 }, { "epoch": 4.589418777943368, "grad_norm": 6.500650823185447, "learning_rate": 4.136452137337363e-08, "loss": 0.0821, "num_input_tokens_seen": 25227264, "step": 6159 }, { "epoch": 4.590163934426229, "grad_norm": 7.9236146298403725, "learning_rate": 4.121534401084953e-08, "loss": 0.1553, "num_input_tokens_seen": 25231360, "step": 6160 }, { "epoch": 4.590909090909091, "grad_norm": 7.135103748380635, "learning_rate": 4.106643162255369e-08, "loss": 0.0803, "num_input_tokens_seen": 25235456, "step": 6161 }, { "epoch": 4.591654247391952, "grad_norm": 9.310556358317479, "learning_rate": 4.091778424112833e-08, "loss": 0.0666, "num_input_tokens_seen": 25239552, "step": 6162 }, { "epoch": 4.592399403874814, "grad_norm": 5.877389180160038, "learning_rate": 4.0769401899158486e-08, "loss": 0.0705, "num_input_tokens_seen": 25243648, "step": 6163 }, { "epoch": 4.5931445603576755, "grad_norm": 8.24203759546714, "learning_rate": 4.0621284629170225e-08, "loss": 0.0864, "num_input_tokens_seen": 25247744, "step": 6164 }, { "epoch": 4.593889716840536, "grad_norm": 8.800248016944357, "learning_rate": 4.0473432463632444e-08, "loss": 0.1502, "num_input_tokens_seen": 25251840, "step": 6165 }, { "epoch": 4.594634873323398, "grad_norm": 5.445867771498252, "learning_rate": 4.032584543495477e-08, "loss": 0.0731, "num_input_tokens_seen": 25255936, "step": 6166 }, { "epoch": 4.595380029806259, "grad_norm": 2.255809609408733, "learning_rate": 4.017852357548965e-08, "loss": 0.0059, "num_input_tokens_seen": 25260032, "step": 6167 }, { "epoch": 4.596125186289121, "grad_norm": 6.342655533133183, "learning_rate": 4.003146691753099e-08, "loss": 0.0504, "num_input_tokens_seen": 25264128, "step": 6168 }, { "epoch": 4.596870342771982, "grad_norm": 3.722739046324063, "learning_rate": 3.988467549331493e-08, "loss": 0.0252, "num_input_tokens_seen": 25268224, "step": 6169 }, { "epoch": 4.597615499254843, "grad_norm": 5.701850008269262, "learning_rate": 3.9738149335018795e-08, "loss": 0.0578, "num_input_tokens_seen": 25272320, "step": 6170 }, { "epoch": 4.598360655737705, "grad_norm": 8.205374251177291, "learning_rate": 3.959188847476259e-08, "loss": 0.1317, "num_input_tokens_seen": 25276416, "step": 6171 }, { "epoch": 4.599105812220566, "grad_norm": 6.907717458845591, "learning_rate": 3.9445892944607463e-08, "loss": 0.0515, "num_input_tokens_seen": 25280512, "step": 6172 }, { "epoch": 4.599850968703428, "grad_norm": 5.959794542790396, "learning_rate": 3.930016277655657e-08, "loss": 0.0328, "num_input_tokens_seen": 25284608, "step": 6173 }, { "epoch": 4.600596125186289, "grad_norm": 11.847642832619314, "learning_rate": 3.915469800255534e-08, "loss": 0.1014, "num_input_tokens_seen": 25288704, "step": 6174 }, { "epoch": 4.601341281669151, "grad_norm": 9.020567705027911, "learning_rate": 3.900949865449033e-08, "loss": 0.2209, "num_input_tokens_seen": 25292800, "step": 6175 }, { "epoch": 4.602086438152012, "grad_norm": 4.978122813804555, "learning_rate": 3.8864564764190535e-08, "loss": 0.0479, "num_input_tokens_seen": 25296896, "step": 6176 }, { "epoch": 4.602831594634873, "grad_norm": 7.599228332738885, "learning_rate": 3.871989636342638e-08, "loss": 0.1041, "num_input_tokens_seen": 25300992, "step": 6177 }, { "epoch": 4.603576751117735, "grad_norm": 4.923682477513042, "learning_rate": 3.857549348391054e-08, "loss": 0.0394, "num_input_tokens_seen": 25305088, "step": 6178 }, { "epoch": 4.604321907600596, "grad_norm": 6.724823563388537, "learning_rate": 3.8431356157296595e-08, "loss": 0.1021, "num_input_tokens_seen": 25309184, "step": 6179 }, { "epoch": 4.605067064083458, "grad_norm": 6.836577807796804, "learning_rate": 3.8287484415180934e-08, "loss": 0.0471, "num_input_tokens_seen": 25313280, "step": 6180 }, { "epoch": 4.605812220566319, "grad_norm": 6.74738964630498, "learning_rate": 3.814387828910096e-08, "loss": 0.1482, "num_input_tokens_seen": 25317376, "step": 6181 }, { "epoch": 4.60655737704918, "grad_norm": 8.017021011002434, "learning_rate": 3.800053781053623e-08, "loss": 0.108, "num_input_tokens_seen": 25321472, "step": 6182 }, { "epoch": 4.6073025335320414, "grad_norm": 4.77827344864367, "learning_rate": 3.785746301090826e-08, "loss": 0.0363, "num_input_tokens_seen": 25325568, "step": 6183 }, { "epoch": 4.608047690014903, "grad_norm": 5.983465135163643, "learning_rate": 3.771465392157961e-08, "loss": 0.0553, "num_input_tokens_seen": 25329664, "step": 6184 }, { "epoch": 4.6087928464977646, "grad_norm": 7.856019540234478, "learning_rate": 3.7572110573855514e-08, "loss": 0.0367, "num_input_tokens_seen": 25333760, "step": 6185 }, { "epoch": 4.609538002980626, "grad_norm": 8.456792617727734, "learning_rate": 3.742983299898223e-08, "loss": 0.1026, "num_input_tokens_seen": 25337856, "step": 6186 }, { "epoch": 4.610283159463488, "grad_norm": 4.953989507557271, "learning_rate": 3.728782122814814e-08, "loss": 0.0338, "num_input_tokens_seen": 25341952, "step": 6187 }, { "epoch": 4.611028315946349, "grad_norm": 5.003013453639464, "learning_rate": 3.714607529248307e-08, "loss": 0.0383, "num_input_tokens_seen": 25346048, "step": 6188 }, { "epoch": 4.61177347242921, "grad_norm": 6.898553586047165, "learning_rate": 3.700459522305883e-08, "loss": 0.0651, "num_input_tokens_seen": 25350144, "step": 6189 }, { "epoch": 4.612518628912071, "grad_norm": 5.427473534516758, "learning_rate": 3.6863381050888935e-08, "loss": 0.0146, "num_input_tokens_seen": 25354240, "step": 6190 }, { "epoch": 4.613263785394933, "grad_norm": 5.917302223862112, "learning_rate": 3.67224328069285e-08, "loss": 0.1061, "num_input_tokens_seen": 25358336, "step": 6191 }, { "epoch": 4.6140089418777945, "grad_norm": 7.463788925432434, "learning_rate": 3.6581750522074464e-08, "loss": 0.1533, "num_input_tokens_seen": 25362432, "step": 6192 }, { "epoch": 4.614754098360656, "grad_norm": 4.128588045823638, "learning_rate": 3.644133422716509e-08, "loss": 0.042, "num_input_tokens_seen": 25366528, "step": 6193 }, { "epoch": 4.615499254843517, "grad_norm": 5.93643435053542, "learning_rate": 3.630118395298102e-08, "loss": 0.0631, "num_input_tokens_seen": 25370624, "step": 6194 }, { "epoch": 4.616244411326378, "grad_norm": 8.913450827192323, "learning_rate": 3.6161299730243794e-08, "loss": 0.1043, "num_input_tokens_seen": 25374720, "step": 6195 }, { "epoch": 4.61698956780924, "grad_norm": 5.524171520853505, "learning_rate": 3.6021681589617495e-08, "loss": 0.0447, "num_input_tokens_seen": 25378816, "step": 6196 }, { "epoch": 4.617734724292101, "grad_norm": 3.0056363739059604, "learning_rate": 3.588232956170695e-08, "loss": 0.0125, "num_input_tokens_seen": 25382912, "step": 6197 }, { "epoch": 4.618479880774963, "grad_norm": 8.356292145310222, "learning_rate": 3.574324367705953e-08, "loss": 0.1678, "num_input_tokens_seen": 25387008, "step": 6198 }, { "epoch": 4.6192250372578245, "grad_norm": 7.595562425007197, "learning_rate": 3.560442396616362e-08, "loss": 0.0957, "num_input_tokens_seen": 25391104, "step": 6199 }, { "epoch": 4.619970193740685, "grad_norm": 7.716418331035459, "learning_rate": 3.5465870459449325e-08, "loss": 0.1321, "num_input_tokens_seen": 25395200, "step": 6200 }, { "epoch": 4.620715350223547, "grad_norm": 6.165762812694869, "learning_rate": 3.5327583187289017e-08, "loss": 0.0572, "num_input_tokens_seen": 25399296, "step": 6201 }, { "epoch": 4.621460506706408, "grad_norm": 12.581525706894064, "learning_rate": 3.518956217999594e-08, "loss": 0.2033, "num_input_tokens_seen": 25403392, "step": 6202 }, { "epoch": 4.62220566318927, "grad_norm": 7.201073242286056, "learning_rate": 3.505180746782535e-08, "loss": 0.1106, "num_input_tokens_seen": 25407488, "step": 6203 }, { "epoch": 4.622950819672131, "grad_norm": 6.6609000047375995, "learning_rate": 3.4914319080973924e-08, "loss": 0.0559, "num_input_tokens_seen": 25411584, "step": 6204 }, { "epoch": 4.623695976154993, "grad_norm": 1.9968192506003313, "learning_rate": 3.47770970495806e-08, "loss": 0.0062, "num_input_tokens_seen": 25415680, "step": 6205 }, { "epoch": 4.624441132637854, "grad_norm": 8.803161464356005, "learning_rate": 3.4640141403724794e-08, "loss": 0.0862, "num_input_tokens_seen": 25419776, "step": 6206 }, { "epoch": 4.625186289120715, "grad_norm": 7.876774533370694, "learning_rate": 3.45034521734286e-08, "loss": 0.129, "num_input_tokens_seen": 25423872, "step": 6207 }, { "epoch": 4.625931445603577, "grad_norm": 8.109092953799937, "learning_rate": 3.436702938865485e-08, "loss": 0.0922, "num_input_tokens_seen": 25427968, "step": 6208 }, { "epoch": 4.626676602086438, "grad_norm": 6.5668771349948445, "learning_rate": 3.4230873079308945e-08, "loss": 0.0858, "num_input_tokens_seen": 25432064, "step": 6209 }, { "epoch": 4.6274217585693, "grad_norm": 7.975005108557979, "learning_rate": 3.409498327523686e-08, "loss": 0.0764, "num_input_tokens_seen": 25436160, "step": 6210 }, { "epoch": 4.6281669150521605, "grad_norm": 5.707988789728377, "learning_rate": 3.395936000622685e-08, "loss": 0.0561, "num_input_tokens_seen": 25440256, "step": 6211 }, { "epoch": 4.628912071535022, "grad_norm": 8.055895601324243, "learning_rate": 3.382400330200833e-08, "loss": 0.186, "num_input_tokens_seen": 25444352, "step": 6212 }, { "epoch": 4.629657228017884, "grad_norm": 7.679844841261532, "learning_rate": 3.3688913192252564e-08, "loss": 0.0787, "num_input_tokens_seen": 25448448, "step": 6213 }, { "epoch": 4.630402384500745, "grad_norm": 5.848595064804572, "learning_rate": 3.3554089706572394e-08, "loss": 0.0967, "num_input_tokens_seen": 25452544, "step": 6214 }, { "epoch": 4.631147540983607, "grad_norm": 8.2447732261073, "learning_rate": 3.341953287452182e-08, "loss": 0.0813, "num_input_tokens_seen": 25456640, "step": 6215 }, { "epoch": 4.631892697466468, "grad_norm": 7.986656693056212, "learning_rate": 3.3285242725596965e-08, "loss": 0.0858, "num_input_tokens_seen": 25460736, "step": 6216 }, { "epoch": 4.63263785394933, "grad_norm": 5.693803067610976, "learning_rate": 3.315121928923498e-08, "loss": 0.0549, "num_input_tokens_seen": 25464832, "step": 6217 }, { "epoch": 4.6333830104321905, "grad_norm": 7.833278454348168, "learning_rate": 3.3017462594814586e-08, "loss": 0.1678, "num_input_tokens_seen": 25468928, "step": 6218 }, { "epoch": 4.634128166915052, "grad_norm": 7.448568404114518, "learning_rate": 3.288397267165677e-08, "loss": 0.0957, "num_input_tokens_seen": 25473024, "step": 6219 }, { "epoch": 4.634873323397914, "grad_norm": 6.742609042355312, "learning_rate": 3.275074954902299e-08, "loss": 0.0471, "num_input_tokens_seen": 25477120, "step": 6220 }, { "epoch": 4.635618479880775, "grad_norm": 8.83559604203067, "learning_rate": 3.261779325611683e-08, "loss": 0.0544, "num_input_tokens_seen": 25481216, "step": 6221 }, { "epoch": 4.636363636363637, "grad_norm": 6.053144957985743, "learning_rate": 3.248510382208317e-08, "loss": 0.0612, "num_input_tokens_seen": 25485312, "step": 6222 }, { "epoch": 4.637108792846497, "grad_norm": 6.535319376330362, "learning_rate": 3.235268127600874e-08, "loss": 0.0787, "num_input_tokens_seen": 25489408, "step": 6223 }, { "epoch": 4.637853949329359, "grad_norm": 4.661165180163014, "learning_rate": 3.222052564692116e-08, "loss": 0.04, "num_input_tokens_seen": 25493504, "step": 6224 }, { "epoch": 4.6385991058122205, "grad_norm": 6.969269059447764, "learning_rate": 3.2088636963790166e-08, "loss": 0.0317, "num_input_tokens_seen": 25497600, "step": 6225 }, { "epoch": 4.639344262295082, "grad_norm": 6.287965719845516, "learning_rate": 3.19570152555268e-08, "loss": 0.0976, "num_input_tokens_seen": 25501696, "step": 6226 }, { "epoch": 4.640089418777944, "grad_norm": 4.444707578576965, "learning_rate": 3.182566055098299e-08, "loss": 0.0359, "num_input_tokens_seen": 25505792, "step": 6227 }, { "epoch": 4.640834575260805, "grad_norm": 9.915840826846036, "learning_rate": 3.1694572878953195e-08, "loss": 0.1728, "num_input_tokens_seen": 25509888, "step": 6228 }, { "epoch": 4.641579731743666, "grad_norm": 6.848447695840525, "learning_rate": 3.156375226817221e-08, "loss": 0.1272, "num_input_tokens_seen": 25513984, "step": 6229 }, { "epoch": 4.642324888226527, "grad_norm": 4.368459550218183, "learning_rate": 3.1433198747317374e-08, "loss": 0.0144, "num_input_tokens_seen": 25518080, "step": 6230 }, { "epoch": 4.643070044709389, "grad_norm": 6.237125261662718, "learning_rate": 3.130291234500663e-08, "loss": 0.1125, "num_input_tokens_seen": 25522176, "step": 6231 }, { "epoch": 4.64381520119225, "grad_norm": 8.498554054127334, "learning_rate": 3.117289308980004e-08, "loss": 0.1808, "num_input_tokens_seen": 25526272, "step": 6232 }, { "epoch": 4.644560357675112, "grad_norm": 6.726357380966834, "learning_rate": 3.104314101019815e-08, "loss": 0.0948, "num_input_tokens_seen": 25530368, "step": 6233 }, { "epoch": 4.6453055141579735, "grad_norm": 10.395358555521694, "learning_rate": 3.091365613464403e-08, "loss": 0.1485, "num_input_tokens_seen": 25534464, "step": 6234 }, { "epoch": 4.646050670640834, "grad_norm": 4.3700838712541055, "learning_rate": 3.078443849152138e-08, "loss": 0.0193, "num_input_tokens_seen": 25538560, "step": 6235 }, { "epoch": 4.646795827123696, "grad_norm": 4.7218436273392514, "learning_rate": 3.0655488109156e-08, "loss": 0.0407, "num_input_tokens_seen": 25542656, "step": 6236 }, { "epoch": 4.647540983606557, "grad_norm": 6.33215404473622, "learning_rate": 3.0526805015814456e-08, "loss": 0.0897, "num_input_tokens_seen": 25546752, "step": 6237 }, { "epoch": 4.648286140089419, "grad_norm": 7.742041406987774, "learning_rate": 3.0398389239705004e-08, "loss": 0.1073, "num_input_tokens_seen": 25550848, "step": 6238 }, { "epoch": 4.64903129657228, "grad_norm": 7.509581615269888, "learning_rate": 3.027024080897736e-08, "loss": 0.1139, "num_input_tokens_seen": 25554944, "step": 6239 }, { "epoch": 4.649776453055141, "grad_norm": 8.749639385406343, "learning_rate": 3.014235975172253e-08, "loss": 0.1328, "num_input_tokens_seen": 25559040, "step": 6240 }, { "epoch": 4.650521609538003, "grad_norm": 6.423952914019023, "learning_rate": 3.001474609597321e-08, "loss": 0.1061, "num_input_tokens_seen": 25563136, "step": 6241 }, { "epoch": 4.651266766020864, "grad_norm": 8.882884872792584, "learning_rate": 2.988739986970274e-08, "loss": 0.1081, "num_input_tokens_seen": 25567232, "step": 6242 }, { "epoch": 4.652011922503726, "grad_norm": 7.077357406409451, "learning_rate": 2.9760321100826828e-08, "loss": 0.0885, "num_input_tokens_seen": 25571328, "step": 6243 }, { "epoch": 4.652757078986587, "grad_norm": 9.254883006031665, "learning_rate": 2.963350981720181e-08, "loss": 0.2088, "num_input_tokens_seen": 25575424, "step": 6244 }, { "epoch": 4.653502235469449, "grad_norm": 6.825643458779978, "learning_rate": 2.950696604662559e-08, "loss": 0.062, "num_input_tokens_seen": 25579520, "step": 6245 }, { "epoch": 4.65424739195231, "grad_norm": 6.124957157854699, "learning_rate": 2.9380689816837642e-08, "loss": 0.0821, "num_input_tokens_seen": 25583616, "step": 6246 }, { "epoch": 4.654992548435171, "grad_norm": 9.749546946407056, "learning_rate": 2.9254681155518188e-08, "loss": 0.1123, "num_input_tokens_seen": 25587712, "step": 6247 }, { "epoch": 4.655737704918033, "grad_norm": 7.256505978195537, "learning_rate": 2.9128940090289854e-08, "loss": 0.0969, "num_input_tokens_seen": 25591808, "step": 6248 }, { "epoch": 4.656482861400894, "grad_norm": 13.068874169500425, "learning_rate": 2.900346664871531e-08, "loss": 0.2076, "num_input_tokens_seen": 25595904, "step": 6249 }, { "epoch": 4.657228017883756, "grad_norm": 5.4803760421411125, "learning_rate": 2.8878260858299785e-08, "loss": 0.0504, "num_input_tokens_seen": 25600000, "step": 6250 }, { "epoch": 4.657973174366617, "grad_norm": 5.388603815300722, "learning_rate": 2.8753322746488955e-08, "loss": 0.0606, "num_input_tokens_seen": 25604096, "step": 6251 }, { "epoch": 4.658718330849478, "grad_norm": 4.817885320314949, "learning_rate": 2.862865234067036e-08, "loss": 0.0416, "num_input_tokens_seen": 25608192, "step": 6252 }, { "epoch": 4.6594634873323395, "grad_norm": 6.066694155376062, "learning_rate": 2.8504249668172555e-08, "loss": 0.0669, "num_input_tokens_seen": 25612288, "step": 6253 }, { "epoch": 4.660208643815201, "grad_norm": 7.6102260886744935, "learning_rate": 2.8380114756265396e-08, "loss": 0.0489, "num_input_tokens_seen": 25616384, "step": 6254 }, { "epoch": 4.660953800298063, "grad_norm": 6.29063232773675, "learning_rate": 2.825624763216031e-08, "loss": 0.0836, "num_input_tokens_seen": 25620480, "step": 6255 }, { "epoch": 4.661698956780924, "grad_norm": 6.873908183031854, "learning_rate": 2.8132648323009743e-08, "loss": 0.1504, "num_input_tokens_seen": 25624576, "step": 6256 }, { "epoch": 4.662444113263786, "grad_norm": 8.95249168392677, "learning_rate": 2.8009316855907577e-08, "loss": 0.1681, "num_input_tokens_seen": 25628672, "step": 6257 }, { "epoch": 4.663189269746647, "grad_norm": 7.510307341222235, "learning_rate": 2.7886253257888852e-08, "loss": 0.0248, "num_input_tokens_seen": 25632768, "step": 6258 }, { "epoch": 4.663934426229508, "grad_norm": 3.1203116576358703, "learning_rate": 2.7763457555930324e-08, "loss": 0.0149, "num_input_tokens_seen": 25636864, "step": 6259 }, { "epoch": 4.6646795827123695, "grad_norm": 6.742016998507534, "learning_rate": 2.764092977694921e-08, "loss": 0.102, "num_input_tokens_seen": 25640960, "step": 6260 }, { "epoch": 4.665424739195231, "grad_norm": 4.101878082804182, "learning_rate": 2.751866994780486e-08, "loss": 0.0361, "num_input_tokens_seen": 25645056, "step": 6261 }, { "epoch": 4.666169895678093, "grad_norm": 6.4046215934758335, "learning_rate": 2.7396678095297225e-08, "loss": 0.0272, "num_input_tokens_seen": 25649152, "step": 6262 }, { "epoch": 4.666915052160954, "grad_norm": 8.458095444282032, "learning_rate": 2.7274954246167967e-08, "loss": 0.1167, "num_input_tokens_seen": 25653248, "step": 6263 }, { "epoch": 4.667660208643815, "grad_norm": 7.780271682734739, "learning_rate": 2.715349842709991e-08, "loss": 0.1156, "num_input_tokens_seen": 25657344, "step": 6264 }, { "epoch": 4.668405365126676, "grad_norm": 4.26386098007361, "learning_rate": 2.703231066471676e-08, "loss": 0.0372, "num_input_tokens_seen": 25661440, "step": 6265 }, { "epoch": 4.669150521609538, "grad_norm": 5.400808946028117, "learning_rate": 2.691139098558393e-08, "loss": 0.0737, "num_input_tokens_seen": 25665536, "step": 6266 }, { "epoch": 4.6698956780923995, "grad_norm": 6.781012845483247, "learning_rate": 2.679073941620772e-08, "loss": 0.1249, "num_input_tokens_seen": 25669632, "step": 6267 }, { "epoch": 4.670640834575261, "grad_norm": 6.933067832954245, "learning_rate": 2.6670355983035996e-08, "loss": 0.0656, "num_input_tokens_seen": 25673728, "step": 6268 }, { "epoch": 4.671385991058123, "grad_norm": 6.711296964387614, "learning_rate": 2.6550240712457653e-08, "loss": 0.0657, "num_input_tokens_seen": 25677824, "step": 6269 }, { "epoch": 4.672131147540983, "grad_norm": 6.849275749269544, "learning_rate": 2.6430393630802742e-08, "loss": 0.1601, "num_input_tokens_seen": 25681920, "step": 6270 }, { "epoch": 4.672876304023845, "grad_norm": 6.161070705042332, "learning_rate": 2.6310814764342607e-08, "loss": 0.0732, "num_input_tokens_seen": 25686016, "step": 6271 }, { "epoch": 4.673621460506706, "grad_norm": 8.604616589164552, "learning_rate": 2.6191504139289613e-08, "loss": 0.0795, "num_input_tokens_seen": 25690112, "step": 6272 }, { "epoch": 4.674366616989568, "grad_norm": 7.391507740348601, "learning_rate": 2.607246178179812e-08, "loss": 0.1594, "num_input_tokens_seen": 25694208, "step": 6273 }, { "epoch": 4.6751117734724295, "grad_norm": 5.376628171148529, "learning_rate": 2.595368771796225e-08, "loss": 0.0402, "num_input_tokens_seen": 25698304, "step": 6274 }, { "epoch": 4.675856929955291, "grad_norm": 6.344556054636724, "learning_rate": 2.583518197381868e-08, "loss": 0.1033, "num_input_tokens_seen": 25702400, "step": 6275 }, { "epoch": 4.676602086438152, "grad_norm": 6.756604260971874, "learning_rate": 2.5716944575344406e-08, "loss": 0.0413, "num_input_tokens_seen": 25706496, "step": 6276 }, { "epoch": 4.677347242921013, "grad_norm": 4.848727946710611, "learning_rate": 2.559897554845814e-08, "loss": 0.0275, "num_input_tokens_seen": 25710592, "step": 6277 }, { "epoch": 4.678092399403875, "grad_norm": 7.147888062066299, "learning_rate": 2.5481274919019332e-08, "loss": 0.0472, "num_input_tokens_seen": 25714688, "step": 6278 }, { "epoch": 4.678837555886736, "grad_norm": 4.107466654627495, "learning_rate": 2.5363842712829018e-08, "loss": 0.032, "num_input_tokens_seen": 25718784, "step": 6279 }, { "epoch": 4.679582712369598, "grad_norm": 5.801811346657, "learning_rate": 2.524667895562896e-08, "loss": 0.0787, "num_input_tokens_seen": 25722880, "step": 6280 }, { "epoch": 4.6803278688524586, "grad_norm": 7.725793447532648, "learning_rate": 2.5129783673102508e-08, "loss": 0.1451, "num_input_tokens_seen": 25726976, "step": 6281 }, { "epoch": 4.68107302533532, "grad_norm": 7.314318719510636, "learning_rate": 2.5013156890873745e-08, "loss": 0.091, "num_input_tokens_seen": 25731072, "step": 6282 }, { "epoch": 4.681818181818182, "grad_norm": 15.244131356726834, "learning_rate": 2.4896798634508195e-08, "loss": 0.0938, "num_input_tokens_seen": 25735168, "step": 6283 }, { "epoch": 4.682563338301043, "grad_norm": 4.768422742362981, "learning_rate": 2.4780708929512537e-08, "loss": 0.0368, "num_input_tokens_seen": 25739264, "step": 6284 }, { "epoch": 4.683308494783905, "grad_norm": 5.870366388348382, "learning_rate": 2.4664887801334054e-08, "loss": 0.0877, "num_input_tokens_seen": 25743360, "step": 6285 }, { "epoch": 4.684053651266766, "grad_norm": 8.45230737797197, "learning_rate": 2.4549335275362157e-08, "loss": 0.068, "num_input_tokens_seen": 25747456, "step": 6286 }, { "epoch": 4.684798807749628, "grad_norm": 8.129033624900694, "learning_rate": 2.443405137692631e-08, "loss": 0.1305, "num_input_tokens_seen": 25751552, "step": 6287 }, { "epoch": 4.6855439642324885, "grad_norm": 5.61688812734534, "learning_rate": 2.4319036131297825e-08, "loss": 0.047, "num_input_tokens_seen": 25755648, "step": 6288 }, { "epoch": 4.68628912071535, "grad_norm": 3.9322090135964434, "learning_rate": 2.4204289563688478e-08, "loss": 0.0353, "num_input_tokens_seen": 25759744, "step": 6289 }, { "epoch": 4.687034277198212, "grad_norm": 7.716496354285409, "learning_rate": 2.4089811699252176e-08, "loss": 0.0474, "num_input_tokens_seen": 25763840, "step": 6290 }, { "epoch": 4.687779433681073, "grad_norm": 7.363044565762718, "learning_rate": 2.3975602563082732e-08, "loss": 0.0864, "num_input_tokens_seen": 25767936, "step": 6291 }, { "epoch": 4.688524590163935, "grad_norm": 6.215565489677902, "learning_rate": 2.3861662180215822e-08, "loss": 0.0978, "num_input_tokens_seen": 25772032, "step": 6292 }, { "epoch": 4.689269746646795, "grad_norm": 8.476053539096803, "learning_rate": 2.374799057562799e-08, "loss": 0.2382, "num_input_tokens_seen": 25776128, "step": 6293 }, { "epoch": 4.690014903129657, "grad_norm": 8.092824412196022, "learning_rate": 2.36345877742368e-08, "loss": 0.1466, "num_input_tokens_seen": 25780224, "step": 6294 }, { "epoch": 4.6907600596125185, "grad_norm": 11.899342719543153, "learning_rate": 2.3521453800901122e-08, "loss": 0.139, "num_input_tokens_seen": 25784320, "step": 6295 }, { "epoch": 4.69150521609538, "grad_norm": 2.8018365958776115, "learning_rate": 2.3408588680420424e-08, "loss": 0.0046, "num_input_tokens_seen": 25788416, "step": 6296 }, { "epoch": 4.692250372578242, "grad_norm": 8.183076961665876, "learning_rate": 2.3295992437535884e-08, "loss": 0.1507, "num_input_tokens_seen": 25792512, "step": 6297 }, { "epoch": 4.692995529061103, "grad_norm": 5.293529993747075, "learning_rate": 2.3183665096929153e-08, "loss": 0.07, "num_input_tokens_seen": 25796608, "step": 6298 }, { "epoch": 4.693740685543965, "grad_norm": 8.24156749358951, "learning_rate": 2.3071606683223307e-08, "loss": 0.1348, "num_input_tokens_seen": 25800704, "step": 6299 }, { "epoch": 4.694485842026825, "grad_norm": 7.624274221627825, "learning_rate": 2.295981722098245e-08, "loss": 0.0261, "num_input_tokens_seen": 25804800, "step": 6300 }, { "epoch": 4.695230998509687, "grad_norm": 3.035454341016618, "learning_rate": 2.2848296734711424e-08, "loss": 0.0089, "num_input_tokens_seen": 25808896, "step": 6301 }, { "epoch": 4.6959761549925485, "grad_norm": 5.982200043910032, "learning_rate": 2.2737045248856505e-08, "loss": 0.1018, "num_input_tokens_seen": 25812992, "step": 6302 }, { "epoch": 4.69672131147541, "grad_norm": 9.20678510593375, "learning_rate": 2.2626062787804437e-08, "loss": 0.1963, "num_input_tokens_seen": 25817088, "step": 6303 }, { "epoch": 4.697466467958272, "grad_norm": 6.485158532078793, "learning_rate": 2.2515349375883816e-08, "loss": 0.0979, "num_input_tokens_seen": 25821184, "step": 6304 }, { "epoch": 4.698211624441132, "grad_norm": 6.903656982820705, "learning_rate": 2.240490503736356e-08, "loss": 0.0886, "num_input_tokens_seen": 25825280, "step": 6305 }, { "epoch": 4.698956780923994, "grad_norm": 8.830490326430406, "learning_rate": 2.2294729796454028e-08, "loss": 0.1635, "num_input_tokens_seen": 25829376, "step": 6306 }, { "epoch": 4.699701937406855, "grad_norm": 6.592090738524656, "learning_rate": 2.2184823677306318e-08, "loss": 0.1414, "num_input_tokens_seen": 25833472, "step": 6307 }, { "epoch": 4.700447093889717, "grad_norm": 7.445791315650408, "learning_rate": 2.207518670401254e-08, "loss": 0.0699, "num_input_tokens_seen": 25837568, "step": 6308 }, { "epoch": 4.7011922503725785, "grad_norm": 5.5020083137560984, "learning_rate": 2.1965818900606118e-08, "loss": 0.0315, "num_input_tokens_seen": 25841664, "step": 6309 }, { "epoch": 4.701937406855439, "grad_norm": 1.765991054025926, "learning_rate": 2.1856720291061063e-08, "loss": 0.0057, "num_input_tokens_seen": 25845760, "step": 6310 }, { "epoch": 4.702682563338301, "grad_norm": 2.115799099687252, "learning_rate": 2.1747890899292688e-08, "loss": 0.006, "num_input_tokens_seen": 25849856, "step": 6311 }, { "epoch": 4.703427719821162, "grad_norm": 3.954262679701275, "learning_rate": 2.1639330749157055e-08, "loss": 0.0113, "num_input_tokens_seen": 25853952, "step": 6312 }, { "epoch": 4.704172876304024, "grad_norm": 10.531338979091764, "learning_rate": 2.1531039864451652e-08, "loss": 0.0834, "num_input_tokens_seen": 25858048, "step": 6313 }, { "epoch": 4.704918032786885, "grad_norm": 10.00608337487539, "learning_rate": 2.142301826891416e-08, "loss": 0.3098, "num_input_tokens_seen": 25862144, "step": 6314 }, { "epoch": 4.705663189269747, "grad_norm": 5.623385875902153, "learning_rate": 2.1315265986223967e-08, "loss": 0.0273, "num_input_tokens_seen": 25866240, "step": 6315 }, { "epoch": 4.7064083457526085, "grad_norm": 5.06395843026063, "learning_rate": 2.120778304000093e-08, "loss": 0.0394, "num_input_tokens_seen": 25870336, "step": 6316 }, { "epoch": 4.707153502235469, "grad_norm": 10.465563620248336, "learning_rate": 2.1100569453806203e-08, "loss": 0.1311, "num_input_tokens_seen": 25874432, "step": 6317 }, { "epoch": 4.707898658718331, "grad_norm": 6.3702728663062125, "learning_rate": 2.099362525114196e-08, "loss": 0.0681, "num_input_tokens_seen": 25878528, "step": 6318 }, { "epoch": 4.708643815201192, "grad_norm": 7.417353055808206, "learning_rate": 2.0886950455450834e-08, "loss": 0.0423, "num_input_tokens_seen": 25882624, "step": 6319 }, { "epoch": 4.709388971684054, "grad_norm": 6.345010427283557, "learning_rate": 2.078054509011676e-08, "loss": 0.0682, "num_input_tokens_seen": 25886720, "step": 6320 }, { "epoch": 4.710134128166915, "grad_norm": 8.422630109194523, "learning_rate": 2.0674409178464695e-08, "loss": 0.1312, "num_input_tokens_seen": 25890816, "step": 6321 }, { "epoch": 4.710879284649776, "grad_norm": 4.799431815148077, "learning_rate": 2.0568542743760328e-08, "loss": 0.0369, "num_input_tokens_seen": 25894912, "step": 6322 }, { "epoch": 4.711624441132638, "grad_norm": 7.054022248423084, "learning_rate": 2.0462945809210238e-08, "loss": 0.1026, "num_input_tokens_seen": 25899008, "step": 6323 }, { "epoch": 4.712369597615499, "grad_norm": 6.56646328727468, "learning_rate": 2.0357618397962158e-08, "loss": 0.0843, "num_input_tokens_seen": 25903104, "step": 6324 }, { "epoch": 4.713114754098361, "grad_norm": 7.179799933187549, "learning_rate": 2.0252560533104565e-08, "loss": 0.1089, "num_input_tokens_seen": 25907200, "step": 6325 }, { "epoch": 4.713859910581222, "grad_norm": 6.218166644473736, "learning_rate": 2.0147772237666953e-08, "loss": 0.0414, "num_input_tokens_seen": 25911296, "step": 6326 }, { "epoch": 4.714605067064084, "grad_norm": 5.2273290933234415, "learning_rate": 2.0043253534619417e-08, "loss": 0.0207, "num_input_tokens_seen": 25915392, "step": 6327 }, { "epoch": 4.715350223546945, "grad_norm": 3.5070725966451253, "learning_rate": 1.9939004446873495e-08, "loss": 0.021, "num_input_tokens_seen": 25919488, "step": 6328 }, { "epoch": 4.716095380029806, "grad_norm": 6.8087941200819, "learning_rate": 1.9835024997281322e-08, "loss": 0.142, "num_input_tokens_seen": 25923584, "step": 6329 }, { "epoch": 4.7168405365126675, "grad_norm": 7.721923792949683, "learning_rate": 1.9731315208635638e-08, "loss": 0.1267, "num_input_tokens_seen": 25927680, "step": 6330 }, { "epoch": 4.717585692995529, "grad_norm": 4.459255844759668, "learning_rate": 1.9627875103670618e-08, "loss": 0.0349, "num_input_tokens_seen": 25931776, "step": 6331 }, { "epoch": 4.718330849478391, "grad_norm": 7.757687374096134, "learning_rate": 1.9524704705061043e-08, "loss": 0.1263, "num_input_tokens_seen": 25935872, "step": 6332 }, { "epoch": 4.719076005961252, "grad_norm": 7.51805313599231, "learning_rate": 1.942180403542257e-08, "loss": 0.0797, "num_input_tokens_seen": 25939968, "step": 6333 }, { "epoch": 4.719821162444113, "grad_norm": 6.181240124283799, "learning_rate": 1.9319173117311742e-08, "loss": 0.0767, "num_input_tokens_seen": 25944064, "step": 6334 }, { "epoch": 4.720566318926974, "grad_norm": 9.164927022259153, "learning_rate": 1.921681197322611e-08, "loss": 0.0976, "num_input_tokens_seen": 25948160, "step": 6335 }, { "epoch": 4.721311475409836, "grad_norm": 4.904328026497584, "learning_rate": 1.911472062560371e-08, "loss": 0.0484, "num_input_tokens_seen": 25952256, "step": 6336 }, { "epoch": 4.7220566318926975, "grad_norm": 11.111429045198404, "learning_rate": 1.9012899096823996e-08, "loss": 0.1318, "num_input_tokens_seen": 25956352, "step": 6337 }, { "epoch": 4.722801788375559, "grad_norm": 5.5200735142108535, "learning_rate": 1.891134740920675e-08, "loss": 0.0499, "num_input_tokens_seen": 25960448, "step": 6338 }, { "epoch": 4.723546944858421, "grad_norm": 5.2928152919714835, "learning_rate": 1.8810065585012915e-08, "loss": 0.0227, "num_input_tokens_seen": 25964544, "step": 6339 }, { "epoch": 4.724292101341281, "grad_norm": 5.766776259356273, "learning_rate": 1.8709053646444318e-08, "loss": 0.0323, "num_input_tokens_seen": 25968640, "step": 6340 }, { "epoch": 4.725037257824143, "grad_norm": 4.752576089022857, "learning_rate": 1.860831161564325e-08, "loss": 0.0294, "num_input_tokens_seen": 25972736, "step": 6341 }, { "epoch": 4.725782414307004, "grad_norm": 8.740264897061289, "learning_rate": 1.850783951469315e-08, "loss": 0.1123, "num_input_tokens_seen": 25976832, "step": 6342 }, { "epoch": 4.726527570789866, "grad_norm": 11.24373757357574, "learning_rate": 1.8407637365618346e-08, "loss": 0.1031, "num_input_tokens_seen": 25980928, "step": 6343 }, { "epoch": 4.7272727272727275, "grad_norm": 7.7107187247274505, "learning_rate": 1.8307705190383773e-08, "loss": 0.1238, "num_input_tokens_seen": 25985024, "step": 6344 }, { "epoch": 4.728017883755589, "grad_norm": 5.225269067950273, "learning_rate": 1.8208043010895375e-08, "loss": 0.0574, "num_input_tokens_seen": 25989120, "step": 6345 }, { "epoch": 4.72876304023845, "grad_norm": 4.741180177932889, "learning_rate": 1.810865084899957e-08, "loss": 0.0196, "num_input_tokens_seen": 25993216, "step": 6346 }, { "epoch": 4.729508196721311, "grad_norm": 4.363555699692166, "learning_rate": 1.8009528726484064e-08, "loss": 0.0427, "num_input_tokens_seen": 25997312, "step": 6347 }, { "epoch": 4.730253353204173, "grad_norm": 3.417752463303751, "learning_rate": 1.791067666507704e-08, "loss": 0.0225, "num_input_tokens_seen": 26001408, "step": 6348 }, { "epoch": 4.730998509687034, "grad_norm": 6.010553187394703, "learning_rate": 1.781209468644754e-08, "loss": 0.0468, "num_input_tokens_seen": 26005504, "step": 6349 }, { "epoch": 4.731743666169896, "grad_norm": 5.663651413440828, "learning_rate": 1.771378281220551e-08, "loss": 0.0834, "num_input_tokens_seen": 26009600, "step": 6350 }, { "epoch": 4.732488822652757, "grad_norm": 4.87276726803878, "learning_rate": 1.7615741063901764e-08, "loss": 0.0265, "num_input_tokens_seen": 26013696, "step": 6351 }, { "epoch": 4.733233979135618, "grad_norm": 6.6268611806200255, "learning_rate": 1.7517969463027446e-08, "loss": 0.0614, "num_input_tokens_seen": 26017792, "step": 6352 }, { "epoch": 4.73397913561848, "grad_norm": 8.060638779214571, "learning_rate": 1.742046803101499e-08, "loss": 0.121, "num_input_tokens_seen": 26021888, "step": 6353 }, { "epoch": 4.734724292101341, "grad_norm": 9.000908278606135, "learning_rate": 1.7323236789237307e-08, "loss": 0.2127, "num_input_tokens_seen": 26025984, "step": 6354 }, { "epoch": 4.735469448584203, "grad_norm": 3.1721127289187407, "learning_rate": 1.7226275759008175e-08, "loss": 0.0142, "num_input_tokens_seen": 26030080, "step": 6355 }, { "epoch": 4.736214605067064, "grad_norm": 5.912847912767531, "learning_rate": 1.7129584961582263e-08, "loss": 0.045, "num_input_tokens_seen": 26034176, "step": 6356 }, { "epoch": 4.736959761549926, "grad_norm": 7.1058264035099965, "learning_rate": 1.7033164418154696e-08, "loss": 0.0978, "num_input_tokens_seen": 26038272, "step": 6357 }, { "epoch": 4.737704918032787, "grad_norm": 6.953333076669959, "learning_rate": 1.6937014149861902e-08, "loss": 0.0956, "num_input_tokens_seen": 26042368, "step": 6358 }, { "epoch": 4.738450074515648, "grad_norm": 8.23857324303588, "learning_rate": 1.6841134177780216e-08, "loss": 0.0632, "num_input_tokens_seen": 26046464, "step": 6359 }, { "epoch": 4.73919523099851, "grad_norm": 6.403101719336774, "learning_rate": 1.6745524522927546e-08, "loss": 0.1254, "num_input_tokens_seen": 26050560, "step": 6360 }, { "epoch": 4.739940387481371, "grad_norm": 4.9734200111626645, "learning_rate": 1.6650185206261992e-08, "loss": 0.0447, "num_input_tokens_seen": 26054656, "step": 6361 }, { "epoch": 4.740685543964233, "grad_norm": 7.270864837546218, "learning_rate": 1.6555116248682946e-08, "loss": 0.1077, "num_input_tokens_seen": 26058752, "step": 6362 }, { "epoch": 4.7414307004470935, "grad_norm": 5.217942115645898, "learning_rate": 1.6460317671029984e-08, "loss": 0.0353, "num_input_tokens_seen": 26062848, "step": 6363 }, { "epoch": 4.742175856929955, "grad_norm": 4.099627902114977, "learning_rate": 1.6365789494083574e-08, "loss": 0.0283, "num_input_tokens_seen": 26066944, "step": 6364 }, { "epoch": 4.742921013412817, "grad_norm": 7.193039386276516, "learning_rate": 1.627153173856505e-08, "loss": 0.1131, "num_input_tokens_seen": 26071040, "step": 6365 }, { "epoch": 4.743666169895678, "grad_norm": 4.8003608466853995, "learning_rate": 1.617754442513636e-08, "loss": 0.0367, "num_input_tokens_seen": 26075136, "step": 6366 }, { "epoch": 4.74441132637854, "grad_norm": 8.92916063408251, "learning_rate": 1.608382757440033e-08, "loss": 0.2459, "num_input_tokens_seen": 26079232, "step": 6367 }, { "epoch": 4.745156482861401, "grad_norm": 9.015876726711152, "learning_rate": 1.5990381206900108e-08, "loss": 0.182, "num_input_tokens_seen": 26083328, "step": 6368 }, { "epoch": 4.745901639344263, "grad_norm": 4.184118519930767, "learning_rate": 1.5897205343120003e-08, "loss": 0.0297, "num_input_tokens_seen": 26087424, "step": 6369 }, { "epoch": 4.7466467958271235, "grad_norm": 9.259564381526257, "learning_rate": 1.580430000348479e-08, "loss": 0.0222, "num_input_tokens_seen": 26091520, "step": 6370 }, { "epoch": 4.747391952309985, "grad_norm": 4.883028374378907, "learning_rate": 1.571166520835998e-08, "loss": 0.0641, "num_input_tokens_seen": 26095616, "step": 6371 }, { "epoch": 4.748137108792847, "grad_norm": 4.426504363782641, "learning_rate": 1.561930097805156e-08, "loss": 0.029, "num_input_tokens_seen": 26099712, "step": 6372 }, { "epoch": 4.748882265275708, "grad_norm": 10.083216935085144, "learning_rate": 1.55272073328068e-08, "loss": 0.2809, "num_input_tokens_seen": 26103808, "step": 6373 }, { "epoch": 4.74962742175857, "grad_norm": 4.754094583319637, "learning_rate": 1.543538429281316e-08, "loss": 0.0395, "num_input_tokens_seen": 26107904, "step": 6374 }, { "epoch": 4.75037257824143, "grad_norm": 6.1289011093697505, "learning_rate": 1.534383187819871e-08, "loss": 0.0749, "num_input_tokens_seen": 26112000, "step": 6375 }, { "epoch": 4.751117734724292, "grad_norm": 12.020209664286153, "learning_rate": 1.5252550109032678e-08, "loss": 0.0684, "num_input_tokens_seen": 26116096, "step": 6376 }, { "epoch": 4.751862891207153, "grad_norm": 2.738012835052017, "learning_rate": 1.5161539005324465e-08, "loss": 0.0107, "num_input_tokens_seen": 26120192, "step": 6377 }, { "epoch": 4.752608047690015, "grad_norm": 10.434685952813524, "learning_rate": 1.507079858702451e-08, "loss": 0.235, "num_input_tokens_seen": 26124288, "step": 6378 }, { "epoch": 4.7533532041728765, "grad_norm": 5.179248272676582, "learning_rate": 1.4980328874023703e-08, "loss": 0.0586, "num_input_tokens_seen": 26128384, "step": 6379 }, { "epoch": 4.754098360655737, "grad_norm": 9.941024366783838, "learning_rate": 1.4890129886153681e-08, "loss": 0.2122, "num_input_tokens_seen": 26132480, "step": 6380 }, { "epoch": 4.754843517138599, "grad_norm": 3.145169142022544, "learning_rate": 1.4800201643186407e-08, "loss": 0.0161, "num_input_tokens_seen": 26136576, "step": 6381 }, { "epoch": 4.75558867362146, "grad_norm": 5.993369839899875, "learning_rate": 1.4710544164835277e-08, "loss": 0.0428, "num_input_tokens_seen": 26140672, "step": 6382 }, { "epoch": 4.756333830104322, "grad_norm": 6.244086686389394, "learning_rate": 1.46211574707536e-08, "loss": 0.0169, "num_input_tokens_seen": 26144768, "step": 6383 }, { "epoch": 4.757078986587183, "grad_norm": 3.884197111307224, "learning_rate": 1.4532041580535422e-08, "loss": 0.0354, "num_input_tokens_seen": 26148864, "step": 6384 }, { "epoch": 4.757824143070045, "grad_norm": 7.601370334947398, "learning_rate": 1.4443196513715952e-08, "loss": 0.0601, "num_input_tokens_seen": 26152960, "step": 6385 }, { "epoch": 4.7585692995529065, "grad_norm": 1.5230584432953753, "learning_rate": 1.4354622289770443e-08, "loss": 0.0043, "num_input_tokens_seen": 26157056, "step": 6386 }, { "epoch": 4.759314456035767, "grad_norm": 6.034557559492301, "learning_rate": 1.4266318928114892e-08, "loss": 0.0321, "num_input_tokens_seen": 26161152, "step": 6387 }, { "epoch": 4.760059612518629, "grad_norm": 7.972259930966415, "learning_rate": 1.4178286448106315e-08, "loss": 0.1248, "num_input_tokens_seen": 26165248, "step": 6388 }, { "epoch": 4.76080476900149, "grad_norm": 7.664385458121251, "learning_rate": 1.4090524869041777e-08, "loss": 0.1206, "num_input_tokens_seen": 26169344, "step": 6389 }, { "epoch": 4.761549925484352, "grad_norm": 6.534551239218911, "learning_rate": 1.4003034210159499e-08, "loss": 0.0735, "num_input_tokens_seen": 26173440, "step": 6390 }, { "epoch": 4.762295081967213, "grad_norm": 6.843666930611611, "learning_rate": 1.3915814490637752e-08, "loss": 0.032, "num_input_tokens_seen": 26177536, "step": 6391 }, { "epoch": 4.763040238450074, "grad_norm": 5.972500896703068, "learning_rate": 1.3828865729596103e-08, "loss": 0.0581, "num_input_tokens_seen": 26181632, "step": 6392 }, { "epoch": 4.763785394932936, "grad_norm": 6.939242833974175, "learning_rate": 1.3742187946094026e-08, "loss": 0.1221, "num_input_tokens_seen": 26185728, "step": 6393 }, { "epoch": 4.764530551415797, "grad_norm": 7.614457790333533, "learning_rate": 1.3655781159132159e-08, "loss": 0.0894, "num_input_tokens_seen": 26189824, "step": 6394 }, { "epoch": 4.765275707898659, "grad_norm": 6.606455557303694, "learning_rate": 1.356964538765118e-08, "loss": 0.0568, "num_input_tokens_seen": 26193920, "step": 6395 }, { "epoch": 4.76602086438152, "grad_norm": 8.98454575656973, "learning_rate": 1.3483780650532796e-08, "loss": 0.0774, "num_input_tokens_seen": 26198016, "step": 6396 }, { "epoch": 4.766766020864382, "grad_norm": 6.95132393093904, "learning_rate": 1.3398186966599307e-08, "loss": 0.0952, "num_input_tokens_seen": 26202112, "step": 6397 }, { "epoch": 4.767511177347243, "grad_norm": 6.896683933408748, "learning_rate": 1.3312864354613347e-08, "loss": 0.1572, "num_input_tokens_seen": 26206208, "step": 6398 }, { "epoch": 4.768256333830104, "grad_norm": 7.246692775770287, "learning_rate": 1.3227812833278147e-08, "loss": 0.1141, "num_input_tokens_seen": 26210304, "step": 6399 }, { "epoch": 4.769001490312966, "grad_norm": 8.603789723468097, "learning_rate": 1.3143032421237961e-08, "loss": 0.0949, "num_input_tokens_seen": 26214400, "step": 6400 }, { "epoch": 4.769746646795827, "grad_norm": 6.989380349896133, "learning_rate": 1.3058523137076812e-08, "loss": 0.0804, "num_input_tokens_seen": 26218496, "step": 6401 }, { "epoch": 4.770491803278689, "grad_norm": 7.062916571631798, "learning_rate": 1.2974284999320019e-08, "loss": 0.0941, "num_input_tokens_seen": 26222592, "step": 6402 }, { "epoch": 4.77123695976155, "grad_norm": 9.15209250427066, "learning_rate": 1.2890318026433091e-08, "loss": 0.2099, "num_input_tokens_seen": 26226688, "step": 6403 }, { "epoch": 4.771982116244411, "grad_norm": 6.417456717632871, "learning_rate": 1.2806622236822136e-08, "loss": 0.069, "num_input_tokens_seen": 26230784, "step": 6404 }, { "epoch": 4.7727272727272725, "grad_norm": 8.618729261727523, "learning_rate": 1.2723197648834145e-08, "loss": 0.1485, "num_input_tokens_seen": 26234880, "step": 6405 }, { "epoch": 4.773472429210134, "grad_norm": 8.751047439621498, "learning_rate": 1.2640044280755882e-08, "loss": 0.2116, "num_input_tokens_seen": 26238976, "step": 6406 }, { "epoch": 4.774217585692996, "grad_norm": 7.9261484034564385, "learning_rate": 1.2557162150815816e-08, "loss": 0.1087, "num_input_tokens_seen": 26243072, "step": 6407 }, { "epoch": 4.774962742175857, "grad_norm": 9.152112073575283, "learning_rate": 1.247455127718164e-08, "loss": 0.1641, "num_input_tokens_seen": 26247168, "step": 6408 }, { "epoch": 4.775707898658719, "grad_norm": 4.0074157410618865, "learning_rate": 1.2392211677962617e-08, "loss": 0.0345, "num_input_tokens_seen": 26251264, "step": 6409 }, { "epoch": 4.776453055141579, "grad_norm": 6.8558007377798775, "learning_rate": 1.23101433712082e-08, "loss": 0.109, "num_input_tokens_seen": 26255360, "step": 6410 }, { "epoch": 4.777198211624441, "grad_norm": 6.314816808631844, "learning_rate": 1.2228346374908163e-08, "loss": 0.0285, "num_input_tokens_seen": 26259456, "step": 6411 }, { "epoch": 4.7779433681073025, "grad_norm": 6.549598053775398, "learning_rate": 1.2146820706993023e-08, "loss": 0.0453, "num_input_tokens_seen": 26263552, "step": 6412 }, { "epoch": 4.778688524590164, "grad_norm": 6.240248631744275, "learning_rate": 1.2065566385333905e-08, "loss": 0.0867, "num_input_tokens_seen": 26267648, "step": 6413 }, { "epoch": 4.779433681073026, "grad_norm": 6.3476532030415385, "learning_rate": 1.1984583427742391e-08, "loss": 0.1252, "num_input_tokens_seen": 26271744, "step": 6414 }, { "epoch": 4.780178837555887, "grad_norm": 6.165470044582648, "learning_rate": 1.1903871851970255e-08, "loss": 0.074, "num_input_tokens_seen": 26275840, "step": 6415 }, { "epoch": 4.780923994038748, "grad_norm": 8.64925898191153, "learning_rate": 1.182343167571029e-08, "loss": 0.1519, "num_input_tokens_seen": 26279936, "step": 6416 }, { "epoch": 4.781669150521609, "grad_norm": 3.539765064213001, "learning_rate": 1.1743262916595472e-08, "loss": 0.0176, "num_input_tokens_seen": 26284032, "step": 6417 }, { "epoch": 4.782414307004471, "grad_norm": 5.5879717426217255, "learning_rate": 1.1663365592199244e-08, "loss": 0.0987, "num_input_tokens_seen": 26288128, "step": 6418 }, { "epoch": 4.7831594634873325, "grad_norm": 1.9785749819719052, "learning_rate": 1.158373972003593e-08, "loss": 0.0053, "num_input_tokens_seen": 26292224, "step": 6419 }, { "epoch": 4.783904619970194, "grad_norm": 6.441134652026123, "learning_rate": 1.1504385317559763e-08, "loss": 0.026, "num_input_tokens_seen": 26296320, "step": 6420 }, { "epoch": 4.784649776453055, "grad_norm": 5.006703235012268, "learning_rate": 1.1425302402166272e-08, "loss": 0.0292, "num_input_tokens_seen": 26300416, "step": 6421 }, { "epoch": 4.785394932935916, "grad_norm": 7.076304347539459, "learning_rate": 1.1346490991190479e-08, "loss": 0.0641, "num_input_tokens_seen": 26304512, "step": 6422 }, { "epoch": 4.786140089418778, "grad_norm": 7.702738263371813, "learning_rate": 1.1267951101908841e-08, "loss": 0.1746, "num_input_tokens_seen": 26308608, "step": 6423 }, { "epoch": 4.786885245901639, "grad_norm": 6.582466103545716, "learning_rate": 1.1189682751537451e-08, "loss": 0.0755, "num_input_tokens_seen": 26312704, "step": 6424 }, { "epoch": 4.787630402384501, "grad_norm": 7.3549651544702845, "learning_rate": 1.1111685957233553e-08, "loss": 0.1071, "num_input_tokens_seen": 26316800, "step": 6425 }, { "epoch": 4.788375558867362, "grad_norm": 7.602050725277702, "learning_rate": 1.1033960736094584e-08, "loss": 0.1292, "num_input_tokens_seen": 26320896, "step": 6426 }, { "epoch": 4.789120715350224, "grad_norm": 8.897285206324614, "learning_rate": 1.0956507105158442e-08, "loss": 0.1205, "num_input_tokens_seen": 26324992, "step": 6427 }, { "epoch": 4.789865871833085, "grad_norm": 9.241151163487098, "learning_rate": 1.0879325081403491e-08, "loss": 0.1666, "num_input_tokens_seen": 26329088, "step": 6428 }, { "epoch": 4.790611028315946, "grad_norm": 7.723360895930647, "learning_rate": 1.0802414681748557e-08, "loss": 0.1453, "num_input_tokens_seen": 26333184, "step": 6429 }, { "epoch": 4.791356184798808, "grad_norm": 7.046075450660305, "learning_rate": 1.0725775923053073e-08, "loss": 0.1413, "num_input_tokens_seen": 26337280, "step": 6430 }, { "epoch": 4.792101341281669, "grad_norm": 8.924697902012102, "learning_rate": 1.0649408822116515e-08, "loss": 0.0943, "num_input_tokens_seen": 26341376, "step": 6431 }, { "epoch": 4.792846497764531, "grad_norm": 5.808774498680223, "learning_rate": 1.0573313395679519e-08, "loss": 0.0489, "num_input_tokens_seen": 26345472, "step": 6432 }, { "epoch": 4.7935916542473915, "grad_norm": 8.03045396612977, "learning_rate": 1.0497489660422356e-08, "loss": 0.1184, "num_input_tokens_seen": 26349568, "step": 6433 }, { "epoch": 4.794336810730253, "grad_norm": 3.972645253755523, "learning_rate": 1.042193763296645e-08, "loss": 0.0322, "num_input_tokens_seen": 26353664, "step": 6434 }, { "epoch": 4.795081967213115, "grad_norm": 5.113714210792632, "learning_rate": 1.0346657329873138e-08, "loss": 0.06, "num_input_tokens_seen": 26357760, "step": 6435 }, { "epoch": 4.795827123695976, "grad_norm": 8.508715874804466, "learning_rate": 1.02716487676445e-08, "loss": 0.1109, "num_input_tokens_seen": 26361856, "step": 6436 }, { "epoch": 4.796572280178838, "grad_norm": 7.403605959941923, "learning_rate": 1.0196911962722793e-08, "loss": 0.0727, "num_input_tokens_seen": 26365952, "step": 6437 }, { "epoch": 4.797317436661699, "grad_norm": 7.550950906374105, "learning_rate": 1.012244693149103e-08, "loss": 0.0885, "num_input_tokens_seen": 26370048, "step": 6438 }, { "epoch": 4.798062593144561, "grad_norm": 7.208076094055722, "learning_rate": 1.0048253690272402e-08, "loss": 0.0887, "num_input_tokens_seen": 26374144, "step": 6439 }, { "epoch": 4.7988077496274215, "grad_norm": 8.562107372917064, "learning_rate": 9.974332255330565e-09, "loss": 0.0573, "num_input_tokens_seen": 26378240, "step": 6440 }, { "epoch": 4.799552906110283, "grad_norm": 5.780688249312259, "learning_rate": 9.90068264286978e-09, "loss": 0.0622, "num_input_tokens_seen": 26382336, "step": 6441 }, { "epoch": 4.800298062593145, "grad_norm": 13.76196536233959, "learning_rate": 9.827304869034354e-09, "loss": 0.0644, "num_input_tokens_seen": 26386432, "step": 6442 }, { "epoch": 4.801043219076006, "grad_norm": 6.151212865495602, "learning_rate": 9.754198949909477e-09, "loss": 0.0641, "num_input_tokens_seen": 26390528, "step": 6443 }, { "epoch": 4.801788375558868, "grad_norm": 8.321830433245264, "learning_rate": 9.681364901520247e-09, "loss": 0.1725, "num_input_tokens_seen": 26394624, "step": 6444 }, { "epoch": 4.802533532041728, "grad_norm": 6.039317387758255, "learning_rate": 9.608802739832363e-09, "loss": 0.0815, "num_input_tokens_seen": 26398720, "step": 6445 }, { "epoch": 4.80327868852459, "grad_norm": 8.720083757421786, "learning_rate": 9.53651248075227e-09, "loss": 0.1232, "num_input_tokens_seen": 26402816, "step": 6446 }, { "epoch": 4.8040238450074515, "grad_norm": 4.7013765500865965, "learning_rate": 9.464494140126318e-09, "loss": 0.024, "num_input_tokens_seen": 26406912, "step": 6447 }, { "epoch": 4.804769001490313, "grad_norm": 4.969346721383907, "learning_rate": 9.392747733741464e-09, "loss": 0.0247, "num_input_tokens_seen": 26411008, "step": 6448 }, { "epoch": 4.805514157973175, "grad_norm": 11.997341587266432, "learning_rate": 9.321273277324988e-09, "loss": 0.222, "num_input_tokens_seen": 26415104, "step": 6449 }, { "epoch": 4.806259314456035, "grad_norm": 4.078462498712603, "learning_rate": 9.250070786544774e-09, "loss": 0.0258, "num_input_tokens_seen": 26419200, "step": 6450 }, { "epoch": 4.807004470938897, "grad_norm": 6.800010677231236, "learning_rate": 9.179140277008613e-09, "loss": 0.1243, "num_input_tokens_seen": 26423296, "step": 6451 }, { "epoch": 4.807749627421758, "grad_norm": 8.764719336711163, "learning_rate": 9.108481764265181e-09, "loss": 0.0411, "num_input_tokens_seen": 26427392, "step": 6452 }, { "epoch": 4.80849478390462, "grad_norm": 5.110408128377366, "learning_rate": 9.038095263803338e-09, "loss": 0.0636, "num_input_tokens_seen": 26431488, "step": 6453 }, { "epoch": 4.8092399403874815, "grad_norm": 6.100076386490355, "learning_rate": 8.967980791052406e-09, "loss": 0.0463, "num_input_tokens_seen": 26435584, "step": 6454 }, { "epoch": 4.809985096870343, "grad_norm": 2.938768042558003, "learning_rate": 8.898138361381759e-09, "loss": 0.0141, "num_input_tokens_seen": 26439680, "step": 6455 }, { "epoch": 4.810730253353205, "grad_norm": 7.263923788824545, "learning_rate": 8.828567990101372e-09, "loss": 0.0835, "num_input_tokens_seen": 26443776, "step": 6456 }, { "epoch": 4.811475409836065, "grad_norm": 8.594456200690562, "learning_rate": 8.759269692461686e-09, "loss": 0.1276, "num_input_tokens_seen": 26447872, "step": 6457 }, { "epoch": 4.812220566318927, "grad_norm": 7.425696056644263, "learning_rate": 8.690243483653188e-09, "loss": 0.1133, "num_input_tokens_seen": 26451968, "step": 6458 }, { "epoch": 4.812965722801788, "grad_norm": 7.42797857645316, "learning_rate": 8.621489378807246e-09, "loss": 0.1491, "num_input_tokens_seen": 26456064, "step": 6459 }, { "epoch": 4.81371087928465, "grad_norm": 5.738180705059607, "learning_rate": 8.553007392994999e-09, "loss": 0.0516, "num_input_tokens_seen": 26460160, "step": 6460 }, { "epoch": 4.8144560357675115, "grad_norm": 11.521444030746265, "learning_rate": 8.484797541228329e-09, "loss": 0.0887, "num_input_tokens_seen": 26464256, "step": 6461 }, { "epoch": 4.815201192250372, "grad_norm": 6.696657787824977, "learning_rate": 8.416859838459162e-09, "loss": 0.1269, "num_input_tokens_seen": 26468352, "step": 6462 }, { "epoch": 4.815946348733234, "grad_norm": 5.007680768681501, "learning_rate": 8.349194299580171e-09, "loss": 0.0452, "num_input_tokens_seen": 26472448, "step": 6463 }, { "epoch": 4.816691505216095, "grad_norm": 4.188811303655636, "learning_rate": 8.281800939423796e-09, "loss": 0.0238, "num_input_tokens_seen": 26476544, "step": 6464 }, { "epoch": 4.817436661698957, "grad_norm": 4.990769456245447, "learning_rate": 8.214679772763356e-09, "loss": 0.0176, "num_input_tokens_seen": 26480640, "step": 6465 }, { "epoch": 4.818181818181818, "grad_norm": 7.439750101092643, "learning_rate": 8.147830814312225e-09, "loss": 0.0702, "num_input_tokens_seen": 26484736, "step": 6466 }, { "epoch": 4.81892697466468, "grad_norm": 6.893552212339227, "learning_rate": 8.081254078724233e-09, "loss": 0.0604, "num_input_tokens_seen": 26488832, "step": 6467 }, { "epoch": 4.8196721311475414, "grad_norm": 6.814740998813151, "learning_rate": 8.01494958059354e-09, "loss": 0.0403, "num_input_tokens_seen": 26492928, "step": 6468 }, { "epoch": 4.820417287630402, "grad_norm": 7.128613593658054, "learning_rate": 7.948917334454216e-09, "loss": 0.0451, "num_input_tokens_seen": 26497024, "step": 6469 }, { "epoch": 4.821162444113264, "grad_norm": 7.935071174395464, "learning_rate": 7.883157354781484e-09, "loss": 0.082, "num_input_tokens_seen": 26501120, "step": 6470 }, { "epoch": 4.821907600596125, "grad_norm": 4.785936802336396, "learning_rate": 7.817669655990067e-09, "loss": 0.0539, "num_input_tokens_seen": 26505216, "step": 6471 }, { "epoch": 4.822652757078987, "grad_norm": 8.272124983382888, "learning_rate": 7.752454252435427e-09, "loss": 0.1748, "num_input_tokens_seen": 26509312, "step": 6472 }, { "epoch": 4.823397913561848, "grad_norm": 9.818234887112219, "learning_rate": 7.687511158413346e-09, "loss": 0.076, "num_input_tokens_seen": 26513408, "step": 6473 }, { "epoch": 4.824143070044709, "grad_norm": 8.524864995852095, "learning_rate": 7.622840388159525e-09, "loss": 0.1139, "num_input_tokens_seen": 26517504, "step": 6474 }, { "epoch": 4.8248882265275705, "grad_norm": 7.146885249596168, "learning_rate": 7.558441955850677e-09, "loss": 0.1338, "num_input_tokens_seen": 26521600, "step": 6475 }, { "epoch": 4.825633383010432, "grad_norm": 3.5400113013803107, "learning_rate": 7.494315875603008e-09, "loss": 0.0183, "num_input_tokens_seen": 26525696, "step": 6476 }, { "epoch": 4.826378539493294, "grad_norm": 6.980016180261393, "learning_rate": 7.4304621614737506e-09, "loss": 0.025, "num_input_tokens_seen": 26529792, "step": 6477 }, { "epoch": 4.827123695976155, "grad_norm": 8.735295345076374, "learning_rate": 7.36688082745976e-09, "loss": 0.2156, "num_input_tokens_seen": 26533888, "step": 6478 }, { "epoch": 4.827868852459017, "grad_norm": 2.3487938022814676, "learning_rate": 7.3035718874987785e-09, "loss": 0.0101, "num_input_tokens_seen": 26537984, "step": 6479 }, { "epoch": 4.828614008941877, "grad_norm": 3.0925635978241317, "learning_rate": 7.240535355468453e-09, "loss": 0.0168, "num_input_tokens_seen": 26542080, "step": 6480 }, { "epoch": 4.829359165424739, "grad_norm": 7.833880291123643, "learning_rate": 7.177771245186899e-09, "loss": 0.1092, "num_input_tokens_seen": 26546176, "step": 6481 }, { "epoch": 4.8301043219076005, "grad_norm": 4.628069401476548, "learning_rate": 7.115279570412415e-09, "loss": 0.0148, "num_input_tokens_seen": 26550272, "step": 6482 }, { "epoch": 4.830849478390462, "grad_norm": 4.053342485204808, "learning_rate": 7.053060344843626e-09, "loss": 0.0284, "num_input_tokens_seen": 26554368, "step": 6483 }, { "epoch": 4.831594634873324, "grad_norm": 9.770958785535965, "learning_rate": 6.991113582119485e-09, "loss": 0.1465, "num_input_tokens_seen": 26558464, "step": 6484 }, { "epoch": 4.832339791356185, "grad_norm": 4.368777016720172, "learning_rate": 6.9294392958191295e-09, "loss": 0.0231, "num_input_tokens_seen": 26562560, "step": 6485 }, { "epoch": 4.833084947839046, "grad_norm": 4.603131442562198, "learning_rate": 6.868037499461883e-09, "loss": 0.0331, "num_input_tokens_seen": 26566656, "step": 6486 }, { "epoch": 4.833830104321907, "grad_norm": 5.38037209113475, "learning_rate": 6.806908206507673e-09, "loss": 0.0483, "num_input_tokens_seen": 26570752, "step": 6487 }, { "epoch": 4.834575260804769, "grad_norm": 5.913181663535407, "learning_rate": 6.746051430356476e-09, "loss": 0.0423, "num_input_tokens_seen": 26574848, "step": 6488 }, { "epoch": 4.8353204172876305, "grad_norm": 1.8208402859920658, "learning_rate": 6.685467184348177e-09, "loss": 0.0068, "num_input_tokens_seen": 26578944, "step": 6489 }, { "epoch": 4.836065573770492, "grad_norm": 3.9290295227570327, "learning_rate": 6.625155481763679e-09, "loss": 0.0227, "num_input_tokens_seen": 26583040, "step": 6490 }, { "epoch": 4.836810730253353, "grad_norm": 7.442000534713569, "learning_rate": 6.565116335823518e-09, "loss": 0.0206, "num_input_tokens_seen": 26587136, "step": 6491 }, { "epoch": 4.837555886736214, "grad_norm": 4.148164109738878, "learning_rate": 6.5053497596889725e-09, "loss": 0.0189, "num_input_tokens_seen": 26591232, "step": 6492 }, { "epoch": 4.838301043219076, "grad_norm": 5.748103996016314, "learning_rate": 6.445855766460951e-09, "loss": 0.044, "num_input_tokens_seen": 26595328, "step": 6493 }, { "epoch": 4.839046199701937, "grad_norm": 4.843069671421125, "learning_rate": 6.386634369181105e-09, "loss": 0.0453, "num_input_tokens_seen": 26599424, "step": 6494 }, { "epoch": 4.839791356184799, "grad_norm": 7.18162120358337, "learning_rate": 6.327685580831272e-09, "loss": 0.0667, "num_input_tokens_seen": 26603520, "step": 6495 }, { "epoch": 4.8405365126676605, "grad_norm": 7.479312360722364, "learning_rate": 6.269009414333199e-09, "loss": 0.0969, "num_input_tokens_seen": 26607616, "step": 6496 }, { "epoch": 4.841281669150522, "grad_norm": 9.341552676664454, "learning_rate": 6.210605882549514e-09, "loss": 0.1699, "num_input_tokens_seen": 26611712, "step": 6497 }, { "epoch": 4.842026825633383, "grad_norm": 1.749184769000296, "learning_rate": 6.1524749982823374e-09, "loss": 0.0063, "num_input_tokens_seen": 26615808, "step": 6498 }, { "epoch": 4.842771982116244, "grad_norm": 8.750278624333035, "learning_rate": 6.094616774274531e-09, "loss": 0.1183, "num_input_tokens_seen": 26619904, "step": 6499 }, { "epoch": 4.843517138599106, "grad_norm": 4.740075432349126, "learning_rate": 6.037031223209005e-09, "loss": 0.0567, "num_input_tokens_seen": 26624000, "step": 6500 }, { "epoch": 4.844262295081967, "grad_norm": 6.877630830278919, "learning_rate": 5.9797183577089966e-09, "loss": 0.1015, "num_input_tokens_seen": 26628096, "step": 6501 }, { "epoch": 4.845007451564829, "grad_norm": 5.263033754545052, "learning_rate": 5.922678190337788e-09, "loss": 0.0495, "num_input_tokens_seen": 26632192, "step": 6502 }, { "epoch": 4.84575260804769, "grad_norm": 4.058798349037374, "learning_rate": 5.8659107335989894e-09, "loss": 0.0289, "num_input_tokens_seen": 26636288, "step": 6503 }, { "epoch": 4.846497764530551, "grad_norm": 9.707102509947747, "learning_rate": 5.809415999936535e-09, "loss": 0.1957, "num_input_tokens_seen": 26640384, "step": 6504 }, { "epoch": 4.847242921013413, "grad_norm": 6.137183746019271, "learning_rate": 5.7531940017342695e-09, "loss": 0.0827, "num_input_tokens_seen": 26644480, "step": 6505 }, { "epoch": 4.847988077496274, "grad_norm": 5.39176999153157, "learning_rate": 5.697244751316638e-09, "loss": 0.0407, "num_input_tokens_seen": 26648576, "step": 6506 }, { "epoch": 4.848733233979136, "grad_norm": 4.90947246042371, "learning_rate": 5.641568260948138e-09, "loss": 0.0718, "num_input_tokens_seen": 26652672, "step": 6507 }, { "epoch": 4.849478390461997, "grad_norm": 7.285011300311324, "learning_rate": 5.586164542833311e-09, "loss": 0.1371, "num_input_tokens_seen": 26656768, "step": 6508 }, { "epoch": 4.850223546944859, "grad_norm": 6.604244719638546, "learning_rate": 5.531033609117303e-09, "loss": 0.055, "num_input_tokens_seen": 26660864, "step": 6509 }, { "epoch": 4.85096870342772, "grad_norm": 8.112681510492987, "learning_rate": 5.476175471884892e-09, "loss": 0.0956, "num_input_tokens_seen": 26664960, "step": 6510 }, { "epoch": 4.851713859910581, "grad_norm": 7.923245235415574, "learning_rate": 5.421590143161459e-09, "loss": 0.1845, "num_input_tokens_seen": 26669056, "step": 6511 }, { "epoch": 4.852459016393443, "grad_norm": 6.906542772315902, "learning_rate": 5.367277634912571e-09, "loss": 0.0579, "num_input_tokens_seen": 26673152, "step": 6512 }, { "epoch": 4.853204172876304, "grad_norm": 5.689054937610274, "learning_rate": 5.313237959043982e-09, "loss": 0.0702, "num_input_tokens_seen": 26677248, "step": 6513 }, { "epoch": 4.853949329359166, "grad_norm": 4.05517228616554, "learning_rate": 5.259471127401494e-09, "loss": 0.0199, "num_input_tokens_seen": 26681344, "step": 6514 }, { "epoch": 4.8546944858420265, "grad_norm": 6.495180416263361, "learning_rate": 5.205977151771097e-09, "loss": 0.0865, "num_input_tokens_seen": 26685440, "step": 6515 }, { "epoch": 4.855439642324888, "grad_norm": 6.810652152037693, "learning_rate": 5.152756043879242e-09, "loss": 0.1002, "num_input_tokens_seen": 26689536, "step": 6516 }, { "epoch": 4.85618479880775, "grad_norm": 6.212713732785331, "learning_rate": 5.099807815392155e-09, "loss": 0.0769, "num_input_tokens_seen": 26693632, "step": 6517 }, { "epoch": 4.856929955290611, "grad_norm": 5.532550756495624, "learning_rate": 5.047132477916522e-09, "loss": 0.0785, "num_input_tokens_seen": 26697728, "step": 6518 }, { "epoch": 4.857675111773473, "grad_norm": 4.428528101462707, "learning_rate": 4.994730042999357e-09, "loss": 0.0379, "num_input_tokens_seen": 26701824, "step": 6519 }, { "epoch": 4.858420268256334, "grad_norm": 9.924860705833877, "learning_rate": 4.9426005221274445e-09, "loss": 0.1983, "num_input_tokens_seen": 26705920, "step": 6520 }, { "epoch": 4.859165424739195, "grad_norm": 7.458026833873728, "learning_rate": 4.890743926728031e-09, "loss": 0.0939, "num_input_tokens_seen": 26710016, "step": 6521 }, { "epoch": 4.859910581222056, "grad_norm": 5.049452548645276, "learning_rate": 4.8391602681684135e-09, "loss": 0.0383, "num_input_tokens_seen": 26714112, "step": 6522 }, { "epoch": 4.860655737704918, "grad_norm": 9.022049347963172, "learning_rate": 4.787849557756074e-09, "loss": 0.1287, "num_input_tokens_seen": 26718208, "step": 6523 }, { "epoch": 4.8614008941877795, "grad_norm": 1.5853712536933868, "learning_rate": 4.736811806738684e-09, "loss": 0.0048, "num_input_tokens_seen": 26722304, "step": 6524 }, { "epoch": 4.862146050670641, "grad_norm": 7.8125640975376145, "learning_rate": 4.686047026304097e-09, "loss": 0.1364, "num_input_tokens_seen": 26726400, "step": 6525 }, { "epoch": 4.862891207153503, "grad_norm": 3.6861856767434844, "learning_rate": 4.635555227580496e-09, "loss": 0.0141, "num_input_tokens_seen": 26730496, "step": 6526 }, { "epoch": 4.863636363636363, "grad_norm": 6.094987229175185, "learning_rate": 4.585336421635833e-09, "loss": 0.0886, "num_input_tokens_seen": 26734592, "step": 6527 }, { "epoch": 4.864381520119225, "grad_norm": 6.497035472973434, "learning_rate": 4.5353906194783856e-09, "loss": 0.0588, "num_input_tokens_seen": 26738688, "step": 6528 }, { "epoch": 4.865126676602086, "grad_norm": 6.04996523016689, "learning_rate": 4.485717832056896e-09, "loss": 0.0531, "num_input_tokens_seen": 26742784, "step": 6529 }, { "epoch": 4.865871833084948, "grad_norm": 6.349498646196391, "learning_rate": 4.436318070259598e-09, "loss": 0.0637, "num_input_tokens_seen": 26746880, "step": 6530 }, { "epoch": 4.8666169895678095, "grad_norm": 4.988264771954606, "learning_rate": 4.387191344915609e-09, "loss": 0.0408, "num_input_tokens_seen": 26750976, "step": 6531 }, { "epoch": 4.86736214605067, "grad_norm": 4.556421011500816, "learning_rate": 4.338337666793813e-09, "loss": 0.0353, "num_input_tokens_seen": 26755072, "step": 6532 }, { "epoch": 4.868107302533532, "grad_norm": 6.3035361055169385, "learning_rate": 4.289757046603143e-09, "loss": 0.0244, "num_input_tokens_seen": 26759168, "step": 6533 }, { "epoch": 4.868852459016393, "grad_norm": 7.78556018208889, "learning_rate": 4.241449494992861e-09, "loss": 0.0965, "num_input_tokens_seen": 26763264, "step": 6534 }, { "epoch": 4.869597615499255, "grad_norm": 7.696563169008471, "learning_rate": 4.193415022552411e-09, "loss": 0.1235, "num_input_tokens_seen": 26767360, "step": 6535 }, { "epoch": 4.870342771982116, "grad_norm": 6.154042985636066, "learning_rate": 4.145653639811287e-09, "loss": 0.1071, "num_input_tokens_seen": 26771456, "step": 6536 }, { "epoch": 4.871087928464978, "grad_norm": 1.6215383706948685, "learning_rate": 4.098165357239031e-09, "loss": 0.0082, "num_input_tokens_seen": 26775552, "step": 6537 }, { "epoch": 4.8718330849478395, "grad_norm": 6.882183641212364, "learning_rate": 4.050950185245512e-09, "loss": 0.0633, "num_input_tokens_seen": 26779648, "step": 6538 }, { "epoch": 4.8725782414307, "grad_norm": 1.3363287435924467, "learning_rate": 4.004008134180504e-09, "loss": 0.0039, "num_input_tokens_seen": 26783744, "step": 6539 }, { "epoch": 4.873323397913562, "grad_norm": 8.895164341622863, "learning_rate": 3.95733921433411e-09, "loss": 0.1162, "num_input_tokens_seen": 26787840, "step": 6540 }, { "epoch": 4.874068554396423, "grad_norm": 4.3814172014959185, "learning_rate": 3.910943435936482e-09, "loss": 0.0384, "num_input_tokens_seen": 26791936, "step": 6541 }, { "epoch": 4.874813710879285, "grad_norm": 7.639436506849162, "learning_rate": 3.864820809158093e-09, "loss": 0.0299, "num_input_tokens_seen": 26796032, "step": 6542 }, { "epoch": 4.875558867362146, "grad_norm": 6.754578395082958, "learning_rate": 3.818971344109051e-09, "loss": 0.0964, "num_input_tokens_seen": 26800128, "step": 6543 }, { "epoch": 4.876304023845007, "grad_norm": 7.015815422902942, "learning_rate": 3.7733950508400676e-09, "loss": 0.0665, "num_input_tokens_seen": 26804224, "step": 6544 }, { "epoch": 4.877049180327869, "grad_norm": 7.208713708027775, "learning_rate": 3.728091939341621e-09, "loss": 0.0743, "num_input_tokens_seen": 26808320, "step": 6545 }, { "epoch": 4.87779433681073, "grad_norm": 7.034426297512482, "learning_rate": 3.6830620195447973e-09, "loss": 0.089, "num_input_tokens_seen": 26812416, "step": 6546 }, { "epoch": 4.878539493293592, "grad_norm": 5.856676556772932, "learning_rate": 3.6383053013201718e-09, "loss": 0.0658, "num_input_tokens_seen": 26816512, "step": 6547 }, { "epoch": 4.879284649776453, "grad_norm": 6.518484568057987, "learning_rate": 3.5938217944787858e-09, "loss": 0.1307, "num_input_tokens_seen": 26820608, "step": 6548 }, { "epoch": 4.880029806259315, "grad_norm": 3.216104313510233, "learning_rate": 3.5496115087718663e-09, "loss": 0.0191, "num_input_tokens_seen": 26824704, "step": 6549 }, { "epoch": 4.8807749627421755, "grad_norm": 6.454882142722366, "learning_rate": 3.5056744538905507e-09, "loss": 0.0675, "num_input_tokens_seen": 26828800, "step": 6550 }, { "epoch": 4.881520119225037, "grad_norm": 7.6361355350615945, "learning_rate": 3.4620106394661625e-09, "loss": 0.1672, "num_input_tokens_seen": 26832896, "step": 6551 }, { "epoch": 4.882265275707899, "grad_norm": 8.308245741571103, "learning_rate": 3.4186200750700725e-09, "loss": 0.1124, "num_input_tokens_seen": 26836992, "step": 6552 }, { "epoch": 4.88301043219076, "grad_norm": 7.739754154845915, "learning_rate": 3.3755027702138397e-09, "loss": 0.1443, "num_input_tokens_seen": 26841088, "step": 6553 }, { "epoch": 4.883755588673622, "grad_norm": 7.720502935741907, "learning_rate": 3.332658734349209e-09, "loss": 0.1008, "num_input_tokens_seen": 26845184, "step": 6554 }, { "epoch": 4.884500745156483, "grad_norm": 8.852950583960592, "learning_rate": 3.290087976867695e-09, "loss": 0.1778, "num_input_tokens_seen": 26849280, "step": 6555 }, { "epoch": 4.885245901639344, "grad_norm": 6.081705303084347, "learning_rate": 3.2477905071012783e-09, "loss": 0.0655, "num_input_tokens_seen": 26853376, "step": 6556 }, { "epoch": 4.8859910581222055, "grad_norm": 6.555026921543984, "learning_rate": 3.2057663343217083e-09, "loss": 0.1138, "num_input_tokens_seen": 26857472, "step": 6557 }, { "epoch": 4.886736214605067, "grad_norm": 8.68329652273969, "learning_rate": 3.1640154677410606e-09, "loss": 0.1768, "num_input_tokens_seen": 26861568, "step": 6558 }, { "epoch": 4.887481371087929, "grad_norm": 3.5279492931707095, "learning_rate": 3.122537916511459e-09, "loss": 0.0259, "num_input_tokens_seen": 26865664, "step": 6559 }, { "epoch": 4.88822652757079, "grad_norm": 7.530545901014678, "learning_rate": 3.0813336897250744e-09, "loss": 0.101, "num_input_tokens_seen": 26869760, "step": 6560 }, { "epoch": 4.888971684053651, "grad_norm": 5.427533992301583, "learning_rate": 3.040402796413988e-09, "loss": 0.0707, "num_input_tokens_seen": 26873856, "step": 6561 }, { "epoch": 4.889716840536512, "grad_norm": 4.352527059250404, "learning_rate": 2.9997452455508836e-09, "loss": 0.0515, "num_input_tokens_seen": 26877952, "step": 6562 }, { "epoch": 4.890461997019374, "grad_norm": 5.655408009243198, "learning_rate": 2.9593610460479373e-09, "loss": 0.0818, "num_input_tokens_seen": 26882048, "step": 6563 }, { "epoch": 4.8912071535022354, "grad_norm": 4.3495918211680955, "learning_rate": 2.919250206757651e-09, "loss": 0.0383, "num_input_tokens_seen": 26886144, "step": 6564 }, { "epoch": 4.891952309985097, "grad_norm": 6.66255553379007, "learning_rate": 2.8794127364727144e-09, "loss": 0.0859, "num_input_tokens_seen": 26890240, "step": 6565 }, { "epoch": 4.8926974664679586, "grad_norm": 2.8004398753376423, "learning_rate": 2.8398486439257243e-09, "loss": 0.015, "num_input_tokens_seen": 26894336, "step": 6566 }, { "epoch": 4.89344262295082, "grad_norm": 7.407561889478452, "learning_rate": 2.8005579377894665e-09, "loss": 0.05, "num_input_tokens_seen": 26898432, "step": 6567 }, { "epoch": 4.894187779433681, "grad_norm": 10.139758629549148, "learning_rate": 2.761540626676773e-09, "loss": 0.0294, "num_input_tokens_seen": 26902528, "step": 6568 }, { "epoch": 4.894932935916542, "grad_norm": 7.085866691199901, "learning_rate": 2.722796719140386e-09, "loss": 0.0793, "num_input_tokens_seen": 26906624, "step": 6569 }, { "epoch": 4.895678092399404, "grad_norm": 6.0880495228653775, "learning_rate": 2.6843262236732337e-09, "loss": 0.0695, "num_input_tokens_seen": 26910720, "step": 6570 }, { "epoch": 4.896423248882265, "grad_norm": 1.2507128407315595, "learning_rate": 2.6461291487085705e-09, "loss": 0.0029, "num_input_tokens_seen": 26914816, "step": 6571 }, { "epoch": 4.897168405365127, "grad_norm": 7.141632937319198, "learning_rate": 2.608205502619143e-09, "loss": 0.129, "num_input_tokens_seen": 26918912, "step": 6572 }, { "epoch": 4.897913561847988, "grad_norm": 6.981582043058791, "learning_rate": 2.570555293718302e-09, "loss": 0.1059, "num_input_tokens_seen": 26923008, "step": 6573 }, { "epoch": 4.898658718330849, "grad_norm": 7.661489631897924, "learning_rate": 2.533178530259306e-09, "loss": 0.0741, "num_input_tokens_seen": 26927104, "step": 6574 }, { "epoch": 4.899403874813711, "grad_norm": 7.894955640894793, "learning_rate": 2.496075220435046e-09, "loss": 0.0947, "num_input_tokens_seen": 26931200, "step": 6575 }, { "epoch": 4.900149031296572, "grad_norm": 7.13127802713857, "learning_rate": 2.4592453723792932e-09, "loss": 0.066, "num_input_tokens_seen": 26935296, "step": 6576 }, { "epoch": 4.900894187779434, "grad_norm": 6.85833584780761, "learning_rate": 2.4226889941650346e-09, "loss": 0.0993, "num_input_tokens_seen": 26939392, "step": 6577 }, { "epoch": 4.901639344262295, "grad_norm": 7.808882157031708, "learning_rate": 2.3864060938058596e-09, "loss": 0.0817, "num_input_tokens_seen": 26943488, "step": 6578 }, { "epoch": 4.902384500745157, "grad_norm": 0.7419629289559639, "learning_rate": 2.350396679255268e-09, "loss": 0.0019, "num_input_tokens_seen": 26947584, "step": 6579 }, { "epoch": 4.903129657228018, "grad_norm": 11.216736988176795, "learning_rate": 2.314660758406806e-09, "loss": 0.1118, "num_input_tokens_seen": 26951680, "step": 6580 }, { "epoch": 4.903874813710879, "grad_norm": 3.590312579472587, "learning_rate": 2.2791983390939297e-09, "loss": 0.0211, "num_input_tokens_seen": 26955776, "step": 6581 }, { "epoch": 4.904619970193741, "grad_norm": 7.714752363106456, "learning_rate": 2.2440094290902826e-09, "loss": 0.1401, "num_input_tokens_seen": 26959872, "step": 6582 }, { "epoch": 4.905365126676602, "grad_norm": 6.178326660550141, "learning_rate": 2.209094036109416e-09, "loss": 0.0536, "num_input_tokens_seen": 26963968, "step": 6583 }, { "epoch": 4.906110283159464, "grad_norm": 5.722058357691178, "learning_rate": 2.1744521678053463e-09, "loss": 0.0534, "num_input_tokens_seen": 26968064, "step": 6584 }, { "epoch": 4.9068554396423245, "grad_norm": 6.607150757836772, "learning_rate": 2.1400838317715833e-09, "loss": 0.0416, "num_input_tokens_seen": 26972160, "step": 6585 }, { "epoch": 4.907600596125186, "grad_norm": 8.286551726273421, "learning_rate": 2.105989035541961e-09, "loss": 0.0893, "num_input_tokens_seen": 26976256, "step": 6586 }, { "epoch": 4.908345752608048, "grad_norm": 4.0381932304252075, "learning_rate": 2.0721677865902236e-09, "loss": 0.0376, "num_input_tokens_seen": 26980352, "step": 6587 }, { "epoch": 4.909090909090909, "grad_norm": 7.301788709722488, "learning_rate": 2.03862009233044e-09, "loss": 0.0777, "num_input_tokens_seen": 26984448, "step": 6588 }, { "epoch": 4.909836065573771, "grad_norm": 12.665810165526059, "learning_rate": 2.0053459601163116e-09, "loss": 0.0603, "num_input_tokens_seen": 26988544, "step": 6589 }, { "epoch": 4.910581222056632, "grad_norm": 7.5287200163245975, "learning_rate": 1.972345397241726e-09, "loss": 0.0567, "num_input_tokens_seen": 26992640, "step": 6590 }, { "epoch": 4.911326378539493, "grad_norm": 6.3859457948434475, "learning_rate": 1.9396184109408966e-09, "loss": 0.0434, "num_input_tokens_seen": 26996736, "step": 6591 }, { "epoch": 4.9120715350223545, "grad_norm": 6.048303924511238, "learning_rate": 1.9071650083875293e-09, "loss": 0.07, "num_input_tokens_seen": 27000832, "step": 6592 }, { "epoch": 4.912816691505216, "grad_norm": 7.491757537696492, "learning_rate": 1.874985196695933e-09, "loss": 0.0266, "num_input_tokens_seen": 27004928, "step": 6593 }, { "epoch": 4.913561847988078, "grad_norm": 8.068007287282725, "learning_rate": 1.8430789829199104e-09, "loss": 0.075, "num_input_tokens_seen": 27009024, "step": 6594 }, { "epoch": 4.914307004470939, "grad_norm": 7.349104329270055, "learning_rate": 1.8114463740535882e-09, "loss": 0.0373, "num_input_tokens_seen": 27013120, "step": 6595 }, { "epoch": 4.915052160953801, "grad_norm": 10.322912371694803, "learning_rate": 1.7800873770311422e-09, "loss": 0.0664, "num_input_tokens_seen": 27017216, "step": 6596 }, { "epoch": 4.915797317436661, "grad_norm": 6.631497465217566, "learning_rate": 1.7490019987265183e-09, "loss": 0.0738, "num_input_tokens_seen": 27021312, "step": 6597 }, { "epoch": 4.916542473919523, "grad_norm": 9.393641784872074, "learning_rate": 1.7181902459539878e-09, "loss": 0.1897, "num_input_tokens_seen": 27025408, "step": 6598 }, { "epoch": 4.9172876304023845, "grad_norm": 5.515244067866556, "learning_rate": 1.6876521254677315e-09, "loss": 0.0353, "num_input_tokens_seen": 27029504, "step": 6599 }, { "epoch": 4.918032786885246, "grad_norm": 7.668156809192891, "learning_rate": 1.6573876439618387e-09, "loss": 0.1283, "num_input_tokens_seen": 27033600, "step": 6600 }, { "epoch": 4.918777943368108, "grad_norm": 8.446624725985975, "learning_rate": 1.627396808070586e-09, "loss": 0.0889, "num_input_tokens_seen": 27037696, "step": 6601 }, { "epoch": 4.919523099850968, "grad_norm": 7.137751306563495, "learning_rate": 1.5976796243681592e-09, "loss": 0.1263, "num_input_tokens_seen": 27041792, "step": 6602 }, { "epoch": 4.92026825633383, "grad_norm": 6.4143939599730055, "learning_rate": 1.568236099368653e-09, "loss": 0.0994, "num_input_tokens_seen": 27045888, "step": 6603 }, { "epoch": 4.921013412816691, "grad_norm": 5.224679600137935, "learning_rate": 1.5390662395264876e-09, "loss": 0.0473, "num_input_tokens_seen": 27049984, "step": 6604 }, { "epoch": 4.921758569299553, "grad_norm": 10.091261674480258, "learning_rate": 1.5101700512357154e-09, "loss": 0.1375, "num_input_tokens_seen": 27054080, "step": 6605 }, { "epoch": 4.9225037257824145, "grad_norm": 5.43903407364057, "learning_rate": 1.4815475408307134e-09, "loss": 0.0312, "num_input_tokens_seen": 27058176, "step": 6606 }, { "epoch": 4.923248882265276, "grad_norm": 8.89428911098806, "learning_rate": 1.4531987145857685e-09, "loss": 0.0878, "num_input_tokens_seen": 27062272, "step": 6607 }, { "epoch": 4.923994038748138, "grad_norm": 7.54996949602131, "learning_rate": 1.4251235787150764e-09, "loss": 0.1139, "num_input_tokens_seen": 27066368, "step": 6608 }, { "epoch": 4.924739195230998, "grad_norm": 7.4228678469514024, "learning_rate": 1.3973221393728808e-09, "loss": 0.1193, "num_input_tokens_seen": 27070464, "step": 6609 }, { "epoch": 4.92548435171386, "grad_norm": 5.5058212565934594, "learning_rate": 1.3697944026534736e-09, "loss": 0.0663, "num_input_tokens_seen": 27074560, "step": 6610 }, { "epoch": 4.926229508196721, "grad_norm": 7.62536061718668, "learning_rate": 1.3425403745911946e-09, "loss": 0.1527, "num_input_tokens_seen": 27078656, "step": 6611 }, { "epoch": 4.926974664679583, "grad_norm": 2.5636410473232862, "learning_rate": 1.3155600611601537e-09, "loss": 0.0095, "num_input_tokens_seen": 27082752, "step": 6612 }, { "epoch": 4.9277198211624444, "grad_norm": 8.241368747863337, "learning_rate": 1.2888534682749255e-09, "loss": 0.1544, "num_input_tokens_seen": 27086848, "step": 6613 }, { "epoch": 4.928464977645305, "grad_norm": 6.696062766425335, "learning_rate": 1.262420601789438e-09, "loss": 0.0809, "num_input_tokens_seen": 27090944, "step": 6614 }, { "epoch": 4.929210134128167, "grad_norm": 4.469847979280142, "learning_rate": 1.2362614674982232e-09, "loss": 0.0468, "num_input_tokens_seen": 27095040, "step": 6615 }, { "epoch": 4.929955290611028, "grad_norm": 6.295522672764394, "learning_rate": 1.2103760711355827e-09, "loss": 0.1265, "num_input_tokens_seen": 27099136, "step": 6616 }, { "epoch": 4.93070044709389, "grad_norm": 5.166921489650104, "learning_rate": 1.1847644183754504e-09, "loss": 0.0646, "num_input_tokens_seen": 27103232, "step": 6617 }, { "epoch": 4.931445603576751, "grad_norm": 8.420553784726721, "learning_rate": 1.1594265148325012e-09, "loss": 0.1306, "num_input_tokens_seen": 27107328, "step": 6618 }, { "epoch": 4.932190760059613, "grad_norm": 4.628047324726222, "learning_rate": 1.1343623660606262e-09, "loss": 0.0293, "num_input_tokens_seen": 27111424, "step": 6619 }, { "epoch": 4.9329359165424735, "grad_norm": 21.924374106183723, "learning_rate": 1.1095719775541802e-09, "loss": 0.0201, "num_input_tokens_seen": 27115520, "step": 6620 }, { "epoch": 4.933681073025335, "grad_norm": 8.730341719013172, "learning_rate": 1.0850553547475663e-09, "loss": 0.1, "num_input_tokens_seen": 27119616, "step": 6621 }, { "epoch": 4.934426229508197, "grad_norm": 3.1245411058740453, "learning_rate": 1.0608125030148187e-09, "loss": 0.0123, "num_input_tokens_seen": 27123712, "step": 6622 }, { "epoch": 4.935171385991058, "grad_norm": 4.556737363124128, "learning_rate": 1.0368434276702976e-09, "loss": 0.0251, "num_input_tokens_seen": 27127808, "step": 6623 }, { "epoch": 4.93591654247392, "grad_norm": 8.04019127151619, "learning_rate": 1.0131481339678563e-09, "loss": 0.1068, "num_input_tokens_seen": 27131904, "step": 6624 }, { "epoch": 4.936661698956781, "grad_norm": 2.2549479090570173, "learning_rate": 9.897266271020889e-10, "loss": 0.0093, "num_input_tokens_seen": 27136000, "step": 6625 }, { "epoch": 4.937406855439642, "grad_norm": 6.715320784940699, "learning_rate": 9.665789122069446e-10, "loss": 0.0625, "num_input_tokens_seen": 27140096, "step": 6626 }, { "epoch": 4.9381520119225035, "grad_norm": 4.708804495087724, "learning_rate": 9.437049943565591e-10, "loss": 0.025, "num_input_tokens_seen": 27144192, "step": 6627 }, { "epoch": 4.938897168405365, "grad_norm": 6.256826786023728, "learning_rate": 9.211048785651155e-10, "loss": 0.0468, "num_input_tokens_seen": 27148288, "step": 6628 }, { "epoch": 4.939642324888227, "grad_norm": 6.078925181133203, "learning_rate": 8.987785697867068e-10, "loss": 0.0663, "num_input_tokens_seen": 27152384, "step": 6629 }, { "epoch": 4.940387481371088, "grad_norm": 10.86122323751273, "learning_rate": 8.767260729154737e-10, "loss": 0.1424, "num_input_tokens_seen": 27156480, "step": 6630 }, { "epoch": 4.941132637853949, "grad_norm": 8.172941892847696, "learning_rate": 8.549473927853269e-10, "loss": 0.1371, "num_input_tokens_seen": 27160576, "step": 6631 }, { "epoch": 4.94187779433681, "grad_norm": 8.60838349639049, "learning_rate": 8.334425341703645e-10, "loss": 0.16, "num_input_tokens_seen": 27164672, "step": 6632 }, { "epoch": 4.942622950819672, "grad_norm": 8.847870971446937, "learning_rate": 8.122115017847321e-10, "loss": 0.046, "num_input_tokens_seen": 27168768, "step": 6633 }, { "epoch": 4.9433681073025335, "grad_norm": 4.480389706788075, "learning_rate": 7.912543002823459e-10, "loss": 0.0633, "num_input_tokens_seen": 27172864, "step": 6634 }, { "epoch": 4.944113263785395, "grad_norm": 8.457456142880092, "learning_rate": 7.705709342571699e-10, "loss": 0.1017, "num_input_tokens_seen": 27176960, "step": 6635 }, { "epoch": 4.944858420268257, "grad_norm": 4.911180691425264, "learning_rate": 7.501614082430775e-10, "loss": 0.0234, "num_input_tokens_seen": 27181056, "step": 6636 }, { "epoch": 4.945603576751118, "grad_norm": 4.154502620679433, "learning_rate": 7.300257267141286e-10, "loss": 0.0257, "num_input_tokens_seen": 27185152, "step": 6637 }, { "epoch": 4.946348733233979, "grad_norm": 5.20581580418063, "learning_rate": 7.10163894084015e-10, "loss": 0.0492, "num_input_tokens_seen": 27189248, "step": 6638 }, { "epoch": 4.94709388971684, "grad_norm": 9.672626611140618, "learning_rate": 6.905759147067536e-10, "loss": 0.102, "num_input_tokens_seen": 27193344, "step": 6639 }, { "epoch": 4.947839046199702, "grad_norm": 4.958489731953977, "learning_rate": 6.712617928761322e-10, "loss": 0.0343, "num_input_tokens_seen": 27197440, "step": 6640 }, { "epoch": 4.9485842026825635, "grad_norm": 7.371461032603844, "learning_rate": 6.522215328259862e-10, "loss": 0.0558, "num_input_tokens_seen": 27201536, "step": 6641 }, { "epoch": 4.949329359165425, "grad_norm": 8.71855311623985, "learning_rate": 6.334551387299215e-10, "loss": 0.1552, "num_input_tokens_seen": 27205632, "step": 6642 }, { "epoch": 4.950074515648286, "grad_norm": 4.612547892105113, "learning_rate": 6.149626147018695e-10, "loss": 0.0394, "num_input_tokens_seen": 27209728, "step": 6643 }, { "epoch": 4.950819672131147, "grad_norm": 6.7202463802899, "learning_rate": 5.967439647952544e-10, "loss": 0.0737, "num_input_tokens_seen": 27213824, "step": 6644 }, { "epoch": 4.951564828614009, "grad_norm": 6.549746850023893, "learning_rate": 5.787991930039649e-10, "loss": 0.0899, "num_input_tokens_seen": 27217920, "step": 6645 }, { "epoch": 4.95230998509687, "grad_norm": 9.070742124369223, "learning_rate": 5.611283032616599e-10, "loss": 0.1603, "num_input_tokens_seen": 27222016, "step": 6646 }, { "epoch": 4.953055141579732, "grad_norm": 4.691287513690435, "learning_rate": 5.437312994417687e-10, "loss": 0.0208, "num_input_tokens_seen": 27226112, "step": 6647 }, { "epoch": 4.9538002980625935, "grad_norm": 5.805934239121477, "learning_rate": 5.266081853579075e-10, "loss": 0.0574, "num_input_tokens_seen": 27230208, "step": 6648 }, { "epoch": 4.954545454545455, "grad_norm": 8.577749274099764, "learning_rate": 5.097589647634627e-10, "loss": 0.0713, "num_input_tokens_seen": 27234304, "step": 6649 }, { "epoch": 4.955290611028316, "grad_norm": 7.0184223837002575, "learning_rate": 4.931836413521463e-10, "loss": 0.1415, "num_input_tokens_seen": 27238400, "step": 6650 }, { "epoch": 4.956035767511177, "grad_norm": 3.3182026046778197, "learning_rate": 4.768822187571631e-10, "loss": 0.0157, "num_input_tokens_seen": 27242496, "step": 6651 }, { "epoch": 4.956780923994039, "grad_norm": 6.006282849428107, "learning_rate": 4.608547005520436e-10, "loss": 0.0847, "num_input_tokens_seen": 27246592, "step": 6652 }, { "epoch": 4.9575260804769, "grad_norm": 9.026708778397934, "learning_rate": 4.451010902499497e-10, "loss": 0.1314, "num_input_tokens_seen": 27250688, "step": 6653 }, { "epoch": 4.958271236959762, "grad_norm": 2.0160794142215503, "learning_rate": 4.2962139130450777e-10, "loss": 0.0123, "num_input_tokens_seen": 27254784, "step": 6654 }, { "epoch": 4.959016393442623, "grad_norm": 6.137673859987925, "learning_rate": 4.14415607108698e-10, "loss": 0.1044, "num_input_tokens_seen": 27258880, "step": 6655 }, { "epoch": 4.959761549925484, "grad_norm": 6.811957206988344, "learning_rate": 3.994837409958263e-10, "loss": 0.023, "num_input_tokens_seen": 27262976, "step": 6656 }, { "epoch": 4.960506706408346, "grad_norm": 6.713910283524904, "learning_rate": 3.84825796238969e-10, "loss": 0.1072, "num_input_tokens_seen": 27267072, "step": 6657 }, { "epoch": 4.961251862891207, "grad_norm": 2.4921428285797784, "learning_rate": 3.704417760515278e-10, "loss": 0.0103, "num_input_tokens_seen": 27271168, "step": 6658 }, { "epoch": 4.961997019374069, "grad_norm": 4.332267508107988, "learning_rate": 3.563316835862585e-10, "loss": 0.0321, "num_input_tokens_seen": 27275264, "step": 6659 }, { "epoch": 4.96274217585693, "grad_norm": 4.37954519705336, "learning_rate": 3.424955219363812e-10, "loss": 0.0363, "num_input_tokens_seen": 27279360, "step": 6660 }, { "epoch": 4.963487332339791, "grad_norm": 5.925790231023641, "learning_rate": 3.289332941348866e-10, "loss": 0.0467, "num_input_tokens_seen": 27283456, "step": 6661 }, { "epoch": 4.9642324888226526, "grad_norm": 5.653100288069802, "learning_rate": 3.156450031545355e-10, "loss": 0.0568, "num_input_tokens_seen": 27287552, "step": 6662 }, { "epoch": 4.964977645305514, "grad_norm": 4.871699532387486, "learning_rate": 3.0263065190841436e-10, "loss": 0.0134, "num_input_tokens_seen": 27291648, "step": 6663 }, { "epoch": 4.965722801788376, "grad_norm": 10.174581274646055, "learning_rate": 2.8989024324938e-10, "loss": 0.1847, "num_input_tokens_seen": 27295744, "step": 6664 }, { "epoch": 4.966467958271237, "grad_norm": 4.394195123213764, "learning_rate": 2.774237799700596e-10, "loss": 0.0523, "num_input_tokens_seen": 27299840, "step": 6665 }, { "epoch": 4.967213114754099, "grad_norm": 4.894508894042131, "learning_rate": 2.652312648032673e-10, "loss": 0.0336, "num_input_tokens_seen": 27303936, "step": 6666 }, { "epoch": 4.967958271236959, "grad_norm": 7.169224053570064, "learning_rate": 2.5331270042172615e-10, "loss": 0.0643, "num_input_tokens_seen": 27308032, "step": 6667 }, { "epoch": 4.968703427719821, "grad_norm": 4.773573367532201, "learning_rate": 2.4166808943792977e-10, "loss": 0.0234, "num_input_tokens_seen": 27312128, "step": 6668 }, { "epoch": 4.9694485842026825, "grad_norm": 5.271080890064789, "learning_rate": 2.3029743440455832e-10, "loss": 0.0502, "num_input_tokens_seen": 27316224, "step": 6669 }, { "epoch": 4.970193740685544, "grad_norm": 6.21747699929893, "learning_rate": 2.1920073781420138e-10, "loss": 0.0307, "num_input_tokens_seen": 27320320, "step": 6670 }, { "epoch": 4.970938897168406, "grad_norm": 6.26649315011352, "learning_rate": 2.0837800209935755e-10, "loss": 0.0972, "num_input_tokens_seen": 27324416, "step": 6671 }, { "epoch": 4.971684053651266, "grad_norm": 8.034148446837115, "learning_rate": 1.978292296322959e-10, "loss": 0.1437, "num_input_tokens_seen": 27328512, "step": 6672 }, { "epoch": 4.972429210134128, "grad_norm": 7.010422220502038, "learning_rate": 1.8755442272533343e-10, "loss": 0.1426, "num_input_tokens_seen": 27332608, "step": 6673 }, { "epoch": 4.973174366616989, "grad_norm": 7.494826928433703, "learning_rate": 1.7755358363097387e-10, "loss": 0.0695, "num_input_tokens_seen": 27336704, "step": 6674 }, { "epoch": 4.973919523099851, "grad_norm": 9.844751029466194, "learning_rate": 1.6782671454149136e-10, "loss": 0.0764, "num_input_tokens_seen": 27340800, "step": 6675 }, { "epoch": 4.9746646795827125, "grad_norm": 8.030864794621394, "learning_rate": 1.5837381758893043e-10, "loss": 0.1918, "num_input_tokens_seen": 27344896, "step": 6676 }, { "epoch": 4.975409836065574, "grad_norm": 7.327186864186019, "learning_rate": 1.491948948455224e-10, "loss": 0.0425, "num_input_tokens_seen": 27348992, "step": 6677 }, { "epoch": 4.976154992548436, "grad_norm": 5.113235136362796, "learning_rate": 1.4028994832326891e-10, "loss": 0.0295, "num_input_tokens_seen": 27353088, "step": 6678 }, { "epoch": 4.976900149031296, "grad_norm": 5.077917396625967, "learning_rate": 1.3165897997421962e-10, "loss": 0.0466, "num_input_tokens_seen": 27357184, "step": 6679 }, { "epoch": 4.977645305514158, "grad_norm": 4.416687905197512, "learning_rate": 1.2330199169047208e-10, "loss": 0.0476, "num_input_tokens_seen": 27361280, "step": 6680 }, { "epoch": 4.978390461997019, "grad_norm": 6.053404763420186, "learning_rate": 1.152189853038943e-10, "loss": 0.0233, "num_input_tokens_seen": 27365376, "step": 6681 }, { "epoch": 4.979135618479881, "grad_norm": 5.376934232248088, "learning_rate": 1.0740996258612468e-10, "loss": 0.0282, "num_input_tokens_seen": 27369472, "step": 6682 }, { "epoch": 4.9798807749627425, "grad_norm": 9.740912439123123, "learning_rate": 9.98749252492659e-11, "loss": 0.0423, "num_input_tokens_seen": 27373568, "step": 6683 }, { "epoch": 4.980625931445603, "grad_norm": 7.128727987729973, "learning_rate": 9.261387494477469e-11, "loss": 0.0664, "num_input_tokens_seen": 27377664, "step": 6684 }, { "epoch": 4.981371087928465, "grad_norm": 5.352343874747858, "learning_rate": 8.562681326457211e-11, "loss": 0.0587, "num_input_tokens_seen": 27381760, "step": 6685 }, { "epoch": 4.982116244411326, "grad_norm": 6.251606563085021, "learning_rate": 7.891374174007204e-11, "loss": 0.0185, "num_input_tokens_seen": 27385856, "step": 6686 }, { "epoch": 4.982861400894188, "grad_norm": 7.279950636549851, "learning_rate": 7.247466184301389e-11, "loss": 0.0857, "num_input_tokens_seen": 27389952, "step": 6687 }, { "epoch": 4.983606557377049, "grad_norm": 10.114303022356685, "learning_rate": 6.630957498476864e-11, "loss": 0.2016, "num_input_tokens_seen": 27394048, "step": 6688 }, { "epoch": 4.984351713859911, "grad_norm": 5.442969496355281, "learning_rate": 6.04184825167553e-11, "loss": 0.0376, "num_input_tokens_seen": 27398144, "step": 6689 }, { "epoch": 4.985096870342772, "grad_norm": 4.45805702421612, "learning_rate": 5.4801385730302026e-11, "loss": 0.0251, "num_input_tokens_seen": 27402240, "step": 6690 }, { "epoch": 4.985842026825633, "grad_norm": 8.923553716795663, "learning_rate": 4.9458285856923696e-11, "loss": 0.1344, "num_input_tokens_seen": 27406336, "step": 6691 }, { "epoch": 4.986587183308495, "grad_norm": 7.952757009521268, "learning_rate": 4.4389184067628046e-11, "loss": 0.1312, "num_input_tokens_seen": 27410432, "step": 6692 }, { "epoch": 4.987332339791356, "grad_norm": 9.029867563747286, "learning_rate": 3.959408147374833e-11, "loss": 0.1498, "num_input_tokens_seen": 27414528, "step": 6693 }, { "epoch": 4.988077496274218, "grad_norm": 4.628325297681111, "learning_rate": 3.5072979126388185e-11, "loss": 0.033, "num_input_tokens_seen": 27418624, "step": 6694 }, { "epoch": 4.988822652757079, "grad_norm": 4.694736343910737, "learning_rate": 3.082587801656045e-11, "loss": 0.0266, "num_input_tokens_seen": 27422720, "step": 6695 }, { "epoch": 4.98956780923994, "grad_norm": 5.365305275427354, "learning_rate": 2.685277907518713e-11, "loss": 0.0233, "num_input_tokens_seen": 27426816, "step": 6696 }, { "epoch": 4.990312965722802, "grad_norm": 6.729100673330931, "learning_rate": 2.315368317337696e-11, "loss": 0.0505, "num_input_tokens_seen": 27430912, "step": 6697 }, { "epoch": 4.991058122205663, "grad_norm": 7.888260245454251, "learning_rate": 1.972859112200909e-11, "loss": 0.201, "num_input_tokens_seen": 27435008, "step": 6698 }, { "epoch": 4.991803278688525, "grad_norm": 6.9838777128216325, "learning_rate": 1.6577503671594275e-11, "loss": 0.0866, "num_input_tokens_seen": 27439104, "step": 6699 }, { "epoch": 4.992548435171386, "grad_norm": 5.904305831503523, "learning_rate": 1.3700421513246354e-11, "loss": 0.0877, "num_input_tokens_seen": 27443200, "step": 6700 }, { "epoch": 4.993293591654247, "grad_norm": 7.190244516608327, "learning_rate": 1.1097345277294446e-11, "loss": 0.1189, "num_input_tokens_seen": 27447296, "step": 6701 }, { "epoch": 4.9940387481371085, "grad_norm": 8.000186262937724, "learning_rate": 8.768275534670744e-12, "loss": 0.1817, "num_input_tokens_seen": 27451392, "step": 6702 }, { "epoch": 4.99478390461997, "grad_norm": 7.838007875085807, "learning_rate": 6.713212795661506e-12, "loss": 0.0773, "num_input_tokens_seen": 27455488, "step": 6703 }, { "epoch": 4.995529061102832, "grad_norm": 8.209283176124648, "learning_rate": 4.932157511017277e-12, "loss": 0.1651, "num_input_tokens_seen": 27459584, "step": 6704 }, { "epoch": 4.996274217585693, "grad_norm": 5.895330357899958, "learning_rate": 3.4251100708426744e-12, "loss": 0.0579, "num_input_tokens_seen": 27463680, "step": 6705 }, { "epoch": 4.997019374068555, "grad_norm": 6.79494211652655, "learning_rate": 2.192070805706603e-12, "loss": 0.2015, "num_input_tokens_seen": 27467776, "step": 6706 }, { "epoch": 4.997764530551416, "grad_norm": 7.774070492813983, "learning_rate": 1.2330399859483699e-12, "loss": 0.1377, "num_input_tokens_seen": 27471872, "step": 6707 }, { "epoch": 4.998509687034277, "grad_norm": 7.0919926673624465, "learning_rate": 5.480178215389042e-13, "loss": 0.0779, "num_input_tokens_seen": 27475968, "step": 6708 }, { "epoch": 4.9992548435171384, "grad_norm": 7.346484131593334, "learning_rate": 1.3700446291342594e-13, "loss": 0.0729, "num_input_tokens_seen": 27480064, "step": 6709 }, { "epoch": 5.0, "grad_norm": 7.552789007626611, "learning_rate": 0.0, "loss": 0.1431, "num_input_tokens_seen": 27484160, "step": 6710 }, { "epoch": 5.0, "num_input_tokens_seen": 27484160, "step": 6710, "total_flos": 43904296550400.0, "train_loss": 0.5583460686376336, "train_runtime": 5822.2036, "train_samples_per_second": 4.609, "train_steps_per_second": 1.152 } ], "logging_steps": 1, "max_steps": 6710, "num_input_tokens_seen": 27484160, "num_train_epochs": 5, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 43904296550400.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }