{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.6268656716417915, "eval_steps": 500, "global_step": 620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007462686567164179, "grad_norm": 1.8659797972008254, "learning_rate": 0.0, "loss": 0.7986637353897095, "num_tokens": 940199.0, "step": 1 }, { "epoch": 0.014925373134328358, "grad_norm": 2.067470583174153, "learning_rate": 5.2631578947368416e-08, "loss": 0.8278242945671082, "num_tokens": 1940958.0, "step": 2 }, { "epoch": 0.022388059701492536, "grad_norm": 1.8137442955270493, "learning_rate": 1.0526315789473683e-07, "loss": 0.7961194515228271, "num_tokens": 2857380.0, "step": 3 }, { "epoch": 0.029850746268656716, "grad_norm": 1.8752542747158565, "learning_rate": 1.5789473684210525e-07, "loss": 0.7988173961639404, "num_tokens": 3696403.0, "step": 4 }, { "epoch": 0.03731343283582089, "grad_norm": 1.9383021105435863, "learning_rate": 2.1052631578947366e-07, "loss": 0.8283753395080566, "num_tokens": 4528235.0, "step": 5 }, { "epoch": 0.04477611940298507, "grad_norm": 1.9512483400491862, "learning_rate": 2.631578947368421e-07, "loss": 0.8254790902137756, "num_tokens": 5554672.0, "step": 6 }, { "epoch": 0.05223880597014925, "grad_norm": 1.8813572711890862, "learning_rate": 3.157894736842105e-07, "loss": 0.8230706453323364, "num_tokens": 6423132.0, "step": 7 }, { "epoch": 0.05970149253731343, "grad_norm": 1.9448895375733437, "learning_rate": 3.684210526315789e-07, "loss": 0.8051227331161499, "num_tokens": 7201644.0, "step": 8 }, { "epoch": 0.06716417910447761, "grad_norm": 1.8922387607703381, "learning_rate": 4.2105263157894733e-07, "loss": 0.7542356252670288, "num_tokens": 8128715.0, "step": 9 }, { "epoch": 0.07462686567164178, "grad_norm": 1.9842025446380493, "learning_rate": 4.7368421052631574e-07, "loss": 0.8522481918334961, "num_tokens": 9074027.0, "step": 10 }, { "epoch": 0.08208955223880597, "grad_norm": 1.7964968435378388, "learning_rate": 5.263157894736842e-07, "loss": 0.7835813760757446, "num_tokens": 9950641.0, "step": 11 }, { "epoch": 0.08955223880597014, "grad_norm": 1.8759196623765004, "learning_rate": 5.789473684210526e-07, "loss": 0.8266638517379761, "num_tokens": 10885057.0, "step": 12 }, { "epoch": 0.09701492537313433, "grad_norm": 1.739475404373344, "learning_rate": 6.31578947368421e-07, "loss": 0.8051838874816895, "num_tokens": 11697963.0, "step": 13 }, { "epoch": 0.1044776119402985, "grad_norm": 1.6356821751063044, "learning_rate": 6.842105263157895e-07, "loss": 0.7847919464111328, "num_tokens": 12632602.0, "step": 14 }, { "epoch": 0.11194029850746269, "grad_norm": 1.6314037268606378, "learning_rate": 7.368421052631578e-07, "loss": 0.775245189666748, "num_tokens": 13568889.0, "step": 15 }, { "epoch": 0.11940298507462686, "grad_norm": 1.6105524213156879, "learning_rate": 7.894736842105263e-07, "loss": 0.8013657331466675, "num_tokens": 14534242.0, "step": 16 }, { "epoch": 0.12686567164179105, "grad_norm": 1.5779316833603265, "learning_rate": 8.421052631578947e-07, "loss": 0.80766761302948, "num_tokens": 15435946.0, "step": 17 }, { "epoch": 0.13432835820895522, "grad_norm": 1.5079351322768313, "learning_rate": 8.947368421052631e-07, "loss": 0.7658109664916992, "num_tokens": 16352267.0, "step": 18 }, { "epoch": 0.1417910447761194, "grad_norm": 1.320308981062678, "learning_rate": 9.473684210526315e-07, "loss": 0.7778770327568054, "num_tokens": 17277422.0, "step": 19 }, { "epoch": 0.14925373134328357, "grad_norm": 1.4101803404332138, "learning_rate": 1e-06, "loss": 0.8158027529716492, "num_tokens": 18270697.0, "step": 20 }, { "epoch": 0.15671641791044777, "grad_norm": 1.3401873553364563, "learning_rate": 9.999938520216342e-07, "loss": 0.7766833305358887, "num_tokens": 19308555.0, "step": 21 }, { "epoch": 0.16417910447761194, "grad_norm": 1.27445266639497, "learning_rate": 9.999754082545259e-07, "loss": 0.7421952486038208, "num_tokens": 20162797.0, "step": 22 }, { "epoch": 0.17164179104477612, "grad_norm": 1.3912932427121052, "learning_rate": 9.999446692026396e-07, "loss": 0.7800503969192505, "num_tokens": 20981106.0, "step": 23 }, { "epoch": 0.1791044776119403, "grad_norm": 1.2963982894236503, "learning_rate": 9.999016357058995e-07, "loss": 0.766775369644165, "num_tokens": 21858000.0, "step": 24 }, { "epoch": 0.1865671641791045, "grad_norm": 1.140051976839368, "learning_rate": 9.998463089401678e-07, "loss": 0.7179380059242249, "num_tokens": 22793285.0, "step": 25 }, { "epoch": 0.19402985074626866, "grad_norm": 1.1651471102131281, "learning_rate": 9.997786904172126e-07, "loss": 0.810413122177124, "num_tokens": 23723801.0, "step": 26 }, { "epoch": 0.20149253731343283, "grad_norm": 1.0407108116745094, "learning_rate": 9.996987819846655e-07, "loss": 0.7446407079696655, "num_tokens": 24725740.0, "step": 27 }, { "epoch": 0.208955223880597, "grad_norm": 0.9942554431409824, "learning_rate": 9.996065858259727e-07, "loss": 0.7915131449699402, "num_tokens": 25730725.0, "step": 28 }, { "epoch": 0.21641791044776118, "grad_norm": 1.0234340889029334, "learning_rate": 9.995021044603342e-07, "loss": 0.7581333518028259, "num_tokens": 26557776.0, "step": 29 }, { "epoch": 0.22388059701492538, "grad_norm": 1.0332669286266374, "learning_rate": 9.993853407426352e-07, "loss": 0.7365682125091553, "num_tokens": 27504251.0, "step": 30 }, { "epoch": 0.23134328358208955, "grad_norm": 1.0634534407808287, "learning_rate": 9.99256297863368e-07, "loss": 0.7191506624221802, "num_tokens": 28534541.0, "step": 31 }, { "epoch": 0.23880597014925373, "grad_norm": 1.1818384387181422, "learning_rate": 9.99114979348545e-07, "loss": 0.7689279317855835, "num_tokens": 29341502.0, "step": 32 }, { "epoch": 0.2462686567164179, "grad_norm": 1.0143867857533606, "learning_rate": 9.989613890596033e-07, "loss": 0.7768257856369019, "num_tokens": 30211822.0, "step": 33 }, { "epoch": 0.2537313432835821, "grad_norm": 0.9912608655279589, "learning_rate": 9.987955311932968e-07, "loss": 0.7552160024642944, "num_tokens": 31102775.0, "step": 34 }, { "epoch": 0.26119402985074625, "grad_norm": 0.859671875219598, "learning_rate": 9.986174102815837e-07, "loss": 0.7417880892753601, "num_tokens": 31898227.0, "step": 35 }, { "epoch": 0.26865671641791045, "grad_norm": 0.8467136154890645, "learning_rate": 9.984270311915018e-07, "loss": 0.7220484614372253, "num_tokens": 32541892.0, "step": 36 }, { "epoch": 0.27611940298507465, "grad_norm": 0.968520072067917, "learning_rate": 9.982243991250357e-07, "loss": 0.7436271905899048, "num_tokens": 33543040.0, "step": 37 }, { "epoch": 0.2835820895522388, "grad_norm": 0.7117412176092366, "learning_rate": 9.980095196189748e-07, "loss": 0.7281963229179382, "num_tokens": 34505224.0, "step": 38 }, { "epoch": 0.291044776119403, "grad_norm": 0.7282443922942368, "learning_rate": 9.977823985447613e-07, "loss": 0.7709681987762451, "num_tokens": 35411826.0, "step": 39 }, { "epoch": 0.29850746268656714, "grad_norm": 0.8081380042496161, "learning_rate": 9.975430421083305e-07, "loss": 0.761425256729126, "num_tokens": 36307345.0, "step": 40 }, { "epoch": 0.30597014925373134, "grad_norm": 0.8423198034792783, "learning_rate": 9.972914568499411e-07, "loss": 0.7312315106391907, "num_tokens": 37196875.0, "step": 41 }, { "epoch": 0.31343283582089554, "grad_norm": 0.8739486151254683, "learning_rate": 9.970276496439966e-07, "loss": 0.7070371508598328, "num_tokens": 38112193.0, "step": 42 }, { "epoch": 0.3208955223880597, "grad_norm": 0.8222714239323922, "learning_rate": 9.967516276988567e-07, "loss": 0.7004337310791016, "num_tokens": 38855918.0, "step": 43 }, { "epoch": 0.3283582089552239, "grad_norm": 0.6714287856764727, "learning_rate": 9.964633985566412e-07, "loss": 0.7193351984024048, "num_tokens": 39833215.0, "step": 44 }, { "epoch": 0.3358208955223881, "grad_norm": 0.6336781599517638, "learning_rate": 9.961629700930235e-07, "loss": 0.7344927787780762, "num_tokens": 40760145.0, "step": 45 }, { "epoch": 0.34328358208955223, "grad_norm": 0.669269067988794, "learning_rate": 9.958503505170155e-07, "loss": 0.7277801632881165, "num_tokens": 41745749.0, "step": 46 }, { "epoch": 0.35074626865671643, "grad_norm": 0.7317041279195701, "learning_rate": 9.95525548370744e-07, "loss": 0.7005234956741333, "num_tokens": 42686630.0, "step": 47 }, { "epoch": 0.3582089552238806, "grad_norm": 0.7721535484599115, "learning_rate": 9.95188572529215e-07, "loss": 0.7193202376365662, "num_tokens": 43510586.0, "step": 48 }, { "epoch": 0.3656716417910448, "grad_norm": 0.7041253506550509, "learning_rate": 9.948394322000746e-07, "loss": 0.6881219744682312, "num_tokens": 44362248.0, "step": 49 }, { "epoch": 0.373134328358209, "grad_norm": 0.636240363667009, "learning_rate": 9.944781369233543e-07, "loss": 0.6522014141082764, "num_tokens": 45216722.0, "step": 50 }, { "epoch": 0.3805970149253731, "grad_norm": 0.6272094853443818, "learning_rate": 9.941046965712122e-07, "loss": 0.6842180490493774, "num_tokens": 46010142.0, "step": 51 }, { "epoch": 0.3880597014925373, "grad_norm": 0.5645071953625206, "learning_rate": 9.937191213476625e-07, "loss": 0.6692793369293213, "num_tokens": 46858670.0, "step": 52 }, { "epoch": 0.39552238805970147, "grad_norm": 0.585837329746578, "learning_rate": 9.933214217882971e-07, "loss": 0.7204340696334839, "num_tokens": 47836905.0, "step": 53 }, { "epoch": 0.40298507462686567, "grad_norm": 0.5938197808350745, "learning_rate": 9.929116087599972e-07, "loss": 0.7186766266822815, "num_tokens": 48836237.0, "step": 54 }, { "epoch": 0.41044776119402987, "grad_norm": 0.5264192209085824, "learning_rate": 9.924896934606364e-07, "loss": 0.7225839495658875, "num_tokens": 49860153.0, "step": 55 }, { "epoch": 0.417910447761194, "grad_norm": 0.5025994710035754, "learning_rate": 9.920556874187757e-07, "loss": 0.6711542010307312, "num_tokens": 50786110.0, "step": 56 }, { "epoch": 0.4253731343283582, "grad_norm": 0.43381197403497257, "learning_rate": 9.91609602493347e-07, "loss": 0.6544876098632812, "num_tokens": 51790390.0, "step": 57 }, { "epoch": 0.43283582089552236, "grad_norm": 0.43233023140164006, "learning_rate": 9.911514508733306e-07, "loss": 0.7029759883880615, "num_tokens": 52742397.0, "step": 58 }, { "epoch": 0.44029850746268656, "grad_norm": 0.43910093495037306, "learning_rate": 9.906812450774207e-07, "loss": 0.7200834155082703, "num_tokens": 53673114.0, "step": 59 }, { "epoch": 0.44776119402985076, "grad_norm": 0.44127209489849284, "learning_rate": 9.90198997953684e-07, "loss": 0.6370296478271484, "num_tokens": 54566889.0, "step": 60 }, { "epoch": 0.4552238805970149, "grad_norm": 0.45096698239872907, "learning_rate": 9.89704722679209e-07, "loss": 0.6861921548843384, "num_tokens": 55460491.0, "step": 61 }, { "epoch": 0.4626865671641791, "grad_norm": 0.4231809219141582, "learning_rate": 9.89198432759746e-07, "loss": 0.6846483945846558, "num_tokens": 56520990.0, "step": 62 }, { "epoch": 0.4701492537313433, "grad_norm": 0.4911103394390683, "learning_rate": 9.886801420293363e-07, "loss": 0.6806150674819946, "num_tokens": 57422206.0, "step": 63 }, { "epoch": 0.47761194029850745, "grad_norm": 0.4523240789905204, "learning_rate": 9.881498646499368e-07, "loss": 0.7077186107635498, "num_tokens": 58260720.0, "step": 64 }, { "epoch": 0.48507462686567165, "grad_norm": 0.41521834156751597, "learning_rate": 9.876076151110313e-07, "loss": 0.696556806564331, "num_tokens": 59123617.0, "step": 65 }, { "epoch": 0.4925373134328358, "grad_norm": 0.44090049904740325, "learning_rate": 9.870534082292349e-07, "loss": 0.6695712804794312, "num_tokens": 60033505.0, "step": 66 }, { "epoch": 0.5, "grad_norm": 0.3986636896953578, "learning_rate": 9.864872591478893e-07, "loss": 0.6385202407836914, "num_tokens": 60974452.0, "step": 67 }, { "epoch": 0.5074626865671642, "grad_norm": 0.36309454275310177, "learning_rate": 9.859091833366496e-07, "loss": 0.6627390384674072, "num_tokens": 61913977.0, "step": 68 }, { "epoch": 0.5149253731343284, "grad_norm": 0.39014943112878603, "learning_rate": 9.853191965910605e-07, "loss": 0.6990819573402405, "num_tokens": 62800879.0, "step": 69 }, { "epoch": 0.5223880597014925, "grad_norm": 0.365989388161065, "learning_rate": 9.84717315032125e-07, "loss": 0.6741257309913635, "num_tokens": 63823183.0, "step": 70 }, { "epoch": 0.5298507462686567, "grad_norm": 0.367640955997548, "learning_rate": 9.841035551058648e-07, "loss": 0.657660722732544, "num_tokens": 64700087.0, "step": 71 }, { "epoch": 0.5373134328358209, "grad_norm": 0.38259984541818554, "learning_rate": 9.834779335828697e-07, "loss": 0.6268120408058167, "num_tokens": 65533415.0, "step": 72 }, { "epoch": 0.5447761194029851, "grad_norm": 0.4459381628745124, "learning_rate": 9.828404675578403e-07, "loss": 0.6166980266571045, "num_tokens": 66411589.0, "step": 73 }, { "epoch": 0.5522388059701493, "grad_norm": 0.3964961096159446, "learning_rate": 9.821911744491202e-07, "loss": 0.6667238473892212, "num_tokens": 67203675.0, "step": 74 }, { "epoch": 0.5597014925373134, "grad_norm": 0.3942585728115673, "learning_rate": 9.815300719982202e-07, "loss": 0.6620233058929443, "num_tokens": 68056574.0, "step": 75 }, { "epoch": 0.5671641791044776, "grad_norm": 0.35812083783608406, "learning_rate": 9.808571782693343e-07, "loss": 0.6339540481567383, "num_tokens": 68907426.0, "step": 76 }, { "epoch": 0.5746268656716418, "grad_norm": 0.3369082279659881, "learning_rate": 9.801725116488449e-07, "loss": 0.6345670819282532, "num_tokens": 69817179.0, "step": 77 }, { "epoch": 0.582089552238806, "grad_norm": 0.4132547384225159, "learning_rate": 9.794760908448213e-07, "loss": 0.6722534894943237, "num_tokens": 70803003.0, "step": 78 }, { "epoch": 0.5895522388059702, "grad_norm": 0.3627266662239884, "learning_rate": 9.78767934886508e-07, "loss": 0.6158405542373657, "num_tokens": 71708353.0, "step": 79 }, { "epoch": 0.5970149253731343, "grad_norm": 0.3555505789377954, "learning_rate": 9.78048063123805e-07, "loss": 0.6479436159133911, "num_tokens": 72587706.0, "step": 80 }, { "epoch": 0.6044776119402985, "grad_norm": 0.36801010623436975, "learning_rate": 9.773164952267392e-07, "loss": 0.6247404217720032, "num_tokens": 73605832.0, "step": 81 }, { "epoch": 0.6119402985074627, "grad_norm": 0.4986298126613671, "learning_rate": 9.765732511849267e-07, "loss": 0.6385573744773865, "num_tokens": 74512496.0, "step": 82 }, { "epoch": 0.6194029850746269, "grad_norm": 0.3500792677565209, "learning_rate": 9.758183513070266e-07, "loss": 0.6781474351882935, "num_tokens": 75428481.0, "step": 83 }, { "epoch": 0.6268656716417911, "grad_norm": 0.39093330569733553, "learning_rate": 9.750518162201857e-07, "loss": 0.6386494636535645, "num_tokens": 76292706.0, "step": 84 }, { "epoch": 0.6343283582089553, "grad_norm": 0.4070936194536431, "learning_rate": 9.742736668694758e-07, "loss": 0.6180363893508911, "num_tokens": 77209633.0, "step": 85 }, { "epoch": 0.6417910447761194, "grad_norm": 0.3354352334817167, "learning_rate": 9.734839245173211e-07, "loss": 0.6163570880889893, "num_tokens": 78063420.0, "step": 86 }, { "epoch": 0.6492537313432836, "grad_norm": 0.45211158839386173, "learning_rate": 9.726826107429168e-07, "loss": 0.6268313527107239, "num_tokens": 78870397.0, "step": 87 }, { "epoch": 0.6567164179104478, "grad_norm": 0.31941448751910695, "learning_rate": 9.718697474416388e-07, "loss": 0.6327146291732788, "num_tokens": 79871666.0, "step": 88 }, { "epoch": 0.664179104477612, "grad_norm": 0.3722172382477918, "learning_rate": 9.71045356824448e-07, "loss": 0.6110676527023315, "num_tokens": 80712206.0, "step": 89 }, { "epoch": 0.6716417910447762, "grad_norm": 0.3465587426296506, "learning_rate": 9.7020946141728e-07, "loss": 0.5954413414001465, "num_tokens": 81538216.0, "step": 90 }, { "epoch": 0.6791044776119403, "grad_norm": 0.30862586857277025, "learning_rate": 9.693620840604325e-07, "loss": 0.6106799840927124, "num_tokens": 82585839.0, "step": 91 }, { "epoch": 0.6865671641791045, "grad_norm": 0.34576661664754443, "learning_rate": 9.685032479079392e-07, "loss": 0.6184056997299194, "num_tokens": 83427449.0, "step": 92 }, { "epoch": 0.6940298507462687, "grad_norm": 0.30968028932263886, "learning_rate": 9.676329764269383e-07, "loss": 0.6404486894607544, "num_tokens": 84449388.0, "step": 93 }, { "epoch": 0.7014925373134329, "grad_norm": 0.3132280475142172, "learning_rate": 9.667512933970313e-07, "loss": 0.6027534008026123, "num_tokens": 85256514.0, "step": 94 }, { "epoch": 0.7089552238805971, "grad_norm": 0.37723622679137114, "learning_rate": 9.658582229096319e-07, "loss": 0.636467695236206, "num_tokens": 86165960.0, "step": 95 }, { "epoch": 0.7164179104477612, "grad_norm": 0.3415033431478265, "learning_rate": 9.649537893673095e-07, "loss": 0.6198180317878723, "num_tokens": 86982659.0, "step": 96 }, { "epoch": 0.7238805970149254, "grad_norm": 0.38663387244532404, "learning_rate": 9.640380174831209e-07, "loss": 0.6216307878494263, "num_tokens": 87931000.0, "step": 97 }, { "epoch": 0.7313432835820896, "grad_norm": 0.32790162197293926, "learning_rate": 9.631109322799361e-07, "loss": 0.6376453638076782, "num_tokens": 88689701.0, "step": 98 }, { "epoch": 0.7388059701492538, "grad_norm": 0.30426811432679324, "learning_rate": 9.621725590897543e-07, "loss": 0.6182718276977539, "num_tokens": 89547645.0, "step": 99 }, { "epoch": 0.746268656716418, "grad_norm": 0.3319556928745246, "learning_rate": 9.61222923553011e-07, "loss": 0.6192991733551025, "num_tokens": 90297517.0, "step": 100 }, { "epoch": 0.753731343283582, "grad_norm": 0.3063539981854288, "learning_rate": 9.602620516178788e-07, "loss": 0.6192951202392578, "num_tokens": 91186856.0, "step": 101 }, { "epoch": 0.7611940298507462, "grad_norm": 0.3353221957958766, "learning_rate": 9.592899695395568e-07, "loss": 0.6191784739494324, "num_tokens": 91987232.0, "step": 102 }, { "epoch": 0.7686567164179104, "grad_norm": 2.4726215379945016, "learning_rate": 9.583067038795544e-07, "loss": 0.6689252257347107, "num_tokens": 92898696.0, "step": 103 }, { "epoch": 0.7761194029850746, "grad_norm": 0.3683317561657326, "learning_rate": 9.57312281504965e-07, "loss": 0.6164358854293823, "num_tokens": 93791120.0, "step": 104 }, { "epoch": 0.7835820895522388, "grad_norm": 0.31142782603302166, "learning_rate": 9.563067295877318e-07, "loss": 0.5814804434776306, "num_tokens": 94639289.0, "step": 105 }, { "epoch": 0.7910447761194029, "grad_norm": 0.5777785271285245, "learning_rate": 9.552900756039056e-07, "loss": 0.6628624200820923, "num_tokens": 95400207.0, "step": 106 }, { "epoch": 0.7985074626865671, "grad_norm": 0.31405569146967405, "learning_rate": 9.54262347332894e-07, "loss": 0.647003710269928, "num_tokens": 96227104.0, "step": 107 }, { "epoch": 0.8059701492537313, "grad_norm": 0.3000528898105912, "learning_rate": 9.532235728567022e-07, "loss": 0.6015387177467346, "num_tokens": 97056588.0, "step": 108 }, { "epoch": 0.8134328358208955, "grad_norm": 0.3033934043892588, "learning_rate": 9.521737805591661e-07, "loss": 0.629927396774292, "num_tokens": 97944111.0, "step": 109 }, { "epoch": 0.8208955223880597, "grad_norm": 0.29345271513653554, "learning_rate": 9.511129991251755e-07, "loss": 0.5817909836769104, "num_tokens": 98816920.0, "step": 110 }, { "epoch": 0.8283582089552238, "grad_norm": 0.3148352988531031, "learning_rate": 9.500412575398922e-07, "loss": 0.6288615465164185, "num_tokens": 99773832.0, "step": 111 }, { "epoch": 0.835820895522388, "grad_norm": 0.3225302890726275, "learning_rate": 9.489585850879564e-07, "loss": 0.6282119750976562, "num_tokens": 100805832.0, "step": 112 }, { "epoch": 0.8432835820895522, "grad_norm": 0.3125344123235252, "learning_rate": 9.478650113526874e-07, "loss": 0.6161372661590576, "num_tokens": 101747939.0, "step": 113 }, { "epoch": 0.8507462686567164, "grad_norm": 0.3022490350310229, "learning_rate": 9.467605662152745e-07, "loss": 0.6462452411651611, "num_tokens": 102733715.0, "step": 114 }, { "epoch": 0.8582089552238806, "grad_norm": 0.29284676964407896, "learning_rate": 9.456452798539616e-07, "loss": 0.5786178112030029, "num_tokens": 103577969.0, "step": 115 }, { "epoch": 0.8656716417910447, "grad_norm": 0.31081858681339425, "learning_rate": 9.445191827432215e-07, "loss": 0.6079792380332947, "num_tokens": 104507837.0, "step": 116 }, { "epoch": 0.8731343283582089, "grad_norm": 0.305140531974358, "learning_rate": 9.433823056529241e-07, "loss": 0.6422327160835266, "num_tokens": 105482901.0, "step": 117 }, { "epoch": 0.8805970149253731, "grad_norm": 0.8470132754236691, "learning_rate": 9.422346796474949e-07, "loss": 0.6116156578063965, "num_tokens": 106441176.0, "step": 118 }, { "epoch": 0.8880597014925373, "grad_norm": 0.3062865057157625, "learning_rate": 9.410763360850665e-07, "loss": 0.6365537643432617, "num_tokens": 107265870.0, "step": 119 }, { "epoch": 0.8955223880597015, "grad_norm": 0.3112284745186021, "learning_rate": 9.399073066166217e-07, "loss": 0.6294253468513489, "num_tokens": 108146690.0, "step": 120 }, { "epoch": 0.9029850746268657, "grad_norm": 0.3489293940518192, "learning_rate": 9.38727623185129e-07, "loss": 0.6304266452789307, "num_tokens": 109034402.0, "step": 121 }, { "epoch": 0.9104477611940298, "grad_norm": 0.30735356689099647, "learning_rate": 9.375373180246696e-07, "loss": 0.6445657014846802, "num_tokens": 109989572.0, "step": 122 }, { "epoch": 0.917910447761194, "grad_norm": 0.30036163756207646, "learning_rate": 9.36336423659556e-07, "loss": 0.6575721502304077, "num_tokens": 110969334.0, "step": 123 }, { "epoch": 0.9253731343283582, "grad_norm": 0.3150563557302255, "learning_rate": 9.351249729034441e-07, "loss": 0.6350916624069214, "num_tokens": 111844990.0, "step": 124 }, { "epoch": 0.9328358208955224, "grad_norm": 0.42959935293984475, "learning_rate": 9.339029988584364e-07, "loss": 0.6106451153755188, "num_tokens": 112800888.0, "step": 125 }, { "epoch": 0.9402985074626866, "grad_norm": 0.295731574777322, "learning_rate": 9.326705349141772e-07, "loss": 0.6363998651504517, "num_tokens": 113857610.0, "step": 126 }, { "epoch": 0.9477611940298507, "grad_norm": 0.2886185648925166, "learning_rate": 9.314276147469408e-07, "loss": 0.6078730225563049, "num_tokens": 114800904.0, "step": 127 }, { "epoch": 0.9552238805970149, "grad_norm": 0.2920263616232179, "learning_rate": 9.301742723187104e-07, "loss": 0.6083230972290039, "num_tokens": 115759913.0, "step": 128 }, { "epoch": 0.9626865671641791, "grad_norm": 0.44738555848421335, "learning_rate": 9.289105418762512e-07, "loss": 0.6401327848434448, "num_tokens": 116624191.0, "step": 129 }, { "epoch": 0.9701492537313433, "grad_norm": 0.31658939252581325, "learning_rate": 9.276364579501741e-07, "loss": 0.6194320321083069, "num_tokens": 117499418.0, "step": 130 }, { "epoch": 0.9776119402985075, "grad_norm": 0.3550447231230164, "learning_rate": 9.263520553539919e-07, "loss": 0.5973168611526489, "num_tokens": 118329517.0, "step": 131 }, { "epoch": 0.9850746268656716, "grad_norm": 0.3461631737220481, "learning_rate": 9.250573691831686e-07, "loss": 0.6246321201324463, "num_tokens": 119221343.0, "step": 132 }, { "epoch": 0.9925373134328358, "grad_norm": 0.29903733473091415, "learning_rate": 9.237524348141599e-07, "loss": 0.6080079078674316, "num_tokens": 120123659.0, "step": 133 }, { "epoch": 1.0, "grad_norm": 0.30082599081337075, "learning_rate": 9.224372879034471e-07, "loss": 0.610882043838501, "num_tokens": 121054976.0, "step": 134 }, { "epoch": 1.007462686567164, "grad_norm": 0.3306994076014225, "learning_rate": 9.211119643865625e-07, "loss": 0.617473840713501, "num_tokens": 121995409.0, "step": 135 }, { "epoch": 1.0149253731343284, "grad_norm": 0.4118286344591086, "learning_rate": 9.197765004771074e-07, "loss": 0.6029432415962219, "num_tokens": 122823226.0, "step": 136 }, { "epoch": 1.0223880597014925, "grad_norm": 0.33422438069467975, "learning_rate": 9.184309326657625e-07, "loss": 0.5911135077476501, "num_tokens": 123660597.0, "step": 137 }, { "epoch": 1.0298507462686568, "grad_norm": 0.3051145695551814, "learning_rate": 9.17075297719292e-07, "loss": 0.5806124806404114, "num_tokens": 124554146.0, "step": 138 }, { "epoch": 1.037313432835821, "grad_norm": 0.32826641207537816, "learning_rate": 9.157096326795367e-07, "loss": 0.6078518629074097, "num_tokens": 125332236.0, "step": 139 }, { "epoch": 1.044776119402985, "grad_norm": 0.2711859183967077, "learning_rate": 9.143339748624042e-07, "loss": 0.5872972011566162, "num_tokens": 126310236.0, "step": 140 }, { "epoch": 1.0522388059701493, "grad_norm": 0.29219827505795287, "learning_rate": 9.129483618568477e-07, "loss": 0.5903403759002686, "num_tokens": 127218706.0, "step": 141 }, { "epoch": 1.0597014925373134, "grad_norm": 0.31682373789781537, "learning_rate": 9.115528315238396e-07, "loss": 0.6067441701889038, "num_tokens": 128057825.0, "step": 142 }, { "epoch": 1.0671641791044777, "grad_norm": 0.290526593989316, "learning_rate": 9.101474219953366e-07, "loss": 0.5909883975982666, "num_tokens": 128955736.0, "step": 143 }, { "epoch": 1.0746268656716418, "grad_norm": 0.33126907763025953, "learning_rate": 9.087321716732382e-07, "loss": 0.6024787425994873, "num_tokens": 129777788.0, "step": 144 }, { "epoch": 1.0820895522388059, "grad_norm": 0.2848827551783459, "learning_rate": 9.073071192283374e-07, "loss": 0.6014402508735657, "num_tokens": 130659960.0, "step": 145 }, { "epoch": 1.0895522388059702, "grad_norm": 0.32145826962059754, "learning_rate": 9.058723035992631e-07, "loss": 0.5986078977584839, "num_tokens": 131550221.0, "step": 146 }, { "epoch": 1.0970149253731343, "grad_norm": 0.3517188747706852, "learning_rate": 9.044277639914176e-07, "loss": 0.6086349487304688, "num_tokens": 132486469.0, "step": 147 }, { "epoch": 1.1044776119402986, "grad_norm": 0.2956143436262153, "learning_rate": 9.029735398759043e-07, "loss": 0.5634535551071167, "num_tokens": 133366950.0, "step": 148 }, { "epoch": 1.1119402985074627, "grad_norm": 0.3478882615131154, "learning_rate": 9.015096709884492e-07, "loss": 0.5996171236038208, "num_tokens": 134285043.0, "step": 149 }, { "epoch": 1.1194029850746268, "grad_norm": 0.2966927318472822, "learning_rate": 9.000361973283158e-07, "loss": 0.5650948882102966, "num_tokens": 135136228.0, "step": 150 }, { "epoch": 1.126865671641791, "grad_norm": 0.29045922223177134, "learning_rate": 8.985531591572115e-07, "loss": 0.583465576171875, "num_tokens": 136013129.0, "step": 151 }, { "epoch": 1.1343283582089552, "grad_norm": 0.29420507308387045, "learning_rate": 8.970605969981879e-07, "loss": 0.6276301741600037, "num_tokens": 136978716.0, "step": 152 }, { "epoch": 1.1417910447761195, "grad_norm": 0.28206954373320775, "learning_rate": 8.955585516345332e-07, "loss": 0.5884029865264893, "num_tokens": 137957110.0, "step": 153 }, { "epoch": 1.1492537313432836, "grad_norm": 0.2893761158835339, "learning_rate": 8.940470641086581e-07, "loss": 0.58906090259552, "num_tokens": 138894209.0, "step": 154 }, { "epoch": 1.1567164179104479, "grad_norm": 0.3060099897084387, "learning_rate": 8.925261757209743e-07, "loss": 0.6283571720123291, "num_tokens": 139925878.0, "step": 155 }, { "epoch": 1.164179104477612, "grad_norm": 0.3245985255415534, "learning_rate": 8.909959280287655e-07, "loss": 0.5938559770584106, "num_tokens": 140844266.0, "step": 156 }, { "epoch": 1.171641791044776, "grad_norm": 0.30881864003106135, "learning_rate": 8.894563628450532e-07, "loss": 0.5916883945465088, "num_tokens": 141685264.0, "step": 157 }, { "epoch": 1.1791044776119404, "grad_norm": 0.3596602053150199, "learning_rate": 8.879075222374521e-07, "loss": 0.563378095626831, "num_tokens": 142607439.0, "step": 158 }, { "epoch": 1.1865671641791045, "grad_norm": 0.29872513574720144, "learning_rate": 8.863494485270226e-07, "loss": 0.5588960647583008, "num_tokens": 143442522.0, "step": 159 }, { "epoch": 1.1940298507462686, "grad_norm": 0.35553422720166944, "learning_rate": 8.847821842871136e-07, "loss": 0.6027117967605591, "num_tokens": 144356683.0, "step": 160 }, { "epoch": 1.2014925373134329, "grad_norm": 0.35962639938666846, "learning_rate": 8.832057723421988e-07, "loss": 0.5953375101089478, "num_tokens": 145164747.0, "step": 161 }, { "epoch": 1.208955223880597, "grad_norm": 0.3320369471002194, "learning_rate": 8.816202557667075e-07, "loss": 0.5746063590049744, "num_tokens": 145974438.0, "step": 162 }, { "epoch": 1.2164179104477613, "grad_norm": 0.28069224857904806, "learning_rate": 8.800256778838467e-07, "loss": 0.5617422461509705, "num_tokens": 146897553.0, "step": 163 }, { "epoch": 1.2238805970149254, "grad_norm": 0.3248983801962701, "learning_rate": 8.784220822644178e-07, "loss": 0.6217033267021179, "num_tokens": 147706235.0, "step": 164 }, { "epoch": 1.2313432835820897, "grad_norm": 0.2887208674047822, "learning_rate": 8.768095127256261e-07, "loss": 0.5414159297943115, "num_tokens": 148638477.0, "step": 165 }, { "epoch": 1.2388059701492538, "grad_norm": 0.32438535835021254, "learning_rate": 8.751880133298834e-07, "loss": 0.5938442945480347, "num_tokens": 149598765.0, "step": 166 }, { "epoch": 1.2462686567164178, "grad_norm": 0.2894650573081879, "learning_rate": 8.735576283836037e-07, "loss": 0.6117956638336182, "num_tokens": 150499813.0, "step": 167 }, { "epoch": 1.2537313432835822, "grad_norm": 0.3397567950477019, "learning_rate": 8.719184024359934e-07, "loss": 0.5914928317070007, "num_tokens": 151406909.0, "step": 168 }, { "epoch": 1.2611940298507462, "grad_norm": 0.290993077313297, "learning_rate": 8.70270380277833e-07, "loss": 0.6332953572273254, "num_tokens": 152358615.0, "step": 169 }, { "epoch": 1.2686567164179103, "grad_norm": 0.30309197520313697, "learning_rate": 8.686136069402541e-07, "loss": 0.5448141694068909, "num_tokens": 153140245.0, "step": 170 }, { "epoch": 1.2761194029850746, "grad_norm": 0.2861452466873225, "learning_rate": 8.669481276935083e-07, "loss": 0.6137048602104187, "num_tokens": 154065403.0, "step": 171 }, { "epoch": 1.2835820895522387, "grad_norm": 0.2818788428038251, "learning_rate": 8.652739880457308e-07, "loss": 0.606778621673584, "num_tokens": 155004060.0, "step": 172 }, { "epoch": 1.291044776119403, "grad_norm": 0.28295852710705355, "learning_rate": 8.635912337416962e-07, "loss": 0.599794864654541, "num_tokens": 155894044.0, "step": 173 }, { "epoch": 1.2985074626865671, "grad_norm": 0.2842356600778382, "learning_rate": 8.618999107615693e-07, "loss": 0.5917081832885742, "num_tokens": 156891753.0, "step": 174 }, { "epoch": 1.3059701492537314, "grad_norm": 0.6073059901583661, "learning_rate": 8.602000653196483e-07, "loss": 0.5762104988098145, "num_tokens": 157781264.0, "step": 175 }, { "epoch": 1.3134328358208955, "grad_norm": 0.38500166455619816, "learning_rate": 8.58491743863102e-07, "loss": 0.5871777534484863, "num_tokens": 158729371.0, "step": 176 }, { "epoch": 1.3208955223880596, "grad_norm": 0.27823289078417784, "learning_rate": 8.567749930707011e-07, "loss": 0.605941653251648, "num_tokens": 159723929.0, "step": 177 }, { "epoch": 1.328358208955224, "grad_norm": 0.35213890439138806, "learning_rate": 8.55049859851542e-07, "loss": 0.6232315301895142, "num_tokens": 160655042.0, "step": 178 }, { "epoch": 1.335820895522388, "grad_norm": 0.3835692532749333, "learning_rate": 8.533163913437657e-07, "loss": 0.5706331729888916, "num_tokens": 161689806.0, "step": 179 }, { "epoch": 1.3432835820895521, "grad_norm": 0.30950866157021506, "learning_rate": 8.515746349132691e-07, "loss": 0.5692603588104248, "num_tokens": 162672971.0, "step": 180 }, { "epoch": 1.3507462686567164, "grad_norm": 0.2670593807735062, "learning_rate": 8.498246381524123e-07, "loss": 0.55814528465271, "num_tokens": 163606727.0, "step": 181 }, { "epoch": 1.3582089552238805, "grad_norm": 0.30949695351670486, "learning_rate": 8.480664488787156e-07, "loss": 0.5762124061584473, "num_tokens": 164379724.0, "step": 182 }, { "epoch": 1.3656716417910448, "grad_norm": 0.5924544887792298, "learning_rate": 8.463001151335554e-07, "loss": 0.588869035243988, "num_tokens": 165282114.0, "step": 183 }, { "epoch": 1.373134328358209, "grad_norm": 0.28762285578208174, "learning_rate": 8.445256851808503e-07, "loss": 0.5752467513084412, "num_tokens": 166184652.0, "step": 184 }, { "epoch": 1.3805970149253732, "grad_norm": 0.3298383363971737, "learning_rate": 8.427432075057421e-07, "loss": 0.5592284798622131, "num_tokens": 167131883.0, "step": 185 }, { "epoch": 1.3880597014925373, "grad_norm": 0.29907045507102953, "learning_rate": 8.409527308132717e-07, "loss": 0.6292506456375122, "num_tokens": 168105786.0, "step": 186 }, { "epoch": 1.3955223880597014, "grad_norm": 0.5372519710802263, "learning_rate": 8.391543040270477e-07, "loss": 0.5994750261306763, "num_tokens": 168981965.0, "step": 187 }, { "epoch": 1.4029850746268657, "grad_norm": 0.3108497777882688, "learning_rate": 8.373479762879102e-07, "loss": 0.5894546508789062, "num_tokens": 169813930.0, "step": 188 }, { "epoch": 1.4104477611940298, "grad_norm": 0.263215877705637, "learning_rate": 8.355337969525874e-07, "loss": 0.5457190871238708, "num_tokens": 170803921.0, "step": 189 }, { "epoch": 1.417910447761194, "grad_norm": 0.3065822581882927, "learning_rate": 8.337118155923472e-07, "loss": 0.5782487988471985, "num_tokens": 171568584.0, "step": 190 }, { "epoch": 1.4253731343283582, "grad_norm": 0.33076776216010273, "learning_rate": 8.318820819916432e-07, "loss": 0.5753518342971802, "num_tokens": 172302686.0, "step": 191 }, { "epoch": 1.4328358208955223, "grad_norm": 0.28171276781641486, "learning_rate": 8.300446461467532e-07, "loss": 0.6102815270423889, "num_tokens": 173251435.0, "step": 192 }, { "epoch": 1.4402985074626866, "grad_norm": 0.2941656823177544, "learning_rate": 8.281995582644144e-07, "loss": 0.5915931463241577, "num_tokens": 174154926.0, "step": 193 }, { "epoch": 1.4477611940298507, "grad_norm": 0.3065460235572143, "learning_rate": 8.263468687604508e-07, "loss": 0.6099899411201477, "num_tokens": 174968736.0, "step": 194 }, { "epoch": 1.455223880597015, "grad_norm": 0.27914137530301547, "learning_rate": 8.244866282583955e-07, "loss": 0.6181570291519165, "num_tokens": 175993671.0, "step": 195 }, { "epoch": 1.462686567164179, "grad_norm": 0.3083419332308097, "learning_rate": 8.226188875881081e-07, "loss": 0.5710784196853638, "num_tokens": 176965410.0, "step": 196 }, { "epoch": 1.4701492537313432, "grad_norm": 0.5452138442343578, "learning_rate": 8.20743697784385e-07, "loss": 0.6409458518028259, "num_tokens": 177894815.0, "step": 197 }, { "epoch": 1.4776119402985075, "grad_norm": 0.3312904222346608, "learning_rate": 8.188611100855654e-07, "loss": 0.5432331562042236, "num_tokens": 178840660.0, "step": 198 }, { "epoch": 1.4850746268656716, "grad_norm": 0.2774128783983067, "learning_rate": 8.169711759321317e-07, "loss": 0.5913591384887695, "num_tokens": 179786009.0, "step": 199 }, { "epoch": 1.4925373134328357, "grad_norm": 0.27538698518702026, "learning_rate": 8.150739469653026e-07, "loss": 0.5327359437942505, "num_tokens": 180680467.0, "step": 200 }, { "epoch": 1.5, "grad_norm": 0.2839849649378776, "learning_rate": 8.131694750256233e-07, "loss": 0.59873366355896, "num_tokens": 181598316.0, "step": 201 }, { "epoch": 1.5074626865671643, "grad_norm": 0.3083583005906764, "learning_rate": 8.112578121515484e-07, "loss": 0.5907875299453735, "num_tokens": 182458909.0, "step": 202 }, { "epoch": 1.5149253731343284, "grad_norm": 0.29048547641188055, "learning_rate": 8.0933901057802e-07, "loss": 0.581605076789856, "num_tokens": 183258199.0, "step": 203 }, { "epoch": 1.5223880597014925, "grad_norm": 0.33631459084645754, "learning_rate": 8.074131227350408e-07, "loss": 0.5947036743164062, "num_tokens": 184223376.0, "step": 204 }, { "epoch": 1.5298507462686568, "grad_norm": 0.9803280537963448, "learning_rate": 8.054802012462409e-07, "loss": 0.6088910102844238, "num_tokens": 185129043.0, "step": 205 }, { "epoch": 1.537313432835821, "grad_norm": 0.35614179487581604, "learning_rate": 8.035402989274402e-07, "loss": 0.595119833946228, "num_tokens": 186025421.0, "step": 206 }, { "epoch": 1.544776119402985, "grad_norm": 0.28518439705384824, "learning_rate": 8.015934687852052e-07, "loss": 0.5574674606323242, "num_tokens": 186963319.0, "step": 207 }, { "epoch": 1.5522388059701493, "grad_norm": 0.29270174201680466, "learning_rate": 7.99639764015401e-07, "loss": 0.6108373999595642, "num_tokens": 187973354.0, "step": 208 }, { "epoch": 1.5597014925373134, "grad_norm": 0.3323311285244367, "learning_rate": 7.976792380017372e-07, "loss": 0.5401036143302917, "num_tokens": 188705328.0, "step": 209 }, { "epoch": 1.5671641791044775, "grad_norm": 0.3456603920554097, "learning_rate": 7.957119443143093e-07, "loss": 0.6063162088394165, "num_tokens": 189538934.0, "step": 210 }, { "epoch": 1.5746268656716418, "grad_norm": 0.3082330500125609, "learning_rate": 7.937379367081354e-07, "loss": 0.5718963146209717, "num_tokens": 190336903.0, "step": 211 }, { "epoch": 1.582089552238806, "grad_norm": 0.30744552461466196, "learning_rate": 7.917572691216866e-07, "loss": 0.62088942527771, "num_tokens": 191168843.0, "step": 212 }, { "epoch": 1.5895522388059702, "grad_norm": 0.4103702199848832, "learning_rate": 7.897699956754142e-07, "loss": 0.5833892822265625, "num_tokens": 192111363.0, "step": 213 }, { "epoch": 1.5970149253731343, "grad_norm": 0.2908559364406994, "learning_rate": 7.877761706702697e-07, "loss": 0.5971975922584534, "num_tokens": 193103746.0, "step": 214 }, { "epoch": 1.6044776119402986, "grad_norm": 0.3328614806641497, "learning_rate": 7.857758485862219e-07, "loss": 0.5907278656959534, "num_tokens": 193981563.0, "step": 215 }, { "epoch": 1.6119402985074627, "grad_norm": 0.37832890065438735, "learning_rate": 7.837690840807686e-07, "loss": 0.6060609817504883, "num_tokens": 194834592.0, "step": 216 }, { "epoch": 1.6194029850746268, "grad_norm": 0.2677852995292723, "learning_rate": 7.817559319874417e-07, "loss": 0.5535368323326111, "num_tokens": 195762991.0, "step": 217 }, { "epoch": 1.626865671641791, "grad_norm": 0.2741102900904772, "learning_rate": 7.797364473143103e-07, "loss": 0.5808444619178772, "num_tokens": 196736861.0, "step": 218 }, { "epoch": 1.6343283582089554, "grad_norm": 0.2912625129099285, "learning_rate": 7.777106852424768e-07, "loss": 0.5897442102432251, "num_tokens": 197626724.0, "step": 219 }, { "epoch": 1.6417910447761193, "grad_norm": 0.591147632193599, "learning_rate": 7.756787011245699e-07, "loss": 0.6097444295883179, "num_tokens": 198637589.0, "step": 220 }, { "epoch": 1.6492537313432836, "grad_norm": 0.3296433282919186, "learning_rate": 7.736405504832313e-07, "loss": 0.6026604175567627, "num_tokens": 199563255.0, "step": 221 }, { "epoch": 1.6567164179104479, "grad_norm": 0.27997593841499196, "learning_rate": 7.715962890095987e-07, "loss": 0.5822043418884277, "num_tokens": 200461303.0, "step": 222 }, { "epoch": 1.664179104477612, "grad_norm": 0.3067564437403442, "learning_rate": 7.69545972561785e-07, "loss": 0.6166250705718994, "num_tokens": 201369977.0, "step": 223 }, { "epoch": 1.671641791044776, "grad_norm": 0.2688906931144112, "learning_rate": 7.674896571633506e-07, "loss": 0.5445988178253174, "num_tokens": 202278503.0, "step": 224 }, { "epoch": 1.6791044776119404, "grad_norm": 0.2691568834958204, "learning_rate": 7.65427399001774e-07, "loss": 0.5422626733779907, "num_tokens": 203242720.0, "step": 225 }, { "epoch": 1.6865671641791045, "grad_norm": 0.2906038986233365, "learning_rate": 7.633592544269152e-07, "loss": 0.5783904790878296, "num_tokens": 204150301.0, "step": 226 }, { "epoch": 1.6940298507462686, "grad_norm": 0.2887554770352252, "learning_rate": 7.612852799494769e-07, "loss": 0.588298499584198, "num_tokens": 205093558.0, "step": 227 }, { "epoch": 1.7014925373134329, "grad_norm": 0.2993654753244292, "learning_rate": 7.592055322394602e-07, "loss": 0.6025734543800354, "num_tokens": 205970210.0, "step": 228 }, { "epoch": 1.7089552238805972, "grad_norm": 0.284435640176723, "learning_rate": 7.571200681246158e-07, "loss": 0.6054296493530273, "num_tokens": 206859291.0, "step": 229 }, { "epoch": 1.716417910447761, "grad_norm": 0.3195285881506451, "learning_rate": 7.550289445888914e-07, "loss": 0.5874844789505005, "num_tokens": 207786446.0, "step": 230 }, { "epoch": 1.7238805970149254, "grad_norm": 0.28553073165686976, "learning_rate": 7.529322187708751e-07, "loss": 0.6177946925163269, "num_tokens": 208698287.0, "step": 231 }, { "epoch": 1.7313432835820897, "grad_norm": 0.49617871796694113, "learning_rate": 7.508299479622334e-07, "loss": 0.5590040683746338, "num_tokens": 209548343.0, "step": 232 }, { "epoch": 1.7388059701492538, "grad_norm": 0.28377509079902785, "learning_rate": 7.487221896061457e-07, "loss": 0.5984382629394531, "num_tokens": 210468969.0, "step": 233 }, { "epoch": 1.7462686567164178, "grad_norm": 0.27598753591931857, "learning_rate": 7.46609001295736e-07, "loss": 0.614782452583313, "num_tokens": 211457470.0, "step": 234 }, { "epoch": 1.7537313432835822, "grad_norm": 0.3112684553028677, "learning_rate": 7.444904407724972e-07, "loss": 0.5674484372138977, "num_tokens": 212347451.0, "step": 235 }, { "epoch": 1.7611940298507462, "grad_norm": 0.6461909553210673, "learning_rate": 7.423665659247152e-07, "loss": 0.5716361999511719, "num_tokens": 213190706.0, "step": 236 }, { "epoch": 1.7686567164179103, "grad_norm": 0.28012296588796226, "learning_rate": 7.40237434785886e-07, "loss": 0.5874301195144653, "num_tokens": 214169074.0, "step": 237 }, { "epoch": 1.7761194029850746, "grad_norm": 0.279647753002793, "learning_rate": 7.381031055331305e-07, "loss": 0.6019556522369385, "num_tokens": 215188427.0, "step": 238 }, { "epoch": 1.783582089552239, "grad_norm": 0.39177638463203673, "learning_rate": 7.359636364856043e-07, "loss": 0.6031475067138672, "num_tokens": 216080767.0, "step": 239 }, { "epoch": 1.7910447761194028, "grad_norm": 0.3360890439886117, "learning_rate": 7.338190861029051e-07, "loss": 0.5989038944244385, "num_tokens": 216995394.0, "step": 240 }, { "epoch": 1.7985074626865671, "grad_norm": 0.2739266991645573, "learning_rate": 7.316695129834744e-07, "loss": 0.5613197684288025, "num_tokens": 217866020.0, "step": 241 }, { "epoch": 1.8059701492537314, "grad_norm": 0.5120657338819569, "learning_rate": 7.295149758629966e-07, "loss": 0.5808136463165283, "num_tokens": 218680341.0, "step": 242 }, { "epoch": 1.8134328358208955, "grad_norm": 0.294200195693698, "learning_rate": 7.273555336127946e-07, "loss": 0.5945237874984741, "num_tokens": 219550948.0, "step": 243 }, { "epoch": 1.8208955223880596, "grad_norm": 0.2909609394380647, "learning_rate": 7.251912452382205e-07, "loss": 0.5686002373695374, "num_tokens": 220517125.0, "step": 244 }, { "epoch": 1.828358208955224, "grad_norm": 0.29553389453752155, "learning_rate": 7.230221698770439e-07, "loss": 0.5637418031692505, "num_tokens": 221365026.0, "step": 245 }, { "epoch": 1.835820895522388, "grad_norm": 0.331158945273446, "learning_rate": 7.20848366797835e-07, "loss": 0.5235867500305176, "num_tokens": 222233736.0, "step": 246 }, { "epoch": 1.8432835820895521, "grad_norm": 0.29651149137672, "learning_rate": 7.186698953983465e-07, "loss": 0.6124955415725708, "num_tokens": 223222809.0, "step": 247 }, { "epoch": 1.8507462686567164, "grad_norm": 0.2967229855382439, "learning_rate": 7.164868152038898e-07, "loss": 0.5900925993919373, "num_tokens": 224116326.0, "step": 248 }, { "epoch": 1.8582089552238807, "grad_norm": 0.2770420932471773, "learning_rate": 7.14299185865708e-07, "loss": 0.5970636606216431, "num_tokens": 991360.0, "step": 249 }, { "epoch": 1.8656716417910446, "grad_norm": 0.2858538902789167, "learning_rate": 7.121070671593477e-07, "loss": 0.5819560289382935, "num_tokens": 2010167.0, "step": 250 }, { "epoch": 1.873134328358209, "grad_norm": 0.2972326089151943, "learning_rate": 7.099105189830235e-07, "loss": 0.5888773202896118, "num_tokens": 2885939.0, "step": 251 }, { "epoch": 1.8805970149253732, "grad_norm": 0.3057535952426567, "learning_rate": 7.07709601355983e-07, "loss": 0.5811155438423157, "num_tokens": 3762868.0, "step": 252 }, { "epoch": 1.8880597014925373, "grad_norm": 0.31258132271786665, "learning_rate": 7.055043744168657e-07, "loss": 0.6176049709320068, "num_tokens": 4733514.0, "step": 253 }, { "epoch": 1.8955223880597014, "grad_norm": 0.3030852287619505, "learning_rate": 7.03294898422061e-07, "loss": 0.5565370917320251, "num_tokens": 5639515.0, "step": 254 }, { "epoch": 1.9029850746268657, "grad_norm": 0.3012477759016326, "learning_rate": 7.010812337440604e-07, "loss": 0.568949818611145, "num_tokens": 6415052.0, "step": 255 }, { "epoch": 1.9104477611940298, "grad_norm": 0.27565638559926237, "learning_rate": 6.988634408698082e-07, "loss": 0.5407424569129944, "num_tokens": 7346190.0, "step": 256 }, { "epoch": 1.917910447761194, "grad_norm": 0.32156589204612385, "learning_rate": 6.9664158039905e-07, "loss": 0.609969973564148, "num_tokens": 8239599.0, "step": 257 }, { "epoch": 1.9253731343283582, "grad_norm": 0.3188048041018452, "learning_rate": 6.944157130426745e-07, "loss": 0.5987858176231384, "num_tokens": 9077707.0, "step": 258 }, { "epoch": 1.9328358208955225, "grad_norm": 0.2961821437519157, "learning_rate": 6.921858996210568e-07, "loss": 0.568209171295166, "num_tokens": 9982372.0, "step": 259 }, { "epoch": 1.9402985074626866, "grad_norm": 0.27392736616109464, "learning_rate": 6.899522010623958e-07, "loss": 0.5922641158103943, "num_tokens": 11023445.0, "step": 260 }, { "epoch": 1.9477611940298507, "grad_norm": 0.2933002440532853, "learning_rate": 6.877146784010486e-07, "loss": 0.5736743211746216, "num_tokens": 11857709.0, "step": 261 }, { "epoch": 1.955223880597015, "grad_norm": 0.27846912836784765, "learning_rate": 6.854733927758636e-07, "loss": 0.5894352197647095, "num_tokens": 12766496.0, "step": 262 }, { "epoch": 1.962686567164179, "grad_norm": 0.27777120339996386, "learning_rate": 6.8322840542851e-07, "loss": 0.601696789264679, "num_tokens": 13767472.0, "step": 263 }, { "epoch": 1.9701492537313432, "grad_norm": 0.46660244180149263, "learning_rate": 6.80979777701804e-07, "loss": 0.5974367260932922, "num_tokens": 14594712.0, "step": 264 }, { "epoch": 1.9776119402985075, "grad_norm": 0.2956712799688728, "learning_rate": 6.787275710380329e-07, "loss": 0.5965464115142822, "num_tokens": 15486445.0, "step": 265 }, { "epoch": 1.9850746268656716, "grad_norm": 0.31107227320954434, "learning_rate": 6.764718469772757e-07, "loss": 0.576676607131958, "num_tokens": 16227990.0, "step": 266 }, { "epoch": 1.9925373134328357, "grad_norm": 0.28037710557720436, "learning_rate": 6.742126671557227e-07, "loss": 0.556594729423523, "num_tokens": 17105978.0, "step": 267 }, { "epoch": 2.0, "grad_norm": 0.31658535211949906, "learning_rate": 6.719500933039897e-07, "loss": 0.5741510391235352, "num_tokens": 18011768.0, "step": 268 }, { "epoch": 2.0074626865671643, "grad_norm": 0.2959704877731208, "learning_rate": 6.69684187245433e-07, "loss": 0.596227765083313, "num_tokens": 18916003.0, "step": 269 }, { "epoch": 2.014925373134328, "grad_norm": 0.29107376493477805, "learning_rate": 6.674150108944592e-07, "loss": 0.5445001125335693, "num_tokens": 19684628.0, "step": 270 }, { "epoch": 2.0223880597014925, "grad_norm": 0.2777883261006439, "learning_rate": 6.651426262548325e-07, "loss": 0.5889461636543274, "num_tokens": 20690086.0, "step": 271 }, { "epoch": 2.029850746268657, "grad_norm": 0.2701961134075662, "learning_rate": 6.628670954179829e-07, "loss": 0.5695216655731201, "num_tokens": 21686072.0, "step": 272 }, { "epoch": 2.0373134328358207, "grad_norm": 0.26760015658938013, "learning_rate": 6.605884805613072e-07, "loss": 0.5295987129211426, "num_tokens": 22622971.0, "step": 273 }, { "epoch": 2.044776119402985, "grad_norm": 0.323703653894069, "learning_rate": 6.583068439464715e-07, "loss": 0.5844870209693909, "num_tokens": 23496905.0, "step": 274 }, { "epoch": 2.0522388059701493, "grad_norm": 0.30828913191565205, "learning_rate": 6.560222479177094e-07, "loss": 0.5690542459487915, "num_tokens": 24365149.0, "step": 275 }, { "epoch": 2.0597014925373136, "grad_norm": 0.3084417281876953, "learning_rate": 6.537347549001184e-07, "loss": 0.5742576122283936, "num_tokens": 25184612.0, "step": 276 }, { "epoch": 2.0671641791044775, "grad_norm": 0.3080393843745802, "learning_rate": 6.514444273979543e-07, "loss": 0.5722700357437134, "num_tokens": 26054937.0, "step": 277 }, { "epoch": 2.074626865671642, "grad_norm": 0.26775870499252885, "learning_rate": 6.491513279929237e-07, "loss": 0.5365396738052368, "num_tokens": 26954789.0, "step": 278 }, { "epoch": 2.082089552238806, "grad_norm": 0.35802085892606694, "learning_rate": 6.468555193424735e-07, "loss": 0.5596331357955933, "num_tokens": 27845072.0, "step": 279 }, { "epoch": 2.08955223880597, "grad_norm": 0.2719625157722236, "learning_rate": 6.445570641780786e-07, "loss": 0.5419675707817078, "num_tokens": 28658754.0, "step": 280 }, { "epoch": 2.0970149253731343, "grad_norm": 0.27709779905677195, "learning_rate": 6.422560253035287e-07, "loss": 0.5775716304779053, "num_tokens": 29562576.0, "step": 281 }, { "epoch": 2.1044776119402986, "grad_norm": 0.3348867650726701, "learning_rate": 6.39952465593211e-07, "loss": 0.585283637046814, "num_tokens": 30431256.0, "step": 282 }, { "epoch": 2.111940298507463, "grad_norm": 0.29694286893571703, "learning_rate": 6.376464479903937e-07, "loss": 0.5197538733482361, "num_tokens": 31183415.0, "step": 283 }, { "epoch": 2.1194029850746268, "grad_norm": 0.29840728825899304, "learning_rate": 6.35338035505505e-07, "loss": 0.5599273443222046, "num_tokens": 32067616.0, "step": 284 }, { "epoch": 2.126865671641791, "grad_norm": 0.33678060078261246, "learning_rate": 6.330272912144116e-07, "loss": 0.6192691326141357, "num_tokens": 32981757.0, "step": 285 }, { "epoch": 2.1343283582089554, "grad_norm": 0.2824243012069984, "learning_rate": 6.307142782566951e-07, "loss": 0.5863723754882812, "num_tokens": 34022251.0, "step": 286 }, { "epoch": 2.1417910447761193, "grad_norm": 0.299007428336042, "learning_rate": 6.283990598339274e-07, "loss": 0.5666537284851074, "num_tokens": 34771710.0, "step": 287 }, { "epoch": 2.1492537313432836, "grad_norm": 0.32004542775898487, "learning_rate": 6.260816992079431e-07, "loss": 0.5231757760047913, "num_tokens": 35650183.0, "step": 288 }, { "epoch": 2.156716417910448, "grad_norm": 0.3045721000640588, "learning_rate": 6.237622596991106e-07, "loss": 0.5760424137115479, "num_tokens": 36493771.0, "step": 289 }, { "epoch": 2.1641791044776117, "grad_norm": 0.2937692676912212, "learning_rate": 6.214408046846034e-07, "loss": 0.568109393119812, "num_tokens": 37330886.0, "step": 290 }, { "epoch": 2.171641791044776, "grad_norm": 0.27011730349579827, "learning_rate": 6.191173975966668e-07, "loss": 0.5667808055877686, "num_tokens": 38365287.0, "step": 291 }, { "epoch": 2.1791044776119404, "grad_norm": 0.28692265505240294, "learning_rate": 6.16792101920885e-07, "loss": 0.6112924814224243, "num_tokens": 39420111.0, "step": 292 }, { "epoch": 2.1865671641791047, "grad_norm": 0.28526825922243654, "learning_rate": 6.144649811944473e-07, "loss": 0.5639245510101318, "num_tokens": 40263636.0, "step": 293 }, { "epoch": 2.1940298507462686, "grad_norm": 0.2908552938644807, "learning_rate": 6.121360990044106e-07, "loss": 0.5848294496536255, "num_tokens": 41080304.0, "step": 294 }, { "epoch": 2.201492537313433, "grad_norm": 0.2821005110679085, "learning_rate": 6.098055189859634e-07, "loss": 0.5666854381561279, "num_tokens": 42076069.0, "step": 295 }, { "epoch": 2.208955223880597, "grad_norm": 0.2609076970175634, "learning_rate": 6.074733048206852e-07, "loss": 0.5690361857414246, "num_tokens": 43082365.0, "step": 296 }, { "epoch": 2.216417910447761, "grad_norm": 0.2836464990705104, "learning_rate": 6.051395202348089e-07, "loss": 0.5679644346237183, "num_tokens": 44012700.0, "step": 297 }, { "epoch": 2.2238805970149254, "grad_norm": 0.25600506754304947, "learning_rate": 6.028042289974768e-07, "loss": 0.5512281656265259, "num_tokens": 44917778.0, "step": 298 }, { "epoch": 2.2313432835820897, "grad_norm": 0.3257772292478769, "learning_rate": 6.004674949190003e-07, "loss": 0.5415934324264526, "num_tokens": 45740145.0, "step": 299 }, { "epoch": 2.2388059701492535, "grad_norm": 0.3286166291890802, "learning_rate": 5.981293818491152e-07, "loss": 0.5995659828186035, "num_tokens": 46620715.0, "step": 300 }, { "epoch": 2.246268656716418, "grad_norm": 0.2894099742683797, "learning_rate": 5.957899536752373e-07, "loss": 0.608267605304718, "num_tokens": 47539124.0, "step": 301 }, { "epoch": 2.253731343283582, "grad_norm": 0.2895508079853879, "learning_rate": 5.934492743207168e-07, "loss": 0.5291934013366699, "num_tokens": 48336408.0, "step": 302 }, { "epoch": 2.2611940298507465, "grad_norm": 0.3174861796364666, "learning_rate": 5.911074077430916e-07, "loss": 0.5688158273696899, "num_tokens": 49205406.0, "step": 303 }, { "epoch": 2.2686567164179103, "grad_norm": 0.3232273733079749, "learning_rate": 5.887644179323403e-07, "loss": 0.5540226697921753, "num_tokens": 50043421.0, "step": 304 }, { "epoch": 2.2761194029850746, "grad_norm": 0.28558608567310023, "learning_rate": 5.864203689091315e-07, "loss": 0.5832343697547913, "num_tokens": 50915233.0, "step": 305 }, { "epoch": 2.283582089552239, "grad_norm": 0.3069099454369789, "learning_rate": 5.84075324723078e-07, "loss": 0.5831292867660522, "num_tokens": 51814606.0, "step": 306 }, { "epoch": 2.291044776119403, "grad_norm": 0.2697329326673818, "learning_rate": 5.817293494509836e-07, "loss": 0.5265708565711975, "num_tokens": 52624758.0, "step": 307 }, { "epoch": 2.298507462686567, "grad_norm": 0.27527418320187297, "learning_rate": 5.793825071950935e-07, "loss": 0.5518659353256226, "num_tokens": 53591262.0, "step": 308 }, { "epoch": 2.3059701492537314, "grad_norm": 0.31199334247822685, "learning_rate": 5.770348620813432e-07, "loss": 0.5563576221466064, "num_tokens": 54586868.0, "step": 309 }, { "epoch": 2.3134328358208958, "grad_norm": 0.33111603191568395, "learning_rate": 5.746864782576053e-07, "loss": 0.5557553768157959, "num_tokens": 55375213.0, "step": 310 }, { "epoch": 2.3208955223880596, "grad_norm": 0.2955172821022238, "learning_rate": 5.723374198919376e-07, "loss": 0.5784043073654175, "num_tokens": 56208304.0, "step": 311 }, { "epoch": 2.328358208955224, "grad_norm": 0.27282274560683967, "learning_rate": 5.699877511708284e-07, "loss": 0.5383070111274719, "num_tokens": 57191922.0, "step": 312 }, { "epoch": 2.3358208955223883, "grad_norm": 0.258279345220606, "learning_rate": 5.676375362974449e-07, "loss": 0.5381882786750793, "num_tokens": 58105389.0, "step": 313 }, { "epoch": 2.343283582089552, "grad_norm": 0.33949214023855984, "learning_rate": 5.652868394898766e-07, "loss": 0.5437734723091125, "num_tokens": 58961497.0, "step": 314 }, { "epoch": 2.3507462686567164, "grad_norm": 0.2833579384746108, "learning_rate": 5.629357249793816e-07, "loss": 0.592788815498352, "num_tokens": 59947795.0, "step": 315 }, { "epoch": 2.3582089552238807, "grad_norm": 0.30062364724661333, "learning_rate": 5.605842570086319e-07, "loss": 0.5617958307266235, "num_tokens": 60964098.0, "step": 316 }, { "epoch": 2.3656716417910446, "grad_norm": 0.2800076043555411, "learning_rate": 5.582324998299572e-07, "loss": 0.5720120072364807, "num_tokens": 61889873.0, "step": 317 }, { "epoch": 2.373134328358209, "grad_norm": 0.27101523720971354, "learning_rate": 5.558805177035901e-07, "loss": 0.5741963386535645, "num_tokens": 62849188.0, "step": 318 }, { "epoch": 2.3805970149253732, "grad_norm": 0.285471903870457, "learning_rate": 5.53528374895909e-07, "loss": 0.5414842963218689, "num_tokens": 63726113.0, "step": 319 }, { "epoch": 2.388059701492537, "grad_norm": 0.27878281549210715, "learning_rate": 5.511761356776833e-07, "loss": 0.5728551745414734, "num_tokens": 64647597.0, "step": 320 }, { "epoch": 2.3955223880597014, "grad_norm": 0.28622078113655536, "learning_rate": 5.488238643223167e-07, "loss": 0.5815838575363159, "num_tokens": 65508928.0, "step": 321 }, { "epoch": 2.4029850746268657, "grad_norm": 0.27899596544094324, "learning_rate": 5.464716251040911e-07, "loss": 0.5664654970169067, "num_tokens": 66358099.0, "step": 322 }, { "epoch": 2.41044776119403, "grad_norm": 0.2825063898870399, "learning_rate": 5.441194822964099e-07, "loss": 0.5764633417129517, "num_tokens": 67219625.0, "step": 323 }, { "epoch": 2.417910447761194, "grad_norm": 0.2889902676953764, "learning_rate": 5.417675001700427e-07, "loss": 0.5656483173370361, "num_tokens": 68141332.0, "step": 324 }, { "epoch": 2.425373134328358, "grad_norm": 0.27885504864685967, "learning_rate": 5.39415742991368e-07, "loss": 0.6192145943641663, "num_tokens": 69046407.0, "step": 325 }, { "epoch": 2.4328358208955225, "grad_norm": 0.286670961202952, "learning_rate": 5.370642750206184e-07, "loss": 0.6090319156646729, "num_tokens": 70083093.0, "step": 326 }, { "epoch": 2.4402985074626864, "grad_norm": 0.275072660826794, "learning_rate": 5.347131605101236e-07, "loss": 0.6045145988464355, "num_tokens": 71047395.0, "step": 327 }, { "epoch": 2.4477611940298507, "grad_norm": 0.2864324709863456, "learning_rate": 5.323624637025551e-07, "loss": 0.5572278499603271, "num_tokens": 71932159.0, "step": 328 }, { "epoch": 2.455223880597015, "grad_norm": 0.2973709054279998, "learning_rate": 5.300122488291716e-07, "loss": 0.5611422061920166, "num_tokens": 72789371.0, "step": 329 }, { "epoch": 2.4626865671641793, "grad_norm": 0.3318220155418688, "learning_rate": 5.276625801080625e-07, "loss": 0.5865360498428345, "num_tokens": 73721478.0, "step": 330 }, { "epoch": 2.470149253731343, "grad_norm": 0.3436341281789925, "learning_rate": 5.253135217423947e-07, "loss": 0.5705252885818481, "num_tokens": 74706274.0, "step": 331 }, { "epoch": 2.4776119402985075, "grad_norm": 0.37986006551326945, "learning_rate": 5.229651379186569e-07, "loss": 0.5907820463180542, "num_tokens": 75647716.0, "step": 332 }, { "epoch": 2.485074626865672, "grad_norm": 0.3122329879913656, "learning_rate": 5.206174928049065e-07, "loss": 0.5766445994377136, "num_tokens": 76637809.0, "step": 333 }, { "epoch": 2.4925373134328357, "grad_norm": 0.2998854461296799, "learning_rate": 5.182706505490165e-07, "loss": 0.5649234652519226, "num_tokens": 77528162.0, "step": 334 }, { "epoch": 2.5, "grad_norm": 0.25745394101844005, "learning_rate": 5.15924675276922e-07, "loss": 0.5354350805282593, "num_tokens": 78421720.0, "step": 335 }, { "epoch": 2.5074626865671643, "grad_norm": 0.2829213028380422, "learning_rate": 5.135796310908685e-07, "loss": 0.5751874446868896, "num_tokens": 79217572.0, "step": 336 }, { "epoch": 2.5149253731343286, "grad_norm": 0.3139108030566433, "learning_rate": 5.112355820676599e-07, "loss": 0.5624819993972778, "num_tokens": 80184157.0, "step": 337 }, { "epoch": 2.5223880597014925, "grad_norm": 0.28913655087516726, "learning_rate": 5.088925922569083e-07, "loss": 0.568986713886261, "num_tokens": 81059812.0, "step": 338 }, { "epoch": 2.529850746268657, "grad_norm": 0.2960772237491209, "learning_rate": 5.065507256792833e-07, "loss": 0.5797086954116821, "num_tokens": 81975922.0, "step": 339 }, { "epoch": 2.5373134328358207, "grad_norm": 0.2882326553054164, "learning_rate": 5.042100463247629e-07, "loss": 0.5706868171691895, "num_tokens": 82823460.0, "step": 340 }, { "epoch": 2.544776119402985, "grad_norm": 0.307599479684034, "learning_rate": 5.018706181508851e-07, "loss": 0.5756710767745972, "num_tokens": 83559785.0, "step": 341 }, { "epoch": 2.5522388059701493, "grad_norm": 0.308017074889723, "learning_rate": 4.995325050809999e-07, "loss": 0.6031478643417358, "num_tokens": 84441431.0, "step": 342 }, { "epoch": 2.5597014925373136, "grad_norm": 0.29529571185477965, "learning_rate": 4.971957710025234e-07, "loss": 0.5946158170700073, "num_tokens": 85349485.0, "step": 343 }, { "epoch": 2.5671641791044775, "grad_norm": 0.2819610880293821, "learning_rate": 4.948604797651913e-07, "loss": 0.5992064476013184, "num_tokens": 86267065.0, "step": 344 }, { "epoch": 2.574626865671642, "grad_norm": 0.2972660099307388, "learning_rate": 4.925266951793149e-07, "loss": 0.573174774646759, "num_tokens": 87077996.0, "step": 345 }, { "epoch": 2.582089552238806, "grad_norm": 0.301461452019035, "learning_rate": 4.901944810140369e-07, "loss": 0.589251697063446, "num_tokens": 88180031.0, "step": 346 }, { "epoch": 2.58955223880597, "grad_norm": 0.2904479295025236, "learning_rate": 4.878639009955895e-07, "loss": 0.54721999168396, "num_tokens": 89096524.0, "step": 347 }, { "epoch": 2.5970149253731343, "grad_norm": 0.2633320073378902, "learning_rate": 4.855350188055528e-07, "loss": 0.5418224334716797, "num_tokens": 90020467.0, "step": 348 }, { "epoch": 2.6044776119402986, "grad_norm": 0.27176928239419323, "learning_rate": 4.83207898079115e-07, "loss": 0.565537691116333, "num_tokens": 90987416.0, "step": 349 }, { "epoch": 2.611940298507463, "grad_norm": 0.32160604849841345, "learning_rate": 4.808826024033334e-07, "loss": 0.5598034262657166, "num_tokens": 91795663.0, "step": 350 }, { "epoch": 2.6194029850746268, "grad_norm": 0.30348956227704144, "learning_rate": 4.785591953153966e-07, "loss": 0.5576733946800232, "num_tokens": 92702072.0, "step": 351 }, { "epoch": 2.626865671641791, "grad_norm": 0.2744155218003863, "learning_rate": 4.762377403008895e-07, "loss": 0.5912754535675049, "num_tokens": 93699627.0, "step": 352 }, { "epoch": 2.6343283582089554, "grad_norm": 0.2666509179401252, "learning_rate": 4.739183007920571e-07, "loss": 0.5752925276756287, "num_tokens": 94666002.0, "step": 353 }, { "epoch": 2.6417910447761193, "grad_norm": 0.261062380795103, "learning_rate": 4.7160094016607276e-07, "loss": 0.5275688767433167, "num_tokens": 95605433.0, "step": 354 }, { "epoch": 2.6492537313432836, "grad_norm": 0.2811617365131969, "learning_rate": 4.6928572174330495e-07, "loss": 0.5722550749778748, "num_tokens": 96516441.0, "step": 355 }, { "epoch": 2.656716417910448, "grad_norm": 0.2735603112854696, "learning_rate": 4.669727087855886e-07, "loss": 0.5699527859687805, "num_tokens": 97450967.0, "step": 356 }, { "epoch": 2.664179104477612, "grad_norm": 0.3395844478312138, "learning_rate": 4.6466196449449504e-07, "loss": 0.5282535552978516, "num_tokens": 98350106.0, "step": 357 }, { "epoch": 2.671641791044776, "grad_norm": 0.2830556116676336, "learning_rate": 4.6235355200960623e-07, "loss": 0.5501178503036499, "num_tokens": 99299833.0, "step": 358 }, { "epoch": 2.6791044776119404, "grad_norm": 0.2720175319739195, "learning_rate": 4.600475344067889e-07, "loss": 0.5554410219192505, "num_tokens": 100163789.0, "step": 359 }, { "epoch": 2.6865671641791042, "grad_norm": 0.39231713406243224, "learning_rate": 4.577439746964714e-07, "loss": 0.5649659633636475, "num_tokens": 101065769.0, "step": 360 }, { "epoch": 2.6940298507462686, "grad_norm": 0.3840423060468296, "learning_rate": 4.554429358219213e-07, "loss": 0.5463579893112183, "num_tokens": 102054742.0, "step": 361 }, { "epoch": 2.701492537313433, "grad_norm": 0.2823797561004669, "learning_rate": 4.531444806575265e-07, "loss": 0.5806522369384766, "num_tokens": 102999309.0, "step": 362 }, { "epoch": 2.708955223880597, "grad_norm": 0.2847114226753591, "learning_rate": 4.508486720070761e-07, "loss": 0.5655279159545898, "num_tokens": 103943807.0, "step": 363 }, { "epoch": 2.716417910447761, "grad_norm": 0.28205090767545954, "learning_rate": 4.4855557260204547e-07, "loss": 0.5465511083602905, "num_tokens": 104752259.0, "step": 364 }, { "epoch": 2.7238805970149254, "grad_norm": 0.31707271181231406, "learning_rate": 4.462652450998815e-07, "loss": 0.56863933801651, "num_tokens": 105560152.0, "step": 365 }, { "epoch": 2.7313432835820897, "grad_norm": 0.28484578348583783, "learning_rate": 4.439777520822905e-07, "loss": 0.5578351020812988, "num_tokens": 106370949.0, "step": 366 }, { "epoch": 2.7388059701492535, "grad_norm": 0.2911877277567285, "learning_rate": 4.416931560535284e-07, "loss": 0.560371994972229, "num_tokens": 107223604.0, "step": 367 }, { "epoch": 2.746268656716418, "grad_norm": 0.27157022459261115, "learning_rate": 4.394115194386927e-07, "loss": 0.5460314750671387, "num_tokens": 108240912.0, "step": 368 }, { "epoch": 2.753731343283582, "grad_norm": 0.2866648017188484, "learning_rate": 4.3713290458201714e-07, "loss": 0.567893922328949, "num_tokens": 109178166.0, "step": 369 }, { "epoch": 2.7611940298507465, "grad_norm": 0.29527473129759935, "learning_rate": 4.348573737451674e-07, "loss": 0.6049559116363525, "num_tokens": 109991427.0, "step": 370 }, { "epoch": 2.7686567164179103, "grad_norm": 0.31366814462249815, "learning_rate": 4.3258498910554085e-07, "loss": 0.5512971878051758, "num_tokens": 110872756.0, "step": 371 }, { "epoch": 2.7761194029850746, "grad_norm": 0.3023387372899213, "learning_rate": 4.3031581275456687e-07, "loss": 0.5625054836273193, "num_tokens": 111784756.0, "step": 372 }, { "epoch": 2.783582089552239, "grad_norm": 0.3327888958164682, "learning_rate": 4.2804990669601015e-07, "loss": 0.559493899345398, "num_tokens": 112842525.0, "step": 373 }, { "epoch": 2.791044776119403, "grad_norm": 0.2864205118032827, "learning_rate": 4.2578733284427735e-07, "loss": 0.541454553604126, "num_tokens": 113670083.0, "step": 374 }, { "epoch": 2.798507462686567, "grad_norm": 0.2787810223726183, "learning_rate": 4.2352815302272415e-07, "loss": 0.5479576587677002, "num_tokens": 114578655.0, "step": 375 }, { "epoch": 2.8059701492537314, "grad_norm": 0.2870728358422289, "learning_rate": 4.2127242896196715e-07, "loss": 0.5296257138252258, "num_tokens": 115385601.0, "step": 376 }, { "epoch": 2.8134328358208958, "grad_norm": 0.2787997170860601, "learning_rate": 4.190202222981959e-07, "loss": 0.5955355763435364, "num_tokens": 116307550.0, "step": 377 }, { "epoch": 2.8208955223880596, "grad_norm": 0.26436536089418516, "learning_rate": 4.1677159457149e-07, "loss": 0.5246421098709106, "num_tokens": 117169272.0, "step": 378 }, { "epoch": 2.828358208955224, "grad_norm": 0.29264065144374635, "learning_rate": 4.145266072241365e-07, "loss": 0.5522100329399109, "num_tokens": 118096842.0, "step": 379 }, { "epoch": 2.835820895522388, "grad_norm": 0.28815451535722664, "learning_rate": 4.1228532159895146e-07, "loss": 0.5797725915908813, "num_tokens": 119100115.0, "step": 380 }, { "epoch": 2.843283582089552, "grad_norm": 0.3003290343487832, "learning_rate": 4.100477989376042e-07, "loss": 0.5710124969482422, "num_tokens": 120047947.0, "step": 381 }, { "epoch": 2.8507462686567164, "grad_norm": 0.3124248817525316, "learning_rate": 4.0781410037894305e-07, "loss": 0.5675666332244873, "num_tokens": 120918603.0, "step": 382 }, { "epoch": 2.8582089552238807, "grad_norm": 0.29192127163210346, "learning_rate": 4.0558428695732563e-07, "loss": 0.5678380727767944, "num_tokens": 121705889.0, "step": 383 }, { "epoch": 2.8656716417910446, "grad_norm": 0.45590175900628427, "learning_rate": 4.033584196009502e-07, "loss": 0.5677410960197449, "num_tokens": 122662818.0, "step": 384 }, { "epoch": 2.873134328358209, "grad_norm": 0.2630633298189877, "learning_rate": 4.0113655913019173e-07, "loss": 0.5765926837921143, "num_tokens": 123634255.0, "step": 385 }, { "epoch": 2.8805970149253732, "grad_norm": 0.4068807334874914, "learning_rate": 3.989187662559397e-07, "loss": 0.5568211078643799, "num_tokens": 124693287.0, "step": 386 }, { "epoch": 2.888059701492537, "grad_norm": 0.2899519882101068, "learning_rate": 3.967051015779389e-07, "loss": 0.5638155937194824, "num_tokens": 125727015.0, "step": 387 }, { "epoch": 2.8955223880597014, "grad_norm": 0.27969326214227774, "learning_rate": 3.944956255831342e-07, "loss": 0.5610464215278625, "num_tokens": 126569685.0, "step": 388 }, { "epoch": 2.9029850746268657, "grad_norm": 0.2771191212457944, "learning_rate": 3.9229039864401703e-07, "loss": 0.5670617818832397, "num_tokens": 127486971.0, "step": 389 }, { "epoch": 2.91044776119403, "grad_norm": 0.28740458029106764, "learning_rate": 3.900894810169766e-07, "loss": 0.573495626449585, "num_tokens": 128449869.0, "step": 390 }, { "epoch": 2.917910447761194, "grad_norm": 0.3229624542793302, "learning_rate": 3.8789293284065236e-07, "loss": 0.5427689552307129, "num_tokens": 129068910.0, "step": 391 }, { "epoch": 2.925373134328358, "grad_norm": 0.28158062590946553, "learning_rate": 3.85700814134292e-07, "loss": 0.5718903541564941, "num_tokens": 129934302.0, "step": 392 }, { "epoch": 2.9328358208955225, "grad_norm": 0.29563596535877035, "learning_rate": 3.8351318479611037e-07, "loss": 0.5753850340843201, "num_tokens": 130851893.0, "step": 393 }, { "epoch": 2.9402985074626864, "grad_norm": 0.2766878486514577, "learning_rate": 3.813301046016536e-07, "loss": 0.5622212886810303, "num_tokens": 131790942.0, "step": 394 }, { "epoch": 2.9477611940298507, "grad_norm": 0.31331114946966404, "learning_rate": 3.7915163320216506e-07, "loss": 0.5439543724060059, "num_tokens": 132669917.0, "step": 395 }, { "epoch": 2.955223880597015, "grad_norm": 0.3302390373570611, "learning_rate": 3.7697783012295614e-07, "loss": 0.560044527053833, "num_tokens": 133626565.0, "step": 396 }, { "epoch": 2.9626865671641793, "grad_norm": 0.2829029075854862, "learning_rate": 3.7480875476177944e-07, "loss": 0.5379583835601807, "num_tokens": 134404690.0, "step": 397 }, { "epoch": 2.970149253731343, "grad_norm": 0.27234325967681716, "learning_rate": 3.7264446638720537e-07, "loss": 0.5365550518035889, "num_tokens": 135338794.0, "step": 398 }, { "epoch": 2.9776119402985075, "grad_norm": 0.2767187314816525, "learning_rate": 3.7048502413700343e-07, "loss": 0.5605146288871765, "num_tokens": 136245478.0, "step": 399 }, { "epoch": 2.9850746268656714, "grad_norm": 0.2660191705819811, "learning_rate": 3.683304870165257e-07, "loss": 0.5613399744033813, "num_tokens": 137308357.0, "step": 400 }, { "epoch": 2.9925373134328357, "grad_norm": 0.33641507483104477, "learning_rate": 3.66180913897095e-07, "loss": 0.5563279390335083, "num_tokens": 138207663.0, "step": 401 }, { "epoch": 3.0, "grad_norm": 0.2972265795339739, "learning_rate": 3.640363635143957e-07, "loss": 0.5664753913879395, "num_tokens": 139009002.0, "step": 402 }, { "epoch": 3.0074626865671643, "grad_norm": 0.2870907018242013, "learning_rate": 3.6189689446686957e-07, "loss": 0.5246941447257996, "num_tokens": 139777882.0, "step": 403 }, { "epoch": 3.014925373134328, "grad_norm": 0.2916504470337103, "learning_rate": 3.5976256521411397e-07, "loss": 0.5544458627700806, "num_tokens": 140621165.0, "step": 404 }, { "epoch": 3.0223880597014925, "grad_norm": 0.6265255165946803, "learning_rate": 3.576334340752847e-07, "loss": 0.5519254207611084, "num_tokens": 141606435.0, "step": 405 }, { "epoch": 3.029850746268657, "grad_norm": 0.3145949002923175, "learning_rate": 3.555095592275027e-07, "loss": 0.5923848152160645, "num_tokens": 142396631.0, "step": 406 }, { "epoch": 3.0373134328358207, "grad_norm": 0.2812921044098366, "learning_rate": 3.5339099870426415e-07, "loss": 0.586621105670929, "num_tokens": 143299300.0, "step": 407 }, { "epoch": 3.044776119402985, "grad_norm": 0.3253513362223779, "learning_rate": 3.512778103938542e-07, "loss": 0.5898826122283936, "num_tokens": 144081283.0, "step": 408 }, { "epoch": 3.0522388059701493, "grad_norm": 0.2829701335583593, "learning_rate": 3.491700520377667e-07, "loss": 0.5714683532714844, "num_tokens": 144948721.0, "step": 409 }, { "epoch": 3.0597014925373136, "grad_norm": 0.28049591422033593, "learning_rate": 3.470677812291248e-07, "loss": 0.5455187559127808, "num_tokens": 145915502.0, "step": 410 }, { "epoch": 3.0671641791044775, "grad_norm": 0.3105939716549867, "learning_rate": 3.4497105541110846e-07, "loss": 0.5836495161056519, "num_tokens": 146848524.0, "step": 411 }, { "epoch": 3.074626865671642, "grad_norm": 0.32302139932598506, "learning_rate": 3.428799318753844e-07, "loss": 0.5365943908691406, "num_tokens": 147673557.0, "step": 412 }, { "epoch": 3.082089552238806, "grad_norm": 0.2868589436031599, "learning_rate": 3.407944677605399e-07, "loss": 0.6071346998214722, "num_tokens": 148546880.0, "step": 413 }, { "epoch": 3.08955223880597, "grad_norm": 1.1120530008183112, "learning_rate": 3.3871472005052315e-07, "loss": 0.5794011354446411, "num_tokens": 149372372.0, "step": 414 }, { "epoch": 3.0970149253731343, "grad_norm": 0.29942808195408777, "learning_rate": 3.3664074557308484e-07, "loss": 0.609196662902832, "num_tokens": 150192682.0, "step": 415 }, { "epoch": 3.1044776119402986, "grad_norm": 0.26999532517075925, "learning_rate": 3.345726009982262e-07, "loss": 0.5523053407669067, "num_tokens": 151127243.0, "step": 416 }, { "epoch": 3.111940298507463, "grad_norm": 0.27391131798248525, "learning_rate": 3.325103428366494e-07, "loss": 0.5864978432655334, "num_tokens": 152087089.0, "step": 417 }, { "epoch": 3.1194029850746268, "grad_norm": 0.4729472521720116, "learning_rate": 3.3045402743821503e-07, "loss": 0.5435307025909424, "num_tokens": 153037095.0, "step": 418 }, { "epoch": 3.126865671641791, "grad_norm": 0.2613169380583225, "learning_rate": 3.284037109904013e-07, "loss": 0.5703420042991638, "num_tokens": 153970950.0, "step": 419 }, { "epoch": 3.1343283582089554, "grad_norm": 0.26928897911274874, "learning_rate": 3.2635944951676874e-07, "loss": 0.5258716344833374, "num_tokens": 154860955.0, "step": 420 }, { "epoch": 3.1417910447761193, "grad_norm": 0.2763903661377402, "learning_rate": 3.243212988754302e-07, "loss": 0.5877372026443481, "num_tokens": 155783554.0, "step": 421 }, { "epoch": 3.1492537313432836, "grad_norm": 0.29336310605505883, "learning_rate": 3.2228931475752317e-07, "loss": 0.5202987790107727, "num_tokens": 156633643.0, "step": 422 }, { "epoch": 3.156716417910448, "grad_norm": 0.25416164853887924, "learning_rate": 3.2026355268568985e-07, "loss": 0.5262839794158936, "num_tokens": 157509233.0, "step": 423 }, { "epoch": 3.1641791044776117, "grad_norm": 0.2833930281713182, "learning_rate": 3.1824406801255833e-07, "loss": 0.541146993637085, "num_tokens": 158408189.0, "step": 424 }, { "epoch": 3.171641791044776, "grad_norm": 0.27826344635794753, "learning_rate": 3.1623091591923155e-07, "loss": 0.5324054956436157, "num_tokens": 159344619.0, "step": 425 }, { "epoch": 3.1791044776119404, "grad_norm": 0.2641432062082384, "learning_rate": 3.142241514137781e-07, "loss": 0.512749969959259, "num_tokens": 160147804.0, "step": 426 }, { "epoch": 3.1865671641791047, "grad_norm": 0.29499294962840417, "learning_rate": 3.1222382932973044e-07, "loss": 0.5644066333770752, "num_tokens": 161152253.0, "step": 427 }, { "epoch": 3.1940298507462686, "grad_norm": 0.40120876254377613, "learning_rate": 3.1023000432458594e-07, "loss": 0.5188844203948975, "num_tokens": 161912590.0, "step": 428 }, { "epoch": 3.201492537313433, "grad_norm": 0.2760447831486433, "learning_rate": 3.082427308783133e-07, "loss": 0.581289529800415, "num_tokens": 162873772.0, "step": 429 }, { "epoch": 3.208955223880597, "grad_norm": 0.38218690237559466, "learning_rate": 3.0626206329186475e-07, "loss": 0.5367913246154785, "num_tokens": 163747353.0, "step": 430 }, { "epoch": 3.216417910447761, "grad_norm": 0.2964101267001395, "learning_rate": 3.042880556856907e-07, "loss": 0.5629439353942871, "num_tokens": 164553836.0, "step": 431 }, { "epoch": 3.2238805970149254, "grad_norm": 0.2759084567994263, "learning_rate": 3.023207619982628e-07, "loss": 0.5370494723320007, "num_tokens": 165403798.0, "step": 432 }, { "epoch": 3.2313432835820897, "grad_norm": 0.5359782382978606, "learning_rate": 3.003602359845989e-07, "loss": 0.5838747620582581, "num_tokens": 166345805.0, "step": 433 }, { "epoch": 3.2388059701492535, "grad_norm": 0.3148655670627395, "learning_rate": 2.9840653121479474e-07, "loss": 0.5563722848892212, "num_tokens": 167178202.0, "step": 434 }, { "epoch": 3.246268656716418, "grad_norm": 0.25689828139696275, "learning_rate": 2.964597010725599e-07, "loss": 0.5305824875831604, "num_tokens": 168180314.0, "step": 435 }, { "epoch": 3.253731343283582, "grad_norm": 0.2613039059579068, "learning_rate": 2.945197987537591e-07, "loss": 0.5461628437042236, "num_tokens": 169040092.0, "step": 436 }, { "epoch": 3.2611940298507465, "grad_norm": 0.2974430760623621, "learning_rate": 2.9258687726495905e-07, "loss": 0.5644657611846924, "num_tokens": 169917341.0, "step": 437 }, { "epoch": 3.2686567164179103, "grad_norm": 0.2609767450471703, "learning_rate": 2.9066098942197993e-07, "loss": 0.5402700901031494, "num_tokens": 170787209.0, "step": 438 }, { "epoch": 3.2761194029850746, "grad_norm": 0.3010762400531778, "learning_rate": 2.8874218784845154e-07, "loss": 0.560728907585144, "num_tokens": 171730223.0, "step": 439 }, { "epoch": 3.283582089552239, "grad_norm": 0.2799906205843846, "learning_rate": 2.868305249743766e-07, "loss": 0.5792785882949829, "num_tokens": 172620879.0, "step": 440 }, { "epoch": 3.291044776119403, "grad_norm": 0.27973702443764925, "learning_rate": 2.849260530346973e-07, "loss": 0.5594302415847778, "num_tokens": 173513731.0, "step": 441 }, { "epoch": 3.298507462686567, "grad_norm": 0.2604858147066468, "learning_rate": 2.830288240678682e-07, "loss": 0.5618335008621216, "num_tokens": 174466652.0, "step": 442 }, { "epoch": 3.3059701492537314, "grad_norm": 0.28140102117529975, "learning_rate": 2.8113888991443446e-07, "loss": 0.5599676370620728, "num_tokens": 175305008.0, "step": 443 }, { "epoch": 3.3134328358208958, "grad_norm": 0.26261993494203545, "learning_rate": 2.7925630221561505e-07, "loss": 0.5589733719825745, "num_tokens": 176287960.0, "step": 444 }, { "epoch": 3.3208955223880596, "grad_norm": 0.2691459794281276, "learning_rate": 2.773811124118918e-07, "loss": 0.5410393476486206, "num_tokens": 177240918.0, "step": 445 }, { "epoch": 3.328358208955224, "grad_norm": 0.27073917117802515, "learning_rate": 2.7551337174160425e-07, "loss": 0.550033688545227, "num_tokens": 178155824.0, "step": 446 }, { "epoch": 3.3358208955223883, "grad_norm": 0.27889915727953724, "learning_rate": 2.736531312395491e-07, "loss": 0.5926166772842407, "num_tokens": 179172034.0, "step": 447 }, { "epoch": 3.343283582089552, "grad_norm": 0.28321328409431284, "learning_rate": 2.718004417355855e-07, "loss": 0.5419124960899353, "num_tokens": 180085508.0, "step": 448 }, { "epoch": 3.3507462686567164, "grad_norm": 0.25769815974818394, "learning_rate": 2.6995535385324667e-07, "loss": 0.5644470453262329, "num_tokens": 181111200.0, "step": 449 }, { "epoch": 3.3582089552238807, "grad_norm": 0.2645694828416433, "learning_rate": 2.6811791800835684e-07, "loss": 0.5500813722610474, "num_tokens": 182084448.0, "step": 450 }, { "epoch": 3.3656716417910446, "grad_norm": 0.3398596794365398, "learning_rate": 2.6628818440765267e-07, "loss": 0.5711795687675476, "num_tokens": 183008409.0, "step": 451 }, { "epoch": 3.373134328358209, "grad_norm": 0.28312078619309694, "learning_rate": 2.6446620304741265e-07, "loss": 0.49891045689582825, "num_tokens": 183851194.0, "step": 452 }, { "epoch": 3.3805970149253732, "grad_norm": 0.30224336146290764, "learning_rate": 2.626520237120898e-07, "loss": 0.5533944368362427, "num_tokens": 184757031.0, "step": 453 }, { "epoch": 3.388059701492537, "grad_norm": 0.2635231520858661, "learning_rate": 2.6084569597295224e-07, "loss": 0.5472126007080078, "num_tokens": 185664557.0, "step": 454 }, { "epoch": 3.3955223880597014, "grad_norm": 0.38599423556926155, "learning_rate": 2.590472691867284e-07, "loss": 0.5409133434295654, "num_tokens": 186629979.0, "step": 455 }, { "epoch": 3.4029850746268657, "grad_norm": 0.2607919939858689, "learning_rate": 2.57256792494258e-07, "loss": 0.5315978527069092, "num_tokens": 187564334.0, "step": 456 }, { "epoch": 3.41044776119403, "grad_norm": 0.2805243275573726, "learning_rate": 2.554743148191497e-07, "loss": 0.5706053376197815, "num_tokens": 188489207.0, "step": 457 }, { "epoch": 3.417910447761194, "grad_norm": 0.27087210896579755, "learning_rate": 2.5369988486644446e-07, "loss": 0.5453130006790161, "num_tokens": 189462195.0, "step": 458 }, { "epoch": 3.425373134328358, "grad_norm": 0.2701409088054617, "learning_rate": 2.5193355112128434e-07, "loss": 0.5617469549179077, "num_tokens": 190385848.0, "step": 459 }, { "epoch": 3.4328358208955225, "grad_norm": 0.27610398300578565, "learning_rate": 2.501753618475877e-07, "loss": 0.5490225553512573, "num_tokens": 191288170.0, "step": 460 }, { "epoch": 3.4402985074626864, "grad_norm": 0.2791596724267742, "learning_rate": 2.4842536508673086e-07, "loss": 0.5552560091018677, "num_tokens": 192212892.0, "step": 461 }, { "epoch": 3.4477611940298507, "grad_norm": 0.2818729998392585, "learning_rate": 2.4668360865623443e-07, "loss": 0.5438352227210999, "num_tokens": 193052710.0, "step": 462 }, { "epoch": 3.455223880597015, "grad_norm": 0.29076648116800047, "learning_rate": 2.4495014014845805e-07, "loss": 0.5421488285064697, "num_tokens": 193971934.0, "step": 463 }, { "epoch": 3.4626865671641793, "grad_norm": 0.35528872999469885, "learning_rate": 2.432250069292989e-07, "loss": 0.5626663565635681, "num_tokens": 194797236.0, "step": 464 }, { "epoch": 3.470149253731343, "grad_norm": 0.2931920860035253, "learning_rate": 2.4150825613689786e-07, "loss": 0.575283944606781, "num_tokens": 195700091.0, "step": 465 }, { "epoch": 3.4776119402985075, "grad_norm": 0.2696233856000557, "learning_rate": 2.397999346803518e-07, "loss": 0.5804455280303955, "num_tokens": 196607890.0, "step": 466 }, { "epoch": 3.485074626865672, "grad_norm": 0.2612143539512627, "learning_rate": 2.3810008923843075e-07, "loss": 0.5534828901290894, "num_tokens": 197595641.0, "step": 467 }, { "epoch": 3.4925373134328357, "grad_norm": 0.3003385075218394, "learning_rate": 2.3640876625830382e-07, "loss": 0.5445773601531982, "num_tokens": 198539047.0, "step": 468 }, { "epoch": 3.5, "grad_norm": 0.27091005264565227, "learning_rate": 2.347260119542692e-07, "loss": 0.5666298866271973, "num_tokens": 199529062.0, "step": 469 }, { "epoch": 3.5074626865671643, "grad_norm": 0.2856115330105266, "learning_rate": 2.3305187230649173e-07, "loss": 0.5649522542953491, "num_tokens": 200452743.0, "step": 470 }, { "epoch": 3.5149253731343286, "grad_norm": 0.26678768606458664, "learning_rate": 2.3138639305974592e-07, "loss": 0.5633753538131714, "num_tokens": 201375863.0, "step": 471 }, { "epoch": 3.5223880597014925, "grad_norm": 0.24743082299382807, "learning_rate": 2.29729619722167e-07, "loss": 0.5463535785675049, "num_tokens": 202342633.0, "step": 472 }, { "epoch": 3.529850746268657, "grad_norm": 0.2849265779918233, "learning_rate": 2.2808159756400664e-07, "loss": 0.5450330376625061, "num_tokens": 203206779.0, "step": 473 }, { "epoch": 3.5373134328358207, "grad_norm": 0.2959172618289051, "learning_rate": 2.264423716163962e-07, "loss": 0.5645024180412292, "num_tokens": 204166390.0, "step": 474 }, { "epoch": 3.544776119402985, "grad_norm": 0.30652775818807565, "learning_rate": 2.248119866701167e-07, "loss": 0.6000641584396362, "num_tokens": 205136083.0, "step": 475 }, { "epoch": 3.5522388059701493, "grad_norm": 0.2649068122185818, "learning_rate": 2.231904872743739e-07, "loss": 0.563923180103302, "num_tokens": 206063663.0, "step": 476 }, { "epoch": 3.5597014925373136, "grad_norm": 0.30105804677037196, "learning_rate": 2.2157791773558222e-07, "loss": 0.5499534606933594, "num_tokens": 207014752.0, "step": 477 }, { "epoch": 3.5671641791044775, "grad_norm": 0.2719779878868544, "learning_rate": 2.1997432211615324e-07, "loss": 0.5947707891464233, "num_tokens": 208002031.0, "step": 478 }, { "epoch": 3.574626865671642, "grad_norm": 0.27217392575188515, "learning_rate": 2.1837974423329254e-07, "loss": 0.5516700744628906, "num_tokens": 208865142.0, "step": 479 }, { "epoch": 3.582089552238806, "grad_norm": 0.37099057189831747, "learning_rate": 2.1679422765780113e-07, "loss": 0.557658851146698, "num_tokens": 209775786.0, "step": 480 }, { "epoch": 3.58955223880597, "grad_norm": 0.27880227675709274, "learning_rate": 2.1521781571288644e-07, "loss": 0.5569248199462891, "num_tokens": 210690185.0, "step": 481 }, { "epoch": 3.5970149253731343, "grad_norm": 0.27705864225434784, "learning_rate": 2.136505514729774e-07, "loss": 0.5474062561988831, "num_tokens": 211593514.0, "step": 482 }, { "epoch": 3.6044776119402986, "grad_norm": 0.5224112991843156, "learning_rate": 2.120924777625479e-07, "loss": 0.5869604349136353, "num_tokens": 212523789.0, "step": 483 }, { "epoch": 3.611940298507463, "grad_norm": 0.2577684602730275, "learning_rate": 2.1054363715494693e-07, "loss": 0.5051690340042114, "num_tokens": 213378219.0, "step": 484 }, { "epoch": 3.6194029850746268, "grad_norm": 0.28941698042641906, "learning_rate": 2.090040719712344e-07, "loss": 0.5571575164794922, "num_tokens": 214172580.0, "step": 485 }, { "epoch": 3.626865671641791, "grad_norm": 0.4260866776138794, "learning_rate": 2.0747382427902572e-07, "loss": 0.5927166938781738, "num_tokens": 214985813.0, "step": 486 }, { "epoch": 3.6343283582089554, "grad_norm": 0.3024307973363642, "learning_rate": 2.0595293589134176e-07, "loss": 0.5418879985809326, "num_tokens": 215879921.0, "step": 487 }, { "epoch": 3.6417910447761193, "grad_norm": 0.2780348633827344, "learning_rate": 2.044414483654668e-07, "loss": 0.5637257695198059, "num_tokens": 216746710.0, "step": 488 }, { "epoch": 3.6492537313432836, "grad_norm": 0.36018093543345575, "learning_rate": 2.0293940300181212e-07, "loss": 0.5574115514755249, "num_tokens": 217502005.0, "step": 489 }, { "epoch": 3.656716417910448, "grad_norm": 0.36563238506727497, "learning_rate": 2.0144684084278846e-07, "loss": 0.5422406792640686, "num_tokens": 218245391.0, "step": 490 }, { "epoch": 3.664179104477612, "grad_norm": 0.26076893929904454, "learning_rate": 1.9996380267168416e-07, "loss": 0.5316330194473267, "num_tokens": 219197443.0, "step": 491 }, { "epoch": 3.671641791044776, "grad_norm": 0.2756117047272477, "learning_rate": 1.9849032901155073e-07, "loss": 0.576492965221405, "num_tokens": 220198270.0, "step": 492 }, { "epoch": 3.6791044776119404, "grad_norm": 0.3104844021800582, "learning_rate": 1.9702646012409576e-07, "loss": 0.5465894937515259, "num_tokens": 221120937.0, "step": 493 }, { "epoch": 3.6865671641791042, "grad_norm": 0.28862702731917284, "learning_rate": 1.9557223600858236e-07, "loss": 0.562412679195404, "num_tokens": 222035264.0, "step": 494 }, { "epoch": 3.6940298507462686, "grad_norm": 0.2990711536721463, "learning_rate": 1.9412769640073686e-07, "loss": 0.6177443265914917, "num_tokens": 222924164.0, "step": 495 }, { "epoch": 3.701492537313433, "grad_norm": 0.29123869800526553, "learning_rate": 1.9269288077166264e-07, "loss": 0.6057195067405701, "num_tokens": 223814612.0, "step": 496 }, { "epoch": 3.708955223880597, "grad_norm": 0.30962841555286785, "learning_rate": 1.9126782832676173e-07, "loss": 0.5551049709320068, "num_tokens": 224678747.0, "step": 497 }, { "epoch": 3.716417910447761, "grad_norm": 0.31789120255947023, "learning_rate": 1.8985257800466348e-07, "loss": 0.5476455092430115, "num_tokens": 225631946.0, "step": 498 }, { "epoch": 3.7238805970149254, "grad_norm": 0.2781242088040202, "learning_rate": 1.8844716847616052e-07, "loss": 0.5808273553848267, "num_tokens": 226600195.0, "step": 499 }, { "epoch": 3.7313432835820897, "grad_norm": 0.2954115885416602, "learning_rate": 1.8705163814315228e-07, "loss": 0.5603234767913818, "num_tokens": 227420424.0, "step": 500 }, { "epoch": 3.7388059701492535, "grad_norm": 0.26394812594414746, "learning_rate": 1.856660251375957e-07, "loss": 0.5475826263427734, "num_tokens": 228393641.0, "step": 501 }, { "epoch": 3.746268656716418, "grad_norm": 0.25638710494334194, "learning_rate": 1.8429036732046327e-07, "loss": 0.5315807461738586, "num_tokens": 229383627.0, "step": 502 }, { "epoch": 3.753731343283582, "grad_norm": 0.2820486917822845, "learning_rate": 1.8292470228070805e-07, "loss": 0.555698037147522, "num_tokens": 230312293.0, "step": 503 }, { "epoch": 3.7611940298507465, "grad_norm": 0.26963402638209555, "learning_rate": 1.8156906733423738e-07, "loss": 0.5559597611427307, "num_tokens": 231227207.0, "step": 504 }, { "epoch": 3.7686567164179103, "grad_norm": 0.2635960871590189, "learning_rate": 1.8022349952289273e-07, "loss": 0.5315006971359253, "num_tokens": 232129690.0, "step": 505 }, { "epoch": 3.7761194029850746, "grad_norm": 0.28835042727096805, "learning_rate": 1.7888803561343751e-07, "loss": 0.5724339485168457, "num_tokens": 232988180.0, "step": 506 }, { "epoch": 3.783582089552239, "grad_norm": 0.2880295787825169, "learning_rate": 1.7756271209655294e-07, "loss": 0.6089663505554199, "num_tokens": 233887459.0, "step": 507 }, { "epoch": 3.791044776119403, "grad_norm": 0.2923917838459817, "learning_rate": 1.7624756518584013e-07, "loss": 0.5508089065551758, "num_tokens": 234724288.0, "step": 508 }, { "epoch": 3.798507462686567, "grad_norm": 0.2663262112656659, "learning_rate": 1.7494263081683131e-07, "loss": 0.5383226871490479, "num_tokens": 235591261.0, "step": 509 }, { "epoch": 3.8059701492537314, "grad_norm": 0.2810986277113732, "learning_rate": 1.7364794464600808e-07, "loss": 0.5323266983032227, "num_tokens": 236513360.0, "step": 510 }, { "epoch": 3.8134328358208958, "grad_norm": 0.3206761651223088, "learning_rate": 1.7236354204982587e-07, "loss": 0.5368841290473938, "num_tokens": 237427821.0, "step": 511 }, { "epoch": 3.8208955223880596, "grad_norm": 0.2831284138508691, "learning_rate": 1.7108945812374873e-07, "loss": 0.5697877407073975, "num_tokens": 238361524.0, "step": 512 }, { "epoch": 3.828358208955224, "grad_norm": 0.4678969236120579, "learning_rate": 1.698257276812896e-07, "loss": 0.567964494228363, "num_tokens": 239295734.0, "step": 513 }, { "epoch": 3.835820895522388, "grad_norm": 0.3454163016526369, "learning_rate": 1.6857238525305922e-07, "loss": 0.5614358186721802, "num_tokens": 240192414.0, "step": 514 }, { "epoch": 3.843283582089552, "grad_norm": 0.26229554588086373, "learning_rate": 1.6732946508582286e-07, "loss": 0.5396016836166382, "num_tokens": 241149058.0, "step": 515 }, { "epoch": 3.8507462686567164, "grad_norm": 0.25830144071410005, "learning_rate": 1.6609700114156368e-07, "loss": 0.548250675201416, "num_tokens": 242110168.0, "step": 516 }, { "epoch": 3.8582089552238807, "grad_norm": 0.2577863607708949, "learning_rate": 1.648750270965559e-07, "loss": 0.5675839185714722, "num_tokens": 243142913.0, "step": 517 }, { "epoch": 3.8656716417910446, "grad_norm": 1.2576352530776749, "learning_rate": 1.6366357634044403e-07, "loss": 0.5479030609130859, "num_tokens": 244026260.0, "step": 518 }, { "epoch": 3.873134328358209, "grad_norm": 0.43195789720816985, "learning_rate": 1.6246268197533046e-07, "loss": 0.5657459497451782, "num_tokens": 244835255.0, "step": 519 }, { "epoch": 3.8805970149253732, "grad_norm": 0.2850328915996735, "learning_rate": 1.6127237681487092e-07, "loss": 0.5788131952285767, "num_tokens": 245744839.0, "step": 520 }, { "epoch": 3.888059701492537, "grad_norm": 0.2734777728630755, "learning_rate": 1.600926933833783e-07, "loss": 0.5809911489486694, "num_tokens": 246688392.0, "step": 521 }, { "epoch": 3.8955223880597014, "grad_norm": 0.2933774394372255, "learning_rate": 1.5892366391493362e-07, "loss": 0.5803858637809753, "num_tokens": 247632902.0, "step": 522 }, { "epoch": 3.9029850746268657, "grad_norm": 0.2903603425087314, "learning_rate": 1.5776532035250513e-07, "loss": 0.5569208860397339, "num_tokens": 248582604.0, "step": 523 }, { "epoch": 3.91044776119403, "grad_norm": 0.2633779070798848, "learning_rate": 1.5661769434707583e-07, "loss": 0.5375438928604126, "num_tokens": 249449227.0, "step": 524 }, { "epoch": 3.917910447761194, "grad_norm": 0.31705662168623416, "learning_rate": 1.5548081725677842e-07, "loss": 0.5713478326797485, "num_tokens": 250309108.0, "step": 525 }, { "epoch": 3.925373134328358, "grad_norm": 0.2743174912786303, "learning_rate": 1.5435472014603838e-07, "loss": 0.5781571865081787, "num_tokens": 251284224.0, "step": 526 }, { "epoch": 3.9328358208955225, "grad_norm": 0.2781053851258556, "learning_rate": 1.5323943378472546e-07, "loss": 0.5639330148696899, "num_tokens": 252205748.0, "step": 527 }, { "epoch": 3.9402985074626864, "grad_norm": 0.27546182816312653, "learning_rate": 1.5213498864731265e-07, "loss": 0.5076487064361572, "num_tokens": 253011360.0, "step": 528 }, { "epoch": 3.9477611940298507, "grad_norm": 0.2724349176732762, "learning_rate": 1.5104141491204357e-07, "loss": 0.5303751230239868, "num_tokens": 253876442.0, "step": 529 }, { "epoch": 3.955223880597015, "grad_norm": 0.29045934843508436, "learning_rate": 1.4995874246010776e-07, "loss": 0.5791366100311279, "num_tokens": 254702285.0, "step": 530 }, { "epoch": 3.9626865671641793, "grad_norm": 0.25781806828726467, "learning_rate": 1.4888700087482444e-07, "loss": 0.5378929376602173, "num_tokens": 255641666.0, "step": 531 }, { "epoch": 3.970149253731343, "grad_norm": 0.2657141262373794, "learning_rate": 1.4782621944083392e-07, "loss": 0.5480854511260986, "num_tokens": 256570444.0, "step": 532 }, { "epoch": 3.9776119402985075, "grad_norm": 0.27332662946272, "learning_rate": 1.467764271432977e-07, "loss": 0.5349365472793579, "num_tokens": 257440148.0, "step": 533 }, { "epoch": 3.9850746268656714, "grad_norm": 0.27927002916412313, "learning_rate": 1.4573765266710598e-07, "loss": 0.5557724237442017, "num_tokens": 258286072.0, "step": 534 }, { "epoch": 3.9925373134328357, "grad_norm": 0.30540619594506574, "learning_rate": 1.4470992439609444e-07, "loss": 0.5325461626052856, "num_tokens": 259027900.0, "step": 535 }, { "epoch": 4.0, "grad_norm": 0.2554852841852584, "learning_rate": 1.4369327041226831e-07, "loss": 0.5564035177230835, "num_tokens": 260054853.0, "step": 536 }, { "epoch": 4.007462686567164, "grad_norm": 0.3774222258568171, "learning_rate": 1.4268771849503506e-07, "loss": 0.5198606252670288, "num_tokens": 260848096.0, "step": 537 }, { "epoch": 4.014925373134329, "grad_norm": 0.42926748854255425, "learning_rate": 1.4169329612044566e-07, "loss": 0.5263375043869019, "num_tokens": 261795072.0, "step": 538 }, { "epoch": 4.022388059701493, "grad_norm": 0.28442769544889995, "learning_rate": 1.4071003046044322e-07, "loss": 0.5481403470039368, "num_tokens": 262649190.0, "step": 539 }, { "epoch": 4.029850746268656, "grad_norm": 0.33957785588251854, "learning_rate": 1.397379483821212e-07, "loss": 0.5446444749832153, "num_tokens": 263639124.0, "step": 540 }, { "epoch": 4.037313432835821, "grad_norm": 0.31285290824877576, "learning_rate": 1.3877707644698893e-07, "loss": 0.5856173038482666, "num_tokens": 264480175.0, "step": 541 }, { "epoch": 4.044776119402985, "grad_norm": 0.3146654769143484, "learning_rate": 1.3782744091024584e-07, "loss": 0.5640919208526611, "num_tokens": 265217661.0, "step": 542 }, { "epoch": 4.052238805970149, "grad_norm": 0.2543877235448555, "learning_rate": 1.3688906772006393e-07, "loss": 0.545689582824707, "num_tokens": 266155540.0, "step": 543 }, { "epoch": 4.059701492537314, "grad_norm": 0.2831883473552513, "learning_rate": 1.3596198251687917e-07, "loss": 0.5562140941619873, "num_tokens": 267051346.0, "step": 544 }, { "epoch": 4.067164179104478, "grad_norm": 0.2634821254048522, "learning_rate": 1.3504621063269057e-07, "loss": 0.5558310747146606, "num_tokens": 268001048.0, "step": 545 }, { "epoch": 4.074626865671641, "grad_norm": 0.2740473313761773, "learning_rate": 1.34141777090368e-07, "loss": 0.5498157739639282, "num_tokens": 268948251.0, "step": 546 }, { "epoch": 4.082089552238806, "grad_norm": 0.3034010911009059, "learning_rate": 1.3324870660296866e-07, "loss": 0.5079299211502075, "num_tokens": 269891870.0, "step": 547 }, { "epoch": 4.08955223880597, "grad_norm": 0.25296398139696874, "learning_rate": 1.3236702357306156e-07, "loss": 0.557180643081665, "num_tokens": 270893706.0, "step": 548 }, { "epoch": 4.097014925373134, "grad_norm": 0.5617876126540791, "learning_rate": 1.3149675209206084e-07, "loss": 0.5518041253089905, "num_tokens": 271655159.0, "step": 549 }, { "epoch": 4.104477611940299, "grad_norm": 0.26496316816661675, "learning_rate": 1.3063791593956756e-07, "loss": 0.5603747367858887, "num_tokens": 272587675.0, "step": 550 }, { "epoch": 4.111940298507463, "grad_norm": 0.2601339382276699, "learning_rate": 1.2979053858271993e-07, "loss": 0.5405164957046509, "num_tokens": 273463891.0, "step": 551 }, { "epoch": 4.119402985074627, "grad_norm": 0.2795698909044753, "learning_rate": 1.2895464317555206e-07, "loss": 0.5839468240737915, "num_tokens": 274283621.0, "step": 552 }, { "epoch": 4.126865671641791, "grad_norm": 0.24753013274997315, "learning_rate": 1.28130252558361e-07, "loss": 0.5279031991958618, "num_tokens": 275221184.0, "step": 553 }, { "epoch": 4.134328358208955, "grad_norm": 0.28266540188156414, "learning_rate": 1.2731738925708327e-07, "loss": 0.5553559064865112, "num_tokens": 276094732.0, "step": 554 }, { "epoch": 4.141791044776119, "grad_norm": 0.2690824174144065, "learning_rate": 1.265160754826787e-07, "loss": 0.572119951248169, "num_tokens": 277122845.0, "step": 555 }, { "epoch": 4.149253731343284, "grad_norm": 0.2472737773051283, "learning_rate": 1.2572633313052409e-07, "loss": 0.569811999797821, "num_tokens": 278203814.0, "step": 556 }, { "epoch": 4.156716417910448, "grad_norm": 0.2742488775651172, "learning_rate": 1.249481837798144e-07, "loss": 0.5402873754501343, "num_tokens": 279084275.0, "step": 557 }, { "epoch": 4.164179104477612, "grad_norm": 0.45576858034969514, "learning_rate": 1.2418164869297352e-07, "loss": 0.5487810373306274, "num_tokens": 279995589.0, "step": 558 }, { "epoch": 4.1716417910447765, "grad_norm": 0.33886009051736005, "learning_rate": 1.2342674881507325e-07, "loss": 0.5493899583816528, "num_tokens": 280947164.0, "step": 559 }, { "epoch": 4.17910447761194, "grad_norm": 0.2672882959311712, "learning_rate": 1.226835047732607e-07, "loss": 0.5846470594406128, "num_tokens": 281865957.0, "step": 560 }, { "epoch": 4.186567164179104, "grad_norm": 0.26682211241453646, "learning_rate": 1.2195193687619503e-07, "loss": 0.5684331655502319, "num_tokens": 282822635.0, "step": 561 }, { "epoch": 4.1940298507462686, "grad_norm": 0.5844175934203363, "learning_rate": 1.212320651134921e-07, "loss": 0.5448155403137207, "num_tokens": 283735562.0, "step": 562 }, { "epoch": 4.201492537313433, "grad_norm": 0.28667238801807454, "learning_rate": 1.2052390915517878e-07, "loss": 0.552519679069519, "num_tokens": 284514293.0, "step": 563 }, { "epoch": 4.208955223880597, "grad_norm": 0.27209099234946366, "learning_rate": 1.198274883511551e-07, "loss": 0.5868443250656128, "num_tokens": 285496842.0, "step": 564 }, { "epoch": 4.2164179104477615, "grad_norm": 0.282926588733111, "learning_rate": 1.1914282173066572e-07, "loss": 0.5723504424095154, "num_tokens": 286397891.0, "step": 565 }, { "epoch": 4.223880597014926, "grad_norm": 0.2770197201382834, "learning_rate": 1.1846992800177977e-07, "loss": 0.5528011918067932, "num_tokens": 287291637.0, "step": 566 }, { "epoch": 4.231343283582089, "grad_norm": 0.28361549703489075, "learning_rate": 1.1780882555087987e-07, "loss": 0.5853151082992554, "num_tokens": 288241806.0, "step": 567 }, { "epoch": 4.2388059701492535, "grad_norm": 0.2939822179720846, "learning_rate": 1.1715953244215962e-07, "loss": 0.5159034729003906, "num_tokens": 289026082.0, "step": 568 }, { "epoch": 4.246268656716418, "grad_norm": 0.27406466943349356, "learning_rate": 1.1652206641713017e-07, "loss": 0.5613383054733276, "num_tokens": 289932433.0, "step": 569 }, { "epoch": 4.253731343283582, "grad_norm": 0.26361825748402795, "learning_rate": 1.1589644489413516e-07, "loss": 0.5283357501029968, "num_tokens": 290832565.0, "step": 570 }, { "epoch": 4.2611940298507465, "grad_norm": 0.2760993964502923, "learning_rate": 1.1528268496787496e-07, "loss": 0.5750157833099365, "num_tokens": 291807812.0, "step": 571 }, { "epoch": 4.268656716417911, "grad_norm": 0.28258368565429337, "learning_rate": 1.1468080340893957e-07, "loss": 0.5445358157157898, "num_tokens": 292611203.0, "step": 572 }, { "epoch": 4.276119402985074, "grad_norm": 0.2730636420985558, "learning_rate": 1.1409081666335033e-07, "loss": 0.6081241369247437, "num_tokens": 293563137.0, "step": 573 }, { "epoch": 4.2835820895522385, "grad_norm": 0.2887060017580731, "learning_rate": 1.1351274085211066e-07, "loss": 0.5485525131225586, "num_tokens": 294390720.0, "step": 574 }, { "epoch": 4.291044776119403, "grad_norm": 0.3012747873135312, "learning_rate": 1.1294659177076522e-07, "loss": 0.5155702829360962, "num_tokens": 295331187.0, "step": 575 }, { "epoch": 4.298507462686567, "grad_norm": 0.2920426716987418, "learning_rate": 1.1239238488896874e-07, "loss": 0.5878146886825562, "num_tokens": 296174930.0, "step": 576 }, { "epoch": 4.3059701492537314, "grad_norm": 0.2717219604372653, "learning_rate": 1.118501353500631e-07, "loss": 0.5488337278366089, "num_tokens": 296984260.0, "step": 577 }, { "epoch": 4.313432835820896, "grad_norm": 0.2881268506141246, "learning_rate": 1.1131985797066362e-07, "loss": 0.5962164402008057, "num_tokens": 297814492.0, "step": 578 }, { "epoch": 4.32089552238806, "grad_norm": 0.3429404437909788, "learning_rate": 1.1080156724025409e-07, "loss": 0.5432817935943604, "num_tokens": 298682103.0, "step": 579 }, { "epoch": 4.3283582089552235, "grad_norm": 0.26831503316886746, "learning_rate": 1.1029527732079083e-07, "loss": 0.5613952875137329, "num_tokens": 299706050.0, "step": 580 }, { "epoch": 4.335820895522388, "grad_norm": 0.30116453227315987, "learning_rate": 1.0980100204631603e-07, "loss": 0.5974493026733398, "num_tokens": 300500751.0, "step": 581 }, { "epoch": 4.343283582089552, "grad_norm": 0.2574290282127498, "learning_rate": 1.0931875492257944e-07, "loss": 0.5080505609512329, "num_tokens": 301436049.0, "step": 582 }, { "epoch": 4.350746268656716, "grad_norm": 0.2768297952700454, "learning_rate": 1.088485491266694e-07, "loss": 0.5769013166427612, "num_tokens": 302245987.0, "step": 583 }, { "epoch": 4.358208955223881, "grad_norm": 0.2614099752067767, "learning_rate": 1.0839039750665291e-07, "loss": 0.5327722430229187, "num_tokens": 303180329.0, "step": 584 }, { "epoch": 4.365671641791045, "grad_norm": 0.2545576461755387, "learning_rate": 1.0794431258122429e-07, "loss": 0.5465179085731506, "num_tokens": 304106987.0, "step": 585 }, { "epoch": 4.373134328358209, "grad_norm": 0.3028807540174107, "learning_rate": 1.0751030653936354e-07, "loss": 0.5673878192901611, "num_tokens": 304931031.0, "step": 586 }, { "epoch": 4.380597014925373, "grad_norm": 0.2623967772914622, "learning_rate": 1.0708839124000287e-07, "loss": 0.5716835260391235, "num_tokens": 305846255.0, "step": 587 }, { "epoch": 4.388059701492537, "grad_norm": 0.2805523792578899, "learning_rate": 1.066785782117028e-07, "loss": 0.5245805978775024, "num_tokens": 306627892.0, "step": 588 }, { "epoch": 4.395522388059701, "grad_norm": 0.3121504154269003, "learning_rate": 1.0628087865233737e-07, "loss": 0.5411394238471985, "num_tokens": 307519113.0, "step": 589 }, { "epoch": 4.402985074626866, "grad_norm": 0.2913376914541736, "learning_rate": 1.0589530342878769e-07, "loss": 0.5592665672302246, "num_tokens": 308359627.0, "step": 590 }, { "epoch": 4.41044776119403, "grad_norm": 0.26729488095826903, "learning_rate": 1.0552186307664565e-07, "loss": 0.5448157787322998, "num_tokens": 309250463.0, "step": 591 }, { "epoch": 4.417910447761194, "grad_norm": 0.30314603500542503, "learning_rate": 1.0516056779992541e-07, "loss": 0.5698049664497375, "num_tokens": 310094707.0, "step": 592 }, { "epoch": 4.425373134328359, "grad_norm": 0.2786233963992514, "learning_rate": 1.0481142747078492e-07, "loss": 0.5542622804641724, "num_tokens": 310932669.0, "step": 593 }, { "epoch": 4.432835820895522, "grad_norm": 0.28934032270495663, "learning_rate": 1.0447445162925613e-07, "loss": 0.5697283744812012, "num_tokens": 311864048.0, "step": 594 }, { "epoch": 4.440298507462686, "grad_norm": 0.2500391659266081, "learning_rate": 1.0414964948298435e-07, "loss": 0.5528576374053955, "num_tokens": 312840365.0, "step": 595 }, { "epoch": 4.447761194029851, "grad_norm": 0.2916622377572014, "learning_rate": 1.0383702990697656e-07, "loss": 0.5366314649581909, "num_tokens": 313795804.0, "step": 596 }, { "epoch": 4.455223880597015, "grad_norm": 0.271646171110688, "learning_rate": 1.035366014433589e-07, "loss": 0.5479257106781006, "num_tokens": 314648164.0, "step": 597 }, { "epoch": 4.462686567164179, "grad_norm": 0.28220408400221697, "learning_rate": 1.032483723011433e-07, "loss": 0.5544242858886719, "num_tokens": 315521665.0, "step": 598 }, { "epoch": 4.470149253731344, "grad_norm": 0.32415992407130506, "learning_rate": 1.0297235035600334e-07, "loss": 0.5346230268478394, "num_tokens": 316460972.0, "step": 599 }, { "epoch": 4.477611940298507, "grad_norm": 0.29902595518304415, "learning_rate": 1.0270854315005874e-07, "loss": 0.5251238346099854, "num_tokens": 317398198.0, "step": 600 }, { "epoch": 4.485074626865671, "grad_norm": 0.27529822550906163, "learning_rate": 1.0245695789166948e-07, "loss": 0.550391674041748, "num_tokens": 318368138.0, "step": 601 }, { "epoch": 4.492537313432836, "grad_norm": 0.25678842717501, "learning_rate": 1.0221760145523875e-07, "loss": 0.5486523509025574, "num_tokens": 319254359.0, "step": 602 }, { "epoch": 4.5, "grad_norm": 0.30422330388419494, "learning_rate": 1.0199048038102526e-07, "loss": 0.5667173266410828, "num_tokens": 320163581.0, "step": 603 }, { "epoch": 4.507462686567164, "grad_norm": 0.2590403748368139, "learning_rate": 1.0177560087496423e-07, "loss": 0.5528400540351868, "num_tokens": 321099182.0, "step": 604 }, { "epoch": 4.514925373134329, "grad_norm": 0.2779131559748047, "learning_rate": 1.0157296880849824e-07, "loss": 0.5901874303817749, "num_tokens": 321960509.0, "step": 605 }, { "epoch": 4.522388059701493, "grad_norm": 0.29485819884103376, "learning_rate": 1.0138258971841641e-07, "loss": 0.5388875007629395, "num_tokens": 322828287.0, "step": 606 }, { "epoch": 4.529850746268656, "grad_norm": 0.2672600011322682, "learning_rate": 1.0120446880670325e-07, "loss": 0.5676090121269226, "num_tokens": 323747430.0, "step": 607 }, { "epoch": 4.537313432835821, "grad_norm": 0.2612827675126176, "learning_rate": 1.0103861094039667e-07, "loss": 0.5471125245094299, "num_tokens": 324666159.0, "step": 608 }, { "epoch": 4.544776119402985, "grad_norm": 0.27778633049164236, "learning_rate": 1.008850206514547e-07, "loss": 0.5418146848678589, "num_tokens": 325560938.0, "step": 609 }, { "epoch": 4.552238805970149, "grad_norm": 0.2969440332344535, "learning_rate": 1.0074370213663201e-07, "loss": 0.5470881462097168, "num_tokens": 326330466.0, "step": 610 }, { "epoch": 4.559701492537314, "grad_norm": 0.31944728844902104, "learning_rate": 1.0061465925736478e-07, "loss": 0.5502467155456543, "num_tokens": 327193522.0, "step": 611 }, { "epoch": 4.567164179104478, "grad_norm": 0.2519446230589244, "learning_rate": 1.0049789553966569e-07, "loss": 0.5561034679412842, "num_tokens": 328181555.0, "step": 612 }, { "epoch": 4.574626865671641, "grad_norm": 0.36293634063352515, "learning_rate": 1.0039341417402715e-07, "loss": 0.5579421520233154, "num_tokens": 329048630.0, "step": 613 }, { "epoch": 4.582089552238806, "grad_norm": 0.26255387893449383, "learning_rate": 1.0030121801533441e-07, "loss": 0.5714669823646545, "num_tokens": 329968258.0, "step": 614 }, { "epoch": 4.58955223880597, "grad_norm": 0.28393932823044754, "learning_rate": 1.002213095827875e-07, "loss": 0.542944610118866, "num_tokens": 330754847.0, "step": 615 }, { "epoch": 4.597014925373134, "grad_norm": 0.25741865731733854, "learning_rate": 1.0015369105983216e-07, "loss": 0.5193674564361572, "num_tokens": 331683385.0, "step": 616 }, { "epoch": 4.604477611940299, "grad_norm": 0.28009977442278555, "learning_rate": 1.0009836429410053e-07, "loss": 0.5400401949882507, "num_tokens": 332655820.0, "step": 617 }, { "epoch": 4.611940298507463, "grad_norm": 0.2840147519658574, "learning_rate": 1.0005533079736037e-07, "loss": 0.5117232203483582, "num_tokens": 333524783.0, "step": 618 }, { "epoch": 4.619402985074627, "grad_norm": 0.25572802620429147, "learning_rate": 1.0002459174547398e-07, "loss": 0.5419676303863525, "num_tokens": 334451276.0, "step": 619 }, { "epoch": 4.6268656716417915, "grad_norm": 0.2837927983840193, "learning_rate": 1.0000614797836585e-07, "loss": 0.5926541090011597, "num_tokens": 335471551.0, "step": 620 }, { "epoch": 4.6268656716417915, "step": 620, "total_flos": 829202911068160.0, "train_loss": 0.33651591361530364, "train_runtime": 10914.7384, "train_samples_per_second": 1.818, "train_steps_per_second": 0.057 } ], "logging_steps": 1, "max_steps": 620, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 62, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 829202911068160.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }