{ "best_global_step": 4170, "best_metric": 0.9509243994462685, "best_model_checkpoint": "/workspaces/decompile_search/data/models/jan_experiments/stripped_unstripped_22b_unstripped_stop/checkpoint-4170", "epoch": 0.8347588717015468, "eval_steps": 30, "global_step": 4170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.9579070736061442, "epoch": 0.0010009099181073704, "grad_norm": 31.209657669067383, "learning_rate": 8e-09, "loss": 1.2257, "mean_token_accuracy": 0.7700222129171544, "num_tokens": 1145748.0, "step": 5 }, { "entropy": 1.011449571089311, "epoch": 0.002001819836214741, "grad_norm": 23.227827072143555, "learning_rate": 1.8e-08, "loss": 1.2785, "mean_token_accuracy": 0.7474062916907397, "num_tokens": 2249793.0, "step": 10 }, { "entropy": 1.05531109679829, "epoch": 0.003002729754322111, "grad_norm": 17.10189437866211, "learning_rate": 2.8e-08, "loss": 1.3173, "mean_token_accuracy": 0.732902865247293, "num_tokens": 3277722.0, "step": 15 }, { "entropy": 1.080242946473035, "epoch": 0.004003639672429482, "grad_norm": 10.145275115966797, "learning_rate": 3.7999999999999996e-08, "loss": 1.3569, "mean_token_accuracy": 0.7288111207160083, "num_tokens": 4232543.0, "step": 20 }, { "entropy": 1.1169310082088817, "epoch": 0.005004549590536852, "grad_norm": 41.83009338378906, "learning_rate": 4.8e-08, "loss": 1.548, "mean_token_accuracy": 0.7169481502337889, "num_tokens": 4962149.0, "step": 25 }, { "entropy": 0.9460688650608062, "epoch": 0.006005459508644222, "grad_norm": 29.797571182250977, "learning_rate": 5.8e-08, "loss": 1.1969, "mean_token_accuracy": 0.7767708816311576, "num_tokens": 6078015.0, "step": 30 }, { "epoch": 0.006005459508644222, "eval_entropy": 0.8858045388440616, "eval_loss": 1.0837815999984741, "eval_mean_token_accuracy": 0.7985450283425753, "eval_num_tokens": 6078015.0, "eval_runtime": 7.1989, "eval_samples_per_second": 135.159, "eval_steps_per_second": 8.473, "step": 30 }, { "entropy": 1.004025985436006, "epoch": 0.0070063694267515925, "grad_norm": 20.322914123535156, "learning_rate": 6.8e-08, "loss": 1.2249, "mean_token_accuracy": 0.757005724310875, "num_tokens": 7170484.0, "step": 35 }, { "entropy": 1.0326186922463503, "epoch": 0.008007279344858963, "grad_norm": 14.3685302734375, "learning_rate": 7.8e-08, "loss": 1.2034, "mean_token_accuracy": 0.7555106471885334, "num_tokens": 8215978.0, "step": 40 }, { "entropy": 1.050350191376426, "epoch": 0.009008189262966333, "grad_norm": 8.891011238098145, "learning_rate": 8.8e-08, "loss": 1.2058, "mean_token_accuracy": 0.7581022883003409, "num_tokens": 9176310.0, "step": 45 }, { "entropy": 1.0481942902911794, "epoch": 0.010009099181073703, "grad_norm": 25.542049407958984, "learning_rate": 9.8e-08, "loss": 1.2043, "mean_token_accuracy": 0.7758935884995894, "num_tokens": 9907937.0, "step": 50 }, { "entropy": 0.8896388709545135, "epoch": 0.011010009099181074, "grad_norm": 17.134859085083008, "learning_rate": 1.0799999999999999e-07, "loss": 0.9333, "mean_token_accuracy": 0.8181748888709328, "num_tokens": 11040470.0, "step": 55 }, { "entropy": 0.9338755531744524, "epoch": 0.012010919017288443, "grad_norm": 11.280884742736816, "learning_rate": 1.1799999999999998e-07, "loss": 0.9171, "mean_token_accuracy": 0.809243483435024, "num_tokens": 12138554.0, "step": 60 }, { "epoch": 0.012010919017288443, "eval_entropy": 0.827404188328102, "eval_loss": 0.7739703059196472, "eval_mean_token_accuracy": 0.8451327916051521, "eval_num_tokens": 12138554.0, "eval_runtime": 7.1713, "eval_samples_per_second": 135.679, "eval_steps_per_second": 8.506, "step": 60 }, { "entropy": 0.9394035225564783, "epoch": 0.013011828935395814, "grad_norm": 6.800612449645996, "learning_rate": 1.28e-07, "loss": 0.816, "mean_token_accuracy": 0.8172335895625028, "num_tokens": 13161971.0, "step": 65 }, { "entropy": 0.931338392604481, "epoch": 0.014012738853503185, "grad_norm": 3.9526984691619873, "learning_rate": 1.3800000000000002e-07, "loss": 0.729, "mean_token_accuracy": 0.8396818897940895, "num_tokens": 14121047.0, "step": 70 }, { "entropy": 0.9432308787649328, "epoch": 0.015013648771610554, "grad_norm": 13.202656745910645, "learning_rate": 1.4799999999999998e-07, "loss": 0.7149, "mean_token_accuracy": 0.8470803531733426, "num_tokens": 14853168.0, "step": 75 }, { "entropy": 0.8036102744666013, "epoch": 0.016014558689717927, "grad_norm": 5.494961261749268, "learning_rate": 1.5799999999999999e-07, "loss": 0.5533, "mean_token_accuracy": 0.8729204730554061, "num_tokens": 16006267.0, "step": 80 }, { "entropy": 0.8627965910868212, "epoch": 0.017015468607825296, "grad_norm": 3.109562635421753, "learning_rate": 1.68e-07, "loss": 0.527, "mean_token_accuracy": 0.8722775372591886, "num_tokens": 17103593.0, "step": 85 }, { "entropy": 0.8908803132447329, "epoch": 0.018016378525932665, "grad_norm": 1.8945680856704712, "learning_rate": 1.7799999999999998e-07, "loss": 0.5005, "mean_token_accuracy": 0.8758173530752008, "num_tokens": 18130627.0, "step": 90 }, { "epoch": 0.018016378525932665, "eval_entropy": 0.7762856190321875, "eval_loss": 0.40602007508277893, "eval_mean_token_accuracy": 0.899208920900939, "eval_num_tokens": 18130627.0, "eval_runtime": 7.1881, "eval_samples_per_second": 135.362, "eval_steps_per_second": 8.486, "step": 90 }, { "entropy": 0.9098557423461567, "epoch": 0.019017288444040038, "grad_norm": 1.237993597984314, "learning_rate": 1.88e-07, "loss": 0.4722, "mean_token_accuracy": 0.8838203982873396, "num_tokens": 19076434.0, "step": 95 }, { "entropy": 0.9214537311684001, "epoch": 0.020018198362147407, "grad_norm": 5.1105427742004395, "learning_rate": 1.98e-07, "loss": 0.4761, "mean_token_accuracy": 0.885197820446708, "num_tokens": 19806235.0, "step": 100 }, { "entropy": 0.7998497681184249, "epoch": 0.021019108280254776, "grad_norm": 2.0025761127471924, "learning_rate": 2.0799999999999998e-07, "loss": 0.4172, "mean_token_accuracy": 0.8957362061197107, "num_tokens": 20936435.0, "step": 105 }, { "entropy": 0.8544187637892636, "epoch": 0.02202001819836215, "grad_norm": 1.4504088163375854, "learning_rate": 2.18e-07, "loss": 0.4103, "mean_token_accuracy": 0.8934655557979237, "num_tokens": 22013484.0, "step": 110 }, { "entropy": 0.8855489952997728, "epoch": 0.023020928116469518, "grad_norm": 1.1078195571899414, "learning_rate": 2.28e-07, "loss": 0.4117, "mean_token_accuracy": 0.8933108893307773, "num_tokens": 23045886.0, "step": 115 }, { "entropy": 0.9073191062970595, "epoch": 0.024021838034576887, "grad_norm": 0.9317752718925476, "learning_rate": 2.38e-07, "loss": 0.4015, "mean_token_accuracy": 0.8976379817182367, "num_tokens": 23997532.0, "step": 120 }, { "epoch": 0.024021838034576887, "eval_entropy": 0.7722517046772066, "eval_loss": 0.29370784759521484, "eval_mean_token_accuracy": 0.9163104364129363, "eval_num_tokens": 23997532.0, "eval_runtime": 7.1752, "eval_samples_per_second": 135.606, "eval_steps_per_second": 8.502, "step": 120 }, { "entropy": 0.9209811920469457, "epoch": 0.02502274795268426, "grad_norm": 3.9668571949005127, "learning_rate": 2.48e-07, "loss": 0.4115, "mean_token_accuracy": 0.8966231562874534, "num_tokens": 24736731.0, "step": 125 }, { "entropy": 0.7977617914026434, "epoch": 0.02602365787079163, "grad_norm": 1.2996954917907715, "learning_rate": 2.58e-07, "loss": 0.3835, "mean_token_accuracy": 0.9029265501282432, "num_tokens": 25872435.0, "step": 130 }, { "entropy": 0.8485897243022918, "epoch": 0.027024567788898998, "grad_norm": 1.0686163902282715, "learning_rate": 2.68e-07, "loss": 0.3879, "mean_token_accuracy": 0.8995744439688595, "num_tokens": 26959841.0, "step": 135 }, { "entropy": 0.8709002348509702, "epoch": 0.02802547770700637, "grad_norm": 0.92051762342453, "learning_rate": 2.7800000000000003e-07, "loss": 0.3856, "mean_token_accuracy": 0.9004006510431116, "num_tokens": 27993013.0, "step": 140 }, { "entropy": 0.8909420809962533, "epoch": 0.02902638762511374, "grad_norm": 0.8413939476013184, "learning_rate": 2.88e-07, "loss": 0.388, "mean_token_accuracy": 0.9014144734902816, "num_tokens": 28946080.0, "step": 145 }, { "entropy": 0.9160865225575187, "epoch": 0.03002729754322111, "grad_norm": 4.02235221862793, "learning_rate": 2.98e-07, "loss": 0.3961, "mean_token_accuracy": 0.9003071714531291, "num_tokens": 29669559.0, "step": 150 }, { "epoch": 0.03002729754322111, "eval_entropy": 0.773854380748311, "eval_loss": 0.2762528359889984, "eval_mean_token_accuracy": 0.9212246898744927, "eval_num_tokens": 29669559.0, "eval_runtime": 7.2541, "eval_samples_per_second": 134.132, "eval_steps_per_second": 8.409, "step": 150 }, { "entropy": 0.7960180721499703, "epoch": 0.03102820746132848, "grad_norm": 1.3376996517181396, "learning_rate": 3.08e-07, "loss": 0.3679, "mean_token_accuracy": 0.9063912429592826, "num_tokens": 30786460.0, "step": 155 }, { "entropy": 0.8393291793086312, "epoch": 0.032029117379435854, "grad_norm": 0.9782769083976746, "learning_rate": 3.18e-07, "loss": 0.3739, "mean_token_accuracy": 0.9036351117220792, "num_tokens": 31893205.0, "step": 160 }, { "entropy": 0.8657339849255302, "epoch": 0.03303002729754322, "grad_norm": 0.916830837726593, "learning_rate": 3.28e-07, "loss": 0.3732, "mean_token_accuracy": 0.9034594573757865, "num_tokens": 32917211.0, "step": 165 }, { "entropy": 0.8868166403336959, "epoch": 0.03403093721565059, "grad_norm": 0.8498401045799255, "learning_rate": 3.38e-07, "loss": 0.3753, "mean_token_accuracy": 0.9043488231572238, "num_tokens": 33872966.0, "step": 170 }, { "entropy": 0.9020868615670637, "epoch": 0.03503184713375796, "grad_norm": 3.632817506790161, "learning_rate": 3.4799999999999994e-07, "loss": 0.3814, "mean_token_accuracy": 0.9035663994875821, "num_tokens": 34603817.0, "step": 175 }, { "entropy": 0.794374041665684, "epoch": 0.03603275705186533, "grad_norm": 1.3152629137039185, "learning_rate": 3.5799999999999995e-07, "loss": 0.3598, "mean_token_accuracy": 0.9083835087039254, "num_tokens": 35738363.0, "step": 180 }, { "epoch": 0.03603275705186533, "eval_entropy": 0.7766775371598416, "eval_loss": 0.2652251720428467, "eval_mean_token_accuracy": 0.9241212303521203, "eval_num_tokens": 35738363.0, "eval_runtime": 7.1835, "eval_samples_per_second": 135.449, "eval_steps_per_second": 8.492, "step": 180 }, { "entropy": 0.8374889547174628, "epoch": 0.0370336669699727, "grad_norm": 1.069807767868042, "learning_rate": 3.6799999999999996e-07, "loss": 0.362, "mean_token_accuracy": 0.906164778362621, "num_tokens": 36835769.0, "step": 185 }, { "entropy": 0.865638066963716, "epoch": 0.038034576888080075, "grad_norm": 0.9059945344924927, "learning_rate": 3.7799999999999997e-07, "loss": 0.3667, "mean_token_accuracy": 0.9057478319514881, "num_tokens": 37868585.0, "step": 190 }, { "entropy": 0.8880685941739516, "epoch": 0.039035486806187444, "grad_norm": 0.7509809136390686, "learning_rate": 3.88e-07, "loss": 0.3578, "mean_token_accuracy": 0.9070773032578555, "num_tokens": 38807693.0, "step": 195 }, { "entropy": 0.909015315771103, "epoch": 0.040036396724294813, "grad_norm": 3.0933594703674316, "learning_rate": 3.98e-07, "loss": 0.3733, "mean_token_accuracy": 0.9064179236238653, "num_tokens": 39535122.0, "step": 200 }, { "entropy": 0.7939805047078566, "epoch": 0.04103730664240218, "grad_norm": 1.179566740989685, "learning_rate": 4.0799999999999995e-07, "loss": 0.3551, "mean_token_accuracy": 0.9092546192082491, "num_tokens": 40666622.0, "step": 205 }, { "entropy": 0.846534158966758, "epoch": 0.04203821656050955, "grad_norm": 0.9151410460472107, "learning_rate": 4.1799999999999996e-07, "loss": 0.3589, "mean_token_accuracy": 0.9077435683120381, "num_tokens": 41748748.0, "step": 210 }, { "epoch": 0.04203821656050955, "eval_entropy": 0.7851541091184147, "eval_loss": 0.25734254717826843, "eval_mean_token_accuracy": 0.9266526767464934, "eval_num_tokens": 41748748.0, "eval_runtime": 7.1633, "eval_samples_per_second": 135.832, "eval_steps_per_second": 8.516, "step": 210 }, { "entropy": 0.8759665044871243, "epoch": 0.04303912647861693, "grad_norm": 1.0037354230880737, "learning_rate": 4.2799999999999997e-07, "loss": 0.3613, "mean_token_accuracy": 0.906467653946443, "num_tokens": 42776021.0, "step": 215 }, { "entropy": 0.8946322251449932, "epoch": 0.0440400363967243, "grad_norm": 0.7233147621154785, "learning_rate": 4.38e-07, "loss": 0.3685, "mean_token_accuracy": 0.9061482787132263, "num_tokens": 43731864.0, "step": 220 }, { "entropy": 0.9084791925820437, "epoch": 0.045040946314831666, "grad_norm": 3.0358214378356934, "learning_rate": 4.48e-07, "loss": 0.3644, "mean_token_accuracy": 0.9079454855485396, "num_tokens": 44458397.0, "step": 225 }, { "entropy": 0.795162683725357, "epoch": 0.046041856232939035, "grad_norm": 1.060544490814209, "learning_rate": 4.58e-07, "loss": 0.347, "mean_token_accuracy": 0.9115611629052596, "num_tokens": 45582504.0, "step": 230 }, { "entropy": 0.8501786047762091, "epoch": 0.047042766151046404, "grad_norm": 0.8345633745193481, "learning_rate": 4.68e-07, "loss": 0.3518, "mean_token_accuracy": 0.9087835637005892, "num_tokens": 46691664.0, "step": 235 }, { "entropy": 0.8739784273234281, "epoch": 0.04804367606915377, "grad_norm": 0.8899365067481995, "learning_rate": 4.779999999999999e-07, "loss": 0.354, "mean_token_accuracy": 0.9071491924199191, "num_tokens": 47712971.0, "step": 240 }, { "epoch": 0.04804367606915377, "eval_entropy": 0.7832569479942322, "eval_loss": 0.25191357731819153, "eval_mean_token_accuracy": 0.9278596326953075, "eval_num_tokens": 47712971.0, "eval_runtime": 7.1165, "eval_samples_per_second": 136.725, "eval_steps_per_second": 8.572, "step": 240 }, { "entropy": 0.8918125726959922, "epoch": 0.04904458598726115, "grad_norm": 0.7503334879875183, "learning_rate": 4.879999999999999e-07, "loss": 0.3536, "mean_token_accuracy": 0.9086009448224848, "num_tokens": 48657962.0, "step": 245 }, { "entropy": 0.9069400212981484, "epoch": 0.05004549590536852, "grad_norm": 2.7447826862335205, "learning_rate": 4.979999999999999e-07, "loss": 0.3607, "mean_token_accuracy": 0.9092577500776811, "num_tokens": 49382591.0, "step": 250 }, { "entropy": 0.7897281029007651, "epoch": 0.05104640582347589, "grad_norm": 1.0658516883850098, "learning_rate": 5.079999999999999e-07, "loss": 0.3343, "mean_token_accuracy": 0.9143867644396695, "num_tokens": 50493880.0, "step": 255 }, { "entropy": 0.8420896362174641, "epoch": 0.05204731574158326, "grad_norm": 0.9911607503890991, "learning_rate": 5.18e-07, "loss": 0.3467, "mean_token_accuracy": 0.9096509906378659, "num_tokens": 51593094.0, "step": 260 }, { "entropy": 0.8670889122919603, "epoch": 0.053048225659690626, "grad_norm": 0.8509455323219299, "learning_rate": 5.28e-07, "loss": 0.3429, "mean_token_accuracy": 0.9105742725459012, "num_tokens": 52624215.0, "step": 265 }, { "entropy": 0.8908169654282656, "epoch": 0.054049135577797995, "grad_norm": 1.258712649345398, "learning_rate": 5.38e-07, "loss": 0.3471, "mean_token_accuracy": 0.9100607395172119, "num_tokens": 53579583.0, "step": 270 }, { "epoch": 0.054049135577797995, "eval_entropy": 0.781644055100738, "eval_loss": 0.24734267592430115, "eval_mean_token_accuracy": 0.9283195352945172, "eval_num_tokens": 53579583.0, "eval_runtime": 7.1344, "eval_samples_per_second": 136.382, "eval_steps_per_second": 8.55, "step": 270 }, { "entropy": 0.8974328610030088, "epoch": 0.05505004549590537, "grad_norm": 2.7811849117279053, "learning_rate": 5.48e-07, "loss": 0.3531, "mean_token_accuracy": 0.9098310795697299, "num_tokens": 54306230.0, "step": 275 }, { "entropy": 0.790298336202448, "epoch": 0.05605095541401274, "grad_norm": 1.0269526243209839, "learning_rate": 5.58e-07, "loss": 0.3426, "mean_token_accuracy": 0.912391439893029, "num_tokens": 55434008.0, "step": 280 }, { "entropy": 0.8415309033610604, "epoch": 0.05705186533212011, "grad_norm": 0.8907763957977295, "learning_rate": 5.679999999999999e-07, "loss": 0.3457, "mean_token_accuracy": 0.9103805682875893, "num_tokens": 56527326.0, "step": 285 }, { "entropy": 0.8678737878799438, "epoch": 0.05805277525022748, "grad_norm": 0.933925449848175, "learning_rate": 5.779999999999999e-07, "loss": 0.3435, "mean_token_accuracy": 0.9097982775081288, "num_tokens": 57577716.0, "step": 290 }, { "entropy": 0.8823735535144805, "epoch": 0.05905368516833485, "grad_norm": 0.7687504291534424, "learning_rate": 5.879999999999999e-07, "loss": 0.3448, "mean_token_accuracy": 0.9104289829730987, "num_tokens": 58525576.0, "step": 295 }, { "entropy": 0.8998256000605497, "epoch": 0.06005459508644222, "grad_norm": 2.8133366107940674, "learning_rate": 5.979999999999999e-07, "loss": 0.3474, "mean_token_accuracy": 0.9119950543750416, "num_tokens": 59255710.0, "step": 300 }, { "epoch": 0.06005459508644222, "eval_entropy": 0.7829059782575388, "eval_loss": 0.24350076913833618, "eval_mean_token_accuracy": 0.9303004888237499, "eval_num_tokens": 59255710.0, "eval_runtime": 7.4044, "eval_samples_per_second": 131.408, "eval_steps_per_second": 8.238, "step": 300 }, { "entropy": 0.7927286684513092, "epoch": 0.06105550500454959, "grad_norm": 0.9818949103355408, "learning_rate": 6.079999999999999e-07, "loss": 0.3337, "mean_token_accuracy": 0.9142222328619524, "num_tokens": 60371359.0, "step": 305 }, { "entropy": 0.8404727691953833, "epoch": 0.06205641492265696, "grad_norm": 0.8817327618598938, "learning_rate": 6.18e-07, "loss": 0.336, "mean_token_accuracy": 0.9129292596470225, "num_tokens": 61459516.0, "step": 310 }, { "entropy": 0.8821817175908522, "epoch": 0.06305732484076433, "grad_norm": 1.0255104303359985, "learning_rate": 6.28e-07, "loss": 0.3445, "mean_token_accuracy": 0.9102013338695872, "num_tokens": 62511188.0, "step": 315 }, { "entropy": 0.8907575244253332, "epoch": 0.06405823475887171, "grad_norm": 0.6242014169692993, "learning_rate": 6.38e-07, "loss": 0.3306, "mean_token_accuracy": 0.9132393593137914, "num_tokens": 63462734.0, "step": 320 }, { "entropy": 0.9019130771810359, "epoch": 0.06505914467697907, "grad_norm": 2.4068405628204346, "learning_rate": 6.48e-07, "loss": 0.3431, "mean_token_accuracy": 0.9133785475384105, "num_tokens": 64195174.0, "step": 325 }, { "entropy": 0.7956698184663599, "epoch": 0.06606005459508645, "grad_norm": 1.0124462842941284, "learning_rate": 6.58e-07, "loss": 0.3232, "mean_token_accuracy": 0.916981242461638, "num_tokens": 65310690.0, "step": 330 }, { "epoch": 0.06606005459508645, "eval_entropy": 0.7826571161629724, "eval_loss": 0.2395564466714859, "eval_mean_token_accuracy": 0.9312737300747731, "eval_num_tokens": 65310690.0, "eval_runtime": 7.1164, "eval_samples_per_second": 136.726, "eval_steps_per_second": 8.572, "step": 330 }, { "entropy": 0.8469071046872573, "epoch": 0.06706096451319381, "grad_norm": 0.8455765843391418, "learning_rate": 6.68e-07, "loss": 0.3336, "mean_token_accuracy": 0.9130166606469587, "num_tokens": 66407916.0, "step": 335 }, { "entropy": 0.8715857310728593, "epoch": 0.06806187443130118, "grad_norm": 0.848980188369751, "learning_rate": 6.78e-07, "loss": 0.332, "mean_token_accuracy": 0.9133841227401387, "num_tokens": 67462675.0, "step": 340 }, { "entropy": 0.8887097320773385, "epoch": 0.06906278434940856, "grad_norm": 0.7524086236953735, "learning_rate": 6.879999999999999e-07, "loss": 0.3293, "mean_token_accuracy": 0.9141406866637143, "num_tokens": 68430686.0, "step": 345 }, { "entropy": 0.9178955408659848, "epoch": 0.07006369426751592, "grad_norm": 2.6862826347351074, "learning_rate": 6.979999999999999e-07, "loss": 0.3503, "mean_token_accuracy": 0.910759204084223, "num_tokens": 69159538.0, "step": 350 }, { "entropy": 0.7949739504944194, "epoch": 0.0710646041856233, "grad_norm": 0.939888060092926, "learning_rate": 7.079999999999999e-07, "loss": 0.3226, "mean_token_accuracy": 0.917046070098877, "num_tokens": 70297859.0, "step": 355 }, { "entropy": 0.8522608453577215, "epoch": 0.07206551410373066, "grad_norm": 0.8733773827552795, "learning_rate": 7.179999999999999e-07, "loss": 0.3245, "mean_token_accuracy": 0.9145838070999492, "num_tokens": 71388877.0, "step": 360 }, { "epoch": 0.07206551410373066, "eval_entropy": 0.7842210042672079, "eval_loss": 0.23638135194778442, "eval_mean_token_accuracy": 0.9317856763229996, "eval_num_tokens": 71388877.0, "eval_runtime": 7.0752, "eval_samples_per_second": 137.523, "eval_steps_per_second": 8.622, "step": 360 }, { "entropy": 0.8792784777554599, "epoch": 0.07306642402183804, "grad_norm": 0.9495289921760559, "learning_rate": 7.28e-07, "loss": 0.3357, "mean_token_accuracy": 0.9130886435508728, "num_tokens": 72425465.0, "step": 365 }, { "entropy": 0.8881763842972842, "epoch": 0.0740673339399454, "grad_norm": 0.7181362509727478, "learning_rate": 7.38e-07, "loss": 0.3273, "mean_token_accuracy": 0.9161153722893108, "num_tokens": 73377921.0, "step": 370 }, { "entropy": 0.9161553989757191, "epoch": 0.07506824385805277, "grad_norm": 2.3346126079559326, "learning_rate": 7.48e-07, "loss": 0.3416, "mean_token_accuracy": 0.9133548351851377, "num_tokens": 74109742.0, "step": 375 }, { "entropy": 0.7970652249726382, "epoch": 0.07606915377616015, "grad_norm": 0.9897024035453796, "learning_rate": 7.58e-07, "loss": 0.3206, "mean_token_accuracy": 0.9177604393525557, "num_tokens": 75254960.0, "step": 380 }, { "entropy": 0.8417174593968825, "epoch": 0.07707006369426751, "grad_norm": 1.0447583198547363, "learning_rate": 7.68e-07, "loss": 0.3204, "mean_token_accuracy": 0.9170865096829154, "num_tokens": 76368490.0, "step": 385 }, { "entropy": 0.8782219377431002, "epoch": 0.07807097361237489, "grad_norm": 0.9332379698753357, "learning_rate": 7.78e-07, "loss": 0.3301, "mean_token_accuracy": 0.9143734498457475, "num_tokens": 77412393.0, "step": 390 }, { "epoch": 0.07807097361237489, "eval_entropy": 0.779330243829821, "eval_loss": 0.23134513199329376, "eval_mean_token_accuracy": 0.9325118944293163, "eval_num_tokens": 77412393.0, "eval_runtime": 7.1925, "eval_samples_per_second": 135.279, "eval_steps_per_second": 8.481, "step": 390 }, { "entropy": 0.8939531375061381, "epoch": 0.07907188353048225, "grad_norm": 0.7231958508491516, "learning_rate": 7.88e-07, "loss": 0.3267, "mean_token_accuracy": 0.9155195750973442, "num_tokens": 78362341.0, "step": 395 }, { "entropy": 0.9062723474069075, "epoch": 0.08007279344858963, "grad_norm": 2.0436787605285645, "learning_rate": 7.98e-07, "loss": 0.3365, "mean_token_accuracy": 0.9146250042048367, "num_tokens": 79096898.0, "step": 400 }, { "entropy": 0.7945969061418013, "epoch": 0.081073703366697, "grad_norm": 0.9359119534492493, "learning_rate": 8.08e-07, "loss": 0.3177, "mean_token_accuracy": 0.9181306638500907, "num_tokens": 80217692.0, "step": 405 }, { "entropy": 0.844357325813987, "epoch": 0.08207461328480437, "grad_norm": 0.8438592553138733, "learning_rate": 8.179999999999999e-07, "loss": 0.3203, "mean_token_accuracy": 0.9169050964442167, "num_tokens": 81307270.0, "step": 410 }, { "entropy": 0.8725964551622217, "epoch": 0.08307552320291174, "grad_norm": 0.7924903035163879, "learning_rate": 8.28e-07, "loss": 0.3209, "mean_token_accuracy": 0.9159043675119226, "num_tokens": 82357546.0, "step": 415 }, { "entropy": 0.8886379382827065, "epoch": 0.0840764331210191, "grad_norm": 0.7205538153648376, "learning_rate": 8.38e-07, "loss": 0.3235, "mean_token_accuracy": 0.9162523784420707, "num_tokens": 83328951.0, "step": 420 }, { "epoch": 0.0840764331210191, "eval_entropy": 0.7785051123040622, "eval_loss": 0.2309650033712387, "eval_mean_token_accuracy": 0.9339301380954805, "eval_num_tokens": 83328951.0, "eval_runtime": 7.1338, "eval_samples_per_second": 136.393, "eval_steps_per_second": 8.551, "step": 420 }, { "entropy": 0.9015932933850722, "epoch": 0.08507734303912648, "grad_norm": 2.2584614753723145, "learning_rate": 8.48e-07, "loss": 0.3281, "mean_token_accuracy": 0.9161337093873457, "num_tokens": 84065002.0, "step": 425 }, { "entropy": 0.7703537437048825, "epoch": 0.08607825295723386, "grad_norm": 0.8600314855575562, "learning_rate": 8.58e-07, "loss": 0.3051, "mean_token_accuracy": 0.9216179517182437, "num_tokens": 85217505.0, "step": 430 }, { "entropy": 0.837453713200309, "epoch": 0.08707916287534122, "grad_norm": 0.837756335735321, "learning_rate": 8.68e-07, "loss": 0.3171, "mean_token_accuracy": 0.9169958277182145, "num_tokens": 86302592.0, "step": 435 }, { "entropy": 0.8648753382942893, "epoch": 0.0880800727934486, "grad_norm": 0.8668557405471802, "learning_rate": 8.78e-07, "loss": 0.3128, "mean_token_accuracy": 0.9178522229194641, "num_tokens": 87371908.0, "step": 440 }, { "entropy": 0.8813207095319574, "epoch": 0.08908098271155596, "grad_norm": 0.7655621767044067, "learning_rate": 8.88e-07, "loss": 0.325, "mean_token_accuracy": 0.9160207412459633, "num_tokens": 88338748.0, "step": 445 }, { "entropy": 0.8887845928018744, "epoch": 0.09008189262966333, "grad_norm": 1.8667429685592651, "learning_rate": 8.98e-07, "loss": 0.3201, "mean_token_accuracy": 0.918709414655512, "num_tokens": 89071435.0, "step": 450 }, { "epoch": 0.09008189262966333, "eval_entropy": 0.7726943004326742, "eval_loss": 0.2284156084060669, "eval_mean_token_accuracy": 0.9343840216026932, "eval_num_tokens": 89071435.0, "eval_runtime": 7.0961, "eval_samples_per_second": 137.118, "eval_steps_per_second": 8.596, "step": 450 }, { "entropy": 0.7833973987535997, "epoch": 0.0910828025477707, "grad_norm": 0.9149163365364075, "learning_rate": 9.08e-07, "loss": 0.309, "mean_token_accuracy": 0.9209194253791463, "num_tokens": 90204895.0, "step": 455 }, { "entropy": 0.8328197641806169, "epoch": 0.09208371246587807, "grad_norm": 0.8828219771385193, "learning_rate": 9.18e-07, "loss": 0.3113, "mean_token_accuracy": 0.9182033365423029, "num_tokens": 91290928.0, "step": 460 }, { "entropy": 0.8542791778391058, "epoch": 0.09308462238398545, "grad_norm": 0.847339928150177, "learning_rate": 9.28e-07, "loss": 0.3111, "mean_token_accuracy": 0.9180883992802013, "num_tokens": 92353449.0, "step": 465 }, { "entropy": 0.8792417260733518, "epoch": 0.09408553230209281, "grad_norm": 0.6507946848869324, "learning_rate": 9.379999999999998e-07, "loss": 0.3093, "mean_token_accuracy": 0.9185398605736819, "num_tokens": 93329097.0, "step": 470 }, { "entropy": 0.8795178760181773, "epoch": 0.09508644222020018, "grad_norm": 1.9719516038894653, "learning_rate": 9.479999999999999e-07, "loss": 0.3248, "mean_token_accuracy": 0.9174890155142004, "num_tokens": 94076001.0, "step": 475 }, { "entropy": 0.7680907699194821, "epoch": 0.09608735213830755, "grad_norm": 0.9014910459518433, "learning_rate": 9.58e-07, "loss": 0.2999, "mean_token_accuracy": 0.9230750652876767, "num_tokens": 95199341.0, "step": 480 }, { "epoch": 0.09608735213830755, "eval_entropy": 0.7766347244137624, "eval_loss": 0.22492578625679016, "eval_mean_token_accuracy": 0.9348700437389437, "eval_num_tokens": 95199341.0, "eval_runtime": 7.167, "eval_samples_per_second": 135.761, "eval_steps_per_second": 8.511, "step": 480 }, { "entropy": 0.8319806510751898, "epoch": 0.09708826205641492, "grad_norm": 0.8409860134124756, "learning_rate": 9.679999999999999e-07, "loss": 0.3096, "mean_token_accuracy": 0.919183918020942, "num_tokens": 96280405.0, "step": 485 }, { "entropy": 0.8512482968243685, "epoch": 0.0980891719745223, "grad_norm": 0.7460948824882507, "learning_rate": 9.78e-07, "loss": 0.3048, "mean_token_accuracy": 0.9195660710334778, "num_tokens": 97324958.0, "step": 490 }, { "entropy": 0.8670887063850056, "epoch": 0.09909008189262966, "grad_norm": 0.7144485116004944, "learning_rate": 9.88e-07, "loss": 0.3047, "mean_token_accuracy": 0.9209712348201058, "num_tokens": 98281184.0, "step": 495 }, { "entropy": 0.876132286678661, "epoch": 0.10009099181073704, "grad_norm": 2.102391242980957, "learning_rate": 9.98e-07, "loss": 0.3134, "mean_token_accuracy": 0.9201786610213193, "num_tokens": 99020369.0, "step": 500 }, { "entropy": 0.772329353202473, "epoch": 0.1010919017288444, "grad_norm": 0.9755299687385559, "learning_rate": 1.008e-06, "loss": 0.3018, "mean_token_accuracy": 0.9223481546748769, "num_tokens": 100156631.0, "step": 505 }, { "entropy": 0.8236495657400651, "epoch": 0.10209281164695178, "grad_norm": 0.8604740500450134, "learning_rate": 1.018e-06, "loss": 0.3032, "mean_token_accuracy": 0.9208208913152868, "num_tokens": 101235364.0, "step": 510 }, { "epoch": 0.10209281164695178, "eval_entropy": 0.7710273295152382, "eval_loss": 0.22457090020179749, "eval_mean_token_accuracy": 0.9353827013344062, "eval_num_tokens": 101235364.0, "eval_runtime": 7.1319, "eval_samples_per_second": 136.429, "eval_steps_per_second": 8.553, "step": 510 }, { "entropy": 0.856683783639561, "epoch": 0.10309372156505915, "grad_norm": 0.9585539102554321, "learning_rate": 1.028e-06, "loss": 0.3121, "mean_token_accuracy": 0.9176510073921897, "num_tokens": 102253625.0, "step": 515 }, { "entropy": 0.8793982776728544, "epoch": 0.10409463148316651, "grad_norm": 0.6796140074729919, "learning_rate": 1.038e-06, "loss": 0.3192, "mean_token_accuracy": 0.9176862608302724, "num_tokens": 103206550.0, "step": 520 }, { "entropy": 0.8828882379965348, "epoch": 0.10509554140127389, "grad_norm": 1.9700580835342407, "learning_rate": 1.048e-06, "loss": 0.3138, "mean_token_accuracy": 0.9197612930427898, "num_tokens": 103937735.0, "step": 525 }, { "entropy": 0.7799841582775116, "epoch": 0.10609645131938125, "grad_norm": 0.9521822929382324, "learning_rate": 1.058e-06, "loss": 0.2993, "mean_token_accuracy": 0.9223499368537557, "num_tokens": 105062562.0, "step": 530 }, { "entropy": 0.8236062943935394, "epoch": 0.10709736123748863, "grad_norm": 0.8883262276649475, "learning_rate": 1.068e-06, "loss": 0.3043, "mean_token_accuracy": 0.9200864168730649, "num_tokens": 106148854.0, "step": 535 }, { "entropy": 0.855875781991265, "epoch": 0.10809827115559599, "grad_norm": 0.7604862451553345, "learning_rate": 1.078e-06, "loss": 0.301, "mean_token_accuracy": 0.9207951112227006, "num_tokens": 107184016.0, "step": 540 }, { "epoch": 0.10809827115559599, "eval_entropy": 0.7688479355124177, "eval_loss": 0.2222135066986084, "eval_mean_token_accuracy": 0.9355378277966233, "eval_num_tokens": 107184016.0, "eval_runtime": 7.1389, "eval_samples_per_second": 136.295, "eval_steps_per_second": 8.545, "step": 540 }, { "entropy": 0.8719826313582334, "epoch": 0.10909918107370337, "grad_norm": 0.6990140676498413, "learning_rate": 1.088e-06, "loss": 0.3101, "mean_token_accuracy": 0.9189783123406496, "num_tokens": 108136311.0, "step": 545 }, { "entropy": 0.8727446463975039, "epoch": 0.11010009099181074, "grad_norm": 1.9771708250045776, "learning_rate": 1.0980000000000001e-06, "loss": 0.3056, "mean_token_accuracy": 0.9217729514295404, "num_tokens": 108870399.0, "step": 550 }, { "entropy": 0.7766597162593495, "epoch": 0.1111010009099181, "grad_norm": 0.9047304391860962, "learning_rate": 1.108e-06, "loss": 0.2989, "mean_token_accuracy": 0.9228177655826916, "num_tokens": 110010412.0, "step": 555 }, { "entropy": 0.821604372696443, "epoch": 0.11210191082802548, "grad_norm": 0.7859129905700684, "learning_rate": 1.1180000000000001e-06, "loss": 0.2986, "mean_token_accuracy": 0.9216692274267023, "num_tokens": 111101459.0, "step": 560 }, { "entropy": 0.8458013583313335, "epoch": 0.11310282074613284, "grad_norm": 0.7374240159988403, "learning_rate": 1.1279999999999998e-06, "loss": 0.3023, "mean_token_accuracy": 0.9201881116086786, "num_tokens": 112137324.0, "step": 565 }, { "entropy": 0.8739073032682593, "epoch": 0.11410373066424022, "grad_norm": 0.6672995686531067, "learning_rate": 1.138e-06, "loss": 0.3039, "mean_token_accuracy": 0.9213175762783398, "num_tokens": 113102193.0, "step": 570 }, { "epoch": 0.11410373066424022, "eval_entropy": 0.7622656685407044, "eval_loss": 0.21812133491039276, "eval_mean_token_accuracy": 0.9362532193543481, "eval_num_tokens": 113102193.0, "eval_runtime": 7.083, "eval_samples_per_second": 137.372, "eval_steps_per_second": 8.612, "step": 570 }, { "entropy": 0.8674825570800088, "epoch": 0.1151046405823476, "grad_norm": 1.9667503833770752, "learning_rate": 1.1479999999999999e-06, "loss": 0.3032, "mean_token_accuracy": 0.9225559597665614, "num_tokens": 113844512.0, "step": 575 }, { "entropy": 0.7683469571850516, "epoch": 0.11610555050045496, "grad_norm": 0.8600199818611145, "learning_rate": 1.158e-06, "loss": 0.2874, "mean_token_accuracy": 0.9255136945030906, "num_tokens": 114973235.0, "step": 580 }, { "entropy": 0.8179097500714388, "epoch": 0.11710646041856233, "grad_norm": 0.7877588272094727, "learning_rate": 1.1679999999999999e-06, "loss": 0.2992, "mean_token_accuracy": 0.9217895182696256, "num_tokens": 116057315.0, "step": 585 }, { "entropy": 0.8371223628520965, "epoch": 0.1181073703366697, "grad_norm": 0.7893165946006775, "learning_rate": 1.178e-06, "loss": 0.2992, "mean_token_accuracy": 0.9224891754713925, "num_tokens": 117103213.0, "step": 590 }, { "entropy": 0.8535070988264951, "epoch": 0.11910828025477707, "grad_norm": 0.7347795367240906, "learning_rate": 1.1879999999999999e-06, "loss": 0.2933, "mean_token_accuracy": 0.9236816352063959, "num_tokens": 118057061.0, "step": 595 }, { "entropy": 0.873365730047226, "epoch": 0.12010919017288443, "grad_norm": 2.104503870010376, "learning_rate": 1.1979999999999998e-06, "loss": 0.3067, "mean_token_accuracy": 0.9227487694133412, "num_tokens": 118778234.0, "step": 600 }, { "epoch": 0.12010919017288443, "eval_entropy": 0.7565031569512164, "eval_loss": 0.21913714706897736, "eval_mean_token_accuracy": 0.936484013424545, "eval_num_tokens": 118778234.0, "eval_runtime": 7.1122, "eval_samples_per_second": 136.806, "eval_steps_per_second": 8.577, "step": 600 }, { "entropy": 0.7622005002065139, "epoch": 0.12111010009099181, "grad_norm": 0.9007834196090698, "learning_rate": 1.208e-06, "loss": 0.2921, "mean_token_accuracy": 0.9243286257440394, "num_tokens": 119913350.0, "step": 605 }, { "entropy": 0.8152731066400355, "epoch": 0.12211101000909919, "grad_norm": 0.8577731251716614, "learning_rate": 1.2179999999999998e-06, "loss": 0.303, "mean_token_accuracy": 0.9211578873070804, "num_tokens": 121005111.0, "step": 610 }, { "entropy": 0.8311775321310216, "epoch": 0.12311191992720655, "grad_norm": 0.8874506950378418, "learning_rate": 1.228e-06, "loss": 0.2941, "mean_token_accuracy": 0.9234593711116097, "num_tokens": 122053423.0, "step": 615 }, { "entropy": 0.8455709652467207, "epoch": 0.12411282984531392, "grad_norm": 0.6410759687423706, "learning_rate": 1.2379999999999998e-06, "loss": 0.2869, "mean_token_accuracy": 0.9240941790017214, "num_tokens": 123023538.0, "step": 620 }, { "entropy": 0.865269153768366, "epoch": 0.1251137397634213, "grad_norm": 1.943968653678894, "learning_rate": 1.248e-06, "loss": 0.301, "mean_token_accuracy": 0.9235067736018788, "num_tokens": 123756604.0, "step": 625 }, { "entropy": 0.7617347641424699, "epoch": 0.12611464968152866, "grad_norm": 0.8967196941375732, "learning_rate": 1.2579999999999999e-06, "loss": 0.2934, "mean_token_accuracy": 0.9244806538928639, "num_tokens": 124876188.0, "step": 630 }, { "epoch": 0.12611464968152866, "eval_entropy": 0.758461444104304, "eval_loss": 0.21852023899555206, "eval_mean_token_accuracy": 0.9367063485208105, "eval_num_tokens": 124876188.0, "eval_runtime": 7.0895, "eval_samples_per_second": 137.246, "eval_steps_per_second": 8.604, "step": 630 }, { "entropy": 0.8160103900866075, "epoch": 0.12711555959963602, "grad_norm": 0.7799694538116455, "learning_rate": 1.268e-06, "loss": 0.2946, "mean_token_accuracy": 0.9232698521830819, "num_tokens": 125977507.0, "step": 635 }, { "entropy": 0.8289537635716525, "epoch": 0.12811646951774341, "grad_norm": 0.784251868724823, "learning_rate": 1.2779999999999999e-06, "loss": 0.2917, "mean_token_accuracy": 0.9241768289696086, "num_tokens": 127032719.0, "step": 640 }, { "entropy": 0.8573456623337485, "epoch": 0.12911737943585078, "grad_norm": 0.7045451402664185, "learning_rate": 1.288e-06, "loss": 0.2984, "mean_token_accuracy": 0.9235251740975814, "num_tokens": 127988387.0, "step": 645 }, { "entropy": 0.8507555175911297, "epoch": 0.13011828935395814, "grad_norm": 1.819658637046814, "learning_rate": 1.298e-06, "loss": 0.2914, "mean_token_accuracy": 0.924909613349221, "num_tokens": 128721584.0, "step": 650 }, { "entropy": 0.7407966630025343, "epoch": 0.1311191992720655, "grad_norm": 0.8966879844665527, "learning_rate": 1.308e-06, "loss": 0.28, "mean_token_accuracy": 0.92760883027857, "num_tokens": 129860649.0, "step": 655 }, { "entropy": 0.8003853126005693, "epoch": 0.1321201091901729, "grad_norm": 0.8079231977462769, "learning_rate": 1.318e-06, "loss": 0.2894, "mean_token_accuracy": 0.9240497372367166, "num_tokens": 130965498.0, "step": 660 }, { "epoch": 0.1321201091901729, "eval_entropy": 0.750291645526886, "eval_loss": 0.213973730802536, "eval_mean_token_accuracy": 0.9382940315809406, "eval_num_tokens": 130965498.0, "eval_runtime": 7.0264, "eval_samples_per_second": 138.477, "eval_steps_per_second": 8.681, "step": 660 }, { "entropy": 0.8198544599793174, "epoch": 0.13312101910828025, "grad_norm": 0.8206574320793152, "learning_rate": 1.328e-06, "loss": 0.2865, "mean_token_accuracy": 0.9246467048471624, "num_tokens": 132002292.0, "step": 665 }, { "entropy": 0.8489445995200764, "epoch": 0.13412192902638762, "grad_norm": 0.6715352535247803, "learning_rate": 1.338e-06, "loss": 0.2924, "mean_token_accuracy": 0.9234138223257932, "num_tokens": 132956333.0, "step": 670 }, { "entropy": 0.8532397329807282, "epoch": 0.135122838944495, "grad_norm": 1.8673855066299438, "learning_rate": 1.348e-06, "loss": 0.2997, "mean_token_accuracy": 0.9234183029694991, "num_tokens": 133677965.0, "step": 675 }, { "entropy": 0.7524514572186903, "epoch": 0.13612374886260237, "grad_norm": 0.8716238737106323, "learning_rate": 1.358e-06, "loss": 0.2841, "mean_token_accuracy": 0.9266363842920824, "num_tokens": 134802840.0, "step": 680 }, { "entropy": 0.8017716060985218, "epoch": 0.13712465878070973, "grad_norm": 0.7700985074043274, "learning_rate": 1.368e-06, "loss": 0.2855, "mean_token_accuracy": 0.9250672015276822, "num_tokens": 135888025.0, "step": 685 }, { "entropy": 0.8074500967155803, "epoch": 0.13812556869881712, "grad_norm": 0.8650295734405518, "learning_rate": 1.3779999999999998e-06, "loss": 0.286, "mean_token_accuracy": 0.9245203012769873, "num_tokens": 136924539.0, "step": 690 }, { "epoch": 0.13812556869881712, "eval_entropy": 0.7438388193239931, "eval_loss": 0.21345947682857513, "eval_mean_token_accuracy": 0.9379716603482355, "eval_num_tokens": 136924539.0, "eval_runtime": 7.2288, "eval_samples_per_second": 134.6, "eval_steps_per_second": 8.438, "step": 690 }, { "entropy": 0.8413560314611955, "epoch": 0.13912647861692448, "grad_norm": 0.6126115918159485, "learning_rate": 1.3879999999999999e-06, "loss": 0.2903, "mean_token_accuracy": 0.924701126055284, "num_tokens": 137879499.0, "step": 695 }, { "entropy": 0.8566647876392711, "epoch": 0.14012738853503184, "grad_norm": 1.6679223775863647, "learning_rate": 1.3979999999999998e-06, "loss": 0.2998, "mean_token_accuracy": 0.9239558404142206, "num_tokens": 138603916.0, "step": 700 }, { "entropy": 0.7479892709038475, "epoch": 0.1411282984531392, "grad_norm": 0.8888144493103027, "learning_rate": 1.408e-06, "loss": 0.281, "mean_token_accuracy": 0.9272219023921273, "num_tokens": 139731892.0, "step": 705 }, { "entropy": 0.8010973101312464, "epoch": 0.1421292083712466, "grad_norm": 0.7999457716941833, "learning_rate": 1.4179999999999998e-06, "loss": 0.2862, "mean_token_accuracy": 0.9249833226203918, "num_tokens": 140823520.0, "step": 710 }, { "entropy": 0.8243546702645042, "epoch": 0.14313011828935396, "grad_norm": 0.7929534912109375, "learning_rate": 1.428e-06, "loss": 0.289, "mean_token_accuracy": 0.9242551738565619, "num_tokens": 141838998.0, "step": 715 }, { "entropy": 0.8342987900430506, "epoch": 0.14413102820746132, "grad_norm": 0.7797636985778809, "learning_rate": 1.4379999999999998e-06, "loss": 0.286, "mean_token_accuracy": 0.9250240569764917, "num_tokens": 142788817.0, "step": 720 }, { "epoch": 0.14413102820746132, "eval_entropy": 0.7430196765993462, "eval_loss": 0.21336835622787476, "eval_mean_token_accuracy": 0.9381084842760055, "eval_num_tokens": 142788817.0, "eval_runtime": 7.0804, "eval_samples_per_second": 137.421, "eval_steps_per_second": 8.615, "step": 720 }, { "entropy": 0.8445068792863326, "epoch": 0.1451319381255687, "grad_norm": 1.6939398050308228, "learning_rate": 1.448e-06, "loss": 0.2895, "mean_token_accuracy": 0.9256705939769745, "num_tokens": 143528076.0, "step": 725 }, { "entropy": 0.7430232730778781, "epoch": 0.14613284804367607, "grad_norm": 0.9444680213928223, "learning_rate": 1.4579999999999998e-06, "loss": 0.2684, "mean_token_accuracy": 0.9300248498266394, "num_tokens": 144669748.0, "step": 730 }, { "entropy": 0.8000332848592238, "epoch": 0.14713375796178343, "grad_norm": 0.8172219395637512, "learning_rate": 1.468e-06, "loss": 0.2831, "mean_token_accuracy": 0.9256041104143317, "num_tokens": 145740545.0, "step": 735 }, { "entropy": 0.8183066297661175, "epoch": 0.1481346678798908, "grad_norm": 0.7712555527687073, "learning_rate": 1.4779999999999999e-06, "loss": 0.2832, "mean_token_accuracy": 0.9251929564909501, "num_tokens": 146777060.0, "step": 740 }, { "entropy": 0.8324488238854841, "epoch": 0.1491355777979982, "grad_norm": 0.6426140666007996, "learning_rate": 1.488e-06, "loss": 0.2842, "mean_token_accuracy": 0.925419792803851, "num_tokens": 147735794.0, "step": 745 }, { "entropy": 0.845582206140865, "epoch": 0.15013648771610555, "grad_norm": 1.8751927614212036, "learning_rate": 1.4979999999999999e-06, "loss": 0.2924, "mean_token_accuracy": 0.9251068283211101, "num_tokens": 148475511.0, "step": 750 }, { "epoch": 0.15013648771610555, "eval_entropy": 0.7409035055363764, "eval_loss": 0.20810416340827942, "eval_mean_token_accuracy": 0.9396978216093095, "eval_num_tokens": 148475511.0, "eval_runtime": 7.0982, "eval_samples_per_second": 137.077, "eval_steps_per_second": 8.594, "step": 750 }, { "entropy": 0.738556033372879, "epoch": 0.1511373976342129, "grad_norm": 0.9034550189971924, "learning_rate": 1.508e-06, "loss": 0.2687, "mean_token_accuracy": 0.9301519659432498, "num_tokens": 149602202.0, "step": 755 }, { "entropy": 0.8039004320448095, "epoch": 0.1521383075523203, "grad_norm": 0.7859927415847778, "learning_rate": 1.518e-06, "loss": 0.2835, "mean_token_accuracy": 0.9249755826863375, "num_tokens": 150660031.0, "step": 760 }, { "entropy": 0.8235690740021793, "epoch": 0.15313921747042766, "grad_norm": 0.9304025769233704, "learning_rate": 1.528e-06, "loss": 0.2837, "mean_token_accuracy": 0.9240913233973763, "num_tokens": 151705865.0, "step": 765 }, { "entropy": 0.8295753197236495, "epoch": 0.15414012738853503, "grad_norm": 0.6954373717308044, "learning_rate": 1.538e-06, "loss": 0.2826, "mean_token_accuracy": 0.9268539065664465, "num_tokens": 152662320.0, "step": 770 }, { "entropy": 0.8385162537748163, "epoch": 0.15514103730664242, "grad_norm": 1.6783963441848755, "learning_rate": 1.548e-06, "loss": 0.2861, "mean_token_accuracy": 0.9266390404917977, "num_tokens": 153390397.0, "step": 775 }, { "entropy": 0.7379371079531583, "epoch": 0.15614194722474978, "grad_norm": 0.9129334092140198, "learning_rate": 1.558e-06, "loss": 0.2728, "mean_token_accuracy": 0.9291019217534499, "num_tokens": 154510653.0, "step": 780 }, { "epoch": 0.15614194722474978, "eval_entropy": 0.744651442668477, "eval_loss": 0.20912407338619232, "eval_mean_token_accuracy": 0.9394876233866958, "eval_num_tokens": 154510653.0, "eval_runtime": 7.2729, "eval_samples_per_second": 133.784, "eval_steps_per_second": 8.387, "step": 780 }, { "entropy": 0.7914630618962375, "epoch": 0.15714285714285714, "grad_norm": 0.8796635270118713, "learning_rate": 1.568e-06, "loss": 0.279, "mean_token_accuracy": 0.9265191034837202, "num_tokens": 155598773.0, "step": 785 }, { "entropy": 0.8060251300985163, "epoch": 0.1581437670609645, "grad_norm": 0.722435474395752, "learning_rate": 1.578e-06, "loss": 0.2837, "mean_token_accuracy": 0.9250888071276925, "num_tokens": 156631931.0, "step": 790 }, { "entropy": 0.8348277311433445, "epoch": 0.1591446769790719, "grad_norm": 0.6317898631095886, "learning_rate": 1.588e-06, "loss": 0.2873, "mean_token_accuracy": 0.9253057523207231, "num_tokens": 157582000.0, "step": 795 }, { "entropy": 0.8389149953018535, "epoch": 0.16014558689717925, "grad_norm": 1.8928760290145874, "learning_rate": 1.598e-06, "loss": 0.2875, "mean_token_accuracy": 0.9262628761204806, "num_tokens": 158320297.0, "step": 800 }, { "entropy": 0.7366051608865911, "epoch": 0.16114649681528662, "grad_norm": 0.8541039228439331, "learning_rate": 1.608e-06, "loss": 0.2663, "mean_token_accuracy": 0.9309936154972424, "num_tokens": 159440189.0, "step": 805 }, { "entropy": 0.7848947503349998, "epoch": 0.162147406733394, "grad_norm": 0.752082884311676, "learning_rate": 1.618e-06, "loss": 0.2813, "mean_token_accuracy": 0.9258821048519829, "num_tokens": 160545730.0, "step": 810 }, { "epoch": 0.162147406733394, "eval_entropy": 0.7330616829825229, "eval_loss": 0.2090436816215515, "eval_mean_token_accuracy": 0.9397259344820117, "eval_num_tokens": 160545730.0, "eval_runtime": 7.0312, "eval_samples_per_second": 138.384, "eval_steps_per_second": 8.676, "step": 810 }, { "entropy": 0.8028561315753243, "epoch": 0.16314831665150137, "grad_norm": 0.7342677116394043, "learning_rate": 1.628e-06, "loss": 0.2804, "mean_token_accuracy": 0.9256297485394911, "num_tokens": 161573472.0, "step": 815 }, { "entropy": 0.8302115288647738, "epoch": 0.16414922656960873, "grad_norm": 0.7893772721290588, "learning_rate": 1.6379999999999998e-06, "loss": 0.2836, "mean_token_accuracy": 0.9263590422543613, "num_tokens": 162530179.0, "step": 820 }, { "entropy": 0.827764184366573, "epoch": 0.1651501364877161, "grad_norm": 1.99238920211792, "learning_rate": 1.648e-06, "loss": 0.2817, "mean_token_accuracy": 0.9277313232421875, "num_tokens": 163263981.0, "step": 825 }, { "entropy": 0.722087900205092, "epoch": 0.16615104640582348, "grad_norm": 0.8378480076789856, "learning_rate": 1.6579999999999998e-06, "loss": 0.2627, "mean_token_accuracy": 0.9315350006927143, "num_tokens": 164398357.0, "step": 830 }, { "entropy": 0.7725223552096974, "epoch": 0.16715195632393084, "grad_norm": 0.7914682626724243, "learning_rate": 1.668e-06, "loss": 0.2716, "mean_token_accuracy": 0.9280308051542803, "num_tokens": 165468194.0, "step": 835 }, { "entropy": 0.8090368211269379, "epoch": 0.1681528662420382, "grad_norm": 0.8411707282066345, "learning_rate": 1.6779999999999999e-06, "loss": 0.2795, "mean_token_accuracy": 0.9261684049259532, "num_tokens": 166510298.0, "step": 840 }, { "epoch": 0.1681528662420382, "eval_entropy": 0.726200912819534, "eval_loss": 0.20932643115520477, "eval_mean_token_accuracy": 0.9394924601570505, "eval_num_tokens": 166510298.0, "eval_runtime": 7.1551, "eval_samples_per_second": 135.988, "eval_steps_per_second": 8.525, "step": 840 }, { "entropy": 0.8117384894327684, "epoch": 0.1691537761601456, "grad_norm": 0.6859620809555054, "learning_rate": 1.6879999999999998e-06, "loss": 0.2775, "mean_token_accuracy": 0.9275808865373785, "num_tokens": 167466578.0, "step": 845 }, { "entropy": 0.8264376792040738, "epoch": 0.17015468607825296, "grad_norm": 1.9784756898880005, "learning_rate": 1.6979999999999999e-06, "loss": 0.2832, "mean_token_accuracy": 0.9271657266400077, "num_tokens": 168200534.0, "step": 850 }, { "entropy": 0.7185999631881714, "epoch": 0.17115559599636032, "grad_norm": 0.8721832036972046, "learning_rate": 1.7079999999999998e-06, "loss": 0.2669, "mean_token_accuracy": 0.9305740556933663, "num_tokens": 169340781.0, "step": 855 }, { "entropy": 0.7641947990114039, "epoch": 0.1721565059144677, "grad_norm": 0.7572875618934631, "learning_rate": 1.718e-06, "loss": 0.2713, "mean_token_accuracy": 0.9280713466080752, "num_tokens": 170422517.0, "step": 860 }, { "entropy": 0.7978116544810209, "epoch": 0.17315741583257507, "grad_norm": 0.7459094524383545, "learning_rate": 1.7279999999999998e-06, "loss": 0.2765, "mean_token_accuracy": 0.9266852221705697, "num_tokens": 171469260.0, "step": 865 }, { "entropy": 0.8156883180141449, "epoch": 0.17415832575068244, "grad_norm": 0.6680793166160583, "learning_rate": 1.738e-06, "loss": 0.2768, "mean_token_accuracy": 0.9277672919360074, "num_tokens": 172427066.0, "step": 870 }, { "epoch": 0.17415832575068244, "eval_entropy": 0.7165868282318115, "eval_loss": 0.20711387693881989, "eval_mean_token_accuracy": 0.9398616259215308, "eval_num_tokens": 172427066.0, "eval_runtime": 7.0985, "eval_samples_per_second": 137.071, "eval_steps_per_second": 8.593, "step": 870 }, { "entropy": 0.8210206216031855, "epoch": 0.1751592356687898, "grad_norm": 1.6756982803344727, "learning_rate": 1.7479999999999998e-06, "loss": 0.2764, "mean_token_accuracy": 0.9285656040365046, "num_tokens": 173142991.0, "step": 875 }, { "entropy": 0.7199788321148265, "epoch": 0.1761601455868972, "grad_norm": 0.8849101066589355, "learning_rate": 1.758e-06, "loss": 0.2598, "mean_token_accuracy": 0.9321709827943282, "num_tokens": 174282576.0, "step": 880 }, { "entropy": 0.7870851630514318, "epoch": 0.17716105550500455, "grad_norm": 0.7448862791061401, "learning_rate": 1.7679999999999998e-06, "loss": 0.2737, "mean_token_accuracy": 0.9282706352797422, "num_tokens": 175369483.0, "step": 885 }, { "entropy": 0.8100219070911407, "epoch": 0.1781619654231119, "grad_norm": 0.8964276313781738, "learning_rate": 1.778e-06, "loss": 0.2773, "mean_token_accuracy": 0.9265438226136294, "num_tokens": 176409360.0, "step": 890 }, { "entropy": 0.8189065765250813, "epoch": 0.1791628753412193, "grad_norm": 0.7688677906990051, "learning_rate": 1.7879999999999999e-06, "loss": 0.2699, "mean_token_accuracy": 0.9298083012754267, "num_tokens": 177361894.0, "step": 895 }, { "entropy": 0.8263901694254442, "epoch": 0.18016378525932666, "grad_norm": 1.8425803184509277, "learning_rate": 1.798e-06, "loss": 0.2763, "mean_token_accuracy": 0.9287969903512434, "num_tokens": 178089733.0, "step": 900 }, { "epoch": 0.18016378525932666, "eval_entropy": 0.7217419499256572, "eval_loss": 0.20589642226696014, "eval_mean_token_accuracy": 0.9406212060177912, "eval_num_tokens": 178089733.0, "eval_runtime": 7.1263, "eval_samples_per_second": 136.536, "eval_steps_per_second": 8.56, "step": 900 }, { "entropy": 0.72322096824646, "epoch": 0.18116469517743403, "grad_norm": 0.8959372043609619, "learning_rate": 1.8079999999999999e-06, "loss": 0.2666, "mean_token_accuracy": 0.9303640398112211, "num_tokens": 179231674.0, "step": 905 }, { "entropy": 0.7706199407577514, "epoch": 0.1821656050955414, "grad_norm": 0.8667539358139038, "learning_rate": 1.818e-06, "loss": 0.2765, "mean_token_accuracy": 0.9267726708542217, "num_tokens": 180315249.0, "step": 910 }, { "entropy": 0.7906596568497745, "epoch": 0.18316651501364878, "grad_norm": 0.7227725982666016, "learning_rate": 1.828e-06, "loss": 0.2686, "mean_token_accuracy": 0.9294304208322005, "num_tokens": 181340192.0, "step": 915 }, { "entropy": 0.8029232171448795, "epoch": 0.18416742493175614, "grad_norm": 0.6649354696273804, "learning_rate": 1.838e-06, "loss": 0.2667, "mean_token_accuracy": 0.9300361882556568, "num_tokens": 182307935.0, "step": 920 }, { "entropy": 0.7977046684785323, "epoch": 0.1851683348498635, "grad_norm": 1.6901991367340088, "learning_rate": 1.848e-06, "loss": 0.2723, "mean_token_accuracy": 0.9294875199144537, "num_tokens": 183060078.0, "step": 925 }, { "entropy": 0.7040667669339613, "epoch": 0.1861692447679709, "grad_norm": 0.8532208204269409, "learning_rate": 1.858e-06, "loss": 0.2571, "mean_token_accuracy": 0.9326450272039933, "num_tokens": 184221420.0, "step": 930 }, { "epoch": 0.1861692447679709, "eval_entropy": 0.725787528225633, "eval_loss": 0.20359419286251068, "eval_mean_token_accuracy": 0.9418040504221057, "eval_num_tokens": 184221420.0, "eval_runtime": 7.1054, "eval_samples_per_second": 136.938, "eval_steps_per_second": 8.585, "step": 930 }, { "entropy": 0.7605256313627416, "epoch": 0.18717015468607826, "grad_norm": 0.740172266960144, "learning_rate": 1.868e-06, "loss": 0.2642, "mean_token_accuracy": 0.9301990406079725, "num_tokens": 185315366.0, "step": 935 }, { "entropy": 0.788138997554779, "epoch": 0.18817106460418562, "grad_norm": 0.7328667640686035, "learning_rate": 1.8779999999999998e-06, "loss": 0.2594, "mean_token_accuracy": 0.9303668347272006, "num_tokens": 186337490.0, "step": 940 }, { "entropy": 0.815633828531612, "epoch": 0.189171974522293, "grad_norm": 0.6744860410690308, "learning_rate": 1.8879999999999998e-06, "loss": 0.2702, "mean_token_accuracy": 0.9292519065466794, "num_tokens": 187269716.0, "step": 945 }, { "entropy": 0.8176946092735637, "epoch": 0.19017288444040037, "grad_norm": 1.8253427743911743, "learning_rate": 1.8979999999999999e-06, "loss": 0.2659, "mean_token_accuracy": 0.931620988520709, "num_tokens": 187991692.0, "step": 950 }, { "entropy": 0.7086463402618062, "epoch": 0.19117379435850773, "grad_norm": 0.9137970209121704, "learning_rate": 1.9079999999999998e-06, "loss": 0.2586, "mean_token_accuracy": 0.9328472657637162, "num_tokens": 189120364.0, "step": 955 }, { "entropy": 0.7753447532653809, "epoch": 0.1921747042766151, "grad_norm": 0.8782041668891907, "learning_rate": 1.9179999999999997e-06, "loss": 0.2659, "mean_token_accuracy": 0.9294385693290017, "num_tokens": 190191796.0, "step": 960 }, { "epoch": 0.1921747042766151, "eval_entropy": 0.724771861170159, "eval_loss": 0.2042693942785263, "eval_mean_token_accuracy": 0.9414341498593815, "eval_num_tokens": 190191796.0, "eval_runtime": 7.0787, "eval_samples_per_second": 137.455, "eval_steps_per_second": 8.617, "step": 960 }, { "entropy": 0.7960949518463828, "epoch": 0.19317561419472248, "grad_norm": 0.713079035282135, "learning_rate": 1.928e-06, "loss": 0.2676, "mean_token_accuracy": 0.9294228992678902, "num_tokens": 191241957.0, "step": 965 }, { "entropy": 0.8051728243177587, "epoch": 0.19417652411282985, "grad_norm": 0.666320264339447, "learning_rate": 1.938e-06, "loss": 0.2652, "mean_token_accuracy": 0.9296959736130455, "num_tokens": 192200870.0, "step": 970 }, { "entropy": 0.8057123639366843, "epoch": 0.1951774340309372, "grad_norm": 1.6562741994857788, "learning_rate": 1.948e-06, "loss": 0.2605, "mean_token_accuracy": 0.9326337846842679, "num_tokens": 192934859.0, "step": 975 }, { "entropy": 0.7061334458264438, "epoch": 0.1961783439490446, "grad_norm": 0.8408867120742798, "learning_rate": 1.9579999999999997e-06, "loss": 0.2536, "mean_token_accuracy": 0.9338537595488808, "num_tokens": 194104139.0, "step": 980 }, { "entropy": 0.7579250015995719, "epoch": 0.19717925386715196, "grad_norm": 0.7840215563774109, "learning_rate": 1.968e-06, "loss": 0.2595, "mean_token_accuracy": 0.9305404896085913, "num_tokens": 195188455.0, "step": 985 }, { "entropy": 0.7914492504163222, "epoch": 0.19818016378525932, "grad_norm": 0.692529559135437, "learning_rate": 1.978e-06, "loss": 0.2671, "mean_token_accuracy": 0.9293763789263638, "num_tokens": 196209141.0, "step": 990 }, { "epoch": 0.19818016378525932, "eval_entropy": 0.7139858779360037, "eval_loss": 0.20102433860301971, "eval_mean_token_accuracy": 0.9419027381255979, "eval_num_tokens": 196209141.0, "eval_runtime": 7.1947, "eval_samples_per_second": 135.239, "eval_steps_per_second": 8.479, "step": 990 }, { "entropy": 0.7997732845219698, "epoch": 0.19918107370336668, "grad_norm": 0.7436501979827881, "learning_rate": 1.988e-06, "loss": 0.2613, "mean_token_accuracy": 0.9318017385222696, "num_tokens": 197169823.0, "step": 995 }, { "entropy": 0.808041772517291, "epoch": 0.20018198362147407, "grad_norm": 1.8346213102340698, "learning_rate": 1.9979999999999998e-06, "loss": 0.2684, "mean_token_accuracy": 0.9309409526261416, "num_tokens": 197904294.0, "step": 1000 }, { "entropy": 0.7090240332213316, "epoch": 0.20118289353958144, "grad_norm": 0.8898105025291443, "learning_rate": 1.9991103202846973e-06, "loss": 0.2542, "mean_token_accuracy": 0.9338583967902444, "num_tokens": 199040537.0, "step": 1005 }, { "entropy": 0.7625901590694081, "epoch": 0.2021838034576888, "grad_norm": 0.7580350041389465, "learning_rate": 1.997998220640569e-06, "loss": 0.2684, "mean_token_accuracy": 0.9290495872497558, "num_tokens": 200122330.0, "step": 1010 }, { "entropy": 0.7868972290645946, "epoch": 0.2031847133757962, "grad_norm": 0.9172696471214294, "learning_rate": 1.996886120996441e-06, "loss": 0.2592, "mean_token_accuracy": 0.9309038433161649, "num_tokens": 201149457.0, "step": 1015 }, { "entropy": 0.7947816740382802, "epoch": 0.20418562329390355, "grad_norm": 0.6719794273376465, "learning_rate": 1.9957740213523133e-06, "loss": 0.2634, "mean_token_accuracy": 0.9316002515229311, "num_tokens": 202101608.0, "step": 1020 }, { "epoch": 0.20418562329390355, "eval_entropy": 0.7147035344702298, "eval_loss": 0.20133115351200104, "eval_mean_token_accuracy": 0.9419465289741266, "eval_num_tokens": 202101608.0, "eval_runtime": 7.0701, "eval_samples_per_second": 137.622, "eval_steps_per_second": 8.628, "step": 1020 }, { "entropy": 0.813241909308867, "epoch": 0.2051865332120109, "grad_norm": 1.68107271194458, "learning_rate": 1.994661921708185e-06, "loss": 0.2721, "mean_token_accuracy": 0.9300860870968212, "num_tokens": 202823517.0, "step": 1025 }, { "entropy": 0.6989771512421694, "epoch": 0.2061874431301183, "grad_norm": 0.9269376397132874, "learning_rate": 1.9935498220640566e-06, "loss": 0.2535, "mean_token_accuracy": 0.9341622206297788, "num_tokens": 203958059.0, "step": 1030 }, { "entropy": 0.7591653926806017, "epoch": 0.20718835304822567, "grad_norm": 0.7755193114280701, "learning_rate": 1.992437722419929e-06, "loss": 0.2648, "mean_token_accuracy": 0.9302975632927635, "num_tokens": 205042771.0, "step": 1035 }, { "entropy": 0.7722339581359516, "epoch": 0.20818926296633303, "grad_norm": 0.8515006303787231, "learning_rate": 1.9913256227758007e-06, "loss": 0.2638, "mean_token_accuracy": 0.9300298192284324, "num_tokens": 206086748.0, "step": 1040 }, { "entropy": 0.7889559702439741, "epoch": 0.2091901728844404, "grad_norm": 0.6690332293510437, "learning_rate": 1.9902135231316726e-06, "loss": 0.2565, "mean_token_accuracy": 0.9327416582541033, "num_tokens": 207023751.0, "step": 1045 }, { "entropy": 0.7905822466720235, "epoch": 0.21019108280254778, "grad_norm": 1.524138331413269, "learning_rate": 1.9891014234875445e-06, "loss": 0.2618, "mean_token_accuracy": 0.9318056187846444, "num_tokens": 207751826.0, "step": 1050 }, { "epoch": 0.21019108280254778, "eval_entropy": 0.6987361546422615, "eval_loss": 0.20032314956188202, "eval_mean_token_accuracy": 0.9415211052191063, "eval_num_tokens": 207751826.0, "eval_runtime": 7.1237, "eval_samples_per_second": 136.587, "eval_steps_per_second": 8.563, "step": 1050 }, { "entropy": 0.700323451649059, "epoch": 0.21119199272065514, "grad_norm": 0.9274206161499023, "learning_rate": 1.9879893238434163e-06, "loss": 0.2499, "mean_token_accuracy": 0.9347092021595348, "num_tokens": 208886557.0, "step": 1055 }, { "entropy": 0.7475979534062472, "epoch": 0.2121929026387625, "grad_norm": 0.8458713293075562, "learning_rate": 1.986877224199288e-06, "loss": 0.261, "mean_token_accuracy": 0.9306270117109472, "num_tokens": 209999842.0, "step": 1060 }, { "entropy": 0.7634694963693619, "epoch": 0.2131938125568699, "grad_norm": 0.7438536882400513, "learning_rate": 1.98576512455516e-06, "loss": 0.2612, "mean_token_accuracy": 0.9316813165491278, "num_tokens": 211047482.0, "step": 1065 }, { "entropy": 0.7860465927557512, "epoch": 0.21419472247497726, "grad_norm": 0.6679530739784241, "learning_rate": 1.984653024911032e-06, "loss": 0.2616, "mean_token_accuracy": 0.932481362061067, "num_tokens": 211999890.0, "step": 1070 }, { "entropy": 0.7879262474450198, "epoch": 0.21519563239308462, "grad_norm": 1.5317449569702148, "learning_rate": 1.9835409252669037e-06, "loss": 0.256, "mean_token_accuracy": 0.9341791461814534, "num_tokens": 212724971.0, "step": 1075 }, { "entropy": 0.6914473251862959, "epoch": 0.21619654231119198, "grad_norm": 0.9009571671485901, "learning_rate": 1.9824288256227756e-06, "loss": 0.2469, "mean_token_accuracy": 0.935233576189388, "num_tokens": 213865483.0, "step": 1080 }, { "epoch": 0.21619654231119198, "eval_entropy": 0.6992622926586964, "eval_loss": 0.19818614423274994, "eval_mean_token_accuracy": 0.9426181824480901, "eval_num_tokens": 213865483.0, "eval_runtime": 7.3835, "eval_samples_per_second": 131.78, "eval_steps_per_second": 8.262, "step": 1080 }, { "entropy": 0.7396956460042433, "epoch": 0.21719745222929937, "grad_norm": 0.7676311135292053, "learning_rate": 1.9813167259786475e-06, "loss": 0.2553, "mean_token_accuracy": 0.9329301888292486, "num_tokens": 214946018.0, "step": 1085 }, { "entropy": 0.7576209339228543, "epoch": 0.21819836214740673, "grad_norm": 0.9512864351272583, "learning_rate": 1.9802046263345197e-06, "loss": 0.2574, "mean_token_accuracy": 0.931071363254027, "num_tokens": 215999988.0, "step": 1090 }, { "entropy": 0.7767835638739846, "epoch": 0.2191992720655141, "grad_norm": 0.6882670521736145, "learning_rate": 1.979092526690391e-06, "loss": 0.2518, "mean_token_accuracy": 0.9337078777226535, "num_tokens": 216962447.0, "step": 1095 }, { "entropy": 0.7832509934902191, "epoch": 0.22020018198362148, "grad_norm": 1.6970500946044922, "learning_rate": 1.977980427046263e-06, "loss": 0.2583, "mean_token_accuracy": 0.9332552210851149, "num_tokens": 217692537.0, "step": 1100 }, { "entropy": 0.6820299370722337, "epoch": 0.22120109190172885, "grad_norm": 0.8949645757675171, "learning_rate": 1.9768683274021353e-06, "loss": 0.2445, "mean_token_accuracy": 0.935930597782135, "num_tokens": 218839476.0, "step": 1105 }, { "entropy": 0.72886228073727, "epoch": 0.2222020018198362, "grad_norm": 0.8621814846992493, "learning_rate": 1.975756227758007e-06, "loss": 0.2493, "mean_token_accuracy": 0.9338542092930187, "num_tokens": 219923390.0, "step": 1110 }, { "epoch": 0.2222020018198362, "eval_entropy": 0.6884255741463333, "eval_loss": 0.19926953315734863, "eval_mean_token_accuracy": 0.9423930107570085, "eval_num_tokens": 219923390.0, "eval_runtime": 7.0927, "eval_samples_per_second": 137.184, "eval_steps_per_second": 8.6, "step": 1110 }, { "entropy": 0.7540641031482003, "epoch": 0.22320291173794357, "grad_norm": 0.971157431602478, "learning_rate": 1.974644128113879e-06, "loss": 0.2567, "mean_token_accuracy": 0.932219631021673, "num_tokens": 220957232.0, "step": 1115 }, { "entropy": 0.7798225131901828, "epoch": 0.22420382165605096, "grad_norm": 0.7949030995368958, "learning_rate": 1.973532028469751e-06, "loss": 0.2581, "mean_token_accuracy": 0.9322475785558874, "num_tokens": 221909237.0, "step": 1120 }, { "entropy": 0.7734460061246698, "epoch": 0.22520473157415832, "grad_norm": 1.671317219734192, "learning_rate": 1.9724199288256227e-06, "loss": 0.2532, "mean_token_accuracy": 0.9343869902870872, "num_tokens": 222629518.0, "step": 1125 }, { "entropy": 0.6769220758568156, "epoch": 0.22620564149226569, "grad_norm": 0.8417484164237976, "learning_rate": 1.9713078291814946e-06, "loss": 0.2432, "mean_token_accuracy": 0.9365156341682781, "num_tokens": 223771141.0, "step": 1130 }, { "entropy": 0.7289805867455222, "epoch": 0.22720655141037308, "grad_norm": 0.8334816694259644, "learning_rate": 1.9701957295373665e-06, "loss": 0.2564, "mean_token_accuracy": 0.9321391544558785, "num_tokens": 224858611.0, "step": 1135 }, { "entropy": 0.7575576175342906, "epoch": 0.22820746132848044, "grad_norm": 0.686861515045166, "learning_rate": 1.9690836298932383e-06, "loss": 0.2553, "mean_token_accuracy": 0.932028527693315, "num_tokens": 225904498.0, "step": 1140 }, { "epoch": 0.22820746132848044, "eval_entropy": 0.687260666831595, "eval_loss": 0.19723324477672577, "eval_mean_token_accuracy": 0.9429298082336051, "eval_num_tokens": 225904498.0, "eval_runtime": 7.2193, "eval_samples_per_second": 134.777, "eval_steps_per_second": 8.45, "step": 1140 }, { "entropy": 0.7571648413484747, "epoch": 0.2292083712465878, "grad_norm": 0.6368003487586975, "learning_rate": 1.96797153024911e-06, "loss": 0.2484, "mean_token_accuracy": 0.9342491680925543, "num_tokens": 226858707.0, "step": 1145 }, { "entropy": 0.7685175494714217, "epoch": 0.2302092811646952, "grad_norm": 1.7895119190216064, "learning_rate": 1.966859430604982e-06, "loss": 0.2531, "mean_token_accuracy": 0.9351052864031358, "num_tokens": 227586735.0, "step": 1150 }, { "entropy": 0.6730130303989758, "epoch": 0.23121019108280255, "grad_norm": 0.8514677286148071, "learning_rate": 1.9657473309608543e-06, "loss": 0.2434, "mean_token_accuracy": 0.9364338099956513, "num_tokens": 228710792.0, "step": 1155 }, { "entropy": 0.7245557562871413, "epoch": 0.23221110100090991, "grad_norm": 0.7925510406494141, "learning_rate": 1.9646352313167257e-06, "loss": 0.2565, "mean_token_accuracy": 0.9326732272451574, "num_tokens": 229789807.0, "step": 1160 }, { "entropy": 0.7381821754303846, "epoch": 0.23321201091901728, "grad_norm": 0.7272951006889343, "learning_rate": 1.9635231316725976e-06, "loss": 0.2467, "mean_token_accuracy": 0.9342716991901397, "num_tokens": 230830474.0, "step": 1165 }, { "entropy": 0.7532747295769778, "epoch": 0.23421292083712467, "grad_norm": 0.6639147996902466, "learning_rate": 1.96241103202847e-06, "loss": 0.2521, "mean_token_accuracy": 0.9335366579619321, "num_tokens": 231790758.0, "step": 1170 }, { "epoch": 0.23421292083712467, "eval_entropy": 0.6738434072400703, "eval_loss": 0.19970019161701202, "eval_mean_token_accuracy": 0.9427229283285923, "eval_num_tokens": 231790758.0, "eval_runtime": 7.0658, "eval_samples_per_second": 137.705, "eval_steps_per_second": 8.633, "step": 1170 }, { "entropy": 0.7472162235866894, "epoch": 0.23521383075523203, "grad_norm": 1.5396642684936523, "learning_rate": 1.9612989323843417e-06, "loss": 0.2494, "mean_token_accuracy": 0.9352785722775893, "num_tokens": 232530867.0, "step": 1175 }, { "entropy": 0.6697620332241059, "epoch": 0.2362147406733394, "grad_norm": 0.8647318482398987, "learning_rate": 1.960186832740213e-06, "loss": 0.2433, "mean_token_accuracy": 0.9363701712001454, "num_tokens": 233651796.0, "step": 1180 }, { "entropy": 0.7114524765448137, "epoch": 0.23721565059144678, "grad_norm": 0.8350867629051208, "learning_rate": 1.9590747330960855e-06, "loss": 0.251, "mean_token_accuracy": 0.9338924034075303, "num_tokens": 234754552.0, "step": 1185 }, { "entropy": 0.7274992368438027, "epoch": 0.23821656050955414, "grad_norm": 0.6969212293624878, "learning_rate": 1.9579626334519573e-06, "loss": 0.2487, "mean_token_accuracy": 0.9337175385518508, "num_tokens": 235782960.0, "step": 1190 }, { "entropy": 0.7455267862840133, "epoch": 0.2392174704276615, "grad_norm": 0.624343752861023, "learning_rate": 1.956850533807829e-06, "loss": 0.2532, "mean_token_accuracy": 0.9332292107018557, "num_tokens": 236735963.0, "step": 1195 }, { "entropy": 0.7484802782535553, "epoch": 0.24021838034576887, "grad_norm": 1.5747654438018799, "learning_rate": 1.955738434163701e-06, "loss": 0.2506, "mean_token_accuracy": 0.9349917281757701, "num_tokens": 237476602.0, "step": 1200 }, { "epoch": 0.24021838034576887, "eval_entropy": 0.6878872777594894, "eval_loss": 0.19709168374538422, "eval_mean_token_accuracy": 0.9429968146027111, "eval_num_tokens": 237476602.0, "eval_runtime": 7.0622, "eval_samples_per_second": 137.775, "eval_steps_per_second": 8.637, "step": 1200 }, { "entropy": 0.671830934827978, "epoch": 0.24121929026387626, "grad_norm": 0.8599943518638611, "learning_rate": 1.954626334519573e-06, "loss": 0.2366, "mean_token_accuracy": 0.9383657791397788, "num_tokens": 238617406.0, "step": 1205 }, { "entropy": 0.7293940170244737, "epoch": 0.24222020018198362, "grad_norm": 0.754350483417511, "learning_rate": 1.9535142348754447e-06, "loss": 0.2512, "mean_token_accuracy": 0.9323639192364432, "num_tokens": 239700088.0, "step": 1210 }, { "entropy": 0.7499282219193198, "epoch": 0.24322111010009098, "grad_norm": 0.7476288080215454, "learning_rate": 1.9524021352313166e-06, "loss": 0.2552, "mean_token_accuracy": 0.9318941896611994, "num_tokens": 240733335.0, "step": 1215 }, { "entropy": 0.7511982554739172, "epoch": 0.24422202001819837, "grad_norm": 0.6863506436347961, "learning_rate": 1.9512900355871885e-06, "loss": 0.243, "mean_token_accuracy": 0.9356909887357192, "num_tokens": 241687104.0, "step": 1220 }, { "entropy": 0.749161382154985, "epoch": 0.24522292993630573, "grad_norm": 1.631894826889038, "learning_rate": 1.9501779359430603e-06, "loss": 0.2514, "mean_token_accuracy": 0.9346412853761152, "num_tokens": 242426018.0, "step": 1225 }, { "entropy": 0.6726668021895669, "epoch": 0.2462238398544131, "grad_norm": 0.8596307635307312, "learning_rate": 1.949065836298932e-06, "loss": 0.2454, "mean_token_accuracy": 0.9364431234923276, "num_tokens": 243548718.0, "step": 1230 }, { "epoch": 0.2462238398544131, "eval_entropy": 0.6812147097509416, "eval_loss": 0.19748112559318542, "eval_mean_token_accuracy": 0.943136929488573, "eval_num_tokens": 243548718.0, "eval_runtime": 7.0861, "eval_samples_per_second": 137.311, "eval_steps_per_second": 8.608, "step": 1230 }, { "entropy": 0.7223554464903745, "epoch": 0.24722474977252049, "grad_norm": 0.8182641863822937, "learning_rate": 1.947953736654804e-06, "loss": 0.2473, "mean_token_accuracy": 0.9328928150913932, "num_tokens": 244634262.0, "step": 1235 }, { "entropy": 0.7410072830590335, "epoch": 0.24822565969062785, "grad_norm": 0.831390380859375, "learning_rate": 1.9468416370106763e-06, "loss": 0.2458, "mean_token_accuracy": 0.9338255047798156, "num_tokens": 245677570.0, "step": 1240 }, { "entropy": 0.7599548085169359, "epoch": 0.2492265696087352, "grad_norm": 0.8275907635688782, "learning_rate": 1.9457295373665477e-06, "loss": 0.2424, "mean_token_accuracy": 0.9356081453236667, "num_tokens": 246647643.0, "step": 1245 }, { "entropy": 0.7624553501605987, "epoch": 0.2502274795268426, "grad_norm": 1.9468681812286377, "learning_rate": 1.94461743772242e-06, "loss": 0.2445, "mean_token_accuracy": 0.9364055861126293, "num_tokens": 247388979.0, "step": 1250 }, { "entropy": 0.6826613940975883, "epoch": 0.25122838944494996, "grad_norm": 0.8892253041267395, "learning_rate": 1.943505338078292e-06, "loss": 0.2377, "mean_token_accuracy": 0.937986614487388, "num_tokens": 248507582.0, "step": 1255 }, { "entropy": 0.7349318878217177, "epoch": 0.2522292993630573, "grad_norm": 0.7683637738227844, "learning_rate": 1.9423932384341637e-06, "loss": 0.2494, "mean_token_accuracy": 0.9336408035321669, "num_tokens": 249580005.0, "step": 1260 }, { "epoch": 0.2522292993630573, "eval_entropy": 0.6866021556932418, "eval_loss": 0.19629527628421783, "eval_mean_token_accuracy": 0.9435793952863725, "eval_num_tokens": 249580005.0, "eval_runtime": 7.0601, "eval_samples_per_second": 137.817, "eval_steps_per_second": 8.64, "step": 1260 }, { "entropy": 0.7540321504527873, "epoch": 0.2532302092811647, "grad_norm": 0.7559732794761658, "learning_rate": 1.9412811387900356e-06, "loss": 0.2516, "mean_token_accuracy": 0.9330432496287606, "num_tokens": 250621468.0, "step": 1265 }, { "entropy": 0.7423492084849964, "epoch": 0.25423111919927205, "grad_norm": 0.7324007153511047, "learning_rate": 1.9401690391459075e-06, "loss": 0.2381, "mean_token_accuracy": 0.9373159939592535, "num_tokens": 251581237.0, "step": 1270 }, { "entropy": 0.7672662193124945, "epoch": 0.2552320291173794, "grad_norm": 1.4408397674560547, "learning_rate": 1.9390569395017793e-06, "loss": 0.2423, "mean_token_accuracy": 0.9368164999918505, "num_tokens": 252303125.0, "step": 1275 }, { "entropy": 0.6668464682318948, "epoch": 0.25623293903548683, "grad_norm": 0.9180498123168945, "learning_rate": 1.937944839857651e-06, "loss": 0.2387, "mean_token_accuracy": 0.9375127759846774, "num_tokens": 253437743.0, "step": 1280 }, { "entropy": 0.7173917884176427, "epoch": 0.2572338489535942, "grad_norm": 0.7993113994598389, "learning_rate": 1.936832740213523e-06, "loss": 0.2435, "mean_token_accuracy": 0.9355504203926434, "num_tokens": 254543380.0, "step": 1285 }, { "entropy": 0.7427029658447613, "epoch": 0.25823475887170155, "grad_norm": 0.7974119186401367, "learning_rate": 1.935720640569395e-06, "loss": 0.2404, "mean_token_accuracy": 0.9355508500879461, "num_tokens": 255569004.0, "step": 1290 }, { "epoch": 0.25823475887170155, "eval_entropy": 0.683321903963558, "eval_loss": 0.197197824716568, "eval_mean_token_accuracy": 0.9433203554544293, "eval_num_tokens": 255569004.0, "eval_runtime": 7.0779, "eval_samples_per_second": 137.47, "eval_steps_per_second": 8.618, "step": 1290 }, { "entropy": 0.7673221891576594, "epoch": 0.2592356687898089, "grad_norm": 0.6773776412010193, "learning_rate": 1.9346085409252667e-06, "loss": 0.2522, "mean_token_accuracy": 0.9333479886705225, "num_tokens": 256521955.0, "step": 1295 }, { "entropy": 0.7722088591618972, "epoch": 0.2602365787079163, "grad_norm": 1.5807671546936035, "learning_rate": 1.9334964412811386e-06, "loss": 0.2445, "mean_token_accuracy": 0.9367749100381678, "num_tokens": 257261891.0, "step": 1300 }, { "entropy": 0.6817871857773173, "epoch": 0.26123748862602364, "grad_norm": 0.8420500159263611, "learning_rate": 1.932384341637011e-06, "loss": 0.2307, "mean_token_accuracy": 0.9392897643826225, "num_tokens": 258422670.0, "step": 1305 }, { "entropy": 0.7291848995468834, "epoch": 0.262238398544131, "grad_norm": 0.8453850746154785, "learning_rate": 1.9312722419928823e-06, "loss": 0.2367, "mean_token_accuracy": 0.9367011297832836, "num_tokens": 259498893.0, "step": 1310 }, { "entropy": 0.7557943192395297, "epoch": 0.2632393084622384, "grad_norm": 0.7049674391746521, "learning_rate": 1.930160142348754e-06, "loss": 0.2394, "mean_token_accuracy": 0.9362640223719857, "num_tokens": 260532718.0, "step": 1315 }, { "entropy": 0.7660176255486228, "epoch": 0.2642402183803458, "grad_norm": 0.7112149596214294, "learning_rate": 1.9290480427046265e-06, "loss": 0.2442, "mean_token_accuracy": 0.9359012392434207, "num_tokens": 261477169.0, "step": 1320 }, { "epoch": 0.2642402183803458, "eval_entropy": 0.6886299956040304, "eval_loss": 0.19393064081668854, "eval_mean_token_accuracy": 0.9443301275128224, "eval_num_tokens": 261477169.0, "eval_runtime": 7.0513, "eval_samples_per_second": 137.989, "eval_steps_per_second": 8.651, "step": 1320 }, { "entropy": 0.7740896999835968, "epoch": 0.26524112829845314, "grad_norm": 1.7373411655426025, "learning_rate": 1.9279359430604983e-06, "loss": 0.2382, "mean_token_accuracy": 0.937723603031852, "num_tokens": 262203299.0, "step": 1325 }, { "entropy": 0.6813056788661264, "epoch": 0.2662420382165605, "grad_norm": 0.8700944185256958, "learning_rate": 1.9268238434163697e-06, "loss": 0.2344, "mean_token_accuracy": 0.9385422473604029, "num_tokens": 263358422.0, "step": 1330 }, { "entropy": 0.7247711669314991, "epoch": 0.26724294813466787, "grad_norm": 0.7497351169586182, "learning_rate": 1.925711743772242e-06, "loss": 0.2399, "mean_token_accuracy": 0.9360633611679077, "num_tokens": 264437282.0, "step": 1335 }, { "entropy": 0.7492102563381196, "epoch": 0.26824385805277523, "grad_norm": 0.712761402130127, "learning_rate": 1.924599644128114e-06, "loss": 0.2379, "mean_token_accuracy": 0.9369383118369362, "num_tokens": 265476998.0, "step": 1340 }, { "entropy": 0.7662336116487329, "epoch": 0.26924476797088265, "grad_norm": 1.0059868097305298, "learning_rate": 1.9234875444839857e-06, "loss": 0.235, "mean_token_accuracy": 0.9384725668213584, "num_tokens": 266433276.0, "step": 1345 }, { "entropy": 0.779201509735801, "epoch": 0.27024567788899, "grad_norm": 1.7948832511901855, "learning_rate": 1.9223754448398576e-06, "loss": 0.2454, "mean_token_accuracy": 0.93651580973105, "num_tokens": 267147370.0, "step": 1350 }, { "epoch": 0.27024567788899, "eval_entropy": 0.6936582659111649, "eval_loss": 0.18916112184524536, "eval_mean_token_accuracy": 0.9457350138758049, "eval_num_tokens": 267147370.0, "eval_runtime": 7.0725, "eval_samples_per_second": 137.575, "eval_steps_per_second": 8.625, "step": 1350 }, { "entropy": 0.6807152347131209, "epoch": 0.2712465878070974, "grad_norm": 0.8464104533195496, "learning_rate": 1.9212633451957295e-06, "loss": 0.2364, "mean_token_accuracy": 0.9381036953492599, "num_tokens": 268288956.0, "step": 1355 }, { "entropy": 0.7280165471813895, "epoch": 0.27224749772520473, "grad_norm": 0.828230082988739, "learning_rate": 1.9201512455516013e-06, "loss": 0.2385, "mean_token_accuracy": 0.9357607359235937, "num_tokens": 269355784.0, "step": 1360 }, { "entropy": 0.7481856107711792, "epoch": 0.2732484076433121, "grad_norm": 0.7362084984779358, "learning_rate": 1.919039145907473e-06, "loss": 0.244, "mean_token_accuracy": 0.9355522545901211, "num_tokens": 270398456.0, "step": 1365 }, { "entropy": 0.7570679174228148, "epoch": 0.27424931756141946, "grad_norm": 0.6655718684196472, "learning_rate": 1.917927046263345e-06, "loss": 0.2337, "mean_token_accuracy": 0.938739211992784, "num_tokens": 271357374.0, "step": 1370 }, { "entropy": 0.7706907001408664, "epoch": 0.2752502274795268, "grad_norm": 1.7031316757202148, "learning_rate": 1.916814946619217e-06, "loss": 0.2383, "mean_token_accuracy": 0.9374835350296714, "num_tokens": 272091770.0, "step": 1375 }, { "entropy": 0.6735027275302193, "epoch": 0.27625113739763424, "grad_norm": 0.847005307674408, "learning_rate": 1.9157028469750887e-06, "loss": 0.2313, "mean_token_accuracy": 0.9391226519237865, "num_tokens": 273228350.0, "step": 1380 }, { "epoch": 0.27625113739763424, "eval_entropy": 0.6851567674855716, "eval_loss": 0.19188910722732544, "eval_mean_token_accuracy": 0.9446489283295928, "eval_num_tokens": 273228350.0, "eval_runtime": 7.0137, "eval_samples_per_second": 138.728, "eval_steps_per_second": 8.697, "step": 1380 }, { "entropy": 0.7211063027381897, "epoch": 0.2772520473157416, "grad_norm": 0.7908993363380432, "learning_rate": 1.914590747330961e-06, "loss": 0.2372, "mean_token_accuracy": 0.9372067868709564, "num_tokens": 274295508.0, "step": 1385 }, { "entropy": 0.745275920087641, "epoch": 0.27825295723384896, "grad_norm": 0.7628899216651917, "learning_rate": 1.913478647686833e-06, "loss": 0.2376, "mean_token_accuracy": 0.9368884086608886, "num_tokens": 275339917.0, "step": 1390 }, { "entropy": 0.760984147678722, "epoch": 0.2792538671519563, "grad_norm": 0.6237201690673828, "learning_rate": 1.9123665480427043e-06, "loss": 0.2349, "mean_token_accuracy": 0.9373009134422648, "num_tokens": 276295274.0, "step": 1395 }, { "entropy": 0.7674431963400408, "epoch": 0.2802547770700637, "grad_norm": 1.5829390287399292, "learning_rate": 1.9112544483985766e-06, "loss": 0.2381, "mean_token_accuracy": 0.9387489958242936, "num_tokens": 277028591.0, "step": 1400 }, { "entropy": 0.6689763746478341, "epoch": 0.28125568698817105, "grad_norm": 0.9000157713890076, "learning_rate": 1.9101423487544485e-06, "loss": 0.2285, "mean_token_accuracy": 0.9405502384359187, "num_tokens": 278145836.0, "step": 1405 }, { "entropy": 0.7273861186070876, "epoch": 0.2822565969062784, "grad_norm": 0.7861266732215881, "learning_rate": 1.9090302491103203e-06, "loss": 0.2402, "mean_token_accuracy": 0.9364809323440898, "num_tokens": 279207214.0, "step": 1410 }, { "epoch": 0.2822565969062784, "eval_entropy": 0.6891253717610093, "eval_loss": 0.19434459507465363, "eval_mean_token_accuracy": 0.9439800313261689, "eval_num_tokens": 279207214.0, "eval_runtime": 7.0731, "eval_samples_per_second": 137.563, "eval_steps_per_second": 8.624, "step": 1410 }, { "entropy": 0.7461361895908009, "epoch": 0.28325750682438583, "grad_norm": 0.7425960898399353, "learning_rate": 1.907918149466192e-06, "loss": 0.2326, "mean_token_accuracy": 0.936630117893219, "num_tokens": 280228528.0, "step": 1415 }, { "entropy": 0.7540929274125533, "epoch": 0.2842584167424932, "grad_norm": 0.6490366458892822, "learning_rate": 1.906806049822064e-06, "loss": 0.2303, "mean_token_accuracy": 0.938701045513153, "num_tokens": 281179143.0, "step": 1420 }, { "entropy": 0.7628308453343131, "epoch": 0.28525932666060055, "grad_norm": 1.7688848972320557, "learning_rate": 1.9056939501779359e-06, "loss": 0.2352, "mean_token_accuracy": 0.938917446678335, "num_tokens": 281909203.0, "step": 1425 }, { "entropy": 0.6694089114665985, "epoch": 0.2862602365787079, "grad_norm": 0.8820457458496094, "learning_rate": 1.9045818505338077e-06, "loss": 0.2298, "mean_token_accuracy": 0.9397172857414592, "num_tokens": 283042814.0, "step": 1430 }, { "entropy": 0.7282295411283319, "epoch": 0.2872611464968153, "grad_norm": 0.7975929379463196, "learning_rate": 1.9034697508896796e-06, "loss": 0.241, "mean_token_accuracy": 0.9352296363223683, "num_tokens": 284128852.0, "step": 1435 }, { "entropy": 0.7595264895395799, "epoch": 0.28826205641492264, "grad_norm": 0.734137773513794, "learning_rate": 1.9023576512455515e-06, "loss": 0.2438, "mean_token_accuracy": 0.9358488120815971, "num_tokens": 285148293.0, "step": 1440 }, { "epoch": 0.28826205641492264, "eval_entropy": 0.6968571543693542, "eval_loss": 0.19376739859580994, "eval_mean_token_accuracy": 0.9440957708436934, "eval_num_tokens": 285148293.0, "eval_runtime": 7.0823, "eval_samples_per_second": 137.384, "eval_steps_per_second": 8.613, "step": 1440 }, { "entropy": 0.774249031868848, "epoch": 0.28926296633303, "grad_norm": 0.6629706025123596, "learning_rate": 1.9012455516014233e-06, "loss": 0.2369, "mean_token_accuracy": 0.9372912900014357, "num_tokens": 286110916.0, "step": 1445 }, { "entropy": 0.7704967883500186, "epoch": 0.2902638762511374, "grad_norm": 1.558838129043579, "learning_rate": 1.9001334519572954e-06, "loss": 0.2389, "mean_token_accuracy": 0.9377495538104664, "num_tokens": 286850014.0, "step": 1450 }, { "entropy": 0.6755463258786635, "epoch": 0.2912647861692448, "grad_norm": 0.8654264211654663, "learning_rate": 1.899021352313167e-06, "loss": 0.2256, "mean_token_accuracy": 0.9411474087021567, "num_tokens": 287979759.0, "step": 1455 }, { "entropy": 0.7257778595794331, "epoch": 0.29226569608735214, "grad_norm": 0.771135687828064, "learning_rate": 1.897909252669039e-06, "loss": 0.2318, "mean_token_accuracy": 0.9382907439361919, "num_tokens": 289095571.0, "step": 1460 }, { "entropy": 0.7421090098944577, "epoch": 0.2932666060054595, "grad_norm": 0.7648544907569885, "learning_rate": 1.896797153024911e-06, "loss": 0.2368, "mean_token_accuracy": 0.9367521686987443, "num_tokens": 290146846.0, "step": 1465 }, { "entropy": 0.7553750325332989, "epoch": 0.29426751592356687, "grad_norm": 0.7135232090950012, "learning_rate": 1.8956850533807828e-06, "loss": 0.2312, "mean_token_accuracy": 0.9393588678403334, "num_tokens": 291083010.0, "step": 1470 }, { "epoch": 0.29426751592356687, "eval_entropy": 0.6849908721251566, "eval_loss": 0.192779079079628, "eval_mean_token_accuracy": 0.945117437448658, "eval_num_tokens": 291083010.0, "eval_runtime": 7.1063, "eval_samples_per_second": 136.92, "eval_steps_per_second": 8.584, "step": 1470 }, { "entropy": 0.764943554726514, "epoch": 0.29526842584167423, "grad_norm": 1.642973780632019, "learning_rate": 1.8945729537366549e-06, "loss": 0.2358, "mean_token_accuracy": 0.9390517413616181, "num_tokens": 291805745.0, "step": 1475 }, { "entropy": 0.6666032200509852, "epoch": 0.2962693357597816, "grad_norm": 0.9155055284500122, "learning_rate": 1.8934608540925265e-06, "loss": 0.2214, "mean_token_accuracy": 0.9421197701584209, "num_tokens": 292942990.0, "step": 1480 }, { "entropy": 0.7196945285255258, "epoch": 0.297270245677889, "grad_norm": 0.8412073254585266, "learning_rate": 1.8923487544483984e-06, "loss": 0.2319, "mean_token_accuracy": 0.9379522231492129, "num_tokens": 294035917.0, "step": 1485 }, { "entropy": 0.7461063379591162, "epoch": 0.2982711555959964, "grad_norm": 0.7782725095748901, "learning_rate": 1.8912366548042705e-06, "loss": 0.2237, "mean_token_accuracy": 0.939490559426221, "num_tokens": 295053863.0, "step": 1490 }, { "entropy": 0.7554086994041096, "epoch": 0.29927206551410374, "grad_norm": 0.6107756495475769, "learning_rate": 1.8901245551601423e-06, "loss": 0.2238, "mean_token_accuracy": 0.9404414875940843, "num_tokens": 296006683.0, "step": 1495 }, { "entropy": 0.7677417294545608, "epoch": 0.3002729754322111, "grad_norm": 1.474409580230713, "learning_rate": 1.889012455516014e-06, "loss": 0.2235, "mean_token_accuracy": 0.9412395347248425, "num_tokens": 296732879.0, "step": 1500 }, { "epoch": 0.3002729754322111, "eval_entropy": 0.689288352845145, "eval_loss": 0.19140712916851044, "eval_mean_token_accuracy": 0.9447863365783066, "eval_num_tokens": 296732879.0, "eval_runtime": 7.0341, "eval_samples_per_second": 138.327, "eval_steps_per_second": 8.672, "step": 1500 }, { "entropy": 0.6815881165591153, "epoch": 0.30127388535031846, "grad_norm": 0.898077130317688, "learning_rate": 1.887900355871886e-06, "loss": 0.2252, "mean_token_accuracy": 0.9407751598141411, "num_tokens": 297865911.0, "step": 1505 }, { "entropy": 0.7220306786623868, "epoch": 0.3022747952684258, "grad_norm": 0.7506076693534851, "learning_rate": 1.8867882562277579e-06, "loss": 0.2271, "mean_token_accuracy": 0.9396520457484505, "num_tokens": 298941429.0, "step": 1510 }, { "entropy": 0.7438318740237843, "epoch": 0.3032757051865332, "grad_norm": 0.6981909275054932, "learning_rate": 1.88567615658363e-06, "loss": 0.2306, "mean_token_accuracy": 0.9382837609811263, "num_tokens": 299985729.0, "step": 1515 }, { "entropy": 0.7525452472946861, "epoch": 0.3042766151046406, "grad_norm": 0.6050431728363037, "learning_rate": 1.8845640569395016e-06, "loss": 0.2281, "mean_token_accuracy": 0.9401122011921622, "num_tokens": 300946365.0, "step": 1520 }, { "entropy": 0.7545472253452647, "epoch": 0.30527752502274796, "grad_norm": 1.560426115989685, "learning_rate": 1.8834519572953735e-06, "loss": 0.2295, "mean_token_accuracy": 0.9403627395629883, "num_tokens": 301692068.0, "step": 1525 }, { "entropy": 0.678287308324467, "epoch": 0.3062784349408553, "grad_norm": 0.9419786334037781, "learning_rate": 1.8823398576512455e-06, "loss": 0.2251, "mean_token_accuracy": 0.9410501371730458, "num_tokens": 302836523.0, "step": 1530 }, { "epoch": 0.3062784349408553, "eval_entropy": 0.6856205536693823, "eval_loss": 0.19144752621650696, "eval_mean_token_accuracy": 0.9452652296081918, "eval_num_tokens": 302836523.0, "eval_runtime": 7.0363, "eval_samples_per_second": 138.283, "eval_steps_per_second": 8.669, "step": 1530 }, { "entropy": 0.7217613477598537, "epoch": 0.3072793448589627, "grad_norm": 0.7879256010055542, "learning_rate": 1.8812277580071174e-06, "loss": 0.2274, "mean_token_accuracy": 0.9390739977359772, "num_tokens": 303914943.0, "step": 1535 }, { "entropy": 0.7384155148809607, "epoch": 0.30828025477707005, "grad_norm": 0.7203854918479919, "learning_rate": 1.880115658362989e-06, "loss": 0.2313, "mean_token_accuracy": 0.9380901526321065, "num_tokens": 304955992.0, "step": 1540 }, { "entropy": 0.7574896769090133, "epoch": 0.3092811646951774, "grad_norm": 0.6372812986373901, "learning_rate": 1.8790035587188611e-06, "loss": 0.2324, "mean_token_accuracy": 0.9386772296645425, "num_tokens": 305902295.0, "step": 1545 }, { "entropy": 0.7646282634951852, "epoch": 0.31028207461328483, "grad_norm": 1.6246287822723389, "learning_rate": 1.877891459074733e-06, "loss": 0.2342, "mean_token_accuracy": 0.93938661867922, "num_tokens": 306621753.0, "step": 1550 }, { "entropy": 0.6615015620535071, "epoch": 0.3112829845313922, "grad_norm": 0.8894542455673218, "learning_rate": 1.876779359430605e-06, "loss": 0.2159, "mean_token_accuracy": 0.9429294396530498, "num_tokens": 307776834.0, "step": 1555 }, { "entropy": 0.7109584380279887, "epoch": 0.31228389444949956, "grad_norm": 0.7467630505561829, "learning_rate": 1.8756672597864769e-06, "loss": 0.2262, "mean_token_accuracy": 0.9396391229196028, "num_tokens": 308870945.0, "step": 1560 }, { "epoch": 0.31228389444949956, "eval_entropy": 0.6883383735281522, "eval_loss": 0.18963798880577087, "eval_mean_token_accuracy": 0.94547944967864, "eval_num_tokens": 308870945.0, "eval_runtime": 7.0626, "eval_samples_per_second": 137.768, "eval_steps_per_second": 8.637, "step": 1560 }, { "entropy": 0.7411357695406133, "epoch": 0.3132848043676069, "grad_norm": 0.7028961181640625, "learning_rate": 1.8745551601423485e-06, "loss": 0.2327, "mean_token_accuracy": 0.9375578728589145, "num_tokens": 309929138.0, "step": 1565 }, { "entropy": 0.7481155969879844, "epoch": 0.3142857142857143, "grad_norm": 0.6543077230453491, "learning_rate": 1.8734430604982206e-06, "loss": 0.2229, "mean_token_accuracy": 0.9416344767267054, "num_tokens": 310870701.0, "step": 1570 }, { "entropy": 0.7611248016357421, "epoch": 0.31528662420382164, "grad_norm": 1.6195554733276367, "learning_rate": 1.8723309608540925e-06, "loss": 0.226, "mean_token_accuracy": 0.9410505023869601, "num_tokens": 311585943.0, "step": 1575 }, { "entropy": 0.6666329188780351, "epoch": 0.316287534121929, "grad_norm": 0.9081742167472839, "learning_rate": 1.8712188612099643e-06, "loss": 0.2203, "mean_token_accuracy": 0.9423610427162864, "num_tokens": 312697721.0, "step": 1580 }, { "entropy": 0.7309835661541332, "epoch": 0.3172884440400364, "grad_norm": 0.7687853574752808, "learning_rate": 1.8701067615658362e-06, "loss": 0.2313, "mean_token_accuracy": 0.9385158609260212, "num_tokens": 313772951.0, "step": 1585 }, { "entropy": 0.747816955501383, "epoch": 0.3182893539581438, "grad_norm": 0.7106137871742249, "learning_rate": 1.868994661921708e-06, "loss": 0.2306, "mean_token_accuracy": 0.9387970268726349, "num_tokens": 314805910.0, "step": 1590 }, { "epoch": 0.3182893539581438, "eval_entropy": 0.6863706659098141, "eval_loss": 0.1875828355550766, "eval_mean_token_accuracy": 0.9464720380110819, "eval_num_tokens": 314805910.0, "eval_runtime": 7.0006, "eval_samples_per_second": 138.987, "eval_steps_per_second": 8.713, "step": 1590 }, { "entropy": 0.7534966772252863, "epoch": 0.31929026387625115, "grad_norm": 0.6492555141448975, "learning_rate": 1.86788256227758e-06, "loss": 0.2231, "mean_token_accuracy": 0.9415834470228716, "num_tokens": 315758423.0, "step": 1595 }, { "entropy": 0.7610360833731564, "epoch": 0.3202911737943585, "grad_norm": 1.5349500179290771, "learning_rate": 1.866770462633452e-06, "loss": 0.2281, "mean_token_accuracy": 0.9405443429946899, "num_tokens": 316494038.0, "step": 1600 }, { "entropy": 0.6749826358123259, "epoch": 0.32129208371246587, "grad_norm": 0.8564639091491699, "learning_rate": 1.8656583629893236e-06, "loss": 0.2168, "mean_token_accuracy": 0.9434851581400091, "num_tokens": 317637128.0, "step": 1605 }, { "entropy": 0.7281479911370711, "epoch": 0.32229299363057323, "grad_norm": 0.7530900239944458, "learning_rate": 1.8645462633451957e-06, "loss": 0.2306, "mean_token_accuracy": 0.9385505968874152, "num_tokens": 318711437.0, "step": 1610 }, { "entropy": 0.7480952777645805, "epoch": 0.3232939035486806, "grad_norm": 0.7651330828666687, "learning_rate": 1.8634341637010675e-06, "loss": 0.2222, "mean_token_accuracy": 0.9407213232733986, "num_tokens": 319762432.0, "step": 1615 }, { "entropy": 0.7627674433318051, "epoch": 0.324294813466788, "grad_norm": 1.319263219833374, "learning_rate": 1.8623220640569394e-06, "loss": 0.2291, "mean_token_accuracy": 0.9400727407498793, "num_tokens": 320714756.0, "step": 1620 }, { "epoch": 0.324294813466788, "eval_entropy": 0.6917840105588319, "eval_loss": 0.1892538070678711, "eval_mean_token_accuracy": 0.9453025753380823, "eval_num_tokens": 320714756.0, "eval_runtime": 7.0555, "eval_samples_per_second": 137.907, "eval_steps_per_second": 8.646, "step": 1620 }, { "entropy": 0.7565292894840241, "epoch": 0.3252957233848954, "grad_norm": 1.500351905822754, "learning_rate": 1.8612099644128113e-06, "loss": 0.2253, "mean_token_accuracy": 0.9414621531963349, "num_tokens": 321455218.0, "step": 1625 }, { "entropy": 0.6765736005522988, "epoch": 0.32629663330300274, "grad_norm": 0.8855953216552734, "learning_rate": 1.8600978647686831e-06, "loss": 0.2215, "mean_token_accuracy": 0.9421357078985735, "num_tokens": 322588730.0, "step": 1630 }, { "entropy": 0.731200877644799, "epoch": 0.3272975432211101, "grad_norm": 0.8593675494194031, "learning_rate": 1.8589857651245552e-06, "loss": 0.2211, "mean_token_accuracy": 0.940478920394724, "num_tokens": 323655165.0, "step": 1635 }, { "entropy": 0.7489201041785154, "epoch": 0.32829845313921746, "grad_norm": 0.6950727105140686, "learning_rate": 1.857873665480427e-06, "loss": 0.2267, "mean_token_accuracy": 0.9399041311307387, "num_tokens": 324696694.0, "step": 1640 }, { "entropy": 0.7550950695167888, "epoch": 0.3292993630573248, "grad_norm": 0.6508896946907043, "learning_rate": 1.8567615658362989e-06, "loss": 0.2215, "mean_token_accuracy": 0.9421004755930467, "num_tokens": 325659634.0, "step": 1645 }, { "entropy": 0.7663576700470665, "epoch": 0.3303002729754322, "grad_norm": 1.634458065032959, "learning_rate": 1.8556494661921708e-06, "loss": 0.2207, "mean_token_accuracy": 0.9429123352874409, "num_tokens": 326395974.0, "step": 1650 }, { "epoch": 0.3303002729754322, "eval_entropy": 0.6862643673771718, "eval_loss": 0.18751998245716095, "eval_mean_token_accuracy": 0.9456404789549405, "eval_num_tokens": 326395974.0, "eval_runtime": 7.1493, "eval_samples_per_second": 136.097, "eval_steps_per_second": 8.532, "step": 1650 }, { "entropy": 0.6814083841713992, "epoch": 0.3313011828935396, "grad_norm": 0.8884172439575195, "learning_rate": 1.8545373665480426e-06, "loss": 0.2205, "mean_token_accuracy": 0.9427086288278753, "num_tokens": 327518250.0, "step": 1655 }, { "entropy": 0.7267371454022148, "epoch": 0.33230209281164697, "grad_norm": 0.864007294178009, "learning_rate": 1.8534252669039145e-06, "loss": 0.225, "mean_token_accuracy": 0.9397233930501071, "num_tokens": 328618461.0, "step": 1660 }, { "entropy": 0.7420350654558702, "epoch": 0.3333030027297543, "grad_norm": 0.7210493087768555, "learning_rate": 1.8523131672597865e-06, "loss": 0.2179, "mean_token_accuracy": 0.9413225569508292, "num_tokens": 329644537.0, "step": 1665 }, { "entropy": 0.7561123300682414, "epoch": 0.3343039126478617, "grad_norm": 0.6487271785736084, "learning_rate": 1.8512010676156582e-06, "loss": 0.2254, "mean_token_accuracy": 0.9409215840426358, "num_tokens": 330598048.0, "step": 1670 }, { "entropy": 0.7530049925500696, "epoch": 0.33530482256596905, "grad_norm": 1.4161484241485596, "learning_rate": 1.85008896797153e-06, "loss": 0.2262, "mean_token_accuracy": 0.9407056949355386, "num_tokens": 331335749.0, "step": 1675 }, { "entropy": 0.6709848523139954, "epoch": 0.3363057324840764, "grad_norm": 0.8709940314292908, "learning_rate": 1.8489768683274021e-06, "loss": 0.2126, "mean_token_accuracy": 0.9443153614347631, "num_tokens": 332446173.0, "step": 1680 }, { "epoch": 0.3363057324840764, "eval_entropy": 0.6809069924667234, "eval_loss": 0.18685181438922882, "eval_mean_token_accuracy": 0.9458920027388901, "eval_num_tokens": 332446173.0, "eval_runtime": 7.0534, "eval_samples_per_second": 137.948, "eval_steps_per_second": 8.648, "step": 1680 }, { "entropy": 0.7158765223893252, "epoch": 0.3373066424021838, "grad_norm": 0.7924162745475769, "learning_rate": 1.847864768683274e-06, "loss": 0.2193, "mean_token_accuracy": 0.9411095483736558, "num_tokens": 333540296.0, "step": 1685 }, { "entropy": 0.7432989163832231, "epoch": 0.3383075523202912, "grad_norm": 0.7170067429542542, "learning_rate": 1.8467526690391458e-06, "loss": 0.2231, "mean_token_accuracy": 0.9399533082138408, "num_tokens": 334567842.0, "step": 1690 }, { "entropy": 0.7567171161825007, "epoch": 0.33930846223839856, "grad_norm": 0.8179503679275513, "learning_rate": 1.8456405693950177e-06, "loss": 0.2163, "mean_token_accuracy": 0.9416104576804422, "num_tokens": 335513564.0, "step": 1695 }, { "entropy": 0.7489974737167359, "epoch": 0.3403093721565059, "grad_norm": 1.53611421585083, "learning_rate": 1.8445284697508895e-06, "loss": 0.2204, "mean_token_accuracy": 0.942312642661008, "num_tokens": 336241111.0, "step": 1700 }, { "entropy": 0.654525652256879, "epoch": 0.3413102820746133, "grad_norm": 0.8707150816917419, "learning_rate": 1.8434163701067616e-06, "loss": 0.2149, "mean_token_accuracy": 0.9436011032624678, "num_tokens": 337388829.0, "step": 1705 }, { "entropy": 0.7030858914960515, "epoch": 0.34231119199272064, "grad_norm": 0.7638726830482483, "learning_rate": 1.8423042704626333e-06, "loss": 0.2166, "mean_token_accuracy": 0.9418383235281164, "num_tokens": 338487950.0, "step": 1710 }, { "epoch": 0.34231119199272064, "eval_entropy": 0.6751082666584702, "eval_loss": 0.18752196431159973, "eval_mean_token_accuracy": 0.9461400889959491, "eval_num_tokens": 338487950.0, "eval_runtime": 7.0608, "eval_samples_per_second": 137.804, "eval_steps_per_second": 8.639, "step": 1710 }, { "entropy": 0.7331013533202084, "epoch": 0.343312101910828, "grad_norm": 0.7417324781417847, "learning_rate": 1.8411921708185051e-06, "loss": 0.2186, "mean_token_accuracy": 0.9412295021794059, "num_tokens": 339528555.0, "step": 1715 }, { "entropy": 0.7409337301145901, "epoch": 0.3443130118289354, "grad_norm": 0.5683432817459106, "learning_rate": 1.8400800711743772e-06, "loss": 0.2216, "mean_token_accuracy": 0.9424649195237593, "num_tokens": 340471062.0, "step": 1720 }, { "entropy": 0.7573754261840474, "epoch": 0.3453139217470428, "grad_norm": 1.6265780925750732, "learning_rate": 1.838967971530249e-06, "loss": 0.2232, "mean_token_accuracy": 0.9421446426348252, "num_tokens": 341200791.0, "step": 1725 }, { "entropy": 0.6634017543359236, "epoch": 0.34631483166515015, "grad_norm": 0.8331648111343384, "learning_rate": 1.8378558718861211e-06, "loss": 0.215, "mean_token_accuracy": 0.9439348957755349, "num_tokens": 342354243.0, "step": 1730 }, { "entropy": 0.7234609690579501, "epoch": 0.3473157415832575, "grad_norm": 0.7795122861862183, "learning_rate": 1.8367437722419928e-06, "loss": 0.2218, "mean_token_accuracy": 0.9413307238708842, "num_tokens": 343460007.0, "step": 1735 }, { "entropy": 0.7389553557742726, "epoch": 0.34831665150136487, "grad_norm": 0.7650998830795288, "learning_rate": 1.8356316725978646e-06, "loss": 0.2221, "mean_token_accuracy": 0.9402573309161446, "num_tokens": 344485585.0, "step": 1740 }, { "epoch": 0.34831665150136487, "eval_entropy": 0.6805569617474665, "eval_loss": 0.18589681386947632, "eval_mean_token_accuracy": 0.9470884516590932, "eval_num_tokens": 344485585.0, "eval_runtime": 7.0844, "eval_samples_per_second": 137.343, "eval_steps_per_second": 8.61, "step": 1740 }, { "entropy": 0.7529873455112631, "epoch": 0.34931756141947223, "grad_norm": 0.6545958518981934, "learning_rate": 1.8345195729537367e-06, "loss": 0.2154, "mean_token_accuracy": 0.9425264336846092, "num_tokens": 345437559.0, "step": 1745 }, { "entropy": 0.761711223558946, "epoch": 0.3503184713375796, "grad_norm": 1.5173709392547607, "learning_rate": 1.8334074733096085e-06, "loss": 0.2199, "mean_token_accuracy": 0.9430206764828075, "num_tokens": 346169843.0, "step": 1750 }, { "entropy": 0.6663856636394154, "epoch": 0.351319381255687, "grad_norm": 0.9439927935600281, "learning_rate": 1.8322953736654802e-06, "loss": 0.2083, "mean_token_accuracy": 0.9455912454561753, "num_tokens": 347311478.0, "step": 1755 }, { "entropy": 0.7240512457760897, "epoch": 0.3523202911737944, "grad_norm": 0.8567253351211548, "learning_rate": 1.8311832740213523e-06, "loss": 0.2201, "mean_token_accuracy": 0.94097445119511, "num_tokens": 348391019.0, "step": 1760 }, { "entropy": 0.7536248537627134, "epoch": 0.35332120109190174, "grad_norm": 0.7871220707893372, "learning_rate": 1.8300711743772241e-06, "loss": 0.2221, "mean_token_accuracy": 0.9408430018208244, "num_tokens": 349427873.0, "step": 1765 }, { "entropy": 0.752256919037212, "epoch": 0.3543221110100091, "grad_norm": 0.7607414722442627, "learning_rate": 1.8289590747330962e-06, "loss": 0.2133, "mean_token_accuracy": 0.9429961264133453, "num_tokens": 350368302.0, "step": 1770 }, { "epoch": 0.3543221110100091, "eval_entropy": 0.6793914603405311, "eval_loss": 0.18565388023853302, "eval_mean_token_accuracy": 0.9467462975470746, "eval_num_tokens": 350368302.0, "eval_runtime": 7.0807, "eval_samples_per_second": 137.416, "eval_steps_per_second": 8.615, "step": 1770 }, { "entropy": 0.7683494849638506, "epoch": 0.35532302092811646, "grad_norm": 1.6403712034225464, "learning_rate": 1.8278469750889678e-06, "loss": 0.2249, "mean_token_accuracy": 0.9412687144496225, "num_tokens": 351095025.0, "step": 1775 }, { "entropy": 0.6670152826742692, "epoch": 0.3563239308462238, "grad_norm": 0.8832131624221802, "learning_rate": 1.8267348754448397e-06, "loss": 0.205, "mean_token_accuracy": 0.9458218796686693, "num_tokens": 352193150.0, "step": 1780 }, { "entropy": 0.7185638297687877, "epoch": 0.3573248407643312, "grad_norm": 0.8331535458564758, "learning_rate": 1.8256227758007118e-06, "loss": 0.2204, "mean_token_accuracy": 0.9414176198569211, "num_tokens": 353272050.0, "step": 1785 }, { "entropy": 0.7420683860778808, "epoch": 0.3583257506824386, "grad_norm": 0.8582054376602173, "learning_rate": 1.8245106761565836e-06, "loss": 0.2194, "mean_token_accuracy": 0.9416669065302069, "num_tokens": 354301218.0, "step": 1790 }, { "entropy": 0.7551353487101469, "epoch": 0.35932666060054597, "grad_norm": 0.6729021668434143, "learning_rate": 1.8233985765124555e-06, "loss": 0.2176, "mean_token_accuracy": 0.9424347899176858, "num_tokens": 355250303.0, "step": 1795 }, { "entropy": 0.751355068250136, "epoch": 0.36032757051865333, "grad_norm": 1.6505812406539917, "learning_rate": 1.8222864768683273e-06, "loss": 0.2153, "mean_token_accuracy": 0.9439191401004792, "num_tokens": 355986825.0, "step": 1800 }, { "epoch": 0.36032757051865333, "eval_entropy": 0.6828553637520212, "eval_loss": 0.18496987223625183, "eval_mean_token_accuracy": 0.946765600657854, "eval_num_tokens": 355986825.0, "eval_runtime": 7.196, "eval_samples_per_second": 135.214, "eval_steps_per_second": 8.477, "step": 1800 }, { "entropy": 0.671533118052916, "epoch": 0.3613284804367607, "grad_norm": 0.9078927040100098, "learning_rate": 1.8211743772241992e-06, "loss": 0.2158, "mean_token_accuracy": 0.9436379351399161, "num_tokens": 357130061.0, "step": 1805 }, { "entropy": 0.7248334830457513, "epoch": 0.36232939035486805, "grad_norm": 0.8133084177970886, "learning_rate": 1.820062277580071e-06, "loss": 0.2168, "mean_token_accuracy": 0.9420726684006777, "num_tokens": 358203695.0, "step": 1810 }, { "entropy": 0.749659313396974, "epoch": 0.3633303002729754, "grad_norm": 0.7274289727210999, "learning_rate": 1.8189501779359431e-06, "loss": 0.2183, "mean_token_accuracy": 0.9410731163891879, "num_tokens": 359255272.0, "step": 1815 }, { "entropy": 0.7492427452044054, "epoch": 0.3643312101910828, "grad_norm": 0.6970122456550598, "learning_rate": 1.8178380782918148e-06, "loss": 0.2122, "mean_token_accuracy": 0.9437575806270946, "num_tokens": 360205760.0, "step": 1820 }, { "entropy": 0.7630361009727825, "epoch": 0.3653321201091902, "grad_norm": 1.5553841590881348, "learning_rate": 1.8167259786476868e-06, "loss": 0.2171, "mean_token_accuracy": 0.9435903998938474, "num_tokens": 360930756.0, "step": 1825 }, { "entropy": 0.6604581854560159, "epoch": 0.36633303002729756, "grad_norm": 0.8930894136428833, "learning_rate": 1.8156138790035587e-06, "loss": 0.2064, "mean_token_accuracy": 0.9453142903067849, "num_tokens": 362073768.0, "step": 1830 }, { "epoch": 0.36633303002729756, "eval_entropy": 0.6797945323537607, "eval_loss": 0.18627671897411346, "eval_mean_token_accuracy": 0.9462826427866201, "eval_num_tokens": 362073768.0, "eval_runtime": 7.0766, "eval_samples_per_second": 137.496, "eval_steps_per_second": 8.62, "step": 1830 }, { "entropy": 0.7122279877012426, "epoch": 0.3673339399454049, "grad_norm": 0.7932081818580627, "learning_rate": 1.8145017793594305e-06, "loss": 0.2163, "mean_token_accuracy": 0.9423632816834884, "num_tokens": 363174699.0, "step": 1835 }, { "entropy": 0.7323072322390296, "epoch": 0.3683348498635123, "grad_norm": 0.8120712637901306, "learning_rate": 1.8133896797153024e-06, "loss": 0.2135, "mean_token_accuracy": 0.9426818517121401, "num_tokens": 364220503.0, "step": 1840 }, { "entropy": 0.7379515593702143, "epoch": 0.36933575978161964, "grad_norm": 0.6464109420776367, "learning_rate": 1.8122775800711743e-06, "loss": 0.2155, "mean_token_accuracy": 0.9436933148990978, "num_tokens": 365183344.0, "step": 1845 }, { "entropy": 0.747268967736851, "epoch": 0.370336669699727, "grad_norm": 1.5971249341964722, "learning_rate": 1.8111654804270461e-06, "loss": 0.2143, "mean_token_accuracy": 0.9438003106550736, "num_tokens": 365919847.0, "step": 1850 }, { "entropy": 0.6582850561900573, "epoch": 0.37133757961783437, "grad_norm": 0.863985538482666, "learning_rate": 1.8100533807829182e-06, "loss": 0.2095, "mean_token_accuracy": 0.9448756602677432, "num_tokens": 367064732.0, "step": 1855 }, { "entropy": 0.7067332503470507, "epoch": 0.3723384895359418, "grad_norm": 0.8197696208953857, "learning_rate": 1.8089412811387898e-06, "loss": 0.2105, "mean_token_accuracy": 0.9437748323787343, "num_tokens": 368151926.0, "step": 1860 }, { "epoch": 0.3723384895359418, "eval_entropy": 0.6697740437554531, "eval_loss": 0.18541671335697174, "eval_mean_token_accuracy": 0.9467641076103586, "eval_num_tokens": 368151926.0, "eval_runtime": 7.2316, "eval_samples_per_second": 134.548, "eval_steps_per_second": 8.435, "step": 1860 }, { "entropy": 0.7188923353498632, "epoch": 0.37333939945404915, "grad_norm": 0.7241514921188354, "learning_rate": 1.807829181494662e-06, "loss": 0.2159, "mean_token_accuracy": 0.9426933927969499, "num_tokens": 369193883.0, "step": 1865 }, { "entropy": 0.7283467411994934, "epoch": 0.3743403093721565, "grad_norm": 0.6511433720588684, "learning_rate": 1.8067170818505338e-06, "loss": 0.2046, "mean_token_accuracy": 0.9453721604563973, "num_tokens": 370160842.0, "step": 1870 }, { "entropy": 0.7400695074688305, "epoch": 0.37534121929026387, "grad_norm": 1.732633113861084, "learning_rate": 1.8056049822064056e-06, "loss": 0.2073, "mean_token_accuracy": 0.9456492299383337, "num_tokens": 370892250.0, "step": 1875 }, { "entropy": 0.6502942296591672, "epoch": 0.37634212920837123, "grad_norm": 0.9057416915893555, "learning_rate": 1.8044928825622777e-06, "loss": 0.2069, "mean_token_accuracy": 0.9456757783889771, "num_tokens": 372022804.0, "step": 1880 }, { "entropy": 0.6950947940349579, "epoch": 0.3773430391264786, "grad_norm": 0.7539053559303284, "learning_rate": 1.8033807829181493e-06, "loss": 0.2123, "mean_token_accuracy": 0.9439136678522283, "num_tokens": 373111663.0, "step": 1885 }, { "entropy": 0.7123507954857566, "epoch": 0.378343949044586, "grad_norm": 0.7048158645629883, "learning_rate": 1.8022686832740212e-06, "loss": 0.2048, "mean_token_accuracy": 0.944564142552289, "num_tokens": 374149694.0, "step": 1890 }, { "epoch": 0.378343949044586, "eval_entropy": 0.6676561910598005, "eval_loss": 0.1879517138004303, "eval_mean_token_accuracy": 0.9460887654882962, "eval_num_tokens": 374149694.0, "eval_runtime": 7.0566, "eval_samples_per_second": 137.886, "eval_steps_per_second": 8.644, "step": 1890 }, { "entropy": 0.7297647855498574, "epoch": 0.3793448589626934, "grad_norm": 0.6239880323410034, "learning_rate": 1.8011565836298933e-06, "loss": 0.2072, "mean_token_accuracy": 0.9445389471270821, "num_tokens": 375098313.0, "step": 1895 }, { "entropy": 0.7302077791907571, "epoch": 0.38034576888080074, "grad_norm": 1.6145358085632324, "learning_rate": 1.8000444839857651e-06, "loss": 0.203, "mean_token_accuracy": 0.9472867906093597, "num_tokens": 375816250.0, "step": 1900 }, { "entropy": 0.651517802476883, "epoch": 0.3813466787989081, "grad_norm": 0.9150720834732056, "learning_rate": 1.7989323843416368e-06, "loss": 0.2025, "mean_token_accuracy": 0.9464400454000993, "num_tokens": 376972772.0, "step": 1905 }, { "entropy": 0.7013754468072545, "epoch": 0.38234758871701546, "grad_norm": 0.7586289048194885, "learning_rate": 1.7978202846975088e-06, "loss": 0.2136, "mean_token_accuracy": 0.9431337069381367, "num_tokens": 378053259.0, "step": 1910 }, { "entropy": 0.7290194546634501, "epoch": 0.3833484986351228, "grad_norm": 0.7516461610794067, "learning_rate": 1.7967081850533807e-06, "loss": 0.2104, "mean_token_accuracy": 0.9425576074556871, "num_tokens": 379103040.0, "step": 1915 }, { "entropy": 0.729288539019498, "epoch": 0.3843494085532302, "grad_norm": 0.620888352394104, "learning_rate": 1.7955960854092528e-06, "loss": 0.2161, "mean_token_accuracy": 0.9428860014135187, "num_tokens": 380068398.0, "step": 1920 }, { "epoch": 0.3843494085532302, "eval_entropy": 0.6771206176671826, "eval_loss": 0.18380054831504822, "eval_mean_token_accuracy": 0.9472967024709358, "eval_num_tokens": 380068398.0, "eval_runtime": 7.0603, "eval_samples_per_second": 137.813, "eval_steps_per_second": 8.64, "step": 1920 }, { "entropy": 0.7446090530265461, "epoch": 0.3853503184713376, "grad_norm": 1.8233861923217773, "learning_rate": 1.7944839857651244e-06, "loss": 0.2103, "mean_token_accuracy": 0.9449324960058386, "num_tokens": 380798426.0, "step": 1925 }, { "entropy": 0.6486159508878534, "epoch": 0.38635122838944497, "grad_norm": 0.8591449856758118, "learning_rate": 1.7933718861209963e-06, "loss": 0.1985, "mean_token_accuracy": 0.9478086097673937, "num_tokens": 381921279.0, "step": 1930 }, { "entropy": 0.6935458053242076, "epoch": 0.38735213830755233, "grad_norm": 0.7733218669891357, "learning_rate": 1.7922597864768683e-06, "loss": 0.2105, "mean_token_accuracy": 0.9436502153223211, "num_tokens": 383034624.0, "step": 1935 }, { "entropy": 0.7152742207050323, "epoch": 0.3883530482256597, "grad_norm": 0.7645531296730042, "learning_rate": 1.7911476868327402e-06, "loss": 0.2141, "mean_token_accuracy": 0.9426664742556485, "num_tokens": 384082302.0, "step": 1940 }, { "entropy": 0.72131880142472, "epoch": 0.38935395814376705, "grad_norm": 0.6350061893463135, "learning_rate": 1.7900355871886118e-06, "loss": 0.2139, "mean_token_accuracy": 0.9435091322118586, "num_tokens": 385027404.0, "step": 1945 }, { "entropy": 0.7312435442751104, "epoch": 0.3903548680618744, "grad_norm": 1.5735912322998047, "learning_rate": 1.788923487544484e-06, "loss": 0.2147, "mean_token_accuracy": 0.9442145147106864, "num_tokens": 385747602.0, "step": 1950 }, { "epoch": 0.3903548680618744, "eval_entropy": 0.6652554879423047, "eval_loss": 0.18200834095478058, "eval_mean_token_accuracy": 0.9481365035791867, "eval_num_tokens": 385747602.0, "eval_runtime": 7.0932, "eval_samples_per_second": 137.173, "eval_steps_per_second": 8.6, "step": 1950 }, { "entropy": 0.6431225692684001, "epoch": 0.3913557779799818, "grad_norm": 0.8696854710578918, "learning_rate": 1.7878113879003558e-06, "loss": 0.2025, "mean_token_accuracy": 0.9468498847701333, "num_tokens": 386887767.0, "step": 1955 }, { "entropy": 0.7034454665400766, "epoch": 0.3923566878980892, "grad_norm": 0.7643694877624512, "learning_rate": 1.7866992882562278e-06, "loss": 0.2126, "mean_token_accuracy": 0.9431325663219798, "num_tokens": 387964944.0, "step": 1960 }, { "entropy": 0.7246899883855473, "epoch": 0.39335759781619656, "grad_norm": 0.7786898016929626, "learning_rate": 1.7855871886120997e-06, "loss": 0.2089, "mean_token_accuracy": 0.9432727011767301, "num_tokens": 389004308.0, "step": 1965 }, { "entropy": 0.7278256719762629, "epoch": 0.3943585077343039, "grad_norm": 0.6474554538726807, "learning_rate": 1.7844750889679713e-06, "loss": 0.2103, "mean_token_accuracy": 0.9444690823554993, "num_tokens": 389960897.0, "step": 1970 }, { "entropy": 0.7353235575285825, "epoch": 0.3953594176524113, "grad_norm": 1.4766101837158203, "learning_rate": 1.7833629893238434e-06, "loss": 0.2122, "mean_token_accuracy": 0.9448792641813105, "num_tokens": 390690930.0, "step": 1975 }, { "entropy": 0.6459478435191242, "epoch": 0.39636032757051864, "grad_norm": 0.8893265128135681, "learning_rate": 1.7822508896797153e-06, "loss": 0.1952, "mean_token_accuracy": 0.9480626046657562, "num_tokens": 391824072.0, "step": 1980 }, { "epoch": 0.39636032757051864, "eval_entropy": 0.6699851712242502, "eval_loss": 0.18414077162742615, "eval_mean_token_accuracy": 0.9468610022888809, "eval_num_tokens": 391824072.0, "eval_runtime": 7.0451, "eval_samples_per_second": 138.11, "eval_steps_per_second": 8.659, "step": 1980 }, { "entropy": 0.6953091193329204, "epoch": 0.397361237488626, "grad_norm": 0.7777345776557922, "learning_rate": 1.7811387900355871e-06, "loss": 0.1999, "mean_token_accuracy": 0.9463782939043912, "num_tokens": 392922191.0, "step": 1985 }, { "entropy": 0.7238586826757951, "epoch": 0.39836214740673337, "grad_norm": 0.7229942679405212, "learning_rate": 1.780026690391459e-06, "loss": 0.2091, "mean_token_accuracy": 0.9436769247055053, "num_tokens": 393951915.0, "step": 1990 }, { "entropy": 0.7331149523908441, "epoch": 0.3993630573248408, "grad_norm": 4.645083427429199, "learning_rate": 1.7789145907473308e-06, "loss": 0.1999, "mean_token_accuracy": 0.9466928687962619, "num_tokens": 394911168.0, "step": 1995 }, { "entropy": 0.7309780456803062, "epoch": 0.40036396724294815, "grad_norm": 1.4829082489013672, "learning_rate": 1.777802491103203e-06, "loss": 0.2012, "mean_token_accuracy": 0.9472140420566906, "num_tokens": 395641494.0, "step": 2000 }, { "entropy": 0.6425018229267814, "epoch": 0.4013648771610555, "grad_norm": 0.9035446047782898, "learning_rate": 1.7766903914590748e-06, "loss": 0.2018, "mean_token_accuracy": 0.9465541931715878, "num_tokens": 396806735.0, "step": 2005 }, { "entropy": 0.6907033795660192, "epoch": 0.4023657870791629, "grad_norm": 0.7838383913040161, "learning_rate": 1.7755782918149464e-06, "loss": 0.202, "mean_token_accuracy": 0.9456824893301183, "num_tokens": 397897808.0, "step": 2010 }, { "epoch": 0.4023657870791629, "eval_entropy": 0.6626140348246841, "eval_loss": 0.18410223722457886, "eval_mean_token_accuracy": 0.9472184445037216, "eval_num_tokens": 397897808.0, "eval_runtime": 7.0819, "eval_samples_per_second": 137.392, "eval_steps_per_second": 8.613, "step": 2010 }, { "entropy": 0.726021606000987, "epoch": 0.40336669699727024, "grad_norm": 0.7114729881286621, "learning_rate": 1.7744661921708185e-06, "loss": 0.2095, "mean_token_accuracy": 0.9433100288564509, "num_tokens": 398940578.0, "step": 2015 }, { "entropy": 0.734506199576638, "epoch": 0.4043676069153776, "grad_norm": 0.6169213652610779, "learning_rate": 1.7733540925266903e-06, "loss": 0.2013, "mean_token_accuracy": 0.9469331833449277, "num_tokens": 399905944.0, "step": 2020 }, { "entropy": 0.7397109557281841, "epoch": 0.40536851683348496, "grad_norm": 1.60407292842865, "learning_rate": 1.7722419928825622e-06, "loss": 0.2062, "mean_token_accuracy": 0.945603883266449, "num_tokens": 400634292.0, "step": 2025 }, { "entropy": 0.6445516106757251, "epoch": 0.4063694267515924, "grad_norm": 0.8769928216934204, "learning_rate": 1.771129893238434e-06, "loss": 0.1977, "mean_token_accuracy": 0.9475978217341683, "num_tokens": 401779381.0, "step": 2030 }, { "entropy": 0.7098718599839644, "epoch": 0.40737033666969974, "grad_norm": 0.7846065163612366, "learning_rate": 1.770017793594306e-06, "loss": 0.2102, "mean_token_accuracy": 0.9430582647973841, "num_tokens": 402847416.0, "step": 2035 }, { "entropy": 0.7275996368039738, "epoch": 0.4083712465878071, "grad_norm": 0.7160109877586365, "learning_rate": 1.7689056939501778e-06, "loss": 0.2096, "mean_token_accuracy": 0.9433997192166068, "num_tokens": 403874433.0, "step": 2040 }, { "epoch": 0.4083712465878071, "eval_entropy": 0.6705086065120385, "eval_loss": 0.18461880087852478, "eval_mean_token_accuracy": 0.9473435878753662, "eval_num_tokens": 403874433.0, "eval_runtime": 7.0408, "eval_samples_per_second": 138.194, "eval_steps_per_second": 8.664, "step": 2040 }, { "entropy": 0.7333347418091514, "epoch": 0.40937215650591446, "grad_norm": 0.6281505823135376, "learning_rate": 1.7677935943060498e-06, "loss": 0.2069, "mean_token_accuracy": 0.9447664049538699, "num_tokens": 404833962.0, "step": 2045 }, { "entropy": 0.7319487474181435, "epoch": 0.4103730664240218, "grad_norm": 1.4577454328536987, "learning_rate": 1.7666814946619217e-06, "loss": 0.201, "mean_token_accuracy": 0.9480147020383315, "num_tokens": 405569513.0, "step": 2050 }, { "entropy": 0.6517360118302432, "epoch": 0.4113739763421292, "grad_norm": 0.8996165990829468, "learning_rate": 1.7655693950177935e-06, "loss": 0.2004, "mean_token_accuracy": 0.9475146136500618, "num_tokens": 406712661.0, "step": 2055 }, { "entropy": 0.7000083235177127, "epoch": 0.4123748862602366, "grad_norm": 0.8524026274681091, "learning_rate": 1.7644572953736654e-06, "loss": 0.2011, "mean_token_accuracy": 0.9456206717274406, "num_tokens": 407800620.0, "step": 2060 }, { "entropy": 0.7201575975526463, "epoch": 0.41337579617834397, "grad_norm": 0.7635099291801453, "learning_rate": 1.7633451957295373e-06, "loss": 0.2058, "mean_token_accuracy": 0.9447005716237155, "num_tokens": 408832671.0, "step": 2065 }, { "entropy": 0.7325827403502031, "epoch": 0.41437670609645133, "grad_norm": 0.6240025758743286, "learning_rate": 1.7622330960854093e-06, "loss": 0.2053, "mean_token_accuracy": 0.9453733026981354, "num_tokens": 409777940.0, "step": 2070 }, { "epoch": 0.41437670609645133, "eval_entropy": 0.6683724786414474, "eval_loss": 0.18311864137649536, "eval_mean_token_accuracy": 0.9476352060427431, "eval_num_tokens": 409777940.0, "eval_runtime": 7.0794, "eval_samples_per_second": 137.44, "eval_steps_per_second": 8.617, "step": 2070 }, { "entropy": 0.7300297211516987, "epoch": 0.4153776160145587, "grad_norm": 1.6698415279388428, "learning_rate": 1.761120996441281e-06, "loss": 0.2006, "mean_token_accuracy": 0.9477990069172599, "num_tokens": 410500764.0, "step": 2075 }, { "entropy": 0.6337598004124382, "epoch": 0.41637852593266605, "grad_norm": 0.881401002407074, "learning_rate": 1.7600088967971528e-06, "loss": 0.1969, "mean_token_accuracy": 0.9473873138427734, "num_tokens": 411638552.0, "step": 2080 }, { "entropy": 0.6980531494725835, "epoch": 0.4173794358507734, "grad_norm": 0.7880724668502808, "learning_rate": 1.758896797153025e-06, "loss": 0.2046, "mean_token_accuracy": 0.9452372453429482, "num_tokens": 412726553.0, "step": 2085 }, { "entropy": 0.721489062092521, "epoch": 0.4183803457688808, "grad_norm": 0.7589449882507324, "learning_rate": 1.7577846975088968e-06, "loss": 0.2078, "mean_token_accuracy": 0.9432421592148867, "num_tokens": 413743869.0, "step": 2090 }, { "entropy": 0.7336713259870355, "epoch": 0.4193812556869882, "grad_norm": 1.2119146585464478, "learning_rate": 1.7566725978647686e-06, "loss": 0.2028, "mean_token_accuracy": 0.9461878337643364, "num_tokens": 414696099.0, "step": 2095 }, { "entropy": 0.7307607759128917, "epoch": 0.42038216560509556, "grad_norm": 1.6642305850982666, "learning_rate": 1.7555604982206405e-06, "loss": 0.2065, "mean_token_accuracy": 0.9463564807718451, "num_tokens": 415430423.0, "step": 2100 }, { "epoch": 0.42038216560509556, "eval_entropy": 0.6673324142323166, "eval_loss": 0.1833495795726776, "eval_mean_token_accuracy": 0.9478640595420462, "eval_num_tokens": 415430423.0, "eval_runtime": 7.0408, "eval_samples_per_second": 138.194, "eval_steps_per_second": 8.664, "step": 2100 }, { "entropy": 0.6451762475750663, "epoch": 0.4213830755232029, "grad_norm": 0.9465289115905762, "learning_rate": 1.7544483985765123e-06, "loss": 0.1989, "mean_token_accuracy": 0.9476843329993161, "num_tokens": 416551707.0, "step": 2105 }, { "entropy": 0.6898594834587791, "epoch": 0.4223839854413103, "grad_norm": 0.7581867575645447, "learning_rate": 1.7533362989323844e-06, "loss": 0.2012, "mean_token_accuracy": 0.9459905033761805, "num_tokens": 417645350.0, "step": 2110 }, { "entropy": 0.71252573186701, "epoch": 0.42338489535941765, "grad_norm": 0.7181347608566284, "learning_rate": 1.752224199288256e-06, "loss": 0.2085, "mean_token_accuracy": 0.9445584345947612, "num_tokens": 418716115.0, "step": 2115 }, { "entropy": 0.7133055407892573, "epoch": 0.424385805277525, "grad_norm": 0.6408241987228394, "learning_rate": 1.751112099644128e-06, "loss": 0.2027, "mean_token_accuracy": 0.9469801886515183, "num_tokens": 419663774.0, "step": 2120 }, { "entropy": 0.711867922002619, "epoch": 0.42538671519563237, "grad_norm": 1.6646242141723633, "learning_rate": 1.75e-06, "loss": 0.1944, "mean_token_accuracy": 0.9488317923112349, "num_tokens": 420383624.0, "step": 2125 }, { "entropy": 0.6277808471159502, "epoch": 0.4263876251137398, "grad_norm": 0.8758793473243713, "learning_rate": 1.7488879003558718e-06, "loss": 0.1946, "mean_token_accuracy": 0.949054852398959, "num_tokens": 421525651.0, "step": 2130 }, { "epoch": 0.4263876251137398, "eval_entropy": 0.6551624673311828, "eval_loss": 0.18077890574932098, "eval_mean_token_accuracy": 0.9479863975868851, "eval_num_tokens": 421525651.0, "eval_runtime": 7.2475, "eval_samples_per_second": 134.253, "eval_steps_per_second": 8.417, "step": 2130 }, { "entropy": 0.6754489562728189, "epoch": 0.42738853503184715, "grad_norm": 0.8031311631202698, "learning_rate": 1.747775800711744e-06, "loss": 0.2031, "mean_token_accuracy": 0.9457703324881467, "num_tokens": 422612779.0, "step": 2135 }, { "entropy": 0.7074506429108707, "epoch": 0.4283894449499545, "grad_norm": 0.7394730448722839, "learning_rate": 1.7466637010676155e-06, "loss": 0.2102, "mean_token_accuracy": 0.9433455748991533, "num_tokens": 423647547.0, "step": 2140 }, { "entropy": 0.719589533589103, "epoch": 0.4293903548680619, "grad_norm": 0.7077816128730774, "learning_rate": 1.7455516014234874e-06, "loss": 0.2003, "mean_token_accuracy": 0.9472663749348034, "num_tokens": 424592674.0, "step": 2145 }, { "entropy": 0.7104438646273179, "epoch": 0.43039126478616924, "grad_norm": 1.5614880323410034, "learning_rate": 1.7444395017793595e-06, "loss": 0.2026, "mean_token_accuracy": 0.9470372357151725, "num_tokens": 425315977.0, "step": 2150 }, { "entropy": 0.6318686853755604, "epoch": 0.4313921747042766, "grad_norm": 0.9640966057777405, "learning_rate": 1.7433274021352313e-06, "loss": 0.1927, "mean_token_accuracy": 0.9490994160825555, "num_tokens": 426462515.0, "step": 2155 }, { "entropy": 0.6816817018118771, "epoch": 0.43239308462238396, "grad_norm": 0.7858216762542725, "learning_rate": 1.742215302491103e-06, "loss": 0.2074, "mean_token_accuracy": 0.9445357848297465, "num_tokens": 427573865.0, "step": 2160 }, { "epoch": 0.43239308462238396, "eval_entropy": 0.6511259249976424, "eval_loss": 0.1821490079164505, "eval_mean_token_accuracy": 0.9482881309556179, "eval_num_tokens": 427573865.0, "eval_runtime": 7.0417, "eval_samples_per_second": 138.178, "eval_steps_per_second": 8.663, "step": 2160 }, { "entropy": 0.6951464016329159, "epoch": 0.4333939945404914, "grad_norm": 0.7225281000137329, "learning_rate": 1.741103202846975e-06, "loss": 0.2008, "mean_token_accuracy": 0.9459953931244937, "num_tokens": 428613823.0, "step": 2165 }, { "entropy": 0.7063021380792964, "epoch": 0.43439490445859874, "grad_norm": 0.6970705986022949, "learning_rate": 1.739991103202847e-06, "loss": 0.201, "mean_token_accuracy": 0.9472221742976795, "num_tokens": 429555485.0, "step": 2170 }, { "entropy": 0.7152685084126212, "epoch": 0.4353958143767061, "grad_norm": 1.6057738065719604, "learning_rate": 1.7388790035587188e-06, "loss": 0.1993, "mean_token_accuracy": 0.9480575875802474, "num_tokens": 430289824.0, "step": 2175 }, { "entropy": 0.6313129712234844, "epoch": 0.43639672429481347, "grad_norm": 0.8678532838821411, "learning_rate": 1.7377669039145906e-06, "loss": 0.1918, "mean_token_accuracy": 0.9491723103956743, "num_tokens": 431416002.0, "step": 2180 }, { "entropy": 0.6787468869577754, "epoch": 0.4373976342129208, "grad_norm": 0.8015258312225342, "learning_rate": 1.7366548042704625e-06, "loss": 0.2065, "mean_token_accuracy": 0.9452338890595869, "num_tokens": 432495759.0, "step": 2185 }, { "entropy": 0.6967111151326787, "epoch": 0.4383985441310282, "grad_norm": 0.7446882128715515, "learning_rate": 1.7355427046263345e-06, "loss": 0.2016, "mean_token_accuracy": 0.9461214417761022, "num_tokens": 433540979.0, "step": 2190 }, { "epoch": 0.4383985441310282, "eval_entropy": 0.6525588817283755, "eval_loss": 0.1822170466184616, "eval_mean_token_accuracy": 0.9486938636811053, "eval_num_tokens": 433540979.0, "eval_runtime": 7.0989, "eval_samples_per_second": 137.064, "eval_steps_per_second": 8.593, "step": 2190 }, { "entropy": 0.7109386972405694, "epoch": 0.43939945404913555, "grad_norm": 0.6667978763580322, "learning_rate": 1.7344306049822064e-06, "loss": 0.197, "mean_token_accuracy": 0.947896112095226, "num_tokens": 434493365.0, "step": 2195 }, { "entropy": 0.7043106214566665, "epoch": 0.44040036396724297, "grad_norm": 1.5319359302520752, "learning_rate": 1.733318505338078e-06, "loss": 0.1958, "mean_token_accuracy": 0.9489297693425959, "num_tokens": 435226568.0, "step": 2200 }, { "entropy": 0.6357729142362422, "epoch": 0.44140127388535033, "grad_norm": 0.9023846983909607, "learning_rate": 1.7322064056939501e-06, "loss": 0.1931, "mean_token_accuracy": 0.9492196435278112, "num_tokens": 436329189.0, "step": 2205 }, { "entropy": 0.6714753104881807, "epoch": 0.4424021838034577, "grad_norm": 0.8319141864776611, "learning_rate": 1.731094306049822e-06, "loss": 0.1977, "mean_token_accuracy": 0.9467895637858997, "num_tokens": 437424508.0, "step": 2210 }, { "entropy": 0.6920918749137358, "epoch": 0.44340309372156506, "grad_norm": 0.7682778239250183, "learning_rate": 1.7299822064056938e-06, "loss": 0.1971, "mean_token_accuracy": 0.946890014410019, "num_tokens": 438454313.0, "step": 2215 }, { "entropy": 0.7164150400595232, "epoch": 0.4444040036396724, "grad_norm": 0.7394376397132874, "learning_rate": 1.728870106761566e-06, "loss": 0.203, "mean_token_accuracy": 0.9466155610301278, "num_tokens": 439405721.0, "step": 2220 }, { "epoch": 0.4444040036396724, "eval_entropy": 0.66180199140408, "eval_loss": 0.18257291615009308, "eval_mean_token_accuracy": 0.9479424806891895, "eval_num_tokens": 439405721.0, "eval_runtime": 7.0937, "eval_samples_per_second": 137.163, "eval_steps_per_second": 8.599, "step": 2220 }, { "entropy": 0.7252338122237812, "epoch": 0.4454049135577798, "grad_norm": 1.6949454545974731, "learning_rate": 1.7277580071174375e-06, "loss": 0.1974, "mean_token_accuracy": 0.9479389017278498, "num_tokens": 440135915.0, "step": 2225 }, { "entropy": 0.639058459888805, "epoch": 0.44640582347588714, "grad_norm": 0.9639801383018494, "learning_rate": 1.7266459074733096e-06, "loss": 0.1946, "mean_token_accuracy": 0.9484925963661888, "num_tokens": 441271311.0, "step": 2230 }, { "entropy": 0.6790904402732849, "epoch": 0.44740673339399456, "grad_norm": 0.9020050168037415, "learning_rate": 1.7255338078291815e-06, "loss": 0.1974, "mean_token_accuracy": 0.9472232580184936, "num_tokens": 442357624.0, "step": 2235 }, { "entropy": 0.7040961652994155, "epoch": 0.4484076433121019, "grad_norm": 0.7891590595245361, "learning_rate": 1.7244217081850533e-06, "loss": 0.2016, "mean_token_accuracy": 0.9471836420622739, "num_tokens": 443388566.0, "step": 2240 }, { "entropy": 0.726311549002474, "epoch": 0.4494085532302093, "grad_norm": 0.7292259931564331, "learning_rate": 1.7233096085409252e-06, "loss": 0.1985, "mean_token_accuracy": 0.9474069486964832, "num_tokens": 444351363.0, "step": 2245 }, { "entropy": 0.7288991510868073, "epoch": 0.45040946314831665, "grad_norm": 1.6061440706253052, "learning_rate": 1.722197508896797e-06, "loss": 0.1958, "mean_token_accuracy": 0.9497716730291194, "num_tokens": 445084627.0, "step": 2250 }, { "epoch": 0.45040946314831665, "eval_entropy": 0.6615917594706426, "eval_loss": 0.18212804198265076, "eval_mean_token_accuracy": 0.9479947060835167, "eval_num_tokens": 445084627.0, "eval_runtime": 7.1076, "eval_samples_per_second": 136.895, "eval_steps_per_second": 8.582, "step": 2250 }, { "entropy": 0.6405609087510542, "epoch": 0.451410373066424, "grad_norm": 0.9403535723686218, "learning_rate": 1.721085409252669e-06, "loss": 0.1883, "mean_token_accuracy": 0.9504117098721591, "num_tokens": 446228365.0, "step": 2255 }, { "entropy": 0.6863735209811818, "epoch": 0.45241128298453137, "grad_norm": 0.9006670117378235, "learning_rate": 1.719973309608541e-06, "loss": 0.201, "mean_token_accuracy": 0.9467043746601451, "num_tokens": 447304171.0, "step": 2260 }, { "entropy": 0.7067891944538464, "epoch": 0.4534121929026388, "grad_norm": 0.7302769422531128, "learning_rate": 1.7188612099644126e-06, "loss": 0.2, "mean_token_accuracy": 0.9461231302131307, "num_tokens": 448339733.0, "step": 2265 }, { "entropy": 0.7219751954078675, "epoch": 0.45441310282074615, "grad_norm": 0.6865111589431763, "learning_rate": 1.7177491103202845e-06, "loss": 0.1941, "mean_token_accuracy": 0.948776437477632, "num_tokens": 449292470.0, "step": 2270 }, { "entropy": 0.7329446852207184, "epoch": 0.4554140127388535, "grad_norm": 1.658300757408142, "learning_rate": 1.7166370106761565e-06, "loss": 0.1954, "mean_token_accuracy": 0.9491787666624243, "num_tokens": 450008277.0, "step": 2275 }, { "entropy": 0.6389858687465841, "epoch": 0.4564149226569609, "grad_norm": 0.9273704290390015, "learning_rate": 1.7155249110320284e-06, "loss": 0.1898, "mean_token_accuracy": 0.9499964730306105, "num_tokens": 451148916.0, "step": 2280 }, { "epoch": 0.4564149226569609, "eval_entropy": 0.6680871458327184, "eval_loss": 0.18033559620380402, "eval_mean_token_accuracy": 0.9485022337710272, "eval_num_tokens": 451148916.0, "eval_runtime": 7.0712, "eval_samples_per_second": 137.6, "eval_steps_per_second": 8.627, "step": 2280 }, { "entropy": 0.698534585129131, "epoch": 0.45741583257506824, "grad_norm": 0.8281445503234863, "learning_rate": 1.7144128113879003e-06, "loss": 0.1988, "mean_token_accuracy": 0.9470174924893813, "num_tokens": 452233124.0, "step": 2285 }, { "entropy": 0.7227046912366694, "epoch": 0.4584167424931756, "grad_norm": 0.7393911480903625, "learning_rate": 1.7133007117437721e-06, "loss": 0.1935, "mean_token_accuracy": 0.9472934657877142, "num_tokens": 453277649.0, "step": 2290 }, { "entropy": 0.7179521495645697, "epoch": 0.45941765241128296, "grad_norm": 0.6617256999015808, "learning_rate": 1.712188612099644e-06, "loss": 0.1963, "mean_token_accuracy": 0.9475378876382654, "num_tokens": 454237241.0, "step": 2295 }, { "entropy": 0.7308073119683699, "epoch": 0.4604185623293904, "grad_norm": 1.585522174835205, "learning_rate": 1.711076512455516e-06, "loss": 0.194, "mean_token_accuracy": 0.9490763951431621, "num_tokens": 454963043.0, "step": 2300 }, { "entropy": 0.6407526796514338, "epoch": 0.46141947224749774, "grad_norm": 0.9008808732032776, "learning_rate": 1.709964412811388e-06, "loss": 0.1895, "mean_token_accuracy": 0.9498597432266582, "num_tokens": 456106735.0, "step": 2305 }, { "entropy": 0.6931318703022871, "epoch": 0.4624203821656051, "grad_norm": 0.8182635307312012, "learning_rate": 1.7088523131672596e-06, "loss": 0.198, "mean_token_accuracy": 0.9471178504553708, "num_tokens": 457176055.0, "step": 2310 }, { "epoch": 0.4624203821656051, "eval_entropy": 0.6684376890542078, "eval_loss": 0.18013876676559448, "eval_mean_token_accuracy": 0.9488128394377037, "eval_num_tokens": 457176055.0, "eval_runtime": 7.0798, "eval_samples_per_second": 137.434, "eval_steps_per_second": 8.616, "step": 2310 }, { "entropy": 0.7130835376002572, "epoch": 0.46342129208371247, "grad_norm": 0.699058473110199, "learning_rate": 1.7077402135231316e-06, "loss": 0.1902, "mean_token_accuracy": 0.9488946026021784, "num_tokens": 458210295.0, "step": 2315 }, { "entropy": 0.7252059096639807, "epoch": 0.46442220200181983, "grad_norm": 0.6226775646209717, "learning_rate": 1.7066281138790035e-06, "loss": 0.1908, "mean_token_accuracy": 0.9491387551481073, "num_tokens": 459167123.0, "step": 2320 }, { "entropy": 0.7374506094238975, "epoch": 0.4654231119199272, "grad_norm": 1.667640209197998, "learning_rate": 1.7055160142348755e-06, "loss": 0.1973, "mean_token_accuracy": 0.9483266061002558, "num_tokens": 459902189.0, "step": 2325 }, { "entropy": 0.647046386924657, "epoch": 0.46642402183803455, "grad_norm": 0.9029215574264526, "learning_rate": 1.7044039145907472e-06, "loss": 0.1854, "mean_token_accuracy": 0.9509288961237127, "num_tokens": 461035882.0, "step": 2330 }, { "entropy": 0.6890198341824791, "epoch": 0.46742493175614197, "grad_norm": 0.8259357213973999, "learning_rate": 1.703291814946619e-06, "loss": 0.1888, "mean_token_accuracy": 0.9496154855598103, "num_tokens": 462118087.0, "step": 2335 }, { "entropy": 0.7170812931927768, "epoch": 0.46842584167424933, "grad_norm": 0.7089883685112, "learning_rate": 1.7021797153024911e-06, "loss": 0.1937, "mean_token_accuracy": 0.9480150136080655, "num_tokens": 463164516.0, "step": 2340 }, { "epoch": 0.46842584167424933, "eval_entropy": 0.662775921039894, "eval_loss": 0.18436135351657867, "eval_mean_token_accuracy": 0.9480926853711488, "eval_num_tokens": 463164516.0, "eval_runtime": 7.3033, "eval_samples_per_second": 133.228, "eval_steps_per_second": 8.352, "step": 2340 }, { "entropy": 0.7226597574624148, "epoch": 0.4694267515923567, "grad_norm": 0.6807421445846558, "learning_rate": 1.701067615658363e-06, "loss": 0.1919, "mean_token_accuracy": 0.9482672154903412, "num_tokens": 464120922.0, "step": 2345 }, { "entropy": 0.7318018051710996, "epoch": 0.47042766151046406, "grad_norm": 1.6226071119308472, "learning_rate": 1.6999555160142346e-06, "loss": 0.1978, "mean_token_accuracy": 0.9488935736092654, "num_tokens": 464841415.0, "step": 2350 }, { "entropy": 0.6446803179654208, "epoch": 0.4714285714285714, "grad_norm": 0.8962976932525635, "learning_rate": 1.6988434163701067e-06, "loss": 0.1801, "mean_token_accuracy": 0.952340427311984, "num_tokens": 465966413.0, "step": 2355 }, { "entropy": 0.6918975301764229, "epoch": 0.4724294813466788, "grad_norm": 0.7808786034584045, "learning_rate": 1.6977313167259786e-06, "loss": 0.1912, "mean_token_accuracy": 0.9489104704423384, "num_tokens": 467047557.0, "step": 2360 }, { "entropy": 0.713727774403312, "epoch": 0.47343039126478614, "grad_norm": 0.709165632724762, "learning_rate": 1.6966192170818506e-06, "loss": 0.1897, "mean_token_accuracy": 0.948500750281594, "num_tokens": 468103404.0, "step": 2365 }, { "entropy": 0.7288664657961238, "epoch": 0.47443130118289356, "grad_norm": 0.6595885753631592, "learning_rate": 1.6955071174377223e-06, "loss": 0.1932, "mean_token_accuracy": 0.9496505059979179, "num_tokens": 469059545.0, "step": 2370 }, { "epoch": 0.47443130118289356, "eval_entropy": 0.6668027595418399, "eval_loss": 0.1822473108768463, "eval_mean_token_accuracy": 0.9485983565205434, "eval_num_tokens": 469059545.0, "eval_runtime": 7.0789, "eval_samples_per_second": 137.45, "eval_steps_per_second": 8.617, "step": 2370 }, { "entropy": 0.7264184512875297, "epoch": 0.4754322111010009, "grad_norm": 1.5922738313674927, "learning_rate": 1.6943950177935941e-06, "loss": 0.1903, "mean_token_accuracy": 0.9503626883029938, "num_tokens": 469786008.0, "step": 2375 }, { "entropy": 0.6371802779761228, "epoch": 0.4764331210191083, "grad_norm": 0.9065341949462891, "learning_rate": 1.6932829181494662e-06, "loss": 0.1832, "mean_token_accuracy": 0.9514370045878671, "num_tokens": 470929765.0, "step": 2380 }, { "entropy": 0.6833381346680901, "epoch": 0.47743403093721565, "grad_norm": 0.7842475175857544, "learning_rate": 1.692170818505338e-06, "loss": 0.1919, "mean_token_accuracy": 0.9486998016184026, "num_tokens": 472010349.0, "step": 2385 }, { "entropy": 0.7116641182791102, "epoch": 0.478434940855323, "grad_norm": 0.7250556349754333, "learning_rate": 1.69105871886121e-06, "loss": 0.1994, "mean_token_accuracy": 0.9462671198628165, "num_tokens": 473044662.0, "step": 2390 }, { "entropy": 0.7203712877902118, "epoch": 0.47943585077343037, "grad_norm": 0.6930083632469177, "learning_rate": 1.6899466192170818e-06, "loss": 0.1951, "mean_token_accuracy": 0.9483953313394027, "num_tokens": 473994468.0, "step": 2395 }, { "entropy": 0.7180362874811346, "epoch": 0.48043676069153773, "grad_norm": 1.6153886318206787, "learning_rate": 1.6888345195729536e-06, "loss": 0.1877, "mean_token_accuracy": 0.9512313040820035, "num_tokens": 474722958.0, "step": 2400 }, { "epoch": 0.48043676069153773, "eval_entropy": 0.6513270115266081, "eval_loss": 0.18145401775836945, "eval_mean_token_accuracy": 0.9486565795101103, "eval_num_tokens": 474722958.0, "eval_runtime": 7.0749, "eval_samples_per_second": 137.528, "eval_steps_per_second": 8.622, "step": 2400 }, { "entropy": 0.631208659843965, "epoch": 0.48143767060964515, "grad_norm": 0.8330217003822327, "learning_rate": 1.6877224199288255e-06, "loss": 0.1832, "mean_token_accuracy": 0.9511312663555145, "num_tokens": 475864637.0, "step": 2405 }, { "entropy": 0.6784997463226319, "epoch": 0.4824385805277525, "grad_norm": 0.8162450194358826, "learning_rate": 1.6866103202846975e-06, "loss": 0.1877, "mean_token_accuracy": 0.948906400528821, "num_tokens": 476949278.0, "step": 2410 }, { "entropy": 0.7088199880990115, "epoch": 0.4834394904458599, "grad_norm": 0.7941007614135742, "learning_rate": 1.6854982206405692e-06, "loss": 0.1943, "mean_token_accuracy": 0.9472430489280007, "num_tokens": 477976989.0, "step": 2415 }, { "entropy": 0.7254412935538725, "epoch": 0.48444040036396724, "grad_norm": 0.6573625802993774, "learning_rate": 1.6843861209964413e-06, "loss": 0.1898, "mean_token_accuracy": 0.9490737053481015, "num_tokens": 478923190.0, "step": 2420 }, { "entropy": 0.7253351894291964, "epoch": 0.4854413102820746, "grad_norm": 1.5769827365875244, "learning_rate": 1.6832740213523131e-06, "loss": 0.1865, "mean_token_accuracy": 0.9515808555212888, "num_tokens": 479660032.0, "step": 2425 }, { "entropy": 0.6324506296352906, "epoch": 0.48644222020018196, "grad_norm": 0.8730674982070923, "learning_rate": 1.682161921708185e-06, "loss": 0.1722, "mean_token_accuracy": 0.9541928350925446, "num_tokens": 480777956.0, "step": 2430 }, { "epoch": 0.48644222020018196, "eval_entropy": 0.6624198419148805, "eval_loss": 0.18191221356391907, "eval_mean_token_accuracy": 0.948937864577184, "eval_num_tokens": 480777956.0, "eval_runtime": 7.0984, "eval_samples_per_second": 137.073, "eval_steps_per_second": 8.594, "step": 2430 }, { "entropy": 0.6939558029174805, "epoch": 0.4874431301182894, "grad_norm": 0.8086444735527039, "learning_rate": 1.6810498220640568e-06, "loss": 0.1983, "mean_token_accuracy": 0.9463418738408522, "num_tokens": 481850511.0, "step": 2435 }, { "entropy": 0.7101692611520941, "epoch": 0.48844404003639674, "grad_norm": 0.7348644733428955, "learning_rate": 1.6799377224199287e-06, "loss": 0.1924, "mean_token_accuracy": 0.9481533868746324, "num_tokens": 482899017.0, "step": 2440 }, { "entropy": 0.71939834383401, "epoch": 0.4894449499545041, "grad_norm": 0.6322587132453918, "learning_rate": 1.6788256227758006e-06, "loss": 0.1858, "mean_token_accuracy": 0.9497276311570948, "num_tokens": 483867488.0, "step": 2445 }, { "entropy": 0.7129947499795394, "epoch": 0.49044585987261147, "grad_norm": 1.5901869535446167, "learning_rate": 1.6777135231316726e-06, "loss": 0.1817, "mean_token_accuracy": 0.9520208895206451, "num_tokens": 484607972.0, "step": 2450 }, { "entropy": 0.6236480347134851, "epoch": 0.49144676979071883, "grad_norm": 0.9655255079269409, "learning_rate": 1.6766014234875443e-06, "loss": 0.1794, "mean_token_accuracy": 0.9524587219411677, "num_tokens": 485738406.0, "step": 2455 }, { "entropy": 0.6700492823665792, "epoch": 0.4924476797088262, "grad_norm": 0.7714277505874634, "learning_rate": 1.6754893238434163e-06, "loss": 0.1904, "mean_token_accuracy": 0.9482547689567913, "num_tokens": 486828631.0, "step": 2460 }, { "epoch": 0.4924476797088262, "eval_entropy": 0.6517836798409946, "eval_loss": 0.18481405079364777, "eval_mean_token_accuracy": 0.9480296187713498, "eval_num_tokens": 486828631.0, "eval_runtime": 7.0759, "eval_samples_per_second": 137.509, "eval_steps_per_second": 8.621, "step": 2460 }, { "entropy": 0.7008583583615043, "epoch": 0.49344858962693355, "grad_norm": 0.7727804780006409, "learning_rate": 1.6743772241992882e-06, "loss": 0.1935, "mean_token_accuracy": 0.9479362059723248, "num_tokens": 487864440.0, "step": 2465 }, { "entropy": 0.703695898977193, "epoch": 0.49444949954504097, "grad_norm": 0.6300666928291321, "learning_rate": 1.67326512455516e-06, "loss": 0.1843, "mean_token_accuracy": 0.950631813027642, "num_tokens": 488824928.0, "step": 2470 }, { "entropy": 0.7131873472170396, "epoch": 0.49545040946314833, "grad_norm": 1.5668567419052124, "learning_rate": 1.6721530249110321e-06, "loss": 0.1891, "mean_token_accuracy": 0.950605512749065, "num_tokens": 489558417.0, "step": 2475 }, { "entropy": 0.6274211555719376, "epoch": 0.4964513193812557, "grad_norm": 0.8960671424865723, "learning_rate": 1.6710409252669038e-06, "loss": 0.1808, "mean_token_accuracy": 0.9516859596425836, "num_tokens": 490705705.0, "step": 2480 }, { "entropy": 0.6673310320485722, "epoch": 0.49745222929936306, "grad_norm": 0.7930068373680115, "learning_rate": 1.6699288256227756e-06, "loss": 0.1854, "mean_token_accuracy": 0.9504123080860485, "num_tokens": 491806156.0, "step": 2485 }, { "entropy": 0.6888049499555068, "epoch": 0.4984531392174704, "grad_norm": 0.7340999245643616, "learning_rate": 1.6688167259786477e-06, "loss": 0.1825, "mean_token_accuracy": 0.9502738291567022, "num_tokens": 492861174.0, "step": 2490 }, { "epoch": 0.4984531392174704, "eval_entropy": 0.6484415804753538, "eval_loss": 0.18193961679935455, "eval_mean_token_accuracy": 0.9487551157591773, "eval_num_tokens": 492861174.0, "eval_runtime": 7.0365, "eval_samples_per_second": 138.278, "eval_steps_per_second": 8.669, "step": 2490 }, { "entropy": 0.7058779543096368, "epoch": 0.4994540491355778, "grad_norm": 0.6413493156433105, "learning_rate": 1.6677046263345196e-06, "loss": 0.1878, "mean_token_accuracy": 0.9502467995340174, "num_tokens": 493819119.0, "step": 2495 }, { "entropy": 0.7087403059005737, "epoch": 0.5004549590536852, "grad_norm": 1.787781834602356, "learning_rate": 1.6665925266903912e-06, "loss": 0.1787, "mean_token_accuracy": 0.9526755853132768, "num_tokens": 494543819.0, "step": 2500 }, { "entropy": 0.6390963711521842, "epoch": 0.5014558689717925, "grad_norm": 0.9745569825172424, "learning_rate": 1.6654804270462633e-06, "loss": 0.1852, "mean_token_accuracy": 0.9514152586460114, "num_tokens": 495662161.0, "step": 2505 }, { "entropy": 0.6857650220394135, "epoch": 0.5024567788898999, "grad_norm": 0.7823364734649658, "learning_rate": 1.6643683274021351e-06, "loss": 0.1904, "mean_token_accuracy": 0.9487736772407185, "num_tokens": 496756895.0, "step": 2510 }, { "entropy": 0.7007447817108848, "epoch": 0.5034576888080072, "grad_norm": 0.7864211201667786, "learning_rate": 1.6632562277580072e-06, "loss": 0.1886, "mean_token_accuracy": 0.949703172120181, "num_tokens": 497791268.0, "step": 2515 }, { "entropy": 0.7246824242851951, "epoch": 0.5044585987261146, "grad_norm": 0.7322613596916199, "learning_rate": 1.6621441281138788e-06, "loss": 0.1911, "mean_token_accuracy": 0.9495543116872961, "num_tokens": 498738442.0, "step": 2520 }, { "epoch": 0.5044585987261146, "eval_entropy": 0.6589406611489468, "eval_loss": 0.1833851933479309, "eval_mean_token_accuracy": 0.9485713624563373, "eval_num_tokens": 498738442.0, "eval_runtime": 7.0805, "eval_samples_per_second": 137.419, "eval_steps_per_second": 8.615, "step": 2520 }, { "entropy": 0.7190561719916083, "epoch": 0.5054595086442221, "grad_norm": 1.6090604066848755, "learning_rate": 1.6610320284697507e-06, "loss": 0.1893, "mean_token_accuracy": 0.9508534241806377, "num_tokens": 499466730.0, "step": 2525 }, { "entropy": 0.6477170705795288, "epoch": 0.5064604185623294, "grad_norm": 0.9411379098892212, "learning_rate": 1.6599199288256228e-06, "loss": 0.1869, "mean_token_accuracy": 0.950556813586842, "num_tokens": 500573434.0, "step": 2530 }, { "entropy": 0.6901047151197087, "epoch": 0.5074613284804368, "grad_norm": 0.8624676465988159, "learning_rate": 1.6588078291814946e-06, "loss": 0.1863, "mean_token_accuracy": 0.9498747451738878, "num_tokens": 501667610.0, "step": 2535 }, { "entropy": 0.6946311796253378, "epoch": 0.5084622383985441, "grad_norm": 0.7686476707458496, "learning_rate": 1.6576957295373665e-06, "loss": 0.1867, "mean_token_accuracy": 0.9502540312030099, "num_tokens": 502710197.0, "step": 2540 }, { "entropy": 0.7155399157242341, "epoch": 0.5094631483166515, "grad_norm": 0.6873733997344971, "learning_rate": 1.6565836298932383e-06, "loss": 0.1864, "mean_token_accuracy": 0.9502958644520153, "num_tokens": 503664692.0, "step": 2545 }, { "entropy": 0.7136711597442627, "epoch": 0.5104640582347588, "grad_norm": 1.6547688245773315, "learning_rate": 1.6554715302491102e-06, "loss": 0.1833, "mean_token_accuracy": 0.9522597675973719, "num_tokens": 504389185.0, "step": 2550 }, { "epoch": 0.5104640582347588, "eval_entropy": 0.6552407546121566, "eval_loss": 0.18142500519752502, "eval_mean_token_accuracy": 0.9480548022223301, "eval_num_tokens": 504389185.0, "eval_runtime": 7.0471, "eval_samples_per_second": 138.071, "eval_steps_per_second": 8.656, "step": 2550 }, { "entropy": 0.6178887445818294, "epoch": 0.5114649681528662, "grad_norm": 0.913406491279602, "learning_rate": 1.6543594306049823e-06, "loss": 0.1737, "mean_token_accuracy": 0.9535938934846357, "num_tokens": 505531871.0, "step": 2555 }, { "entropy": 0.6729612290859223, "epoch": 0.5124658780709737, "grad_norm": 0.944691002368927, "learning_rate": 1.6532473309608541e-06, "loss": 0.1858, "mean_token_accuracy": 0.9497326493263245, "num_tokens": 506624859.0, "step": 2560 }, { "entropy": 0.6915393555706197, "epoch": 0.513466787989081, "grad_norm": 0.7944353818893433, "learning_rate": 1.6521352313167258e-06, "loss": 0.1803, "mean_token_accuracy": 0.951506213166497, "num_tokens": 507654709.0, "step": 2565 }, { "entropy": 0.7096973836421967, "epoch": 0.5144676979071884, "grad_norm": 0.6219519972801208, "learning_rate": 1.6510231316725978e-06, "loss": 0.1827, "mean_token_accuracy": 0.9509352364323356, "num_tokens": 508600841.0, "step": 2570 }, { "entropy": 0.7088993831114335, "epoch": 0.5154686078252957, "grad_norm": 1.649739384651184, "learning_rate": 1.6499110320284697e-06, "loss": 0.1774, "mean_token_accuracy": 0.9526963141831485, "num_tokens": 509326741.0, "step": 2575 }, { "entropy": 0.6317986461249265, "epoch": 0.5164695177434031, "grad_norm": 0.8715526461601257, "learning_rate": 1.6487989323843416e-06, "loss": 0.1724, "mean_token_accuracy": 0.9545585063370792, "num_tokens": 510445233.0, "step": 2580 }, { "epoch": 0.5164695177434031, "eval_entropy": 0.65945937154723, "eval_loss": 0.18114233016967773, "eval_mean_token_accuracy": 0.9490967617660272, "eval_num_tokens": 510445233.0, "eval_runtime": 7.1179, "eval_samples_per_second": 136.698, "eval_steps_per_second": 8.57, "step": 2580 }, { "entropy": 0.6800177308646116, "epoch": 0.5174704276615104, "grad_norm": 0.8096470832824707, "learning_rate": 1.6476868327402134e-06, "loss": 0.1868, "mean_token_accuracy": 0.9498075154694644, "num_tokens": 511535453.0, "step": 2585 }, { "entropy": 0.6936602221293883, "epoch": 0.5184713375796178, "grad_norm": 0.7464794516563416, "learning_rate": 1.6465747330960853e-06, "loss": 0.1846, "mean_token_accuracy": 0.950649511272257, "num_tokens": 512578283.0, "step": 2590 }, { "entropy": 0.7119996157559482, "epoch": 0.5194722474977252, "grad_norm": 0.6432804465293884, "learning_rate": 1.6454626334519573e-06, "loss": 0.1886, "mean_token_accuracy": 0.9499502631750973, "num_tokens": 513537315.0, "step": 2595 }, { "entropy": 0.7189283625646071, "epoch": 0.5204731574158326, "grad_norm": 1.5802396535873413, "learning_rate": 1.6443505338078292e-06, "loss": 0.1834, "mean_token_accuracy": 0.9515245573087172, "num_tokens": 514261680.0, "step": 2600 }, { "entropy": 0.6315281949260018, "epoch": 0.52147406733394, "grad_norm": 0.8882037401199341, "learning_rate": 1.6432384341637008e-06, "loss": 0.1771, "mean_token_accuracy": 0.9534684544259852, "num_tokens": 515406090.0, "step": 2605 }, { "entropy": 0.6785016363317317, "epoch": 0.5224749772520473, "grad_norm": 0.8530360460281372, "learning_rate": 1.642126334519573e-06, "loss": 0.1837, "mean_token_accuracy": 0.9502465849572962, "num_tokens": 516486142.0, "step": 2610 }, { "epoch": 0.5224749772520473, "eval_entropy": 0.6628204916344315, "eval_loss": 0.1812705546617508, "eval_mean_token_accuracy": 0.948223913302187, "eval_num_tokens": 516486142.0, "eval_runtime": 7.0951, "eval_samples_per_second": 137.138, "eval_steps_per_second": 8.598, "step": 2610 }, { "entropy": 0.7054530333388935, "epoch": 0.5234758871701547, "grad_norm": 0.7850746512413025, "learning_rate": 1.6410142348754448e-06, "loss": 0.1827, "mean_token_accuracy": 0.9509090060537512, "num_tokens": 517532763.0, "step": 2615 }, { "entropy": 0.7255071458491412, "epoch": 0.524476797088262, "grad_norm": 0.6404737234115601, "learning_rate": 1.6399021352313166e-06, "loss": 0.1885, "mean_token_accuracy": 0.9504103817723014, "num_tokens": 518475304.0, "step": 2620 }, { "entropy": 0.7156178160147233, "epoch": 0.5254777070063694, "grad_norm": 1.8426103591918945, "learning_rate": 1.6387900355871887e-06, "loss": 0.1817, "mean_token_accuracy": 0.9521985969760202, "num_tokens": 519192476.0, "step": 2625 }, { "entropy": 0.6239164311777462, "epoch": 0.5264786169244768, "grad_norm": 0.9522146582603455, "learning_rate": 1.6376779359430603e-06, "loss": 0.1754, "mean_token_accuracy": 0.9539390303871849, "num_tokens": 520333033.0, "step": 2630 }, { "entropy": 0.6752398347312754, "epoch": 0.5274795268425841, "grad_norm": 0.809895396232605, "learning_rate": 1.6365658362989322e-06, "loss": 0.1807, "mean_token_accuracy": 0.9511864114891398, "num_tokens": 521436825.0, "step": 2635 }, { "entropy": 0.690356595678763, "epoch": 0.5284804367606916, "grad_norm": 0.7257580161094666, "learning_rate": 1.6354537366548043e-06, "loss": 0.183, "mean_token_accuracy": 0.9509031973101876, "num_tokens": 522480077.0, "step": 2640 }, { "epoch": 0.5284804367606916, "eval_entropy": 0.6481362190402922, "eval_loss": 0.18130838871002197, "eval_mean_token_accuracy": 0.9484924390667775, "eval_num_tokens": 522480077.0, "eval_runtime": 7.2611, "eval_samples_per_second": 134.002, "eval_steps_per_second": 8.401, "step": 2640 }, { "entropy": 0.704322841221636, "epoch": 0.5294813466787989, "grad_norm": 0.6928062438964844, "learning_rate": 1.6343416370106761e-06, "loss": 0.1791, "mean_token_accuracy": 0.9527296142144637, "num_tokens": 523435044.0, "step": 2645 }, { "entropy": 0.7071298014033924, "epoch": 0.5304822565969063, "grad_norm": 1.618299961090088, "learning_rate": 1.633229537366548e-06, "loss": 0.1773, "mean_token_accuracy": 0.9537473727356304, "num_tokens": 524169591.0, "step": 2650 }, { "entropy": 0.6280927267941562, "epoch": 0.5314831665150136, "grad_norm": 0.8909013867378235, "learning_rate": 1.6321174377224198e-06, "loss": 0.1787, "mean_token_accuracy": 0.9528289025480097, "num_tokens": 525350154.0, "step": 2655 }, { "entropy": 0.6755202797326174, "epoch": 0.532484076433121, "grad_norm": 0.7887572646141052, "learning_rate": 1.6310053380782917e-06, "loss": 0.1805, "mean_token_accuracy": 0.9506685668771917, "num_tokens": 526444507.0, "step": 2660 }, { "entropy": 0.6938620551065965, "epoch": 0.5334849863512284, "grad_norm": 0.7199150919914246, "learning_rate": 1.6298932384341638e-06, "loss": 0.1816, "mean_token_accuracy": 0.9513141361149875, "num_tokens": 527503125.0, "step": 2665 }, { "entropy": 0.7077038569883867, "epoch": 0.5344858962693357, "grad_norm": 0.7154355645179749, "learning_rate": 1.6287811387900354e-06, "loss": 0.1768, "mean_token_accuracy": 0.9529465122656389, "num_tokens": 528449606.0, "step": 2670 }, { "epoch": 0.5344858962693357, "eval_entropy": 0.6517275425254322, "eval_loss": 0.17996351420879364, "eval_mean_token_accuracy": 0.9491330697888234, "eval_num_tokens": 528449606.0, "eval_runtime": 7.0362, "eval_samples_per_second": 138.285, "eval_steps_per_second": 8.669, "step": 2670 }, { "entropy": 0.7160382747650147, "epoch": 0.5354868061874432, "grad_norm": 1.6408438682556152, "learning_rate": 1.6276690391459073e-06, "loss": 0.1817, "mean_token_accuracy": 0.9525136871771379, "num_tokens": 529188247.0, "step": 2675 }, { "entropy": 0.6233914153142409, "epoch": 0.5364877161055505, "grad_norm": 0.9246336221694946, "learning_rate": 1.6265569395017793e-06, "loss": 0.1719, "mean_token_accuracy": 0.954184738072482, "num_tokens": 530332960.0, "step": 2680 }, { "entropy": 0.6778095136989247, "epoch": 0.5374886260236579, "grad_norm": 0.8488349914550781, "learning_rate": 1.6254448398576512e-06, "loss": 0.184, "mean_token_accuracy": 0.950068386034532, "num_tokens": 531420665.0, "step": 2685 }, { "entropy": 0.6974740269509229, "epoch": 0.5384895359417653, "grad_norm": 0.7441538572311401, "learning_rate": 1.624332740213523e-06, "loss": 0.1816, "mean_token_accuracy": 0.9505076592618769, "num_tokens": 532457212.0, "step": 2690 }, { "entropy": 0.7071244949644262, "epoch": 0.5394904458598726, "grad_norm": 0.6449328660964966, "learning_rate": 1.623220640569395e-06, "loss": 0.1789, "mean_token_accuracy": 0.9523183091120286, "num_tokens": 533421804.0, "step": 2695 }, { "entropy": 0.7078518325632269, "epoch": 0.54049135577798, "grad_norm": 1.4397194385528564, "learning_rate": 1.6221085409252668e-06, "loss": 0.1795, "mean_token_accuracy": 0.9529074343768034, "num_tokens": 534147596.0, "step": 2700 }, { "epoch": 0.54049135577798, "eval_entropy": 0.6496536575379919, "eval_loss": 0.18150362372398376, "eval_mean_token_accuracy": 0.948933268179659, "eval_num_tokens": 534147596.0, "eval_runtime": 7.1231, "eval_samples_per_second": 136.598, "eval_steps_per_second": 8.564, "step": 2700 }, { "entropy": 0.6244452785361897, "epoch": 0.5414922656960873, "grad_norm": 0.8995323181152344, "learning_rate": 1.6209964412811388e-06, "loss": 0.173, "mean_token_accuracy": 0.9548817623745312, "num_tokens": 535275617.0, "step": 2705 }, { "entropy": 0.6707280232147736, "epoch": 0.5424931756141947, "grad_norm": 0.776010274887085, "learning_rate": 1.6198843416370107e-06, "loss": 0.182, "mean_token_accuracy": 0.9512561657211998, "num_tokens": 536378664.0, "step": 2710 }, { "entropy": 0.6926113960417835, "epoch": 0.543494085532302, "grad_norm": 0.7570468783378601, "learning_rate": 1.6187722419928823e-06, "loss": 0.1795, "mean_token_accuracy": 0.9521033758466894, "num_tokens": 537435601.0, "step": 2715 }, { "entropy": 0.7047263752330434, "epoch": 0.5444949954504095, "grad_norm": 0.726445198059082, "learning_rate": 1.6176601423487544e-06, "loss": 0.1784, "mean_token_accuracy": 0.9523573181845925, "num_tokens": 538399382.0, "step": 2720 }, { "entropy": 0.7051477047530088, "epoch": 0.5454959053685169, "grad_norm": 1.5214438438415527, "learning_rate": 1.6165480427046263e-06, "loss": 0.179, "mean_token_accuracy": 0.9527212950316343, "num_tokens": 539145348.0, "step": 2725 }, { "entropy": 0.6262573410164226, "epoch": 0.5464968152866242, "grad_norm": 0.8618422150611877, "learning_rate": 1.6154359430604983e-06, "loss": 0.1697, "mean_token_accuracy": 0.9551108999685808, "num_tokens": 540256367.0, "step": 2730 }, { "epoch": 0.5464968152866242, "eval_entropy": 0.6563674035619517, "eval_loss": 0.17830216884613037, "eval_mean_token_accuracy": 0.9493675964777587, "eval_num_tokens": 540256367.0, "eval_runtime": 7.0929, "eval_samples_per_second": 137.179, "eval_steps_per_second": 8.6, "step": 2730 }, { "entropy": 0.6828876172954386, "epoch": 0.5474977252047316, "grad_norm": 0.819709062576294, "learning_rate": 1.61432384341637e-06, "loss": 0.1797, "mean_token_accuracy": 0.9517435491085052, "num_tokens": 541348519.0, "step": 2735 }, { "entropy": 0.7007702973755923, "epoch": 0.5484986351228389, "grad_norm": 0.8111042976379395, "learning_rate": 1.6132117437722418e-06, "loss": 0.1811, "mean_token_accuracy": 0.9512450467456471, "num_tokens": 542379850.0, "step": 2740 }, { "entropy": 0.720324994488196, "epoch": 0.5494995450409463, "grad_norm": 0.8011656403541565, "learning_rate": 1.612099644128114e-06, "loss": 0.181, "mean_token_accuracy": 0.9518745259805159, "num_tokens": 543352827.0, "step": 2745 }, { "entropy": 0.7194907562299209, "epoch": 0.5505004549590536, "grad_norm": 1.7628045082092285, "learning_rate": 1.6109875444839858e-06, "loss": 0.1727, "mean_token_accuracy": 0.9550723487680609, "num_tokens": 544083387.0, "step": 2750 }, { "entropy": 0.6324405716224151, "epoch": 0.5515013648771611, "grad_norm": 0.9303938746452332, "learning_rate": 1.6098754448398574e-06, "loss": 0.1765, "mean_token_accuracy": 0.9533820531585, "num_tokens": 545240701.0, "step": 2755 }, { "entropy": 0.6824088400060481, "epoch": 0.5525022747952685, "grad_norm": 0.7898913025856018, "learning_rate": 1.6087633451957295e-06, "loss": 0.1758, "mean_token_accuracy": 0.9526371836662293, "num_tokens": 546313405.0, "step": 2760 }, { "epoch": 0.5525022747952685, "eval_entropy": 0.6607245241032272, "eval_loss": 0.18013106286525726, "eval_mean_token_accuracy": 0.9495335397173147, "eval_num_tokens": 546313405.0, "eval_runtime": 6.9999, "eval_samples_per_second": 139.003, "eval_steps_per_second": 8.714, "step": 2760 }, { "entropy": 0.7003410195762461, "epoch": 0.5535031847133758, "grad_norm": 0.8289366364479065, "learning_rate": 1.6076512455516013e-06, "loss": 0.1794, "mean_token_accuracy": 0.9513268579136241, "num_tokens": 547348958.0, "step": 2765 }, { "entropy": 0.7149239838123321, "epoch": 0.5545040946314832, "grad_norm": 0.6572934985160828, "learning_rate": 1.6065391459074732e-06, "loss": 0.1772, "mean_token_accuracy": 0.9532779801975597, "num_tokens": 548314371.0, "step": 2770 }, { "entropy": 0.7151028931140899, "epoch": 0.5555050045495905, "grad_norm": 1.5846747159957886, "learning_rate": 1.605427046263345e-06, "loss": 0.174, "mean_token_accuracy": 0.9540536219423468, "num_tokens": 549045682.0, "step": 2775 }, { "entropy": 0.6212464993650263, "epoch": 0.5565059144676979, "grad_norm": 0.9082188010215759, "learning_rate": 1.604314946619217e-06, "loss": 0.169, "mean_token_accuracy": 0.9555608651854776, "num_tokens": 550218972.0, "step": 2780 }, { "entropy": 0.6785877959294753, "epoch": 0.5575068243858052, "grad_norm": 0.8211308121681213, "learning_rate": 1.603202846975089e-06, "loss": 0.178, "mean_token_accuracy": 0.9523198788816278, "num_tokens": 551311414.0, "step": 2785 }, { "entropy": 0.6972319600257006, "epoch": 0.5585077343039127, "grad_norm": 0.7803236246109009, "learning_rate": 1.6020907473309608e-06, "loss": 0.1847, "mean_token_accuracy": 0.9511200400916013, "num_tokens": 552341292.0, "step": 2790 }, { "epoch": 0.5585077343039127, "eval_entropy": 0.6579079383709392, "eval_loss": 0.17929911613464355, "eval_mean_token_accuracy": 0.9495372889471836, "eval_num_tokens": 552341292.0, "eval_runtime": 7.0916, "eval_samples_per_second": 137.205, "eval_steps_per_second": 8.602, "step": 2790 }, { "entropy": 0.7150760515169664, "epoch": 0.5595086442220201, "grad_norm": 0.7405098080635071, "learning_rate": 1.6009786476868327e-06, "loss": 0.1755, "mean_token_accuracy": 0.9534146623177961, "num_tokens": 553288742.0, "step": 2795 }, { "entropy": 0.7124831140041351, "epoch": 0.5605095541401274, "grad_norm": 1.5424504280090332, "learning_rate": 1.5998665480427046e-06, "loss": 0.1741, "mean_token_accuracy": 0.9544130991805684, "num_tokens": 554020302.0, "step": 2800 }, { "entropy": 0.6266002264889804, "epoch": 0.5615104640582348, "grad_norm": 0.8750737309455872, "learning_rate": 1.5987544483985764e-06, "loss": 0.1693, "mean_token_accuracy": 0.955253835699775, "num_tokens": 555155713.0, "step": 2805 }, { "entropy": 0.6803346837108786, "epoch": 0.5625113739763421, "grad_norm": 0.8157915472984314, "learning_rate": 1.5976423487544483e-06, "loss": 0.1824, "mean_token_accuracy": 0.950763221762397, "num_tokens": 556265345.0, "step": 2810 }, { "entropy": 0.6968496783213182, "epoch": 0.5635122838944495, "grad_norm": 0.7749494910240173, "learning_rate": 1.5965302491103203e-06, "loss": 0.1783, "mean_token_accuracy": 0.9526708814230832, "num_tokens": 557289078.0, "step": 2815 }, { "entropy": 0.7060149173844944, "epoch": 0.5645131938125568, "grad_norm": 0.6400516033172607, "learning_rate": 1.595418149466192e-06, "loss": 0.1746, "mean_token_accuracy": 0.9539100381461056, "num_tokens": 558245587.0, "step": 2820 }, { "epoch": 0.5645131938125568, "eval_entropy": 0.6540780389895204, "eval_loss": 0.18272945284843445, "eval_mean_token_accuracy": 0.9483291203858423, "eval_num_tokens": 558245587.0, "eval_runtime": 7.0762, "eval_samples_per_second": 137.503, "eval_steps_per_second": 8.62, "step": 2820 }, { "entropy": 0.7129332022233443, "epoch": 0.5655141037306642, "grad_norm": 1.6981102228164673, "learning_rate": 1.594306049822064e-06, "loss": 0.1727, "mean_token_accuracy": 0.9546095479618419, "num_tokens": 558972025.0, "step": 2825 }, { "entropy": 0.6247928223826669, "epoch": 0.5665150136487717, "grad_norm": 0.9023392796516418, "learning_rate": 1.593193950177936e-06, "loss": 0.1717, "mean_token_accuracy": 0.9542244569821792, "num_tokens": 560119210.0, "step": 2830 }, { "entropy": 0.6733576593073931, "epoch": 0.567515923566879, "grad_norm": 0.7821236252784729, "learning_rate": 1.5920818505338078e-06, "loss": 0.1767, "mean_token_accuracy": 0.951957995783199, "num_tokens": 561195725.0, "step": 2835 }, { "entropy": 0.7028111894022334, "epoch": 0.5685168334849864, "grad_norm": 0.745370626449585, "learning_rate": 1.5909697508896796e-06, "loss": 0.1737, "mean_token_accuracy": 0.9535439290783622, "num_tokens": 562247195.0, "step": 2840 }, { "entropy": 0.7104091267694127, "epoch": 0.5695177434030937, "grad_norm": 0.606145977973938, "learning_rate": 1.5898576512455515e-06, "loss": 0.1723, "mean_token_accuracy": 0.9540031877431002, "num_tokens": 563213222.0, "step": 2845 }, { "entropy": 0.7080640597776933, "epoch": 0.5705186533212011, "grad_norm": 1.5943220853805542, "learning_rate": 1.5887455516014233e-06, "loss": 0.1775, "mean_token_accuracy": 0.9533238221298564, "num_tokens": 563958366.0, "step": 2850 }, { "epoch": 0.5705186533212011, "eval_entropy": 0.6547147309193846, "eval_loss": 0.18087640404701233, "eval_mean_token_accuracy": 0.9491668822335415, "eval_num_tokens": 563958366.0, "eval_runtime": 7.112, "eval_samples_per_second": 136.811, "eval_steps_per_second": 8.577, "step": 2850 }, { "entropy": 0.6236652623523365, "epoch": 0.5715195632393084, "grad_norm": 0.9184789061546326, "learning_rate": 1.5876334519572954e-06, "loss": 0.1674, "mean_token_accuracy": 0.9561629105697979, "num_tokens": 565085789.0, "step": 2855 }, { "entropy": 0.677449183030562, "epoch": 0.5725204731574158, "grad_norm": 0.8137527704238892, "learning_rate": 1.586521352313167e-06, "loss": 0.1773, "mean_token_accuracy": 0.9524840208617124, "num_tokens": 566182679.0, "step": 2860 }, { "entropy": 0.6954392140561884, "epoch": 0.5735213830755233, "grad_norm": 0.7889710068702698, "learning_rate": 1.585409252669039e-06, "loss": 0.1776, "mean_token_accuracy": 0.9529242997819727, "num_tokens": 567225094.0, "step": 2865 }, { "entropy": 0.6996893541379409, "epoch": 0.5745222929936306, "grad_norm": 0.678756058216095, "learning_rate": 1.584297153024911e-06, "loss": 0.17, "mean_token_accuracy": 0.9549407476728613, "num_tokens": 568176148.0, "step": 2870 }, { "entropy": 0.701075277003375, "epoch": 0.575523202911738, "grad_norm": 1.6416276693344116, "learning_rate": 1.5831850533807828e-06, "loss": 0.1726, "mean_token_accuracy": 0.9550453679128127, "num_tokens": 568919638.0, "step": 2875 }, { "entropy": 0.6135414800860665, "epoch": 0.5765241128298453, "grad_norm": 0.9266515374183655, "learning_rate": 1.582072953736655e-06, "loss": 0.1698, "mean_token_accuracy": 0.9551390783353285, "num_tokens": 570088747.0, "step": 2880 }, { "epoch": 0.5765241128298453, "eval_entropy": 0.6452143426801338, "eval_loss": 0.17846497893333435, "eval_mean_token_accuracy": 0.9499536039399319, "eval_num_tokens": 570088747.0, "eval_runtime": 7.1363, "eval_samples_per_second": 136.344, "eval_steps_per_second": 8.548, "step": 2880 }, { "entropy": 0.6611321086233313, "epoch": 0.5775250227479527, "grad_norm": 0.8491476774215698, "learning_rate": 1.5809608540925266e-06, "loss": 0.1699, "mean_token_accuracy": 0.9547013607892123, "num_tokens": 571174715.0, "step": 2885 }, { "entropy": 0.6863727271556854, "epoch": 0.57852593266606, "grad_norm": 0.7314031720161438, "learning_rate": 1.5798487544483984e-06, "loss": 0.1758, "mean_token_accuracy": 0.9533765917474574, "num_tokens": 572199578.0, "step": 2890 }, { "entropy": 0.6956823023882779, "epoch": 0.5795268425841674, "grad_norm": 0.6401289105415344, "learning_rate": 1.5787366548042705e-06, "loss": 0.167, "mean_token_accuracy": 0.9559003060514276, "num_tokens": 573157944.0, "step": 2895 }, { "entropy": 0.7030810995535417, "epoch": 0.5805277525022748, "grad_norm": 1.6768332719802856, "learning_rate": 1.5776245551601423e-06, "loss": 0.1739, "mean_token_accuracy": 0.954583527283235, "num_tokens": 573890695.0, "step": 2900 }, { "entropy": 0.6071485982699828, "epoch": 0.5815286624203821, "grad_norm": 0.887630045413971, "learning_rate": 1.576512455516014e-06, "loss": 0.1616, "mean_token_accuracy": 0.9568493794311177, "num_tokens": 575040060.0, "step": 2905 }, { "entropy": 0.670040900869803, "epoch": 0.5825295723384896, "grad_norm": 0.8263089060783386, "learning_rate": 1.575400355871886e-06, "loss": 0.1751, "mean_token_accuracy": 0.9534888061610135, "num_tokens": 576133302.0, "step": 2910 }, { "epoch": 0.5825295723384896, "eval_entropy": 0.6536685472629109, "eval_loss": 0.18158204853534698, "eval_mean_token_accuracy": 0.9491394347831851, "eval_num_tokens": 576133302.0, "eval_runtime": 7.0798, "eval_samples_per_second": 137.433, "eval_steps_per_second": 8.616, "step": 2910 }, { "entropy": 0.6866442734544927, "epoch": 0.5835304822565969, "grad_norm": 0.7574155926704407, "learning_rate": 1.574288256227758e-06, "loss": 0.1705, "mean_token_accuracy": 0.9546013420278375, "num_tokens": 577182306.0, "step": 2915 }, { "entropy": 0.6976790409196507, "epoch": 0.5845313921747043, "grad_norm": 0.7004702687263489, "learning_rate": 1.57317615658363e-06, "loss": 0.1683, "mean_token_accuracy": 0.9545241507616911, "num_tokens": 578143268.0, "step": 2920 }, { "entropy": 0.6992522610859437, "epoch": 0.5855323020928116, "grad_norm": 1.5859788656234741, "learning_rate": 1.5720640569395016e-06, "loss": 0.1649, "mean_token_accuracy": 0.957004501061006, "num_tokens": 578880587.0, "step": 2925 }, { "entropy": 0.6218387118794702, "epoch": 0.586533212010919, "grad_norm": 0.9949880838394165, "learning_rate": 1.5709519572953735e-06, "loss": 0.1693, "mean_token_accuracy": 0.955338594046506, "num_tokens": 580027281.0, "step": 2930 }, { "entropy": 0.6679159836335615, "epoch": 0.5875341219290264, "grad_norm": 0.7988151907920837, "learning_rate": 1.5698398576512456e-06, "loss": 0.175, "mean_token_accuracy": 0.9529053601351651, "num_tokens": 581093873.0, "step": 2935 }, { "entropy": 0.698068075559356, "epoch": 0.5885350318471337, "grad_norm": 0.7373477816581726, "learning_rate": 1.5687277580071174e-06, "loss": 0.17, "mean_token_accuracy": 0.9542166119272059, "num_tokens": 582126221.0, "step": 2940 }, { "epoch": 0.5885350318471337, "eval_entropy": 0.6519819942654156, "eval_loss": 0.1814005970954895, "eval_mean_token_accuracy": 0.9493043364071455, "eval_num_tokens": 582126221.0, "eval_runtime": 7.1253, "eval_samples_per_second": 136.555, "eval_steps_per_second": 8.561, "step": 2940 }, { "entropy": 0.7105110601945357, "epoch": 0.5895359417652412, "grad_norm": 0.6741722822189331, "learning_rate": 1.567615658362989e-06, "loss": 0.1753, "mean_token_accuracy": 0.9536024895581332, "num_tokens": 583074496.0, "step": 2945 }, { "entropy": 0.7069875858046791, "epoch": 0.5905368516833485, "grad_norm": 1.6635076999664307, "learning_rate": 1.5665035587188611e-06, "loss": 0.1708, "mean_token_accuracy": 0.9553401833230799, "num_tokens": 583801034.0, "step": 2950 }, { "entropy": 0.6250316668640483, "epoch": 0.5915377616014559, "grad_norm": 0.9111377596855164, "learning_rate": 1.565391459074733e-06, "loss": 0.1675, "mean_token_accuracy": 0.9554700141603296, "num_tokens": 584923018.0, "step": 2955 }, { "entropy": 0.6771905362606049, "epoch": 0.5925386715195632, "grad_norm": 0.8384516835212708, "learning_rate": 1.564279359430605e-06, "loss": 0.1719, "mean_token_accuracy": 0.9523442192511125, "num_tokens": 586002885.0, "step": 2960 }, { "entropy": 0.6946805049072612, "epoch": 0.5935395814376706, "grad_norm": 0.8512565493583679, "learning_rate": 1.563167259786477e-06, "loss": 0.1748, "mean_token_accuracy": 0.9534324310042641, "num_tokens": 587033715.0, "step": 2965 }, { "entropy": 0.7199535329233516, "epoch": 0.594540491355778, "grad_norm": 0.9583800435066223, "learning_rate": 1.5620551601423486e-06, "loss": 0.1718, "mean_token_accuracy": 0.9540681860663675, "num_tokens": 587973781.0, "step": 2970 }, { "epoch": 0.594540491355778, "eval_entropy": 0.6598269665827516, "eval_loss": 0.17958512902259827, "eval_mean_token_accuracy": 0.9494309190843926, "eval_num_tokens": 587973781.0, "eval_runtime": 7.1961, "eval_samples_per_second": 135.212, "eval_steps_per_second": 8.477, "step": 2970 }, { "entropy": 0.7052282398397273, "epoch": 0.5955414012738853, "grad_norm": 1.6504220962524414, "learning_rate": 1.5609430604982206e-06, "loss": 0.1642, "mean_token_accuracy": 0.9569182553074577, "num_tokens": 588703254.0, "step": 2975 }, { "entropy": 0.6265523303638805, "epoch": 0.5965423111919927, "grad_norm": 0.9333195090293884, "learning_rate": 1.5598309608540925e-06, "loss": 0.1649, "mean_token_accuracy": 0.95631789077412, "num_tokens": 589845740.0, "step": 2980 }, { "entropy": 0.679840994423086, "epoch": 0.5975432211101, "grad_norm": 0.8418663740158081, "learning_rate": 1.5587188612099643e-06, "loss": 0.1789, "mean_token_accuracy": 0.9520544881170446, "num_tokens": 590927438.0, "step": 2985 }, { "entropy": 0.7017120361328125, "epoch": 0.5985441310282075, "grad_norm": 0.7685884833335876, "learning_rate": 1.5576067615658362e-06, "loss": 0.1723, "mean_token_accuracy": 0.9537156251343814, "num_tokens": 591971110.0, "step": 2990 }, { "entropy": 0.7107149928808212, "epoch": 0.5995450409463148, "grad_norm": 0.7431087493896484, "learning_rate": 1.556494661921708e-06, "loss": 0.1694, "mean_token_accuracy": 0.954865367304195, "num_tokens": 592933967.0, "step": 2995 }, { "entropy": 0.7013540771874515, "epoch": 0.6005459508644222, "grad_norm": 1.5792326927185059, "learning_rate": 1.55538256227758e-06, "loss": 0.1627, "mean_token_accuracy": 0.9574482348832217, "num_tokens": 593674210.0, "step": 3000 }, { "epoch": 0.6005459508644222, "eval_entropy": 0.6583689304648853, "eval_loss": 0.18258357048034668, "eval_mean_token_accuracy": 0.9492516068161511, "eval_num_tokens": 593674210.0, "eval_runtime": 7.2234, "eval_samples_per_second": 134.702, "eval_steps_per_second": 8.445, "step": 3000 }, { "entropy": 0.6216048316522078, "epoch": 0.6015468607825296, "grad_norm": 0.9363070130348206, "learning_rate": 1.554270462633452e-06, "loss": 0.167, "mean_token_accuracy": 0.9563505985520103, "num_tokens": 594804599.0, "step": 3005 }, { "entropy": 0.6832993455908515, "epoch": 0.6025477707006369, "grad_norm": 0.8691864609718323, "learning_rate": 1.5531583629893236e-06, "loss": 0.1699, "mean_token_accuracy": 0.9539537310600281, "num_tokens": 595873162.0, "step": 3010 }, { "entropy": 0.7007348017259077, "epoch": 0.6035486806187443, "grad_norm": 0.7877841591835022, "learning_rate": 1.5520462633451957e-06, "loss": 0.1714, "mean_token_accuracy": 0.9534028855237093, "num_tokens": 596910846.0, "step": 3015 }, { "entropy": 0.7141089824112978, "epoch": 0.6045495905368516, "grad_norm": 0.6574937105178833, "learning_rate": 1.5509341637010676e-06, "loss": 0.1731, "mean_token_accuracy": 0.9544164771383459, "num_tokens": 597853139.0, "step": 3020 }, { "entropy": 0.7040485745126551, "epoch": 0.6055505004549591, "grad_norm": 1.726340889930725, "learning_rate": 1.5498220640569394e-06, "loss": 0.1645, "mean_token_accuracy": 0.9566915214061738, "num_tokens": 598577635.0, "step": 3025 }, { "entropy": 0.6146221702749078, "epoch": 0.6065514103730664, "grad_norm": 0.9489019513130188, "learning_rate": 1.5487099644128113e-06, "loss": 0.1643, "mean_token_accuracy": 0.9564465994184668, "num_tokens": 599712819.0, "step": 3030 }, { "epoch": 0.6065514103730664, "eval_entropy": 0.6538055861582521, "eval_loss": 0.18010924756526947, "eval_mean_token_accuracy": 0.9496048923398628, "eval_num_tokens": 599712819.0, "eval_runtime": 6.9982, "eval_samples_per_second": 139.035, "eval_steps_per_second": 8.716, "step": 3030 }, { "entropy": 0.6649877328764309, "epoch": 0.6075523202911738, "grad_norm": 0.7716711163520813, "learning_rate": 1.5475978647686831e-06, "loss": 0.1674, "mean_token_accuracy": 0.9545083528215235, "num_tokens": 600810295.0, "step": 3035 }, { "entropy": 0.6926416860385375, "epoch": 0.6085532302092812, "grad_norm": 0.7539160251617432, "learning_rate": 1.546485765124555e-06, "loss": 0.1712, "mean_token_accuracy": 0.9541196048259735, "num_tokens": 601845533.0, "step": 3040 }, { "entropy": 0.7097321336919611, "epoch": 0.6095541401273885, "grad_norm": 0.954349160194397, "learning_rate": 1.545373665480427e-06, "loss": 0.1657, "mean_token_accuracy": 0.9558960502797907, "num_tokens": 602800251.0, "step": 3045 }, { "entropy": 0.7092912332578138, "epoch": 0.6105550500454959, "grad_norm": 1.6350897550582886, "learning_rate": 1.544261565836299e-06, "loss": 0.1696, "mean_token_accuracy": 0.9553542906587774, "num_tokens": 603536113.0, "step": 3050 }, { "entropy": 0.6223157600923018, "epoch": 0.6115559599636032, "grad_norm": 0.913377583026886, "learning_rate": 1.5431494661921708e-06, "loss": 0.1621, "mean_token_accuracy": 0.9570931732654572, "num_tokens": 604670520.0, "step": 3055 }, { "entropy": 0.6658590446818958, "epoch": 0.6125568698817107, "grad_norm": 0.8451462984085083, "learning_rate": 1.5420373665480426e-06, "loss": 0.1634, "mean_token_accuracy": 0.9555998970161784, "num_tokens": 605754295.0, "step": 3060 }, { "epoch": 0.6125568698817107, "eval_entropy": 0.6519810190943421, "eval_loss": 0.18205370008945465, "eval_mean_token_accuracy": 0.9495203221430544, "eval_num_tokens": 605754295.0, "eval_runtime": 7.058, "eval_samples_per_second": 137.859, "eval_steps_per_second": 8.643, "step": 3060 }, { "entropy": 0.691884211789478, "epoch": 0.6135577797998181, "grad_norm": 0.7430902123451233, "learning_rate": 1.5409252669039145e-06, "loss": 0.1689, "mean_token_accuracy": 0.9547361292622306, "num_tokens": 606778333.0, "step": 3065 }, { "entropy": 0.7035522225228223, "epoch": 0.6145586897179254, "grad_norm": 0.6056403517723083, "learning_rate": 1.5398131672597866e-06, "loss": 0.1653, "mean_token_accuracy": 0.9561255595900796, "num_tokens": 607736004.0, "step": 3070 }, { "entropy": 0.7008915657346899, "epoch": 0.6155595996360328, "grad_norm": 1.5326563119888306, "learning_rate": 1.5387010676156582e-06, "loss": 0.1652, "mean_token_accuracy": 0.9567563999782909, "num_tokens": 608474829.0, "step": 3075 }, { "entropy": 0.6191458604552529, "epoch": 0.6165605095541401, "grad_norm": 0.9065665602684021, "learning_rate": 1.53758896797153e-06, "loss": 0.1582, "mean_token_accuracy": 0.9579779603264549, "num_tokens": 609604074.0, "step": 3080 }, { "entropy": 0.6666526832363823, "epoch": 0.6175614194722475, "grad_norm": 0.8556954860687256, "learning_rate": 1.5364768683274021e-06, "loss": 0.167, "mean_token_accuracy": 0.9543328859589316, "num_tokens": 610710246.0, "step": 3085 }, { "entropy": 0.6837726238099011, "epoch": 0.6185623293903548, "grad_norm": 0.8487630486488342, "learning_rate": 1.535364768683274e-06, "loss": 0.1688, "mean_token_accuracy": 0.955160356651653, "num_tokens": 611758840.0, "step": 3090 }, { "epoch": 0.6185623293903548, "eval_entropy": 0.6574973884176035, "eval_loss": 0.179812490940094, "eval_mean_token_accuracy": 0.949747121724926, "eval_num_tokens": 611758840.0, "eval_runtime": 7.067, "eval_samples_per_second": 137.683, "eval_steps_per_second": 8.632, "step": 3090 }, { "entropy": 0.7014839009805159, "epoch": 0.6195632393084622, "grad_norm": 0.6837453246116638, "learning_rate": 1.5342526690391456e-06, "loss": 0.1687, "mean_token_accuracy": 0.9557089160789143, "num_tokens": 612699784.0, "step": 3095 }, { "entropy": 0.7075679730285298, "epoch": 0.6205641492265697, "grad_norm": 1.7436314821243286, "learning_rate": 1.5331405693950177e-06, "loss": 0.169, "mean_token_accuracy": 0.9560685184868899, "num_tokens": 613436633.0, "step": 3100 }, { "entropy": 0.6202307202599265, "epoch": 0.621565059144677, "grad_norm": 0.9475667476654053, "learning_rate": 1.5320284697508896e-06, "loss": 0.1597, "mean_token_accuracy": 0.9571539521217346, "num_tokens": 614596800.0, "step": 3105 }, { "entropy": 0.6672575666145845, "epoch": 0.6225659690627844, "grad_norm": 0.8185185194015503, "learning_rate": 1.5309163701067616e-06, "loss": 0.1685, "mean_token_accuracy": 0.9542947286909277, "num_tokens": 615704382.0, "step": 3110 }, { "entropy": 0.6795628358017315, "epoch": 0.6235668789808917, "grad_norm": 0.7307755351066589, "learning_rate": 1.5298042704626333e-06, "loss": 0.1627, "mean_token_accuracy": 0.9561434664509513, "num_tokens": 616757442.0, "step": 3115 }, { "entropy": 0.6966196049343456, "epoch": 0.6245677888989991, "grad_norm": 0.6424974799156189, "learning_rate": 1.5286921708185051e-06, "loss": 0.167, "mean_token_accuracy": 0.9560314021327279, "num_tokens": 617714286.0, "step": 3120 }, { "epoch": 0.6245677888989991, "eval_entropy": 0.6488942484386632, "eval_loss": 0.18082934617996216, "eval_mean_token_accuracy": 0.9497447346077591, "eval_num_tokens": 617714286.0, "eval_runtime": 7.0333, "eval_samples_per_second": 138.342, "eval_steps_per_second": 8.673, "step": 3120 }, { "entropy": 0.6996615810827775, "epoch": 0.6255686988171064, "grad_norm": 1.6187618970870972, "learning_rate": 1.5275800711743772e-06, "loss": 0.1604, "mean_token_accuracy": 0.9579841077327729, "num_tokens": 618452310.0, "step": 3125 }, { "entropy": 0.6223932883956216, "epoch": 0.6265696087352138, "grad_norm": 0.9200411438941956, "learning_rate": 1.526467971530249e-06, "loss": 0.1598, "mean_token_accuracy": 0.9576196242462505, "num_tokens": 619611422.0, "step": 3130 }, { "entropy": 0.6693487199870023, "epoch": 0.6275705186533213, "grad_norm": 0.8405710458755493, "learning_rate": 1.525355871886121e-06, "loss": 0.1661, "mean_token_accuracy": 0.9551270468668505, "num_tokens": 620685183.0, "step": 3135 }, { "entropy": 0.6933458956805143, "epoch": 0.6285714285714286, "grad_norm": 0.9356978535652161, "learning_rate": 1.5242437722419928e-06, "loss": 0.1653, "mean_token_accuracy": 0.9554397490891543, "num_tokens": 621712160.0, "step": 3140 }, { "entropy": 0.6994579350406473, "epoch": 0.629572338489536, "grad_norm": 0.8684320449829102, "learning_rate": 1.5231316725978646e-06, "loss": 0.1685, "mean_token_accuracy": 0.9550400712273338, "num_tokens": 622669547.0, "step": 3145 }, { "entropy": 0.7034677063876932, "epoch": 0.6305732484076433, "grad_norm": 1.671410083770752, "learning_rate": 1.5220195729537367e-06, "loss": 0.1637, "mean_token_accuracy": 0.9572344660758972, "num_tokens": 623402046.0, "step": 3150 }, { "epoch": 0.6305732484076433, "eval_entropy": 0.6508955388772683, "eval_loss": 0.18214590847492218, "eval_mean_token_accuracy": 0.9497071631619187, "eval_num_tokens": 623402046.0, "eval_runtime": 7.0379, "eval_samples_per_second": 138.251, "eval_steps_per_second": 8.667, "step": 3150 }, { "entropy": 0.6169803651896391, "epoch": 0.6315741583257507, "grad_norm": 0.9965471625328064, "learning_rate": 1.5209074733096086e-06, "loss": 0.159, "mean_token_accuracy": 0.9576476606455716, "num_tokens": 624557115.0, "step": 3155 }, { "entropy": 0.6633053736253218, "epoch": 0.632575068243858, "grad_norm": 0.8597959280014038, "learning_rate": 1.5197953736654802e-06, "loss": 0.1664, "mean_token_accuracy": 0.9550956438888203, "num_tokens": 625630354.0, "step": 3160 }, { "entropy": 0.6893996604464271, "epoch": 0.6335759781619654, "grad_norm": 0.7524270415306091, "learning_rate": 1.5186832740213523e-06, "loss": 0.1637, "mean_token_accuracy": 0.9566006205298684, "num_tokens": 626655720.0, "step": 3165 }, { "entropy": 0.7058270321650939, "epoch": 0.6345768880800728, "grad_norm": 0.6807648539543152, "learning_rate": 1.5175711743772241e-06, "loss": 0.1637, "mean_token_accuracy": 0.9561190323396163, "num_tokens": 627597028.0, "step": 3170 }, { "entropy": 0.703032106702978, "epoch": 0.6355777979981801, "grad_norm": 1.6004669666290283, "learning_rate": 1.516459074733096e-06, "loss": 0.158, "mean_token_accuracy": 0.9586949603124099, "num_tokens": 628310494.0, "step": 3175 }, { "entropy": 0.6183769884434613, "epoch": 0.6365787079162876, "grad_norm": 0.9058781862258911, "learning_rate": 1.5153469750889679e-06, "loss": 0.1559, "mean_token_accuracy": 0.9587956016713922, "num_tokens": 629439229.0, "step": 3180 }, { "epoch": 0.6365787079162876, "eval_entropy": 0.646690167364527, "eval_loss": 0.18040700256824493, "eval_mean_token_accuracy": 0.9494497678318962, "eval_num_tokens": 629439229.0, "eval_runtime": 7.0218, "eval_samples_per_second": 138.569, "eval_steps_per_second": 8.687, "step": 3180 }, { "entropy": 0.6690399782224135, "epoch": 0.6375796178343949, "grad_norm": 0.8933009505271912, "learning_rate": 1.5142348754448397e-06, "loss": 0.1685, "mean_token_accuracy": 0.955499666929245, "num_tokens": 630526225.0, "step": 3185 }, { "entropy": 0.6908215509219603, "epoch": 0.6385805277525023, "grad_norm": 0.7805888056755066, "learning_rate": 1.5131227758007118e-06, "loss": 0.1682, "mean_token_accuracy": 0.9543212841857563, "num_tokens": 631544968.0, "step": 3190 }, { "entropy": 0.7041902406649156, "epoch": 0.6395814376706096, "grad_norm": 0.6156824827194214, "learning_rate": 1.5120106761565836e-06, "loss": 0.1637, "mean_token_accuracy": 0.957361562685533, "num_tokens": 632498779.0, "step": 3195 }, { "entropy": 0.7018732940608805, "epoch": 0.640582347588717, "grad_norm": 1.7362315654754639, "learning_rate": 1.5108985765124555e-06, "loss": 0.1581, "mean_token_accuracy": 0.9588197816501964, "num_tokens": 633231622.0, "step": 3200 }, { "entropy": 0.6234849360856143, "epoch": 0.6415832575068244, "grad_norm": 0.9099482297897339, "learning_rate": 1.5097864768683274e-06, "loss": 0.1613, "mean_token_accuracy": 0.9571101091124795, "num_tokens": 634372922.0, "step": 3205 }, { "entropy": 0.6708677102218975, "epoch": 0.6425841674249317, "grad_norm": 0.8247345089912415, "learning_rate": 1.5086743772241992e-06, "loss": 0.1665, "mean_token_accuracy": 0.9549422193657268, "num_tokens": 635453965.0, "step": 3210 }, { "epoch": 0.6425841674249317, "eval_entropy": 0.6494777632541344, "eval_loss": 0.18078424036502838, "eval_mean_token_accuracy": 0.9497652522853164, "eval_num_tokens": 635453965.0, "eval_runtime": 7.0204, "eval_samples_per_second": 138.595, "eval_steps_per_second": 8.689, "step": 3210 }, { "entropy": 0.6948196053504944, "epoch": 0.6435850773430392, "grad_norm": 0.7620670795440674, "learning_rate": 1.507562277580071e-06, "loss": 0.1629, "mean_token_accuracy": 0.9559367472475225, "num_tokens": 636472033.0, "step": 3215 }, { "entropy": 0.7079345066439021, "epoch": 0.6445859872611465, "grad_norm": 0.6674084663391113, "learning_rate": 1.5064501779359431e-06, "loss": 0.1678, "mean_token_accuracy": 0.9555874754082073, "num_tokens": 637412606.0, "step": 3220 }, { "entropy": 0.7107769147916274, "epoch": 0.6455868971792539, "grad_norm": 1.6964831352233887, "learning_rate": 1.5053380782918148e-06, "loss": 0.1647, "mean_token_accuracy": 0.9574269002134149, "num_tokens": 638133615.0, "step": 3225 }, { "entropy": 0.6168817777525295, "epoch": 0.6465878070973612, "grad_norm": 0.9298244118690491, "learning_rate": 1.5042259786476866e-06, "loss": 0.1569, "mean_token_accuracy": 0.9585052159699526, "num_tokens": 639270319.0, "step": 3230 }, { "entropy": 0.6654906495050951, "epoch": 0.6475887170154686, "grad_norm": 0.8299368023872375, "learning_rate": 1.5031138790035587e-06, "loss": 0.1663, "mean_token_accuracy": 0.9553114105354655, "num_tokens": 640343322.0, "step": 3235 }, { "entropy": 0.6908837380734357, "epoch": 0.648589626933576, "grad_norm": 0.7933794260025024, "learning_rate": 1.5020017793594306e-06, "loss": 0.1643, "mean_token_accuracy": 0.9559738993644714, "num_tokens": 641375050.0, "step": 3240 }, { "epoch": 0.648589626933576, "eval_entropy": 0.6472069806739932, "eval_loss": 0.18293221294879913, "eval_mean_token_accuracy": 0.9496043148587962, "eval_num_tokens": 641375050.0, "eval_runtime": 7.0336, "eval_samples_per_second": 138.335, "eval_steps_per_second": 8.673, "step": 3240 }, { "entropy": 0.702154829827222, "epoch": 0.6495905368516833, "grad_norm": 0.6860081553459167, "learning_rate": 1.5008896797153024e-06, "loss": 0.1632, "mean_token_accuracy": 0.9570187379013408, "num_tokens": 642328426.0, "step": 3245 }, { "entropy": 0.6984548650004647, "epoch": 0.6505914467697907, "grad_norm": 1.5585992336273193, "learning_rate": 1.4997775800711743e-06, "loss": 0.1568, "mean_token_accuracy": 0.9584351718425751, "num_tokens": 643060158.0, "step": 3250 }, { "entropy": 0.6129773226651278, "epoch": 0.6515923566878981, "grad_norm": 0.9925711750984192, "learning_rate": 1.4986654804270461e-06, "loss": 0.1585, "mean_token_accuracy": 0.958180884881453, "num_tokens": 644209016.0, "step": 3255 }, { "entropy": 0.6673231913284822, "epoch": 0.6525932666060055, "grad_norm": 0.8757086992263794, "learning_rate": 1.4975533807829182e-06, "loss": 0.1648, "mean_token_accuracy": 0.9552432694218376, "num_tokens": 645283312.0, "step": 3260 }, { "entropy": 0.6877216878262433, "epoch": 0.6535941765241128, "grad_norm": 0.7658048272132874, "learning_rate": 1.4964412811387899e-06, "loss": 0.1604, "mean_token_accuracy": 0.9569750054316087, "num_tokens": 646304671.0, "step": 3265 }, { "entropy": 0.7039268119768662, "epoch": 0.6545950864422202, "grad_norm": 0.6302322149276733, "learning_rate": 1.4953291814946617e-06, "loss": 0.1626, "mean_token_accuracy": 0.9568539722399279, "num_tokens": 647257247.0, "step": 3270 }, { "epoch": 0.6545950864422202, "eval_entropy": 0.6477563601048266, "eval_loss": 0.1822510063648224, "eval_mean_token_accuracy": 0.9499242520723187, "eval_num_tokens": 647257247.0, "eval_runtime": 7.0229, "eval_samples_per_second": 138.547, "eval_steps_per_second": 8.686, "step": 3270 }, { "entropy": 0.6974148641933094, "epoch": 0.6555959963603276, "grad_norm": 1.5133696794509888, "learning_rate": 1.4942170818505338e-06, "loss": 0.1554, "mean_token_accuracy": 0.9593302407047966, "num_tokens": 647996410.0, "step": 3275 }, { "entropy": 0.6135740900581533, "epoch": 0.6565969062784349, "grad_norm": 0.9020703434944153, "learning_rate": 1.4931049822064056e-06, "loss": 0.1536, "mean_token_accuracy": 0.9590423145077446, "num_tokens": 649158135.0, "step": 3280 }, { "entropy": 0.6622402567755092, "epoch": 0.6575978161965423, "grad_norm": 0.8561988472938538, "learning_rate": 1.4919928825622777e-06, "loss": 0.1606, "mean_token_accuracy": 0.9568440372293646, "num_tokens": 650238501.0, "step": 3285 }, { "entropy": 0.6838861806826158, "epoch": 0.6585987261146496, "grad_norm": 0.8391448259353638, "learning_rate": 1.4908807829181494e-06, "loss": 0.1585, "mean_token_accuracy": 0.9573293057355013, "num_tokens": 651266847.0, "step": 3290 }, { "entropy": 0.7013624326749281, "epoch": 0.6595996360327571, "grad_norm": 0.8588127493858337, "learning_rate": 1.4897686832740212e-06, "loss": 0.1598, "mean_token_accuracy": 0.9582955512133512, "num_tokens": 652220456.0, "step": 3295 }, { "entropy": 0.7058557136492296, "epoch": 0.6606005459508644, "grad_norm": 1.6215109825134277, "learning_rate": 1.4886565836298933e-06, "loss": 0.1558, "mean_token_accuracy": 0.9595519033345309, "num_tokens": 652954302.0, "step": 3300 }, { "epoch": 0.6606005459508644, "eval_entropy": 0.6499209511475484, "eval_loss": 0.1842976212501526, "eval_mean_token_accuracy": 0.948709644255091, "eval_num_tokens": 652954302.0, "eval_runtime": 7.0384, "eval_samples_per_second": 138.241, "eval_steps_per_second": 8.667, "step": 3300 }, { "entropy": 0.621334047480063, "epoch": 0.6616014558689718, "grad_norm": 0.9623603820800781, "learning_rate": 1.4875444839857651e-06, "loss": 0.1577, "mean_token_accuracy": 0.9584027409553528, "num_tokens": 654085373.0, "step": 3305 }, { "entropy": 0.6754447023976933, "epoch": 0.6626023657870792, "grad_norm": 0.8001890778541565, "learning_rate": 1.4864323843416368e-06, "loss": 0.1625, "mean_token_accuracy": 0.9564595872705633, "num_tokens": 655160148.0, "step": 3310 }, { "entropy": 0.6801682867787101, "epoch": 0.6636032757051865, "grad_norm": 0.7729578018188477, "learning_rate": 1.4853202846975089e-06, "loss": 0.1614, "mean_token_accuracy": 0.956423576853492, "num_tokens": 656214275.0, "step": 3315 }, { "entropy": 0.6868817286057906, "epoch": 0.6646041856232939, "grad_norm": 0.6810210943222046, "learning_rate": 1.4842081850533807e-06, "loss": 0.1566, "mean_token_accuracy": 0.9585105110298503, "num_tokens": 657178543.0, "step": 3320 }, { "entropy": 0.7001429920846766, "epoch": 0.6656050955414012, "grad_norm": 1.6506801843643188, "learning_rate": 1.4830960854092528e-06, "loss": 0.1543, "mean_token_accuracy": 0.9600406169891358, "num_tokens": 657908037.0, "step": 3325 }, { "entropy": 0.6132907515222376, "epoch": 0.6666060054595087, "grad_norm": 0.8838356733322144, "learning_rate": 1.4819839857651244e-06, "loss": 0.1537, "mean_token_accuracy": 0.9591153843836351, "num_tokens": 659055420.0, "step": 3330 }, { "epoch": 0.6666060054595087, "eval_entropy": 0.64075055913847, "eval_loss": 0.18135496973991394, "eval_mean_token_accuracy": 0.949864628862162, "eval_num_tokens": 659055420.0, "eval_runtime": 7.0585, "eval_samples_per_second": 137.848, "eval_steps_per_second": 8.642, "step": 3330 }, { "entropy": 0.6668086566708304, "epoch": 0.667606915377616, "grad_norm": 0.8098176121711731, "learning_rate": 1.4808718861209963e-06, "loss": 0.1621, "mean_token_accuracy": 0.9567366887222637, "num_tokens": 660143889.0, "step": 3335 }, { "entropy": 0.6945254163308577, "epoch": 0.6686078252957234, "grad_norm": 0.8045607209205627, "learning_rate": 1.4797597864768684e-06, "loss": 0.1623, "mean_token_accuracy": 0.9565726925026287, "num_tokens": 661162373.0, "step": 3340 }, { "entropy": 0.7033212970603596, "epoch": 0.6696087352138308, "grad_norm": 0.6349719762802124, "learning_rate": 1.4786476868327402e-06, "loss": 0.1617, "mean_token_accuracy": 0.9573416639458049, "num_tokens": 662112855.0, "step": 3345 }, { "entropy": 0.7066628607836637, "epoch": 0.6706096451319381, "grad_norm": 1.7244939804077148, "learning_rate": 1.4775355871886119e-06, "loss": 0.1561, "mean_token_accuracy": 0.9585809138688174, "num_tokens": 662839231.0, "step": 3350 }, { "entropy": 0.6230565829710527, "epoch": 0.6716105550500455, "grad_norm": 0.9981881380081177, "learning_rate": 1.476423487544484e-06, "loss": 0.1544, "mean_token_accuracy": 0.9593272588469766, "num_tokens": 663977485.0, "step": 3355 }, { "entropy": 0.6668084019964392, "epoch": 0.6726114649681528, "grad_norm": 0.8376649618148804, "learning_rate": 1.4753113879003558e-06, "loss": 0.1577, "mean_token_accuracy": 0.9574681758880615, "num_tokens": 665089323.0, "step": 3360 }, { "epoch": 0.6726114649681528, "eval_entropy": 0.6494350780229099, "eval_loss": 0.18222181499004364, "eval_mean_token_accuracy": 0.9497965378839461, "eval_num_tokens": 665089323.0, "eval_runtime": 7.0401, "eval_samples_per_second": 138.209, "eval_steps_per_second": 8.665, "step": 3360 }, { "entropy": 0.6925623609261079, "epoch": 0.6736123748862602, "grad_norm": 0.786201000213623, "learning_rate": 1.4741992882562276e-06, "loss": 0.1547, "mean_token_accuracy": 0.9576037005944685, "num_tokens": 666117675.0, "step": 3365 }, { "entropy": 0.7104368925094604, "epoch": 0.6746132848043676, "grad_norm": 0.736659049987793, "learning_rate": 1.4730871886120997e-06, "loss": 0.1572, "mean_token_accuracy": 0.9585932124744762, "num_tokens": 667063051.0, "step": 3370 }, { "entropy": 0.7050894742662256, "epoch": 0.675614194722475, "grad_norm": 1.705169916152954, "learning_rate": 1.4719750889679714e-06, "loss": 0.1547, "mean_token_accuracy": 0.9589919149875641, "num_tokens": 667787190.0, "step": 3375 }, { "entropy": 0.6144024074077606, "epoch": 0.6766151046405824, "grad_norm": 0.940698504447937, "learning_rate": 1.4708629893238434e-06, "loss": 0.1504, "mean_token_accuracy": 0.9602103607221083, "num_tokens": 668931786.0, "step": 3380 }, { "entropy": 0.6675797638568011, "epoch": 0.6776160145586897, "grad_norm": 0.859804093837738, "learning_rate": 1.4697508896797153e-06, "loss": 0.1538, "mean_token_accuracy": 0.9588768585161729, "num_tokens": 669987112.0, "step": 3385 }, { "entropy": 0.6947231168096716, "epoch": 0.6786169244767971, "grad_norm": 0.7688744068145752, "learning_rate": 1.4686387900355871e-06, "loss": 0.1608, "mean_token_accuracy": 0.9564679145812989, "num_tokens": 671037275.0, "step": 3390 }, { "epoch": 0.6786169244767971, "eval_entropy": 0.6460012275664533, "eval_loss": 0.18215857446193695, "eval_mean_token_accuracy": 0.9496130151826827, "eval_num_tokens": 671037275.0, "eval_runtime": 7.0121, "eval_samples_per_second": 138.76, "eval_steps_per_second": 8.699, "step": 3390 }, { "entropy": 0.7101943942633542, "epoch": 0.6796178343949044, "grad_norm": 0.6788628101348877, "learning_rate": 1.467526690391459e-06, "loss": 0.1595, "mean_token_accuracy": 0.9584887022321874, "num_tokens": 672001957.0, "step": 3395 }, { "entropy": 0.7138026226650585, "epoch": 0.6806187443130118, "grad_norm": 1.73914635181427, "learning_rate": 1.4664145907473309e-06, "loss": 0.1551, "mean_token_accuracy": 0.9598342695019462, "num_tokens": 672725611.0, "step": 3400 }, { "entropy": 0.6170999803326347, "epoch": 0.6816196542311193, "grad_norm": 0.9073975682258606, "learning_rate": 1.4653024911032027e-06, "loss": 0.1485, "mean_token_accuracy": 0.960613077878952, "num_tokens": 673851656.0, "step": 3405 }, { "entropy": 0.6696691445328973, "epoch": 0.6826205641492266, "grad_norm": 0.8153337836265564, "learning_rate": 1.4641903914590748e-06, "loss": 0.1569, "mean_token_accuracy": 0.957689621773633, "num_tokens": 674934713.0, "step": 3410 }, { "entropy": 0.6971980799328197, "epoch": 0.683621474067334, "grad_norm": 0.7351928949356079, "learning_rate": 1.4630782918149464e-06, "loss": 0.1554, "mean_token_accuracy": 0.9583224740895357, "num_tokens": 675967339.0, "step": 3415 }, { "entropy": 0.6999681651592254, "epoch": 0.6846223839854413, "grad_norm": 0.9492703676223755, "learning_rate": 1.4619661921708185e-06, "loss": 0.1508, "mean_token_accuracy": 0.9597205470908772, "num_tokens": 676914390.0, "step": 3420 }, { "epoch": 0.6846223839854413, "eval_entropy": 0.6426876177553271, "eval_loss": 0.18153499066829681, "eval_mean_token_accuracy": 0.9500644890988459, "eval_num_tokens": 676914390.0, "eval_runtime": 7.0075, "eval_samples_per_second": 138.851, "eval_steps_per_second": 8.705, "step": 3420 }, { "entropy": 0.7060933086005124, "epoch": 0.6856232939035487, "grad_norm": 1.5791497230529785, "learning_rate": 1.4608540925266904e-06, "loss": 0.1552, "mean_token_accuracy": 0.9589927136898041, "num_tokens": 677638601.0, "step": 3425 }, { "entropy": 0.6128041752360084, "epoch": 0.686624203821656, "grad_norm": 0.9395958781242371, "learning_rate": 1.4597419928825622e-06, "loss": 0.1481, "mean_token_accuracy": 0.9608120690692555, "num_tokens": 678785268.0, "step": 3430 }, { "entropy": 0.6682237459854646, "epoch": 0.6876251137397634, "grad_norm": 0.8034733533859253, "learning_rate": 1.458629893238434e-06, "loss": 0.1563, "mean_token_accuracy": 0.9580443588170138, "num_tokens": 679846215.0, "step": 3435 }, { "entropy": 0.6912163682959297, "epoch": 0.6886260236578708, "grad_norm": 0.7852122187614441, "learning_rate": 1.457517793594306e-06, "loss": 0.1562, "mean_token_accuracy": 0.957829516584223, "num_tokens": 680888257.0, "step": 3440 }, { "entropy": 0.7082947004925121, "epoch": 0.6896269335759782, "grad_norm": 0.6746036410331726, "learning_rate": 1.4564056939501778e-06, "loss": 0.1587, "mean_token_accuracy": 0.9582202104004947, "num_tokens": 681859900.0, "step": 3445 }, { "entropy": 0.7029309023510326, "epoch": 0.6906278434940856, "grad_norm": 1.6336463689804077, "learning_rate": 1.4552935943060499e-06, "loss": 0.1518, "mean_token_accuracy": 0.9600752207365904, "num_tokens": 682593875.0, "step": 3450 }, { "epoch": 0.6906278434940856, "eval_entropy": 0.650386592403787, "eval_loss": 0.18218755722045898, "eval_mean_token_accuracy": 0.9497824721649045, "eval_num_tokens": 682593875.0, "eval_runtime": 6.9925, "eval_samples_per_second": 139.149, "eval_steps_per_second": 8.724, "step": 3450 }, { "entropy": 0.6209653827277097, "epoch": 0.6916287534121929, "grad_norm": 0.9056838750839233, "learning_rate": 1.4541814946619217e-06, "loss": 0.1525, "mean_token_accuracy": 0.9596792773766951, "num_tokens": 683728552.0, "step": 3455 }, { "entropy": 0.6720722063021226, "epoch": 0.6926296633303003, "grad_norm": 0.848111093044281, "learning_rate": 1.4530693950177934e-06, "loss": 0.1556, "mean_token_accuracy": 0.9583455557172949, "num_tokens": 684820291.0, "step": 3460 }, { "entropy": 0.6981573473323476, "epoch": 0.6936305732484076, "grad_norm": 0.8181219100952148, "learning_rate": 1.4519572953736654e-06, "loss": 0.1619, "mean_token_accuracy": 0.9571636861020868, "num_tokens": 685843222.0, "step": 3465 }, { "entropy": 0.6986917571587996, "epoch": 0.694631483166515, "grad_norm": 0.6309542059898376, "learning_rate": 1.4508451957295373e-06, "loss": 0.1487, "mean_token_accuracy": 0.960549614646218, "num_tokens": 686795079.0, "step": 3470 }, { "entropy": 0.6992609934373335, "epoch": 0.6956323930846224, "grad_norm": 1.6340988874435425, "learning_rate": 1.4497330960854094e-06, "loss": 0.1526, "mean_token_accuracy": 0.9600770901549947, "num_tokens": 687534250.0, "step": 3475 }, { "entropy": 0.6203337536616759, "epoch": 0.6966333030027297, "grad_norm": 0.9461851119995117, "learning_rate": 1.448620996441281e-06, "loss": 0.1462, "mean_token_accuracy": 0.9612834345210682, "num_tokens": 688659595.0, "step": 3480 }, { "epoch": 0.6966333030027297, "eval_entropy": 0.6457425873787677, "eval_loss": 0.18017198145389557, "eval_mean_token_accuracy": 0.9507011683260809, "eval_num_tokens": 688659595.0, "eval_runtime": 7.1026, "eval_samples_per_second": 136.992, "eval_steps_per_second": 8.588, "step": 3480 }, { "entropy": 0.6704084252769297, "epoch": 0.6976342129208372, "grad_norm": 0.8556516766548157, "learning_rate": 1.4475088967971529e-06, "loss": 0.1551, "mean_token_accuracy": 0.9571811556816101, "num_tokens": 689757561.0, "step": 3485 }, { "entropy": 0.6925178121436726, "epoch": 0.6986351228389445, "grad_norm": 0.7813107967376709, "learning_rate": 1.446396797153025e-06, "loss": 0.1571, "mean_token_accuracy": 0.958230844410983, "num_tokens": 690812177.0, "step": 3490 }, { "entropy": 0.716183881055225, "epoch": 0.6996360327570519, "grad_norm": 0.6608054637908936, "learning_rate": 1.4452846975088968e-06, "loss": 0.1551, "mean_token_accuracy": 0.9585723102092742, "num_tokens": 691767285.0, "step": 3495 }, { "entropy": 0.7106182558970018, "epoch": 0.7006369426751592, "grad_norm": 1.6784389019012451, "learning_rate": 1.4441725978647684e-06, "loss": 0.1527, "mean_token_accuracy": 0.959889015826312, "num_tokens": 692508977.0, "step": 3500 }, { "entropy": 0.6119228395548734, "epoch": 0.7016378525932666, "grad_norm": 0.9040566086769104, "learning_rate": 1.4430604982206405e-06, "loss": 0.1463, "mean_token_accuracy": 0.9611663525754756, "num_tokens": 693677838.0, "step": 3505 }, { "entropy": 0.6678372830152511, "epoch": 0.702638762511374, "grad_norm": 0.8394715189933777, "learning_rate": 1.4419483985765124e-06, "loss": 0.1591, "mean_token_accuracy": 0.9574539330872622, "num_tokens": 694757367.0, "step": 3510 }, { "epoch": 0.702638762511374, "eval_entropy": 0.6457846052333956, "eval_loss": 0.18403349816799164, "eval_mean_token_accuracy": 0.9498394170745474, "eval_num_tokens": 694757367.0, "eval_runtime": 7.199, "eval_samples_per_second": 135.158, "eval_steps_per_second": 8.473, "step": 3510 }, { "entropy": 0.6866304833780635, "epoch": 0.7036396724294813, "grad_norm": 0.7923183441162109, "learning_rate": 1.4408362989323844e-06, "loss": 0.1548, "mean_token_accuracy": 0.9585600186478008, "num_tokens": 695807809.0, "step": 3515 }, { "entropy": 0.698677041313865, "epoch": 0.7046405823475888, "grad_norm": 0.6395448446273804, "learning_rate": 1.439724199288256e-06, "loss": 0.1552, "mean_token_accuracy": 0.9591563501141288, "num_tokens": 696762341.0, "step": 3520 }, { "entropy": 0.7000049211762168, "epoch": 0.7056414922656961, "grad_norm": 1.704610824584961, "learning_rate": 1.438612099644128e-06, "loss": 0.1483, "mean_token_accuracy": 0.9610144132917577, "num_tokens": 697491178.0, "step": 3525 }, { "entropy": 0.616519127108834, "epoch": 0.7066424021838035, "grad_norm": 0.938034176826477, "learning_rate": 1.4375e-06, "loss": 0.1466, "mean_token_accuracy": 0.961005428162488, "num_tokens": 698636465.0, "step": 3530 }, { "entropy": 0.6671069808981636, "epoch": 0.7076433121019108, "grad_norm": 0.8367746472358704, "learning_rate": 1.4363879003558719e-06, "loss": 0.1587, "mean_token_accuracy": 0.957984118569981, "num_tokens": 699733654.0, "step": 3535 }, { "entropy": 0.686641216007146, "epoch": 0.7086442220200182, "grad_norm": 0.7497020959854126, "learning_rate": 1.4352758007117437e-06, "loss": 0.1528, "mean_token_accuracy": 0.9585920035839081, "num_tokens": 700769270.0, "step": 3540 }, { "epoch": 0.7086442220200182, "eval_entropy": 0.6463534255496791, "eval_loss": 0.1813378483057022, "eval_mean_token_accuracy": 0.9505206639649438, "eval_num_tokens": 700769270.0, "eval_runtime": 7.0738, "eval_samples_per_second": 137.55, "eval_steps_per_second": 8.623, "step": 3540 }, { "entropy": 0.6957107446410439, "epoch": 0.7096451319381256, "grad_norm": 0.6717329025268555, "learning_rate": 1.4341637010676156e-06, "loss": 0.1505, "mean_token_accuracy": 0.960306400602514, "num_tokens": 701718494.0, "step": 3545 }, { "entropy": 0.7116701342842796, "epoch": 0.7106460418562329, "grad_norm": 1.768558144569397, "learning_rate": 1.4330516014234874e-06, "loss": 0.1514, "mean_token_accuracy": 0.9600772873921828, "num_tokens": 702436336.0, "step": 3550 }, { "entropy": 0.6127610867673701, "epoch": 0.7116469517743403, "grad_norm": 1.0162396430969238, "learning_rate": 1.4319395017793595e-06, "loss": 0.1489, "mean_token_accuracy": 0.9602883994579315, "num_tokens": 703581833.0, "step": 3555 }, { "entropy": 0.6664238596504385, "epoch": 0.7126478616924476, "grad_norm": 1.0451632738113403, "learning_rate": 1.4308274021352314e-06, "loss": 0.154, "mean_token_accuracy": 0.9591329000212929, "num_tokens": 704674589.0, "step": 3560 }, { "entropy": 0.6827213899655775, "epoch": 0.7136487716105551, "grad_norm": 1.1186326742172241, "learning_rate": 1.429715302491103e-06, "loss": 0.1515, "mean_token_accuracy": 0.9586131052537398, "num_tokens": 705709505.0, "step": 3565 }, { "entropy": 0.7037599785761399, "epoch": 0.7146496815286624, "grad_norm": 1.0326136350631714, "learning_rate": 1.428603202846975e-06, "loss": 0.1504, "mean_token_accuracy": 0.9594932919198816, "num_tokens": 706667750.0, "step": 3570 }, { "epoch": 0.7146496815286624, "eval_entropy": 0.646297568180522, "eval_loss": 0.18333254754543304, "eval_mean_token_accuracy": 0.9501433626550143, "eval_num_tokens": 706667750.0, "eval_runtime": 7.0291, "eval_samples_per_second": 138.425, "eval_steps_per_second": 8.678, "step": 3570 }, { "entropy": 0.7007351406595924, "epoch": 0.7156505914467698, "grad_norm": 1.76486337184906, "learning_rate": 1.427491103202847e-06, "loss": 0.1477, "mean_token_accuracy": 0.9616134372624484, "num_tokens": 707404649.0, "step": 3575 }, { "entropy": 0.6178434678099373, "epoch": 0.7166515013648772, "grad_norm": 0.9774680137634277, "learning_rate": 1.4263790035587188e-06, "loss": 0.1466, "mean_token_accuracy": 0.9608237499540503, "num_tokens": 708535434.0, "step": 3580 }, { "entropy": 0.6685388267040253, "epoch": 0.7176524112829845, "grad_norm": 0.8269554972648621, "learning_rate": 1.4252669039145906e-06, "loss": 0.1526, "mean_token_accuracy": 0.9589363054795699, "num_tokens": 709623931.0, "step": 3585 }, { "entropy": 0.6837363061579791, "epoch": 0.7186533212010919, "grad_norm": 0.759075939655304, "learning_rate": 1.4241548042704625e-06, "loss": 0.1502, "mean_token_accuracy": 0.9594019618901339, "num_tokens": 710666538.0, "step": 3590 }, { "entropy": 0.6967312319712206, "epoch": 0.7196542311191992, "grad_norm": 0.7646484971046448, "learning_rate": 1.4230427046263344e-06, "loss": 0.1507, "mean_token_accuracy": 0.9600866355679252, "num_tokens": 711616703.0, "step": 3595 }, { "entropy": 0.7033360708843578, "epoch": 0.7206551410373067, "grad_norm": 1.6652473211288452, "learning_rate": 1.4219306049822064e-06, "loss": 0.1527, "mean_token_accuracy": 0.9600472737442364, "num_tokens": 712339902.0, "step": 3600 }, { "epoch": 0.7206551410373067, "eval_entropy": 0.6492331389520989, "eval_loss": 0.18077199161052704, "eval_mean_token_accuracy": 0.9501383842014876, "eval_num_tokens": 712339902.0, "eval_runtime": 7.1169, "eval_samples_per_second": 136.717, "eval_steps_per_second": 8.571, "step": 3600 }, { "entropy": 0.6149807561527599, "epoch": 0.721656050955414, "grad_norm": 0.928352415561676, "learning_rate": 1.420818505338078e-06, "loss": 0.1451, "mean_token_accuracy": 0.9617514366453345, "num_tokens": 713469631.0, "step": 3605 }, { "entropy": 0.6708079273050481, "epoch": 0.7226569608735214, "grad_norm": 0.8746845722198486, "learning_rate": 1.4197064056939501e-06, "loss": 0.1553, "mean_token_accuracy": 0.9591595194556496, "num_tokens": 714568149.0, "step": 3610 }, { "entropy": 0.6869456586512652, "epoch": 0.7236578707916288, "grad_norm": 0.7715699076652527, "learning_rate": 1.418594306049822e-06, "loss": 0.1499, "mean_token_accuracy": 0.9593700820749457, "num_tokens": 715610113.0, "step": 3615 }, { "entropy": 0.6990539791909132, "epoch": 0.7246587807097361, "grad_norm": 0.7243727445602417, "learning_rate": 1.4174822064056939e-06, "loss": 0.1513, "mean_token_accuracy": 0.9602306030013344, "num_tokens": 716584071.0, "step": 3620 }, { "entropy": 0.7046049112623388, "epoch": 0.7256596906278435, "grad_norm": 1.6954907178878784, "learning_rate": 1.416370106761566e-06, "loss": 0.1451, "mean_token_accuracy": 0.962337300452319, "num_tokens": 717316967.0, "step": 3625 }, { "entropy": 0.6160386166789314, "epoch": 0.7266606005459508, "grad_norm": 0.9596716165542603, "learning_rate": 1.4152580071174376e-06, "loss": 0.1451, "mean_token_accuracy": 0.9615418428724463, "num_tokens": 718445509.0, "step": 3630 }, { "epoch": 0.7266606005459508, "eval_entropy": 0.6465183843354709, "eval_loss": 0.1834045797586441, "eval_mean_token_accuracy": 0.94984726143665, "eval_num_tokens": 718445509.0, "eval_runtime": 7.0491, "eval_samples_per_second": 138.031, "eval_steps_per_second": 8.654, "step": 3630 }, { "entropy": 0.6670124200257388, "epoch": 0.7276615104640582, "grad_norm": 0.8411712646484375, "learning_rate": 1.4141459074733094e-06, "loss": 0.1517, "mean_token_accuracy": 0.9583657335151325, "num_tokens": 719527229.0, "step": 3635 }, { "entropy": 0.691483823819594, "epoch": 0.7286624203821656, "grad_norm": 0.8118385672569275, "learning_rate": 1.4130338078291815e-06, "loss": 0.152, "mean_token_accuracy": 0.9590539748018438, "num_tokens": 720562230.0, "step": 3640 }, { "entropy": 0.6978759061206471, "epoch": 0.729663330300273, "grad_norm": 0.6536839604377747, "learning_rate": 1.4119217081850534e-06, "loss": 0.1491, "mean_token_accuracy": 0.9607249758460304, "num_tokens": 721510665.0, "step": 3645 }, { "entropy": 0.692423168908466, "epoch": 0.7306642402183804, "grad_norm": 1.5654547214508057, "learning_rate": 1.4108096085409252e-06, "loss": 0.1432, "mean_token_accuracy": 0.9622391191395846, "num_tokens": 722236574.0, "step": 3650 }, { "entropy": 0.6100410997867585, "epoch": 0.7316651501364877, "grad_norm": 0.9501330852508545, "learning_rate": 1.409697508896797e-06, "loss": 0.1428, "mean_token_accuracy": 0.9622214566577565, "num_tokens": 723386101.0, "step": 3655 }, { "entropy": 0.6609509679404172, "epoch": 0.7326660600545951, "grad_norm": 0.8339934945106506, "learning_rate": 1.408585409252669e-06, "loss": 0.1517, "mean_token_accuracy": 0.958848465572704, "num_tokens": 724478073.0, "step": 3660 }, { "epoch": 0.7326660600545951, "eval_entropy": 0.6399203392325855, "eval_loss": 0.18328502774238586, "eval_mean_token_accuracy": 0.9502141133683627, "eval_num_tokens": 724478073.0, "eval_runtime": 7.0325, "eval_samples_per_second": 138.358, "eval_steps_per_second": 8.674, "step": 3660 }, { "entropy": 0.6784357306632128, "epoch": 0.7336669699727024, "grad_norm": 0.7843255996704102, "learning_rate": 1.407473309608541e-06, "loss": 0.1497, "mean_token_accuracy": 0.9594681257551366, "num_tokens": 725530725.0, "step": 3665 }, { "entropy": 0.6910421122204173, "epoch": 0.7346678798908098, "grad_norm": 0.6633345484733582, "learning_rate": 1.4063612099644126e-06, "loss": 0.1475, "mean_token_accuracy": 0.9606767145070163, "num_tokens": 726494609.0, "step": 3670 }, { "entropy": 0.6835452020168304, "epoch": 0.7356687898089171, "grad_norm": 1.633773922920227, "learning_rate": 1.4052491103202845e-06, "loss": 0.1456, "mean_token_accuracy": 0.9621837160804055, "num_tokens": 727238125.0, "step": 3675 }, { "entropy": 0.603841777823188, "epoch": 0.7366696997270246, "grad_norm": 0.9435672760009766, "learning_rate": 1.4041370106761566e-06, "loss": 0.1412, "mean_token_accuracy": 0.9622807643630288, "num_tokens": 728383176.0, "step": 3680 }, { "entropy": 0.6598623080687089, "epoch": 0.737670609645132, "grad_norm": 0.9847542643547058, "learning_rate": 1.4030249110320284e-06, "loss": 0.1476, "mean_token_accuracy": 0.960059937021949, "num_tokens": 729471217.0, "step": 3685 }, { "entropy": 0.6746501594781875, "epoch": 0.7386715195632393, "grad_norm": 0.8633275628089905, "learning_rate": 1.4019128113879003e-06, "loss": 0.1535, "mean_token_accuracy": 0.9588108068162745, "num_tokens": 730506240.0, "step": 3690 }, { "epoch": 0.7386715195632393, "eval_entropy": 0.6451824957230052, "eval_loss": 0.1821545660495758, "eval_mean_token_accuracy": 0.9503177375089927, "eval_num_tokens": 730506240.0, "eval_runtime": 7.0055, "eval_samples_per_second": 138.891, "eval_steps_per_second": 8.707, "step": 3690 }, { "entropy": 0.6850697804581035, "epoch": 0.7396724294813467, "grad_norm": 0.7276564836502075, "learning_rate": 1.4008007117437721e-06, "loss": 0.1467, "mean_token_accuracy": 0.9607731239362196, "num_tokens": 731447363.0, "step": 3695 }, { "entropy": 0.6955865442752838, "epoch": 0.740673339399454, "grad_norm": 1.6277596950531006, "learning_rate": 1.399688612099644e-06, "loss": 0.1486, "mean_token_accuracy": 0.9611599418249998, "num_tokens": 732166557.0, "step": 3700 }, { "entropy": 0.605426854707978, "epoch": 0.7416742493175614, "grad_norm": 0.9806519746780396, "learning_rate": 1.398576512455516e-06, "loss": 0.1463, "mean_token_accuracy": 0.9612675450064919, "num_tokens": 733291659.0, "step": 3705 }, { "entropy": 0.6565829529003664, "epoch": 0.7426751592356687, "grad_norm": 0.8315209746360779, "learning_rate": 1.397464412811388e-06, "loss": 0.1497, "mean_token_accuracy": 0.9587733626365662, "num_tokens": 734373176.0, "step": 3710 }, { "entropy": 0.6744102640585465, "epoch": 0.7436760691537762, "grad_norm": 0.7913417816162109, "learning_rate": 1.3963523131672596e-06, "loss": 0.1491, "mean_token_accuracy": 0.9598020212216811, "num_tokens": 735417403.0, "step": 3715 }, { "entropy": 0.6832040979103609, "epoch": 0.7446769790718836, "grad_norm": 0.6335570216178894, "learning_rate": 1.3952402135231316e-06, "loss": 0.1478, "mean_token_accuracy": 0.9613436964425174, "num_tokens": 736379877.0, "step": 3720 }, { "epoch": 0.7446769790718836, "eval_entropy": 0.6417029988570292, "eval_loss": 0.18389441072940826, "eval_mean_token_accuracy": 0.9506415773610599, "eval_num_tokens": 736379877.0, "eval_runtime": 6.9931, "eval_samples_per_second": 139.137, "eval_steps_per_second": 8.723, "step": 3720 }, { "entropy": 0.6932811298153617, "epoch": 0.7456778889899909, "grad_norm": 1.6904598474502563, "learning_rate": 1.3941281138790035e-06, "loss": 0.1446, "mean_token_accuracy": 0.9619651420549913, "num_tokens": 737125840.0, "step": 3725 }, { "entropy": 0.6003106886690314, "epoch": 0.7466787989080983, "grad_norm": 0.9378305077552795, "learning_rate": 1.3930160142348756e-06, "loss": 0.14, "mean_token_accuracy": 0.9627970738844438, "num_tokens": 738254662.0, "step": 3730 }, { "entropy": 0.6617660430344668, "epoch": 0.7476797088262056, "grad_norm": 0.8425555229187012, "learning_rate": 1.3919039145907472e-06, "loss": 0.1504, "mean_token_accuracy": 0.9594602817838842, "num_tokens": 739351062.0, "step": 3735 }, { "entropy": 0.6762841075658799, "epoch": 0.748680618744313, "grad_norm": 0.7894054055213928, "learning_rate": 1.390791814946619e-06, "loss": 0.1473, "mean_token_accuracy": 0.9604934166778217, "num_tokens": 740390420.0, "step": 3740 }, { "entropy": 0.6912749702280218, "epoch": 0.7496815286624203, "grad_norm": 0.674893856048584, "learning_rate": 1.3896797153024911e-06, "loss": 0.1498, "mean_token_accuracy": 0.9602149833332408, "num_tokens": 741347590.0, "step": 3745 }, { "entropy": 0.6899059181863612, "epoch": 0.7506824385805277, "grad_norm": 1.6593986749649048, "learning_rate": 1.388567615658363e-06, "loss": 0.1446, "mean_token_accuracy": 0.9622206286950545, "num_tokens": 742079535.0, "step": 3750 }, { "epoch": 0.7506824385805277, "eval_entropy": 0.6426626101869052, "eval_loss": 0.18340618908405304, "eval_mean_token_accuracy": 0.950067000310929, "eval_num_tokens": 742079535.0, "eval_runtime": 7.0509, "eval_samples_per_second": 137.997, "eval_steps_per_second": 8.651, "step": 3750 }, { "entropy": 0.6002561170946468, "epoch": 0.7516833484986352, "grad_norm": 0.9747891426086426, "learning_rate": 1.3874555160142347e-06, "loss": 0.1398, "mean_token_accuracy": 0.9628812367265874, "num_tokens": 743216959.0, "step": 3755 }, { "entropy": 0.6451197315346111, "epoch": 0.7526842584167425, "grad_norm": 0.8404658436775208, "learning_rate": 1.3863434163701067e-06, "loss": 0.1482, "mean_token_accuracy": 0.9605841652913527, "num_tokens": 744296247.0, "step": 3760 }, { "entropy": 0.6663664116100831, "epoch": 0.7536851683348499, "grad_norm": 0.8358835577964783, "learning_rate": 1.3852313167259786e-06, "loss": 0.1478, "mean_token_accuracy": 0.9607979530637915, "num_tokens": 745340381.0, "step": 3765 }, { "entropy": 0.6912969998338006, "epoch": 0.7546860782529572, "grad_norm": 0.6832510828971863, "learning_rate": 1.3841192170818504e-06, "loss": 0.1438, "mean_token_accuracy": 0.9614791436628862, "num_tokens": 746282199.0, "step": 3770 }, { "entropy": 0.6806035925041546, "epoch": 0.7556869881710646, "grad_norm": 1.5931488275527954, "learning_rate": 1.3830071174377223e-06, "loss": 0.1384, "mean_token_accuracy": 0.9636109731414101, "num_tokens": 747012358.0, "step": 3775 }, { "entropy": 0.599098454280333, "epoch": 0.756687898089172, "grad_norm": 0.974331796169281, "learning_rate": 1.3818950177935942e-06, "loss": 0.1401, "mean_token_accuracy": 0.962726751240817, "num_tokens": 748131245.0, "step": 3780 }, { "epoch": 0.756687898089172, "eval_entropy": 0.6385055829267032, "eval_loss": 0.18179599940776825, "eval_mean_token_accuracy": 0.9507253433837265, "eval_num_tokens": 748131245.0, "eval_runtime": 7.076, "eval_samples_per_second": 137.507, "eval_steps_per_second": 8.621, "step": 3780 }, { "entropy": 0.6537848071618514, "epoch": 0.7576888080072793, "grad_norm": 0.834989607334137, "learning_rate": 1.3807829181494662e-06, "loss": 0.1487, "mean_token_accuracy": 0.9598635267127644, "num_tokens": 749231997.0, "step": 3785 }, { "entropy": 0.677148444273255, "epoch": 0.7586897179253868, "grad_norm": 0.8472671508789062, "learning_rate": 1.379670818505338e-06, "loss": 0.1497, "mean_token_accuracy": 0.959491520578211, "num_tokens": 750255932.0, "step": 3790 }, { "entropy": 0.6869501252066005, "epoch": 0.7596906278434941, "grad_norm": 0.8762997984886169, "learning_rate": 1.37855871886121e-06, "loss": 0.1497, "mean_token_accuracy": 0.9598960773511367, "num_tokens": 751208745.0, "step": 3795 }, { "entropy": 0.6860447016629305, "epoch": 0.7606915377616015, "grad_norm": 1.5384591817855835, "learning_rate": 1.3774466192170818e-06, "loss": 0.1446, "mean_token_accuracy": 0.962565876678987, "num_tokens": 751930047.0, "step": 3800 }, { "entropy": 0.6060644594105807, "epoch": 0.7616924476797088, "grad_norm": 1.1648614406585693, "learning_rate": 1.3763345195729537e-06, "loss": 0.1388, "mean_token_accuracy": 0.9629409508271651, "num_tokens": 753058832.0, "step": 3805 }, { "entropy": 0.6572167962789536, "epoch": 0.7626933575978162, "grad_norm": 0.8445199131965637, "learning_rate": 1.3752224199288255e-06, "loss": 0.1498, "mean_token_accuracy": 0.9599610285325484, "num_tokens": 754149822.0, "step": 3810 }, { "epoch": 0.7626933575978162, "eval_entropy": 0.6403389221332112, "eval_loss": 0.18195439875125885, "eval_mean_token_accuracy": 0.9507655999699577, "eval_num_tokens": 754149822.0, "eval_runtime": 7.0319, "eval_samples_per_second": 138.37, "eval_steps_per_second": 8.675, "step": 3810 }, { "entropy": 0.6741142516786401, "epoch": 0.7636942675159236, "grad_norm": 0.7991525530815125, "learning_rate": 1.3741103202846976e-06, "loss": 0.1478, "mean_token_accuracy": 0.9602746784687042, "num_tokens": 755198711.0, "step": 3815 }, { "entropy": 0.6725743767890063, "epoch": 0.7646951774340309, "grad_norm": 0.7079398036003113, "learning_rate": 1.3729982206405692e-06, "loss": 0.1394, "mean_token_accuracy": 0.9624946919354526, "num_tokens": 756163341.0, "step": 3820 }, { "entropy": 0.6698882056908174, "epoch": 0.7656960873521383, "grad_norm": 1.74517822265625, "learning_rate": 1.3718861209964413e-06, "loss": 0.1407, "mean_token_accuracy": 0.9628500087694688, "num_tokens": 756905491.0, "step": 3825 }, { "entropy": 0.5904871030287309, "epoch": 0.7666969972702457, "grad_norm": 0.9414699077606201, "learning_rate": 1.3707740213523132e-06, "loss": 0.141, "mean_token_accuracy": 0.9624408916993574, "num_tokens": 758060283.0, "step": 3830 }, { "entropy": 0.6385924938050184, "epoch": 0.7676979071883531, "grad_norm": 0.8462045788764954, "learning_rate": 1.369661921708185e-06, "loss": 0.1453, "mean_token_accuracy": 0.9610617979006334, "num_tokens": 759147397.0, "step": 3835 }, { "entropy": 0.6599980823018334, "epoch": 0.7686988171064604, "grad_norm": 0.7839226126670837, "learning_rate": 1.3685498220640569e-06, "loss": 0.1471, "mean_token_accuracy": 0.9601600359786641, "num_tokens": 760195635.0, "step": 3840 }, { "epoch": 0.7686988171064604, "eval_entropy": 0.6299580189048267, "eval_loss": 0.18395261466503143, "eval_mean_token_accuracy": 0.9503186227845364, "eval_num_tokens": 760195635.0, "eval_runtime": 7.0623, "eval_samples_per_second": 137.773, "eval_steps_per_second": 8.637, "step": 3840 }, { "entropy": 0.6697973516854373, "epoch": 0.7696997270245678, "grad_norm": 0.7063069939613342, "learning_rate": 1.3674377224199287e-06, "loss": 0.1428, "mean_token_accuracy": 0.962298633293672, "num_tokens": 761156776.0, "step": 3845 }, { "entropy": 0.6719630772417242, "epoch": 0.7707006369426752, "grad_norm": 1.6459494829177856, "learning_rate": 1.3663256227758006e-06, "loss": 0.1417, "mean_token_accuracy": 0.9631251654841683, "num_tokens": 761884891.0, "step": 3850 }, { "entropy": 0.5801961367780512, "epoch": 0.7717015468607825, "grad_norm": 0.9611675143241882, "learning_rate": 1.3652135231316726e-06, "loss": 0.1371, "mean_token_accuracy": 0.963437082008882, "num_tokens": 763026278.0, "step": 3855 }, { "entropy": 0.6409909424456683, "epoch": 0.7727024567788899, "grad_norm": 0.904933750629425, "learning_rate": 1.3641014234875443e-06, "loss": 0.145, "mean_token_accuracy": 0.9613065215674313, "num_tokens": 764133161.0, "step": 3860 }, { "entropy": 0.6667505315758965, "epoch": 0.7737033666969972, "grad_norm": 0.8145974278450012, "learning_rate": 1.3629893238434162e-06, "loss": 0.1426, "mean_token_accuracy": 0.9608027013865384, "num_tokens": 765182864.0, "step": 3865 }, { "entropy": 0.6728833125396209, "epoch": 0.7747042766151047, "grad_norm": 0.6522021889686584, "learning_rate": 1.3618772241992882e-06, "loss": 0.1428, "mean_token_accuracy": 0.96253671429374, "num_tokens": 766151392.0, "step": 3870 }, { "epoch": 0.7747042766151047, "eval_entropy": 0.6360755171932158, "eval_loss": 0.18470442295074463, "eval_mean_token_accuracy": 0.9502559939368826, "eval_num_tokens": 766151392.0, "eval_runtime": 7.0452, "eval_samples_per_second": 138.108, "eval_steps_per_second": 8.658, "step": 3870 }, { "entropy": 0.6780615603381938, "epoch": 0.775705186533212, "grad_norm": 1.7449678182601929, "learning_rate": 1.36076512455516e-06, "loss": 0.1426, "mean_token_accuracy": 0.9625177778980949, "num_tokens": 766878686.0, "step": 3875 }, { "entropy": 0.5946085046638142, "epoch": 0.7767060964513194, "grad_norm": 0.929898202419281, "learning_rate": 1.3596530249110321e-06, "loss": 0.142, "mean_token_accuracy": 0.9622558994726701, "num_tokens": 768017789.0, "step": 3880 }, { "entropy": 0.6389347041195089, "epoch": 0.7777070063694268, "grad_norm": 0.8491219878196716, "learning_rate": 1.3585409252669038e-06, "loss": 0.1447, "mean_token_accuracy": 0.9610752945596521, "num_tokens": 769116436.0, "step": 3885 }, { "entropy": 0.652755316008221, "epoch": 0.7787079162875341, "grad_norm": 0.7734766006469727, "learning_rate": 1.3574288256227757e-06, "loss": 0.1448, "mean_token_accuracy": 0.9613142360340465, "num_tokens": 770163077.0, "step": 3890 }, { "entropy": 0.6724637372927232, "epoch": 0.7797088262056415, "grad_norm": 0.7407189607620239, "learning_rate": 1.3563167259786477e-06, "loss": 0.1449, "mean_token_accuracy": 0.9617013438181443, "num_tokens": 771128050.0, "step": 3895 }, { "entropy": 0.6701836732300845, "epoch": 0.7807097361237488, "grad_norm": 1.544957160949707, "learning_rate": 1.3552046263345196e-06, "loss": 0.1402, "mean_token_accuracy": 0.9635271229527214, "num_tokens": 771862832.0, "step": 3900 }, { "epoch": 0.7807097361237488, "eval_entropy": 0.6336434786436987, "eval_loss": 0.18497776985168457, "eval_mean_token_accuracy": 0.9496565441616246, "eval_num_tokens": 771862832.0, "eval_runtime": 7.2324, "eval_samples_per_second": 134.533, "eval_steps_per_second": 8.434, "step": 3900 }, { "entropy": 0.6014582438902422, "epoch": 0.7817106460418562, "grad_norm": 0.9415779709815979, "learning_rate": 1.3540925266903912e-06, "loss": 0.1389, "mean_token_accuracy": 0.9630598826841874, "num_tokens": 772987977.0, "step": 3905 }, { "entropy": 0.6386843873695893, "epoch": 0.7827115559599636, "grad_norm": 0.8790847063064575, "learning_rate": 1.3529804270462633e-06, "loss": 0.1427, "mean_token_accuracy": 0.9621189491315322, "num_tokens": 774081270.0, "step": 3910 }, { "entropy": 0.6616236562078649, "epoch": 0.783712465878071, "grad_norm": 0.8104801177978516, "learning_rate": 1.3518683274021352e-06, "loss": 0.1497, "mean_token_accuracy": 0.9597180469469591, "num_tokens": 775149795.0, "step": 3915 }, { "entropy": 0.6556011256846515, "epoch": 0.7847133757961784, "grad_norm": 0.672337532043457, "learning_rate": 1.3507562277580072e-06, "loss": 0.1356, "mean_token_accuracy": 0.9633873435583982, "num_tokens": 776126187.0, "step": 3920 }, { "entropy": 0.6662999545986003, "epoch": 0.7857142857142857, "grad_norm": 1.6722906827926636, "learning_rate": 1.3496441281138789e-06, "loss": 0.1377, "mean_token_accuracy": 0.9638949724760922, "num_tokens": 776852493.0, "step": 3925 }, { "entropy": 0.5871605239131233, "epoch": 0.7867151956323931, "grad_norm": 0.9676291942596436, "learning_rate": 1.3485320284697507e-06, "loss": 0.1399, "mean_token_accuracy": 0.9627551940354434, "num_tokens": 777990863.0, "step": 3930 }, { "epoch": 0.7867151956323931, "eval_entropy": 0.6308732477367901, "eval_loss": 0.18097124993801117, "eval_mean_token_accuracy": 0.9505264221644792, "eval_num_tokens": 777990863.0, "eval_runtime": 7.0433, "eval_samples_per_second": 138.145, "eval_steps_per_second": 8.661, "step": 3930 }, { "entropy": 0.6423692865805193, "epoch": 0.7877161055505004, "grad_norm": 0.8802201747894287, "learning_rate": 1.3474199288256228e-06, "loss": 0.1438, "mean_token_accuracy": 0.961167681759054, "num_tokens": 779081702.0, "step": 3935 }, { "entropy": 0.6667531590570103, "epoch": 0.7887170154686078, "grad_norm": 0.7931551933288574, "learning_rate": 1.3463078291814947e-06, "loss": 0.14, "mean_token_accuracy": 0.9613506761464206, "num_tokens": 780111893.0, "step": 3940 }, { "entropy": 0.6715754471041939, "epoch": 0.7897179253867151, "grad_norm": 0.7701306939125061, "learning_rate": 1.3451957295373665e-06, "loss": 0.1411, "mean_token_accuracy": 0.9625847984444011, "num_tokens": 781073403.0, "step": 3945 }, { "entropy": 0.6649499947374518, "epoch": 0.7907188353048226, "grad_norm": 1.5195958614349365, "learning_rate": 1.3440836298932384e-06, "loss": 0.1359, "mean_token_accuracy": 0.9639710442586379, "num_tokens": 781810124.0, "step": 3950 }, { "entropy": 0.583565341071649, "epoch": 0.79171974522293, "grad_norm": 0.8992461562156677, "learning_rate": 1.3429715302491102e-06, "loss": 0.1341, "mean_token_accuracy": 0.9644046192819422, "num_tokens": 782971983.0, "step": 3955 }, { "entropy": 0.640065108646046, "epoch": 0.7927206551410373, "grad_norm": 0.8525144457817078, "learning_rate": 1.3418594306049823e-06, "loss": 0.14, "mean_token_accuracy": 0.9617968380451203, "num_tokens": 784058621.0, "step": 3960 }, { "epoch": 0.7927206551410373, "eval_entropy": 0.635074506529042, "eval_loss": 0.18335242569446564, "eval_mean_token_accuracy": 0.9508119587038384, "eval_num_tokens": 784058621.0, "eval_runtime": 7.1026, "eval_samples_per_second": 136.992, "eval_steps_per_second": 8.588, "step": 3960 }, { "entropy": 0.6650575345212763, "epoch": 0.7937215650591447, "grad_norm": 0.8279107213020325, "learning_rate": 1.3407473309608542e-06, "loss": 0.1428, "mean_token_accuracy": 0.9613479056141593, "num_tokens": 785094205.0, "step": 3965 }, { "entropy": 0.6705412731929259, "epoch": 0.794722474977252, "grad_norm": 0.6302039623260498, "learning_rate": 1.3396352313167258e-06, "loss": 0.1407, "mean_token_accuracy": 0.9632759739052166, "num_tokens": 786054123.0, "step": 3970 }, { "entropy": 0.672145242582668, "epoch": 0.7957233848953594, "grad_norm": 1.680174708366394, "learning_rate": 1.3385231316725979e-06, "loss": 0.1349, "mean_token_accuracy": 0.964633910222487, "num_tokens": 786786537.0, "step": 3975 }, { "entropy": 0.5964198713952845, "epoch": 0.7967242948134667, "grad_norm": 0.9663762450218201, "learning_rate": 1.3374110320284697e-06, "loss": 0.1359, "mean_token_accuracy": 0.963967761668292, "num_tokens": 787899948.0, "step": 3980 }, { "entropy": 0.6433003907853907, "epoch": 0.7977252047315742, "grad_norm": 0.8532348275184631, "learning_rate": 1.3362989323843416e-06, "loss": 0.1421, "mean_token_accuracy": 0.961438084190542, "num_tokens": 788992170.0, "step": 3985 }, { "entropy": 0.6622095113450831, "epoch": 0.7987261146496816, "grad_norm": 0.762026309967041, "learning_rate": 1.3351868327402134e-06, "loss": 0.1432, "mean_token_accuracy": 0.9612802380865271, "num_tokens": 790043605.0, "step": 3990 }, { "epoch": 0.7987261146496816, "eval_entropy": 0.6395599700388361, "eval_loss": 0.18441607058048248, "eval_mean_token_accuracy": 0.9507672083182414, "eval_num_tokens": 790043605.0, "eval_runtime": 7.0864, "eval_samples_per_second": 137.305, "eval_steps_per_second": 8.608, "step": 3990 }, { "entropy": 0.6702088984576139, "epoch": 0.7997270245677889, "grad_norm": 0.6676604151725769, "learning_rate": 1.3340747330960853e-06, "loss": 0.1413, "mean_token_accuracy": 0.9625859986652028, "num_tokens": 791005452.0, "step": 3995 }, { "entropy": 0.672667917880145, "epoch": 0.8007279344858963, "grad_norm": 1.7214157581329346, "learning_rate": 1.3329626334519572e-06, "loss": 0.1364, "mean_token_accuracy": 0.9645379922606728, "num_tokens": 791732168.0, "step": 4000 }, { "entropy": 0.5873585137453946, "epoch": 0.8017288444040036, "grad_norm": 0.9482730031013489, "learning_rate": 1.3318505338078292e-06, "loss": 0.1313, "mean_token_accuracy": 0.9647230906919999, "num_tokens": 792872336.0, "step": 4005 }, { "entropy": 0.6405410547148097, "epoch": 0.802729754322111, "grad_norm": 0.8936473727226257, "learning_rate": 1.3307384341637009e-06, "loss": 0.14, "mean_token_accuracy": 0.9622243496504697, "num_tokens": 793939516.0, "step": 4010 }, { "entropy": 0.6661894949999723, "epoch": 0.8037306642402183, "grad_norm": 0.7600812315940857, "learning_rate": 1.329626334519573e-06, "loss": 0.1397, "mean_token_accuracy": 0.961715197021311, "num_tokens": 794994800.0, "step": 4015 }, { "entropy": 0.6729139336130836, "epoch": 0.8047315741583257, "grad_norm": 0.6852670907974243, "learning_rate": 1.3285142348754448e-06, "loss": 0.1411, "mean_token_accuracy": 0.9618587027896535, "num_tokens": 795942393.0, "step": 4020 }, { "epoch": 0.8047315741583257, "eval_entropy": 0.6385497704881137, "eval_loss": 0.18298886716365814, "eval_mean_token_accuracy": 0.95083493092021, "eval_num_tokens": 795942393.0, "eval_runtime": 7.22, "eval_samples_per_second": 134.764, "eval_steps_per_second": 8.449, "step": 4020 }, { "entropy": 0.6744548082351685, "epoch": 0.8057324840764332, "grad_norm": 1.6530650854110718, "learning_rate": 1.3274021352313167e-06, "loss": 0.1324, "mean_token_accuracy": 0.965755269202319, "num_tokens": 796668510.0, "step": 4025 }, { "entropy": 0.5910497521812266, "epoch": 0.8067333939945405, "grad_norm": 0.9345577359199524, "learning_rate": 1.3262900355871887e-06, "loss": 0.1355, "mean_token_accuracy": 0.9639625316316431, "num_tokens": 797807447.0, "step": 4030 }, { "entropy": 0.6318013099106875, "epoch": 0.8077343039126479, "grad_norm": 0.8773898482322693, "learning_rate": 1.3251779359430604e-06, "loss": 0.1404, "mean_token_accuracy": 0.9624045025218617, "num_tokens": 798900021.0, "step": 4035 }, { "entropy": 0.6586670068177309, "epoch": 0.8087352138307552, "grad_norm": 0.7887452244758606, "learning_rate": 1.3240658362989322e-06, "loss": 0.1415, "mean_token_accuracy": 0.9622185707092286, "num_tokens": 799936750.0, "step": 4040 }, { "entropy": 0.6714841157197953, "epoch": 0.8097361237488626, "grad_norm": 0.6408624649047852, "learning_rate": 1.3229537366548043e-06, "loss": 0.1395, "mean_token_accuracy": 0.9631155973131006, "num_tokens": 800890339.0, "step": 4045 }, { "entropy": 0.671166700666601, "epoch": 0.8107370336669699, "grad_norm": 1.5876646041870117, "learning_rate": 1.3218416370106762e-06, "loss": 0.1345, "mean_token_accuracy": 0.9646015541120009, "num_tokens": 801621520.0, "step": 4050 }, { "epoch": 0.8107370336669699, "eval_entropy": 0.6287256723544636, "eval_loss": 0.18470019102096558, "eval_mean_token_accuracy": 0.950807981803769, "eval_num_tokens": 801621520.0, "eval_runtime": 7.0146, "eval_samples_per_second": 138.711, "eval_steps_per_second": 8.696, "step": 4050 }, { "entropy": 0.5836854978041215, "epoch": 0.8117379435850773, "grad_norm": 0.9563764929771423, "learning_rate": 1.320729537366548e-06, "loss": 0.137, "mean_token_accuracy": 0.9636379382827065, "num_tokens": 802760229.0, "step": 4055 }, { "entropy": 0.6356291532516479, "epoch": 0.8127388535031848, "grad_norm": 0.8574218153953552, "learning_rate": 1.3196174377224199e-06, "loss": 0.1469, "mean_token_accuracy": 0.9603158907456831, "num_tokens": 803856177.0, "step": 4060 }, { "entropy": 0.6495377407832579, "epoch": 0.8137397634212921, "grad_norm": 0.8074198365211487, "learning_rate": 1.3185053380782917e-06, "loss": 0.1386, "mean_token_accuracy": 0.9626891878518191, "num_tokens": 804911386.0, "step": 4065 }, { "entropy": 0.6640802432190288, "epoch": 0.8147406733393995, "grad_norm": 0.7216334342956543, "learning_rate": 1.3173932384341638e-06, "loss": 0.1376, "mean_token_accuracy": 0.9628864521330053, "num_tokens": 805866645.0, "step": 4070 }, { "entropy": 0.6685602345249869, "epoch": 0.8157415832575068, "grad_norm": 1.7459522485733032, "learning_rate": 1.3162811387900354e-06, "loss": 0.1392, "mean_token_accuracy": 0.9641159864989194, "num_tokens": 806595104.0, "step": 4075 }, { "entropy": 0.5927186499942433, "epoch": 0.8167424931756142, "grad_norm": 0.9222381114959717, "learning_rate": 1.3151690391459073e-06, "loss": 0.1358, "mean_token_accuracy": 0.9640862822532654, "num_tokens": 807727469.0, "step": 4080 }, { "epoch": 0.8167424931756142, "eval_entropy": 0.6358805328118996, "eval_loss": 0.18385587632656097, "eval_mean_token_accuracy": 0.9509023574532055, "eval_num_tokens": 807727469.0, "eval_runtime": 7.0308, "eval_samples_per_second": 138.392, "eval_steps_per_second": 8.676, "step": 4080 }, { "entropy": 0.6393450959162279, "epoch": 0.8177434030937215, "grad_norm": 0.84666907787323, "learning_rate": 1.3140569395017794e-06, "loss": 0.1404, "mean_token_accuracy": 0.961673707853664, "num_tokens": 808810685.0, "step": 4085 }, { "entropy": 0.6551467876542698, "epoch": 0.8187443130118289, "grad_norm": 0.8185970783233643, "learning_rate": 1.3129448398576512e-06, "loss": 0.1404, "mean_token_accuracy": 0.9627237601713701, "num_tokens": 809853102.0, "step": 4090 }, { "entropy": 0.6675399541854858, "epoch": 0.8197452229299363, "grad_norm": 0.6775302886962891, "learning_rate": 1.3118327402135229e-06, "loss": 0.1372, "mean_token_accuracy": 0.9637189258228649, "num_tokens": 810804376.0, "step": 4095 }, { "entropy": 0.668658479235389, "epoch": 0.8207461328480437, "grad_norm": 1.633571743965149, "learning_rate": 1.310720640569395e-06, "loss": 0.1355, "mean_token_accuracy": 0.9643102878873998, "num_tokens": 811535129.0, "step": 4100 }, { "entropy": 0.5880933046340943, "epoch": 0.8217470427661511, "grad_norm": 0.9041305780410767, "learning_rate": 1.3096085409252668e-06, "loss": 0.1334, "mean_token_accuracy": 0.9647295447913083, "num_tokens": 812666591.0, "step": 4105 }, { "entropy": 0.6378572691570629, "epoch": 0.8227479526842584, "grad_norm": 0.8185557723045349, "learning_rate": 1.3084964412811389e-06, "loss": 0.1345, "mean_token_accuracy": 0.963734941590916, "num_tokens": 813752886.0, "step": 4110 }, { "epoch": 0.8227479526842584, "eval_entropy": 0.6318896456820066, "eval_loss": 0.18597018718719482, "eval_mean_token_accuracy": 0.950390275384559, "eval_num_tokens": 813752886.0, "eval_runtime": 7.083, "eval_samples_per_second": 137.37, "eval_steps_per_second": 8.612, "step": 4110 }, { "entropy": 0.6565879512916911, "epoch": 0.8237488626023658, "grad_norm": 0.8251848816871643, "learning_rate": 1.3073843416370107e-06, "loss": 0.1373, "mean_token_accuracy": 0.9621736976233396, "num_tokens": 814780578.0, "step": 4115 }, { "entropy": 0.6591654620387337, "epoch": 0.8247497725204732, "grad_norm": 0.680801510810852, "learning_rate": 1.3062722419928824e-06, "loss": 0.134, "mean_token_accuracy": 0.9642984504049474, "num_tokens": 815727309.0, "step": 4120 }, { "entropy": 0.6689842855388468, "epoch": 0.8257506824385805, "grad_norm": 1.6310633420944214, "learning_rate": 1.3051601423487544e-06, "loss": 0.132, "mean_token_accuracy": 0.9654631246219981, "num_tokens": 816454050.0, "step": 4125 }, { "entropy": 0.5842076683586294, "epoch": 0.8267515923566879, "grad_norm": 1.0107406377792358, "learning_rate": 1.3040480427046263e-06, "loss": 0.1314, "mean_token_accuracy": 0.965171016888185, "num_tokens": 817616130.0, "step": 4130 }, { "entropy": 0.6371192531152206, "epoch": 0.8277525022747952, "grad_norm": 0.8968419432640076, "learning_rate": 1.3029359430604982e-06, "loss": 0.1369, "mean_token_accuracy": 0.9628690156069669, "num_tokens": 818690928.0, "step": 4135 }, { "entropy": 0.6599432755600322, "epoch": 0.8287534121929027, "grad_norm": 0.9083675146102905, "learning_rate": 1.30182384341637e-06, "loss": 0.1427, "mean_token_accuracy": 0.9619063783775677, "num_tokens": 819719290.0, "step": 4140 }, { "epoch": 0.8287534121929027, "eval_entropy": 0.627330626620621, "eval_loss": 0.18673530220985413, "eval_mean_token_accuracy": 0.9505173554185962, "eval_num_tokens": 819719290.0, "eval_runtime": 7.0694, "eval_samples_per_second": 137.636, "eval_steps_per_second": 8.629, "step": 4140 }, { "entropy": 0.6650434326041829, "epoch": 0.82975432211101, "grad_norm": 0.7410187721252441, "learning_rate": 1.3007117437722419e-06, "loss": 0.1343, "mean_token_accuracy": 0.9638854568654841, "num_tokens": 820658717.0, "step": 4145 }, { "entropy": 0.6653602497144179, "epoch": 0.8307552320291174, "grad_norm": 1.657799243927002, "learning_rate": 1.299599644128114e-06, "loss": 0.1314, "mean_token_accuracy": 0.9655446870760485, "num_tokens": 821387042.0, "step": 4150 }, { "entropy": 0.5887753250924024, "epoch": 0.8317561419472248, "grad_norm": 0.9166984558105469, "learning_rate": 1.2984875444839858e-06, "loss": 0.1305, "mean_token_accuracy": 0.9651481601324948, "num_tokens": 822524995.0, "step": 4155 }, { "entropy": 0.6301268073645505, "epoch": 0.8327570518653321, "grad_norm": 0.8409593105316162, "learning_rate": 1.2973754448398574e-06, "loss": 0.1316, "mean_token_accuracy": 0.9642227351665497, "num_tokens": 823595330.0, "step": 4160 }, { "entropy": 0.6482060646468942, "epoch": 0.8337579617834395, "grad_norm": 0.828996479511261, "learning_rate": 1.2962633451957295e-06, "loss": 0.141, "mean_token_accuracy": 0.9620839937166734, "num_tokens": 824625650.0, "step": 4165 }, { "entropy": 0.6599689732898365, "epoch": 0.8347588717015468, "grad_norm": 0.6699210405349731, "learning_rate": 1.2951512455516014e-06, "loss": 0.1399, "mean_token_accuracy": 0.9629984969442541, "num_tokens": 825583528.0, "step": 4170 }, { "epoch": 0.8347588717015468, "eval_entropy": 0.629080096229178, "eval_loss": 0.18436747789382935, "eval_mean_token_accuracy": 0.9509243994462685, "eval_num_tokens": 825583528.0, "eval_runtime": 7.0802, "eval_samples_per_second": 137.426, "eval_steps_per_second": 8.616, "step": 4170 } ], "logging_steps": 5, "max_steps": 9992, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 30, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3661070247602422e+19, "train_batch_size": 3, "trial_name": null, "trial_params": null }