diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3364 @@ +{ + "best_global_step": 30000, + "best_metric": 1.5021542310714722, + "best_model_checkpoint": "/mnt/mydata2/MoE/SLMOE/FINAL-MODEL-V2/checkpoint-30000", + "epoch": 3.9583848780246425, + "eval_steps": 1000, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.9970872095227241, + "epoch": 0.013195440975143089, + "grad_norm": 4.293704986572266, + "learning_rate": 3.96e-06, + "loss": 16.2682, + "mean_token_accuracy": 0.5875154070556163, + "num_tokens": 4681608.0, + "step": 100 + }, + { + "entropy": 1.9974382193386555, + "epoch": 0.026390881950286178, + "grad_norm": 4.16009521484375, + "learning_rate": 7.960000000000002e-06, + "loss": 16.2883, + "mean_token_accuracy": 0.5872169394791126, + "num_tokens": 9422456.0, + "step": 200 + }, + { + "entropy": 1.9942241810262202, + "epoch": 0.03958632292542926, + "grad_norm": 4.352612495422363, + "learning_rate": 1.196e-05, + "loss": 16.2576, + "mean_token_accuracy": 0.5878808307275176, + "num_tokens": 14133809.0, + "step": 300 + }, + { + "entropy": 1.9993504942953586, + "epoch": 0.052781763900572355, + "grad_norm": 4.423626899719238, + "learning_rate": 1.5960000000000003e-05, + "loss": 16.3033, + "mean_token_accuracy": 0.5866739044710994, + "num_tokens": 18832160.0, + "step": 400 + }, + { + "entropy": 2.0050615316629408, + "epoch": 0.06597720487571544, + "grad_norm": 4.602761268615723, + "learning_rate": 1.9960000000000002e-05, + "loss": 16.3501, + "mean_token_accuracy": 0.5865391325205564, + "num_tokens": 23552733.0, + "step": 500 + }, + { + "entropy": 1.9940124328434468, + "epoch": 0.07917264585085852, + "grad_norm": 4.151139736175537, + "learning_rate": 1.9997385793504095e-05, + "loss": 16.2651, + "mean_token_accuracy": 0.5878189096227289, + "num_tokens": 28254208.0, + "step": 600 + }, + { + "entropy": 1.9896119074523448, + "epoch": 0.09236808682600162, + "grad_norm": 4.60593843460083, + "learning_rate": 1.9994745180881968e-05, + "loss": 16.233, + "mean_token_accuracy": 0.5883422317355871, + "num_tokens": 32941633.0, + "step": 700 + }, + { + "entropy": 1.9750585129857063, + "epoch": 0.10556352780114471, + "grad_norm": 4.440836429595947, + "learning_rate": 1.9992104568259838e-05, + "loss": 16.089, + "mean_token_accuracy": 0.5906037037447095, + "num_tokens": 37665687.0, + "step": 800 + }, + { + "entropy": 1.959184586405754, + "epoch": 0.11875896877628779, + "grad_norm": 4.4237213134765625, + "learning_rate": 1.998946395563771e-05, + "loss": 15.9685, + "mean_token_accuracy": 0.5921973436698318, + "num_tokens": 42408986.0, + "step": 900 + }, + { + "entropy": 1.9661301617324352, + "epoch": 0.13195440975143088, + "grad_norm": 4.37338399887085, + "learning_rate": 1.9986823343015583e-05, + "loss": 16.0303, + "mean_token_accuracy": 0.5912668254226446, + "num_tokens": 47206895.0, + "step": 1000 + }, + { + "epoch": 0.13195440975143088, + "eval_entropy": 1.7280410604585583, + "eval_loss": 1.876280426979065, + "eval_mean_token_accuracy": 0.6125666955363589, + "eval_num_tokens": 47206895.0, + "eval_runtime": 3192.5073, + "eval_samples_per_second": 33.76, + "eval_steps_per_second": 4.22, + "step": 1000 + }, + { + "entropy": 1.952514111250639, + "epoch": 0.14514985072657396, + "grad_norm": 4.2272047996521, + "learning_rate": 1.9984182730393453e-05, + "loss": 15.9104, + "mean_token_accuracy": 0.5935716019570827, + "num_tokens": 51930574.0, + "step": 1100 + }, + { + "entropy": 1.951762547492981, + "epoch": 0.15834529170171704, + "grad_norm": 4.368087291717529, + "learning_rate": 1.9981542117771325e-05, + "loss": 15.9057, + "mean_token_accuracy": 0.5942437520623207, + "num_tokens": 56651451.0, + "step": 1200 + }, + { + "entropy": 1.9393150758743287, + "epoch": 0.17154073267686015, + "grad_norm": 4.117947101593018, + "learning_rate": 1.9978901505149195e-05, + "loss": 15.7951, + "mean_token_accuracy": 0.5956410552933812, + "num_tokens": 61366203.0, + "step": 1300 + }, + { + "entropy": 1.9590581172704697, + "epoch": 0.18473617365200323, + "grad_norm": 4.429177761077881, + "learning_rate": 1.9976260892527068e-05, + "loss": 15.9623, + "mean_token_accuracy": 0.5927011578902602, + "num_tokens": 66105310.0, + "step": 1400 + }, + { + "entropy": 1.9479418122768402, + "epoch": 0.1979316146271463, + "grad_norm": 4.270752906799316, + "learning_rate": 1.997362027990494e-05, + "loss": 15.8612, + "mean_token_accuracy": 0.5947995102033019, + "num_tokens": 70828886.0, + "step": 1500 + }, + { + "entropy": 1.9247798535227776, + "epoch": 0.21112705560228942, + "grad_norm": 4.055892467498779, + "learning_rate": 1.997097966728281e-05, + "loss": 15.6661, + "mean_token_accuracy": 0.598046317063272, + "num_tokens": 75498479.0, + "step": 1600 + }, + { + "entropy": 1.920890159010887, + "epoch": 0.2243224965774325, + "grad_norm": 4.256282329559326, + "learning_rate": 1.9968339054660683e-05, + "loss": 15.6489, + "mean_token_accuracy": 0.598283537067473, + "num_tokens": 80183990.0, + "step": 1700 + }, + { + "entropy": 1.9235932590067386, + "epoch": 0.23751793755257558, + "grad_norm": 4.397217750549316, + "learning_rate": 1.9965698442038555e-05, + "loss": 15.6559, + "mean_token_accuracy": 0.5974558148160577, + "num_tokens": 84876881.0, + "step": 1800 + }, + { + "entropy": 1.9103332214057445, + "epoch": 0.2507133785277187, + "grad_norm": 4.097607612609863, + "learning_rate": 1.9963057829416425e-05, + "loss": 15.5497, + "mean_token_accuracy": 0.6003026623278856, + "num_tokens": 89611584.0, + "step": 1900 + }, + { + "entropy": 1.914695471227169, + "epoch": 0.26390881950286177, + "grad_norm": 4.328080177307129, + "learning_rate": 1.9960417216794298e-05, + "loss": 15.5798, + "mean_token_accuracy": 0.5991538398340345, + "num_tokens": 94341018.0, + "step": 2000 + }, + { + "epoch": 0.26390881950286177, + "eval_entropy": 1.6988523987462736, + "eval_loss": 1.8283307552337646, + "eval_mean_token_accuracy": 0.6190770778948521, + "eval_num_tokens": 94341018.0, + "eval_runtime": 3191.3926, + "eval_samples_per_second": 33.772, + "eval_steps_per_second": 4.222, + "step": 2000 + }, + { + "entropy": 1.9157458925247193, + "epoch": 0.27710426047800485, + "grad_norm": 4.077412128448486, + "learning_rate": 1.995777660417217e-05, + "loss": 15.5759, + "mean_token_accuracy": 0.599464968368411, + "num_tokens": 99101131.0, + "step": 2100 + }, + { + "entropy": 1.9121157658100127, + "epoch": 0.29029970145314793, + "grad_norm": 4.194200038909912, + "learning_rate": 1.995513599155004e-05, + "loss": 15.5592, + "mean_token_accuracy": 0.5996096661686897, + "num_tokens": 103821529.0, + "step": 2200 + }, + { + "entropy": 1.8956915408372879, + "epoch": 0.303495142428291, + "grad_norm": 4.202551364898682, + "learning_rate": 1.9952495378927913e-05, + "loss": 15.4098, + "mean_token_accuracy": 0.6023901195079088, + "num_tokens": 108538002.0, + "step": 2300 + }, + { + "entropy": 1.9013544350862503, + "epoch": 0.3166905834034341, + "grad_norm": 4.200172424316406, + "learning_rate": 1.9949854766305785e-05, + "loss": 15.4667, + "mean_token_accuracy": 0.6008677286282181, + "num_tokens": 113208708.0, + "step": 2400 + }, + { + "entropy": 1.8889785696566106, + "epoch": 0.3298860243785772, + "grad_norm": 4.3031110763549805, + "learning_rate": 1.9947214153683655e-05, + "loss": 15.3555, + "mean_token_accuracy": 0.6025829451531172, + "num_tokens": 117941057.0, + "step": 2500 + }, + { + "entropy": 1.8911775602400303, + "epoch": 0.3430814653537203, + "grad_norm": 4.05417013168335, + "learning_rate": 1.9944573541061528e-05, + "loss": 15.3777, + "mean_token_accuracy": 0.6022695601731539, + "num_tokens": 122654207.0, + "step": 2600 + }, + { + "entropy": 1.8944527316093445, + "epoch": 0.3562769063288634, + "grad_norm": 4.3383097648620605, + "learning_rate": 1.99419329284394e-05, + "loss": 15.3979, + "mean_token_accuracy": 0.6022358436137438, + "num_tokens": 127395162.0, + "step": 2700 + }, + { + "entropy": 1.8847786201536656, + "epoch": 0.36947234730400647, + "grad_norm": 4.202676296234131, + "learning_rate": 1.993929231581727e-05, + "loss": 15.3206, + "mean_token_accuracy": 0.6034166327491403, + "num_tokens": 132115907.0, + "step": 2800 + }, + { + "entropy": 1.8890833668410778, + "epoch": 0.38266778827914955, + "grad_norm": 4.259827613830566, + "learning_rate": 1.9936651703195143e-05, + "loss": 15.3557, + "mean_token_accuracy": 0.6027516862004996, + "num_tokens": 136834999.0, + "step": 2900 + }, + { + "entropy": 1.8763156107068062, + "epoch": 0.3958632292542926, + "grad_norm": 3.9494423866271973, + "learning_rate": 1.9934011090573016e-05, + "loss": 15.2545, + "mean_token_accuracy": 0.6045393405854702, + "num_tokens": 141601677.0, + "step": 3000 + }, + { + "epoch": 0.3958632292542926, + "eval_entropy": 1.6155824041097597, + "eval_loss": 1.7954074144363403, + "eval_mean_token_accuracy": 0.6239857971845204, + "eval_num_tokens": 141601677.0, + "eval_runtime": 3193.3391, + "eval_samples_per_second": 33.752, + "eval_steps_per_second": 4.219, + "step": 3000 + }, + { + "entropy": 1.8736148147284986, + "epoch": 0.4090586702294357, + "grad_norm": 4.154598712921143, + "learning_rate": 1.9931370477950885e-05, + "loss": 15.2248, + "mean_token_accuracy": 0.6050427352637052, + "num_tokens": 146320541.0, + "step": 3100 + }, + { + "entropy": 1.8728535547852516, + "epoch": 0.42225411120457884, + "grad_norm": 4.243231296539307, + "learning_rate": 1.9928729865328758e-05, + "loss": 15.2124, + "mean_token_accuracy": 0.6051392666250467, + "num_tokens": 151034322.0, + "step": 3200 + }, + { + "entropy": 1.8753738756477834, + "epoch": 0.4354495521797219, + "grad_norm": 4.269708633422852, + "learning_rate": 1.9926089252706627e-05, + "loss": 15.2319, + "mean_token_accuracy": 0.6053581718355417, + "num_tokens": 155764395.0, + "step": 3300 + }, + { + "entropy": 1.8592823737859725, + "epoch": 0.448644993154865, + "grad_norm": 4.155630588531494, + "learning_rate": 1.9923448640084503e-05, + "loss": 15.1026, + "mean_token_accuracy": 0.6073809728398919, + "num_tokens": 160546122.0, + "step": 3400 + }, + { + "entropy": 1.8590475974977017, + "epoch": 0.4618404341300081, + "grad_norm": 4.049267768859863, + "learning_rate": 1.9920808027462373e-05, + "loss": 15.1112, + "mean_token_accuracy": 0.607349122017622, + "num_tokens": 165237636.0, + "step": 3500 + }, + { + "entropy": 1.8729989735782147, + "epoch": 0.47503587510515116, + "grad_norm": 4.30539608001709, + "learning_rate": 1.9918167414840242e-05, + "loss": 15.2137, + "mean_token_accuracy": 0.6055576696619391, + "num_tokens": 169961118.0, + "step": 3600 + }, + { + "entropy": 1.8558836616575718, + "epoch": 0.48823131608029424, + "grad_norm": 4.059924125671387, + "learning_rate": 1.991552680221812e-05, + "loss": 15.079, + "mean_token_accuracy": 0.6075002931430936, + "num_tokens": 174694060.0, + "step": 3700 + }, + { + "entropy": 1.8584099148213864, + "epoch": 0.5014267570554374, + "grad_norm": 4.13109016418457, + "learning_rate": 1.9912886189595988e-05, + "loss": 15.0896, + "mean_token_accuracy": 0.6074171752110124, + "num_tokens": 179400845.0, + "step": 3800 + }, + { + "entropy": 1.8450819233059883, + "epoch": 0.5146221980305804, + "grad_norm": 3.8707573413848877, + "learning_rate": 1.991024557697386e-05, + "loss": 14.9808, + "mean_token_accuracy": 0.6096408772468567, + "num_tokens": 184084788.0, + "step": 3900 + }, + { + "entropy": 1.837720358669758, + "epoch": 0.5278176390057235, + "grad_norm": 4.098764419555664, + "learning_rate": 1.9907604964351733e-05, + "loss": 14.9232, + "mean_token_accuracy": 0.6106652595847845, + "num_tokens": 188774170.0, + "step": 4000 + }, + { + "epoch": 0.5278176390057235, + "eval_entropy": 1.5803278617990546, + "eval_loss": 1.7662431001663208, + "eval_mean_token_accuracy": 0.6280910855415341, + "eval_num_tokens": 188774170.0, + "eval_runtime": 3188.7876, + "eval_samples_per_second": 33.8, + "eval_steps_per_second": 4.225, + "step": 4000 + }, + { + "entropy": 1.8657746087014675, + "epoch": 0.5410130799808666, + "grad_norm": 4.129471778869629, + "learning_rate": 1.9904964351729603e-05, + "loss": 15.1413, + "mean_token_accuracy": 0.6065389148145914, + "num_tokens": 193461030.0, + "step": 4100 + }, + { + "entropy": 1.8531225949525834, + "epoch": 0.5542085209560097, + "grad_norm": 4.1151604652404785, + "learning_rate": 1.9902323739107476e-05, + "loss": 15.0425, + "mean_token_accuracy": 0.60759482935071, + "num_tokens": 198128722.0, + "step": 4200 + }, + { + "entropy": 1.842765960842371, + "epoch": 0.5674039619311528, + "grad_norm": 4.083479404449463, + "learning_rate": 1.9899683126485345e-05, + "loss": 14.9646, + "mean_token_accuracy": 0.6094940543919801, + "num_tokens": 202860373.0, + "step": 4300 + }, + { + "entropy": 1.8503095911443233, + "epoch": 0.5805994029062959, + "grad_norm": 4.04292631149292, + "learning_rate": 1.9897042513863218e-05, + "loss": 15.0189, + "mean_token_accuracy": 0.6088438139855862, + "num_tokens": 207589820.0, + "step": 4400 + }, + { + "entropy": 1.8336368641257286, + "epoch": 0.593794843881439, + "grad_norm": 4.082155704498291, + "learning_rate": 1.989440190124109e-05, + "loss": 14.881, + "mean_token_accuracy": 0.6111773661524057, + "num_tokens": 212317338.0, + "step": 4500 + }, + { + "entropy": 1.8509277951717378, + "epoch": 0.606990284856582, + "grad_norm": 4.0600996017456055, + "learning_rate": 1.989176128861896e-05, + "loss": 15.0153, + "mean_token_accuracy": 0.6084930662810802, + "num_tokens": 217036645.0, + "step": 4600 + }, + { + "entropy": 1.8213860428333282, + "epoch": 0.6201857258317252, + "grad_norm": 4.324210166931152, + "learning_rate": 1.9889120675996833e-05, + "loss": 14.7757, + "mean_token_accuracy": 0.6129432079568505, + "num_tokens": 221721335.0, + "step": 4700 + }, + { + "entropy": 1.8302579675614834, + "epoch": 0.6333811668068682, + "grad_norm": 4.18501091003418, + "learning_rate": 1.9886480063374706e-05, + "loss": 14.8499, + "mean_token_accuracy": 0.610900132805109, + "num_tokens": 226424536.0, + "step": 4800 + }, + { + "entropy": 1.8055568043887615, + "epoch": 0.6465766077820113, + "grad_norm": 4.094173431396484, + "learning_rate": 1.9883839450752575e-05, + "loss": 14.6405, + "mean_token_accuracy": 0.6150890862569213, + "num_tokens": 231108868.0, + "step": 4900 + }, + { + "entropy": 1.818359861969948, + "epoch": 0.6597720487571545, + "grad_norm": 4.1406097412109375, + "learning_rate": 1.9881198838130448e-05, + "loss": 14.7503, + "mean_token_accuracy": 0.6136984185874462, + "num_tokens": 235827953.0, + "step": 5000 + }, + { + "epoch": 0.6597720487571545, + "eval_entropy": 1.5863489991730397, + "eval_loss": 1.7410829067230225, + "eval_mean_token_accuracy": 0.6313715932218462, + "eval_num_tokens": 235827953.0, + "eval_runtime": 3189.3586, + "eval_samples_per_second": 33.794, + "eval_steps_per_second": 4.224, + "step": 5000 + }, + { + "entropy": 1.828395222723484, + "epoch": 0.6729674897322975, + "grad_norm": 4.1239495277404785, + "learning_rate": 1.987855822550832e-05, + "loss": 14.8314, + "mean_token_accuracy": 0.611795287951827, + "num_tokens": 240569274.0, + "step": 5100 + }, + { + "entropy": 1.8127701422572136, + "epoch": 0.6861629307074406, + "grad_norm": 4.038999080657959, + "learning_rate": 1.987591761288619e-05, + "loss": 14.7074, + "mean_token_accuracy": 0.613722600787878, + "num_tokens": 245260313.0, + "step": 5200 + }, + { + "entropy": 1.8250135909020901, + "epoch": 0.6993583716825836, + "grad_norm": 4.045648097991943, + "learning_rate": 1.9873277000264063e-05, + "loss": 14.8054, + "mean_token_accuracy": 0.6120235136896371, + "num_tokens": 249954606.0, + "step": 5300 + }, + { + "entropy": 1.8105684253573417, + "epoch": 0.7125538126577268, + "grad_norm": 3.9636144638061523, + "learning_rate": 1.9870636387641936e-05, + "loss": 14.6757, + "mean_token_accuracy": 0.6151208320260048, + "num_tokens": 254701573.0, + "step": 5400 + }, + { + "entropy": 1.8140485088527203, + "epoch": 0.7257492536328698, + "grad_norm": 3.958618640899658, + "learning_rate": 1.9867995775019805e-05, + "loss": 14.7097, + "mean_token_accuracy": 0.613787483125925, + "num_tokens": 259356993.0, + "step": 5500 + }, + { + "entropy": 1.804336573332548, + "epoch": 0.7389446946080129, + "grad_norm": 3.6845266819000244, + "learning_rate": 1.9865355162397678e-05, + "loss": 14.6348, + "mean_token_accuracy": 0.6151521971076727, + "num_tokens": 264064776.0, + "step": 5600 + }, + { + "entropy": 1.7955251815915108, + "epoch": 0.7521401355831561, + "grad_norm": 4.289984703063965, + "learning_rate": 1.986271454977555e-05, + "loss": 14.5556, + "mean_token_accuracy": 0.6166081204265356, + "num_tokens": 268747372.0, + "step": 5700 + }, + { + "entropy": 1.8180018638074398, + "epoch": 0.7653355765582991, + "grad_norm": 4.030449390411377, + "learning_rate": 1.986007393715342e-05, + "loss": 14.7427, + "mean_token_accuracy": 0.613333948738873, + "num_tokens": 273463250.0, + "step": 5800 + }, + { + "entropy": 1.8108704054355622, + "epoch": 0.7785310175334422, + "grad_norm": 4.096724033355713, + "learning_rate": 1.9857433324531293e-05, + "loss": 14.6724, + "mean_token_accuracy": 0.6149687469750643, + "num_tokens": 278183524.0, + "step": 5900 + }, + { + "entropy": 1.8036722446978093, + "epoch": 0.7917264585085853, + "grad_norm": 4.013925552368164, + "learning_rate": 1.9854792711909166e-05, + "loss": 14.6154, + "mean_token_accuracy": 0.6153397902101279, + "num_tokens": 282897630.0, + "step": 6000 + }, + { + "epoch": 0.7917264585085853, + "eval_entropy": 1.5602015991178377, + "eval_loss": 1.718701958656311, + "eval_mean_token_accuracy": 0.6347915723799314, + "eval_num_tokens": 282897630.0, + "eval_runtime": 3190.0929, + "eval_samples_per_second": 33.786, + "eval_steps_per_second": 4.223, + "step": 6000 + }, + { + "entropy": 1.798267685174942, + "epoch": 0.8049218994837284, + "grad_norm": 3.8725955486297607, + "learning_rate": 1.9852152099287035e-05, + "loss": 14.5751, + "mean_token_accuracy": 0.6157875391095877, + "num_tokens": 287635938.0, + "step": 6100 + }, + { + "entropy": 1.7878125695884228, + "epoch": 0.8181173404588714, + "grad_norm": 4.046728134155273, + "learning_rate": 1.9849511486664908e-05, + "loss": 14.4771, + "mean_token_accuracy": 0.6187311994284391, + "num_tokens": 292290134.0, + "step": 6200 + }, + { + "entropy": 1.8043781124055385, + "epoch": 0.8313127814340145, + "grad_norm": 3.8358700275421143, + "learning_rate": 1.9846870874042777e-05, + "loss": 14.6268, + "mean_token_accuracy": 0.6149816115200519, + "num_tokens": 296943359.0, + "step": 6300 + }, + { + "entropy": 1.808346015959978, + "epoch": 0.8445082224091577, + "grad_norm": 4.048934459686279, + "learning_rate": 1.984423026142065e-05, + "loss": 14.6547, + "mean_token_accuracy": 0.6149462160468101, + "num_tokens": 301696338.0, + "step": 6400 + }, + { + "entropy": 1.7909628981351853, + "epoch": 0.8577036633843007, + "grad_norm": 4.050400733947754, + "learning_rate": 1.9841589648798523e-05, + "loss": 14.5162, + "mean_token_accuracy": 0.6173636147379875, + "num_tokens": 306409757.0, + "step": 6500 + }, + { + "entropy": 1.7796907857060433, + "epoch": 0.8708991043594438, + "grad_norm": 3.80794358253479, + "learning_rate": 1.9838949036176392e-05, + "loss": 14.4154, + "mean_token_accuracy": 0.6189755406975747, + "num_tokens": 311094592.0, + "step": 6600 + }, + { + "entropy": 1.7770866174995898, + "epoch": 0.8840945453345869, + "grad_norm": 4.046851634979248, + "learning_rate": 1.9836308423554265e-05, + "loss": 14.3936, + "mean_token_accuracy": 0.6197400981932879, + "num_tokens": 315777049.0, + "step": 6700 + }, + { + "entropy": 1.7914468431472779, + "epoch": 0.89728998630973, + "grad_norm": 3.747300863265991, + "learning_rate": 1.9833667810932138e-05, + "loss": 14.5052, + "mean_token_accuracy": 0.6180075034499168, + "num_tokens": 320507396.0, + "step": 6800 + }, + { + "entropy": 1.7918499463796616, + "epoch": 0.910485427284873, + "grad_norm": 3.889294147491455, + "learning_rate": 1.9831027198310007e-05, + "loss": 14.5115, + "mean_token_accuracy": 0.6171831817179918, + "num_tokens": 325185054.0, + "step": 6900 + }, + { + "entropy": 1.787924758642912, + "epoch": 0.9236808682600162, + "grad_norm": 3.9466371536254883, + "learning_rate": 1.9828386585687884e-05, + "loss": 14.4799, + "mean_token_accuracy": 0.6181329232081771, + "num_tokens": 329928778.0, + "step": 7000 + }, + { + "epoch": 0.9236808682600162, + "eval_entropy": 1.5404707675060503, + "eval_loss": 1.700563669204712, + "eval_mean_token_accuracy": 0.6374752920914234, + "eval_num_tokens": 329928778.0, + "eval_runtime": 3190.0879, + "eval_samples_per_second": 33.786, + "eval_steps_per_second": 4.223, + "step": 7000 + }, + { + "entropy": 1.7860313929617404, + "epoch": 0.9368763092351593, + "grad_norm": 3.9548611640930176, + "learning_rate": 1.9825745973065753e-05, + "loss": 14.4618, + "mean_token_accuracy": 0.6185688901692629, + "num_tokens": 334641739.0, + "step": 7100 + }, + { + "entropy": 1.7841411991417409, + "epoch": 0.9500717502103023, + "grad_norm": 3.9234402179718018, + "learning_rate": 1.9823105360443622e-05, + "loss": 14.4508, + "mean_token_accuracy": 0.6186311930418015, + "num_tokens": 339338811.0, + "step": 7200 + }, + { + "entropy": 1.7960207970440387, + "epoch": 0.9632671911854455, + "grad_norm": 3.974130392074585, + "learning_rate": 1.9820464747821495e-05, + "loss": 14.552, + "mean_token_accuracy": 0.6171760141849518, + "num_tokens": 344012186.0, + "step": 7300 + }, + { + "entropy": 1.7755667209625243, + "epoch": 0.9764626321605885, + "grad_norm": 3.956102132797241, + "learning_rate": 1.9817824135199368e-05, + "loss": 14.3741, + "mean_token_accuracy": 0.6195540763065219, + "num_tokens": 348731167.0, + "step": 7400 + }, + { + "entropy": 1.7824751836061479, + "epoch": 0.9896580731357316, + "grad_norm": 3.968210220336914, + "learning_rate": 1.981518352257724e-05, + "loss": 14.4282, + "mean_token_accuracy": 0.6188432604074479, + "num_tokens": 353442111.0, + "step": 7500 + }, + { + "entropy": 1.7721347595910606, + "epoch": 1.00277104260478, + "grad_norm": 3.980825185775757, + "learning_rate": 1.981254290995511e-05, + "loss": 14.2534, + "mean_token_accuracy": 0.6206337471428157, + "num_tokens": 358105945.0, + "step": 7600 + }, + { + "entropy": 1.7669402280449866, + "epoch": 1.0159664835799231, + "grad_norm": 3.9514989852905273, + "learning_rate": 1.9809902297332983e-05, + "loss": 14.2841, + "mean_token_accuracy": 0.621123610921204, + "num_tokens": 362858257.0, + "step": 7700 + }, + { + "entropy": 1.764955345094204, + "epoch": 1.0291619245550663, + "grad_norm": 3.9744436740875244, + "learning_rate": 1.9807261684710856e-05, + "loss": 14.2833, + "mean_token_accuracy": 0.6217504210770131, + "num_tokens": 367640143.0, + "step": 7800 + }, + { + "entropy": 1.7551834625005722, + "epoch": 1.0423573655302094, + "grad_norm": 4.042919158935547, + "learning_rate": 1.9804621072088725e-05, + "loss": 14.1949, + "mean_token_accuracy": 0.6225048137456178, + "num_tokens": 372344446.0, + "step": 7900 + }, + { + "entropy": 1.7445960550010204, + "epoch": 1.0555528065053523, + "grad_norm": 4.0731353759765625, + "learning_rate": 1.9801980459466598e-05, + "loss": 14.1036, + "mean_token_accuracy": 0.6247953659668565, + "num_tokens": 377055347.0, + "step": 8000 + }, + { + "epoch": 1.0555528065053523, + "eval_entropy": 1.5357393407745563, + "eval_loss": 1.6831690073013306, + "eval_mean_token_accuracy": 0.6397847914951448, + "eval_num_tokens": 377055347.0, + "eval_runtime": 3188.4458, + "eval_samples_per_second": 33.803, + "eval_steps_per_second": 4.226, + "step": 8000 + }, + { + "entropy": 1.7497526466846467, + "epoch": 1.0687482474804955, + "grad_norm": 4.043092727661133, + "learning_rate": 1.979933984684447e-05, + "loss": 14.1557, + "mean_token_accuracy": 0.6233407002687454, + "num_tokens": 381747520.0, + "step": 8100 + }, + { + "entropy": 1.7488390171527863, + "epoch": 1.0819436884556386, + "grad_norm": 3.92033052444458, + "learning_rate": 1.979669923422234e-05, + "loss": 14.1357, + "mean_token_accuracy": 0.6237702713161707, + "num_tokens": 386500736.0, + "step": 8200 + }, + { + "entropy": 1.746285059452057, + "epoch": 1.0951391294307817, + "grad_norm": 3.9088919162750244, + "learning_rate": 1.9794058621600213e-05, + "loss": 14.1195, + "mean_token_accuracy": 0.6236967007815838, + "num_tokens": 391229707.0, + "step": 8300 + }, + { + "entropy": 1.7552506732940674, + "epoch": 1.1083345704059249, + "grad_norm": 4.078105926513672, + "learning_rate": 1.9791418008978086e-05, + "loss": 14.191, + "mean_token_accuracy": 0.6234133420884609, + "num_tokens": 395919393.0, + "step": 8400 + }, + { + "entropy": 1.7433623734116555, + "epoch": 1.1215300113810678, + "grad_norm": 4.080204963684082, + "learning_rate": 1.9788777396355955e-05, + "loss": 14.0987, + "mean_token_accuracy": 0.6247739800065756, + "num_tokens": 400602316.0, + "step": 8500 + }, + { + "entropy": 1.7413157878816128, + "epoch": 1.134725452356211, + "grad_norm": 4.232221603393555, + "learning_rate": 1.9786136783733828e-05, + "loss": 14.0808, + "mean_token_accuracy": 0.6253204553574324, + "num_tokens": 405317129.0, + "step": 8600 + }, + { + "entropy": 1.7399081835150718, + "epoch": 1.147920893331354, + "grad_norm": 4.332466125488281, + "learning_rate": 1.97834961711117e-05, + "loss": 14.0645, + "mean_token_accuracy": 0.6258289200812578, + "num_tokens": 410076395.0, + "step": 8700 + }, + { + "entropy": 1.749226526170969, + "epoch": 1.1611163343064972, + "grad_norm": 3.86761474609375, + "learning_rate": 1.978085555848957e-05, + "loss": 14.1447, + "mean_token_accuracy": 0.6238241862505675, + "num_tokens": 414801400.0, + "step": 8800 + }, + { + "entropy": 1.7294802324473857, + "epoch": 1.17431177528164, + "grad_norm": 3.7982897758483887, + "learning_rate": 1.9778214945867443e-05, + "loss": 13.991, + "mean_token_accuracy": 0.6271847046166659, + "num_tokens": 419506353.0, + "step": 8900 + }, + { + "entropy": 1.751773677021265, + "epoch": 1.1875072162567832, + "grad_norm": 4.006007671356201, + "learning_rate": 1.9775574333245316e-05, + "loss": 14.1556, + "mean_token_accuracy": 0.6236618124693633, + "num_tokens": 424214730.0, + "step": 9000 + }, + { + "epoch": 1.1875072162567832, + "eval_entropy": 1.5117126926364997, + "eval_loss": 1.669206976890564, + "eval_mean_token_accuracy": 0.642173377870142, + "eval_num_tokens": 424214730.0, + "eval_runtime": 3188.1779, + "eval_samples_per_second": 33.806, + "eval_steps_per_second": 4.226, + "step": 9000 + }, + { + "entropy": 1.7496005721390246, + "epoch": 1.2007026572319264, + "grad_norm": 4.041248798370361, + "learning_rate": 1.9772933720623185e-05, + "loss": 14.1458, + "mean_token_accuracy": 0.623925342336297, + "num_tokens": 428874431.0, + "step": 9100 + }, + { + "entropy": 1.747845853716135, + "epoch": 1.2138980982070695, + "grad_norm": 4.10402774810791, + "learning_rate": 1.9770293108001058e-05, + "loss": 14.1284, + "mean_token_accuracy": 0.6235801701620222, + "num_tokens": 433631785.0, + "step": 9200 + }, + { + "entropy": 1.742924979031086, + "epoch": 1.2270935391822126, + "grad_norm": 3.7367687225341797, + "learning_rate": 1.976765249537893e-05, + "loss": 14.0895, + "mean_token_accuracy": 0.6245103114843369, + "num_tokens": 438341526.0, + "step": 9300 + }, + { + "entropy": 1.741254171282053, + "epoch": 1.2402889801573556, + "grad_norm": 3.969815969467163, + "learning_rate": 1.97650118827568e-05, + "loss": 14.069, + "mean_token_accuracy": 0.6253565014153719, + "num_tokens": 443056206.0, + "step": 9400 + }, + { + "entropy": 1.7313911478221415, + "epoch": 1.2534844211324987, + "grad_norm": 3.8585336208343506, + "learning_rate": 1.9762371270134673e-05, + "loss": 13.996, + "mean_token_accuracy": 0.6264266113936902, + "num_tokens": 447754874.0, + "step": 9500 + }, + { + "entropy": 1.736898885667324, + "epoch": 1.2666798621076418, + "grad_norm": 3.976346254348755, + "learning_rate": 1.9759730657512543e-05, + "loss": 14.0392, + "mean_token_accuracy": 0.6258945613354444, + "num_tokens": 452410741.0, + "step": 9600 + }, + { + "entropy": 1.728874337822199, + "epoch": 1.279875303082785, + "grad_norm": 4.028810501098633, + "learning_rate": 1.9757090044890415e-05, + "loss": 13.9683, + "mean_token_accuracy": 0.6269886953383684, + "num_tokens": 457091317.0, + "step": 9700 + }, + { + "entropy": 1.7307989183068275, + "epoch": 1.2930707440579279, + "grad_norm": 3.926074504852295, + "learning_rate": 1.9754449432268288e-05, + "loss": 13.9844, + "mean_token_accuracy": 0.6272025952115655, + "num_tokens": 461795304.0, + "step": 9800 + }, + { + "entropy": 1.7376534953713416, + "epoch": 1.306266185033071, + "grad_norm": 3.973266363143921, + "learning_rate": 1.9751808819646158e-05, + "loss": 14.0339, + "mean_token_accuracy": 0.6259337517619133, + "num_tokens": 466537252.0, + "step": 9900 + }, + { + "entropy": 1.7124468161165713, + "epoch": 1.3194616260082142, + "grad_norm": 3.9212160110473633, + "learning_rate": 1.974916820702403e-05, + "loss": 13.8356, + "mean_token_accuracy": 0.6298421548306942, + "num_tokens": 471210653.0, + "step": 10000 + }, + { + "epoch": 1.3194616260082142, + "eval_entropy": 1.515975620940738, + "eval_loss": 1.6549092531204224, + "eval_mean_token_accuracy": 0.6441165252658436, + "eval_num_tokens": 471210653.0, + "eval_runtime": 3185.9569, + "eval_samples_per_second": 33.83, + "eval_steps_per_second": 4.229, + "step": 10000 + }, + { + "entropy": 1.7285879038274288, + "epoch": 1.3326570669833573, + "grad_norm": 3.7557201385498047, + "learning_rate": 1.9746527594401903e-05, + "loss": 13.9691, + "mean_token_accuracy": 0.6267551811784506, + "num_tokens": 475934229.0, + "step": 10100 + }, + { + "entropy": 1.7297876067459583, + "epoch": 1.3458525079585004, + "grad_norm": 3.7093920707702637, + "learning_rate": 1.9743886981779773e-05, + "loss": 13.9748, + "mean_token_accuracy": 0.6267426482588053, + "num_tokens": 480609229.0, + "step": 10200 + }, + { + "entropy": 1.7228832334280013, + "epoch": 1.3590479489336436, + "grad_norm": 4.082423210144043, + "learning_rate": 1.974124636915765e-05, + "loss": 13.9133, + "mean_token_accuracy": 0.6282350146025419, + "num_tokens": 485365238.0, + "step": 10300 + }, + { + "entropy": 1.7269348740577697, + "epoch": 1.3722433899087865, + "grad_norm": 3.929724931716919, + "learning_rate": 1.9738605756535518e-05, + "loss": 13.9444, + "mean_token_accuracy": 0.6276592640578746, + "num_tokens": 490070914.0, + "step": 10400 + }, + { + "entropy": 1.720128181874752, + "epoch": 1.3854388308839296, + "grad_norm": 4.017079830169678, + "learning_rate": 1.9735965143913388e-05, + "loss": 13.8927, + "mean_token_accuracy": 0.6279544594511389, + "num_tokens": 494810156.0, + "step": 10500 + }, + { + "entropy": 1.7191148309409618, + "epoch": 1.3986342718590727, + "grad_norm": 3.982664108276367, + "learning_rate": 1.973332453129126e-05, + "loss": 13.8775, + "mean_token_accuracy": 0.6286891888082028, + "num_tokens": 499562228.0, + "step": 10600 + }, + { + "entropy": 1.7097445997595786, + "epoch": 1.4118297128342157, + "grad_norm": 3.8982956409454346, + "learning_rate": 1.9730683918669133e-05, + "loss": 13.8203, + "mean_token_accuracy": 0.6301189444214106, + "num_tokens": 504283158.0, + "step": 10700 + }, + { + "entropy": 1.7238348364830016, + "epoch": 1.4250251538093588, + "grad_norm": 3.908458948135376, + "learning_rate": 1.9728043306047006e-05, + "loss": 13.9223, + "mean_token_accuracy": 0.628321581557393, + "num_tokens": 508970795.0, + "step": 10800 + }, + { + "entropy": 1.7226741972565651, + "epoch": 1.438220594784502, + "grad_norm": 4.095105171203613, + "learning_rate": 1.9725402693424876e-05, + "loss": 13.9157, + "mean_token_accuracy": 0.6280835216119885, + "num_tokens": 513673694.0, + "step": 10900 + }, + { + "entropy": 1.7151854334771632, + "epoch": 1.451416035759645, + "grad_norm": 3.830979108810425, + "learning_rate": 1.972276208080275e-05, + "loss": 13.843, + "mean_token_accuracy": 0.6288192373514175, + "num_tokens": 518400143.0, + "step": 11000 + }, + { + "epoch": 1.451416035759645, + "eval_entropy": 1.4893881395710238, + "eval_loss": 1.6429299116134644, + "eval_mean_token_accuracy": 0.6459804228470001, + "eval_num_tokens": 518400143.0, + "eval_runtime": 3190.9199, + "eval_samples_per_second": 33.777, + "eval_steps_per_second": 4.222, + "step": 11000 + }, + { + "entropy": 1.7264313192665577, + "epoch": 1.4646114767347882, + "grad_norm": 3.997246503829956, + "learning_rate": 1.972012146818062e-05, + "loss": 13.9428, + "mean_token_accuracy": 0.6274989359080791, + "num_tokens": 523125727.0, + "step": 11100 + }, + { + "entropy": 1.7230025473237038, + "epoch": 1.4778069177099313, + "grad_norm": 3.9877476692199707, + "learning_rate": 1.971748085555849e-05, + "loss": 13.9068, + "mean_token_accuracy": 0.6281301632523537, + "num_tokens": 527844030.0, + "step": 11200 + }, + { + "entropy": 1.7104627051949501, + "epoch": 1.4910023586850742, + "grad_norm": 3.9071784019470215, + "learning_rate": 1.9714840242936363e-05, + "loss": 13.8127, + "mean_token_accuracy": 0.6302814479917288, + "num_tokens": 532587219.0, + "step": 11300 + }, + { + "entropy": 1.7233010344207287, + "epoch": 1.5041977996602174, + "grad_norm": 3.970679759979248, + "learning_rate": 1.9712199630314236e-05, + "loss": 13.9115, + "mean_token_accuracy": 0.627718816101551, + "num_tokens": 537335492.0, + "step": 11400 + }, + { + "entropy": 1.7085300183296204, + "epoch": 1.5173932406353605, + "grad_norm": 4.084446907043457, + "learning_rate": 1.9709559017692106e-05, + "loss": 13.7914, + "mean_token_accuracy": 0.6303271735459566, + "num_tokens": 542052225.0, + "step": 11500 + }, + { + "entropy": 1.7150896434485912, + "epoch": 1.5305886816105034, + "grad_norm": 3.8559019565582275, + "learning_rate": 1.970691840506998e-05, + "loss": 13.8508, + "mean_token_accuracy": 0.6289579905569553, + "num_tokens": 546782905.0, + "step": 11600 + }, + { + "entropy": 1.7150465674698352, + "epoch": 1.5437841225856466, + "grad_norm": 4.106844902038574, + "learning_rate": 1.970427779244785e-05, + "loss": 13.844, + "mean_token_accuracy": 0.6292690277844667, + "num_tokens": 551481827.0, + "step": 11700 + }, + { + "entropy": 1.7132095769047737, + "epoch": 1.5569795635607897, + "grad_norm": 4.053258419036865, + "learning_rate": 1.970163717982572e-05, + "loss": 13.8218, + "mean_token_accuracy": 0.629193360954523, + "num_tokens": 556220417.0, + "step": 11800 + }, + { + "entropy": 1.7086699897050857, + "epoch": 1.5701750045359328, + "grad_norm": 3.9460904598236084, + "learning_rate": 1.9698996567203593e-05, + "loss": 13.7941, + "mean_token_accuracy": 0.6298571369051933, + "num_tokens": 560909913.0, + "step": 11900 + }, + { + "entropy": 1.705731320977211, + "epoch": 1.583370445511076, + "grad_norm": 3.9144344329833984, + "learning_rate": 1.9696355954581466e-05, + "loss": 13.7618, + "mean_token_accuracy": 0.6303364527225495, + "num_tokens": 565678902.0, + "step": 12000 + }, + { + "epoch": 1.583370445511076, + "eval_entropy": 1.4800323092272984, + "eval_loss": 1.6302741765975952, + "eval_mean_token_accuracy": 0.6479193049404268, + "eval_num_tokens": 565678902.0, + "eval_runtime": 3188.8622, + "eval_samples_per_second": 33.799, + "eval_steps_per_second": 4.225, + "step": 12000 + }, + { + "entropy": 1.6993986825644969, + "epoch": 1.596565886486219, + "grad_norm": 4.023446559906006, + "learning_rate": 1.9693715341959336e-05, + "loss": 13.7179, + "mean_token_accuracy": 0.6318010853976012, + "num_tokens": 570386334.0, + "step": 12100 + }, + { + "entropy": 1.7054964397847652, + "epoch": 1.6097613274613622, + "grad_norm": 3.808046340942383, + "learning_rate": 1.969107472933721e-05, + "loss": 13.774, + "mean_token_accuracy": 0.630204633101821, + "num_tokens": 575112992.0, + "step": 12200 + }, + { + "entropy": 1.7120344342291356, + "epoch": 1.6229567684365052, + "grad_norm": 3.8330607414245605, + "learning_rate": 1.968843411671508e-05, + "loss": 13.8098, + "mean_token_accuracy": 0.6296659503132105, + "num_tokens": 579849787.0, + "step": 12300 + }, + { + "entropy": 1.7050182285904885, + "epoch": 1.6361522094116483, + "grad_norm": 4.006343364715576, + "learning_rate": 1.968579350409295e-05, + "loss": 13.7528, + "mean_token_accuracy": 0.6311237644404173, + "num_tokens": 584536225.0, + "step": 12400 + }, + { + "entropy": 1.708528604209423, + "epoch": 1.6493476503867912, + "grad_norm": 3.9078450202941895, + "learning_rate": 1.9683152891470823e-05, + "loss": 13.7855, + "mean_token_accuracy": 0.630221213772893, + "num_tokens": 589243850.0, + "step": 12500 + }, + { + "entropy": 1.7126329486072063, + "epoch": 1.6625430913619343, + "grad_norm": 3.8646676540374756, + "learning_rate": 1.9680512278848693e-05, + "loss": 13.814, + "mean_token_accuracy": 0.6300745321810246, + "num_tokens": 593974243.0, + "step": 12600 + }, + { + "entropy": 1.687779471129179, + "epoch": 1.6757385323370775, + "grad_norm": 3.9465432167053223, + "learning_rate": 1.9677871666226566e-05, + "loss": 13.6162, + "mean_token_accuracy": 0.6335779485851526, + "num_tokens": 598706959.0, + "step": 12700 + }, + { + "entropy": 1.7161117048561574, + "epoch": 1.6889339733122206, + "grad_norm": 3.7726762294769287, + "learning_rate": 1.967523105360444e-05, + "loss": 13.8461, + "mean_token_accuracy": 0.6291085375845432, + "num_tokens": 603417327.0, + "step": 12800 + }, + { + "entropy": 1.6877114294469358, + "epoch": 1.7021294142873638, + "grad_norm": 4.004697799682617, + "learning_rate": 1.9672590440982308e-05, + "loss": 13.6098, + "mean_token_accuracy": 0.6341679825633765, + "num_tokens": 608065925.0, + "step": 12900 + }, + { + "entropy": 1.6961762863397598, + "epoch": 1.7153248552625069, + "grad_norm": 3.9190824031829834, + "learning_rate": 1.966994982836018e-05, + "loss": 13.6786, + "mean_token_accuracy": 0.6323697911947965, + "num_tokens": 612761369.0, + "step": 13000 + }, + { + "epoch": 1.7153248552625069, + "eval_entropy": 1.4835227909274298, + "eval_loss": 1.619583010673523, + "eval_mean_token_accuracy": 0.6494705043581045, + "eval_num_tokens": 612761369.0, + "eval_runtime": 3188.0387, + "eval_samples_per_second": 33.808, + "eval_steps_per_second": 4.226, + "step": 13000 + }, + { + "entropy": 1.7056168286502362, + "epoch": 1.72852029623765, + "grad_norm": 4.131747245788574, + "learning_rate": 1.9667309215738053e-05, + "loss": 13.7522, + "mean_token_accuracy": 0.63100875236094, + "num_tokens": 617469480.0, + "step": 13100 + }, + { + "entropy": 1.6937249195575714, + "epoch": 1.741715737212793, + "grad_norm": 4.118540287017822, + "learning_rate": 1.9664668603115923e-05, + "loss": 13.6638, + "mean_token_accuracy": 0.6324852432310581, + "num_tokens": 622126208.0, + "step": 13200 + }, + { + "entropy": 1.6925005520880223, + "epoch": 1.754911178187936, + "grad_norm": 3.863349199295044, + "learning_rate": 1.9662027990493796e-05, + "loss": 13.6551, + "mean_token_accuracy": 0.6328117294609547, + "num_tokens": 626878868.0, + "step": 13300 + }, + { + "entropy": 1.698338780850172, + "epoch": 1.7681066191630792, + "grad_norm": 3.8205785751342773, + "learning_rate": 1.965938737787167e-05, + "loss": 13.6957, + "mean_token_accuracy": 0.6319728682935238, + "num_tokens": 631580180.0, + "step": 13400 + }, + { + "entropy": 1.6914977538585663, + "epoch": 1.7813020601382221, + "grad_norm": 3.8058321475982666, + "learning_rate": 1.9656746765249538e-05, + "loss": 13.6316, + "mean_token_accuracy": 0.6332664381712675, + "num_tokens": 636256941.0, + "step": 13500 + }, + { + "entropy": 1.6875527657568454, + "epoch": 1.7944975011133653, + "grad_norm": 4.034668922424316, + "learning_rate": 1.965410615262741e-05, + "loss": 13.6047, + "mean_token_accuracy": 0.6338882031291724, + "num_tokens": 640921349.0, + "step": 13600 + }, + { + "entropy": 1.6999479295313358, + "epoch": 1.8076929420885084, + "grad_norm": 3.9597856998443604, + "learning_rate": 1.9651465540005284e-05, + "loss": 13.7185, + "mean_token_accuracy": 0.6316749695688486, + "num_tokens": 645663113.0, + "step": 13700 + }, + { + "entropy": 1.6779197818040847, + "epoch": 1.8208883830636515, + "grad_norm": 3.8675427436828613, + "learning_rate": 1.9648824927383153e-05, + "loss": 13.5157, + "mean_token_accuracy": 0.6357244378328324, + "num_tokens": 650319261.0, + "step": 13800 + }, + { + "entropy": 1.6929333385825158, + "epoch": 1.8340838240387947, + "grad_norm": 3.900453805923462, + "learning_rate": 1.9646184314761026e-05, + "loss": 13.6518, + "mean_token_accuracy": 0.6333543327450752, + "num_tokens": 655031705.0, + "step": 13900 + }, + { + "entropy": 1.6835547630488872, + "epoch": 1.8472792650139378, + "grad_norm": 4.208860874176025, + "learning_rate": 1.96435437021389e-05, + "loss": 13.5688, + "mean_token_accuracy": 0.6342528595775366, + "num_tokens": 659779121.0, + "step": 14000 + }, + { + "epoch": 1.8472792650139378, + "eval_entropy": 1.4634111206028781, + "eval_loss": 1.6098047494888306, + "eval_mean_token_accuracy": 0.6510293728890473, + "eval_num_tokens": 659779121.0, + "eval_runtime": 3190.0344, + "eval_samples_per_second": 33.786, + "eval_steps_per_second": 4.223, + "step": 14000 + }, + { + "entropy": 1.6931584388017655, + "epoch": 1.860474705989081, + "grad_norm": 4.122419357299805, + "learning_rate": 1.9640903089516768e-05, + "loss": 13.653, + "mean_token_accuracy": 0.6321510327607394, + "num_tokens": 664482614.0, + "step": 14100 + }, + { + "entropy": 1.6729721108078957, + "epoch": 1.8736701469642238, + "grad_norm": 3.7705953121185303, + "learning_rate": 1.963826247689464e-05, + "loss": 13.4799, + "mean_token_accuracy": 0.6365410851687193, + "num_tokens": 669182529.0, + "step": 14200 + }, + { + "entropy": 1.6909792493283748, + "epoch": 1.886865587939367, + "grad_norm": 3.6213951110839844, + "learning_rate": 1.9635621864272514e-05, + "loss": 13.6385, + "mean_token_accuracy": 0.6330462139099836, + "num_tokens": 673935978.0, + "step": 14300 + }, + { + "entropy": 1.6902420930564404, + "epoch": 1.90006102891451, + "grad_norm": 3.9693639278411865, + "learning_rate": 1.9632981251650386e-05, + "loss": 13.618, + "mean_token_accuracy": 0.6335649444907904, + "num_tokens": 678713087.0, + "step": 14400 + }, + { + "entropy": 1.6775447849929332, + "epoch": 1.913256469889653, + "grad_norm": 3.9338343143463135, + "learning_rate": 1.9630340639028256e-05, + "loss": 13.5224, + "mean_token_accuracy": 0.6356154507398606, + "num_tokens": 683433667.0, + "step": 14500 + }, + { + "entropy": 1.6908086335659027, + "epoch": 1.9264519108647962, + "grad_norm": 4.041861534118652, + "learning_rate": 1.9627700026406125e-05, + "loss": 13.6255, + "mean_token_accuracy": 0.6331023909151554, + "num_tokens": 688149621.0, + "step": 14600 + }, + { + "entropy": 1.6837576559185983, + "epoch": 1.9396473518399393, + "grad_norm": 3.9566867351531982, + "learning_rate": 1.9625059413784e-05, + "loss": 13.5716, + "mean_token_accuracy": 0.6349082486331463, + "num_tokens": 692827568.0, + "step": 14700 + }, + { + "entropy": 1.7002186079323292, + "epoch": 1.9528427928150824, + "grad_norm": 4.085751533508301, + "learning_rate": 1.962241880116187e-05, + "loss": 13.7057, + "mean_token_accuracy": 0.6317524817958474, + "num_tokens": 697465702.0, + "step": 14800 + }, + { + "entropy": 1.673355882167816, + "epoch": 1.9660382337902256, + "grad_norm": 4.124021053314209, + "learning_rate": 1.9619778188539744e-05, + "loss": 13.4842, + "mean_token_accuracy": 0.6359885314106941, + "num_tokens": 702178812.0, + "step": 14900 + }, + { + "entropy": 1.6702920420467853, + "epoch": 1.9792336747653687, + "grad_norm": 3.9790737628936768, + "learning_rate": 1.9617137575917616e-05, + "loss": 13.4491, + "mean_token_accuracy": 0.6364967184513808, + "num_tokens": 706869072.0, + "step": 15000 + }, + { + "epoch": 1.9792336747653687, + "eval_entropy": 1.4559450233017144, + "eval_loss": 1.6001065969467163, + "eval_mean_token_accuracy": 0.6525887808856017, + "eval_num_tokens": 706869072.0, + "eval_runtime": 3188.4337, + "eval_samples_per_second": 33.803, + "eval_steps_per_second": 4.226, + "step": 15000 + }, + { + "entropy": 1.6789049740135669, + "epoch": 1.9924291157405116, + "grad_norm": 3.97013521194458, + "learning_rate": 1.9614496963295486e-05, + "loss": 13.5376, + "mean_token_accuracy": 0.6352450941503048, + "num_tokens": 711564626.0, + "step": 15100 + }, + { + "entropy": 1.6562597664647132, + "epoch": 2.00554208520956, + "grad_norm": 3.7767651081085205, + "learning_rate": 1.961185635067336e-05, + "loss": 13.2346, + "mean_token_accuracy": 0.6391579893400084, + "num_tokens": 716196843.0, + "step": 15200 + }, + { + "entropy": 1.6494072581827641, + "epoch": 2.018737526184703, + "grad_norm": 3.924259662628174, + "learning_rate": 1.960921573805123e-05, + "loss": 13.2641, + "mean_token_accuracy": 0.6396440506726503, + "num_tokens": 720902841.0, + "step": 15300 + }, + { + "entropy": 1.667148039340973, + "epoch": 2.0319329671598463, + "grad_norm": 4.009494304656982, + "learning_rate": 1.96065751254291e-05, + "loss": 13.4176, + "mean_token_accuracy": 0.6364856123179198, + "num_tokens": 725611095.0, + "step": 15400 + }, + { + "entropy": 1.6549566097557544, + "epoch": 2.0451284081349894, + "grad_norm": 3.6249144077301025, + "learning_rate": 1.9603934512806974e-05, + "loss": 13.3186, + "mean_token_accuracy": 0.6393066050112247, + "num_tokens": 730380517.0, + "step": 15500 + }, + { + "entropy": 1.6757390736043454, + "epoch": 2.0583238491101326, + "grad_norm": 4.016038417816162, + "learning_rate": 1.9601293900184843e-05, + "loss": 13.4928, + "mean_token_accuracy": 0.6354907912015915, + "num_tokens": 735121046.0, + "step": 15600 + }, + { + "entropy": 1.6515671475231648, + "epoch": 2.0715192900852757, + "grad_norm": 3.98551607131958, + "learning_rate": 1.9598653287562716e-05, + "loss": 13.2811, + "mean_token_accuracy": 0.6401347954571247, + "num_tokens": 739800256.0, + "step": 15700 + }, + { + "entropy": 1.6661450408399106, + "epoch": 2.084714731060419, + "grad_norm": 3.937788248062134, + "learning_rate": 1.959601267494059e-05, + "loss": 13.408, + "mean_token_accuracy": 0.6374346616864205, + "num_tokens": 744553871.0, + "step": 15800 + }, + { + "entropy": 1.6551994441449642, + "epoch": 2.0979101720355615, + "grad_norm": 3.7203454971313477, + "learning_rate": 1.9593372062318458e-05, + "loss": 13.31, + "mean_token_accuracy": 0.6383191919326783, + "num_tokens": 749255191.0, + "step": 15900 + }, + { + "entropy": 1.651690663099289, + "epoch": 2.1111056130107047, + "grad_norm": 3.984395980834961, + "learning_rate": 1.959073144969633e-05, + "loss": 13.285, + "mean_token_accuracy": 0.6396138309687376, + "num_tokens": 754009683.0, + "step": 16000 + }, + { + "epoch": 2.1111056130107047, + "eval_entropy": 1.4368949367493393, + "eval_loss": 1.5915658473968506, + "eval_mean_token_accuracy": 0.6540027022879364, + "eval_num_tokens": 754009683.0, + "eval_runtime": 3188.3805, + "eval_samples_per_second": 33.804, + "eval_steps_per_second": 4.226, + "step": 16000 + }, + { + "entropy": 1.6593476708233357, + "epoch": 2.124301053985848, + "grad_norm": 3.950589895248413, + "learning_rate": 1.9588090837074204e-05, + "loss": 13.3504, + "mean_token_accuracy": 0.6379156097769737, + "num_tokens": 758750576.0, + "step": 16100 + }, + { + "entropy": 1.643917052000761, + "epoch": 2.137496494960991, + "grad_norm": 3.8415000438690186, + "learning_rate": 1.9585450224452073e-05, + "loss": 13.2282, + "mean_token_accuracy": 0.6402662719786167, + "num_tokens": 763474293.0, + "step": 16200 + }, + { + "entropy": 1.6558144466578961, + "epoch": 2.150691935936134, + "grad_norm": 3.8246536254882812, + "learning_rate": 1.9582809611829946e-05, + "loss": 13.3193, + "mean_token_accuracy": 0.638944916576147, + "num_tokens": 768160609.0, + "step": 16300 + }, + { + "entropy": 1.6544341269135474, + "epoch": 2.163887376911277, + "grad_norm": 4.140397548675537, + "learning_rate": 1.958016899920782e-05, + "loss": 13.3071, + "mean_token_accuracy": 0.6394369124621153, + "num_tokens": 772837344.0, + "step": 16400 + }, + { + "entropy": 1.6510882955789565, + "epoch": 2.1770828178864203, + "grad_norm": 3.7167270183563232, + "learning_rate": 1.9577528386585688e-05, + "loss": 13.279, + "mean_token_accuracy": 0.6400315296649933, + "num_tokens": 777550627.0, + "step": 16500 + }, + { + "entropy": 1.6512632183730602, + "epoch": 2.1902782588615635, + "grad_norm": 4.07649040222168, + "learning_rate": 1.957488777396356e-05, + "loss": 13.2926, + "mean_token_accuracy": 0.6393875291198492, + "num_tokens": 782253834.0, + "step": 16600 + }, + { + "entropy": 1.6434899391233921, + "epoch": 2.2034736998367066, + "grad_norm": 3.8309640884399414, + "learning_rate": 1.9572247161341434e-05, + "loss": 13.2162, + "mean_token_accuracy": 0.6406759959459305, + "num_tokens": 786939754.0, + "step": 16700 + }, + { + "entropy": 1.6499764910340309, + "epoch": 2.2166691408118497, + "grad_norm": 3.879365921020508, + "learning_rate": 1.9569606548719303e-05, + "loss": 13.2715, + "mean_token_accuracy": 0.6398765755444765, + "num_tokens": 791637741.0, + "step": 16800 + }, + { + "entropy": 1.65800940066576, + "epoch": 2.2298645817869924, + "grad_norm": 3.7924554347991943, + "learning_rate": 1.9566965936097176e-05, + "loss": 13.3308, + "mean_token_accuracy": 0.6387289334088564, + "num_tokens": 796360880.0, + "step": 16900 + }, + { + "entropy": 1.6624345737695694, + "epoch": 2.2430600227621356, + "grad_norm": 3.9134092330932617, + "learning_rate": 1.956432532347505e-05, + "loss": 13.371, + "mean_token_accuracy": 0.6376094933599233, + "num_tokens": 801087791.0, + "step": 17000 + }, + { + "epoch": 2.2430600227621356, + "eval_entropy": 1.448110183363174, + "eval_loss": 1.5824671983718872, + "eval_mean_token_accuracy": 0.6552899748779286, + "eval_num_tokens": 801087791.0, + "eval_runtime": 3192.5207, + "eval_samples_per_second": 33.76, + "eval_steps_per_second": 4.22, + "step": 17000 + }, + { + "entropy": 1.6444039134681225, + "epoch": 2.2562554637372787, + "grad_norm": 3.820003032684326, + "learning_rate": 1.9561684710852918e-05, + "loss": 13.2239, + "mean_token_accuracy": 0.6408811850100755, + "num_tokens": 805773247.0, + "step": 17100 + }, + { + "entropy": 1.6364771522581578, + "epoch": 2.269450904712422, + "grad_norm": 3.975039005279541, + "learning_rate": 1.955904409823079e-05, + "loss": 13.1499, + "mean_token_accuracy": 0.6420202821493148, + "num_tokens": 810487263.0, + "step": 17200 + }, + { + "entropy": 1.6440125972032547, + "epoch": 2.282646345687565, + "grad_norm": 3.7972419261932373, + "learning_rate": 1.9556403485608664e-05, + "loss": 13.223, + "mean_token_accuracy": 0.6403986816108227, + "num_tokens": 815192150.0, + "step": 17300 + }, + { + "entropy": 1.6492359913885593, + "epoch": 2.295841786662708, + "grad_norm": 3.869448184967041, + "learning_rate": 1.9553762872986533e-05, + "loss": 13.2591, + "mean_token_accuracy": 0.6401562896370888, + "num_tokens": 819926950.0, + "step": 17400 + }, + { + "entropy": 1.6649620904028415, + "epoch": 2.3090372276378512, + "grad_norm": 3.9279003143310547, + "learning_rate": 1.9551122260364406e-05, + "loss": 13.3943, + "mean_token_accuracy": 0.6380402848124505, + "num_tokens": 824679306.0, + "step": 17500 + }, + { + "entropy": 1.6454954193532467, + "epoch": 2.3222326686129944, + "grad_norm": 3.772763729095459, + "learning_rate": 1.9548481647742275e-05, + "loss": 13.2317, + "mean_token_accuracy": 0.6408380315452814, + "num_tokens": 829391977.0, + "step": 17600 + }, + { + "entropy": 1.6422753143310547, + "epoch": 2.335428109588137, + "grad_norm": 3.8395848274230957, + "learning_rate": 1.954584103512015e-05, + "loss": 13.2048, + "mean_token_accuracy": 0.6405471435189247, + "num_tokens": 834068376.0, + "step": 17700 + }, + { + "entropy": 1.6545028822124004, + "epoch": 2.34862355056328, + "grad_norm": 3.8454055786132812, + "learning_rate": 1.954320042249802e-05, + "loss": 13.3062, + "mean_token_accuracy": 0.6392966616153717, + "num_tokens": 838797046.0, + "step": 17800 + }, + { + "entropy": 1.645525890737772, + "epoch": 2.3618189915384233, + "grad_norm": 3.923624038696289, + "learning_rate": 1.954055980987589e-05, + "loss": 13.2301, + "mean_token_accuracy": 0.6412591298669577, + "num_tokens": 843531964.0, + "step": 17900 + }, + { + "entropy": 1.6482495306432248, + "epoch": 2.3750144325135665, + "grad_norm": 3.7244012355804443, + "learning_rate": 1.9537919197253767e-05, + "loss": 13.2457, + "mean_token_accuracy": 0.6402912633121014, + "num_tokens": 848222945.0, + "step": 18000 + }, + { + "epoch": 2.3750144325135665, + "eval_entropy": 1.4282245099796693, + "eval_loss": 1.5754202604293823, + "eval_mean_token_accuracy": 0.6565356317000374, + "eval_num_tokens": 848222945.0, + "eval_runtime": 3189.9723, + "eval_samples_per_second": 33.787, + "eval_steps_per_second": 4.224, + "step": 18000 + }, + { + "entropy": 1.6553110727667808, + "epoch": 2.3882098734887096, + "grad_norm": 4.023122310638428, + "learning_rate": 1.9535278584631636e-05, + "loss": 13.3239, + "mean_token_accuracy": 0.6388157194852829, + "num_tokens": 852929556.0, + "step": 18100 + }, + { + "entropy": 1.6394376514852047, + "epoch": 2.4014053144638527, + "grad_norm": 3.670243263244629, + "learning_rate": 1.9532637972009505e-05, + "loss": 13.1698, + "mean_token_accuracy": 0.6420335720479489, + "num_tokens": 857624358.0, + "step": 18200 + }, + { + "entropy": 1.6484889774024487, + "epoch": 2.414600755438996, + "grad_norm": 3.8759660720825195, + "learning_rate": 1.952999735938738e-05, + "loss": 13.2519, + "mean_token_accuracy": 0.6404349724948406, + "num_tokens": 862377133.0, + "step": 18300 + }, + { + "entropy": 1.6585754190385342, + "epoch": 2.427796196414139, + "grad_norm": 3.8007171154022217, + "learning_rate": 1.952735674676525e-05, + "loss": 13.3324, + "mean_token_accuracy": 0.6386233323067426, + "num_tokens": 867068101.0, + "step": 18400 + }, + { + "entropy": 1.6433968134224415, + "epoch": 2.440991637389282, + "grad_norm": 3.977482795715332, + "learning_rate": 1.9524716134143124e-05, + "loss": 13.2035, + "mean_token_accuracy": 0.6406733729690314, + "num_tokens": 871781888.0, + "step": 18500 + }, + { + "entropy": 1.6583472032845021, + "epoch": 2.4541870783644253, + "grad_norm": 3.9003212451934814, + "learning_rate": 1.9522075521520993e-05, + "loss": 13.3365, + "mean_token_accuracy": 0.6388550719618797, + "num_tokens": 876490259.0, + "step": 18600 + }, + { + "entropy": 1.6305412173271179, + "epoch": 2.4673825193395684, + "grad_norm": 3.718053102493286, + "learning_rate": 1.9519434908898866e-05, + "loss": 13.1029, + "mean_token_accuracy": 0.6429571820795537, + "num_tokens": 881226173.0, + "step": 18700 + }, + { + "entropy": 1.639412898272276, + "epoch": 2.480577960314711, + "grad_norm": 3.988676071166992, + "learning_rate": 1.951679429627674e-05, + "loss": 13.1736, + "mean_token_accuracy": 0.6418413355201483, + "num_tokens": 885951801.0, + "step": 18800 + }, + { + "entropy": 1.6380822832882405, + "epoch": 2.4937734012898543, + "grad_norm": 4.214244842529297, + "learning_rate": 1.951415368365461e-05, + "loss": 13.1635, + "mean_token_accuracy": 0.6418183808401227, + "num_tokens": 890633366.0, + "step": 18900 + }, + { + "entropy": 1.6225868400931358, + "epoch": 2.5069688422649974, + "grad_norm": 3.6833455562591553, + "learning_rate": 1.951151307103248e-05, + "loss": 13.031, + "mean_token_accuracy": 0.6444745562970638, + "num_tokens": 895348664.0, + "step": 19000 + }, + { + "epoch": 2.5069688422649974, + "eval_entropy": 1.4229462220639948, + "eval_loss": 1.5674341917037964, + "eval_mean_token_accuracy": 0.6577362913928956, + "eval_num_tokens": 895348664.0, + "eval_runtime": 3191.3643, + "eval_samples_per_second": 33.772, + "eval_steps_per_second": 4.222, + "step": 19000 + }, + { + "entropy": 1.6459231120347977, + "epoch": 2.5201642832401405, + "grad_norm": 3.8315622806549072, + "learning_rate": 1.9508872458410354e-05, + "loss": 13.2281, + "mean_token_accuracy": 0.6401839184761048, + "num_tokens": 900054420.0, + "step": 19100 + }, + { + "entropy": 1.6490216328203677, + "epoch": 2.5333597242152837, + "grad_norm": 4.153664588928223, + "learning_rate": 1.9506231845788223e-05, + "loss": 13.2486, + "mean_token_accuracy": 0.6404328163713217, + "num_tokens": 904751166.0, + "step": 19200 + }, + { + "entropy": 1.6497003653645514, + "epoch": 2.546555165190427, + "grad_norm": 4.136670112609863, + "learning_rate": 1.9503591233166096e-05, + "loss": 13.2643, + "mean_token_accuracy": 0.6399475292861462, + "num_tokens": 909440312.0, + "step": 19300 + }, + { + "entropy": 1.6385792715847491, + "epoch": 2.55975060616557, + "grad_norm": 3.998361110687256, + "learning_rate": 1.950095062054397e-05, + "loss": 13.1585, + "mean_token_accuracy": 0.64167800180614, + "num_tokens": 914149150.0, + "step": 19400 + }, + { + "entropy": 1.640668357759714, + "epoch": 2.5729460471407126, + "grad_norm": 3.94272518157959, + "learning_rate": 1.949831000792184e-05, + "loss": 13.193, + "mean_token_accuracy": 0.6411869799345732, + "num_tokens": 918821751.0, + "step": 19500 + }, + { + "entropy": 1.639274080991745, + "epoch": 2.5861414881158558, + "grad_norm": 3.8270695209503174, + "learning_rate": 1.949566939529971e-05, + "loss": 13.1661, + "mean_token_accuracy": 0.6411959240585565, + "num_tokens": 923531575.0, + "step": 19600 + }, + { + "entropy": 1.6339846841990948, + "epoch": 2.599336929090999, + "grad_norm": 3.880585193634033, + "learning_rate": 1.9493028782677584e-05, + "loss": 13.1204, + "mean_token_accuracy": 0.6424805308878422, + "num_tokens": 928308752.0, + "step": 19700 + }, + { + "entropy": 1.6360827976465224, + "epoch": 2.612532370066142, + "grad_norm": 3.80788254737854, + "learning_rate": 1.9490388170055453e-05, + "loss": 13.1472, + "mean_token_accuracy": 0.6421842590346932, + "num_tokens": 933061721.0, + "step": 19800 + }, + { + "entropy": 1.642481252104044, + "epoch": 2.625727811041285, + "grad_norm": 3.922104835510254, + "learning_rate": 1.9487747557433326e-05, + "loss": 13.1925, + "mean_token_accuracy": 0.6409500490874052, + "num_tokens": 937780844.0, + "step": 19900 + }, + { + "entropy": 1.6298034279048443, + "epoch": 2.6389232520164283, + "grad_norm": 3.8405327796936035, + "learning_rate": 1.94851069448112e-05, + "loss": 13.0954, + "mean_token_accuracy": 0.6432888546586036, + "num_tokens": 942511572.0, + "step": 20000 + }, + { + "epoch": 2.6389232520164283, + "eval_entropy": 1.4283716988299866, + "eval_loss": 1.5600693225860596, + "eval_mean_token_accuracy": 0.6588419941908921, + "eval_num_tokens": 942511572.0, + "eval_runtime": 3187.1949, + "eval_samples_per_second": 33.817, + "eval_steps_per_second": 4.227, + "step": 20000 + }, + { + "entropy": 1.650664220750332, + "epoch": 2.6521186929915714, + "grad_norm": 3.9403529167175293, + "learning_rate": 1.948246633218907e-05, + "loss": 13.2577, + "mean_token_accuracy": 0.6399555268138647, + "num_tokens": 947255243.0, + "step": 20100 + }, + { + "entropy": 1.6283037734031678, + "epoch": 2.6653141339667146, + "grad_norm": 3.7040855884552, + "learning_rate": 1.947982571956694e-05, + "loss": 13.0764, + "mean_token_accuracy": 0.6435702281445265, + "num_tokens": 951924178.0, + "step": 20200 + }, + { + "entropy": 1.6288749648630618, + "epoch": 2.6785095749418577, + "grad_norm": 4.053677558898926, + "learning_rate": 1.9477185106944814e-05, + "loss": 13.0763, + "mean_token_accuracy": 0.6431901397556067, + "num_tokens": 956614112.0, + "step": 20300 + }, + { + "entropy": 1.6154142348468303, + "epoch": 2.691705015917001, + "grad_norm": 3.6884868144989014, + "learning_rate": 1.9474544494322683e-05, + "loss": 12.9704, + "mean_token_accuracy": 0.6454039007425308, + "num_tokens": 961310812.0, + "step": 20400 + }, + { + "entropy": 1.6393168839812278, + "epoch": 2.704900456892144, + "grad_norm": 3.920409679412842, + "learning_rate": 1.9471903881700556e-05, + "loss": 13.1667, + "mean_token_accuracy": 0.6419647770375013, + "num_tokens": 966060016.0, + "step": 20500 + }, + { + "entropy": 1.6280412651598453, + "epoch": 2.718095897867287, + "grad_norm": 3.860715627670288, + "learning_rate": 1.946926326907843e-05, + "loss": 13.0693, + "mean_token_accuracy": 0.6434524042159319, + "num_tokens": 970778070.0, + "step": 20600 + }, + { + "entropy": 1.6182962483167649, + "epoch": 2.73129133884243, + "grad_norm": 3.9382476806640625, + "learning_rate": 1.94666226564563e-05, + "loss": 12.9953, + "mean_token_accuracy": 0.645245413929224, + "num_tokens": 975501164.0, + "step": 20700 + }, + { + "entropy": 1.6398981650918723, + "epoch": 2.744486779817573, + "grad_norm": 3.749861240386963, + "learning_rate": 1.946398204383417e-05, + "loss": 13.1652, + "mean_token_accuracy": 0.6417403563112021, + "num_tokens": 980266218.0, + "step": 20800 + }, + { + "entropy": 1.6464379735291004, + "epoch": 2.757682220792716, + "grad_norm": 3.912741184234619, + "learning_rate": 1.946134143121204e-05, + "loss": 13.2189, + "mean_token_accuracy": 0.6404729437828064, + "num_tokens": 984961898.0, + "step": 20900 + }, + { + "entropy": 1.6274100148677826, + "epoch": 2.770877661767859, + "grad_norm": 3.992455005645752, + "learning_rate": 1.9458700818589913e-05, + "loss": 13.0539, + "mean_token_accuracy": 0.6441804407536984, + "num_tokens": 989614043.0, + "step": 21000 + }, + { + "epoch": 2.770877661767859, + "eval_entropy": 1.4158240256813492, + "eval_loss": 1.5523220300674438, + "eval_mean_token_accuracy": 0.6600379936238364, + "eval_num_tokens": 989614043.0, + "eval_runtime": 3189.5795, + "eval_samples_per_second": 33.791, + "eval_steps_per_second": 4.224, + "step": 21000 + }, + { + "entropy": 1.632586480230093, + "epoch": 2.7840731027430023, + "grad_norm": 3.9651315212249756, + "learning_rate": 1.9456060205967786e-05, + "loss": 13.1083, + "mean_token_accuracy": 0.6430674945563077, + "num_tokens": 994328590.0, + "step": 21100 + }, + { + "entropy": 1.6308821719884872, + "epoch": 2.7972685437181455, + "grad_norm": 3.786008834838867, + "learning_rate": 1.9453419593345656e-05, + "loss": 13.091, + "mean_token_accuracy": 0.6433896777033806, + "num_tokens": 999024614.0, + "step": 21200 + }, + { + "entropy": 1.6474685882031919, + "epoch": 2.8104639846932886, + "grad_norm": 3.9114224910736084, + "learning_rate": 1.9450778980723532e-05, + "loss": 13.2318, + "mean_token_accuracy": 0.640612950772047, + "num_tokens": 1003713201.0, + "step": 21300 + }, + { + "entropy": 1.6211960214376449, + "epoch": 2.8236594256684313, + "grad_norm": 4.09506368637085, + "learning_rate": 1.94481383681014e-05, + "loss": 13.0251, + "mean_token_accuracy": 0.6447122542560101, + "num_tokens": 1008443027.0, + "step": 21400 + }, + { + "entropy": 1.6274728824198246, + "epoch": 2.8368548666435744, + "grad_norm": 3.768113136291504, + "learning_rate": 1.944549775547927e-05, + "loss": 13.0694, + "mean_token_accuracy": 0.6439897135645151, + "num_tokens": 1013177678.0, + "step": 21500 + }, + { + "entropy": 1.6317154209315776, + "epoch": 2.8500503076187176, + "grad_norm": 4.080435276031494, + "learning_rate": 1.9442857142857147e-05, + "loss": 13.0882, + "mean_token_accuracy": 0.6439899149537086, + "num_tokens": 1017879507.0, + "step": 21600 + }, + { + "entropy": 1.6208388382196426, + "epoch": 2.8632457485938607, + "grad_norm": 3.988337993621826, + "learning_rate": 1.9440216530235016e-05, + "loss": 13.0154, + "mean_token_accuracy": 0.6451309756934642, + "num_tokens": 1022592377.0, + "step": 21700 + }, + { + "entropy": 1.633671799302101, + "epoch": 2.876441189569004, + "grad_norm": 3.740370988845825, + "learning_rate": 1.943757591761289e-05, + "loss": 13.1097, + "mean_token_accuracy": 0.6425185710191726, + "num_tokens": 1027278637.0, + "step": 21800 + }, + { + "entropy": 1.6209680989384652, + "epoch": 2.889636630544147, + "grad_norm": 3.957094669342041, + "learning_rate": 1.943493530499076e-05, + "loss": 13.0099, + "mean_token_accuracy": 0.6445255218446255, + "num_tokens": 1031978678.0, + "step": 21900 + }, + { + "entropy": 1.6203568048775197, + "epoch": 2.90283207151929, + "grad_norm": 3.7622430324554443, + "learning_rate": 1.943229469236863e-05, + "loss": 13.0027, + "mean_token_accuracy": 0.6453801936656237, + "num_tokens": 1036715064.0, + "step": 22000 + }, + { + "epoch": 2.90283207151929, + "eval_entropy": 1.4137322844946154, + "eval_loss": 1.5457555055618286, + "eval_mean_token_accuracy": 0.6610697363702012, + "eval_num_tokens": 1036715064.0, + "eval_runtime": 3187.9709, + "eval_samples_per_second": 33.808, + "eval_steps_per_second": 4.226, + "step": 22000 + }, + { + "entropy": 1.6192519588023424, + "epoch": 2.9160275124944333, + "grad_norm": 3.8866078853607178, + "learning_rate": 1.9429654079746504e-05, + "loss": 12.99, + "mean_token_accuracy": 0.6450617261230945, + "num_tokens": 1041372665.0, + "step": 22100 + }, + { + "entropy": 1.6254705637693405, + "epoch": 2.9292229534695764, + "grad_norm": 4.082258701324463, + "learning_rate": 1.9427013467124374e-05, + "loss": 13.0384, + "mean_token_accuracy": 0.6447862467169762, + "num_tokens": 1046068792.0, + "step": 22200 + }, + { + "entropy": 1.6164238581061363, + "epoch": 2.9424183944447195, + "grad_norm": 3.86102557182312, + "learning_rate": 1.9424372854502246e-05, + "loss": 12.953, + "mean_token_accuracy": 0.646661720648408, + "num_tokens": 1050838675.0, + "step": 22300 + }, + { + "entropy": 1.6251713410019875, + "epoch": 2.9556138354198627, + "grad_norm": 4.027069568634033, + "learning_rate": 1.942173224188012e-05, + "loss": 13.035, + "mean_token_accuracy": 0.6444545089453458, + "num_tokens": 1055533401.0, + "step": 22400 + }, + { + "entropy": 1.622108271420002, + "epoch": 2.968809276395006, + "grad_norm": 3.6608834266662598, + "learning_rate": 1.941909162925799e-05, + "loss": 13.0153, + "mean_token_accuracy": 0.6446515038982034, + "num_tokens": 1060259488.0, + "step": 22500 + }, + { + "entropy": 1.6189582243561744, + "epoch": 2.9820047173701485, + "grad_norm": 3.8715226650238037, + "learning_rate": 1.941645101663586e-05, + "loss": 12.9811, + "mean_token_accuracy": 0.6453317078202963, + "num_tokens": 1064896187.0, + "step": 22600 + }, + { + "entropy": 1.6309356051683426, + "epoch": 2.9952001583452916, + "grad_norm": 3.7819671630859375, + "learning_rate": 1.9413810404013734e-05, + "loss": 13.0815, + "mean_token_accuracy": 0.643519636541605, + "num_tokens": 1069620808.0, + "step": 22700 + }, + { + "entropy": 1.6086442614501377, + "epoch": 3.0083131278143402, + "grad_norm": 3.8293449878692627, + "learning_rate": 1.9411169791391604e-05, + "loss": 12.8103, + "mean_token_accuracy": 0.6470718562977869, + "num_tokens": 1074292820.0, + "step": 22800 + }, + { + "entropy": 1.5990345920622349, + "epoch": 3.0215085687894834, + "grad_norm": 3.9333574771881104, + "learning_rate": 1.9408529178769476e-05, + "loss": 12.8028, + "mean_token_accuracy": 0.6484330788999796, + "num_tokens": 1079008586.0, + "step": 22900 + }, + { + "entropy": 1.6048170095682144, + "epoch": 3.0347040097646265, + "grad_norm": 4.1680707931518555, + "learning_rate": 1.940588856614735e-05, + "loss": 12.8522, + "mean_token_accuracy": 0.6468084762245416, + "num_tokens": 1083742693.0, + "step": 23000 + }, + { + "epoch": 3.0347040097646265, + "eval_entropy": 1.4036445253712735, + "eval_loss": 1.5400227308273315, + "eval_mean_token_accuracy": 0.6620142995550259, + "eval_num_tokens": 1083742693.0, + "eval_runtime": 3187.7068, + "eval_samples_per_second": 33.811, + "eval_steps_per_second": 4.227, + "step": 23000 + }, + { + "entropy": 1.59215646982193, + "epoch": 3.047899450739769, + "grad_norm": 3.910896062850952, + "learning_rate": 1.940324795352522e-05, + "loss": 12.7527, + "mean_token_accuracy": 0.6494686865061522, + "num_tokens": 1088416491.0, + "step": 23100 + }, + { + "entropy": 1.6077661024034022, + "epoch": 3.0610948917149123, + "grad_norm": 3.82198429107666, + "learning_rate": 1.940060734090309e-05, + "loss": 12.8693, + "mean_token_accuracy": 0.6476485385000705, + "num_tokens": 1093142922.0, + "step": 23200 + }, + { + "entropy": 1.6198658345639705, + "epoch": 3.0742903326900555, + "grad_norm": 3.7349178791046143, + "learning_rate": 1.9397966728280964e-05, + "loss": 12.9709, + "mean_token_accuracy": 0.6451571603119374, + "num_tokens": 1097858791.0, + "step": 23300 + }, + { + "entropy": 1.6040863755345345, + "epoch": 3.0874857736651986, + "grad_norm": 4.026642799377441, + "learning_rate": 1.9395326115658834e-05, + "loss": 12.8509, + "mean_token_accuracy": 0.6470640433579683, + "num_tokens": 1102555946.0, + "step": 23400 + }, + { + "entropy": 1.6048882656544448, + "epoch": 3.1006812146403417, + "grad_norm": 3.827873706817627, + "learning_rate": 1.9392685503036706e-05, + "loss": 12.8447, + "mean_token_accuracy": 0.6481162996590137, + "num_tokens": 1107293137.0, + "step": 23500 + }, + { + "entropy": 1.6029019843041896, + "epoch": 3.113876655615485, + "grad_norm": 4.017215728759766, + "learning_rate": 1.939004489041458e-05, + "loss": 12.843, + "mean_token_accuracy": 0.6474066472053528, + "num_tokens": 1111954662.0, + "step": 23600 + }, + { + "entropy": 1.6040606062114238, + "epoch": 3.127072096590628, + "grad_norm": 3.802905797958374, + "learning_rate": 1.938740427779245e-05, + "loss": 12.84, + "mean_token_accuracy": 0.6483595797419548, + "num_tokens": 1116658744.0, + "step": 23700 + }, + { + "entropy": 1.6007949909567833, + "epoch": 3.140267537565771, + "grad_norm": 3.832118272781372, + "learning_rate": 1.938476366517032e-05, + "loss": 12.8116, + "mean_token_accuracy": 0.6484786373376846, + "num_tokens": 1121381338.0, + "step": 23800 + }, + { + "entropy": 1.603892664760351, + "epoch": 3.1534629785409143, + "grad_norm": 3.7704367637634277, + "learning_rate": 1.938212305254819e-05, + "loss": 12.8438, + "mean_token_accuracy": 0.6480603955686093, + "num_tokens": 1126116413.0, + "step": 23900 + }, + { + "entropy": 1.6137782764434814, + "epoch": 3.1666584195160574, + "grad_norm": 3.97542142868042, + "learning_rate": 1.9379482439926064e-05, + "loss": 12.9155, + "mean_token_accuracy": 0.6462130547314883, + "num_tokens": 1130879241.0, + "step": 24000 + }, + { + "epoch": 3.1666584195160574, + "eval_entropy": 1.409310864803211, + "eval_loss": 1.532834768295288, + "eval_mean_token_accuracy": 0.6631031635829886, + "eval_num_tokens": 1130879241.0, + "eval_runtime": 3194.5415, + "eval_samples_per_second": 33.739, + "eval_steps_per_second": 4.218, + "step": 24000 + }, + { + "entropy": 1.603936430066824, + "epoch": 3.1798538604912, + "grad_norm": 3.6425111293792725, + "learning_rate": 1.9376841827303937e-05, + "loss": 12.8439, + "mean_token_accuracy": 0.6481852814555168, + "num_tokens": 1135605204.0, + "step": 24100 + }, + { + "entropy": 1.5988010117411613, + "epoch": 3.1930493014663432, + "grad_norm": 3.9451022148132324, + "learning_rate": 1.9374201214681806e-05, + "loss": 12.8002, + "mean_token_accuracy": 0.6487304736673832, + "num_tokens": 1140319680.0, + "step": 24200 + }, + { + "entropy": 1.607831762433052, + "epoch": 3.2062447424414864, + "grad_norm": 3.7540087699890137, + "learning_rate": 1.937156060205968e-05, + "loss": 12.8768, + "mean_token_accuracy": 0.6474707532674074, + "num_tokens": 1145017019.0, + "step": 24300 + }, + { + "entropy": 1.6050450451672078, + "epoch": 3.2194401834166295, + "grad_norm": 3.8583192825317383, + "learning_rate": 1.936891998943755e-05, + "loss": 12.8607, + "mean_token_accuracy": 0.6476561278104782, + "num_tokens": 1149667952.0, + "step": 24400 + }, + { + "entropy": 1.6129796238243579, + "epoch": 3.2326356243917727, + "grad_norm": 3.7425315380096436, + "learning_rate": 1.936627937681542e-05, + "loss": 12.912, + "mean_token_accuracy": 0.6465145403146744, + "num_tokens": 1154393799.0, + "step": 24500 + }, + { + "entropy": 1.6006788285076619, + "epoch": 3.245831065366916, + "grad_norm": 3.9942708015441895, + "learning_rate": 1.9363638764193297e-05, + "loss": 12.8184, + "mean_token_accuracy": 0.6481123934686184, + "num_tokens": 1159132842.0, + "step": 24600 + }, + { + "entropy": 1.5836338178813458, + "epoch": 3.259026506342059, + "grad_norm": 3.8745949268341064, + "learning_rate": 1.9360998151571167e-05, + "loss": 12.6762, + "mean_token_accuracy": 0.6513490001112223, + "num_tokens": 1163839466.0, + "step": 24700 + }, + { + "entropy": 1.5958380722999572, + "epoch": 3.272221947317202, + "grad_norm": 3.5539045333862305, + "learning_rate": 1.9358357538949036e-05, + "loss": 12.7741, + "mean_token_accuracy": 0.6492387424409389, + "num_tokens": 1168609132.0, + "step": 24800 + }, + { + "entropy": 1.6030096143484116, + "epoch": 3.2854173882923448, + "grad_norm": 3.7582950592041016, + "learning_rate": 1.935571692632691e-05, + "loss": 12.8404, + "mean_token_accuracy": 0.6481160232424736, + "num_tokens": 1173311250.0, + "step": 24900 + }, + { + "entropy": 1.5964187413454056, + "epoch": 3.298612829267488, + "grad_norm": 3.8397164344787598, + "learning_rate": 1.935307631370478e-05, + "loss": 12.7728, + "mean_token_accuracy": 0.6495844420790672, + "num_tokens": 1178028693.0, + "step": 25000 + }, + { + "epoch": 3.298612829267488, + "eval_entropy": 1.4047100693374543, + "eval_loss": 1.5277026891708374, + "eval_mean_token_accuracy": 0.6640447715511214, + "eval_num_tokens": 1178028693.0, + "eval_runtime": 3194.6362, + "eval_samples_per_second": 33.738, + "eval_steps_per_second": 4.217, + "step": 25000 + }, + { + "entropy": 1.5998302234709263, + "epoch": 3.311808270242631, + "grad_norm": 3.663982391357422, + "learning_rate": 1.935043570108265e-05, + "loss": 12.8137, + "mean_token_accuracy": 0.6484686867892742, + "num_tokens": 1182755467.0, + "step": 25100 + }, + { + "entropy": 1.5965089382231235, + "epoch": 3.325003711217774, + "grad_norm": 3.9860620498657227, + "learning_rate": 1.9347795088460524e-05, + "loss": 12.7805, + "mean_token_accuracy": 0.6490729383379221, + "num_tokens": 1187484128.0, + "step": 25200 + }, + { + "entropy": 1.5897659668326378, + "epoch": 3.3381991521929173, + "grad_norm": 3.856250286102295, + "learning_rate": 1.9345154475838397e-05, + "loss": 12.7224, + "mean_token_accuracy": 0.6499618509411812, + "num_tokens": 1192183136.0, + "step": 25300 + }, + { + "entropy": 1.5996447187662124, + "epoch": 3.3513945931680604, + "grad_norm": 3.984705686569214, + "learning_rate": 1.934251386321627e-05, + "loss": 12.7979, + "mean_token_accuracy": 0.6491198971122504, + "num_tokens": 1196911691.0, + "step": 25400 + }, + { + "entropy": 1.5892897661030292, + "epoch": 3.3645900341432036, + "grad_norm": 3.9916419982910156, + "learning_rate": 1.933987325059414e-05, + "loss": 12.7189, + "mean_token_accuracy": 0.6509596475213766, + "num_tokens": 1201632490.0, + "step": 25500 + }, + { + "entropy": 1.599773357063532, + "epoch": 3.3777854751183467, + "grad_norm": 4.092468738555908, + "learning_rate": 1.933723263797201e-05, + "loss": 12.7999, + "mean_token_accuracy": 0.6487147487699986, + "num_tokens": 1206369149.0, + "step": 25600 + }, + { + "entropy": 1.5881148573756219, + "epoch": 3.39098091609349, + "grad_norm": 3.912086248397827, + "learning_rate": 1.9334592025349884e-05, + "loss": 12.7128, + "mean_token_accuracy": 0.6499782233685255, + "num_tokens": 1211082760.0, + "step": 25700 + }, + { + "entropy": 1.594959545582533, + "epoch": 3.404176357068633, + "grad_norm": 4.231387138366699, + "learning_rate": 1.9331951412727754e-05, + "loss": 12.758, + "mean_token_accuracy": 0.6493679398298263, + "num_tokens": 1215823737.0, + "step": 25800 + }, + { + "entropy": 1.5961143529415132, + "epoch": 3.417371798043776, + "grad_norm": 3.8411481380462646, + "learning_rate": 1.9329310800105627e-05, + "loss": 12.7719, + "mean_token_accuracy": 0.6493826608359814, + "num_tokens": 1220532274.0, + "step": 25900 + }, + { + "entropy": 1.6002527132630349, + "epoch": 3.430567239018919, + "grad_norm": 3.925577163696289, + "learning_rate": 1.93266701874835e-05, + "loss": 12.8023, + "mean_token_accuracy": 0.6485183215141297, + "num_tokens": 1225247176.0, + "step": 26000 + }, + { + "epoch": 3.430567239018919, + "eval_entropy": 1.3897053414566838, + "eval_loss": 1.5220413208007812, + "eval_mean_token_accuracy": 0.6650882579387893, + "eval_num_tokens": 1225247176.0, + "eval_runtime": 3189.1636, + "eval_samples_per_second": 33.796, + "eval_steps_per_second": 4.225, + "step": 26000 + }, + { + "entropy": 1.590363145917654, + "epoch": 3.443762679994062, + "grad_norm": 4.163620948791504, + "learning_rate": 1.932402957486137e-05, + "loss": 12.7238, + "mean_token_accuracy": 0.6503931378573179, + "num_tokens": 1229952948.0, + "step": 26100 + }, + { + "entropy": 1.5977550886571408, + "epoch": 3.456958120969205, + "grad_norm": 3.7180027961730957, + "learning_rate": 1.932138896223924e-05, + "loss": 12.7795, + "mean_token_accuracy": 0.6490350770950317, + "num_tokens": 1234708040.0, + "step": 26200 + }, + { + "entropy": 1.5859640747308732, + "epoch": 3.470153561944348, + "grad_norm": 5.798464298248291, + "learning_rate": 1.9318748349617114e-05, + "loss": 12.6806, + "mean_token_accuracy": 0.6516380459070206, + "num_tokens": 1239398688.0, + "step": 26300 + }, + { + "entropy": 1.5860621571540832, + "epoch": 3.4833490029194913, + "grad_norm": 3.894265651702881, + "learning_rate": 1.9316107736994984e-05, + "loss": 12.6903, + "mean_token_accuracy": 0.6511745493113995, + "num_tokens": 1244100384.0, + "step": 26400 + }, + { + "entropy": 1.5941654224693775, + "epoch": 3.4965444438946345, + "grad_norm": 3.791640520095825, + "learning_rate": 1.9313467124372857e-05, + "loss": 12.7507, + "mean_token_accuracy": 0.6497996034473181, + "num_tokens": 1248767346.0, + "step": 26500 + }, + { + "entropy": 1.5958171512186528, + "epoch": 3.5097398848697776, + "grad_norm": 4.074944496154785, + "learning_rate": 1.931082651175073e-05, + "loss": 12.7584, + "mean_token_accuracy": 0.649353106468916, + "num_tokens": 1253505675.0, + "step": 26600 + }, + { + "entropy": 1.5861407789587973, + "epoch": 3.5229353258449203, + "grad_norm": 3.7499473094940186, + "learning_rate": 1.93081858991286e-05, + "loss": 12.6829, + "mean_token_accuracy": 0.6514942896366119, + "num_tokens": 1258210555.0, + "step": 26700 + }, + { + "entropy": 1.5875330168008803, + "epoch": 3.5361307668200634, + "grad_norm": 3.857792854309082, + "learning_rate": 1.9305545286506472e-05, + "loss": 12.7021, + "mean_token_accuracy": 0.6506117970496416, + "num_tokens": 1262969138.0, + "step": 26800 + }, + { + "entropy": 1.5890568955242634, + "epoch": 3.5493262077952066, + "grad_norm": 3.789581537246704, + "learning_rate": 1.930290467388434e-05, + "loss": 12.7028, + "mean_token_accuracy": 0.6507829879224301, + "num_tokens": 1267718836.0, + "step": 26900 + }, + { + "entropy": 1.5945152980089188, + "epoch": 3.5625216487703497, + "grad_norm": 3.9242899417877197, + "learning_rate": 1.9300264061262214e-05, + "loss": 12.7496, + "mean_token_accuracy": 0.6498741636425257, + "num_tokens": 1272416713.0, + "step": 27000 + }, + { + "epoch": 3.5625216487703497, + "eval_entropy": 1.3967469545008124, + "eval_loss": 1.5160529613494873, + "eval_mean_token_accuracy": 0.6659562944003312, + "eval_num_tokens": 1272416713.0, + "eval_runtime": 3190.1496, + "eval_samples_per_second": 33.785, + "eval_steps_per_second": 4.223, + "step": 27000 + }, + { + "entropy": 1.5936158713698387, + "epoch": 3.575717089745493, + "grad_norm": 3.866844654083252, + "learning_rate": 1.9297623448640087e-05, + "loss": 12.7489, + "mean_token_accuracy": 0.6495719265192748, + "num_tokens": 1277143213.0, + "step": 27100 + }, + { + "entropy": 1.5857186970114707, + "epoch": 3.588912530720636, + "grad_norm": 3.7745931148529053, + "learning_rate": 1.9294982836017956e-05, + "loss": 12.6741, + "mean_token_accuracy": 0.6512483295798301, + "num_tokens": 1281830839.0, + "step": 27200 + }, + { + "entropy": 1.5887311869859695, + "epoch": 3.602107971695779, + "grad_norm": 3.8768470287323, + "learning_rate": 1.929234222339583e-05, + "loss": 12.7105, + "mean_token_accuracy": 0.6507860495895147, + "num_tokens": 1286521666.0, + "step": 27300 + }, + { + "entropy": 1.5972675527632236, + "epoch": 3.6153034126709223, + "grad_norm": 4.008309841156006, + "learning_rate": 1.9289701610773702e-05, + "loss": 12.7788, + "mean_token_accuracy": 0.648838073015213, + "num_tokens": 1291216900.0, + "step": 27400 + }, + { + "entropy": 1.5907193027436732, + "epoch": 3.6284988536460654, + "grad_norm": 3.7738773822784424, + "learning_rate": 1.928706099815157e-05, + "loss": 12.7136, + "mean_token_accuracy": 0.6508843255788088, + "num_tokens": 1295939930.0, + "step": 27500 + }, + { + "entropy": 1.5870854687690734, + "epoch": 3.6416942946212085, + "grad_norm": 4.030341625213623, + "learning_rate": 1.9284420385529444e-05, + "loss": 12.6917, + "mean_token_accuracy": 0.6508548408001661, + "num_tokens": 1300586613.0, + "step": 27600 + }, + { + "entropy": 1.5814402961730958, + "epoch": 3.6548897355963517, + "grad_norm": 3.9914443492889404, + "learning_rate": 1.9281779772907317e-05, + "loss": 12.647, + "mean_token_accuracy": 0.6520128079503774, + "num_tokens": 1305252391.0, + "step": 27700 + }, + { + "entropy": 1.5867080081999303, + "epoch": 3.668085176571495, + "grad_norm": 3.799436330795288, + "learning_rate": 1.9279139160285186e-05, + "loss": 12.6798, + "mean_token_accuracy": 0.6509626308828592, + "num_tokens": 1310026702.0, + "step": 27800 + }, + { + "entropy": 1.5863153117895126, + "epoch": 3.6812806175466375, + "grad_norm": 3.7359097003936768, + "learning_rate": 1.927649854766306e-05, + "loss": 12.6794, + "mean_token_accuracy": 0.6507589789479971, + "num_tokens": 1314753404.0, + "step": 27900 + }, + { + "entropy": 1.5910515576601028, + "epoch": 3.6944760585217806, + "grad_norm": 3.9891273975372314, + "learning_rate": 1.9273857935040932e-05, + "loss": 12.7211, + "mean_token_accuracy": 0.6505811680108309, + "num_tokens": 1319464143.0, + "step": 28000 + }, + { + "epoch": 3.6944760585217806, + "eval_entropy": 1.383818113944547, + "eval_loss": 1.5120134353637695, + "eval_mean_token_accuracy": 0.6666769426967735, + "eval_num_tokens": 1319464143.0, + "eval_runtime": 3188.8443, + "eval_samples_per_second": 33.799, + "eval_steps_per_second": 4.225, + "step": 28000 + }, + { + "entropy": 1.5939770445227623, + "epoch": 3.7076714994969238, + "grad_norm": 3.8743128776550293, + "learning_rate": 1.92712173224188e-05, + "loss": 12.7487, + "mean_token_accuracy": 0.6503194896131754, + "num_tokens": 1324177416.0, + "step": 28100 + }, + { + "entropy": 1.5899296332895756, + "epoch": 3.720866940472067, + "grad_norm": 4.185421466827393, + "learning_rate": 1.9268576709796674e-05, + "loss": 12.7073, + "mean_token_accuracy": 0.6508678549528122, + "num_tokens": 1328870314.0, + "step": 28200 + }, + { + "entropy": 1.5931289853900672, + "epoch": 3.73406238144721, + "grad_norm": 4.025436878204346, + "learning_rate": 1.9265936097174547e-05, + "loss": 12.7334, + "mean_token_accuracy": 0.650798460394144, + "num_tokens": 1333603965.0, + "step": 28300 + }, + { + "entropy": 1.579432168751955, + "epoch": 3.747257822422353, + "grad_norm": 3.8230814933776855, + "learning_rate": 1.9263295484552416e-05, + "loss": 12.6252, + "mean_token_accuracy": 0.6523202404379844, + "num_tokens": 1338353598.0, + "step": 28400 + }, + { + "entropy": 1.5902026624977588, + "epoch": 3.7604532633974963, + "grad_norm": 3.726195812225342, + "learning_rate": 1.926065487193029e-05, + "loss": 12.7178, + "mean_token_accuracy": 0.6507448834180832, + "num_tokens": 1343045600.0, + "step": 28500 + }, + { + "entropy": 1.5953569206595422, + "epoch": 3.773648704372639, + "grad_norm": 4.179470539093018, + "learning_rate": 1.9258014259308162e-05, + "loss": 12.756, + "mean_token_accuracy": 0.6497294913977385, + "num_tokens": 1347765894.0, + "step": 28600 + }, + { + "entropy": 1.5810581628233193, + "epoch": 3.786844145347782, + "grad_norm": 4.100180625915527, + "learning_rate": 1.9255373646686035e-05, + "loss": 12.6317, + "mean_token_accuracy": 0.6522324250638485, + "num_tokens": 1352440589.0, + "step": 28700 + }, + { + "entropy": 1.588026827275753, + "epoch": 3.8000395863229253, + "grad_norm": 3.9760444164276123, + "learning_rate": 1.9252733034063904e-05, + "loss": 12.6961, + "mean_token_accuracy": 0.6506867495179176, + "num_tokens": 1357132028.0, + "step": 28800 + }, + { + "entropy": 1.5934601209312678, + "epoch": 3.8132350272980684, + "grad_norm": 3.8777272701263428, + "learning_rate": 1.9250092421441773e-05, + "loss": 12.7266, + "mean_token_accuracy": 0.6502763725072146, + "num_tokens": 1361826888.0, + "step": 28900 + }, + { + "entropy": 1.5815143549442292, + "epoch": 3.8264304682732115, + "grad_norm": 3.8989417552948, + "learning_rate": 1.924745180881965e-05, + "loss": 12.6365, + "mean_token_accuracy": 0.6515141806006431, + "num_tokens": 1366504824.0, + "step": 29000 + }, + { + "epoch": 3.8264304682732115, + "eval_entropy": 1.3917201210933459, + "eval_loss": 1.5054519176483154, + "eval_mean_token_accuracy": 0.6676328241900312, + "eval_num_tokens": 1366504824.0, + "eval_runtime": 3191.917, + "eval_samples_per_second": 33.767, + "eval_steps_per_second": 4.221, + "step": 29000 + }, + { + "entropy": 1.585688829421997, + "epoch": 3.8396259092483547, + "grad_norm": 3.791611671447754, + "learning_rate": 1.924481119619752e-05, + "loss": 12.6651, + "mean_token_accuracy": 0.6511684814840555, + "num_tokens": 1371224055.0, + "step": 29100 + }, + { + "entropy": 1.564525719434023, + "epoch": 3.852821350223498, + "grad_norm": 3.8431272506713867, + "learning_rate": 1.9242170583575392e-05, + "loss": 12.4972, + "mean_token_accuracy": 0.6550224151462316, + "num_tokens": 1375950143.0, + "step": 29200 + }, + { + "entropy": 1.5668389943242074, + "epoch": 3.866016791198641, + "grad_norm": 3.915454149246216, + "learning_rate": 1.9239529970953265e-05, + "loss": 12.5206, + "mean_token_accuracy": 0.6548665834218264, + "num_tokens": 1380590024.0, + "step": 29300 + }, + { + "entropy": 1.5666982433199883, + "epoch": 3.879212232173784, + "grad_norm": 3.8743038177490234, + "learning_rate": 1.9236889358331134e-05, + "loss": 12.5167, + "mean_token_accuracy": 0.654480542242527, + "num_tokens": 1385311696.0, + "step": 29400 + }, + { + "entropy": 1.5765258029848337, + "epoch": 3.892407673148927, + "grad_norm": 3.8786461353302, + "learning_rate": 1.9234248745709007e-05, + "loss": 12.5871, + "mean_token_accuracy": 0.6527524341642856, + "num_tokens": 1390063828.0, + "step": 29500 + }, + { + "entropy": 1.5753777919709682, + "epoch": 3.9056031141240704, + "grad_norm": 3.727337598800659, + "learning_rate": 1.923160813308688e-05, + "loss": 12.5866, + "mean_token_accuracy": 0.6533032587170601, + "num_tokens": 1394744367.0, + "step": 29600 + }, + { + "entropy": 1.572633812725544, + "epoch": 3.9187985550992135, + "grad_norm": 3.863529920578003, + "learning_rate": 1.922896752046475e-05, + "loss": 12.563, + "mean_token_accuracy": 0.6538976988196373, + "num_tokens": 1399475298.0, + "step": 29700 + }, + { + "entropy": 1.576025907844305, + "epoch": 3.931993996074356, + "grad_norm": 3.851825714111328, + "learning_rate": 1.9226326907842622e-05, + "loss": 12.5812, + "mean_token_accuracy": 0.6533911099284887, + "num_tokens": 1404131763.0, + "step": 29800 + }, + { + "entropy": 1.5805292607098818, + "epoch": 3.9451894370494993, + "grad_norm": 3.8343303203582764, + "learning_rate": 1.922368629522049e-05, + "loss": 12.6123, + "mean_token_accuracy": 0.6528911211341619, + "num_tokens": 1408894509.0, + "step": 29900 + }, + { + "entropy": 1.5806397613883019, + "epoch": 3.9583848780246425, + "grad_norm": 3.8559279441833496, + "learning_rate": 1.9221045682598364e-05, + "loss": 12.6316, + "mean_token_accuracy": 0.6520166169852019, + "num_tokens": 1413596551.0, + "step": 30000 + }, + { + "epoch": 3.9583848780246425, + "eval_entropy": 1.3611459893354956, + "eval_loss": 1.5021542310714722, + "eval_mean_token_accuracy": 0.6685199348280462, + "eval_num_tokens": 1413596551.0, + "eval_runtime": 3405.6838, + "eval_samples_per_second": 31.647, + "eval_steps_per_second": 3.956, + "step": 30000 + } + ], + "logging_steps": 100, + "max_steps": 757900, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.4113451599817784e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}