{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 20466, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.25283123254776, "epoch": 0.0014659532360917686, "grad_norm": 55.25, "learning_rate": 2.6385224274406335e-08, "loss": 2.4731, "mean_token_accuracy": 0.5599748879671097, "num_tokens": 24641.0, "step": 10 }, { "entropy": 1.413840764760971, "epoch": 0.0029319064721835372, "grad_norm": 59.0, "learning_rate": 5.570214013485782e-08, "loss": 2.7131, "mean_token_accuracy": 0.5256420075893402, "num_tokens": 46457.0, "step": 20 }, { "entropy": 1.3948082149028778, "epoch": 0.004397859708275306, "grad_norm": 56.0, "learning_rate": 8.50190559953093e-08, "loss": 2.6384, "mean_token_accuracy": 0.5302335128188134, "num_tokens": 69867.0, "step": 30 }, { "entropy": 1.3661251753568648, "epoch": 0.0058638129443670745, "grad_norm": 46.25, "learning_rate": 1.1433597185576079e-07, "loss": 2.6058, "mean_token_accuracy": 0.5412983655929565, "num_tokens": 92876.0, "step": 40 }, { "entropy": 1.298092320561409, "epoch": 0.007329766180458844, "grad_norm": 53.5, "learning_rate": 1.4365288771621227e-07, "loss": 2.4632, "mean_token_accuracy": 0.5509454920887947, "num_tokens": 119159.0, "step": 50 }, { "entropy": 1.3397300273180008, "epoch": 0.008795719416550612, "grad_norm": 69.5, "learning_rate": 1.7296980357666373e-07, "loss": 2.5504, "mean_token_accuracy": 0.5522458389401436, "num_tokens": 144104.0, "step": 60 }, { "entropy": 1.4276505380868911, "epoch": 0.01026167265264238, "grad_norm": 49.5, "learning_rate": 2.0228671943711525e-07, "loss": 2.7437, "mean_token_accuracy": 0.514421458542347, "num_tokens": 165639.0, "step": 70 }, { "entropy": 1.3151417702436448, "epoch": 0.011727625888734149, "grad_norm": 42.5, "learning_rate": 2.316036352975667e-07, "loss": 2.6237, "mean_token_accuracy": 0.556600047647953, "num_tokens": 186903.0, "step": 80 }, { "entropy": 1.2940305203199387, "epoch": 0.013193579124825917, "grad_norm": 52.25, "learning_rate": 2.609205511580182e-07, "loss": 2.5496, "mean_token_accuracy": 0.5474430873990059, "num_tokens": 209583.0, "step": 90 }, { "entropy": 1.3065010994672774, "epoch": 0.014659532360917688, "grad_norm": 50.25, "learning_rate": 2.9023746701846967e-07, "loss": 2.5981, "mean_token_accuracy": 0.5425919145345688, "num_tokens": 233082.0, "step": 100 }, { "entropy": 1.325048315525055, "epoch": 0.016125485597009454, "grad_norm": 52.0, "learning_rate": 3.195543828789212e-07, "loss": 2.6453, "mean_token_accuracy": 0.5436795011162758, "num_tokens": 255414.0, "step": 110 }, { "entropy": 1.2538740694522859, "epoch": 0.017591438833101224, "grad_norm": 56.0, "learning_rate": 3.488712987393726e-07, "loss": 2.5664, "mean_token_accuracy": 0.5572778850793838, "num_tokens": 276583.0, "step": 120 }, { "entropy": 1.4040610939264297, "epoch": 0.019057392069192994, "grad_norm": 45.75, "learning_rate": 3.781882145998241e-07, "loss": 2.68, "mean_token_accuracy": 0.5333648875355721, "num_tokens": 298355.0, "step": 130 }, { "entropy": 1.2969217479228974, "epoch": 0.02052334530528476, "grad_norm": 44.0, "learning_rate": 4.0750513046027563e-07, "loss": 2.541, "mean_token_accuracy": 0.5481395915150642, "num_tokens": 322839.0, "step": 140 }, { "entropy": 1.332941436767578, "epoch": 0.02198929854137653, "grad_norm": 77.0, "learning_rate": 4.3682204632072715e-07, "loss": 2.5533, "mean_token_accuracy": 0.5420454263687133, "num_tokens": 344846.0, "step": 150 }, { "entropy": 1.3199474960565567, "epoch": 0.023455251777468298, "grad_norm": 44.75, "learning_rate": 4.6613896218117856e-07, "loss": 2.4776, "mean_token_accuracy": 0.5520137026906013, "num_tokens": 368862.0, "step": 160 }, { "entropy": 1.47680823802948, "epoch": 0.024921205013560068, "grad_norm": 47.5, "learning_rate": 4.954558780416301e-07, "loss": 2.7156, "mean_token_accuracy": 0.5261070787906647, "num_tokens": 391286.0, "step": 170 }, { "entropy": 1.350553148984909, "epoch": 0.026387158249651835, "grad_norm": 50.5, "learning_rate": 5.247727939020815e-07, "loss": 2.5875, "mean_token_accuracy": 0.5380565628409386, "num_tokens": 412773.0, "step": 180 }, { "entropy": 1.3542836010456085, "epoch": 0.027853111485743605, "grad_norm": 42.25, "learning_rate": 5.54089709762533e-07, "loss": 2.6049, "mean_token_accuracy": 0.5354041799902916, "num_tokens": 436704.0, "step": 190 }, { "entropy": 1.4196303248405457, "epoch": 0.029319064721835375, "grad_norm": 51.25, "learning_rate": 5.834066256229845e-07, "loss": 2.6552, "mean_token_accuracy": 0.5354075536131859, "num_tokens": 458000.0, "step": 200 }, { "entropy": 1.3946342766284943, "epoch": 0.03078501795792714, "grad_norm": 42.75, "learning_rate": 6.12723541483436e-07, "loss": 2.566, "mean_token_accuracy": 0.5355818867683411, "num_tokens": 479931.0, "step": 210 }, { "entropy": 1.2912331581115724, "epoch": 0.03225097119401891, "grad_norm": 50.0, "learning_rate": 6.420404573438875e-07, "loss": 2.5011, "mean_token_accuracy": 0.5521246075630188, "num_tokens": 500657.0, "step": 220 }, { "entropy": 1.407066786289215, "epoch": 0.03371692443011068, "grad_norm": 50.0, "learning_rate": 6.71357373204339e-07, "loss": 2.502, "mean_token_accuracy": 0.5366490066051484, "num_tokens": 526709.0, "step": 230 }, { "entropy": 1.3761765301227569, "epoch": 0.03518287766620245, "grad_norm": 35.25, "learning_rate": 7.006742890647903e-07, "loss": 2.3652, "mean_token_accuracy": 0.5582018181681633, "num_tokens": 554614.0, "step": 240 }, { "entropy": 1.2741256088018418, "epoch": 0.03664883090229422, "grad_norm": 42.0, "learning_rate": 7.299912049252419e-07, "loss": 2.3826, "mean_token_accuracy": 0.57050671428442, "num_tokens": 577249.0, "step": 250 }, { "entropy": 1.3749388754367828, "epoch": 0.03811478413838599, "grad_norm": 41.25, "learning_rate": 7.593081207856934e-07, "loss": 2.4097, "mean_token_accuracy": 0.5504184558987617, "num_tokens": 601908.0, "step": 260 }, { "entropy": 1.402105775475502, "epoch": 0.03958073737447775, "grad_norm": 37.0, "learning_rate": 7.886250366461449e-07, "loss": 2.4917, "mean_token_accuracy": 0.5470695421099663, "num_tokens": 623790.0, "step": 270 }, { "entropy": 1.3568048298358917, "epoch": 0.04104669061056952, "grad_norm": 45.25, "learning_rate": 8.179419525065964e-07, "loss": 2.3431, "mean_token_accuracy": 0.5530494973063469, "num_tokens": 647063.0, "step": 280 }, { "entropy": 1.5803349792957306, "epoch": 0.04251264384666129, "grad_norm": 27.5, "learning_rate": 8.472588683670479e-07, "loss": 2.7344, "mean_token_accuracy": 0.5154516488313675, "num_tokens": 665588.0, "step": 290 }, { "entropy": 1.5109591126441955, "epoch": 0.04397859708275306, "grad_norm": 36.75, "learning_rate": 8.765757842274994e-07, "loss": 2.6047, "mean_token_accuracy": 0.5259893089532852, "num_tokens": 688386.0, "step": 300 }, { "entropy": 1.532055252790451, "epoch": 0.045444550318844826, "grad_norm": 33.75, "learning_rate": 9.058927000879508e-07, "loss": 2.7077, "mean_token_accuracy": 0.5096044436097145, "num_tokens": 705912.0, "step": 310 }, { "entropy": 1.4697867572307586, "epoch": 0.046910503554936596, "grad_norm": 38.5, "learning_rate": 9.352096159484023e-07, "loss": 2.3843, "mean_token_accuracy": 0.5466781094670295, "num_tokens": 730427.0, "step": 320 }, { "entropy": 1.5008546561002731, "epoch": 0.048376456791028366, "grad_norm": 27.75, "learning_rate": 9.645265318088538e-07, "loss": 2.3247, "mean_token_accuracy": 0.5507321938872337, "num_tokens": 755186.0, "step": 330 }, { "entropy": 1.5204926788806916, "epoch": 0.049842410027120136, "grad_norm": 27.625, "learning_rate": 9.938434476693052e-07, "loss": 2.468, "mean_token_accuracy": 0.542216670513153, "num_tokens": 775810.0, "step": 340 }, { "entropy": 1.5719797909259796, "epoch": 0.051308363263211906, "grad_norm": 35.75, "learning_rate": 1.0231603635297568e-06, "loss": 2.4513, "mean_token_accuracy": 0.5338009864091873, "num_tokens": 796291.0, "step": 350 }, { "entropy": 1.6542179852724075, "epoch": 0.05277431649930367, "grad_norm": 33.25, "learning_rate": 1.0524772793902083e-06, "loss": 2.5515, "mean_token_accuracy": 0.5177045539021492, "num_tokens": 818376.0, "step": 360 }, { "entropy": 1.4739161610603333, "epoch": 0.05424026973539544, "grad_norm": 28.125, "learning_rate": 1.0817941952506597e-06, "loss": 2.2047, "mean_token_accuracy": 0.5619964465498924, "num_tokens": 844618.0, "step": 370 }, { "entropy": 1.530209904909134, "epoch": 0.05570622297148721, "grad_norm": 53.5, "learning_rate": 1.111111111111111e-06, "loss": 2.2735, "mean_token_accuracy": 0.5511181890964508, "num_tokens": 869679.0, "step": 380 }, { "entropy": 1.6364353120326995, "epoch": 0.05717217620757898, "grad_norm": 31.875, "learning_rate": 1.1404280269715626e-06, "loss": 2.3726, "mean_token_accuracy": 0.5421095192432404, "num_tokens": 891388.0, "step": 390 }, { "entropy": 1.472144365310669, "epoch": 0.05863812944367075, "grad_norm": 25.875, "learning_rate": 1.1697449428320142e-06, "loss": 2.1683, "mean_token_accuracy": 0.5754730254411697, "num_tokens": 916382.0, "step": 400 }, { "entropy": 1.4956263482570649, "epoch": 0.06010408267976251, "grad_norm": 26.375, "learning_rate": 1.1990618586924657e-06, "loss": 2.121, "mean_token_accuracy": 0.5730619415640831, "num_tokens": 941662.0, "step": 410 }, { "entropy": 1.559325623512268, "epoch": 0.06157003591585428, "grad_norm": 26.625, "learning_rate": 1.2283787745529171e-06, "loss": 2.2108, "mean_token_accuracy": 0.5636417225003243, "num_tokens": 963778.0, "step": 420 }, { "entropy": 1.5737532377243042, "epoch": 0.06303598915194605, "grad_norm": 46.5, "learning_rate": 1.2576956904133687e-06, "loss": 2.1995, "mean_token_accuracy": 0.5616327926516533, "num_tokens": 984767.0, "step": 430 }, { "entropy": 1.5132656216621398, "epoch": 0.06450194238803782, "grad_norm": 34.25, "learning_rate": 1.28701260627382e-06, "loss": 2.078, "mean_token_accuracy": 0.5819661915302277, "num_tokens": 1008842.0, "step": 440 }, { "entropy": 1.587599778175354, "epoch": 0.0659678956241296, "grad_norm": 23.125, "learning_rate": 1.3163295221342714e-06, "loss": 2.1448, "mean_token_accuracy": 0.5694536939263344, "num_tokens": 1032317.0, "step": 450 }, { "entropy": 1.4526239037513733, "epoch": 0.06743384886022136, "grad_norm": 19.875, "learning_rate": 1.3456464379947232e-06, "loss": 1.9653, "mean_token_accuracy": 0.5951266437768936, "num_tokens": 1056208.0, "step": 460 }, { "entropy": 1.61964390873909, "epoch": 0.06889980209631313, "grad_norm": 21.875, "learning_rate": 1.3749633538551745e-06, "loss": 2.1075, "mean_token_accuracy": 0.5658230796456337, "num_tokens": 1080876.0, "step": 470 }, { "entropy": 1.531758466362953, "epoch": 0.0703657553324049, "grad_norm": 16.625, "learning_rate": 1.404280269715626e-06, "loss": 1.9317, "mean_token_accuracy": 0.5915350079536438, "num_tokens": 1105848.0, "step": 480 }, { "entropy": 1.5876234710216521, "epoch": 0.07183170856849666, "grad_norm": 19.625, "learning_rate": 1.4335971855760775e-06, "loss": 2.0128, "mean_token_accuracy": 0.5781182110309601, "num_tokens": 1130293.0, "step": 490 }, { "entropy": 1.6668872475624084, "epoch": 0.07329766180458844, "grad_norm": 21.125, "learning_rate": 1.4629141014365288e-06, "loss": 2.0641, "mean_token_accuracy": 0.5693250834941864, "num_tokens": 1153909.0, "step": 500 }, { "entropy": 1.5780849754810333, "epoch": 0.0747636150406802, "grad_norm": 23.25, "learning_rate": 1.4922310172969806e-06, "loss": 1.8987, "mean_token_accuracy": 0.5902101844549179, "num_tokens": 1179787.0, "step": 510 }, { "entropy": 1.5983868837356567, "epoch": 0.07622956827677198, "grad_norm": 20.5, "learning_rate": 1.521547933157432e-06, "loss": 1.9921, "mean_token_accuracy": 0.5893337026238441, "num_tokens": 1201033.0, "step": 520 }, { "entropy": 1.617168813943863, "epoch": 0.07769552151286374, "grad_norm": 22.375, "learning_rate": 1.5508648490178835e-06, "loss": 1.9145, "mean_token_accuracy": 0.5817078784108162, "num_tokens": 1222140.0, "step": 530 }, { "entropy": 1.6026915192604065, "epoch": 0.0791614747489555, "grad_norm": 30.75, "learning_rate": 1.5801817648783349e-06, "loss": 2.0173, "mean_token_accuracy": 0.5873137354850769, "num_tokens": 1244451.0, "step": 540 }, { "entropy": 1.6237093031406402, "epoch": 0.08062742798504728, "grad_norm": 20.625, "learning_rate": 1.6094986807387862e-06, "loss": 1.9017, "mean_token_accuracy": 0.5904631644487381, "num_tokens": 1266282.0, "step": 550 }, { "entropy": 1.6444358587265016, "epoch": 0.08209338122113904, "grad_norm": 17.75, "learning_rate": 1.6388155965992378e-06, "loss": 1.9698, "mean_token_accuracy": 0.5805864557623863, "num_tokens": 1288708.0, "step": 560 }, { "entropy": 1.5714997112751008, "epoch": 0.08355933445723081, "grad_norm": 17.625, "learning_rate": 1.6681325124596894e-06, "loss": 1.7829, "mean_token_accuracy": 0.609296415746212, "num_tokens": 1311424.0, "step": 570 }, { "entropy": 1.7976841151714325, "epoch": 0.08502528769332258, "grad_norm": 17.125, "learning_rate": 1.697449428320141e-06, "loss": 2.075, "mean_token_accuracy": 0.5631542205810547, "num_tokens": 1331634.0, "step": 580 }, { "entropy": 1.6591939330101013, "epoch": 0.08649124092941435, "grad_norm": 16.0, "learning_rate": 1.7267663441805923e-06, "loss": 1.916, "mean_token_accuracy": 0.5866473585367202, "num_tokens": 1352372.0, "step": 590 }, { "entropy": 1.537699693441391, "epoch": 0.08795719416550613, "grad_norm": 17.875, "learning_rate": 1.7560832600410439e-06, "loss": 1.76, "mean_token_accuracy": 0.6081784337759018, "num_tokens": 1378286.0, "step": 600 }, { "entropy": 1.5544520139694213, "epoch": 0.08942314740159789, "grad_norm": 20.125, "learning_rate": 1.7854001759014952e-06, "loss": 1.7601, "mean_token_accuracy": 0.6174847334623337, "num_tokens": 1400067.0, "step": 610 }, { "entropy": 1.5727385878562927, "epoch": 0.09088910063768965, "grad_norm": 16.5, "learning_rate": 1.8147170917619466e-06, "loss": 1.7679, "mean_token_accuracy": 0.6091848641633988, "num_tokens": 1426677.0, "step": 620 }, { "entropy": 1.5909466683864593, "epoch": 0.09235505387378143, "grad_norm": 17.25, "learning_rate": 1.8440340076223984e-06, "loss": 1.7815, "mean_token_accuracy": 0.6056916370987893, "num_tokens": 1453665.0, "step": 630 }, { "entropy": 1.63122478723526, "epoch": 0.09382100710987319, "grad_norm": 16.5, "learning_rate": 1.8733509234828497e-06, "loss": 1.7457, "mean_token_accuracy": 0.6060785546898841, "num_tokens": 1476211.0, "step": 640 }, { "entropy": 1.771380114555359, "epoch": 0.09528696034596497, "grad_norm": 20.25, "learning_rate": 1.9026678393433013e-06, "loss": 1.9266, "mean_token_accuracy": 0.5840378105640411, "num_tokens": 1496913.0, "step": 650 }, { "entropy": 1.7105891525745391, "epoch": 0.09675291358205673, "grad_norm": 16.625, "learning_rate": 1.9319847552037527e-06, "loss": 1.8208, "mean_token_accuracy": 0.6001120150089264, "num_tokens": 1517071.0, "step": 660 }, { "entropy": 1.5371930003166199, "epoch": 0.0982188668181485, "grad_norm": 15.1875, "learning_rate": 1.9613016710642042e-06, "loss": 1.628, "mean_token_accuracy": 0.6270627945661544, "num_tokens": 1539297.0, "step": 670 }, { "entropy": 1.6599729835987092, "epoch": 0.09968482005424027, "grad_norm": 13.125, "learning_rate": 1.990618586924656e-06, "loss": 1.6898, "mean_token_accuracy": 0.6055290952324868, "num_tokens": 1560994.0, "step": 680 }, { "entropy": 1.6862299859523773, "epoch": 0.10115077329033204, "grad_norm": 14.9375, "learning_rate": 2.0199355027851074e-06, "loss": 1.7502, "mean_token_accuracy": 0.600650355219841, "num_tokens": 1584073.0, "step": 690 }, { "entropy": 1.5789681792259216, "epoch": 0.10261672652642381, "grad_norm": 11.9375, "learning_rate": 2.0492524186455585e-06, "loss": 1.6186, "mean_token_accuracy": 0.6283605486154556, "num_tokens": 1609721.0, "step": 700 }, { "entropy": 1.6771332025527954, "epoch": 0.10408267976251558, "grad_norm": 14.0, "learning_rate": 2.07856933450601e-06, "loss": 1.7489, "mean_token_accuracy": 0.6067024111747742, "num_tokens": 1631634.0, "step": 710 }, { "entropy": 1.6244808256626129, "epoch": 0.10554863299860734, "grad_norm": 12.9375, "learning_rate": 2.1078862503664617e-06, "loss": 1.6814, "mean_token_accuracy": 0.6089700475335121, "num_tokens": 1655238.0, "step": 720 }, { "entropy": 1.694501566886902, "epoch": 0.10701458623469912, "grad_norm": 14.25, "learning_rate": 2.137203166226913e-06, "loss": 1.7609, "mean_token_accuracy": 0.6015656799077987, "num_tokens": 1677888.0, "step": 730 }, { "entropy": 1.5577446132898332, "epoch": 0.10848053947079088, "grad_norm": 14.1875, "learning_rate": 2.166520082087365e-06, "loss": 1.623, "mean_token_accuracy": 0.6328493624925613, "num_tokens": 1702707.0, "step": 740 }, { "entropy": 1.6390549480915069, "epoch": 0.10994649270688266, "grad_norm": 12.6875, "learning_rate": 2.195836997947816e-06, "loss": 1.6941, "mean_token_accuracy": 0.6118786782026291, "num_tokens": 1722957.0, "step": 750 }, { "entropy": 1.606190675497055, "epoch": 0.11141244594297442, "grad_norm": 13.375, "learning_rate": 2.2251539138082675e-06, "loss": 1.6262, "mean_token_accuracy": 0.61300860196352, "num_tokens": 1747289.0, "step": 760 }, { "entropy": 1.558549502491951, "epoch": 0.11287839917906618, "grad_norm": 12.5625, "learning_rate": 2.254470829668719e-06, "loss": 1.5617, "mean_token_accuracy": 0.6301269710063935, "num_tokens": 1771237.0, "step": 770 }, { "entropy": 1.4628527909517288, "epoch": 0.11434435241515796, "grad_norm": 11.5625, "learning_rate": 2.2837877455291702e-06, "loss": 1.4189, "mean_token_accuracy": 0.6439291328191757, "num_tokens": 1799825.0, "step": 780 }, { "entropy": 1.6285467147827148, "epoch": 0.11581030565124972, "grad_norm": 11.5625, "learning_rate": 2.313104661389622e-06, "loss": 1.6955, "mean_token_accuracy": 0.6118112325668335, "num_tokens": 1821845.0, "step": 790 }, { "entropy": 1.5550211429595948, "epoch": 0.1172762588873415, "grad_norm": 14.4375, "learning_rate": 2.3424215772500734e-06, "loss": 1.5865, "mean_token_accuracy": 0.6240790218114853, "num_tokens": 1845183.0, "step": 800 }, { "entropy": 1.4953551054000855, "epoch": 0.11874221212343326, "grad_norm": 12.5, "learning_rate": 2.371738493110525e-06, "loss": 1.4524, "mean_token_accuracy": 0.6428318992257118, "num_tokens": 1864768.0, "step": 810 }, { "entropy": 1.6151892483234405, "epoch": 0.12020816535952503, "grad_norm": 12.875, "learning_rate": 2.4010554089709765e-06, "loss": 1.6534, "mean_token_accuracy": 0.6135767713189125, "num_tokens": 1889204.0, "step": 820 }, { "entropy": 1.5275954961776734, "epoch": 0.1216741185956168, "grad_norm": 11.625, "learning_rate": 2.4303723248314277e-06, "loss": 1.5521, "mean_token_accuracy": 0.6386136665940285, "num_tokens": 1912943.0, "step": 830 }, { "entropy": 1.5796338975429536, "epoch": 0.12314007183170857, "grad_norm": 27.0, "learning_rate": 2.4596892406918792e-06, "loss": 1.5603, "mean_token_accuracy": 0.6306994676589965, "num_tokens": 1935403.0, "step": 840 }, { "entropy": 1.437633317708969, "epoch": 0.12460602506780034, "grad_norm": 13.875, "learning_rate": 2.489006156552331e-06, "loss": 1.4204, "mean_token_accuracy": 0.6606641009449958, "num_tokens": 1958833.0, "step": 850 }, { "entropy": 1.6347171425819398, "epoch": 0.1260719783038921, "grad_norm": 28.75, "learning_rate": 2.5183230724127824e-06, "loss": 1.6724, "mean_token_accuracy": 0.6180515199899673, "num_tokens": 1978880.0, "step": 860 }, { "entropy": 1.6753376007080079, "epoch": 0.12753793153998388, "grad_norm": 15.875, "learning_rate": 2.547639988273234e-06, "loss": 1.6563, "mean_token_accuracy": 0.608917024731636, "num_tokens": 1999446.0, "step": 870 }, { "entropy": 1.589219269156456, "epoch": 0.12900388477607563, "grad_norm": 13.625, "learning_rate": 2.5769569041336855e-06, "loss": 1.5826, "mean_token_accuracy": 0.6147463411092758, "num_tokens": 2020568.0, "step": 880 }, { "entropy": 1.6183902859687804, "epoch": 0.1304698380121674, "grad_norm": 17.875, "learning_rate": 2.6062738199941367e-06, "loss": 1.6022, "mean_token_accuracy": 0.6208051472902298, "num_tokens": 2042482.0, "step": 890 }, { "entropy": 1.4896790564060212, "epoch": 0.1319357912482592, "grad_norm": 16.625, "learning_rate": 2.6355907358545887e-06, "loss": 1.5399, "mean_token_accuracy": 0.6453383922576904, "num_tokens": 2064156.0, "step": 900 }, { "entropy": 1.6016619145870208, "epoch": 0.13340174448435094, "grad_norm": 18.875, "learning_rate": 2.66490765171504e-06, "loss": 1.5616, "mean_token_accuracy": 0.6224727988243103, "num_tokens": 2084809.0, "step": 910 }, { "entropy": 1.4940084278583527, "epoch": 0.13486769772044271, "grad_norm": 11.0, "learning_rate": 2.6942245675754914e-06, "loss": 1.4698, "mean_token_accuracy": 0.6513595834374428, "num_tokens": 2106835.0, "step": 920 }, { "entropy": 1.4467491418123246, "epoch": 0.1363336509565345, "grad_norm": 12.8125, "learning_rate": 2.7235414834359425e-06, "loss": 1.4435, "mean_token_accuracy": 0.6626789718866348, "num_tokens": 2129085.0, "step": 930 }, { "entropy": 1.573822683095932, "epoch": 0.13779960419262627, "grad_norm": 21.125, "learning_rate": 2.752858399296394e-06, "loss": 1.503, "mean_token_accuracy": 0.6381863683462143, "num_tokens": 2148864.0, "step": 940 }, { "entropy": 1.5332079470157622, "epoch": 0.13926555742871802, "grad_norm": 11.125, "learning_rate": 2.782175315156846e-06, "loss": 1.5351, "mean_token_accuracy": 0.6386425122618675, "num_tokens": 2173959.0, "step": 950 }, { "entropy": 1.5113701105117798, "epoch": 0.1407315106648098, "grad_norm": 12.625, "learning_rate": 2.8114922310172972e-06, "loss": 1.4828, "mean_token_accuracy": 0.6503223985433578, "num_tokens": 2197548.0, "step": 960 }, { "entropy": 1.3686371207237245, "epoch": 0.14219746390090157, "grad_norm": 12.0625, "learning_rate": 2.840809146877749e-06, "loss": 1.275, "mean_token_accuracy": 0.6760158449411392, "num_tokens": 2222845.0, "step": 970 }, { "entropy": 1.6976830422878266, "epoch": 0.14366341713699332, "grad_norm": 18.625, "learning_rate": 2.8701260627382e-06, "loss": 1.7217, "mean_token_accuracy": 0.6109289109706879, "num_tokens": 2243214.0, "step": 980 }, { "entropy": 1.3527169078588486, "epoch": 0.1451293703730851, "grad_norm": 18.5, "learning_rate": 2.8994429785986515e-06, "loss": 1.3137, "mean_token_accuracy": 0.6857913047075271, "num_tokens": 2263776.0, "step": 990 }, { "entropy": 1.5089521944522857, "epoch": 0.14659532360917688, "grad_norm": 12.8125, "learning_rate": 2.9287598944591035e-06, "loss": 1.4509, "mean_token_accuracy": 0.6507100313901901, "num_tokens": 2285065.0, "step": 1000 }, { "entropy": 1.445404550433159, "epoch": 0.14806127684526862, "grad_norm": 21.0, "learning_rate": 2.9580768103195547e-06, "loss": 1.4105, "mean_token_accuracy": 0.6599164292216301, "num_tokens": 2306658.0, "step": 1010 }, { "entropy": 1.3777422606945038, "epoch": 0.1495272300813604, "grad_norm": 12.1875, "learning_rate": 2.9873937261800062e-06, "loss": 1.2754, "mean_token_accuracy": 0.6710947930812836, "num_tokens": 2331792.0, "step": 1020 }, { "entropy": 1.540156090259552, "epoch": 0.15099318331745218, "grad_norm": 12.8125, "learning_rate": 3.0167106420404574e-06, "loss": 1.479, "mean_token_accuracy": 0.6408717483282089, "num_tokens": 2352567.0, "step": 1030 }, { "entropy": 1.4346525013446807, "epoch": 0.15245913655354396, "grad_norm": 12.25, "learning_rate": 3.046027557900909e-06, "loss": 1.4095, "mean_token_accuracy": 0.6630053430795669, "num_tokens": 2374878.0, "step": 1040 }, { "entropy": 1.460228019952774, "epoch": 0.1539250897896357, "grad_norm": 9.625, "learning_rate": 3.075344473761361e-06, "loss": 1.3594, "mean_token_accuracy": 0.6604823380708694, "num_tokens": 2400520.0, "step": 1050 }, { "entropy": 1.4509092628955842, "epoch": 0.15539104302572748, "grad_norm": 10.0625, "learning_rate": 3.104661389621812e-06, "loss": 1.4081, "mean_token_accuracy": 0.6543880492448807, "num_tokens": 2422748.0, "step": 1060 }, { "entropy": 1.3891044706106186, "epoch": 0.15685699626181926, "grad_norm": 13.5, "learning_rate": 3.1339783054822637e-06, "loss": 1.3391, "mean_token_accuracy": 0.6686175018548965, "num_tokens": 2448162.0, "step": 1070 }, { "entropy": 1.3577708691358565, "epoch": 0.158322949497911, "grad_norm": 11.25, "learning_rate": 3.163295221342715e-06, "loss": 1.3168, "mean_token_accuracy": 0.6761578977108001, "num_tokens": 2471390.0, "step": 1080 }, { "entropy": 1.4060111403465272, "epoch": 0.15978890273400279, "grad_norm": 12.5625, "learning_rate": 3.1926121372031664e-06, "loss": 1.3215, "mean_token_accuracy": 0.6673409879207611, "num_tokens": 2495803.0, "step": 1090 }, { "entropy": 1.435472533106804, "epoch": 0.16125485597009456, "grad_norm": 12.4375, "learning_rate": 3.2219290530636175e-06, "loss": 1.3922, "mean_token_accuracy": 0.6639649510383606, "num_tokens": 2519134.0, "step": 1100 }, { "entropy": 1.5003547757863998, "epoch": 0.1627208092061863, "grad_norm": 9.3125, "learning_rate": 3.2512459689240695e-06, "loss": 1.5341, "mean_token_accuracy": 0.6417115911841392, "num_tokens": 2541378.0, "step": 1110 }, { "entropy": 1.476882266998291, "epoch": 0.1641867624422781, "grad_norm": 22.0, "learning_rate": 3.280562884784521e-06, "loss": 1.4985, "mean_token_accuracy": 0.6480431661009789, "num_tokens": 2564709.0, "step": 1120 }, { "entropy": 1.459595412015915, "epoch": 0.16565271567836987, "grad_norm": 10.25, "learning_rate": 3.3098798006449722e-06, "loss": 1.387, "mean_token_accuracy": 0.6652828186750412, "num_tokens": 2585531.0, "step": 1130 }, { "entropy": 1.3400749027729035, "epoch": 0.16711866891446162, "grad_norm": 11.75, "learning_rate": 3.339196716505424e-06, "loss": 1.3098, "mean_token_accuracy": 0.675098767876625, "num_tokens": 2607380.0, "step": 1140 }, { "entropy": 1.4286237716674806, "epoch": 0.1685846221505534, "grad_norm": 12.375, "learning_rate": 3.368513632365875e-06, "loss": 1.386, "mean_token_accuracy": 0.6669136464595795, "num_tokens": 2629621.0, "step": 1150 }, { "entropy": 1.371552586555481, "epoch": 0.17005057538664517, "grad_norm": 18.0, "learning_rate": 3.397830548226327e-06, "loss": 1.3455, "mean_token_accuracy": 0.6730488210916519, "num_tokens": 2651402.0, "step": 1160 }, { "entropy": 1.583845403790474, "epoch": 0.17151652862273695, "grad_norm": 13.375, "learning_rate": 3.4271474640867785e-06, "loss": 1.5533, "mean_token_accuracy": 0.6304793164134026, "num_tokens": 2672982.0, "step": 1170 }, { "entropy": 1.4311782240867614, "epoch": 0.1729824818588287, "grad_norm": 11.0625, "learning_rate": 3.4564643799472297e-06, "loss": 1.415, "mean_token_accuracy": 0.6627267062664032, "num_tokens": 2697467.0, "step": 1180 }, { "entropy": 1.4501012593507767, "epoch": 0.17444843509492047, "grad_norm": 11.0, "learning_rate": 3.4857812958076812e-06, "loss": 1.4147, "mean_token_accuracy": 0.6517012655735016, "num_tokens": 2718936.0, "step": 1190 }, { "entropy": 1.4287504732608796, "epoch": 0.17591438833101225, "grad_norm": 14.0625, "learning_rate": 3.5150982116681324e-06, "loss": 1.3152, "mean_token_accuracy": 0.6667961418628693, "num_tokens": 2742491.0, "step": 1200 }, { "entropy": 1.414398166537285, "epoch": 0.177380341567104, "grad_norm": 8.0625, "learning_rate": 3.5444151275285844e-06, "loss": 1.3381, "mean_token_accuracy": 0.6666110396385193, "num_tokens": 2762214.0, "step": 1210 }, { "entropy": 1.5151636064052583, "epoch": 0.17884629480319578, "grad_norm": 17.0, "learning_rate": 3.573732043389036e-06, "loss": 1.5271, "mean_token_accuracy": 0.6468397587537765, "num_tokens": 2780447.0, "step": 1220 }, { "entropy": 1.4793310970067979, "epoch": 0.18031224803928755, "grad_norm": 13.1875, "learning_rate": 3.603048959249487e-06, "loss": 1.3727, "mean_token_accuracy": 0.6537921667098999, "num_tokens": 2801292.0, "step": 1230 }, { "entropy": 1.4377906650304795, "epoch": 0.1817782012753793, "grad_norm": 15.6875, "learning_rate": 3.632365875109939e-06, "loss": 1.4214, "mean_token_accuracy": 0.661966335773468, "num_tokens": 2824463.0, "step": 1240 }, { "entropy": 1.4681696504354478, "epoch": 0.18324415451147108, "grad_norm": 12.625, "learning_rate": 3.6616827909703902e-06, "loss": 1.4416, "mean_token_accuracy": 0.6535322889685631, "num_tokens": 2850015.0, "step": 1250 }, { "entropy": 1.3105103760957717, "epoch": 0.18471010774756286, "grad_norm": 15.0625, "learning_rate": 3.690999706830842e-06, "loss": 1.2405, "mean_token_accuracy": 0.6857359945774079, "num_tokens": 2872314.0, "step": 1260 }, { "entropy": 1.5098979353904725, "epoch": 0.18617606098365463, "grad_norm": 12.8125, "learning_rate": 3.7203166226912934e-06, "loss": 1.4995, "mean_token_accuracy": 0.6460315644741058, "num_tokens": 2894239.0, "step": 1270 }, { "entropy": 1.638446143269539, "epoch": 0.18764201421974638, "grad_norm": 13.3125, "learning_rate": 3.7496335385517445e-06, "loss": 1.6752, "mean_token_accuracy": 0.6282639652490616, "num_tokens": 2914793.0, "step": 1280 }, { "entropy": 1.4174028903245925, "epoch": 0.18910796745583816, "grad_norm": 12.5, "learning_rate": 3.7789504544121965e-06, "loss": 1.3753, "mean_token_accuracy": 0.6733538925647735, "num_tokens": 2938002.0, "step": 1290 }, { "entropy": 1.5904677987098694, "epoch": 0.19057392069192994, "grad_norm": 11.9375, "learning_rate": 3.8082673702726477e-06, "loss": 1.4966, "mean_token_accuracy": 0.6390533030033112, "num_tokens": 2958818.0, "step": 1300 }, { "entropy": 1.5106680780649184, "epoch": 0.1920398739280217, "grad_norm": 12.9375, "learning_rate": 3.837584286133099e-06, "loss": 1.4381, "mean_token_accuracy": 0.6525804027915001, "num_tokens": 2980468.0, "step": 1310 }, { "entropy": 1.3735601633787156, "epoch": 0.19350582716411346, "grad_norm": 11.25, "learning_rate": 3.86690120199355e-06, "loss": 1.2235, "mean_token_accuracy": 0.6740753024816513, "num_tokens": 3003640.0, "step": 1320 }, { "entropy": 1.4450361728668213, "epoch": 0.19497178040020524, "grad_norm": 17.125, "learning_rate": 3.896218117854002e-06, "loss": 1.4157, "mean_token_accuracy": 0.6606000870466232, "num_tokens": 3029363.0, "step": 1330 }, { "entropy": 1.5754990696907043, "epoch": 0.196437733636297, "grad_norm": 13.5625, "learning_rate": 3.925535033714454e-06, "loss": 1.5181, "mean_token_accuracy": 0.6405826091766358, "num_tokens": 3048047.0, "step": 1340 }, { "entropy": 1.4792763262987136, "epoch": 0.19790368687238877, "grad_norm": 10.4375, "learning_rate": 3.954851949574905e-06, "loss": 1.4488, "mean_token_accuracy": 0.6574035003781319, "num_tokens": 3072367.0, "step": 1350 }, { "entropy": 1.563313552737236, "epoch": 0.19936964010848054, "grad_norm": 10.3125, "learning_rate": 3.984168865435356e-06, "loss": 1.5975, "mean_token_accuracy": 0.6384479865431786, "num_tokens": 3095630.0, "step": 1360 }, { "entropy": 1.4458220809698106, "epoch": 0.20083559334457232, "grad_norm": 10.25, "learning_rate": 4.013485781295807e-06, "loss": 1.4141, "mean_token_accuracy": 0.6639861673116684, "num_tokens": 3113286.0, "step": 1370 }, { "entropy": 1.3954906791448594, "epoch": 0.20230154658066407, "grad_norm": 11.0, "learning_rate": 4.042802697156259e-06, "loss": 1.3492, "mean_token_accuracy": 0.6654444336891174, "num_tokens": 3134431.0, "step": 1380 }, { "entropy": 1.350418469309807, "epoch": 0.20376749981675585, "grad_norm": 14.5625, "learning_rate": 4.072119613016711e-06, "loss": 1.2861, "mean_token_accuracy": 0.6840535581111908, "num_tokens": 3159201.0, "step": 1390 }, { "entropy": 1.4183424681425094, "epoch": 0.20523345305284763, "grad_norm": 11.1875, "learning_rate": 4.1014365288771625e-06, "loss": 1.3941, "mean_token_accuracy": 0.6627218261361122, "num_tokens": 3183088.0, "step": 1400 }, { "entropy": 1.5059909731149674, "epoch": 0.20669940628893937, "grad_norm": 10.75, "learning_rate": 4.130753444737614e-06, "loss": 1.4825, "mean_token_accuracy": 0.6509411707520485, "num_tokens": 3206681.0, "step": 1410 }, { "entropy": 1.400975489616394, "epoch": 0.20816535952503115, "grad_norm": 8.3125, "learning_rate": 4.160070360598066e-06, "loss": 1.3375, "mean_token_accuracy": 0.6668096125125885, "num_tokens": 3233837.0, "step": 1420 }, { "entropy": 1.3380768537521361, "epoch": 0.20963131276112293, "grad_norm": 9.4375, "learning_rate": 4.189387276458517e-06, "loss": 1.342, "mean_token_accuracy": 0.6786920413374901, "num_tokens": 3260590.0, "step": 1430 }, { "entropy": 1.370729798078537, "epoch": 0.21109726599721468, "grad_norm": 10.125, "learning_rate": 4.218704192318969e-06, "loss": 1.2714, "mean_token_accuracy": 0.6729479432106018, "num_tokens": 3283604.0, "step": 1440 }, { "entropy": 1.2814150273799896, "epoch": 0.21256321923330646, "grad_norm": 9.25, "learning_rate": 4.24802110817942e-06, "loss": 1.1725, "mean_token_accuracy": 0.6896132916212082, "num_tokens": 3307007.0, "step": 1450 }, { "entropy": 1.2631165951490402, "epoch": 0.21402917246939823, "grad_norm": 6.625, "learning_rate": 4.277338024039872e-06, "loss": 1.2504, "mean_token_accuracy": 0.6873376846313477, "num_tokens": 3335504.0, "step": 1460 }, { "entropy": 1.3466918468475342, "epoch": 0.21549512570548998, "grad_norm": 15.4375, "learning_rate": 4.306654939900323e-06, "loss": 1.3461, "mean_token_accuracy": 0.6790317863225936, "num_tokens": 3363066.0, "step": 1470 }, { "entropy": 1.4266670495271683, "epoch": 0.21696107894158176, "grad_norm": 10.0, "learning_rate": 4.335971855760774e-06, "loss": 1.4188, "mean_token_accuracy": 0.6638744980096817, "num_tokens": 3385709.0, "step": 1480 }, { "entropy": 1.439681026339531, "epoch": 0.21842703217767354, "grad_norm": 11.25, "learning_rate": 4.365288771621225e-06, "loss": 1.4426, "mean_token_accuracy": 0.6633105099201202, "num_tokens": 3405458.0, "step": 1490 }, { "entropy": 1.541125413775444, "epoch": 0.2198929854137653, "grad_norm": 13.125, "learning_rate": 4.394605687481677e-06, "loss": 1.5226, "mean_token_accuracy": 0.6459307968616486, "num_tokens": 3427573.0, "step": 1500 }, { "entropy": 1.3419945538043976, "epoch": 0.22135893864985706, "grad_norm": 19.25, "learning_rate": 4.423922603342129e-06, "loss": 1.299, "mean_token_accuracy": 0.6787095963954926, "num_tokens": 3448857.0, "step": 1510 }, { "entropy": 1.3236747890710832, "epoch": 0.22282489188594884, "grad_norm": 8.375, "learning_rate": 4.4532395192025805e-06, "loss": 1.3328, "mean_token_accuracy": 0.6853064984083176, "num_tokens": 3472983.0, "step": 1520 }, { "entropy": 1.6034924060106277, "epoch": 0.22429084512204062, "grad_norm": 13.1875, "learning_rate": 4.482556435063032e-06, "loss": 1.5613, "mean_token_accuracy": 0.6266369432210922, "num_tokens": 3489190.0, "step": 1530 }, { "entropy": 1.4963504016399383, "epoch": 0.22575679835813237, "grad_norm": 12.8125, "learning_rate": 4.511873350923483e-06, "loss": 1.4337, "mean_token_accuracy": 0.646208031475544, "num_tokens": 3513051.0, "step": 1540 }, { "entropy": 1.3567855179309845, "epoch": 0.22722275159422414, "grad_norm": 11.3125, "learning_rate": 4.541190266783935e-06, "loss": 1.2943, "mean_token_accuracy": 0.6742360711097717, "num_tokens": 3536325.0, "step": 1550 }, { "entropy": 1.5569882273674012, "epoch": 0.22868870483031592, "grad_norm": 9.1875, "learning_rate": 4.570507182644387e-06, "loss": 1.5182, "mean_token_accuracy": 0.6443121343851089, "num_tokens": 3560337.0, "step": 1560 }, { "entropy": 1.5524468928575517, "epoch": 0.23015465806640767, "grad_norm": 14.0, "learning_rate": 4.599824098504838e-06, "loss": 1.4483, "mean_token_accuracy": 0.6481706693768501, "num_tokens": 3582278.0, "step": 1570 }, { "entropy": 1.37132228910923, "epoch": 0.23162061130249945, "grad_norm": 10.9375, "learning_rate": 4.629141014365289e-06, "loss": 1.3257, "mean_token_accuracy": 0.67001414000988, "num_tokens": 3606511.0, "step": 1580 }, { "entropy": 1.4464658796787262, "epoch": 0.23308656453859122, "grad_norm": 10.3125, "learning_rate": 4.65845793022574e-06, "loss": 1.4243, "mean_token_accuracy": 0.6601900517940521, "num_tokens": 3630364.0, "step": 1590 }, { "entropy": 1.428574013710022, "epoch": 0.234552517774683, "grad_norm": 12.125, "learning_rate": 4.687774846086192e-06, "loss": 1.4377, "mean_token_accuracy": 0.6583260059356689, "num_tokens": 3652833.0, "step": 1600 }, { "entropy": 1.5038713723421098, "epoch": 0.23601847101077475, "grad_norm": 12.5625, "learning_rate": 4.717091761946644e-06, "loss": 1.4146, "mean_token_accuracy": 0.6502936854958534, "num_tokens": 3672844.0, "step": 1610 }, { "entropy": 1.4725337892770767, "epoch": 0.23748442424686653, "grad_norm": 11.875, "learning_rate": 4.746408677807095e-06, "loss": 1.4437, "mean_token_accuracy": 0.6512446641921997, "num_tokens": 3692895.0, "step": 1620 }, { "entropy": 1.249815246462822, "epoch": 0.2389503774829583, "grad_norm": 10.375, "learning_rate": 4.7757255936675465e-06, "loss": 1.2483, "mean_token_accuracy": 0.6981709629297257, "num_tokens": 3720243.0, "step": 1630 }, { "entropy": 1.5299287259578704, "epoch": 0.24041633071905005, "grad_norm": 11.4375, "learning_rate": 4.805042509527998e-06, "loss": 1.4797, "mean_token_accuracy": 0.6476666897535324, "num_tokens": 3742574.0, "step": 1640 }, { "entropy": 1.3898545235395432, "epoch": 0.24188228395514183, "grad_norm": 9.0, "learning_rate": 4.83435942538845e-06, "loss": 1.3501, "mean_token_accuracy": 0.6770107418298721, "num_tokens": 3765361.0, "step": 1650 }, { "entropy": 1.4818656116724014, "epoch": 0.2433482371912336, "grad_norm": 11.9375, "learning_rate": 4.863676341248901e-06, "loss": 1.3938, "mean_token_accuracy": 0.6479715466499328, "num_tokens": 3786669.0, "step": 1660 }, { "entropy": 1.4298436284065246, "epoch": 0.24481419042732536, "grad_norm": 9.5, "learning_rate": 4.892993257109353e-06, "loss": 1.4092, "mean_token_accuracy": 0.6594713777303696, "num_tokens": 3811641.0, "step": 1670 }, { "entropy": 1.4295513659715653, "epoch": 0.24628014366341713, "grad_norm": 14.75, "learning_rate": 4.922310172969804e-06, "loss": 1.327, "mean_token_accuracy": 0.6693428784608841, "num_tokens": 3833903.0, "step": 1680 }, { "entropy": 1.5428931266069412, "epoch": 0.2477460968995089, "grad_norm": 15.125, "learning_rate": 4.951627088830255e-06, "loss": 1.4967, "mean_token_accuracy": 0.643545514345169, "num_tokens": 3853516.0, "step": 1690 }, { "entropy": 1.4570579379796982, "epoch": 0.2492120501356007, "grad_norm": 18.875, "learning_rate": 4.980944004690707e-06, "loss": 1.4445, "mean_token_accuracy": 0.6521989732980729, "num_tokens": 3873041.0, "step": 1700 }, { "entropy": 1.4962241500616074, "epoch": 0.25067800337169244, "grad_norm": 9.125, "learning_rate": 5.010260920551159e-06, "loss": 1.4278, "mean_token_accuracy": 0.6531152218580246, "num_tokens": 3896338.0, "step": 1710 }, { "entropy": 1.4755153954029083, "epoch": 0.2521439566077842, "grad_norm": 13.4375, "learning_rate": 5.03957783641161e-06, "loss": 1.4487, "mean_token_accuracy": 0.6545021861791611, "num_tokens": 3916807.0, "step": 1720 }, { "entropy": 1.4066698879003525, "epoch": 0.253609909843876, "grad_norm": 15.625, "learning_rate": 5.068894752272061e-06, "loss": 1.362, "mean_token_accuracy": 0.6666294753551483, "num_tokens": 3940866.0, "step": 1730 }, { "entropy": 1.4855057448148727, "epoch": 0.25507586307996777, "grad_norm": 11.4375, "learning_rate": 5.098211668132513e-06, "loss": 1.3966, "mean_token_accuracy": 0.6568978816270828, "num_tokens": 3963635.0, "step": 1740 }, { "entropy": 1.175552523136139, "epoch": 0.2565418163160595, "grad_norm": 9.125, "learning_rate": 5.1275285839929645e-06, "loss": 1.1429, "mean_token_accuracy": 0.7088612884283065, "num_tokens": 3991109.0, "step": 1750 }, { "entropy": 1.4013624697923661, "epoch": 0.25800776955215127, "grad_norm": 14.625, "learning_rate": 5.156845499853416e-06, "loss": 1.355, "mean_token_accuracy": 0.6654776930809021, "num_tokens": 4012857.0, "step": 1760 }, { "entropy": 1.4266999572515489, "epoch": 0.25947372278824304, "grad_norm": 8.9375, "learning_rate": 5.186162415713867e-06, "loss": 1.3336, "mean_token_accuracy": 0.6645713597536087, "num_tokens": 4037017.0, "step": 1770 }, { "entropy": 1.4376948952674866, "epoch": 0.2609396760243348, "grad_norm": 9.625, "learning_rate": 5.215479331574319e-06, "loss": 1.457, "mean_token_accuracy": 0.6551876842975617, "num_tokens": 4058610.0, "step": 1780 }, { "entropy": 1.338196623325348, "epoch": 0.2624056292604266, "grad_norm": 11.75, "learning_rate": 5.24479624743477e-06, "loss": 1.294, "mean_token_accuracy": 0.6808309674263, "num_tokens": 4083727.0, "step": 1790 }, { "entropy": 1.5024945437908173, "epoch": 0.2638715824965184, "grad_norm": 14.0625, "learning_rate": 5.274113163295221e-06, "loss": 1.4394, "mean_token_accuracy": 0.6480053931474685, "num_tokens": 4104521.0, "step": 1800 }, { "entropy": 1.3683723241090775, "epoch": 0.26533753573261015, "grad_norm": 8.3125, "learning_rate": 5.303430079155674e-06, "loss": 1.3347, "mean_token_accuracy": 0.672050766646862, "num_tokens": 4127857.0, "step": 1810 }, { "entropy": 1.3613164305686951, "epoch": 0.2668034889687019, "grad_norm": 16.5, "learning_rate": 5.332746995016125e-06, "loss": 1.2927, "mean_token_accuracy": 0.6744700565934181, "num_tokens": 4148241.0, "step": 1820 }, { "entropy": 1.3201717555522918, "epoch": 0.26826944220479365, "grad_norm": 9.25, "learning_rate": 5.362063910876576e-06, "loss": 1.3239, "mean_token_accuracy": 0.6788221895694733, "num_tokens": 4175182.0, "step": 1830 }, { "entropy": 1.2526254057884216, "epoch": 0.26973539544088543, "grad_norm": 8.1875, "learning_rate": 5.391380826737028e-06, "loss": 1.2323, "mean_token_accuracy": 0.692057853937149, "num_tokens": 4198566.0, "step": 1840 }, { "entropy": 1.5186035931110382, "epoch": 0.2712013486769772, "grad_norm": 13.625, "learning_rate": 5.420697742597479e-06, "loss": 1.4549, "mean_token_accuracy": 0.6479292884469032, "num_tokens": 4220532.0, "step": 1850 }, { "entropy": 1.3593205004930495, "epoch": 0.272667301913069, "grad_norm": 12.375, "learning_rate": 5.4500146584579305e-06, "loss": 1.3513, "mean_token_accuracy": 0.6758809417486191, "num_tokens": 4245180.0, "step": 1860 }, { "entropy": 1.3126778870820999, "epoch": 0.27413325514916076, "grad_norm": 12.5, "learning_rate": 5.479331574318382e-06, "loss": 1.2377, "mean_token_accuracy": 0.6879893749952316, "num_tokens": 4270290.0, "step": 1870 }, { "entropy": 1.2494087874889375, "epoch": 0.27559920838525254, "grad_norm": 19.5, "learning_rate": 5.508648490178834e-06, "loss": 1.1609, "mean_token_accuracy": 0.7002261340618133, "num_tokens": 4293313.0, "step": 1880 }, { "entropy": 1.2791531324386596, "epoch": 0.27706516162134426, "grad_norm": 9.9375, "learning_rate": 5.537965406039285e-06, "loss": 1.2643, "mean_token_accuracy": 0.683156219124794, "num_tokens": 4320401.0, "step": 1890 }, { "entropy": 1.3878503799438477, "epoch": 0.27853111485743604, "grad_norm": 8.0625, "learning_rate": 5.567282321899736e-06, "loss": 1.3461, "mean_token_accuracy": 0.6716514348983764, "num_tokens": 4344803.0, "step": 1900 }, { "entropy": 1.3926512867212295, "epoch": 0.2799970680935278, "grad_norm": 11.875, "learning_rate": 5.596599237760189e-06, "loss": 1.3538, "mean_token_accuracy": 0.6671912133693695, "num_tokens": 4366175.0, "step": 1910 }, { "entropy": 1.5550675630569457, "epoch": 0.2814630213296196, "grad_norm": 10.5, "learning_rate": 5.62591615362064e-06, "loss": 1.5084, "mean_token_accuracy": 0.6427816599607468, "num_tokens": 4386094.0, "step": 1920 }, { "entropy": 1.6685091495513915, "epoch": 0.28292897456571137, "grad_norm": 10.125, "learning_rate": 5.655233069481091e-06, "loss": 1.6356, "mean_token_accuracy": 0.6160645365715027, "num_tokens": 4408421.0, "step": 1930 }, { "entropy": 1.303581526875496, "epoch": 0.28439492780180314, "grad_norm": 14.625, "learning_rate": 5.684549985341542e-06, "loss": 1.2727, "mean_token_accuracy": 0.6877759307622909, "num_tokens": 4430840.0, "step": 1940 }, { "entropy": 1.531065183877945, "epoch": 0.28586088103789487, "grad_norm": 14.75, "learning_rate": 5.713866901201994e-06, "loss": 1.4728, "mean_token_accuracy": 0.6416442111134529, "num_tokens": 4454929.0, "step": 1950 }, { "entropy": 1.3429428875446319, "epoch": 0.28732683427398664, "grad_norm": 8.25, "learning_rate": 5.743183817062445e-06, "loss": 1.2201, "mean_token_accuracy": 0.6840098291635514, "num_tokens": 4477667.0, "step": 1960 }, { "entropy": 1.3267211109399795, "epoch": 0.2887927875100784, "grad_norm": 10.125, "learning_rate": 5.7725007329228965e-06, "loss": 1.2565, "mean_token_accuracy": 0.6775642424821854, "num_tokens": 4504109.0, "step": 1970 }, { "entropy": 1.4738714158535005, "epoch": 0.2902587407461702, "grad_norm": 12.1875, "learning_rate": 5.8018176487833485e-06, "loss": 1.3657, "mean_token_accuracy": 0.660081484913826, "num_tokens": 4526106.0, "step": 1980 }, { "entropy": 1.3610664933919907, "epoch": 0.291724693982262, "grad_norm": 9.125, "learning_rate": 5.8311345646438e-06, "loss": 1.2823, "mean_token_accuracy": 0.681380957365036, "num_tokens": 4550877.0, "step": 1990 }, { "entropy": 1.5325461506843567, "epoch": 0.29319064721835375, "grad_norm": 9.3125, "learning_rate": 5.860451480504251e-06, "loss": 1.501, "mean_token_accuracy": 0.6400662630796432, "num_tokens": 4568424.0, "step": 2000 }, { "entropy": 1.565615475177765, "epoch": 0.2946566004544455, "grad_norm": 13.5625, "learning_rate": 5.889768396364704e-06, "loss": 1.5322, "mean_token_accuracy": 0.6377306476235389, "num_tokens": 4591699.0, "step": 2010 }, { "entropy": 1.2854655027389525, "epoch": 0.29612255369053725, "grad_norm": 11.125, "learning_rate": 5.919085312225155e-06, "loss": 1.2507, "mean_token_accuracy": 0.6860924959182739, "num_tokens": 4615806.0, "step": 2020 }, { "entropy": 1.3244401901960372, "epoch": 0.297588506926629, "grad_norm": 11.0, "learning_rate": 5.948402228085606e-06, "loss": 1.3472, "mean_token_accuracy": 0.6783598840236664, "num_tokens": 4643546.0, "step": 2030 }, { "entropy": 1.2965664833784103, "epoch": 0.2990544601627208, "grad_norm": 7.28125, "learning_rate": 5.977719143946057e-06, "loss": 1.2834, "mean_token_accuracy": 0.6888355523347854, "num_tokens": 4666118.0, "step": 2040 }, { "entropy": 1.5594497203826905, "epoch": 0.3005204133988126, "grad_norm": 13.125, "learning_rate": 6.007036059806509e-06, "loss": 1.5805, "mean_token_accuracy": 0.6323002442717552, "num_tokens": 4687757.0, "step": 2050 }, { "entropy": 1.3731900483369828, "epoch": 0.30198636663490436, "grad_norm": 9.1875, "learning_rate": 6.03635297566696e-06, "loss": 1.3202, "mean_token_accuracy": 0.6745158672332764, "num_tokens": 4709699.0, "step": 2060 }, { "entropy": 1.4218753784894944, "epoch": 0.30345231987099613, "grad_norm": 8.75, "learning_rate": 6.065669891527411e-06, "loss": 1.3493, "mean_token_accuracy": 0.6633736461400985, "num_tokens": 4732227.0, "step": 2070 }, { "entropy": 1.243794071674347, "epoch": 0.3049182731070879, "grad_norm": 8.8125, "learning_rate": 6.094986807387863e-06, "loss": 1.1992, "mean_token_accuracy": 0.6924352079629899, "num_tokens": 4759451.0, "step": 2080 }, { "entropy": 1.4628839135169982, "epoch": 0.30638422634317963, "grad_norm": 10.6875, "learning_rate": 6.1243037232483145e-06, "loss": 1.4276, "mean_token_accuracy": 0.6591650605201721, "num_tokens": 4781258.0, "step": 2090 }, { "entropy": 1.4806728333234787, "epoch": 0.3078501795792714, "grad_norm": 8.4375, "learning_rate": 6.153620639108766e-06, "loss": 1.4476, "mean_token_accuracy": 0.645695923268795, "num_tokens": 4805586.0, "step": 2100 }, { "entropy": 1.5901213318109513, "epoch": 0.3093161328153632, "grad_norm": 8.3125, "learning_rate": 6.182937554969217e-06, "loss": 1.5282, "mean_token_accuracy": 0.6334371268749237, "num_tokens": 4827973.0, "step": 2110 }, { "entropy": 1.3013278484344482, "epoch": 0.31078208605145496, "grad_norm": 9.375, "learning_rate": 6.21225447082967e-06, "loss": 1.2087, "mean_token_accuracy": 0.6880233675241471, "num_tokens": 4851604.0, "step": 2120 }, { "entropy": 1.5176890075206757, "epoch": 0.31224803928754674, "grad_norm": 10.6875, "learning_rate": 6.241571386690121e-06, "loss": 1.425, "mean_token_accuracy": 0.6450367778539657, "num_tokens": 4873190.0, "step": 2130 }, { "entropy": 1.5715102404356003, "epoch": 0.3137139925236385, "grad_norm": 10.0625, "learning_rate": 6.270888302550572e-06, "loss": 1.435, "mean_token_accuracy": 0.6415366128087043, "num_tokens": 4895441.0, "step": 2140 }, { "entropy": 1.2207586884498596, "epoch": 0.31517994575973024, "grad_norm": 8.75, "learning_rate": 6.300205218411024e-06, "loss": 1.1511, "mean_token_accuracy": 0.7048605740070343, "num_tokens": 4922343.0, "step": 2150 }, { "entropy": 1.3783370494842528, "epoch": 0.316645898995822, "grad_norm": 15.5625, "learning_rate": 6.329522134271475e-06, "loss": 1.2895, "mean_token_accuracy": 0.6680753290653229, "num_tokens": 4942476.0, "step": 2160 }, { "entropy": 1.4115062803030014, "epoch": 0.3181118522319138, "grad_norm": 8.125, "learning_rate": 6.358839050131926e-06, "loss": 1.3637, "mean_token_accuracy": 0.671144588291645, "num_tokens": 4964489.0, "step": 2170 }, { "entropy": 1.2501509219408036, "epoch": 0.31957780546800557, "grad_norm": 8.6875, "learning_rate": 6.388155965992379e-06, "loss": 1.1558, "mean_token_accuracy": 0.6976560860872268, "num_tokens": 4987463.0, "step": 2180 }, { "entropy": 1.3064077973365784, "epoch": 0.32104375870409735, "grad_norm": 11.125, "learning_rate": 6.41747288185283e-06, "loss": 1.3451, "mean_token_accuracy": 0.6780828326940537, "num_tokens": 5013157.0, "step": 2190 }, { "entropy": 1.3485292732715606, "epoch": 0.3225097119401891, "grad_norm": 16.25, "learning_rate": 6.446789797713281e-06, "loss": 1.2518, "mean_token_accuracy": 0.681101170182228, "num_tokens": 5037714.0, "step": 2200 }, { "entropy": 1.592606782913208, "epoch": 0.3239756651762809, "grad_norm": 15.1875, "learning_rate": 6.4761067135737325e-06, "loss": 1.5263, "mean_token_accuracy": 0.6402951762080192, "num_tokens": 5056229.0, "step": 2210 }, { "entropy": 1.370220559835434, "epoch": 0.3254416184123726, "grad_norm": 14.1875, "learning_rate": 6.5054236294341845e-06, "loss": 1.3102, "mean_token_accuracy": 0.6732840865850449, "num_tokens": 5080212.0, "step": 2220 }, { "entropy": 1.3090024828910827, "epoch": 0.3269075716484644, "grad_norm": 7.125, "learning_rate": 6.534740545294636e-06, "loss": 1.2337, "mean_token_accuracy": 0.681540596485138, "num_tokens": 5103390.0, "step": 2230 }, { "entropy": 1.3017589330673218, "epoch": 0.3283735248845562, "grad_norm": 10.4375, "learning_rate": 6.564057461155087e-06, "loss": 1.2242, "mean_token_accuracy": 0.6802477091550827, "num_tokens": 5128682.0, "step": 2240 }, { "entropy": 1.2710354179143906, "epoch": 0.32983947812064796, "grad_norm": 14.5, "learning_rate": 6.593374377015539e-06, "loss": 1.2527, "mean_token_accuracy": 0.6948857232928276, "num_tokens": 5151421.0, "step": 2250 }, { "entropy": 1.4683438628911971, "epoch": 0.33130543135673973, "grad_norm": 9.125, "learning_rate": 6.62269129287599e-06, "loss": 1.4213, "mean_token_accuracy": 0.6486032575368881, "num_tokens": 5176577.0, "step": 2260 }, { "entropy": 1.5119109272956848, "epoch": 0.3327713845928315, "grad_norm": 13.5625, "learning_rate": 6.652008208736441e-06, "loss": 1.4903, "mean_token_accuracy": 0.6536808118224144, "num_tokens": 5197361.0, "step": 2270 }, { "entropy": 1.416535660624504, "epoch": 0.33423733782892323, "grad_norm": 13.125, "learning_rate": 6.681325124596892e-06, "loss": 1.3774, "mean_token_accuracy": 0.6650669008493424, "num_tokens": 5217288.0, "step": 2280 }, { "entropy": 1.2551448613405227, "epoch": 0.335703291065015, "grad_norm": 8.8125, "learning_rate": 6.710642040457345e-06, "loss": 1.1734, "mean_token_accuracy": 0.6975555554032326, "num_tokens": 5242781.0, "step": 2290 }, { "entropy": 1.3618778496980668, "epoch": 0.3371692443011068, "grad_norm": 10.375, "learning_rate": 6.739958956317796e-06, "loss": 1.3187, "mean_token_accuracy": 0.6728916734457016, "num_tokens": 5265573.0, "step": 2300 }, { "entropy": 1.4666042029857635, "epoch": 0.33863519753719856, "grad_norm": 13.1875, "learning_rate": 6.769275872178247e-06, "loss": 1.4306, "mean_token_accuracy": 0.6520096153020859, "num_tokens": 5290014.0, "step": 2310 }, { "entropy": 1.4294601947069168, "epoch": 0.34010115077329034, "grad_norm": 9.9375, "learning_rate": 6.798592788038699e-06, "loss": 1.436, "mean_token_accuracy": 0.6547486394643783, "num_tokens": 5312857.0, "step": 2320 }, { "entropy": 1.321608343720436, "epoch": 0.3415671040093821, "grad_norm": 14.5, "learning_rate": 6.8279097038991505e-06, "loss": 1.2187, "mean_token_accuracy": 0.6796143531799317, "num_tokens": 5337547.0, "step": 2330 }, { "entropy": 1.2580893576145171, "epoch": 0.3430330572454739, "grad_norm": 6.46875, "learning_rate": 6.857226619759602e-06, "loss": 1.2051, "mean_token_accuracy": 0.7009197235107422, "num_tokens": 5360923.0, "step": 2340 }, { "entropy": 1.369808241724968, "epoch": 0.3444990104815656, "grad_norm": 9.5625, "learning_rate": 6.886543535620054e-06, "loss": 1.357, "mean_token_accuracy": 0.6785596668720245, "num_tokens": 5384907.0, "step": 2350 }, { "entropy": 1.2706761926412582, "epoch": 0.3459649637176574, "grad_norm": 10.1875, "learning_rate": 6.915860451480505e-06, "loss": 1.2172, "mean_token_accuracy": 0.6966419219970703, "num_tokens": 5410078.0, "step": 2360 }, { "entropy": 1.4131956428289414, "epoch": 0.34743091695374917, "grad_norm": 16.0, "learning_rate": 6.945177367340956e-06, "loss": 1.3969, "mean_token_accuracy": 0.6575319677591324, "num_tokens": 5430326.0, "step": 2370 }, { "entropy": 1.4759588658809661, "epoch": 0.34889687018984095, "grad_norm": 9.3125, "learning_rate": 6.974494283201407e-06, "loss": 1.4601, "mean_token_accuracy": 0.6538081020116806, "num_tokens": 5450493.0, "step": 2380 }, { "entropy": 1.37843516767025, "epoch": 0.3503628234259327, "grad_norm": 6.21875, "learning_rate": 7.00381119906186e-06, "loss": 1.3211, "mean_token_accuracy": 0.6726105093955994, "num_tokens": 5475682.0, "step": 2390 }, { "entropy": 1.4322671592235565, "epoch": 0.3518287766620245, "grad_norm": 7.5625, "learning_rate": 7.033128114922311e-06, "loss": 1.3549, "mean_token_accuracy": 0.6671777933835983, "num_tokens": 5498833.0, "step": 2400 }, { "entropy": 1.3670723915100098, "epoch": 0.3532947298981163, "grad_norm": 8.8125, "learning_rate": 7.062445030782762e-06, "loss": 1.2799, "mean_token_accuracy": 0.6725857436656952, "num_tokens": 5523234.0, "step": 2410 }, { "entropy": 1.4626886814832687, "epoch": 0.354760683134208, "grad_norm": 10.875, "learning_rate": 7.091761946643214e-06, "loss": 1.3895, "mean_token_accuracy": 0.6597592890262604, "num_tokens": 5544358.0, "step": 2420 }, { "entropy": 1.4590036869049072, "epoch": 0.3562266363702998, "grad_norm": 16.625, "learning_rate": 7.121078862503665e-06, "loss": 1.3711, "mean_token_accuracy": 0.6601185649633408, "num_tokens": 5565119.0, "step": 2430 }, { "entropy": 1.5312156975269318, "epoch": 0.35769258960639155, "grad_norm": 11.125, "learning_rate": 7.1503957783641165e-06, "loss": 1.5326, "mean_token_accuracy": 0.6431695997714997, "num_tokens": 5586354.0, "step": 2440 }, { "entropy": 1.3656319469213485, "epoch": 0.35915854284248333, "grad_norm": 13.875, "learning_rate": 7.179712694224568e-06, "loss": 1.3028, "mean_token_accuracy": 0.6743479192256927, "num_tokens": 5605436.0, "step": 2450 }, { "entropy": 1.4667850643396378, "epoch": 0.3606244960785751, "grad_norm": 10.0625, "learning_rate": 7.20902961008502e-06, "loss": 1.4854, "mean_token_accuracy": 0.6515883877873421, "num_tokens": 5628075.0, "step": 2460 }, { "entropy": 1.3122776836156844, "epoch": 0.3620904493146669, "grad_norm": 7.75, "learning_rate": 7.238346525945471e-06, "loss": 1.3152, "mean_token_accuracy": 0.6882083877921105, "num_tokens": 5651367.0, "step": 2470 }, { "entropy": 1.3628319799900055, "epoch": 0.3635564025507586, "grad_norm": 9.5, "learning_rate": 7.267663441805922e-06, "loss": 1.299, "mean_token_accuracy": 0.6731520593166351, "num_tokens": 5671991.0, "step": 2480 }, { "entropy": 1.4089546024799346, "epoch": 0.3650223557868504, "grad_norm": 13.0625, "learning_rate": 7.296980357666375e-06, "loss": 1.4373, "mean_token_accuracy": 0.6652038216590881, "num_tokens": 5696118.0, "step": 2490 }, { "entropy": 1.265910941362381, "epoch": 0.36648830902294216, "grad_norm": 7.5, "learning_rate": 7.326297273526826e-06, "loss": 1.2361, "mean_token_accuracy": 0.6960048288106918, "num_tokens": 5720389.0, "step": 2500 }, { "entropy": 1.3069349378347397, "epoch": 0.36795426225903394, "grad_norm": 7.4375, "learning_rate": 7.355614189387277e-06, "loss": 1.2604, "mean_token_accuracy": 0.684259757399559, "num_tokens": 5742075.0, "step": 2510 }, { "entropy": 1.422082006931305, "epoch": 0.3694202154951257, "grad_norm": 9.4375, "learning_rate": 7.384931105247729e-06, "loss": 1.3829, "mean_token_accuracy": 0.6625893890857697, "num_tokens": 5764426.0, "step": 2520 }, { "entropy": 1.3968217372894287, "epoch": 0.3708861687312175, "grad_norm": 10.25, "learning_rate": 7.41424802110818e-06, "loss": 1.3119, "mean_token_accuracy": 0.6745562255382538, "num_tokens": 5784214.0, "step": 2530 }, { "entropy": 1.3953731521964072, "epoch": 0.37235212196730927, "grad_norm": 15.375, "learning_rate": 7.443564936968631e-06, "loss": 1.3657, "mean_token_accuracy": 0.674965164065361, "num_tokens": 5807170.0, "step": 2540 }, { "entropy": 1.3669428497552871, "epoch": 0.373818075203401, "grad_norm": 8.9375, "learning_rate": 7.4728818528290825e-06, "loss": 1.3142, "mean_token_accuracy": 0.6806528955698014, "num_tokens": 5827640.0, "step": 2550 }, { "entropy": 1.330240750312805, "epoch": 0.37528402843949277, "grad_norm": 7.21875, "learning_rate": 7.5021987686895345e-06, "loss": 1.251, "mean_token_accuracy": 0.6788307785987854, "num_tokens": 5852584.0, "step": 2560 }, { "entropy": 1.1778131604194642, "epoch": 0.37674998167558454, "grad_norm": 10.625, "learning_rate": 7.531515684549986e-06, "loss": 1.1543, "mean_token_accuracy": 0.7008629769086838, "num_tokens": 5877535.0, "step": 2570 }, { "entropy": 1.4141887217760085, "epoch": 0.3782159349116763, "grad_norm": 9.9375, "learning_rate": 7.560832600410437e-06, "loss": 1.3914, "mean_token_accuracy": 0.6619095459580422, "num_tokens": 5898570.0, "step": 2580 }, { "entropy": 1.4560063064098359, "epoch": 0.3796818881477681, "grad_norm": 10.6875, "learning_rate": 7.59014951627089e-06, "loss": 1.3787, "mean_token_accuracy": 0.6654931887984276, "num_tokens": 5920970.0, "step": 2590 }, { "entropy": 1.331900918483734, "epoch": 0.3811478413838599, "grad_norm": 9.1875, "learning_rate": 7.619466432131341e-06, "loss": 1.3149, "mean_token_accuracy": 0.6839560985565185, "num_tokens": 5942183.0, "step": 2600 }, { "entropy": 1.5133181154727935, "epoch": 0.3826137946199516, "grad_norm": 10.9375, "learning_rate": 7.648783347991791e-06, "loss": 1.4841, "mean_token_accuracy": 0.6478877902030945, "num_tokens": 5960902.0, "step": 2610 }, { "entropy": 1.476695141196251, "epoch": 0.3840797478560434, "grad_norm": 12.6875, "learning_rate": 7.678100263852242e-06, "loss": 1.443, "mean_token_accuracy": 0.6502201110124588, "num_tokens": 5983138.0, "step": 2620 }, { "entropy": 1.334860047698021, "epoch": 0.38554570109213515, "grad_norm": 8.75, "learning_rate": 7.707417179712695e-06, "loss": 1.2987, "mean_token_accuracy": 0.6807158976793289, "num_tokens": 6010042.0, "step": 2630 }, { "entropy": 1.4753855884075164, "epoch": 0.38701165432822693, "grad_norm": 16.0, "learning_rate": 7.736734095573146e-06, "loss": 1.4516, "mean_token_accuracy": 0.6568301022052765, "num_tokens": 6032601.0, "step": 2640 }, { "entropy": 1.367088469862938, "epoch": 0.3884776075643187, "grad_norm": 11.5, "learning_rate": 7.766051011433597e-06, "loss": 1.2976, "mean_token_accuracy": 0.6764255076646805, "num_tokens": 6059535.0, "step": 2650 }, { "entropy": 1.2778871715068818, "epoch": 0.3899435608004105, "grad_norm": 8.5, "learning_rate": 7.79536792729405e-06, "loss": 1.2273, "mean_token_accuracy": 0.6919259682297707, "num_tokens": 6082668.0, "step": 2660 }, { "entropy": 1.3718428432941436, "epoch": 0.39140951403650226, "grad_norm": 11.0, "learning_rate": 7.824684843154501e-06, "loss": 1.389, "mean_token_accuracy": 0.6699602216482162, "num_tokens": 6106196.0, "step": 2670 }, { "entropy": 1.20935820043087, "epoch": 0.392875467272594, "grad_norm": 7.21875, "learning_rate": 7.854001759014952e-06, "loss": 1.1955, "mean_token_accuracy": 0.7053950667381287, "num_tokens": 6130275.0, "step": 2680 }, { "entropy": 1.3412000089883804, "epoch": 0.39434142050868576, "grad_norm": 10.5, "learning_rate": 7.883318674875404e-06, "loss": 1.2597, "mean_token_accuracy": 0.6853662133216858, "num_tokens": 6154691.0, "step": 2690 }, { "entropy": 1.2872052669525147, "epoch": 0.39580737374477754, "grad_norm": 14.25, "learning_rate": 7.912635590735855e-06, "loss": 1.2394, "mean_token_accuracy": 0.6882327675819397, "num_tokens": 6178358.0, "step": 2700 }, { "entropy": 1.378788286447525, "epoch": 0.3972733269808693, "grad_norm": 10.5625, "learning_rate": 7.941952506596306e-06, "loss": 1.3341, "mean_token_accuracy": 0.6774774461984634, "num_tokens": 6200384.0, "step": 2710 }, { "entropy": 1.3411975532770157, "epoch": 0.3987392802169611, "grad_norm": 10.9375, "learning_rate": 7.971269422456757e-06, "loss": 1.3208, "mean_token_accuracy": 0.6843509986996651, "num_tokens": 6220896.0, "step": 2720 }, { "entropy": 1.3920181393623352, "epoch": 0.40020523345305287, "grad_norm": 11.25, "learning_rate": 8.00058633831721e-06, "loss": 1.3298, "mean_token_accuracy": 0.6641665458679199, "num_tokens": 6244623.0, "step": 2730 }, { "entropy": 1.3872381508350373, "epoch": 0.40167118668914464, "grad_norm": 10.9375, "learning_rate": 8.029903254177661e-06, "loss": 1.3482, "mean_token_accuracy": 0.6669115751981736, "num_tokens": 6267691.0, "step": 2740 }, { "entropy": 1.2858745336532593, "epoch": 0.40313713992523637, "grad_norm": 9.1875, "learning_rate": 8.059220170038112e-06, "loss": 1.2745, "mean_token_accuracy": 0.6944200545549393, "num_tokens": 6289932.0, "step": 2750 }, { "entropy": 1.1926775991916656, "epoch": 0.40460309316132814, "grad_norm": 8.5, "learning_rate": 8.088537085898565e-06, "loss": 1.1157, "mean_token_accuracy": 0.7084702879190445, "num_tokens": 6314388.0, "step": 2760 }, { "entropy": 1.1993196874856948, "epoch": 0.4060690463974199, "grad_norm": 10.4375, "learning_rate": 8.117854001759016e-06, "loss": 1.1278, "mean_token_accuracy": 0.7091725379228592, "num_tokens": 6339440.0, "step": 2770 }, { "entropy": 1.5098136752843856, "epoch": 0.4075349996335117, "grad_norm": 19.5, "learning_rate": 8.147170917619467e-06, "loss": 1.4694, "mean_token_accuracy": 0.6472103103995324, "num_tokens": 6359197.0, "step": 2780 }, { "entropy": 1.2387328177690506, "epoch": 0.4090009528696035, "grad_norm": 10.5, "learning_rate": 8.176487833479918e-06, "loss": 1.1918, "mean_token_accuracy": 0.6933408498764038, "num_tokens": 6382113.0, "step": 2790 }, { "entropy": 1.2694376170635224, "epoch": 0.41046690610569525, "grad_norm": 12.1875, "learning_rate": 8.20580474934037e-06, "loss": 1.2373, "mean_token_accuracy": 0.6969663560390472, "num_tokens": 6404315.0, "step": 2800 }, { "entropy": 1.3682787328958512, "epoch": 0.41193285934178697, "grad_norm": 11.375, "learning_rate": 8.23512166520082e-06, "loss": 1.3433, "mean_token_accuracy": 0.6747937932610512, "num_tokens": 6429527.0, "step": 2810 }, { "entropy": 1.3294258534908294, "epoch": 0.41339881257787875, "grad_norm": 12.1875, "learning_rate": 8.264438581061272e-06, "loss": 1.3317, "mean_token_accuracy": 0.6747519910335541, "num_tokens": 6452868.0, "step": 2820 }, { "entropy": 1.5286444038152696, "epoch": 0.4148647658139705, "grad_norm": 7.6875, "learning_rate": 8.293755496921725e-06, "loss": 1.4838, "mean_token_accuracy": 0.6430249661207199, "num_tokens": 6473612.0, "step": 2830 }, { "entropy": 1.507550722360611, "epoch": 0.4163307190500623, "grad_norm": 10.375, "learning_rate": 8.323072412782176e-06, "loss": 1.4532, "mean_token_accuracy": 0.6534436494112015, "num_tokens": 6494963.0, "step": 2840 }, { "entropy": 1.3961683362722397, "epoch": 0.4177966722861541, "grad_norm": 7.3125, "learning_rate": 8.352389328642627e-06, "loss": 1.353, "mean_token_accuracy": 0.681647178530693, "num_tokens": 6518995.0, "step": 2850 }, { "entropy": 1.2148487955331801, "epoch": 0.41926262552224586, "grad_norm": 8.4375, "learning_rate": 8.38170624450308e-06, "loss": 1.117, "mean_token_accuracy": 0.6950666248798371, "num_tokens": 6543417.0, "step": 2860 }, { "entropy": 1.250291708111763, "epoch": 0.42072857875833763, "grad_norm": 8.625, "learning_rate": 8.411023160363531e-06, "loss": 1.2499, "mean_token_accuracy": 0.6947534620761872, "num_tokens": 6570136.0, "step": 2870 }, { "entropy": 1.4971339583396912, "epoch": 0.42219453199442936, "grad_norm": 9.75, "learning_rate": 8.440340076223982e-06, "loss": 1.4461, "mean_token_accuracy": 0.6475557699799538, "num_tokens": 6593179.0, "step": 2880 }, { "entropy": 1.4306561589241027, "epoch": 0.42366048523052113, "grad_norm": 10.25, "learning_rate": 8.469656992084433e-06, "loss": 1.3636, "mean_token_accuracy": 0.6651116073131561, "num_tokens": 6614645.0, "step": 2890 }, { "entropy": 1.162570095062256, "epoch": 0.4251264384666129, "grad_norm": 10.5, "learning_rate": 8.498973907944884e-06, "loss": 1.1203, "mean_token_accuracy": 0.711794650554657, "num_tokens": 6642397.0, "step": 2900 }, { "entropy": 1.567629075050354, "epoch": 0.4265923917027047, "grad_norm": 10.0625, "learning_rate": 8.528290823805336e-06, "loss": 1.5303, "mean_token_accuracy": 0.6299111366271972, "num_tokens": 6664770.0, "step": 2910 }, { "entropy": 1.1952803760766983, "epoch": 0.42805834493879646, "grad_norm": 10.375, "learning_rate": 8.557607739665787e-06, "loss": 1.1475, "mean_token_accuracy": 0.706127405166626, "num_tokens": 6689461.0, "step": 2920 }, { "entropy": 1.4680424928665161, "epoch": 0.42952429817488824, "grad_norm": 12.0, "learning_rate": 8.58692465552624e-06, "loss": 1.3815, "mean_token_accuracy": 0.6551087230443955, "num_tokens": 6714888.0, "step": 2930 }, { "entropy": 1.2678328156471252, "epoch": 0.43099025141097996, "grad_norm": 15.5, "learning_rate": 8.61624157138669e-06, "loss": 1.2125, "mean_token_accuracy": 0.6856592506170273, "num_tokens": 6736774.0, "step": 2940 }, { "entropy": 1.2417077839374542, "epoch": 0.43245620464707174, "grad_norm": 10.25, "learning_rate": 8.645558487247142e-06, "loss": 1.1774, "mean_token_accuracy": 0.6912662521004677, "num_tokens": 6760506.0, "step": 2950 }, { "entropy": 1.4748327314853669, "epoch": 0.4339221578831635, "grad_norm": 6.46875, "learning_rate": 8.674875403107593e-06, "loss": 1.4824, "mean_token_accuracy": 0.6557697117328644, "num_tokens": 6783978.0, "step": 2960 }, { "entropy": 1.2870807081460953, "epoch": 0.4353881111192553, "grad_norm": 12.0, "learning_rate": 8.704192318968046e-06, "loss": 1.2585, "mean_token_accuracy": 0.691018921136856, "num_tokens": 6808199.0, "step": 2970 }, { "entropy": 1.2754801660776138, "epoch": 0.43685406435534707, "grad_norm": 9.3125, "learning_rate": 8.733509234828497e-06, "loss": 1.252, "mean_token_accuracy": 0.6926135063171387, "num_tokens": 6835127.0, "step": 2980 }, { "entropy": 1.2991015300154687, "epoch": 0.43832001759143885, "grad_norm": 7.21875, "learning_rate": 8.762826150688948e-06, "loss": 1.2023, "mean_token_accuracy": 0.6902934014797211, "num_tokens": 6859617.0, "step": 2990 }, { "entropy": 1.2052295356988907, "epoch": 0.4397859708275306, "grad_norm": 8.25, "learning_rate": 8.7921430665494e-06, "loss": 1.1959, "mean_token_accuracy": 0.6970946490764618, "num_tokens": 6883267.0, "step": 3000 }, { "entropy": 1.3117123126983643, "epoch": 0.44125192406362235, "grad_norm": 15.6875, "learning_rate": 8.82145998240985e-06, "loss": 1.2399, "mean_token_accuracy": 0.6901248276233674, "num_tokens": 6906863.0, "step": 3010 }, { "entropy": 1.4650176376104356, "epoch": 0.4427178772997141, "grad_norm": 12.875, "learning_rate": 8.850776898270302e-06, "loss": 1.4434, "mean_token_accuracy": 0.658891212940216, "num_tokens": 6928303.0, "step": 3020 }, { "entropy": 1.4552023202180862, "epoch": 0.4441838305358059, "grad_norm": 14.9375, "learning_rate": 8.880093814130754e-06, "loss": 1.4361, "mean_token_accuracy": 0.6454089015722275, "num_tokens": 6951331.0, "step": 3030 }, { "entropy": 1.328046488761902, "epoch": 0.4456497837718977, "grad_norm": 14.25, "learning_rate": 8.909410729991206e-06, "loss": 1.2851, "mean_token_accuracy": 0.6778472185134887, "num_tokens": 6973024.0, "step": 3040 }, { "entropy": 1.2809226989746094, "epoch": 0.44711573700798946, "grad_norm": 7.46875, "learning_rate": 8.938727645851657e-06, "loss": 1.202, "mean_token_accuracy": 0.6932399243116378, "num_tokens": 6997316.0, "step": 3050 }, { "entropy": 1.1429519951343536, "epoch": 0.44858169024408123, "grad_norm": 7.09375, "learning_rate": 8.968044561712108e-06, "loss": 1.0899, "mean_token_accuracy": 0.7159370303153991, "num_tokens": 7024811.0, "step": 3060 }, { "entropy": 1.4779512584209442, "epoch": 0.450047643480173, "grad_norm": 12.8125, "learning_rate": 8.99736147757256e-06, "loss": 1.4791, "mean_token_accuracy": 0.6482337772846222, "num_tokens": 7044815.0, "step": 3070 }, { "entropy": 1.2823039680719375, "epoch": 0.45151359671626473, "grad_norm": 8.875, "learning_rate": 9.026678393433012e-06, "loss": 1.2271, "mean_token_accuracy": 0.6879557251930237, "num_tokens": 7067699.0, "step": 3080 }, { "entropy": 1.2309219390153885, "epoch": 0.4529795499523565, "grad_norm": 5.875, "learning_rate": 9.055995309293463e-06, "loss": 1.2096, "mean_token_accuracy": 0.7037359029054642, "num_tokens": 7094135.0, "step": 3090 }, { "entropy": 1.405721753835678, "epoch": 0.4544455031884483, "grad_norm": 13.8125, "learning_rate": 9.085312225153914e-06, "loss": 1.3671, "mean_token_accuracy": 0.6673165917396545, "num_tokens": 7116588.0, "step": 3100 }, { "entropy": 1.301267644762993, "epoch": 0.45591145642454006, "grad_norm": 16.875, "learning_rate": 9.114629141014365e-06, "loss": 1.2759, "mean_token_accuracy": 0.6938399791717529, "num_tokens": 7140445.0, "step": 3110 }, { "entropy": 1.399472564458847, "epoch": 0.45737740966063184, "grad_norm": 5.96875, "learning_rate": 9.143946056874816e-06, "loss": 1.381, "mean_token_accuracy": 0.668781578540802, "num_tokens": 7164346.0, "step": 3120 }, { "entropy": 1.2430251717567444, "epoch": 0.4588433628967236, "grad_norm": 10.875, "learning_rate": 9.173262972735268e-06, "loss": 1.2087, "mean_token_accuracy": 0.6906826108694076, "num_tokens": 7188001.0, "step": 3130 }, { "entropy": 1.0997689604759215, "epoch": 0.46030931613281534, "grad_norm": 7.1875, "learning_rate": 9.20257988859572e-06, "loss": 1.0346, "mean_token_accuracy": 0.7174371421337128, "num_tokens": 7215594.0, "step": 3140 }, { "entropy": 1.2710275620222091, "epoch": 0.4617752693689071, "grad_norm": 8.5, "learning_rate": 9.231896804456172e-06, "loss": 1.2233, "mean_token_accuracy": 0.6904328405857086, "num_tokens": 7240396.0, "step": 3150 }, { "entropy": 1.2791489720344544, "epoch": 0.4632412226049989, "grad_norm": 8.125, "learning_rate": 9.261213720316623e-06, "loss": 1.2594, "mean_token_accuracy": 0.6829344511032105, "num_tokens": 7262721.0, "step": 3160 }, { "entropy": 1.3743024945259095, "epoch": 0.46470717584109067, "grad_norm": 16.625, "learning_rate": 9.290530636177076e-06, "loss": 1.321, "mean_token_accuracy": 0.6718118995428085, "num_tokens": 7284534.0, "step": 3170 }, { "entropy": 1.4253044545650482, "epoch": 0.46617312907718245, "grad_norm": 7.53125, "learning_rate": 9.319847552037527e-06, "loss": 1.3984, "mean_token_accuracy": 0.6686758548021317, "num_tokens": 7308734.0, "step": 3180 }, { "entropy": 1.3912608116865157, "epoch": 0.4676390823132742, "grad_norm": 16.375, "learning_rate": 9.349164467897978e-06, "loss": 1.3866, "mean_token_accuracy": 0.6706719368696212, "num_tokens": 7331229.0, "step": 3190 }, { "entropy": 1.410289078950882, "epoch": 0.469105035549366, "grad_norm": 5.96875, "learning_rate": 9.378481383758429e-06, "loss": 1.3831, "mean_token_accuracy": 0.6729899108409881, "num_tokens": 7355055.0, "step": 3200 }, { "entropy": 1.371148082613945, "epoch": 0.4705709887854577, "grad_norm": 9.0625, "learning_rate": 9.40779829961888e-06, "loss": 1.3341, "mean_token_accuracy": 0.6772182643413543, "num_tokens": 7379288.0, "step": 3210 }, { "entropy": 1.1751022517681122, "epoch": 0.4720369420215495, "grad_norm": 7.21875, "learning_rate": 9.437115215479331e-06, "loss": 1.1656, "mean_token_accuracy": 0.7114402979612351, "num_tokens": 7405520.0, "step": 3220 }, { "entropy": 1.3395012766122818, "epoch": 0.4735028952576413, "grad_norm": 11.125, "learning_rate": 9.466432131339782e-06, "loss": 1.255, "mean_token_accuracy": 0.6815980896353722, "num_tokens": 7429617.0, "step": 3230 }, { "entropy": 1.2756331861019135, "epoch": 0.47496884849373305, "grad_norm": 7.46875, "learning_rate": 9.495749047200235e-06, "loss": 1.2479, "mean_token_accuracy": 0.6858011186122894, "num_tokens": 7452502.0, "step": 3240 }, { "entropy": 1.528576734662056, "epoch": 0.47643480172982483, "grad_norm": 8.875, "learning_rate": 9.525065963060686e-06, "loss": 1.4226, "mean_token_accuracy": 0.6534953385591507, "num_tokens": 7475761.0, "step": 3250 }, { "entropy": 1.4828141987323762, "epoch": 0.4779007549659166, "grad_norm": 12.9375, "learning_rate": 9.554382878921138e-06, "loss": 1.4778, "mean_token_accuracy": 0.6489861369132995, "num_tokens": 7495819.0, "step": 3260 }, { "entropy": 1.2493179112672805, "epoch": 0.47936670820200833, "grad_norm": 10.25, "learning_rate": 9.58369979478159e-06, "loss": 1.2018, "mean_token_accuracy": 0.6877525568008422, "num_tokens": 7521200.0, "step": 3270 }, { "entropy": 1.5618608355522157, "epoch": 0.4808326614381001, "grad_norm": 10.3125, "learning_rate": 9.613016710642042e-06, "loss": 1.4928, "mean_token_accuracy": 0.6403728425502777, "num_tokens": 7539491.0, "step": 3280 }, { "entropy": 1.3402130991220473, "epoch": 0.4822986146741919, "grad_norm": 19.375, "learning_rate": 9.642333626502493e-06, "loss": 1.3071, "mean_token_accuracy": 0.6712054550647736, "num_tokens": 7558659.0, "step": 3290 }, { "entropy": 1.297720491886139, "epoch": 0.48376456791028366, "grad_norm": 11.625, "learning_rate": 9.671650542362946e-06, "loss": 1.2343, "mean_token_accuracy": 0.6833435237407685, "num_tokens": 7584783.0, "step": 3300 }, { "entropy": 1.478592613339424, "epoch": 0.48523052114637544, "grad_norm": 11.8125, "learning_rate": 9.700967458223397e-06, "loss": 1.4858, "mean_token_accuracy": 0.6473887756466865, "num_tokens": 7607508.0, "step": 3310 }, { "entropy": 1.1585966736078261, "epoch": 0.4866964743824672, "grad_norm": 12.375, "learning_rate": 9.730284374083848e-06, "loss": 1.1406, "mean_token_accuracy": 0.7113249272108078, "num_tokens": 7633488.0, "step": 3320 }, { "entropy": 1.4606537401676178, "epoch": 0.488162427618559, "grad_norm": 11.75, "learning_rate": 9.759601289944299e-06, "loss": 1.4739, "mean_token_accuracy": 0.6565104022622108, "num_tokens": 7654329.0, "step": 3330 }, { "entropy": 1.1907165080308915, "epoch": 0.4896283808546507, "grad_norm": 10.4375, "learning_rate": 9.78891820580475e-06, "loss": 1.1148, "mean_token_accuracy": 0.7078078329563141, "num_tokens": 7681805.0, "step": 3340 }, { "entropy": 1.3237620770931244, "epoch": 0.4910943340907425, "grad_norm": 9.5625, "learning_rate": 9.818235121665201e-06, "loss": 1.2778, "mean_token_accuracy": 0.6782233089208602, "num_tokens": 7705954.0, "step": 3350 }, { "entropy": 1.1921510696411133, "epoch": 0.49256028732683427, "grad_norm": 7.3125, "learning_rate": 9.847552037525652e-06, "loss": 1.1046, "mean_token_accuracy": 0.7012212157249451, "num_tokens": 7729289.0, "step": 3360 }, { "entropy": 1.44234955906868, "epoch": 0.49402624056292604, "grad_norm": 10.6875, "learning_rate": 9.876868953386105e-06, "loss": 1.4065, "mean_token_accuracy": 0.6546935945749283, "num_tokens": 7749796.0, "step": 3370 }, { "entropy": 1.3322436451911925, "epoch": 0.4954921937990178, "grad_norm": 13.8125, "learning_rate": 9.906185869246556e-06, "loss": 1.2828, "mean_token_accuracy": 0.6824848145246506, "num_tokens": 7772362.0, "step": 3380 }, { "entropy": 1.207375529408455, "epoch": 0.4969581470351096, "grad_norm": 9.5, "learning_rate": 9.935502785107008e-06, "loss": 1.1585, "mean_token_accuracy": 0.6965362012386322, "num_tokens": 7795297.0, "step": 3390 }, { "entropy": 1.5442767292261124, "epoch": 0.4984241002712014, "grad_norm": 9.9375, "learning_rate": 9.964819700967459e-06, "loss": 1.4321, "mean_token_accuracy": 0.6488837212324142, "num_tokens": 7815660.0, "step": 3400 }, { "entropy": 1.4090375781059266, "epoch": 0.4998900535072931, "grad_norm": 7.875, "learning_rate": 9.994136616827912e-06, "loss": 1.2734, "mean_token_accuracy": 0.667142677307129, "num_tokens": 7837215.0, "step": 3410 }, { "entropy": 1.5793184518814087, "epoch": 0.5013560067433849, "grad_norm": 10.75, "learning_rate": 1.0023453532688363e-05, "loss": 1.5093, "mean_token_accuracy": 0.6343648970127106, "num_tokens": 7859520.0, "step": 3420 }, { "entropy": 1.4059403121471405, "epoch": 0.5028219599794767, "grad_norm": 10.0, "learning_rate": 1.0052770448548814e-05, "loss": 1.3631, "mean_token_accuracy": 0.6588822692632675, "num_tokens": 7880229.0, "step": 3430 }, { "entropy": 1.3478514015674592, "epoch": 0.5042879132155684, "grad_norm": 8.4375, "learning_rate": 1.0082087364409265e-05, "loss": 1.2827, "mean_token_accuracy": 0.6713593661785126, "num_tokens": 7903396.0, "step": 3440 }, { "entropy": 1.404641553759575, "epoch": 0.5057538664516602, "grad_norm": 11.0625, "learning_rate": 1.0111404280269716e-05, "loss": 1.3592, "mean_token_accuracy": 0.6651184141635895, "num_tokens": 7921916.0, "step": 3450 }, { "entropy": 1.2285229444503785, "epoch": 0.507219819687752, "grad_norm": 9.4375, "learning_rate": 1.0140721196130167e-05, "loss": 1.183, "mean_token_accuracy": 0.7055457413196564, "num_tokens": 7947214.0, "step": 3460 }, { "entropy": 1.4673383504152298, "epoch": 0.5086857729238438, "grad_norm": 6.90625, "learning_rate": 1.0170038111990618e-05, "loss": 1.4166, "mean_token_accuracy": 0.6559782445430755, "num_tokens": 7969198.0, "step": 3470 }, { "entropy": 1.2828203082084655, "epoch": 0.5101517261599355, "grad_norm": 10.125, "learning_rate": 1.019935502785107e-05, "loss": 1.2085, "mean_token_accuracy": 0.6879772484302521, "num_tokens": 7996363.0, "step": 3480 }, { "entropy": 1.3065901905298234, "epoch": 0.5116176793960273, "grad_norm": 7.5, "learning_rate": 1.0228671943711524e-05, "loss": 1.2275, "mean_token_accuracy": 0.6810851633548737, "num_tokens": 8020373.0, "step": 3490 }, { "entropy": 1.337727364897728, "epoch": 0.513083632632119, "grad_norm": 13.875, "learning_rate": 1.0257988859571975e-05, "loss": 1.2966, "mean_token_accuracy": 0.6749675542116165, "num_tokens": 8043570.0, "step": 3500 }, { "entropy": 1.3120615929365158, "epoch": 0.5145495858682108, "grad_norm": 10.625, "learning_rate": 1.0287305775432426e-05, "loss": 1.2806, "mean_token_accuracy": 0.6800201952457428, "num_tokens": 8066493.0, "step": 3510 }, { "entropy": 1.2309517085552215, "epoch": 0.5160155391043025, "grad_norm": 9.3125, "learning_rate": 1.0316622691292878e-05, "loss": 1.2141, "mean_token_accuracy": 0.6990595385432243, "num_tokens": 8092895.0, "step": 3520 }, { "entropy": 1.260016170144081, "epoch": 0.5174814923403943, "grad_norm": 7.5625, "learning_rate": 1.0345939607153329e-05, "loss": 1.1886, "mean_token_accuracy": 0.6941437065601349, "num_tokens": 8115412.0, "step": 3530 }, { "entropy": 1.3602521389722824, "epoch": 0.5189474455764861, "grad_norm": 11.4375, "learning_rate": 1.037525652301378e-05, "loss": 1.2889, "mean_token_accuracy": 0.67878557741642, "num_tokens": 8135803.0, "step": 3540 }, { "entropy": 1.307707604765892, "epoch": 0.5204133988125779, "grad_norm": 7.4375, "learning_rate": 1.0404573438874231e-05, "loss": 1.2783, "mean_token_accuracy": 0.6884313762187958, "num_tokens": 8159383.0, "step": 3550 }, { "entropy": 1.307625848054886, "epoch": 0.5218793520486696, "grad_norm": 14.0, "learning_rate": 1.0433890354734684e-05, "loss": 1.2596, "mean_token_accuracy": 0.6833557695150375, "num_tokens": 8181161.0, "step": 3560 }, { "entropy": 1.2829160898923875, "epoch": 0.5233453052847614, "grad_norm": 10.0625, "learning_rate": 1.0463207270595135e-05, "loss": 1.2145, "mean_token_accuracy": 0.6881446421146393, "num_tokens": 8206724.0, "step": 3570 }, { "entropy": 1.2778740972280502, "epoch": 0.5248112585208532, "grad_norm": 7.25, "learning_rate": 1.0492524186455586e-05, "loss": 1.2303, "mean_token_accuracy": 0.692660340666771, "num_tokens": 8231460.0, "step": 3580 }, { "entropy": 1.473352074623108, "epoch": 0.526277211756945, "grad_norm": 12.75, "learning_rate": 1.0521841102316037e-05, "loss": 1.4158, "mean_token_accuracy": 0.6548714846372604, "num_tokens": 8252960.0, "step": 3590 }, { "entropy": 1.3340028017759322, "epoch": 0.5277431649930368, "grad_norm": 8.6875, "learning_rate": 1.0551158018176488e-05, "loss": 1.2672, "mean_token_accuracy": 0.6725976020097733, "num_tokens": 8275611.0, "step": 3600 }, { "entropy": 1.2553834348917008, "epoch": 0.5292091182291285, "grad_norm": 9.9375, "learning_rate": 1.058047493403694e-05, "loss": 1.2331, "mean_token_accuracy": 0.694476741552353, "num_tokens": 8298366.0, "step": 3610 }, { "entropy": 1.2716798782348633, "epoch": 0.5306750714652203, "grad_norm": 14.0, "learning_rate": 1.060979184989739e-05, "loss": 1.1516, "mean_token_accuracy": 0.6877859711647034, "num_tokens": 8319316.0, "step": 3620 }, { "entropy": 1.3342985033988952, "epoch": 0.5321410247013121, "grad_norm": 10.625, "learning_rate": 1.0639108765757844e-05, "loss": 1.3071, "mean_token_accuracy": 0.6861299127340317, "num_tokens": 8341586.0, "step": 3630 }, { "entropy": 1.0874662071466445, "epoch": 0.5336069779374037, "grad_norm": 7.8125, "learning_rate": 1.0668425681618295e-05, "loss": 1.0523, "mean_token_accuracy": 0.7275832921266556, "num_tokens": 8367993.0, "step": 3640 }, { "entropy": 1.3584882944822312, "epoch": 0.5350729311734955, "grad_norm": 12.5, "learning_rate": 1.0697742597478746e-05, "loss": 1.3216, "mean_token_accuracy": 0.6768259942531586, "num_tokens": 8390925.0, "step": 3650 }, { "entropy": 1.2551349967718124, "epoch": 0.5365388844095873, "grad_norm": 7.53125, "learning_rate": 1.0727059513339197e-05, "loss": 1.1976, "mean_token_accuracy": 0.6900953680276871, "num_tokens": 8416143.0, "step": 3660 }, { "entropy": 1.3740099132061006, "epoch": 0.5380048376456791, "grad_norm": 6.625, "learning_rate": 1.0756376429199648e-05, "loss": 1.3383, "mean_token_accuracy": 0.6778732597827911, "num_tokens": 8439510.0, "step": 3670 }, { "entropy": 1.413731899857521, "epoch": 0.5394707908817709, "grad_norm": 15.6875, "learning_rate": 1.07856933450601e-05, "loss": 1.4265, "mean_token_accuracy": 0.6619576767086983, "num_tokens": 8462046.0, "step": 3680 }, { "entropy": 1.3025281369686126, "epoch": 0.5409367441178626, "grad_norm": 7.28125, "learning_rate": 1.081501026092055e-05, "loss": 1.2641, "mean_token_accuracy": 0.6878643482923508, "num_tokens": 8489040.0, "step": 3690 }, { "entropy": 1.206892293691635, "epoch": 0.5424026973539544, "grad_norm": 7.25, "learning_rate": 1.0844327176781005e-05, "loss": 1.1089, "mean_token_accuracy": 0.7066188246011734, "num_tokens": 8511074.0, "step": 3700 }, { "entropy": 1.3881187468767167, "epoch": 0.5438686505900462, "grad_norm": 16.0, "learning_rate": 1.0873644092641456e-05, "loss": 1.3687, "mean_token_accuracy": 0.6670081466436386, "num_tokens": 8531885.0, "step": 3710 }, { "entropy": 1.2507911920547485, "epoch": 0.545334603826138, "grad_norm": 12.75, "learning_rate": 1.0902961008501907e-05, "loss": 1.2284, "mean_token_accuracy": 0.6853036373853684, "num_tokens": 8558491.0, "step": 3720 }, { "entropy": 1.2965079069137573, "epoch": 0.5468005570622297, "grad_norm": 7.21875, "learning_rate": 1.0932277924362358e-05, "loss": 1.2407, "mean_token_accuracy": 0.6949764549732208, "num_tokens": 8582801.0, "step": 3730 }, { "entropy": 1.4940945655107498, "epoch": 0.5482665102983215, "grad_norm": 10.375, "learning_rate": 1.096159484022281e-05, "loss": 1.3908, "mean_token_accuracy": 0.6482196033000946, "num_tokens": 8604153.0, "step": 3740 }, { "entropy": 1.234125518798828, "epoch": 0.5497324635344133, "grad_norm": 7.59375, "learning_rate": 1.099091175608326e-05, "loss": 1.2313, "mean_token_accuracy": 0.7079445570707321, "num_tokens": 8626755.0, "step": 3750 }, { "entropy": 1.5476297378540038, "epoch": 0.5511984167705051, "grad_norm": 10.1875, "learning_rate": 1.1020228671943714e-05, "loss": 1.5669, "mean_token_accuracy": 0.649404501914978, "num_tokens": 8646641.0, "step": 3760 }, { "entropy": 1.2947152376174926, "epoch": 0.5526643700065967, "grad_norm": 8.125, "learning_rate": 1.1049545587804165e-05, "loss": 1.2123, "mean_token_accuracy": 0.6882754504680634, "num_tokens": 8673532.0, "step": 3770 }, { "entropy": 1.363123106956482, "epoch": 0.5541303232426885, "grad_norm": 9.4375, "learning_rate": 1.1078862503664616e-05, "loss": 1.3542, "mean_token_accuracy": 0.6738324806094169, "num_tokens": 8694279.0, "step": 3780 }, { "entropy": 1.227664291858673, "epoch": 0.5555962764787803, "grad_norm": 9.625, "learning_rate": 1.1108179419525067e-05, "loss": 1.1627, "mean_token_accuracy": 0.7003932625055314, "num_tokens": 8719383.0, "step": 3790 }, { "entropy": 1.1783418089151383, "epoch": 0.5570622297148721, "grad_norm": 7.53125, "learning_rate": 1.1137496335385518e-05, "loss": 1.1529, "mean_token_accuracy": 0.7157177358865738, "num_tokens": 8746286.0, "step": 3800 }, { "entropy": 1.4640410006046296, "epoch": 0.5585281829509638, "grad_norm": 8.25, "learning_rate": 1.116681325124597e-05, "loss": 1.4364, "mean_token_accuracy": 0.6534341454505921, "num_tokens": 8770196.0, "step": 3810 }, { "entropy": 1.279750108718872, "epoch": 0.5599941361870556, "grad_norm": 14.6875, "learning_rate": 1.119613016710642e-05, "loss": 1.2293, "mean_token_accuracy": 0.690860140323639, "num_tokens": 8796542.0, "step": 3820 }, { "entropy": 1.2537000983953477, "epoch": 0.5614600894231474, "grad_norm": 16.875, "learning_rate": 1.1225447082966873e-05, "loss": 1.1654, "mean_token_accuracy": 0.6973809540271759, "num_tokens": 8821221.0, "step": 3830 }, { "entropy": 1.5468328326940537, "epoch": 0.5629260426592392, "grad_norm": 15.875, "learning_rate": 1.1254763998827324e-05, "loss": 1.5384, "mean_token_accuracy": 0.6455185949802399, "num_tokens": 8843257.0, "step": 3840 }, { "entropy": 1.1560868591070175, "epoch": 0.564391995895331, "grad_norm": 10.3125, "learning_rate": 1.1284080914687776e-05, "loss": 1.0984, "mean_token_accuracy": 0.7191143959760666, "num_tokens": 8868524.0, "step": 3850 }, { "entropy": 1.1308218091726303, "epoch": 0.5658579491314227, "grad_norm": 9.125, "learning_rate": 1.1313397830548227e-05, "loss": 1.0383, "mean_token_accuracy": 0.717108941078186, "num_tokens": 8893472.0, "step": 3860 }, { "entropy": 1.3416529297828674, "epoch": 0.5673239023675145, "grad_norm": 12.5, "learning_rate": 1.1342714746408678e-05, "loss": 1.3281, "mean_token_accuracy": 0.6779568955302239, "num_tokens": 8916975.0, "step": 3870 }, { "entropy": 1.333076822757721, "epoch": 0.5687898556036063, "grad_norm": 10.1875, "learning_rate": 1.1372031662269129e-05, "loss": 1.2996, "mean_token_accuracy": 0.684269905090332, "num_tokens": 8942767.0, "step": 3880 }, { "entropy": 1.2534291297197342, "epoch": 0.5702558088396981, "grad_norm": 8.75, "learning_rate": 1.140134857812958e-05, "loss": 1.2654, "mean_token_accuracy": 0.6908800512552261, "num_tokens": 8964879.0, "step": 3890 }, { "entropy": 1.2841325759887696, "epoch": 0.5717217620757897, "grad_norm": 8.5625, "learning_rate": 1.1430665493990035e-05, "loss": 1.2747, "mean_token_accuracy": 0.6917517364025116, "num_tokens": 8986961.0, "step": 3900 }, { "entropy": 1.2049250155687332, "epoch": 0.5731877153118815, "grad_norm": 10.4375, "learning_rate": 1.1459982409850486e-05, "loss": 1.1461, "mean_token_accuracy": 0.704876184463501, "num_tokens": 9010176.0, "step": 3910 }, { "entropy": 1.2843432933092118, "epoch": 0.5746536685479733, "grad_norm": 8.125, "learning_rate": 1.1489299325710937e-05, "loss": 1.2425, "mean_token_accuracy": 0.6915645688772202, "num_tokens": 9034974.0, "step": 3920 }, { "entropy": 1.3542578995227814, "epoch": 0.5761196217840651, "grad_norm": 13.0625, "learning_rate": 1.1518616241571388e-05, "loss": 1.2727, "mean_token_accuracy": 0.6702173113822937, "num_tokens": 9057159.0, "step": 3930 }, { "entropy": 1.41710267663002, "epoch": 0.5775855750201568, "grad_norm": 11.3125, "learning_rate": 1.154793315743184e-05, "loss": 1.3696, "mean_token_accuracy": 0.6622951745986938, "num_tokens": 9077648.0, "step": 3940 }, { "entropy": 1.5038718909025193, "epoch": 0.5790515282562486, "grad_norm": 12.0625, "learning_rate": 1.157725007329229e-05, "loss": 1.4827, "mean_token_accuracy": 0.6415575832128525, "num_tokens": 9097760.0, "step": 3950 }, { "entropy": 1.6105290830135346, "epoch": 0.5805174814923404, "grad_norm": 13.125, "learning_rate": 1.1606566989152742e-05, "loss": 1.5589, "mean_token_accuracy": 0.6326312497258186, "num_tokens": 9116524.0, "step": 3960 }, { "entropy": 1.2437657177448274, "epoch": 0.5819834347284322, "grad_norm": 9.3125, "learning_rate": 1.1635883905013194e-05, "loss": 1.1541, "mean_token_accuracy": 0.7057879507541657, "num_tokens": 9141020.0, "step": 3970 }, { "entropy": 1.2119415670633316, "epoch": 0.583449387964524, "grad_norm": 12.8125, "learning_rate": 1.1665200820873646e-05, "loss": 1.1608, "mean_token_accuracy": 0.7096191048622131, "num_tokens": 9167107.0, "step": 3980 }, { "entropy": 1.2128030419349671, "epoch": 0.5849153412006157, "grad_norm": 6.71875, "learning_rate": 1.1694517736734097e-05, "loss": 1.1606, "mean_token_accuracy": 0.7027535736560822, "num_tokens": 9194015.0, "step": 3990 }, { "entropy": 1.2447504669427871, "epoch": 0.5863812944367075, "grad_norm": 10.125, "learning_rate": 1.1723834652594548e-05, "loss": 1.1422, "mean_token_accuracy": 0.6915670812129975, "num_tokens": 9217240.0, "step": 4000 }, { "entropy": 1.4259051144123078, "epoch": 0.5878472476727993, "grad_norm": 15.8125, "learning_rate": 1.1753151568454999e-05, "loss": 1.3698, "mean_token_accuracy": 0.6598316103219986, "num_tokens": 9240378.0, "step": 4010 }, { "entropy": 1.431957620382309, "epoch": 0.589313200908891, "grad_norm": 10.875, "learning_rate": 1.178246848431545e-05, "loss": 1.4488, "mean_token_accuracy": 0.6535759925842285, "num_tokens": 9263527.0, "step": 4020 }, { "entropy": 1.476978275179863, "epoch": 0.5907791541449827, "grad_norm": 9.9375, "learning_rate": 1.1811785400175901e-05, "loss": 1.4307, "mean_token_accuracy": 0.6558587744832038, "num_tokens": 9281900.0, "step": 4030 }, { "entropy": 1.3353843629360198, "epoch": 0.5922451073810745, "grad_norm": 11.625, "learning_rate": 1.1841102316036354e-05, "loss": 1.2533, "mean_token_accuracy": 0.6849133104085923, "num_tokens": 9308990.0, "step": 4040 }, { "entropy": 1.2512319564819336, "epoch": 0.5937110606171663, "grad_norm": 8.625, "learning_rate": 1.1870419231896805e-05, "loss": 1.2045, "mean_token_accuracy": 0.6928964048624039, "num_tokens": 9332774.0, "step": 4050 }, { "entropy": 1.2366972982883453, "epoch": 0.595177013853258, "grad_norm": 8.0625, "learning_rate": 1.1899736147757256e-05, "loss": 1.2484, "mean_token_accuracy": 0.6949911296367646, "num_tokens": 9358456.0, "step": 4060 }, { "entropy": 1.2693091660737992, "epoch": 0.5966429670893498, "grad_norm": 10.125, "learning_rate": 1.1929053063617708e-05, "loss": 1.2535, "mean_token_accuracy": 0.6920949131250381, "num_tokens": 9381797.0, "step": 4070 }, { "entropy": 1.3372864931821824, "epoch": 0.5981089203254416, "grad_norm": 6.375, "learning_rate": 1.1958369979478159e-05, "loss": 1.3282, "mean_token_accuracy": 0.6819088816642761, "num_tokens": 9404972.0, "step": 4080 }, { "entropy": 1.1366515934467316, "epoch": 0.5995748735615334, "grad_norm": 7.03125, "learning_rate": 1.198768689533861e-05, "loss": 1.1148, "mean_token_accuracy": 0.7238090693950653, "num_tokens": 9433086.0, "step": 4090 }, { "entropy": 1.2153202593326569, "epoch": 0.6010408267976252, "grad_norm": 9.0, "learning_rate": 1.2017003811199064e-05, "loss": 1.1872, "mean_token_accuracy": 0.7040929272770882, "num_tokens": 9457812.0, "step": 4100 }, { "entropy": 1.2705127462744712, "epoch": 0.6025067800337169, "grad_norm": 12.375, "learning_rate": 1.2046320727059516e-05, "loss": 1.1925, "mean_token_accuracy": 0.698218023777008, "num_tokens": 9482491.0, "step": 4110 }, { "entropy": 1.3553331643342972, "epoch": 0.6039727332698087, "grad_norm": 7.5, "learning_rate": 1.2075637642919967e-05, "loss": 1.3067, "mean_token_accuracy": 0.6783182322978973, "num_tokens": 9505087.0, "step": 4120 }, { "entropy": 1.2952655225992202, "epoch": 0.6054386865059005, "grad_norm": 8.6875, "learning_rate": 1.2104954558780418e-05, "loss": 1.2687, "mean_token_accuracy": 0.6834020853042603, "num_tokens": 9526966.0, "step": 4130 }, { "entropy": 1.417215597629547, "epoch": 0.6069046397419923, "grad_norm": 9.6875, "learning_rate": 1.2134271474640869e-05, "loss": 1.3792, "mean_token_accuracy": 0.6718692868947983, "num_tokens": 9548587.0, "step": 4140 }, { "entropy": 1.3456296294927597, "epoch": 0.608370592978084, "grad_norm": 11.1875, "learning_rate": 1.216358839050132e-05, "loss": 1.302, "mean_token_accuracy": 0.6761639237403869, "num_tokens": 9574813.0, "step": 4150 }, { "entropy": 1.2711796522140504, "epoch": 0.6098365462141758, "grad_norm": 7.25, "learning_rate": 1.2192905306361771e-05, "loss": 1.2895, "mean_token_accuracy": 0.6978533819317818, "num_tokens": 9598476.0, "step": 4160 }, { "entropy": 1.3399390518665313, "epoch": 0.6113024994502675, "grad_norm": 6.9375, "learning_rate": 1.2222222222222224e-05, "loss": 1.3637, "mean_token_accuracy": 0.6750861674547195, "num_tokens": 9620158.0, "step": 4170 }, { "entropy": 1.2609145835042, "epoch": 0.6127684526863593, "grad_norm": 6.78125, "learning_rate": 1.2251539138082675e-05, "loss": 1.1655, "mean_token_accuracy": 0.6887770175933838, "num_tokens": 9644724.0, "step": 4180 }, { "entropy": 1.439779457449913, "epoch": 0.614234405922451, "grad_norm": 10.5625, "learning_rate": 1.2280856053943126e-05, "loss": 1.4149, "mean_token_accuracy": 0.6475794523954391, "num_tokens": 9666296.0, "step": 4190 }, { "entropy": 1.2917697608470917, "epoch": 0.6157003591585428, "grad_norm": 11.5625, "learning_rate": 1.2310172969803578e-05, "loss": 1.2704, "mean_token_accuracy": 0.6918294578790665, "num_tokens": 9690334.0, "step": 4200 }, { "entropy": 1.3230039298534393, "epoch": 0.6171663123946346, "grad_norm": 7.875, "learning_rate": 1.2339489885664029e-05, "loss": 1.277, "mean_token_accuracy": 0.682563304901123, "num_tokens": 9712408.0, "step": 4210 }, { "entropy": 1.2867875665426254, "epoch": 0.6186322656307264, "grad_norm": 10.375, "learning_rate": 1.236880680152448e-05, "loss": 1.2033, "mean_token_accuracy": 0.6911074638366699, "num_tokens": 9736429.0, "step": 4220 }, { "entropy": 1.2692913562059402, "epoch": 0.6200982188668182, "grad_norm": 10.8125, "learning_rate": 1.2398123717384931e-05, "loss": 1.2603, "mean_token_accuracy": 0.6948667913675308, "num_tokens": 9760117.0, "step": 4230 }, { "entropy": 1.1271882563829423, "epoch": 0.6215641721029099, "grad_norm": 7.1875, "learning_rate": 1.2427440633245384e-05, "loss": 1.0612, "mean_token_accuracy": 0.7197933301329613, "num_tokens": 9786813.0, "step": 4240 }, { "entropy": 1.4017834395170212, "epoch": 0.6230301253390017, "grad_norm": 8.3125, "learning_rate": 1.2456757549105835e-05, "loss": 1.4022, "mean_token_accuracy": 0.6616987824440003, "num_tokens": 9810528.0, "step": 4250 }, { "entropy": 1.2476618200540543, "epoch": 0.6244960785750935, "grad_norm": 7.5, "learning_rate": 1.2486074464966286e-05, "loss": 1.1635, "mean_token_accuracy": 0.6969284772872925, "num_tokens": 9836227.0, "step": 4260 }, { "entropy": 1.38343488574028, "epoch": 0.6259620318111853, "grad_norm": 10.6875, "learning_rate": 1.2515391380826737e-05, "loss": 1.3731, "mean_token_accuracy": 0.6635289341211319, "num_tokens": 9857788.0, "step": 4270 }, { "entropy": 1.3139474153518678, "epoch": 0.627427985047277, "grad_norm": 9.4375, "learning_rate": 1.2544708296687188e-05, "loss": 1.2637, "mean_token_accuracy": 0.6788632541894912, "num_tokens": 9879982.0, "step": 4280 }, { "entropy": 1.2432274132966996, "epoch": 0.6288939382833688, "grad_norm": 8.8125, "learning_rate": 1.257402521254764e-05, "loss": 1.1973, "mean_token_accuracy": 0.6881041020154953, "num_tokens": 9904629.0, "step": 4290 }, { "entropy": 1.5572682201862336, "epoch": 0.6303598915194605, "grad_norm": 9.25, "learning_rate": 1.260334212840809e-05, "loss": 1.5007, "mean_token_accuracy": 0.6485047161579132, "num_tokens": 9924019.0, "step": 4300 }, { "entropy": 1.4832079589366913, "epoch": 0.6318258447555523, "grad_norm": 10.25, "learning_rate": 1.2632659044268545e-05, "loss": 1.4557, "mean_token_accuracy": 0.6510452330112457, "num_tokens": 9945980.0, "step": 4310 }, { "entropy": 1.3299841344356538, "epoch": 0.633291797991644, "grad_norm": 9.625, "learning_rate": 1.2661975960128996e-05, "loss": 1.2955, "mean_token_accuracy": 0.6778925746679306, "num_tokens": 9970399.0, "step": 4320 }, { "entropy": 1.298663392663002, "epoch": 0.6347577512277358, "grad_norm": 7.46875, "learning_rate": 1.2691292875989448e-05, "loss": 1.2777, "mean_token_accuracy": 0.6821434020996093, "num_tokens": 9991246.0, "step": 4330 }, { "entropy": 1.2325966089963913, "epoch": 0.6362237044638276, "grad_norm": 6.71875, "learning_rate": 1.2720609791849899e-05, "loss": 1.1663, "mean_token_accuracy": 0.6926221922039986, "num_tokens": 10015805.0, "step": 4340 }, { "entropy": 1.4027708262205123, "epoch": 0.6376896576999194, "grad_norm": 14.875, "learning_rate": 1.274992670771035e-05, "loss": 1.3213, "mean_token_accuracy": 0.663789801299572, "num_tokens": 10036485.0, "step": 4350 }, { "entropy": 1.4401298105716704, "epoch": 0.6391556109360111, "grad_norm": 14.25, "learning_rate": 1.2779243623570801e-05, "loss": 1.3711, "mean_token_accuracy": 0.6592388525605202, "num_tokens": 10059441.0, "step": 4360 }, { "entropy": 1.3919603198766708, "epoch": 0.6406215641721029, "grad_norm": 7.375, "learning_rate": 1.2808560539431252e-05, "loss": 1.3922, "mean_token_accuracy": 0.6686307042837143, "num_tokens": 10086532.0, "step": 4370 }, { "entropy": 1.3095344394445418, "epoch": 0.6420875174081947, "grad_norm": 7.75, "learning_rate": 1.2837877455291705e-05, "loss": 1.253, "mean_token_accuracy": 0.6819472581148147, "num_tokens": 10109341.0, "step": 4380 }, { "entropy": 1.1215385705232621, "epoch": 0.6435534706442865, "grad_norm": 13.125, "learning_rate": 1.2867194371152156e-05, "loss": 1.0841, "mean_token_accuracy": 0.7255510926246643, "num_tokens": 10133374.0, "step": 4390 }, { "entropy": 1.2194762349128723, "epoch": 0.6450194238803783, "grad_norm": 9.5, "learning_rate": 1.2896511287012607e-05, "loss": 1.1433, "mean_token_accuracy": 0.7032099068164825, "num_tokens": 10159363.0, "step": 4400 }, { "entropy": 1.6710076600313186, "epoch": 0.64648537711647, "grad_norm": 15.0, "learning_rate": 1.2925828202873058e-05, "loss": 1.6698, "mean_token_accuracy": 0.6161840990185737, "num_tokens": 10177633.0, "step": 4410 }, { "entropy": 1.292188623547554, "epoch": 0.6479513303525618, "grad_norm": 7.28125, "learning_rate": 1.295514511873351e-05, "loss": 1.3118, "mean_token_accuracy": 0.6875610619783401, "num_tokens": 10203186.0, "step": 4420 }, { "entropy": 1.1562246203422546, "epoch": 0.6494172835886535, "grad_norm": 10.5, "learning_rate": 1.298446203459396e-05, "loss": 1.1279, "mean_token_accuracy": 0.7125550597906113, "num_tokens": 10231285.0, "step": 4430 }, { "entropy": 1.2320525497198105, "epoch": 0.6508832368247452, "grad_norm": 8.0, "learning_rate": 1.3013778950454414e-05, "loss": 1.1796, "mean_token_accuracy": 0.6926270306110383, "num_tokens": 10256506.0, "step": 4440 }, { "entropy": 1.4389690220355988, "epoch": 0.652349190060837, "grad_norm": 10.8125, "learning_rate": 1.3043095866314865e-05, "loss": 1.3916, "mean_token_accuracy": 0.6641354471445083, "num_tokens": 10276794.0, "step": 4450 }, { "entropy": 1.220185285806656, "epoch": 0.6538151432969288, "grad_norm": 7.84375, "learning_rate": 1.3072412782175316e-05, "loss": 1.1667, "mean_token_accuracy": 0.6976089686155319, "num_tokens": 10299207.0, "step": 4460 }, { "entropy": 1.4845711886882782, "epoch": 0.6552810965330206, "grad_norm": 11.9375, "learning_rate": 1.3101729698035767e-05, "loss": 1.4502, "mean_token_accuracy": 0.655247887969017, "num_tokens": 10323115.0, "step": 4470 }, { "entropy": 1.1680356860160828, "epoch": 0.6567470497691124, "grad_norm": 7.03125, "learning_rate": 1.3131046613896218e-05, "loss": 1.1209, "mean_token_accuracy": 0.7096475124359131, "num_tokens": 10347375.0, "step": 4480 }, { "entropy": 1.215923273563385, "epoch": 0.6582130030052041, "grad_norm": 9.375, "learning_rate": 1.316036352975667e-05, "loss": 1.1576, "mean_token_accuracy": 0.700969448685646, "num_tokens": 10371420.0, "step": 4490 }, { "entropy": 1.1752849966287613, "epoch": 0.6596789562412959, "grad_norm": 13.8125, "learning_rate": 1.318968044561712e-05, "loss": 1.0891, "mean_token_accuracy": 0.710693308711052, "num_tokens": 10393996.0, "step": 4500 }, { "entropy": 1.3289962589740754, "epoch": 0.6611449094773877, "grad_norm": 9.3125, "learning_rate": 1.3218997361477575e-05, "loss": 1.2886, "mean_token_accuracy": 0.6893432140350342, "num_tokens": 10417464.0, "step": 4510 }, { "entropy": 1.3317008525133134, "epoch": 0.6626108627134795, "grad_norm": 12.125, "learning_rate": 1.3248314277338026e-05, "loss": 1.2743, "mean_token_accuracy": 0.6877497225999832, "num_tokens": 10439098.0, "step": 4520 }, { "entropy": 1.2200810134410858, "epoch": 0.6640768159495712, "grad_norm": 7.875, "learning_rate": 1.3277631193198477e-05, "loss": 1.1085, "mean_token_accuracy": 0.7073990851640701, "num_tokens": 10462450.0, "step": 4530 }, { "entropy": 1.4254248797893525, "epoch": 0.665542769185663, "grad_norm": 8.8125, "learning_rate": 1.3306948109058928e-05, "loss": 1.3791, "mean_token_accuracy": 0.6677764862775802, "num_tokens": 10485778.0, "step": 4540 }, { "entropy": 1.2414245635271073, "epoch": 0.6670087224217548, "grad_norm": 12.375, "learning_rate": 1.333626502491938e-05, "loss": 1.1948, "mean_token_accuracy": 0.6899586588144302, "num_tokens": 10506667.0, "step": 4550 }, { "entropy": 1.2990390241146088, "epoch": 0.6684746756578465, "grad_norm": 5.78125, "learning_rate": 1.336558194077983e-05, "loss": 1.2523, "mean_token_accuracy": 0.6864860326051712, "num_tokens": 10531076.0, "step": 4560 }, { "entropy": 1.3553549766540527, "epoch": 0.6699406288939382, "grad_norm": 12.6875, "learning_rate": 1.3394898856640282e-05, "loss": 1.3045, "mean_token_accuracy": 0.6773770317435265, "num_tokens": 10559270.0, "step": 4570 }, { "entropy": 1.2394065976142883, "epoch": 0.67140658213003, "grad_norm": 7.375, "learning_rate": 1.3424215772500735e-05, "loss": 1.1613, "mean_token_accuracy": 0.6996310174465179, "num_tokens": 10581183.0, "step": 4580 }, { "entropy": 1.4448842465877534, "epoch": 0.6728725353661218, "grad_norm": 12.0, "learning_rate": 1.3453532688361186e-05, "loss": 1.4203, "mean_token_accuracy": 0.6603567913174629, "num_tokens": 10605442.0, "step": 4590 }, { "entropy": 1.3163914412260056, "epoch": 0.6743384886022136, "grad_norm": 12.4375, "learning_rate": 1.3482849604221637e-05, "loss": 1.3056, "mean_token_accuracy": 0.686412262916565, "num_tokens": 10630146.0, "step": 4600 }, { "entropy": 1.1984927773475647, "epoch": 0.6758044418383053, "grad_norm": 9.0625, "learning_rate": 1.3512166520082088e-05, "loss": 1.1469, "mean_token_accuracy": 0.7040555268526077, "num_tokens": 10655573.0, "step": 4610 }, { "entropy": 1.3904953569173812, "epoch": 0.6772703950743971, "grad_norm": 6.78125, "learning_rate": 1.354148343594254e-05, "loss": 1.3378, "mean_token_accuracy": 0.6703354239463806, "num_tokens": 10677695.0, "step": 4620 }, { "entropy": 1.262979018688202, "epoch": 0.6787363483104889, "grad_norm": 9.6875, "learning_rate": 1.357080035180299e-05, "loss": 1.2492, "mean_token_accuracy": 0.6916483610868454, "num_tokens": 10703873.0, "step": 4630 }, { "entropy": 1.3217353224754333, "epoch": 0.6802023015465807, "grad_norm": 13.6875, "learning_rate": 1.3600117267663442e-05, "loss": 1.2574, "mean_token_accuracy": 0.6797277510166169, "num_tokens": 10727847.0, "step": 4640 }, { "entropy": 1.4313506543636323, "epoch": 0.6816682547826725, "grad_norm": 14.6875, "learning_rate": 1.3629434183523894e-05, "loss": 1.4442, "mean_token_accuracy": 0.6508629202842713, "num_tokens": 10747590.0, "step": 4650 }, { "entropy": 1.4812981009483337, "epoch": 0.6831342080187642, "grad_norm": 9.5625, "learning_rate": 1.3658751099384346e-05, "loss": 1.4033, "mean_token_accuracy": 0.6604020059108734, "num_tokens": 10771907.0, "step": 4660 }, { "entropy": 1.470238670706749, "epoch": 0.684600161254856, "grad_norm": 10.3125, "learning_rate": 1.3688068015244797e-05, "loss": 1.447, "mean_token_accuracy": 0.6545856326818467, "num_tokens": 10791954.0, "step": 4670 }, { "entropy": 1.470739722251892, "epoch": 0.6860661144909478, "grad_norm": 9.9375, "learning_rate": 1.3717384931105248e-05, "loss": 1.4175, "mean_token_accuracy": 0.6482149213552475, "num_tokens": 10814513.0, "step": 4680 }, { "entropy": 1.3407788693904876, "epoch": 0.6875320677270395, "grad_norm": 15.0, "learning_rate": 1.3746701846965699e-05, "loss": 1.2972, "mean_token_accuracy": 0.6840735003352165, "num_tokens": 10839559.0, "step": 4690 }, { "entropy": 1.273562815785408, "epoch": 0.6889980209631312, "grad_norm": 7.0, "learning_rate": 1.377601876282615e-05, "loss": 1.2612, "mean_token_accuracy": 0.6887640684843064, "num_tokens": 10862523.0, "step": 4700 }, { "entropy": 1.3401957750320435, "epoch": 0.690463974199223, "grad_norm": 11.4375, "learning_rate": 1.3805335678686605e-05, "loss": 1.1838, "mean_token_accuracy": 0.6852938115596772, "num_tokens": 10885467.0, "step": 4710 }, { "entropy": 1.194872146844864, "epoch": 0.6919299274353148, "grad_norm": 7.96875, "learning_rate": 1.3834652594547056e-05, "loss": 1.1472, "mean_token_accuracy": 0.7084931612014771, "num_tokens": 10908911.0, "step": 4720 }, { "entropy": 1.2363231211900712, "epoch": 0.6933958806714066, "grad_norm": 14.3125, "learning_rate": 1.3863969510407507e-05, "loss": 1.1992, "mean_token_accuracy": 0.6977182120084763, "num_tokens": 10932106.0, "step": 4730 }, { "entropy": 1.352634310722351, "epoch": 0.6948618339074983, "grad_norm": 7.40625, "learning_rate": 1.3893286426267958e-05, "loss": 1.3294, "mean_token_accuracy": 0.6706230491399765, "num_tokens": 10953076.0, "step": 4740 }, { "entropy": 1.3878548473119736, "epoch": 0.6963277871435901, "grad_norm": 10.3125, "learning_rate": 1.392260334212841e-05, "loss": 1.3868, "mean_token_accuracy": 0.6675230383872985, "num_tokens": 10975087.0, "step": 4750 }, { "entropy": 1.3088181048631669, "epoch": 0.6977937403796819, "grad_norm": 15.4375, "learning_rate": 1.395192025798886e-05, "loss": 1.259, "mean_token_accuracy": 0.6845801115036011, "num_tokens": 10996621.0, "step": 4760 }, { "entropy": 1.1653920710086823, "epoch": 0.6992596936157737, "grad_norm": 11.0, "learning_rate": 1.3981237173849312e-05, "loss": 1.1296, "mean_token_accuracy": 0.714437586069107, "num_tokens": 11024707.0, "step": 4770 }, { "entropy": 1.3328092396259308, "epoch": 0.7007256468518654, "grad_norm": 7.46875, "learning_rate": 1.4010554089709764e-05, "loss": 1.2997, "mean_token_accuracy": 0.6727986395359039, "num_tokens": 11046973.0, "step": 4780 }, { "entropy": 1.2961938366293908, "epoch": 0.7021916000879572, "grad_norm": 9.375, "learning_rate": 1.4039871005570216e-05, "loss": 1.1829, "mean_token_accuracy": 0.6831134587526322, "num_tokens": 11070271.0, "step": 4790 }, { "entropy": 1.4047529637813567, "epoch": 0.703657553324049, "grad_norm": 10.3125, "learning_rate": 1.4069187921430667e-05, "loss": 1.3909, "mean_token_accuracy": 0.6677388578653336, "num_tokens": 11089567.0, "step": 4800 }, { "entropy": 1.5368960559368134, "epoch": 0.7051235065601408, "grad_norm": 9.875, "learning_rate": 1.4098504837291118e-05, "loss": 1.497, "mean_token_accuracy": 0.6478627651929856, "num_tokens": 11110862.0, "step": 4810 }, { "entropy": 1.457329884171486, "epoch": 0.7065894597962326, "grad_norm": 9.0, "learning_rate": 1.4127821753151569e-05, "loss": 1.4248, "mean_token_accuracy": 0.6601524531841279, "num_tokens": 11133547.0, "step": 4820 }, { "entropy": 1.2269539147615434, "epoch": 0.7080554130323242, "grad_norm": 9.75, "learning_rate": 1.415713866901202e-05, "loss": 1.1738, "mean_token_accuracy": 0.6996443212032318, "num_tokens": 11155788.0, "step": 4830 }, { "entropy": 1.43188256919384, "epoch": 0.709521366268416, "grad_norm": 9.9375, "learning_rate": 1.4186455584872471e-05, "loss": 1.3624, "mean_token_accuracy": 0.6640131086111069, "num_tokens": 11177745.0, "step": 4840 }, { "entropy": 1.046053270995617, "epoch": 0.7109873195045078, "grad_norm": 6.84375, "learning_rate": 1.4215772500732926e-05, "loss": 0.9956, "mean_token_accuracy": 0.7363066077232361, "num_tokens": 11204715.0, "step": 4850 }, { "entropy": 1.2720306187868118, "epoch": 0.7124532727405996, "grad_norm": 12.0625, "learning_rate": 1.4245089416593377e-05, "loss": 1.2259, "mean_token_accuracy": 0.6891566187143325, "num_tokens": 11228028.0, "step": 4860 }, { "entropy": 1.2679410010576249, "epoch": 0.7139192259766913, "grad_norm": 7.125, "learning_rate": 1.4274406332453828e-05, "loss": 1.235, "mean_token_accuracy": 0.6972826570272446, "num_tokens": 11251823.0, "step": 4870 }, { "entropy": 1.5728963315486908, "epoch": 0.7153851792127831, "grad_norm": 9.0625, "learning_rate": 1.430372324831428e-05, "loss": 1.5777, "mean_token_accuracy": 0.6348887220025062, "num_tokens": 11269697.0, "step": 4880 }, { "entropy": 1.28750339448452, "epoch": 0.7168511324488749, "grad_norm": 10.25, "learning_rate": 1.433304016417473e-05, "loss": 1.2154, "mean_token_accuracy": 0.6795454293489456, "num_tokens": 11292911.0, "step": 4890 }, { "entropy": 1.2924628466367722, "epoch": 0.7183170856849667, "grad_norm": 10.625, "learning_rate": 1.4362357080035182e-05, "loss": 1.3102, "mean_token_accuracy": 0.6829824715852737, "num_tokens": 11313839.0, "step": 4900 }, { "entropy": 1.2178005695343017, "epoch": 0.7197830389210584, "grad_norm": 7.90625, "learning_rate": 1.4391673995895633e-05, "loss": 1.1603, "mean_token_accuracy": 0.7007954269647598, "num_tokens": 11336232.0, "step": 4910 }, { "entropy": 1.2192000716924667, "epoch": 0.7212489921571502, "grad_norm": 14.5, "learning_rate": 1.4420990911756086e-05, "loss": 1.1772, "mean_token_accuracy": 0.701339590549469, "num_tokens": 11358965.0, "step": 4920 }, { "entropy": 1.319139078259468, "epoch": 0.722714945393242, "grad_norm": 7.5625, "learning_rate": 1.4450307827616537e-05, "loss": 1.2493, "mean_token_accuracy": 0.6861840188503265, "num_tokens": 11383698.0, "step": 4930 }, { "entropy": 1.232242950797081, "epoch": 0.7241808986293338, "grad_norm": 11.4375, "learning_rate": 1.4479624743476988e-05, "loss": 1.195, "mean_token_accuracy": 0.69576096534729, "num_tokens": 11407799.0, "step": 4940 }, { "entropy": 1.1999111473560333, "epoch": 0.7256468518654255, "grad_norm": 9.4375, "learning_rate": 1.4508941659337439e-05, "loss": 1.176, "mean_token_accuracy": 0.706417515873909, "num_tokens": 11432317.0, "step": 4950 }, { "entropy": 1.3092357516288757, "epoch": 0.7271128051015172, "grad_norm": 9.75, "learning_rate": 1.453825857519789e-05, "loss": 1.2591, "mean_token_accuracy": 0.6908426403999328, "num_tokens": 11454629.0, "step": 4960 }, { "entropy": 1.3806799292564391, "epoch": 0.728578758337609, "grad_norm": 10.375, "learning_rate": 1.4567575491058341e-05, "loss": 1.3203, "mean_token_accuracy": 0.6736815601587296, "num_tokens": 11475491.0, "step": 4970 }, { "entropy": 1.414483168721199, "epoch": 0.7300447115737008, "grad_norm": 9.625, "learning_rate": 1.4596892406918792e-05, "loss": 1.3893, "mean_token_accuracy": 0.6631225675344468, "num_tokens": 11497669.0, "step": 4980 }, { "entropy": 1.3249703735113143, "epoch": 0.7315106648097925, "grad_norm": 10.1875, "learning_rate": 1.4626209322779245e-05, "loss": 1.3111, "mean_token_accuracy": 0.6807681947946549, "num_tokens": 11523367.0, "step": 4990 }, { "entropy": 1.2282486379146575, "epoch": 0.7329766180458843, "grad_norm": 10.0, "learning_rate": 1.4655526238639696e-05, "loss": 1.1598, "mean_token_accuracy": 0.6980828180909157, "num_tokens": 11544828.0, "step": 5000 }, { "entropy": 1.2738509893417358, "epoch": 0.7344425712819761, "grad_norm": 10.0625, "learning_rate": 1.4684843154500148e-05, "loss": 1.2111, "mean_token_accuracy": 0.6910221531987191, "num_tokens": 11568991.0, "step": 5010 }, { "entropy": 1.215120902657509, "epoch": 0.7359085245180679, "grad_norm": 10.375, "learning_rate": 1.4714160070360599e-05, "loss": 1.2038, "mean_token_accuracy": 0.697847780585289, "num_tokens": 11593148.0, "step": 5020 }, { "entropy": 1.3138453751802444, "epoch": 0.7373744777541597, "grad_norm": 7.875, "learning_rate": 1.474347698622105e-05, "loss": 1.24, "mean_token_accuracy": 0.6847038328647613, "num_tokens": 11612991.0, "step": 5030 }, { "entropy": 1.3226786196231841, "epoch": 0.7388404309902514, "grad_norm": 7.6875, "learning_rate": 1.4772793902081501e-05, "loss": 1.2833, "mean_token_accuracy": 0.6771773576736451, "num_tokens": 11632051.0, "step": 5040 }, { "entropy": 1.2309457600116729, "epoch": 0.7403063842263432, "grad_norm": 7.0, "learning_rate": 1.4802110817941956e-05, "loss": 1.1749, "mean_token_accuracy": 0.6957832410931587, "num_tokens": 11657872.0, "step": 5050 }, { "entropy": 1.3276364773511886, "epoch": 0.741772337462435, "grad_norm": 6.5625, "learning_rate": 1.4831427733802407e-05, "loss": 1.3103, "mean_token_accuracy": 0.6776412919163703, "num_tokens": 11680446.0, "step": 5060 }, { "entropy": 1.0554574191570283, "epoch": 0.7432382906985268, "grad_norm": 8.8125, "learning_rate": 1.4860744649662858e-05, "loss": 1.0347, "mean_token_accuracy": 0.7339630395174026, "num_tokens": 11707596.0, "step": 5070 }, { "entropy": 1.449861115217209, "epoch": 0.7447042439346185, "grad_norm": 13.75, "learning_rate": 1.4890061565523309e-05, "loss": 1.4402, "mean_token_accuracy": 0.6547907769680024, "num_tokens": 11729645.0, "step": 5080 }, { "entropy": 1.215521377325058, "epoch": 0.7461701971707102, "grad_norm": 7.21875, "learning_rate": 1.491937848138376e-05, "loss": 1.1786, "mean_token_accuracy": 0.7029460728168487, "num_tokens": 11754372.0, "step": 5090 }, { "entropy": 1.2404891163110734, "epoch": 0.747636150406802, "grad_norm": 6.84375, "learning_rate": 1.4948695397244211e-05, "loss": 1.1488, "mean_token_accuracy": 0.6969448626041412, "num_tokens": 11776620.0, "step": 5100 }, { "entropy": 1.3023002684116363, "epoch": 0.7491021036428938, "grad_norm": 11.125, "learning_rate": 1.4978012313104662e-05, "loss": 1.2702, "mean_token_accuracy": 0.6769918829202652, "num_tokens": 11798303.0, "step": 5110 }, { "entropy": 1.3634466335177422, "epoch": 0.7505680568789855, "grad_norm": 13.8125, "learning_rate": 1.5007329228965115e-05, "loss": 1.3075, "mean_token_accuracy": 0.6755014836788178, "num_tokens": 11822792.0, "step": 5120 }, { "entropy": 1.2942151188850404, "epoch": 0.7520340101150773, "grad_norm": 10.1875, "learning_rate": 1.5036646144825566e-05, "loss": 1.2821, "mean_token_accuracy": 0.6888857513666153, "num_tokens": 11845388.0, "step": 5130 }, { "entropy": 1.3187985837459564, "epoch": 0.7534999633511691, "grad_norm": 7.6875, "learning_rate": 1.5065963060686018e-05, "loss": 1.2683, "mean_token_accuracy": 0.6765559881925582, "num_tokens": 11866299.0, "step": 5140 }, { "entropy": 1.2510508447885513, "epoch": 0.7549659165872609, "grad_norm": 10.375, "learning_rate": 1.5095279976546469e-05, "loss": 1.2056, "mean_token_accuracy": 0.6992509812116623, "num_tokens": 11890461.0, "step": 5150 }, { "entropy": 1.2876760244369507, "epoch": 0.7564318698233526, "grad_norm": 7.5, "learning_rate": 1.512459689240692e-05, "loss": 1.2447, "mean_token_accuracy": 0.6855009347200394, "num_tokens": 11911274.0, "step": 5160 }, { "entropy": 1.2132894963026046, "epoch": 0.7578978230594444, "grad_norm": 6.28125, "learning_rate": 1.5153913808267371e-05, "loss": 1.1697, "mean_token_accuracy": 0.7027882069349289, "num_tokens": 11936085.0, "step": 5170 }, { "entropy": 1.2971851497888565, "epoch": 0.7593637762955362, "grad_norm": 10.1875, "learning_rate": 1.5183230724127822e-05, "loss": 1.1871, "mean_token_accuracy": 0.6921399280428886, "num_tokens": 11960996.0, "step": 5180 }, { "entropy": 1.2802244037389756, "epoch": 0.760829729531628, "grad_norm": 7.34375, "learning_rate": 1.5212547639988275e-05, "loss": 1.2418, "mean_token_accuracy": 0.6926645278930664, "num_tokens": 11983944.0, "step": 5190 }, { "entropy": 1.4996819853782655, "epoch": 0.7622956827677198, "grad_norm": 13.0625, "learning_rate": 1.5241864555848726e-05, "loss": 1.43, "mean_token_accuracy": 0.6550417497754097, "num_tokens": 12006954.0, "step": 5200 }, { "entropy": 1.1083065778017045, "epoch": 0.7637616360038115, "grad_norm": 9.375, "learning_rate": 1.5271181471709177e-05, "loss": 1.0296, "mean_token_accuracy": 0.7281710386276246, "num_tokens": 12031594.0, "step": 5210 }, { "entropy": 1.2428986430168152, "epoch": 0.7652275892399032, "grad_norm": 8.5625, "learning_rate": 1.530049838756963e-05, "loss": 1.1817, "mean_token_accuracy": 0.6961143612861633, "num_tokens": 12055484.0, "step": 5220 }, { "entropy": 1.2242067754268646, "epoch": 0.766693542475995, "grad_norm": 8.25, "learning_rate": 1.532981530343008e-05, "loss": 1.1685, "mean_token_accuracy": 0.6954874604940414, "num_tokens": 12079620.0, "step": 5230 }, { "entropy": 1.2395792424678802, "epoch": 0.7681594957120867, "grad_norm": 9.4375, "learning_rate": 1.5359132219290532e-05, "loss": 1.2342, "mean_token_accuracy": 0.6881227746605874, "num_tokens": 12104547.0, "step": 5240 }, { "entropy": 1.1322668135166167, "epoch": 0.7696254489481785, "grad_norm": 7.4375, "learning_rate": 1.5388449135150982e-05, "loss": 1.0558, "mean_token_accuracy": 0.7162070393562316, "num_tokens": 12130574.0, "step": 5250 }, { "entropy": 1.2922002375125885, "epoch": 0.7710914021842703, "grad_norm": 15.4375, "learning_rate": 1.5417766051011435e-05, "loss": 1.2744, "mean_token_accuracy": 0.6919665589928627, "num_tokens": 12152259.0, "step": 5260 }, { "entropy": 1.3774068623781204, "epoch": 0.7725573554203621, "grad_norm": 9.0625, "learning_rate": 1.5447082966871888e-05, "loss": 1.3623, "mean_token_accuracy": 0.6684454590082168, "num_tokens": 12174123.0, "step": 5270 }, { "entropy": 1.213736817240715, "epoch": 0.7740233086564539, "grad_norm": 8.625, "learning_rate": 1.5476399882732337e-05, "loss": 1.1553, "mean_token_accuracy": 0.7016455262899399, "num_tokens": 12197104.0, "step": 5280 }, { "entropy": 1.2927948385477066, "epoch": 0.7754892618925456, "grad_norm": 9.0625, "learning_rate": 1.550571679859279e-05, "loss": 1.2334, "mean_token_accuracy": 0.6916782230138778, "num_tokens": 12220885.0, "step": 5290 }, { "entropy": 1.3925365865230561, "epoch": 0.7769552151286374, "grad_norm": 9.0, "learning_rate": 1.553503371445324e-05, "loss": 1.3669, "mean_token_accuracy": 0.6685474902391434, "num_tokens": 12238807.0, "step": 5300 }, { "entropy": 1.2368726968765258, "epoch": 0.7784211683647292, "grad_norm": 10.25, "learning_rate": 1.5564350630313692e-05, "loss": 1.1706, "mean_token_accuracy": 0.703268775343895, "num_tokens": 12264613.0, "step": 5310 }, { "entropy": 1.3310335576534271, "epoch": 0.779887121600821, "grad_norm": 8.625, "learning_rate": 1.559366754617414e-05, "loss": 1.3005, "mean_token_accuracy": 0.6759785771369934, "num_tokens": 12285316.0, "step": 5320 }, { "entropy": 1.1513732075691223, "epoch": 0.7813530748369127, "grad_norm": 9.1875, "learning_rate": 1.5622984462034594e-05, "loss": 1.0959, "mean_token_accuracy": 0.7136809527873993, "num_tokens": 12313163.0, "step": 5330 }, { "entropy": 1.216403305530548, "epoch": 0.7828190280730045, "grad_norm": 9.0, "learning_rate": 1.5652301377895047e-05, "loss": 1.15, "mean_token_accuracy": 0.697441303730011, "num_tokens": 12334161.0, "step": 5340 }, { "entropy": 1.2555270463228225, "epoch": 0.7842849813090962, "grad_norm": 10.125, "learning_rate": 1.5681618293755497e-05, "loss": 1.1893, "mean_token_accuracy": 0.6944623023271561, "num_tokens": 12355060.0, "step": 5350 }, { "entropy": 1.3208678781986236, "epoch": 0.785750934545188, "grad_norm": 9.0625, "learning_rate": 1.571093520961595e-05, "loss": 1.2962, "mean_token_accuracy": 0.6841281801462173, "num_tokens": 12378550.0, "step": 5360 }, { "entropy": 1.373434391617775, "epoch": 0.7872168877812797, "grad_norm": 10.625, "learning_rate": 1.57402521254764e-05, "loss": 1.3315, "mean_token_accuracy": 0.6764768958091736, "num_tokens": 12399984.0, "step": 5370 }, { "entropy": 1.3020552098751068, "epoch": 0.7886828410173715, "grad_norm": 6.59375, "learning_rate": 1.5769569041336852e-05, "loss": 1.2868, "mean_token_accuracy": 0.6844318062067032, "num_tokens": 12424719.0, "step": 5380 }, { "entropy": 1.3585431545972824, "epoch": 0.7901487942534633, "grad_norm": 10.75, "learning_rate": 1.5798885957197305e-05, "loss": 1.3288, "mean_token_accuracy": 0.6881196677684784, "num_tokens": 12445034.0, "step": 5390 }, { "entropy": 1.4066684693098068, "epoch": 0.7916147474895551, "grad_norm": 14.625, "learning_rate": 1.5828202873057758e-05, "loss": 1.3854, "mean_token_accuracy": 0.6633688390254975, "num_tokens": 12463093.0, "step": 5400 }, { "entropy": 1.2170531302690506, "epoch": 0.7930807007256468, "grad_norm": 7.28125, "learning_rate": 1.5857519788918207e-05, "loss": 1.1576, "mean_token_accuracy": 0.7000980019569397, "num_tokens": 12485708.0, "step": 5410 }, { "entropy": 1.4619715064764023, "epoch": 0.7945466539617386, "grad_norm": 8.625, "learning_rate": 1.588683670477866e-05, "loss": 1.4205, "mean_token_accuracy": 0.6526595711708069, "num_tokens": 12507420.0, "step": 5420 }, { "entropy": 1.1694447427988053, "epoch": 0.7960126071978304, "grad_norm": 9.75, "learning_rate": 1.591615362063911e-05, "loss": 1.0705, "mean_token_accuracy": 0.7071300029754639, "num_tokens": 12530762.0, "step": 5430 }, { "entropy": 1.1355178713798524, "epoch": 0.7974785604339222, "grad_norm": 9.5, "learning_rate": 1.5945470536499562e-05, "loss": 1.1203, "mean_token_accuracy": 0.7143382340669632, "num_tokens": 12556407.0, "step": 5440 }, { "entropy": 1.1633911848068237, "epoch": 0.798944513670014, "grad_norm": 6.21875, "learning_rate": 1.597478745236001e-05, "loss": 1.1492, "mean_token_accuracy": 0.7092549681663514, "num_tokens": 12581121.0, "step": 5450 }, { "entropy": 1.3152824550867082, "epoch": 0.8004104669061057, "grad_norm": 7.8125, "learning_rate": 1.6004104368220464e-05, "loss": 1.273, "mean_token_accuracy": 0.6806286334991455, "num_tokens": 12602736.0, "step": 5460 }, { "entropy": 1.3658701062202454, "epoch": 0.8018764201421975, "grad_norm": 11.0, "learning_rate": 1.6033421284080917e-05, "loss": 1.3625, "mean_token_accuracy": 0.6828352361917496, "num_tokens": 12623410.0, "step": 5470 }, { "entropy": 1.15870793312788, "epoch": 0.8033423733782893, "grad_norm": 5.90625, "learning_rate": 1.6062738199941367e-05, "loss": 1.1126, "mean_token_accuracy": 0.7144654333591461, "num_tokens": 12647385.0, "step": 5480 }, { "entropy": 1.2270306706428529, "epoch": 0.804808326614381, "grad_norm": 8.75, "learning_rate": 1.609205511580182e-05, "loss": 1.1823, "mean_token_accuracy": 0.6972189277410508, "num_tokens": 12671533.0, "step": 5490 }, { "entropy": 1.2809490233659744, "epoch": 0.8062742798504727, "grad_norm": 16.0, "learning_rate": 1.612137203166227e-05, "loss": 1.2378, "mean_token_accuracy": 0.6920528322458267, "num_tokens": 12694143.0, "step": 5500 }, { "entropy": 1.3563053607940674, "epoch": 0.8077402330865645, "grad_norm": 8.1875, "learning_rate": 1.6150688947522722e-05, "loss": 1.3462, "mean_token_accuracy": 0.6757564902305603, "num_tokens": 12716757.0, "step": 5510 }, { "entropy": 1.2171526342630385, "epoch": 0.8092061863226563, "grad_norm": 9.4375, "learning_rate": 1.618000586338317e-05, "loss": 1.1796, "mean_token_accuracy": 0.7081211119890213, "num_tokens": 12745336.0, "step": 5520 }, { "entropy": 1.3560230165719986, "epoch": 0.8106721395587481, "grad_norm": 12.1875, "learning_rate": 1.6209322779243624e-05, "loss": 1.3526, "mean_token_accuracy": 0.6724089115858078, "num_tokens": 12767340.0, "step": 5530 }, { "entropy": 1.2869590252637864, "epoch": 0.8121380927948398, "grad_norm": 15.0, "learning_rate": 1.6238639695104077e-05, "loss": 1.2619, "mean_token_accuracy": 0.6851057916879654, "num_tokens": 12785268.0, "step": 5540 }, { "entropy": 1.2207739114761353, "epoch": 0.8136040460309316, "grad_norm": 7.90625, "learning_rate": 1.626795661096453e-05, "loss": 1.1454, "mean_token_accuracy": 0.7067738696932793, "num_tokens": 12808612.0, "step": 5550 }, { "entropy": 1.289278057217598, "epoch": 0.8150699992670234, "grad_norm": 9.375, "learning_rate": 1.629727352682498e-05, "loss": 1.19, "mean_token_accuracy": 0.6987370043992996, "num_tokens": 12831594.0, "step": 5560 }, { "entropy": 1.4748449683189393, "epoch": 0.8165359525031152, "grad_norm": 8.9375, "learning_rate": 1.6326590442685432e-05, "loss": 1.439, "mean_token_accuracy": 0.6480473399162292, "num_tokens": 12853828.0, "step": 5570 }, { "entropy": 1.2534620195627213, "epoch": 0.818001905739207, "grad_norm": 8.1875, "learning_rate": 1.635590735854588e-05, "loss": 1.267, "mean_token_accuracy": 0.6883357495069504, "num_tokens": 12876188.0, "step": 5580 }, { "entropy": 1.1396201640367507, "epoch": 0.8194678589752987, "grad_norm": 12.625, "learning_rate": 1.6385224274406334e-05, "loss": 1.0215, "mean_token_accuracy": 0.719684848189354, "num_tokens": 12898202.0, "step": 5590 }, { "entropy": 1.2761299908161163, "epoch": 0.8209338122113905, "grad_norm": 12.125, "learning_rate": 1.6414541190266787e-05, "loss": 1.2421, "mean_token_accuracy": 0.6903198391199112, "num_tokens": 12918913.0, "step": 5600 }, { "entropy": 1.3033012092113494, "epoch": 0.8223997654474823, "grad_norm": 10.6875, "learning_rate": 1.6443858106127237e-05, "loss": 1.2475, "mean_token_accuracy": 0.6859813496470452, "num_tokens": 12941686.0, "step": 5610 }, { "entropy": 1.3516968309879303, "epoch": 0.8238657186835739, "grad_norm": 7.65625, "learning_rate": 1.647317502198769e-05, "loss": 1.3317, "mean_token_accuracy": 0.680739912390709, "num_tokens": 12965160.0, "step": 5620 }, { "entropy": 1.2261343955993653, "epoch": 0.8253316719196657, "grad_norm": 7.0, "learning_rate": 1.650249193784814e-05, "loss": 1.141, "mean_token_accuracy": 0.7057270109653473, "num_tokens": 12990591.0, "step": 5630 }, { "entropy": 1.153782194852829, "epoch": 0.8267976251557575, "grad_norm": 6.84375, "learning_rate": 1.6531808853708592e-05, "loss": 1.1002, "mean_token_accuracy": 0.7100861877202987, "num_tokens": 13014555.0, "step": 5640 }, { "entropy": 1.224363997578621, "epoch": 0.8282635783918493, "grad_norm": 14.25, "learning_rate": 1.656112576956904e-05, "loss": 1.2311, "mean_token_accuracy": 0.6970030486583709, "num_tokens": 13035900.0, "step": 5650 }, { "entropy": 1.0583749681711196, "epoch": 0.829729531627941, "grad_norm": 10.0, "learning_rate": 1.6590442685429494e-05, "loss": 1.015, "mean_token_accuracy": 0.7325855046510696, "num_tokens": 13062449.0, "step": 5660 }, { "entropy": 1.3571293652057648, "epoch": 0.8311954848640328, "grad_norm": 9.4375, "learning_rate": 1.6619759601289947e-05, "loss": 1.2857, "mean_token_accuracy": 0.6735773056745529, "num_tokens": 13085813.0, "step": 5670 }, { "entropy": 1.159355565905571, "epoch": 0.8326614381001246, "grad_norm": 7.71875, "learning_rate": 1.6649076517150396e-05, "loss": 1.102, "mean_token_accuracy": 0.7032191962003708, "num_tokens": 13108448.0, "step": 5680 }, { "entropy": 1.325744691491127, "epoch": 0.8341273913362164, "grad_norm": 16.25, "learning_rate": 1.667839343301085e-05, "loss": 1.3085, "mean_token_accuracy": 0.6752249509096145, "num_tokens": 13133511.0, "step": 5690 }, { "entropy": 1.2633306086063385, "epoch": 0.8355933445723082, "grad_norm": 9.1875, "learning_rate": 1.67077103488713e-05, "loss": 1.1952, "mean_token_accuracy": 0.6896404296159744, "num_tokens": 13154260.0, "step": 5700 }, { "entropy": 1.3849945574998856, "epoch": 0.8370592978083999, "grad_norm": 10.9375, "learning_rate": 1.673702726473175e-05, "loss": 1.3354, "mean_token_accuracy": 0.6664645493030548, "num_tokens": 13175380.0, "step": 5710 }, { "entropy": 1.4683417350053787, "epoch": 0.8385252510444917, "grad_norm": 13.75, "learning_rate": 1.67663441805922e-05, "loss": 1.432, "mean_token_accuracy": 0.6533677369356156, "num_tokens": 13196583.0, "step": 5720 }, { "entropy": 1.3935118615627289, "epoch": 0.8399912042805835, "grad_norm": 9.25, "learning_rate": 1.6795661096452657e-05, "loss": 1.3517, "mean_token_accuracy": 0.6653122037649155, "num_tokens": 13222661.0, "step": 5730 }, { "entropy": 1.4118003964424133, "epoch": 0.8414571575166753, "grad_norm": 10.375, "learning_rate": 1.6824978012313107e-05, "loss": 1.3774, "mean_token_accuracy": 0.6675875633955002, "num_tokens": 13244810.0, "step": 5740 }, { "entropy": 1.2022523313760758, "epoch": 0.8429231107527669, "grad_norm": 6.90625, "learning_rate": 1.685429492817356e-05, "loss": 1.1286, "mean_token_accuracy": 0.7013562709093094, "num_tokens": 13268844.0, "step": 5750 }, { "entropy": 1.2868722707033158, "epoch": 0.8443890639888587, "grad_norm": 8.25, "learning_rate": 1.688361184403401e-05, "loss": 1.1944, "mean_token_accuracy": 0.6873423963785171, "num_tokens": 13289092.0, "step": 5760 }, { "entropy": 1.4429591357707978, "epoch": 0.8458550172249505, "grad_norm": 13.75, "learning_rate": 1.6912928759894462e-05, "loss": 1.436, "mean_token_accuracy": 0.6600356504321099, "num_tokens": 13311712.0, "step": 5770 }, { "entropy": 1.1607126146554947, "epoch": 0.8473209704610423, "grad_norm": 8.4375, "learning_rate": 1.694224567575491e-05, "loss": 1.1173, "mean_token_accuracy": 0.7116161555051803, "num_tokens": 13339374.0, "step": 5780 }, { "entropy": 1.2251354336738587, "epoch": 0.848786923697134, "grad_norm": 7.28125, "learning_rate": 1.6971562591615364e-05, "loss": 1.1899, "mean_token_accuracy": 0.7067140489816666, "num_tokens": 13366274.0, "step": 5790 }, { "entropy": 1.4135758846998214, "epoch": 0.8502528769332258, "grad_norm": 9.0625, "learning_rate": 1.7000879507475817e-05, "loss": 1.4441, "mean_token_accuracy": 0.6604284584522248, "num_tokens": 13384724.0, "step": 5800 }, { "entropy": 1.1871341913938522, "epoch": 0.8517188301693176, "grad_norm": 5.75, "learning_rate": 1.7030196423336266e-05, "loss": 1.084, "mean_token_accuracy": 0.7074188977479935, "num_tokens": 13410396.0, "step": 5810 }, { "entropy": 1.3809557497501372, "epoch": 0.8531847834054094, "grad_norm": 10.25, "learning_rate": 1.705951333919672e-05, "loss": 1.3348, "mean_token_accuracy": 0.6785537540912628, "num_tokens": 13434715.0, "step": 5820 }, { "entropy": 1.3020508915185929, "epoch": 0.8546507366415012, "grad_norm": 12.875, "learning_rate": 1.708883025505717e-05, "loss": 1.2862, "mean_token_accuracy": 0.6800718754529953, "num_tokens": 13459841.0, "step": 5830 }, { "entropy": 1.293996438384056, "epoch": 0.8561166898775929, "grad_norm": 6.875, "learning_rate": 1.711814717091762e-05, "loss": 1.2072, "mean_token_accuracy": 0.685341227054596, "num_tokens": 13483164.0, "step": 5840 }, { "entropy": 1.3824090301990508, "epoch": 0.8575826431136847, "grad_norm": 11.625, "learning_rate": 1.714746408677807e-05, "loss": 1.3338, "mean_token_accuracy": 0.6709936171770096, "num_tokens": 13506616.0, "step": 5850 }, { "entropy": 1.3666403502225877, "epoch": 0.8590485963497765, "grad_norm": 10.875, "learning_rate": 1.7176781002638524e-05, "loss": 1.3221, "mean_token_accuracy": 0.6728556305170059, "num_tokens": 13528750.0, "step": 5860 }, { "entropy": 1.2752291679382324, "epoch": 0.8605145495858683, "grad_norm": 9.5625, "learning_rate": 1.7206097918498977e-05, "loss": 1.2145, "mean_token_accuracy": 0.6944393932819366, "num_tokens": 13553261.0, "step": 5870 }, { "entropy": 1.211089026927948, "epoch": 0.8619805028219599, "grad_norm": 12.3125, "learning_rate": 1.7235414834359426e-05, "loss": 1.1444, "mean_token_accuracy": 0.7070489257574082, "num_tokens": 13574848.0, "step": 5880 }, { "entropy": 1.3241501301527023, "epoch": 0.8634464560580517, "grad_norm": 8.5, "learning_rate": 1.726473175021988e-05, "loss": 1.3358, "mean_token_accuracy": 0.6756343424320221, "num_tokens": 13601635.0, "step": 5890 }, { "entropy": 1.291443195939064, "epoch": 0.8649124092941435, "grad_norm": 9.375, "learning_rate": 1.729404866608033e-05, "loss": 1.2754, "mean_token_accuracy": 0.692650043964386, "num_tokens": 13624069.0, "step": 5900 }, { "entropy": 1.2183425694704055, "epoch": 0.8663783625302353, "grad_norm": 10.1875, "learning_rate": 1.732336558194078e-05, "loss": 1.2313, "mean_token_accuracy": 0.6963761448860168, "num_tokens": 13650237.0, "step": 5910 }, { "entropy": 1.3634672164916992, "epoch": 0.867844315766327, "grad_norm": 8.4375, "learning_rate": 1.735268249780123e-05, "loss": 1.2823, "mean_token_accuracy": 0.6789061248302459, "num_tokens": 13671939.0, "step": 5920 }, { "entropy": 1.2655408352613449, "epoch": 0.8693102690024188, "grad_norm": 5.90625, "learning_rate": 1.7381999413661684e-05, "loss": 1.2544, "mean_token_accuracy": 0.6885170012712478, "num_tokens": 13697904.0, "step": 5930 }, { "entropy": 1.2946384370326995, "epoch": 0.8707762222385106, "grad_norm": 10.0, "learning_rate": 1.7411316329522136e-05, "loss": 1.2539, "mean_token_accuracy": 0.6883791357278823, "num_tokens": 13721572.0, "step": 5940 }, { "entropy": 1.3733666330575942, "epoch": 0.8722421754746024, "grad_norm": 10.125, "learning_rate": 1.744063324538259e-05, "loss": 1.3447, "mean_token_accuracy": 0.6644601672887802, "num_tokens": 13742747.0, "step": 5950 }, { "entropy": 1.2222798824310304, "epoch": 0.8737081287106941, "grad_norm": 13.0625, "learning_rate": 1.746995016124304e-05, "loss": 1.1993, "mean_token_accuracy": 0.7005697786808014, "num_tokens": 13766604.0, "step": 5960 }, { "entropy": 1.414951029419899, "epoch": 0.8751740819467859, "grad_norm": 7.125, "learning_rate": 1.749926707710349e-05, "loss": 1.3372, "mean_token_accuracy": 0.6710182368755341, "num_tokens": 13786777.0, "step": 5970 }, { "entropy": 1.1851381480693817, "epoch": 0.8766400351828777, "grad_norm": 9.5625, "learning_rate": 1.752858399296394e-05, "loss": 1.1186, "mean_token_accuracy": 0.7032156765460968, "num_tokens": 13809982.0, "step": 5980 }, { "entropy": 1.3754118025302886, "epoch": 0.8781059884189695, "grad_norm": 9.8125, "learning_rate": 1.7557900908824394e-05, "loss": 1.3066, "mean_token_accuracy": 0.6736097663640976, "num_tokens": 13832197.0, "step": 5990 }, { "entropy": 1.308347964286804, "epoch": 0.8795719416550613, "grad_norm": 9.75, "learning_rate": 1.7587217824684843e-05, "loss": 1.2401, "mean_token_accuracy": 0.6893261402845383, "num_tokens": 13855482.0, "step": 6000 }, { "entropy": 1.3173475295305253, "epoch": 0.8810378948911529, "grad_norm": 8.9375, "learning_rate": 1.7616534740545296e-05, "loss": 1.2664, "mean_token_accuracy": 0.6858200609683991, "num_tokens": 13877844.0, "step": 6010 }, { "entropy": 1.067092055082321, "epoch": 0.8825038481272447, "grad_norm": 13.6875, "learning_rate": 1.764585165640575e-05, "loss": 1.0372, "mean_token_accuracy": 0.7273286759853363, "num_tokens": 13900714.0, "step": 6020 }, { "entropy": 1.2800173729658126, "epoch": 0.8839698013633365, "grad_norm": 9.8125, "learning_rate": 1.76751685722662e-05, "loss": 1.2541, "mean_token_accuracy": 0.6827918857336044, "num_tokens": 13924329.0, "step": 6030 }, { "entropy": 1.4586503952741623, "epoch": 0.8854357545994282, "grad_norm": 10.875, "learning_rate": 1.770448548812665e-05, "loss": 1.445, "mean_token_accuracy": 0.6494209304451942, "num_tokens": 13946166.0, "step": 6040 }, { "entropy": 1.4777934670448303, "epoch": 0.88690170783552, "grad_norm": 8.8125, "learning_rate": 1.77338024039871e-05, "loss": 1.4587, "mean_token_accuracy": 0.6561086401343346, "num_tokens": 13971062.0, "step": 6050 }, { "entropy": 1.051722651720047, "epoch": 0.8883676610716118, "grad_norm": 6.09375, "learning_rate": 1.7763119319847554e-05, "loss": 1.0056, "mean_token_accuracy": 0.7332020968198776, "num_tokens": 13996928.0, "step": 6060 }, { "entropy": 1.169564825296402, "epoch": 0.8898336143077036, "grad_norm": 9.75, "learning_rate": 1.7792436235708006e-05, "loss": 1.1357, "mean_token_accuracy": 0.715928740799427, "num_tokens": 14023902.0, "step": 6070 }, { "entropy": 1.061112540960312, "epoch": 0.8912995675437954, "grad_norm": 6.65625, "learning_rate": 1.7821753151568456e-05, "loss": 1.0231, "mean_token_accuracy": 0.7275304734706879, "num_tokens": 14050399.0, "step": 6080 }, { "entropy": 1.157866156101227, "epoch": 0.8927655207798871, "grad_norm": 15.4375, "learning_rate": 1.785107006742891e-05, "loss": 1.1177, "mean_token_accuracy": 0.7094449773430824, "num_tokens": 14075572.0, "step": 6090 }, { "entropy": 1.3927031993865966, "epoch": 0.8942314740159789, "grad_norm": 8.8125, "learning_rate": 1.7880386983289358e-05, "loss": 1.36, "mean_token_accuracy": 0.6699548721313476, "num_tokens": 14098371.0, "step": 6100 }, { "entropy": 1.2394950956106185, "epoch": 0.8956974272520707, "grad_norm": 7.15625, "learning_rate": 1.790970389914981e-05, "loss": 1.1704, "mean_token_accuracy": 0.7002541691064834, "num_tokens": 14120086.0, "step": 6110 }, { "entropy": 1.320177388191223, "epoch": 0.8971633804881625, "grad_norm": 9.25, "learning_rate": 1.793902081501026e-05, "loss": 1.2351, "mean_token_accuracy": 0.6869661211967468, "num_tokens": 14142268.0, "step": 6120 }, { "entropy": 1.3187602907419205, "epoch": 0.8986293337242542, "grad_norm": 6.875, "learning_rate": 1.7968337730870713e-05, "loss": 1.2937, "mean_token_accuracy": 0.682950907945633, "num_tokens": 14165948.0, "step": 6130 }, { "entropy": 1.1607182770967484, "epoch": 0.900095286960346, "grad_norm": 6.0, "learning_rate": 1.7997654646731166e-05, "loss": 1.145, "mean_token_accuracy": 0.7091206848621369, "num_tokens": 14189967.0, "step": 6140 }, { "entropy": 1.3777431547641754, "epoch": 0.9015612401964377, "grad_norm": 17.125, "learning_rate": 1.802697156259162e-05, "loss": 1.2954, "mean_token_accuracy": 0.6758611053228378, "num_tokens": 14209209.0, "step": 6150 }, { "entropy": 1.2165469646453857, "epoch": 0.9030271934325295, "grad_norm": 13.5, "learning_rate": 1.805628847845207e-05, "loss": 1.1969, "mean_token_accuracy": 0.7028187274932861, "num_tokens": 14235367.0, "step": 6160 }, { "entropy": 1.2122709065675736, "epoch": 0.9044931466686212, "grad_norm": 8.9375, "learning_rate": 1.808560539431252e-05, "loss": 1.1554, "mean_token_accuracy": 0.7037466615438461, "num_tokens": 14262809.0, "step": 6170 }, { "entropy": 1.3847891956567764, "epoch": 0.905959099904713, "grad_norm": 11.0, "learning_rate": 1.811492231017297e-05, "loss": 1.398, "mean_token_accuracy": 0.672494786977768, "num_tokens": 14287808.0, "step": 6180 }, { "entropy": 1.1647264420986176, "epoch": 0.9074250531408048, "grad_norm": 9.5, "learning_rate": 1.8144239226033424e-05, "loss": 1.1671, "mean_token_accuracy": 0.713229700922966, "num_tokens": 14315289.0, "step": 6190 }, { "entropy": 1.3139775484800338, "epoch": 0.9088910063768966, "grad_norm": 9.4375, "learning_rate": 1.8173556141893873e-05, "loss": 1.2472, "mean_token_accuracy": 0.686634612083435, "num_tokens": 14338311.0, "step": 6200 }, { "entropy": 1.3062845408916473, "epoch": 0.9103569596129883, "grad_norm": 10.0, "learning_rate": 1.8202873057754326e-05, "loss": 1.3012, "mean_token_accuracy": 0.6909263074398041, "num_tokens": 14360897.0, "step": 6210 }, { "entropy": 1.1650154635310173, "epoch": 0.9118229128490801, "grad_norm": 15.75, "learning_rate": 1.823218997361478e-05, "loss": 1.0875, "mean_token_accuracy": 0.7102257162332535, "num_tokens": 14383820.0, "step": 6220 }, { "entropy": 1.451330202817917, "epoch": 0.9132888660851719, "grad_norm": 9.3125, "learning_rate": 1.8261506889475228e-05, "loss": 1.4499, "mean_token_accuracy": 0.6600837200880051, "num_tokens": 14407145.0, "step": 6230 }, { "entropy": 1.3744856297969819, "epoch": 0.9147548193212637, "grad_norm": 14.8125, "learning_rate": 1.829082380533568e-05, "loss": 1.3654, "mean_token_accuracy": 0.6687394082546234, "num_tokens": 14428742.0, "step": 6240 }, { "entropy": 1.3475419372320174, "epoch": 0.9162207725573555, "grad_norm": 7.875, "learning_rate": 1.832014072119613e-05, "loss": 1.2582, "mean_token_accuracy": 0.6800679057836533, "num_tokens": 14451185.0, "step": 6250 }, { "entropy": 1.373263731598854, "epoch": 0.9176867257934472, "grad_norm": 10.25, "learning_rate": 1.8349457637056583e-05, "loss": 1.3521, "mean_token_accuracy": 0.6758869498968124, "num_tokens": 14476842.0, "step": 6260 }, { "entropy": 1.1534820824861527, "epoch": 0.919152679029539, "grad_norm": 10.0, "learning_rate": 1.8378774552917033e-05, "loss": 1.1136, "mean_token_accuracy": 0.7124876141548157, "num_tokens": 14503650.0, "step": 6270 }, { "entropy": 1.12755626142025, "epoch": 0.9206186322656307, "grad_norm": 9.0625, "learning_rate": 1.8408091468777486e-05, "loss": 1.0993, "mean_token_accuracy": 0.710336172580719, "num_tokens": 14527462.0, "step": 6280 }, { "entropy": 1.1319433838129043, "epoch": 0.9220845855017225, "grad_norm": 7.0625, "learning_rate": 1.843740838463794e-05, "loss": 1.1149, "mean_token_accuracy": 0.7256720155477524, "num_tokens": 14554162.0, "step": 6290 }, { "entropy": 1.3045414298772813, "epoch": 0.9235505387378142, "grad_norm": 8.3125, "learning_rate": 1.8466725300498388e-05, "loss": 1.2269, "mean_token_accuracy": 0.678934571146965, "num_tokens": 14577799.0, "step": 6300 }, { "entropy": 1.2833163917064667, "epoch": 0.925016491973906, "grad_norm": 7.625, "learning_rate": 1.849604221635884e-05, "loss": 1.1676, "mean_token_accuracy": 0.6948216944932938, "num_tokens": 14599969.0, "step": 6310 }, { "entropy": 1.2427875071763992, "epoch": 0.9264824452099978, "grad_norm": 6.03125, "learning_rate": 1.852535913221929e-05, "loss": 1.2314, "mean_token_accuracy": 0.6962690323591232, "num_tokens": 14621067.0, "step": 6320 }, { "entropy": 1.262687411904335, "epoch": 0.9279483984460896, "grad_norm": 15.8125, "learning_rate": 1.8554676048079743e-05, "loss": 1.1908, "mean_token_accuracy": 0.6913136810064315, "num_tokens": 14645323.0, "step": 6330 }, { "entropy": 1.0148941993713378, "epoch": 0.9294143516821813, "grad_norm": 10.4375, "learning_rate": 1.8583992963940192e-05, "loss": 0.9338, "mean_token_accuracy": 0.7425265163183212, "num_tokens": 14673804.0, "step": 6340 }, { "entropy": 1.3369463503360748, "epoch": 0.9308803049182731, "grad_norm": 9.125, "learning_rate": 1.861330987980065e-05, "loss": 1.2388, "mean_token_accuracy": 0.6781536966562272, "num_tokens": 14697388.0, "step": 6350 }, { "entropy": 1.3616376578807832, "epoch": 0.9323462581543649, "grad_norm": 10.0, "learning_rate": 1.8642626795661098e-05, "loss": 1.3224, "mean_token_accuracy": 0.6740956872701644, "num_tokens": 14717583.0, "step": 6360 }, { "entropy": 1.485172264277935, "epoch": 0.9338122113904567, "grad_norm": 9.0, "learning_rate": 1.867194371152155e-05, "loss": 1.4066, "mean_token_accuracy": 0.6647861048579216, "num_tokens": 14740273.0, "step": 6370 }, { "entropy": 1.2855311006307601, "epoch": 0.9352781646265484, "grad_norm": 5.34375, "learning_rate": 1.8701260627382e-05, "loss": 1.291, "mean_token_accuracy": 0.689969590306282, "num_tokens": 14762114.0, "step": 6380 }, { "entropy": 1.3094401061534882, "epoch": 0.9367441178626402, "grad_norm": 12.9375, "learning_rate": 1.8730577543242453e-05, "loss": 1.3159, "mean_token_accuracy": 0.685760098695755, "num_tokens": 14785971.0, "step": 6390 }, { "entropy": 1.246167467534542, "epoch": 0.938210071098732, "grad_norm": 14.0, "learning_rate": 1.8759894459102903e-05, "loss": 1.1921, "mean_token_accuracy": 0.6971038043498993, "num_tokens": 14809997.0, "step": 6400 }, { "entropy": 1.248328447341919, "epoch": 0.9396760243348237, "grad_norm": 7.375, "learning_rate": 1.8789211374963356e-05, "loss": 1.146, "mean_token_accuracy": 0.6973910629749298, "num_tokens": 14831850.0, "step": 6410 }, { "entropy": 1.1846527129411697, "epoch": 0.9411419775709154, "grad_norm": 14.25, "learning_rate": 1.881852829082381e-05, "loss": 1.1368, "mean_token_accuracy": 0.7037977397441864, "num_tokens": 14855665.0, "step": 6420 }, { "entropy": 1.4207362115383149, "epoch": 0.9426079308070072, "grad_norm": 14.0625, "learning_rate": 1.8847845206684258e-05, "loss": 1.3889, "mean_token_accuracy": 0.6569222033023834, "num_tokens": 14874847.0, "step": 6430 }, { "entropy": 1.3470139503479004, "epoch": 0.944073884043099, "grad_norm": 9.625, "learning_rate": 1.887716212254471e-05, "loss": 1.3066, "mean_token_accuracy": 0.6775850206613541, "num_tokens": 14896301.0, "step": 6440 }, { "entropy": 1.2052619457244873, "epoch": 0.9455398372791908, "grad_norm": 7.03125, "learning_rate": 1.890647903840516e-05, "loss": 1.1935, "mean_token_accuracy": 0.7029782891273498, "num_tokens": 14920757.0, "step": 6450 }, { "entropy": 1.289523822069168, "epoch": 0.9470057905152826, "grad_norm": 6.78125, "learning_rate": 1.8935795954265613e-05, "loss": 1.2413, "mean_token_accuracy": 0.6899956434965133, "num_tokens": 14941553.0, "step": 6460 }, { "entropy": 1.326800000667572, "epoch": 0.9484717437513743, "grad_norm": 14.6875, "learning_rate": 1.8965112870126062e-05, "loss": 1.3447, "mean_token_accuracy": 0.6779297292232513, "num_tokens": 14962180.0, "step": 6470 }, { "entropy": 1.1080780625343323, "epoch": 0.9499376969874661, "grad_norm": 9.375, "learning_rate": 1.8994429785986515e-05, "loss": 1.0938, "mean_token_accuracy": 0.7222896903753281, "num_tokens": 14985654.0, "step": 6480 }, { "entropy": 1.0597321376204492, "epoch": 0.9514036502235579, "grad_norm": 9.5, "learning_rate": 1.9023746701846968e-05, "loss": 0.9812, "mean_token_accuracy": 0.734853771328926, "num_tokens": 15008886.0, "step": 6490 }, { "entropy": 1.2901944428682328, "epoch": 0.9528696034596497, "grad_norm": 10.875, "learning_rate": 1.9053063617707418e-05, "loss": 1.2434, "mean_token_accuracy": 0.6813958168029786, "num_tokens": 15034690.0, "step": 6500 }, { "entropy": 1.5767694979906082, "epoch": 0.9543355566957414, "grad_norm": 9.75, "learning_rate": 1.908238053356787e-05, "loss": 1.5805, "mean_token_accuracy": 0.6290628105401993, "num_tokens": 15052208.0, "step": 6510 }, { "entropy": 1.4155357986688615, "epoch": 0.9558015099318332, "grad_norm": 8.1875, "learning_rate": 1.911169744942832e-05, "loss": 1.3382, "mean_token_accuracy": 0.6657622039318085, "num_tokens": 15074666.0, "step": 6520 }, { "entropy": 1.267582008242607, "epoch": 0.957267463167925, "grad_norm": 9.5, "learning_rate": 1.9141014365288773e-05, "loss": 1.2152, "mean_token_accuracy": 0.6922637164592743, "num_tokens": 15099238.0, "step": 6530 }, { "entropy": 1.2141892522573472, "epoch": 0.9587334164040167, "grad_norm": 10.0, "learning_rate": 1.9170331281149222e-05, "loss": 1.152, "mean_token_accuracy": 0.7023862451314926, "num_tokens": 15121386.0, "step": 6540 }, { "entropy": 1.4400275737047195, "epoch": 0.9601993696401084, "grad_norm": 11.625, "learning_rate": 1.919964819700968e-05, "loss": 1.4543, "mean_token_accuracy": 0.6592666491866112, "num_tokens": 15142927.0, "step": 6550 }, { "entropy": 1.3655238032341004, "epoch": 0.9616653228762002, "grad_norm": 9.4375, "learning_rate": 1.9228965112870128e-05, "loss": 1.317, "mean_token_accuracy": 0.6675771176815033, "num_tokens": 15163536.0, "step": 6560 }, { "entropy": 1.3766405522823333, "epoch": 0.963131276112292, "grad_norm": 11.0, "learning_rate": 1.925828202873058e-05, "loss": 1.3777, "mean_token_accuracy": 0.6715412318706513, "num_tokens": 15187215.0, "step": 6570 }, { "entropy": 1.2784130245447158, "epoch": 0.9645972293483838, "grad_norm": 9.6875, "learning_rate": 1.928759894459103e-05, "loss": 1.2425, "mean_token_accuracy": 0.6966150164604187, "num_tokens": 15209824.0, "step": 6580 }, { "entropy": 1.3001529783010484, "epoch": 0.9660631825844755, "grad_norm": 8.75, "learning_rate": 1.9316915860451483e-05, "loss": 1.231, "mean_token_accuracy": 0.6881300330162048, "num_tokens": 15235003.0, "step": 6590 }, { "entropy": 1.4163393825292587, "epoch": 0.9675291358205673, "grad_norm": 11.3125, "learning_rate": 1.9346232776311932e-05, "loss": 1.4063, "mean_token_accuracy": 0.6570394486188889, "num_tokens": 15255732.0, "step": 6600 }, { "entropy": 1.3295579850673676, "epoch": 0.9689950890566591, "grad_norm": 8.875, "learning_rate": 1.9375549692172385e-05, "loss": 1.3402, "mean_token_accuracy": 0.684128612279892, "num_tokens": 15277899.0, "step": 6610 }, { "entropy": 1.371691143512726, "epoch": 0.9704610422927509, "grad_norm": 9.125, "learning_rate": 1.9404866608032838e-05, "loss": 1.3422, "mean_token_accuracy": 0.6690101116895676, "num_tokens": 15301456.0, "step": 6620 }, { "entropy": 1.3713926941156387, "epoch": 0.9719269955288427, "grad_norm": 12.25, "learning_rate": 1.9434183523893288e-05, "loss": 1.3328, "mean_token_accuracy": 0.6666859328746796, "num_tokens": 15326824.0, "step": 6630 }, { "entropy": 1.22864630818367, "epoch": 0.9733929487649344, "grad_norm": 6.15625, "learning_rate": 1.946350043975374e-05, "loss": 1.1902, "mean_token_accuracy": 0.6994942188262939, "num_tokens": 15354033.0, "step": 6640 }, { "entropy": 1.4099381923675538, "epoch": 0.9748589020010262, "grad_norm": 13.0625, "learning_rate": 1.949281735561419e-05, "loss": 1.3959, "mean_token_accuracy": 0.6623447269201279, "num_tokens": 15375761.0, "step": 6650 }, { "entropy": 1.3454331934452057, "epoch": 0.976324855237118, "grad_norm": 7.5, "learning_rate": 1.9522134271474643e-05, "loss": 1.2651, "mean_token_accuracy": 0.6809130042791367, "num_tokens": 15396757.0, "step": 6660 }, { "entropy": 1.3706992119550705, "epoch": 0.9777908084732096, "grad_norm": 11.5625, "learning_rate": 1.9551451187335092e-05, "loss": 1.3131, "mean_token_accuracy": 0.6733568489551545, "num_tokens": 15418605.0, "step": 6670 }, { "entropy": 1.3630209654569625, "epoch": 0.9792567617093014, "grad_norm": 7.78125, "learning_rate": 1.9580768103195545e-05, "loss": 1.3653, "mean_token_accuracy": 0.6759110286831855, "num_tokens": 15446001.0, "step": 6680 }, { "entropy": 1.3819999188184737, "epoch": 0.9807227149453932, "grad_norm": 9.125, "learning_rate": 1.9610085019055998e-05, "loss": 1.3387, "mean_token_accuracy": 0.6723785489797592, "num_tokens": 15468005.0, "step": 6690 }, { "entropy": 1.2924242943525315, "epoch": 0.982188668181485, "grad_norm": 12.875, "learning_rate": 1.9639401934916447e-05, "loss": 1.2587, "mean_token_accuracy": 0.6822317481040955, "num_tokens": 15491583.0, "step": 6700 }, { "entropy": 1.3402435332536697, "epoch": 0.9836546214175768, "grad_norm": 8.125, "learning_rate": 1.96687188507769e-05, "loss": 1.2609, "mean_token_accuracy": 0.6775965094566345, "num_tokens": 15512027.0, "step": 6710 }, { "entropy": 1.2856587886810302, "epoch": 0.9851205746536685, "grad_norm": 9.625, "learning_rate": 1.969803576663735e-05, "loss": 1.2223, "mean_token_accuracy": 0.6865221604704856, "num_tokens": 15534533.0, "step": 6720 }, { "entropy": 1.1775938749313355, "epoch": 0.9865865278897603, "grad_norm": 7.40625, "learning_rate": 1.9727352682497802e-05, "loss": 1.1426, "mean_token_accuracy": 0.7080589145421982, "num_tokens": 15559509.0, "step": 6730 }, { "entropy": 1.1928306698799134, "epoch": 0.9880524811258521, "grad_norm": 8.125, "learning_rate": 1.9756669598358252e-05, "loss": 1.0997, "mean_token_accuracy": 0.7096274584531784, "num_tokens": 15580995.0, "step": 6740 }, { "entropy": 1.2795340269804, "epoch": 0.9895184343619439, "grad_norm": 7.34375, "learning_rate": 1.9785986514218708e-05, "loss": 1.1959, "mean_token_accuracy": 0.6903921961784363, "num_tokens": 15603333.0, "step": 6750 }, { "entropy": 1.230308586359024, "epoch": 0.9909843875980356, "grad_norm": 7.21875, "learning_rate": 1.9815303430079158e-05, "loss": 1.1356, "mean_token_accuracy": 0.6975774139165878, "num_tokens": 15623319.0, "step": 6760 }, { "entropy": 1.2422581434249877, "epoch": 0.9924503408341274, "grad_norm": 6.0, "learning_rate": 1.984462034593961e-05, "loss": 1.1611, "mean_token_accuracy": 0.6935051590204239, "num_tokens": 15648519.0, "step": 6770 }, { "entropy": 1.0982193425297737, "epoch": 0.9939162940702192, "grad_norm": 7.34375, "learning_rate": 1.987393726180006e-05, "loss": 1.0886, "mean_token_accuracy": 0.72080197930336, "num_tokens": 15675341.0, "step": 6780 }, { "entropy": 1.2988370180130004, "epoch": 0.995382247306311, "grad_norm": 10.0, "learning_rate": 1.9903254177660513e-05, "loss": 1.2696, "mean_token_accuracy": 0.6817353427410126, "num_tokens": 15696594.0, "step": 6790 }, { "entropy": 1.3875470370054246, "epoch": 0.9968482005424028, "grad_norm": 7.21875, "learning_rate": 1.9932571093520962e-05, "loss": 1.31, "mean_token_accuracy": 0.6587083160877227, "num_tokens": 15716837.0, "step": 6800 }, { "entropy": 1.199909582734108, "epoch": 0.9983141537784944, "grad_norm": 7.71875, "learning_rate": 1.9961888009381415e-05, "loss": 1.1265, "mean_token_accuracy": 0.7046977341175079, "num_tokens": 15740964.0, "step": 6810 }, { "entropy": 1.2920671194791793, "epoch": 0.9997801070145862, "grad_norm": 5.78125, "learning_rate": 1.9991204925241868e-05, "loss": 1.1933, "mean_token_accuracy": 0.6864607453346252, "num_tokens": 15764791.0, "step": 6820 }, { "epoch": 1.0, "eval_entropy": 1.2916618413574463, "eval_loss": 1.3098515272140503, "eval_mean_token_accuracy": 0.6839156101671561, "eval_num_tokens": 15769939.0, "eval_runtime": 56.382, "eval_samples_per_second": 53.776, "eval_steps_per_second": 26.888, "step": 6822 }, { "entropy": 1.2025502857409025, "epoch": 1.0011727625888733, "grad_norm": 9.3125, "learning_rate": 1.9999999358558027e-05, "loss": 1.1841, "mean_token_accuracy": 0.7085828906611392, "num_tokens": 15788917.0, "step": 6830 }, { "entropy": 1.1407565236091615, "epoch": 1.0026387158249652, "grad_norm": 9.875, "learning_rate": 1.999999621680161e-05, "loss": 1.1003, "mean_token_accuracy": 0.7155534386634826, "num_tokens": 15812110.0, "step": 6840 }, { "entropy": 1.395634698867798, "epoch": 1.0041046690610569, "grad_norm": 9.625, "learning_rate": 1.999999045691571e-05, "loss": 1.3382, "mean_token_accuracy": 0.6645452082157135, "num_tokens": 15830870.0, "step": 6850 }, { "entropy": 1.2036957919597626, "epoch": 1.0055706222971488, "grad_norm": 8.9375, "learning_rate": 1.999998207890182e-05, "loss": 1.1509, "mean_token_accuracy": 0.7075277447700501, "num_tokens": 15855232.0, "step": 6860 }, { "entropy": 1.3050825476646424, "epoch": 1.0070365755332404, "grad_norm": 12.8125, "learning_rate": 1.9999971082762145e-05, "loss": 1.3032, "mean_token_accuracy": 0.6743306636810302, "num_tokens": 15874041.0, "step": 6870 }, { "entropy": 1.289553427696228, "epoch": 1.0085025287693323, "grad_norm": 6.125, "learning_rate": 1.999995746849956e-05, "loss": 1.2574, "mean_token_accuracy": 0.6941193372011185, "num_tokens": 15899651.0, "step": 6880 }, { "entropy": 1.419933745265007, "epoch": 1.009968482005424, "grad_norm": 10.5625, "learning_rate": 1.9999941236117627e-05, "loss": 1.3634, "mean_token_accuracy": 0.6674848377704621, "num_tokens": 15922828.0, "step": 6890 }, { "entropy": 1.1469344973564148, "epoch": 1.0114344352415159, "grad_norm": 6.96875, "learning_rate": 1.9999922385620597e-05, "loss": 1.0874, "mean_token_accuracy": 0.7172352224588394, "num_tokens": 15944989.0, "step": 6900 }, { "entropy": 1.1673763230443002, "epoch": 1.0129003884776075, "grad_norm": 9.3125, "learning_rate": 1.999990091701341e-05, "loss": 1.1399, "mean_token_accuracy": 0.7126713842153549, "num_tokens": 15966136.0, "step": 6910 }, { "entropy": 1.1983121752738952, "epoch": 1.0143663417136994, "grad_norm": 8.3125, "learning_rate": 1.9999876830301684e-05, "loss": 1.1295, "mean_token_accuracy": 0.6981220841407776, "num_tokens": 15989316.0, "step": 6920 }, { "entropy": 1.3539187252521514, "epoch": 1.015832294949791, "grad_norm": 8.625, "learning_rate": 1.9999850125491724e-05, "loss": 1.2802, "mean_token_accuracy": 0.6715314492583275, "num_tokens": 16012023.0, "step": 6930 }, { "entropy": 1.205693754553795, "epoch": 1.0172982481858828, "grad_norm": 12.375, "learning_rate": 1.9999820802590517e-05, "loss": 1.1205, "mean_token_accuracy": 0.7019076347351074, "num_tokens": 16034692.0, "step": 6940 }, { "entropy": 1.2099773108959198, "epoch": 1.0187642014219747, "grad_norm": 6.9375, "learning_rate": 1.9999788861605748e-05, "loss": 1.1445, "mean_token_accuracy": 0.6984766364097595, "num_tokens": 16057679.0, "step": 6950 }, { "entropy": 1.25418359041214, "epoch": 1.0202301546580663, "grad_norm": 8.125, "learning_rate": 1.9999754302545778e-05, "loss": 1.177, "mean_token_accuracy": 0.6972251296043396, "num_tokens": 16077955.0, "step": 6960 }, { "entropy": 1.1774555534124374, "epoch": 1.0216961078941582, "grad_norm": 13.75, "learning_rate": 1.9999717125419654e-05, "loss": 1.1193, "mean_token_accuracy": 0.6996659576892853, "num_tokens": 16101214.0, "step": 6970 }, { "entropy": 1.2994250610470772, "epoch": 1.0231620611302499, "grad_norm": 9.375, "learning_rate": 1.999967733023711e-05, "loss": 1.2346, "mean_token_accuracy": 0.6896671801805496, "num_tokens": 16121951.0, "step": 6980 }, { "entropy": 1.301983469724655, "epoch": 1.0246280143663418, "grad_norm": 12.0625, "learning_rate": 1.9999634917008562e-05, "loss": 1.2573, "mean_token_accuracy": 0.6831659972667694, "num_tokens": 16141582.0, "step": 6990 }, { "entropy": 1.3334712237119675, "epoch": 1.0260939676024334, "grad_norm": 12.4375, "learning_rate": 1.9999589885745115e-05, "loss": 1.3268, "mean_token_accuracy": 0.6712213367223739, "num_tokens": 16164858.0, "step": 7000 }, { "entropy": 1.4383193224668502, "epoch": 1.0275599208385253, "grad_norm": 13.6875, "learning_rate": 1.9999542236458563e-05, "loss": 1.4281, "mean_token_accuracy": 0.6554515182971954, "num_tokens": 16188768.0, "step": 7010 }, { "entropy": 1.2059952437877655, "epoch": 1.029025874074617, "grad_norm": 7.40625, "learning_rate": 1.999949196916138e-05, "loss": 1.0684, "mean_token_accuracy": 0.7085949182510376, "num_tokens": 16215478.0, "step": 7020 }, { "entropy": 1.0679068624973298, "epoch": 1.0304918273107089, "grad_norm": 11.75, "learning_rate": 1.999943908386672e-05, "loss": 1.0738, "mean_token_accuracy": 0.7336953729391098, "num_tokens": 16243446.0, "step": 7030 }, { "entropy": 1.2468547612428664, "epoch": 1.0319577805468005, "grad_norm": 9.0, "learning_rate": 1.999938358058844e-05, "loss": 1.2041, "mean_token_accuracy": 0.6971589475870132, "num_tokens": 16266365.0, "step": 7040 }, { "entropy": 1.2434491604566573, "epoch": 1.0334237337828924, "grad_norm": 6.71875, "learning_rate": 1.9999325459341063e-05, "loss": 1.1936, "mean_token_accuracy": 0.6949279189109803, "num_tokens": 16292596.0, "step": 7050 }, { "entropy": 1.2967635929584502, "epoch": 1.034889687018984, "grad_norm": 8.75, "learning_rate": 1.9999264720139807e-05, "loss": 1.1858, "mean_token_accuracy": 0.6905998349189758, "num_tokens": 16319049.0, "step": 7060 }, { "entropy": 1.3346068769693376, "epoch": 1.0363556402550758, "grad_norm": 9.875, "learning_rate": 1.999920136300058e-05, "loss": 1.3174, "mean_token_accuracy": 0.6744190186262131, "num_tokens": 16339882.0, "step": 7070 }, { "entropy": 1.1099591448903083, "epoch": 1.0378215934911676, "grad_norm": 7.3125, "learning_rate": 1.999913538793996e-05, "loss": 1.0746, "mean_token_accuracy": 0.7183218091726303, "num_tokens": 16367621.0, "step": 7080 }, { "entropy": 1.1106664329767226, "epoch": 1.0392875467272593, "grad_norm": 6.09375, "learning_rate": 1.999906679497523e-05, "loss": 1.0436, "mean_token_accuracy": 0.7253878831863403, "num_tokens": 16391546.0, "step": 7090 }, { "entropy": 1.3532351702451706, "epoch": 1.0407534999633512, "grad_norm": 10.125, "learning_rate": 1.9998995584124345e-05, "loss": 1.2568, "mean_token_accuracy": 0.6764392882585526, "num_tokens": 16415892.0, "step": 7100 }, { "entropy": 1.1540004849433898, "epoch": 1.0422194531994429, "grad_norm": 8.8125, "learning_rate": 1.9998921755405944e-05, "loss": 1.1077, "mean_token_accuracy": 0.7128182768821716, "num_tokens": 16440620.0, "step": 7110 }, { "entropy": 1.343952314555645, "epoch": 1.0436854064355348, "grad_norm": 8.0, "learning_rate": 1.999884530883936e-05, "loss": 1.2878, "mean_token_accuracy": 0.6809811010956764, "num_tokens": 16460296.0, "step": 7120 }, { "entropy": 1.2931552827358246, "epoch": 1.0451513596716264, "grad_norm": 10.0, "learning_rate": 1.9998766244444614e-05, "loss": 1.3109, "mean_token_accuracy": 0.6873741716146469, "num_tokens": 16484912.0, "step": 7130 }, { "entropy": 1.1703765511512756, "epoch": 1.0466173129077183, "grad_norm": 6.9375, "learning_rate": 1.9998684562242396e-05, "loss": 1.0601, "mean_token_accuracy": 0.7189384013414383, "num_tokens": 16509349.0, "step": 7140 }, { "entropy": 1.4860281467437744, "epoch": 1.04808326614381, "grad_norm": 9.625, "learning_rate": 1.99986002622541e-05, "loss": 1.496, "mean_token_accuracy": 0.6494212061166763, "num_tokens": 16531031.0, "step": 7150 }, { "entropy": 1.2979628801345826, "epoch": 1.0495492193799019, "grad_norm": 6.09375, "learning_rate": 1.999851334450179e-05, "loss": 1.2513, "mean_token_accuracy": 0.6826528161764145, "num_tokens": 16556594.0, "step": 7160 }, { "entropy": 1.3127130091190338, "epoch": 1.0510151726159935, "grad_norm": 6.3125, "learning_rate": 1.9998423809008223e-05, "loss": 1.2649, "mean_token_accuracy": 0.6929091334342956, "num_tokens": 16577898.0, "step": 7170 }, { "entropy": 1.3124539732933045, "epoch": 1.0524811258520854, "grad_norm": 8.625, "learning_rate": 1.9998331655796844e-05, "loss": 1.2796, "mean_token_accuracy": 0.6811374068260193, "num_tokens": 16601241.0, "step": 7180 }, { "entropy": 1.145053617656231, "epoch": 1.053947079088177, "grad_norm": 11.625, "learning_rate": 1.999823688489178e-05, "loss": 1.0741, "mean_token_accuracy": 0.7136012792587281, "num_tokens": 16624964.0, "step": 7190 }, { "entropy": 1.4930299073457718, "epoch": 1.0554130323242688, "grad_norm": 15.3125, "learning_rate": 1.9998139496317838e-05, "loss": 1.4429, "mean_token_accuracy": 0.6462783008813858, "num_tokens": 16646312.0, "step": 7200 }, { "entropy": 1.244908195734024, "epoch": 1.0568789855603606, "grad_norm": 11.25, "learning_rate": 1.999803949010052e-05, "loss": 1.1924, "mean_token_accuracy": 0.6967493742704391, "num_tokens": 16668375.0, "step": 7210 }, { "entropy": 1.2006265938282012, "epoch": 1.0583449387964523, "grad_norm": 10.6875, "learning_rate": 1.999793686626601e-05, "loss": 1.1682, "mean_token_accuracy": 0.703893455862999, "num_tokens": 16691388.0, "step": 7220 }, { "entropy": 1.2016530066728592, "epoch": 1.0598108920325442, "grad_norm": 10.3125, "learning_rate": 1.9997831624841174e-05, "loss": 1.2033, "mean_token_accuracy": 0.7054964780807496, "num_tokens": 16717103.0, "step": 7230 }, { "entropy": 1.0638704180717469, "epoch": 1.0612768452686359, "grad_norm": 7.125, "learning_rate": 1.9997723765853563e-05, "loss": 1.0331, "mean_token_accuracy": 0.7303820580244065, "num_tokens": 16742696.0, "step": 7240 }, { "entropy": 1.3260975301265716, "epoch": 1.0627427985047277, "grad_norm": 7.65625, "learning_rate": 1.999761328933142e-05, "loss": 1.327, "mean_token_accuracy": 0.6768132835626602, "num_tokens": 16766329.0, "step": 7250 }, { "entropy": 1.3357076048851013, "epoch": 1.0642087517408194, "grad_norm": 7.9375, "learning_rate": 1.999750019530367e-05, "loss": 1.2991, "mean_token_accuracy": 0.6838137969374657, "num_tokens": 16792781.0, "step": 7260 }, { "entropy": 1.35421462059021, "epoch": 1.0656747049769113, "grad_norm": 13.125, "learning_rate": 1.999738448379992e-05, "loss": 1.2708, "mean_token_accuracy": 0.6806362554430961, "num_tokens": 16814580.0, "step": 7270 }, { "entropy": 1.3569172739982605, "epoch": 1.067140658213003, "grad_norm": 11.0, "learning_rate": 1.9997266154850463e-05, "loss": 1.2871, "mean_token_accuracy": 0.6748591914772988, "num_tokens": 16836283.0, "step": 7280 }, { "entropy": 1.3032677710056304, "epoch": 1.0686066114490949, "grad_norm": 12.625, "learning_rate": 1.9997145208486282e-05, "loss": 1.3058, "mean_token_accuracy": 0.6830355480313302, "num_tokens": 16859840.0, "step": 7290 }, { "entropy": 1.1838728606700897, "epoch": 1.0700725646851865, "grad_norm": 9.5, "learning_rate": 1.9997021644739046e-05, "loss": 1.0938, "mean_token_accuracy": 0.706995639204979, "num_tokens": 16881720.0, "step": 7300 }, { "entropy": 1.2360979318618774, "epoch": 1.0715385179212784, "grad_norm": 9.375, "learning_rate": 1.9996895463641096e-05, "loss": 1.1873, "mean_token_accuracy": 0.6982561200857162, "num_tokens": 16903617.0, "step": 7310 }, { "entropy": 1.2938247561454772, "epoch": 1.07300447115737, "grad_norm": 9.5625, "learning_rate": 1.9996766665225475e-05, "loss": 1.2479, "mean_token_accuracy": 0.6812280893325806, "num_tokens": 16922129.0, "step": 7320 }, { "entropy": 1.2328981727361679, "epoch": 1.074470424393462, "grad_norm": 8.875, "learning_rate": 1.99966352495259e-05, "loss": 1.2106, "mean_token_accuracy": 0.6953886121511459, "num_tokens": 16943425.0, "step": 7330 }, { "entropy": 1.1482363402843476, "epoch": 1.0759363776295536, "grad_norm": 8.6875, "learning_rate": 1.9996501216576784e-05, "loss": 1.0951, "mean_token_accuracy": 0.7142179131507873, "num_tokens": 16968338.0, "step": 7340 }, { "entropy": 1.1730873823165893, "epoch": 1.0774023308656453, "grad_norm": 10.5625, "learning_rate": 1.999636456641321e-05, "loss": 1.1133, "mean_token_accuracy": 0.7062408834695816, "num_tokens": 16993060.0, "step": 7350 }, { "entropy": 1.0632457852363586, "epoch": 1.0788682841017372, "grad_norm": 9.75, "learning_rate": 1.999622529907096e-05, "loss": 0.9943, "mean_token_accuracy": 0.7292854398488998, "num_tokens": 17015281.0, "step": 7360 }, { "entropy": 1.232741355895996, "epoch": 1.0803342373378289, "grad_norm": 11.8125, "learning_rate": 1.9996083414586497e-05, "loss": 1.1656, "mean_token_accuracy": 0.6903288066387177, "num_tokens": 17040029.0, "step": 7370 }, { "entropy": 1.235828810930252, "epoch": 1.0818001905739207, "grad_norm": 9.3125, "learning_rate": 1.999593891299697e-05, "loss": 1.1921, "mean_token_accuracy": 0.6955503851175309, "num_tokens": 17063409.0, "step": 7380 }, { "entropy": 1.3247672498226166, "epoch": 1.0832661438100124, "grad_norm": 14.375, "learning_rate": 1.99957917943402e-05, "loss": 1.3029, "mean_token_accuracy": 0.6824086099863053, "num_tokens": 17085640.0, "step": 7390 }, { "entropy": 1.301576018333435, "epoch": 1.0847320970461043, "grad_norm": 12.6875, "learning_rate": 1.9995642058654714e-05, "loss": 1.2099, "mean_token_accuracy": 0.6898682296276093, "num_tokens": 17105918.0, "step": 7400 }, { "entropy": 1.3882490575313569, "epoch": 1.086198050282196, "grad_norm": 14.375, "learning_rate": 1.9995489705979714e-05, "loss": 1.3169, "mean_token_accuracy": 0.6735828816890717, "num_tokens": 17129306.0, "step": 7410 }, { "entropy": 1.4083055555820465, "epoch": 1.0876640035182878, "grad_norm": 10.3125, "learning_rate": 1.9995334736355087e-05, "loss": 1.4031, "mean_token_accuracy": 0.668747641146183, "num_tokens": 17151274.0, "step": 7420 }, { "entropy": 1.39749975502491, "epoch": 1.0891299567543795, "grad_norm": 9.875, "learning_rate": 1.9995177149821404e-05, "loss": 1.3656, "mean_token_accuracy": 0.6699608534574508, "num_tokens": 17170026.0, "step": 7430 }, { "entropy": 1.0862579852342606, "epoch": 1.0905959099904714, "grad_norm": 10.75, "learning_rate": 1.999501694641993e-05, "loss": 1.0219, "mean_token_accuracy": 0.7240905195474625, "num_tokens": 17194434.0, "step": 7440 }, { "entropy": 1.127729943394661, "epoch": 1.092061863226563, "grad_norm": 7.4375, "learning_rate": 1.9994854126192596e-05, "loss": 1.0396, "mean_token_accuracy": 0.7229657411575318, "num_tokens": 17216664.0, "step": 7450 }, { "entropy": 1.093872132897377, "epoch": 1.0935278164626547, "grad_norm": 9.375, "learning_rate": 1.9994688689182046e-05, "loss": 1.0332, "mean_token_accuracy": 0.7225195884704589, "num_tokens": 17241164.0, "step": 7460 }, { "entropy": 1.2507360130548477, "epoch": 1.0949937696987466, "grad_norm": 9.9375, "learning_rate": 1.999452063543158e-05, "loss": 1.2117, "mean_token_accuracy": 0.7014961212873458, "num_tokens": 17262917.0, "step": 7470 }, { "entropy": 1.3890669494867325, "epoch": 1.0964597229348383, "grad_norm": 10.9375, "learning_rate": 1.9994349964985202e-05, "loss": 1.3673, "mean_token_accuracy": 0.6665780723094941, "num_tokens": 17283409.0, "step": 7480 }, { "entropy": 1.17710323035717, "epoch": 1.0979256761709302, "grad_norm": 11.1875, "learning_rate": 1.9994176677887596e-05, "loss": 1.0743, "mean_token_accuracy": 0.7097684442996979, "num_tokens": 17306768.0, "step": 7490 }, { "entropy": 1.295637196302414, "epoch": 1.0993916294070218, "grad_norm": 9.625, "learning_rate": 1.9994000774184135e-05, "loss": 1.3144, "mean_token_accuracy": 0.6853505477309227, "num_tokens": 17331053.0, "step": 7500 }, { "entropy": 1.4004454374313355, "epoch": 1.1008575826431137, "grad_norm": 7.25, "learning_rate": 1.9993822253920865e-05, "loss": 1.3244, "mean_token_accuracy": 0.6684523671865463, "num_tokens": 17352898.0, "step": 7510 }, { "entropy": 1.3414935022592545, "epoch": 1.1023235358792054, "grad_norm": 7.09375, "learning_rate": 1.9993641117144528e-05, "loss": 1.3169, "mean_token_accuracy": 0.6701092004776001, "num_tokens": 17373754.0, "step": 7520 }, { "entropy": 1.1745673388242721, "epoch": 1.1037894891152973, "grad_norm": 7.28125, "learning_rate": 1.9993457363902552e-05, "loss": 1.152, "mean_token_accuracy": 0.7090735673904419, "num_tokens": 17398508.0, "step": 7530 }, { "entropy": 1.2985221296548843, "epoch": 1.105255442351389, "grad_norm": 11.0625, "learning_rate": 1.999327099424304e-05, "loss": 1.2479, "mean_token_accuracy": 0.6943398445844651, "num_tokens": 17422124.0, "step": 7540 }, { "entropy": 1.4270990759134292, "epoch": 1.1067213955874808, "grad_norm": 12.6875, "learning_rate": 1.9993082008214787e-05, "loss": 1.4522, "mean_token_accuracy": 0.6547584116458893, "num_tokens": 17442208.0, "step": 7550 }, { "entropy": 1.3034896463155747, "epoch": 1.1081873488235725, "grad_norm": 12.875, "learning_rate": 1.9992890405867277e-05, "loss": 1.2412, "mean_token_accuracy": 0.685841104388237, "num_tokens": 17465145.0, "step": 7560 }, { "entropy": 1.2700834572315216, "epoch": 1.1096533020596644, "grad_norm": 8.5, "learning_rate": 1.9992696187250668e-05, "loss": 1.2111, "mean_token_accuracy": 0.6890330970287323, "num_tokens": 17487699.0, "step": 7570 }, { "entropy": 1.3145855516195297, "epoch": 1.111119255295756, "grad_norm": 7.78125, "learning_rate": 1.9992499352415812e-05, "loss": 1.2063, "mean_token_accuracy": 0.6822231978178024, "num_tokens": 17509999.0, "step": 7580 }, { "entropy": 1.2261007010936738, "epoch": 1.112585208531848, "grad_norm": 12.625, "learning_rate": 1.9992299901414244e-05, "loss": 1.1756, "mean_token_accuracy": 0.6933778584003448, "num_tokens": 17532741.0, "step": 7590 }, { "entropy": 1.2424679428339005, "epoch": 1.1140511617679396, "grad_norm": 6.40625, "learning_rate": 1.999209783429818e-05, "loss": 1.218, "mean_token_accuracy": 0.698337534070015, "num_tokens": 17557963.0, "step": 7600 }, { "entropy": 1.2083199232816697, "epoch": 1.1155171150040313, "grad_norm": 12.9375, "learning_rate": 1.9991893151120525e-05, "loss": 1.1717, "mean_token_accuracy": 0.7020735591650009, "num_tokens": 17581687.0, "step": 7610 }, { "entropy": 1.2098505914211273, "epoch": 1.1169830682401232, "grad_norm": 7.8125, "learning_rate": 1.999168585193487e-05, "loss": 1.2154, "mean_token_accuracy": 0.6991240084171295, "num_tokens": 17607332.0, "step": 7620 }, { "entropy": 1.2589897900819778, "epoch": 1.1184490214762148, "grad_norm": 7.53125, "learning_rate": 1.999147593679548e-05, "loss": 1.2391, "mean_token_accuracy": 0.6955213189125061, "num_tokens": 17630513.0, "step": 7630 }, { "entropy": 1.318173238635063, "epoch": 1.1199149747123067, "grad_norm": 10.0, "learning_rate": 1.9991263405757325e-05, "loss": 1.2475, "mean_token_accuracy": 0.6857015043497086, "num_tokens": 17652002.0, "step": 7640 }, { "entropy": 1.3564463943243026, "epoch": 1.1213809279483984, "grad_norm": 10.875, "learning_rate": 1.9991048258876044e-05, "loss": 1.3049, "mean_token_accuracy": 0.6771197259426117, "num_tokens": 17673382.0, "step": 7650 }, { "entropy": 1.2261092513799667, "epoch": 1.1228468811844903, "grad_norm": 8.5, "learning_rate": 1.9990830496207964e-05, "loss": 1.1306, "mean_token_accuracy": 0.704305237531662, "num_tokens": 17695322.0, "step": 7660 }, { "entropy": 1.3583180487155915, "epoch": 1.124312834420582, "grad_norm": 8.1875, "learning_rate": 1.9990610117810098e-05, "loss": 1.3648, "mean_token_accuracy": 0.6708371847867965, "num_tokens": 17718549.0, "step": 7670 }, { "entropy": 1.226490966975689, "epoch": 1.1257787876566738, "grad_norm": 9.5, "learning_rate": 1.999038712374015e-05, "loss": 1.1538, "mean_token_accuracy": 0.700319254398346, "num_tokens": 17739264.0, "step": 7680 }, { "entropy": 1.2978177577257157, "epoch": 1.1272447408927655, "grad_norm": 8.3125, "learning_rate": 1.9990161514056487e-05, "loss": 1.2129, "mean_token_accuracy": 0.6897207140922547, "num_tokens": 17763095.0, "step": 7690 }, { "entropy": 1.111958622932434, "epoch": 1.1287106941288574, "grad_norm": 9.3125, "learning_rate": 1.9989933288818194e-05, "loss": 1.059, "mean_token_accuracy": 0.7211978882551193, "num_tokens": 17788458.0, "step": 7700 }, { "entropy": 1.099173441529274, "epoch": 1.130176647364949, "grad_norm": 8.8125, "learning_rate": 1.9989702448085015e-05, "loss": 1.0514, "mean_token_accuracy": 0.7203189402818679, "num_tokens": 17812354.0, "step": 7710 }, { "entropy": 1.418110081553459, "epoch": 1.1316426006010407, "grad_norm": 9.0, "learning_rate": 1.9989468991917385e-05, "loss": 1.324, "mean_token_accuracy": 0.6644326269626617, "num_tokens": 17833171.0, "step": 7720 }, { "entropy": 1.2679696023464202, "epoch": 1.1331085538371326, "grad_norm": 8.5625, "learning_rate": 1.9989232920376435e-05, "loss": 1.159, "mean_token_accuracy": 0.699449411034584, "num_tokens": 17858029.0, "step": 7730 }, { "entropy": 1.1086147844791412, "epoch": 1.1345745070732243, "grad_norm": 8.6875, "learning_rate": 1.9988994233523963e-05, "loss": 1.1173, "mean_token_accuracy": 0.7222393989562989, "num_tokens": 17881308.0, "step": 7740 }, { "entropy": 1.1320438653230667, "epoch": 1.1360404603093162, "grad_norm": 6.375, "learning_rate": 1.9988752931422463e-05, "loss": 1.0919, "mean_token_accuracy": 0.7261982738971711, "num_tokens": 17908593.0, "step": 7750 }, { "entropy": 1.244100922346115, "epoch": 1.1375064135454078, "grad_norm": 9.1875, "learning_rate": 1.998850901413511e-05, "loss": 1.1883, "mean_token_accuracy": 0.6945676922798156, "num_tokens": 17931674.0, "step": 7760 }, { "entropy": 1.1638699680566789, "epoch": 1.1389723667814997, "grad_norm": 14.25, "learning_rate": 1.998826248172577e-05, "loss": 1.1237, "mean_token_accuracy": 0.7097329467535018, "num_tokens": 17954544.0, "step": 7770 }, { "entropy": 1.274840497970581, "epoch": 1.1404383200175914, "grad_norm": 12.6875, "learning_rate": 1.9988013334258982e-05, "loss": 1.2419, "mean_token_accuracy": 0.6908193230628967, "num_tokens": 17980185.0, "step": 7780 }, { "entropy": 1.2849934458732606, "epoch": 1.1419042732536833, "grad_norm": 12.875, "learning_rate": 1.998776157179998e-05, "loss": 1.2299, "mean_token_accuracy": 0.6847291320562363, "num_tokens": 18002100.0, "step": 7790 }, { "entropy": 1.159038746356964, "epoch": 1.143370226489775, "grad_norm": 13.5, "learning_rate": 1.998750719441468e-05, "loss": 1.087, "mean_token_accuracy": 0.7113112419843673, "num_tokens": 18025430.0, "step": 7800 }, { "entropy": 1.2390843749046325, "epoch": 1.1448361797258668, "grad_norm": 8.375, "learning_rate": 1.998725020216967e-05, "loss": 1.1777, "mean_token_accuracy": 0.6973342061042785, "num_tokens": 18047500.0, "step": 7810 }, { "entropy": 1.387335416674614, "epoch": 1.1463021329619585, "grad_norm": 9.1875, "learning_rate": 1.9986990595132247e-05, "loss": 1.4117, "mean_token_accuracy": 0.6667091771960258, "num_tokens": 18068314.0, "step": 7820 }, { "entropy": 1.1690547525882722, "epoch": 1.1477680861980504, "grad_norm": 6.625, "learning_rate": 1.9986728373370378e-05, "loss": 1.0942, "mean_token_accuracy": 0.711091834306717, "num_tokens": 18091242.0, "step": 7830 }, { "entropy": 1.3939763963222505, "epoch": 1.149234039434142, "grad_norm": 12.1875, "learning_rate": 1.9986463536952706e-05, "loss": 1.3339, "mean_token_accuracy": 0.6700289785861969, "num_tokens": 18114159.0, "step": 7840 }, { "entropy": 1.2348898351192474, "epoch": 1.150699992670234, "grad_norm": 10.125, "learning_rate": 1.9986196085948583e-05, "loss": 1.1681, "mean_token_accuracy": 0.6943050622940063, "num_tokens": 18137711.0, "step": 7850 }, { "entropy": 1.1429092198610307, "epoch": 1.1521659459063256, "grad_norm": 7.3125, "learning_rate": 1.9985926020428018e-05, "loss": 1.1019, "mean_token_accuracy": 0.7151133447885514, "num_tokens": 18161293.0, "step": 7860 }, { "entropy": 1.261123701930046, "epoch": 1.1536318991424173, "grad_norm": 12.625, "learning_rate": 1.9985653340461727e-05, "loss": 1.2314, "mean_token_accuracy": 0.6891691148281097, "num_tokens": 18184610.0, "step": 7870 }, { "entropy": 1.2343711465597154, "epoch": 1.1550978523785091, "grad_norm": 9.75, "learning_rate": 1.9985378046121096e-05, "loss": 1.1926, "mean_token_accuracy": 0.7087590843439102, "num_tokens": 18206789.0, "step": 7880 }, { "entropy": 1.2612506747245789, "epoch": 1.1565638056146008, "grad_norm": 9.375, "learning_rate": 1.9985100137478204e-05, "loss": 1.2201, "mean_token_accuracy": 0.6864638060331345, "num_tokens": 18225735.0, "step": 7890 }, { "entropy": 1.2300110906362534, "epoch": 1.1580297588506927, "grad_norm": 11.5, "learning_rate": 1.998481961460581e-05, "loss": 1.2105, "mean_token_accuracy": 0.6994627684354782, "num_tokens": 18246277.0, "step": 7900 }, { "entropy": 1.2108304917812347, "epoch": 1.1594957120867844, "grad_norm": 7.5625, "learning_rate": 1.9984536477577355e-05, "loss": 1.1804, "mean_token_accuracy": 0.700147596001625, "num_tokens": 18270445.0, "step": 7910 }, { "entropy": 1.1592963933944702, "epoch": 1.1609616653228763, "grad_norm": 8.625, "learning_rate": 1.9984250726466973e-05, "loss": 1.1132, "mean_token_accuracy": 0.7125275254249572, "num_tokens": 18291383.0, "step": 7920 }, { "entropy": 1.1584044009447099, "epoch": 1.162427618558968, "grad_norm": 11.5, "learning_rate": 1.9983962361349473e-05, "loss": 1.1374, "mean_token_accuracy": 0.7097839266061783, "num_tokens": 18315297.0, "step": 7930 }, { "entropy": 1.1523736417293549, "epoch": 1.1638935717950598, "grad_norm": 9.0, "learning_rate": 1.9983671382300356e-05, "loss": 1.1243, "mean_token_accuracy": 0.7097980231046677, "num_tokens": 18336677.0, "step": 7940 }, { "entropy": 1.1196268051862717, "epoch": 1.1653595250311515, "grad_norm": 8.625, "learning_rate": 1.9983377789395804e-05, "loss": 1.0502, "mean_token_accuracy": 0.7212891817092896, "num_tokens": 18360513.0, "step": 7950 }, { "entropy": 1.1502165853977204, "epoch": 1.1668254782672434, "grad_norm": 9.8125, "learning_rate": 1.9983081582712684e-05, "loss": 1.1163, "mean_token_accuracy": 0.7143410712480545, "num_tokens": 18388024.0, "step": 7960 }, { "entropy": 1.1946776926517486, "epoch": 1.168291431503335, "grad_norm": 6.6875, "learning_rate": 1.9982782762328547e-05, "loss": 1.1581, "mean_token_accuracy": 0.7029809296131134, "num_tokens": 18411419.0, "step": 7970 }, { "entropy": 1.3373067051172256, "epoch": 1.1697573847394267, "grad_norm": 10.6875, "learning_rate": 1.998248132832162e-05, "loss": 1.2693, "mean_token_accuracy": 0.6826079756021499, "num_tokens": 18433776.0, "step": 7980 }, { "entropy": 1.1142433345317841, "epoch": 1.1712233379755186, "grad_norm": 5.96875, "learning_rate": 1.9982177280770833e-05, "loss": 1.0313, "mean_token_accuracy": 0.7212407886981964, "num_tokens": 18459415.0, "step": 7990 }, { "entropy": 1.0709915578365325, "epoch": 1.1726892912116105, "grad_norm": 6.59375, "learning_rate": 1.9981870619755783e-05, "loss": 1.0243, "mean_token_accuracy": 0.7227054744958877, "num_tokens": 18484433.0, "step": 8000 }, { "entropy": 1.2908329129219056, "epoch": 1.1741552444477021, "grad_norm": 8.5625, "learning_rate": 1.9981561345356764e-05, "loss": 1.2051, "mean_token_accuracy": 0.6943241402506828, "num_tokens": 18506195.0, "step": 8010 }, { "entropy": 1.304473766684532, "epoch": 1.1756211976837938, "grad_norm": 9.3125, "learning_rate": 1.9981249457654742e-05, "loss": 1.2903, "mean_token_accuracy": 0.6788347274065017, "num_tokens": 18530233.0, "step": 8020 }, { "entropy": 1.1581927716732026, "epoch": 1.1770871509198857, "grad_norm": 7.21875, "learning_rate": 1.9980934956731378e-05, "loss": 1.1003, "mean_token_accuracy": 0.7182217329740525, "num_tokens": 18553358.0, "step": 8030 }, { "entropy": 1.3294427663087844, "epoch": 1.1785531041559774, "grad_norm": 7.4375, "learning_rate": 1.9980617842669006e-05, "loss": 1.312, "mean_token_accuracy": 0.6728596061468124, "num_tokens": 18578915.0, "step": 8040 }, { "entropy": 1.4975331664085387, "epoch": 1.1800190573920692, "grad_norm": 11.9375, "learning_rate": 1.998029811555066e-05, "loss": 1.4342, "mean_token_accuracy": 0.6505066365003586, "num_tokens": 18601449.0, "step": 8050 }, { "entropy": 1.2208754241466522, "epoch": 1.181485010628161, "grad_norm": 12.25, "learning_rate": 1.9979975775460035e-05, "loss": 1.1678, "mean_token_accuracy": 0.7001200944185257, "num_tokens": 18624735.0, "step": 8060 }, { "entropy": 1.2302956074476241, "epoch": 1.1829509638642528, "grad_norm": 7.28125, "learning_rate": 1.997965082248154e-05, "loss": 1.2172, "mean_token_accuracy": 0.6961577326059342, "num_tokens": 18650131.0, "step": 8070 }, { "entropy": 1.2694584101438522, "epoch": 1.1844169171003445, "grad_norm": 12.125, "learning_rate": 1.997932325670024e-05, "loss": 1.2257, "mean_token_accuracy": 0.686340194940567, "num_tokens": 18673685.0, "step": 8080 }, { "entropy": 1.391654834151268, "epoch": 1.1858828703364364, "grad_norm": 12.0625, "learning_rate": 1.9978993078201904e-05, "loss": 1.3782, "mean_token_accuracy": 0.6700852692127228, "num_tokens": 18693958.0, "step": 8090 }, { "entropy": 1.2006519705057144, "epoch": 1.187348823572528, "grad_norm": 10.0, "learning_rate": 1.997866028707297e-05, "loss": 1.1252, "mean_token_accuracy": 0.6996953904628753, "num_tokens": 18717984.0, "step": 8100 }, { "entropy": 1.245961058139801, "epoch": 1.18881477680862, "grad_norm": 8.25, "learning_rate": 1.997832488340057e-05, "loss": 1.2169, "mean_token_accuracy": 0.6911354273557663, "num_tokens": 18741218.0, "step": 8110 }, { "entropy": 1.2630988866090775, "epoch": 1.1902807300447116, "grad_norm": 11.875, "learning_rate": 1.997798686727252e-05, "loss": 1.2264, "mean_token_accuracy": 0.6940892323851585, "num_tokens": 18766633.0, "step": 8120 }, { "entropy": 1.102625547349453, "epoch": 1.1917466832808032, "grad_norm": 7.8125, "learning_rate": 1.9977646238777316e-05, "loss": 1.0525, "mean_token_accuracy": 0.7235903978347779, "num_tokens": 18792986.0, "step": 8130 }, { "entropy": 1.1702050864696503, "epoch": 1.1932126365168951, "grad_norm": 8.0, "learning_rate": 1.9977302998004134e-05, "loss": 1.1446, "mean_token_accuracy": 0.7046263426542282, "num_tokens": 18814580.0, "step": 8140 }, { "entropy": 1.4081081330776215, "epoch": 1.1946785897529868, "grad_norm": 8.8125, "learning_rate": 1.997695714504284e-05, "loss": 1.3708, "mean_token_accuracy": 0.6664967626333237, "num_tokens": 18831915.0, "step": 8150 }, { "entropy": 1.3582570523023605, "epoch": 1.1961445429890787, "grad_norm": 9.6875, "learning_rate": 1.997660867998399e-05, "loss": 1.3143, "mean_token_accuracy": 0.6656846463680267, "num_tokens": 18849865.0, "step": 8160 }, { "entropy": 1.210613614320755, "epoch": 1.1976104962251704, "grad_norm": 9.125, "learning_rate": 1.997625760291881e-05, "loss": 1.1597, "mean_token_accuracy": 0.7002564966678619, "num_tokens": 18873353.0, "step": 8170 }, { "entropy": 1.3661247313022613, "epoch": 1.1990764494612622, "grad_norm": 9.5, "learning_rate": 1.9975903913939217e-05, "loss": 1.3227, "mean_token_accuracy": 0.6733789324760437, "num_tokens": 18899394.0, "step": 8180 }, { "entropy": 1.0870557144284247, "epoch": 1.200542402697354, "grad_norm": 13.3125, "learning_rate": 1.9975547613137813e-05, "loss": 1.0039, "mean_token_accuracy": 0.7269314140081405, "num_tokens": 18925036.0, "step": 8190 }, { "entropy": 1.224964189529419, "epoch": 1.2020083559334458, "grad_norm": 10.625, "learning_rate": 1.9975188700607885e-05, "loss": 1.1525, "mean_token_accuracy": 0.7026874721050262, "num_tokens": 18945998.0, "step": 8200 }, { "entropy": 1.3445612996816636, "epoch": 1.2034743091695375, "grad_norm": 8.125, "learning_rate": 1.9974827176443395e-05, "loss": 1.3373, "mean_token_accuracy": 0.675517562031746, "num_tokens": 18968676.0, "step": 8210 }, { "entropy": 1.2112349092960357, "epoch": 1.2049402624056293, "grad_norm": 7.625, "learning_rate": 1.9974463040738997e-05, "loss": 1.165, "mean_token_accuracy": 0.6964822709560394, "num_tokens": 18992820.0, "step": 8220 }, { "entropy": 1.2422357112169267, "epoch": 1.206406215641721, "grad_norm": 8.5, "learning_rate": 1.997409629359003e-05, "loss": 1.2032, "mean_token_accuracy": 0.7037787675857544, "num_tokens": 19016808.0, "step": 8230 }, { "entropy": 1.2296515941619872, "epoch": 1.2078721688778127, "grad_norm": 9.0, "learning_rate": 1.9973726935092508e-05, "loss": 1.1697, "mean_token_accuracy": 0.695156580209732, "num_tokens": 19039834.0, "step": 8240 }, { "entropy": 1.3288650304079055, "epoch": 1.2093381221139046, "grad_norm": 9.4375, "learning_rate": 1.9973354965343137e-05, "loss": 1.3442, "mean_token_accuracy": 0.6741533249616622, "num_tokens": 19063948.0, "step": 8250 }, { "entropy": 1.2423579543828964, "epoch": 1.2108040753499965, "grad_norm": 11.9375, "learning_rate": 1.9972980384439302e-05, "loss": 1.2015, "mean_token_accuracy": 0.6997667402029037, "num_tokens": 19085744.0, "step": 8260 }, { "entropy": 1.4441985458135604, "epoch": 1.2122700285860881, "grad_norm": 8.5, "learning_rate": 1.9972603192479076e-05, "loss": 1.4117, "mean_token_accuracy": 0.6603557646274567, "num_tokens": 19105789.0, "step": 8270 }, { "entropy": 1.2065817624330522, "epoch": 1.2137359818221798, "grad_norm": 13.3125, "learning_rate": 1.997222338956121e-05, "loss": 1.1575, "mean_token_accuracy": 0.7018398404121399, "num_tokens": 19128680.0, "step": 8280 }, { "entropy": 1.3237407147884368, "epoch": 1.2152019350582717, "grad_norm": 9.5625, "learning_rate": 1.997184097578514e-05, "loss": 1.2702, "mean_token_accuracy": 0.677951380610466, "num_tokens": 19151779.0, "step": 8290 }, { "entropy": 1.3310970962047577, "epoch": 1.2166678882943633, "grad_norm": 12.75, "learning_rate": 1.997145595125099e-05, "loss": 1.3292, "mean_token_accuracy": 0.6881353408098221, "num_tokens": 19173311.0, "step": 8300 }, { "entropy": 1.2388422340154648, "epoch": 1.2181338415304552, "grad_norm": 7.53125, "learning_rate": 1.9971068316059565e-05, "loss": 1.158, "mean_token_accuracy": 0.6952472507953644, "num_tokens": 19195735.0, "step": 8310 }, { "entropy": 1.217163196206093, "epoch": 1.219599794766547, "grad_norm": 8.75, "learning_rate": 1.997067807031235e-05, "loss": 1.2062, "mean_token_accuracy": 0.7050163686275482, "num_tokens": 19219432.0, "step": 8320 }, { "entropy": 1.2774510994553565, "epoch": 1.2210657480026388, "grad_norm": 9.9375, "learning_rate": 1.997028521411152e-05, "loss": 1.2574, "mean_token_accuracy": 0.6934859126806259, "num_tokens": 19244099.0, "step": 8330 }, { "entropy": 1.2825005024671554, "epoch": 1.2225317012387305, "grad_norm": 9.5, "learning_rate": 1.9969889747559925e-05, "loss": 1.2081, "mean_token_accuracy": 0.6941129773855209, "num_tokens": 19268128.0, "step": 8340 }, { "entropy": 1.2908157259225845, "epoch": 1.2239976544748223, "grad_norm": 10.5625, "learning_rate": 1.9969491670761106e-05, "loss": 1.2323, "mean_token_accuracy": 0.6864058643579483, "num_tokens": 19290432.0, "step": 8350 }, { "entropy": 1.3051592051982879, "epoch": 1.225463607710914, "grad_norm": 6.96875, "learning_rate": 1.9969090983819286e-05, "loss": 1.2487, "mean_token_accuracy": 0.6781480431556701, "num_tokens": 19314203.0, "step": 8360 }, { "entropy": 1.2666983008384705, "epoch": 1.226929560947006, "grad_norm": 9.8125, "learning_rate": 1.9968687686839368e-05, "loss": 1.2291, "mean_token_accuracy": 0.6938182830810546, "num_tokens": 19338559.0, "step": 8370 }, { "entropy": 1.2194917529821396, "epoch": 1.2283955141830976, "grad_norm": 7.0, "learning_rate": 1.996828177992694e-05, "loss": 1.1822, "mean_token_accuracy": 0.7005347639322281, "num_tokens": 19363090.0, "step": 8380 }, { "entropy": 1.1822561174631119, "epoch": 1.2298614674191892, "grad_norm": 7.75, "learning_rate": 1.9967873263188277e-05, "loss": 1.1688, "mean_token_accuracy": 0.7062616437673569, "num_tokens": 19391373.0, "step": 8390 }, { "entropy": 1.378390407562256, "epoch": 1.2313274206552811, "grad_norm": 13.1875, "learning_rate": 1.9967462136730333e-05, "loss": 1.365, "mean_token_accuracy": 0.6823824197053909, "num_tokens": 19409795.0, "step": 8400 }, { "entropy": 1.3239964365959167, "epoch": 1.2327933738913728, "grad_norm": 8.6875, "learning_rate": 1.996704840066074e-05, "loss": 1.3025, "mean_token_accuracy": 0.6769445896148681, "num_tokens": 19430648.0, "step": 8410 }, { "entropy": 1.1049929827451705, "epoch": 1.2342593271274647, "grad_norm": 9.0625, "learning_rate": 1.996663205508783e-05, "loss": 1.0896, "mean_token_accuracy": 0.7191528350114822, "num_tokens": 19454214.0, "step": 8420 }, { "entropy": 1.318594041466713, "epoch": 1.2357252803635563, "grad_norm": 9.4375, "learning_rate": 1.99662131001206e-05, "loss": 1.2876, "mean_token_accuracy": 0.6831749081611633, "num_tokens": 19475114.0, "step": 8430 }, { "entropy": 1.195077359676361, "epoch": 1.2371912335996482, "grad_norm": 9.625, "learning_rate": 1.996579153586874e-05, "loss": 1.1199, "mean_token_accuracy": 0.7107096284627914, "num_tokens": 19499109.0, "step": 8440 }, { "entropy": 1.1825966447591783, "epoch": 1.2386571868357399, "grad_norm": 9.3125, "learning_rate": 1.996536736244262e-05, "loss": 1.127, "mean_token_accuracy": 0.7073414623737335, "num_tokens": 19525696.0, "step": 8450 }, { "entropy": 1.2508297681808471, "epoch": 1.2401231400718318, "grad_norm": 9.25, "learning_rate": 1.9964940579953297e-05, "loss": 1.2574, "mean_token_accuracy": 0.6992858797311783, "num_tokens": 19547915.0, "step": 8460 }, { "entropy": 1.1386162102222444, "epoch": 1.2415890933079234, "grad_norm": 10.0625, "learning_rate": 1.9964511188512507e-05, "loss": 1.1121, "mean_token_accuracy": 0.7196700811386109, "num_tokens": 19571993.0, "step": 8470 }, { "entropy": 1.1961974680423737, "epoch": 1.2430550465440153, "grad_norm": 6.84375, "learning_rate": 1.996407918823267e-05, "loss": 1.1803, "mean_token_accuracy": 0.7043748140335083, "num_tokens": 19595676.0, "step": 8480 }, { "entropy": 1.4670558393001556, "epoch": 1.244520999780107, "grad_norm": 11.4375, "learning_rate": 1.996364457922689e-05, "loss": 1.4914, "mean_token_accuracy": 0.6548519223928452, "num_tokens": 19618895.0, "step": 8490 }, { "entropy": 1.2456548869609834, "epoch": 1.2459869530161987, "grad_norm": 7.75, "learning_rate": 1.996320736160895e-05, "loss": 1.2075, "mean_token_accuracy": 0.6958816438913346, "num_tokens": 19642268.0, "step": 8500 }, { "entropy": 1.1849403023719787, "epoch": 1.2474529062522905, "grad_norm": 9.625, "learning_rate": 1.9962767535493322e-05, "loss": 1.1399, "mean_token_accuracy": 0.7080446928739548, "num_tokens": 19665697.0, "step": 8510 }, { "entropy": 1.3570242017507552, "epoch": 1.2489188594883824, "grad_norm": 13.4375, "learning_rate": 1.9962325100995162e-05, "loss": 1.3327, "mean_token_accuracy": 0.6725957959890365, "num_tokens": 19688560.0, "step": 8520 }, { "entropy": 1.1897883296012879, "epoch": 1.250384812724474, "grad_norm": 10.1875, "learning_rate": 1.9961880058230298e-05, "loss": 1.0947, "mean_token_accuracy": 0.7062407791614532, "num_tokens": 19708643.0, "step": 8530 }, { "entropy": 1.19278085231781, "epoch": 1.2518507659605658, "grad_norm": 8.625, "learning_rate": 1.996143240731525e-05, "loss": 1.156, "mean_token_accuracy": 0.696299159526825, "num_tokens": 19732149.0, "step": 8540 }, { "entropy": 1.4850953698158265, "epoch": 1.2533167191966577, "grad_norm": 13.6875, "learning_rate": 1.9960982148367224e-05, "loss": 1.4963, "mean_token_accuracy": 0.6535314947366715, "num_tokens": 19752445.0, "step": 8550 }, { "entropy": 1.2585141688585282, "epoch": 1.2547826724327493, "grad_norm": 7.21875, "learning_rate": 1.9960529281504098e-05, "loss": 1.1917, "mean_token_accuracy": 0.6915412396192551, "num_tokens": 19778875.0, "step": 8560 }, { "entropy": 1.3059006959199906, "epoch": 1.2562486256688412, "grad_norm": 8.125, "learning_rate": 1.9960073806844437e-05, "loss": 1.2306, "mean_token_accuracy": 0.6879364639520645, "num_tokens": 19799789.0, "step": 8570 }, { "entropy": 1.185035651922226, "epoch": 1.2577145789049329, "grad_norm": 11.0625, "learning_rate": 1.9959615724507496e-05, "loss": 1.1264, "mean_token_accuracy": 0.7071018159389496, "num_tokens": 19821889.0, "step": 8580 }, { "entropy": 1.393834364414215, "epoch": 1.2591805321410248, "grad_norm": 11.875, "learning_rate": 1.9959155034613206e-05, "loss": 1.3873, "mean_token_accuracy": 0.6650641798973084, "num_tokens": 19841087.0, "step": 8590 }, { "entropy": 1.1827852845191955, "epoch": 1.2606464853771164, "grad_norm": 9.25, "learning_rate": 1.9958691737282178e-05, "loss": 1.1649, "mean_token_accuracy": 0.7109099864959717, "num_tokens": 19865794.0, "step": 8600 }, { "entropy": 1.197489896416664, "epoch": 1.2621124386132083, "grad_norm": 14.4375, "learning_rate": 1.9958225832635708e-05, "loss": 1.15, "mean_token_accuracy": 0.7028390616178513, "num_tokens": 19888741.0, "step": 8610 }, { "entropy": 1.1547148019075393, "epoch": 1.2635783918493, "grad_norm": 12.875, "learning_rate": 1.9957757320795785e-05, "loss": 1.1354, "mean_token_accuracy": 0.7151894509792328, "num_tokens": 19912435.0, "step": 8620 }, { "entropy": 1.2037559509277345, "epoch": 1.2650443450853919, "grad_norm": 9.9375, "learning_rate": 1.9957286201885063e-05, "loss": 1.154, "mean_token_accuracy": 0.7082291692495346, "num_tokens": 19940630.0, "step": 8630 }, { "entropy": 1.4012288063764573, "epoch": 1.2665102983214835, "grad_norm": 11.0625, "learning_rate": 1.9956812476026887e-05, "loss": 1.3357, "mean_token_accuracy": 0.6663331717252732, "num_tokens": 19965772.0, "step": 8640 }, { "entropy": 1.228634414076805, "epoch": 1.2679762515575752, "grad_norm": 7.90625, "learning_rate": 1.995633614334529e-05, "loss": 1.2023, "mean_token_accuracy": 0.6979531407356262, "num_tokens": 19994471.0, "step": 8650 }, { "entropy": 1.3509396106004714, "epoch": 1.269442204793667, "grad_norm": 9.4375, "learning_rate": 1.995585720396498e-05, "loss": 1.3272, "mean_token_accuracy": 0.680682235956192, "num_tokens": 20017732.0, "step": 8660 }, { "entropy": 1.3513769119977952, "epoch": 1.270908158029759, "grad_norm": 7.375, "learning_rate": 1.9955375658011345e-05, "loss": 1.3282, "mean_token_accuracy": 0.6785157293081283, "num_tokens": 20039678.0, "step": 8670 }, { "entropy": 1.1930440217256546, "epoch": 1.2723741112658506, "grad_norm": 9.6875, "learning_rate": 1.995489150561047e-05, "loss": 1.1542, "mean_token_accuracy": 0.7015500843524933, "num_tokens": 20063381.0, "step": 8680 }, { "entropy": 1.1809492319822312, "epoch": 1.2738400645019423, "grad_norm": 8.5625, "learning_rate": 1.9954404746889103e-05, "loss": 1.1771, "mean_token_accuracy": 0.7058048486709595, "num_tokens": 20087074.0, "step": 8690 }, { "entropy": 1.2082881569862365, "epoch": 1.2753060177380342, "grad_norm": 9.1875, "learning_rate": 1.9953915381974688e-05, "loss": 1.1383, "mean_token_accuracy": 0.6957387492060662, "num_tokens": 20108508.0, "step": 8700 }, { "entropy": 1.234635779261589, "epoch": 1.2767719709741259, "grad_norm": 12.1875, "learning_rate": 1.995342341099535e-05, "loss": 1.1632, "mean_token_accuracy": 0.7057263016700744, "num_tokens": 20134886.0, "step": 8710 }, { "entropy": 1.007859079539776, "epoch": 1.2782379242102178, "grad_norm": 7.3125, "learning_rate": 1.9952928834079882e-05, "loss": 0.9523, "mean_token_accuracy": 0.7405643820762634, "num_tokens": 20159650.0, "step": 8720 }, { "entropy": 1.237021043896675, "epoch": 1.2797038774463094, "grad_norm": 9.25, "learning_rate": 1.9952431651357783e-05, "loss": 1.193, "mean_token_accuracy": 0.6955510973930359, "num_tokens": 20180890.0, "step": 8730 }, { "entropy": 1.3136773109436035, "epoch": 1.2811698306824013, "grad_norm": 11.625, "learning_rate": 1.9951931862959216e-05, "loss": 1.2738, "mean_token_accuracy": 0.6787832468748093, "num_tokens": 20206747.0, "step": 8740 }, { "entropy": 1.061192613840103, "epoch": 1.282635783918493, "grad_norm": 9.375, "learning_rate": 1.9951429469015034e-05, "loss": 1.0412, "mean_token_accuracy": 0.7315765589475631, "num_tokens": 20231428.0, "step": 8750 }, { "entropy": 1.3104854285717011, "epoch": 1.2841017371545846, "grad_norm": 13.125, "learning_rate": 1.9950924469656772e-05, "loss": 1.2978, "mean_token_accuracy": 0.6951914310455323, "num_tokens": 20256391.0, "step": 8760 }, { "entropy": 1.2505833029747009, "epoch": 1.2855676903906765, "grad_norm": 9.125, "learning_rate": 1.995041686501664e-05, "loss": 1.238, "mean_token_accuracy": 0.6900630116462707, "num_tokens": 20283155.0, "step": 8770 }, { "entropy": 1.5188051491975785, "epoch": 1.2870336436267684, "grad_norm": 12.75, "learning_rate": 1.994990665522754e-05, "loss": 1.4539, "mean_token_accuracy": 0.6569530144333839, "num_tokens": 20302267.0, "step": 8780 }, { "entropy": 1.1958658665418624, "epoch": 1.28849959686286, "grad_norm": 10.5625, "learning_rate": 1.9949393840423048e-05, "loss": 1.1245, "mean_token_accuracy": 0.7012640982866287, "num_tokens": 20325809.0, "step": 8790 }, { "entropy": 1.2504535913467407, "epoch": 1.2899655500989518, "grad_norm": 13.25, "learning_rate": 1.994887842073743e-05, "loss": 1.2175, "mean_token_accuracy": 0.6970232009887696, "num_tokens": 20347486.0, "step": 8800 }, { "entropy": 1.262554207444191, "epoch": 1.2914315033350436, "grad_norm": 11.875, "learning_rate": 1.994836039630563e-05, "loss": 1.235, "mean_token_accuracy": 0.6838883936405182, "num_tokens": 20371251.0, "step": 8810 }, { "entropy": 1.247108805179596, "epoch": 1.2928974565711353, "grad_norm": 12.4375, "learning_rate": 1.9947839767263266e-05, "loss": 1.1885, "mean_token_accuracy": 0.6974726319313049, "num_tokens": 20394412.0, "step": 8820 }, { "entropy": 1.151223850250244, "epoch": 1.2943634098072272, "grad_norm": 8.125, "learning_rate": 1.9947316533746653e-05, "loss": 1.0904, "mean_token_accuracy": 0.7130434095859528, "num_tokens": 20419550.0, "step": 8830 }, { "entropy": 1.3332852214574813, "epoch": 1.2958293630433189, "grad_norm": 9.4375, "learning_rate": 1.994679069589278e-05, "loss": 1.2672, "mean_token_accuracy": 0.6760007083415985, "num_tokens": 20439830.0, "step": 8840 }, { "entropy": 1.3328593626618386, "epoch": 1.2972953162794107, "grad_norm": 6.90625, "learning_rate": 1.9946262253839313e-05, "loss": 1.2943, "mean_token_accuracy": 0.6751489698886871, "num_tokens": 20464692.0, "step": 8850 }, { "entropy": 1.1247052043676375, "epoch": 1.2987612695155024, "grad_norm": 9.0, "learning_rate": 1.994573120772461e-05, "loss": 1.0429, "mean_token_accuracy": 0.7176267176866531, "num_tokens": 20487901.0, "step": 8860 }, { "entropy": 1.3148593381047249, "epoch": 1.3002272227515943, "grad_norm": 10.625, "learning_rate": 1.9945197557687704e-05, "loss": 1.2609, "mean_token_accuracy": 0.6798486888408661, "num_tokens": 20511647.0, "step": 8870 }, { "entropy": 1.21485443264246, "epoch": 1.301693175987686, "grad_norm": 10.9375, "learning_rate": 1.994466130386831e-05, "loss": 1.1761, "mean_token_accuracy": 0.7096399575471878, "num_tokens": 20534607.0, "step": 8880 }, { "entropy": 1.1379876226186751, "epoch": 1.3031591292237779, "grad_norm": 6.75, "learning_rate": 1.9944122446406825e-05, "loss": 1.0961, "mean_token_accuracy": 0.7224800944328308, "num_tokens": 20558687.0, "step": 8890 }, { "entropy": 1.2667191326618195, "epoch": 1.3046250824598695, "grad_norm": 8.6875, "learning_rate": 1.9943580985444336e-05, "loss": 1.1569, "mean_token_accuracy": 0.6863134890794754, "num_tokens": 20580180.0, "step": 8900 }, { "entropy": 1.2986000806093216, "epoch": 1.3060910356959612, "grad_norm": 7.0, "learning_rate": 1.99430369211226e-05, "loss": 1.2254, "mean_token_accuracy": 0.6867116123437882, "num_tokens": 20602981.0, "step": 8910 }, { "entropy": 1.2646744072437286, "epoch": 1.307556988932053, "grad_norm": 9.375, "learning_rate": 1.994249025358406e-05, "loss": 1.2662, "mean_token_accuracy": 0.69329674243927, "num_tokens": 20626472.0, "step": 8920 }, { "entropy": 1.2780030995607377, "epoch": 1.309022942168145, "grad_norm": 7.8125, "learning_rate": 1.994194098297184e-05, "loss": 1.2294, "mean_token_accuracy": 0.6883346110582351, "num_tokens": 20651081.0, "step": 8930 }, { "entropy": 1.188536822795868, "epoch": 1.3104888954042366, "grad_norm": 11.75, "learning_rate": 1.994138910942975e-05, "loss": 1.1458, "mean_token_accuracy": 0.7067872703075408, "num_tokens": 20675850.0, "step": 8940 }, { "entropy": 1.255830430984497, "epoch": 1.3119548486403283, "grad_norm": 9.0625, "learning_rate": 1.994083463310227e-05, "loss": 1.2401, "mean_token_accuracy": 0.6953504979610443, "num_tokens": 20699793.0, "step": 8950 }, { "entropy": 1.1903204530477525, "epoch": 1.3134208018764202, "grad_norm": 7.25, "learning_rate": 1.9940277554134578e-05, "loss": 1.1261, "mean_token_accuracy": 0.7119352102279664, "num_tokens": 20723920.0, "step": 8960 }, { "entropy": 1.1333075284957885, "epoch": 1.3148867551125119, "grad_norm": 8.875, "learning_rate": 1.9939717872672517e-05, "loss": 1.0751, "mean_token_accuracy": 0.71836359500885, "num_tokens": 20746727.0, "step": 8970 }, { "entropy": 1.5710612326860427, "epoch": 1.3163527083486037, "grad_norm": 8.25, "learning_rate": 1.9939155588862625e-05, "loss": 1.5699, "mean_token_accuracy": 0.6359361961483956, "num_tokens": 20764731.0, "step": 8980 }, { "entropy": 1.2543192863464356, "epoch": 1.3178186615846954, "grad_norm": 12.25, "learning_rate": 1.9938590702852115e-05, "loss": 1.227, "mean_token_accuracy": 0.6880697578191757, "num_tokens": 20786334.0, "step": 8990 }, { "entropy": 1.4001283556222917, "epoch": 1.3192846148207873, "grad_norm": 9.9375, "learning_rate": 1.9938023214788874e-05, "loss": 1.3487, "mean_token_accuracy": 0.6665228188037873, "num_tokens": 20809635.0, "step": 9000 }, { "entropy": 1.3060024440288545, "epoch": 1.320750568056879, "grad_norm": 9.0, "learning_rate": 1.9937453124821487e-05, "loss": 1.2334, "mean_token_accuracy": 0.6864656001329422, "num_tokens": 20833731.0, "step": 9010 }, { "entropy": 1.1886813908815383, "epoch": 1.3222165212929706, "grad_norm": 12.625, "learning_rate": 1.9936880433099205e-05, "loss": 1.1233, "mean_token_accuracy": 0.7027745008468628, "num_tokens": 20857525.0, "step": 9020 }, { "entropy": 1.151307898759842, "epoch": 1.3236824745290625, "grad_norm": 7.125, "learning_rate": 1.993630513977197e-05, "loss": 1.0357, "mean_token_accuracy": 0.7081638097763061, "num_tokens": 20880005.0, "step": 9030 }, { "entropy": 1.150944510102272, "epoch": 1.3251484277651544, "grad_norm": 8.0, "learning_rate": 1.99357272449904e-05, "loss": 1.1309, "mean_token_accuracy": 0.7120540142059326, "num_tokens": 20904736.0, "step": 9040 }, { "entropy": 1.24483642578125, "epoch": 1.326614381001246, "grad_norm": 8.125, "learning_rate": 1.993514674890579e-05, "loss": 1.1527, "mean_token_accuracy": 0.6920271545648575, "num_tokens": 20928718.0, "step": 9050 }, { "entropy": 1.200996646285057, "epoch": 1.3280803342373377, "grad_norm": 6.625, "learning_rate": 1.993456365167013e-05, "loss": 1.0996, "mean_token_accuracy": 0.7161652266979217, "num_tokens": 20953266.0, "step": 9060 }, { "entropy": 1.1329531490802764, "epoch": 1.3295462874734296, "grad_norm": 8.8125, "learning_rate": 1.993397795343608e-05, "loss": 1.1247, "mean_token_accuracy": 0.7235858753323555, "num_tokens": 20977893.0, "step": 9070 }, { "entropy": 1.2286746591329574, "epoch": 1.3310122407095213, "grad_norm": 9.375, "learning_rate": 1.993338965435698e-05, "loss": 1.162, "mean_token_accuracy": 0.6944783538579941, "num_tokens": 21001639.0, "step": 9080 }, { "entropy": 1.2463327765464782, "epoch": 1.3324781939456132, "grad_norm": 8.9375, "learning_rate": 1.993279875458686e-05, "loss": 1.1939, "mean_token_accuracy": 0.6928312331438065, "num_tokens": 21025687.0, "step": 9090 }, { "entropy": 1.370621892809868, "epoch": 1.3339441471817048, "grad_norm": 8.0625, "learning_rate": 1.9932205254280418e-05, "loss": 1.2984, "mean_token_accuracy": 0.6749988824129105, "num_tokens": 21049948.0, "step": 9100 }, { "entropy": 1.27707496881485, "epoch": 1.3354101004177967, "grad_norm": 12.9375, "learning_rate": 1.9931609153593045e-05, "loss": 1.2684, "mean_token_accuracy": 0.6879852831363678, "num_tokens": 21071088.0, "step": 9110 }, { "entropy": 1.2820963725447654, "epoch": 1.3368760536538884, "grad_norm": 9.375, "learning_rate": 1.9931010452680807e-05, "loss": 1.1781, "mean_token_accuracy": 0.6908757120370865, "num_tokens": 21090668.0, "step": 9120 }, { "entropy": 1.2898325890302658, "epoch": 1.3383420068899803, "grad_norm": 10.125, "learning_rate": 1.9930409151700453e-05, "loss": 1.2554, "mean_token_accuracy": 0.6871968537569046, "num_tokens": 21114488.0, "step": 9130 }, { "entropy": 1.2049978852272034, "epoch": 1.339807960126072, "grad_norm": 13.375, "learning_rate": 1.9929805250809407e-05, "loss": 1.1402, "mean_token_accuracy": 0.6998422384262085, "num_tokens": 21136682.0, "step": 9140 }, { "entropy": 1.2999972343444823, "epoch": 1.3412739133621638, "grad_norm": 14.5625, "learning_rate": 1.9929198750165785e-05, "loss": 1.2185, "mean_token_accuracy": 0.6793841004371644, "num_tokens": 21162100.0, "step": 9150 }, { "entropy": 1.1599612653255462, "epoch": 1.3427398665982555, "grad_norm": 10.25, "learning_rate": 1.9928589649928374e-05, "loss": 1.1262, "mean_token_accuracy": 0.7150771647691727, "num_tokens": 21183054.0, "step": 9160 }, { "entropy": 1.3685899943113327, "epoch": 1.3442058198343472, "grad_norm": 10.375, "learning_rate": 1.992797795025664e-05, "loss": 1.3542, "mean_token_accuracy": 0.6727657586336135, "num_tokens": 21205969.0, "step": 9170 }, { "entropy": 1.0696477800607682, "epoch": 1.345671773070439, "grad_norm": 9.9375, "learning_rate": 1.992736365131074e-05, "loss": 1.0191, "mean_token_accuracy": 0.7281067639589309, "num_tokens": 21230976.0, "step": 9180 }, { "entropy": 1.5111856430768966, "epoch": 1.347137726306531, "grad_norm": 7.84375, "learning_rate": 1.9926746753251498e-05, "loss": 1.4273, "mean_token_accuracy": 0.6527245104312897, "num_tokens": 21254756.0, "step": 9190 }, { "entropy": 1.1138581693172456, "epoch": 1.3486036795426226, "grad_norm": 8.75, "learning_rate": 1.9926127256240436e-05, "loss": 1.0067, "mean_token_accuracy": 0.7185070097446442, "num_tokens": 21279920.0, "step": 9200 }, { "entropy": 1.2445373713970185, "epoch": 1.3500696327787143, "grad_norm": 7.40625, "learning_rate": 1.9925505160439737e-05, "loss": 1.199, "mean_token_accuracy": 0.6939370661973954, "num_tokens": 21305017.0, "step": 9210 }, { "entropy": 1.1862462252378463, "epoch": 1.3515355860148062, "grad_norm": 12.8125, "learning_rate": 1.992488046601228e-05, "loss": 1.1859, "mean_token_accuracy": 0.7062039881944656, "num_tokens": 21329340.0, "step": 9220 }, { "entropy": 1.4676971763372422, "epoch": 1.3530015392508978, "grad_norm": 9.5, "learning_rate": 1.9924253173121618e-05, "loss": 1.3647, "mean_token_accuracy": 0.6540231704711914, "num_tokens": 21351494.0, "step": 9230 }, { "entropy": 1.3558754205703736, "epoch": 1.3544674924869897, "grad_norm": 7.21875, "learning_rate": 1.9923623281931982e-05, "loss": 1.3475, "mean_token_accuracy": 0.6816840440034866, "num_tokens": 21374668.0, "step": 9240 }, { "entropy": 1.1720675870776176, "epoch": 1.3559334457230814, "grad_norm": 12.8125, "learning_rate": 1.9922990792608283e-05, "loss": 1.0842, "mean_token_accuracy": 0.706174123287201, "num_tokens": 21399965.0, "step": 9250 }, { "entropy": 1.187364786863327, "epoch": 1.3573993989591733, "grad_norm": 8.875, "learning_rate": 1.9922355705316123e-05, "loss": 1.0843, "mean_token_accuracy": 0.7126944839954377, "num_tokens": 21422263.0, "step": 9260 }, { "entropy": 1.4473508924245835, "epoch": 1.358865352195265, "grad_norm": 9.6875, "learning_rate": 1.992171802022177e-05, "loss": 1.4084, "mean_token_accuracy": 0.6499342694878578, "num_tokens": 21443131.0, "step": 9270 }, { "entropy": 1.356276535987854, "epoch": 1.3603313054313566, "grad_norm": 8.5625, "learning_rate": 1.9921077737492175e-05, "loss": 1.3343, "mean_token_accuracy": 0.6716964900493622, "num_tokens": 21467109.0, "step": 9280 }, { "entropy": 1.1509250700473785, "epoch": 1.3617972586674485, "grad_norm": 5.9375, "learning_rate": 1.9920434857294985e-05, "loss": 1.0985, "mean_token_accuracy": 0.716162595152855, "num_tokens": 21490992.0, "step": 9290 }, { "entropy": 1.2766923546791076, "epoch": 1.3632632119035404, "grad_norm": 8.5, "learning_rate": 1.99197893797985e-05, "loss": 1.2383, "mean_token_accuracy": 0.6883162975311279, "num_tokens": 21512891.0, "step": 9300 }, { "entropy": 1.3301850408315659, "epoch": 1.364729165139632, "grad_norm": 9.375, "learning_rate": 1.9919141305171723e-05, "loss": 1.3246, "mean_token_accuracy": 0.6762389540672302, "num_tokens": 21536100.0, "step": 9310 }, { "entropy": 1.2921600311994552, "epoch": 1.3661951183757237, "grad_norm": 7.65625, "learning_rate": 1.9918490633584326e-05, "loss": 1.2052, "mean_token_accuracy": 0.6897517144680023, "num_tokens": 21557923.0, "step": 9320 }, { "entropy": 1.1566526293754578, "epoch": 1.3676610716118156, "grad_norm": 12.25, "learning_rate": 1.9917837365206667e-05, "loss": 1.0714, "mean_token_accuracy": 0.7129395276308059, "num_tokens": 21582875.0, "step": 9330 }, { "entropy": 1.3180439203977585, "epoch": 1.3691270248479073, "grad_norm": 8.9375, "learning_rate": 1.9917181500209772e-05, "loss": 1.2754, "mean_token_accuracy": 0.6821719348430634, "num_tokens": 21602314.0, "step": 9340 }, { "entropy": 1.2189882844686508, "epoch": 1.3705929780839992, "grad_norm": 8.9375, "learning_rate": 1.9916523038765362e-05, "loss": 1.1844, "mean_token_accuracy": 0.6982487127184868, "num_tokens": 21627195.0, "step": 9350 }, { "entropy": 1.3117438599467277, "epoch": 1.3720589313200908, "grad_norm": 8.875, "learning_rate": 1.9915861981045828e-05, "loss": 1.2802, "mean_token_accuracy": 0.6818821638822555, "num_tokens": 21652909.0, "step": 9360 }, { "entropy": 1.3425186306238175, "epoch": 1.3735248845561827, "grad_norm": 7.0, "learning_rate": 1.9915198327224247e-05, "loss": 1.3265, "mean_token_accuracy": 0.6834528103470803, "num_tokens": 21678991.0, "step": 9370 }, { "entropy": 1.2104782998561858, "epoch": 1.3749908377922744, "grad_norm": 9.8125, "learning_rate": 1.9914532077474368e-05, "loss": 1.2034, "mean_token_accuracy": 0.7041246891021729, "num_tokens": 21702661.0, "step": 9380 }, { "entropy": 1.177210494875908, "epoch": 1.3764567910283663, "grad_norm": 6.625, "learning_rate": 1.9913863231970626e-05, "loss": 1.0853, "mean_token_accuracy": 0.7129129439592361, "num_tokens": 21728381.0, "step": 9390 }, { "entropy": 1.147410136461258, "epoch": 1.377922744264458, "grad_norm": 7.53125, "learning_rate": 1.991319179088813e-05, "loss": 1.0896, "mean_token_accuracy": 0.7109249651432037, "num_tokens": 21752645.0, "step": 9400 }, { "entropy": 1.2889191061258316, "epoch": 1.3793886975005498, "grad_norm": 9.9375, "learning_rate": 1.9912517754402677e-05, "loss": 1.263, "mean_token_accuracy": 0.686933410167694, "num_tokens": 21775216.0, "step": 9410 }, { "entropy": 1.2982718616724014, "epoch": 1.3808546507366415, "grad_norm": 8.6875, "learning_rate": 1.9911841122690737e-05, "loss": 1.2799, "mean_token_accuracy": 0.6909778267145157, "num_tokens": 21799218.0, "step": 9420 }, { "entropy": 1.2488728329539298, "epoch": 1.3823206039727332, "grad_norm": 8.5625, "learning_rate": 1.991116189592946e-05, "loss": 1.1476, "mean_token_accuracy": 0.7020696699619293, "num_tokens": 21822170.0, "step": 9430 }, { "entropy": 1.222590270638466, "epoch": 1.383786557208825, "grad_norm": 7.5625, "learning_rate": 1.9910480074296676e-05, "loss": 1.1909, "mean_token_accuracy": 0.6997129291296005, "num_tokens": 21848116.0, "step": 9440 }, { "entropy": 1.233240008354187, "epoch": 1.385252510444917, "grad_norm": 10.5, "learning_rate": 1.9909795657970897e-05, "loss": 1.2106, "mean_token_accuracy": 0.6982622593641281, "num_tokens": 21871876.0, "step": 9450 }, { "entropy": 1.3582398295402527, "epoch": 1.3867184636810086, "grad_norm": 7.46875, "learning_rate": 1.990910864713131e-05, "loss": 1.3225, "mean_token_accuracy": 0.6700652986764908, "num_tokens": 21895585.0, "step": 9460 }, { "entropy": 1.0898628264665604, "epoch": 1.3881844169171003, "grad_norm": 9.9375, "learning_rate": 1.990841904195778e-05, "loss": 0.992, "mean_token_accuracy": 0.7235011845827103, "num_tokens": 21919900.0, "step": 9470 }, { "entropy": 1.2246022701263428, "epoch": 1.3896503701531921, "grad_norm": 10.875, "learning_rate": 1.9907726842630866e-05, "loss": 1.1701, "mean_token_accuracy": 0.6938129514455795, "num_tokens": 21942621.0, "step": 9480 }, { "entropy": 1.1797262996435165, "epoch": 1.3911163233892838, "grad_norm": 7.0625, "learning_rate": 1.9907032049331783e-05, "loss": 1.1212, "mean_token_accuracy": 0.703708964586258, "num_tokens": 21966600.0, "step": 9490 }, { "entropy": 1.2238686501979827, "epoch": 1.3925822766253757, "grad_norm": 7.875, "learning_rate": 1.9906334662242443e-05, "loss": 1.1758, "mean_token_accuracy": 0.7007131412625313, "num_tokens": 21988142.0, "step": 9500 }, { "entropy": 1.2103763401508332, "epoch": 1.3940482298614674, "grad_norm": 12.5, "learning_rate": 1.9905634681545428e-05, "loss": 1.2013, "mean_token_accuracy": 0.7087142556905747, "num_tokens": 22013148.0, "step": 9510 }, { "entropy": 1.169419711828232, "epoch": 1.3955141830975593, "grad_norm": 11.1875, "learning_rate": 1.9904932107424006e-05, "loss": 1.1267, "mean_token_accuracy": 0.706183397769928, "num_tokens": 22037793.0, "step": 9520 }, { "entropy": 1.2076320081949234, "epoch": 1.396980136333651, "grad_norm": 9.875, "learning_rate": 1.9904226940062114e-05, "loss": 1.1595, "mean_token_accuracy": 0.7066450536251068, "num_tokens": 22059415.0, "step": 9530 }, { "entropy": 1.1532551914453506, "epoch": 1.3984460895697426, "grad_norm": 5.96875, "learning_rate": 1.990351917964438e-05, "loss": 1.1285, "mean_token_accuracy": 0.7121962189674378, "num_tokens": 22086586.0, "step": 9540 }, { "entropy": 1.2386089473962785, "epoch": 1.3999120428058345, "grad_norm": 7.0, "learning_rate": 1.9902808826356104e-05, "loss": 1.2463, "mean_token_accuracy": 0.6965046256780625, "num_tokens": 22111835.0, "step": 9550 }, { "entropy": 1.1722925439476968, "epoch": 1.4013779960419264, "grad_norm": 7.53125, "learning_rate": 1.9902095880383264e-05, "loss": 1.1282, "mean_token_accuracy": 0.71515002399683, "num_tokens": 22138941.0, "step": 9560 }, { "entropy": 1.2985278785228729, "epoch": 1.402843949278018, "grad_norm": 13.4375, "learning_rate": 1.9901380341912517e-05, "loss": 1.2448, "mean_token_accuracy": 0.6878166735172272, "num_tokens": 22162585.0, "step": 9570 }, { "entropy": 1.1495393127202989, "epoch": 1.4043099025141097, "grad_norm": 7.96875, "learning_rate": 1.9900662211131206e-05, "loss": 1.1302, "mean_token_accuracy": 0.7115373194217682, "num_tokens": 22185617.0, "step": 9580 }, { "entropy": 1.2667575478553772, "epoch": 1.4057758557502016, "grad_norm": 8.8125, "learning_rate": 1.9899941488227342e-05, "loss": 1.2137, "mean_token_accuracy": 0.7016471922397614, "num_tokens": 22205656.0, "step": 9590 }, { "entropy": 1.0853358119726182, "epoch": 1.4072418089862935, "grad_norm": 8.0625, "learning_rate": 1.989921817338962e-05, "loss": 0.9755, "mean_token_accuracy": 0.7319185823202133, "num_tokens": 22229697.0, "step": 9600 }, { "entropy": 1.2888369351625442, "epoch": 1.4087077622223851, "grad_norm": 9.25, "learning_rate": 1.9898492266807414e-05, "loss": 1.2433, "mean_token_accuracy": 0.6874328449368476, "num_tokens": 22253531.0, "step": 9610 }, { "entropy": 1.2388431519269942, "epoch": 1.4101737154584768, "grad_norm": 12.5625, "learning_rate": 1.9897763768670775e-05, "loss": 1.1588, "mean_token_accuracy": 0.6908131062984466, "num_tokens": 22273716.0, "step": 9620 }, { "entropy": 1.307745772600174, "epoch": 1.4116396686945687, "grad_norm": 13.5, "learning_rate": 1.9897032679170435e-05, "loss": 1.2676, "mean_token_accuracy": 0.6866921424865723, "num_tokens": 22293669.0, "step": 9630 }, { "entropy": 1.2503044486045838, "epoch": 1.4131056219306604, "grad_norm": 6.59375, "learning_rate": 1.9896298998497802e-05, "loss": 1.2099, "mean_token_accuracy": 0.6913545817136765, "num_tokens": 22315842.0, "step": 9640 }, { "entropy": 1.1791915476322175, "epoch": 1.4145715751667522, "grad_norm": 7.40625, "learning_rate": 1.9895562726844965e-05, "loss": 1.1062, "mean_token_accuracy": 0.7163125097751617, "num_tokens": 22339618.0, "step": 9650 }, { "entropy": 1.272774139046669, "epoch": 1.416037528402844, "grad_norm": 9.5625, "learning_rate": 1.9894823864404686e-05, "loss": 1.2341, "mean_token_accuracy": 0.6843629866838455, "num_tokens": 22363248.0, "step": 9660 }, { "entropy": 1.0726122736930848, "epoch": 1.4175034816389358, "grad_norm": 6.3125, "learning_rate": 1.989408241137041e-05, "loss": 1.0126, "mean_token_accuracy": 0.720137146115303, "num_tokens": 22390361.0, "step": 9670 }, { "entropy": 1.3477701723575592, "epoch": 1.4189694348750275, "grad_norm": 11.6875, "learning_rate": 1.9893338367936264e-05, "loss": 1.3122, "mean_token_accuracy": 0.6776217430830002, "num_tokens": 22413830.0, "step": 9680 }, { "entropy": 1.3776203572750092, "epoch": 1.4204353881111191, "grad_norm": 8.8125, "learning_rate": 1.989259173429704e-05, "loss": 1.3327, "mean_token_accuracy": 0.678369066119194, "num_tokens": 22435259.0, "step": 9690 }, { "entropy": 1.33594653904438, "epoch": 1.421901341347211, "grad_norm": 11.1875, "learning_rate": 1.9891842510648222e-05, "loss": 1.3256, "mean_token_accuracy": 0.675543138384819, "num_tokens": 22457325.0, "step": 9700 }, { "entropy": 1.2257669404149056, "epoch": 1.423367294583303, "grad_norm": 10.5625, "learning_rate": 1.9891090697185964e-05, "loss": 1.1806, "mean_token_accuracy": 0.7011904299259186, "num_tokens": 22479286.0, "step": 9710 }, { "entropy": 1.3180401742458343, "epoch": 1.4248332478193946, "grad_norm": 13.125, "learning_rate": 1.9890336294107105e-05, "loss": 1.3125, "mean_token_accuracy": 0.675211849808693, "num_tokens": 22501738.0, "step": 9720 }, { "entropy": 1.304020881652832, "epoch": 1.4262992010554862, "grad_norm": 11.75, "learning_rate": 1.988957930160915e-05, "loss": 1.2217, "mean_token_accuracy": 0.6887218952178955, "num_tokens": 22524095.0, "step": 9730 }, { "entropy": 1.1406777858734132, "epoch": 1.4277651542915781, "grad_norm": 7.40625, "learning_rate": 1.9888819719890297e-05, "loss": 1.053, "mean_token_accuracy": 0.7230554789304733, "num_tokens": 22549192.0, "step": 9740 }, { "entropy": 1.26945258975029, "epoch": 1.4292311075276698, "grad_norm": 8.1875, "learning_rate": 1.9888057549149406e-05, "loss": 1.2261, "mean_token_accuracy": 0.6927028954029083, "num_tokens": 22571645.0, "step": 9750 }, { "entropy": 1.3106146842241286, "epoch": 1.4306970607637617, "grad_norm": 10.75, "learning_rate": 1.988729278958603e-05, "loss": 1.2458, "mean_token_accuracy": 0.6824240744113922, "num_tokens": 22594996.0, "step": 9760 }, { "entropy": 1.2352670282125473, "epoch": 1.4321630139998534, "grad_norm": 12.25, "learning_rate": 1.9886525441400394e-05, "loss": 1.2244, "mean_token_accuracy": 0.7021198779344558, "num_tokens": 22616344.0, "step": 9770 }, { "entropy": 1.3321022897958756, "epoch": 1.4336289672359452, "grad_norm": 8.9375, "learning_rate": 1.9885755504793397e-05, "loss": 1.2476, "mean_token_accuracy": 0.682996729016304, "num_tokens": 22639191.0, "step": 9780 }, { "entropy": 1.2195951461791992, "epoch": 1.435094920472037, "grad_norm": 8.8125, "learning_rate": 1.9884982979966617e-05, "loss": 1.1509, "mean_token_accuracy": 0.7078060775995254, "num_tokens": 22662415.0, "step": 9790 }, { "entropy": 1.2395706117153167, "epoch": 1.4365608737081288, "grad_norm": 8.375, "learning_rate": 1.9884207867122313e-05, "loss": 1.181, "mean_token_accuracy": 0.6979675829410553, "num_tokens": 22682845.0, "step": 9800 }, { "entropy": 1.2178602397441864, "epoch": 1.4380268269442205, "grad_norm": 7.03125, "learning_rate": 1.9883430166463416e-05, "loss": 1.1757, "mean_token_accuracy": 0.7013632684946061, "num_tokens": 22704982.0, "step": 9810 }, { "entropy": 1.3143818721175193, "epoch": 1.4394927801803123, "grad_norm": 14.0, "learning_rate": 1.9882649878193544e-05, "loss": 1.2539, "mean_token_accuracy": 0.6840845346450806, "num_tokens": 22724683.0, "step": 9820 }, { "entropy": 1.328680956363678, "epoch": 1.440958733416404, "grad_norm": 14.125, "learning_rate": 1.988186700251699e-05, "loss": 1.2543, "mean_token_accuracy": 0.6750828713178635, "num_tokens": 22745768.0, "step": 9830 }, { "entropy": 1.0480001509189605, "epoch": 1.4424246866524957, "grad_norm": 8.9375, "learning_rate": 1.9881081539638707e-05, "loss": 1.0274, "mean_token_accuracy": 0.7303318858146668, "num_tokens": 22770677.0, "step": 9840 }, { "entropy": 1.2082165986299516, "epoch": 1.4438906398885876, "grad_norm": 7.8125, "learning_rate": 1.988029348976435e-05, "loss": 1.1359, "mean_token_accuracy": 0.7049511164426804, "num_tokens": 22794336.0, "step": 9850 }, { "entropy": 1.5371334940195083, "epoch": 1.4453565931246795, "grad_norm": 8.6875, "learning_rate": 1.9879502853100238e-05, "loss": 1.4892, "mean_token_accuracy": 0.6409595131874084, "num_tokens": 22812954.0, "step": 9860 }, { "entropy": 1.152747967839241, "epoch": 1.4468225463607711, "grad_norm": 7.4375, "learning_rate": 1.987870962985337e-05, "loss": 1.1337, "mean_token_accuracy": 0.710613465309143, "num_tokens": 22837623.0, "step": 9870 }, { "entropy": 1.1821823507547378, "epoch": 1.4482884995968628, "grad_norm": 12.0, "learning_rate": 1.9877913820231426e-05, "loss": 1.1994, "mean_token_accuracy": 0.7003123015165329, "num_tokens": 22857868.0, "step": 9880 }, { "entropy": 1.1604348480701447, "epoch": 1.4497544528329547, "grad_norm": 9.75, "learning_rate": 1.987711542444275e-05, "loss": 1.095, "mean_token_accuracy": 0.7122529059648514, "num_tokens": 22882112.0, "step": 9890 }, { "entropy": 1.3076474517583847, "epoch": 1.4512204060690463, "grad_norm": 6.375, "learning_rate": 1.9876314442696382e-05, "loss": 1.2544, "mean_token_accuracy": 0.6889898717403412, "num_tokens": 22904540.0, "step": 9900 }, { "entropy": 1.2357600688934327, "epoch": 1.4526863593051382, "grad_norm": 9.125, "learning_rate": 1.9875510875202028e-05, "loss": 1.192, "mean_token_accuracy": 0.7047762721776962, "num_tokens": 22926700.0, "step": 9910 }, { "entropy": 1.2545586436986924, "epoch": 1.45415231254123, "grad_norm": 7.4375, "learning_rate": 1.9874704722170068e-05, "loss": 1.2396, "mean_token_accuracy": 0.693929934501648, "num_tokens": 22948034.0, "step": 9920 }, { "entropy": 1.1066010758280753, "epoch": 1.4556182657773218, "grad_norm": 10.8125, "learning_rate": 1.9873895983811563e-05, "loss": 1.0398, "mean_token_accuracy": 0.723475095629692, "num_tokens": 22971816.0, "step": 9930 }, { "entropy": 1.2392890810966493, "epoch": 1.4570842190134135, "grad_norm": 7.5, "learning_rate": 1.9873084660338257e-05, "loss": 1.1999, "mean_token_accuracy": 0.705535152554512, "num_tokens": 22994277.0, "step": 9940 }, { "entropy": 1.3208871603012085, "epoch": 1.4585501722495051, "grad_norm": 7.28125, "learning_rate": 1.987227075196256e-05, "loss": 1.2934, "mean_token_accuracy": 0.691740533709526, "num_tokens": 23018973.0, "step": 9950 }, { "entropy": 1.2369939118623734, "epoch": 1.460016125485597, "grad_norm": 9.1875, "learning_rate": 1.9871454258897567e-05, "loss": 1.1895, "mean_token_accuracy": 0.6939509004354477, "num_tokens": 23040657.0, "step": 9960 }, { "entropy": 1.1703111559152604, "epoch": 1.461482078721689, "grad_norm": 7.5625, "learning_rate": 1.9870635181357047e-05, "loss": 1.1171, "mean_token_accuracy": 0.7060460686683655, "num_tokens": 23066370.0, "step": 9970 }, { "entropy": 1.255358025431633, "epoch": 1.4629480319577806, "grad_norm": 8.5625, "learning_rate": 1.986981351955544e-05, "loss": 1.1932, "mean_token_accuracy": 0.6931944966316224, "num_tokens": 23085785.0, "step": 9980 }, { "entropy": 1.3179078280925751, "epoch": 1.4644139851938722, "grad_norm": 10.75, "learning_rate": 1.9868989273707874e-05, "loss": 1.2692, "mean_token_accuracy": 0.6760831624269485, "num_tokens": 23105803.0, "step": 9990 }, { "entropy": 1.2231737613677978, "epoch": 1.4658799384299641, "grad_norm": 9.0, "learning_rate": 1.986816244403014e-05, "loss": 1.1838, "mean_token_accuracy": 0.6952070355415344, "num_tokens": 23127836.0, "step": 10000 }, { "entropy": 1.102470600605011, "epoch": 1.4673458916660558, "grad_norm": 8.125, "learning_rate": 1.9867333030738723e-05, "loss": 1.0534, "mean_token_accuracy": 0.7315528362989425, "num_tokens": 23150244.0, "step": 10010 }, { "entropy": 1.3113805502653122, "epoch": 1.4688118449021477, "grad_norm": 7.53125, "learning_rate": 1.9866501034050768e-05, "loss": 1.2541, "mean_token_accuracy": 0.688956880569458, "num_tokens": 23173388.0, "step": 10020 }, { "entropy": 1.189794671535492, "epoch": 1.4702777981382393, "grad_norm": 6.4375, "learning_rate": 1.9865666454184097e-05, "loss": 1.1427, "mean_token_accuracy": 0.7037414193153382, "num_tokens": 23197098.0, "step": 10030 }, { "entropy": 1.128222143650055, "epoch": 1.4717437513743312, "grad_norm": 10.4375, "learning_rate": 1.986482929135723e-05, "loss": 1.0535, "mean_token_accuracy": 0.7155973613262177, "num_tokens": 23221116.0, "step": 10040 }, { "entropy": 1.0508635103702546, "epoch": 1.4732097046104229, "grad_norm": 7.28125, "learning_rate": 1.9863989545789327e-05, "loss": 0.987, "mean_token_accuracy": 0.7326636135578155, "num_tokens": 23245481.0, "step": 10050 }, { "entropy": 1.1559551000595092, "epoch": 1.4746756578465148, "grad_norm": 15.5, "learning_rate": 1.9863147217700263e-05, "loss": 1.1027, "mean_token_accuracy": 0.7122437536716462, "num_tokens": 23269851.0, "step": 10060 }, { "entropy": 1.2696208357810974, "epoch": 1.4761416110826064, "grad_norm": 5.875, "learning_rate": 1.9862302307310557e-05, "loss": 1.1958, "mean_token_accuracy": 0.6953899621963501, "num_tokens": 23294697.0, "step": 10070 }, { "entropy": 1.25274915099144, "epoch": 1.4776075643186983, "grad_norm": 9.0, "learning_rate": 1.9861454814841427e-05, "loss": 1.2473, "mean_token_accuracy": 0.6851618379354477, "num_tokens": 23313299.0, "step": 10080 }, { "entropy": 1.1570469558238983, "epoch": 1.47907351755479, "grad_norm": 7.78125, "learning_rate": 1.9860604740514748e-05, "loss": 1.1181, "mean_token_accuracy": 0.7085950642824173, "num_tokens": 23337304.0, "step": 10090 }, { "entropy": 1.24805908203125, "epoch": 1.4805394707908817, "grad_norm": 10.5625, "learning_rate": 1.985975208455309e-05, "loss": 1.2023, "mean_token_accuracy": 0.696613147854805, "num_tokens": 23363915.0, "step": 10100 }, { "entropy": 1.2189895957708359, "epoch": 1.4820054240269736, "grad_norm": 8.5, "learning_rate": 1.985889684717968e-05, "loss": 1.1857, "mean_token_accuracy": 0.6969820380210876, "num_tokens": 23382975.0, "step": 10110 }, { "entropy": 1.0844991236925126, "epoch": 1.4834713772630654, "grad_norm": 7.15625, "learning_rate": 1.985803902861844e-05, "loss": 1.0467, "mean_token_accuracy": 0.7303868502378463, "num_tokens": 23407530.0, "step": 10120 }, { "entropy": 1.2836492627859115, "epoch": 1.484937330499157, "grad_norm": 6.96875, "learning_rate": 1.9857178629093955e-05, "loss": 1.2607, "mean_token_accuracy": 0.6838064298033715, "num_tokens": 23428035.0, "step": 10130 }, { "entropy": 1.1479956835508347, "epoch": 1.4864032837352488, "grad_norm": 6.375, "learning_rate": 1.9856315648831486e-05, "loss": 1.0935, "mean_token_accuracy": 0.7159177035093307, "num_tokens": 23449497.0, "step": 10140 }, { "entropy": 1.3778962910175323, "epoch": 1.4878692369713407, "grad_norm": 7.625, "learning_rate": 1.9855450088056972e-05, "loss": 1.3366, "mean_token_accuracy": 0.6690073281526565, "num_tokens": 23473834.0, "step": 10150 }, { "entropy": 1.1654943197965622, "epoch": 1.4893351902074323, "grad_norm": 7.65625, "learning_rate": 1.985458194699703e-05, "loss": 1.1325, "mean_token_accuracy": 0.7136056959629059, "num_tokens": 23496988.0, "step": 10160 }, { "entropy": 1.2208922982215882, "epoch": 1.4908011434435242, "grad_norm": 8.75, "learning_rate": 1.9853711225878953e-05, "loss": 1.137, "mean_token_accuracy": 0.7033099174499512, "num_tokens": 23521978.0, "step": 10170 }, { "entropy": 1.217815899848938, "epoch": 1.4922670966796159, "grad_norm": 9.5, "learning_rate": 1.9852837924930704e-05, "loss": 1.1749, "mean_token_accuracy": 0.7007389098405838, "num_tokens": 23542870.0, "step": 10180 }, { "entropy": 1.2294093400239945, "epoch": 1.4937330499157078, "grad_norm": 10.0625, "learning_rate": 1.9851962044380923e-05, "loss": 1.1607, "mean_token_accuracy": 0.7028786987066269, "num_tokens": 23564499.0, "step": 10190 }, { "entropy": 1.20090192258358, "epoch": 1.4951990031517994, "grad_norm": 7.21875, "learning_rate": 1.985108358445893e-05, "loss": 1.1968, "mean_token_accuracy": 0.7091238319873809, "num_tokens": 23589541.0, "step": 10200 }, { "entropy": 1.2275371253490448, "epoch": 1.496664956387891, "grad_norm": 7.625, "learning_rate": 1.985020254539472e-05, "loss": 1.1688, "mean_token_accuracy": 0.6921293556690216, "num_tokens": 23612837.0, "step": 10210 }, { "entropy": 1.1985953748226166, "epoch": 1.498130909623983, "grad_norm": 6.34375, "learning_rate": 1.9849318927418958e-05, "loss": 1.1525, "mean_token_accuracy": 0.7115861311554909, "num_tokens": 23638669.0, "step": 10220 }, { "entropy": 1.1204245388507843, "epoch": 1.4995968628600749, "grad_norm": 9.625, "learning_rate": 1.9848432730762983e-05, "loss": 1.0965, "mean_token_accuracy": 0.7171857804059982, "num_tokens": 23663066.0, "step": 10230 }, { "entropy": 1.2571208596229553, "epoch": 1.5010628160961665, "grad_norm": 6.34375, "learning_rate": 1.9847543955658814e-05, "loss": 1.2212, "mean_token_accuracy": 0.6898612052202224, "num_tokens": 23686733.0, "step": 10240 }, { "entropy": 1.1361409693956375, "epoch": 1.5025287693322582, "grad_norm": 9.25, "learning_rate": 1.984665260233915e-05, "loss": 1.0368, "mean_token_accuracy": 0.7188237726688385, "num_tokens": 23710102.0, "step": 10250 }, { "entropy": 1.2668312817811966, "epoch": 1.50399472256835, "grad_norm": 7.71875, "learning_rate": 1.984575867103735e-05, "loss": 1.2375, "mean_token_accuracy": 0.6897116929292679, "num_tokens": 23735335.0, "step": 10260 }, { "entropy": 1.250565269589424, "epoch": 1.505460675804442, "grad_norm": 8.4375, "learning_rate": 1.9844862161987465e-05, "loss": 1.1806, "mean_token_accuracy": 0.6929759860038758, "num_tokens": 23755941.0, "step": 10270 }, { "entropy": 1.2342004954814911, "epoch": 1.5069266290405336, "grad_norm": 7.15625, "learning_rate": 1.984396307542421e-05, "loss": 1.1846, "mean_token_accuracy": 0.698310598731041, "num_tokens": 23780217.0, "step": 10280 }, { "entropy": 1.2794748306274415, "epoch": 1.5083925822766253, "grad_norm": 9.0, "learning_rate": 1.9843061411582972e-05, "loss": 1.1904, "mean_token_accuracy": 0.6975496292114258, "num_tokens": 23801675.0, "step": 10290 }, { "entropy": 1.1212030798196793, "epoch": 1.5098585355127172, "grad_norm": 13.625, "learning_rate": 1.9842157170699826e-05, "loss": 1.1032, "mean_token_accuracy": 0.7168443828821183, "num_tokens": 23827076.0, "step": 10300 }, { "entropy": 1.2572992652654649, "epoch": 1.5113244887488089, "grad_norm": 14.8125, "learning_rate": 1.9841250353011514e-05, "loss": 1.2357, "mean_token_accuracy": 0.692917314171791, "num_tokens": 23848939.0, "step": 10310 }, { "entropy": 1.2338586062192918, "epoch": 1.5127904419849005, "grad_norm": 9.5, "learning_rate": 1.9840340958755448e-05, "loss": 1.1101, "mean_token_accuracy": 0.6928750067949295, "num_tokens": 23872041.0, "step": 10320 }, { "entropy": 1.0725140035152436, "epoch": 1.5142563952209924, "grad_norm": 7.125, "learning_rate": 1.983942898816972e-05, "loss": 1.0131, "mean_token_accuracy": 0.7316102594137192, "num_tokens": 23897950.0, "step": 10330 }, { "entropy": 1.376725971698761, "epoch": 1.5157223484570843, "grad_norm": 12.0625, "learning_rate": 1.9838514441493096e-05, "loss": 1.3075, "mean_token_accuracy": 0.6689966768026352, "num_tokens": 23918539.0, "step": 10340 }, { "entropy": 1.2140399068593979, "epoch": 1.517188301693176, "grad_norm": 8.4375, "learning_rate": 1.9837597318965018e-05, "loss": 1.135, "mean_token_accuracy": 0.6984446734189987, "num_tokens": 23943678.0, "step": 10350 }, { "entropy": 1.2335283756256104, "epoch": 1.5186542549292676, "grad_norm": 7.96875, "learning_rate": 1.9836677620825602e-05, "loss": 1.1703, "mean_token_accuracy": 0.693614947795868, "num_tokens": 23965396.0, "step": 10360 }, { "entropy": 1.1585767269134521, "epoch": 1.5201202081653595, "grad_norm": 11.4375, "learning_rate": 1.9835755347315632e-05, "loss": 1.1491, "mean_token_accuracy": 0.7068466156721115, "num_tokens": 23989596.0, "step": 10370 }, { "entropy": 1.022016668319702, "epoch": 1.5215861614014514, "grad_norm": 7.0, "learning_rate": 1.9834830498676577e-05, "loss": 0.9214, "mean_token_accuracy": 0.7413171797990799, "num_tokens": 24015614.0, "step": 10380 }, { "entropy": 1.2534865468740464, "epoch": 1.523052114637543, "grad_norm": 9.0625, "learning_rate": 1.983390307515057e-05, "loss": 1.2044, "mean_token_accuracy": 0.6908579170703888, "num_tokens": 24039312.0, "step": 10390 }, { "entropy": 1.1647131264209747, "epoch": 1.5245180678736348, "grad_norm": 10.5625, "learning_rate": 1.9832973076980425e-05, "loss": 1.1188, "mean_token_accuracy": 0.697473892569542, "num_tokens": 24064042.0, "step": 10400 }, { "entropy": 1.2789953708648683, "epoch": 1.5259840211097266, "grad_norm": 10.125, "learning_rate": 1.983204050440963e-05, "loss": 1.2611, "mean_token_accuracy": 0.691452243924141, "num_tokens": 24087361.0, "step": 10410 }, { "entropy": 1.239151269197464, "epoch": 1.5274499743458185, "grad_norm": 8.9375, "learning_rate": 1.9831105357682336e-05, "loss": 1.2043, "mean_token_accuracy": 0.7048815608024597, "num_tokens": 24112778.0, "step": 10420 }, { "entropy": 1.1266495525836944, "epoch": 1.52891592758191, "grad_norm": 9.1875, "learning_rate": 1.9830167637043385e-05, "loss": 1.0748, "mean_token_accuracy": 0.7264415472745895, "num_tokens": 24134479.0, "step": 10430 }, { "entropy": 1.1599737763404847, "epoch": 1.5303818808180019, "grad_norm": 9.1875, "learning_rate": 1.982922734273828e-05, "loss": 1.0814, "mean_token_accuracy": 0.7063671827316285, "num_tokens": 24157327.0, "step": 10440 }, { "entropy": 1.2053853124380112, "epoch": 1.5318478340540937, "grad_norm": 13.125, "learning_rate": 1.9828284475013208e-05, "loss": 1.195, "mean_token_accuracy": 0.702821534872055, "num_tokens": 24180217.0, "step": 10450 }, { "entropy": 1.3140929013490676, "epoch": 1.5333137872901854, "grad_norm": 11.0, "learning_rate": 1.9827339034115013e-05, "loss": 1.2747, "mean_token_accuracy": 0.6793932914733887, "num_tokens": 24202486.0, "step": 10460 }, { "entropy": 1.183591377735138, "epoch": 1.534779740526277, "grad_norm": 9.9375, "learning_rate": 1.9826391020291236e-05, "loss": 1.1873, "mean_token_accuracy": 0.7084651678800583, "num_tokens": 24229050.0, "step": 10470 }, { "entropy": 1.2489636540412903, "epoch": 1.536245693762369, "grad_norm": 10.0, "learning_rate": 1.982544043379007e-05, "loss": 1.1785, "mean_token_accuracy": 0.6964780062437057, "num_tokens": 24251161.0, "step": 10480 }, { "entropy": 1.3630384653806686, "epoch": 1.5377116469984609, "grad_norm": 7.78125, "learning_rate": 1.98244872748604e-05, "loss": 1.3428, "mean_token_accuracy": 0.6720784962177276, "num_tokens": 24274805.0, "step": 10490 }, { "entropy": 1.2351271033287048, "epoch": 1.5391776002345525, "grad_norm": 7.40625, "learning_rate": 1.982353154375177e-05, "loss": 1.1648, "mean_token_accuracy": 0.7007684022188186, "num_tokens": 24294275.0, "step": 10500 }, { "entropy": 1.3215063214302063, "epoch": 1.5406435534706442, "grad_norm": 9.3125, "learning_rate": 1.98225732407144e-05, "loss": 1.3112, "mean_token_accuracy": 0.6762094020843505, "num_tokens": 24315010.0, "step": 10510 }, { "entropy": 1.2089270949363708, "epoch": 1.542109506706736, "grad_norm": 6.875, "learning_rate": 1.9821612365999192e-05, "loss": 1.1535, "mean_token_accuracy": 0.701535327732563, "num_tokens": 24339929.0, "step": 10520 }, { "entropy": 1.1864853024482727, "epoch": 1.543575459942828, "grad_norm": 13.9375, "learning_rate": 1.9820648919857712e-05, "loss": 1.1297, "mean_token_accuracy": 0.7144268453121185, "num_tokens": 24361078.0, "step": 10530 }, { "entropy": 1.1602874040603637, "epoch": 1.5450414131789196, "grad_norm": 7.125, "learning_rate": 1.9819682902542205e-05, "loss": 1.0841, "mean_token_accuracy": 0.7139178067445755, "num_tokens": 24385568.0, "step": 10540 }, { "entropy": 1.2905980795621872, "epoch": 1.5465073664150113, "grad_norm": 12.375, "learning_rate": 1.9818714314305584e-05, "loss": 1.2505, "mean_token_accuracy": 0.6858444154262543, "num_tokens": 24409403.0, "step": 10550 }, { "entropy": 1.0862625420093537, "epoch": 1.5479733196511032, "grad_norm": 13.1875, "learning_rate": 1.9817743155401444e-05, "loss": 1.0167, "mean_token_accuracy": 0.7345458507537842, "num_tokens": 24432847.0, "step": 10560 }, { "entropy": 1.142924252152443, "epoch": 1.5494392728871949, "grad_norm": 6.3125, "learning_rate": 1.9816769426084042e-05, "loss": 1.1345, "mean_token_accuracy": 0.7108615189790726, "num_tokens": 24456459.0, "step": 10570 }, { "entropy": 1.2269693464040756, "epoch": 1.5509052261232865, "grad_norm": 7.40625, "learning_rate": 1.981579312660831e-05, "loss": 1.1918, "mean_token_accuracy": 0.710510267317295, "num_tokens": 24483226.0, "step": 10580 }, { "entropy": 1.3646783709526062, "epoch": 1.5523711793593784, "grad_norm": 11.3125, "learning_rate": 1.9814814257229863e-05, "loss": 1.3131, "mean_token_accuracy": 0.6769515186548233, "num_tokens": 24505221.0, "step": 10590 }, { "entropy": 1.050296664237976, "epoch": 1.5538371325954703, "grad_norm": 7.53125, "learning_rate": 1.9813832818204978e-05, "loss": 0.983, "mean_token_accuracy": 0.7337777823209762, "num_tokens": 24531130.0, "step": 10600 }, { "entropy": 1.090765431523323, "epoch": 1.555303085831562, "grad_norm": 8.5625, "learning_rate": 1.9812848809790612e-05, "loss": 1.0562, "mean_token_accuracy": 0.7268309175968171, "num_tokens": 24555847.0, "step": 10610 }, { "entropy": 1.239765077829361, "epoch": 1.5567690390676536, "grad_norm": 8.4375, "learning_rate": 1.9811862232244384e-05, "loss": 1.1786, "mean_token_accuracy": 0.7003133982419968, "num_tokens": 24574734.0, "step": 10620 }, { "entropy": 1.141008460521698, "epoch": 1.5582349923037455, "grad_norm": 6.0, "learning_rate": 1.9810873085824604e-05, "loss": 1.0974, "mean_token_accuracy": 0.7079092532396316, "num_tokens": 24598287.0, "step": 10630 }, { "entropy": 1.1133719652891159, "epoch": 1.5597009455398374, "grad_norm": 9.75, "learning_rate": 1.980988137079023e-05, "loss": 1.0648, "mean_token_accuracy": 0.7255733668804168, "num_tokens": 24621327.0, "step": 10640 }, { "entropy": 1.1433954894542695, "epoch": 1.561166898775929, "grad_norm": 10.625, "learning_rate": 1.9808887087400916e-05, "loss": 1.1047, "mean_token_accuracy": 0.7141841173171997, "num_tokens": 24644152.0, "step": 10650 }, { "entropy": 1.2660430759191512, "epoch": 1.5626328520120207, "grad_norm": 7.03125, "learning_rate": 1.9807890235916975e-05, "loss": 1.2669, "mean_token_accuracy": 0.6920972168445587, "num_tokens": 24666915.0, "step": 10660 }, { "entropy": 1.2261504799127578, "epoch": 1.5640988052481126, "grad_norm": 6.8125, "learning_rate": 1.9806890816599393e-05, "loss": 1.1904, "mean_token_accuracy": 0.7006199151277542, "num_tokens": 24687867.0, "step": 10670 }, { "entropy": 1.150604310631752, "epoch": 1.5655647584842045, "grad_norm": 8.8125, "learning_rate": 1.9805888829709836e-05, "loss": 1.1007, "mean_token_accuracy": 0.7124580025672913, "num_tokens": 24710415.0, "step": 10680 }, { "entropy": 1.1555919140577315, "epoch": 1.5670307117202962, "grad_norm": 7.09375, "learning_rate": 1.980488427551064e-05, "loss": 1.1419, "mean_token_accuracy": 0.714571613073349, "num_tokens": 24735381.0, "step": 10690 }, { "entropy": 1.1774719059467316, "epoch": 1.5684966649563878, "grad_norm": 8.625, "learning_rate": 1.98038771542648e-05, "loss": 1.1162, "mean_token_accuracy": 0.7089532375335693, "num_tokens": 24760558.0, "step": 10700 }, { "entropy": 1.1050567507743836, "epoch": 1.5699626181924797, "grad_norm": 7.25, "learning_rate": 1.9802867466236e-05, "loss": 1.0558, "mean_token_accuracy": 0.7174951702356338, "num_tokens": 24783441.0, "step": 10710 }, { "entropy": 1.2640710860490798, "epoch": 1.5714285714285714, "grad_norm": 12.9375, "learning_rate": 1.9801855211688588e-05, "loss": 1.2694, "mean_token_accuracy": 0.677370497584343, "num_tokens": 24804939.0, "step": 10720 }, { "entropy": 1.120713397860527, "epoch": 1.572894524664663, "grad_norm": 9.125, "learning_rate": 1.9800840390887587e-05, "loss": 1.0958, "mean_token_accuracy": 0.7220742166042328, "num_tokens": 24829952.0, "step": 10730 }, { "entropy": 1.216777178645134, "epoch": 1.574360477900755, "grad_norm": 13.875, "learning_rate": 1.9799823004098687e-05, "loss": 1.1912, "mean_token_accuracy": 0.6983893483877182, "num_tokens": 24852630.0, "step": 10740 }, { "entropy": 1.4806131273508072, "epoch": 1.5758264311368468, "grad_norm": 8.875, "learning_rate": 1.979880305158826e-05, "loss": 1.4178, "mean_token_accuracy": 0.6558003425598145, "num_tokens": 24877977.0, "step": 10750 }, { "entropy": 1.2858109802007676, "epoch": 1.5772923843729385, "grad_norm": 10.1875, "learning_rate": 1.9797780533623334e-05, "loss": 1.2701, "mean_token_accuracy": 0.6879681885242462, "num_tokens": 24900568.0, "step": 10760 }, { "entropy": 1.1539961755275727, "epoch": 1.5787583376090302, "grad_norm": 9.0625, "learning_rate": 1.9796755450471624e-05, "loss": 1.069, "mean_token_accuracy": 0.7099590927362442, "num_tokens": 24925230.0, "step": 10770 }, { "entropy": 1.3013078063726424, "epoch": 1.580224290845122, "grad_norm": 7.25, "learning_rate": 1.979572780240151e-05, "loss": 1.2639, "mean_token_accuracy": 0.6879950761795044, "num_tokens": 24949858.0, "step": 10780 }, { "entropy": 1.1133844822645187, "epoch": 1.581690244081214, "grad_norm": 7.03125, "learning_rate": 1.9794697589682038e-05, "loss": 1.0857, "mean_token_accuracy": 0.7253763973712921, "num_tokens": 24975423.0, "step": 10790 }, { "entropy": 1.0255796998739242, "epoch": 1.5831561973173056, "grad_norm": 7.875, "learning_rate": 1.9793664812582936e-05, "loss": 0.9294, "mean_token_accuracy": 0.7409351885318756, "num_tokens": 25001147.0, "step": 10800 }, { "entropy": 1.3538257151842117, "epoch": 1.5846221505533973, "grad_norm": 8.125, "learning_rate": 1.97926294713746e-05, "loss": 1.3491, "mean_token_accuracy": 0.6744788348674774, "num_tokens": 25021246.0, "step": 10810 }, { "entropy": 1.1571768373250961, "epoch": 1.5860881037894892, "grad_norm": 8.4375, "learning_rate": 1.979159156632809e-05, "loss": 1.0893, "mean_token_accuracy": 0.7043894588947296, "num_tokens": 25044711.0, "step": 10820 }, { "entropy": 1.102740803360939, "epoch": 1.5875540570255808, "grad_norm": 7.375, "learning_rate": 1.9790551097715146e-05, "loss": 1.0459, "mean_token_accuracy": 0.7195653796195984, "num_tokens": 25067305.0, "step": 10830 }, { "entropy": 1.346173968911171, "epoch": 1.5890200102616725, "grad_norm": 12.375, "learning_rate": 1.978950806580818e-05, "loss": 1.3482, "mean_token_accuracy": 0.6749548226594925, "num_tokens": 25089817.0, "step": 10840 }, { "entropy": 1.235884916782379, "epoch": 1.5904859634977644, "grad_norm": 11.25, "learning_rate": 1.9788462470880264e-05, "loss": 1.1918, "mean_token_accuracy": 0.6899536430835724, "num_tokens": 25113036.0, "step": 10850 }, { "entropy": 1.0972918927669526, "epoch": 1.5919519167338563, "grad_norm": 6.4375, "learning_rate": 1.978741431320515e-05, "loss": 1.0196, "mean_token_accuracy": 0.7260525107383728, "num_tokens": 25138391.0, "step": 10860 }, { "entropy": 1.2476652204990386, "epoch": 1.593417869969948, "grad_norm": 7.25, "learning_rate": 1.978636359305727e-05, "loss": 1.1929, "mean_token_accuracy": 0.6880863100290299, "num_tokens": 25161632.0, "step": 10870 }, { "entropy": 1.2586310476064682, "epoch": 1.5948838232060396, "grad_norm": 12.1875, "learning_rate": 1.9785310310711705e-05, "loss": 1.2241, "mean_token_accuracy": 0.6850825041532517, "num_tokens": 25183653.0, "step": 10880 }, { "entropy": 1.364493390917778, "epoch": 1.5963497764421315, "grad_norm": 9.0625, "learning_rate": 1.9784254466444216e-05, "loss": 1.2751, "mean_token_accuracy": 0.6844106301665306, "num_tokens": 25205476.0, "step": 10890 }, { "entropy": 1.49171784222126, "epoch": 1.5978157296782234, "grad_norm": 10.125, "learning_rate": 1.9783196060531247e-05, "loss": 1.4444, "mean_token_accuracy": 0.6583788752555847, "num_tokens": 25227063.0, "step": 10900 }, { "entropy": 1.2148515105247497, "epoch": 1.599281682914315, "grad_norm": 11.5, "learning_rate": 1.97821350932499e-05, "loss": 1.1571, "mean_token_accuracy": 0.6976304054260254, "num_tokens": 25250828.0, "step": 10910 }, { "entropy": 1.1061373502016068, "epoch": 1.6007476361504067, "grad_norm": 9.125, "learning_rate": 1.9781071564877938e-05, "loss": 1.1311, "mean_token_accuracy": 0.7184530705213547, "num_tokens": 25275422.0, "step": 10920 }, { "entropy": 1.3884032368659973, "epoch": 1.6022135893864986, "grad_norm": 12.5, "learning_rate": 1.9780005475693822e-05, "loss": 1.4059, "mean_token_accuracy": 0.6570729851722718, "num_tokens": 25297774.0, "step": 10930 }, { "entropy": 1.3332613229751586, "epoch": 1.6036795426225905, "grad_norm": 9.6875, "learning_rate": 1.977893682597666e-05, "loss": 1.2508, "mean_token_accuracy": 0.6825523197650909, "num_tokens": 25321575.0, "step": 10940 }, { "entropy": 1.3160264521837235, "epoch": 1.6051454958586822, "grad_norm": 8.8125, "learning_rate": 1.977786561600624e-05, "loss": 1.2874, "mean_token_accuracy": 0.6864227652549744, "num_tokens": 25346591.0, "step": 10950 }, { "entropy": 1.213172659277916, "epoch": 1.6066114490947738, "grad_norm": 14.5625, "learning_rate": 1.9776791846063022e-05, "loss": 1.1422, "mean_token_accuracy": 0.7077664613723755, "num_tokens": 25367259.0, "step": 10960 }, { "entropy": 1.2498607099056245, "epoch": 1.6080774023308657, "grad_norm": 11.6875, "learning_rate": 1.9775715516428126e-05, "loss": 1.1914, "mean_token_accuracy": 0.6958065778017044, "num_tokens": 25390509.0, "step": 10970 }, { "entropy": 0.9521230086684227, "epoch": 1.6095433555669574, "grad_norm": 8.625, "learning_rate": 1.9774636627383356e-05, "loss": 0.8712, "mean_token_accuracy": 0.7603944033384323, "num_tokens": 25415771.0, "step": 10980 }, { "entropy": 1.2991198688745498, "epoch": 1.611009308803049, "grad_norm": 7.53125, "learning_rate": 1.977355517921118e-05, "loss": 1.2557, "mean_token_accuracy": 0.685220542550087, "num_tokens": 25438458.0, "step": 10990 }, { "entropy": 1.4566244035959244, "epoch": 1.612475262039141, "grad_norm": 9.5625, "learning_rate": 1.977247117219473e-05, "loss": 1.412, "mean_token_accuracy": 0.6551064744591713, "num_tokens": 25460400.0, "step": 11000 }, { "entropy": 1.1620822459459306, "epoch": 1.6139412152752328, "grad_norm": 12.6875, "learning_rate": 1.977138460661781e-05, "loss": 1.1166, "mean_token_accuracy": 0.7083588123321534, "num_tokens": 25483063.0, "step": 11010 }, { "entropy": 1.4154332607984543, "epoch": 1.6154071685113245, "grad_norm": 12.5625, "learning_rate": 1.9770295482764903e-05, "loss": 1.4034, "mean_token_accuracy": 0.6564199805259705, "num_tokens": 25503900.0, "step": 11020 }, { "entropy": 1.2684105038642883, "epoch": 1.6168731217474162, "grad_norm": 8.5, "learning_rate": 1.9769203800921157e-05, "loss": 1.2335, "mean_token_accuracy": 0.6940359503030777, "num_tokens": 25529390.0, "step": 11030 }, { "entropy": 1.2436715990304947, "epoch": 1.618339074983508, "grad_norm": 6.90625, "learning_rate": 1.9768109561372386e-05, "loss": 1.1772, "mean_token_accuracy": 0.6949643611907959, "num_tokens": 25551029.0, "step": 11040 }, { "entropy": 1.1488238453865052, "epoch": 1.6198050282196, "grad_norm": 12.8125, "learning_rate": 1.9767012764405078e-05, "loss": 1.0843, "mean_token_accuracy": 0.7129244595766068, "num_tokens": 25576708.0, "step": 11050 }, { "entropy": 1.177405709028244, "epoch": 1.6212709814556916, "grad_norm": 12.3125, "learning_rate": 1.9765913410306384e-05, "loss": 1.1611, "mean_token_accuracy": 0.7030182361602784, "num_tokens": 25600015.0, "step": 11060 }, { "entropy": 1.1901787519454956, "epoch": 1.6227369346917833, "grad_norm": 11.4375, "learning_rate": 1.9764811499364135e-05, "loss": 1.1834, "mean_token_accuracy": 0.7005995780229568, "num_tokens": 25624124.0, "step": 11070 }, { "entropy": 1.1539156436920166, "epoch": 1.6242028879278751, "grad_norm": 9.0625, "learning_rate": 1.9763707031866824e-05, "loss": 1.0923, "mean_token_accuracy": 0.713433688879013, "num_tokens": 25644728.0, "step": 11080 }, { "entropy": 1.3190320134162903, "epoch": 1.6256688411639668, "grad_norm": 8.6875, "learning_rate": 1.976260000810361e-05, "loss": 1.2833, "mean_token_accuracy": 0.6822304606437684, "num_tokens": 25667968.0, "step": 11090 }, { "entropy": 1.2387717723846436, "epoch": 1.6271347944000585, "grad_norm": 8.5625, "learning_rate": 1.9761490428364338e-05, "loss": 1.2217, "mean_token_accuracy": 0.691305422782898, "num_tokens": 25691269.0, "step": 11100 }, { "entropy": 1.242704412341118, "epoch": 1.6286007476361504, "grad_norm": 12.875, "learning_rate": 1.9760378292939495e-05, "loss": 1.2007, "mean_token_accuracy": 0.6914341360330581, "num_tokens": 25709956.0, "step": 11110 }, { "entropy": 1.363310033082962, "epoch": 1.6300667008722423, "grad_norm": 7.9375, "learning_rate": 1.9759263602120263e-05, "loss": 1.3414, "mean_token_accuracy": 0.6744317829608917, "num_tokens": 25730751.0, "step": 11120 }, { "entropy": 1.3973451614379884, "epoch": 1.631532654108334, "grad_norm": 12.6875, "learning_rate": 1.975814635619848e-05, "loss": 1.3774, "mean_token_accuracy": 0.6663376152515411, "num_tokens": 25751871.0, "step": 11130 }, { "entropy": 1.4483353286981582, "epoch": 1.6329986073444256, "grad_norm": 10.3125, "learning_rate": 1.975702655546666e-05, "loss": 1.386, "mean_token_accuracy": 0.6556488901376725, "num_tokens": 25771493.0, "step": 11140 }, { "entropy": 1.2101818591356277, "epoch": 1.6344645605805175, "grad_norm": 10.9375, "learning_rate": 1.9755904200217975e-05, "loss": 1.1922, "mean_token_accuracy": 0.7021886110305786, "num_tokens": 25794890.0, "step": 11150 }, { "entropy": 1.2163950830698014, "epoch": 1.6359305138166094, "grad_norm": 9.3125, "learning_rate": 1.9754779290746273e-05, "loss": 1.1865, "mean_token_accuracy": 0.6922981053590774, "num_tokens": 25818201.0, "step": 11160 }, { "entropy": 1.280381989479065, "epoch": 1.637396467052701, "grad_norm": 9.1875, "learning_rate": 1.975365182734607e-05, "loss": 1.2045, "mean_token_accuracy": 0.6850726872682571, "num_tokens": 25840643.0, "step": 11170 }, { "entropy": 1.3807485759258271, "epoch": 1.6388624202887927, "grad_norm": 9.3125, "learning_rate": 1.9752521810312556e-05, "loss": 1.3404, "mean_token_accuracy": 0.67738586217165, "num_tokens": 25863223.0, "step": 11180 }, { "entropy": 1.2183335319161415, "epoch": 1.6403283735248846, "grad_norm": 14.5625, "learning_rate": 1.9751389239941576e-05, "loss": 1.1823, "mean_token_accuracy": 0.7039651691913604, "num_tokens": 25888960.0, "step": 11190 }, { "entropy": 1.2246127277612686, "epoch": 1.6417943267609765, "grad_norm": 10.75, "learning_rate": 1.975025411652966e-05, "loss": 1.1977, "mean_token_accuracy": 0.7009033918380737, "num_tokens": 25910579.0, "step": 11200 }, { "entropy": 1.1840103209018706, "epoch": 1.6432602799970681, "grad_norm": 10.4375, "learning_rate": 1.974911644037399e-05, "loss": 1.1635, "mean_token_accuracy": 0.6942103415727615, "num_tokens": 25937295.0, "step": 11210 }, { "entropy": 1.0983409076929092, "epoch": 1.6447262332331598, "grad_norm": 5.46875, "learning_rate": 1.9747976211772432e-05, "loss": 1.0342, "mean_token_accuracy": 0.7208046108484268, "num_tokens": 25962208.0, "step": 11220 }, { "entropy": 1.1792825281620025, "epoch": 1.6461921864692517, "grad_norm": 8.4375, "learning_rate": 1.9746833431023505e-05, "loss": 1.1343, "mean_token_accuracy": 0.7071083009243011, "num_tokens": 25987100.0, "step": 11230 }, { "entropy": 1.1408768266439437, "epoch": 1.6476581397053434, "grad_norm": 11.4375, "learning_rate": 1.974568809842641e-05, "loss": 1.1137, "mean_token_accuracy": 0.7155638009309768, "num_tokens": 26011674.0, "step": 11240 }, { "entropy": 1.2136173099279404, "epoch": 1.649124092941435, "grad_norm": 13.5, "learning_rate": 1.974454021428101e-05, "loss": 1.1595, "mean_token_accuracy": 0.7002051293849945, "num_tokens": 26033012.0, "step": 11250 }, { "entropy": 1.383642253279686, "epoch": 1.650590046177527, "grad_norm": 14.0, "learning_rate": 1.9743389778887833e-05, "loss": 1.3206, "mean_token_accuracy": 0.671285080909729, "num_tokens": 26056151.0, "step": 11260 }, { "entropy": 1.3704394191503524, "epoch": 1.6520559994136188, "grad_norm": 10.4375, "learning_rate": 1.974223679254808e-05, "loss": 1.3064, "mean_token_accuracy": 0.6727125942707062, "num_tokens": 26074092.0, "step": 11270 }, { "entropy": 1.2276299938559532, "epoch": 1.6535219526497105, "grad_norm": 7.59375, "learning_rate": 1.9741081255563614e-05, "loss": 1.2259, "mean_token_accuracy": 0.7055073574185371, "num_tokens": 26098786.0, "step": 11280 }, { "entropy": 1.3068431526422501, "epoch": 1.6549879058858021, "grad_norm": 7.03125, "learning_rate": 1.9739923168236976e-05, "loss": 1.2773, "mean_token_accuracy": 0.684232771396637, "num_tokens": 26121910.0, "step": 11290 }, { "entropy": 1.1982148706912994, "epoch": 1.656453859121894, "grad_norm": 9.0625, "learning_rate": 1.9738762530871365e-05, "loss": 1.1096, "mean_token_accuracy": 0.7079778552055359, "num_tokens": 26143859.0, "step": 11300 }, { "entropy": 1.2291464865207673, "epoch": 1.657919812357986, "grad_norm": 12.9375, "learning_rate": 1.973759934377065e-05, "loss": 1.2045, "mean_token_accuracy": 0.6983579590916633, "num_tokens": 26167932.0, "step": 11310 }, { "entropy": 1.2226560905575752, "epoch": 1.6593857655940776, "grad_norm": 9.1875, "learning_rate": 1.9736433607239367e-05, "loss": 1.1316, "mean_token_accuracy": 0.7070600122213364, "num_tokens": 26191775.0, "step": 11320 }, { "entropy": 1.2347121953964233, "epoch": 1.6608517188301692, "grad_norm": 9.375, "learning_rate": 1.9735265321582728e-05, "loss": 1.1855, "mean_token_accuracy": 0.6999022513628006, "num_tokens": 26213275.0, "step": 11330 }, { "entropy": 1.3346222698688508, "epoch": 1.6623176720662611, "grad_norm": 9.25, "learning_rate": 1.97340944871066e-05, "loss": 1.3065, "mean_token_accuracy": 0.6767527043819428, "num_tokens": 26236079.0, "step": 11340 }, { "entropy": 1.3689057171344756, "epoch": 1.663783625302353, "grad_norm": 8.5625, "learning_rate": 1.9732921104117523e-05, "loss": 1.3033, "mean_token_accuracy": 0.6753749191761017, "num_tokens": 26258434.0, "step": 11350 }, { "entropy": 1.3661011457443237, "epoch": 1.6652495785384445, "grad_norm": 11.8125, "learning_rate": 1.9731745172922704e-05, "loss": 1.3305, "mean_token_accuracy": 0.6743572890758515, "num_tokens": 26278326.0, "step": 11360 }, { "entropy": 1.149736800789833, "epoch": 1.6667155317745364, "grad_norm": 7.59375, "learning_rate": 1.9730566693830018e-05, "loss": 1.1217, "mean_token_accuracy": 0.7128252476453781, "num_tokens": 26302182.0, "step": 11370 }, { "entropy": 1.131670954823494, "epoch": 1.6681814850106282, "grad_norm": 13.625, "learning_rate": 1.9729385667148008e-05, "loss": 1.081, "mean_token_accuracy": 0.7153748601675034, "num_tokens": 26328554.0, "step": 11380 }, { "entropy": 1.0455339282751084, "epoch": 1.66964743824672, "grad_norm": 9.0, "learning_rate": 1.972820209318588e-05, "loss": 1.0007, "mean_token_accuracy": 0.7328036487102508, "num_tokens": 26352287.0, "step": 11390 }, { "entropy": 1.138881766796112, "epoch": 1.6711133914828116, "grad_norm": 8.8125, "learning_rate": 1.9727015972253508e-05, "loss": 1.1154, "mean_token_accuracy": 0.7152654349803924, "num_tokens": 26376826.0, "step": 11400 }, { "entropy": 1.1867468252778053, "epoch": 1.6725793447189035, "grad_norm": 13.4375, "learning_rate": 1.9725827304661435e-05, "loss": 1.1539, "mean_token_accuracy": 0.7009199798107147, "num_tokens": 26398604.0, "step": 11410 }, { "entropy": 1.2608269810676576, "epoch": 1.6740452979549953, "grad_norm": 9.25, "learning_rate": 1.9724636090720874e-05, "loss": 1.218, "mean_token_accuracy": 0.6952350378036499, "num_tokens": 26422835.0, "step": 11420 }, { "entropy": 1.155861073732376, "epoch": 1.675511251191087, "grad_norm": 7.1875, "learning_rate": 1.9723442330743692e-05, "loss": 1.1101, "mean_token_accuracy": 0.7088181167840958, "num_tokens": 26446855.0, "step": 11430 }, { "entropy": 1.338971596956253, "epoch": 1.6769772044271787, "grad_norm": 9.0625, "learning_rate": 1.972224602504244e-05, "loss": 1.3021, "mean_token_accuracy": 0.6756313413381576, "num_tokens": 26467090.0, "step": 11440 }, { "entropy": 1.2162939190864563, "epoch": 1.6784431576632706, "grad_norm": 13.5625, "learning_rate": 1.9721047173930316e-05, "loss": 1.2211, "mean_token_accuracy": 0.7014288395643234, "num_tokens": 26491279.0, "step": 11450 }, { "entropy": 1.146204486489296, "epoch": 1.6799091108993625, "grad_norm": 8.0, "learning_rate": 1.9719845777721204e-05, "loss": 1.1078, "mean_token_accuracy": 0.7122608125209808, "num_tokens": 26518620.0, "step": 11460 }, { "entropy": 1.1421729117631911, "epoch": 1.6813750641354541, "grad_norm": 6.59375, "learning_rate": 1.9718641836729642e-05, "loss": 1.1131, "mean_token_accuracy": 0.7138542592525482, "num_tokens": 26543226.0, "step": 11470 }, { "entropy": 1.262361180782318, "epoch": 1.6828410173715458, "grad_norm": 8.1875, "learning_rate": 1.9717435351270836e-05, "loss": 1.2344, "mean_token_accuracy": 0.6924894899129868, "num_tokens": 26566383.0, "step": 11480 }, { "entropy": 1.2501673579216004, "epoch": 1.6843069706076377, "grad_norm": 7.75, "learning_rate": 1.971622632166066e-05, "loss": 1.1723, "mean_token_accuracy": 0.6970037132501602, "num_tokens": 26590008.0, "step": 11490 }, { "entropy": 1.3956752598285675, "epoch": 1.6857729238437293, "grad_norm": 12.0, "learning_rate": 1.9715014748215656e-05, "loss": 1.3836, "mean_token_accuracy": 0.6792545914649963, "num_tokens": 26612932.0, "step": 11500 }, { "entropy": 1.3515526324510574, "epoch": 1.687238877079821, "grad_norm": 9.0, "learning_rate": 1.971380063125303e-05, "loss": 1.3339, "mean_token_accuracy": 0.6761031538248062, "num_tokens": 26634921.0, "step": 11510 }, { "entropy": 1.2216984152793884, "epoch": 1.688704830315913, "grad_norm": 13.5, "learning_rate": 1.971258397109065e-05, "loss": 1.1757, "mean_token_accuracy": 0.6946967035531998, "num_tokens": 26654390.0, "step": 11520 }, { "entropy": 1.5158738881349563, "epoch": 1.6901707835520048, "grad_norm": 17.0, "learning_rate": 1.9711364768047058e-05, "loss": 1.5274, "mean_token_accuracy": 0.6383714586496353, "num_tokens": 26673989.0, "step": 11530 }, { "entropy": 1.3147939950227738, "epoch": 1.6916367367880965, "grad_norm": 7.25, "learning_rate": 1.9710143022441454e-05, "loss": 1.2411, "mean_token_accuracy": 0.6904275238513946, "num_tokens": 26695126.0, "step": 11540 }, { "entropy": 1.1581712037324905, "epoch": 1.6931026900241881, "grad_norm": 7.84375, "learning_rate": 1.9708918734593707e-05, "loss": 1.1272, "mean_token_accuracy": 0.7117832064628601, "num_tokens": 26716630.0, "step": 11550 }, { "entropy": 1.2054022192955016, "epoch": 1.69456864326028, "grad_norm": 8.3125, "learning_rate": 1.970769190482435e-05, "loss": 1.2006, "mean_token_accuracy": 0.7009083479642868, "num_tokens": 26739049.0, "step": 11560 }, { "entropy": 1.2010283321142197, "epoch": 1.696034596496372, "grad_norm": 11.625, "learning_rate": 1.9706462533454592e-05, "loss": 1.1732, "mean_token_accuracy": 0.7022874295711518, "num_tokens": 26763376.0, "step": 11570 }, { "entropy": 0.994119843840599, "epoch": 1.6975005497324636, "grad_norm": 7.4375, "learning_rate": 1.9705230620806288e-05, "loss": 0.9636, "mean_token_accuracy": 0.7464111089706421, "num_tokens": 26786955.0, "step": 11580 }, { "entropy": 1.4406604796648026, "epoch": 1.6989665029685552, "grad_norm": 11.3125, "learning_rate": 1.970399616720197e-05, "loss": 1.3901, "mean_token_accuracy": 0.6680540338158607, "num_tokens": 26809305.0, "step": 11590 }, { "entropy": 1.1581989973783493, "epoch": 1.7004324562046471, "grad_norm": 7.84375, "learning_rate": 1.970275917296484e-05, "loss": 1.1082, "mean_token_accuracy": 0.7149825304746628, "num_tokens": 26833012.0, "step": 11600 }, { "entropy": 1.2650763720273972, "epoch": 1.701898409440739, "grad_norm": 9.3125, "learning_rate": 1.9701519638418754e-05, "loss": 1.2162, "mean_token_accuracy": 0.689169031381607, "num_tokens": 26855801.0, "step": 11610 }, { "entropy": 1.1960458159446716, "epoch": 1.7033643626768304, "grad_norm": 11.5625, "learning_rate": 1.9700277563888243e-05, "loss": 1.1435, "mean_token_accuracy": 0.7035007447004318, "num_tokens": 26877101.0, "step": 11620 }, { "entropy": 1.2212612748146057, "epoch": 1.7048303159129223, "grad_norm": 7.40625, "learning_rate": 1.9699032949698497e-05, "loss": 1.1688, "mean_token_accuracy": 0.7011097371578217, "num_tokens": 26901494.0, "step": 11630 }, { "entropy": 1.4259128034114839, "epoch": 1.7062962691490142, "grad_norm": 11.9375, "learning_rate": 1.9697785796175367e-05, "loss": 1.4394, "mean_token_accuracy": 0.6593746960163116, "num_tokens": 26923643.0, "step": 11640 }, { "entropy": 1.1432985633611679, "epoch": 1.7077622223851059, "grad_norm": 9.25, "learning_rate": 1.969653610364538e-05, "loss": 1.0725, "mean_token_accuracy": 0.7197859972715378, "num_tokens": 26945204.0, "step": 11650 }, { "entropy": 1.3089995384216309, "epoch": 1.7092281756211976, "grad_norm": 8.3125, "learning_rate": 1.9695283872435714e-05, "loss": 1.2828, "mean_token_accuracy": 0.6736087381839753, "num_tokens": 26964767.0, "step": 11660 }, { "entropy": 1.130704227089882, "epoch": 1.7106941288572894, "grad_norm": 8.4375, "learning_rate": 1.969402910287423e-05, "loss": 1.1012, "mean_token_accuracy": 0.7167387455701828, "num_tokens": 26986859.0, "step": 11670 }, { "entropy": 1.236957821249962, "epoch": 1.7121600820933813, "grad_norm": 7.0625, "learning_rate": 1.969277179528944e-05, "loss": 1.1339, "mean_token_accuracy": 0.7066491484642029, "num_tokens": 27007434.0, "step": 11680 }, { "entropy": 1.3166721910238266, "epoch": 1.713626035329473, "grad_norm": 13.375, "learning_rate": 1.969151195001052e-05, "loss": 1.3225, "mean_token_accuracy": 0.6865025579929351, "num_tokens": 27030633.0, "step": 11690 }, { "entropy": 1.3165415555238724, "epoch": 1.7150919885655647, "grad_norm": 12.875, "learning_rate": 1.9690249567367314e-05, "loss": 1.3367, "mean_token_accuracy": 0.6790361404418945, "num_tokens": 27053404.0, "step": 11700 }, { "entropy": 1.1897840589284896, "epoch": 1.7165579418016566, "grad_norm": 9.5625, "learning_rate": 1.9688984647690333e-05, "loss": 1.1547, "mean_token_accuracy": 0.7087419539690017, "num_tokens": 27073113.0, "step": 11710 }, { "entropy": 1.2105638027191161, "epoch": 1.7180238950377484, "grad_norm": 9.75, "learning_rate": 1.9687717191310747e-05, "loss": 1.1771, "mean_token_accuracy": 0.6992402315139771, "num_tokens": 27095961.0, "step": 11720 }, { "entropy": 1.2471390187740325, "epoch": 1.71948984827384, "grad_norm": 8.5, "learning_rate": 1.9686447198560397e-05, "loss": 1.2003, "mean_token_accuracy": 0.6959726363420486, "num_tokens": 27120698.0, "step": 11730 }, { "entropy": 1.3984311819076538, "epoch": 1.7209558015099318, "grad_norm": 15.125, "learning_rate": 1.968517466977178e-05, "loss": 1.4086, "mean_token_accuracy": 0.6656749501824379, "num_tokens": 27142766.0, "step": 11740 }, { "entropy": 1.3765382409095763, "epoch": 1.7224217547460237, "grad_norm": 8.875, "learning_rate": 1.9683899605278062e-05, "loss": 1.3342, "mean_token_accuracy": 0.6725495159626007, "num_tokens": 27161330.0, "step": 11750 }, { "entropy": 1.3167137593030929, "epoch": 1.7238877079821153, "grad_norm": 11.5, "learning_rate": 1.968262200541307e-05, "loss": 1.3514, "mean_token_accuracy": 0.6797469824552536, "num_tokens": 27183872.0, "step": 11760 }, { "entropy": 1.0612283945083618, "epoch": 1.725353661218207, "grad_norm": 11.3125, "learning_rate": 1.9681341870511294e-05, "loss": 0.9691, "mean_token_accuracy": 0.7313054651021957, "num_tokens": 27207839.0, "step": 11770 }, { "entropy": 1.0634489461779595, "epoch": 1.7268196144542989, "grad_norm": 6.96875, "learning_rate": 1.9680059200907897e-05, "loss": 1.0067, "mean_token_accuracy": 0.7336118727922439, "num_tokens": 27232133.0, "step": 11780 }, { "entropy": 1.1757908910512924, "epoch": 1.7282855676903908, "grad_norm": 9.3125, "learning_rate": 1.9678773996938695e-05, "loss": 1.0958, "mean_token_accuracy": 0.7072917997837067, "num_tokens": 27255020.0, "step": 11790 }, { "entropy": 1.094107522070408, "epoch": 1.7297515209264824, "grad_norm": 6.625, "learning_rate": 1.967748625894017e-05, "loss": 1.0289, "mean_token_accuracy": 0.7247576445341111, "num_tokens": 27278349.0, "step": 11800 }, { "entropy": 1.1425156906247138, "epoch": 1.731217474162574, "grad_norm": 13.1875, "learning_rate": 1.9676195987249466e-05, "loss": 1.129, "mean_token_accuracy": 0.7173820197582245, "num_tokens": 27304695.0, "step": 11810 }, { "entropy": 1.138152852654457, "epoch": 1.732683427398666, "grad_norm": 9.5625, "learning_rate": 1.9674903182204402e-05, "loss": 1.0668, "mean_token_accuracy": 0.7191119223833085, "num_tokens": 27330756.0, "step": 11820 }, { "entropy": 1.2983226507902146, "epoch": 1.7341493806347579, "grad_norm": 8.75, "learning_rate": 1.9673607844143443e-05, "loss": 1.2026, "mean_token_accuracy": 0.6936733990907669, "num_tokens": 27349531.0, "step": 11830 }, { "entropy": 1.3900134295225144, "epoch": 1.7356153338708495, "grad_norm": 13.1875, "learning_rate": 1.967230997340573e-05, "loss": 1.3148, "mean_token_accuracy": 0.6725276321172714, "num_tokens": 27370497.0, "step": 11840 }, { "entropy": 1.172396221756935, "epoch": 1.7370812871069412, "grad_norm": 9.3125, "learning_rate": 1.9671009570331058e-05, "loss": 1.1137, "mean_token_accuracy": 0.7085967540740967, "num_tokens": 27392630.0, "step": 11850 }, { "entropy": 1.1914201915264129, "epoch": 1.738547240343033, "grad_norm": 6.84375, "learning_rate": 1.9669706635259893e-05, "loss": 1.0647, "mean_token_accuracy": 0.7105582058429718, "num_tokens": 27417301.0, "step": 11860 }, { "entropy": 1.390019965171814, "epoch": 1.740013193579125, "grad_norm": 9.125, "learning_rate": 1.966840116853336e-05, "loss": 1.3936, "mean_token_accuracy": 0.672562076151371, "num_tokens": 27440592.0, "step": 11870 }, { "entropy": 1.3017921417951583, "epoch": 1.7414791468152166, "grad_norm": 8.4375, "learning_rate": 1.966709317049325e-05, "loss": 1.2821, "mean_token_accuracy": 0.6795846164226532, "num_tokens": 27460922.0, "step": 11880 }, { "entropy": 1.181003099679947, "epoch": 1.7429451000513083, "grad_norm": 11.0, "learning_rate": 1.9665782641482006e-05, "loss": 1.1694, "mean_token_accuracy": 0.7066617429256439, "num_tokens": 27485948.0, "step": 11890 }, { "entropy": 1.008000263571739, "epoch": 1.7444110532874002, "grad_norm": 11.4375, "learning_rate": 1.966446958184275e-05, "loss": 0.9708, "mean_token_accuracy": 0.7450054675340653, "num_tokens": 27510547.0, "step": 11900 }, { "entropy": 1.2946033298969268, "epoch": 1.7458770065234919, "grad_norm": 15.0, "learning_rate": 1.9663153991919254e-05, "loss": 1.2965, "mean_token_accuracy": 0.680447793006897, "num_tokens": 27531463.0, "step": 11910 }, { "entropy": 1.1897721022367478, "epoch": 1.7473429597595835, "grad_norm": 12.0625, "learning_rate": 1.9661835872055955e-05, "loss": 1.1082, "mean_token_accuracy": 0.6995866924524308, "num_tokens": 27553528.0, "step": 11920 }, { "entropy": 1.1670029789209366, "epoch": 1.7488089129956754, "grad_norm": 7.5625, "learning_rate": 1.9660515222597955e-05, "loss": 1.1778, "mean_token_accuracy": 0.7080998674035073, "num_tokens": 27576952.0, "step": 11930 }, { "entropy": 1.2723976030945778, "epoch": 1.7502748662317673, "grad_norm": 7.21875, "learning_rate": 1.9659192043891023e-05, "loss": 1.1697, "mean_token_accuracy": 0.6996543407440186, "num_tokens": 27599948.0, "step": 11940 }, { "entropy": 1.09395988881588, "epoch": 1.751740819467859, "grad_norm": 7.28125, "learning_rate": 1.9657866336281574e-05, "loss": 1.0045, "mean_token_accuracy": 0.7265170484781265, "num_tokens": 27625012.0, "step": 11950 }, { "entropy": 1.2387204498052597, "epoch": 1.7532067727039506, "grad_norm": 9.5, "learning_rate": 1.9656538100116704e-05, "loss": 1.2178, "mean_token_accuracy": 0.6910095453262329, "num_tokens": 27648659.0, "step": 11960 }, { "entropy": 1.351230075955391, "epoch": 1.7546727259400425, "grad_norm": 10.5625, "learning_rate": 1.9655207335744162e-05, "loss": 1.2594, "mean_token_accuracy": 0.6810640633106232, "num_tokens": 27673424.0, "step": 11970 }, { "entropy": 1.209887683391571, "epoch": 1.7561386791761344, "grad_norm": 5.8125, "learning_rate": 1.9653874043512354e-05, "loss": 1.1252, "mean_token_accuracy": 0.699344477057457, "num_tokens": 27692748.0, "step": 11980 }, { "entropy": 1.2799103051424026, "epoch": 1.757604632412226, "grad_norm": 7.84375, "learning_rate": 1.9652538223770363e-05, "loss": 1.2802, "mean_token_accuracy": 0.6893205046653748, "num_tokens": 27715186.0, "step": 11990 }, { "entropy": 1.35498408973217, "epoch": 1.7590705856483178, "grad_norm": 8.8125, "learning_rate": 1.9651199876867914e-05, "loss": 1.2981, "mean_token_accuracy": 0.6731500744819641, "num_tokens": 27738447.0, "step": 12000 }, { "entropy": 1.41527661383152, "epoch": 1.7605365388844096, "grad_norm": 8.5625, "learning_rate": 1.9649859003155405e-05, "loss": 1.4396, "mean_token_accuracy": 0.6580379545688629, "num_tokens": 27758335.0, "step": 12010 }, { "entropy": 1.1585509032011032, "epoch": 1.7620024921205013, "grad_norm": 6.25, "learning_rate": 1.9648515602983898e-05, "loss": 1.0725, "mean_token_accuracy": 0.7188919574022293, "num_tokens": 27782996.0, "step": 12020 }, { "entropy": 1.2944514572620391, "epoch": 1.763468445356593, "grad_norm": 7.40625, "learning_rate": 1.964716967670511e-05, "loss": 1.1849, "mean_token_accuracy": 0.6884178340435028, "num_tokens": 27804643.0, "step": 12030 }, { "entropy": 1.2716759771108628, "epoch": 1.7649343985926849, "grad_norm": 8.375, "learning_rate": 1.9645821224671425e-05, "loss": 1.2264, "mean_token_accuracy": 0.6926877930760383, "num_tokens": 27827187.0, "step": 12040 }, { "entropy": 1.0309120386838913, "epoch": 1.7664003518287767, "grad_norm": 8.1875, "learning_rate": 1.9644470247235886e-05, "loss": 0.981, "mean_token_accuracy": 0.7386457979679107, "num_tokens": 27854834.0, "step": 12050 }, { "entropy": 1.3380413323640823, "epoch": 1.7678663050648684, "grad_norm": 12.4375, "learning_rate": 1.964311674475219e-05, "loss": 1.3253, "mean_token_accuracy": 0.6676199018955231, "num_tokens": 27877560.0, "step": 12060 }, { "entropy": 1.217397153377533, "epoch": 1.76933225830096, "grad_norm": 7.0625, "learning_rate": 1.9641760717574708e-05, "loss": 1.1589, "mean_token_accuracy": 0.70280881524086, "num_tokens": 27900712.0, "step": 12070 }, { "entropy": 1.051113212108612, "epoch": 1.770798211537052, "grad_norm": 8.0625, "learning_rate": 1.9640402166058466e-05, "loss": 0.998, "mean_token_accuracy": 0.7351816833019257, "num_tokens": 27928810.0, "step": 12080 }, { "entropy": 1.0556159138679504, "epoch": 1.7722641647731439, "grad_norm": 7.65625, "learning_rate": 1.9639041090559147e-05, "loss": 1.0456, "mean_token_accuracy": 0.7251468658447265, "num_tokens": 27954661.0, "step": 12090 }, { "entropy": 1.1460334569215775, "epoch": 1.7737301180092355, "grad_norm": 6.03125, "learning_rate": 1.9637677491433097e-05, "loss": 1.116, "mean_token_accuracy": 0.7187171131372452, "num_tokens": 27979330.0, "step": 12100 }, { "entropy": 1.1580551534891128, "epoch": 1.7751960712453272, "grad_norm": 8.125, "learning_rate": 1.9636311369037324e-05, "loss": 1.0426, "mean_token_accuracy": 0.7100010395050049, "num_tokens": 28004780.0, "step": 12110 }, { "entropy": 0.9333190754055977, "epoch": 1.776662024481419, "grad_norm": 8.1875, "learning_rate": 1.9634942723729503e-05, "loss": 0.8834, "mean_token_accuracy": 0.7583595454692841, "num_tokens": 28030160.0, "step": 12120 }, { "entropy": 1.2929522901773454, "epoch": 1.778127977717511, "grad_norm": 14.625, "learning_rate": 1.963357155586796e-05, "loss": 1.2701, "mean_token_accuracy": 0.6845263123512269, "num_tokens": 28049986.0, "step": 12130 }, { "entropy": 1.1613916724920272, "epoch": 1.7795939309536026, "grad_norm": 12.3125, "learning_rate": 1.963219786581168e-05, "loss": 1.1405, "mean_token_accuracy": 0.7174241870641709, "num_tokens": 28073468.0, "step": 12140 }, { "entropy": 1.1493421256542207, "epoch": 1.7810598841896943, "grad_norm": 9.3125, "learning_rate": 1.963082165392032e-05, "loss": 1.1386, "mean_token_accuracy": 0.71490398645401, "num_tokens": 28094925.0, "step": 12150 }, { "entropy": 1.1986760258674622, "epoch": 1.7825258374257862, "grad_norm": 11.875, "learning_rate": 1.9629442920554188e-05, "loss": 1.1735, "mean_token_accuracy": 0.7062569230794906, "num_tokens": 28115685.0, "step": 12160 }, { "entropy": 1.2801807597279549, "epoch": 1.7839917906618779, "grad_norm": 8.9375, "learning_rate": 1.962806166607425e-05, "loss": 1.1731, "mean_token_accuracy": 0.6937384501099586, "num_tokens": 28136154.0, "step": 12170 }, { "entropy": 1.2509783983230591, "epoch": 1.7854577438979695, "grad_norm": 8.625, "learning_rate": 1.9626677890842142e-05, "loss": 1.1859, "mean_token_accuracy": 0.688074991106987, "num_tokens": 28158750.0, "step": 12180 }, { "entropy": 1.2398091495037078, "epoch": 1.7869236971340614, "grad_norm": 9.125, "learning_rate": 1.9625291595220154e-05, "loss": 1.2284, "mean_token_accuracy": 0.6960211902856827, "num_tokens": 28182789.0, "step": 12190 }, { "entropy": 1.3493959158658981, "epoch": 1.7883896503701533, "grad_norm": 8.375, "learning_rate": 1.962390277957123e-05, "loss": 1.2775, "mean_token_accuracy": 0.6775564283132554, "num_tokens": 28204982.0, "step": 12200 }, { "entropy": 1.274512270092964, "epoch": 1.789855603606245, "grad_norm": 9.5625, "learning_rate": 1.9622511444258986e-05, "loss": 1.2491, "mean_token_accuracy": 0.693219056725502, "num_tokens": 28225601.0, "step": 12210 }, { "entropy": 1.144590750336647, "epoch": 1.7913215568423366, "grad_norm": 9.25, "learning_rate": 1.9621117589647696e-05, "loss": 1.1017, "mean_token_accuracy": 0.7127091825008393, "num_tokens": 28249576.0, "step": 12220 }, { "entropy": 1.2103952795267106, "epoch": 1.7927875100784285, "grad_norm": 8.3125, "learning_rate": 1.9619721216102278e-05, "loss": 1.1963, "mean_token_accuracy": 0.6984682619571686, "num_tokens": 28271533.0, "step": 12230 }, { "entropy": 1.2219445884227753, "epoch": 1.7942534633145204, "grad_norm": 7.0, "learning_rate": 1.961832232398833e-05, "loss": 1.1657, "mean_token_accuracy": 0.6998466283082962, "num_tokens": 28289333.0, "step": 12240 }, { "entropy": 1.1398582100868224, "epoch": 1.795719416550612, "grad_norm": 12.4375, "learning_rate": 1.9616920913672092e-05, "loss": 1.0788, "mean_token_accuracy": 0.7178419381380081, "num_tokens": 28315221.0, "step": 12250 }, { "entropy": 1.2740823805332184, "epoch": 1.7971853697867037, "grad_norm": 13.875, "learning_rate": 1.961551698552048e-05, "loss": 1.2527, "mean_token_accuracy": 0.6848131716251373, "num_tokens": 28339201.0, "step": 12260 }, { "entropy": 1.2208223164081573, "epoch": 1.7986513230227956, "grad_norm": 9.75, "learning_rate": 1.9614110539901054e-05, "loss": 1.2053, "mean_token_accuracy": 0.7043729901313782, "num_tokens": 28362544.0, "step": 12270 }, { "entropy": 1.426226419210434, "epoch": 1.8001172762588873, "grad_norm": 9.375, "learning_rate": 1.961270157718205e-05, "loss": 1.4239, "mean_token_accuracy": 0.6609455138444901, "num_tokens": 28381501.0, "step": 12280 }, { "entropy": 1.0670575708150865, "epoch": 1.801583229494979, "grad_norm": 7.03125, "learning_rate": 1.9611290097732338e-05, "loss": 1.0995, "mean_token_accuracy": 0.7367936372756958, "num_tokens": 28407479.0, "step": 12290 }, { "entropy": 1.0642446488142014, "epoch": 1.8030491827310708, "grad_norm": 15.3125, "learning_rate": 1.960987610192147e-05, "loss": 1.0105, "mean_token_accuracy": 0.727493554353714, "num_tokens": 28428133.0, "step": 12300 }, { "entropy": 1.3653554171323776, "epoch": 1.8045151359671627, "grad_norm": 8.5625, "learning_rate": 1.960845959011965e-05, "loss": 1.3276, "mean_token_accuracy": 0.6745679259300232, "num_tokens": 28447819.0, "step": 12310 }, { "entropy": 1.1881424218416214, "epoch": 1.8059810892032544, "grad_norm": 7.53125, "learning_rate": 1.9607040562697736e-05, "loss": 1.127, "mean_token_accuracy": 0.6998736023902893, "num_tokens": 28470424.0, "step": 12320 }, { "entropy": 1.1669221967458725, "epoch": 1.807447042439346, "grad_norm": 7.0625, "learning_rate": 1.960561902002725e-05, "loss": 1.0924, "mean_token_accuracy": 0.7075816065073013, "num_tokens": 28492250.0, "step": 12330 }, { "entropy": 1.0676291853189468, "epoch": 1.808912995675438, "grad_norm": 9.25, "learning_rate": 1.9604194962480368e-05, "loss": 1.0161, "mean_token_accuracy": 0.7366529554128647, "num_tokens": 28516574.0, "step": 12340 }, { "entropy": 1.2834492176771164, "epoch": 1.8103789489115298, "grad_norm": 9.375, "learning_rate": 1.960276839042993e-05, "loss": 1.2272, "mean_token_accuracy": 0.6946577459573746, "num_tokens": 28541298.0, "step": 12350 }, { "entropy": 1.154740685224533, "epoch": 1.8118449021476215, "grad_norm": 8.0625, "learning_rate": 1.960133930424943e-05, "loss": 1.1702, "mean_token_accuracy": 0.7113241136074067, "num_tokens": 28565464.0, "step": 12360 }, { "entropy": 1.1628314465284348, "epoch": 1.8133108553837132, "grad_norm": 6.96875, "learning_rate": 1.959990770431302e-05, "loss": 1.1282, "mean_token_accuracy": 0.7155717432498931, "num_tokens": 28591247.0, "step": 12370 }, { "entropy": 1.2015915632247924, "epoch": 1.814776808619805, "grad_norm": 6.8125, "learning_rate": 1.9598473590995513e-05, "loss": 1.1233, "mean_token_accuracy": 0.7067711263895035, "num_tokens": 28615470.0, "step": 12380 }, { "entropy": 1.2491961389780044, "epoch": 1.816242761855897, "grad_norm": 9.8125, "learning_rate": 1.9597036964672377e-05, "loss": 1.1558, "mean_token_accuracy": 0.7098028898239136, "num_tokens": 28639309.0, "step": 12390 }, { "entropy": 1.3956295788288116, "epoch": 1.8177087150919886, "grad_norm": 13.5625, "learning_rate": 1.9595597825719743e-05, "loss": 1.3708, "mean_token_accuracy": 0.6678645849227905, "num_tokens": 28662910.0, "step": 12400 }, { "entropy": 1.172575506567955, "epoch": 1.8191746683280803, "grad_norm": 8.5625, "learning_rate": 1.959415617451439e-05, "loss": 1.0716, "mean_token_accuracy": 0.7162936389446258, "num_tokens": 28684298.0, "step": 12410 }, { "entropy": 1.3000643253326416, "epoch": 1.8206406215641722, "grad_norm": 7.3125, "learning_rate": 1.9592712011433768e-05, "loss": 1.2933, "mean_token_accuracy": 0.6893129363656044, "num_tokens": 28709793.0, "step": 12420 }, { "entropy": 1.1869338572025299, "epoch": 1.8221065748002638, "grad_norm": 10.4375, "learning_rate": 1.9591265336855975e-05, "loss": 1.191, "mean_token_accuracy": 0.704012605547905, "num_tokens": 28732430.0, "step": 12430 }, { "entropy": 1.155486948788166, "epoch": 1.8235725280363555, "grad_norm": 8.0625, "learning_rate": 1.9589816151159766e-05, "loss": 1.1033, "mean_token_accuracy": 0.7100923001766205, "num_tokens": 28757136.0, "step": 12440 }, { "entropy": 1.057971605658531, "epoch": 1.8250384812724474, "grad_norm": 8.25, "learning_rate": 1.958836445472456e-05, "loss": 1.0185, "mean_token_accuracy": 0.7292855322360993, "num_tokens": 28782213.0, "step": 12450 }, { "entropy": 1.1320896357297898, "epoch": 1.8265044345085393, "grad_norm": 8.875, "learning_rate": 1.9586910247930434e-05, "loss": 1.1134, "mean_token_accuracy": 0.7166278660297394, "num_tokens": 28809331.0, "step": 12460 }, { "entropy": 1.1034770160913467, "epoch": 1.827970387744631, "grad_norm": 9.3125, "learning_rate": 1.958545353115811e-05, "loss": 1.0325, "mean_token_accuracy": 0.7250680744647979, "num_tokens": 28834567.0, "step": 12470 }, { "entropy": 1.2149113088846206, "epoch": 1.8294363409807226, "grad_norm": 12.4375, "learning_rate": 1.958399430478898e-05, "loss": 1.1742, "mean_token_accuracy": 0.6973531633615494, "num_tokens": 28857871.0, "step": 12480 }, { "entropy": 1.201597797870636, "epoch": 1.8309022942168145, "grad_norm": 8.6875, "learning_rate": 1.9582532569205092e-05, "loss": 1.1498, "mean_token_accuracy": 0.7052374362945557, "num_tokens": 28882331.0, "step": 12490 }, { "entropy": 1.40651236474514, "epoch": 1.8323682474529064, "grad_norm": 12.0625, "learning_rate": 1.958106832478914e-05, "loss": 1.3439, "mean_token_accuracy": 0.6729961633682251, "num_tokens": 28904527.0, "step": 12500 }, { "entropy": 1.227410125732422, "epoch": 1.833834200688998, "grad_norm": 9.125, "learning_rate": 1.9579601571924487e-05, "loss": 1.2017, "mean_token_accuracy": 0.7007646650075913, "num_tokens": 28928548.0, "step": 12510 }, { "entropy": 1.22055940926075, "epoch": 1.8353001539250897, "grad_norm": 14.25, "learning_rate": 1.9578132310995145e-05, "loss": 1.1605, "mean_token_accuracy": 0.7041781127452851, "num_tokens": 28951706.0, "step": 12520 }, { "entropy": 1.1493687123060226, "epoch": 1.8367661071611816, "grad_norm": 7.3125, "learning_rate": 1.957666054238579e-05, "loss": 1.0673, "mean_token_accuracy": 0.7123047560453415, "num_tokens": 28973631.0, "step": 12530 }, { "entropy": 1.2146876126527786, "epoch": 1.8382320603972735, "grad_norm": 13.1875, "learning_rate": 1.9575186266481746e-05, "loss": 1.2355, "mean_token_accuracy": 0.7016615256667137, "num_tokens": 28995944.0, "step": 12540 }, { "entropy": 1.199808230996132, "epoch": 1.839698013633365, "grad_norm": 9.625, "learning_rate": 1.9573709483669003e-05, "loss": 1.1179, "mean_token_accuracy": 0.701017078757286, "num_tokens": 29018397.0, "step": 12550 }, { "entropy": 1.3738439202308654, "epoch": 1.8411639668694568, "grad_norm": 9.25, "learning_rate": 1.9572230194334194e-05, "loss": 1.337, "mean_token_accuracy": 0.6651567369699478, "num_tokens": 29036550.0, "step": 12560 }, { "entropy": 1.2822959542274475, "epoch": 1.8426299201055487, "grad_norm": 8.1875, "learning_rate": 1.9570748398864623e-05, "loss": 1.2048, "mean_token_accuracy": 0.6874247848987579, "num_tokens": 29060799.0, "step": 12570 }, { "entropy": 1.1788443207740784, "epoch": 1.8440958733416404, "grad_norm": 11.9375, "learning_rate": 1.956926409764824e-05, "loss": 1.1813, "mean_token_accuracy": 0.7049784064292908, "num_tokens": 29083387.0, "step": 12580 }, { "entropy": 1.2204043745994568, "epoch": 1.845561826577732, "grad_norm": 11.75, "learning_rate": 1.9567777291073656e-05, "loss": 1.1477, "mean_token_accuracy": 0.6973560333251954, "num_tokens": 29104463.0, "step": 12590 }, { "entropy": 1.1437383592128754, "epoch": 1.847027779813824, "grad_norm": 8.6875, "learning_rate": 1.9566287979530137e-05, "loss": 1.1136, "mean_token_accuracy": 0.7109841227531433, "num_tokens": 29129171.0, "step": 12600 }, { "entropy": 1.2369670540094375, "epoch": 1.8484937330499158, "grad_norm": 6.71875, "learning_rate": 1.9564796163407602e-05, "loss": 1.1944, "mean_token_accuracy": 0.6875959664583207, "num_tokens": 29154993.0, "step": 12610 }, { "entropy": 1.2009786278009416, "epoch": 1.8499596862860075, "grad_norm": 6.625, "learning_rate": 1.9563301843096628e-05, "loss": 1.157, "mean_token_accuracy": 0.7033981949090957, "num_tokens": 29178310.0, "step": 12620 }, { "entropy": 1.460371932387352, "epoch": 1.8514256395220992, "grad_norm": 9.8125, "learning_rate": 1.9561805018988453e-05, "loss": 1.4101, "mean_token_accuracy": 0.6534877210855484, "num_tokens": 29198666.0, "step": 12630 }, { "entropy": 1.1923606723546982, "epoch": 1.852891592758191, "grad_norm": 6.4375, "learning_rate": 1.9560305691474954e-05, "loss": 1.1412, "mean_token_accuracy": 0.707498362660408, "num_tokens": 29222779.0, "step": 12640 }, { "entropy": 0.9914990991353989, "epoch": 1.854357545994283, "grad_norm": 8.5, "learning_rate": 1.9558803860948684e-05, "loss": 0.9706, "mean_token_accuracy": 0.7489920258522034, "num_tokens": 29248174.0, "step": 12650 }, { "entropy": 1.2056515723466874, "epoch": 1.8558234992303746, "grad_norm": 8.75, "learning_rate": 1.955729952780284e-05, "loss": 1.1703, "mean_token_accuracy": 0.7095350340008736, "num_tokens": 29272768.0, "step": 12660 }, { "entropy": 1.1688464432954788, "epoch": 1.8572894524664663, "grad_norm": 7.34375, "learning_rate": 1.9555792692431275e-05, "loss": 1.1166, "mean_token_accuracy": 0.7128655031323433, "num_tokens": 29292339.0, "step": 12670 }, { "entropy": 1.2069063454866409, "epoch": 1.8587554057025581, "grad_norm": 8.6875, "learning_rate": 1.9554283355228495e-05, "loss": 1.1604, "mean_token_accuracy": 0.7031363338232041, "num_tokens": 29315460.0, "step": 12680 }, { "entropy": 1.2800155460834504, "epoch": 1.8602213589386498, "grad_norm": 13.0625, "learning_rate": 1.955277151658967e-05, "loss": 1.2635, "mean_token_accuracy": 0.6911210745573044, "num_tokens": 29340287.0, "step": 12690 }, { "entropy": 1.1473792135715484, "epoch": 1.8616873121747415, "grad_norm": 6.96875, "learning_rate": 1.955125717691062e-05, "loss": 1.0719, "mean_token_accuracy": 0.7213230431079865, "num_tokens": 29364129.0, "step": 12700 }, { "entropy": 1.3297647714614869, "epoch": 1.8631532654108334, "grad_norm": 6.875, "learning_rate": 1.954974033658781e-05, "loss": 1.2739, "mean_token_accuracy": 0.6808464765548706, "num_tokens": 29386340.0, "step": 12710 }, { "entropy": 1.072801834344864, "epoch": 1.8646192186469253, "grad_norm": 7.78125, "learning_rate": 1.9548220996018377e-05, "loss": 1.0072, "mean_token_accuracy": 0.7348129123449325, "num_tokens": 29410708.0, "step": 12720 }, { "entropy": 1.3780465036630631, "epoch": 1.866085171883017, "grad_norm": 10.1875, "learning_rate": 1.9546699155600096e-05, "loss": 1.3902, "mean_token_accuracy": 0.669468104839325, "num_tokens": 29432750.0, "step": 12730 }, { "entropy": 1.4107334733009338, "epoch": 1.8675511251191086, "grad_norm": 9.0, "learning_rate": 1.9545174815731417e-05, "loss": 1.3458, "mean_token_accuracy": 0.6669967919588089, "num_tokens": 29454462.0, "step": 12740 }, { "entropy": 1.3761096984148025, "epoch": 1.8690170783552005, "grad_norm": 9.125, "learning_rate": 1.954364797681142e-05, "loss": 1.2904, "mean_token_accuracy": 0.6665095299482345, "num_tokens": 29474190.0, "step": 12750 }, { "entropy": 1.2737563371658325, "epoch": 1.8704830315912924, "grad_norm": 14.5625, "learning_rate": 1.9542118639239858e-05, "loss": 1.2174, "mean_token_accuracy": 0.6899191707372665, "num_tokens": 29496500.0, "step": 12760 }, { "entropy": 1.196490865945816, "epoch": 1.871948984827384, "grad_norm": 9.125, "learning_rate": 1.954058680341713e-05, "loss": 1.128, "mean_token_accuracy": 0.7076842874288559, "num_tokens": 29520221.0, "step": 12770 }, { "entropy": 1.1745941996574403, "epoch": 1.8734149380634757, "grad_norm": 8.25, "learning_rate": 1.953905246974429e-05, "loss": 1.1035, "mean_token_accuracy": 0.7137254551053047, "num_tokens": 29543243.0, "step": 12780 }, { "entropy": 1.1949845880270005, "epoch": 1.8748808912995676, "grad_norm": 11.125, "learning_rate": 1.9537515638623047e-05, "loss": 1.164, "mean_token_accuracy": 0.7132484674453735, "num_tokens": 29567144.0, "step": 12790 }, { "entropy": 1.061948189139366, "epoch": 1.8763468445356595, "grad_norm": 9.4375, "learning_rate": 1.953597631045576e-05, "loss": 1.0266, "mean_token_accuracy": 0.730604887008667, "num_tokens": 29591314.0, "step": 12800 }, { "entropy": 1.3005233824253082, "epoch": 1.877812797771751, "grad_norm": 9.25, "learning_rate": 1.9534434485645453e-05, "loss": 1.2551, "mean_token_accuracy": 0.6876221895217896, "num_tokens": 29611424.0, "step": 12810 }, { "entropy": 1.2504784643650055, "epoch": 1.8792787510078428, "grad_norm": 9.1875, "learning_rate": 1.953289016459579e-05, "loss": 1.1739, "mean_token_accuracy": 0.6968931257724762, "num_tokens": 29629083.0, "step": 12820 }, { "entropy": 1.0584483295679092, "epoch": 1.8807447042439347, "grad_norm": 9.9375, "learning_rate": 1.9531343347711095e-05, "loss": 1.0317, "mean_token_accuracy": 0.7361146420240402, "num_tokens": 29652216.0, "step": 12830 }, { "entropy": 1.1612524539232254, "epoch": 1.8822106574800264, "grad_norm": 6.4375, "learning_rate": 1.952979403539635e-05, "loss": 1.0727, "mean_token_accuracy": 0.7101065546274186, "num_tokens": 29676539.0, "step": 12840 }, { "entropy": 1.0486004188656808, "epoch": 1.883676610716118, "grad_norm": 15.5625, "learning_rate": 1.9528242228057176e-05, "loss": 1.0083, "mean_token_accuracy": 0.7240624606609345, "num_tokens": 29699794.0, "step": 12850 }, { "entropy": 1.3310259133577347, "epoch": 1.88514256395221, "grad_norm": 7.78125, "learning_rate": 1.952668792609986e-05, "loss": 1.2552, "mean_token_accuracy": 0.6830366760492325, "num_tokens": 29724160.0, "step": 12860 }, { "entropy": 1.3773912519216538, "epoch": 1.8866085171883018, "grad_norm": 7.8125, "learning_rate": 1.9525131129931343e-05, "loss": 1.3414, "mean_token_accuracy": 0.6713298469781875, "num_tokens": 29748408.0, "step": 12870 }, { "entropy": 1.403343665599823, "epoch": 1.8880744704243935, "grad_norm": 10.625, "learning_rate": 1.952357183995921e-05, "loss": 1.3486, "mean_token_accuracy": 0.6652710735797882, "num_tokens": 29768988.0, "step": 12880 }, { "entropy": 1.2333573549985886, "epoch": 1.8895404236604851, "grad_norm": 9.3125, "learning_rate": 1.9522010056591706e-05, "loss": 1.1524, "mean_token_accuracy": 0.7038238644599915, "num_tokens": 29790562.0, "step": 12890 }, { "entropy": 1.2638453975319863, "epoch": 1.891006376896577, "grad_norm": 11.0, "learning_rate": 1.9520445780237725e-05, "loss": 1.2431, "mean_token_accuracy": 0.6965440660715103, "num_tokens": 29813534.0, "step": 12900 }, { "entropy": 1.135784786939621, "epoch": 1.892472330132669, "grad_norm": 7.1875, "learning_rate": 1.9518879011306815e-05, "loss": 1.0945, "mean_token_accuracy": 0.7116535663604736, "num_tokens": 29833999.0, "step": 12910 }, { "entropy": 1.279599928855896, "epoch": 1.8939382833687606, "grad_norm": 9.3125, "learning_rate": 1.9517309750209175e-05, "loss": 1.2809, "mean_token_accuracy": 0.6864002257585525, "num_tokens": 29856925.0, "step": 12920 }, { "entropy": 1.0323730409145355, "epoch": 1.8954042366048522, "grad_norm": 7.6875, "learning_rate": 1.9515737997355662e-05, "loss": 0.9326, "mean_token_accuracy": 0.7397205322980881, "num_tokens": 29881693.0, "step": 12930 }, { "entropy": 1.1698435992002487, "epoch": 1.8968701898409441, "grad_norm": 8.6875, "learning_rate": 1.9514163753157776e-05, "loss": 1.1384, "mean_token_accuracy": 0.7107473105192185, "num_tokens": 29908019.0, "step": 12940 }, { "entropy": 1.3936603218317032, "epoch": 1.8983361430770358, "grad_norm": 8.6875, "learning_rate": 1.9512587018027678e-05, "loss": 1.3682, "mean_token_accuracy": 0.6702735483646393, "num_tokens": 29931312.0, "step": 12950 }, { "entropy": 1.2612123280763625, "epoch": 1.8998020963131275, "grad_norm": 6.4375, "learning_rate": 1.951100779237818e-05, "loss": 1.2018, "mean_token_accuracy": 0.6867934674024582, "num_tokens": 29953866.0, "step": 12960 }, { "entropy": 1.3232232064008713, "epoch": 1.9012680495492194, "grad_norm": 8.8125, "learning_rate": 1.950942607662274e-05, "loss": 1.2356, "mean_token_accuracy": 0.6868762016296387, "num_tokens": 29979607.0, "step": 12970 }, { "entropy": 1.3999718517065047, "epoch": 1.9027340027853112, "grad_norm": 16.125, "learning_rate": 1.9507841871175475e-05, "loss": 1.3312, "mean_token_accuracy": 0.6651184499263764, "num_tokens": 29997374.0, "step": 12980 }, { "entropy": 1.2793159455060958, "epoch": 1.904199956021403, "grad_norm": 16.25, "learning_rate": 1.9506255176451146e-05, "loss": 1.2587, "mean_token_accuracy": 0.6879714220762253, "num_tokens": 30019513.0, "step": 12990 }, { "entropy": 1.4012116551399232, "epoch": 1.9056659092574946, "grad_norm": 10.875, "learning_rate": 1.9504665992865174e-05, "loss": 1.376, "mean_token_accuracy": 0.6693484306335449, "num_tokens": 30043896.0, "step": 13000 }, { "entropy": 1.3456447690725326, "epoch": 1.9071318624935865, "grad_norm": 11.5, "learning_rate": 1.9503074320833626e-05, "loss": 1.3626, "mean_token_accuracy": 0.683524277806282, "num_tokens": 30067670.0, "step": 13010 }, { "entropy": 1.1127973675727845, "epoch": 1.9085978157296783, "grad_norm": 7.09375, "learning_rate": 1.9501480160773228e-05, "loss": 1.1141, "mean_token_accuracy": 0.7220563173294068, "num_tokens": 30094254.0, "step": 13020 }, { "entropy": 1.0749325156211853, "epoch": 1.91006376896577, "grad_norm": 14.1875, "learning_rate": 1.9499883513101344e-05, "loss": 1.0707, "mean_token_accuracy": 0.728872999548912, "num_tokens": 30119348.0, "step": 13030 }, { "entropy": 1.0350449055433273, "epoch": 1.9115297222018617, "grad_norm": 8.125, "learning_rate": 1.9498284378236002e-05, "loss": 1.034, "mean_token_accuracy": 0.7394527643918991, "num_tokens": 30145339.0, "step": 13040 }, { "entropy": 1.220262172818184, "epoch": 1.9129956754379536, "grad_norm": 8.625, "learning_rate": 1.9496682756595876e-05, "loss": 1.1622, "mean_token_accuracy": 0.710649648308754, "num_tokens": 30168210.0, "step": 13050 }, { "entropy": 1.3514384537935258, "epoch": 1.9144616286740455, "grad_norm": 11.5, "learning_rate": 1.9495078648600286e-05, "loss": 1.2897, "mean_token_accuracy": 0.681254318356514, "num_tokens": 30187462.0, "step": 13060 }, { "entropy": 1.0892876490950585, "epoch": 1.915927581910137, "grad_norm": 9.4375, "learning_rate": 1.9493472054669216e-05, "loss": 1.0578, "mean_token_accuracy": 0.7223007291555404, "num_tokens": 30213034.0, "step": 13070 }, { "entropy": 1.2420334279537202, "epoch": 1.9173935351462288, "grad_norm": 8.9375, "learning_rate": 1.9491862975223295e-05, "loss": 1.1861, "mean_token_accuracy": 0.696955892443657, "num_tokens": 30238464.0, "step": 13080 }, { "entropy": 1.314267912507057, "epoch": 1.9188594883823207, "grad_norm": 9.5, "learning_rate": 1.949025141068379e-05, "loss": 1.2619, "mean_token_accuracy": 0.6808974742889404, "num_tokens": 30263049.0, "step": 13090 }, { "entropy": 1.2899829417467117, "epoch": 1.9203254416184123, "grad_norm": 8.375, "learning_rate": 1.948863736147264e-05, "loss": 1.2558, "mean_token_accuracy": 0.6902901738882065, "num_tokens": 30286412.0, "step": 13100 }, { "entropy": 1.2157043904066085, "epoch": 1.921791394854504, "grad_norm": 5.84375, "learning_rate": 1.948702082801242e-05, "loss": 1.1429, "mean_token_accuracy": 0.6999878481030464, "num_tokens": 30310256.0, "step": 13110 }, { "entropy": 1.2255474954843522, "epoch": 1.923257348090596, "grad_norm": 7.625, "learning_rate": 1.9485401810726358e-05, "loss": 1.1639, "mean_token_accuracy": 0.6997535645961761, "num_tokens": 30335082.0, "step": 13120 }, { "entropy": 1.0836972385644912, "epoch": 1.9247233013266878, "grad_norm": 7.0625, "learning_rate": 1.9483780310038335e-05, "loss": 1.0517, "mean_token_accuracy": 0.7311275482177735, "num_tokens": 30362272.0, "step": 13130 }, { "entropy": 1.1699747264385223, "epoch": 1.9261892545627795, "grad_norm": 6.3125, "learning_rate": 1.948215632637288e-05, "loss": 1.1349, "mean_token_accuracy": 0.710124722123146, "num_tokens": 30384433.0, "step": 13140 }, { "entropy": 1.323326689004898, "epoch": 1.9276552077988711, "grad_norm": 13.8125, "learning_rate": 1.948052986015518e-05, "loss": 1.2738, "mean_token_accuracy": 0.6865853875875473, "num_tokens": 30406978.0, "step": 13150 }, { "entropy": 1.1189010620117188, "epoch": 1.929121161034963, "grad_norm": 11.5, "learning_rate": 1.9478900911811056e-05, "loss": 1.0667, "mean_token_accuracy": 0.7188639879226685, "num_tokens": 30431390.0, "step": 13160 }, { "entropy": 1.2290884748101234, "epoch": 1.930587114271055, "grad_norm": 8.8125, "learning_rate": 1.9477269481766993e-05, "loss": 1.1817, "mean_token_accuracy": 0.704525351524353, "num_tokens": 30451660.0, "step": 13170 }, { "entropy": 1.12649085521698, "epoch": 1.9320530675071466, "grad_norm": 9.0, "learning_rate": 1.9475635570450116e-05, "loss": 1.129, "mean_token_accuracy": 0.7135081827640534, "num_tokens": 30479020.0, "step": 13180 }, { "entropy": 1.1964578747749328, "epoch": 1.9335190207432382, "grad_norm": 6.75, "learning_rate": 1.947399917828821e-05, "loss": 1.1445, "mean_token_accuracy": 0.7036422669887543, "num_tokens": 30499649.0, "step": 13190 }, { "entropy": 1.303586682677269, "epoch": 1.9349849739793301, "grad_norm": 9.25, "learning_rate": 1.9472360305709702e-05, "loss": 1.2917, "mean_token_accuracy": 0.6869449496269227, "num_tokens": 30523130.0, "step": 13200 }, { "entropy": 1.2473644196987153, "epoch": 1.9364509272154218, "grad_norm": 8.0625, "learning_rate": 1.947071895314367e-05, "loss": 1.1727, "mean_token_accuracy": 0.6935553401708603, "num_tokens": 30544046.0, "step": 13210 }, { "entropy": 1.3226733326911926, "epoch": 1.9379168804515134, "grad_norm": 9.0, "learning_rate": 1.946907512101984e-05, "loss": 1.2916, "mean_token_accuracy": 0.6819080501794815, "num_tokens": 30567228.0, "step": 13220 }, { "entropy": 1.3033395946025848, "epoch": 1.9393828336876053, "grad_norm": 9.0625, "learning_rate": 1.9467428809768593e-05, "loss": 1.2314, "mean_token_accuracy": 0.6877453804016114, "num_tokens": 30585049.0, "step": 13230 }, { "entropy": 1.1807275712490082, "epoch": 1.9408487869236972, "grad_norm": 7.96875, "learning_rate": 1.9465780019820945e-05, "loss": 1.1411, "mean_token_accuracy": 0.705298388004303, "num_tokens": 30609907.0, "step": 13240 }, { "entropy": 1.3123493939638138, "epoch": 1.9423147401597889, "grad_norm": 11.4375, "learning_rate": 1.9464128751608583e-05, "loss": 1.3242, "mean_token_accuracy": 0.6851325869560242, "num_tokens": 30632956.0, "step": 13250 }, { "entropy": 1.2496091187000276, "epoch": 1.9437806933958806, "grad_norm": 10.9375, "learning_rate": 1.9462475005563824e-05, "loss": 1.2126, "mean_token_accuracy": 0.6965325564146042, "num_tokens": 30658735.0, "step": 13260 }, { "entropy": 1.397467812895775, "epoch": 1.9452466466319724, "grad_norm": 9.6875, "learning_rate": 1.946081878211964e-05, "loss": 1.3581, "mean_token_accuracy": 0.6711928814649581, "num_tokens": 30685725.0, "step": 13270 }, { "entropy": 1.1491195470094682, "epoch": 1.9467125998680643, "grad_norm": 8.5, "learning_rate": 1.9459160081709652e-05, "loss": 1.1094, "mean_token_accuracy": 0.7196546405553818, "num_tokens": 30710527.0, "step": 13280 }, { "entropy": 1.0798720628023148, "epoch": 1.948178553104156, "grad_norm": 6.78125, "learning_rate": 1.9457498904768132e-05, "loss": 1.0643, "mean_token_accuracy": 0.7333173722028732, "num_tokens": 30736481.0, "step": 13290 }, { "entropy": 1.2414645075798034, "epoch": 1.9496445063402477, "grad_norm": 11.0625, "learning_rate": 1.9455835251729996e-05, "loss": 1.2683, "mean_token_accuracy": 0.699692553281784, "num_tokens": 30764819.0, "step": 13300 }, { "entropy": 1.1649063348770141, "epoch": 1.9511104595763396, "grad_norm": 8.25, "learning_rate": 1.945416912303081e-05, "loss": 1.1397, "mean_token_accuracy": 0.7149263918399811, "num_tokens": 30787121.0, "step": 13310 }, { "entropy": 1.2414863675832748, "epoch": 1.9525764128124314, "grad_norm": 7.3125, "learning_rate": 1.9452500519106792e-05, "loss": 1.1909, "mean_token_accuracy": 0.7003043293952942, "num_tokens": 30810960.0, "step": 13320 }, { "entropy": 1.1937198370695115, "epoch": 1.954042366048523, "grad_norm": 6.78125, "learning_rate": 1.9450829440394797e-05, "loss": 1.1226, "mean_token_accuracy": 0.7047218918800354, "num_tokens": 30835455.0, "step": 13330 }, { "entropy": 1.1466160640120506, "epoch": 1.9555083192846148, "grad_norm": 8.125, "learning_rate": 1.9449155887332343e-05, "loss": 1.09, "mean_token_accuracy": 0.7184696868062019, "num_tokens": 30859815.0, "step": 13340 }, { "entropy": 1.2931883186101913, "epoch": 1.9569742725207067, "grad_norm": 8.625, "learning_rate": 1.944747986035758e-05, "loss": 1.2763, "mean_token_accuracy": 0.6879751354455947, "num_tokens": 30884295.0, "step": 13350 }, { "entropy": 1.340749630331993, "epoch": 1.9584402257567983, "grad_norm": 15.625, "learning_rate": 1.944580135990932e-05, "loss": 1.3213, "mean_token_accuracy": 0.6735680788755417, "num_tokens": 30905398.0, "step": 13360 }, { "entropy": 1.109843435883522, "epoch": 1.95990617899289, "grad_norm": 7.3125, "learning_rate": 1.9444120386427016e-05, "loss": 1.0592, "mean_token_accuracy": 0.7265677839517594, "num_tokens": 30928812.0, "step": 13370 }, { "entropy": 1.3218564450740815, "epoch": 1.9613721322289819, "grad_norm": 10.875, "learning_rate": 1.9442436940350764e-05, "loss": 1.2771, "mean_token_accuracy": 0.6769389033317565, "num_tokens": 30947442.0, "step": 13380 }, { "entropy": 1.268425953388214, "epoch": 1.9628380854650738, "grad_norm": 9.75, "learning_rate": 1.9440751022121317e-05, "loss": 1.2399, "mean_token_accuracy": 0.6852281361818313, "num_tokens": 30965959.0, "step": 13390 }, { "entropy": 1.2715091049671172, "epoch": 1.9643040387011654, "grad_norm": 7.59375, "learning_rate": 1.9439062632180066e-05, "loss": 1.2311, "mean_token_accuracy": 0.6945875257253646, "num_tokens": 30989007.0, "step": 13400 }, { "entropy": 1.2577845215797425, "epoch": 1.965769991937257, "grad_norm": 8.875, "learning_rate": 1.943737177096906e-05, "loss": 1.1953, "mean_token_accuracy": 0.6872548907995224, "num_tokens": 31009892.0, "step": 13410 }, { "entropy": 1.3559427857398987, "epoch": 1.967235945173349, "grad_norm": 7.3125, "learning_rate": 1.943567843893098e-05, "loss": 1.2962, "mean_token_accuracy": 0.6778279066085815, "num_tokens": 31031133.0, "step": 13420 }, { "entropy": 1.2650080353021622, "epoch": 1.9687018984094409, "grad_norm": 7.5, "learning_rate": 1.943398263650917e-05, "loss": 1.2024, "mean_token_accuracy": 0.6927106469869614, "num_tokens": 31055672.0, "step": 13430 }, { "entropy": 1.167484113574028, "epoch": 1.9701678516455325, "grad_norm": 8.8125, "learning_rate": 1.943228436414761e-05, "loss": 1.1279, "mean_token_accuracy": 0.7110966116189956, "num_tokens": 31080376.0, "step": 13440 }, { "entropy": 1.1124765157699585, "epoch": 1.9716338048816242, "grad_norm": 9.375, "learning_rate": 1.943058362229093e-05, "loss": 1.039, "mean_token_accuracy": 0.7239251375198364, "num_tokens": 31105001.0, "step": 13450 }, { "entropy": 1.238326907157898, "epoch": 1.973099758117716, "grad_norm": 5.09375, "learning_rate": 1.9428880411384402e-05, "loss": 1.2047, "mean_token_accuracy": 0.6977987930178642, "num_tokens": 31126151.0, "step": 13460 }, { "entropy": 1.2229508131742477, "epoch": 1.9745657113538078, "grad_norm": 12.1875, "learning_rate": 1.9427174731873958e-05, "loss": 1.214, "mean_token_accuracy": 0.7044151827692986, "num_tokens": 31147974.0, "step": 13470 }, { "entropy": 1.2986417710781097, "epoch": 1.9760316645898994, "grad_norm": 10.875, "learning_rate": 1.942546658420616e-05, "loss": 1.2583, "mean_token_accuracy": 0.6824543356895447, "num_tokens": 31172529.0, "step": 13480 }, { "entropy": 1.5423732846975327, "epoch": 1.9774976178259913, "grad_norm": 8.5625, "learning_rate": 1.942375596882823e-05, "loss": 1.5438, "mean_token_accuracy": 0.6378498986363411, "num_tokens": 31195125.0, "step": 13490 }, { "entropy": 1.164568829536438, "epoch": 1.9789635710620832, "grad_norm": 6.4375, "learning_rate": 1.9422042886188024e-05, "loss": 1.1113, "mean_token_accuracy": 0.7134113132953643, "num_tokens": 31218041.0, "step": 13500 }, { "entropy": 1.397884687781334, "epoch": 1.9804295242981749, "grad_norm": 9.625, "learning_rate": 1.942032733673405e-05, "loss": 1.3336, "mean_token_accuracy": 0.66622334420681, "num_tokens": 31237462.0, "step": 13510 }, { "entropy": 1.173433855175972, "epoch": 1.9818954775342665, "grad_norm": 6.8125, "learning_rate": 1.941860932091546e-05, "loss": 1.1287, "mean_token_accuracy": 0.711926531791687, "num_tokens": 31259832.0, "step": 13520 }, { "entropy": 1.3049629136919976, "epoch": 1.9833614307703584, "grad_norm": 10.25, "learning_rate": 1.9416888839182057e-05, "loss": 1.2396, "mean_token_accuracy": 0.6878985285758972, "num_tokens": 31284199.0, "step": 13530 }, { "entropy": 1.1803849071264267, "epoch": 1.9848273840064503, "grad_norm": 8.5625, "learning_rate": 1.9415165891984285e-05, "loss": 1.1905, "mean_token_accuracy": 0.7031772464513779, "num_tokens": 31309903.0, "step": 13540 }, { "entropy": 1.1512186050415039, "epoch": 1.986293337242542, "grad_norm": 10.1875, "learning_rate": 1.9413440479773226e-05, "loss": 1.0558, "mean_token_accuracy": 0.717444309592247, "num_tokens": 31331403.0, "step": 13550 }, { "entropy": 1.190060332417488, "epoch": 1.9877592904786336, "grad_norm": 7.59375, "learning_rate": 1.9411712603000626e-05, "loss": 1.169, "mean_token_accuracy": 0.7050361007452011, "num_tokens": 31351355.0, "step": 13560 }, { "entropy": 1.342108890414238, "epoch": 1.9892252437147255, "grad_norm": 13.1875, "learning_rate": 1.940998226211886e-05, "loss": 1.316, "mean_token_accuracy": 0.669711497426033, "num_tokens": 31370613.0, "step": 13570 }, { "entropy": 1.3393351674079894, "epoch": 1.9906911969508174, "grad_norm": 9.625, "learning_rate": 1.940824945758096e-05, "loss": 1.3032, "mean_token_accuracy": 0.6818408727645874, "num_tokens": 31392247.0, "step": 13580 }, { "entropy": 1.2411215543746947, "epoch": 1.992157150186909, "grad_norm": 9.5625, "learning_rate": 1.940651418984059e-05, "loss": 1.1282, "mean_token_accuracy": 0.6966937452554702, "num_tokens": 31414188.0, "step": 13590 }, { "entropy": 1.2065597593784332, "epoch": 1.9936231034230008, "grad_norm": 7.25, "learning_rate": 1.9404776459352068e-05, "loss": 1.1619, "mean_token_accuracy": 0.7104049742221832, "num_tokens": 31438217.0, "step": 13600 }, { "entropy": 1.092889776825905, "epoch": 1.9950890566590926, "grad_norm": 9.0, "learning_rate": 1.9403036266570354e-05, "loss": 1.0818, "mean_token_accuracy": 0.7289816051721573, "num_tokens": 31462592.0, "step": 13610 }, { "entropy": 1.4352520167827607, "epoch": 1.9965550098951843, "grad_norm": 7.53125, "learning_rate": 1.9401293611951053e-05, "loss": 1.4443, "mean_token_accuracy": 0.6512943267822265, "num_tokens": 31485877.0, "step": 13620 }, { "entropy": 1.328202909231186, "epoch": 1.998020963131276, "grad_norm": 7.125, "learning_rate": 1.9399548495950416e-05, "loss": 1.2993, "mean_token_accuracy": 0.6761165991425514, "num_tokens": 31510571.0, "step": 13630 }, { "entropy": 1.2137942790985108, "epoch": 1.9994869163673679, "grad_norm": 9.5, "learning_rate": 1.9397800919025336e-05, "loss": 1.1103, "mean_token_accuracy": 0.6970366343855858, "num_tokens": 31532674.0, "step": 13640 }, { "epoch": 2.0, "eval_entropy": 1.264600084549047, "eval_loss": 1.2777172327041626, "eval_mean_token_accuracy": 0.6897874738539429, "eval_num_tokens": 31539878.0, "eval_runtime": 56.8438, "eval_samples_per_second": 53.339, "eval_steps_per_second": 26.67, "step": 13644 }, { "entropy": 1.2411572462634037, "epoch": 2.000879571941655, "grad_norm": 8.25, "learning_rate": 1.9396050881633354e-05, "loss": 1.1241, "mean_token_accuracy": 0.7039635369652196, "num_tokens": 31554131.0, "step": 13650 }, { "entropy": 1.2160807520151138, "epoch": 2.0023455251777467, "grad_norm": 11.375, "learning_rate": 1.9394298384232647e-05, "loss": 1.1559, "mean_token_accuracy": 0.7069235652685165, "num_tokens": 31573193.0, "step": 13660 }, { "entropy": 1.3244419991970062, "epoch": 2.0038114784138386, "grad_norm": 12.125, "learning_rate": 1.9392543427282045e-05, "loss": 1.2947, "mean_token_accuracy": 0.6821984559297561, "num_tokens": 31595673.0, "step": 13670 }, { "entropy": 1.1709099620580674, "epoch": 2.0052774316499304, "grad_norm": 9.8125, "learning_rate": 1.9390786011241024e-05, "loss": 1.1009, "mean_token_accuracy": 0.7083550751209259, "num_tokens": 31619304.0, "step": 13680 }, { "entropy": 1.1365645498037338, "epoch": 2.0067433848860223, "grad_norm": 9.0, "learning_rate": 1.9389026136569688e-05, "loss": 1.0887, "mean_token_accuracy": 0.7176439702510834, "num_tokens": 31640305.0, "step": 13690 }, { "entropy": 1.0872930020093918, "epoch": 2.0082093381221138, "grad_norm": 7.15625, "learning_rate": 1.9387263803728803e-05, "loss": 1.0401, "mean_token_accuracy": 0.7243764758110046, "num_tokens": 31663387.0, "step": 13700 }, { "entropy": 1.1211113810539246, "epoch": 2.0096752913582057, "grad_norm": 7.78125, "learning_rate": 1.9385499013179768e-05, "loss": 1.0826, "mean_token_accuracy": 0.7246247738599777, "num_tokens": 31684585.0, "step": 13710 }, { "entropy": 1.3584108859300614, "epoch": 2.0111412445942976, "grad_norm": 9.125, "learning_rate": 1.938373176538463e-05, "loss": 1.3101, "mean_token_accuracy": 0.6774736613035202, "num_tokens": 31705948.0, "step": 13720 }, { "entropy": 1.2872193694114684, "epoch": 2.012607197830389, "grad_norm": 14.0, "learning_rate": 1.9381962060806074e-05, "loss": 1.2349, "mean_token_accuracy": 0.6848550468683243, "num_tokens": 31728334.0, "step": 13730 }, { "entropy": 1.2809602677822114, "epoch": 2.014073151066481, "grad_norm": 11.9375, "learning_rate": 1.9380189899907432e-05, "loss": 1.1843, "mean_token_accuracy": 0.69537433385849, "num_tokens": 31750153.0, "step": 13740 }, { "entropy": 1.1381223186850549, "epoch": 2.0155391043025728, "grad_norm": 8.375, "learning_rate": 1.9378415283152684e-05, "loss": 1.0664, "mean_token_accuracy": 0.7133791357278824, "num_tokens": 31772073.0, "step": 13750 }, { "entropy": 0.9540754109621048, "epoch": 2.0170050575386647, "grad_norm": 13.625, "learning_rate": 1.937663821100644e-05, "loss": 0.9444, "mean_token_accuracy": 0.7530208975076675, "num_tokens": 31796048.0, "step": 13760 }, { "entropy": 1.1419079929590226, "epoch": 2.018471010774756, "grad_norm": 9.0, "learning_rate": 1.9374858683933973e-05, "loss": 1.0392, "mean_token_accuracy": 0.7227568686008453, "num_tokens": 31819831.0, "step": 13770 }, { "entropy": 1.2143770664930345, "epoch": 2.019936964010848, "grad_norm": 11.6875, "learning_rate": 1.9373076702401174e-05, "loss": 1.1874, "mean_token_accuracy": 0.699714532494545, "num_tokens": 31843136.0, "step": 13780 }, { "entropy": 1.224331122636795, "epoch": 2.02140291724694, "grad_norm": 8.5, "learning_rate": 1.9371292266874593e-05, "loss": 1.168, "mean_token_accuracy": 0.6976536542177201, "num_tokens": 31865693.0, "step": 13790 }, { "entropy": 1.1674718350172042, "epoch": 2.0228688704830318, "grad_norm": 10.625, "learning_rate": 1.9369505377821416e-05, "loss": 1.1473, "mean_token_accuracy": 0.7139815390110016, "num_tokens": 31886369.0, "step": 13800 }, { "entropy": 1.1698017507791518, "epoch": 2.024334823719123, "grad_norm": 10.125, "learning_rate": 1.9367716035709484e-05, "loss": 1.0801, "mean_token_accuracy": 0.7087359309196473, "num_tokens": 31907469.0, "step": 13810 }, { "entropy": 1.166812428832054, "epoch": 2.025800776955215, "grad_norm": 10.9375, "learning_rate": 1.9365924241007256e-05, "loss": 1.1581, "mean_token_accuracy": 0.7155067771673203, "num_tokens": 31933953.0, "step": 13820 }, { "entropy": 1.1159226030111313, "epoch": 2.027266730191307, "grad_norm": 8.0, "learning_rate": 1.9364129994183857e-05, "loss": 1.078, "mean_token_accuracy": 0.7195590555667877, "num_tokens": 31957257.0, "step": 13830 }, { "entropy": 1.187909981608391, "epoch": 2.028732683427399, "grad_norm": 8.0, "learning_rate": 1.936233329570904e-05, "loss": 1.1556, "mean_token_accuracy": 0.7006301671266556, "num_tokens": 31980552.0, "step": 13840 }, { "entropy": 0.9882671505212783, "epoch": 2.0301986366634903, "grad_norm": 7.0625, "learning_rate": 1.936053414605321e-05, "loss": 0.9427, "mean_token_accuracy": 0.750397863984108, "num_tokens": 32005425.0, "step": 13850 }, { "entropy": 1.1019329309463501, "epoch": 2.031664589899582, "grad_norm": 7.40625, "learning_rate": 1.93587325456874e-05, "loss": 1.0944, "mean_token_accuracy": 0.7211731374263763, "num_tokens": 32031891.0, "step": 13860 }, { "entropy": 1.157654160261154, "epoch": 2.033130543135674, "grad_norm": 9.8125, "learning_rate": 1.9356928495083295e-05, "loss": 1.0932, "mean_token_accuracy": 0.7106741219758987, "num_tokens": 32052713.0, "step": 13870 }, { "entropy": 1.2422219067811966, "epoch": 2.0345964963717655, "grad_norm": 9.6875, "learning_rate": 1.935512199471322e-05, "loss": 1.2256, "mean_token_accuracy": 0.6994220405817032, "num_tokens": 32076967.0, "step": 13880 }, { "entropy": 0.9787596851587296, "epoch": 2.0360624496078574, "grad_norm": 9.375, "learning_rate": 1.9353313045050144e-05, "loss": 0.9296, "mean_token_accuracy": 0.7482833534479141, "num_tokens": 32103319.0, "step": 13890 }, { "entropy": 1.372862669825554, "epoch": 2.0375284028439493, "grad_norm": 8.0, "learning_rate": 1.9351501646567667e-05, "loss": 1.3112, "mean_token_accuracy": 0.6704637542366981, "num_tokens": 32125152.0, "step": 13900 }, { "entropy": 1.0050673961639405, "epoch": 2.038994356080041, "grad_norm": 6.125, "learning_rate": 1.9349687799740038e-05, "loss": 0.9368, "mean_token_accuracy": 0.7459022372961044, "num_tokens": 32150390.0, "step": 13910 }, { "entropy": 1.1381087839603423, "epoch": 2.0404603093161326, "grad_norm": 9.0625, "learning_rate": 1.934787150504215e-05, "loss": 1.1284, "mean_token_accuracy": 0.715079066157341, "num_tokens": 32174599.0, "step": 13920 }, { "entropy": 1.1028755649924278, "epoch": 2.0419262625522245, "grad_norm": 9.1875, "learning_rate": 1.934605276294953e-05, "loss": 1.0605, "mean_token_accuracy": 0.7203720331192016, "num_tokens": 32196101.0, "step": 13930 }, { "entropy": 1.3589549869298936, "epoch": 2.0433922157883164, "grad_norm": 9.375, "learning_rate": 1.934423157393835e-05, "loss": 1.286, "mean_token_accuracy": 0.6753516018390655, "num_tokens": 32217363.0, "step": 13940 }, { "entropy": 1.363708072900772, "epoch": 2.0448581690244083, "grad_norm": 8.9375, "learning_rate": 1.9342407938485413e-05, "loss": 1.3017, "mean_token_accuracy": 0.6657682299613953, "num_tokens": 32235405.0, "step": 13950 }, { "entropy": 1.2477270156145095, "epoch": 2.0463241222604998, "grad_norm": 10.875, "learning_rate": 1.934058185706818e-05, "loss": 1.2283, "mean_token_accuracy": 0.6948087036609649, "num_tokens": 32258240.0, "step": 13960 }, { "entropy": 1.2719774663448333, "epoch": 2.0477900754965916, "grad_norm": 6.1875, "learning_rate": 1.9338753330164742e-05, "loss": 1.2313, "mean_token_accuracy": 0.6923567861318588, "num_tokens": 32281996.0, "step": 13970 }, { "entropy": 1.377466970682144, "epoch": 2.0492560287326835, "grad_norm": 5.90625, "learning_rate": 1.933692235825383e-05, "loss": 1.3599, "mean_token_accuracy": 0.6799631208181381, "num_tokens": 32305775.0, "step": 13980 }, { "entropy": 1.2488154634833335, "epoch": 2.050721981968775, "grad_norm": 13.0, "learning_rate": 1.9335088941814814e-05, "loss": 1.1983, "mean_token_accuracy": 0.7037648677825927, "num_tokens": 32327245.0, "step": 13990 }, { "entropy": 1.2275935411453247, "epoch": 2.052187935204867, "grad_norm": 7.375, "learning_rate": 1.9333253081327704e-05, "loss": 1.129, "mean_token_accuracy": 0.7018446356058121, "num_tokens": 32348734.0, "step": 14000 }, { "entropy": 1.1909607127308846, "epoch": 2.0536538884409588, "grad_norm": 5.78125, "learning_rate": 1.933141477727316e-05, "loss": 1.1262, "mean_token_accuracy": 0.7010480999946594, "num_tokens": 32372225.0, "step": 14010 }, { "entropy": 1.2294019356369972, "epoch": 2.0551198416770506, "grad_norm": 13.9375, "learning_rate": 1.9329574030132473e-05, "loss": 1.2075, "mean_token_accuracy": 0.6906039297580719, "num_tokens": 32396773.0, "step": 14020 }, { "entropy": 1.3047458797693252, "epoch": 2.056585794913142, "grad_norm": 10.375, "learning_rate": 1.9327730840387567e-05, "loss": 1.2523, "mean_token_accuracy": 0.6953946501016617, "num_tokens": 32419771.0, "step": 14030 }, { "entropy": 1.2475240513682366, "epoch": 2.058051748149234, "grad_norm": 9.4375, "learning_rate": 1.932588520852102e-05, "loss": 1.2076, "mean_token_accuracy": 0.6986099183559418, "num_tokens": 32441191.0, "step": 14040 }, { "entropy": 1.1430369406938552, "epoch": 2.059517701385326, "grad_norm": 9.0, "learning_rate": 1.932403713501604e-05, "loss": 1.0525, "mean_token_accuracy": 0.7200036138296128, "num_tokens": 32466188.0, "step": 14050 }, { "entropy": 0.9945487976074219, "epoch": 2.0609836546214177, "grad_norm": 9.125, "learning_rate": 1.9322186620356476e-05, "loss": 0.9535, "mean_token_accuracy": 0.7438287377357483, "num_tokens": 32494082.0, "step": 14060 }, { "entropy": 1.1204145178198814, "epoch": 2.062449607857509, "grad_norm": 7.15625, "learning_rate": 1.932033366502682e-05, "loss": 1.0877, "mean_token_accuracy": 0.7238354057073593, "num_tokens": 32519589.0, "step": 14070 }, { "entropy": 1.2677551925182342, "epoch": 2.063915561093601, "grad_norm": 17.25, "learning_rate": 1.9318478269512194e-05, "loss": 1.2514, "mean_token_accuracy": 0.6917827159166337, "num_tokens": 32536899.0, "step": 14080 }, { "entropy": 1.1443063646554947, "epoch": 2.065381514329693, "grad_norm": 7.125, "learning_rate": 1.9316620434298375e-05, "loss": 1.0969, "mean_token_accuracy": 0.7183142364025116, "num_tokens": 32558620.0, "step": 14090 }, { "entropy": 1.1748047322034836, "epoch": 2.066847467565785, "grad_norm": 9.0, "learning_rate": 1.9314760159871757e-05, "loss": 1.079, "mean_token_accuracy": 0.7062559634447098, "num_tokens": 32581231.0, "step": 14100 }, { "entropy": 1.164207038283348, "epoch": 2.0683134208018763, "grad_norm": 8.3125, "learning_rate": 1.931289744671939e-05, "loss": 1.1456, "mean_token_accuracy": 0.7136894404888153, "num_tokens": 32604832.0, "step": 14110 }, { "entropy": 1.1648354798555374, "epoch": 2.069779374037968, "grad_norm": 9.0, "learning_rate": 1.9311032295328956e-05, "loss": 1.1382, "mean_token_accuracy": 0.7091114193201065, "num_tokens": 32631062.0, "step": 14120 }, { "entropy": 1.2059783041477203, "epoch": 2.07124532727406, "grad_norm": 9.6875, "learning_rate": 1.9309164706188776e-05, "loss": 1.154, "mean_token_accuracy": 0.7131444275379181, "num_tokens": 32654679.0, "step": 14130 }, { "entropy": 1.156986618041992, "epoch": 2.0727112805101515, "grad_norm": 8.375, "learning_rate": 1.930729467978781e-05, "loss": 1.0759, "mean_token_accuracy": 0.7108697861433029, "num_tokens": 32680609.0, "step": 14140 }, { "entropy": 1.2790018022060394, "epoch": 2.0741772337462434, "grad_norm": 9.1875, "learning_rate": 1.9305422216615652e-05, "loss": 1.2015, "mean_token_accuracy": 0.7008499756455422, "num_tokens": 32704343.0, "step": 14150 }, { "entropy": 1.1412691548466682, "epoch": 2.0756431869823353, "grad_norm": 6.6875, "learning_rate": 1.9303547317162543e-05, "loss": 1.1063, "mean_token_accuracy": 0.7202523291110993, "num_tokens": 32726981.0, "step": 14160 }, { "entropy": 1.1315770983695983, "epoch": 2.077109140218427, "grad_norm": 9.4375, "learning_rate": 1.930166998191935e-05, "loss": 1.0402, "mean_token_accuracy": 0.7084005445241928, "num_tokens": 32750837.0, "step": 14170 }, { "entropy": 1.2955131232738495, "epoch": 2.0785750934545186, "grad_norm": 8.25, "learning_rate": 1.9299790211377587e-05, "loss": 1.2324, "mean_token_accuracy": 0.6910987675189972, "num_tokens": 32775026.0, "step": 14180 }, { "entropy": 1.2679594218730927, "epoch": 2.0800410466906105, "grad_norm": 12.75, "learning_rate": 1.9297908006029403e-05, "loss": 1.1627, "mean_token_accuracy": 0.698353311419487, "num_tokens": 32797964.0, "step": 14190 }, { "entropy": 1.2194758027791976, "epoch": 2.0815069999267024, "grad_norm": 8.1875, "learning_rate": 1.9296023366367583e-05, "loss": 1.168, "mean_token_accuracy": 0.7022855907678605, "num_tokens": 32820765.0, "step": 14200 }, { "entropy": 1.1872043371200562, "epoch": 2.0829729531627943, "grad_norm": 7.53125, "learning_rate": 1.929413629288555e-05, "loss": 1.1466, "mean_token_accuracy": 0.7005807280540466, "num_tokens": 32841673.0, "step": 14210 }, { "entropy": 1.1115220010280609, "epoch": 2.0844389063988857, "grad_norm": 8.4375, "learning_rate": 1.9292246786077364e-05, "loss": 1.0703, "mean_token_accuracy": 0.7210763573646546, "num_tokens": 32865697.0, "step": 14220 }, { "entropy": 1.316936455667019, "epoch": 2.0859048596349776, "grad_norm": 8.0625, "learning_rate": 1.9290354846437725e-05, "loss": 1.3095, "mean_token_accuracy": 0.6857563585042954, "num_tokens": 32886570.0, "step": 14230 }, { "entropy": 1.2069514572620392, "epoch": 2.0873708128710695, "grad_norm": 7.25, "learning_rate": 1.9288460474461965e-05, "loss": 1.1412, "mean_token_accuracy": 0.707176360487938, "num_tokens": 32909753.0, "step": 14240 }, { "entropy": 1.2878999024629594, "epoch": 2.0888367661071614, "grad_norm": 10.0, "learning_rate": 1.9286563670646056e-05, "loss": 1.2157, "mean_token_accuracy": 0.6898901551961899, "num_tokens": 32928386.0, "step": 14250 }, { "entropy": 1.0172577798366547, "epoch": 2.090302719343253, "grad_norm": 7.84375, "learning_rate": 1.9284664435486604e-05, "loss": 0.9769, "mean_token_accuracy": 0.7343806177377701, "num_tokens": 32950988.0, "step": 14260 }, { "entropy": 0.9555214911699295, "epoch": 2.0917686725793447, "grad_norm": 9.0625, "learning_rate": 1.9282762769480857e-05, "loss": 0.8904, "mean_token_accuracy": 0.7523670941591263, "num_tokens": 32977037.0, "step": 14270 }, { "entropy": 1.2307527631521225, "epoch": 2.0932346258154366, "grad_norm": 13.3125, "learning_rate": 1.9280858673126697e-05, "loss": 1.2046, "mean_token_accuracy": 0.6910233557224273, "num_tokens": 32996621.0, "step": 14280 }, { "entropy": 1.1820859760046005, "epoch": 2.094700579051528, "grad_norm": 7.3125, "learning_rate": 1.9278952146922637e-05, "loss": 1.1296, "mean_token_accuracy": 0.7112047374248505, "num_tokens": 33020514.0, "step": 14290 }, { "entropy": 1.128038015961647, "epoch": 2.09616653228762, "grad_norm": 9.25, "learning_rate": 1.927704319136783e-05, "loss": 1.1307, "mean_token_accuracy": 0.7232388406991959, "num_tokens": 33045153.0, "step": 14300 }, { "entropy": 1.2547108083963394, "epoch": 2.097632485523712, "grad_norm": 9.5625, "learning_rate": 1.9275131806962068e-05, "loss": 1.2244, "mean_token_accuracy": 0.6946101576089859, "num_tokens": 33064418.0, "step": 14310 }, { "entropy": 1.196551352739334, "epoch": 2.0990984387598037, "grad_norm": 12.0625, "learning_rate": 1.927321799420578e-05, "loss": 1.1634, "mean_token_accuracy": 0.7012838780879974, "num_tokens": 33086372.0, "step": 14320 }, { "entropy": 1.1235421866178512, "epoch": 2.100564391995895, "grad_norm": 11.3125, "learning_rate": 1.9271301753600018e-05, "loss": 1.0827, "mean_token_accuracy": 0.7212383687496186, "num_tokens": 33112318.0, "step": 14330 }, { "entropy": 1.213804480433464, "epoch": 2.102030345231987, "grad_norm": 9.3125, "learning_rate": 1.926938308564649e-05, "loss": 1.1936, "mean_token_accuracy": 0.7009687051177025, "num_tokens": 33138302.0, "step": 14340 }, { "entropy": 1.2339561432600021, "epoch": 2.103496298468079, "grad_norm": 13.5, "learning_rate": 1.9267461990847515e-05, "loss": 1.1842, "mean_token_accuracy": 0.7039470553398133, "num_tokens": 33160872.0, "step": 14350 }, { "entropy": 1.0806543231010437, "epoch": 2.104962251704171, "grad_norm": 8.625, "learning_rate": 1.9265538469706072e-05, "loss": 1.0265, "mean_token_accuracy": 0.7367146670818329, "num_tokens": 33183590.0, "step": 14360 }, { "entropy": 1.0607769548892976, "epoch": 2.1064282049402623, "grad_norm": 8.375, "learning_rate": 1.9263612522725762e-05, "loss": 1.0326, "mean_token_accuracy": 0.7403076648712158, "num_tokens": 33209366.0, "step": 14370 }, { "entropy": 1.1036813139915467, "epoch": 2.107894158176354, "grad_norm": 9.0, "learning_rate": 1.926168415041082e-05, "loss": 1.0854, "mean_token_accuracy": 0.7252095222473145, "num_tokens": 33231369.0, "step": 14380 }, { "entropy": 1.1798922806978225, "epoch": 2.109360111412446, "grad_norm": 8.9375, "learning_rate": 1.925975335326612e-05, "loss": 1.1466, "mean_token_accuracy": 0.7060749262571335, "num_tokens": 33256276.0, "step": 14390 }, { "entropy": 1.1623878076672554, "epoch": 2.1108260646485375, "grad_norm": 11.375, "learning_rate": 1.925782013179717e-05, "loss": 1.0746, "mean_token_accuracy": 0.7133434921503067, "num_tokens": 33280219.0, "step": 14400 }, { "entropy": 1.106020560860634, "epoch": 2.1122920178846294, "grad_norm": 10.5, "learning_rate": 1.925588448651011e-05, "loss": 1.0286, "mean_token_accuracy": 0.7246781677007675, "num_tokens": 33303051.0, "step": 14410 }, { "entropy": 1.2098772838711738, "epoch": 2.1137579711207213, "grad_norm": 8.6875, "learning_rate": 1.9253946417911722e-05, "loss": 1.1661, "mean_token_accuracy": 0.707968008518219, "num_tokens": 33324392.0, "step": 14420 }, { "entropy": 1.2712797164916991, "epoch": 2.115223924356813, "grad_norm": 10.6875, "learning_rate": 1.9252005926509413e-05, "loss": 1.2143, "mean_token_accuracy": 0.6942479699850083, "num_tokens": 33348789.0, "step": 14430 }, { "entropy": 1.2015321925282478, "epoch": 2.1166898775929046, "grad_norm": 7.34375, "learning_rate": 1.9250063012811234e-05, "loss": 1.1154, "mean_token_accuracy": 0.7006738841533661, "num_tokens": 33370491.0, "step": 14440 }, { "entropy": 1.1852103739976882, "epoch": 2.1181558308289965, "grad_norm": 8.75, "learning_rate": 1.924811767732586e-05, "loss": 1.1642, "mean_token_accuracy": 0.7044091790914535, "num_tokens": 33391654.0, "step": 14450 }, { "entropy": 1.0720260009169578, "epoch": 2.1196217840650884, "grad_norm": 9.1875, "learning_rate": 1.924616992056261e-05, "loss": 1.016, "mean_token_accuracy": 0.7296324372291565, "num_tokens": 33415496.0, "step": 14460 }, { "entropy": 1.2618780821561812, "epoch": 2.1210877373011803, "grad_norm": 9.8125, "learning_rate": 1.924421974303143e-05, "loss": 1.1859, "mean_token_accuracy": 0.70398730635643, "num_tokens": 33438495.0, "step": 14470 }, { "entropy": 1.1047347754240036, "epoch": 2.1225536905372717, "grad_norm": 9.25, "learning_rate": 1.92422671452429e-05, "loss": 1.0876, "mean_token_accuracy": 0.7281916588544846, "num_tokens": 33461390.0, "step": 14480 }, { "entropy": 1.0982863843441009, "epoch": 2.1240196437733636, "grad_norm": 7.0, "learning_rate": 1.924031212770824e-05, "loss": 1.005, "mean_token_accuracy": 0.7230430662631988, "num_tokens": 33486593.0, "step": 14490 }, { "entropy": 1.2864873707294464, "epoch": 2.1254855970094555, "grad_norm": 9.875, "learning_rate": 1.9238354690939293e-05, "loss": 1.2177, "mean_token_accuracy": 0.6890183746814728, "num_tokens": 33511030.0, "step": 14500 }, { "entropy": 1.2588951915502549, "epoch": 2.126951550245547, "grad_norm": 9.3125, "learning_rate": 1.923639483544855e-05, "loss": 1.2005, "mean_token_accuracy": 0.696376520395279, "num_tokens": 33534523.0, "step": 14510 }, { "entropy": 1.3541049093008042, "epoch": 2.128417503481639, "grad_norm": 10.25, "learning_rate": 1.9234432561749118e-05, "loss": 1.3227, "mean_token_accuracy": 0.6706753820180893, "num_tokens": 33556460.0, "step": 14520 }, { "entropy": 1.3638856887817383, "epoch": 2.1298834567177307, "grad_norm": 13.0, "learning_rate": 1.9232467870354748e-05, "loss": 1.3572, "mean_token_accuracy": 0.6635289937257767, "num_tokens": 33578333.0, "step": 14530 }, { "entropy": 1.229723232984543, "epoch": 2.1313494099538226, "grad_norm": 7.90625, "learning_rate": 1.9230500761779828e-05, "loss": 1.2119, "mean_token_accuracy": 0.6949746966361999, "num_tokens": 33601328.0, "step": 14540 }, { "entropy": 1.3037643358111382, "epoch": 2.132815363189914, "grad_norm": 12.1875, "learning_rate": 1.9228531236539363e-05, "loss": 1.2538, "mean_token_accuracy": 0.6873639494180679, "num_tokens": 33621797.0, "step": 14550 }, { "entropy": 1.2710185050964355, "epoch": 2.134281316426006, "grad_norm": 8.9375, "learning_rate": 1.922655929514901e-05, "loss": 1.2016, "mean_token_accuracy": 0.6944811016321182, "num_tokens": 33641919.0, "step": 14560 }, { "entropy": 1.3338551312685012, "epoch": 2.135747269662098, "grad_norm": 10.625, "learning_rate": 1.922458493812504e-05, "loss": 1.297, "mean_token_accuracy": 0.680743059515953, "num_tokens": 33665195.0, "step": 14570 }, { "entropy": 1.071255385875702, "epoch": 2.1372132228981897, "grad_norm": 9.25, "learning_rate": 1.922260816598437e-05, "loss": 1.0178, "mean_token_accuracy": 0.7259317070245743, "num_tokens": 33686928.0, "step": 14580 }, { "entropy": 1.2211802184581757, "epoch": 2.138679176134281, "grad_norm": 8.3125, "learning_rate": 1.9220628979244548e-05, "loss": 1.2137, "mean_token_accuracy": 0.6934998065233231, "num_tokens": 33707465.0, "step": 14590 }, { "entropy": 1.0362044990062713, "epoch": 2.140145129370373, "grad_norm": 8.8125, "learning_rate": 1.9218647378423748e-05, "loss": 1.0133, "mean_token_accuracy": 0.7400437593460083, "num_tokens": 33733107.0, "step": 14600 }, { "entropy": 1.0396047621965407, "epoch": 2.141611082606465, "grad_norm": 7.125, "learning_rate": 1.921666336404078e-05, "loss": 1.0136, "mean_token_accuracy": 0.730276808142662, "num_tokens": 33760530.0, "step": 14610 }, { "entropy": 1.1137039750814437, "epoch": 2.143077035842557, "grad_norm": 7.40625, "learning_rate": 1.921467693661508e-05, "loss": 1.1185, "mean_token_accuracy": 0.7244410574436188, "num_tokens": 33786151.0, "step": 14620 }, { "entropy": 1.0633327156305312, "epoch": 2.1445429890786483, "grad_norm": 9.375, "learning_rate": 1.921268809666672e-05, "loss": 0.9988, "mean_token_accuracy": 0.7267427980899811, "num_tokens": 33809015.0, "step": 14630 }, { "entropy": 0.9858867034316063, "epoch": 2.14600894231474, "grad_norm": 9.3125, "learning_rate": 1.9210696844716415e-05, "loss": 0.8947, "mean_token_accuracy": 0.7501487106084823, "num_tokens": 33836427.0, "step": 14640 }, { "entropy": 1.222510626912117, "epoch": 2.147474895550832, "grad_norm": 9.25, "learning_rate": 1.920870318128549e-05, "loss": 1.1719, "mean_token_accuracy": 0.7001291513442993, "num_tokens": 33857137.0, "step": 14650 }, { "entropy": 1.1132077246904373, "epoch": 2.148940848786924, "grad_norm": 9.0, "learning_rate": 1.920670710689592e-05, "loss": 1.0722, "mean_token_accuracy": 0.7250542640686035, "num_tokens": 33877917.0, "step": 14660 }, { "entropy": 1.2846489071846008, "epoch": 2.1504068020230154, "grad_norm": 8.75, "learning_rate": 1.9204708622070295e-05, "loss": 1.2299, "mean_token_accuracy": 0.6884357988834381, "num_tokens": 33902298.0, "step": 14670 }, { "entropy": 1.3133222252130508, "epoch": 2.1518727552591073, "grad_norm": 13.625, "learning_rate": 1.920270772733185e-05, "loss": 1.2832, "mean_token_accuracy": 0.6791722238063812, "num_tokens": 33922846.0, "step": 14680 }, { "entropy": 1.0107443749904632, "epoch": 2.153338708495199, "grad_norm": 8.3125, "learning_rate": 1.9200704423204445e-05, "loss": 0.963, "mean_token_accuracy": 0.743595364689827, "num_tokens": 33948379.0, "step": 14690 }, { "entropy": 1.1646183609962464, "epoch": 2.1548046617312906, "grad_norm": 10.625, "learning_rate": 1.919869871021257e-05, "loss": 1.1274, "mean_token_accuracy": 0.7051996439695358, "num_tokens": 33971350.0, "step": 14700 }, { "entropy": 1.0622032463550568, "epoch": 2.1562706149673825, "grad_norm": 10.0, "learning_rate": 1.919669058888135e-05, "loss": 1.0202, "mean_token_accuracy": 0.7272839188575745, "num_tokens": 33996716.0, "step": 14710 }, { "entropy": 1.1979274481534958, "epoch": 2.1577365682034744, "grad_norm": 8.4375, "learning_rate": 1.919468005973653e-05, "loss": 1.1943, "mean_token_accuracy": 0.6957827866077423, "num_tokens": 34018184.0, "step": 14720 }, { "entropy": 1.1767743080854416, "epoch": 2.1592025214395663, "grad_norm": 7.84375, "learning_rate": 1.9192667123304497e-05, "loss": 1.1582, "mean_token_accuracy": 0.7117189258337021, "num_tokens": 34044114.0, "step": 14730 }, { "entropy": 1.2413691192865373, "epoch": 2.1606684746756577, "grad_norm": 11.5, "learning_rate": 1.9190651780112267e-05, "loss": 1.2475, "mean_token_accuracy": 0.6957176774740219, "num_tokens": 34066321.0, "step": 14740 }, { "entropy": 1.1623576402664184, "epoch": 2.1621344279117496, "grad_norm": 9.0625, "learning_rate": 1.918863403068748e-05, "loss": 1.1079, "mean_token_accuracy": 0.7199676483869553, "num_tokens": 34091799.0, "step": 14750 }, { "entropy": 1.0280721604824066, "epoch": 2.1636003811478415, "grad_norm": 9.1875, "learning_rate": 1.918661387555841e-05, "loss": 0.9687, "mean_token_accuracy": 0.7455003052949906, "num_tokens": 34117785.0, "step": 14760 }, { "entropy": 1.3620914757251739, "epoch": 2.1650663343839334, "grad_norm": 9.0625, "learning_rate": 1.9184591315253958e-05, "loss": 1.2903, "mean_token_accuracy": 0.6770294278860092, "num_tokens": 34140808.0, "step": 14770 }, { "entropy": 1.2740073382854462, "epoch": 2.166532287620025, "grad_norm": 13.125, "learning_rate": 1.918256635030366e-05, "loss": 1.2078, "mean_token_accuracy": 0.6875900357961655, "num_tokens": 34159638.0, "step": 14780 }, { "entropy": 1.1485545814037323, "epoch": 2.1679982408561167, "grad_norm": 8.9375, "learning_rate": 1.9180538981237675e-05, "loss": 1.1222, "mean_token_accuracy": 0.718090146780014, "num_tokens": 34181736.0, "step": 14790 }, { "entropy": 1.217862504720688, "epoch": 2.1694641940922086, "grad_norm": 7.8125, "learning_rate": 1.9178509208586797e-05, "loss": 1.1177, "mean_token_accuracy": 0.7021241933107376, "num_tokens": 34203595.0, "step": 14800 }, { "entropy": 1.3331517726182938, "epoch": 2.1709301473283, "grad_norm": 9.8125, "learning_rate": 1.9176477032882445e-05, "loss": 1.2626, "mean_token_accuracy": 0.6840052843093872, "num_tokens": 34230266.0, "step": 14810 }, { "entropy": 0.9764844492077828, "epoch": 2.172396100564392, "grad_norm": 9.75, "learning_rate": 1.9174442454656673e-05, "loss": 0.8984, "mean_token_accuracy": 0.7491744428873062, "num_tokens": 34255574.0, "step": 14820 }, { "entropy": 1.0402263551950455, "epoch": 2.173862053800484, "grad_norm": 8.25, "learning_rate": 1.9172405474442154e-05, "loss": 1.0262, "mean_token_accuracy": 0.7356658607721329, "num_tokens": 34280841.0, "step": 14830 }, { "entropy": 1.2651427373290063, "epoch": 2.1753280070365757, "grad_norm": 10.375, "learning_rate": 1.91703660927722e-05, "loss": 1.2228, "mean_token_accuracy": 0.6894789814949036, "num_tokens": 34304359.0, "step": 14840 }, { "entropy": 1.2059030473232268, "epoch": 2.176793960272667, "grad_norm": 9.625, "learning_rate": 1.9168324310180748e-05, "loss": 1.1726, "mean_token_accuracy": 0.7051484882831573, "num_tokens": 34328280.0, "step": 14850 }, { "entropy": 0.9252061367034912, "epoch": 2.178259913508759, "grad_norm": 7.09375, "learning_rate": 1.9166280127202365e-05, "loss": 0.9188, "mean_token_accuracy": 0.7626416712999344, "num_tokens": 34353210.0, "step": 14860 }, { "entropy": 1.1455406367778778, "epoch": 2.179725866744851, "grad_norm": 9.5625, "learning_rate": 1.9164233544372238e-05, "loss": 1.1104, "mean_token_accuracy": 0.7200254201889038, "num_tokens": 34376485.0, "step": 14870 }, { "entropy": 1.1537156581878663, "epoch": 2.181191819980943, "grad_norm": 11.5625, "learning_rate": 1.9162184562226193e-05, "loss": 1.1412, "mean_token_accuracy": 0.7156736880540848, "num_tokens": 34402825.0, "step": 14880 }, { "entropy": 1.1548157215118409, "epoch": 2.1826577732170342, "grad_norm": 11.25, "learning_rate": 1.916013318130068e-05, "loss": 1.1123, "mean_token_accuracy": 0.7211978182196617, "num_tokens": 34429755.0, "step": 14890 }, { "entropy": 1.1312684565782547, "epoch": 2.184123726453126, "grad_norm": 9.875, "learning_rate": 1.9158079402132783e-05, "loss": 1.095, "mean_token_accuracy": 0.7179067969322205, "num_tokens": 34452844.0, "step": 14900 }, { "entropy": 1.1824084758758544, "epoch": 2.185589679689218, "grad_norm": 8.375, "learning_rate": 1.91560232252602e-05, "loss": 1.1096, "mean_token_accuracy": 0.705046021938324, "num_tokens": 34478367.0, "step": 14910 }, { "entropy": 1.1623460724949837, "epoch": 2.1870556329253095, "grad_norm": 11.1875, "learning_rate": 1.9153964651221266e-05, "loss": 1.1328, "mean_token_accuracy": 0.7130699992179871, "num_tokens": 34499357.0, "step": 14920 }, { "entropy": 1.1802103579044343, "epoch": 2.1885215861614014, "grad_norm": 7.1875, "learning_rate": 1.9151903680554947e-05, "loss": 1.1763, "mean_token_accuracy": 0.7065380930900573, "num_tokens": 34520975.0, "step": 14930 }, { "entropy": 1.1365110725164413, "epoch": 2.1899875393974932, "grad_norm": 9.8125, "learning_rate": 1.9149840313800828e-05, "loss": 1.0753, "mean_token_accuracy": 0.7192366391420364, "num_tokens": 34547512.0, "step": 14940 }, { "entropy": 1.1495123296976089, "epoch": 2.191453492633585, "grad_norm": 10.4375, "learning_rate": 1.914777455149913e-05, "loss": 1.1171, "mean_token_accuracy": 0.711624014377594, "num_tokens": 34570347.0, "step": 14950 }, { "entropy": 1.2493381172418594, "epoch": 2.1929194458696766, "grad_norm": 16.625, "learning_rate": 1.9145706394190687e-05, "loss": 1.2327, "mean_token_accuracy": 0.694169995188713, "num_tokens": 34590109.0, "step": 14960 }, { "entropy": 1.1901148304343223, "epoch": 2.1943853991057685, "grad_norm": 9.4375, "learning_rate": 1.914363584241698e-05, "loss": 1.1194, "mean_token_accuracy": 0.7093828380107879, "num_tokens": 34612868.0, "step": 14970 }, { "entropy": 1.1088495254516602, "epoch": 2.1958513523418604, "grad_norm": 7.71875, "learning_rate": 1.9141562896720102e-05, "loss": 1.0675, "mean_token_accuracy": 0.7251914203166961, "num_tokens": 34638629.0, "step": 14980 }, { "entropy": 1.1290393024682999, "epoch": 2.1973173055779522, "grad_norm": 10.375, "learning_rate": 1.913948755764278e-05, "loss": 1.0782, "mean_token_accuracy": 0.7157484114170074, "num_tokens": 34662521.0, "step": 14990 }, { "entropy": 1.0471702337265014, "epoch": 2.1987832588140437, "grad_norm": 8.4375, "learning_rate": 1.913740982572836e-05, "loss": 0.9583, "mean_token_accuracy": 0.7382917076349258, "num_tokens": 34685655.0, "step": 15000 }, { "entropy": 1.109531706571579, "epoch": 2.2002492120501356, "grad_norm": 14.5, "learning_rate": 1.913532970152082e-05, "loss": 1.0263, "mean_token_accuracy": 0.7219138264656066, "num_tokens": 34709355.0, "step": 15010 }, { "entropy": 1.2683644413948059, "epoch": 2.2017151652862275, "grad_norm": 10.625, "learning_rate": 1.913324718556477e-05, "loss": 1.2387, "mean_token_accuracy": 0.6903713583946228, "num_tokens": 34729140.0, "step": 15020 }, { "entropy": 1.1343570336699487, "epoch": 2.203181118522319, "grad_norm": 9.4375, "learning_rate": 1.9131162278405432e-05, "loss": 1.0985, "mean_token_accuracy": 0.713887295126915, "num_tokens": 34751417.0, "step": 15030 }, { "entropy": 1.1217906922101974, "epoch": 2.204647071758411, "grad_norm": 8.375, "learning_rate": 1.9129074980588666e-05, "loss": 1.0672, "mean_token_accuracy": 0.7277366101741791, "num_tokens": 34773493.0, "step": 15040 }, { "entropy": 1.3060382783412934, "epoch": 2.2061130249945027, "grad_norm": 6.6875, "learning_rate": 1.912698529266095e-05, "loss": 1.2265, "mean_token_accuracy": 0.6823583364486694, "num_tokens": 34791756.0, "step": 15050 }, { "entropy": 1.102049422264099, "epoch": 2.2075789782305946, "grad_norm": 7.6875, "learning_rate": 1.91248932151694e-05, "loss": 1.1005, "mean_token_accuracy": 0.7178501337766647, "num_tokens": 34815735.0, "step": 15060 }, { "entropy": 1.1999853134155274, "epoch": 2.209044931466686, "grad_norm": 10.25, "learning_rate": 1.9122798748661743e-05, "loss": 1.1232, "mean_token_accuracy": 0.70890291929245, "num_tokens": 34838269.0, "step": 15070 }, { "entropy": 1.1562822937965394, "epoch": 2.210510884702778, "grad_norm": 8.75, "learning_rate": 1.9120701893686334e-05, "loss": 1.1734, "mean_token_accuracy": 0.705057218670845, "num_tokens": 34862300.0, "step": 15080 }, { "entropy": 1.2737075358629226, "epoch": 2.21197683793887, "grad_norm": 8.125, "learning_rate": 1.9118602650792163e-05, "loss": 1.2513, "mean_token_accuracy": 0.6878321677446365, "num_tokens": 34884480.0, "step": 15090 }, { "entropy": 1.1614771634340286, "epoch": 2.2134427911749617, "grad_norm": 13.0625, "learning_rate": 1.9116501020528838e-05, "loss": 1.1554, "mean_token_accuracy": 0.7147576302289963, "num_tokens": 34904508.0, "step": 15100 }, { "entropy": 1.1583786010742188, "epoch": 2.214908744411053, "grad_norm": 9.375, "learning_rate": 1.9114397003446594e-05, "loss": 1.0969, "mean_token_accuracy": 0.7193308055400849, "num_tokens": 34926885.0, "step": 15110 }, { "entropy": 1.2691397070884705, "epoch": 2.216374697647145, "grad_norm": 11.4375, "learning_rate": 1.911229060009629e-05, "loss": 1.2136, "mean_token_accuracy": 0.6951009452342987, "num_tokens": 34949997.0, "step": 15120 }, { "entropy": 1.2050824850797652, "epoch": 2.217840650883237, "grad_norm": 7.875, "learning_rate": 1.9110181811029405e-05, "loss": 1.2169, "mean_token_accuracy": 0.7007272630929947, "num_tokens": 34972299.0, "step": 15130 }, { "entropy": 1.1708863496780395, "epoch": 2.219306604119329, "grad_norm": 8.25, "learning_rate": 1.9108070636798054e-05, "loss": 1.1331, "mean_token_accuracy": 0.7081922888755798, "num_tokens": 34996702.0, "step": 15140 }, { "entropy": 1.1189543545246123, "epoch": 2.2207725573554202, "grad_norm": 7.34375, "learning_rate": 1.910595707795497e-05, "loss": 1.1083, "mean_token_accuracy": 0.7171375095844269, "num_tokens": 35019722.0, "step": 15150 }, { "entropy": 1.105293546617031, "epoch": 2.222238510591512, "grad_norm": 8.625, "learning_rate": 1.9103841135053502e-05, "loss": 1.0501, "mean_token_accuracy": 0.7264099717140198, "num_tokens": 35045160.0, "step": 15160 }, { "entropy": 1.068185755610466, "epoch": 2.223704463827604, "grad_norm": 7.3125, "learning_rate": 1.910172280864764e-05, "loss": 1.0202, "mean_token_accuracy": 0.725193664431572, "num_tokens": 35067319.0, "step": 15170 }, { "entropy": 1.1074829339981078, "epoch": 2.225170417063696, "grad_norm": 13.0625, "learning_rate": 1.9099602099291985e-05, "loss": 1.13, "mean_token_accuracy": 0.7186817467212677, "num_tokens": 35091698.0, "step": 15180 }, { "entropy": 1.1041037619113923, "epoch": 2.2266363702997873, "grad_norm": 9.3125, "learning_rate": 1.9097479007541772e-05, "loss": 1.0308, "mean_token_accuracy": 0.7243407219648361, "num_tokens": 35115029.0, "step": 15190 }, { "entropy": 1.1945430785417557, "epoch": 2.2281023235358792, "grad_norm": 7.03125, "learning_rate": 1.9095353533952844e-05, "loss": 1.1803, "mean_token_accuracy": 0.7001925736665726, "num_tokens": 35138635.0, "step": 15200 }, { "entropy": 1.2337313562631607, "epoch": 2.229568276771971, "grad_norm": 13.375, "learning_rate": 1.9093225679081686e-05, "loss": 1.2024, "mean_token_accuracy": 0.6936593383550644, "num_tokens": 35162879.0, "step": 15210 }, { "entropy": 1.1157830685377121, "epoch": 2.2310342300080626, "grad_norm": 7.5625, "learning_rate": 1.9091095443485398e-05, "loss": 1.1005, "mean_token_accuracy": 0.7200670629739762, "num_tokens": 35187029.0, "step": 15220 }, { "entropy": 1.1804859325289727, "epoch": 2.2325001832441544, "grad_norm": 8.6875, "learning_rate": 1.90889628277217e-05, "loss": 1.1034, "mean_token_accuracy": 0.713496720790863, "num_tokens": 35211676.0, "step": 15230 }, { "entropy": 1.0793717086315155, "epoch": 2.2339661364802463, "grad_norm": 9.3125, "learning_rate": 1.908682783234894e-05, "loss": 1.0182, "mean_token_accuracy": 0.7263741165399551, "num_tokens": 35236572.0, "step": 15240 }, { "entropy": 1.158864051103592, "epoch": 2.235432089716338, "grad_norm": 9.875, "learning_rate": 1.9084690457926086e-05, "loss": 1.1322, "mean_token_accuracy": 0.7129698067903518, "num_tokens": 35259174.0, "step": 15250 }, { "entropy": 1.1682104498147965, "epoch": 2.2368980429524297, "grad_norm": 12.25, "learning_rate": 1.9082550705012735e-05, "loss": 1.1285, "mean_token_accuracy": 0.7097507387399673, "num_tokens": 35281680.0, "step": 15260 }, { "entropy": 1.1757471591234208, "epoch": 2.2383639961885216, "grad_norm": 8.1875, "learning_rate": 1.90804085741691e-05, "loss": 1.0994, "mean_token_accuracy": 0.715869328379631, "num_tokens": 35305077.0, "step": 15270 }, { "entropy": 1.1441166937351226, "epoch": 2.2398299494246134, "grad_norm": 7.8125, "learning_rate": 1.9078264065956014e-05, "loss": 1.0781, "mean_token_accuracy": 0.7208972156047821, "num_tokens": 35332035.0, "step": 15280 }, { "entropy": 1.3589776128530502, "epoch": 2.2412959026607053, "grad_norm": 12.5625, "learning_rate": 1.9076117180934943e-05, "loss": 1.3353, "mean_token_accuracy": 0.6822304114699363, "num_tokens": 35351445.0, "step": 15290 }, { "entropy": 1.3513107895851135, "epoch": 2.2427618558967968, "grad_norm": 10.1875, "learning_rate": 1.907396791966797e-05, "loss": 1.2951, "mean_token_accuracy": 0.6744530662894249, "num_tokens": 35371967.0, "step": 15300 }, { "entropy": 1.2844993680715562, "epoch": 2.2442278091328887, "grad_norm": 8.3125, "learning_rate": 1.9071816282717798e-05, "loss": 1.2063, "mean_token_accuracy": 0.6938290178775788, "num_tokens": 35394249.0, "step": 15310 }, { "entropy": 1.3251852691173553, "epoch": 2.2456937623689806, "grad_norm": 10.6875, "learning_rate": 1.906966227064775e-05, "loss": 1.2907, "mean_token_accuracy": 0.6791919648647309, "num_tokens": 35416518.0, "step": 15320 }, { "entropy": 1.1282271862030029, "epoch": 2.247159715605072, "grad_norm": 9.1875, "learning_rate": 1.906750588402178e-05, "loss": 1.1077, "mean_token_accuracy": 0.7201898634433747, "num_tokens": 35441397.0, "step": 15330 }, { "entropy": 1.185820385813713, "epoch": 2.248625668841164, "grad_norm": 7.59375, "learning_rate": 1.906534712340445e-05, "loss": 1.1552, "mean_token_accuracy": 0.7063204497098923, "num_tokens": 35465290.0, "step": 15340 }, { "entropy": 1.189033830165863, "epoch": 2.2500916220772558, "grad_norm": 8.125, "learning_rate": 1.9063185989360967e-05, "loss": 1.1622, "mean_token_accuracy": 0.7044997662305832, "num_tokens": 35489392.0, "step": 15350 }, { "entropy": 1.0977874755859376, "epoch": 2.2515575753133477, "grad_norm": 11.6875, "learning_rate": 1.906102248245713e-05, "loss": 1.0668, "mean_token_accuracy": 0.7195299208164215, "num_tokens": 35513466.0, "step": 15360 }, { "entropy": 1.0491891831159592, "epoch": 2.253023528549439, "grad_norm": 11.625, "learning_rate": 1.9058856603259377e-05, "loss": 0.992, "mean_token_accuracy": 0.7329804629087449, "num_tokens": 35537988.0, "step": 15370 }, { "entropy": 1.0154206812381745, "epoch": 2.254489481785531, "grad_norm": 6.5625, "learning_rate": 1.9056688352334766e-05, "loss": 0.9421, "mean_token_accuracy": 0.7437652796506882, "num_tokens": 35567555.0, "step": 15380 }, { "entropy": 1.243457278609276, "epoch": 2.255955435021623, "grad_norm": 9.25, "learning_rate": 1.905451773025097e-05, "loss": 1.1888, "mean_token_accuracy": 0.6886635094881057, "num_tokens": 35586089.0, "step": 15390 }, { "entropy": 1.4042161241173745, "epoch": 2.2574213882577148, "grad_norm": 11.4375, "learning_rate": 1.9052344737576288e-05, "loss": 1.4197, "mean_token_accuracy": 0.667649795114994, "num_tokens": 35609634.0, "step": 15400 }, { "entropy": 1.1075396329164504, "epoch": 2.258887341493806, "grad_norm": 8.1875, "learning_rate": 1.905016937487964e-05, "loss": 1.0739, "mean_token_accuracy": 0.7184066981077194, "num_tokens": 35633473.0, "step": 15410 }, { "entropy": 1.1243178322911263, "epoch": 2.260353294729898, "grad_norm": 13.5625, "learning_rate": 1.9047991642730562e-05, "loss": 1.1192, "mean_token_accuracy": 0.7206156641244889, "num_tokens": 35655844.0, "step": 15420 }, { "entropy": 1.2852006569504737, "epoch": 2.26181924796599, "grad_norm": 9.1875, "learning_rate": 1.9045811541699213e-05, "loss": 1.219, "mean_token_accuracy": 0.6905445247888565, "num_tokens": 35680840.0, "step": 15430 }, { "entropy": 1.2797663778066635, "epoch": 2.2632852012020814, "grad_norm": 11.6875, "learning_rate": 1.904362907235637e-05, "loss": 1.2724, "mean_token_accuracy": 0.6890873640775681, "num_tokens": 35705012.0, "step": 15440 }, { "entropy": 1.0583030253648757, "epoch": 2.2647511544381733, "grad_norm": 11.5, "learning_rate": 1.9041444235273437e-05, "loss": 1.022, "mean_token_accuracy": 0.7331082403659821, "num_tokens": 35730607.0, "step": 15450 }, { "entropy": 1.2870960891246797, "epoch": 2.266217107674265, "grad_norm": 8.4375, "learning_rate": 1.9039257031022424e-05, "loss": 1.2697, "mean_token_accuracy": 0.6846460223197937, "num_tokens": 35754506.0, "step": 15460 }, { "entropy": 1.24292651116848, "epoch": 2.267683060910357, "grad_norm": 12.0625, "learning_rate": 1.903706746017598e-05, "loss": 1.2185, "mean_token_accuracy": 0.6967544168233871, "num_tokens": 35774502.0, "step": 15470 }, { "entropy": 1.0762374848127365, "epoch": 2.2691490141464485, "grad_norm": 7.25, "learning_rate": 1.9034875523307358e-05, "loss": 1.0011, "mean_token_accuracy": 0.7270061492919921, "num_tokens": 35798286.0, "step": 15480 }, { "entropy": 1.2125924736261369, "epoch": 2.2706149673825404, "grad_norm": 12.625, "learning_rate": 1.903268122099043e-05, "loss": 1.1553, "mean_token_accuracy": 0.7056877106428147, "num_tokens": 35820075.0, "step": 15490 }, { "entropy": 1.1466798812150956, "epoch": 2.2720809206186323, "grad_norm": 8.3125, "learning_rate": 1.9030484553799705e-05, "loss": 1.0867, "mean_token_accuracy": 0.7100672841072082, "num_tokens": 35843832.0, "step": 15500 }, { "entropy": 1.2764698207378387, "epoch": 2.273546873854724, "grad_norm": 10.375, "learning_rate": 1.9028285522310295e-05, "loss": 1.2558, "mean_token_accuracy": 0.6976595342159271, "num_tokens": 35867054.0, "step": 15510 }, { "entropy": 1.1503947168588637, "epoch": 2.2750128270908156, "grad_norm": 9.5, "learning_rate": 1.902608412709793e-05, "loss": 1.1074, "mean_token_accuracy": 0.7087903261184693, "num_tokens": 35889272.0, "step": 15520 }, { "entropy": 1.1799424856901168, "epoch": 2.2764787803269075, "grad_norm": 6.4375, "learning_rate": 1.9023880368738966e-05, "loss": 1.1616, "mean_token_accuracy": 0.7153949037194252, "num_tokens": 35912110.0, "step": 15530 }, { "entropy": 1.309221053123474, "epoch": 2.2779447335629994, "grad_norm": 10.4375, "learning_rate": 1.902167424781038e-05, "loss": 1.272, "mean_token_accuracy": 0.6813871681690216, "num_tokens": 35935581.0, "step": 15540 }, { "entropy": 1.1924454867839813, "epoch": 2.279410686799091, "grad_norm": 13.75, "learning_rate": 1.9019465764889756e-05, "loss": 1.1471, "mean_token_accuracy": 0.711710524559021, "num_tokens": 35954269.0, "step": 15550 }, { "entropy": 1.1540218412876129, "epoch": 2.2808766400351828, "grad_norm": 7.46875, "learning_rate": 1.901725492055531e-05, "loss": 1.0868, "mean_token_accuracy": 0.7078810870647431, "num_tokens": 35976603.0, "step": 15560 }, { "entropy": 1.1097116202116013, "epoch": 2.2823425932712746, "grad_norm": 6.125, "learning_rate": 1.9015041715385872e-05, "loss": 1.0602, "mean_token_accuracy": 0.7244650959968567, "num_tokens": 35999664.0, "step": 15570 }, { "entropy": 1.0066423267126083, "epoch": 2.2838085465073665, "grad_norm": 7.1875, "learning_rate": 1.9012826149960878e-05, "loss": 0.959, "mean_token_accuracy": 0.7456923097372055, "num_tokens": 36024001.0, "step": 15580 }, { "entropy": 1.320574812591076, "epoch": 2.2852744997434584, "grad_norm": 7.46875, "learning_rate": 1.90106082248604e-05, "loss": 1.2607, "mean_token_accuracy": 0.6835837543010712, "num_tokens": 36047275.0, "step": 15590 }, { "entropy": 1.1097445338964462, "epoch": 2.28674045297955, "grad_norm": 9.4375, "learning_rate": 1.9008387940665115e-05, "loss": 1.0693, "mean_token_accuracy": 0.7174741327762604, "num_tokens": 36072758.0, "step": 15600 }, { "entropy": 1.2833382457494735, "epoch": 2.2882064062156418, "grad_norm": 14.5625, "learning_rate": 1.9006165297956326e-05, "loss": 1.2145, "mean_token_accuracy": 0.6927435040473938, "num_tokens": 36093078.0, "step": 15610 }, { "entropy": 1.2626068055629731, "epoch": 2.2896723594517336, "grad_norm": 7.34375, "learning_rate": 1.9003940297315953e-05, "loss": 1.2189, "mean_token_accuracy": 0.6982035920023918, "num_tokens": 36115512.0, "step": 15620 }, { "entropy": 1.0698741853237153, "epoch": 2.291138312687825, "grad_norm": 7.53125, "learning_rate": 1.900171293932652e-05, "loss": 1.0573, "mean_token_accuracy": 0.7319156169891358, "num_tokens": 36141427.0, "step": 15630 }, { "entropy": 1.169370210170746, "epoch": 2.292604265923917, "grad_norm": 9.8125, "learning_rate": 1.8999483224571186e-05, "loss": 1.1402, "mean_token_accuracy": 0.7128853976726532, "num_tokens": 36165296.0, "step": 15640 }, { "entropy": 1.2899587452411652, "epoch": 2.294070219160009, "grad_norm": 14.5625, "learning_rate": 1.899725115363372e-05, "loss": 1.2737, "mean_token_accuracy": 0.684854581952095, "num_tokens": 36187470.0, "step": 15650 }, { "entropy": 1.1334260046482085, "epoch": 2.2955361723961007, "grad_norm": 9.125, "learning_rate": 1.8995016727098504e-05, "loss": 1.1529, "mean_token_accuracy": 0.7176352083683014, "num_tokens": 36207899.0, "step": 15660 }, { "entropy": 1.236479638516903, "epoch": 2.297002125632192, "grad_norm": 8.0625, "learning_rate": 1.899277994555054e-05, "loss": 1.1763, "mean_token_accuracy": 0.6943147897720336, "num_tokens": 36234451.0, "step": 15670 }, { "entropy": 1.314685547351837, "epoch": 2.298468078868284, "grad_norm": 10.4375, "learning_rate": 1.899054080957545e-05, "loss": 1.2809, "mean_token_accuracy": 0.6846749693155288, "num_tokens": 36256184.0, "step": 15680 }, { "entropy": 1.2582970529794693, "epoch": 2.299934032104376, "grad_norm": 9.5625, "learning_rate": 1.8988299319759464e-05, "loss": 1.2183, "mean_token_accuracy": 0.693334910273552, "num_tokens": 36281162.0, "step": 15690 }, { "entropy": 1.3625812441110612, "epoch": 2.301399985340468, "grad_norm": 16.375, "learning_rate": 1.8986055476689432e-05, "loss": 1.3521, "mean_token_accuracy": 0.6706550776958465, "num_tokens": 36302115.0, "step": 15700 }, { "entropy": 1.161108911037445, "epoch": 2.3028659385765593, "grad_norm": 11.625, "learning_rate": 1.8983809280952833e-05, "loss": 1.1187, "mean_token_accuracy": 0.7163513243198395, "num_tokens": 36325327.0, "step": 15710 }, { "entropy": 1.2271632105112076, "epoch": 2.304331891812651, "grad_norm": 9.4375, "learning_rate": 1.8981560733137735e-05, "loss": 1.1647, "mean_token_accuracy": 0.6919648081064225, "num_tokens": 36345781.0, "step": 15720 }, { "entropy": 1.175689634680748, "epoch": 2.305797845048743, "grad_norm": 11.125, "learning_rate": 1.8979309833832855e-05, "loss": 1.1234, "mean_token_accuracy": 0.7055793792009354, "num_tokens": 36368976.0, "step": 15730 }, { "entropy": 1.033715397119522, "epoch": 2.3072637982848345, "grad_norm": 9.625, "learning_rate": 1.8977056583627488e-05, "loss": 1.0105, "mean_token_accuracy": 0.7382365822792053, "num_tokens": 36391097.0, "step": 15740 }, { "entropy": 1.1694593220949172, "epoch": 2.3087297515209264, "grad_norm": 11.8125, "learning_rate": 1.8974800983111575e-05, "loss": 1.1411, "mean_token_accuracy": 0.7097324162721634, "num_tokens": 36412996.0, "step": 15750 }, { "entropy": 1.1372269988059998, "epoch": 2.3101957047570183, "grad_norm": 6.5, "learning_rate": 1.8972543032875665e-05, "loss": 1.0886, "mean_token_accuracy": 0.715478989481926, "num_tokens": 36437569.0, "step": 15760 }, { "entropy": 1.2303670078516007, "epoch": 2.31166165799311, "grad_norm": 10.0, "learning_rate": 1.8970282733510913e-05, "loss": 1.1436, "mean_token_accuracy": 0.7019165515899658, "num_tokens": 36460142.0, "step": 15770 }, { "entropy": 1.1364569157361983, "epoch": 2.3131276112292016, "grad_norm": 9.8125, "learning_rate": 1.8968020085609095e-05, "loss": 1.1161, "mean_token_accuracy": 0.7230620950460434, "num_tokens": 36483546.0, "step": 15780 }, { "entropy": 1.1822098553180695, "epoch": 2.3145935644652935, "grad_norm": 7.71875, "learning_rate": 1.8965755089762603e-05, "loss": 1.1198, "mean_token_accuracy": 0.7093426406383514, "num_tokens": 36508320.0, "step": 15790 }, { "entropy": 1.4230523228645324, "epoch": 2.3160595177013854, "grad_norm": 8.125, "learning_rate": 1.896348774656444e-05, "loss": 1.4022, "mean_token_accuracy": 0.6568196207284928, "num_tokens": 36530069.0, "step": 15800 }, { "entropy": 1.0934678196907044, "epoch": 2.3175254709374773, "grad_norm": 9.0, "learning_rate": 1.8961218056608235e-05, "loss": 1.0576, "mean_token_accuracy": 0.7262182533740997, "num_tokens": 36551546.0, "step": 15810 }, { "entropy": 1.0758361756801604, "epoch": 2.3189914241735687, "grad_norm": 6.3125, "learning_rate": 1.895894602048821e-05, "loss": 1.0179, "mean_token_accuracy": 0.7258591622114181, "num_tokens": 36576681.0, "step": 15820 }, { "entropy": 1.1516639932990074, "epoch": 2.3204573774096606, "grad_norm": 11.0, "learning_rate": 1.8956671638799223e-05, "loss": 1.1457, "mean_token_accuracy": 0.7035455375909805, "num_tokens": 36597885.0, "step": 15830 }, { "entropy": 1.0442392274737358, "epoch": 2.3219233306457525, "grad_norm": 7.40625, "learning_rate": 1.8954394912136733e-05, "loss": 0.9831, "mean_token_accuracy": 0.7307748675346375, "num_tokens": 36621240.0, "step": 15840 }, { "entropy": 1.2751971364021302, "epoch": 2.323389283881844, "grad_norm": 12.375, "learning_rate": 1.8952115841096814e-05, "loss": 1.2094, "mean_token_accuracy": 0.6924892276525497, "num_tokens": 36642346.0, "step": 15850 }, { "entropy": 1.0002491265535354, "epoch": 2.324855237117936, "grad_norm": 7.5625, "learning_rate": 1.8949834426276164e-05, "loss": 0.9828, "mean_token_accuracy": 0.7462838590145111, "num_tokens": 36668683.0, "step": 15860 }, { "entropy": 1.0605436533689498, "epoch": 2.3263211903540277, "grad_norm": 9.3125, "learning_rate": 1.894755066827208e-05, "loss": 1.0423, "mean_token_accuracy": 0.7270126730203629, "num_tokens": 36696284.0, "step": 15870 }, { "entropy": 1.089755180478096, "epoch": 2.3277871435901196, "grad_norm": 8.375, "learning_rate": 1.8945264567682485e-05, "loss": 1.0572, "mean_token_accuracy": 0.7259571999311447, "num_tokens": 36719411.0, "step": 15880 }, { "entropy": 1.193571151793003, "epoch": 2.329253096826211, "grad_norm": 8.1875, "learning_rate": 1.8942976125105905e-05, "loss": 1.1454, "mean_token_accuracy": 0.700043597817421, "num_tokens": 36741129.0, "step": 15890 }, { "entropy": 1.2322057217359543, "epoch": 2.330719050062303, "grad_norm": 7.65625, "learning_rate": 1.8940685341141488e-05, "loss": 1.2047, "mean_token_accuracy": 0.6951000779867172, "num_tokens": 36762667.0, "step": 15900 }, { "entropy": 1.2621253818273543, "epoch": 2.332185003298395, "grad_norm": 13.4375, "learning_rate": 1.893839221638899e-05, "loss": 1.2223, "mean_token_accuracy": 0.6970056340098381, "num_tokens": 36786974.0, "step": 15910 }, { "entropy": 1.264732950925827, "epoch": 2.3336509565344867, "grad_norm": 8.875, "learning_rate": 1.893609675144878e-05, "loss": 1.2386, "mean_token_accuracy": 0.6966810196638107, "num_tokens": 36808198.0, "step": 15920 }, { "entropy": 1.130670040845871, "epoch": 2.335116909770578, "grad_norm": 8.5625, "learning_rate": 1.893379894692184e-05, "loss": 1.0601, "mean_token_accuracy": 0.718412634730339, "num_tokens": 36830712.0, "step": 15930 }, { "entropy": 1.0929778426885606, "epoch": 2.33658286300667, "grad_norm": 9.0, "learning_rate": 1.893149880340977e-05, "loss": 1.0815, "mean_token_accuracy": 0.7225905239582062, "num_tokens": 36850909.0, "step": 15940 }, { "entropy": 1.0783461660146714, "epoch": 2.338048816242762, "grad_norm": 9.25, "learning_rate": 1.892919632151477e-05, "loss": 1.0207, "mean_token_accuracy": 0.7261549830436707, "num_tokens": 36874429.0, "step": 15950 }, { "entropy": 1.1114408761262893, "epoch": 2.3395147694788534, "grad_norm": 8.5625, "learning_rate": 1.8926891501839668e-05, "loss": 1.0505, "mean_token_accuracy": 0.7237543314695358, "num_tokens": 36898866.0, "step": 15960 }, { "entropy": 1.0969808965921402, "epoch": 2.3409807227149453, "grad_norm": 11.75, "learning_rate": 1.892458434498789e-05, "loss": 1.0879, "mean_token_accuracy": 0.7192880511283875, "num_tokens": 36923045.0, "step": 15970 }, { "entropy": 1.1702938005328178, "epoch": 2.342446675951037, "grad_norm": 12.0625, "learning_rate": 1.892227485156348e-05, "loss": 1.1587, "mean_token_accuracy": 0.7162327080965042, "num_tokens": 36947342.0, "step": 15980 }, { "entropy": 1.1215061843395233, "epoch": 2.343912629187129, "grad_norm": 8.5625, "learning_rate": 1.8919963022171098e-05, "loss": 1.0613, "mean_token_accuracy": 0.7175217717885971, "num_tokens": 36969151.0, "step": 15990 }, { "entropy": 1.190552493929863, "epoch": 2.345378582423221, "grad_norm": 9.125, "learning_rate": 1.8917648857416007e-05, "loss": 1.1489, "mean_token_accuracy": 0.7074726313352585, "num_tokens": 36991936.0, "step": 16000 }, { "entropy": 1.1698262959718704, "epoch": 2.3468445356593124, "grad_norm": 7.78125, "learning_rate": 1.8915332357904084e-05, "loss": 1.1272, "mean_token_accuracy": 0.707430151104927, "num_tokens": 37015467.0, "step": 16010 }, { "entropy": 1.2283308774232864, "epoch": 2.3483104888954043, "grad_norm": 7.0625, "learning_rate": 1.891301352424183e-05, "loss": 1.2127, "mean_token_accuracy": 0.6964597254991531, "num_tokens": 37037911.0, "step": 16020 }, { "entropy": 1.0687168806791305, "epoch": 2.349776442131496, "grad_norm": 9.5625, "learning_rate": 1.891069235703633e-05, "loss": 1.0161, "mean_token_accuracy": 0.734435784816742, "num_tokens": 37061579.0, "step": 16030 }, { "entropy": 1.1253724962472915, "epoch": 2.3512423953675876, "grad_norm": 8.75, "learning_rate": 1.8908368856895303e-05, "loss": 1.0999, "mean_token_accuracy": 0.7266558885574341, "num_tokens": 37088345.0, "step": 16040 }, { "entropy": 1.1866006791591643, "epoch": 2.3527083486036795, "grad_norm": 12.375, "learning_rate": 1.8906043024427078e-05, "loss": 1.1447, "mean_token_accuracy": 0.7044459730386734, "num_tokens": 37108359.0, "step": 16050 }, { "entropy": 1.097095674276352, "epoch": 2.3541743018397714, "grad_norm": 9.9375, "learning_rate": 1.8903714860240578e-05, "loss": 0.9991, "mean_token_accuracy": 0.7237568080425263, "num_tokens": 37131559.0, "step": 16060 }, { "entropy": 0.9474169567227364, "epoch": 2.355640255075863, "grad_norm": 9.875, "learning_rate": 1.890138436494535e-05, "loss": 0.8952, "mean_token_accuracy": 0.7499590814113617, "num_tokens": 37155708.0, "step": 16070 }, { "entropy": 1.2822959303855896, "epoch": 2.3571062083119547, "grad_norm": 8.4375, "learning_rate": 1.889905153915155e-05, "loss": 1.2262, "mean_token_accuracy": 0.6924429774284363, "num_tokens": 37177783.0, "step": 16080 }, { "entropy": 1.3256672024726868, "epoch": 2.3585721615480466, "grad_norm": 12.375, "learning_rate": 1.889671638346994e-05, "loss": 1.2847, "mean_token_accuracy": 0.6880932956933975, "num_tokens": 37198458.0, "step": 16090 }, { "entropy": 1.1305772244930268, "epoch": 2.3600381147841385, "grad_norm": 7.78125, "learning_rate": 1.88943788985119e-05, "loss": 1.0905, "mean_token_accuracy": 0.7210603177547454, "num_tokens": 37223319.0, "step": 16100 }, { "entropy": 1.0910063415765763, "epoch": 2.3615040680202304, "grad_norm": 9.125, "learning_rate": 1.8892039084889403e-05, "loss": 1.0152, "mean_token_accuracy": 0.7275571644306182, "num_tokens": 37246152.0, "step": 16110 }, { "entropy": 1.151190048456192, "epoch": 2.362970021256322, "grad_norm": 6.5, "learning_rate": 1.8889696943215055e-05, "loss": 1.1531, "mean_token_accuracy": 0.7140974700450897, "num_tokens": 37268356.0, "step": 16120 }, { "entropy": 1.228389510512352, "epoch": 2.3644359744924137, "grad_norm": 9.75, "learning_rate": 1.8887352474102054e-05, "loss": 1.1834, "mean_token_accuracy": 0.6954966813325882, "num_tokens": 37291028.0, "step": 16130 }, { "entropy": 1.1466989621520043, "epoch": 2.3659019277285056, "grad_norm": 9.875, "learning_rate": 1.888500567816421e-05, "loss": 1.0925, "mean_token_accuracy": 0.7187341809272766, "num_tokens": 37314141.0, "step": 16140 }, { "entropy": 1.2922033369541168, "epoch": 2.367367880964597, "grad_norm": 13.125, "learning_rate": 1.8882656556015945e-05, "loss": 1.3099, "mean_token_accuracy": 0.6867150068283081, "num_tokens": 37337007.0, "step": 16150 }, { "entropy": 1.1340243756771087, "epoch": 2.368833834200689, "grad_norm": 11.0625, "learning_rate": 1.8880305108272298e-05, "loss": 1.1125, "mean_token_accuracy": 0.7137698829174042, "num_tokens": 37364974.0, "step": 16160 }, { "entropy": 1.1970777839422226, "epoch": 2.370299787436781, "grad_norm": 6.28125, "learning_rate": 1.8877951335548897e-05, "loss": 1.1846, "mean_token_accuracy": 0.7103426069021225, "num_tokens": 37387007.0, "step": 16170 }, { "entropy": 1.0162954449653625, "epoch": 2.3717657406728727, "grad_norm": 7.78125, "learning_rate": 1.8875595238461996e-05, "loss": 0.9696, "mean_token_accuracy": 0.7428406298160553, "num_tokens": 37411685.0, "step": 16180 }, { "entropy": 1.15977301299572, "epoch": 2.373231693908964, "grad_norm": 8.5, "learning_rate": 1.8873236817628455e-05, "loss": 1.1252, "mean_token_accuracy": 0.7053516745567322, "num_tokens": 37435511.0, "step": 16190 }, { "entropy": 1.1897799402475357, "epoch": 2.374697647145056, "grad_norm": 13.125, "learning_rate": 1.8870876073665734e-05, "loss": 1.1832, "mean_token_accuracy": 0.7111997246742249, "num_tokens": 37457581.0, "step": 16200 }, { "entropy": 1.184570950269699, "epoch": 2.376163600381148, "grad_norm": 8.875, "learning_rate": 1.8868513007191912e-05, "loss": 1.1358, "mean_token_accuracy": 0.7051476150751114, "num_tokens": 37483109.0, "step": 16210 }, { "entropy": 1.251935103535652, "epoch": 2.37762955361724, "grad_norm": 10.25, "learning_rate": 1.8866147618825663e-05, "loss": 1.1556, "mean_token_accuracy": 0.6888904482126236, "num_tokens": 37504343.0, "step": 16220 }, { "entropy": 1.1820875346660613, "epoch": 2.3790955068533313, "grad_norm": 8.0, "learning_rate": 1.8863779909186283e-05, "loss": 1.1294, "mean_token_accuracy": 0.7087239623069763, "num_tokens": 37528027.0, "step": 16230 }, { "entropy": 1.1380547434091568, "epoch": 2.380561460089423, "grad_norm": 7.53125, "learning_rate": 1.8861409878893668e-05, "loss": 1.0722, "mean_token_accuracy": 0.714110466837883, "num_tokens": 37552799.0, "step": 16240 }, { "entropy": 1.2276699006557465, "epoch": 2.382027413325515, "grad_norm": 13.75, "learning_rate": 1.885903752856832e-05, "loss": 1.1466, "mean_token_accuracy": 0.7003178894519806, "num_tokens": 37574113.0, "step": 16250 }, { "entropy": 1.2680532529950141, "epoch": 2.3834933665616065, "grad_norm": 14.375, "learning_rate": 1.8856662858831355e-05, "loss": 1.2174, "mean_token_accuracy": 0.6918523728847503, "num_tokens": 37595567.0, "step": 16260 }, { "entropy": 1.2662564814090729, "epoch": 2.3849593197976984, "grad_norm": 8.375, "learning_rate": 1.8854285870304495e-05, "loss": 1.2876, "mean_token_accuracy": 0.6931095570325851, "num_tokens": 37616319.0, "step": 16270 }, { "entropy": 1.0384528756141662, "epoch": 2.3864252730337903, "grad_norm": 9.9375, "learning_rate": 1.8851906563610056e-05, "loss": 0.9356, "mean_token_accuracy": 0.7393887639045715, "num_tokens": 37641055.0, "step": 16280 }, { "entropy": 1.3131273239850998, "epoch": 2.387891226269882, "grad_norm": 11.1875, "learning_rate": 1.8849524939370982e-05, "loss": 1.2722, "mean_token_accuracy": 0.685002201795578, "num_tokens": 37665347.0, "step": 16290 }, { "entropy": 1.0424868315458298, "epoch": 2.3893571795059736, "grad_norm": 8.5625, "learning_rate": 1.8847140998210806e-05, "loss": 0.9997, "mean_token_accuracy": 0.7273939043283463, "num_tokens": 37689277.0, "step": 16300 }, { "entropy": 1.1160923033952712, "epoch": 2.3908231327420655, "grad_norm": 9.625, "learning_rate": 1.884475474075368e-05, "loss": 1.1316, "mean_token_accuracy": 0.72447689473629, "num_tokens": 37716775.0, "step": 16310 }, { "entropy": 1.2688601076602937, "epoch": 2.3922890859781574, "grad_norm": 6.65625, "learning_rate": 1.884236616762436e-05, "loss": 1.192, "mean_token_accuracy": 0.6935894936323166, "num_tokens": 37739966.0, "step": 16320 }, { "entropy": 1.2322536826133728, "epoch": 2.3937550392142493, "grad_norm": 10.8125, "learning_rate": 1.8839975279448195e-05, "loss": 1.1964, "mean_token_accuracy": 0.692655399441719, "num_tokens": 37760780.0, "step": 16330 }, { "entropy": 1.3980318397283553, "epoch": 2.3952209924503407, "grad_norm": 15.4375, "learning_rate": 1.883758207685116e-05, "loss": 1.3304, "mean_token_accuracy": 0.6667132049798965, "num_tokens": 37778983.0, "step": 16340 }, { "entropy": 1.203514149785042, "epoch": 2.3966869456864326, "grad_norm": 11.8125, "learning_rate": 1.883518656045982e-05, "loss": 1.137, "mean_token_accuracy": 0.7026718318462372, "num_tokens": 37797936.0, "step": 16350 }, { "entropy": 1.3102376580238342, "epoch": 2.3981528989225245, "grad_norm": 9.5625, "learning_rate": 1.883278873090136e-05, "loss": 1.3261, "mean_token_accuracy": 0.6780754566192627, "num_tokens": 37816764.0, "step": 16360 }, { "entropy": 1.323816254734993, "epoch": 2.399618852158616, "grad_norm": 8.75, "learning_rate": 1.8830388588803558e-05, "loss": 1.2539, "mean_token_accuracy": 0.6850458294153213, "num_tokens": 37837452.0, "step": 16370 }, { "entropy": 0.98582923412323, "epoch": 2.401084805394708, "grad_norm": 8.6875, "learning_rate": 1.8827986134794806e-05, "loss": 0.9173, "mean_token_accuracy": 0.7518735110759736, "num_tokens": 37863891.0, "step": 16380 }, { "entropy": 1.4309283375740052, "epoch": 2.4025507586307997, "grad_norm": 11.625, "learning_rate": 1.8825581369504092e-05, "loss": 1.4501, "mean_token_accuracy": 0.6595813572406769, "num_tokens": 37886182.0, "step": 16390 }, { "entropy": 1.1176761597394944, "epoch": 2.4040167118668916, "grad_norm": 8.1875, "learning_rate": 1.8823174293561018e-05, "loss": 1.1047, "mean_token_accuracy": 0.7225194245576858, "num_tokens": 37909551.0, "step": 16400 }, { "entropy": 1.2905628442764283, "epoch": 2.405482665102983, "grad_norm": 7.84375, "learning_rate": 1.882076490759579e-05, "loss": 1.2397, "mean_token_accuracy": 0.6920665204524994, "num_tokens": 37932513.0, "step": 16410 }, { "entropy": 1.3149644523859023, "epoch": 2.406948618339075, "grad_norm": 7.75, "learning_rate": 1.8818353212239217e-05, "loss": 1.2607, "mean_token_accuracy": 0.6825795263051987, "num_tokens": 37952392.0, "step": 16420 }, { "entropy": 1.3323875486850738, "epoch": 2.408414571575167, "grad_norm": 7.8125, "learning_rate": 1.8815939208122708e-05, "loss": 1.3229, "mean_token_accuracy": 0.6821953535079956, "num_tokens": 37977650.0, "step": 16430 }, { "entropy": 1.0867060154676438, "epoch": 2.4098805248112587, "grad_norm": 7.6875, "learning_rate": 1.8813522895878283e-05, "loss": 0.9835, "mean_token_accuracy": 0.7303851038217545, "num_tokens": 37999984.0, "step": 16440 }, { "entropy": 1.35321446955204, "epoch": 2.41134647804735, "grad_norm": 8.0, "learning_rate": 1.881110427613856e-05, "loss": 1.327, "mean_token_accuracy": 0.680071496963501, "num_tokens": 38026151.0, "step": 16450 }, { "entropy": 1.2184355527162551, "epoch": 2.412812431283442, "grad_norm": 8.875, "learning_rate": 1.8808683349536777e-05, "loss": 1.1415, "mean_token_accuracy": 0.7069809138774872, "num_tokens": 38049674.0, "step": 16460 }, { "entropy": 1.2340557545423507, "epoch": 2.414278384519534, "grad_norm": 12.0625, "learning_rate": 1.880626011670675e-05, "loss": 1.2059, "mean_token_accuracy": 0.7040291875600815, "num_tokens": 38070244.0, "step": 16470 }, { "entropy": 1.114707988500595, "epoch": 2.4157443377556254, "grad_norm": 8.875, "learning_rate": 1.8803834578282926e-05, "loss": 1.1195, "mean_token_accuracy": 0.7152275443077087, "num_tokens": 38095073.0, "step": 16480 }, { "entropy": 1.1083307176828385, "epoch": 2.4172102909917172, "grad_norm": 9.6875, "learning_rate": 1.8801406734900333e-05, "loss": 1.0703, "mean_token_accuracy": 0.7247473493218421, "num_tokens": 38119578.0, "step": 16490 }, { "entropy": 0.9702237695455551, "epoch": 2.418676244227809, "grad_norm": 6.375, "learning_rate": 1.879897658719461e-05, "loss": 0.9393, "mean_token_accuracy": 0.7498440533876419, "num_tokens": 38149193.0, "step": 16500 }, { "entropy": 1.1214463874697684, "epoch": 2.420142197463901, "grad_norm": 10.125, "learning_rate": 1.8796544135802017e-05, "loss": 1.0792, "mean_token_accuracy": 0.7214961349964142, "num_tokens": 38174064.0, "step": 16510 }, { "entropy": 1.0767825365066528, "epoch": 2.421608150699993, "grad_norm": 7.3125, "learning_rate": 1.8794109381359384e-05, "loss": 1.0475, "mean_token_accuracy": 0.7333208307623863, "num_tokens": 38201689.0, "step": 16520 }, { "entropy": 1.3983743399381638, "epoch": 2.4230741039360844, "grad_norm": 11.6875, "learning_rate": 1.8791672324504172e-05, "loss": 1.3445, "mean_token_accuracy": 0.6644863218069077, "num_tokens": 38222441.0, "step": 16530 }, { "entropy": 1.02123733907938, "epoch": 2.4245400571721762, "grad_norm": 9.0, "learning_rate": 1.8789232965874428e-05, "loss": 0.9728, "mean_token_accuracy": 0.7397175073623657, "num_tokens": 38248007.0, "step": 16540 }, { "entropy": 1.2321376860141755, "epoch": 2.426006010408268, "grad_norm": 10.625, "learning_rate": 1.878679130610881e-05, "loss": 1.1941, "mean_token_accuracy": 0.695287099480629, "num_tokens": 38269514.0, "step": 16550 }, { "entropy": 1.2660290241241454, "epoch": 2.4274719636443596, "grad_norm": 8.875, "learning_rate": 1.878434734584658e-05, "loss": 1.211, "mean_token_accuracy": 0.6888611286878585, "num_tokens": 38292226.0, "step": 16560 }, { "entropy": 1.1289765566587449, "epoch": 2.4289379168804515, "grad_norm": 9.875, "learning_rate": 1.8781901085727595e-05, "loss": 1.0854, "mean_token_accuracy": 0.7193387866020202, "num_tokens": 38317481.0, "step": 16570 }, { "entropy": 1.0066305086016656, "epoch": 2.4304038701165434, "grad_norm": 8.25, "learning_rate": 1.8779452526392315e-05, "loss": 0.9852, "mean_token_accuracy": 0.7463104009628296, "num_tokens": 38343829.0, "step": 16580 }, { "entropy": 1.0426799595355987, "epoch": 2.4318698233526352, "grad_norm": 7.09375, "learning_rate": 1.8777001668481813e-05, "loss": 0.9972, "mean_token_accuracy": 0.7398776054382324, "num_tokens": 38367736.0, "step": 16590 }, { "entropy": 1.1148140221834182, "epoch": 2.4333357765887267, "grad_norm": 7.0625, "learning_rate": 1.877454851263775e-05, "loss": 1.0918, "mean_token_accuracy": 0.7221053034067154, "num_tokens": 38394333.0, "step": 16600 }, { "entropy": 1.2543019771575927, "epoch": 2.4348017298248186, "grad_norm": 7.625, "learning_rate": 1.8772093059502398e-05, "loss": 1.2291, "mean_token_accuracy": 0.6936468005180358, "num_tokens": 38418904.0, "step": 16610 }, { "entropy": 1.1075321346521378, "epoch": 2.4362676830609105, "grad_norm": 13.9375, "learning_rate": 1.8769635309718617e-05, "loss": 1.0234, "mean_token_accuracy": 0.7225248128175735, "num_tokens": 38440923.0, "step": 16620 }, { "entropy": 1.0957248717546464, "epoch": 2.4377336362970023, "grad_norm": 7.25, "learning_rate": 1.876717526392989e-05, "loss": 1.0399, "mean_token_accuracy": 0.7196544647216797, "num_tokens": 38463405.0, "step": 16630 }, { "entropy": 1.146513444185257, "epoch": 2.439199589533094, "grad_norm": 12.75, "learning_rate": 1.876471292278028e-05, "loss": 1.0986, "mean_token_accuracy": 0.7139252901077271, "num_tokens": 38487331.0, "step": 16640 }, { "entropy": 1.072245779633522, "epoch": 2.4406655427691857, "grad_norm": 7.90625, "learning_rate": 1.876224828691447e-05, "loss": 1.0343, "mean_token_accuracy": 0.7341308951377868, "num_tokens": 38512262.0, "step": 16650 }, { "entropy": 1.21070396900177, "epoch": 2.4421314960052776, "grad_norm": 8.75, "learning_rate": 1.8759781356977723e-05, "loss": 1.1877, "mean_token_accuracy": 0.7043017119169235, "num_tokens": 38533562.0, "step": 16660 }, { "entropy": 1.1344587728381157, "epoch": 2.443597449241369, "grad_norm": 6.75, "learning_rate": 1.875731213361592e-05, "loss": 1.0187, "mean_token_accuracy": 0.7181604534387589, "num_tokens": 38558072.0, "step": 16670 }, { "entropy": 1.2532853037118912, "epoch": 2.445063402477461, "grad_norm": 9.875, "learning_rate": 1.875484061747553e-05, "loss": 1.2057, "mean_token_accuracy": 0.6974860370159149, "num_tokens": 38580545.0, "step": 16680 }, { "entropy": 1.1196387737989426, "epoch": 2.446529355713553, "grad_norm": 10.8125, "learning_rate": 1.875236680920364e-05, "loss": 1.122, "mean_token_accuracy": 0.7197554647922516, "num_tokens": 38602488.0, "step": 16690 }, { "entropy": 1.3003562033176421, "epoch": 2.4479953089496447, "grad_norm": 9.125, "learning_rate": 1.8749890709447913e-05, "loss": 1.2357, "mean_token_accuracy": 0.6916131645441055, "num_tokens": 38623210.0, "step": 16700 }, { "entropy": 1.1342347472906114, "epoch": 2.449461262185736, "grad_norm": 6.53125, "learning_rate": 1.8747412318856628e-05, "loss": 1.1183, "mean_token_accuracy": 0.7110966861248016, "num_tokens": 38650375.0, "step": 16710 }, { "entropy": 1.1102589040994644, "epoch": 2.450927215421828, "grad_norm": 8.0625, "learning_rate": 1.8744931638078663e-05, "loss": 1.0651, "mean_token_accuracy": 0.7218638032674789, "num_tokens": 38673099.0, "step": 16720 }, { "entropy": 1.168297478556633, "epoch": 2.45239316865792, "grad_norm": 13.1875, "learning_rate": 1.874244866776349e-05, "loss": 1.1488, "mean_token_accuracy": 0.711645245552063, "num_tokens": 38694378.0, "step": 16730 }, { "entropy": 1.1017076283693314, "epoch": 2.453859121894012, "grad_norm": 9.375, "learning_rate": 1.873996340856118e-05, "loss": 1.0288, "mean_token_accuracy": 0.7238416135311126, "num_tokens": 38716392.0, "step": 16740 }, { "entropy": 1.2292664408683778, "epoch": 2.4553250751301032, "grad_norm": 8.875, "learning_rate": 1.8737475861122415e-05, "loss": 1.2124, "mean_token_accuracy": 0.6955830842256546, "num_tokens": 38740130.0, "step": 16750 }, { "entropy": 1.2841715693473816, "epoch": 2.456791028366195, "grad_norm": 9.125, "learning_rate": 1.873498602609846e-05, "loss": 1.2445, "mean_token_accuracy": 0.6899653822183609, "num_tokens": 38762773.0, "step": 16760 }, { "entropy": 1.0765875279903412, "epoch": 2.458256981602287, "grad_norm": 9.0, "learning_rate": 1.8732493904141187e-05, "loss": 0.9771, "mean_token_accuracy": 0.727619132399559, "num_tokens": 38788614.0, "step": 16770 }, { "entropy": 1.1919088900089263, "epoch": 2.4597229348383784, "grad_norm": 7.71875, "learning_rate": 1.872999949590307e-05, "loss": 1.1369, "mean_token_accuracy": 0.708200004696846, "num_tokens": 38811505.0, "step": 16780 }, { "entropy": 1.1638930022716523, "epoch": 2.4611888880744703, "grad_norm": 9.125, "learning_rate": 1.8727502802037173e-05, "loss": 1.1622, "mean_token_accuracy": 0.7066326230764389, "num_tokens": 38832735.0, "step": 16790 }, { "entropy": 1.2055823028087616, "epoch": 2.4626548413105622, "grad_norm": 13.5625, "learning_rate": 1.8725003823197165e-05, "loss": 1.1487, "mean_token_accuracy": 0.704976886510849, "num_tokens": 38854286.0, "step": 16800 }, { "entropy": 1.219454762339592, "epoch": 2.464120794546654, "grad_norm": 11.25, "learning_rate": 1.8722502560037315e-05, "loss": 1.1528, "mean_token_accuracy": 0.697440280020237, "num_tokens": 38877265.0, "step": 16810 }, { "entropy": 1.2895234644412994, "epoch": 2.4655867477827456, "grad_norm": 11.8125, "learning_rate": 1.8719999013212478e-05, "loss": 1.28, "mean_token_accuracy": 0.6833738803863525, "num_tokens": 38902551.0, "step": 16820 }, { "entropy": 1.204075711965561, "epoch": 2.4670527010188374, "grad_norm": 10.125, "learning_rate": 1.8717493183378123e-05, "loss": 1.1538, "mean_token_accuracy": 0.7013117283582687, "num_tokens": 38925165.0, "step": 16830 }, { "entropy": 1.261253970861435, "epoch": 2.4685186542549293, "grad_norm": 9.5625, "learning_rate": 1.8714985071190302e-05, "loss": 1.1826, "mean_token_accuracy": 0.6873589903116226, "num_tokens": 38947736.0, "step": 16840 }, { "entropy": 1.3329614669084549, "epoch": 2.4699846074910212, "grad_norm": 12.3125, "learning_rate": 1.871247467730568e-05, "loss": 1.3069, "mean_token_accuracy": 0.6832632124423981, "num_tokens": 38970377.0, "step": 16850 }, { "entropy": 1.2491603761911392, "epoch": 2.4714505607271127, "grad_norm": 8.5, "learning_rate": 1.8709962002381506e-05, "loss": 1.1927, "mean_token_accuracy": 0.7009873509407043, "num_tokens": 38994652.0, "step": 16860 }, { "entropy": 1.3584735810756683, "epoch": 2.4729165139632046, "grad_norm": 9.9375, "learning_rate": 1.870744704707563e-05, "loss": 1.3454, "mean_token_accuracy": 0.67050921022892, "num_tokens": 39018478.0, "step": 16870 }, { "entropy": 1.1528451293706894, "epoch": 2.4743824671992964, "grad_norm": 13.5, "learning_rate": 1.87049298120465e-05, "loss": 1.065, "mean_token_accuracy": 0.720656543970108, "num_tokens": 39041823.0, "step": 16880 }, { "entropy": 1.1705699980258941, "epoch": 2.475848420435388, "grad_norm": 10.1875, "learning_rate": 1.8702410297953163e-05, "loss": 1.1333, "mean_token_accuracy": 0.7070876598358155, "num_tokens": 39062182.0, "step": 16890 }, { "entropy": 1.2594243943691255, "epoch": 2.4773143736714798, "grad_norm": 9.0, "learning_rate": 1.869988850545526e-05, "loss": 1.2124, "mean_token_accuracy": 0.6897521510720253, "num_tokens": 39082024.0, "step": 16900 }, { "entropy": 1.1957040384411812, "epoch": 2.4787803269075717, "grad_norm": 5.90625, "learning_rate": 1.869736443521303e-05, "loss": 1.1189, "mean_token_accuracy": 0.7086769968271256, "num_tokens": 39104986.0, "step": 16910 }, { "entropy": 1.1889816045761108, "epoch": 2.4802462801436636, "grad_norm": 7.75, "learning_rate": 1.869483808788731e-05, "loss": 1.1727, "mean_token_accuracy": 0.7048264920711518, "num_tokens": 39125851.0, "step": 16920 }, { "entropy": 1.0507065996527671, "epoch": 2.481712233379755, "grad_norm": 9.5625, "learning_rate": 1.8692309464139522e-05, "loss": 0.9952, "mean_token_accuracy": 0.7390849649906158, "num_tokens": 39150265.0, "step": 16930 }, { "entropy": 1.232691052556038, "epoch": 2.483178186615847, "grad_norm": 8.375, "learning_rate": 1.86897785646317e-05, "loss": 1.1806, "mean_token_accuracy": 0.6955595552921295, "num_tokens": 39169714.0, "step": 16940 }, { "entropy": 1.0849803894758225, "epoch": 2.4846441398519388, "grad_norm": 12.625, "learning_rate": 1.8687245390026464e-05, "loss": 1.0553, "mean_token_accuracy": 0.7257558166980743, "num_tokens": 39191837.0, "step": 16950 }, { "entropy": 1.2431086033582688, "epoch": 2.4861100930880307, "grad_norm": 11.875, "learning_rate": 1.8684709940987033e-05, "loss": 1.1471, "mean_token_accuracy": 0.6946235179901123, "num_tokens": 39216031.0, "step": 16960 }, { "entropy": 1.3773719042539596, "epoch": 2.487576046324122, "grad_norm": 6.71875, "learning_rate": 1.8682172218177224e-05, "loss": 1.3345, "mean_token_accuracy": 0.6640371352434158, "num_tokens": 39242750.0, "step": 16970 }, { "entropy": 1.1176343351602553, "epoch": 2.489041999560214, "grad_norm": 9.1875, "learning_rate": 1.867963222226144e-05, "loss": 1.0271, "mean_token_accuracy": 0.7151355504989624, "num_tokens": 39265987.0, "step": 16980 }, { "entropy": 1.2230991691350936, "epoch": 2.490507952796306, "grad_norm": 7.46875, "learning_rate": 1.8677089953904688e-05, "loss": 1.2046, "mean_token_accuracy": 0.7102817296981812, "num_tokens": 39288458.0, "step": 16990 }, { "entropy": 1.2635751485824585, "epoch": 2.4919739060323973, "grad_norm": 7.46875, "learning_rate": 1.8674545413772564e-05, "loss": 1.2159, "mean_token_accuracy": 0.6944351732730866, "num_tokens": 39310079.0, "step": 17000 }, { "entropy": 1.1072850048542022, "epoch": 2.493439859268489, "grad_norm": 9.4375, "learning_rate": 1.8671998602531267e-05, "loss": 1.0086, "mean_token_accuracy": 0.7306426987051964, "num_tokens": 39335155.0, "step": 17010 }, { "entropy": 1.2579703360795975, "epoch": 2.494905812504581, "grad_norm": 11.25, "learning_rate": 1.866944952084758e-05, "loss": 1.253, "mean_token_accuracy": 0.6863753259181976, "num_tokens": 39358096.0, "step": 17020 }, { "entropy": 1.216689148545265, "epoch": 2.496371765740673, "grad_norm": 8.6875, "learning_rate": 1.866689816938889e-05, "loss": 1.1479, "mean_token_accuracy": 0.6957253873348236, "num_tokens": 39382026.0, "step": 17030 }, { "entropy": 1.0342153072357179, "epoch": 2.497837718976765, "grad_norm": 9.1875, "learning_rate": 1.8664344548823178e-05, "loss": 0.9978, "mean_token_accuracy": 0.7329304456710816, "num_tokens": 39404910.0, "step": 17040 }, { "entropy": 1.204274471104145, "epoch": 2.4993036722128563, "grad_norm": 12.4375, "learning_rate": 1.8661788659819003e-05, "loss": 1.1732, "mean_token_accuracy": 0.706097936630249, "num_tokens": 39428591.0, "step": 17050 }, { "entropy": 1.25044504404068, "epoch": 2.500769625448948, "grad_norm": 10.5, "learning_rate": 1.865923050304554e-05, "loss": 1.1962, "mean_token_accuracy": 0.7020517200231552, "num_tokens": 39450182.0, "step": 17060 }, { "entropy": 1.0233414113521575, "epoch": 2.50223557868504, "grad_norm": 10.9375, "learning_rate": 1.8656670079172544e-05, "loss": 0.9419, "mean_token_accuracy": 0.7369366377592087, "num_tokens": 39474162.0, "step": 17070 }, { "entropy": 1.286153581738472, "epoch": 2.5037015319211315, "grad_norm": 9.25, "learning_rate": 1.8654107388870367e-05, "loss": 1.2315, "mean_token_accuracy": 0.6822037309408188, "num_tokens": 39494350.0, "step": 17080 }, { "entropy": 1.275828593969345, "epoch": 2.5051674851572234, "grad_norm": 12.9375, "learning_rate": 1.8651542432809957e-05, "loss": 1.2422, "mean_token_accuracy": 0.683596059679985, "num_tokens": 39513902.0, "step": 17090 }, { "entropy": 1.0444872945547103, "epoch": 2.5066334383933153, "grad_norm": 8.6875, "learning_rate": 1.8648975211662853e-05, "loss": 0.9926, "mean_token_accuracy": 0.7345615714788437, "num_tokens": 39539906.0, "step": 17100 }, { "entropy": 1.0557692393660545, "epoch": 2.5080993916294068, "grad_norm": 7.96875, "learning_rate": 1.864640572610118e-05, "loss": 1.0198, "mean_token_accuracy": 0.7308891892433167, "num_tokens": 39563618.0, "step": 17110 }, { "entropy": 1.140884205698967, "epoch": 2.5095653448654986, "grad_norm": 7.34375, "learning_rate": 1.8643833976797676e-05, "loss": 1.0891, "mean_token_accuracy": 0.7141744732856751, "num_tokens": 39585885.0, "step": 17120 }, { "entropy": 1.236025221645832, "epoch": 2.5110312981015905, "grad_norm": 5.75, "learning_rate": 1.864125996442565e-05, "loss": 1.2456, "mean_token_accuracy": 0.7016229182481766, "num_tokens": 39611685.0, "step": 17130 }, { "entropy": 1.0984503358602524, "epoch": 2.5124972513376824, "grad_norm": 8.625, "learning_rate": 1.863868368965901e-05, "loss": 1.0073, "mean_token_accuracy": 0.7332331329584122, "num_tokens": 39635012.0, "step": 17140 }, { "entropy": 1.2149702578783035, "epoch": 2.5139632045737743, "grad_norm": 8.8125, "learning_rate": 1.8636105153172264e-05, "loss": 1.1087, "mean_token_accuracy": 0.7076001793146134, "num_tokens": 39657393.0, "step": 17150 }, { "entropy": 1.0834560602903367, "epoch": 2.5154291578098658, "grad_norm": 5.75, "learning_rate": 1.8633524355640508e-05, "loss": 1.0496, "mean_token_accuracy": 0.7263107597827911, "num_tokens": 39680378.0, "step": 17160 }, { "entropy": 1.051857914030552, "epoch": 2.5168951110459576, "grad_norm": 14.0625, "learning_rate": 1.8630941297739417e-05, "loss": 0.928, "mean_token_accuracy": 0.7334965407848358, "num_tokens": 39705139.0, "step": 17170 }, { "entropy": 1.2316868245601653, "epoch": 2.5183610642820495, "grad_norm": 14.75, "learning_rate": 1.8628355980145282e-05, "loss": 1.1897, "mean_token_accuracy": 0.6958536803722382, "num_tokens": 39726140.0, "step": 17180 }, { "entropy": 0.9946269989013672, "epoch": 2.519827017518141, "grad_norm": 6.1875, "learning_rate": 1.8625768403534966e-05, "loss": 0.9832, "mean_token_accuracy": 0.7415563255548477, "num_tokens": 39749635.0, "step": 17190 }, { "entropy": 1.1526495277881623, "epoch": 2.521292970754233, "grad_norm": 10.0625, "learning_rate": 1.862317856858593e-05, "loss": 1.0999, "mean_token_accuracy": 0.713990044593811, "num_tokens": 39773546.0, "step": 17200 }, { "entropy": 1.2869481831789016, "epoch": 2.5227589239903248, "grad_norm": 11.125, "learning_rate": 1.8620586475976234e-05, "loss": 1.2863, "mean_token_accuracy": 0.6893913507461548, "num_tokens": 39797555.0, "step": 17210 }, { "entropy": 1.4209905505180358, "epoch": 2.5242248772264166, "grad_norm": 13.8125, "learning_rate": 1.8617992126384514e-05, "loss": 1.4163, "mean_token_accuracy": 0.6617703542113305, "num_tokens": 39821679.0, "step": 17220 }, { "entropy": 1.1423115670680999, "epoch": 2.525690830462508, "grad_norm": 13.125, "learning_rate": 1.8615395520490004e-05, "loss": 1.0682, "mean_token_accuracy": 0.7183566153049469, "num_tokens": 39843841.0, "step": 17230 }, { "entropy": 1.1936386555433274, "epoch": 2.5271567836986, "grad_norm": 12.875, "learning_rate": 1.8612796658972536e-05, "loss": 1.1385, "mean_token_accuracy": 0.7038935571908951, "num_tokens": 39864856.0, "step": 17240 }, { "entropy": 1.14906043112278, "epoch": 2.528622736934692, "grad_norm": 14.9375, "learning_rate": 1.861019554251252e-05, "loss": 1.1039, "mean_token_accuracy": 0.7175527095794678, "num_tokens": 39885581.0, "step": 17250 }, { "entropy": 1.189031833410263, "epoch": 2.5300886901707837, "grad_norm": 6.78125, "learning_rate": 1.8607592171790966e-05, "loss": 1.2032, "mean_token_accuracy": 0.70981265604496, "num_tokens": 39908462.0, "step": 17260 }, { "entropy": 1.2551440566778183, "epoch": 2.531554643406875, "grad_norm": 9.875, "learning_rate": 1.8604986547489467e-05, "loss": 1.1992, "mean_token_accuracy": 0.6981237024068833, "num_tokens": 39931994.0, "step": 17270 }, { "entropy": 1.13503055870533, "epoch": 2.533020596642967, "grad_norm": 7.71875, "learning_rate": 1.860237867029021e-05, "loss": 1.0912, "mean_token_accuracy": 0.7106177866458893, "num_tokens": 39958444.0, "step": 17280 }, { "entropy": 1.1819214969873428, "epoch": 2.534486549879059, "grad_norm": 9.1875, "learning_rate": 1.8599768540875972e-05, "loss": 1.1321, "mean_token_accuracy": 0.7097417414188385, "num_tokens": 39981406.0, "step": 17290 }, { "entropy": 1.0967952102422713, "epoch": 2.5359525031151504, "grad_norm": 9.625, "learning_rate": 1.8597156159930122e-05, "loss": 1.0278, "mean_token_accuracy": 0.7178319931030274, "num_tokens": 40006066.0, "step": 17300 }, { "entropy": 1.1549513220787049, "epoch": 2.5374184563512423, "grad_norm": 6.40625, "learning_rate": 1.8594541528136615e-05, "loss": 1.1032, "mean_token_accuracy": 0.7179982304573059, "num_tokens": 40030391.0, "step": 17310 }, { "entropy": 1.1457307413220406, "epoch": 2.538884409587334, "grad_norm": 14.375, "learning_rate": 1.859192464617999e-05, "loss": 1.0829, "mean_token_accuracy": 0.7183500915765763, "num_tokens": 40050520.0, "step": 17320 }, { "entropy": 1.1601818710565568, "epoch": 2.540350362823426, "grad_norm": 9.4375, "learning_rate": 1.8589305514745383e-05, "loss": 1.1249, "mean_token_accuracy": 0.7110889405012131, "num_tokens": 40072073.0, "step": 17330 }, { "entropy": 1.061713644862175, "epoch": 2.541816316059518, "grad_norm": 9.8125, "learning_rate": 1.858668413451852e-05, "loss": 0.9695, "mean_token_accuracy": 0.7335678249597549, "num_tokens": 40094500.0, "step": 17340 }, { "entropy": 1.1380744487047196, "epoch": 2.5432822692956094, "grad_norm": 8.5, "learning_rate": 1.858406050618571e-05, "loss": 1.0856, "mean_token_accuracy": 0.7181531727313996, "num_tokens": 40115396.0, "step": 17350 }, { "entropy": 1.0783351212739944, "epoch": 2.5447482225317013, "grad_norm": 7.84375, "learning_rate": 1.8581434630433857e-05, "loss": 1.0234, "mean_token_accuracy": 0.7287481635808944, "num_tokens": 40138773.0, "step": 17360 }, { "entropy": 1.135600957274437, "epoch": 2.546214175767793, "grad_norm": 12.5625, "learning_rate": 1.8578806507950445e-05, "loss": 1.1157, "mean_token_accuracy": 0.7097093492746354, "num_tokens": 40163902.0, "step": 17370 }, { "entropy": 1.3525040984153747, "epoch": 2.5476801290038846, "grad_norm": 8.75, "learning_rate": 1.857617613942355e-05, "loss": 1.3258, "mean_token_accuracy": 0.6831506937742233, "num_tokens": 40183161.0, "step": 17380 }, { "entropy": 1.1019289433956145, "epoch": 2.5491460822399765, "grad_norm": 8.9375, "learning_rate": 1.8573543525541838e-05, "loss": 1.0949, "mean_token_accuracy": 0.7173965692520141, "num_tokens": 40207821.0, "step": 17390 }, { "entropy": 1.2700913369655609, "epoch": 2.5506120354760684, "grad_norm": 6.875, "learning_rate": 1.8570908666994563e-05, "loss": 1.2697, "mean_token_accuracy": 0.6984278529882431, "num_tokens": 40229760.0, "step": 17400 }, { "entropy": 1.3007880687713622, "epoch": 2.55207798871216, "grad_norm": 8.75, "learning_rate": 1.8568271564471566e-05, "loss": 1.2607, "mean_token_accuracy": 0.6837175756692886, "num_tokens": 40250390.0, "step": 17410 }, { "entropy": 1.1452397465705872, "epoch": 2.5535439419482517, "grad_norm": 9.375, "learning_rate": 1.8565632218663278e-05, "loss": 1.0629, "mean_token_accuracy": 0.7123093783855439, "num_tokens": 40272726.0, "step": 17420 }, { "entropy": 1.2879614055156707, "epoch": 2.5550098951843436, "grad_norm": 10.0, "learning_rate": 1.8562990630260703e-05, "loss": 1.2112, "mean_token_accuracy": 0.6886659502983093, "num_tokens": 40295720.0, "step": 17430 }, { "entropy": 1.0751450479030609, "epoch": 2.5564758484204355, "grad_norm": 7.625, "learning_rate": 1.8560346799955454e-05, "loss": 1.0414, "mean_token_accuracy": 0.7268277823925018, "num_tokens": 40317185.0, "step": 17440 }, { "entropy": 1.0925472527742386, "epoch": 2.5579418016565274, "grad_norm": 7.46875, "learning_rate": 1.8557700728439715e-05, "loss": 1.0226, "mean_token_accuracy": 0.7284120738506317, "num_tokens": 40338388.0, "step": 17450 }, { "entropy": 1.2587381824851036, "epoch": 2.559407754892619, "grad_norm": 9.875, "learning_rate": 1.855505241640627e-05, "loss": 1.1915, "mean_token_accuracy": 0.7015637487173081, "num_tokens": 40361470.0, "step": 17460 }, { "entropy": 1.1863534778356553, "epoch": 2.5608737081287107, "grad_norm": 10.0625, "learning_rate": 1.8552401864548465e-05, "loss": 1.1332, "mean_token_accuracy": 0.7020455956459045, "num_tokens": 40384675.0, "step": 17470 }, { "entropy": 1.2662458300590516, "epoch": 2.5623396613648026, "grad_norm": 8.5625, "learning_rate": 1.8549749073560265e-05, "loss": 1.2168, "mean_token_accuracy": 0.6895909756422043, "num_tokens": 40406257.0, "step": 17480 }, { "entropy": 1.1785364687442779, "epoch": 2.563805614600894, "grad_norm": 6.40625, "learning_rate": 1.8547094044136198e-05, "loss": 1.0778, "mean_token_accuracy": 0.7084070920944214, "num_tokens": 40428911.0, "step": 17490 }, { "entropy": 1.215006297826767, "epoch": 2.565271567836986, "grad_norm": 9.5, "learning_rate": 1.854443677697139e-05, "loss": 1.1338, "mean_token_accuracy": 0.70201336145401, "num_tokens": 40452643.0, "step": 17500 }, { "entropy": 1.152723526954651, "epoch": 2.566737521073078, "grad_norm": 10.1875, "learning_rate": 1.854177727276154e-05, "loss": 1.0847, "mean_token_accuracy": 0.7123665526509285, "num_tokens": 40476829.0, "step": 17510 }, { "entropy": 1.2104957669973373, "epoch": 2.5682034743091693, "grad_norm": 12.8125, "learning_rate": 1.853911553220295e-05, "loss": 1.1425, "mean_token_accuracy": 0.7076512575149536, "num_tokens": 40498757.0, "step": 17520 }, { "entropy": 1.1152670472860335, "epoch": 2.569669427545261, "grad_norm": 7.375, "learning_rate": 1.8536451555992488e-05, "loss": 1.0417, "mean_token_accuracy": 0.7265449553728104, "num_tokens": 40522194.0, "step": 17530 }, { "entropy": 1.304081177711487, "epoch": 2.571135380781353, "grad_norm": 12.75, "learning_rate": 1.8533785344827632e-05, "loss": 1.2844, "mean_token_accuracy": 0.6923562288284302, "num_tokens": 40543794.0, "step": 17540 }, { "entropy": 1.248507758975029, "epoch": 2.572601334017445, "grad_norm": 10.0, "learning_rate": 1.853111689940642e-05, "loss": 1.2256, "mean_token_accuracy": 0.6950333774089813, "num_tokens": 40564424.0, "step": 17550 }, { "entropy": 1.3359335005283355, "epoch": 2.574067287253537, "grad_norm": 8.1875, "learning_rate": 1.852844622042749e-05, "loss": 1.3004, "mean_token_accuracy": 0.6845456451177597, "num_tokens": 40586729.0, "step": 17560 }, { "entropy": 1.2180239111185074, "epoch": 2.5755332404896283, "grad_norm": 9.3125, "learning_rate": 1.8525773308590058e-05, "loss": 1.1627, "mean_token_accuracy": 0.7011087656021118, "num_tokens": 40609069.0, "step": 17570 }, { "entropy": 1.1007899284362792, "epoch": 2.57699919372572, "grad_norm": 6.5625, "learning_rate": 1.8523098164593928e-05, "loss": 1.06, "mean_token_accuracy": 0.7212690383195877, "num_tokens": 40633227.0, "step": 17580 }, { "entropy": 1.0608916714787484, "epoch": 2.578465146961812, "grad_norm": 9.5, "learning_rate": 1.852042078913949e-05, "loss": 1.0023, "mean_token_accuracy": 0.7344649970531464, "num_tokens": 40655448.0, "step": 17590 }, { "entropy": 1.1176960930228232, "epoch": 2.5799311001979035, "grad_norm": 7.0625, "learning_rate": 1.8517741182927715e-05, "loss": 1.0642, "mean_token_accuracy": 0.7240787953138351, "num_tokens": 40675999.0, "step": 17600 }, { "entropy": 0.9953673407435417, "epoch": 2.5813970534339954, "grad_norm": 7.125, "learning_rate": 1.8515059346660153e-05, "loss": 0.9471, "mean_token_accuracy": 0.7398772209882736, "num_tokens": 40700613.0, "step": 17610 }, { "entropy": 1.1822492480278015, "epoch": 2.5828630066700873, "grad_norm": 14.1875, "learning_rate": 1.8512375281038953e-05, "loss": 1.0835, "mean_token_accuracy": 0.7114520370960236, "num_tokens": 40723015.0, "step": 17620 }, { "entropy": 1.0202831000089645, "epoch": 2.5843289599061787, "grad_norm": 9.5, "learning_rate": 1.8509688986766835e-05, "loss": 0.9943, "mean_token_accuracy": 0.7344191551208497, "num_tokens": 40746537.0, "step": 17630 }, { "entropy": 1.1186295717954635, "epoch": 2.5857949131422706, "grad_norm": 7.65625, "learning_rate": 1.8507000464547103e-05, "loss": 1.1036, "mean_token_accuracy": 0.7190187722444534, "num_tokens": 40768047.0, "step": 17640 }, { "entropy": 1.2164098113775252, "epoch": 2.5872608663783625, "grad_norm": 9.1875, "learning_rate": 1.8504309715083652e-05, "loss": 1.1884, "mean_token_accuracy": 0.6886667340993882, "num_tokens": 40787088.0, "step": 17650 }, { "entropy": 1.1988923490047454, "epoch": 2.5887268196144544, "grad_norm": 9.6875, "learning_rate": 1.850161673908095e-05, "loss": 1.1493, "mean_token_accuracy": 0.7080789536237717, "num_tokens": 40809924.0, "step": 17660 }, { "entropy": 1.1671968013048173, "epoch": 2.5901927728505463, "grad_norm": 10.6875, "learning_rate": 1.8498921537244054e-05, "loss": 1.1038, "mean_token_accuracy": 0.7155525326728821, "num_tokens": 40833225.0, "step": 17670 }, { "entropy": 1.1899569794535636, "epoch": 2.5916587260866377, "grad_norm": 9.375, "learning_rate": 1.8496224110278604e-05, "loss": 1.1224, "mean_token_accuracy": 0.7162790626287461, "num_tokens": 40858617.0, "step": 17680 }, { "entropy": 1.342928060889244, "epoch": 2.5931246793227296, "grad_norm": 9.3125, "learning_rate": 1.8493524458890827e-05, "loss": 1.241, "mean_token_accuracy": 0.6764685869216919, "num_tokens": 40880491.0, "step": 17690 }, { "entropy": 1.1335519313812257, "epoch": 2.5945906325588215, "grad_norm": 11.9375, "learning_rate": 1.849082258378752e-05, "loss": 1.0891, "mean_token_accuracy": 0.7139771074056626, "num_tokens": 40905393.0, "step": 17700 }, { "entropy": 1.20328249335289, "epoch": 2.596056585794913, "grad_norm": 13.0, "learning_rate": 1.848811848567607e-05, "loss": 1.2002, "mean_token_accuracy": 0.6992066860198974, "num_tokens": 40928003.0, "step": 17710 }, { "entropy": 0.9696186885237694, "epoch": 2.597522539031005, "grad_norm": 7.03125, "learning_rate": 1.848541216526445e-05, "loss": 0.8462, "mean_token_accuracy": 0.7500261992216111, "num_tokens": 40953052.0, "step": 17720 }, { "entropy": 1.2380745559930801, "epoch": 2.5989884922670967, "grad_norm": 9.0, "learning_rate": 1.8482703623261208e-05, "loss": 1.1921, "mean_token_accuracy": 0.7059817284345626, "num_tokens": 40975338.0, "step": 17730 }, { "entropy": 1.1840344309806823, "epoch": 2.6004544455031886, "grad_norm": 7.28125, "learning_rate": 1.847999286037547e-05, "loss": 1.1432, "mean_token_accuracy": 0.7105505108833313, "num_tokens": 40999119.0, "step": 17740 }, { "entropy": 1.2658453971147536, "epoch": 2.6019203987392805, "grad_norm": 14.75, "learning_rate": 1.8477279877316953e-05, "loss": 1.2198, "mean_token_accuracy": 0.6957791060209274, "num_tokens": 41021538.0, "step": 17750 }, { "entropy": 1.0256024718284606, "epoch": 2.603386351975372, "grad_norm": 9.3125, "learning_rate": 1.8474564674795952e-05, "loss": 0.9318, "mean_token_accuracy": 0.7416394829750061, "num_tokens": 41046148.0, "step": 17760 }, { "entropy": 1.0914247721433639, "epoch": 2.604852305211464, "grad_norm": 8.5, "learning_rate": 1.8471847253523344e-05, "loss": 1.0417, "mean_token_accuracy": 0.7243927717208862, "num_tokens": 41073202.0, "step": 17770 }, { "entropy": 1.249410369992256, "epoch": 2.6063182584475557, "grad_norm": 9.5, "learning_rate": 1.8469127614210586e-05, "loss": 1.2338, "mean_token_accuracy": 0.6961625874042511, "num_tokens": 41095845.0, "step": 17780 }, { "entropy": 1.2917064875364304, "epoch": 2.607784211683647, "grad_norm": 14.4375, "learning_rate": 1.8466405757569707e-05, "loss": 1.2671, "mean_token_accuracy": 0.6894845485687255, "num_tokens": 41116498.0, "step": 17790 }, { "entropy": 1.2839461520314217, "epoch": 2.609250164919739, "grad_norm": 14.125, "learning_rate": 1.8463681684313336e-05, "loss": 1.2993, "mean_token_accuracy": 0.6812950283288955, "num_tokens": 41138628.0, "step": 17800 }, { "entropy": 1.0256994679570197, "epoch": 2.610716118155831, "grad_norm": 7.5625, "learning_rate": 1.8460955395154658e-05, "loss": 0.9921, "mean_token_accuracy": 0.7393175303936005, "num_tokens": 41162720.0, "step": 17810 }, { "entropy": 1.2195057988166809, "epoch": 2.6121820713919224, "grad_norm": 10.3125, "learning_rate": 1.8458226890807464e-05, "loss": 1.1513, "mean_token_accuracy": 0.7074736207723618, "num_tokens": 41188396.0, "step": 17820 }, { "entropy": 1.1337337344884872, "epoch": 2.6136480246280143, "grad_norm": 7.375, "learning_rate": 1.8455496171986102e-05, "loss": 1.07, "mean_token_accuracy": 0.7191013336181641, "num_tokens": 41212027.0, "step": 17830 }, { "entropy": 1.1279699802398682, "epoch": 2.615113977864106, "grad_norm": 13.625, "learning_rate": 1.8452763239405517e-05, "loss": 1.0539, "mean_token_accuracy": 0.7219295054674149, "num_tokens": 41233206.0, "step": 17840 }, { "entropy": 1.056842365860939, "epoch": 2.616579931100198, "grad_norm": 9.75, "learning_rate": 1.8450028093781226e-05, "loss": 1.0296, "mean_token_accuracy": 0.7339475721120834, "num_tokens": 41256829.0, "step": 17850 }, { "entropy": 1.4013454556465148, "epoch": 2.61804588433629, "grad_norm": 9.9375, "learning_rate": 1.8447290735829317e-05, "loss": 1.3528, "mean_token_accuracy": 0.6663511693477631, "num_tokens": 41277909.0, "step": 17860 }, { "entropy": 1.0809909343719482, "epoch": 2.6195118375723814, "grad_norm": 7.59375, "learning_rate": 1.8444551166266475e-05, "loss": 1.0131, "mean_token_accuracy": 0.7314249038696289, "num_tokens": 41301369.0, "step": 17870 }, { "entropy": 1.0883595645427704, "epoch": 2.6209777908084733, "grad_norm": 6.59375, "learning_rate": 1.8441809385809953e-05, "loss": 1.0419, "mean_token_accuracy": 0.7217204660177231, "num_tokens": 41327653.0, "step": 17880 }, { "entropy": 1.186319923400879, "epoch": 2.622443744044565, "grad_norm": 8.8125, "learning_rate": 1.8439065395177582e-05, "loss": 1.1634, "mean_token_accuracy": 0.7074247598648071, "num_tokens": 41351575.0, "step": 17890 }, { "entropy": 1.1692779213190079, "epoch": 2.6239096972806566, "grad_norm": 10.25, "learning_rate": 1.8436319195087778e-05, "loss": 1.1986, "mean_token_accuracy": 0.7106975674629211, "num_tokens": 41374126.0, "step": 17900 }, { "entropy": 1.3500461131334305, "epoch": 2.6253756505167485, "grad_norm": 13.375, "learning_rate": 1.8433570786259533e-05, "loss": 1.2691, "mean_token_accuracy": 0.6836873650550842, "num_tokens": 41397931.0, "step": 17910 }, { "entropy": 1.3206550925970078, "epoch": 2.6268416037528404, "grad_norm": 11.375, "learning_rate": 1.8430820169412415e-05, "loss": 1.2721, "mean_token_accuracy": 0.6848706632852555, "num_tokens": 41417266.0, "step": 17920 }, { "entropy": 1.0686591416597366, "epoch": 2.628307556988932, "grad_norm": 12.9375, "learning_rate": 1.8428067345266564e-05, "loss": 1.0338, "mean_token_accuracy": 0.7338753998279571, "num_tokens": 41439232.0, "step": 17930 }, { "entropy": 1.006651172041893, "epoch": 2.6297735102250237, "grad_norm": 10.3125, "learning_rate": 1.8425312314542714e-05, "loss": 0.9875, "mean_token_accuracy": 0.7410313695669174, "num_tokens": 41467991.0, "step": 17940 }, { "entropy": 0.9374881267547608, "epoch": 2.6312394634611156, "grad_norm": 6.375, "learning_rate": 1.842255507796217e-05, "loss": 0.9142, "mean_token_accuracy": 0.7593678146600723, "num_tokens": 41492140.0, "step": 17950 }, { "entropy": 1.1512270495295525, "epoch": 2.6327054166972075, "grad_norm": 11.75, "learning_rate": 1.8419795636246805e-05, "loss": 1.1167, "mean_token_accuracy": 0.7114370912313461, "num_tokens": 41513040.0, "step": 17960 }, { "entropy": 1.1028441101312638, "epoch": 2.6341713699332994, "grad_norm": 7.1875, "learning_rate": 1.8417033990119083e-05, "loss": 1.0728, "mean_token_accuracy": 0.7255888640880584, "num_tokens": 41537115.0, "step": 17970 }, { "entropy": 1.1072836518287659, "epoch": 2.635637323169391, "grad_norm": 6.46875, "learning_rate": 1.841427014030203e-05, "loss": 1.0448, "mean_token_accuracy": 0.7225040227174759, "num_tokens": 41559500.0, "step": 17980 }, { "entropy": 1.212125538289547, "epoch": 2.6371032764054827, "grad_norm": 8.5, "learning_rate": 1.841150408751927e-05, "loss": 1.1596, "mean_token_accuracy": 0.703865921497345, "num_tokens": 41580385.0, "step": 17990 }, { "entropy": 1.1132221609354018, "epoch": 2.6385692296415746, "grad_norm": 7.03125, "learning_rate": 1.8408735832494983e-05, "loss": 1.0289, "mean_token_accuracy": 0.7228948712348938, "num_tokens": 41608716.0, "step": 18000 }, { "entropy": 1.2990728944540024, "epoch": 2.640035182877666, "grad_norm": 11.8125, "learning_rate": 1.8405965375953938e-05, "loss": 1.2907, "mean_token_accuracy": 0.6744722843170166, "num_tokens": 41627533.0, "step": 18010 }, { "entropy": 1.2570690393447876, "epoch": 2.641501136113758, "grad_norm": 10.0625, "learning_rate": 1.8403192718621477e-05, "loss": 1.1943, "mean_token_accuracy": 0.6886466950178146, "num_tokens": 41646929.0, "step": 18020 }, { "entropy": 1.1298530727624894, "epoch": 2.64296708934985, "grad_norm": 13.0625, "learning_rate": 1.8400417861223515e-05, "loss": 1.0766, "mean_token_accuracy": 0.7187382072210312, "num_tokens": 41668667.0, "step": 18030 }, { "entropy": 1.1837252914905547, "epoch": 2.6444330425859413, "grad_norm": 9.9375, "learning_rate": 1.8397640804486548e-05, "loss": 1.0976, "mean_token_accuracy": 0.7070813417434693, "num_tokens": 41692130.0, "step": 18040 }, { "entropy": 1.124664169549942, "epoch": 2.645898995822033, "grad_norm": 7.6875, "learning_rate": 1.8394861549137642e-05, "loss": 1.1266, "mean_token_accuracy": 0.7187464326620102, "num_tokens": 41716579.0, "step": 18050 }, { "entropy": 1.2149357318878173, "epoch": 2.647364949058125, "grad_norm": 8.9375, "learning_rate": 1.8392080095904446e-05, "loss": 1.171, "mean_token_accuracy": 0.6979780226945878, "num_tokens": 41738573.0, "step": 18060 }, { "entropy": 1.1536041229963303, "epoch": 2.648830902294217, "grad_norm": 14.1875, "learning_rate": 1.8389296445515182e-05, "loss": 1.1359, "mean_token_accuracy": 0.7219111382961273, "num_tokens": 41764900.0, "step": 18070 }, { "entropy": 1.209172335267067, "epoch": 2.650296855530309, "grad_norm": 9.0, "learning_rate": 1.8386510598698647e-05, "loss": 1.1499, "mean_token_accuracy": 0.7025201618671417, "num_tokens": 41785567.0, "step": 18080 }, { "entropy": 1.24893159866333, "epoch": 2.6517628087664002, "grad_norm": 13.75, "learning_rate": 1.8383722556184203e-05, "loss": 1.264, "mean_token_accuracy": 0.6857050389051438, "num_tokens": 41808915.0, "step": 18090 }, { "entropy": 0.9671169757843018, "epoch": 2.653228762002492, "grad_norm": 8.3125, "learning_rate": 1.8380932318701808e-05, "loss": 0.9237, "mean_token_accuracy": 0.7520599007606507, "num_tokens": 41834122.0, "step": 18100 }, { "entropy": 0.9974233627319335, "epoch": 2.654694715238584, "grad_norm": 9.6875, "learning_rate": 1.8378139886981975e-05, "loss": 0.9559, "mean_token_accuracy": 0.7454344213008881, "num_tokens": 41860239.0, "step": 18110 }, { "entropy": 1.085248053073883, "epoch": 2.6561606684746755, "grad_norm": 8.75, "learning_rate": 1.83753452617558e-05, "loss": 0.9985, "mean_token_accuracy": 0.728446489572525, "num_tokens": 41885099.0, "step": 18120 }, { "entropy": 1.1779681518673897, "epoch": 2.6576266217107674, "grad_norm": 7.90625, "learning_rate": 1.8372548443754952e-05, "loss": 1.1638, "mean_token_accuracy": 0.7073743134737015, "num_tokens": 41911090.0, "step": 18130 }, { "entropy": 1.1216196060180663, "epoch": 2.6590925749468592, "grad_norm": 7.84375, "learning_rate": 1.836974943371168e-05, "loss": 1.1172, "mean_token_accuracy": 0.7258982598781586, "num_tokens": 41935941.0, "step": 18140 }, { "entropy": 1.155520847439766, "epoch": 2.6605585281829507, "grad_norm": 7.5, "learning_rate": 1.8366948232358793e-05, "loss": 1.1003, "mean_token_accuracy": 0.715003925561905, "num_tokens": 41959595.0, "step": 18150 }, { "entropy": 0.9841071516275406, "epoch": 2.6620244814190426, "grad_norm": 9.4375, "learning_rate": 1.8364144840429685e-05, "loss": 0.9523, "mean_token_accuracy": 0.7491332918405533, "num_tokens": 41985763.0, "step": 18160 }, { "entropy": 1.099982699751854, "epoch": 2.6634904346551345, "grad_norm": 7.1875, "learning_rate": 1.8361339258658322e-05, "loss": 1.0665, "mean_token_accuracy": 0.7285169988870621, "num_tokens": 42008950.0, "step": 18170 }, { "entropy": 1.2005853533744812, "epoch": 2.6649563878912264, "grad_norm": 12.875, "learning_rate": 1.8358531487779246e-05, "loss": 1.1643, "mean_token_accuracy": 0.7086342632770538, "num_tokens": 42032530.0, "step": 18180 }, { "entropy": 1.0169757902622223, "epoch": 2.6664223411273182, "grad_norm": 6.71875, "learning_rate": 1.8355721528527558e-05, "loss": 0.9923, "mean_token_accuracy": 0.7430845111608505, "num_tokens": 42054581.0, "step": 18190 }, { "entropy": 1.1968425333499908, "epoch": 2.6678882943634097, "grad_norm": 7.84375, "learning_rate": 1.835290938163895e-05, "loss": 1.1355, "mean_token_accuracy": 0.7130108386278152, "num_tokens": 42079577.0, "step": 18200 }, { "entropy": 1.1729051321744919, "epoch": 2.6693542475995016, "grad_norm": 13.125, "learning_rate": 1.835009504784968e-05, "loss": 1.1632, "mean_token_accuracy": 0.7069114282727241, "num_tokens": 42103911.0, "step": 18210 }, { "entropy": 1.1595571160316467, "epoch": 2.6708202008355935, "grad_norm": 9.875, "learning_rate": 1.8347278527896567e-05, "loss": 1.1391, "mean_token_accuracy": 0.713091692328453, "num_tokens": 42126312.0, "step": 18220 }, { "entropy": 1.0731808453798295, "epoch": 2.672286154071685, "grad_norm": 9.375, "learning_rate": 1.8344459822517023e-05, "loss": 1.0017, "mean_token_accuracy": 0.7322685986757278, "num_tokens": 42149118.0, "step": 18230 }, { "entropy": 1.1528395771980287, "epoch": 2.673752107307777, "grad_norm": 12.5625, "learning_rate": 1.8341638932449018e-05, "loss": 1.1249, "mean_token_accuracy": 0.7096782773733139, "num_tokens": 42168546.0, "step": 18240 }, { "entropy": 1.2671992734074593, "epoch": 2.6752180605438687, "grad_norm": 12.6875, "learning_rate": 1.8338815858431096e-05, "loss": 1.215, "mean_token_accuracy": 0.6914318203926086, "num_tokens": 42190889.0, "step": 18250 }, { "entropy": 1.1845868468284606, "epoch": 2.6766840137799606, "grad_norm": 9.6875, "learning_rate": 1.8335990601202378e-05, "loss": 1.1526, "mean_token_accuracy": 0.7033950090408325, "num_tokens": 42212528.0, "step": 18260 }, { "entropy": 1.215462937951088, "epoch": 2.6781499670160525, "grad_norm": 13.75, "learning_rate": 1.8333163161502554e-05, "loss": 1.1986, "mean_token_accuracy": 0.6948407143354416, "num_tokens": 42235581.0, "step": 18270 }, { "entropy": 1.1938743472099305, "epoch": 2.679615920252144, "grad_norm": 9.125, "learning_rate": 1.833033354007188e-05, "loss": 1.1696, "mean_token_accuracy": 0.7061221927404404, "num_tokens": 42257595.0, "step": 18280 }, { "entropy": 1.2935110539197923, "epoch": 2.681081873488236, "grad_norm": 9.0625, "learning_rate": 1.8327501737651188e-05, "loss": 1.2283, "mean_token_accuracy": 0.6858452141284943, "num_tokens": 42280885.0, "step": 18290 }, { "entropy": 1.13335522711277, "epoch": 2.6825478267243277, "grad_norm": 9.875, "learning_rate": 1.8324667754981886e-05, "loss": 1.1281, "mean_token_accuracy": 0.7219656050205231, "num_tokens": 42306383.0, "step": 18300 }, { "entropy": 1.1716554284095764, "epoch": 2.684013779960419, "grad_norm": 13.8125, "learning_rate": 1.832183159280594e-05, "loss": 1.1049, "mean_token_accuracy": 0.7107885628938675, "num_tokens": 42330884.0, "step": 18310 }, { "entropy": 1.2187236428260804, "epoch": 2.685479733196511, "grad_norm": 9.375, "learning_rate": 1.8318993251865907e-05, "loss": 1.1376, "mean_token_accuracy": 0.6986217230558396, "num_tokens": 42357318.0, "step": 18320 }, { "entropy": 1.0689553409814834, "epoch": 2.686945686432603, "grad_norm": 8.375, "learning_rate": 1.8316152732904888e-05, "loss": 1.0266, "mean_token_accuracy": 0.7322678893804551, "num_tokens": 42381178.0, "step": 18330 }, { "entropy": 1.1992841124534608, "epoch": 2.6884116396686943, "grad_norm": 12.1875, "learning_rate": 1.8313310036666573e-05, "loss": 1.1481, "mean_token_accuracy": 0.7000740051269532, "num_tokens": 42401701.0, "step": 18340 }, { "entropy": 1.3585025101900101, "epoch": 2.6898775929047862, "grad_norm": 11.0625, "learning_rate": 1.8310465163895222e-05, "loss": 1.3426, "mean_token_accuracy": 0.6740335881710052, "num_tokens": 42424449.0, "step": 18350 }, { "entropy": 1.1860774174332618, "epoch": 2.691343546140878, "grad_norm": 10.625, "learning_rate": 1.830761811533565e-05, "loss": 1.1022, "mean_token_accuracy": 0.7046504259109497, "num_tokens": 42448684.0, "step": 18360 }, { "entropy": 1.2194811537861825, "epoch": 2.69280949937697, "grad_norm": 9.6875, "learning_rate": 1.830476889173326e-05, "loss": 1.201, "mean_token_accuracy": 0.7004662171006203, "num_tokens": 42473117.0, "step": 18370 }, { "entropy": 1.1652371406555175, "epoch": 2.694275452613062, "grad_norm": 9.5625, "learning_rate": 1.8301917493834013e-05, "loss": 1.1113, "mean_token_accuracy": 0.7142216175794601, "num_tokens": 42496538.0, "step": 18380 }, { "entropy": 1.0914501428604126, "epoch": 2.6957414058491533, "grad_norm": 9.125, "learning_rate": 1.829906392238444e-05, "loss": 1.0954, "mean_token_accuracy": 0.7157482951879501, "num_tokens": 42519830.0, "step": 18390 }, { "entropy": 1.0003580898046494, "epoch": 2.6972073590852452, "grad_norm": 8.875, "learning_rate": 1.8296208178131647e-05, "loss": 0.9469, "mean_token_accuracy": 0.7422969579696655, "num_tokens": 42545567.0, "step": 18400 }, { "entropy": 1.2253438234329224, "epoch": 2.698673312321337, "grad_norm": 9.25, "learning_rate": 1.8293350261823305e-05, "loss": 1.1648, "mean_token_accuracy": 0.7012060195207596, "num_tokens": 42568503.0, "step": 18410 }, { "entropy": 1.2442308098077774, "epoch": 2.7001392655574286, "grad_norm": 9.8125, "learning_rate": 1.829049017420765e-05, "loss": 1.1763, "mean_token_accuracy": 0.7043327242136002, "num_tokens": 42593250.0, "step": 18420 }, { "entropy": 1.207988405227661, "epoch": 2.7016052187935204, "grad_norm": 6.34375, "learning_rate": 1.828762791603349e-05, "loss": 1.2203, "mean_token_accuracy": 0.6979216665029526, "num_tokens": 42613454.0, "step": 18430 }, { "entropy": 1.291209378838539, "epoch": 2.7030711720296123, "grad_norm": 8.3125, "learning_rate": 1.8284763488050203e-05, "loss": 1.2671, "mean_token_accuracy": 0.683272360265255, "num_tokens": 42639179.0, "step": 18440 }, { "entropy": 1.1710103273391723, "epoch": 2.704537125265704, "grad_norm": 8.6875, "learning_rate": 1.828189689100774e-05, "loss": 1.1104, "mean_token_accuracy": 0.7096611142158509, "num_tokens": 42660927.0, "step": 18450 }, { "entropy": 1.1640239372849464, "epoch": 2.7060030785017957, "grad_norm": 6.84375, "learning_rate": 1.82790281256566e-05, "loss": 1.0643, "mean_token_accuracy": 0.70902698636055, "num_tokens": 42680419.0, "step": 18460 }, { "entropy": 1.4398102641105652, "epoch": 2.7074690317378876, "grad_norm": 10.6875, "learning_rate": 1.827615719274788e-05, "loss": 1.3901, "mean_token_accuracy": 0.6618187606334687, "num_tokens": 42702029.0, "step": 18470 }, { "entropy": 0.9297709420323372, "epoch": 2.7089349849739794, "grad_norm": 7.375, "learning_rate": 1.8273284093033213e-05, "loss": 0.8917, "mean_token_accuracy": 0.7614335238933563, "num_tokens": 42729855.0, "step": 18480 }, { "entropy": 1.196236726641655, "epoch": 2.7104009382100713, "grad_norm": 9.0625, "learning_rate": 1.8270408827264824e-05, "loss": 1.1295, "mean_token_accuracy": 0.7097253739833832, "num_tokens": 42751577.0, "step": 18490 }, { "entropy": 1.2634301245212556, "epoch": 2.7118668914461628, "grad_norm": 6.96875, "learning_rate": 1.8267531396195486e-05, "loss": 1.206, "mean_token_accuracy": 0.6943792253732681, "num_tokens": 42773534.0, "step": 18500 }, { "entropy": 1.0668329834938048, "epoch": 2.7133328446822547, "grad_norm": 9.3125, "learning_rate": 1.8264651800578558e-05, "loss": 0.9933, "mean_token_accuracy": 0.7268840342760086, "num_tokens": 42797640.0, "step": 18510 }, { "entropy": 1.187787237763405, "epoch": 2.7147987979183466, "grad_norm": 9.0, "learning_rate": 1.8261770041167946e-05, "loss": 1.1294, "mean_token_accuracy": 0.7020867079496383, "num_tokens": 42821401.0, "step": 18520 }, { "entropy": 1.4002118468284608, "epoch": 2.716264751154438, "grad_norm": 11.0, "learning_rate": 1.8258886118718136e-05, "loss": 1.34, "mean_token_accuracy": 0.6651912450790405, "num_tokens": 42839708.0, "step": 18530 }, { "entropy": 1.2183503568172456, "epoch": 2.71773070439053, "grad_norm": 13.1875, "learning_rate": 1.825600003398418e-05, "loss": 1.1967, "mean_token_accuracy": 0.6989215463399887, "num_tokens": 42860322.0, "step": 18540 }, { "entropy": 1.108153659105301, "epoch": 2.7191966576266218, "grad_norm": 8.125, "learning_rate": 1.8253111787721694e-05, "loss": 1.0823, "mean_token_accuracy": 0.7239424705505371, "num_tokens": 42883866.0, "step": 18550 }, { "entropy": 1.2667422413825988, "epoch": 2.720662610862713, "grad_norm": 11.9375, "learning_rate": 1.8250221380686847e-05, "loss": 1.246, "mean_token_accuracy": 0.6924011915922165, "num_tokens": 42906510.0, "step": 18560 }, { "entropy": 1.1871558874845505, "epoch": 2.722128564098805, "grad_norm": 10.875, "learning_rate": 1.82473288136364e-05, "loss": 1.1972, "mean_token_accuracy": 0.7062875479459763, "num_tokens": 42927753.0, "step": 18570 }, { "entropy": 1.299156618118286, "epoch": 2.723594517334897, "grad_norm": 11.25, "learning_rate": 1.824443408732765e-05, "loss": 1.2808, "mean_token_accuracy": 0.6816005200147629, "num_tokens": 42948435.0, "step": 18580 }, { "entropy": 1.1113072365522385, "epoch": 2.725060470570989, "grad_norm": 10.25, "learning_rate": 1.824153720251849e-05, "loss": 1.0028, "mean_token_accuracy": 0.7243814289569854, "num_tokens": 42974271.0, "step": 18590 }, { "entropy": 1.2938610911369324, "epoch": 2.7265264238070808, "grad_norm": 7.0, "learning_rate": 1.8238638159967348e-05, "loss": 1.3043, "mean_token_accuracy": 0.6843288242816925, "num_tokens": 42996841.0, "step": 18600 }, { "entropy": 1.0413832783699035, "epoch": 2.727992377043172, "grad_norm": 15.6875, "learning_rate": 1.8235736960433238e-05, "loss": 0.9936, "mean_token_accuracy": 0.7420460820198059, "num_tokens": 43019649.0, "step": 18610 }, { "entropy": 1.049288049340248, "epoch": 2.729458330279264, "grad_norm": 9.6875, "learning_rate": 1.8232833604675733e-05, "loss": 0.9938, "mean_token_accuracy": 0.7303562819957733, "num_tokens": 43041264.0, "step": 18620 }, { "entropy": 1.1061082094907762, "epoch": 2.730924283515356, "grad_norm": 7.25, "learning_rate": 1.8229928093454968e-05, "loss": 1.0782, "mean_token_accuracy": 0.7229527950286865, "num_tokens": 43065405.0, "step": 18630 }, { "entropy": 1.147709134221077, "epoch": 2.7323902367514474, "grad_norm": 9.1875, "learning_rate": 1.822702042753164e-05, "loss": 1.0582, "mean_token_accuracy": 0.7141040384769439, "num_tokens": 43086945.0, "step": 18640 }, { "entropy": 1.3120196014642715, "epoch": 2.7338561899875393, "grad_norm": 8.625, "learning_rate": 1.8224110607667018e-05, "loss": 1.3161, "mean_token_accuracy": 0.676306089758873, "num_tokens": 43109950.0, "step": 18650 }, { "entropy": 1.2841196358203888, "epoch": 2.735322143223631, "grad_norm": 11.0, "learning_rate": 1.822119863462293e-05, "loss": 1.1922, "mean_token_accuracy": 0.6958766639232635, "num_tokens": 43134187.0, "step": 18660 }, { "entropy": 1.1712326169013978, "epoch": 2.736788096459723, "grad_norm": 12.4375, "learning_rate": 1.821828450916177e-05, "loss": 1.1383, "mean_token_accuracy": 0.7042418152093888, "num_tokens": 43156238.0, "step": 18670 }, { "entropy": 1.2346252143383025, "epoch": 2.7382540496958145, "grad_norm": 11.4375, "learning_rate": 1.821536823204649e-05, "loss": 1.2044, "mean_token_accuracy": 0.7046827882528305, "num_tokens": 43176130.0, "step": 18680 }, { "entropy": 1.1690506041049957, "epoch": 2.7397200029319064, "grad_norm": 11.75, "learning_rate": 1.8212449804040614e-05, "loss": 1.1192, "mean_token_accuracy": 0.7100697994232178, "num_tokens": 43195474.0, "step": 18690 }, { "entropy": 1.0973882675170898, "epoch": 2.7411859561679983, "grad_norm": 6.28125, "learning_rate": 1.8209529225908222e-05, "loss": 1.0384, "mean_token_accuracy": 0.7199729084968567, "num_tokens": 43218969.0, "step": 18700 }, { "entropy": 1.17197183072567, "epoch": 2.74265190940409, "grad_norm": 9.8125, "learning_rate": 1.8206606498413962e-05, "loss": 1.1404, "mean_token_accuracy": 0.707529079914093, "num_tokens": 43240949.0, "step": 18710 }, { "entropy": 1.1720232665538788, "epoch": 2.7441178626401816, "grad_norm": 9.8125, "learning_rate": 1.8203681622323036e-05, "loss": 1.1691, "mean_token_accuracy": 0.7115532875061035, "num_tokens": 43264183.0, "step": 18720 }, { "entropy": 1.2463253036141395, "epoch": 2.7455838158762735, "grad_norm": 8.75, "learning_rate": 1.8200754598401223e-05, "loss": 1.2394, "mean_token_accuracy": 0.6956313252449036, "num_tokens": 43284906.0, "step": 18730 }, { "entropy": 1.0887647315859794, "epoch": 2.7470497691123654, "grad_norm": 9.125, "learning_rate": 1.8197825427414847e-05, "loss": 1.0113, "mean_token_accuracy": 0.72621248960495, "num_tokens": 43310459.0, "step": 18740 }, { "entropy": 1.3123018831014632, "epoch": 2.748515722348457, "grad_norm": 6.875, "learning_rate": 1.8194894110130815e-05, "loss": 1.2484, "mean_token_accuracy": 0.6920823752880096, "num_tokens": 43333068.0, "step": 18750 }, { "entropy": 1.293990468978882, "epoch": 2.7499816755845488, "grad_norm": 11.875, "learning_rate": 1.819196064731657e-05, "loss": 1.2912, "mean_token_accuracy": 0.6881831347942352, "num_tokens": 43355149.0, "step": 18760 }, { "entropy": 1.1363577544689178, "epoch": 2.7514476288206406, "grad_norm": 9.5, "learning_rate": 1.8189025039740143e-05, "loss": 1.102, "mean_token_accuracy": 0.7127518326044082, "num_tokens": 43374705.0, "step": 18770 }, { "entropy": 1.2889260590076446, "epoch": 2.7529135820567325, "grad_norm": 12.5625, "learning_rate": 1.8186087288170105e-05, "loss": 1.2593, "mean_token_accuracy": 0.688190707564354, "num_tokens": 43398621.0, "step": 18780 }, { "entropy": 1.242435485124588, "epoch": 2.7543795352928244, "grad_norm": 14.125, "learning_rate": 1.8183147393375603e-05, "loss": 1.1938, "mean_token_accuracy": 0.7022433280944824, "num_tokens": 43423660.0, "step": 18790 }, { "entropy": 0.9798844784498215, "epoch": 2.755845488528916, "grad_norm": 9.4375, "learning_rate": 1.8180205356126342e-05, "loss": 0.8739, "mean_token_accuracy": 0.747201269865036, "num_tokens": 43450461.0, "step": 18800 }, { "entropy": 1.3365817099809647, "epoch": 2.7573114417650078, "grad_norm": 8.1875, "learning_rate": 1.817726117719258e-05, "loss": 1.3431, "mean_token_accuracy": 0.6721755862236023, "num_tokens": 43471498.0, "step": 18810 }, { "entropy": 1.1991151094436645, "epoch": 2.7587773950010996, "grad_norm": 6.9375, "learning_rate": 1.8174314857345145e-05, "loss": 1.1498, "mean_token_accuracy": 0.7075035244226455, "num_tokens": 43493483.0, "step": 18820 }, { "entropy": 1.1220104098320007, "epoch": 2.760243348237191, "grad_norm": 7.09375, "learning_rate": 1.817136639735542e-05, "loss": 1.1122, "mean_token_accuracy": 0.7191101491451264, "num_tokens": 43518729.0, "step": 18830 }, { "entropy": 1.4611378073692323, "epoch": 2.761709301473283, "grad_norm": 11.9375, "learning_rate": 1.8168415797995352e-05, "loss": 1.4218, "mean_token_accuracy": 0.6565647631883621, "num_tokens": 43541034.0, "step": 18840 }, { "entropy": 1.1063042610883713, "epoch": 2.763175254709375, "grad_norm": 6.90625, "learning_rate": 1.816546306003745e-05, "loss": 1.0856, "mean_token_accuracy": 0.7229885250329972, "num_tokens": 43565329.0, "step": 18850 }, { "entropy": 1.3122300535440445, "epoch": 2.7646412079454663, "grad_norm": 12.9375, "learning_rate": 1.816250818425477e-05, "loss": 1.253, "mean_token_accuracy": 0.6848339334130287, "num_tokens": 43586726.0, "step": 18860 }, { "entropy": 1.151192620396614, "epoch": 2.766107161181558, "grad_norm": 7.1875, "learning_rate": 1.8159551171420943e-05, "loss": 1.1145, "mean_token_accuracy": 0.7120667934417725, "num_tokens": 43607988.0, "step": 18870 }, { "entropy": 1.1103577494621277, "epoch": 2.76757311441765, "grad_norm": 9.6875, "learning_rate": 1.8156592022310153e-05, "loss": 1.052, "mean_token_accuracy": 0.7214626699686051, "num_tokens": 43634689.0, "step": 18880 }, { "entropy": 1.3188464254140855, "epoch": 2.769039067653742, "grad_norm": 9.5, "learning_rate": 1.8153630737697146e-05, "loss": 1.3071, "mean_token_accuracy": 0.6719099760055542, "num_tokens": 43655135.0, "step": 18890 }, { "entropy": 0.990566436946392, "epoch": 2.770505020889834, "grad_norm": 11.125, "learning_rate": 1.8150667318357217e-05, "loss": 0.9368, "mean_token_accuracy": 0.7458571404218673, "num_tokens": 43678174.0, "step": 18900 }, { "entropy": 1.133150777220726, "epoch": 2.7719709741259253, "grad_norm": 13.4375, "learning_rate": 1.8147701765066238e-05, "loss": 1.0788, "mean_token_accuracy": 0.7136897057294845, "num_tokens": 43698936.0, "step": 18910 }, { "entropy": 1.2095217823982238, "epoch": 2.773436927362017, "grad_norm": 8.75, "learning_rate": 1.8144734078600625e-05, "loss": 1.1503, "mean_token_accuracy": 0.705208596587181, "num_tokens": 43721486.0, "step": 18920 }, { "entropy": 1.2693369954824447, "epoch": 2.774902880598109, "grad_norm": 8.5625, "learning_rate": 1.8141764259737352e-05, "loss": 1.2184, "mean_token_accuracy": 0.6927412003278732, "num_tokens": 43746464.0, "step": 18930 }, { "entropy": 1.3673241525888442, "epoch": 2.7763688338342005, "grad_norm": 10.9375, "learning_rate": 1.8138792309253967e-05, "loss": 1.3851, "mean_token_accuracy": 0.6718523949384689, "num_tokens": 43767465.0, "step": 18940 }, { "entropy": 1.2756041795015336, "epoch": 2.7778347870702924, "grad_norm": 12.25, "learning_rate": 1.8135818227928556e-05, "loss": 1.2388, "mean_token_accuracy": 0.6904418349266053, "num_tokens": 43789676.0, "step": 18950 }, { "entropy": 1.2207011103630065, "epoch": 2.7793007403063843, "grad_norm": 7.1875, "learning_rate": 1.8132842016539777e-05, "loss": 1.1556, "mean_token_accuracy": 0.6974404364824295, "num_tokens": 43810872.0, "step": 18960 }, { "entropy": 1.189922833442688, "epoch": 2.7807666935424757, "grad_norm": 7.78125, "learning_rate": 1.8129863675866843e-05, "loss": 1.1547, "mean_token_accuracy": 0.7025397747755051, "num_tokens": 43833012.0, "step": 18970 }, { "entropy": 1.2136468142271042, "epoch": 2.7822326467785676, "grad_norm": 9.875, "learning_rate": 1.8126883206689515e-05, "loss": 1.1618, "mean_token_accuracy": 0.7074248492717743, "num_tokens": 43856733.0, "step": 18980 }, { "entropy": 1.1940225958824158, "epoch": 2.7836986000146595, "grad_norm": 11.75, "learning_rate": 1.8123900609788124e-05, "loss": 1.1851, "mean_token_accuracy": 0.7071611285209656, "num_tokens": 43880380.0, "step": 18990 }, { "entropy": 1.270387876033783, "epoch": 2.7851645532507514, "grad_norm": 9.4375, "learning_rate": 1.8120915885943553e-05, "loss": 1.2245, "mean_token_accuracy": 0.6890469670295716, "num_tokens": 43900779.0, "step": 19000 }, { "entropy": 1.3142630517482758, "epoch": 2.7866305064868433, "grad_norm": 7.375, "learning_rate": 1.8117929035937237e-05, "loss": 1.2764, "mean_token_accuracy": 0.6857962071895599, "num_tokens": 43923796.0, "step": 19010 }, { "entropy": 1.3350618422031402, "epoch": 2.7880964597229347, "grad_norm": 9.6875, "learning_rate": 1.811494006055118e-05, "loss": 1.3067, "mean_token_accuracy": 0.6850470796227455, "num_tokens": 43942145.0, "step": 19020 }, { "entropy": 1.1256288170814515, "epoch": 2.7895624129590266, "grad_norm": 10.5625, "learning_rate": 1.8111948960567932e-05, "loss": 1.0791, "mean_token_accuracy": 0.719894403219223, "num_tokens": 43966951.0, "step": 19030 }, { "entropy": 1.2699384599924088, "epoch": 2.7910283661951185, "grad_norm": 10.75, "learning_rate": 1.81089557367706e-05, "loss": 1.2583, "mean_token_accuracy": 0.69660414904356, "num_tokens": 43991381.0, "step": 19040 }, { "entropy": 1.1867402628064156, "epoch": 2.79249431943121, "grad_norm": 7.09375, "learning_rate": 1.8105960389942845e-05, "loss": 1.1571, "mean_token_accuracy": 0.7048907309770585, "num_tokens": 44011205.0, "step": 19050 }, { "entropy": 1.1813104152679443, "epoch": 2.793960272667302, "grad_norm": 12.5625, "learning_rate": 1.8102962920868896e-05, "loss": 1.1243, "mean_token_accuracy": 0.7115608483552933, "num_tokens": 44037560.0, "step": 19060 }, { "entropy": 1.183723059296608, "epoch": 2.7954262259033937, "grad_norm": 8.8125, "learning_rate": 1.8099963330333527e-05, "loss": 1.1571, "mean_token_accuracy": 0.7161055162549019, "num_tokens": 44060880.0, "step": 19070 }, { "entropy": 1.2321641474962235, "epoch": 2.796892179139485, "grad_norm": 9.25, "learning_rate": 1.8096961619122063e-05, "loss": 1.1991, "mean_token_accuracy": 0.7034396797418594, "num_tokens": 44082540.0, "step": 19080 }, { "entropy": 1.379555681347847, "epoch": 2.798358132375577, "grad_norm": 15.625, "learning_rate": 1.80939577880204e-05, "loss": 1.3707, "mean_token_accuracy": 0.674259552359581, "num_tokens": 44100147.0, "step": 19090 }, { "entropy": 1.141817006468773, "epoch": 2.799824085611669, "grad_norm": 10.6875, "learning_rate": 1.8090951837814982e-05, "loss": 1.1254, "mean_token_accuracy": 0.719031172990799, "num_tokens": 44124771.0, "step": 19100 }, { "entropy": 1.0979144364595412, "epoch": 2.801290038847761, "grad_norm": 12.8125, "learning_rate": 1.8087943769292796e-05, "loss": 1.0312, "mean_token_accuracy": 0.7298583835363388, "num_tokens": 44146433.0, "step": 19110 }, { "entropy": 0.9414590895175934, "epoch": 2.8027559920838527, "grad_norm": 7.90625, "learning_rate": 1.8084933583241402e-05, "loss": 0.872, "mean_token_accuracy": 0.7511460840702057, "num_tokens": 44170095.0, "step": 19120 }, { "entropy": 1.130419972538948, "epoch": 2.804221945319944, "grad_norm": 8.6875, "learning_rate": 1.8081921280448903e-05, "loss": 1.0874, "mean_token_accuracy": 0.712906014919281, "num_tokens": 44191618.0, "step": 19130 }, { "entropy": 1.1456691533327104, "epoch": 2.805687898556036, "grad_norm": 11.125, "learning_rate": 1.8078906861703958e-05, "loss": 1.068, "mean_token_accuracy": 0.7174263224005699, "num_tokens": 44218091.0, "step": 19140 }, { "entropy": 1.2970322906970977, "epoch": 2.807153851792128, "grad_norm": 8.6875, "learning_rate": 1.807589032779578e-05, "loss": 1.309, "mean_token_accuracy": 0.6843457221984863, "num_tokens": 44237630.0, "step": 19150 }, { "entropy": 1.2448418647050858, "epoch": 2.8086198050282194, "grad_norm": 10.875, "learning_rate": 1.8072871679514142e-05, "loss": 1.2371, "mean_token_accuracy": 0.7013771444559097, "num_tokens": 44258215.0, "step": 19160 }, { "entropy": 1.1110518544912338, "epoch": 2.8100857582643113, "grad_norm": 9.75, "learning_rate": 1.8069850917649364e-05, "loss": 1.0457, "mean_token_accuracy": 0.7309638440608979, "num_tokens": 44283643.0, "step": 19170 }, { "entropy": 1.2381129920482636, "epoch": 2.811551711500403, "grad_norm": 14.0625, "learning_rate": 1.8066828042992317e-05, "loss": 1.2592, "mean_token_accuracy": 0.6939177304506302, "num_tokens": 44307264.0, "step": 19180 }, { "entropy": 1.386128455400467, "epoch": 2.813017664736495, "grad_norm": 9.1875, "learning_rate": 1.806380305633443e-05, "loss": 1.3137, "mean_token_accuracy": 0.6795546755194664, "num_tokens": 44327293.0, "step": 19190 }, { "entropy": 1.1142626792192458, "epoch": 2.814483617972587, "grad_norm": 7.53125, "learning_rate": 1.806077595846769e-05, "loss": 1.0993, "mean_token_accuracy": 0.7275420010089875, "num_tokens": 44351343.0, "step": 19200 }, { "entropy": 1.1182308956980704, "epoch": 2.8159495712086784, "grad_norm": 9.1875, "learning_rate": 1.8057746750184624e-05, "loss": 1.0315, "mean_token_accuracy": 0.7283850654959678, "num_tokens": 44376688.0, "step": 19210 }, { "entropy": 1.099701038002968, "epoch": 2.8174155244447703, "grad_norm": 13.625, "learning_rate": 1.805471543227832e-05, "loss": 1.0736, "mean_token_accuracy": 0.7204705506563187, "num_tokens": 44400207.0, "step": 19220 }, { "entropy": 1.1277052879333496, "epoch": 2.818881477680862, "grad_norm": 9.5625, "learning_rate": 1.805168200554242e-05, "loss": 1.0605, "mean_token_accuracy": 0.7208291947841644, "num_tokens": 44425386.0, "step": 19230 }, { "entropy": 1.1688789486885072, "epoch": 2.8203474309169536, "grad_norm": 8.3125, "learning_rate": 1.8048646470771108e-05, "loss": 1.1511, "mean_token_accuracy": 0.7094387263059616, "num_tokens": 44447910.0, "step": 19240 }, { "entropy": 1.2415332466363906, "epoch": 2.8218133841530455, "grad_norm": 8.25, "learning_rate": 1.804560882875913e-05, "loss": 1.1973, "mean_token_accuracy": 0.6927513360977173, "num_tokens": 44471750.0, "step": 19250 }, { "entropy": 1.115514686703682, "epoch": 2.8232793373891374, "grad_norm": 9.125, "learning_rate": 1.8042569080301784e-05, "loss": 1.1151, "mean_token_accuracy": 0.7111495971679688, "num_tokens": 44495437.0, "step": 19260 }, { "entropy": 1.290826439857483, "epoch": 2.824745290625229, "grad_norm": 9.4375, "learning_rate": 1.803952722619491e-05, "loss": 1.1843, "mean_token_accuracy": 0.6962664976716042, "num_tokens": 44518125.0, "step": 19270 }, { "entropy": 1.1560094982385636, "epoch": 2.8262112438613207, "grad_norm": 14.5625, "learning_rate": 1.803648326723491e-05, "loss": 1.0994, "mean_token_accuracy": 0.7081321358680726, "num_tokens": 44542162.0, "step": 19280 }, { "entropy": 1.2213454484939574, "epoch": 2.8276771970974126, "grad_norm": 6.6875, "learning_rate": 1.803343720421873e-05, "loss": 1.1814, "mean_token_accuracy": 0.6930972307920455, "num_tokens": 44567448.0, "step": 19290 }, { "entropy": 1.133961956202984, "epoch": 2.8291431503335045, "grad_norm": 7.28125, "learning_rate": 1.8030389037943867e-05, "loss": 1.1213, "mean_token_accuracy": 0.7183801680803299, "num_tokens": 44593853.0, "step": 19300 }, { "entropy": 1.1520284920930863, "epoch": 2.8306091035695964, "grad_norm": 8.75, "learning_rate": 1.802733876920837e-05, "loss": 1.1068, "mean_token_accuracy": 0.7208308890461922, "num_tokens": 44622371.0, "step": 19310 }, { "entropy": 1.1254413306713105, "epoch": 2.832075056805688, "grad_norm": 9.125, "learning_rate": 1.8024286398810845e-05, "loss": 1.0588, "mean_token_accuracy": 0.7179519683122635, "num_tokens": 44645569.0, "step": 19320 }, { "entropy": 1.1653087913990021, "epoch": 2.8335410100417797, "grad_norm": 7.15625, "learning_rate": 1.802123192755044e-05, "loss": 1.1521, "mean_token_accuracy": 0.7134069353342056, "num_tokens": 44667851.0, "step": 19330 }, { "entropy": 1.2193393498659133, "epoch": 2.8350069632778716, "grad_norm": 7.125, "learning_rate": 1.8018175356226853e-05, "loss": 1.1648, "mean_token_accuracy": 0.6999901324510575, "num_tokens": 44690967.0, "step": 19340 }, { "entropy": 1.1827371746301651, "epoch": 2.836472916513963, "grad_norm": 6.875, "learning_rate": 1.8015116685640337e-05, "loss": 1.1795, "mean_token_accuracy": 0.7120277404785156, "num_tokens": 44715694.0, "step": 19350 }, { "entropy": 1.129684990644455, "epoch": 2.837938869750055, "grad_norm": 8.5625, "learning_rate": 1.801205591659169e-05, "loss": 1.0793, "mean_token_accuracy": 0.710765290260315, "num_tokens": 44740391.0, "step": 19360 }, { "entropy": 1.2262619256973266, "epoch": 2.839404822986147, "grad_norm": 10.625, "learning_rate": 1.8008993049882262e-05, "loss": 1.1792, "mean_token_accuracy": 0.6933537542819976, "num_tokens": 44762936.0, "step": 19370 }, { "entropy": 1.1771902799606324, "epoch": 2.8408707762222383, "grad_norm": 8.75, "learning_rate": 1.8005928086313948e-05, "loss": 1.152, "mean_token_accuracy": 0.7072642087936402, "num_tokens": 44787353.0, "step": 19380 }, { "entropy": 1.2410237312316894, "epoch": 2.84233672945833, "grad_norm": 10.0625, "learning_rate": 1.8002861026689205e-05, "loss": 1.1304, "mean_token_accuracy": 0.6982301443815231, "num_tokens": 44806805.0, "step": 19390 }, { "entropy": 1.1632393330335618, "epoch": 2.843802682694422, "grad_norm": 8.875, "learning_rate": 1.7999791871811018e-05, "loss": 1.1222, "mean_token_accuracy": 0.7059114962816239, "num_tokens": 44831174.0, "step": 19400 }, { "entropy": 1.2008585274219512, "epoch": 2.845268635930514, "grad_norm": 7.4375, "learning_rate": 1.799672062248294e-05, "loss": 1.196, "mean_token_accuracy": 0.7046185880899429, "num_tokens": 44851814.0, "step": 19410 }, { "entropy": 0.9731931924819947, "epoch": 2.846734589166606, "grad_norm": 11.125, "learning_rate": 1.799364727950906e-05, "loss": 0.9216, "mean_token_accuracy": 0.756005147099495, "num_tokens": 44875757.0, "step": 19420 }, { "entropy": 1.08997343480587, "epoch": 2.8482005424026973, "grad_norm": 9.3125, "learning_rate": 1.799057184369402e-05, "loss": 1.0602, "mean_token_accuracy": 0.7253169775009155, "num_tokens": 44903008.0, "step": 19430 }, { "entropy": 1.170770075917244, "epoch": 2.849666495638789, "grad_norm": 9.25, "learning_rate": 1.7987494315843008e-05, "loss": 1.1389, "mean_token_accuracy": 0.7106391191482544, "num_tokens": 44927535.0, "step": 19440 }, { "entropy": 1.3472818791866303, "epoch": 2.851132448874881, "grad_norm": 13.875, "learning_rate": 1.7984414696761766e-05, "loss": 1.3301, "mean_token_accuracy": 0.6780015289783478, "num_tokens": 44946853.0, "step": 19450 }, { "entropy": 1.2907212555408478, "epoch": 2.8525984021109725, "grad_norm": 10.875, "learning_rate": 1.7981332987256573e-05, "loss": 1.2593, "mean_token_accuracy": 0.6796556681394577, "num_tokens": 44966913.0, "step": 19460 }, { "entropy": 1.063797877728939, "epoch": 2.8540643553470644, "grad_norm": 9.75, "learning_rate": 1.7978249188134262e-05, "loss": 1.0245, "mean_token_accuracy": 0.725436070561409, "num_tokens": 44993636.0, "step": 19470 }, { "entropy": 1.3535194396972656, "epoch": 2.8555303085831563, "grad_norm": 8.375, "learning_rate": 1.7975163300202214e-05, "loss": 1.2738, "mean_token_accuracy": 0.6816143095493317, "num_tokens": 45018535.0, "step": 19480 }, { "entropy": 1.3345523685216905, "epoch": 2.8569962618192477, "grad_norm": 7.875, "learning_rate": 1.797207532426835e-05, "loss": 1.295, "mean_token_accuracy": 0.6822685778141022, "num_tokens": 45039729.0, "step": 19490 }, { "entropy": 1.231700611114502, "epoch": 2.8584622150553396, "grad_norm": 10.0625, "learning_rate": 1.796898526114115e-05, "loss": 1.196, "mean_token_accuracy": 0.6966937124729157, "num_tokens": 45065138.0, "step": 19500 }, { "entropy": 1.2565504640340805, "epoch": 2.8599281682914315, "grad_norm": 13.5, "learning_rate": 1.7965893111629626e-05, "loss": 1.2301, "mean_token_accuracy": 0.6861053183674812, "num_tokens": 45088220.0, "step": 19510 }, { "entropy": 1.3074562042951583, "epoch": 2.8613941215275234, "grad_norm": 10.375, "learning_rate": 1.796279887654335e-05, "loss": 1.3053, "mean_token_accuracy": 0.6795206010341645, "num_tokens": 45109613.0, "step": 19520 }, { "entropy": 1.19196959733963, "epoch": 2.8628600747636153, "grad_norm": 8.125, "learning_rate": 1.7959702556692424e-05, "loss": 1.1608, "mean_token_accuracy": 0.7070769369602203, "num_tokens": 45132150.0, "step": 19530 }, { "entropy": 1.3210212767124176, "epoch": 2.8643260279997067, "grad_norm": 13.125, "learning_rate": 1.795660415288751e-05, "loss": 1.2455, "mean_token_accuracy": 0.6790771365165711, "num_tokens": 45150693.0, "step": 19540 }, { "entropy": 1.1291760504245758, "epoch": 2.8657919812357986, "grad_norm": 8.5625, "learning_rate": 1.7953503665939812e-05, "loss": 1.0869, "mean_token_accuracy": 0.7184501141309738, "num_tokens": 45174889.0, "step": 19550 }, { "entropy": 1.3429315984249115, "epoch": 2.8672579344718905, "grad_norm": 8.5625, "learning_rate": 1.7950401096661075e-05, "loss": 1.3602, "mean_token_accuracy": 0.6694808512926101, "num_tokens": 45197921.0, "step": 19560 }, { "entropy": 1.2201452791690826, "epoch": 2.868723887707982, "grad_norm": 8.4375, "learning_rate": 1.7947296445863594e-05, "loss": 1.1952, "mean_token_accuracy": 0.6954235732555389, "num_tokens": 45219473.0, "step": 19570 }, { "entropy": 1.2079978957772255, "epoch": 2.870189840944074, "grad_norm": 10.0625, "learning_rate": 1.7944189714360205e-05, "loss": 1.1169, "mean_token_accuracy": 0.7093648463487625, "num_tokens": 45241483.0, "step": 19580 }, { "entropy": 1.1587503463029862, "epoch": 2.8716557941801657, "grad_norm": 7.125, "learning_rate": 1.7941080902964293e-05, "loss": 1.0877, "mean_token_accuracy": 0.7172048181295395, "num_tokens": 45268214.0, "step": 19590 }, { "entropy": 1.1648815035820008, "epoch": 2.8731217474162576, "grad_norm": 15.375, "learning_rate": 1.7937970012489783e-05, "loss": 1.1194, "mean_token_accuracy": 0.7091481924057007, "num_tokens": 45290853.0, "step": 19600 }, { "entropy": 1.1346673965454102, "epoch": 2.874587700652349, "grad_norm": 8.4375, "learning_rate": 1.793485704375115e-05, "loss": 1.0979, "mean_token_accuracy": 0.7276661291718483, "num_tokens": 45314290.0, "step": 19610 }, { "entropy": 1.151353046298027, "epoch": 2.876053653888441, "grad_norm": 14.5625, "learning_rate": 1.793174199756341e-05, "loss": 1.1075, "mean_token_accuracy": 0.7141537129878998, "num_tokens": 45333125.0, "step": 19620 }, { "entropy": 1.0726276457309722, "epoch": 2.877519607124533, "grad_norm": 7.09375, "learning_rate": 1.792862487474212e-05, "loss": 1.0037, "mean_token_accuracy": 0.7345580339431763, "num_tokens": 45357285.0, "step": 19630 }, { "entropy": 1.1662335097789764, "epoch": 2.8789855603606247, "grad_norm": 8.3125, "learning_rate": 1.7925505676103382e-05, "loss": 1.1344, "mean_token_accuracy": 0.7098070830106735, "num_tokens": 45379574.0, "step": 19640 }, { "entropy": 1.3170736014842988, "epoch": 2.880451513596716, "grad_norm": 11.0, "learning_rate": 1.7922384402463845e-05, "loss": 1.2726, "mean_token_accuracy": 0.681411474943161, "num_tokens": 45403210.0, "step": 19650 }, { "entropy": 1.097908142209053, "epoch": 2.881917466832808, "grad_norm": 8.1875, "learning_rate": 1.7919261054640703e-05, "loss": 1.0477, "mean_token_accuracy": 0.7275069296360016, "num_tokens": 45428075.0, "step": 19660 }, { "entropy": 1.2101421028375625, "epoch": 2.8833834200689, "grad_norm": 9.25, "learning_rate": 1.7916135633451684e-05, "loss": 1.1725, "mean_token_accuracy": 0.6911795407533645, "num_tokens": 45451071.0, "step": 19670 }, { "entropy": 1.1531496495008469, "epoch": 2.8848493733049914, "grad_norm": 9.25, "learning_rate": 1.7913008139715064e-05, "loss": 1.0394, "mean_token_accuracy": 0.7147909849882126, "num_tokens": 45473565.0, "step": 19680 }, { "entropy": 0.9758344233036041, "epoch": 2.8863153265410832, "grad_norm": 12.4375, "learning_rate": 1.7909878574249665e-05, "loss": 0.9637, "mean_token_accuracy": 0.7480946391820907, "num_tokens": 45498439.0, "step": 19690 }, { "entropy": 1.0239616081118583, "epoch": 2.887781279777175, "grad_norm": 10.125, "learning_rate": 1.7906746937874847e-05, "loss": 0.988, "mean_token_accuracy": 0.734697338938713, "num_tokens": 45524365.0, "step": 19700 }, { "entropy": 1.1485138714313508, "epoch": 2.889247233013267, "grad_norm": 7.0625, "learning_rate": 1.790361323141051e-05, "loss": 1.1011, "mean_token_accuracy": 0.7151131674647331, "num_tokens": 45549763.0, "step": 19710 }, { "entropy": 1.1481154263019562, "epoch": 2.890713186249359, "grad_norm": 10.5, "learning_rate": 1.7900477455677105e-05, "loss": 1.1031, "mean_token_accuracy": 0.714380419254303, "num_tokens": 45573351.0, "step": 19720 }, { "entropy": 1.2363744586706162, "epoch": 2.8921791394854504, "grad_norm": 12.75, "learning_rate": 1.7897339611495615e-05, "loss": 1.1837, "mean_token_accuracy": 0.6974050521850585, "num_tokens": 45592806.0, "step": 19730 }, { "entropy": 1.1098355680704117, "epoch": 2.8936450927215422, "grad_norm": 9.75, "learning_rate": 1.7894199699687568e-05, "loss": 1.0929, "mean_token_accuracy": 0.7257880359888077, "num_tokens": 45617007.0, "step": 19740 }, { "entropy": 1.2222099125385284, "epoch": 2.895111045957634, "grad_norm": 7.375, "learning_rate": 1.7891057721075036e-05, "loss": 1.159, "mean_token_accuracy": 0.6990696042776108, "num_tokens": 45640725.0, "step": 19750 }, { "entropy": 1.1784106984734535, "epoch": 2.8965769991937256, "grad_norm": 8.0625, "learning_rate": 1.788791367648063e-05, "loss": 1.0678, "mean_token_accuracy": 0.7066678881645203, "num_tokens": 45666538.0, "step": 19760 }, { "entropy": 1.1153238236904144, "epoch": 2.8980429524298175, "grad_norm": 16.875, "learning_rate": 1.78847675667275e-05, "loss": 1.1239, "mean_token_accuracy": 0.7239186316728592, "num_tokens": 45689859.0, "step": 19770 }, { "entropy": 1.2712737947702408, "epoch": 2.8995089056659094, "grad_norm": 9.4375, "learning_rate": 1.788161939263934e-05, "loss": 1.2014, "mean_token_accuracy": 0.6924059092998505, "num_tokens": 45711400.0, "step": 19780 }, { "entropy": 1.1533941715955733, "epoch": 2.900974858902001, "grad_norm": 10.25, "learning_rate": 1.7878469155040382e-05, "loss": 1.1247, "mean_token_accuracy": 0.7178474396467209, "num_tokens": 45732888.0, "step": 19790 }, { "entropy": 1.0490107476711272, "epoch": 2.9024408121380927, "grad_norm": 7.53125, "learning_rate": 1.7875316854755403e-05, "loss": 0.9739, "mean_token_accuracy": 0.729155620932579, "num_tokens": 45755625.0, "step": 19800 }, { "entropy": 1.271466101706028, "epoch": 2.9039067653741846, "grad_norm": 9.5625, "learning_rate": 1.787216249260971e-05, "loss": 1.1969, "mean_token_accuracy": 0.6954784482717514, "num_tokens": 45779581.0, "step": 19810 }, { "entropy": 1.1499080300331115, "epoch": 2.9053727186102765, "grad_norm": 8.5, "learning_rate": 1.7869006069429164e-05, "loss": 1.1283, "mean_token_accuracy": 0.7091289907693863, "num_tokens": 45805804.0, "step": 19820 }, { "entropy": 1.307458084821701, "epoch": 2.9068386718463683, "grad_norm": 9.0, "learning_rate": 1.786584758604015e-05, "loss": 1.3013, "mean_token_accuracy": 0.6739439591765404, "num_tokens": 45828359.0, "step": 19830 }, { "entropy": 1.1472454249858857, "epoch": 2.90830462508246, "grad_norm": 9.0625, "learning_rate": 1.7862687043269603e-05, "loss": 1.0651, "mean_token_accuracy": 0.7170341432094574, "num_tokens": 45850472.0, "step": 19840 }, { "entropy": 1.0133533954620362, "epoch": 2.9097705783185517, "grad_norm": 15.125, "learning_rate": 1.7859524441945e-05, "loss": 0.9618, "mean_token_accuracy": 0.739276859164238, "num_tokens": 45875596.0, "step": 19850 }, { "entropy": 1.239413595199585, "epoch": 2.9112365315546436, "grad_norm": 9.9375, "learning_rate": 1.785635978289434e-05, "loss": 1.184, "mean_token_accuracy": 0.697741436958313, "num_tokens": 45896565.0, "step": 19860 }, { "entropy": 1.1475364595651627, "epoch": 2.912702484790735, "grad_norm": 7.6875, "learning_rate": 1.7853193066946182e-05, "loss": 1.1077, "mean_token_accuracy": 0.7205995976924896, "num_tokens": 45923020.0, "step": 19870 }, { "entropy": 1.1683872193098068, "epoch": 2.914168438026827, "grad_norm": 12.5625, "learning_rate": 1.7850024294929607e-05, "loss": 1.1383, "mean_token_accuracy": 0.7109404951334, "num_tokens": 45945383.0, "step": 19880 }, { "entropy": 1.1686067879199982, "epoch": 2.915634391262919, "grad_norm": 10.0, "learning_rate": 1.7846853467674248e-05, "loss": 1.0658, "mean_token_accuracy": 0.7085000008344651, "num_tokens": 45969476.0, "step": 19890 }, { "entropy": 1.213783848285675, "epoch": 2.9171003444990102, "grad_norm": 6.5625, "learning_rate": 1.784368058601026e-05, "loss": 1.155, "mean_token_accuracy": 0.7092059016227722, "num_tokens": 45993850.0, "step": 19900 }, { "entropy": 0.9507953599095345, "epoch": 2.918566297735102, "grad_norm": 8.9375, "learning_rate": 1.7840505650768353e-05, "loss": 0.8835, "mean_token_accuracy": 0.756884440779686, "num_tokens": 46020082.0, "step": 19910 }, { "entropy": 1.318719521164894, "epoch": 2.920032250971194, "grad_norm": 14.25, "learning_rate": 1.7837328662779764e-05, "loss": 1.3244, "mean_token_accuracy": 0.6858035743236541, "num_tokens": 46042330.0, "step": 19920 }, { "entropy": 1.1856589168310165, "epoch": 2.921498204207286, "grad_norm": 11.625, "learning_rate": 1.7834149622876266e-05, "loss": 1.1742, "mean_token_accuracy": 0.7102255046367645, "num_tokens": 46067886.0, "step": 19930 }, { "entropy": 1.1209787249565124, "epoch": 2.922964157443378, "grad_norm": 11.8125, "learning_rate": 1.7830968531890174e-05, "loss": 1.1133, "mean_token_accuracy": 0.719399881362915, "num_tokens": 46091241.0, "step": 19940 }, { "entropy": 1.1280826717615127, "epoch": 2.9244301106794692, "grad_norm": 7.75, "learning_rate": 1.7827785390654348e-05, "loss": 1.0993, "mean_token_accuracy": 0.7187776416540146, "num_tokens": 46113813.0, "step": 19950 }, { "entropy": 1.108560185134411, "epoch": 2.925896063915561, "grad_norm": 6.875, "learning_rate": 1.7824600200002164e-05, "loss": 1.067, "mean_token_accuracy": 0.7244858920574189, "num_tokens": 46139004.0, "step": 19960 }, { "entropy": 1.2582601264119149, "epoch": 2.927362017151653, "grad_norm": 9.0625, "learning_rate": 1.782141296076755e-05, "loss": 1.2415, "mean_token_accuracy": 0.6923416495323181, "num_tokens": 46163119.0, "step": 19970 }, { "entropy": 1.3896611154079437, "epoch": 2.9288279703877445, "grad_norm": 11.375, "learning_rate": 1.781822367378497e-05, "loss": 1.4073, "mean_token_accuracy": 0.6649805516004562, "num_tokens": 46181317.0, "step": 19980 }, { "entropy": 1.2396588057279587, "epoch": 2.9302939236238363, "grad_norm": 10.8125, "learning_rate": 1.781503233988942e-05, "loss": 1.2354, "mean_token_accuracy": 0.6957247167825699, "num_tokens": 46206606.0, "step": 19990 }, { "entropy": 1.1998330354690552, "epoch": 2.9317598768599282, "grad_norm": 7.75, "learning_rate": 1.7811838959916434e-05, "loss": 1.1095, "mean_token_accuracy": 0.711186084151268, "num_tokens": 46228797.0, "step": 20000 }, { "entropy": 1.262239581346512, "epoch": 2.9332258300960197, "grad_norm": 12.3125, "learning_rate": 1.7808643534702072e-05, "loss": 1.237, "mean_token_accuracy": 0.6908046424388885, "num_tokens": 46252775.0, "step": 20010 }, { "entropy": 1.1602690100669861, "epoch": 2.9346917833321116, "grad_norm": 7.28125, "learning_rate": 1.780544606508295e-05, "loss": 1.127, "mean_token_accuracy": 0.7119058221578598, "num_tokens": 46275049.0, "step": 20020 }, { "entropy": 1.1105126067996025, "epoch": 2.9361577365682034, "grad_norm": 7.53125, "learning_rate": 1.7802246551896202e-05, "loss": 1.0462, "mean_token_accuracy": 0.7233010530471802, "num_tokens": 46300731.0, "step": 20030 }, { "entropy": 1.0161093428730965, "epoch": 2.9376236898042953, "grad_norm": 7.3125, "learning_rate": 1.77990449959795e-05, "loss": 0.9671, "mean_token_accuracy": 0.7425823837518692, "num_tokens": 46325702.0, "step": 20040 }, { "entropy": 1.1101292550563813, "epoch": 2.9390896430403872, "grad_norm": 9.25, "learning_rate": 1.7795841398171058e-05, "loss": 1.0326, "mean_token_accuracy": 0.7258973360061646, "num_tokens": 46349071.0, "step": 20050 }, { "entropy": 1.1843984872102737, "epoch": 2.9405555962764787, "grad_norm": 9.75, "learning_rate": 1.7792635759309616e-05, "loss": 1.2108, "mean_token_accuracy": 0.7064444065093994, "num_tokens": 46371793.0, "step": 20060 }, { "entropy": 1.084637638926506, "epoch": 2.9420215495125706, "grad_norm": 8.9375, "learning_rate": 1.778942808023445e-05, "loss": 1.0311, "mean_token_accuracy": 0.7319386243820191, "num_tokens": 46400026.0, "step": 20070 }, { "entropy": 1.2218910723924636, "epoch": 2.9434875027486624, "grad_norm": 8.25, "learning_rate": 1.778621836178538e-05, "loss": 1.1279, "mean_token_accuracy": 0.7000965148210525, "num_tokens": 46423684.0, "step": 20080 }, { "entropy": 1.1816448032855988, "epoch": 2.944953455984754, "grad_norm": 10.6875, "learning_rate": 1.7783006604802743e-05, "loss": 1.1626, "mean_token_accuracy": 0.7021417826414108, "num_tokens": 46446655.0, "step": 20090 }, { "entropy": 1.2853930830955504, "epoch": 2.9464194092208458, "grad_norm": 8.8125, "learning_rate": 1.7779792810127427e-05, "loss": 1.2186, "mean_token_accuracy": 0.6965573728084564, "num_tokens": 46473202.0, "step": 20100 }, { "entropy": 1.338686364889145, "epoch": 2.9478853624569377, "grad_norm": 13.25, "learning_rate": 1.777657697860084e-05, "loss": 1.3294, "mean_token_accuracy": 0.6805462896823883, "num_tokens": 46491293.0, "step": 20110 }, { "entropy": 1.2275916695594788, "epoch": 2.9493513156930296, "grad_norm": 7.8125, "learning_rate": 1.777335911106493e-05, "loss": 1.2046, "mean_token_accuracy": 0.702023622393608, "num_tokens": 46512047.0, "step": 20120 }, { "entropy": 1.0660468757152557, "epoch": 2.950817268929121, "grad_norm": 8.6875, "learning_rate": 1.777013920836218e-05, "loss": 0.9669, "mean_token_accuracy": 0.7285980463027955, "num_tokens": 46536331.0, "step": 20130 }, { "entropy": 1.1753221303224564, "epoch": 2.952283222165213, "grad_norm": 8.5, "learning_rate": 1.7766917271335598e-05, "loss": 1.1294, "mean_token_accuracy": 0.7016082793474198, "num_tokens": 46561736.0, "step": 20140 }, { "entropy": 1.1598705232143403, "epoch": 2.9537491754013048, "grad_norm": 9.1875, "learning_rate": 1.776369330082873e-05, "loss": 1.1334, "mean_token_accuracy": 0.7105744838714599, "num_tokens": 46585314.0, "step": 20150 }, { "entropy": 1.0779353141784669, "epoch": 2.9552151286373967, "grad_norm": 7.03125, "learning_rate": 1.7760467297685657e-05, "loss": 1.016, "mean_token_accuracy": 0.7257605701684952, "num_tokens": 46610656.0, "step": 20160 }, { "entropy": 1.3804779201745987, "epoch": 2.956681081873488, "grad_norm": 13.375, "learning_rate": 1.7757239262750982e-05, "loss": 1.3709, "mean_token_accuracy": 0.6730569899082184, "num_tokens": 46631116.0, "step": 20170 }, { "entropy": 1.2241089940071106, "epoch": 2.95814703510958, "grad_norm": 9.5, "learning_rate": 1.775400919686986e-05, "loss": 1.1513, "mean_token_accuracy": 0.6933852106332778, "num_tokens": 46654161.0, "step": 20180 }, { "entropy": 1.249384868144989, "epoch": 2.959612988345672, "grad_norm": 11.1875, "learning_rate": 1.7750777100887943e-05, "loss": 1.2289, "mean_token_accuracy": 0.6926375061273575, "num_tokens": 46674914.0, "step": 20190 }, { "entropy": 1.118547123670578, "epoch": 2.9610789415817633, "grad_norm": 10.0, "learning_rate": 1.7747542975651452e-05, "loss": 1.0623, "mean_token_accuracy": 0.7173933565616608, "num_tokens": 46697178.0, "step": 20200 }, { "entropy": 1.095436719059944, "epoch": 2.962544894817855, "grad_norm": 7.15625, "learning_rate": 1.7744306822007124e-05, "loss": 1.0481, "mean_token_accuracy": 0.725366935133934, "num_tokens": 46722481.0, "step": 20210 }, { "entropy": 1.1857932776212692, "epoch": 2.964010848053947, "grad_norm": 12.5625, "learning_rate": 1.7741068640802218e-05, "loss": 1.1098, "mean_token_accuracy": 0.7125555574893951, "num_tokens": 46744413.0, "step": 20220 }, { "entropy": 1.2645143747329712, "epoch": 2.965476801290039, "grad_norm": 7.9375, "learning_rate": 1.7737828432884537e-05, "loss": 1.2497, "mean_token_accuracy": 0.6924518197774887, "num_tokens": 46769361.0, "step": 20230 }, { "entropy": 1.2719243362545967, "epoch": 2.966942754526131, "grad_norm": 9.4375, "learning_rate": 1.7734586199102402e-05, "loss": 1.2157, "mean_token_accuracy": 0.6931069478392601, "num_tokens": 46789370.0, "step": 20240 }, { "entropy": 1.1548308700323104, "epoch": 2.9684087077622223, "grad_norm": 9.0625, "learning_rate": 1.7731341940304688e-05, "loss": 1.0993, "mean_token_accuracy": 0.7141219258308411, "num_tokens": 46814246.0, "step": 20250 }, { "entropy": 1.3414143860340118, "epoch": 2.969874660998314, "grad_norm": 11.9375, "learning_rate": 1.772809565734077e-05, "loss": 1.336, "mean_token_accuracy": 0.6798712193965912, "num_tokens": 46835053.0, "step": 20260 }, { "entropy": 1.1751354962587357, "epoch": 2.971340614234406, "grad_norm": 10.3125, "learning_rate": 1.772484735106057e-05, "loss": 1.0912, "mean_token_accuracy": 0.708380651473999, "num_tokens": 46856731.0, "step": 20270 }, { "entropy": 1.105297103524208, "epoch": 2.9728065674704975, "grad_norm": 7.5625, "learning_rate": 1.7721597022314543e-05, "loss": 1.0613, "mean_token_accuracy": 0.7246344357728958, "num_tokens": 46881210.0, "step": 20280 }, { "entropy": 1.1468415230512619, "epoch": 2.9742725207065894, "grad_norm": 6.6875, "learning_rate": 1.7718344671953663e-05, "loss": 1.1186, "mean_token_accuracy": 0.7072641551494598, "num_tokens": 46906720.0, "step": 20290 }, { "entropy": 1.3707790285348893, "epoch": 2.9757384739426813, "grad_norm": 7.1875, "learning_rate": 1.7715090300829435e-05, "loss": 1.3645, "mean_token_accuracy": 0.6663914144039154, "num_tokens": 46925875.0, "step": 20300 }, { "entropy": 1.1961566388607026, "epoch": 2.9772044271787728, "grad_norm": 9.6875, "learning_rate": 1.77118339097939e-05, "loss": 1.1067, "mean_token_accuracy": 0.7106024920940399, "num_tokens": 46951233.0, "step": 20310 }, { "entropy": 1.0811491653323173, "epoch": 2.9786703804148646, "grad_norm": 10.375, "learning_rate": 1.7708575499699627e-05, "loss": 1.0084, "mean_token_accuracy": 0.7316865026950836, "num_tokens": 46974825.0, "step": 20320 }, { "entropy": 1.229711291193962, "epoch": 2.9801363336509565, "grad_norm": 6.09375, "learning_rate": 1.77053150713997e-05, "loss": 1.2674, "mean_token_accuracy": 0.6999128490686417, "num_tokens": 46998722.0, "step": 20330 }, { "entropy": 1.298739731311798, "epoch": 2.9816022868870484, "grad_norm": 10.625, "learning_rate": 1.7702052625747748e-05, "loss": 1.2646, "mean_token_accuracy": 0.6941920429468155, "num_tokens": 47022213.0, "step": 20340 }, { "entropy": 1.1165776535868646, "epoch": 2.9830682401231403, "grad_norm": 12.25, "learning_rate": 1.7698788163597923e-05, "loss": 1.0573, "mean_token_accuracy": 0.7271595239639282, "num_tokens": 47045691.0, "step": 20350 }, { "entropy": 1.1691247284412385, "epoch": 2.9845341933592318, "grad_norm": 9.8125, "learning_rate": 1.7695521685804902e-05, "loss": 1.114, "mean_token_accuracy": 0.7115205556154252, "num_tokens": 47069004.0, "step": 20360 }, { "entropy": 1.1435819000005722, "epoch": 2.9860001465953236, "grad_norm": 7.4375, "learning_rate": 1.7692253193223894e-05, "loss": 1.0944, "mean_token_accuracy": 0.7129167169332504, "num_tokens": 47093124.0, "step": 20370 }, { "entropy": 1.217539796233177, "epoch": 2.9874660998314155, "grad_norm": 8.5, "learning_rate": 1.768898268671063e-05, "loss": 1.18, "mean_token_accuracy": 0.6968638718128204, "num_tokens": 47116882.0, "step": 20380 }, { "entropy": 1.322134006023407, "epoch": 2.988932053067507, "grad_norm": 13.8125, "learning_rate": 1.768571016712137e-05, "loss": 1.3086, "mean_token_accuracy": 0.675589406490326, "num_tokens": 47136984.0, "step": 20390 }, { "entropy": 1.045129981637001, "epoch": 2.990398006303599, "grad_norm": 7.90625, "learning_rate": 1.7682435635312906e-05, "loss": 0.9492, "mean_token_accuracy": 0.7345781028270721, "num_tokens": 47159850.0, "step": 20400 }, { "entropy": 1.2697780311107636, "epoch": 2.9918639595396908, "grad_norm": 11.4375, "learning_rate": 1.767915909214255e-05, "loss": 1.229, "mean_token_accuracy": 0.6916067898273468, "num_tokens": 47180698.0, "step": 20410 }, { "entropy": 1.2387101247906684, "epoch": 2.993329912775782, "grad_norm": 11.25, "learning_rate": 1.7675880538468145e-05, "loss": 1.1854, "mean_token_accuracy": 0.6962555974721909, "num_tokens": 47203949.0, "step": 20420 }, { "entropy": 1.261678770184517, "epoch": 2.994795866011874, "grad_norm": 8.0, "learning_rate": 1.767259997514806e-05, "loss": 1.2158, "mean_token_accuracy": 0.6941365778446198, "num_tokens": 47224955.0, "step": 20430 }, { "entropy": 1.101868313550949, "epoch": 2.996261819247966, "grad_norm": 9.5625, "learning_rate": 1.766931740304119e-05, "loss": 0.976, "mean_token_accuracy": 0.7229547798633575, "num_tokens": 47250947.0, "step": 20440 }, { "entropy": 1.1707715004682542, "epoch": 2.997727772484058, "grad_norm": 14.0, "learning_rate": 1.7666032823006953e-05, "loss": 1.1009, "mean_token_accuracy": 0.7090524226427078, "num_tokens": 47274541.0, "step": 20450 }, { "entropy": 1.3196678727865219, "epoch": 2.9991937257201498, "grad_norm": 13.0, "learning_rate": 1.7662746235905298e-05, "loss": 1.2442, "mean_token_accuracy": 0.6706839263439178, "num_tokens": 47298476.0, "step": 20460 }, { "epoch": 3.0, "eval_entropy": 1.2244602692827071, "eval_loss": 1.2729096412658691, "eval_mean_token_accuracy": 0.690877513984577, "eval_num_tokens": 47309817.0, "eval_runtime": 56.9416, "eval_samples_per_second": 53.248, "eval_steps_per_second": 26.624, "step": 20466 } ], "logging_steps": 10, "max_steps": 68220, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.797139838986778e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }