{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.591446378680422, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.4190640166401862, "epoch": 0.017193208682570384, "grad_norm": 0.44122377038002014, "learning_rate": 0.00019964, "loss": 4.624512481689453, "mean_token_accuracy": 0.48470579609274866, "num_tokens": 59125.0, "step": 10 }, { "entropy": 1.3190652154386044, "epoch": 0.03438641736514077, "grad_norm": 0.5728761553764343, "learning_rate": 0.00019924, "loss": 4.138050842285156, "mean_token_accuracy": 0.5170843195170164, "num_tokens": 121732.0, "step": 20 }, { "entropy": 1.899017508327961, "epoch": 0.05157962604771115, "grad_norm": 0.5385442972183228, "learning_rate": 0.00019884000000000001, "loss": 3.410753631591797, "mean_token_accuracy": 0.5273028708994388, "num_tokens": 180496.0, "step": 30 }, { "entropy": 2.605507677793503, "epoch": 0.06877283473028153, "grad_norm": 0.39409318566322327, "learning_rate": 0.00019844, "loss": 3.210155487060547, "mean_token_accuracy": 0.5257176972925663, "num_tokens": 239996.0, "step": 40 }, { "entropy": 2.37664245814085, "epoch": 0.08596604341285193, "grad_norm": 0.32786861062049866, "learning_rate": 0.00019804, "loss": 2.821852111816406, "mean_token_accuracy": 0.5590792961418629, "num_tokens": 298043.0, "step": 50 }, { "entropy": 2.395954003930092, "epoch": 0.1031592520954223, "grad_norm": 0.3047294616699219, "learning_rate": 0.00019764, "loss": 2.7597124099731447, "mean_token_accuracy": 0.5563199404627085, "num_tokens": 355876.0, "step": 60 }, { "entropy": 2.411456014215946, "epoch": 0.12035246077799269, "grad_norm": 0.3513432741165161, "learning_rate": 0.00019724, "loss": 2.7809066772460938, "mean_token_accuracy": 0.5538371551781893, "num_tokens": 415356.0, "step": 70 }, { "entropy": 2.396346817910671, "epoch": 0.13754566946056307, "grad_norm": 0.32153311371803284, "learning_rate": 0.00019684, "loss": 2.673196029663086, "mean_token_accuracy": 0.5653545051813126, "num_tokens": 472895.0, "step": 80 }, { "entropy": 2.4005219876766204, "epoch": 0.15473887814313347, "grad_norm": 0.37381669878959656, "learning_rate": 0.00019644, "loss": 2.6783329010009767, "mean_token_accuracy": 0.565613779053092, "num_tokens": 531787.0, "step": 90 }, { "entropy": 2.291259491443634, "epoch": 0.17193208682570385, "grad_norm": 0.3424990475177765, "learning_rate": 0.00019604, "loss": 2.518219757080078, "mean_token_accuracy": 0.5815062165260315, "num_tokens": 591947.0, "step": 100 }, { "entropy": 2.3215643003582955, "epoch": 0.18912529550827423, "grad_norm": 0.37565672397613525, "learning_rate": 0.00019564, "loss": 2.5377925872802733, "mean_token_accuracy": 0.5835155539214611, "num_tokens": 649619.0, "step": 110 }, { "entropy": 2.4258039399981497, "epoch": 0.2063185041908446, "grad_norm": 0.4689179062843323, "learning_rate": 0.00019524, "loss": 2.6080394744873048, "mean_token_accuracy": 0.5646311499178409, "num_tokens": 707340.0, "step": 120 }, { "entropy": 2.317111870646477, "epoch": 0.223511712873415, "grad_norm": 0.3885783851146698, "learning_rate": 0.00019484, "loss": 2.477190399169922, "mean_token_accuracy": 0.5836748830974102, "num_tokens": 769505.0, "step": 130 }, { "entropy": 2.3055995970964434, "epoch": 0.24070492155598538, "grad_norm": 0.43443477153778076, "learning_rate": 0.00019444, "loss": 2.4609752655029298, "mean_token_accuracy": 0.5811029966920614, "num_tokens": 829367.0, "step": 140 }, { "entropy": 2.2776345878839495, "epoch": 0.2578981302385558, "grad_norm": 0.47762158513069153, "learning_rate": 0.00019404, "loss": 2.4081003189086916, "mean_token_accuracy": 0.5853929311037064, "num_tokens": 886215.0, "step": 150 }, { "entropy": 2.2963428094983103, "epoch": 0.27509133892112614, "grad_norm": 0.46731501817703247, "learning_rate": 0.00019364, "loss": 2.4675086975097655, "mean_token_accuracy": 0.5888176921755075, "num_tokens": 945101.0, "step": 160 }, { "entropy": 2.2366374909877775, "epoch": 0.29228454760369654, "grad_norm": 0.4367876350879669, "learning_rate": 0.00019323999999999999, "loss": 2.3875659942626952, "mean_token_accuracy": 0.5896096613258124, "num_tokens": 1004603.0, "step": 170 }, { "entropy": 2.254362703859806, "epoch": 0.30947775628626695, "grad_norm": 0.4333887994289398, "learning_rate": 0.00019284, "loss": 2.4054040908813477, "mean_token_accuracy": 0.5891438674181699, "num_tokens": 1067233.0, "step": 180 }, { "entropy": 2.2392648085951805, "epoch": 0.3266709649688373, "grad_norm": 0.45199301838874817, "learning_rate": 0.00019244000000000002, "loss": 2.3508541107177736, "mean_token_accuracy": 0.591214832291007, "num_tokens": 1124472.0, "step": 190 }, { "entropy": 2.149501931667328, "epoch": 0.3438641736514077, "grad_norm": 0.44520440697669983, "learning_rate": 0.00019204, "loss": 2.281546783447266, "mean_token_accuracy": 0.5986809592694045, "num_tokens": 1180382.0, "step": 200 }, { "entropy": 2.2454783216118814, "epoch": 0.3610573823339781, "grad_norm": 0.5927078723907471, "learning_rate": 0.00019164000000000003, "loss": 2.4131927490234375, "mean_token_accuracy": 0.5888259880244732, "num_tokens": 1240228.0, "step": 210 }, { "entropy": 2.2189586043357847, "epoch": 0.37825059101654845, "grad_norm": 0.4634048044681549, "learning_rate": 0.00019124000000000002, "loss": 2.336981201171875, "mean_token_accuracy": 0.5954784743487835, "num_tokens": 1300889.0, "step": 220 }, { "entropy": 2.317907977104187, "epoch": 0.39544379969911886, "grad_norm": 0.492512971162796, "learning_rate": 0.00019084, "loss": 2.4568838119506835, "mean_token_accuracy": 0.5841229122132063, "num_tokens": 1357396.0, "step": 230 }, { "entropy": 2.2429257184267044, "epoch": 0.4126370083816892, "grad_norm": 0.515352725982666, "learning_rate": 0.00019044000000000003, "loss": 2.3383663177490233, "mean_token_accuracy": 0.5922708168625832, "num_tokens": 1417505.0, "step": 240 }, { "entropy": 2.2210252806544304, "epoch": 0.4298302170642596, "grad_norm": 0.4831322133541107, "learning_rate": 0.00019004000000000002, "loss": 2.4192705154418945, "mean_token_accuracy": 0.5948628049343825, "num_tokens": 1478753.0, "step": 250 }, { "entropy": 2.274476508796215, "epoch": 0.44702342574683, "grad_norm": 0.49397456645965576, "learning_rate": 0.00018964, "loss": 2.3850372314453123, "mean_token_accuracy": 0.5882141895592212, "num_tokens": 1540514.0, "step": 260 }, { "entropy": 2.2252144277095796, "epoch": 0.46421663442940037, "grad_norm": 0.48239603638648987, "learning_rate": 0.00018924000000000002, "loss": 2.3765859603881836, "mean_token_accuracy": 0.597905408218503, "num_tokens": 1602363.0, "step": 270 }, { "entropy": 2.2030586138367654, "epoch": 0.48140984311197077, "grad_norm": 0.47931915521621704, "learning_rate": 0.00018884000000000001, "loss": 2.3378921508789063, "mean_token_accuracy": 0.5946248725056649, "num_tokens": 1663013.0, "step": 280 }, { "entropy": 2.193090632557869, "epoch": 0.4986030517945412, "grad_norm": 0.4267734885215759, "learning_rate": 0.00018844, "loss": 2.301167106628418, "mean_token_accuracy": 0.600122318789363, "num_tokens": 1724937.0, "step": 290 }, { "entropy": 2.2401370778679848, "epoch": 0.5157962604771116, "grad_norm": 0.49861446022987366, "learning_rate": 0.00018804000000000002, "loss": 2.3409378051757814, "mean_token_accuracy": 0.5942602109163999, "num_tokens": 1782528.0, "step": 300 }, { "entropy": 2.2543238058686255, "epoch": 0.5329894691596819, "grad_norm": 0.5549066066741943, "learning_rate": 0.00018764, "loss": 2.397447967529297, "mean_token_accuracy": 0.5927667014300824, "num_tokens": 1841865.0, "step": 310 }, { "entropy": 2.1446528255939485, "epoch": 0.5501826778422523, "grad_norm": 0.6057388782501221, "learning_rate": 0.00018724, "loss": 2.2854946136474608, "mean_token_accuracy": 0.5992342013865709, "num_tokens": 1901375.0, "step": 320 }, { "entropy": 2.2384325101971627, "epoch": 0.5673758865248227, "grad_norm": 0.6120573282241821, "learning_rate": 0.00018684000000000002, "loss": 2.3560409545898438, "mean_token_accuracy": 0.5913927119225264, "num_tokens": 1959650.0, "step": 330 }, { "entropy": 2.103813698887825, "epoch": 0.5845690952073931, "grad_norm": 0.5133985280990601, "learning_rate": 0.00018644, "loss": 2.2106700897216798, "mean_token_accuracy": 0.6131400100886821, "num_tokens": 2020031.0, "step": 340 }, { "entropy": 2.11156629472971, "epoch": 0.6017623038899634, "grad_norm": 0.4958188235759735, "learning_rate": 0.00018604, "loss": 2.2597396850585936, "mean_token_accuracy": 0.6059262081980705, "num_tokens": 2080034.0, "step": 350 }, { "entropy": 2.2104950502514837, "epoch": 0.6189555125725339, "grad_norm": 0.5096895098686218, "learning_rate": 0.00018564000000000002, "loss": 2.271474075317383, "mean_token_accuracy": 0.597117318212986, "num_tokens": 2137629.0, "step": 360 }, { "entropy": 2.134429484605789, "epoch": 0.6361487212551042, "grad_norm": 0.5682357549667358, "learning_rate": 0.00018524, "loss": 2.237981605529785, "mean_token_accuracy": 0.6059913612902165, "num_tokens": 2196125.0, "step": 370 }, { "entropy": 2.151672175526619, "epoch": 0.6533419299376746, "grad_norm": 0.5488378405570984, "learning_rate": 0.00018484000000000003, "loss": 2.264353942871094, "mean_token_accuracy": 0.5990234814584255, "num_tokens": 2256258.0, "step": 380 }, { "entropy": 2.1240685641765595, "epoch": 0.670535138620245, "grad_norm": 0.5736668705940247, "learning_rate": 0.00018444000000000002, "loss": 2.2498214721679686, "mean_token_accuracy": 0.6068044692277909, "num_tokens": 2317743.0, "step": 390 }, { "entropy": 2.110594576597214, "epoch": 0.6877283473028154, "grad_norm": 0.5833790302276611, "learning_rate": 0.00018404, "loss": 2.222176361083984, "mean_token_accuracy": 0.6128960818052291, "num_tokens": 2381669.0, "step": 400 }, { "entropy": 2.1233398094773293, "epoch": 0.7049215559853858, "grad_norm": 0.5612857937812805, "learning_rate": 0.00018364000000000002, "loss": 2.2054920196533203, "mean_token_accuracy": 0.6041553311049939, "num_tokens": 2439094.0, "step": 410 }, { "entropy": 2.1500213012099265, "epoch": 0.7221147646679562, "grad_norm": 0.5670902132987976, "learning_rate": 0.00018324000000000001, "loss": 2.2393463134765623, "mean_token_accuracy": 0.6055185578763485, "num_tokens": 2498246.0, "step": 420 }, { "entropy": 2.03703038841486, "epoch": 0.7393079733505266, "grad_norm": 0.6171953082084656, "learning_rate": 0.00018284, "loss": 2.129292678833008, "mean_token_accuracy": 0.6159544993191958, "num_tokens": 2557403.0, "step": 430 }, { "entropy": 2.0861524820327757, "epoch": 0.7565011820330969, "grad_norm": 0.5753834247589111, "learning_rate": 0.00018244000000000002, "loss": 2.189851760864258, "mean_token_accuracy": 0.6077463660389185, "num_tokens": 2614912.0, "step": 440 }, { "entropy": 2.127491444349289, "epoch": 0.7736943907156673, "grad_norm": 0.5808554291725159, "learning_rate": 0.00018204, "loss": 2.2198902130126954, "mean_token_accuracy": 0.6067756544798613, "num_tokens": 2673278.0, "step": 450 }, { "entropy": 2.10256717056036, "epoch": 0.7908875993982377, "grad_norm": 0.6682887077331543, "learning_rate": 0.00018164, "loss": 2.1562450408935545, "mean_token_accuracy": 0.6123722370713949, "num_tokens": 2733477.0, "step": 460 }, { "entropy": 2.0725875943899155, "epoch": 0.8080808080808081, "grad_norm": 0.5611984729766846, "learning_rate": 0.00018124000000000002, "loss": 2.1555084228515624, "mean_token_accuracy": 0.6104017984122038, "num_tokens": 2793130.0, "step": 470 }, { "entropy": 2.0911933913826943, "epoch": 0.8252740167633784, "grad_norm": 0.4832659661769867, "learning_rate": 0.00018084, "loss": 2.2131584167480467, "mean_token_accuracy": 0.6108913067728281, "num_tokens": 2856643.0, "step": 480 }, { "entropy": 2.089042477309704, "epoch": 0.8424672254459489, "grad_norm": 0.6528189182281494, "learning_rate": 0.00018044, "loss": 2.222637939453125, "mean_token_accuracy": 0.6140136975795031, "num_tokens": 2918669.0, "step": 490 }, { "entropy": 2.177516208589077, "epoch": 0.8596604341285192, "grad_norm": 0.511565089225769, "learning_rate": 0.00018004000000000002, "loss": 2.2838220596313477, "mean_token_accuracy": 0.6013776436448097, "num_tokens": 2977952.0, "step": 500 }, { "entropy": 2.0694020837545395, "epoch": 0.8768536428110896, "grad_norm": 0.6112110614776611, "learning_rate": 0.00017964, "loss": 2.1932716369628906, "mean_token_accuracy": 0.6094281867146492, "num_tokens": 3039858.0, "step": 510 }, { "entropy": 2.074494983255863, "epoch": 0.89404685149366, "grad_norm": 0.6264903545379639, "learning_rate": 0.00017924, "loss": 2.1746356964111326, "mean_token_accuracy": 0.6137691352516412, "num_tokens": 3095095.0, "step": 520 }, { "entropy": 2.1704643085598945, "epoch": 0.9112400601762304, "grad_norm": 0.5588786602020264, "learning_rate": 0.00017884000000000002, "loss": 2.256443977355957, "mean_token_accuracy": 0.6048448126763105, "num_tokens": 3154541.0, "step": 530 }, { "entropy": 2.0287919655442237, "epoch": 0.9284332688588007, "grad_norm": 0.5964768528938293, "learning_rate": 0.00017844, "loss": 2.1472768783569336, "mean_token_accuracy": 0.6183896280825139, "num_tokens": 3213587.0, "step": 540 }, { "entropy": 2.0924746826291085, "epoch": 0.9456264775413712, "grad_norm": 0.6021146774291992, "learning_rate": 0.00017804, "loss": 2.194413757324219, "mean_token_accuracy": 0.6094567842781544, "num_tokens": 3270420.0, "step": 550 }, { "entropy": 2.2008628591895105, "epoch": 0.9628196862239415, "grad_norm": 0.5850458741188049, "learning_rate": 0.00017764000000000001, "loss": 2.325449752807617, "mean_token_accuracy": 0.5982100035995245, "num_tokens": 3329358.0, "step": 560 }, { "entropy": 2.065661500394344, "epoch": 0.9800128949065119, "grad_norm": 0.5808996558189392, "learning_rate": 0.00017724, "loss": 2.197132873535156, "mean_token_accuracy": 0.614894449710846, "num_tokens": 3389408.0, "step": 570 }, { "entropy": 2.0561472952365873, "epoch": 0.9972061035890823, "grad_norm": 0.5550947189331055, "learning_rate": 0.00017684, "loss": 2.1736166000366213, "mean_token_accuracy": 0.6167749039828777, "num_tokens": 3448108.0, "step": 580 }, { "entropy": 2.017703249857023, "epoch": 1.0137545669460564, "grad_norm": 0.5885869860649109, "learning_rate": 0.00017644, "loss": 2.093535614013672, "mean_token_accuracy": 0.623307110427262, "num_tokens": 3510556.0, "step": 590 }, { "entropy": 2.0724975898861886, "epoch": 1.0309477756286267, "grad_norm": 0.6038488745689392, "learning_rate": 0.00017604, "loss": 2.1512643814086916, "mean_token_accuracy": 0.6218964882194996, "num_tokens": 3572796.0, "step": 600 }, { "entropy": 2.0171437337994576, "epoch": 1.048140984311197, "grad_norm": 0.5356580018997192, "learning_rate": 0.00017564, "loss": 2.1061470031738283, "mean_token_accuracy": 0.6229204799979925, "num_tokens": 3636040.0, "step": 610 }, { "entropy": 2.069349633157253, "epoch": 1.0653341929937674, "grad_norm": 0.6440969109535217, "learning_rate": 0.00017524, "loss": 2.1538244247436524, "mean_token_accuracy": 0.6151916943490505, "num_tokens": 3692494.0, "step": 620 }, { "entropy": 1.9605562821030618, "epoch": 1.0825274016763378, "grad_norm": 0.6160545349121094, "learning_rate": 0.00017484, "loss": 2.036081314086914, "mean_token_accuracy": 0.6326734255999327, "num_tokens": 3750269.0, "step": 630 }, { "entropy": 1.9889371052384377, "epoch": 1.0997206103589083, "grad_norm": 0.6164005398750305, "learning_rate": 0.00017444, "loss": 2.051021194458008, "mean_token_accuracy": 0.6301001563668251, "num_tokens": 3808993.0, "step": 640 }, { "entropy": 2.1208505019545556, "epoch": 1.1169138190414787, "grad_norm": 0.5549023151397705, "learning_rate": 0.00017404, "loss": 2.2218536376953124, "mean_token_accuracy": 0.6127480801194907, "num_tokens": 3875366.0, "step": 650 }, { "entropy": 2.0680324912071226, "epoch": 1.134107027724049, "grad_norm": 0.6039656400680542, "learning_rate": 0.00017364, "loss": 2.1767093658447267, "mean_token_accuracy": 0.61400815397501, "num_tokens": 3936258.0, "step": 660 }, { "entropy": 2.072723534703255, "epoch": 1.1513002364066194, "grad_norm": 0.5685736536979675, "learning_rate": 0.00017324000000000002, "loss": 2.16034049987793, "mean_token_accuracy": 0.6158209484070539, "num_tokens": 3996786.0, "step": 670 }, { "entropy": 2.0064551383256912, "epoch": 1.1684934450891897, "grad_norm": 0.6133168935775757, "learning_rate": 0.00017284, "loss": 2.0968536376953124, "mean_token_accuracy": 0.6204963065683842, "num_tokens": 4057237.0, "step": 680 }, { "entropy": 1.999112318456173, "epoch": 1.18568665377176, "grad_norm": 0.6392707228660583, "learning_rate": 0.00017244, "loss": 2.1093589782714846, "mean_token_accuracy": 0.6260342009365558, "num_tokens": 4115641.0, "step": 690 }, { "entropy": 1.9792790532112121, "epoch": 1.2028798624543304, "grad_norm": 0.5575782656669617, "learning_rate": 0.00017204, "loss": 2.097017288208008, "mean_token_accuracy": 0.6311597619205713, "num_tokens": 4179186.0, "step": 700 }, { "entropy": 2.0830755099654197, "epoch": 1.220073071136901, "grad_norm": 0.5660914182662964, "learning_rate": 0.00017164, "loss": 2.1811811447143556, "mean_token_accuracy": 0.6163320489227772, "num_tokens": 4239862.0, "step": 710 }, { "entropy": 2.0135093346238135, "epoch": 1.2372662798194713, "grad_norm": 0.535127580165863, "learning_rate": 0.00017124, "loss": 2.1185089111328126, "mean_token_accuracy": 0.6227246847003698, "num_tokens": 4299020.0, "step": 720 }, { "entropy": 2.0245131298899652, "epoch": 1.2544594885020417, "grad_norm": 0.5870150327682495, "learning_rate": 0.00017084, "loss": 2.1228567123413087, "mean_token_accuracy": 0.620493221282959, "num_tokens": 4357565.0, "step": 730 }, { "entropy": 2.0833021104335785, "epoch": 1.271652697184612, "grad_norm": 0.6691998243331909, "learning_rate": 0.00017044, "loss": 2.1688915252685548, "mean_token_accuracy": 0.6160433337092399, "num_tokens": 4416113.0, "step": 740 }, { "entropy": 1.9562881767749787, "epoch": 1.2888459058671824, "grad_norm": 0.5513840317726135, "learning_rate": 0.00017004, "loss": 2.038709831237793, "mean_token_accuracy": 0.6274638958275318, "num_tokens": 4479913.0, "step": 750 }, { "entropy": 2.0899336978793146, "epoch": 1.306039114549753, "grad_norm": 0.6334884166717529, "learning_rate": 0.00016964, "loss": 2.1464771270751952, "mean_token_accuracy": 0.6156820185482502, "num_tokens": 4538630.0, "step": 760 }, { "entropy": 2.006402041018009, "epoch": 1.3232323232323233, "grad_norm": 0.6608359813690186, "learning_rate": 0.00016924, "loss": 2.0910036087036135, "mean_token_accuracy": 0.6266825262457132, "num_tokens": 4596432.0, "step": 770 }, { "entropy": 1.981143780052662, "epoch": 1.3404255319148937, "grad_norm": 0.6512318849563599, "learning_rate": 0.00016884, "loss": 2.0733022689819336, "mean_token_accuracy": 0.6271612212061882, "num_tokens": 4653388.0, "step": 780 }, { "entropy": 2.046033799648285, "epoch": 1.357618740597464, "grad_norm": 0.657543957233429, "learning_rate": 0.00016844, "loss": 2.122422790527344, "mean_token_accuracy": 0.6139081876724959, "num_tokens": 4707749.0, "step": 790 }, { "entropy": 2.0472232535481454, "epoch": 1.3748119492800344, "grad_norm": 0.5705983638763428, "learning_rate": 0.00016804, "loss": 2.1095462799072267, "mean_token_accuracy": 0.6147159416228533, "num_tokens": 4768198.0, "step": 800 }, { "entropy": 2.0032111018896104, "epoch": 1.3920051579626047, "grad_norm": 0.6248787045478821, "learning_rate": 0.00016764, "loss": 2.06416015625, "mean_token_accuracy": 0.626778207719326, "num_tokens": 4830049.0, "step": 810 }, { "entropy": 2.008875849843025, "epoch": 1.409198366645175, "grad_norm": 0.6245584487915039, "learning_rate": 0.00016724000000000003, "loss": 2.096910285949707, "mean_token_accuracy": 0.626385198161006, "num_tokens": 4887278.0, "step": 820 }, { "entropy": 2.080148513615131, "epoch": 1.4263915753277456, "grad_norm": 0.6255568861961365, "learning_rate": 0.00016684000000000002, "loss": 2.1830982208251952, "mean_token_accuracy": 0.6152387134730816, "num_tokens": 4946401.0, "step": 830 }, { "entropy": 1.999508222937584, "epoch": 1.443584784010316, "grad_norm": 0.6275898218154907, "learning_rate": 0.00016644, "loss": 2.1005062103271483, "mean_token_accuracy": 0.6225087266415358, "num_tokens": 5007769.0, "step": 840 }, { "entropy": 1.937928880751133, "epoch": 1.4607779926928863, "grad_norm": 0.5719529986381531, "learning_rate": 0.00016604000000000003, "loss": 2.026857376098633, "mean_token_accuracy": 0.6374159537255764, "num_tokens": 5065155.0, "step": 850 }, { "entropy": 2.0230891808867453, "epoch": 1.4779712013754567, "grad_norm": 0.6000656485557556, "learning_rate": 0.00016564000000000002, "loss": 2.1284557342529298, "mean_token_accuracy": 0.6221452355384827, "num_tokens": 5127964.0, "step": 860 }, { "entropy": 2.0356423661112784, "epoch": 1.495164410058027, "grad_norm": 0.6452302932739258, "learning_rate": 0.00016524, "loss": 2.136619758605957, "mean_token_accuracy": 0.6180427376180887, "num_tokens": 5186867.0, "step": 870 }, { "entropy": 1.980474129319191, "epoch": 1.5123576187405976, "grad_norm": 0.6840422749519348, "learning_rate": 0.00016484000000000003, "loss": 2.1068920135498046, "mean_token_accuracy": 0.6276284489780665, "num_tokens": 5243496.0, "step": 880 }, { "entropy": 1.9933902084827424, "epoch": 1.5295508274231677, "grad_norm": 0.6497262716293335, "learning_rate": 0.00016444000000000002, "loss": 2.0951356887817383, "mean_token_accuracy": 0.625996507704258, "num_tokens": 5305224.0, "step": 890 }, { "entropy": 1.9787120044231414, "epoch": 1.5467440361057383, "grad_norm": 0.5943708419799805, "learning_rate": 0.00016404, "loss": 2.050846481323242, "mean_token_accuracy": 0.6277968011796474, "num_tokens": 5362047.0, "step": 900 }, { "entropy": 1.9207568421959877, "epoch": 1.5639372447883086, "grad_norm": 0.7813357710838318, "learning_rate": 0.00016364000000000003, "loss": 2.01483097076416, "mean_token_accuracy": 0.6335461936891079, "num_tokens": 5418646.0, "step": 910 }, { "entropy": 1.991297036409378, "epoch": 1.581130453470879, "grad_norm": 0.6333187818527222, "learning_rate": 0.00016324000000000002, "loss": 2.042116165161133, "mean_token_accuracy": 0.6277558326721191, "num_tokens": 5474990.0, "step": 920 }, { "entropy": 2.049289306998253, "epoch": 1.5983236621534493, "grad_norm": 0.6549156308174133, "learning_rate": 0.00016284, "loss": 2.160791778564453, "mean_token_accuracy": 0.6178662430495023, "num_tokens": 5536798.0, "step": 930 }, { "entropy": 2.0044725999236106, "epoch": 1.6155168708360197, "grad_norm": 0.7054678201675415, "learning_rate": 0.00016244000000000002, "loss": 2.1012857437133787, "mean_token_accuracy": 0.6249351866543293, "num_tokens": 5592825.0, "step": 940 }, { "entropy": 1.8883072763681412, "epoch": 1.6327100795185903, "grad_norm": 0.6150692701339722, "learning_rate": 0.00016204000000000001, "loss": 1.933417510986328, "mean_token_accuracy": 0.6382322389632463, "num_tokens": 5647293.0, "step": 950 }, { "entropy": 1.9773345232009887, "epoch": 1.6499032882011604, "grad_norm": 0.6604284048080444, "learning_rate": 0.00016164, "loss": 2.0537574768066404, "mean_token_accuracy": 0.6284147780388594, "num_tokens": 5707851.0, "step": 960 }, { "entropy": 1.9602369159460067, "epoch": 1.667096496883731, "grad_norm": 0.5877639055252075, "learning_rate": 0.00016124000000000002, "loss": 2.0338172912597656, "mean_token_accuracy": 0.6378967847675086, "num_tokens": 5766339.0, "step": 970 }, { "entropy": 2.0475835338234902, "epoch": 1.6842897055663013, "grad_norm": 0.6029936075210571, "learning_rate": 0.00016084, "loss": 2.161564254760742, "mean_token_accuracy": 0.6204672615975142, "num_tokens": 5827901.0, "step": 980 }, { "entropy": 1.9963667973876, "epoch": 1.7014829142488717, "grad_norm": 0.6379806399345398, "learning_rate": 0.00016044, "loss": 2.087441825866699, "mean_token_accuracy": 0.6302045777440071, "num_tokens": 5888724.0, "step": 990 }, { "entropy": 2.003598253428936, "epoch": 1.7186761229314422, "grad_norm": 0.6113580465316772, "learning_rate": 0.00016004000000000002, "loss": 2.136977195739746, "mean_token_accuracy": 0.6279609728604555, "num_tokens": 5948999.0, "step": 1000 }, { "entropy": 2.009714852273464, "epoch": 1.7358693316140124, "grad_norm": 0.6397438645362854, "learning_rate": 0.00015964, "loss": 2.1206714630126955, "mean_token_accuracy": 0.6264089956879616, "num_tokens": 6010131.0, "step": 1010 }, { "entropy": 1.9690447196364402, "epoch": 1.753062540296583, "grad_norm": 0.6629226803779602, "learning_rate": 0.00015924, "loss": 2.0379661560058593, "mean_token_accuracy": 0.6306859996169806, "num_tokens": 6070105.0, "step": 1020 }, { "entropy": 1.9761606559157372, "epoch": 1.7702557489791533, "grad_norm": 0.6340969800949097, "learning_rate": 0.00015884000000000002, "loss": 2.0979576110839844, "mean_token_accuracy": 0.6255223523825407, "num_tokens": 6129369.0, "step": 1030 }, { "entropy": 2.0855698764324186, "epoch": 1.7874489576617236, "grad_norm": 0.6090606451034546, "learning_rate": 0.00015844, "loss": 2.1925222396850588, "mean_token_accuracy": 0.6140031859278678, "num_tokens": 6191194.0, "step": 1040 }, { "entropy": 2.0149157389998438, "epoch": 1.804642166344294, "grad_norm": 0.6012734174728394, "learning_rate": 0.00015804, "loss": 2.104481506347656, "mean_token_accuracy": 0.6318388734012842, "num_tokens": 6254136.0, "step": 1050 }, { "entropy": 1.9774292945861816, "epoch": 1.8218353750268643, "grad_norm": 0.5775039792060852, "learning_rate": 0.00015764000000000002, "loss": 2.0788572311401365, "mean_token_accuracy": 0.6315916679799557, "num_tokens": 6311831.0, "step": 1060 }, { "entropy": 2.0096932500600815, "epoch": 1.839028583709435, "grad_norm": 0.6441799998283386, "learning_rate": 0.00015724, "loss": 2.067369079589844, "mean_token_accuracy": 0.6272157531231641, "num_tokens": 6368999.0, "step": 1070 }, { "entropy": 1.9509294107556343, "epoch": 1.856221792392005, "grad_norm": 0.7183738946914673, "learning_rate": 0.00015684, "loss": 2.046398162841797, "mean_token_accuracy": 0.6311775099486112, "num_tokens": 6426522.0, "step": 1080 }, { "entropy": 2.0291523337364197, "epoch": 1.8734150010745756, "grad_norm": 0.6105868816375732, "learning_rate": 0.00015644, "loss": 2.103832244873047, "mean_token_accuracy": 0.6217470221221447, "num_tokens": 6488781.0, "step": 1090 }, { "entropy": 1.9441853806376457, "epoch": 1.890608209757146, "grad_norm": 0.6209670901298523, "learning_rate": 0.00015604, "loss": 1.9830631256103515, "mean_token_accuracy": 0.6288409855216741, "num_tokens": 6545378.0, "step": 1100 }, { "entropy": 1.9924452617764472, "epoch": 1.9078014184397163, "grad_norm": 0.6691610813140869, "learning_rate": 0.00015564000000000002, "loss": 2.0985618591308595, "mean_token_accuracy": 0.6272476647049189, "num_tokens": 6605917.0, "step": 1110 }, { "entropy": 1.9443897798657417, "epoch": 1.9249946271222869, "grad_norm": 0.5956576466560364, "learning_rate": 0.00015524, "loss": 2.0018499374389647, "mean_token_accuracy": 0.6381504714488984, "num_tokens": 6661811.0, "step": 1120 }, { "entropy": 1.9805707216262818, "epoch": 1.942187835804857, "grad_norm": 0.5946056842803955, "learning_rate": 0.00015484, "loss": 2.0427513122558594, "mean_token_accuracy": 0.6258678704500198, "num_tokens": 6723336.0, "step": 1130 }, { "entropy": 1.9137805163860322, "epoch": 1.9593810444874276, "grad_norm": 0.6030678749084473, "learning_rate": 0.00015444000000000002, "loss": 1.9697463989257813, "mean_token_accuracy": 0.641855177283287, "num_tokens": 6781069.0, "step": 1140 }, { "entropy": 1.9368563026189805, "epoch": 1.9765742531699977, "grad_norm": 0.6237244009971619, "learning_rate": 0.00015404, "loss": 2.034241485595703, "mean_token_accuracy": 0.6384489141404629, "num_tokens": 6840129.0, "step": 1150 }, { "entropy": 1.8723769560456276, "epoch": 1.9937674618525683, "grad_norm": 0.6409602761268616, "learning_rate": 0.00015364, "loss": 1.9205358505249024, "mean_token_accuracy": 0.6437219835817813, "num_tokens": 6894472.0, "step": 1160 }, { "entropy": 1.8628178556244095, "epoch": 2.010315925209542, "grad_norm": 0.6614267230033875, "learning_rate": 0.00015324000000000002, "loss": 1.8996776580810546, "mean_token_accuracy": 0.6492502008165631, "num_tokens": 6952821.0, "step": 1170 }, { "entropy": 1.8918349608778953, "epoch": 2.0275091338921127, "grad_norm": 0.6476360559463501, "learning_rate": 0.00015284, "loss": 1.9498180389404296, "mean_token_accuracy": 0.6428254719823598, "num_tokens": 7014989.0, "step": 1180 }, { "entropy": 1.945443508028984, "epoch": 2.044702342574683, "grad_norm": 0.7725419402122498, "learning_rate": 0.00015244, "loss": 2.020453453063965, "mean_token_accuracy": 0.6328805617988109, "num_tokens": 7073087.0, "step": 1190 }, { "entropy": 2.039360311627388, "epoch": 2.0618955512572534, "grad_norm": 0.6981257200241089, "learning_rate": 0.00015204000000000001, "loss": 2.137688636779785, "mean_token_accuracy": 0.6245126206427812, "num_tokens": 7129761.0, "step": 1200 }, { "entropy": 1.9039745211601258, "epoch": 2.0790887599398236, "grad_norm": 0.6344786882400513, "learning_rate": 0.00015164, "loss": 1.945602798461914, "mean_token_accuracy": 0.6396142981946469, "num_tokens": 7187187.0, "step": 1210 }, { "entropy": 1.9067329421639443, "epoch": 2.096281968622394, "grad_norm": 0.6346563696861267, "learning_rate": 0.00015124, "loss": 1.990530014038086, "mean_token_accuracy": 0.6383092008531094, "num_tokens": 7247237.0, "step": 1220 }, { "entropy": 1.8649237960577012, "epoch": 2.1134751773049647, "grad_norm": 0.6211341023445129, "learning_rate": 0.00015084, "loss": 1.9000562667846679, "mean_token_accuracy": 0.6510101232677699, "num_tokens": 7301760.0, "step": 1230 }, { "entropy": 1.9364619553089142, "epoch": 2.130668385987535, "grad_norm": 0.6074926257133484, "learning_rate": 0.00015044, "loss": 2.0338695526123045, "mean_token_accuracy": 0.6359073251485825, "num_tokens": 7361681.0, "step": 1240 }, { "entropy": 1.86348085552454, "epoch": 2.1478615946701054, "grad_norm": 0.7472113370895386, "learning_rate": 0.00015004, "loss": 1.9359153747558593, "mean_token_accuracy": 0.6460991870611906, "num_tokens": 7424162.0, "step": 1250 }, { "entropy": 1.9580871596932412, "epoch": 2.1650548033526755, "grad_norm": 0.6993629336357117, "learning_rate": 0.00014964, "loss": 2.0203479766845702, "mean_token_accuracy": 0.630286343768239, "num_tokens": 7486191.0, "step": 1260 }, { "entropy": 1.9111438870429993, "epoch": 2.182248012035246, "grad_norm": 0.6560285687446594, "learning_rate": 0.00014924, "loss": 1.9584983825683593, "mean_token_accuracy": 0.6434897668659687, "num_tokens": 7544447.0, "step": 1270 }, { "entropy": 1.931545352935791, "epoch": 2.1994412207178167, "grad_norm": 0.6713767647743225, "learning_rate": 0.00014884, "loss": 2.106512451171875, "mean_token_accuracy": 0.6318223185837268, "num_tokens": 7604237.0, "step": 1280 }, { "entropy": 1.9441128447651863, "epoch": 2.216634429400387, "grad_norm": 0.6440369486808777, "learning_rate": 0.00014844, "loss": 2.0373985290527346, "mean_token_accuracy": 0.6320536743849516, "num_tokens": 7661475.0, "step": 1290 }, { "entropy": 1.949166515469551, "epoch": 2.2338276380829574, "grad_norm": 0.6829583644866943, "learning_rate": 0.00014804, "loss": 2.005051040649414, "mean_token_accuracy": 0.6337476786226034, "num_tokens": 7720352.0, "step": 1300 }, { "entropy": 1.8687394335865974, "epoch": 2.2510208467655275, "grad_norm": 0.6745384335517883, "learning_rate": 0.00014764, "loss": 1.9336997985839843, "mean_token_accuracy": 0.6435953237116336, "num_tokens": 7780071.0, "step": 1310 }, { "entropy": 1.8999060586094856, "epoch": 2.268214055448098, "grad_norm": 0.5983019471168518, "learning_rate": 0.00014724, "loss": 1.9348258972167969, "mean_token_accuracy": 0.6431682731956243, "num_tokens": 7839864.0, "step": 1320 }, { "entropy": 1.868900626897812, "epoch": 2.285407264130668, "grad_norm": 0.6673943400382996, "learning_rate": 0.00014684, "loss": 1.9038555145263671, "mean_token_accuracy": 0.6503972858190536, "num_tokens": 7901781.0, "step": 1330 }, { "entropy": 2.0273515924811365, "epoch": 2.3026004728132388, "grad_norm": 0.7098233103752136, "learning_rate": 0.00014644, "loss": 2.1501066207885744, "mean_token_accuracy": 0.6240016058087349, "num_tokens": 7962103.0, "step": 1340 }, { "entropy": 1.9079027369618415, "epoch": 2.3197936814958093, "grad_norm": 0.6738902926445007, "learning_rate": 0.00014604, "loss": 1.9681257247924804, "mean_token_accuracy": 0.6416124865412712, "num_tokens": 8021532.0, "step": 1350 }, { "entropy": 1.9230551555752755, "epoch": 2.3369868901783795, "grad_norm": 0.7348875999450684, "learning_rate": 0.00014564, "loss": 1.995201301574707, "mean_token_accuracy": 0.6393462974578142, "num_tokens": 8083084.0, "step": 1360 }, { "entropy": 1.9632413163781166, "epoch": 2.35418009886095, "grad_norm": 0.7093244194984436, "learning_rate": 0.00014523999999999998, "loss": 2.0105892181396485, "mean_token_accuracy": 0.6326792053878307, "num_tokens": 8144387.0, "step": 1370 }, { "entropy": 1.9028392255306243, "epoch": 2.37137330754352, "grad_norm": 0.7751646041870117, "learning_rate": 0.00014484, "loss": 1.9892047882080077, "mean_token_accuracy": 0.6404657427221536, "num_tokens": 8203720.0, "step": 1380 }, { "entropy": 1.860563676059246, "epoch": 2.3885665162260907, "grad_norm": 0.7243943214416504, "learning_rate": 0.00014444, "loss": 1.9201997756958007, "mean_token_accuracy": 0.6510312400758267, "num_tokens": 8263059.0, "step": 1390 }, { "entropy": 1.8883199632167815, "epoch": 2.405759724908661, "grad_norm": 0.6106081604957581, "learning_rate": 0.00014404, "loss": 1.9687911987304687, "mean_token_accuracy": 0.6493277192115784, "num_tokens": 8323826.0, "step": 1400 }, { "entropy": 1.9286921486258506, "epoch": 2.4229529335912314, "grad_norm": 0.697307825088501, "learning_rate": 0.00014364, "loss": 2.030810546875, "mean_token_accuracy": 0.6362422123551369, "num_tokens": 8383325.0, "step": 1410 }, { "entropy": 1.9122009217739104, "epoch": 2.440146142273802, "grad_norm": 0.7119978070259094, "learning_rate": 0.00014324, "loss": 1.9788457870483398, "mean_token_accuracy": 0.6400811962783337, "num_tokens": 8442393.0, "step": 1420 }, { "entropy": 1.9088031873106956, "epoch": 2.457339350956372, "grad_norm": 0.6792617440223694, "learning_rate": 0.00014284, "loss": 1.995138931274414, "mean_token_accuracy": 0.63965779915452, "num_tokens": 8501631.0, "step": 1430 }, { "entropy": 1.8871790513396263, "epoch": 2.4745325596389427, "grad_norm": 0.7191487550735474, "learning_rate": 0.00014244000000000003, "loss": 1.9728309631347656, "mean_token_accuracy": 0.642948642373085, "num_tokens": 8561457.0, "step": 1440 }, { "entropy": 1.8643269062042236, "epoch": 2.491725768321513, "grad_norm": 0.636345386505127, "learning_rate": 0.00014204000000000002, "loss": 1.9090641021728516, "mean_token_accuracy": 0.6537593178451061, "num_tokens": 8618281.0, "step": 1450 }, { "entropy": 1.90698651522398, "epoch": 2.5089189770040834, "grad_norm": 0.7444325685501099, "learning_rate": 0.00014164, "loss": 1.9631458282470704, "mean_token_accuracy": 0.6428062118589878, "num_tokens": 8675547.0, "step": 1460 }, { "entropy": 1.942822016775608, "epoch": 2.526112185686654, "grad_norm": 0.6707946062088013, "learning_rate": 0.00014124000000000002, "loss": 2.0348419189453124, "mean_token_accuracy": 0.6381070952862501, "num_tokens": 8735546.0, "step": 1470 }, { "entropy": 1.979990416765213, "epoch": 2.543305394369224, "grad_norm": 0.6958721280097961, "learning_rate": 0.00014084000000000001, "loss": 2.072053146362305, "mean_token_accuracy": 0.6283687971532345, "num_tokens": 8796404.0, "step": 1480 }, { "entropy": 1.9207274168729782, "epoch": 2.5604986030517947, "grad_norm": 0.6575210690498352, "learning_rate": 0.00014044, "loss": 1.9427066802978517, "mean_token_accuracy": 0.642588010430336, "num_tokens": 8853405.0, "step": 1490 }, { "entropy": 1.9980120360851288, "epoch": 2.577691811734365, "grad_norm": 0.7412211298942566, "learning_rate": 0.00014004000000000002, "loss": 2.1047718048095705, "mean_token_accuracy": 0.6264939974993468, "num_tokens": 8909416.0, "step": 1500 }, { "entropy": 1.8110749498009682, "epoch": 2.5948850204169354, "grad_norm": 0.7108538746833801, "learning_rate": 0.00013964, "loss": 1.8952640533447265, "mean_token_accuracy": 0.6537120632827282, "num_tokens": 8968510.0, "step": 1510 }, { "entropy": 1.977073846757412, "epoch": 2.612078229099506, "grad_norm": 0.7554802298545837, "learning_rate": 0.00013924, "loss": 2.0621898651123045, "mean_token_accuracy": 0.6327366977930069, "num_tokens": 9026884.0, "step": 1520 }, { "entropy": 1.8783492282032968, "epoch": 2.629271437782076, "grad_norm": 0.6592015027999878, "learning_rate": 0.00013884000000000002, "loss": 1.9230785369873047, "mean_token_accuracy": 0.6494536675512791, "num_tokens": 9085571.0, "step": 1530 }, { "entropy": 1.9282778173685073, "epoch": 2.6464646464646466, "grad_norm": 0.7717080116271973, "learning_rate": 0.00013844, "loss": 2.0319377899169924, "mean_token_accuracy": 0.6344667036086321, "num_tokens": 9147549.0, "step": 1540 }, { "entropy": 1.903467869758606, "epoch": 2.6636578551472168, "grad_norm": 0.6227516531944275, "learning_rate": 0.00013804000000000003, "loss": 1.9306724548339844, "mean_token_accuracy": 0.644033481925726, "num_tokens": 9204942.0, "step": 1550 }, { "entropy": 1.8967040538787843, "epoch": 2.6808510638297873, "grad_norm": 0.6684938073158264, "learning_rate": 0.00013764000000000002, "loss": 2.001560592651367, "mean_token_accuracy": 0.6470274899154902, "num_tokens": 9266446.0, "step": 1560 }, { "entropy": 1.8590586185455322, "epoch": 2.6980442725123575, "grad_norm": 0.6150694489479065, "learning_rate": 0.00013724, "loss": 1.9280338287353516, "mean_token_accuracy": 0.6484670951962471, "num_tokens": 9326109.0, "step": 1570 }, { "entropy": 1.9293041676282883, "epoch": 2.715237481194928, "grad_norm": 0.6057704091072083, "learning_rate": 0.00013684000000000002, "loss": 1.9943519592285157, "mean_token_accuracy": 0.6371258046478033, "num_tokens": 9385073.0, "step": 1580 }, { "entropy": 1.8843669161200522, "epoch": 2.732430689877498, "grad_norm": 0.6834639310836792, "learning_rate": 0.00013644000000000002, "loss": 1.9569879531860352, "mean_token_accuracy": 0.6437417894601822, "num_tokens": 9445137.0, "step": 1590 }, { "entropy": 1.8529930964112282, "epoch": 2.7496238985600687, "grad_norm": 0.6442180871963501, "learning_rate": 0.00013604, "loss": 1.8902450561523438, "mean_token_accuracy": 0.6518216013908387, "num_tokens": 9504160.0, "step": 1600 }, { "entropy": 1.939158782362938, "epoch": 2.7668171072426393, "grad_norm": 0.6240729689598083, "learning_rate": 0.00013564000000000002, "loss": 2.0188575744628907, "mean_token_accuracy": 0.63564417026937, "num_tokens": 9564675.0, "step": 1610 }, { "entropy": 1.9281259045004844, "epoch": 2.7840103159252094, "grad_norm": 0.750890851020813, "learning_rate": 0.00013524, "loss": 2.017038345336914, "mean_token_accuracy": 0.6387452960014344, "num_tokens": 9625026.0, "step": 1620 }, { "entropy": 1.873080413043499, "epoch": 2.80120352460778, "grad_norm": 0.776397168636322, "learning_rate": 0.00013484, "loss": 1.9759422302246095, "mean_token_accuracy": 0.6433901283890009, "num_tokens": 9685967.0, "step": 1630 }, { "entropy": 1.9089648619294166, "epoch": 2.81839673329035, "grad_norm": 0.6481618881225586, "learning_rate": 0.00013444000000000002, "loss": 1.956050491333008, "mean_token_accuracy": 0.6402542922645807, "num_tokens": 9745233.0, "step": 1640 }, { "entropy": 1.975960558652878, "epoch": 2.8355899419729207, "grad_norm": 0.6896694302558899, "learning_rate": 0.00013404, "loss": 2.0583721160888673, "mean_token_accuracy": 0.6340504981577396, "num_tokens": 9805150.0, "step": 1650 }, { "entropy": 1.945571132004261, "epoch": 2.8527831506554913, "grad_norm": 0.6386220455169678, "learning_rate": 0.00013364, "loss": 2.03116512298584, "mean_token_accuracy": 0.6365220382809639, "num_tokens": 9861196.0, "step": 1660 }, { "entropy": 1.9110410138964653, "epoch": 2.8699763593380614, "grad_norm": 0.7503199577331543, "learning_rate": 0.00013324000000000002, "loss": 1.9521196365356446, "mean_token_accuracy": 0.6381696432828903, "num_tokens": 9921155.0, "step": 1670 }, { "entropy": 1.849820225685835, "epoch": 2.887169568020632, "grad_norm": 0.6197855472564697, "learning_rate": 0.00013284, "loss": 1.8909440994262696, "mean_token_accuracy": 0.6426266122609376, "num_tokens": 9979351.0, "step": 1680 }, { "entropy": 1.8932805389165879, "epoch": 2.904362776703202, "grad_norm": 0.6703120470046997, "learning_rate": 0.00013244, "loss": 2.0233718872070314, "mean_token_accuracy": 0.646468547359109, "num_tokens": 10041238.0, "step": 1690 }, { "entropy": 1.8625088930130005, "epoch": 2.9215559853857727, "grad_norm": 0.73073410987854, "learning_rate": 0.00013204000000000002, "loss": 1.9317462921142579, "mean_token_accuracy": 0.6454428397119045, "num_tokens": 10099496.0, "step": 1700 }, { "entropy": 1.9354272544384004, "epoch": 2.9387491940683432, "grad_norm": 0.6566579937934875, "learning_rate": 0.00013164, "loss": 2.0027164459228515, "mean_token_accuracy": 0.6403926335275173, "num_tokens": 10161720.0, "step": 1710 }, { "entropy": 1.88578300178051, "epoch": 2.9559424027509134, "grad_norm": 0.7905747890472412, "learning_rate": 0.00013124, "loss": 1.9767372131347656, "mean_token_accuracy": 0.6421503167599439, "num_tokens": 10221734.0, "step": 1720 }, { "entropy": 1.870301403105259, "epoch": 2.973135611433484, "grad_norm": 0.7210419774055481, "learning_rate": 0.00013084000000000001, "loss": 1.9475433349609375, "mean_token_accuracy": 0.6528905872255564, "num_tokens": 10280223.0, "step": 1730 }, { "entropy": 1.8696911588311196, "epoch": 2.990328820116054, "grad_norm": 0.626354992389679, "learning_rate": 0.00013044, "loss": 1.926706314086914, "mean_token_accuracy": 0.6482070714235306, "num_tokens": 10339813.0, "step": 1740 }, { "entropy": 1.821205088844547, "epoch": 3.006877283473028, "grad_norm": 0.6353569030761719, "learning_rate": 0.00013004, "loss": 1.8657075881958007, "mean_token_accuracy": 0.6556356762136731, "num_tokens": 10398519.0, "step": 1750 }, { "entropy": 1.8890676617622375, "epoch": 3.0240704921555985, "grad_norm": 0.783729076385498, "learning_rate": 0.00012964, "loss": 1.9794137954711915, "mean_token_accuracy": 0.643126554042101, "num_tokens": 10456386.0, "step": 1760 }, { "entropy": 1.8766882956027984, "epoch": 3.041263700838169, "grad_norm": 0.7075045108795166, "learning_rate": 0.00012924, "loss": 1.9388771057128906, "mean_token_accuracy": 0.6462941512465477, "num_tokens": 10516721.0, "step": 1770 }, { "entropy": 1.7985384911298752, "epoch": 3.0584569095207392, "grad_norm": 0.7116262912750244, "learning_rate": 0.00012884, "loss": 1.8379974365234375, "mean_token_accuracy": 0.6582404263317585, "num_tokens": 10575553.0, "step": 1780 }, { "entropy": 1.8475583091378212, "epoch": 3.07565011820331, "grad_norm": 0.69736248254776, "learning_rate": 0.00012844, "loss": 1.9197765350341798, "mean_token_accuracy": 0.6509403776377439, "num_tokens": 10632501.0, "step": 1790 }, { "entropy": 1.8264927819371224, "epoch": 3.09284332688588, "grad_norm": 0.6354222297668457, "learning_rate": 0.00012804, "loss": 1.8965986251831055, "mean_token_accuracy": 0.6518782209604979, "num_tokens": 10693167.0, "step": 1800 }, { "entropy": 1.8696907818317414, "epoch": 3.1100365355684505, "grad_norm": 0.7568804621696472, "learning_rate": 0.00012764, "loss": 1.9332853317260743, "mean_token_accuracy": 0.6471077598631382, "num_tokens": 10753837.0, "step": 1810 }, { "entropy": 1.886954003572464, "epoch": 3.1272297442510206, "grad_norm": 0.7069846391677856, "learning_rate": 0.00012724, "loss": 1.9263908386230468, "mean_token_accuracy": 0.6466126769781113, "num_tokens": 10815256.0, "step": 1820 }, { "entropy": 1.8424360305070877, "epoch": 3.144422952933591, "grad_norm": 0.6524083614349365, "learning_rate": 0.00012684, "loss": 1.9088315963745117, "mean_token_accuracy": 0.6496367674320936, "num_tokens": 10877848.0, "step": 1830 }, { "entropy": 1.8966794192790986, "epoch": 3.1616161616161618, "grad_norm": 0.687421977519989, "learning_rate": 0.00012644000000000002, "loss": 1.9748069763183593, "mean_token_accuracy": 0.6424707356840372, "num_tokens": 10938042.0, "step": 1840 }, { "entropy": 1.81406429708004, "epoch": 3.178809370298732, "grad_norm": 0.7668496370315552, "learning_rate": 0.00012604, "loss": 1.8712465286254882, "mean_token_accuracy": 0.6571074567735196, "num_tokens": 10996204.0, "step": 1850 }, { "entropy": 1.8159340515732765, "epoch": 3.1960025789813025, "grad_norm": 0.7182545065879822, "learning_rate": 0.00012564, "loss": 1.830276107788086, "mean_token_accuracy": 0.6546356856822968, "num_tokens": 11056605.0, "step": 1860 }, { "entropy": 1.9095668271183968, "epoch": 3.2131957876638726, "grad_norm": 0.7548812031745911, "learning_rate": 0.00012524000000000001, "loss": 1.998922348022461, "mean_token_accuracy": 0.6411306612193585, "num_tokens": 11116614.0, "step": 1870 }, { "entropy": 1.8717206478118897, "epoch": 3.230388996346443, "grad_norm": 0.7692223191261292, "learning_rate": 0.00012484, "loss": 1.914438247680664, "mean_token_accuracy": 0.6441164951771498, "num_tokens": 11175802.0, "step": 1880 }, { "entropy": 1.8943733513355254, "epoch": 3.2475822050290137, "grad_norm": 0.6439138650894165, "learning_rate": 0.00012444, "loss": 1.9280553817749024, "mean_token_accuracy": 0.6476396139711141, "num_tokens": 11236477.0, "step": 1890 }, { "entropy": 1.8841392308473588, "epoch": 3.264775413711584, "grad_norm": 0.6971343159675598, "learning_rate": 0.00012404, "loss": 1.942568588256836, "mean_token_accuracy": 0.6398356795310974, "num_tokens": 11295146.0, "step": 1900 }, { "entropy": 1.8830088019371032, "epoch": 3.2819686223941544, "grad_norm": 0.7196023464202881, "learning_rate": 0.00012364, "loss": 1.963007354736328, "mean_token_accuracy": 0.6452915534377098, "num_tokens": 11355726.0, "step": 1910 }, { "entropy": 1.927216087281704, "epoch": 3.2991618310767246, "grad_norm": 0.790634274482727, "learning_rate": 0.00012324, "loss": 2.0809165954589846, "mean_token_accuracy": 0.6384686015546321, "num_tokens": 11415237.0, "step": 1920 }, { "entropy": 1.849087017774582, "epoch": 3.316355039759295, "grad_norm": 0.6752087473869324, "learning_rate": 0.00012284, "loss": 1.9017595291137694, "mean_token_accuracy": 0.6522149413824081, "num_tokens": 11476337.0, "step": 1930 }, { "entropy": 1.8517325416207313, "epoch": 3.3335482484418657, "grad_norm": 0.8036973476409912, "learning_rate": 0.00012244, "loss": 1.9011222839355468, "mean_token_accuracy": 0.6499856971204281, "num_tokens": 11537529.0, "step": 1940 }, { "entropy": 1.7622334837913514, "epoch": 3.350741457124436, "grad_norm": 0.7138587832450867, "learning_rate": 0.00012204, "loss": 1.7955827713012695, "mean_token_accuracy": 0.6596556272357702, "num_tokens": 11595421.0, "step": 1950 }, { "entropy": 1.8950866341590882, "epoch": 3.3679346658070064, "grad_norm": 0.6869714260101318, "learning_rate": 0.00012164, "loss": 1.948552131652832, "mean_token_accuracy": 0.6493024453520775, "num_tokens": 11655749.0, "step": 1960 }, { "entropy": 1.9235218942165375, "epoch": 3.3851278744895765, "grad_norm": 0.656403124332428, "learning_rate": 0.00012124, "loss": 2.04327449798584, "mean_token_accuracy": 0.6389912366867065, "num_tokens": 11717271.0, "step": 1970 }, { "entropy": 1.834906594455242, "epoch": 3.402321083172147, "grad_norm": 0.7343699932098389, "learning_rate": 0.00012084, "loss": 1.9038848876953125, "mean_token_accuracy": 0.6569048661738635, "num_tokens": 11778095.0, "step": 1980 }, { "entropy": 1.8515655741095542, "epoch": 3.4195142918547172, "grad_norm": 0.7009745240211487, "learning_rate": 0.00012043999999999999, "loss": 1.9157728195190429, "mean_token_accuracy": 0.6512683361768723, "num_tokens": 11835954.0, "step": 1990 }, { "entropy": 1.8634012743830681, "epoch": 3.436707500537288, "grad_norm": 0.6880552172660828, "learning_rate": 0.00012004, "loss": 1.9772762298583983, "mean_token_accuracy": 0.6531724959611893, "num_tokens": 11896615.0, "step": 2000 }, { "entropy": 1.8952298507094383, "epoch": 3.453900709219858, "grad_norm": 0.7292787432670593, "learning_rate": 0.00011964, "loss": 1.9302806854248047, "mean_token_accuracy": 0.6462091594934464, "num_tokens": 11954949.0, "step": 2010 }, { "entropy": 1.8723753660917282, "epoch": 3.4710939179024285, "grad_norm": 0.730530858039856, "learning_rate": 0.00011923999999999999, "loss": 1.9216194152832031, "mean_token_accuracy": 0.6504904717206955, "num_tokens": 12013803.0, "step": 2020 }, { "entropy": 1.8673277243971824, "epoch": 3.488287126584999, "grad_norm": 0.7530126571655273, "learning_rate": 0.00011884, "loss": 1.968985366821289, "mean_token_accuracy": 0.646847129613161, "num_tokens": 12073284.0, "step": 2030 }, { "entropy": 1.8757897645235062, "epoch": 3.505480335267569, "grad_norm": 0.7031217813491821, "learning_rate": 0.00011844, "loss": 1.9071741104125977, "mean_token_accuracy": 0.6450003884732723, "num_tokens": 12126451.0, "step": 2040 }, { "entropy": 1.7986262783408165, "epoch": 3.5226735439501398, "grad_norm": 0.7223983407020569, "learning_rate": 0.00011804, "loss": 1.8450950622558593, "mean_token_accuracy": 0.6576410517096519, "num_tokens": 12183343.0, "step": 2050 }, { "entropy": 1.8884935915470122, "epoch": 3.53986675263271, "grad_norm": 0.7206518650054932, "learning_rate": 0.00011763999999999999, "loss": 1.9660964965820313, "mean_token_accuracy": 0.6422303304076195, "num_tokens": 12243607.0, "step": 2060 }, { "entropy": 1.8009026944637299, "epoch": 3.5570599613152805, "grad_norm": 0.7229637503623962, "learning_rate": 0.00011724000000000002, "loss": 1.851433563232422, "mean_token_accuracy": 0.6556052915751934, "num_tokens": 12304867.0, "step": 2070 }, { "entropy": 1.7949693977832795, "epoch": 3.574253169997851, "grad_norm": 0.6935518383979797, "learning_rate": 0.00011684000000000001, "loss": 1.8848058700561523, "mean_token_accuracy": 0.6580755174160003, "num_tokens": 12367633.0, "step": 2080 }, { "entropy": 1.8038981169462205, "epoch": 3.591446378680421, "grad_norm": 0.7003904581069946, "learning_rate": 0.00011644000000000002, "loss": 1.8867233276367188, "mean_token_accuracy": 0.655081395432353, "num_tokens": 12423928.0, "step": 2090 }, { "entropy": 1.850062020123005, "epoch": 3.6086395873629917, "grad_norm": 0.6852926015853882, "learning_rate": 0.00011604000000000002, "loss": 1.9325201034545898, "mean_token_accuracy": 0.6472255479544401, "num_tokens": 12479411.0, "step": 2100 }, { "entropy": 1.8294448778033257, "epoch": 3.625832796045562, "grad_norm": 0.7044693827629089, "learning_rate": 0.00011564000000000001, "loss": 1.8989273071289063, "mean_token_accuracy": 0.6499249216169118, "num_tokens": 12539175.0, "step": 2110 }, { "entropy": 1.8719267755746842, "epoch": 3.6430260047281324, "grad_norm": 0.7180586457252502, "learning_rate": 0.00011524000000000001, "loss": 1.925216293334961, "mean_token_accuracy": 0.648950444161892, "num_tokens": 12598337.0, "step": 2120 }, { "entropy": 1.88923449665308, "epoch": 3.660219213410703, "grad_norm": 0.7464597821235657, "learning_rate": 0.00011484000000000002, "loss": 1.990826416015625, "mean_token_accuracy": 0.6456409864127636, "num_tokens": 12656592.0, "step": 2130 }, { "entropy": 1.8126205861568452, "epoch": 3.677412422093273, "grad_norm": 0.7253774404525757, "learning_rate": 0.00011444000000000001, "loss": 1.9414216995239257, "mean_token_accuracy": 0.6552157323807478, "num_tokens": 12717791.0, "step": 2140 }, { "entropy": 1.8930377542972565, "epoch": 3.6946056307758437, "grad_norm": 0.7404170036315918, "learning_rate": 0.00011404000000000001, "loss": 1.9364784240722657, "mean_token_accuracy": 0.6434980578720569, "num_tokens": 12775445.0, "step": 2150 }, { "entropy": 1.7652419656515121, "epoch": 3.711798839458414, "grad_norm": 0.688732385635376, "learning_rate": 0.00011364000000000002, "loss": 1.7636165618896484, "mean_token_accuracy": 0.6639453627169132, "num_tokens": 12834599.0, "step": 2160 }, { "entropy": 1.7745767116546631, "epoch": 3.7289920481409844, "grad_norm": 0.7011992335319519, "learning_rate": 0.00011324000000000001, "loss": 1.8347841262817384, "mean_token_accuracy": 0.6586773280054331, "num_tokens": 12889887.0, "step": 2170 }, { "entropy": 1.7952505484223367, "epoch": 3.746185256823555, "grad_norm": 0.7646785378456116, "learning_rate": 0.00011284000000000001, "loss": 1.883163070678711, "mean_token_accuracy": 0.6589437790215016, "num_tokens": 12950286.0, "step": 2180 }, { "entropy": 1.8878965258598328, "epoch": 3.763378465506125, "grad_norm": 0.7722623944282532, "learning_rate": 0.00011244000000000001, "loss": 1.9674694061279296, "mean_token_accuracy": 0.6422343414276839, "num_tokens": 13011083.0, "step": 2190 }, { "entropy": 1.919720321893692, "epoch": 3.780571674188695, "grad_norm": 0.7656893134117126, "learning_rate": 0.00011204000000000002, "loss": 1.9919773101806642, "mean_token_accuracy": 0.6393908958882093, "num_tokens": 13069376.0, "step": 2200 }, { "entropy": 1.77825688123703, "epoch": 3.797764882871266, "grad_norm": 0.8324808478355408, "learning_rate": 0.00011164000000000001, "loss": 1.8173160552978516, "mean_token_accuracy": 0.659475727379322, "num_tokens": 13124851.0, "step": 2210 }, { "entropy": 1.8232837438583374, "epoch": 3.8149580915538364, "grad_norm": 0.741481363773346, "learning_rate": 0.00011124000000000001, "loss": 1.860748291015625, "mean_token_accuracy": 0.6524971850216389, "num_tokens": 13182576.0, "step": 2220 }, { "entropy": 1.8588940657675266, "epoch": 3.8321513002364065, "grad_norm": 0.7748705148696899, "learning_rate": 0.00011084000000000002, "loss": 1.9206954956054687, "mean_token_accuracy": 0.6516353718936443, "num_tokens": 13242703.0, "step": 2230 }, { "entropy": 1.823398308455944, "epoch": 3.849344508918977, "grad_norm": 0.6341049671173096, "learning_rate": 0.00011044, "loss": 1.8718917846679688, "mean_token_accuracy": 0.6596139155328273, "num_tokens": 13303181.0, "step": 2240 }, { "entropy": 1.8098929420113563, "epoch": 3.866537717601547, "grad_norm": 0.6672969460487366, "learning_rate": 0.00011004000000000001, "loss": 1.8999752044677733, "mean_token_accuracy": 0.6594760783016682, "num_tokens": 13364371.0, "step": 2250 }, { "entropy": 1.7795367375016213, "epoch": 3.8837309262841178, "grad_norm": 0.6343891024589539, "learning_rate": 0.00010964000000000001, "loss": 1.827276611328125, "mean_token_accuracy": 0.6668465688824654, "num_tokens": 13425450.0, "step": 2260 }, { "entropy": 1.8673226684331894, "epoch": 3.9009241349666883, "grad_norm": 0.7357877492904663, "learning_rate": 0.00010924, "loss": 1.9206443786621095, "mean_token_accuracy": 0.647479448094964, "num_tokens": 13485806.0, "step": 2270 }, { "entropy": 1.806484942883253, "epoch": 3.9181173436492585, "grad_norm": 0.7172144055366516, "learning_rate": 0.00010884000000000001, "loss": 1.8789045333862304, "mean_token_accuracy": 0.6594084780663252, "num_tokens": 13544934.0, "step": 2280 }, { "entropy": 1.7970930591225625, "epoch": 3.935310552331829, "grad_norm": 0.7578801512718201, "learning_rate": 0.00010844000000000001, "loss": 1.8405040740966796, "mean_token_accuracy": 0.6608923889696598, "num_tokens": 13606653.0, "step": 2290 }, { "entropy": 1.8469372361898422, "epoch": 3.952503761014399, "grad_norm": 0.7626324892044067, "learning_rate": 0.00010804, "loss": 1.8629837036132812, "mean_token_accuracy": 0.6560039456933737, "num_tokens": 13663938.0, "step": 2300 }, { "entropy": 1.836122378706932, "epoch": 3.9696969696969697, "grad_norm": 0.7074365615844727, "learning_rate": 0.00010764, "loss": 1.8942070007324219, "mean_token_accuracy": 0.647238065674901, "num_tokens": 13722549.0, "step": 2310 }, { "entropy": 1.821449062973261, "epoch": 3.9868901783795403, "grad_norm": 0.6956577301025391, "learning_rate": 0.00010724000000000001, "loss": 1.8947336196899414, "mean_token_accuracy": 0.6528103355318308, "num_tokens": 13785922.0, "step": 2320 }, { "entropy": 1.839719023023333, "epoch": 4.003438641736514, "grad_norm": 0.6865222454071045, "learning_rate": 0.00010684, "loss": 1.8803377151489258, "mean_token_accuracy": 0.6526942384707464, "num_tokens": 13844647.0, "step": 2330 }, { "entropy": 1.855065654218197, "epoch": 4.020631850419084, "grad_norm": 0.7424384355545044, "learning_rate": 0.00010644, "loss": 1.9461166381835937, "mean_token_accuracy": 0.6463506512343884, "num_tokens": 13904724.0, "step": 2340 }, { "entropy": 1.7508789122104644, "epoch": 4.037825059101655, "grad_norm": 0.6670609712600708, "learning_rate": 0.00010604000000000001, "loss": 1.781893539428711, "mean_token_accuracy": 0.6653038747608662, "num_tokens": 13963472.0, "step": 2350 }, { "entropy": 1.8165026590228082, "epoch": 4.0550182677842255, "grad_norm": 0.7823750376701355, "learning_rate": 0.00010564000000000001, "loss": 1.8847312927246094, "mean_token_accuracy": 0.6607359856367111, "num_tokens": 14019708.0, "step": 2360 }, { "entropy": 1.794335062801838, "epoch": 4.072211476466796, "grad_norm": 0.8262340426445007, "learning_rate": 0.00010524, "loss": 1.8576740264892577, "mean_token_accuracy": 0.6582343481481076, "num_tokens": 14076178.0, "step": 2370 }, { "entropy": 1.8828865155577659, "epoch": 4.089404685149366, "grad_norm": 0.784656822681427, "learning_rate": 0.00010484, "loss": 1.9146394729614258, "mean_token_accuracy": 0.6491621173918247, "num_tokens": 14133662.0, "step": 2380 }, { "entropy": 1.918326808512211, "epoch": 4.106597893831936, "grad_norm": 0.7571077346801758, "learning_rate": 0.00010444000000000001, "loss": 2.024713897705078, "mean_token_accuracy": 0.643079025298357, "num_tokens": 14196967.0, "step": 2390 }, { "entropy": 1.7909317679703236, "epoch": 4.123791102514507, "grad_norm": 0.7276471257209778, "learning_rate": 0.00010404, "loss": 1.845133399963379, "mean_token_accuracy": 0.6548417568206787, "num_tokens": 14256866.0, "step": 2400 }, { "entropy": 1.7750686906278133, "epoch": 4.140984311197077, "grad_norm": 0.668246328830719, "learning_rate": 0.00010364, "loss": 1.7945009231567384, "mean_token_accuracy": 0.6641525950282812, "num_tokens": 14318324.0, "step": 2410 }, { "entropy": 1.823828212916851, "epoch": 4.158177519879647, "grad_norm": 0.7596518993377686, "learning_rate": 0.00010324000000000001, "loss": 1.898871612548828, "mean_token_accuracy": 0.6519910141825676, "num_tokens": 14380775.0, "step": 2420 }, { "entropy": 1.7938876405358315, "epoch": 4.175370728562218, "grad_norm": 0.6834619641304016, "learning_rate": 0.00010284, "loss": 1.8518138885498048, "mean_token_accuracy": 0.6622516691684723, "num_tokens": 14440862.0, "step": 2430 }, { "entropy": 1.8744625180959702, "epoch": 4.192563937244788, "grad_norm": 0.8088146448135376, "learning_rate": 0.00010244, "loss": 1.9542848587036132, "mean_token_accuracy": 0.6499028638005256, "num_tokens": 14500841.0, "step": 2440 }, { "entropy": 1.8284114554524422, "epoch": 4.209757145927359, "grad_norm": 0.82193523645401, "learning_rate": 0.00010204, "loss": 1.9107404708862306, "mean_token_accuracy": 0.6551219135522842, "num_tokens": 14564257.0, "step": 2450 }, { "entropy": 1.8538024842739105, "epoch": 4.226950354609929, "grad_norm": 0.7263757586479187, "learning_rate": 0.00010164, "loss": 1.8713863372802735, "mean_token_accuracy": 0.6510257624089718, "num_tokens": 14623019.0, "step": 2460 }, { "entropy": 1.756752038002014, "epoch": 4.244143563292499, "grad_norm": 0.7334346175193787, "learning_rate": 0.00010124, "loss": 1.7855047225952148, "mean_token_accuracy": 0.6687729060649872, "num_tokens": 14682191.0, "step": 2470 }, { "entropy": 1.7032470375299453, "epoch": 4.26133677197507, "grad_norm": 0.7168938517570496, "learning_rate": 0.00010084, "loss": 1.7648530960083009, "mean_token_accuracy": 0.6696467150002718, "num_tokens": 14739840.0, "step": 2480 }, { "entropy": 1.7426577515900135, "epoch": 4.27852998065764, "grad_norm": 0.7091065645217896, "learning_rate": 0.00010044000000000001, "loss": 1.8180946350097655, "mean_token_accuracy": 0.6640235505998134, "num_tokens": 14798444.0, "step": 2490 }, { "entropy": 1.8743537411093711, "epoch": 4.295723189340211, "grad_norm": 0.6376718878746033, "learning_rate": 0.00010004, "loss": 1.9534942626953125, "mean_token_accuracy": 0.6467559643089771, "num_tokens": 14861262.0, "step": 2500 }, { "entropy": 1.8234948687255383, "epoch": 4.312916398022781, "grad_norm": 0.778538167476654, "learning_rate": 9.964e-05, "loss": 1.8733020782470704, "mean_token_accuracy": 0.6553889319300652, "num_tokens": 14920923.0, "step": 2510 }, { "entropy": 1.812998068332672, "epoch": 4.330109606705351, "grad_norm": 0.7861834764480591, "learning_rate": 9.924e-05, "loss": 1.8699317932128907, "mean_token_accuracy": 0.6555795632302761, "num_tokens": 14978173.0, "step": 2520 }, { "entropy": 1.8013822883367538, "epoch": 4.347302815387922, "grad_norm": 0.751916229724884, "learning_rate": 9.884e-05, "loss": 1.8372121810913087, "mean_token_accuracy": 0.664341426640749, "num_tokens": 15034480.0, "step": 2530 }, { "entropy": 1.7700918450951577, "epoch": 4.364496024070492, "grad_norm": 0.7365695834159851, "learning_rate": 9.844000000000001e-05, "loss": 1.8166645050048829, "mean_token_accuracy": 0.6654425717890262, "num_tokens": 15093226.0, "step": 2540 }, { "entropy": 1.7808674454689026, "epoch": 4.381689232753063, "grad_norm": 0.7306393980979919, "learning_rate": 9.804e-05, "loss": 1.8363780975341797, "mean_token_accuracy": 0.6601886965334416, "num_tokens": 15149937.0, "step": 2550 }, { "entropy": 1.7890540674328803, "epoch": 4.398882441435633, "grad_norm": 0.7466715574264526, "learning_rate": 9.764000000000001e-05, "loss": 1.847653579711914, "mean_token_accuracy": 0.6586611110717058, "num_tokens": 15210500.0, "step": 2560 }, { "entropy": 1.7866264268755914, "epoch": 4.416075650118203, "grad_norm": 0.7825273871421814, "learning_rate": 9.724000000000001e-05, "loss": 1.82576904296875, "mean_token_accuracy": 0.6592508733272553, "num_tokens": 15268262.0, "step": 2570 }, { "entropy": 1.8321722269058227, "epoch": 4.433268858800774, "grad_norm": 0.7158058285713196, "learning_rate": 9.684000000000001e-05, "loss": 1.8807327270507812, "mean_token_accuracy": 0.6545467376708984, "num_tokens": 15330745.0, "step": 2580 }, { "entropy": 1.739266212284565, "epoch": 4.450462067483344, "grad_norm": 0.7281847596168518, "learning_rate": 9.644e-05, "loss": 1.7686588287353515, "mean_token_accuracy": 0.6666045777499676, "num_tokens": 15391266.0, "step": 2590 }, { "entropy": 1.8295569285750388, "epoch": 4.467655276165915, "grad_norm": 0.7166727781295776, "learning_rate": 9.604000000000001e-05, "loss": 1.9156217575073242, "mean_token_accuracy": 0.655017600953579, "num_tokens": 15449819.0, "step": 2600 }, { "entropy": 1.8236071288585662, "epoch": 4.484848484848484, "grad_norm": 0.6946532726287842, "learning_rate": 9.564000000000001e-05, "loss": 1.9035514831542968, "mean_token_accuracy": 0.649907086789608, "num_tokens": 15513231.0, "step": 2610 }, { "entropy": 1.7869442969560623, "epoch": 4.502041693531055, "grad_norm": 0.7257023453712463, "learning_rate": 9.524e-05, "loss": 1.841336441040039, "mean_token_accuracy": 0.6655759517103433, "num_tokens": 15568973.0, "step": 2620 }, { "entropy": 1.7462848544120788, "epoch": 4.519234902213626, "grad_norm": 0.7239391803741455, "learning_rate": 9.484e-05, "loss": 1.7989360809326171, "mean_token_accuracy": 0.6646886244416237, "num_tokens": 15627655.0, "step": 2630 }, { "entropy": 1.7926493644714356, "epoch": 4.536428110896196, "grad_norm": 0.7628325819969177, "learning_rate": 9.444000000000001e-05, "loss": 1.8627632141113282, "mean_token_accuracy": 0.654141866415739, "num_tokens": 15687626.0, "step": 2640 }, { "entropy": 1.7928333327174186, "epoch": 4.553621319578767, "grad_norm": 0.629107654094696, "learning_rate": 9.404e-05, "loss": 1.8784042358398438, "mean_token_accuracy": 0.6618591919541359, "num_tokens": 15750035.0, "step": 2650 }, { "entropy": 1.7438783437013625, "epoch": 4.570814528261336, "grad_norm": 0.6948845982551575, "learning_rate": 9.364e-05, "loss": 1.7456579208374023, "mean_token_accuracy": 0.6722261719405651, "num_tokens": 15809533.0, "step": 2660 }, { "entropy": 1.7451874181628226, "epoch": 4.588007736943907, "grad_norm": 0.7213107943534851, "learning_rate": 9.324000000000001e-05, "loss": 1.8111917495727539, "mean_token_accuracy": 0.6621977139264346, "num_tokens": 15866570.0, "step": 2670 }, { "entropy": 1.806991095095873, "epoch": 4.6052009456264775, "grad_norm": 0.9146936535835266, "learning_rate": 9.284e-05, "loss": 1.8761199951171874, "mean_token_accuracy": 0.6552402298897505, "num_tokens": 15923681.0, "step": 2680 }, { "entropy": 1.854476225376129, "epoch": 4.622394154309048, "grad_norm": 0.675061047077179, "learning_rate": 9.244e-05, "loss": 1.8601364135742187, "mean_token_accuracy": 0.656403211131692, "num_tokens": 15979879.0, "step": 2690 }, { "entropy": 1.8345128282904626, "epoch": 4.639587362991619, "grad_norm": 0.7702699303627014, "learning_rate": 9.204e-05, "loss": 1.9170707702636718, "mean_token_accuracy": 0.6507652081549168, "num_tokens": 16040136.0, "step": 2700 }, { "entropy": 1.8444690719246863, "epoch": 4.656780571674188, "grad_norm": 0.7249677181243896, "learning_rate": 9.164000000000001e-05, "loss": 1.9021928787231446, "mean_token_accuracy": 0.6553504541516304, "num_tokens": 16097652.0, "step": 2710 }, { "entropy": 1.8083212688565253, "epoch": 4.673973780356759, "grad_norm": 0.7018275260925293, "learning_rate": 9.124e-05, "loss": 1.87921199798584, "mean_token_accuracy": 0.6609590038657188, "num_tokens": 16159014.0, "step": 2720 }, { "entropy": 1.793540646135807, "epoch": 4.6911669890393295, "grad_norm": 0.731863796710968, "learning_rate": 9.084e-05, "loss": 1.847224807739258, "mean_token_accuracy": 0.6638176888227463, "num_tokens": 16223636.0, "step": 2730 }, { "entropy": 1.7947301134467124, "epoch": 4.7083601977219, "grad_norm": 0.7208489775657654, "learning_rate": 9.044000000000001e-05, "loss": 1.8400375366210937, "mean_token_accuracy": 0.6600434482097626, "num_tokens": 16281647.0, "step": 2740 }, { "entropy": 1.8043948471546174, "epoch": 4.725553406404471, "grad_norm": 0.7633848190307617, "learning_rate": 9.004e-05, "loss": 1.8509382247924804, "mean_token_accuracy": 0.6632162068039179, "num_tokens": 16340706.0, "step": 2750 }, { "entropy": 1.8240734949707984, "epoch": 4.74274661508704, "grad_norm": 0.7516812086105347, "learning_rate": 8.964e-05, "loss": 1.9139686584472657, "mean_token_accuracy": 0.6504824224859476, "num_tokens": 16398077.0, "step": 2760 }, { "entropy": 1.7775158017873764, "epoch": 4.759939823769611, "grad_norm": 0.7677133679389954, "learning_rate": 8.924e-05, "loss": 1.8351661682128906, "mean_token_accuracy": 0.6568478621542454, "num_tokens": 16458898.0, "step": 2770 }, { "entropy": 1.8671277523040772, "epoch": 4.7771330324521815, "grad_norm": 0.750451385974884, "learning_rate": 8.884e-05, "loss": 1.9589305877685548, "mean_token_accuracy": 0.6506143860518933, "num_tokens": 16519496.0, "step": 2780 }, { "entropy": 1.7745324671268463, "epoch": 4.794326241134752, "grad_norm": 0.8302338719367981, "learning_rate": 8.844e-05, "loss": 1.8637496948242187, "mean_token_accuracy": 0.6621543657034635, "num_tokens": 16579080.0, "step": 2790 }, { "entropy": 1.73246541172266, "epoch": 4.811519449817322, "grad_norm": 0.778176486492157, "learning_rate": 8.804e-05, "loss": 1.752696418762207, "mean_token_accuracy": 0.6727286443114281, "num_tokens": 16640932.0, "step": 2800 }, { "entropy": 1.8060437709093093, "epoch": 4.828712658499892, "grad_norm": 0.9019444584846497, "learning_rate": 8.764e-05, "loss": 1.9031681060791015, "mean_token_accuracy": 0.6563040159642697, "num_tokens": 16702244.0, "step": 2810 }, { "entropy": 1.8732322439551354, "epoch": 4.845905867182463, "grad_norm": 0.7397829294204712, "learning_rate": 8.724e-05, "loss": 1.9326038360595703, "mean_token_accuracy": 0.6478111572563648, "num_tokens": 16764555.0, "step": 2820 }, { "entropy": 1.842681024968624, "epoch": 4.863099075865033, "grad_norm": 0.8511717915534973, "learning_rate": 8.684e-05, "loss": 1.9107376098632813, "mean_token_accuracy": 0.6531910292804242, "num_tokens": 16821936.0, "step": 2830 }, { "entropy": 1.7571960732340812, "epoch": 4.880292284547604, "grad_norm": 0.7064304947853088, "learning_rate": 8.643999999999999e-05, "loss": 1.7985404968261718, "mean_token_accuracy": 0.6667480751872062, "num_tokens": 16882205.0, "step": 2840 }, { "entropy": 1.8695308573544025, "epoch": 4.897485493230175, "grad_norm": 0.7386742234230042, "learning_rate": 8.604000000000001e-05, "loss": 1.9543342590332031, "mean_token_accuracy": 0.6496741093695164, "num_tokens": 16939799.0, "step": 2850 }, { "entropy": 1.7877972453832627, "epoch": 4.914678701912744, "grad_norm": 0.7687976956367493, "learning_rate": 8.564000000000001e-05, "loss": 1.7994373321533204, "mean_token_accuracy": 0.6637697361409665, "num_tokens": 16997716.0, "step": 2860 }, { "entropy": 1.761916320025921, "epoch": 4.931871910595315, "grad_norm": 0.7507193088531494, "learning_rate": 8.524e-05, "loss": 1.788670539855957, "mean_token_accuracy": 0.6648910716176033, "num_tokens": 17057260.0, "step": 2870 }, { "entropy": 1.804823537170887, "epoch": 4.949065119277885, "grad_norm": 0.727188229560852, "learning_rate": 8.484000000000001e-05, "loss": 1.855522346496582, "mean_token_accuracy": 0.657912939786911, "num_tokens": 17116073.0, "step": 2880 }, { "entropy": 1.8259041801095008, "epoch": 4.966258327960456, "grad_norm": 0.7195336818695068, "learning_rate": 8.444000000000001e-05, "loss": 1.8942272186279296, "mean_token_accuracy": 0.6546841934323311, "num_tokens": 17174141.0, "step": 2890 }, { "entropy": 1.7153871595859527, "epoch": 4.983451536643026, "grad_norm": 0.7093940377235413, "learning_rate": 8.404e-05, "loss": 1.7350996017456055, "mean_token_accuracy": 0.6728265054523945, "num_tokens": 17233307.0, "step": 2900 }, { "entropy": 1.7630670566063422, "epoch": 5.0, "grad_norm": 0.979345440864563, "learning_rate": 8.364e-05, "loss": 1.8098876953125, "mean_token_accuracy": 0.6604567510741097, "num_tokens": 17289810.0, "step": 2910 }, { "entropy": 1.8877688512206077, "epoch": 5.017193208682571, "grad_norm": 0.8140257596969604, "learning_rate": 8.324000000000001e-05, "loss": 1.9562681198120118, "mean_token_accuracy": 0.6476880256086588, "num_tokens": 17349922.0, "step": 2920 }, { "entropy": 1.6694072388112544, "epoch": 5.034386417365141, "grad_norm": 0.7486578226089478, "learning_rate": 8.284000000000001e-05, "loss": 1.71788330078125, "mean_token_accuracy": 0.6781885512173176, "num_tokens": 17409363.0, "step": 2930 }, { "entropy": 1.8061093628406524, "epoch": 5.051579626047711, "grad_norm": 0.8148984313011169, "learning_rate": 8.244e-05, "loss": 1.8484228134155274, "mean_token_accuracy": 0.6591597832739353, "num_tokens": 17468218.0, "step": 2940 }, { "entropy": 1.7561381176114081, "epoch": 5.068772834730281, "grad_norm": 0.7412339448928833, "learning_rate": 8.204000000000001e-05, "loss": 1.8109855651855469, "mean_token_accuracy": 0.6648329850286245, "num_tokens": 17529603.0, "step": 2950 }, { "entropy": 1.7058369636535644, "epoch": 5.085966043412852, "grad_norm": 0.7845883369445801, "learning_rate": 8.164000000000001e-05, "loss": 1.7577402114868164, "mean_token_accuracy": 0.675883399322629, "num_tokens": 17587275.0, "step": 2960 }, { "entropy": 1.7319279327988624, "epoch": 5.1031592520954225, "grad_norm": 0.7546029090881348, "learning_rate": 8.124e-05, "loss": 1.8096488952636718, "mean_token_accuracy": 0.668717809766531, "num_tokens": 17647368.0, "step": 2970 }, { "entropy": 1.7872621923685075, "epoch": 5.120352460777993, "grad_norm": 0.7214957475662231, "learning_rate": 8.084e-05, "loss": 1.7827239990234376, "mean_token_accuracy": 0.663322826102376, "num_tokens": 17708210.0, "step": 2980 }, { "entropy": 1.7479579642415046, "epoch": 5.137545669460563, "grad_norm": 0.6938044428825378, "learning_rate": 8.044000000000001e-05, "loss": 1.837489700317383, "mean_token_accuracy": 0.666904554143548, "num_tokens": 17770498.0, "step": 2990 }, { "entropy": 1.760008592903614, "epoch": 5.154738878143133, "grad_norm": 0.7440096139907837, "learning_rate": 8.004e-05, "loss": 1.7957250595092773, "mean_token_accuracy": 0.6704145818948746, "num_tokens": 17831493.0, "step": 3000 }, { "entropy": 1.7866925299167633, "epoch": 5.171932086825704, "grad_norm": 0.775793731212616, "learning_rate": 7.964e-05, "loss": 1.8513370513916017, "mean_token_accuracy": 0.6593568369746208, "num_tokens": 17893338.0, "step": 3010 }, { "entropy": 1.720614206790924, "epoch": 5.1891252955082745, "grad_norm": 0.7855071425437927, "learning_rate": 7.924000000000001e-05, "loss": 1.7529998779296876, "mean_token_accuracy": 0.6738685265183448, "num_tokens": 17949102.0, "step": 3020 }, { "entropy": 1.8130397230386734, "epoch": 5.206318504190844, "grad_norm": 0.7261347770690918, "learning_rate": 7.884e-05, "loss": 1.8530288696289063, "mean_token_accuracy": 0.6672368694096804, "num_tokens": 18011291.0, "step": 3030 }, { "entropy": 1.7442916065454483, "epoch": 5.223511712873415, "grad_norm": 0.7350125908851624, "learning_rate": 7.844e-05, "loss": 1.8185455322265625, "mean_token_accuracy": 0.6648106183856726, "num_tokens": 18069969.0, "step": 3040 }, { "entropy": 1.8336029559373856, "epoch": 5.240704921555985, "grad_norm": 0.9380921125411987, "learning_rate": 7.804e-05, "loss": 1.8585586547851562, "mean_token_accuracy": 0.6559876747429371, "num_tokens": 18126839.0, "step": 3050 }, { "entropy": 1.831410789489746, "epoch": 5.257898130238556, "grad_norm": 0.7422699928283691, "learning_rate": 7.764e-05, "loss": 1.8669567108154297, "mean_token_accuracy": 0.660079612582922, "num_tokens": 18189169.0, "step": 3060 }, { "entropy": 1.7772829428315162, "epoch": 5.2750913389211265, "grad_norm": 0.80905681848526, "learning_rate": 7.724e-05, "loss": 1.8516859054565429, "mean_token_accuracy": 0.6614492174237967, "num_tokens": 18247092.0, "step": 3070 }, { "entropy": 1.8644750490784645, "epoch": 5.292284547603696, "grad_norm": 0.7613252997398376, "learning_rate": 7.684e-05, "loss": 1.8771135330200195, "mean_token_accuracy": 0.6572393793612719, "num_tokens": 18305337.0, "step": 3080 }, { "entropy": 1.7888765200972556, "epoch": 5.309477756286267, "grad_norm": 0.8216497302055359, "learning_rate": 7.644e-05, "loss": 1.877157211303711, "mean_token_accuracy": 0.6567147132009268, "num_tokens": 18366693.0, "step": 3090 }, { "entropy": 1.8046741798520087, "epoch": 5.326670964968837, "grad_norm": 0.7475964426994324, "learning_rate": 7.604e-05, "loss": 1.8193256378173828, "mean_token_accuracy": 0.6620738692581654, "num_tokens": 18424592.0, "step": 3100 }, { "entropy": 1.7556863978505135, "epoch": 5.343864173651408, "grad_norm": 0.7376730442047119, "learning_rate": 7.564e-05, "loss": 1.8117481231689454, "mean_token_accuracy": 0.6669185206294059, "num_tokens": 18480554.0, "step": 3110 }, { "entropy": 1.680773164331913, "epoch": 5.361057382333978, "grad_norm": 0.8276366591453552, "learning_rate": 7.524e-05, "loss": 1.7203754425048827, "mean_token_accuracy": 0.6801572386175394, "num_tokens": 18533859.0, "step": 3120 }, { "entropy": 1.7824992030858993, "epoch": 5.378250591016548, "grad_norm": 0.7689419984817505, "learning_rate": 7.484e-05, "loss": 1.7929088592529296, "mean_token_accuracy": 0.6625824831426144, "num_tokens": 18591432.0, "step": 3130 }, { "entropy": 1.7943954214453697, "epoch": 5.395443799699119, "grad_norm": 0.7818305492401123, "learning_rate": 7.444e-05, "loss": 1.9027202606201172, "mean_token_accuracy": 0.6574487689882517, "num_tokens": 18653780.0, "step": 3140 }, { "entropy": 1.6858137652277947, "epoch": 5.412637008381689, "grad_norm": 0.7783890962600708, "learning_rate": 7.404e-05, "loss": 1.6952400207519531, "mean_token_accuracy": 0.681446236371994, "num_tokens": 18712089.0, "step": 3150 }, { "entropy": 1.781475655734539, "epoch": 5.42983021706426, "grad_norm": 0.8033313751220703, "learning_rate": 7.364e-05, "loss": 1.8575824737548827, "mean_token_accuracy": 0.6616954285651445, "num_tokens": 18770760.0, "step": 3160 }, { "entropy": 1.8044284671545028, "epoch": 5.44702342574683, "grad_norm": 0.8778691291809082, "learning_rate": 7.324000000000001e-05, "loss": 1.868129348754883, "mean_token_accuracy": 0.6609551507979632, "num_tokens": 18830053.0, "step": 3170 }, { "entropy": 1.7783053085207938, "epoch": 5.4642166344294, "grad_norm": 0.794116735458374, "learning_rate": 7.284000000000001e-05, "loss": 1.8013723373413086, "mean_token_accuracy": 0.6687252540141344, "num_tokens": 18889827.0, "step": 3180 }, { "entropy": 1.7062184020876885, "epoch": 5.481409843111971, "grad_norm": 0.7348354458808899, "learning_rate": 7.244e-05, "loss": 1.7668045043945313, "mean_token_accuracy": 0.6689080417156219, "num_tokens": 18948434.0, "step": 3190 }, { "entropy": 1.7791135892271996, "epoch": 5.498603051794541, "grad_norm": 0.7285153865814209, "learning_rate": 7.204000000000001e-05, "loss": 1.8126005172729491, "mean_token_accuracy": 0.6612196549773216, "num_tokens": 19009735.0, "step": 3200 }, { "entropy": 1.7333651915192605, "epoch": 5.515796260477112, "grad_norm": 0.768817126750946, "learning_rate": 7.164000000000001e-05, "loss": 1.7593820571899415, "mean_token_accuracy": 0.6708316601812839, "num_tokens": 19070839.0, "step": 3210 }, { "entropy": 1.7646871596574782, "epoch": 5.532989469159682, "grad_norm": 0.7897234559059143, "learning_rate": 7.124e-05, "loss": 1.8217975616455078, "mean_token_accuracy": 0.6624716755002737, "num_tokens": 19131701.0, "step": 3220 }, { "entropy": 1.79796422123909, "epoch": 5.550182677842252, "grad_norm": 0.7753779292106628, "learning_rate": 7.084e-05, "loss": 1.8724674224853515, "mean_token_accuracy": 0.6603185098618269, "num_tokens": 19192206.0, "step": 3230 }, { "entropy": 1.6741029411554336, "epoch": 5.567375886524823, "grad_norm": 0.7559053301811218, "learning_rate": 7.044000000000001e-05, "loss": 1.7434120178222656, "mean_token_accuracy": 0.6718135714530945, "num_tokens": 19250887.0, "step": 3240 }, { "entropy": 1.8032452374696732, "epoch": 5.584569095207393, "grad_norm": 0.8887183666229248, "learning_rate": 7.004e-05, "loss": 1.8439495086669921, "mean_token_accuracy": 0.6600725017488003, "num_tokens": 19310066.0, "step": 3250 }, { "entropy": 1.7758998274803162, "epoch": 5.601762303889964, "grad_norm": 0.7295767068862915, "learning_rate": 6.964e-05, "loss": 1.8148815155029296, "mean_token_accuracy": 0.6652825616300106, "num_tokens": 19369366.0, "step": 3260 }, { "entropy": 1.662617878615856, "epoch": 5.6189555125725335, "grad_norm": 0.7842978835105896, "learning_rate": 6.924000000000001e-05, "loss": 1.7120464324951172, "mean_token_accuracy": 0.6784614086151123, "num_tokens": 19427377.0, "step": 3270 }, { "entropy": 1.8431208834052086, "epoch": 5.636148721255104, "grad_norm": 0.8514787554740906, "learning_rate": 6.884e-05, "loss": 1.9169921875, "mean_token_accuracy": 0.6530084304511548, "num_tokens": 19488230.0, "step": 3280 }, { "entropy": 1.6891573801636697, "epoch": 5.653341929937675, "grad_norm": 0.7638376951217651, "learning_rate": 6.844e-05, "loss": 1.7445995330810546, "mean_token_accuracy": 0.6780110366642476, "num_tokens": 19546500.0, "step": 3290 }, { "entropy": 1.7722659215331078, "epoch": 5.670535138620245, "grad_norm": 0.8072571754455566, "learning_rate": 6.804e-05, "loss": 1.8334453582763672, "mean_token_accuracy": 0.6662346951663494, "num_tokens": 19604586.0, "step": 3300 }, { "entropy": 1.824295823276043, "epoch": 5.687728347302816, "grad_norm": 0.8235921263694763, "learning_rate": 6.764000000000001e-05, "loss": 1.936505126953125, "mean_token_accuracy": 0.6559072963893413, "num_tokens": 19664352.0, "step": 3310 }, { "entropy": 1.776401199400425, "epoch": 5.704921555985385, "grad_norm": 0.7260850071907043, "learning_rate": 6.724e-05, "loss": 1.8124887466430664, "mean_token_accuracy": 0.6629696622490883, "num_tokens": 19725395.0, "step": 3320 }, { "entropy": 1.8356325037777423, "epoch": 5.722114764667956, "grad_norm": 0.7962324619293213, "learning_rate": 6.684e-05, "loss": 1.865267562866211, "mean_token_accuracy": 0.6575648851692677, "num_tokens": 19786886.0, "step": 3330 }, { "entropy": 1.7975003249943255, "epoch": 5.739307973350527, "grad_norm": 0.8319332599639893, "learning_rate": 6.644000000000001e-05, "loss": 1.8043830871582032, "mean_token_accuracy": 0.6615023009479046, "num_tokens": 19846593.0, "step": 3340 }, { "entropy": 1.7411245226860046, "epoch": 5.756501182033097, "grad_norm": 0.8770884871482849, "learning_rate": 6.604e-05, "loss": 1.8150835037231445, "mean_token_accuracy": 0.6629907101392746, "num_tokens": 19901728.0, "step": 3350 }, { "entropy": 1.7868980005383492, "epoch": 5.773694390715667, "grad_norm": 0.8012292385101318, "learning_rate": 6.564e-05, "loss": 1.8411848068237304, "mean_token_accuracy": 0.6610983822494745, "num_tokens": 19962294.0, "step": 3360 }, { "entropy": 1.8151665195822715, "epoch": 5.790887599398237, "grad_norm": 0.6628616452217102, "learning_rate": 6.524e-05, "loss": 1.873934555053711, "mean_token_accuracy": 0.6563344091176987, "num_tokens": 20023732.0, "step": 3370 }, { "entropy": 1.7544716522097588, "epoch": 5.808080808080808, "grad_norm": 0.765192985534668, "learning_rate": 6.484e-05, "loss": 1.7676244735717774, "mean_token_accuracy": 0.6681830242276192, "num_tokens": 20084955.0, "step": 3380 }, { "entropy": 1.7742430947721004, "epoch": 5.8252740167633785, "grad_norm": 0.9363911747932434, "learning_rate": 6.444e-05, "loss": 1.7936756134033203, "mean_token_accuracy": 0.6640898622572422, "num_tokens": 20145523.0, "step": 3390 }, { "entropy": 1.7672609627246856, "epoch": 5.842467225445949, "grad_norm": 0.8024185299873352, "learning_rate": 6.404e-05, "loss": 1.8390132904052734, "mean_token_accuracy": 0.66049126945436, "num_tokens": 20207613.0, "step": 3400 }, { "entropy": 1.7419263988733291, "epoch": 5.85966043412852, "grad_norm": 0.7452662587165833, "learning_rate": 6.364e-05, "loss": 1.7592693328857423, "mean_token_accuracy": 0.6729626737534999, "num_tokens": 20264080.0, "step": 3410 }, { "entropy": 1.794348457455635, "epoch": 5.876853642811089, "grad_norm": 0.7698886394500732, "learning_rate": 6.324e-05, "loss": 1.8215929031372071, "mean_token_accuracy": 0.6594825953245163, "num_tokens": 20325935.0, "step": 3420 }, { "entropy": 1.8058848246932029, "epoch": 5.89404685149366, "grad_norm": 0.7813654541969299, "learning_rate": 6.284e-05, "loss": 1.8705635070800781, "mean_token_accuracy": 0.6575064200907945, "num_tokens": 20384219.0, "step": 3430 }, { "entropy": 1.8296722590923309, "epoch": 5.9112400601762305, "grad_norm": 0.7985308766365051, "learning_rate": 6.244e-05, "loss": 1.8622390747070312, "mean_token_accuracy": 0.6598837457597255, "num_tokens": 20442055.0, "step": 3440 }, { "entropy": 1.745755286514759, "epoch": 5.928433268858801, "grad_norm": 0.7957124710083008, "learning_rate": 6.204e-05, "loss": 1.778817367553711, "mean_token_accuracy": 0.6711658544838428, "num_tokens": 20500787.0, "step": 3450 }, { "entropy": 1.8037077650427817, "epoch": 5.945626477541371, "grad_norm": 0.8097943067550659, "learning_rate": 6.164e-05, "loss": 1.8763154983520507, "mean_token_accuracy": 0.6581781908869744, "num_tokens": 20560544.0, "step": 3460 }, { "entropy": 1.6478220209479333, "epoch": 5.962819686223941, "grad_norm": 0.7882372736930847, "learning_rate": 6.124e-05, "loss": 1.7081596374511718, "mean_token_accuracy": 0.6812212504446507, "num_tokens": 20621866.0, "step": 3470 }, { "entropy": 1.7625097312033176, "epoch": 5.980012894906512, "grad_norm": 0.780114471912384, "learning_rate": 6.084000000000001e-05, "loss": 1.7864303588867188, "mean_token_accuracy": 0.6720023825764656, "num_tokens": 20677413.0, "step": 3480 }, { "entropy": 1.7418652877211571, "epoch": 5.9972061035890825, "grad_norm": 0.8374961614608765, "learning_rate": 6.044000000000001e-05, "loss": 1.8426128387451173, "mean_token_accuracy": 0.6612365163862706, "num_tokens": 20737424.0, "step": 3490 }, { "entropy": 1.8112690711950328, "epoch": 6.013754566946056, "grad_norm": 0.7742412686347961, "learning_rate": 6.004000000000001e-05, "loss": 1.8320732116699219, "mean_token_accuracy": 0.6643109286760355, "num_tokens": 20795175.0, "step": 3500 }, { "entropy": 1.687648557126522, "epoch": 6.0309477756286265, "grad_norm": 0.8348304629325867, "learning_rate": 5.9640000000000005e-05, "loss": 1.7558349609375, "mean_token_accuracy": 0.6784385897219181, "num_tokens": 20852486.0, "step": 3510 }, { "entropy": 1.6863658234477044, "epoch": 6.048140984311197, "grad_norm": 0.7642632126808167, "learning_rate": 5.924000000000001e-05, "loss": 1.6536775588989259, "mean_token_accuracy": 0.680523382127285, "num_tokens": 20908597.0, "step": 3520 }, { "entropy": 1.6652932062745094, "epoch": 6.065334192993768, "grad_norm": 0.8676924109458923, "learning_rate": 5.8840000000000006e-05, "loss": 1.7443069458007812, "mean_token_accuracy": 0.6719188451766968, "num_tokens": 20966567.0, "step": 3530 }, { "entropy": 1.7391631960868836, "epoch": 6.082527401676338, "grad_norm": 0.8444374799728394, "learning_rate": 5.844e-05, "loss": 1.7849775314331056, "mean_token_accuracy": 0.672398941218853, "num_tokens": 21023832.0, "step": 3540 }, { "entropy": 1.7432220742106437, "epoch": 6.099720610358908, "grad_norm": 0.7972187995910645, "learning_rate": 5.804000000000001e-05, "loss": 1.8264921188354493, "mean_token_accuracy": 0.6713483344763518, "num_tokens": 21080325.0, "step": 3550 }, { "entropy": 1.7394985787570476, "epoch": 6.1169138190414785, "grad_norm": 0.8266369700431824, "learning_rate": 5.7640000000000004e-05, "loss": 1.819821548461914, "mean_token_accuracy": 0.6708907049149275, "num_tokens": 21143316.0, "step": 3560 }, { "entropy": 1.7923602670431138, "epoch": 6.134107027724049, "grad_norm": 0.8315872550010681, "learning_rate": 5.724000000000001e-05, "loss": 1.8086809158325194, "mean_token_accuracy": 0.665992408245802, "num_tokens": 21203848.0, "step": 3570 }, { "entropy": 1.711188006401062, "epoch": 6.15130023640662, "grad_norm": 0.8174048066139221, "learning_rate": 5.6840000000000005e-05, "loss": 1.7656991958618165, "mean_token_accuracy": 0.6711975857615471, "num_tokens": 21266260.0, "step": 3580 }, { "entropy": 1.8437035098671912, "epoch": 6.16849344508919, "grad_norm": 0.8155949711799622, "learning_rate": 5.644e-05, "loss": 1.877999496459961, "mean_token_accuracy": 0.6532085236161947, "num_tokens": 21326008.0, "step": 3590 }, { "entropy": 1.7264528393745422, "epoch": 6.18568665377176, "grad_norm": 0.7951272130012512, "learning_rate": 5.6040000000000006e-05, "loss": 1.747119140625, "mean_token_accuracy": 0.6696909107267857, "num_tokens": 21385356.0, "step": 3600 }, { "entropy": 1.68227918446064, "epoch": 6.20287986245433, "grad_norm": 0.779587984085083, "learning_rate": 5.564e-05, "loss": 1.7062965393066407, "mean_token_accuracy": 0.6786911800503731, "num_tokens": 21443231.0, "step": 3610 }, { "entropy": 1.7644565671682357, "epoch": 6.220073071136901, "grad_norm": 0.9153981804847717, "learning_rate": 5.524e-05, "loss": 1.8082721710205079, "mean_token_accuracy": 0.6671201888471842, "num_tokens": 21499309.0, "step": 3620 }, { "entropy": 1.7211210913956165, "epoch": 6.237266279819472, "grad_norm": 0.8166586756706238, "learning_rate": 5.4840000000000003e-05, "loss": 1.769371795654297, "mean_token_accuracy": 0.6694241009652615, "num_tokens": 21558565.0, "step": 3630 }, { "entropy": 1.7693689942359925, "epoch": 6.254459488502041, "grad_norm": 0.7773623466491699, "learning_rate": 5.444e-05, "loss": 1.848412322998047, "mean_token_accuracy": 0.66685731112957, "num_tokens": 21618504.0, "step": 3640 }, { "entropy": 1.8090675905346871, "epoch": 6.271652697184612, "grad_norm": 0.9420453310012817, "learning_rate": 5.4040000000000004e-05, "loss": 1.8266836166381837, "mean_token_accuracy": 0.6643423162400722, "num_tokens": 21676861.0, "step": 3650 }, { "entropy": 1.7340097561478616, "epoch": 6.288845905867182, "grad_norm": 0.805880069732666, "learning_rate": 5.364e-05, "loss": 1.7760274887084961, "mean_token_accuracy": 0.6729184173047542, "num_tokens": 21734874.0, "step": 3660 }, { "entropy": 1.733542764186859, "epoch": 6.306039114549753, "grad_norm": 0.7459798455238342, "learning_rate": 5.324e-05, "loss": 1.7874065399169923, "mean_token_accuracy": 0.6733234331011773, "num_tokens": 21797467.0, "step": 3670 }, { "entropy": 1.6855479300022125, "epoch": 6.3232323232323235, "grad_norm": 0.7362611889839172, "learning_rate": 5.284e-05, "loss": 1.7557338714599608, "mean_token_accuracy": 0.6742986045777798, "num_tokens": 21856704.0, "step": 3680 }, { "entropy": 1.762756396830082, "epoch": 6.340425531914893, "grad_norm": 0.8349901437759399, "learning_rate": 5.244e-05, "loss": 1.784174346923828, "mean_token_accuracy": 0.6732991166412831, "num_tokens": 21915781.0, "step": 3690 }, { "entropy": 1.7664957396686076, "epoch": 6.357618740597464, "grad_norm": 0.8295337557792664, "learning_rate": 5.204e-05, "loss": 1.8338695526123048, "mean_token_accuracy": 0.6659718155860901, "num_tokens": 21973568.0, "step": 3700 }, { "entropy": 1.7744196206331253, "epoch": 6.374811949280034, "grad_norm": 0.739115297794342, "learning_rate": 5.164e-05, "loss": 1.8148929595947265, "mean_token_accuracy": 0.6660460762679576, "num_tokens": 22032979.0, "step": 3710 }, { "entropy": 1.7459667712450027, "epoch": 6.392005157962605, "grad_norm": 0.7716593146324158, "learning_rate": 5.124e-05, "loss": 1.8079204559326172, "mean_token_accuracy": 0.66551748290658, "num_tokens": 22092283.0, "step": 3720 }, { "entropy": 1.7491293936967849, "epoch": 6.4091983666451755, "grad_norm": 0.8270374536514282, "learning_rate": 5.084e-05, "loss": 1.8020380020141602, "mean_token_accuracy": 0.6673273537307978, "num_tokens": 22150667.0, "step": 3730 }, { "entropy": 1.6887403331696986, "epoch": 6.426391575327745, "grad_norm": 0.8306758403778076, "learning_rate": 5.044e-05, "loss": 1.7328964233398438, "mean_token_accuracy": 0.676455694437027, "num_tokens": 22211170.0, "step": 3740 }, { "entropy": 1.8332835257053375, "epoch": 6.443584784010316, "grad_norm": 0.8369497656822205, "learning_rate": 5.0039999999999995e-05, "loss": 1.913273239135742, "mean_token_accuracy": 0.656198850646615, "num_tokens": 22269928.0, "step": 3750 }, { "entropy": 1.6914366707205772, "epoch": 6.460777992692886, "grad_norm": 0.7562059164047241, "learning_rate": 4.9640000000000006e-05, "loss": 1.7506240844726562, "mean_token_accuracy": 0.67936124317348, "num_tokens": 22328611.0, "step": 3760 }, { "entropy": 1.7604179099202155, "epoch": 6.477971201375457, "grad_norm": 0.7541300058364868, "learning_rate": 4.924e-05, "loss": 1.8065948486328125, "mean_token_accuracy": 0.6697364591062069, "num_tokens": 22389219.0, "step": 3770 }, { "entropy": 1.731757602095604, "epoch": 6.4951644100580275, "grad_norm": 0.8319364190101624, "learning_rate": 4.884e-05, "loss": 1.7902181625366211, "mean_token_accuracy": 0.6673447206616402, "num_tokens": 22449858.0, "step": 3780 }, { "entropy": 1.7152166068553925, "epoch": 6.512357618740597, "grad_norm": 0.8575091361999512, "learning_rate": 4.8440000000000004e-05, "loss": 1.7424659729003906, "mean_token_accuracy": 0.6707747709006071, "num_tokens": 22509375.0, "step": 3790 }, { "entropy": 1.6641680032014847, "epoch": 6.529550827423168, "grad_norm": 0.7516652345657349, "learning_rate": 4.804e-05, "loss": 1.6937873840332032, "mean_token_accuracy": 0.6811798132956028, "num_tokens": 22566440.0, "step": 3800 }, { "entropy": 1.7551555022597314, "epoch": 6.546744036105738, "grad_norm": 0.817863941192627, "learning_rate": 4.7640000000000005e-05, "loss": 1.8282489776611328, "mean_token_accuracy": 0.6655839093029499, "num_tokens": 22627900.0, "step": 3810 }, { "entropy": 1.7025569766759872, "epoch": 6.563937244788309, "grad_norm": 0.757764458656311, "learning_rate": 4.724e-05, "loss": 1.7325496673583984, "mean_token_accuracy": 0.6785391330718994, "num_tokens": 22685738.0, "step": 3820 }, { "entropy": 1.699775031208992, "epoch": 6.5811304534708785, "grad_norm": 0.7960421442985535, "learning_rate": 4.684e-05, "loss": 1.7602745056152345, "mean_token_accuracy": 0.6698532458394766, "num_tokens": 22745696.0, "step": 3830 }, { "entropy": 1.8100605458021164, "epoch": 6.598323662153449, "grad_norm": 0.8477244973182678, "learning_rate": 4.644e-05, "loss": 1.8226333618164063, "mean_token_accuracy": 0.6646727129817009, "num_tokens": 22805783.0, "step": 3840 }, { "entropy": 1.7685839846730231, "epoch": 6.61551687083602, "grad_norm": 0.7853493690490723, "learning_rate": 4.604e-05, "loss": 1.8230281829833985, "mean_token_accuracy": 0.664577030390501, "num_tokens": 22866822.0, "step": 3850 }, { "entropy": 1.7810854628682136, "epoch": 6.63271007951859, "grad_norm": 0.7139444351196289, "learning_rate": 4.564e-05, "loss": 1.855198287963867, "mean_token_accuracy": 0.6652711797505617, "num_tokens": 22928790.0, "step": 3860 }, { "entropy": 1.7815292954444886, "epoch": 6.649903288201161, "grad_norm": 0.7039018869400024, "learning_rate": 4.524000000000001e-05, "loss": 1.845859909057617, "mean_token_accuracy": 0.6595252249389887, "num_tokens": 22990170.0, "step": 3870 }, { "entropy": 1.7107908308506012, "epoch": 6.667096496883731, "grad_norm": 0.7651708126068115, "learning_rate": 4.4840000000000004e-05, "loss": 1.7340824127197265, "mean_token_accuracy": 0.6750431463122368, "num_tokens": 23047902.0, "step": 3880 }, { "entropy": 1.7069460928440094, "epoch": 6.684289705566301, "grad_norm": 0.7385950088500977, "learning_rate": 4.444e-05, "loss": 1.758881187438965, "mean_token_accuracy": 0.6745327576994896, "num_tokens": 23112106.0, "step": 3890 }, { "entropy": 1.821124967932701, "epoch": 6.701482914248872, "grad_norm": 0.7827627658843994, "learning_rate": 4.4040000000000005e-05, "loss": 1.913480567932129, "mean_token_accuracy": 0.6593531377613544, "num_tokens": 23170056.0, "step": 3900 }, { "entropy": 1.7924881175160408, "epoch": 6.718676122931442, "grad_norm": 0.8166612386703491, "learning_rate": 4.364e-05, "loss": 1.855017852783203, "mean_token_accuracy": 0.6593458168208599, "num_tokens": 23228582.0, "step": 3910 }, { "entropy": 1.736910080909729, "epoch": 6.735869331614013, "grad_norm": 0.779629647731781, "learning_rate": 4.324e-05, "loss": 1.7581821441650392, "mean_token_accuracy": 0.6779871381819248, "num_tokens": 23288702.0, "step": 3920 }, { "entropy": 1.6776573412120341, "epoch": 6.7530625402965825, "grad_norm": 0.7625913619995117, "learning_rate": 4.284e-05, "loss": 1.7102031707763672, "mean_token_accuracy": 0.6794889360666275, "num_tokens": 23349004.0, "step": 3930 }, { "entropy": 1.8100020587444305, "epoch": 6.770255748979153, "grad_norm": 0.7499405145645142, "learning_rate": 4.244e-05, "loss": 1.8514158248901367, "mean_token_accuracy": 0.6620845705270767, "num_tokens": 23410874.0, "step": 3940 }, { "entropy": 1.697011759877205, "epoch": 6.787448957661724, "grad_norm": 0.736323893070221, "learning_rate": 4.2040000000000004e-05, "loss": 1.7609180450439452, "mean_token_accuracy": 0.6772994473576546, "num_tokens": 23472518.0, "step": 3950 }, { "entropy": 1.764576494693756, "epoch": 6.804642166344294, "grad_norm": 0.8523833751678467, "learning_rate": 4.164e-05, "loss": 1.81484375, "mean_token_accuracy": 0.6644324712455273, "num_tokens": 23531203.0, "step": 3960 }, { "entropy": 1.7241224959492683, "epoch": 6.821835375026865, "grad_norm": 0.8820350766181946, "learning_rate": 4.124e-05, "loss": 1.739130401611328, "mean_token_accuracy": 0.6771424360573292, "num_tokens": 23590289.0, "step": 3970 }, { "entropy": 1.6967746496200562, "epoch": 6.8390285837094345, "grad_norm": 0.8161067962646484, "learning_rate": 4.084e-05, "loss": 1.7659534454345702, "mean_token_accuracy": 0.6744477659463882, "num_tokens": 23647985.0, "step": 3980 }, { "entropy": 1.8578275874257089, "epoch": 6.856221792392005, "grad_norm": 0.778160810470581, "learning_rate": 4.044e-05, "loss": 1.9046249389648438, "mean_token_accuracy": 0.6525318272411823, "num_tokens": 23707387.0, "step": 3990 }, { "entropy": 1.781902502477169, "epoch": 6.873415001074576, "grad_norm": 0.9398592710494995, "learning_rate": 4.004e-05, "loss": 1.8081722259521484, "mean_token_accuracy": 0.6625144556164742, "num_tokens": 23764831.0, "step": 4000 }, { "entropy": 1.699565550684929, "epoch": 6.890608209757146, "grad_norm": 0.7662839889526367, "learning_rate": 3.964e-05, "loss": 1.7373327255249023, "mean_token_accuracy": 0.6809282444417477, "num_tokens": 23825367.0, "step": 4010 }, { "entropy": 1.6455101184546947, "epoch": 6.907801418439716, "grad_norm": 0.7619901299476624, "learning_rate": 3.9240000000000004e-05, "loss": 1.709805679321289, "mean_token_accuracy": 0.6812954246997833, "num_tokens": 23887369.0, "step": 4020 }, { "entropy": 1.7952800825238229, "epoch": 6.924994627122286, "grad_norm": 0.7858437299728394, "learning_rate": 3.884e-05, "loss": 1.8688398361206056, "mean_token_accuracy": 0.6621494639664889, "num_tokens": 23949358.0, "step": 4030 }, { "entropy": 1.772008201479912, "epoch": 6.942187835804857, "grad_norm": 0.7586779594421387, "learning_rate": 3.8440000000000005e-05, "loss": 1.798760986328125, "mean_token_accuracy": 0.667642817273736, "num_tokens": 24009691.0, "step": 4040 }, { "entropy": 1.7289930269122125, "epoch": 6.959381044487428, "grad_norm": 0.854505717754364, "learning_rate": 3.804e-05, "loss": 1.771562385559082, "mean_token_accuracy": 0.6692178774625063, "num_tokens": 24064506.0, "step": 4050 }, { "entropy": 1.715189914405346, "epoch": 6.976574253169998, "grad_norm": 0.758488655090332, "learning_rate": 3.7640000000000006e-05, "loss": 1.756412887573242, "mean_token_accuracy": 0.6710222817957401, "num_tokens": 24126841.0, "step": 4060 }, { "entropy": 1.7383173301815986, "epoch": 6.993767461852569, "grad_norm": 0.7450618147850037, "learning_rate": 3.724e-05, "loss": 1.7997669219970702, "mean_token_accuracy": 0.6649864386767149, "num_tokens": 24186159.0, "step": 4070 }, { "entropy": 1.7172312767474682, "epoch": 7.010315925209542, "grad_norm": 0.8475770950317383, "learning_rate": 3.684e-05, "loss": 1.7585922241210938, "mean_token_accuracy": 0.6746863397684965, "num_tokens": 24239759.0, "step": 4080 }, { "entropy": 1.7192407630383968, "epoch": 7.027509133892113, "grad_norm": 0.7818967700004578, "learning_rate": 3.6440000000000003e-05, "loss": 1.7634265899658204, "mean_token_accuracy": 0.6724576361477375, "num_tokens": 24298775.0, "step": 4090 }, { "entropy": 1.7496131911873818, "epoch": 7.044702342574683, "grad_norm": 0.8118335008621216, "learning_rate": 3.604e-05, "loss": 1.802253532409668, "mean_token_accuracy": 0.6702191606163979, "num_tokens": 24361142.0, "step": 4100 }, { "entropy": 1.7090509735047816, "epoch": 7.061895551257253, "grad_norm": 0.8414726257324219, "learning_rate": 3.5640000000000004e-05, "loss": 1.7347373962402344, "mean_token_accuracy": 0.679864277690649, "num_tokens": 24419838.0, "step": 4110 }, { "entropy": 1.6807728812098504, "epoch": 7.079088759939824, "grad_norm": 0.8567139506340027, "learning_rate": 3.524e-05, "loss": 1.7365150451660156, "mean_token_accuracy": 0.6765194039791822, "num_tokens": 24477518.0, "step": 4120 }, { "entropy": 1.709678091108799, "epoch": 7.096281968622394, "grad_norm": 0.8345620036125183, "learning_rate": 3.484e-05, "loss": 1.730575180053711, "mean_token_accuracy": 0.6709145799279213, "num_tokens": 24534560.0, "step": 4130 }, { "entropy": 1.6541544690728187, "epoch": 7.113475177304965, "grad_norm": 0.8509814143180847, "learning_rate": 3.444e-05, "loss": 1.6795757293701172, "mean_token_accuracy": 0.6856038823723793, "num_tokens": 24594829.0, "step": 4140 }, { "entropy": 1.7498343527317046, "epoch": 7.130668385987535, "grad_norm": 0.8674039244651794, "learning_rate": 3.404e-05, "loss": 1.8083892822265626, "mean_token_accuracy": 0.6709578204900026, "num_tokens": 24656798.0, "step": 4150 }, { "entropy": 1.677807478606701, "epoch": 7.147861594670105, "grad_norm": 0.8016234040260315, "learning_rate": 3.3639999999999996e-05, "loss": 1.7206790924072266, "mean_token_accuracy": 0.6754934191703796, "num_tokens": 24714009.0, "step": 4160 }, { "entropy": 1.672835360467434, "epoch": 7.1650548033526755, "grad_norm": 0.7139334082603455, "learning_rate": 3.324e-05, "loss": 1.7049163818359374, "mean_token_accuracy": 0.6851269513368606, "num_tokens": 24778022.0, "step": 4170 }, { "entropy": 1.6577355667948723, "epoch": 7.182248012035246, "grad_norm": 0.9129847288131714, "learning_rate": 3.2840000000000004e-05, "loss": 1.7073640823364258, "mean_token_accuracy": 0.6768647953867912, "num_tokens": 24837669.0, "step": 4180 }, { "entropy": 1.7049853071570396, "epoch": 7.199441220717817, "grad_norm": 0.7545643448829651, "learning_rate": 3.244e-05, "loss": 1.754374122619629, "mean_token_accuracy": 0.6808854278177023, "num_tokens": 24898991.0, "step": 4190 }, { "entropy": 1.6785477355122567, "epoch": 7.216634429400387, "grad_norm": 0.8802333474159241, "learning_rate": 3.2040000000000005e-05, "loss": 1.6974828720092774, "mean_token_accuracy": 0.6824289247393608, "num_tokens": 24957348.0, "step": 4200 }, { "entropy": 1.7312355414032936, "epoch": 7.233827638082957, "grad_norm": 0.8227038383483887, "learning_rate": 3.164e-05, "loss": 1.7645183563232423, "mean_token_accuracy": 0.6661410238593817, "num_tokens": 25016658.0, "step": 4210 }, { "entropy": 1.8124181643128394, "epoch": 7.2510208467655275, "grad_norm": 0.8563106060028076, "learning_rate": 3.1240000000000006e-05, "loss": 1.8163776397705078, "mean_token_accuracy": 0.6610642150044441, "num_tokens": 25074658.0, "step": 4220 }, { "entropy": 1.776869924366474, "epoch": 7.268214055448098, "grad_norm": 0.8615058064460754, "learning_rate": 3.084e-05, "loss": 1.861563491821289, "mean_token_accuracy": 0.6624562762677669, "num_tokens": 25132732.0, "step": 4230 }, { "entropy": 1.742109003663063, "epoch": 7.285407264130669, "grad_norm": 0.7851050496101379, "learning_rate": 3.0440000000000003e-05, "loss": 1.7527351379394531, "mean_token_accuracy": 0.6712357953190804, "num_tokens": 25194009.0, "step": 4240 }, { "entropy": 1.7356494843959809, "epoch": 7.302600472813239, "grad_norm": 0.8842288255691528, "learning_rate": 3.004e-05, "loss": 1.8091196060180663, "mean_token_accuracy": 0.6680308949202299, "num_tokens": 25250681.0, "step": 4250 }, { "entropy": 1.714112138748169, "epoch": 7.319793681495809, "grad_norm": 0.8050926923751831, "learning_rate": 2.964e-05, "loss": 1.741617774963379, "mean_token_accuracy": 0.6764710985124112, "num_tokens": 25307119.0, "step": 4260 }, { "entropy": 1.7806825146079064, "epoch": 7.3369868901783795, "grad_norm": 0.755797803401947, "learning_rate": 2.924e-05, "loss": 1.8448747634887694, "mean_token_accuracy": 0.6646751999855042, "num_tokens": 25365721.0, "step": 4270 }, { "entropy": 1.7478718511760234, "epoch": 7.35418009886095, "grad_norm": 0.8148614764213562, "learning_rate": 2.8840000000000002e-05, "loss": 1.8303293228149413, "mean_token_accuracy": 0.6662985436618328, "num_tokens": 25423309.0, "step": 4280 }, { "entropy": 1.6996045634150505, "epoch": 7.371373307543521, "grad_norm": 0.7613778114318848, "learning_rate": 2.844e-05, "loss": 1.7077817916870117, "mean_token_accuracy": 0.679437268525362, "num_tokens": 25480080.0, "step": 4290 }, { "entropy": 1.8055237784981728, "epoch": 7.38856651622609, "grad_norm": 0.899900496006012, "learning_rate": 2.804e-05, "loss": 1.882634735107422, "mean_token_accuracy": 0.659589122608304, "num_tokens": 25538885.0, "step": 4300 }, { "entropy": 1.6835025876760483, "epoch": 7.405759724908661, "grad_norm": 0.7718909382820129, "learning_rate": 2.764e-05, "loss": 1.7145641326904297, "mean_token_accuracy": 0.6805526971817016, "num_tokens": 25598830.0, "step": 4310 }, { "entropy": 1.7392980232834816, "epoch": 7.422952933591231, "grad_norm": 0.7144562005996704, "learning_rate": 2.724e-05, "loss": 1.7779796600341797, "mean_token_accuracy": 0.6709600411355495, "num_tokens": 25660275.0, "step": 4320 }, { "entropy": 1.7193088322877883, "epoch": 7.440146142273802, "grad_norm": 0.8038010001182556, "learning_rate": 2.6840000000000004e-05, "loss": 1.7928234100341798, "mean_token_accuracy": 0.6767275612801313, "num_tokens": 25719958.0, "step": 4330 }, { "entropy": 1.7314304433763028, "epoch": 7.457339350956373, "grad_norm": 0.7783089876174927, "learning_rate": 2.6440000000000004e-05, "loss": 1.7952003479003906, "mean_token_accuracy": 0.6740467935800553, "num_tokens": 25776689.0, "step": 4340 }, { "entropy": 1.74028614833951, "epoch": 7.474532559638942, "grad_norm": 0.8052565455436707, "learning_rate": 2.6040000000000005e-05, "loss": 1.7803146362304687, "mean_token_accuracy": 0.6733121275901794, "num_tokens": 25837916.0, "step": 4350 }, { "entropy": 1.6831192195415496, "epoch": 7.491725768321513, "grad_norm": 0.8941977024078369, "learning_rate": 2.5640000000000002e-05, "loss": 1.7077743530273437, "mean_token_accuracy": 0.6749852932989597, "num_tokens": 25896712.0, "step": 4360 }, { "entropy": 1.7840609520673751, "epoch": 7.508918977004083, "grad_norm": 0.818671703338623, "learning_rate": 2.5240000000000002e-05, "loss": 1.8329656600952149, "mean_token_accuracy": 0.6679215718060731, "num_tokens": 25958383.0, "step": 4370 }, { "entropy": 1.76528559923172, "epoch": 7.526112185686654, "grad_norm": 0.7579294443130493, "learning_rate": 2.4840000000000003e-05, "loss": 1.7914703369140625, "mean_token_accuracy": 0.6695499271154404, "num_tokens": 26017754.0, "step": 4380 }, { "entropy": 1.704708030819893, "epoch": 7.5433053943692245, "grad_norm": 0.8200159668922424, "learning_rate": 2.4440000000000003e-05, "loss": 1.774311637878418, "mean_token_accuracy": 0.6739427134394645, "num_tokens": 26075760.0, "step": 4390 }, { "entropy": 1.7540104657411575, "epoch": 7.560498603051794, "grad_norm": 0.8373399972915649, "learning_rate": 2.404e-05, "loss": 1.796240997314453, "mean_token_accuracy": 0.6640590511262416, "num_tokens": 26133858.0, "step": 4400 }, { "entropy": 1.754172220826149, "epoch": 7.577691811734365, "grad_norm": 0.7368677258491516, "learning_rate": 2.364e-05, "loss": 1.8175994873046875, "mean_token_accuracy": 0.6717667855322361, "num_tokens": 26197518.0, "step": 4410 }, { "entropy": 1.6564558774232865, "epoch": 7.594885020416935, "grad_norm": 0.8868939280509949, "learning_rate": 2.324e-05, "loss": 1.669070053100586, "mean_token_accuracy": 0.6839951984584332, "num_tokens": 26250823.0, "step": 4420 }, { "entropy": 1.7594470486044884, "epoch": 7.612078229099506, "grad_norm": 0.86412513256073, "learning_rate": 2.284e-05, "loss": 1.8095222473144532, "mean_token_accuracy": 0.666244950518012, "num_tokens": 26312548.0, "step": 4430 }, { "entropy": 1.7646627604961396, "epoch": 7.6292714377820765, "grad_norm": 0.7128214836120605, "learning_rate": 2.244e-05, "loss": 1.832158660888672, "mean_token_accuracy": 0.6679420609027147, "num_tokens": 26376747.0, "step": 4440 }, { "entropy": 1.7401177063584328, "epoch": 7.646464646464646, "grad_norm": 0.7479432225227356, "learning_rate": 2.2040000000000002e-05, "loss": 1.7779264450073242, "mean_token_accuracy": 0.6710429213941097, "num_tokens": 26438907.0, "step": 4450 }, { "entropy": 1.6960709124803544, "epoch": 7.663657855147217, "grad_norm": 0.8182732462882996, "learning_rate": 2.1640000000000003e-05, "loss": 1.7709745407104491, "mean_token_accuracy": 0.6782359674572944, "num_tokens": 26499840.0, "step": 4460 }, { "entropy": 1.8024938970804214, "epoch": 7.680851063829787, "grad_norm": 0.8208670020103455, "learning_rate": 2.124e-05, "loss": 1.8752277374267579, "mean_token_accuracy": 0.6610838636755944, "num_tokens": 26561739.0, "step": 4470 }, { "entropy": 1.6679524429142476, "epoch": 7.698044272512358, "grad_norm": 0.7669119834899902, "learning_rate": 2.084e-05, "loss": 1.6840700149536132, "mean_token_accuracy": 0.6839361816644669, "num_tokens": 26618997.0, "step": 4480 }, { "entropy": 1.669876104593277, "epoch": 7.715237481194928, "grad_norm": 0.8296427130699158, "learning_rate": 2.044e-05, "loss": 1.6926704406738282, "mean_token_accuracy": 0.6837400387972593, "num_tokens": 26677617.0, "step": 4490 }, { "entropy": 1.7478768080472946, "epoch": 7.732430689877498, "grad_norm": 0.9231081008911133, "learning_rate": 2.004e-05, "loss": 1.8043970108032226, "mean_token_accuracy": 0.6680058591067791, "num_tokens": 26735542.0, "step": 4500 }, { "entropy": 1.7587152615189552, "epoch": 7.749623898560069, "grad_norm": 0.8131846189498901, "learning_rate": 1.9640000000000002e-05, "loss": 1.798016357421875, "mean_token_accuracy": 0.6655693002045154, "num_tokens": 26796245.0, "step": 4510 }, { "entropy": 1.7238084524869919, "epoch": 7.766817107242639, "grad_norm": 0.8774024248123169, "learning_rate": 1.924e-05, "loss": 1.7398443222045898, "mean_token_accuracy": 0.6723451249301433, "num_tokens": 26852843.0, "step": 4520 }, { "entropy": 1.8012757793068885, "epoch": 7.78401031592521, "grad_norm": 0.881601095199585, "learning_rate": 1.8840000000000003e-05, "loss": 1.851584243774414, "mean_token_accuracy": 0.6612551022320986, "num_tokens": 26912327.0, "step": 4530 }, { "entropy": 1.7035338878631592, "epoch": 7.8012035246077795, "grad_norm": 0.8460244536399841, "learning_rate": 1.8440000000000003e-05, "loss": 1.7524948120117188, "mean_token_accuracy": 0.6760960537940264, "num_tokens": 26971076.0, "step": 4540 }, { "entropy": 1.6795054778456688, "epoch": 7.81839673329035, "grad_norm": 0.7720061540603638, "learning_rate": 1.804e-05, "loss": 1.70491943359375, "mean_token_accuracy": 0.6768644891679287, "num_tokens": 27031120.0, "step": 4550 }, { "entropy": 1.775759120285511, "epoch": 7.835589941972921, "grad_norm": 0.8407703638076782, "learning_rate": 1.764e-05, "loss": 1.8208852767944337, "mean_token_accuracy": 0.6638765886425972, "num_tokens": 27089926.0, "step": 4560 }, { "entropy": 1.7749223679304122, "epoch": 7.852783150655491, "grad_norm": 0.8033788204193115, "learning_rate": 1.724e-05, "loss": 1.8128280639648438, "mean_token_accuracy": 0.6697524327784776, "num_tokens": 27155776.0, "step": 4570 }, { "entropy": 1.7019891321659089, "epoch": 7.869976359338062, "grad_norm": 0.8756063580513, "learning_rate": 1.684e-05, "loss": 1.752833366394043, "mean_token_accuracy": 0.6720911644399166, "num_tokens": 27213676.0, "step": 4580 }, { "entropy": 1.7089907452464104, "epoch": 7.8871695680206315, "grad_norm": 0.8547044396400452, "learning_rate": 1.644e-05, "loss": 1.7329090118408204, "mean_token_accuracy": 0.6730512753129005, "num_tokens": 27273812.0, "step": 4590 }, { "entropy": 1.8000069722533225, "epoch": 7.904362776703202, "grad_norm": 0.8191949725151062, "learning_rate": 1.604e-05, "loss": 1.8508378982543945, "mean_token_accuracy": 0.6602330446243286, "num_tokens": 27334482.0, "step": 4600 }, { "entropy": 1.6531485810875892, "epoch": 7.921555985385773, "grad_norm": 0.7952063679695129, "learning_rate": 1.5640000000000003e-05, "loss": 1.6732818603515625, "mean_token_accuracy": 0.6840143203735352, "num_tokens": 27390777.0, "step": 4610 }, { "entropy": 1.7451679170131684, "epoch": 7.938749194068343, "grad_norm": 0.7736355066299438, "learning_rate": 1.5240000000000001e-05, "loss": 1.836105728149414, "mean_token_accuracy": 0.6631482250988483, "num_tokens": 27452458.0, "step": 4620 }, { "entropy": 1.6219932287931442, "epoch": 7.955942402750914, "grad_norm": 0.7429597973823547, "learning_rate": 1.4840000000000002e-05, "loss": 1.6252763748168946, "mean_token_accuracy": 0.6922797068953515, "num_tokens": 27510793.0, "step": 4630 }, { "entropy": 1.7097622737288476, "epoch": 7.9731356114334835, "grad_norm": 0.7546749114990234, "learning_rate": 1.444e-05, "loss": 1.7529830932617188, "mean_token_accuracy": 0.6756818048655987, "num_tokens": 27570434.0, "step": 4640 }, { "entropy": 1.7681476891040802, "epoch": 7.990328820116054, "grad_norm": 0.8919919729232788, "learning_rate": 1.4040000000000001e-05, "loss": 1.8469413757324218, "mean_token_accuracy": 0.6651480123400688, "num_tokens": 27632017.0, "step": 4650 }, { "entropy": 1.7464849283168842, "epoch": 8.006877283473028, "grad_norm": 0.8629288077354431, "learning_rate": 1.364e-05, "loss": 1.7770162582397462, "mean_token_accuracy": 0.6717489861048661, "num_tokens": 27687721.0, "step": 4660 }, { "entropy": 1.733792708069086, "epoch": 8.024070492155598, "grad_norm": 0.8012450337409973, "learning_rate": 1.324e-05, "loss": 1.7535259246826171, "mean_token_accuracy": 0.6781957261264324, "num_tokens": 27748609.0, "step": 4670 }, { "entropy": 1.673891542851925, "epoch": 8.041263700838169, "grad_norm": 0.8763530850410461, "learning_rate": 1.2839999999999999e-05, "loss": 1.7353546142578125, "mean_token_accuracy": 0.6773874297738075, "num_tokens": 27805200.0, "step": 4680 }, { "entropy": 1.6245143353939056, "epoch": 8.05845690952074, "grad_norm": 0.7880796194076538, "learning_rate": 1.244e-05, "loss": 1.6489152908325195, "mean_token_accuracy": 0.6891307681798935, "num_tokens": 27866189.0, "step": 4690 }, { "entropy": 1.7772031486034394, "epoch": 8.07565011820331, "grad_norm": 0.894481360912323, "learning_rate": 1.204e-05, "loss": 1.8237220764160156, "mean_token_accuracy": 0.6645158022642136, "num_tokens": 27929040.0, "step": 4700 }, { "entropy": 1.6911936491727828, "epoch": 8.09284332688588, "grad_norm": 0.8212205171585083, "learning_rate": 1.164e-05, "loss": 1.718613624572754, "mean_token_accuracy": 0.6778515942394734, "num_tokens": 27989259.0, "step": 4710 }, { "entropy": 1.7341958984732628, "epoch": 8.110036535568451, "grad_norm": 0.8757619261741638, "learning_rate": 1.124e-05, "loss": 1.83496150970459, "mean_token_accuracy": 0.67105031311512, "num_tokens": 28051037.0, "step": 4720 }, { "entropy": 1.6540620133280755, "epoch": 8.127229744251022, "grad_norm": 0.6871177554130554, "learning_rate": 1.084e-05, "loss": 1.6868721008300782, "mean_token_accuracy": 0.6824644193053245, "num_tokens": 28117218.0, "step": 4730 }, { "entropy": 1.7760244339704514, "epoch": 8.144422952933592, "grad_norm": 0.8672593832015991, "learning_rate": 1.0440000000000002e-05, "loss": 1.8467548370361329, "mean_token_accuracy": 0.6605620160698891, "num_tokens": 28176643.0, "step": 4740 }, { "entropy": 1.6998422421514987, "epoch": 8.16161616161616, "grad_norm": 0.9853087663650513, "learning_rate": 1.004e-05, "loss": 1.7283611297607422, "mean_token_accuracy": 0.6775359824299813, "num_tokens": 28234550.0, "step": 4750 }, { "entropy": 1.7665151111781596, "epoch": 8.178809370298731, "grad_norm": 0.8272210955619812, "learning_rate": 9.640000000000001e-06, "loss": 1.8442218780517579, "mean_token_accuracy": 0.6675057601183653, "num_tokens": 28292004.0, "step": 4760 }, { "entropy": 1.7351939789950848, "epoch": 8.196002578981302, "grad_norm": 0.8758223652839661, "learning_rate": 9.24e-06, "loss": 1.7823253631591798, "mean_token_accuracy": 0.6717655852437019, "num_tokens": 28351089.0, "step": 4770 }, { "entropy": 1.7320286817848682, "epoch": 8.213195787663873, "grad_norm": 0.8538162708282471, "learning_rate": 8.840000000000002e-06, "loss": 1.758108139038086, "mean_token_accuracy": 0.6750058546662331, "num_tokens": 28411108.0, "step": 4780 }, { "entropy": 1.7250167533755303, "epoch": 8.230388996346443, "grad_norm": 0.8055081963539124, "learning_rate": 8.44e-06, "loss": 1.7342365264892579, "mean_token_accuracy": 0.6727670766413212, "num_tokens": 28469910.0, "step": 4790 }, { "entropy": 1.6715928614139557, "epoch": 8.247582205029014, "grad_norm": 0.8282851576805115, "learning_rate": 8.040000000000001e-06, "loss": 1.7284685134887696, "mean_token_accuracy": 0.6803247310221195, "num_tokens": 28528732.0, "step": 4800 }, { "entropy": 1.7717369854450227, "epoch": 8.264775413711584, "grad_norm": 0.7199074029922485, "learning_rate": 7.64e-06, "loss": 1.8089387893676758, "mean_token_accuracy": 0.6684400778263807, "num_tokens": 28591231.0, "step": 4810 }, { "entropy": 1.6829568967223167, "epoch": 8.281968622394155, "grad_norm": 0.8212400674819946, "learning_rate": 7.240000000000001e-06, "loss": 1.6901424407958985, "mean_token_accuracy": 0.6812582932412624, "num_tokens": 28651538.0, "step": 4820 }, { "entropy": 1.7792557999491692, "epoch": 8.299161831076725, "grad_norm": 0.8251553773880005, "learning_rate": 6.840000000000001e-06, "loss": 1.8440101623535157, "mean_token_accuracy": 0.6635224357247352, "num_tokens": 28713818.0, "step": 4830 }, { "entropy": 1.6888219453394413, "epoch": 8.316355039759294, "grad_norm": 0.799067497253418, "learning_rate": 6.44e-06, "loss": 1.7452951431274415, "mean_token_accuracy": 0.6766478583216667, "num_tokens": 28771713.0, "step": 4840 }, { "entropy": 1.6663143932819366, "epoch": 8.333548248441865, "grad_norm": 0.7468796968460083, "learning_rate": 6.040000000000001e-06, "loss": 1.6975286483764649, "mean_token_accuracy": 0.6818139903247357, "num_tokens": 28833584.0, "step": 4850 }, { "entropy": 1.736840507388115, "epoch": 8.350741457124435, "grad_norm": 0.9168211817741394, "learning_rate": 5.64e-06, "loss": 1.8019765853881835, "mean_token_accuracy": 0.6729365028440952, "num_tokens": 28891158.0, "step": 4860 }, { "entropy": 1.7159839145839215, "epoch": 8.367934665807006, "grad_norm": 0.8348814249038696, "learning_rate": 5.240000000000001e-06, "loss": 1.7910118103027344, "mean_token_accuracy": 0.67631860896945, "num_tokens": 28948026.0, "step": 4870 }, { "entropy": 1.7169093780219555, "epoch": 8.385127874489577, "grad_norm": 0.8493881821632385, "learning_rate": 4.84e-06, "loss": 1.7167430877685548, "mean_token_accuracy": 0.6753393478691578, "num_tokens": 29005197.0, "step": 4880 }, { "entropy": 1.6801239594817161, "epoch": 8.402321083172147, "grad_norm": 0.8069011569023132, "learning_rate": 4.440000000000001e-06, "loss": 1.6674100875854492, "mean_token_accuracy": 0.681441531330347, "num_tokens": 29062454.0, "step": 4890 }, { "entropy": 1.7267012923955918, "epoch": 8.419514291854718, "grad_norm": 0.8063756823539734, "learning_rate": 4.04e-06, "loss": 1.7544673919677733, "mean_token_accuracy": 0.6745367147028446, "num_tokens": 29121055.0, "step": 4900 }, { "entropy": 1.6062462359666825, "epoch": 8.436707500537288, "grad_norm": 0.8285024762153625, "learning_rate": 3.6400000000000003e-06, "loss": 1.6273128509521484, "mean_token_accuracy": 0.690464211255312, "num_tokens": 29176963.0, "step": 4910 }, { "entropy": 1.7958560451865195, "epoch": 8.453900709219859, "grad_norm": 0.8202657103538513, "learning_rate": 3.24e-06, "loss": 1.8311897277832032, "mean_token_accuracy": 0.661663169786334, "num_tokens": 29235880.0, "step": 4920 }, { "entropy": 1.665907260030508, "epoch": 8.47109391790243, "grad_norm": 0.8672494292259216, "learning_rate": 2.8400000000000003e-06, "loss": 1.6878423690795898, "mean_token_accuracy": 0.6819184564054013, "num_tokens": 29295823.0, "step": 4930 }, { "entropy": 1.7426866918802262, "epoch": 8.488287126584998, "grad_norm": 0.8398126363754272, "learning_rate": 2.4400000000000004e-06, "loss": 1.810443115234375, "mean_token_accuracy": 0.6639036998152733, "num_tokens": 29355386.0, "step": 4940 }, { "entropy": 1.6938614405691623, "epoch": 8.505480335267569, "grad_norm": 0.7652584314346313, "learning_rate": 2.0400000000000004e-06, "loss": 1.7690727233886718, "mean_token_accuracy": 0.6737098075449467, "num_tokens": 29414966.0, "step": 4950 }, { "entropy": 1.7538506165146828, "epoch": 8.52267354395014, "grad_norm": 0.8389163017272949, "learning_rate": 1.6400000000000002e-06, "loss": 1.8067062377929688, "mean_token_accuracy": 0.6728679880499839, "num_tokens": 29472960.0, "step": 4960 }, { "entropy": 1.7591105610132218, "epoch": 8.53986675263271, "grad_norm": 0.8280366063117981, "learning_rate": 1.24e-06, "loss": 1.7855098724365235, "mean_token_accuracy": 0.6670263484120369, "num_tokens": 29531300.0, "step": 4970 }, { "entropy": 1.6825189530849456, "epoch": 8.55705996131528, "grad_norm": 0.8177328109741211, "learning_rate": 8.4e-07, "loss": 1.731926727294922, "mean_token_accuracy": 0.6818420931696891, "num_tokens": 29591290.0, "step": 4980 }, { "entropy": 1.7112577512860299, "epoch": 8.574253169997851, "grad_norm": 0.8413036465644836, "learning_rate": 4.4e-07, "loss": 1.7446353912353516, "mean_token_accuracy": 0.6750271447002888, "num_tokens": 29646086.0, "step": 4990 }, { "entropy": 1.7419164210557938, "epoch": 8.591446378680422, "grad_norm": 0.9462088346481323, "learning_rate": 4e-08, "loss": 1.7870445251464844, "mean_token_accuracy": 0.666933435574174, "num_tokens": 29704815.0, "step": 5000 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.438188209453138e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }