diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,9 +2,9 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.6199825009939095, + "epoch": 0.46498687574543207, "eval_steps": 500, - "global_step": 200000, + "global_step": 150000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -150008,50006 +150008,6 @@ "mean_token_accuracy": 0.8454708248376847, "num_tokens": 180473902.0, "step": 150000 - }, - { - "entropy": 1.9131696656346322, - "epoch": 0.4650178748704818, - "grad_norm": 8.852290153503418, - "learning_rate": 3.709875228299118e-06, - "loss": 0.5096, - "mean_token_accuracy": 0.8401144355535507, - "num_tokens": 180484961.0, - "step": 150010 - }, - { - "entropy": 1.8424381971359254, - "epoch": 0.46504887399553146, - "grad_norm": 8.186081886291504, - "learning_rate": 3.709751579392853e-06, - "loss": 0.4484, - "mean_token_accuracy": 0.8486584141850472, - "num_tokens": 180497785.0, - "step": 150020 - }, - { - "entropy": 1.8932843878865242, - "epoch": 0.4650798731205812, - "grad_norm": 6.866567134857178, - "learning_rate": 3.709627942849295e-06, - "loss": 0.4095, - "mean_token_accuracy": 0.869660022854805, - "num_tokens": 180509401.0, - "step": 150030 - }, - { - "entropy": 1.9485246509313583, - "epoch": 0.46511087224563086, - "grad_norm": 8.80336856842041, - "learning_rate": 3.7095043186663837e-06, - "loss": 0.4428, - "mean_token_accuracy": 0.8586052864789963, - "num_tokens": 180520431.0, - "step": 150040 - }, - { - "entropy": 1.914072147011757, - "epoch": 0.4651418713706806, - "grad_norm": 14.637115478515625, - "learning_rate": 3.709380706842059e-06, - "loss": 0.4752, - "mean_token_accuracy": 0.8486096784472466, - "num_tokens": 180532404.0, - "step": 150050 - }, - { - "entropy": 1.7987019002437592, - "epoch": 0.46517287049573025, - "grad_norm": 2.0740082263946533, - "learning_rate": 3.709257107374263e-06, - "loss": 0.4432, - "mean_token_accuracy": 0.8616669073700904, - "num_tokens": 180546487.0, - "step": 150060 - }, - { - "entropy": 1.8739713817834853, - "epoch": 0.46520386962078, - "grad_norm": 8.125478744506836, - "learning_rate": 3.709133520260937e-06, - "loss": 0.4047, - "mean_token_accuracy": 0.8655928507447243, - "num_tokens": 180558526.0, - "step": 150070 - }, - { - "entropy": 1.9065773144364357, - "epoch": 0.46523486874582964, - "grad_norm": 8.001792907714844, - "learning_rate": 3.7090099455000217e-06, - "loss": 0.4646, - "mean_token_accuracy": 0.8476300582289695, - "num_tokens": 180571010.0, - "step": 150080 - }, - { - "entropy": 1.8418815195560456, - "epoch": 0.46526586787087937, - "grad_norm": 4.2682318687438965, - "learning_rate": 3.70888638308946e-06, - "loss": 0.4265, - "mean_token_accuracy": 0.850444746017456, - "num_tokens": 180584015.0, - "step": 150090 - }, - { - "entropy": 1.775348497927189, - "epoch": 0.46529686699592904, - "grad_norm": 2.5660839080810547, - "learning_rate": 3.7087628330271962e-06, - "loss": 0.3799, - "mean_token_accuracy": 0.8672156199812889, - "num_tokens": 180596841.0, - "step": 150100 - }, - { - "entropy": 1.8526948064565658, - "epoch": 0.46532786612097876, - "grad_norm": 9.855128288269043, - "learning_rate": 3.708639295311173e-06, - "loss": 0.4494, - "mean_token_accuracy": 0.8536330834031105, - "num_tokens": 180609137.0, - "step": 150110 - }, - { - "entropy": 1.9509498670697212, - "epoch": 0.46535886524602843, - "grad_norm": 5.597041130065918, - "learning_rate": 3.708515769939334e-06, - "loss": 0.4768, - "mean_token_accuracy": 0.8556590229272842, - "num_tokens": 180620230.0, - "step": 150120 - }, - { - "entropy": 1.9075401276350021, - "epoch": 0.4653898643710781, - "grad_norm": 8.79906940460205, - "learning_rate": 3.708392256909624e-06, - "loss": 0.4772, - "mean_token_accuracy": 0.846394631266594, - "num_tokens": 180631753.0, - "step": 150130 - }, - { - "entropy": 1.9188668608665467, - "epoch": 0.4654208634961278, - "grad_norm": 7.841006755828857, - "learning_rate": 3.7082687562199866e-06, - "loss": 0.4726, - "mean_token_accuracy": 0.8531607300043106, - "num_tokens": 180642477.0, - "step": 150140 - }, - { - "entropy": 1.794436551630497, - "epoch": 0.4654518626211775, - "grad_norm": 8.100784301757812, - "learning_rate": 3.708145267868368e-06, - "loss": 0.4286, - "mean_token_accuracy": 0.8534730911254883, - "num_tokens": 180655800.0, - "step": 150150 - }, - { - "entropy": 1.8831926614046097, - "epoch": 0.4654828617462272, - "grad_norm": 6.693047046661377, - "learning_rate": 3.7080217918527147e-06, - "loss": 0.431, - "mean_token_accuracy": 0.8538334533572197, - "num_tokens": 180667072.0, - "step": 150160 - }, - { - "entropy": 1.8978948533535003, - "epoch": 0.4655138608712769, - "grad_norm": 8.671180725097656, - "learning_rate": 3.707898328170972e-06, - "loss": 0.5048, - "mean_token_accuracy": 0.8394773602485657, - "num_tokens": 180678789.0, - "step": 150170 - }, - { - "entropy": 1.9485942766070365, - "epoch": 0.4655448599963266, - "grad_norm": 9.572785377502441, - "learning_rate": 3.707774876821087e-06, - "loss": 0.4643, - "mean_token_accuracy": 0.8439914211630821, - "num_tokens": 180690154.0, - "step": 150180 - }, - { - "entropy": 1.952707216143608, - "epoch": 0.4655758591213763, - "grad_norm": 8.8486967086792, - "learning_rate": 3.707651437801007e-06, - "loss": 0.513, - "mean_token_accuracy": 0.8544019669294357, - "num_tokens": 180701768.0, - "step": 150190 - }, - { - "entropy": 1.9008170261979103, - "epoch": 0.465606858246426, - "grad_norm": 8.073139190673828, - "learning_rate": 3.7075280111086796e-06, - "loss": 0.4375, - "mean_token_accuracy": 0.8544175952672959, - "num_tokens": 180713142.0, - "step": 150200 - }, - { - "entropy": 1.9324932008981706, - "epoch": 0.46563785737147567, - "grad_norm": 7.887551307678223, - "learning_rate": 3.707404596742053e-06, - "loss": 0.4257, - "mean_token_accuracy": 0.865489798784256, - "num_tokens": 180724698.0, - "step": 150210 - }, - { - "entropy": 1.8756032541394234, - "epoch": 0.4656688564965254, - "grad_norm": 7.615468978881836, - "learning_rate": 3.707281194699076e-06, - "loss": 0.4626, - "mean_token_accuracy": 0.8480568066239357, - "num_tokens": 180737683.0, - "step": 150220 - }, - { - "entropy": 1.8858239620923996, - "epoch": 0.46569985562157507, - "grad_norm": 7.510842800140381, - "learning_rate": 3.707157804977698e-06, - "loss": 0.4757, - "mean_token_accuracy": 0.8432325854897499, - "num_tokens": 180749594.0, - "step": 150230 - }, - { - "entropy": 1.9456377267837524, - "epoch": 0.4657308547466248, - "grad_norm": 6.716031074523926, - "learning_rate": 3.7070344275758684e-06, - "loss": 0.465, - "mean_token_accuracy": 0.8643984943628311, - "num_tokens": 180761340.0, - "step": 150240 - }, - { - "entropy": 1.9459609061479568, - "epoch": 0.46576185387167446, - "grad_norm": 8.735827445983887, - "learning_rate": 3.706911062491537e-06, - "loss": 0.51, - "mean_token_accuracy": 0.8414299875497818, - "num_tokens": 180773140.0, - "step": 150250 - }, - { - "entropy": 1.8788623332977294, - "epoch": 0.4657928529967242, - "grad_norm": 6.6167497634887695, - "learning_rate": 3.7067877097226546e-06, - "loss": 0.4211, - "mean_token_accuracy": 0.8643033638596535, - "num_tokens": 180785039.0, - "step": 150260 - }, - { - "entropy": 1.7132162213325501, - "epoch": 0.46582385212177385, - "grad_norm": 3.7535245418548584, - "learning_rate": 3.706664369267172e-06, - "loss": 0.3818, - "mean_token_accuracy": 0.8678473249077797, - "num_tokens": 180799943.0, - "step": 150270 - }, - { - "entropy": 1.9511631667613982, - "epoch": 0.4658548512468236, - "grad_norm": 8.460371017456055, - "learning_rate": 3.7065410411230414e-06, - "loss": 0.4953, - "mean_token_accuracy": 0.8503921389579773, - "num_tokens": 180810998.0, - "step": 150280 - }, - { - "entropy": 1.8199073910713195, - "epoch": 0.46588585037187324, - "grad_norm": 8.949850082397461, - "learning_rate": 3.706417725288214e-06, - "loss": 0.4, - "mean_token_accuracy": 0.8588975444436073, - "num_tokens": 180824024.0, - "step": 150290 - }, - { - "entropy": 1.871119175851345, - "epoch": 0.46591684949692297, - "grad_norm": 8.456604957580566, - "learning_rate": 3.7062944217606428e-06, - "loss": 0.4164, - "mean_token_accuracy": 0.8629543900489807, - "num_tokens": 180835815.0, - "step": 150300 - }, - { - "entropy": 1.8718924656510354, - "epoch": 0.46594784862197264, - "grad_norm": 9.544185638427734, - "learning_rate": 3.706171130538281e-06, - "loss": 0.3963, - "mean_token_accuracy": 0.8634023860096931, - "num_tokens": 180847428.0, - "step": 150310 - }, - { - "entropy": 1.8509297877550126, - "epoch": 0.46597884774702236, - "grad_norm": 7.217175483703613, - "learning_rate": 3.7060478516190818e-06, - "loss": 0.3889, - "mean_token_accuracy": 0.8697842806577682, - "num_tokens": 180859536.0, - "step": 150320 - }, - { - "entropy": 1.8995045498013496, - "epoch": 0.46600984687207203, - "grad_norm": 7.621474266052246, - "learning_rate": 3.7059245850009987e-06, - "loss": 0.4723, - "mean_token_accuracy": 0.8520592898130417, - "num_tokens": 180872519.0, - "step": 150330 - }, - { - "entropy": 1.820411132276058, - "epoch": 0.46604084599712176, - "grad_norm": 7.533627033233643, - "learning_rate": 3.7058013306819874e-06, - "loss": 0.3875, - "mean_token_accuracy": 0.866758693754673, - "num_tokens": 180885396.0, - "step": 150340 - }, - { - "entropy": 1.9017600163817405, - "epoch": 0.4660718451221714, - "grad_norm": 7.15775728225708, - "learning_rate": 3.7056780886600014e-06, - "loss": 0.4171, - "mean_token_accuracy": 0.8558528557419777, - "num_tokens": 180897325.0, - "step": 150350 - }, - { - "entropy": 1.9121145501732826, - "epoch": 0.46610284424722115, - "grad_norm": 4.693599700927734, - "learning_rate": 3.705554858932996e-06, - "loss": 0.464, - "mean_token_accuracy": 0.8516936868429184, - "num_tokens": 180909047.0, - "step": 150360 - }, - { - "entropy": 1.9288795188069343, - "epoch": 0.4661338433722708, - "grad_norm": 7.500865936279297, - "learning_rate": 3.7054316414989283e-06, - "loss": 0.4391, - "mean_token_accuracy": 0.8530079022049903, - "num_tokens": 180920731.0, - "step": 150370 - }, - { - "entropy": 1.9719238847494125, - "epoch": 0.4661648424973205, - "grad_norm": 9.861359596252441, - "learning_rate": 3.7053084363557534e-06, - "loss": 0.5333, - "mean_token_accuracy": 0.8317109391093254, - "num_tokens": 180931182.0, - "step": 150380 - }, - { - "entropy": 1.9200509548187257, - "epoch": 0.4661958416223702, - "grad_norm": 3.9624438285827637, - "learning_rate": 3.705185243501429e-06, - "loss": 0.4287, - "mean_token_accuracy": 0.8530553847551345, - "num_tokens": 180942651.0, - "step": 150390 - }, - { - "entropy": 1.902939459681511, - "epoch": 0.4662268407474199, - "grad_norm": 10.163533210754395, - "learning_rate": 3.705062062933911e-06, - "loss": 0.4502, - "mean_token_accuracy": 0.8538908958435059, - "num_tokens": 180954042.0, - "step": 150400 - }, - { - "entropy": 1.869973176717758, - "epoch": 0.4662578398724696, - "grad_norm": 9.920553207397461, - "learning_rate": 3.7049388946511593e-06, - "loss": 0.4584, - "mean_token_accuracy": 0.8528695210814476, - "num_tokens": 180966670.0, - "step": 150410 - }, - { - "entropy": 1.9353784874081612, - "epoch": 0.4662888389975193, - "grad_norm": 7.96409273147583, - "learning_rate": 3.7048157386511297e-06, - "loss": 0.474, - "mean_token_accuracy": 0.8458786249160767, - "num_tokens": 180978919.0, - "step": 150420 - }, - { - "entropy": 1.8363364577293395, - "epoch": 0.466319838122569, - "grad_norm": 8.377767562866211, - "learning_rate": 3.7046925949317823e-06, - "loss": 0.4639, - "mean_token_accuracy": 0.8574515968561173, - "num_tokens": 180992419.0, - "step": 150430 - }, - { - "entropy": 1.8962649412453174, - "epoch": 0.46635083724761867, - "grad_norm": 10.839685440063477, - "learning_rate": 3.7045694634910766e-06, - "loss": 0.4511, - "mean_token_accuracy": 0.848162266612053, - "num_tokens": 181004918.0, - "step": 150440 - }, - { - "entropy": 1.8515453770756722, - "epoch": 0.4663818363726684, - "grad_norm": 4.3565354347229, - "learning_rate": 3.7044463443269713e-06, - "loss": 0.3889, - "mean_token_accuracy": 0.8653715640306473, - "num_tokens": 181018477.0, - "step": 150450 - }, - { - "entropy": 1.9294961079955102, - "epoch": 0.46641283549771806, - "grad_norm": 8.185575485229492, - "learning_rate": 3.704323237437427e-06, - "loss": 0.4565, - "mean_token_accuracy": 0.8540726110339165, - "num_tokens": 181030636.0, - "step": 150460 - }, - { - "entropy": 1.987247222661972, - "epoch": 0.4664438346227678, - "grad_norm": 10.924062728881836, - "learning_rate": 3.704200142820404e-06, - "loss": 0.5302, - "mean_token_accuracy": 0.8374628499150276, - "num_tokens": 181041619.0, - "step": 150470 - }, - { - "entropy": 1.9206083595752717, - "epoch": 0.46647483374781745, - "grad_norm": 7.948111057281494, - "learning_rate": 3.7040770604738633e-06, - "loss": 0.4365, - "mean_token_accuracy": 0.8553918346762657, - "num_tokens": 181053506.0, - "step": 150480 - }, - { - "entropy": 1.9235239014029504, - "epoch": 0.4665058328728672, - "grad_norm": 8.41197395324707, - "learning_rate": 3.703953990395767e-06, - "loss": 0.4492, - "mean_token_accuracy": 0.8539768025279045, - "num_tokens": 181064814.0, - "step": 150490 - }, - { - "entropy": 1.9210913270711898, - "epoch": 0.46653683199791685, - "grad_norm": 7.933341979980469, - "learning_rate": 3.7038309325840766e-06, - "loss": 0.4637, - "mean_token_accuracy": 0.8557110175490379, - "num_tokens": 181075834.0, - "step": 150500 - }, - { - "entropy": 1.904814685881138, - "epoch": 0.46656783112296657, - "grad_norm": 7.137592315673828, - "learning_rate": 3.703707887036754e-06, - "loss": 0.4201, - "mean_token_accuracy": 0.8601122245192527, - "num_tokens": 181087919.0, - "step": 150510 - }, - { - "entropy": 1.8629762142896653, - "epoch": 0.46659883024801624, - "grad_norm": 4.012499809265137, - "learning_rate": 3.7035848537517633e-06, - "loss": 0.4779, - "mean_token_accuracy": 0.8468594029545784, - "num_tokens": 181099822.0, - "step": 150520 - }, - { - "entropy": 1.8338840797543525, - "epoch": 0.46662982937306596, - "grad_norm": 8.058430671691895, - "learning_rate": 3.703461832727068e-06, - "loss": 0.3951, - "mean_token_accuracy": 0.861028978228569, - "num_tokens": 181112410.0, - "step": 150530 - }, - { - "entropy": 1.9143824577331543, - "epoch": 0.46666082849811563, - "grad_norm": 7.549951076507568, - "learning_rate": 3.7033388239606303e-06, - "loss": 0.4539, - "mean_token_accuracy": 0.8437608435750008, - "num_tokens": 181123938.0, - "step": 150540 - }, - { - "entropy": 1.9112285375595093, - "epoch": 0.46669182762316536, - "grad_norm": 7.177019119262695, - "learning_rate": 3.7032158274504173e-06, - "loss": 0.4965, - "mean_token_accuracy": 0.8438215360045433, - "num_tokens": 181134127.0, - "step": 150550 - }, - { - "entropy": 1.8974832728505135, - "epoch": 0.466722826748215, - "grad_norm": 8.187768936157227, - "learning_rate": 3.7030928431943912e-06, - "loss": 0.4749, - "mean_token_accuracy": 0.8517803817987442, - "num_tokens": 181145556.0, - "step": 150560 - }, - { - "entropy": 1.9428151741623878, - "epoch": 0.46675382587326475, - "grad_norm": 9.65618896484375, - "learning_rate": 3.702969871190518e-06, - "loss": 0.489, - "mean_token_accuracy": 0.843976517021656, - "num_tokens": 181157264.0, - "step": 150570 - }, - { - "entropy": 1.9293995440006255, - "epoch": 0.4667848249983144, - "grad_norm": 7.708983898162842, - "learning_rate": 3.7028469114367653e-06, - "loss": 0.4889, - "mean_token_accuracy": 0.8495089873671532, - "num_tokens": 181168528.0, - "step": 150580 - }, - { - "entropy": 1.8680501088500023, - "epoch": 0.46681582412336414, - "grad_norm": 8.186799049377441, - "learning_rate": 3.702723963931097e-06, - "loss": 0.4549, - "mean_token_accuracy": 0.8557474792003632, - "num_tokens": 181180264.0, - "step": 150590 - }, - { - "entropy": 1.8408723667263984, - "epoch": 0.4668468232484138, - "grad_norm": 2.6501667499542236, - "learning_rate": 3.7026010286714814e-06, - "loss": 0.5115, - "mean_token_accuracy": 0.8513808265328408, - "num_tokens": 181193075.0, - "step": 150600 - }, - { - "entropy": 1.9793402075767517, - "epoch": 0.46687782237346354, - "grad_norm": 7.212341785430908, - "learning_rate": 3.702478105655884e-06, - "loss": 0.5262, - "mean_token_accuracy": 0.8411362200975419, - "num_tokens": 181203773.0, - "step": 150610 - }, - { - "entropy": 1.937410406768322, - "epoch": 0.4669088214985132, - "grad_norm": 8.84796142578125, - "learning_rate": 3.702355194882275e-06, - "loss": 0.4799, - "mean_token_accuracy": 0.8421960338950157, - "num_tokens": 181214991.0, - "step": 150620 - }, - { - "entropy": 1.8487501621246338, - "epoch": 0.4669398206235629, - "grad_norm": 3.367696523666382, - "learning_rate": 3.7022322963486203e-06, - "loss": 0.4316, - "mean_token_accuracy": 0.8592374339699745, - "num_tokens": 181228067.0, - "step": 150630 - }, - { - "entropy": 1.8551168724894525, - "epoch": 0.4669708197486126, - "grad_norm": 3.5218849182128906, - "learning_rate": 3.7021094100528897e-06, - "loss": 0.4068, - "mean_token_accuracy": 0.8567650452256202, - "num_tokens": 181240939.0, - "step": 150640 - }, - { - "entropy": 1.8298555374145509, - "epoch": 0.46700181887366227, - "grad_norm": 3.4528143405914307, - "learning_rate": 3.701986535993051e-06, - "loss": 0.3701, - "mean_token_accuracy": 0.8601305693387985, - "num_tokens": 181254113.0, - "step": 150650 - }, - { - "entropy": 1.9451593279838562, - "epoch": 0.467032817998712, - "grad_norm": 10.91589069366455, - "learning_rate": 3.7018636741670766e-06, - "loss": 0.5097, - "mean_token_accuracy": 0.852519790828228, - "num_tokens": 181265250.0, - "step": 150660 - }, - { - "entropy": 1.9798747926950455, - "epoch": 0.46706381712376166, - "grad_norm": 9.844844818115234, - "learning_rate": 3.701740824572933e-06, - "loss": 0.4884, - "mean_token_accuracy": 0.8470689475536346, - "num_tokens": 181275900.0, - "step": 150670 - }, - { - "entropy": 1.8553171455860138, - "epoch": 0.4670948162488114, - "grad_norm": 8.106888771057129, - "learning_rate": 3.7016179872085933e-06, - "loss": 0.5243, - "mean_token_accuracy": 0.8486344560980796, - "num_tokens": 181288591.0, - "step": 150680 - }, - { - "entropy": 1.9101608902215959, - "epoch": 0.46712581537386105, - "grad_norm": 5.4185075759887695, - "learning_rate": 3.7014951620720275e-06, - "loss": 0.4668, - "mean_token_accuracy": 0.8533428713679314, - "num_tokens": 181300555.0, - "step": 150690 - }, - { - "entropy": 1.8143582224845887, - "epoch": 0.4671568144989108, - "grad_norm": 8.726012229919434, - "learning_rate": 3.7013723491612075e-06, - "loss": 0.4051, - "mean_token_accuracy": 0.8616741225123405, - "num_tokens": 181313432.0, - "step": 150700 - }, - { - "entropy": 1.8796430230140686, - "epoch": 0.46718781362396045, - "grad_norm": 8.185118675231934, - "learning_rate": 3.701249548474104e-06, - "loss": 0.4173, - "mean_token_accuracy": 0.8626327365636826, - "num_tokens": 181325140.0, - "step": 150710 - }, - { - "entropy": 1.9016419626772403, - "epoch": 0.46721881274901017, - "grad_norm": 8.940559387207031, - "learning_rate": 3.7011267600086907e-06, - "loss": 0.4506, - "mean_token_accuracy": 0.8517810702323914, - "num_tokens": 181337140.0, - "step": 150720 - }, - { - "entropy": 1.9060870558023453, - "epoch": 0.46724981187405984, - "grad_norm": 7.316099643707275, - "learning_rate": 3.7010039837629403e-06, - "loss": 0.4335, - "mean_token_accuracy": 0.8516420558094978, - "num_tokens": 181348828.0, - "step": 150730 - }, - { - "entropy": 1.8336908236145972, - "epoch": 0.46728081099910956, - "grad_norm": 8.662108421325684, - "learning_rate": 3.700881219734826e-06, - "loss": 0.4264, - "mean_token_accuracy": 0.8569240808486939, - "num_tokens": 181360993.0, - "step": 150740 - }, - { - "entropy": 1.867463281750679, - "epoch": 0.46731181012415923, - "grad_norm": 4.231754302978516, - "learning_rate": 3.7007584679223206e-06, - "loss": 0.4002, - "mean_token_accuracy": 0.8555324092507363, - "num_tokens": 181373461.0, - "step": 150750 - }, - { - "entropy": 1.9317969232797623, - "epoch": 0.46734280924920896, - "grad_norm": 7.876861572265625, - "learning_rate": 3.7006357283234e-06, - "loss": 0.4745, - "mean_token_accuracy": 0.8606542080640793, - "num_tokens": 181383899.0, - "step": 150760 - }, - { - "entropy": 2.0077045977115633, - "epoch": 0.4673738083742586, - "grad_norm": 7.578515529632568, - "learning_rate": 3.700513000936038e-06, - "loss": 0.5033, - "mean_token_accuracy": 0.8427977785468102, - "num_tokens": 181394945.0, - "step": 150770 - }, - { - "entropy": 1.898098950833082, - "epoch": 0.46740480749930835, - "grad_norm": 9.076956748962402, - "learning_rate": 3.7003902857582097e-06, - "loss": 0.4804, - "mean_token_accuracy": 0.8464702069759369, - "num_tokens": 181406939.0, - "step": 150780 - }, - { - "entropy": 1.8830089807510375, - "epoch": 0.467435806624358, - "grad_norm": 8.655149459838867, - "learning_rate": 3.7002675827878913e-06, - "loss": 0.4525, - "mean_token_accuracy": 0.8559210911393166, - "num_tokens": 181419739.0, - "step": 150790 - }, - { - "entropy": 1.9508404403924942, - "epoch": 0.46746680574940774, - "grad_norm": 7.471928596496582, - "learning_rate": 3.7001448920230598e-06, - "loss": 0.4981, - "mean_token_accuracy": 0.8527390614151955, - "num_tokens": 181430494.0, - "step": 150800 - }, - { - "entropy": 1.9231002733111382, - "epoch": 0.4674978048744574, - "grad_norm": 8.397159576416016, - "learning_rate": 3.70002221346169e-06, - "loss": 0.5567, - "mean_token_accuracy": 0.8290016546845436, - "num_tokens": 181441868.0, - "step": 150810 - }, - { - "entropy": 1.9228404954075813, - "epoch": 0.46752880399950714, - "grad_norm": 4.443188190460205, - "learning_rate": 3.69989954710176e-06, - "loss": 0.4481, - "mean_token_accuracy": 0.847866815328598, - "num_tokens": 181453921.0, - "step": 150820 - }, - { - "entropy": 1.9264577642083167, - "epoch": 0.4675598031245568, - "grad_norm": 8.896585464477539, - "learning_rate": 3.699776892941247e-06, - "loss": 0.505, - "mean_token_accuracy": 0.8456339925527573, - "num_tokens": 181465120.0, - "step": 150830 - }, - { - "entropy": 1.9083795070648193, - "epoch": 0.46759080224960653, - "grad_norm": 8.588702201843262, - "learning_rate": 3.6996542509781293e-06, - "loss": 0.4605, - "mean_token_accuracy": 0.8652929633855819, - "num_tokens": 181476510.0, - "step": 150840 - }, - { - "entropy": 1.895962081849575, - "epoch": 0.4676218013746562, - "grad_norm": 8.5095853805542, - "learning_rate": 3.6995316212103853e-06, - "loss": 0.4649, - "mean_token_accuracy": 0.8454143151640892, - "num_tokens": 181488282.0, - "step": 150850 - }, - { - "entropy": 1.8628547742962838, - "epoch": 0.4676528004997059, - "grad_norm": 2.8419368267059326, - "learning_rate": 3.699409003635994e-06, - "loss": 0.4532, - "mean_token_accuracy": 0.8537169203162194, - "num_tokens": 181501584.0, - "step": 150860 - }, - { - "entropy": 1.915559995174408, - "epoch": 0.4676837996247556, - "grad_norm": 7.790353775024414, - "learning_rate": 3.6992863982529358e-06, - "loss": 0.4685, - "mean_token_accuracy": 0.8497019991278648, - "num_tokens": 181513133.0, - "step": 150870 - }, - { - "entropy": 1.856499010324478, - "epoch": 0.46771479874980526, - "grad_norm": 3.7286479473114014, - "learning_rate": 3.699163805059189e-06, - "loss": 0.4253, - "mean_token_accuracy": 0.8563529670238494, - "num_tokens": 181526648.0, - "step": 150880 - }, - { - "entropy": 1.8931098520755767, - "epoch": 0.467745797874855, - "grad_norm": 8.853365898132324, - "learning_rate": 3.699041224052734e-06, - "loss": 0.4505, - "mean_token_accuracy": 0.8582080423831939, - "num_tokens": 181538660.0, - "step": 150890 - }, - { - "entropy": 1.8996970742940902, - "epoch": 0.46777679699990465, - "grad_norm": 8.242768287658691, - "learning_rate": 3.6989186552315533e-06, - "loss": 0.4563, - "mean_token_accuracy": 0.8507839307188988, - "num_tokens": 181550742.0, - "step": 150900 - }, - { - "entropy": 1.9381722196936608, - "epoch": 0.4678077961249544, - "grad_norm": 8.271310806274414, - "learning_rate": 3.6987960985936266e-06, - "loss": 0.4456, - "mean_token_accuracy": 0.8593697786331177, - "num_tokens": 181562045.0, - "step": 150910 - }, - { - "entropy": 1.899745689332485, - "epoch": 0.46783879525000405, - "grad_norm": 9.7952241897583, - "learning_rate": 3.698673554136937e-06, - "loss": 0.4397, - "mean_token_accuracy": 0.8565332680940628, - "num_tokens": 181573997.0, - "step": 150920 - }, - { - "entropy": 1.9225837558507919, - "epoch": 0.46786979437505377, - "grad_norm": 6.767740726470947, - "learning_rate": 3.6985510218594654e-06, - "loss": 0.4257, - "mean_token_accuracy": 0.8650736913084984, - "num_tokens": 181585575.0, - "step": 150930 - }, - { - "entropy": 1.9318598195910455, - "epoch": 0.46790079350010344, - "grad_norm": 10.693130493164062, - "learning_rate": 3.698428501759196e-06, - "loss": 0.5226, - "mean_token_accuracy": 0.8426518440246582, - "num_tokens": 181597531.0, - "step": 150940 - }, - { - "entropy": 1.9709179133176804, - "epoch": 0.46793179262515316, - "grad_norm": 9.757343292236328, - "learning_rate": 3.6983059938341105e-06, - "loss": 0.4907, - "mean_token_accuracy": 0.8445596411824227, - "num_tokens": 181609089.0, - "step": 150950 - }, - { - "entropy": 1.9118812575936317, - "epoch": 0.46796279175020283, - "grad_norm": 7.699484825134277, - "learning_rate": 3.698183498082194e-06, - "loss": 0.4614, - "mean_token_accuracy": 0.8565623730421066, - "num_tokens": 181620966.0, - "step": 150960 - }, - { - "entropy": 1.918295791745186, - "epoch": 0.46799379087525256, - "grad_norm": 8.302897453308105, - "learning_rate": 3.698061014501429e-06, - "loss": 0.4548, - "mean_token_accuracy": 0.8550585329532623, - "num_tokens": 181632131.0, - "step": 150970 - }, - { - "entropy": 1.9636784076690674, - "epoch": 0.4680247900003022, - "grad_norm": 6.999095916748047, - "learning_rate": 3.697938543089802e-06, - "loss": 0.4968, - "mean_token_accuracy": 0.8381591513752937, - "num_tokens": 181643332.0, - "step": 150980 - }, - { - "entropy": 1.8199531510472298, - "epoch": 0.46805578912535195, - "grad_norm": 3.5625722408294678, - "learning_rate": 3.6978160838452965e-06, - "loss": 0.3936, - "mean_token_accuracy": 0.8629630744457245, - "num_tokens": 181656304.0, - "step": 150990 - }, - { - "entropy": 1.8767156556248665, - "epoch": 0.4680867882504016, - "grad_norm": 7.794158935546875, - "learning_rate": 3.697693636765899e-06, - "loss": 0.438, - "mean_token_accuracy": 0.8513418570160866, - "num_tokens": 181668812.0, - "step": 151000 - }, - { - "entropy": 1.8939493969082832, - "epoch": 0.46811778737545134, - "grad_norm": 7.774882793426514, - "learning_rate": 3.697571201849594e-06, - "loss": 0.4146, - "mean_token_accuracy": 0.8537428677082062, - "num_tokens": 181680854.0, - "step": 151010 - }, - { - "entropy": 1.8954125791788101, - "epoch": 0.468148786500501, - "grad_norm": 9.265185356140137, - "learning_rate": 3.6974487790943705e-06, - "loss": 0.4375, - "mean_token_accuracy": 0.8574156984686852, - "num_tokens": 181693486.0, - "step": 151020 - }, - { - "entropy": 1.8389709085226058, - "epoch": 0.46817978562555074, - "grad_norm": 5.6764702796936035, - "learning_rate": 3.697326368498213e-06, - "loss": 0.4121, - "mean_token_accuracy": 0.858234989643097, - "num_tokens": 181706258.0, - "step": 151030 - }, - { - "entropy": 1.9311970368027687, - "epoch": 0.4682107847506004, - "grad_norm": 11.079048156738281, - "learning_rate": 3.69720397005911e-06, - "loss": 0.487, - "mean_token_accuracy": 0.846907764673233, - "num_tokens": 181717998.0, - "step": 151040 - }, - { - "entropy": 1.9391098693013191, - "epoch": 0.46824178387565013, - "grad_norm": 8.38887882232666, - "learning_rate": 3.6970815837750495e-06, - "loss": 0.4387, - "mean_token_accuracy": 0.8551925793290138, - "num_tokens": 181729056.0, - "step": 151050 - }, - { - "entropy": 1.8806929126381875, - "epoch": 0.4682727830006998, - "grad_norm": 8.16617202758789, - "learning_rate": 3.696959209644019e-06, - "loss": 0.4775, - "mean_token_accuracy": 0.8616417735815048, - "num_tokens": 181740577.0, - "step": 151060 - }, - { - "entropy": 1.8071967348456384, - "epoch": 0.4683037821257495, - "grad_norm": 9.592656135559082, - "learning_rate": 3.696836847664009e-06, - "loss": 0.4312, - "mean_token_accuracy": 0.8578245341777802, - "num_tokens": 181753546.0, - "step": 151070 - }, - { - "entropy": 1.955559852719307, - "epoch": 0.4683347812507992, - "grad_norm": 9.505187034606934, - "learning_rate": 3.6967144978330066e-06, - "loss": 0.4992, - "mean_token_accuracy": 0.8476584449410438, - "num_tokens": 181764665.0, - "step": 151080 - }, - { - "entropy": 1.9042584761977195, - "epoch": 0.4683657803758489, - "grad_norm": 8.192767143249512, - "learning_rate": 3.6965921601490035e-06, - "loss": 0.5423, - "mean_token_accuracy": 0.8445826590061187, - "num_tokens": 181777194.0, - "step": 151090 - }, - { - "entropy": 1.9415881663560868, - "epoch": 0.4683967795008986, - "grad_norm": 8.145171165466309, - "learning_rate": 3.696469834609988e-06, - "loss": 0.4829, - "mean_token_accuracy": 0.853820464015007, - "num_tokens": 181788063.0, - "step": 151100 - }, - { - "entropy": 1.7884770929813385, - "epoch": 0.4684277786259483, - "grad_norm": 9.306479454040527, - "learning_rate": 3.6963475212139516e-06, - "loss": 0.4244, - "mean_token_accuracy": 0.8613410949707031, - "num_tokens": 181801286.0, - "step": 151110 - }, - { - "entropy": 1.917163448035717, - "epoch": 0.468458777750998, - "grad_norm": 8.699514389038086, - "learning_rate": 3.696225219958886e-06, - "loss": 0.5055, - "mean_token_accuracy": 0.8294718861579895, - "num_tokens": 181813453.0, - "step": 151120 - }, - { - "entropy": 1.886626946926117, - "epoch": 0.46848977687604765, - "grad_norm": 9.164982795715332, - "learning_rate": 3.6961029308427824e-06, - "loss": 0.4606, - "mean_token_accuracy": 0.8475595995783806, - "num_tokens": 181825948.0, - "step": 151130 - }, - { - "entropy": 1.8653237760066985, - "epoch": 0.46852077600109737, - "grad_norm": 8.744365692138672, - "learning_rate": 3.695980653863633e-06, - "loss": 0.4231, - "mean_token_accuracy": 0.8610041797161102, - "num_tokens": 181838058.0, - "step": 151140 - }, - { - "entropy": 1.8321866735816001, - "epoch": 0.46855177512614704, - "grad_norm": 7.345215320587158, - "learning_rate": 3.69585838901943e-06, - "loss": 0.3699, - "mean_token_accuracy": 0.8644196107983589, - "num_tokens": 181850533.0, - "step": 151150 - }, - { - "entropy": 1.8806213557720184, - "epoch": 0.46858277425119677, - "grad_norm": 8.255237579345703, - "learning_rate": 3.6957361363081657e-06, - "loss": 0.4237, - "mean_token_accuracy": 0.8567614004015922, - "num_tokens": 181863191.0, - "step": 151160 - }, - { - "entropy": 1.9859347879886626, - "epoch": 0.46861377337624643, - "grad_norm": 8.459641456604004, - "learning_rate": 3.6956138957278346e-06, - "loss": 0.4667, - "mean_token_accuracy": 0.8509835615754128, - "num_tokens": 181873914.0, - "step": 151170 - }, - { - "entropy": 1.9205264270305633, - "epoch": 0.46864477250129616, - "grad_norm": 8.495672225952148, - "learning_rate": 3.69549166727643e-06, - "loss": 0.4519, - "mean_token_accuracy": 0.8561896935105324, - "num_tokens": 181884822.0, - "step": 151180 - }, - { - "entropy": 1.827118131518364, - "epoch": 0.4686757716263458, - "grad_norm": 5.892608642578125, - "learning_rate": 3.695369450951948e-06, - "loss": 0.3783, - "mean_token_accuracy": 0.8628544971346855, - "num_tokens": 181897677.0, - "step": 151190 - }, - { - "entropy": 1.8908180356025697, - "epoch": 0.46870677075139555, - "grad_norm": 3.845689535140991, - "learning_rate": 3.6952472467523807e-06, - "loss": 0.4393, - "mean_token_accuracy": 0.8599082320928574, - "num_tokens": 181909762.0, - "step": 151200 - }, - { - "entropy": 1.935589936375618, - "epoch": 0.4687377698764452, - "grad_norm": 7.964409351348877, - "learning_rate": 3.6951250546757256e-06, - "loss": 0.4782, - "mean_token_accuracy": 0.8504336655139924, - "num_tokens": 181920725.0, - "step": 151210 - }, - { - "entropy": 1.915782979130745, - "epoch": 0.46876876900149494, - "grad_norm": 4.305631637573242, - "learning_rate": 3.6950028747199766e-06, - "loss": 0.4388, - "mean_token_accuracy": 0.8563524767756462, - "num_tokens": 181932535.0, - "step": 151220 - }, - { - "entropy": 1.8579757571220399, - "epoch": 0.4687997681265446, - "grad_norm": 8.895844459533691, - "learning_rate": 3.6948807068831323e-06, - "loss": 0.4162, - "mean_token_accuracy": 0.8616629391908646, - "num_tokens": 181944531.0, - "step": 151230 - }, - { - "entropy": 1.9457248359918595, - "epoch": 0.46883076725159434, - "grad_norm": 8.319205284118652, - "learning_rate": 3.694758551163187e-06, - "loss": 0.4806, - "mean_token_accuracy": 0.8451997622847557, - "num_tokens": 181955726.0, - "step": 151240 - }, - { - "entropy": 1.8975832119584084, - "epoch": 0.468861766376644, - "grad_norm": 8.178126335144043, - "learning_rate": 3.6946364075581394e-06, - "loss": 0.4566, - "mean_token_accuracy": 0.8526521921157837, - "num_tokens": 181968223.0, - "step": 151250 - }, - { - "entropy": 1.9673262536525726, - "epoch": 0.46889276550169373, - "grad_norm": 9.780797958374023, - "learning_rate": 3.6945142760659856e-06, - "loss": 0.4799, - "mean_token_accuracy": 0.8512610167264938, - "num_tokens": 181978341.0, - "step": 151260 - }, - { - "entropy": 1.928539128601551, - "epoch": 0.4689237646267434, - "grad_norm": 7.564601421356201, - "learning_rate": 3.694392156684726e-06, - "loss": 0.4058, - "mean_token_accuracy": 0.8705823361873627, - "num_tokens": 181989455.0, - "step": 151270 - }, - { - "entropy": 1.9041236862540245, - "epoch": 0.4689547637517931, - "grad_norm": 3.4355711936950684, - "learning_rate": 3.6942700494123577e-06, - "loss": 0.4363, - "mean_token_accuracy": 0.8603847548365593, - "num_tokens": 182001135.0, - "step": 151280 - }, - { - "entropy": 1.8445854999125004, - "epoch": 0.4689857628768428, - "grad_norm": 8.764761924743652, - "learning_rate": 3.6941479542468796e-06, - "loss": 0.4418, - "mean_token_accuracy": 0.8481447994709015, - "num_tokens": 182013834.0, - "step": 151290 - }, - { - "entropy": 1.861207364499569, - "epoch": 0.4690167620018925, - "grad_norm": 10.083645820617676, - "learning_rate": 3.694025871186291e-06, - "loss": 0.4332, - "mean_token_accuracy": 0.8576826229691505, - "num_tokens": 182026041.0, - "step": 151300 - }, - { - "entropy": 1.9643630295991898, - "epoch": 0.4690477611269422, - "grad_norm": 7.361579895019531, - "learning_rate": 3.693903800228593e-06, - "loss": 0.4872, - "mean_token_accuracy": 0.8524459466338158, - "num_tokens": 182036872.0, - "step": 151310 - }, - { - "entropy": 1.9375217407941818, - "epoch": 0.4690787602519919, - "grad_norm": 6.923783779144287, - "learning_rate": 3.6937817413717846e-06, - "loss": 0.4696, - "mean_token_accuracy": 0.8514922142028809, - "num_tokens": 182048041.0, - "step": 151320 - }, - { - "entropy": 1.982517033815384, - "epoch": 0.4691097593770416, - "grad_norm": 9.478878021240234, - "learning_rate": 3.693659694613868e-06, - "loss": 0.5387, - "mean_token_accuracy": 0.8426320448517799, - "num_tokens": 182058840.0, - "step": 151330 - }, - { - "entropy": 1.8076622486114502, - "epoch": 0.4691407585020913, - "grad_norm": 8.073373794555664, - "learning_rate": 3.6935376599528437e-06, - "loss": 0.3949, - "mean_token_accuracy": 0.8623651906847953, - "num_tokens": 182072827.0, - "step": 151340 - }, - { - "entropy": 1.9158219203352929, - "epoch": 0.469171757627141, - "grad_norm": 7.664277076721191, - "learning_rate": 3.693415637386714e-06, - "loss": 0.486, - "mean_token_accuracy": 0.8522696018218994, - "num_tokens": 182084761.0, - "step": 151350 - }, - { - "entropy": 1.892909213900566, - "epoch": 0.46920275675219064, - "grad_norm": 3.7061331272125244, - "learning_rate": 3.69329362691348e-06, - "loss": 0.4525, - "mean_token_accuracy": 0.8496552616357803, - "num_tokens": 182096624.0, - "step": 151360 - }, - { - "entropy": 1.8524788722395897, - "epoch": 0.46923375587724037, - "grad_norm": 6.679312229156494, - "learning_rate": 3.693171628531146e-06, - "loss": 0.4046, - "mean_token_accuracy": 0.860837659239769, - "num_tokens": 182109437.0, - "step": 151370 - }, - { - "entropy": 1.9032334506511688, - "epoch": 0.46926475500229003, - "grad_norm": 8.694540023803711, - "learning_rate": 3.693049642237714e-06, - "loss": 0.4808, - "mean_token_accuracy": 0.8432764261960983, - "num_tokens": 182121203.0, - "step": 151380 - }, - { - "entropy": 1.952576905488968, - "epoch": 0.46929575412733976, - "grad_norm": 7.671172142028809, - "learning_rate": 3.692927668031188e-06, - "loss": 0.4649, - "mean_token_accuracy": 0.8436528146266937, - "num_tokens": 182133063.0, - "step": 151390 - }, - { - "entropy": 1.9288091585040092, - "epoch": 0.46932675325238943, - "grad_norm": 8.635396957397461, - "learning_rate": 3.6928057059095722e-06, - "loss": 0.4573, - "mean_token_accuracy": 0.8535377115011216, - "num_tokens": 182144648.0, - "step": 151400 - }, - { - "entropy": 1.8419179022312164, - "epoch": 0.46935775237743915, - "grad_norm": 7.754616737365723, - "learning_rate": 3.6926837558708713e-06, - "loss": 0.457, - "mean_token_accuracy": 0.8576859340071679, - "num_tokens": 182157015.0, - "step": 151410 - }, - { - "entropy": 1.7949551880359649, - "epoch": 0.4693887515024888, - "grad_norm": 5.1158061027526855, - "learning_rate": 3.69256181791309e-06, - "loss": 0.4135, - "mean_token_accuracy": 0.8699161261320114, - "num_tokens": 182169639.0, - "step": 151420 - }, - { - "entropy": 1.8809380188584328, - "epoch": 0.46941975062753855, - "grad_norm": 7.669633865356445, - "learning_rate": 3.692439892034234e-06, - "loss": 0.4297, - "mean_token_accuracy": 0.8522050336003304, - "num_tokens": 182182293.0, - "step": 151430 - }, - { - "entropy": 1.954986748099327, - "epoch": 0.4694507497525882, - "grad_norm": 8.638233184814453, - "learning_rate": 3.6923179782323094e-06, - "loss": 0.4811, - "mean_token_accuracy": 0.8547404408454895, - "num_tokens": 182194218.0, - "step": 151440 - }, - { - "entropy": 1.9031829819083215, - "epoch": 0.46948174887763794, - "grad_norm": 3.230351448059082, - "learning_rate": 3.6921960765053222e-06, - "loss": 0.4171, - "mean_token_accuracy": 0.8607431352138519, - "num_tokens": 182206098.0, - "step": 151450 - }, - { - "entropy": 1.860182997584343, - "epoch": 0.4695127480026876, - "grad_norm": 8.417341232299805, - "learning_rate": 3.6920741868512786e-06, - "loss": 0.4697, - "mean_token_accuracy": 0.8528778105974197, - "num_tokens": 182218062.0, - "step": 151460 - }, - { - "entropy": 1.8531003028154374, - "epoch": 0.46954374712773733, - "grad_norm": 4.122646331787109, - "learning_rate": 3.6919523092681877e-06, - "loss": 0.4197, - "mean_token_accuracy": 0.8522070854902267, - "num_tokens": 182231275.0, - "step": 151470 - }, - { - "entropy": 1.826745069026947, - "epoch": 0.469574746252787, - "grad_norm": 3.7218542098999023, - "learning_rate": 3.6918304437540562e-06, - "loss": 0.433, - "mean_token_accuracy": 0.850635839998722, - "num_tokens": 182244141.0, - "step": 151480 - }, - { - "entropy": 1.8726869612932204, - "epoch": 0.4696057453778367, - "grad_norm": 4.05550479888916, - "learning_rate": 3.6917085903068917e-06, - "loss": 0.4061, - "mean_token_accuracy": 0.8551696226000786, - "num_tokens": 182256928.0, - "step": 151490 - }, - { - "entropy": 1.8021176487207413, - "epoch": 0.4696367445028864, - "grad_norm": 8.80622386932373, - "learning_rate": 3.6915867489247047e-06, - "loss": 0.3553, - "mean_token_accuracy": 0.868051840364933, - "num_tokens": 182270315.0, - "step": 151500 - }, - { - "entropy": 1.9134663611650466, - "epoch": 0.4696677436279361, - "grad_norm": 8.243125915527344, - "learning_rate": 3.691464919605503e-06, - "loss": 0.453, - "mean_token_accuracy": 0.856773529946804, - "num_tokens": 182281992.0, - "step": 151510 - }, - { - "entropy": 1.8596371784806252, - "epoch": 0.4696987427529858, - "grad_norm": 6.062982559204102, - "learning_rate": 3.6913431023472958e-06, - "loss": 0.4168, - "mean_token_accuracy": 0.861498761177063, - "num_tokens": 182294109.0, - "step": 151520 - }, - { - "entropy": 1.8382547229528428, - "epoch": 0.4697297418780355, - "grad_norm": 8.140950202941895, - "learning_rate": 3.6912212971480952e-06, - "loss": 0.4041, - "mean_token_accuracy": 0.8637099817395211, - "num_tokens": 182307751.0, - "step": 151530 - }, - { - "entropy": 1.9259546086192132, - "epoch": 0.4697607410030852, - "grad_norm": 7.980990886688232, - "learning_rate": 3.69109950400591e-06, - "loss": 0.4731, - "mean_token_accuracy": 0.8463671579957008, - "num_tokens": 182319503.0, - "step": 151540 - }, - { - "entropy": 1.932530763745308, - "epoch": 0.4697917401281349, - "grad_norm": 7.715649604797363, - "learning_rate": 3.690977722918751e-06, - "loss": 0.4575, - "mean_token_accuracy": 0.848869289457798, - "num_tokens": 182331575.0, - "step": 151550 - }, - { - "entropy": 1.8189910173416137, - "epoch": 0.4698227392531846, - "grad_norm": 11.992012023925781, - "learning_rate": 3.690855953884631e-06, - "loss": 0.441, - "mean_token_accuracy": 0.8527128636837006, - "num_tokens": 182344435.0, - "step": 151560 - }, - { - "entropy": 1.9173847809433937, - "epoch": 0.4698537383782343, - "grad_norm": 6.471937656402588, - "learning_rate": 3.6907341969015616e-06, - "loss": 0.4537, - "mean_token_accuracy": 0.8575096279382706, - "num_tokens": 182355816.0, - "step": 151570 - }, - { - "entropy": 1.8749173179268837, - "epoch": 0.46988473750328397, - "grad_norm": 7.470911502838135, - "learning_rate": 3.6906124519675545e-06, - "loss": 0.4368, - "mean_token_accuracy": 0.8550667837262154, - "num_tokens": 182367893.0, - "step": 151580 - }, - { - "entropy": 1.88685844540596, - "epoch": 0.4699157366283337, - "grad_norm": 6.896969318389893, - "learning_rate": 3.6904907190806227e-06, - "loss": 0.4846, - "mean_token_accuracy": 0.8534591227769852, - "num_tokens": 182379280.0, - "step": 151590 - }, - { - "entropy": 2.0131676971912382, - "epoch": 0.46994673575338336, - "grad_norm": 9.929747581481934, - "learning_rate": 3.6903689982387797e-06, - "loss": 0.557, - "mean_token_accuracy": 0.834673935174942, - "num_tokens": 182390158.0, - "step": 151600 - }, - { - "entropy": 1.903835128247738, - "epoch": 0.46997773487843303, - "grad_norm": 8.8099365234375, - "learning_rate": 3.6902472894400397e-06, - "loss": 0.4271, - "mean_token_accuracy": 0.8607927531003952, - "num_tokens": 182401869.0, - "step": 151610 - }, - { - "entropy": 1.720665130019188, - "epoch": 0.47000873400348275, - "grad_norm": 7.94708776473999, - "learning_rate": 3.6901255926824165e-06, - "loss": 0.3738, - "mean_token_accuracy": 0.8740614369511605, - "num_tokens": 182416920.0, - "step": 151620 - }, - { - "entropy": 1.8281274721026421, - "epoch": 0.4700397331285324, - "grad_norm": 9.17940616607666, - "learning_rate": 3.6900039079639236e-06, - "loss": 0.4254, - "mean_token_accuracy": 0.858392845094204, - "num_tokens": 182429039.0, - "step": 151630 - }, - { - "entropy": 1.8888639822602271, - "epoch": 0.47007073225358215, - "grad_norm": 8.588458061218262, - "learning_rate": 3.689882235282579e-06, - "loss": 0.437, - "mean_token_accuracy": 0.8532740786671639, - "num_tokens": 182441392.0, - "step": 151640 - }, - { - "entropy": 1.9319140702486037, - "epoch": 0.4701017313786318, - "grad_norm": 8.485003471374512, - "learning_rate": 3.689760574636396e-06, - "loss": 0.4998, - "mean_token_accuracy": 0.8546903505921364, - "num_tokens": 182452652.0, - "step": 151650 - }, - { - "entropy": 1.90705948472023, - "epoch": 0.47013273050368154, - "grad_norm": 4.283156394958496, - "learning_rate": 3.6896389260233906e-06, - "loss": 0.4591, - "mean_token_accuracy": 0.8467658385634422, - "num_tokens": 182464784.0, - "step": 151660 - }, - { - "entropy": 1.8410677209496498, - "epoch": 0.4701637296287312, - "grad_norm": 7.59536075592041, - "learning_rate": 3.6895172894415802e-06, - "loss": 0.4054, - "mean_token_accuracy": 0.8557831704616546, - "num_tokens": 182476898.0, - "step": 151670 - }, - { - "entropy": 1.777267834544182, - "epoch": 0.47019472875378093, - "grad_norm": 8.52239990234375, - "learning_rate": 3.6893956648889815e-06, - "loss": 0.3651, - "mean_token_accuracy": 0.8673418864607811, - "num_tokens": 182490990.0, - "step": 151680 - }, - { - "entropy": 1.9240646213293076, - "epoch": 0.4702257278788306, - "grad_norm": 8.47541332244873, - "learning_rate": 3.689274052363612e-06, - "loss": 0.4779, - "mean_token_accuracy": 0.8535736605525017, - "num_tokens": 182502543.0, - "step": 151690 - }, - { - "entropy": 1.9823878049850463, - "epoch": 0.4702567270038803, - "grad_norm": 7.675898551940918, - "learning_rate": 3.6891524518634897e-06, - "loss": 0.4817, - "mean_token_accuracy": 0.8481106892228126, - "num_tokens": 182513069.0, - "step": 151700 - }, - { - "entropy": 1.8830109879374504, - "epoch": 0.47028772612893, - "grad_norm": 3.7766404151916504, - "learning_rate": 3.6890308633866324e-06, - "loss": 0.3968, - "mean_token_accuracy": 0.8605834320187569, - "num_tokens": 182524794.0, - "step": 151710 - }, - { - "entropy": 1.9220516681671143, - "epoch": 0.4703187252539797, - "grad_norm": 7.916464805603027, - "learning_rate": 3.6889092869310594e-06, - "loss": 0.4525, - "mean_token_accuracy": 0.859467314183712, - "num_tokens": 182535849.0, - "step": 151720 - }, - { - "entropy": 1.8670978620648384, - "epoch": 0.4703497243790294, - "grad_norm": 7.645886421203613, - "learning_rate": 3.68878772249479e-06, - "loss": 0.4164, - "mean_token_accuracy": 0.8644891589879989, - "num_tokens": 182548458.0, - "step": 151730 - }, - { - "entropy": 1.9042206808924675, - "epoch": 0.4703807235040791, - "grad_norm": 8.000757217407227, - "learning_rate": 3.6886661700758436e-06, - "loss": 0.4405, - "mean_token_accuracy": 0.8545676082372665, - "num_tokens": 182560533.0, - "step": 151740 - }, - { - "entropy": 1.8703896641731261, - "epoch": 0.4704117226291288, - "grad_norm": 7.271552562713623, - "learning_rate": 3.6885446296722404e-06, - "loss": 0.4418, - "mean_token_accuracy": 0.8588535279035568, - "num_tokens": 182572183.0, - "step": 151750 - }, - { - "entropy": 1.8296963378787041, - "epoch": 0.4704427217541785, - "grad_norm": 7.709904670715332, - "learning_rate": 3.688423101282001e-06, - "loss": 0.3636, - "mean_token_accuracy": 0.8668505728244782, - "num_tokens": 182584228.0, - "step": 151760 - }, - { - "entropy": 1.8257486388087272, - "epoch": 0.4704737208792282, - "grad_norm": 3.957359552383423, - "learning_rate": 3.688301584903147e-06, - "loss": 0.3983, - "mean_token_accuracy": 0.8542505398392677, - "num_tokens": 182597285.0, - "step": 151770 - }, - { - "entropy": 1.9298125982284546, - "epoch": 0.4705047200042779, - "grad_norm": 9.158317565917969, - "learning_rate": 3.688180080533698e-06, - "loss": 0.4692, - "mean_token_accuracy": 0.8475269585847854, - "num_tokens": 182609461.0, - "step": 151780 - }, - { - "entropy": 1.8265347346663474, - "epoch": 0.47053571912932757, - "grad_norm": 3.449368953704834, - "learning_rate": 3.688058588171679e-06, - "loss": 0.3726, - "mean_token_accuracy": 0.8622838228940963, - "num_tokens": 182623236.0, - "step": 151790 - }, - { - "entropy": 1.9088860154151917, - "epoch": 0.4705667182543773, - "grad_norm": 4.117319107055664, - "learning_rate": 3.68793710781511e-06, - "loss": 0.4677, - "mean_token_accuracy": 0.8477122217416764, - "num_tokens": 182636203.0, - "step": 151800 - }, - { - "entropy": 1.9173002913594246, - "epoch": 0.47059771737942696, - "grad_norm": 7.626106262207031, - "learning_rate": 3.6878156394620156e-06, - "loss": 0.4764, - "mean_token_accuracy": 0.8568134188652039, - "num_tokens": 182647282.0, - "step": 151810 - }, - { - "entropy": 1.9399552628397942, - "epoch": 0.4706287165044767, - "grad_norm": 9.241488456726074, - "learning_rate": 3.687694183110418e-06, - "loss": 0.4581, - "mean_token_accuracy": 0.8620687261223793, - "num_tokens": 182658289.0, - "step": 151820 - }, - { - "entropy": 1.911539526283741, - "epoch": 0.47065971562952635, - "grad_norm": 8.373076438903809, - "learning_rate": 3.687572738758341e-06, - "loss": 0.435, - "mean_token_accuracy": 0.85491793602705, - "num_tokens": 182670044.0, - "step": 151830 - }, - { - "entropy": 1.9582807749509812, - "epoch": 0.4706907147545761, - "grad_norm": 7.713615894317627, - "learning_rate": 3.68745130640381e-06, - "loss": 0.4862, - "mean_token_accuracy": 0.8571858555078506, - "num_tokens": 182681164.0, - "step": 151840 - }, - { - "entropy": 1.9365883350372315, - "epoch": 0.47072171387962575, - "grad_norm": 8.034655570983887, - "learning_rate": 3.687329886044849e-06, - "loss": 0.4573, - "mean_token_accuracy": 0.8554182961583138, - "num_tokens": 182691971.0, - "step": 151850 - }, - { - "entropy": 1.9806547105312347, - "epoch": 0.4707527130046754, - "grad_norm": 9.290281295776367, - "learning_rate": 3.6872084776794824e-06, - "loss": 0.4649, - "mean_token_accuracy": 0.8474147111177445, - "num_tokens": 182702979.0, - "step": 151860 - }, - { - "entropy": 1.9025154545903207, - "epoch": 0.47078371212972514, - "grad_norm": 7.480702877044678, - "learning_rate": 3.6870870813057372e-06, - "loss": 0.4161, - "mean_token_accuracy": 0.8635838240385055, - "num_tokens": 182714689.0, - "step": 151870 - }, - { - "entropy": 1.9400853931903839, - "epoch": 0.4708147112547748, - "grad_norm": 8.509082794189453, - "learning_rate": 3.6869656969216393e-06, - "loss": 0.4811, - "mean_token_accuracy": 0.8622403219342232, - "num_tokens": 182725603.0, - "step": 151880 - }, - { - "entropy": 1.9585573732852937, - "epoch": 0.47084571037982453, - "grad_norm": 9.956759452819824, - "learning_rate": 3.686844324525214e-06, - "loss": 0.4887, - "mean_token_accuracy": 0.8526047378778457, - "num_tokens": 182736039.0, - "step": 151890 - }, - { - "entropy": 1.9035368889570237, - "epoch": 0.4708767095048742, - "grad_norm": 7.4699387550354, - "learning_rate": 3.6867229641144897e-06, - "loss": 0.51, - "mean_token_accuracy": 0.8410866633057594, - "num_tokens": 182747402.0, - "step": 151900 - }, - { - "entropy": 1.909564508497715, - "epoch": 0.4709077086299239, - "grad_norm": 8.168258666992188, - "learning_rate": 3.686601615687493e-06, - "loss": 0.4735, - "mean_token_accuracy": 0.8454542621970177, - "num_tokens": 182759365.0, - "step": 151910 - }, - { - "entropy": 1.9451777666807175, - "epoch": 0.4709387077549736, - "grad_norm": 8.727764129638672, - "learning_rate": 3.686480279242252e-06, - "loss": 0.4561, - "mean_token_accuracy": 0.8463275358080864, - "num_tokens": 182771263.0, - "step": 151920 - }, - { - "entropy": 1.875081916153431, - "epoch": 0.4709697068800233, - "grad_norm": 10.775795936584473, - "learning_rate": 3.6863589547767952e-06, - "loss": 0.4724, - "mean_token_accuracy": 0.8565489128232002, - "num_tokens": 182782809.0, - "step": 151930 - }, - { - "entropy": 1.8936992183327674, - "epoch": 0.471000706005073, - "grad_norm": 3.990841865539551, - "learning_rate": 3.6862376422891516e-06, - "loss": 0.4563, - "mean_token_accuracy": 0.8535718321800232, - "num_tokens": 182794788.0, - "step": 151940 - }, - { - "entropy": 1.8360718876123427, - "epoch": 0.4710317051301227, - "grad_norm": 9.025553703308105, - "learning_rate": 3.6861163417773506e-06, - "loss": 0.4586, - "mean_token_accuracy": 0.855416850745678, - "num_tokens": 182807619.0, - "step": 151950 - }, - { - "entropy": 1.827831156551838, - "epoch": 0.4710627042551724, - "grad_norm": 7.876723766326904, - "learning_rate": 3.6859950532394207e-06, - "loss": 0.4411, - "mean_token_accuracy": 0.8626267358660697, - "num_tokens": 182820542.0, - "step": 151960 - }, - { - "entropy": 1.9320463940501214, - "epoch": 0.4710937033802221, - "grad_norm": 4.7400383949279785, - "learning_rate": 3.6858737766733936e-06, - "loss": 0.4801, - "mean_token_accuracy": 0.8590385049581528, - "num_tokens": 182832246.0, - "step": 151970 - }, - { - "entropy": 1.9527651473879815, - "epoch": 0.4711247025052718, - "grad_norm": 7.10463285446167, - "learning_rate": 3.6857525120772986e-06, - "loss": 0.4874, - "mean_token_accuracy": 0.8457214340567589, - "num_tokens": 182843575.0, - "step": 151980 - }, - { - "entropy": 1.912015789747238, - "epoch": 0.4711557016303215, - "grad_norm": 10.008974075317383, - "learning_rate": 3.6856312594491684e-06, - "loss": 0.4629, - "mean_token_accuracy": 0.8591500997543335, - "num_tokens": 182855099.0, - "step": 151990 - }, - { - "entropy": 1.8941986069083214, - "epoch": 0.47118670075537117, - "grad_norm": 7.317121505737305, - "learning_rate": 3.6855100187870336e-06, - "loss": 0.4426, - "mean_token_accuracy": 0.8630182087421417, - "num_tokens": 182867082.0, - "step": 152000 - }, - { - "entropy": 1.9675297275185586, - "epoch": 0.4712176998804209, - "grad_norm": 8.49598503112793, - "learning_rate": 3.685388790088926e-06, - "loss": 0.4801, - "mean_token_accuracy": 0.8479068145155907, - "num_tokens": 182878343.0, - "step": 152010 - }, - { - "entropy": 1.8332099094986916, - "epoch": 0.47124869900547056, - "grad_norm": 9.543118476867676, - "learning_rate": 3.6852675733528774e-06, - "loss": 0.3753, - "mean_token_accuracy": 0.867411358654499, - "num_tokens": 182891559.0, - "step": 152020 - }, - { - "entropy": 1.8728103652596473, - "epoch": 0.4712796981305203, - "grad_norm": 7.202358245849609, - "learning_rate": 3.6851463685769223e-06, - "loss": 0.4315, - "mean_token_accuracy": 0.8574921682476997, - "num_tokens": 182903624.0, - "step": 152030 - }, - { - "entropy": 1.8856816232204436, - "epoch": 0.47131069725556995, - "grad_norm": 7.488083362579346, - "learning_rate": 3.6850251757590934e-06, - "loss": 0.4045, - "mean_token_accuracy": 0.8632172226905823, - "num_tokens": 182915491.0, - "step": 152040 - }, - { - "entropy": 1.961459246277809, - "epoch": 0.4713416963806197, - "grad_norm": 7.7718915939331055, - "learning_rate": 3.684903994897424e-06, - "loss": 0.4761, - "mean_token_accuracy": 0.8584063723683357, - "num_tokens": 182926333.0, - "step": 152050 - }, - { - "entropy": 1.9349211975932121, - "epoch": 0.47137269550566935, - "grad_norm": 7.356520652770996, - "learning_rate": 3.684782825989949e-06, - "loss": 0.4553, - "mean_token_accuracy": 0.85878836363554, - "num_tokens": 182937813.0, - "step": 152060 - }, - { - "entropy": 1.83448978215456, - "epoch": 0.4714036946307191, - "grad_norm": 7.66597843170166, - "learning_rate": 3.6846616690347028e-06, - "loss": 0.3923, - "mean_token_accuracy": 0.8577890947461129, - "num_tokens": 182951161.0, - "step": 152070 - }, - { - "entropy": 1.9403810694813728, - "epoch": 0.47143469375576874, - "grad_norm": 8.418362617492676, - "learning_rate": 3.684540524029721e-06, - "loss": 0.4767, - "mean_token_accuracy": 0.8424350410699845, - "num_tokens": 182962756.0, - "step": 152080 - }, - { - "entropy": 1.9282257035374641, - "epoch": 0.47146569288081847, - "grad_norm": 7.732948303222656, - "learning_rate": 3.6844193909730393e-06, - "loss": 0.4742, - "mean_token_accuracy": 0.8520740434527397, - "num_tokens": 182974074.0, - "step": 152090 - }, - { - "entropy": 1.8858251735568046, - "epoch": 0.47149669200586813, - "grad_norm": 7.745698928833008, - "learning_rate": 3.684298269862692e-06, - "loss": 0.45, - "mean_token_accuracy": 0.8505064308643341, - "num_tokens": 182986163.0, - "step": 152100 - }, - { - "entropy": 1.8810465842485429, - "epoch": 0.4715276911309178, - "grad_norm": 4.165502071380615, - "learning_rate": 3.6841771606967176e-06, - "loss": 0.4411, - "mean_token_accuracy": 0.8553640246391296, - "num_tokens": 182998633.0, - "step": 152110 - }, - { - "entropy": 1.8921176463365554, - "epoch": 0.4715586902559675, - "grad_norm": 7.671092987060547, - "learning_rate": 3.684056063473152e-06, - "loss": 0.394, - "mean_token_accuracy": 0.8703850194811821, - "num_tokens": 183010501.0, - "step": 152120 - }, - { - "entropy": 1.8996976360678672, - "epoch": 0.4715896893810172, - "grad_norm": 3.6530537605285645, - "learning_rate": 3.6839349781900336e-06, - "loss": 0.4634, - "mean_token_accuracy": 0.8507614269852638, - "num_tokens": 183022012.0, - "step": 152130 - }, - { - "entropy": 1.8891934141516686, - "epoch": 0.4716206885060669, - "grad_norm": 3.9836368560791016, - "learning_rate": 3.6838139048453997e-06, - "loss": 0.4175, - "mean_token_accuracy": 0.8550976201891899, - "num_tokens": 183034335.0, - "step": 152140 - }, - { - "entropy": 1.8683853819966316, - "epoch": 0.4716516876311166, - "grad_norm": 7.420729160308838, - "learning_rate": 3.6836928434372883e-06, - "loss": 0.4207, - "mean_token_accuracy": 0.8592018321156502, - "num_tokens": 183046580.0, - "step": 152150 - }, - { - "entropy": 1.9617306634783744, - "epoch": 0.4716826867561663, - "grad_norm": 8.44356632232666, - "learning_rate": 3.6835717939637382e-06, - "loss": 0.4988, - "mean_token_accuracy": 0.8451585352420807, - "num_tokens": 183057635.0, - "step": 152160 - }, - { - "entropy": 1.895785966515541, - "epoch": 0.471713685881216, - "grad_norm": 9.277005195617676, - "learning_rate": 3.6834507564227894e-06, - "loss": 0.4104, - "mean_token_accuracy": 0.8586176127195358, - "num_tokens": 183069793.0, - "step": 152170 - }, - { - "entropy": 1.9464083522558213, - "epoch": 0.4717446850062657, - "grad_norm": 9.0181884765625, - "learning_rate": 3.6833297308124816e-06, - "loss": 0.4491, - "mean_token_accuracy": 0.8472587198019028, - "num_tokens": 183081520.0, - "step": 152180 - }, - { - "entropy": 1.8146517232060433, - "epoch": 0.4717756841313154, - "grad_norm": 7.19028902053833, - "learning_rate": 3.683208717130854e-06, - "loss": 0.4022, - "mean_token_accuracy": 0.8625146299600601, - "num_tokens": 183094449.0, - "step": 152190 - }, - { - "entropy": 1.8609568387269975, - "epoch": 0.4718066832563651, - "grad_norm": 9.199468612670898, - "learning_rate": 3.6830877153759475e-06, - "loss": 0.4774, - "mean_token_accuracy": 0.8427708148956299, - "num_tokens": 183107155.0, - "step": 152200 - }, - { - "entropy": 1.9776297345757485, - "epoch": 0.47183768238141477, - "grad_norm": 9.14094352722168, - "learning_rate": 3.682966725545803e-06, - "loss": 0.52, - "mean_token_accuracy": 0.8375296384096146, - "num_tokens": 183118329.0, - "step": 152210 - }, - { - "entropy": 1.8778993353247642, - "epoch": 0.4718686815064645, - "grad_norm": 7.582857131958008, - "learning_rate": 3.6828457476384627e-06, - "loss": 0.4531, - "mean_token_accuracy": 0.8534572198987007, - "num_tokens": 183130122.0, - "step": 152220 - }, - { - "entropy": 1.9139007419347762, - "epoch": 0.47189968063151416, - "grad_norm": 8.777506828308105, - "learning_rate": 3.6827247816519684e-06, - "loss": 0.4427, - "mean_token_accuracy": 0.8544740185141564, - "num_tokens": 183141880.0, - "step": 152230 - }, - { - "entropy": 1.8196016132831574, - "epoch": 0.4719306797565639, - "grad_norm": 6.885635852813721, - "learning_rate": 3.6826038275843614e-06, - "loss": 0.3667, - "mean_token_accuracy": 0.8703275233507156, - "num_tokens": 183154880.0, - "step": 152240 - }, - { - "entropy": 1.951605823636055, - "epoch": 0.47196167888161356, - "grad_norm": 5.0247979164123535, - "learning_rate": 3.682482885433685e-06, - "loss": 0.4446, - "mean_token_accuracy": 0.8587975725531578, - "num_tokens": 183166905.0, - "step": 152250 - }, - { - "entropy": 1.8515436753630639, - "epoch": 0.4719926780066633, - "grad_norm": 7.222294807434082, - "learning_rate": 3.682361955197983e-06, - "loss": 0.3812, - "mean_token_accuracy": 0.8555783748626709, - "num_tokens": 183179878.0, - "step": 152260 - }, - { - "entropy": 1.9167709454894066, - "epoch": 0.47202367713171295, - "grad_norm": 3.6357107162475586, - "learning_rate": 3.682241036875299e-06, - "loss": 0.4907, - "mean_token_accuracy": 0.8456657961010933, - "num_tokens": 183191647.0, - "step": 152270 - }, - { - "entropy": 1.9338033080101014, - "epoch": 0.4720546762567627, - "grad_norm": 8.893651962280273, - "learning_rate": 3.682120130463677e-06, - "loss": 0.4721, - "mean_token_accuracy": 0.8554412201046944, - "num_tokens": 183203524.0, - "step": 152280 - }, - { - "entropy": 1.8064615935087205, - "epoch": 0.47208567538181234, - "grad_norm": 8.168737411499023, - "learning_rate": 3.681999235961162e-06, - "loss": 0.4264, - "mean_token_accuracy": 0.8518370047211647, - "num_tokens": 183216820.0, - "step": 152290 - }, - { - "entropy": 1.8602395519614219, - "epoch": 0.47211667450686207, - "grad_norm": 9.071744918823242, - "learning_rate": 3.681878353365799e-06, - "loss": 0.4119, - "mean_token_accuracy": 0.8614316672086716, - "num_tokens": 183228935.0, - "step": 152300 - }, - { - "entropy": 1.8436309307813645, - "epoch": 0.47214767363191174, - "grad_norm": 7.226712226867676, - "learning_rate": 3.6817574826756326e-06, - "loss": 0.4279, - "mean_token_accuracy": 0.8612644121050834, - "num_tokens": 183241674.0, - "step": 152310 - }, - { - "entropy": 1.8456968396902085, - "epoch": 0.47217867275696146, - "grad_norm": 7.766792297363281, - "learning_rate": 3.6816366238887095e-06, - "loss": 0.48, - "mean_token_accuracy": 0.8446129441261292, - "num_tokens": 183254431.0, - "step": 152320 - }, - { - "entropy": 1.8767608508467675, - "epoch": 0.47220967188201113, - "grad_norm": 8.538994789123535, - "learning_rate": 3.681515777003077e-06, - "loss": 0.4523, - "mean_token_accuracy": 0.8586622327566147, - "num_tokens": 183265932.0, - "step": 152330 - }, - { - "entropy": 1.9017756059765816, - "epoch": 0.47224067100706085, - "grad_norm": 9.971243858337402, - "learning_rate": 3.68139494201678e-06, - "loss": 0.4552, - "mean_token_accuracy": 0.8545010507106781, - "num_tokens": 183277542.0, - "step": 152340 - }, - { - "entropy": 1.9646113216876984, - "epoch": 0.4722716701321105, - "grad_norm": 12.376042366027832, - "learning_rate": 3.681274118927867e-06, - "loss": 0.506, - "mean_token_accuracy": 0.8538304835557937, - "num_tokens": 183288333.0, - "step": 152350 - }, - { - "entropy": 1.8397805973887444, - "epoch": 0.4723026692571602, - "grad_norm": 3.604233741760254, - "learning_rate": 3.6811533077343866e-06, - "loss": 0.4059, - "mean_token_accuracy": 0.8551945656538009, - "num_tokens": 183301004.0, - "step": 152360 - }, - { - "entropy": 1.8144716992974281, - "epoch": 0.4723336683822099, - "grad_norm": 8.521669387817383, - "learning_rate": 3.681032508434385e-06, - "loss": 0.3965, - "mean_token_accuracy": 0.8576774016022682, - "num_tokens": 183313976.0, - "step": 152370 - }, - { - "entropy": 1.9162440985441207, - "epoch": 0.4723646675072596, - "grad_norm": 4.395325660705566, - "learning_rate": 3.6809117210259127e-06, - "loss": 0.4428, - "mean_token_accuracy": 0.8563192039728165, - "num_tokens": 183325996.0, - "step": 152380 - }, - { - "entropy": 1.800866074860096, - "epoch": 0.4723956666323093, - "grad_norm": 4.031833648681641, - "learning_rate": 3.680790945507018e-06, - "loss": 0.3897, - "mean_token_accuracy": 0.8660914584994316, - "num_tokens": 183338994.0, - "step": 152390 - }, - { - "entropy": 1.9331076472997666, - "epoch": 0.472426665757359, - "grad_norm": 7.948485851287842, - "learning_rate": 3.6806701818757502e-06, - "loss": 0.4812, - "mean_token_accuracy": 0.8479227542877197, - "num_tokens": 183350361.0, - "step": 152400 - }, - { - "entropy": 1.894187593460083, - "epoch": 0.4724576648824087, - "grad_norm": 3.7551207542419434, - "learning_rate": 3.680549430130159e-06, - "loss": 0.4179, - "mean_token_accuracy": 0.8658199816942215, - "num_tokens": 183361759.0, - "step": 152410 - }, - { - "entropy": 1.936548638343811, - "epoch": 0.47248866400745837, - "grad_norm": 7.659783363342285, - "learning_rate": 3.680428690268297e-06, - "loss": 0.4861, - "mean_token_accuracy": 0.8464308723807334, - "num_tokens": 183372606.0, - "step": 152420 - }, - { - "entropy": 1.892865703999996, - "epoch": 0.4725196631325081, - "grad_norm": 5.735332012176514, - "learning_rate": 3.6803079622882127e-06, - "loss": 0.4803, - "mean_token_accuracy": 0.8505585104227066, - "num_tokens": 183384415.0, - "step": 152430 - }, - { - "entropy": 1.8908448189496994, - "epoch": 0.47255066225755776, - "grad_norm": 7.597827434539795, - "learning_rate": 3.6801872461879588e-06, - "loss": 0.4453, - "mean_token_accuracy": 0.8497716590762139, - "num_tokens": 183396513.0, - "step": 152440 - }, - { - "entropy": 1.9123169988393784, - "epoch": 0.4725816613826075, - "grad_norm": 8.45621395111084, - "learning_rate": 3.6800665419655856e-06, - "loss": 0.4784, - "mean_token_accuracy": 0.8462508618831635, - "num_tokens": 183407985.0, - "step": 152450 - }, - { - "entropy": 1.8796579480171203, - "epoch": 0.47261266050765716, - "grad_norm": 7.05368185043335, - "learning_rate": 3.6799458496191475e-06, - "loss": 0.471, - "mean_token_accuracy": 0.8509629443287849, - "num_tokens": 183419056.0, - "step": 152460 - }, - { - "entropy": 1.8205665156245232, - "epoch": 0.4726436596327069, - "grad_norm": 7.551618576049805, - "learning_rate": 3.6798251691466964e-06, - "loss": 0.4077, - "mean_token_accuracy": 0.858218289911747, - "num_tokens": 183431365.0, - "step": 152470 - }, - { - "entropy": 1.857941946387291, - "epoch": 0.47267465875775655, - "grad_norm": 9.102563858032227, - "learning_rate": 3.6797045005462845e-06, - "loss": 0.4162, - "mean_token_accuracy": 0.8679466590285301, - "num_tokens": 183442920.0, - "step": 152480 - }, - { - "entropy": 1.87067861109972, - "epoch": 0.4727056578828063, - "grad_norm": 4.3301897048950195, - "learning_rate": 3.6795838438159666e-06, - "loss": 0.449, - "mean_token_accuracy": 0.8534408792853355, - "num_tokens": 183454386.0, - "step": 152490 - }, - { - "entropy": 1.8775417447090148, - "epoch": 0.47273665700785594, - "grad_norm": 4.135782241821289, - "learning_rate": 3.6794631989537953e-06, - "loss": 0.4695, - "mean_token_accuracy": 0.8456904798746109, - "num_tokens": 183466342.0, - "step": 152500 - }, - { - "entropy": 1.9376057237386703, - "epoch": 0.47276765613290567, - "grad_norm": 8.15317153930664, - "learning_rate": 3.6793425659578263e-06, - "loss": 0.4641, - "mean_token_accuracy": 0.8525159105658531, - "num_tokens": 183477396.0, - "step": 152510 - }, - { - "entropy": 1.8745385244488717, - "epoch": 0.47279865525795534, - "grad_norm": 2.6730575561523438, - "learning_rate": 3.6792219448261145e-06, - "loss": 0.4486, - "mean_token_accuracy": 0.8526719242334366, - "num_tokens": 183489938.0, - "step": 152520 - }, - { - "entropy": 1.9059744358062745, - "epoch": 0.47282965438300506, - "grad_norm": 6.631153106689453, - "learning_rate": 3.6791013355567153e-06, - "loss": 0.4762, - "mean_token_accuracy": 0.8524453654885292, - "num_tokens": 183501781.0, - "step": 152530 - }, - { - "entropy": 1.9294340521097184, - "epoch": 0.47286065350805473, - "grad_norm": 9.2339506149292, - "learning_rate": 3.6789807381476832e-06, - "loss": 0.4727, - "mean_token_accuracy": 0.8489730969071388, - "num_tokens": 183513298.0, - "step": 152540 - }, - { - "entropy": 1.915228234231472, - "epoch": 0.47289165263310445, - "grad_norm": 7.601345062255859, - "learning_rate": 3.6788601525970764e-06, - "loss": 0.4844, - "mean_token_accuracy": 0.8517981469631195, - "num_tokens": 183524819.0, - "step": 152550 - }, - { - "entropy": 1.8034334376454353, - "epoch": 0.4729226517581541, - "grad_norm": 4.246415138244629, - "learning_rate": 3.67873957890295e-06, - "loss": 0.3833, - "mean_token_accuracy": 0.8603020891547203, - "num_tokens": 183537540.0, - "step": 152560 - }, - { - "entropy": 1.8432205379009248, - "epoch": 0.47295365088320385, - "grad_norm": 8.568245887756348, - "learning_rate": 3.6786190170633637e-06, - "loss": 0.421, - "mean_token_accuracy": 0.8558652251958847, - "num_tokens": 183550060.0, - "step": 152570 - }, - { - "entropy": 1.9111878633499146, - "epoch": 0.4729846500082535, - "grad_norm": 6.995577812194824, - "learning_rate": 3.678498467076371e-06, - "loss": 0.4648, - "mean_token_accuracy": 0.8523382723331452, - "num_tokens": 183561570.0, - "step": 152580 - }, - { - "entropy": 1.792884823679924, - "epoch": 0.47301564913330324, - "grad_norm": 5.278679370880127, - "learning_rate": 3.6783779289400336e-06, - "loss": 0.3254, - "mean_token_accuracy": 0.8689914017915725, - "num_tokens": 183575606.0, - "step": 152590 - }, - { - "entropy": 1.8932118907570838, - "epoch": 0.4730466482583529, - "grad_norm": 8.754302024841309, - "learning_rate": 3.6782574026524075e-06, - "loss": 0.4087, - "mean_token_accuracy": 0.8607703685760498, - "num_tokens": 183588123.0, - "step": 152600 - }, - { - "entropy": 1.863177940249443, - "epoch": 0.4730776473834026, - "grad_norm": 4.379142761230469, - "learning_rate": 3.6781368882115536e-06, - "loss": 0.4756, - "mean_token_accuracy": 0.8468897491693497, - "num_tokens": 183600671.0, - "step": 152610 - }, - { - "entropy": 1.9543335124850274, - "epoch": 0.4731086465084523, - "grad_norm": 8.659340858459473, - "learning_rate": 3.6780163856155306e-06, - "loss": 0.5324, - "mean_token_accuracy": 0.8367661848664284, - "num_tokens": 183612571.0, - "step": 152620 - }, - { - "entropy": 1.8547952726483345, - "epoch": 0.47313964563350197, - "grad_norm": 11.049469947814941, - "learning_rate": 3.677895894862398e-06, - "loss": 0.4265, - "mean_token_accuracy": 0.853056563436985, - "num_tokens": 183625066.0, - "step": 152630 - }, - { - "entropy": 1.8781471252441406, - "epoch": 0.4731706447585517, - "grad_norm": 3.9625051021575928, - "learning_rate": 3.6777754159502156e-06, - "loss": 0.4494, - "mean_token_accuracy": 0.8535264268517494, - "num_tokens": 183637140.0, - "step": 152640 - }, - { - "entropy": 1.9165562033653258, - "epoch": 0.47320164388360136, - "grad_norm": 8.369365692138672, - "learning_rate": 3.677654948877046e-06, - "loss": 0.4494, - "mean_token_accuracy": 0.854417635500431, - "num_tokens": 183649188.0, - "step": 152650 - }, - { - "entropy": 1.9343287527561188, - "epoch": 0.4732326430086511, - "grad_norm": 8.027566909790039, - "learning_rate": 3.6775344936409477e-06, - "loss": 0.4711, - "mean_token_accuracy": 0.8549977317452431, - "num_tokens": 183660619.0, - "step": 152660 - }, - { - "entropy": 1.7774075135588645, - "epoch": 0.47326364213370076, - "grad_norm": 8.56772518157959, - "learning_rate": 3.6774140502399853e-06, - "loss": 0.3696, - "mean_token_accuracy": 0.8563277691602706, - "num_tokens": 183674724.0, - "step": 152670 - }, - { - "entropy": 1.7775170743465423, - "epoch": 0.4732946412587505, - "grad_norm": 3.6011083126068115, - "learning_rate": 3.6772936186722188e-06, - "loss": 0.3685, - "mean_token_accuracy": 0.8681730851531029, - "num_tokens": 183688045.0, - "step": 152680 - }, - { - "entropy": 1.891077609360218, - "epoch": 0.47332564038380015, - "grad_norm": 9.511871337890625, - "learning_rate": 3.6771731989357106e-06, - "loss": 0.4778, - "mean_token_accuracy": 0.8397925227880478, - "num_tokens": 183700096.0, - "step": 152690 - }, - { - "entropy": 1.8384882450103759, - "epoch": 0.4733566395088499, - "grad_norm": 6.966220855712891, - "learning_rate": 3.6770527910285245e-06, - "loss": 0.4154, - "mean_token_accuracy": 0.8562698677182198, - "num_tokens": 183712595.0, - "step": 152700 - }, - { - "entropy": 1.85170476436615, - "epoch": 0.47338763863389954, - "grad_norm": 8.161643981933594, - "learning_rate": 3.6769323949487246e-06, - "loss": 0.4254, - "mean_token_accuracy": 0.8651944741606712, - "num_tokens": 183724796.0, - "step": 152710 - }, - { - "entropy": 1.8865203723311423, - "epoch": 0.47341863775894927, - "grad_norm": 10.452710151672363, - "learning_rate": 3.676812010694373e-06, - "loss": 0.4516, - "mean_token_accuracy": 0.8501796692609787, - "num_tokens": 183737051.0, - "step": 152720 - }, - { - "entropy": 1.841557838022709, - "epoch": 0.47344963688399894, - "grad_norm": 3.842358350753784, - "learning_rate": 3.6766916382635347e-06, - "loss": 0.4337, - "mean_token_accuracy": 0.8630548194050789, - "num_tokens": 183749899.0, - "step": 152730 - }, - { - "entropy": 1.905419360846281, - "epoch": 0.47348063600904866, - "grad_norm": 7.395157337188721, - "learning_rate": 3.6765712776542745e-06, - "loss": 0.4447, - "mean_token_accuracy": 0.856062363088131, - "num_tokens": 183762289.0, - "step": 152740 - }, - { - "entropy": 1.6978764295578004, - "epoch": 0.47351163513409833, - "grad_norm": 8.431910514831543, - "learning_rate": 3.6764509288646577e-06, - "loss": 0.3217, - "mean_token_accuracy": 0.8770282730460167, - "num_tokens": 183776782.0, - "step": 152750 - }, - { - "entropy": 1.8656988859176635, - "epoch": 0.47354263425914805, - "grad_norm": 9.192631721496582, - "learning_rate": 3.6763305918927494e-06, - "loss": 0.4009, - "mean_token_accuracy": 0.871405579149723, - "num_tokens": 183788701.0, - "step": 152760 - }, - { - "entropy": 1.8958804607391357, - "epoch": 0.4735736333841977, - "grad_norm": 7.898538112640381, - "learning_rate": 3.676210266736617e-06, - "loss": 0.4845, - "mean_token_accuracy": 0.8465561047196388, - "num_tokens": 183799771.0, - "step": 152770 - }, - { - "entropy": 1.8809950187802316, - "epoch": 0.47360463250924745, - "grad_norm": 3.637345790863037, - "learning_rate": 3.6760899533943257e-06, - "loss": 0.4451, - "mean_token_accuracy": 0.8576677471399308, - "num_tokens": 183811578.0, - "step": 152780 - }, - { - "entropy": 1.804450687766075, - "epoch": 0.4736356316342971, - "grad_norm": 3.748936653137207, - "learning_rate": 3.675969651863942e-06, - "loss": 0.3788, - "mean_token_accuracy": 0.8617606118321419, - "num_tokens": 183824765.0, - "step": 152790 - }, - { - "entropy": 1.8327493906021117, - "epoch": 0.47366663075934684, - "grad_norm": 8.088676452636719, - "learning_rate": 3.675849362143535e-06, - "loss": 0.397, - "mean_token_accuracy": 0.8559876635670662, - "num_tokens": 183837545.0, - "step": 152800 - }, - { - "entropy": 1.8694708958268165, - "epoch": 0.4736976298843965, - "grad_norm": 8.502338409423828, - "learning_rate": 3.6757290842311712e-06, - "loss": 0.4315, - "mean_token_accuracy": 0.8672619074583053, - "num_tokens": 183849680.0, - "step": 152810 - }, - { - "entropy": 1.9533298015594482, - "epoch": 0.47372862900944623, - "grad_norm": 8.786824226379395, - "learning_rate": 3.6756088181249183e-06, - "loss": 0.4956, - "mean_token_accuracy": 0.844579030573368, - "num_tokens": 183860338.0, - "step": 152820 - }, - { - "entropy": 1.7913990572094918, - "epoch": 0.4737596281344959, - "grad_norm": 7.474668979644775, - "learning_rate": 3.675488563822847e-06, - "loss": 0.3915, - "mean_token_accuracy": 0.8707406103610993, - "num_tokens": 183872820.0, - "step": 152830 - }, - { - "entropy": 1.8852297574281693, - "epoch": 0.4737906272595456, - "grad_norm": 8.760466575622559, - "learning_rate": 3.675368321323025e-06, - "loss": 0.4826, - "mean_token_accuracy": 0.8509562835097313, - "num_tokens": 183884880.0, - "step": 152840 - }, - { - "entropy": 1.9746339291334152, - "epoch": 0.4738216263845953, - "grad_norm": 6.545812129974365, - "learning_rate": 3.6752480906235227e-06, - "loss": 0.5101, - "mean_token_accuracy": 0.8477579385042191, - "num_tokens": 183896158.0, - "step": 152850 - }, - { - "entropy": 1.8532864689826964, - "epoch": 0.47385262550964496, - "grad_norm": 7.785163879394531, - "learning_rate": 3.675127871722409e-06, - "loss": 0.3823, - "mean_token_accuracy": 0.8605849370360374, - "num_tokens": 183908502.0, - "step": 152860 - }, - { - "entropy": 1.927658785879612, - "epoch": 0.4738836246346947, - "grad_norm": 8.491934776306152, - "learning_rate": 3.6750076646177558e-06, - "loss": 0.4626, - "mean_token_accuracy": 0.8559217736124992, - "num_tokens": 183920673.0, - "step": 152870 - }, - { - "entropy": 1.9375594913959504, - "epoch": 0.47391462375974436, - "grad_norm": 7.955965995788574, - "learning_rate": 3.6748874693076326e-06, - "loss": 0.4898, - "mean_token_accuracy": 0.8447357758879661, - "num_tokens": 183932317.0, - "step": 152880 - }, - { - "entropy": 1.831139837950468, - "epoch": 0.4739456228847941, - "grad_norm": 4.309988021850586, - "learning_rate": 3.6747672857901117e-06, - "loss": 0.404, - "mean_token_accuracy": 0.8567724660038948, - "num_tokens": 183945777.0, - "step": 152890 - }, - { - "entropy": 1.9209632962942123, - "epoch": 0.47397662200984375, - "grad_norm": 7.784418106079102, - "learning_rate": 3.674647114063265e-06, - "loss": 0.4464, - "mean_token_accuracy": 0.8604557514190674, - "num_tokens": 183956863.0, - "step": 152900 - }, - { - "entropy": 1.9507730916142463, - "epoch": 0.4740076211348935, - "grad_norm": 7.421597480773926, - "learning_rate": 3.674526954125164e-06, - "loss": 0.4967, - "mean_token_accuracy": 0.8345337122678756, - "num_tokens": 183968147.0, - "step": 152910 - }, - { - "entropy": 1.9124934807419778, - "epoch": 0.47403862025994314, - "grad_norm": 9.237162590026855, - "learning_rate": 3.674406805973881e-06, - "loss": 0.4493, - "mean_token_accuracy": 0.8510349482297898, - "num_tokens": 183978987.0, - "step": 152920 - }, - { - "entropy": 1.9097222343087197, - "epoch": 0.47406961938499287, - "grad_norm": 9.957469940185547, - "learning_rate": 3.6742866696074915e-06, - "loss": 0.4697, - "mean_token_accuracy": 0.8503610968589783, - "num_tokens": 183990806.0, - "step": 152930 - }, - { - "entropy": 1.9008977569639682, - "epoch": 0.47410061851004254, - "grad_norm": 8.201800346374512, - "learning_rate": 3.6741665450240667e-06, - "loss": 0.4224, - "mean_token_accuracy": 0.8540589138865471, - "num_tokens": 184003428.0, - "step": 152940 - }, - { - "entropy": 1.8850766360759734, - "epoch": 0.47413161763509226, - "grad_norm": 7.431873798370361, - "learning_rate": 3.6740464322216814e-06, - "loss": 0.4595, - "mean_token_accuracy": 0.8467616021633149, - "num_tokens": 184015405.0, - "step": 152950 - }, - { - "entropy": 1.9019383952021598, - "epoch": 0.47416261676014193, - "grad_norm": 7.215775012969971, - "learning_rate": 3.67392633119841e-06, - "loss": 0.4318, - "mean_token_accuracy": 0.8517497405409813, - "num_tokens": 184027587.0, - "step": 152960 - }, - { - "entropy": 1.905358751118183, - "epoch": 0.47419361588519165, - "grad_norm": 2.930060863494873, - "learning_rate": 3.6738062419523276e-06, - "loss": 0.4719, - "mean_token_accuracy": 0.8468658626079559, - "num_tokens": 184039622.0, - "step": 152970 - }, - { - "entropy": 1.8342019632458686, - "epoch": 0.4742246150102413, - "grad_norm": 3.8289215564727783, - "learning_rate": 3.6736861644815084e-06, - "loss": 0.4457, - "mean_token_accuracy": 0.8554961785674096, - "num_tokens": 184051541.0, - "step": 152980 - }, - { - "entropy": 1.86045740544796, - "epoch": 0.47425561413529105, - "grad_norm": 7.936079025268555, - "learning_rate": 3.6735660987840305e-06, - "loss": 0.4993, - "mean_token_accuracy": 0.8530977353453636, - "num_tokens": 184063826.0, - "step": 152990 - }, - { - "entropy": 1.8976167649030686, - "epoch": 0.4742866132603407, - "grad_norm": 3.9572274684906006, - "learning_rate": 3.6734460448579673e-06, - "loss": 0.4313, - "mean_token_accuracy": 0.858278226852417, - "num_tokens": 184075371.0, - "step": 153000 - }, - { - "entropy": 1.8274334058165551, - "epoch": 0.47431761238539044, - "grad_norm": 6.631383419036865, - "learning_rate": 3.6733260027013985e-06, - "loss": 0.4297, - "mean_token_accuracy": 0.8602709263563156, - "num_tokens": 184088890.0, - "step": 153010 - }, - { - "entropy": 1.934623521566391, - "epoch": 0.4743486115104401, - "grad_norm": 8.96616268157959, - "learning_rate": 3.673205972312398e-06, - "loss": 0.4912, - "mean_token_accuracy": 0.8473653078079224, - "num_tokens": 184100349.0, - "step": 153020 - }, - { - "entropy": 1.874304661154747, - "epoch": 0.47437961063548983, - "grad_norm": 6.859756946563721, - "learning_rate": 3.6730859536890454e-06, - "loss": 0.4008, - "mean_token_accuracy": 0.8729176357388496, - "num_tokens": 184112664.0, - "step": 153030 - }, - { - "entropy": 1.9090553134679795, - "epoch": 0.4744106097605395, - "grad_norm": 6.6315016746521, - "learning_rate": 3.6729659468294182e-06, - "loss": 0.4221, - "mean_token_accuracy": 0.8530550181865693, - "num_tokens": 184125100.0, - "step": 153040 - }, - { - "entropy": 1.8723108693957329, - "epoch": 0.4744416088855892, - "grad_norm": 4.024182319641113, - "learning_rate": 3.6728459517315944e-06, - "loss": 0.4138, - "mean_token_accuracy": 0.8646663218736649, - "num_tokens": 184136438.0, - "step": 153050 - }, - { - "entropy": 1.8754427291452884, - "epoch": 0.4744726080106389, - "grad_norm": 2.725093364715576, - "learning_rate": 3.672725968393654e-06, - "loss": 0.4289, - "mean_token_accuracy": 0.8570066452026367, - "num_tokens": 184148910.0, - "step": 153060 - }, - { - "entropy": 1.897630612552166, - "epoch": 0.4745036071356886, - "grad_norm": 8.470466613769531, - "learning_rate": 3.6726059968136746e-06, - "loss": 0.4828, - "mean_token_accuracy": 0.8399512648582459, - "num_tokens": 184160738.0, - "step": 153070 - }, - { - "entropy": 1.8298255681991578, - "epoch": 0.4745346062607383, - "grad_norm": 8.843210220336914, - "learning_rate": 3.672486036989737e-06, - "loss": 0.4457, - "mean_token_accuracy": 0.8569268405437469, - "num_tokens": 184173795.0, - "step": 153080 - }, - { - "entropy": 1.855211439728737, - "epoch": 0.47456560538578796, - "grad_norm": 7.2180280685424805, - "learning_rate": 3.6723660889199214e-06, - "loss": 0.4219, - "mean_token_accuracy": 0.8601253613829613, - "num_tokens": 184185706.0, - "step": 153090 - }, - { - "entropy": 1.957202786207199, - "epoch": 0.4745966045108377, - "grad_norm": 8.35397720336914, - "learning_rate": 3.672246152602308e-06, - "loss": 0.4675, - "mean_token_accuracy": 0.8588167384266854, - "num_tokens": 184196601.0, - "step": 153100 - }, - { - "entropy": 1.8255947291851045, - "epoch": 0.47462760363588735, - "grad_norm": 9.436110496520996, - "learning_rate": 3.6721262280349785e-06, - "loss": 0.4364, - "mean_token_accuracy": 0.85789824873209, - "num_tokens": 184209736.0, - "step": 153110 - }, - { - "entropy": 1.846110972762108, - "epoch": 0.4746586027609371, - "grad_norm": 3.714542865753174, - "learning_rate": 3.6720063152160128e-06, - "loss": 0.4014, - "mean_token_accuracy": 0.8676638081669807, - "num_tokens": 184222073.0, - "step": 153120 - }, - { - "entropy": 1.9054560333490371, - "epoch": 0.47468960188598674, - "grad_norm": 9.361035346984863, - "learning_rate": 3.6718864141434946e-06, - "loss": 0.4596, - "mean_token_accuracy": 0.8527064174413681, - "num_tokens": 184233663.0, - "step": 153130 - }, - { - "entropy": 1.9491969496011734, - "epoch": 0.47472060101103647, - "grad_norm": 8.116238594055176, - "learning_rate": 3.6717665248155054e-06, - "loss": 0.4821, - "mean_token_accuracy": 0.84459348320961, - "num_tokens": 184244980.0, - "step": 153140 - }, - { - "entropy": 1.9269451722502708, - "epoch": 0.47475160013608614, - "grad_norm": 7.581139087677002, - "learning_rate": 3.6716466472301283e-06, - "loss": 0.4453, - "mean_token_accuracy": 0.8586566999554635, - "num_tokens": 184256411.0, - "step": 153150 - }, - { - "entropy": 1.7936486840248107, - "epoch": 0.47478259926113586, - "grad_norm": 3.638094663619995, - "learning_rate": 3.671526781385446e-06, - "loss": 0.3555, - "mean_token_accuracy": 0.866078944504261, - "num_tokens": 184269455.0, - "step": 153160 - }, - { - "entropy": 1.9680861115455628, - "epoch": 0.47481359838618553, - "grad_norm": 8.033863067626953, - "learning_rate": 3.6714069272795433e-06, - "loss": 0.5163, - "mean_token_accuracy": 0.8408958032727242, - "num_tokens": 184280201.0, - "step": 153170 - }, - { - "entropy": 1.8380594596266746, - "epoch": 0.47484459751123526, - "grad_norm": 6.594268321990967, - "learning_rate": 3.6712870849105025e-06, - "loss": 0.4233, - "mean_token_accuracy": 0.8671738147735596, - "num_tokens": 184292398.0, - "step": 153180 - }, - { - "entropy": 1.9187800362706184, - "epoch": 0.4748755966362849, - "grad_norm": 9.44068431854248, - "learning_rate": 3.67116725427641e-06, - "loss": 0.4775, - "mean_token_accuracy": 0.8464760422706604, - "num_tokens": 184304484.0, - "step": 153190 - }, - { - "entropy": 1.8541857436299325, - "epoch": 0.47490659576133465, - "grad_norm": 8.135321617126465, - "learning_rate": 3.67104743537535e-06, - "loss": 0.4343, - "mean_token_accuracy": 0.853699392080307, - "num_tokens": 184316960.0, - "step": 153200 - }, - { - "entropy": 1.8801586970686912, - "epoch": 0.4749375948863843, - "grad_norm": 9.78518009185791, - "learning_rate": 3.6709276282054077e-06, - "loss": 0.5018, - "mean_token_accuracy": 0.8420796692371368, - "num_tokens": 184328693.0, - "step": 153210 - }, - { - "entropy": 1.8984438449144363, - "epoch": 0.47496859401143404, - "grad_norm": 8.019004821777344, - "learning_rate": 3.6708078327646697e-06, - "loss": 0.432, - "mean_token_accuracy": 0.8611553832888603, - "num_tokens": 184340353.0, - "step": 153220 - }, - { - "entropy": 1.9146203354001046, - "epoch": 0.4749995931364837, - "grad_norm": 9.26950454711914, - "learning_rate": 3.6706880490512205e-06, - "loss": 0.469, - "mean_token_accuracy": 0.8470090657472611, - "num_tokens": 184351989.0, - "step": 153230 - }, - { - "entropy": 1.8650276467204094, - "epoch": 0.47503059226153344, - "grad_norm": 3.9464805126190186, - "learning_rate": 3.670568277063149e-06, - "loss": 0.4212, - "mean_token_accuracy": 0.8640954300761223, - "num_tokens": 184364008.0, - "step": 153240 - }, - { - "entropy": 1.8467861473560334, - "epoch": 0.4750615913865831, - "grad_norm": 8.731751441955566, - "learning_rate": 3.6704485167985414e-06, - "loss": 0.4382, - "mean_token_accuracy": 0.855233508348465, - "num_tokens": 184375127.0, - "step": 153250 - }, - { - "entropy": 1.7884536266326905, - "epoch": 0.47509259051163283, - "grad_norm": 7.820191383361816, - "learning_rate": 3.6703287682554855e-06, - "loss": 0.3785, - "mean_token_accuracy": 0.8606995210051537, - "num_tokens": 184388469.0, - "step": 153260 - }, - { - "entropy": 1.894975845515728, - "epoch": 0.4751235896366825, - "grad_norm": 8.473472595214844, - "learning_rate": 3.670209031432069e-06, - "loss": 0.4907, - "mean_token_accuracy": 0.8551534727215767, - "num_tokens": 184399686.0, - "step": 153270 - }, - { - "entropy": 1.8992106392979622, - "epoch": 0.4751545887617322, - "grad_norm": 7.339175224304199, - "learning_rate": 3.6700893063263798e-06, - "loss": 0.4458, - "mean_token_accuracy": 0.8561721444129944, - "num_tokens": 184410820.0, - "step": 153280 - }, - { - "entropy": 1.898959006369114, - "epoch": 0.4751855878867819, - "grad_norm": 3.8873322010040283, - "learning_rate": 3.669969592936509e-06, - "loss": 0.4884, - "mean_token_accuracy": 0.8328580185770988, - "num_tokens": 184422656.0, - "step": 153290 - }, - { - "entropy": 1.8946363300085067, - "epoch": 0.4752165870118316, - "grad_norm": 7.692179203033447, - "learning_rate": 3.669849891260544e-06, - "loss": 0.4482, - "mean_token_accuracy": 0.8575319543480873, - "num_tokens": 184433675.0, - "step": 153300 - }, - { - "entropy": 1.80959662348032, - "epoch": 0.4752475861368813, - "grad_norm": 8.2183198928833, - "learning_rate": 3.669730201296575e-06, - "loss": 0.3958, - "mean_token_accuracy": 0.8613125756382942, - "num_tokens": 184446305.0, - "step": 153310 - }, - { - "entropy": 1.8346762344241143, - "epoch": 0.475278585261931, - "grad_norm": 7.708462715148926, - "learning_rate": 3.6696105230426927e-06, - "loss": 0.4407, - "mean_token_accuracy": 0.8413996055722237, - "num_tokens": 184458769.0, - "step": 153320 - }, - { - "entropy": 1.8374806106090547, - "epoch": 0.4753095843869807, - "grad_norm": 8.594371795654297, - "learning_rate": 3.6694908564969873e-06, - "loss": 0.4607, - "mean_token_accuracy": 0.8590363129973412, - "num_tokens": 184470713.0, - "step": 153330 - }, - { - "entropy": 1.8095426648855208, - "epoch": 0.47534058351203035, - "grad_norm": 3.7232837677001953, - "learning_rate": 3.6693712016575504e-06, - "loss": 0.3899, - "mean_token_accuracy": 0.8638847827911377, - "num_tokens": 184482692.0, - "step": 153340 - }, - { - "entropy": 1.8390261575579643, - "epoch": 0.47537158263708007, - "grad_norm": 6.988356113433838, - "learning_rate": 3.6692515585224724e-06, - "loss": 0.4216, - "mean_token_accuracy": 0.8590392619371414, - "num_tokens": 184494926.0, - "step": 153350 - }, - { - "entropy": 1.8419798463582993, - "epoch": 0.47540258176212974, - "grad_norm": 9.946001052856445, - "learning_rate": 3.669131927089847e-06, - "loss": 0.4158, - "mean_token_accuracy": 0.8536298364400864, - "num_tokens": 184506522.0, - "step": 153360 - }, - { - "entropy": 1.857440821826458, - "epoch": 0.47543358088717946, - "grad_norm": 9.011540412902832, - "learning_rate": 3.6690123073577653e-06, - "loss": 0.4895, - "mean_token_accuracy": 0.8468375623226165, - "num_tokens": 184519247.0, - "step": 153370 - }, - { - "entropy": 1.8762486830353737, - "epoch": 0.47546458001222913, - "grad_norm": 7.706223964691162, - "learning_rate": 3.66889269932432e-06, - "loss": 0.467, - "mean_token_accuracy": 0.8498484030365944, - "num_tokens": 184530480.0, - "step": 153380 - }, - { - "entropy": 1.8929049670696259, - "epoch": 0.47549557913727886, - "grad_norm": 9.091431617736816, - "learning_rate": 3.6687731029876057e-06, - "loss": 0.4464, - "mean_token_accuracy": 0.8557457372546196, - "num_tokens": 184541409.0, - "step": 153390 - }, - { - "entropy": 1.8962855875492095, - "epoch": 0.4755265782623285, - "grad_norm": 10.7317476272583, - "learning_rate": 3.6686535183457147e-06, - "loss": 0.5057, - "mean_token_accuracy": 0.8461247265338898, - "num_tokens": 184552652.0, - "step": 153400 - }, - { - "entropy": 1.8995665937662125, - "epoch": 0.47555757738737825, - "grad_norm": 8.745923042297363, - "learning_rate": 3.6685339453967417e-06, - "loss": 0.4825, - "mean_token_accuracy": 0.8513082057237625, - "num_tokens": 184563272.0, - "step": 153410 - }, - { - "entropy": 1.728972639143467, - "epoch": 0.4755885765124279, - "grad_norm": 3.664832353591919, - "learning_rate": 3.6684143841387817e-06, - "loss": 0.3383, - "mean_token_accuracy": 0.8756014302372932, - "num_tokens": 184577391.0, - "step": 153420 - }, - { - "entropy": 1.8072891741991044, - "epoch": 0.47561957563747764, - "grad_norm": 9.03156852722168, - "learning_rate": 3.6682948345699293e-06, - "loss": 0.3899, - "mean_token_accuracy": 0.8632027536630631, - "num_tokens": 184590084.0, - "step": 153430 - }, - { - "entropy": 1.8861136004328727, - "epoch": 0.4756505747625273, - "grad_norm": 7.806605815887451, - "learning_rate": 3.6681752966882795e-06, - "loss": 0.4435, - "mean_token_accuracy": 0.8579560875892639, - "num_tokens": 184601954.0, - "step": 153440 - }, - { - "entropy": 1.8768578171730042, - "epoch": 0.47568157388757704, - "grad_norm": 6.651251316070557, - "learning_rate": 3.668055770491929e-06, - "loss": 0.4154, - "mean_token_accuracy": 0.8663598403334618, - "num_tokens": 184613113.0, - "step": 153450 - }, - { - "entropy": 1.7853379517793655, - "epoch": 0.4757125730126267, - "grad_norm": 5.01438570022583, - "learning_rate": 3.667936255978974e-06, - "loss": 0.3818, - "mean_token_accuracy": 0.8630882278084755, - "num_tokens": 184626166.0, - "step": 153460 - }, - { - "entropy": 1.8284614101052283, - "epoch": 0.47574357213767643, - "grad_norm": 9.973559379577637, - "learning_rate": 3.6678167531475107e-06, - "loss": 0.4257, - "mean_token_accuracy": 0.8562781944870949, - "num_tokens": 184637586.0, - "step": 153470 - }, - { - "entropy": 1.7700392067432404, - "epoch": 0.4757745712627261, - "grad_norm": 4.605105876922607, - "learning_rate": 3.667697261995636e-06, - "loss": 0.3653, - "mean_token_accuracy": 0.8692534595727921, - "num_tokens": 184650692.0, - "step": 153480 - }, - { - "entropy": 1.9041405349969864, - "epoch": 0.4758055703877758, - "grad_norm": 8.431550025939941, - "learning_rate": 3.6675777825214486e-06, - "loss": 0.4631, - "mean_token_accuracy": 0.8535671159625053, - "num_tokens": 184662239.0, - "step": 153490 - }, - { - "entropy": 1.779083289206028, - "epoch": 0.4758365695128255, - "grad_norm": 4.4849748611450195, - "learning_rate": 3.667458314723047e-06, - "loss": 0.4269, - "mean_token_accuracy": 0.853766855597496, - "num_tokens": 184676796.0, - "step": 153500 - }, - { - "entropy": 1.885641473531723, - "epoch": 0.4758675686378752, - "grad_norm": 3.370105266571045, - "learning_rate": 3.667338858598527e-06, - "loss": 0.4198, - "mean_token_accuracy": 0.8566616162657738, - "num_tokens": 184688573.0, - "step": 153510 - }, - { - "entropy": 1.9167728781700135, - "epoch": 0.4758985677629249, - "grad_norm": 14.460858345031738, - "learning_rate": 3.667219414145991e-06, - "loss": 0.4755, - "mean_token_accuracy": 0.8444859847426415, - "num_tokens": 184699665.0, - "step": 153520 - }, - { - "entropy": 1.807128955423832, - "epoch": 0.4759295668879746, - "grad_norm": 3.8784046173095703, - "learning_rate": 3.6670999813635354e-06, - "loss": 0.3995, - "mean_token_accuracy": 0.8568266987800598, - "num_tokens": 184712308.0, - "step": 153530 - }, - { - "entropy": 1.8136287018656732, - "epoch": 0.4759605660130243, - "grad_norm": 9.484222412109375, - "learning_rate": 3.666980560249262e-06, - "loss": 0.3689, - "mean_token_accuracy": 0.8740163326263428, - "num_tokens": 184724676.0, - "step": 153540 - }, - { - "entropy": 1.8556558206677436, - "epoch": 0.475991565138074, - "grad_norm": 7.489902496337891, - "learning_rate": 3.66686115080127e-06, - "loss": 0.5065, - "mean_token_accuracy": 0.8446243152022361, - "num_tokens": 184737203.0, - "step": 153550 - }, - { - "entropy": 1.9191300675272942, - "epoch": 0.47602256426312367, - "grad_norm": 8.194887161254883, - "learning_rate": 3.6667417530176603e-06, - "loss": 0.475, - "mean_token_accuracy": 0.8443259477615357, - "num_tokens": 184748452.0, - "step": 153560 - }, - { - "entropy": 1.795873185992241, - "epoch": 0.4760535633881734, - "grad_norm": 9.132946968078613, - "learning_rate": 3.6666223668965338e-06, - "loss": 0.3911, - "mean_token_accuracy": 0.8649899259209632, - "num_tokens": 184760930.0, - "step": 153570 - }, - { - "entropy": 1.9766324907541275, - "epoch": 0.47608456251322306, - "grad_norm": 8.085390090942383, - "learning_rate": 3.6665029924359922e-06, - "loss": 0.5102, - "mean_token_accuracy": 0.8393667578697205, - "num_tokens": 184771872.0, - "step": 153580 - }, - { - "entropy": 1.7679221838712693, - "epoch": 0.47611556163827273, - "grad_norm": 4.475643634796143, - "learning_rate": 3.6663836296341384e-06, - "loss": 0.3281, - "mean_token_accuracy": 0.869743998348713, - "num_tokens": 184785921.0, - "step": 153590 - }, - { - "entropy": 1.840344101190567, - "epoch": 0.47614656076332246, - "grad_norm": 4.371096134185791, - "learning_rate": 3.6662642784890723e-06, - "loss": 0.415, - "mean_token_accuracy": 0.8603167220950126, - "num_tokens": 184797921.0, - "step": 153600 - }, - { - "entropy": 1.7917725771665574, - "epoch": 0.4761775598883721, - "grad_norm": 4.4459686279296875, - "learning_rate": 3.6661449389988997e-06, - "loss": 0.4022, - "mean_token_accuracy": 0.8593000710010529, - "num_tokens": 184810881.0, - "step": 153610 - }, - { - "entropy": 1.8440128073096276, - "epoch": 0.47620855901342185, - "grad_norm": 9.678831100463867, - "learning_rate": 3.6660256111617214e-06, - "loss": 0.4642, - "mean_token_accuracy": 0.8468298763036728, - "num_tokens": 184823216.0, - "step": 153620 - }, - { - "entropy": 1.9154121845960617, - "epoch": 0.4762395581384715, - "grad_norm": 8.692216873168945, - "learning_rate": 3.6659062949756423e-06, - "loss": 0.4485, - "mean_token_accuracy": 0.8561125859618187, - "num_tokens": 184834303.0, - "step": 153630 - }, - { - "entropy": 1.7718016371130942, - "epoch": 0.47627055726352124, - "grad_norm": 7.937458038330078, - "learning_rate": 3.6657869904387666e-06, - "loss": 0.3948, - "mean_token_accuracy": 0.8643491208553314, - "num_tokens": 184846836.0, - "step": 153640 - }, - { - "entropy": 1.7790207624435426, - "epoch": 0.4763015563885709, - "grad_norm": 7.390011310577393, - "learning_rate": 3.6656676975491983e-06, - "loss": 0.433, - "mean_token_accuracy": 0.8582101926207543, - "num_tokens": 184861251.0, - "step": 153650 - }, - { - "entropy": 1.8293533489108085, - "epoch": 0.47633255551362064, - "grad_norm": 8.291117668151855, - "learning_rate": 3.6655484163050426e-06, - "loss": 0.4276, - "mean_token_accuracy": 0.8656255662441253, - "num_tokens": 184873707.0, - "step": 153660 - }, - { - "entropy": 1.803611083328724, - "epoch": 0.4763635546386703, - "grad_norm": 3.697669267654419, - "learning_rate": 3.6654291467044046e-06, - "loss": 0.3816, - "mean_token_accuracy": 0.8656400561332702, - "num_tokens": 184886963.0, - "step": 153670 - }, - { - "entropy": 1.8712692141532898, - "epoch": 0.47639455376372003, - "grad_norm": 8.776789665222168, - "learning_rate": 3.6653098887453913e-06, - "loss": 0.4827, - "mean_token_accuracy": 0.8448523849248886, - "num_tokens": 184898369.0, - "step": 153680 - }, - { - "entropy": 1.781012015044689, - "epoch": 0.4764255528887697, - "grad_norm": 7.237161636352539, - "learning_rate": 3.665190642426108e-06, - "loss": 0.4004, - "mean_token_accuracy": 0.8646363571286202, - "num_tokens": 184910887.0, - "step": 153690 - }, - { - "entropy": 1.7416659876704217, - "epoch": 0.4764565520138194, - "grad_norm": 8.15397834777832, - "learning_rate": 3.6650714077446614e-06, - "loss": 0.3926, - "mean_token_accuracy": 0.8684957414865494, - "num_tokens": 184924099.0, - "step": 153700 - }, - { - "entropy": 1.8524023115634918, - "epoch": 0.4764875511388691, - "grad_norm": 8.428122520446777, - "learning_rate": 3.664952184699158e-06, - "loss": 0.4665, - "mean_token_accuracy": 0.84611546844244, - "num_tokens": 184936005.0, - "step": 153710 - }, - { - "entropy": 1.8617612093687057, - "epoch": 0.4765185502639188, - "grad_norm": 7.877058982849121, - "learning_rate": 3.664832973287707e-06, - "loss": 0.4236, - "mean_token_accuracy": 0.8690141126513481, - "num_tokens": 184948169.0, - "step": 153720 - }, - { - "entropy": 1.74190554022789, - "epoch": 0.4765495493889685, - "grad_norm": 3.756366491317749, - "learning_rate": 3.6647137735084156e-06, - "loss": 0.3976, - "mean_token_accuracy": 0.8677965849637985, - "num_tokens": 184961545.0, - "step": 153730 - }, - { - "entropy": 1.841639220714569, - "epoch": 0.4765805485140182, - "grad_norm": 6.832626819610596, - "learning_rate": 3.6645945853593916e-06, - "loss": 0.4091, - "mean_token_accuracy": 0.8673912405967712, - "num_tokens": 184973489.0, - "step": 153740 - }, - { - "entropy": 1.9268560737371445, - "epoch": 0.4766115476390679, - "grad_norm": 7.91566801071167, - "learning_rate": 3.6644754088387447e-06, - "loss": 0.5311, - "mean_token_accuracy": 0.8405150607228279, - "num_tokens": 184984295.0, - "step": 153750 - }, - { - "entropy": 1.8099448829889297, - "epoch": 0.4766425467641176, - "grad_norm": 7.360180854797363, - "learning_rate": 3.664356243944584e-06, - "loss": 0.4085, - "mean_token_accuracy": 0.8619012326002121, - "num_tokens": 184997460.0, - "step": 153760 - }, - { - "entropy": 1.9135849446058273, - "epoch": 0.47667354588916727, - "grad_norm": 7.397768020629883, - "learning_rate": 3.6642370906750193e-06, - "loss": 0.4747, - "mean_token_accuracy": 0.8504505261778832, - "num_tokens": 185007719.0, - "step": 153770 - }, - { - "entropy": 1.9261243373155594, - "epoch": 0.476704545014217, - "grad_norm": 6.789693832397461, - "learning_rate": 3.6641179490281596e-06, - "loss": 0.4927, - "mean_token_accuracy": 0.8460416331887245, - "num_tokens": 185018707.0, - "step": 153780 - }, - { - "entropy": 1.8728522315621376, - "epoch": 0.47673554413926666, - "grad_norm": 3.7547261714935303, - "learning_rate": 3.6639988190021176e-06, - "loss": 0.4731, - "mean_token_accuracy": 0.8428987547755241, - "num_tokens": 185030141.0, - "step": 153790 - }, - { - "entropy": 1.8588514044880866, - "epoch": 0.4767665432643164, - "grad_norm": 3.039729356765747, - "learning_rate": 3.6638797005950024e-06, - "loss": 0.406, - "mean_token_accuracy": 0.858186562359333, - "num_tokens": 185042499.0, - "step": 153800 - }, - { - "entropy": 1.8474113151431084, - "epoch": 0.47679754238936606, - "grad_norm": 8.664191246032715, - "learning_rate": 3.6637605938049266e-06, - "loss": 0.4419, - "mean_token_accuracy": 0.8646103963255882, - "num_tokens": 185054646.0, - "step": 153810 - }, - { - "entropy": 1.7442696556448936, - "epoch": 0.4768285415144158, - "grad_norm": 8.061847686767578, - "learning_rate": 3.663641498630001e-06, - "loss": 0.3783, - "mean_token_accuracy": 0.8688895791769028, - "num_tokens": 185067799.0, - "step": 153820 - }, - { - "entropy": 1.9126453652977944, - "epoch": 0.47685954063946545, - "grad_norm": 7.419454097747803, - "learning_rate": 3.663522415068339e-06, - "loss": 0.5575, - "mean_token_accuracy": 0.8450031682848931, - "num_tokens": 185079637.0, - "step": 153830 - }, - { - "entropy": 1.841584986448288, - "epoch": 0.4768905397645151, - "grad_norm": 7.337125778198242, - "learning_rate": 3.6634033431180534e-06, - "loss": 0.4272, - "mean_token_accuracy": 0.8534199371933937, - "num_tokens": 185091402.0, - "step": 153840 - }, - { - "entropy": 1.8615821033716202, - "epoch": 0.47692153888956484, - "grad_norm": 8.566046714782715, - "learning_rate": 3.663284282777256e-06, - "loss": 0.4612, - "mean_token_accuracy": 0.8558877721428871, - "num_tokens": 185102679.0, - "step": 153850 - }, - { - "entropy": 1.9264022588729859, - "epoch": 0.4769525380146145, - "grad_norm": 9.237041473388672, - "learning_rate": 3.6631652340440625e-06, - "loss": 0.4899, - "mean_token_accuracy": 0.8412112295627594, - "num_tokens": 185113641.0, - "step": 153860 - }, - { - "entropy": 1.8670520842075349, - "epoch": 0.47698353713966424, - "grad_norm": 8.40865707397461, - "learning_rate": 3.6630461969165847e-06, - "loss": 0.4872, - "mean_token_accuracy": 0.8481194913387299, - "num_tokens": 185125200.0, - "step": 153870 - }, - { - "entropy": 1.9037254452705383, - "epoch": 0.4770145362647139, - "grad_norm": 8.676751136779785, - "learning_rate": 3.662927171392938e-06, - "loss": 0.5002, - "mean_token_accuracy": 0.8378460854291916, - "num_tokens": 185136508.0, - "step": 153880 - }, - { - "entropy": 1.8699619323015213, - "epoch": 0.47704553538976363, - "grad_norm": 4.6836323738098145, - "learning_rate": 3.6628081574712363e-06, - "loss": 0.3981, - "mean_token_accuracy": 0.8679074257612228, - "num_tokens": 185148833.0, - "step": 153890 - }, - { - "entropy": 1.8572426453232764, - "epoch": 0.4770765345148133, - "grad_norm": 9.239683151245117, - "learning_rate": 3.662689155149597e-06, - "loss": 0.444, - "mean_token_accuracy": 0.8490281060338021, - "num_tokens": 185161322.0, - "step": 153900 - }, - { - "entropy": 1.8752911046147347, - "epoch": 0.477107533639863, - "grad_norm": 8.707393646240234, - "learning_rate": 3.662570164426135e-06, - "loss": 0.4612, - "mean_token_accuracy": 0.8525170043110848, - "num_tokens": 185173769.0, - "step": 153910 - }, - { - "entropy": 1.8172207593917846, - "epoch": 0.4771385327649127, - "grad_norm": 8.489155769348145, - "learning_rate": 3.6624511852989657e-06, - "loss": 0.4304, - "mean_token_accuracy": 0.858148242533207, - "num_tokens": 185187124.0, - "step": 153920 - }, - { - "entropy": 1.8730401009321214, - "epoch": 0.4771695318899624, - "grad_norm": 9.239094734191895, - "learning_rate": 3.6623322177662056e-06, - "loss": 0.4503, - "mean_token_accuracy": 0.8503327935934066, - "num_tokens": 185199593.0, - "step": 153930 - }, - { - "entropy": 1.8695446357131005, - "epoch": 0.4772005310150121, - "grad_norm": 8.47158432006836, - "learning_rate": 3.662213261825973e-06, - "loss": 0.4366, - "mean_token_accuracy": 0.8578305006027221, - "num_tokens": 185212365.0, - "step": 153940 - }, - { - "entropy": 1.8985270693898202, - "epoch": 0.4772315301400618, - "grad_norm": 7.378330230712891, - "learning_rate": 3.6620943174763845e-06, - "loss": 0.483, - "mean_token_accuracy": 0.8512399345636368, - "num_tokens": 185223754.0, - "step": 153950 - }, - { - "entropy": 1.8317578330636024, - "epoch": 0.4772625292651115, - "grad_norm": 8.441747665405273, - "learning_rate": 3.661975384715558e-06, - "loss": 0.4225, - "mean_token_accuracy": 0.8525681480765342, - "num_tokens": 185237068.0, - "step": 153960 - }, - { - "entropy": 1.92967329621315, - "epoch": 0.4772935283901612, - "grad_norm": 7.202940940856934, - "learning_rate": 3.6618564635416117e-06, - "loss": 0.4644, - "mean_token_accuracy": 0.8535278737545013, - "num_tokens": 185247787.0, - "step": 153970 - }, - { - "entropy": 1.8548561289906502, - "epoch": 0.4773245275152109, - "grad_norm": 8.233925819396973, - "learning_rate": 3.661737553952664e-06, - "loss": 0.4522, - "mean_token_accuracy": 0.8560821250081062, - "num_tokens": 185260706.0, - "step": 153980 - }, - { - "entropy": 1.928127257525921, - "epoch": 0.4773555266402606, - "grad_norm": 8.103163719177246, - "learning_rate": 3.661618655946835e-06, - "loss": 0.4944, - "mean_token_accuracy": 0.8430070072412491, - "num_tokens": 185272523.0, - "step": 153990 - }, - { - "entropy": 1.8638317108154296, - "epoch": 0.47738652576531027, - "grad_norm": 7.030045032501221, - "learning_rate": 3.6614997695222444e-06, - "loss": 0.3983, - "mean_token_accuracy": 0.8599402293562889, - "num_tokens": 185284942.0, - "step": 154000 - }, - { - "entropy": 1.8330623269081117, - "epoch": 0.47741752489036, - "grad_norm": 7.989560127258301, - "learning_rate": 3.6613808946770103e-06, - "loss": 0.3899, - "mean_token_accuracy": 0.8641909688711167, - "num_tokens": 185297882.0, - "step": 154010 - }, - { - "entropy": 1.8085048258304597, - "epoch": 0.47744852401540966, - "grad_norm": 7.296682834625244, - "learning_rate": 3.6612620314092554e-06, - "loss": 0.4384, - "mean_token_accuracy": 0.8560028299689293, - "num_tokens": 185310312.0, - "step": 154020 - }, - { - "entropy": 1.9709538519382477, - "epoch": 0.4774795231404594, - "grad_norm": 7.870602130889893, - "learning_rate": 3.6611431797170994e-06, - "loss": 0.4859, - "mean_token_accuracy": 0.8472713023424149, - "num_tokens": 185321671.0, - "step": 154030 - }, - { - "entropy": 1.9173092991113663, - "epoch": 0.47751052226550905, - "grad_norm": 9.225147247314453, - "learning_rate": 3.661024339598664e-06, - "loss": 0.4928, - "mean_token_accuracy": 0.8469387844204903, - "num_tokens": 185333128.0, - "step": 154040 - }, - { - "entropy": 1.9338498145341874, - "epoch": 0.4775415213905588, - "grad_norm": 8.254136085510254, - "learning_rate": 3.6609055110520702e-06, - "loss": 0.4891, - "mean_token_accuracy": 0.8532148212194443, - "num_tokens": 185344879.0, - "step": 154050 - }, - { - "entropy": 1.7882016241550445, - "epoch": 0.47757252051560845, - "grad_norm": 2.6473770141601562, - "learning_rate": 3.660786694075441e-06, - "loss": 0.3935, - "mean_token_accuracy": 0.8609866589307785, - "num_tokens": 185357701.0, - "step": 154060 - }, - { - "entropy": 1.87758369743824, - "epoch": 0.47760351964065817, - "grad_norm": 8.510712623596191, - "learning_rate": 3.660667888666899e-06, - "loss": 0.4199, - "mean_token_accuracy": 0.8468722432851792, - "num_tokens": 185369849.0, - "step": 154070 - }, - { - "entropy": 1.744155490398407, - "epoch": 0.47763451876570784, - "grad_norm": 9.347501754760742, - "learning_rate": 3.6605490948245658e-06, - "loss": 0.3608, - "mean_token_accuracy": 0.8634230136871338, - "num_tokens": 185383168.0, - "step": 154080 - }, - { - "entropy": 1.838800160586834, - "epoch": 0.4776655178907575, - "grad_norm": 4.383167743682861, - "learning_rate": 3.6604303125465666e-06, - "loss": 0.4114, - "mean_token_accuracy": 0.8602541759610176, - "num_tokens": 185395934.0, - "step": 154090 - }, - { - "entropy": 1.8426039129495622, - "epoch": 0.47769651701580723, - "grad_norm": 9.018194198608398, - "learning_rate": 3.6603115418310246e-06, - "loss": 0.4587, - "mean_token_accuracy": 0.855328157544136, - "num_tokens": 185408078.0, - "step": 154100 - }, - { - "entropy": 1.811622078716755, - "epoch": 0.4777275161408569, - "grad_norm": 4.057611465454102, - "learning_rate": 3.6601927826760636e-06, - "loss": 0.4392, - "mean_token_accuracy": 0.8523658573627472, - "num_tokens": 185421402.0, - "step": 154110 - }, - { - "entropy": 1.7786233231425286, - "epoch": 0.4777585152659066, - "grad_norm": 3.694188356399536, - "learning_rate": 3.660074035079809e-06, - "loss": 0.364, - "mean_token_accuracy": 0.8725257933139801, - "num_tokens": 185434169.0, - "step": 154120 - }, - { - "entropy": 1.873915046453476, - "epoch": 0.4777895143909563, - "grad_norm": 9.391448020935059, - "learning_rate": 3.659955299040385e-06, - "loss": 0.4731, - "mean_token_accuracy": 0.8509207114577293, - "num_tokens": 185445738.0, - "step": 154130 - }, - { - "entropy": 1.930639487504959, - "epoch": 0.477820513516006, - "grad_norm": 10.155179977416992, - "learning_rate": 3.6598365745559187e-06, - "loss": 0.5343, - "mean_token_accuracy": 0.8412887364625931, - "num_tokens": 185456538.0, - "step": 154140 - }, - { - "entropy": 1.9113746494054795, - "epoch": 0.4778515126410557, - "grad_norm": 8.49729061126709, - "learning_rate": 3.6597178616245345e-06, - "loss": 0.4264, - "mean_token_accuracy": 0.8552822053432465, - "num_tokens": 185468039.0, - "step": 154150 - }, - { - "entropy": 1.9362047135829925, - "epoch": 0.4778825117661054, - "grad_norm": 7.92914342880249, - "learning_rate": 3.6595991602443593e-06, - "loss": 0.477, - "mean_token_accuracy": 0.847518865764141, - "num_tokens": 185478541.0, - "step": 154160 - }, - { - "entropy": 1.8837508246302606, - "epoch": 0.4779135108911551, - "grad_norm": 8.445967674255371, - "learning_rate": 3.6594804704135206e-06, - "loss": 0.4509, - "mean_token_accuracy": 0.8552197054028511, - "num_tokens": 185489982.0, - "step": 154170 - }, - { - "entropy": 1.8416315570473671, - "epoch": 0.4779445100162048, - "grad_norm": 8.454257011413574, - "learning_rate": 3.659361792130145e-06, - "loss": 0.4256, - "mean_token_accuracy": 0.859393647313118, - "num_tokens": 185502258.0, - "step": 154180 - }, - { - "entropy": 1.9041074529290198, - "epoch": 0.4779755091412545, - "grad_norm": 3.8692028522491455, - "learning_rate": 3.6592431253923597e-06, - "loss": 0.4289, - "mean_token_accuracy": 0.8625993952155113, - "num_tokens": 185513826.0, - "step": 154190 - }, - { - "entropy": 1.8795275837182999, - "epoch": 0.4780065082663042, - "grad_norm": 8.003114700317383, - "learning_rate": 3.659124470198294e-06, - "loss": 0.4494, - "mean_token_accuracy": 0.8551646754145622, - "num_tokens": 185525636.0, - "step": 154200 - }, - { - "entropy": 1.9021770521998405, - "epoch": 0.47803750739135387, - "grad_norm": 9.979130744934082, - "learning_rate": 3.659005826546076e-06, - "loss": 0.4726, - "mean_token_accuracy": 0.8435526043176651, - "num_tokens": 185537405.0, - "step": 154210 - }, - { - "entropy": 1.9014919973909854, - "epoch": 0.4780685065164036, - "grad_norm": 8.349839210510254, - "learning_rate": 3.6588871944338343e-06, - "loss": 0.4337, - "mean_token_accuracy": 0.8566196665167809, - "num_tokens": 185549006.0, - "step": 154220 - }, - { - "entropy": 1.928890497982502, - "epoch": 0.47809950564145326, - "grad_norm": 8.71741008758545, - "learning_rate": 3.6587685738596985e-06, - "loss": 0.4939, - "mean_token_accuracy": 0.8444687411189079, - "num_tokens": 185560063.0, - "step": 154230 - }, - { - "entropy": 1.8423243664205073, - "epoch": 0.478130504766503, - "grad_norm": 8.268646240234375, - "learning_rate": 3.6586499648217977e-06, - "loss": 0.4339, - "mean_token_accuracy": 0.8484700009226799, - "num_tokens": 185573162.0, - "step": 154240 - }, - { - "entropy": 1.9101153433322906, - "epoch": 0.47816150389155265, - "grad_norm": 10.118322372436523, - "learning_rate": 3.658531367318264e-06, - "loss": 0.4681, - "mean_token_accuracy": 0.8470887467265129, - "num_tokens": 185584820.0, - "step": 154250 - }, - { - "entropy": 1.9050511002540589, - "epoch": 0.4781925030166024, - "grad_norm": 7.960333824157715, - "learning_rate": 3.658412781347225e-06, - "loss": 0.5336, - "mean_token_accuracy": 0.8414354234933853, - "num_tokens": 185596298.0, - "step": 154260 - }, - { - "entropy": 1.8809094280004501, - "epoch": 0.47822350214165205, - "grad_norm": 4.4286885261535645, - "learning_rate": 3.6582942069068154e-06, - "loss": 0.4722, - "mean_token_accuracy": 0.8507549971342087, - "num_tokens": 185607751.0, - "step": 154270 - }, - { - "entropy": 1.7869101524353028, - "epoch": 0.47825450126670177, - "grad_norm": 7.874044418334961, - "learning_rate": 3.6581756439951644e-06, - "loss": 0.3641, - "mean_token_accuracy": 0.862111645936966, - "num_tokens": 185620912.0, - "step": 154280 - }, - { - "entropy": 1.9244977489113808, - "epoch": 0.47828550039175144, - "grad_norm": 9.23994255065918, - "learning_rate": 3.658057092610404e-06, - "loss": 0.5345, - "mean_token_accuracy": 0.8415355905890465, - "num_tokens": 185632155.0, - "step": 154290 - }, - { - "entropy": 1.9335034400224687, - "epoch": 0.47831649951680116, - "grad_norm": 7.755003452301025, - "learning_rate": 3.657938552750668e-06, - "loss": 0.5167, - "mean_token_accuracy": 0.8392974704504013, - "num_tokens": 185642986.0, - "step": 154300 - }, - { - "entropy": 1.9139926508069038, - "epoch": 0.47834749864185083, - "grad_norm": 8.397530555725098, - "learning_rate": 3.657820024414088e-06, - "loss": 0.5023, - "mean_token_accuracy": 0.8501160070300102, - "num_tokens": 185654713.0, - "step": 154310 - }, - { - "entropy": 1.8200761735439301, - "epoch": 0.47837849776690056, - "grad_norm": 8.106402397155762, - "learning_rate": 3.6577015075987958e-06, - "loss": 0.42, - "mean_token_accuracy": 0.8671575516462326, - "num_tokens": 185667339.0, - "step": 154320 - }, - { - "entropy": 1.9082677319645882, - "epoch": 0.4784094968919502, - "grad_norm": 3.1612582206726074, - "learning_rate": 3.6575830023029285e-06, - "loss": 0.4595, - "mean_token_accuracy": 0.8523703336715698, - "num_tokens": 185678791.0, - "step": 154330 - }, - { - "entropy": 1.8255998715758324, - "epoch": 0.4784404960169999, - "grad_norm": 7.894652366638184, - "learning_rate": 3.6574645085246168e-06, - "loss": 0.3864, - "mean_token_accuracy": 0.8676185742020607, - "num_tokens": 185691476.0, - "step": 154340 - }, - { - "entropy": 1.894423645734787, - "epoch": 0.4784714951420496, - "grad_norm": 9.043898582458496, - "learning_rate": 3.6573460262619975e-06, - "loss": 0.4669, - "mean_token_accuracy": 0.8409060701727867, - "num_tokens": 185703246.0, - "step": 154350 - }, - { - "entropy": 1.9331441923975945, - "epoch": 0.4785024942670993, - "grad_norm": 9.189416885375977, - "learning_rate": 3.657227555513204e-06, - "loss": 0.4848, - "mean_token_accuracy": 0.8522034257650375, - "num_tokens": 185714266.0, - "step": 154360 - }, - { - "entropy": 1.8812373384833336, - "epoch": 0.478533493392149, - "grad_norm": 7.222550392150879, - "learning_rate": 3.657109096276372e-06, - "loss": 0.4523, - "mean_token_accuracy": 0.8448280677199363, - "num_tokens": 185726369.0, - "step": 154370 - }, - { - "entropy": 1.9059947580099106, - "epoch": 0.4785644925171987, - "grad_norm": 6.528245449066162, - "learning_rate": 3.6569906485496383e-06, - "loss": 0.4367, - "mean_token_accuracy": 0.8697101891040802, - "num_tokens": 185737440.0, - "step": 154380 - }, - { - "entropy": 1.9224574849009515, - "epoch": 0.4785954916422484, - "grad_norm": 10.537102699279785, - "learning_rate": 3.656872212331138e-06, - "loss": 0.4862, - "mean_token_accuracy": 0.838177040219307, - "num_tokens": 185748538.0, - "step": 154390 - }, - { - "entropy": 1.894202496111393, - "epoch": 0.4786264907672981, - "grad_norm": 7.70367431640625, - "learning_rate": 3.6567537876190075e-06, - "loss": 0.4494, - "mean_token_accuracy": 0.8510572746396065, - "num_tokens": 185759523.0, - "step": 154400 - }, - { - "entropy": 1.8672723740339279, - "epoch": 0.4786574898923478, - "grad_norm": 6.750048637390137, - "learning_rate": 3.6566353744113836e-06, - "loss": 0.4616, - "mean_token_accuracy": 0.857047687470913, - "num_tokens": 185772469.0, - "step": 154410 - }, - { - "entropy": 1.915586268901825, - "epoch": 0.47868848901739747, - "grad_norm": 8.35063648223877, - "learning_rate": 3.6565169727064054e-06, - "loss": 0.4429, - "mean_token_accuracy": 0.8574128776788712, - "num_tokens": 185784266.0, - "step": 154420 - }, - { - "entropy": 1.9031167924404144, - "epoch": 0.4787194881424472, - "grad_norm": 8.222635269165039, - "learning_rate": 3.656398582502209e-06, - "loss": 0.4699, - "mean_token_accuracy": 0.8537877663969994, - "num_tokens": 185795413.0, - "step": 154430 - }, - { - "entropy": 1.840549720823765, - "epoch": 0.47875048726749686, - "grad_norm": 7.137433052062988, - "learning_rate": 3.656280203796933e-06, - "loss": 0.4291, - "mean_token_accuracy": 0.8579451397061348, - "num_tokens": 185807798.0, - "step": 154440 - }, - { - "entropy": 1.861526158452034, - "epoch": 0.4787814863925466, - "grad_norm": 4.58693265914917, - "learning_rate": 3.656161836588717e-06, - "loss": 0.4406, - "mean_token_accuracy": 0.8540133357048034, - "num_tokens": 185820505.0, - "step": 154450 - }, - { - "entropy": 1.900732010602951, - "epoch": 0.47881248551759625, - "grad_norm": 3.4613146781921387, - "learning_rate": 3.6560434808756995e-06, - "loss": 0.4382, - "mean_token_accuracy": 0.8573839977383614, - "num_tokens": 185832232.0, - "step": 154460 - }, - { - "entropy": 1.8618409425020217, - "epoch": 0.478843484642646, - "grad_norm": 6.595524787902832, - "learning_rate": 3.6559251366560195e-06, - "loss": 0.4251, - "mean_token_accuracy": 0.8640586957335472, - "num_tokens": 185843445.0, - "step": 154470 - }, - { - "entropy": 1.9100439898669719, - "epoch": 0.47887448376769565, - "grad_norm": 9.08141040802002, - "learning_rate": 3.655806803927818e-06, - "loss": 0.4413, - "mean_token_accuracy": 0.8536060154438019, - "num_tokens": 185855042.0, - "step": 154480 - }, - { - "entropy": 1.8713151529431342, - "epoch": 0.47890548289274537, - "grad_norm": 3.9721601009368896, - "learning_rate": 3.655688482689234e-06, - "loss": 0.4301, - "mean_token_accuracy": 0.8604184344410897, - "num_tokens": 185866727.0, - "step": 154490 - }, - { - "entropy": 1.853001520037651, - "epoch": 0.47893648201779504, - "grad_norm": 4.289365768432617, - "learning_rate": 3.6555701729384096e-06, - "loss": 0.4227, - "mean_token_accuracy": 0.8596016809344291, - "num_tokens": 185878802.0, - "step": 154500 - }, - { - "entropy": 1.9377120748162269, - "epoch": 0.47896748114284476, - "grad_norm": 8.828749656677246, - "learning_rate": 3.6554518746734857e-06, - "loss": 0.4896, - "mean_token_accuracy": 0.8434145227074623, - "num_tokens": 185889992.0, - "step": 154510 - }, - { - "entropy": 1.8815474480390548, - "epoch": 0.47899848026789443, - "grad_norm": 7.912415027618408, - "learning_rate": 3.6553335878926037e-06, - "loss": 0.4563, - "mean_token_accuracy": 0.8570791929960251, - "num_tokens": 185901345.0, - "step": 154520 - }, - { - "entropy": 1.8100280150771142, - "epoch": 0.47902947939294416, - "grad_norm": 3.6402947902679443, - "learning_rate": 3.655215312593906e-06, - "loss": 0.4176, - "mean_token_accuracy": 0.8516449317336082, - "num_tokens": 185914045.0, - "step": 154530 - }, - { - "entropy": 1.905594563484192, - "epoch": 0.4790604785179938, - "grad_norm": 8.659533500671387, - "learning_rate": 3.6550970487755343e-06, - "loss": 0.4717, - "mean_token_accuracy": 0.8463260412216187, - "num_tokens": 185925597.0, - "step": 154540 - }, - { - "entropy": 1.9034359157085419, - "epoch": 0.47909147764304355, - "grad_norm": 7.604994297027588, - "learning_rate": 3.6549787964356328e-06, - "loss": 0.4815, - "mean_token_accuracy": 0.8514864310622215, - "num_tokens": 185936927.0, - "step": 154550 - }, - { - "entropy": 1.8217870756983756, - "epoch": 0.4791224767680932, - "grad_norm": 8.547018051147461, - "learning_rate": 3.6548605555723437e-06, - "loss": 0.4123, - "mean_token_accuracy": 0.8532523363828659, - "num_tokens": 185949713.0, - "step": 154560 - }, - { - "entropy": 1.86880983710289, - "epoch": 0.4791534758931429, - "grad_norm": 8.985418319702148, - "learning_rate": 3.6547423261838103e-06, - "loss": 0.4369, - "mean_token_accuracy": 0.8528073132038116, - "num_tokens": 185961110.0, - "step": 154570 - }, - { - "entropy": 1.873310026526451, - "epoch": 0.4791844750181926, - "grad_norm": 4.159888744354248, - "learning_rate": 3.654624108268179e-06, - "loss": 0.4601, - "mean_token_accuracy": 0.856771194934845, - "num_tokens": 185972911.0, - "step": 154580 - }, - { - "entropy": 1.809093876183033, - "epoch": 0.4792154741432423, - "grad_norm": 4.302996635437012, - "learning_rate": 3.6545059018235918e-06, - "loss": 0.4054, - "mean_token_accuracy": 0.8638216108083725, - "num_tokens": 185985591.0, - "step": 154590 - }, - { - "entropy": 1.8471155345439911, - "epoch": 0.479246473268292, - "grad_norm": 7.965666770935059, - "learning_rate": 3.654387706848195e-06, - "loss": 0.4537, - "mean_token_accuracy": 0.8619759008288383, - "num_tokens": 185998248.0, - "step": 154600 - }, - { - "entropy": 1.7570737972855568, - "epoch": 0.4792774723933417, - "grad_norm": 5.8140764236450195, - "learning_rate": 3.654269523340134e-06, - "loss": 0.3326, - "mean_token_accuracy": 0.8703777492046356, - "num_tokens": 186011817.0, - "step": 154610 - }, - { - "entropy": 1.7383157536387444, - "epoch": 0.4793084715183914, - "grad_norm": 3.960681200027466, - "learning_rate": 3.654151351297555e-06, - "loss": 0.3605, - "mean_token_accuracy": 0.8690917834639549, - "num_tokens": 186025532.0, - "step": 154620 - }, - { - "entropy": 1.9056962199509144, - "epoch": 0.47933947064344107, - "grad_norm": 6.778663158416748, - "learning_rate": 3.6540331907186033e-06, - "loss": 0.4353, - "mean_token_accuracy": 0.8545536309480667, - "num_tokens": 186037066.0, - "step": 154630 - }, - { - "entropy": 1.9246233195066451, - "epoch": 0.4793704697684908, - "grad_norm": 8.319918632507324, - "learning_rate": 3.653915041601426e-06, - "loss": 0.4952, - "mean_token_accuracy": 0.8444161772727966, - "num_tokens": 186047654.0, - "step": 154640 - }, - { - "entropy": 1.9099677562713624, - "epoch": 0.47940146889354046, - "grad_norm": 3.9827780723571777, - "learning_rate": 3.6537969039441702e-06, - "loss": 0.4456, - "mean_token_accuracy": 0.8582230001688004, - "num_tokens": 186059198.0, - "step": 154650 - }, - { - "entropy": 1.8121528401970863, - "epoch": 0.4794324680185902, - "grad_norm": 7.723804950714111, - "learning_rate": 3.653678777744984e-06, - "loss": 0.3769, - "mean_token_accuracy": 0.8730279862880707, - "num_tokens": 186072057.0, - "step": 154660 - }, - { - "entropy": 1.9339038461446763, - "epoch": 0.47946346714363985, - "grad_norm": 7.433446884155273, - "learning_rate": 3.6535606630020144e-06, - "loss": 0.519, - "mean_token_accuracy": 0.8461462393403053, - "num_tokens": 186083085.0, - "step": 154670 - }, - { - "entropy": 1.8623194843530655, - "epoch": 0.4794944662686896, - "grad_norm": 7.5080461502075195, - "learning_rate": 3.65344255971341e-06, - "loss": 0.4641, - "mean_token_accuracy": 0.8542502373456955, - "num_tokens": 186095411.0, - "step": 154680 - }, - { - "entropy": 1.8670690104365348, - "epoch": 0.47952546539373925, - "grad_norm": 5.185728073120117, - "learning_rate": 3.65332446787732e-06, - "loss": 0.4811, - "mean_token_accuracy": 0.8545142263174057, - "num_tokens": 186107457.0, - "step": 154690 - }, - { - "entropy": 1.907190305739641, - "epoch": 0.47955646451878897, - "grad_norm": 7.39229679107666, - "learning_rate": 3.6532063874918936e-06, - "loss": 0.4768, - "mean_token_accuracy": 0.8454447597265243, - "num_tokens": 186119514.0, - "step": 154700 - }, - { - "entropy": 1.809042975306511, - "epoch": 0.47958746364383864, - "grad_norm": 6.491355895996094, - "learning_rate": 3.65308831855528e-06, - "loss": 0.366, - "mean_token_accuracy": 0.8723908111453056, - "num_tokens": 186132317.0, - "step": 154710 - }, - { - "entropy": 1.9088991969823836, - "epoch": 0.47961846276888837, - "grad_norm": 8.895363807678223, - "learning_rate": 3.6529702610656294e-06, - "loss": 0.4835, - "mean_token_accuracy": 0.8540124624967576, - "num_tokens": 186143250.0, - "step": 154720 - }, - { - "entropy": 1.8378096505999566, - "epoch": 0.47964946189393803, - "grad_norm": 5.12782096862793, - "learning_rate": 3.652852215021092e-06, - "loss": 0.4399, - "mean_token_accuracy": 0.8618904039263725, - "num_tokens": 186154946.0, - "step": 154730 - }, - { - "entropy": 1.8013930171728134, - "epoch": 0.47968046101898776, - "grad_norm": 8.057185173034668, - "learning_rate": 3.6527341804198193e-06, - "loss": 0.4147, - "mean_token_accuracy": 0.8602392926812172, - "num_tokens": 186167660.0, - "step": 154740 - }, - { - "entropy": 1.8815608084201814, - "epoch": 0.4797114601440374, - "grad_norm": 8.834275245666504, - "learning_rate": 3.6526161572599616e-06, - "loss": 0.4896, - "mean_token_accuracy": 0.8500603690743447, - "num_tokens": 186178956.0, - "step": 154750 - }, - { - "entropy": 1.9094288021326065, - "epoch": 0.47974245926908715, - "grad_norm": 8.339888572692871, - "learning_rate": 3.6524981455396715e-06, - "loss": 0.5112, - "mean_token_accuracy": 0.8476811647415161, - "num_tokens": 186190800.0, - "step": 154760 - }, - { - "entropy": 1.8156311631202697, - "epoch": 0.4797734583941368, - "grad_norm": 3.8487701416015625, - "learning_rate": 3.6523801452571006e-06, - "loss": 0.3939, - "mean_token_accuracy": 0.8534475967288018, - "num_tokens": 186203466.0, - "step": 154770 - }, - { - "entropy": 1.8079561397433281, - "epoch": 0.47980445751918654, - "grad_norm": 3.5231499671936035, - "learning_rate": 3.652262156410402e-06, - "loss": 0.4518, - "mean_token_accuracy": 0.8429020419716835, - "num_tokens": 186215727.0, - "step": 154780 - }, - { - "entropy": 1.7731987565755845, - "epoch": 0.4798354566442362, - "grad_norm": 8.213481903076172, - "learning_rate": 3.6521441789977287e-06, - "loss": 0.3805, - "mean_token_accuracy": 0.868919475376606, - "num_tokens": 186228292.0, - "step": 154790 - }, - { - "entropy": 1.8571752443909646, - "epoch": 0.47986645576928594, - "grad_norm": 4.771162033081055, - "learning_rate": 3.6520262130172334e-06, - "loss": 0.4432, - "mean_token_accuracy": 0.8500853896141052, - "num_tokens": 186239711.0, - "step": 154800 - }, - { - "entropy": 1.8680015295743941, - "epoch": 0.4798974548943356, - "grad_norm": 9.111491203308105, - "learning_rate": 3.6519082584670694e-06, - "loss": 0.4754, - "mean_token_accuracy": 0.8505641296505928, - "num_tokens": 186251539.0, - "step": 154810 - }, - { - "entropy": 1.7599168375134469, - "epoch": 0.4799284540193853, - "grad_norm": 7.46876335144043, - "learning_rate": 3.6517903153453934e-06, - "loss": 0.412, - "mean_token_accuracy": 0.8639022320508957, - "num_tokens": 186265063.0, - "step": 154820 - }, - { - "entropy": 1.8380054078996182, - "epoch": 0.479959453144435, - "grad_norm": 2.27538800239563, - "learning_rate": 3.651672383650357e-06, - "loss": 0.4376, - "mean_token_accuracy": 0.8478976964950562, - "num_tokens": 186277300.0, - "step": 154830 - }, - { - "entropy": 1.91184870749712, - "epoch": 0.47999045226948467, - "grad_norm": 4.427165985107422, - "learning_rate": 3.651554463380117e-06, - "loss": 0.4953, - "mean_token_accuracy": 0.8442239791154862, - "num_tokens": 186288782.0, - "step": 154840 - }, - { - "entropy": 1.7352715209126472, - "epoch": 0.4800214513945344, - "grad_norm": 4.620881080627441, - "learning_rate": 3.6514365545328283e-06, - "loss": 0.3555, - "mean_token_accuracy": 0.8682469353079796, - "num_tokens": 186302145.0, - "step": 154850 - }, - { - "entropy": 1.8953940749168396, - "epoch": 0.48005245051958406, - "grad_norm": 8.379203796386719, - "learning_rate": 3.6513186571066473e-06, - "loss": 0.4744, - "mean_token_accuracy": 0.851770706474781, - "num_tokens": 186313454.0, - "step": 154860 - }, - { - "entropy": 1.8896978572010994, - "epoch": 0.4800834496446338, - "grad_norm": 9.247343063354492, - "learning_rate": 3.6512007710997295e-06, - "loss": 0.4817, - "mean_token_accuracy": 0.8484896406531334, - "num_tokens": 186325023.0, - "step": 154870 - }, - { - "entropy": 1.900222858786583, - "epoch": 0.48011444876968346, - "grad_norm": 9.006083488464355, - "learning_rate": 3.6510828965102326e-06, - "loss": 0.4594, - "mean_token_accuracy": 0.8547819867730141, - "num_tokens": 186336470.0, - "step": 154880 - }, - { - "entropy": 1.805259671807289, - "epoch": 0.4801454478947332, - "grad_norm": 7.851848125457764, - "learning_rate": 3.6509650333363135e-06, - "loss": 0.4236, - "mean_token_accuracy": 0.8576851665973664, - "num_tokens": 186348769.0, - "step": 154890 - }, - { - "entropy": 1.9134155437350273, - "epoch": 0.48017644701978285, - "grad_norm": 8.607007026672363, - "learning_rate": 3.6508471815761283e-06, - "loss": 0.4927, - "mean_token_accuracy": 0.845287999510765, - "num_tokens": 186359818.0, - "step": 154900 - }, - { - "entropy": 1.8226548954844475, - "epoch": 0.4802074461448326, - "grad_norm": 7.391361236572266, - "learning_rate": 3.6507293412278367e-06, - "loss": 0.4301, - "mean_token_accuracy": 0.8539162278175354, - "num_tokens": 186372552.0, - "step": 154910 - }, - { - "entropy": 1.8565132528543473, - "epoch": 0.48023844526988224, - "grad_norm": 8.47097110748291, - "learning_rate": 3.650611512289597e-06, - "loss": 0.5052, - "mean_token_accuracy": 0.8450149267911911, - "num_tokens": 186384399.0, - "step": 154920 - }, - { - "entropy": 1.8987506821751594, - "epoch": 0.48026944439493197, - "grad_norm": 9.051301002502441, - "learning_rate": 3.6504936947595664e-06, - "loss": 0.4538, - "mean_token_accuracy": 0.8581790804862977, - "num_tokens": 186395705.0, - "step": 154930 - }, - { - "entropy": 1.7860086098313332, - "epoch": 0.48030044351998163, - "grad_norm": 7.646950721740723, - "learning_rate": 3.6503758886359053e-06, - "loss": 0.3821, - "mean_token_accuracy": 0.8590388312935829, - "num_tokens": 186408158.0, - "step": 154940 - }, - { - "entropy": 1.835461364686489, - "epoch": 0.48033144264503136, - "grad_norm": 6.986077308654785, - "learning_rate": 3.650258093916774e-06, - "loss": 0.441, - "mean_token_accuracy": 0.8536029830574989, - "num_tokens": 186420083.0, - "step": 154950 - }, - { - "entropy": 1.7754562705755235, - "epoch": 0.48036244177008103, - "grad_norm": 7.720706462860107, - "learning_rate": 3.6501403106003315e-06, - "loss": 0.3864, - "mean_token_accuracy": 0.8717953890562058, - "num_tokens": 186432804.0, - "step": 154960 - }, - { - "entropy": 1.8868480697274208, - "epoch": 0.48039344089513075, - "grad_norm": 8.321026802062988, - "learning_rate": 3.650022538684739e-06, - "loss": 0.4553, - "mean_token_accuracy": 0.8510920956730843, - "num_tokens": 186444348.0, - "step": 154970 - }, - { - "entropy": 1.8335092812776566, - "epoch": 0.4804244400201804, - "grad_norm": 7.6114325523376465, - "learning_rate": 3.6499047781681557e-06, - "loss": 0.3916, - "mean_token_accuracy": 0.8667181313037873, - "num_tokens": 186456211.0, - "step": 154980 - }, - { - "entropy": 1.83046106249094, - "epoch": 0.48045543914523015, - "grad_norm": 7.575179576873779, - "learning_rate": 3.649787029048745e-06, - "loss": 0.4178, - "mean_token_accuracy": 0.8602567866444588, - "num_tokens": 186468895.0, - "step": 154990 - }, - { - "entropy": 1.7130945861339568, - "epoch": 0.4804864382702798, - "grad_norm": 2.742889404296875, - "learning_rate": 3.6496692913246674e-06, - "loss": 0.3431, - "mean_token_accuracy": 0.876962573826313, - "num_tokens": 186482798.0, - "step": 155000 - }, - { - "entropy": 1.9094618245959283, - "epoch": 0.48051743739532954, - "grad_norm": 8.246910095214844, - "learning_rate": 3.649551564994085e-06, - "loss": 0.4625, - "mean_token_accuracy": 0.8591118276119232, - "num_tokens": 186494431.0, - "step": 155010 - }, - { - "entropy": 1.8281339153647422, - "epoch": 0.4805484365203792, - "grad_norm": 9.279541969299316, - "learning_rate": 3.6494338500551612e-06, - "loss": 0.4061, - "mean_token_accuracy": 0.8599943906068802, - "num_tokens": 186506615.0, - "step": 155020 - }, - { - "entropy": 1.7933892510831355, - "epoch": 0.48057943564542893, - "grad_norm": 7.063535690307617, - "learning_rate": 3.6493161465060584e-06, - "loss": 0.3894, - "mean_token_accuracy": 0.8706391632556916, - "num_tokens": 186520167.0, - "step": 155030 - }, - { - "entropy": 1.8618095114827156, - "epoch": 0.4806104347704786, - "grad_norm": 8.329252243041992, - "learning_rate": 3.649198454344939e-06, - "loss": 0.4554, - "mean_token_accuracy": 0.851315900683403, - "num_tokens": 186531387.0, - "step": 155040 - }, - { - "entropy": 1.9018240422010422, - "epoch": 0.4806414338955283, - "grad_norm": 7.958313941955566, - "learning_rate": 3.649080773569969e-06, - "loss": 0.482, - "mean_token_accuracy": 0.8383594647049903, - "num_tokens": 186543471.0, - "step": 155050 - }, - { - "entropy": 1.8148971542716026, - "epoch": 0.480672433020578, - "grad_norm": 8.183568954467773, - "learning_rate": 3.648963104179311e-06, - "loss": 0.4269, - "mean_token_accuracy": 0.8573694407939911, - "num_tokens": 186555739.0, - "step": 155060 - }, - { - "entropy": 1.898486042022705, - "epoch": 0.48070343214562766, - "grad_norm": 6.402829170227051, - "learning_rate": 3.648845446171129e-06, - "loss": 0.4411, - "mean_token_accuracy": 0.8548410072922706, - "num_tokens": 186566808.0, - "step": 155070 - }, - { - "entropy": 1.853443591296673, - "epoch": 0.4807344312706774, - "grad_norm": 9.244396209716797, - "learning_rate": 3.6487277995435906e-06, - "loss": 0.4326, - "mean_token_accuracy": 0.8503358826041222, - "num_tokens": 186579310.0, - "step": 155080 - }, - { - "entropy": 1.8664480939507484, - "epoch": 0.48076543039572706, - "grad_norm": 4.412349700927734, - "learning_rate": 3.6486101642948586e-06, - "loss": 0.4325, - "mean_token_accuracy": 0.8586249127984047, - "num_tokens": 186591176.0, - "step": 155090 - }, - { - "entropy": 1.8777339145541192, - "epoch": 0.4807964295207768, - "grad_norm": 6.988900184631348, - "learning_rate": 3.6484925404230997e-06, - "loss": 0.4294, - "mean_token_accuracy": 0.863787741959095, - "num_tokens": 186602357.0, - "step": 155100 - }, - { - "entropy": 1.8698883458971978, - "epoch": 0.48082742864582645, - "grad_norm": 9.025288581848145, - "learning_rate": 3.6483749279264807e-06, - "loss": 0.4372, - "mean_token_accuracy": 0.8576495185494423, - "num_tokens": 186614146.0, - "step": 155110 - }, - { - "entropy": 1.8872638180851937, - "epoch": 0.4808584277708762, - "grad_norm": 8.719307899475098, - "learning_rate": 3.6482573268031675e-06, - "loss": 0.4322, - "mean_token_accuracy": 0.8448175966739655, - "num_tokens": 186626569.0, - "step": 155120 - }, - { - "entropy": 1.8500750228762626, - "epoch": 0.48088942689592584, - "grad_norm": 6.002628326416016, - "learning_rate": 3.6481397370513276e-06, - "loss": 0.5012, - "mean_token_accuracy": 0.8516894370317459, - "num_tokens": 186639402.0, - "step": 155130 - }, - { - "entropy": 1.9110220953822137, - "epoch": 0.48092042602097557, - "grad_norm": 6.769970417022705, - "learning_rate": 3.6480221586691295e-06, - "loss": 0.4582, - "mean_token_accuracy": 0.8488235414028168, - "num_tokens": 186651695.0, - "step": 155140 - }, - { - "entropy": 1.8391446053981781, - "epoch": 0.48095142514602524, - "grad_norm": 7.097575664520264, - "learning_rate": 3.647904591654739e-06, - "loss": 0.4181, - "mean_token_accuracy": 0.8646885514259338, - "num_tokens": 186663994.0, - "step": 155150 - }, - { - "entropy": 1.9141424641013145, - "epoch": 0.48098242427107496, - "grad_norm": 7.152270317077637, - "learning_rate": 3.6477870360063257e-06, - "loss": 0.4834, - "mean_token_accuracy": 0.8485850319266319, - "num_tokens": 186675453.0, - "step": 155160 - }, - { - "entropy": 1.9391047358512878, - "epoch": 0.48101342339612463, - "grad_norm": 8.798613548278809, - "learning_rate": 3.6476694917220577e-06, - "loss": 0.5, - "mean_token_accuracy": 0.8436856657266617, - "num_tokens": 186686372.0, - "step": 155170 - }, - { - "entropy": 1.8622770234942436, - "epoch": 0.48104442252117435, - "grad_norm": 8.492561340332031, - "learning_rate": 3.647551958800106e-06, - "loss": 0.4248, - "mean_token_accuracy": 0.8657533332705498, - "num_tokens": 186697676.0, - "step": 155180 - }, - { - "entropy": 1.9052131861448287, - "epoch": 0.481075421646224, - "grad_norm": 6.784976959228516, - "learning_rate": 3.6474344372386383e-06, - "loss": 0.483, - "mean_token_accuracy": 0.8484751611948014, - "num_tokens": 186708270.0, - "step": 155190 - }, - { - "entropy": 1.8822030156850815, - "epoch": 0.48110642077127375, - "grad_norm": 7.78303337097168, - "learning_rate": 3.647316927035825e-06, - "loss": 0.4875, - "mean_token_accuracy": 0.8471454083919525, - "num_tokens": 186719190.0, - "step": 155200 - }, - { - "entropy": 1.8744702443480492, - "epoch": 0.4811374198963234, - "grad_norm": 9.0418701171875, - "learning_rate": 3.6471994281898366e-06, - "loss": 0.4277, - "mean_token_accuracy": 0.8550082817673683, - "num_tokens": 186730876.0, - "step": 155210 - }, - { - "entropy": 1.905518215894699, - "epoch": 0.48116841902137314, - "grad_norm": 7.297862529754639, - "learning_rate": 3.647081940698843e-06, - "loss": 0.438, - "mean_token_accuracy": 0.8591875582933426, - "num_tokens": 186741370.0, - "step": 155220 - }, - { - "entropy": 1.940250787138939, - "epoch": 0.4811994181464228, - "grad_norm": 8.297772407531738, - "learning_rate": 3.6469644645610177e-06, - "loss": 0.5156, - "mean_token_accuracy": 0.8442102998495102, - "num_tokens": 186752169.0, - "step": 155230 - }, - { - "entropy": 1.8991802141070366, - "epoch": 0.48123041727147253, - "grad_norm": 8.18435001373291, - "learning_rate": 3.64684699977453e-06, - "loss": 0.4477, - "mean_token_accuracy": 0.8534949243068695, - "num_tokens": 186763406.0, - "step": 155240 - }, - { - "entropy": 1.8912201315164565, - "epoch": 0.4812614163965222, - "grad_norm": 8.27580738067627, - "learning_rate": 3.6467295463375536e-06, - "loss": 0.4753, - "mean_token_accuracy": 0.8461031407117844, - "num_tokens": 186774770.0, - "step": 155250 - }, - { - "entropy": 1.9139169782400132, - "epoch": 0.4812924155215719, - "grad_norm": 7.336427688598633, - "learning_rate": 3.6466121042482605e-06, - "loss": 0.5342, - "mean_token_accuracy": 0.8405420258641243, - "num_tokens": 186786162.0, - "step": 155260 - }, - { - "entropy": 1.7847980469465257, - "epoch": 0.4813234146466216, - "grad_norm": 9.743826866149902, - "learning_rate": 3.6464946735048225e-06, - "loss": 0.4024, - "mean_token_accuracy": 0.8628578379750251, - "num_tokens": 186798530.0, - "step": 155270 - }, - { - "entropy": 1.729624892771244, - "epoch": 0.4813544137716713, - "grad_norm": 3.902719497680664, - "learning_rate": 3.6463772541054143e-06, - "loss": 0.3639, - "mean_token_accuracy": 0.8717215061187744, - "num_tokens": 186811845.0, - "step": 155280 - }, - { - "entropy": 1.8367839485406876, - "epoch": 0.481385412896721, - "grad_norm": 7.629048824310303, - "learning_rate": 3.6462598460482084e-06, - "loss": 0.4439, - "mean_token_accuracy": 0.8575905844569206, - "num_tokens": 186823908.0, - "step": 155290 - }, - { - "entropy": 1.8470647171139718, - "epoch": 0.4814164120217707, - "grad_norm": 7.2646403312683105, - "learning_rate": 3.6461424493313795e-06, - "loss": 0.4655, - "mean_token_accuracy": 0.8454138725996018, - "num_tokens": 186835758.0, - "step": 155300 - }, - { - "entropy": 1.843580712378025, - "epoch": 0.4814474111468204, - "grad_norm": 4.490911483764648, - "learning_rate": 3.646025063953103e-06, - "loss": 0.4521, - "mean_token_accuracy": 0.8536460235714912, - "num_tokens": 186848093.0, - "step": 155310 - }, - { - "entropy": 1.8249020904302597, - "epoch": 0.48147841027187005, - "grad_norm": 7.88754415512085, - "learning_rate": 3.645907689911552e-06, - "loss": 0.422, - "mean_token_accuracy": 0.8577623814344406, - "num_tokens": 186860208.0, - "step": 155320 - }, - { - "entropy": 1.8020548969507217, - "epoch": 0.4815094093969198, - "grad_norm": 3.8696632385253906, - "learning_rate": 3.6457903272049033e-06, - "loss": 0.4251, - "mean_token_accuracy": 0.8550468429923057, - "num_tokens": 186872705.0, - "step": 155330 - }, - { - "entropy": 1.8387231886386872, - "epoch": 0.48154040852196944, - "grad_norm": 9.060702323913574, - "learning_rate": 3.645672975831332e-06, - "loss": 0.4401, - "mean_token_accuracy": 0.857013863325119, - "num_tokens": 186884728.0, - "step": 155340 - }, - { - "entropy": 1.7827875435352325, - "epoch": 0.48157140764701917, - "grad_norm": 2.826335906982422, - "learning_rate": 3.645555635789015e-06, - "loss": 0.391, - "mean_token_accuracy": 0.8733478873968125, - "num_tokens": 186897871.0, - "step": 155350 - }, - { - "entropy": 1.844759140908718, - "epoch": 0.48160240677206884, - "grad_norm": 7.48664665222168, - "learning_rate": 3.6454383070761275e-06, - "loss": 0.447, - "mean_token_accuracy": 0.8576426088809967, - "num_tokens": 186909772.0, - "step": 155360 - }, - { - "entropy": 1.8383795037865638, - "epoch": 0.48163340589711856, - "grad_norm": 12.733421325683594, - "learning_rate": 3.6453209896908476e-06, - "loss": 0.4823, - "mean_token_accuracy": 0.8490657106041908, - "num_tokens": 186921500.0, - "step": 155370 - }, - { - "entropy": 1.7978422671556473, - "epoch": 0.48166440502216823, - "grad_norm": 4.8395233154296875, - "learning_rate": 3.645203683631352e-06, - "loss": 0.4023, - "mean_token_accuracy": 0.8567419424653053, - "num_tokens": 186934087.0, - "step": 155380 - }, - { - "entropy": 1.8284786969423295, - "epoch": 0.48169540414721795, - "grad_norm": 3.7645609378814697, - "learning_rate": 3.6450863888958197e-06, - "loss": 0.4494, - "mean_token_accuracy": 0.8614578813314437, - "num_tokens": 186945729.0, - "step": 155390 - }, - { - "entropy": 1.7969519719481468, - "epoch": 0.4817264032722676, - "grad_norm": 3.680454730987549, - "learning_rate": 3.6449691054824275e-06, - "loss": 0.3915, - "mean_token_accuracy": 0.872539047896862, - "num_tokens": 186957718.0, - "step": 155400 - }, - { - "entropy": 1.8333932682871819, - "epoch": 0.48175740239731735, - "grad_norm": 7.701864242553711, - "learning_rate": 3.644851833389354e-06, - "loss": 0.4355, - "mean_token_accuracy": 0.8568485960364342, - "num_tokens": 186969432.0, - "step": 155410 - }, - { - "entropy": 1.7963881850242616, - "epoch": 0.481788401522367, - "grad_norm": 7.959238529205322, - "learning_rate": 3.64473457261478e-06, - "loss": 0.4434, - "mean_token_accuracy": 0.8592301860451699, - "num_tokens": 186981898.0, - "step": 155420 - }, - { - "entropy": 1.8788066983222962, - "epoch": 0.48181940064741674, - "grad_norm": 8.316329002380371, - "learning_rate": 3.6446173231568833e-06, - "loss": 0.4416, - "mean_token_accuracy": 0.8591387689113616, - "num_tokens": 186993009.0, - "step": 155430 - }, - { - "entropy": 1.7975234732031822, - "epoch": 0.4818503997724664, - "grad_norm": 4.090932846069336, - "learning_rate": 3.644500085013844e-06, - "loss": 0.4096, - "mean_token_accuracy": 0.8513005197048187, - "num_tokens": 187005781.0, - "step": 155440 - }, - { - "entropy": 1.8089564740657806, - "epoch": 0.48188139889751613, - "grad_norm": 6.5081000328063965, - "learning_rate": 3.644382858183843e-06, - "loss": 0.4099, - "mean_token_accuracy": 0.8633261471986771, - "num_tokens": 187018552.0, - "step": 155450 - }, - { - "entropy": 1.7889938607811928, - "epoch": 0.4819123980225658, - "grad_norm": 3.51275372505188, - "learning_rate": 3.6442656426650603e-06, - "loss": 0.407, - "mean_token_accuracy": 0.8629370719194412, - "num_tokens": 187032150.0, - "step": 155460 - }, - { - "entropy": 1.8642136842012405, - "epoch": 0.4819433971476155, - "grad_norm": 7.637637615203857, - "learning_rate": 3.6441484384556776e-06, - "loss": 0.4404, - "mean_token_accuracy": 0.8616564303636551, - "num_tokens": 187043436.0, - "step": 155470 - }, - { - "entropy": 1.8972400605678559, - "epoch": 0.4819743962726652, - "grad_norm": 9.601285934448242, - "learning_rate": 3.6440312455538756e-06, - "loss": 0.4747, - "mean_token_accuracy": 0.8566853240132332, - "num_tokens": 187053461.0, - "step": 155480 - }, - { - "entropy": 1.905197212100029, - "epoch": 0.4820053953977149, - "grad_norm": 6.226647853851318, - "learning_rate": 3.643914063957837e-06, - "loss": 0.4993, - "mean_token_accuracy": 0.849284790456295, - "num_tokens": 187064477.0, - "step": 155490 - }, - { - "entropy": 1.848924145102501, - "epoch": 0.4820363945227646, - "grad_norm": 8.730347633361816, - "learning_rate": 3.643796893665743e-06, - "loss": 0.4393, - "mean_token_accuracy": 0.8531228393316269, - "num_tokens": 187076141.0, - "step": 155500 - }, - { - "entropy": 1.9209964841604232, - "epoch": 0.4820673936478143, - "grad_norm": 10.016907691955566, - "learning_rate": 3.643679734675778e-06, - "loss": 0.4781, - "mean_token_accuracy": 0.8399276688694954, - "num_tokens": 187087179.0, - "step": 155510 - }, - { - "entropy": 1.811716277897358, - "epoch": 0.482098392772864, - "grad_norm": 10.810657501220703, - "learning_rate": 3.643562586986124e-06, - "loss": 0.4141, - "mean_token_accuracy": 0.8532059162855148, - "num_tokens": 187099880.0, - "step": 155520 - }, - { - "entropy": 1.8917767360806466, - "epoch": 0.4821293918979137, - "grad_norm": 9.377852439880371, - "learning_rate": 3.6434454505949645e-06, - "loss": 0.464, - "mean_token_accuracy": 0.8542043223977089, - "num_tokens": 187111103.0, - "step": 155530 - }, - { - "entropy": 1.751031818985939, - "epoch": 0.4821603910229634, - "grad_norm": 11.62239933013916, - "learning_rate": 3.6433283255004835e-06, - "loss": 0.3926, - "mean_token_accuracy": 0.8702298268675804, - "num_tokens": 187124247.0, - "step": 155540 - }, - { - "entropy": 1.7074541047215461, - "epoch": 0.4821913901480131, - "grad_norm": 4.805656909942627, - "learning_rate": 3.643211211700866e-06, - "loss": 0.3803, - "mean_token_accuracy": 0.8729520246386528, - "num_tokens": 187138198.0, - "step": 155550 - }, - { - "entropy": 1.82494997382164, - "epoch": 0.48222238927306277, - "grad_norm": 4.265288352966309, - "learning_rate": 3.6430941091942967e-06, - "loss": 0.4014, - "mean_token_accuracy": 0.8578064441680908, - "num_tokens": 187150394.0, - "step": 155560 - }, - { - "entropy": 1.8820818334817886, - "epoch": 0.48225338839811244, - "grad_norm": 8.813608169555664, - "learning_rate": 3.6429770179789605e-06, - "loss": 0.4503, - "mean_token_accuracy": 0.8515245079994201, - "num_tokens": 187161557.0, - "step": 155570 - }, - { - "entropy": 1.867897354066372, - "epoch": 0.48228438752316216, - "grad_norm": 7.266292572021484, - "learning_rate": 3.6428599380530417e-06, - "loss": 0.4301, - "mean_token_accuracy": 0.8602555438876152, - "num_tokens": 187173114.0, - "step": 155580 - }, - { - "entropy": 1.8383618667721748, - "epoch": 0.48231538664821183, - "grad_norm": 3.8756728172302246, - "learning_rate": 3.6427428694147288e-06, - "loss": 0.4232, - "mean_token_accuracy": 0.8568738460540771, - "num_tokens": 187184447.0, - "step": 155590 - }, - { - "entropy": 1.8275836020708085, - "epoch": 0.48234638577326155, - "grad_norm": 5.545556545257568, - "learning_rate": 3.642625812062206e-06, - "loss": 0.444, - "mean_token_accuracy": 0.8490733534097672, - "num_tokens": 187196783.0, - "step": 155600 - }, - { - "entropy": 1.8141174018383026, - "epoch": 0.4823773848983112, - "grad_norm": 3.5136075019836426, - "learning_rate": 3.642508765993661e-06, - "loss": 0.4333, - "mean_token_accuracy": 0.8561397686600685, - "num_tokens": 187209895.0, - "step": 155610 - }, - { - "entropy": 1.8496269896626472, - "epoch": 0.48240838402336095, - "grad_norm": 7.60167932510376, - "learning_rate": 3.6423917312072815e-06, - "loss": 0.4274, - "mean_token_accuracy": 0.862434770166874, - "num_tokens": 187221932.0, - "step": 155620 - }, - { - "entropy": 1.7664272099733354, - "epoch": 0.4824393831484106, - "grad_norm": 7.53615140914917, - "learning_rate": 3.6422747077012544e-06, - "loss": 0.3898, - "mean_token_accuracy": 0.8608724012970924, - "num_tokens": 187234329.0, - "step": 155630 - }, - { - "entropy": 1.8685143813490868, - "epoch": 0.48247038227346034, - "grad_norm": 8.27384090423584, - "learning_rate": 3.6421576954737676e-06, - "loss": 0.4483, - "mean_token_accuracy": 0.8536564335227013, - "num_tokens": 187245703.0, - "step": 155640 - }, - { - "entropy": 1.715440958738327, - "epoch": 0.48250138139851, - "grad_norm": 6.6035637855529785, - "learning_rate": 3.6420406945230103e-06, - "loss": 0.3223, - "mean_token_accuracy": 0.8742220461368561, - "num_tokens": 187258474.0, - "step": 155650 - }, - { - "entropy": 1.8504155680537224, - "epoch": 0.48253238052355973, - "grad_norm": 9.480717658996582, - "learning_rate": 3.6419237048471704e-06, - "loss": 0.4204, - "mean_token_accuracy": 0.8578575730323792, - "num_tokens": 187270524.0, - "step": 155660 - }, - { - "entropy": 1.7659525617957115, - "epoch": 0.4825633796486094, - "grad_norm": 8.356760025024414, - "learning_rate": 3.6418067264444382e-06, - "loss": 0.4063, - "mean_token_accuracy": 0.8561761811375618, - "num_tokens": 187283451.0, - "step": 155670 - }, - { - "entropy": 1.8657870590686798, - "epoch": 0.4825943787736591, - "grad_norm": 9.368040084838867, - "learning_rate": 3.641689759313003e-06, - "loss": 0.4418, - "mean_token_accuracy": 0.8581396222114563, - "num_tokens": 187295040.0, - "step": 155680 - }, - { - "entropy": 1.8435973614454269, - "epoch": 0.4826253778987088, - "grad_norm": 8.829280853271484, - "learning_rate": 3.6415728034510545e-06, - "loss": 0.4342, - "mean_token_accuracy": 0.8657978147268295, - "num_tokens": 187306210.0, - "step": 155690 - }, - { - "entropy": 1.9003707140684127, - "epoch": 0.4826563770237585, - "grad_norm": 10.332893371582031, - "learning_rate": 3.6414558588567827e-06, - "loss": 0.4659, - "mean_token_accuracy": 0.8597208350896836, - "num_tokens": 187316273.0, - "step": 155700 - }, - { - "entropy": 1.8557246506214142, - "epoch": 0.4826873761488082, - "grad_norm": 7.9659247398376465, - "learning_rate": 3.64133892552838e-06, - "loss": 0.4304, - "mean_token_accuracy": 0.8631693422794342, - "num_tokens": 187328277.0, - "step": 155710 - }, - { - "entropy": 1.909783835709095, - "epoch": 0.4827183752738579, - "grad_norm": 4.126440525054932, - "learning_rate": 3.6412220034640367e-06, - "loss": 0.5136, - "mean_token_accuracy": 0.8437858536839485, - "num_tokens": 187339540.0, - "step": 155720 - }, - { - "entropy": 1.8335693284869194, - "epoch": 0.4827493743989076, - "grad_norm": 8.314302444458008, - "learning_rate": 3.6411050926619444e-06, - "loss": 0.434, - "mean_token_accuracy": 0.85353202521801, - "num_tokens": 187351930.0, - "step": 155730 - }, - { - "entropy": 1.7986718460917472, - "epoch": 0.4827803735239573, - "grad_norm": 6.746932506561279, - "learning_rate": 3.6409881931202954e-06, - "loss": 0.4154, - "mean_token_accuracy": 0.8710423827171325, - "num_tokens": 187364243.0, - "step": 155740 - }, - { - "entropy": 1.92829280346632, - "epoch": 0.482811372649007, - "grad_norm": 9.083883285522461, - "learning_rate": 3.640871304837283e-06, - "loss": 0.4782, - "mean_token_accuracy": 0.8497691377997398, - "num_tokens": 187375649.0, - "step": 155750 - }, - { - "entropy": 1.866778840124607, - "epoch": 0.4828423717740567, - "grad_norm": 4.199884414672852, - "learning_rate": 3.640754427811098e-06, - "loss": 0.4778, - "mean_token_accuracy": 0.8422569170594215, - "num_tokens": 187387617.0, - "step": 155760 - }, - { - "entropy": 1.8624927997589111, - "epoch": 0.48287337089910637, - "grad_norm": 8.201692581176758, - "learning_rate": 3.640637562039936e-06, - "loss": 0.4795, - "mean_token_accuracy": 0.8347114294767379, - "num_tokens": 187399916.0, - "step": 155770 - }, - { - "entropy": 1.7571727007627487, - "epoch": 0.4829043700241561, - "grad_norm": 8.598112106323242, - "learning_rate": 3.6405207075219895e-06, - "loss": 0.3851, - "mean_token_accuracy": 0.8643165245652199, - "num_tokens": 187413052.0, - "step": 155780 - }, - { - "entropy": 1.8955672517418862, - "epoch": 0.48293536914920576, - "grad_norm": 13.206428527832031, - "learning_rate": 3.640403864255453e-06, - "loss": 0.4662, - "mean_token_accuracy": 0.8522917911410332, - "num_tokens": 187423801.0, - "step": 155790 - }, - { - "entropy": 1.862819616496563, - "epoch": 0.4829663682742555, - "grad_norm": 3.849029064178467, - "learning_rate": 3.6402870322385207e-06, - "loss": 0.4488, - "mean_token_accuracy": 0.8516810446977615, - "num_tokens": 187435559.0, - "step": 155800 - }, - { - "entropy": 1.9115412473678588, - "epoch": 0.48299736739930516, - "grad_norm": 8.526803016662598, - "learning_rate": 3.6401702114693883e-06, - "loss": 0.4928, - "mean_token_accuracy": 0.8415917620062828, - "num_tokens": 187447277.0, - "step": 155810 - }, - { - "entropy": 1.7353981606662274, - "epoch": 0.4830283665243548, - "grad_norm": 10.263956069946289, - "learning_rate": 3.6400534019462497e-06, - "loss": 0.3582, - "mean_token_accuracy": 0.8638710469007492, - "num_tokens": 187461097.0, - "step": 155820 - }, - { - "entropy": 1.8627897635102273, - "epoch": 0.48305936564940455, - "grad_norm": 8.649343490600586, - "learning_rate": 3.6399366036673023e-06, - "loss": 0.4701, - "mean_token_accuracy": 0.8539001986384391, - "num_tokens": 187473687.0, - "step": 155830 - }, - { - "entropy": 1.839612002670765, - "epoch": 0.4830903647744542, - "grad_norm": 8.778943061828613, - "learning_rate": 3.639819816630742e-06, - "loss": 0.4125, - "mean_token_accuracy": 0.8621722176671028, - "num_tokens": 187486542.0, - "step": 155840 - }, - { - "entropy": 1.8706245437264442, - "epoch": 0.48312136389950394, - "grad_norm": 7.706948757171631, - "learning_rate": 3.6397030408347638e-06, - "loss": 0.4887, - "mean_token_accuracy": 0.8378767624497414, - "num_tokens": 187498647.0, - "step": 155850 - }, - { - "entropy": 1.8668534964323045, - "epoch": 0.4831523630245536, - "grad_norm": 7.943578243255615, - "learning_rate": 3.639586276277567e-06, - "loss": 0.4789, - "mean_token_accuracy": 0.8449800446629524, - "num_tokens": 187510115.0, - "step": 155860 - }, - { - "entropy": 1.9097621962428093, - "epoch": 0.48318336214960333, - "grad_norm": 9.294798851013184, - "learning_rate": 3.639469522957346e-06, - "loss": 0.512, - "mean_token_accuracy": 0.8440002262592315, - "num_tokens": 187521520.0, - "step": 155870 - }, - { - "entropy": 1.7814342930912972, - "epoch": 0.483214361274653, - "grad_norm": 4.55609655380249, - "learning_rate": 3.639352780872302e-06, - "loss": 0.3879, - "mean_token_accuracy": 0.8625999093055725, - "num_tokens": 187534109.0, - "step": 155880 - }, - { - "entropy": 1.8384138897061348, - "epoch": 0.48324536039970273, - "grad_norm": 8.546332359313965, - "learning_rate": 3.6392360500206303e-06, - "loss": 0.4519, - "mean_token_accuracy": 0.8485594004392624, - "num_tokens": 187545835.0, - "step": 155890 - }, - { - "entropy": 1.8684273108839988, - "epoch": 0.4832763595247524, - "grad_norm": 8.705604553222656, - "learning_rate": 3.639119330400532e-06, - "loss": 0.458, - "mean_token_accuracy": 0.8522221609950066, - "num_tokens": 187557414.0, - "step": 155900 - }, - { - "entropy": 1.8951046496629715, - "epoch": 0.4833073586498021, - "grad_norm": 7.8505377769470215, - "learning_rate": 3.6390026220102036e-06, - "loss": 0.4849, - "mean_token_accuracy": 0.8535401314496994, - "num_tokens": 187568187.0, - "step": 155910 - }, - { - "entropy": 1.8770427122712134, - "epoch": 0.4833383577748518, - "grad_norm": 7.7342915534973145, - "learning_rate": 3.6388859248478454e-06, - "loss": 0.438, - "mean_token_accuracy": 0.8514578476548195, - "num_tokens": 187580732.0, - "step": 155920 - }, - { - "entropy": 1.8046485051512717, - "epoch": 0.4833693568999015, - "grad_norm": 7.7772369384765625, - "learning_rate": 3.6387692389116584e-06, - "loss": 0.4334, - "mean_token_accuracy": 0.853971840441227, - "num_tokens": 187592862.0, - "step": 155930 - }, - { - "entropy": 1.8091088235378265, - "epoch": 0.4834003560249512, - "grad_norm": 4.339217662811279, - "learning_rate": 3.638652564199841e-06, - "loss": 0.3929, - "mean_token_accuracy": 0.8683247849345207, - "num_tokens": 187605469.0, - "step": 155940 - }, - { - "entropy": 1.8805622160434723, - "epoch": 0.4834313551500009, - "grad_norm": 8.848437309265137, - "learning_rate": 3.6385359007105956e-06, - "loss": 0.4171, - "mean_token_accuracy": 0.8534131139516831, - "num_tokens": 187617262.0, - "step": 155950 - }, - { - "entropy": 1.8057363733649254, - "epoch": 0.4834623542750506, - "grad_norm": 3.9022932052612305, - "learning_rate": 3.638419248442122e-06, - "loss": 0.4247, - "mean_token_accuracy": 0.8567571699619293, - "num_tokens": 187629721.0, - "step": 155960 - }, - { - "entropy": 1.764707237482071, - "epoch": 0.4834933534001003, - "grad_norm": 2.7294106483459473, - "learning_rate": 3.638302607392622e-06, - "loss": 0.4038, - "mean_token_accuracy": 0.8628931626677513, - "num_tokens": 187643292.0, - "step": 155970 - }, - { - "entropy": 1.804534560441971, - "epoch": 0.48352435252514997, - "grad_norm": 7.46668815612793, - "learning_rate": 3.6381859775602966e-06, - "loss": 0.4035, - "mean_token_accuracy": 0.8666567876935005, - "num_tokens": 187656242.0, - "step": 155980 - }, - { - "entropy": 1.9103981226682663, - "epoch": 0.4835553516501997, - "grad_norm": 7.5417070388793945, - "learning_rate": 3.638069358943349e-06, - "loss": 0.4645, - "mean_token_accuracy": 0.8543941050767898, - "num_tokens": 187667038.0, - "step": 155990 - }, - { - "entropy": 1.8362280517816543, - "epoch": 0.48358635077524936, - "grad_norm": 7.783841133117676, - "learning_rate": 3.637952751539982e-06, - "loss": 0.4361, - "mean_token_accuracy": 0.8486904546618461, - "num_tokens": 187679620.0, - "step": 156000 - }, - { - "entropy": 1.8745223417878152, - "epoch": 0.4836173499002991, - "grad_norm": 10.0465087890625, - "learning_rate": 3.6378361553483975e-06, - "loss": 0.4271, - "mean_token_accuracy": 0.8512848377227783, - "num_tokens": 187691256.0, - "step": 156010 - }, - { - "entropy": 1.9242184340953827, - "epoch": 0.48364834902534876, - "grad_norm": 8.397578239440918, - "learning_rate": 3.6377195703668004e-06, - "loss": 0.4736, - "mean_token_accuracy": 0.8493492394685745, - "num_tokens": 187701903.0, - "step": 156020 - }, - { - "entropy": 1.8979373127222061, - "epoch": 0.4836793481503985, - "grad_norm": 7.949219226837158, - "learning_rate": 3.6376029965933936e-06, - "loss": 0.5207, - "mean_token_accuracy": 0.8401141837239265, - "num_tokens": 187713014.0, - "step": 156030 - }, - { - "entropy": 1.8131269350647927, - "epoch": 0.48371034727544815, - "grad_norm": 3.8186416625976562, - "learning_rate": 3.637486434026381e-06, - "loss": 0.4211, - "mean_token_accuracy": 0.848060154914856, - "num_tokens": 187726114.0, - "step": 156040 - }, - { - "entropy": 1.7898790180683135, - "epoch": 0.4837413464004979, - "grad_norm": 4.015059947967529, - "learning_rate": 3.6373698826639674e-06, - "loss": 0.4027, - "mean_token_accuracy": 0.8551013439893722, - "num_tokens": 187739024.0, - "step": 156050 - }, - { - "entropy": 1.7694806531071663, - "epoch": 0.48377234552554754, - "grad_norm": 8.411758422851562, - "learning_rate": 3.6372533425043592e-06, - "loss": 0.3709, - "mean_token_accuracy": 0.8681419178843498, - "num_tokens": 187750917.0, - "step": 156060 - }, - { - "entropy": 1.8274760872125626, - "epoch": 0.4838033446505972, - "grad_norm": 9.502041816711426, - "learning_rate": 3.6371368135457597e-06, - "loss": 0.4486, - "mean_token_accuracy": 0.8524838626384735, - "num_tokens": 187763130.0, - "step": 156070 - }, - { - "entropy": 1.8318697839975357, - "epoch": 0.48383434377564694, - "grad_norm": 9.341939926147461, - "learning_rate": 3.637020295786377e-06, - "loss": 0.4545, - "mean_token_accuracy": 0.8428703010082245, - "num_tokens": 187775702.0, - "step": 156080 - }, - { - "entropy": 1.9090040355920792, - "epoch": 0.4838653429006966, - "grad_norm": 9.399673461914062, - "learning_rate": 3.6369037892244154e-06, - "loss": 0.4788, - "mean_token_accuracy": 0.8548586055636406, - "num_tokens": 187786390.0, - "step": 156090 - }, - { - "entropy": 1.8724202007055282, - "epoch": 0.48389634202574633, - "grad_norm": 9.181824684143066, - "learning_rate": 3.6367872938580817e-06, - "loss": 0.4125, - "mean_token_accuracy": 0.8572488963603974, - "num_tokens": 187798294.0, - "step": 156100 - }, - { - "entropy": 1.7142846815288066, - "epoch": 0.483927341150796, - "grad_norm": 7.860523223876953, - "learning_rate": 3.6366708096855852e-06, - "loss": 0.3695, - "mean_token_accuracy": 0.8716006681323052, - "num_tokens": 187812016.0, - "step": 156110 - }, - { - "entropy": 1.8110566526651382, - "epoch": 0.4839583402758457, - "grad_norm": 7.496535778045654, - "learning_rate": 3.6365543367051304e-06, - "loss": 0.4139, - "mean_token_accuracy": 0.8591806992888451, - "num_tokens": 187824148.0, - "step": 156120 - }, - { - "entropy": 1.7441804528236389, - "epoch": 0.4839893394008954, - "grad_norm": 8.527477264404297, - "learning_rate": 3.636437874914927e-06, - "loss": 0.3678, - "mean_token_accuracy": 0.8575242191553116, - "num_tokens": 187837951.0, - "step": 156130 - }, - { - "entropy": 1.7979930967092514, - "epoch": 0.4840203385259451, - "grad_norm": 10.310881614685059, - "learning_rate": 3.636321424313183e-06, - "loss": 0.4433, - "mean_token_accuracy": 0.8589319199323654, - "num_tokens": 187850444.0, - "step": 156140 - }, - { - "entropy": 1.8642814740538598, - "epoch": 0.4840513376509948, - "grad_norm": 7.190332889556885, - "learning_rate": 3.6362049848981064e-06, - "loss": 0.4609, - "mean_token_accuracy": 0.8498084455728531, - "num_tokens": 187862015.0, - "step": 156150 - }, - { - "entropy": 1.8755100816488266, - "epoch": 0.4840823367760445, - "grad_norm": 9.06082820892334, - "learning_rate": 3.6360885566679073e-06, - "loss": 0.482, - "mean_token_accuracy": 0.8422882303595542, - "num_tokens": 187873861.0, - "step": 156160 - }, - { - "entropy": 1.8259334176778794, - "epoch": 0.4841133359010942, - "grad_norm": 7.07421875, - "learning_rate": 3.635972139620794e-06, - "loss": 0.428, - "mean_token_accuracy": 0.8559049963951111, - "num_tokens": 187885749.0, - "step": 156170 - }, - { - "entropy": 1.933459311723709, - "epoch": 0.4841443350261439, - "grad_norm": 9.747174263000488, - "learning_rate": 3.635855733754977e-06, - "loss": 0.4557, - "mean_token_accuracy": 0.8511974141001701, - "num_tokens": 187896265.0, - "step": 156180 - }, - { - "entropy": 1.8441163718700408, - "epoch": 0.48417533415119357, - "grad_norm": 8.249030113220215, - "learning_rate": 3.6357393390686667e-06, - "loss": 0.4045, - "mean_token_accuracy": 0.8691418990492821, - "num_tokens": 187907705.0, - "step": 156190 - }, - { - "entropy": 1.819663205742836, - "epoch": 0.4842063332762433, - "grad_norm": 9.237526893615723, - "learning_rate": 3.635622955560073e-06, - "loss": 0.4107, - "mean_token_accuracy": 0.856706628203392, - "num_tokens": 187920290.0, - "step": 156200 - }, - { - "entropy": 1.8128308981657029, - "epoch": 0.48423733240129296, - "grad_norm": 7.997885704040527, - "learning_rate": 3.6355065832274085e-06, - "loss": 0.4652, - "mean_token_accuracy": 0.8497570842504502, - "num_tokens": 187933859.0, - "step": 156210 - }, - { - "entropy": 1.8780751511454583, - "epoch": 0.4842683315263427, - "grad_norm": 3.8760132789611816, - "learning_rate": 3.6353902220688827e-06, - "loss": 0.447, - "mean_token_accuracy": 0.849544820189476, - "num_tokens": 187945493.0, - "step": 156220 - }, - { - "entropy": 1.897641871869564, - "epoch": 0.48429933065139236, - "grad_norm": 7.152022361755371, - "learning_rate": 3.635273872082708e-06, - "loss": 0.4177, - "mean_token_accuracy": 0.8556644603610039, - "num_tokens": 187957061.0, - "step": 156230 - }, - { - "entropy": 1.8795344799757003, - "epoch": 0.4843303297764421, - "grad_norm": 3.9362809658050537, - "learning_rate": 3.6351575332670987e-06, - "loss": 0.4428, - "mean_token_accuracy": 0.8543588429689407, - "num_tokens": 187968642.0, - "step": 156240 - }, - { - "entropy": 1.8843262925744058, - "epoch": 0.48436132890149175, - "grad_norm": 7.554871082305908, - "learning_rate": 3.635041205620265e-06, - "loss": 0.4385, - "mean_token_accuracy": 0.8527411460876465, - "num_tokens": 187980875.0, - "step": 156250 - }, - { - "entropy": 1.8457639068365097, - "epoch": 0.4843923280265415, - "grad_norm": 8.525203704833984, - "learning_rate": 3.6349248891404204e-06, - "loss": 0.4698, - "mean_token_accuracy": 0.8586451008915901, - "num_tokens": 187992538.0, - "step": 156260 - }, - { - "entropy": 1.882865457236767, - "epoch": 0.48442332715159114, - "grad_norm": 9.19686222076416, - "learning_rate": 3.6348085838257794e-06, - "loss": 0.4687, - "mean_token_accuracy": 0.8533548563718796, - "num_tokens": 188003720.0, - "step": 156270 - }, - { - "entropy": 1.8969163656234742, - "epoch": 0.48445432627664087, - "grad_norm": 7.925225734710693, - "learning_rate": 3.634692289674556e-06, - "loss": 0.5061, - "mean_token_accuracy": 0.8379020988941193, - "num_tokens": 188015486.0, - "step": 156280 - }, - { - "entropy": 1.8043647065758706, - "epoch": 0.48448532540169054, - "grad_norm": 10.64844799041748, - "learning_rate": 3.6345760066849624e-06, - "loss": 0.4691, - "mean_token_accuracy": 0.8532945722341537, - "num_tokens": 188027781.0, - "step": 156290 - }, - { - "entropy": 1.7452705442905425, - "epoch": 0.4845163245267402, - "grad_norm": 4.376309871673584, - "learning_rate": 3.6344597348552156e-06, - "loss": 0.3688, - "mean_token_accuracy": 0.8649339213967323, - "num_tokens": 188041959.0, - "step": 156300 - }, - { - "entropy": 1.867486347258091, - "epoch": 0.48454732365178993, - "grad_norm": 7.461634159088135, - "learning_rate": 3.6343434741835288e-06, - "loss": 0.4327, - "mean_token_accuracy": 0.8641291365027428, - "num_tokens": 188054346.0, - "step": 156310 - }, - { - "entropy": 1.8779701933264732, - "epoch": 0.4845783227768396, - "grad_norm": 12.157458305358887, - "learning_rate": 3.634227224668119e-06, - "loss": 0.4465, - "mean_token_accuracy": 0.8633387625217438, - "num_tokens": 188065860.0, - "step": 156320 - }, - { - "entropy": 1.8805889412760735, - "epoch": 0.4846093219018893, - "grad_norm": 8.761711120605469, - "learning_rate": 3.634110986307202e-06, - "loss": 0.5089, - "mean_token_accuracy": 0.8404549643397331, - "num_tokens": 188077040.0, - "step": 156330 - }, - { - "entropy": 1.917221650481224, - "epoch": 0.484640321026939, - "grad_norm": 7.180342674255371, - "learning_rate": 3.6339947590989927e-06, - "loss": 0.4653, - "mean_token_accuracy": 0.8567236632108688, - "num_tokens": 188088847.0, - "step": 156340 - }, - { - "entropy": 1.758651900291443, - "epoch": 0.4846713201519887, - "grad_norm": 7.375646591186523, - "learning_rate": 3.633878543041709e-06, - "loss": 0.3852, - "mean_token_accuracy": 0.8604704290628433, - "num_tokens": 188102559.0, - "step": 156350 - }, - { - "entropy": 1.8245538994669914, - "epoch": 0.4847023192770384, - "grad_norm": 9.896560668945312, - "learning_rate": 3.6337623381335667e-06, - "loss": 0.4922, - "mean_token_accuracy": 0.8480986952781677, - "num_tokens": 188116273.0, - "step": 156360 - }, - { - "entropy": 1.7007096633315086, - "epoch": 0.4847333184020881, - "grad_norm": 6.117918014526367, - "learning_rate": 3.633646144372785e-06, - "loss": 0.404, - "mean_token_accuracy": 0.8717444121837616, - "num_tokens": 188130208.0, - "step": 156370 - }, - { - "entropy": 1.8957871600985527, - "epoch": 0.4847643175271378, - "grad_norm": 8.124541282653809, - "learning_rate": 3.6335299617575805e-06, - "loss": 0.4721, - "mean_token_accuracy": 0.853635823726654, - "num_tokens": 188141683.0, - "step": 156380 - }, - { - "entropy": 1.871186774969101, - "epoch": 0.4847953166521875, - "grad_norm": 8.871430397033691, - "learning_rate": 3.6334137902861723e-06, - "loss": 0.4375, - "mean_token_accuracy": 0.856009703874588, - "num_tokens": 188153756.0, - "step": 156390 - }, - { - "entropy": 1.8725305289030074, - "epoch": 0.48482631577723717, - "grad_norm": 8.753509521484375, - "learning_rate": 3.6332976299567785e-06, - "loss": 0.51, - "mean_token_accuracy": 0.8379634752869606, - "num_tokens": 188166036.0, - "step": 156400 - }, - { - "entropy": 1.8789052292704582, - "epoch": 0.4848573149022869, - "grad_norm": 8.494888305664062, - "learning_rate": 3.6331814807676176e-06, - "loss": 0.4757, - "mean_token_accuracy": 0.8496663123369217, - "num_tokens": 188177126.0, - "step": 156410 - }, - { - "entropy": 1.8479280322790146, - "epoch": 0.48488831402733656, - "grad_norm": 7.020564079284668, - "learning_rate": 3.6330653427169103e-06, - "loss": 0.4752, - "mean_token_accuracy": 0.8580684915184975, - "num_tokens": 188189522.0, - "step": 156420 - }, - { - "entropy": 1.849296373128891, - "epoch": 0.4849193131523863, - "grad_norm": 10.006750106811523, - "learning_rate": 3.6329492158028757e-06, - "loss": 0.4727, - "mean_token_accuracy": 0.8454330176115036, - "num_tokens": 188200987.0, - "step": 156430 - }, - { - "entropy": 1.8682186782360077, - "epoch": 0.48495031227743596, - "grad_norm": 8.694777488708496, - "learning_rate": 3.6328331000237343e-06, - "loss": 0.4601, - "mean_token_accuracy": 0.8542024463415145, - "num_tokens": 188213291.0, - "step": 156440 - }, - { - "entropy": 1.8008888192474841, - "epoch": 0.4849813114024857, - "grad_norm": 8.048171043395996, - "learning_rate": 3.6327169953777063e-06, - "loss": 0.4018, - "mean_token_accuracy": 0.8668388247489929, - "num_tokens": 188225876.0, - "step": 156450 - }, - { - "entropy": 1.799254597723484, - "epoch": 0.48501231052753535, - "grad_norm": 8.080785751342773, - "learning_rate": 3.632600901863014e-06, - "loss": 0.4142, - "mean_token_accuracy": 0.8576643332839012, - "num_tokens": 188238714.0, - "step": 156460 - }, - { - "entropy": 1.639521201699972, - "epoch": 0.4850433096525851, - "grad_norm": 3.9683926105499268, - "learning_rate": 3.6324848194778774e-06, - "loss": 0.3425, - "mean_token_accuracy": 0.869882382452488, - "num_tokens": 188253725.0, - "step": 156470 - }, - { - "entropy": 1.776284283399582, - "epoch": 0.48507430877763474, - "grad_norm": 4.678526401519775, - "learning_rate": 3.632368748220519e-06, - "loss": 0.4405, - "mean_token_accuracy": 0.8620742172002792, - "num_tokens": 188266283.0, - "step": 156480 - }, - { - "entropy": 1.8657296374440193, - "epoch": 0.48510530790268447, - "grad_norm": 7.574404239654541, - "learning_rate": 3.632252688089161e-06, - "loss": 0.4225, - "mean_token_accuracy": 0.863825935125351, - "num_tokens": 188277756.0, - "step": 156490 - }, - { - "entropy": 1.8865803450345993, - "epoch": 0.48513630702773414, - "grad_norm": 8.333385467529297, - "learning_rate": 3.6321366390820266e-06, - "loss": 0.4272, - "mean_token_accuracy": 0.8613347887992859, - "num_tokens": 188288700.0, - "step": 156500 - }, - { - "entropy": 1.9244218185544013, - "epoch": 0.48516730615278386, - "grad_norm": 7.839199066162109, - "learning_rate": 3.6320206011973373e-06, - "loss": 0.5009, - "mean_token_accuracy": 0.8391366973519325, - "num_tokens": 188300479.0, - "step": 156510 - }, - { - "entropy": 1.8372875064611436, - "epoch": 0.48519830527783353, - "grad_norm": 7.72141170501709, - "learning_rate": 3.6319045744333185e-06, - "loss": 0.423, - "mean_token_accuracy": 0.860981197655201, - "num_tokens": 188312684.0, - "step": 156520 - }, - { - "entropy": 1.7935153931379317, - "epoch": 0.48522930440288325, - "grad_norm": 4.041905879974365, - "learning_rate": 3.631788558788193e-06, - "loss": 0.415, - "mean_token_accuracy": 0.8546757638454437, - "num_tokens": 188325289.0, - "step": 156530 - }, - { - "entropy": 1.744502691924572, - "epoch": 0.4852603035279329, - "grad_norm": 7.012277603149414, - "learning_rate": 3.631672554260184e-06, - "loss": 0.3668, - "mean_token_accuracy": 0.8671909630298614, - "num_tokens": 188338884.0, - "step": 156540 - }, - { - "entropy": 1.786800318956375, - "epoch": 0.4852913026529826, - "grad_norm": 8.924103736877441, - "learning_rate": 3.6315565608475184e-06, - "loss": 0.3694, - "mean_token_accuracy": 0.8652423739433288, - "num_tokens": 188351601.0, - "step": 156550 - }, - { - "entropy": 1.798016108572483, - "epoch": 0.4853223017780323, - "grad_norm": 9.73193645477295, - "learning_rate": 3.631440578548419e-06, - "loss": 0.4074, - "mean_token_accuracy": 0.861903102695942, - "num_tokens": 188364272.0, - "step": 156560 - }, - { - "entropy": 1.7860716104507446, - "epoch": 0.485353300903082, - "grad_norm": 5.464559078216553, - "learning_rate": 3.631324607361113e-06, - "loss": 0.3677, - "mean_token_accuracy": 0.8641039207577705, - "num_tokens": 188377196.0, - "step": 156570 - }, - { - "entropy": 1.9405299216508864, - "epoch": 0.4853843000281317, - "grad_norm": 9.91318130493164, - "learning_rate": 3.6312086472838253e-06, - "loss": 0.5184, - "mean_token_accuracy": 0.8417086794972419, - "num_tokens": 188388093.0, - "step": 156580 - }, - { - "entropy": 1.8181221313774585, - "epoch": 0.4854152991531814, - "grad_norm": 11.408985137939453, - "learning_rate": 3.6310926983147823e-06, - "loss": 0.3895, - "mean_token_accuracy": 0.8606801643967629, - "num_tokens": 188400330.0, - "step": 156590 - }, - { - "entropy": 1.8644197478890419, - "epoch": 0.4854462982782311, - "grad_norm": 9.163566589355469, - "learning_rate": 3.630976760452211e-06, - "loss": 0.4779, - "mean_token_accuracy": 0.8528912767767907, - "num_tokens": 188411951.0, - "step": 156600 - }, - { - "entropy": 1.8152849718928337, - "epoch": 0.48547729740328077, - "grad_norm": 8.839181900024414, - "learning_rate": 3.6308608336943374e-06, - "loss": 0.4156, - "mean_token_accuracy": 0.8583212316036224, - "num_tokens": 188423655.0, - "step": 156610 - }, - { - "entropy": 1.8252029418945312, - "epoch": 0.4855082965283305, - "grad_norm": 8.845681190490723, - "learning_rate": 3.6307449180393895e-06, - "loss": 0.4381, - "mean_token_accuracy": 0.8624794527888298, - "num_tokens": 188435618.0, - "step": 156620 - }, - { - "entropy": 1.7919634029269218, - "epoch": 0.48553929565338017, - "grad_norm": 7.7746429443359375, - "learning_rate": 3.630629013485596e-06, - "loss": 0.3895, - "mean_token_accuracy": 0.8691168889403343, - "num_tokens": 188447995.0, - "step": 156630 - }, - { - "entropy": 1.7863436222076416, - "epoch": 0.4855702947784299, - "grad_norm": 8.812021255493164, - "learning_rate": 3.6305131200311835e-06, - "loss": 0.4042, - "mean_token_accuracy": 0.8654252752661705, - "num_tokens": 188460962.0, - "step": 156640 - }, - { - "entropy": 1.8167147532105445, - "epoch": 0.48560129390347956, - "grad_norm": 3.7734971046447754, - "learning_rate": 3.6303972376743814e-06, - "loss": 0.409, - "mean_token_accuracy": 0.8662436470389366, - "num_tokens": 188472440.0, - "step": 156650 - }, - { - "entropy": 1.864917366206646, - "epoch": 0.4856322930285293, - "grad_norm": 8.78889274597168, - "learning_rate": 3.630281366413419e-06, - "loss": 0.5209, - "mean_token_accuracy": 0.8403189569711685, - "num_tokens": 188484784.0, - "step": 156660 - }, - { - "entropy": 1.859625655412674, - "epoch": 0.48566329215357895, - "grad_norm": 7.059370040893555, - "learning_rate": 3.630165506246525e-06, - "loss": 0.4652, - "mean_token_accuracy": 0.8460963040590286, - "num_tokens": 188497245.0, - "step": 156670 - }, - { - "entropy": 1.7957852184772491, - "epoch": 0.4856942912786287, - "grad_norm": 8.198931694030762, - "learning_rate": 3.6300496571719295e-06, - "loss": 0.4054, - "mean_token_accuracy": 0.8644311785697937, - "num_tokens": 188509887.0, - "step": 156680 - }, - { - "entropy": 1.9186707973480224, - "epoch": 0.48572529040367834, - "grad_norm": 9.6982421875, - "learning_rate": 3.6299338191878623e-06, - "loss": 0.4884, - "mean_token_accuracy": 0.8487430840730668, - "num_tokens": 188520517.0, - "step": 156690 - }, - { - "entropy": 1.9093660950660705, - "epoch": 0.48575628952872807, - "grad_norm": 7.849754333496094, - "learning_rate": 3.6298179922925554e-06, - "loss": 0.4953, - "mean_token_accuracy": 0.8508066058158874, - "num_tokens": 188531905.0, - "step": 156700 - }, - { - "entropy": 1.8367055609822274, - "epoch": 0.48578728865377774, - "grad_norm": 7.576959609985352, - "learning_rate": 3.6297021764842377e-06, - "loss": 0.4464, - "mean_token_accuracy": 0.8587787270545959, - "num_tokens": 188543901.0, - "step": 156710 - }, - { - "entropy": 1.805798091739416, - "epoch": 0.48581828777882746, - "grad_norm": 8.443595886230469, - "learning_rate": 3.6295863717611423e-06, - "loss": 0.4334, - "mean_token_accuracy": 0.8576732441782952, - "num_tokens": 188556121.0, - "step": 156720 - }, - { - "entropy": 1.8692028522491455, - "epoch": 0.48584928690387713, - "grad_norm": 3.9699697494506836, - "learning_rate": 3.6294705781214996e-06, - "loss": 0.4154, - "mean_token_accuracy": 0.8564115181565285, - "num_tokens": 188567950.0, - "step": 156730 - }, - { - "entropy": 1.7551136761903763, - "epoch": 0.48588028602892686, - "grad_norm": 2.822622299194336, - "learning_rate": 3.629354795563543e-06, - "loss": 0.4069, - "mean_token_accuracy": 0.8498562499880791, - "num_tokens": 188581754.0, - "step": 156740 - }, - { - "entropy": 1.8290818929672241, - "epoch": 0.4859112851539765, - "grad_norm": 6.920858860015869, - "learning_rate": 3.6292390240855038e-06, - "loss": 0.4039, - "mean_token_accuracy": 0.8647098124027253, - "num_tokens": 188593324.0, - "step": 156750 - }, - { - "entropy": 1.851450626552105, - "epoch": 0.48594228427902625, - "grad_norm": 7.4507904052734375, - "learning_rate": 3.629123263685616e-06, - "loss": 0.472, - "mean_token_accuracy": 0.8521499082446098, - "num_tokens": 188604866.0, - "step": 156760 - }, - { - "entropy": 1.8714505270123483, - "epoch": 0.4859732834040759, - "grad_norm": 8.083022117614746, - "learning_rate": 3.629007514362112e-06, - "loss": 0.4285, - "mean_token_accuracy": 0.8554137036204338, - "num_tokens": 188617023.0, - "step": 156770 - }, - { - "entropy": 1.8139952212572097, - "epoch": 0.48600428252912564, - "grad_norm": 7.504046440124512, - "learning_rate": 3.628891776113226e-06, - "loss": 0.436, - "mean_token_accuracy": 0.8558562770485878, - "num_tokens": 188629125.0, - "step": 156780 - }, - { - "entropy": 1.7777962386608124, - "epoch": 0.4860352816541753, - "grad_norm": 7.869353294372559, - "learning_rate": 3.6287760489371926e-06, - "loss": 0.404, - "mean_token_accuracy": 0.8635659471154213, - "num_tokens": 188642471.0, - "step": 156790 - }, - { - "entropy": 1.893085741996765, - "epoch": 0.486066280779225, - "grad_norm": 6.712622165679932, - "learning_rate": 3.628660332832246e-06, - "loss": 0.4191, - "mean_token_accuracy": 0.8632133215665817, - "num_tokens": 188654009.0, - "step": 156800 - }, - { - "entropy": 1.846302431821823, - "epoch": 0.4860972799042747, - "grad_norm": 3.914888858795166, - "learning_rate": 3.628544627796621e-06, - "loss": 0.4908, - "mean_token_accuracy": 0.8565996348857879, - "num_tokens": 188666288.0, - "step": 156810 - }, - { - "entropy": 1.9017745479941368, - "epoch": 0.4861282790293244, - "grad_norm": 8.8294677734375, - "learning_rate": 3.628428933828553e-06, - "loss": 0.5231, - "mean_token_accuracy": 0.8494993954896927, - "num_tokens": 188677652.0, - "step": 156820 - }, - { - "entropy": 1.8907178774476052, - "epoch": 0.4861592781543741, - "grad_norm": 7.541580677032471, - "learning_rate": 3.6283132509262763e-06, - "loss": 0.498, - "mean_token_accuracy": 0.8456685811281204, - "num_tokens": 188688888.0, - "step": 156830 - }, - { - "entropy": 1.844075907766819, - "epoch": 0.48619027727942377, - "grad_norm": 4.210703372955322, - "learning_rate": 3.6281975790880292e-06, - "loss": 0.421, - "mean_token_accuracy": 0.857151634991169, - "num_tokens": 188700221.0, - "step": 156840 - }, - { - "entropy": 1.828725065290928, - "epoch": 0.4862212764044735, - "grad_norm": 8.287814140319824, - "learning_rate": 3.6280819183120477e-06, - "loss": 0.4319, - "mean_token_accuracy": 0.8587014749646187, - "num_tokens": 188712585.0, - "step": 156850 - }, - { - "entropy": 1.8757966220378877, - "epoch": 0.48625227552952316, - "grad_norm": 7.026792526245117, - "learning_rate": 3.6279662685965677e-06, - "loss": 0.4514, - "mean_token_accuracy": 0.8533299207687378, - "num_tokens": 188723844.0, - "step": 156860 - }, - { - "entropy": 1.818382728099823, - "epoch": 0.4862832746545729, - "grad_norm": 3.8120429515838623, - "learning_rate": 3.627850629939827e-06, - "loss": 0.4163, - "mean_token_accuracy": 0.8576777949929237, - "num_tokens": 188736591.0, - "step": 156870 - }, - { - "entropy": 1.8777550026774406, - "epoch": 0.48631427377962255, - "grad_norm": 7.84219217300415, - "learning_rate": 3.6277350023400635e-06, - "loss": 0.4651, - "mean_token_accuracy": 0.8507702320814132, - "num_tokens": 188748864.0, - "step": 156880 - }, - { - "entropy": 1.745957624912262, - "epoch": 0.4863452729046723, - "grad_norm": 3.141312599182129, - "learning_rate": 3.627619385795515e-06, - "loss": 0.3598, - "mean_token_accuracy": 0.8645585060119629, - "num_tokens": 188762327.0, - "step": 156890 - }, - { - "entropy": 1.8983976244926453, - "epoch": 0.48637627202972195, - "grad_norm": 9.90230655670166, - "learning_rate": 3.62750378030442e-06, - "loss": 0.4266, - "mean_token_accuracy": 0.8513541236519814, - "num_tokens": 188773931.0, - "step": 156900 - }, - { - "entropy": 1.7605302453041076, - "epoch": 0.48640727115477167, - "grad_norm": 6.981546401977539, - "learning_rate": 3.627388185865018e-06, - "loss": 0.3841, - "mean_token_accuracy": 0.8662245452404023, - "num_tokens": 188788065.0, - "step": 156910 - }, - { - "entropy": 1.7625210091471673, - "epoch": 0.48643827027982134, - "grad_norm": 2.2922286987304688, - "learning_rate": 3.6272726024755462e-06, - "loss": 0.3888, - "mean_token_accuracy": 0.8693696141242981, - "num_tokens": 188802039.0, - "step": 156920 - }, - { - "entropy": 1.8891747072339058, - "epoch": 0.48646926940487106, - "grad_norm": 4.775572776794434, - "learning_rate": 3.627157030134246e-06, - "loss": 0.4526, - "mean_token_accuracy": 0.8509297117590904, - "num_tokens": 188814295.0, - "step": 156930 - }, - { - "entropy": 1.763446855545044, - "epoch": 0.48650026852992073, - "grad_norm": 3.432339668273926, - "learning_rate": 3.627041468839358e-06, - "loss": 0.3982, - "mean_token_accuracy": 0.8640241846442223, - "num_tokens": 188827785.0, - "step": 156940 - }, - { - "entropy": 1.8804999768733979, - "epoch": 0.48653126765497046, - "grad_norm": 8.55935287475586, - "learning_rate": 3.626925918589121e-06, - "loss": 0.4666, - "mean_token_accuracy": 0.8539211377501488, - "num_tokens": 188840111.0, - "step": 156950 - }, - { - "entropy": 1.8551584213972092, - "epoch": 0.4865622667800201, - "grad_norm": 8.160202980041504, - "learning_rate": 3.6268103793817766e-06, - "loss": 0.472, - "mean_token_accuracy": 0.8460486233234406, - "num_tokens": 188852260.0, - "step": 156960 - }, - { - "entropy": 1.8928451895713807, - "epoch": 0.48659326590506985, - "grad_norm": 8.368380546569824, - "learning_rate": 3.6266948512155656e-06, - "loss": 0.4425, - "mean_token_accuracy": 0.8604690045118332, - "num_tokens": 188863606.0, - "step": 156970 - }, - { - "entropy": 1.817128673195839, - "epoch": 0.4866242650301195, - "grad_norm": 8.995201110839844, - "learning_rate": 3.6265793340887304e-06, - "loss": 0.4203, - "mean_token_accuracy": 0.860697340965271, - "num_tokens": 188876385.0, - "step": 156980 - }, - { - "entropy": 1.8426421731710434, - "epoch": 0.48665526415516924, - "grad_norm": 4.008335590362549, - "learning_rate": 3.626463827999513e-06, - "loss": 0.4162, - "mean_token_accuracy": 0.854708181321621, - "num_tokens": 188889014.0, - "step": 156990 - }, - { - "entropy": 1.8129928693175317, - "epoch": 0.4866862632802189, - "grad_norm": 4.042017936706543, - "learning_rate": 3.626348332946154e-06, - "loss": 0.3957, - "mean_token_accuracy": 0.8559390410780907, - "num_tokens": 188901539.0, - "step": 157000 - }, - { - "entropy": 1.8829196870326996, - "epoch": 0.48671726240526864, - "grad_norm": 9.24323844909668, - "learning_rate": 3.6262328489268982e-06, - "loss": 0.4865, - "mean_token_accuracy": 0.8524382159113884, - "num_tokens": 188913282.0, - "step": 157010 - }, - { - "entropy": 1.7663740821182727, - "epoch": 0.4867482615303183, - "grad_norm": 3.5652692317962646, - "learning_rate": 3.6261173759399875e-06, - "loss": 0.3192, - "mean_token_accuracy": 0.8695008844137192, - "num_tokens": 188926799.0, - "step": 157020 - }, - { - "entropy": 1.8988188937306405, - "epoch": 0.48677926065536803, - "grad_norm": 7.9664201736450195, - "learning_rate": 3.6260019139836667e-06, - "loss": 0.4219, - "mean_token_accuracy": 0.8581131368875503, - "num_tokens": 188938377.0, - "step": 157030 - }, - { - "entropy": 1.9046687543392182, - "epoch": 0.4868102597804177, - "grad_norm": 9.607617378234863, - "learning_rate": 3.6258864630561778e-06, - "loss": 0.5123, - "mean_token_accuracy": 0.8482811197638511, - "num_tokens": 188949656.0, - "step": 157040 - }, - { - "entropy": 1.8872241079807281, - "epoch": 0.48684125890546737, - "grad_norm": 8.410953521728516, - "learning_rate": 3.625771023155767e-06, - "loss": 0.4211, - "mean_token_accuracy": 0.8558803468942642, - "num_tokens": 188961681.0, - "step": 157050 - }, - { - "entropy": 1.8674045950174332, - "epoch": 0.4868722580305171, - "grad_norm": 8.313755989074707, - "learning_rate": 3.6256555942806783e-06, - "loss": 0.5321, - "mean_token_accuracy": 0.8421355858445168, - "num_tokens": 188973938.0, - "step": 157060 - }, - { - "entropy": 1.77282817363739, - "epoch": 0.48690325715556676, - "grad_norm": 7.947534561157227, - "learning_rate": 3.625540176429157e-06, - "loss": 0.3933, - "mean_token_accuracy": 0.856605575978756, - "num_tokens": 188987056.0, - "step": 157070 - }, - { - "entropy": 1.7749268785119057, - "epoch": 0.4869342562806165, - "grad_norm": 3.8374805450439453, - "learning_rate": 3.625424769599448e-06, - "loss": 0.4021, - "mean_token_accuracy": 0.8576187625527382, - "num_tokens": 189000210.0, - "step": 157080 - }, - { - "entropy": 1.8151443317532538, - "epoch": 0.48696525540566615, - "grad_norm": 8.406780242919922, - "learning_rate": 3.6253093737897984e-06, - "loss": 0.4074, - "mean_token_accuracy": 0.8566974952816964, - "num_tokens": 189013181.0, - "step": 157090 - }, - { - "entropy": 1.8080456346273421, - "epoch": 0.4869962545307159, - "grad_norm": 10.127240180969238, - "learning_rate": 3.6251939889984526e-06, - "loss": 0.4072, - "mean_token_accuracy": 0.8613529846072197, - "num_tokens": 189025238.0, - "step": 157100 - }, - { - "entropy": 1.9063942030072212, - "epoch": 0.48702725365576555, - "grad_norm": 8.239479064941406, - "learning_rate": 3.6250786152236593e-06, - "loss": 0.4802, - "mean_token_accuracy": 0.8503356859087944, - "num_tokens": 189036393.0, - "step": 157110 - }, - { - "entropy": 1.9125859394669533, - "epoch": 0.48705825278081527, - "grad_norm": 4.242093563079834, - "learning_rate": 3.624963252463665e-06, - "loss": 0.4902, - "mean_token_accuracy": 0.846269004046917, - "num_tokens": 189047901.0, - "step": 157120 - }, - { - "entropy": 1.9060107335448264, - "epoch": 0.48708925190586494, - "grad_norm": 5.090417385101318, - "learning_rate": 3.6248479007167166e-06, - "loss": 0.4821, - "mean_token_accuracy": 0.8466105028986931, - "num_tokens": 189059652.0, - "step": 157130 - }, - { - "entropy": 1.8610600799322128, - "epoch": 0.48712025103091466, - "grad_norm": 6.971331596374512, - "learning_rate": 3.6247325599810618e-06, - "loss": 0.3993, - "mean_token_accuracy": 0.8637379705905914, - "num_tokens": 189072096.0, - "step": 157140 - }, - { - "entropy": 1.8987021505832673, - "epoch": 0.48715125015596433, - "grad_norm": 3.920376777648926, - "learning_rate": 3.6246172302549497e-06, - "loss": 0.4326, - "mean_token_accuracy": 0.8554188758134842, - "num_tokens": 189083311.0, - "step": 157150 - }, - { - "entropy": 1.8860539883375167, - "epoch": 0.48718224928101406, - "grad_norm": 6.914085388183594, - "learning_rate": 3.624501911536628e-06, - "loss": 0.4479, - "mean_token_accuracy": 0.8497070074081421, - "num_tokens": 189095521.0, - "step": 157160 - }, - { - "entropy": 1.879089817404747, - "epoch": 0.4872132484060637, - "grad_norm": 8.246174812316895, - "learning_rate": 3.624386603824347e-06, - "loss": 0.4463, - "mean_token_accuracy": 0.8566564947366715, - "num_tokens": 189107507.0, - "step": 157170 - }, - { - "entropy": 1.8245720505714416, - "epoch": 0.48724424753111345, - "grad_norm": 7.060524940490723, - "learning_rate": 3.6242713071163548e-06, - "loss": 0.3965, - "mean_token_accuracy": 0.8609587267041207, - "num_tokens": 189119955.0, - "step": 157180 - }, - { - "entropy": 1.8623187392950058, - "epoch": 0.4872752466561631, - "grad_norm": 6.407861232757568, - "learning_rate": 3.6241560214109024e-06, - "loss": 0.4286, - "mean_token_accuracy": 0.8532092794775963, - "num_tokens": 189131790.0, - "step": 157190 - }, - { - "entropy": 1.8516870602965354, - "epoch": 0.48730624578121284, - "grad_norm": 6.912207126617432, - "learning_rate": 3.624040746706239e-06, - "loss": 0.3995, - "mean_token_accuracy": 0.8642320290207863, - "num_tokens": 189143723.0, - "step": 157200 - }, - { - "entropy": 1.9268372386693955, - "epoch": 0.4873372449062625, - "grad_norm": 10.234281539916992, - "learning_rate": 3.623925483000615e-06, - "loss": 0.5109, - "mean_token_accuracy": 0.8470379471778869, - "num_tokens": 189154544.0, - "step": 157210 - }, - { - "entropy": 1.8641832515597343, - "epoch": 0.48736824403131224, - "grad_norm": 8.448433876037598, - "learning_rate": 3.6238102302922827e-06, - "loss": 0.4503, - "mean_token_accuracy": 0.857720798254013, - "num_tokens": 189166226.0, - "step": 157220 - }, - { - "entropy": 1.9346581190824508, - "epoch": 0.4873992431563619, - "grad_norm": 8.481154441833496, - "learning_rate": 3.623694988579492e-06, - "loss": 0.4479, - "mean_token_accuracy": 0.852697542309761, - "num_tokens": 189177197.0, - "step": 157230 - }, - { - "entropy": 1.8736652597784995, - "epoch": 0.48743024228141163, - "grad_norm": 8.590533256530762, - "learning_rate": 3.6235797578604958e-06, - "loss": 0.4529, - "mean_token_accuracy": 0.8409170970320702, - "num_tokens": 189189425.0, - "step": 157240 - }, - { - "entropy": 1.8822925239801407, - "epoch": 0.4874612414064613, - "grad_norm": 8.005229949951172, - "learning_rate": 3.6234645381335458e-06, - "loss": 0.4516, - "mean_token_accuracy": 0.8517946019768715, - "num_tokens": 189200841.0, - "step": 157250 - }, - { - "entropy": 1.868522433936596, - "epoch": 0.487492240531511, - "grad_norm": 7.923999786376953, - "learning_rate": 3.6233493293968936e-06, - "loss": 0.4733, - "mean_token_accuracy": 0.851598097383976, - "num_tokens": 189213006.0, - "step": 157260 - }, - { - "entropy": 1.8525910183787346, - "epoch": 0.4875232396565607, - "grad_norm": 7.764453887939453, - "learning_rate": 3.623234131648794e-06, - "loss": 0.4798, - "mean_token_accuracy": 0.8464570224285126, - "num_tokens": 189225217.0, - "step": 157270 - }, - { - "entropy": 1.8424304701387881, - "epoch": 0.4875542387816104, - "grad_norm": 9.055932998657227, - "learning_rate": 3.6231189448874993e-06, - "loss": 0.4236, - "mean_token_accuracy": 0.8600788041949272, - "num_tokens": 189237890.0, - "step": 157280 - }, - { - "entropy": 1.8687356740236283, - "epoch": 0.4875852379066601, - "grad_norm": 7.5734663009643555, - "learning_rate": 3.6230037691112623e-06, - "loss": 0.4453, - "mean_token_accuracy": 0.8601713046431542, - "num_tokens": 189248774.0, - "step": 157290 - }, - { - "entropy": 1.887175776064396, - "epoch": 0.48761623703170975, - "grad_norm": 8.036844253540039, - "learning_rate": 3.6228886043183385e-06, - "loss": 0.4505, - "mean_token_accuracy": 0.8500497087836265, - "num_tokens": 189260259.0, - "step": 157300 - }, - { - "entropy": 1.8767434313893319, - "epoch": 0.4876472361567595, - "grad_norm": 9.874613761901855, - "learning_rate": 3.622773450506982e-06, - "loss": 0.4308, - "mean_token_accuracy": 0.8547291472554207, - "num_tokens": 189271853.0, - "step": 157310 - }, - { - "entropy": 1.8897757351398468, - "epoch": 0.48767823528180915, - "grad_norm": 8.708046913146973, - "learning_rate": 3.622658307675448e-06, - "loss": 0.4245, - "mean_token_accuracy": 0.8521436333656311, - "num_tokens": 189284384.0, - "step": 157320 - }, - { - "entropy": 1.7800422132015228, - "epoch": 0.48770923440685887, - "grad_norm": 3.693558931350708, - "learning_rate": 3.6225431758219905e-06, - "loss": 0.3894, - "mean_token_accuracy": 0.8698929205536843, - "num_tokens": 189297579.0, - "step": 157330 - }, - { - "entropy": 1.9262080565094948, - "epoch": 0.48774023353190854, - "grad_norm": 3.791533946990967, - "learning_rate": 3.6224280549448654e-06, - "loss": 0.4635, - "mean_token_accuracy": 0.8444644317030907, - "num_tokens": 189309341.0, - "step": 157340 - }, - { - "entropy": 1.8786733433604241, - "epoch": 0.48777123265695826, - "grad_norm": 7.771004676818848, - "learning_rate": 3.62231294504233e-06, - "loss": 0.4428, - "mean_token_accuracy": 0.8481441915035248, - "num_tokens": 189321372.0, - "step": 157350 - }, - { - "entropy": 1.9163564920425415, - "epoch": 0.48780223178200793, - "grad_norm": 14.17982006072998, - "learning_rate": 3.62219784611264e-06, - "loss": 0.4775, - "mean_token_accuracy": 0.854206183552742, - "num_tokens": 189332771.0, - "step": 157360 - }, - { - "entropy": 1.841642838716507, - "epoch": 0.48783323090705766, - "grad_norm": 9.067546844482422, - "learning_rate": 3.6220827581540524e-06, - "loss": 0.4249, - "mean_token_accuracy": 0.8519814401865006, - "num_tokens": 189345612.0, - "step": 157370 - }, - { - "entropy": 1.9254460856318474, - "epoch": 0.4878642300321073, - "grad_norm": 10.410828590393066, - "learning_rate": 3.6219676811648235e-06, - "loss": 0.4677, - "mean_token_accuracy": 0.8462668195366859, - "num_tokens": 189356913.0, - "step": 157380 - }, - { - "entropy": 1.8930987566709518, - "epoch": 0.48789522915715705, - "grad_norm": 4.706137657165527, - "learning_rate": 3.6218526151432123e-06, - "loss": 0.442, - "mean_token_accuracy": 0.8493619441986084, - "num_tokens": 189368660.0, - "step": 157390 - }, - { - "entropy": 1.86765104085207, - "epoch": 0.4879262282822067, - "grad_norm": 2.7685275077819824, - "learning_rate": 3.621737560087475e-06, - "loss": 0.4423, - "mean_token_accuracy": 0.8525311022996902, - "num_tokens": 189382038.0, - "step": 157400 - }, - { - "entropy": 1.844710259139538, - "epoch": 0.48795722740725644, - "grad_norm": 2.638852119445801, - "learning_rate": 3.6216225159958713e-06, - "loss": 0.4053, - "mean_token_accuracy": 0.8614204362034797, - "num_tokens": 189394683.0, - "step": 157410 - }, - { - "entropy": 1.8290350809693336, - "epoch": 0.4879882265323061, - "grad_norm": 4.062201976776123, - "learning_rate": 3.6215074828666598e-06, - "loss": 0.4031, - "mean_token_accuracy": 0.8629499360918998, - "num_tokens": 189407590.0, - "step": 157420 - }, - { - "entropy": 1.80876744389534, - "epoch": 0.48801922565735584, - "grad_norm": 7.40885066986084, - "learning_rate": 3.6213924606980995e-06, - "loss": 0.4373, - "mean_token_accuracy": 0.8544455721974373, - "num_tokens": 189420366.0, - "step": 157430 - }, - { - "entropy": 1.8963041082024574, - "epoch": 0.4880502247824055, - "grad_norm": 8.370992660522461, - "learning_rate": 3.6212774494884494e-06, - "loss": 0.4493, - "mean_token_accuracy": 0.8585180237889289, - "num_tokens": 189431992.0, - "step": 157440 - }, - { - "entropy": 1.8512851744890213, - "epoch": 0.48808122390745523, - "grad_norm": 6.397646427154541, - "learning_rate": 3.6211624492359696e-06, - "loss": 0.4204, - "mean_token_accuracy": 0.8509098574519157, - "num_tokens": 189444932.0, - "step": 157450 - }, - { - "entropy": 1.8019349545240402, - "epoch": 0.4881122230325049, - "grad_norm": 3.6263394355773926, - "learning_rate": 3.6210474599389212e-06, - "loss": 0.3726, - "mean_token_accuracy": 0.8706066176295281, - "num_tokens": 189458119.0, - "step": 157460 - }, - { - "entropy": 1.9306983739137649, - "epoch": 0.4881432221575546, - "grad_norm": 7.022060394287109, - "learning_rate": 3.6209324815955636e-06, - "loss": 0.4699, - "mean_token_accuracy": 0.8524512380361557, - "num_tokens": 189469847.0, - "step": 157470 - }, - { - "entropy": 1.8647955879569054, - "epoch": 0.4881742212826043, - "grad_norm": 7.962439060211182, - "learning_rate": 3.620817514204159e-06, - "loss": 0.4444, - "mean_token_accuracy": 0.8580728963017463, - "num_tokens": 189482607.0, - "step": 157480 - }, - { - "entropy": 1.8655321180820466, - "epoch": 0.488205220407654, - "grad_norm": 6.718905448913574, - "learning_rate": 3.6207025577629685e-06, - "loss": 0.409, - "mean_token_accuracy": 0.8578371599316597, - "num_tokens": 189494789.0, - "step": 157490 - }, - { - "entropy": 1.782193235307932, - "epoch": 0.4882362195327037, - "grad_norm": 8.388843536376953, - "learning_rate": 3.620587612270253e-06, - "loss": 0.382, - "mean_token_accuracy": 0.867868272960186, - "num_tokens": 189508905.0, - "step": 157500 - }, - { - "entropy": 1.924613358080387, - "epoch": 0.4882672186577534, - "grad_norm": 7.749386787414551, - "learning_rate": 3.6204726777242765e-06, - "loss": 0.4814, - "mean_token_accuracy": 0.8526110947132111, - "num_tokens": 189520596.0, - "step": 157510 - }, - { - "entropy": 1.8958618491888046, - "epoch": 0.4882982177828031, - "grad_norm": 8.092024803161621, - "learning_rate": 3.6203577541233004e-06, - "loss": 0.4353, - "mean_token_accuracy": 0.8603635936975479, - "num_tokens": 189532260.0, - "step": 157520 - }, - { - "entropy": 1.9047956779599189, - "epoch": 0.4883292169078528, - "grad_norm": 4.471070766448975, - "learning_rate": 3.6202428414655877e-06, - "loss": 0.461, - "mean_token_accuracy": 0.842148295044899, - "num_tokens": 189544420.0, - "step": 157530 - }, - { - "entropy": 1.934245301783085, - "epoch": 0.48836021603290247, - "grad_norm": 10.193523406982422, - "learning_rate": 3.6201279397494023e-06, - "loss": 0.4829, - "mean_token_accuracy": 0.8412486299872398, - "num_tokens": 189555444.0, - "step": 157540 - }, - { - "entropy": 1.8730113923549652, - "epoch": 0.48839121515795214, - "grad_norm": 4.940889358520508, - "learning_rate": 3.6200130489730075e-06, - "loss": 0.4044, - "mean_token_accuracy": 0.859997783601284, - "num_tokens": 189568007.0, - "step": 157550 - }, - { - "entropy": 1.8695228844881058, - "epoch": 0.48842221428300187, - "grad_norm": 7.891258239746094, - "learning_rate": 3.6198981691346667e-06, - "loss": 0.4345, - "mean_token_accuracy": 0.8547190606594086, - "num_tokens": 189580842.0, - "step": 157560 - }, - { - "entropy": 1.9357234045863152, - "epoch": 0.48845321340805153, - "grad_norm": 8.230269432067871, - "learning_rate": 3.619783300232646e-06, - "loss": 0.4894, - "mean_token_accuracy": 0.8404277488589287, - "num_tokens": 189592624.0, - "step": 157570 - }, - { - "entropy": 1.9686369627714158, - "epoch": 0.48848421253310126, - "grad_norm": 8.409232139587402, - "learning_rate": 3.6196684422652106e-06, - "loss": 0.4816, - "mean_token_accuracy": 0.8537262305617332, - "num_tokens": 189603599.0, - "step": 157580 - }, - { - "entropy": 1.8572678551077844, - "epoch": 0.4885152116581509, - "grad_norm": 8.065033912658691, - "learning_rate": 3.6195535952306233e-06, - "loss": 0.4137, - "mean_token_accuracy": 0.8580617591738701, - "num_tokens": 189616217.0, - "step": 157590 - }, - { - "entropy": 1.854029330611229, - "epoch": 0.48854621078320065, - "grad_norm": 8.96259593963623, - "learning_rate": 3.6194387591271525e-06, - "loss": 0.4054, - "mean_token_accuracy": 0.8569090217351913, - "num_tokens": 189628302.0, - "step": 157600 - }, - { - "entropy": 1.836168546974659, - "epoch": 0.4885772099082503, - "grad_norm": 8.88343334197998, - "learning_rate": 3.619323933953063e-06, - "loss": 0.4175, - "mean_token_accuracy": 0.8561067417263984, - "num_tokens": 189640866.0, - "step": 157610 - }, - { - "entropy": 1.8437625631690024, - "epoch": 0.48860820903330004, - "grad_norm": 7.693849086761475, - "learning_rate": 3.6192091197066205e-06, - "loss": 0.4329, - "mean_token_accuracy": 0.8549635618925094, - "num_tokens": 189653230.0, - "step": 157620 - }, - { - "entropy": 1.971674808859825, - "epoch": 0.4886392081583497, - "grad_norm": 8.87653923034668, - "learning_rate": 3.619094316386093e-06, - "loss": 0.4898, - "mean_token_accuracy": 0.8423131659626961, - "num_tokens": 189663808.0, - "step": 157630 - }, - { - "entropy": 1.9198501527309417, - "epoch": 0.48867020728339944, - "grad_norm": 8.140296936035156, - "learning_rate": 3.6189795239897478e-06, - "loss": 0.4452, - "mean_token_accuracy": 0.8481401056051254, - "num_tokens": 189675324.0, - "step": 157640 - }, - { - "entropy": 1.93048115670681, - "epoch": 0.4887012064084491, - "grad_norm": 6.999414443969727, - "learning_rate": 3.618864742515852e-06, - "loss": 0.4691, - "mean_token_accuracy": 0.8548283234238625, - "num_tokens": 189686430.0, - "step": 157650 - }, - { - "entropy": 1.856081487238407, - "epoch": 0.48873220553349883, - "grad_norm": 9.344239234924316, - "learning_rate": 3.6187499719626744e-06, - "loss": 0.3988, - "mean_token_accuracy": 0.8584906741976738, - "num_tokens": 189698360.0, - "step": 157660 - }, - { - "entropy": 1.8952415823936462, - "epoch": 0.4887632046585485, - "grad_norm": 7.345611572265625, - "learning_rate": 3.6186352123284817e-06, - "loss": 0.4343, - "mean_token_accuracy": 0.8519857943058013, - "num_tokens": 189710084.0, - "step": 157670 - }, - { - "entropy": 1.7388610973954202, - "epoch": 0.4887942037835982, - "grad_norm": 3.976619243621826, - "learning_rate": 3.6185204636115446e-06, - "loss": 0.3789, - "mean_token_accuracy": 0.8673770546913147, - "num_tokens": 189724014.0, - "step": 157680 - }, - { - "entropy": 1.8221586227416993, - "epoch": 0.4888252029086479, - "grad_norm": 8.260506629943848, - "learning_rate": 3.6184057258101305e-06, - "loss": 0.3949, - "mean_token_accuracy": 0.8662657171487809, - "num_tokens": 189736367.0, - "step": 157690 - }, - { - "entropy": 1.8475705727934837, - "epoch": 0.4888562020336976, - "grad_norm": 7.417000770568848, - "learning_rate": 3.6182909989225107e-06, - "loss": 0.3849, - "mean_token_accuracy": 0.8679051205515862, - "num_tokens": 189748764.0, - "step": 157700 - }, - { - "entropy": 2.0135974794626237, - "epoch": 0.4888872011587473, - "grad_norm": 8.551652908325195, - "learning_rate": 3.618176282946954e-06, - "loss": 0.5405, - "mean_token_accuracy": 0.8359439983963967, - "num_tokens": 189759508.0, - "step": 157710 - }, - { - "entropy": 1.8791854158043861, - "epoch": 0.488918200283797, - "grad_norm": 8.100190162658691, - "learning_rate": 3.6180615778817304e-06, - "loss": 0.4158, - "mean_token_accuracy": 0.8628863275051117, - "num_tokens": 189771238.0, - "step": 157720 - }, - { - "entropy": 1.8653140723705293, - "epoch": 0.4889491994088467, - "grad_norm": 8.358512878417969, - "learning_rate": 3.6179468837251115e-06, - "loss": 0.4191, - "mean_token_accuracy": 0.8619488105177879, - "num_tokens": 189783364.0, - "step": 157730 - }, - { - "entropy": 1.8888757541775703, - "epoch": 0.4889801985338964, - "grad_norm": 3.8749780654907227, - "learning_rate": 3.617832200475368e-06, - "loss": 0.4373, - "mean_token_accuracy": 0.8521002560853959, - "num_tokens": 189795088.0, - "step": 157740 - }, - { - "entropy": 1.8624941438436509, - "epoch": 0.4890111976589461, - "grad_norm": 8.596406936645508, - "learning_rate": 3.6177175281307715e-06, - "loss": 0.4271, - "mean_token_accuracy": 0.8576769649982452, - "num_tokens": 189807667.0, - "step": 157750 - }, - { - "entropy": 1.950662373006344, - "epoch": 0.4890421967839958, - "grad_norm": 7.65726375579834, - "learning_rate": 3.6176028666895936e-06, - "loss": 0.4938, - "mean_token_accuracy": 0.8424899771809577, - "num_tokens": 189819412.0, - "step": 157760 - }, - { - "entropy": 1.9022864386439324, - "epoch": 0.48907319590904547, - "grad_norm": 8.7039794921875, - "learning_rate": 3.6174882161501066e-06, - "loss": 0.4664, - "mean_token_accuracy": 0.8537297427654267, - "num_tokens": 189830506.0, - "step": 157770 - }, - { - "entropy": 1.892046569287777, - "epoch": 0.48910419503409513, - "grad_norm": 3.2481043338775635, - "learning_rate": 3.6173735765105827e-06, - "loss": 0.4376, - "mean_token_accuracy": 0.8614903211593627, - "num_tokens": 189841759.0, - "step": 157780 - }, - { - "entropy": 1.9165176048874855, - "epoch": 0.48913519415914486, - "grad_norm": 5.342856407165527, - "learning_rate": 3.6172589477692956e-06, - "loss": 0.469, - "mean_token_accuracy": 0.8441179916262627, - "num_tokens": 189853777.0, - "step": 157790 - }, - { - "entropy": 1.8486308708786965, - "epoch": 0.48916619328419453, - "grad_norm": 7.395596504211426, - "learning_rate": 3.6171443299245185e-06, - "loss": 0.4164, - "mean_token_accuracy": 0.8635935798287392, - "num_tokens": 189865936.0, - "step": 157800 - }, - { - "entropy": 1.8705967336893081, - "epoch": 0.48919719240924425, - "grad_norm": 9.73678207397461, - "learning_rate": 3.617029722974525e-06, - "loss": 0.4225, - "mean_token_accuracy": 0.8559140339493752, - "num_tokens": 189878261.0, - "step": 157810 - }, - { - "entropy": 1.8054387748241425, - "epoch": 0.4892281915342939, - "grad_norm": 2.276423931121826, - "learning_rate": 3.6169151269175887e-06, - "loss": 0.4847, - "mean_token_accuracy": 0.847235806286335, - "num_tokens": 189890974.0, - "step": 157820 - }, - { - "entropy": 1.8155124217271805, - "epoch": 0.48925919065934365, - "grad_norm": 8.386927604675293, - "learning_rate": 3.6168005417519857e-06, - "loss": 0.4134, - "mean_token_accuracy": 0.8562206014990806, - "num_tokens": 189903505.0, - "step": 157830 - }, - { - "entropy": 1.8468881100416183, - "epoch": 0.4892901897843933, - "grad_norm": 8.004000663757324, - "learning_rate": 3.616685967475989e-06, - "loss": 0.3927, - "mean_token_accuracy": 0.8538503974676133, - "num_tokens": 189916014.0, - "step": 157840 - }, - { - "entropy": 1.8874759256839753, - "epoch": 0.48932118890944304, - "grad_norm": 8.225050926208496, - "learning_rate": 3.6165714040878753e-06, - "loss": 0.4722, - "mean_token_accuracy": 0.8494992300868034, - "num_tokens": 189927102.0, - "step": 157850 - }, - { - "entropy": 1.9252758368849754, - "epoch": 0.4893521880344927, - "grad_norm": 9.809006690979004, - "learning_rate": 3.61645685158592e-06, - "loss": 0.4832, - "mean_token_accuracy": 0.8438394486904144, - "num_tokens": 189938653.0, - "step": 157860 - }, - { - "entropy": 1.86532269269228, - "epoch": 0.48938318715954243, - "grad_norm": 7.953715801239014, - "learning_rate": 3.616342309968398e-06, - "loss": 0.4421, - "mean_token_accuracy": 0.8573437079787254, - "num_tokens": 189950573.0, - "step": 157870 - }, - { - "entropy": 1.8941395804286003, - "epoch": 0.4894141862845921, - "grad_norm": 8.05754566192627, - "learning_rate": 3.616227779233587e-06, - "loss": 0.433, - "mean_token_accuracy": 0.8579764619469643, - "num_tokens": 189962373.0, - "step": 157880 - }, - { - "entropy": 1.8559586256742477, - "epoch": 0.4894451854096418, - "grad_norm": 7.713656902313232, - "learning_rate": 3.6161132593797637e-06, - "loss": 0.4283, - "mean_token_accuracy": 0.852620604634285, - "num_tokens": 189975314.0, - "step": 157890 - }, - { - "entropy": 1.875124379992485, - "epoch": 0.4894761845346915, - "grad_norm": 8.812604904174805, - "learning_rate": 3.615998750405205e-06, - "loss": 0.448, - "mean_token_accuracy": 0.8633681252598763, - "num_tokens": 189987539.0, - "step": 157900 - }, - { - "entropy": 1.8811095029115676, - "epoch": 0.4895071836597412, - "grad_norm": 10.135832786560059, - "learning_rate": 3.6158842523081883e-06, - "loss": 0.4323, - "mean_token_accuracy": 0.8540236935019493, - "num_tokens": 189999109.0, - "step": 157910 - }, - { - "entropy": 1.8668017193675042, - "epoch": 0.4895381827847909, - "grad_norm": 8.14980411529541, - "learning_rate": 3.6157697650869916e-06, - "loss": 0.4357, - "mean_token_accuracy": 0.8578397080302238, - "num_tokens": 190011487.0, - "step": 157920 - }, - { - "entropy": 1.847634233534336, - "epoch": 0.4895691819098406, - "grad_norm": 7.752749919891357, - "learning_rate": 3.6156552887398934e-06, - "loss": 0.4059, - "mean_token_accuracy": 0.8659601092338562, - "num_tokens": 190023692.0, - "step": 157930 - }, - { - "entropy": 1.9245047122240067, - "epoch": 0.4896001810348903, - "grad_norm": 8.915438652038574, - "learning_rate": 3.615540823265173e-06, - "loss": 0.4712, - "mean_token_accuracy": 0.8451450273394585, - "num_tokens": 190035059.0, - "step": 157940 - }, - { - "entropy": 1.9289063662290573, - "epoch": 0.48963118015994, - "grad_norm": 8.771639823913574, - "learning_rate": 3.615426368661109e-06, - "loss": 0.4944, - "mean_token_accuracy": 0.8454332157969475, - "num_tokens": 190046188.0, - "step": 157950 - }, - { - "entropy": 1.8621390245854854, - "epoch": 0.4896621792849897, - "grad_norm": 8.388298034667969, - "learning_rate": 3.61531192492598e-06, - "loss": 0.4724, - "mean_token_accuracy": 0.8460786879062653, - "num_tokens": 190058636.0, - "step": 157960 - }, - { - "entropy": 1.9229322642087936, - "epoch": 0.4896931784100394, - "grad_norm": 9.573465347290039, - "learning_rate": 3.6151974920580674e-06, - "loss": 0.4714, - "mean_token_accuracy": 0.8481347262859344, - "num_tokens": 190069764.0, - "step": 157970 - }, - { - "entropy": 1.7791677549481393, - "epoch": 0.48972417753508907, - "grad_norm": 3.663013458251953, - "learning_rate": 3.6150830700556498e-06, - "loss": 0.3645, - "mean_token_accuracy": 0.874565900862217, - "num_tokens": 190082703.0, - "step": 157980 - }, - { - "entropy": 1.8474309265613555, - "epoch": 0.4897551766601388, - "grad_norm": 4.272090435028076, - "learning_rate": 3.61496865891701e-06, - "loss": 0.4352, - "mean_token_accuracy": 0.8537533566355705, - "num_tokens": 190095209.0, - "step": 157990 - }, - { - "entropy": 1.8715117886662482, - "epoch": 0.48978617578518846, - "grad_norm": 9.850628852844238, - "learning_rate": 3.6148542586404274e-06, - "loss": 0.4476, - "mean_token_accuracy": 0.8550691947340965, - "num_tokens": 190107364.0, - "step": 158000 - }, - { - "entropy": 1.8813581451773644, - "epoch": 0.4898171749102382, - "grad_norm": 8.418951988220215, - "learning_rate": 3.614739869224183e-06, - "loss": 0.4654, - "mean_token_accuracy": 0.8511038661003113, - "num_tokens": 190118495.0, - "step": 158010 - }, - { - "entropy": 1.8656797528266906, - "epoch": 0.48984817403528785, - "grad_norm": 8.324191093444824, - "learning_rate": 3.6146254906665607e-06, - "loss": 0.4431, - "mean_token_accuracy": 0.8558537855744361, - "num_tokens": 190130949.0, - "step": 158020 - }, - { - "entropy": 1.7613300889730454, - "epoch": 0.4898791731603375, - "grad_norm": 8.842034339904785, - "learning_rate": 3.61451112296584e-06, - "loss": 0.3847, - "mean_token_accuracy": 0.8550317272543907, - "num_tokens": 190144475.0, - "step": 158030 - }, - { - "entropy": 1.9174160197377206, - "epoch": 0.48991017228538725, - "grad_norm": 9.014627456665039, - "learning_rate": 3.614396766120305e-06, - "loss": 0.4402, - "mean_token_accuracy": 0.8600229039788246, - "num_tokens": 190155849.0, - "step": 158040 - }, - { - "entropy": 1.884463222324848, - "epoch": 0.4899411714104369, - "grad_norm": 9.524755477905273, - "learning_rate": 3.614282420128239e-06, - "loss": 0.5113, - "mean_token_accuracy": 0.838143216073513, - "num_tokens": 190166700.0, - "step": 158050 - }, - { - "entropy": 1.7631948590278625, - "epoch": 0.48997217053548664, - "grad_norm": 2.52717661857605, - "learning_rate": 3.614168084987924e-06, - "loss": 0.3626, - "mean_token_accuracy": 0.8702189907431602, - "num_tokens": 190180515.0, - "step": 158060 - }, - { - "entropy": 1.871630634367466, - "epoch": 0.4900031696605363, - "grad_norm": 7.53773832321167, - "learning_rate": 3.6140537606976445e-06, - "loss": 0.4348, - "mean_token_accuracy": 0.8584669470787049, - "num_tokens": 190192676.0, - "step": 158070 - }, - { - "entropy": 1.9180895671248437, - "epoch": 0.49003416878558603, - "grad_norm": 8.001392364501953, - "learning_rate": 3.613939447255685e-06, - "loss": 0.4626, - "mean_token_accuracy": 0.8476955905556679, - "num_tokens": 190203722.0, - "step": 158080 - }, - { - "entropy": 1.8444015264511109, - "epoch": 0.4900651679106357, - "grad_norm": 8.631937980651855, - "learning_rate": 3.6138251446603284e-06, - "loss": 0.4391, - "mean_token_accuracy": 0.8547645956277847, - "num_tokens": 190216666.0, - "step": 158090 - }, - { - "entropy": 1.8547025889158248, - "epoch": 0.4900961670356854, - "grad_norm": 8.481095314025879, - "learning_rate": 3.61371085290986e-06, - "loss": 0.4479, - "mean_token_accuracy": 0.8580074161291122, - "num_tokens": 190228229.0, - "step": 158100 - }, - { - "entropy": 1.889371046423912, - "epoch": 0.4901271661607351, - "grad_norm": 7.122347831726074, - "learning_rate": 3.613596572002567e-06, - "loss": 0.4802, - "mean_token_accuracy": 0.854512770473957, - "num_tokens": 190239419.0, - "step": 158110 - }, - { - "entropy": 1.7553760528564453, - "epoch": 0.4901581652857848, - "grad_norm": 8.698734283447266, - "learning_rate": 3.6134823019367315e-06, - "loss": 0.4142, - "mean_token_accuracy": 0.864093753695488, - "num_tokens": 190252901.0, - "step": 158120 - }, - { - "entropy": 1.912713533639908, - "epoch": 0.4901891644108345, - "grad_norm": 12.8290376663208, - "learning_rate": 3.613368042710643e-06, - "loss": 0.4929, - "mean_token_accuracy": 0.8492612764239311, - "num_tokens": 190264212.0, - "step": 158130 - }, - { - "entropy": 1.8984272867441176, - "epoch": 0.4902201635358842, - "grad_norm": 8.531185150146484, - "learning_rate": 3.6132537943225854e-06, - "loss": 0.4689, - "mean_token_accuracy": 0.8525654658675194, - "num_tokens": 190275101.0, - "step": 158140 - }, - { - "entropy": 1.875417274236679, - "epoch": 0.4902511626609339, - "grad_norm": 7.956252574920654, - "learning_rate": 3.6131395567708462e-06, - "loss": 0.4432, - "mean_token_accuracy": 0.8634052559733391, - "num_tokens": 190286182.0, - "step": 158150 - }, - { - "entropy": 1.8667438074946403, - "epoch": 0.4902821617859836, - "grad_norm": 8.726470947265625, - "learning_rate": 3.6130253300537122e-06, - "loss": 0.4458, - "mean_token_accuracy": 0.853116650879383, - "num_tokens": 190298040.0, - "step": 158160 - }, - { - "entropy": 1.8496040746569633, - "epoch": 0.4903131609110333, - "grad_norm": 9.257894515991211, - "learning_rate": 3.612911114169471e-06, - "loss": 0.4438, - "mean_token_accuracy": 0.8475693762302399, - "num_tokens": 190310368.0, - "step": 158170 - }, - { - "entropy": 1.726231387257576, - "epoch": 0.490344160036083, - "grad_norm": 8.153938293457031, - "learning_rate": 3.612796909116411e-06, - "loss": 0.3491, - "mean_token_accuracy": 0.861776416003704, - "num_tokens": 190324421.0, - "step": 158180 - }, - { - "entropy": 1.91248170286417, - "epoch": 0.49037515916113267, - "grad_norm": 8.336747169494629, - "learning_rate": 3.61268271489282e-06, - "loss": 0.4851, - "mean_token_accuracy": 0.8475732728838921, - "num_tokens": 190335616.0, - "step": 158190 - }, - { - "entropy": 1.8628337591886521, - "epoch": 0.4904061582861824, - "grad_norm": 3.811713933944702, - "learning_rate": 3.6125685314969867e-06, - "loss": 0.4735, - "mean_token_accuracy": 0.8399553775787354, - "num_tokens": 190348026.0, - "step": 158200 - }, - { - "entropy": 1.8320413410663605, - "epoch": 0.49043715741123206, - "grad_norm": 8.665163040161133, - "learning_rate": 3.6124543589272e-06, - "loss": 0.4245, - "mean_token_accuracy": 0.8562450557947159, - "num_tokens": 190360694.0, - "step": 158210 - }, - { - "entropy": 1.8738995507359504, - "epoch": 0.4904681565362818, - "grad_norm": 7.839025497436523, - "learning_rate": 3.6123401971817484e-06, - "loss": 0.4328, - "mean_token_accuracy": 0.8596608370542527, - "num_tokens": 190372445.0, - "step": 158220 - }, - { - "entropy": 1.9184532716870308, - "epoch": 0.49049915566133145, - "grad_norm": 7.668222904205322, - "learning_rate": 3.6122260462589237e-06, - "loss": 0.4548, - "mean_token_accuracy": 0.8530950948596001, - "num_tokens": 190383789.0, - "step": 158230 - }, - { - "entropy": 1.9229084342718124, - "epoch": 0.4905301547863812, - "grad_norm": 7.734839916229248, - "learning_rate": 3.6121119061570144e-06, - "loss": 0.4684, - "mean_token_accuracy": 0.8534473240375519, - "num_tokens": 190395480.0, - "step": 158240 - }, - { - "entropy": 1.8685783818364143, - "epoch": 0.49056115391143085, - "grad_norm": 3.483649253845215, - "learning_rate": 3.6119977768743107e-06, - "loss": 0.4063, - "mean_token_accuracy": 0.8567217275500297, - "num_tokens": 190407662.0, - "step": 158250 - }, - { - "entropy": 1.8338368773460387, - "epoch": 0.49059215303648057, - "grad_norm": 8.351194381713867, - "learning_rate": 3.611883658409105e-06, - "loss": 0.4413, - "mean_token_accuracy": 0.8560110285878182, - "num_tokens": 190419921.0, - "step": 158260 - }, - { - "entropy": 1.948527753353119, - "epoch": 0.49062315216153024, - "grad_norm": 9.00690746307373, - "learning_rate": 3.6117695507596867e-06, - "loss": 0.5085, - "mean_token_accuracy": 0.8417193755507469, - "num_tokens": 190430689.0, - "step": 158270 - }, - { - "entropy": 1.8747548520565034, - "epoch": 0.4906541512865799, - "grad_norm": 9.07733154296875, - "learning_rate": 3.61165545392435e-06, - "loss": 0.4375, - "mean_token_accuracy": 0.8512477323412895, - "num_tokens": 190443343.0, - "step": 158280 - }, - { - "entropy": 1.8400466367602348, - "epoch": 0.49068515041162963, - "grad_norm": 4.46486759185791, - "learning_rate": 3.6115413679013844e-06, - "loss": 0.4234, - "mean_token_accuracy": 0.8513605386018753, - "num_tokens": 190456514.0, - "step": 158290 - }, - { - "entropy": 1.8381875991821288, - "epoch": 0.4907161495366793, - "grad_norm": 9.451220512390137, - "learning_rate": 3.611427292689083e-06, - "loss": 0.4521, - "mean_token_accuracy": 0.8568148076534271, - "num_tokens": 190468151.0, - "step": 158300 - }, - { - "entropy": 1.8779310151934623, - "epoch": 0.490747148661729, - "grad_norm": 8.845046043395996, - "learning_rate": 3.6113132282857395e-06, - "loss": 0.443, - "mean_token_accuracy": 0.8532233744859695, - "num_tokens": 190479605.0, - "step": 158310 - }, - { - "entropy": 1.8384259521961213, - "epoch": 0.4907781477867787, - "grad_norm": 4.648056507110596, - "learning_rate": 3.6111991746896458e-06, - "loss": 0.4536, - "mean_token_accuracy": 0.8502117037773133, - "num_tokens": 190491381.0, - "step": 158320 - }, - { - "entropy": 1.8655261859297751, - "epoch": 0.4908091469118284, - "grad_norm": 6.474485397338867, - "learning_rate": 3.6110851318990962e-06, - "loss": 0.452, - "mean_token_accuracy": 0.8507389679551125, - "num_tokens": 190503415.0, - "step": 158330 - }, - { - "entropy": 1.8652947336435317, - "epoch": 0.4908401460368781, - "grad_norm": 7.278781890869141, - "learning_rate": 3.610971099912384e-06, - "loss": 0.4171, - "mean_token_accuracy": 0.8620616480708122, - "num_tokens": 190514775.0, - "step": 158340 - }, - { - "entropy": 1.745438788831234, - "epoch": 0.4908711451619278, - "grad_norm": 7.849375247955322, - "learning_rate": 3.6108570787278046e-06, - "loss": 0.397, - "mean_token_accuracy": 0.8658480405807495, - "num_tokens": 190527699.0, - "step": 158350 - }, - { - "entropy": 1.9015355363488198, - "epoch": 0.4909021442869775, - "grad_norm": 7.331711292266846, - "learning_rate": 3.6107430683436514e-06, - "loss": 0.4334, - "mean_token_accuracy": 0.8570471405982971, - "num_tokens": 190539052.0, - "step": 158360 - }, - { - "entropy": 1.8853423982858657, - "epoch": 0.4909331434120272, - "grad_norm": 8.293661117553711, - "learning_rate": 3.6106290687582196e-06, - "loss": 0.4799, - "mean_token_accuracy": 0.8404641300439835, - "num_tokens": 190550584.0, - "step": 158370 - }, - { - "entropy": 1.9187031179666518, - "epoch": 0.4909641425370769, - "grad_norm": 7.615777969360352, - "learning_rate": 3.6105150799698063e-06, - "loss": 0.4587, - "mean_token_accuracy": 0.8540566086769104, - "num_tokens": 190561826.0, - "step": 158380 - }, - { - "entropy": 1.9027689948678017, - "epoch": 0.4909951416621266, - "grad_norm": 8.56459903717041, - "learning_rate": 3.6104011019767042e-06, - "loss": 0.4743, - "mean_token_accuracy": 0.8480705738067627, - "num_tokens": 190573607.0, - "step": 158390 - }, - { - "entropy": 1.8521524950861932, - "epoch": 0.49102614078717627, - "grad_norm": 8.306175231933594, - "learning_rate": 3.610287134777212e-06, - "loss": 0.4325, - "mean_token_accuracy": 0.8617270559072494, - "num_tokens": 190585491.0, - "step": 158400 - }, - { - "entropy": 1.883074052631855, - "epoch": 0.491057139912226, - "grad_norm": 7.961977958679199, - "learning_rate": 3.6101731783696254e-06, - "loss": 0.405, - "mean_token_accuracy": 0.8651751860976219, - "num_tokens": 190596588.0, - "step": 158410 - }, - { - "entropy": 1.9026632726192474, - "epoch": 0.49108813903727566, - "grad_norm": 7.978363037109375, - "learning_rate": 3.6100592327522414e-06, - "loss": 0.4865, - "mean_token_accuracy": 0.8567787855863571, - "num_tokens": 190607394.0, - "step": 158420 - }, - { - "entropy": 1.856230580806732, - "epoch": 0.4911191381623254, - "grad_norm": 8.092080116271973, - "learning_rate": 3.609945297923357e-06, - "loss": 0.4274, - "mean_token_accuracy": 0.8433807224035264, - "num_tokens": 190620235.0, - "step": 158430 - }, - { - "entropy": 1.8610840275883676, - "epoch": 0.49115013728737505, - "grad_norm": 6.581269264221191, - "learning_rate": 3.60983137388127e-06, - "loss": 0.4968, - "mean_token_accuracy": 0.8450413525104523, - "num_tokens": 190632374.0, - "step": 158440 - }, - { - "entropy": 1.8135110765695572, - "epoch": 0.4911811364124248, - "grad_norm": 8.886683464050293, - "learning_rate": 3.6097174606242787e-06, - "loss": 0.402, - "mean_token_accuracy": 0.8638947665691376, - "num_tokens": 190645439.0, - "step": 158450 - }, - { - "entropy": 1.9266662746667862, - "epoch": 0.49121213553747445, - "grad_norm": 8.657486915588379, - "learning_rate": 3.609603558150681e-06, - "loss": 0.4881, - "mean_token_accuracy": 0.8563274934887886, - "num_tokens": 190656388.0, - "step": 158460 - }, - { - "entropy": 1.8276669576764106, - "epoch": 0.4912431346625242, - "grad_norm": 2.386725664138794, - "learning_rate": 3.6094896664587762e-06, - "loss": 0.4024, - "mean_token_accuracy": 0.8620262265205383, - "num_tokens": 190669516.0, - "step": 158470 - }, - { - "entropy": 1.89359792470932, - "epoch": 0.49127413378757384, - "grad_norm": 4.238652229309082, - "learning_rate": 3.6093757855468625e-06, - "loss": 0.4699, - "mean_token_accuracy": 0.8525636330246925, - "num_tokens": 190681421.0, - "step": 158480 - }, - { - "entropy": 1.914654791355133, - "epoch": 0.49130513291262357, - "grad_norm": 6.82390022277832, - "learning_rate": 3.6092619154132415e-06, - "loss": 0.4631, - "mean_token_accuracy": 0.8525402382016182, - "num_tokens": 190692959.0, - "step": 158490 - }, - { - "entropy": 1.8836501479148864, - "epoch": 0.49133613203767323, - "grad_norm": 4.361353397369385, - "learning_rate": 3.6091480560562114e-06, - "loss": 0.4636, - "mean_token_accuracy": 0.8494673162698746, - "num_tokens": 190704902.0, - "step": 158500 - }, - { - "entropy": 1.901443050801754, - "epoch": 0.49136713116272296, - "grad_norm": 3.428273916244507, - "learning_rate": 3.6090342074740727e-06, - "loss": 0.4516, - "mean_token_accuracy": 0.8393049791455269, - "num_tokens": 190717320.0, - "step": 158510 - }, - { - "entropy": 1.9237724661827087, - "epoch": 0.4913981302877726, - "grad_norm": 8.968462944030762, - "learning_rate": 3.608920369665126e-06, - "loss": 0.4693, - "mean_token_accuracy": 0.8518886238336563, - "num_tokens": 190728541.0, - "step": 158520 - }, - { - "entropy": 1.9531280279159546, - "epoch": 0.4914291294128223, - "grad_norm": 9.573519706726074, - "learning_rate": 3.6088065426276725e-06, - "loss": 0.5737, - "mean_token_accuracy": 0.8390479683876038, - "num_tokens": 190740022.0, - "step": 158530 - }, - { - "entropy": 1.9052818953990935, - "epoch": 0.491460128537872, - "grad_norm": 9.276761054992676, - "learning_rate": 3.6086927263600148e-06, - "loss": 0.4698, - "mean_token_accuracy": 0.8530030116438866, - "num_tokens": 190751019.0, - "step": 158540 - }, - { - "entropy": 1.8869039386510849, - "epoch": 0.4914911276629217, - "grad_norm": 4.146981716156006, - "learning_rate": 3.6085789208604527e-06, - "loss": 0.4269, - "mean_token_accuracy": 0.8596585467457771, - "num_tokens": 190763800.0, - "step": 158550 - }, - { - "entropy": 1.7681370228528976, - "epoch": 0.4915221267879714, - "grad_norm": 3.943372964859009, - "learning_rate": 3.6084651261272897e-06, - "loss": 0.3736, - "mean_token_accuracy": 0.8705621913075448, - "num_tokens": 190777433.0, - "step": 158560 - }, - { - "entropy": 1.9185601070523262, - "epoch": 0.4915531259130211, - "grad_norm": 7.802610397338867, - "learning_rate": 3.6083513421588275e-06, - "loss": 0.4505, - "mean_token_accuracy": 0.8546137645840645, - "num_tokens": 190789867.0, - "step": 158570 - }, - { - "entropy": 1.9287995994091034, - "epoch": 0.4915841250380708, - "grad_norm": 8.18673324584961, - "learning_rate": 3.6082375689533694e-06, - "loss": 0.4293, - "mean_token_accuracy": 0.8485715165734291, - "num_tokens": 190801836.0, - "step": 158580 - }, - { - "entropy": 1.7540825635194779, - "epoch": 0.4916151241631205, - "grad_norm": 4.475728511810303, - "learning_rate": 3.6081238065092192e-06, - "loss": 0.3126, - "mean_token_accuracy": 0.8646261185407639, - "num_tokens": 190816330.0, - "step": 158590 - }, - { - "entropy": 1.9391558408737182, - "epoch": 0.4916461232881702, - "grad_norm": 4.104739189147949, - "learning_rate": 3.60801005482468e-06, - "loss": 0.5068, - "mean_token_accuracy": 0.8371262863278389, - "num_tokens": 190828313.0, - "step": 158600 - }, - { - "entropy": 1.9185767412185668, - "epoch": 0.49167712241321987, - "grad_norm": 8.502923011779785, - "learning_rate": 3.607896313898056e-06, - "loss": 0.4647, - "mean_token_accuracy": 0.8552430674433709, - "num_tokens": 190840723.0, - "step": 158610 - }, - { - "entropy": 1.9190486401319504, - "epoch": 0.4917081215382696, - "grad_norm": 3.9274349212646484, - "learning_rate": 3.6077825837276513e-06, - "loss": 0.4443, - "mean_token_accuracy": 0.8470717743039131, - "num_tokens": 190852704.0, - "step": 158620 - }, - { - "entropy": 1.9390907883644104, - "epoch": 0.49173912066331926, - "grad_norm": 8.89169979095459, - "learning_rate": 3.607668864311771e-06, - "loss": 0.4595, - "mean_token_accuracy": 0.8512443661689758, - "num_tokens": 190863472.0, - "step": 158630 - }, - { - "entropy": 1.9003855600953101, - "epoch": 0.491770119788369, - "grad_norm": 6.889798641204834, - "learning_rate": 3.607555155648721e-06, - "loss": 0.4402, - "mean_token_accuracy": 0.8536323562264443, - "num_tokens": 190875506.0, - "step": 158640 - }, - { - "entropy": 1.9173786222934723, - "epoch": 0.49180111891341866, - "grad_norm": 8.534137725830078, - "learning_rate": 3.6074414577368046e-06, - "loss": 0.4389, - "mean_token_accuracy": 0.8548526600003242, - "num_tokens": 190887431.0, - "step": 158650 - }, - { - "entropy": 1.9805822908878326, - "epoch": 0.4918321180384684, - "grad_norm": 8.78432846069336, - "learning_rate": 3.60732777057433e-06, - "loss": 0.481, - "mean_token_accuracy": 0.8482834473252296, - "num_tokens": 190898042.0, - "step": 158660 - }, - { - "entropy": 1.9581329673528671, - "epoch": 0.49186311716351805, - "grad_norm": 7.826303958892822, - "learning_rate": 3.607214094159603e-06, - "loss": 0.4789, - "mean_token_accuracy": 0.8571975663304329, - "num_tokens": 190909409.0, - "step": 158670 - }, - { - "entropy": 1.906209309399128, - "epoch": 0.4918941162885678, - "grad_norm": 8.253519058227539, - "learning_rate": 3.6071004284909293e-06, - "loss": 0.4601, - "mean_token_accuracy": 0.8512861356139183, - "num_tokens": 190921294.0, - "step": 158680 - }, - { - "entropy": 1.9099845930933952, - "epoch": 0.49192511541361744, - "grad_norm": 3.3048346042633057, - "learning_rate": 3.606986773566617e-06, - "loss": 0.4697, - "mean_token_accuracy": 0.851481918990612, - "num_tokens": 190933707.0, - "step": 158690 - }, - { - "entropy": 1.8390526965260505, - "epoch": 0.49195611453866717, - "grad_norm": 3.801232099533081, - "learning_rate": 3.606873129384972e-06, - "loss": 0.3815, - "mean_token_accuracy": 0.8585761964321137, - "num_tokens": 190946405.0, - "step": 158700 - }, - { - "entropy": 1.8441692113876342, - "epoch": 0.49198711366371684, - "grad_norm": 8.876874923706055, - "learning_rate": 3.606759495944304e-06, - "loss": 0.4182, - "mean_token_accuracy": 0.8560978010296821, - "num_tokens": 190959067.0, - "step": 158710 - }, - { - "entropy": 1.8029930278658868, - "epoch": 0.49201811278876656, - "grad_norm": 8.655598640441895, - "learning_rate": 3.6066458732429203e-06, - "loss": 0.4188, - "mean_token_accuracy": 0.857164989411831, - "num_tokens": 190972623.0, - "step": 158720 - }, - { - "entropy": 1.9350174590945244, - "epoch": 0.49204911191381623, - "grad_norm": 9.002110481262207, - "learning_rate": 3.6065322612791293e-06, - "loss": 0.4608, - "mean_token_accuracy": 0.848691463470459, - "num_tokens": 190984331.0, - "step": 158730 - }, - { - "entropy": 1.8599534809589386, - "epoch": 0.49208011103886595, - "grad_norm": 3.756072759628296, - "learning_rate": 3.60641866005124e-06, - "loss": 0.4706, - "mean_token_accuracy": 0.846306300163269, - "num_tokens": 190997036.0, - "step": 158740 - }, - { - "entropy": 1.8874867737293244, - "epoch": 0.4921111101639156, - "grad_norm": 8.43781852722168, - "learning_rate": 3.6063050695575613e-06, - "loss": 0.4231, - "mean_token_accuracy": 0.8569997996091843, - "num_tokens": 191009039.0, - "step": 158750 - }, - { - "entropy": 1.9412825867533683, - "epoch": 0.49214210928896535, - "grad_norm": 8.520960807800293, - "learning_rate": 3.6061914897964035e-06, - "loss": 0.5182, - "mean_token_accuracy": 0.8366198286414146, - "num_tokens": 191020334.0, - "step": 158760 - }, - { - "entropy": 1.9006411105394363, - "epoch": 0.492173108414015, - "grad_norm": 10.549320220947266, - "learning_rate": 3.606077920766076e-06, - "loss": 0.4682, - "mean_token_accuracy": 0.845074562728405, - "num_tokens": 191032363.0, - "step": 158770 - }, - { - "entropy": 1.9263049215078354, - "epoch": 0.4922041075390647, - "grad_norm": 3.9291138648986816, - "learning_rate": 3.6059643624648898e-06, - "loss": 0.4452, - "mean_token_accuracy": 0.8536166071891784, - "num_tokens": 191044148.0, - "step": 158780 - }, - { - "entropy": 1.9111416772007943, - "epoch": 0.4922351066641144, - "grad_norm": 3.9041879177093506, - "learning_rate": 3.6058508148911555e-06, - "loss": 0.4417, - "mean_token_accuracy": 0.8473462641239167, - "num_tokens": 191055737.0, - "step": 158790 - }, - { - "entropy": 1.8865627378225327, - "epoch": 0.4922661057891641, - "grad_norm": 4.586888790130615, - "learning_rate": 3.605737278043184e-06, - "loss": 0.4357, - "mean_token_accuracy": 0.8481022253632545, - "num_tokens": 191069166.0, - "step": 158800 - }, - { - "entropy": 1.872072483599186, - "epoch": 0.4922971049142138, - "grad_norm": 8.540482521057129, - "learning_rate": 3.6056237519192867e-06, - "loss": 0.4369, - "mean_token_accuracy": 0.8634131327271461, - "num_tokens": 191082087.0, - "step": 158810 - }, - { - "entropy": 1.9465556621551514, - "epoch": 0.49232810403926347, - "grad_norm": 8.96761417388916, - "learning_rate": 3.605510236517776e-06, - "loss": 0.5089, - "mean_token_accuracy": 0.8466716349124909, - "num_tokens": 191093735.0, - "step": 158820 - }, - { - "entropy": 1.8635354220867157, - "epoch": 0.4923591031643132, - "grad_norm": 7.013364791870117, - "learning_rate": 3.6053967318369633e-06, - "loss": 0.4536, - "mean_token_accuracy": 0.8506390795111656, - "num_tokens": 191106221.0, - "step": 158830 - }, - { - "entropy": 1.8185190051794051, - "epoch": 0.49239010228936286, - "grad_norm": 9.036824226379395, - "learning_rate": 3.6052832378751617e-06, - "loss": 0.4254, - "mean_token_accuracy": 0.8497332945466042, - "num_tokens": 191119668.0, - "step": 158840 - }, - { - "entropy": 1.915633998811245, - "epoch": 0.4924211014144126, - "grad_norm": 4.1883673667907715, - "learning_rate": 3.6051697546306846e-06, - "loss": 0.4536, - "mean_token_accuracy": 0.8417811617255211, - "num_tokens": 191132314.0, - "step": 158850 - }, - { - "entropy": 1.9364984586834908, - "epoch": 0.49245210053946226, - "grad_norm": 7.76389217376709, - "learning_rate": 3.6050562821018447e-06, - "loss": 0.4657, - "mean_token_accuracy": 0.8470098197460174, - "num_tokens": 191143993.0, - "step": 158860 - }, - { - "entropy": 1.8327162981033325, - "epoch": 0.492483099664512, - "grad_norm": 7.964986324310303, - "learning_rate": 3.604942820286957e-06, - "loss": 0.4049, - "mean_token_accuracy": 0.8613449767231941, - "num_tokens": 191156864.0, - "step": 158870 - }, - { - "entropy": 1.8690617755055428, - "epoch": 0.49251409878956165, - "grad_norm": 4.35117244720459, - "learning_rate": 3.6048293691843333e-06, - "loss": 0.4475, - "mean_token_accuracy": 0.8491967663168907, - "num_tokens": 191169304.0, - "step": 158880 - }, - { - "entropy": 1.8968679785728455, - "epoch": 0.4925450979146114, - "grad_norm": 4.414698123931885, - "learning_rate": 3.6047159287922902e-06, - "loss": 0.4392, - "mean_token_accuracy": 0.8521355882287025, - "num_tokens": 191181207.0, - "step": 158890 - }, - { - "entropy": 1.9356569901108742, - "epoch": 0.49257609703966104, - "grad_norm": 8.631135940551758, - "learning_rate": 3.6046024991091415e-06, - "loss": 0.5132, - "mean_token_accuracy": 0.8320743992924691, - "num_tokens": 191192832.0, - "step": 158900 - }, - { - "entropy": 1.8540920540690422, - "epoch": 0.49260709616471077, - "grad_norm": 7.090327262878418, - "learning_rate": 3.604489080133203e-06, - "loss": 0.4149, - "mean_token_accuracy": 0.8626075640320778, - "num_tokens": 191205382.0, - "step": 158910 - }, - { - "entropy": 1.8661150723695754, - "epoch": 0.49263809528976044, - "grad_norm": 4.442000389099121, - "learning_rate": 3.604375671862789e-06, - "loss": 0.4347, - "mean_token_accuracy": 0.8547174111008644, - "num_tokens": 191218404.0, - "step": 158920 - }, - { - "entropy": 1.9430969834327698, - "epoch": 0.49266909441481016, - "grad_norm": 7.606647968292236, - "learning_rate": 3.604262274296218e-06, - "loss": 0.4695, - "mean_token_accuracy": 0.8519912898540497, - "num_tokens": 191230222.0, - "step": 158930 - }, - { - "entropy": 1.8457999877631663, - "epoch": 0.49270009353985983, - "grad_norm": 7.863997936248779, - "learning_rate": 3.6041488874318038e-06, - "loss": 0.3795, - "mean_token_accuracy": 0.8659952461719513, - "num_tokens": 191242348.0, - "step": 158940 - }, - { - "entropy": 1.9445855423808098, - "epoch": 0.49273109266490955, - "grad_norm": 10.894515037536621, - "learning_rate": 3.6040355112678643e-06, - "loss": 0.5062, - "mean_token_accuracy": 0.8433070674538612, - "num_tokens": 191254529.0, - "step": 158950 - }, - { - "entropy": 1.8986426308751105, - "epoch": 0.4927620917899592, - "grad_norm": 9.425984382629395, - "learning_rate": 3.6039221458027167e-06, - "loss": 0.4558, - "mean_token_accuracy": 0.84424237459898, - "num_tokens": 191267476.0, - "step": 158960 - }, - { - "entropy": 1.8787411868572235, - "epoch": 0.49279309091500895, - "grad_norm": 7.693811416625977, - "learning_rate": 3.6038087910346775e-06, - "loss": 0.4695, - "mean_token_accuracy": 0.8551173999905586, - "num_tokens": 191279350.0, - "step": 158970 - }, - { - "entropy": 1.8206491246819496, - "epoch": 0.4928240900400586, - "grad_norm": 5.349733829498291, - "learning_rate": 3.6036954469620656e-06, - "loss": 0.4141, - "mean_token_accuracy": 0.858145822584629, - "num_tokens": 191292641.0, - "step": 158980 - }, - { - "entropy": 1.9382214292883873, - "epoch": 0.49285508916510834, - "grad_norm": 8.99599838256836, - "learning_rate": 3.603582113583198e-06, - "loss": 0.4286, - "mean_token_accuracy": 0.8653192713856697, - "num_tokens": 191303702.0, - "step": 158990 - }, - { - "entropy": 1.9484039723873139, - "epoch": 0.492886088290158, - "grad_norm": 9.494302749633789, - "learning_rate": 3.6034687908963946e-06, - "loss": 0.4695, - "mean_token_accuracy": 0.8453142330050468, - "num_tokens": 191315049.0, - "step": 159000 - }, - { - "entropy": 1.9259200036525725, - "epoch": 0.49291708741520773, - "grad_norm": 8.478474617004395, - "learning_rate": 3.6033554788999735e-06, - "loss": 0.5419, - "mean_token_accuracy": 0.8397252514958382, - "num_tokens": 191326984.0, - "step": 159010 - }, - { - "entropy": 1.7619906887412071, - "epoch": 0.4929480865402574, - "grad_norm": 7.3137407302856445, - "learning_rate": 3.603242177592254e-06, - "loss": 0.3754, - "mean_token_accuracy": 0.8664394572377205, - "num_tokens": 191340485.0, - "step": 159020 - }, - { - "entropy": 1.9478530764579773, - "epoch": 0.49297908566530707, - "grad_norm": 8.351778984069824, - "learning_rate": 3.6031288869715564e-06, - "loss": 0.509, - "mean_token_accuracy": 0.8485710576176644, - "num_tokens": 191351634.0, - "step": 159030 - }, - { - "entropy": 1.8213827714323998, - "epoch": 0.4930100847903568, - "grad_norm": 7.855036735534668, - "learning_rate": 3.6030156070362e-06, - "loss": 0.3937, - "mean_token_accuracy": 0.8727815434336662, - "num_tokens": 191364782.0, - "step": 159040 - }, - { - "entropy": 1.7960595518350602, - "epoch": 0.49304108391540646, - "grad_norm": 2.6937222480773926, - "learning_rate": 3.6029023377845047e-06, - "loss": 0.3767, - "mean_token_accuracy": 0.8672909617424012, - "num_tokens": 191377748.0, - "step": 159050 - }, - { - "entropy": 1.8690377235412599, - "epoch": 0.4930720830404562, - "grad_norm": 4.590978622436523, - "learning_rate": 3.6027890792147935e-06, - "loss": 0.4366, - "mean_token_accuracy": 0.8542490854859353, - "num_tokens": 191390080.0, - "step": 159060 - }, - { - "entropy": 1.7864416763186455, - "epoch": 0.49310308216550586, - "grad_norm": 8.749615669250488, - "learning_rate": 3.602675831325385e-06, - "loss": 0.3539, - "mean_token_accuracy": 0.8751874417066574, - "num_tokens": 191403446.0, - "step": 159070 - }, - { - "entropy": 1.9172361060976981, - "epoch": 0.4931340812905556, - "grad_norm": 8.16201114654541, - "learning_rate": 3.6025625941146024e-06, - "loss": 0.4948, - "mean_token_accuracy": 0.8378383472561837, - "num_tokens": 191415168.0, - "step": 159080 - }, - { - "entropy": 1.8703493565320968, - "epoch": 0.49316508041560525, - "grad_norm": 8.217545509338379, - "learning_rate": 3.602449367580767e-06, - "loss": 0.4655, - "mean_token_accuracy": 0.852349478006363, - "num_tokens": 191426771.0, - "step": 159090 - }, - { - "entropy": 1.9466371893882752, - "epoch": 0.493196079540655, - "grad_norm": 8.871587753295898, - "learning_rate": 3.6023361517222004e-06, - "loss": 0.4819, - "mean_token_accuracy": 0.8483422711491585, - "num_tokens": 191437750.0, - "step": 159100 - }, - { - "entropy": 1.9135088309645654, - "epoch": 0.49322707866570464, - "grad_norm": 8.855616569519043, - "learning_rate": 3.6022229465372273e-06, - "loss": 0.5018, - "mean_token_accuracy": 0.8475657269358635, - "num_tokens": 191449796.0, - "step": 159110 - }, - { - "entropy": 1.7874897986650466, - "epoch": 0.49325807779075437, - "grad_norm": 2.3957784175872803, - "learning_rate": 3.6021097520241676e-06, - "loss": 0.4042, - "mean_token_accuracy": 0.8699712991714478, - "num_tokens": 191462458.0, - "step": 159120 - }, - { - "entropy": 1.9435395896434784, - "epoch": 0.49328907691580404, - "grad_norm": 7.960422515869141, - "learning_rate": 3.6019965681813475e-06, - "loss": 0.4552, - "mean_token_accuracy": 0.8521071627736092, - "num_tokens": 191473434.0, - "step": 159130 - }, - { - "entropy": 1.9279766619205474, - "epoch": 0.49332007604085376, - "grad_norm": 3.857520341873169, - "learning_rate": 3.601883395007089e-06, - "loss": 0.5056, - "mean_token_accuracy": 0.8442885830998421, - "num_tokens": 191484903.0, - "step": 159140 - }, - { - "entropy": 1.8475008338689805, - "epoch": 0.49335107516590343, - "grad_norm": 6.955073356628418, - "learning_rate": 3.6017702324997172e-06, - "loss": 0.4643, - "mean_token_accuracy": 0.8446203991770744, - "num_tokens": 191498120.0, - "step": 159150 - }, - { - "entropy": 1.9266953021287918, - "epoch": 0.49338207429095315, - "grad_norm": 8.705105781555176, - "learning_rate": 3.6016570806575563e-06, - "loss": 0.5382, - "mean_token_accuracy": 0.8330066472291946, - "num_tokens": 191509278.0, - "step": 159160 - }, - { - "entropy": 1.8248368367552756, - "epoch": 0.4934130734160028, - "grad_norm": 7.812545299530029, - "learning_rate": 3.601543939478931e-06, - "loss": 0.4071, - "mean_token_accuracy": 0.8588651672005654, - "num_tokens": 191521592.0, - "step": 159170 - }, - { - "entropy": 1.8829329013824463, - "epoch": 0.49344407254105255, - "grad_norm": 8.274620056152344, - "learning_rate": 3.6014308089621654e-06, - "loss": 0.4756, - "mean_token_accuracy": 0.8415772512555122, - "num_tokens": 191533098.0, - "step": 159180 - }, - { - "entropy": 1.8726583898067475, - "epoch": 0.4934750716661022, - "grad_norm": 9.604278564453125, - "learning_rate": 3.601317689105587e-06, - "loss": 0.4613, - "mean_token_accuracy": 0.8552197560667991, - "num_tokens": 191544887.0, - "step": 159190 - }, - { - "entropy": 1.859221415221691, - "epoch": 0.49350607079115194, - "grad_norm": 6.692994117736816, - "learning_rate": 3.6012045799075205e-06, - "loss": 0.4347, - "mean_token_accuracy": 0.8668141156435013, - "num_tokens": 191556853.0, - "step": 159200 - }, - { - "entropy": 1.9711533397436143, - "epoch": 0.4935370699162016, - "grad_norm": 9.214414596557617, - "learning_rate": 3.6010914813662927e-06, - "loss": 0.5046, - "mean_token_accuracy": 0.8427471235394478, - "num_tokens": 191568177.0, - "step": 159210 - }, - { - "entropy": 1.8187393069267273, - "epoch": 0.49356806904125133, - "grad_norm": 8.627388954162598, - "learning_rate": 3.60097839348023e-06, - "loss": 0.4427, - "mean_token_accuracy": 0.8565979853272438, - "num_tokens": 191580641.0, - "step": 159220 - }, - { - "entropy": 1.8383538708090783, - "epoch": 0.493599068166301, - "grad_norm": 3.850691795349121, - "learning_rate": 3.6008653162476602e-06, - "loss": 0.3849, - "mean_token_accuracy": 0.8547204375267029, - "num_tokens": 191593431.0, - "step": 159230 - }, - { - "entropy": 1.8895878791809082, - "epoch": 0.4936300672913507, - "grad_norm": 7.909356117248535, - "learning_rate": 3.6007522496669095e-06, - "loss": 0.4623, - "mean_token_accuracy": 0.8444205358624458, - "num_tokens": 191605046.0, - "step": 159240 - }, - { - "entropy": 1.9098277121782303, - "epoch": 0.4936610664164004, - "grad_norm": 6.846156597137451, - "learning_rate": 3.6006391937363068e-06, - "loss": 0.4493, - "mean_token_accuracy": 0.8568753972649574, - "num_tokens": 191616417.0, - "step": 159250 - }, - { - "entropy": 1.8749052241444588, - "epoch": 0.4936920655414501, - "grad_norm": 8.896378517150879, - "learning_rate": 3.600526148454179e-06, - "loss": 0.4854, - "mean_token_accuracy": 0.847206848859787, - "num_tokens": 191628583.0, - "step": 159260 - }, - { - "entropy": 1.843507680296898, - "epoch": 0.4937230646664998, - "grad_norm": 8.739633560180664, - "learning_rate": 3.6004131138188563e-06, - "loss": 0.406, - "mean_token_accuracy": 0.8644677430391312, - "num_tokens": 191641180.0, - "step": 159270 - }, - { - "entropy": 1.928541123867035, - "epoch": 0.49375406379154946, - "grad_norm": 7.948415279388428, - "learning_rate": 3.6003000898286653e-06, - "loss": 0.4932, - "mean_token_accuracy": 0.8485148876905442, - "num_tokens": 191652064.0, - "step": 159280 - }, - { - "entropy": 1.8564437612891198, - "epoch": 0.4937850629165992, - "grad_norm": 3.7542104721069336, - "learning_rate": 3.6001870764819376e-06, - "loss": 0.4418, - "mean_token_accuracy": 0.8547224655747414, - "num_tokens": 191664242.0, - "step": 159290 - }, - { - "entropy": 1.886579157412052, - "epoch": 0.49381606204164885, - "grad_norm": 7.148205757141113, - "learning_rate": 3.6000740737770015e-06, - "loss": 0.4305, - "mean_token_accuracy": 0.8600714549422264, - "num_tokens": 191676071.0, - "step": 159300 - }, - { - "entropy": 1.9193453639745712, - "epoch": 0.4938470611666986, - "grad_norm": 7.308450222015381, - "learning_rate": 3.599961081712188e-06, - "loss": 0.4872, - "mean_token_accuracy": 0.8472814872860909, - "num_tokens": 191687806.0, - "step": 159310 - }, - { - "entropy": 1.9151620626449586, - "epoch": 0.49387806029174824, - "grad_norm": 3.6540892124176025, - "learning_rate": 3.5998481002858256e-06, - "loss": 0.459, - "mean_token_accuracy": 0.8533779740333557, - "num_tokens": 191698520.0, - "step": 159320 - }, - { - "entropy": 1.8681719586253167, - "epoch": 0.49390905941679797, - "grad_norm": 9.010276794433594, - "learning_rate": 3.5997351294962464e-06, - "loss": 0.4324, - "mean_token_accuracy": 0.8576828598976135, - "num_tokens": 191711297.0, - "step": 159330 - }, - { - "entropy": 1.8712261497974396, - "epoch": 0.49394005854184764, - "grad_norm": 7.325274467468262, - "learning_rate": 3.5996221693417817e-06, - "loss": 0.4474, - "mean_token_accuracy": 0.8589169010519981, - "num_tokens": 191723406.0, - "step": 159340 - }, - { - "entropy": 1.8066833555698394, - "epoch": 0.49397105766689736, - "grad_norm": 3.5466883182525635, - "learning_rate": 3.599509219820762e-06, - "loss": 0.4108, - "mean_token_accuracy": 0.8577698558568955, - "num_tokens": 191736967.0, - "step": 159350 - }, - { - "entropy": 1.7822127223014832, - "epoch": 0.49400205679194703, - "grad_norm": 4.0035576820373535, - "learning_rate": 3.5993962809315197e-06, - "loss": 0.3724, - "mean_token_accuracy": 0.8697702825069428, - "num_tokens": 191750056.0, - "step": 159360 - }, - { - "entropy": 1.936877228319645, - "epoch": 0.49403305591699676, - "grad_norm": 9.158493041992188, - "learning_rate": 3.5992833526723876e-06, - "loss": 0.4997, - "mean_token_accuracy": 0.8464233413338661, - "num_tokens": 191761365.0, - "step": 159370 - }, - { - "entropy": 1.8716009959578515, - "epoch": 0.4940640550420464, - "grad_norm": 9.602849006652832, - "learning_rate": 3.5991704350416963e-06, - "loss": 0.4716, - "mean_token_accuracy": 0.8516163632273674, - "num_tokens": 191773293.0, - "step": 159380 - }, - { - "entropy": 1.8163740314543246, - "epoch": 0.49409505416709615, - "grad_norm": 1.8862295150756836, - "learning_rate": 3.5990575280377803e-06, - "loss": 0.4396, - "mean_token_accuracy": 0.8576122790575027, - "num_tokens": 191786131.0, - "step": 159390 - }, - { - "entropy": 1.9228911444544792, - "epoch": 0.4941260532921458, - "grad_norm": 6.864531993865967, - "learning_rate": 3.5989446316589728e-06, - "loss": 0.4464, - "mean_token_accuracy": 0.8586970806121826, - "num_tokens": 191797537.0, - "step": 159400 - }, - { - "entropy": 1.8249765798449515, - "epoch": 0.49415705241719554, - "grad_norm": 9.325008392333984, - "learning_rate": 3.5988317459036063e-06, - "loss": 0.41, - "mean_token_accuracy": 0.8683119297027588, - "num_tokens": 191810309.0, - "step": 159410 - }, - { - "entropy": 1.8564854457974433, - "epoch": 0.4941880515422452, - "grad_norm": 9.13007926940918, - "learning_rate": 3.5987188707700173e-06, - "loss": 0.4441, - "mean_token_accuracy": 0.8559637442231178, - "num_tokens": 191821949.0, - "step": 159420 - }, - { - "entropy": 1.9013855665922166, - "epoch": 0.49421905066729493, - "grad_norm": 7.891731262207031, - "learning_rate": 3.598606006256537e-06, - "loss": 0.4329, - "mean_token_accuracy": 0.8630088746547699, - "num_tokens": 191833045.0, - "step": 159430 - }, - { - "entropy": 1.8180122032761574, - "epoch": 0.4942500497923446, - "grad_norm": 7.655247688293457, - "learning_rate": 3.5984931523615023e-06, - "loss": 0.4172, - "mean_token_accuracy": 0.8637914076447487, - "num_tokens": 191845436.0, - "step": 159440 - }, - { - "entropy": 1.949680632352829, - "epoch": 0.49428104891739433, - "grad_norm": 8.30689525604248, - "learning_rate": 3.5983803090832483e-06, - "loss": 0.489, - "mean_token_accuracy": 0.8543712347745895, - "num_tokens": 191856433.0, - "step": 159450 - }, - { - "entropy": 1.9027078568935394, - "epoch": 0.494312048042444, - "grad_norm": 9.063887596130371, - "learning_rate": 3.5982674764201085e-06, - "loss": 0.4823, - "mean_token_accuracy": 0.8519957110285759, - "num_tokens": 191867610.0, - "step": 159460 - }, - { - "entropy": 1.9152158468961715, - "epoch": 0.4943430471674937, - "grad_norm": 7.915401935577393, - "learning_rate": 3.598154654370421e-06, - "loss": 0.4439, - "mean_token_accuracy": 0.8566673934459687, - "num_tokens": 191878868.0, - "step": 159470 - }, - { - "entropy": 1.9054709061980248, - "epoch": 0.4943740462925434, - "grad_norm": 7.502203941345215, - "learning_rate": 3.598041842932521e-06, - "loss": 0.4503, - "mean_token_accuracy": 0.863150903582573, - "num_tokens": 191890203.0, - "step": 159480 - }, - { - "entropy": 1.8717289566993713, - "epoch": 0.4944050454175931, - "grad_norm": 8.5286283493042, - "learning_rate": 3.5979290421047445e-06, - "loss": 0.4555, - "mean_token_accuracy": 0.851759298145771, - "num_tokens": 191901799.0, - "step": 159490 - }, - { - "entropy": 1.8034158065915107, - "epoch": 0.4944360445426428, - "grad_norm": 7.359280109405518, - "learning_rate": 3.5978162518854305e-06, - "loss": 0.3651, - "mean_token_accuracy": 0.8661789894104004, - "num_tokens": 191914715.0, - "step": 159500 - }, - { - "entropy": 1.900420169532299, - "epoch": 0.49446704366769245, - "grad_norm": 7.568366527557373, - "learning_rate": 3.5977034722729138e-06, - "loss": 0.4615, - "mean_token_accuracy": 0.8520032197237015, - "num_tokens": 191926711.0, - "step": 159510 - }, - { - "entropy": 1.8336208701133727, - "epoch": 0.4944980427927422, - "grad_norm": 3.925819158554077, - "learning_rate": 3.5975907032655337e-06, - "loss": 0.4219, - "mean_token_accuracy": 0.8600850090384483, - "num_tokens": 191939407.0, - "step": 159520 - }, - { - "entropy": 1.8698050826787949, - "epoch": 0.49452904191779185, - "grad_norm": 8.200296401977539, - "learning_rate": 3.5974779448616272e-06, - "loss": 0.4385, - "mean_token_accuracy": 0.8596655920147895, - "num_tokens": 191951027.0, - "step": 159530 - }, - { - "entropy": 1.9370542958378791, - "epoch": 0.49456004104284157, - "grad_norm": 8.19310474395752, - "learning_rate": 3.5973651970595336e-06, - "loss": 0.4588, - "mean_token_accuracy": 0.8577198967337608, - "num_tokens": 191962661.0, - "step": 159540 - }, - { - "entropy": 1.9390544414520263, - "epoch": 0.49459104016789124, - "grad_norm": 8.382871627807617, - "learning_rate": 3.5972524598575907e-06, - "loss": 0.4806, - "mean_token_accuracy": 0.855408425629139, - "num_tokens": 191973798.0, - "step": 159550 - }, - { - "entropy": 1.8777232959866523, - "epoch": 0.49462203929294096, - "grad_norm": 8.35791015625, - "learning_rate": 3.597139733254139e-06, - "loss": 0.4339, - "mean_token_accuracy": 0.860966557264328, - "num_tokens": 191985741.0, - "step": 159560 - }, - { - "entropy": 1.9498206496238708, - "epoch": 0.49465303841799063, - "grad_norm": 8.334501266479492, - "learning_rate": 3.597027017247517e-06, - "loss": 0.496, - "mean_token_accuracy": 0.8421608254313468, - "num_tokens": 191997083.0, - "step": 159570 - }, - { - "entropy": 1.9712836146354675, - "epoch": 0.49468403754304036, - "grad_norm": 8.104325294494629, - "learning_rate": 3.5969143118360645e-06, - "loss": 0.4925, - "mean_token_accuracy": 0.850074777007103, - "num_tokens": 192008075.0, - "step": 159580 - }, - { - "entropy": 1.9014674112200738, - "epoch": 0.49471503666809, - "grad_norm": 8.756881713867188, - "learning_rate": 3.596801617018122e-06, - "loss": 0.4485, - "mean_token_accuracy": 0.8585664972662925, - "num_tokens": 192020574.0, - "step": 159590 - }, - { - "entropy": 1.9716274067759514, - "epoch": 0.49474603579313975, - "grad_norm": 9.641308784484863, - "learning_rate": 3.5966889327920303e-06, - "loss": 0.5101, - "mean_token_accuracy": 0.8372088670730591, - "num_tokens": 192032036.0, - "step": 159600 - }, - { - "entropy": 1.9579774558544158, - "epoch": 0.4947770349181894, - "grad_norm": 8.46997356414795, - "learning_rate": 3.596576259156129e-06, - "loss": 0.4723, - "mean_token_accuracy": 0.8514121487736702, - "num_tokens": 192043288.0, - "step": 159610 - }, - { - "entropy": 1.9294979020953178, - "epoch": 0.49480803404323914, - "grad_norm": 8.299081802368164, - "learning_rate": 3.5964635961087614e-06, - "loss": 0.4578, - "mean_token_accuracy": 0.8471070304512978, - "num_tokens": 192055247.0, - "step": 159620 - }, - { - "entropy": 1.9172394633293153, - "epoch": 0.4948390331682888, - "grad_norm": 7.675030708312988, - "learning_rate": 3.596350943648268e-06, - "loss": 0.4523, - "mean_token_accuracy": 0.8571576774120331, - "num_tokens": 192066190.0, - "step": 159630 - }, - { - "entropy": 1.9542252331972123, - "epoch": 0.49487003229333854, - "grad_norm": 7.442762851715088, - "learning_rate": 3.5962383017729907e-06, - "loss": 0.4722, - "mean_token_accuracy": 0.854285454750061, - "num_tokens": 192077133.0, - "step": 159640 - }, - { - "entropy": 1.9150119140744208, - "epoch": 0.4949010314183882, - "grad_norm": 7.354382514953613, - "learning_rate": 3.596125670481273e-06, - "loss": 0.4672, - "mean_token_accuracy": 0.8513759151101112, - "num_tokens": 192088338.0, - "step": 159650 - }, - { - "entropy": 1.9242964759469032, - "epoch": 0.49493203054343793, - "grad_norm": 8.360685348510742, - "learning_rate": 3.596013049771456e-06, - "loss": 0.4496, - "mean_token_accuracy": 0.8601465240120888, - "num_tokens": 192099903.0, - "step": 159660 - }, - { - "entropy": 1.8404909834265708, - "epoch": 0.4949630296684876, - "grad_norm": 6.913203239440918, - "learning_rate": 3.5959004396418847e-06, - "loss": 0.4468, - "mean_token_accuracy": 0.8578110337257385, - "num_tokens": 192112770.0, - "step": 159670 - }, - { - "entropy": 1.922998534142971, - "epoch": 0.4949940287935373, - "grad_norm": 4.097351551055908, - "learning_rate": 3.595787840090901e-06, - "loss": 0.4733, - "mean_token_accuracy": 0.8465961366891861, - "num_tokens": 192124121.0, - "step": 159680 - }, - { - "entropy": 1.921201765537262, - "epoch": 0.495025027918587, - "grad_norm": 8.46877384185791, - "learning_rate": 3.5956752511168493e-06, - "loss": 0.458, - "mean_token_accuracy": 0.8489266246557235, - "num_tokens": 192135725.0, - "step": 159690 - }, - { - "entropy": 1.795041662454605, - "epoch": 0.4950560270436367, - "grad_norm": 2.492420196533203, - "learning_rate": 3.5955626727180743e-06, - "loss": 0.3979, - "mean_token_accuracy": 0.865375991165638, - "num_tokens": 192149338.0, - "step": 159700 - }, - { - "entropy": 1.9332377836108208, - "epoch": 0.4950870261686864, - "grad_norm": 8.110081672668457, - "learning_rate": 3.59545010489292e-06, - "loss": 0.5078, - "mean_token_accuracy": 0.8450874507427215, - "num_tokens": 192160823.0, - "step": 159710 - }, - { - "entropy": 1.8410456269979476, - "epoch": 0.4951180252937361, - "grad_norm": 6.63314151763916, - "learning_rate": 3.595337547639732e-06, - "loss": 0.4297, - "mean_token_accuracy": 0.855519250035286, - "num_tokens": 192174110.0, - "step": 159720 - }, - { - "entropy": 1.9845334231853484, - "epoch": 0.4951490244187858, - "grad_norm": 9.71750545501709, - "learning_rate": 3.5952250009568545e-06, - "loss": 0.494, - "mean_token_accuracy": 0.85206418633461, - "num_tokens": 192185155.0, - "step": 159730 - }, - { - "entropy": 1.8927864357829094, - "epoch": 0.4951800235438355, - "grad_norm": 3.551086187362671, - "learning_rate": 3.595112464842634e-06, - "loss": 0.451, - "mean_token_accuracy": 0.8464046537876129, - "num_tokens": 192197012.0, - "step": 159740 - }, - { - "entropy": 1.8689477637410163, - "epoch": 0.49521102266888517, - "grad_norm": 8.574982643127441, - "learning_rate": 3.594999939295416e-06, - "loss": 0.4122, - "mean_token_accuracy": 0.8616869121789932, - "num_tokens": 192209129.0, - "step": 159750 - }, - { - "entropy": 1.9469447121024133, - "epoch": 0.49524202179393484, - "grad_norm": 8.655628204345703, - "learning_rate": 3.5948874243135472e-06, - "loss": 0.457, - "mean_token_accuracy": 0.8474073946475983, - "num_tokens": 192220445.0, - "step": 159760 - }, - { - "entropy": 1.8619813948869706, - "epoch": 0.49527302091898456, - "grad_norm": 7.8713297843933105, - "learning_rate": 3.5947749198953753e-06, - "loss": 0.4569, - "mean_token_accuracy": 0.8450932338833809, - "num_tokens": 192232925.0, - "step": 159770 - }, - { - "entropy": 1.7789973124861718, - "epoch": 0.49530402004403423, - "grad_norm": 2.4723801612854004, - "learning_rate": 3.5946624260392455e-06, - "loss": 0.3942, - "mean_token_accuracy": 0.8597326517105103, - "num_tokens": 192246932.0, - "step": 159780 - }, - { - "entropy": 1.9105781406164168, - "epoch": 0.49533501916908396, - "grad_norm": 5.2691850662231445, - "learning_rate": 3.5945499427435066e-06, - "loss": 0.4914, - "mean_token_accuracy": 0.8449198469519615, - "num_tokens": 192258019.0, - "step": 159790 - }, - { - "entropy": 1.8505356892943383, - "epoch": 0.4953660182941336, - "grad_norm": 8.916192054748535, - "learning_rate": 3.5944374700065052e-06, - "loss": 0.4483, - "mean_token_accuracy": 0.855469511449337, - "num_tokens": 192270524.0, - "step": 159800 - }, - { - "entropy": 1.896497841179371, - "epoch": 0.49539701741918335, - "grad_norm": 3.9803407192230225, - "learning_rate": 3.5943250078265917e-06, - "loss": 0.4517, - "mean_token_accuracy": 0.8532499849796296, - "num_tokens": 192281947.0, - "step": 159810 - }, - { - "entropy": 1.8115594327449798, - "epoch": 0.495428016544233, - "grad_norm": 9.716015815734863, - "learning_rate": 3.594212556202113e-06, - "loss": 0.4016, - "mean_token_accuracy": 0.8568425461649894, - "num_tokens": 192295045.0, - "step": 159820 - }, - { - "entropy": 1.8505840301513672, - "epoch": 0.49545901566928274, - "grad_norm": 4.513120174407959, - "learning_rate": 3.594100115131418e-06, - "loss": 0.4468, - "mean_token_accuracy": 0.8524532780051232, - "num_tokens": 192307333.0, - "step": 159830 - }, - { - "entropy": 1.889463298022747, - "epoch": 0.4954900147943324, - "grad_norm": 4.289556503295898, - "learning_rate": 3.5939876846128567e-06, - "loss": 0.4797, - "mean_token_accuracy": 0.8482072561979294, - "num_tokens": 192319247.0, - "step": 159840 - }, - { - "entropy": 1.8135507375001907, - "epoch": 0.49552101391938214, - "grad_norm": 3.4432010650634766, - "learning_rate": 3.5938752646447785e-06, - "loss": 0.3781, - "mean_token_accuracy": 0.8619725123047829, - "num_tokens": 192332304.0, - "step": 159850 - }, - { - "entropy": 1.7961521610617637, - "epoch": 0.4955520130444318, - "grad_norm": 3.9302399158477783, - "learning_rate": 3.5937628552255325e-06, - "loss": 0.3758, - "mean_token_accuracy": 0.8562786504626274, - "num_tokens": 192345314.0, - "step": 159860 - }, - { - "entropy": 1.8858400031924247, - "epoch": 0.49558301216948153, - "grad_norm": 8.590622901916504, - "learning_rate": 3.593650456353471e-06, - "loss": 0.4808, - "mean_token_accuracy": 0.8526899054646492, - "num_tokens": 192357612.0, - "step": 159870 - }, - { - "entropy": 1.9095351725816727, - "epoch": 0.4956140112945312, - "grad_norm": 7.664645671844482, - "learning_rate": 3.593538068026943e-06, - "loss": 0.4466, - "mean_token_accuracy": 0.8507430016994476, - "num_tokens": 192369306.0, - "step": 159880 - }, - { - "entropy": 1.8387876883149148, - "epoch": 0.4956450104195809, - "grad_norm": 9.930180549621582, - "learning_rate": 3.593425690244299e-06, - "loss": 0.4228, - "mean_token_accuracy": 0.8595908388495446, - "num_tokens": 192381626.0, - "step": 159890 - }, - { - "entropy": 1.9600819304585457, - "epoch": 0.4956760095446306, - "grad_norm": 9.37524127960205, - "learning_rate": 3.5933133230038935e-06, - "loss": 0.488, - "mean_token_accuracy": 0.8463097050786018, - "num_tokens": 192393259.0, - "step": 159900 - }, - { - "entropy": 1.9440770626068116, - "epoch": 0.4957070086696803, - "grad_norm": 7.7554755210876465, - "learning_rate": 3.5932009663040756e-06, - "loss": 0.472, - "mean_token_accuracy": 0.8595362901687622, - "num_tokens": 192404060.0, - "step": 159910 - }, - { - "entropy": 1.6832211181521415, - "epoch": 0.49573800779473, - "grad_norm": 3.3963851928710938, - "learning_rate": 3.593088620143198e-06, - "loss": 0.3069, - "mean_token_accuracy": 0.8765401244163513, - "num_tokens": 192418579.0, - "step": 159920 - }, - { - "entropy": 1.7505709946155548, - "epoch": 0.4957690069197797, - "grad_norm": 7.246809959411621, - "learning_rate": 3.5929762845196146e-06, - "loss": 0.3778, - "mean_token_accuracy": 0.8670257017016411, - "num_tokens": 192432317.0, - "step": 159930 - }, - { - "entropy": 1.8624306321144104, - "epoch": 0.4958000060448294, - "grad_norm": 4.187715530395508, - "learning_rate": 3.5928639594316765e-06, - "loss": 0.445, - "mean_token_accuracy": 0.8575484842061997, - "num_tokens": 192444479.0, - "step": 159940 - }, - { - "entropy": 1.8551115036010741, - "epoch": 0.4958310051698791, - "grad_norm": 3.9937570095062256, - "learning_rate": 3.592751644877738e-06, - "loss": 0.4007, - "mean_token_accuracy": 0.8715003669261933, - "num_tokens": 192456677.0, - "step": 159950 - }, - { - "entropy": 1.9346601396799088, - "epoch": 0.49586200429492877, - "grad_norm": 8.055962562561035, - "learning_rate": 3.5926393408561522e-06, - "loss": 0.4836, - "mean_token_accuracy": 0.8497431352734566, - "num_tokens": 192468137.0, - "step": 159960 - }, - { - "entropy": 1.8983276531100273, - "epoch": 0.4958930034199785, - "grad_norm": 8.280396461486816, - "learning_rate": 3.5925270473652735e-06, - "loss": 0.4381, - "mean_token_accuracy": 0.8556265756487846, - "num_tokens": 192479694.0, - "step": 159970 - }, - { - "entropy": 1.882783156633377, - "epoch": 0.49592400254502816, - "grad_norm": 9.567574501037598, - "learning_rate": 3.5924147644034557e-06, - "loss": 0.4529, - "mean_token_accuracy": 0.8511046752333641, - "num_tokens": 192491376.0, - "step": 159980 - }, - { - "entropy": 1.7969225481152535, - "epoch": 0.4959550016700779, - "grad_norm": 2.3194515705108643, - "learning_rate": 3.592302491969054e-06, - "loss": 0.3909, - "mean_token_accuracy": 0.866172443330288, - "num_tokens": 192504201.0, - "step": 159990 - }, - { - "entropy": 1.9083019599318505, - "epoch": 0.49598600079512756, - "grad_norm": 7.948028564453125, - "learning_rate": 3.5921902300604235e-06, - "loss": 0.4494, - "mean_token_accuracy": 0.8478529721498489, - "num_tokens": 192516145.0, - "step": 160000 - }, - { - "entropy": 1.8109319642186166, - "epoch": 0.4960169999201772, - "grad_norm": 8.787992477416992, - "learning_rate": 3.59207797867592e-06, - "loss": 0.3886, - "mean_token_accuracy": 0.8696772411465645, - "num_tokens": 192528786.0, - "step": 160010 - }, - { - "entropy": 1.8585580334067344, - "epoch": 0.49604799904522695, - "grad_norm": 3.6390159130096436, - "learning_rate": 3.591965737813897e-06, - "loss": 0.4232, - "mean_token_accuracy": 0.8561258107423783, - "num_tokens": 192541016.0, - "step": 160020 - }, - { - "entropy": 1.827420374751091, - "epoch": 0.4960789981702766, - "grad_norm": 7.844151020050049, - "learning_rate": 3.591853507472713e-06, - "loss": 0.4528, - "mean_token_accuracy": 0.8646694928407669, - "num_tokens": 192553566.0, - "step": 160030 - }, - { - "entropy": 1.9029534503817558, - "epoch": 0.49610999729532634, - "grad_norm": 8.42615795135498, - "learning_rate": 3.591741287650724e-06, - "loss": 0.4644, - "mean_token_accuracy": 0.8528878018260002, - "num_tokens": 192564840.0, - "step": 160040 - }, - { - "entropy": 1.9213525876402855, - "epoch": 0.496140996420376, - "grad_norm": 8.478250503540039, - "learning_rate": 3.5916290783462864e-06, - "loss": 0.5006, - "mean_token_accuracy": 0.8403630703687668, - "num_tokens": 192577087.0, - "step": 160050 - }, - { - "entropy": 1.8662138119339944, - "epoch": 0.49617199554542574, - "grad_norm": 8.441492080688477, - "learning_rate": 3.5915168795577587e-06, - "loss": 0.416, - "mean_token_accuracy": 0.8532093212008476, - "num_tokens": 192589686.0, - "step": 160060 - }, - { - "entropy": 1.8910415381193162, - "epoch": 0.4962029946704754, - "grad_norm": 6.746820449829102, - "learning_rate": 3.5914046912834966e-06, - "loss": 0.4201, - "mean_token_accuracy": 0.856334288418293, - "num_tokens": 192602111.0, - "step": 160070 - }, - { - "entropy": 1.9242399752140045, - "epoch": 0.49623399379552513, - "grad_norm": 8.525853157043457, - "learning_rate": 3.5912925135218584e-06, - "loss": 0.4753, - "mean_token_accuracy": 0.8545285657048225, - "num_tokens": 192613246.0, - "step": 160080 - }, - { - "entropy": 1.835014469921589, - "epoch": 0.4962649929205748, - "grad_norm": 3.318446159362793, - "learning_rate": 3.5911803462712035e-06, - "loss": 0.4093, - "mean_token_accuracy": 0.8679602593183517, - "num_tokens": 192625964.0, - "step": 160090 - }, - { - "entropy": 1.8478663966059685, - "epoch": 0.4962959920456245, - "grad_norm": 7.670246601104736, - "learning_rate": 3.5910681895298898e-06, - "loss": 0.4272, - "mean_token_accuracy": 0.8541574910283088, - "num_tokens": 192637795.0, - "step": 160100 - }, - { - "entropy": 1.901483315229416, - "epoch": 0.4963269911706742, - "grad_norm": 3.5240871906280518, - "learning_rate": 3.5909560432962764e-06, - "loss": 0.4324, - "mean_token_accuracy": 0.8594571843743324, - "num_tokens": 192649967.0, - "step": 160110 - }, - { - "entropy": 1.8992531165480613, - "epoch": 0.4963579902957239, - "grad_norm": 8.688932418823242, - "learning_rate": 3.590843907568723e-06, - "loss": 0.4436, - "mean_token_accuracy": 0.8527524411678314, - "num_tokens": 192661724.0, - "step": 160120 - }, - { - "entropy": 1.8817158490419388, - "epoch": 0.4963889894207736, - "grad_norm": 6.7711992263793945, - "learning_rate": 3.5907317823455882e-06, - "loss": 0.4296, - "mean_token_accuracy": 0.86692134141922, - "num_tokens": 192673692.0, - "step": 160130 - }, - { - "entropy": 1.8810399681329728, - "epoch": 0.4964199885458233, - "grad_norm": 3.6192641258239746, - "learning_rate": 3.5906196676252334e-06, - "loss": 0.4325, - "mean_token_accuracy": 0.8557695388793946, - "num_tokens": 192685719.0, - "step": 160140 - }, - { - "entropy": 1.763352060317993, - "epoch": 0.496450987670873, - "grad_norm": 8.72749137878418, - "learning_rate": 3.5905075634060187e-06, - "loss": 0.4211, - "mean_token_accuracy": 0.8585641667246818, - "num_tokens": 192699439.0, - "step": 160150 - }, - { - "entropy": 1.9361596137285233, - "epoch": 0.4964819867959227, - "grad_norm": 6.907578945159912, - "learning_rate": 3.590395469686304e-06, - "loss": 0.5622, - "mean_token_accuracy": 0.8400822654366493, - "num_tokens": 192710481.0, - "step": 160160 - }, - { - "entropy": 1.978668710589409, - "epoch": 0.49651298592097237, - "grad_norm": 8.254986763000488, - "learning_rate": 3.590283386464452e-06, - "loss": 0.5487, - "mean_token_accuracy": 0.8343606054782867, - "num_tokens": 192721151.0, - "step": 160170 - }, - { - "entropy": 1.8602470502257347, - "epoch": 0.4965439850460221, - "grad_norm": 6.900646686553955, - "learning_rate": 3.590171313738823e-06, - "loss": 0.4019, - "mean_token_accuracy": 0.8702385917305946, - "num_tokens": 192733018.0, - "step": 160180 - }, - { - "entropy": 1.8422987774014472, - "epoch": 0.49657498417107176, - "grad_norm": 8.578625679016113, - "learning_rate": 3.59005925150778e-06, - "loss": 0.4148, - "mean_token_accuracy": 0.8609873950481415, - "num_tokens": 192745197.0, - "step": 160190 - }, - { - "entropy": 1.944932959973812, - "epoch": 0.4966059832961215, - "grad_norm": 9.156232833862305, - "learning_rate": 3.589947199769683e-06, - "loss": 0.4781, - "mean_token_accuracy": 0.8525719374418259, - "num_tokens": 192756468.0, - "step": 160200 - }, - { - "entropy": 1.8785189107060432, - "epoch": 0.49663698242117116, - "grad_norm": 7.663604259490967, - "learning_rate": 3.5898351585228973e-06, - "loss": 0.4157, - "mean_token_accuracy": 0.8553120478987694, - "num_tokens": 192769114.0, - "step": 160210 - }, - { - "entropy": 1.8660555586218834, - "epoch": 0.4966679815462209, - "grad_norm": 7.651570796966553, - "learning_rate": 3.589723127765784e-06, - "loss": 0.4296, - "mean_token_accuracy": 0.8518196612596511, - "num_tokens": 192782159.0, - "step": 160220 - }, - { - "entropy": 1.8391312196850778, - "epoch": 0.49669898067127055, - "grad_norm": 4.205702304840088, - "learning_rate": 3.5896111074967082e-06, - "loss": 0.4375, - "mean_token_accuracy": 0.8440369963645935, - "num_tokens": 192795497.0, - "step": 160230 - }, - { - "entropy": 1.9263321340084076, - "epoch": 0.4967299797963203, - "grad_norm": 9.188881874084473, - "learning_rate": 3.589499097714031e-06, - "loss": 0.4668, - "mean_token_accuracy": 0.8481300055980683, - "num_tokens": 192807445.0, - "step": 160240 - }, - { - "entropy": 1.9688881516456604, - "epoch": 0.49676097892136994, - "grad_norm": 7.7191481590271, - "learning_rate": 3.589387098416119e-06, - "loss": 0.4924, - "mean_token_accuracy": 0.8576457172632217, - "num_tokens": 192818000.0, - "step": 160250 - }, - { - "entropy": 1.8553527995944024, - "epoch": 0.4967919780464196, - "grad_norm": 8.173221588134766, - "learning_rate": 3.5892751096013353e-06, - "loss": 0.4053, - "mean_token_accuracy": 0.8690877959132195, - "num_tokens": 192830549.0, - "step": 160260 - }, - { - "entropy": 1.9142266526818275, - "epoch": 0.49682297717146934, - "grad_norm": 3.9052417278289795, - "learning_rate": 3.5891631312680436e-06, - "loss": 0.4795, - "mean_token_accuracy": 0.8526837721467018, - "num_tokens": 192843309.0, - "step": 160270 - }, - { - "entropy": 1.9387848749756813, - "epoch": 0.496853976296519, - "grad_norm": 9.477102279663086, - "learning_rate": 3.5890511634146112e-06, - "loss": 0.4682, - "mean_token_accuracy": 0.8555059671401978, - "num_tokens": 192854568.0, - "step": 160280 - }, - { - "entropy": 1.8374897107481956, - "epoch": 0.49688497542156873, - "grad_norm": 7.315760135650635, - "learning_rate": 3.5889392060394016e-06, - "loss": 0.4744, - "mean_token_accuracy": 0.8532007947564125, - "num_tokens": 192867609.0, - "step": 160290 - }, - { - "entropy": 1.952652522921562, - "epoch": 0.4969159745466184, - "grad_norm": 8.343222618103027, - "learning_rate": 3.588827259140783e-06, - "loss": 0.4878, - "mean_token_accuracy": 0.8470484256744385, - "num_tokens": 192878254.0, - "step": 160300 - }, - { - "entropy": 2.024555891752243, - "epoch": 0.4969469736716681, - "grad_norm": 7.191964149475098, - "learning_rate": 3.5887153227171184e-06, - "loss": 0.4987, - "mean_token_accuracy": 0.8383811950683594, - "num_tokens": 192889067.0, - "step": 160310 - }, - { - "entropy": 1.8601422876119613, - "epoch": 0.4969779727967178, - "grad_norm": 7.714452266693115, - "learning_rate": 3.5886033967667773e-06, - "loss": 0.4423, - "mean_token_accuracy": 0.8559423238039017, - "num_tokens": 192901287.0, - "step": 160320 - }, - { - "entropy": 1.9518428400158883, - "epoch": 0.4970089719217675, - "grad_norm": 4.6776275634765625, - "learning_rate": 3.5884914812881245e-06, - "loss": 0.4943, - "mean_token_accuracy": 0.8465476736426354, - "num_tokens": 192912823.0, - "step": 160330 - }, - { - "entropy": 1.8527782797813415, - "epoch": 0.4970399710468172, - "grad_norm": 7.401676177978516, - "learning_rate": 3.5883795762795283e-06, - "loss": 0.407, - "mean_token_accuracy": 0.8594189703464508, - "num_tokens": 192926217.0, - "step": 160340 - }, - { - "entropy": 1.8976733207702636, - "epoch": 0.4970709701718669, - "grad_norm": 6.59383487701416, - "learning_rate": 3.588267681739356e-06, - "loss": 0.395, - "mean_token_accuracy": 0.8672205924987793, - "num_tokens": 192937314.0, - "step": 160350 - }, - { - "entropy": 1.874462467432022, - "epoch": 0.4971019692969166, - "grad_norm": 6.723745822906494, - "learning_rate": 3.588155797665975e-06, - "loss": 0.4479, - "mean_token_accuracy": 0.8523491159081459, - "num_tokens": 192949521.0, - "step": 160360 - }, - { - "entropy": 1.9262579411268235, - "epoch": 0.4971329684219663, - "grad_norm": 9.00383472442627, - "learning_rate": 3.588043924057755e-06, - "loss": 0.4431, - "mean_token_accuracy": 0.8581419870257377, - "num_tokens": 192960939.0, - "step": 160370 - }, - { - "entropy": 1.86645487844944, - "epoch": 0.497163967547016, - "grad_norm": 10.316017150878906, - "learning_rate": 3.5879320609130632e-06, - "loss": 0.42, - "mean_token_accuracy": 0.8479255259037017, - "num_tokens": 192973919.0, - "step": 160380 - }, - { - "entropy": 1.894903513789177, - "epoch": 0.4971949666720657, - "grad_norm": 8.902325630187988, - "learning_rate": 3.5878202082302687e-06, - "loss": 0.4436, - "mean_token_accuracy": 0.8499386295676231, - "num_tokens": 192985411.0, - "step": 160390 - }, - { - "entropy": 1.8823218569159508, - "epoch": 0.49722596579711537, - "grad_norm": 5.8220014572143555, - "learning_rate": 3.5877083660077423e-06, - "loss": 0.3909, - "mean_token_accuracy": 0.8693734228610992, - "num_tokens": 192997183.0, - "step": 160400 - }, - { - "entropy": 1.8421266853809357, - "epoch": 0.4972569649221651, - "grad_norm": 3.864579439163208, - "learning_rate": 3.5875965342438524e-06, - "loss": 0.3832, - "mean_token_accuracy": 0.8598260506987572, - "num_tokens": 193010293.0, - "step": 160410 - }, - { - "entropy": 1.836855800449848, - "epoch": 0.49728796404721476, - "grad_norm": 3.868010997772217, - "learning_rate": 3.5874847129369696e-06, - "loss": 0.4179, - "mean_token_accuracy": 0.8634832099080085, - "num_tokens": 193022967.0, - "step": 160420 - }, - { - "entropy": 1.716268916428089, - "epoch": 0.4973189631722645, - "grad_norm": 8.21786117553711, - "learning_rate": 3.5873729020854643e-06, - "loss": 0.3336, - "mean_token_accuracy": 0.875548966228962, - "num_tokens": 193037075.0, - "step": 160430 - }, - { - "entropy": 1.8625114277005195, - "epoch": 0.49734996229731415, - "grad_norm": 8.999544143676758, - "learning_rate": 3.5872611016877067e-06, - "loss": 0.4151, - "mean_token_accuracy": 0.8617020696401596, - "num_tokens": 193049323.0, - "step": 160440 - }, - { - "entropy": 1.85534148812294, - "epoch": 0.4973809614223639, - "grad_norm": 7.702752113342285, - "learning_rate": 3.5871493117420684e-06, - "loss": 0.4322, - "mean_token_accuracy": 0.8578760981559753, - "num_tokens": 193061093.0, - "step": 160450 - }, - { - "entropy": 1.8131542086601258, - "epoch": 0.49741196054741355, - "grad_norm": 7.689070224761963, - "learning_rate": 3.587037532246922e-06, - "loss": 0.4264, - "mean_token_accuracy": 0.8490845113992691, - "num_tokens": 193074196.0, - "step": 160460 - }, - { - "entropy": 1.8581334315240383, - "epoch": 0.49744295967246327, - "grad_norm": 8.081965446472168, - "learning_rate": 3.586925763200637e-06, - "loss": 0.4122, - "mean_token_accuracy": 0.8551263764500618, - "num_tokens": 193086783.0, - "step": 160470 - }, - { - "entropy": 1.8547850877046586, - "epoch": 0.49747395879751294, - "grad_norm": 7.127450466156006, - "learning_rate": 3.5868140046015883e-06, - "loss": 0.4176, - "mean_token_accuracy": 0.8548941016197205, - "num_tokens": 193099533.0, - "step": 160480 - }, - { - "entropy": 1.8142898052930831, - "epoch": 0.49750495792256266, - "grad_norm": 4.191645622253418, - "learning_rate": 3.586702256448146e-06, - "loss": 0.406, - "mean_token_accuracy": 0.8591264098882675, - "num_tokens": 193112575.0, - "step": 160490 - }, - { - "entropy": 1.927902916073799, - "epoch": 0.49753595704761233, - "grad_norm": 7.649588584899902, - "learning_rate": 3.5865905187386845e-06, - "loss": 0.4836, - "mean_token_accuracy": 0.848539587855339, - "num_tokens": 193124101.0, - "step": 160500 - }, - { - "entropy": 1.8359772473573686, - "epoch": 0.497566956172662, - "grad_norm": 3.336451530456543, - "learning_rate": 3.5864787914715773e-06, - "loss": 0.4197, - "mean_token_accuracy": 0.8483365759253502, - "num_tokens": 193136746.0, - "step": 160510 - }, - { - "entropy": 1.8549357965588569, - "epoch": 0.4975979552977117, - "grad_norm": 3.272468328475952, - "learning_rate": 3.5863670746451963e-06, - "loss": 0.3921, - "mean_token_accuracy": 0.8660996928811073, - "num_tokens": 193149274.0, - "step": 160520 - }, - { - "entropy": 1.907238420844078, - "epoch": 0.4976289544227614, - "grad_norm": 3.739243745803833, - "learning_rate": 3.5862553682579175e-06, - "loss": 0.4601, - "mean_token_accuracy": 0.8532083362340928, - "num_tokens": 193160807.0, - "step": 160530 - }, - { - "entropy": 1.8771161273121835, - "epoch": 0.4976599535478111, - "grad_norm": 7.761352062225342, - "learning_rate": 3.586143672308113e-06, - "loss": 0.4552, - "mean_token_accuracy": 0.851627166569233, - "num_tokens": 193173146.0, - "step": 160540 - }, - { - "entropy": 1.8144533932209015, - "epoch": 0.4976909526728608, - "grad_norm": 9.153278350830078, - "learning_rate": 3.58603198679416e-06, - "loss": 0.4255, - "mean_token_accuracy": 0.8531859219074249, - "num_tokens": 193185308.0, - "step": 160550 - }, - { - "entropy": 1.903257629275322, - "epoch": 0.4977219517979105, - "grad_norm": 9.370990753173828, - "learning_rate": 3.5859203117144324e-06, - "loss": 0.5, - "mean_token_accuracy": 0.8494491398334503, - "num_tokens": 193196560.0, - "step": 160560 - }, - { - "entropy": 1.7986585214734077, - "epoch": 0.4977529509229602, - "grad_norm": 3.870612144470215, - "learning_rate": 3.5858086470673054e-06, - "loss": 0.4082, - "mean_token_accuracy": 0.8564859181642532, - "num_tokens": 193209503.0, - "step": 160570 - }, - { - "entropy": 1.9573538348078727, - "epoch": 0.4977839500480099, - "grad_norm": 9.66162109375, - "learning_rate": 3.585696992851155e-06, - "loss": 0.4833, - "mean_token_accuracy": 0.8488394528627395, - "num_tokens": 193220876.0, - "step": 160580 - }, - { - "entropy": 1.8788496136665345, - "epoch": 0.4978149491730596, - "grad_norm": 8.454802513122559, - "learning_rate": 3.5855853490643573e-06, - "loss": 0.4499, - "mean_token_accuracy": 0.8537091195583344, - "num_tokens": 193232585.0, - "step": 160590 - }, - { - "entropy": 1.7184559665620327, - "epoch": 0.4978459482981093, - "grad_norm": 8.67977523803711, - "learning_rate": 3.5854737157052886e-06, - "loss": 0.3645, - "mean_token_accuracy": 0.8678611069917679, - "num_tokens": 193247245.0, - "step": 160600 - }, - { - "entropy": 1.8064767561852932, - "epoch": 0.49787694742315897, - "grad_norm": 2.588245153427124, - "learning_rate": 3.5853620927723265e-06, - "loss": 0.436, - "mean_token_accuracy": 0.860741664469242, - "num_tokens": 193260256.0, - "step": 160610 - }, - { - "entropy": 1.9140912219882011, - "epoch": 0.4979079465482087, - "grad_norm": 8.764533996582031, - "learning_rate": 3.5852504802638467e-06, - "loss": 0.5058, - "mean_token_accuracy": 0.8398638129234314, - "num_tokens": 193271290.0, - "step": 160620 - }, - { - "entropy": 1.796024279296398, - "epoch": 0.49793894567325836, - "grad_norm": 4.2609543800354, - "learning_rate": 3.5851388781782276e-06, - "loss": 0.3752, - "mean_token_accuracy": 0.8642449587583542, - "num_tokens": 193284344.0, - "step": 160630 - }, - { - "entropy": 1.82810877263546, - "epoch": 0.4979699447983081, - "grad_norm": 9.36631965637207, - "learning_rate": 3.5850272865138475e-06, - "loss": 0.4207, - "mean_token_accuracy": 0.8589912727475166, - "num_tokens": 193296415.0, - "step": 160640 - }, - { - "entropy": 1.8314507842063903, - "epoch": 0.49800094392335775, - "grad_norm": 8.399770736694336, - "learning_rate": 3.5849157052690836e-06, - "loss": 0.3946, - "mean_token_accuracy": 0.86473308801651, - "num_tokens": 193308563.0, - "step": 160650 - }, - { - "entropy": 1.8416701436042786, - "epoch": 0.4980319430484075, - "grad_norm": 8.284887313842773, - "learning_rate": 3.584804134442316e-06, - "loss": 0.4147, - "mean_token_accuracy": 0.8562467381358146, - "num_tokens": 193321328.0, - "step": 160660 - }, - { - "entropy": 1.8433082446455955, - "epoch": 0.49806294217345715, - "grad_norm": 3.750549554824829, - "learning_rate": 3.5846925740319214e-06, - "loss": 0.4251, - "mean_token_accuracy": 0.8653277009725571, - "num_tokens": 193333110.0, - "step": 160670 - }, - { - "entropy": 1.8722620084881783, - "epoch": 0.49809394129850687, - "grad_norm": 9.886913299560547, - "learning_rate": 3.5845810240362815e-06, - "loss": 0.4267, - "mean_token_accuracy": 0.857438001036644, - "num_tokens": 193345144.0, - "step": 160680 - }, - { - "entropy": 1.8812648728489876, - "epoch": 0.49812494042355654, - "grad_norm": 3.8210055828094482, - "learning_rate": 3.584469484453774e-06, - "loss": 0.4351, - "mean_token_accuracy": 0.850870543718338, - "num_tokens": 193357243.0, - "step": 160690 - }, - { - "entropy": 1.751509742438793, - "epoch": 0.49815593954860626, - "grad_norm": 8.921031951904297, - "learning_rate": 3.5843579552827802e-06, - "loss": 0.4664, - "mean_token_accuracy": 0.8553698793053627, - "num_tokens": 193370914.0, - "step": 160700 - }, - { - "entropy": 1.8664864271879196, - "epoch": 0.49818693867365593, - "grad_norm": 3.878030300140381, - "learning_rate": 3.5842464365216806e-06, - "loss": 0.4201, - "mean_token_accuracy": 0.8606648445129395, - "num_tokens": 193383110.0, - "step": 160710 - }, - { - "entropy": 1.844856907427311, - "epoch": 0.49821793779870566, - "grad_norm": 8.737525939941406, - "learning_rate": 3.584134928168854e-06, - "loss": 0.3944, - "mean_token_accuracy": 0.8666700348258018, - "num_tokens": 193395446.0, - "step": 160720 - }, - { - "entropy": 1.8013098880648613, - "epoch": 0.4982489369237553, - "grad_norm": 8.164528846740723, - "learning_rate": 3.5840234302226833e-06, - "loss": 0.425, - "mean_token_accuracy": 0.8570413291454315, - "num_tokens": 193408380.0, - "step": 160730 - }, - { - "entropy": 1.9012565463781357, - "epoch": 0.49827993604880505, - "grad_norm": 8.635117530822754, - "learning_rate": 3.58391194268155e-06, - "loss": 0.4785, - "mean_token_accuracy": 0.8446336343884469, - "num_tokens": 193420383.0, - "step": 160740 - }, - { - "entropy": 1.9297475934028625, - "epoch": 0.4983109351738547, - "grad_norm": 8.393662452697754, - "learning_rate": 3.5838004655438347e-06, - "loss": 0.4535, - "mean_token_accuracy": 0.8500937014818192, - "num_tokens": 193431556.0, - "step": 160750 - }, - { - "entropy": 1.875514169037342, - "epoch": 0.4983419342989044, - "grad_norm": 10.568635940551758, - "learning_rate": 3.5836889988079206e-06, - "loss": 0.4371, - "mean_token_accuracy": 0.8593442022800446, - "num_tokens": 193443133.0, - "step": 160760 - }, - { - "entropy": 1.8471161857247353, - "epoch": 0.4983729334239541, - "grad_norm": 8.708873748779297, - "learning_rate": 3.5835775424721886e-06, - "loss": 0.4227, - "mean_token_accuracy": 0.8582314670085907, - "num_tokens": 193455329.0, - "step": 160770 - }, - { - "entropy": 1.9030718505382538, - "epoch": 0.4984039325490038, - "grad_norm": 8.513256072998047, - "learning_rate": 3.583466096535023e-06, - "loss": 0.494, - "mean_token_accuracy": 0.8403557062149047, - "num_tokens": 193466580.0, - "step": 160780 - }, - { - "entropy": 1.7898402735590935, - "epoch": 0.4984349316740535, - "grad_norm": 3.610626697540283, - "learning_rate": 3.583354660994807e-06, - "loss": 0.3997, - "mean_token_accuracy": 0.8605961218476296, - "num_tokens": 193479729.0, - "step": 160790 - }, - { - "entropy": 1.8853804931044578, - "epoch": 0.4984659307991032, - "grad_norm": 7.379966735839844, - "learning_rate": 3.5832432358499227e-06, - "loss": 0.5226, - "mean_token_accuracy": 0.8424478873610497, - "num_tokens": 193491462.0, - "step": 160800 - }, - { - "entropy": 1.8701233610510826, - "epoch": 0.4984969299241529, - "grad_norm": 8.146286010742188, - "learning_rate": 3.5831318210987557e-06, - "loss": 0.3963, - "mean_token_accuracy": 0.8631499618291855, - "num_tokens": 193503353.0, - "step": 160810 - }, - { - "entropy": 1.8317988231778144, - "epoch": 0.49852792904920257, - "grad_norm": 7.9895758628845215, - "learning_rate": 3.583020416739689e-06, - "loss": 0.3922, - "mean_token_accuracy": 0.8609337940812111, - "num_tokens": 193516186.0, - "step": 160820 - }, - { - "entropy": 1.8742655038833618, - "epoch": 0.4985589281742523, - "grad_norm": 8.589852333068848, - "learning_rate": 3.582909022771108e-06, - "loss": 0.442, - "mean_token_accuracy": 0.8622861579060555, - "num_tokens": 193527160.0, - "step": 160830 - }, - { - "entropy": 1.8188278570771217, - "epoch": 0.49858992729930196, - "grad_norm": 3.9392247200012207, - "learning_rate": 3.582797639191397e-06, - "loss": 0.4083, - "mean_token_accuracy": 0.8563838452100754, - "num_tokens": 193539448.0, - "step": 160840 - }, - { - "entropy": 1.7949530065059662, - "epoch": 0.4986209264243517, - "grad_norm": 7.496416091918945, - "learning_rate": 3.5826862659989416e-06, - "loss": 0.3885, - "mean_token_accuracy": 0.858404703438282, - "num_tokens": 193552034.0, - "step": 160850 - }, - { - "entropy": 1.900011982023716, - "epoch": 0.49865192554940135, - "grad_norm": 3.616969108581543, - "learning_rate": 3.582574903192127e-06, - "loss": 0.4624, - "mean_token_accuracy": 0.8452894240617752, - "num_tokens": 193563777.0, - "step": 160860 - }, - { - "entropy": 1.8028769597411156, - "epoch": 0.4986829246744511, - "grad_norm": 9.061861991882324, - "learning_rate": 3.5824635507693403e-06, - "loss": 0.4553, - "mean_token_accuracy": 0.8631314247846603, - "num_tokens": 193577121.0, - "step": 160870 - }, - { - "entropy": 1.8786051549017428, - "epoch": 0.49871392379950075, - "grad_norm": 7.7923054695129395, - "learning_rate": 3.5823522087289664e-06, - "loss": 0.4468, - "mean_token_accuracy": 0.8566824227571488, - "num_tokens": 193588577.0, - "step": 160880 - }, - { - "entropy": 1.8396636798977852, - "epoch": 0.49874492292455047, - "grad_norm": 3.286170721054077, - "learning_rate": 3.582240877069393e-06, - "loss": 0.4473, - "mean_token_accuracy": 0.8455423668026925, - "num_tokens": 193601807.0, - "step": 160890 - }, - { - "entropy": 1.9669003546237946, - "epoch": 0.49877592204960014, - "grad_norm": 8.088907241821289, - "learning_rate": 3.5821295557890062e-06, - "loss": 0.5321, - "mean_token_accuracy": 0.8448588579893113, - "num_tokens": 193612280.0, - "step": 160900 - }, - { - "entropy": 1.8892876401543617, - "epoch": 0.49880692117464986, - "grad_norm": 7.942569255828857, - "learning_rate": 3.582018244886195e-06, - "loss": 0.4585, - "mean_token_accuracy": 0.8533686950802803, - "num_tokens": 193623951.0, - "step": 160910 - }, - { - "entropy": 1.9404545783996583, - "epoch": 0.49883792029969953, - "grad_norm": 6.952447891235352, - "learning_rate": 3.581906944359346e-06, - "loss": 0.5396, - "mean_token_accuracy": 0.8299561634659767, - "num_tokens": 193634649.0, - "step": 160920 - }, - { - "entropy": 1.872673834860325, - "epoch": 0.49886891942474926, - "grad_norm": 8.344843864440918, - "learning_rate": 3.5817956542068466e-06, - "loss": 0.4419, - "mean_token_accuracy": 0.862852719426155, - "num_tokens": 193646222.0, - "step": 160930 - }, - { - "entropy": 1.8161238446831702, - "epoch": 0.4988999185497989, - "grad_norm": 4.470328330993652, - "learning_rate": 3.581684374427087e-06, - "loss": 0.4024, - "mean_token_accuracy": 0.8478014081716537, - "num_tokens": 193659437.0, - "step": 160940 - }, - { - "entropy": 1.9250866621732712, - "epoch": 0.49893091767484865, - "grad_norm": 8.126724243164062, - "learning_rate": 3.581573105018454e-06, - "loss": 0.4955, - "mean_token_accuracy": 0.8495612889528275, - "num_tokens": 193670140.0, - "step": 160950 - }, - { - "entropy": 1.8661917194724083, - "epoch": 0.4989619167998983, - "grad_norm": 9.641986846923828, - "learning_rate": 3.581461845979339e-06, - "loss": 0.5094, - "mean_token_accuracy": 0.8382344901561737, - "num_tokens": 193682472.0, - "step": 160960 - }, - { - "entropy": 1.8382489174604415, - "epoch": 0.49899291592494804, - "grad_norm": 8.112931251525879, - "learning_rate": 3.5813505973081294e-06, - "loss": 0.4663, - "mean_token_accuracy": 0.8525958672165871, - "num_tokens": 193695088.0, - "step": 160970 - }, - { - "entropy": 1.8945767790079118, - "epoch": 0.4990239150499977, - "grad_norm": 6.302922248840332, - "learning_rate": 3.5812393590032156e-06, - "loss": 0.4418, - "mean_token_accuracy": 0.8630844354629517, - "num_tokens": 193706455.0, - "step": 160980 - }, - { - "entropy": 1.8268967524170876, - "epoch": 0.4990549141750474, - "grad_norm": 7.509896755218506, - "learning_rate": 3.581128131062989e-06, - "loss": 0.411, - "mean_token_accuracy": 0.8638947933912278, - "num_tokens": 193719141.0, - "step": 160990 - }, - { - "entropy": 1.869875578582287, - "epoch": 0.4990859133000971, - "grad_norm": 7.7595415115356445, - "learning_rate": 3.58101691348584e-06, - "loss": 0.4414, - "mean_token_accuracy": 0.8568534716963768, - "num_tokens": 193731463.0, - "step": 161000 - }, - { - "entropy": 1.8743370115756988, - "epoch": 0.4991169124251468, - "grad_norm": 8.525010108947754, - "learning_rate": 3.580905706270157e-06, - "loss": 0.4557, - "mean_token_accuracy": 0.8406820744276047, - "num_tokens": 193743131.0, - "step": 161010 - }, - { - "entropy": 1.8488017559051513, - "epoch": 0.4991479115501965, - "grad_norm": 3.2859580516815186, - "learning_rate": 3.5807945094143338e-06, - "loss": 0.4125, - "mean_token_accuracy": 0.8638896048069, - "num_tokens": 193755108.0, - "step": 161020 - }, - { - "entropy": 1.9557087212800979, - "epoch": 0.49917891067524617, - "grad_norm": 12.029139518737793, - "learning_rate": 3.580683322916761e-06, - "loss": 0.4921, - "mean_token_accuracy": 0.8448839083313942, - "num_tokens": 193766392.0, - "step": 161030 - }, - { - "entropy": 1.8913592636585235, - "epoch": 0.4992099098002959, - "grad_norm": 7.942804336547852, - "learning_rate": 3.580572146775831e-06, - "loss": 0.4635, - "mean_token_accuracy": 0.8532082542777062, - "num_tokens": 193778253.0, - "step": 161040 - }, - { - "entropy": 1.9256371706724167, - "epoch": 0.49924090892534556, - "grad_norm": 8.954080581665039, - "learning_rate": 3.5804609809899356e-06, - "loss": 0.4912, - "mean_token_accuracy": 0.8524809762835502, - "num_tokens": 193789236.0, - "step": 161050 - }, - { - "entropy": 1.8400282293558121, - "epoch": 0.4992719080503953, - "grad_norm": 7.780353546142578, - "learning_rate": 3.5803498255574676e-06, - "loss": 0.4207, - "mean_token_accuracy": 0.8595440477132797, - "num_tokens": 193801955.0, - "step": 161060 - }, - { - "entropy": 1.8195059970021248, - "epoch": 0.49930290717544495, - "grad_norm": 8.77364730834961, - "learning_rate": 3.5802386804768204e-06, - "loss": 0.4038, - "mean_token_accuracy": 0.8579374149441719, - "num_tokens": 193814514.0, - "step": 161070 - }, - { - "entropy": 1.9665822595357896, - "epoch": 0.4993339063004947, - "grad_norm": 10.273021697998047, - "learning_rate": 3.580127545746387e-06, - "loss": 0.5165, - "mean_token_accuracy": 0.8401234805583954, - "num_tokens": 193825591.0, - "step": 161080 - }, - { - "entropy": 1.903402642905712, - "epoch": 0.49936490542554435, - "grad_norm": 7.531785011291504, - "learning_rate": 3.5800164213645606e-06, - "loss": 0.4651, - "mean_token_accuracy": 0.8556138724088669, - "num_tokens": 193837687.0, - "step": 161090 - }, - { - "entropy": 1.8461224928498268, - "epoch": 0.49939590455059407, - "grad_norm": 7.117091655731201, - "learning_rate": 3.5799053073297356e-06, - "loss": 0.406, - "mean_token_accuracy": 0.8609979644417762, - "num_tokens": 193849622.0, - "step": 161100 - }, - { - "entropy": 1.9408129811286927, - "epoch": 0.49942690367564374, - "grad_norm": 8.56417465209961, - "learning_rate": 3.579794203640307e-06, - "loss": 0.4863, - "mean_token_accuracy": 0.8458396375179291, - "num_tokens": 193861216.0, - "step": 161110 - }, - { - "entropy": 1.8642560109496116, - "epoch": 0.49945790280069347, - "grad_norm": 8.209739685058594, - "learning_rate": 3.579683110294669e-06, - "loss": 0.4376, - "mean_token_accuracy": 0.8453632459044457, - "num_tokens": 193873699.0, - "step": 161120 - }, - { - "entropy": 1.935758689045906, - "epoch": 0.49948890192574313, - "grad_norm": 8.520611763000488, - "learning_rate": 3.5795720272912166e-06, - "loss": 0.4864, - "mean_token_accuracy": 0.8472222432494163, - "num_tokens": 193884501.0, - "step": 161130 - }, - { - "entropy": 1.7907845377922058, - "epoch": 0.49951990105079286, - "grad_norm": 3.21339750289917, - "learning_rate": 3.5794609546283445e-06, - "loss": 0.4153, - "mean_token_accuracy": 0.8530368015170098, - "num_tokens": 193898149.0, - "step": 161140 - }, - { - "entropy": 1.9477273747324944, - "epoch": 0.4995509001758425, - "grad_norm": 8.93225383758545, - "learning_rate": 3.5793498923044502e-06, - "loss": 0.5176, - "mean_token_accuracy": 0.839634670317173, - "num_tokens": 193909283.0, - "step": 161150 - }, - { - "entropy": 1.8608055517077446, - "epoch": 0.49958189930089225, - "grad_norm": 8.591536521911621, - "learning_rate": 3.579238840317929e-06, - "loss": 0.4194, - "mean_token_accuracy": 0.8619609922170639, - "num_tokens": 193921871.0, - "step": 161160 - }, - { - "entropy": 1.903657865524292, - "epoch": 0.4996128984259419, - "grad_norm": 6.806171417236328, - "learning_rate": 3.579127798667177e-06, - "loss": 0.4606, - "mean_token_accuracy": 0.8534547924995423, - "num_tokens": 193933281.0, - "step": 161170 - }, - { - "entropy": 1.8228919118642808, - "epoch": 0.49964389755099164, - "grad_norm": 5.248661041259766, - "learning_rate": 3.579016767350592e-06, - "loss": 0.3961, - "mean_token_accuracy": 0.8604255363345146, - "num_tokens": 193946044.0, - "step": 161180 - }, - { - "entropy": 1.8331756204366685, - "epoch": 0.4996748966760413, - "grad_norm": 10.0490140914917, - "learning_rate": 3.5789057463665692e-06, - "loss": 0.4363, - "mean_token_accuracy": 0.8582054451107979, - "num_tokens": 193958910.0, - "step": 161190 - }, - { - "entropy": 1.839639674127102, - "epoch": 0.49970589580109104, - "grad_norm": 10.93938159942627, - "learning_rate": 3.578794735713509e-06, - "loss": 0.4319, - "mean_token_accuracy": 0.854024501144886, - "num_tokens": 193970762.0, - "step": 161200 - }, - { - "entropy": 1.910327608883381, - "epoch": 0.4997368949261407, - "grad_norm": 9.916841506958008, - "learning_rate": 3.578683735389807e-06, - "loss": 0.4878, - "mean_token_accuracy": 0.84305549710989, - "num_tokens": 193981939.0, - "step": 161210 - }, - { - "entropy": 1.8969020605087281, - "epoch": 0.49976789405119043, - "grad_norm": 8.263409614562988, - "learning_rate": 3.5785727453938623e-06, - "loss": 0.4523, - "mean_token_accuracy": 0.853573352098465, - "num_tokens": 193993370.0, - "step": 161220 - }, - { - "entropy": 1.8600556001067161, - "epoch": 0.4997988931762401, - "grad_norm": 8.200851440429688, - "learning_rate": 3.578461765724073e-06, - "loss": 0.4654, - "mean_token_accuracy": 0.8544128373265266, - "num_tokens": 194004957.0, - "step": 161230 - }, - { - "entropy": 1.868491704761982, - "epoch": 0.49982989230128977, - "grad_norm": 8.064600944519043, - "learning_rate": 3.5783507963788377e-06, - "loss": 0.4525, - "mean_token_accuracy": 0.8572847083210945, - "num_tokens": 194016347.0, - "step": 161240 - }, - { - "entropy": 1.904083575308323, - "epoch": 0.4998608914263395, - "grad_norm": 8.280893325805664, - "learning_rate": 3.5782398373565575e-06, - "loss": 0.5018, - "mean_token_accuracy": 0.8400688841938972, - "num_tokens": 194028152.0, - "step": 161250 - }, - { - "entropy": 1.8171085551381112, - "epoch": 0.49989189055138916, - "grad_norm": 10.810096740722656, - "learning_rate": 3.5781288886556296e-06, - "loss": 0.3941, - "mean_token_accuracy": 0.8669310986995697, - "num_tokens": 194040145.0, - "step": 161260 - }, - { - "entropy": 1.8588432848453522, - "epoch": 0.4999228896764389, - "grad_norm": 8.02641773223877, - "learning_rate": 3.578017950274456e-06, - "loss": 0.4678, - "mean_token_accuracy": 0.8487448245286942, - "num_tokens": 194051969.0, - "step": 161270 - }, - { - "entropy": 1.8924987077713014, - "epoch": 0.49995388880148856, - "grad_norm": 7.1130523681640625, - "learning_rate": 3.577907022211436e-06, - "loss": 0.4199, - "mean_token_accuracy": 0.8623496115207672, - "num_tokens": 194063394.0, - "step": 161280 - }, - { - "entropy": 1.876361007988453, - "epoch": 0.4999848879265383, - "grad_norm": 9.621905326843262, - "learning_rate": 3.57779610446497e-06, - "loss": 0.4444, - "mean_token_accuracy": 0.8544125527143478, - "num_tokens": 194074979.0, - "step": 161290 - }, - { - "entropy": 1.8698507621884346, - "epoch": 0.500015887051588, - "grad_norm": 6.634136199951172, - "learning_rate": 3.5776851970334595e-06, - "loss": 0.451, - "mean_token_accuracy": 0.8547692626714707, - "num_tokens": 194086868.0, - "step": 161300 - }, - { - "entropy": 1.8418845430016517, - "epoch": 0.5000468861766376, - "grad_norm": 8.004556655883789, - "learning_rate": 3.5775742999153062e-06, - "loss": 0.4005, - "mean_token_accuracy": 0.8597453564405442, - "num_tokens": 194099111.0, - "step": 161310 - }, - { - "entropy": 1.8818573281168938, - "epoch": 0.5000778853016874, - "grad_norm": 7.51983642578125, - "learning_rate": 3.577463413108911e-06, - "loss": 0.4688, - "mean_token_accuracy": 0.8519199967384339, - "num_tokens": 194110983.0, - "step": 161320 - }, - { - "entropy": 1.7738328106701373, - "epoch": 0.5001088844267371, - "grad_norm": 2.6278769969940186, - "learning_rate": 3.5773525366126755e-06, - "loss": 0.3906, - "mean_token_accuracy": 0.8709217235445976, - "num_tokens": 194125109.0, - "step": 161330 - }, - { - "entropy": 1.8441026404500007, - "epoch": 0.5001398835517867, - "grad_norm": 4.450294494628906, - "learning_rate": 3.5772416704250034e-06, - "loss": 0.4566, - "mean_token_accuracy": 0.8480847701430321, - "num_tokens": 194137288.0, - "step": 161340 - }, - { - "entropy": 1.8499442219734192, - "epoch": 0.5001708826768364, - "grad_norm": 9.552627563476562, - "learning_rate": 3.5771308145442972e-06, - "loss": 0.4164, - "mean_token_accuracy": 0.8577769994735718, - "num_tokens": 194149433.0, - "step": 161350 - }, - { - "entropy": 1.9184771940112113, - "epoch": 0.5002018818018862, - "grad_norm": 8.634852409362793, - "learning_rate": 3.5770199689689593e-06, - "loss": 0.4814, - "mean_token_accuracy": 0.8490300461649894, - "num_tokens": 194160817.0, - "step": 161360 - }, - { - "entropy": 1.8820372819900513, - "epoch": 0.5002328809269359, - "grad_norm": 3.9401602745056152, - "learning_rate": 3.576909133697393e-06, - "loss": 0.4392, - "mean_token_accuracy": 0.8586199179291725, - "num_tokens": 194172116.0, - "step": 161370 - }, - { - "entropy": 1.8695161834359169, - "epoch": 0.5002638800519855, - "grad_norm": 10.332625389099121, - "learning_rate": 3.5767983087280032e-06, - "loss": 0.4621, - "mean_token_accuracy": 0.8503239244222641, - "num_tokens": 194183969.0, - "step": 161380 - }, - { - "entropy": 1.8113522306084633, - "epoch": 0.5002948791770352, - "grad_norm": 8.187897682189941, - "learning_rate": 3.576687494059193e-06, - "loss": 0.4269, - "mean_token_accuracy": 0.8534181043505669, - "num_tokens": 194197490.0, - "step": 161390 - }, - { - "entropy": 1.8908743023872376, - "epoch": 0.500325878302085, - "grad_norm": 8.39079475402832, - "learning_rate": 3.5765766896893673e-06, - "loss": 0.4376, - "mean_token_accuracy": 0.8607093915343285, - "num_tokens": 194208821.0, - "step": 161400 - }, - { - "entropy": 1.9498828038573266, - "epoch": 0.5003568774271346, - "grad_norm": 10.169794082641602, - "learning_rate": 3.5764658956169306e-06, - "loss": 0.5227, - "mean_token_accuracy": 0.8365338191390037, - "num_tokens": 194220290.0, - "step": 161410 - }, - { - "entropy": 1.884354117512703, - "epoch": 0.5003878765521843, - "grad_norm": 9.11744213104248, - "learning_rate": 3.5763551118402885e-06, - "loss": 0.4548, - "mean_token_accuracy": 0.8483538582921029, - "num_tokens": 194231909.0, - "step": 161420 - }, - { - "entropy": 1.916728711128235, - "epoch": 0.500418875677234, - "grad_norm": 9.879512786865234, - "learning_rate": 3.576244338357846e-06, - "loss": 0.5, - "mean_token_accuracy": 0.8416429966688156, - "num_tokens": 194242891.0, - "step": 161430 - }, - { - "entropy": 1.902044305205345, - "epoch": 0.5004498748022838, - "grad_norm": 7.635390281677246, - "learning_rate": 3.5761335751680097e-06, - "loss": 0.4243, - "mean_token_accuracy": 0.8647704660892487, - "num_tokens": 194254117.0, - "step": 161440 - }, - { - "entropy": 1.85581027418375, - "epoch": 0.5004808739273334, - "grad_norm": 4.087507724761963, - "learning_rate": 3.5760228222691847e-06, - "loss": 0.4701, - "mean_token_accuracy": 0.8528918892145156, - "num_tokens": 194267450.0, - "step": 161450 - }, - { - "entropy": 1.8870635867118835, - "epoch": 0.5005118730523831, - "grad_norm": 4.798098087310791, - "learning_rate": 3.5759120796597783e-06, - "loss": 0.4226, - "mean_token_accuracy": 0.8504547387361526, - "num_tokens": 194279941.0, - "step": 161460 - }, - { - "entropy": 1.8962156638503074, - "epoch": 0.5005428721774328, - "grad_norm": 8.878067016601562, - "learning_rate": 3.575801347338197e-06, - "loss": 0.4982, - "mean_token_accuracy": 0.8517125174403191, - "num_tokens": 194292113.0, - "step": 161470 - }, - { - "entropy": 1.9940287500619889, - "epoch": 0.5005738713024825, - "grad_norm": 9.91559886932373, - "learning_rate": 3.5756906253028483e-06, - "loss": 0.4889, - "mean_token_accuracy": 0.8439011916518211, - "num_tokens": 194302629.0, - "step": 161480 - }, - { - "entropy": 1.9340040653944015, - "epoch": 0.5006048704275322, - "grad_norm": 9.120388984680176, - "learning_rate": 3.5755799135521397e-06, - "loss": 0.4423, - "mean_token_accuracy": 0.864893200993538, - "num_tokens": 194313712.0, - "step": 161490 - }, - { - "entropy": 1.8403319254517556, - "epoch": 0.5006358695525819, - "grad_norm": 8.813713073730469, - "learning_rate": 3.5754692120844784e-06, - "loss": 0.398, - "mean_token_accuracy": 0.8584855198860168, - "num_tokens": 194326868.0, - "step": 161500 - }, - { - "entropy": 1.9156290888786316, - "epoch": 0.5006668686776315, - "grad_norm": 8.051271438598633, - "learning_rate": 3.575358520898275e-06, - "loss": 0.4642, - "mean_token_accuracy": 0.8504732191562653, - "num_tokens": 194339370.0, - "step": 161510 - }, - { - "entropy": 1.8070098072290421, - "epoch": 0.5006978678026812, - "grad_norm": 8.109489440917969, - "learning_rate": 3.5752478399919354e-06, - "loss": 0.4026, - "mean_token_accuracy": 0.8623450741171836, - "num_tokens": 194352815.0, - "step": 161520 - }, - { - "entropy": 1.8298007018864155, - "epoch": 0.500728866927731, - "grad_norm": 2.6040987968444824, - "learning_rate": 3.5751371693638696e-06, - "loss": 0.4278, - "mean_token_accuracy": 0.8597873091697693, - "num_tokens": 194365615.0, - "step": 161530 - }, - { - "entropy": 1.8470653221011162, - "epoch": 0.5007598660527807, - "grad_norm": 7.487679958343506, - "learning_rate": 3.575026509012487e-06, - "loss": 0.4099, - "mean_token_accuracy": 0.8588323727250099, - "num_tokens": 194377759.0, - "step": 161540 - }, - { - "entropy": 1.7974566683173179, - "epoch": 0.5007908651778303, - "grad_norm": 7.743740081787109, - "learning_rate": 3.5749158589361976e-06, - "loss": 0.3927, - "mean_token_accuracy": 0.8658158406615257, - "num_tokens": 194390750.0, - "step": 161550 - }, - { - "entropy": 1.8499557554721833, - "epoch": 0.50082186430288, - "grad_norm": 8.951606750488281, - "learning_rate": 3.5748052191334103e-06, - "loss": 0.5034, - "mean_token_accuracy": 0.8367986068129539, - "num_tokens": 194403015.0, - "step": 161560 - }, - { - "entropy": 1.8279045611619948, - "epoch": 0.5008528634279298, - "grad_norm": 3.8482038974761963, - "learning_rate": 3.5746945896025365e-06, - "loss": 0.3951, - "mean_token_accuracy": 0.8634175881743431, - "num_tokens": 194415227.0, - "step": 161570 - }, - { - "entropy": 1.8749937638640404, - "epoch": 0.5008838625529795, - "grad_norm": 9.023499488830566, - "learning_rate": 3.574583970341986e-06, - "loss": 0.4488, - "mean_token_accuracy": 0.8496809035539628, - "num_tokens": 194427501.0, - "step": 161580 - }, - { - "entropy": 1.8510484382510186, - "epoch": 0.5009148616780291, - "grad_norm": 6.773876190185547, - "learning_rate": 3.5744733613501707e-06, - "loss": 0.4285, - "mean_token_accuracy": 0.855115358531475, - "num_tokens": 194439878.0, - "step": 161590 - }, - { - "entropy": 1.80962725430727, - "epoch": 0.5009458608030788, - "grad_norm": 8.465936660766602, - "learning_rate": 3.574362762625502e-06, - "loss": 0.4577, - "mean_token_accuracy": 0.8550980076193809, - "num_tokens": 194453476.0, - "step": 161600 - }, - { - "entropy": 1.8801057904958725, - "epoch": 0.5009768599281286, - "grad_norm": 8.899097442626953, - "learning_rate": 3.57425217416639e-06, - "loss": 0.521, - "mean_token_accuracy": 0.8410529911518096, - "num_tokens": 194465486.0, - "step": 161610 - }, - { - "entropy": 1.8403988644480704, - "epoch": 0.5010078590531782, - "grad_norm": 10.057798385620117, - "learning_rate": 3.5741415959712482e-06, - "loss": 0.4447, - "mean_token_accuracy": 0.85611502379179, - "num_tokens": 194477958.0, - "step": 161620 - }, - { - "entropy": 1.8564189299941063, - "epoch": 0.5010388581782279, - "grad_norm": 7.379710674285889, - "learning_rate": 3.5740310280384887e-06, - "loss": 0.4527, - "mean_token_accuracy": 0.8523194447159768, - "num_tokens": 194490764.0, - "step": 161630 - }, - { - "entropy": 1.864148397743702, - "epoch": 0.5010698573032776, - "grad_norm": 7.809131622314453, - "learning_rate": 3.5739204703665244e-06, - "loss": 0.4633, - "mean_token_accuracy": 0.8495198383927345, - "num_tokens": 194503509.0, - "step": 161640 - }, - { - "entropy": 1.916366446018219, - "epoch": 0.5011008564283274, - "grad_norm": 7.292889595031738, - "learning_rate": 3.573809922953768e-06, - "loss": 0.5339, - "mean_token_accuracy": 0.8405555829405784, - "num_tokens": 194515043.0, - "step": 161650 - }, - { - "entropy": 1.8589420065283775, - "epoch": 0.501131855553377, - "grad_norm": 3.132341146469116, - "learning_rate": 3.5736993857986335e-06, - "loss": 0.431, - "mean_token_accuracy": 0.8655919685959816, - "num_tokens": 194527318.0, - "step": 161660 - }, - { - "entropy": 1.925423364341259, - "epoch": 0.5011628546784267, - "grad_norm": 4.025610446929932, - "learning_rate": 3.573588858899534e-06, - "loss": 0.4783, - "mean_token_accuracy": 0.8521231457591056, - "num_tokens": 194538326.0, - "step": 161670 - }, - { - "entropy": 1.8823940232396126, - "epoch": 0.5011938538034764, - "grad_norm": 7.982590675354004, - "learning_rate": 3.5734783422548842e-06, - "loss": 0.4819, - "mean_token_accuracy": 0.843793198466301, - "num_tokens": 194550394.0, - "step": 161680 - }, - { - "entropy": 1.8528065443038941, - "epoch": 0.5012248529285261, - "grad_norm": 9.233685493469238, - "learning_rate": 3.5733678358630976e-06, - "loss": 0.4445, - "mean_token_accuracy": 0.8438840091228486, - "num_tokens": 194562639.0, - "step": 161690 - }, - { - "entropy": 1.8516671895980834, - "epoch": 0.5012558520535758, - "grad_norm": 3.61114239692688, - "learning_rate": 3.57325733972259e-06, - "loss": 0.4248, - "mean_token_accuracy": 0.851916354894638, - "num_tokens": 194574471.0, - "step": 161700 - }, - { - "entropy": 1.7707113809883595, - "epoch": 0.5012868511786255, - "grad_norm": 3.688938856124878, - "learning_rate": 3.5731468538317764e-06, - "loss": 0.3455, - "mean_token_accuracy": 0.8591636136174202, - "num_tokens": 194589266.0, - "step": 161710 - }, - { - "entropy": 1.89532478004694, - "epoch": 0.5013178503036751, - "grad_norm": 8.20800495147705, - "learning_rate": 3.573036378189072e-06, - "loss": 0.4477, - "mean_token_accuracy": 0.8467110142111778, - "num_tokens": 194601549.0, - "step": 161720 - }, - { - "entropy": 1.857888177037239, - "epoch": 0.5013488494287249, - "grad_norm": 7.576171398162842, - "learning_rate": 3.572925912792893e-06, - "loss": 0.4383, - "mean_token_accuracy": 0.8551659643650055, - "num_tokens": 194614405.0, - "step": 161730 - }, - { - "entropy": 1.8105993568897247, - "epoch": 0.5013798485537746, - "grad_norm": 9.140817642211914, - "learning_rate": 3.572815457641654e-06, - "loss": 0.4096, - "mean_token_accuracy": 0.8596289157867432, - "num_tokens": 194626813.0, - "step": 161740 - }, - { - "entropy": 1.8599924936890602, - "epoch": 0.5014108476788243, - "grad_norm": 4.219054698944092, - "learning_rate": 3.572705012733774e-06, - "loss": 0.4205, - "mean_token_accuracy": 0.860489945113659, - "num_tokens": 194638501.0, - "step": 161750 - }, - { - "entropy": 1.8883384391665459, - "epoch": 0.5014418468038739, - "grad_norm": 9.223127365112305, - "learning_rate": 3.572594578067668e-06, - "loss": 0.4779, - "mean_token_accuracy": 0.8457170099020004, - "num_tokens": 194650304.0, - "step": 161760 - }, - { - "entropy": 1.843181850016117, - "epoch": 0.5014728459289236, - "grad_norm": 8.869895935058594, - "learning_rate": 3.5724841536417538e-06, - "loss": 0.4415, - "mean_token_accuracy": 0.8534633457660675, - "num_tokens": 194662685.0, - "step": 161770 - }, - { - "entropy": 1.88509142100811, - "epoch": 0.5015038450539734, - "grad_norm": 9.733488082885742, - "learning_rate": 3.5723737394544493e-06, - "loss": 0.4421, - "mean_token_accuracy": 0.8535164296627045, - "num_tokens": 194674862.0, - "step": 161780 - }, - { - "entropy": 1.8763200983405113, - "epoch": 0.501534844179023, - "grad_norm": 3.906675100326538, - "learning_rate": 3.5722633355041724e-06, - "loss": 0.4047, - "mean_token_accuracy": 0.8577969685196877, - "num_tokens": 194686775.0, - "step": 161790 - }, - { - "entropy": 1.896049427986145, - "epoch": 0.5015658433040727, - "grad_norm": 7.51211404800415, - "learning_rate": 3.5721529417893404e-06, - "loss": 0.4634, - "mean_token_accuracy": 0.8528813973069191, - "num_tokens": 194698375.0, - "step": 161800 - }, - { - "entropy": 1.8656664967536927, - "epoch": 0.5015968424291224, - "grad_norm": 8.359691619873047, - "learning_rate": 3.572042558308373e-06, - "loss": 0.4496, - "mean_token_accuracy": 0.8549571260809898, - "num_tokens": 194709410.0, - "step": 161810 - }, - { - "entropy": 1.827798655629158, - "epoch": 0.5016278415541722, - "grad_norm": 8.835021018981934, - "learning_rate": 3.5719321850596877e-06, - "loss": 0.4291, - "mean_token_accuracy": 0.8649173125624656, - "num_tokens": 194721917.0, - "step": 161820 - }, - { - "entropy": 1.8799199149012567, - "epoch": 0.5016588406792218, - "grad_norm": 3.991224765777588, - "learning_rate": 3.571821822041705e-06, - "loss": 0.4567, - "mean_token_accuracy": 0.8502721592783928, - "num_tokens": 194734562.0, - "step": 161830 - }, - { - "entropy": 1.8569432660937308, - "epoch": 0.5016898398042715, - "grad_norm": 9.473611831665039, - "learning_rate": 3.571711469252845e-06, - "loss": 0.4501, - "mean_token_accuracy": 0.8522385835647583, - "num_tokens": 194746701.0, - "step": 161840 - }, - { - "entropy": 1.8433676049113275, - "epoch": 0.5017208389293212, - "grad_norm": 8.809913635253906, - "learning_rate": 3.571601126691525e-06, - "loss": 0.4223, - "mean_token_accuracy": 0.8491621285676956, - "num_tokens": 194759126.0, - "step": 161850 - }, - { - "entropy": 1.8659282326698303, - "epoch": 0.501751838054371, - "grad_norm": 8.631592750549316, - "learning_rate": 3.5714907943561683e-06, - "loss": 0.4625, - "mean_token_accuracy": 0.8507665291428566, - "num_tokens": 194770782.0, - "step": 161860 - }, - { - "entropy": 1.8826793491840363, - "epoch": 0.5017828371794206, - "grad_norm": 7.433642387390137, - "learning_rate": 3.5713804722451937e-06, - "loss": 0.468, - "mean_token_accuracy": 0.8468863472342492, - "num_tokens": 194782817.0, - "step": 161870 - }, - { - "entropy": 1.7949487701058389, - "epoch": 0.5018138363044703, - "grad_norm": 7.305869102478027, - "learning_rate": 3.571270160357023e-06, - "loss": 0.4091, - "mean_token_accuracy": 0.8627200186252594, - "num_tokens": 194795809.0, - "step": 161880 - }, - { - "entropy": 1.778245857357979, - "epoch": 0.50184483542952, - "grad_norm": 7.835882663726807, - "learning_rate": 3.571159858690077e-06, - "loss": 0.4017, - "mean_token_accuracy": 0.8689703851938247, - "num_tokens": 194808823.0, - "step": 161890 - }, - { - "entropy": 1.9078995436429977, - "epoch": 0.5018758345545697, - "grad_norm": 8.27534008026123, - "learning_rate": 3.571049567242778e-06, - "loss": 0.4492, - "mean_token_accuracy": 0.8613488137722015, - "num_tokens": 194820450.0, - "step": 161900 - }, - { - "entropy": 1.8325102671980857, - "epoch": 0.5019068336796194, - "grad_norm": 8.722766876220703, - "learning_rate": 3.5709392860135466e-06, - "loss": 0.4514, - "mean_token_accuracy": 0.8510407224297524, - "num_tokens": 194833049.0, - "step": 161910 - }, - { - "entropy": 1.8785583242774009, - "epoch": 0.5019378328046691, - "grad_norm": 7.252606391906738, - "learning_rate": 3.570829015000807e-06, - "loss": 0.4644, - "mean_token_accuracy": 0.8446881324052811, - "num_tokens": 194845217.0, - "step": 161920 - }, - { - "entropy": 1.9201249092817307, - "epoch": 0.5019688319297188, - "grad_norm": 7.984278678894043, - "learning_rate": 3.5707187542029805e-06, - "loss": 0.476, - "mean_token_accuracy": 0.8447786808013916, - "num_tokens": 194856631.0, - "step": 161930 - }, - { - "entropy": 1.9102614864706993, - "epoch": 0.5019998310547685, - "grad_norm": 6.645012855529785, - "learning_rate": 3.5706085036184905e-06, - "loss": 0.4508, - "mean_token_accuracy": 0.8588759183883667, - "num_tokens": 194868487.0, - "step": 161940 - }, - { - "entropy": 1.8596350729465485, - "epoch": 0.5020308301798182, - "grad_norm": 7.373410701751709, - "learning_rate": 3.5704982632457604e-06, - "loss": 0.4236, - "mean_token_accuracy": 0.8577965304255486, - "num_tokens": 194880047.0, - "step": 161950 - }, - { - "entropy": 1.8387646555900574, - "epoch": 0.5020618293048679, - "grad_norm": 7.618228435516357, - "learning_rate": 3.5703880330832136e-06, - "loss": 0.4219, - "mean_token_accuracy": 0.86079321205616, - "num_tokens": 194891550.0, - "step": 161960 - }, - { - "entropy": 1.9075782671570778, - "epoch": 0.5020928284299175, - "grad_norm": 9.226759910583496, - "learning_rate": 3.5702778131292744e-06, - "loss": 0.4635, - "mean_token_accuracy": 0.8493689194321632, - "num_tokens": 194902889.0, - "step": 161970 - }, - { - "entropy": 1.8558399647474288, - "epoch": 0.5021238275549672, - "grad_norm": 9.140117645263672, - "learning_rate": 3.5701676033823672e-06, - "loss": 0.3996, - "mean_token_accuracy": 0.8625021308660508, - "num_tokens": 194915434.0, - "step": 161980 - }, - { - "entropy": 1.8274715527892114, - "epoch": 0.502154826680017, - "grad_norm": 6.623842239379883, - "learning_rate": 3.570057403840917e-06, - "loss": 0.4433, - "mean_token_accuracy": 0.8567396998405457, - "num_tokens": 194928384.0, - "step": 161990 - }, - { - "entropy": 1.8467453569173813, - "epoch": 0.5021858258050667, - "grad_norm": 10.840411186218262, - "learning_rate": 3.5699472145033483e-06, - "loss": 0.4631, - "mean_token_accuracy": 0.8468669727444649, - "num_tokens": 194940502.0, - "step": 162000 - }, - { - "entropy": 1.826990969479084, - "epoch": 0.5022168249301163, - "grad_norm": 3.876290798187256, - "learning_rate": 3.5698370353680865e-06, - "loss": 0.3885, - "mean_token_accuracy": 0.8612556800246238, - "num_tokens": 194953655.0, - "step": 162010 - }, - { - "entropy": 1.865774655342102, - "epoch": 0.502247824055166, - "grad_norm": 8.75535774230957, - "learning_rate": 3.5697268664335578e-06, - "loss": 0.4588, - "mean_token_accuracy": 0.8436019718647003, - "num_tokens": 194966026.0, - "step": 162020 - }, - { - "entropy": 1.8176633030176164, - "epoch": 0.5022788231802158, - "grad_norm": 7.738468170166016, - "learning_rate": 3.569616707698188e-06, - "loss": 0.414, - "mean_token_accuracy": 0.8597527951002121, - "num_tokens": 194978261.0, - "step": 162030 - }, - { - "entropy": 1.8128898188471794, - "epoch": 0.5023098223052654, - "grad_norm": 7.874271392822266, - "learning_rate": 3.5695065591604027e-06, - "loss": 0.4031, - "mean_token_accuracy": 0.8667504027485847, - "num_tokens": 194990803.0, - "step": 162040 - }, - { - "entropy": 1.809106007218361, - "epoch": 0.5023408214303151, - "grad_norm": 7.867502689361572, - "learning_rate": 3.5693964208186305e-06, - "loss": 0.4039, - "mean_token_accuracy": 0.8542467027902603, - "num_tokens": 195003204.0, - "step": 162050 - }, - { - "entropy": 1.8450546979904174, - "epoch": 0.5023718205553648, - "grad_norm": 8.1978178024292, - "learning_rate": 3.5692862926712974e-06, - "loss": 0.4954, - "mean_token_accuracy": 0.8562353745102882, - "num_tokens": 195014841.0, - "step": 162060 - }, - { - "entropy": 1.8364934653043747, - "epoch": 0.5024028196804146, - "grad_norm": 8.754971504211426, - "learning_rate": 3.56917617471683e-06, - "loss": 0.4087, - "mean_token_accuracy": 0.8514863669872283, - "num_tokens": 195027579.0, - "step": 162070 - }, - { - "entropy": 1.8638313546776772, - "epoch": 0.5024338188054642, - "grad_norm": 8.994431495666504, - "learning_rate": 3.569066066953658e-06, - "loss": 0.4387, - "mean_token_accuracy": 0.8624847516417503, - "num_tokens": 195038596.0, - "step": 162080 - }, - { - "entropy": 1.8255416080355644, - "epoch": 0.5024648179305139, - "grad_norm": 9.329654693603516, - "learning_rate": 3.568955969380207e-06, - "loss": 0.4532, - "mean_token_accuracy": 0.8518840372562408, - "num_tokens": 195051056.0, - "step": 162090 - }, - { - "entropy": 1.8981266662478447, - "epoch": 0.5024958170555636, - "grad_norm": 6.833081245422363, - "learning_rate": 3.5688458819949083e-06, - "loss": 0.4343, - "mean_token_accuracy": 0.8566917911171913, - "num_tokens": 195062601.0, - "step": 162100 - }, - { - "entropy": 1.9108289882540703, - "epoch": 0.5025268161806133, - "grad_norm": 9.281651496887207, - "learning_rate": 3.568735804796189e-06, - "loss": 0.4831, - "mean_token_accuracy": 0.8436849609017372, - "num_tokens": 195073772.0, - "step": 162110 - }, - { - "entropy": 1.877676671743393, - "epoch": 0.502557815305663, - "grad_norm": 7.044517517089844, - "learning_rate": 3.5686257377824777e-06, - "loss": 0.4868, - "mean_token_accuracy": 0.8399266093969345, - "num_tokens": 195086232.0, - "step": 162120 - }, - { - "entropy": 1.9210964649915696, - "epoch": 0.5025888144307127, - "grad_norm": 7.519648551940918, - "learning_rate": 3.568515680952206e-06, - "loss": 0.5072, - "mean_token_accuracy": 0.8400311037898064, - "num_tokens": 195097344.0, - "step": 162130 - }, - { - "entropy": 1.7322592556476593, - "epoch": 0.5026198135557624, - "grad_norm": 7.744187355041504, - "learning_rate": 3.5684056343038014e-06, - "loss": 0.4063, - "mean_token_accuracy": 0.8637993618845939, - "num_tokens": 195110801.0, - "step": 162140 - }, - { - "entropy": 1.8559142157435418, - "epoch": 0.5026508126808121, - "grad_norm": 8.278355598449707, - "learning_rate": 3.568295597835696e-06, - "loss": 0.4526, - "mean_token_accuracy": 0.8566200494766235, - "num_tokens": 195122316.0, - "step": 162150 - }, - { - "entropy": 1.8842547863721848, - "epoch": 0.5026818118058618, - "grad_norm": 7.3361029624938965, - "learning_rate": 3.5681855715463183e-06, - "loss": 0.4767, - "mean_token_accuracy": 0.85838573127985, - "num_tokens": 195133389.0, - "step": 162160 - }, - { - "entropy": 1.8281088501214982, - "epoch": 0.5027128109309115, - "grad_norm": 7.411527633666992, - "learning_rate": 3.5680755554341003e-06, - "loss": 0.4149, - "mean_token_accuracy": 0.8537177413702011, - "num_tokens": 195145711.0, - "step": 162170 - }, - { - "entropy": 1.8865630626678467, - "epoch": 0.5027438100559611, - "grad_norm": 9.198566436767578, - "learning_rate": 3.567965549497473e-06, - "loss": 0.4775, - "mean_token_accuracy": 0.8512567788362503, - "num_tokens": 195156986.0, - "step": 162180 - }, - { - "entropy": 1.77679483294487, - "epoch": 0.5027748091810109, - "grad_norm": 4.166167736053467, - "learning_rate": 3.5678555537348686e-06, - "loss": 0.381, - "mean_token_accuracy": 0.8688113674521446, - "num_tokens": 195170019.0, - "step": 162190 - }, - { - "entropy": 1.8510409817099571, - "epoch": 0.5028058083060606, - "grad_norm": 6.120206356048584, - "learning_rate": 3.5677455681447176e-06, - "loss": 0.4395, - "mean_token_accuracy": 0.8500844925642014, - "num_tokens": 195182067.0, - "step": 162200 - }, - { - "entropy": 1.8342761471867561, - "epoch": 0.5028368074311103, - "grad_norm": 7.963062286376953, - "learning_rate": 3.567635592725453e-06, - "loss": 0.4986, - "mean_token_accuracy": 0.8414328068494796, - "num_tokens": 195194944.0, - "step": 162210 - }, - { - "entropy": 1.8795111045241355, - "epoch": 0.5028678065561599, - "grad_norm": 8.403858184814453, - "learning_rate": 3.5675256274755066e-06, - "loss": 0.417, - "mean_token_accuracy": 0.862656545639038, - "num_tokens": 195206665.0, - "step": 162220 - }, - { - "entropy": 1.876885211467743, - "epoch": 0.5028988056812096, - "grad_norm": 7.945247173309326, - "learning_rate": 3.5674156723933117e-06, - "loss": 0.434, - "mean_token_accuracy": 0.8566308572888375, - "num_tokens": 195218499.0, - "step": 162230 - }, - { - "entropy": 1.8293002039194106, - "epoch": 0.5029298048062594, - "grad_norm": 7.862081050872803, - "learning_rate": 3.5673057274773025e-06, - "loss": 0.4145, - "mean_token_accuracy": 0.8659921497106552, - "num_tokens": 195231040.0, - "step": 162240 - }, - { - "entropy": 1.814247028529644, - "epoch": 0.502960803931309, - "grad_norm": 4.131370544433594, - "learning_rate": 3.567195792725911e-06, - "loss": 0.4476, - "mean_token_accuracy": 0.8450552076101303, - "num_tokens": 195243169.0, - "step": 162250 - }, - { - "entropy": 1.8270811960101128, - "epoch": 0.5029918030563587, - "grad_norm": 8.027288436889648, - "learning_rate": 3.5670858681375727e-06, - "loss": 0.4405, - "mean_token_accuracy": 0.8579193204641342, - "num_tokens": 195256142.0, - "step": 162260 - }, - { - "entropy": 1.8580777063965797, - "epoch": 0.5030228021814084, - "grad_norm": 9.05886173248291, - "learning_rate": 3.566975953710719e-06, - "loss": 0.4171, - "mean_token_accuracy": 0.8610779225826264, - "num_tokens": 195267553.0, - "step": 162270 - }, - { - "entropy": 1.8327148109674454, - "epoch": 0.5030538013064582, - "grad_norm": 8.335761070251465, - "learning_rate": 3.566866049443787e-06, - "loss": 0.4479, - "mean_token_accuracy": 0.8535551458597184, - "num_tokens": 195280087.0, - "step": 162280 - }, - { - "entropy": 1.8819597899913787, - "epoch": 0.5030848004315078, - "grad_norm": 7.634295463562012, - "learning_rate": 3.566756155335211e-06, - "loss": 0.4524, - "mean_token_accuracy": 0.8621120229363441, - "num_tokens": 195291226.0, - "step": 162290 - }, - { - "entropy": 1.8669955790042878, - "epoch": 0.5031157995565575, - "grad_norm": 8.279129028320312, - "learning_rate": 3.5666462713834252e-06, - "loss": 0.4412, - "mean_token_accuracy": 0.8547209471464157, - "num_tokens": 195302557.0, - "step": 162300 - }, - { - "entropy": 1.8815603330731392, - "epoch": 0.5031467986816072, - "grad_norm": 3.4400410652160645, - "learning_rate": 3.566536397586867e-06, - "loss": 0.5623, - "mean_token_accuracy": 0.8529194951057434, - "num_tokens": 195314615.0, - "step": 162310 - }, - { - "entropy": 1.8385434448719025, - "epoch": 0.503177797806657, - "grad_norm": 8.215117454528809, - "learning_rate": 3.5664265339439706e-06, - "loss": 0.4481, - "mean_token_accuracy": 0.8514703080058098, - "num_tokens": 195326532.0, - "step": 162320 - }, - { - "entropy": 1.9076471701264381, - "epoch": 0.5032087969317066, - "grad_norm": 4.307278156280518, - "learning_rate": 3.566316680453173e-06, - "loss": 0.4357, - "mean_token_accuracy": 0.8577963724732399, - "num_tokens": 195338119.0, - "step": 162330 - }, - { - "entropy": 1.7793606474995614, - "epoch": 0.5032397960567563, - "grad_norm": 7.525938034057617, - "learning_rate": 3.566206837112911e-06, - "loss": 0.3931, - "mean_token_accuracy": 0.8615632086992264, - "num_tokens": 195351610.0, - "step": 162340 - }, - { - "entropy": 1.8213515982031823, - "epoch": 0.503270795181806, - "grad_norm": 7.753080368041992, - "learning_rate": 3.566097003921621e-06, - "loss": 0.3951, - "mean_token_accuracy": 0.8667016878724099, - "num_tokens": 195364183.0, - "step": 162350 - }, - { - "entropy": 1.8507965892553329, - "epoch": 0.5033017943068557, - "grad_norm": 6.9658098220825195, - "learning_rate": 3.5659871808777403e-06, - "loss": 0.4439, - "mean_token_accuracy": 0.8534910395741463, - "num_tokens": 195376426.0, - "step": 162360 - }, - { - "entropy": 1.8666039258241653, - "epoch": 0.5033327934319054, - "grad_norm": 8.829039573669434, - "learning_rate": 3.5658773679797065e-06, - "loss": 0.4336, - "mean_token_accuracy": 0.8550096690654755, - "num_tokens": 195388504.0, - "step": 162370 - }, - { - "entropy": 1.7253398269414901, - "epoch": 0.5033637925569551, - "grad_norm": 3.7189226150512695, - "learning_rate": 3.5657675652259573e-06, - "loss": 0.352, - "mean_token_accuracy": 0.8698497876524925, - "num_tokens": 195401715.0, - "step": 162380 - }, - { - "entropy": 1.8578629180788995, - "epoch": 0.5033947916820047, - "grad_norm": 7.805914878845215, - "learning_rate": 3.5656577726149312e-06, - "loss": 0.4134, - "mean_token_accuracy": 0.8586318030953407, - "num_tokens": 195413783.0, - "step": 162390 - }, - { - "entropy": 1.860382466763258, - "epoch": 0.5034257908070545, - "grad_norm": 7.376383304595947, - "learning_rate": 3.5655479901450673e-06, - "loss": 0.4422, - "mean_token_accuracy": 0.852292089164257, - "num_tokens": 195426109.0, - "step": 162400 - }, - { - "entropy": 1.9275429144501686, - "epoch": 0.5034567899321042, - "grad_norm": 7.535244464874268, - "learning_rate": 3.5654382178148036e-06, - "loss": 0.4711, - "mean_token_accuracy": 0.8557485058903694, - "num_tokens": 195437469.0, - "step": 162410 - }, - { - "entropy": 1.858670848608017, - "epoch": 0.5034877890571539, - "grad_norm": 6.955326080322266, - "learning_rate": 3.56532845562258e-06, - "loss": 0.4751, - "mean_token_accuracy": 0.8485241889953613, - "num_tokens": 195449150.0, - "step": 162420 - }, - { - "entropy": 1.9349894881248475, - "epoch": 0.5035187881822035, - "grad_norm": 3.8754801750183105, - "learning_rate": 3.5652187035668352e-06, - "loss": 0.4934, - "mean_token_accuracy": 0.8489937767386436, - "num_tokens": 195461001.0, - "step": 162430 - }, - { - "entropy": 1.8421294122934342, - "epoch": 0.5035497873072533, - "grad_norm": 3.5598652362823486, - "learning_rate": 3.565108961646011e-06, - "loss": 0.3764, - "mean_token_accuracy": 0.8684445038437844, - "num_tokens": 195473100.0, - "step": 162440 - }, - { - "entropy": 1.9252609878778457, - "epoch": 0.503580786432303, - "grad_norm": 7.782764434814453, - "learning_rate": 3.5649992298585456e-06, - "loss": 0.4848, - "mean_token_accuracy": 0.8494197487831116, - "num_tokens": 195483490.0, - "step": 162450 - }, - { - "entropy": 1.9152775600552558, - "epoch": 0.5036117855573526, - "grad_norm": 8.533858299255371, - "learning_rate": 3.5648895082028813e-06, - "loss": 0.4879, - "mean_token_accuracy": 0.8448960855603218, - "num_tokens": 195494785.0, - "step": 162460 - }, - { - "entropy": 1.8215551912784576, - "epoch": 0.5036427846824023, - "grad_norm": 3.6843440532684326, - "learning_rate": 3.564779796677457e-06, - "loss": 0.4003, - "mean_token_accuracy": 0.86255484521389, - "num_tokens": 195507467.0, - "step": 162470 - }, - { - "entropy": 1.8157613933086396, - "epoch": 0.503673783807452, - "grad_norm": 3.5262677669525146, - "learning_rate": 3.5646700952807156e-06, - "loss": 0.4047, - "mean_token_accuracy": 0.8654473096132278, - "num_tokens": 195520127.0, - "step": 162480 - }, - { - "entropy": 1.7795906513929367, - "epoch": 0.5037047829325018, - "grad_norm": 2.948425531387329, - "learning_rate": 3.564560404011099e-06, - "loss": 0.4558, - "mean_token_accuracy": 0.8592526748776436, - "num_tokens": 195533959.0, - "step": 162490 - }, - { - "entropy": 1.9089320957660676, - "epoch": 0.5037357820575514, - "grad_norm": 8.778511047363281, - "learning_rate": 3.5644507228670484e-06, - "loss": 0.4872, - "mean_token_accuracy": 0.8524185940623283, - "num_tokens": 195544950.0, - "step": 162500 - }, - { - "entropy": 1.9139399603009224, - "epoch": 0.5037667811826011, - "grad_norm": 9.315644264221191, - "learning_rate": 3.5643410518470057e-06, - "loss": 0.4663, - "mean_token_accuracy": 0.8555566519498825, - "num_tokens": 195555547.0, - "step": 162510 - }, - { - "entropy": 1.8605040475726127, - "epoch": 0.5037977803076508, - "grad_norm": 8.976792335510254, - "learning_rate": 3.5642313909494137e-06, - "loss": 0.4765, - "mean_token_accuracy": 0.8453806653618813, - "num_tokens": 195567770.0, - "step": 162520 - }, - { - "entropy": 1.907510343194008, - "epoch": 0.5038287794327005, - "grad_norm": 7.562481880187988, - "learning_rate": 3.564121740172716e-06, - "loss": 0.4398, - "mean_token_accuracy": 0.8639927938580513, - "num_tokens": 195578608.0, - "step": 162530 - }, - { - "entropy": 1.7901795297861098, - "epoch": 0.5038597785577502, - "grad_norm": 8.070297241210938, - "learning_rate": 3.564012099515356e-06, - "loss": 0.3757, - "mean_token_accuracy": 0.8621209725737572, - "num_tokens": 195592039.0, - "step": 162540 - }, - { - "entropy": 1.8731615900993348, - "epoch": 0.5038907776827999, - "grad_norm": 8.658186912536621, - "learning_rate": 3.5639024689757764e-06, - "loss": 0.4581, - "mean_token_accuracy": 0.8569270372390747, - "num_tokens": 195604573.0, - "step": 162550 - }, - { - "entropy": 1.9131707713007926, - "epoch": 0.5039217768078496, - "grad_norm": 9.220719337463379, - "learning_rate": 3.563792848552421e-06, - "loss": 0.4654, - "mean_token_accuracy": 0.8557065084576607, - "num_tokens": 195615623.0, - "step": 162560 - }, - { - "entropy": 1.868717408180237, - "epoch": 0.5039527759328993, - "grad_norm": 7.270646095275879, - "learning_rate": 3.5636832382437353e-06, - "loss": 0.4689, - "mean_token_accuracy": 0.8525914192199707, - "num_tokens": 195627525.0, - "step": 162570 - }, - { - "entropy": 1.8212410807609558, - "epoch": 0.503983775057949, - "grad_norm": 4.4293060302734375, - "learning_rate": 3.5635736380481627e-06, - "loss": 0.4176, - "mean_token_accuracy": 0.8602419763803482, - "num_tokens": 195639900.0, - "step": 162580 - }, - { - "entropy": 1.859234368801117, - "epoch": 0.5040147741829987, - "grad_norm": 7.507938861846924, - "learning_rate": 3.563464047964149e-06, - "loss": 0.3935, - "mean_token_accuracy": 0.8594542518258095, - "num_tokens": 195651608.0, - "step": 162590 - }, - { - "entropy": 1.838496372103691, - "epoch": 0.5040457733080483, - "grad_norm": 3.874418258666992, - "learning_rate": 3.5633544679901406e-06, - "loss": 0.3798, - "mean_token_accuracy": 0.8664188906550407, - "num_tokens": 195664106.0, - "step": 162600 - }, - { - "entropy": 1.8103223249316216, - "epoch": 0.5040767724330981, - "grad_norm": 7.857743740081787, - "learning_rate": 3.56324489812458e-06, - "loss": 0.4064, - "mean_token_accuracy": 0.8632272258400917, - "num_tokens": 195676849.0, - "step": 162610 - }, - { - "entropy": 1.841738609969616, - "epoch": 0.5041077715581478, - "grad_norm": 4.171340465545654, - "learning_rate": 3.563135338365916e-06, - "loss": 0.4059, - "mean_token_accuracy": 0.8611914679408074, - "num_tokens": 195688771.0, - "step": 162620 - }, - { - "entropy": 1.8774579733610153, - "epoch": 0.5041387706831975, - "grad_norm": 3.3485891819000244, - "learning_rate": 3.5630257887125935e-06, - "loss": 0.4318, - "mean_token_accuracy": 0.8576310917735099, - "num_tokens": 195700208.0, - "step": 162630 - }, - { - "entropy": 1.768410849571228, - "epoch": 0.5041697698082471, - "grad_norm": 3.4588170051574707, - "learning_rate": 3.562916249163059e-06, - "loss": 0.3664, - "mean_token_accuracy": 0.8597879484295845, - "num_tokens": 195713203.0, - "step": 162640 - }, - { - "entropy": 1.8163224518299104, - "epoch": 0.5042007689332969, - "grad_norm": 7.023558616638184, - "learning_rate": 3.5628067197157614e-06, - "loss": 0.3915, - "mean_token_accuracy": 0.8744960188865661, - "num_tokens": 195725159.0, - "step": 162650 - }, - { - "entropy": 1.9602682262659072, - "epoch": 0.5042317680583466, - "grad_norm": 8.747138977050781, - "learning_rate": 3.5626972003691456e-06, - "loss": 0.5098, - "mean_token_accuracy": 0.8455164894461632, - "num_tokens": 195735711.0, - "step": 162660 - }, - { - "entropy": 1.9070382133126258, - "epoch": 0.5042627671833962, - "grad_norm": 7.247774124145508, - "learning_rate": 3.5625876911216605e-06, - "loss": 0.4521, - "mean_token_accuracy": 0.8507187455892563, - "num_tokens": 195746697.0, - "step": 162670 - }, - { - "entropy": 1.853401993960142, - "epoch": 0.5042937663084459, - "grad_norm": 7.354197978973389, - "learning_rate": 3.562478191971754e-06, - "loss": 0.4193, - "mean_token_accuracy": 0.8596841201186181, - "num_tokens": 195758088.0, - "step": 162680 - }, - { - "entropy": 1.8842257231473922, - "epoch": 0.5043247654334957, - "grad_norm": 7.232276916503906, - "learning_rate": 3.5623687029178734e-06, - "loss": 0.4294, - "mean_token_accuracy": 0.8579404756426812, - "num_tokens": 195769723.0, - "step": 162690 - }, - { - "entropy": 1.8756929002702236, - "epoch": 0.5043557645585454, - "grad_norm": 6.513383388519287, - "learning_rate": 3.5622592239584688e-06, - "loss": 0.4516, - "mean_token_accuracy": 0.8571664929389954, - "num_tokens": 195782232.0, - "step": 162700 - }, - { - "entropy": 1.9318101540207864, - "epoch": 0.504386763683595, - "grad_norm": 8.294295310974121, - "learning_rate": 3.562149755091988e-06, - "loss": 0.5111, - "mean_token_accuracy": 0.8378466337919235, - "num_tokens": 195793428.0, - "step": 162710 - }, - { - "entropy": 1.8354035213589668, - "epoch": 0.5044177628086447, - "grad_norm": 7.902247905731201, - "learning_rate": 3.5620402963168814e-06, - "loss": 0.4184, - "mean_token_accuracy": 0.8561977759003639, - "num_tokens": 195806102.0, - "step": 162720 - }, - { - "entropy": 1.8348199382424355, - "epoch": 0.5044487619336944, - "grad_norm": 7.229824066162109, - "learning_rate": 3.5619308476315977e-06, - "loss": 0.4144, - "mean_token_accuracy": 0.8591752856969833, - "num_tokens": 195818891.0, - "step": 162730 - }, - { - "entropy": 1.9697114109992981, - "epoch": 0.5044797610587441, - "grad_norm": 7.836243152618408, - "learning_rate": 3.5618214090345877e-06, - "loss": 0.4915, - "mean_token_accuracy": 0.8503987655043602, - "num_tokens": 195829687.0, - "step": 162740 - }, - { - "entropy": 1.786692251265049, - "epoch": 0.5045107601837938, - "grad_norm": 6.62006950378418, - "learning_rate": 3.561711980524301e-06, - "loss": 0.3948, - "mean_token_accuracy": 0.8584597021341324, - "num_tokens": 195843142.0, - "step": 162750 - }, - { - "entropy": 1.8900447756052017, - "epoch": 0.5045417593088435, - "grad_norm": 9.13289737701416, - "learning_rate": 3.5616025620991885e-06, - "loss": 0.4559, - "mean_token_accuracy": 0.8533695697784424, - "num_tokens": 195855208.0, - "step": 162760 - }, - { - "entropy": 1.9006880089640616, - "epoch": 0.5045727584338932, - "grad_norm": 7.307352066040039, - "learning_rate": 3.5614931537577008e-06, - "loss": 0.5253, - "mean_token_accuracy": 0.8466876611113549, - "num_tokens": 195867829.0, - "step": 162770 - }, - { - "entropy": 1.9048534497618674, - "epoch": 0.5046037575589429, - "grad_norm": 9.334182739257812, - "learning_rate": 3.5613837554982904e-06, - "loss": 0.4961, - "mean_token_accuracy": 0.8492770150303841, - "num_tokens": 195879511.0, - "step": 162780 - }, - { - "entropy": 1.8171656548976898, - "epoch": 0.5046347566839926, - "grad_norm": 8.229894638061523, - "learning_rate": 3.5612743673194076e-06, - "loss": 0.379, - "mean_token_accuracy": 0.865054938197136, - "num_tokens": 195892308.0, - "step": 162790 - }, - { - "entropy": 1.8470677226781844, - "epoch": 0.5046657558090423, - "grad_norm": 7.824212551116943, - "learning_rate": 3.561164989219505e-06, - "loss": 0.4028, - "mean_token_accuracy": 0.8674655973911285, - "num_tokens": 195904395.0, - "step": 162800 - }, - { - "entropy": 1.9542918413877488, - "epoch": 0.5046967549340919, - "grad_norm": 8.983610153198242, - "learning_rate": 3.561055621197035e-06, - "loss": 0.5495, - "mean_token_accuracy": 0.8392492473125458, - "num_tokens": 195915064.0, - "step": 162810 - }, - { - "entropy": 1.8837048798799514, - "epoch": 0.5047277540591417, - "grad_norm": 9.500035285949707, - "learning_rate": 3.5609462632504497e-06, - "loss": 0.4458, - "mean_token_accuracy": 0.857371661067009, - "num_tokens": 195926940.0, - "step": 162820 - }, - { - "entropy": 1.904589705169201, - "epoch": 0.5047587531841914, - "grad_norm": 7.82620906829834, - "learning_rate": 3.5608369153782024e-06, - "loss": 0.4877, - "mean_token_accuracy": 0.8510312214493752, - "num_tokens": 195938652.0, - "step": 162830 - }, - { - "entropy": 1.8928764477372169, - "epoch": 0.5047897523092411, - "grad_norm": 9.256412506103516, - "learning_rate": 3.5607275775787463e-06, - "loss": 0.4418, - "mean_token_accuracy": 0.8574974581599235, - "num_tokens": 195949705.0, - "step": 162840 - }, - { - "entropy": 1.8851047992706298, - "epoch": 0.5048207514342907, - "grad_norm": 7.625764846801758, - "learning_rate": 3.5606182498505353e-06, - "loss": 0.4258, - "mean_token_accuracy": 0.8522122874855995, - "num_tokens": 195962380.0, - "step": 162850 - }, - { - "entropy": 1.8839653983712197, - "epoch": 0.5048517505593405, - "grad_norm": 8.40437126159668, - "learning_rate": 3.560508932192024e-06, - "loss": 0.509, - "mean_token_accuracy": 0.8503521829843521, - "num_tokens": 195974171.0, - "step": 162860 - }, - { - "entropy": 1.924563753604889, - "epoch": 0.5048827496843902, - "grad_norm": 4.18229866027832, - "learning_rate": 3.5603996246016654e-06, - "loss": 0.4687, - "mean_token_accuracy": 0.8518718630075455, - "num_tokens": 195985037.0, - "step": 162870 - }, - { - "entropy": 1.9052809610962869, - "epoch": 0.5049137488094398, - "grad_norm": 7.999540328979492, - "learning_rate": 3.5602903270779145e-06, - "loss": 0.4749, - "mean_token_accuracy": 0.8530282035470009, - "num_tokens": 195996485.0, - "step": 162880 - }, - { - "entropy": 1.844748669862747, - "epoch": 0.5049447479344895, - "grad_norm": 8.114140510559082, - "learning_rate": 3.560181039619226e-06, - "loss": 0.4077, - "mean_token_accuracy": 0.866498276591301, - "num_tokens": 196008896.0, - "step": 162890 - }, - { - "entropy": 1.9648150354623795, - "epoch": 0.5049757470595393, - "grad_norm": 8.700517654418945, - "learning_rate": 3.560071762224056e-06, - "loss": 0.4984, - "mean_token_accuracy": 0.8492494702339173, - "num_tokens": 196019760.0, - "step": 162900 - }, - { - "entropy": 1.860958421230316, - "epoch": 0.505006746184589, - "grad_norm": 8.077210426330566, - "learning_rate": 3.55996249489086e-06, - "loss": 0.4374, - "mean_token_accuracy": 0.8595580518245697, - "num_tokens": 196031677.0, - "step": 162910 - }, - { - "entropy": 1.914548434317112, - "epoch": 0.5050377453096386, - "grad_norm": 8.275936126708984, - "learning_rate": 3.559853237618094e-06, - "loss": 0.443, - "mean_token_accuracy": 0.85650105625391, - "num_tokens": 196043701.0, - "step": 162920 - }, - { - "entropy": 1.9637446463108064, - "epoch": 0.5050687444346883, - "grad_norm": 10.936336517333984, - "learning_rate": 3.5597439904042135e-06, - "loss": 0.4851, - "mean_token_accuracy": 0.8512757152318955, - "num_tokens": 196054231.0, - "step": 162930 - }, - { - "entropy": 1.805374266207218, - "epoch": 0.5050997435597381, - "grad_norm": 8.0625, - "learning_rate": 3.559634753247676e-06, - "loss": 0.4073, - "mean_token_accuracy": 0.8544696539640426, - "num_tokens": 196067812.0, - "step": 162940 - }, - { - "entropy": 1.8754793629050255, - "epoch": 0.5051307426847877, - "grad_norm": 8.15665054321289, - "learning_rate": 3.5595255261469374e-06, - "loss": 0.509, - "mean_token_accuracy": 0.8432677164673805, - "num_tokens": 196080283.0, - "step": 162950 - }, - { - "entropy": 1.925532579421997, - "epoch": 0.5051617418098374, - "grad_norm": 9.213263511657715, - "learning_rate": 3.5594163091004564e-06, - "loss": 0.4683, - "mean_token_accuracy": 0.8600358635187149, - "num_tokens": 196091436.0, - "step": 162960 - }, - { - "entropy": 1.901042965054512, - "epoch": 0.5051927409348871, - "grad_norm": 8.694961547851562, - "learning_rate": 3.5593071021066894e-06, - "loss": 0.4715, - "mean_token_accuracy": 0.8498509466648102, - "num_tokens": 196103145.0, - "step": 162970 - }, - { - "entropy": 1.8106228694319726, - "epoch": 0.5052237400599368, - "grad_norm": 9.617980003356934, - "learning_rate": 3.559197905164095e-06, - "loss": 0.4386, - "mean_token_accuracy": 0.8574653580784798, - "num_tokens": 196116424.0, - "step": 162980 - }, - { - "entropy": 1.908635352551937, - "epoch": 0.5052547391849865, - "grad_norm": 9.529869079589844, - "learning_rate": 3.559088718271132e-06, - "loss": 0.4657, - "mean_token_accuracy": 0.8451606497168541, - "num_tokens": 196127678.0, - "step": 162990 - }, - { - "entropy": 1.8978085353970529, - "epoch": 0.5052857383100362, - "grad_norm": 8.438754081726074, - "learning_rate": 3.5589795414262574e-06, - "loss": 0.4265, - "mean_token_accuracy": 0.8580261752009392, - "num_tokens": 196139073.0, - "step": 163000 - }, - { - "entropy": 1.8685655757784843, - "epoch": 0.5053167374350859, - "grad_norm": 10.1161527633667, - "learning_rate": 3.558870374627932e-06, - "loss": 0.4395, - "mean_token_accuracy": 0.8439155369997025, - "num_tokens": 196151307.0, - "step": 163010 - }, - { - "entropy": 1.8491392314434052, - "epoch": 0.5053477365601355, - "grad_norm": 10.6281156539917, - "learning_rate": 3.5587612178746127e-06, - "loss": 0.4127, - "mean_token_accuracy": 0.8669519662857056, - "num_tokens": 196163886.0, - "step": 163020 - }, - { - "entropy": 1.9063034534454346, - "epoch": 0.5053787356851853, - "grad_norm": 3.8416171073913574, - "learning_rate": 3.5586520711647617e-06, - "loss": 0.5117, - "mean_token_accuracy": 0.8448263436555863, - "num_tokens": 196175457.0, - "step": 163030 - }, - { - "entropy": 1.806091445684433, - "epoch": 0.505409734810235, - "grad_norm": 4.29022216796875, - "learning_rate": 3.558542934496838e-06, - "loss": 0.4284, - "mean_token_accuracy": 0.8600598022341728, - "num_tokens": 196188363.0, - "step": 163040 - }, - { - "entropy": 1.8129280880093575, - "epoch": 0.5054407339352847, - "grad_norm": 3.926964521408081, - "learning_rate": 3.558433807869301e-06, - "loss": 0.4255, - "mean_token_accuracy": 0.8455210164189338, - "num_tokens": 196201293.0, - "step": 163050 - }, - { - "entropy": 1.9800580739974976, - "epoch": 0.5054717330603343, - "grad_norm": 8.467527389526367, - "learning_rate": 3.5583246912806125e-06, - "loss": 0.6094, - "mean_token_accuracy": 0.8347618013620377, - "num_tokens": 196213158.0, - "step": 163060 - }, - { - "entropy": 1.858849048614502, - "epoch": 0.5055027321853841, - "grad_norm": 6.902448654174805, - "learning_rate": 3.5582155847292326e-06, - "loss": 0.4079, - "mean_token_accuracy": 0.8614846393465996, - "num_tokens": 196224932.0, - "step": 163070 - }, - { - "entropy": 1.9071022912859916, - "epoch": 0.5055337313104338, - "grad_norm": 7.139303684234619, - "learning_rate": 3.558106488213623e-06, - "loss": 0.4688, - "mean_token_accuracy": 0.8478293195366859, - "num_tokens": 196236392.0, - "step": 163080 - }, - { - "entropy": 1.797558219730854, - "epoch": 0.5055647304354834, - "grad_norm": 3.989180326461792, - "learning_rate": 3.557997401732245e-06, - "loss": 0.3896, - "mean_token_accuracy": 0.8643699809908867, - "num_tokens": 196249374.0, - "step": 163090 - }, - { - "entropy": 1.9015874803066253, - "epoch": 0.5055957295605331, - "grad_norm": 10.056818962097168, - "learning_rate": 3.5578883252835605e-06, - "loss": 0.4492, - "mean_token_accuracy": 0.854583865404129, - "num_tokens": 196261723.0, - "step": 163100 - }, - { - "entropy": 1.8313779145479203, - "epoch": 0.5056267286855829, - "grad_norm": 7.709977149963379, - "learning_rate": 3.557779258866032e-06, - "loss": 0.4154, - "mean_token_accuracy": 0.8604197606444359, - "num_tokens": 196273674.0, - "step": 163110 - }, - { - "entropy": 1.8923453286290168, - "epoch": 0.5056577278106326, - "grad_norm": 7.376804828643799, - "learning_rate": 3.5576702024781225e-06, - "loss": 0.4001, - "mean_token_accuracy": 0.8579896494746209, - "num_tokens": 196285871.0, - "step": 163120 - }, - { - "entropy": 1.8411938205361367, - "epoch": 0.5056887269356822, - "grad_norm": 9.377099990844727, - "learning_rate": 3.557561156118294e-06, - "loss": 0.4311, - "mean_token_accuracy": 0.8572324633598327, - "num_tokens": 196299051.0, - "step": 163130 - }, - { - "entropy": 1.9145785033702851, - "epoch": 0.5057197260607319, - "grad_norm": 7.624598026275635, - "learning_rate": 3.5574521197850097e-06, - "loss": 0.4525, - "mean_token_accuracy": 0.8504158198833466, - "num_tokens": 196310614.0, - "step": 163140 - }, - { - "entropy": 1.8663910001516342, - "epoch": 0.5057507251857817, - "grad_norm": 9.322566986083984, - "learning_rate": 3.5573430934767344e-06, - "loss": 0.4534, - "mean_token_accuracy": 0.851227393746376, - "num_tokens": 196322489.0, - "step": 163150 - }, - { - "entropy": 1.8053078174591064, - "epoch": 0.5057817243108313, - "grad_norm": 8.671806335449219, - "learning_rate": 3.5572340771919307e-06, - "loss": 0.3865, - "mean_token_accuracy": 0.8665953055024147, - "num_tokens": 196335281.0, - "step": 163160 - }, - { - "entropy": 1.9042056173086166, - "epoch": 0.505812723435881, - "grad_norm": 9.928486824035645, - "learning_rate": 3.5571250709290633e-06, - "loss": 0.4379, - "mean_token_accuracy": 0.8564940080046654, - "num_tokens": 196346642.0, - "step": 163170 - }, - { - "entropy": 1.9873277485370635, - "epoch": 0.5058437225609307, - "grad_norm": 8.902290344238281, - "learning_rate": 3.5570160746865965e-06, - "loss": 0.5176, - "mean_token_accuracy": 0.8341750279068947, - "num_tokens": 196358056.0, - "step": 163180 - }, - { - "entropy": 1.8295244112610818, - "epoch": 0.5058747216859805, - "grad_norm": 8.927177429199219, - "learning_rate": 3.5569070884629963e-06, - "loss": 0.4099, - "mean_token_accuracy": 0.8622911289334297, - "num_tokens": 196370104.0, - "step": 163190 - }, - { - "entropy": 1.8176844894886017, - "epoch": 0.5059057208110301, - "grad_norm": 7.046045303344727, - "learning_rate": 3.5567981122567263e-06, - "loss": 0.4211, - "mean_token_accuracy": 0.8524423450231552, - "num_tokens": 196383106.0, - "step": 163200 - }, - { - "entropy": 1.90313328653574, - "epoch": 0.5059367199360798, - "grad_norm": 10.572953224182129, - "learning_rate": 3.556689146066253e-06, - "loss": 0.4372, - "mean_token_accuracy": 0.8559685334563255, - "num_tokens": 196394839.0, - "step": 163210 - }, - { - "entropy": 1.9140533238649369, - "epoch": 0.5059677190611295, - "grad_norm": 2.8058857917785645, - "learning_rate": 3.5565801898900427e-06, - "loss": 0.4258, - "mean_token_accuracy": 0.8639476001262665, - "num_tokens": 196406632.0, - "step": 163220 - }, - { - "entropy": 1.8180472642183303, - "epoch": 0.5059987181861791, - "grad_norm": 2.5980710983276367, - "learning_rate": 3.5564712437265604e-06, - "loss": 0.414, - "mean_token_accuracy": 0.8603103265166283, - "num_tokens": 196419364.0, - "step": 163230 - }, - { - "entropy": 1.9302504003047942, - "epoch": 0.5060297173112289, - "grad_norm": 7.521374225616455, - "learning_rate": 3.5563623075742736e-06, - "loss": 0.471, - "mean_token_accuracy": 0.8582198992371559, - "num_tokens": 196430357.0, - "step": 163240 - }, - { - "entropy": 1.8773491248488425, - "epoch": 0.5060607164362786, - "grad_norm": 7.329319477081299, - "learning_rate": 3.556253381431648e-06, - "loss": 0.408, - "mean_token_accuracy": 0.8633986055850983, - "num_tokens": 196442626.0, - "step": 163250 - }, - { - "entropy": 1.9318356201052667, - "epoch": 0.5060917155613283, - "grad_norm": 3.0441231727600098, - "learning_rate": 3.5561444652971527e-06, - "loss": 0.4759, - "mean_token_accuracy": 0.8500076308846474, - "num_tokens": 196453654.0, - "step": 163260 - }, - { - "entropy": 1.8296071410179138, - "epoch": 0.5061227146863779, - "grad_norm": 7.275659084320068, - "learning_rate": 3.556035559169253e-06, - "loss": 0.434, - "mean_token_accuracy": 0.8644940406084061, - "num_tokens": 196465862.0, - "step": 163270 - }, - { - "entropy": 1.8528779864311218, - "epoch": 0.5061537138114277, - "grad_norm": 8.553239822387695, - "learning_rate": 3.5559266630464182e-06, - "loss": 0.416, - "mean_token_accuracy": 0.8582915931940078, - "num_tokens": 196478225.0, - "step": 163280 - }, - { - "entropy": 1.8958040222525596, - "epoch": 0.5061847129364774, - "grad_norm": 8.999159812927246, - "learning_rate": 3.555817776927117e-06, - "loss": 0.4531, - "mean_token_accuracy": 0.8500429227948189, - "num_tokens": 196489275.0, - "step": 163290 - }, - { - "entropy": 1.953699079155922, - "epoch": 0.506215712061527, - "grad_norm": 8.064541816711426, - "learning_rate": 3.5557089008098162e-06, - "loss": 0.4706, - "mean_token_accuracy": 0.8561220705509186, - "num_tokens": 196500253.0, - "step": 163300 - }, - { - "entropy": 1.870068684220314, - "epoch": 0.5062467111865767, - "grad_norm": 8.940662384033203, - "learning_rate": 3.5556000346929846e-06, - "loss": 0.506, - "mean_token_accuracy": 0.8416852578520775, - "num_tokens": 196512116.0, - "step": 163310 - }, - { - "entropy": 1.8221497237682343, - "epoch": 0.5062777103116265, - "grad_norm": 8.552144050598145, - "learning_rate": 3.555491178575094e-06, - "loss": 0.3984, - "mean_token_accuracy": 0.8682190105319023, - "num_tokens": 196525033.0, - "step": 163320 - }, - { - "entropy": 1.831996063888073, - "epoch": 0.5063087094366762, - "grad_norm": 9.747136116027832, - "learning_rate": 3.5553823324546104e-06, - "loss": 0.4539, - "mean_token_accuracy": 0.8476357832551003, - "num_tokens": 196537789.0, - "step": 163330 - }, - { - "entropy": 1.7898123905062675, - "epoch": 0.5063397085617258, - "grad_norm": 5.09330940246582, - "learning_rate": 3.5552734963300062e-06, - "loss": 0.3892, - "mean_token_accuracy": 0.8650940164923668, - "num_tokens": 196551522.0, - "step": 163340 - }, - { - "entropy": 1.7997850097715855, - "epoch": 0.5063707076867755, - "grad_norm": 4.622293472290039, - "learning_rate": 3.5551646701997505e-06, - "loss": 0.4114, - "mean_token_accuracy": 0.8593069702386856, - "num_tokens": 196564746.0, - "step": 163350 - }, - { - "entropy": 1.8479792803525925, - "epoch": 0.5064017068118253, - "grad_norm": 10.61267375946045, - "learning_rate": 3.5550558540623135e-06, - "loss": 0.4225, - "mean_token_accuracy": 0.8591436266899108, - "num_tokens": 196577353.0, - "step": 163360 - }, - { - "entropy": 1.830297763645649, - "epoch": 0.506432705936875, - "grad_norm": 7.7146711349487305, - "learning_rate": 3.5549470479161675e-06, - "loss": 0.421, - "mean_token_accuracy": 0.8574198722839356, - "num_tokens": 196588958.0, - "step": 163370 - }, - { - "entropy": 1.8857371792197228, - "epoch": 0.5064637050619246, - "grad_norm": 4.479658126831055, - "learning_rate": 3.554838251759782e-06, - "loss": 0.5091, - "mean_token_accuracy": 0.840976457297802, - "num_tokens": 196600307.0, - "step": 163380 - }, - { - "entropy": 1.861470101773739, - "epoch": 0.5064947041869743, - "grad_norm": 6.463710784912109, - "learning_rate": 3.554729465591629e-06, - "loss": 0.4305, - "mean_token_accuracy": 0.8638939753174781, - "num_tokens": 196611868.0, - "step": 163390 - }, - { - "entropy": 1.7832600355148316, - "epoch": 0.5065257033120241, - "grad_norm": 7.852832317352295, - "learning_rate": 3.5546206894101797e-06, - "loss": 0.3883, - "mean_token_accuracy": 0.8515184715390205, - "num_tokens": 196625328.0, - "step": 163400 - }, - { - "entropy": 1.8880818665027619, - "epoch": 0.5065567024370737, - "grad_norm": 3.90632963180542, - "learning_rate": 3.554511923213907e-06, - "loss": 0.4456, - "mean_token_accuracy": 0.8527933284640312, - "num_tokens": 196637075.0, - "step": 163410 - }, - { - "entropy": 1.8785538420081138, - "epoch": 0.5065877015621234, - "grad_norm": 7.10991096496582, - "learning_rate": 3.5544031670012836e-06, - "loss": 0.47, - "mean_token_accuracy": 0.8546990528702736, - "num_tokens": 196648934.0, - "step": 163420 - }, - { - "entropy": 1.8279323622584343, - "epoch": 0.5066187006871731, - "grad_norm": 8.198480606079102, - "learning_rate": 3.5542944207707806e-06, - "loss": 0.4145, - "mean_token_accuracy": 0.8548176631331443, - "num_tokens": 196661904.0, - "step": 163430 - }, - { - "entropy": 1.8618044629693031, - "epoch": 0.5066496998122229, - "grad_norm": 9.521814346313477, - "learning_rate": 3.554185684520874e-06, - "loss": 0.4561, - "mean_token_accuracy": 0.8495841041207314, - "num_tokens": 196673894.0, - "step": 163440 - }, - { - "entropy": 1.8538531705737114, - "epoch": 0.5066806989372725, - "grad_norm": 8.083247184753418, - "learning_rate": 3.5540769582500344e-06, - "loss": 0.4565, - "mean_token_accuracy": 0.8586274668574333, - "num_tokens": 196686759.0, - "step": 163450 - }, - { - "entropy": 1.8029966354370117, - "epoch": 0.5067116980623222, - "grad_norm": 8.461233139038086, - "learning_rate": 3.553968241956737e-06, - "loss": 0.3961, - "mean_token_accuracy": 0.861527732014656, - "num_tokens": 196699380.0, - "step": 163460 - }, - { - "entropy": 1.9485577285289764, - "epoch": 0.5067426971873719, - "grad_norm": 8.530065536499023, - "learning_rate": 3.5538595356394546e-06, - "loss": 0.4552, - "mean_token_accuracy": 0.8589451789855957, - "num_tokens": 196710283.0, - "step": 163470 - }, - { - "entropy": 1.8798445031046866, - "epoch": 0.5067736963124215, - "grad_norm": 8.257960319519043, - "learning_rate": 3.553750839296663e-06, - "loss": 0.4343, - "mean_token_accuracy": 0.8557620927691459, - "num_tokens": 196722297.0, - "step": 163480 - }, - { - "entropy": 1.8360868021845818, - "epoch": 0.5068046954374713, - "grad_norm": 4.080684661865234, - "learning_rate": 3.5536421529268368e-06, - "loss": 0.4367, - "mean_token_accuracy": 0.8628736943006515, - "num_tokens": 196735291.0, - "step": 163490 - }, - { - "entropy": 1.8096343368291854, - "epoch": 0.506835694562521, - "grad_norm": 8.325018882751465, - "learning_rate": 3.55353347652845e-06, - "loss": 0.362, - "mean_token_accuracy": 0.8683801367878914, - "num_tokens": 196748018.0, - "step": 163500 - }, - { - "entropy": 1.870979880541563, - "epoch": 0.5068666936875706, - "grad_norm": 8.216853141784668, - "learning_rate": 3.5534248100999797e-06, - "loss": 0.4639, - "mean_token_accuracy": 0.8563655108213425, - "num_tokens": 196759790.0, - "step": 163510 - }, - { - "entropy": 1.8905472248792647, - "epoch": 0.5068976928126203, - "grad_norm": 7.7638726234436035, - "learning_rate": 3.553316153639899e-06, - "loss": 0.4838, - "mean_token_accuracy": 0.8415771916508674, - "num_tokens": 196771119.0, - "step": 163520 - }, - { - "entropy": 1.8568203702569008, - "epoch": 0.5069286919376701, - "grad_norm": 7.551204204559326, - "learning_rate": 3.5532075071466866e-06, - "loss": 0.4298, - "mean_token_accuracy": 0.8592291429638863, - "num_tokens": 196783134.0, - "step": 163530 - }, - { - "entropy": 1.845226949453354, - "epoch": 0.5069596910627198, - "grad_norm": 11.321545600891113, - "learning_rate": 3.5530988706188167e-06, - "loss": 0.4273, - "mean_token_accuracy": 0.8542516723275184, - "num_tokens": 196794843.0, - "step": 163540 - }, - { - "entropy": 1.9101459234952927, - "epoch": 0.5069906901877694, - "grad_norm": 7.633603572845459, - "learning_rate": 3.5529902440547686e-06, - "loss": 0.4493, - "mean_token_accuracy": 0.8477040901780128, - "num_tokens": 196806883.0, - "step": 163550 - }, - { - "entropy": 1.9154523998498916, - "epoch": 0.5070216893128191, - "grad_norm": 7.648013591766357, - "learning_rate": 3.5528816274530164e-06, - "loss": 0.477, - "mean_token_accuracy": 0.852397172152996, - "num_tokens": 196818158.0, - "step": 163560 - }, - { - "entropy": 1.8581027761101723, - "epoch": 0.5070526884378689, - "grad_norm": 8.126503944396973, - "learning_rate": 3.5527730208120387e-06, - "loss": 0.4297, - "mean_token_accuracy": 0.8557763010263443, - "num_tokens": 196830042.0, - "step": 163570 - }, - { - "entropy": 1.9071855247020721, - "epoch": 0.5070836875629186, - "grad_norm": 8.152803421020508, - "learning_rate": 3.5526644241303143e-06, - "loss": 0.4265, - "mean_token_accuracy": 0.8564406126737595, - "num_tokens": 196841618.0, - "step": 163580 - }, - { - "entropy": 1.7716242380440235, - "epoch": 0.5071146866879682, - "grad_norm": 9.797401428222656, - "learning_rate": 3.5525558374063195e-06, - "loss": 0.3823, - "mean_token_accuracy": 0.8628744825720787, - "num_tokens": 196855435.0, - "step": 163590 - }, - { - "entropy": 1.7673632681369782, - "epoch": 0.5071456858130179, - "grad_norm": 9.011829376220703, - "learning_rate": 3.5524472606385324e-06, - "loss": 0.3566, - "mean_token_accuracy": 0.8731319457292557, - "num_tokens": 196867956.0, - "step": 163600 - }, - { - "entropy": 1.9083643838763238, - "epoch": 0.5071766849380677, - "grad_norm": 4.590526103973389, - "learning_rate": 3.5523386938254335e-06, - "loss": 0.4542, - "mean_token_accuracy": 0.8606811299920082, - "num_tokens": 196879300.0, - "step": 163610 - }, - { - "entropy": 1.7268058001995086, - "epoch": 0.5072076840631173, - "grad_norm": 8.471205711364746, - "learning_rate": 3.5522301369655e-06, - "loss": 0.3688, - "mean_token_accuracy": 0.8630135610699654, - "num_tokens": 196893124.0, - "step": 163620 - }, - { - "entropy": 1.867137537896633, - "epoch": 0.507238683188167, - "grad_norm": 8.471604347229004, - "learning_rate": 3.552121590057212e-06, - "loss": 0.4147, - "mean_token_accuracy": 0.8629182651638985, - "num_tokens": 196905343.0, - "step": 163630 - }, - { - "entropy": 1.745313723385334, - "epoch": 0.5072696823132167, - "grad_norm": 3.680870532989502, - "learning_rate": 3.5520130530990493e-06, - "loss": 0.3636, - "mean_token_accuracy": 0.8692538917064667, - "num_tokens": 196919685.0, - "step": 163640 - }, - { - "entropy": 1.80671256929636, - "epoch": 0.5073006814382665, - "grad_norm": 10.35844898223877, - "learning_rate": 3.5519045260894907e-06, - "loss": 0.3949, - "mean_token_accuracy": 0.866978970170021, - "num_tokens": 196932109.0, - "step": 163650 - }, - { - "entropy": 1.8454255670309068, - "epoch": 0.5073316805633161, - "grad_norm": 9.719582557678223, - "learning_rate": 3.551796009027018e-06, - "loss": 0.4218, - "mean_token_accuracy": 0.8601211249828339, - "num_tokens": 196944736.0, - "step": 163660 - }, - { - "entropy": 1.8403030604124069, - "epoch": 0.5073626796883658, - "grad_norm": 8.171625137329102, - "learning_rate": 3.551687501910111e-06, - "loss": 0.4166, - "mean_token_accuracy": 0.8558960810303688, - "num_tokens": 196957203.0, - "step": 163670 - }, - { - "entropy": 1.9288623362779618, - "epoch": 0.5073936788134155, - "grad_norm": 7.492020606994629, - "learning_rate": 3.5515790047372508e-06, - "loss": 0.4898, - "mean_token_accuracy": 0.8433584362268448, - "num_tokens": 196967955.0, - "step": 163680 - }, - { - "entropy": 1.86645817309618, - "epoch": 0.5074246779384652, - "grad_norm": 7.969177722930908, - "learning_rate": 3.5514705175069186e-06, - "loss": 0.4805, - "mean_token_accuracy": 0.8469413831830025, - "num_tokens": 196979245.0, - "step": 163690 - }, - { - "entropy": 1.870174802839756, - "epoch": 0.5074556770635149, - "grad_norm": 9.207684516906738, - "learning_rate": 3.551362040217595e-06, - "loss": 0.4496, - "mean_token_accuracy": 0.8529495477676392, - "num_tokens": 196991101.0, - "step": 163700 - }, - { - "entropy": 1.906150184571743, - "epoch": 0.5074866761885646, - "grad_norm": 10.05109691619873, - "learning_rate": 3.551253572867763e-06, - "loss": 0.4698, - "mean_token_accuracy": 0.8455789417028428, - "num_tokens": 197002059.0, - "step": 163710 - }, - { - "entropy": 1.859985102713108, - "epoch": 0.5075176753136142, - "grad_norm": 7.572958469390869, - "learning_rate": 3.551145115455905e-06, - "loss": 0.4089, - "mean_token_accuracy": 0.8629415169358253, - "num_tokens": 197013664.0, - "step": 163720 - }, - { - "entropy": 1.8383402064442635, - "epoch": 0.5075486744386639, - "grad_norm": 4.040586471557617, - "learning_rate": 3.551036667980503e-06, - "loss": 0.396, - "mean_token_accuracy": 0.8514451235532761, - "num_tokens": 197026174.0, - "step": 163730 - }, - { - "entropy": 1.8111710965633392, - "epoch": 0.5075796735637137, - "grad_norm": 9.287176132202148, - "learning_rate": 3.55092823044004e-06, - "loss": 0.4212, - "mean_token_accuracy": 0.8552304312586785, - "num_tokens": 197039878.0, - "step": 163740 - }, - { - "entropy": 1.7784530088305472, - "epoch": 0.5076106726887634, - "grad_norm": 7.509320259094238, - "learning_rate": 3.5508198028329993e-06, - "loss": 0.4064, - "mean_token_accuracy": 0.855512909591198, - "num_tokens": 197053606.0, - "step": 163750 - }, - { - "entropy": 1.9045552536845207, - "epoch": 0.507641671813813, - "grad_norm": 8.221936225891113, - "learning_rate": 3.5507113851578635e-06, - "loss": 0.4295, - "mean_token_accuracy": 0.8489176541566849, - "num_tokens": 197065651.0, - "step": 163760 - }, - { - "entropy": 1.8847736433148383, - "epoch": 0.5076726709388627, - "grad_norm": 7.487794876098633, - "learning_rate": 3.5506029774131175e-06, - "loss": 0.4387, - "mean_token_accuracy": 0.8544848307967186, - "num_tokens": 197077138.0, - "step": 163770 - }, - { - "entropy": 1.8698254212737084, - "epoch": 0.5077036700639125, - "grad_norm": 8.723936080932617, - "learning_rate": 3.550494579597245e-06, - "loss": 0.4264, - "mean_token_accuracy": 0.8674852281808854, - "num_tokens": 197088846.0, - "step": 163780 - }, - { - "entropy": 1.9057440683245659, - "epoch": 0.5077346691889622, - "grad_norm": 7.241644382476807, - "learning_rate": 3.550386191708731e-06, - "loss": 0.4744, - "mean_token_accuracy": 0.8506020948290824, - "num_tokens": 197100182.0, - "step": 163790 - }, - { - "entropy": 1.816692951321602, - "epoch": 0.5077656683140118, - "grad_norm": 7.693803787231445, - "learning_rate": 3.55027781374606e-06, - "loss": 0.3816, - "mean_token_accuracy": 0.8697048261761665, - "num_tokens": 197112422.0, - "step": 163800 - }, - { - "entropy": 1.896825762093067, - "epoch": 0.5077966674390615, - "grad_norm": 10.119492530822754, - "learning_rate": 3.5501694457077167e-06, - "loss": 0.4499, - "mean_token_accuracy": 0.8635825246572495, - "num_tokens": 197123947.0, - "step": 163810 - }, - { - "entropy": 1.8742366090416909, - "epoch": 0.5078276665641113, - "grad_norm": 8.425687789916992, - "learning_rate": 3.550061087592187e-06, - "loss": 0.4545, - "mean_token_accuracy": 0.8580169692635536, - "num_tokens": 197135606.0, - "step": 163820 - }, - { - "entropy": 1.800526437163353, - "epoch": 0.5078586656891609, - "grad_norm": 8.00316333770752, - "learning_rate": 3.5499527393979565e-06, - "loss": 0.4263, - "mean_token_accuracy": 0.8611070349812507, - "num_tokens": 197148361.0, - "step": 163830 - }, - { - "entropy": 1.8301258489489556, - "epoch": 0.5078896648142106, - "grad_norm": 3.099703311920166, - "learning_rate": 3.5498444011235117e-06, - "loss": 0.4182, - "mean_token_accuracy": 0.8658434525132179, - "num_tokens": 197160383.0, - "step": 163840 - }, - { - "entropy": 1.801127390563488, - "epoch": 0.5079206639392603, - "grad_norm": 9.042040824890137, - "learning_rate": 3.549736072767338e-06, - "loss": 0.3819, - "mean_token_accuracy": 0.8617468386888504, - "num_tokens": 197173219.0, - "step": 163850 - }, - { - "entropy": 1.9284664213657379, - "epoch": 0.50795166306431, - "grad_norm": 9.226612091064453, - "learning_rate": 3.549627754327924e-06, - "loss": 0.4928, - "mean_token_accuracy": 0.8404616430401802, - "num_tokens": 197184744.0, - "step": 163860 - }, - { - "entropy": 1.8895764395594596, - "epoch": 0.5079826621893597, - "grad_norm": 8.440512657165527, - "learning_rate": 3.5495194458037544e-06, - "loss": 0.4993, - "mean_token_accuracy": 0.8522619500756263, - "num_tokens": 197196174.0, - "step": 163870 - }, - { - "entropy": 1.7769380405545234, - "epoch": 0.5080136613144094, - "grad_norm": 4.372562408447266, - "learning_rate": 3.549411147193318e-06, - "loss": 0.3977, - "mean_token_accuracy": 0.8606853768229484, - "num_tokens": 197208968.0, - "step": 163880 - }, - { - "entropy": 1.8417119562625885, - "epoch": 0.5080446604394591, - "grad_norm": 4.590822696685791, - "learning_rate": 3.5493028584951027e-06, - "loss": 0.4064, - "mean_token_accuracy": 0.8595148786902428, - "num_tokens": 197221585.0, - "step": 163890 - }, - { - "entropy": 1.9214375928044318, - "epoch": 0.5080756595645088, - "grad_norm": 4.399219512939453, - "learning_rate": 3.5491945797075963e-06, - "loss": 0.468, - "mean_token_accuracy": 0.8582563444972038, - "num_tokens": 197233428.0, - "step": 163900 - }, - { - "entropy": 1.7925603240728378, - "epoch": 0.5081066586895585, - "grad_norm": 3.6897733211517334, - "learning_rate": 3.5490863108292856e-06, - "loss": 0.3631, - "mean_token_accuracy": 0.8733308136463165, - "num_tokens": 197246486.0, - "step": 163910 - }, - { - "entropy": 1.8893893167376519, - "epoch": 0.5081376578146082, - "grad_norm": 8.758194923400879, - "learning_rate": 3.5489780518586626e-06, - "loss": 0.4548, - "mean_token_accuracy": 0.8464448168873787, - "num_tokens": 197258696.0, - "step": 163920 - }, - { - "entropy": 1.8779218710958958, - "epoch": 0.5081686569396578, - "grad_norm": 8.010139465332031, - "learning_rate": 3.548869802794213e-06, - "loss": 0.4136, - "mean_token_accuracy": 0.8593690872192383, - "num_tokens": 197270587.0, - "step": 163930 - }, - { - "entropy": 1.8980011656880378, - "epoch": 0.5081996560647076, - "grad_norm": 4.0742106437683105, - "learning_rate": 3.548761563634428e-06, - "loss": 0.4225, - "mean_token_accuracy": 0.8588485881686211, - "num_tokens": 197282756.0, - "step": 163940 - }, - { - "entropy": 1.925993339717388, - "epoch": 0.5082306551897573, - "grad_norm": 6.346606254577637, - "learning_rate": 3.548653334377797e-06, - "loss": 0.4553, - "mean_token_accuracy": 0.853511492908001, - "num_tokens": 197294423.0, - "step": 163950 - }, - { - "entropy": 1.8731113463640212, - "epoch": 0.508261654314807, - "grad_norm": 3.2879886627197266, - "learning_rate": 3.5485451150228088e-06, - "loss": 0.4326, - "mean_token_accuracy": 0.8611008077859879, - "num_tokens": 197306496.0, - "step": 163960 - }, - { - "entropy": 1.9740417271852493, - "epoch": 0.5082926534398566, - "grad_norm": 7.165710926055908, - "learning_rate": 3.5484369055679565e-06, - "loss": 0.4983, - "mean_token_accuracy": 0.8428760409355164, - "num_tokens": 197317823.0, - "step": 163970 - }, - { - "entropy": 1.9370626002550124, - "epoch": 0.5083236525649063, - "grad_norm": 3.800901174545288, - "learning_rate": 3.5483287060117265e-06, - "loss": 0.452, - "mean_token_accuracy": 0.8547460094094277, - "num_tokens": 197329227.0, - "step": 163980 - }, - { - "entropy": 1.9272918194532394, - "epoch": 0.5083546516899561, - "grad_norm": 8.870448112487793, - "learning_rate": 3.548220516352614e-06, - "loss": 0.5126, - "mean_token_accuracy": 0.8483779489994049, - "num_tokens": 197340517.0, - "step": 163990 - }, - { - "entropy": 1.9233269169926643, - "epoch": 0.5083856508150058, - "grad_norm": 9.553996086120605, - "learning_rate": 3.548112336589107e-06, - "loss": 0.4691, - "mean_token_accuracy": 0.8529503479599952, - "num_tokens": 197351963.0, - "step": 164000 - }, - { - "entropy": 1.8978917241096496, - "epoch": 0.5084166499400554, - "grad_norm": 8.318848609924316, - "learning_rate": 3.548004166719699e-06, - "loss": 0.4558, - "mean_token_accuracy": 0.8580149009823799, - "num_tokens": 197363948.0, - "step": 164010 - }, - { - "entropy": 1.877458170056343, - "epoch": 0.5084476490651051, - "grad_norm": 11.015156745910645, - "learning_rate": 3.5478960067428814e-06, - "loss": 0.422, - "mean_token_accuracy": 0.8697662636637687, - "num_tokens": 197375852.0, - "step": 164020 - }, - { - "entropy": 1.905393399298191, - "epoch": 0.5084786481901549, - "grad_norm": 6.92934513092041, - "learning_rate": 3.547787856657146e-06, - "loss": 0.4105, - "mean_token_accuracy": 0.8674715921282768, - "num_tokens": 197386981.0, - "step": 164030 - }, - { - "entropy": 1.8934918195009232, - "epoch": 0.5085096473152045, - "grad_norm": 9.103461265563965, - "learning_rate": 3.5476797164609863e-06, - "loss": 0.4769, - "mean_token_accuracy": 0.8485694825649261, - "num_tokens": 197398553.0, - "step": 164040 - }, - { - "entropy": 1.959120711684227, - "epoch": 0.5085406464402542, - "grad_norm": 7.4005351066589355, - "learning_rate": 3.5475715861528943e-06, - "loss": 0.4598, - "mean_token_accuracy": 0.855261555314064, - "num_tokens": 197409315.0, - "step": 164050 - }, - { - "entropy": 1.8080861911177635, - "epoch": 0.5085716455653039, - "grad_norm": 8.302371978759766, - "learning_rate": 3.547463465731363e-06, - "loss": 0.401, - "mean_token_accuracy": 0.8583597347140313, - "num_tokens": 197423022.0, - "step": 164060 - }, - { - "entropy": 1.8989132165908813, - "epoch": 0.5086026446903537, - "grad_norm": 7.591587066650391, - "learning_rate": 3.547355355194887e-06, - "loss": 0.4597, - "mean_token_accuracy": 0.8550845861434937, - "num_tokens": 197434763.0, - "step": 164070 - }, - { - "entropy": 1.7511195302009583, - "epoch": 0.5086336438154033, - "grad_norm": 8.043708801269531, - "learning_rate": 3.547247254541959e-06, - "loss": 0.3614, - "mean_token_accuracy": 0.8655338972806931, - "num_tokens": 197448589.0, - "step": 164080 - }, - { - "entropy": 1.9792837232351304, - "epoch": 0.508664642940453, - "grad_norm": 7.99447774887085, - "learning_rate": 3.5471391637710738e-06, - "loss": 0.499, - "mean_token_accuracy": 0.8506904348731041, - "num_tokens": 197459864.0, - "step": 164090 - }, - { - "entropy": 1.9158392682671548, - "epoch": 0.5086956420655027, - "grad_norm": 9.754770278930664, - "learning_rate": 3.547031082880726e-06, - "loss": 0.4272, - "mean_token_accuracy": 0.853808979690075, - "num_tokens": 197471839.0, - "step": 164100 - }, - { - "entropy": 1.8726506367325784, - "epoch": 0.5087266411905524, - "grad_norm": 10.339028358459473, - "learning_rate": 3.5469230118694107e-06, - "loss": 0.4304, - "mean_token_accuracy": 0.8654339507222175, - "num_tokens": 197483011.0, - "step": 164110 - }, - { - "entropy": 1.9108057722449303, - "epoch": 0.5087576403156021, - "grad_norm": 7.670596122741699, - "learning_rate": 3.5468149507356213e-06, - "loss": 0.4863, - "mean_token_accuracy": 0.8540572628378869, - "num_tokens": 197494412.0, - "step": 164120 - }, - { - "entropy": 1.8840801566839218, - "epoch": 0.5087886394406518, - "grad_norm": 3.8235244750976562, - "learning_rate": 3.5467068994778553e-06, - "loss": 0.4559, - "mean_token_accuracy": 0.8526855111122131, - "num_tokens": 197506145.0, - "step": 164130 - }, - { - "entropy": 1.9018552049994468, - "epoch": 0.5088196385657014, - "grad_norm": 10.32859992980957, - "learning_rate": 3.546598858094607e-06, - "loss": 0.4597, - "mean_token_accuracy": 0.8581657335162163, - "num_tokens": 197517526.0, - "step": 164140 - }, - { - "entropy": 1.7613848388195037, - "epoch": 0.5088506376907512, - "grad_norm": 6.967850208282471, - "learning_rate": 3.5464908265843733e-06, - "loss": 0.3746, - "mean_token_accuracy": 0.8673210009932518, - "num_tokens": 197530473.0, - "step": 164150 - }, - { - "entropy": 1.7470743969082831, - "epoch": 0.5088816368158009, - "grad_norm": 4.126484394073486, - "learning_rate": 3.5463828049456504e-06, - "loss": 0.4068, - "mean_token_accuracy": 0.8649399444460869, - "num_tokens": 197544141.0, - "step": 164160 - }, - { - "entropy": 1.7916277647018433, - "epoch": 0.5089126359408506, - "grad_norm": 7.930361270904541, - "learning_rate": 3.5462747931769348e-06, - "loss": 0.3446, - "mean_token_accuracy": 0.8660852313041687, - "num_tokens": 197557293.0, - "step": 164170 - }, - { - "entropy": 1.9348184138536453, - "epoch": 0.5089436350659002, - "grad_norm": 7.338921070098877, - "learning_rate": 3.546166791276724e-06, - "loss": 0.4419, - "mean_token_accuracy": 0.8696695506572724, - "num_tokens": 197568090.0, - "step": 164180 - }, - { - "entropy": 1.8887701407074928, - "epoch": 0.50897463419095, - "grad_norm": 8.492532730102539, - "learning_rate": 3.5460587992435147e-06, - "loss": 0.4823, - "mean_token_accuracy": 0.8550636664032936, - "num_tokens": 197579983.0, - "step": 164190 - }, - { - "entropy": 1.8933542668819427, - "epoch": 0.5090056333159997, - "grad_norm": 8.57034969329834, - "learning_rate": 3.5459508170758054e-06, - "loss": 0.4174, - "mean_token_accuracy": 0.868916317820549, - "num_tokens": 197590837.0, - "step": 164200 - }, - { - "entropy": 1.8781364992260934, - "epoch": 0.5090366324410494, - "grad_norm": 7.852393627166748, - "learning_rate": 3.5458428447720934e-06, - "loss": 0.4503, - "mean_token_accuracy": 0.8487447679042817, - "num_tokens": 197603268.0, - "step": 164210 - }, - { - "entropy": 1.7770580515265464, - "epoch": 0.509067631566099, - "grad_norm": 2.3359949588775635, - "learning_rate": 3.545734882330877e-06, - "loss": 0.3527, - "mean_token_accuracy": 0.8754623264074326, - "num_tokens": 197615363.0, - "step": 164220 - }, - { - "entropy": 1.930907705426216, - "epoch": 0.5090986306911487, - "grad_norm": 8.559337615966797, - "learning_rate": 3.545626929750655e-06, - "loss": 0.4852, - "mean_token_accuracy": 0.8568104296922684, - "num_tokens": 197626364.0, - "step": 164230 - }, - { - "entropy": 1.8539829179644585, - "epoch": 0.5091296298161985, - "grad_norm": 9.082908630371094, - "learning_rate": 3.545518987029927e-06, - "loss": 0.4415, - "mean_token_accuracy": 0.8485892757773399, - "num_tokens": 197638250.0, - "step": 164240 - }, - { - "entropy": 1.8581020534038544, - "epoch": 0.5091606289412481, - "grad_norm": 7.516939640045166, - "learning_rate": 3.5454110541671916e-06, - "loss": 0.4276, - "mean_token_accuracy": 0.8599382519721985, - "num_tokens": 197650104.0, - "step": 164250 - }, - { - "entropy": 1.9120456263422967, - "epoch": 0.5091916280662978, - "grad_norm": 8.88882827758789, - "learning_rate": 3.5453031311609487e-06, - "loss": 0.4833, - "mean_token_accuracy": 0.8505443364381791, - "num_tokens": 197661272.0, - "step": 164260 - }, - { - "entropy": 1.9172147080302238, - "epoch": 0.5092226271913475, - "grad_norm": 9.059195518493652, - "learning_rate": 3.5451952180096983e-06, - "loss": 0.4718, - "mean_token_accuracy": 0.8485334411263465, - "num_tokens": 197672129.0, - "step": 164270 - }, - { - "entropy": 1.7511213548481463, - "epoch": 0.5092536263163973, - "grad_norm": 3.4584226608276367, - "learning_rate": 3.54508731471194e-06, - "loss": 0.3507, - "mean_token_accuracy": 0.8762229889631271, - "num_tokens": 197685480.0, - "step": 164280 - }, - { - "entropy": 1.8804846301674842, - "epoch": 0.5092846254414469, - "grad_norm": 6.6632866859436035, - "learning_rate": 3.5449794212661747e-06, - "loss": 0.4756, - "mean_token_accuracy": 0.8547676905989647, - "num_tokens": 197697031.0, - "step": 164290 - }, - { - "entropy": 1.8086992830038071, - "epoch": 0.5093156245664966, - "grad_norm": 9.774088859558105, - "learning_rate": 3.544871537670904e-06, - "loss": 0.3817, - "mean_token_accuracy": 0.8651171505451203, - "num_tokens": 197710087.0, - "step": 164300 - }, - { - "entropy": 1.9255617767572404, - "epoch": 0.5093466236915463, - "grad_norm": 8.894806861877441, - "learning_rate": 3.5447636639246287e-06, - "loss": 0.465, - "mean_token_accuracy": 0.8540884301066398, - "num_tokens": 197722079.0, - "step": 164310 - }, - { - "entropy": 1.8370198220014573, - "epoch": 0.509377622816596, - "grad_norm": 7.935099124908447, - "learning_rate": 3.544655800025849e-06, - "loss": 0.4137, - "mean_token_accuracy": 0.8566048562526702, - "num_tokens": 197734150.0, - "step": 164320 - }, - { - "entropy": 1.8919256404042244, - "epoch": 0.5094086219416457, - "grad_norm": 9.719154357910156, - "learning_rate": 3.544547945973069e-06, - "loss": 0.4697, - "mean_token_accuracy": 0.8487145140767097, - "num_tokens": 197745210.0, - "step": 164330 - }, - { - "entropy": 1.870015025138855, - "epoch": 0.5094396210666954, - "grad_norm": 3.9895741939544678, - "learning_rate": 3.5444401017647894e-06, - "loss": 0.4344, - "mean_token_accuracy": 0.8562591806054115, - "num_tokens": 197757465.0, - "step": 164340 - }, - { - "entropy": 1.9672464281320572, - "epoch": 0.509470620191745, - "grad_norm": 8.094328880310059, - "learning_rate": 3.5443322673995123e-06, - "loss": 0.5217, - "mean_token_accuracy": 0.843452051281929, - "num_tokens": 197768213.0, - "step": 164350 - }, - { - "entropy": 1.9159003645181656, - "epoch": 0.5095016193167948, - "grad_norm": 6.670746326446533, - "learning_rate": 3.544224442875742e-06, - "loss": 0.4485, - "mean_token_accuracy": 0.8514717981219292, - "num_tokens": 197779125.0, - "step": 164360 - }, - { - "entropy": 1.9196984738111496, - "epoch": 0.5095326184418445, - "grad_norm": 7.664515495300293, - "learning_rate": 3.5441166281919807e-06, - "loss": 0.4429, - "mean_token_accuracy": 0.8598332405090332, - "num_tokens": 197789903.0, - "step": 164370 - }, - { - "entropy": 1.9413407146930695, - "epoch": 0.5095636175668942, - "grad_norm": 8.811555862426758, - "learning_rate": 3.544008823346732e-06, - "loss": 0.4799, - "mean_token_accuracy": 0.8528330892324447, - "num_tokens": 197800737.0, - "step": 164380 - }, - { - "entropy": 1.9213477104902268, - "epoch": 0.5095946166919438, - "grad_norm": 9.078819274902344, - "learning_rate": 3.543901028338499e-06, - "loss": 0.4674, - "mean_token_accuracy": 0.8492666840553283, - "num_tokens": 197811307.0, - "step": 164390 - }, - { - "entropy": 1.8556159757077695, - "epoch": 0.5096256158169936, - "grad_norm": 2.676553249359131, - "learning_rate": 3.543793243165787e-06, - "loss": 0.4248, - "mean_token_accuracy": 0.8604639634490013, - "num_tokens": 197823836.0, - "step": 164400 - }, - { - "entropy": 1.873467753827572, - "epoch": 0.5096566149420433, - "grad_norm": 6.844171047210693, - "learning_rate": 3.5436854678270993e-06, - "loss": 0.4209, - "mean_token_accuracy": 0.8624146491289139, - "num_tokens": 197835920.0, - "step": 164410 - }, - { - "entropy": 1.8581449389457703, - "epoch": 0.509687614067093, - "grad_norm": 8.513300895690918, - "learning_rate": 3.5435777023209413e-06, - "loss": 0.46, - "mean_token_accuracy": 0.8637478351593018, - "num_tokens": 197847504.0, - "step": 164420 - }, - { - "entropy": 1.9565213829278947, - "epoch": 0.5097186131921426, - "grad_norm": 6.896825790405273, - "learning_rate": 3.543469946645818e-06, - "loss": 0.4646, - "mean_token_accuracy": 0.8548880100250245, - "num_tokens": 197857870.0, - "step": 164430 - }, - { - "entropy": 1.9588289812207222, - "epoch": 0.5097496123171924, - "grad_norm": 3.6317381858825684, - "learning_rate": 3.543362200800234e-06, - "loss": 0.4967, - "mean_token_accuracy": 0.8448257014155388, - "num_tokens": 197869294.0, - "step": 164440 - }, - { - "entropy": 1.8617324098944663, - "epoch": 0.5097806114422421, - "grad_norm": 9.595418930053711, - "learning_rate": 3.5432544647826957e-06, - "loss": 0.4779, - "mean_token_accuracy": 0.8515863418579102, - "num_tokens": 197881334.0, - "step": 164450 - }, - { - "entropy": 1.922871397435665, - "epoch": 0.5098116105672917, - "grad_norm": 8.938125610351562, - "learning_rate": 3.543146738591709e-06, - "loss": 0.4793, - "mean_token_accuracy": 0.847772279381752, - "num_tokens": 197892219.0, - "step": 164460 - }, - { - "entropy": 1.8417800962924957, - "epoch": 0.5098426096923414, - "grad_norm": 4.177606582641602, - "learning_rate": 3.5430390222257797e-06, - "loss": 0.4315, - "mean_token_accuracy": 0.8512136593461037, - "num_tokens": 197905193.0, - "step": 164470 - }, - { - "entropy": 1.979746587574482, - "epoch": 0.5098736088173911, - "grad_norm": 8.593181610107422, - "learning_rate": 3.5429313156834156e-06, - "loss": 0.4925, - "mean_token_accuracy": 0.8451717495918274, - "num_tokens": 197917083.0, - "step": 164480 - }, - { - "entropy": 1.9483318507671357, - "epoch": 0.5099046079424409, - "grad_norm": 7.634199619293213, - "learning_rate": 3.5428236189631227e-06, - "loss": 0.457, - "mean_token_accuracy": 0.8457421749830246, - "num_tokens": 197928191.0, - "step": 164490 - }, - { - "entropy": 1.8038464456796646, - "epoch": 0.5099356070674905, - "grad_norm": 8.22901725769043, - "learning_rate": 3.5427159320634082e-06, - "loss": 0.3874, - "mean_token_accuracy": 0.8666422128677368, - "num_tokens": 197941195.0, - "step": 164500 - }, - { - "entropy": 1.8235293269157409, - "epoch": 0.5099666061925402, - "grad_norm": 3.2526485919952393, - "learning_rate": 3.542608254982779e-06, - "loss": 0.3726, - "mean_token_accuracy": 0.862106654047966, - "num_tokens": 197953781.0, - "step": 164510 - }, - { - "entropy": 1.838102599978447, - "epoch": 0.5099976053175899, - "grad_norm": 3.73408579826355, - "learning_rate": 3.5425005877197447e-06, - "loss": 0.4261, - "mean_token_accuracy": 0.8607692360877991, - "num_tokens": 197965978.0, - "step": 164520 - }, - { - "entropy": 1.8384420067071914, - "epoch": 0.5100286044426396, - "grad_norm": 5.095405578613281, - "learning_rate": 3.5423929302728125e-06, - "loss": 0.3859, - "mean_token_accuracy": 0.8745714992284774, - "num_tokens": 197979104.0, - "step": 164530 - }, - { - "entropy": 1.9089295566082, - "epoch": 0.5100596035676893, - "grad_norm": 8.513500213623047, - "learning_rate": 3.5422852826404913e-06, - "loss": 0.4551, - "mean_token_accuracy": 0.852853499352932, - "num_tokens": 197990177.0, - "step": 164540 - }, - { - "entropy": 1.8421483382582664, - "epoch": 0.510090602692739, - "grad_norm": 4.655264854431152, - "learning_rate": 3.542177644821289e-06, - "loss": 0.4233, - "mean_token_accuracy": 0.8579674288630486, - "num_tokens": 198002778.0, - "step": 164550 - }, - { - "entropy": 1.7636081084609032, - "epoch": 0.5101216018177887, - "grad_norm": 6.842883110046387, - "learning_rate": 3.5420700168137166e-06, - "loss": 0.3649, - "mean_token_accuracy": 0.8728412866592408, - "num_tokens": 198015645.0, - "step": 164560 - }, - { - "entropy": 1.907676276564598, - "epoch": 0.5101526009428384, - "grad_norm": 3.8116097450256348, - "learning_rate": 3.5419623986162816e-06, - "loss": 0.4072, - "mean_token_accuracy": 0.855129437148571, - "num_tokens": 198027888.0, - "step": 164570 - }, - { - "entropy": 1.8424773082137107, - "epoch": 0.5101836000678881, - "grad_norm": 8.173355102539062, - "learning_rate": 3.541854790227495e-06, - "loss": 0.4155, - "mean_token_accuracy": 0.8628433018922805, - "num_tokens": 198040245.0, - "step": 164580 - }, - { - "entropy": 1.8889279991388321, - "epoch": 0.5102145991929378, - "grad_norm": 4.406068325042725, - "learning_rate": 3.541747191645866e-06, - "loss": 0.4406, - "mean_token_accuracy": 0.8483342587947845, - "num_tokens": 198052398.0, - "step": 164590 - }, - { - "entropy": 1.7704968944191932, - "epoch": 0.5102455983179874, - "grad_norm": 7.529422760009766, - "learning_rate": 3.5416396028699058e-06, - "loss": 0.3776, - "mean_token_accuracy": 0.8711699575185776, - "num_tokens": 198065697.0, - "step": 164600 - }, - { - "entropy": 1.8590932220220566, - "epoch": 0.5102765974430372, - "grad_norm": 3.707942247390747, - "learning_rate": 3.5415320238981252e-06, - "loss": 0.3845, - "mean_token_accuracy": 0.8645050838589668, - "num_tokens": 198078312.0, - "step": 164610 - }, - { - "entropy": 1.91665877699852, - "epoch": 0.5103075965680869, - "grad_norm": 8.822848320007324, - "learning_rate": 3.541424454729035e-06, - "loss": 0.4654, - "mean_token_accuracy": 0.8502047717571258, - "num_tokens": 198089581.0, - "step": 164620 - }, - { - "entropy": 1.9822220832109452, - "epoch": 0.5103385956931366, - "grad_norm": 8.159029006958008, - "learning_rate": 3.5413168953611463e-06, - "loss": 0.5218, - "mean_token_accuracy": 0.8457861587405204, - "num_tokens": 198100607.0, - "step": 164630 - }, - { - "entropy": 1.8249846056103707, - "epoch": 0.5103695948181862, - "grad_norm": 7.530824184417725, - "learning_rate": 3.5412093457929706e-06, - "loss": 0.385, - "mean_token_accuracy": 0.8623237699270249, - "num_tokens": 198113464.0, - "step": 164640 - }, - { - "entropy": 1.8657769158482551, - "epoch": 0.510400593943236, - "grad_norm": 5.076999664306641, - "learning_rate": 3.5411018060230205e-06, - "loss": 0.4339, - "mean_token_accuracy": 0.8500089332461357, - "num_tokens": 198125673.0, - "step": 164650 - }, - { - "entropy": 1.8927562654018402, - "epoch": 0.5104315930682857, - "grad_norm": 7.442946910858154, - "learning_rate": 3.540994276049809e-06, - "loss": 0.4191, - "mean_token_accuracy": 0.8680774718523026, - "num_tokens": 198138294.0, - "step": 164660 - }, - { - "entropy": 1.9183226093649863, - "epoch": 0.5104625921933353, - "grad_norm": 9.387221336364746, - "learning_rate": 3.540886755871847e-06, - "loss": 0.469, - "mean_token_accuracy": 0.8459124505519867, - "num_tokens": 198150002.0, - "step": 164670 - }, - { - "entropy": 1.7814438581466674, - "epoch": 0.510493591318385, - "grad_norm": 7.187930583953857, - "learning_rate": 3.5407792454876488e-06, - "loss": 0.3887, - "mean_token_accuracy": 0.8671529695391655, - "num_tokens": 198162639.0, - "step": 164680 - }, - { - "entropy": 1.9143247455358505, - "epoch": 0.5105245904434348, - "grad_norm": 7.576807022094727, - "learning_rate": 3.5406717448957274e-06, - "loss": 0.4714, - "mean_token_accuracy": 0.8488384842872619, - "num_tokens": 198174173.0, - "step": 164690 - }, - { - "entropy": 1.8134226769208908, - "epoch": 0.5105555895684845, - "grad_norm": 2.628392457962036, - "learning_rate": 3.5405642540945955e-06, - "loss": 0.3778, - "mean_token_accuracy": 0.862903805077076, - "num_tokens": 198188038.0, - "step": 164700 - }, - { - "entropy": 1.9163833409547806, - "epoch": 0.5105865886935341, - "grad_norm": 8.192092895507812, - "learning_rate": 3.5404567730827683e-06, - "loss": 0.4252, - "mean_token_accuracy": 0.85711118131876, - "num_tokens": 198199655.0, - "step": 164710 - }, - { - "entropy": 1.916769452393055, - "epoch": 0.5106175878185838, - "grad_norm": 3.710341453552246, - "learning_rate": 3.540349301858759e-06, - "loss": 0.4449, - "mean_token_accuracy": 0.8497982263565064, - "num_tokens": 198211331.0, - "step": 164720 - }, - { - "entropy": 1.9329410195350647, - "epoch": 0.5106485869436335, - "grad_norm": 7.230764389038086, - "learning_rate": 3.5402418404210827e-06, - "loss": 0.4664, - "mean_token_accuracy": 0.8429116651415824, - "num_tokens": 198223089.0, - "step": 164730 - }, - { - "entropy": 1.9391484722495078, - "epoch": 0.5106795860686832, - "grad_norm": 8.100869178771973, - "learning_rate": 3.540134388768255e-06, - "loss": 0.4478, - "mean_token_accuracy": 0.8531536921858788, - "num_tokens": 198234375.0, - "step": 164740 - }, - { - "entropy": 1.9161068618297576, - "epoch": 0.5107105851937329, - "grad_norm": 3.7813363075256348, - "learning_rate": 3.5400269468987893e-06, - "loss": 0.4064, - "mean_token_accuracy": 0.8692640915513039, - "num_tokens": 198245859.0, - "step": 164750 - }, - { - "entropy": 1.861397238075733, - "epoch": 0.5107415843187826, - "grad_norm": 4.208311080932617, - "learning_rate": 3.5399195148112014e-06, - "loss": 0.4346, - "mean_token_accuracy": 0.8546230092644691, - "num_tokens": 198257420.0, - "step": 164760 - }, - { - "entropy": 1.9197211220860482, - "epoch": 0.5107725834438323, - "grad_norm": 9.448493003845215, - "learning_rate": 3.5398120925040085e-06, - "loss": 0.4794, - "mean_token_accuracy": 0.8522847130894661, - "num_tokens": 198268666.0, - "step": 164770 - }, - { - "entropy": 1.8664867907762528, - "epoch": 0.510803582568882, - "grad_norm": 9.399632453918457, - "learning_rate": 3.5397046799757255e-06, - "loss": 0.4372, - "mean_token_accuracy": 0.8565617486834526, - "num_tokens": 198280919.0, - "step": 164780 - }, - { - "entropy": 1.8348496824502945, - "epoch": 0.5108345816939317, - "grad_norm": 3.520848512649536, - "learning_rate": 3.539597277224869e-06, - "loss": 0.4182, - "mean_token_accuracy": 0.8544135078787803, - "num_tokens": 198293749.0, - "step": 164790 - }, - { - "entropy": 1.8871194452047348, - "epoch": 0.5108655808189814, - "grad_norm": 3.2471134662628174, - "learning_rate": 3.539489884249957e-06, - "loss": 0.4173, - "mean_token_accuracy": 0.8605234354734421, - "num_tokens": 198305524.0, - "step": 164800 - }, - { - "entropy": 1.8586633920669555, - "epoch": 0.510896579944031, - "grad_norm": 8.798843383789062, - "learning_rate": 3.5393825010495047e-06, - "loss": 0.4024, - "mean_token_accuracy": 0.8607447475194931, - "num_tokens": 198317700.0, - "step": 164810 - }, - { - "entropy": 1.7715517044067384, - "epoch": 0.5109275790690808, - "grad_norm": 8.043583869934082, - "learning_rate": 3.5392751276220303e-06, - "loss": 0.345, - "mean_token_accuracy": 0.8794338956475258, - "num_tokens": 198331081.0, - "step": 164820 - }, - { - "entropy": 1.8529795736074448, - "epoch": 0.5109585781941305, - "grad_norm": 7.2134013175964355, - "learning_rate": 3.5391677639660516e-06, - "loss": 0.4048, - "mean_token_accuracy": 0.8627293214201928, - "num_tokens": 198343316.0, - "step": 164830 - }, - { - "entropy": 1.7684592947363853, - "epoch": 0.5109895773191802, - "grad_norm": 3.8594398498535156, - "learning_rate": 3.5390604100800864e-06, - "loss": 0.4141, - "mean_token_accuracy": 0.8634788036346436, - "num_tokens": 198356311.0, - "step": 164840 - }, - { - "entropy": 1.8074128575623036, - "epoch": 0.5110205764442298, - "grad_norm": 7.467392444610596, - "learning_rate": 3.538953065962653e-06, - "loss": 0.4432, - "mean_token_accuracy": 0.8613016933202744, - "num_tokens": 198370124.0, - "step": 164850 - }, - { - "entropy": 1.9014081314206124, - "epoch": 0.5110515755692796, - "grad_norm": 8.128207206726074, - "learning_rate": 3.53884573161227e-06, - "loss": 0.4696, - "mean_token_accuracy": 0.8408779174089431, - "num_tokens": 198382346.0, - "step": 164860 - }, - { - "entropy": 1.8639238759875298, - "epoch": 0.5110825746943293, - "grad_norm": 8.85745906829834, - "learning_rate": 3.538738407027457e-06, - "loss": 0.4407, - "mean_token_accuracy": 0.8516306474804878, - "num_tokens": 198394541.0, - "step": 164870 - }, - { - "entropy": 1.8283547207713127, - "epoch": 0.5111135738193789, - "grad_norm": 8.482251167297363, - "learning_rate": 3.5386310922067324e-06, - "loss": 0.3967, - "mean_token_accuracy": 0.8613016873598098, - "num_tokens": 198407068.0, - "step": 164880 - }, - { - "entropy": 1.8640737235546112, - "epoch": 0.5111445729444286, - "grad_norm": 9.156514167785645, - "learning_rate": 3.5385237871486154e-06, - "loss": 0.4243, - "mean_token_accuracy": 0.8556307137012482, - "num_tokens": 198419070.0, - "step": 164890 - }, - { - "entropy": 1.784039095044136, - "epoch": 0.5111755720694784, - "grad_norm": 3.7771859169006348, - "learning_rate": 3.5384164918516278e-06, - "loss": 0.3541, - "mean_token_accuracy": 0.873118770122528, - "num_tokens": 198431781.0, - "step": 164900 - }, - { - "entropy": 1.917399762570858, - "epoch": 0.5112065711945281, - "grad_norm": 9.159950256347656, - "learning_rate": 3.5383092063142884e-06, - "loss": 0.504, - "mean_token_accuracy": 0.8393927633762359, - "num_tokens": 198443784.0, - "step": 164910 - }, - { - "entropy": 1.8153990522027015, - "epoch": 0.5112375703195777, - "grad_norm": 3.03525447845459, - "learning_rate": 3.538201930535117e-06, - "loss": 0.4143, - "mean_token_accuracy": 0.8660739451646805, - "num_tokens": 198456000.0, - "step": 164920 - }, - { - "entropy": 1.878081302344799, - "epoch": 0.5112685694446274, - "grad_norm": 7.645351409912109, - "learning_rate": 3.5380946645126355e-06, - "loss": 0.4463, - "mean_token_accuracy": 0.8555167511105537, - "num_tokens": 198467146.0, - "step": 164930 - }, - { - "entropy": 1.8525796994566917, - "epoch": 0.5112995685696771, - "grad_norm": 4.232410907745361, - "learning_rate": 3.537987408245366e-06, - "loss": 0.3904, - "mean_token_accuracy": 0.8579295039176941, - "num_tokens": 198479522.0, - "step": 164940 - }, - { - "entropy": 1.9068205669522285, - "epoch": 0.5113305676947268, - "grad_norm": 7.772948265075684, - "learning_rate": 3.537880161731828e-06, - "loss": 0.4729, - "mean_token_accuracy": 0.8462733164429664, - "num_tokens": 198491028.0, - "step": 164950 - }, - { - "entropy": 1.8264345929026604, - "epoch": 0.5113615668197765, - "grad_norm": 7.550524711608887, - "learning_rate": 3.5377729249705438e-06, - "loss": 0.3998, - "mean_token_accuracy": 0.867035549879074, - "num_tokens": 198503289.0, - "step": 164960 - }, - { - "entropy": 1.8178565993905067, - "epoch": 0.5113925659448262, - "grad_norm": 7.000597953796387, - "learning_rate": 3.5376656979600365e-06, - "loss": 0.3955, - "mean_token_accuracy": 0.8724155217409134, - "num_tokens": 198515438.0, - "step": 164970 - }, - { - "entropy": 1.9243502393364906, - "epoch": 0.5114235650698759, - "grad_norm": 7.471307277679443, - "learning_rate": 3.537558480698827e-06, - "loss": 0.4723, - "mean_token_accuracy": 0.848363322019577, - "num_tokens": 198526507.0, - "step": 164980 - }, - { - "entropy": 1.847606810927391, - "epoch": 0.5114545641949256, - "grad_norm": 8.920063972473145, - "learning_rate": 3.5374512731854394e-06, - "loss": 0.4407, - "mean_token_accuracy": 0.8580059990286827, - "num_tokens": 198538844.0, - "step": 164990 - }, - { - "entropy": 1.8540842577815055, - "epoch": 0.5114855633199753, - "grad_norm": 7.94528341293335, - "learning_rate": 3.5373440754183965e-06, - "loss": 0.4417, - "mean_token_accuracy": 0.8460657671093941, - "num_tokens": 198550727.0, - "step": 165000 - }, - { - "entropy": 1.8856645733118058, - "epoch": 0.511516562445025, - "grad_norm": 4.576896667480469, - "learning_rate": 3.53723688739622e-06, - "loss": 0.463, - "mean_token_accuracy": 0.8499826610088348, - "num_tokens": 198562724.0, - "step": 165010 - }, - { - "entropy": 1.927583736181259, - "epoch": 0.5115475615700746, - "grad_norm": 9.744261741638184, - "learning_rate": 3.5371297091174357e-06, - "loss": 0.4734, - "mean_token_accuracy": 0.8518247798085212, - "num_tokens": 198574075.0, - "step": 165020 - }, - { - "entropy": 1.8654179081320763, - "epoch": 0.5115785606951244, - "grad_norm": 4.000446796417236, - "learning_rate": 3.5370225405805663e-06, - "loss": 0.4602, - "mean_token_accuracy": 0.8373050197958947, - "num_tokens": 198586300.0, - "step": 165030 - }, - { - "entropy": 1.8400757029652595, - "epoch": 0.5116095598201741, - "grad_norm": 4.41628360748291, - "learning_rate": 3.5369153817841367e-06, - "loss": 0.3661, - "mean_token_accuracy": 0.8663878023624421, - "num_tokens": 198599012.0, - "step": 165040 - }, - { - "entropy": 1.93585607111454, - "epoch": 0.5116405589452238, - "grad_norm": 7.442049980163574, - "learning_rate": 3.5368082327266712e-06, - "loss": 0.4947, - "mean_token_accuracy": 0.8513322427868844, - "num_tokens": 198610777.0, - "step": 165050 - }, - { - "entropy": 1.8586627542972565, - "epoch": 0.5116715580702734, - "grad_norm": 4.374122619628906, - "learning_rate": 3.536701093406695e-06, - "loss": 0.414, - "mean_token_accuracy": 0.8645623952150345, - "num_tokens": 198623111.0, - "step": 165060 - }, - { - "entropy": 1.8560616582632066, - "epoch": 0.5117025571953232, - "grad_norm": 4.863686561584473, - "learning_rate": 3.5365939638227324e-06, - "loss": 0.4074, - "mean_token_accuracy": 0.861749242246151, - "num_tokens": 198635806.0, - "step": 165070 - }, - { - "entropy": 1.8346017554402352, - "epoch": 0.5117335563203729, - "grad_norm": 10.877781867980957, - "learning_rate": 3.53648684397331e-06, - "loss": 0.4191, - "mean_token_accuracy": 0.859768570959568, - "num_tokens": 198647986.0, - "step": 165080 - }, - { - "entropy": 1.8296894863247872, - "epoch": 0.5117645554454225, - "grad_norm": 6.875624656677246, - "learning_rate": 3.536379733856953e-06, - "loss": 0.4016, - "mean_token_accuracy": 0.8574081555008888, - "num_tokens": 198660724.0, - "step": 165090 - }, - { - "entropy": 1.8089259415864944, - "epoch": 0.5117955545704722, - "grad_norm": 4.244149684906006, - "learning_rate": 3.5362726334721887e-06, - "loss": 0.4045, - "mean_token_accuracy": 0.8540760070085526, - "num_tokens": 198674278.0, - "step": 165100 - }, - { - "entropy": 1.8661685228347777, - "epoch": 0.511826553695522, - "grad_norm": 7.697582244873047, - "learning_rate": 3.5361655428175417e-06, - "loss": 0.4136, - "mean_token_accuracy": 0.8633060559630394, - "num_tokens": 198686853.0, - "step": 165110 - }, - { - "entropy": 1.8793710887432098, - "epoch": 0.5118575528205717, - "grad_norm": 3.8460631370544434, - "learning_rate": 3.5360584618915406e-06, - "loss": 0.3943, - "mean_token_accuracy": 0.8673370242118835, - "num_tokens": 198699128.0, - "step": 165120 - }, - { - "entropy": 1.848849655687809, - "epoch": 0.5118885519456213, - "grad_norm": 9.106633186340332, - "learning_rate": 3.5359513906927113e-06, - "loss": 0.3761, - "mean_token_accuracy": 0.8695919916033745, - "num_tokens": 198711717.0, - "step": 165130 - }, - { - "entropy": 1.8800451412796975, - "epoch": 0.511919551070671, - "grad_norm": 7.916536808013916, - "learning_rate": 3.5358443292195817e-06, - "loss": 0.4699, - "mean_token_accuracy": 0.8574667185544967, - "num_tokens": 198723408.0, - "step": 165140 - }, - { - "entropy": 1.8587745755910874, - "epoch": 0.5119505501957208, - "grad_norm": 10.332902908325195, - "learning_rate": 3.5357372774706783e-06, - "loss": 0.4105, - "mean_token_accuracy": 0.8618461728096009, - "num_tokens": 198735665.0, - "step": 165150 - }, - { - "entropy": 1.8222912192344665, - "epoch": 0.5119815493207704, - "grad_norm": 3.885300397872925, - "learning_rate": 3.535630235444531e-06, - "loss": 0.4034, - "mean_token_accuracy": 0.8571152433753013, - "num_tokens": 198748298.0, - "step": 165160 - }, - { - "entropy": 1.9119007468223572, - "epoch": 0.5120125484458201, - "grad_norm": 9.721302032470703, - "learning_rate": 3.5355232031396675e-06, - "loss": 0.4826, - "mean_token_accuracy": 0.8569756045937538, - "num_tokens": 198758950.0, - "step": 165170 - }, - { - "entropy": 1.8979253500699997, - "epoch": 0.5120435475708698, - "grad_norm": 7.7287917137146, - "learning_rate": 3.5354161805546155e-06, - "loss": 0.4471, - "mean_token_accuracy": 0.8547446504235268, - "num_tokens": 198770387.0, - "step": 165180 - }, - { - "entropy": 1.8123277112841607, - "epoch": 0.5120745466959195, - "grad_norm": 4.713539123535156, - "learning_rate": 3.5353091676879057e-06, - "loss": 0.3891, - "mean_token_accuracy": 0.8553282573819161, - "num_tokens": 198783402.0, - "step": 165190 - }, - { - "entropy": 1.8285948097705842, - "epoch": 0.5121055458209692, - "grad_norm": 8.289105415344238, - "learning_rate": 3.535202164538066e-06, - "loss": 0.4239, - "mean_token_accuracy": 0.8569412216544151, - "num_tokens": 198796000.0, - "step": 165200 - }, - { - "entropy": 1.905951689183712, - "epoch": 0.5121365449460189, - "grad_norm": 9.26884937286377, - "learning_rate": 3.5350951711036262e-06, - "loss": 0.4539, - "mean_token_accuracy": 0.8543861001729965, - "num_tokens": 198807335.0, - "step": 165210 - }, - { - "entropy": 1.8662221506237984, - "epoch": 0.5121675440710686, - "grad_norm": 9.472772598266602, - "learning_rate": 3.5349881873831165e-06, - "loss": 0.4737, - "mean_token_accuracy": 0.8481184557080269, - "num_tokens": 198819688.0, - "step": 165220 - }, - { - "entropy": 1.8440553843975067, - "epoch": 0.5121985431961182, - "grad_norm": 8.241409301757812, - "learning_rate": 3.5348812133750676e-06, - "loss": 0.383, - "mean_token_accuracy": 0.8684048295021057, - "num_tokens": 198831562.0, - "step": 165230 - }, - { - "entropy": 1.8263012327253818, - "epoch": 0.512229542321168, - "grad_norm": 4.3429670333862305, - "learning_rate": 3.534774249078009e-06, - "loss": 0.3925, - "mean_token_accuracy": 0.8637837857007981, - "num_tokens": 198845431.0, - "step": 165240 - }, - { - "entropy": 1.831053911149502, - "epoch": 0.5122605414462177, - "grad_norm": 3.6324291229248047, - "learning_rate": 3.5346672944904727e-06, - "loss": 0.4284, - "mean_token_accuracy": 0.8589798659086227, - "num_tokens": 198858159.0, - "step": 165250 - }, - { - "entropy": 1.8699246987700462, - "epoch": 0.5122915405712674, - "grad_norm": 7.301723480224609, - "learning_rate": 3.534560349610988e-06, - "loss": 0.4003, - "mean_token_accuracy": 0.8703466773033142, - "num_tokens": 198870055.0, - "step": 165260 - }, - { - "entropy": 1.85399060100317, - "epoch": 0.512322539696317, - "grad_norm": 7.461488723754883, - "learning_rate": 3.5344534144380875e-06, - "loss": 0.4328, - "mean_token_accuracy": 0.862822137773037, - "num_tokens": 198881372.0, - "step": 165270 - }, - { - "entropy": 1.9233081370592118, - "epoch": 0.5123535388213668, - "grad_norm": 9.688029289245605, - "learning_rate": 3.534346488970303e-06, - "loss": 0.498, - "mean_token_accuracy": 0.838792422413826, - "num_tokens": 198892477.0, - "step": 165280 - }, - { - "entropy": 1.8923705786466598, - "epoch": 0.5123845379464165, - "grad_norm": 8.833309173583984, - "learning_rate": 3.534239573206167e-06, - "loss": 0.4379, - "mean_token_accuracy": 0.8576890960335731, - "num_tokens": 198904123.0, - "step": 165290 - }, - { - "entropy": 1.9048192888498305, - "epoch": 0.5124155370714661, - "grad_norm": 7.9789934158325195, - "learning_rate": 3.534132667144212e-06, - "loss": 0.4729, - "mean_token_accuracy": 0.8539910644292832, - "num_tokens": 198914964.0, - "step": 165300 - }, - { - "entropy": 1.8376499265432358, - "epoch": 0.5124465361965158, - "grad_norm": 4.213772296905518, - "learning_rate": 3.534025770782969e-06, - "loss": 0.3961, - "mean_token_accuracy": 0.8648659616708756, - "num_tokens": 198927131.0, - "step": 165310 - }, - { - "entropy": 1.8888862892985343, - "epoch": 0.5124775353215656, - "grad_norm": 7.217865467071533, - "learning_rate": 3.533918884120972e-06, - "loss": 0.4491, - "mean_token_accuracy": 0.8601428523659707, - "num_tokens": 198938821.0, - "step": 165320 - }, - { - "entropy": 1.8831818588078022, - "epoch": 0.5125085344466153, - "grad_norm": 3.8709065914154053, - "learning_rate": 3.533812007156755e-06, - "loss": 0.4356, - "mean_token_accuracy": 0.8554113984107972, - "num_tokens": 198951829.0, - "step": 165330 - }, - { - "entropy": 1.7507137194275857, - "epoch": 0.5125395335716649, - "grad_norm": 4.336053848266602, - "learning_rate": 3.5337051398888504e-06, - "loss": 0.3983, - "mean_token_accuracy": 0.8600828319787979, - "num_tokens": 198965766.0, - "step": 165340 - }, - { - "entropy": 1.929706057906151, - "epoch": 0.5125705326967146, - "grad_norm": 3.9890756607055664, - "learning_rate": 3.5335982823157936e-06, - "loss": 0.4921, - "mean_token_accuracy": 0.8485506847500801, - "num_tokens": 198977381.0, - "step": 165350 - }, - { - "entropy": 1.9037581861019135, - "epoch": 0.5126015318217644, - "grad_norm": 7.280956745147705, - "learning_rate": 3.5334914344361184e-06, - "loss": 0.4437, - "mean_token_accuracy": 0.8539852678775788, - "num_tokens": 198988697.0, - "step": 165360 - }, - { - "entropy": 1.826684795320034, - "epoch": 0.512632530946814, - "grad_norm": 2.034179210662842, - "learning_rate": 3.533384596248358e-06, - "loss": 0.4377, - "mean_token_accuracy": 0.8617399200797081, - "num_tokens": 199001031.0, - "step": 165370 - }, - { - "entropy": 1.943645280599594, - "epoch": 0.5126635300718637, - "grad_norm": 9.540709495544434, - "learning_rate": 3.533277767751049e-06, - "loss": 0.4885, - "mean_token_accuracy": 0.846565519273281, - "num_tokens": 199012527.0, - "step": 165380 - }, - { - "entropy": 1.8760394856333733, - "epoch": 0.5126945291969134, - "grad_norm": 9.603574752807617, - "learning_rate": 3.533170948942725e-06, - "loss": 0.459, - "mean_token_accuracy": 0.8549725398421287, - "num_tokens": 199024645.0, - "step": 165390 - }, - { - "entropy": 1.9340546056628227, - "epoch": 0.5127255283219632, - "grad_norm": 8.026835441589355, - "learning_rate": 3.5330641398219232e-06, - "loss": 0.4585, - "mean_token_accuracy": 0.8500248864293098, - "num_tokens": 199036414.0, - "step": 165400 - }, - { - "entropy": 1.8972145736217498, - "epoch": 0.5127565274470128, - "grad_norm": 8.733002662658691, - "learning_rate": 3.532957340387178e-06, - "loss": 0.4699, - "mean_token_accuracy": 0.856229642033577, - "num_tokens": 199048689.0, - "step": 165410 - }, - { - "entropy": 1.8412564128637314, - "epoch": 0.5127875265720625, - "grad_norm": 8.106005668640137, - "learning_rate": 3.5328505506370264e-06, - "loss": 0.405, - "mean_token_accuracy": 0.868453860282898, - "num_tokens": 199061006.0, - "step": 165420 - }, - { - "entropy": 1.881354607641697, - "epoch": 0.5128185256971122, - "grad_norm": 9.300365447998047, - "learning_rate": 3.5327437705700047e-06, - "loss": 0.4336, - "mean_token_accuracy": 0.8567851424217224, - "num_tokens": 199072950.0, - "step": 165430 - }, - { - "entropy": 1.9285381063818932, - "epoch": 0.5128495248221618, - "grad_norm": 9.102884292602539, - "learning_rate": 3.5326370001846483e-06, - "loss": 0.4798, - "mean_token_accuracy": 0.8458780542016029, - "num_tokens": 199083759.0, - "step": 165440 - }, - { - "entropy": 1.8580153673887252, - "epoch": 0.5128805239472116, - "grad_norm": 4.1997551918029785, - "learning_rate": 3.532530239479497e-06, - "loss": 0.3904, - "mean_token_accuracy": 0.8655030012130738, - "num_tokens": 199095203.0, - "step": 165450 - }, - { - "entropy": 1.7688224032521247, - "epoch": 0.5129115230722613, - "grad_norm": 9.050981521606445, - "learning_rate": 3.5324234884530855e-06, - "loss": 0.3636, - "mean_token_accuracy": 0.8712503552436829, - "num_tokens": 199109013.0, - "step": 165460 - }, - { - "entropy": 1.913888046145439, - "epoch": 0.512942522197311, - "grad_norm": 6.518311977386475, - "learning_rate": 3.532316747103952e-06, - "loss": 0.4783, - "mean_token_accuracy": 0.8588741257786751, - "num_tokens": 199120388.0, - "step": 165470 - }, - { - "entropy": 1.9045846730470657, - "epoch": 0.5129735213223606, - "grad_norm": 9.379058837890625, - "learning_rate": 3.5322100154306356e-06, - "loss": 0.435, - "mean_token_accuracy": 0.8588658139109612, - "num_tokens": 199132796.0, - "step": 165480 - }, - { - "entropy": 1.8787872701883317, - "epoch": 0.5130045204474104, - "grad_norm": 9.631776809692383, - "learning_rate": 3.532103293431674e-06, - "loss": 0.4292, - "mean_token_accuracy": 0.8609980300068856, - "num_tokens": 199145020.0, - "step": 165490 - }, - { - "entropy": 1.8990243777632714, - "epoch": 0.5130355195724601, - "grad_norm": 8.644235610961914, - "learning_rate": 3.5319965811056055e-06, - "loss": 0.4284, - "mean_token_accuracy": 0.8497151508927345, - "num_tokens": 199157072.0, - "step": 165500 - }, - { - "entropy": 1.964458554983139, - "epoch": 0.5130665186975097, - "grad_norm": 7.992344379425049, - "learning_rate": 3.531889878450969e-06, - "loss": 0.4772, - "mean_token_accuracy": 0.8534716755151749, - "num_tokens": 199168402.0, - "step": 165510 - }, - { - "entropy": 1.9405386328697205, - "epoch": 0.5130975178225594, - "grad_norm": 7.881420612335205, - "learning_rate": 3.5317831854663044e-06, - "loss": 0.48, - "mean_token_accuracy": 0.8552601054310799, - "num_tokens": 199179455.0, - "step": 165520 - }, - { - "entropy": 1.8967503055930137, - "epoch": 0.5131285169476092, - "grad_norm": 6.660313129425049, - "learning_rate": 3.5316765021501502e-06, - "loss": 0.414, - "mean_token_accuracy": 0.8643925070762635, - "num_tokens": 199191263.0, - "step": 165530 - }, - { - "entropy": 1.8865905031561852, - "epoch": 0.5131595160726589, - "grad_norm": 7.316429615020752, - "learning_rate": 3.5315698285010475e-06, - "loss": 0.4497, - "mean_token_accuracy": 0.84514549523592, - "num_tokens": 199203119.0, - "step": 165540 - }, - { - "entropy": 1.925460186600685, - "epoch": 0.5131905151977085, - "grad_norm": 8.755541801452637, - "learning_rate": 3.531463164517535e-06, - "loss": 0.4387, - "mean_token_accuracy": 0.8621005952358246, - "num_tokens": 199214608.0, - "step": 165550 - }, - { - "entropy": 1.9230826959013938, - "epoch": 0.5132215143227582, - "grad_norm": 4.1020636558532715, - "learning_rate": 3.531356510198154e-06, - "loss": 0.4091, - "mean_token_accuracy": 0.8598949074745178, - "num_tokens": 199226632.0, - "step": 165560 - }, - { - "entropy": 1.801899181306362, - "epoch": 0.513252513447808, - "grad_norm": 8.648043632507324, - "learning_rate": 3.5312498655414447e-06, - "loss": 0.37, - "mean_token_accuracy": 0.8700105547904968, - "num_tokens": 199239560.0, - "step": 165570 - }, - { - "entropy": 1.8442170739173889, - "epoch": 0.5132835125728576, - "grad_norm": 3.063127040863037, - "learning_rate": 3.531143230545949e-06, - "loss": 0.3955, - "mean_token_accuracy": 0.8664918586611747, - "num_tokens": 199251263.0, - "step": 165580 - }, - { - "entropy": 1.784967178106308, - "epoch": 0.5133145116979073, - "grad_norm": 8.657151222229004, - "learning_rate": 3.5310366052102076e-06, - "loss": 0.3843, - "mean_token_accuracy": 0.8673649072647095, - "num_tokens": 199263888.0, - "step": 165590 - }, - { - "entropy": 1.7379182785749436, - "epoch": 0.513345510822957, - "grad_norm": 3.7212421894073486, - "learning_rate": 3.530929989532762e-06, - "loss": 0.3682, - "mean_token_accuracy": 0.8674047499895096, - "num_tokens": 199277870.0, - "step": 165600 - }, - { - "entropy": 1.749285961687565, - "epoch": 0.5133765099480068, - "grad_norm": 2.982008934020996, - "learning_rate": 3.5308233835121555e-06, - "loss": 0.3672, - "mean_token_accuracy": 0.8672578066587449, - "num_tokens": 199292780.0, - "step": 165610 - }, - { - "entropy": 1.8962912052869796, - "epoch": 0.5134075090730564, - "grad_norm": 8.388097763061523, - "learning_rate": 3.530716787146928e-06, - "loss": 0.4696, - "mean_token_accuracy": 0.8507509827613831, - "num_tokens": 199304340.0, - "step": 165620 - }, - { - "entropy": 1.835693357884884, - "epoch": 0.5134385081981061, - "grad_norm": 8.318591117858887, - "learning_rate": 3.5306102004356242e-06, - "loss": 0.3864, - "mean_token_accuracy": 0.8652938485145569, - "num_tokens": 199317295.0, - "step": 165630 - }, - { - "entropy": 1.8470296934247017, - "epoch": 0.5134695073231558, - "grad_norm": 8.046967506408691, - "learning_rate": 3.5305036233767865e-06, - "loss": 0.4212, - "mean_token_accuracy": 0.8550007939338684, - "num_tokens": 199330149.0, - "step": 165640 - }, - { - "entropy": 1.8552626207470895, - "epoch": 0.5135005064482056, - "grad_norm": 7.412757396697998, - "learning_rate": 3.5303970559689575e-06, - "loss": 0.4076, - "mean_token_accuracy": 0.8597292914986611, - "num_tokens": 199343185.0, - "step": 165650 - }, - { - "entropy": 1.7761410742998123, - "epoch": 0.5135315055732552, - "grad_norm": 8.163668632507324, - "learning_rate": 3.5302904982106816e-06, - "loss": 0.3737, - "mean_token_accuracy": 0.8719475150108338, - "num_tokens": 199356286.0, - "step": 165660 - }, - { - "entropy": 1.8710751131176948, - "epoch": 0.5135625046983049, - "grad_norm": 4.116541385650635, - "learning_rate": 3.530183950100502e-06, - "loss": 0.4045, - "mean_token_accuracy": 0.8619105949997902, - "num_tokens": 199368052.0, - "step": 165670 - }, - { - "entropy": 1.94207361638546, - "epoch": 0.5135935038233546, - "grad_norm": 7.243650913238525, - "learning_rate": 3.530077411636963e-06, - "loss": 0.5017, - "mean_token_accuracy": 0.843024018406868, - "num_tokens": 199379570.0, - "step": 165680 - }, - { - "entropy": 1.9412450224161149, - "epoch": 0.5136245029484042, - "grad_norm": 7.0396013259887695, - "learning_rate": 3.529970882818609e-06, - "loss": 0.4883, - "mean_token_accuracy": 0.8452283650636673, - "num_tokens": 199390522.0, - "step": 165690 - }, - { - "entropy": 1.8934280395507812, - "epoch": 0.513655502073454, - "grad_norm": 9.029003143310547, - "learning_rate": 3.529864363643985e-06, - "loss": 0.4384, - "mean_token_accuracy": 0.855101665854454, - "num_tokens": 199402784.0, - "step": 165700 - }, - { - "entropy": 1.9106672063469887, - "epoch": 0.5136865011985037, - "grad_norm": 9.377785682678223, - "learning_rate": 3.529757854111636e-06, - "loss": 0.4417, - "mean_token_accuracy": 0.8599131286144257, - "num_tokens": 199413917.0, - "step": 165710 - }, - { - "entropy": 1.8733697950839996, - "epoch": 0.5137175003235533, - "grad_norm": 8.300299644470215, - "learning_rate": 3.529651354220107e-06, - "loss": 0.4217, - "mean_token_accuracy": 0.8556776940822601, - "num_tokens": 199426429.0, - "step": 165720 - }, - { - "entropy": 1.9104059755802154, - "epoch": 0.513748499448603, - "grad_norm": 3.4120521545410156, - "learning_rate": 3.5295448639679436e-06, - "loss": 0.4907, - "mean_token_accuracy": 0.8444541275501252, - "num_tokens": 199438710.0, - "step": 165730 - }, - { - "entropy": 1.8936485648155212, - "epoch": 0.5137794985736528, - "grad_norm": 3.6589162349700928, - "learning_rate": 3.529438383353692e-06, - "loss": 0.4618, - "mean_token_accuracy": 0.851848429441452, - "num_tokens": 199449881.0, - "step": 165740 - }, - { - "entropy": 1.9428910300135613, - "epoch": 0.5138104976987025, - "grad_norm": 8.218547821044922, - "learning_rate": 3.529331912375899e-06, - "loss": 0.466, - "mean_token_accuracy": 0.8518887877464294, - "num_tokens": 199460975.0, - "step": 165750 - }, - { - "entropy": 1.8341187611222267, - "epoch": 0.5138414968237521, - "grad_norm": 4.086446762084961, - "learning_rate": 3.529225451033111e-06, - "loss": 0.4022, - "mean_token_accuracy": 0.8616880744695663, - "num_tokens": 199473647.0, - "step": 165760 - }, - { - "entropy": 1.8468362540006638, - "epoch": 0.5138724959488018, - "grad_norm": 5.933652400970459, - "learning_rate": 3.529118999323874e-06, - "loss": 0.4822, - "mean_token_accuracy": 0.8481451705098152, - "num_tokens": 199485712.0, - "step": 165770 - }, - { - "entropy": 1.9439073607325554, - "epoch": 0.5139034950738516, - "grad_norm": 8.484124183654785, - "learning_rate": 3.529012557246736e-06, - "loss": 0.506, - "mean_token_accuracy": 0.8435920670628547, - "num_tokens": 199497344.0, - "step": 165780 - }, - { - "entropy": 1.9494817286729813, - "epoch": 0.5139344941989012, - "grad_norm": 7.470255374908447, - "learning_rate": 3.528906124800245e-06, - "loss": 0.5036, - "mean_token_accuracy": 0.8448470056056976, - "num_tokens": 199508098.0, - "step": 165790 - }, - { - "entropy": 1.8470247462391853, - "epoch": 0.5139654933239509, - "grad_norm": 8.678824424743652, - "learning_rate": 3.528799701982948e-06, - "loss": 0.4463, - "mean_token_accuracy": 0.852811923623085, - "num_tokens": 199520795.0, - "step": 165800 - }, - { - "entropy": 1.8924536779522896, - "epoch": 0.5139964924490006, - "grad_norm": 7.182823657989502, - "learning_rate": 3.528693288793393e-06, - "loss": 0.4289, - "mean_token_accuracy": 0.8532521143555641, - "num_tokens": 199532460.0, - "step": 165810 - }, - { - "entropy": 1.7975892141461371, - "epoch": 0.5140274915740504, - "grad_norm": 9.009475708007812, - "learning_rate": 3.5285868852301284e-06, - "loss": 0.4233, - "mean_token_accuracy": 0.8550382882356644, - "num_tokens": 199545495.0, - "step": 165820 - }, - { - "entropy": 1.9081748649477959, - "epoch": 0.5140584906991, - "grad_norm": 7.362851142883301, - "learning_rate": 3.5284804912917044e-06, - "loss": 0.4172, - "mean_token_accuracy": 0.86656956076622, - "num_tokens": 199556576.0, - "step": 165830 - }, - { - "entropy": 1.8292412385344505, - "epoch": 0.5140894898241497, - "grad_norm": 7.867893218994141, - "learning_rate": 3.5283741069766682e-06, - "loss": 0.4065, - "mean_token_accuracy": 0.8617888554930687, - "num_tokens": 199568805.0, - "step": 165840 - }, - { - "entropy": 1.8667304873466493, - "epoch": 0.5141204889491994, - "grad_norm": 9.72758960723877, - "learning_rate": 3.5282677322835697e-06, - "loss": 0.4575, - "mean_token_accuracy": 0.8470277801156044, - "num_tokens": 199580316.0, - "step": 165850 - }, - { - "entropy": 1.8580066189169884, - "epoch": 0.5141514880742492, - "grad_norm": 7.151798725128174, - "learning_rate": 3.528161367210959e-06, - "loss": 0.4803, - "mean_token_accuracy": 0.8555332809686661, - "num_tokens": 199592287.0, - "step": 165860 - }, - { - "entropy": 1.8838511526584625, - "epoch": 0.5141824871992988, - "grad_norm": 9.418664932250977, - "learning_rate": 3.528055011757387e-06, - "loss": 0.5016, - "mean_token_accuracy": 0.839755679666996, - "num_tokens": 199604689.0, - "step": 165870 - }, - { - "entropy": 1.8362286701798438, - "epoch": 0.5142134863243485, - "grad_norm": 9.759264945983887, - "learning_rate": 3.527948665921401e-06, - "loss": 0.3988, - "mean_token_accuracy": 0.8611894577741623, - "num_tokens": 199617535.0, - "step": 165880 - }, - { - "entropy": 1.8877715915441513, - "epoch": 0.5142444854493982, - "grad_norm": 5.549670219421387, - "learning_rate": 3.5278423297015547e-06, - "loss": 0.4641, - "mean_token_accuracy": 0.8523763597011567, - "num_tokens": 199629442.0, - "step": 165890 - }, - { - "entropy": 1.9229825258255004, - "epoch": 0.5142754845744479, - "grad_norm": 7.350560188293457, - "learning_rate": 3.527736003096397e-06, - "loss": 0.4548, - "mean_token_accuracy": 0.8494596913456917, - "num_tokens": 199641510.0, - "step": 165900 - }, - { - "entropy": 1.9444432079792022, - "epoch": 0.5143064836994976, - "grad_norm": 8.653480529785156, - "learning_rate": 3.52762968610448e-06, - "loss": 0.4827, - "mean_token_accuracy": 0.8489970117807388, - "num_tokens": 199652436.0, - "step": 165910 - }, - { - "entropy": 1.8965161561965942, - "epoch": 0.5143374828245473, - "grad_norm": 8.42867660522461, - "learning_rate": 3.527523378724355e-06, - "loss": 0.4431, - "mean_token_accuracy": 0.8526179268956184, - "num_tokens": 199664868.0, - "step": 165920 - }, - { - "entropy": 1.9393030047416686, - "epoch": 0.514368481949597, - "grad_norm": 8.376232147216797, - "learning_rate": 3.527417080954574e-06, - "loss": 0.4895, - "mean_token_accuracy": 0.8461043626070023, - "num_tokens": 199675872.0, - "step": 165930 - }, - { - "entropy": 1.938021996617317, - "epoch": 0.5143994810746466, - "grad_norm": 6.738912582397461, - "learning_rate": 3.527310792793688e-06, - "loss": 0.4515, - "mean_token_accuracy": 0.8560506254434586, - "num_tokens": 199687256.0, - "step": 165940 - }, - { - "entropy": 1.8744206488132478, - "epoch": 0.5144304801996964, - "grad_norm": 3.8372957706451416, - "learning_rate": 3.5272045142402507e-06, - "loss": 0.4269, - "mean_token_accuracy": 0.8504655987024308, - "num_tokens": 199699428.0, - "step": 165950 - }, - { - "entropy": 1.7658538281917573, - "epoch": 0.5144614793247461, - "grad_norm": 8.591163635253906, - "learning_rate": 3.527098245292814e-06, - "loss": 0.3913, - "mean_token_accuracy": 0.8597464576363564, - "num_tokens": 199713480.0, - "step": 165960 - }, - { - "entropy": 1.8621069818735123, - "epoch": 0.5144924784497957, - "grad_norm": 8.060001373291016, - "learning_rate": 3.5269919859499325e-06, - "loss": 0.4215, - "mean_token_accuracy": 0.8680314958095551, - "num_tokens": 199725158.0, - "step": 165970 - }, - { - "entropy": 1.985747216641903, - "epoch": 0.5145234775748454, - "grad_norm": 8.78643798828125, - "learning_rate": 3.5268857362101573e-06, - "loss": 0.5117, - "mean_token_accuracy": 0.8432222366333008, - "num_tokens": 199736883.0, - "step": 165980 - }, - { - "entropy": 1.90114316791296, - "epoch": 0.5145544766998952, - "grad_norm": 8.598512649536133, - "learning_rate": 3.5267794960720435e-06, - "loss": 0.4719, - "mean_token_accuracy": 0.8427591428160668, - "num_tokens": 199748636.0, - "step": 165990 - }, - { - "entropy": 1.8595915861427783, - "epoch": 0.5145854758249448, - "grad_norm": 8.431171417236328, - "learning_rate": 3.526673265534144e-06, - "loss": 0.4189, - "mean_token_accuracy": 0.8530223548412323, - "num_tokens": 199760844.0, - "step": 166000 - }, - { - "entropy": 1.9187236800789833, - "epoch": 0.5146164749499945, - "grad_norm": 9.313626289367676, - "learning_rate": 3.526567044595014e-06, - "loss": 0.4553, - "mean_token_accuracy": 0.8506917625665664, - "num_tokens": 199772512.0, - "step": 166010 - }, - { - "entropy": 1.894215178489685, - "epoch": 0.5146474740750442, - "grad_norm": 4.336549282073975, - "learning_rate": 3.526460833253208e-06, - "loss": 0.4713, - "mean_token_accuracy": 0.8501930445432663, - "num_tokens": 199784483.0, - "step": 166020 - }, - { - "entropy": 1.8991673409938812, - "epoch": 0.514678473200094, - "grad_norm": 8.60685920715332, - "learning_rate": 3.5263546315072805e-06, - "loss": 0.4325, - "mean_token_accuracy": 0.8552577182650566, - "num_tokens": 199795866.0, - "step": 166030 - }, - { - "entropy": 1.9237390816211701, - "epoch": 0.5147094723251436, - "grad_norm": 11.135947227478027, - "learning_rate": 3.5262484393557867e-06, - "loss": 0.4295, - "mean_token_accuracy": 0.8558558940887451, - "num_tokens": 199806932.0, - "step": 166040 - }, - { - "entropy": 1.9094901964068414, - "epoch": 0.5147404714501933, - "grad_norm": 3.4820947647094727, - "learning_rate": 3.526142256797282e-06, - "loss": 0.4453, - "mean_token_accuracy": 0.8509971618652343, - "num_tokens": 199818174.0, - "step": 166050 - }, - { - "entropy": 1.9149621576070786, - "epoch": 0.514771470575243, - "grad_norm": 7.39802360534668, - "learning_rate": 3.5260360838303213e-06, - "loss": 0.4559, - "mean_token_accuracy": 0.8556697189807891, - "num_tokens": 199829446.0, - "step": 166060 - }, - { - "entropy": 1.9035993307828902, - "epoch": 0.5148024697002928, - "grad_norm": 8.023327827453613, - "learning_rate": 3.525929920453463e-06, - "loss": 0.4418, - "mean_token_accuracy": 0.8653369784355164, - "num_tokens": 199840994.0, - "step": 166070 - }, - { - "entropy": 1.8855173870921136, - "epoch": 0.5148334688253424, - "grad_norm": 4.079839706420898, - "learning_rate": 3.525823766665261e-06, - "loss": 0.4415, - "mean_token_accuracy": 0.8557475224137306, - "num_tokens": 199853342.0, - "step": 166080 - }, - { - "entropy": 1.832002504169941, - "epoch": 0.5148644679503921, - "grad_norm": 8.041168212890625, - "learning_rate": 3.525717622464274e-06, - "loss": 0.4235, - "mean_token_accuracy": 0.8482832551002503, - "num_tokens": 199866649.0, - "step": 166090 - }, - { - "entropy": 1.806608146429062, - "epoch": 0.5148954670754418, - "grad_norm": 7.931125640869141, - "learning_rate": 3.525611487849057e-06, - "loss": 0.3683, - "mean_token_accuracy": 0.8658397302031517, - "num_tokens": 199879430.0, - "step": 166100 - }, - { - "entropy": 1.8772173956036569, - "epoch": 0.5149264662004915, - "grad_norm": 6.945058345794678, - "learning_rate": 3.525505362818169e-06, - "loss": 0.405, - "mean_token_accuracy": 0.8555058270692826, - "num_tokens": 199891989.0, - "step": 166110 - }, - { - "entropy": 1.9100875377655029, - "epoch": 0.5149574653255412, - "grad_norm": 6.640747547149658, - "learning_rate": 3.5253992473701666e-06, - "loss": 0.4133, - "mean_token_accuracy": 0.861659274995327, - "num_tokens": 199903727.0, - "step": 166120 - }, - { - "entropy": 1.853276364505291, - "epoch": 0.5149884644505909, - "grad_norm": 6.926969051361084, - "learning_rate": 3.525293141503608e-06, - "loss": 0.4246, - "mean_token_accuracy": 0.8639653757214546, - "num_tokens": 199915510.0, - "step": 166130 - }, - { - "entropy": 1.8948578789830208, - "epoch": 0.5150194635756405, - "grad_norm": 9.782580375671387, - "learning_rate": 3.525187045217052e-06, - "loss": 0.4605, - "mean_token_accuracy": 0.8522849783301354, - "num_tokens": 199927351.0, - "step": 166140 - }, - { - "entropy": 1.8471661388874054, - "epoch": 0.5150504627006903, - "grad_norm": 10.758081436157227, - "learning_rate": 3.5250809585090555e-06, - "loss": 0.433, - "mean_token_accuracy": 0.8511803165078163, - "num_tokens": 199940212.0, - "step": 166150 - }, - { - "entropy": 1.9439245939254761, - "epoch": 0.51508146182574, - "grad_norm": 7.815676212310791, - "learning_rate": 3.524974881378178e-06, - "loss": 0.4839, - "mean_token_accuracy": 0.8520305082201958, - "num_tokens": 199951112.0, - "step": 166160 - }, - { - "entropy": 1.9194507017731666, - "epoch": 0.5151124609507897, - "grad_norm": 8.278312683105469, - "learning_rate": 3.52486881382298e-06, - "loss": 0.4434, - "mean_token_accuracy": 0.8619399756193161, - "num_tokens": 199962297.0, - "step": 166170 - }, - { - "entropy": 1.9504115760326386, - "epoch": 0.5151434600758393, - "grad_norm": 9.44292163848877, - "learning_rate": 3.5247627558420196e-06, - "loss": 0.4733, - "mean_token_accuracy": 0.8505216121673584, - "num_tokens": 199972541.0, - "step": 166180 - }, - { - "entropy": 1.8769857451319694, - "epoch": 0.515174459200889, - "grad_norm": 3.786944627761841, - "learning_rate": 3.5246567074338563e-06, - "loss": 0.4242, - "mean_token_accuracy": 0.8582519263029098, - "num_tokens": 199984035.0, - "step": 166190 - }, - { - "entropy": 1.877619171142578, - "epoch": 0.5152054583259388, - "grad_norm": 4.849076271057129, - "learning_rate": 3.524550668597051e-06, - "loss": 0.4357, - "mean_token_accuracy": 0.8577707931399345, - "num_tokens": 199996112.0, - "step": 166200 - }, - { - "entropy": 1.9202482283115387, - "epoch": 0.5152364574509884, - "grad_norm": 3.6988606452941895, - "learning_rate": 3.5244446393301628e-06, - "loss": 0.4677, - "mean_token_accuracy": 0.8513051331043243, - "num_tokens": 200007656.0, - "step": 166210 - }, - { - "entropy": 1.829783384501934, - "epoch": 0.5152674565760381, - "grad_norm": 7.577276229858398, - "learning_rate": 3.5243386196317535e-06, - "loss": 0.3663, - "mean_token_accuracy": 0.8670179888606071, - "num_tokens": 200019989.0, - "step": 166220 - }, - { - "entropy": 1.8723140776157379, - "epoch": 0.5152984557010878, - "grad_norm": 7.912343978881836, - "learning_rate": 3.5242326095003844e-06, - "loss": 0.456, - "mean_token_accuracy": 0.85295629799366, - "num_tokens": 200032038.0, - "step": 166230 - }, - { - "entropy": 1.8697478756308556, - "epoch": 0.5153294548261376, - "grad_norm": 3.890298366546631, - "learning_rate": 3.5241266089346147e-06, - "loss": 0.3906, - "mean_token_accuracy": 0.8689236909151077, - "num_tokens": 200044324.0, - "step": 166240 - }, - { - "entropy": 1.8814658299088478, - "epoch": 0.5153604539511872, - "grad_norm": 3.827788829803467, - "learning_rate": 3.524020617933008e-06, - "loss": 0.4433, - "mean_token_accuracy": 0.858353029191494, - "num_tokens": 200056180.0, - "step": 166250 - }, - { - "entropy": 1.8012808829545974, - "epoch": 0.5153914530762369, - "grad_norm": 7.109594345092773, - "learning_rate": 3.5239146364941247e-06, - "loss": 0.3766, - "mean_token_accuracy": 0.8628587678074837, - "num_tokens": 200069667.0, - "step": 166260 - }, - { - "entropy": 1.9676722317934037, - "epoch": 0.5154224522012866, - "grad_norm": 10.758739471435547, - "learning_rate": 3.5238086646165283e-06, - "loss": 0.4909, - "mean_token_accuracy": 0.8481611773371697, - "num_tokens": 200080634.0, - "step": 166270 - }, - { - "entropy": 1.9452502936124803, - "epoch": 0.5154534513263364, - "grad_norm": 9.257121086120605, - "learning_rate": 3.5237027022987795e-06, - "loss": 0.4932, - "mean_token_accuracy": 0.849012990295887, - "num_tokens": 200091838.0, - "step": 166280 - }, - { - "entropy": 1.8852812498807907, - "epoch": 0.515484450451386, - "grad_norm": 9.17672061920166, - "learning_rate": 3.523596749539443e-06, - "loss": 0.445, - "mean_token_accuracy": 0.8530494660139084, - "num_tokens": 200103554.0, - "step": 166290 - }, - { - "entropy": 1.8659416556358337, - "epoch": 0.5155154495764357, - "grad_norm": 9.371847152709961, - "learning_rate": 3.5234908063370803e-06, - "loss": 0.4387, - "mean_token_accuracy": 0.8579873785376548, - "num_tokens": 200115307.0, - "step": 166300 - }, - { - "entropy": 1.8628961145877838, - "epoch": 0.5155464487014854, - "grad_norm": 9.57691764831543, - "learning_rate": 3.5233848726902554e-06, - "loss": 0.4406, - "mean_token_accuracy": 0.8588206008076668, - "num_tokens": 200127125.0, - "step": 166310 - }, - { - "entropy": 1.8160467877984048, - "epoch": 0.5155774478265351, - "grad_norm": 6.197166919708252, - "learning_rate": 3.5232789485975323e-06, - "loss": 0.3701, - "mean_token_accuracy": 0.8625551044940949, - "num_tokens": 200140225.0, - "step": 166320 - }, - { - "entropy": 1.905704265832901, - "epoch": 0.5156084469515848, - "grad_norm": 10.474200248718262, - "learning_rate": 3.523173034057474e-06, - "loss": 0.5064, - "mean_token_accuracy": 0.8451021924614907, - "num_tokens": 200152276.0, - "step": 166330 - }, - { - "entropy": 1.887017984688282, - "epoch": 0.5156394460766345, - "grad_norm": 8.072344779968262, - "learning_rate": 3.523067129068646e-06, - "loss": 0.4472, - "mean_token_accuracy": 0.8550172179937363, - "num_tokens": 200164173.0, - "step": 166340 - }, - { - "entropy": 1.8533250823616982, - "epoch": 0.5156704452016841, - "grad_norm": 3.522214651107788, - "learning_rate": 3.5229612336296113e-06, - "loss": 0.3713, - "mean_token_accuracy": 0.8682062104344368, - "num_tokens": 200176357.0, - "step": 166350 - }, - { - "entropy": 1.9086049854755402, - "epoch": 0.5157014443267339, - "grad_norm": 7.360813140869141, - "learning_rate": 3.522855347738936e-06, - "loss": 0.4377, - "mean_token_accuracy": 0.8604349881410599, - "num_tokens": 200188158.0, - "step": 166360 - }, - { - "entropy": 1.875716508924961, - "epoch": 0.5157324434517836, - "grad_norm": 6.441467761993408, - "learning_rate": 3.5227494713951847e-06, - "loss": 0.4278, - "mean_token_accuracy": 0.8681019887328147, - "num_tokens": 200200162.0, - "step": 166370 - }, - { - "entropy": 1.776827821880579, - "epoch": 0.5157634425768333, - "grad_norm": 2.422685146331787, - "learning_rate": 3.522643604596923e-06, - "loss": 0.388, - "mean_token_accuracy": 0.8635517880320549, - "num_tokens": 200213370.0, - "step": 166380 - }, - { - "entropy": 1.8618718206882476, - "epoch": 0.5157944417018829, - "grad_norm": 3.5454483032226562, - "learning_rate": 3.522537747342717e-06, - "loss": 0.4561, - "mean_token_accuracy": 0.8620031788945198, - "num_tokens": 200225275.0, - "step": 166390 - }, - { - "entropy": 1.906214900314808, - "epoch": 0.5158254408269327, - "grad_norm": 9.334129333496094, - "learning_rate": 3.5224318996311324e-06, - "loss": 0.4746, - "mean_token_accuracy": 0.8507939204573631, - "num_tokens": 200236830.0, - "step": 166400 - }, - { - "entropy": 1.9781831055879593, - "epoch": 0.5158564399519824, - "grad_norm": 7.7256340980529785, - "learning_rate": 3.5223260614607365e-06, - "loss": 0.5335, - "mean_token_accuracy": 0.8375950440764427, - "num_tokens": 200247890.0, - "step": 166410 - }, - { - "entropy": 1.9732432961463928, - "epoch": 0.515887439077032, - "grad_norm": 9.70375919342041, - "learning_rate": 3.5222202328300936e-06, - "loss": 0.5296, - "mean_token_accuracy": 0.8345681294798851, - "num_tokens": 200259455.0, - "step": 166420 - }, - { - "entropy": 1.86635515242815, - "epoch": 0.5159184382020817, - "grad_norm": 3.7832741737365723, - "learning_rate": 3.5221144137377727e-06, - "loss": 0.4047, - "mean_token_accuracy": 0.8658062994480134, - "num_tokens": 200271878.0, - "step": 166430 - }, - { - "entropy": 1.8391347080469131, - "epoch": 0.5159494373271314, - "grad_norm": 3.3010501861572266, - "learning_rate": 3.522008604182341e-06, - "loss": 0.4102, - "mean_token_accuracy": 0.8555727571249008, - "num_tokens": 200284953.0, - "step": 166440 - }, - { - "entropy": 1.8243245914578439, - "epoch": 0.5159804364521812, - "grad_norm": 7.198421955108643, - "learning_rate": 3.521902804162365e-06, - "loss": 0.3804, - "mean_token_accuracy": 0.8652743324637413, - "num_tokens": 200298321.0, - "step": 166450 - }, - { - "entropy": 1.9128145158290863, - "epoch": 0.5160114355772308, - "grad_norm": 9.630524635314941, - "learning_rate": 3.5217970136764134e-06, - "loss": 0.461, - "mean_token_accuracy": 0.842883138358593, - "num_tokens": 200309586.0, - "step": 166460 - }, - { - "entropy": 1.923278383910656, - "epoch": 0.5160424347022805, - "grad_norm": 8.430140495300293, - "learning_rate": 3.5216912327230545e-06, - "loss": 0.4534, - "mean_token_accuracy": 0.8511732935905456, - "num_tokens": 200321291.0, - "step": 166470 - }, - { - "entropy": 1.779987198114395, - "epoch": 0.5160734338273302, - "grad_norm": 7.797722816467285, - "learning_rate": 3.521585461300856e-06, - "loss": 0.4326, - "mean_token_accuracy": 0.8620175883173943, - "num_tokens": 200334275.0, - "step": 166480 - }, - { - "entropy": 1.9071402192115783, - "epoch": 0.51610443295238, - "grad_norm": 8.116595268249512, - "learning_rate": 3.521479699408388e-06, - "loss": 0.4529, - "mean_token_accuracy": 0.8501278668642044, - "num_tokens": 200346105.0, - "step": 166490 - }, - { - "entropy": 1.8801355749368667, - "epoch": 0.5161354320774296, - "grad_norm": 5.864742279052734, - "learning_rate": 3.5213739470442176e-06, - "loss": 0.4575, - "mean_token_accuracy": 0.8512676224112511, - "num_tokens": 200357894.0, - "step": 166500 - }, - { - "entropy": 1.8980636432766915, - "epoch": 0.5161664312024793, - "grad_norm": 8.04993724822998, - "learning_rate": 3.5212682042069157e-06, - "loss": 0.4358, - "mean_token_accuracy": 0.8641004547476768, - "num_tokens": 200369625.0, - "step": 166510 - }, - { - "entropy": 1.8364667430520059, - "epoch": 0.516197430327529, - "grad_norm": 12.634716033935547, - "learning_rate": 3.5211624708950515e-06, - "loss": 0.4214, - "mean_token_accuracy": 0.8635191649198533, - "num_tokens": 200381515.0, - "step": 166520 - }, - { - "entropy": 1.924870501458645, - "epoch": 0.5162284294525787, - "grad_norm": 8.887164115905762, - "learning_rate": 3.5210567471071962e-06, - "loss": 0.4749, - "mean_token_accuracy": 0.8511701777577401, - "num_tokens": 200392831.0, - "step": 166530 - }, - { - "entropy": 1.91126778870821, - "epoch": 0.5162594285776284, - "grad_norm": 8.9415922164917, - "learning_rate": 3.5209510328419174e-06, - "loss": 0.4264, - "mean_token_accuracy": 0.8611420899629593, - "num_tokens": 200404326.0, - "step": 166540 - }, - { - "entropy": 1.8403394103050232, - "epoch": 0.5162904277026781, - "grad_norm": 7.393084526062012, - "learning_rate": 3.520845328097788e-06, - "loss": 0.4025, - "mean_token_accuracy": 0.8644649833440781, - "num_tokens": 200417037.0, - "step": 166550 - }, - { - "entropy": 1.8981595396995545, - "epoch": 0.5163214268277277, - "grad_norm": 10.253661155700684, - "learning_rate": 3.520739632873378e-06, - "loss": 0.4727, - "mean_token_accuracy": 0.8498363941907883, - "num_tokens": 200428816.0, - "step": 166560 - }, - { - "entropy": 1.8573830232024193, - "epoch": 0.5163524259527775, - "grad_norm": 7.009857177734375, - "learning_rate": 3.5206339471672583e-06, - "loss": 0.4085, - "mean_token_accuracy": 0.8623615726828575, - "num_tokens": 200440760.0, - "step": 166570 - }, - { - "entropy": 1.904543998837471, - "epoch": 0.5163834250778272, - "grad_norm": 7.801250457763672, - "learning_rate": 3.5205282709780015e-06, - "loss": 0.5007, - "mean_token_accuracy": 0.8442154437303543, - "num_tokens": 200452685.0, - "step": 166580 - }, - { - "entropy": 1.8602082580327988, - "epoch": 0.5164144242028769, - "grad_norm": 4.4240946769714355, - "learning_rate": 3.5204226043041776e-06, - "loss": 0.4087, - "mean_token_accuracy": 0.8587385684251785, - "num_tokens": 200464705.0, - "step": 166590 - }, - { - "entropy": 1.875721175968647, - "epoch": 0.5164454233279265, - "grad_norm": 3.9632303714752197, - "learning_rate": 3.520316947144361e-06, - "loss": 0.4252, - "mean_token_accuracy": 0.8631686121225357, - "num_tokens": 200476790.0, - "step": 166600 - }, - { - "entropy": 1.9048528328537941, - "epoch": 0.5164764224529763, - "grad_norm": 7.584252834320068, - "learning_rate": 3.520211299497122e-06, - "loss": 0.4534, - "mean_token_accuracy": 0.8571887835860252, - "num_tokens": 200488053.0, - "step": 166610 - }, - { - "entropy": 1.8573243141174316, - "epoch": 0.516507421578026, - "grad_norm": 5.636211395263672, - "learning_rate": 3.5201056613610347e-06, - "loss": 0.4837, - "mean_token_accuracy": 0.8564253136515617, - "num_tokens": 200499812.0, - "step": 166620 - }, - { - "entropy": 1.9138450264930724, - "epoch": 0.5165384207030757, - "grad_norm": 8.784363746643066, - "learning_rate": 3.5200000327346714e-06, - "loss": 0.4478, - "mean_token_accuracy": 0.8483364924788475, - "num_tokens": 200511806.0, - "step": 166630 - }, - { - "entropy": 1.914737243950367, - "epoch": 0.5165694198281253, - "grad_norm": 3.4447364807128906, - "learning_rate": 3.5198944136166048e-06, - "loss": 0.4885, - "mean_token_accuracy": 0.838789090514183, - "num_tokens": 200523241.0, - "step": 166640 - }, - { - "entropy": 1.7461669012904166, - "epoch": 0.5166004189531751, - "grad_norm": 3.7405452728271484, - "learning_rate": 3.5197888040054094e-06, - "loss": 0.3185, - "mean_token_accuracy": 0.8775429561734199, - "num_tokens": 200537167.0, - "step": 166650 - }, - { - "entropy": 1.8424347892403603, - "epoch": 0.5166314180782248, - "grad_norm": 9.276825904846191, - "learning_rate": 3.519683203899659e-06, - "loss": 0.4288, - "mean_token_accuracy": 0.8487263694405556, - "num_tokens": 200549567.0, - "step": 166660 - }, - { - "entropy": 1.8613199979066848, - "epoch": 0.5166624172032744, - "grad_norm": 7.739795684814453, - "learning_rate": 3.5195776132979283e-06, - "loss": 0.4584, - "mean_token_accuracy": 0.8574581235647202, - "num_tokens": 200561757.0, - "step": 166670 - }, - { - "entropy": 1.8857336401939393, - "epoch": 0.5166934163283241, - "grad_norm": 7.0130696296691895, - "learning_rate": 3.5194720321987894e-06, - "loss": 0.4515, - "mean_token_accuracy": 0.8589113280177116, - "num_tokens": 200573813.0, - "step": 166680 - }, - { - "entropy": 1.889016604423523, - "epoch": 0.5167244154533738, - "grad_norm": 7.985410690307617, - "learning_rate": 3.519366460600821e-06, - "loss": 0.4421, - "mean_token_accuracy": 0.8589774370193481, - "num_tokens": 200584955.0, - "step": 166690 - }, - { - "entropy": 1.8742096453905106, - "epoch": 0.5167554145784236, - "grad_norm": 3.954725980758667, - "learning_rate": 3.519260898502594e-06, - "loss": 0.4285, - "mean_token_accuracy": 0.8560954242944717, - "num_tokens": 200597682.0, - "step": 166700 - }, - { - "entropy": 1.865584236383438, - "epoch": 0.5167864137034732, - "grad_norm": 8.845154762268066, - "learning_rate": 3.519155345902687e-06, - "loss": 0.4162, - "mean_token_accuracy": 0.8622621670365334, - "num_tokens": 200609258.0, - "step": 166710 - }, - { - "entropy": 1.9222540989518166, - "epoch": 0.5168174128285229, - "grad_norm": 4.905745029449463, - "learning_rate": 3.5190498027996738e-06, - "loss": 0.4671, - "mean_token_accuracy": 0.8524176865816117, - "num_tokens": 200620245.0, - "step": 166720 - }, - { - "entropy": 1.83238478153944, - "epoch": 0.5168484119535726, - "grad_norm": 8.726923942565918, - "learning_rate": 3.5189442691921306e-06, - "loss": 0.4615, - "mean_token_accuracy": 0.8474073141813279, - "num_tokens": 200632277.0, - "step": 166730 - }, - { - "entropy": 1.9488019853830338, - "epoch": 0.5168794110786223, - "grad_norm": 7.9495368003845215, - "learning_rate": 3.5188387450786355e-06, - "loss": 0.4888, - "mean_token_accuracy": 0.8537901550531387, - "num_tokens": 200642847.0, - "step": 166740 - }, - { - "entropy": 1.8265003859996796, - "epoch": 0.516910410203672, - "grad_norm": 8.291993141174316, - "learning_rate": 3.5187332304577628e-06, - "loss": 0.4125, - "mean_token_accuracy": 0.8603555530309677, - "num_tokens": 200654847.0, - "step": 166750 - }, - { - "entropy": 1.7454282835125923, - "epoch": 0.5169414093287217, - "grad_norm": 3.814364433288574, - "learning_rate": 3.5186277253280903e-06, - "loss": 0.3628, - "mean_token_accuracy": 0.8665547743439674, - "num_tokens": 200669003.0, - "step": 166760 - }, - { - "entropy": 1.8317288614809513, - "epoch": 0.5169724084537713, - "grad_norm": 3.715165853500366, - "learning_rate": 3.5185222296881947e-06, - "loss": 0.3879, - "mean_token_accuracy": 0.8635482504963875, - "num_tokens": 200681134.0, - "step": 166770 - }, - { - "entropy": 1.8476212650537491, - "epoch": 0.5170034075788211, - "grad_norm": 8.774842262268066, - "learning_rate": 3.518416743536654e-06, - "loss": 0.4539, - "mean_token_accuracy": 0.8514179885387421, - "num_tokens": 200693765.0, - "step": 166780 - }, - { - "entropy": 1.9044752269983292, - "epoch": 0.5170344067038708, - "grad_norm": 8.217805862426758, - "learning_rate": 3.518311266872046e-06, - "loss": 0.4525, - "mean_token_accuracy": 0.8520160347223282, - "num_tokens": 200705265.0, - "step": 166790 - }, - { - "entropy": 1.849595281481743, - "epoch": 0.5170654058289205, - "grad_norm": 8.451947212219238, - "learning_rate": 3.5182057996929488e-06, - "loss": 0.4108, - "mean_token_accuracy": 0.8606655448675156, - "num_tokens": 200718081.0, - "step": 166800 - }, - { - "entropy": 1.812503370642662, - "epoch": 0.5170964049539701, - "grad_norm": 8.556303024291992, - "learning_rate": 3.5181003419979404e-06, - "loss": 0.4244, - "mean_token_accuracy": 0.8502597719430923, - "num_tokens": 200730617.0, - "step": 166810 - }, - { - "entropy": 1.8947110012173654, - "epoch": 0.5171274040790199, - "grad_norm": 7.719273567199707, - "learning_rate": 3.5179948937855994e-06, - "loss": 0.4819, - "mean_token_accuracy": 0.8427142888307572, - "num_tokens": 200741235.0, - "step": 166820 - }, - { - "entropy": 1.8528358027338983, - "epoch": 0.5171584032040696, - "grad_norm": 7.755263805389404, - "learning_rate": 3.5178894550545055e-06, - "loss": 0.4345, - "mean_token_accuracy": 0.8589017793536187, - "num_tokens": 200753559.0, - "step": 166830 - }, - { - "entropy": 1.9601138323545455, - "epoch": 0.5171894023291193, - "grad_norm": 7.6968183517456055, - "learning_rate": 3.517784025803237e-06, - "loss": 0.5124, - "mean_token_accuracy": 0.8391397684812546, - "num_tokens": 200764345.0, - "step": 166840 - }, - { - "entropy": 1.831675609946251, - "epoch": 0.5172204014541689, - "grad_norm": 7.837430000305176, - "learning_rate": 3.5176786060303745e-06, - "loss": 0.4231, - "mean_token_accuracy": 0.8554352253675461, - "num_tokens": 200776357.0, - "step": 166850 - }, - { - "entropy": 1.8829845905303955, - "epoch": 0.5172514005792187, - "grad_norm": 3.6687681674957275, - "learning_rate": 3.5175731957344964e-06, - "loss": 0.4679, - "mean_token_accuracy": 0.848581674695015, - "num_tokens": 200788319.0, - "step": 166860 - }, - { - "entropy": 1.8463978335261344, - "epoch": 0.5172823997042684, - "grad_norm": 7.817187786102295, - "learning_rate": 3.5174677949141845e-06, - "loss": 0.4033, - "mean_token_accuracy": 0.8644368588924408, - "num_tokens": 200799628.0, - "step": 166870 - }, - { - "entropy": 1.864042194187641, - "epoch": 0.517313398829318, - "grad_norm": 8.473761558532715, - "learning_rate": 3.5173624035680187e-06, - "loss": 0.4281, - "mean_token_accuracy": 0.8542978420853615, - "num_tokens": 200811772.0, - "step": 166880 - }, - { - "entropy": 1.909403358399868, - "epoch": 0.5173443979543677, - "grad_norm": 10.229207992553711, - "learning_rate": 3.5172570216945785e-06, - "loss": 0.4407, - "mean_token_accuracy": 0.8571710214018822, - "num_tokens": 200822773.0, - "step": 166890 - }, - { - "entropy": 1.9418629288673401, - "epoch": 0.5173753970794175, - "grad_norm": 9.051170349121094, - "learning_rate": 3.517151649292447e-06, - "loss": 0.4753, - "mean_token_accuracy": 0.8507715001702308, - "num_tokens": 200833601.0, - "step": 166900 - }, - { - "entropy": 1.880239014327526, - "epoch": 0.5174063962044672, - "grad_norm": 9.583940505981445, - "learning_rate": 3.517046286360204e-06, - "loss": 0.4242, - "mean_token_accuracy": 0.8595744326710701, - "num_tokens": 200844933.0, - "step": 166910 - }, - { - "entropy": 1.8107650607824326, - "epoch": 0.5174373953295168, - "grad_norm": 8.987581253051758, - "learning_rate": 3.5169409328964315e-06, - "loss": 0.4582, - "mean_token_accuracy": 0.8517088457942009, - "num_tokens": 200857702.0, - "step": 166920 - }, - { - "entropy": 1.906441855430603, - "epoch": 0.5174683944545665, - "grad_norm": 8.563685417175293, - "learning_rate": 3.5168355888997115e-06, - "loss": 0.4786, - "mean_token_accuracy": 0.8531229227781296, - "num_tokens": 200869141.0, - "step": 166930 - }, - { - "entropy": 1.8778182238340377, - "epoch": 0.5174993935796162, - "grad_norm": 7.703993797302246, - "learning_rate": 3.5167302543686266e-06, - "loss": 0.4294, - "mean_token_accuracy": 0.8593733966350555, - "num_tokens": 200880696.0, - "step": 166940 - }, - { - "entropy": 1.8586980432271958, - "epoch": 0.5175303927046659, - "grad_norm": 4.391999244689941, - "learning_rate": 3.51662492930176e-06, - "loss": 0.405, - "mean_token_accuracy": 0.8623529061675071, - "num_tokens": 200892668.0, - "step": 166950 - }, - { - "entropy": 1.9131682097911835, - "epoch": 0.5175613918297156, - "grad_norm": 8.268182754516602, - "learning_rate": 3.516519613697692e-06, - "loss": 0.4838, - "mean_token_accuracy": 0.8527122482657432, - "num_tokens": 200903497.0, - "step": 166960 - }, - { - "entropy": 1.8035447210073472, - "epoch": 0.5175923909547653, - "grad_norm": 7.8364386558532715, - "learning_rate": 3.5164143075550084e-06, - "loss": 0.4462, - "mean_token_accuracy": 0.8491023346781731, - "num_tokens": 200917014.0, - "step": 166970 - }, - { - "entropy": 1.8485938400030135, - "epoch": 0.517623390079815, - "grad_norm": 8.503170013427734, - "learning_rate": 3.5163090108722914e-06, - "loss": 0.422, - "mean_token_accuracy": 0.8574503690004349, - "num_tokens": 200929733.0, - "step": 166980 - }, - { - "entropy": 1.8992557242512702, - "epoch": 0.5176543892048647, - "grad_norm": 7.843517303466797, - "learning_rate": 3.5162037236481246e-06, - "loss": 0.4634, - "mean_token_accuracy": 0.8507803022861481, - "num_tokens": 200941462.0, - "step": 166990 - }, - { - "entropy": 1.9413973063230514, - "epoch": 0.5176853883299144, - "grad_norm": 7.582324981689453, - "learning_rate": 3.5160984458810924e-06, - "loss": 0.4653, - "mean_token_accuracy": 0.8503754585981369, - "num_tokens": 200952501.0, - "step": 167000 - }, - { - "entropy": 1.8474327087402345, - "epoch": 0.5177163874549641, - "grad_norm": 8.536754608154297, - "learning_rate": 3.515993177569779e-06, - "loss": 0.4311, - "mean_token_accuracy": 0.8653739243745804, - "num_tokens": 200964740.0, - "step": 167010 - }, - { - "entropy": 1.915780645608902, - "epoch": 0.5177473865800137, - "grad_norm": 8.24149227142334, - "learning_rate": 3.5158879187127694e-06, - "loss": 0.4411, - "mean_token_accuracy": 0.8639548003673554, - "num_tokens": 200976228.0, - "step": 167020 - }, - { - "entropy": 1.920879889279604, - "epoch": 0.5177783857050635, - "grad_norm": 9.354827880859375, - "learning_rate": 3.515782669308648e-06, - "loss": 0.4601, - "mean_token_accuracy": 0.8478871122002601, - "num_tokens": 200988501.0, - "step": 167030 - }, - { - "entropy": 1.8832586228847503, - "epoch": 0.5178093848301132, - "grad_norm": 8.015664100646973, - "learning_rate": 3.5156774293559997e-06, - "loss": 0.4471, - "mean_token_accuracy": 0.8528868913650512, - "num_tokens": 201000535.0, - "step": 167040 - }, - { - "entropy": 1.8097984239459037, - "epoch": 0.5178403839551629, - "grad_norm": 7.848792552947998, - "learning_rate": 3.5155721988534107e-06, - "loss": 0.3972, - "mean_token_accuracy": 0.8547261565923691, - "num_tokens": 201013877.0, - "step": 167050 - }, - { - "entropy": 1.8996559455990791, - "epoch": 0.5178713830802125, - "grad_norm": 8.505248069763184, - "learning_rate": 3.515466977799467e-06, - "loss": 0.4239, - "mean_token_accuracy": 0.8700977265834808, - "num_tokens": 201025601.0, - "step": 167060 - }, - { - "entropy": 1.8885849609971046, - "epoch": 0.5179023822052623, - "grad_norm": 8.436034202575684, - "learning_rate": 3.5153617661927536e-06, - "loss": 0.4076, - "mean_token_accuracy": 0.858351269364357, - "num_tokens": 201037162.0, - "step": 167070 - }, - { - "entropy": 1.8170109301805497, - "epoch": 0.517933381330312, - "grad_norm": 6.91679573059082, - "learning_rate": 3.5152565640318574e-06, - "loss": 0.4235, - "mean_token_accuracy": 0.8640385091304779, - "num_tokens": 201050080.0, - "step": 167080 - }, - { - "entropy": 1.915900157392025, - "epoch": 0.5179643804553616, - "grad_norm": 8.707968711853027, - "learning_rate": 3.515151371315366e-06, - "loss": 0.4522, - "mean_token_accuracy": 0.8495099931955338, - "num_tokens": 201061377.0, - "step": 167090 - }, - { - "entropy": 1.8400561198592187, - "epoch": 0.5179953795804113, - "grad_norm": 3.3338160514831543, - "learning_rate": 3.5150461880418655e-06, - "loss": 0.4202, - "mean_token_accuracy": 0.8588408932089806, - "num_tokens": 201074751.0, - "step": 167100 - }, - { - "entropy": 2.0018926441669462, - "epoch": 0.5180263787054611, - "grad_norm": 9.095269203186035, - "learning_rate": 3.5149410142099434e-06, - "loss": 0.5228, - "mean_token_accuracy": 0.8369523748755455, - "num_tokens": 201085008.0, - "step": 167110 - }, - { - "entropy": 1.896622897684574, - "epoch": 0.5180573778305108, - "grad_norm": 8.372537612915039, - "learning_rate": 3.5148358498181865e-06, - "loss": 0.4443, - "mean_token_accuracy": 0.8490730375051498, - "num_tokens": 201097253.0, - "step": 167120 - }, - { - "entropy": 1.9279163151979446, - "epoch": 0.5180883769555604, - "grad_norm": 6.603575229644775, - "learning_rate": 3.5147306948651838e-06, - "loss": 0.4692, - "mean_token_accuracy": 0.8467545539140702, - "num_tokens": 201109037.0, - "step": 167130 - }, - { - "entropy": 1.952432581782341, - "epoch": 0.5181193760806101, - "grad_norm": 8.363898277282715, - "learning_rate": 3.514625549349523e-06, - "loss": 0.4404, - "mean_token_accuracy": 0.8555149123072624, - "num_tokens": 201119894.0, - "step": 167140 - }, - { - "entropy": 1.9005773276090623, - "epoch": 0.5181503752056599, - "grad_norm": 8.650740623474121, - "learning_rate": 3.5145204132697925e-06, - "loss": 0.4749, - "mean_token_accuracy": 0.8549324363470078, - "num_tokens": 201131834.0, - "step": 167150 - }, - { - "entropy": 1.9701307892799378, - "epoch": 0.5181813743307095, - "grad_norm": 7.619329929351807, - "learning_rate": 3.5144152866245813e-06, - "loss": 0.4786, - "mean_token_accuracy": 0.8487115666270256, - "num_tokens": 201142782.0, - "step": 167160 - }, - { - "entropy": 1.9462131574749946, - "epoch": 0.5182123734557592, - "grad_norm": 8.687287330627441, - "learning_rate": 3.5143101694124783e-06, - "loss": 0.5086, - "mean_token_accuracy": 0.8393335893750191, - "num_tokens": 201154661.0, - "step": 167170 - }, - { - "entropy": 1.9058082491159438, - "epoch": 0.5182433725808089, - "grad_norm": 8.543585777282715, - "learning_rate": 3.5142050616320723e-06, - "loss": 0.4629, - "mean_token_accuracy": 0.8486988008022308, - "num_tokens": 201166535.0, - "step": 167180 - }, - { - "entropy": 1.9440076380968094, - "epoch": 0.5182743717058585, - "grad_norm": 9.032926559448242, - "learning_rate": 3.514099963281954e-06, - "loss": 0.4776, - "mean_token_accuracy": 0.8492290109395981, - "num_tokens": 201177837.0, - "step": 167190 - }, - { - "entropy": 1.9601941794157027, - "epoch": 0.5183053708309083, - "grad_norm": 8.802116394042969, - "learning_rate": 3.5139948743607123e-06, - "loss": 0.475, - "mean_token_accuracy": 0.8613344877958298, - "num_tokens": 201188703.0, - "step": 167200 - }, - { - "entropy": 2.0091607570648193, - "epoch": 0.518336369955958, - "grad_norm": 7.016593933105469, - "learning_rate": 3.513889794866938e-06, - "loss": 0.5003, - "mean_token_accuracy": 0.8481083378195763, - "num_tokens": 201199672.0, - "step": 167210 - }, - { - "entropy": 1.963625544309616, - "epoch": 0.5183673690810077, - "grad_norm": 9.131065368652344, - "learning_rate": 3.5137847247992224e-06, - "loss": 0.4796, - "mean_token_accuracy": 0.8464767590165139, - "num_tokens": 201211201.0, - "step": 167220 - }, - { - "entropy": 1.9450436413288117, - "epoch": 0.5183983682060573, - "grad_norm": 8.431489944458008, - "learning_rate": 3.513679664156155e-06, - "loss": 0.5262, - "mean_token_accuracy": 0.8462486758828163, - "num_tokens": 201222474.0, - "step": 167230 - }, - { - "entropy": 1.9184205442667008, - "epoch": 0.5184293673311071, - "grad_norm": 8.032157897949219, - "learning_rate": 3.5135746129363267e-06, - "loss": 0.4851, - "mean_token_accuracy": 0.8471630290150642, - "num_tokens": 201233600.0, - "step": 167240 - }, - { - "entropy": 1.8578265473246574, - "epoch": 0.5184603664561568, - "grad_norm": 3.748180866241455, - "learning_rate": 3.5134695711383304e-06, - "loss": 0.4732, - "mean_token_accuracy": 0.8563891410827636, - "num_tokens": 201246423.0, - "step": 167250 - }, - { - "entropy": 1.7977075070142745, - "epoch": 0.5184913655812065, - "grad_norm": 8.292705535888672, - "learning_rate": 3.5133645387607567e-06, - "loss": 0.3485, - "mean_token_accuracy": 0.8734015017747879, - "num_tokens": 201259551.0, - "step": 167260 - }, - { - "entropy": 1.927210134267807, - "epoch": 0.5185223647062561, - "grad_norm": 8.34482479095459, - "learning_rate": 3.5132595158021987e-06, - "loss": 0.4447, - "mean_token_accuracy": 0.8532064571976662, - "num_tokens": 201270364.0, - "step": 167270 - }, - { - "entropy": 1.8977614745497704, - "epoch": 0.5185533638313059, - "grad_norm": 8.938633918762207, - "learning_rate": 3.5131545022612474e-06, - "loss": 0.4179, - "mean_token_accuracy": 0.8541785389184952, - "num_tokens": 201281740.0, - "step": 167280 - }, - { - "entropy": 1.940072876214981, - "epoch": 0.5185843629563556, - "grad_norm": 8.34633731842041, - "learning_rate": 3.5130494981364954e-06, - "loss": 0.5433, - "mean_token_accuracy": 0.8353075116872788, - "num_tokens": 201293729.0, - "step": 167290 - }, - { - "entropy": 1.9134965017437935, - "epoch": 0.5186153620814052, - "grad_norm": 7.574338912963867, - "learning_rate": 3.512944503426537e-06, - "loss": 0.4505, - "mean_token_accuracy": 0.8549482673406601, - "num_tokens": 201305349.0, - "step": 167300 - }, - { - "entropy": 1.8456029385328292, - "epoch": 0.5186463612064549, - "grad_norm": 10.16829776763916, - "learning_rate": 3.5128395181299646e-06, - "loss": 0.4192, - "mean_token_accuracy": 0.8634034737944603, - "num_tokens": 201317808.0, - "step": 167310 - }, - { - "entropy": 1.9166592717170716, - "epoch": 0.5186773603315047, - "grad_norm": 3.4636266231536865, - "learning_rate": 3.5127345422453706e-06, - "loss": 0.4492, - "mean_token_accuracy": 0.8501359283924103, - "num_tokens": 201329922.0, - "step": 167320 - }, - { - "entropy": 1.937665620446205, - "epoch": 0.5187083594565544, - "grad_norm": 8.84914779663086, - "learning_rate": 3.512629575771351e-06, - "loss": 0.4813, - "mean_token_accuracy": 0.8507133930921554, - "num_tokens": 201341333.0, - "step": 167330 - }, - { - "entropy": 1.8795202478766442, - "epoch": 0.518739358581604, - "grad_norm": 2.294635772705078, - "learning_rate": 3.5125246187064975e-06, - "loss": 0.4155, - "mean_token_accuracy": 0.8657633870840072, - "num_tokens": 201353605.0, - "step": 167340 - }, - { - "entropy": 1.8707646176218986, - "epoch": 0.5187703577066537, - "grad_norm": 8.930290222167969, - "learning_rate": 3.5124196710494057e-06, - "loss": 0.4421, - "mean_token_accuracy": 0.8540336638689041, - "num_tokens": 201366110.0, - "step": 167350 - }, - { - "entropy": 1.9009417802095414, - "epoch": 0.5188013568317035, - "grad_norm": 6.296438694000244, - "learning_rate": 3.5123147327986706e-06, - "loss": 0.5145, - "mean_token_accuracy": 0.8389176607131958, - "num_tokens": 201377996.0, - "step": 167360 - }, - { - "entropy": 1.8275167733430862, - "epoch": 0.5188323559567531, - "grad_norm": 8.323405265808105, - "learning_rate": 3.5122098039528868e-06, - "loss": 0.3644, - "mean_token_accuracy": 0.8666198328137398, - "num_tokens": 201390846.0, - "step": 167370 - }, - { - "entropy": 1.9348952636122703, - "epoch": 0.5188633550818028, - "grad_norm": 7.705176830291748, - "learning_rate": 3.5121048845106496e-06, - "loss": 0.49, - "mean_token_accuracy": 0.8476896122097969, - "num_tokens": 201402387.0, - "step": 167380 - }, - { - "entropy": 1.8035302460193634, - "epoch": 0.5188943542068525, - "grad_norm": 9.087127685546875, - "learning_rate": 3.511999974470554e-06, - "loss": 0.4148, - "mean_token_accuracy": 0.8623087644577027, - "num_tokens": 201415169.0, - "step": 167390 - }, - { - "entropy": 1.9165479212999343, - "epoch": 0.5189253533319023, - "grad_norm": 6.9358439445495605, - "learning_rate": 3.5118950738311956e-06, - "loss": 0.449, - "mean_token_accuracy": 0.861064849793911, - "num_tokens": 201425956.0, - "step": 167400 - }, - { - "entropy": 1.8067945793271065, - "epoch": 0.5189563524569519, - "grad_norm": 9.967735290527344, - "learning_rate": 3.5117901825911716e-06, - "loss": 0.4001, - "mean_token_accuracy": 0.860332365334034, - "num_tokens": 201438627.0, - "step": 167410 - }, - { - "entropy": 1.8174662292003632, - "epoch": 0.5189873515820016, - "grad_norm": 3.570725679397583, - "learning_rate": 3.511685300749078e-06, - "loss": 0.3971, - "mean_token_accuracy": 0.8613906666636467, - "num_tokens": 201450928.0, - "step": 167420 - }, - { - "entropy": 1.847064770758152, - "epoch": 0.5190183507070513, - "grad_norm": 4.677860736846924, - "learning_rate": 3.5115804283035115e-06, - "loss": 0.4355, - "mean_token_accuracy": 0.8492764979600906, - "num_tokens": 201463376.0, - "step": 167430 - }, - { - "entropy": 1.9042583480477333, - "epoch": 0.5190493498321009, - "grad_norm": 8.254162788391113, - "learning_rate": 3.5114755652530693e-06, - "loss": 0.4691, - "mean_token_accuracy": 0.8512088477611541, - "num_tokens": 201474321.0, - "step": 167440 - }, - { - "entropy": 1.8291652023792266, - "epoch": 0.5190803489571507, - "grad_norm": 2.466158866882324, - "learning_rate": 3.511370711596348e-06, - "loss": 0.3921, - "mean_token_accuracy": 0.8621661022305489, - "num_tokens": 201487177.0, - "step": 167450 - }, - { - "entropy": 1.771915179491043, - "epoch": 0.5191113480822004, - "grad_norm": 3.680634021759033, - "learning_rate": 3.511265867331945e-06, - "loss": 0.3624, - "mean_token_accuracy": 0.8703644022345542, - "num_tokens": 201500224.0, - "step": 167460 - }, - { - "entropy": 1.9247154101729393, - "epoch": 0.51914234720725, - "grad_norm": 8.170753479003906, - "learning_rate": 3.51116103245846e-06, - "loss": 0.4676, - "mean_token_accuracy": 0.847488397359848, - "num_tokens": 201511452.0, - "step": 167470 - }, - { - "entropy": 1.8737490639090537, - "epoch": 0.5191733463322997, - "grad_norm": 8.452012062072754, - "learning_rate": 3.5110562069744893e-06, - "loss": 0.4449, - "mean_token_accuracy": 0.8589896142482758, - "num_tokens": 201522618.0, - "step": 167480 - }, - { - "entropy": 1.9660809606313705, - "epoch": 0.5192043454573495, - "grad_norm": 7.087366580963135, - "learning_rate": 3.510951390878632e-06, - "loss": 0.5226, - "mean_token_accuracy": 0.8420694902539253, - "num_tokens": 201533424.0, - "step": 167490 - }, - { - "entropy": 1.7874524354934693, - "epoch": 0.5192353445823992, - "grad_norm": 8.6710205078125, - "learning_rate": 3.5108465841694865e-06, - "loss": 0.3561, - "mean_token_accuracy": 0.8666209667921067, - "num_tokens": 201545720.0, - "step": 167500 - }, - { - "entropy": 1.8550549894571304, - "epoch": 0.5192663437074488, - "grad_norm": 6.358746528625488, - "learning_rate": 3.510741786845653e-06, - "loss": 0.4343, - "mean_token_accuracy": 0.8609934568405151, - "num_tokens": 201557543.0, - "step": 167510 - }, - { - "entropy": 1.774479240179062, - "epoch": 0.5192973428324985, - "grad_norm": 3.6877810955047607, - "learning_rate": 3.51063699890573e-06, - "loss": 0.3477, - "mean_token_accuracy": 0.868420633673668, - "num_tokens": 201570575.0, - "step": 167520 - }, - { - "entropy": 1.7928667023777962, - "epoch": 0.5193283419575483, - "grad_norm": 7.632199764251709, - "learning_rate": 3.5105322203483174e-06, - "loss": 0.3707, - "mean_token_accuracy": 0.8707344710826874, - "num_tokens": 201582948.0, - "step": 167530 - }, - { - "entropy": 1.8913492798805236, - "epoch": 0.519359341082598, - "grad_norm": 7.02748441696167, - "learning_rate": 3.5104274511720142e-06, - "loss": 0.4966, - "mean_token_accuracy": 0.8536164119839669, - "num_tokens": 201594624.0, - "step": 167540 - }, - { - "entropy": 1.779433585703373, - "epoch": 0.5193903402076476, - "grad_norm": 5.0814528465271, - "learning_rate": 3.510322691375422e-06, - "loss": 0.4422, - "mean_token_accuracy": 0.8520729154348373, - "num_tokens": 201607799.0, - "step": 167550 - }, - { - "entropy": 1.8301453605294227, - "epoch": 0.5194213393326973, - "grad_norm": 7.968308448791504, - "learning_rate": 3.51021794095714e-06, - "loss": 0.4214, - "mean_token_accuracy": 0.8606723442673683, - "num_tokens": 201620325.0, - "step": 167560 - }, - { - "entropy": 1.823227186501026, - "epoch": 0.5194523384577471, - "grad_norm": 3.0040812492370605, - "learning_rate": 3.5101131999157707e-06, - "loss": 0.3914, - "mean_token_accuracy": 0.8648414790630341, - "num_tokens": 201632449.0, - "step": 167570 - }, - { - "entropy": 1.9500362485647202, - "epoch": 0.5194833375827967, - "grad_norm": 8.145763397216797, - "learning_rate": 3.5100084682499138e-06, - "loss": 0.4902, - "mean_token_accuracy": 0.8469778224825859, - "num_tokens": 201643547.0, - "step": 167580 - }, - { - "entropy": 1.8669037535786628, - "epoch": 0.5195143367078464, - "grad_norm": 10.10450267791748, - "learning_rate": 3.509903745958171e-06, - "loss": 0.4746, - "mean_token_accuracy": 0.8486629739403725, - "num_tokens": 201655893.0, - "step": 167590 - }, - { - "entropy": 1.8534766390919686, - "epoch": 0.5195453358328961, - "grad_norm": 6.937488079071045, - "learning_rate": 3.5097990330391435e-06, - "loss": 0.397, - "mean_token_accuracy": 0.8601803749799728, - "num_tokens": 201668052.0, - "step": 167600 - }, - { - "entropy": 1.8165239058434963, - "epoch": 0.5195763349579459, - "grad_norm": 8.837699890136719, - "learning_rate": 3.5096943294914342e-06, - "loss": 0.3897, - "mean_token_accuracy": 0.855919572710991, - "num_tokens": 201681132.0, - "step": 167610 - }, - { - "entropy": 1.8845690086483955, - "epoch": 0.5196073340829955, - "grad_norm": 6.574382305145264, - "learning_rate": 3.509589635313646e-06, - "loss": 0.4629, - "mean_token_accuracy": 0.8573358491063118, - "num_tokens": 201692782.0, - "step": 167620 - }, - { - "entropy": 1.8185426101088524, - "epoch": 0.5196383332080452, - "grad_norm": 4.730546474456787, - "learning_rate": 3.50948495050438e-06, - "loss": 0.4295, - "mean_token_accuracy": 0.8604342013597488, - "num_tokens": 201705173.0, - "step": 167630 - }, - { - "entropy": 1.834762555360794, - "epoch": 0.5196693323330949, - "grad_norm": 8.875571250915527, - "learning_rate": 3.509380275062239e-06, - "loss": 0.4126, - "mean_token_accuracy": 0.8564514756202698, - "num_tokens": 201717199.0, - "step": 167640 - }, - { - "entropy": 1.7287873297929763, - "epoch": 0.5197003314581446, - "grad_norm": 3.043661594390869, - "learning_rate": 3.5092756089858265e-06, - "loss": 0.3535, - "mean_token_accuracy": 0.8749393910169602, - "num_tokens": 201730596.0, - "step": 167650 - }, - { - "entropy": 1.885722841322422, - "epoch": 0.5197313305831943, - "grad_norm": 12.398810386657715, - "learning_rate": 3.509170952273747e-06, - "loss": 0.4911, - "mean_token_accuracy": 0.8453606396913529, - "num_tokens": 201742944.0, - "step": 167660 - }, - { - "entropy": 1.923583248257637, - "epoch": 0.519762329708244, - "grad_norm": 8.343000411987305, - "learning_rate": 3.5090663049246027e-06, - "loss": 0.4595, - "mean_token_accuracy": 0.8542603000998497, - "num_tokens": 201754679.0, - "step": 167670 - }, - { - "entropy": 1.868654875457287, - "epoch": 0.5197933288332937, - "grad_norm": 7.407617568969727, - "learning_rate": 3.508961666936999e-06, - "loss": 0.421, - "mean_token_accuracy": 0.854319129884243, - "num_tokens": 201766625.0, - "step": 167680 - }, - { - "entropy": 1.8509719505906106, - "epoch": 0.5198243279583433, - "grad_norm": 8.51540756225586, - "learning_rate": 3.5088570383095382e-06, - "loss": 0.408, - "mean_token_accuracy": 0.8613230153918267, - "num_tokens": 201779781.0, - "step": 167690 - }, - { - "entropy": 1.9388750493526459, - "epoch": 0.5198553270833931, - "grad_norm": 8.581016540527344, - "learning_rate": 3.5087524190408275e-06, - "loss": 0.4629, - "mean_token_accuracy": 0.8547941818833351, - "num_tokens": 201791089.0, - "step": 167700 - }, - { - "entropy": 1.8844716548919678, - "epoch": 0.5198863262084428, - "grad_norm": 4.552313327789307, - "learning_rate": 3.5086478091294697e-06, - "loss": 0.4253, - "mean_token_accuracy": 0.8569486826658249, - "num_tokens": 201803147.0, - "step": 167710 - }, - { - "entropy": 1.8605488628149032, - "epoch": 0.5199173253334924, - "grad_norm": 3.5587663650512695, - "learning_rate": 3.5085432085740706e-06, - "loss": 0.4315, - "mean_token_accuracy": 0.8524827226996422, - "num_tokens": 201815695.0, - "step": 167720 - }, - { - "entropy": 1.9242730915546418, - "epoch": 0.5199483244585421, - "grad_norm": 7.837512493133545, - "learning_rate": 3.5084386173732365e-06, - "loss": 0.4666, - "mean_token_accuracy": 0.8563606560230255, - "num_tokens": 201826892.0, - "step": 167730 - }, - { - "entropy": 1.9117944180965423, - "epoch": 0.5199793235835919, - "grad_norm": 7.830265522003174, - "learning_rate": 3.508334035525572e-06, - "loss": 0.4431, - "mean_token_accuracy": 0.8568824425339698, - "num_tokens": 201837887.0, - "step": 167740 - }, - { - "entropy": 1.857511366903782, - "epoch": 0.5200103227086416, - "grad_norm": 7.367649555206299, - "learning_rate": 3.508229463029684e-06, - "loss": 0.4392, - "mean_token_accuracy": 0.8551246121525764, - "num_tokens": 201849875.0, - "step": 167750 - }, - { - "entropy": 1.915374681353569, - "epoch": 0.5200413218336912, - "grad_norm": 7.929410934448242, - "learning_rate": 3.5081248998841776e-06, - "loss": 0.4532, - "mean_token_accuracy": 0.8556568145751953, - "num_tokens": 201861837.0, - "step": 167760 - }, - { - "entropy": 1.8976527541875838, - "epoch": 0.5200723209587409, - "grad_norm": 8.157280921936035, - "learning_rate": 3.508020346087661e-06, - "loss": 0.4502, - "mean_token_accuracy": 0.8576169595122337, - "num_tokens": 201873619.0, - "step": 167770 - }, - { - "entropy": 1.7953968927264214, - "epoch": 0.5201033200837907, - "grad_norm": 4.682733535766602, - "learning_rate": 3.5079158016387403e-06, - "loss": 0.4431, - "mean_token_accuracy": 0.8547178015112877, - "num_tokens": 201887216.0, - "step": 167780 - }, - { - "entropy": 1.9149467661976813, - "epoch": 0.5201343192088403, - "grad_norm": 8.50699520111084, - "learning_rate": 3.5078112665360233e-06, - "loss": 0.4575, - "mean_token_accuracy": 0.8565734103322029, - "num_tokens": 201898719.0, - "step": 167790 - }, - { - "entropy": 1.9032021701335906, - "epoch": 0.52016531833389, - "grad_norm": 9.742694854736328, - "learning_rate": 3.507706740778117e-06, - "loss": 0.4929, - "mean_token_accuracy": 0.8466816678643226, - "num_tokens": 201910969.0, - "step": 167800 - }, - { - "entropy": 1.8507519498467446, - "epoch": 0.5201963174589397, - "grad_norm": 7.235980987548828, - "learning_rate": 3.507602224363628e-06, - "loss": 0.439, - "mean_token_accuracy": 0.8613151669502258, - "num_tokens": 201922829.0, - "step": 167810 - }, - { - "entropy": 1.9132382899522782, - "epoch": 0.5202273165839895, - "grad_norm": 8.334747314453125, - "learning_rate": 3.507497717291167e-06, - "loss": 0.5141, - "mean_token_accuracy": 0.8361768543720245, - "num_tokens": 201933856.0, - "step": 167820 - }, - { - "entropy": 1.8912567496299744, - "epoch": 0.5202583157090391, - "grad_norm": 7.0671281814575195, - "learning_rate": 3.5073932195593413e-06, - "loss": 0.4624, - "mean_token_accuracy": 0.8508770078420639, - "num_tokens": 201945311.0, - "step": 167830 - }, - { - "entropy": 1.8898201130330563, - "epoch": 0.5202893148340888, - "grad_norm": 8.528030395507812, - "learning_rate": 3.5072887311667585e-06, - "loss": 0.4189, - "mean_token_accuracy": 0.8624457880854607, - "num_tokens": 201956923.0, - "step": 167840 - }, - { - "entropy": 1.9448716431856155, - "epoch": 0.5203203139591385, - "grad_norm": 6.0862531661987305, - "learning_rate": 3.5071842521120283e-06, - "loss": 0.4538, - "mean_token_accuracy": 0.8547197684645653, - "num_tokens": 201967672.0, - "step": 167850 - }, - { - "entropy": 1.9171678617596626, - "epoch": 0.5203513130841882, - "grad_norm": 7.810421466827393, - "learning_rate": 3.50707978239376e-06, - "loss": 0.4812, - "mean_token_accuracy": 0.847561101615429, - "num_tokens": 201979178.0, - "step": 167860 - }, - { - "entropy": 1.9273208022117614, - "epoch": 0.5203823122092379, - "grad_norm": 6.643603801727295, - "learning_rate": 3.506975322010564e-06, - "loss": 0.4682, - "mean_token_accuracy": 0.8571745559573174, - "num_tokens": 201990250.0, - "step": 167870 - }, - { - "entropy": 1.8480944350361823, - "epoch": 0.5204133113342876, - "grad_norm": 9.434873580932617, - "learning_rate": 3.506870870961049e-06, - "loss": 0.4179, - "mean_token_accuracy": 0.8565628483891488, - "num_tokens": 202002760.0, - "step": 167880 - }, - { - "entropy": 1.928942036628723, - "epoch": 0.5204443104593373, - "grad_norm": 8.584845542907715, - "learning_rate": 3.506766429243825e-06, - "loss": 0.477, - "mean_token_accuracy": 0.8476613745093345, - "num_tokens": 202014389.0, - "step": 167890 - }, - { - "entropy": 1.8081008911132812, - "epoch": 0.520475309584387, - "grad_norm": 9.22602367401123, - "learning_rate": 3.5066619968575027e-06, - "loss": 0.3941, - "mean_token_accuracy": 0.857190066576004, - "num_tokens": 202027608.0, - "step": 167900 - }, - { - "entropy": 1.8879745230078697, - "epoch": 0.5205063087094367, - "grad_norm": 8.31643009185791, - "learning_rate": 3.5065575738006936e-06, - "loss": 0.4802, - "mean_token_accuracy": 0.849236112833023, - "num_tokens": 202039567.0, - "step": 167910 - }, - { - "entropy": 1.8697897508740424, - "epoch": 0.5205373078344864, - "grad_norm": 8.23218059539795, - "learning_rate": 3.506453160072007e-06, - "loss": 0.4117, - "mean_token_accuracy": 0.8641917198896408, - "num_tokens": 202051156.0, - "step": 167920 - }, - { - "entropy": 1.9332033962011337, - "epoch": 0.520568306959536, - "grad_norm": 8.56834888458252, - "learning_rate": 3.5063487556700566e-06, - "loss": 0.4395, - "mean_token_accuracy": 0.8528130277991295, - "num_tokens": 202062489.0, - "step": 167930 - }, - { - "entropy": 1.8717573434114456, - "epoch": 0.5205993060845857, - "grad_norm": 8.18609619140625, - "learning_rate": 3.5062443605934516e-06, - "loss": 0.4287, - "mean_token_accuracy": 0.8530358478426934, - "num_tokens": 202074193.0, - "step": 167940 - }, - { - "entropy": 1.8864813312888145, - "epoch": 0.5206303052096355, - "grad_norm": 3.9505395889282227, - "learning_rate": 3.506139974840805e-06, - "loss": 0.4388, - "mean_token_accuracy": 0.8556156173348427, - "num_tokens": 202086671.0, - "step": 167950 - }, - { - "entropy": 1.8406436771154404, - "epoch": 0.5206613043346852, - "grad_norm": 9.141216278076172, - "learning_rate": 3.5060355984107285e-06, - "loss": 0.4141, - "mean_token_accuracy": 0.8611835777759552, - "num_tokens": 202099613.0, - "step": 167960 - }, - { - "entropy": 1.9403121635317802, - "epoch": 0.5206923034597348, - "grad_norm": 3.5786476135253906, - "learning_rate": 3.505931231301835e-06, - "loss": 0.4829, - "mean_token_accuracy": 0.8485002890229225, - "num_tokens": 202111350.0, - "step": 167970 - }, - { - "entropy": 1.9006236642599106, - "epoch": 0.5207233025847845, - "grad_norm": 9.575447082519531, - "learning_rate": 3.505826873512737e-06, - "loss": 0.4582, - "mean_token_accuracy": 0.8519483998417854, - "num_tokens": 202123308.0, - "step": 167980 - }, - { - "entropy": 1.8104294762015343, - "epoch": 0.5207543017098343, - "grad_norm": 8.087808609008789, - "learning_rate": 3.5057225250420484e-06, - "loss": 0.3938, - "mean_token_accuracy": 0.8671251326799393, - "num_tokens": 202136153.0, - "step": 167990 - }, - { - "entropy": 1.9393351331353188, - "epoch": 0.520785300834884, - "grad_norm": 10.509461402893066, - "learning_rate": 3.505618185888381e-06, - "loss": 0.4636, - "mean_token_accuracy": 0.8453001469373703, - "num_tokens": 202147809.0, - "step": 168000 - }, - { - "entropy": 1.8146055683493614, - "epoch": 0.5208162999599336, - "grad_norm": 8.376832962036133, - "learning_rate": 3.5055138560503493e-06, - "loss": 0.3666, - "mean_token_accuracy": 0.8654261186718941, - "num_tokens": 202161538.0, - "step": 168010 - }, - { - "entropy": 1.799776268005371, - "epoch": 0.5208472990849833, - "grad_norm": 5.0410614013671875, - "learning_rate": 3.5054095355265664e-06, - "loss": 0.3816, - "mean_token_accuracy": 0.8637054204940796, - "num_tokens": 202175257.0, - "step": 168020 - }, - { - "entropy": 1.884345107525587, - "epoch": 0.5208782982100331, - "grad_norm": 8.229707717895508, - "learning_rate": 3.5053052243156464e-06, - "loss": 0.411, - "mean_token_accuracy": 0.8621705248951912, - "num_tokens": 202187329.0, - "step": 168030 - }, - { - "entropy": 1.9020549565553666, - "epoch": 0.5209092973350827, - "grad_norm": 7.426633358001709, - "learning_rate": 3.505200922416206e-06, - "loss": 0.448, - "mean_token_accuracy": 0.8587285667657852, - "num_tokens": 202198454.0, - "step": 168040 - }, - { - "entropy": 1.850784559547901, - "epoch": 0.5209402964601324, - "grad_norm": 8.169835090637207, - "learning_rate": 3.5050966298268575e-06, - "loss": 0.4485, - "mean_token_accuracy": 0.8499960124492645, - "num_tokens": 202210755.0, - "step": 168050 - }, - { - "entropy": 1.9735260367393495, - "epoch": 0.5209712955851821, - "grad_norm": 7.9670538902282715, - "learning_rate": 3.5049923465462167e-06, - "loss": 0.4895, - "mean_token_accuracy": 0.850350596010685, - "num_tokens": 202222203.0, - "step": 168060 - }, - { - "entropy": 1.761663031578064, - "epoch": 0.5210022947102319, - "grad_norm": 3.8146419525146484, - "learning_rate": 3.5048880725728984e-06, - "loss": 0.3797, - "mean_token_accuracy": 0.8592882245779038, - "num_tokens": 202236919.0, - "step": 168070 - }, - { - "entropy": 1.8373216532170773, - "epoch": 0.5210332938352815, - "grad_norm": 8.581892013549805, - "learning_rate": 3.50478380790552e-06, - "loss": 0.3779, - "mean_token_accuracy": 0.8676151230931282, - "num_tokens": 202249310.0, - "step": 168080 - }, - { - "entropy": 1.9933584868907928, - "epoch": 0.5210642929603312, - "grad_norm": 8.73966121673584, - "learning_rate": 3.5046795525426953e-06, - "loss": 0.5005, - "mean_token_accuracy": 0.8555369764566422, - "num_tokens": 202259808.0, - "step": 168090 - }, - { - "entropy": 1.9289013132452966, - "epoch": 0.5210952920853809, - "grad_norm": 7.783086776733398, - "learning_rate": 3.504575306483041e-06, - "loss": 0.4778, - "mean_token_accuracy": 0.8488955944776535, - "num_tokens": 202271227.0, - "step": 168100 - }, - { - "entropy": 1.9002173766493797, - "epoch": 0.5211262912104306, - "grad_norm": 8.410770416259766, - "learning_rate": 3.504471069725175e-06, - "loss": 0.4502, - "mean_token_accuracy": 0.8557520598173142, - "num_tokens": 202283576.0, - "step": 168110 - }, - { - "entropy": 1.9111146241426469, - "epoch": 0.5211572903354803, - "grad_norm": 4.1151580810546875, - "learning_rate": 3.504366842267712e-06, - "loss": 0.4711, - "mean_token_accuracy": 0.8459262296557426, - "num_tokens": 202295511.0, - "step": 168120 - }, - { - "entropy": 1.9295530125498772, - "epoch": 0.52118828946053, - "grad_norm": 7.352995872497559, - "learning_rate": 3.5042626241092702e-06, - "loss": 0.4767, - "mean_token_accuracy": 0.8493158027529717, - "num_tokens": 202307587.0, - "step": 168130 - }, - { - "entropy": 1.933849672973156, - "epoch": 0.5212192885855796, - "grad_norm": 7.733171463012695, - "learning_rate": 3.504158415248468e-06, - "loss": 0.4492, - "mean_token_accuracy": 0.8486435234546661, - "num_tokens": 202319619.0, - "step": 168140 - }, - { - "entropy": 1.9473506823182105, - "epoch": 0.5212502877106293, - "grad_norm": 7.936611652374268, - "learning_rate": 3.5040542156839207e-06, - "loss": 0.4456, - "mean_token_accuracy": 0.8542565524578094, - "num_tokens": 202331068.0, - "step": 168150 - }, - { - "entropy": 1.7915266767144202, - "epoch": 0.5212812868356791, - "grad_norm": 2.430027723312378, - "learning_rate": 3.503950025414247e-06, - "loss": 0.3578, - "mean_token_accuracy": 0.8742259576916694, - "num_tokens": 202343873.0, - "step": 168160 - }, - { - "entropy": 1.9285468205809593, - "epoch": 0.5213122859607288, - "grad_norm": 4.726843357086182, - "learning_rate": 3.5038458444380665e-06, - "loss": 0.4777, - "mean_token_accuracy": 0.8377237111330033, - "num_tokens": 202356321.0, - "step": 168170 - }, - { - "entropy": 1.9144100919365883, - "epoch": 0.5213432850857784, - "grad_norm": 7.406527996063232, - "learning_rate": 3.5037416727539957e-06, - "loss": 0.4292, - "mean_token_accuracy": 0.8542553022503853, - "num_tokens": 202368467.0, - "step": 168180 - }, - { - "entropy": 1.796132105588913, - "epoch": 0.5213742842108281, - "grad_norm": 9.365875244140625, - "learning_rate": 3.5036375103606553e-06, - "loss": 0.4268, - "mean_token_accuracy": 0.8590227887034416, - "num_tokens": 202381918.0, - "step": 168190 - }, - { - "entropy": 1.9180462673306464, - "epoch": 0.5214052833358779, - "grad_norm": 4.419243812561035, - "learning_rate": 3.5035333572566626e-06, - "loss": 0.4338, - "mean_token_accuracy": 0.8502243459224701, - "num_tokens": 202394433.0, - "step": 168200 - }, - { - "entropy": 1.9016762152314186, - "epoch": 0.5214362824609275, - "grad_norm": 7.407059669494629, - "learning_rate": 3.5034292134406377e-06, - "loss": 0.414, - "mean_token_accuracy": 0.8573667585849762, - "num_tokens": 202406568.0, - "step": 168210 - }, - { - "entropy": 1.914240649342537, - "epoch": 0.5214672815859772, - "grad_norm": 8.344120025634766, - "learning_rate": 3.5033250789112005e-06, - "loss": 0.429, - "mean_token_accuracy": 0.858941039443016, - "num_tokens": 202418554.0, - "step": 168220 - }, - { - "entropy": 1.8893193066120149, - "epoch": 0.5214982807110269, - "grad_norm": 7.024362087249756, - "learning_rate": 3.503220953666971e-06, - "loss": 0.4118, - "mean_token_accuracy": 0.8614280000329018, - "num_tokens": 202430879.0, - "step": 168230 - }, - { - "entropy": 1.9234270766377448, - "epoch": 0.5215292798360767, - "grad_norm": 9.460162162780762, - "learning_rate": 3.503116837706569e-06, - "loss": 0.4683, - "mean_token_accuracy": 0.8518292471766472, - "num_tokens": 202442812.0, - "step": 168240 - }, - { - "entropy": 1.9079196915030479, - "epoch": 0.5215602789611263, - "grad_norm": 6.5405731201171875, - "learning_rate": 3.5030127310286148e-06, - "loss": 0.5293, - "mean_token_accuracy": 0.85546166151762, - "num_tokens": 202455059.0, - "step": 168250 - }, - { - "entropy": 1.9224894732236861, - "epoch": 0.521591278086176, - "grad_norm": 7.023353099822998, - "learning_rate": 3.5029086336317297e-06, - "loss": 0.4551, - "mean_token_accuracy": 0.857777102291584, - "num_tokens": 202466597.0, - "step": 168260 - }, - { - "entropy": 1.8937045753002166, - "epoch": 0.5216222772112257, - "grad_norm": 7.842694282531738, - "learning_rate": 3.5028045455145344e-06, - "loss": 0.4452, - "mean_token_accuracy": 0.8521323055028915, - "num_tokens": 202478310.0, - "step": 168270 - }, - { - "entropy": 1.8532606914639473, - "epoch": 0.5216532763362755, - "grad_norm": 9.699804306030273, - "learning_rate": 3.5027004666756504e-06, - "loss": 0.4198, - "mean_token_accuracy": 0.861707229912281, - "num_tokens": 202490868.0, - "step": 168280 - }, - { - "entropy": 1.836945144087076, - "epoch": 0.5216842754613251, - "grad_norm": 4.717660903930664, - "learning_rate": 3.5025963971136994e-06, - "loss": 0.4407, - "mean_token_accuracy": 0.865605103969574, - "num_tokens": 202504329.0, - "step": 168290 - }, - { - "entropy": 1.827773043513298, - "epoch": 0.5217152745863748, - "grad_norm": 3.8117053508758545, - "learning_rate": 3.5024923368273035e-06, - "loss": 0.4044, - "mean_token_accuracy": 0.866480652987957, - "num_tokens": 202516707.0, - "step": 168300 - }, - { - "entropy": 1.9818437069654464, - "epoch": 0.5217462737114245, - "grad_norm": 7.698195457458496, - "learning_rate": 3.502388285815085e-06, - "loss": 0.4729, - "mean_token_accuracy": 0.8499929904937744, - "num_tokens": 202527629.0, - "step": 168310 - }, - { - "entropy": 1.9486901313066483, - "epoch": 0.5217772728364742, - "grad_norm": 6.33158540725708, - "learning_rate": 3.5022842440756654e-06, - "loss": 0.4677, - "mean_token_accuracy": 0.8544744238257408, - "num_tokens": 202538355.0, - "step": 168320 - }, - { - "entropy": 1.8631914988160134, - "epoch": 0.5218082719615239, - "grad_norm": 4.584253787994385, - "learning_rate": 3.5021802116076686e-06, - "loss": 0.4546, - "mean_token_accuracy": 0.851434463262558, - "num_tokens": 202551027.0, - "step": 168330 - }, - { - "entropy": 1.8589393228292466, - "epoch": 0.5218392710865736, - "grad_norm": 3.837559461593628, - "learning_rate": 3.5020761884097182e-06, - "loss": 0.3925, - "mean_token_accuracy": 0.8624763250350952, - "num_tokens": 202563762.0, - "step": 168340 - }, - { - "entropy": 1.8021437704563141, - "epoch": 0.5218702702116232, - "grad_norm": 4.51010799407959, - "learning_rate": 3.501972174480436e-06, - "loss": 0.4125, - "mean_token_accuracy": 0.8552407011389732, - "num_tokens": 202577418.0, - "step": 168350 - }, - { - "entropy": 1.771887867152691, - "epoch": 0.521901269336673, - "grad_norm": 7.540759086608887, - "learning_rate": 3.501868169818446e-06, - "loss": 0.3321, - "mean_token_accuracy": 0.8738851860165596, - "num_tokens": 202591115.0, - "step": 168360 - }, - { - "entropy": 1.945265081524849, - "epoch": 0.5219322684617227, - "grad_norm": 10.528114318847656, - "learning_rate": 3.501764174422373e-06, - "loss": 0.4937, - "mean_token_accuracy": 0.8545716404914856, - "num_tokens": 202602290.0, - "step": 168370 - }, - { - "entropy": 1.9452046200633049, - "epoch": 0.5219632675867724, - "grad_norm": 9.390413284301758, - "learning_rate": 3.50166018829084e-06, - "loss": 0.484, - "mean_token_accuracy": 0.8406281679868698, - "num_tokens": 202614626.0, - "step": 168380 - }, - { - "entropy": 1.8217643454670907, - "epoch": 0.521994266711822, - "grad_norm": 4.555027484893799, - "learning_rate": 3.5015562114224727e-06, - "loss": 0.421, - "mean_token_accuracy": 0.8638969480991363, - "num_tokens": 202627795.0, - "step": 168390 - }, - { - "entropy": 1.945632593333721, - "epoch": 0.5220252658368717, - "grad_norm": 9.390481948852539, - "learning_rate": 3.5014522438158964e-06, - "loss": 0.4952, - "mean_token_accuracy": 0.8453856199979782, - "num_tokens": 202638589.0, - "step": 168400 - }, - { - "entropy": 1.956186081469059, - "epoch": 0.5220562649619215, - "grad_norm": 7.714739799499512, - "learning_rate": 3.501348285469734e-06, - "loss": 0.5171, - "mean_token_accuracy": 0.8455076336860656, - "num_tokens": 202649948.0, - "step": 168410 - }, - { - "entropy": 1.9386845543980598, - "epoch": 0.5220872640869711, - "grad_norm": 10.144988059997559, - "learning_rate": 3.501244336382612e-06, - "loss": 0.5306, - "mean_token_accuracy": 0.8470963105559349, - "num_tokens": 202661739.0, - "step": 168420 - }, - { - "entropy": 1.8727570980787278, - "epoch": 0.5221182632120208, - "grad_norm": 8.593897819519043, - "learning_rate": 3.5011403965531572e-06, - "loss": 0.4007, - "mean_token_accuracy": 0.8567142084240913, - "num_tokens": 202674217.0, - "step": 168430 - }, - { - "entropy": 1.944206291437149, - "epoch": 0.5221492623370705, - "grad_norm": 4.206070899963379, - "learning_rate": 3.501036465979994e-06, - "loss": 0.4829, - "mean_token_accuracy": 0.849232442677021, - "num_tokens": 202685615.0, - "step": 168440 - }, - { - "entropy": 1.821098268032074, - "epoch": 0.5221802614621203, - "grad_norm": 7.023726940155029, - "learning_rate": 3.500932544661749e-06, - "loss": 0.3865, - "mean_token_accuracy": 0.8620294481515884, - "num_tokens": 202698481.0, - "step": 168450 - }, - { - "entropy": 1.9788533598184586, - "epoch": 0.5222112605871699, - "grad_norm": 7.938227653503418, - "learning_rate": 3.5008286325970486e-06, - "loss": 0.4763, - "mean_token_accuracy": 0.8501974329352379, - "num_tokens": 202709206.0, - "step": 168460 - }, - { - "entropy": 1.9976587742567062, - "epoch": 0.5222422597122196, - "grad_norm": 7.722753524780273, - "learning_rate": 3.50072472978452e-06, - "loss": 0.4661, - "mean_token_accuracy": 0.8504937782883644, - "num_tokens": 202720069.0, - "step": 168470 - }, - { - "entropy": 1.9856803625822068, - "epoch": 0.5222732588372693, - "grad_norm": 8.472858428955078, - "learning_rate": 3.5006208362227906e-06, - "loss": 0.5181, - "mean_token_accuracy": 0.8380648702383041, - "num_tokens": 202731035.0, - "step": 168480 - }, - { - "entropy": 1.9204457074403762, - "epoch": 0.522304257962319, - "grad_norm": 7.643885612487793, - "learning_rate": 3.5005169519104863e-06, - "loss": 0.4523, - "mean_token_accuracy": 0.8558697760105133, - "num_tokens": 202742481.0, - "step": 168490 - }, - { - "entropy": 1.973274603486061, - "epoch": 0.5223352570873687, - "grad_norm": 3.862253189086914, - "learning_rate": 3.5004130768462363e-06, - "loss": 0.4715, - "mean_token_accuracy": 0.8509830832481384, - "num_tokens": 202753483.0, - "step": 168500 - }, - { - "entropy": 1.8810513690114021, - "epoch": 0.5223662562124184, - "grad_norm": 7.860805034637451, - "learning_rate": 3.5003092110286685e-06, - "loss": 0.4418, - "mean_token_accuracy": 0.8568109959363938, - "num_tokens": 202765592.0, - "step": 168510 - }, - { - "entropy": 1.8250865265727043, - "epoch": 0.5223972553374681, - "grad_norm": 3.617983341217041, - "learning_rate": 3.50020535445641e-06, - "loss": 0.3621, - "mean_token_accuracy": 0.86634581387043, - "num_tokens": 202778915.0, - "step": 168520 - }, - { - "entropy": 1.8575761586427688, - "epoch": 0.5224282544625178, - "grad_norm": 8.61739444732666, - "learning_rate": 3.50010150712809e-06, - "loss": 0.4369, - "mean_token_accuracy": 0.8585295125842094, - "num_tokens": 202790824.0, - "step": 168530 - }, - { - "entropy": 1.9157093957066536, - "epoch": 0.5224592535875675, - "grad_norm": 6.821208953857422, - "learning_rate": 3.4999976690423367e-06, - "loss": 0.4177, - "mean_token_accuracy": 0.8653114318847657, - "num_tokens": 202802693.0, - "step": 168540 - }, - { - "entropy": 1.9041334331035613, - "epoch": 0.5224902527126172, - "grad_norm": 4.114165306091309, - "learning_rate": 3.4998938401977808e-06, - "loss": 0.4298, - "mean_token_accuracy": 0.860270942747593, - "num_tokens": 202814814.0, - "step": 168550 - }, - { - "entropy": 1.8540833964943886, - "epoch": 0.5225212518376668, - "grad_norm": 3.6030046939849854, - "learning_rate": 3.499790020593049e-06, - "loss": 0.4128, - "mean_token_accuracy": 0.8587525814771653, - "num_tokens": 202826750.0, - "step": 168560 - }, - { - "entropy": 1.801897644996643, - "epoch": 0.5225522509627166, - "grad_norm": 4.086511611938477, - "learning_rate": 3.499686210226774e-06, - "loss": 0.3762, - "mean_token_accuracy": 0.8603298366069794, - "num_tokens": 202839477.0, - "step": 168570 - }, - { - "entropy": 1.8381630688905717, - "epoch": 0.5225832500877663, - "grad_norm": 6.6824774742126465, - "learning_rate": 3.499582409097583e-06, - "loss": 0.4449, - "mean_token_accuracy": 0.8520712524652481, - "num_tokens": 202851991.0, - "step": 168580 - }, - { - "entropy": 1.9276944547891617, - "epoch": 0.522614249212816, - "grad_norm": 3.1579701900482178, - "learning_rate": 3.4994786172041074e-06, - "loss": 0.4818, - "mean_token_accuracy": 0.85104900598526, - "num_tokens": 202862938.0, - "step": 168590 - }, - { - "entropy": 1.9088122382760049, - "epoch": 0.5226452483378656, - "grad_norm": 8.01588249206543, - "learning_rate": 3.4993748345449783e-06, - "loss": 0.4524, - "mean_token_accuracy": 0.8535411536693573, - "num_tokens": 202874862.0, - "step": 168600 - }, - { - "entropy": 1.9341975510120393, - "epoch": 0.5226762474629154, - "grad_norm": 7.971364498138428, - "learning_rate": 3.4992710611188265e-06, - "loss": 0.4818, - "mean_token_accuracy": 0.8459011435508728, - "num_tokens": 202885847.0, - "step": 168610 - }, - { - "entropy": 1.9188231587409974, - "epoch": 0.5227072465879651, - "grad_norm": 8.544054985046387, - "learning_rate": 3.4991672969242813e-06, - "loss": 0.452, - "mean_token_accuracy": 0.8533058494329453, - "num_tokens": 202897038.0, - "step": 168620 - }, - { - "entropy": 1.865157674252987, - "epoch": 0.5227382457130147, - "grad_norm": 4.100846767425537, - "learning_rate": 3.499063541959976e-06, - "loss": 0.4161, - "mean_token_accuracy": 0.8643029734492302, - "num_tokens": 202908553.0, - "step": 168630 - }, - { - "entropy": 1.9419152081012725, - "epoch": 0.5227692448380644, - "grad_norm": 7.540561199188232, - "learning_rate": 3.4989597962245414e-06, - "loss": 0.4499, - "mean_token_accuracy": 0.8607556402683259, - "num_tokens": 202919848.0, - "step": 168640 - }, - { - "entropy": 1.7625506401062012, - "epoch": 0.5228002439631141, - "grad_norm": 7.734172821044922, - "learning_rate": 3.498856059716609e-06, - "loss": 0.3477, - "mean_token_accuracy": 0.8655755758285523, - "num_tokens": 202933201.0, - "step": 168650 - }, - { - "entropy": 1.7975231781601906, - "epoch": 0.5228312430881639, - "grad_norm": 9.36955451965332, - "learning_rate": 3.4987523324348114e-06, - "loss": 0.3961, - "mean_token_accuracy": 0.8536623597145081, - "num_tokens": 202946343.0, - "step": 168660 - }, - { - "entropy": 1.7663616985082626, - "epoch": 0.5228622422132135, - "grad_norm": 7.1532182693481445, - "learning_rate": 3.498648614377782e-06, - "loss": 0.3221, - "mean_token_accuracy": 0.8766553714871407, - "num_tokens": 202960093.0, - "step": 168670 - }, - { - "entropy": 1.9286183029413224, - "epoch": 0.5228932413382632, - "grad_norm": 8.140057563781738, - "learning_rate": 3.498544905544153e-06, - "loss": 0.4348, - "mean_token_accuracy": 0.8631218791007995, - "num_tokens": 202971106.0, - "step": 168680 - }, - { - "entropy": 1.8273029983043672, - "epoch": 0.5229242404633129, - "grad_norm": 8.231640815734863, - "learning_rate": 3.498441205932556e-06, - "loss": 0.4325, - "mean_token_accuracy": 0.8470514804124832, - "num_tokens": 202984430.0, - "step": 168690 - }, - { - "entropy": 1.9099699601531028, - "epoch": 0.5229552395883627, - "grad_norm": 10.600870132446289, - "learning_rate": 3.498337515541626e-06, - "loss": 0.4953, - "mean_token_accuracy": 0.84662024974823, - "num_tokens": 202995888.0, - "step": 168700 - }, - { - "entropy": 1.8221244931221008, - "epoch": 0.5229862387134123, - "grad_norm": 7.553411483764648, - "learning_rate": 3.498233834369996e-06, - "loss": 0.39, - "mean_token_accuracy": 0.8616412520408631, - "num_tokens": 203008576.0, - "step": 168710 - }, - { - "entropy": 1.8235291928052901, - "epoch": 0.523017237838462, - "grad_norm": 8.67770767211914, - "learning_rate": 3.498130162416301e-06, - "loss": 0.4043, - "mean_token_accuracy": 0.8563500612974166, - "num_tokens": 203022507.0, - "step": 168720 - }, - { - "entropy": 1.9123604744672775, - "epoch": 0.5230482369635117, - "grad_norm": 9.894606590270996, - "learning_rate": 3.4980264996791734e-06, - "loss": 0.4826, - "mean_token_accuracy": 0.8581787392497062, - "num_tokens": 203033907.0, - "step": 168730 - }, - { - "entropy": 1.9000810965895654, - "epoch": 0.5230792360885614, - "grad_norm": 3.7253689765930176, - "learning_rate": 3.4979228461572484e-06, - "loss": 0.4473, - "mean_token_accuracy": 0.8557541042566299, - "num_tokens": 203046455.0, - "step": 168740 - }, - { - "entropy": 1.876817238330841, - "epoch": 0.5231102352136111, - "grad_norm": 10.623446464538574, - "learning_rate": 3.4978192018491614e-06, - "loss": 0.4359, - "mean_token_accuracy": 0.8541541695594788, - "num_tokens": 203058507.0, - "step": 168750 - }, - { - "entropy": 1.9653200805187225, - "epoch": 0.5231412343386608, - "grad_norm": 14.3389253616333, - "learning_rate": 3.497715566753547e-06, - "loss": 0.525, - "mean_token_accuracy": 0.8415609449148178, - "num_tokens": 203069867.0, - "step": 168760 - }, - { - "entropy": 1.9515894502401352, - "epoch": 0.5231722334637104, - "grad_norm": 6.81016731262207, - "learning_rate": 3.497611940869041e-06, - "loss": 0.4799, - "mean_token_accuracy": 0.8508537262678146, - "num_tokens": 203080859.0, - "step": 168770 - }, - { - "entropy": 1.8536252856254578, - "epoch": 0.5232032325887602, - "grad_norm": 4.349778652191162, - "learning_rate": 3.497508324194277e-06, - "loss": 0.4111, - "mean_token_accuracy": 0.8588446915149689, - "num_tokens": 203093835.0, - "step": 168780 - }, - { - "entropy": 1.9472486570477485, - "epoch": 0.5232342317138099, - "grad_norm": 8.198267936706543, - "learning_rate": 3.497404716727893e-06, - "loss": 0.4509, - "mean_token_accuracy": 0.8478568762540817, - "num_tokens": 203105798.0, - "step": 168790 - }, - { - "entropy": 1.9527030482888221, - "epoch": 0.5232652308388596, - "grad_norm": 5.557439804077148, - "learning_rate": 3.4973011184685244e-06, - "loss": 0.4537, - "mean_token_accuracy": 0.8537330031394958, - "num_tokens": 203116698.0, - "step": 168800 - }, - { - "entropy": 1.939917927980423, - "epoch": 0.5232962299639092, - "grad_norm": 7.945644855499268, - "learning_rate": 3.4971975294148085e-06, - "loss": 0.4396, - "mean_token_accuracy": 0.8575639694929122, - "num_tokens": 203128225.0, - "step": 168810 - }, - { - "entropy": 1.9317618682980537, - "epoch": 0.523327229088959, - "grad_norm": 9.663101196289062, - "learning_rate": 3.4970939495653804e-06, - "loss": 0.4807, - "mean_token_accuracy": 0.8493759095668793, - "num_tokens": 203139468.0, - "step": 168820 - }, - { - "entropy": 1.884542666375637, - "epoch": 0.5233582282140087, - "grad_norm": 5.212667465209961, - "learning_rate": 3.4969903789188785e-06, - "loss": 0.4658, - "mean_token_accuracy": 0.8502926960587501, - "num_tokens": 203151240.0, - "step": 168830 - }, - { - "entropy": 1.8819777011871337, - "epoch": 0.5233892273390583, - "grad_norm": 7.3790106773376465, - "learning_rate": 3.49688681747394e-06, - "loss": 0.418, - "mean_token_accuracy": 0.8593169033527375, - "num_tokens": 203162849.0, - "step": 168840 - }, - { - "entropy": 1.887918284535408, - "epoch": 0.523420226464108, - "grad_norm": 10.357288360595703, - "learning_rate": 3.4967832652292015e-06, - "loss": 0.423, - "mean_token_accuracy": 0.8608911827206611, - "num_tokens": 203174283.0, - "step": 168850 - }, - { - "entropy": 1.9403936624526978, - "epoch": 0.5234512255891578, - "grad_norm": 8.486865043640137, - "learning_rate": 3.4966797221833016e-06, - "loss": 0.5105, - "mean_token_accuracy": 0.8440033331513405, - "num_tokens": 203184723.0, - "step": 168860 - }, - { - "entropy": 1.8190939247608184, - "epoch": 0.5234822247142075, - "grad_norm": 2.843285083770752, - "learning_rate": 3.496576188334879e-06, - "loss": 0.4588, - "mean_token_accuracy": 0.8482326999306679, - "num_tokens": 203197121.0, - "step": 168870 - }, - { - "entropy": 1.964727732539177, - "epoch": 0.5235132238392571, - "grad_norm": 8.751008987426758, - "learning_rate": 3.49647266368257e-06, - "loss": 0.4936, - "mean_token_accuracy": 0.8521199285984039, - "num_tokens": 203208003.0, - "step": 168880 - }, - { - "entropy": 1.8414556071162225, - "epoch": 0.5235442229643068, - "grad_norm": 7.824958801269531, - "learning_rate": 3.496369148225016e-06, - "loss": 0.3974, - "mean_token_accuracy": 0.8654927790164948, - "num_tokens": 203220730.0, - "step": 168890 - }, - { - "entropy": 1.7450319975614548, - "epoch": 0.5235752220893565, - "grad_norm": 7.182183265686035, - "learning_rate": 3.4962656419608543e-06, - "loss": 0.3285, - "mean_token_accuracy": 0.8719310641288758, - "num_tokens": 203234537.0, - "step": 168900 - }, - { - "entropy": 1.9158180862665177, - "epoch": 0.5236062212144063, - "grad_norm": 3.739776611328125, - "learning_rate": 3.496162144888725e-06, - "loss": 0.4321, - "mean_token_accuracy": 0.8643902540206909, - "num_tokens": 203246093.0, - "step": 168910 - }, - { - "entropy": 1.863074316084385, - "epoch": 0.5236372203394559, - "grad_norm": 6.915127754211426, - "learning_rate": 3.4960586570072673e-06, - "loss": 0.4325, - "mean_token_accuracy": 0.8576465114951134, - "num_tokens": 203258124.0, - "step": 168920 - }, - { - "entropy": 1.9026149466633797, - "epoch": 0.5236682194645056, - "grad_norm": 6.958155155181885, - "learning_rate": 3.4959551783151207e-06, - "loss": 0.42, - "mean_token_accuracy": 0.8604533329606057, - "num_tokens": 203270041.0, - "step": 168930 - }, - { - "entropy": 1.8807974457740784, - "epoch": 0.5236992185895553, - "grad_norm": 7.943653106689453, - "learning_rate": 3.495851708810926e-06, - "loss": 0.4061, - "mean_token_accuracy": 0.8648590832948685, - "num_tokens": 203281957.0, - "step": 168940 - }, - { - "entropy": 1.8590452671051025, - "epoch": 0.523730217714605, - "grad_norm": 9.260481834411621, - "learning_rate": 3.495748248493323e-06, - "loss": 0.4522, - "mean_token_accuracy": 0.8521923378109932, - "num_tokens": 203293743.0, - "step": 168950 - }, - { - "entropy": 1.8895818576216699, - "epoch": 0.5237612168396547, - "grad_norm": 8.63992691040039, - "learning_rate": 3.495644797360953e-06, - "loss": 0.483, - "mean_token_accuracy": 0.8420132413506508, - "num_tokens": 203305487.0, - "step": 168960 - }, - { - "entropy": 1.8291345238685608, - "epoch": 0.5237922159647044, - "grad_norm": 4.117769241333008, - "learning_rate": 3.4955413554124568e-06, - "loss": 0.4038, - "mean_token_accuracy": 0.8555587381124496, - "num_tokens": 203318682.0, - "step": 168970 - }, - { - "entropy": 1.8397614166140557, - "epoch": 0.523823215089754, - "grad_norm": 3.8362808227539062, - "learning_rate": 3.495437922646475e-06, - "loss": 0.417, - "mean_token_accuracy": 0.8543219909071922, - "num_tokens": 203331200.0, - "step": 168980 - }, - { - "entropy": 1.8733160473406314, - "epoch": 0.5238542142148038, - "grad_norm": 7.672023296356201, - "learning_rate": 3.49533449906165e-06, - "loss": 0.4379, - "mean_token_accuracy": 0.8624173834919929, - "num_tokens": 203342773.0, - "step": 168990 - }, - { - "entropy": 1.8732567384839058, - "epoch": 0.5238852133398535, - "grad_norm": 9.277653694152832, - "learning_rate": 3.495231084656623e-06, - "loss": 0.4505, - "mean_token_accuracy": 0.861282941699028, - "num_tokens": 203354610.0, - "step": 169000 - }, - { - "entropy": 1.810496024042368, - "epoch": 0.5239162124649032, - "grad_norm": 8.090917587280273, - "learning_rate": 3.4951276794300364e-06, - "loss": 0.4273, - "mean_token_accuracy": 0.8605089306831359, - "num_tokens": 203366465.0, - "step": 169010 - }, - { - "entropy": 1.8707633331418037, - "epoch": 0.5239472115899528, - "grad_norm": 8.256461143493652, - "learning_rate": 3.495024283380533e-06, - "loss": 0.4225, - "mean_token_accuracy": 0.857209199666977, - "num_tokens": 203378502.0, - "step": 169020 - }, - { - "entropy": 1.8902933821082115, - "epoch": 0.5239782107150026, - "grad_norm": 10.598012924194336, - "learning_rate": 3.494920896506754e-06, - "loss": 0.4756, - "mean_token_accuracy": 0.8510710790753364, - "num_tokens": 203390100.0, - "step": 169030 - }, - { - "entropy": 1.8810963049530982, - "epoch": 0.5240092098400523, - "grad_norm": 3.4652762413024902, - "learning_rate": 3.494817518807344e-06, - "loss": 0.4237, - "mean_token_accuracy": 0.8639438837766648, - "num_tokens": 203401188.0, - "step": 169040 - }, - { - "entropy": 1.9366990119218825, - "epoch": 0.524040208965102, - "grad_norm": 7.726593971252441, - "learning_rate": 3.4947141502809452e-06, - "loss": 0.5079, - "mean_token_accuracy": 0.8416985169053077, - "num_tokens": 203411783.0, - "step": 169050 - }, - { - "entropy": 1.9235543400049209, - "epoch": 0.5240712080901516, - "grad_norm": 9.40511417388916, - "learning_rate": 3.4946107909262012e-06, - "loss": 0.4447, - "mean_token_accuracy": 0.8531079888343811, - "num_tokens": 203422994.0, - "step": 169060 - }, - { - "entropy": 1.9281636044383048, - "epoch": 0.5241022072152014, - "grad_norm": 8.211299896240234, - "learning_rate": 3.4945074407417565e-06, - "loss": 0.4914, - "mean_token_accuracy": 0.8453192666172982, - "num_tokens": 203435055.0, - "step": 169070 - }, - { - "entropy": 1.8821644067764283, - "epoch": 0.5241332063402511, - "grad_norm": 9.113029479980469, - "learning_rate": 3.494404099726255e-06, - "loss": 0.4271, - "mean_token_accuracy": 0.8508746236562729, - "num_tokens": 203447495.0, - "step": 169080 - }, - { - "entropy": 1.9432899564504624, - "epoch": 0.5241642054653007, - "grad_norm": 8.647567749023438, - "learning_rate": 3.4943007678783398e-06, - "loss": 0.4896, - "mean_token_accuracy": 0.8540888220071793, - "num_tokens": 203458347.0, - "step": 169090 - }, - { - "entropy": 1.9115382328629493, - "epoch": 0.5241952045903504, - "grad_norm": 8.112555503845215, - "learning_rate": 3.4941974451966564e-06, - "loss": 0.4247, - "mean_token_accuracy": 0.851653291285038, - "num_tokens": 203469957.0, - "step": 169100 - }, - { - "entropy": 1.9376183688640594, - "epoch": 0.5242262037154002, - "grad_norm": 9.164286613464355, - "learning_rate": 3.49409413167985e-06, - "loss": 0.5251, - "mean_token_accuracy": 0.8397494927048683, - "num_tokens": 203481549.0, - "step": 169110 - }, - { - "entropy": 1.9066183164715766, - "epoch": 0.5242572028404499, - "grad_norm": 11.092423439025879, - "learning_rate": 3.4939908273265666e-06, - "loss": 0.4596, - "mean_token_accuracy": 0.8511956304311752, - "num_tokens": 203494139.0, - "step": 169120 - }, - { - "entropy": 1.8191746473312378, - "epoch": 0.5242882019654995, - "grad_norm": 7.425056457519531, - "learning_rate": 3.4938875321354493e-06, - "loss": 0.3734, - "mean_token_accuracy": 0.8660463154315948, - "num_tokens": 203507676.0, - "step": 169130 - }, - { - "entropy": 1.8262469589710235, - "epoch": 0.5243192010905492, - "grad_norm": 3.7181785106658936, - "learning_rate": 3.4937842461051453e-06, - "loss": 0.4016, - "mean_token_accuracy": 0.859797814488411, - "num_tokens": 203521237.0, - "step": 169140 - }, - { - "entropy": 1.9659714698791504, - "epoch": 0.5243502002155989, - "grad_norm": 7.551681995391846, - "learning_rate": 3.4936809692343003e-06, - "loss": 0.4721, - "mean_token_accuracy": 0.8544010862708091, - "num_tokens": 203532795.0, - "step": 169150 - }, - { - "entropy": 1.8716353714466094, - "epoch": 0.5243811993406486, - "grad_norm": 7.684150218963623, - "learning_rate": 3.493577701521561e-06, - "loss": 0.3979, - "mean_token_accuracy": 0.8697442948818207, - "num_tokens": 203545242.0, - "step": 169160 - }, - { - "entropy": 1.956957994401455, - "epoch": 0.5244121984656983, - "grad_norm": 8.428984642028809, - "learning_rate": 3.493474442965574e-06, - "loss": 0.4803, - "mean_token_accuracy": 0.844064648449421, - "num_tokens": 203556959.0, - "step": 169170 - }, - { - "entropy": 1.8892595663666725, - "epoch": 0.524443197590748, - "grad_norm": 9.209391593933105, - "learning_rate": 3.4933711935649857e-06, - "loss": 0.4493, - "mean_token_accuracy": 0.8504563570022583, - "num_tokens": 203568917.0, - "step": 169180 - }, - { - "entropy": 1.8977754607796669, - "epoch": 0.5244741967157976, - "grad_norm": 4.250955581665039, - "learning_rate": 3.493267953318443e-06, - "loss": 0.4496, - "mean_token_accuracy": 0.8544679388403893, - "num_tokens": 203580698.0, - "step": 169190 - }, - { - "entropy": 1.966796690225601, - "epoch": 0.5245051958408474, - "grad_norm": 8.080552101135254, - "learning_rate": 3.493164722224594e-06, - "loss": 0.4686, - "mean_token_accuracy": 0.857042309641838, - "num_tokens": 203592023.0, - "step": 169200 - }, - { - "entropy": 1.9782009929418565, - "epoch": 0.5245361949658971, - "grad_norm": 10.818819999694824, - "learning_rate": 3.4930615002820863e-06, - "loss": 0.4827, - "mean_token_accuracy": 0.8517513841390609, - "num_tokens": 203603354.0, - "step": 169210 - }, - { - "entropy": 1.9704937011003494, - "epoch": 0.5245671940909468, - "grad_norm": 9.936196327209473, - "learning_rate": 3.492958287489568e-06, - "loss": 0.444, - "mean_token_accuracy": 0.8553407356142998, - "num_tokens": 203614187.0, - "step": 169220 - }, - { - "entropy": 1.8049380242824555, - "epoch": 0.5245981932159964, - "grad_norm": 2.656586170196533, - "learning_rate": 3.492855083845687e-06, - "loss": 0.441, - "mean_token_accuracy": 0.855417400598526, - "num_tokens": 203626946.0, - "step": 169230 - }, - { - "entropy": 1.8623328655958176, - "epoch": 0.5246291923410462, - "grad_norm": 8.69382095336914, - "learning_rate": 3.492751889349091e-06, - "loss": 0.4543, - "mean_token_accuracy": 0.8532855972647667, - "num_tokens": 203639192.0, - "step": 169240 - }, - { - "entropy": 1.9222152277827262, - "epoch": 0.5246601914660959, - "grad_norm": 9.066729545593262, - "learning_rate": 3.4926487039984308e-06, - "loss": 0.4637, - "mean_token_accuracy": 0.8545737609267234, - "num_tokens": 203650676.0, - "step": 169250 - }, - { - "entropy": 1.929406814277172, - "epoch": 0.5246911905911456, - "grad_norm": 8.320404052734375, - "learning_rate": 3.492545527792354e-06, - "loss": 0.4538, - "mean_token_accuracy": 0.8549767971038819, - "num_tokens": 203662180.0, - "step": 169260 - }, - { - "entropy": 1.9012570947408676, - "epoch": 0.5247221897161952, - "grad_norm": 7.0718536376953125, - "learning_rate": 3.4924423607295107e-06, - "loss": 0.4361, - "mean_token_accuracy": 0.8597986832261085, - "num_tokens": 203673564.0, - "step": 169270 - }, - { - "entropy": 1.7952979236841202, - "epoch": 0.524753188841245, - "grad_norm": 2.635957717895508, - "learning_rate": 3.4923392028085507e-06, - "loss": 0.3773, - "mean_token_accuracy": 0.8655047237873077, - "num_tokens": 203686834.0, - "step": 169280 - }, - { - "entropy": 1.9100728452205658, - "epoch": 0.5247841879662947, - "grad_norm": 3.5995821952819824, - "learning_rate": 3.492236054028123e-06, - "loss": 0.4945, - "mean_token_accuracy": 0.8441936239600182, - "num_tokens": 203698055.0, - "step": 169290 - }, - { - "entropy": 1.900466850399971, - "epoch": 0.5248151870913443, - "grad_norm": 3.480902910232544, - "learning_rate": 3.4921329143868787e-06, - "loss": 0.4328, - "mean_token_accuracy": 0.850808109343052, - "num_tokens": 203710441.0, - "step": 169300 - }, - { - "entropy": 1.8833848714828492, - "epoch": 0.524846186216394, - "grad_norm": 8.502030372619629, - "learning_rate": 3.4920297838834676e-06, - "loss": 0.4221, - "mean_token_accuracy": 0.8584718599915504, - "num_tokens": 203722127.0, - "step": 169310 - }, - { - "entropy": 1.855775459110737, - "epoch": 0.5248771853414438, - "grad_norm": 7.603749752044678, - "learning_rate": 3.491926662516541e-06, - "loss": 0.39, - "mean_token_accuracy": 0.8616424456238747, - "num_tokens": 203735547.0, - "step": 169320 - }, - { - "entropy": 1.9335170581936836, - "epoch": 0.5249081844664935, - "grad_norm": 9.550626754760742, - "learning_rate": 3.4918235502847503e-06, - "loss": 0.4743, - "mean_token_accuracy": 0.8472717061638833, - "num_tokens": 203746751.0, - "step": 169330 - }, - { - "entropy": 1.8978175684809684, - "epoch": 0.5249391835915431, - "grad_norm": 8.762434959411621, - "learning_rate": 3.4917204471867455e-06, - "loss": 0.4518, - "mean_token_accuracy": 0.8437378078699111, - "num_tokens": 203758587.0, - "step": 169340 - }, - { - "entropy": 1.8782646000385284, - "epoch": 0.5249701827165928, - "grad_norm": 7.3637871742248535, - "learning_rate": 3.4916173532211793e-06, - "loss": 0.428, - "mean_token_accuracy": 0.8627133265137672, - "num_tokens": 203770143.0, - "step": 169350 - }, - { - "entropy": 1.9008455261588098, - "epoch": 0.5250011818416426, - "grad_norm": 9.275456428527832, - "learning_rate": 3.491514268386703e-06, - "loss": 0.4408, - "mean_token_accuracy": 0.8553502589464188, - "num_tokens": 203782295.0, - "step": 169360 - }, - { - "entropy": 1.922987850010395, - "epoch": 0.5250321809666922, - "grad_norm": 8.815250396728516, - "learning_rate": 3.49141119268197e-06, - "loss": 0.4384, - "mean_token_accuracy": 0.8504637837409973, - "num_tokens": 203793616.0, - "step": 169370 - }, - { - "entropy": 1.9362686663866042, - "epoch": 0.5250631800917419, - "grad_norm": 7.780436992645264, - "learning_rate": 3.4913081261056313e-06, - "loss": 0.4331, - "mean_token_accuracy": 0.8581616476178169, - "num_tokens": 203805629.0, - "step": 169380 - }, - { - "entropy": 1.8906577423214912, - "epoch": 0.5250941792167916, - "grad_norm": 7.460948944091797, - "learning_rate": 3.4912050686563403e-06, - "loss": 0.4573, - "mean_token_accuracy": 0.8532565832138062, - "num_tokens": 203817628.0, - "step": 169390 - }, - { - "entropy": 1.8528029769659042, - "epoch": 0.5251251783418412, - "grad_norm": 7.081979751586914, - "learning_rate": 3.49110202033275e-06, - "loss": 0.4096, - "mean_token_accuracy": 0.857559771835804, - "num_tokens": 203829983.0, - "step": 169400 - }, - { - "entropy": 1.8572653010487556, - "epoch": 0.525156177466891, - "grad_norm": 8.674805641174316, - "learning_rate": 3.4909989811335133e-06, - "loss": 0.4098, - "mean_token_accuracy": 0.865797932446003, - "num_tokens": 203842180.0, - "step": 169410 - }, - { - "entropy": 1.9635521739721298, - "epoch": 0.5251871765919407, - "grad_norm": 7.031563758850098, - "learning_rate": 3.490895951057284e-06, - "loss": 0.4654, - "mean_token_accuracy": 0.8588854551315308, - "num_tokens": 203853240.0, - "step": 169420 - }, - { - "entropy": 1.8900843843817712, - "epoch": 0.5252181757169904, - "grad_norm": 9.572572708129883, - "learning_rate": 3.4907929301027164e-06, - "loss": 0.4315, - "mean_token_accuracy": 0.8617082446813583, - "num_tokens": 203865022.0, - "step": 169430 - }, - { - "entropy": 1.831702572107315, - "epoch": 0.52524917484204, - "grad_norm": 9.38916015625, - "learning_rate": 3.4906899182684645e-06, - "loss": 0.4375, - "mean_token_accuracy": 0.8631092309951782, - "num_tokens": 203876987.0, - "step": 169440 - }, - { - "entropy": 1.8782099664211274, - "epoch": 0.5252801739670898, - "grad_norm": 8.228728294372559, - "learning_rate": 3.4905869155531813e-06, - "loss": 0.4405, - "mean_token_accuracy": 0.8564833179116249, - "num_tokens": 203888591.0, - "step": 169450 - }, - { - "entropy": 1.8731250807642936, - "epoch": 0.5253111730921395, - "grad_norm": 8.023486137390137, - "learning_rate": 3.490483921955523e-06, - "loss": 0.443, - "mean_token_accuracy": 0.8530664145946503, - "num_tokens": 203900766.0, - "step": 169460 - }, - { - "entropy": 1.8310665681958198, - "epoch": 0.5253421722171892, - "grad_norm": 7.332006931304932, - "learning_rate": 3.4903809374741443e-06, - "loss": 0.396, - "mean_token_accuracy": 0.8547357425093651, - "num_tokens": 203913127.0, - "step": 169470 - }, - { - "entropy": 1.9228072851896285, - "epoch": 0.5253731713422388, - "grad_norm": 7.0422282218933105, - "learning_rate": 3.4902779621077004e-06, - "loss": 0.4801, - "mean_token_accuracy": 0.8502156108617782, - "num_tokens": 203924369.0, - "step": 169480 - }, - { - "entropy": 1.8755038015544414, - "epoch": 0.5254041704672886, - "grad_norm": 3.896348237991333, - "learning_rate": 3.4901749958548465e-06, - "loss": 0.3737, - "mean_token_accuracy": 0.8643324583768844, - "num_tokens": 203937278.0, - "step": 169490 - }, - { - "entropy": 1.8818801745772362, - "epoch": 0.5254351695923383, - "grad_norm": 8.24651050567627, - "learning_rate": 3.4900720387142383e-06, - "loss": 0.43, - "mean_token_accuracy": 0.8514017388224602, - "num_tokens": 203948661.0, - "step": 169500 - }, - { - "entropy": 1.9414301648736, - "epoch": 0.5254661687173879, - "grad_norm": 9.171976089477539, - "learning_rate": 3.4899690906845326e-06, - "loss": 0.4492, - "mean_token_accuracy": 0.8593140855431557, - "num_tokens": 203959539.0, - "step": 169510 - }, - { - "entropy": 1.9445295572280883, - "epoch": 0.5254971678424376, - "grad_norm": 8.482927322387695, - "learning_rate": 3.4898661517643845e-06, - "loss": 0.4712, - "mean_token_accuracy": 0.8427321448922157, - "num_tokens": 203970761.0, - "step": 169520 - }, - { - "entropy": 1.8659766510128974, - "epoch": 0.5255281669674874, - "grad_norm": 8.594217300415039, - "learning_rate": 3.489763221952451e-06, - "loss": 0.4573, - "mean_token_accuracy": 0.8495791599154472, - "num_tokens": 203982341.0, - "step": 169530 - }, - { - "entropy": 1.9512912288308144, - "epoch": 0.5255591660925371, - "grad_norm": 8.930543899536133, - "learning_rate": 3.4896603012473913e-06, - "loss": 0.5258, - "mean_token_accuracy": 0.8444173559546471, - "num_tokens": 203993560.0, - "step": 169540 - }, - { - "entropy": 1.9422188624739647, - "epoch": 0.5255901652175867, - "grad_norm": 8.387059211730957, - "learning_rate": 3.4895573896478594e-06, - "loss": 0.4749, - "mean_token_accuracy": 0.8544582590460778, - "num_tokens": 204004877.0, - "step": 169550 - }, - { - "entropy": 2.0166870802640915, - "epoch": 0.5256211643426364, - "grad_norm": 9.012490272521973, - "learning_rate": 3.4894544871525138e-06, - "loss": 0.5375, - "mean_token_accuracy": 0.8376458153128624, - "num_tokens": 204015926.0, - "step": 169560 - }, - { - "entropy": 1.9142573595046997, - "epoch": 0.5256521634676862, - "grad_norm": 3.9384617805480957, - "learning_rate": 3.489351593760012e-06, - "loss": 0.4249, - "mean_token_accuracy": 0.8592307507991791, - "num_tokens": 204027720.0, - "step": 169570 - }, - { - "entropy": 1.9269609957933427, - "epoch": 0.5256831625927358, - "grad_norm": 8.374215126037598, - "learning_rate": 3.489248709469013e-06, - "loss": 0.4784, - "mean_token_accuracy": 0.8519828408956528, - "num_tokens": 204039091.0, - "step": 169580 - }, - { - "entropy": 1.8991817444562913, - "epoch": 0.5257141617177855, - "grad_norm": 6.979922771453857, - "learning_rate": 3.489145834278175e-06, - "loss": 0.4364, - "mean_token_accuracy": 0.8537618055939674, - "num_tokens": 204051145.0, - "step": 169590 - }, - { - "entropy": 1.9464630469679833, - "epoch": 0.5257451608428352, - "grad_norm": 3.8475420475006104, - "learning_rate": 3.489042968186156e-06, - "loss": 0.4497, - "mean_token_accuracy": 0.8544916957616806, - "num_tokens": 204062107.0, - "step": 169600 - }, - { - "entropy": 1.8664928540587424, - "epoch": 0.525776159967885, - "grad_norm": 5.392734527587891, - "learning_rate": 3.4889401111916138e-06, - "loss": 0.383, - "mean_token_accuracy": 0.8774384975433349, - "num_tokens": 204074381.0, - "step": 169610 - }, - { - "entropy": 1.9056679099798202, - "epoch": 0.5258071590929346, - "grad_norm": 8.054254531860352, - "learning_rate": 3.4888372632932095e-06, - "loss": 0.4806, - "mean_token_accuracy": 0.8502617850899696, - "num_tokens": 204086077.0, - "step": 169620 - }, - { - "entropy": 1.8804067566990852, - "epoch": 0.5258381582179843, - "grad_norm": 9.382166862487793, - "learning_rate": 3.4887344244896004e-06, - "loss": 0.4181, - "mean_token_accuracy": 0.8627091318368911, - "num_tokens": 204097655.0, - "step": 169630 - }, - { - "entropy": 1.9133615478873254, - "epoch": 0.525869157343034, - "grad_norm": 3.766981363296509, - "learning_rate": 3.488631594779449e-06, - "loss": 0.5307, - "mean_token_accuracy": 0.8391897663474083, - "num_tokens": 204109163.0, - "step": 169640 - }, - { - "entropy": 1.8695868149399757, - "epoch": 0.5259001564680836, - "grad_norm": 5.005122184753418, - "learning_rate": 3.488528774161413e-06, - "loss": 0.4281, - "mean_token_accuracy": 0.8512502536177635, - "num_tokens": 204121920.0, - "step": 169650 - }, - { - "entropy": 1.9205554395914077, - "epoch": 0.5259311555931334, - "grad_norm": 7.43350076675415, - "learning_rate": 3.4884259626341523e-06, - "loss": 0.4477, - "mean_token_accuracy": 0.8556781709194183, - "num_tokens": 204133982.0, - "step": 169660 - }, - { - "entropy": 1.9717912420630455, - "epoch": 0.5259621547181831, - "grad_norm": 8.707015037536621, - "learning_rate": 3.488323160196329e-06, - "loss": 0.517, - "mean_token_accuracy": 0.8443710133433342, - "num_tokens": 204145583.0, - "step": 169670 - }, - { - "entropy": 1.8160235270857812, - "epoch": 0.5259931538432328, - "grad_norm": 4.329787254333496, - "learning_rate": 3.488220366846603e-06, - "loss": 0.3306, - "mean_token_accuracy": 0.8737678155303001, - "num_tokens": 204158620.0, - "step": 169680 - }, - { - "entropy": 1.8438635244965553, - "epoch": 0.5260241529682824, - "grad_norm": 6.801762580871582, - "learning_rate": 3.488117582583636e-06, - "loss": 0.486, - "mean_token_accuracy": 0.8483135282993317, - "num_tokens": 204172257.0, - "step": 169690 - }, - { - "entropy": 1.883940464258194, - "epoch": 0.5260551520933322, - "grad_norm": 7.7768120765686035, - "learning_rate": 3.4880148074060883e-06, - "loss": 0.3715, - "mean_token_accuracy": 0.8663879573345185, - "num_tokens": 204185159.0, - "step": 169700 - }, - { - "entropy": 1.8030682742595672, - "epoch": 0.5260861512183819, - "grad_norm": 4.2130584716796875, - "learning_rate": 3.4879120413126217e-06, - "loss": 0.3886, - "mean_token_accuracy": 0.8646674558520318, - "num_tokens": 204198490.0, - "step": 169710 - }, - { - "entropy": 1.9641460493206977, - "epoch": 0.5261171503434315, - "grad_norm": 7.377951622009277, - "learning_rate": 3.487809284301899e-06, - "loss": 0.4551, - "mean_token_accuracy": 0.8550163015723229, - "num_tokens": 204209459.0, - "step": 169720 - }, - { - "entropy": 1.9644610643386842, - "epoch": 0.5261481494684812, - "grad_norm": 8.131065368652344, - "learning_rate": 3.487706536372582e-06, - "loss": 0.4511, - "mean_token_accuracy": 0.8526425957679749, - "num_tokens": 204220685.0, - "step": 169730 - }, - { - "entropy": 1.8255484610795976, - "epoch": 0.526179148593531, - "grad_norm": 2.616626739501953, - "learning_rate": 3.4876037975233325e-06, - "loss": 0.3855, - "mean_token_accuracy": 0.867869146168232, - "num_tokens": 204233514.0, - "step": 169740 - }, - { - "entropy": 1.9307437628507613, - "epoch": 0.5262101477185807, - "grad_norm": 8.157958984375, - "learning_rate": 3.487501067752813e-06, - "loss": 0.449, - "mean_token_accuracy": 0.8660890907049179, - "num_tokens": 204244621.0, - "step": 169750 - }, - { - "entropy": 1.923675388097763, - "epoch": 0.5262411468436303, - "grad_norm": 9.466991424560547, - "learning_rate": 3.4873983470596878e-06, - "loss": 0.4539, - "mean_token_accuracy": 0.8543407127261162, - "num_tokens": 204256714.0, - "step": 169760 - }, - { - "entropy": 1.9599072575569152, - "epoch": 0.52627214596868, - "grad_norm": 8.29330062866211, - "learning_rate": 3.487295635442619e-06, - "loss": 0.5009, - "mean_token_accuracy": 0.8472807705402374, - "num_tokens": 204267567.0, - "step": 169770 - }, - { - "entropy": 1.924948987364769, - "epoch": 0.5263031450937298, - "grad_norm": 7.503832817077637, - "learning_rate": 3.4871929329002707e-06, - "loss": 0.4462, - "mean_token_accuracy": 0.850759707391262, - "num_tokens": 204279403.0, - "step": 169780 - }, - { - "entropy": 1.9135427996516228, - "epoch": 0.5263341442187794, - "grad_norm": 7.390564918518066, - "learning_rate": 3.487090239431306e-06, - "loss": 0.4437, - "mean_token_accuracy": 0.8542888939380646, - "num_tokens": 204290947.0, - "step": 169790 - }, - { - "entropy": 1.8421156138181687, - "epoch": 0.5263651433438291, - "grad_norm": 3.895979642868042, - "learning_rate": 3.4869875550343913e-06, - "loss": 0.3785, - "mean_token_accuracy": 0.8799378126859665, - "num_tokens": 204304029.0, - "step": 169800 - }, - { - "entropy": 1.8601937487721443, - "epoch": 0.5263961424688788, - "grad_norm": 8.522631645202637, - "learning_rate": 3.4868848797081872e-06, - "loss": 0.4064, - "mean_token_accuracy": 0.8664719671010971, - "num_tokens": 204315905.0, - "step": 169810 - }, - { - "entropy": 1.9008443534374238, - "epoch": 0.5264271415939286, - "grad_norm": 8.575570106506348, - "learning_rate": 3.4867822134513614e-06, - "loss": 0.4531, - "mean_token_accuracy": 0.8516224265098572, - "num_tokens": 204327709.0, - "step": 169820 - }, - { - "entropy": 1.8827106207609177, - "epoch": 0.5264581407189782, - "grad_norm": 7.430328369140625, - "learning_rate": 3.486679556262577e-06, - "loss": 0.4653, - "mean_token_accuracy": 0.8505603596568108, - "num_tokens": 204339706.0, - "step": 169830 - }, - { - "entropy": 1.9500410586595536, - "epoch": 0.5264891398440279, - "grad_norm": 8.332409858703613, - "learning_rate": 3.4865769081405006e-06, - "loss": 0.4682, - "mean_token_accuracy": 0.8455384686589241, - "num_tokens": 204351742.0, - "step": 169840 - }, - { - "entropy": 1.9368303149938584, - "epoch": 0.5265201389690776, - "grad_norm": 7.815258979797363, - "learning_rate": 3.4864742690837967e-06, - "loss": 0.4626, - "mean_token_accuracy": 0.8535314634442329, - "num_tokens": 204363119.0, - "step": 169850 - }, - { - "entropy": 1.8521654710173607, - "epoch": 0.5265511380941273, - "grad_norm": 9.0866117477417, - "learning_rate": 3.4863716390911313e-06, - "loss": 0.4524, - "mean_token_accuracy": 0.8530202284455299, - "num_tokens": 204374754.0, - "step": 169860 - }, - { - "entropy": 1.89563407599926, - "epoch": 0.526582137219177, - "grad_norm": 7.82810640335083, - "learning_rate": 3.4862690181611697e-06, - "loss": 0.4125, - "mean_token_accuracy": 0.8638562858104706, - "num_tokens": 204386610.0, - "step": 169870 - }, - { - "entropy": 1.873375302553177, - "epoch": 0.5266131363442267, - "grad_norm": 8.360272407531738, - "learning_rate": 3.4861664062925797e-06, - "loss": 0.409, - "mean_token_accuracy": 0.8664838880300522, - "num_tokens": 204398916.0, - "step": 169880 - }, - { - "entropy": 1.9053056687116623, - "epoch": 0.5266441354692764, - "grad_norm": 7.953768730163574, - "learning_rate": 3.486063803484026e-06, - "loss": 0.4874, - "mean_token_accuracy": 0.8378211975097656, - "num_tokens": 204410801.0, - "step": 169890 - }, - { - "entropy": 2.0088341891765595, - "epoch": 0.526675134594326, - "grad_norm": 8.043516159057617, - "learning_rate": 3.4859612097341776e-06, - "loss": 0.5323, - "mean_token_accuracy": 0.8447351723909378, - "num_tokens": 204421350.0, - "step": 169900 - }, - { - "entropy": 1.8681996390223503, - "epoch": 0.5267061337193758, - "grad_norm": 7.973627090454102, - "learning_rate": 3.4858586250417004e-06, - "loss": 0.4218, - "mean_token_accuracy": 0.8473279893398284, - "num_tokens": 204434186.0, - "step": 169910 - }, - { - "entropy": 1.8609192743897438, - "epoch": 0.5267371328444255, - "grad_norm": 7.692805290222168, - "learning_rate": 3.485756049405261e-06, - "loss": 0.4071, - "mean_token_accuracy": 0.8609997630119324, - "num_tokens": 204446234.0, - "step": 169920 - }, - { - "entropy": 1.920175838470459, - "epoch": 0.5267681319694751, - "grad_norm": 8.081625938415527, - "learning_rate": 3.485653482823528e-06, - "loss": 0.4367, - "mean_token_accuracy": 0.8617697909474373, - "num_tokens": 204457418.0, - "step": 169930 - }, - { - "entropy": 1.9051961615681647, - "epoch": 0.5267991310945248, - "grad_norm": 8.327408790588379, - "learning_rate": 3.48555092529517e-06, - "loss": 0.4447, - "mean_token_accuracy": 0.8548000529408455, - "num_tokens": 204468969.0, - "step": 169940 - }, - { - "entropy": 1.9939599066972733, - "epoch": 0.5268301302195746, - "grad_norm": 7.189398288726807, - "learning_rate": 3.485448376818854e-06, - "loss": 0.4884, - "mean_token_accuracy": 0.8490493327379227, - "num_tokens": 204480095.0, - "step": 169950 - }, - { - "entropy": 1.883024525642395, - "epoch": 0.5268611293446243, - "grad_norm": 7.902504920959473, - "learning_rate": 3.4853458373932486e-06, - "loss": 0.4408, - "mean_token_accuracy": 0.8574581429362297, - "num_tokens": 204492166.0, - "step": 169960 - }, - { - "entropy": 1.8119531005620957, - "epoch": 0.5268921284696739, - "grad_norm": 8.166264533996582, - "learning_rate": 3.4852433070170234e-06, - "loss": 0.4328, - "mean_token_accuracy": 0.8615056142210961, - "num_tokens": 204505151.0, - "step": 169970 - }, - { - "entropy": 1.8452864043414592, - "epoch": 0.5269231275947236, - "grad_norm": 7.124696254730225, - "learning_rate": 3.4851407856888465e-06, - "loss": 0.3881, - "mean_token_accuracy": 0.8681139811873436, - "num_tokens": 204517998.0, - "step": 169980 - }, - { - "entropy": 1.9364143311977386, - "epoch": 0.5269541267197734, - "grad_norm": 7.2239813804626465, - "learning_rate": 3.485038273407387e-06, - "loss": 0.4701, - "mean_token_accuracy": 0.8534222990274429, - "num_tokens": 204529222.0, - "step": 169990 - }, - { - "entropy": 1.9091769382357597, - "epoch": 0.526985125844823, - "grad_norm": 7.802124977111816, - "learning_rate": 3.4849357701713165e-06, - "loss": 0.4399, - "mean_token_accuracy": 0.8642896711826324, - "num_tokens": 204540170.0, - "step": 170000 - }, - { - "entropy": 1.8714770466089248, - "epoch": 0.5270161249698727, - "grad_norm": 9.276057243347168, - "learning_rate": 3.484833275979302e-06, - "loss": 0.4207, - "mean_token_accuracy": 0.8525351405143737, - "num_tokens": 204552616.0, - "step": 170010 - }, - { - "entropy": 1.857717652618885, - "epoch": 0.5270471240949224, - "grad_norm": 8.349593162536621, - "learning_rate": 3.4847307908300156e-06, - "loss": 0.4202, - "mean_token_accuracy": 0.8563280582427979, - "num_tokens": 204565267.0, - "step": 170020 - }, - { - "entropy": 1.7349850058555603, - "epoch": 0.5270781232199722, - "grad_norm": 7.835326194763184, - "learning_rate": 3.484628314722127e-06, - "loss": 0.3564, - "mean_token_accuracy": 0.871354128420353, - "num_tokens": 204579353.0, - "step": 170030 - }, - { - "entropy": 1.8870081350207328, - "epoch": 0.5271091223450218, - "grad_norm": 9.670953750610352, - "learning_rate": 3.484525847654307e-06, - "loss": 0.4444, - "mean_token_accuracy": 0.8451803371310234, - "num_tokens": 204592415.0, - "step": 170040 - }, - { - "entropy": 1.8382927820086479, - "epoch": 0.5271401214700715, - "grad_norm": 8.578036308288574, - "learning_rate": 3.4844233896252264e-06, - "loss": 0.3805, - "mean_token_accuracy": 0.8661419838666916, - "num_tokens": 204605425.0, - "step": 170050 - }, - { - "entropy": 1.969618821144104, - "epoch": 0.5271711205951212, - "grad_norm": 7.408189296722412, - "learning_rate": 3.484320940633557e-06, - "loss": 0.4686, - "mean_token_accuracy": 0.8454100415110588, - "num_tokens": 204616440.0, - "step": 170060 - }, - { - "entropy": 1.9854531973600387, - "epoch": 0.527202119720171, - "grad_norm": 8.708170890808105, - "learning_rate": 3.484218500677969e-06, - "loss": 0.5099, - "mean_token_accuracy": 0.8439430460333824, - "num_tokens": 204627483.0, - "step": 170070 - }, - { - "entropy": 1.8998574405908584, - "epoch": 0.5272331188452206, - "grad_norm": 8.767289161682129, - "learning_rate": 3.4841160697571356e-06, - "loss": 0.4279, - "mean_token_accuracy": 0.8521378070116044, - "num_tokens": 204639576.0, - "step": 170080 - }, - { - "entropy": 1.7341085240244865, - "epoch": 0.5272641179702703, - "grad_norm": 2.5850956439971924, - "learning_rate": 3.484013647869728e-06, - "loss": 0.3823, - "mean_token_accuracy": 0.8609752714633941, - "num_tokens": 204654034.0, - "step": 170090 - }, - { - "entropy": 1.897045449912548, - "epoch": 0.52729511709532, - "grad_norm": 8.623618125915527, - "learning_rate": 3.483911235014418e-06, - "loss": 0.4526, - "mean_token_accuracy": 0.8511906012892723, - "num_tokens": 204665912.0, - "step": 170100 - }, - { - "entropy": 1.9212136805057525, - "epoch": 0.5273261162203697, - "grad_norm": 9.318014144897461, - "learning_rate": 3.4838088311898806e-06, - "loss": 0.4971, - "mean_token_accuracy": 0.8414152905344963, - "num_tokens": 204677184.0, - "step": 170110 - }, - { - "entropy": 1.8723152995109558, - "epoch": 0.5273571153454194, - "grad_norm": 3.7983345985412598, - "learning_rate": 3.483706436394786e-06, - "loss": 0.4603, - "mean_token_accuracy": 0.8477897018194198, - "num_tokens": 204689567.0, - "step": 170120 - }, - { - "entropy": 1.8677892670035363, - "epoch": 0.5273881144704691, - "grad_norm": 4.092134952545166, - "learning_rate": 3.4836040506278078e-06, - "loss": 0.4584, - "mean_token_accuracy": 0.8498900100588799, - "num_tokens": 204701815.0, - "step": 170130 - }, - { - "entropy": 1.9073769941926002, - "epoch": 0.5274191135955187, - "grad_norm": 7.181079387664795, - "learning_rate": 3.4835016738876204e-06, - "loss": 0.4018, - "mean_token_accuracy": 0.8678811907768249, - "num_tokens": 204712717.0, - "step": 170140 - }, - { - "entropy": 1.9618832275271416, - "epoch": 0.5274501127205684, - "grad_norm": 6.790325164794922, - "learning_rate": 3.483399306172897e-06, - "loss": 0.4756, - "mean_token_accuracy": 0.8506255120038986, - "num_tokens": 204724239.0, - "step": 170150 - }, - { - "entropy": 1.8595173746347426, - "epoch": 0.5274811118456182, - "grad_norm": 7.266312599182129, - "learning_rate": 3.483296947482312e-06, - "loss": 0.4021, - "mean_token_accuracy": 0.8559166491031647, - "num_tokens": 204736308.0, - "step": 170160 - }, - { - "entropy": 1.9057711243629456, - "epoch": 0.5275121109706679, - "grad_norm": 8.606921195983887, - "learning_rate": 3.4831945978145384e-06, - "loss": 0.4537, - "mean_token_accuracy": 0.8584994554519654, - "num_tokens": 204747595.0, - "step": 170170 - }, - { - "entropy": 1.9293906196951867, - "epoch": 0.5275431100957175, - "grad_norm": 6.8501386642456055, - "learning_rate": 3.483092257168251e-06, - "loss": 0.4908, - "mean_token_accuracy": 0.8433649852871895, - "num_tokens": 204758718.0, - "step": 170180 - }, - { - "entropy": 1.8679754436016083, - "epoch": 0.5275741092207672, - "grad_norm": 9.489760398864746, - "learning_rate": 3.4829899255421258e-06, - "loss": 0.4286, - "mean_token_accuracy": 0.8598893627524375, - "num_tokens": 204770993.0, - "step": 170190 - }, - { - "entropy": 1.8623951964080334, - "epoch": 0.527605108345817, - "grad_norm": 3.54229998588562, - "learning_rate": 3.4828876029348362e-06, - "loss": 0.4178, - "mean_token_accuracy": 0.860218308866024, - "num_tokens": 204784022.0, - "step": 170200 - }, - { - "entropy": 1.8672780320048332, - "epoch": 0.5276361074708666, - "grad_norm": 8.450098991394043, - "learning_rate": 3.482785289345059e-06, - "loss": 0.3949, - "mean_token_accuracy": 0.8674737766385079, - "num_tokens": 204795531.0, - "step": 170210 - }, - { - "entropy": 1.820541125535965, - "epoch": 0.5276671065959163, - "grad_norm": 4.2782416343688965, - "learning_rate": 3.4826829847714694e-06, - "loss": 0.3921, - "mean_token_accuracy": 0.8648440659046173, - "num_tokens": 204808516.0, - "step": 170220 - }, - { - "entropy": 1.9438726305961609, - "epoch": 0.527698105720966, - "grad_norm": 8.147907257080078, - "learning_rate": 3.482580689212742e-06, - "loss": 0.4952, - "mean_token_accuracy": 0.842696090042591, - "num_tokens": 204819677.0, - "step": 170230 - }, - { - "entropy": 2.027736449241638, - "epoch": 0.5277291048460158, - "grad_norm": 8.2122802734375, - "learning_rate": 3.482478402667555e-06, - "loss": 0.5091, - "mean_token_accuracy": 0.8460215464234352, - "num_tokens": 204830585.0, - "step": 170240 - }, - { - "entropy": 1.800948679447174, - "epoch": 0.5277601039710654, - "grad_norm": 3.8614985942840576, - "learning_rate": 3.4823761251345834e-06, - "loss": 0.3803, - "mean_token_accuracy": 0.8599292874336243, - "num_tokens": 204843549.0, - "step": 170250 - }, - { - "entropy": 1.8939184978604318, - "epoch": 0.5277911030961151, - "grad_norm": 7.87742805480957, - "learning_rate": 3.482273856612504e-06, - "loss": 0.4268, - "mean_token_accuracy": 0.8504303067922592, - "num_tokens": 204855126.0, - "step": 170260 - }, - { - "entropy": 1.983650615811348, - "epoch": 0.5278221022211648, - "grad_norm": 10.45950984954834, - "learning_rate": 3.482171597099994e-06, - "loss": 0.4943, - "mean_token_accuracy": 0.8400062531232834, - "num_tokens": 204866558.0, - "step": 170270 - }, - { - "entropy": 1.8704144909977913, - "epoch": 0.5278531013462145, - "grad_norm": 8.008955955505371, - "learning_rate": 3.48206934659573e-06, - "loss": 0.4065, - "mean_token_accuracy": 0.8669389069080353, - "num_tokens": 204878894.0, - "step": 170280 - }, - { - "entropy": 1.9509722471237183, - "epoch": 0.5278841004712642, - "grad_norm": 9.869590759277344, - "learning_rate": 3.4819671050983904e-06, - "loss": 0.5041, - "mean_token_accuracy": 0.8404587954282761, - "num_tokens": 204889942.0, - "step": 170290 - }, - { - "entropy": 1.8514474794268607, - "epoch": 0.5279150995963139, - "grad_norm": 6.360729694366455, - "learning_rate": 3.4818648726066523e-06, - "loss": 0.4217, - "mean_token_accuracy": 0.8630643576383591, - "num_tokens": 204902937.0, - "step": 170300 - }, - { - "entropy": 1.9012500569224358, - "epoch": 0.5279460987213636, - "grad_norm": 4.0904059410095215, - "learning_rate": 3.4817626491191943e-06, - "loss": 0.4278, - "mean_token_accuracy": 0.8642749860882759, - "num_tokens": 204914895.0, - "step": 170310 - }, - { - "entropy": 1.847599171102047, - "epoch": 0.5279770978464133, - "grad_norm": 9.81718921661377, - "learning_rate": 3.4816604346346944e-06, - "loss": 0.3941, - "mean_token_accuracy": 0.8613127902150154, - "num_tokens": 204926771.0, - "step": 170320 - }, - { - "entropy": 1.9407305240631103, - "epoch": 0.528008096971463, - "grad_norm": 7.33196496963501, - "learning_rate": 3.4815582291518313e-06, - "loss": 0.4521, - "mean_token_accuracy": 0.8560499280691147, - "num_tokens": 204938238.0, - "step": 170330 - }, - { - "entropy": 1.8835730522871017, - "epoch": 0.5280390960965127, - "grad_norm": 7.579594612121582, - "learning_rate": 3.4814560326692833e-06, - "loss": 0.4312, - "mean_token_accuracy": 0.8645493924617768, - "num_tokens": 204949767.0, - "step": 170340 - }, - { - "entropy": 1.8464823752641677, - "epoch": 0.5280700952215623, - "grad_norm": 4.027227401733398, - "learning_rate": 3.4813538451857298e-06, - "loss": 0.4321, - "mean_token_accuracy": 0.8526124745607376, - "num_tokens": 204962686.0, - "step": 170350 - }, - { - "entropy": 1.7890559926629066, - "epoch": 0.5281010943466121, - "grad_norm": 2.017467498779297, - "learning_rate": 3.4812516666998506e-06, - "loss": 0.3931, - "mean_token_accuracy": 0.8606483578681946, - "num_tokens": 204976198.0, - "step": 170360 - }, - { - "entropy": 1.922431980073452, - "epoch": 0.5281320934716618, - "grad_norm": 8.555486679077148, - "learning_rate": 3.4811494972103254e-06, - "loss": 0.4786, - "mean_token_accuracy": 0.8491716757416725, - "num_tokens": 204987681.0, - "step": 170370 - }, - { - "entropy": 1.943414855003357, - "epoch": 0.5281630925967115, - "grad_norm": 9.159770011901855, - "learning_rate": 3.481047336715833e-06, - "loss": 0.4762, - "mean_token_accuracy": 0.8497003749012947, - "num_tokens": 204999270.0, - "step": 170380 - }, - { - "entropy": 1.9107708364725113, - "epoch": 0.5281940917217611, - "grad_norm": 7.628677845001221, - "learning_rate": 3.4809451852150548e-06, - "loss": 0.4448, - "mean_token_accuracy": 0.8630807980895042, - "num_tokens": 205010971.0, - "step": 170390 - }, - { - "entropy": 1.863230326771736, - "epoch": 0.5282250908468108, - "grad_norm": 8.68991756439209, - "learning_rate": 3.4808430427066714e-06, - "loss": 0.4174, - "mean_token_accuracy": 0.8568514421582222, - "num_tokens": 205023252.0, - "step": 170400 - }, - { - "entropy": 1.8994240581989288, - "epoch": 0.5282560899718606, - "grad_norm": 4.405247211456299, - "learning_rate": 3.4807409091893627e-06, - "loss": 0.4538, - "mean_token_accuracy": 0.8561380252242088, - "num_tokens": 205035045.0, - "step": 170410 - }, - { - "entropy": 1.8784004136919976, - "epoch": 0.5282870890969102, - "grad_norm": 7.923066139221191, - "learning_rate": 3.48063878466181e-06, - "loss": 0.465, - "mean_token_accuracy": 0.8491328284144402, - "num_tokens": 205047400.0, - "step": 170420 - }, - { - "entropy": 1.850473153591156, - "epoch": 0.5283180882219599, - "grad_norm": 8.640925407409668, - "learning_rate": 3.480536669122695e-06, - "loss": 0.4093, - "mean_token_accuracy": 0.8576840028166771, - "num_tokens": 205060199.0, - "step": 170430 - }, - { - "entropy": 1.8898152530193328, - "epoch": 0.5283490873470096, - "grad_norm": 9.438566207885742, - "learning_rate": 3.480434562570698e-06, - "loss": 0.4626, - "mean_token_accuracy": 0.8480508312582969, - "num_tokens": 205072684.0, - "step": 170440 - }, - { - "entropy": 1.8964637443423271, - "epoch": 0.5283800864720594, - "grad_norm": 7.4299798011779785, - "learning_rate": 3.4803324650045023e-06, - "loss": 0.4368, - "mean_token_accuracy": 0.851615709066391, - "num_tokens": 205084596.0, - "step": 170450 - }, - { - "entropy": 1.928575037419796, - "epoch": 0.528411085597109, - "grad_norm": 9.050699234008789, - "learning_rate": 3.480230376422789e-06, - "loss": 0.4368, - "mean_token_accuracy": 0.8607550710439682, - "num_tokens": 205096396.0, - "step": 170460 - }, - { - "entropy": 1.9804482892155648, - "epoch": 0.5284420847221587, - "grad_norm": 11.58686637878418, - "learning_rate": 3.4801282968242415e-06, - "loss": 0.4946, - "mean_token_accuracy": 0.8415035635232926, - "num_tokens": 205107901.0, - "step": 170470 - }, - { - "entropy": 1.965294747054577, - "epoch": 0.5284730838472084, - "grad_norm": 3.7001168727874756, - "learning_rate": 3.4800262262075415e-06, - "loss": 0.4808, - "mean_token_accuracy": 0.8423107132315636, - "num_tokens": 205119244.0, - "step": 170480 - }, - { - "entropy": 1.9720071524381637, - "epoch": 0.5285040829722581, - "grad_norm": 7.256556034088135, - "learning_rate": 3.4799241645713715e-06, - "loss": 0.4625, - "mean_token_accuracy": 0.8573978587985038, - "num_tokens": 205129854.0, - "step": 170490 - }, - { - "entropy": 1.9305579409003257, - "epoch": 0.5285350820973078, - "grad_norm": 6.5054097175598145, - "learning_rate": 3.479822111914416e-06, - "loss": 0.4467, - "mean_token_accuracy": 0.8480049282312393, - "num_tokens": 205141601.0, - "step": 170500 - }, - { - "entropy": 1.7918709874153138, - "epoch": 0.5285660812223575, - "grad_norm": 8.549609184265137, - "learning_rate": 3.479720068235358e-06, - "loss": 0.383, - "mean_token_accuracy": 0.8719558849930763, - "num_tokens": 205154051.0, - "step": 170510 - }, - { - "entropy": 1.921235775947571, - "epoch": 0.5285970803474072, - "grad_norm": 4.8535895347595215, - "learning_rate": 3.4796180335328818e-06, - "loss": 0.4472, - "mean_token_accuracy": 0.8587585180997849, - "num_tokens": 205166626.0, - "step": 170520 - }, - { - "entropy": 1.836370651423931, - "epoch": 0.5286280794724569, - "grad_norm": 7.811720371246338, - "learning_rate": 3.47951600780567e-06, - "loss": 0.3946, - "mean_token_accuracy": 0.8683404237031936, - "num_tokens": 205180061.0, - "step": 170530 - }, - { - "entropy": 1.9387354284524918, - "epoch": 0.5286590785975066, - "grad_norm": 10.000458717346191, - "learning_rate": 3.4794139910524073e-06, - "loss": 0.4971, - "mean_token_accuracy": 0.8442668840289116, - "num_tokens": 205190252.0, - "step": 170540 - }, - { - "entropy": 1.9495687514543534, - "epoch": 0.5286900777225563, - "grad_norm": 8.926695823669434, - "learning_rate": 3.479311983271778e-06, - "loss": 0.4923, - "mean_token_accuracy": 0.8437932714819908, - "num_tokens": 205200850.0, - "step": 170550 - }, - { - "entropy": 1.8872013121843338, - "epoch": 0.5287210768476059, - "grad_norm": 8.04249382019043, - "learning_rate": 3.4792099844624676e-06, - "loss": 0.4584, - "mean_token_accuracy": 0.8474228546023369, - "num_tokens": 205212588.0, - "step": 170560 - }, - { - "entropy": 1.8189583510160445, - "epoch": 0.5287520759726557, - "grad_norm": 6.7188568115234375, - "learning_rate": 3.479107994623162e-06, - "loss": 0.3891, - "mean_token_accuracy": 0.8674964413046837, - "num_tokens": 205225588.0, - "step": 170570 - }, - { - "entropy": 1.8325736671686172, - "epoch": 0.5287830750977054, - "grad_norm": 3.9094064235687256, - "learning_rate": 3.4790060137525445e-06, - "loss": 0.4672, - "mean_token_accuracy": 0.8595153912901878, - "num_tokens": 205238340.0, - "step": 170580 - }, - { - "entropy": 1.7467509105801582, - "epoch": 0.5288140742227551, - "grad_norm": 4.745372295379639, - "learning_rate": 3.478904041849302e-06, - "loss": 0.3705, - "mean_token_accuracy": 0.8667123362421989, - "num_tokens": 205252508.0, - "step": 170590 - }, - { - "entropy": 1.8727618649601936, - "epoch": 0.5288450733478047, - "grad_norm": 8.714336395263672, - "learning_rate": 3.4788020789121196e-06, - "loss": 0.4304, - "mean_token_accuracy": 0.8614105626940727, - "num_tokens": 205264806.0, - "step": 170600 - }, - { - "entropy": 1.9451875492930413, - "epoch": 0.5288760724728545, - "grad_norm": 7.6290388107299805, - "learning_rate": 3.478700124939684e-06, - "loss": 0.5023, - "mean_token_accuracy": 0.8461343124508858, - "num_tokens": 205276133.0, - "step": 170610 - }, - { - "entropy": 1.9926445066928864, - "epoch": 0.5289070715979042, - "grad_norm": 8.265352249145508, - "learning_rate": 3.4785981799306824e-06, - "loss": 0.5201, - "mean_token_accuracy": 0.8461702167987823, - "num_tokens": 205286443.0, - "step": 170620 - }, - { - "entropy": 1.9504462242126466, - "epoch": 0.5289380707229538, - "grad_norm": 6.742199420928955, - "learning_rate": 3.4784962438838e-06, - "loss": 0.4811, - "mean_token_accuracy": 0.843754306435585, - "num_tokens": 205297534.0, - "step": 170630 - }, - { - "entropy": 1.8863413706421852, - "epoch": 0.5289690698480035, - "grad_norm": 7.461973667144775, - "learning_rate": 3.478394316797724e-06, - "loss": 0.4452, - "mean_token_accuracy": 0.8559864416718483, - "num_tokens": 205310114.0, - "step": 170640 - }, - { - "entropy": 1.9195245072245597, - "epoch": 0.5290000689730532, - "grad_norm": 7.091519355773926, - "learning_rate": 3.4782923986711427e-06, - "loss": 0.4084, - "mean_token_accuracy": 0.861282479763031, - "num_tokens": 205322178.0, - "step": 170650 - }, - { - "entropy": 1.9424390122294426, - "epoch": 0.529031068098103, - "grad_norm": 7.431682586669922, - "learning_rate": 3.4781904895027417e-06, - "loss": 0.3987, - "mean_token_accuracy": 0.8677595168352127, - "num_tokens": 205333899.0, - "step": 170660 - }, - { - "entropy": 1.9619128614664079, - "epoch": 0.5290620672231526, - "grad_norm": 7.763944625854492, - "learning_rate": 3.4780885892912114e-06, - "loss": 0.4525, - "mean_token_accuracy": 0.8576542928814888, - "num_tokens": 205345194.0, - "step": 170670 - }, - { - "entropy": 1.813578100502491, - "epoch": 0.5290930663482023, - "grad_norm": 9.515689849853516, - "learning_rate": 3.4779866980352376e-06, - "loss": 0.3974, - "mean_token_accuracy": 0.8588124215602875, - "num_tokens": 205358327.0, - "step": 170680 - }, - { - "entropy": 1.9403289988636971, - "epoch": 0.529124065473252, - "grad_norm": 4.967763423919678, - "learning_rate": 3.477884815733509e-06, - "loss": 0.4581, - "mean_token_accuracy": 0.8475630313158036, - "num_tokens": 205370299.0, - "step": 170690 - }, - { - "entropy": 1.9086318165063858, - "epoch": 0.5291550645983017, - "grad_norm": 4.940814018249512, - "learning_rate": 3.4777829423847153e-06, - "loss": 0.4285, - "mean_token_accuracy": 0.8549073994159698, - "num_tokens": 205381624.0, - "step": 170700 - }, - { - "entropy": 1.870780572295189, - "epoch": 0.5291860637233514, - "grad_norm": 8.352327346801758, - "learning_rate": 3.4776810779875443e-06, - "loss": 0.3743, - "mean_token_accuracy": 0.8714735135436058, - "num_tokens": 205394297.0, - "step": 170710 - }, - { - "entropy": 1.930020920932293, - "epoch": 0.5292170628484011, - "grad_norm": 7.697651386260986, - "learning_rate": 3.477579222540686e-06, - "loss": 0.4324, - "mean_token_accuracy": 0.857844527065754, - "num_tokens": 205406236.0, - "step": 170720 - }, - { - "entropy": 1.8871884644031525, - "epoch": 0.5292480619734508, - "grad_norm": 8.5154447555542, - "learning_rate": 3.4774773760428288e-06, - "loss": 0.4334, - "mean_token_accuracy": 0.8678113892674446, - "num_tokens": 205417888.0, - "step": 170730 - }, - { - "entropy": 1.882606066763401, - "epoch": 0.5292790610985005, - "grad_norm": 9.096325874328613, - "learning_rate": 3.4773755384926622e-06, - "loss": 0.4838, - "mean_token_accuracy": 0.8506225869059563, - "num_tokens": 205429795.0, - "step": 170740 - }, - { - "entropy": 1.810667596757412, - "epoch": 0.5293100602235502, - "grad_norm": 9.180743217468262, - "learning_rate": 3.477273709888877e-06, - "loss": 0.4372, - "mean_token_accuracy": 0.8561224222183228, - "num_tokens": 205442704.0, - "step": 170750 - }, - { - "entropy": 1.815290132164955, - "epoch": 0.5293410593485999, - "grad_norm": 3.778643846511841, - "learning_rate": 3.4771718902301632e-06, - "loss": 0.4178, - "mean_token_accuracy": 0.8607045948505402, - "num_tokens": 205455503.0, - "step": 170760 - }, - { - "entropy": 1.8823131069540977, - "epoch": 0.5293720584736495, - "grad_norm": 4.358188152313232, - "learning_rate": 3.4770700795152113e-06, - "loss": 0.4234, - "mean_token_accuracy": 0.857579217851162, - "num_tokens": 205467961.0, - "step": 170770 - }, - { - "entropy": 1.9376216232776642, - "epoch": 0.5294030575986993, - "grad_norm": 8.354575157165527, - "learning_rate": 3.4769682777427115e-06, - "loss": 0.4691, - "mean_token_accuracy": 0.8583876609802246, - "num_tokens": 205479799.0, - "step": 170780 - }, - { - "entropy": 2.025956559181213, - "epoch": 0.529434056723749, - "grad_norm": 8.568845748901367, - "learning_rate": 3.4768664849113554e-06, - "loss": 0.554, - "mean_token_accuracy": 0.834122110903263, - "num_tokens": 205490649.0, - "step": 170790 - }, - { - "entropy": 1.87187303006649, - "epoch": 0.5294650558487987, - "grad_norm": 9.696044921875, - "learning_rate": 3.4767647010198333e-06, - "loss": 0.4231, - "mean_token_accuracy": 0.8556722313165664, - "num_tokens": 205503385.0, - "step": 170800 - }, - { - "entropy": 1.916469356417656, - "epoch": 0.5294960549738483, - "grad_norm": 8.421268463134766, - "learning_rate": 3.476662926066838e-06, - "loss": 0.4559, - "mean_token_accuracy": 0.8445528611540795, - "num_tokens": 205515716.0, - "step": 170810 - }, - { - "entropy": 2.0089281469583513, - "epoch": 0.5295270540988981, - "grad_norm": 8.422050476074219, - "learning_rate": 3.476561160051061e-06, - "loss": 0.5132, - "mean_token_accuracy": 0.8411181181669235, - "num_tokens": 205526612.0, - "step": 170820 - }, - { - "entropy": 1.8615966871380807, - "epoch": 0.5295580532239478, - "grad_norm": 8.690731048583984, - "learning_rate": 3.4764594029711927e-06, - "loss": 0.4089, - "mean_token_accuracy": 0.8632576271891594, - "num_tokens": 205538945.0, - "step": 170830 - }, - { - "entropy": 1.9792701214551927, - "epoch": 0.5295890523489974, - "grad_norm": 7.975484371185303, - "learning_rate": 3.476357654825928e-06, - "loss": 0.4982, - "mean_token_accuracy": 0.8475763037800789, - "num_tokens": 205549884.0, - "step": 170840 - }, - { - "entropy": 1.880912221968174, - "epoch": 0.5296200514740471, - "grad_norm": 8.423057556152344, - "learning_rate": 3.476255915613958e-06, - "loss": 0.426, - "mean_token_accuracy": 0.8568846389651299, - "num_tokens": 205562367.0, - "step": 170850 - }, - { - "entropy": 1.898339208960533, - "epoch": 0.5296510505990969, - "grad_norm": 3.246166706085205, - "learning_rate": 3.4761541853339758e-06, - "loss": 0.4264, - "mean_token_accuracy": 0.8588922902941704, - "num_tokens": 205574760.0, - "step": 170860 - }, - { - "entropy": 1.9326817393302917, - "epoch": 0.5296820497241466, - "grad_norm": 7.557738304138184, - "learning_rate": 3.476052463984674e-06, - "loss": 0.4572, - "mean_token_accuracy": 0.8551507011055947, - "num_tokens": 205586611.0, - "step": 170870 - }, - { - "entropy": 1.9366946816444397, - "epoch": 0.5297130488491962, - "grad_norm": 9.150696754455566, - "learning_rate": 3.475950751564748e-06, - "loss": 0.5066, - "mean_token_accuracy": 0.852847796678543, - "num_tokens": 205598159.0, - "step": 170880 - }, - { - "entropy": 1.8673755928874016, - "epoch": 0.5297440479742459, - "grad_norm": 8.274737358093262, - "learning_rate": 3.4758490480728883e-06, - "loss": 0.4386, - "mean_token_accuracy": 0.859563185274601, - "num_tokens": 205610695.0, - "step": 170890 - }, - { - "entropy": 1.9312998801469803, - "epoch": 0.5297750470992956, - "grad_norm": 4.422121524810791, - "learning_rate": 3.4757473535077918e-06, - "loss": 0.4503, - "mean_token_accuracy": 0.8544506028294563, - "num_tokens": 205623043.0, - "step": 170900 - }, - { - "entropy": 1.878605456650257, - "epoch": 0.5298060462243454, - "grad_norm": 8.475115776062012, - "learning_rate": 3.475645667868151e-06, - "loss": 0.3706, - "mean_token_accuracy": 0.8737639874219895, - "num_tokens": 205635875.0, - "step": 170910 - }, - { - "entropy": 1.8959810853004455, - "epoch": 0.529837045349395, - "grad_norm": 7.229968070983887, - "learning_rate": 3.4755439911526608e-06, - "loss": 0.4773, - "mean_token_accuracy": 0.8536716759204864, - "num_tokens": 205648414.0, - "step": 170920 - }, - { - "entropy": 1.9027748540043832, - "epoch": 0.5298680444744447, - "grad_norm": 7.403899669647217, - "learning_rate": 3.4754423233600166e-06, - "loss": 0.4177, - "mean_token_accuracy": 0.8609417542815209, - "num_tokens": 205660124.0, - "step": 170930 - }, - { - "entropy": 1.8189100459218026, - "epoch": 0.5298990435994944, - "grad_norm": 7.083561420440674, - "learning_rate": 3.475340664488912e-06, - "loss": 0.4209, - "mean_token_accuracy": 0.860698975622654, - "num_tokens": 205672963.0, - "step": 170940 - }, - { - "entropy": 1.9738568156957625, - "epoch": 0.5299300427245441, - "grad_norm": 4.123296737670898, - "learning_rate": 3.4752390145380434e-06, - "loss": 0.469, - "mean_token_accuracy": 0.851046659052372, - "num_tokens": 205683908.0, - "step": 170950 - }, - { - "entropy": 1.9216079100966454, - "epoch": 0.5299610418495938, - "grad_norm": 7.863401412963867, - "learning_rate": 3.4751373735061063e-06, - "loss": 0.4362, - "mean_token_accuracy": 0.8537676095962524, - "num_tokens": 205695636.0, - "step": 170960 - }, - { - "entropy": 1.9529218971729279, - "epoch": 0.5299920409746435, - "grad_norm": 4.082332611083984, - "learning_rate": 3.4750357413917956e-06, - "loss": 0.4356, - "mean_token_accuracy": 0.8507506966590881, - "num_tokens": 205707912.0, - "step": 170970 - }, - { - "entropy": 1.9960295543074609, - "epoch": 0.5300230400996931, - "grad_norm": 9.555853843688965, - "learning_rate": 3.4749341181938086e-06, - "loss": 0.4904, - "mean_token_accuracy": 0.8483745753765106, - "num_tokens": 205719170.0, - "step": 170980 - }, - { - "entropy": 1.9430689826607703, - "epoch": 0.5300540392247429, - "grad_norm": 6.543044567108154, - "learning_rate": 3.4748325039108404e-06, - "loss": 0.4138, - "mean_token_accuracy": 0.8665653809905052, - "num_tokens": 205730921.0, - "step": 170990 - }, - { - "entropy": 1.7677809298038483, - "epoch": 0.5300850383497926, - "grad_norm": 4.4991536140441895, - "learning_rate": 3.4747308985415882e-06, - "loss": 0.3484, - "mean_token_accuracy": 0.8721923053264617, - "num_tokens": 205745399.0, - "step": 171000 - }, - { - "entropy": 1.8735228538513184, - "epoch": 0.5301160374748423, - "grad_norm": 7.107239723205566, - "learning_rate": 3.4746293020847494e-06, - "loss": 0.436, - "mean_token_accuracy": 0.856609933078289, - "num_tokens": 205758439.0, - "step": 171010 - }, - { - "entropy": 1.9389600038528443, - "epoch": 0.5301470365998919, - "grad_norm": 8.752920150756836, - "learning_rate": 3.4745277145390203e-06, - "loss": 0.4929, - "mean_token_accuracy": 0.8403217494487762, - "num_tokens": 205770191.0, - "step": 171020 - }, - { - "entropy": 1.9211769714951514, - "epoch": 0.5301780357249417, - "grad_norm": 8.575197219848633, - "learning_rate": 3.474426135903099e-06, - "loss": 0.4398, - "mean_token_accuracy": 0.8504500105977059, - "num_tokens": 205782618.0, - "step": 171030 - }, - { - "entropy": 1.9171297043561935, - "epoch": 0.5302090348499914, - "grad_norm": 7.678973197937012, - "learning_rate": 3.4743245661756834e-06, - "loss": 0.4715, - "mean_token_accuracy": 0.8509926497936249, - "num_tokens": 205794273.0, - "step": 171040 - }, - { - "entropy": 1.945462729036808, - "epoch": 0.530240033975041, - "grad_norm": 6.8942060470581055, - "learning_rate": 3.4742230053554697e-06, - "loss": 0.4429, - "mean_token_accuracy": 0.8553547367453576, - "num_tokens": 205805838.0, - "step": 171050 - }, - { - "entropy": 1.9018363282084465, - "epoch": 0.5302710331000907, - "grad_norm": 7.0610504150390625, - "learning_rate": 3.474121453441158e-06, - "loss": 0.4176, - "mean_token_accuracy": 0.8616586282849312, - "num_tokens": 205818642.0, - "step": 171060 - }, - { - "entropy": 1.9222232773900032, - "epoch": 0.5303020322251405, - "grad_norm": 6.2709784507751465, - "learning_rate": 3.474019910431446e-06, - "loss": 0.4335, - "mean_token_accuracy": 0.8556934267282486, - "num_tokens": 205831069.0, - "step": 171070 - }, - { - "entropy": 1.8316721200942994, - "epoch": 0.5303330313501902, - "grad_norm": 3.8591957092285156, - "learning_rate": 3.4739183763250324e-06, - "loss": 0.3959, - "mean_token_accuracy": 0.8605335086584092, - "num_tokens": 205844403.0, - "step": 171080 - }, - { - "entropy": 1.9160492144525052, - "epoch": 0.5303640304752398, - "grad_norm": 2.401305913925171, - "learning_rate": 3.4738168511206166e-06, - "loss": 0.4393, - "mean_token_accuracy": 0.8519866108894348, - "num_tokens": 205857195.0, - "step": 171090 - }, - { - "entropy": 1.9100564211606978, - "epoch": 0.5303950296002895, - "grad_norm": 9.531085968017578, - "learning_rate": 3.4737153348168974e-06, - "loss": 0.4267, - "mean_token_accuracy": 0.8574813723564148, - "num_tokens": 205868676.0, - "step": 171100 - }, - { - "entropy": 1.892510113120079, - "epoch": 0.5304260287253393, - "grad_norm": 8.419562339782715, - "learning_rate": 3.4736138274125736e-06, - "loss": 0.4768, - "mean_token_accuracy": 0.8523040831089019, - "num_tokens": 205880267.0, - "step": 171110 - }, - { - "entropy": 1.930445173382759, - "epoch": 0.530457027850389, - "grad_norm": 8.468448638916016, - "learning_rate": 3.473512328906347e-06, - "loss": 0.4445, - "mean_token_accuracy": 0.8609248101711273, - "num_tokens": 205891258.0, - "step": 171120 - }, - { - "entropy": 1.933561460673809, - "epoch": 0.5304880269754386, - "grad_norm": 9.11067008972168, - "learning_rate": 3.473410839296916e-06, - "loss": 0.4754, - "mean_token_accuracy": 0.8550457268953323, - "num_tokens": 205902734.0, - "step": 171130 - }, - { - "entropy": 1.8673659279942512, - "epoch": 0.5305190261004883, - "grad_norm": 7.698315143585205, - "learning_rate": 3.4733093585829824e-06, - "loss": 0.4254, - "mean_token_accuracy": 0.8555175766348839, - "num_tokens": 205915319.0, - "step": 171140 - }, - { - "entropy": 1.87417384237051, - "epoch": 0.530550025225538, - "grad_norm": 9.63538646697998, - "learning_rate": 3.4732078867632454e-06, - "loss": 0.4275, - "mean_token_accuracy": 0.8585587859153747, - "num_tokens": 205927617.0, - "step": 171150 - }, - { - "entropy": 1.7987977162003517, - "epoch": 0.5305810243505877, - "grad_norm": 8.212528228759766, - "learning_rate": 3.473106423836406e-06, - "loss": 0.4199, - "mean_token_accuracy": 0.8556270152330399, - "num_tokens": 205941685.0, - "step": 171160 - }, - { - "entropy": 1.9743974149227141, - "epoch": 0.5306120234756374, - "grad_norm": 8.22183609008789, - "learning_rate": 3.4730049698011663e-06, - "loss": 0.4375, - "mean_token_accuracy": 0.8619926169514656, - "num_tokens": 205952726.0, - "step": 171170 - }, - { - "entropy": 1.8682279139757156, - "epoch": 0.5306430226006871, - "grad_norm": 8.998403549194336, - "learning_rate": 3.4729035246562272e-06, - "loss": 0.3837, - "mean_token_accuracy": 0.8567379638552666, - "num_tokens": 205965714.0, - "step": 171180 - }, - { - "entropy": 1.999984535574913, - "epoch": 0.5306740217257367, - "grad_norm": 10.236671447753906, - "learning_rate": 3.4728020884002904e-06, - "loss": 0.4473, - "mean_token_accuracy": 0.8585654407739639, - "num_tokens": 205976447.0, - "step": 171190 - }, - { - "entropy": 1.9566071853041649, - "epoch": 0.5307050208507865, - "grad_norm": 8.674802780151367, - "learning_rate": 3.4727006610320568e-06, - "loss": 0.4732, - "mean_token_accuracy": 0.8497566193342209, - "num_tokens": 205987680.0, - "step": 171200 - }, - { - "entropy": 1.9564196184277534, - "epoch": 0.5307360199758362, - "grad_norm": 9.1547269821167, - "learning_rate": 3.4725992425502305e-06, - "loss": 0.4506, - "mean_token_accuracy": 0.8570850923657417, - "num_tokens": 205999031.0, - "step": 171210 - }, - { - "entropy": 1.8233773186802864, - "epoch": 0.5307670191008859, - "grad_norm": 3.746918201446533, - "learning_rate": 3.4724978329535126e-06, - "loss": 0.3573, - "mean_token_accuracy": 0.868577241897583, - "num_tokens": 206012684.0, - "step": 171220 - }, - { - "entropy": 1.9081631690263747, - "epoch": 0.5307980182259355, - "grad_norm": 8.837114334106445, - "learning_rate": 3.472396432240606e-06, - "loss": 0.4134, - "mean_token_accuracy": 0.8688212335109711, - "num_tokens": 206024510.0, - "step": 171230 - }, - { - "entropy": 1.9076814904808999, - "epoch": 0.5308290173509853, - "grad_norm": 3.593170642852783, - "learning_rate": 3.4722950404102142e-06, - "loss": 0.4478, - "mean_token_accuracy": 0.8619635835289955, - "num_tokens": 206036479.0, - "step": 171240 - }, - { - "entropy": 1.9302921831607818, - "epoch": 0.530860016476035, - "grad_norm": 9.049263000488281, - "learning_rate": 3.4721936574610398e-06, - "loss": 0.5019, - "mean_token_accuracy": 0.8401584982872009, - "num_tokens": 206048513.0, - "step": 171250 - }, - { - "entropy": 1.9466217041015625, - "epoch": 0.5308910156010846, - "grad_norm": 8.891075134277344, - "learning_rate": 3.4720922833917873e-06, - "loss": 0.459, - "mean_token_accuracy": 0.8489014402031898, - "num_tokens": 206059987.0, - "step": 171260 - }, - { - "entropy": 1.9733015805482865, - "epoch": 0.5309220147261343, - "grad_norm": 8.178411483764648, - "learning_rate": 3.471990918201159e-06, - "loss": 0.484, - "mean_token_accuracy": 0.8565971910953522, - "num_tokens": 206071045.0, - "step": 171270 - }, - { - "entropy": 1.9010334610939026, - "epoch": 0.5309530138511841, - "grad_norm": 4.258671283721924, - "learning_rate": 3.4718895618878607e-06, - "loss": 0.4309, - "mean_token_accuracy": 0.8510370135307312, - "num_tokens": 206083794.0, - "step": 171280 - }, - { - "entropy": 1.9572122678160668, - "epoch": 0.5309840129762338, - "grad_norm": 6.8072285652160645, - "learning_rate": 3.4717882144505954e-06, - "loss": 0.4409, - "mean_token_accuracy": 0.8524574264883995, - "num_tokens": 206095828.0, - "step": 171290 - }, - { - "entropy": 1.7896913141012192, - "epoch": 0.5310150121012834, - "grad_norm": 8.507625579833984, - "learning_rate": 3.471686875888068e-06, - "loss": 0.3289, - "mean_token_accuracy": 0.8647554531693459, - "num_tokens": 206109338.0, - "step": 171300 - }, - { - "entropy": 1.9450671926140786, - "epoch": 0.5310460112263331, - "grad_norm": 9.12865161895752, - "learning_rate": 3.471585546198984e-06, - "loss": 0.4163, - "mean_token_accuracy": 0.8636203348636627, - "num_tokens": 206120543.0, - "step": 171310 - }, - { - "entropy": 2.017956680059433, - "epoch": 0.5310770103513829, - "grad_norm": 7.720823287963867, - "learning_rate": 3.4714842253820474e-06, - "loss": 0.5124, - "mean_token_accuracy": 0.8404148548841477, - "num_tokens": 206131003.0, - "step": 171320 - }, - { - "entropy": 1.9527254745364189, - "epoch": 0.5311080094764326, - "grad_norm": 8.367618560791016, - "learning_rate": 3.4713829134359637e-06, - "loss": 0.4437, - "mean_token_accuracy": 0.8573798596858978, - "num_tokens": 206142871.0, - "step": 171330 - }, - { - "entropy": 1.8782565340399742, - "epoch": 0.5311390086014822, - "grad_norm": 11.827756881713867, - "learning_rate": 3.47128161035944e-06, - "loss": 0.498, - "mean_token_accuracy": 0.8391523391008378, - "num_tokens": 206155200.0, - "step": 171340 - }, - { - "entropy": 1.8876243814826013, - "epoch": 0.5311700077265319, - "grad_norm": 2.4594743251800537, - "learning_rate": 3.471180316151181e-06, - "loss": 0.4669, - "mean_token_accuracy": 0.8383335024118423, - "num_tokens": 206167298.0, - "step": 171350 - }, - { - "entropy": 1.9841479808092117, - "epoch": 0.5312010068515816, - "grad_norm": 7.097223281860352, - "learning_rate": 3.4710790308098923e-06, - "loss": 0.473, - "mean_token_accuracy": 0.8582739785313607, - "num_tokens": 206178070.0, - "step": 171360 - }, - { - "entropy": 1.7915344461798668, - "epoch": 0.5312320059766313, - "grad_norm": 6.845920562744141, - "learning_rate": 3.4709777543342814e-06, - "loss": 0.3932, - "mean_token_accuracy": 0.8660350710153579, - "num_tokens": 206191483.0, - "step": 171370 - }, - { - "entropy": 1.9542489141225814, - "epoch": 0.531263005101681, - "grad_norm": 5.8323211669921875, - "learning_rate": 3.470876486723055e-06, - "loss": 0.4619, - "mean_token_accuracy": 0.8544400915503502, - "num_tokens": 206202729.0, - "step": 171380 - }, - { - "entropy": 1.9344114646315576, - "epoch": 0.5312940042267307, - "grad_norm": 7.813360214233398, - "learning_rate": 3.4707752279749196e-06, - "loss": 0.4989, - "mean_token_accuracy": 0.8451777279376984, - "num_tokens": 206214813.0, - "step": 171390 - }, - { - "entropy": 1.8977814212441444, - "epoch": 0.5313250033517803, - "grad_norm": 7.615082263946533, - "learning_rate": 3.470673978088583e-06, - "loss": 0.4756, - "mean_token_accuracy": 0.8473282709717751, - "num_tokens": 206226984.0, - "step": 171400 - }, - { - "entropy": 1.835694682598114, - "epoch": 0.5313560024768301, - "grad_norm": 7.679605960845947, - "learning_rate": 3.470572737062751e-06, - "loss": 0.3752, - "mean_token_accuracy": 0.86816466152668, - "num_tokens": 206239730.0, - "step": 171410 - }, - { - "entropy": 1.909173347055912, - "epoch": 0.5313870016018798, - "grad_norm": 7.6606245040893555, - "learning_rate": 3.470471504896134e-06, - "loss": 0.4562, - "mean_token_accuracy": 0.860865730047226, - "num_tokens": 206251566.0, - "step": 171420 - }, - { - "entropy": 1.9146757140755652, - "epoch": 0.5314180007269295, - "grad_norm": 9.230483055114746, - "learning_rate": 3.470370281587438e-06, - "loss": 0.4581, - "mean_token_accuracy": 0.8539326146245003, - "num_tokens": 206264185.0, - "step": 171430 - }, - { - "entropy": 1.965253323316574, - "epoch": 0.5314489998519791, - "grad_norm": 8.606255531311035, - "learning_rate": 3.4702690671353717e-06, - "loss": 0.4623, - "mean_token_accuracy": 0.8522212520241738, - "num_tokens": 206275569.0, - "step": 171440 - }, - { - "entropy": 1.9642528668045998, - "epoch": 0.5314799989770289, - "grad_norm": 4.579861640930176, - "learning_rate": 3.470167861538645e-06, - "loss": 0.4641, - "mean_token_accuracy": 0.8538163334131241, - "num_tokens": 206286986.0, - "step": 171450 - }, - { - "entropy": 1.9098300486803055, - "epoch": 0.5315109981020786, - "grad_norm": 8.911893844604492, - "learning_rate": 3.4700666647959643e-06, - "loss": 0.4264, - "mean_token_accuracy": 0.8599412739276886, - "num_tokens": 206298944.0, - "step": 171460 - }, - { - "entropy": 1.8672787815332412, - "epoch": 0.5315419972271282, - "grad_norm": 9.113754272460938, - "learning_rate": 3.4699654769060397e-06, - "loss": 0.423, - "mean_token_accuracy": 0.8529917195439338, - "num_tokens": 206311131.0, - "step": 171470 - }, - { - "entropy": 1.91919025182724, - "epoch": 0.5315729963521779, - "grad_norm": 3.830425977706909, - "learning_rate": 3.4698642978675815e-06, - "loss": 0.4355, - "mean_token_accuracy": 0.8582732066512108, - "num_tokens": 206322833.0, - "step": 171480 - }, - { - "entropy": 1.9081533446907997, - "epoch": 0.5316039954772277, - "grad_norm": 9.601178169250488, - "learning_rate": 3.469763127679298e-06, - "loss": 0.4191, - "mean_token_accuracy": 0.8536054447293282, - "num_tokens": 206334813.0, - "step": 171490 - }, - { - "entropy": 1.929221235215664, - "epoch": 0.5316349946022774, - "grad_norm": 10.570503234863281, - "learning_rate": 3.4696619663399002e-06, - "loss": 0.4636, - "mean_token_accuracy": 0.8487908989191055, - "num_tokens": 206346558.0, - "step": 171500 - }, - { - "entropy": 1.8757613226771355, - "epoch": 0.531665993727327, - "grad_norm": 8.72884464263916, - "learning_rate": 3.4695608138480966e-06, - "loss": 0.5066, - "mean_token_accuracy": 0.84766735881567, - "num_tokens": 206360134.0, - "step": 171510 - }, - { - "entropy": 1.9297456085681914, - "epoch": 0.5316969928523767, - "grad_norm": 3.734219551086426, - "learning_rate": 3.4694596702026e-06, - "loss": 0.4571, - "mean_token_accuracy": 0.8514823019504547, - "num_tokens": 206371913.0, - "step": 171520 - }, - { - "entropy": 1.9066060423851012, - "epoch": 0.5317279919774265, - "grad_norm": 8.339156150817871, - "learning_rate": 3.4693585354021194e-06, - "loss": 0.4002, - "mean_token_accuracy": 0.8531105428934097, - "num_tokens": 206385110.0, - "step": 171530 - }, - { - "entropy": 1.9329568713903427, - "epoch": 0.5317589911024762, - "grad_norm": 8.051816940307617, - "learning_rate": 3.469257409445365e-06, - "loss": 0.4729, - "mean_token_accuracy": 0.8447334930300713, - "num_tokens": 206396309.0, - "step": 171540 - }, - { - "entropy": 1.9102501556277276, - "epoch": 0.5317899902275258, - "grad_norm": 3.6929969787597656, - "learning_rate": 3.46915629233105e-06, - "loss": 0.4367, - "mean_token_accuracy": 0.857056412100792, - "num_tokens": 206408121.0, - "step": 171550 - }, - { - "entropy": 1.8484255477786065, - "epoch": 0.5318209893525755, - "grad_norm": 7.808130264282227, - "learning_rate": 3.469055184057884e-06, - "loss": 0.4515, - "mean_token_accuracy": 0.8478391453623771, - "num_tokens": 206421278.0, - "step": 171560 - }, - { - "entropy": 1.7564107075333595, - "epoch": 0.5318519884776253, - "grad_norm": 4.053802013397217, - "learning_rate": 3.46895408462458e-06, - "loss": 0.3849, - "mean_token_accuracy": 0.8702838957309723, - "num_tokens": 206435223.0, - "step": 171570 - }, - { - "entropy": 1.904933924973011, - "epoch": 0.5318829876026749, - "grad_norm": 7.180025577545166, - "learning_rate": 3.4688529940298493e-06, - "loss": 0.4499, - "mean_token_accuracy": 0.8612485557794571, - "num_tokens": 206446956.0, - "step": 171580 - }, - { - "entropy": 1.9514161705970765, - "epoch": 0.5319139867277246, - "grad_norm": 8.409560203552246, - "learning_rate": 3.4687519122724043e-06, - "loss": 0.4828, - "mean_token_accuracy": 0.8485655844211578, - "num_tokens": 206458061.0, - "step": 171590 - }, - { - "entropy": 1.912647745013237, - "epoch": 0.5319449858527743, - "grad_norm": 3.6584439277648926, - "learning_rate": 3.4686508393509575e-06, - "loss": 0.4456, - "mean_token_accuracy": 0.8545682623982429, - "num_tokens": 206470372.0, - "step": 171600 - }, - { - "entropy": 1.895480439066887, - "epoch": 0.531975984977824, - "grad_norm": 7.713948726654053, - "learning_rate": 3.4685497752642215e-06, - "loss": 0.4125, - "mean_token_accuracy": 0.8665553316473961, - "num_tokens": 206481643.0, - "step": 171610 - }, - { - "entropy": 1.78747378885746, - "epoch": 0.5320069841028737, - "grad_norm": 9.084161758422852, - "learning_rate": 3.4684487200109096e-06, - "loss": 0.3502, - "mean_token_accuracy": 0.8678223341703415, - "num_tokens": 206495368.0, - "step": 171620 - }, - { - "entropy": 2.0036447823047636, - "epoch": 0.5320379832279234, - "grad_norm": 8.734299659729004, - "learning_rate": 3.468347673589735e-06, - "loss": 0.4483, - "mean_token_accuracy": 0.8604890152812004, - "num_tokens": 206505972.0, - "step": 171630 - }, - { - "entropy": 1.8867813602089882, - "epoch": 0.5320689823529731, - "grad_norm": 4.2584147453308105, - "learning_rate": 3.468246635999411e-06, - "loss": 0.4346, - "mean_token_accuracy": 0.8592818707227707, - "num_tokens": 206517895.0, - "step": 171640 - }, - { - "entropy": 1.8816465973854064, - "epoch": 0.5320999814780227, - "grad_norm": 3.920475721359253, - "learning_rate": 3.468145607238652e-06, - "loss": 0.4481, - "mean_token_accuracy": 0.844399020075798, - "num_tokens": 206530527.0, - "step": 171650 - }, - { - "entropy": 1.887511445581913, - "epoch": 0.5321309806030725, - "grad_norm": 7.801617622375488, - "learning_rate": 3.4680445873061712e-06, - "loss": 0.3949, - "mean_token_accuracy": 0.8650444522500038, - "num_tokens": 206542889.0, - "step": 171660 - }, - { - "entropy": 1.9158253505825997, - "epoch": 0.5321619797281222, - "grad_norm": 9.402112007141113, - "learning_rate": 3.4679435762006837e-06, - "loss": 0.4236, - "mean_token_accuracy": 0.8618099376559257, - "num_tokens": 206555988.0, - "step": 171670 - }, - { - "entropy": 1.87806778550148, - "epoch": 0.5321929788531718, - "grad_norm": 9.117574691772461, - "learning_rate": 3.4678425739209038e-06, - "loss": 0.4078, - "mean_token_accuracy": 0.8550878211855888, - "num_tokens": 206569026.0, - "step": 171680 - }, - { - "entropy": 1.9359776824712753, - "epoch": 0.5322239779782215, - "grad_norm": 8.080842971801758, - "learning_rate": 3.467741580465545e-06, - "loss": 0.4795, - "mean_token_accuracy": 0.8594881877303123, - "num_tokens": 206580849.0, - "step": 171690 - }, - { - "entropy": 1.8874040782451629, - "epoch": 0.5322549771032713, - "grad_norm": 9.077346801757812, - "learning_rate": 3.4676405958333254e-06, - "loss": 0.4366, - "mean_token_accuracy": 0.8619496509432792, - "num_tokens": 206592399.0, - "step": 171700 - }, - { - "entropy": 1.9352701306343079, - "epoch": 0.532285976228321, - "grad_norm": 9.528621673583984, - "learning_rate": 3.4675396200229575e-06, - "loss": 0.4648, - "mean_token_accuracy": 0.844129042327404, - "num_tokens": 206604443.0, - "step": 171710 - }, - { - "entropy": 1.930326594412327, - "epoch": 0.5323169753533706, - "grad_norm": 6.972187519073486, - "learning_rate": 3.4674386530331596e-06, - "loss": 0.443, - "mean_token_accuracy": 0.8594956189393997, - "num_tokens": 206615898.0, - "step": 171720 - }, - { - "entropy": 1.8591390900313853, - "epoch": 0.5323479744784203, - "grad_norm": 8.316756248474121, - "learning_rate": 3.4673376948626446e-06, - "loss": 0.392, - "mean_token_accuracy": 0.8666254863142967, - "num_tokens": 206628209.0, - "step": 171730 - }, - { - "entropy": 1.8329302787780761, - "epoch": 0.5323789736034701, - "grad_norm": 8.091080665588379, - "learning_rate": 3.467236745510131e-06, - "loss": 0.4303, - "mean_token_accuracy": 0.8599769771099091, - "num_tokens": 206640092.0, - "step": 171740 - }, - { - "entropy": 1.910070639848709, - "epoch": 0.5324099727285198, - "grad_norm": 7.389466762542725, - "learning_rate": 3.467135804974334e-06, - "loss": 0.4384, - "mean_token_accuracy": 0.8566348433494568, - "num_tokens": 206652537.0, - "step": 171750 - }, - { - "entropy": 1.8269345745444299, - "epoch": 0.5324409718535694, - "grad_norm": 7.980764389038086, - "learning_rate": 3.467034873253971e-06, - "loss": 0.4006, - "mean_token_accuracy": 0.8637316823005676, - "num_tokens": 206666095.0, - "step": 171760 - }, - { - "entropy": 1.8664949864149094, - "epoch": 0.5324719709786191, - "grad_norm": 7.793969631195068, - "learning_rate": 3.466933950347759e-06, - "loss": 0.4464, - "mean_token_accuracy": 0.8574715122580528, - "num_tokens": 206679078.0, - "step": 171770 - }, - { - "entropy": 1.980468001961708, - "epoch": 0.5325029701036689, - "grad_norm": 11.13428783416748, - "learning_rate": 3.466833036254414e-06, - "loss": 0.4664, - "mean_token_accuracy": 0.8566422179341316, - "num_tokens": 206689658.0, - "step": 171780 - }, - { - "entropy": 1.9217968866229058, - "epoch": 0.5325339692287185, - "grad_norm": 8.130260467529297, - "learning_rate": 3.4667321309726548e-06, - "loss": 0.4216, - "mean_token_accuracy": 0.8584070265293121, - "num_tokens": 206701510.0, - "step": 171790 - }, - { - "entropy": 1.9040216952562332, - "epoch": 0.5325649683537682, - "grad_norm": 9.502899169921875, - "learning_rate": 3.4666312345011983e-06, - "loss": 0.4225, - "mean_token_accuracy": 0.8540240123867988, - "num_tokens": 206713670.0, - "step": 171800 - }, - { - "entropy": 1.9251901611685753, - "epoch": 0.5325959674788179, - "grad_norm": 4.581042289733887, - "learning_rate": 3.4665303468387633e-06, - "loss": 0.4514, - "mean_token_accuracy": 0.8467323318123817, - "num_tokens": 206726091.0, - "step": 171810 - }, - { - "entropy": 1.9404249116778374, - "epoch": 0.5326269666038677, - "grad_norm": 9.953166007995605, - "learning_rate": 3.4664294679840674e-06, - "loss": 0.4994, - "mean_token_accuracy": 0.8406923890113831, - "num_tokens": 206737652.0, - "step": 171820 - }, - { - "entropy": 1.8462284199893475, - "epoch": 0.5326579657289173, - "grad_norm": 9.736047744750977, - "learning_rate": 3.466328597935829e-06, - "loss": 0.4135, - "mean_token_accuracy": 0.8614635065197944, - "num_tokens": 206751383.0, - "step": 171830 - }, - { - "entropy": 1.8233645886182785, - "epoch": 0.532688964853967, - "grad_norm": 2.792520046234131, - "learning_rate": 3.4662277366927677e-06, - "loss": 0.4048, - "mean_token_accuracy": 0.8697629243135452, - "num_tokens": 206764520.0, - "step": 171840 - }, - { - "entropy": 1.9205037295818328, - "epoch": 0.5327199639790167, - "grad_norm": 7.479151725769043, - "learning_rate": 3.4661268842536015e-06, - "loss": 0.4575, - "mean_token_accuracy": 0.856606675684452, - "num_tokens": 206777062.0, - "step": 171850 - }, - { - "entropy": 1.8721492350101472, - "epoch": 0.5327509631040663, - "grad_norm": 7.801178455352783, - "learning_rate": 3.4660260406170507e-06, - "loss": 0.4388, - "mean_token_accuracy": 0.8510833352804184, - "num_tokens": 206789262.0, - "step": 171860 - }, - { - "entropy": 1.7701141953468322, - "epoch": 0.5327819622291161, - "grad_norm": 2.5856385231018066, - "learning_rate": 3.4659252057818343e-06, - "loss": 0.3925, - "mean_token_accuracy": 0.8556995436549186, - "num_tokens": 206803454.0, - "step": 171870 - }, - { - "entropy": 1.89136783182621, - "epoch": 0.5328129613541658, - "grad_norm": 3.5028934478759766, - "learning_rate": 3.4658243797466718e-06, - "loss": 0.4064, - "mean_token_accuracy": 0.8560910046100616, - "num_tokens": 206815864.0, - "step": 171880 - }, - { - "entropy": 1.8578203037381171, - "epoch": 0.5328439604792155, - "grad_norm": 7.930136203765869, - "learning_rate": 3.465723562510284e-06, - "loss": 0.386, - "mean_token_accuracy": 0.8557386100292206, - "num_tokens": 206828998.0, - "step": 171890 - }, - { - "entropy": 1.883947178721428, - "epoch": 0.5328749596042651, - "grad_norm": 3.5195116996765137, - "learning_rate": 3.4656227540713905e-06, - "loss": 0.4158, - "mean_token_accuracy": 0.8585299208760262, - "num_tokens": 206841080.0, - "step": 171900 - }, - { - "entropy": 1.9098535567522048, - "epoch": 0.5329059587293149, - "grad_norm": 3.2760863304138184, - "learning_rate": 3.465521954428713e-06, - "loss": 0.4457, - "mean_token_accuracy": 0.8506251364946366, - "num_tokens": 206853482.0, - "step": 171910 - }, - { - "entropy": 1.9376119673252106, - "epoch": 0.5329369578543646, - "grad_norm": 3.709351062774658, - "learning_rate": 3.465421163580971e-06, - "loss": 0.4254, - "mean_token_accuracy": 0.8610507115721703, - "num_tokens": 206865078.0, - "step": 171920 - }, - { - "entropy": 1.9255750745534896, - "epoch": 0.5329679569794142, - "grad_norm": 8.195679664611816, - "learning_rate": 3.4653203815268865e-06, - "loss": 0.4434, - "mean_token_accuracy": 0.8562889978289604, - "num_tokens": 206877650.0, - "step": 171930 - }, - { - "entropy": 1.9404271885752677, - "epoch": 0.5329989561044639, - "grad_norm": 6.643172740936279, - "learning_rate": 3.46521960826518e-06, - "loss": 0.4128, - "mean_token_accuracy": 0.8628787726163865, - "num_tokens": 206888757.0, - "step": 171940 - }, - { - "entropy": 1.8620061218738555, - "epoch": 0.5330299552295137, - "grad_norm": 7.816823482513428, - "learning_rate": 3.465118843794575e-06, - "loss": 0.4129, - "mean_token_accuracy": 0.8594163581728935, - "num_tokens": 206901369.0, - "step": 171950 - }, - { - "entropy": 1.8807374939322472, - "epoch": 0.5330609543545634, - "grad_norm": 8.188911437988281, - "learning_rate": 3.465018088113791e-06, - "loss": 0.4932, - "mean_token_accuracy": 0.8510372474789619, - "num_tokens": 206913816.0, - "step": 171960 - }, - { - "entropy": 1.819548812508583, - "epoch": 0.533091953479613, - "grad_norm": 4.415698051452637, - "learning_rate": 3.464917341221552e-06, - "loss": 0.3946, - "mean_token_accuracy": 0.8620653316378594, - "num_tokens": 206927026.0, - "step": 171970 - }, - { - "entropy": 1.8535535261034966, - "epoch": 0.5331229526046627, - "grad_norm": 8.980545043945312, - "learning_rate": 3.4648166031165797e-06, - "loss": 0.4003, - "mean_token_accuracy": 0.8751401022076607, - "num_tokens": 206939349.0, - "step": 171980 - }, - { - "entropy": 1.9402846857905387, - "epoch": 0.5331539517297125, - "grad_norm": 8.309432983398438, - "learning_rate": 3.4647158737975966e-06, - "loss": 0.4902, - "mean_token_accuracy": 0.8573802545666694, - "num_tokens": 206951605.0, - "step": 171990 - }, - { - "entropy": 1.8485840871930121, - "epoch": 0.5331849508547621, - "grad_norm": 9.220359802246094, - "learning_rate": 3.4646151532633265e-06, - "loss": 0.4131, - "mean_token_accuracy": 0.8622194632887841, - "num_tokens": 206964468.0, - "step": 172000 - }, - { - "entropy": 1.9720681741833688, - "epoch": 0.5332159499798118, - "grad_norm": 8.475946426391602, - "learning_rate": 3.4645144415124916e-06, - "loss": 0.4566, - "mean_token_accuracy": 0.8575832083821296, - "num_tokens": 206975496.0, - "step": 172010 - }, - { - "entropy": 1.9544428601861, - "epoch": 0.5332469491048615, - "grad_norm": 8.449519157409668, - "learning_rate": 3.4644137385438166e-06, - "loss": 0.4649, - "mean_token_accuracy": 0.8478255704045295, - "num_tokens": 206987103.0, - "step": 172020 - }, - { - "entropy": 1.9222390592098235, - "epoch": 0.5332779482299113, - "grad_norm": 7.751585960388184, - "learning_rate": 3.4643130443560222e-06, - "loss": 0.4392, - "mean_token_accuracy": 0.8567695304751396, - "num_tokens": 206998633.0, - "step": 172030 - }, - { - "entropy": 1.8945856779813766, - "epoch": 0.5333089473549609, - "grad_norm": 8.54149055480957, - "learning_rate": 3.4642123589478366e-06, - "loss": 0.4262, - "mean_token_accuracy": 0.85653665214777, - "num_tokens": 207011016.0, - "step": 172040 - }, - { - "entropy": 1.9423924744129182, - "epoch": 0.5333399464800106, - "grad_norm": 9.315898895263672, - "learning_rate": 3.464111682317981e-06, - "loss": 0.4784, - "mean_token_accuracy": 0.8562673062086106, - "num_tokens": 207022489.0, - "step": 172050 - }, - { - "entropy": 1.8690737947821616, - "epoch": 0.5333709456050603, - "grad_norm": 3.2628121376037598, - "learning_rate": 3.4640110144651813e-06, - "loss": 0.4014, - "mean_token_accuracy": 0.8605872854590416, - "num_tokens": 207035182.0, - "step": 172060 - }, - { - "entropy": 1.954857885837555, - "epoch": 0.53340194473011, - "grad_norm": 7.77451229095459, - "learning_rate": 3.463910355388162e-06, - "loss": 0.4756, - "mean_token_accuracy": 0.8507927462458611, - "num_tokens": 207046525.0, - "step": 172070 - }, - { - "entropy": 1.878305557370186, - "epoch": 0.5334329438551597, - "grad_norm": 4.307947158813477, - "learning_rate": 3.463809705085648e-06, - "loss": 0.4057, - "mean_token_accuracy": 0.8610593438148498, - "num_tokens": 207059144.0, - "step": 172080 - }, - { - "entropy": 1.893868698179722, - "epoch": 0.5334639429802094, - "grad_norm": 8.05561351776123, - "learning_rate": 3.4637090635563638e-06, - "loss": 0.4299, - "mean_token_accuracy": 0.8604981452226639, - "num_tokens": 207071120.0, - "step": 172090 - }, - { - "entropy": 1.87116037607193, - "epoch": 0.533494942105259, - "grad_norm": 2.938767194747925, - "learning_rate": 3.4636084307990363e-06, - "loss": 0.4198, - "mean_token_accuracy": 0.8630983516573906, - "num_tokens": 207084331.0, - "step": 172100 - }, - { - "entropy": 1.9578429192304612, - "epoch": 0.5335259412303087, - "grad_norm": 8.522046089172363, - "learning_rate": 3.4635078068123907e-06, - "loss": 0.4783, - "mean_token_accuracy": 0.8521770656108856, - "num_tokens": 207094960.0, - "step": 172110 - }, - { - "entropy": 1.8961300045251845, - "epoch": 0.5335569403553585, - "grad_norm": 9.013341903686523, - "learning_rate": 3.463407191595153e-06, - "loss": 0.3939, - "mean_token_accuracy": 0.8604053586721421, - "num_tokens": 207107582.0, - "step": 172120 - }, - { - "entropy": 1.9070754051208496, - "epoch": 0.5335879394804082, - "grad_norm": 6.7184929847717285, - "learning_rate": 3.463306585146049e-06, - "loss": 0.432, - "mean_token_accuracy": 0.8622305661439895, - "num_tokens": 207119530.0, - "step": 172130 - }, - { - "entropy": 1.8905790030956269, - "epoch": 0.5336189386054578, - "grad_norm": 7.1780619621276855, - "learning_rate": 3.4632059874638064e-06, - "loss": 0.43, - "mean_token_accuracy": 0.8592176213860512, - "num_tokens": 207131721.0, - "step": 172140 - }, - { - "entropy": 1.899731382727623, - "epoch": 0.5336499377305075, - "grad_norm": 7.558929443359375, - "learning_rate": 3.4631053985471513e-06, - "loss": 0.419, - "mean_token_accuracy": 0.8633270218968392, - "num_tokens": 207143338.0, - "step": 172150 - }, - { - "entropy": 1.8563763827085495, - "epoch": 0.5336809368555573, - "grad_norm": 4.571922302246094, - "learning_rate": 3.4630048183948106e-06, - "loss": 0.4133, - "mean_token_accuracy": 0.8659122556447982, - "num_tokens": 207155624.0, - "step": 172160 - }, - { - "entropy": 1.8730071052908897, - "epoch": 0.533711935980607, - "grad_norm": 8.322432518005371, - "learning_rate": 3.462904247005513e-06, - "loss": 0.4306, - "mean_token_accuracy": 0.8556889921426774, - "num_tokens": 207167987.0, - "step": 172170 - }, - { - "entropy": 1.9720443457365036, - "epoch": 0.5337429351056566, - "grad_norm": 9.21069622039795, - "learning_rate": 3.4628036843779846e-06, - "loss": 0.5156, - "mean_token_accuracy": 0.8485893458127975, - "num_tokens": 207179298.0, - "step": 172180 - }, - { - "entropy": 1.8745362177491187, - "epoch": 0.5337739342307063, - "grad_norm": 7.4704461097717285, - "learning_rate": 3.462703130510953e-06, - "loss": 0.3796, - "mean_token_accuracy": 0.8695771753787994, - "num_tokens": 207191249.0, - "step": 172190 - }, - { - "entropy": 1.9133777856826781, - "epoch": 0.5338049333557561, - "grad_norm": 9.01170539855957, - "learning_rate": 3.4626025854031475e-06, - "loss": 0.4376, - "mean_token_accuracy": 0.8624561950564384, - "num_tokens": 207202988.0, - "step": 172200 - }, - { - "entropy": 1.9582080647349358, - "epoch": 0.5338359324808057, - "grad_norm": 8.62938404083252, - "learning_rate": 3.462502049053296e-06, - "loss": 0.4948, - "mean_token_accuracy": 0.8430709898471832, - "num_tokens": 207214250.0, - "step": 172210 - }, - { - "entropy": 1.8965184196829796, - "epoch": 0.5338669316058554, - "grad_norm": 4.407395362854004, - "learning_rate": 3.462401521460128e-06, - "loss": 0.4486, - "mean_token_accuracy": 0.8566911399364472, - "num_tokens": 207227351.0, - "step": 172220 - }, - { - "entropy": 1.9487920999526978, - "epoch": 0.5338979307309051, - "grad_norm": 8.692533493041992, - "learning_rate": 3.462301002622371e-06, - "loss": 0.465, - "mean_token_accuracy": 0.8509125307202339, - "num_tokens": 207238828.0, - "step": 172230 - }, - { - "entropy": 1.923832182586193, - "epoch": 0.5339289298559549, - "grad_norm": 8.191177368164062, - "learning_rate": 3.4622004925387546e-06, - "loss": 0.442, - "mean_token_accuracy": 0.8531102791428566, - "num_tokens": 207251198.0, - "step": 172240 - }, - { - "entropy": 1.8550199180841447, - "epoch": 0.5339599289810045, - "grad_norm": 4.443090438842773, - "learning_rate": 3.4620999912080088e-06, - "loss": 0.4148, - "mean_token_accuracy": 0.8594103202223777, - "num_tokens": 207264801.0, - "step": 172250 - }, - { - "entropy": 1.9658312529325486, - "epoch": 0.5339909281060542, - "grad_norm": 7.00798225402832, - "learning_rate": 3.461999498628862e-06, - "loss": 0.4756, - "mean_token_accuracy": 0.855456392467022, - "num_tokens": 207276463.0, - "step": 172260 - }, - { - "entropy": 1.9467895850539207, - "epoch": 0.5340219272311039, - "grad_norm": 10.077486038208008, - "learning_rate": 3.4618990148000458e-06, - "loss": 0.4644, - "mean_token_accuracy": 0.844844251871109, - "num_tokens": 207288481.0, - "step": 172270 - }, - { - "entropy": 1.8941056042909623, - "epoch": 0.5340529263561536, - "grad_norm": 7.274338245391846, - "learning_rate": 3.4617985397202896e-06, - "loss": 0.4039, - "mean_token_accuracy": 0.8619851514697074, - "num_tokens": 207301250.0, - "step": 172280 - }, - { - "entropy": 1.9818316102027893, - "epoch": 0.5340839254812033, - "grad_norm": 4.565423965454102, - "learning_rate": 3.4616980733883243e-06, - "loss": 0.4777, - "mean_token_accuracy": 0.847413894534111, - "num_tokens": 207312358.0, - "step": 172290 - }, - { - "entropy": 1.9006615951657295, - "epoch": 0.534114924606253, - "grad_norm": 4.181703090667725, - "learning_rate": 3.461597615802879e-06, - "loss": 0.435, - "mean_token_accuracy": 0.8480461463332176, - "num_tokens": 207324643.0, - "step": 172300 - }, - { - "entropy": 1.949682556092739, - "epoch": 0.5341459237313027, - "grad_norm": 3.784156322479248, - "learning_rate": 3.4614971669626863e-06, - "loss": 0.4195, - "mean_token_accuracy": 0.8603693306446075, - "num_tokens": 207336517.0, - "step": 172310 - }, - { - "entropy": 1.9486548736691476, - "epoch": 0.5341769228563524, - "grad_norm": 6.828722953796387, - "learning_rate": 3.461396726866477e-06, - "loss": 0.4737, - "mean_token_accuracy": 0.8525962918996811, - "num_tokens": 207347650.0, - "step": 172320 - }, - { - "entropy": 1.9545435726642608, - "epoch": 0.5342079219814021, - "grad_norm": 7.952531337738037, - "learning_rate": 3.4612962955129826e-06, - "loss": 0.4338, - "mean_token_accuracy": 0.8619145333766938, - "num_tokens": 207359091.0, - "step": 172330 - }, - { - "entropy": 1.889906283468008, - "epoch": 0.5342389211064518, - "grad_norm": 7.539053440093994, - "learning_rate": 3.461195872900934e-06, - "loss": 0.4255, - "mean_token_accuracy": 0.8641923710703849, - "num_tokens": 207371795.0, - "step": 172340 - }, - { - "entropy": 1.9179092735052108, - "epoch": 0.5342699202315014, - "grad_norm": 3.5443851947784424, - "learning_rate": 3.461095459029065e-06, - "loss": 0.4344, - "mean_token_accuracy": 0.8606422588229179, - "num_tokens": 207383403.0, - "step": 172350 - }, - { - "entropy": 2.0034004122018816, - "epoch": 0.5343009193565511, - "grad_norm": 8.226675033569336, - "learning_rate": 3.460995053896107e-06, - "loss": 0.4975, - "mean_token_accuracy": 0.8465193957090378, - "num_tokens": 207394658.0, - "step": 172360 - }, - { - "entropy": 1.9629688054323196, - "epoch": 0.5343319184816009, - "grad_norm": 7.675604820251465, - "learning_rate": 3.4608946575007906e-06, - "loss": 0.439, - "mean_token_accuracy": 0.8677529841661453, - "num_tokens": 207405234.0, - "step": 172370 - }, - { - "entropy": 1.9701706409454345, - "epoch": 0.5343629176066506, - "grad_norm": 7.960362434387207, - "learning_rate": 3.4607942698418523e-06, - "loss": 0.454, - "mean_token_accuracy": 0.8556162640452385, - "num_tokens": 207416438.0, - "step": 172380 - }, - { - "entropy": 1.8850088074803353, - "epoch": 0.5343939167317002, - "grad_norm": 7.914768218994141, - "learning_rate": 3.4606938909180217e-06, - "loss": 0.3836, - "mean_token_accuracy": 0.8754544615745544, - "num_tokens": 207428510.0, - "step": 172390 - }, - { - "entropy": 1.8988526239991188, - "epoch": 0.5344249158567499, - "grad_norm": 9.443257331848145, - "learning_rate": 3.460593520728034e-06, - "loss": 0.4398, - "mean_token_accuracy": 0.8607255682349205, - "num_tokens": 207439902.0, - "step": 172400 - }, - { - "entropy": 1.8619684666395186, - "epoch": 0.5344559149817997, - "grad_norm": 8.755477905273438, - "learning_rate": 3.460493159270622e-06, - "loss": 0.3887, - "mean_token_accuracy": 0.8674828916788101, - "num_tokens": 207452303.0, - "step": 172410 - }, - { - "entropy": 1.9530357331037522, - "epoch": 0.5344869141068493, - "grad_norm": 7.649057388305664, - "learning_rate": 3.4603928065445197e-06, - "loss": 0.4671, - "mean_token_accuracy": 0.8552047446370125, - "num_tokens": 207463895.0, - "step": 172420 - }, - { - "entropy": 1.9203311637043954, - "epoch": 0.534517913231899, - "grad_norm": 9.419485092163086, - "learning_rate": 3.460292462548462e-06, - "loss": 0.4482, - "mean_token_accuracy": 0.8501846611499786, - "num_tokens": 207475641.0, - "step": 172430 - }, - { - "entropy": 1.965132975578308, - "epoch": 0.5345489123569487, - "grad_norm": 7.676466464996338, - "learning_rate": 3.4601921272811813e-06, - "loss": 0.491, - "mean_token_accuracy": 0.8473555102944375, - "num_tokens": 207487141.0, - "step": 172440 - }, - { - "entropy": 1.9162400186061859, - "epoch": 0.5345799114819985, - "grad_norm": 8.840492248535156, - "learning_rate": 3.4600918007414135e-06, - "loss": 0.4943, - "mean_token_accuracy": 0.8445732891559601, - "num_tokens": 207500343.0, - "step": 172450 - }, - { - "entropy": 1.9381729751825332, - "epoch": 0.5346109106070481, - "grad_norm": 7.45047664642334, - "learning_rate": 3.4599914829278934e-06, - "loss": 0.462, - "mean_token_accuracy": 0.8518120244145393, - "num_tokens": 207511612.0, - "step": 172460 - }, - { - "entropy": 1.967686542868614, - "epoch": 0.5346419097320978, - "grad_norm": 8.611394882202148, - "learning_rate": 3.459891173839356e-06, - "loss": 0.4766, - "mean_token_accuracy": 0.8540784746408463, - "num_tokens": 207523508.0, - "step": 172470 - }, - { - "entropy": 1.9352397471666336, - "epoch": 0.5346729088571475, - "grad_norm": 3.897338390350342, - "learning_rate": 3.459790873474536e-06, - "loss": 0.4391, - "mean_token_accuracy": 0.8492213904857635, - "num_tokens": 207535586.0, - "step": 172480 - }, - { - "entropy": 1.9683047980070114, - "epoch": 0.5347039079821972, - "grad_norm": 8.10174560546875, - "learning_rate": 3.45969058183217e-06, - "loss": 0.4578, - "mean_token_accuracy": 0.8603872656822205, - "num_tokens": 207546228.0, - "step": 172490 - }, - { - "entropy": 1.823877865076065, - "epoch": 0.5347349071072469, - "grad_norm": 7.564563751220703, - "learning_rate": 3.459590298910993e-06, - "loss": 0.4185, - "mean_token_accuracy": 0.8649493157863617, - "num_tokens": 207559795.0, - "step": 172500 - }, - { - "entropy": 1.8830648839473725, - "epoch": 0.5347659062322966, - "grad_norm": 7.6251301765441895, - "learning_rate": 3.459490024709742e-06, - "loss": 0.3804, - "mean_token_accuracy": 0.8656977102160454, - "num_tokens": 207572213.0, - "step": 172510 - }, - { - "entropy": 1.8875678315758706, - "epoch": 0.5347969053573463, - "grad_norm": 3.419621229171753, - "learning_rate": 3.4593897592271515e-06, - "loss": 0.4305, - "mean_token_accuracy": 0.8647677347064018, - "num_tokens": 207584322.0, - "step": 172520 - }, - { - "entropy": 1.9262285679578781, - "epoch": 0.534827904482396, - "grad_norm": 8.029248237609863, - "learning_rate": 3.4592895024619606e-06, - "loss": 0.4394, - "mean_token_accuracy": 0.8527608752250672, - "num_tokens": 207595700.0, - "step": 172530 - }, - { - "entropy": 1.9239438846707344, - "epoch": 0.5348589036074457, - "grad_norm": 9.767742156982422, - "learning_rate": 3.4591892544129045e-06, - "loss": 0.4567, - "mean_token_accuracy": 0.8469107151031494, - "num_tokens": 207607393.0, - "step": 172540 - }, - { - "entropy": 1.9097449451684951, - "epoch": 0.5348899027324954, - "grad_norm": 7.202256679534912, - "learning_rate": 3.459089015078721e-06, - "loss": 0.3834, - "mean_token_accuracy": 0.8678121000528336, - "num_tokens": 207619018.0, - "step": 172550 - }, - { - "entropy": 1.87255839407444, - "epoch": 0.534920901857545, - "grad_norm": 8.831064224243164, - "learning_rate": 3.4589887844581472e-06, - "loss": 0.4533, - "mean_token_accuracy": 0.8524145618081093, - "num_tokens": 207631371.0, - "step": 172560 - }, - { - "entropy": 1.8829514354467392, - "epoch": 0.5349519009825948, - "grad_norm": 9.208990097045898, - "learning_rate": 3.4588885625499207e-06, - "loss": 0.4128, - "mean_token_accuracy": 0.8572834596037865, - "num_tokens": 207643443.0, - "step": 172570 - }, - { - "entropy": 1.7449659064412117, - "epoch": 0.5349829001076445, - "grad_norm": 8.625064849853516, - "learning_rate": 3.45878834935278e-06, - "loss": 0.383, - "mean_token_accuracy": 0.8635283961892128, - "num_tokens": 207657645.0, - "step": 172580 - }, - { - "entropy": 1.9483749613165855, - "epoch": 0.5350138992326942, - "grad_norm": 9.903924942016602, - "learning_rate": 3.458688144865463e-06, - "loss": 0.4544, - "mean_token_accuracy": 0.8563203975558281, - "num_tokens": 207669310.0, - "step": 172590 - }, - { - "entropy": 1.8220311045646667, - "epoch": 0.5350448983577438, - "grad_norm": 3.884104013442993, - "learning_rate": 3.4585879490867074e-06, - "loss": 0.4267, - "mean_token_accuracy": 0.8625266656279564, - "num_tokens": 207682331.0, - "step": 172600 - }, - { - "entropy": 1.7253479383885861, - "epoch": 0.5350758974827935, - "grad_norm": 8.76376724243164, - "learning_rate": 3.458487762015253e-06, - "loss": 0.3447, - "mean_token_accuracy": 0.8723403230309487, - "num_tokens": 207697659.0, - "step": 172610 - }, - { - "entropy": 1.9204844385385513, - "epoch": 0.5351068966078433, - "grad_norm": 8.889963150024414, - "learning_rate": 3.4583875836498375e-06, - "loss": 0.4523, - "mean_token_accuracy": 0.8568612888455391, - "num_tokens": 207709348.0, - "step": 172620 - }, - { - "entropy": 1.8080897092819215, - "epoch": 0.5351378957328929, - "grad_norm": 7.010310173034668, - "learning_rate": 3.458287413989201e-06, - "loss": 0.4343, - "mean_token_accuracy": 0.8615889981389045, - "num_tokens": 207722545.0, - "step": 172630 - }, - { - "entropy": 1.8895937889814376, - "epoch": 0.5351688948579426, - "grad_norm": 10.335844993591309, - "learning_rate": 3.4581872530320827e-06, - "loss": 0.461, - "mean_token_accuracy": 0.8531122609972954, - "num_tokens": 207734311.0, - "step": 172640 - }, - { - "entropy": 1.979372352361679, - "epoch": 0.5351998939829923, - "grad_norm": 8.217429161071777, - "learning_rate": 3.4580871007772215e-06, - "loss": 0.4852, - "mean_token_accuracy": 0.8483906969428062, - "num_tokens": 207744931.0, - "step": 172650 - }, - { - "entropy": 1.8208840578794478, - "epoch": 0.5352308931080421, - "grad_norm": 4.37178897857666, - "learning_rate": 3.4579869572233585e-06, - "loss": 0.4125, - "mean_token_accuracy": 0.8537062719464302, - "num_tokens": 207757648.0, - "step": 172660 - }, - { - "entropy": 1.9177382573485375, - "epoch": 0.5352618922330917, - "grad_norm": 8.044947624206543, - "learning_rate": 3.457886822369234e-06, - "loss": 0.4674, - "mean_token_accuracy": 0.8472369626164437, - "num_tokens": 207769899.0, - "step": 172670 - }, - { - "entropy": 1.8201391249895096, - "epoch": 0.5352928913581414, - "grad_norm": 3.541053533554077, - "learning_rate": 3.4577866962135876e-06, - "loss": 0.3582, - "mean_token_accuracy": 0.8720339864492417, - "num_tokens": 207782654.0, - "step": 172680 - }, - { - "entropy": 1.8746508598327636, - "epoch": 0.5353238904831911, - "grad_norm": 4.510343551635742, - "learning_rate": 3.4576865787551605e-06, - "loss": 0.4442, - "mean_token_accuracy": 0.8468729451298713, - "num_tokens": 207794735.0, - "step": 172690 - }, - { - "entropy": 1.9637598276138306, - "epoch": 0.5353548896082408, - "grad_norm": 9.965831756591797, - "learning_rate": 3.4575864699926935e-06, - "loss": 0.4919, - "mean_token_accuracy": 0.8475473329424859, - "num_tokens": 207805438.0, - "step": 172700 - }, - { - "entropy": 1.8837266564369202, - "epoch": 0.5353858887332905, - "grad_norm": 4.100901126861572, - "learning_rate": 3.4574863699249277e-06, - "loss": 0.4945, - "mean_token_accuracy": 0.8563615873456001, - "num_tokens": 207817902.0, - "step": 172710 - }, - { - "entropy": 1.8690167635679245, - "epoch": 0.5354168878583402, - "grad_norm": 8.928622245788574, - "learning_rate": 3.457386278550605e-06, - "loss": 0.415, - "mean_token_accuracy": 0.8597008213400841, - "num_tokens": 207829494.0, - "step": 172720 - }, - { - "entropy": 1.8866547778248788, - "epoch": 0.5354478869833899, - "grad_norm": 8.530135154724121, - "learning_rate": 3.4572861958684666e-06, - "loss": 0.4395, - "mean_token_accuracy": 0.8589810565114021, - "num_tokens": 207841741.0, - "step": 172730 - }, - { - "entropy": 1.874574901163578, - "epoch": 0.5354788861084396, - "grad_norm": 8.313958168029785, - "learning_rate": 3.457186121877255e-06, - "loss": 0.4252, - "mean_token_accuracy": 0.852715365588665, - "num_tokens": 207853896.0, - "step": 172740 - }, - { - "entropy": 1.8388253033161164, - "epoch": 0.5355098852334893, - "grad_norm": 3.2290029525756836, - "learning_rate": 3.4570860565757115e-06, - "loss": 0.3969, - "mean_token_accuracy": 0.8671393066644668, - "num_tokens": 207867006.0, - "step": 172750 - }, - { - "entropy": 1.9595274776220322, - "epoch": 0.535540884358539, - "grad_norm": 8.689278602600098, - "learning_rate": 3.45698599996258e-06, - "loss": 0.5341, - "mean_token_accuracy": 0.8384348094463349, - "num_tokens": 207878408.0, - "step": 172760 - }, - { - "entropy": 1.8406404912471772, - "epoch": 0.5355718834835886, - "grad_norm": 8.577286720275879, - "learning_rate": 3.456885952036602e-06, - "loss": 0.41, - "mean_token_accuracy": 0.8525245815515519, - "num_tokens": 207891429.0, - "step": 172770 - }, - { - "entropy": 1.8390149533748628, - "epoch": 0.5356028826086384, - "grad_norm": 2.493551731109619, - "learning_rate": 3.4567859127965212e-06, - "loss": 0.3574, - "mean_token_accuracy": 0.8665968149900436, - "num_tokens": 207904908.0, - "step": 172780 - }, - { - "entropy": 1.9711681693792342, - "epoch": 0.5356338817336881, - "grad_norm": 8.279610633850098, - "learning_rate": 3.4566858822410814e-06, - "loss": 0.4773, - "mean_token_accuracy": 0.8502449646592141, - "num_tokens": 207916097.0, - "step": 172790 - }, - { - "entropy": 1.8366296276450158, - "epoch": 0.5356648808587378, - "grad_norm": 3.318671226501465, - "learning_rate": 3.4565858603690243e-06, - "loss": 0.381, - "mean_token_accuracy": 0.8724953427910804, - "num_tokens": 207928142.0, - "step": 172800 - }, - { - "entropy": 1.866967648267746, - "epoch": 0.5356958799837874, - "grad_norm": 8.332340240478516, - "learning_rate": 3.4564858471790957e-06, - "loss": 0.4108, - "mean_token_accuracy": 0.8624767050147056, - "num_tokens": 207940178.0, - "step": 172810 - }, - { - "entropy": 1.911839447915554, - "epoch": 0.5357268791088372, - "grad_norm": 8.774737358093262, - "learning_rate": 3.456385842670038e-06, - "loss": 0.4684, - "mean_token_accuracy": 0.8479789897799492, - "num_tokens": 207952013.0, - "step": 172820 - }, - { - "entropy": 1.8767947524785995, - "epoch": 0.5357578782338869, - "grad_norm": 7.686695098876953, - "learning_rate": 3.4562858468405963e-06, - "loss": 0.4132, - "mean_token_accuracy": 0.8593664765357971, - "num_tokens": 207964510.0, - "step": 172830 - }, - { - "entropy": 1.8312649622559547, - "epoch": 0.5357888773589365, - "grad_norm": 3.4589436054229736, - "learning_rate": 3.456185859689516e-06, - "loss": 0.4022, - "mean_token_accuracy": 0.8600029453635216, - "num_tokens": 207977857.0, - "step": 172840 - }, - { - "entropy": 1.7913474388420583, - "epoch": 0.5358198764839862, - "grad_norm": 6.77054500579834, - "learning_rate": 3.45608588121554e-06, - "loss": 0.4069, - "mean_token_accuracy": 0.8663949474692345, - "num_tokens": 207991073.0, - "step": 172850 - }, - { - "entropy": 1.8621043905615806, - "epoch": 0.5358508756090359, - "grad_norm": 8.442105293273926, - "learning_rate": 3.455985911417414e-06, - "loss": 0.4384, - "mean_token_accuracy": 0.8533388510346412, - "num_tokens": 208003843.0, - "step": 172860 - }, - { - "entropy": 1.876605623960495, - "epoch": 0.5358818747340857, - "grad_norm": 9.525609016418457, - "learning_rate": 3.455885950293884e-06, - "loss": 0.5398, - "mean_token_accuracy": 0.8462421506643295, - "num_tokens": 208015803.0, - "step": 172870 - }, - { - "entropy": 1.8648066401481629, - "epoch": 0.5359128738591353, - "grad_norm": 8.97998046875, - "learning_rate": 3.455785997843695e-06, - "loss": 0.4359, - "mean_token_accuracy": 0.8536581054329873, - "num_tokens": 208028644.0, - "step": 172880 - }, - { - "entropy": 1.9021985232830048, - "epoch": 0.535943872984185, - "grad_norm": 9.232839584350586, - "learning_rate": 3.4556860540655936e-06, - "loss": 0.4411, - "mean_token_accuracy": 0.8553122028708458, - "num_tokens": 208039829.0, - "step": 172890 - }, - { - "entropy": 1.8113704428076745, - "epoch": 0.5359748721092347, - "grad_norm": 6.826096534729004, - "learning_rate": 3.4555861189583244e-06, - "loss": 0.3362, - "mean_token_accuracy": 0.8639003306627273, - "num_tokens": 208052850.0, - "step": 172900 - }, - { - "entropy": 1.8847822308540345, - "epoch": 0.5360058712342844, - "grad_norm": 8.075764656066895, - "learning_rate": 3.4554861925206344e-06, - "loss": 0.4416, - "mean_token_accuracy": 0.8611627250909806, - "num_tokens": 208065092.0, - "step": 172910 - }, - { - "entropy": 1.9323305204510688, - "epoch": 0.5360368703593341, - "grad_norm": 7.556993007659912, - "learning_rate": 3.4553862747512707e-06, - "loss": 0.4678, - "mean_token_accuracy": 0.8483913406729698, - "num_tokens": 208076555.0, - "step": 172920 - }, - { - "entropy": 1.8728545591235162, - "epoch": 0.5360678694843838, - "grad_norm": 7.655262470245361, - "learning_rate": 3.4552863656489795e-06, - "loss": 0.3961, - "mean_token_accuracy": 0.8667805373668671, - "num_tokens": 208087801.0, - "step": 172930 - }, - { - "entropy": 1.9701668232679368, - "epoch": 0.5360988686094335, - "grad_norm": 7.848801136016846, - "learning_rate": 3.4551864652125078e-06, - "loss": 0.5238, - "mean_token_accuracy": 0.8293770626187325, - "num_tokens": 208098926.0, - "step": 172940 - }, - { - "entropy": 1.909597432613373, - "epoch": 0.5361298677344832, - "grad_norm": 9.201456069946289, - "learning_rate": 3.4550865734406037e-06, - "loss": 0.4504, - "mean_token_accuracy": 0.8472042381763458, - "num_tokens": 208110534.0, - "step": 172950 - }, - { - "entropy": 1.8972481831908226, - "epoch": 0.5361608668595329, - "grad_norm": 8.038980484008789, - "learning_rate": 3.4549866903320134e-06, - "loss": 0.4745, - "mean_token_accuracy": 0.8507405325770379, - "num_tokens": 208122410.0, - "step": 172960 - }, - { - "entropy": 1.9153016999363899, - "epoch": 0.5361918659845826, - "grad_norm": 9.517046928405762, - "learning_rate": 3.4548868158854864e-06, - "loss": 0.4585, - "mean_token_accuracy": 0.8530256912112236, - "num_tokens": 208133695.0, - "step": 172970 - }, - { - "entropy": 1.883463017642498, - "epoch": 0.5362228651096322, - "grad_norm": 8.928050994873047, - "learning_rate": 3.4547869500997693e-06, - "loss": 0.4342, - "mean_token_accuracy": 0.8534481927752495, - "num_tokens": 208146386.0, - "step": 172980 - }, - { - "entropy": 1.8348631024360658, - "epoch": 0.536253864234682, - "grad_norm": 6.866858959197998, - "learning_rate": 3.4546870929736113e-06, - "loss": 0.4131, - "mean_token_accuracy": 0.8672243788838386, - "num_tokens": 208159187.0, - "step": 172990 - }, - { - "entropy": 1.9339495077729225, - "epoch": 0.5362848633597317, - "grad_norm": 8.929131507873535, - "learning_rate": 3.4545872445057615e-06, - "loss": 0.4579, - "mean_token_accuracy": 0.8579076424241066, - "num_tokens": 208170970.0, - "step": 173000 - }, - { - "entropy": 1.9648241117596625, - "epoch": 0.5363158624847814, - "grad_norm": 9.15043830871582, - "learning_rate": 3.4544874046949674e-06, - "loss": 0.5074, - "mean_token_accuracy": 0.8426274269819259, - "num_tokens": 208182432.0, - "step": 173010 - }, - { - "entropy": 1.8830683097243308, - "epoch": 0.536346861609831, - "grad_norm": 8.971197128295898, - "learning_rate": 3.454387573539979e-06, - "loss": 0.4268, - "mean_token_accuracy": 0.8599732339382171, - "num_tokens": 208194473.0, - "step": 173020 - }, - { - "entropy": 1.9190237015485763, - "epoch": 0.5363778607348808, - "grad_norm": 8.68978500366211, - "learning_rate": 3.4542877510395453e-06, - "loss": 0.4314, - "mean_token_accuracy": 0.8580868989229202, - "num_tokens": 208206024.0, - "step": 173030 - }, - { - "entropy": 1.8665350019931792, - "epoch": 0.5364088598599305, - "grad_norm": 8.050957679748535, - "learning_rate": 3.4541879371924155e-06, - "loss": 0.427, - "mean_token_accuracy": 0.8516592651605606, - "num_tokens": 208218803.0, - "step": 173040 - }, - { - "entropy": 1.9037343636155128, - "epoch": 0.5364398589849801, - "grad_norm": 8.649856567382812, - "learning_rate": 3.4540881319973406e-06, - "loss": 0.4471, - "mean_token_accuracy": 0.8579539388418198, - "num_tokens": 208230525.0, - "step": 173050 - }, - { - "entropy": 1.9306460633873939, - "epoch": 0.5364708581100298, - "grad_norm": 8.644009590148926, - "learning_rate": 3.4539883354530695e-06, - "loss": 0.4496, - "mean_token_accuracy": 0.8506815403699874, - "num_tokens": 208242480.0, - "step": 173060 - }, - { - "entropy": 1.8853941515088082, - "epoch": 0.5365018572350796, - "grad_norm": 9.751359939575195, - "learning_rate": 3.4538885475583533e-06, - "loss": 0.4279, - "mean_token_accuracy": 0.8584751397371292, - "num_tokens": 208253902.0, - "step": 173070 - }, - { - "entropy": 1.9570772036910058, - "epoch": 0.5365328563601293, - "grad_norm": 8.534307479858398, - "learning_rate": 3.453788768311943e-06, - "loss": 0.4494, - "mean_token_accuracy": 0.8538262590765953, - "num_tokens": 208265528.0, - "step": 173080 - }, - { - "entropy": 1.9516688704490661, - "epoch": 0.5365638554851789, - "grad_norm": 10.632366180419922, - "learning_rate": 3.4536889977125888e-06, - "loss": 0.4485, - "mean_token_accuracy": 0.8635988682508469, - "num_tokens": 208277567.0, - "step": 173090 - }, - { - "entropy": 1.8980008989572525, - "epoch": 0.5365948546102286, - "grad_norm": 9.882356643676758, - "learning_rate": 3.4535892357590418e-06, - "loss": 0.4432, - "mean_token_accuracy": 0.8611980080604553, - "num_tokens": 208289458.0, - "step": 173100 - }, - { - "entropy": 1.8366466403007506, - "epoch": 0.5366258537352783, - "grad_norm": 8.65365219116211, - "learning_rate": 3.453489482450053e-06, - "loss": 0.4154, - "mean_token_accuracy": 0.8533262208104133, - "num_tokens": 208302479.0, - "step": 173110 - }, - { - "entropy": 1.9177147299051285, - "epoch": 0.536656852860328, - "grad_norm": 7.513286590576172, - "learning_rate": 3.453389737784375e-06, - "loss": 0.4168, - "mean_token_accuracy": 0.8542504772543907, - "num_tokens": 208314078.0, - "step": 173120 - }, - { - "entropy": 1.9224371239542961, - "epoch": 0.5366878519853777, - "grad_norm": 6.784999370574951, - "learning_rate": 3.453290001760759e-06, - "loss": 0.4775, - "mean_token_accuracy": 0.8514690682291984, - "num_tokens": 208325860.0, - "step": 173130 - }, - { - "entropy": 1.9379079461097717, - "epoch": 0.5367188511104274, - "grad_norm": 7.635251522064209, - "learning_rate": 3.453190274377958e-06, - "loss": 0.4587, - "mean_token_accuracy": 0.8511418506503106, - "num_tokens": 208337761.0, - "step": 173140 - }, - { - "entropy": 1.9702737927436829, - "epoch": 0.536749850235477, - "grad_norm": 8.940947532653809, - "learning_rate": 3.4530905556347235e-06, - "loss": 0.5274, - "mean_token_accuracy": 0.842144227027893, - "num_tokens": 208348546.0, - "step": 173150 - }, - { - "entropy": 1.8190948873758317, - "epoch": 0.5367808493605268, - "grad_norm": 8.213035583496094, - "learning_rate": 3.4529908455298076e-06, - "loss": 0.4273, - "mean_token_accuracy": 0.8630830571055412, - "num_tokens": 208361404.0, - "step": 173160 - }, - { - "entropy": 1.8803176492452622, - "epoch": 0.5368118484855765, - "grad_norm": 8.948467254638672, - "learning_rate": 3.452891144061965e-06, - "loss": 0.5111, - "mean_token_accuracy": 0.8477946504950523, - "num_tokens": 208373438.0, - "step": 173170 - }, - { - "entropy": 1.8883336156606674, - "epoch": 0.5368428476106262, - "grad_norm": 6.753879547119141, - "learning_rate": 3.4527914512299472e-06, - "loss": 0.4126, - "mean_token_accuracy": 0.8684131637215614, - "num_tokens": 208385340.0, - "step": 173180 - }, - { - "entropy": 1.890251599252224, - "epoch": 0.5368738467356758, - "grad_norm": 3.4004273414611816, - "learning_rate": 3.452691767032508e-06, - "loss": 0.3951, - "mean_token_accuracy": 0.8704697549343109, - "num_tokens": 208397816.0, - "step": 173190 - }, - { - "entropy": 1.8218652233481407, - "epoch": 0.5369048458607256, - "grad_norm": 3.7632715702056885, - "learning_rate": 3.452592091468402e-06, - "loss": 0.3582, - "mean_token_accuracy": 0.8653622597455979, - "num_tokens": 208411104.0, - "step": 173200 - }, - { - "entropy": 1.9061686143279075, - "epoch": 0.5369358449857753, - "grad_norm": 9.365750312805176, - "learning_rate": 3.4524924245363815e-06, - "loss": 0.4465, - "mean_token_accuracy": 0.8505714625120163, - "num_tokens": 208422977.0, - "step": 173210 - }, - { - "entropy": 1.855700920522213, - "epoch": 0.536966844110825, - "grad_norm": 9.142428398132324, - "learning_rate": 3.4523927662352024e-06, - "loss": 0.4, - "mean_token_accuracy": 0.8599683746695519, - "num_tokens": 208435820.0, - "step": 173220 - }, - { - "entropy": 1.9547921270132065, - "epoch": 0.5369978432358746, - "grad_norm": 9.191194534301758, - "learning_rate": 3.4522931165636174e-06, - "loss": 0.4557, - "mean_token_accuracy": 0.8558192774653435, - "num_tokens": 208446473.0, - "step": 173230 - }, - { - "entropy": 1.9289256483316422, - "epoch": 0.5370288423609244, - "grad_norm": 8.622015953063965, - "learning_rate": 3.4521934755203822e-06, - "loss": 0.5088, - "mean_token_accuracy": 0.8463401675224305, - "num_tokens": 208458688.0, - "step": 173240 - }, - { - "entropy": 1.8982299000024796, - "epoch": 0.5370598414859741, - "grad_norm": 3.722247838973999, - "learning_rate": 3.4520938431042513e-06, - "loss": 0.4673, - "mean_token_accuracy": 0.8519804239273071, - "num_tokens": 208470509.0, - "step": 173250 - }, - { - "entropy": 1.9269887924194335, - "epoch": 0.5370908406110237, - "grad_norm": 7.334140300750732, - "learning_rate": 3.4519942193139803e-06, - "loss": 0.4256, - "mean_token_accuracy": 0.8573471799492836, - "num_tokens": 208481720.0, - "step": 173260 - }, - { - "entropy": 1.8478943184018135, - "epoch": 0.5371218397360734, - "grad_norm": 8.863199234008789, - "learning_rate": 3.451894604148324e-06, - "loss": 0.3867, - "mean_token_accuracy": 0.8715490698814392, - "num_tokens": 208493429.0, - "step": 173270 - }, - { - "entropy": 1.916895118355751, - "epoch": 0.5371528388611232, - "grad_norm": 8.353137016296387, - "learning_rate": 3.451794997606039e-06, - "loss": 0.4384, - "mean_token_accuracy": 0.8541607096791267, - "num_tokens": 208504592.0, - "step": 173280 - }, - { - "entropy": 1.906629091501236, - "epoch": 0.5371838379861729, - "grad_norm": 6.4494428634643555, - "learning_rate": 3.4516953996858797e-06, - "loss": 0.4076, - "mean_token_accuracy": 0.8636928036808967, - "num_tokens": 208517286.0, - "step": 173290 - }, - { - "entropy": 1.909351000189781, - "epoch": 0.5372148371112225, - "grad_norm": 8.884832382202148, - "learning_rate": 3.451595810386603e-06, - "loss": 0.4188, - "mean_token_accuracy": 0.8635331153869629, - "num_tokens": 208528781.0, - "step": 173300 - }, - { - "entropy": 1.9376345589756965, - "epoch": 0.5372458362362722, - "grad_norm": 8.363286972045898, - "learning_rate": 3.4514962297069664e-06, - "loss": 0.4583, - "mean_token_accuracy": 0.842518937587738, - "num_tokens": 208540735.0, - "step": 173310 - }, - { - "entropy": 1.8835818395018578, - "epoch": 0.537276835361322, - "grad_norm": 9.146308898925781, - "learning_rate": 3.451396657645725e-06, - "loss": 0.4783, - "mean_token_accuracy": 0.8543020248413086, - "num_tokens": 208553020.0, - "step": 173320 - }, - { - "entropy": 1.9691929280757905, - "epoch": 0.5373078344863716, - "grad_norm": 7.7050275802612305, - "learning_rate": 3.451297094201636e-06, - "loss": 0.4905, - "mean_token_accuracy": 0.844947075843811, - "num_tokens": 208564228.0, - "step": 173330 - }, - { - "entropy": 1.8853344082832337, - "epoch": 0.5373388336114213, - "grad_norm": 8.397521018981934, - "learning_rate": 3.4511975393734574e-06, - "loss": 0.3892, - "mean_token_accuracy": 0.8540289342403412, - "num_tokens": 208577038.0, - "step": 173340 - }, - { - "entropy": 1.8994945645332337, - "epoch": 0.537369832736471, - "grad_norm": 8.379923820495605, - "learning_rate": 3.4510979931599466e-06, - "loss": 0.4298, - "mean_token_accuracy": 0.8555789023637772, - "num_tokens": 208589925.0, - "step": 173350 - }, - { - "entropy": 1.892551527917385, - "epoch": 0.5374008318615207, - "grad_norm": 3.935771942138672, - "learning_rate": 3.4509984555598596e-06, - "loss": 0.4218, - "mean_token_accuracy": 0.8619065880775452, - "num_tokens": 208602194.0, - "step": 173360 - }, - { - "entropy": 1.8687604144215584, - "epoch": 0.5374318309865704, - "grad_norm": 9.803389549255371, - "learning_rate": 3.450898926571956e-06, - "loss": 0.4001, - "mean_token_accuracy": 0.8642571851611137, - "num_tokens": 208615134.0, - "step": 173370 - }, - { - "entropy": 1.8904517486691474, - "epoch": 0.5374628301116201, - "grad_norm": 8.909818649291992, - "learning_rate": 3.450799406194994e-06, - "loss": 0.4521, - "mean_token_accuracy": 0.8571186706423759, - "num_tokens": 208627116.0, - "step": 173380 - }, - { - "entropy": 1.869077640771866, - "epoch": 0.5374938292366698, - "grad_norm": 7.4785284996032715, - "learning_rate": 3.4506998944277306e-06, - "loss": 0.3902, - "mean_token_accuracy": 0.8658688947558403, - "num_tokens": 208639356.0, - "step": 173390 - }, - { - "entropy": 1.9270390465855598, - "epoch": 0.5375248283617194, - "grad_norm": 7.729918956756592, - "learning_rate": 3.4506003912689252e-06, - "loss": 0.4549, - "mean_token_accuracy": 0.8558061882853508, - "num_tokens": 208651327.0, - "step": 173400 - }, - { - "entropy": 1.9015106394886971, - "epoch": 0.5375558274867692, - "grad_norm": 4.28214168548584, - "learning_rate": 3.450500896717338e-06, - "loss": 0.4466, - "mean_token_accuracy": 0.860819086432457, - "num_tokens": 208664302.0, - "step": 173410 - }, - { - "entropy": 1.8902736604213715, - "epoch": 0.5375868266118189, - "grad_norm": 7.470119953155518, - "learning_rate": 3.4504014107717265e-06, - "loss": 0.4488, - "mean_token_accuracy": 0.8502381905913353, - "num_tokens": 208675327.0, - "step": 173420 - }, - { - "entropy": 1.895962157845497, - "epoch": 0.5376178257368686, - "grad_norm": 8.215499877929688, - "learning_rate": 3.45030193343085e-06, - "loss": 0.4397, - "mean_token_accuracy": 0.850038780272007, - "num_tokens": 208687504.0, - "step": 173430 - }, - { - "entropy": 1.9525114193558692, - "epoch": 0.5376488248619182, - "grad_norm": 9.476152420043945, - "learning_rate": 3.450202464693469e-06, - "loss": 0.4646, - "mean_token_accuracy": 0.847514845430851, - "num_tokens": 208698463.0, - "step": 173440 - }, - { - "entropy": 1.8926068410277366, - "epoch": 0.537679823986968, - "grad_norm": 7.753235816955566, - "learning_rate": 3.450103004558344e-06, - "loss": 0.4421, - "mean_token_accuracy": 0.858433011174202, - "num_tokens": 208709860.0, - "step": 173450 - }, - { - "entropy": 1.874835228919983, - "epoch": 0.5377108231120177, - "grad_norm": 8.78343677520752, - "learning_rate": 3.4500035530242337e-06, - "loss": 0.4079, - "mean_token_accuracy": 0.8578429788351059, - "num_tokens": 208722252.0, - "step": 173460 - }, - { - "entropy": 1.8854070708155632, - "epoch": 0.5377418222370673, - "grad_norm": 9.277860641479492, - "learning_rate": 3.449904110089899e-06, - "loss": 0.4636, - "mean_token_accuracy": 0.8456300452351571, - "num_tokens": 208734913.0, - "step": 173470 - }, - { - "entropy": 1.8384581625461578, - "epoch": 0.537772821362117, - "grad_norm": 7.878200054168701, - "learning_rate": 3.4498046757541017e-06, - "loss": 0.4592, - "mean_token_accuracy": 0.8508857935667038, - "num_tokens": 208747257.0, - "step": 173480 - }, - { - "entropy": 1.9341495603322982, - "epoch": 0.5378038204871668, - "grad_norm": 8.864315032958984, - "learning_rate": 3.449705250015601e-06, - "loss": 0.5136, - "mean_token_accuracy": 0.8379593536257743, - "num_tokens": 208758938.0, - "step": 173490 - }, - { - "entropy": 1.9576148480176925, - "epoch": 0.5378348196122165, - "grad_norm": 8.191580772399902, - "learning_rate": 3.449605832873159e-06, - "loss": 0.4595, - "mean_token_accuracy": 0.8521469473838806, - "num_tokens": 208770439.0, - "step": 173500 - }, - { - "entropy": 1.8926663532853127, - "epoch": 0.5378658187372661, - "grad_norm": 7.8449387550354, - "learning_rate": 3.449506424325537e-06, - "loss": 0.4068, - "mean_token_accuracy": 0.8681238234043122, - "num_tokens": 208782915.0, - "step": 173510 - }, - { - "entropy": 1.8926350608468057, - "epoch": 0.5378968178623158, - "grad_norm": 8.02475643157959, - "learning_rate": 3.4494070243714972e-06, - "loss": 0.4305, - "mean_token_accuracy": 0.8605057239532471, - "num_tokens": 208795021.0, - "step": 173520 - }, - { - "entropy": 1.9649064928293227, - "epoch": 0.5379278169873656, - "grad_norm": 8.462267875671387, - "learning_rate": 3.449307633009801e-06, - "loss": 0.454, - "mean_token_accuracy": 0.8515180319547653, - "num_tokens": 208805781.0, - "step": 173530 - }, - { - "entropy": 1.8660337910056115, - "epoch": 0.5379588161124153, - "grad_norm": 6.343677520751953, - "learning_rate": 3.4492082502392097e-06, - "loss": 0.425, - "mean_token_accuracy": 0.8579024732112884, - "num_tokens": 208818869.0, - "step": 173540 - }, - { - "entropy": 1.9495095014572144, - "epoch": 0.5379898152374649, - "grad_norm": 7.972060203552246, - "learning_rate": 3.449108876058487e-06, - "loss": 0.4564, - "mean_token_accuracy": 0.851052550971508, - "num_tokens": 208829863.0, - "step": 173550 - }, - { - "entropy": 1.8896791711449623, - "epoch": 0.5380208143625146, - "grad_norm": 8.474870681762695, - "learning_rate": 3.449009510466395e-06, - "loss": 0.4094, - "mean_token_accuracy": 0.8639056518673897, - "num_tokens": 208841639.0, - "step": 173560 - }, - { - "entropy": 1.8332088232040404, - "epoch": 0.5380518134875644, - "grad_norm": 3.642097234725952, - "learning_rate": 3.4489101534616973e-06, - "loss": 0.3799, - "mean_token_accuracy": 0.871310307085514, - "num_tokens": 208854694.0, - "step": 173570 - }, - { - "entropy": 1.9200448259711265, - "epoch": 0.538082812612614, - "grad_norm": 8.212307929992676, - "learning_rate": 3.448810805043157e-06, - "loss": 0.4183, - "mean_token_accuracy": 0.8595086053013802, - "num_tokens": 208866628.0, - "step": 173580 - }, - { - "entropy": 1.8766262441873551, - "epoch": 0.5381138117376637, - "grad_norm": 4.330519199371338, - "learning_rate": 3.448711465209536e-06, - "loss": 0.405, - "mean_token_accuracy": 0.8583259165287018, - "num_tokens": 208879177.0, - "step": 173590 - }, - { - "entropy": 1.9223135873675345, - "epoch": 0.5381448108627134, - "grad_norm": 7.595412254333496, - "learning_rate": 3.448612133959599e-06, - "loss": 0.4735, - "mean_token_accuracy": 0.8525502845644951, - "num_tokens": 208890739.0, - "step": 173600 - }, - { - "entropy": 1.8765450350940227, - "epoch": 0.538175809987763, - "grad_norm": 9.189422607421875, - "learning_rate": 3.448512811292111e-06, - "loss": 0.4277, - "mean_token_accuracy": 0.8525310665369034, - "num_tokens": 208903519.0, - "step": 173610 - }, - { - "entropy": 1.9034860491752625, - "epoch": 0.5382068091128128, - "grad_norm": 5.840885639190674, - "learning_rate": 3.448413497205834e-06, - "loss": 0.4489, - "mean_token_accuracy": 0.856965248286724, - "num_tokens": 208914822.0, - "step": 173620 - }, - { - "entropy": 1.9403439238667488, - "epoch": 0.5382378082378625, - "grad_norm": 8.857246398925781, - "learning_rate": 3.448314191699534e-06, - "loss": 0.4864, - "mean_token_accuracy": 0.845775356888771, - "num_tokens": 208926069.0, - "step": 173630 - }, - { - "entropy": 1.8852522909641265, - "epoch": 0.5382688073629122, - "grad_norm": 8.122220039367676, - "learning_rate": 3.448214894771975e-06, - "loss": 0.4286, - "mean_token_accuracy": 0.850862056016922, - "num_tokens": 208938482.0, - "step": 173640 - }, - { - "entropy": 1.838669629395008, - "epoch": 0.5382998064879618, - "grad_norm": 8.469392776489258, - "learning_rate": 3.4481156064219225e-06, - "loss": 0.4178, - "mean_token_accuracy": 0.8549366906285286, - "num_tokens": 208951343.0, - "step": 173650 - }, - { - "entropy": 1.8912505745887755, - "epoch": 0.5383308056130116, - "grad_norm": 7.779234409332275, - "learning_rate": 3.4480163266481407e-06, - "loss": 0.4467, - "mean_token_accuracy": 0.8542173370718956, - "num_tokens": 208962657.0, - "step": 173660 - }, - { - "entropy": 1.836891622841358, - "epoch": 0.5383618047380613, - "grad_norm": 7.45448637008667, - "learning_rate": 3.447917055449396e-06, - "loss": 0.3494, - "mean_token_accuracy": 0.8678934350609779, - "num_tokens": 208975278.0, - "step": 173670 - }, - { - "entropy": 1.899190789461136, - "epoch": 0.538392803863111, - "grad_norm": 8.006275177001953, - "learning_rate": 3.447817792824453e-06, - "loss": 0.4329, - "mean_token_accuracy": 0.856468240916729, - "num_tokens": 208986773.0, - "step": 173680 - }, - { - "entropy": 1.8860803157091142, - "epoch": 0.5384238029881606, - "grad_norm": 8.964807510375977, - "learning_rate": 3.4477185387720796e-06, - "loss": 0.4189, - "mean_token_accuracy": 0.8659941121935845, - "num_tokens": 208998891.0, - "step": 173690 - }, - { - "entropy": 1.9246498376131058, - "epoch": 0.5384548021132104, - "grad_norm": 8.570847511291504, - "learning_rate": 3.447619293291039e-06, - "loss": 0.4594, - "mean_token_accuracy": 0.8461833164095879, - "num_tokens": 209009880.0, - "step": 173700 - }, - { - "entropy": 1.9233383148908616, - "epoch": 0.5384858012382601, - "grad_norm": 7.748110294342041, - "learning_rate": 3.4475200563801005e-06, - "loss": 0.4573, - "mean_token_accuracy": 0.8469515576958656, - "num_tokens": 209021551.0, - "step": 173710 - }, - { - "entropy": 1.9223771378397942, - "epoch": 0.5385168003633097, - "grad_norm": 4.379847526550293, - "learning_rate": 3.4474208280380296e-06, - "loss": 0.4549, - "mean_token_accuracy": 0.8564412742853165, - "num_tokens": 209033109.0, - "step": 173720 - }, - { - "entropy": 1.8591162428259849, - "epoch": 0.5385477994883594, - "grad_norm": 8.297298431396484, - "learning_rate": 3.447321608263592e-06, - "loss": 0.4186, - "mean_token_accuracy": 0.8554350823163986, - "num_tokens": 209045664.0, - "step": 173730 - }, - { - "entropy": 1.947766014933586, - "epoch": 0.5385787986134092, - "grad_norm": 9.672722816467285, - "learning_rate": 3.447222397055556e-06, - "loss": 0.4817, - "mean_token_accuracy": 0.8485858336091041, - "num_tokens": 209056805.0, - "step": 173740 - }, - { - "entropy": 1.9055955439805985, - "epoch": 0.5386097977384589, - "grad_norm": 6.304044723510742, - "learning_rate": 3.4471231944126893e-06, - "loss": 0.4918, - "mean_token_accuracy": 0.8486945450305938, - "num_tokens": 209068171.0, - "step": 173750 - }, - { - "entropy": 1.845375980436802, - "epoch": 0.5386407968635085, - "grad_norm": 9.359332084655762, - "learning_rate": 3.447024000333759e-06, - "loss": 0.4399, - "mean_token_accuracy": 0.8463126629590988, - "num_tokens": 209080990.0, - "step": 173760 - }, - { - "entropy": 1.9070203810930253, - "epoch": 0.5386717959885582, - "grad_norm": 8.701565742492676, - "learning_rate": 3.4469248148175338e-06, - "loss": 0.4482, - "mean_token_accuracy": 0.8570631608366966, - "num_tokens": 209092750.0, - "step": 173770 - }, - { - "entropy": 1.9061894118785858, - "epoch": 0.538702795113608, - "grad_norm": 3.7396252155303955, - "learning_rate": 3.44682563786278e-06, - "loss": 0.4586, - "mean_token_accuracy": 0.8583489716053009, - "num_tokens": 209104565.0, - "step": 173780 - }, - { - "entropy": 1.8039513304829597, - "epoch": 0.5387337942386576, - "grad_norm": 3.53255558013916, - "learning_rate": 3.4467264694682684e-06, - "loss": 0.3631, - "mean_token_accuracy": 0.8604403004050255, - "num_tokens": 209117953.0, - "step": 173790 - }, - { - "entropy": 1.9707183718681336, - "epoch": 0.5387647933637073, - "grad_norm": 6.6833109855651855, - "learning_rate": 3.446627309632766e-06, - "loss": 0.4572, - "mean_token_accuracy": 0.851428984105587, - "num_tokens": 209129538.0, - "step": 173800 - }, - { - "entropy": 1.8999001562595368, - "epoch": 0.538795792488757, - "grad_norm": 7.964756965637207, - "learning_rate": 3.446528158355042e-06, - "loss": 0.4346, - "mean_token_accuracy": 0.8490406572818756, - "num_tokens": 209141533.0, - "step": 173810 - }, - { - "entropy": 1.9331061124801636, - "epoch": 0.5388267916138068, - "grad_norm": 9.25655460357666, - "learning_rate": 3.4464290156338653e-06, - "loss": 0.4422, - "mean_token_accuracy": 0.8557355120778084, - "num_tokens": 209153672.0, - "step": 173820 - }, - { - "entropy": 1.904996284842491, - "epoch": 0.5388577907388564, - "grad_norm": 8.72864818572998, - "learning_rate": 3.4463298814680063e-06, - "loss": 0.4802, - "mean_token_accuracy": 0.8544986084103584, - "num_tokens": 209166617.0, - "step": 173830 - }, - { - "entropy": 1.8626493886113167, - "epoch": 0.5388887898639061, - "grad_norm": 8.355567932128906, - "learning_rate": 3.4462307558562334e-06, - "loss": 0.397, - "mean_token_accuracy": 0.8615503028035164, - "num_tokens": 209178685.0, - "step": 173840 - }, - { - "entropy": 1.87907382696867, - "epoch": 0.5389197889889558, - "grad_norm": 7.804126262664795, - "learning_rate": 3.4461316387973177e-06, - "loss": 0.4509, - "mean_token_accuracy": 0.8576026365160943, - "num_tokens": 209191040.0, - "step": 173850 - }, - { - "entropy": 1.903736950457096, - "epoch": 0.5389507881140054, - "grad_norm": 8.96075439453125, - "learning_rate": 3.446032530290028e-06, - "loss": 0.4424, - "mean_token_accuracy": 0.8592004761099815, - "num_tokens": 209203023.0, - "step": 173860 - }, - { - "entropy": 1.850924776494503, - "epoch": 0.5389817872390552, - "grad_norm": 7.536379337310791, - "learning_rate": 3.4459334303331358e-06, - "loss": 0.4114, - "mean_token_accuracy": 0.8623146697878837, - "num_tokens": 209214957.0, - "step": 173870 - }, - { - "entropy": 1.7714354708790778, - "epoch": 0.5390127863641049, - "grad_norm": 3.7824769020080566, - "learning_rate": 3.445834338925412e-06, - "loss": 0.3724, - "mean_token_accuracy": 0.8654832825064659, - "num_tokens": 209227593.0, - "step": 173880 - }, - { - "entropy": 1.8509420529007912, - "epoch": 0.5390437854891545, - "grad_norm": 8.168303489685059, - "learning_rate": 3.4457352560656255e-06, - "loss": 0.4233, - "mean_token_accuracy": 0.8622468948364258, - "num_tokens": 209240043.0, - "step": 173890 - }, - { - "entropy": 1.8068629071116447, - "epoch": 0.5390747846142042, - "grad_norm": 2.740614652633667, - "learning_rate": 3.4456361817525485e-06, - "loss": 0.3772, - "mean_token_accuracy": 0.8759689271450043, - "num_tokens": 209252184.0, - "step": 173900 - }, - { - "entropy": 1.8972719386219978, - "epoch": 0.539105783739254, - "grad_norm": 4.083326816558838, - "learning_rate": 3.4455371159849536e-06, - "loss": 0.4811, - "mean_token_accuracy": 0.8535422086715698, - "num_tokens": 209264573.0, - "step": 173910 - }, - { - "entropy": 1.8553459897637368, - "epoch": 0.5391367828643037, - "grad_norm": 7.37409782409668, - "learning_rate": 3.445438058761611e-06, - "loss": 0.4431, - "mean_token_accuracy": 0.8643639668822288, - "num_tokens": 209276436.0, - "step": 173920 - }, - { - "entropy": 1.8629083037376404, - "epoch": 0.5391677819893533, - "grad_norm": 11.203222274780273, - "learning_rate": 3.4453390100812933e-06, - "loss": 0.4486, - "mean_token_accuracy": 0.8535048425197601, - "num_tokens": 209288296.0, - "step": 173930 - }, - { - "entropy": 1.888429582118988, - "epoch": 0.539198781114403, - "grad_norm": 9.21101188659668, - "learning_rate": 3.4452399699427715e-06, - "loss": 0.417, - "mean_token_accuracy": 0.850302429497242, - "num_tokens": 209300674.0, - "step": 173940 - }, - { - "entropy": 1.865077406167984, - "epoch": 0.5392297802394528, - "grad_norm": 6.1955485343933105, - "learning_rate": 3.4451409383448185e-06, - "loss": 0.4204, - "mean_token_accuracy": 0.8566244795918465, - "num_tokens": 209312498.0, - "step": 173950 - }, - { - "entropy": 1.835512214899063, - "epoch": 0.5392607793645025, - "grad_norm": 3.3951921463012695, - "learning_rate": 3.4450419152862075e-06, - "loss": 0.4027, - "mean_token_accuracy": 0.8619059652090073, - "num_tokens": 209324850.0, - "step": 173960 - }, - { - "entropy": 1.9534437566995622, - "epoch": 0.5392917784895521, - "grad_norm": 7.149500846862793, - "learning_rate": 3.444942900765711e-06, - "loss": 0.4479, - "mean_token_accuracy": 0.8548975363373756, - "num_tokens": 209335333.0, - "step": 173970 - }, - { - "entropy": 1.8894205838441849, - "epoch": 0.5393227776146018, - "grad_norm": 3.9499619007110596, - "learning_rate": 3.4448438947821017e-06, - "loss": 0.4722, - "mean_token_accuracy": 0.8543545797467231, - "num_tokens": 209347592.0, - "step": 173980 - }, - { - "entropy": 1.9478689044713975, - "epoch": 0.5393537767396516, - "grad_norm": 8.097626686096191, - "learning_rate": 3.4447448973341536e-06, - "loss": 0.5032, - "mean_token_accuracy": 0.8446411550045013, - "num_tokens": 209358862.0, - "step": 173990 - }, - { - "entropy": 1.8606567561626435, - "epoch": 0.5393847758647012, - "grad_norm": 7.8725457191467285, - "learning_rate": 3.4446459084206392e-06, - "loss": 0.4179, - "mean_token_accuracy": 0.8621124207973481, - "num_tokens": 209370738.0, - "step": 174000 - }, - { - "entropy": 1.9500312462449074, - "epoch": 0.5394157749897509, - "grad_norm": 7.8665289878845215, - "learning_rate": 3.4445469280403334e-06, - "loss": 0.4702, - "mean_token_accuracy": 0.8539640560746193, - "num_tokens": 209382325.0, - "step": 174010 - }, - { - "entropy": 1.8706529840826989, - "epoch": 0.5394467741148006, - "grad_norm": 4.0839457511901855, - "learning_rate": 3.4444479561920104e-06, - "loss": 0.4393, - "mean_token_accuracy": 0.853885892033577, - "num_tokens": 209394402.0, - "step": 174020 - }, - { - "entropy": 1.8229328334331512, - "epoch": 0.5394777732398504, - "grad_norm": 2.62138032913208, - "learning_rate": 3.4443489928744434e-06, - "loss": 0.4121, - "mean_token_accuracy": 0.8603856071829796, - "num_tokens": 209407115.0, - "step": 174030 - }, - { - "entropy": 1.9071577087044715, - "epoch": 0.5395087723649, - "grad_norm": 7.644775390625, - "learning_rate": 3.444250038086408e-06, - "loss": 0.4771, - "mean_token_accuracy": 0.8491267457604408, - "num_tokens": 209418495.0, - "step": 174040 - }, - { - "entropy": 1.8522107481956482, - "epoch": 0.5395397714899497, - "grad_norm": 8.107036590576172, - "learning_rate": 3.444151091826679e-06, - "loss": 0.4551, - "mean_token_accuracy": 0.849695211648941, - "num_tokens": 209430988.0, - "step": 174050 - }, - { - "entropy": 1.891443131864071, - "epoch": 0.5395707706149994, - "grad_norm": 10.422231674194336, - "learning_rate": 3.444052154094031e-06, - "loss": 0.4249, - "mean_token_accuracy": 0.8538878068327904, - "num_tokens": 209442507.0, - "step": 174060 - }, - { - "entropy": 1.8737448453903198, - "epoch": 0.5396017697400491, - "grad_norm": 7.765069961547852, - "learning_rate": 3.4439532248872388e-06, - "loss": 0.4372, - "mean_token_accuracy": 0.8545178517699241, - "num_tokens": 209454958.0, - "step": 174070 - }, - { - "entropy": 1.9368214011192322, - "epoch": 0.5396327688650988, - "grad_norm": 7.202656269073486, - "learning_rate": 3.4438543042050785e-06, - "loss": 0.4844, - "mean_token_accuracy": 0.8466168284416199, - "num_tokens": 209466512.0, - "step": 174080 - }, - { - "entropy": 1.8895224526524543, - "epoch": 0.5396637679901485, - "grad_norm": 3.6564581394195557, - "learning_rate": 3.4437553920463267e-06, - "loss": 0.4716, - "mean_token_accuracy": 0.8483851253986359, - "num_tokens": 209479335.0, - "step": 174090 - }, - { - "entropy": 1.7815810799598695, - "epoch": 0.5396947671151981, - "grad_norm": 3.281292676925659, - "learning_rate": 3.443656488409759e-06, - "loss": 0.4025, - "mean_token_accuracy": 0.8650498852133751, - "num_tokens": 209493538.0, - "step": 174100 - }, - { - "entropy": 1.9042472764849663, - "epoch": 0.5397257662402478, - "grad_norm": 6.953648567199707, - "learning_rate": 3.443557593294151e-06, - "loss": 0.4342, - "mean_token_accuracy": 0.8552184790372849, - "num_tokens": 209504818.0, - "step": 174110 - }, - { - "entropy": 1.8446774929761887, - "epoch": 0.5397567653652976, - "grad_norm": 7.378088474273682, - "learning_rate": 3.443458706698279e-06, - "loss": 0.3876, - "mean_token_accuracy": 0.8741657555103302, - "num_tokens": 209517016.0, - "step": 174120 - }, - { - "entropy": 1.8744469970464706, - "epoch": 0.5397877644903473, - "grad_norm": 8.403420448303223, - "learning_rate": 3.443359828620922e-06, - "loss": 0.4474, - "mean_token_accuracy": 0.8557930126786232, - "num_tokens": 209529296.0, - "step": 174130 - }, - { - "entropy": 1.8760189965367318, - "epoch": 0.5398187636153969, - "grad_norm": 7.711986064910889, - "learning_rate": 3.4432609590608547e-06, - "loss": 0.4138, - "mean_token_accuracy": 0.8661058858036995, - "num_tokens": 209541437.0, - "step": 174140 - }, - { - "entropy": 1.8844467535614968, - "epoch": 0.5398497627404466, - "grad_norm": 7.68595552444458, - "learning_rate": 3.443162098016855e-06, - "loss": 0.4104, - "mean_token_accuracy": 0.8724717631936073, - "num_tokens": 209552945.0, - "step": 174150 - }, - { - "entropy": 1.9028114691376685, - "epoch": 0.5398807618654964, - "grad_norm": 8.077682495117188, - "learning_rate": 3.443063245487701e-06, - "loss": 0.4133, - "mean_token_accuracy": 0.8684661194682122, - "num_tokens": 209563980.0, - "step": 174160 - }, - { - "entropy": 1.904925413429737, - "epoch": 0.539911760990546, - "grad_norm": 4.101963520050049, - "learning_rate": 3.44296440147217e-06, - "loss": 0.4403, - "mean_token_accuracy": 0.8549754232168197, - "num_tokens": 209575474.0, - "step": 174170 - }, - { - "entropy": 1.8629205331206322, - "epoch": 0.5399427601155957, - "grad_norm": 9.179095268249512, - "learning_rate": 3.4428655659690396e-06, - "loss": 0.4372, - "mean_token_accuracy": 0.8535078182816506, - "num_tokens": 209587292.0, - "step": 174180 - }, - { - "entropy": 1.9449424147605896, - "epoch": 0.5399737592406454, - "grad_norm": 8.293116569519043, - "learning_rate": 3.4427667389770895e-06, - "loss": 0.479, - "mean_token_accuracy": 0.8479105710983277, - "num_tokens": 209598002.0, - "step": 174190 - }, - { - "entropy": 1.9391576394438743, - "epoch": 0.5400047583656952, - "grad_norm": 6.534260272979736, - "learning_rate": 3.442667920495097e-06, - "loss": 0.4704, - "mean_token_accuracy": 0.8563640967011452, - "num_tokens": 209609592.0, - "step": 174200 - }, - { - "entropy": 1.8575612157583237, - "epoch": 0.5400357574907448, - "grad_norm": 7.172982692718506, - "learning_rate": 3.4425691105218407e-06, - "loss": 0.4005, - "mean_token_accuracy": 0.8640211433172226, - "num_tokens": 209621671.0, - "step": 174210 - }, - { - "entropy": 1.8971868574619293, - "epoch": 0.5400667566157945, - "grad_norm": 7.444050312042236, - "learning_rate": 3.4424703090561005e-06, - "loss": 0.4028, - "mean_token_accuracy": 0.8565149545669556, - "num_tokens": 209634034.0, - "step": 174220 - }, - { - "entropy": 1.8702701404690742, - "epoch": 0.5400977557408442, - "grad_norm": 8.514378547668457, - "learning_rate": 3.442371516096655e-06, - "loss": 0.4013, - "mean_token_accuracy": 0.8600099131464958, - "num_tokens": 209646181.0, - "step": 174230 - }, - { - "entropy": 1.9420946687459946, - "epoch": 0.540128754865894, - "grad_norm": 7.077751159667969, - "learning_rate": 3.4422727316422843e-06, - "loss": 0.4714, - "mean_token_accuracy": 0.8432377234101296, - "num_tokens": 209658564.0, - "step": 174240 - }, - { - "entropy": 1.719575546681881, - "epoch": 0.5401597539909436, - "grad_norm": 3.8743057250976562, - "learning_rate": 3.442173955691767e-06, - "loss": 0.3195, - "mean_token_accuracy": 0.8714294508099556, - "num_tokens": 209673328.0, - "step": 174250 - }, - { - "entropy": 1.8225284963846207, - "epoch": 0.5401907531159933, - "grad_norm": 3.381141424179077, - "learning_rate": 3.442075188243884e-06, - "loss": 0.3789, - "mean_token_accuracy": 0.8631697997450829, - "num_tokens": 209686284.0, - "step": 174260 - }, - { - "entropy": 1.8965171083807946, - "epoch": 0.540221752241043, - "grad_norm": 7.4435529708862305, - "learning_rate": 3.4419764292974155e-06, - "loss": 0.4457, - "mean_token_accuracy": 0.8570987716317177, - "num_tokens": 209697671.0, - "step": 174270 - }, - { - "entropy": 1.9002371713519097, - "epoch": 0.5402527513660927, - "grad_norm": 7.851629734039307, - "learning_rate": 3.4418776788511416e-06, - "loss": 0.4192, - "mean_token_accuracy": 0.8583559781312943, - "num_tokens": 209709310.0, - "step": 174280 - }, - { - "entropy": 1.9418524980545044, - "epoch": 0.5402837504911424, - "grad_norm": 9.440665245056152, - "learning_rate": 3.441778936903844e-06, - "loss": 0.4982, - "mean_token_accuracy": 0.8463272243738175, - "num_tokens": 209720450.0, - "step": 174290 - }, - { - "entropy": 1.8744873508810997, - "epoch": 0.5403147496161921, - "grad_norm": 8.730061531066895, - "learning_rate": 3.4416802034543018e-06, - "loss": 0.4428, - "mean_token_accuracy": 0.8531945258378982, - "num_tokens": 209733006.0, - "step": 174300 - }, - { - "entropy": 1.9549224942922592, - "epoch": 0.5403457487412417, - "grad_norm": 8.060844421386719, - "learning_rate": 3.441581478501298e-06, - "loss": 0.4547, - "mean_token_accuracy": 0.8591006144881248, - "num_tokens": 209743610.0, - "step": 174310 - }, - { - "entropy": 1.9375012516975403, - "epoch": 0.5403767478662915, - "grad_norm": 8.074051856994629, - "learning_rate": 3.4414827620436124e-06, - "loss": 0.4468, - "mean_token_accuracy": 0.8519991576671601, - "num_tokens": 209755113.0, - "step": 174320 - }, - { - "entropy": 1.908737349510193, - "epoch": 0.5404077469913412, - "grad_norm": 3.8884499073028564, - "learning_rate": 3.4413840540800287e-06, - "loss": 0.4367, - "mean_token_accuracy": 0.8561335727572441, - "num_tokens": 209767582.0, - "step": 174330 - }, - { - "entropy": 1.882502131164074, - "epoch": 0.5404387461163909, - "grad_norm": 5.514358997344971, - "learning_rate": 3.441285354609327e-06, - "loss": 0.5137, - "mean_token_accuracy": 0.8450815051794052, - "num_tokens": 209780073.0, - "step": 174340 - }, - { - "entropy": 1.936092458665371, - "epoch": 0.5404697452414405, - "grad_norm": 8.400111198425293, - "learning_rate": 3.4411866636302905e-06, - "loss": 0.4453, - "mean_token_accuracy": 0.8598876744508743, - "num_tokens": 209791202.0, - "step": 174350 - }, - { - "entropy": 1.789114499092102, - "epoch": 0.5405007443664902, - "grad_norm": 3.711569309234619, - "learning_rate": 3.4410879811417013e-06, - "loss": 0.3763, - "mean_token_accuracy": 0.8579269453883172, - "num_tokens": 209804460.0, - "step": 174360 - }, - { - "entropy": 1.90018672645092, - "epoch": 0.54053174349154, - "grad_norm": 3.735109567642212, - "learning_rate": 3.4409893071423422e-06, - "loss": 0.435, - "mean_token_accuracy": 0.8549308374524116, - "num_tokens": 209815948.0, - "step": 174370 - }, - { - "entropy": 1.9148787215352059, - "epoch": 0.5405627426165897, - "grad_norm": 4.6859354972839355, - "learning_rate": 3.440890641630996e-06, - "loss": 0.4424, - "mean_token_accuracy": 0.8568407163023949, - "num_tokens": 209828424.0, - "step": 174380 - }, - { - "entropy": 1.9087970286607743, - "epoch": 0.5405937417416393, - "grad_norm": 3.260207414627075, - "learning_rate": 3.440791984606446e-06, - "loss": 0.421, - "mean_token_accuracy": 0.8594276770949364, - "num_tokens": 209839992.0, - "step": 174390 - }, - { - "entropy": 1.892660665512085, - "epoch": 0.540624740866689, - "grad_norm": 8.223847389221191, - "learning_rate": 3.440693336067476e-06, - "loss": 0.4091, - "mean_token_accuracy": 0.8555561438202858, - "num_tokens": 209851621.0, - "step": 174400 - }, - { - "entropy": 1.9226069450378418, - "epoch": 0.5406557399917388, - "grad_norm": 9.147815704345703, - "learning_rate": 3.4405946960128685e-06, - "loss": 0.484, - "mean_token_accuracy": 0.838910260796547, - "num_tokens": 209863542.0, - "step": 174410 - }, - { - "entropy": 1.8764439657330514, - "epoch": 0.5406867391167884, - "grad_norm": 10.320169448852539, - "learning_rate": 3.440496064441408e-06, - "loss": 0.4486, - "mean_token_accuracy": 0.8501550763845444, - "num_tokens": 209876332.0, - "step": 174420 - }, - { - "entropy": 1.932776327431202, - "epoch": 0.5407177382418381, - "grad_norm": 3.934086561203003, - "learning_rate": 3.4403974413518787e-06, - "loss": 0.4702, - "mean_token_accuracy": 0.8536558374762535, - "num_tokens": 209887319.0, - "step": 174430 - }, - { - "entropy": 1.9584178015589715, - "epoch": 0.5407487373668878, - "grad_norm": 9.40331745147705, - "learning_rate": 3.4402988267430653e-06, - "loss": 0.4521, - "mean_token_accuracy": 0.8565890833735466, - "num_tokens": 209898463.0, - "step": 174440 - }, - { - "entropy": 1.8962328165769577, - "epoch": 0.5407797364919376, - "grad_norm": 7.782931327819824, - "learning_rate": 3.4402002206137525e-06, - "loss": 0.4703, - "mean_token_accuracy": 0.8547718808054924, - "num_tokens": 209910866.0, - "step": 174450 - }, - { - "entropy": 2.017945593595505, - "epoch": 0.5408107356169872, - "grad_norm": 6.7005486488342285, - "learning_rate": 3.440101622962724e-06, - "loss": 0.4734, - "mean_token_accuracy": 0.8474322855472565, - "num_tokens": 209921348.0, - "step": 174460 - }, - { - "entropy": 1.9669252157211303, - "epoch": 0.5408417347420369, - "grad_norm": 9.323506355285645, - "learning_rate": 3.440003033788766e-06, - "loss": 0.4896, - "mean_token_accuracy": 0.8428177922964096, - "num_tokens": 209932134.0, - "step": 174470 - }, - { - "entropy": 1.8594336844980717, - "epoch": 0.5408727338670866, - "grad_norm": 11.331262588500977, - "learning_rate": 3.4399044530906633e-06, - "loss": 0.4695, - "mean_token_accuracy": 0.8466810330748558, - "num_tokens": 209945678.0, - "step": 174480 - }, - { - "entropy": 1.9158946216106414, - "epoch": 0.5409037329921363, - "grad_norm": 8.143959999084473, - "learning_rate": 3.439805880867202e-06, - "loss": 0.4263, - "mean_token_accuracy": 0.8595825538039208, - "num_tokens": 209957030.0, - "step": 174490 - }, - { - "entropy": 1.9205025285482407, - "epoch": 0.540934732117186, - "grad_norm": 7.518352031707764, - "learning_rate": 3.439707317117168e-06, - "loss": 0.4791, - "mean_token_accuracy": 0.8524655893445015, - "num_tokens": 209968372.0, - "step": 174500 - }, - { - "entropy": 1.8929289489984513, - "epoch": 0.5409657312422357, - "grad_norm": 9.195374488830566, - "learning_rate": 3.439608761839347e-06, - "loss": 0.4426, - "mean_token_accuracy": 0.8569901049137115, - "num_tokens": 209980375.0, - "step": 174510 - }, - { - "entropy": 1.8607401132583619, - "epoch": 0.5409967303672854, - "grad_norm": 8.270475387573242, - "learning_rate": 3.439510215032525e-06, - "loss": 0.4106, - "mean_token_accuracy": 0.8566593199968338, - "num_tokens": 209992780.0, - "step": 174520 - }, - { - "entropy": 1.9021028637886048, - "epoch": 0.5410277294923351, - "grad_norm": 8.019796371459961, - "learning_rate": 3.4394116766954886e-06, - "loss": 0.4518, - "mean_token_accuracy": 0.8580415681004524, - "num_tokens": 210004429.0, - "step": 174530 - }, - { - "entropy": 1.9144439026713371, - "epoch": 0.5410587286173848, - "grad_norm": 9.35071086883545, - "learning_rate": 3.439313146827026e-06, - "loss": 0.4001, - "mean_token_accuracy": 0.8612898230552674, - "num_tokens": 210016007.0, - "step": 174540 - }, - { - "entropy": 1.9227073967456818, - "epoch": 0.5410897277424345, - "grad_norm": 8.214776992797852, - "learning_rate": 3.439214625425923e-06, - "loss": 0.4615, - "mean_token_accuracy": 0.8489612191915512, - "num_tokens": 210027331.0, - "step": 174550 - }, - { - "entropy": 1.8217739909887314, - "epoch": 0.5411207268674841, - "grad_norm": 4.1948957443237305, - "learning_rate": 3.439116112490967e-06, - "loss": 0.3779, - "mean_token_accuracy": 0.8614114180207253, - "num_tokens": 210040443.0, - "step": 174560 - }, - { - "entropy": 1.9101142331957817, - "epoch": 0.5411517259925338, - "grad_norm": 7.208160877227783, - "learning_rate": 3.4390176080209454e-06, - "loss": 0.4299, - "mean_token_accuracy": 0.8595498502254486, - "num_tokens": 210052159.0, - "step": 174570 - }, - { - "entropy": 1.8564433738589288, - "epoch": 0.5411827251175836, - "grad_norm": 8.189614295959473, - "learning_rate": 3.4389191120146466e-06, - "loss": 0.4474, - "mean_token_accuracy": 0.8517980024218559, - "num_tokens": 210064171.0, - "step": 174580 - }, - { - "entropy": 1.7955839857459068, - "epoch": 0.5412137242426333, - "grad_norm": 9.149805068969727, - "learning_rate": 3.438820624470858e-06, - "loss": 0.3621, - "mean_token_accuracy": 0.8595570862293244, - "num_tokens": 210077852.0, - "step": 174590 - }, - { - "entropy": 1.9039988040924072, - "epoch": 0.5412447233676829, - "grad_norm": 8.685386657714844, - "learning_rate": 3.4387221453883684e-06, - "loss": 0.4086, - "mean_token_accuracy": 0.8574944108724594, - "num_tokens": 210090031.0, - "step": 174600 - }, - { - "entropy": 1.943969477713108, - "epoch": 0.5412757224927326, - "grad_norm": 9.362236976623535, - "learning_rate": 3.438623674765967e-06, - "loss": 0.4563, - "mean_token_accuracy": 0.8577531427145004, - "num_tokens": 210101413.0, - "step": 174610 - }, - { - "entropy": 1.8930824875831604, - "epoch": 0.5413067216177824, - "grad_norm": 3.906763792037964, - "learning_rate": 3.4385252126024405e-06, - "loss": 0.4181, - "mean_token_accuracy": 0.8613180920481682, - "num_tokens": 210115101.0, - "step": 174620 - }, - { - "entropy": 1.8995563462376595, - "epoch": 0.541337720742832, - "grad_norm": 6.614351749420166, - "learning_rate": 3.4384267588965796e-06, - "loss": 0.4127, - "mean_token_accuracy": 0.855283860862255, - "num_tokens": 210127312.0, - "step": 174630 - }, - { - "entropy": 1.8280969649553298, - "epoch": 0.5413687198678817, - "grad_norm": 7.801238059997559, - "learning_rate": 3.438328313647173e-06, - "loss": 0.3953, - "mean_token_accuracy": 0.8650396257638931, - "num_tokens": 210140420.0, - "step": 174640 - }, - { - "entropy": 1.81494547277689, - "epoch": 0.5413997189929314, - "grad_norm": 3.7451062202453613, - "learning_rate": 3.4382298768530104e-06, - "loss": 0.3754, - "mean_token_accuracy": 0.8647580876946449, - "num_tokens": 210153529.0, - "step": 174650 - }, - { - "entropy": 1.7520886182785034, - "epoch": 0.5414307181179812, - "grad_norm": 2.5675580501556396, - "learning_rate": 3.438131448512881e-06, - "loss": 0.3702, - "mean_token_accuracy": 0.8710595235228539, - "num_tokens": 210167218.0, - "step": 174660 - }, - { - "entropy": 1.8860586032271385, - "epoch": 0.5414617172430308, - "grad_norm": 3.851811647415161, - "learning_rate": 3.438033028625575e-06, - "loss": 0.4301, - "mean_token_accuracy": 0.8687780782580375, - "num_tokens": 210178719.0, - "step": 174670 - }, - { - "entropy": 1.9037866786122322, - "epoch": 0.5414927163680805, - "grad_norm": 8.392757415771484, - "learning_rate": 3.4379346171898826e-06, - "loss": 0.4302, - "mean_token_accuracy": 0.8572928667068481, - "num_tokens": 210190726.0, - "step": 174680 - }, - { - "entropy": 1.8577393546700478, - "epoch": 0.5415237154931302, - "grad_norm": 8.268177032470703, - "learning_rate": 3.437836214204595e-06, - "loss": 0.396, - "mean_token_accuracy": 0.8722563162446022, - "num_tokens": 210203752.0, - "step": 174690 - }, - { - "entropy": 1.8598589971661568, - "epoch": 0.5415547146181799, - "grad_norm": 10.199872016906738, - "learning_rate": 3.437737819668502e-06, - "loss": 0.4306, - "mean_token_accuracy": 0.856185057759285, - "num_tokens": 210215977.0, - "step": 174700 - }, - { - "entropy": 1.8862924322485923, - "epoch": 0.5415857137432296, - "grad_norm": 3.774386167526245, - "learning_rate": 3.437639433580395e-06, - "loss": 0.4979, - "mean_token_accuracy": 0.852426928281784, - "num_tokens": 210228793.0, - "step": 174710 - }, - { - "entropy": 1.9758263260126114, - "epoch": 0.5416167128682793, - "grad_norm": 7.43900728225708, - "learning_rate": 3.4375410559390653e-06, - "loss": 0.5577, - "mean_token_accuracy": 0.8290947854518891, - "num_tokens": 210240226.0, - "step": 174720 - }, - { - "entropy": 1.774106466770172, - "epoch": 0.541647711993329, - "grad_norm": 2.82202410697937, - "learning_rate": 3.437442686743304e-06, - "loss": 0.3548, - "mean_token_accuracy": 0.8689462915062904, - "num_tokens": 210254170.0, - "step": 174730 - }, - { - "entropy": 1.891635850071907, - "epoch": 0.5416787111183787, - "grad_norm": 7.973023891448975, - "learning_rate": 3.4373443259919025e-06, - "loss": 0.4441, - "mean_token_accuracy": 0.852353821694851, - "num_tokens": 210267073.0, - "step": 174740 - }, - { - "entropy": 1.9281975954771042, - "epoch": 0.5417097102434284, - "grad_norm": 8.363383293151855, - "learning_rate": 3.437245973683653e-06, - "loss": 0.469, - "mean_token_accuracy": 0.8436614751815796, - "num_tokens": 210278959.0, - "step": 174750 - }, - { - "entropy": 1.9063287645578384, - "epoch": 0.5417407093684781, - "grad_norm": 7.292370319366455, - "learning_rate": 3.4371476298173482e-06, - "loss": 0.4456, - "mean_token_accuracy": 0.8531043663620949, - "num_tokens": 210290779.0, - "step": 174760 - }, - { - "entropy": 1.8370270490646363, - "epoch": 0.5417717084935277, - "grad_norm": 7.736352443695068, - "learning_rate": 3.43704929439178e-06, - "loss": 0.3753, - "mean_token_accuracy": 0.8688179135322571, - "num_tokens": 210303485.0, - "step": 174770 - }, - { - "entropy": 1.930588774383068, - "epoch": 0.5418027076185775, - "grad_norm": 10.174417495727539, - "learning_rate": 3.436950967405741e-06, - "loss": 0.4887, - "mean_token_accuracy": 0.8459608346223831, - "num_tokens": 210314659.0, - "step": 174780 - }, - { - "entropy": 1.7679610848426819, - "epoch": 0.5418337067436272, - "grad_norm": 6.358749866485596, - "learning_rate": 3.436852648858024e-06, - "loss": 0.353, - "mean_token_accuracy": 0.8780291527509689, - "num_tokens": 210328185.0, - "step": 174790 - }, - { - "entropy": 1.8495957791805266, - "epoch": 0.5418647058686769, - "grad_norm": 3.853734016418457, - "learning_rate": 3.436754338747422e-06, - "loss": 0.4052, - "mean_token_accuracy": 0.8549610197544097, - "num_tokens": 210341316.0, - "step": 174800 - }, - { - "entropy": 1.9119720757007599, - "epoch": 0.5418957049937265, - "grad_norm": 9.63703441619873, - "learning_rate": 3.4366560370727294e-06, - "loss": 0.4288, - "mean_token_accuracy": 0.8599133908748626, - "num_tokens": 210353173.0, - "step": 174810 - }, - { - "entropy": 1.8423039808869361, - "epoch": 0.5419267041187762, - "grad_norm": 7.737087726593018, - "learning_rate": 3.436557743832738e-06, - "loss": 0.3884, - "mean_token_accuracy": 0.8715093210339546, - "num_tokens": 210365171.0, - "step": 174820 - }, - { - "entropy": 1.936174413561821, - "epoch": 0.541957703243826, - "grad_norm": 9.276434898376465, - "learning_rate": 3.436459459026243e-06, - "loss": 0.4711, - "mean_token_accuracy": 0.8483540773391723, - "num_tokens": 210376844.0, - "step": 174830 - }, - { - "entropy": 1.8812786877155303, - "epoch": 0.5419887023688756, - "grad_norm": 8.049873352050781, - "learning_rate": 3.436361182652038e-06, - "loss": 0.4226, - "mean_token_accuracy": 0.8652190431952477, - "num_tokens": 210389173.0, - "step": 174840 - }, - { - "entropy": 1.8744924083352088, - "epoch": 0.5420197014939253, - "grad_norm": 7.033712863922119, - "learning_rate": 3.436262914708918e-06, - "loss": 0.4246, - "mean_token_accuracy": 0.868674422800541, - "num_tokens": 210401296.0, - "step": 174850 - }, - { - "entropy": 1.909075213968754, - "epoch": 0.542050700618975, - "grad_norm": 9.055830001831055, - "learning_rate": 3.436164655195677e-06, - "loss": 0.4647, - "mean_token_accuracy": 0.848932571709156, - "num_tokens": 210412989.0, - "step": 174860 - }, - { - "entropy": 1.907054753601551, - "epoch": 0.5420816997440248, - "grad_norm": 8.009981155395508, - "learning_rate": 3.4360664041111097e-06, - "loss": 0.4362, - "mean_token_accuracy": 0.855173397064209, - "num_tokens": 210424168.0, - "step": 174870 - }, - { - "entropy": 1.855896058678627, - "epoch": 0.5421126988690744, - "grad_norm": 4.083925247192383, - "learning_rate": 3.435968161454011e-06, - "loss": 0.3958, - "mean_token_accuracy": 0.8597793817520142, - "num_tokens": 210436327.0, - "step": 174880 - }, - { - "entropy": 1.7705978021025657, - "epoch": 0.5421436979941241, - "grad_norm": 3.9541049003601074, - "learning_rate": 3.4358699272231765e-06, - "loss": 0.3642, - "mean_token_accuracy": 0.8635337948799133, - "num_tokens": 210449816.0, - "step": 174890 - }, - { - "entropy": 1.8276181071996689, - "epoch": 0.5421746971191738, - "grad_norm": 4.227304458618164, - "learning_rate": 3.435771701417402e-06, - "loss": 0.3886, - "mean_token_accuracy": 0.8660426765680314, - "num_tokens": 210462975.0, - "step": 174900 - }, - { - "entropy": 1.7664773181080817, - "epoch": 0.5422056962442235, - "grad_norm": 4.43179988861084, - "learning_rate": 3.435673484035484e-06, - "loss": 0.3642, - "mean_token_accuracy": 0.8715872049331665, - "num_tokens": 210477054.0, - "step": 174910 - }, - { - "entropy": 1.901937472820282, - "epoch": 0.5422366953692732, - "grad_norm": 4.020650863647461, - "learning_rate": 3.435575275076216e-06, - "loss": 0.4332, - "mean_token_accuracy": 0.8585090190172195, - "num_tokens": 210487847.0, - "step": 174920 - }, - { - "entropy": 1.7410590797662735, - "epoch": 0.5422676944943229, - "grad_norm": 10.307069778442383, - "learning_rate": 3.4354770745383966e-06, - "loss": 0.3951, - "mean_token_accuracy": 0.8655442446470261, - "num_tokens": 210502071.0, - "step": 174930 - }, - { - "entropy": 1.9489874988794327, - "epoch": 0.5422986936193726, - "grad_norm": 7.642431259155273, - "learning_rate": 3.4353788824208217e-06, - "loss": 0.5031, - "mean_token_accuracy": 0.846542339026928, - "num_tokens": 210513613.0, - "step": 174940 - }, - { - "entropy": 1.9443811848759651, - "epoch": 0.5423296927444223, - "grad_norm": 6.863345623016357, - "learning_rate": 3.4352806987222875e-06, - "loss": 0.4587, - "mean_token_accuracy": 0.8550491377711296, - "num_tokens": 210524857.0, - "step": 174950 - }, - { - "entropy": 1.950060898065567, - "epoch": 0.542360691869472, - "grad_norm": 10.906983375549316, - "learning_rate": 3.435182523441591e-06, - "loss": 0.4431, - "mean_token_accuracy": 0.8554666265845299, - "num_tokens": 210535796.0, - "step": 174960 - }, - { - "entropy": 1.7573295474052428, - "epoch": 0.5423916909945217, - "grad_norm": 5.128137111663818, - "learning_rate": 3.4350843565775303e-06, - "loss": 0.3626, - "mean_token_accuracy": 0.8658852711319923, - "num_tokens": 210549961.0, - "step": 174970 - }, - { - "entropy": 1.9221460312604903, - "epoch": 0.5424226901195713, - "grad_norm": 8.352112770080566, - "learning_rate": 3.4349861981289017e-06, - "loss": 0.4817, - "mean_token_accuracy": 0.8526023477315903, - "num_tokens": 210561178.0, - "step": 174980 - }, - { - "entropy": 1.9498736828565597, - "epoch": 0.5424536892446211, - "grad_norm": 7.754190921783447, - "learning_rate": 3.434888048094504e-06, - "loss": 0.4683, - "mean_token_accuracy": 0.8518366366624832, - "num_tokens": 210571601.0, - "step": 174990 - }, - { - "entropy": 1.8772225648164749, - "epoch": 0.5424846883696708, - "grad_norm": 9.029406547546387, - "learning_rate": 3.4347899064731345e-06, - "loss": 0.4659, - "mean_token_accuracy": 0.8565710127353668, - "num_tokens": 210583068.0, - "step": 175000 - }, - { - "entropy": 1.8856910184025764, - "epoch": 0.5425156874947205, - "grad_norm": 8.368382453918457, - "learning_rate": 3.4346917732635916e-06, - "loss": 0.4603, - "mean_token_accuracy": 0.853217662870884, - "num_tokens": 210595557.0, - "step": 175010 - }, - { - "entropy": 1.8868588835000992, - "epoch": 0.5425466866197701, - "grad_norm": 8.443853378295898, - "learning_rate": 3.4345936484646737e-06, - "loss": 0.4536, - "mean_token_accuracy": 0.846336479485035, - "num_tokens": 210606720.0, - "step": 175020 - }, - { - "entropy": 1.8998877912759782, - "epoch": 0.5425776857448199, - "grad_norm": 3.720510482788086, - "learning_rate": 3.4344955320751795e-06, - "loss": 0.4661, - "mean_token_accuracy": 0.8508727207779885, - "num_tokens": 210618725.0, - "step": 175030 - }, - { - "entropy": 1.8702963769435883, - "epoch": 0.5426086848698696, - "grad_norm": 7.714414596557617, - "learning_rate": 3.4343974240939077e-06, - "loss": 0.4281, - "mean_token_accuracy": 0.855791012942791, - "num_tokens": 210631454.0, - "step": 175040 - }, - { - "entropy": 1.9237717658281326, - "epoch": 0.5426396839949192, - "grad_norm": 7.232386112213135, - "learning_rate": 3.4342993245196576e-06, - "loss": 0.4663, - "mean_token_accuracy": 0.8514825507998467, - "num_tokens": 210643027.0, - "step": 175050 - }, - { - "entropy": 1.9704272598028183, - "epoch": 0.5426706831199689, - "grad_norm": 8.11324691772461, - "learning_rate": 3.434201233351228e-06, - "loss": 0.4937, - "mean_token_accuracy": 0.8477542325854301, - "num_tokens": 210654106.0, - "step": 175060 - }, - { - "entropy": 1.9354610219597816, - "epoch": 0.5427016822450186, - "grad_norm": 9.615270614624023, - "learning_rate": 3.43410315058742e-06, - "loss": 0.4594, - "mean_token_accuracy": 0.862465412914753, - "num_tokens": 210665825.0, - "step": 175070 - }, - { - "entropy": 1.8135445401072503, - "epoch": 0.5427326813700684, - "grad_norm": 4.195128917694092, - "learning_rate": 3.4340050762270326e-06, - "loss": 0.3732, - "mean_token_accuracy": 0.8759765163064003, - "num_tokens": 210678215.0, - "step": 175080 - }, - { - "entropy": 1.8600399553775788, - "epoch": 0.542763680495118, - "grad_norm": 7.729010581970215, - "learning_rate": 3.4339070102688653e-06, - "loss": 0.4025, - "mean_token_accuracy": 0.8706375062465668, - "num_tokens": 210690496.0, - "step": 175090 - }, - { - "entropy": 1.84743193089962, - "epoch": 0.5427946796201677, - "grad_norm": 3.6065328121185303, - "learning_rate": 3.433808952711719e-06, - "loss": 0.3923, - "mean_token_accuracy": 0.8589259907603264, - "num_tokens": 210703213.0, - "step": 175100 - }, - { - "entropy": 1.8852507174015045, - "epoch": 0.5428256787452174, - "grad_norm": 7.954366683959961, - "learning_rate": 3.433710903554394e-06, - "loss": 0.4549, - "mean_token_accuracy": 0.8529264152050018, - "num_tokens": 210715201.0, - "step": 175110 - }, - { - "entropy": 1.8923664793372155, - "epoch": 0.5428566778702671, - "grad_norm": 7.592563152313232, - "learning_rate": 3.433612862795692e-06, - "loss": 0.4806, - "mean_token_accuracy": 0.8424620926380157, - "num_tokens": 210727901.0, - "step": 175120 - }, - { - "entropy": 1.9149468883872032, - "epoch": 0.5428876769953168, - "grad_norm": 7.934050559997559, - "learning_rate": 3.4335148304344136e-06, - "loss": 0.4417, - "mean_token_accuracy": 0.8597536578774452, - "num_tokens": 210738953.0, - "step": 175130 - }, - { - "entropy": 1.8892448142170906, - "epoch": 0.5429186761203665, - "grad_norm": 8.11552619934082, - "learning_rate": 3.43341680646936e-06, - "loss": 0.42, - "mean_token_accuracy": 0.862630070745945, - "num_tokens": 210751021.0, - "step": 175140 - }, - { - "entropy": 1.958571094274521, - "epoch": 0.5429496752454162, - "grad_norm": 8.172052383422852, - "learning_rate": 3.433318790899332e-06, - "loss": 0.4662, - "mean_token_accuracy": 0.8557330414652824, - "num_tokens": 210761979.0, - "step": 175150 - }, - { - "entropy": 1.8148248717188835, - "epoch": 0.5429806743704659, - "grad_norm": 7.902945041656494, - "learning_rate": 3.4332207837231325e-06, - "loss": 0.3968, - "mean_token_accuracy": 0.8547403365373611, - "num_tokens": 210775482.0, - "step": 175160 - }, - { - "entropy": 1.862857685983181, - "epoch": 0.5430116734955156, - "grad_norm": 11.147188186645508, - "learning_rate": 3.433122784939563e-06, - "loss": 0.4253, - "mean_token_accuracy": 0.8630274266004563, - "num_tokens": 210788441.0, - "step": 175170 - }, - { - "entropy": 1.923531498014927, - "epoch": 0.5430426726205653, - "grad_norm": 9.402071952819824, - "learning_rate": 3.433024794547426e-06, - "loss": 0.4787, - "mean_token_accuracy": 0.8527959004044533, - "num_tokens": 210799409.0, - "step": 175180 - }, - { - "entropy": 1.8989013388752938, - "epoch": 0.5430736717456149, - "grad_norm": 8.027140617370605, - "learning_rate": 3.432926812545524e-06, - "loss": 0.4114, - "mean_token_accuracy": 0.8585892334580422, - "num_tokens": 210811487.0, - "step": 175190 - }, - { - "entropy": 1.8911652371287346, - "epoch": 0.5431046708706647, - "grad_norm": 8.578186988830566, - "learning_rate": 3.4328288389326593e-06, - "loss": 0.4712, - "mean_token_accuracy": 0.8501900911331177, - "num_tokens": 210824056.0, - "step": 175200 - }, - { - "entropy": 1.902529464662075, - "epoch": 0.5431356699957144, - "grad_norm": 7.3062896728515625, - "learning_rate": 3.4327308737076353e-06, - "loss": 0.4403, - "mean_token_accuracy": 0.8597478345036507, - "num_tokens": 210836589.0, - "step": 175210 - }, - { - "entropy": 1.9480405256152153, - "epoch": 0.5431666691207641, - "grad_norm": 7.592820644378662, - "learning_rate": 3.4326329168692556e-06, - "loss": 0.4486, - "mean_token_accuracy": 0.8596141144633294, - "num_tokens": 210848034.0, - "step": 175220 - }, - { - "entropy": 1.8504156574606896, - "epoch": 0.5431976682458137, - "grad_norm": 8.794576644897461, - "learning_rate": 3.432534968416322e-06, - "loss": 0.4322, - "mean_token_accuracy": 0.8582611083984375, - "num_tokens": 210860350.0, - "step": 175230 - }, - { - "entropy": 1.8938454136252403, - "epoch": 0.5432286673708635, - "grad_norm": 8.417749404907227, - "learning_rate": 3.4324370283476405e-06, - "loss": 0.4335, - "mean_token_accuracy": 0.8565092936158181, - "num_tokens": 210872269.0, - "step": 175240 - }, - { - "entropy": 1.9709706351161003, - "epoch": 0.5432596664959132, - "grad_norm": 10.069972038269043, - "learning_rate": 3.4323390966620135e-06, - "loss": 0.4962, - "mean_token_accuracy": 0.8481551617383957, - "num_tokens": 210883417.0, - "step": 175250 - }, - { - "entropy": 1.9295583993196488, - "epoch": 0.5432906656209628, - "grad_norm": 8.450238227844238, - "learning_rate": 3.4322411733582455e-06, - "loss": 0.4716, - "mean_token_accuracy": 0.840464486181736, - "num_tokens": 210894817.0, - "step": 175260 - }, - { - "entropy": 1.9116766616702079, - "epoch": 0.5433216647460125, - "grad_norm": 8.815253257751465, - "learning_rate": 3.432143258435141e-06, - "loss": 0.4855, - "mean_token_accuracy": 0.8442767933011055, - "num_tokens": 210907004.0, - "step": 175270 - }, - { - "entropy": 1.80950618237257, - "epoch": 0.5433526638710623, - "grad_norm": 8.208104133605957, - "learning_rate": 3.432045351891505e-06, - "loss": 0.3741, - "mean_token_accuracy": 0.8676191523671151, - "num_tokens": 210920640.0, - "step": 175280 - }, - { - "entropy": 1.8785382106900215, - "epoch": 0.543383662996112, - "grad_norm": 9.787672996520996, - "learning_rate": 3.4319474537261416e-06, - "loss": 0.4403, - "mean_token_accuracy": 0.8466727659106255, - "num_tokens": 210933513.0, - "step": 175290 - }, - { - "entropy": 1.9406921476125718, - "epoch": 0.5434146621211616, - "grad_norm": 8.736618995666504, - "learning_rate": 3.4318495639378563e-06, - "loss": 0.4468, - "mean_token_accuracy": 0.857597853243351, - "num_tokens": 210944604.0, - "step": 175300 - }, - { - "entropy": 1.9792003378272056, - "epoch": 0.5434456612462113, - "grad_norm": 9.496161460876465, - "learning_rate": 3.431751682525455e-06, - "loss": 0.5123, - "mean_token_accuracy": 0.8454146191477776, - "num_tokens": 210955527.0, - "step": 175310 - }, - { - "entropy": 1.838035662472248, - "epoch": 0.543476660371261, - "grad_norm": 4.342799186706543, - "learning_rate": 3.431653809487742e-06, - "loss": 0.3837, - "mean_token_accuracy": 0.8712288469076157, - "num_tokens": 210967578.0, - "step": 175320 - }, - { - "entropy": 1.8781815335154532, - "epoch": 0.5435076594963107, - "grad_norm": 9.674783706665039, - "learning_rate": 3.4315559448235246e-06, - "loss": 0.449, - "mean_token_accuracy": 0.8497504383325577, - "num_tokens": 210979425.0, - "step": 175330 - }, - { - "entropy": 1.9270706087350846, - "epoch": 0.5435386586213604, - "grad_norm": 8.616156578063965, - "learning_rate": 3.431458088531609e-06, - "loss": 0.4758, - "mean_token_accuracy": 0.852242237329483, - "num_tokens": 210990495.0, - "step": 175340 - }, - { - "entropy": 1.890567271411419, - "epoch": 0.5435696577464101, - "grad_norm": 8.244331359863281, - "learning_rate": 3.4313602406107998e-06, - "loss": 0.4215, - "mean_token_accuracy": 0.8536026954650879, - "num_tokens": 211002476.0, - "step": 175350 - }, - { - "entropy": 1.9067375272512437, - "epoch": 0.5436006568714598, - "grad_norm": 3.6741037368774414, - "learning_rate": 3.4312624010599042e-06, - "loss": 0.4467, - "mean_token_accuracy": 0.8597292706370354, - "num_tokens": 211013821.0, - "step": 175360 - }, - { - "entropy": 1.8930379897356033, - "epoch": 0.5436316559965095, - "grad_norm": 7.21564245223999, - "learning_rate": 3.43116456987773e-06, - "loss": 0.4242, - "mean_token_accuracy": 0.8594876885414123, - "num_tokens": 211025851.0, - "step": 175370 - }, - { - "entropy": 1.823582974076271, - "epoch": 0.5436626551215592, - "grad_norm": 9.629420280456543, - "learning_rate": 3.431066747063083e-06, - "loss": 0.3963, - "mean_token_accuracy": 0.8635643497109413, - "num_tokens": 211039218.0, - "step": 175380 - }, - { - "entropy": 1.878513753414154, - "epoch": 0.5436936542466089, - "grad_norm": 8.077526092529297, - "learning_rate": 3.4309689326147717e-06, - "loss": 0.4583, - "mean_token_accuracy": 0.8498103663325309, - "num_tokens": 211051346.0, - "step": 175390 - }, - { - "entropy": 1.9916471809148788, - "epoch": 0.5437246533716585, - "grad_norm": 8.912239074707031, - "learning_rate": 3.430871126531603e-06, - "loss": 0.4276, - "mean_token_accuracy": 0.8641511112451553, - "num_tokens": 211061719.0, - "step": 175400 - }, - { - "entropy": 1.8105146452784537, - "epoch": 0.5437556524967083, - "grad_norm": 2.607809066772461, - "learning_rate": 3.430773328812384e-06, - "loss": 0.4415, - "mean_token_accuracy": 0.857674777507782, - "num_tokens": 211075875.0, - "step": 175410 - }, - { - "entropy": 1.9355085432529449, - "epoch": 0.543786651621758, - "grad_norm": 7.287834167480469, - "learning_rate": 3.4306755394559235e-06, - "loss": 0.4681, - "mean_token_accuracy": 0.8538926064968109, - "num_tokens": 211086848.0, - "step": 175420 - }, - { - "entropy": 1.9138874024152757, - "epoch": 0.5438176507468077, - "grad_norm": 7.868054389953613, - "learning_rate": 3.4305777584610297e-06, - "loss": 0.4848, - "mean_token_accuracy": 0.8511651039123536, - "num_tokens": 211098638.0, - "step": 175430 - }, - { - "entropy": 1.8980089619755744, - "epoch": 0.5438486498718573, - "grad_norm": 3.7297019958496094, - "learning_rate": 3.4304799858265103e-06, - "loss": 0.4274, - "mean_token_accuracy": 0.8571511924266815, - "num_tokens": 211110170.0, - "step": 175440 - }, - { - "entropy": 1.9528877034783363, - "epoch": 0.5438796489969071, - "grad_norm": 3.9986767768859863, - "learning_rate": 3.430382221551175e-06, - "loss": 0.4903, - "mean_token_accuracy": 0.8474042981863021, - "num_tokens": 211121931.0, - "step": 175450 - }, - { - "entropy": 1.9155747339129447, - "epoch": 0.5439106481219568, - "grad_norm": 4.115497589111328, - "learning_rate": 3.4302844656338325e-06, - "loss": 0.4581, - "mean_token_accuracy": 0.8570854663848877, - "num_tokens": 211134004.0, - "step": 175460 - }, - { - "entropy": 1.9859370857477188, - "epoch": 0.5439416472470064, - "grad_norm": 8.700798034667969, - "learning_rate": 3.4301867180732913e-06, - "loss": 0.4981, - "mean_token_accuracy": 0.8435089200735092, - "num_tokens": 211144638.0, - "step": 175470 - }, - { - "entropy": 1.8553397893905639, - "epoch": 0.5439726463720561, - "grad_norm": 8.085160255432129, - "learning_rate": 3.4300889788683617e-06, - "loss": 0.3936, - "mean_token_accuracy": 0.8684851303696632, - "num_tokens": 211156786.0, - "step": 175480 - }, - { - "entropy": 1.8702120378613472, - "epoch": 0.5440036454971059, - "grad_norm": 8.793665885925293, - "learning_rate": 3.429991248017853e-06, - "loss": 0.4362, - "mean_token_accuracy": 0.8538280829787255, - "num_tokens": 211168883.0, - "step": 175490 - }, - { - "entropy": 1.8888561949133873, - "epoch": 0.5440346446221556, - "grad_norm": 3.943817138671875, - "learning_rate": 3.429893525520575e-06, - "loss": 0.4232, - "mean_token_accuracy": 0.8594100803136826, - "num_tokens": 211180344.0, - "step": 175500 - }, - { - "entropy": 1.9499733239412307, - "epoch": 0.5440656437472052, - "grad_norm": 8.029459953308105, - "learning_rate": 3.4297958113753376e-06, - "loss": 0.4739, - "mean_token_accuracy": 0.8499315157532692, - "num_tokens": 211191260.0, - "step": 175510 - }, - { - "entropy": 1.9230097450315953, - "epoch": 0.5440966428722549, - "grad_norm": 8.456263542175293, - "learning_rate": 3.4296981055809513e-06, - "loss": 0.4608, - "mean_token_accuracy": 0.8554961159825325, - "num_tokens": 211202649.0, - "step": 175520 - }, - { - "entropy": 1.8276287920773029, - "epoch": 0.5441276419973047, - "grad_norm": 5.568211078643799, - "learning_rate": 3.429600408136228e-06, - "loss": 0.4078, - "mean_token_accuracy": 0.8503857553005219, - "num_tokens": 211216056.0, - "step": 175530 - }, - { - "entropy": 1.957040660083294, - "epoch": 0.5441586411223543, - "grad_norm": 8.783206939697266, - "learning_rate": 3.429502719039976e-06, - "loss": 0.5035, - "mean_token_accuracy": 0.8379690006375313, - "num_tokens": 211227214.0, - "step": 175540 - }, - { - "entropy": 1.8432976469397544, - "epoch": 0.544189640247404, - "grad_norm": 4.159402847290039, - "learning_rate": 3.4294050382910083e-06, - "loss": 0.3847, - "mean_token_accuracy": 0.8681906953454017, - "num_tokens": 211240465.0, - "step": 175550 - }, - { - "entropy": 1.8789378896355629, - "epoch": 0.5442206393724537, - "grad_norm": 6.207590103149414, - "learning_rate": 3.4293073658881355e-06, - "loss": 0.4417, - "mean_token_accuracy": 0.8544572427868843, - "num_tokens": 211252444.0, - "step": 175560 - }, - { - "entropy": 1.82925843000412, - "epoch": 0.5442516384975034, - "grad_norm": 4.576752662658691, - "learning_rate": 3.429209701830169e-06, - "loss": 0.41, - "mean_token_accuracy": 0.8618702232837677, - "num_tokens": 211265393.0, - "step": 175570 - }, - { - "entropy": 1.920812802016735, - "epoch": 0.5442826376225531, - "grad_norm": 7.142611503601074, - "learning_rate": 3.429112046115921e-06, - "loss": 0.4923, - "mean_token_accuracy": 0.8376043096184731, - "num_tokens": 211276859.0, - "step": 175580 - }, - { - "entropy": 1.8717590168118476, - "epoch": 0.5443136367476028, - "grad_norm": 7.412079334259033, - "learning_rate": 3.4290143987442045e-06, - "loss": 0.3956, - "mean_token_accuracy": 0.8562609612941742, - "num_tokens": 211289884.0, - "step": 175590 - }, - { - "entropy": 1.8445143483579158, - "epoch": 0.5443446358726525, - "grad_norm": 9.400655746459961, - "learning_rate": 3.4289167597138296e-06, - "loss": 0.3788, - "mean_token_accuracy": 0.8686357572674751, - "num_tokens": 211302516.0, - "step": 175600 - }, - { - "entropy": 1.8606164276599884, - "epoch": 0.5443756349977021, - "grad_norm": 8.446548461914062, - "learning_rate": 3.4288191290236093e-06, - "loss": 0.443, - "mean_token_accuracy": 0.8458468541502953, - "num_tokens": 211315066.0, - "step": 175610 - }, - { - "entropy": 1.8773271977901458, - "epoch": 0.5444066341227519, - "grad_norm": 8.17170524597168, - "learning_rate": 3.428721506672358e-06, - "loss": 0.4073, - "mean_token_accuracy": 0.863808399438858, - "num_tokens": 211327052.0, - "step": 175620 - }, - { - "entropy": 1.8070940539240836, - "epoch": 0.5444376332478016, - "grad_norm": 2.7977521419525146, - "learning_rate": 3.4286238926588865e-06, - "loss": 0.3686, - "mean_token_accuracy": 0.8716510728001594, - "num_tokens": 211340446.0, - "step": 175630 - }, - { - "entropy": 1.8379455134272575, - "epoch": 0.5444686323728513, - "grad_norm": 3.4940185546875, - "learning_rate": 3.4285262869820103e-06, - "loss": 0.417, - "mean_token_accuracy": 0.8602952301502228, - "num_tokens": 211352351.0, - "step": 175640 - }, - { - "entropy": 1.8413410797715186, - "epoch": 0.5444996314979009, - "grad_norm": 3.8549675941467285, - "learning_rate": 3.428428689640541e-06, - "loss": 0.3772, - "mean_token_accuracy": 0.8687475189566612, - "num_tokens": 211364643.0, - "step": 175650 - }, - { - "entropy": 1.9668766289949418, - "epoch": 0.5445306306229507, - "grad_norm": 7.134676456451416, - "learning_rate": 3.4283311006332927e-06, - "loss": 0.4793, - "mean_token_accuracy": 0.85071252733469, - "num_tokens": 211376493.0, - "step": 175660 - }, - { - "entropy": 1.8720296204090119, - "epoch": 0.5445616297480004, - "grad_norm": 8.542624473571777, - "learning_rate": 3.42823351995908e-06, - "loss": 0.3931, - "mean_token_accuracy": 0.8693165212869645, - "num_tokens": 211388683.0, - "step": 175670 - }, - { - "entropy": 1.997011986374855, - "epoch": 0.54459262887305, - "grad_norm": 8.116767883300781, - "learning_rate": 3.4281359476167157e-06, - "loss": 0.5041, - "mean_token_accuracy": 0.8428088009357453, - "num_tokens": 211399341.0, - "step": 175680 - }, - { - "entropy": 1.9090626895427705, - "epoch": 0.5446236279980997, - "grad_norm": 7.649209022521973, - "learning_rate": 3.428038383605016e-06, - "loss": 0.4293, - "mean_token_accuracy": 0.8594685822725296, - "num_tokens": 211411098.0, - "step": 175690 - }, - { - "entropy": 1.9082271412014962, - "epoch": 0.5446546271231495, - "grad_norm": 7.6990838050842285, - "learning_rate": 3.427940827922794e-06, - "loss": 0.4221, - "mean_token_accuracy": 0.8626911997795105, - "num_tokens": 211422535.0, - "step": 175700 - }, - { - "entropy": 1.8366917654871942, - "epoch": 0.5446856262481992, - "grad_norm": 3.823190689086914, - "learning_rate": 3.4278432805688655e-06, - "loss": 0.4119, - "mean_token_accuracy": 0.8561408147215843, - "num_tokens": 211434911.0, - "step": 175710 - }, - { - "entropy": 1.8189725920557975, - "epoch": 0.5447166253732488, - "grad_norm": 7.969996929168701, - "learning_rate": 3.4277457415420452e-06, - "loss": 0.4019, - "mean_token_accuracy": 0.8670765489339829, - "num_tokens": 211446980.0, - "step": 175720 - }, - { - "entropy": 1.8865886434912682, - "epoch": 0.5447476244982985, - "grad_norm": 12.047924995422363, - "learning_rate": 3.4276482108411475e-06, - "loss": 0.43, - "mean_token_accuracy": 0.8559050917625427, - "num_tokens": 211459883.0, - "step": 175730 - }, - { - "entropy": 1.908797726035118, - "epoch": 0.5447786236233483, - "grad_norm": 9.070921897888184, - "learning_rate": 3.42755068846499e-06, - "loss": 0.4925, - "mean_token_accuracy": 0.8463908329606056, - "num_tokens": 211470730.0, - "step": 175740 - }, - { - "entropy": 1.9791014522314072, - "epoch": 0.544809622748398, - "grad_norm": 7.4087066650390625, - "learning_rate": 3.427453174412387e-06, - "loss": 0.4655, - "mean_token_accuracy": 0.8555495351552963, - "num_tokens": 211481448.0, - "step": 175750 - }, - { - "entropy": 1.8049452945590019, - "epoch": 0.5448406218734476, - "grad_norm": 9.106644630432129, - "learning_rate": 3.4273556686821547e-06, - "loss": 0.4439, - "mean_token_accuracy": 0.8545056328177452, - "num_tokens": 211494686.0, - "step": 175760 - }, - { - "entropy": 1.8799980938434602, - "epoch": 0.5448716209984973, - "grad_norm": 9.619363784790039, - "learning_rate": 3.4272581712731103e-06, - "loss": 0.4607, - "mean_token_accuracy": 0.8563680201768875, - "num_tokens": 211507092.0, - "step": 175770 - }, - { - "entropy": 1.9642152100801469, - "epoch": 0.5449026201235471, - "grad_norm": 9.396842002868652, - "learning_rate": 3.427160682184069e-06, - "loss": 0.4747, - "mean_token_accuracy": 0.8502561032772065, - "num_tokens": 211517813.0, - "step": 175780 - }, - { - "entropy": 1.896457839012146, - "epoch": 0.5449336192485967, - "grad_norm": 8.072653770446777, - "learning_rate": 3.427063201413849e-06, - "loss": 0.4271, - "mean_token_accuracy": 0.8577140867710114, - "num_tokens": 211529323.0, - "step": 175790 - }, - { - "entropy": 1.8286516055464745, - "epoch": 0.5449646183736464, - "grad_norm": 3.8755486011505127, - "learning_rate": 3.4269657289612652e-06, - "loss": 0.3747, - "mean_token_accuracy": 0.8703928470611573, - "num_tokens": 211543025.0, - "step": 175800 - }, - { - "entropy": 1.915991945564747, - "epoch": 0.5449956174986961, - "grad_norm": 7.096658229827881, - "learning_rate": 3.4268682648251365e-06, - "loss": 0.4537, - "mean_token_accuracy": 0.862190768122673, - "num_tokens": 211554435.0, - "step": 175810 - }, - { - "entropy": 1.9708215206861497, - "epoch": 0.5450266166237457, - "grad_norm": 9.02193832397461, - "learning_rate": 3.42677080900428e-06, - "loss": 0.5086, - "mean_token_accuracy": 0.8442001223564148, - "num_tokens": 211565658.0, - "step": 175820 - }, - { - "entropy": 1.8491972595453263, - "epoch": 0.5450576157487955, - "grad_norm": 7.748945713043213, - "learning_rate": 3.426673361497514e-06, - "loss": 0.4015, - "mean_token_accuracy": 0.8688301354646683, - "num_tokens": 211577795.0, - "step": 175830 - }, - { - "entropy": 1.7991741731762887, - "epoch": 0.5450886148738452, - "grad_norm": 4.371496677398682, - "learning_rate": 3.426575922303655e-06, - "loss": 0.3885, - "mean_token_accuracy": 0.8593105405569077, - "num_tokens": 211590921.0, - "step": 175840 - }, - { - "entropy": 1.888825187087059, - "epoch": 0.5451196139988949, - "grad_norm": 8.067505836486816, - "learning_rate": 3.4264784914215223e-06, - "loss": 0.4248, - "mean_token_accuracy": 0.8572896763682365, - "num_tokens": 211602924.0, - "step": 175850 - }, - { - "entropy": 1.926749548316002, - "epoch": 0.5451506131239445, - "grad_norm": 7.764368534088135, - "learning_rate": 3.4263810688499335e-06, - "loss": 0.5012, - "mean_token_accuracy": 0.8423927545547485, - "num_tokens": 211614393.0, - "step": 175860 - }, - { - "entropy": 1.876363991200924, - "epoch": 0.5451816122489943, - "grad_norm": 8.829561233520508, - "learning_rate": 3.4262836545877082e-06, - "loss": 0.4313, - "mean_token_accuracy": 0.8627447575330734, - "num_tokens": 211626201.0, - "step": 175870 - }, - { - "entropy": 1.8496167272329331, - "epoch": 0.545212611374044, - "grad_norm": 4.570156574249268, - "learning_rate": 3.426186248633664e-06, - "loss": 0.4114, - "mean_token_accuracy": 0.8569568589329719, - "num_tokens": 211638710.0, - "step": 175880 - }, - { - "entropy": 1.9157718800008297, - "epoch": 0.5452436104990936, - "grad_norm": 8.592246055603027, - "learning_rate": 3.4260888509866207e-06, - "loss": 0.4602, - "mean_token_accuracy": 0.8535981699824333, - "num_tokens": 211650495.0, - "step": 175890 - }, - { - "entropy": 1.9073502153158188, - "epoch": 0.5452746096241433, - "grad_norm": 8.481475830078125, - "learning_rate": 3.425991461645398e-06, - "loss": 0.4699, - "mean_token_accuracy": 0.8480674445629119, - "num_tokens": 211662190.0, - "step": 175900 - }, - { - "entropy": 1.8318237490952014, - "epoch": 0.5453056087491931, - "grad_norm": 7.986082077026367, - "learning_rate": 3.4258940806088153e-06, - "loss": 0.371, - "mean_token_accuracy": 0.8599618077278137, - "num_tokens": 211674333.0, - "step": 175910 - }, - { - "entropy": 1.8723966658115387, - "epoch": 0.5453366078742428, - "grad_norm": 4.092220783233643, - "learning_rate": 3.425796707875692e-06, - "loss": 0.4584, - "mean_token_accuracy": 0.8486889436841011, - "num_tokens": 211686356.0, - "step": 175920 - }, - { - "entropy": 1.9107608869671822, - "epoch": 0.5453676069992924, - "grad_norm": 7.9507222175598145, - "learning_rate": 3.4256993434448477e-06, - "loss": 0.4382, - "mean_token_accuracy": 0.8596844106912613, - "num_tokens": 211697780.0, - "step": 175930 - }, - { - "entropy": 1.8922893926501274, - "epoch": 0.5453986061243421, - "grad_norm": 7.145480155944824, - "learning_rate": 3.4256019873151043e-06, - "loss": 0.4013, - "mean_token_accuracy": 0.869250500202179, - "num_tokens": 211709704.0, - "step": 175940 - }, - { - "entropy": 1.7841238886117936, - "epoch": 0.5454296052493919, - "grad_norm": 6.294280529022217, - "learning_rate": 3.425504639485281e-06, - "loss": 0.3564, - "mean_token_accuracy": 0.8691599667072296, - "num_tokens": 211723529.0, - "step": 175950 - }, - { - "entropy": 1.892759595811367, - "epoch": 0.5454606043744415, - "grad_norm": 9.486949920654297, - "learning_rate": 3.425407299954198e-06, - "loss": 0.4968, - "mean_token_accuracy": 0.852388060092926, - "num_tokens": 211736158.0, - "step": 175960 - }, - { - "entropy": 1.8831520915031432, - "epoch": 0.5454916034994912, - "grad_norm": 3.6384406089782715, - "learning_rate": 3.4253099687206783e-06, - "loss": 0.4208, - "mean_token_accuracy": 0.8568282395601272, - "num_tokens": 211748342.0, - "step": 175970 - }, - { - "entropy": 1.9316042974591254, - "epoch": 0.5455226026245409, - "grad_norm": 8.037728309631348, - "learning_rate": 3.425212645783541e-06, - "loss": 0.4168, - "mean_token_accuracy": 0.865487614274025, - "num_tokens": 211759533.0, - "step": 175980 - }, - { - "entropy": 1.9403535202145576, - "epoch": 0.5455536017495907, - "grad_norm": 8.629094123840332, - "learning_rate": 3.425115331141609e-06, - "loss": 0.4796, - "mean_token_accuracy": 0.8469337105751038, - "num_tokens": 211771568.0, - "step": 175990 - }, - { - "entropy": 1.931604516506195, - "epoch": 0.5455846008746403, - "grad_norm": 8.681797981262207, - "learning_rate": 3.4250180247937037e-06, - "loss": 0.4437, - "mean_token_accuracy": 0.858528833091259, - "num_tokens": 211783261.0, - "step": 176000 - }, - { - "entropy": 1.8988345205783843, - "epoch": 0.54561559999969, - "grad_norm": 9.937957763671875, - "learning_rate": 3.424920726738647e-06, - "loss": 0.4417, - "mean_token_accuracy": 0.8475104600191117, - "num_tokens": 211795458.0, - "step": 176010 - }, - { - "entropy": 1.9326810777187347, - "epoch": 0.5456465991247397, - "grad_norm": 9.935134887695312, - "learning_rate": 3.42482343697526e-06, - "loss": 0.4285, - "mean_token_accuracy": 0.8590706020593644, - "num_tokens": 211806886.0, - "step": 176020 - }, - { - "entropy": 1.8864729851484299, - "epoch": 0.5456775982497895, - "grad_norm": 2.4956181049346924, - "learning_rate": 3.424726155502366e-06, - "loss": 0.4375, - "mean_token_accuracy": 0.8551818341016769, - "num_tokens": 211819657.0, - "step": 176030 - }, - { - "entropy": 1.9797317951917648, - "epoch": 0.5457085973748391, - "grad_norm": 7.554134845733643, - "learning_rate": 3.4246288823187878e-06, - "loss": 0.4961, - "mean_token_accuracy": 0.8407960489392281, - "num_tokens": 211831292.0, - "step": 176040 - }, - { - "entropy": 1.8862545937299728, - "epoch": 0.5457395964998888, - "grad_norm": 4.226990222930908, - "learning_rate": 3.4245316174233486e-06, - "loss": 0.4034, - "mean_token_accuracy": 0.8609147578477859, - "num_tokens": 211842562.0, - "step": 176050 - }, - { - "entropy": 1.9039891347289086, - "epoch": 0.5457705956249385, - "grad_norm": 6.957450866699219, - "learning_rate": 3.4244343608148693e-06, - "loss": 0.4324, - "mean_token_accuracy": 0.854132778942585, - "num_tokens": 211854877.0, - "step": 176060 - }, - { - "entropy": 1.9222405552864075, - "epoch": 0.5458015947499881, - "grad_norm": 7.754835605621338, - "learning_rate": 3.4243371124921765e-06, - "loss": 0.4526, - "mean_token_accuracy": 0.8527779713273048, - "num_tokens": 211866852.0, - "step": 176070 - }, - { - "entropy": 1.85050872862339, - "epoch": 0.5458325938750379, - "grad_norm": 8.02645206451416, - "learning_rate": 3.424239872454091e-06, - "loss": 0.4146, - "mean_token_accuracy": 0.8586536258459091, - "num_tokens": 211880242.0, - "step": 176080 - }, - { - "entropy": 2.002450078725815, - "epoch": 0.5458635930000876, - "grad_norm": 7.224414825439453, - "learning_rate": 3.4241426406994375e-06, - "loss": 0.4909, - "mean_token_accuracy": 0.848207288980484, - "num_tokens": 211891539.0, - "step": 176090 - }, - { - "entropy": 1.9079863592982291, - "epoch": 0.5458945921251372, - "grad_norm": 6.804903507232666, - "learning_rate": 3.424045417227041e-06, - "loss": 0.3954, - "mean_token_accuracy": 0.8677893698215484, - "num_tokens": 211903998.0, - "step": 176100 - }, - { - "entropy": 1.9110632047057152, - "epoch": 0.5459255912501869, - "grad_norm": 8.103818893432617, - "learning_rate": 3.4239482020357246e-06, - "loss": 0.4405, - "mean_token_accuracy": 0.8578624501824379, - "num_tokens": 211915738.0, - "step": 176110 - }, - { - "entropy": 1.9270017936825752, - "epoch": 0.5459565903752367, - "grad_norm": 8.514951705932617, - "learning_rate": 3.423850995124313e-06, - "loss": 0.4535, - "mean_token_accuracy": 0.8561173945665359, - "num_tokens": 211927936.0, - "step": 176120 - }, - { - "entropy": 1.8232133775949477, - "epoch": 0.5459875895002864, - "grad_norm": 3.2978906631469727, - "learning_rate": 3.4237537964916305e-06, - "loss": 0.3729, - "mean_token_accuracy": 0.8648741155862808, - "num_tokens": 211941066.0, - "step": 176130 - }, - { - "entropy": 2.0083392202854156, - "epoch": 0.546018588625336, - "grad_norm": 8.977083206176758, - "learning_rate": 3.4236566061365035e-06, - "loss": 0.4844, - "mean_token_accuracy": 0.8511565178632736, - "num_tokens": 211951856.0, - "step": 176140 - }, - { - "entropy": 1.9292023211717606, - "epoch": 0.5460495877503857, - "grad_norm": 8.130477905273438, - "learning_rate": 3.4235594240577557e-06, - "loss": 0.4471, - "mean_token_accuracy": 0.8568751186132431, - "num_tokens": 211963564.0, - "step": 176150 - }, - { - "entropy": 1.9371128112077713, - "epoch": 0.5460805868754355, - "grad_norm": 7.703921318054199, - "learning_rate": 3.423462250254213e-06, - "loss": 0.434, - "mean_token_accuracy": 0.8611638352274895, - "num_tokens": 211974754.0, - "step": 176160 - }, - { - "entropy": 1.9029362380504609, - "epoch": 0.5461115860004851, - "grad_norm": 7.6146111488342285, - "learning_rate": 3.423365084724701e-06, - "loss": 0.4769, - "mean_token_accuracy": 0.8572221979498863, - "num_tokens": 211986813.0, - "step": 176170 - }, - { - "entropy": 1.8494912847876548, - "epoch": 0.5461425851255348, - "grad_norm": 8.22357177734375, - "learning_rate": 3.4232679274680464e-06, - "loss": 0.3777, - "mean_token_accuracy": 0.8618330404162406, - "num_tokens": 211999495.0, - "step": 176180 - }, - { - "entropy": 1.8791470482945443, - "epoch": 0.5461735842505845, - "grad_norm": 9.076931953430176, - "learning_rate": 3.423170778483074e-06, - "loss": 0.4411, - "mean_token_accuracy": 0.854033924639225, - "num_tokens": 212011756.0, - "step": 176190 - }, - { - "entropy": 1.8349644854664802, - "epoch": 0.5462045833756343, - "grad_norm": 9.453182220458984, - "learning_rate": 3.4230736377686115e-06, - "loss": 0.4189, - "mean_token_accuracy": 0.8578223511576653, - "num_tokens": 212024516.0, - "step": 176200 - }, - { - "entropy": 1.873240815103054, - "epoch": 0.5462355825006839, - "grad_norm": 3.723876714706421, - "learning_rate": 3.4229765053234847e-06, - "loss": 0.418, - "mean_token_accuracy": 0.8600479438900948, - "num_tokens": 212036668.0, - "step": 176210 - }, - { - "entropy": 1.881936539709568, - "epoch": 0.5462665816257336, - "grad_norm": 3.735361099243164, - "learning_rate": 3.42287938114652e-06, - "loss": 0.4051, - "mean_token_accuracy": 0.8556406930088997, - "num_tokens": 212049504.0, - "step": 176220 - }, - { - "entropy": 1.8623495429754258, - "epoch": 0.5462975807507833, - "grad_norm": 7.01729154586792, - "learning_rate": 3.422782265236545e-06, - "loss": 0.4193, - "mean_token_accuracy": 0.8536556616425515, - "num_tokens": 212061955.0, - "step": 176230 - }, - { - "entropy": 1.8628789693117143, - "epoch": 0.546328579875833, - "grad_norm": 10.14441967010498, - "learning_rate": 3.422685157592387e-06, - "loss": 0.4202, - "mean_token_accuracy": 0.8602953940629959, - "num_tokens": 212074670.0, - "step": 176240 - }, - { - "entropy": 1.8812679126858711, - "epoch": 0.5463595790008827, - "grad_norm": 7.10945987701416, - "learning_rate": 3.422588058212874e-06, - "loss": 0.436, - "mean_token_accuracy": 0.8632075861096382, - "num_tokens": 212086210.0, - "step": 176250 - }, - { - "entropy": 1.891566054522991, - "epoch": 0.5463905781259324, - "grad_norm": 9.948281288146973, - "learning_rate": 3.422490967096833e-06, - "loss": 0.4718, - "mean_token_accuracy": 0.8480950236320496, - "num_tokens": 212098498.0, - "step": 176260 - }, - { - "entropy": 1.897965356707573, - "epoch": 0.5464215772509821, - "grad_norm": 8.800344467163086, - "learning_rate": 3.422393884243092e-06, - "loss": 0.4876, - "mean_token_accuracy": 0.8485878229141235, - "num_tokens": 212110580.0, - "step": 176270 - }, - { - "entropy": 1.944313894212246, - "epoch": 0.5464525763760318, - "grad_norm": 7.566771984100342, - "learning_rate": 3.422296809650479e-06, - "loss": 0.438, - "mean_token_accuracy": 0.8603140458464622, - "num_tokens": 212121926.0, - "step": 176280 - }, - { - "entropy": 1.8406140804290771, - "epoch": 0.5464835755010815, - "grad_norm": 3.525524377822876, - "learning_rate": 3.4221997433178233e-06, - "loss": 0.4135, - "mean_token_accuracy": 0.867090655863285, - "num_tokens": 212134194.0, - "step": 176290 - }, - { - "entropy": 1.8961115300655365, - "epoch": 0.5465145746261312, - "grad_norm": 8.181693077087402, - "learning_rate": 3.4221026852439532e-06, - "loss": 0.434, - "mean_token_accuracy": 0.8568276822566986, - "num_tokens": 212146986.0, - "step": 176300 - }, - { - "entropy": 1.9399204954504967, - "epoch": 0.5465455737511808, - "grad_norm": 10.183042526245117, - "learning_rate": 3.4220056354276985e-06, - "loss": 0.4762, - "mean_token_accuracy": 0.8448749095201492, - "num_tokens": 212157998.0, - "step": 176310 - }, - { - "entropy": 1.841617615520954, - "epoch": 0.5465765728762305, - "grad_norm": 8.120635986328125, - "learning_rate": 3.4219085938678864e-06, - "loss": 0.4175, - "mean_token_accuracy": 0.853398184478283, - "num_tokens": 212170935.0, - "step": 176320 - }, - { - "entropy": 1.9700021624565125, - "epoch": 0.5466075720012803, - "grad_norm": 8.018360137939453, - "learning_rate": 3.4218115605633472e-06, - "loss": 0.519, - "mean_token_accuracy": 0.8460304543375969, - "num_tokens": 212181943.0, - "step": 176330 - }, - { - "entropy": 1.9688458293676376, - "epoch": 0.54663857112633, - "grad_norm": 8.161321640014648, - "learning_rate": 3.4217145355129107e-06, - "loss": 0.4783, - "mean_token_accuracy": 0.8540339127182961, - "num_tokens": 212192813.0, - "step": 176340 - }, - { - "entropy": 1.8571071356534958, - "epoch": 0.5466695702513796, - "grad_norm": 9.385038375854492, - "learning_rate": 3.421617518715407e-06, - "loss": 0.4457, - "mean_token_accuracy": 0.857005886733532, - "num_tokens": 212205062.0, - "step": 176350 - }, - { - "entropy": 1.9261850699782372, - "epoch": 0.5467005693764293, - "grad_norm": 7.309939861297607, - "learning_rate": 3.4215205101696656e-06, - "loss": 0.4476, - "mean_token_accuracy": 0.8574215292930603, - "num_tokens": 212216611.0, - "step": 176360 - }, - { - "entropy": 1.8523739635944367, - "epoch": 0.5467315685014791, - "grad_norm": 9.52919864654541, - "learning_rate": 3.4214235098745175e-06, - "loss": 0.4102, - "mean_token_accuracy": 0.8608949258923531, - "num_tokens": 212228523.0, - "step": 176370 - }, - { - "entropy": 1.9219273343682288, - "epoch": 0.5467625676265288, - "grad_norm": 4.339505672454834, - "learning_rate": 3.4213265178287926e-06, - "loss": 0.4414, - "mean_token_accuracy": 0.8524685263633728, - "num_tokens": 212240103.0, - "step": 176380 - }, - { - "entropy": 1.8322541788220406, - "epoch": 0.5467935667515784, - "grad_norm": 8.57495403289795, - "learning_rate": 3.4212295340313217e-06, - "loss": 0.3702, - "mean_token_accuracy": 0.8736605256795883, - "num_tokens": 212252653.0, - "step": 176390 - }, - { - "entropy": 1.9648008421063423, - "epoch": 0.5468245658766281, - "grad_norm": 9.930930137634277, - "learning_rate": 3.4211325584809363e-06, - "loss": 0.4705, - "mean_token_accuracy": 0.8485071301460266, - "num_tokens": 212264077.0, - "step": 176400 - }, - { - "entropy": 1.867118813097477, - "epoch": 0.5468555650016779, - "grad_norm": 7.651360511779785, - "learning_rate": 3.421035591176467e-06, - "loss": 0.4321, - "mean_token_accuracy": 0.8613616541028023, - "num_tokens": 212275869.0, - "step": 176410 - }, - { - "entropy": 1.8624306157231332, - "epoch": 0.5468865641267275, - "grad_norm": 9.126770973205566, - "learning_rate": 3.420938632116746e-06, - "loss": 0.4176, - "mean_token_accuracy": 0.8574478432536126, - "num_tokens": 212288650.0, - "step": 176420 - }, - { - "entropy": 1.9164914295077324, - "epoch": 0.5469175632517772, - "grad_norm": 4.5546464920043945, - "learning_rate": 3.420841681300604e-06, - "loss": 0.4307, - "mean_token_accuracy": 0.8624847665429115, - "num_tokens": 212300163.0, - "step": 176430 - }, - { - "entropy": 1.9037815898656845, - "epoch": 0.5469485623768269, - "grad_norm": 7.759820938110352, - "learning_rate": 3.4207447387268737e-06, - "loss": 0.4273, - "mean_token_accuracy": 0.8627464339137078, - "num_tokens": 212311103.0, - "step": 176440 - }, - { - "entropy": 1.7071588724851607, - "epoch": 0.5469795615018767, - "grad_norm": 2.4559144973754883, - "learning_rate": 3.4206478043943875e-06, - "loss": 0.3622, - "mean_token_accuracy": 0.8709375485777855, - "num_tokens": 212325146.0, - "step": 176450 - }, - { - "entropy": 1.8931735321879386, - "epoch": 0.5470105606269263, - "grad_norm": 9.125152587890625, - "learning_rate": 3.4205508783019776e-06, - "loss": 0.4376, - "mean_token_accuracy": 0.8564422413706779, - "num_tokens": 212337549.0, - "step": 176460 - }, - { - "entropy": 1.9534398928284644, - "epoch": 0.547041559751976, - "grad_norm": 7.810306072235107, - "learning_rate": 3.4204539604484755e-06, - "loss": 0.4873, - "mean_token_accuracy": 0.8494616970419884, - "num_tokens": 212349414.0, - "step": 176470 - }, - { - "entropy": 1.910618807375431, - "epoch": 0.5470725588770257, - "grad_norm": 3.4500370025634766, - "learning_rate": 3.4203570508327157e-06, - "loss": 0.4225, - "mean_token_accuracy": 0.8558408632874489, - "num_tokens": 212361418.0, - "step": 176480 - }, - { - "entropy": 1.8364159688353539, - "epoch": 0.5471035580020754, - "grad_norm": 8.173087120056152, - "learning_rate": 3.4202601494535305e-06, - "loss": 0.397, - "mean_token_accuracy": 0.8700931832194329, - "num_tokens": 212374620.0, - "step": 176490 - }, - { - "entropy": 1.8050838321447373, - "epoch": 0.5471345571271251, - "grad_norm": 8.026020050048828, - "learning_rate": 3.420163256309753e-06, - "loss": 0.437, - "mean_token_accuracy": 0.8587026596069336, - "num_tokens": 212388166.0, - "step": 176500 - }, - { - "entropy": 1.8974261716008187, - "epoch": 0.5471655562521748, - "grad_norm": 10.081825256347656, - "learning_rate": 3.420066371400217e-06, - "loss": 0.4402, - "mean_token_accuracy": 0.8566187128424645, - "num_tokens": 212400535.0, - "step": 176510 - }, - { - "entropy": 1.9240003824234009, - "epoch": 0.5471965553772244, - "grad_norm": 9.594392776489258, - "learning_rate": 3.4199694947237566e-06, - "loss": 0.4192, - "mean_token_accuracy": 0.8579287111759186, - "num_tokens": 212412419.0, - "step": 176520 - }, - { - "entropy": 1.9196303203701972, - "epoch": 0.5472275545022742, - "grad_norm": 9.45782470703125, - "learning_rate": 3.4198726262792054e-06, - "loss": 0.4266, - "mean_token_accuracy": 0.8646796032786369, - "num_tokens": 212423898.0, - "step": 176530 - }, - { - "entropy": 1.825406238436699, - "epoch": 0.5472585536273239, - "grad_norm": 7.693227291107178, - "learning_rate": 3.419775766065398e-06, - "loss": 0.4145, - "mean_token_accuracy": 0.8612094387412071, - "num_tokens": 212436667.0, - "step": 176540 - }, - { - "entropy": 1.9636890798807145, - "epoch": 0.5472895527523736, - "grad_norm": 8.634857177734375, - "learning_rate": 3.4196789140811686e-06, - "loss": 0.4654, - "mean_token_accuracy": 0.8505135089159012, - "num_tokens": 212447404.0, - "step": 176550 - }, - { - "entropy": 1.981712308526039, - "epoch": 0.5473205518774232, - "grad_norm": 8.67991828918457, - "learning_rate": 3.419582070325352e-06, - "loss": 0.4891, - "mean_token_accuracy": 0.8396664500236511, - "num_tokens": 212458756.0, - "step": 176560 - }, - { - "entropy": 1.9482917726039886, - "epoch": 0.5473515510024729, - "grad_norm": 8.68653392791748, - "learning_rate": 3.419485234796783e-06, - "loss": 0.4686, - "mean_token_accuracy": 0.8521886184811592, - "num_tokens": 212469914.0, - "step": 176570 - }, - { - "entropy": 1.8495470777153968, - "epoch": 0.5473825501275227, - "grad_norm": 8.767205238342285, - "learning_rate": 3.4193884074942973e-06, - "loss": 0.475, - "mean_token_accuracy": 0.8421885713934898, - "num_tokens": 212482617.0, - "step": 176580 - }, - { - "entropy": 1.925833135843277, - "epoch": 0.5474135492525724, - "grad_norm": 9.569941520690918, - "learning_rate": 3.419291588416729e-06, - "loss": 0.4369, - "mean_token_accuracy": 0.8548354566097259, - "num_tokens": 212493736.0, - "step": 176590 - }, - { - "entropy": 1.927579800784588, - "epoch": 0.547444548377622, - "grad_norm": 6.636186122894287, - "learning_rate": 3.419194777562916e-06, - "loss": 0.4381, - "mean_token_accuracy": 0.8604348719120025, - "num_tokens": 212505362.0, - "step": 176600 - }, - { - "entropy": 1.884330753982067, - "epoch": 0.5474755475026717, - "grad_norm": 3.8287642002105713, - "learning_rate": 3.4190979749316914e-06, - "loss": 0.4075, - "mean_token_accuracy": 0.8639156118035316, - "num_tokens": 212517532.0, - "step": 176610 - }, - { - "entropy": 1.8772628799080848, - "epoch": 0.5475065466277215, - "grad_norm": 3.413010835647583, - "learning_rate": 3.419001180521894e-06, - "loss": 0.4058, - "mean_token_accuracy": 0.8584465265274048, - "num_tokens": 212529948.0, - "step": 176620 - }, - { - "entropy": 1.788725607097149, - "epoch": 0.5475375457527711, - "grad_norm": 3.859731674194336, - "learning_rate": 3.418904394332358e-06, - "loss": 0.4216, - "mean_token_accuracy": 0.8587246060371398, - "num_tokens": 212543917.0, - "step": 176630 - }, - { - "entropy": 2.0069529950618743, - "epoch": 0.5475685448778208, - "grad_norm": 9.134621620178223, - "learning_rate": 3.418807616361922e-06, - "loss": 0.4914, - "mean_token_accuracy": 0.8446884289383888, - "num_tokens": 212554218.0, - "step": 176640 - }, - { - "entropy": 1.9287556931376457, - "epoch": 0.5475995440028705, - "grad_norm": 7.816536903381348, - "learning_rate": 3.41871084660942e-06, - "loss": 0.4572, - "mean_token_accuracy": 0.856015394628048, - "num_tokens": 212565960.0, - "step": 176650 - }, - { - "entropy": 1.9495777159929275, - "epoch": 0.5476305431279203, - "grad_norm": 8.957378387451172, - "learning_rate": 3.418614085073691e-06, - "loss": 0.4634, - "mean_token_accuracy": 0.8514742866158486, - "num_tokens": 212576897.0, - "step": 176660 - }, - { - "entropy": 1.9367633134126663, - "epoch": 0.5476615422529699, - "grad_norm": 7.440584659576416, - "learning_rate": 3.4185173317535724e-06, - "loss": 0.5019, - "mean_token_accuracy": 0.8516129940748215, - "num_tokens": 212588931.0, - "step": 176670 - }, - { - "entropy": 1.826993039250374, - "epoch": 0.5476925413780196, - "grad_norm": 5.0734968185424805, - "learning_rate": 3.4184205866479007e-06, - "loss": 0.4267, - "mean_token_accuracy": 0.8655605405569077, - "num_tokens": 212601793.0, - "step": 176680 - }, - { - "entropy": 1.8689114913344382, - "epoch": 0.5477235405030693, - "grad_norm": 7.733332633972168, - "learning_rate": 3.418323849755514e-06, - "loss": 0.431, - "mean_token_accuracy": 0.8604257687926292, - "num_tokens": 212613657.0, - "step": 176690 - }, - { - "entropy": 1.8902839928865434, - "epoch": 0.547754539628119, - "grad_norm": 9.017712593078613, - "learning_rate": 3.4182271210752506e-06, - "loss": 0.434, - "mean_token_accuracy": 0.8590465992689132, - "num_tokens": 212625786.0, - "step": 176700 - }, - { - "entropy": 1.832987940311432, - "epoch": 0.5477855387531687, - "grad_norm": 4.72005033493042, - "learning_rate": 3.418130400605948e-06, - "loss": 0.3948, - "mean_token_accuracy": 0.8583321020007133, - "num_tokens": 212638893.0, - "step": 176710 - }, - { - "entropy": 1.9117455899715423, - "epoch": 0.5478165378782184, - "grad_norm": 9.520256996154785, - "learning_rate": 3.418033688346445e-06, - "loss": 0.4751, - "mean_token_accuracy": 0.8495288237929344, - "num_tokens": 212650833.0, - "step": 176720 - }, - { - "entropy": 1.9174165919423103, - "epoch": 0.547847537003268, - "grad_norm": 9.004129409790039, - "learning_rate": 3.41793698429558e-06, - "loss": 0.4574, - "mean_token_accuracy": 0.855744905769825, - "num_tokens": 212662564.0, - "step": 176730 - }, - { - "entropy": 1.7828210145235062, - "epoch": 0.5478785361283178, - "grad_norm": 3.5656747817993164, - "learning_rate": 3.417840288452193e-06, - "loss": 0.3524, - "mean_token_accuracy": 0.8767985865473747, - "num_tokens": 212675271.0, - "step": 176740 - }, - { - "entropy": 1.968538075685501, - "epoch": 0.5479095352533675, - "grad_norm": 8.913453102111816, - "learning_rate": 3.417743600815121e-06, - "loss": 0.4958, - "mean_token_accuracy": 0.8396434724330902, - "num_tokens": 212686870.0, - "step": 176750 - }, - { - "entropy": 1.8434936612844468, - "epoch": 0.5479405343784172, - "grad_norm": 7.429874897003174, - "learning_rate": 3.417646921383205e-06, - "loss": 0.4147, - "mean_token_accuracy": 0.859374138712883, - "num_tokens": 212699255.0, - "step": 176760 - }, - { - "entropy": 1.8811804696917533, - "epoch": 0.5479715335034668, - "grad_norm": 3.9854447841644287, - "learning_rate": 3.417550250155284e-06, - "loss": 0.4246, - "mean_token_accuracy": 0.854819868505001, - "num_tokens": 212711239.0, - "step": 176770 - }, - { - "entropy": 1.7966576874256135, - "epoch": 0.5480025326285166, - "grad_norm": 4.191843032836914, - "learning_rate": 3.417453587130197e-06, - "loss": 0.4258, - "mean_token_accuracy": 0.8515931665897369, - "num_tokens": 212725043.0, - "step": 176780 - }, - { - "entropy": 1.9064485728740692, - "epoch": 0.5480335317535663, - "grad_norm": 7.229835510253906, - "learning_rate": 3.4173569323067857e-06, - "loss": 0.4399, - "mean_token_accuracy": 0.8546797648072243, - "num_tokens": 212736700.0, - "step": 176790 - }, - { - "entropy": 1.8742914631962777, - "epoch": 0.548064530878616, - "grad_norm": 8.884461402893066, - "learning_rate": 3.4172602856838886e-06, - "loss": 0.4433, - "mean_token_accuracy": 0.8549501106142998, - "num_tokens": 212748462.0, - "step": 176800 - }, - { - "entropy": 1.9238539248704911, - "epoch": 0.5480955300036656, - "grad_norm": 7.668822765350342, - "learning_rate": 3.4171636472603475e-06, - "loss": 0.4465, - "mean_token_accuracy": 0.8567259907722473, - "num_tokens": 212760223.0, - "step": 176810 - }, - { - "entropy": 1.8961792960762978, - "epoch": 0.5481265291287153, - "grad_norm": 9.429628372192383, - "learning_rate": 3.4170670170350023e-06, - "loss": 0.4037, - "mean_token_accuracy": 0.8663010224699974, - "num_tokens": 212772026.0, - "step": 176820 - }, - { - "entropy": 1.9418221697211266, - "epoch": 0.5481575282537651, - "grad_norm": 8.488309860229492, - "learning_rate": 3.4169703950066953e-06, - "loss": 0.4807, - "mean_token_accuracy": 0.8481117516756058, - "num_tokens": 212783135.0, - "step": 176830 - }, - { - "entropy": 1.8715976506471634, - "epoch": 0.5481885273788147, - "grad_norm": 3.8496992588043213, - "learning_rate": 3.4168737811742647e-06, - "loss": 0.4583, - "mean_token_accuracy": 0.8450487107038498, - "num_tokens": 212795268.0, - "step": 176840 - }, - { - "entropy": 1.9355599626898765, - "epoch": 0.5482195265038644, - "grad_norm": 8.719645500183105, - "learning_rate": 3.4167771755365547e-06, - "loss": 0.4067, - "mean_token_accuracy": 0.8616823077201843, - "num_tokens": 212807043.0, - "step": 176850 - }, - { - "entropy": 1.962566938996315, - "epoch": 0.5482505256289141, - "grad_norm": 9.193645477294922, - "learning_rate": 3.416680578092406e-06, - "loss": 0.4672, - "mean_token_accuracy": 0.8520191237330437, - "num_tokens": 212818031.0, - "step": 176860 - }, - { - "entropy": 1.985722643136978, - "epoch": 0.5482815247539639, - "grad_norm": 8.554361343383789, - "learning_rate": 3.4165839888406603e-06, - "loss": 0.508, - "mean_token_accuracy": 0.8496146738529206, - "num_tokens": 212828755.0, - "step": 176870 - }, - { - "entropy": 1.964412897825241, - "epoch": 0.5483125238790135, - "grad_norm": 7.86843729019165, - "learning_rate": 3.41648740778016e-06, - "loss": 0.4946, - "mean_token_accuracy": 0.8463870123028755, - "num_tokens": 212839422.0, - "step": 176880 - }, - { - "entropy": 1.8350771561264991, - "epoch": 0.5483435230040632, - "grad_norm": 8.836234092712402, - "learning_rate": 3.416390834909747e-06, - "loss": 0.4558, - "mean_token_accuracy": 0.8595826268196106, - "num_tokens": 212852284.0, - "step": 176890 - }, - { - "entropy": 1.8943647101521492, - "epoch": 0.5483745221291129, - "grad_norm": 3.6767899990081787, - "learning_rate": 3.4162942702282643e-06, - "loss": 0.3818, - "mean_token_accuracy": 0.8682879105210304, - "num_tokens": 212864828.0, - "step": 176900 - }, - { - "entropy": 1.8380630269646645, - "epoch": 0.5484055212541626, - "grad_norm": 4.24230432510376, - "learning_rate": 3.416197713734554e-06, - "loss": 0.4151, - "mean_token_accuracy": 0.8496894925832749, - "num_tokens": 212878003.0, - "step": 176910 - }, - { - "entropy": 1.8945675104856492, - "epoch": 0.5484365203792123, - "grad_norm": 4.138571739196777, - "learning_rate": 3.4161011654274595e-06, - "loss": 0.4444, - "mean_token_accuracy": 0.8576994270086289, - "num_tokens": 212889803.0, - "step": 176920 - }, - { - "entropy": 1.9622309237718583, - "epoch": 0.548467519504262, - "grad_norm": 8.334486961364746, - "learning_rate": 3.416004625305824e-06, - "loss": 0.4922, - "mean_token_accuracy": 0.8503334224224091, - "num_tokens": 212900420.0, - "step": 176930 - }, - { - "entropy": 1.9361471354961395, - "epoch": 0.5484985186293116, - "grad_norm": 8.700508117675781, - "learning_rate": 3.41590809336849e-06, - "loss": 0.4483, - "mean_token_accuracy": 0.865361250936985, - "num_tokens": 212911692.0, - "step": 176940 - }, - { - "entropy": 1.9572209745645524, - "epoch": 0.5485295177543614, - "grad_norm": 7.193476676940918, - "learning_rate": 3.4158115696143034e-06, - "loss": 0.463, - "mean_token_accuracy": 0.8585177928209304, - "num_tokens": 212922654.0, - "step": 176950 - }, - { - "entropy": 1.8604034408926964, - "epoch": 0.5485605168794111, - "grad_norm": 4.305553436279297, - "learning_rate": 3.4157150540421064e-06, - "loss": 0.4161, - "mean_token_accuracy": 0.8506237909197807, - "num_tokens": 212935189.0, - "step": 176960 - }, - { - "entropy": 1.849170657992363, - "epoch": 0.5485915160044608, - "grad_norm": 8.496451377868652, - "learning_rate": 3.4156185466507438e-06, - "loss": 0.3962, - "mean_token_accuracy": 0.8654500484466553, - "num_tokens": 212948374.0, - "step": 176970 - }, - { - "entropy": 1.8641334384679795, - "epoch": 0.5486225151295104, - "grad_norm": 8.56639289855957, - "learning_rate": 3.4155220474390593e-06, - "loss": 0.4587, - "mean_token_accuracy": 0.8593708544969558, - "num_tokens": 212960091.0, - "step": 176980 - }, - { - "entropy": 1.8458046644926072, - "epoch": 0.5486535142545602, - "grad_norm": 9.526514053344727, - "learning_rate": 3.415425556405898e-06, - "loss": 0.4167, - "mean_token_accuracy": 0.8565845727920532, - "num_tokens": 212973786.0, - "step": 176990 - }, - { - "entropy": 1.902486439049244, - "epoch": 0.5486845133796099, - "grad_norm": 6.776129722595215, - "learning_rate": 3.4153290735501043e-06, - "loss": 0.4011, - "mean_token_accuracy": 0.8635582402348518, - "num_tokens": 212985730.0, - "step": 177000 - }, - { - "entropy": 1.7577807061374187, - "epoch": 0.5487155125046596, - "grad_norm": 3.811516284942627, - "learning_rate": 3.4152325988705235e-06, - "loss": 0.322, - "mean_token_accuracy": 0.8757859200239182, - "num_tokens": 212999329.0, - "step": 177010 - }, - { - "entropy": 1.9133754044771194, - "epoch": 0.5487465116297092, - "grad_norm": 9.288987159729004, - "learning_rate": 3.415136132366002e-06, - "loss": 0.4793, - "mean_token_accuracy": 0.8521696537733078, - "num_tokens": 213011377.0, - "step": 177020 - }, - { - "entropy": 1.911231203377247, - "epoch": 0.548777510754759, - "grad_norm": 9.659815788269043, - "learning_rate": 3.4150396740353836e-06, - "loss": 0.4877, - "mean_token_accuracy": 0.8533487483859062, - "num_tokens": 213023298.0, - "step": 177030 - }, - { - "entropy": 1.8330079704523086, - "epoch": 0.5488085098798087, - "grad_norm": 3.9295654296875, - "learning_rate": 3.414943223877514e-06, - "loss": 0.4096, - "mean_token_accuracy": 0.8561473324894905, - "num_tokens": 213036685.0, - "step": 177040 - }, - { - "entropy": 1.9210347287356853, - "epoch": 0.5488395090048583, - "grad_norm": 9.26740550994873, - "learning_rate": 3.4148467818912405e-06, - "loss": 0.4531, - "mean_token_accuracy": 0.8491527557373046, - "num_tokens": 213048305.0, - "step": 177050 - }, - { - "entropy": 1.762385478615761, - "epoch": 0.548870508129908, - "grad_norm": 3.844062328338623, - "learning_rate": 3.414750348075409e-06, - "loss": 0.3472, - "mean_token_accuracy": 0.8719919562339783, - "num_tokens": 213063074.0, - "step": 177060 - }, - { - "entropy": 1.8222975119948388, - "epoch": 0.5489015072549577, - "grad_norm": 6.260597229003906, - "learning_rate": 3.4146539224288642e-06, - "loss": 0.4227, - "mean_token_accuracy": 0.850552336871624, - "num_tokens": 213076571.0, - "step": 177070 - }, - { - "entropy": 1.8614796712994575, - "epoch": 0.5489325063800075, - "grad_norm": 8.594372749328613, - "learning_rate": 3.414557504950455e-06, - "loss": 0.4338, - "mean_token_accuracy": 0.8523694068193436, - "num_tokens": 213088764.0, - "step": 177080 - }, - { - "entropy": 1.7709208399057388, - "epoch": 0.5489635055050571, - "grad_norm": 3.2780303955078125, - "learning_rate": 3.414461095639028e-06, - "loss": 0.3706, - "mean_token_accuracy": 0.8679910570383071, - "num_tokens": 213103075.0, - "step": 177090 - }, - { - "entropy": 1.775453121960163, - "epoch": 0.5489945046301068, - "grad_norm": 4.352440357208252, - "learning_rate": 3.4143646944934284e-06, - "loss": 0.3508, - "mean_token_accuracy": 0.8666783288121224, - "num_tokens": 213115647.0, - "step": 177100 - }, - { - "entropy": 1.9121138677001, - "epoch": 0.5490255037551565, - "grad_norm": 9.411645889282227, - "learning_rate": 3.414268301512505e-06, - "loss": 0.4668, - "mean_token_accuracy": 0.8493054270744324, - "num_tokens": 213127680.0, - "step": 177110 - }, - { - "entropy": 1.8475042030215263, - "epoch": 0.5490565028802062, - "grad_norm": 7.3609232902526855, - "learning_rate": 3.4141719166951058e-06, - "loss": 0.4844, - "mean_token_accuracy": 0.8392117530107498, - "num_tokens": 213140708.0, - "step": 177120 - }, - { - "entropy": 1.9383308678865432, - "epoch": 0.5490875020052559, - "grad_norm": 9.486416816711426, - "learning_rate": 3.414075540040077e-06, - "loss": 0.4934, - "mean_token_accuracy": 0.8431765273213386, - "num_tokens": 213151379.0, - "step": 177130 - }, - { - "entropy": 1.9540158912539483, - "epoch": 0.5491185011303056, - "grad_norm": 11.021252632141113, - "learning_rate": 3.4139791715462683e-06, - "loss": 0.4682, - "mean_token_accuracy": 0.8570782467722893, - "num_tokens": 213162500.0, - "step": 177140 - }, - { - "entropy": 1.8902826353907585, - "epoch": 0.5491495002553552, - "grad_norm": 9.033095359802246, - "learning_rate": 3.413882811212527e-06, - "loss": 0.4514, - "mean_token_accuracy": 0.8544860497117043, - "num_tokens": 213174567.0, - "step": 177150 - }, - { - "entropy": 1.874396102130413, - "epoch": 0.549180499380405, - "grad_norm": 8.876323699951172, - "learning_rate": 3.413786459037702e-06, - "loss": 0.4313, - "mean_token_accuracy": 0.8585318312048912, - "num_tokens": 213186648.0, - "step": 177160 - }, - { - "entropy": 1.86837887018919, - "epoch": 0.5492114985054547, - "grad_norm": 8.945581436157227, - "learning_rate": 3.4136901150206408e-06, - "loss": 0.3991, - "mean_token_accuracy": 0.8691040202975273, - "num_tokens": 213198967.0, - "step": 177170 - }, - { - "entropy": 1.917728215456009, - "epoch": 0.5492424976305044, - "grad_norm": 8.172450065612793, - "learning_rate": 3.4135937791601936e-06, - "loss": 0.4237, - "mean_token_accuracy": 0.8643875792622566, - "num_tokens": 213210339.0, - "step": 177180 - }, - { - "entropy": 1.897295144200325, - "epoch": 0.549273496755554, - "grad_norm": 8.179666519165039, - "learning_rate": 3.4134974514552092e-06, - "loss": 0.4711, - "mean_token_accuracy": 0.847081832587719, - "num_tokens": 213222457.0, - "step": 177190 - }, - { - "entropy": 1.8492035642266273, - "epoch": 0.5493044958806038, - "grad_norm": 9.86941909790039, - "learning_rate": 3.4134011319045374e-06, - "loss": 0.4404, - "mean_token_accuracy": 0.8557023197412491, - "num_tokens": 213235500.0, - "step": 177200 - }, - { - "entropy": 1.9028280600905418, - "epoch": 0.5493354950056535, - "grad_norm": 7.475229263305664, - "learning_rate": 3.413304820507027e-06, - "loss": 0.4379, - "mean_token_accuracy": 0.856609919667244, - "num_tokens": 213246948.0, - "step": 177210 - }, - { - "entropy": 1.821316310763359, - "epoch": 0.5493664941307032, - "grad_norm": 8.389219284057617, - "learning_rate": 3.4132085172615283e-06, - "loss": 0.3987, - "mean_token_accuracy": 0.8570780649781227, - "num_tokens": 213259384.0, - "step": 177220 - }, - { - "entropy": 1.8156044453382492, - "epoch": 0.5493974932557528, - "grad_norm": 8.002120018005371, - "learning_rate": 3.413112222166891e-06, - "loss": 0.3753, - "mean_token_accuracy": 0.8619963735342026, - "num_tokens": 213272870.0, - "step": 177230 - }, - { - "entropy": 1.8711471810936928, - "epoch": 0.5494284923808026, - "grad_norm": 6.479476451873779, - "learning_rate": 3.413015935221966e-06, - "loss": 0.4178, - "mean_token_accuracy": 0.8612604930996894, - "num_tokens": 213284570.0, - "step": 177240 - }, - { - "entropy": 1.8935720384120942, - "epoch": 0.5494594915058523, - "grad_norm": 3.4970638751983643, - "learning_rate": 3.412919656425603e-06, - "loss": 0.4378, - "mean_token_accuracy": 0.8482785031199456, - "num_tokens": 213297004.0, - "step": 177250 - }, - { - "entropy": 1.8641334936022758, - "epoch": 0.5494904906309019, - "grad_norm": 7.379012584686279, - "learning_rate": 3.4128233857766536e-06, - "loss": 0.4602, - "mean_token_accuracy": 0.8560739994049072, - "num_tokens": 213309632.0, - "step": 177260 - }, - { - "entropy": 1.91132210791111, - "epoch": 0.5495214897559516, - "grad_norm": 9.017353057861328, - "learning_rate": 3.4127271232739683e-06, - "loss": 0.4547, - "mean_token_accuracy": 0.8572400063276291, - "num_tokens": 213320429.0, - "step": 177270 - }, - { - "entropy": 1.8946466326713562, - "epoch": 0.5495524888810014, - "grad_norm": 10.003750801086426, - "learning_rate": 3.4126308689163976e-06, - "loss": 0.4677, - "mean_token_accuracy": 0.8475130766630172, - "num_tokens": 213331719.0, - "step": 177280 - }, - { - "entropy": 1.9153622835874557, - "epoch": 0.5495834880060511, - "grad_norm": 7.587196350097656, - "learning_rate": 3.4125346227027955e-06, - "loss": 0.4754, - "mean_token_accuracy": 0.8539967402815819, - "num_tokens": 213343303.0, - "step": 177290 - }, - { - "entropy": 1.9106049820780755, - "epoch": 0.5496144871311007, - "grad_norm": 6.409841537475586, - "learning_rate": 3.4124383846320103e-06, - "loss": 0.4409, - "mean_token_accuracy": 0.8548314660787583, - "num_tokens": 213355306.0, - "step": 177300 - }, - { - "entropy": 1.869752712547779, - "epoch": 0.5496454862561504, - "grad_norm": 7.410600185394287, - "learning_rate": 3.412342154702895e-06, - "loss": 0.3933, - "mean_token_accuracy": 0.8623670980334281, - "num_tokens": 213367450.0, - "step": 177310 - }, - { - "entropy": 1.9162461429834365, - "epoch": 0.5496764853812001, - "grad_norm": 7.369466781616211, - "learning_rate": 3.4122459329143033e-06, - "loss": 0.4437, - "mean_token_accuracy": 0.8539816990494729, - "num_tokens": 213378872.0, - "step": 177320 - }, - { - "entropy": 1.9060623481869698, - "epoch": 0.5497074845062498, - "grad_norm": 8.427303314208984, - "learning_rate": 3.4121497192650853e-06, - "loss": 0.4491, - "mean_token_accuracy": 0.8590623676776886, - "num_tokens": 213390091.0, - "step": 177330 - }, - { - "entropy": 1.8645298302173614, - "epoch": 0.5497384836312995, - "grad_norm": 2.700556993484497, - "learning_rate": 3.4120535137540954e-06, - "loss": 0.4582, - "mean_token_accuracy": 0.8587430387735366, - "num_tokens": 213402523.0, - "step": 177340 - }, - { - "entropy": 1.841796737909317, - "epoch": 0.5497694827563492, - "grad_norm": 3.1640028953552246, - "learning_rate": 3.4119573163801855e-06, - "loss": 0.3854, - "mean_token_accuracy": 0.8694748774170875, - "num_tokens": 213414146.0, - "step": 177350 - }, - { - "entropy": 1.8999506399035453, - "epoch": 0.5498004818813989, - "grad_norm": 7.3486552238464355, - "learning_rate": 3.411861127142209e-06, - "loss": 0.4441, - "mean_token_accuracy": 0.8545324966311455, - "num_tokens": 213425589.0, - "step": 177360 - }, - { - "entropy": 1.9142435252666474, - "epoch": 0.5498314810064486, - "grad_norm": 7.839383125305176, - "learning_rate": 3.411764946039018e-06, - "loss": 0.4879, - "mean_token_accuracy": 0.8502327635884285, - "num_tokens": 213436889.0, - "step": 177370 - }, - { - "entropy": 1.767600019276142, - "epoch": 0.5498624801314983, - "grad_norm": 5.790437698364258, - "learning_rate": 3.4116687730694664e-06, - "loss": 0.3863, - "mean_token_accuracy": 0.862785404920578, - "num_tokens": 213450524.0, - "step": 177380 - }, - { - "entropy": 1.9111123844981193, - "epoch": 0.549893479256548, - "grad_norm": 8.019339561462402, - "learning_rate": 3.411572608232409e-06, - "loss": 0.4339, - "mean_token_accuracy": 0.8633904352784156, - "num_tokens": 213461443.0, - "step": 177390 - }, - { - "entropy": 1.9108331441879272, - "epoch": 0.5499244783815976, - "grad_norm": 8.2950439453125, - "learning_rate": 3.4114764515266983e-06, - "loss": 0.413, - "mean_token_accuracy": 0.8633098036050797, - "num_tokens": 213472425.0, - "step": 177400 - }, - { - "entropy": 1.923930747807026, - "epoch": 0.5499554775066474, - "grad_norm": 8.409180641174316, - "learning_rate": 3.411380302951189e-06, - "loss": 0.5004, - "mean_token_accuracy": 0.8442714124917984, - "num_tokens": 213483267.0, - "step": 177410 - }, - { - "entropy": 1.6820106402039527, - "epoch": 0.5499864766316971, - "grad_norm": 3.9981110095977783, - "learning_rate": 3.4112841625047366e-06, - "loss": 0.3014, - "mean_token_accuracy": 0.8790968149900437, - "num_tokens": 213497545.0, - "step": 177420 - }, - { - "entropy": 1.9298490598797797, - "epoch": 0.5500174757567468, - "grad_norm": 9.856298446655273, - "learning_rate": 3.411188030186193e-06, - "loss": 0.4957, - "mean_token_accuracy": 0.8398898839950562, - "num_tokens": 213508743.0, - "step": 177430 - }, - { - "entropy": 1.8625278130173684, - "epoch": 0.5500484748817964, - "grad_norm": 6.03195858001709, - "learning_rate": 3.411091905994416e-06, - "loss": 0.4289, - "mean_token_accuracy": 0.8528363704681396, - "num_tokens": 213520882.0, - "step": 177440 - }, - { - "entropy": 1.9055146545171737, - "epoch": 0.5500794740068462, - "grad_norm": 8.34013557434082, - "learning_rate": 3.410995789928259e-06, - "loss": 0.4526, - "mean_token_accuracy": 0.8541034102439881, - "num_tokens": 213533088.0, - "step": 177450 - }, - { - "entropy": 1.9039269611239433, - "epoch": 0.5501104731318959, - "grad_norm": 7.9535346031188965, - "learning_rate": 3.4108996819865776e-06, - "loss": 0.5036, - "mean_token_accuracy": 0.8517986431717872, - "num_tokens": 213544684.0, - "step": 177460 - }, - { - "entropy": 1.852563814818859, - "epoch": 0.5501414722569455, - "grad_norm": 4.031369686126709, - "learning_rate": 3.4108035821682268e-06, - "loss": 0.4249, - "mean_token_accuracy": 0.8509661376476287, - "num_tokens": 213557896.0, - "step": 177470 - }, - { - "entropy": 1.9033602237701417, - "epoch": 0.5501724713819952, - "grad_norm": 8.224656105041504, - "learning_rate": 3.4107074904720634e-06, - "loss": 0.4596, - "mean_token_accuracy": 0.8506034925580025, - "num_tokens": 213569530.0, - "step": 177480 - }, - { - "entropy": 1.9195539087057114, - "epoch": 0.550203470507045, - "grad_norm": 9.235480308532715, - "learning_rate": 3.4106114068969415e-06, - "loss": 0.4282, - "mean_token_accuracy": 0.8559677705168725, - "num_tokens": 213581205.0, - "step": 177490 - }, - { - "entropy": 1.8721505463123322, - "epoch": 0.5502344696320947, - "grad_norm": 7.580912113189697, - "learning_rate": 3.4105153314417192e-06, - "loss": 0.4263, - "mean_token_accuracy": 0.8641788467764855, - "num_tokens": 213592977.0, - "step": 177500 - }, - { - "entropy": 1.8690673634409904, - "epoch": 0.5502654687571443, - "grad_norm": 9.193328857421875, - "learning_rate": 3.4104192641052523e-06, - "loss": 0.4289, - "mean_token_accuracy": 0.85512455701828, - "num_tokens": 213604540.0, - "step": 177510 - }, - { - "entropy": 1.89061731249094, - "epoch": 0.550296467882194, - "grad_norm": 8.917570114135742, - "learning_rate": 3.410323204886397e-06, - "loss": 0.4453, - "mean_token_accuracy": 0.8444926172494889, - "num_tokens": 213616613.0, - "step": 177520 - }, - { - "entropy": 1.857405199110508, - "epoch": 0.5503274670072438, - "grad_norm": 7.011704444885254, - "learning_rate": 3.4102271537840104e-06, - "loss": 0.4116, - "mean_token_accuracy": 0.855528536438942, - "num_tokens": 213629454.0, - "step": 177530 - }, - { - "entropy": 1.9259560242295266, - "epoch": 0.5503584661322934, - "grad_norm": 5.598184108734131, - "learning_rate": 3.4101311107969497e-06, - "loss": 0.4306, - "mean_token_accuracy": 0.8530476734042167, - "num_tokens": 213641852.0, - "step": 177540 - }, - { - "entropy": 1.9008552610874176, - "epoch": 0.5503894652573431, - "grad_norm": 8.267748832702637, - "learning_rate": 3.410035075924073e-06, - "loss": 0.4357, - "mean_token_accuracy": 0.8589813798666001, - "num_tokens": 213653420.0, - "step": 177550 - }, - { - "entropy": 1.8692269191145896, - "epoch": 0.5504204643823928, - "grad_norm": 7.912377834320068, - "learning_rate": 3.4099390491642353e-06, - "loss": 0.3968, - "mean_token_accuracy": 0.8613191261887551, - "num_tokens": 213665530.0, - "step": 177560 - }, - { - "entropy": 1.8758762568235396, - "epoch": 0.5504514635074425, - "grad_norm": 7.625454902648926, - "learning_rate": 3.4098430305162966e-06, - "loss": 0.4776, - "mean_token_accuracy": 0.848248441517353, - "num_tokens": 213677266.0, - "step": 177570 - }, - { - "entropy": 1.923265139758587, - "epoch": 0.5504824626324922, - "grad_norm": 9.186751365661621, - "learning_rate": 3.4097470199791143e-06, - "loss": 0.4604, - "mean_token_accuracy": 0.8520221054553986, - "num_tokens": 213689292.0, - "step": 177580 - }, - { - "entropy": 1.8164562806487083, - "epoch": 0.5505134617575419, - "grad_norm": 7.9483113288879395, - "learning_rate": 3.409651017551546e-06, - "loss": 0.3789, - "mean_token_accuracy": 0.8560757398605346, - "num_tokens": 213702018.0, - "step": 177590 - }, - { - "entropy": 1.9234494641423225, - "epoch": 0.5505444608825916, - "grad_norm": 7.260690689086914, - "learning_rate": 3.409555023232452e-06, - "loss": 0.4335, - "mean_token_accuracy": 0.8594354689121246, - "num_tokens": 213713623.0, - "step": 177600 - }, - { - "entropy": 1.8692736342549323, - "epoch": 0.5505754600076412, - "grad_norm": 7.934293270111084, - "learning_rate": 3.409459037020688e-06, - "loss": 0.4769, - "mean_token_accuracy": 0.8551886230707169, - "num_tokens": 213726229.0, - "step": 177610 - }, - { - "entropy": 1.8698809787631034, - "epoch": 0.550606459132691, - "grad_norm": 8.365594863891602, - "learning_rate": 3.4093630589151157e-06, - "loss": 0.407, - "mean_token_accuracy": 0.8643792599439621, - "num_tokens": 213738593.0, - "step": 177620 - }, - { - "entropy": 1.8786334797739983, - "epoch": 0.5506374582577407, - "grad_norm": 3.4549944400787354, - "learning_rate": 3.4092670889145925e-06, - "loss": 0.4304, - "mean_token_accuracy": 0.8562889456748962, - "num_tokens": 213750663.0, - "step": 177630 - }, - { - "entropy": 1.8907557740807532, - "epoch": 0.5506684573827904, - "grad_norm": 8.042924880981445, - "learning_rate": 3.4091711270179773e-06, - "loss": 0.4265, - "mean_token_accuracy": 0.8546307638287545, - "num_tokens": 213762199.0, - "step": 177640 - }, - { - "entropy": 1.8805308431386947, - "epoch": 0.55069945650784, - "grad_norm": 9.960572242736816, - "learning_rate": 3.409075173224132e-06, - "loss": 0.427, - "mean_token_accuracy": 0.8486173838376999, - "num_tokens": 213774792.0, - "step": 177650 - }, - { - "entropy": 1.9167560517787934, - "epoch": 0.5507304556328898, - "grad_norm": 7.627819538116455, - "learning_rate": 3.4089792275319143e-06, - "loss": 0.4545, - "mean_token_accuracy": 0.8527622357010841, - "num_tokens": 213786490.0, - "step": 177660 - }, - { - "entropy": 1.8611066058278083, - "epoch": 0.5507614547579395, - "grad_norm": 7.6696295738220215, - "learning_rate": 3.408883289940184e-06, - "loss": 0.425, - "mean_token_accuracy": 0.8635546818375588, - "num_tokens": 213798884.0, - "step": 177670 - }, - { - "entropy": 1.9489878684282302, - "epoch": 0.5507924538829891, - "grad_norm": 3.252110004425049, - "learning_rate": 3.408787360447803e-06, - "loss": 0.4764, - "mean_token_accuracy": 0.8568515613675117, - "num_tokens": 213810509.0, - "step": 177680 - }, - { - "entropy": 1.925804816186428, - "epoch": 0.5508234530080388, - "grad_norm": 8.378000259399414, - "learning_rate": 3.4086914390536304e-06, - "loss": 0.4567, - "mean_token_accuracy": 0.8556630969047546, - "num_tokens": 213822155.0, - "step": 177690 - }, - { - "entropy": 1.8882811307907104, - "epoch": 0.5508544521330886, - "grad_norm": 7.851574897766113, - "learning_rate": 3.408595525756528e-06, - "loss": 0.4318, - "mean_token_accuracy": 0.854667441546917, - "num_tokens": 213834194.0, - "step": 177700 - }, - { - "entropy": 1.9184136673808099, - "epoch": 0.5508854512581383, - "grad_norm": 3.8098981380462646, - "learning_rate": 3.4084996205553554e-06, - "loss": 0.4328, - "mean_token_accuracy": 0.85739406645298, - "num_tokens": 213845395.0, - "step": 177710 - }, - { - "entropy": 1.955088695883751, - "epoch": 0.5509164503831879, - "grad_norm": 7.2159953117370605, - "learning_rate": 3.4084037234489744e-06, - "loss": 0.4649, - "mean_token_accuracy": 0.8552487745881081, - "num_tokens": 213855942.0, - "step": 177720 - }, - { - "entropy": 1.876141707599163, - "epoch": 0.5509474495082376, - "grad_norm": 7.4983229637146, - "learning_rate": 3.4083078344362464e-06, - "loss": 0.4537, - "mean_token_accuracy": 0.8518855974078179, - "num_tokens": 213867853.0, - "step": 177730 - }, - { - "entropy": 1.835733339190483, - "epoch": 0.5509784486332874, - "grad_norm": 4.303890228271484, - "learning_rate": 3.4082119535160323e-06, - "loss": 0.4295, - "mean_token_accuracy": 0.8588640108704567, - "num_tokens": 213880352.0, - "step": 177740 - }, - { - "entropy": 1.9237274020910262, - "epoch": 0.551009447758337, - "grad_norm": 6.245169162750244, - "learning_rate": 3.4081160806871948e-06, - "loss": 0.4426, - "mean_token_accuracy": 0.8537571519613266, - "num_tokens": 213891709.0, - "step": 177750 - }, - { - "entropy": 1.9308589279651642, - "epoch": 0.5510404468833867, - "grad_norm": 11.33820629119873, - "learning_rate": 3.4080202159485964e-06, - "loss": 0.4657, - "mean_token_accuracy": 0.8526531413197518, - "num_tokens": 213903261.0, - "step": 177760 - }, - { - "entropy": 1.8311443090438844, - "epoch": 0.5510714460084364, - "grad_norm": 7.7392802238464355, - "learning_rate": 3.4079243592990975e-06, - "loss": 0.368, - "mean_token_accuracy": 0.8646254643797875, - "num_tokens": 213916092.0, - "step": 177770 - }, - { - "entropy": 1.8906705155968666, - "epoch": 0.551102445133486, - "grad_norm": 7.602465629577637, - "learning_rate": 3.4078285107375612e-06, - "loss": 0.4493, - "mean_token_accuracy": 0.85352271348238, - "num_tokens": 213926945.0, - "step": 177780 - }, - { - "entropy": 1.7623763605952263, - "epoch": 0.5511334442585358, - "grad_norm": 2.4720168113708496, - "learning_rate": 3.4077326702628514e-06, - "loss": 0.3741, - "mean_token_accuracy": 0.8656698524951935, - "num_tokens": 213940183.0, - "step": 177790 - }, - { - "entropy": 1.8430035665631295, - "epoch": 0.5511644433835855, - "grad_norm": 8.119751930236816, - "learning_rate": 3.4076368378738296e-06, - "loss": 0.403, - "mean_token_accuracy": 0.854887755215168, - "num_tokens": 213952829.0, - "step": 177800 - }, - { - "entropy": 1.788646037876606, - "epoch": 0.5511954425086352, - "grad_norm": 7.915525913238525, - "learning_rate": 3.40754101356936e-06, - "loss": 0.3689, - "mean_token_accuracy": 0.8624294593930244, - "num_tokens": 213965992.0, - "step": 177810 - }, - { - "entropy": 1.8792960986495018, - "epoch": 0.5512264416336848, - "grad_norm": 3.7082908153533936, - "learning_rate": 3.407445197348305e-06, - "loss": 0.5063, - "mean_token_accuracy": 0.8567387446761131, - "num_tokens": 213978690.0, - "step": 177820 - }, - { - "entropy": 1.9804147839546205, - "epoch": 0.5512574407587346, - "grad_norm": 7.826719284057617, - "learning_rate": 3.4073493892095287e-06, - "loss": 0.5066, - "mean_token_accuracy": 0.8443496122956275, - "num_tokens": 213989611.0, - "step": 177830 - }, - { - "entropy": 1.9625308007001876, - "epoch": 0.5512884398837843, - "grad_norm": 7.848978519439697, - "learning_rate": 3.4072535891518947e-06, - "loss": 0.439, - "mean_token_accuracy": 0.8623924240469932, - "num_tokens": 214000412.0, - "step": 177840 - }, - { - "entropy": 1.8899758756160736, - "epoch": 0.551319439008834, - "grad_norm": 10.778916358947754, - "learning_rate": 3.407157797174267e-06, - "loss": 0.4663, - "mean_token_accuracy": 0.8471853539347649, - "num_tokens": 214011641.0, - "step": 177850 - }, - { - "entropy": 1.8247670635581017, - "epoch": 0.5513504381338836, - "grad_norm": 3.6938233375549316, - "learning_rate": 3.4070620132755107e-06, - "loss": 0.3959, - "mean_token_accuracy": 0.8624543026089668, - "num_tokens": 214024699.0, - "step": 177860 - }, - { - "entropy": 1.8587495237588882, - "epoch": 0.5513814372589334, - "grad_norm": 5.922456741333008, - "learning_rate": 3.4069662374544886e-06, - "loss": 0.408, - "mean_token_accuracy": 0.8634365990757942, - "num_tokens": 214037493.0, - "step": 177870 - }, - { - "entropy": 1.902451252937317, - "epoch": 0.5514124363839831, - "grad_norm": 7.852017879486084, - "learning_rate": 3.4068704697100667e-06, - "loss": 0.4425, - "mean_token_accuracy": 0.8582121372222901, - "num_tokens": 214048733.0, - "step": 177880 - }, - { - "entropy": 1.9077907755970955, - "epoch": 0.5514434355090327, - "grad_norm": 7.281789779663086, - "learning_rate": 3.4067747100411104e-06, - "loss": 0.4688, - "mean_token_accuracy": 0.8456922695040703, - "num_tokens": 214059824.0, - "step": 177890 - }, - { - "entropy": 1.94886015355587, - "epoch": 0.5514744346340824, - "grad_norm": 7.4586968421936035, - "learning_rate": 3.4066789584464825e-06, - "loss": 0.5074, - "mean_token_accuracy": 0.8491571620106697, - "num_tokens": 214070576.0, - "step": 177900 - }, - { - "entropy": 1.8953621149063111, - "epoch": 0.5515054337591322, - "grad_norm": 3.4469287395477295, - "learning_rate": 3.406583214925051e-06, - "loss": 0.4315, - "mean_token_accuracy": 0.8564003005623817, - "num_tokens": 214082785.0, - "step": 177910 - }, - { - "entropy": 1.8691608056426048, - "epoch": 0.5515364328841819, - "grad_norm": 9.114605903625488, - "learning_rate": 3.406487479475681e-06, - "loss": 0.4313, - "mean_token_accuracy": 0.8627192363142967, - "num_tokens": 214094914.0, - "step": 177920 - }, - { - "entropy": 1.9036862418055533, - "epoch": 0.5515674320092315, - "grad_norm": 7.558939456939697, - "learning_rate": 3.4063917520972363e-06, - "loss": 0.4081, - "mean_token_accuracy": 0.8584882363677024, - "num_tokens": 214107151.0, - "step": 177930 - }, - { - "entropy": 1.818896722793579, - "epoch": 0.5515984311342812, - "grad_norm": 7.9749956130981445, - "learning_rate": 3.4062960327885846e-06, - "loss": 0.4087, - "mean_token_accuracy": 0.8586197167634964, - "num_tokens": 214120094.0, - "step": 177940 - }, - { - "entropy": 1.8816650286316872, - "epoch": 0.551629430259331, - "grad_norm": 6.810136795043945, - "learning_rate": 3.406200321548592e-06, - "loss": 0.4122, - "mean_token_accuracy": 0.8652727901935577, - "num_tokens": 214131838.0, - "step": 177950 - }, - { - "entropy": 1.8971996203064918, - "epoch": 0.5516604293843806, - "grad_norm": 9.636186599731445, - "learning_rate": 3.4061046183761253e-06, - "loss": 0.4493, - "mean_token_accuracy": 0.8593866720795631, - "num_tokens": 214143806.0, - "step": 177960 - }, - { - "entropy": 1.8690408036112784, - "epoch": 0.5516914285094303, - "grad_norm": 8.90285587310791, - "learning_rate": 3.406008923270051e-06, - "loss": 0.4036, - "mean_token_accuracy": 0.8644983410835266, - "num_tokens": 214155957.0, - "step": 177970 - }, - { - "entropy": 1.931781667470932, - "epoch": 0.55172242763448, - "grad_norm": 6.843881130218506, - "learning_rate": 3.405913236229235e-06, - "loss": 0.4505, - "mean_token_accuracy": 0.8537558436393737, - "num_tokens": 214167498.0, - "step": 177980 - }, - { - "entropy": 1.9443962126970291, - "epoch": 0.5517534267595298, - "grad_norm": 8.771767616271973, - "learning_rate": 3.405817557252546e-06, - "loss": 0.4801, - "mean_token_accuracy": 0.8550885379314422, - "num_tokens": 214178482.0, - "step": 177990 - }, - { - "entropy": 1.8965649604797363, - "epoch": 0.5517844258845794, - "grad_norm": 9.75014591217041, - "learning_rate": 3.4057218863388503e-06, - "loss": 0.4606, - "mean_token_accuracy": 0.8570706993341446, - "num_tokens": 214189426.0, - "step": 178000 - }, - { - "entropy": 1.89953922778368, - "epoch": 0.5518154250096291, - "grad_norm": 8.906521797180176, - "learning_rate": 3.4056262234870164e-06, - "loss": 0.4568, - "mean_token_accuracy": 0.8522025436162949, - "num_tokens": 214201647.0, - "step": 178010 - }, - { - "entropy": 1.9386347502470016, - "epoch": 0.5518464241346788, - "grad_norm": 8.296730995178223, - "learning_rate": 3.4055305686959106e-06, - "loss": 0.4652, - "mean_token_accuracy": 0.8579264089465142, - "num_tokens": 214212554.0, - "step": 178020 - }, - { - "entropy": 1.9103853285312653, - "epoch": 0.5518774232597284, - "grad_norm": 8.537365913391113, - "learning_rate": 3.405434921964403e-06, - "loss": 0.446, - "mean_token_accuracy": 0.8576163098216056, - "num_tokens": 214223866.0, - "step": 178030 - }, - { - "entropy": 1.9078239232301712, - "epoch": 0.5519084223847782, - "grad_norm": 8.394222259521484, - "learning_rate": 3.4053392832913596e-06, - "loss": 0.4268, - "mean_token_accuracy": 0.8546977087855339, - "num_tokens": 214235513.0, - "step": 178040 - }, - { - "entropy": 1.8636566340923308, - "epoch": 0.5519394215098279, - "grad_norm": 9.244743347167969, - "learning_rate": 3.405243652675651e-06, - "loss": 0.408, - "mean_token_accuracy": 0.8607104942202568, - "num_tokens": 214247762.0, - "step": 178050 - }, - { - "entropy": 1.9099171817302705, - "epoch": 0.5519704206348776, - "grad_norm": 7.364317417144775, - "learning_rate": 3.4051480301161445e-06, - "loss": 0.4361, - "mean_token_accuracy": 0.8574780553579331, - "num_tokens": 214258815.0, - "step": 178060 - }, - { - "entropy": 1.8716615453362464, - "epoch": 0.5520014197599272, - "grad_norm": 7.7942681312561035, - "learning_rate": 3.40505241561171e-06, - "loss": 0.4218, - "mean_token_accuracy": 0.857321010529995, - "num_tokens": 214270753.0, - "step": 178070 - }, - { - "entropy": 1.8762011349201202, - "epoch": 0.552032418884977, - "grad_norm": 3.7072243690490723, - "learning_rate": 3.4049568091612157e-06, - "loss": 0.4013, - "mean_token_accuracy": 0.8608313351869583, - "num_tokens": 214282462.0, - "step": 178080 - }, - { - "entropy": 1.897752860188484, - "epoch": 0.5520634180100267, - "grad_norm": 7.348748207092285, - "learning_rate": 3.404861210763532e-06, - "loss": 0.4795, - "mean_token_accuracy": 0.8526074305176735, - "num_tokens": 214294085.0, - "step": 178090 - }, - { - "entropy": 1.939253604412079, - "epoch": 0.5520944171350763, - "grad_norm": 8.933135032653809, - "learning_rate": 3.4047656204175273e-06, - "loss": 0.4585, - "mean_token_accuracy": 0.8538300707936287, - "num_tokens": 214305365.0, - "step": 178100 - }, - { - "entropy": 1.8425916746258735, - "epoch": 0.552125416260126, - "grad_norm": 9.111211776733398, - "learning_rate": 3.404670038122073e-06, - "loss": 0.4216, - "mean_token_accuracy": 0.8560868754982949, - "num_tokens": 214318160.0, - "step": 178110 - }, - { - "entropy": 1.9103170096874238, - "epoch": 0.5521564153851758, - "grad_norm": 9.979475021362305, - "learning_rate": 3.404574463876037e-06, - "loss": 0.4577, - "mean_token_accuracy": 0.8544515520334244, - "num_tokens": 214329036.0, - "step": 178120 - }, - { - "entropy": 1.8822571218013764, - "epoch": 0.5521874145102255, - "grad_norm": 9.598880767822266, - "learning_rate": 3.404478897678291e-06, - "loss": 0.4516, - "mean_token_accuracy": 0.8551209643483162, - "num_tokens": 214341063.0, - "step": 178130 - }, - { - "entropy": 1.8942831814289094, - "epoch": 0.5522184136352751, - "grad_norm": 9.16766357421875, - "learning_rate": 3.404383339527706e-06, - "loss": 0.5151, - "mean_token_accuracy": 0.8461239233613014, - "num_tokens": 214353635.0, - "step": 178140 - }, - { - "entropy": 1.8503873273730278, - "epoch": 0.5522494127603248, - "grad_norm": 7.313621520996094, - "learning_rate": 3.404287789423152e-06, - "loss": 0.4015, - "mean_token_accuracy": 0.8708026796579361, - "num_tokens": 214366216.0, - "step": 178150 - }, - { - "entropy": 1.9046386152505874, - "epoch": 0.5522804118853746, - "grad_norm": 7.900165557861328, - "learning_rate": 3.4041922473634986e-06, - "loss": 0.4703, - "mean_token_accuracy": 0.8486365541815758, - "num_tokens": 214377957.0, - "step": 178160 - }, - { - "entropy": 1.9191514551639557, - "epoch": 0.5523114110104242, - "grad_norm": 10.395622253417969, - "learning_rate": 3.4040967133476198e-06, - "loss": 0.4728, - "mean_token_accuracy": 0.8610471084713935, - "num_tokens": 214388687.0, - "step": 178170 - }, - { - "entropy": 1.92356576025486, - "epoch": 0.5523424101354739, - "grad_norm": 7.511850833892822, - "learning_rate": 3.404001187374385e-06, - "loss": 0.4613, - "mean_token_accuracy": 0.8621419206261635, - "num_tokens": 214400793.0, - "step": 178180 - }, - { - "entropy": 1.8569615572690963, - "epoch": 0.5523734092605236, - "grad_norm": 8.08397102355957, - "learning_rate": 3.4039056694426665e-06, - "loss": 0.4074, - "mean_token_accuracy": 0.8697955429553985, - "num_tokens": 214413097.0, - "step": 178190 - }, - { - "entropy": 1.912328739464283, - "epoch": 0.5524044083855734, - "grad_norm": 7.70759391784668, - "learning_rate": 3.403810159551335e-06, - "loss": 0.4225, - "mean_token_accuracy": 0.8634463772177696, - "num_tokens": 214424684.0, - "step": 178200 - }, - { - "entropy": 1.874166764318943, - "epoch": 0.552435407510623, - "grad_norm": 4.644807815551758, - "learning_rate": 3.4037146576992636e-06, - "loss": 0.4492, - "mean_token_accuracy": 0.8502660781145096, - "num_tokens": 214437087.0, - "step": 178210 - }, - { - "entropy": 1.8312366858124733, - "epoch": 0.5524664066356727, - "grad_norm": 7.805587291717529, - "learning_rate": 3.4036191638853257e-06, - "loss": 0.4046, - "mean_token_accuracy": 0.8689468517899513, - "num_tokens": 214450085.0, - "step": 178220 - }, - { - "entropy": 1.8324200913310051, - "epoch": 0.5524974057607224, - "grad_norm": 8.242805480957031, - "learning_rate": 3.403523678108391e-06, - "loss": 0.4323, - "mean_token_accuracy": 0.8615979239344597, - "num_tokens": 214462485.0, - "step": 178230 - }, - { - "entropy": 1.7994080841541291, - "epoch": 0.5525284048857722, - "grad_norm": 4.501520156860352, - "learning_rate": 3.403428200367334e-06, - "loss": 0.3608, - "mean_token_accuracy": 0.8683733329176903, - "num_tokens": 214475717.0, - "step": 178240 - }, - { - "entropy": 1.8464299738407135, - "epoch": 0.5525594040108218, - "grad_norm": 6.4583353996276855, - "learning_rate": 3.403332730661027e-06, - "loss": 0.3707, - "mean_token_accuracy": 0.868985840678215, - "num_tokens": 214488274.0, - "step": 178250 - }, - { - "entropy": 1.8997196108102798, - "epoch": 0.5525904031358715, - "grad_norm": 8.174156188964844, - "learning_rate": 3.4032372689883443e-06, - "loss": 0.4303, - "mean_token_accuracy": 0.8552378371357918, - "num_tokens": 214500434.0, - "step": 178260 - }, - { - "entropy": 1.8080697432160378, - "epoch": 0.5526214022609212, - "grad_norm": 7.958045959472656, - "learning_rate": 3.4031418153481583e-06, - "loss": 0.3823, - "mean_token_accuracy": 0.8627307131886482, - "num_tokens": 214513847.0, - "step": 178270 - }, - { - "entropy": 1.8869553804397583, - "epoch": 0.5526524013859708, - "grad_norm": 4.627087116241455, - "learning_rate": 3.403046369739342e-06, - "loss": 0.3881, - "mean_token_accuracy": 0.8562980949878692, - "num_tokens": 214526034.0, - "step": 178280 - }, - { - "entropy": 1.8838186770677567, - "epoch": 0.5526834005110206, - "grad_norm": 6.58920431137085, - "learning_rate": 3.40295093216077e-06, - "loss": 0.4378, - "mean_token_accuracy": 0.8497677370905876, - "num_tokens": 214537809.0, - "step": 178290 - }, - { - "entropy": 1.9977750718593597, - "epoch": 0.5527143996360703, - "grad_norm": 8.55444622039795, - "learning_rate": 3.402855502611316e-06, - "loss": 0.4819, - "mean_token_accuracy": 0.8497657686471939, - "num_tokens": 214548718.0, - "step": 178300 - }, - { - "entropy": 1.961137953400612, - "epoch": 0.5527453987611199, - "grad_norm": 7.637760639190674, - "learning_rate": 3.402760081089855e-06, - "loss": 0.4594, - "mean_token_accuracy": 0.8611582666635513, - "num_tokens": 214559723.0, - "step": 178310 - }, - { - "entropy": 1.8815833404660225, - "epoch": 0.5527763978861696, - "grad_norm": 8.812824249267578, - "learning_rate": 3.402664667595261e-06, - "loss": 0.4037, - "mean_token_accuracy": 0.8595074489712715, - "num_tokens": 214572097.0, - "step": 178320 - }, - { - "entropy": 1.8904899656772614, - "epoch": 0.5528073970112194, - "grad_norm": 8.477423667907715, - "learning_rate": 3.402569262126409e-06, - "loss": 0.4027, - "mean_token_accuracy": 0.860831169784069, - "num_tokens": 214584506.0, - "step": 178330 - }, - { - "entropy": 1.9823995113372803, - "epoch": 0.5528383961362691, - "grad_norm": 14.011083602905273, - "learning_rate": 3.4024738646821725e-06, - "loss": 0.5019, - "mean_token_accuracy": 0.8458440572023391, - "num_tokens": 214595212.0, - "step": 178340 - }, - { - "entropy": 1.9233446091413497, - "epoch": 0.5528693952613187, - "grad_norm": 4.202877044677734, - "learning_rate": 3.402378475261428e-06, - "loss": 0.4589, - "mean_token_accuracy": 0.8481686040759087, - "num_tokens": 214607213.0, - "step": 178350 - }, - { - "entropy": 1.8569358214735985, - "epoch": 0.5529003943863684, - "grad_norm": 7.546200275421143, - "learning_rate": 3.402283093863051e-06, - "loss": 0.3933, - "mean_token_accuracy": 0.8649446710944175, - "num_tokens": 214619852.0, - "step": 178360 - }, - { - "entropy": 1.8237456619739532, - "epoch": 0.5529313935114182, - "grad_norm": 4.462338447570801, - "learning_rate": 3.4021877204859167e-06, - "loss": 0.4097, - "mean_token_accuracy": 0.8608748510479927, - "num_tokens": 214632981.0, - "step": 178370 - }, - { - "entropy": 1.7977847412228585, - "epoch": 0.5529623926364678, - "grad_norm": 7.902055740356445, - "learning_rate": 3.402092355128901e-06, - "loss": 0.4027, - "mean_token_accuracy": 0.8611117780208588, - "num_tokens": 214645980.0, - "step": 178380 - }, - { - "entropy": 1.9663831263780593, - "epoch": 0.5529933917615175, - "grad_norm": 8.112964630126953, - "learning_rate": 3.401996997790879e-06, - "loss": 0.4681, - "mean_token_accuracy": 0.8413886666297913, - "num_tokens": 214656890.0, - "step": 178390 - }, - { - "entropy": 1.9313152968883514, - "epoch": 0.5530243908865672, - "grad_norm": 8.793562889099121, - "learning_rate": 3.4019016484707284e-06, - "loss": 0.4452, - "mean_token_accuracy": 0.8541143208742141, - "num_tokens": 214668760.0, - "step": 178400 - }, - { - "entropy": 1.8870180428028107, - "epoch": 0.553055390011617, - "grad_norm": 8.036027908325195, - "learning_rate": 3.4018063071673245e-06, - "loss": 0.3979, - "mean_token_accuracy": 0.8699349537491798, - "num_tokens": 214680662.0, - "step": 178410 - }, - { - "entropy": 1.8556498274207116, - "epoch": 0.5530863891366666, - "grad_norm": 3.778268337249756, - "learning_rate": 3.4017109738795445e-06, - "loss": 0.419, - "mean_token_accuracy": 0.8546939373016358, - "num_tokens": 214693500.0, - "step": 178420 - }, - { - "entropy": 1.9078700512647628, - "epoch": 0.5531173882617163, - "grad_norm": 3.581650733947754, - "learning_rate": 3.4016156486062657e-06, - "loss": 0.4424, - "mean_token_accuracy": 0.8544475376605988, - "num_tokens": 214705682.0, - "step": 178430 - }, - { - "entropy": 1.9096450537443161, - "epoch": 0.553148387386766, - "grad_norm": 8.489810943603516, - "learning_rate": 3.4015203313463646e-06, - "loss": 0.4332, - "mean_token_accuracy": 0.8597532555460929, - "num_tokens": 214717144.0, - "step": 178440 - }, - { - "entropy": 1.7644469410181045, - "epoch": 0.5531793865118158, - "grad_norm": 3.961745500564575, - "learning_rate": 3.4014250220987184e-06, - "loss": 0.3481, - "mean_token_accuracy": 0.8705752789974213, - "num_tokens": 214731161.0, - "step": 178450 - }, - { - "entropy": 1.949856935441494, - "epoch": 0.5532103856368654, - "grad_norm": 7.908204555511475, - "learning_rate": 3.4013297208622048e-06, - "loss": 0.4865, - "mean_token_accuracy": 0.8459553733468056, - "num_tokens": 214742166.0, - "step": 178460 - }, - { - "entropy": 1.9470009535551072, - "epoch": 0.5532413847619151, - "grad_norm": 8.177698135375977, - "learning_rate": 3.4012344276357022e-06, - "loss": 0.4503, - "mean_token_accuracy": 0.8542072013020515, - "num_tokens": 214752463.0, - "step": 178470 - }, - { - "entropy": 1.889072911441326, - "epoch": 0.5532723838869648, - "grad_norm": 7.375345230102539, - "learning_rate": 3.4011391424180885e-06, - "loss": 0.4467, - "mean_token_accuracy": 0.8569256499409675, - "num_tokens": 214764692.0, - "step": 178480 - }, - { - "entropy": 1.9212247535586358, - "epoch": 0.5533033830120145, - "grad_norm": 7.591136455535889, - "learning_rate": 3.4010438652082413e-06, - "loss": 0.4432, - "mean_token_accuracy": 0.8538636654615402, - "num_tokens": 214776359.0, - "step": 178490 - }, - { - "entropy": 1.943355268239975, - "epoch": 0.5533343821370642, - "grad_norm": 8.059382438659668, - "learning_rate": 3.4009485960050386e-06, - "loss": 0.4625, - "mean_token_accuracy": 0.8473861545324326, - "num_tokens": 214788078.0, - "step": 178500 - }, - { - "entropy": 1.9362692579627037, - "epoch": 0.5533653812621139, - "grad_norm": 9.145943641662598, - "learning_rate": 3.4008533348073603e-06, - "loss": 0.5119, - "mean_token_accuracy": 0.8416964054107666, - "num_tokens": 214800127.0, - "step": 178510 - }, - { - "entropy": 1.8370727390050887, - "epoch": 0.5533963803871635, - "grad_norm": 9.235921859741211, - "learning_rate": 3.4007580816140845e-06, - "loss": 0.4117, - "mean_token_accuracy": 0.860569167137146, - "num_tokens": 214812714.0, - "step": 178520 - }, - { - "entropy": 1.874185086786747, - "epoch": 0.5534273795122132, - "grad_norm": 8.020057678222656, - "learning_rate": 3.4006628364240914e-06, - "loss": 0.4091, - "mean_token_accuracy": 0.862524189054966, - "num_tokens": 214824970.0, - "step": 178530 - }, - { - "entropy": 1.887164406478405, - "epoch": 0.553458378637263, - "grad_norm": 8.896452903747559, - "learning_rate": 3.400567599236259e-06, - "loss": 0.45, - "mean_token_accuracy": 0.8595001310110092, - "num_tokens": 214836577.0, - "step": 178540 - }, - { - "entropy": 1.9219500213861465, - "epoch": 0.5534893777623127, - "grad_norm": 9.564613342285156, - "learning_rate": 3.4004723700494674e-06, - "loss": 0.4559, - "mean_token_accuracy": 0.8518328025937081, - "num_tokens": 214847973.0, - "step": 178550 - }, - { - "entropy": 1.9421744406223298, - "epoch": 0.5535203768873623, - "grad_norm": 9.683865547180176, - "learning_rate": 3.4003771488625957e-06, - "loss": 0.4834, - "mean_token_accuracy": 0.849106827378273, - "num_tokens": 214859099.0, - "step": 178560 - }, - { - "entropy": 1.9177052691578864, - "epoch": 0.553551376012412, - "grad_norm": 7.975922107696533, - "learning_rate": 3.4002819356745246e-06, - "loss": 0.4332, - "mean_token_accuracy": 0.856335574388504, - "num_tokens": 214870059.0, - "step": 178570 - }, - { - "entropy": 1.8940430551767349, - "epoch": 0.5535823751374618, - "grad_norm": 8.784914016723633, - "learning_rate": 3.400186730484135e-06, - "loss": 0.4355, - "mean_token_accuracy": 0.8558658957481384, - "num_tokens": 214882455.0, - "step": 178580 - }, - { - "entropy": 1.8574804171919823, - "epoch": 0.5536133742625114, - "grad_norm": 10.208087921142578, - "learning_rate": 3.4000915332903057e-06, - "loss": 0.4285, - "mean_token_accuracy": 0.8440673872828484, - "num_tokens": 214894606.0, - "step": 178590 - }, - { - "entropy": 1.745209413766861, - "epoch": 0.5536443733875611, - "grad_norm": 7.461440086364746, - "learning_rate": 3.3999963440919182e-06, - "loss": 0.3436, - "mean_token_accuracy": 0.8850232422351837, - "num_tokens": 214908808.0, - "step": 178600 - }, - { - "entropy": 1.996240535378456, - "epoch": 0.5536753725126108, - "grad_norm": 8.930257797241211, - "learning_rate": 3.3999011628878536e-06, - "loss": 0.5221, - "mean_token_accuracy": 0.8388968229293823, - "num_tokens": 214919755.0, - "step": 178610 - }, - { - "entropy": 1.935780856013298, - "epoch": 0.5537063716376606, - "grad_norm": 8.519956588745117, - "learning_rate": 3.3998059896769924e-06, - "loss": 0.4509, - "mean_token_accuracy": 0.8615932404994965, - "num_tokens": 214931240.0, - "step": 178620 - }, - { - "entropy": 1.9266652196645737, - "epoch": 0.5537373707627102, - "grad_norm": 8.8302001953125, - "learning_rate": 3.3997108244582166e-06, - "loss": 0.4402, - "mean_token_accuracy": 0.8644925430417061, - "num_tokens": 214942808.0, - "step": 178630 - }, - { - "entropy": 1.9325824990868568, - "epoch": 0.5537683698877599, - "grad_norm": 6.90614652633667, - "learning_rate": 3.399615667230407e-06, - "loss": 0.4483, - "mean_token_accuracy": 0.8497261658310891, - "num_tokens": 214953920.0, - "step": 178640 - }, - { - "entropy": 1.8539125517010688, - "epoch": 0.5537993690128096, - "grad_norm": 3.6557509899139404, - "learning_rate": 3.3995205179924456e-06, - "loss": 0.3863, - "mean_token_accuracy": 0.8620917737483978, - "num_tokens": 214966871.0, - "step": 178650 - }, - { - "entropy": 1.8373021736741066, - "epoch": 0.5538303681378594, - "grad_norm": 4.263208866119385, - "learning_rate": 3.399425376743214e-06, - "loss": 0.415, - "mean_token_accuracy": 0.8628732219338417, - "num_tokens": 214979239.0, - "step": 178660 - }, - { - "entropy": 1.8937377348542213, - "epoch": 0.553861367262909, - "grad_norm": 8.342369079589844, - "learning_rate": 3.399330243481595e-06, - "loss": 0.427, - "mean_token_accuracy": 0.8600796848535538, - "num_tokens": 214990681.0, - "step": 178670 - }, - { - "entropy": 1.9099412277340888, - "epoch": 0.5538923663879587, - "grad_norm": 9.195755958557129, - "learning_rate": 3.3992351182064708e-06, - "loss": 0.436, - "mean_token_accuracy": 0.8553076282143592, - "num_tokens": 215003112.0, - "step": 178680 - }, - { - "entropy": 1.8709931746125221, - "epoch": 0.5539233655130084, - "grad_norm": 7.637581825256348, - "learning_rate": 3.3991400009167243e-06, - "loss": 0.414, - "mean_token_accuracy": 0.8645595505833625, - "num_tokens": 215014645.0, - "step": 178690 - }, - { - "entropy": 1.8919344574213028, - "epoch": 0.5539543646380581, - "grad_norm": 9.006782531738281, - "learning_rate": 3.3990448916112375e-06, - "loss": 0.4642, - "mean_token_accuracy": 0.8571132674813271, - "num_tokens": 215025975.0, - "step": 178700 - }, - { - "entropy": 1.8450028151273727, - "epoch": 0.5539853637631078, - "grad_norm": 3.9442358016967773, - "learning_rate": 3.398949790288894e-06, - "loss": 0.4166, - "mean_token_accuracy": 0.8657394587993622, - "num_tokens": 215038005.0, - "step": 178710 - }, - { - "entropy": 1.8893791273236276, - "epoch": 0.5540163628881575, - "grad_norm": 9.21173095703125, - "learning_rate": 3.398854696948577e-06, - "loss": 0.4072, - "mean_token_accuracy": 0.8602850198745727, - "num_tokens": 215050378.0, - "step": 178720 - }, - { - "entropy": 1.856932234764099, - "epoch": 0.5540473620132071, - "grad_norm": 9.254220962524414, - "learning_rate": 3.3987596115891695e-06, - "loss": 0.384, - "mean_token_accuracy": 0.8598960101604461, - "num_tokens": 215063082.0, - "step": 178730 - }, - { - "entropy": 1.9601772159337998, - "epoch": 0.5540783611382569, - "grad_norm": 8.915283203125, - "learning_rate": 3.3986645342095564e-06, - "loss": 0.5036, - "mean_token_accuracy": 0.8410410761833191, - "num_tokens": 215074576.0, - "step": 178740 - }, - { - "entropy": 1.8779058650135994, - "epoch": 0.5541093602633066, - "grad_norm": 7.433322429656982, - "learning_rate": 3.3985694648086206e-06, - "loss": 0.4263, - "mean_token_accuracy": 0.8639277085661888, - "num_tokens": 215086708.0, - "step": 178750 - }, - { - "entropy": 1.8679771453142167, - "epoch": 0.5541403593883563, - "grad_norm": 7.492850303649902, - "learning_rate": 3.3984744033852464e-06, - "loss": 0.3961, - "mean_token_accuracy": 0.8661182969808578, - "num_tokens": 215098777.0, - "step": 178760 - }, - { - "entropy": 1.8983294278383256, - "epoch": 0.5541713585134059, - "grad_norm": 3.626197338104248, - "learning_rate": 3.3983793499383184e-06, - "loss": 0.4469, - "mean_token_accuracy": 0.8521765947341919, - "num_tokens": 215109847.0, - "step": 178770 - }, - { - "entropy": 1.808673584461212, - "epoch": 0.5542023576384556, - "grad_norm": 2.6603171825408936, - "learning_rate": 3.3982843044667215e-06, - "loss": 0.4092, - "mean_token_accuracy": 0.8597457781434059, - "num_tokens": 215122763.0, - "step": 178780 - }, - { - "entropy": 1.8716325148940087, - "epoch": 0.5542333567635054, - "grad_norm": 6.67529821395874, - "learning_rate": 3.3981892669693396e-06, - "loss": 0.4246, - "mean_token_accuracy": 0.8609462291002273, - "num_tokens": 215134946.0, - "step": 178790 - }, - { - "entropy": 1.8793714240193367, - "epoch": 0.554264355888555, - "grad_norm": 8.348356246948242, - "learning_rate": 3.398094237445058e-06, - "loss": 0.415, - "mean_token_accuracy": 0.855781489610672, - "num_tokens": 215147406.0, - "step": 178800 - }, - { - "entropy": 2.0147987425327303, - "epoch": 0.5542953550136047, - "grad_norm": 8.565105438232422, - "learning_rate": 3.3979992158927623e-06, - "loss": 0.5062, - "mean_token_accuracy": 0.8423831447958946, - "num_tokens": 215157966.0, - "step": 178810 - }, - { - "entropy": 1.9154839143157005, - "epoch": 0.5543263541386544, - "grad_norm": 5.788300037384033, - "learning_rate": 3.397904202311338e-06, - "loss": 0.4506, - "mean_token_accuracy": 0.8576369374990463, - "num_tokens": 215169739.0, - "step": 178820 - }, - { - "entropy": 1.8988683700561524, - "epoch": 0.5543573532637042, - "grad_norm": 4.657011032104492, - "learning_rate": 3.39780919669967e-06, - "loss": 0.4615, - "mean_token_accuracy": 0.8491096451878548, - "num_tokens": 215182189.0, - "step": 178830 - }, - { - "entropy": 1.9139982163906097, - "epoch": 0.5543883523887538, - "grad_norm": 8.855412483215332, - "learning_rate": 3.3977141990566454e-06, - "loss": 0.4652, - "mean_token_accuracy": 0.8528054267168045, - "num_tokens": 215193816.0, - "step": 178840 - }, - { - "entropy": 1.8482922896742822, - "epoch": 0.5544193515138035, - "grad_norm": 4.143898963928223, - "learning_rate": 3.3976192093811496e-06, - "loss": 0.4073, - "mean_token_accuracy": 0.8553830787539483, - "num_tokens": 215206994.0, - "step": 178850 - }, - { - "entropy": 1.862758006155491, - "epoch": 0.5544503506388532, - "grad_norm": 7.396191120147705, - "learning_rate": 3.397524227672068e-06, - "loss": 0.3892, - "mean_token_accuracy": 0.8633818462491035, - "num_tokens": 215219502.0, - "step": 178860 - }, - { - "entropy": 1.8616106539964676, - "epoch": 0.554481349763903, - "grad_norm": 8.938167572021484, - "learning_rate": 3.3974292539282892e-06, - "loss": 0.4637, - "mean_token_accuracy": 0.8673127844929696, - "num_tokens": 215231589.0, - "step": 178870 - }, - { - "entropy": 1.8870980799198152, - "epoch": 0.5545123488889526, - "grad_norm": 4.069732666015625, - "learning_rate": 3.3973342881486983e-06, - "loss": 0.4574, - "mean_token_accuracy": 0.8577647238969803, - "num_tokens": 215244003.0, - "step": 178880 - }, - { - "entropy": 1.8829326719045638, - "epoch": 0.5545433480140023, - "grad_norm": 7.217538356781006, - "learning_rate": 3.397239330332184e-06, - "loss": 0.45, - "mean_token_accuracy": 0.8619607254862786, - "num_tokens": 215256216.0, - "step": 178890 - }, - { - "entropy": 1.883328229188919, - "epoch": 0.554574347139052, - "grad_norm": 8.35718059539795, - "learning_rate": 3.3971443804776305e-06, - "loss": 0.4297, - "mean_token_accuracy": 0.8626414731144905, - "num_tokens": 215268764.0, - "step": 178900 - }, - { - "entropy": 1.814480559527874, - "epoch": 0.5546053462641017, - "grad_norm": 6.604519844055176, - "learning_rate": 3.3970494385839274e-06, - "loss": 0.3997, - "mean_token_accuracy": 0.8664494514465332, - "num_tokens": 215281709.0, - "step": 178910 - }, - { - "entropy": 1.9098477900028228, - "epoch": 0.5546363453891514, - "grad_norm": 8.933908462524414, - "learning_rate": 3.396954504649963e-06, - "loss": 0.4445, - "mean_token_accuracy": 0.8497991293668747, - "num_tokens": 215294413.0, - "step": 178920 - }, - { - "entropy": 1.907794676721096, - "epoch": 0.5546673445142011, - "grad_norm": 5.517307758331299, - "learning_rate": 3.3968595786746234e-06, - "loss": 0.4222, - "mean_token_accuracy": 0.8596857756376266, - "num_tokens": 215305421.0, - "step": 178930 - }, - { - "entropy": 1.9482158571481705, - "epoch": 0.5546983436392507, - "grad_norm": 3.700831890106201, - "learning_rate": 3.396764660656798e-06, - "loss": 0.4664, - "mean_token_accuracy": 0.8476640805602074, - "num_tokens": 215317173.0, - "step": 178940 - }, - { - "entropy": 1.8691062763333322, - "epoch": 0.5547293427643005, - "grad_norm": 5.681051254272461, - "learning_rate": 3.3966697505953737e-06, - "loss": 0.441, - "mean_token_accuracy": 0.8514941841363907, - "num_tokens": 215330505.0, - "step": 178950 - }, - { - "entropy": 1.9252370223402977, - "epoch": 0.5547603418893502, - "grad_norm": 7.053623199462891, - "learning_rate": 3.3965748484892403e-06, - "loss": 0.4534, - "mean_token_accuracy": 0.8586797535419464, - "num_tokens": 215342454.0, - "step": 178960 - }, - { - "entropy": 1.8300693720579146, - "epoch": 0.5547913410143999, - "grad_norm": 6.6290998458862305, - "learning_rate": 3.3964799543372855e-06, - "loss": 0.4482, - "mean_token_accuracy": 0.8510792776942253, - "num_tokens": 215356481.0, - "step": 178970 - }, - { - "entropy": 1.9910599797964097, - "epoch": 0.5548223401394495, - "grad_norm": 8.265558242797852, - "learning_rate": 3.396385068138399e-06, - "loss": 0.5201, - "mean_token_accuracy": 0.8351536065340042, - "num_tokens": 215367564.0, - "step": 178980 - }, - { - "entropy": 1.843056969344616, - "epoch": 0.5548533392644993, - "grad_norm": 8.04113483428955, - "learning_rate": 3.3962901898914703e-06, - "loss": 0.4067, - "mean_token_accuracy": 0.8625307679176331, - "num_tokens": 215380757.0, - "step": 178990 - }, - { - "entropy": 1.9277337312698364, - "epoch": 0.554884338389549, - "grad_norm": 8.615413665771484, - "learning_rate": 3.3961953195953873e-06, - "loss": 0.437, - "mean_token_accuracy": 0.8572276189923287, - "num_tokens": 215392044.0, - "step": 179000 - }, - { - "entropy": 1.9238358929753303, - "epoch": 0.5549153375145986, - "grad_norm": 8.563727378845215, - "learning_rate": 3.3961004572490406e-06, - "loss": 0.4158, - "mean_token_accuracy": 0.8628179371356964, - "num_tokens": 215404277.0, - "step": 179010 - }, - { - "entropy": 1.9175738573074341, - "epoch": 0.5549463366396483, - "grad_norm": 8.990974426269531, - "learning_rate": 3.3960056028513206e-06, - "loss": 0.4156, - "mean_token_accuracy": 0.8585062682628631, - "num_tokens": 215416501.0, - "step": 179020 - }, - { - "entropy": 1.9726496756076812, - "epoch": 0.554977335764698, - "grad_norm": 8.099010467529297, - "learning_rate": 3.395910756401116e-06, - "loss": 0.4686, - "mean_token_accuracy": 0.8548742011189461, - "num_tokens": 215427859.0, - "step": 179030 - }, - { - "entropy": 1.9836474925279617, - "epoch": 0.5550083348897478, - "grad_norm": 10.268874168395996, - "learning_rate": 3.3958159178973173e-06, - "loss": 0.4616, - "mean_token_accuracy": 0.8632662326097489, - "num_tokens": 215438192.0, - "step": 179040 - }, - { - "entropy": 1.8734207972884178, - "epoch": 0.5550393340147974, - "grad_norm": 8.809417724609375, - "learning_rate": 3.3957210873388156e-06, - "loss": 0.4392, - "mean_token_accuracy": 0.8660463884472847, - "num_tokens": 215450883.0, - "step": 179050 - }, - { - "entropy": 1.8947151079773903, - "epoch": 0.5550703331398471, - "grad_norm": 8.365950584411621, - "learning_rate": 3.395626264724501e-06, - "loss": 0.4302, - "mean_token_accuracy": 0.8539259016513825, - "num_tokens": 215462732.0, - "step": 179060 - }, - { - "entropy": 1.9040836334228515, - "epoch": 0.5551013322648968, - "grad_norm": 8.468751907348633, - "learning_rate": 3.3955314500532647e-06, - "loss": 0.3862, - "mean_token_accuracy": 0.863848377764225, - "num_tokens": 215475111.0, - "step": 179070 - }, - { - "entropy": 1.9592783853411675, - "epoch": 0.5551323313899466, - "grad_norm": 10.57170581817627, - "learning_rate": 3.395436643323997e-06, - "loss": 0.472, - "mean_token_accuracy": 0.848185470700264, - "num_tokens": 215486025.0, - "step": 179080 - }, - { - "entropy": 1.7874959081411361, - "epoch": 0.5551633305149962, - "grad_norm": 9.114726066589355, - "learning_rate": 3.395341844535591e-06, - "loss": 0.3474, - "mean_token_accuracy": 0.8718330755829811, - "num_tokens": 215500439.0, - "step": 179090 - }, - { - "entropy": 1.9357839733362199, - "epoch": 0.5551943296400459, - "grad_norm": 9.298722267150879, - "learning_rate": 3.3952470536869374e-06, - "loss": 0.4712, - "mean_token_accuracy": 0.8474046647548675, - "num_tokens": 215511982.0, - "step": 179100 - }, - { - "entropy": 1.9739779576659202, - "epoch": 0.5552253287650956, - "grad_norm": 3.7368788719177246, - "learning_rate": 3.395152270776927e-06, - "loss": 0.4991, - "mean_token_accuracy": 0.8470907166600228, - "num_tokens": 215523514.0, - "step": 179110 - }, - { - "entropy": 1.9102428033947945, - "epoch": 0.5552563278901453, - "grad_norm": 4.0481109619140625, - "learning_rate": 3.3950574958044526e-06, - "loss": 0.4144, - "mean_token_accuracy": 0.863105945289135, - "num_tokens": 215535555.0, - "step": 179120 - }, - { - "entropy": 1.9373560398817062, - "epoch": 0.555287327015195, - "grad_norm": 6.839097023010254, - "learning_rate": 3.3949627287684063e-06, - "loss": 0.4458, - "mean_token_accuracy": 0.862397214770317, - "num_tokens": 215546809.0, - "step": 179130 - }, - { - "entropy": 1.9235127955675124, - "epoch": 0.5553183261402447, - "grad_norm": 8.559110641479492, - "learning_rate": 3.39486796966768e-06, - "loss": 0.5766, - "mean_token_accuracy": 0.8470494523644447, - "num_tokens": 215558903.0, - "step": 179140 - }, - { - "entropy": 1.9319372728466988, - "epoch": 0.5553493252652943, - "grad_norm": 8.896299362182617, - "learning_rate": 3.3947732185011684e-06, - "loss": 0.4639, - "mean_token_accuracy": 0.8440060004591942, - "num_tokens": 215570155.0, - "step": 179150 - }, - { - "entropy": 1.8521119982004166, - "epoch": 0.5553803243903441, - "grad_norm": 8.03239631652832, - "learning_rate": 3.3946784752677613e-06, - "loss": 0.3786, - "mean_token_accuracy": 0.8644193202257157, - "num_tokens": 215581883.0, - "step": 179160 - }, - { - "entropy": 1.849208802729845, - "epoch": 0.5554113235153938, - "grad_norm": 5.667628765106201, - "learning_rate": 3.394583739966354e-06, - "loss": 0.3963, - "mean_token_accuracy": 0.8646664693951607, - "num_tokens": 215595076.0, - "step": 179170 - }, - { - "entropy": 1.8232768312096597, - "epoch": 0.5554423226404435, - "grad_norm": 3.93361234664917, - "learning_rate": 3.3944890125958384e-06, - "loss": 0.3884, - "mean_token_accuracy": 0.8674084261059761, - "num_tokens": 215607710.0, - "step": 179180 - }, - { - "entropy": 1.901517029106617, - "epoch": 0.5554733217654931, - "grad_norm": 8.467094421386719, - "learning_rate": 3.3943942931551087e-06, - "loss": 0.4349, - "mean_token_accuracy": 0.8507841497659683, - "num_tokens": 215619794.0, - "step": 179190 - }, - { - "entropy": 1.8633908212184906, - "epoch": 0.5555043208905429, - "grad_norm": 8.088972091674805, - "learning_rate": 3.394299581643059e-06, - "loss": 0.421, - "mean_token_accuracy": 0.8526325955986976, - "num_tokens": 215632836.0, - "step": 179200 - }, - { - "entropy": 1.8850948512554169, - "epoch": 0.5555353200155926, - "grad_norm": 7.233824729919434, - "learning_rate": 3.3942048780585822e-06, - "loss": 0.3987, - "mean_token_accuracy": 0.8667260557413101, - "num_tokens": 215644918.0, - "step": 179210 - }, - { - "entropy": 1.9587379336357116, - "epoch": 0.5555663191406423, - "grad_norm": 10.04115104675293, - "learning_rate": 3.394110182400573e-06, - "loss": 0.4951, - "mean_token_accuracy": 0.8412050887942314, - "num_tokens": 215656229.0, - "step": 179220 - }, - { - "entropy": 1.899441859126091, - "epoch": 0.5555973182656919, - "grad_norm": 9.780128479003906, - "learning_rate": 3.394015494667926e-06, - "loss": 0.4271, - "mean_token_accuracy": 0.8703594207763672, - "num_tokens": 215667872.0, - "step": 179230 - }, - { - "entropy": 1.8831466823816299, - "epoch": 0.5556283173907417, - "grad_norm": 7.8261260986328125, - "learning_rate": 3.393920814859535e-06, - "loss": 0.4334, - "mean_token_accuracy": 0.8532602831721305, - "num_tokens": 215680339.0, - "step": 179240 - }, - { - "entropy": 1.9097533985972404, - "epoch": 0.5556593165157914, - "grad_norm": 9.267577171325684, - "learning_rate": 3.393826142974295e-06, - "loss": 0.4261, - "mean_token_accuracy": 0.862908025085926, - "num_tokens": 215691416.0, - "step": 179250 - }, - { - "entropy": 1.9572410345077516, - "epoch": 0.555690315640841, - "grad_norm": 8.92303466796875, - "learning_rate": 3.3937314790111027e-06, - "loss": 0.4594, - "mean_token_accuracy": 0.8508604303002357, - "num_tokens": 215702129.0, - "step": 179260 - }, - { - "entropy": 1.8611516401171684, - "epoch": 0.5557213147658907, - "grad_norm": 5.001405715942383, - "learning_rate": 3.3936368229688506e-06, - "loss": 0.399, - "mean_token_accuracy": 0.8587961971759797, - "num_tokens": 215715265.0, - "step": 179270 - }, - { - "entropy": 1.8070557743310929, - "epoch": 0.5557523138909404, - "grad_norm": 2.886479616165161, - "learning_rate": 3.3935421748464355e-06, - "loss": 0.3995, - "mean_token_accuracy": 0.8679655134677887, - "num_tokens": 215728529.0, - "step": 179280 - }, - { - "entropy": 1.9148321211338044, - "epoch": 0.5557833130159902, - "grad_norm": 3.6641204357147217, - "learning_rate": 3.393447534642752e-06, - "loss": 0.4342, - "mean_token_accuracy": 0.8596915438771248, - "num_tokens": 215740571.0, - "step": 179290 - }, - { - "entropy": 1.9517053723335267, - "epoch": 0.5558143121410398, - "grad_norm": 8.067102432250977, - "learning_rate": 3.393352902356698e-06, - "loss": 0.4207, - "mean_token_accuracy": 0.8586618512868881, - "num_tokens": 215752177.0, - "step": 179300 - }, - { - "entropy": 1.9378379747271537, - "epoch": 0.5558453112660895, - "grad_norm": 4.564868927001953, - "learning_rate": 3.3932582779871685e-06, - "loss": 0.4421, - "mean_token_accuracy": 0.855698038637638, - "num_tokens": 215763713.0, - "step": 179310 - }, - { - "entropy": 1.9738548263907432, - "epoch": 0.5558763103911392, - "grad_norm": 8.826933860778809, - "learning_rate": 3.393163661533059e-06, - "loss": 0.4929, - "mean_token_accuracy": 0.8455957636237145, - "num_tokens": 215774828.0, - "step": 179320 - }, - { - "entropy": 1.8536319456994534, - "epoch": 0.5559073095161889, - "grad_norm": 10.126803398132324, - "learning_rate": 3.3930690529932682e-06, - "loss": 0.4485, - "mean_token_accuracy": 0.857026931643486, - "num_tokens": 215788064.0, - "step": 179330 - }, - { - "entropy": 1.8911743879318237, - "epoch": 0.5559383086412386, - "grad_norm": 3.8450915813446045, - "learning_rate": 3.3929744523666895e-06, - "loss": 0.3811, - "mean_token_accuracy": 0.8661640584468842, - "num_tokens": 215800499.0, - "step": 179340 - }, - { - "entropy": 1.8529613867402077, - "epoch": 0.5559693077662883, - "grad_norm": 7.823770523071289, - "learning_rate": 3.3928798596522235e-06, - "loss": 0.3802, - "mean_token_accuracy": 0.8706428721547127, - "num_tokens": 215812209.0, - "step": 179350 - }, - { - "entropy": 1.972227230668068, - "epoch": 0.556000306891338, - "grad_norm": 8.537871360778809, - "learning_rate": 3.3927852748487644e-06, - "loss": 0.4798, - "mean_token_accuracy": 0.8534824714064598, - "num_tokens": 215823150.0, - "step": 179360 - }, - { - "entropy": 1.8717477947473526, - "epoch": 0.5560313060163877, - "grad_norm": 9.057830810546875, - "learning_rate": 3.392690697955211e-06, - "loss": 0.4367, - "mean_token_accuracy": 0.8547875434160233, - "num_tokens": 215835082.0, - "step": 179370 - }, - { - "entropy": 1.9645709812641143, - "epoch": 0.5560623051414374, - "grad_norm": 8.062889099121094, - "learning_rate": 3.3925961289704608e-06, - "loss": 0.4744, - "mean_token_accuracy": 0.847657410800457, - "num_tokens": 215846350.0, - "step": 179380 - }, - { - "entropy": 1.8887288227677346, - "epoch": 0.5560933042664871, - "grad_norm": 9.724749565124512, - "learning_rate": 3.3925015678934114e-06, - "loss": 0.4379, - "mean_token_accuracy": 0.8625921666622162, - "num_tokens": 215857447.0, - "step": 179390 - }, - { - "entropy": 1.8312772125005723, - "epoch": 0.5561243033915367, - "grad_norm": 8.317710876464844, - "learning_rate": 3.3924070147229606e-06, - "loss": 0.395, - "mean_token_accuracy": 0.8607356131076813, - "num_tokens": 215869731.0, - "step": 179400 - }, - { - "entropy": 1.8348938211798669, - "epoch": 0.5561553025165865, - "grad_norm": 3.625267505645752, - "learning_rate": 3.392312469458007e-06, - "loss": 0.4183, - "mean_token_accuracy": 0.8623997822403908, - "num_tokens": 215882539.0, - "step": 179410 - }, - { - "entropy": 1.9049311459064484, - "epoch": 0.5561863016416362, - "grad_norm": 8.238363265991211, - "learning_rate": 3.3922179320974487e-06, - "loss": 0.4466, - "mean_token_accuracy": 0.8547548159956933, - "num_tokens": 215893749.0, - "step": 179420 - }, - { - "entropy": 1.9604707762598992, - "epoch": 0.5562173007666859, - "grad_norm": 4.5083842277526855, - "learning_rate": 3.392123402640185e-06, - "loss": 0.4975, - "mean_token_accuracy": 0.842636513710022, - "num_tokens": 215905283.0, - "step": 179430 - }, - { - "entropy": 1.9151503935456275, - "epoch": 0.5562482998917355, - "grad_norm": 8.561266899108887, - "learning_rate": 3.3920288810851136e-06, - "loss": 0.4073, - "mean_token_accuracy": 0.8592620491981506, - "num_tokens": 215917371.0, - "step": 179440 - }, - { - "entropy": 1.9202652871608734, - "epoch": 0.5562792990167853, - "grad_norm": 8.415385246276855, - "learning_rate": 3.3919343674311346e-06, - "loss": 0.4692, - "mean_token_accuracy": 0.8525185972452164, - "num_tokens": 215928244.0, - "step": 179450 - }, - { - "entropy": 1.9669117599725723, - "epoch": 0.556310298141835, - "grad_norm": 9.637960433959961, - "learning_rate": 3.3918398616771477e-06, - "loss": 0.536, - "mean_token_accuracy": 0.8396474197506905, - "num_tokens": 215939994.0, - "step": 179460 - }, - { - "entropy": 1.8051816076040268, - "epoch": 0.5563412972668846, - "grad_norm": 3.206972599029541, - "learning_rate": 3.391745363822051e-06, - "loss": 0.3796, - "mean_token_accuracy": 0.8714934229850769, - "num_tokens": 215953601.0, - "step": 179470 - }, - { - "entropy": 1.7923723876476287, - "epoch": 0.5563722963919343, - "grad_norm": 9.600335121154785, - "learning_rate": 3.3916508738647453e-06, - "loss": 0.3852, - "mean_token_accuracy": 0.8598181501030921, - "num_tokens": 215966356.0, - "step": 179480 - }, - { - "entropy": 1.7627654626965523, - "epoch": 0.5564032955169841, - "grad_norm": 9.009991645812988, - "learning_rate": 3.3915563918041296e-06, - "loss": 0.3674, - "mean_token_accuracy": 0.8625359818339348, - "num_tokens": 215979553.0, - "step": 179490 - }, - { - "entropy": 1.9511140018701554, - "epoch": 0.5564342946420338, - "grad_norm": 7.965838432312012, - "learning_rate": 3.3914619176391057e-06, - "loss": 0.5094, - "mean_token_accuracy": 0.8408928826451302, - "num_tokens": 215990229.0, - "step": 179500 - }, - { - "entropy": 1.8817020952701569, - "epoch": 0.5564652937670834, - "grad_norm": 6.978695869445801, - "learning_rate": 3.3913674513685724e-06, - "loss": 0.4293, - "mean_token_accuracy": 0.8560591101646423, - "num_tokens": 216001836.0, - "step": 179510 - }, - { - "entropy": 1.8947572827339172, - "epoch": 0.5564962928921331, - "grad_norm": 8.471761703491211, - "learning_rate": 3.391272992991431e-06, - "loss": 0.4332, - "mean_token_accuracy": 0.8558598890900612, - "num_tokens": 216012863.0, - "step": 179520 - }, - { - "entropy": 1.8559965267777443, - "epoch": 0.5565272920171828, - "grad_norm": 8.303871154785156, - "learning_rate": 3.3911785425065817e-06, - "loss": 0.4302, - "mean_token_accuracy": 0.8528834328055381, - "num_tokens": 216025581.0, - "step": 179530 - }, - { - "entropy": 1.9025905281305313, - "epoch": 0.5565582911422325, - "grad_norm": 4.735015392303467, - "learning_rate": 3.391084099912926e-06, - "loss": 0.4623, - "mean_token_accuracy": 0.846812778711319, - "num_tokens": 216037600.0, - "step": 179540 - }, - { - "entropy": 1.9127229869365692, - "epoch": 0.5565892902672822, - "grad_norm": 6.8129777908325195, - "learning_rate": 3.3909896652093653e-06, - "loss": 0.3978, - "mean_token_accuracy": 0.8685696572065353, - "num_tokens": 216050178.0, - "step": 179550 - }, - { - "entropy": 1.8396325334906578, - "epoch": 0.5566202893923319, - "grad_norm": 4.305300712585449, - "learning_rate": 3.3908952383948007e-06, - "loss": 0.4182, - "mean_token_accuracy": 0.8576811730861664, - "num_tokens": 216063083.0, - "step": 179560 - }, - { - "entropy": 1.8920865491032601, - "epoch": 0.5566512885173815, - "grad_norm": 8.195730209350586, - "learning_rate": 3.3908008194681347e-06, - "loss": 0.4456, - "mean_token_accuracy": 0.8563900366425514, - "num_tokens": 216074865.0, - "step": 179570 - }, - { - "entropy": 1.8664014101028443, - "epoch": 0.5566822876424313, - "grad_norm": 6.416919231414795, - "learning_rate": 3.3907064084282674e-06, - "loss": 0.4388, - "mean_token_accuracy": 0.8568268820643425, - "num_tokens": 216086684.0, - "step": 179580 - }, - { - "entropy": 1.8229648873209954, - "epoch": 0.556713286767481, - "grad_norm": 4.24645471572876, - "learning_rate": 3.3906120052741033e-06, - "loss": 0.4015, - "mean_token_accuracy": 0.8657773956656456, - "num_tokens": 216099331.0, - "step": 179590 - }, - { - "entropy": 1.9399886280298233, - "epoch": 0.5567442858925307, - "grad_norm": 8.29112720489502, - "learning_rate": 3.3905176100045423e-06, - "loss": 0.47, - "mean_token_accuracy": 0.8519351586699486, - "num_tokens": 216111189.0, - "step": 179600 - }, - { - "entropy": 1.936243087053299, - "epoch": 0.5567752850175803, - "grad_norm": 6.696967124938965, - "learning_rate": 3.3904232226184886e-06, - "loss": 0.4882, - "mean_token_accuracy": 0.8442001849412918, - "num_tokens": 216122264.0, - "step": 179610 - }, - { - "entropy": 1.9167515233159065, - "epoch": 0.5568062841426301, - "grad_norm": 8.134096145629883, - "learning_rate": 3.390328843114844e-06, - "loss": 0.435, - "mean_token_accuracy": 0.8545194461941719, - "num_tokens": 216133960.0, - "step": 179620 - }, - { - "entropy": 1.9942961156368255, - "epoch": 0.5568372832676798, - "grad_norm": 9.308036804199219, - "learning_rate": 3.390234471492512e-06, - "loss": 0.4398, - "mean_token_accuracy": 0.8635125383734703, - "num_tokens": 216144747.0, - "step": 179630 - }, - { - "entropy": 1.862863690406084, - "epoch": 0.5568682823927295, - "grad_norm": 9.135284423828125, - "learning_rate": 3.390140107750395e-06, - "loss": 0.4474, - "mean_token_accuracy": 0.8429133668541908, - "num_tokens": 216157517.0, - "step": 179640 - }, - { - "entropy": 1.9794843196868896, - "epoch": 0.5568992815177791, - "grad_norm": 6.671160697937012, - "learning_rate": 3.390045751887397e-06, - "loss": 0.533, - "mean_token_accuracy": 0.8423982381820678, - "num_tokens": 216167900.0, - "step": 179650 - }, - { - "entropy": 1.9758970573544503, - "epoch": 0.5569302806428289, - "grad_norm": 8.529526710510254, - "learning_rate": 3.389951403902422e-06, - "loss": 0.4779, - "mean_token_accuracy": 0.8486952930688858, - "num_tokens": 216179421.0, - "step": 179660 - }, - { - "entropy": 1.8976741746068, - "epoch": 0.5569612797678786, - "grad_norm": 8.507506370544434, - "learning_rate": 3.389857063794373e-06, - "loss": 0.4353, - "mean_token_accuracy": 0.8545589908957482, - "num_tokens": 216191649.0, - "step": 179670 - }, - { - "entropy": 1.9042115330696106, - "epoch": 0.5569922788929282, - "grad_norm": 3.675581455230713, - "learning_rate": 3.389762731562154e-06, - "loss": 0.4994, - "mean_token_accuracy": 0.8550908967852593, - "num_tokens": 216203571.0, - "step": 179680 - }, - { - "entropy": 1.9547617256641387, - "epoch": 0.5570232780179779, - "grad_norm": 7.903559684753418, - "learning_rate": 3.38966840720467e-06, - "loss": 0.4249, - "mean_token_accuracy": 0.8654710337519645, - "num_tokens": 216214089.0, - "step": 179690 - }, - { - "entropy": 1.9081181332468986, - "epoch": 0.5570542771430277, - "grad_norm": 4.779351711273193, - "learning_rate": 3.3895740907208246e-06, - "loss": 0.4533, - "mean_token_accuracy": 0.8563227370381356, - "num_tokens": 216225286.0, - "step": 179700 - }, - { - "entropy": 1.9601490914821624, - "epoch": 0.5570852762680774, - "grad_norm": 8.551769256591797, - "learning_rate": 3.3894797821095223e-06, - "loss": 0.5053, - "mean_token_accuracy": 0.8430069953203201, - "num_tokens": 216236582.0, - "step": 179710 - }, - { - "entropy": 1.936085006594658, - "epoch": 0.557116275393127, - "grad_norm": 6.998064041137695, - "learning_rate": 3.3893854813696696e-06, - "loss": 0.4181, - "mean_token_accuracy": 0.8596946880221367, - "num_tokens": 216247995.0, - "step": 179720 - }, - { - "entropy": 1.9320401161909104, - "epoch": 0.5571472745181767, - "grad_norm": 7.7311272621154785, - "learning_rate": 3.38929118850017e-06, - "loss": 0.5353, - "mean_token_accuracy": 0.8525026142597198, - "num_tokens": 216259406.0, - "step": 179730 - }, - { - "entropy": 1.9668843865394592, - "epoch": 0.5571782736432265, - "grad_norm": 7.928240776062012, - "learning_rate": 3.389196903499929e-06, - "loss": 0.5082, - "mean_token_accuracy": 0.8433253467082977, - "num_tokens": 216270626.0, - "step": 179740 - }, - { - "entropy": 1.895675989985466, - "epoch": 0.5572092727682761, - "grad_norm": 7.530059337615967, - "learning_rate": 3.3891026263678523e-06, - "loss": 0.4499, - "mean_token_accuracy": 0.86041489392519, - "num_tokens": 216282050.0, - "step": 179750 - }, - { - "entropy": 1.8358202219009399, - "epoch": 0.5572402718933258, - "grad_norm": 3.5207154750823975, - "learning_rate": 3.389008357102846e-06, - "loss": 0.3855, - "mean_token_accuracy": 0.8601732328534126, - "num_tokens": 216294458.0, - "step": 179760 - }, - { - "entropy": 1.9326426953077316, - "epoch": 0.5572712710183755, - "grad_norm": 8.358792304992676, - "learning_rate": 3.3889140957038156e-06, - "loss": 0.45, - "mean_token_accuracy": 0.8487183004617691, - "num_tokens": 216305382.0, - "step": 179770 - }, - { - "entropy": 1.9060441732406617, - "epoch": 0.5573022701434251, - "grad_norm": 6.259328842163086, - "learning_rate": 3.3888198421696677e-06, - "loss": 0.4413, - "mean_token_accuracy": 0.8615023106336593, - "num_tokens": 216317172.0, - "step": 179780 - }, - { - "entropy": 1.8866166576743126, - "epoch": 0.5573332692684749, - "grad_norm": 8.117493629455566, - "learning_rate": 3.3887255964993077e-06, - "loss": 0.443, - "mean_token_accuracy": 0.8601656764745712, - "num_tokens": 216328605.0, - "step": 179790 - }, - { - "entropy": 1.8952102825045585, - "epoch": 0.5573642683935246, - "grad_norm": 10.748100280761719, - "learning_rate": 3.388631358691643e-06, - "loss": 0.439, - "mean_token_accuracy": 0.853930875658989, - "num_tokens": 216340153.0, - "step": 179800 - }, - { - "entropy": 1.8524406239390374, - "epoch": 0.5573952675185743, - "grad_norm": 8.204578399658203, - "learning_rate": 3.3885371287455803e-06, - "loss": 0.4055, - "mean_token_accuracy": 0.8580199882388115, - "num_tokens": 216351612.0, - "step": 179810 - }, - { - "entropy": 1.910023505985737, - "epoch": 0.5574262666436239, - "grad_norm": 9.937796592712402, - "learning_rate": 3.3884429066600267e-06, - "loss": 0.465, - "mean_token_accuracy": 0.8472720056772232, - "num_tokens": 216362609.0, - "step": 179820 - }, - { - "entropy": 1.9104634687304496, - "epoch": 0.5574572657686737, - "grad_norm": 6.716761589050293, - "learning_rate": 3.3883486924338895e-06, - "loss": 0.4488, - "mean_token_accuracy": 0.8515969440340996, - "num_tokens": 216374245.0, - "step": 179830 - }, - { - "entropy": 1.8826022177934647, - "epoch": 0.5574882648937234, - "grad_norm": 4.184479713439941, - "learning_rate": 3.388254486066074e-06, - "loss": 0.4613, - "mean_token_accuracy": 0.8498593136668205, - "num_tokens": 216385180.0, - "step": 179840 - }, - { - "entropy": 1.8581200003623963, - "epoch": 0.557519264018773, - "grad_norm": 6.967178821563721, - "learning_rate": 3.388160287555491e-06, - "loss": 0.3964, - "mean_token_accuracy": 0.8627260774374008, - "num_tokens": 216398368.0, - "step": 179850 - }, - { - "entropy": 1.8448219254612923, - "epoch": 0.5575502631438227, - "grad_norm": 7.474876403808594, - "learning_rate": 3.388066096901047e-06, - "loss": 0.3825, - "mean_token_accuracy": 0.866121557354927, - "num_tokens": 216410681.0, - "step": 179860 - }, - { - "entropy": 1.9550578266382217, - "epoch": 0.5575812622688725, - "grad_norm": 7.641628742218018, - "learning_rate": 3.38797191410165e-06, - "loss": 0.479, - "mean_token_accuracy": 0.8485138610005378, - "num_tokens": 216421405.0, - "step": 179870 - }, - { - "entropy": 1.873170644044876, - "epoch": 0.5576122613939222, - "grad_norm": 3.591663360595703, - "learning_rate": 3.3878777391562086e-06, - "loss": 0.4301, - "mean_token_accuracy": 0.8564320370554924, - "num_tokens": 216433518.0, - "step": 179880 - }, - { - "entropy": 1.9816926151514054, - "epoch": 0.5576432605189718, - "grad_norm": 8.375580787658691, - "learning_rate": 3.3877835720636297e-06, - "loss": 0.4995, - "mean_token_accuracy": 0.8439663261175155, - "num_tokens": 216444283.0, - "step": 179890 - }, - { - "entropy": 1.9584947019815444, - "epoch": 0.5576742596440215, - "grad_norm": 7.122523784637451, - "learning_rate": 3.387689412822824e-06, - "loss": 0.4879, - "mean_token_accuracy": 0.8492183342576027, - "num_tokens": 216455129.0, - "step": 179900 - }, - { - "entropy": 1.8550873950123787, - "epoch": 0.5577052587690713, - "grad_norm": 4.019259929656982, - "learning_rate": 3.3875952614327e-06, - "loss": 0.4312, - "mean_token_accuracy": 0.8580047935247421, - "num_tokens": 216467330.0, - "step": 179910 - }, - { - "entropy": 1.985216537117958, - "epoch": 0.557736257894121, - "grad_norm": 9.813446998596191, - "learning_rate": 3.387501117892166e-06, - "loss": 0.4712, - "mean_token_accuracy": 0.8531203463673591, - "num_tokens": 216477979.0, - "step": 179920 - }, - { - "entropy": 1.923241350054741, - "epoch": 0.5577672570191706, - "grad_norm": 8.68863582611084, - "learning_rate": 3.387406982200133e-06, - "loss": 0.439, - "mean_token_accuracy": 0.8578556135296822, - "num_tokens": 216489302.0, - "step": 179930 - }, - { - "entropy": 1.8665193885564804, - "epoch": 0.5577982561442203, - "grad_norm": 7.6800618171691895, - "learning_rate": 3.3873128543555078e-06, - "loss": 0.4732, - "mean_token_accuracy": 0.8479210451245308, - "num_tokens": 216502141.0, - "step": 179940 - }, - { - "entropy": 1.848955325782299, - "epoch": 0.5578292552692701, - "grad_norm": 6.941798686981201, - "learning_rate": 3.3872187343572026e-06, - "loss": 0.4394, - "mean_token_accuracy": 0.8580316364765167, - "num_tokens": 216514646.0, - "step": 179950 - }, - { - "entropy": 1.868771779537201, - "epoch": 0.5578602543943197, - "grad_norm": 3.62846302986145, - "learning_rate": 3.3871246222041264e-06, - "loss": 0.4252, - "mean_token_accuracy": 0.8587727159261703, - "num_tokens": 216526552.0, - "step": 179960 - }, - { - "entropy": 1.9015920877456665, - "epoch": 0.5578912535193694, - "grad_norm": 7.458217620849609, - "learning_rate": 3.3870305178951897e-06, - "loss": 0.4416, - "mean_token_accuracy": 0.8526492983102798, - "num_tokens": 216537686.0, - "step": 179970 - }, - { - "entropy": 1.890255093574524, - "epoch": 0.5579222526444191, - "grad_norm": 9.054282188415527, - "learning_rate": 3.386936421429302e-06, - "loss": 0.4167, - "mean_token_accuracy": 0.8584634244441987, - "num_tokens": 216550036.0, - "step": 179980 - }, - { - "entropy": 1.8730426907539368, - "epoch": 0.5579532517694689, - "grad_norm": 8.789347648620605, - "learning_rate": 3.3868423328053756e-06, - "loss": 0.4425, - "mean_token_accuracy": 0.8546410590410233, - "num_tokens": 216561857.0, - "step": 179990 - }, - { - "entropy": 1.8982899636030197, - "epoch": 0.5579842508945185, - "grad_norm": 4.101022243499756, - "learning_rate": 3.38674825202232e-06, - "loss": 0.398, - "mean_token_accuracy": 0.8562424078583717, - "num_tokens": 216573765.0, - "step": 180000 - }, - { - "entropy": 1.9094096958637237, - "epoch": 0.5580152500195682, - "grad_norm": 8.289060592651367, - "learning_rate": 3.3866541790790464e-06, - "loss": 0.4275, - "mean_token_accuracy": 0.8647675678133965, - "num_tokens": 216585551.0, - "step": 180010 - }, - { - "entropy": 1.8832878440618515, - "epoch": 0.5580462491446179, - "grad_norm": 9.198235511779785, - "learning_rate": 3.386560113974466e-06, - "loss": 0.4308, - "mean_token_accuracy": 0.8545297279953956, - "num_tokens": 216597949.0, - "step": 180020 - }, - { - "entropy": 1.8627006128430366, - "epoch": 0.5580772482696675, - "grad_norm": 7.299589157104492, - "learning_rate": 3.386466056707491e-06, - "loss": 0.4376, - "mean_token_accuracy": 0.8517317429184914, - "num_tokens": 216610160.0, - "step": 180030 - }, - { - "entropy": 1.8306142285466194, - "epoch": 0.5581082473947173, - "grad_norm": 8.921402931213379, - "learning_rate": 3.386372007277032e-06, - "loss": 0.4133, - "mean_token_accuracy": 0.8578998699784279, - "num_tokens": 216623520.0, - "step": 180040 - }, - { - "entropy": 1.8765130326151849, - "epoch": 0.558139246519767, - "grad_norm": 2.5298173427581787, - "learning_rate": 3.386277965682002e-06, - "loss": 0.4125, - "mean_token_accuracy": 0.8659792527556419, - "num_tokens": 216635942.0, - "step": 180050 - }, - { - "entropy": 1.8612042382359504, - "epoch": 0.5581702456448167, - "grad_norm": 7.690088748931885, - "learning_rate": 3.3861839319213115e-06, - "loss": 0.4431, - "mean_token_accuracy": 0.8645087346434593, - "num_tokens": 216647774.0, - "step": 180060 - }, - { - "entropy": 1.8710038140416145, - "epoch": 0.5582012447698663, - "grad_norm": 7.982130527496338, - "learning_rate": 3.386089905993874e-06, - "loss": 0.4545, - "mean_token_accuracy": 0.8475197136402131, - "num_tokens": 216659937.0, - "step": 180070 - }, - { - "entropy": 1.8144805401563644, - "epoch": 0.5582322438949161, - "grad_norm": 7.199626445770264, - "learning_rate": 3.3859958878986027e-06, - "loss": 0.3962, - "mean_token_accuracy": 0.8654684230685235, - "num_tokens": 216672789.0, - "step": 180080 - }, - { - "entropy": 1.9647779405117034, - "epoch": 0.5582632430199658, - "grad_norm": 7.467442035675049, - "learning_rate": 3.385901877634408e-06, - "loss": 0.4716, - "mean_token_accuracy": 0.8525005370378494, - "num_tokens": 216683584.0, - "step": 180090 - }, - { - "entropy": 1.9427626758813858, - "epoch": 0.5582942421450154, - "grad_norm": 8.000640869140625, - "learning_rate": 3.3858078752002057e-06, - "loss": 0.4699, - "mean_token_accuracy": 0.8522224456071854, - "num_tokens": 216694614.0, - "step": 180100 - }, - { - "entropy": 1.8860655903816224, - "epoch": 0.5583252412700651, - "grad_norm": 8.802091598510742, - "learning_rate": 3.3857138805949064e-06, - "loss": 0.4276, - "mean_token_accuracy": 0.8601176783442497, - "num_tokens": 216706596.0, - "step": 180110 - }, - { - "entropy": 1.8183810651302337, - "epoch": 0.5583562403951149, - "grad_norm": 9.341395378112793, - "learning_rate": 3.3856198938174252e-06, - "loss": 0.3951, - "mean_token_accuracy": 0.8621927976608277, - "num_tokens": 216718825.0, - "step": 180120 - }, - { - "entropy": 1.816829715669155, - "epoch": 0.5583872395201646, - "grad_norm": 8.577747344970703, - "learning_rate": 3.3855259148666746e-06, - "loss": 0.3878, - "mean_token_accuracy": 0.8645969301462173, - "num_tokens": 216731098.0, - "step": 180130 - }, - { - "entropy": 1.8784575372934342, - "epoch": 0.5584182386452142, - "grad_norm": 7.502293109893799, - "learning_rate": 3.3854319437415695e-06, - "loss": 0.4159, - "mean_token_accuracy": 0.8556536450982094, - "num_tokens": 216742776.0, - "step": 180140 - }, - { - "entropy": 1.9293906539678574, - "epoch": 0.5584492377702639, - "grad_norm": 4.2252326011657715, - "learning_rate": 3.3853379804410227e-06, - "loss": 0.4437, - "mean_token_accuracy": 0.8556514650583267, - "num_tokens": 216753738.0, - "step": 180150 - }, - { - "entropy": 1.9010004952549935, - "epoch": 0.5584802368953137, - "grad_norm": 7.127376079559326, - "learning_rate": 3.385244024963948e-06, - "loss": 0.4391, - "mean_token_accuracy": 0.8559280022978782, - "num_tokens": 216765075.0, - "step": 180160 - }, - { - "entropy": 1.877715417742729, - "epoch": 0.5585112360203633, - "grad_norm": 8.160017013549805, - "learning_rate": 3.3851500773092618e-06, - "loss": 0.4217, - "mean_token_accuracy": 0.8626894101500511, - "num_tokens": 216776057.0, - "step": 180170 - }, - { - "entropy": 1.9002387523651123, - "epoch": 0.558542235145413, - "grad_norm": 3.840376377105713, - "learning_rate": 3.385056137475877e-06, - "loss": 0.4262, - "mean_token_accuracy": 0.8568391934037208, - "num_tokens": 216787593.0, - "step": 180180 - }, - { - "entropy": 1.956373357772827, - "epoch": 0.5585732342704627, - "grad_norm": 8.961759567260742, - "learning_rate": 3.3849622054627097e-06, - "loss": 0.5069, - "mean_token_accuracy": 0.8425930678844452, - "num_tokens": 216798230.0, - "step": 180190 - }, - { - "entropy": 1.8474145486950875, - "epoch": 0.5586042333955125, - "grad_norm": 7.407924652099609, - "learning_rate": 3.3848682812686738e-06, - "loss": 0.4604, - "mean_token_accuracy": 0.8518683150410652, - "num_tokens": 216810802.0, - "step": 180200 - }, - { - "entropy": 1.8440819442272187, - "epoch": 0.5586352325205621, - "grad_norm": 3.6953446865081787, - "learning_rate": 3.384774364892685e-06, - "loss": 0.3735, - "mean_token_accuracy": 0.8669338792562484, - "num_tokens": 216823216.0, - "step": 180210 - }, - { - "entropy": 1.9052780613303184, - "epoch": 0.5586662316456118, - "grad_norm": 8.425951957702637, - "learning_rate": 3.3846804563336588e-06, - "loss": 0.4715, - "mean_token_accuracy": 0.8528780624270439, - "num_tokens": 216834842.0, - "step": 180220 - }, - { - "entropy": 1.8528162971138955, - "epoch": 0.5586972307706615, - "grad_norm": 7.783543586730957, - "learning_rate": 3.384586555590511e-06, - "loss": 0.4665, - "mean_token_accuracy": 0.8446239486336709, - "num_tokens": 216846620.0, - "step": 180230 - }, - { - "entropy": 1.8845822378993033, - "epoch": 0.5587282298957112, - "grad_norm": 7.422481060028076, - "learning_rate": 3.3844926626621576e-06, - "loss": 0.4212, - "mean_token_accuracy": 0.8657801419496536, - "num_tokens": 216857442.0, - "step": 180240 - }, - { - "entropy": 1.8267993465065957, - "epoch": 0.5587592290207609, - "grad_norm": 7.172003269195557, - "learning_rate": 3.384398777547514e-06, - "loss": 0.39, - "mean_token_accuracy": 0.86737762093544, - "num_tokens": 216870883.0, - "step": 180250 - }, - { - "entropy": 1.9326736852526665, - "epoch": 0.5587902281458106, - "grad_norm": 8.402005195617676, - "learning_rate": 3.3843049002454976e-06, - "loss": 0.4408, - "mean_token_accuracy": 0.85916518419981, - "num_tokens": 216882245.0, - "step": 180260 - }, - { - "entropy": 1.9091658741235733, - "epoch": 0.5588212272708603, - "grad_norm": 4.516397476196289, - "learning_rate": 3.3842110307550237e-06, - "loss": 0.426, - "mean_token_accuracy": 0.8538550525903702, - "num_tokens": 216893966.0, - "step": 180270 - }, - { - "entropy": 1.828370450437069, - "epoch": 0.5588522263959099, - "grad_norm": 8.806618690490723, - "learning_rate": 3.3841171690750097e-06, - "loss": 0.4108, - "mean_token_accuracy": 0.8570792287588119, - "num_tokens": 216905937.0, - "step": 180280 - }, - { - "entropy": 1.885986079275608, - "epoch": 0.5588832255209597, - "grad_norm": 8.106322288513184, - "learning_rate": 3.3840233152043726e-06, - "loss": 0.3967, - "mean_token_accuracy": 0.8598363786935806, - "num_tokens": 216918163.0, - "step": 180290 - }, - { - "entropy": 1.7997533604502678, - "epoch": 0.5589142246460094, - "grad_norm": 4.08975076675415, - "learning_rate": 3.383929469142029e-06, - "loss": 0.3597, - "mean_token_accuracy": 0.8623867243528366, - "num_tokens": 216931475.0, - "step": 180300 - }, - { - "entropy": 1.8519815653562546, - "epoch": 0.558945223771059, - "grad_norm": 8.924751281738281, - "learning_rate": 3.3838356308868977e-06, - "loss": 0.4515, - "mean_token_accuracy": 0.8536183342337609, - "num_tokens": 216944144.0, - "step": 180310 - }, - { - "entropy": 1.8736138075590134, - "epoch": 0.5589762228961087, - "grad_norm": 8.16180419921875, - "learning_rate": 3.3837418004378943e-06, - "loss": 0.4119, - "mean_token_accuracy": 0.8557430118322372, - "num_tokens": 216956827.0, - "step": 180320 - }, - { - "entropy": 1.8750511035323143, - "epoch": 0.5590072220211585, - "grad_norm": 6.086195945739746, - "learning_rate": 3.3836479777939375e-06, - "loss": 0.4514, - "mean_token_accuracy": 0.8607206657528877, - "num_tokens": 216968537.0, - "step": 180330 - }, - { - "entropy": 1.904085485637188, - "epoch": 0.5590382211462082, - "grad_norm": 7.704912185668945, - "learning_rate": 3.3835541629539455e-06, - "loss": 0.4277, - "mean_token_accuracy": 0.856925904750824, - "num_tokens": 216980705.0, - "step": 180340 - }, - { - "entropy": 1.8759368106722831, - "epoch": 0.5590692202712578, - "grad_norm": 7.605447292327881, - "learning_rate": 3.3834603559168363e-06, - "loss": 0.468, - "mean_token_accuracy": 0.8501611366868019, - "num_tokens": 216992340.0, - "step": 180350 - }, - { - "entropy": 1.9152187556028366, - "epoch": 0.5591002193963075, - "grad_norm": 10.049348831176758, - "learning_rate": 3.383366556681528e-06, - "loss": 0.4489, - "mean_token_accuracy": 0.8524507984519005, - "num_tokens": 217003401.0, - "step": 180360 - }, - { - "entropy": 1.8950191289186478, - "epoch": 0.5591312185213573, - "grad_norm": 8.646200180053711, - "learning_rate": 3.38327276524694e-06, - "loss": 0.4646, - "mean_token_accuracy": 0.8536058843135834, - "num_tokens": 217015175.0, - "step": 180370 - }, - { - "entropy": 1.8828943863511085, - "epoch": 0.559162217646407, - "grad_norm": 7.870319366455078, - "learning_rate": 3.38317898161199e-06, - "loss": 0.438, - "mean_token_accuracy": 0.8609470695257186, - "num_tokens": 217027041.0, - "step": 180380 - }, - { - "entropy": 1.9423518463969232, - "epoch": 0.5591932167714566, - "grad_norm": 7.569737911224365, - "learning_rate": 3.3830852057755987e-06, - "loss": 0.4684, - "mean_token_accuracy": 0.8526441812515259, - "num_tokens": 217038149.0, - "step": 180390 - }, - { - "entropy": 1.8567870572209357, - "epoch": 0.5592242158965063, - "grad_norm": 9.234038352966309, - "learning_rate": 3.3829914377366834e-06, - "loss": 0.4124, - "mean_token_accuracy": 0.8553133830428123, - "num_tokens": 217050007.0, - "step": 180400 - }, - { - "entropy": 1.9220700711011887, - "epoch": 0.5592552150215561, - "grad_norm": 3.5993828773498535, - "learning_rate": 3.3828976774941647e-06, - "loss": 0.4788, - "mean_token_accuracy": 0.85472621768713, - "num_tokens": 217060818.0, - "step": 180410 - }, - { - "entropy": 1.9288207918405533, - "epoch": 0.5592862141466057, - "grad_norm": 9.777146339416504, - "learning_rate": 3.3828039250469622e-06, - "loss": 0.5021, - "mean_token_accuracy": 0.8496465310454369, - "num_tokens": 217071834.0, - "step": 180420 - }, - { - "entropy": 1.9446373760700226, - "epoch": 0.5593172132716554, - "grad_norm": 9.732808113098145, - "learning_rate": 3.3827101803939956e-06, - "loss": 0.4787, - "mean_token_accuracy": 0.8456583991646767, - "num_tokens": 217082352.0, - "step": 180430 - }, - { - "entropy": 1.8855812832713128, - "epoch": 0.5593482123967051, - "grad_norm": 8.88985538482666, - "learning_rate": 3.382616443534185e-06, - "loss": 0.4613, - "mean_token_accuracy": 0.854129233956337, - "num_tokens": 217094782.0, - "step": 180440 - }, - { - "entropy": 1.9131209135055542, - "epoch": 0.5593792115217548, - "grad_norm": 8.299091339111328, - "learning_rate": 3.3825227144664507e-06, - "loss": 0.4168, - "mean_token_accuracy": 0.8680454924702644, - "num_tokens": 217105718.0, - "step": 180450 - }, - { - "entropy": 1.9053846567869186, - "epoch": 0.5594102106468045, - "grad_norm": 7.925025939941406, - "learning_rate": 3.382428993189713e-06, - "loss": 0.429, - "mean_token_accuracy": 0.8584159076213836, - "num_tokens": 217117301.0, - "step": 180460 - }, - { - "entropy": 2.009468224644661, - "epoch": 0.5594412097718542, - "grad_norm": 8.280466079711914, - "learning_rate": 3.3823352797028937e-06, - "loss": 0.5359, - "mean_token_accuracy": 0.8343116879463196, - "num_tokens": 217127990.0, - "step": 180470 - }, - { - "entropy": 1.9432775482535363, - "epoch": 0.5594722088969039, - "grad_norm": 11.16329288482666, - "learning_rate": 3.3822415740049113e-06, - "loss": 0.4675, - "mean_token_accuracy": 0.8549125969409943, - "num_tokens": 217139731.0, - "step": 180480 - }, - { - "entropy": 1.9695190101861955, - "epoch": 0.5595032080219536, - "grad_norm": 7.9360151290893555, - "learning_rate": 3.3821478760946896e-06, - "loss": 0.4965, - "mean_token_accuracy": 0.8430364161729813, - "num_tokens": 217151051.0, - "step": 180490 - }, - { - "entropy": 1.9071122258901596, - "epoch": 0.5595342071470033, - "grad_norm": 8.332737922668457, - "learning_rate": 3.3820541859711486e-06, - "loss": 0.4596, - "mean_token_accuracy": 0.8503222405910492, - "num_tokens": 217163091.0, - "step": 180500 - }, - { - "entropy": 1.9587863117456437, - "epoch": 0.559565206272053, - "grad_norm": 8.474498748779297, - "learning_rate": 3.3819605036332104e-06, - "loss": 0.5526, - "mean_token_accuracy": 0.8527715653181076, - "num_tokens": 217174509.0, - "step": 180510 - }, - { - "entropy": 1.9071698397397996, - "epoch": 0.5595962053971026, - "grad_norm": 7.895602226257324, - "learning_rate": 3.381866829079796e-06, - "loss": 0.3977, - "mean_token_accuracy": 0.854709891974926, - "num_tokens": 217187334.0, - "step": 180520 - }, - { - "entropy": 1.9282568588852882, - "epoch": 0.5596272045221523, - "grad_norm": 9.097548484802246, - "learning_rate": 3.3817731623098284e-06, - "loss": 0.4441, - "mean_token_accuracy": 0.8497316464781761, - "num_tokens": 217199374.0, - "step": 180530 - }, - { - "entropy": 1.89561807513237, - "epoch": 0.5596582036472021, - "grad_norm": 7.903514862060547, - "learning_rate": 3.3816795033222283e-06, - "loss": 0.4522, - "mean_token_accuracy": 0.8572723090648651, - "num_tokens": 217210367.0, - "step": 180540 - }, - { - "entropy": 1.8535553574562074, - "epoch": 0.5596892027722518, - "grad_norm": 7.994612693786621, - "learning_rate": 3.3815858521159193e-06, - "loss": 0.4345, - "mean_token_accuracy": 0.8570759683847428, - "num_tokens": 217223005.0, - "step": 180550 - }, - { - "entropy": 1.9246620133519172, - "epoch": 0.5597202018973014, - "grad_norm": 3.801100969314575, - "learning_rate": 3.381492208689824e-06, - "loss": 0.4914, - "mean_token_accuracy": 0.8471505641937256, - "num_tokens": 217234469.0, - "step": 180560 - }, - { - "entropy": 1.9053822994232177, - "epoch": 0.5597512010223511, - "grad_norm": 7.562039375305176, - "learning_rate": 3.3813985730428643e-06, - "loss": 0.4727, - "mean_token_accuracy": 0.8488843128085136, - "num_tokens": 217245583.0, - "step": 180570 - }, - { - "entropy": 1.9393005177378655, - "epoch": 0.5597822001474009, - "grad_norm": 6.803304195404053, - "learning_rate": 3.3813049451739642e-06, - "loss": 0.4526, - "mean_token_accuracy": 0.8515973404049874, - "num_tokens": 217257128.0, - "step": 180580 - }, - { - "entropy": 1.841621221601963, - "epoch": 0.5598131992724505, - "grad_norm": 3.8993310928344727, - "learning_rate": 3.381211325082046e-06, - "loss": 0.433, - "mean_token_accuracy": 0.853488278388977, - "num_tokens": 217271173.0, - "step": 180590 - }, - { - "entropy": 1.8540022999048233, - "epoch": 0.5598441983975002, - "grad_norm": 7.256168365478516, - "learning_rate": 3.3811177127660346e-06, - "loss": 0.4081, - "mean_token_accuracy": 0.8576511323451996, - "num_tokens": 217284025.0, - "step": 180600 - }, - { - "entropy": 1.8471224382519722, - "epoch": 0.5598751975225499, - "grad_norm": 7.620468616485596, - "learning_rate": 3.381024108224852e-06, - "loss": 0.4046, - "mean_token_accuracy": 0.8556807443499566, - "num_tokens": 217296298.0, - "step": 180610 - }, - { - "entropy": 1.8334318205714226, - "epoch": 0.5599061966475997, - "grad_norm": 9.344206809997559, - "learning_rate": 3.380930511457423e-06, - "loss": 0.401, - "mean_token_accuracy": 0.8624182164669036, - "num_tokens": 217309050.0, - "step": 180620 - }, - { - "entropy": 1.9678836941719056, - "epoch": 0.5599371957726493, - "grad_norm": 8.84647274017334, - "learning_rate": 3.3808369224626708e-06, - "loss": 0.5047, - "mean_token_accuracy": 0.8477346986532212, - "num_tokens": 217319744.0, - "step": 180630 - }, - { - "entropy": 1.8432461515069007, - "epoch": 0.559968194897699, - "grad_norm": 8.65550708770752, - "learning_rate": 3.3807433412395207e-06, - "loss": 0.3991, - "mean_token_accuracy": 0.8575503960251808, - "num_tokens": 217332550.0, - "step": 180640 - }, - { - "entropy": 1.9450537994503976, - "epoch": 0.5599991940227487, - "grad_norm": 8.20946979522705, - "learning_rate": 3.380649767786897e-06, - "loss": 0.4791, - "mean_token_accuracy": 0.8566359907388688, - "num_tokens": 217343970.0, - "step": 180650 - }, - { - "entropy": 1.8421220853924751, - "epoch": 0.5600301931477984, - "grad_norm": 4.591266632080078, - "learning_rate": 3.3805562021037243e-06, - "loss": 0.4047, - "mean_token_accuracy": 0.8594477117061615, - "num_tokens": 217356623.0, - "step": 180660 - }, - { - "entropy": 1.8869489043951035, - "epoch": 0.5600611922728481, - "grad_norm": 9.574021339416504, - "learning_rate": 3.3804626441889266e-06, - "loss": 0.4696, - "mean_token_accuracy": 0.8554420277476311, - "num_tokens": 217368615.0, - "step": 180670 - }, - { - "entropy": 1.9096188768744469, - "epoch": 0.5600921913978978, - "grad_norm": 9.077582359313965, - "learning_rate": 3.38036909404143e-06, - "loss": 0.4591, - "mean_token_accuracy": 0.8512759923934936, - "num_tokens": 217380573.0, - "step": 180680 - }, - { - "entropy": 1.9089935168623924, - "epoch": 0.5601231905229475, - "grad_norm": 4.384575366973877, - "learning_rate": 3.3802755516601593e-06, - "loss": 0.4301, - "mean_token_accuracy": 0.8570929080247879, - "num_tokens": 217391703.0, - "step": 180690 - }, - { - "entropy": 1.8445791646838188, - "epoch": 0.5601541896479972, - "grad_norm": 3.831366777420044, - "learning_rate": 3.3801820170440408e-06, - "loss": 0.4018, - "mean_token_accuracy": 0.8612964913249016, - "num_tokens": 217404339.0, - "step": 180700 - }, - { - "entropy": 1.796808883547783, - "epoch": 0.5601851887730469, - "grad_norm": 5.263000011444092, - "learning_rate": 3.380088490191999e-06, - "loss": 0.3628, - "mean_token_accuracy": 0.8694143995642662, - "num_tokens": 217418163.0, - "step": 180710 - }, - { - "entropy": 1.848763944208622, - "epoch": 0.5602161878980966, - "grad_norm": 8.649629592895508, - "learning_rate": 3.3799949711029606e-06, - "loss": 0.4028, - "mean_token_accuracy": 0.8593548595905304, - "num_tokens": 217430946.0, - "step": 180720 - }, - { - "entropy": 1.7809420630335808, - "epoch": 0.5602471870231462, - "grad_norm": 8.454540252685547, - "learning_rate": 3.3799014597758516e-06, - "loss": 0.3918, - "mean_token_accuracy": 0.8662583023309708, - "num_tokens": 217444104.0, - "step": 180730 - }, - { - "entropy": 1.962505580484867, - "epoch": 0.560278186148196, - "grad_norm": 9.19823932647705, - "learning_rate": 3.379807956209598e-06, - "loss": 0.519, - "mean_token_accuracy": 0.8412714377045631, - "num_tokens": 217455742.0, - "step": 180740 - }, - { - "entropy": 1.9442083448171616, - "epoch": 0.5603091852732457, - "grad_norm": 8.571081161499023, - "learning_rate": 3.3797144604031275e-06, - "loss": 0.488, - "mean_token_accuracy": 0.8483389243483543, - "num_tokens": 217467004.0, - "step": 180750 - }, - { - "entropy": 1.926881869137287, - "epoch": 0.5603401843982954, - "grad_norm": 8.731983184814453, - "learning_rate": 3.379620972355366e-06, - "loss": 0.4925, - "mean_token_accuracy": 0.8496791407465935, - "num_tokens": 217478640.0, - "step": 180760 - }, - { - "entropy": 1.9326330974698067, - "epoch": 0.560371183523345, - "grad_norm": 8.762312889099121, - "learning_rate": 3.37952749206524e-06, - "loss": 0.4452, - "mean_token_accuracy": 0.8533821493387223, - "num_tokens": 217490602.0, - "step": 180770 - }, - { - "entropy": 1.9303222298622131, - "epoch": 0.5604021826483947, - "grad_norm": 3.5224175453186035, - "learning_rate": 3.3794340195316775e-06, - "loss": 0.4442, - "mean_token_accuracy": 0.8604114964604378, - "num_tokens": 217501995.0, - "step": 180780 - }, - { - "entropy": 1.9192448794841765, - "epoch": 0.5604331817734445, - "grad_norm": 7.257331371307373, - "learning_rate": 3.3793405547536056e-06, - "loss": 0.4342, - "mean_token_accuracy": 0.8658577457070351, - "num_tokens": 217512568.0, - "step": 180790 - }, - { - "entropy": 1.8068484604358672, - "epoch": 0.5604641808984941, - "grad_norm": 4.195193290710449, - "learning_rate": 3.3792470977299516e-06, - "loss": 0.3891, - "mean_token_accuracy": 0.863228565454483, - "num_tokens": 217525201.0, - "step": 180800 - }, - { - "entropy": 1.9096578344702722, - "epoch": 0.5604951800235438, - "grad_norm": 5.258492469787598, - "learning_rate": 3.3791536484596447e-06, - "loss": 0.4536, - "mean_token_accuracy": 0.8544479146599769, - "num_tokens": 217537342.0, - "step": 180810 - }, - { - "entropy": 1.843989658355713, - "epoch": 0.5605261791485935, - "grad_norm": 7.95306921005249, - "learning_rate": 3.3790602069416108e-06, - "loss": 0.4277, - "mean_token_accuracy": 0.8533659532666207, - "num_tokens": 217549457.0, - "step": 180820 - }, - { - "entropy": 1.8372595831751823, - "epoch": 0.5605571782736433, - "grad_norm": 9.44124698638916, - "learning_rate": 3.3789667731747796e-06, - "loss": 0.4008, - "mean_token_accuracy": 0.8592608660459519, - "num_tokens": 217561678.0, - "step": 180830 - }, - { - "entropy": 1.9358297988772393, - "epoch": 0.5605881773986929, - "grad_norm": 8.502898216247559, - "learning_rate": 3.3788733471580787e-06, - "loss": 0.5051, - "mean_token_accuracy": 0.8386871755123139, - "num_tokens": 217573016.0, - "step": 180840 - }, - { - "entropy": 1.8033106908202172, - "epoch": 0.5606191765237426, - "grad_norm": 4.1827778816223145, - "learning_rate": 3.3787799288904372e-06, - "loss": 0.4344, - "mean_token_accuracy": 0.8475642517209053, - "num_tokens": 217585990.0, - "step": 180850 - }, - { - "entropy": 1.8953222632408142, - "epoch": 0.5606501756487923, - "grad_norm": 9.73503303527832, - "learning_rate": 3.378686518370784e-06, - "loss": 0.4351, - "mean_token_accuracy": 0.8606355428695679, - "num_tokens": 217598445.0, - "step": 180860 - }, - { - "entropy": 1.845881848037243, - "epoch": 0.560681174773842, - "grad_norm": 4.5580596923828125, - "learning_rate": 3.378593115598048e-06, - "loss": 0.4545, - "mean_token_accuracy": 0.8616230577230454, - "num_tokens": 217611053.0, - "step": 180870 - }, - { - "entropy": 1.9507944285869598, - "epoch": 0.5607121738988917, - "grad_norm": 7.940414905548096, - "learning_rate": 3.3784997205711583e-06, - "loss": 0.4911, - "mean_token_accuracy": 0.8515224680304527, - "num_tokens": 217621744.0, - "step": 180880 - }, - { - "entropy": 1.885131499171257, - "epoch": 0.5607431730239414, - "grad_norm": 6.840700149536133, - "learning_rate": 3.378406333289044e-06, - "loss": 0.417, - "mean_token_accuracy": 0.8618391931056977, - "num_tokens": 217633886.0, - "step": 180890 - }, - { - "entropy": 1.8611220195889473, - "epoch": 0.5607741721489911, - "grad_norm": 9.760030746459961, - "learning_rate": 3.378312953750636e-06, - "loss": 0.4317, - "mean_token_accuracy": 0.855574083328247, - "num_tokens": 217646166.0, - "step": 180900 - }, - { - "entropy": 1.889116460084915, - "epoch": 0.5608051712740408, - "grad_norm": 6.201332092285156, - "learning_rate": 3.378219581954863e-06, - "loss": 0.3904, - "mean_token_accuracy": 0.862904566526413, - "num_tokens": 217659044.0, - "step": 180910 - }, - { - "entropy": 1.866611397266388, - "epoch": 0.5608361703990905, - "grad_norm": 7.196857452392578, - "learning_rate": 3.3781262179006557e-06, - "loss": 0.4999, - "mean_token_accuracy": 0.8469146594405175, - "num_tokens": 217671556.0, - "step": 180920 - }, - { - "entropy": 1.9216289982199668, - "epoch": 0.5608671695241402, - "grad_norm": 8.19442367553711, - "learning_rate": 3.3780328615869445e-06, - "loss": 0.4755, - "mean_token_accuracy": 0.8460784748196601, - "num_tokens": 217683298.0, - "step": 180930 - }, - { - "entropy": 1.8504672214388846, - "epoch": 0.5608981686491898, - "grad_norm": 8.605911254882812, - "learning_rate": 3.3779395130126593e-06, - "loss": 0.4582, - "mean_token_accuracy": 0.8548540785908699, - "num_tokens": 217695959.0, - "step": 180940 - }, - { - "entropy": 1.8894923403859138, - "epoch": 0.5609291677742396, - "grad_norm": 6.890311241149902, - "learning_rate": 3.3778461721767307e-06, - "loss": 0.386, - "mean_token_accuracy": 0.8696539297699928, - "num_tokens": 217708357.0, - "step": 180950 - }, - { - "entropy": 1.98569797873497, - "epoch": 0.5609601668992893, - "grad_norm": 8.540229797363281, - "learning_rate": 3.3777528390780907e-06, - "loss": 0.5213, - "mean_token_accuracy": 0.8437126785516739, - "num_tokens": 217718928.0, - "step": 180960 - }, - { - "entropy": 1.8385651513934136, - "epoch": 0.560991166024339, - "grad_norm": 7.91072416305542, - "learning_rate": 3.3776595137156694e-06, - "loss": 0.4382, - "mean_token_accuracy": 0.8596760660409928, - "num_tokens": 217731229.0, - "step": 180970 - }, - { - "entropy": 1.920003080368042, - "epoch": 0.5610221651493886, - "grad_norm": 8.425142288208008, - "learning_rate": 3.3775661960883983e-06, - "loss": 0.4694, - "mean_token_accuracy": 0.8521833911538124, - "num_tokens": 217742639.0, - "step": 180980 - }, - { - "entropy": 1.8847643241286278, - "epoch": 0.5610531642744383, - "grad_norm": 7.655343055725098, - "learning_rate": 3.3774728861952093e-06, - "loss": 0.4288, - "mean_token_accuracy": 0.8574874952435494, - "num_tokens": 217755303.0, - "step": 180990 - }, - { - "entropy": 1.8867151036858558, - "epoch": 0.5610841633994881, - "grad_norm": 7.4873552322387695, - "learning_rate": 3.377379584035034e-06, - "loss": 0.394, - "mean_token_accuracy": 0.8581796258687973, - "num_tokens": 217767553.0, - "step": 181000 - }, - { - "entropy": 1.9579679414629936, - "epoch": 0.5611151625245377, - "grad_norm": 8.260603904724121, - "learning_rate": 3.3772862896068038e-06, - "loss": 0.4768, - "mean_token_accuracy": 0.8498960748314858, - "num_tokens": 217778968.0, - "step": 181010 - }, - { - "entropy": 1.9273895144462585, - "epoch": 0.5611461616495874, - "grad_norm": 7.966805934906006, - "learning_rate": 3.377193002909452e-06, - "loss": 0.4217, - "mean_token_accuracy": 0.8586221739649773, - "num_tokens": 217789896.0, - "step": 181020 - }, - { - "entropy": 1.907166676223278, - "epoch": 0.5611771607746371, - "grad_norm": 7.4472126960754395, - "learning_rate": 3.3770997239419097e-06, - "loss": 0.4468, - "mean_token_accuracy": 0.8602954059839248, - "num_tokens": 217801388.0, - "step": 181030 - }, - { - "entropy": 1.807604393362999, - "epoch": 0.5612081598996869, - "grad_norm": 7.787635326385498, - "learning_rate": 3.37700645270311e-06, - "loss": 0.3718, - "mean_token_accuracy": 0.868038372695446, - "num_tokens": 217814403.0, - "step": 181040 - }, - { - "entropy": 1.854034560918808, - "epoch": 0.5612391590247365, - "grad_norm": 3.2894787788391113, - "learning_rate": 3.376913189191986e-06, - "loss": 0.4321, - "mean_token_accuracy": 0.8540055334568024, - "num_tokens": 217826856.0, - "step": 181050 - }, - { - "entropy": 1.7675895690917969, - "epoch": 0.5612701581497862, - "grad_norm": 7.049602031707764, - "learning_rate": 3.37681993340747e-06, - "loss": 0.3731, - "mean_token_accuracy": 0.868474094569683, - "num_tokens": 217840044.0, - "step": 181060 - }, - { - "entropy": 1.8834607109427453, - "epoch": 0.5613011572748359, - "grad_norm": 4.7856645584106445, - "learning_rate": 3.376726685348496e-06, - "loss": 0.4672, - "mean_token_accuracy": 0.850586649775505, - "num_tokens": 217852080.0, - "step": 181070 - }, - { - "entropy": 1.8780501514673233, - "epoch": 0.5613321563998857, - "grad_norm": 9.758124351501465, - "learning_rate": 3.3766334450139965e-06, - "loss": 0.4613, - "mean_token_accuracy": 0.8537691235542297, - "num_tokens": 217863129.0, - "step": 181080 - }, - { - "entropy": 1.8639142349362374, - "epoch": 0.5613631555249353, - "grad_norm": 6.5365681648254395, - "learning_rate": 3.3765402124029056e-06, - "loss": 0.4419, - "mean_token_accuracy": 0.8505441218614578, - "num_tokens": 217875833.0, - "step": 181090 - }, - { - "entropy": 1.8525004491209984, - "epoch": 0.561394154649985, - "grad_norm": 9.217967987060547, - "learning_rate": 3.376446987514157e-06, - "loss": 0.4707, - "mean_token_accuracy": 0.8587792262434959, - "num_tokens": 217887693.0, - "step": 181100 - }, - { - "entropy": 1.8868872478604317, - "epoch": 0.5614251537750347, - "grad_norm": 7.5307536125183105, - "learning_rate": 3.3763537703466853e-06, - "loss": 0.4419, - "mean_token_accuracy": 0.8458724349737168, - "num_tokens": 217900013.0, - "step": 181110 - }, - { - "entropy": 1.9061895191669465, - "epoch": 0.5614561529000844, - "grad_norm": 8.436769485473633, - "learning_rate": 3.3762605608994243e-06, - "loss": 0.4541, - "mean_token_accuracy": 0.8492231115698814, - "num_tokens": 217911913.0, - "step": 181120 - }, - { - "entropy": 1.8444679781794548, - "epoch": 0.5614871520251341, - "grad_norm": 7.85495138168335, - "learning_rate": 3.3761673591713067e-06, - "loss": 0.4285, - "mean_token_accuracy": 0.8593740567564965, - "num_tokens": 217924423.0, - "step": 181130 - }, - { - "entropy": 1.8500046521425246, - "epoch": 0.5615181511501838, - "grad_norm": 7.943841457366943, - "learning_rate": 3.37607416516127e-06, - "loss": 0.4448, - "mean_token_accuracy": 0.8572930499911309, - "num_tokens": 217937018.0, - "step": 181140 - }, - { - "entropy": 1.817262691259384, - "epoch": 0.5615491502752334, - "grad_norm": 3.3246030807495117, - "learning_rate": 3.3759809788682475e-06, - "loss": 0.4198, - "mean_token_accuracy": 0.8568054154515267, - "num_tokens": 217949942.0, - "step": 181150 - }, - { - "entropy": 1.8344750300049781, - "epoch": 0.5615801494002832, - "grad_norm": 8.242709159851074, - "learning_rate": 3.375887800291174e-06, - "loss": 0.3961, - "mean_token_accuracy": 0.8631076112389564, - "num_tokens": 217962691.0, - "step": 181160 - }, - { - "entropy": 1.7523364052176476, - "epoch": 0.5616111485253329, - "grad_norm": 3.873065233230591, - "learning_rate": 3.375794629428986e-06, - "loss": 0.395, - "mean_token_accuracy": 0.8638006061315536, - "num_tokens": 217976123.0, - "step": 181170 - }, - { - "entropy": 1.8123833030462264, - "epoch": 0.5616421476503826, - "grad_norm": 7.738763332366943, - "learning_rate": 3.3757014662806175e-06, - "loss": 0.3936, - "mean_token_accuracy": 0.8730161666870118, - "num_tokens": 217987749.0, - "step": 181180 - }, - { - "entropy": 1.8182844251394272, - "epoch": 0.5616731467754322, - "grad_norm": 8.23554801940918, - "learning_rate": 3.375608310845005e-06, - "loss": 0.3712, - "mean_token_accuracy": 0.8598535865545273, - "num_tokens": 218000711.0, - "step": 181190 - }, - { - "entropy": 1.8626601822674274, - "epoch": 0.561704145900482, - "grad_norm": 8.313724517822266, - "learning_rate": 3.3755151631210835e-06, - "loss": 0.4396, - "mean_token_accuracy": 0.8639240145683289, - "num_tokens": 218012655.0, - "step": 181200 - }, - { - "entropy": 1.8847302973270417, - "epoch": 0.5617351450255317, - "grad_norm": 3.391298532485962, - "learning_rate": 3.3754220231077907e-06, - "loss": 0.4619, - "mean_token_accuracy": 0.8537727981805802, - "num_tokens": 218024210.0, - "step": 181210 - }, - { - "entropy": 1.8442262202501296, - "epoch": 0.5617661441505813, - "grad_norm": 8.985638618469238, - "learning_rate": 3.3753288908040614e-06, - "loss": 0.4286, - "mean_token_accuracy": 0.8595643252134323, - "num_tokens": 218035577.0, - "step": 181220 - }, - { - "entropy": 1.8500028520822525, - "epoch": 0.561797143275631, - "grad_norm": 4.673086643218994, - "learning_rate": 3.3752357662088324e-06, - "loss": 0.4176, - "mean_token_accuracy": 0.8605799928307534, - "num_tokens": 218047979.0, - "step": 181230 - }, - { - "entropy": 1.8613388493657113, - "epoch": 0.5618281424006807, - "grad_norm": 8.323710441589355, - "learning_rate": 3.375142649321041e-06, - "loss": 0.4013, - "mean_token_accuracy": 0.863029134273529, - "num_tokens": 218059650.0, - "step": 181240 - }, - { - "entropy": 1.8575000897049905, - "epoch": 0.5618591415257305, - "grad_norm": 8.198358535766602, - "learning_rate": 3.3750495401396232e-06, - "loss": 0.3981, - "mean_token_accuracy": 0.866016274690628, - "num_tokens": 218071502.0, - "step": 181250 - }, - { - "entropy": 1.8431531935930252, - "epoch": 0.5618901406507801, - "grad_norm": 7.122215747833252, - "learning_rate": 3.3749564386635165e-06, - "loss": 0.425, - "mean_token_accuracy": 0.8529547974467278, - "num_tokens": 218083432.0, - "step": 181260 - }, - { - "entropy": 1.8526543870568275, - "epoch": 0.5619211397758298, - "grad_norm": 9.108004570007324, - "learning_rate": 3.374863344891659e-06, - "loss": 0.4862, - "mean_token_accuracy": 0.8487247556447983, - "num_tokens": 218095552.0, - "step": 181270 - }, - { - "entropy": 1.812180995941162, - "epoch": 0.5619521389008795, - "grad_norm": 7.413666725158691, - "learning_rate": 3.3747702588229863e-06, - "loss": 0.422, - "mean_token_accuracy": 0.8599738642573357, - "num_tokens": 218108445.0, - "step": 181280 - }, - { - "entropy": 1.8726341724395752, - "epoch": 0.5619831380259293, - "grad_norm": 7.884952068328857, - "learning_rate": 3.3746771804564375e-06, - "loss": 0.4402, - "mean_token_accuracy": 0.8589571207761765, - "num_tokens": 218119972.0, - "step": 181290 - }, - { - "entropy": 1.9018690153956412, - "epoch": 0.5620141371509789, - "grad_norm": 7.468959331512451, - "learning_rate": 3.3745841097909506e-06, - "loss": 0.4528, - "mean_token_accuracy": 0.8478945583105088, - "num_tokens": 218131689.0, - "step": 181300 - }, - { - "entropy": 1.8484178960323334, - "epoch": 0.5620451362760286, - "grad_norm": 8.958715438842773, - "learning_rate": 3.374491046825463e-06, - "loss": 0.4233, - "mean_token_accuracy": 0.8532816350460053, - "num_tokens": 218144676.0, - "step": 181310 - }, - { - "entropy": 1.7837095826864242, - "epoch": 0.5620761354010783, - "grad_norm": 3.5719587802886963, - "learning_rate": 3.3743979915589137e-06, - "loss": 0.4143, - "mean_token_accuracy": 0.8583781942725182, - "num_tokens": 218158190.0, - "step": 181320 - }, - { - "entropy": 1.8700577557086944, - "epoch": 0.562107134526128, - "grad_norm": 7.927095890045166, - "learning_rate": 3.3743049439902402e-06, - "loss": 0.4154, - "mean_token_accuracy": 0.8564609229564667, - "num_tokens": 218170560.0, - "step": 181330 - }, - { - "entropy": 1.9219986885786056, - "epoch": 0.5621381336511777, - "grad_norm": 6.3925065994262695, - "learning_rate": 3.3742119041183824e-06, - "loss": 0.4689, - "mean_token_accuracy": 0.8569181904196739, - "num_tokens": 218181816.0, - "step": 181340 - }, - { - "entropy": 1.8808184131979941, - "epoch": 0.5621691327762274, - "grad_norm": 6.876534938812256, - "learning_rate": 3.3741188719422784e-06, - "loss": 0.407, - "mean_token_accuracy": 0.8688705995678901, - "num_tokens": 218193715.0, - "step": 181350 - }, - { - "entropy": 1.8285144343972206, - "epoch": 0.562200131901277, - "grad_norm": 10.440187454223633, - "learning_rate": 3.3740258474608677e-06, - "loss": 0.3971, - "mean_token_accuracy": 0.8620707169175148, - "num_tokens": 218205927.0, - "step": 181360 - }, - { - "entropy": 1.7766144782304765, - "epoch": 0.5622311310263268, - "grad_norm": 7.924615859985352, - "learning_rate": 3.373932830673091e-06, - "loss": 0.3888, - "mean_token_accuracy": 0.8614334642887116, - "num_tokens": 218220052.0, - "step": 181370 - }, - { - "entropy": 1.757065513730049, - "epoch": 0.5622621301513765, - "grad_norm": 4.278183937072754, - "learning_rate": 3.3738398215778845e-06, - "loss": 0.3149, - "mean_token_accuracy": 0.874184074997902, - "num_tokens": 218233698.0, - "step": 181380 - }, - { - "entropy": 1.723100933432579, - "epoch": 0.5622931292764262, - "grad_norm": 3.8255221843719482, - "learning_rate": 3.3737468201741915e-06, - "loss": 0.3361, - "mean_token_accuracy": 0.8724021464586258, - "num_tokens": 218248320.0, - "step": 181390 - }, - { - "entropy": 1.7919250458478928, - "epoch": 0.5623241284014758, - "grad_norm": 8.5140962600708, - "learning_rate": 3.3736538264609485e-06, - "loss": 0.3807, - "mean_token_accuracy": 0.861699141561985, - "num_tokens": 218260949.0, - "step": 181400 - }, - { - "entropy": 1.9182514041662215, - "epoch": 0.5623551275265256, - "grad_norm": 7.7248053550720215, - "learning_rate": 3.3735608404370995e-06, - "loss": 0.4438, - "mean_token_accuracy": 0.8568162053823472, - "num_tokens": 218272641.0, - "step": 181410 - }, - { - "entropy": 1.8269777074456215, - "epoch": 0.5623861266515753, - "grad_norm": 8.181014060974121, - "learning_rate": 3.373467862101582e-06, - "loss": 0.3907, - "mean_token_accuracy": 0.8538804024457931, - "num_tokens": 218285341.0, - "step": 181420 - }, - { - "entropy": 1.8285759806632995, - "epoch": 0.562417125776625, - "grad_norm": 8.884249687194824, - "learning_rate": 3.373374891453338e-06, - "loss": 0.3753, - "mean_token_accuracy": 0.8638269484043122, - "num_tokens": 218297741.0, - "step": 181430 - }, - { - "entropy": 1.8283386200666427, - "epoch": 0.5624481249016746, - "grad_norm": 9.47203540802002, - "learning_rate": 3.373281928491307e-06, - "loss": 0.4152, - "mean_token_accuracy": 0.8587843701243401, - "num_tokens": 218309967.0, - "step": 181440 - }, - { - "entropy": 1.9383835330605508, - "epoch": 0.5624791240267244, - "grad_norm": 3.79379940032959, - "learning_rate": 3.3731889732144313e-06, - "loss": 0.4411, - "mean_token_accuracy": 0.8580824270844459, - "num_tokens": 218321382.0, - "step": 181450 - }, - { - "entropy": 1.9270856872200965, - "epoch": 0.5625101231517741, - "grad_norm": 3.337674140930176, - "learning_rate": 3.373096025621651e-06, - "loss": 0.5043, - "mean_token_accuracy": 0.8432699516415596, - "num_tokens": 218332602.0, - "step": 181460 - }, - { - "entropy": 1.7900800183415413, - "epoch": 0.5625411222768237, - "grad_norm": 10.64853572845459, - "learning_rate": 3.3730030857119085e-06, - "loss": 0.396, - "mean_token_accuracy": 0.8590823590755463, - "num_tokens": 218345274.0, - "step": 181470 - }, - { - "entropy": 1.8173652663826942, - "epoch": 0.5625721214018734, - "grad_norm": 8.35346794128418, - "learning_rate": 3.3729101534841448e-06, - "loss": 0.4087, - "mean_token_accuracy": 0.858729963004589, - "num_tokens": 218358082.0, - "step": 181480 - }, - { - "entropy": 1.910238166153431, - "epoch": 0.5626031205269231, - "grad_norm": 8.957452774047852, - "learning_rate": 3.3728172289373016e-06, - "loss": 0.4586, - "mean_token_accuracy": 0.8480331093072891, - "num_tokens": 218368843.0, - "step": 181490 - }, - { - "entropy": 1.9071012750267982, - "epoch": 0.5626341196519729, - "grad_norm": 9.414094924926758, - "learning_rate": 3.372724312070321e-06, - "loss": 0.4599, - "mean_token_accuracy": 0.856868302822113, - "num_tokens": 218380607.0, - "step": 181500 - }, - { - "entropy": 1.811684738099575, - "epoch": 0.5626651187770225, - "grad_norm": 3.4954662322998047, - "learning_rate": 3.372631402882146e-06, - "loss": 0.3706, - "mean_token_accuracy": 0.860798105597496, - "num_tokens": 218393372.0, - "step": 181510 - }, - { - "entropy": 1.7179926961660386, - "epoch": 0.5626961179020722, - "grad_norm": 3.3773441314697266, - "learning_rate": 3.3725385013717184e-06, - "loss": 0.3536, - "mean_token_accuracy": 0.8678355023264885, - "num_tokens": 218408589.0, - "step": 181520 - }, - { - "entropy": 1.8567181393504142, - "epoch": 0.5627271170271219, - "grad_norm": 9.402314186096191, - "learning_rate": 3.3724456075379795e-06, - "loss": 0.4668, - "mean_token_accuracy": 0.8574750825762749, - "num_tokens": 218420524.0, - "step": 181530 - }, - { - "entropy": 1.9640752226114273, - "epoch": 0.5627581161521716, - "grad_norm": 6.7787580490112305, - "learning_rate": 3.3723527213798744e-06, - "loss": 0.4768, - "mean_token_accuracy": 0.850230510532856, - "num_tokens": 218431700.0, - "step": 181540 - }, - { - "entropy": 1.846253375709057, - "epoch": 0.5627891152772213, - "grad_norm": 10.096291542053223, - "learning_rate": 3.3722598428963448e-06, - "loss": 0.4509, - "mean_token_accuracy": 0.8543033003807068, - "num_tokens": 218444142.0, - "step": 181550 - }, - { - "entropy": 1.9752780228853226, - "epoch": 0.562820114402271, - "grad_norm": 6.870584964752197, - "learning_rate": 3.3721669720863344e-06, - "loss": 0.4711, - "mean_token_accuracy": 0.8470905661582947, - "num_tokens": 218455101.0, - "step": 181560 - }, - { - "entropy": 1.8309684470295906, - "epoch": 0.5628511135273206, - "grad_norm": 12.073573112487793, - "learning_rate": 3.372074108948786e-06, - "loss": 0.3821, - "mean_token_accuracy": 0.8653752103447914, - "num_tokens": 218467013.0, - "step": 181570 - }, - { - "entropy": 1.899401769042015, - "epoch": 0.5628821126523704, - "grad_norm": 7.853172302246094, - "learning_rate": 3.3719812534826446e-06, - "loss": 0.3908, - "mean_token_accuracy": 0.8672493144869804, - "num_tokens": 218478180.0, - "step": 181580 - }, - { - "entropy": 1.851992353796959, - "epoch": 0.5629131117774201, - "grad_norm": 9.056402206420898, - "learning_rate": 3.3718884056868523e-06, - "loss": 0.4164, - "mean_token_accuracy": 0.859456042945385, - "num_tokens": 218489874.0, - "step": 181590 - }, - { - "entropy": 1.8867039635777474, - "epoch": 0.5629441109024698, - "grad_norm": 8.47264289855957, - "learning_rate": 3.371795565560354e-06, - "loss": 0.4586, - "mean_token_accuracy": 0.851148608326912, - "num_tokens": 218501597.0, - "step": 181600 - }, - { - "entropy": 1.913269890844822, - "epoch": 0.5629751100275194, - "grad_norm": 9.350337028503418, - "learning_rate": 3.371702733102094e-06, - "loss": 0.4597, - "mean_token_accuracy": 0.8451100900769234, - "num_tokens": 218513013.0, - "step": 181610 - }, - { - "entropy": 1.8536349445581437, - "epoch": 0.5630061091525692, - "grad_norm": 8.868657112121582, - "learning_rate": 3.3716099083110165e-06, - "loss": 0.4204, - "mean_token_accuracy": 0.856175285577774, - "num_tokens": 218525416.0, - "step": 181620 - }, - { - "entropy": 1.7822051152586937, - "epoch": 0.5630371082776189, - "grad_norm": 6.670261383056641, - "learning_rate": 3.3715170911860665e-06, - "loss": 0.4111, - "mean_token_accuracy": 0.8628834947943688, - "num_tokens": 218538704.0, - "step": 181630 - }, - { - "entropy": 1.926338541507721, - "epoch": 0.5630681074026685, - "grad_norm": 7.397019863128662, - "learning_rate": 3.3714242817261888e-06, - "loss": 0.4562, - "mean_token_accuracy": 0.8480693429708481, - "num_tokens": 218549814.0, - "step": 181640 - }, - { - "entropy": 1.9475450232625007, - "epoch": 0.5630991065277182, - "grad_norm": 7.937694549560547, - "learning_rate": 3.3713314799303282e-06, - "loss": 0.4916, - "mean_token_accuracy": 0.8462337568402291, - "num_tokens": 218561432.0, - "step": 181650 - }, - { - "entropy": 1.9426579788327216, - "epoch": 0.563130105652768, - "grad_norm": 3.288832664489746, - "learning_rate": 3.3712386857974295e-06, - "loss": 0.4888, - "mean_token_accuracy": 0.8500252008438111, - "num_tokens": 218572081.0, - "step": 181660 - }, - { - "entropy": 1.9244334518909454, - "epoch": 0.5631611047778177, - "grad_norm": 8.894190788269043, - "learning_rate": 3.3711458993264397e-06, - "loss": 0.4831, - "mean_token_accuracy": 0.8501438573002815, - "num_tokens": 218582749.0, - "step": 181670 - }, - { - "entropy": 1.7644945442676545, - "epoch": 0.5631921039028673, - "grad_norm": 8.007524490356445, - "learning_rate": 3.3710531205163026e-06, - "loss": 0.378, - "mean_token_accuracy": 0.871790862083435, - "num_tokens": 218596221.0, - "step": 181680 - }, - { - "entropy": 1.8322599783539772, - "epoch": 0.563223103027917, - "grad_norm": 8.165786743164062, - "learning_rate": 3.370960349365965e-06, - "loss": 0.4591, - "mean_token_accuracy": 0.8571014538407326, - "num_tokens": 218609139.0, - "step": 181690 - }, - { - "entropy": 1.8685690701007842, - "epoch": 0.5632541021529668, - "grad_norm": 7.681282043457031, - "learning_rate": 3.3708675858743733e-06, - "loss": 0.4513, - "mean_token_accuracy": 0.8578503727912903, - "num_tokens": 218621107.0, - "step": 181700 - }, - { - "entropy": 1.823290081322193, - "epoch": 0.5632851012780165, - "grad_norm": 8.989456176757812, - "learning_rate": 3.370774830040473e-06, - "loss": 0.3913, - "mean_token_accuracy": 0.8660375028848648, - "num_tokens": 218633534.0, - "step": 181710 - }, - { - "entropy": 1.8917041584849357, - "epoch": 0.5633161004030661, - "grad_norm": 8.177289009094238, - "learning_rate": 3.370682081863211e-06, - "loss": 0.4436, - "mean_token_accuracy": 0.8619986966252327, - "num_tokens": 218644523.0, - "step": 181720 - }, - { - "entropy": 1.8801601991057395, - "epoch": 0.5633470995281158, - "grad_norm": 3.5459909439086914, - "learning_rate": 3.370589341341534e-06, - "loss": 0.4332, - "mean_token_accuracy": 0.8533681690692901, - "num_tokens": 218656058.0, - "step": 181730 - }, - { - "entropy": 1.8284826904535294, - "epoch": 0.5633780986531655, - "grad_norm": 7.410562992095947, - "learning_rate": 3.3704966084743894e-06, - "loss": 0.415, - "mean_token_accuracy": 0.8601794943213463, - "num_tokens": 218668031.0, - "step": 181740 - }, - { - "entropy": 1.8929321303963662, - "epoch": 0.5634090977782152, - "grad_norm": 7.5576605796813965, - "learning_rate": 3.3704038832607233e-06, - "loss": 0.4436, - "mean_token_accuracy": 0.8542150601744651, - "num_tokens": 218679679.0, - "step": 181750 - }, - { - "entropy": 1.8731725335121154, - "epoch": 0.5634400969032649, - "grad_norm": 8.853988647460938, - "learning_rate": 3.3703111656994835e-06, - "loss": 0.4276, - "mean_token_accuracy": 0.8578036665916443, - "num_tokens": 218691566.0, - "step": 181760 - }, - { - "entropy": 1.8713270604610444, - "epoch": 0.5634710960283146, - "grad_norm": 4.01953125, - "learning_rate": 3.3702184557896173e-06, - "loss": 0.4284, - "mean_token_accuracy": 0.8601006358861923, - "num_tokens": 218703686.0, - "step": 181770 - }, - { - "entropy": 1.8796763598918915, - "epoch": 0.5635020951533642, - "grad_norm": 7.775223255157471, - "learning_rate": 3.3701257535300722e-06, - "loss": 0.4516, - "mean_token_accuracy": 0.8497362047433853, - "num_tokens": 218715301.0, - "step": 181780 - }, - { - "entropy": 1.8186719685792923, - "epoch": 0.563533094278414, - "grad_norm": 3.402961015701294, - "learning_rate": 3.370033058919797e-06, - "loss": 0.4191, - "mean_token_accuracy": 0.8580001771450043, - "num_tokens": 218728361.0, - "step": 181790 - }, - { - "entropy": 1.8945069938898087, - "epoch": 0.5635640934034637, - "grad_norm": 7.426494598388672, - "learning_rate": 3.3699403719577394e-06, - "loss": 0.451, - "mean_token_accuracy": 0.850483974814415, - "num_tokens": 218739876.0, - "step": 181800 - }, - { - "entropy": 1.7792957812547683, - "epoch": 0.5635950925285134, - "grad_norm": 4.302509307861328, - "learning_rate": 3.369847692642847e-06, - "loss": 0.3832, - "mean_token_accuracy": 0.8662660971283913, - "num_tokens": 218753484.0, - "step": 181810 - }, - { - "entropy": 1.8422718912363052, - "epoch": 0.563626091653563, - "grad_norm": 2.8755502700805664, - "learning_rate": 3.369755020974068e-06, - "loss": 0.4281, - "mean_token_accuracy": 0.8658865958452224, - "num_tokens": 218765774.0, - "step": 181820 - }, - { - "entropy": 1.9335956647992134, - "epoch": 0.5636570907786128, - "grad_norm": 6.701143264770508, - "learning_rate": 3.3696623569503535e-06, - "loss": 0.4164, - "mean_token_accuracy": 0.8665258780121803, - "num_tokens": 218778083.0, - "step": 181830 - }, - { - "entropy": 1.9469329804182052, - "epoch": 0.5636880899036625, - "grad_norm": 10.032766342163086, - "learning_rate": 3.36956970057065e-06, - "loss": 0.4773, - "mean_token_accuracy": 0.8478762224316597, - "num_tokens": 218788844.0, - "step": 181840 - }, - { - "entropy": 1.8688429698348046, - "epoch": 0.5637190890287122, - "grad_norm": 6.791390895843506, - "learning_rate": 3.3694770518339077e-06, - "loss": 0.3792, - "mean_token_accuracy": 0.861931498348713, - "num_tokens": 218801226.0, - "step": 181850 - }, - { - "entropy": 1.9176680132746697, - "epoch": 0.5637500881537618, - "grad_norm": 7.465823173522949, - "learning_rate": 3.3693844107390755e-06, - "loss": 0.4407, - "mean_token_accuracy": 0.8572080507874489, - "num_tokens": 218813183.0, - "step": 181860 - }, - { - "entropy": 1.8121962711215018, - "epoch": 0.5637810872788116, - "grad_norm": 8.418926239013672, - "learning_rate": 3.369291777285103e-06, - "loss": 0.3942, - "mean_token_accuracy": 0.8700475439429283, - "num_tokens": 218825988.0, - "step": 181870 - }, - { - "entropy": 1.8737857460975647, - "epoch": 0.5638120864038613, - "grad_norm": 8.048589706420898, - "learning_rate": 3.36919915147094e-06, - "loss": 0.4066, - "mean_token_accuracy": 0.8601662442088127, - "num_tokens": 218837922.0, - "step": 181880 - }, - { - "entropy": 1.8621310248970986, - "epoch": 0.5638430855289109, - "grad_norm": 4.604039669036865, - "learning_rate": 3.369106533295537e-06, - "loss": 0.3964, - "mean_token_accuracy": 0.8758772403001786, - "num_tokens": 218849411.0, - "step": 181890 - }, - { - "entropy": 1.8950058773159981, - "epoch": 0.5638740846539606, - "grad_norm": 5.675747871398926, - "learning_rate": 3.3690139227578422e-06, - "loss": 0.4684, - "mean_token_accuracy": 0.8569332420825958, - "num_tokens": 218861088.0, - "step": 181900 - }, - { - "entropy": 1.8820589035749435, - "epoch": 0.5639050837790104, - "grad_norm": 7.437646389007568, - "learning_rate": 3.368921319856808e-06, - "loss": 0.4316, - "mean_token_accuracy": 0.8657960072159767, - "num_tokens": 218873158.0, - "step": 181910 - }, - { - "entropy": 1.8981757640838623, - "epoch": 0.56393608290406, - "grad_norm": 8.741206169128418, - "learning_rate": 3.3688287245913843e-06, - "loss": 0.4531, - "mean_token_accuracy": 0.8553640082478523, - "num_tokens": 218884537.0, - "step": 181920 - }, - { - "entropy": 1.835992193222046, - "epoch": 0.5639670820291097, - "grad_norm": 9.34242057800293, - "learning_rate": 3.3687361369605216e-06, - "loss": 0.4143, - "mean_token_accuracy": 0.8556068152189255, - "num_tokens": 218896869.0, - "step": 181930 - }, - { - "entropy": 1.9087290972471238, - "epoch": 0.5639980811541594, - "grad_norm": 9.419221878051758, - "learning_rate": 3.3686435569631716e-06, - "loss": 0.4621, - "mean_token_accuracy": 0.8489242225885392, - "num_tokens": 218908053.0, - "step": 181940 - }, - { - "entropy": 1.9714890956878661, - "epoch": 0.5640290802792092, - "grad_norm": 8.146427154541016, - "learning_rate": 3.3685509845982834e-06, - "loss": 0.544, - "mean_token_accuracy": 0.8375766724348068, - "num_tokens": 218919129.0, - "step": 181950 - }, - { - "entropy": 1.9061841890215874, - "epoch": 0.5640600794042588, - "grad_norm": 9.288558959960938, - "learning_rate": 3.3684584198648105e-06, - "loss": 0.4434, - "mean_token_accuracy": 0.8559746578335762, - "num_tokens": 218930350.0, - "step": 181960 - }, - { - "entropy": 1.8730715066194534, - "epoch": 0.5640910785293085, - "grad_norm": 7.72963285446167, - "learning_rate": 3.368365862761704e-06, - "loss": 0.416, - "mean_token_accuracy": 0.8536106586456299, - "num_tokens": 218942941.0, - "step": 181970 - }, - { - "entropy": 1.883928555250168, - "epoch": 0.5641220776543582, - "grad_norm": 10.418731689453125, - "learning_rate": 3.3682733132879146e-06, - "loss": 0.4637, - "mean_token_accuracy": 0.8562978118658066, - "num_tokens": 218953961.0, - "step": 181980 - }, - { - "entropy": 1.896950177848339, - "epoch": 0.5641530767794078, - "grad_norm": 7.440096378326416, - "learning_rate": 3.368180771442396e-06, - "loss": 0.4428, - "mean_token_accuracy": 0.8484185233712196, - "num_tokens": 218965790.0, - "step": 181990 - }, - { - "entropy": 1.8978441506624222, - "epoch": 0.5641840759044576, - "grad_norm": 8.00220012664795, - "learning_rate": 3.3680882372240982e-06, - "loss": 0.4073, - "mean_token_accuracy": 0.8625353053212166, - "num_tokens": 218977796.0, - "step": 182000 - }, - { - "entropy": 1.9124243408441544, - "epoch": 0.5642150750295073, - "grad_norm": 7.3425612449646, - "learning_rate": 3.3679957106319743e-06, - "loss": 0.4538, - "mean_token_accuracy": 0.8619111463427543, - "num_tokens": 218988584.0, - "step": 182010 - }, - { - "entropy": 1.8750930547714233, - "epoch": 0.564246074154557, - "grad_norm": 8.522233963012695, - "learning_rate": 3.367903191664978e-06, - "loss": 0.3983, - "mean_token_accuracy": 0.8666472569108009, - "num_tokens": 219000591.0, - "step": 182020 - }, - { - "entropy": 1.7184800058603287, - "epoch": 0.5642770732796066, - "grad_norm": 3.9416251182556152, - "learning_rate": 3.3678106803220605e-06, - "loss": 0.3806, - "mean_token_accuracy": 0.8791543394327164, - "num_tokens": 219014594.0, - "step": 182030 - }, - { - "entropy": 1.8457399889826775, - "epoch": 0.5643080724046564, - "grad_norm": 4.241663455963135, - "learning_rate": 3.367718176602176e-06, - "loss": 0.44, - "mean_token_accuracy": 0.8577973261475563, - "num_tokens": 219026319.0, - "step": 182040 - }, - { - "entropy": 1.8218046829104424, - "epoch": 0.5643390715297061, - "grad_norm": 7.572890758514404, - "learning_rate": 3.3676256805042766e-06, - "loss": 0.4048, - "mean_token_accuracy": 0.8654015228152275, - "num_tokens": 219038716.0, - "step": 182050 - }, - { - "entropy": 1.9060118064284324, - "epoch": 0.5643700706547558, - "grad_norm": 8.616740226745605, - "learning_rate": 3.367533192027317e-06, - "loss": 0.4448, - "mean_token_accuracy": 0.8549579024314881, - "num_tokens": 219050480.0, - "step": 182060 - }, - { - "entropy": 1.8430630937218666, - "epoch": 0.5644010697798054, - "grad_norm": 10.946331977844238, - "learning_rate": 3.367440711170249e-06, - "loss": 0.439, - "mean_token_accuracy": 0.8547526493668556, - "num_tokens": 219063246.0, - "step": 182070 - }, - { - "entropy": 1.82396737113595, - "epoch": 0.5644320689048552, - "grad_norm": 7.970212459564209, - "learning_rate": 3.367348237932027e-06, - "loss": 0.3988, - "mean_token_accuracy": 0.8534167259931564, - "num_tokens": 219075833.0, - "step": 182080 - }, - { - "entropy": 1.8939991384744643, - "epoch": 0.5644630680299049, - "grad_norm": 8.313651084899902, - "learning_rate": 3.367255772311606e-06, - "loss": 0.4177, - "mean_token_accuracy": 0.8648671418428421, - "num_tokens": 219087696.0, - "step": 182090 - }, - { - "entropy": 1.9231013625860214, - "epoch": 0.5644940671549545, - "grad_norm": 8.496840476989746, - "learning_rate": 3.3671633143079395e-06, - "loss": 0.4876, - "mean_token_accuracy": 0.8469110265374183, - "num_tokens": 219099058.0, - "step": 182100 - }, - { - "entropy": 1.8222051367163659, - "epoch": 0.5645250662800042, - "grad_norm": 4.980925559997559, - "learning_rate": 3.3670708639199813e-06, - "loss": 0.4101, - "mean_token_accuracy": 0.8577820032835006, - "num_tokens": 219110780.0, - "step": 182110 - }, - { - "entropy": 1.8365742474794389, - "epoch": 0.564556065405054, - "grad_norm": 8.180721282958984, - "learning_rate": 3.366978421146686e-06, - "loss": 0.4459, - "mean_token_accuracy": 0.8519570723176002, - "num_tokens": 219122821.0, - "step": 182120 - }, - { - "entropy": 1.914619317650795, - "epoch": 0.5645870645301037, - "grad_norm": 7.3940582275390625, - "learning_rate": 3.3668859859870096e-06, - "loss": 0.4661, - "mean_token_accuracy": 0.8543267771601677, - "num_tokens": 219134702.0, - "step": 182130 - }, - { - "entropy": 1.9344725817441941, - "epoch": 0.5646180636551533, - "grad_norm": 8.559662818908691, - "learning_rate": 3.366793558439905e-06, - "loss": 0.4758, - "mean_token_accuracy": 0.8544822856783867, - "num_tokens": 219145554.0, - "step": 182140 - }, - { - "entropy": 1.8622259184718133, - "epoch": 0.564649062780203, - "grad_norm": 8.896142959594727, - "learning_rate": 3.3667011385043298e-06, - "loss": 0.4579, - "mean_token_accuracy": 0.8583355069160461, - "num_tokens": 219157347.0, - "step": 182150 - }, - { - "entropy": 1.8627305820584297, - "epoch": 0.5646800619052528, - "grad_norm": 3.9189836978912354, - "learning_rate": 3.366608726179237e-06, - "loss": 0.4274, - "mean_token_accuracy": 0.8522779107093811, - "num_tokens": 219169536.0, - "step": 182160 - }, - { - "entropy": 1.907105678319931, - "epoch": 0.5647110610303024, - "grad_norm": 7.8243632316589355, - "learning_rate": 3.3665163214635838e-06, - "loss": 0.4709, - "mean_token_accuracy": 0.8565170675516128, - "num_tokens": 219180940.0, - "step": 182170 - }, - { - "entropy": 1.901413296163082, - "epoch": 0.5647420601553521, - "grad_norm": 6.818977355957031, - "learning_rate": 3.366423924356325e-06, - "loss": 0.4327, - "mean_token_accuracy": 0.8567003801465034, - "num_tokens": 219192893.0, - "step": 182180 - }, - { - "entropy": 1.8407423809170722, - "epoch": 0.5647730592804018, - "grad_norm": 8.929377555847168, - "learning_rate": 3.3663315348564173e-06, - "loss": 0.3894, - "mean_token_accuracy": 0.8660805970430374, - "num_tokens": 219205279.0, - "step": 182190 - }, - { - "entropy": 1.9222645550966262, - "epoch": 0.5648040584054516, - "grad_norm": 7.735604763031006, - "learning_rate": 3.366239152962816e-06, - "loss": 0.4425, - "mean_token_accuracy": 0.860330006480217, - "num_tokens": 219216624.0, - "step": 182200 - }, - { - "entropy": 1.9066086545586587, - "epoch": 0.5648350575305012, - "grad_norm": 9.061637878417969, - "learning_rate": 3.366146778674479e-06, - "loss": 0.4158, - "mean_token_accuracy": 0.859072470664978, - "num_tokens": 219228847.0, - "step": 182210 - }, - { - "entropy": 1.8881817117333413, - "epoch": 0.5648660566555509, - "grad_norm": 9.51163387298584, - "learning_rate": 3.366054411990361e-06, - "loss": 0.4759, - "mean_token_accuracy": 0.8498694509267807, - "num_tokens": 219240418.0, - "step": 182220 - }, - { - "entropy": 1.947641570866108, - "epoch": 0.5648970557806006, - "grad_norm": 9.015745162963867, - "learning_rate": 3.36596205290942e-06, - "loss": 0.4916, - "mean_token_accuracy": 0.8451044097542763, - "num_tokens": 219251188.0, - "step": 182230 - }, - { - "entropy": 1.8586992233991624, - "epoch": 0.5649280549056502, - "grad_norm": 9.301876068115234, - "learning_rate": 3.3658697014306124e-06, - "loss": 0.4404, - "mean_token_accuracy": 0.86016416400671, - "num_tokens": 219263759.0, - "step": 182240 - }, - { - "entropy": 1.769415383040905, - "epoch": 0.5649590540307, - "grad_norm": 8.25972843170166, - "learning_rate": 3.3657773575528956e-06, - "loss": 0.3909, - "mean_token_accuracy": 0.86795554459095, - "num_tokens": 219277713.0, - "step": 182250 - }, - { - "entropy": 1.909845282137394, - "epoch": 0.5649900531557497, - "grad_norm": 10.241829872131348, - "learning_rate": 3.3656850212752273e-06, - "loss": 0.4862, - "mean_token_accuracy": 0.8492648482322693, - "num_tokens": 219289088.0, - "step": 182260 - }, - { - "entropy": 1.8235872462391853, - "epoch": 0.5650210522807994, - "grad_norm": 8.383881568908691, - "learning_rate": 3.365592692596564e-06, - "loss": 0.4238, - "mean_token_accuracy": 0.8593395933508873, - "num_tokens": 219302014.0, - "step": 182270 - }, - { - "entropy": 1.8279115334153175, - "epoch": 0.565052051405849, - "grad_norm": 8.371310234069824, - "learning_rate": 3.3655003715158642e-06, - "loss": 0.348, - "mean_token_accuracy": 0.8745242550969123, - "num_tokens": 219314472.0, - "step": 182280 - }, - { - "entropy": 1.8149750515818597, - "epoch": 0.5650830505308988, - "grad_norm": 8.341879844665527, - "learning_rate": 3.365408058032086e-06, - "loss": 0.386, - "mean_token_accuracy": 0.8628831997513771, - "num_tokens": 219326935.0, - "step": 182290 - }, - { - "entropy": 1.97727922052145, - "epoch": 0.5651140496559485, - "grad_norm": 9.078901290893555, - "learning_rate": 3.3653157521441876e-06, - "loss": 0.5073, - "mean_token_accuracy": 0.837429566681385, - "num_tokens": 219338368.0, - "step": 182300 - }, - { - "entropy": 1.9396789371967316, - "epoch": 0.5651450487809981, - "grad_norm": 8.132356643676758, - "learning_rate": 3.3652234538511265e-06, - "loss": 0.4279, - "mean_token_accuracy": 0.8660401061177254, - "num_tokens": 219349114.0, - "step": 182310 - }, - { - "entropy": 1.862569797039032, - "epoch": 0.5651760479060478, - "grad_norm": 4.012666702270508, - "learning_rate": 3.3651311631518623e-06, - "loss": 0.4103, - "mean_token_accuracy": 0.8647153243422508, - "num_tokens": 219361509.0, - "step": 182320 - }, - { - "entropy": 1.8481622457504272, - "epoch": 0.5652070470310976, - "grad_norm": 4.161468505859375, - "learning_rate": 3.365038880045353e-06, - "loss": 0.4099, - "mean_token_accuracy": 0.8633993789553642, - "num_tokens": 219373790.0, - "step": 182330 - }, - { - "entropy": 1.8544276610016823, - "epoch": 0.5652380461561473, - "grad_norm": 9.143585205078125, - "learning_rate": 3.3649466045305584e-06, - "loss": 0.4366, - "mean_token_accuracy": 0.8522509098052978, - "num_tokens": 219385836.0, - "step": 182340 - }, - { - "entropy": 1.8401200011372567, - "epoch": 0.5652690452811969, - "grad_norm": 7.8143510818481445, - "learning_rate": 3.3648543366064367e-06, - "loss": 0.3762, - "mean_token_accuracy": 0.8740338370203972, - "num_tokens": 219398181.0, - "step": 182350 - }, - { - "entropy": 1.8663185223937035, - "epoch": 0.5653000444062466, - "grad_norm": 7.707705020904541, - "learning_rate": 3.364762076271948e-06, - "loss": 0.4953, - "mean_token_accuracy": 0.8531212002038956, - "num_tokens": 219410931.0, - "step": 182360 - }, - { - "entropy": 1.878646893799305, - "epoch": 0.5653310435312964, - "grad_norm": 8.273179054260254, - "learning_rate": 3.3646698235260516e-06, - "loss": 0.4457, - "mean_token_accuracy": 0.8549691379070282, - "num_tokens": 219422675.0, - "step": 182370 - }, - { - "entropy": 1.842753717303276, - "epoch": 0.565362042656346, - "grad_norm": 2.2988572120666504, - "learning_rate": 3.3645775783677077e-06, - "loss": 0.4344, - "mean_token_accuracy": 0.8559475839138031, - "num_tokens": 219434999.0, - "step": 182380 - }, - { - "entropy": 1.9365380138158799, - "epoch": 0.5653930417813957, - "grad_norm": 7.147967338562012, - "learning_rate": 3.364485340795875e-06, - "loss": 0.4765, - "mean_token_accuracy": 0.8467515155673027, - "num_tokens": 219445954.0, - "step": 182390 - }, - { - "entropy": 1.8891595363616944, - "epoch": 0.5654240409064454, - "grad_norm": 7.55145788192749, - "learning_rate": 3.3643931108095146e-06, - "loss": 0.4173, - "mean_token_accuracy": 0.8647678464651107, - "num_tokens": 219458306.0, - "step": 182400 - }, - { - "entropy": 1.8514413997530936, - "epoch": 0.5654550400314952, - "grad_norm": 5.811746120452881, - "learning_rate": 3.364300888407587e-06, - "loss": 0.4501, - "mean_token_accuracy": 0.8549320101737976, - "num_tokens": 219470617.0, - "step": 182410 - }, - { - "entropy": 1.8094148866832256, - "epoch": 0.5654860391565448, - "grad_norm": 2.7894392013549805, - "learning_rate": 3.364208673589053e-06, - "loss": 0.3814, - "mean_token_accuracy": 0.8556830063462257, - "num_tokens": 219484050.0, - "step": 182420 - }, - { - "entropy": 1.859315599501133, - "epoch": 0.5655170382815945, - "grad_norm": 8.049720764160156, - "learning_rate": 3.3641164663528717e-06, - "loss": 0.4152, - "mean_token_accuracy": 0.8648128852248191, - "num_tokens": 219496016.0, - "step": 182430 - }, - { - "entropy": 1.8675218299031258, - "epoch": 0.5655480374066442, - "grad_norm": 10.070209503173828, - "learning_rate": 3.364024266698006e-06, - "loss": 0.4903, - "mean_token_accuracy": 0.8463949292898179, - "num_tokens": 219508002.0, - "step": 182440 - }, - { - "entropy": 1.871927236020565, - "epoch": 0.565579036531694, - "grad_norm": 8.271561622619629, - "learning_rate": 3.3639320746234164e-06, - "loss": 0.4382, - "mean_token_accuracy": 0.8576414436101913, - "num_tokens": 219519654.0, - "step": 182450 - }, - { - "entropy": 1.8629664570093154, - "epoch": 0.5656100356567436, - "grad_norm": 9.942543983459473, - "learning_rate": 3.3638398901280646e-06, - "loss": 0.436, - "mean_token_accuracy": 0.8536071136593819, - "num_tokens": 219532168.0, - "step": 182460 - }, - { - "entropy": 1.9039793327450751, - "epoch": 0.5656410347817933, - "grad_norm": 9.30384635925293, - "learning_rate": 3.363747713210911e-06, - "loss": 0.4506, - "mean_token_accuracy": 0.8593076154589653, - "num_tokens": 219543244.0, - "step": 182470 - }, - { - "entropy": 1.896394467353821, - "epoch": 0.565672033906843, - "grad_norm": 10.03402328491211, - "learning_rate": 3.363655543870918e-06, - "loss": 0.4525, - "mean_token_accuracy": 0.8478322148323059, - "num_tokens": 219555030.0, - "step": 182480 - }, - { - "entropy": 1.8448296919465066, - "epoch": 0.5657030330318926, - "grad_norm": 8.039837837219238, - "learning_rate": 3.3635633821070474e-06, - "loss": 0.3759, - "mean_token_accuracy": 0.8729675441980362, - "num_tokens": 219567463.0, - "step": 182490 - }, - { - "entropy": 1.7930359467864037, - "epoch": 0.5657340321569424, - "grad_norm": 8.818025588989258, - "learning_rate": 3.363471227918261e-06, - "loss": 0.4062, - "mean_token_accuracy": 0.870102445781231, - "num_tokens": 219580320.0, - "step": 182500 - }, - { - "entropy": 1.7787843875586986, - "epoch": 0.5657650312819921, - "grad_norm": 6.676950931549072, - "learning_rate": 3.3633790813035238e-06, - "loss": 0.3836, - "mean_token_accuracy": 0.8641812324523925, - "num_tokens": 219593450.0, - "step": 182510 - }, - { - "entropy": 1.8252146616578102, - "epoch": 0.5657960304070417, - "grad_norm": 8.978650093078613, - "learning_rate": 3.363286942261795e-06, - "loss": 0.3925, - "mean_token_accuracy": 0.8654040232300758, - "num_tokens": 219605533.0, - "step": 182520 - }, - { - "entropy": 1.914176408946514, - "epoch": 0.5658270295320914, - "grad_norm": 10.18586254119873, - "learning_rate": 3.3631948107920388e-06, - "loss": 0.4876, - "mean_token_accuracy": 0.852830545604229, - "num_tokens": 219616787.0, - "step": 182530 - }, - { - "entropy": 1.8280531086027623, - "epoch": 0.5658580286571412, - "grad_norm": 3.613919973373413, - "learning_rate": 3.3631026868932177e-06, - "loss": 0.4088, - "mean_token_accuracy": 0.8650343582034111, - "num_tokens": 219630185.0, - "step": 182540 - }, - { - "entropy": 1.7496952429413795, - "epoch": 0.5658890277821909, - "grad_norm": 9.850088119506836, - "learning_rate": 3.3630105705642953e-06, - "loss": 0.3499, - "mean_token_accuracy": 0.8619767040014267, - "num_tokens": 219643599.0, - "step": 182550 - }, - { - "entropy": 1.8812230035662652, - "epoch": 0.5659200269072405, - "grad_norm": 7.521579265594482, - "learning_rate": 3.3629184618042353e-06, - "loss": 0.4028, - "mean_token_accuracy": 0.8584534049034118, - "num_tokens": 219655194.0, - "step": 182560 - }, - { - "entropy": 1.861245647072792, - "epoch": 0.5659510260322902, - "grad_norm": 7.2412614822387695, - "learning_rate": 3.3628263606119996e-06, - "loss": 0.4032, - "mean_token_accuracy": 0.8658636063337326, - "num_tokens": 219668106.0, - "step": 182570 - }, - { - "entropy": 1.7555628687143325, - "epoch": 0.56598202515734, - "grad_norm": 8.427504539489746, - "learning_rate": 3.3627342669865537e-06, - "loss": 0.3867, - "mean_token_accuracy": 0.8658777117729187, - "num_tokens": 219681746.0, - "step": 182580 - }, - { - "entropy": 1.925270189344883, - "epoch": 0.5660130242823896, - "grad_norm": 7.512916088104248, - "learning_rate": 3.362642180926861e-06, - "loss": 0.4523, - "mean_token_accuracy": 0.8513542354106903, - "num_tokens": 219692977.0, - "step": 182590 - }, - { - "entropy": 1.7738431617617607, - "epoch": 0.5660440234074393, - "grad_norm": 8.270086288452148, - "learning_rate": 3.3625501024318863e-06, - "loss": 0.4018, - "mean_token_accuracy": 0.8591394439339638, - "num_tokens": 219705620.0, - "step": 182600 - }, - { - "entropy": 1.8614506632089616, - "epoch": 0.566075022532489, - "grad_norm": 8.078804016113281, - "learning_rate": 3.3624580315005917e-06, - "loss": 0.3953, - "mean_token_accuracy": 0.8608483463525772, - "num_tokens": 219718256.0, - "step": 182610 - }, - { - "entropy": 1.9512107968330383, - "epoch": 0.5661060216575388, - "grad_norm": 7.991335391998291, - "learning_rate": 3.3623659681319444e-06, - "loss": 0.4568, - "mean_token_accuracy": 0.8626005321741104, - "num_tokens": 219729201.0, - "step": 182620 - }, - { - "entropy": 1.8471109196543694, - "epoch": 0.5661370207825884, - "grad_norm": 7.598386287689209, - "learning_rate": 3.3622739123249072e-06, - "loss": 0.4344, - "mean_token_accuracy": 0.861255344748497, - "num_tokens": 219741656.0, - "step": 182630 - }, - { - "entropy": 1.888534700870514, - "epoch": 0.5661680199076381, - "grad_norm": 7.569242000579834, - "learning_rate": 3.3621818640784463e-06, - "loss": 0.4501, - "mean_token_accuracy": 0.8554926365613937, - "num_tokens": 219754235.0, - "step": 182640 - }, - { - "entropy": 1.8320038333535194, - "epoch": 0.5661990190326878, - "grad_norm": 7.632225036621094, - "learning_rate": 3.3620898233915266e-06, - "loss": 0.4326, - "mean_token_accuracy": 0.8643024682998657, - "num_tokens": 219766919.0, - "step": 182650 - }, - { - "entropy": 1.8487570151686668, - "epoch": 0.5662300181577375, - "grad_norm": 6.813345432281494, - "learning_rate": 3.3619977902631126e-06, - "loss": 0.3771, - "mean_token_accuracy": 0.8686630889773369, - "num_tokens": 219779203.0, - "step": 182660 - }, - { - "entropy": 1.935315978527069, - "epoch": 0.5662610172827872, - "grad_norm": 8.292950630187988, - "learning_rate": 3.361905764692171e-06, - "loss": 0.4412, - "mean_token_accuracy": 0.8577398896217346, - "num_tokens": 219790189.0, - "step": 182670 - }, - { - "entropy": 1.9341531276702881, - "epoch": 0.5662920164078369, - "grad_norm": 8.29041862487793, - "learning_rate": 3.3618137466776664e-06, - "loss": 0.4463, - "mean_token_accuracy": 0.8639525189995766, - "num_tokens": 219801446.0, - "step": 182680 - }, - { - "entropy": 1.8314605563879014, - "epoch": 0.5663230155328866, - "grad_norm": 3.608710765838623, - "learning_rate": 3.3617217362185654e-06, - "loss": 0.4269, - "mean_token_accuracy": 0.8570237934589386, - "num_tokens": 219813951.0, - "step": 182690 - }, - { - "entropy": 1.8585295498371124, - "epoch": 0.5663540146579363, - "grad_norm": 8.209573745727539, - "learning_rate": 3.361629733313834e-06, - "loss": 0.4293, - "mean_token_accuracy": 0.8623125806450844, - "num_tokens": 219825535.0, - "step": 182700 - }, - { - "entropy": 1.867700320482254, - "epoch": 0.566385013782986, - "grad_norm": 7.110729217529297, - "learning_rate": 3.3615377379624386e-06, - "loss": 0.4966, - "mean_token_accuracy": 0.8486958160996437, - "num_tokens": 219838008.0, - "step": 182710 - }, - { - "entropy": 1.7865567639470101, - "epoch": 0.5664160129080357, - "grad_norm": 4.511937141418457, - "learning_rate": 3.361445750163346e-06, - "loss": 0.3865, - "mean_token_accuracy": 0.8617598488926888, - "num_tokens": 219850952.0, - "step": 182720 - }, - { - "entropy": 1.9098443865776062, - "epoch": 0.5664470120330853, - "grad_norm": 6.9264984130859375, - "learning_rate": 3.3613537699155224e-06, - "loss": 0.4365, - "mean_token_accuracy": 0.8570198655128479, - "num_tokens": 219862258.0, - "step": 182730 - }, - { - "entropy": 1.8792338341474533, - "epoch": 0.566478011158135, - "grad_norm": 8.768360137939453, - "learning_rate": 3.3612617972179345e-06, - "loss": 0.4238, - "mean_token_accuracy": 0.8603082686662674, - "num_tokens": 219874000.0, - "step": 182740 - }, - { - "entropy": 1.8674265652894975, - "epoch": 0.5665090102831848, - "grad_norm": 4.6239471435546875, - "learning_rate": 3.3611698320695502e-06, - "loss": 0.4112, - "mean_token_accuracy": 0.866169311106205, - "num_tokens": 219885856.0, - "step": 182750 - }, - { - "entropy": 1.8426985383033752, - "epoch": 0.5665400094082345, - "grad_norm": 7.317948818206787, - "learning_rate": 3.361077874469336e-06, - "loss": 0.4306, - "mean_token_accuracy": 0.8583661854267121, - "num_tokens": 219898930.0, - "step": 182760 - }, - { - "entropy": 1.906522662937641, - "epoch": 0.5665710085332841, - "grad_norm": 7.433799743652344, - "learning_rate": 3.3609859244162606e-06, - "loss": 0.4428, - "mean_token_accuracy": 0.8607842057943345, - "num_tokens": 219909785.0, - "step": 182770 - }, - { - "entropy": 1.858467762172222, - "epoch": 0.5666020076583338, - "grad_norm": 2.7514004707336426, - "learning_rate": 3.3608939819092902e-06, - "loss": 0.4304, - "mean_token_accuracy": 0.8575167447328568, - "num_tokens": 219921781.0, - "step": 182780 - }, - { - "entropy": 1.838193103671074, - "epoch": 0.5666330067833836, - "grad_norm": 9.16029167175293, - "learning_rate": 3.360802046947394e-06, - "loss": 0.4182, - "mean_token_accuracy": 0.8564060479402542, - "num_tokens": 219934047.0, - "step": 182790 - }, - { - "entropy": 1.8478391572833062, - "epoch": 0.5666640059084332, - "grad_norm": 3.9323177337646484, - "learning_rate": 3.360710119529539e-06, - "loss": 0.416, - "mean_token_accuracy": 0.8672307848930358, - "num_tokens": 219946952.0, - "step": 182800 - }, - { - "entropy": 1.8906191244721413, - "epoch": 0.5666950050334829, - "grad_norm": 7.394486904144287, - "learning_rate": 3.3606181996546943e-06, - "loss": 0.4671, - "mean_token_accuracy": 0.8540799006819725, - "num_tokens": 219958787.0, - "step": 182810 - }, - { - "entropy": 1.8989437848329545, - "epoch": 0.5667260041585326, - "grad_norm": 8.674349784851074, - "learning_rate": 3.3605262873218285e-06, - "loss": 0.4569, - "mean_token_accuracy": 0.8600521177053452, - "num_tokens": 219970417.0, - "step": 182820 - }, - { - "entropy": 1.8586945995688438, - "epoch": 0.5667570032835824, - "grad_norm": 3.3568546772003174, - "learning_rate": 3.3604343825299098e-06, - "loss": 0.4322, - "mean_token_accuracy": 0.8593952640891075, - "num_tokens": 219982749.0, - "step": 182830 - }, - { - "entropy": 1.880942103266716, - "epoch": 0.566788002408632, - "grad_norm": 4.652389049530029, - "learning_rate": 3.360342485277907e-06, - "loss": 0.4517, - "mean_token_accuracy": 0.8370317235589028, - "num_tokens": 219995175.0, - "step": 182840 - }, - { - "entropy": 1.9068979054689408, - "epoch": 0.5668190015336817, - "grad_norm": 8.30090045928955, - "learning_rate": 3.360250595564789e-06, - "loss": 0.4531, - "mean_token_accuracy": 0.8553275540471077, - "num_tokens": 220006219.0, - "step": 182850 - }, - { - "entropy": 1.9159462600946426, - "epoch": 0.5668500006587314, - "grad_norm": 8.318293571472168, - "learning_rate": 3.3601587133895264e-06, - "loss": 0.4175, - "mean_token_accuracy": 0.8619872912764549, - "num_tokens": 220018022.0, - "step": 182860 - }, - { - "entropy": 1.9070228204131126, - "epoch": 0.5668809997837811, - "grad_norm": 8.146712303161621, - "learning_rate": 3.360066838751088e-06, - "loss": 0.4446, - "mean_token_accuracy": 0.852077630162239, - "num_tokens": 220030045.0, - "step": 182870 - }, - { - "entropy": 1.8928291469812393, - "epoch": 0.5669119989088308, - "grad_norm": 8.228689193725586, - "learning_rate": 3.359974971648442e-06, - "loss": 0.4269, - "mean_token_accuracy": 0.8475547671318054, - "num_tokens": 220042402.0, - "step": 182880 - }, - { - "entropy": 1.9087701261043548, - "epoch": 0.5669429980338805, - "grad_norm": 4.87774658203125, - "learning_rate": 3.359883112080561e-06, - "loss": 0.4429, - "mean_token_accuracy": 0.8584561154246331, - "num_tokens": 220054088.0, - "step": 182890 - }, - { - "entropy": 1.9589183256030083, - "epoch": 0.5669739971589302, - "grad_norm": 8.75279426574707, - "learning_rate": 3.359791260046413e-06, - "loss": 0.4998, - "mean_token_accuracy": 0.8392207562923432, - "num_tokens": 220065452.0, - "step": 182900 - }, - { - "entropy": 1.8772664383053779, - "epoch": 0.5670049962839799, - "grad_norm": 7.904957294464111, - "learning_rate": 3.3596994155449686e-06, - "loss": 0.4242, - "mean_token_accuracy": 0.8565537855029106, - "num_tokens": 220078156.0, - "step": 182910 - }, - { - "entropy": 1.904349359869957, - "epoch": 0.5670359954090296, - "grad_norm": 7.1998491287231445, - "learning_rate": 3.359607578575199e-06, - "loss": 0.4379, - "mean_token_accuracy": 0.8597839877009392, - "num_tokens": 220089673.0, - "step": 182920 - }, - { - "entropy": 1.9074815943837167, - "epoch": 0.5670669945340793, - "grad_norm": 9.475333213806152, - "learning_rate": 3.3595157491360746e-06, - "loss": 0.4209, - "mean_token_accuracy": 0.864620278775692, - "num_tokens": 220101476.0, - "step": 182930 - }, - { - "entropy": 1.909410683810711, - "epoch": 0.5670979936591289, - "grad_norm": 3.6434812545776367, - "learning_rate": 3.3594239272265657e-06, - "loss": 0.46, - "mean_token_accuracy": 0.8497651174664498, - "num_tokens": 220112593.0, - "step": 182940 - }, - { - "entropy": 1.8805428713560104, - "epoch": 0.5671289927841787, - "grad_norm": 4.920924663543701, - "learning_rate": 3.359332112845644e-06, - "loss": 0.4708, - "mean_token_accuracy": 0.8507418289780617, - "num_tokens": 220124868.0, - "step": 182950 - }, - { - "entropy": 1.7920621424913405, - "epoch": 0.5671599919092284, - "grad_norm": 7.2576069831848145, - "learning_rate": 3.359240305992281e-06, - "loss": 0.3795, - "mean_token_accuracy": 0.8677036970853805, - "num_tokens": 220138016.0, - "step": 182960 - }, - { - "entropy": 1.9227833956480027, - "epoch": 0.5671909910342781, - "grad_norm": 6.757200241088867, - "learning_rate": 3.3591485066654474e-06, - "loss": 0.4196, - "mean_token_accuracy": 0.8580241650342941, - "num_tokens": 220149184.0, - "step": 182970 - }, - { - "entropy": 1.9060191959142685, - "epoch": 0.5672219901593277, - "grad_norm": 8.616246223449707, - "learning_rate": 3.3590567148641155e-06, - "loss": 0.4649, - "mean_token_accuracy": 0.8536302790045738, - "num_tokens": 220161347.0, - "step": 182980 - }, - { - "entropy": 1.849088190495968, - "epoch": 0.5672529892843774, - "grad_norm": 3.9549193382263184, - "learning_rate": 3.3589649305872564e-06, - "loss": 0.4058, - "mean_token_accuracy": 0.860573235154152, - "num_tokens": 220173832.0, - "step": 182990 - }, - { - "entropy": 1.9189164862036705, - "epoch": 0.5672839884094272, - "grad_norm": 8.847037315368652, - "learning_rate": 3.3588731538338426e-06, - "loss": 0.4376, - "mean_token_accuracy": 0.8600964099168777, - "num_tokens": 220185558.0, - "step": 183000 - }, - { - "entropy": 1.908226054906845, - "epoch": 0.5673149875344768, - "grad_norm": 7.739184379577637, - "learning_rate": 3.3587813846028462e-06, - "loss": 0.4298, - "mean_token_accuracy": 0.8561240792274475, - "num_tokens": 220197401.0, - "step": 183010 - }, - { - "entropy": 1.9531918078660966, - "epoch": 0.5673459866595265, - "grad_norm": 9.055732727050781, - "learning_rate": 3.35868962289324e-06, - "loss": 0.4726, - "mean_token_accuracy": 0.8516348704695702, - "num_tokens": 220208127.0, - "step": 183020 - }, - { - "entropy": 1.931714966893196, - "epoch": 0.5673769857845762, - "grad_norm": 8.877476692199707, - "learning_rate": 3.3585978687039964e-06, - "loss": 0.451, - "mean_token_accuracy": 0.8567020818591118, - "num_tokens": 220218719.0, - "step": 183030 - }, - { - "entropy": 1.8644310608506203, - "epoch": 0.567407984909626, - "grad_norm": 10.3399019241333, - "learning_rate": 3.358506122034088e-06, - "loss": 0.3853, - "mean_token_accuracy": 0.8704071551561355, - "num_tokens": 220230842.0, - "step": 183040 - }, - { - "entropy": 1.9125615239143372, - "epoch": 0.5674389840346756, - "grad_norm": 7.946232795715332, - "learning_rate": 3.358414382882489e-06, - "loss": 0.4469, - "mean_token_accuracy": 0.852666375041008, - "num_tokens": 220242813.0, - "step": 183050 - }, - { - "entropy": 1.9153546869754792, - "epoch": 0.5674699831597253, - "grad_norm": 3.3038265705108643, - "learning_rate": 3.3583226512481703e-06, - "loss": 0.4751, - "mean_token_accuracy": 0.8458715483546257, - "num_tokens": 220254735.0, - "step": 183060 - }, - { - "entropy": 1.884465442597866, - "epoch": 0.567500982284775, - "grad_norm": 7.026264667510986, - "learning_rate": 3.3582309271301076e-06, - "loss": 0.4013, - "mean_token_accuracy": 0.860778984427452, - "num_tokens": 220266193.0, - "step": 183070 - }, - { - "entropy": 1.975827944278717, - "epoch": 0.5675319814098247, - "grad_norm": 8.726240158081055, - "learning_rate": 3.3581392105272736e-06, - "loss": 0.5006, - "mean_token_accuracy": 0.8460676193237304, - "num_tokens": 220276810.0, - "step": 183080 - }, - { - "entropy": 1.8663788139820099, - "epoch": 0.5675629805348744, - "grad_norm": 9.79867172241211, - "learning_rate": 3.3580475014386416e-06, - "loss": 0.3921, - "mean_token_accuracy": 0.8591655671596528, - "num_tokens": 220289125.0, - "step": 183090 - }, - { - "entropy": 1.9003127381205558, - "epoch": 0.5675939796599241, - "grad_norm": 3.7742018699645996, - "learning_rate": 3.357955799863186e-06, - "loss": 0.466, - "mean_token_accuracy": 0.860464958846569, - "num_tokens": 220300198.0, - "step": 183100 - }, - { - "entropy": 1.891619788110256, - "epoch": 0.5676249787849738, - "grad_norm": 8.731237411499023, - "learning_rate": 3.3578641057998818e-06, - "loss": 0.4292, - "mean_token_accuracy": 0.8640395820140838, - "num_tokens": 220311070.0, - "step": 183110 - }, - { - "entropy": 1.9159280955791473, - "epoch": 0.5676559779100235, - "grad_norm": 8.504816055297852, - "learning_rate": 3.3577724192477028e-06, - "loss": 0.4635, - "mean_token_accuracy": 0.8534823596477509, - "num_tokens": 220322812.0, - "step": 183120 - }, - { - "entropy": 1.9482707679271698, - "epoch": 0.5676869770350732, - "grad_norm": 9.165209770202637, - "learning_rate": 3.3576807402056232e-06, - "loss": 0.4358, - "mean_token_accuracy": 0.8688961327075958, - "num_tokens": 220333449.0, - "step": 183130 - }, - { - "entropy": 1.874471677839756, - "epoch": 0.5677179761601229, - "grad_norm": 9.139283180236816, - "learning_rate": 3.3575890686726183e-06, - "loss": 0.4151, - "mean_token_accuracy": 0.8565481066703796, - "num_tokens": 220345253.0, - "step": 183140 - }, - { - "entropy": 1.8991287276148796, - "epoch": 0.5677489752851725, - "grad_norm": 8.980961799621582, - "learning_rate": 3.3574974046476634e-06, - "loss": 0.4268, - "mean_token_accuracy": 0.8628603085875511, - "num_tokens": 220356474.0, - "step": 183150 - }, - { - "entropy": 1.8723631590604781, - "epoch": 0.5677799744102223, - "grad_norm": 3.9973182678222656, - "learning_rate": 3.3574057481297323e-06, - "loss": 0.4319, - "mean_token_accuracy": 0.8480863317847251, - "num_tokens": 220368857.0, - "step": 183160 - }, - { - "entropy": 1.7563340038061142, - "epoch": 0.567810973535272, - "grad_norm": 8.3080472946167, - "learning_rate": 3.3573140991178016e-06, - "loss": 0.3625, - "mean_token_accuracy": 0.8686195388436317, - "num_tokens": 220382271.0, - "step": 183170 - }, - { - "entropy": 1.9225044384598733, - "epoch": 0.5678419726603217, - "grad_norm": 8.145635604858398, - "learning_rate": 3.3572224576108474e-06, - "loss": 0.4519, - "mean_token_accuracy": 0.8540999129414558, - "num_tokens": 220394223.0, - "step": 183180 - }, - { - "entropy": 1.8276883363723755, - "epoch": 0.5678729717853713, - "grad_norm": 7.337801456451416, - "learning_rate": 3.3571308236078437e-06, - "loss": 0.3733, - "mean_token_accuracy": 0.8690663799643517, - "num_tokens": 220407171.0, - "step": 183190 - }, - { - "entropy": 1.9450956612825394, - "epoch": 0.5679039709104211, - "grad_norm": 8.292513847351074, - "learning_rate": 3.3570391971077676e-06, - "loss": 0.5, - "mean_token_accuracy": 0.8465719684958458, - "num_tokens": 220418012.0, - "step": 183200 - }, - { - "entropy": 1.8319994553923606, - "epoch": 0.5679349700354708, - "grad_norm": 8.656058311462402, - "learning_rate": 3.3569475781095955e-06, - "loss": 0.4086, - "mean_token_accuracy": 0.861232291162014, - "num_tokens": 220431041.0, - "step": 183210 - }, - { - "entropy": 1.8853465974330903, - "epoch": 0.5679659691605204, - "grad_norm": 8.341654777526855, - "learning_rate": 3.356855966612303e-06, - "loss": 0.4425, - "mean_token_accuracy": 0.8643447190523148, - "num_tokens": 220442375.0, - "step": 183220 - }, - { - "entropy": 1.9103177651762961, - "epoch": 0.5679969682855701, - "grad_norm": 9.43069076538086, - "learning_rate": 3.3567643626148666e-06, - "loss": 0.43, - "mean_token_accuracy": 0.8563966482877732, - "num_tokens": 220454284.0, - "step": 183230 - }, - { - "entropy": 1.9459126323461533, - "epoch": 0.5680279674106198, - "grad_norm": 9.42568302154541, - "learning_rate": 3.3566727661162647e-06, - "loss": 0.4701, - "mean_token_accuracy": 0.8557520717382431, - "num_tokens": 220466716.0, - "step": 183240 - }, - { - "entropy": 1.9446142554283141, - "epoch": 0.5680589665356696, - "grad_norm": 7.449951648712158, - "learning_rate": 3.3565811771154718e-06, - "loss": 0.5724, - "mean_token_accuracy": 0.8319184750318527, - "num_tokens": 220478467.0, - "step": 183250 - }, - { - "entropy": 1.8678555905818939, - "epoch": 0.5680899656607192, - "grad_norm": 9.808244705200195, - "learning_rate": 3.3564895956114668e-06, - "loss": 0.4239, - "mean_token_accuracy": 0.8574158445000648, - "num_tokens": 220490488.0, - "step": 183260 - }, - { - "entropy": 1.8901596933603286, - "epoch": 0.5681209647857689, - "grad_norm": 8.877074241638184, - "learning_rate": 3.3563980216032265e-06, - "loss": 0.4224, - "mean_token_accuracy": 0.8529431104660035, - "num_tokens": 220502198.0, - "step": 183270 - }, - { - "entropy": 1.8711446061730386, - "epoch": 0.5681519639108186, - "grad_norm": 6.931746482849121, - "learning_rate": 3.3563064550897285e-06, - "loss": 0.4328, - "mean_token_accuracy": 0.8635139778256417, - "num_tokens": 220513840.0, - "step": 183280 - }, - { - "entropy": 1.7962354853749276, - "epoch": 0.5681829630358683, - "grad_norm": 7.381433963775635, - "learning_rate": 3.356214896069951e-06, - "loss": 0.3305, - "mean_token_accuracy": 0.8718640059232712, - "num_tokens": 220527216.0, - "step": 183290 - }, - { - "entropy": 1.89014650657773, - "epoch": 0.568213962160918, - "grad_norm": 10.772461891174316, - "learning_rate": 3.3561233445428705e-06, - "loss": 0.4507, - "mean_token_accuracy": 0.8503888994455338, - "num_tokens": 220539151.0, - "step": 183300 - }, - { - "entropy": 1.925488579273224, - "epoch": 0.5682449612859677, - "grad_norm": 8.212474822998047, - "learning_rate": 3.3560318005074654e-06, - "loss": 0.4384, - "mean_token_accuracy": 0.8614701345562935, - "num_tokens": 220550011.0, - "step": 183310 - }, - { - "entropy": 1.9450786262750626, - "epoch": 0.5682759604110174, - "grad_norm": 8.625468254089355, - "learning_rate": 3.355940263962716e-06, - "loss": 0.4781, - "mean_token_accuracy": 0.8568323805928231, - "num_tokens": 220560733.0, - "step": 183320 - }, - { - "entropy": 1.8635306119918824, - "epoch": 0.5683069595360671, - "grad_norm": 3.6268012523651123, - "learning_rate": 3.3558487349075984e-06, - "loss": 0.4041, - "mean_token_accuracy": 0.8642154484987259, - "num_tokens": 220573023.0, - "step": 183330 - }, - { - "entropy": 1.8082802429795266, - "epoch": 0.5683379586611168, - "grad_norm": 2.5699939727783203, - "learning_rate": 3.355757213341093e-06, - "loss": 0.3472, - "mean_token_accuracy": 0.8663254842162132, - "num_tokens": 220586830.0, - "step": 183340 - }, - { - "entropy": 1.9095750093460082, - "epoch": 0.5683689577861665, - "grad_norm": 8.66707706451416, - "learning_rate": 3.355665699262178e-06, - "loss": 0.4464, - "mean_token_accuracy": 0.8577072054147721, - "num_tokens": 220599167.0, - "step": 183350 - }, - { - "entropy": 1.8929280027747155, - "epoch": 0.5683999569112161, - "grad_norm": 4.26317834854126, - "learning_rate": 3.355574192669832e-06, - "loss": 0.4378, - "mean_token_accuracy": 0.853906175494194, - "num_tokens": 220611085.0, - "step": 183360 - }, - { - "entropy": 1.968779844045639, - "epoch": 0.5684309560362659, - "grad_norm": 7.8538007736206055, - "learning_rate": 3.355482693563035e-06, - "loss": 0.5, - "mean_token_accuracy": 0.8414223611354827, - "num_tokens": 220621870.0, - "step": 183370 - }, - { - "entropy": 1.8689953058958053, - "epoch": 0.5684619551613156, - "grad_norm": 7.304516315460205, - "learning_rate": 3.3553912019407663e-06, - "loss": 0.4096, - "mean_token_accuracy": 0.8525772273540497, - "num_tokens": 220634736.0, - "step": 183380 - }, - { - "entropy": 1.9347944945096969, - "epoch": 0.5684929542863653, - "grad_norm": 9.088506698608398, - "learning_rate": 3.3552997178020064e-06, - "loss": 0.4833, - "mean_token_accuracy": 0.8514841318130493, - "num_tokens": 220646056.0, - "step": 183390 - }, - { - "entropy": 1.9211686462163926, - "epoch": 0.5685239534114149, - "grad_norm": 8.135913848876953, - "learning_rate": 3.355208241145733e-06, - "loss": 0.441, - "mean_token_accuracy": 0.8514508843421936, - "num_tokens": 220658047.0, - "step": 183400 - }, - { - "entropy": 1.908077448606491, - "epoch": 0.5685549525364647, - "grad_norm": 3.7041707038879395, - "learning_rate": 3.3551167719709283e-06, - "loss": 0.4337, - "mean_token_accuracy": 0.8491874933242798, - "num_tokens": 220670530.0, - "step": 183410 - }, - { - "entropy": 1.8245035156607627, - "epoch": 0.5685859516615144, - "grad_norm": 8.679533958435059, - "learning_rate": 3.3550253102765717e-06, - "loss": 0.3993, - "mean_token_accuracy": 0.8612040519714356, - "num_tokens": 220682946.0, - "step": 183420 - }, - { - "entropy": 1.9670052647590637, - "epoch": 0.568616950786564, - "grad_norm": 8.43134593963623, - "learning_rate": 3.3549338560616435e-06, - "loss": 0.49, - "mean_token_accuracy": 0.8479528352618217, - "num_tokens": 220694045.0, - "step": 183430 - }, - { - "entropy": 1.7410773530602455, - "epoch": 0.5686479499116137, - "grad_norm": 7.65750789642334, - "learning_rate": 3.354842409325125e-06, - "loss": 0.3453, - "mean_token_accuracy": 0.8736346036195755, - "num_tokens": 220707673.0, - "step": 183440 - }, - { - "entropy": 1.852739727497101, - "epoch": 0.5686789490366635, - "grad_norm": 7.290695667266846, - "learning_rate": 3.3547509700659964e-06, - "loss": 0.4512, - "mean_token_accuracy": 0.8603045761585235, - "num_tokens": 220720156.0, - "step": 183450 - }, - { - "entropy": 1.8762595668435096, - "epoch": 0.5687099481617132, - "grad_norm": 8.566794395446777, - "learning_rate": 3.3546595382832387e-06, - "loss": 0.4302, - "mean_token_accuracy": 0.8579180598258972, - "num_tokens": 220732483.0, - "step": 183460 - }, - { - "entropy": 1.9345880046486854, - "epoch": 0.5687409472867628, - "grad_norm": 9.06840991973877, - "learning_rate": 3.354568113975834e-06, - "loss": 0.4413, - "mean_token_accuracy": 0.8575801998376846, - "num_tokens": 220743293.0, - "step": 183470 - }, - { - "entropy": 1.9276087015867234, - "epoch": 0.5687719464118125, - "grad_norm": 7.404775619506836, - "learning_rate": 3.354476697142762e-06, - "loss": 0.4017, - "mean_token_accuracy": 0.8662512883543968, - "num_tokens": 220754912.0, - "step": 183480 - }, - { - "entropy": 1.9424118250608444, - "epoch": 0.5688029455368622, - "grad_norm": 6.11152982711792, - "learning_rate": 3.3543852877830067e-06, - "loss": 0.4545, - "mean_token_accuracy": 0.8560282364487648, - "num_tokens": 220766123.0, - "step": 183490 - }, - { - "entropy": 1.8890651270747185, - "epoch": 0.568833944661912, - "grad_norm": 8.704337120056152, - "learning_rate": 3.354293885895549e-06, - "loss": 0.4835, - "mean_token_accuracy": 0.855229164659977, - "num_tokens": 220778032.0, - "step": 183500 - }, - { - "entropy": 1.7918873026967048, - "epoch": 0.5688649437869616, - "grad_norm": 5.357420444488525, - "learning_rate": 3.3542024914793692e-06, - "loss": 0.3684, - "mean_token_accuracy": 0.8629763871431351, - "num_tokens": 220791558.0, - "step": 183510 - }, - { - "entropy": 1.8693130612373352, - "epoch": 0.5688959429120113, - "grad_norm": 3.724578380584717, - "learning_rate": 3.3541111045334514e-06, - "loss": 0.3959, - "mean_token_accuracy": 0.8646483674645424, - "num_tokens": 220803255.0, - "step": 183520 - }, - { - "entropy": 1.8977489918470383, - "epoch": 0.568926942037061, - "grad_norm": 6.983435153961182, - "learning_rate": 3.3540197250567773e-06, - "loss": 0.4671, - "mean_token_accuracy": 0.8517172873020172, - "num_tokens": 220814834.0, - "step": 183530 - }, - { - "entropy": 1.9508653551340103, - "epoch": 0.5689579411621107, - "grad_norm": 7.906034469604492, - "learning_rate": 3.35392835304833e-06, - "loss": 0.473, - "mean_token_accuracy": 0.8508275121450424, - "num_tokens": 220825869.0, - "step": 183540 - }, - { - "entropy": 1.8913847014307976, - "epoch": 0.5689889402871604, - "grad_norm": 3.185633659362793, - "learning_rate": 3.3538369885070923e-06, - "loss": 0.3913, - "mean_token_accuracy": 0.8732113286852836, - "num_tokens": 220837783.0, - "step": 183550 - }, - { - "entropy": 1.914082932472229, - "epoch": 0.5690199394122101, - "grad_norm": 6.322134494781494, - "learning_rate": 3.3537456314320467e-06, - "loss": 0.4325, - "mean_token_accuracy": 0.8633823007345199, - "num_tokens": 220848662.0, - "step": 183560 - }, - { - "entropy": 1.9293600648641587, - "epoch": 0.5690509385372597, - "grad_norm": 8.116273880004883, - "learning_rate": 3.3536542818221757e-06, - "loss": 0.4947, - "mean_token_accuracy": 0.8529650032520294, - "num_tokens": 220859719.0, - "step": 183570 - }, - { - "entropy": 1.9193365216255187, - "epoch": 0.5690819376623095, - "grad_norm": 7.5625996589660645, - "learning_rate": 3.353562939676464e-06, - "loss": 0.4578, - "mean_token_accuracy": 0.8576421469449997, - "num_tokens": 220870811.0, - "step": 183580 - }, - { - "entropy": 1.842278851568699, - "epoch": 0.5691129367873592, - "grad_norm": 8.025686264038086, - "learning_rate": 3.353471604993895e-06, - "loss": 0.4059, - "mean_token_accuracy": 0.8678732305765152, - "num_tokens": 220882929.0, - "step": 183590 - }, - { - "entropy": 1.9584626644849776, - "epoch": 0.5691439359124089, - "grad_norm": 9.167868614196777, - "learning_rate": 3.3533802777734523e-06, - "loss": 0.5023, - "mean_token_accuracy": 0.8444946691393852, - "num_tokens": 220893487.0, - "step": 183600 - }, - { - "entropy": 1.8276565596461296, - "epoch": 0.5691749350374585, - "grad_norm": 3.9966540336608887, - "learning_rate": 3.3532889580141193e-06, - "loss": 0.3778, - "mean_token_accuracy": 0.8683551594614982, - "num_tokens": 220905974.0, - "step": 183610 - }, - { - "entropy": 1.893023744225502, - "epoch": 0.5692059341625083, - "grad_norm": 7.548893451690674, - "learning_rate": 3.3531976457148803e-06, - "loss": 0.4178, - "mean_token_accuracy": 0.8617466360330581, - "num_tokens": 220917499.0, - "step": 183620 - }, - { - "entropy": 1.9760723441839219, - "epoch": 0.569236933287558, - "grad_norm": 7.412717342376709, - "learning_rate": 3.35310634087472e-06, - "loss": 0.4774, - "mean_token_accuracy": 0.8563466891646385, - "num_tokens": 220928532.0, - "step": 183630 - }, - { - "entropy": 1.8784570693969727, - "epoch": 0.5692679324126076, - "grad_norm": 9.210555076599121, - "learning_rate": 3.353015043492623e-06, - "loss": 0.445, - "mean_token_accuracy": 0.8611171096563339, - "num_tokens": 220941186.0, - "step": 183640 - }, - { - "entropy": 1.8664739981293679, - "epoch": 0.5692989315376573, - "grad_norm": 8.183813095092773, - "learning_rate": 3.352923753567574e-06, - "loss": 0.4071, - "mean_token_accuracy": 0.8593077078461647, - "num_tokens": 220952736.0, - "step": 183650 - }, - { - "entropy": 1.9031253203749656, - "epoch": 0.5693299306627071, - "grad_norm": 8.010009765625, - "learning_rate": 3.352832471098557e-06, - "loss": 0.464, - "mean_token_accuracy": 0.8554147705435753, - "num_tokens": 220964870.0, - "step": 183660 - }, - { - "entropy": 1.9231388315558433, - "epoch": 0.5693609297877568, - "grad_norm": 7.923822402954102, - "learning_rate": 3.352741196084558e-06, - "loss": 0.4639, - "mean_token_accuracy": 0.8455759003758431, - "num_tokens": 220976650.0, - "step": 183670 - }, - { - "entropy": 1.9567074686288835, - "epoch": 0.5693919289128064, - "grad_norm": 8.853241920471191, - "learning_rate": 3.352649928524562e-06, - "loss": 0.496, - "mean_token_accuracy": 0.8528679683804512, - "num_tokens": 220987807.0, - "step": 183680 - }, - { - "entropy": 1.812475062906742, - "epoch": 0.5694229280378561, - "grad_norm": 4.6928606033325195, - "learning_rate": 3.3525586684175554e-06, - "loss": 0.3857, - "mean_token_accuracy": 0.8690725296735764, - "num_tokens": 221001512.0, - "step": 183690 - }, - { - "entropy": 1.9027960404753685, - "epoch": 0.5694539271629059, - "grad_norm": 8.04566764831543, - "learning_rate": 3.3524674157625238e-06, - "loss": 0.4908, - "mean_token_accuracy": 0.8378259748220444, - "num_tokens": 221013722.0, - "step": 183700 - }, - { - "entropy": 1.877773503959179, - "epoch": 0.5694849262879556, - "grad_norm": 7.116755485534668, - "learning_rate": 3.3523761705584506e-06, - "loss": 0.4057, - "mean_token_accuracy": 0.8673833280801773, - "num_tokens": 221025952.0, - "step": 183710 - }, - { - "entropy": 1.880093328654766, - "epoch": 0.5695159254130052, - "grad_norm": 7.747344017028809, - "learning_rate": 3.3522849328043243e-06, - "loss": 0.447, - "mean_token_accuracy": 0.8541530668735504, - "num_tokens": 221037662.0, - "step": 183720 - }, - { - "entropy": 1.8805700927972793, - "epoch": 0.5695469245380549, - "grad_norm": 7.554903507232666, - "learning_rate": 3.352193702499131e-06, - "loss": 0.4396, - "mean_token_accuracy": 0.8540754541754723, - "num_tokens": 221049646.0, - "step": 183730 - }, - { - "entropy": 1.8181962668895721, - "epoch": 0.5695779236631046, - "grad_norm": 8.619624137878418, - "learning_rate": 3.352102479641857e-06, - "loss": 0.3976, - "mean_token_accuracy": 0.8621390506625175, - "num_tokens": 221062839.0, - "step": 183740 - }, - { - "entropy": 1.9224794439971447, - "epoch": 0.5696089227881543, - "grad_norm": 8.067683219909668, - "learning_rate": 3.3520112642314882e-06, - "loss": 0.4298, - "mean_token_accuracy": 0.8606799691915512, - "num_tokens": 221074434.0, - "step": 183750 - }, - { - "entropy": 1.9693562895059586, - "epoch": 0.569639921913204, - "grad_norm": 10.010885238647461, - "learning_rate": 3.3519200562670123e-06, - "loss": 0.5083, - "mean_token_accuracy": 0.8417602896690368, - "num_tokens": 221085251.0, - "step": 183760 - }, - { - "entropy": 1.963064904510975, - "epoch": 0.5696709210382537, - "grad_norm": 9.861781120300293, - "learning_rate": 3.3518288557474155e-06, - "loss": 0.4952, - "mean_token_accuracy": 0.8481303885579109, - "num_tokens": 221096618.0, - "step": 183770 - }, - { - "entropy": 1.9917975157499312, - "epoch": 0.5697019201633033, - "grad_norm": 8.955374717712402, - "learning_rate": 3.3517376626716858e-06, - "loss": 0.4906, - "mean_token_accuracy": 0.8551626339554786, - "num_tokens": 221107422.0, - "step": 183780 - }, - { - "entropy": 1.800957126915455, - "epoch": 0.5697329192883531, - "grad_norm": 7.669996738433838, - "learning_rate": 3.3516464770388104e-06, - "loss": 0.3663, - "mean_token_accuracy": 0.8650017514824867, - "num_tokens": 221120677.0, - "step": 183790 - }, - { - "entropy": 1.8783799439668656, - "epoch": 0.5697639184134028, - "grad_norm": 7.118563652038574, - "learning_rate": 3.351555298847777e-06, - "loss": 0.4096, - "mean_token_accuracy": 0.8592517286539078, - "num_tokens": 221133188.0, - "step": 183800 - }, - { - "entropy": 1.9285740569233893, - "epoch": 0.5697949175384525, - "grad_norm": 8.174911499023438, - "learning_rate": 3.351464128097573e-06, - "loss": 0.4596, - "mean_token_accuracy": 0.8508363962173462, - "num_tokens": 221145365.0, - "step": 183810 - }, - { - "entropy": 1.8228991374373436, - "epoch": 0.5698259166635021, - "grad_norm": 7.459652423858643, - "learning_rate": 3.3513729647871875e-06, - "loss": 0.3882, - "mean_token_accuracy": 0.8594255059957504, - "num_tokens": 221158893.0, - "step": 183820 - }, - { - "entropy": 1.8317522302269935, - "epoch": 0.5698569157885519, - "grad_norm": 8.87353515625, - "learning_rate": 3.3512818089156067e-06, - "loss": 0.3505, - "mean_token_accuracy": 0.8580220609903335, - "num_tokens": 221172123.0, - "step": 183830 - }, - { - "entropy": 1.8688715621829033, - "epoch": 0.5698879149136016, - "grad_norm": 8.272303581237793, - "learning_rate": 3.3511906604818205e-06, - "loss": 0.4211, - "mean_token_accuracy": 0.8654984638094902, - "num_tokens": 221183785.0, - "step": 183840 - }, - { - "entropy": 1.9467407405376433, - "epoch": 0.5699189140386512, - "grad_norm": 8.615777969360352, - "learning_rate": 3.351099519484818e-06, - "loss": 0.4779, - "mean_token_accuracy": 0.8512974575161933, - "num_tokens": 221194815.0, - "step": 183850 - }, - { - "entropy": 1.7814303100109101, - "epoch": 0.5699499131637009, - "grad_norm": 8.84643268585205, - "learning_rate": 3.3510083859235865e-06, - "loss": 0.365, - "mean_token_accuracy": 0.8647480860352517, - "num_tokens": 221208640.0, - "step": 183860 - }, - { - "entropy": 1.9105361938476562, - "epoch": 0.5699809122887507, - "grad_norm": 4.086887836456299, - "learning_rate": 3.3509172597971156e-06, - "loss": 0.4007, - "mean_token_accuracy": 0.8670039981603622, - "num_tokens": 221220350.0, - "step": 183870 - }, - { - "entropy": 1.929616743326187, - "epoch": 0.5700119114138004, - "grad_norm": 10.38846492767334, - "learning_rate": 3.350826141104395e-06, - "loss": 0.4369, - "mean_token_accuracy": 0.861053255200386, - "num_tokens": 221231474.0, - "step": 183880 - }, - { - "entropy": 1.879753015935421, - "epoch": 0.57004291053885, - "grad_norm": 8.4418306350708, - "learning_rate": 3.3507350298444134e-06, - "loss": 0.4435, - "mean_token_accuracy": 0.8551955938339233, - "num_tokens": 221243202.0, - "step": 183890 - }, - { - "entropy": 1.872837108373642, - "epoch": 0.5700739096638997, - "grad_norm": 4.017162322998047, - "learning_rate": 3.3506439260161606e-06, - "loss": 0.4691, - "mean_token_accuracy": 0.8521816402673721, - "num_tokens": 221256087.0, - "step": 183900 - }, - { - "entropy": 1.8944094195961951, - "epoch": 0.5701049087889495, - "grad_norm": 7.516350746154785, - "learning_rate": 3.3505528296186263e-06, - "loss": 0.4123, - "mean_token_accuracy": 0.8568247303366661, - "num_tokens": 221267912.0, - "step": 183910 - }, - { - "entropy": 1.8976934224367141, - "epoch": 0.5701359079139992, - "grad_norm": 6.143962383270264, - "learning_rate": 3.3504617406508e-06, - "loss": 0.5026, - "mean_token_accuracy": 0.8466740503907204, - "num_tokens": 221279968.0, - "step": 183920 - }, - { - "entropy": 1.9248884186148643, - "epoch": 0.5701669070390488, - "grad_norm": 9.086318969726562, - "learning_rate": 3.350370659111672e-06, - "loss": 0.4393, - "mean_token_accuracy": 0.8509233519434929, - "num_tokens": 221292101.0, - "step": 183930 - }, - { - "entropy": 1.8520910263061523, - "epoch": 0.5701979061640985, - "grad_norm": 8.545215606689453, - "learning_rate": 3.3502795850002332e-06, - "loss": 0.4501, - "mean_token_accuracy": 0.8515016883611679, - "num_tokens": 221304057.0, - "step": 183940 - }, - { - "entropy": 1.9516564026474952, - "epoch": 0.5702289052891483, - "grad_norm": 7.543873310089111, - "learning_rate": 3.3501885183154742e-06, - "loss": 0.4834, - "mean_token_accuracy": 0.8392630383372307, - "num_tokens": 221315295.0, - "step": 183950 - }, - { - "entropy": 1.9031856134533882, - "epoch": 0.5702599044141979, - "grad_norm": 8.644308090209961, - "learning_rate": 3.350097459056385e-06, - "loss": 0.4412, - "mean_token_accuracy": 0.8548777773976326, - "num_tokens": 221326997.0, - "step": 183960 - }, - { - "entropy": 1.8481758192181588, - "epoch": 0.5702909035392476, - "grad_norm": 6.953856945037842, - "learning_rate": 3.3500064072219567e-06, - "loss": 0.4322, - "mean_token_accuracy": 0.8590131789445877, - "num_tokens": 221339994.0, - "step": 183970 - }, - { - "entropy": 1.8460357397794724, - "epoch": 0.5703219026642973, - "grad_norm": 3.499201536178589, - "learning_rate": 3.34991536281118e-06, - "loss": 0.4079, - "mean_token_accuracy": 0.8534980297088623, - "num_tokens": 221352932.0, - "step": 183980 - }, - { - "entropy": 1.874594159424305, - "epoch": 0.5703529017893469, - "grad_norm": 8.420859336853027, - "learning_rate": 3.349824325823047e-06, - "loss": 0.429, - "mean_token_accuracy": 0.858637835085392, - "num_tokens": 221365971.0, - "step": 183990 - }, - { - "entropy": 1.8195825964212418, - "epoch": 0.5703839009143967, - "grad_norm": 8.116730690002441, - "learning_rate": 3.349733296256548e-06, - "loss": 0.3564, - "mean_token_accuracy": 0.879566079378128, - "num_tokens": 221379522.0, - "step": 184000 - }, - { - "entropy": 1.9340773046016693, - "epoch": 0.5704149000394464, - "grad_norm": 7.0886030197143555, - "learning_rate": 3.349642274110677e-06, - "loss": 0.4667, - "mean_token_accuracy": 0.8608402445912361, - "num_tokens": 221390338.0, - "step": 184010 - }, - { - "entropy": 1.8692950300872326, - "epoch": 0.5704458991644961, - "grad_norm": 7.241457939147949, - "learning_rate": 3.3495512593844233e-06, - "loss": 0.3905, - "mean_token_accuracy": 0.8651369750499726, - "num_tokens": 221402614.0, - "step": 184020 - }, - { - "entropy": 1.9052888661623002, - "epoch": 0.5704768982895457, - "grad_norm": 9.200166702270508, - "learning_rate": 3.34946025207678e-06, - "loss": 0.4433, - "mean_token_accuracy": 0.8558596163988114, - "num_tokens": 221414238.0, - "step": 184030 - }, - { - "entropy": 1.8719256192445755, - "epoch": 0.5705078974145955, - "grad_norm": 7.344235420227051, - "learning_rate": 3.349369252186739e-06, - "loss": 0.4642, - "mean_token_accuracy": 0.858357360959053, - "num_tokens": 221426450.0, - "step": 184040 - }, - { - "entropy": 1.8825799271464347, - "epoch": 0.5705388965396452, - "grad_norm": 8.594145774841309, - "learning_rate": 3.3492782597132935e-06, - "loss": 0.3814, - "mean_token_accuracy": 0.8614272058010102, - "num_tokens": 221438819.0, - "step": 184050 - }, - { - "entropy": 1.9068335115909576, - "epoch": 0.5705698956646948, - "grad_norm": 3.7201220989227295, - "learning_rate": 3.349187274655436e-06, - "loss": 0.454, - "mean_token_accuracy": 0.8533118292689323, - "num_tokens": 221450339.0, - "step": 184060 - }, - { - "entropy": 1.7704555153846742, - "epoch": 0.5706008947897445, - "grad_norm": 5.395776748657227, - "learning_rate": 3.349096297012158e-06, - "loss": 0.3804, - "mean_token_accuracy": 0.8651518598198891, - "num_tokens": 221464112.0, - "step": 184070 - }, - { - "entropy": 1.8785992763936519, - "epoch": 0.5706318939147943, - "grad_norm": 7.935133934020996, - "learning_rate": 3.3490053267824535e-06, - "loss": 0.4394, - "mean_token_accuracy": 0.8562982544302941, - "num_tokens": 221476400.0, - "step": 184080 - }, - { - "entropy": 1.9128589004278183, - "epoch": 0.570662893039844, - "grad_norm": 8.40445613861084, - "learning_rate": 3.348914363965316e-06, - "loss": 0.4707, - "mean_token_accuracy": 0.8426511645317077, - "num_tokens": 221488509.0, - "step": 184090 - }, - { - "entropy": 1.8774797767400742, - "epoch": 0.5706938921648936, - "grad_norm": 6.458841800689697, - "learning_rate": 3.3488234085597382e-06, - "loss": 0.4032, - "mean_token_accuracy": 0.8713184729218483, - "num_tokens": 221500300.0, - "step": 184100 - }, - { - "entropy": 1.862789809703827, - "epoch": 0.5707248912899433, - "grad_norm": 8.445926666259766, - "learning_rate": 3.348732460564714e-06, - "loss": 0.4022, - "mean_token_accuracy": 0.8612472444772721, - "num_tokens": 221512682.0, - "step": 184110 - }, - { - "entropy": 1.9176673144102097, - "epoch": 0.5707558904149931, - "grad_norm": 8.715554237365723, - "learning_rate": 3.348641519979238e-06, - "loss": 0.4771, - "mean_token_accuracy": 0.8517135217785835, - "num_tokens": 221524087.0, - "step": 184120 - }, - { - "entropy": 1.8521241903305055, - "epoch": 0.5707868895400428, - "grad_norm": 3.694570541381836, - "learning_rate": 3.3485505868023025e-06, - "loss": 0.4083, - "mean_token_accuracy": 0.8623161077499389, - "num_tokens": 221536141.0, - "step": 184130 - }, - { - "entropy": 1.94044778496027, - "epoch": 0.5708178886650924, - "grad_norm": 7.394150733947754, - "learning_rate": 3.3484596610329025e-06, - "loss": 0.4764, - "mean_token_accuracy": 0.8515927508473397, - "num_tokens": 221547849.0, - "step": 184140 - }, - { - "entropy": 1.9291246131062507, - "epoch": 0.5708488877901421, - "grad_norm": 4.240436553955078, - "learning_rate": 3.348368742670032e-06, - "loss": 0.4467, - "mean_token_accuracy": 0.8608126997947693, - "num_tokens": 221558692.0, - "step": 184150 - }, - { - "entropy": 1.9156172186136247, - "epoch": 0.5708798869151919, - "grad_norm": 6.2478742599487305, - "learning_rate": 3.3482778317126867e-06, - "loss": 0.4477, - "mean_token_accuracy": 0.8602596551179886, - "num_tokens": 221570280.0, - "step": 184160 - }, - { - "entropy": 1.8546249985694885, - "epoch": 0.5709108860402415, - "grad_norm": 4.608593463897705, - "learning_rate": 3.3481869281598605e-06, - "loss": 0.3939, - "mean_token_accuracy": 0.862372313439846, - "num_tokens": 221583436.0, - "step": 184170 - }, - { - "entropy": 1.8053992852568626, - "epoch": 0.5709418851652912, - "grad_norm": 4.155288219451904, - "learning_rate": 3.348096032010548e-06, - "loss": 0.3995, - "mean_token_accuracy": 0.8623798578977585, - "num_tokens": 221596995.0, - "step": 184180 - }, - { - "entropy": 1.9127266079187393, - "epoch": 0.5709728842903409, - "grad_norm": 8.820666313171387, - "learning_rate": 3.348005143263744e-06, - "loss": 0.4983, - "mean_token_accuracy": 0.8540406107902527, - "num_tokens": 221608475.0, - "step": 184190 - }, - { - "entropy": 1.9034708708524704, - "epoch": 0.5710038834153905, - "grad_norm": 8.682132720947266, - "learning_rate": 3.3479142619184447e-06, - "loss": 0.4888, - "mean_token_accuracy": 0.8417846232652664, - "num_tokens": 221620246.0, - "step": 184200 - }, - { - "entropy": 1.8972658574581147, - "epoch": 0.5710348825404403, - "grad_norm": 8.441329002380371, - "learning_rate": 3.3478233879736455e-06, - "loss": 0.4557, - "mean_token_accuracy": 0.8526045650243759, - "num_tokens": 221631407.0, - "step": 184210 - }, - { - "entropy": 1.9070860490202903, - "epoch": 0.57106588166549, - "grad_norm": 7.873006343841553, - "learning_rate": 3.347732521428342e-06, - "loss": 0.4474, - "mean_token_accuracy": 0.8503958001732826, - "num_tokens": 221643299.0, - "step": 184220 - }, - { - "entropy": 1.9741833984851838, - "epoch": 0.5710968807905397, - "grad_norm": 8.034712791442871, - "learning_rate": 3.3476416622815293e-06, - "loss": 0.4831, - "mean_token_accuracy": 0.8505676060914993, - "num_tokens": 221654730.0, - "step": 184230 - }, - { - "entropy": 1.8234672516584396, - "epoch": 0.5711278799155893, - "grad_norm": 9.788541793823242, - "learning_rate": 3.3475508105322048e-06, - "loss": 0.3888, - "mean_token_accuracy": 0.8619481399655342, - "num_tokens": 221668242.0, - "step": 184240 - }, - { - "entropy": 1.9055007576942444, - "epoch": 0.5711588790406391, - "grad_norm": 9.112692832946777, - "learning_rate": 3.3474599661793634e-06, - "loss": 0.4432, - "mean_token_accuracy": 0.8560148790478707, - "num_tokens": 221679959.0, - "step": 184250 - }, - { - "entropy": 1.8885867521166801, - "epoch": 0.5711898781656888, - "grad_norm": 8.689698219299316, - "learning_rate": 3.347369129222002e-06, - "loss": 0.4453, - "mean_token_accuracy": 0.8601107582449913, - "num_tokens": 221691999.0, - "step": 184260 - }, - { - "entropy": 1.9156020715832711, - "epoch": 0.5712208772907384, - "grad_norm": 8.803537368774414, - "learning_rate": 3.3472782996591185e-06, - "loss": 0.4405, - "mean_token_accuracy": 0.8553341671824455, - "num_tokens": 221703723.0, - "step": 184270 - }, - { - "entropy": 1.9182790204882623, - "epoch": 0.5712518764157881, - "grad_norm": 8.862181663513184, - "learning_rate": 3.347187477489707e-06, - "loss": 0.4623, - "mean_token_accuracy": 0.8531744465231895, - "num_tokens": 221714671.0, - "step": 184280 - }, - { - "entropy": 1.9861332833766938, - "epoch": 0.5712828755408379, - "grad_norm": 7.757084369659424, - "learning_rate": 3.3470966627127677e-06, - "loss": 0.5033, - "mean_token_accuracy": 0.8418697059154511, - "num_tokens": 221725356.0, - "step": 184290 - }, - { - "entropy": 1.858552846312523, - "epoch": 0.5713138746658876, - "grad_norm": 6.145276069641113, - "learning_rate": 3.347005855327296e-06, - "loss": 0.3798, - "mean_token_accuracy": 0.8689824387431144, - "num_tokens": 221737404.0, - "step": 184300 - }, - { - "entropy": 1.7062400430440903, - "epoch": 0.5713448737909372, - "grad_norm": 9.089742660522461, - "learning_rate": 3.3469150553322895e-06, - "loss": 0.3334, - "mean_token_accuracy": 0.8730680406093597, - "num_tokens": 221751518.0, - "step": 184310 - }, - { - "entropy": 1.8535742238163948, - "epoch": 0.5713758729159869, - "grad_norm": 7.534306049346924, - "learning_rate": 3.3468242627267454e-06, - "loss": 0.4175, - "mean_token_accuracy": 0.8658009812235832, - "num_tokens": 221763099.0, - "step": 184320 - }, - { - "entropy": 1.8937023341655732, - "epoch": 0.5714068720410367, - "grad_norm": 4.602698802947998, - "learning_rate": 3.346733477509662e-06, - "loss": 0.4561, - "mean_token_accuracy": 0.8501476779580116, - "num_tokens": 221774434.0, - "step": 184330 - }, - { - "entropy": 1.8975597500801087, - "epoch": 0.5714378711660864, - "grad_norm": 3.3787753582000732, - "learning_rate": 3.3466426996800364e-06, - "loss": 0.3821, - "mean_token_accuracy": 0.8754017919301986, - "num_tokens": 221786154.0, - "step": 184340 - }, - { - "entropy": 1.9251778170466423, - "epoch": 0.571468870291136, - "grad_norm": 7.641809463500977, - "learning_rate": 3.3465519292368692e-06, - "loss": 0.4741, - "mean_token_accuracy": 0.8496328294277191, - "num_tokens": 221797589.0, - "step": 184350 - }, - { - "entropy": 1.9057638555765153, - "epoch": 0.5714998694161857, - "grad_norm": 7.094670295715332, - "learning_rate": 3.3464611661791564e-06, - "loss": 0.4109, - "mean_token_accuracy": 0.8637410655617714, - "num_tokens": 221809125.0, - "step": 184360 - }, - { - "entropy": 1.9552569746971131, - "epoch": 0.5715308685412355, - "grad_norm": 8.105437278747559, - "learning_rate": 3.346370410505897e-06, - "loss": 0.4735, - "mean_token_accuracy": 0.850199231505394, - "num_tokens": 221819459.0, - "step": 184370 - }, - { - "entropy": 1.9675395026803018, - "epoch": 0.5715618676662851, - "grad_norm": 8.247036933898926, - "learning_rate": 3.3462796622160893e-06, - "loss": 0.4602, - "mean_token_accuracy": 0.8617465287446976, - "num_tokens": 221830856.0, - "step": 184380 - }, - { - "entropy": 1.857323682308197, - "epoch": 0.5715928667913348, - "grad_norm": 7.871233940124512, - "learning_rate": 3.346188921308734e-06, - "loss": 0.4005, - "mean_token_accuracy": 0.8622802942991257, - "num_tokens": 221843358.0, - "step": 184390 - }, - { - "entropy": 1.8538252532482147, - "epoch": 0.5716238659163845, - "grad_norm": 7.213409900665283, - "learning_rate": 3.346098187782828e-06, - "loss": 0.4626, - "mean_token_accuracy": 0.857527782022953, - "num_tokens": 221856160.0, - "step": 184400 - }, - { - "entropy": 1.850920520722866, - "epoch": 0.5716548650414343, - "grad_norm": 8.479315757751465, - "learning_rate": 3.346007461637373e-06, - "loss": 0.4254, - "mean_token_accuracy": 0.8530377745628357, - "num_tokens": 221868467.0, - "step": 184410 - }, - { - "entropy": 1.9420105203986169, - "epoch": 0.5716858641664839, - "grad_norm": 9.029580116271973, - "learning_rate": 3.345916742871366e-06, - "loss": 0.4586, - "mean_token_accuracy": 0.8558577001094818, - "num_tokens": 221879886.0, - "step": 184420 - }, - { - "entropy": 1.9024136036634445, - "epoch": 0.5717168632915336, - "grad_norm": 7.557387351989746, - "learning_rate": 3.345826031483808e-06, - "loss": 0.4562, - "mean_token_accuracy": 0.8580532029271126, - "num_tokens": 221892180.0, - "step": 184430 - }, - { - "entropy": 1.8506018549203873, - "epoch": 0.5717478624165833, - "grad_norm": 7.604180812835693, - "learning_rate": 3.3457353274736993e-06, - "loss": 0.3891, - "mean_token_accuracy": 0.8685560122132301, - "num_tokens": 221904480.0, - "step": 184440 - }, - { - "entropy": 1.9249087005853653, - "epoch": 0.5717788615416329, - "grad_norm": 6.4561238288879395, - "learning_rate": 3.3456446308400387e-06, - "loss": 0.4528, - "mean_token_accuracy": 0.8623852536082268, - "num_tokens": 221916316.0, - "step": 184450 - }, - { - "entropy": 1.9324004918336868, - "epoch": 0.5718098606666827, - "grad_norm": 8.638805389404297, - "learning_rate": 3.3455539415818272e-06, - "loss": 0.4662, - "mean_token_accuracy": 0.8507967680692673, - "num_tokens": 221926873.0, - "step": 184460 - }, - { - "entropy": 1.840507847070694, - "epoch": 0.5718408597917324, - "grad_norm": 3.3923275470733643, - "learning_rate": 3.345463259698065e-06, - "loss": 0.4129, - "mean_token_accuracy": 0.8589583113789558, - "num_tokens": 221938994.0, - "step": 184470 - }, - { - "entropy": 1.8590350210666657, - "epoch": 0.571871858916782, - "grad_norm": 7.706557273864746, - "learning_rate": 3.3453725851877535e-06, - "loss": 0.4199, - "mean_token_accuracy": 0.8596820175647736, - "num_tokens": 221951096.0, - "step": 184480 - }, - { - "entropy": 1.8945080533623695, - "epoch": 0.5719028580418317, - "grad_norm": 8.865853309631348, - "learning_rate": 3.3452819180498917e-06, - "loss": 0.4285, - "mean_token_accuracy": 0.8579330548644066, - "num_tokens": 221963513.0, - "step": 184490 - }, - { - "entropy": 1.7538821056485177, - "epoch": 0.5719338571668815, - "grad_norm": 4.122339248657227, - "learning_rate": 3.3451912582834833e-06, - "loss": 0.3788, - "mean_token_accuracy": 0.8662702322006226, - "num_tokens": 221977767.0, - "step": 184500 - }, - { - "entropy": 1.9670095562934875, - "epoch": 0.5719648562919312, - "grad_norm": 8.489312171936035, - "learning_rate": 3.345100605887527e-06, - "loss": 0.5325, - "mean_token_accuracy": 0.8418423220515251, - "num_tokens": 221989148.0, - "step": 184510 - }, - { - "entropy": 1.9049221113324166, - "epoch": 0.5719958554169808, - "grad_norm": 3.830526113510132, - "learning_rate": 3.345009960861025e-06, - "loss": 0.4495, - "mean_token_accuracy": 0.8524208962917328, - "num_tokens": 222001134.0, - "step": 184520 - }, - { - "entropy": 1.8578404620289803, - "epoch": 0.5720268545420305, - "grad_norm": 7.5913872718811035, - "learning_rate": 3.344919323202979e-06, - "loss": 0.4048, - "mean_token_accuracy": 0.8586911797523499, - "num_tokens": 222014081.0, - "step": 184530 - }, - { - "entropy": 1.8676321879029274, - "epoch": 0.5720578536670803, - "grad_norm": 4.292795658111572, - "learning_rate": 3.3448286929123913e-06, - "loss": 0.4647, - "mean_token_accuracy": 0.8599231988191605, - "num_tokens": 222026648.0, - "step": 184540 - }, - { - "entropy": 1.9369481548666954, - "epoch": 0.57208885279213, - "grad_norm": 8.17246150970459, - "learning_rate": 3.3447380699882633e-06, - "loss": 0.44, - "mean_token_accuracy": 0.8523173183202744, - "num_tokens": 222037839.0, - "step": 184550 - }, - { - "entropy": 1.976218593120575, - "epoch": 0.5721198519171796, - "grad_norm": 7.205543041229248, - "learning_rate": 3.3446474544295966e-06, - "loss": 0.5077, - "mean_token_accuracy": 0.8457430496811866, - "num_tokens": 222048700.0, - "step": 184560 - }, - { - "entropy": 1.9195255041122437, - "epoch": 0.5721508510422293, - "grad_norm": 8.929695129394531, - "learning_rate": 3.3445568462353943e-06, - "loss": 0.5033, - "mean_token_accuracy": 0.8410187095403672, - "num_tokens": 222059842.0, - "step": 184570 - }, - { - "entropy": 1.8937813118100166, - "epoch": 0.5721818501672791, - "grad_norm": 2.680063247680664, - "learning_rate": 3.3444662454046596e-06, - "loss": 0.4137, - "mean_token_accuracy": 0.861142173409462, - "num_tokens": 222072051.0, - "step": 184580 - }, - { - "entropy": 1.8604128420352937, - "epoch": 0.5722128492923287, - "grad_norm": 8.521194458007812, - "learning_rate": 3.344375651936393e-06, - "loss": 0.403, - "mean_token_accuracy": 0.8724682793021202, - "num_tokens": 222083791.0, - "step": 184590 - }, - { - "entropy": 1.85889263600111, - "epoch": 0.5722438484173784, - "grad_norm": 8.879921913146973, - "learning_rate": 3.3442850658295996e-06, - "loss": 0.461, - "mean_token_accuracy": 0.8506242975592613, - "num_tokens": 222096815.0, - "step": 184600 - }, - { - "entropy": 1.9790507823228836, - "epoch": 0.5722748475424281, - "grad_norm": 9.367940902709961, - "learning_rate": 3.344194487083281e-06, - "loss": 0.4705, - "mean_token_accuracy": 0.8495551526546479, - "num_tokens": 222107446.0, - "step": 184610 - }, - { - "entropy": 1.8882742941379547, - "epoch": 0.5723058466674779, - "grad_norm": 7.1077494621276855, - "learning_rate": 3.3441039156964413e-06, - "loss": 0.4317, - "mean_token_accuracy": 0.8558852866291999, - "num_tokens": 222119676.0, - "step": 184620 - }, - { - "entropy": 1.8880833089351654, - "epoch": 0.5723368457925275, - "grad_norm": 9.013257026672363, - "learning_rate": 3.344013351668084e-06, - "loss": 0.4912, - "mean_token_accuracy": 0.8326297968626022, - "num_tokens": 222131253.0, - "step": 184630 - }, - { - "entropy": 1.8886737152934074, - "epoch": 0.5723678449175772, - "grad_norm": 7.31126594543457, - "learning_rate": 3.3439227949972125e-06, - "loss": 0.4299, - "mean_token_accuracy": 0.863846381008625, - "num_tokens": 222142462.0, - "step": 184640 - }, - { - "entropy": 1.867803943157196, - "epoch": 0.5723988440426269, - "grad_norm": 7.314255714416504, - "learning_rate": 3.3438322456828306e-06, - "loss": 0.3942, - "mean_token_accuracy": 0.8646995663642884, - "num_tokens": 222155046.0, - "step": 184650 - }, - { - "entropy": 1.9728734582662582, - "epoch": 0.5724298431676766, - "grad_norm": 9.504714965820312, - "learning_rate": 3.3437417037239413e-06, - "loss": 0.5216, - "mean_token_accuracy": 0.8391286611557007, - "num_tokens": 222165883.0, - "step": 184660 - }, - { - "entropy": 1.8285054996609689, - "epoch": 0.5724608422927263, - "grad_norm": 8.178985595703125, - "learning_rate": 3.343651169119551e-06, - "loss": 0.417, - "mean_token_accuracy": 0.8603216618299484, - "num_tokens": 222178296.0, - "step": 184670 - }, - { - "entropy": 1.8394104793667794, - "epoch": 0.572491841417776, - "grad_norm": 8.587333679199219, - "learning_rate": 3.3435606418686635e-06, - "loss": 0.4059, - "mean_token_accuracy": 0.8623070940375328, - "num_tokens": 222190553.0, - "step": 184680 - }, - { - "entropy": 1.9125571131706238, - "epoch": 0.5725228405428257, - "grad_norm": 9.115639686584473, - "learning_rate": 3.3434701219702815e-06, - "loss": 0.5031, - "mean_token_accuracy": 0.8482768699526787, - "num_tokens": 222202320.0, - "step": 184690 - }, - { - "entropy": 1.8850971892476083, - "epoch": 0.5725538396678753, - "grad_norm": 3.7713778018951416, - "learning_rate": 3.3433796094234124e-06, - "loss": 0.4296, - "mean_token_accuracy": 0.8511196061968803, - "num_tokens": 222215003.0, - "step": 184700 - }, - { - "entropy": 1.9377149119973183, - "epoch": 0.5725848387929251, - "grad_norm": 8.725337028503418, - "learning_rate": 3.3432891042270587e-06, - "loss": 0.5203, - "mean_token_accuracy": 0.8383104220032692, - "num_tokens": 222226156.0, - "step": 184710 - }, - { - "entropy": 1.8472907304763795, - "epoch": 0.5726158379179748, - "grad_norm": 3.7417922019958496, - "learning_rate": 3.3431986063802274e-06, - "loss": 0.4063, - "mean_token_accuracy": 0.8651013299822807, - "num_tokens": 222238479.0, - "step": 184720 - }, - { - "entropy": 1.8834988474845886, - "epoch": 0.5726468370430244, - "grad_norm": 4.482487678527832, - "learning_rate": 3.343108115881923e-06, - "loss": 0.4553, - "mean_token_accuracy": 0.8457119733095169, - "num_tokens": 222250852.0, - "step": 184730 - }, - { - "entropy": 1.965399381518364, - "epoch": 0.5726778361680741, - "grad_norm": 8.264575004577637, - "learning_rate": 3.343017632731152e-06, - "loss": 0.4604, - "mean_token_accuracy": 0.8593659609556198, - "num_tokens": 222261325.0, - "step": 184740 - }, - { - "entropy": 1.8925912261009217, - "epoch": 0.5727088352931239, - "grad_norm": 8.465385437011719, - "learning_rate": 3.3429271569269196e-06, - "loss": 0.4592, - "mean_token_accuracy": 0.8567355901002884, - "num_tokens": 222273418.0, - "step": 184750 - }, - { - "entropy": 1.8055074244737626, - "epoch": 0.5727398344181736, - "grad_norm": 10.0112943649292, - "learning_rate": 3.342836688468231e-06, - "loss": 0.3752, - "mean_token_accuracy": 0.8707456439733505, - "num_tokens": 222286332.0, - "step": 184760 - }, - { - "entropy": 1.9379899948835373, - "epoch": 0.5727708335432232, - "grad_norm": 8.254408836364746, - "learning_rate": 3.3427462273540922e-06, - "loss": 0.4284, - "mean_token_accuracy": 0.864084042608738, - "num_tokens": 222297376.0, - "step": 184770 - }, - { - "entropy": 1.8589140594005584, - "epoch": 0.5728018326682729, - "grad_norm": 4.008504390716553, - "learning_rate": 3.3426557735835114e-06, - "loss": 0.4101, - "mean_token_accuracy": 0.8606909677386284, - "num_tokens": 222309500.0, - "step": 184780 - }, - { - "entropy": 1.9276038601994514, - "epoch": 0.5728328317933227, - "grad_norm": 7.754671573638916, - "learning_rate": 3.342565327155493e-06, - "loss": 0.4735, - "mean_token_accuracy": 0.8512817919254303, - "num_tokens": 222321392.0, - "step": 184790 - }, - { - "entropy": 1.8575094789266586, - "epoch": 0.5728638309183723, - "grad_norm": 8.80823040008545, - "learning_rate": 3.3424748880690448e-06, - "loss": 0.3583, - "mean_token_accuracy": 0.8824114248156547, - "num_tokens": 222333134.0, - "step": 184800 - }, - { - "entropy": 1.9034967720508575, - "epoch": 0.572894830043422, - "grad_norm": 7.53140115737915, - "learning_rate": 3.3423844563231735e-06, - "loss": 0.4294, - "mean_token_accuracy": 0.8583787888288498, - "num_tokens": 222344834.0, - "step": 184810 - }, - { - "entropy": 1.9382282495498657, - "epoch": 0.5729258291684717, - "grad_norm": 7.528757572174072, - "learning_rate": 3.3422940319168854e-06, - "loss": 0.4647, - "mean_token_accuracy": 0.8557207301259041, - "num_tokens": 222355554.0, - "step": 184820 - }, - { - "entropy": 1.8476083979010582, - "epoch": 0.5729568282935215, - "grad_norm": 8.761383056640625, - "learning_rate": 3.342203614849189e-06, - "loss": 0.4288, - "mean_token_accuracy": 0.8453965499997139, - "num_tokens": 222368658.0, - "step": 184830 - }, - { - "entropy": 1.9420979261398315, - "epoch": 0.5729878274185711, - "grad_norm": 8.630688667297363, - "learning_rate": 3.342113205119091e-06, - "loss": 0.4568, - "mean_token_accuracy": 0.8516060650348664, - "num_tokens": 222379694.0, - "step": 184840 - }, - { - "entropy": 1.8539077758789062, - "epoch": 0.5730188265436208, - "grad_norm": 8.41552734375, - "learning_rate": 3.3420228027255983e-06, - "loss": 0.4226, - "mean_token_accuracy": 0.85543172955513, - "num_tokens": 222391970.0, - "step": 184850 - }, - { - "entropy": 1.915343302488327, - "epoch": 0.5730498256686705, - "grad_norm": 4.425499439239502, - "learning_rate": 3.34193240766772e-06, - "loss": 0.505, - "mean_token_accuracy": 0.8428995698690415, - "num_tokens": 222404607.0, - "step": 184860 - }, - { - "entropy": 1.873552420735359, - "epoch": 0.5730808247937202, - "grad_norm": 7.669476509094238, - "learning_rate": 3.341842019944464e-06, - "loss": 0.4291, - "mean_token_accuracy": 0.8621705710887909, - "num_tokens": 222416145.0, - "step": 184870 - }, - { - "entropy": 1.9667245000600815, - "epoch": 0.5731118239187699, - "grad_norm": 7.355974197387695, - "learning_rate": 3.341751639554837e-06, - "loss": 0.5173, - "mean_token_accuracy": 0.8417428076267243, - "num_tokens": 222427915.0, - "step": 184880 - }, - { - "entropy": 1.8021482951939105, - "epoch": 0.5731428230438196, - "grad_norm": 10.828743934631348, - "learning_rate": 3.341661266497849e-06, - "loss": 0.3702, - "mean_token_accuracy": 0.8738041296601295, - "num_tokens": 222441840.0, - "step": 184890 - }, - { - "entropy": 1.8824231550097466, - "epoch": 0.5731738221688693, - "grad_norm": 8.7166109085083, - "learning_rate": 3.341570900772508e-06, - "loss": 0.4612, - "mean_token_accuracy": 0.8486659601330757, - "num_tokens": 222453572.0, - "step": 184900 - }, - { - "entropy": 1.8422776013612747, - "epoch": 0.573204821293919, - "grad_norm": 3.8054652214050293, - "learning_rate": 3.3414805423778225e-06, - "loss": 0.4281, - "mean_token_accuracy": 0.8694258004426956, - "num_tokens": 222466149.0, - "step": 184910 - }, - { - "entropy": 1.914320407807827, - "epoch": 0.5732358204189687, - "grad_norm": 7.1034040451049805, - "learning_rate": 3.3413901913128016e-06, - "loss": 0.4678, - "mean_token_accuracy": 0.8578106880187988, - "num_tokens": 222477685.0, - "step": 184920 - }, - { - "entropy": 1.801837073266506, - "epoch": 0.5732668195440184, - "grad_norm": 7.344631671905518, - "learning_rate": 3.3412998475764543e-06, - "loss": 0.3845, - "mean_token_accuracy": 0.8626899436116219, - "num_tokens": 222490789.0, - "step": 184930 - }, - { - "entropy": 1.8856204375624657, - "epoch": 0.573297818669068, - "grad_norm": 9.500457763671875, - "learning_rate": 3.3412095111677906e-06, - "loss": 0.4072, - "mean_token_accuracy": 0.8661552131175995, - "num_tokens": 222503018.0, - "step": 184940 - }, - { - "entropy": 1.9374446853995324, - "epoch": 0.5733288177941177, - "grad_norm": 7.306221961975098, - "learning_rate": 3.341119182085818e-06, - "loss": 0.4638, - "mean_token_accuracy": 0.8488444343209267, - "num_tokens": 222514264.0, - "step": 184950 - }, - { - "entropy": 1.964336033165455, - "epoch": 0.5733598169191675, - "grad_norm": 7.5833635330200195, - "learning_rate": 3.3410288603295487e-06, - "loss": 0.5151, - "mean_token_accuracy": 0.838313241302967, - "num_tokens": 222524996.0, - "step": 184960 - }, - { - "entropy": 1.879715594649315, - "epoch": 0.5733908160442172, - "grad_norm": 4.586198329925537, - "learning_rate": 3.340938545897991e-06, - "loss": 0.4092, - "mean_token_accuracy": 0.8588324815034867, - "num_tokens": 222537104.0, - "step": 184970 - }, - { - "entropy": 1.8826902776956558, - "epoch": 0.5734218151692668, - "grad_norm": 2.1931099891662598, - "learning_rate": 3.3408482387901552e-06, - "loss": 0.4676, - "mean_token_accuracy": 0.8416186437010765, - "num_tokens": 222550488.0, - "step": 184980 - }, - { - "entropy": 1.7597140736877919, - "epoch": 0.5734528142943165, - "grad_norm": 7.864374160766602, - "learning_rate": 3.340757939005052e-06, - "loss": 0.4294, - "mean_token_accuracy": 0.8579392388463021, - "num_tokens": 222564524.0, - "step": 184990 - }, - { - "entropy": 1.8594155356287956, - "epoch": 0.5734838134193663, - "grad_norm": 8.28993034362793, - "learning_rate": 3.340667646541692e-06, - "loss": 0.5026, - "mean_token_accuracy": 0.855444373190403, - "num_tokens": 222576994.0, - "step": 185000 - }, - { - "entropy": 1.9743560194969176, - "epoch": 0.5735148125444159, - "grad_norm": 8.733586311340332, - "learning_rate": 3.3405773613990844e-06, - "loss": 0.4627, - "mean_token_accuracy": 0.8568108826875687, - "num_tokens": 222588182.0, - "step": 185010 - }, - { - "entropy": 1.9266073897480964, - "epoch": 0.5735458116694656, - "grad_norm": 8.534564018249512, - "learning_rate": 3.3404870835762415e-06, - "loss": 0.4355, - "mean_token_accuracy": 0.8611538261175156, - "num_tokens": 222599656.0, - "step": 185020 - }, - { - "entropy": 1.919064535200596, - "epoch": 0.5735768107945153, - "grad_norm": 8.062556266784668, - "learning_rate": 3.340396813072173e-06, - "loss": 0.4306, - "mean_token_accuracy": 0.8658887654542923, - "num_tokens": 222611088.0, - "step": 185030 - }, - { - "entropy": 1.8669011473655701, - "epoch": 0.5736078099195651, - "grad_norm": 3.368703603744507, - "learning_rate": 3.3403065498858922e-06, - "loss": 0.4036, - "mean_token_accuracy": 0.8619739264249802, - "num_tokens": 222623519.0, - "step": 185040 - }, - { - "entropy": 1.9168578773736953, - "epoch": 0.5736388090446147, - "grad_norm": 8.530938148498535, - "learning_rate": 3.3402162940164077e-06, - "loss": 0.4536, - "mean_token_accuracy": 0.8555745720863343, - "num_tokens": 222635620.0, - "step": 185050 - }, - { - "entropy": 1.8974201664328576, - "epoch": 0.5736698081696644, - "grad_norm": 8.14111328125, - "learning_rate": 3.3401260454627328e-06, - "loss": 0.417, - "mean_token_accuracy": 0.8681931644678116, - "num_tokens": 222647379.0, - "step": 185060 - }, - { - "entropy": 1.8431912809610367, - "epoch": 0.5737008072947141, - "grad_norm": 5.964134693145752, - "learning_rate": 3.340035804223879e-06, - "loss": 0.3635, - "mean_token_accuracy": 0.8663034707307815, - "num_tokens": 222660763.0, - "step": 185070 - }, - { - "entropy": 1.9197598233819009, - "epoch": 0.5737318064197638, - "grad_norm": 9.90364933013916, - "learning_rate": 3.339945570298858e-06, - "loss": 0.4583, - "mean_token_accuracy": 0.8376195728778839, - "num_tokens": 222673262.0, - "step": 185080 - }, - { - "entropy": 1.8817140839993953, - "epoch": 0.5737628055448135, - "grad_norm": 3.678171396255493, - "learning_rate": 3.3398553436866814e-06, - "loss": 0.4016, - "mean_token_accuracy": 0.8645113825798034, - "num_tokens": 222685683.0, - "step": 185090 - }, - { - "entropy": 1.97952641248703, - "epoch": 0.5737938046698632, - "grad_norm": 8.106505393981934, - "learning_rate": 3.3397651243863634e-06, - "loss": 0.4696, - "mean_token_accuracy": 0.8467271432280541, - "num_tokens": 222696840.0, - "step": 185100 - }, - { - "entropy": 1.8992469534277916, - "epoch": 0.5738248037949129, - "grad_norm": 6.165231227874756, - "learning_rate": 3.339674912396914e-06, - "loss": 0.4567, - "mean_token_accuracy": 0.8624764665961265, - "num_tokens": 222709302.0, - "step": 185110 - }, - { - "entropy": 2.011495552957058, - "epoch": 0.5738558029199626, - "grad_norm": 8.762554168701172, - "learning_rate": 3.3395847077173466e-06, - "loss": 0.4771, - "mean_token_accuracy": 0.8507163986563683, - "num_tokens": 222720793.0, - "step": 185120 - }, - { - "entropy": 1.9778069868683814, - "epoch": 0.5738868020450123, - "grad_norm": 9.670307159423828, - "learning_rate": 3.339494510346675e-06, - "loss": 0.5101, - "mean_token_accuracy": 0.8424128621816636, - "num_tokens": 222732158.0, - "step": 185130 - }, - { - "entropy": 1.9170079410076142, - "epoch": 0.573917801170062, - "grad_norm": 9.013583183288574, - "learning_rate": 3.339404320283912e-06, - "loss": 0.419, - "mean_token_accuracy": 0.8663454532623291, - "num_tokens": 222743467.0, - "step": 185140 - }, - { - "entropy": 1.8615730196237563, - "epoch": 0.5739488002951116, - "grad_norm": 4.1062517166137695, - "learning_rate": 3.3393141375280703e-06, - "loss": 0.413, - "mean_token_accuracy": 0.856800027191639, - "num_tokens": 222756022.0, - "step": 185150 - }, - { - "entropy": 1.7988073825836182, - "epoch": 0.5739797994201614, - "grad_norm": 7.607936382293701, - "learning_rate": 3.3392239620781634e-06, - "loss": 0.3365, - "mean_token_accuracy": 0.8725836887955666, - "num_tokens": 222769768.0, - "step": 185160 - }, - { - "entropy": 1.8891526952385902, - "epoch": 0.5740107985452111, - "grad_norm": 4.363334655761719, - "learning_rate": 3.3391337939332046e-06, - "loss": 0.4161, - "mean_token_accuracy": 0.8500419408082962, - "num_tokens": 222782578.0, - "step": 185170 - }, - { - "entropy": 1.8543832316994666, - "epoch": 0.5740417976702608, - "grad_norm": 7.629032135009766, - "learning_rate": 3.3390436330922088e-06, - "loss": 0.4031, - "mean_token_accuracy": 0.8651989296078682, - "num_tokens": 222795069.0, - "step": 185180 - }, - { - "entropy": 1.951645615696907, - "epoch": 0.5740727967953104, - "grad_norm": 9.708003997802734, - "learning_rate": 3.3389534795541887e-06, - "loss": 0.4839, - "mean_token_accuracy": 0.8511202931404114, - "num_tokens": 222807285.0, - "step": 185190 - }, - { - "entropy": 1.9666521787643432, - "epoch": 0.5741037959203601, - "grad_norm": 8.467106819152832, - "learning_rate": 3.3388633333181598e-06, - "loss": 0.4732, - "mean_token_accuracy": 0.8552713766694069, - "num_tokens": 222818841.0, - "step": 185200 - }, - { - "entropy": 1.889816901087761, - "epoch": 0.5741347950454099, - "grad_norm": 6.984008312225342, - "learning_rate": 3.338773194383135e-06, - "loss": 0.3885, - "mean_token_accuracy": 0.8665608659386634, - "num_tokens": 222830644.0, - "step": 185210 - }, - { - "entropy": 1.81104983240366, - "epoch": 0.5741657941704595, - "grad_norm": 2.781053066253662, - "learning_rate": 3.3386830627481296e-06, - "loss": 0.4114, - "mean_token_accuracy": 0.8583825513720512, - "num_tokens": 222844372.0, - "step": 185220 - }, - { - "entropy": 1.7910630270838737, - "epoch": 0.5741967932955092, - "grad_norm": 8.483392715454102, - "learning_rate": 3.3385929384121583e-06, - "loss": 0.4083, - "mean_token_accuracy": 0.8679147735238075, - "num_tokens": 222858299.0, - "step": 185230 - }, - { - "entropy": 1.8780374467372893, - "epoch": 0.5742277924205589, - "grad_norm": 8.997187614440918, - "learning_rate": 3.338502821374236e-06, - "loss": 0.4365, - "mean_token_accuracy": 0.8558851927518845, - "num_tokens": 222870451.0, - "step": 185240 - }, - { - "entropy": 1.9205268666148185, - "epoch": 0.5742587915456087, - "grad_norm": 8.913628578186035, - "learning_rate": 3.3384127116333775e-06, - "loss": 0.4701, - "mean_token_accuracy": 0.8517489165067673, - "num_tokens": 222881620.0, - "step": 185250 - }, - { - "entropy": 1.8900404646992683, - "epoch": 0.5742897906706583, - "grad_norm": 7.601342678070068, - "learning_rate": 3.338322609188599e-06, - "loss": 0.4345, - "mean_token_accuracy": 0.8635605916380882, - "num_tokens": 222893543.0, - "step": 185260 - }, - { - "entropy": 1.8780425779521466, - "epoch": 0.574320789795708, - "grad_norm": 7.883829116821289, - "learning_rate": 3.3382325140389145e-06, - "loss": 0.4162, - "mean_token_accuracy": 0.866215144097805, - "num_tokens": 222905289.0, - "step": 185270 - }, - { - "entropy": 1.8748355612158776, - "epoch": 0.5743517889207577, - "grad_norm": 7.552894115447998, - "learning_rate": 3.3381424261833405e-06, - "loss": 0.4114, - "mean_token_accuracy": 0.860708336532116, - "num_tokens": 222917674.0, - "step": 185280 - }, - { - "entropy": 1.968424329161644, - "epoch": 0.5743827880458074, - "grad_norm": 8.948657035827637, - "learning_rate": 3.338052345620893e-06, - "loss": 0.4957, - "mean_token_accuracy": 0.848981560766697, - "num_tokens": 222928114.0, - "step": 185290 - }, - { - "entropy": 1.8713356271386146, - "epoch": 0.5744137871708571, - "grad_norm": 3.9206175804138184, - "learning_rate": 3.3379622723505878e-06, - "loss": 0.3843, - "mean_token_accuracy": 0.8630928680300712, - "num_tokens": 222940066.0, - "step": 185300 - }, - { - "entropy": 1.83049533367157, - "epoch": 0.5744447862959068, - "grad_norm": 11.425100326538086, - "learning_rate": 3.337872206371441e-06, - "loss": 0.4477, - "mean_token_accuracy": 0.8608759924769401, - "num_tokens": 222953334.0, - "step": 185310 - }, - { - "entropy": 1.9473757684230804, - "epoch": 0.5744757854209565, - "grad_norm": 8.36613655090332, - "learning_rate": 3.3377821476824686e-06, - "loss": 0.4622, - "mean_token_accuracy": 0.8547708854079247, - "num_tokens": 222964268.0, - "step": 185320 - }, - { - "entropy": 1.8634528711438179, - "epoch": 0.5745067845460062, - "grad_norm": 4.429279804229736, - "learning_rate": 3.337692096282688e-06, - "loss": 0.3893, - "mean_token_accuracy": 0.8642795279622077, - "num_tokens": 222976959.0, - "step": 185330 - }, - { - "entropy": 1.9195874109864235, - "epoch": 0.5745377836710559, - "grad_norm": 8.691102981567383, - "learning_rate": 3.3376020521711158e-06, - "loss": 0.4845, - "mean_token_accuracy": 0.848965446650982, - "num_tokens": 222988480.0, - "step": 185340 - }, - { - "entropy": 1.8805852934718132, - "epoch": 0.5745687827961056, - "grad_norm": 8.096450805664062, - "learning_rate": 3.3375120153467684e-06, - "loss": 0.4481, - "mean_token_accuracy": 0.8595820307731629, - "num_tokens": 223000727.0, - "step": 185350 - }, - { - "entropy": 1.9091674134135246, - "epoch": 0.5745997819211552, - "grad_norm": 9.245760917663574, - "learning_rate": 3.3374219858086632e-06, - "loss": 0.4397, - "mean_token_accuracy": 0.85263032913208, - "num_tokens": 223012381.0, - "step": 185360 - }, - { - "entropy": 1.9092094480991364, - "epoch": 0.574630781046205, - "grad_norm": 7.5631585121154785, - "learning_rate": 3.3373319635558177e-06, - "loss": 0.4805, - "mean_token_accuracy": 0.8590279519557953, - "num_tokens": 223023553.0, - "step": 185370 - }, - { - "entropy": 1.9298975244164467, - "epoch": 0.5746617801712547, - "grad_norm": 8.844869613647461, - "learning_rate": 3.3372419485872493e-06, - "loss": 0.4743, - "mean_token_accuracy": 0.8552899554371833, - "num_tokens": 223034518.0, - "step": 185380 - }, - { - "entropy": 1.9004199922084808, - "epoch": 0.5746927792963044, - "grad_norm": 8.642338752746582, - "learning_rate": 3.337151940901976e-06, - "loss": 0.4267, - "mean_token_accuracy": 0.8602864474058152, - "num_tokens": 223046017.0, - "step": 185390 - }, - { - "entropy": 1.9718008667230607, - "epoch": 0.574723778421354, - "grad_norm": 8.08690071105957, - "learning_rate": 3.3370619404990144e-06, - "loss": 0.4737, - "mean_token_accuracy": 0.8506859511137008, - "num_tokens": 223056921.0, - "step": 185400 - }, - { - "entropy": 1.862248282134533, - "epoch": 0.5747547775464038, - "grad_norm": 7.678162574768066, - "learning_rate": 3.336971947377385e-06, - "loss": 0.4184, - "mean_token_accuracy": 0.8505977019667625, - "num_tokens": 223070656.0, - "step": 185410 - }, - { - "entropy": 1.9419626086950301, - "epoch": 0.5747857766714535, - "grad_norm": 8.565497398376465, - "learning_rate": 3.336881961536103e-06, - "loss": 0.4772, - "mean_token_accuracy": 0.84831403195858, - "num_tokens": 223082072.0, - "step": 185420 - }, - { - "entropy": 1.94987653195858, - "epoch": 0.5748167757965031, - "grad_norm": 8.058905601501465, - "learning_rate": 3.3367919829741894e-06, - "loss": 0.4562, - "mean_token_accuracy": 0.8444725587964058, - "num_tokens": 223094099.0, - "step": 185430 - }, - { - "entropy": 1.952441319823265, - "epoch": 0.5748477749215528, - "grad_norm": 9.241267204284668, - "learning_rate": 3.3367020116906613e-06, - "loss": 0.5223, - "mean_token_accuracy": 0.8438718289136886, - "num_tokens": 223105424.0, - "step": 185440 - }, - { - "entropy": 1.908140140771866, - "epoch": 0.5748787740466025, - "grad_norm": 3.946951150894165, - "learning_rate": 3.3366120476845383e-06, - "loss": 0.4116, - "mean_token_accuracy": 0.8631471082568168, - "num_tokens": 223117241.0, - "step": 185450 - }, - { - "entropy": 1.880707675218582, - "epoch": 0.5749097731716523, - "grad_norm": 3.73358154296875, - "learning_rate": 3.33652209095484e-06, - "loss": 0.4374, - "mean_token_accuracy": 0.848235011100769, - "num_tokens": 223130225.0, - "step": 185460 - }, - { - "entropy": 1.9595543146133423, - "epoch": 0.5749407722967019, - "grad_norm": 8.203792572021484, - "learning_rate": 3.3364321415005833e-06, - "loss": 0.4392, - "mean_token_accuracy": 0.8585345134139061, - "num_tokens": 223140686.0, - "step": 185470 - }, - { - "entropy": 1.938676691055298, - "epoch": 0.5749717714217516, - "grad_norm": 8.655381202697754, - "learning_rate": 3.3363421993207896e-06, - "loss": 0.4542, - "mean_token_accuracy": 0.8510200262069703, - "num_tokens": 223152032.0, - "step": 185480 - }, - { - "entropy": 1.879559238255024, - "epoch": 0.5750027705468013, - "grad_norm": 4.304847240447998, - "learning_rate": 3.336252264414478e-06, - "loss": 0.3947, - "mean_token_accuracy": 0.8723119482398033, - "num_tokens": 223163453.0, - "step": 185490 - }, - { - "entropy": 1.8480073928833007, - "epoch": 0.575033769671851, - "grad_norm": 7.835402011871338, - "learning_rate": 3.336162336780667e-06, - "loss": 0.4536, - "mean_token_accuracy": 0.8501526653766632, - "num_tokens": 223176639.0, - "step": 185500 - }, - { - "entropy": 1.928117537498474, - "epoch": 0.5750647687969007, - "grad_norm": 9.117416381835938, - "learning_rate": 3.3360724164183784e-06, - "loss": 0.4583, - "mean_token_accuracy": 0.8559558629989624, - "num_tokens": 223188517.0, - "step": 185510 - }, - { - "entropy": 1.8703112483024598, - "epoch": 0.5750957679219504, - "grad_norm": 8.261551856994629, - "learning_rate": 3.3359825033266312e-06, - "loss": 0.4208, - "mean_token_accuracy": 0.8563395589590073, - "num_tokens": 223200740.0, - "step": 185520 - }, - { - "entropy": 1.9589730098843574, - "epoch": 0.575126767047, - "grad_norm": 8.426461219787598, - "learning_rate": 3.3358925975044452e-06, - "loss": 0.4503, - "mean_token_accuracy": 0.8574065804481507, - "num_tokens": 223211860.0, - "step": 185530 - }, - { - "entropy": 1.8471322655677795, - "epoch": 0.5751577661720498, - "grad_norm": 7.796889305114746, - "learning_rate": 3.335802698950843e-06, - "loss": 0.4, - "mean_token_accuracy": 0.8555324643850326, - "num_tokens": 223224385.0, - "step": 185540 - }, - { - "entropy": 1.862960086762905, - "epoch": 0.5751887652970995, - "grad_norm": 8.271576881408691, - "learning_rate": 3.335712807664843e-06, - "loss": 0.3873, - "mean_token_accuracy": 0.8653026908636093, - "num_tokens": 223236896.0, - "step": 185550 - }, - { - "entropy": 1.9048970609903335, - "epoch": 0.5752197644221492, - "grad_norm": 3.4314792156219482, - "learning_rate": 3.335622923645467e-06, - "loss": 0.4319, - "mean_token_accuracy": 0.8632827535271644, - "num_tokens": 223248127.0, - "step": 185560 - }, - { - "entropy": 1.868567879498005, - "epoch": 0.5752507635471988, - "grad_norm": 3.734616279602051, - "learning_rate": 3.335533046891736e-06, - "loss": 0.4152, - "mean_token_accuracy": 0.8556340306997299, - "num_tokens": 223260369.0, - "step": 185570 - }, - { - "entropy": 1.8998666673898696, - "epoch": 0.5752817626722486, - "grad_norm": 7.988044261932373, - "learning_rate": 3.3354431774026707e-06, - "loss": 0.4166, - "mean_token_accuracy": 0.8611581310629844, - "num_tokens": 223272457.0, - "step": 185580 - }, - { - "entropy": 1.898111554980278, - "epoch": 0.5753127617972983, - "grad_norm": 7.35044527053833, - "learning_rate": 3.3353533151772933e-06, - "loss": 0.4322, - "mean_token_accuracy": 0.8583281725645066, - "num_tokens": 223284354.0, - "step": 185590 - }, - { - "entropy": 1.9633333265781403, - "epoch": 0.575343760922348, - "grad_norm": 9.622851371765137, - "learning_rate": 3.3352634602146243e-06, - "loss": 0.4728, - "mean_token_accuracy": 0.8559970110654831, - "num_tokens": 223295453.0, - "step": 185600 - }, - { - "entropy": 1.9035291761159896, - "epoch": 0.5753747600473976, - "grad_norm": 7.453666687011719, - "learning_rate": 3.3351736125136876e-06, - "loss": 0.4294, - "mean_token_accuracy": 0.8548563599586487, - "num_tokens": 223307635.0, - "step": 185610 - }, - { - "entropy": 1.9105853006243705, - "epoch": 0.5754057591724474, - "grad_norm": 7.252157688140869, - "learning_rate": 3.3350837720735026e-06, - "loss": 0.4334, - "mean_token_accuracy": 0.8604837030172348, - "num_tokens": 223319421.0, - "step": 185620 - }, - { - "entropy": 1.9059565871953965, - "epoch": 0.5754367582974971, - "grad_norm": 9.886700630187988, - "learning_rate": 3.3349939388930917e-06, - "loss": 0.4179, - "mean_token_accuracy": 0.8626177817583084, - "num_tokens": 223331495.0, - "step": 185630 - }, - { - "entropy": 1.8059578329324721, - "epoch": 0.5754677574225467, - "grad_norm": 7.218722343444824, - "learning_rate": 3.3349041129714787e-06, - "loss": 0.3851, - "mean_token_accuracy": 0.8683707162737846, - "num_tokens": 223344530.0, - "step": 185640 - }, - { - "entropy": 1.9087028831243515, - "epoch": 0.5754987565475964, - "grad_norm": 8.074930191040039, - "learning_rate": 3.334814294307686e-06, - "loss": 0.419, - "mean_token_accuracy": 0.8641566768288612, - "num_tokens": 223355905.0, - "step": 185650 - }, - { - "entropy": 1.9034427180886269, - "epoch": 0.5755297556726462, - "grad_norm": 4.149694919586182, - "learning_rate": 3.3347244829007354e-06, - "loss": 0.4368, - "mean_token_accuracy": 0.8565534576773643, - "num_tokens": 223367855.0, - "step": 185660 - }, - { - "entropy": 1.843117931485176, - "epoch": 0.5755607547976959, - "grad_norm": 8.624125480651855, - "learning_rate": 3.3346346787496496e-06, - "loss": 0.4031, - "mean_token_accuracy": 0.8669708013534546, - "num_tokens": 223379487.0, - "step": 185670 - }, - { - "entropy": 1.9281734496355056, - "epoch": 0.5755917539227455, - "grad_norm": 7.270875453948975, - "learning_rate": 3.334544881853452e-06, - "loss": 0.4725, - "mean_token_accuracy": 0.8511006891727447, - "num_tokens": 223390086.0, - "step": 185680 - }, - { - "entropy": 1.8094001136720181, - "epoch": 0.5756227530477952, - "grad_norm": 10.744769096374512, - "learning_rate": 3.3344550922111663e-06, - "loss": 0.3938, - "mean_token_accuracy": 0.8723492413759232, - "num_tokens": 223403380.0, - "step": 185690 - }, - { - "entropy": 1.7403853356838226, - "epoch": 0.5756537521728449, - "grad_norm": 3.8312265872955322, - "learning_rate": 3.3343653098218156e-06, - "loss": 0.3603, - "mean_token_accuracy": 0.8727297469973564, - "num_tokens": 223416987.0, - "step": 185700 - }, - { - "entropy": 1.9369304656982422, - "epoch": 0.5756847512978946, - "grad_norm": 8.717244148254395, - "learning_rate": 3.334275534684423e-06, - "loss": 0.4751, - "mean_token_accuracy": 0.85114157050848, - "num_tokens": 223428740.0, - "step": 185710 - }, - { - "entropy": 1.8861325442790986, - "epoch": 0.5757157504229443, - "grad_norm": 3.7332777976989746, - "learning_rate": 3.334185766798014e-06, - "loss": 0.4337, - "mean_token_accuracy": 0.8636431515216827, - "num_tokens": 223441017.0, - "step": 185720 - }, - { - "entropy": 1.9709544464945794, - "epoch": 0.575746749547994, - "grad_norm": 9.138751983642578, - "learning_rate": 3.33409600616161e-06, - "loss": 0.4946, - "mean_token_accuracy": 0.8416403874754905, - "num_tokens": 223452264.0, - "step": 185730 - }, - { - "entropy": 1.884700782597065, - "epoch": 0.5757777486730437, - "grad_norm": 7.390668869018555, - "learning_rate": 3.334006252774237e-06, - "loss": 0.4169, - "mean_token_accuracy": 0.855260145664215, - "num_tokens": 223464737.0, - "step": 185740 - }, - { - "entropy": 1.8476317539811133, - "epoch": 0.5758087477980934, - "grad_norm": 7.85823917388916, - "learning_rate": 3.3339165066349184e-06, - "loss": 0.441, - "mean_token_accuracy": 0.858846141397953, - "num_tokens": 223477825.0, - "step": 185750 - }, - { - "entropy": 1.84466263204813, - "epoch": 0.5758397469231431, - "grad_norm": 3.960548162460327, - "learning_rate": 3.333826767742679e-06, - "loss": 0.4561, - "mean_token_accuracy": 0.8526906743645668, - "num_tokens": 223490606.0, - "step": 185760 - }, - { - "entropy": 1.8592259749770164, - "epoch": 0.5758707460481928, - "grad_norm": 4.318323612213135, - "learning_rate": 3.3337370360965444e-06, - "loss": 0.4697, - "mean_token_accuracy": 0.8533689677715302, - "num_tokens": 223503065.0, - "step": 185770 - }, - { - "entropy": 1.8432864606380464, - "epoch": 0.5759017451732424, - "grad_norm": 8.023193359375, - "learning_rate": 3.333647311695538e-06, - "loss": 0.4007, - "mean_token_accuracy": 0.8605846777558327, - "num_tokens": 223515498.0, - "step": 185780 - }, - { - "entropy": 1.8691019058227538, - "epoch": 0.5759327442982922, - "grad_norm": 4.231311321258545, - "learning_rate": 3.333557594538686e-06, - "loss": 0.4092, - "mean_token_accuracy": 0.85765770226717, - "num_tokens": 223528474.0, - "step": 185790 - }, - { - "entropy": 1.9564666375517845, - "epoch": 0.5759637434233419, - "grad_norm": 8.478957176208496, - "learning_rate": 3.3334678846250137e-06, - "loss": 0.4562, - "mean_token_accuracy": 0.8489108413457871, - "num_tokens": 223539675.0, - "step": 185800 - }, - { - "entropy": 1.8829040467739104, - "epoch": 0.5759947425483916, - "grad_norm": 7.5195841789245605, - "learning_rate": 3.333378181953545e-06, - "loss": 0.3945, - "mean_token_accuracy": 0.8718097507953644, - "num_tokens": 223551556.0, - "step": 185810 - }, - { - "entropy": 1.816511270403862, - "epoch": 0.5760257416734412, - "grad_norm": 9.738638877868652, - "learning_rate": 3.333288486523308e-06, - "loss": 0.4132, - "mean_token_accuracy": 0.8638149693608284, - "num_tokens": 223564923.0, - "step": 185820 - }, - { - "entropy": 1.911141762137413, - "epoch": 0.576056740798491, - "grad_norm": 8.631954193115234, - "learning_rate": 3.333198798333326e-06, - "loss": 0.4531, - "mean_token_accuracy": 0.8467279374599457, - "num_tokens": 223576640.0, - "step": 185830 - }, - { - "entropy": 1.8985722064971924, - "epoch": 0.5760877399235407, - "grad_norm": 14.50521469116211, - "learning_rate": 3.3331091173826262e-06, - "loss": 0.4624, - "mean_token_accuracy": 0.8531594723463058, - "num_tokens": 223588249.0, - "step": 185840 - }, - { - "entropy": 1.9046927765011787, - "epoch": 0.5761187390485903, - "grad_norm": 7.594962120056152, - "learning_rate": 3.333019443670235e-06, - "loss": 0.4528, - "mean_token_accuracy": 0.8494858413934707, - "num_tokens": 223599798.0, - "step": 185850 - }, - { - "entropy": 1.8822232261300087, - "epoch": 0.57614973817364, - "grad_norm": 7.868729114532471, - "learning_rate": 3.332929777195178e-06, - "loss": 0.4241, - "mean_token_accuracy": 0.8659264981746674, - "num_tokens": 223611745.0, - "step": 185860 - }, - { - "entropy": 1.902225561439991, - "epoch": 0.5761807372986898, - "grad_norm": 9.798776626586914, - "learning_rate": 3.332840117956483e-06, - "loss": 0.4679, - "mean_token_accuracy": 0.8460187584161758, - "num_tokens": 223623156.0, - "step": 185870 - }, - { - "entropy": 1.8417491719126702, - "epoch": 0.5762117364237395, - "grad_norm": 9.396997451782227, - "learning_rate": 3.332750465953175e-06, - "loss": 0.4264, - "mean_token_accuracy": 0.8622669294476509, - "num_tokens": 223636123.0, - "step": 185880 - }, - { - "entropy": 1.881843839585781, - "epoch": 0.5762427355487891, - "grad_norm": 3.9266364574432373, - "learning_rate": 3.3326608211842826e-06, - "loss": 0.4262, - "mean_token_accuracy": 0.8624861449003219, - "num_tokens": 223647753.0, - "step": 185890 - }, - { - "entropy": 1.8574804991483689, - "epoch": 0.5762737346738388, - "grad_norm": 8.535648345947266, - "learning_rate": 3.3325711836488317e-06, - "loss": 0.4801, - "mean_token_accuracy": 0.8512572422623634, - "num_tokens": 223660418.0, - "step": 185900 - }, - { - "entropy": 1.9249091818928719, - "epoch": 0.5763047337988886, - "grad_norm": 8.348278999328613, - "learning_rate": 3.33248155334585e-06, - "loss": 0.468, - "mean_token_accuracy": 0.8501077264547348, - "num_tokens": 223672031.0, - "step": 185910 - }, - { - "entropy": 1.9589193403720855, - "epoch": 0.5763357329239382, - "grad_norm": 6.652914524078369, - "learning_rate": 3.332391930274365e-06, - "loss": 0.4829, - "mean_token_accuracy": 0.8530449777841568, - "num_tokens": 223682697.0, - "step": 185920 - }, - { - "entropy": 1.9307951003313064, - "epoch": 0.5763667320489879, - "grad_norm": 9.178523063659668, - "learning_rate": 3.3323023144334043e-06, - "loss": 0.4709, - "mean_token_accuracy": 0.8498701199889183, - "num_tokens": 223693854.0, - "step": 185930 - }, - { - "entropy": 1.8847261801362039, - "epoch": 0.5763977311740376, - "grad_norm": 6.794214248657227, - "learning_rate": 3.3322127058219962e-06, - "loss": 0.424, - "mean_token_accuracy": 0.8585624843835831, - "num_tokens": 223705949.0, - "step": 185940 - }, - { - "entropy": 1.8827034577727317, - "epoch": 0.5764287302990873, - "grad_norm": 8.527931213378906, - "learning_rate": 3.3321231044391673e-06, - "loss": 0.4139, - "mean_token_accuracy": 0.8589151486754417, - "num_tokens": 223717648.0, - "step": 185950 - }, - { - "entropy": 1.9328924477100373, - "epoch": 0.576459729424137, - "grad_norm": 7.280888080596924, - "learning_rate": 3.332033510283947e-06, - "loss": 0.4557, - "mean_token_accuracy": 0.8496242374181747, - "num_tokens": 223728682.0, - "step": 185960 - }, - { - "entropy": 1.899811689555645, - "epoch": 0.5764907285491867, - "grad_norm": 8.402411460876465, - "learning_rate": 3.331943923355363e-06, - "loss": 0.4406, - "mean_token_accuracy": 0.8634314224123955, - "num_tokens": 223740221.0, - "step": 185970 - }, - { - "entropy": 1.8626563012599946, - "epoch": 0.5765217276742364, - "grad_norm": 7.133705139160156, - "learning_rate": 3.3318543436524452e-06, - "loss": 0.4281, - "mean_token_accuracy": 0.8618907809257508, - "num_tokens": 223752470.0, - "step": 185980 - }, - { - "entropy": 1.8463318169116973, - "epoch": 0.576552726799286, - "grad_norm": 8.737984657287598, - "learning_rate": 3.3317647711742214e-06, - "loss": 0.3922, - "mean_token_accuracy": 0.8712903305888176, - "num_tokens": 223764746.0, - "step": 185990 - }, - { - "entropy": 1.8844687968492508, - "epoch": 0.5765837259243358, - "grad_norm": 7.062756538391113, - "learning_rate": 3.3316752059197193e-06, - "loss": 0.4376, - "mean_token_accuracy": 0.8577682986855507, - "num_tokens": 223776938.0, - "step": 186000 - }, - { - "entropy": 1.9419248923659325, - "epoch": 0.5766147250493855, - "grad_norm": 8.24322509765625, - "learning_rate": 3.3315856478879698e-06, - "loss": 0.4517, - "mean_token_accuracy": 0.8601583212614059, - "num_tokens": 223788522.0, - "step": 186010 - }, - { - "entropy": 1.9109676256775856, - "epoch": 0.5766457241744352, - "grad_norm": 8.720685005187988, - "learning_rate": 3.331496097078002e-06, - "loss": 0.4521, - "mean_token_accuracy": 0.8536943554878235, - "num_tokens": 223799940.0, - "step": 186020 - }, - { - "entropy": 1.9459070816636086, - "epoch": 0.5766767232994848, - "grad_norm": 8.089960098266602, - "learning_rate": 3.3314065534888447e-06, - "loss": 0.4819, - "mean_token_accuracy": 0.8517192706465722, - "num_tokens": 223811983.0, - "step": 186030 - }, - { - "entropy": 1.9221731379628182, - "epoch": 0.5767077224245346, - "grad_norm": 8.933829307556152, - "learning_rate": 3.3313170171195273e-06, - "loss": 0.459, - "mean_token_accuracy": 0.8519295886158943, - "num_tokens": 223823283.0, - "step": 186040 - }, - { - "entropy": 1.9014252200722694, - "epoch": 0.5767387215495843, - "grad_norm": 8.663115501403809, - "learning_rate": 3.3312274879690805e-06, - "loss": 0.4275, - "mean_token_accuracy": 0.8544277712702751, - "num_tokens": 223834688.0, - "step": 186050 - }, - { - "entropy": 1.9084532499313354, - "epoch": 0.576769720674634, - "grad_norm": 7.686354160308838, - "learning_rate": 3.331137966036534e-06, - "loss": 0.4747, - "mean_token_accuracy": 0.8504864275455475, - "num_tokens": 223846506.0, - "step": 186060 - }, - { - "entropy": 1.9267695754766465, - "epoch": 0.5768007197996836, - "grad_norm": 8.076788902282715, - "learning_rate": 3.3310484513209175e-06, - "loss": 0.5041, - "mean_token_accuracy": 0.844456459581852, - "num_tokens": 223858169.0, - "step": 186070 - }, - { - "entropy": 2.0005256950855257, - "epoch": 0.5768317189247334, - "grad_norm": 7.602105617523193, - "learning_rate": 3.3309589438212626e-06, - "loss": 0.498, - "mean_token_accuracy": 0.8492702186107636, - "num_tokens": 223868879.0, - "step": 186080 - }, - { - "entropy": 1.805472445487976, - "epoch": 0.5768627180497831, - "grad_norm": 7.9939284324646, - "learning_rate": 3.3308694435365983e-06, - "loss": 0.3894, - "mean_token_accuracy": 0.867642617225647, - "num_tokens": 223881970.0, - "step": 186090 - }, - { - "entropy": 1.8925477690994739, - "epoch": 0.5768937171748327, - "grad_norm": 4.912389755249023, - "learning_rate": 3.3307799504659565e-06, - "loss": 0.4159, - "mean_token_accuracy": 0.8653697654604912, - "num_tokens": 223894071.0, - "step": 186100 - }, - { - "entropy": 1.879456302523613, - "epoch": 0.5769247162998824, - "grad_norm": 9.695762634277344, - "learning_rate": 3.330690464608368e-06, - "loss": 0.4224, - "mean_token_accuracy": 0.8551273748278618, - "num_tokens": 223906457.0, - "step": 186110 - }, - { - "entropy": 1.8417584151029587, - "epoch": 0.5769557154249322, - "grad_norm": 3.9739835262298584, - "learning_rate": 3.3306009859628634e-06, - "loss": 0.4145, - "mean_token_accuracy": 0.8579704150557518, - "num_tokens": 223918979.0, - "step": 186120 - }, - { - "entropy": 1.846211352944374, - "epoch": 0.5769867145499818, - "grad_norm": 3.5446488857269287, - "learning_rate": 3.3305115145284746e-06, - "loss": 0.4218, - "mean_token_accuracy": 0.8604969620704651, - "num_tokens": 223932341.0, - "step": 186130 - }, - { - "entropy": 1.8327767252922058, - "epoch": 0.5770177136750315, - "grad_norm": 8.83708667755127, - "learning_rate": 3.3304220503042323e-06, - "loss": 0.3859, - "mean_token_accuracy": 0.8605792403221131, - "num_tokens": 223945064.0, - "step": 186140 - }, - { - "entropy": 1.9210361555218696, - "epoch": 0.5770487128000812, - "grad_norm": 7.3829803466796875, - "learning_rate": 3.3303325932891682e-06, - "loss": 0.4376, - "mean_token_accuracy": 0.8643152773380279, - "num_tokens": 223956532.0, - "step": 186150 - }, - { - "entropy": 1.9436809882521628, - "epoch": 0.577079711925131, - "grad_norm": 11.1614990234375, - "learning_rate": 3.330243143482315e-06, - "loss": 0.4786, - "mean_token_accuracy": 0.8519028946757317, - "num_tokens": 223967817.0, - "step": 186160 - }, - { - "entropy": 1.8733919098973275, - "epoch": 0.5771107110501806, - "grad_norm": 8.432992935180664, - "learning_rate": 3.330153700882705e-06, - "loss": 0.4406, - "mean_token_accuracy": 0.8607887044548989, - "num_tokens": 223980144.0, - "step": 186170 - }, - { - "entropy": 1.9079894140362739, - "epoch": 0.5771417101752303, - "grad_norm": 9.000441551208496, - "learning_rate": 3.3300642654893694e-06, - "loss": 0.4601, - "mean_token_accuracy": 0.8596588268876075, - "num_tokens": 223991651.0, - "step": 186180 - }, - { - "entropy": 1.804750144481659, - "epoch": 0.57717270930028, - "grad_norm": 3.7925076484680176, - "learning_rate": 3.32997483730134e-06, - "loss": 0.4498, - "mean_token_accuracy": 0.8558259546756745, - "num_tokens": 224005369.0, - "step": 186190 - }, - { - "entropy": 1.844835962355137, - "epoch": 0.5772037084253296, - "grad_norm": 4.014327049255371, - "learning_rate": 3.329885416317651e-06, - "loss": 0.3784, - "mean_token_accuracy": 0.8642392948269844, - "num_tokens": 224017679.0, - "step": 186200 - }, - { - "entropy": 1.893115258216858, - "epoch": 0.5772347075503794, - "grad_norm": 6.389863014221191, - "learning_rate": 3.329796002537334e-06, - "loss": 0.4206, - "mean_token_accuracy": 0.8519633457064628, - "num_tokens": 224030018.0, - "step": 186210 - }, - { - "entropy": 1.8942801833152771, - "epoch": 0.5772657066754291, - "grad_norm": 3.37688946723938, - "learning_rate": 3.3297065959594223e-06, - "loss": 0.423, - "mean_token_accuracy": 0.8598490327596664, - "num_tokens": 224041265.0, - "step": 186220 - }, - { - "entropy": 1.9119876235723496, - "epoch": 0.5772967058004788, - "grad_norm": 8.3579740524292, - "learning_rate": 3.329617196582949e-06, - "loss": 0.445, - "mean_token_accuracy": 0.8617330178618431, - "num_tokens": 224052454.0, - "step": 186230 - }, - { - "entropy": 1.8348846569657327, - "epoch": 0.5773277049255284, - "grad_norm": 9.87274169921875, - "learning_rate": 3.3295278044069474e-06, - "loss": 0.4286, - "mean_token_accuracy": 0.8611876145005226, - "num_tokens": 224064686.0, - "step": 186240 - }, - { - "entropy": 1.8601296663284301, - "epoch": 0.5773587040505782, - "grad_norm": 4.845494270324707, - "learning_rate": 3.3294384194304515e-06, - "loss": 0.3801, - "mean_token_accuracy": 0.8686206489801407, - "num_tokens": 224077062.0, - "step": 186250 - }, - { - "entropy": 1.7862082317471504, - "epoch": 0.5773897031756279, - "grad_norm": 3.842564105987549, - "learning_rate": 3.329349041652493e-06, - "loss": 0.411, - "mean_token_accuracy": 0.8551653146743774, - "num_tokens": 224090360.0, - "step": 186260 - }, - { - "entropy": 1.9282741218805313, - "epoch": 0.5774207023006775, - "grad_norm": 9.02146053314209, - "learning_rate": 3.3292596710721083e-06, - "loss": 0.4742, - "mean_token_accuracy": 0.8523161500692368, - "num_tokens": 224101225.0, - "step": 186270 - }, - { - "entropy": 1.8909048795700074, - "epoch": 0.5774517014257272, - "grad_norm": 9.213672637939453, - "learning_rate": 3.3291703076883297e-06, - "loss": 0.4712, - "mean_token_accuracy": 0.8545626923441887, - "num_tokens": 224112311.0, - "step": 186280 - }, - { - "entropy": 1.8705749198794366, - "epoch": 0.577482700550777, - "grad_norm": 9.169853210449219, - "learning_rate": 3.3290809515001925e-06, - "loss": 0.4121, - "mean_token_accuracy": 0.8574076175689698, - "num_tokens": 224124770.0, - "step": 186290 - }, - { - "entropy": 1.8199501633644104, - "epoch": 0.5775136996758267, - "grad_norm": 2.57895565032959, - "learning_rate": 3.3289916025067307e-06, - "loss": 0.4444, - "mean_token_accuracy": 0.8515478879213333, - "num_tokens": 224137410.0, - "step": 186300 - }, - { - "entropy": 1.8437852144241333, - "epoch": 0.5775446988008763, - "grad_norm": 9.475756645202637, - "learning_rate": 3.3289022607069788e-06, - "loss": 0.4212, - "mean_token_accuracy": 0.8646301284432412, - "num_tokens": 224149672.0, - "step": 186310 - }, - { - "entropy": 1.8889194279909134, - "epoch": 0.577575697925926, - "grad_norm": 8.501579284667969, - "learning_rate": 3.3288129260999714e-06, - "loss": 0.4847, - "mean_token_accuracy": 0.8534078598022461, - "num_tokens": 224160181.0, - "step": 186320 - }, - { - "entropy": 1.8528265476226806, - "epoch": 0.5776066970509758, - "grad_norm": 9.43176555633545, - "learning_rate": 3.3287235986847425e-06, - "loss": 0.4503, - "mean_token_accuracy": 0.8597616523504257, - "num_tokens": 224172466.0, - "step": 186330 - }, - { - "entropy": 1.8917500972747803, - "epoch": 0.5776376961760255, - "grad_norm": 6.551401615142822, - "learning_rate": 3.32863427846033e-06, - "loss": 0.4177, - "mean_token_accuracy": 0.8578035309910774, - "num_tokens": 224184378.0, - "step": 186340 - }, - { - "entropy": 1.9086822256445886, - "epoch": 0.5776686953010751, - "grad_norm": 9.394867897033691, - "learning_rate": 3.3285449654257657e-06, - "loss": 0.4568, - "mean_token_accuracy": 0.8590981259942054, - "num_tokens": 224195741.0, - "step": 186350 - }, - { - "entropy": 1.813156895339489, - "epoch": 0.5776996944261248, - "grad_norm": 9.021100044250488, - "learning_rate": 3.3284556595800877e-06, - "loss": 0.3968, - "mean_token_accuracy": 0.8551269844174385, - "num_tokens": 224208130.0, - "step": 186360 - }, - { - "entropy": 1.9111459612846375, - "epoch": 0.5777306935511746, - "grad_norm": 7.168278217315674, - "learning_rate": 3.3283663609223305e-06, - "loss": 0.446, - "mean_token_accuracy": 0.8628562480211258, - "num_tokens": 224219268.0, - "step": 186370 - }, - { - "entropy": 1.88259015083313, - "epoch": 0.5777616926762242, - "grad_norm": 7.674609661102295, - "learning_rate": 3.3282770694515305e-06, - "loss": 0.4337, - "mean_token_accuracy": 0.8598455607891082, - "num_tokens": 224231165.0, - "step": 186380 - }, - { - "entropy": 1.9493561804294586, - "epoch": 0.5777926918012739, - "grad_norm": 8.67314338684082, - "learning_rate": 3.3281877851667234e-06, - "loss": 0.4838, - "mean_token_accuracy": 0.8493429899215699, - "num_tokens": 224242109.0, - "step": 186390 - }, - { - "entropy": 1.8610494658350945, - "epoch": 0.5778236909263236, - "grad_norm": 8.769448280334473, - "learning_rate": 3.328098508066945e-06, - "loss": 0.4097, - "mean_token_accuracy": 0.8556885406374931, - "num_tokens": 224254564.0, - "step": 186400 - }, - { - "entropy": 1.8939135536551475, - "epoch": 0.5778546900513734, - "grad_norm": 8.891929626464844, - "learning_rate": 3.328009238151232e-06, - "loss": 0.4521, - "mean_token_accuracy": 0.8511065915226936, - "num_tokens": 224266367.0, - "step": 186410 - }, - { - "entropy": 1.8918179273605347, - "epoch": 0.577885689176423, - "grad_norm": 7.109013557434082, - "learning_rate": 3.3279199754186207e-06, - "loss": 0.4373, - "mean_token_accuracy": 0.8539350911974907, - "num_tokens": 224277604.0, - "step": 186420 - }, - { - "entropy": 1.8385544650256633, - "epoch": 0.5779166883014727, - "grad_norm": 3.675187110900879, - "learning_rate": 3.3278307198681493e-06, - "loss": 0.3938, - "mean_token_accuracy": 0.8628764376044273, - "num_tokens": 224290724.0, - "step": 186430 - }, - { - "entropy": 1.8970770210027694, - "epoch": 0.5779476874265224, - "grad_norm": 7.674618721008301, - "learning_rate": 3.3277414714988525e-06, - "loss": 0.4101, - "mean_token_accuracy": 0.8650184333324432, - "num_tokens": 224302151.0, - "step": 186440 - }, - { - "entropy": 1.7818903237581254, - "epoch": 0.577978686551572, - "grad_norm": 8.283598899841309, - "learning_rate": 3.3276522303097693e-06, - "loss": 0.4221, - "mean_token_accuracy": 0.8627783268690109, - "num_tokens": 224314955.0, - "step": 186450 - }, - { - "entropy": 1.7635787680745125, - "epoch": 0.5780096856766218, - "grad_norm": 3.638246774673462, - "learning_rate": 3.327562996299935e-06, - "loss": 0.3648, - "mean_token_accuracy": 0.8763737082481384, - "num_tokens": 224328414.0, - "step": 186460 - }, - { - "entropy": 1.9035615831613542, - "epoch": 0.5780406848016715, - "grad_norm": 8.13401985168457, - "learning_rate": 3.327473769468389e-06, - "loss": 0.4871, - "mean_token_accuracy": 0.8484982311725616, - "num_tokens": 224339900.0, - "step": 186470 - }, - { - "entropy": 1.8948525011539459, - "epoch": 0.5780716839267211, - "grad_norm": 9.15529727935791, - "learning_rate": 3.3273845498141684e-06, - "loss": 0.4518, - "mean_token_accuracy": 0.8480350777506829, - "num_tokens": 224351827.0, - "step": 186480 - }, - { - "entropy": 1.8445970296859742, - "epoch": 0.5781026830517708, - "grad_norm": 7.669584274291992, - "learning_rate": 3.3272953373363104e-06, - "loss": 0.4031, - "mean_token_accuracy": 0.8681982949376106, - "num_tokens": 224363747.0, - "step": 186490 - }, - { - "entropy": 1.8775669053196906, - "epoch": 0.5781336821768206, - "grad_norm": 10.55215835571289, - "learning_rate": 3.3272061320338533e-06, - "loss": 0.4386, - "mean_token_accuracy": 0.8503807350993157, - "num_tokens": 224375666.0, - "step": 186500 - }, - { - "entropy": 1.8514750853180886, - "epoch": 0.5781646813018703, - "grad_norm": 9.644965171813965, - "learning_rate": 3.327116933905835e-06, - "loss": 0.4161, - "mean_token_accuracy": 0.850967912375927, - "num_tokens": 224388240.0, - "step": 186510 - }, - { - "entropy": 1.8742257088422776, - "epoch": 0.5781956804269199, - "grad_norm": 8.107645988464355, - "learning_rate": 3.3270277429512948e-06, - "loss": 0.4516, - "mean_token_accuracy": 0.8402368798851967, - "num_tokens": 224400452.0, - "step": 186520 - }, - { - "entropy": 1.9575652033090591, - "epoch": 0.5782266795519696, - "grad_norm": 8.81623649597168, - "learning_rate": 3.3269385591692703e-06, - "loss": 0.4783, - "mean_token_accuracy": 0.8493165135383606, - "num_tokens": 224411160.0, - "step": 186530 - }, - { - "entropy": 1.9046847075223923, - "epoch": 0.5782576786770194, - "grad_norm": 7.765852451324463, - "learning_rate": 3.3268493825588014e-06, - "loss": 0.4726, - "mean_token_accuracy": 0.8459122270345688, - "num_tokens": 224422669.0, - "step": 186540 - }, - { - "entropy": 1.8750623926520347, - "epoch": 0.578288677802069, - "grad_norm": 7.867640495300293, - "learning_rate": 3.326760213118926e-06, - "loss": 0.4073, - "mean_token_accuracy": 0.86184511333704, - "num_tokens": 224434513.0, - "step": 186550 - }, - { - "entropy": 1.9057077065110206, - "epoch": 0.5783196769271187, - "grad_norm": 7.460243225097656, - "learning_rate": 3.3266710508486826e-06, - "loss": 0.4872, - "mean_token_accuracy": 0.8422660037875176, - "num_tokens": 224446264.0, - "step": 186560 - }, - { - "entropy": 1.8610460609197617, - "epoch": 0.5783506760521684, - "grad_norm": 3.488306760787964, - "learning_rate": 3.326581895747112e-06, - "loss": 0.4157, - "mean_token_accuracy": 0.8676585868000984, - "num_tokens": 224457865.0, - "step": 186570 - }, - { - "entropy": 1.8878431126475335, - "epoch": 0.5783816751772182, - "grad_norm": 9.883951187133789, - "learning_rate": 3.3264927478132523e-06, - "loss": 0.464, - "mean_token_accuracy": 0.8584196507930756, - "num_tokens": 224469459.0, - "step": 186580 - }, - { - "entropy": 1.8954463630914689, - "epoch": 0.5784126743022678, - "grad_norm": 8.813650131225586, - "learning_rate": 3.3264036070461443e-06, - "loss": 0.4468, - "mean_token_accuracy": 0.8573021531105042, - "num_tokens": 224481227.0, - "step": 186590 - }, - { - "entropy": 1.8967059776186943, - "epoch": 0.5784436734273175, - "grad_norm": 8.553400039672852, - "learning_rate": 3.326314473444827e-06, - "loss": 0.4707, - "mean_token_accuracy": 0.8501126199960709, - "num_tokens": 224492656.0, - "step": 186600 - }, - { - "entropy": 1.771684955060482, - "epoch": 0.5784746725523672, - "grad_norm": 8.498135566711426, - "learning_rate": 3.32622534700834e-06, - "loss": 0.4226, - "mean_token_accuracy": 0.8598705172538758, - "num_tokens": 224506650.0, - "step": 186610 - }, - { - "entropy": 1.8974428325891495, - "epoch": 0.578505671677417, - "grad_norm": 7.043177604675293, - "learning_rate": 3.3261362277357252e-06, - "loss": 0.5038, - "mean_token_accuracy": 0.8514397636055946, - "num_tokens": 224518531.0, - "step": 186620 - }, - { - "entropy": 1.8689001992344856, - "epoch": 0.5785366708024666, - "grad_norm": 8.190739631652832, - "learning_rate": 3.326047115626021e-06, - "loss": 0.417, - "mean_token_accuracy": 0.8575657159090042, - "num_tokens": 224530692.0, - "step": 186630 - }, - { - "entropy": 1.938750332593918, - "epoch": 0.5785676699275163, - "grad_norm": 8.82866096496582, - "learning_rate": 3.3259580106782683e-06, - "loss": 0.4586, - "mean_token_accuracy": 0.8508216217160225, - "num_tokens": 224542421.0, - "step": 186640 - }, - { - "entropy": 1.883426919579506, - "epoch": 0.578598669052566, - "grad_norm": 6.959622859954834, - "learning_rate": 3.32586891289151e-06, - "loss": 0.4698, - "mean_token_accuracy": 0.8552673414349556, - "num_tokens": 224555049.0, - "step": 186650 - }, - { - "entropy": 1.8930795446038247, - "epoch": 0.5786296681776157, - "grad_norm": 8.692614555358887, - "learning_rate": 3.325779822264784e-06, - "loss": 0.4286, - "mean_token_accuracy": 0.8654409319162368, - "num_tokens": 224566727.0, - "step": 186660 - }, - { - "entropy": 1.8495046302676201, - "epoch": 0.5786606673026654, - "grad_norm": 3.965505599975586, - "learning_rate": 3.325690738797133e-06, - "loss": 0.4155, - "mean_token_accuracy": 0.8580985084176064, - "num_tokens": 224579383.0, - "step": 186670 - }, - { - "entropy": 1.8006903648376464, - "epoch": 0.5786916664277151, - "grad_norm": 3.8050777912139893, - "learning_rate": 3.3256016624875973e-06, - "loss": 0.3696, - "mean_token_accuracy": 0.8640064790844917, - "num_tokens": 224592615.0, - "step": 186680 - }, - { - "entropy": 1.8454625502228736, - "epoch": 0.5787226655527647, - "grad_norm": 3.603572368621826, - "learning_rate": 3.3255125933352197e-06, - "loss": 0.3792, - "mean_token_accuracy": 0.8661387443542481, - "num_tokens": 224604638.0, - "step": 186690 - }, - { - "entropy": 1.8448929190635681, - "epoch": 0.5787536646778144, - "grad_norm": 9.511395454406738, - "learning_rate": 3.3254235313390405e-06, - "loss": 0.391, - "mean_token_accuracy": 0.8726136729121208, - "num_tokens": 224617135.0, - "step": 186700 - }, - { - "entropy": 1.831068354845047, - "epoch": 0.5787846638028642, - "grad_norm": 2.786999225616455, - "learning_rate": 3.3253344764981014e-06, - "loss": 0.4073, - "mean_token_accuracy": 0.8656001672148704, - "num_tokens": 224629636.0, - "step": 186710 - }, - { - "entropy": 1.9099891155958175, - "epoch": 0.5788156629279139, - "grad_norm": 8.675889015197754, - "learning_rate": 3.325245428811446e-06, - "loss": 0.447, - "mean_token_accuracy": 0.859323938190937, - "num_tokens": 224640785.0, - "step": 186720 - }, - { - "entropy": 1.820456326007843, - "epoch": 0.5788466620529635, - "grad_norm": 8.677525520324707, - "learning_rate": 3.3251563882781154e-06, - "loss": 0.4217, - "mean_token_accuracy": 0.8548235654830932, - "num_tokens": 224653264.0, - "step": 186730 - }, - { - "entropy": 1.8897985056042672, - "epoch": 0.5788776611780132, - "grad_norm": 6.519395351409912, - "learning_rate": 3.3250673548971507e-06, - "loss": 0.4324, - "mean_token_accuracy": 0.8648929744958878, - "num_tokens": 224664511.0, - "step": 186740 - }, - { - "entropy": 1.8283945478498935, - "epoch": 0.578908660303063, - "grad_norm": 6.973257064819336, - "learning_rate": 3.3249783286675967e-06, - "loss": 0.4249, - "mean_token_accuracy": 0.8637444868683815, - "num_tokens": 224677556.0, - "step": 186750 - }, - { - "entropy": 1.8844505622982979, - "epoch": 0.5789396594281127, - "grad_norm": 8.534669876098633, - "learning_rate": 3.324889309588494e-06, - "loss": 0.4326, - "mean_token_accuracy": 0.8586746722459793, - "num_tokens": 224689269.0, - "step": 186760 - }, - { - "entropy": 1.8787866935133934, - "epoch": 0.5789706585531623, - "grad_norm": 3.6595122814178467, - "learning_rate": 3.324800297658888e-06, - "loss": 0.3958, - "mean_token_accuracy": 0.870287548005581, - "num_tokens": 224700969.0, - "step": 186770 - }, - { - "entropy": 1.7502490743994712, - "epoch": 0.579001657678212, - "grad_norm": 8.145038604736328, - "learning_rate": 3.324711292877819e-06, - "loss": 0.355, - "mean_token_accuracy": 0.8686568707227706, - "num_tokens": 224714989.0, - "step": 186780 - }, - { - "entropy": 1.9025743126869201, - "epoch": 0.5790326568032618, - "grad_norm": 8.816189765930176, - "learning_rate": 3.3246222952443317e-06, - "loss": 0.4395, - "mean_token_accuracy": 0.860697540640831, - "num_tokens": 224726107.0, - "step": 186790 - }, - { - "entropy": 1.9007497653365135, - "epoch": 0.5790636559283114, - "grad_norm": 9.165550231933594, - "learning_rate": 3.3245333047574696e-06, - "loss": 0.4571, - "mean_token_accuracy": 0.8492210701107978, - "num_tokens": 224737335.0, - "step": 186800 - }, - { - "entropy": 1.853816030919552, - "epoch": 0.5790946550533611, - "grad_norm": 3.157280445098877, - "learning_rate": 3.3244443214162752e-06, - "loss": 0.4464, - "mean_token_accuracy": 0.8501716807484627, - "num_tokens": 224749158.0, - "step": 186810 - }, - { - "entropy": 1.8620413765311241, - "epoch": 0.5791256541784108, - "grad_norm": 7.5174431800842285, - "learning_rate": 3.3243553452197936e-06, - "loss": 0.4302, - "mean_token_accuracy": 0.8606794208288193, - "num_tokens": 224761123.0, - "step": 186820 - }, - { - "entropy": 1.8457257106900216, - "epoch": 0.5791566533034606, - "grad_norm": 9.554959297180176, - "learning_rate": 3.324266376167067e-06, - "loss": 0.4244, - "mean_token_accuracy": 0.8568149819970131, - "num_tokens": 224772955.0, - "step": 186830 - }, - { - "entropy": 1.9089529544115067, - "epoch": 0.5791876524285102, - "grad_norm": 8.996994972229004, - "learning_rate": 3.324177414257142e-06, - "loss": 0.4878, - "mean_token_accuracy": 0.8526898682117462, - "num_tokens": 224784360.0, - "step": 186840 - }, - { - "entropy": 1.894026516377926, - "epoch": 0.5792186515535599, - "grad_norm": 7.662275791168213, - "learning_rate": 3.324088459489061e-06, - "loss": 0.4329, - "mean_token_accuracy": 0.8504077926278114, - "num_tokens": 224795989.0, - "step": 186850 - }, - { - "entropy": 1.8165657207369805, - "epoch": 0.5792496506786096, - "grad_norm": 9.793351173400879, - "learning_rate": 3.323999511861869e-06, - "loss": 0.4487, - "mean_token_accuracy": 0.8550665840506554, - "num_tokens": 224808637.0, - "step": 186860 - }, - { - "entropy": 1.8741124719381332, - "epoch": 0.5792806498036593, - "grad_norm": 8.422378540039062, - "learning_rate": 3.3239105713746105e-06, - "loss": 0.6209, - "mean_token_accuracy": 0.827587154507637, - "num_tokens": 224821374.0, - "step": 186870 - }, - { - "entropy": 1.9334801375865935, - "epoch": 0.579311648928709, - "grad_norm": 7.4311747550964355, - "learning_rate": 3.3238216380263306e-06, - "loss": 0.4331, - "mean_token_accuracy": 0.8683941826224327, - "num_tokens": 224832001.0, - "step": 186880 - }, - { - "entropy": 1.8828293219208718, - "epoch": 0.5793426480537587, - "grad_norm": 8.103363037109375, - "learning_rate": 3.323732711816074e-06, - "loss": 0.4346, - "mean_token_accuracy": 0.8580566599965096, - "num_tokens": 224844029.0, - "step": 186890 - }, - { - "entropy": 1.927389144897461, - "epoch": 0.5793736471788083, - "grad_norm": 8.316423416137695, - "learning_rate": 3.323643792742886e-06, - "loss": 0.4576, - "mean_token_accuracy": 0.8571579784154892, - "num_tokens": 224855164.0, - "step": 186900 - }, - { - "entropy": 1.9288083717226983, - "epoch": 0.5794046463038581, - "grad_norm": 8.076248168945312, - "learning_rate": 3.323554880805812e-06, - "loss": 0.4577, - "mean_token_accuracy": 0.8487173110246659, - "num_tokens": 224866437.0, - "step": 186910 - }, - { - "entropy": 1.8607487827539444, - "epoch": 0.5794356454289078, - "grad_norm": 6.731062889099121, - "learning_rate": 3.3234659760038983e-06, - "loss": 0.4516, - "mean_token_accuracy": 0.8549078598618507, - "num_tokens": 224878689.0, - "step": 186920 - }, - { - "entropy": 1.8520071715116502, - "epoch": 0.5794666445539575, - "grad_norm": 8.76063346862793, - "learning_rate": 3.323377078336189e-06, - "loss": 0.4352, - "mean_token_accuracy": 0.8471438974142075, - "num_tokens": 224890831.0, - "step": 186930 - }, - { - "entropy": 1.750344455242157, - "epoch": 0.5794976436790071, - "grad_norm": 3.300847053527832, - "learning_rate": 3.3232881878017316e-06, - "loss": 0.3531, - "mean_token_accuracy": 0.8734717309474945, - "num_tokens": 224905046.0, - "step": 186940 - }, - { - "entropy": 1.9492957681417464, - "epoch": 0.5795286428040568, - "grad_norm": 7.751434326171875, - "learning_rate": 3.3231993043995707e-06, - "loss": 0.4894, - "mean_token_accuracy": 0.8514164552092552, - "num_tokens": 224916167.0, - "step": 186950 - }, - { - "entropy": 1.8545825704932213, - "epoch": 0.5795596419291066, - "grad_norm": 8.6134672164917, - "learning_rate": 3.3231104281287542e-06, - "loss": 0.3904, - "mean_token_accuracy": 0.8570887267589569, - "num_tokens": 224928011.0, - "step": 186960 - }, - { - "entropy": 1.801781241595745, - "epoch": 0.5795906410541563, - "grad_norm": 3.786421537399292, - "learning_rate": 3.3230215589883276e-06, - "loss": 0.3872, - "mean_token_accuracy": 0.8656805202364921, - "num_tokens": 224940817.0, - "step": 186970 - }, - { - "entropy": 1.9169122859835626, - "epoch": 0.5796216401792059, - "grad_norm": 8.907334327697754, - "learning_rate": 3.3229326969773367e-06, - "loss": 0.4766, - "mean_token_accuracy": 0.8506680697202682, - "num_tokens": 224952881.0, - "step": 186980 - }, - { - "entropy": 1.9773944050073624, - "epoch": 0.5796526393042556, - "grad_norm": 8.65813159942627, - "learning_rate": 3.3228438420948306e-06, - "loss": 0.5282, - "mean_token_accuracy": 0.8384165942668915, - "num_tokens": 224963431.0, - "step": 186990 - }, - { - "entropy": 1.8282001480460166, - "epoch": 0.5796836384293054, - "grad_norm": 7.797760963439941, - "learning_rate": 3.322754994339854e-06, - "loss": 0.4154, - "mean_token_accuracy": 0.8569248840212822, - "num_tokens": 224975554.0, - "step": 187000 - }, - { - "entropy": 1.9159265920519828, - "epoch": 0.579714637554355, - "grad_norm": 7.266661643981934, - "learning_rate": 3.322666153711455e-06, - "loss": 0.4856, - "mean_token_accuracy": 0.848967120051384, - "num_tokens": 224987114.0, - "step": 187010 - }, - { - "entropy": 1.8885586738586426, - "epoch": 0.5797456366794047, - "grad_norm": 6.6897358894348145, - "learning_rate": 3.3225773202086815e-06, - "loss": 0.449, - "mean_token_accuracy": 0.860285858809948, - "num_tokens": 224998810.0, - "step": 187020 - }, - { - "entropy": 1.9200876533985138, - "epoch": 0.5797766358044544, - "grad_norm": 8.429367065429688, - "learning_rate": 3.32248849383058e-06, - "loss": 0.4474, - "mean_token_accuracy": 0.8620867624878883, - "num_tokens": 225010448.0, - "step": 187030 - }, - { - "entropy": 1.8974786669015884, - "epoch": 0.5798076349295042, - "grad_norm": 10.145453453063965, - "learning_rate": 3.3223996745761976e-06, - "loss": 0.4706, - "mean_token_accuracy": 0.8538267195224762, - "num_tokens": 225022286.0, - "step": 187040 - }, - { - "entropy": 1.8508014142513276, - "epoch": 0.5798386340545538, - "grad_norm": 6.897191047668457, - "learning_rate": 3.3223108624445845e-06, - "loss": 0.3944, - "mean_token_accuracy": 0.8764521285891533, - "num_tokens": 225033829.0, - "step": 187050 - }, - { - "entropy": 1.9024966299533843, - "epoch": 0.5798696331796035, - "grad_norm": 9.509614944458008, - "learning_rate": 3.3222220574347875e-06, - "loss": 0.4701, - "mean_token_accuracy": 0.8539675638079643, - "num_tokens": 225044717.0, - "step": 187060 - }, - { - "entropy": 1.9286258906126021, - "epoch": 0.5799006323046532, - "grad_norm": 9.100786209106445, - "learning_rate": 3.322133259545854e-06, - "loss": 0.4526, - "mean_token_accuracy": 0.8484894841909408, - "num_tokens": 225055870.0, - "step": 187070 - }, - { - "entropy": 1.8572608321905135, - "epoch": 0.5799316314297029, - "grad_norm": 7.915788650512695, - "learning_rate": 3.3220444687768326e-06, - "loss": 0.3979, - "mean_token_accuracy": 0.859556196630001, - "num_tokens": 225067867.0, - "step": 187080 - }, - { - "entropy": 1.8662755504250526, - "epoch": 0.5799626305547526, - "grad_norm": 2.604182004928589, - "learning_rate": 3.321955685126773e-06, - "loss": 0.4504, - "mean_token_accuracy": 0.8469509720802307, - "num_tokens": 225080836.0, - "step": 187090 - }, - { - "entropy": 1.8107245221734047, - "epoch": 0.5799936296798023, - "grad_norm": 5.619945049285889, - "learning_rate": 3.321866908594724e-06, - "loss": 0.4282, - "mean_token_accuracy": 0.851016329228878, - "num_tokens": 225093899.0, - "step": 187100 - }, - { - "entropy": 1.8843757688999176, - "epoch": 0.580024628804852, - "grad_norm": 8.56844425201416, - "learning_rate": 3.3217781391797337e-06, - "loss": 0.4644, - "mean_token_accuracy": 0.8601377189159394, - "num_tokens": 225105531.0, - "step": 187110 - }, - { - "entropy": 1.914840179681778, - "epoch": 0.5800556279299017, - "grad_norm": 4.212116718292236, - "learning_rate": 3.321689376880851e-06, - "loss": 0.4418, - "mean_token_accuracy": 0.8556945145130157, - "num_tokens": 225116937.0, - "step": 187120 - }, - { - "entropy": 1.8552332192659378, - "epoch": 0.5800866270549514, - "grad_norm": 8.2677001953125, - "learning_rate": 3.321600621697126e-06, - "loss": 0.445, - "mean_token_accuracy": 0.8575369238853454, - "num_tokens": 225129995.0, - "step": 187130 - }, - { - "entropy": 1.7027757972478867, - "epoch": 0.5801176261800011, - "grad_norm": 7.780810832977295, - "learning_rate": 3.321511873627607e-06, - "loss": 0.2914, - "mean_token_accuracy": 0.8850486144423485, - "num_tokens": 225144239.0, - "step": 187140 - }, - { - "entropy": 1.8446943432092666, - "epoch": 0.5801486253050507, - "grad_norm": 8.228107452392578, - "learning_rate": 3.3214231326713453e-06, - "loss": 0.3869, - "mean_token_accuracy": 0.8636909976601601, - "num_tokens": 225156722.0, - "step": 187150 - }, - { - "entropy": 1.8933214277029038, - "epoch": 0.5801796244301005, - "grad_norm": 8.84110164642334, - "learning_rate": 3.3213343988273893e-06, - "loss": 0.4644, - "mean_token_accuracy": 0.8518497928977012, - "num_tokens": 225168460.0, - "step": 187160 - }, - { - "entropy": 1.7452817946672439, - "epoch": 0.5802106235551502, - "grad_norm": 8.720771789550781, - "learning_rate": 3.3212456720947895e-06, - "loss": 0.3704, - "mean_token_accuracy": 0.869410839676857, - "num_tokens": 225182491.0, - "step": 187170 - }, - { - "entropy": 1.957821214199066, - "epoch": 0.5802416226801999, - "grad_norm": 8.16481876373291, - "learning_rate": 3.3211569524725963e-06, - "loss": 0.5062, - "mean_token_accuracy": 0.8408224180340766, - "num_tokens": 225193214.0, - "step": 187180 - }, - { - "entropy": 1.813154575228691, - "epoch": 0.5802726218052495, - "grad_norm": 7.962581634521484, - "learning_rate": 3.32106823995986e-06, - "loss": 0.378, - "mean_token_accuracy": 0.8722536355257035, - "num_tokens": 225206240.0, - "step": 187190 - }, - { - "entropy": 1.970565813779831, - "epoch": 0.5803036209302992, - "grad_norm": 10.95875072479248, - "learning_rate": 3.3209795345556313e-06, - "loss": 0.4901, - "mean_token_accuracy": 0.8511197417974472, - "num_tokens": 225217327.0, - "step": 187200 - }, - { - "entropy": 1.8491960406303405, - "epoch": 0.580334620055349, - "grad_norm": 4.887589931488037, - "learning_rate": 3.3208908362589596e-06, - "loss": 0.3912, - "mean_token_accuracy": 0.861751139163971, - "num_tokens": 225230075.0, - "step": 187210 - }, - { - "entropy": 1.8215350538492203, - "epoch": 0.5803656191803986, - "grad_norm": 7.864952564239502, - "learning_rate": 3.320802145068898e-06, - "loss": 0.3837, - "mean_token_accuracy": 0.8659237533807754, - "num_tokens": 225243314.0, - "step": 187220 - }, - { - "entropy": 1.8638467997312547, - "epoch": 0.5803966183054483, - "grad_norm": 7.326906204223633, - "learning_rate": 3.320713460984496e-06, - "loss": 0.4286, - "mean_token_accuracy": 0.8706554800271988, - "num_tokens": 225255618.0, - "step": 187230 - }, - { - "entropy": 1.9212041601538659, - "epoch": 0.580427617430498, - "grad_norm": 10.682036399841309, - "learning_rate": 3.320624784004805e-06, - "loss": 0.4907, - "mean_token_accuracy": 0.8419831424951554, - "num_tokens": 225266837.0, - "step": 187240 - }, - { - "entropy": 1.8850742995738983, - "epoch": 0.5804586165555478, - "grad_norm": 3.9985318183898926, - "learning_rate": 3.320536114128877e-06, - "loss": 0.4433, - "mean_token_accuracy": 0.8472855120897294, - "num_tokens": 225278716.0, - "step": 187250 - }, - { - "entropy": 1.9892206102609635, - "epoch": 0.5804896156805974, - "grad_norm": 9.449318885803223, - "learning_rate": 3.3204474513557626e-06, - "loss": 0.5333, - "mean_token_accuracy": 0.8419775024056435, - "num_tokens": 225289299.0, - "step": 187260 - }, - { - "entropy": 1.8804281800985336, - "epoch": 0.5805206148056471, - "grad_norm": 2.9244470596313477, - "learning_rate": 3.320358795684515e-06, - "loss": 0.4574, - "mean_token_accuracy": 0.8517866969108582, - "num_tokens": 225301373.0, - "step": 187270 - }, - { - "entropy": 1.9121669724583625, - "epoch": 0.5805516139306968, - "grad_norm": 7.696053981781006, - "learning_rate": 3.320270147114185e-06, - "loss": 0.4661, - "mean_token_accuracy": 0.8523363262414932, - "num_tokens": 225313048.0, - "step": 187280 - }, - { - "entropy": 1.8642026141285897, - "epoch": 0.5805826130557465, - "grad_norm": 7.293044567108154, - "learning_rate": 3.3201815056438252e-06, - "loss": 0.4044, - "mean_token_accuracy": 0.8651653498411178, - "num_tokens": 225324857.0, - "step": 187290 - }, - { - "entropy": 1.8944362625479698, - "epoch": 0.5806136121807962, - "grad_norm": 8.115937232971191, - "learning_rate": 3.3200928712724882e-06, - "loss": 0.5121, - "mean_token_accuracy": 0.8387358605861663, - "num_tokens": 225336672.0, - "step": 187300 - }, - { - "entropy": 1.9070868030190469, - "epoch": 0.5806446113058459, - "grad_norm": 7.693739414215088, - "learning_rate": 3.320004243999225e-06, - "loss": 0.4052, - "mean_token_accuracy": 0.8645681887865067, - "num_tokens": 225347958.0, - "step": 187310 - }, - { - "entropy": 1.9648890227079392, - "epoch": 0.5806756104308956, - "grad_norm": 8.766926765441895, - "learning_rate": 3.3199156238230913e-06, - "loss": 0.4859, - "mean_token_accuracy": 0.8412608623504638, - "num_tokens": 225358571.0, - "step": 187320 - }, - { - "entropy": 1.8624957248568534, - "epoch": 0.5807066095559453, - "grad_norm": 9.408360481262207, - "learning_rate": 3.319827010743137e-06, - "loss": 0.4336, - "mean_token_accuracy": 0.8628850594162941, - "num_tokens": 225370298.0, - "step": 187330 - }, - { - "entropy": 1.8627533257007598, - "epoch": 0.580737608680995, - "grad_norm": 7.75230598449707, - "learning_rate": 3.3197384047584157e-06, - "loss": 0.4356, - "mean_token_accuracy": 0.8560406729578972, - "num_tokens": 225381912.0, - "step": 187340 - }, - { - "entropy": 1.8439243629574775, - "epoch": 0.5807686078060447, - "grad_norm": 6.105306625366211, - "learning_rate": 3.319649805867981e-06, - "loss": 0.4292, - "mean_token_accuracy": 0.8594496145844459, - "num_tokens": 225394374.0, - "step": 187350 - }, - { - "entropy": 1.8410490676760674, - "epoch": 0.5807996069310943, - "grad_norm": 9.341657638549805, - "learning_rate": 3.319561214070887e-06, - "loss": 0.4584, - "mean_token_accuracy": 0.8523242130875588, - "num_tokens": 225406849.0, - "step": 187360 - }, - { - "entropy": 1.8524947792291642, - "epoch": 0.5808306060561441, - "grad_norm": 8.550660133361816, - "learning_rate": 3.3194726293661866e-06, - "loss": 0.4039, - "mean_token_accuracy": 0.8669483736157417, - "num_tokens": 225418553.0, - "step": 187370 - }, - { - "entropy": 1.7620634004473685, - "epoch": 0.5808616051811938, - "grad_norm": 4.0590410232543945, - "learning_rate": 3.319384051752933e-06, - "loss": 0.3503, - "mean_token_accuracy": 0.8679928451776504, - "num_tokens": 225431710.0, - "step": 187380 - }, - { - "entropy": 1.863413827866316, - "epoch": 0.5808926043062435, - "grad_norm": 8.537755966186523, - "learning_rate": 3.3192954812301805e-06, - "loss": 0.4167, - "mean_token_accuracy": 0.8627542704343796, - "num_tokens": 225443800.0, - "step": 187390 - }, - { - "entropy": 1.9017139032483101, - "epoch": 0.5809236034312931, - "grad_norm": 9.76558780670166, - "learning_rate": 3.319206917796984e-06, - "loss": 0.4272, - "mean_token_accuracy": 0.855692045390606, - "num_tokens": 225455170.0, - "step": 187400 - }, - { - "entropy": 1.8679040119051933, - "epoch": 0.5809546025563428, - "grad_norm": 5.080874919891357, - "learning_rate": 3.3191183614523958e-06, - "loss": 0.4409, - "mean_token_accuracy": 0.8646040439605713, - "num_tokens": 225466335.0, - "step": 187410 - }, - { - "entropy": 1.8454194337129592, - "epoch": 0.5809856016813926, - "grad_norm": 7.6558003425598145, - "learning_rate": 3.319029812195473e-06, - "loss": 0.4163, - "mean_token_accuracy": 0.8634024366736412, - "num_tokens": 225478223.0, - "step": 187420 - }, - { - "entropy": 1.8780192330479621, - "epoch": 0.5810166008064422, - "grad_norm": 6.734908103942871, - "learning_rate": 3.3189412700252675e-06, - "loss": 0.4554, - "mean_token_accuracy": 0.8515690982341766, - "num_tokens": 225489362.0, - "step": 187430 - }, - { - "entropy": 1.8287191420793534, - "epoch": 0.5810475999314919, - "grad_norm": 6.768209457397461, - "learning_rate": 3.3188527349408363e-06, - "loss": 0.4084, - "mean_token_accuracy": 0.860761895775795, - "num_tokens": 225501988.0, - "step": 187440 - }, - { - "entropy": 1.895971204340458, - "epoch": 0.5810785990565416, - "grad_norm": 8.12165641784668, - "learning_rate": 3.318764206941233e-06, - "loss": 0.4789, - "mean_token_accuracy": 0.853890660405159, - "num_tokens": 225513151.0, - "step": 187450 - }, - { - "entropy": 1.8142405509948731, - "epoch": 0.5811095981815914, - "grad_norm": 9.434362411499023, - "learning_rate": 3.318675686025512e-06, - "loss": 0.4006, - "mean_token_accuracy": 0.8667998522520065, - "num_tokens": 225525252.0, - "step": 187460 - }, - { - "entropy": 1.8143661960959434, - "epoch": 0.581140597306641, - "grad_norm": 4.254508972167969, - "learning_rate": 3.3185871721927317e-06, - "loss": 0.387, - "mean_token_accuracy": 0.8701783046126366, - "num_tokens": 225537767.0, - "step": 187470 - }, - { - "entropy": 1.8940456122159959, - "epoch": 0.5811715964316907, - "grad_norm": 8.721658706665039, - "learning_rate": 3.3184986654419447e-06, - "loss": 0.4564, - "mean_token_accuracy": 0.8549002230167388, - "num_tokens": 225549434.0, - "step": 187480 - }, - { - "entropy": 1.8303603425621986, - "epoch": 0.5812025955567404, - "grad_norm": 4.4417405128479, - "learning_rate": 3.318410165772207e-06, - "loss": 0.4196, - "mean_token_accuracy": 0.8598802730441093, - "num_tokens": 225561041.0, - "step": 187490 - }, - { - "entropy": 1.9144440218806267, - "epoch": 0.5812335946817901, - "grad_norm": 8.221935272216797, - "learning_rate": 3.3183216731825756e-06, - "loss": 0.4801, - "mean_token_accuracy": 0.8425498574972152, - "num_tokens": 225572823.0, - "step": 187500 - }, - { - "entropy": 1.8884352698922158, - "epoch": 0.5812645938068398, - "grad_norm": 8.147493362426758, - "learning_rate": 3.3182331876721063e-06, - "loss": 0.4293, - "mean_token_accuracy": 0.8497319743037224, - "num_tokens": 225584616.0, - "step": 187510 - }, - { - "entropy": 1.8196037113666534, - "epoch": 0.5812955929318895, - "grad_norm": 8.769972801208496, - "learning_rate": 3.3181447092398542e-06, - "loss": 0.4064, - "mean_token_accuracy": 0.8692084148526191, - "num_tokens": 225596386.0, - "step": 187520 - }, - { - "entropy": 1.8051419720053672, - "epoch": 0.5813265920569392, - "grad_norm": 3.612482786178589, - "learning_rate": 3.3180562378848775e-06, - "loss": 0.4222, - "mean_token_accuracy": 0.8601198688149452, - "num_tokens": 225608958.0, - "step": 187530 - }, - { - "entropy": 1.8300196036696434, - "epoch": 0.5813575911819889, - "grad_norm": 3.7968201637268066, - "learning_rate": 3.317967773606231e-06, - "loss": 0.3841, - "mean_token_accuracy": 0.8680305197834969, - "num_tokens": 225621216.0, - "step": 187540 - }, - { - "entropy": 1.8044159524142742, - "epoch": 0.5813885903070386, - "grad_norm": 7.892989158630371, - "learning_rate": 3.3178793164029725e-06, - "loss": 0.3922, - "mean_token_accuracy": 0.8621145501732826, - "num_tokens": 225634613.0, - "step": 187550 - }, - { - "entropy": 1.872318272292614, - "epoch": 0.5814195894320883, - "grad_norm": 4.278754711151123, - "learning_rate": 3.3177908662741582e-06, - "loss": 0.4211, - "mean_token_accuracy": 0.8609319195151329, - "num_tokens": 225646477.0, - "step": 187560 - }, - { - "entropy": 1.8809518799185754, - "epoch": 0.5814505885571379, - "grad_norm": 6.835907459259033, - "learning_rate": 3.3177024232188454e-06, - "loss": 0.4415, - "mean_token_accuracy": 0.8569922402501107, - "num_tokens": 225658719.0, - "step": 187570 - }, - { - "entropy": 1.848207938671112, - "epoch": 0.5814815876821877, - "grad_norm": 8.355209350585938, - "learning_rate": 3.317613987236091e-06, - "loss": 0.3822, - "mean_token_accuracy": 0.8651770651340485, - "num_tokens": 225670144.0, - "step": 187580 - }, - { - "entropy": 1.9318113803863526, - "epoch": 0.5815125868072374, - "grad_norm": 7.146533012390137, - "learning_rate": 3.3175255583249538e-06, - "loss": 0.519, - "mean_token_accuracy": 0.8475144997239112, - "num_tokens": 225681379.0, - "step": 187590 - }, - { - "entropy": 1.897273415327072, - "epoch": 0.581543585932287, - "grad_norm": 8.656028747558594, - "learning_rate": 3.31743713648449e-06, - "loss": 0.4406, - "mean_token_accuracy": 0.8592107653617859, - "num_tokens": 225693244.0, - "step": 187600 - }, - { - "entropy": 1.8229992628097533, - "epoch": 0.5815745850573367, - "grad_norm": 8.393926620483398, - "learning_rate": 3.317348721713758e-06, - "loss": 0.4122, - "mean_token_accuracy": 0.8646789059042931, - "num_tokens": 225705254.0, - "step": 187610 - }, - { - "entropy": 1.887227413058281, - "epoch": 0.5816055841823865, - "grad_norm": 8.333544731140137, - "learning_rate": 3.3172603140118146e-06, - "loss": 0.4297, - "mean_token_accuracy": 0.8589311584830284, - "num_tokens": 225716548.0, - "step": 187620 - }, - { - "entropy": 1.8871796131134033, - "epoch": 0.5816365833074362, - "grad_norm": 8.231070518493652, - "learning_rate": 3.31717191337772e-06, - "loss": 0.4664, - "mean_token_accuracy": 0.8593043237924576, - "num_tokens": 225728445.0, - "step": 187630 - }, - { - "entropy": 1.8334648802876472, - "epoch": 0.5816675824324858, - "grad_norm": 8.094388961791992, - "learning_rate": 3.317083519810531e-06, - "loss": 0.4345, - "mean_token_accuracy": 0.8523730918765068, - "num_tokens": 225740029.0, - "step": 187640 - }, - { - "entropy": 1.8685568556189538, - "epoch": 0.5816985815575355, - "grad_norm": 9.478364944458008, - "learning_rate": 3.316995133309307e-06, - "loss": 0.4158, - "mean_token_accuracy": 0.8546089068055153, - "num_tokens": 225752464.0, - "step": 187650 - }, - { - "entropy": 1.8670329421758651, - "epoch": 0.5817295806825852, - "grad_norm": 7.892263412475586, - "learning_rate": 3.3169067538731053e-06, - "loss": 0.427, - "mean_token_accuracy": 0.8570248574018479, - "num_tokens": 225764756.0, - "step": 187660 - }, - { - "entropy": 1.8107655197381973, - "epoch": 0.581760579807635, - "grad_norm": 4.070509910583496, - "learning_rate": 3.3168183815009868e-06, - "loss": 0.4479, - "mean_token_accuracy": 0.8547146677970886, - "num_tokens": 225777573.0, - "step": 187670 - }, - { - "entropy": 1.890822234749794, - "epoch": 0.5817915789326846, - "grad_norm": 8.35913372039795, - "learning_rate": 3.3167300161920084e-06, - "loss": 0.4778, - "mean_token_accuracy": 0.8575819626450538, - "num_tokens": 225788758.0, - "step": 187680 - }, - { - "entropy": 1.8458858624100685, - "epoch": 0.5818225780577343, - "grad_norm": 3.60536789894104, - "learning_rate": 3.3166416579452303e-06, - "loss": 0.4312, - "mean_token_accuracy": 0.8596644833683967, - "num_tokens": 225800356.0, - "step": 187690 - }, - { - "entropy": 1.893759785592556, - "epoch": 0.581853577182784, - "grad_norm": 9.219444274902344, - "learning_rate": 3.316553306759712e-06, - "loss": 0.4349, - "mean_token_accuracy": 0.8564515098929405, - "num_tokens": 225811643.0, - "step": 187700 - }, - { - "entropy": 1.7977493658661843, - "epoch": 0.5818845763078337, - "grad_norm": 3.17759370803833, - "learning_rate": 3.3164649626345125e-06, - "loss": 0.3782, - "mean_token_accuracy": 0.8628740921616554, - "num_tokens": 225824019.0, - "step": 187710 - }, - { - "entropy": 1.86475830078125, - "epoch": 0.5819155754328834, - "grad_norm": 4.8829498291015625, - "learning_rate": 3.316376625568692e-06, - "loss": 0.4555, - "mean_token_accuracy": 0.8398260667920112, - "num_tokens": 225836100.0, - "step": 187720 - }, - { - "entropy": 1.859655699133873, - "epoch": 0.5819465745579331, - "grad_norm": 3.9662086963653564, - "learning_rate": 3.31628829556131e-06, - "loss": 0.4558, - "mean_token_accuracy": 0.8486672386527061, - "num_tokens": 225848121.0, - "step": 187730 - }, - { - "entropy": 1.8039502635598184, - "epoch": 0.5819775736829828, - "grad_norm": 6.981991767883301, - "learning_rate": 3.316199972611427e-06, - "loss": 0.3584, - "mean_token_accuracy": 0.8625715032219887, - "num_tokens": 225861162.0, - "step": 187740 - }, - { - "entropy": 1.9068680822849273, - "epoch": 0.5820085728080325, - "grad_norm": 6.9623894691467285, - "learning_rate": 3.3161116567181025e-06, - "loss": 0.449, - "mean_token_accuracy": 0.8555295959115028, - "num_tokens": 225872355.0, - "step": 187750 - }, - { - "entropy": 1.8862819537520408, - "epoch": 0.5820395719330822, - "grad_norm": 8.505620956420898, - "learning_rate": 3.3160233478803978e-06, - "loss": 0.4282, - "mean_token_accuracy": 0.858185425400734, - "num_tokens": 225884385.0, - "step": 187760 - }, - { - "entropy": 1.8174796253442764, - "epoch": 0.5820705710581319, - "grad_norm": 3.593024253845215, - "learning_rate": 3.3159350460973725e-06, - "loss": 0.3673, - "mean_token_accuracy": 0.8744990170001984, - "num_tokens": 225896421.0, - "step": 187770 - }, - { - "entropy": 1.794604268670082, - "epoch": 0.5821015701831815, - "grad_norm": 4.108510971069336, - "learning_rate": 3.3158467513680887e-06, - "loss": 0.4025, - "mean_token_accuracy": 0.8654602453112602, - "num_tokens": 225909202.0, - "step": 187780 - }, - { - "entropy": 1.856592944264412, - "epoch": 0.5821325693082313, - "grad_norm": 3.522554636001587, - "learning_rate": 3.315758463691606e-06, - "loss": 0.4668, - "mean_token_accuracy": 0.8488976538181305, - "num_tokens": 225921116.0, - "step": 187790 - }, - { - "entropy": 1.7395574852824212, - "epoch": 0.582163568433281, - "grad_norm": 3.735677719116211, - "learning_rate": 3.315670183066986e-06, - "loss": 0.3464, - "mean_token_accuracy": 0.8713794186711311, - "num_tokens": 225934922.0, - "step": 187800 - }, - { - "entropy": 1.8028660848736764, - "epoch": 0.5821945675583307, - "grad_norm": 8.561286926269531, - "learning_rate": 3.3155819094932912e-06, - "loss": 0.4191, - "mean_token_accuracy": 0.8610808923840523, - "num_tokens": 225947073.0, - "step": 187810 - }, - { - "entropy": 1.9100270748138428, - "epoch": 0.5822255666833803, - "grad_norm": 9.145927429199219, - "learning_rate": 3.3154936429695807e-06, - "loss": 0.4629, - "mean_token_accuracy": 0.8617308288812637, - "num_tokens": 225958338.0, - "step": 187820 - }, - { - "entropy": 1.8450745210051536, - "epoch": 0.5822565658084301, - "grad_norm": 3.313938617706299, - "learning_rate": 3.3154053834949173e-06, - "loss": 0.3908, - "mean_token_accuracy": 0.8720159530639648, - "num_tokens": 225970594.0, - "step": 187830 - }, - { - "entropy": 1.8195609986782073, - "epoch": 0.5822875649334798, - "grad_norm": 3.9881591796875, - "learning_rate": 3.315317131068364e-06, - "loss": 0.4007, - "mean_token_accuracy": 0.8652278319001198, - "num_tokens": 225983382.0, - "step": 187840 - }, - { - "entropy": 1.8461186781525611, - "epoch": 0.5823185640585294, - "grad_norm": 8.813285827636719, - "learning_rate": 3.315228885688981e-06, - "loss": 0.4284, - "mean_token_accuracy": 0.8545858830213546, - "num_tokens": 225995123.0, - "step": 187850 - }, - { - "entropy": 1.8090190321207047, - "epoch": 0.5823495631835791, - "grad_norm": 8.754561424255371, - "learning_rate": 3.3151406473558305e-06, - "loss": 0.3976, - "mean_token_accuracy": 0.8618172109127045, - "num_tokens": 226008105.0, - "step": 187860 - }, - { - "entropy": 1.867758809030056, - "epoch": 0.5823805623086289, - "grad_norm": 7.915543556213379, - "learning_rate": 3.315052416067976e-06, - "loss": 0.4739, - "mean_token_accuracy": 0.8492542281746864, - "num_tokens": 226020743.0, - "step": 187870 - }, - { - "entropy": 1.823201984167099, - "epoch": 0.5824115614336786, - "grad_norm": 8.312398910522461, - "learning_rate": 3.3149641918244797e-06, - "loss": 0.4272, - "mean_token_accuracy": 0.8526169434189796, - "num_tokens": 226033165.0, - "step": 187880 - }, - { - "entropy": 1.9004525065422058, - "epoch": 0.5824425605587282, - "grad_norm": 7.280655384063721, - "learning_rate": 3.3148759746244036e-06, - "loss": 0.4729, - "mean_token_accuracy": 0.8470793083310127, - "num_tokens": 226045476.0, - "step": 187890 - }, - { - "entropy": 1.7311370939016342, - "epoch": 0.5824735596837779, - "grad_norm": 9.387449264526367, - "learning_rate": 3.3147877644668114e-06, - "loss": 0.339, - "mean_token_accuracy": 0.8730020001530647, - "num_tokens": 226059148.0, - "step": 187900 - }, - { - "entropy": 1.814567594230175, - "epoch": 0.5825045588088276, - "grad_norm": 3.6906380653381348, - "learning_rate": 3.3146995613507654e-06, - "loss": 0.4556, - "mean_token_accuracy": 0.856498584151268, - "num_tokens": 226072217.0, - "step": 187910 - }, - { - "entropy": 1.8509540766477586, - "epoch": 0.5825355579338773, - "grad_norm": 9.102182388305664, - "learning_rate": 3.3146113652753294e-06, - "loss": 0.4125, - "mean_token_accuracy": 0.8627394825220108, - "num_tokens": 226084204.0, - "step": 187920 - }, - { - "entropy": 1.7961204290390014, - "epoch": 0.582566557058927, - "grad_norm": 7.360368728637695, - "learning_rate": 3.3145231762395663e-06, - "loss": 0.39, - "mean_token_accuracy": 0.8610507771372795, - "num_tokens": 226097223.0, - "step": 187930 - }, - { - "entropy": 1.9453897044062614, - "epoch": 0.5825975561839767, - "grad_norm": 8.529144287109375, - "learning_rate": 3.3144349942425403e-06, - "loss": 0.4752, - "mean_token_accuracy": 0.8498910903930664, - "num_tokens": 226108705.0, - "step": 187940 - }, - { - "entropy": 1.881111888587475, - "epoch": 0.5826285553090264, - "grad_norm": 7.892261505126953, - "learning_rate": 3.3143468192833146e-06, - "loss": 0.4494, - "mean_token_accuracy": 0.867023055255413, - "num_tokens": 226119736.0, - "step": 187950 - }, - { - "entropy": 1.9663893669843673, - "epoch": 0.5826595544340761, - "grad_norm": 9.200639724731445, - "learning_rate": 3.3142586513609527e-06, - "loss": 0.499, - "mean_token_accuracy": 0.841266855597496, - "num_tokens": 226131251.0, - "step": 187960 - }, - { - "entropy": 1.8415462985634803, - "epoch": 0.5826905535591258, - "grad_norm": 7.170725345611572, - "learning_rate": 3.3141704904745196e-06, - "loss": 0.4037, - "mean_token_accuracy": 0.8617974206805229, - "num_tokens": 226142855.0, - "step": 187970 - }, - { - "entropy": 1.869658489525318, - "epoch": 0.5827215526841755, - "grad_norm": 7.896175861358643, - "learning_rate": 3.314082336623079e-06, - "loss": 0.4493, - "mean_token_accuracy": 0.8597209066152572, - "num_tokens": 226154990.0, - "step": 187980 - }, - { - "entropy": 1.8297441124916076, - "epoch": 0.5827525518092251, - "grad_norm": 3.932016134262085, - "learning_rate": 3.313994189805696e-06, - "loss": 0.3952, - "mean_token_accuracy": 0.8598747432231904, - "num_tokens": 226167448.0, - "step": 187990 - }, - { - "entropy": 1.871214209496975, - "epoch": 0.5827835509342749, - "grad_norm": 9.672822952270508, - "learning_rate": 3.3139060500214345e-06, - "loss": 0.4218, - "mean_token_accuracy": 0.8542467385530472, - "num_tokens": 226178960.0, - "step": 188000 - }, - { - "entropy": 1.7930740877985953, - "epoch": 0.5828145500593246, - "grad_norm": 8.267410278320312, - "learning_rate": 3.313817917269359e-06, - "loss": 0.3895, - "mean_token_accuracy": 0.8633213341236115, - "num_tokens": 226191638.0, - "step": 188010 - }, - { - "entropy": 1.8819244831800461, - "epoch": 0.5828455491843743, - "grad_norm": 8.082293510437012, - "learning_rate": 3.313729791548535e-06, - "loss": 0.4395, - "mean_token_accuracy": 0.8530665084719657, - "num_tokens": 226202623.0, - "step": 188020 - }, - { - "entropy": 1.8311271622776986, - "epoch": 0.5828765483094239, - "grad_norm": 11.455146789550781, - "learning_rate": 3.3136416728580277e-06, - "loss": 0.4344, - "mean_token_accuracy": 0.8615739867091179, - "num_tokens": 226215309.0, - "step": 188030 - }, - { - "entropy": 1.904327604174614, - "epoch": 0.5829075474344737, - "grad_norm": 7.183170318603516, - "learning_rate": 3.313553561196902e-06, - "loss": 0.4502, - "mean_token_accuracy": 0.8606903403997421, - "num_tokens": 226226613.0, - "step": 188040 - }, - { - "entropy": 1.897905382514, - "epoch": 0.5829385465595234, - "grad_norm": 8.285850524902344, - "learning_rate": 3.3134654565642236e-06, - "loss": 0.458, - "mean_token_accuracy": 0.8442597165703773, - "num_tokens": 226238398.0, - "step": 188050 - }, - { - "entropy": 1.8283357247710228, - "epoch": 0.582969545684573, - "grad_norm": 6.005077838897705, - "learning_rate": 3.3133773589590583e-06, - "loss": 0.391, - "mean_token_accuracy": 0.862805712223053, - "num_tokens": 226251131.0, - "step": 188060 - }, - { - "entropy": 1.9413809180259705, - "epoch": 0.5830005448096227, - "grad_norm": 14.281983375549316, - "learning_rate": 3.3132892683804725e-06, - "loss": 0.5778, - "mean_token_accuracy": 0.840830785036087, - "num_tokens": 226262959.0, - "step": 188070 - }, - { - "entropy": 1.8144272148609162, - "epoch": 0.5830315439346725, - "grad_norm": 8.84279727935791, - "learning_rate": 3.313201184827531e-06, - "loss": 0.4398, - "mean_token_accuracy": 0.8624243274331093, - "num_tokens": 226276078.0, - "step": 188080 - }, - { - "entropy": 1.9170308411121368, - "epoch": 0.5830625430597222, - "grad_norm": 4.164804935455322, - "learning_rate": 3.3131131082993e-06, - "loss": 0.4998, - "mean_token_accuracy": 0.8424900874495507, - "num_tokens": 226287036.0, - "step": 188090 - }, - { - "entropy": 1.8704634562134743, - "epoch": 0.5830935421847718, - "grad_norm": 8.712486267089844, - "learning_rate": 3.3130250387948467e-06, - "loss": 0.4603, - "mean_token_accuracy": 0.8427205190062523, - "num_tokens": 226299076.0, - "step": 188100 - }, - { - "entropy": 1.841335417330265, - "epoch": 0.5831245413098215, - "grad_norm": 7.971294403076172, - "learning_rate": 3.3129369763132374e-06, - "loss": 0.4456, - "mean_token_accuracy": 0.8621138870716095, - "num_tokens": 226311492.0, - "step": 188110 - }, - { - "entropy": 1.876630488038063, - "epoch": 0.5831555404348713, - "grad_norm": 3.832597255706787, - "learning_rate": 3.312848920853538e-06, - "loss": 0.463, - "mean_token_accuracy": 0.859805490076542, - "num_tokens": 226323785.0, - "step": 188120 - }, - { - "entropy": 1.9168067395687103, - "epoch": 0.583186539559921, - "grad_norm": 8.005231857299805, - "learning_rate": 3.312760872414816e-06, - "loss": 0.4887, - "mean_token_accuracy": 0.8578100666403771, - "num_tokens": 226335299.0, - "step": 188130 - }, - { - "entropy": 1.844995990395546, - "epoch": 0.5832175386849706, - "grad_norm": 6.676909923553467, - "learning_rate": 3.3126728309961387e-06, - "loss": 0.4515, - "mean_token_accuracy": 0.8538479104638099, - "num_tokens": 226347611.0, - "step": 188140 - }, - { - "entropy": 1.9432258665561677, - "epoch": 0.5832485378100203, - "grad_norm": 3.9236388206481934, - "learning_rate": 3.312584796596573e-06, - "loss": 0.4391, - "mean_token_accuracy": 0.8563521727919579, - "num_tokens": 226358832.0, - "step": 188150 - }, - { - "entropy": 1.830638773739338, - "epoch": 0.58327953693507, - "grad_norm": 8.679086685180664, - "learning_rate": 3.312496769215186e-06, - "loss": 0.3824, - "mean_token_accuracy": 0.8629469037055969, - "num_tokens": 226372333.0, - "step": 188160 - }, - { - "entropy": 1.9505089595913887, - "epoch": 0.5833105360601197, - "grad_norm": 9.412543296813965, - "learning_rate": 3.312408748851046e-06, - "loss": 0.5211, - "mean_token_accuracy": 0.8360338136553764, - "num_tokens": 226384676.0, - "step": 188170 - }, - { - "entropy": 1.7972653448581695, - "epoch": 0.5833415351851694, - "grad_norm": 6.050276756286621, - "learning_rate": 3.3123207355032193e-06, - "loss": 0.4329, - "mean_token_accuracy": 0.8623633801937103, - "num_tokens": 226397428.0, - "step": 188180 - }, - { - "entropy": 1.8783445715904237, - "epoch": 0.5833725343102191, - "grad_norm": 3.4504499435424805, - "learning_rate": 3.312232729170776e-06, - "loss": 0.4249, - "mean_token_accuracy": 0.854879067838192, - "num_tokens": 226409277.0, - "step": 188190 - }, - { - "entropy": 1.8437550991773606, - "epoch": 0.5834035334352687, - "grad_norm": 8.014336585998535, - "learning_rate": 3.3121447298527826e-06, - "loss": 0.434, - "mean_token_accuracy": 0.8607449740171432, - "num_tokens": 226421381.0, - "step": 188200 - }, - { - "entropy": 1.8978667482733727, - "epoch": 0.5834345325603185, - "grad_norm": 7.275966644287109, - "learning_rate": 3.3120567375483074e-06, - "loss": 0.4366, - "mean_token_accuracy": 0.8560662552714348, - "num_tokens": 226433314.0, - "step": 188210 - }, - { - "entropy": 1.7705576822161675, - "epoch": 0.5834655316853682, - "grad_norm": 8.13685131072998, - "learning_rate": 3.3119687522564193e-06, - "loss": 0.3593, - "mean_token_accuracy": 0.8652193561196327, - "num_tokens": 226447082.0, - "step": 188220 - }, - { - "entropy": 1.8670435428619385, - "epoch": 0.5834965308104179, - "grad_norm": 8.564881324768066, - "learning_rate": 3.3118807739761864e-06, - "loss": 0.4575, - "mean_token_accuracy": 0.8523515909910202, - "num_tokens": 226458416.0, - "step": 188230 - }, - { - "entropy": 1.7603057324886322, - "epoch": 0.5835275299354675, - "grad_norm": 8.4653959274292, - "learning_rate": 3.311792802706678e-06, - "loss": 0.3732, - "mean_token_accuracy": 0.8672403365373611, - "num_tokens": 226471502.0, - "step": 188240 - }, - { - "entropy": 1.8395553201436996, - "epoch": 0.5835585290605173, - "grad_norm": 8.05081844329834, - "learning_rate": 3.3117048384469636e-06, - "loss": 0.4032, - "mean_token_accuracy": 0.8622571364045143, - "num_tokens": 226483666.0, - "step": 188250 - }, - { - "entropy": 1.9094887733459474, - "epoch": 0.583589528185567, - "grad_norm": 7.921069145202637, - "learning_rate": 3.311616881196111e-06, - "loss": 0.5063, - "mean_token_accuracy": 0.8507940858602524, - "num_tokens": 226494845.0, - "step": 188260 - }, - { - "entropy": 1.9042211279273034, - "epoch": 0.5836205273106166, - "grad_norm": 9.858515739440918, - "learning_rate": 3.3115289309531896e-06, - "loss": 0.455, - "mean_token_accuracy": 0.8520038589835167, - "num_tokens": 226506098.0, - "step": 188270 - }, - { - "entropy": 1.9129501059651375, - "epoch": 0.5836515264356663, - "grad_norm": 4.229575157165527, - "learning_rate": 3.31144098771727e-06, - "loss": 0.4523, - "mean_token_accuracy": 0.8541373431682586, - "num_tokens": 226517333.0, - "step": 188280 - }, - { - "entropy": 1.8737207755446434, - "epoch": 0.5836825255607161, - "grad_norm": 9.133805274963379, - "learning_rate": 3.31135305148742e-06, - "loss": 0.4655, - "mean_token_accuracy": 0.8556098580360413, - "num_tokens": 226529232.0, - "step": 188290 - }, - { - "entropy": 1.8534547090530396, - "epoch": 0.5837135246857658, - "grad_norm": 8.339399337768555, - "learning_rate": 3.3112651222627118e-06, - "loss": 0.4269, - "mean_token_accuracy": 0.8503861218690872, - "num_tokens": 226541327.0, - "step": 188300 - }, - { - "entropy": 1.8679198250174522, - "epoch": 0.5837445238108154, - "grad_norm": 7.660033226013184, - "learning_rate": 3.3111772000422136e-06, - "loss": 0.4435, - "mean_token_accuracy": 0.8588420808315277, - "num_tokens": 226553424.0, - "step": 188310 - }, - { - "entropy": 1.9072294965386392, - "epoch": 0.5837755229358651, - "grad_norm": 7.387581825256348, - "learning_rate": 3.3110892848249966e-06, - "loss": 0.4852, - "mean_token_accuracy": 0.8525169312953949, - "num_tokens": 226564494.0, - "step": 188320 - }, - { - "entropy": 1.8538544602692126, - "epoch": 0.5838065220609149, - "grad_norm": 11.310450553894043, - "learning_rate": 3.3110013766101297e-06, - "loss": 0.423, - "mean_token_accuracy": 0.8551127001643181, - "num_tokens": 226577233.0, - "step": 188330 - }, - { - "entropy": 1.9005723729729653, - "epoch": 0.5838375211859645, - "grad_norm": 8.547388076782227, - "learning_rate": 3.310913475396685e-06, - "loss": 0.4453, - "mean_token_accuracy": 0.8635548874735832, - "num_tokens": 226588550.0, - "step": 188340 - }, - { - "entropy": 1.828601559996605, - "epoch": 0.5838685203110142, - "grad_norm": 4.200283527374268, - "learning_rate": 3.310825581183732e-06, - "loss": 0.3652, - "mean_token_accuracy": 0.8699035704135895, - "num_tokens": 226601722.0, - "step": 188350 - }, - { - "entropy": 1.9584389060735703, - "epoch": 0.5838995194360639, - "grad_norm": 8.566116333007812, - "learning_rate": 3.3107376939703425e-06, - "loss": 0.509, - "mean_token_accuracy": 0.8420336455106735, - "num_tokens": 226612375.0, - "step": 188360 - }, - { - "entropy": 1.930802473425865, - "epoch": 0.5839305185611137, - "grad_norm": 9.12570858001709, - "learning_rate": 3.310649813755587e-06, - "loss": 0.5011, - "mean_token_accuracy": 0.8506847187876702, - "num_tokens": 226623213.0, - "step": 188370 - }, - { - "entropy": 1.9112320333719253, - "epoch": 0.5839615176861633, - "grad_norm": 8.679109573364258, - "learning_rate": 3.3105619405385365e-06, - "loss": 0.4744, - "mean_token_accuracy": 0.8585551649332046, - "num_tokens": 226634267.0, - "step": 188380 - }, - { - "entropy": 1.9466535836458205, - "epoch": 0.583992516811213, - "grad_norm": 4.623629093170166, - "learning_rate": 3.310474074318262e-06, - "loss": 0.4933, - "mean_token_accuracy": 0.843099795281887, - "num_tokens": 226645782.0, - "step": 188390 - }, - { - "entropy": 1.751372517645359, - "epoch": 0.5840235159362627, - "grad_norm": 3.6573829650878906, - "learning_rate": 3.3103862150938366e-06, - "loss": 0.3644, - "mean_token_accuracy": 0.8672532960772514, - "num_tokens": 226659879.0, - "step": 188400 - }, - { - "entropy": 1.878103531897068, - "epoch": 0.5840545150613123, - "grad_norm": 3.6132452487945557, - "learning_rate": 3.310298362864331e-06, - "loss": 0.4072, - "mean_token_accuracy": 0.864518155157566, - "num_tokens": 226671393.0, - "step": 188410 - }, - { - "entropy": 1.8883730322122574, - "epoch": 0.5840855141863621, - "grad_norm": 7.3440327644348145, - "learning_rate": 3.310210517628817e-06, - "loss": 0.4561, - "mean_token_accuracy": 0.8584344744682312, - "num_tokens": 226683271.0, - "step": 188420 - }, - { - "entropy": 1.8579667672514915, - "epoch": 0.5841165133114118, - "grad_norm": 3.851297616958618, - "learning_rate": 3.3101226793863665e-06, - "loss": 0.4369, - "mean_token_accuracy": 0.8607630103826522, - "num_tokens": 226695544.0, - "step": 188430 - }, - { - "entropy": 1.8530279219150543, - "epoch": 0.5841475124364615, - "grad_norm": 9.353023529052734, - "learning_rate": 3.3100348481360524e-06, - "loss": 0.4427, - "mean_token_accuracy": 0.8588374391198158, - "num_tokens": 226707908.0, - "step": 188440 - }, - { - "entropy": 1.9490127682685852, - "epoch": 0.5841785115615111, - "grad_norm": 4.572386741638184, - "learning_rate": 3.3099470238769466e-06, - "loss": 0.4735, - "mean_token_accuracy": 0.8485381156206131, - "num_tokens": 226719751.0, - "step": 188450 - }, - { - "entropy": 1.8131704688072205, - "epoch": 0.5842095106865609, - "grad_norm": 7.624702453613281, - "learning_rate": 3.309859206608122e-06, - "loss": 0.3821, - "mean_token_accuracy": 0.8713184505701065, - "num_tokens": 226732052.0, - "step": 188460 - }, - { - "entropy": 1.9276420667767524, - "epoch": 0.5842405098116106, - "grad_norm": 7.374622344970703, - "learning_rate": 3.3097713963286505e-06, - "loss": 0.484, - "mean_token_accuracy": 0.8484125405550003, - "num_tokens": 226743129.0, - "step": 188470 - }, - { - "entropy": 1.8226234167814255, - "epoch": 0.5842715089366602, - "grad_norm": 7.201011657714844, - "learning_rate": 3.309683593037606e-06, - "loss": 0.4367, - "mean_token_accuracy": 0.8551812097430229, - "num_tokens": 226756342.0, - "step": 188480 - }, - { - "entropy": 1.8935386508703231, - "epoch": 0.5843025080617099, - "grad_norm": 3.1278462409973145, - "learning_rate": 3.309595796734061e-06, - "loss": 0.4143, - "mean_token_accuracy": 0.8602985307574272, - "num_tokens": 226768034.0, - "step": 188490 - }, - { - "entropy": 1.897402636706829, - "epoch": 0.5843335071867597, - "grad_norm": 4.209484577178955, - "learning_rate": 3.3095080074170894e-06, - "loss": 0.4179, - "mean_token_accuracy": 0.8683418110013008, - "num_tokens": 226779243.0, - "step": 188500 - }, - { - "entropy": 1.7633845642209054, - "epoch": 0.5843645063118094, - "grad_norm": 4.389909267425537, - "learning_rate": 3.309420225085764e-06, - "loss": 0.371, - "mean_token_accuracy": 0.87280925065279, - "num_tokens": 226791987.0, - "step": 188510 - }, - { - "entropy": 1.9105349063873291, - "epoch": 0.584395505436859, - "grad_norm": 8.567315101623535, - "learning_rate": 3.3093324497391587e-06, - "loss": 0.502, - "mean_token_accuracy": 0.8505501300096512, - "num_tokens": 226802929.0, - "step": 188520 - }, - { - "entropy": 1.8890083134174347, - "epoch": 0.5844265045619087, - "grad_norm": 7.157079696655273, - "learning_rate": 3.3092446813763467e-06, - "loss": 0.4592, - "mean_token_accuracy": 0.8604133263230324, - "num_tokens": 226814577.0, - "step": 188530 - }, - { - "entropy": 1.93582623898983, - "epoch": 0.5844575036869585, - "grad_norm": 7.214765548706055, - "learning_rate": 3.309156919996403e-06, - "loss": 0.4797, - "mean_token_accuracy": 0.8459798738360405, - "num_tokens": 226825129.0, - "step": 188540 - }, - { - "entropy": 1.8311146199703217, - "epoch": 0.5844885028120081, - "grad_norm": 4.586556434631348, - "learning_rate": 3.309069165598401e-06, - "loss": 0.4468, - "mean_token_accuracy": 0.8530444666743279, - "num_tokens": 226837973.0, - "step": 188550 - }, - { - "entropy": 1.8696679010987283, - "epoch": 0.5845195019370578, - "grad_norm": 7.468491077423096, - "learning_rate": 3.308981418181415e-06, - "loss": 0.3964, - "mean_token_accuracy": 0.8730146139860153, - "num_tokens": 226849471.0, - "step": 188560 - }, - { - "entropy": 1.9135367214679717, - "epoch": 0.5845505010621075, - "grad_norm": 8.727263450622559, - "learning_rate": 3.3088936777445195e-06, - "loss": 0.4769, - "mean_token_accuracy": 0.8547710910439491, - "num_tokens": 226860616.0, - "step": 188570 - }, - { - "entropy": 1.8736458599567414, - "epoch": 0.5845815001871573, - "grad_norm": 4.631098747253418, - "learning_rate": 3.3088059442867896e-06, - "loss": 0.4295, - "mean_token_accuracy": 0.8680781036615371, - "num_tokens": 226871990.0, - "step": 188580 - }, - { - "entropy": 1.8468430042266846, - "epoch": 0.5846124993122069, - "grad_norm": 4.6543049812316895, - "learning_rate": 3.3087182178072996e-06, - "loss": 0.3891, - "mean_token_accuracy": 0.8615953475236893, - "num_tokens": 226884227.0, - "step": 188590 - }, - { - "entropy": 1.8528510972857475, - "epoch": 0.5846434984372566, - "grad_norm": 3.4963319301605225, - "learning_rate": 3.308630498305125e-06, - "loss": 0.4341, - "mean_token_accuracy": 0.8522498548030853, - "num_tokens": 226896429.0, - "step": 188600 - }, - { - "entropy": 1.7623168423771858, - "epoch": 0.5846744975623063, - "grad_norm": 3.6100804805755615, - "learning_rate": 3.30854278577934e-06, - "loss": 0.3711, - "mean_token_accuracy": 0.8676817521452904, - "num_tokens": 226909499.0, - "step": 188610 - }, - { - "entropy": 1.8766853883862495, - "epoch": 0.584705496687356, - "grad_norm": 4.469761848449707, - "learning_rate": 3.308455080229021e-06, - "loss": 0.4275, - "mean_token_accuracy": 0.8567523375153542, - "num_tokens": 226922210.0, - "step": 188620 - }, - { - "entropy": 1.9278080672025681, - "epoch": 0.5847364958124057, - "grad_norm": 7.888815402984619, - "learning_rate": 3.308367381653242e-06, - "loss": 0.4545, - "mean_token_accuracy": 0.851407541334629, - "num_tokens": 226933618.0, - "step": 188630 - }, - { - "entropy": 1.892588023841381, - "epoch": 0.5847674949374554, - "grad_norm": 8.37109661102295, - "learning_rate": 3.3082796900510807e-06, - "loss": 0.4633, - "mean_token_accuracy": 0.8398651763796806, - "num_tokens": 226945299.0, - "step": 188640 - }, - { - "entropy": 1.9134973019361496, - "epoch": 0.5847984940625051, - "grad_norm": 8.243192672729492, - "learning_rate": 3.3081920054216115e-06, - "loss": 0.4962, - "mean_token_accuracy": 0.8384934589266777, - "num_tokens": 226956769.0, - "step": 188650 - }, - { - "entropy": 1.9360808670520782, - "epoch": 0.5848294931875547, - "grad_norm": 8.460143089294434, - "learning_rate": 3.308104327763911e-06, - "loss": 0.5219, - "mean_token_accuracy": 0.8484696820378304, - "num_tokens": 226967465.0, - "step": 188660 - }, - { - "entropy": 1.8962484419345855, - "epoch": 0.5848604923126045, - "grad_norm": 9.766073226928711, - "learning_rate": 3.308016657077055e-06, - "loss": 0.4365, - "mean_token_accuracy": 0.8597484767436981, - "num_tokens": 226978952.0, - "step": 188670 - }, - { - "entropy": 1.8484892681241036, - "epoch": 0.5848914914376542, - "grad_norm": 8.785158157348633, - "learning_rate": 3.3079289933601196e-06, - "loss": 0.4598, - "mean_token_accuracy": 0.8503390610218048, - "num_tokens": 226990498.0, - "step": 188680 - }, - { - "entropy": 1.9609992295503615, - "epoch": 0.5849224905627038, - "grad_norm": 8.992911338806152, - "learning_rate": 3.307841336612182e-06, - "loss": 0.4542, - "mean_token_accuracy": 0.8564995005726814, - "num_tokens": 227001098.0, - "step": 188690 - }, - { - "entropy": 1.7960527956485748, - "epoch": 0.5849534896877535, - "grad_norm": 4.051634311676025, - "learning_rate": 3.307753686832319e-06, - "loss": 0.3667, - "mean_token_accuracy": 0.8742998942732811, - "num_tokens": 227014146.0, - "step": 188700 - }, - { - "entropy": 1.8706871971488, - "epoch": 0.5849844888128033, - "grad_norm": 10.606660842895508, - "learning_rate": 3.3076660440196073e-06, - "loss": 0.4624, - "mean_token_accuracy": 0.8497942507266998, - "num_tokens": 227025140.0, - "step": 188710 - }, - { - "entropy": 1.8714196056127548, - "epoch": 0.585015487937853, - "grad_norm": 5.643126964569092, - "learning_rate": 3.307578408173123e-06, - "loss": 0.4442, - "mean_token_accuracy": 0.8556253165006638, - "num_tokens": 227036917.0, - "step": 188720 - }, - { - "entropy": 1.7751472130417825, - "epoch": 0.5850464870629026, - "grad_norm": 9.494732856750488, - "learning_rate": 3.307490779291944e-06, - "loss": 0.3812, - "mean_token_accuracy": 0.8668629273772239, - "num_tokens": 227049736.0, - "step": 188730 - }, - { - "entropy": 1.8384251773357392, - "epoch": 0.5850774861879523, - "grad_norm": 8.89919376373291, - "learning_rate": 3.307403157375148e-06, - "loss": 0.4083, - "mean_token_accuracy": 0.8520896717905998, - "num_tokens": 227061666.0, - "step": 188740 - }, - { - "entropy": 1.8475905492901803, - "epoch": 0.5851084853130021, - "grad_norm": 8.330229759216309, - "learning_rate": 3.307315542421812e-06, - "loss": 0.4514, - "mean_token_accuracy": 0.8590241074562073, - "num_tokens": 227073116.0, - "step": 188750 - }, - { - "entropy": 1.7911785036325454, - "epoch": 0.5851394844380517, - "grad_norm": 7.692288398742676, - "learning_rate": 3.307227934431014e-06, - "loss": 0.3705, - "mean_token_accuracy": 0.8710220009088516, - "num_tokens": 227085773.0, - "step": 188760 - }, - { - "entropy": 1.8938721969723702, - "epoch": 0.5851704835631014, - "grad_norm": 3.932691812515259, - "learning_rate": 3.3071403334018326e-06, - "loss": 0.4701, - "mean_token_accuracy": 0.8493056833744049, - "num_tokens": 227097247.0, - "step": 188770 - }, - { - "entropy": 1.708780439198017, - "epoch": 0.5852014826881511, - "grad_norm": 3.246295690536499, - "learning_rate": 3.307052739333345e-06, - "loss": 0.344, - "mean_token_accuracy": 0.8691340193152428, - "num_tokens": 227111719.0, - "step": 188780 - }, - { - "entropy": 1.897132021188736, - "epoch": 0.5852324818132009, - "grad_norm": 8.147296905517578, - "learning_rate": 3.3069651522246294e-06, - "loss": 0.4624, - "mean_token_accuracy": 0.8504904195666313, - "num_tokens": 227123856.0, - "step": 188790 - }, - { - "entropy": 1.9081890538334847, - "epoch": 0.5852634809382505, - "grad_norm": 8.91196346282959, - "learning_rate": 3.3068775720747642e-06, - "loss": 0.4584, - "mean_token_accuracy": 0.8543705970048905, - "num_tokens": 227134956.0, - "step": 188800 - }, - { - "entropy": 1.8409344106912613, - "epoch": 0.5852944800633002, - "grad_norm": 7.536337375640869, - "learning_rate": 3.306789998882828e-06, - "loss": 0.4559, - "mean_token_accuracy": 0.8481650918722152, - "num_tokens": 227146690.0, - "step": 188810 - }, - { - "entropy": 1.7761472210288047, - "epoch": 0.5853254791883499, - "grad_norm": 9.95036506652832, - "learning_rate": 3.3067024326478997e-06, - "loss": 0.3968, - "mean_token_accuracy": 0.8567533954977989, - "num_tokens": 227159423.0, - "step": 188820 - }, - { - "entropy": 1.822265100479126, - "epoch": 0.5853564783133997, - "grad_norm": 4.746108531951904, - "learning_rate": 3.3066148733690585e-06, - "loss": 0.4361, - "mean_token_accuracy": 0.8569403976202011, - "num_tokens": 227171637.0, - "step": 188830 - }, - { - "entropy": 1.7964641809463502, - "epoch": 0.5853874774384493, - "grad_norm": 5.9528374671936035, - "learning_rate": 3.306527321045383e-06, - "loss": 0.4271, - "mean_token_accuracy": 0.8559579864144325, - "num_tokens": 227184996.0, - "step": 188840 - }, - { - "entropy": 1.8235788196325302, - "epoch": 0.585418476563499, - "grad_norm": 9.29820442199707, - "learning_rate": 3.3064397756759524e-06, - "loss": 0.4037, - "mean_token_accuracy": 0.8652059838175774, - "num_tokens": 227196975.0, - "step": 188850 - }, - { - "entropy": 1.9079763412475585, - "epoch": 0.5854494756885487, - "grad_norm": 7.375221252441406, - "learning_rate": 3.3063522372598465e-06, - "loss": 0.4912, - "mean_token_accuracy": 0.8551223009824753, - "num_tokens": 227208072.0, - "step": 188860 - }, - { - "entropy": 1.8686128735542298, - "epoch": 0.5854804748135984, - "grad_norm": 3.882939338684082, - "learning_rate": 3.3062647057961455e-06, - "loss": 0.4247, - "mean_token_accuracy": 0.8613992780447006, - "num_tokens": 227219452.0, - "step": 188870 - }, - { - "entropy": 1.8170276552438736, - "epoch": 0.5855114739386481, - "grad_norm": 8.03610897064209, - "learning_rate": 3.306177181283928e-06, - "loss": 0.3851, - "mean_token_accuracy": 0.8691373020410538, - "num_tokens": 227232288.0, - "step": 188880 - }, - { - "entropy": 1.7830303296446801, - "epoch": 0.5855424730636978, - "grad_norm": 4.951107978820801, - "learning_rate": 3.3060896637222738e-06, - "loss": 0.4251, - "mean_token_accuracy": 0.8494992271065712, - "num_tokens": 227245140.0, - "step": 188890 - }, - { - "entropy": 1.808831486105919, - "epoch": 0.5855734721887474, - "grad_norm": 9.004613876342773, - "learning_rate": 3.3060021531102636e-06, - "loss": 0.4261, - "mean_token_accuracy": 0.8603759378194809, - "num_tokens": 227258195.0, - "step": 188900 - }, - { - "entropy": 1.8654738262295723, - "epoch": 0.5856044713137971, - "grad_norm": 8.35360336303711, - "learning_rate": 3.3059146494469785e-06, - "loss": 0.4873, - "mean_token_accuracy": 0.8470656216144562, - "num_tokens": 227269403.0, - "step": 188910 - }, - { - "entropy": 1.90666244328022, - "epoch": 0.5856354704388469, - "grad_norm": 7.269579887390137, - "learning_rate": 3.3058271527314976e-06, - "loss": 0.4363, - "mean_token_accuracy": 0.8671371981501579, - "num_tokens": 227281068.0, - "step": 188920 - }, - { - "entropy": 1.8575873374938965, - "epoch": 0.5856664695638966, - "grad_norm": 7.192220211029053, - "learning_rate": 3.3057396629629024e-06, - "loss": 0.4337, - "mean_token_accuracy": 0.8495822846889496, - "num_tokens": 227292522.0, - "step": 188930 - }, - { - "entropy": 1.8657715290784835, - "epoch": 0.5856974686889462, - "grad_norm": 7.0419135093688965, - "learning_rate": 3.305652180140273e-06, - "loss": 0.4059, - "mean_token_accuracy": 0.8614326119422913, - "num_tokens": 227304770.0, - "step": 188940 - }, - { - "entropy": 1.9066298857331276, - "epoch": 0.5857284678139959, - "grad_norm": 7.648441791534424, - "learning_rate": 3.3055647042626904e-06, - "loss": 0.4575, - "mean_token_accuracy": 0.8575715780258178, - "num_tokens": 227315975.0, - "step": 188950 - }, - { - "entropy": 1.8731012284755706, - "epoch": 0.5857594669390457, - "grad_norm": 6.847329139709473, - "learning_rate": 3.305477235329237e-06, - "loss": 0.468, - "mean_token_accuracy": 0.8527000054717064, - "num_tokens": 227327413.0, - "step": 188960 - }, - { - "entropy": 1.8332221895456313, - "epoch": 0.5857904660640953, - "grad_norm": 9.892985343933105, - "learning_rate": 3.305389773338992e-06, - "loss": 0.3955, - "mean_token_accuracy": 0.8629180341959, - "num_tokens": 227339150.0, - "step": 188970 - }, - { - "entropy": 1.88137661293149, - "epoch": 0.585821465189145, - "grad_norm": 8.720916748046875, - "learning_rate": 3.305302318291039e-06, - "loss": 0.452, - "mean_token_accuracy": 0.8541831061244011, - "num_tokens": 227350744.0, - "step": 188980 - }, - { - "entropy": 1.7486733600497246, - "epoch": 0.5858524643141947, - "grad_norm": 8.791574478149414, - "learning_rate": 3.3052148701844576e-06, - "loss": 0.3755, - "mean_token_accuracy": 0.87051263153553, - "num_tokens": 227364249.0, - "step": 188990 - }, - { - "entropy": 1.8991805881261825, - "epoch": 0.5858834634392445, - "grad_norm": 7.9490461349487305, - "learning_rate": 3.3051274290183317e-06, - "loss": 0.4683, - "mean_token_accuracy": 0.8558553963899612, - "num_tokens": 227375754.0, - "step": 189000 - }, - { - "entropy": 1.895850320160389, - "epoch": 0.5859144625642941, - "grad_norm": 8.310858726501465, - "learning_rate": 3.305039994791741e-06, - "loss": 0.4559, - "mean_token_accuracy": 0.85080985724926, - "num_tokens": 227387025.0, - "step": 189010 - }, - { - "entropy": 1.8519131153821946, - "epoch": 0.5859454616893438, - "grad_norm": 2.4337899684906006, - "learning_rate": 3.3049525675037696e-06, - "loss": 0.4305, - "mean_token_accuracy": 0.8626031950116158, - "num_tokens": 227399366.0, - "step": 189020 - }, - { - "entropy": 1.8202516600489616, - "epoch": 0.5859764608143935, - "grad_norm": 3.6557323932647705, - "learning_rate": 3.3048651471535e-06, - "loss": 0.4176, - "mean_token_accuracy": 0.8586086541414261, - "num_tokens": 227411475.0, - "step": 189030 - }, - { - "entropy": 1.7547407761216163, - "epoch": 0.5860074599394433, - "grad_norm": 6.2841997146606445, - "learning_rate": 3.304777733740012e-06, - "loss": 0.3648, - "mean_token_accuracy": 0.8678300872445106, - "num_tokens": 227425090.0, - "step": 189040 - }, - { - "entropy": 1.876267357170582, - "epoch": 0.5860384590644929, - "grad_norm": 4.17917537689209, - "learning_rate": 3.304690327262391e-06, - "loss": 0.4658, - "mean_token_accuracy": 0.8612060084939003, - "num_tokens": 227436823.0, - "step": 189050 - }, - { - "entropy": 1.8834017142653465, - "epoch": 0.5860694581895426, - "grad_norm": 11.105561256408691, - "learning_rate": 3.304602927719719e-06, - "loss": 0.4476, - "mean_token_accuracy": 0.8533930122852326, - "num_tokens": 227447772.0, - "step": 189060 - }, - { - "entropy": 1.8341946393251418, - "epoch": 0.5861004573145923, - "grad_norm": 8.324958801269531, - "learning_rate": 3.3045155351110786e-06, - "loss": 0.4218, - "mean_token_accuracy": 0.859440878033638, - "num_tokens": 227459616.0, - "step": 189070 - }, - { - "entropy": 1.9266793727874756, - "epoch": 0.586131456439642, - "grad_norm": 8.78128719329834, - "learning_rate": 3.304428149435554e-06, - "loss": 0.5021, - "mean_token_accuracy": 0.8393941059708595, - "num_tokens": 227470147.0, - "step": 189080 - }, - { - "entropy": 1.8669157862663268, - "epoch": 0.5861624555646917, - "grad_norm": 3.7182395458221436, - "learning_rate": 3.304340770692227e-06, - "loss": 0.4446, - "mean_token_accuracy": 0.8524700611829757, - "num_tokens": 227481869.0, - "step": 189090 - }, - { - "entropy": 1.8480639606714249, - "epoch": 0.5861934546897414, - "grad_norm": 7.5054168701171875, - "learning_rate": 3.3042533988801816e-06, - "loss": 0.4159, - "mean_token_accuracy": 0.8661205500364304, - "num_tokens": 227493852.0, - "step": 189100 - }, - { - "entropy": 1.8628501027822495, - "epoch": 0.586224453814791, - "grad_norm": 3.835216522216797, - "learning_rate": 3.304166033998502e-06, - "loss": 0.4198, - "mean_token_accuracy": 0.8646493136882782, - "num_tokens": 227505056.0, - "step": 189110 - }, - { - "entropy": 1.8842681765556335, - "epoch": 0.5862554529398408, - "grad_norm": 4.247763633728027, - "learning_rate": 3.304078676046273e-06, - "loss": 0.4195, - "mean_token_accuracy": 0.854621236026287, - "num_tokens": 227516359.0, - "step": 189120 - }, - { - "entropy": 1.8450437039136887, - "epoch": 0.5862864520648905, - "grad_norm": 7.098427772521973, - "learning_rate": 3.303991325022576e-06, - "loss": 0.4048, - "mean_token_accuracy": 0.8628740802407264, - "num_tokens": 227528633.0, - "step": 189130 - }, - { - "entropy": 1.8995537489652634, - "epoch": 0.5863174511899402, - "grad_norm": 9.48597526550293, - "learning_rate": 3.3039039809264976e-06, - "loss": 0.4433, - "mean_token_accuracy": 0.8529835850000381, - "num_tokens": 227539740.0, - "step": 189140 - }, - { - "entropy": 1.8391615077853203, - "epoch": 0.5863484503149898, - "grad_norm": 8.266154289245605, - "learning_rate": 3.303816643757121e-06, - "loss": 0.4701, - "mean_token_accuracy": 0.8522128477692604, - "num_tokens": 227552532.0, - "step": 189150 - }, - { - "entropy": 1.8922418750822545, - "epoch": 0.5863794494400395, - "grad_norm": 8.158439636230469, - "learning_rate": 3.30372931351353e-06, - "loss": 0.4178, - "mean_token_accuracy": 0.8632095634937287, - "num_tokens": 227564255.0, - "step": 189160 - }, - { - "entropy": 1.9275053888559341, - "epoch": 0.5864104485650893, - "grad_norm": 9.102051734924316, - "learning_rate": 3.3036419901948117e-06, - "loss": 0.4718, - "mean_token_accuracy": 0.851666758954525, - "num_tokens": 227574967.0, - "step": 189170 - }, - { - "entropy": 1.8686299338936805, - "epoch": 0.586441447690139, - "grad_norm": 7.848930835723877, - "learning_rate": 3.303554673800049e-06, - "loss": 0.4426, - "mean_token_accuracy": 0.8474835962057113, - "num_tokens": 227587486.0, - "step": 189180 - }, - { - "entropy": 1.9148722842335701, - "epoch": 0.5864724468151886, - "grad_norm": 7.397693157196045, - "learning_rate": 3.3034673643283273e-06, - "loss": 0.4407, - "mean_token_accuracy": 0.8615544393658638, - "num_tokens": 227598667.0, - "step": 189190 - }, - { - "entropy": 1.8950233474373817, - "epoch": 0.5865034459402383, - "grad_norm": 6.918297290802002, - "learning_rate": 3.3033800617787316e-06, - "loss": 0.427, - "mean_token_accuracy": 0.8527185708284378, - "num_tokens": 227610380.0, - "step": 189200 - }, - { - "entropy": 1.9290294289588927, - "epoch": 0.5865344450652881, - "grad_norm": 3.6919219493865967, - "learning_rate": 3.303292766150348e-06, - "loss": 0.4992, - "mean_token_accuracy": 0.8398040190339089, - "num_tokens": 227621658.0, - "step": 189210 - }, - { - "entropy": 1.7984000250697136, - "epoch": 0.5865654441903377, - "grad_norm": 7.641847133636475, - "learning_rate": 3.303205477442261e-06, - "loss": 0.3887, - "mean_token_accuracy": 0.8632738545536995, - "num_tokens": 227634019.0, - "step": 189220 - }, - { - "entropy": 1.8095714911818503, - "epoch": 0.5865964433153874, - "grad_norm": 8.667887687683105, - "learning_rate": 3.303118195653558e-06, - "loss": 0.4209, - "mean_token_accuracy": 0.8591258674860001, - "num_tokens": 227648132.0, - "step": 189230 - }, - { - "entropy": 1.8297943592071533, - "epoch": 0.5866274424404371, - "grad_norm": 9.270956993103027, - "learning_rate": 3.3030309207833233e-06, - "loss": 0.3771, - "mean_token_accuracy": 0.8597560048103332, - "num_tokens": 227660944.0, - "step": 189240 - }, - { - "entropy": 1.870998741686344, - "epoch": 0.5866584415654869, - "grad_norm": 9.251309394836426, - "learning_rate": 3.3029436528306434e-06, - "loss": 0.4482, - "mean_token_accuracy": 0.853928117454052, - "num_tokens": 227673235.0, - "step": 189250 - }, - { - "entropy": 1.8127895906567573, - "epoch": 0.5866894406905365, - "grad_norm": 8.351231575012207, - "learning_rate": 3.302856391794605e-06, - "loss": 0.431, - "mean_token_accuracy": 0.8543845057487488, - "num_tokens": 227686452.0, - "step": 189260 - }, - { - "entropy": 1.7898150980472565, - "epoch": 0.5867204398155862, - "grad_norm": 2.87815523147583, - "learning_rate": 3.3027691376742943e-06, - "loss": 0.392, - "mean_token_accuracy": 0.8682875841856003, - "num_tokens": 227699484.0, - "step": 189270 - }, - { - "entropy": 1.961062216758728, - "epoch": 0.5867514389406359, - "grad_norm": 4.274531364440918, - "learning_rate": 3.3026818904687975e-06, - "loss": 0.4872, - "mean_token_accuracy": 0.8424155503511429, - "num_tokens": 227710193.0, - "step": 189280 - }, - { - "entropy": 1.8331158310174942, - "epoch": 0.5867824380656856, - "grad_norm": 7.7590012550354, - "learning_rate": 3.3025946501772012e-06, - "loss": 0.3811, - "mean_token_accuracy": 0.8739141911268234, - "num_tokens": 227722944.0, - "step": 189290 - }, - { - "entropy": 1.8074917957186698, - "epoch": 0.5868134371907353, - "grad_norm": 2.69134259223938, - "learning_rate": 3.3025074167985928e-06, - "loss": 0.4411, - "mean_token_accuracy": 0.8565651878714562, - "num_tokens": 227735625.0, - "step": 189300 - }, - { - "entropy": 1.8668998405337334, - "epoch": 0.586844436315785, - "grad_norm": 8.62458610534668, - "learning_rate": 3.3024201903320586e-06, - "loss": 0.4129, - "mean_token_accuracy": 0.8737411797046661, - "num_tokens": 227746528.0, - "step": 189310 - }, - { - "entropy": 1.785508194565773, - "epoch": 0.5868754354408346, - "grad_norm": 8.008922576904297, - "learning_rate": 3.3023329707766865e-06, - "loss": 0.3802, - "mean_token_accuracy": 0.8677184686064721, - "num_tokens": 227759227.0, - "step": 189320 - }, - { - "entropy": 1.8027926161885262, - "epoch": 0.5869064345658844, - "grad_norm": 6.850667953491211, - "learning_rate": 3.3022457581315642e-06, - "loss": 0.3878, - "mean_token_accuracy": 0.8741619393229485, - "num_tokens": 227772084.0, - "step": 189330 - }, - { - "entropy": 1.7575479179620743, - "epoch": 0.5869374336909341, - "grad_norm": 3.98160719871521, - "learning_rate": 3.302158552395779e-06, - "loss": 0.37, - "mean_token_accuracy": 0.8698368772864342, - "num_tokens": 227785177.0, - "step": 189340 - }, - { - "entropy": 1.8547125473618506, - "epoch": 0.5869684328159838, - "grad_norm": 9.128921508789062, - "learning_rate": 3.302071353568418e-06, - "loss": 0.4529, - "mean_token_accuracy": 0.8492949277162551, - "num_tokens": 227797306.0, - "step": 189350 - }, - { - "entropy": 1.8572920322418214, - "epoch": 0.5869994319410334, - "grad_norm": 8.80993938446045, - "learning_rate": 3.3019841616485705e-06, - "loss": 0.441, - "mean_token_accuracy": 0.8457432359457016, - "num_tokens": 227809558.0, - "step": 189360 - }, - { - "entropy": 1.8914913788437844, - "epoch": 0.5870304310660832, - "grad_norm": 8.738089561462402, - "learning_rate": 3.3018969766353227e-06, - "loss": 0.4547, - "mean_token_accuracy": 0.8606926470994949, - "num_tokens": 227820941.0, - "step": 189370 - }, - { - "entropy": 1.917687328159809, - "epoch": 0.5870614301911329, - "grad_norm": 7.077282428741455, - "learning_rate": 3.3018097985277643e-06, - "loss": 0.4211, - "mean_token_accuracy": 0.8651439309120178, - "num_tokens": 227832317.0, - "step": 189380 - }, - { - "entropy": 1.8793218448758124, - "epoch": 0.5870924293161826, - "grad_norm": 9.505925178527832, - "learning_rate": 3.301722627324983e-06, - "loss": 0.4959, - "mean_token_accuracy": 0.8499870494008064, - "num_tokens": 227844172.0, - "step": 189390 - }, - { - "entropy": 1.9014255598187446, - "epoch": 0.5871234284412322, - "grad_norm": 8.800775527954102, - "learning_rate": 3.3016354630260677e-06, - "loss": 0.4709, - "mean_token_accuracy": 0.8577530473470688, - "num_tokens": 227855598.0, - "step": 189400 - }, - { - "entropy": 1.9350775092840196, - "epoch": 0.5871544275662819, - "grad_norm": 8.164290428161621, - "learning_rate": 3.3015483056301072e-06, - "loss": 0.5224, - "mean_token_accuracy": 0.8480610802769661, - "num_tokens": 227866126.0, - "step": 189410 - }, - { - "entropy": 1.8243651062250137, - "epoch": 0.5871854266913317, - "grad_norm": 11.792317390441895, - "learning_rate": 3.3014611551361896e-06, - "loss": 0.4244, - "mean_token_accuracy": 0.8601853996515274, - "num_tokens": 227878353.0, - "step": 189420 - }, - { - "entropy": 1.8115493163466454, - "epoch": 0.5872164258163813, - "grad_norm": 2.0419137477874756, - "learning_rate": 3.3013740115434056e-06, - "loss": 0.4208, - "mean_token_accuracy": 0.8694847837090492, - "num_tokens": 227891678.0, - "step": 189430 - }, - { - "entropy": 1.808267669379711, - "epoch": 0.587247424941431, - "grad_norm": 7.618503570556641, - "learning_rate": 3.301286874850844e-06, - "loss": 0.4137, - "mean_token_accuracy": 0.8591461583971978, - "num_tokens": 227903837.0, - "step": 189440 - }, - { - "entropy": 1.875668992102146, - "epoch": 0.5872784240664807, - "grad_norm": 8.09911060333252, - "learning_rate": 3.301199745057593e-06, - "loss": 0.4322, - "mean_token_accuracy": 0.8624072283506393, - "num_tokens": 227915966.0, - "step": 189450 - }, - { - "entropy": 1.9233120754361153, - "epoch": 0.5873094231915305, - "grad_norm": 8.465187072753906, - "learning_rate": 3.3011126221627424e-06, - "loss": 0.4759, - "mean_token_accuracy": 0.8484439864754677, - "num_tokens": 227927615.0, - "step": 189460 - }, - { - "entropy": 1.8734351724386216, - "epoch": 0.5873404223165801, - "grad_norm": 8.21156120300293, - "learning_rate": 3.301025506165383e-06, - "loss": 0.4166, - "mean_token_accuracy": 0.8689304769039154, - "num_tokens": 227939154.0, - "step": 189470 - }, - { - "entropy": 1.802188740670681, - "epoch": 0.5873714214416298, - "grad_norm": 8.768073081970215, - "learning_rate": 3.3009383970646043e-06, - "loss": 0.3884, - "mean_token_accuracy": 0.8656351312994957, - "num_tokens": 227951955.0, - "step": 189480 - }, - { - "entropy": 1.8910615742206573, - "epoch": 0.5874024205666795, - "grad_norm": 8.669722557067871, - "learning_rate": 3.3008512948594968e-06, - "loss": 0.4498, - "mean_token_accuracy": 0.8541586175560951, - "num_tokens": 227963163.0, - "step": 189490 - }, - { - "entropy": 1.8721488162875175, - "epoch": 0.5874334196917292, - "grad_norm": 2.82273530960083, - "learning_rate": 3.300764199549149e-06, - "loss": 0.4317, - "mean_token_accuracy": 0.8580455660820008, - "num_tokens": 227975840.0, - "step": 189500 - }, - { - "entropy": 1.8127721384167672, - "epoch": 0.5874644188167789, - "grad_norm": 7.7262115478515625, - "learning_rate": 3.300677111132654e-06, - "loss": 0.3995, - "mean_token_accuracy": 0.8592531457543373, - "num_tokens": 227988830.0, - "step": 189510 - }, - { - "entropy": 1.9156973659992218, - "epoch": 0.5874954179418286, - "grad_norm": 9.007680892944336, - "learning_rate": 3.3005900296091005e-06, - "loss": 0.502, - "mean_token_accuracy": 0.84843979626894, - "num_tokens": 227999884.0, - "step": 189520 - }, - { - "entropy": 1.8657537907361985, - "epoch": 0.5875264170668782, - "grad_norm": 7.236568450927734, - "learning_rate": 3.3005029549775797e-06, - "loss": 0.4632, - "mean_token_accuracy": 0.8456703305244446, - "num_tokens": 228012514.0, - "step": 189530 - }, - { - "entropy": 1.8161449700593948, - "epoch": 0.587557416191928, - "grad_norm": 6.984272480010986, - "learning_rate": 3.3004158872371827e-06, - "loss": 0.4328, - "mean_token_accuracy": 0.8517293512821198, - "num_tokens": 228025517.0, - "step": 189540 - }, - { - "entropy": 1.8823605373501777, - "epoch": 0.5875884153169777, - "grad_norm": 8.093108177185059, - "learning_rate": 3.300328826387e-06, - "loss": 0.4367, - "mean_token_accuracy": 0.85855952501297, - "num_tokens": 228037560.0, - "step": 189550 - }, - { - "entropy": 1.9040069863200189, - "epoch": 0.5876194144420274, - "grad_norm": 7.7805399894714355, - "learning_rate": 3.300241772426124e-06, - "loss": 0.4507, - "mean_token_accuracy": 0.8509999126195907, - "num_tokens": 228049640.0, - "step": 189560 - }, - { - "entropy": 1.828610323369503, - "epoch": 0.587650413567077, - "grad_norm": 7.373439788818359, - "learning_rate": 3.3001547253536457e-06, - "loss": 0.4386, - "mean_token_accuracy": 0.8533773705363273, - "num_tokens": 228062020.0, - "step": 189570 - }, - { - "entropy": 1.8179884567856788, - "epoch": 0.5876814126921268, - "grad_norm": 6.4519782066345215, - "learning_rate": 3.3000676851686556e-06, - "loss": 0.4151, - "mean_token_accuracy": 0.8558628976345062, - "num_tokens": 228073965.0, - "step": 189580 - }, - { - "entropy": 1.9012035757303238, - "epoch": 0.5877124118171765, - "grad_norm": 7.507246017456055, - "learning_rate": 3.2999806518702484e-06, - "loss": 0.443, - "mean_token_accuracy": 0.8569813266396522, - "num_tokens": 228084587.0, - "step": 189590 - }, - { - "entropy": 1.8052480682730674, - "epoch": 0.5877434109422262, - "grad_norm": 8.202247619628906, - "learning_rate": 3.2998936254575126e-06, - "loss": 0.399, - "mean_token_accuracy": 0.8647924482822418, - "num_tokens": 228097593.0, - "step": 189600 - }, - { - "entropy": 1.8566293939948082, - "epoch": 0.5877744100672758, - "grad_norm": 8.915870666503906, - "learning_rate": 3.299806605929542e-06, - "loss": 0.4294, - "mean_token_accuracy": 0.8599877521395684, - "num_tokens": 228109710.0, - "step": 189610 - }, - { - "entropy": 1.876941440999508, - "epoch": 0.5878054091923256, - "grad_norm": 7.945460796356201, - "learning_rate": 3.2997195932854287e-06, - "loss": 0.4675, - "mean_token_accuracy": 0.8425981119275093, - "num_tokens": 228121176.0, - "step": 189620 - }, - { - "entropy": 1.8251575097441672, - "epoch": 0.5878364083173753, - "grad_norm": 7.691454887390137, - "learning_rate": 3.2996325875242646e-06, - "loss": 0.4071, - "mean_token_accuracy": 0.859329092502594, - "num_tokens": 228133888.0, - "step": 189630 - }, - { - "entropy": 1.7917707473039628, - "epoch": 0.5878674074424249, - "grad_norm": 7.17902946472168, - "learning_rate": 3.299545588645143e-06, - "loss": 0.3799, - "mean_token_accuracy": 0.8672048881649971, - "num_tokens": 228146650.0, - "step": 189640 - }, - { - "entropy": 1.897452747821808, - "epoch": 0.5878984065674746, - "grad_norm": 9.511686325073242, - "learning_rate": 3.299458596647157e-06, - "loss": 0.4981, - "mean_token_accuracy": 0.8457076713442803, - "num_tokens": 228158498.0, - "step": 189650 - }, - { - "entropy": 1.8302209421992301, - "epoch": 0.5879294056925243, - "grad_norm": 7.85939359664917, - "learning_rate": 3.2993716115293996e-06, - "loss": 0.3928, - "mean_token_accuracy": 0.8569305628538132, - "num_tokens": 228171347.0, - "step": 189660 - }, - { - "entropy": 1.8603603541851044, - "epoch": 0.5879604048175741, - "grad_norm": 8.279330253601074, - "learning_rate": 3.299284633290962e-06, - "loss": 0.4573, - "mean_token_accuracy": 0.850348000228405, - "num_tokens": 228183313.0, - "step": 189670 - }, - { - "entropy": 1.8520802944898604, - "epoch": 0.5879914039426237, - "grad_norm": 6.35352897644043, - "learning_rate": 3.299197661930939e-06, - "loss": 0.3909, - "mean_token_accuracy": 0.8683344289660454, - "num_tokens": 228195055.0, - "step": 189680 - }, - { - "entropy": 1.8788090363144874, - "epoch": 0.5880224030676734, - "grad_norm": 9.007162094116211, - "learning_rate": 3.2991106974484244e-06, - "loss": 0.4573, - "mean_token_accuracy": 0.8503051429986954, - "num_tokens": 228206727.0, - "step": 189690 - }, - { - "entropy": 1.9124977201223374, - "epoch": 0.5880534021927231, - "grad_norm": 7.327961444854736, - "learning_rate": 3.299023739842512e-06, - "loss": 0.4265, - "mean_token_accuracy": 0.8595570877194405, - "num_tokens": 228218308.0, - "step": 189700 - }, - { - "entropy": 1.8749097779393196, - "epoch": 0.5880844013177728, - "grad_norm": 8.713927268981934, - "learning_rate": 3.2989367891122944e-06, - "loss": 0.4444, - "mean_token_accuracy": 0.8532299667596817, - "num_tokens": 228229913.0, - "step": 189710 - }, - { - "entropy": 1.7920349642634392, - "epoch": 0.5881154004428225, - "grad_norm": 4.948719024658203, - "learning_rate": 3.2988498452568652e-06, - "loss": 0.4064, - "mean_token_accuracy": 0.869075883924961, - "num_tokens": 228242445.0, - "step": 189720 - }, - { - "entropy": 1.890162006020546, - "epoch": 0.5881463995678722, - "grad_norm": 7.9683356285095215, - "learning_rate": 3.29876290827532e-06, - "loss": 0.4604, - "mean_token_accuracy": 0.8604482188820839, - "num_tokens": 228253723.0, - "step": 189730 - }, - { - "entropy": 1.8672292068600655, - "epoch": 0.5881773986929218, - "grad_norm": 3.6988465785980225, - "learning_rate": 3.298675978166752e-06, - "loss": 0.4334, - "mean_token_accuracy": 0.8574666917324066, - "num_tokens": 228265723.0, - "step": 189740 - }, - { - "entropy": 1.9488049060106278, - "epoch": 0.5882083978179716, - "grad_norm": 9.950399398803711, - "learning_rate": 3.298589054930257e-06, - "loss": 0.5182, - "mean_token_accuracy": 0.8396835044026375, - "num_tokens": 228276302.0, - "step": 189750 - }, - { - "entropy": 1.923870074748993, - "epoch": 0.5882393969430213, - "grad_norm": 10.66832160949707, - "learning_rate": 3.298502138564928e-06, - "loss": 0.5146, - "mean_token_accuracy": 0.8430263265967369, - "num_tokens": 228287766.0, - "step": 189760 - }, - { - "entropy": 1.8855630785226822, - "epoch": 0.588270396068071, - "grad_norm": 8.5703125, - "learning_rate": 3.298415229069861e-06, - "loss": 0.4446, - "mean_token_accuracy": 0.8629995420575142, - "num_tokens": 228298917.0, - "step": 189770 - }, - { - "entropy": 1.9425514459609985, - "epoch": 0.5883013951931206, - "grad_norm": 8.09411907196045, - "learning_rate": 3.29832832644415e-06, - "loss": 0.4893, - "mean_token_accuracy": 0.8434894353151321, - "num_tokens": 228310478.0, - "step": 189780 - }, - { - "entropy": 1.860796995460987, - "epoch": 0.5883323943181704, - "grad_norm": 5.034776210784912, - "learning_rate": 3.2982414306868905e-06, - "loss": 0.4634, - "mean_token_accuracy": 0.8519127145409584, - "num_tokens": 228322635.0, - "step": 189790 - }, - { - "entropy": 1.9245415329933167, - "epoch": 0.5883633934432201, - "grad_norm": 3.561757802963257, - "learning_rate": 3.2981545417971785e-06, - "loss": 0.4799, - "mean_token_accuracy": 0.8493681281805039, - "num_tokens": 228334274.0, - "step": 189800 - }, - { - "entropy": 1.9446907877922057, - "epoch": 0.5883943925682698, - "grad_norm": 6.972831726074219, - "learning_rate": 3.2980676597741084e-06, - "loss": 0.4795, - "mean_token_accuracy": 0.8560405924916268, - "num_tokens": 228345048.0, - "step": 189810 - }, - { - "entropy": 1.8980532869696618, - "epoch": 0.5884253916933194, - "grad_norm": 7.752938747406006, - "learning_rate": 3.297980784616776e-06, - "loss": 0.4386, - "mean_token_accuracy": 0.861751252412796, - "num_tokens": 228356615.0, - "step": 189820 - }, - { - "entropy": 1.8524247661232949, - "epoch": 0.5884563908183692, - "grad_norm": 3.781128168106079, - "learning_rate": 3.297893916324278e-06, - "loss": 0.4092, - "mean_token_accuracy": 0.8605551645159721, - "num_tokens": 228368712.0, - "step": 189830 - }, - { - "entropy": 1.7883019983768462, - "epoch": 0.5884873899434189, - "grad_norm": 7.834524154663086, - "learning_rate": 3.2978070548957085e-06, - "loss": 0.415, - "mean_token_accuracy": 0.8655453786253929, - "num_tokens": 228381568.0, - "step": 189840 - }, - { - "entropy": 1.8673112966120242, - "epoch": 0.5885183890684685, - "grad_norm": 7.856258869171143, - "learning_rate": 3.297720200330167e-06, - "loss": 0.4367, - "mean_token_accuracy": 0.8590942695736885, - "num_tokens": 228393610.0, - "step": 189850 - }, - { - "entropy": 1.934214359521866, - "epoch": 0.5885493881935182, - "grad_norm": 8.630729675292969, - "learning_rate": 3.297633352626745e-06, - "loss": 0.5183, - "mean_token_accuracy": 0.8391479954123497, - "num_tokens": 228404573.0, - "step": 189860 - }, - { - "entropy": 1.8712112307548523, - "epoch": 0.588580387318568, - "grad_norm": 7.566702842712402, - "learning_rate": 3.2975465117845427e-06, - "loss": 0.4238, - "mean_token_accuracy": 0.8533918172121048, - "num_tokens": 228416198.0, - "step": 189870 - }, - { - "entropy": 1.8967519655823708, - "epoch": 0.5886113864436177, - "grad_norm": 6.956945896148682, - "learning_rate": 3.2974596778026564e-06, - "loss": 0.4759, - "mean_token_accuracy": 0.8500938445329667, - "num_tokens": 228428108.0, - "step": 189880 - }, - { - "entropy": 1.8681173652410508, - "epoch": 0.5886423855686673, - "grad_norm": 8.703429222106934, - "learning_rate": 3.2973728506801805e-06, - "loss": 0.4208, - "mean_token_accuracy": 0.8636728703975678, - "num_tokens": 228439922.0, - "step": 189890 - }, - { - "entropy": 1.87923151999712, - "epoch": 0.588673384693717, - "grad_norm": 8.526222229003906, - "learning_rate": 3.297286030416215e-06, - "loss": 0.4416, - "mean_token_accuracy": 0.8607040166854858, - "num_tokens": 228451960.0, - "step": 189900 - }, - { - "entropy": 1.8602784745395184, - "epoch": 0.5887043838187667, - "grad_norm": 8.467865943908691, - "learning_rate": 3.2971992170098548e-06, - "loss": 0.4323, - "mean_token_accuracy": 0.8517221093177796, - "num_tokens": 228464279.0, - "step": 189910 - }, - { - "entropy": 1.7735490411520005, - "epoch": 0.5887353829438164, - "grad_norm": 7.633058547973633, - "learning_rate": 3.2971124104601977e-06, - "loss": 0.3409, - "mean_token_accuracy": 0.8725137710571289, - "num_tokens": 228477690.0, - "step": 189920 - }, - { - "entropy": 1.926728144288063, - "epoch": 0.5887663820688661, - "grad_norm": 8.148148536682129, - "learning_rate": 3.297025610766342e-06, - "loss": 0.5202, - "mean_token_accuracy": 0.8435298100113868, - "num_tokens": 228488250.0, - "step": 189930 - }, - { - "entropy": 1.8616325289011002, - "epoch": 0.5887973811939158, - "grad_norm": 3.8072237968444824, - "learning_rate": 3.2969388179273836e-06, - "loss": 0.5177, - "mean_token_accuracy": 0.8439561665058136, - "num_tokens": 228501099.0, - "step": 189940 - }, - { - "entropy": 1.9076742500066757, - "epoch": 0.5888283803189654, - "grad_norm": 7.454319477081299, - "learning_rate": 3.2968520319424223e-06, - "loss": 0.4693, - "mean_token_accuracy": 0.8690858289599419, - "num_tokens": 228512982.0, - "step": 189950 - }, - { - "entropy": 1.8784878611564637, - "epoch": 0.5888593794440152, - "grad_norm": 7.8391032218933105, - "learning_rate": 3.2967652528105547e-06, - "loss": 0.4506, - "mean_token_accuracy": 0.8554589852690697, - "num_tokens": 228524707.0, - "step": 189960 - }, - { - "entropy": 1.8729647427797318, - "epoch": 0.5888903785690649, - "grad_norm": 8.286604881286621, - "learning_rate": 3.29667848053088e-06, - "loss": 0.4153, - "mean_token_accuracy": 0.8640352576971054, - "num_tokens": 228537429.0, - "step": 189970 - }, - { - "entropy": 1.8447729453444481, - "epoch": 0.5889213776941146, - "grad_norm": 8.21757698059082, - "learning_rate": 3.2965917151024953e-06, - "loss": 0.3909, - "mean_token_accuracy": 0.8646952509880066, - "num_tokens": 228549779.0, - "step": 189980 - }, - { - "entropy": 1.8539592325687408, - "epoch": 0.5889523768191642, - "grad_norm": 8.536675453186035, - "learning_rate": 3.2965049565244996e-06, - "loss": 0.414, - "mean_token_accuracy": 0.8582399547100067, - "num_tokens": 228561851.0, - "step": 189990 - }, - { - "entropy": 1.862364935874939, - "epoch": 0.588983375944214, - "grad_norm": 7.077996253967285, - "learning_rate": 3.2964182047959915e-06, - "loss": 0.3991, - "mean_token_accuracy": 0.8641496703028679, - "num_tokens": 228573974.0, - "step": 190000 - }, - { - "entropy": 1.8656501933932303, - "epoch": 0.5890143750692637, - "grad_norm": 8.515663146972656, - "learning_rate": 3.29633145991607e-06, - "loss": 0.4006, - "mean_token_accuracy": 0.865130890905857, - "num_tokens": 228586362.0, - "step": 190010 - }, - { - "entropy": 1.7867624640464783, - "epoch": 0.5890453741943134, - "grad_norm": 4.023524284362793, - "learning_rate": 3.2962447218838334e-06, - "loss": 0.3593, - "mean_token_accuracy": 0.8638389229774475, - "num_tokens": 228598677.0, - "step": 190020 - }, - { - "entropy": 1.9330232799053193, - "epoch": 0.589076373319363, - "grad_norm": 7.86823844909668, - "learning_rate": 3.2961579906983814e-06, - "loss": 0.4809, - "mean_token_accuracy": 0.8452364414930343, - "num_tokens": 228609772.0, - "step": 190030 - }, - { - "entropy": 1.8227289929986, - "epoch": 0.5891073724444128, - "grad_norm": 7.341483116149902, - "learning_rate": 3.2960712663588136e-06, - "loss": 0.4069, - "mean_token_accuracy": 0.8582392573356629, - "num_tokens": 228622421.0, - "step": 190040 - }, - { - "entropy": 1.8125467792153358, - "epoch": 0.5891383715694625, - "grad_norm": 2.910619020462036, - "learning_rate": 3.2959845488642283e-06, - "loss": 0.3902, - "mean_token_accuracy": 0.8680084839463234, - "num_tokens": 228635235.0, - "step": 190050 - }, - { - "entropy": 1.8080111667513847, - "epoch": 0.5891693706945121, - "grad_norm": 3.3258447647094727, - "learning_rate": 3.2958978382137264e-06, - "loss": 0.3514, - "mean_token_accuracy": 0.8626864954829216, - "num_tokens": 228648296.0, - "step": 190060 - }, - { - "entropy": 1.8418749034404756, - "epoch": 0.5892003698195618, - "grad_norm": 9.099149703979492, - "learning_rate": 3.295811134406407e-06, - "loss": 0.4577, - "mean_token_accuracy": 0.8486756533384323, - "num_tokens": 228660808.0, - "step": 190070 - }, - { - "entropy": 1.9027037620544434, - "epoch": 0.5892313689446116, - "grad_norm": 3.7178332805633545, - "learning_rate": 3.29572443744137e-06, - "loss": 0.4607, - "mean_token_accuracy": 0.8494066312909127, - "num_tokens": 228672586.0, - "step": 190080 - }, - { - "entropy": 1.8233142986893653, - "epoch": 0.5892623680696613, - "grad_norm": 7.8997721672058105, - "learning_rate": 3.295637747317715e-06, - "loss": 0.3931, - "mean_token_accuracy": 0.8714895188808441, - "num_tokens": 228684573.0, - "step": 190090 - }, - { - "entropy": 1.8038986414670943, - "epoch": 0.5892933671947109, - "grad_norm": 7.1145548820495605, - "learning_rate": 3.2955510640345435e-06, - "loss": 0.3996, - "mean_token_accuracy": 0.8632494986057282, - "num_tokens": 228697371.0, - "step": 190100 - }, - { - "entropy": 1.8754192054271699, - "epoch": 0.5893243663197606, - "grad_norm": 8.730184555053711, - "learning_rate": 3.295464387590955e-06, - "loss": 0.4392, - "mean_token_accuracy": 0.8609462678432465, - "num_tokens": 228709207.0, - "step": 190110 - }, - { - "entropy": 1.7887951895594596, - "epoch": 0.5893553654448104, - "grad_norm": 9.219457626342773, - "learning_rate": 3.2953777179860507e-06, - "loss": 0.3884, - "mean_token_accuracy": 0.8643956810235978, - "num_tokens": 228722841.0, - "step": 190120 - }, - { - "entropy": 1.856288492679596, - "epoch": 0.58938636456986, - "grad_norm": 9.556690216064453, - "learning_rate": 3.295291055218931e-06, - "loss": 0.4015, - "mean_token_accuracy": 0.8619930043816566, - "num_tokens": 228735293.0, - "step": 190130 - }, - { - "entropy": 1.8539328455924988, - "epoch": 0.5894173636949097, - "grad_norm": 9.75001049041748, - "learning_rate": 3.2952043992886974e-06, - "loss": 0.4742, - "mean_token_accuracy": 0.8486363276839256, - "num_tokens": 228747448.0, - "step": 190140 - }, - { - "entropy": 1.8937363922595978, - "epoch": 0.5894483628199594, - "grad_norm": 8.638582229614258, - "learning_rate": 3.2951177501944498e-06, - "loss": 0.4421, - "mean_token_accuracy": 0.8536964386701584, - "num_tokens": 228758391.0, - "step": 190150 - }, - { - "entropy": 1.8685229420661926, - "epoch": 0.589479361945009, - "grad_norm": 7.673697471618652, - "learning_rate": 3.2950311079352905e-06, - "loss": 0.4424, - "mean_token_accuracy": 0.8545270696282387, - "num_tokens": 228769201.0, - "step": 190160 - }, - { - "entropy": 1.7864877060055733, - "epoch": 0.5895103610700588, - "grad_norm": 4.983564853668213, - "learning_rate": 3.294944472510321e-06, - "loss": 0.3753, - "mean_token_accuracy": 0.8641117364168167, - "num_tokens": 228782125.0, - "step": 190170 - }, - { - "entropy": 1.9448559939861298, - "epoch": 0.5895413601951085, - "grad_norm": 6.925676345825195, - "learning_rate": 3.294857843918642e-06, - "loss": 0.4691, - "mean_token_accuracy": 0.8519876688718796, - "num_tokens": 228793264.0, - "step": 190180 - }, - { - "entropy": 1.8454899474978448, - "epoch": 0.5895723593201582, - "grad_norm": 8.729484558105469, - "learning_rate": 3.294771222159356e-06, - "loss": 0.4546, - "mean_token_accuracy": 0.8515693292021751, - "num_tokens": 228805524.0, - "step": 190190 - }, - { - "entropy": 1.9692073345184327, - "epoch": 0.5896033584452078, - "grad_norm": 8.359795570373535, - "learning_rate": 3.2946846072315646e-06, - "loss": 0.501, - "mean_token_accuracy": 0.8440068036317825, - "num_tokens": 228816416.0, - "step": 190200 - }, - { - "entropy": 1.894757117331028, - "epoch": 0.5896343575702576, - "grad_norm": 7.804634094238281, - "learning_rate": 3.2945979991343706e-06, - "loss": 0.5185, - "mean_token_accuracy": 0.8510986045002937, - "num_tokens": 228829075.0, - "step": 190210 - }, - { - "entropy": 1.860536876320839, - "epoch": 0.5896653566953073, - "grad_norm": 2.497119426727295, - "learning_rate": 3.2945113978668752e-06, - "loss": 0.4086, - "mean_token_accuracy": 0.8606500387191772, - "num_tokens": 228841126.0, - "step": 190220 - }, - { - "entropy": 1.8744609951972961, - "epoch": 0.589696355820357, - "grad_norm": 8.66141128540039, - "learning_rate": 3.294424803428181e-06, - "loss": 0.4361, - "mean_token_accuracy": 0.855774176120758, - "num_tokens": 228853330.0, - "step": 190230 - }, - { - "entropy": 1.9277845978736878, - "epoch": 0.5897273549454066, - "grad_norm": 7.061468124389648, - "learning_rate": 3.2943382158173916e-06, - "loss": 0.4606, - "mean_token_accuracy": 0.858029393851757, - "num_tokens": 228864546.0, - "step": 190240 - }, - { - "entropy": 1.8712523072957992, - "epoch": 0.5897583540704564, - "grad_norm": 7.561338424682617, - "learning_rate": 3.2942516350336085e-06, - "loss": 0.4729, - "mean_token_accuracy": 0.8526345446705819, - "num_tokens": 228876361.0, - "step": 190250 - }, - { - "entropy": 1.8795302122831345, - "epoch": 0.5897893531955061, - "grad_norm": 4.068550109863281, - "learning_rate": 3.2941650610759356e-06, - "loss": 0.4298, - "mean_token_accuracy": 0.8507021918892861, - "num_tokens": 228888145.0, - "step": 190260 - }, - { - "entropy": 1.8870377585291862, - "epoch": 0.5898203523205557, - "grad_norm": 7.460421085357666, - "learning_rate": 3.2940784939434754e-06, - "loss": 0.4529, - "mean_token_accuracy": 0.8494185790419578, - "num_tokens": 228900354.0, - "step": 190270 - }, - { - "entropy": 1.9352505058050156, - "epoch": 0.5898513514456054, - "grad_norm": 7.420712947845459, - "learning_rate": 3.2939919336353315e-06, - "loss": 0.4738, - "mean_token_accuracy": 0.8464471936225891, - "num_tokens": 228911066.0, - "step": 190280 - }, - { - "entropy": 1.8705057039856912, - "epoch": 0.5898823505706552, - "grad_norm": 6.3799028396606445, - "learning_rate": 3.293905380150607e-06, - "loss": 0.4178, - "mean_token_accuracy": 0.8655120372772217, - "num_tokens": 228922702.0, - "step": 190290 - }, - { - "entropy": 1.8629606902599334, - "epoch": 0.5899133496957049, - "grad_norm": 9.938080787658691, - "learning_rate": 3.2938188334884057e-06, - "loss": 0.4466, - "mean_token_accuracy": 0.8527127936482429, - "num_tokens": 228934867.0, - "step": 190300 - }, - { - "entropy": 1.783542599529028, - "epoch": 0.5899443488207545, - "grad_norm": 7.353618621826172, - "learning_rate": 3.293732293647831e-06, - "loss": 0.3522, - "mean_token_accuracy": 0.8648853242397309, - "num_tokens": 228947899.0, - "step": 190310 - }, - { - "entropy": 1.891326430439949, - "epoch": 0.5899753479458042, - "grad_norm": 9.56852912902832, - "learning_rate": 3.293645760627988e-06, - "loss": 0.4662, - "mean_token_accuracy": 0.8535473197698593, - "num_tokens": 228959022.0, - "step": 190320 - }, - { - "entropy": 1.8293341875076294, - "epoch": 0.590006347070854, - "grad_norm": 9.369425773620605, - "learning_rate": 3.293559234427979e-06, - "loss": 0.3919, - "mean_token_accuracy": 0.8616625502705574, - "num_tokens": 228971424.0, - "step": 190330 - }, - { - "entropy": 1.8950300842523575, - "epoch": 0.5900373461959036, - "grad_norm": 8.347681045532227, - "learning_rate": 3.2934727150469092e-06, - "loss": 0.4705, - "mean_token_accuracy": 0.8472575053572655, - "num_tokens": 228982796.0, - "step": 190340 - }, - { - "entropy": 1.890362760424614, - "epoch": 0.5900683453209533, - "grad_norm": 3.295876979827881, - "learning_rate": 3.2933862024838826e-06, - "loss": 0.4796, - "mean_token_accuracy": 0.852965684235096, - "num_tokens": 228994264.0, - "step": 190350 - }, - { - "entropy": 1.8526059299707414, - "epoch": 0.590099344446003, - "grad_norm": 3.781344175338745, - "learning_rate": 3.293299696738004e-06, - "loss": 0.4694, - "mean_token_accuracy": 0.847343610227108, - "num_tokens": 229006040.0, - "step": 190360 - }, - { - "entropy": 1.8531452476978303, - "epoch": 0.5901303435710528, - "grad_norm": 8.642162322998047, - "learning_rate": 3.29321319780838e-06, - "loss": 0.4521, - "mean_token_accuracy": 0.8518374875187874, - "num_tokens": 229018075.0, - "step": 190370 - }, - { - "entropy": 1.8752460539340974, - "epoch": 0.5901613426961024, - "grad_norm": 3.7689857482910156, - "learning_rate": 3.293126705694112e-06, - "loss": 0.4529, - "mean_token_accuracy": 0.8554120779037475, - "num_tokens": 229029496.0, - "step": 190380 - }, - { - "entropy": 1.8158239707350732, - "epoch": 0.5901923418211521, - "grad_norm": 8.14268970489502, - "learning_rate": 3.293040220394307e-06, - "loss": 0.4164, - "mean_token_accuracy": 0.8686467096209526, - "num_tokens": 229041879.0, - "step": 190390 - }, - { - "entropy": 1.8639475405216217, - "epoch": 0.5902233409462018, - "grad_norm": 7.9564642906188965, - "learning_rate": 3.29295374190807e-06, - "loss": 0.4521, - "mean_token_accuracy": 0.8520817801356315, - "num_tokens": 229053955.0, - "step": 190400 - }, - { - "entropy": 1.8774761408567429, - "epoch": 0.5902543400712514, - "grad_norm": 7.807625770568848, - "learning_rate": 3.2928672702345065e-06, - "loss": 0.4185, - "mean_token_accuracy": 0.8621849209070206, - "num_tokens": 229065976.0, - "step": 190410 - }, - { - "entropy": 1.933225655555725, - "epoch": 0.5902853391963012, - "grad_norm": 8.07839298248291, - "learning_rate": 3.292780805372722e-06, - "loss": 0.5244, - "mean_token_accuracy": 0.8469988331198692, - "num_tokens": 229076710.0, - "step": 190420 - }, - { - "entropy": 1.8818949446082116, - "epoch": 0.5903163383213509, - "grad_norm": 4.203948020935059, - "learning_rate": 3.2926943473218215e-06, - "loss": 0.415, - "mean_token_accuracy": 0.8559348538517952, - "num_tokens": 229088535.0, - "step": 190430 - }, - { - "entropy": 1.8088610842823982, - "epoch": 0.5903473374464006, - "grad_norm": 8.226593971252441, - "learning_rate": 3.2926078960809123e-06, - "loss": 0.3615, - "mean_token_accuracy": 0.8699506267905235, - "num_tokens": 229101332.0, - "step": 190440 - }, - { - "entropy": 1.8730618119239808, - "epoch": 0.5903783365714502, - "grad_norm": 7.754554271697998, - "learning_rate": 3.2925214516490994e-06, - "loss": 0.4562, - "mean_token_accuracy": 0.8541584327816963, - "num_tokens": 229113342.0, - "step": 190450 - }, - { - "entropy": 1.854330986738205, - "epoch": 0.5904093356965, - "grad_norm": 8.528146743774414, - "learning_rate": 3.2924350140254895e-06, - "loss": 0.4067, - "mean_token_accuracy": 0.8653483808040618, - "num_tokens": 229125410.0, - "step": 190460 - }, - { - "entropy": 1.8535527095198632, - "epoch": 0.5904403348215497, - "grad_norm": 7.2377848625183105, - "learning_rate": 3.2923485832091883e-06, - "loss": 0.4328, - "mean_token_accuracy": 0.85265993475914, - "num_tokens": 229137915.0, - "step": 190470 - }, - { - "entropy": 1.8914847001433372, - "epoch": 0.5904713339465993, - "grad_norm": 8.46504020690918, - "learning_rate": 3.2922621591993033e-06, - "loss": 0.4135, - "mean_token_accuracy": 0.8679022789001465, - "num_tokens": 229149136.0, - "step": 190480 - }, - { - "entropy": 1.842303329706192, - "epoch": 0.590502333071649, - "grad_norm": 2.873488664627075, - "learning_rate": 3.2921757419949406e-06, - "loss": 0.3956, - "mean_token_accuracy": 0.8661413252353668, - "num_tokens": 229161955.0, - "step": 190490 - }, - { - "entropy": 1.9058812111616135, - "epoch": 0.5905333321966988, - "grad_norm": 7.14345645904541, - "learning_rate": 3.2920893315952072e-06, - "loss": 0.4804, - "mean_token_accuracy": 0.8542837470769882, - "num_tokens": 229172882.0, - "step": 190500 - }, - { - "entropy": 1.8523188918828963, - "epoch": 0.5905643313217485, - "grad_norm": 3.61691951751709, - "learning_rate": 3.2920029279992097e-06, - "loss": 0.469, - "mean_token_accuracy": 0.8538831368088722, - "num_tokens": 229185438.0, - "step": 190510 - }, - { - "entropy": 1.9006501093506813, - "epoch": 0.5905953304467981, - "grad_norm": 7.883707523345947, - "learning_rate": 3.291916531206057e-06, - "loss": 0.4411, - "mean_token_accuracy": 0.8513767853379249, - "num_tokens": 229197140.0, - "step": 190520 - }, - { - "entropy": 1.7014507532119751, - "epoch": 0.5906263295718478, - "grad_norm": 7.430270671844482, - "learning_rate": 3.2918301412148545e-06, - "loss": 0.3265, - "mean_token_accuracy": 0.8773944795131683, - "num_tokens": 229210947.0, - "step": 190530 - }, - { - "entropy": 1.8675128430128098, - "epoch": 0.5906573286968976, - "grad_norm": 7.121996879577637, - "learning_rate": 3.2917437580247104e-06, - "loss": 0.4279, - "mean_token_accuracy": 0.8630870550870895, - "num_tokens": 229222512.0, - "step": 190540 - }, - { - "entropy": 1.736918868124485, - "epoch": 0.5906883278219472, - "grad_norm": 8.349037170410156, - "learning_rate": 3.291657381634732e-06, - "loss": 0.4134, - "mean_token_accuracy": 0.8642535865306854, - "num_tokens": 229236428.0, - "step": 190550 - }, - { - "entropy": 1.8738708287477492, - "epoch": 0.5907193269469969, - "grad_norm": 5.51838493347168, - "learning_rate": 3.291571012044028e-06, - "loss": 0.4125, - "mean_token_accuracy": 0.8621172949671745, - "num_tokens": 229248565.0, - "step": 190560 - }, - { - "entropy": 1.9014319479465485, - "epoch": 0.5907503260720466, - "grad_norm": 7.950676918029785, - "learning_rate": 3.2914846492517054e-06, - "loss": 0.472, - "mean_token_accuracy": 0.8562928780913353, - "num_tokens": 229260100.0, - "step": 190570 - }, - { - "entropy": 1.8517659470438956, - "epoch": 0.5907813251970964, - "grad_norm": 3.7238996028900146, - "learning_rate": 3.2913982932568738e-06, - "loss": 0.3979, - "mean_token_accuracy": 0.8641832873225213, - "num_tokens": 229272556.0, - "step": 190580 - }, - { - "entropy": 1.8479650676250459, - "epoch": 0.590812324322146, - "grad_norm": 3.849630117416382, - "learning_rate": 3.2913119440586404e-06, - "loss": 0.3865, - "mean_token_accuracy": 0.8724173724651336, - "num_tokens": 229283852.0, - "step": 190590 - }, - { - "entropy": 1.8994674652814865, - "epoch": 0.5908433234471957, - "grad_norm": 9.459546089172363, - "learning_rate": 3.2912256016561138e-06, - "loss": 0.4241, - "mean_token_accuracy": 0.8530774980783462, - "num_tokens": 229296603.0, - "step": 190600 - }, - { - "entropy": 1.7782090470194816, - "epoch": 0.5908743225722454, - "grad_norm": 3.8931798934936523, - "learning_rate": 3.2911392660484033e-06, - "loss": 0.3921, - "mean_token_accuracy": 0.8701503276824951, - "num_tokens": 229309684.0, - "step": 190610 - }, - { - "entropy": 1.8570687502622605, - "epoch": 0.590905321697295, - "grad_norm": 7.796317100524902, - "learning_rate": 3.291052937234617e-06, - "loss": 0.4414, - "mean_token_accuracy": 0.8513424873352051, - "num_tokens": 229321331.0, - "step": 190620 - }, - { - "entropy": 1.8146572425961494, - "epoch": 0.5909363208223448, - "grad_norm": 4.251481056213379, - "learning_rate": 3.290966615213865e-06, - "loss": 0.4234, - "mean_token_accuracy": 0.8561153456568718, - "num_tokens": 229334264.0, - "step": 190630 - }, - { - "entropy": 1.7932671368122102, - "epoch": 0.5909673199473945, - "grad_norm": 8.984890937805176, - "learning_rate": 3.2908802999852547e-06, - "loss": 0.384, - "mean_token_accuracy": 0.8709327802062035, - "num_tokens": 229346949.0, - "step": 190640 - }, - { - "entropy": 1.9233350574970245, - "epoch": 0.5909983190724442, - "grad_norm": 8.20529556274414, - "learning_rate": 3.2907939915478963e-06, - "loss": 0.4212, - "mean_token_accuracy": 0.8593816369771957, - "num_tokens": 229358034.0, - "step": 190650 - }, - { - "entropy": 1.8050723016262054, - "epoch": 0.5910293181974938, - "grad_norm": 8.480942726135254, - "learning_rate": 3.2907076899009004e-06, - "loss": 0.3899, - "mean_token_accuracy": 0.8727429389953614, - "num_tokens": 229370748.0, - "step": 190660 - }, - { - "entropy": 1.8822183892130853, - "epoch": 0.5910603173225436, - "grad_norm": 8.092206954956055, - "learning_rate": 3.2906213950433745e-06, - "loss": 0.4236, - "mean_token_accuracy": 0.8524337723851204, - "num_tokens": 229382446.0, - "step": 190670 - }, - { - "entropy": 1.9217692241072655, - "epoch": 0.5910913164475933, - "grad_norm": 9.312432289123535, - "learning_rate": 3.2905351069744307e-06, - "loss": 0.4852, - "mean_token_accuracy": 0.8461758613586425, - "num_tokens": 229393634.0, - "step": 190680 - }, - { - "entropy": 1.8857564225792884, - "epoch": 0.5911223155726429, - "grad_norm": 9.134812355041504, - "learning_rate": 3.2904488256931777e-06, - "loss": 0.4678, - "mean_token_accuracy": 0.8554666772484779, - "num_tokens": 229406173.0, - "step": 190690 - }, - { - "entropy": 1.8799413040280342, - "epoch": 0.5911533146976926, - "grad_norm": 8.584431648254395, - "learning_rate": 3.2903625511987254e-06, - "loss": 0.4473, - "mean_token_accuracy": 0.8597801223397254, - "num_tokens": 229417486.0, - "step": 190700 - }, - { - "entropy": 1.8675891801714897, - "epoch": 0.5911843138227424, - "grad_norm": 7.617621898651123, - "learning_rate": 3.290276283490185e-06, - "loss": 0.4465, - "mean_token_accuracy": 0.8503276690840721, - "num_tokens": 229430100.0, - "step": 190710 - }, - { - "entropy": 1.9346825338900089, - "epoch": 0.5912153129477921, - "grad_norm": 10.103340148925781, - "learning_rate": 3.290190022566666e-06, - "loss": 0.4456, - "mean_token_accuracy": 0.8546615943312645, - "num_tokens": 229441996.0, - "step": 190720 - }, - { - "entropy": 1.8633850425481797, - "epoch": 0.5912463120728417, - "grad_norm": 8.875619888305664, - "learning_rate": 3.2901037684272796e-06, - "loss": 0.4331, - "mean_token_accuracy": 0.8545124664902687, - "num_tokens": 229454182.0, - "step": 190730 - }, - { - "entropy": 1.8230377197265626, - "epoch": 0.5912773111978914, - "grad_norm": 7.591728210449219, - "learning_rate": 3.2900175210711366e-06, - "loss": 0.4321, - "mean_token_accuracy": 0.8573815956711769, - "num_tokens": 229465840.0, - "step": 190740 - }, - { - "entropy": 1.8421626284718513, - "epoch": 0.5913083103229412, - "grad_norm": 8.975364685058594, - "learning_rate": 3.2899312804973477e-06, - "loss": 0.3937, - "mean_token_accuracy": 0.8599640503525734, - "num_tokens": 229478483.0, - "step": 190750 - }, - { - "entropy": 1.8469863578677177, - "epoch": 0.5913393094479908, - "grad_norm": 7.199525356292725, - "learning_rate": 3.289845046705025e-06, - "loss": 0.3998, - "mean_token_accuracy": 0.860556535422802, - "num_tokens": 229490958.0, - "step": 190760 - }, - { - "entropy": 1.941561996936798, - "epoch": 0.5913703085730405, - "grad_norm": 6.5112080574035645, - "learning_rate": 3.289758819693278e-06, - "loss": 0.4735, - "mean_token_accuracy": 0.855010262131691, - "num_tokens": 229501945.0, - "step": 190770 - }, - { - "entropy": 1.830227905511856, - "epoch": 0.5914013076980902, - "grad_norm": 7.796418190002441, - "learning_rate": 3.28967259946122e-06, - "loss": 0.4471, - "mean_token_accuracy": 0.8564648866653443, - "num_tokens": 229514727.0, - "step": 190780 - }, - { - "entropy": 1.856962652504444, - "epoch": 0.59143230682314, - "grad_norm": 8.053735733032227, - "learning_rate": 3.2895863860079615e-06, - "loss": 0.3957, - "mean_token_accuracy": 0.8695003658533096, - "num_tokens": 229527296.0, - "step": 190790 - }, - { - "entropy": 1.879108041524887, - "epoch": 0.5914633059481896, - "grad_norm": 8.106398582458496, - "learning_rate": 3.2895001793326137e-06, - "loss": 0.4134, - "mean_token_accuracy": 0.8647478729486465, - "num_tokens": 229538967.0, - "step": 190800 - }, - { - "entropy": 1.8307623445987702, - "epoch": 0.5914943050732393, - "grad_norm": 7.531067848205566, - "learning_rate": 3.2894139794342905e-06, - "loss": 0.4486, - "mean_token_accuracy": 0.8509333118796348, - "num_tokens": 229551817.0, - "step": 190810 - }, - { - "entropy": 1.871297837793827, - "epoch": 0.591525304198289, - "grad_norm": 7.942951202392578, - "learning_rate": 3.289327786312102e-06, - "loss": 0.4349, - "mean_token_accuracy": 0.8627708986401558, - "num_tokens": 229563622.0, - "step": 190820 - }, - { - "entropy": 1.849818505346775, - "epoch": 0.5915563033233388, - "grad_norm": 4.293807506561279, - "learning_rate": 3.2892415999651623e-06, - "loss": 0.4298, - "mean_token_accuracy": 0.8529814571142197, - "num_tokens": 229575703.0, - "step": 190830 - }, - { - "entropy": 1.8120758205652236, - "epoch": 0.5915873024483884, - "grad_norm": 4.3140869140625, - "learning_rate": 3.2891554203925823e-06, - "loss": 0.4253, - "mean_token_accuracy": 0.8529886394739151, - "num_tokens": 229588734.0, - "step": 190840 - }, - { - "entropy": 1.8439616784453392, - "epoch": 0.5916183015734381, - "grad_norm": 3.7536003589630127, - "learning_rate": 3.289069247593475e-06, - "loss": 0.4103, - "mean_token_accuracy": 0.8631413847208023, - "num_tokens": 229601210.0, - "step": 190850 - }, - { - "entropy": 1.8744244769215583, - "epoch": 0.5916493006984878, - "grad_norm": 7.325197696685791, - "learning_rate": 3.2889830815669536e-06, - "loss": 0.4116, - "mean_token_accuracy": 0.8581292122602463, - "num_tokens": 229613374.0, - "step": 190860 - }, - { - "entropy": 1.8809070125222207, - "epoch": 0.5916802998235374, - "grad_norm": 4.105676651000977, - "learning_rate": 3.288896922312131e-06, - "loss": 0.4658, - "mean_token_accuracy": 0.8529094144701957, - "num_tokens": 229625494.0, - "step": 190870 - }, - { - "entropy": 1.900406464934349, - "epoch": 0.5917112989485872, - "grad_norm": 6.932359218597412, - "learning_rate": 3.2888107698281193e-06, - "loss": 0.4697, - "mean_token_accuracy": 0.842863355576992, - "num_tokens": 229637447.0, - "step": 190880 - }, - { - "entropy": 1.899899123609066, - "epoch": 0.5917422980736369, - "grad_norm": 8.196488380432129, - "learning_rate": 3.288724624114033e-06, - "loss": 0.4888, - "mean_token_accuracy": 0.8505870133638382, - "num_tokens": 229648952.0, - "step": 190890 - }, - { - "entropy": 1.8124980226159095, - "epoch": 0.5917732971986865, - "grad_norm": 8.71292495727539, - "learning_rate": 3.2886384851689847e-06, - "loss": 0.3686, - "mean_token_accuracy": 0.8594872117042541, - "num_tokens": 229661520.0, - "step": 190900 - }, - { - "entropy": 1.9170294746756553, - "epoch": 0.5918042963237362, - "grad_norm": 8.061601638793945, - "learning_rate": 3.2885523529920883e-06, - "loss": 0.4595, - "mean_token_accuracy": 0.8494271531701088, - "num_tokens": 229673586.0, - "step": 190910 - }, - { - "entropy": 1.947436010837555, - "epoch": 0.591835295448786, - "grad_norm": 10.752445220947266, - "learning_rate": 3.2884662275824575e-06, - "loss": 0.494, - "mean_token_accuracy": 0.8480764240026474, - "num_tokens": 229684280.0, - "step": 190920 - }, - { - "entropy": 1.8770150147378444, - "epoch": 0.5918662945738357, - "grad_norm": 7.548092365264893, - "learning_rate": 3.2883801089392058e-06, - "loss": 0.4648, - "mean_token_accuracy": 0.8523095726966858, - "num_tokens": 229696570.0, - "step": 190930 - }, - { - "entropy": 1.8843964487314224, - "epoch": 0.5918972936988853, - "grad_norm": 8.857001304626465, - "learning_rate": 3.2882939970614485e-06, - "loss": 0.444, - "mean_token_accuracy": 0.8598011612892151, - "num_tokens": 229708816.0, - "step": 190940 - }, - { - "entropy": 1.8774077758193015, - "epoch": 0.591928292823935, - "grad_norm": 8.422423362731934, - "learning_rate": 3.2882078919482983e-06, - "loss": 0.4059, - "mean_token_accuracy": 0.8628498017787933, - "num_tokens": 229720697.0, - "step": 190950 - }, - { - "entropy": 1.8349227026104926, - "epoch": 0.5919592919489848, - "grad_norm": 8.86723518371582, - "learning_rate": 3.28812179359887e-06, - "loss": 0.4308, - "mean_token_accuracy": 0.864958544075489, - "num_tokens": 229733597.0, - "step": 190960 - }, - { - "entropy": 1.8025419846177102, - "epoch": 0.5919902910740344, - "grad_norm": 3.7912731170654297, - "learning_rate": 3.288035702012279e-06, - "loss": 0.403, - "mean_token_accuracy": 0.8578298717737198, - "num_tokens": 229746676.0, - "step": 190970 - }, - { - "entropy": 1.9060227274894714, - "epoch": 0.5920212901990841, - "grad_norm": 9.238999366760254, - "learning_rate": 3.2879496171876383e-06, - "loss": 0.4513, - "mean_token_accuracy": 0.8610510796308517, - "num_tokens": 229757868.0, - "step": 190980 - }, - { - "entropy": 1.8276602059602738, - "epoch": 0.5920522893241338, - "grad_norm": 6.9514970779418945, - "learning_rate": 3.287863539124065e-06, - "loss": 0.3916, - "mean_token_accuracy": 0.8673168778419494, - "num_tokens": 229770018.0, - "step": 190990 - }, - { - "entropy": 1.8806076273322105, - "epoch": 0.5920832884491836, - "grad_norm": 9.204923629760742, - "learning_rate": 3.287777467820672e-06, - "loss": 0.4105, - "mean_token_accuracy": 0.8578602224588394, - "num_tokens": 229781623.0, - "step": 191000 - }, - { - "entropy": 1.8622824847698212, - "epoch": 0.5921142875742332, - "grad_norm": 8.241448402404785, - "learning_rate": 3.2876914032765763e-06, - "loss": 0.4353, - "mean_token_accuracy": 0.8582434579730034, - "num_tokens": 229793960.0, - "step": 191010 - }, - { - "entropy": 1.9200577780604362, - "epoch": 0.5921452866992829, - "grad_norm": 8.937591552734375, - "learning_rate": 3.2876053454908915e-06, - "loss": 0.4428, - "mean_token_accuracy": 0.8509643584489822, - "num_tokens": 229805175.0, - "step": 191020 - }, - { - "entropy": 1.846642728149891, - "epoch": 0.5921762858243326, - "grad_norm": 9.971576690673828, - "learning_rate": 3.287519294462734e-06, - "loss": 0.4646, - "mean_token_accuracy": 0.8499289125204086, - "num_tokens": 229817090.0, - "step": 191030 - }, - { - "entropy": 1.8406170830130577, - "epoch": 0.5922072849493824, - "grad_norm": 3.3949577808380127, - "learning_rate": 3.2874332501912204e-06, - "loss": 0.4157, - "mean_token_accuracy": 0.8627133294939995, - "num_tokens": 229828824.0, - "step": 191040 - }, - { - "entropy": 1.7291911885142326, - "epoch": 0.592238284074432, - "grad_norm": 3.691608190536499, - "learning_rate": 3.2873472126754647e-06, - "loss": 0.3814, - "mean_token_accuracy": 0.8592000618577004, - "num_tokens": 229843350.0, - "step": 191050 - }, - { - "entropy": 1.806246455013752, - "epoch": 0.5922692831994817, - "grad_norm": 9.855522155761719, - "learning_rate": 3.2872611819145838e-06, - "loss": 0.3753, - "mean_token_accuracy": 0.8735313445329667, - "num_tokens": 229855513.0, - "step": 191060 - }, - { - "entropy": 1.9019072502851486, - "epoch": 0.5923002823245314, - "grad_norm": 7.412065029144287, - "learning_rate": 3.2871751579076937e-06, - "loss": 0.4758, - "mean_token_accuracy": 0.8572008535265923, - "num_tokens": 229866228.0, - "step": 191070 - }, - { - "entropy": 1.8569483637809754, - "epoch": 0.5923312814495811, - "grad_norm": 4.520204067230225, - "learning_rate": 3.287089140653912e-06, - "loss": 0.429, - "mean_token_accuracy": 0.8527514040470123, - "num_tokens": 229878084.0, - "step": 191080 - }, - { - "entropy": 1.8153726771473884, - "epoch": 0.5923622805746308, - "grad_norm": 3.624915599822998, - "learning_rate": 3.2870031301523534e-06, - "loss": 0.4317, - "mean_token_accuracy": 0.8487672790884971, - "num_tokens": 229890969.0, - "step": 191090 - }, - { - "entropy": 1.8336610078811646, - "epoch": 0.5923932796996805, - "grad_norm": 7.657143592834473, - "learning_rate": 3.2869171264021356e-06, - "loss": 0.4096, - "mean_token_accuracy": 0.8640960767865181, - "num_tokens": 229902960.0, - "step": 191100 - }, - { - "entropy": 1.843455323576927, - "epoch": 0.5924242788247301, - "grad_norm": 7.258844375610352, - "learning_rate": 3.2868311294023743e-06, - "loss": 0.4093, - "mean_token_accuracy": 0.8586787343025207, - "num_tokens": 229915427.0, - "step": 191110 - }, - { - "entropy": 1.871065580844879, - "epoch": 0.5924552779497798, - "grad_norm": 8.253884315490723, - "learning_rate": 3.286745139152187e-06, - "loss": 0.4192, - "mean_token_accuracy": 0.8717275485396385, - "num_tokens": 229926864.0, - "step": 191120 - }, - { - "entropy": 1.8746835321187973, - "epoch": 0.5924862770748296, - "grad_norm": 4.194708347320557, - "learning_rate": 3.2866591556506915e-06, - "loss": 0.4474, - "mean_token_accuracy": 0.8504206746816635, - "num_tokens": 229938889.0, - "step": 191130 - }, - { - "entropy": 1.9504939645528794, - "epoch": 0.5925172761998793, - "grad_norm": 8.435033798217773, - "learning_rate": 3.2865731788970053e-06, - "loss": 0.5223, - "mean_token_accuracy": 0.8394033655524253, - "num_tokens": 229950278.0, - "step": 191140 - }, - { - "entropy": 1.83246139138937, - "epoch": 0.5925482753249289, - "grad_norm": 8.544919967651367, - "learning_rate": 3.2864872088902443e-06, - "loss": 0.4293, - "mean_token_accuracy": 0.8616950988769532, - "num_tokens": 229963221.0, - "step": 191150 - }, - { - "entropy": 1.8219051674008369, - "epoch": 0.5925792744499786, - "grad_norm": 4.832451343536377, - "learning_rate": 3.286401245629527e-06, - "loss": 0.4261, - "mean_token_accuracy": 0.8639433071017265, - "num_tokens": 229976076.0, - "step": 191160 - }, - { - "entropy": 1.8256654024124146, - "epoch": 0.5926102735750284, - "grad_norm": 8.011951446533203, - "learning_rate": 3.2863152891139717e-06, - "loss": 0.4464, - "mean_token_accuracy": 0.8585153207182884, - "num_tokens": 229988966.0, - "step": 191170 - }, - { - "entropy": 1.925898441672325, - "epoch": 0.592641272700078, - "grad_norm": 7.606298923492432, - "learning_rate": 3.2862293393426953e-06, - "loss": 0.5079, - "mean_token_accuracy": 0.8489936038851738, - "num_tokens": 230000081.0, - "step": 191180 - }, - { - "entropy": 1.807943683117628, - "epoch": 0.5926722718251277, - "grad_norm": 7.411771297454834, - "learning_rate": 3.2861433963148164e-06, - "loss": 0.3932, - "mean_token_accuracy": 0.8664886385202408, - "num_tokens": 230013462.0, - "step": 191190 - }, - { - "entropy": 1.9161637112498284, - "epoch": 0.5927032709501774, - "grad_norm": 8.881155967712402, - "learning_rate": 3.2860574600294535e-06, - "loss": 0.5271, - "mean_token_accuracy": 0.8394664421677589, - "num_tokens": 230024516.0, - "step": 191200 - }, - { - "entropy": 1.8874651074409485, - "epoch": 0.5927342700752272, - "grad_norm": 3.6384761333465576, - "learning_rate": 3.2859715304857254e-06, - "loss": 0.4523, - "mean_token_accuracy": 0.8549251481890678, - "num_tokens": 230036074.0, - "step": 191210 - }, - { - "entropy": 1.9186082750558853, - "epoch": 0.5927652692002768, - "grad_norm": 7.823178768157959, - "learning_rate": 3.285885607682749e-06, - "loss": 0.4595, - "mean_token_accuracy": 0.8534322634339333, - "num_tokens": 230047336.0, - "step": 191220 - }, - { - "entropy": 1.87621361464262, - "epoch": 0.5927962683253265, - "grad_norm": 9.075730323791504, - "learning_rate": 3.285799691619645e-06, - "loss": 0.4277, - "mean_token_accuracy": 0.8539279267191887, - "num_tokens": 230058593.0, - "step": 191230 - }, - { - "entropy": 1.816857473552227, - "epoch": 0.5928272674503762, - "grad_norm": 8.063228607177734, - "learning_rate": 3.285713782295531e-06, - "loss": 0.4016, - "mean_token_accuracy": 0.8562188908457756, - "num_tokens": 230071437.0, - "step": 191240 - }, - { - "entropy": 1.806934006512165, - "epoch": 0.592858266575426, - "grad_norm": 7.675832271575928, - "learning_rate": 3.2856278797095264e-06, - "loss": 0.4024, - "mean_token_accuracy": 0.8649509236216545, - "num_tokens": 230084144.0, - "step": 191250 - }, - { - "entropy": 1.9102825194597244, - "epoch": 0.5928892657004756, - "grad_norm": 9.0030517578125, - "learning_rate": 3.2855419838607507e-06, - "loss": 0.465, - "mean_token_accuracy": 0.8502855435013771, - "num_tokens": 230095428.0, - "step": 191260 - }, - { - "entropy": 1.9246444031596184, - "epoch": 0.5929202648255253, - "grad_norm": 9.100739479064941, - "learning_rate": 3.2854560947483234e-06, - "loss": 0.4677, - "mean_token_accuracy": 0.8507225677371025, - "num_tokens": 230106120.0, - "step": 191270 - }, - { - "entropy": 1.8004204019904138, - "epoch": 0.592951263950575, - "grad_norm": 7.986629009246826, - "learning_rate": 3.2853702123713637e-06, - "loss": 0.356, - "mean_token_accuracy": 0.8648659467697144, - "num_tokens": 230119224.0, - "step": 191280 - }, - { - "entropy": 1.899955762922764, - "epoch": 0.5929822630756247, - "grad_norm": 7.1050262451171875, - "learning_rate": 3.285284336728991e-06, - "loss": 0.4563, - "mean_token_accuracy": 0.8590516731142998, - "num_tokens": 230130967.0, - "step": 191290 - }, - { - "entropy": 1.8669502303004264, - "epoch": 0.5930132622006744, - "grad_norm": 7.009920597076416, - "learning_rate": 3.2851984678203264e-06, - "loss": 0.4348, - "mean_token_accuracy": 0.8672806769609451, - "num_tokens": 230143119.0, - "step": 191300 - }, - { - "entropy": 1.8921653680503367, - "epoch": 0.5930442613257241, - "grad_norm": 4.443764686584473, - "learning_rate": 3.2851126056444887e-06, - "loss": 0.43, - "mean_token_accuracy": 0.8577714130282402, - "num_tokens": 230154483.0, - "step": 191310 - }, - { - "entropy": 1.8913510665297508, - "epoch": 0.5930752604507737, - "grad_norm": 3.9651403427124023, - "learning_rate": 3.2850267502005983e-06, - "loss": 0.4303, - "mean_token_accuracy": 0.8589453801512719, - "num_tokens": 230166104.0, - "step": 191320 - }, - { - "entropy": 1.82257222533226, - "epoch": 0.5931062595758235, - "grad_norm": 4.177800178527832, - "learning_rate": 3.284940901487776e-06, - "loss": 0.3705, - "mean_token_accuracy": 0.8736421540379524, - "num_tokens": 230179512.0, - "step": 191330 - }, - { - "entropy": 1.9297788351774217, - "epoch": 0.5931372587008732, - "grad_norm": 9.562280654907227, - "learning_rate": 3.284855059505142e-06, - "loss": 0.4625, - "mean_token_accuracy": 0.8502230435609818, - "num_tokens": 230191047.0, - "step": 191340 - }, - { - "entropy": 1.9412023738026618, - "epoch": 0.5931682578259229, - "grad_norm": 7.020253658294678, - "learning_rate": 3.284769224251817e-06, - "loss": 0.4729, - "mean_token_accuracy": 0.8489518001675606, - "num_tokens": 230202789.0, - "step": 191350 - }, - { - "entropy": 1.9434513285756112, - "epoch": 0.5931992569509725, - "grad_norm": 8.415234565734863, - "learning_rate": 3.284683395726922e-06, - "loss": 0.4407, - "mean_token_accuracy": 0.8568918332457542, - "num_tokens": 230213774.0, - "step": 191360 - }, - { - "entropy": 1.8855139076709748, - "epoch": 0.5932302560760222, - "grad_norm": 10.343971252441406, - "learning_rate": 3.284597573929578e-06, - "loss": 0.4631, - "mean_token_accuracy": 0.8502753600478172, - "num_tokens": 230225552.0, - "step": 191370 - }, - { - "entropy": 1.8967630013823509, - "epoch": 0.593261255201072, - "grad_norm": 8.149733543395996, - "learning_rate": 3.284511758858906e-06, - "loss": 0.4346, - "mean_token_accuracy": 0.8509375154972076, - "num_tokens": 230237427.0, - "step": 191380 - }, - { - "entropy": 1.7960798382759093, - "epoch": 0.5932922543261216, - "grad_norm": 7.275712966918945, - "learning_rate": 3.2844259505140276e-06, - "loss": 0.3491, - "mean_token_accuracy": 0.8680211365222931, - "num_tokens": 230250950.0, - "step": 191390 - }, - { - "entropy": 1.8419205382466317, - "epoch": 0.5933232534511713, - "grad_norm": 3.7994577884674072, - "learning_rate": 3.284340148894064e-06, - "loss": 0.4106, - "mean_token_accuracy": 0.8641535311937332, - "num_tokens": 230263043.0, - "step": 191400 - }, - { - "entropy": 1.736213345825672, - "epoch": 0.593354252576221, - "grad_norm": 7.778532028198242, - "learning_rate": 3.2842543539981364e-06, - "loss": 0.3567, - "mean_token_accuracy": 0.8742548227310181, - "num_tokens": 230276722.0, - "step": 191410 - }, - { - "entropy": 1.8282306969165802, - "epoch": 0.5933852517012708, - "grad_norm": 9.659984588623047, - "learning_rate": 3.284168565825368e-06, - "loss": 0.4445, - "mean_token_accuracy": 0.8552992671728135, - "num_tokens": 230289280.0, - "step": 191420 - }, - { - "entropy": 1.9203779965639114, - "epoch": 0.5934162508263204, - "grad_norm": 7.255051612854004, - "learning_rate": 3.2840827843748797e-06, - "loss": 0.4823, - "mean_token_accuracy": 0.8464539498090744, - "num_tokens": 230300136.0, - "step": 191430 - }, - { - "entropy": 1.7909793078899383, - "epoch": 0.5934472499513701, - "grad_norm": 3.7259464263916016, - "learning_rate": 3.2839970096457935e-06, - "loss": 0.4302, - "mean_token_accuracy": 0.8664010047912598, - "num_tokens": 230313917.0, - "step": 191440 - }, - { - "entropy": 1.9096889346837997, - "epoch": 0.5934782490764198, - "grad_norm": 9.46761417388916, - "learning_rate": 3.2839112416372327e-06, - "loss": 0.5224, - "mean_token_accuracy": 0.8417908117175102, - "num_tokens": 230324881.0, - "step": 191450 - }, - { - "entropy": 1.9355971276760102, - "epoch": 0.5935092482014696, - "grad_norm": 8.894039154052734, - "learning_rate": 3.2838254803483185e-06, - "loss": 0.4866, - "mean_token_accuracy": 0.8509115308523179, - "num_tokens": 230335923.0, - "step": 191460 - }, - { - "entropy": 1.906863410770893, - "epoch": 0.5935402473265192, - "grad_norm": 7.58919620513916, - "learning_rate": 3.283739725778174e-06, - "loss": 0.4721, - "mean_token_accuracy": 0.8512718111276627, - "num_tokens": 230347235.0, - "step": 191470 - }, - { - "entropy": 1.9059718355536461, - "epoch": 0.5935712464515689, - "grad_norm": 8.579503059387207, - "learning_rate": 3.2836539779259224e-06, - "loss": 0.4622, - "mean_token_accuracy": 0.8537722200155258, - "num_tokens": 230358488.0, - "step": 191480 - }, - { - "entropy": 1.819216100871563, - "epoch": 0.5936022455766186, - "grad_norm": 8.80203914642334, - "learning_rate": 3.2835682367906855e-06, - "loss": 0.3962, - "mean_token_accuracy": 0.8643426224589348, - "num_tokens": 230370597.0, - "step": 191490 - }, - { - "entropy": 1.8791495144367218, - "epoch": 0.5936332447016683, - "grad_norm": 9.201972007751465, - "learning_rate": 3.2834825023715877e-06, - "loss": 0.4697, - "mean_token_accuracy": 0.8502372041344642, - "num_tokens": 230382760.0, - "step": 191500 - }, - { - "entropy": 1.819232840836048, - "epoch": 0.593664243826718, - "grad_norm": 7.602062702178955, - "learning_rate": 3.2833967746677518e-06, - "loss": 0.3809, - "mean_token_accuracy": 0.8715303301811218, - "num_tokens": 230395032.0, - "step": 191510 - }, - { - "entropy": 1.8747132286429404, - "epoch": 0.5936952429517677, - "grad_norm": 4.803540229797363, - "learning_rate": 3.2833110536783016e-06, - "loss": 0.4888, - "mean_token_accuracy": 0.8512809172272682, - "num_tokens": 230406951.0, - "step": 191520 - }, - { - "entropy": 1.8536844834685327, - "epoch": 0.5937262420768173, - "grad_norm": 9.080991744995117, - "learning_rate": 3.2832253394023596e-06, - "loss": 0.4075, - "mean_token_accuracy": 0.8655486524105072, - "num_tokens": 230419249.0, - "step": 191530 - }, - { - "entropy": 1.9990214914083482, - "epoch": 0.5937572412018671, - "grad_norm": 7.607864856719971, - "learning_rate": 3.2831396318390503e-06, - "loss": 0.4952, - "mean_token_accuracy": 0.8464673578739166, - "num_tokens": 230430199.0, - "step": 191540 - }, - { - "entropy": 1.7617909386754036, - "epoch": 0.5937882403269168, - "grad_norm": 3.5683536529541016, - "learning_rate": 3.283053930987497e-06, - "loss": 0.3467, - "mean_token_accuracy": 0.8725787281990052, - "num_tokens": 230443895.0, - "step": 191550 - }, - { - "entropy": 1.9190792486071586, - "epoch": 0.5938192394519665, - "grad_norm": 7.397488594055176, - "learning_rate": 3.2829682368468247e-06, - "loss": 0.4402, - "mean_token_accuracy": 0.854629234969616, - "num_tokens": 230455306.0, - "step": 191560 - }, - { - "entropy": 1.776638814806938, - "epoch": 0.5938502385770161, - "grad_norm": 7.879034042358398, - "learning_rate": 3.282882549416157e-06, - "loss": 0.3757, - "mean_token_accuracy": 0.868142431974411, - "num_tokens": 230468425.0, - "step": 191570 - }, - { - "entropy": 1.872198860347271, - "epoch": 0.5938812377020659, - "grad_norm": 8.502291679382324, - "learning_rate": 3.282796868694618e-06, - "loss": 0.4276, - "mean_token_accuracy": 0.8636263519525528, - "num_tokens": 230479390.0, - "step": 191580 - }, - { - "entropy": 1.8519338369369507, - "epoch": 0.5939122368271156, - "grad_norm": 4.0390543937683105, - "learning_rate": 3.2827111946813322e-06, - "loss": 0.4277, - "mean_token_accuracy": 0.8555379435420036, - "num_tokens": 230491954.0, - "step": 191590 - }, - { - "entropy": 1.8225978150963784, - "epoch": 0.5939432359521652, - "grad_norm": 8.881861686706543, - "learning_rate": 3.282625527375426e-06, - "loss": 0.3955, - "mean_token_accuracy": 0.8637348249554634, - "num_tokens": 230504269.0, - "step": 191600 - }, - { - "entropy": 1.8848466604948044, - "epoch": 0.5939742350772149, - "grad_norm": 7.539337635040283, - "learning_rate": 3.2825398667760223e-06, - "loss": 0.4104, - "mean_token_accuracy": 0.8589835464954376, - "num_tokens": 230516153.0, - "step": 191610 - }, - { - "entropy": 1.9843410074710846, - "epoch": 0.5940052342022646, - "grad_norm": 9.129307746887207, - "learning_rate": 3.282454212882246e-06, - "loss": 0.5621, - "mean_token_accuracy": 0.8272468775510788, - "num_tokens": 230526588.0, - "step": 191620 - }, - { - "entropy": 1.8471686720848084, - "epoch": 0.5940362333273144, - "grad_norm": 7.236810684204102, - "learning_rate": 3.2823685656932235e-06, - "loss": 0.4541, - "mean_token_accuracy": 0.8503222376108169, - "num_tokens": 230538252.0, - "step": 191630 - }, - { - "entropy": 1.848535105586052, - "epoch": 0.594067232452364, - "grad_norm": 9.330757141113281, - "learning_rate": 3.28228292520808e-06, - "loss": 0.4744, - "mean_token_accuracy": 0.8530674621462822, - "num_tokens": 230550476.0, - "step": 191640 - }, - { - "entropy": 1.8827123567461967, - "epoch": 0.5940982315774137, - "grad_norm": 8.938699722290039, - "learning_rate": 3.2821972914259404e-06, - "loss": 0.4394, - "mean_token_accuracy": 0.8543351233005524, - "num_tokens": 230562174.0, - "step": 191650 - }, - { - "entropy": 1.842447192966938, - "epoch": 0.5941292307024634, - "grad_norm": 7.114195346832275, - "learning_rate": 3.2821116643459306e-06, - "loss": 0.409, - "mean_token_accuracy": 0.8618290036916733, - "num_tokens": 230574113.0, - "step": 191660 - }, - { - "entropy": 1.8264191061258317, - "epoch": 0.5941602298275132, - "grad_norm": 3.526878595352173, - "learning_rate": 3.282026043967176e-06, - "loss": 0.3666, - "mean_token_accuracy": 0.8595398098230362, - "num_tokens": 230587543.0, - "step": 191670 - }, - { - "entropy": 1.871898628771305, - "epoch": 0.5941912289525628, - "grad_norm": 3.80003023147583, - "learning_rate": 3.2819404302888037e-06, - "loss": 0.4486, - "mean_token_accuracy": 0.8516824513673782, - "num_tokens": 230600047.0, - "step": 191680 - }, - { - "entropy": 1.87267697006464, - "epoch": 0.5942222280776125, - "grad_norm": 8.057140350341797, - "learning_rate": 3.281854823309938e-06, - "loss": 0.3967, - "mean_token_accuracy": 0.8602946862578392, - "num_tokens": 230612330.0, - "step": 191690 - }, - { - "entropy": 1.819714893400669, - "epoch": 0.5942532272026622, - "grad_norm": 5.560348987579346, - "learning_rate": 3.2817692230297066e-06, - "loss": 0.3805, - "mean_token_accuracy": 0.8662676781415939, - "num_tokens": 230624973.0, - "step": 191700 - }, - { - "entropy": 1.8511559277772904, - "epoch": 0.5942842263277119, - "grad_norm": 10.996221542358398, - "learning_rate": 3.281683629447236e-06, - "loss": 0.4261, - "mean_token_accuracy": 0.8555448532104493, - "num_tokens": 230637494.0, - "step": 191710 - }, - { - "entropy": 1.8133092552423478, - "epoch": 0.5943152254527616, - "grad_norm": 4.265011310577393, - "learning_rate": 3.2815980425616522e-06, - "loss": 0.3555, - "mean_token_accuracy": 0.8670377910137177, - "num_tokens": 230650747.0, - "step": 191720 - }, - { - "entropy": 1.7965379044413567, - "epoch": 0.5943462245778113, - "grad_norm": 4.251617908477783, - "learning_rate": 3.2815124623720825e-06, - "loss": 0.3882, - "mean_token_accuracy": 0.8509297609329224, - "num_tokens": 230663926.0, - "step": 191730 - }, - { - "entropy": 1.8378178104758263, - "epoch": 0.594377223702861, - "grad_norm": 8.80667781829834, - "learning_rate": 3.281426888877653e-06, - "loss": 0.4392, - "mean_token_accuracy": 0.8531530737876892, - "num_tokens": 230676509.0, - "step": 191740 - }, - { - "entropy": 1.9434722900390624, - "epoch": 0.5944082228279107, - "grad_norm": 8.150884628295898, - "learning_rate": 3.2813413220774917e-06, - "loss": 0.4754, - "mean_token_accuracy": 0.849103407561779, - "num_tokens": 230687164.0, - "step": 191750 - }, - { - "entropy": 1.746304377913475, - "epoch": 0.5944392219529604, - "grad_norm": 8.53097152709961, - "learning_rate": 3.2812557619707245e-06, - "loss": 0.4021, - "mean_token_accuracy": 0.8571577414870262, - "num_tokens": 230700490.0, - "step": 191760 - }, - { - "entropy": 1.8631037756800652, - "epoch": 0.5944702210780101, - "grad_norm": 3.721904993057251, - "learning_rate": 3.281170208556481e-06, - "loss": 0.4147, - "mean_token_accuracy": 0.8637348189949989, - "num_tokens": 230712164.0, - "step": 191770 - }, - { - "entropy": 1.8626684874296189, - "epoch": 0.5945012202030597, - "grad_norm": 9.87203311920166, - "learning_rate": 3.2810846618338865e-06, - "loss": 0.4516, - "mean_token_accuracy": 0.8456865876913071, - "num_tokens": 230723576.0, - "step": 191780 - }, - { - "entropy": 1.846021082997322, - "epoch": 0.5945322193281095, - "grad_norm": 8.10145092010498, - "learning_rate": 3.28099912180207e-06, - "loss": 0.4744, - "mean_token_accuracy": 0.84685700237751, - "num_tokens": 230735756.0, - "step": 191790 - }, - { - "entropy": 1.9466848850250245, - "epoch": 0.5945632184531592, - "grad_norm": 8.779047012329102, - "learning_rate": 3.2809135884601595e-06, - "loss": 0.4874, - "mean_token_accuracy": 0.8415557697415352, - "num_tokens": 230747089.0, - "step": 191800 - }, - { - "entropy": 1.8106557488441468, - "epoch": 0.5945942175782089, - "grad_norm": 7.083572864532471, - "learning_rate": 3.2808280618072817e-06, - "loss": 0.3861, - "mean_token_accuracy": 0.8683284193277359, - "num_tokens": 230759971.0, - "step": 191810 - }, - { - "entropy": 1.755371056497097, - "epoch": 0.5946252167032585, - "grad_norm": 4.735435485839844, - "learning_rate": 3.2807425418425672e-06, - "loss": 0.3442, - "mean_token_accuracy": 0.869535180926323, - "num_tokens": 230773520.0, - "step": 191820 - }, - { - "entropy": 1.9257057636976243, - "epoch": 0.5946562158283083, - "grad_norm": 8.006759643554688, - "learning_rate": 3.280657028565142e-06, - "loss": 0.4385, - "mean_token_accuracy": 0.8602747738361358, - "num_tokens": 230784640.0, - "step": 191830 - }, - { - "entropy": 1.904049114882946, - "epoch": 0.594687214953358, - "grad_norm": 8.111559867858887, - "learning_rate": 3.2805715219741352e-06, - "loss": 0.4546, - "mean_token_accuracy": 0.8505662351846695, - "num_tokens": 230795955.0, - "step": 191840 - }, - { - "entropy": 1.7928750395774842, - "epoch": 0.5947182140784076, - "grad_norm": 3.7698416709899902, - "learning_rate": 3.280486022068676e-06, - "loss": 0.3698, - "mean_token_accuracy": 0.8676718294620513, - "num_tokens": 230808581.0, - "step": 191850 - }, - { - "entropy": 1.8332011282444, - "epoch": 0.5947492132034573, - "grad_norm": 9.309735298156738, - "learning_rate": 3.280400528847893e-06, - "loss": 0.4294, - "mean_token_accuracy": 0.8579959213733673, - "num_tokens": 230821353.0, - "step": 191860 - }, - { - "entropy": 1.8673254698514938, - "epoch": 0.594780212328507, - "grad_norm": 3.900421619415283, - "learning_rate": 3.280315042310916e-06, - "loss": 0.4234, - "mean_token_accuracy": 0.8639498367905617, - "num_tokens": 230832972.0, - "step": 191870 - }, - { - "entropy": 1.799913428723812, - "epoch": 0.5948112114535568, - "grad_norm": 3.741468906402588, - "learning_rate": 3.2802295624568725e-06, - "loss": 0.3769, - "mean_token_accuracy": 0.8695779353380203, - "num_tokens": 230845058.0, - "step": 191880 - }, - { - "entropy": 1.8533201590180397, - "epoch": 0.5948422105786064, - "grad_norm": 4.252591133117676, - "learning_rate": 3.2801440892848922e-06, - "loss": 0.4459, - "mean_token_accuracy": 0.8506780445575715, - "num_tokens": 230856930.0, - "step": 191890 - }, - { - "entropy": 1.82045723721385, - "epoch": 0.5948732097036561, - "grad_norm": 9.829673767089844, - "learning_rate": 3.2800586227941063e-06, - "loss": 0.3771, - "mean_token_accuracy": 0.8705439165234565, - "num_tokens": 230868606.0, - "step": 191900 - }, - { - "entropy": 1.9105019956827163, - "epoch": 0.5949042088287058, - "grad_norm": 7.704622268676758, - "learning_rate": 3.279973162983642e-06, - "loss": 0.4722, - "mean_token_accuracy": 0.8548673078417778, - "num_tokens": 230879612.0, - "step": 191910 - }, - { - "entropy": 1.7643791824579238, - "epoch": 0.5949352079537555, - "grad_norm": 8.07160472869873, - "learning_rate": 3.279887709852631e-06, - "loss": 0.3875, - "mean_token_accuracy": 0.8598277315497398, - "num_tokens": 230893181.0, - "step": 191920 - }, - { - "entropy": 1.8020456477999687, - "epoch": 0.5949662070788052, - "grad_norm": 8.120401382446289, - "learning_rate": 3.2798022634002025e-06, - "loss": 0.4261, - "mean_token_accuracy": 0.8596922606229782, - "num_tokens": 230905797.0, - "step": 191930 - }, - { - "entropy": 1.8268871188163758, - "epoch": 0.5949972062038549, - "grad_norm": 8.7583589553833, - "learning_rate": 3.2797168236254867e-06, - "loss": 0.3928, - "mean_token_accuracy": 0.8658729165792465, - "num_tokens": 230917719.0, - "step": 191940 - }, - { - "entropy": 1.8274653524160385, - "epoch": 0.5950282053289045, - "grad_norm": 8.776835441589355, - "learning_rate": 3.2796313905276135e-06, - "loss": 0.4778, - "mean_token_accuracy": 0.8572709947824478, - "num_tokens": 230930520.0, - "step": 191950 - }, - { - "entropy": 1.7833070874214172, - "epoch": 0.5950592044539543, - "grad_norm": 8.960201263427734, - "learning_rate": 3.2795459641057135e-06, - "loss": 0.4131, - "mean_token_accuracy": 0.8575366869568825, - "num_tokens": 230944152.0, - "step": 191960 - }, - { - "entropy": 1.9216564670205116, - "epoch": 0.595090203579004, - "grad_norm": 5.831974506378174, - "learning_rate": 3.2794605443589176e-06, - "loss": 0.4398, - "mean_token_accuracy": 0.8623504295945168, - "num_tokens": 230955827.0, - "step": 191970 - }, - { - "entropy": 1.9013028174638749, - "epoch": 0.5951212027040537, - "grad_norm": 8.19417667388916, - "learning_rate": 3.2793751312863564e-06, - "loss": 0.4694, - "mean_token_accuracy": 0.8450792387127877, - "num_tokens": 230968253.0, - "step": 191980 - }, - { - "entropy": 1.913487869501114, - "epoch": 0.5951522018291033, - "grad_norm": 7.69880485534668, - "learning_rate": 3.2792897248871604e-06, - "loss": 0.4304, - "mean_token_accuracy": 0.8569048374891282, - "num_tokens": 230979258.0, - "step": 191990 - }, - { - "entropy": 1.9196341052651404, - "epoch": 0.5951832009541531, - "grad_norm": 7.564324378967285, - "learning_rate": 3.279204325160461e-06, - "loss": 0.4305, - "mean_token_accuracy": 0.861096502840519, - "num_tokens": 230990249.0, - "step": 192000 - }, - { - "entropy": 1.9277661710977554, - "epoch": 0.5952142000792028, - "grad_norm": 8.377885818481445, - "learning_rate": 3.2791189321053897e-06, - "loss": 0.4539, - "mean_token_accuracy": 0.8573820039629936, - "num_tokens": 231000863.0, - "step": 192010 - }, - { - "entropy": 1.7512979969382285, - "epoch": 0.5952451992042525, - "grad_norm": 4.907512187957764, - "learning_rate": 3.2790335457210778e-06, - "loss": 0.3723, - "mean_token_accuracy": 0.86670580804348, - "num_tokens": 231014360.0, - "step": 192020 - }, - { - "entropy": 1.8931189358234406, - "epoch": 0.5952761983293021, - "grad_norm": 7.5218377113342285, - "learning_rate": 3.2789481660066556e-06, - "loss": 0.4542, - "mean_token_accuracy": 0.8527333214879036, - "num_tokens": 231025991.0, - "step": 192030 - }, - { - "entropy": 1.8755558401346206, - "epoch": 0.5953071974543519, - "grad_norm": 7.976413726806641, - "learning_rate": 3.2788627929612564e-06, - "loss": 0.4795, - "mean_token_accuracy": 0.8503453806042671, - "num_tokens": 231037580.0, - "step": 192040 - }, - { - "entropy": 1.9751273036003112, - "epoch": 0.5953381965794016, - "grad_norm": 8.17392349243164, - "learning_rate": 3.2787774265840118e-06, - "loss": 0.5357, - "mean_token_accuracy": 0.8392359703779221, - "num_tokens": 231048287.0, - "step": 192050 - }, - { - "entropy": 1.8563969075679778, - "epoch": 0.5953691957044512, - "grad_norm": 8.1824951171875, - "learning_rate": 3.2786920668740523e-06, - "loss": 0.3955, - "mean_token_accuracy": 0.8573218896985054, - "num_tokens": 231060684.0, - "step": 192060 - }, - { - "entropy": 1.8792769759893417, - "epoch": 0.5954001948295009, - "grad_norm": 8.142267227172852, - "learning_rate": 3.2786067138305128e-06, - "loss": 0.4212, - "mean_token_accuracy": 0.8620775952935219, - "num_tokens": 231072491.0, - "step": 192070 - }, - { - "entropy": 1.8542990058660507, - "epoch": 0.5954311939545507, - "grad_norm": 4.228769779205322, - "learning_rate": 3.278521367452523e-06, - "loss": 0.4365, - "mean_token_accuracy": 0.8524388536810875, - "num_tokens": 231084915.0, - "step": 192080 - }, - { - "entropy": 1.8490941271185874, - "epoch": 0.5954621930796004, - "grad_norm": 6.09950590133667, - "learning_rate": 3.2784360277392156e-06, - "loss": 0.4326, - "mean_token_accuracy": 0.8661493092775345, - "num_tokens": 231097243.0, - "step": 192090 - }, - { - "entropy": 1.8824499115347861, - "epoch": 0.59549319220465, - "grad_norm": 10.269917488098145, - "learning_rate": 3.2783506946897257e-06, - "loss": 0.4303, - "mean_token_accuracy": 0.8613364815711975, - "num_tokens": 231109082.0, - "step": 192100 - }, - { - "entropy": 1.8853991687297822, - "epoch": 0.5955241913296997, - "grad_norm": 8.798211097717285, - "learning_rate": 3.2782653683031833e-06, - "loss": 0.467, - "mean_token_accuracy": 0.851449416577816, - "num_tokens": 231121501.0, - "step": 192110 - }, - { - "entropy": 1.8421639442443847, - "epoch": 0.5955551904547494, - "grad_norm": 9.831924438476562, - "learning_rate": 3.278180048578723e-06, - "loss": 0.3744, - "mean_token_accuracy": 0.8697717979550361, - "num_tokens": 231133864.0, - "step": 192120 - }, - { - "entropy": 1.9034344598650932, - "epoch": 0.5955861895797991, - "grad_norm": 8.559617042541504, - "learning_rate": 3.2780947355154772e-06, - "loss": 0.4635, - "mean_token_accuracy": 0.8431790366768837, - "num_tokens": 231145502.0, - "step": 192130 - }, - { - "entropy": 1.927782055735588, - "epoch": 0.5956171887048488, - "grad_norm": 7.649790287017822, - "learning_rate": 3.2780094291125804e-06, - "loss": 0.471, - "mean_token_accuracy": 0.8572627499699592, - "num_tokens": 231156738.0, - "step": 192140 - }, - { - "entropy": 1.7562423720955849, - "epoch": 0.5956481878298985, - "grad_norm": 3.492551326751709, - "learning_rate": 3.277924129369164e-06, - "loss": 0.399, - "mean_token_accuracy": 0.8618337422609329, - "num_tokens": 231170841.0, - "step": 192150 - }, - { - "entropy": 1.8300852000713348, - "epoch": 0.5956791869549481, - "grad_norm": 4.240060806274414, - "learning_rate": 3.277838836284362e-06, - "loss": 0.3706, - "mean_token_accuracy": 0.8771671429276466, - "num_tokens": 231182659.0, - "step": 192160 - }, - { - "entropy": 1.8263467103242874, - "epoch": 0.5957101860799979, - "grad_norm": 4.0324788093566895, - "learning_rate": 3.2777535498573097e-06, - "loss": 0.4734, - "mean_token_accuracy": 0.8442503765225411, - "num_tokens": 231195832.0, - "step": 192170 - }, - { - "entropy": 1.8414695873856544, - "epoch": 0.5957411852050476, - "grad_norm": 8.079511642456055, - "learning_rate": 3.2776682700871396e-06, - "loss": 0.3827, - "mean_token_accuracy": 0.8633176237344742, - "num_tokens": 231208416.0, - "step": 192180 - }, - { - "entropy": 1.9129913032054902, - "epoch": 0.5957721843300973, - "grad_norm": 7.3883585929870605, - "learning_rate": 3.2775829969729867e-06, - "loss": 0.4703, - "mean_token_accuracy": 0.859006418287754, - "num_tokens": 231219018.0, - "step": 192190 - }, - { - "entropy": 1.9273998066782951, - "epoch": 0.5958031834551469, - "grad_norm": 7.80372953414917, - "learning_rate": 3.2774977305139842e-06, - "loss": 0.4364, - "mean_token_accuracy": 0.8586554944515228, - "num_tokens": 231230542.0, - "step": 192200 - }, - { - "entropy": 1.9060659855604172, - "epoch": 0.5958341825801967, - "grad_norm": 8.776111602783203, - "learning_rate": 3.2774124707092676e-06, - "loss": 0.4366, - "mean_token_accuracy": 0.8529020965099334, - "num_tokens": 231242139.0, - "step": 192210 - }, - { - "entropy": 1.8044569581747054, - "epoch": 0.5958651817052464, - "grad_norm": 8.237578392028809, - "learning_rate": 3.2773272175579697e-06, - "loss": 0.4111, - "mean_token_accuracy": 0.8572305858135223, - "num_tokens": 231255179.0, - "step": 192220 - }, - { - "entropy": 1.8218290351331234, - "epoch": 0.595896180830296, - "grad_norm": 9.003937721252441, - "learning_rate": 3.2772419710592275e-06, - "loss": 0.4043, - "mean_token_accuracy": 0.8563547030091285, - "num_tokens": 231268401.0, - "step": 192230 - }, - { - "entropy": 1.8903609573841096, - "epoch": 0.5959271799553457, - "grad_norm": 7.921415328979492, - "learning_rate": 3.2771567312121737e-06, - "loss": 0.4608, - "mean_token_accuracy": 0.8533718347549438, - "num_tokens": 231279496.0, - "step": 192240 - }, - { - "entropy": 1.9152607202529908, - "epoch": 0.5959581790803955, - "grad_norm": 6.411677837371826, - "learning_rate": 3.277071498015945e-06, - "loss": 0.4684, - "mean_token_accuracy": 0.8491850972175599, - "num_tokens": 231290709.0, - "step": 192250 - }, - { - "entropy": 1.9035677805542945, - "epoch": 0.5959891782054452, - "grad_norm": 9.079903602600098, - "learning_rate": 3.2769862714696755e-06, - "loss": 0.4571, - "mean_token_accuracy": 0.8487890064716339, - "num_tokens": 231302222.0, - "step": 192260 - }, - { - "entropy": 1.9371833622455596, - "epoch": 0.5960201773304948, - "grad_norm": 7.364938259124756, - "learning_rate": 3.2769010515725007e-06, - "loss": 0.4713, - "mean_token_accuracy": 0.850156307220459, - "num_tokens": 231313394.0, - "step": 192270 - }, - { - "entropy": 1.8407243877649306, - "epoch": 0.5960511764555445, - "grad_norm": 8.76639175415039, - "learning_rate": 3.276815838323557e-06, - "loss": 0.4282, - "mean_token_accuracy": 0.8598857238888741, - "num_tokens": 231325783.0, - "step": 192280 - }, - { - "entropy": 1.8932856723666192, - "epoch": 0.5960821755805943, - "grad_norm": 8.80115795135498, - "learning_rate": 3.276730631721979e-06, - "loss": 0.4519, - "mean_token_accuracy": 0.8356119051575661, - "num_tokens": 231337818.0, - "step": 192290 - }, - { - "entropy": 1.7974711254239082, - "epoch": 0.596113174705644, - "grad_norm": 4.231179714202881, - "learning_rate": 3.2766454317669018e-06, - "loss": 0.349, - "mean_token_accuracy": 0.8690388962626457, - "num_tokens": 231350995.0, - "step": 192300 - }, - { - "entropy": 1.8709260776638985, - "epoch": 0.5961441738306936, - "grad_norm": 6.723526477813721, - "learning_rate": 3.2765602384574635e-06, - "loss": 0.4299, - "mean_token_accuracy": 0.8658482566475868, - "num_tokens": 231363054.0, - "step": 192310 - }, - { - "entropy": 1.8911598384380341, - "epoch": 0.5961751729557433, - "grad_norm": 7.90709924697876, - "learning_rate": 3.276475051792798e-06, - "loss": 0.469, - "mean_token_accuracy": 0.8582043945789337, - "num_tokens": 231374130.0, - "step": 192320 - }, - { - "entropy": 1.8854842871427535, - "epoch": 0.5962061720807931, - "grad_norm": 8.477952003479004, - "learning_rate": 3.2763898717720433e-06, - "loss": 0.436, - "mean_token_accuracy": 0.8625700756907463, - "num_tokens": 231385137.0, - "step": 192330 - }, - { - "entropy": 1.8439275979995728, - "epoch": 0.5962371712058427, - "grad_norm": 8.775875091552734, - "learning_rate": 3.2763046983943347e-06, - "loss": 0.4485, - "mean_token_accuracy": 0.8571325197815896, - "num_tokens": 231397818.0, - "step": 192340 - }, - { - "entropy": 1.8874992370605468, - "epoch": 0.5962681703308924, - "grad_norm": 8.183721542358398, - "learning_rate": 3.276219531658809e-06, - "loss": 0.4692, - "mean_token_accuracy": 0.8506439417600632, - "num_tokens": 231409817.0, - "step": 192350 - }, - { - "entropy": 1.886862662434578, - "epoch": 0.5962991694559421, - "grad_norm": 9.884288787841797, - "learning_rate": 3.276134371564604e-06, - "loss": 0.4771, - "mean_token_accuracy": 0.8503222018480301, - "num_tokens": 231421405.0, - "step": 192360 - }, - { - "entropy": 1.9534014105796813, - "epoch": 0.5963301685809917, - "grad_norm": 9.145366668701172, - "learning_rate": 3.2760492181108543e-06, - "loss": 0.5263, - "mean_token_accuracy": 0.8373357936739921, - "num_tokens": 231432345.0, - "step": 192370 - }, - { - "entropy": 1.8393455937504768, - "epoch": 0.5963611677060415, - "grad_norm": 7.934825420379639, - "learning_rate": 3.2759640712966994e-06, - "loss": 0.3978, - "mean_token_accuracy": 0.8589689537882805, - "num_tokens": 231444501.0, - "step": 192380 - }, - { - "entropy": 1.7986860394477844, - "epoch": 0.5963921668310912, - "grad_norm": 9.304616928100586, - "learning_rate": 3.2758789311212754e-06, - "loss": 0.3973, - "mean_token_accuracy": 0.8577401593327523, - "num_tokens": 231457103.0, - "step": 192390 - }, - { - "entropy": 1.9137774541974069, - "epoch": 0.5964231659561409, - "grad_norm": 9.727755546569824, - "learning_rate": 3.2757937975837195e-06, - "loss": 0.495, - "mean_token_accuracy": 0.8480470091104507, - "num_tokens": 231468352.0, - "step": 192400 - }, - { - "entropy": 1.9033278197050094, - "epoch": 0.5964541650811905, - "grad_norm": 8.264392852783203, - "learning_rate": 3.275708670683169e-06, - "loss": 0.412, - "mean_token_accuracy": 0.8584694027900696, - "num_tokens": 231479992.0, - "step": 192410 - }, - { - "entropy": 1.936842668056488, - "epoch": 0.5964851642062403, - "grad_norm": 8.501169204711914, - "learning_rate": 3.275623550418762e-06, - "loss": 0.4878, - "mean_token_accuracy": 0.8506403654813767, - "num_tokens": 231491144.0, - "step": 192420 - }, - { - "entropy": 1.8134740129113198, - "epoch": 0.59651616333129, - "grad_norm": 9.514694213867188, - "learning_rate": 3.2755384367896375e-06, - "loss": 0.4056, - "mean_token_accuracy": 0.8578062415122986, - "num_tokens": 231503499.0, - "step": 192430 - }, - { - "entropy": 1.8479931145906447, - "epoch": 0.5965471624563397, - "grad_norm": 8.95875072479248, - "learning_rate": 3.275453329794932e-06, - "loss": 0.3958, - "mean_token_accuracy": 0.8631100043654442, - "num_tokens": 231515536.0, - "step": 192440 - }, - { - "entropy": 1.8025830656290054, - "epoch": 0.5965781615813893, - "grad_norm": 3.1973462104797363, - "learning_rate": 3.2753682294337836e-06, - "loss": 0.3814, - "mean_token_accuracy": 0.8669339135289192, - "num_tokens": 231528253.0, - "step": 192450 - }, - { - "entropy": 1.9666550129652023, - "epoch": 0.5966091607064391, - "grad_norm": 7.123218059539795, - "learning_rate": 3.2752831357053304e-06, - "loss": 0.5224, - "mean_token_accuracy": 0.8414567679166793, - "num_tokens": 231539229.0, - "step": 192460 - }, - { - "entropy": 1.8350554794073104, - "epoch": 0.5966401598314888, - "grad_norm": 9.063071250915527, - "learning_rate": 3.2751980486087127e-06, - "loss": 0.4336, - "mean_token_accuracy": 0.8557635545730591, - "num_tokens": 231551262.0, - "step": 192470 - }, - { - "entropy": 1.84403538107872, - "epoch": 0.5966711589565384, - "grad_norm": 7.305988788604736, - "learning_rate": 3.275112968143067e-06, - "loss": 0.4092, - "mean_token_accuracy": 0.8625670328736306, - "num_tokens": 231563485.0, - "step": 192480 - }, - { - "entropy": 1.929225890338421, - "epoch": 0.5967021580815881, - "grad_norm": 8.14575481414795, - "learning_rate": 3.2750278943075337e-06, - "loss": 0.4446, - "mean_token_accuracy": 0.8594361409544945, - "num_tokens": 231574806.0, - "step": 192490 - }, - { - "entropy": 1.9129980459809304, - "epoch": 0.5967331572066379, - "grad_norm": 7.192739009857178, - "learning_rate": 3.2749428271012503e-06, - "loss": 0.4545, - "mean_token_accuracy": 0.8492211416363716, - "num_tokens": 231586108.0, - "step": 192500 - }, - { - "entropy": 1.7690053448081016, - "epoch": 0.5967641563316876, - "grad_norm": 3.324474573135376, - "learning_rate": 3.274857766523356e-06, - "loss": 0.3511, - "mean_token_accuracy": 0.8734697043895722, - "num_tokens": 231599945.0, - "step": 192510 - }, - { - "entropy": 1.8933797150850296, - "epoch": 0.5967951554567372, - "grad_norm": 7.506561756134033, - "learning_rate": 3.2747727125729916e-06, - "loss": 0.4465, - "mean_token_accuracy": 0.8573498860001564, - "num_tokens": 231611932.0, - "step": 192520 - }, - { - "entropy": 1.9005317762494087, - "epoch": 0.5968261545817869, - "grad_norm": 8.153799057006836, - "learning_rate": 3.2746876652492956e-06, - "loss": 0.475, - "mean_token_accuracy": 0.8526677310466766, - "num_tokens": 231623838.0, - "step": 192530 - }, - { - "entropy": 1.9433410674333573, - "epoch": 0.5968571537068367, - "grad_norm": 7.4116530418396, - "learning_rate": 3.2746026245514067e-06, - "loss": 0.5235, - "mean_token_accuracy": 0.8457577064633369, - "num_tokens": 231634466.0, - "step": 192540 - }, - { - "entropy": 1.8406395971775056, - "epoch": 0.5968881528318863, - "grad_norm": 8.905058860778809, - "learning_rate": 3.274517590478466e-06, - "loss": 0.3864, - "mean_token_accuracy": 0.8675936937332154, - "num_tokens": 231646117.0, - "step": 192550 - }, - { - "entropy": 1.8168965771794319, - "epoch": 0.596919151956936, - "grad_norm": 7.075071811676025, - "learning_rate": 3.2744325630296127e-06, - "loss": 0.3893, - "mean_token_accuracy": 0.8673455089330673, - "num_tokens": 231658932.0, - "step": 192560 - }, - { - "entropy": 1.854390124976635, - "epoch": 0.5969501510819857, - "grad_norm": 7.545094966888428, - "learning_rate": 3.2743475422039867e-06, - "loss": 0.4227, - "mean_token_accuracy": 0.8597999036312103, - "num_tokens": 231670845.0, - "step": 192570 - }, - { - "entropy": 1.9084011524915696, - "epoch": 0.5969811502070355, - "grad_norm": 8.939079284667969, - "learning_rate": 3.2742625280007286e-06, - "loss": 0.4572, - "mean_token_accuracy": 0.8635304853320122, - "num_tokens": 231681984.0, - "step": 192580 - }, - { - "entropy": 1.7502815291285514, - "epoch": 0.5970121493320851, - "grad_norm": 3.648104667663574, - "learning_rate": 3.2741775204189778e-06, - "loss": 0.3869, - "mean_token_accuracy": 0.8664402410387992, - "num_tokens": 231695315.0, - "step": 192590 - }, - { - "entropy": 1.915536816418171, - "epoch": 0.5970431484571348, - "grad_norm": 7.727672100067139, - "learning_rate": 3.274092519457876e-06, - "loss": 0.5206, - "mean_token_accuracy": 0.827784538269043, - "num_tokens": 231707239.0, - "step": 192600 - }, - { - "entropy": 1.869948796927929, - "epoch": 0.5970741475821845, - "grad_norm": 8.682625770568848, - "learning_rate": 3.274007525116563e-06, - "loss": 0.4365, - "mean_token_accuracy": 0.8611716479063034, - "num_tokens": 231719236.0, - "step": 192610 - }, - { - "entropy": 1.9523400247097016, - "epoch": 0.5971051467072341, - "grad_norm": 9.861188888549805, - "learning_rate": 3.2739225373941804e-06, - "loss": 0.4744, - "mean_token_accuracy": 0.8510428488254547, - "num_tokens": 231729734.0, - "step": 192620 - }, - { - "entropy": 1.8546181842684746, - "epoch": 0.5971361458322839, - "grad_norm": 8.405597686767578, - "learning_rate": 3.2738375562898684e-06, - "loss": 0.4359, - "mean_token_accuracy": 0.8611133188009262, - "num_tokens": 231741463.0, - "step": 192630 - }, - { - "entropy": 1.8450936824083328, - "epoch": 0.5971671449573336, - "grad_norm": 7.496282577514648, - "learning_rate": 3.273752581802769e-06, - "loss": 0.4426, - "mean_token_accuracy": 0.8597393468022346, - "num_tokens": 231753316.0, - "step": 192640 - }, - { - "entropy": 1.8092393189668656, - "epoch": 0.5971981440823833, - "grad_norm": 8.165148735046387, - "learning_rate": 3.2736676139320224e-06, - "loss": 0.4095, - "mean_token_accuracy": 0.8632441446185112, - "num_tokens": 231766156.0, - "step": 192650 - }, - { - "entropy": 1.9509679853916169, - "epoch": 0.5972291432074329, - "grad_norm": 10.11458969116211, - "learning_rate": 3.2735826526767705e-06, - "loss": 0.4703, - "mean_token_accuracy": 0.8565096750855445, - "num_tokens": 231777012.0, - "step": 192660 - }, - { - "entropy": 1.912872165441513, - "epoch": 0.5972601423324827, - "grad_norm": 8.525450706481934, - "learning_rate": 3.2734976980361547e-06, - "loss": 0.5026, - "mean_token_accuracy": 0.84669349193573, - "num_tokens": 231787575.0, - "step": 192670 - }, - { - "entropy": 1.7871014818549156, - "epoch": 0.5972911414575324, - "grad_norm": 8.310676574707031, - "learning_rate": 3.2734127500093175e-06, - "loss": 0.3778, - "mean_token_accuracy": 0.8582663521170616, - "num_tokens": 231800421.0, - "step": 192680 - }, - { - "entropy": 1.9158703058958053, - "epoch": 0.597322140582582, - "grad_norm": 8.641186714172363, - "learning_rate": 3.2733278085954e-06, - "loss": 0.4326, - "mean_token_accuracy": 0.8641590908169746, - "num_tokens": 231811362.0, - "step": 192690 - }, - { - "entropy": 1.8585405111312867, - "epoch": 0.5973531397076317, - "grad_norm": 9.843905448913574, - "learning_rate": 3.2732428737935443e-06, - "loss": 0.481, - "mean_token_accuracy": 0.8423675552010537, - "num_tokens": 231823920.0, - "step": 192700 - }, - { - "entropy": 1.9212913721799851, - "epoch": 0.5973841388326815, - "grad_norm": 7.2393341064453125, - "learning_rate": 3.273157945602893e-06, - "loss": 0.4341, - "mean_token_accuracy": 0.8591304495930672, - "num_tokens": 231834874.0, - "step": 192710 - }, - { - "entropy": 1.9177624106407165, - "epoch": 0.5974151379577312, - "grad_norm": 7.788320064544678, - "learning_rate": 3.2730730240225896e-06, - "loss": 0.4679, - "mean_token_accuracy": 0.8575213506817818, - "num_tokens": 231845743.0, - "step": 192720 - }, - { - "entropy": 1.750645785033703, - "epoch": 0.5974461370827808, - "grad_norm": 8.398082733154297, - "learning_rate": 3.2729881090517734e-06, - "loss": 0.346, - "mean_token_accuracy": 0.8724553629755973, - "num_tokens": 231858233.0, - "step": 192730 - }, - { - "entropy": 1.8802256792783738, - "epoch": 0.5974771362078305, - "grad_norm": 7.55579948425293, - "learning_rate": 3.27290320068959e-06, - "loss": 0.41, - "mean_token_accuracy": 0.8634855419397354, - "num_tokens": 231870089.0, - "step": 192740 - }, - { - "entropy": 1.8600758731365203, - "epoch": 0.5975081353328803, - "grad_norm": 7.949321746826172, - "learning_rate": 3.272818298935181e-06, - "loss": 0.4434, - "mean_token_accuracy": 0.8537335246801376, - "num_tokens": 231881937.0, - "step": 192750 - }, - { - "entropy": 1.931334674358368, - "epoch": 0.5975391344579299, - "grad_norm": 6.258232593536377, - "learning_rate": 3.2727334037876896e-06, - "loss": 0.4986, - "mean_token_accuracy": 0.8499545440077781, - "num_tokens": 231892789.0, - "step": 192760 - }, - { - "entropy": 1.9782539188861847, - "epoch": 0.5975701335829796, - "grad_norm": 8.786333084106445, - "learning_rate": 3.2726485152462585e-06, - "loss": 0.4957, - "mean_token_accuracy": 0.84947439879179, - "num_tokens": 231903402.0, - "step": 192770 - }, - { - "entropy": 1.9479021221399306, - "epoch": 0.5976011327080293, - "grad_norm": 11.152770042419434, - "learning_rate": 3.2725636333100318e-06, - "loss": 0.5102, - "mean_token_accuracy": 0.8418626189231873, - "num_tokens": 231914878.0, - "step": 192780 - }, - { - "entropy": 1.870383796095848, - "epoch": 0.5976321318330791, - "grad_norm": 9.35821533203125, - "learning_rate": 3.272478757978153e-06, - "loss": 0.4081, - "mean_token_accuracy": 0.8648263871669769, - "num_tokens": 231927587.0, - "step": 192790 - }, - { - "entropy": 1.9339314162731172, - "epoch": 0.5976631309581287, - "grad_norm": 12.332832336425781, - "learning_rate": 3.272393889249766e-06, - "loss": 0.48, - "mean_token_accuracy": 0.8536783277988433, - "num_tokens": 231939020.0, - "step": 192800 - }, - { - "entropy": 1.867577140033245, - "epoch": 0.5976941300831784, - "grad_norm": 3.920532703399658, - "learning_rate": 3.2723090271240126e-06, - "loss": 0.4298, - "mean_token_accuracy": 0.8560645163059235, - "num_tokens": 231951443.0, - "step": 192810 - }, - { - "entropy": 1.874250377714634, - "epoch": 0.5977251292082281, - "grad_norm": 11.359745979309082, - "learning_rate": 3.272224171600038e-06, - "loss": 0.4143, - "mean_token_accuracy": 0.8662922665476799, - "num_tokens": 231963672.0, - "step": 192820 - }, - { - "entropy": 1.8122723251581192, - "epoch": 0.5977561283332778, - "grad_norm": 4.158001899719238, - "learning_rate": 3.2721393226769865e-06, - "loss": 0.3682, - "mean_token_accuracy": 0.8700658664107322, - "num_tokens": 231976624.0, - "step": 192830 - }, - { - "entropy": 1.8762271001935005, - "epoch": 0.5977871274583275, - "grad_norm": 5.985019207000732, - "learning_rate": 3.2720544803540026e-06, - "loss": 0.4151, - "mean_token_accuracy": 0.8615304440259933, - "num_tokens": 231988561.0, - "step": 192840 - }, - { - "entropy": 1.948697453737259, - "epoch": 0.5978181265833772, - "grad_norm": 3.944962978363037, - "learning_rate": 3.27196964463023e-06, - "loss": 0.478, - "mean_token_accuracy": 0.8495228439569473, - "num_tokens": 231999874.0, - "step": 192850 - }, - { - "entropy": 1.8740996643900871, - "epoch": 0.5978491257084269, - "grad_norm": 9.939412117004395, - "learning_rate": 3.2718848155048135e-06, - "loss": 0.4362, - "mean_token_accuracy": 0.8526767179369926, - "num_tokens": 232012109.0, - "step": 192860 - }, - { - "entropy": 1.8960766091942787, - "epoch": 0.5978801248334765, - "grad_norm": 8.182035446166992, - "learning_rate": 3.271799992976898e-06, - "loss": 0.4289, - "mean_token_accuracy": 0.8591491937637329, - "num_tokens": 232023642.0, - "step": 192870 - }, - { - "entropy": 1.9175554364919662, - "epoch": 0.5979111239585263, - "grad_norm": 8.83017635345459, - "learning_rate": 3.2717151770456274e-06, - "loss": 0.4514, - "mean_token_accuracy": 0.8541746884584427, - "num_tokens": 232035340.0, - "step": 192880 - }, - { - "entropy": 1.8813014537096024, - "epoch": 0.597942123083576, - "grad_norm": 8.69917106628418, - "learning_rate": 3.271630367710148e-06, - "loss": 0.4102, - "mean_token_accuracy": 0.8582179397344589, - "num_tokens": 232047284.0, - "step": 192890 - }, - { - "entropy": 1.9298100471496582, - "epoch": 0.5979731222086256, - "grad_norm": 7.181746006011963, - "learning_rate": 3.271545564969604e-06, - "loss": 0.4514, - "mean_token_accuracy": 0.8552373722195625, - "num_tokens": 232058105.0, - "step": 192900 - }, - { - "entropy": 1.835771170258522, - "epoch": 0.5980041213336753, - "grad_norm": 8.301854133605957, - "learning_rate": 3.2714607688231415e-06, - "loss": 0.4252, - "mean_token_accuracy": 0.8567116022109985, - "num_tokens": 232070445.0, - "step": 192910 - }, - { - "entropy": 1.803771485388279, - "epoch": 0.5980351204587251, - "grad_norm": 3.411482095718384, - "learning_rate": 3.2713759792699057e-06, - "loss": 0.3896, - "mean_token_accuracy": 0.8670433759689331, - "num_tokens": 232083201.0, - "step": 192920 - }, - { - "entropy": 1.8091656923294068, - "epoch": 0.5980661195837748, - "grad_norm": 3.935356378555298, - "learning_rate": 3.2712911963090414e-06, - "loss": 0.4069, - "mean_token_accuracy": 0.8612916901707649, - "num_tokens": 232095640.0, - "step": 192930 - }, - { - "entropy": 1.7774908185005187, - "epoch": 0.5980971187088244, - "grad_norm": 4.483081340789795, - "learning_rate": 3.271206419939695e-06, - "loss": 0.339, - "mean_token_accuracy": 0.8757447093725205, - "num_tokens": 232108961.0, - "step": 192940 - }, - { - "entropy": 1.857834528386593, - "epoch": 0.5981281178338741, - "grad_norm": 13.082944869995117, - "learning_rate": 3.2711216501610145e-06, - "loss": 0.4389, - "mean_token_accuracy": 0.8536162137985229, - "num_tokens": 232120922.0, - "step": 192950 - }, - { - "entropy": 1.9390349090099335, - "epoch": 0.5981591169589239, - "grad_norm": 7.945557594299316, - "learning_rate": 3.271036886972142e-06, - "loss": 0.4603, - "mean_token_accuracy": 0.854901310801506, - "num_tokens": 232132214.0, - "step": 192960 - }, - { - "entropy": 1.9223468393087386, - "epoch": 0.5981901160839735, - "grad_norm": 8.340877532958984, - "learning_rate": 3.2709521303722264e-06, - "loss": 0.4912, - "mean_token_accuracy": 0.8535726457834244, - "num_tokens": 232143217.0, - "step": 192970 - }, - { - "entropy": 1.8658886596560478, - "epoch": 0.5982211152090232, - "grad_norm": 9.085555076599121, - "learning_rate": 3.2708673803604135e-06, - "loss": 0.4568, - "mean_token_accuracy": 0.8573013827204704, - "num_tokens": 232155836.0, - "step": 192980 - }, - { - "entropy": 1.7965003371238708, - "epoch": 0.5982521143340729, - "grad_norm": 8.895267486572266, - "learning_rate": 3.2707826369358496e-06, - "loss": 0.4072, - "mean_token_accuracy": 0.856090834736824, - "num_tokens": 232168724.0, - "step": 192990 - }, - { - "entropy": 1.8233628869056702, - "epoch": 0.5982831134591227, - "grad_norm": 3.5606913566589355, - "learning_rate": 3.2706979000976823e-06, - "loss": 0.4248, - "mean_token_accuracy": 0.8582077980041504, - "num_tokens": 232180804.0, - "step": 193000 - }, - { - "entropy": 1.8934690535068512, - "epoch": 0.5983141125841723, - "grad_norm": 9.63917064666748, - "learning_rate": 3.270613169845057e-06, - "loss": 0.4814, - "mean_token_accuracy": 0.846235491335392, - "num_tokens": 232192284.0, - "step": 193010 - }, - { - "entropy": 1.8537084929645062, - "epoch": 0.598345111709222, - "grad_norm": 3.254624605178833, - "learning_rate": 3.2705284461771226e-06, - "loss": 0.3885, - "mean_token_accuracy": 0.864462012052536, - "num_tokens": 232204849.0, - "step": 193020 - }, - { - "entropy": 1.849136011302471, - "epoch": 0.5983761108342717, - "grad_norm": 7.761828422546387, - "learning_rate": 3.2704437290930247e-06, - "loss": 0.4467, - "mean_token_accuracy": 0.8548069670796394, - "num_tokens": 232216877.0, - "step": 193030 - }, - { - "entropy": 1.8947343826293945, - "epoch": 0.5984071099593214, - "grad_norm": 7.728636264801025, - "learning_rate": 3.270359018591911e-06, - "loss": 0.436, - "mean_token_accuracy": 0.8585354000329971, - "num_tokens": 232228648.0, - "step": 193040 - }, - { - "entropy": 1.8421491459012032, - "epoch": 0.5984381090843711, - "grad_norm": 1.6748687028884888, - "learning_rate": 3.270274314672929e-06, - "loss": 0.4616, - "mean_token_accuracy": 0.8583970040082931, - "num_tokens": 232241457.0, - "step": 193050 - }, - { - "entropy": 1.9403438106179238, - "epoch": 0.5984691082094208, - "grad_norm": 7.910982131958008, - "learning_rate": 3.2701896173352277e-06, - "loss": 0.523, - "mean_token_accuracy": 0.842846755683422, - "num_tokens": 232252793.0, - "step": 193060 - }, - { - "entropy": 1.8780175030231476, - "epoch": 0.5985001073344705, - "grad_norm": 8.453100204467773, - "learning_rate": 3.2701049265779523e-06, - "loss": 0.3958, - "mean_token_accuracy": 0.866106778383255, - "num_tokens": 232265169.0, - "step": 193070 - }, - { - "entropy": 1.779895555973053, - "epoch": 0.5985311064595202, - "grad_norm": 5.00262975692749, - "learning_rate": 3.2700202424002527e-06, - "loss": 0.4239, - "mean_token_accuracy": 0.8611460939049721, - "num_tokens": 232278699.0, - "step": 193080 - }, - { - "entropy": 1.8304154217243194, - "epoch": 0.5985621055845699, - "grad_norm": 7.103408336639404, - "learning_rate": 3.2699355648012763e-06, - "loss": 0.3709, - "mean_token_accuracy": 0.8728136703372001, - "num_tokens": 232291597.0, - "step": 193090 - }, - { - "entropy": 1.923523449897766, - "epoch": 0.5985931047096196, - "grad_norm": 8.71379280090332, - "learning_rate": 3.2698508937801714e-06, - "loss": 0.4458, - "mean_token_accuracy": 0.8502437174320221, - "num_tokens": 232303268.0, - "step": 193100 - }, - { - "entropy": 1.9225520223379136, - "epoch": 0.5986241038346692, - "grad_norm": 8.151899337768555, - "learning_rate": 3.2697662293360872e-06, - "loss": 0.4667, - "mean_token_accuracy": 0.8552662536501885, - "num_tokens": 232314225.0, - "step": 193110 - }, - { - "entropy": 1.871592454612255, - "epoch": 0.5986551029597189, - "grad_norm": 7.62418270111084, - "learning_rate": 3.2696815714681707e-06, - "loss": 0.4087, - "mean_token_accuracy": 0.8585105463862419, - "num_tokens": 232326953.0, - "step": 193120 - }, - { - "entropy": 1.8937001720070838, - "epoch": 0.5986861020847687, - "grad_norm": 8.169354438781738, - "learning_rate": 3.2695969201755716e-06, - "loss": 0.428, - "mean_token_accuracy": 0.8637664049863816, - "num_tokens": 232338445.0, - "step": 193130 - }, - { - "entropy": 1.9185884833335876, - "epoch": 0.5987171012098184, - "grad_norm": 9.113455772399902, - "learning_rate": 3.2695122754574392e-06, - "loss": 0.4388, - "mean_token_accuracy": 0.8630929663777351, - "num_tokens": 232349797.0, - "step": 193140 - }, - { - "entropy": 1.676446057856083, - "epoch": 0.598748100334868, - "grad_norm": 6.821518898010254, - "learning_rate": 3.269427637312922e-06, - "loss": 0.2483, - "mean_token_accuracy": 0.8841329157352448, - "num_tokens": 232364387.0, - "step": 193150 - }, - { - "entropy": 1.8946543186903, - "epoch": 0.5987790994599177, - "grad_norm": 7.599132061004639, - "learning_rate": 3.269343005741169e-06, - "loss": 0.4098, - "mean_token_accuracy": 0.857475508749485, - "num_tokens": 232376474.0, - "step": 193160 - }, - { - "entropy": 1.8924265503883362, - "epoch": 0.5988100985849675, - "grad_norm": 8.127379417419434, - "learning_rate": 3.2692583807413296e-06, - "loss": 0.4587, - "mean_token_accuracy": 0.8496007680892944, - "num_tokens": 232388628.0, - "step": 193170 - }, - { - "entropy": 1.8490541279315948, - "epoch": 0.5988410977100171, - "grad_norm": 7.713603973388672, - "learning_rate": 3.269173762312554e-06, - "loss": 0.4225, - "mean_token_accuracy": 0.8569965809583664, - "num_tokens": 232401302.0, - "step": 193180 - }, - { - "entropy": 1.894689080119133, - "epoch": 0.5988720968350668, - "grad_norm": 6.935965538024902, - "learning_rate": 3.2690891504539906e-06, - "loss": 0.4643, - "mean_token_accuracy": 0.8526157677173615, - "num_tokens": 232413830.0, - "step": 193190 - }, - { - "entropy": 1.916493308544159, - "epoch": 0.5989030959601165, - "grad_norm": 9.567176818847656, - "learning_rate": 3.26900454516479e-06, - "loss": 0.4583, - "mean_token_accuracy": 0.8588901504874229, - "num_tokens": 232424966.0, - "step": 193200 - }, - { - "entropy": 1.8126317217946053, - "epoch": 0.5989340950851663, - "grad_norm": 3.7845699787139893, - "learning_rate": 3.2689199464441022e-06, - "loss": 0.4287, - "mean_token_accuracy": 0.8520424425601959, - "num_tokens": 232438169.0, - "step": 193210 - }, - { - "entropy": 1.8341682583093644, - "epoch": 0.5989650942102159, - "grad_norm": 8.897905349731445, - "learning_rate": 3.268835354291078e-06, - "loss": 0.3813, - "mean_token_accuracy": 0.8573813617229462, - "num_tokens": 232451220.0, - "step": 193220 - }, - { - "entropy": 1.936905488371849, - "epoch": 0.5989960933352656, - "grad_norm": 9.211556434631348, - "learning_rate": 3.2687507687048653e-06, - "loss": 0.4759, - "mean_token_accuracy": 0.8555907920002938, - "num_tokens": 232462258.0, - "step": 193230 - }, - { - "entropy": 1.9206456869840622, - "epoch": 0.5990270924603153, - "grad_norm": 8.207487106323242, - "learning_rate": 3.2686661896846167e-06, - "loss": 0.4774, - "mean_token_accuracy": 0.8534928426146507, - "num_tokens": 232473643.0, - "step": 193240 - }, - { - "entropy": 1.9361522287130355, - "epoch": 0.599058091585365, - "grad_norm": 7.724515438079834, - "learning_rate": 3.2685816172294825e-06, - "loss": 0.5038, - "mean_token_accuracy": 0.8487791284918785, - "num_tokens": 232484346.0, - "step": 193250 - }, - { - "entropy": 1.8511110663414, - "epoch": 0.5990890907104147, - "grad_norm": 2.388965606689453, - "learning_rate": 3.2684970513386126e-06, - "loss": 0.4309, - "mean_token_accuracy": 0.8582392647862435, - "num_tokens": 232496624.0, - "step": 193260 - }, - { - "entropy": 1.8527045264840125, - "epoch": 0.5991200898354644, - "grad_norm": 7.476731777191162, - "learning_rate": 3.2684124920111587e-06, - "loss": 0.4084, - "mean_token_accuracy": 0.8627937600016594, - "num_tokens": 232508749.0, - "step": 193270 - }, - { - "entropy": 1.7666074097156526, - "epoch": 0.5991510889605141, - "grad_norm": 4.153000354766846, - "learning_rate": 3.268327939246271e-06, - "loss": 0.3951, - "mean_token_accuracy": 0.8549092411994934, - "num_tokens": 232522585.0, - "step": 193280 - }, - { - "entropy": 1.8363301545381545, - "epoch": 0.5991820880855638, - "grad_norm": 10.776314735412598, - "learning_rate": 3.2682433930431013e-06, - "loss": 0.4202, - "mean_token_accuracy": 0.8624247089028358, - "num_tokens": 232535038.0, - "step": 193290 - }, - { - "entropy": 1.8398231253027917, - "epoch": 0.5992130872106135, - "grad_norm": 7.152562141418457, - "learning_rate": 3.2681588534008004e-06, - "loss": 0.4497, - "mean_token_accuracy": 0.8504539951682091, - "num_tokens": 232548491.0, - "step": 193300 - }, - { - "entropy": 1.8838460862636566, - "epoch": 0.5992440863356632, - "grad_norm": 10.93239974975586, - "learning_rate": 3.2680743203185205e-06, - "loss": 0.4315, - "mean_token_accuracy": 0.8576446041464806, - "num_tokens": 232559957.0, - "step": 193310 - }, - { - "entropy": 1.9226768806576728, - "epoch": 0.5992750854607128, - "grad_norm": 7.900559902191162, - "learning_rate": 3.267989793795413e-06, - "loss": 0.4959, - "mean_token_accuracy": 0.8387135237455368, - "num_tokens": 232570903.0, - "step": 193320 - }, - { - "entropy": 1.8913242816925049, - "epoch": 0.5993060845857626, - "grad_norm": 7.149113178253174, - "learning_rate": 3.2679052738306294e-06, - "loss": 0.4507, - "mean_token_accuracy": 0.8567343667149544, - "num_tokens": 232582496.0, - "step": 193330 - }, - { - "entropy": 1.8473687127232552, - "epoch": 0.5993370837108123, - "grad_norm": 8.293903350830078, - "learning_rate": 3.2678207604233227e-06, - "loss": 0.3866, - "mean_token_accuracy": 0.8724023550748825, - "num_tokens": 232594664.0, - "step": 193340 - }, - { - "entropy": 1.9113451212644577, - "epoch": 0.599368082835862, - "grad_norm": 7.854452610015869, - "learning_rate": 3.267736253572643e-06, - "loss": 0.4285, - "mean_token_accuracy": 0.8739783897995949, - "num_tokens": 232605471.0, - "step": 193350 - }, - { - "entropy": 1.9115384712815284, - "epoch": 0.5993990819609116, - "grad_norm": 6.98056697845459, - "learning_rate": 3.267651753277744e-06, - "loss": 0.4429, - "mean_token_accuracy": 0.8557354226708412, - "num_tokens": 232617080.0, - "step": 193360 - }, - { - "entropy": 1.9489524781703949, - "epoch": 0.5994300810859613, - "grad_norm": 7.385791301727295, - "learning_rate": 3.2675672595377785e-06, - "loss": 0.4952, - "mean_token_accuracy": 0.847708423435688, - "num_tokens": 232627867.0, - "step": 193370 - }, - { - "entropy": 1.8183632001280785, - "epoch": 0.5994610802110111, - "grad_norm": 4.137541770935059, - "learning_rate": 3.2674827723518975e-06, - "loss": 0.4927, - "mean_token_accuracy": 0.8466948196291924, - "num_tokens": 232640614.0, - "step": 193380 - }, - { - "entropy": 1.8530121505260468, - "epoch": 0.5994920793360607, - "grad_norm": 7.538222312927246, - "learning_rate": 3.2673982917192556e-06, - "loss": 0.4231, - "mean_token_accuracy": 0.861600874364376, - "num_tokens": 232652014.0, - "step": 193390 - }, - { - "entropy": 1.8623303756117822, - "epoch": 0.5995230784611104, - "grad_norm": 7.082211494445801, - "learning_rate": 3.267313817639004e-06, - "loss": 0.4051, - "mean_token_accuracy": 0.8642869338393211, - "num_tokens": 232664199.0, - "step": 193400 - }, - { - "entropy": 1.7694115832448005, - "epoch": 0.5995540775861601, - "grad_norm": 3.6996982097625732, - "learning_rate": 3.2672293501102966e-06, - "loss": 0.3785, - "mean_token_accuracy": 0.8665571510791779, - "num_tokens": 232677707.0, - "step": 193410 - }, - { - "entropy": 1.8033221870660783, - "epoch": 0.5995850767112099, - "grad_norm": 6.799063682556152, - "learning_rate": 3.267144889132287e-06, - "loss": 0.4092, - "mean_token_accuracy": 0.8535528868436814, - "num_tokens": 232690686.0, - "step": 193420 - }, - { - "entropy": 1.8295499309897423, - "epoch": 0.5996160758362595, - "grad_norm": 2.946974515914917, - "learning_rate": 3.2670604347041273e-06, - "loss": 0.4236, - "mean_token_accuracy": 0.8552382230758667, - "num_tokens": 232703540.0, - "step": 193430 - }, - { - "entropy": 1.937794703245163, - "epoch": 0.5996470749613092, - "grad_norm": 8.346149444580078, - "learning_rate": 3.2669759868249717e-06, - "loss": 0.4511, - "mean_token_accuracy": 0.8573932707309723, - "num_tokens": 232714394.0, - "step": 193440 - }, - { - "entropy": 1.872834388911724, - "epoch": 0.5996780740863589, - "grad_norm": 7.327252388000488, - "learning_rate": 3.2668915454939737e-06, - "loss": 0.4421, - "mean_token_accuracy": 0.8600111588835716, - "num_tokens": 232725531.0, - "step": 193450 - }, - { - "entropy": 1.9216588035225868, - "epoch": 0.5997090732114086, - "grad_norm": 5.6127400398254395, - "learning_rate": 3.266807110710288e-06, - "loss": 0.4927, - "mean_token_accuracy": 0.8471625164151192, - "num_tokens": 232736811.0, - "step": 193460 - }, - { - "entropy": 1.8705299958586692, - "epoch": 0.5997400723364583, - "grad_norm": 7.6800971031188965, - "learning_rate": 3.2667226824730674e-06, - "loss": 0.4132, - "mean_token_accuracy": 0.8520657777786255, - "num_tokens": 232748796.0, - "step": 193470 - }, - { - "entropy": 1.9085370868444442, - "epoch": 0.599771071461508, - "grad_norm": 8.688849449157715, - "learning_rate": 3.2666382607814663e-06, - "loss": 0.4506, - "mean_token_accuracy": 0.8572714865207672, - "num_tokens": 232760237.0, - "step": 193480 - }, - { - "entropy": 1.935324090719223, - "epoch": 0.5998020705865577, - "grad_norm": 5.7981791496276855, - "learning_rate": 3.2665538456346385e-06, - "loss": 0.4836, - "mean_token_accuracy": 0.8494788527488708, - "num_tokens": 232771006.0, - "step": 193490 - }, - { - "entropy": 1.8078247852623464, - "epoch": 0.5998330697116074, - "grad_norm": 3.719273090362549, - "learning_rate": 3.266469437031739e-06, - "loss": 0.3763, - "mean_token_accuracy": 0.8646056219935417, - "num_tokens": 232784079.0, - "step": 193500 - }, - { - "entropy": 1.802126082777977, - "epoch": 0.5998640688366571, - "grad_norm": 8.91614818572998, - "learning_rate": 3.2663850349719223e-06, - "loss": 0.3865, - "mean_token_accuracy": 0.8643121868371964, - "num_tokens": 232796465.0, - "step": 193510 - }, - { - "entropy": 1.9151442632079125, - "epoch": 0.5998950679617068, - "grad_norm": 4.269763946533203, - "learning_rate": 3.2663006394543432e-06, - "loss": 0.4553, - "mean_token_accuracy": 0.8613762855529785, - "num_tokens": 232808088.0, - "step": 193520 - }, - { - "entropy": 1.8307126134634018, - "epoch": 0.5999260670867564, - "grad_norm": 7.482618808746338, - "learning_rate": 3.266216250478157e-06, - "loss": 0.3853, - "mean_token_accuracy": 0.8658735737204551, - "num_tokens": 232820793.0, - "step": 193530 - }, - { - "entropy": 1.834550380706787, - "epoch": 0.5999570662118062, - "grad_norm": 7.851871013641357, - "learning_rate": 3.2661318680425176e-06, - "loss": 0.4695, - "mean_token_accuracy": 0.8613473907113075, - "num_tokens": 232833220.0, - "step": 193540 - }, - { - "entropy": 1.8679144203662872, - "epoch": 0.5999880653368559, - "grad_norm": 8.374329566955566, - "learning_rate": 3.26604749214658e-06, - "loss": 0.4531, - "mean_token_accuracy": 0.8531968653202057, - "num_tokens": 232845400.0, - "step": 193550 - }, - { - "entropy": 1.9155082762241364, - "epoch": 0.6000190644619056, - "grad_norm": 8.865435600280762, - "learning_rate": 3.2659631227895016e-06, - "loss": 0.4492, - "mean_token_accuracy": 0.8567243069410324, - "num_tokens": 232856561.0, - "step": 193560 - }, - { - "entropy": 1.7923461616039276, - "epoch": 0.6000500635869552, - "grad_norm": 3.426719903945923, - "learning_rate": 3.265878759970436e-06, - "loss": 0.3789, - "mean_token_accuracy": 0.8709788426756859, - "num_tokens": 232869397.0, - "step": 193570 - }, - { - "entropy": 1.8624560877680778, - "epoch": 0.600081062712005, - "grad_norm": 7.726009845733643, - "learning_rate": 3.2657944036885394e-06, - "loss": 0.4342, - "mean_token_accuracy": 0.8586367711424827, - "num_tokens": 232881337.0, - "step": 193580 - }, - { - "entropy": 1.873550534248352, - "epoch": 0.6001120618370547, - "grad_norm": 8.238359451293945, - "learning_rate": 3.265710053942967e-06, - "loss": 0.4344, - "mean_token_accuracy": 0.8538247391581535, - "num_tokens": 232893398.0, - "step": 193590 - }, - { - "entropy": 1.8931268915534019, - "epoch": 0.6001430609621043, - "grad_norm": 7.079104423522949, - "learning_rate": 3.2656257107328758e-06, - "loss": 0.4212, - "mean_token_accuracy": 0.8740960642695427, - "num_tokens": 232904771.0, - "step": 193600 - }, - { - "entropy": 1.8737851038575173, - "epoch": 0.600174060087154, - "grad_norm": 7.745109558105469, - "learning_rate": 3.265541374057421e-06, - "loss": 0.4418, - "mean_token_accuracy": 0.850721350312233, - "num_tokens": 232916506.0, - "step": 193610 - }, - { - "entropy": 1.8829128816723824, - "epoch": 0.6002050592122037, - "grad_norm": 8.40649127960205, - "learning_rate": 3.2654570439157595e-06, - "loss": 0.4804, - "mean_token_accuracy": 0.8492335632443428, - "num_tokens": 232928689.0, - "step": 193620 - }, - { - "entropy": 1.8552207678556443, - "epoch": 0.6002360583372535, - "grad_norm": 7.093789577484131, - "learning_rate": 3.265372720307048e-06, - "loss": 0.415, - "mean_token_accuracy": 0.8560128346085548, - "num_tokens": 232941993.0, - "step": 193630 - }, - { - "entropy": 1.8739019855856895, - "epoch": 0.6002670574623031, - "grad_norm": 7.4782490730285645, - "learning_rate": 3.2652884032304424e-06, - "loss": 0.4381, - "mean_token_accuracy": 0.8601368397474289, - "num_tokens": 232953269.0, - "step": 193640 - }, - { - "entropy": 1.8960627764463425, - "epoch": 0.6002980565873528, - "grad_norm": 6.894164085388184, - "learning_rate": 3.265204092685098e-06, - "loss": 0.4838, - "mean_token_accuracy": 0.8524527877569199, - "num_tokens": 232964859.0, - "step": 193650 - }, - { - "entropy": 1.8895342394709587, - "epoch": 0.6003290557124025, - "grad_norm": 7.591185092926025, - "learning_rate": 3.2651197886701742e-06, - "loss": 0.4484, - "mean_token_accuracy": 0.8550887897610664, - "num_tokens": 232976895.0, - "step": 193660 - }, - { - "entropy": 1.8984116226434709, - "epoch": 0.6003600548374523, - "grad_norm": 7.582576274871826, - "learning_rate": 3.2650354911848266e-06, - "loss": 0.4924, - "mean_token_accuracy": 0.8485193282365799, - "num_tokens": 232989028.0, - "step": 193670 - }, - { - "entropy": 1.8505906209349632, - "epoch": 0.6003910539625019, - "grad_norm": 3.8775970935821533, - "learning_rate": 3.2649512002282124e-06, - "loss": 0.4203, - "mean_token_accuracy": 0.8600415006279946, - "num_tokens": 233001436.0, - "step": 193680 - }, - { - "entropy": 1.8285743162035941, - "epoch": 0.6004220530875516, - "grad_norm": 9.559125900268555, - "learning_rate": 3.2648669157994896e-06, - "loss": 0.4169, - "mean_token_accuracy": 0.8612663343548774, - "num_tokens": 233014192.0, - "step": 193690 - }, - { - "entropy": 1.9036530345678329, - "epoch": 0.6004530522126013, - "grad_norm": 7.049911975860596, - "learning_rate": 3.2647826378978148e-06, - "loss": 0.5172, - "mean_token_accuracy": 0.8517943635582924, - "num_tokens": 233026158.0, - "step": 193700 - }, - { - "entropy": 1.889918527007103, - "epoch": 0.600484051337651, - "grad_norm": 3.9879229068756104, - "learning_rate": 3.2646983665223462e-06, - "loss": 0.4538, - "mean_token_accuracy": 0.8560131743550301, - "num_tokens": 233037390.0, - "step": 193710 - }, - { - "entropy": 1.8171194300055504, - "epoch": 0.6005150504627007, - "grad_norm": 4.150033473968506, - "learning_rate": 3.264614101672241e-06, - "loss": 0.4417, - "mean_token_accuracy": 0.8582745015621185, - "num_tokens": 233050147.0, - "step": 193720 - }, - { - "entropy": 1.9315490901470185, - "epoch": 0.6005460495877504, - "grad_norm": 9.391478538513184, - "learning_rate": 3.264529843346658e-06, - "loss": 0.5165, - "mean_token_accuracy": 0.838477349281311, - "num_tokens": 233061368.0, - "step": 193730 - }, - { - "entropy": 1.8814868345856666, - "epoch": 0.6005770487128, - "grad_norm": 7.952883243560791, - "learning_rate": 3.2644455915447548e-06, - "loss": 0.4463, - "mean_token_accuracy": 0.8509133666753769, - "num_tokens": 233073284.0, - "step": 193740 - }, - { - "entropy": 1.850820629298687, - "epoch": 0.6006080478378498, - "grad_norm": 8.097213745117188, - "learning_rate": 3.264361346265689e-06, - "loss": 0.4185, - "mean_token_accuracy": 0.8518119260668755, - "num_tokens": 233085186.0, - "step": 193750 - }, - { - "entropy": 1.845947080850601, - "epoch": 0.6006390469628995, - "grad_norm": 4.145138263702393, - "learning_rate": 3.2642771075086203e-06, - "loss": 0.4359, - "mean_token_accuracy": 0.8572747737169266, - "num_tokens": 233097277.0, - "step": 193760 - }, - { - "entropy": 1.9382601469755172, - "epoch": 0.6006700460879492, - "grad_norm": 7.650824069976807, - "learning_rate": 3.2641928752727066e-06, - "loss": 0.4886, - "mean_token_accuracy": 0.8393212363123894, - "num_tokens": 233108532.0, - "step": 193770 - }, - { - "entropy": 1.9437370300292969, - "epoch": 0.6007010452129988, - "grad_norm": 9.946969032287598, - "learning_rate": 3.2641086495571056e-06, - "loss": 0.4972, - "mean_token_accuracy": 0.8489497140049934, - "num_tokens": 233119148.0, - "step": 193780 - }, - { - "entropy": 1.8955764845013618, - "epoch": 0.6007320443380486, - "grad_norm": 9.539673805236816, - "learning_rate": 3.2640244303609774e-06, - "loss": 0.4232, - "mean_token_accuracy": 0.8624850213527679, - "num_tokens": 233130697.0, - "step": 193790 - }, - { - "entropy": 1.868457528948784, - "epoch": 0.6007630434630983, - "grad_norm": 8.330771446228027, - "learning_rate": 3.2639402176834805e-06, - "loss": 0.4313, - "mean_token_accuracy": 0.8593797326087952, - "num_tokens": 233142920.0, - "step": 193800 - }, - { - "entropy": 1.834128810465336, - "epoch": 0.600794042588148, - "grad_norm": 8.034552574157715, - "learning_rate": 3.263856011523774e-06, - "loss": 0.4024, - "mean_token_accuracy": 0.8655689373612404, - "num_tokens": 233155201.0, - "step": 193810 - }, - { - "entropy": 1.8737445190548896, - "epoch": 0.6008250417131976, - "grad_norm": 8.31792163848877, - "learning_rate": 3.2637718118810175e-06, - "loss": 0.4203, - "mean_token_accuracy": 0.8651196718215942, - "num_tokens": 233166612.0, - "step": 193820 - }, - { - "entropy": 1.9548739314079284, - "epoch": 0.6008560408382473, - "grad_norm": 8.683969497680664, - "learning_rate": 3.2636876187543705e-06, - "loss": 0.5071, - "mean_token_accuracy": 0.8405161753296853, - "num_tokens": 233177679.0, - "step": 193830 - }, - { - "entropy": 1.816799834370613, - "epoch": 0.6008870399632971, - "grad_norm": 8.057676315307617, - "learning_rate": 3.2636034321429916e-06, - "loss": 0.4629, - "mean_token_accuracy": 0.8585867390036583, - "num_tokens": 233191105.0, - "step": 193840 - }, - { - "entropy": 1.7605865061283112, - "epoch": 0.6009180390883467, - "grad_norm": 10.099117279052734, - "learning_rate": 3.263519252046042e-06, - "loss": 0.3582, - "mean_token_accuracy": 0.8697127535939216, - "num_tokens": 233205120.0, - "step": 193850 - }, - { - "entropy": 1.8630171611905098, - "epoch": 0.6009490382133964, - "grad_norm": 11.458540916442871, - "learning_rate": 3.2634350784626803e-06, - "loss": 0.4421, - "mean_token_accuracy": 0.854158864915371, - "num_tokens": 233217172.0, - "step": 193860 - }, - { - "entropy": 1.7534714862704277, - "epoch": 0.6009800373384461, - "grad_norm": 8.782004356384277, - "learning_rate": 3.263350911392067e-06, - "loss": 0.3824, - "mean_token_accuracy": 0.8709551870822907, - "num_tokens": 233230844.0, - "step": 193870 - }, - { - "entropy": 1.757567247748375, - "epoch": 0.6010110364634959, - "grad_norm": 3.572934865951538, - "learning_rate": 3.2632667508333627e-06, - "loss": 0.3279, - "mean_token_accuracy": 0.8731027454137802, - "num_tokens": 233243904.0, - "step": 193880 - }, - { - "entropy": 1.7916569873690604, - "epoch": 0.6010420355885455, - "grad_norm": 8.961586952209473, - "learning_rate": 3.2631825967857267e-06, - "loss": 0.3954, - "mean_token_accuracy": 0.8586719200015068, - "num_tokens": 233256370.0, - "step": 193890 - }, - { - "entropy": 1.8887753248214723, - "epoch": 0.6010730347135952, - "grad_norm": 8.99552059173584, - "learning_rate": 3.2630984492483208e-06, - "loss": 0.4665, - "mean_token_accuracy": 0.8502195388078689, - "num_tokens": 233268291.0, - "step": 193900 - }, - { - "entropy": 1.905160166323185, - "epoch": 0.6011040338386449, - "grad_norm": 8.513335227966309, - "learning_rate": 3.2630143082203054e-06, - "loss": 0.4491, - "mean_token_accuracy": 0.8540363058447837, - "num_tokens": 233279493.0, - "step": 193910 - }, - { - "entropy": 1.8477469071745873, - "epoch": 0.6011350329636946, - "grad_norm": 9.18237018585205, - "learning_rate": 3.26293017370084e-06, - "loss": 0.4271, - "mean_token_accuracy": 0.8550464868545532, - "num_tokens": 233291971.0, - "step": 193920 - }, - { - "entropy": 1.932934196293354, - "epoch": 0.6011660320887443, - "grad_norm": 8.203780174255371, - "learning_rate": 3.262846045689087e-06, - "loss": 0.4474, - "mean_token_accuracy": 0.856448483467102, - "num_tokens": 233303256.0, - "step": 193930 - }, - { - "entropy": 1.8909889429807663, - "epoch": 0.601197031213794, - "grad_norm": 8.730911254882812, - "learning_rate": 3.2627619241842075e-06, - "loss": 0.4558, - "mean_token_accuracy": 0.8476616650819778, - "num_tokens": 233315468.0, - "step": 193940 - }, - { - "entropy": 1.9053794205188752, - "epoch": 0.6012280303388436, - "grad_norm": 8.247823715209961, - "learning_rate": 3.2626778091853617e-06, - "loss": 0.4369, - "mean_token_accuracy": 0.8605243265628815, - "num_tokens": 233327021.0, - "step": 193950 - }, - { - "entropy": 1.873201458156109, - "epoch": 0.6012590294638934, - "grad_norm": 4.534573078155518, - "learning_rate": 3.262593700691711e-06, - "loss": 0.4639, - "mean_token_accuracy": 0.8496430322527886, - "num_tokens": 233339811.0, - "step": 193960 - }, - { - "entropy": 1.8513876274228096, - "epoch": 0.6012900285889431, - "grad_norm": 3.71006178855896, - "learning_rate": 3.2625095987024186e-06, - "loss": 0.4257, - "mean_token_accuracy": 0.8635744020342827, - "num_tokens": 233351662.0, - "step": 193970 - }, - { - "entropy": 1.9053289726376534, - "epoch": 0.6013210277139928, - "grad_norm": 8.673333168029785, - "learning_rate": 3.2624255032166445e-06, - "loss": 0.4696, - "mean_token_accuracy": 0.846896342933178, - "num_tokens": 233363924.0, - "step": 193980 - }, - { - "entropy": 1.875174443423748, - "epoch": 0.6013520268390424, - "grad_norm": 7.574441909790039, - "learning_rate": 3.2623414142335513e-06, - "loss": 0.4757, - "mean_token_accuracy": 0.8469843864440918, - "num_tokens": 233376185.0, - "step": 193990 - }, - { - "entropy": 1.8693370588123799, - "epoch": 0.6013830259640922, - "grad_norm": 7.561769962310791, - "learning_rate": 3.2622573317523008e-06, - "loss": 0.427, - "mean_token_accuracy": 0.8566849261522294, - "num_tokens": 233388303.0, - "step": 194000 - }, - { - "entropy": 1.9676036804914474, - "epoch": 0.6014140250891419, - "grad_norm": 9.288419723510742, - "learning_rate": 3.262173255772056e-06, - "loss": 0.4925, - "mean_token_accuracy": 0.8490712627768516, - "num_tokens": 233399248.0, - "step": 194010 - }, - { - "entropy": 1.8525095939636231, - "epoch": 0.6014450242141915, - "grad_norm": 9.906126022338867, - "learning_rate": 3.2620891862919774e-06, - "loss": 0.452, - "mean_token_accuracy": 0.8561065092682838, - "num_tokens": 233411943.0, - "step": 194020 - }, - { - "entropy": 1.8668144181370736, - "epoch": 0.6014760233392412, - "grad_norm": 7.300792694091797, - "learning_rate": 3.2620051233112295e-06, - "loss": 0.4151, - "mean_token_accuracy": 0.8610492289066315, - "num_tokens": 233424386.0, - "step": 194030 - }, - { - "entropy": 1.8017012298107147, - "epoch": 0.601507022464291, - "grad_norm": 8.391036033630371, - "learning_rate": 3.2619210668289734e-06, - "loss": 0.4048, - "mean_token_accuracy": 0.8658035784959793, - "num_tokens": 233437193.0, - "step": 194040 - }, - { - "entropy": 1.857760213315487, - "epoch": 0.6015380215893407, - "grad_norm": 7.2276530265808105, - "learning_rate": 3.2618370168443723e-06, - "loss": 0.4269, - "mean_token_accuracy": 0.8575306445360183, - "num_tokens": 233449580.0, - "step": 194050 - }, - { - "entropy": 1.8119343966245651, - "epoch": 0.6015690207143903, - "grad_norm": 8.690643310546875, - "learning_rate": 3.2617529733565897e-06, - "loss": 0.3645, - "mean_token_accuracy": 0.8623786956071854, - "num_tokens": 233462399.0, - "step": 194060 - }, - { - "entropy": 1.7857703417539597, - "epoch": 0.60160001983944, - "grad_norm": 7.535576343536377, - "learning_rate": 3.261668936364788e-06, - "loss": 0.4343, - "mean_token_accuracy": 0.850919260084629, - "num_tokens": 233476237.0, - "step": 194070 - }, - { - "entropy": 1.767759621143341, - "epoch": 0.6016310189644897, - "grad_norm": 8.007060050964355, - "learning_rate": 3.2615849058681304e-06, - "loss": 0.3887, - "mean_token_accuracy": 0.8724671140313148, - "num_tokens": 233489544.0, - "step": 194080 - }, - { - "entropy": 1.929178735613823, - "epoch": 0.6016620180895395, - "grad_norm": 7.4237775802612305, - "learning_rate": 3.2615008818657813e-06, - "loss": 0.4403, - "mean_token_accuracy": 0.8556190207600594, - "num_tokens": 233500500.0, - "step": 194090 - }, - { - "entropy": 1.8788953140377997, - "epoch": 0.6016930172145891, - "grad_norm": 9.347229957580566, - "learning_rate": 3.261416864356902e-06, - "loss": 0.4587, - "mean_token_accuracy": 0.8497606843709946, - "num_tokens": 233512190.0, - "step": 194100 - }, - { - "entropy": 1.7931357741355896, - "epoch": 0.6017240163396388, - "grad_norm": 6.776369571685791, - "learning_rate": 3.2613328533406585e-06, - "loss": 0.3821, - "mean_token_accuracy": 0.8622545897960663, - "num_tokens": 233525905.0, - "step": 194110 - }, - { - "entropy": 1.8501682430505753, - "epoch": 0.6017550154646885, - "grad_norm": 7.778374195098877, - "learning_rate": 3.261248848816214e-06, - "loss": 0.4055, - "mean_token_accuracy": 0.8576924055814743, - "num_tokens": 233538057.0, - "step": 194120 - }, - { - "entropy": 1.803722159564495, - "epoch": 0.6017860145897382, - "grad_norm": 7.2863054275512695, - "learning_rate": 3.2611648507827317e-06, - "loss": 0.4083, - "mean_token_accuracy": 0.8544072508811951, - "num_tokens": 233550730.0, - "step": 194130 - }, - { - "entropy": 1.766814012825489, - "epoch": 0.6018170137147879, - "grad_norm": 8.607423782348633, - "learning_rate": 3.2610808592393763e-06, - "loss": 0.3566, - "mean_token_accuracy": 0.8650653749704361, - "num_tokens": 233564293.0, - "step": 194140 - }, - { - "entropy": 1.9077049940824509, - "epoch": 0.6018480128398376, - "grad_norm": 8.215561866760254, - "learning_rate": 3.2609968741853123e-06, - "loss": 0.4829, - "mean_token_accuracy": 0.8469783037900924, - "num_tokens": 233576154.0, - "step": 194150 - }, - { - "entropy": 1.8505355820059777, - "epoch": 0.6018790119648872, - "grad_norm": 3.6293749809265137, - "learning_rate": 3.2609128956197027e-06, - "loss": 0.428, - "mean_token_accuracy": 0.8624635130167008, - "num_tokens": 233588765.0, - "step": 194160 - }, - { - "entropy": 1.9129736453294754, - "epoch": 0.601910011089937, - "grad_norm": 3.988649606704712, - "learning_rate": 3.260828923541714e-06, - "loss": 0.4424, - "mean_token_accuracy": 0.8539815843105316, - "num_tokens": 233600721.0, - "step": 194170 - }, - { - "entropy": 1.751829355955124, - "epoch": 0.6019410102149867, - "grad_norm": 7.624534606933594, - "learning_rate": 3.26074495795051e-06, - "loss": 0.3417, - "mean_token_accuracy": 0.8725816965103149, - "num_tokens": 233614563.0, - "step": 194180 - }, - { - "entropy": 1.8846061840653419, - "epoch": 0.6019720093400364, - "grad_norm": 9.135648727416992, - "learning_rate": 3.2606609988452565e-06, - "loss": 0.467, - "mean_token_accuracy": 0.8573798984289169, - "num_tokens": 233626732.0, - "step": 194190 - }, - { - "entropy": 1.9097521618008613, - "epoch": 0.602003008465086, - "grad_norm": 9.23742389678955, - "learning_rate": 3.260577046225117e-06, - "loss": 0.4443, - "mean_token_accuracy": 0.8530338272452355, - "num_tokens": 233638787.0, - "step": 194200 - }, - { - "entropy": 1.900249882042408, - "epoch": 0.6020340075901358, - "grad_norm": 8.547508239746094, - "learning_rate": 3.260493100089257e-06, - "loss": 0.4815, - "mean_token_accuracy": 0.8495122745633126, - "num_tokens": 233650856.0, - "step": 194210 - }, - { - "entropy": 1.8656728267669678, - "epoch": 0.6020650067151855, - "grad_norm": 9.300488471984863, - "learning_rate": 3.2604091604368428e-06, - "loss": 0.4515, - "mean_token_accuracy": 0.8569801360368728, - "num_tokens": 233663014.0, - "step": 194220 - }, - { - "entropy": 1.9170267432928085, - "epoch": 0.6020960058402351, - "grad_norm": 8.057255744934082, - "learning_rate": 3.2603252272670386e-06, - "loss": 0.4536, - "mean_token_accuracy": 0.851267957687378, - "num_tokens": 233674576.0, - "step": 194230 - }, - { - "entropy": 1.8948960661888123, - "epoch": 0.6021270049652848, - "grad_norm": 8.496610641479492, - "learning_rate": 3.2602413005790114e-06, - "loss": 0.4068, - "mean_token_accuracy": 0.8641295149922371, - "num_tokens": 233686986.0, - "step": 194240 - }, - { - "entropy": 1.8183800637722016, - "epoch": 0.6021580040903346, - "grad_norm": 8.398859024047852, - "learning_rate": 3.2601573803719268e-06, - "loss": 0.3852, - "mean_token_accuracy": 0.8668906956911087, - "num_tokens": 233699610.0, - "step": 194250 - }, - { - "entropy": 1.830089993774891, - "epoch": 0.6021890032153843, - "grad_norm": 4.63645601272583, - "learning_rate": 3.260073466644949e-06, - "loss": 0.4444, - "mean_token_accuracy": 0.855503711104393, - "num_tokens": 233713020.0, - "step": 194260 - }, - { - "entropy": 1.8949781507253647, - "epoch": 0.6022200023404339, - "grad_norm": 9.201313018798828, - "learning_rate": 3.2599895593972454e-06, - "loss": 0.4639, - "mean_token_accuracy": 0.8466179206967354, - "num_tokens": 233724971.0, - "step": 194270 - }, - { - "entropy": 1.857119870185852, - "epoch": 0.6022510014654836, - "grad_norm": 7.717209815979004, - "learning_rate": 3.2599056586279833e-06, - "loss": 0.4003, - "mean_token_accuracy": 0.8609469637274743, - "num_tokens": 233737558.0, - "step": 194280 - }, - { - "entropy": 1.8890058800578118, - "epoch": 0.6022820005905334, - "grad_norm": 2.933504104614258, - "learning_rate": 3.2598217643363266e-06, - "loss": 0.4414, - "mean_token_accuracy": 0.8636812314391136, - "num_tokens": 233749352.0, - "step": 194290 - }, - { - "entropy": 1.8319082364439965, - "epoch": 0.602312999715583, - "grad_norm": 3.977686643600464, - "learning_rate": 3.2597378765214437e-06, - "loss": 0.4071, - "mean_token_accuracy": 0.8596528321504593, - "num_tokens": 233761268.0, - "step": 194300 - }, - { - "entropy": 1.9153360083699227, - "epoch": 0.6023439988406327, - "grad_norm": 8.767776489257812, - "learning_rate": 3.2596539951825013e-06, - "loss": 0.4758, - "mean_token_accuracy": 0.8494154572486877, - "num_tokens": 233772804.0, - "step": 194310 - }, - { - "entropy": 1.8809309303760529, - "epoch": 0.6023749979656824, - "grad_norm": 8.12421989440918, - "learning_rate": 3.2595701203186654e-06, - "loss": 0.4498, - "mean_token_accuracy": 0.8539055183529853, - "num_tokens": 233785351.0, - "step": 194320 - }, - { - "entropy": 1.8863722085952759, - "epoch": 0.6024059970907321, - "grad_norm": 3.445355176925659, - "learning_rate": 3.2594862519291025e-06, - "loss": 0.4426, - "mean_token_accuracy": 0.8564076155424118, - "num_tokens": 233796945.0, - "step": 194330 - }, - { - "entropy": 1.8581660106778144, - "epoch": 0.6024369962157818, - "grad_norm": 8.705437660217285, - "learning_rate": 3.2594023900129813e-06, - "loss": 0.4451, - "mean_token_accuracy": 0.856708000600338, - "num_tokens": 233808677.0, - "step": 194340 - }, - { - "entropy": 1.8240687146782875, - "epoch": 0.6024679953408315, - "grad_norm": 8.276046752929688, - "learning_rate": 3.259318534569468e-06, - "loss": 0.4727, - "mean_token_accuracy": 0.8530595406889916, - "num_tokens": 233821656.0, - "step": 194350 - }, - { - "entropy": 1.8797230839729309, - "epoch": 0.6024989944658812, - "grad_norm": 7.852595329284668, - "learning_rate": 3.2592346855977304e-06, - "loss": 0.4417, - "mean_token_accuracy": 0.8535345628857612, - "num_tokens": 233833787.0, - "step": 194360 - }, - { - "entropy": 1.8471519738435744, - "epoch": 0.6025299935909308, - "grad_norm": 8.075080871582031, - "learning_rate": 3.259150843096936e-06, - "loss": 0.3899, - "mean_token_accuracy": 0.8740817040205002, - "num_tokens": 233845912.0, - "step": 194370 - }, - { - "entropy": 1.8665089890360833, - "epoch": 0.6025609927159806, - "grad_norm": 4.186502456665039, - "learning_rate": 3.2590670070662517e-06, - "loss": 0.4278, - "mean_token_accuracy": 0.8563824415206909, - "num_tokens": 233858116.0, - "step": 194380 - }, - { - "entropy": 1.867053084075451, - "epoch": 0.6025919918410303, - "grad_norm": 7.1243133544921875, - "learning_rate": 3.258983177504847e-06, - "loss": 0.4172, - "mean_token_accuracy": 0.8526987448334694, - "num_tokens": 233870411.0, - "step": 194390 - }, - { - "entropy": 1.8570081070065498, - "epoch": 0.60262299096608, - "grad_norm": 5.597967624664307, - "learning_rate": 3.258899354411889e-06, - "loss": 0.4158, - "mean_token_accuracy": 0.8594145223498344, - "num_tokens": 233883552.0, - "step": 194400 - }, - { - "entropy": 1.8842222318053246, - "epoch": 0.6026539900911296, - "grad_norm": 9.581622123718262, - "learning_rate": 3.258815537786546e-06, - "loss": 0.4305, - "mean_token_accuracy": 0.8546239584684372, - "num_tokens": 233895529.0, - "step": 194410 - }, - { - "entropy": 1.8748262420296669, - "epoch": 0.6026849892161794, - "grad_norm": 4.644106864929199, - "learning_rate": 3.2587317276279862e-06, - "loss": 0.4534, - "mean_token_accuracy": 0.8466273933649063, - "num_tokens": 233907783.0, - "step": 194420 - }, - { - "entropy": 1.8434445157647132, - "epoch": 0.6027159883412291, - "grad_norm": 9.612750053405762, - "learning_rate": 3.258647923935378e-06, - "loss": 0.4407, - "mean_token_accuracy": 0.8550927296280861, - "num_tokens": 233918985.0, - "step": 194430 - }, - { - "entropy": 1.8526080772280693, - "epoch": 0.6027469874662787, - "grad_norm": 7.647796154022217, - "learning_rate": 3.258564126707891e-06, - "loss": 0.3581, - "mean_token_accuracy": 0.8636227637529373, - "num_tokens": 233931333.0, - "step": 194440 - }, - { - "entropy": 1.8961267843842506, - "epoch": 0.6027779865913284, - "grad_norm": 7.142885684967041, - "learning_rate": 3.2584803359446926e-06, - "loss": 0.4422, - "mean_token_accuracy": 0.8610294297337532, - "num_tokens": 233942827.0, - "step": 194450 - }, - { - "entropy": 1.8752504363656044, - "epoch": 0.6028089857163782, - "grad_norm": 8.758703231811523, - "learning_rate": 3.258396551644953e-06, - "loss": 0.4262, - "mean_token_accuracy": 0.8480131477117538, - "num_tokens": 233955077.0, - "step": 194460 - }, - { - "entropy": 1.9181806400418282, - "epoch": 0.6028399848414279, - "grad_norm": 8.899065017700195, - "learning_rate": 3.2583127738078397e-06, - "loss": 0.4755, - "mean_token_accuracy": 0.8472186312079429, - "num_tokens": 233966300.0, - "step": 194470 - }, - { - "entropy": 1.9128697007894515, - "epoch": 0.6028709839664775, - "grad_norm": 8.163961410522461, - "learning_rate": 3.258229002432523e-06, - "loss": 0.4591, - "mean_token_accuracy": 0.8505765497684479, - "num_tokens": 233977795.0, - "step": 194480 - }, - { - "entropy": 1.8266849026083947, - "epoch": 0.6029019830915272, - "grad_norm": 3.2791669368743896, - "learning_rate": 3.2581452375181723e-06, - "loss": 0.3898, - "mean_token_accuracy": 0.8646172285079956, - "num_tokens": 233989904.0, - "step": 194490 - }, - { - "entropy": 1.8116347730159759, - "epoch": 0.602932982216577, - "grad_norm": 7.465194225311279, - "learning_rate": 3.2580614790639577e-06, - "loss": 0.3866, - "mean_token_accuracy": 0.8606933891773224, - "num_tokens": 234002395.0, - "step": 194500 - }, - { - "entropy": 1.7665009267628193, - "epoch": 0.6029639813416267, - "grad_norm": 9.060105323791504, - "learning_rate": 3.2579777270690473e-06, - "loss": 0.3952, - "mean_token_accuracy": 0.8632282540202141, - "num_tokens": 234016437.0, - "step": 194510 - }, - { - "entropy": 1.8606160417199136, - "epoch": 0.6029949804666763, - "grad_norm": 6.732377052307129, - "learning_rate": 3.2578939815326116e-06, - "loss": 0.4017, - "mean_token_accuracy": 0.8659750744700432, - "num_tokens": 234028357.0, - "step": 194520 - }, - { - "entropy": 1.8387962013483048, - "epoch": 0.603025979591726, - "grad_norm": 7.758693695068359, - "learning_rate": 3.2578102424538213e-06, - "loss": 0.4185, - "mean_token_accuracy": 0.8687957838177681, - "num_tokens": 234040845.0, - "step": 194530 - }, - { - "entropy": 1.8796296492218971, - "epoch": 0.6030569787167758, - "grad_norm": 8.825248718261719, - "learning_rate": 3.257726509831845e-06, - "loss": 0.4148, - "mean_token_accuracy": 0.8598513454198837, - "num_tokens": 234051837.0, - "step": 194540 - }, - { - "entropy": 1.9274076849222184, - "epoch": 0.6030879778418254, - "grad_norm": 9.172769546508789, - "learning_rate": 3.2576427836658545e-06, - "loss": 0.4743, - "mean_token_accuracy": 0.8573598250746727, - "num_tokens": 234062981.0, - "step": 194550 - }, - { - "entropy": 1.8512599140405654, - "epoch": 0.6031189769668751, - "grad_norm": 7.954952716827393, - "learning_rate": 3.25755906395502e-06, - "loss": 0.4305, - "mean_token_accuracy": 0.8574766382575035, - "num_tokens": 234075485.0, - "step": 194560 - }, - { - "entropy": 1.82259241938591, - "epoch": 0.6031499760919248, - "grad_norm": 8.998270034790039, - "learning_rate": 3.257475350698512e-06, - "loss": 0.3997, - "mean_token_accuracy": 0.8661994159221649, - "num_tokens": 234088067.0, - "step": 194570 - }, - { - "entropy": 1.79864399433136, - "epoch": 0.6031809752169744, - "grad_norm": 6.640564918518066, - "learning_rate": 3.2573916438955004e-06, - "loss": 0.4454, - "mean_token_accuracy": 0.8569759652018547, - "num_tokens": 234101864.0, - "step": 194580 - }, - { - "entropy": 1.9484706163406371, - "epoch": 0.6032119743420242, - "grad_norm": 5.720554828643799, - "learning_rate": 3.257307943545156e-06, - "loss": 0.4506, - "mean_token_accuracy": 0.8616125836968422, - "num_tokens": 234112736.0, - "step": 194590 - }, - { - "entropy": 1.9250062137842179, - "epoch": 0.6032429734670739, - "grad_norm": 7.0284318923950195, - "learning_rate": 3.2572242496466517e-06, - "loss": 0.4421, - "mean_token_accuracy": 0.8584776639938354, - "num_tokens": 234123876.0, - "step": 194600 - }, - { - "entropy": 1.9174670904874802, - "epoch": 0.6032739725921236, - "grad_norm": 8.020894050598145, - "learning_rate": 3.257140562199157e-06, - "loss": 0.4624, - "mean_token_accuracy": 0.8567168533802032, - "num_tokens": 234134673.0, - "step": 194610 - }, - { - "entropy": 1.888125415146351, - "epoch": 0.6033049717171732, - "grad_norm": 8.607301712036133, - "learning_rate": 3.257056881201843e-06, - "loss": 0.4332, - "mean_token_accuracy": 0.8639842256903648, - "num_tokens": 234146458.0, - "step": 194620 - }, - { - "entropy": 1.867220088839531, - "epoch": 0.603335970842223, - "grad_norm": 9.075867652893066, - "learning_rate": 3.256973206653882e-06, - "loss": 0.4141, - "mean_token_accuracy": 0.861547501385212, - "num_tokens": 234158546.0, - "step": 194630 - }, - { - "entropy": 1.860349379479885, - "epoch": 0.6033669699672727, - "grad_norm": 4.725429058074951, - "learning_rate": 3.2568895385544454e-06, - "loss": 0.4405, - "mean_token_accuracy": 0.8553501516580582, - "num_tokens": 234171213.0, - "step": 194640 - }, - { - "entropy": 1.8892497256398202, - "epoch": 0.6033979690923224, - "grad_norm": 6.847842216491699, - "learning_rate": 3.256805876902705e-06, - "loss": 0.4598, - "mean_token_accuracy": 0.8541723743081093, - "num_tokens": 234183250.0, - "step": 194650 - }, - { - "entropy": 1.9206036776304245, - "epoch": 0.603428968217372, - "grad_norm": 9.059221267700195, - "learning_rate": 3.2567222216978333e-06, - "loss": 0.475, - "mean_token_accuracy": 0.8496464163064956, - "num_tokens": 234193842.0, - "step": 194660 - }, - { - "entropy": 1.845395915210247, - "epoch": 0.6034599673424218, - "grad_norm": 8.204310417175293, - "learning_rate": 3.2566385729390017e-06, - "loss": 0.416, - "mean_token_accuracy": 0.8577623799443245, - "num_tokens": 234206952.0, - "step": 194670 - }, - { - "entropy": 1.8823886767029763, - "epoch": 0.6034909664674715, - "grad_norm": 5.860503673553467, - "learning_rate": 3.256554930625382e-06, - "loss": 0.499, - "mean_token_accuracy": 0.8504135593771934, - "num_tokens": 234218721.0, - "step": 194680 - }, - { - "entropy": 1.8299574121832847, - "epoch": 0.6035219655925211, - "grad_norm": 5.5976057052612305, - "learning_rate": 3.2564712947561462e-06, - "loss": 0.3959, - "mean_token_accuracy": 0.8664263695478439, - "num_tokens": 234231910.0, - "step": 194690 - }, - { - "entropy": 1.825202153623104, - "epoch": 0.6035529647175708, - "grad_norm": 3.987738609313965, - "learning_rate": 3.256387665330469e-06, - "loss": 0.4114, - "mean_token_accuracy": 0.8607425019145012, - "num_tokens": 234244624.0, - "step": 194700 - }, - { - "entropy": 1.8675637185573577, - "epoch": 0.6035839638426206, - "grad_norm": 7.845833778381348, - "learning_rate": 3.256304042347521e-06, - "loss": 0.4261, - "mean_token_accuracy": 0.8430291518568993, - "num_tokens": 234257973.0, - "step": 194710 - }, - { - "entropy": 1.906636281311512, - "epoch": 0.6036149629676703, - "grad_norm": 3.074232578277588, - "learning_rate": 3.2562204258064755e-06, - "loss": 0.4918, - "mean_token_accuracy": 0.8502906784415245, - "num_tokens": 234269531.0, - "step": 194720 - }, - { - "entropy": 1.853766144812107, - "epoch": 0.6036459620927199, - "grad_norm": 7.9279093742370605, - "learning_rate": 3.256136815706506e-06, - "loss": 0.387, - "mean_token_accuracy": 0.8601211562752724, - "num_tokens": 234282642.0, - "step": 194730 - }, - { - "entropy": 1.880632211267948, - "epoch": 0.6036769612177696, - "grad_norm": 8.704763412475586, - "learning_rate": 3.256053212046785e-06, - "loss": 0.4341, - "mean_token_accuracy": 0.8636703386902809, - "num_tokens": 234293805.0, - "step": 194740 - }, - { - "entropy": 1.9346161454916, - "epoch": 0.6037079603428194, - "grad_norm": 8.34154224395752, - "learning_rate": 3.2559696148264864e-06, - "loss": 0.4963, - "mean_token_accuracy": 0.848820036649704, - "num_tokens": 234305089.0, - "step": 194750 - }, - { - "entropy": 1.8757025212049485, - "epoch": 0.603738959467869, - "grad_norm": 8.365077018737793, - "learning_rate": 3.2558860240447836e-06, - "loss": 0.4554, - "mean_token_accuracy": 0.8507075756788254, - "num_tokens": 234317804.0, - "step": 194760 - }, - { - "entropy": 1.9414838343858718, - "epoch": 0.6037699585929187, - "grad_norm": 8.162266731262207, - "learning_rate": 3.2558024397008493e-06, - "loss": 0.4716, - "mean_token_accuracy": 0.8519972428679466, - "num_tokens": 234328443.0, - "step": 194770 - }, - { - "entropy": 1.894056186079979, - "epoch": 0.6038009577179684, - "grad_norm": 8.409675598144531, - "learning_rate": 3.255718861793857e-06, - "loss": 0.4551, - "mean_token_accuracy": 0.854424498975277, - "num_tokens": 234339704.0, - "step": 194780 - }, - { - "entropy": 1.8207903996109962, - "epoch": 0.6038319568430182, - "grad_norm": 6.82459831237793, - "learning_rate": 3.2556352903229823e-06, - "loss": 0.3634, - "mean_token_accuracy": 0.8668209493160248, - "num_tokens": 234352453.0, - "step": 194790 - }, - { - "entropy": 1.8704303100705146, - "epoch": 0.6038629559680678, - "grad_norm": 8.402528762817383, - "learning_rate": 3.255551725287397e-06, - "loss": 0.4087, - "mean_token_accuracy": 0.8615420445799827, - "num_tokens": 234364986.0, - "step": 194800 - }, - { - "entropy": 1.8487682089209556, - "epoch": 0.6038939550931175, - "grad_norm": 8.179166793823242, - "learning_rate": 3.255468166686277e-06, - "loss": 0.478, - "mean_token_accuracy": 0.8573832005262375, - "num_tokens": 234376774.0, - "step": 194810 - }, - { - "entropy": 1.8813228532671928, - "epoch": 0.6039249542181672, - "grad_norm": 9.37818431854248, - "learning_rate": 3.2553846145187955e-06, - "loss": 0.462, - "mean_token_accuracy": 0.8545000702142715, - "num_tokens": 234389254.0, - "step": 194820 - }, - { - "entropy": 1.8351835548877715, - "epoch": 0.6039559533432168, - "grad_norm": 8.236607551574707, - "learning_rate": 3.2553010687841272e-06, - "loss": 0.3922, - "mean_token_accuracy": 0.8637291580438614, - "num_tokens": 234401607.0, - "step": 194830 - }, - { - "entropy": 1.7457613579928875, - "epoch": 0.6039869524682666, - "grad_norm": 8.144421577453613, - "learning_rate": 3.2552175294814477e-06, - "loss": 0.323, - "mean_token_accuracy": 0.8719058111310005, - "num_tokens": 234415866.0, - "step": 194840 - }, - { - "entropy": 1.9108372643589973, - "epoch": 0.6040179515933163, - "grad_norm": 8.523344039916992, - "learning_rate": 3.2551339966099303e-06, - "loss": 0.4617, - "mean_token_accuracy": 0.8551770314574242, - "num_tokens": 234427296.0, - "step": 194850 - }, - { - "entropy": 1.9472704499959945, - "epoch": 0.604048950718366, - "grad_norm": 8.217750549316406, - "learning_rate": 3.2550504701687506e-06, - "loss": 0.4961, - "mean_token_accuracy": 0.8496365174651146, - "num_tokens": 234438164.0, - "step": 194860 - }, - { - "entropy": 1.8766314849257468, - "epoch": 0.6040799498434156, - "grad_norm": 7.862065315246582, - "learning_rate": 3.254966950157084e-06, - "loss": 0.388, - "mean_token_accuracy": 0.8675270214676857, - "num_tokens": 234449674.0, - "step": 194870 - }, - { - "entropy": 1.9081780746579171, - "epoch": 0.6041109489684654, - "grad_norm": 8.318652153015137, - "learning_rate": 3.2548834365741045e-06, - "loss": 0.4496, - "mean_token_accuracy": 0.8522797405719758, - "num_tokens": 234460914.0, - "step": 194880 - }, - { - "entropy": 1.9317078649997712, - "epoch": 0.6041419480935151, - "grad_norm": 7.822876930236816, - "learning_rate": 3.2547999294189885e-06, - "loss": 0.4632, - "mean_token_accuracy": 0.8557189866900444, - "num_tokens": 234471839.0, - "step": 194890 - }, - { - "entropy": 1.944938975572586, - "epoch": 0.6041729472185647, - "grad_norm": 8.961193084716797, - "learning_rate": 3.2547164286909106e-06, - "loss": 0.4491, - "mean_token_accuracy": 0.8495589464902877, - "num_tokens": 234483129.0, - "step": 194900 - }, - { - "entropy": 1.8596145376563071, - "epoch": 0.6042039463436144, - "grad_norm": 8.231999397277832, - "learning_rate": 3.2546329343890477e-06, - "loss": 0.4084, - "mean_token_accuracy": 0.8626977398991584, - "num_tokens": 234495421.0, - "step": 194910 - }, - { - "entropy": 1.9315678521990776, - "epoch": 0.6042349454686642, - "grad_norm": 8.767191886901855, - "learning_rate": 3.254549446512574e-06, - "loss": 0.5051, - "mean_token_accuracy": 0.8393030554056168, - "num_tokens": 234507370.0, - "step": 194920 - }, - { - "entropy": 1.863025739789009, - "epoch": 0.6042659445937139, - "grad_norm": 9.030223846435547, - "learning_rate": 3.2544659650606662e-06, - "loss": 0.4552, - "mean_token_accuracy": 0.8481202870607376, - "num_tokens": 234520201.0, - "step": 194930 - }, - { - "entropy": 1.9192076668143272, - "epoch": 0.6042969437187635, - "grad_norm": 8.617793083190918, - "learning_rate": 3.254382490032501e-06, - "loss": 0.4622, - "mean_token_accuracy": 0.8552293330430984, - "num_tokens": 234532032.0, - "step": 194940 - }, - { - "entropy": 1.869245907664299, - "epoch": 0.6043279428438132, - "grad_norm": 8.926547050476074, - "learning_rate": 3.2542990214272536e-06, - "loss": 0.4473, - "mean_token_accuracy": 0.8564789742231369, - "num_tokens": 234543524.0, - "step": 194950 - }, - { - "entropy": 1.8756817936897279, - "epoch": 0.604358941968863, - "grad_norm": 10.440224647521973, - "learning_rate": 3.254215559244101e-06, - "loss": 0.406, - "mean_token_accuracy": 0.8592181280255318, - "num_tokens": 234555409.0, - "step": 194960 - }, - { - "entropy": 1.974171131849289, - "epoch": 0.6043899410939126, - "grad_norm": 6.878018856048584, - "learning_rate": 3.2541321034822192e-06, - "loss": 0.4648, - "mean_token_accuracy": 0.8529852941632271, - "num_tokens": 234566421.0, - "step": 194970 - }, - { - "entropy": 1.9051358461380006, - "epoch": 0.6044209402189623, - "grad_norm": 7.7151641845703125, - "learning_rate": 3.2540486541407855e-06, - "loss": 0.4402, - "mean_token_accuracy": 0.8591649144887924, - "num_tokens": 234577678.0, - "step": 194980 - }, - { - "entropy": 1.8991325095295906, - "epoch": 0.604451939344012, - "grad_norm": 7.806362628936768, - "learning_rate": 3.2539652112189764e-06, - "loss": 0.4509, - "mean_token_accuracy": 0.8524706467986107, - "num_tokens": 234589834.0, - "step": 194990 - }, - { - "entropy": 1.9010869175195695, - "epoch": 0.6044829384690618, - "grad_norm": 4.471253871917725, - "learning_rate": 3.2538817747159686e-06, - "loss": 0.4381, - "mean_token_accuracy": 0.8526588916778565, - "num_tokens": 234601789.0, - "step": 195000 - }, - { - "entropy": 1.9764428943395616, - "epoch": 0.6045139375941114, - "grad_norm": 9.513835906982422, - "learning_rate": 3.253798344630939e-06, - "loss": 0.4746, - "mean_token_accuracy": 0.8532751992344856, - "num_tokens": 234612475.0, - "step": 195010 - }, - { - "entropy": 1.859013244509697, - "epoch": 0.6045449367191611, - "grad_norm": 8.61437702178955, - "learning_rate": 3.2537149209630657e-06, - "loss": 0.4175, - "mean_token_accuracy": 0.8659439861774445, - "num_tokens": 234625091.0, - "step": 195020 - }, - { - "entropy": 1.8695720225572585, - "epoch": 0.6045759358442108, - "grad_norm": 8.737157821655273, - "learning_rate": 3.2536315037115256e-06, - "loss": 0.4483, - "mean_token_accuracy": 0.8510843962430954, - "num_tokens": 234637555.0, - "step": 195030 - }, - { - "entropy": 1.890501080453396, - "epoch": 0.6046069349692605, - "grad_norm": 7.709783554077148, - "learning_rate": 3.253548092875496e-06, - "loss": 0.441, - "mean_token_accuracy": 0.8527573123574257, - "num_tokens": 234650075.0, - "step": 195040 - }, - { - "entropy": 1.9229430109262466, - "epoch": 0.6046379340943102, - "grad_norm": 9.317115783691406, - "learning_rate": 3.2534646884541554e-06, - "loss": 0.5282, - "mean_token_accuracy": 0.8381368711590766, - "num_tokens": 234661397.0, - "step": 195050 - }, - { - "entropy": 1.88003631234169, - "epoch": 0.6046689332193599, - "grad_norm": 10.253046989440918, - "learning_rate": 3.253381290446681e-06, - "loss": 0.4203, - "mean_token_accuracy": 0.8574978530406951, - "num_tokens": 234673626.0, - "step": 195060 - }, - { - "entropy": 1.8511680349707604, - "epoch": 0.6046999323444096, - "grad_norm": 7.66872501373291, - "learning_rate": 3.2532978988522507e-06, - "loss": 0.4148, - "mean_token_accuracy": 0.8553901076316833, - "num_tokens": 234685268.0, - "step": 195070 - }, - { - "entropy": 1.792649395763874, - "epoch": 0.6047309314694592, - "grad_norm": 4.690402984619141, - "learning_rate": 3.2532145136700434e-06, - "loss": 0.3734, - "mean_token_accuracy": 0.87020503282547, - "num_tokens": 234699509.0, - "step": 195080 - }, - { - "entropy": 1.8900166735053063, - "epoch": 0.604761930594509, - "grad_norm": 7.005067825317383, - "learning_rate": 3.2531311348992368e-06, - "loss": 0.4268, - "mean_token_accuracy": 0.8626169845461845, - "num_tokens": 234711090.0, - "step": 195090 - }, - { - "entropy": 1.9425034090876578, - "epoch": 0.6047929297195587, - "grad_norm": 9.413784980773926, - "learning_rate": 3.2530477625390084e-06, - "loss": 0.4869, - "mean_token_accuracy": 0.8451692447066307, - "num_tokens": 234722059.0, - "step": 195100 - }, - { - "entropy": 1.7889280065894126, - "epoch": 0.6048239288446083, - "grad_norm": 3.961284637451172, - "learning_rate": 3.252964396588539e-06, - "loss": 0.3559, - "mean_token_accuracy": 0.8673868477344513, - "num_tokens": 234735463.0, - "step": 195110 - }, - { - "entropy": 1.7480084210634232, - "epoch": 0.604854927969658, - "grad_norm": 7.0458292961120605, - "learning_rate": 3.2528810370470048e-06, - "loss": 0.3167, - "mean_token_accuracy": 0.8772800341248512, - "num_tokens": 234748860.0, - "step": 195120 - }, - { - "entropy": 1.8654575437307357, - "epoch": 0.6048859270947078, - "grad_norm": 7.868505001068115, - "learning_rate": 3.252797683913587e-06, - "loss": 0.4293, - "mean_token_accuracy": 0.8634969860315322, - "num_tokens": 234760800.0, - "step": 195130 - }, - { - "entropy": 1.8794177323579788, - "epoch": 0.6049169262197575, - "grad_norm": 9.02531909942627, - "learning_rate": 3.2527143371874637e-06, - "loss": 0.4453, - "mean_token_accuracy": 0.8530821368098259, - "num_tokens": 234772987.0, - "step": 195140 - }, - { - "entropy": 1.925089493393898, - "epoch": 0.6049479253448071, - "grad_norm": 9.341005325317383, - "learning_rate": 3.2526309968678134e-06, - "loss": 0.4984, - "mean_token_accuracy": 0.8462122529745102, - "num_tokens": 234783221.0, - "step": 195150 - }, - { - "entropy": 1.8883005887269975, - "epoch": 0.6049789244698568, - "grad_norm": 8.575138092041016, - "learning_rate": 3.252547662953816e-06, - "loss": 0.4475, - "mean_token_accuracy": 0.8575668081641197, - "num_tokens": 234794379.0, - "step": 195160 - }, - { - "entropy": 1.862256444990635, - "epoch": 0.6050099235949066, - "grad_norm": 7.103546619415283, - "learning_rate": 3.2524643354446505e-06, - "loss": 0.4073, - "mean_token_accuracy": 0.8599424958229065, - "num_tokens": 234806398.0, - "step": 195170 - }, - { - "entropy": 1.8624321684241294, - "epoch": 0.6050409227199562, - "grad_norm": 3.818366050720215, - "learning_rate": 3.252381014339498e-06, - "loss": 0.4032, - "mean_token_accuracy": 0.8636543408036232, - "num_tokens": 234817814.0, - "step": 195180 - }, - { - "entropy": 1.8703722521662711, - "epoch": 0.6050719218450059, - "grad_norm": 4.305206298828125, - "learning_rate": 3.2522976996375367e-06, - "loss": 0.404, - "mean_token_accuracy": 0.8602907463908196, - "num_tokens": 234829832.0, - "step": 195190 - }, - { - "entropy": 1.8468534156680108, - "epoch": 0.6051029209700556, - "grad_norm": 3.9072325229644775, - "learning_rate": 3.2522143913379473e-06, - "loss": 0.4012, - "mean_token_accuracy": 0.8623139545321464, - "num_tokens": 234843094.0, - "step": 195200 - }, - { - "entropy": 1.8561934053897857, - "epoch": 0.6051339200951054, - "grad_norm": 4.525662899017334, - "learning_rate": 3.252131089439909e-06, - "loss": 0.4292, - "mean_token_accuracy": 0.8550740092992782, - "num_tokens": 234856218.0, - "step": 195210 - }, - { - "entropy": 1.8898674458265305, - "epoch": 0.605164919220155, - "grad_norm": 7.813089370727539, - "learning_rate": 3.2520477939426032e-06, - "loss": 0.4168, - "mean_token_accuracy": 0.8595176964998246, - "num_tokens": 234867782.0, - "step": 195220 - }, - { - "entropy": 1.8157613292336463, - "epoch": 0.6051959183452047, - "grad_norm": 3.6355841159820557, - "learning_rate": 3.2519645048452092e-06, - "loss": 0.3635, - "mean_token_accuracy": 0.866732519865036, - "num_tokens": 234880463.0, - "step": 195230 - }, - { - "entropy": 1.854991014301777, - "epoch": 0.6052269174702544, - "grad_norm": 3.633997678756714, - "learning_rate": 3.2518812221469077e-06, - "loss": 0.43, - "mean_token_accuracy": 0.858416149020195, - "num_tokens": 234892857.0, - "step": 195240 - }, - { - "entropy": 1.9053555011749268, - "epoch": 0.6052579165953041, - "grad_norm": 4.182331562042236, - "learning_rate": 3.2517979458468796e-06, - "loss": 0.5057, - "mean_token_accuracy": 0.8488553673028946, - "num_tokens": 234904345.0, - "step": 195250 - }, - { - "entropy": 1.8818264544010161, - "epoch": 0.6052889157203538, - "grad_norm": 7.960140228271484, - "learning_rate": 3.251714675944306e-06, - "loss": 0.4199, - "mean_token_accuracy": 0.8615189298987389, - "num_tokens": 234916251.0, - "step": 195260 - }, - { - "entropy": 1.8361662790179252, - "epoch": 0.6053199148454035, - "grad_norm": 8.360210418701172, - "learning_rate": 3.251631412438367e-06, - "loss": 0.3575, - "mean_token_accuracy": 0.8698137417435646, - "num_tokens": 234928346.0, - "step": 195270 - }, - { - "entropy": 1.7871503584086894, - "epoch": 0.6053509139704532, - "grad_norm": 7.105410575866699, - "learning_rate": 3.2515481553282447e-06, - "loss": 0.3557, - "mean_token_accuracy": 0.8668757110834122, - "num_tokens": 234942366.0, - "step": 195280 - }, - { - "entropy": 1.7971447199583053, - "epoch": 0.6053819130955029, - "grad_norm": 7.089763164520264, - "learning_rate": 3.2514649046131196e-06, - "loss": 0.4001, - "mean_token_accuracy": 0.8638567492365837, - "num_tokens": 234955981.0, - "step": 195290 - }, - { - "entropy": 1.9256662502884865, - "epoch": 0.6054129122205526, - "grad_norm": 7.474786281585693, - "learning_rate": 3.251381660292173e-06, - "loss": 0.4329, - "mean_token_accuracy": 0.8565624952316284, - "num_tokens": 234967329.0, - "step": 195300 - }, - { - "entropy": 1.899828238785267, - "epoch": 0.6054439113456023, - "grad_norm": 8.254742622375488, - "learning_rate": 3.251298422364587e-06, - "loss": 0.4554, - "mean_token_accuracy": 0.8602090954780579, - "num_tokens": 234978476.0, - "step": 195310 - }, - { - "entropy": 1.7984695211052895, - "epoch": 0.6054749104706519, - "grad_norm": 7.558569431304932, - "learning_rate": 3.251215190829542e-06, - "loss": 0.3842, - "mean_token_accuracy": 0.8689236968755722, - "num_tokens": 234991750.0, - "step": 195320 - }, - { - "entropy": 1.885707500576973, - "epoch": 0.6055059095957016, - "grad_norm": 9.93189811706543, - "learning_rate": 3.2511319656862224e-06, - "loss": 0.4656, - "mean_token_accuracy": 0.8508443906903267, - "num_tokens": 235003414.0, - "step": 195330 - }, - { - "entropy": 1.954411643743515, - "epoch": 0.6055369087207514, - "grad_norm": 7.854653835296631, - "learning_rate": 3.251048746933807e-06, - "loss": 0.4464, - "mean_token_accuracy": 0.8575719073414803, - "num_tokens": 235014106.0, - "step": 195340 - }, - { - "entropy": 1.9777034103870392, - "epoch": 0.6055679078458011, - "grad_norm": 7.847527980804443, - "learning_rate": 3.2509655345714796e-06, - "loss": 0.4875, - "mean_token_accuracy": 0.850102536380291, - "num_tokens": 235024582.0, - "step": 195350 - }, - { - "entropy": 1.868042555451393, - "epoch": 0.6055989069708507, - "grad_norm": 7.146167755126953, - "learning_rate": 3.2508823285984228e-06, - "loss": 0.414, - "mean_token_accuracy": 0.8623890817165375, - "num_tokens": 235036898.0, - "step": 195360 - }, - { - "entropy": 1.856178944557905, - "epoch": 0.6056299060959004, - "grad_norm": 7.413034439086914, - "learning_rate": 3.250799129013818e-06, - "loss": 0.4141, - "mean_token_accuracy": 0.8633477687835693, - "num_tokens": 235048923.0, - "step": 195370 - }, - { - "entropy": 1.845955815911293, - "epoch": 0.6056609052209502, - "grad_norm": 3.1155219078063965, - "learning_rate": 3.2507159358168485e-06, - "loss": 0.3825, - "mean_token_accuracy": 0.8615516528487206, - "num_tokens": 235061678.0, - "step": 195380 - }, - { - "entropy": 1.8690281867980958, - "epoch": 0.6056919043459998, - "grad_norm": 9.380828857421875, - "learning_rate": 3.250632749006697e-06, - "loss": 0.4436, - "mean_token_accuracy": 0.8594823315739631, - "num_tokens": 235074606.0, - "step": 195390 - }, - { - "entropy": 1.8899589315056802, - "epoch": 0.6057229034710495, - "grad_norm": 3.6747963428497314, - "learning_rate": 3.2505495685825455e-06, - "loss": 0.4324, - "mean_token_accuracy": 0.8502388820052147, - "num_tokens": 235087108.0, - "step": 195400 - }, - { - "entropy": 1.8682299882173539, - "epoch": 0.6057539025960992, - "grad_norm": 4.402914524078369, - "learning_rate": 3.250466394543577e-06, - "loss": 0.4336, - "mean_token_accuracy": 0.854416199028492, - "num_tokens": 235099187.0, - "step": 195410 - }, - { - "entropy": 1.8698025971651078, - "epoch": 0.605784901721149, - "grad_norm": 3.2012693881988525, - "learning_rate": 3.2503832268889757e-06, - "loss": 0.3847, - "mean_token_accuracy": 0.8672586590051651, - "num_tokens": 235111448.0, - "step": 195420 - }, - { - "entropy": 1.831081511080265, - "epoch": 0.6058159008461986, - "grad_norm": 9.413392066955566, - "learning_rate": 3.250300065617925e-06, - "loss": 0.3793, - "mean_token_accuracy": 0.8682454422116279, - "num_tokens": 235124547.0, - "step": 195430 - }, - { - "entropy": 1.7941778182983399, - "epoch": 0.6058468999712483, - "grad_norm": 3.7888948917388916, - "learning_rate": 3.250216910729607e-06, - "loss": 0.3671, - "mean_token_accuracy": 0.8639795690774917, - "num_tokens": 235138392.0, - "step": 195440 - }, - { - "entropy": 1.8263335436582566, - "epoch": 0.605877899096298, - "grad_norm": 6.821713924407959, - "learning_rate": 3.2501337622232058e-06, - "loss": 0.4002, - "mean_token_accuracy": 0.8669615253806114, - "num_tokens": 235151379.0, - "step": 195450 - }, - { - "entropy": 1.95903280377388, - "epoch": 0.6059088982213477, - "grad_norm": 7.372539520263672, - "learning_rate": 3.250050620097905e-06, - "loss": 0.4776, - "mean_token_accuracy": 0.8572368696331978, - "num_tokens": 235162289.0, - "step": 195460 - }, - { - "entropy": 1.9106680050492286, - "epoch": 0.6059398973463974, - "grad_norm": 8.736983299255371, - "learning_rate": 3.2499674843528896e-06, - "loss": 0.4272, - "mean_token_accuracy": 0.86090597063303, - "num_tokens": 235174024.0, - "step": 195470 - }, - { - "entropy": 1.907187005877495, - "epoch": 0.6059708964714471, - "grad_norm": 10.137788772583008, - "learning_rate": 3.249884354987342e-06, - "loss": 0.4589, - "mean_token_accuracy": 0.8515108034014702, - "num_tokens": 235185376.0, - "step": 195480 - }, - { - "entropy": 1.8968829110264778, - "epoch": 0.6060018955964968, - "grad_norm": 4.01402473449707, - "learning_rate": 3.2498012320004473e-06, - "loss": 0.4677, - "mean_token_accuracy": 0.8576158374547959, - "num_tokens": 235197232.0, - "step": 195490 - }, - { - "entropy": 1.957116511464119, - "epoch": 0.6060328947215465, - "grad_norm": 8.424763679504395, - "learning_rate": 3.24971811539139e-06, - "loss": 0.4923, - "mean_token_accuracy": 0.8485281109809876, - "num_tokens": 235207687.0, - "step": 195500 - }, - { - "entropy": 1.7959214732050897, - "epoch": 0.6060638938465962, - "grad_norm": 3.726801633834839, - "learning_rate": 3.249635005159353e-06, - "loss": 0.3504, - "mean_token_accuracy": 0.8715131148695946, - "num_tokens": 235220308.0, - "step": 195510 - }, - { - "entropy": 1.8073456510901451, - "epoch": 0.6060948929716459, - "grad_norm": 9.020705223083496, - "learning_rate": 3.2495519013035233e-06, - "loss": 0.4018, - "mean_token_accuracy": 0.8587002739310264, - "num_tokens": 235234216.0, - "step": 195520 - }, - { - "entropy": 1.8868019104003906, - "epoch": 0.6061258920966955, - "grad_norm": 8.478205680847168, - "learning_rate": 3.249468803823083e-06, - "loss": 0.4385, - "mean_token_accuracy": 0.8596696853637695, - "num_tokens": 235245495.0, - "step": 195530 - }, - { - "entropy": 1.9176657855510713, - "epoch": 0.6061568912217453, - "grad_norm": 10.844125747680664, - "learning_rate": 3.2493857127172197e-06, - "loss": 0.4828, - "mean_token_accuracy": 0.8496246546506881, - "num_tokens": 235256714.0, - "step": 195540 - }, - { - "entropy": 1.95402589738369, - "epoch": 0.606187890346795, - "grad_norm": 8.79029369354248, - "learning_rate": 3.2493026279851157e-06, - "loss": 0.5417, - "mean_token_accuracy": 0.8361954748630523, - "num_tokens": 235267437.0, - "step": 195550 - }, - { - "entropy": 1.9639874294400215, - "epoch": 0.6062188894718447, - "grad_norm": 6.906721591949463, - "learning_rate": 3.249219549625959e-06, - "loss": 0.462, - "mean_token_accuracy": 0.8525883480906487, - "num_tokens": 235278628.0, - "step": 195560 - }, - { - "entropy": 1.8649201110005378, - "epoch": 0.6062498885968943, - "grad_norm": 9.573200225830078, - "learning_rate": 3.2491364776389322e-06, - "loss": 0.4099, - "mean_token_accuracy": 0.8608361706137657, - "num_tokens": 235290518.0, - "step": 195570 - }, - { - "entropy": 1.8654879599809646, - "epoch": 0.606280887721944, - "grad_norm": 5.2166666984558105, - "learning_rate": 3.2490534120232226e-06, - "loss": 0.4454, - "mean_token_accuracy": 0.8498337626457214, - "num_tokens": 235303213.0, - "step": 195580 - }, - { - "entropy": 1.8535957843065263, - "epoch": 0.6063118868469938, - "grad_norm": 3.9607527256011963, - "learning_rate": 3.248970352778015e-06, - "loss": 0.4155, - "mean_token_accuracy": 0.8599080324172974, - "num_tokens": 235316471.0, - "step": 195590 - }, - { - "entropy": 1.862211149930954, - "epoch": 0.6063428859720434, - "grad_norm": 9.359057426452637, - "learning_rate": 3.2488872999024964e-06, - "loss": 0.4519, - "mean_token_accuracy": 0.8552052542567253, - "num_tokens": 235328218.0, - "step": 195600 - }, - { - "entropy": 1.9296976819634437, - "epoch": 0.6063738850970931, - "grad_norm": 4.127715587615967, - "learning_rate": 3.2488042533958505e-06, - "loss": 0.4762, - "mean_token_accuracy": 0.8469863876700401, - "num_tokens": 235339957.0, - "step": 195610 - }, - { - "entropy": 1.880443413555622, - "epoch": 0.6064048842221428, - "grad_norm": 7.147751808166504, - "learning_rate": 3.2487212132572648e-06, - "loss": 0.4236, - "mean_token_accuracy": 0.851515157520771, - "num_tokens": 235352123.0, - "step": 195620 - }, - { - "entropy": 1.938239911198616, - "epoch": 0.6064358833471926, - "grad_norm": 7.499401092529297, - "learning_rate": 3.2486381794859252e-06, - "loss": 0.4796, - "mean_token_accuracy": 0.8516676813364029, - "num_tokens": 235363800.0, - "step": 195630 - }, - { - "entropy": 1.9163310438394547, - "epoch": 0.6064668824722422, - "grad_norm": 7.778500080108643, - "learning_rate": 3.2485551520810183e-06, - "loss": 0.4688, - "mean_token_accuracy": 0.8536335572600364, - "num_tokens": 235375270.0, - "step": 195640 - }, - { - "entropy": 1.928651387989521, - "epoch": 0.6064978815972919, - "grad_norm": 13.652063369750977, - "learning_rate": 3.24847213104173e-06, - "loss": 0.4866, - "mean_token_accuracy": 0.8396665036678315, - "num_tokens": 235386782.0, - "step": 195650 - }, - { - "entropy": 1.778220947086811, - "epoch": 0.6065288807223416, - "grad_norm": 4.027807712554932, - "learning_rate": 3.2483891163672477e-06, - "loss": 0.4061, - "mean_token_accuracy": 0.8619612753391266, - "num_tokens": 235401570.0, - "step": 195660 - }, - { - "entropy": 1.7784413129091263, - "epoch": 0.6065598798473913, - "grad_norm": 4.169196605682373, - "learning_rate": 3.248306108056758e-06, - "loss": 0.3822, - "mean_token_accuracy": 0.8609212309122085, - "num_tokens": 235414496.0, - "step": 195670 - }, - { - "entropy": 1.8775099590420723, - "epoch": 0.606590878972441, - "grad_norm": 7.162752628326416, - "learning_rate": 3.2482231061094467e-06, - "loss": 0.4024, - "mean_token_accuracy": 0.8568590208888054, - "num_tokens": 235426691.0, - "step": 195680 - }, - { - "entropy": 1.8699518121778964, - "epoch": 0.6066218780974907, - "grad_norm": 4.00959587097168, - "learning_rate": 3.248140110524503e-06, - "loss": 0.366, - "mean_token_accuracy": 0.8694511279463768, - "num_tokens": 235439467.0, - "step": 195690 - }, - { - "entropy": 1.7986382976174355, - "epoch": 0.6066528772225404, - "grad_norm": 7.902369499206543, - "learning_rate": 3.248057121301112e-06, - "loss": 0.3647, - "mean_token_accuracy": 0.8713407471776009, - "num_tokens": 235451837.0, - "step": 195700 - }, - { - "entropy": 1.853308279812336, - "epoch": 0.6066838763475901, - "grad_norm": 4.1074628829956055, - "learning_rate": 3.247974138438463e-06, - "loss": 0.3729, - "mean_token_accuracy": 0.87129997164011, - "num_tokens": 235464410.0, - "step": 195710 - }, - { - "entropy": 1.9251540154218674, - "epoch": 0.6067148754726398, - "grad_norm": 7.291789531707764, - "learning_rate": 3.2478911619357413e-06, - "loss": 0.4646, - "mean_token_accuracy": 0.8508058547973633, - "num_tokens": 235475058.0, - "step": 195720 - }, - { - "entropy": 1.7726059317588807, - "epoch": 0.6067458745976895, - "grad_norm": 9.575237274169922, - "learning_rate": 3.2478081917921357e-06, - "loss": 0.383, - "mean_token_accuracy": 0.8688571244478226, - "num_tokens": 235487731.0, - "step": 195730 - }, - { - "entropy": 1.8045031249523162, - "epoch": 0.6067768737227391, - "grad_norm": 4.120028018951416, - "learning_rate": 3.247725228006835e-06, - "loss": 0.3759, - "mean_token_accuracy": 0.8642520338296891, - "num_tokens": 235500017.0, - "step": 195740 - }, - { - "entropy": 1.929566852748394, - "epoch": 0.6068078728477889, - "grad_norm": 5.6811203956604, - "learning_rate": 3.2476422705790256e-06, - "loss": 0.484, - "mean_token_accuracy": 0.8492823615670204, - "num_tokens": 235511445.0, - "step": 195750 - }, - { - "entropy": 1.8349244624376297, - "epoch": 0.6068388719728386, - "grad_norm": 7.675958156585693, - "learning_rate": 3.2475593195078966e-06, - "loss": 0.4712, - "mean_token_accuracy": 0.8610428079962731, - "num_tokens": 235524298.0, - "step": 195760 - }, - { - "entropy": 1.8410429537296296, - "epoch": 0.6068698710978883, - "grad_norm": 6.709105014801025, - "learning_rate": 3.2474763747926347e-06, - "loss": 0.4403, - "mean_token_accuracy": 0.8631402567029, - "num_tokens": 235536357.0, - "step": 195770 - }, - { - "entropy": 1.949179396033287, - "epoch": 0.6069008702229379, - "grad_norm": 7.939975261688232, - "learning_rate": 3.2473934364324306e-06, - "loss": 0.4539, - "mean_token_accuracy": 0.8561415210366249, - "num_tokens": 235547135.0, - "step": 195780 - }, - { - "entropy": 1.8811473071575164, - "epoch": 0.6069318693479877, - "grad_norm": 8.053126335144043, - "learning_rate": 3.2473105044264703e-06, - "loss": 0.4369, - "mean_token_accuracy": 0.8551708459854126, - "num_tokens": 235558745.0, - "step": 195790 - }, - { - "entropy": 1.921061459183693, - "epoch": 0.6069628684730374, - "grad_norm": 9.278833389282227, - "learning_rate": 3.2472275787739445e-06, - "loss": 0.5125, - "mean_token_accuracy": 0.8361942797899247, - "num_tokens": 235570191.0, - "step": 195800 - }, - { - "entropy": 1.7232392877340317, - "epoch": 0.606993867598087, - "grad_norm": 8.542473793029785, - "learning_rate": 3.2471446594740413e-06, - "loss": 0.3554, - "mean_token_accuracy": 0.8698214948177337, - "num_tokens": 235584902.0, - "step": 195810 - }, - { - "entropy": 1.923740841448307, - "epoch": 0.6070248667231367, - "grad_norm": 8.452467918395996, - "learning_rate": 3.2470617465259496e-06, - "loss": 0.4753, - "mean_token_accuracy": 0.8503957465291023, - "num_tokens": 235596504.0, - "step": 195820 - }, - { - "entropy": 1.8590814411640166, - "epoch": 0.6070558658481864, - "grad_norm": 4.637422561645508, - "learning_rate": 3.246978839928858e-06, - "loss": 0.4276, - "mean_token_accuracy": 0.8605611816048622, - "num_tokens": 235608967.0, - "step": 195830 - }, - { - "entropy": 1.8847665458917617, - "epoch": 0.6070868649732362, - "grad_norm": 7.79510498046875, - "learning_rate": 3.246895939681957e-06, - "loss": 0.4712, - "mean_token_accuracy": 0.8544412195682526, - "num_tokens": 235621390.0, - "step": 195840 - }, - { - "entropy": 1.8920132741332054, - "epoch": 0.6071178640982858, - "grad_norm": 10.064355850219727, - "learning_rate": 3.2468130457844353e-06, - "loss": 0.4621, - "mean_token_accuracy": 0.8499759629368782, - "num_tokens": 235633162.0, - "step": 195850 - }, - { - "entropy": 1.9165378838777543, - "epoch": 0.6071488632233355, - "grad_norm": 6.8753132820129395, - "learning_rate": 3.2467301582354816e-06, - "loss": 0.5017, - "mean_token_accuracy": 0.8535776749253273, - "num_tokens": 235644499.0, - "step": 195860 - }, - { - "entropy": 1.837621060013771, - "epoch": 0.6071798623483852, - "grad_norm": 4.135498046875, - "learning_rate": 3.2466472770342873e-06, - "loss": 0.4054, - "mean_token_accuracy": 0.8666768983006478, - "num_tokens": 235656397.0, - "step": 195870 - }, - { - "entropy": 1.880282236635685, - "epoch": 0.607210861473435, - "grad_norm": 7.622752666473389, - "learning_rate": 3.24656440218004e-06, - "loss": 0.4346, - "mean_token_accuracy": 0.8597177520394326, - "num_tokens": 235668423.0, - "step": 195880 - }, - { - "entropy": 1.8491656512022019, - "epoch": 0.6072418605984846, - "grad_norm": 8.067870140075684, - "learning_rate": 3.2464815336719317e-06, - "loss": 0.3972, - "mean_token_accuracy": 0.8632102563977242, - "num_tokens": 235680658.0, - "step": 195890 - }, - { - "entropy": 1.8378586277365685, - "epoch": 0.6072728597235343, - "grad_norm": 4.336076259613037, - "learning_rate": 3.2463986715091527e-06, - "loss": 0.4127, - "mean_token_accuracy": 0.859869047999382, - "num_tokens": 235693223.0, - "step": 195900 - }, - { - "entropy": 1.7845323011279106, - "epoch": 0.607303858848584, - "grad_norm": 8.610404968261719, - "learning_rate": 3.2463158156908914e-06, - "loss": 0.3724, - "mean_token_accuracy": 0.8675568237900734, - "num_tokens": 235706654.0, - "step": 195910 - }, - { - "entropy": 1.9118803530931472, - "epoch": 0.6073348579736337, - "grad_norm": 9.936572074890137, - "learning_rate": 3.246232966216339e-06, - "loss": 0.4685, - "mean_token_accuracy": 0.8511000022292137, - "num_tokens": 235718122.0, - "step": 195920 - }, - { - "entropy": 1.886147889494896, - "epoch": 0.6073658570986834, - "grad_norm": 8.693392753601074, - "learning_rate": 3.2461501230846863e-06, - "loss": 0.4781, - "mean_token_accuracy": 0.8479161009192466, - "num_tokens": 235729645.0, - "step": 195930 - }, - { - "entropy": 1.8959844335913658, - "epoch": 0.6073968562237331, - "grad_norm": 8.2455472946167, - "learning_rate": 3.2460672862951235e-06, - "loss": 0.4382, - "mean_token_accuracy": 0.8538859143853188, - "num_tokens": 235741982.0, - "step": 195940 - }, - { - "entropy": 1.907785764336586, - "epoch": 0.6074278553487827, - "grad_norm": 7.019741058349609, - "learning_rate": 3.2459844558468436e-06, - "loss": 0.4127, - "mean_token_accuracy": 0.8617287069559098, - "num_tokens": 235753575.0, - "step": 195950 - }, - { - "entropy": 1.840754969418049, - "epoch": 0.6074588544738325, - "grad_norm": 8.111040115356445, - "learning_rate": 3.245901631739034e-06, - "loss": 0.415, - "mean_token_accuracy": 0.8641861319541931, - "num_tokens": 235765664.0, - "step": 195960 - }, - { - "entropy": 1.8874695479869843, - "epoch": 0.6074898535988822, - "grad_norm": 8.226910591125488, - "learning_rate": 3.2458188139708885e-06, - "loss": 0.4425, - "mean_token_accuracy": 0.8520441308617592, - "num_tokens": 235777654.0, - "step": 195970 - }, - { - "entropy": 1.9285912573337556, - "epoch": 0.6075208527239319, - "grad_norm": 8.283777236938477, - "learning_rate": 3.2457360025415967e-06, - "loss": 0.4783, - "mean_token_accuracy": 0.8478949934244155, - "num_tokens": 235789176.0, - "step": 195980 - }, - { - "entropy": 1.9073958545923233, - "epoch": 0.6075518518489815, - "grad_norm": 8.261918067932129, - "learning_rate": 3.245653197450352e-06, - "loss": 0.4654, - "mean_token_accuracy": 0.8549600809812545, - "num_tokens": 235800644.0, - "step": 195990 - }, - { - "entropy": 1.8290598958730697, - "epoch": 0.6075828509740313, - "grad_norm": 8.439154624938965, - "learning_rate": 3.2455703986963444e-06, - "loss": 0.472, - "mean_token_accuracy": 0.8555337622761726, - "num_tokens": 235813792.0, - "step": 196000 - }, - { - "entropy": 1.891948239505291, - "epoch": 0.607613850099081, - "grad_norm": 8.003715515136719, - "learning_rate": 3.2454876062787656e-06, - "loss": 0.4513, - "mean_token_accuracy": 0.8544703394174575, - "num_tokens": 235825305.0, - "step": 196010 - }, - { - "entropy": 1.8116373434662818, - "epoch": 0.6076448492241306, - "grad_norm": 7.792770862579346, - "learning_rate": 3.245404820196808e-06, - "loss": 0.3699, - "mean_token_accuracy": 0.8783703818917274, - "num_tokens": 235837596.0, - "step": 196020 - }, - { - "entropy": 1.8243522822856904, - "epoch": 0.6076758483491803, - "grad_norm": 8.489314079284668, - "learning_rate": 3.245322040449663e-06, - "loss": 0.4255, - "mean_token_accuracy": 0.8566062182188035, - "num_tokens": 235850827.0, - "step": 196030 - }, - { - "entropy": 1.823631004989147, - "epoch": 0.6077068474742301, - "grad_norm": 9.67441177368164, - "learning_rate": 3.245239267036524e-06, - "loss": 0.3855, - "mean_token_accuracy": 0.8649219900369645, - "num_tokens": 235863914.0, - "step": 196040 - }, - { - "entropy": 1.9446945399045945, - "epoch": 0.6077378465992798, - "grad_norm": 7.182814598083496, - "learning_rate": 3.245156499956582e-06, - "loss": 0.4913, - "mean_token_accuracy": 0.8489960566163063, - "num_tokens": 235874892.0, - "step": 196050 - }, - { - "entropy": 1.9042619869112969, - "epoch": 0.6077688457243294, - "grad_norm": 9.036920547485352, - "learning_rate": 3.24507373920903e-06, - "loss": 0.5058, - "mean_token_accuracy": 0.8501462116837502, - "num_tokens": 235886665.0, - "step": 196060 - }, - { - "entropy": 1.9438833177089692, - "epoch": 0.6077998448493791, - "grad_norm": 8.651317596435547, - "learning_rate": 3.2449909847930606e-06, - "loss": 0.5106, - "mean_token_accuracy": 0.843551354110241, - "num_tokens": 235897580.0, - "step": 196070 - }, - { - "entropy": 1.8759539812803268, - "epoch": 0.6078308439744288, - "grad_norm": 3.959768533706665, - "learning_rate": 3.244908236707866e-06, - "loss": 0.4392, - "mean_token_accuracy": 0.8555333316326141, - "num_tokens": 235909609.0, - "step": 196080 - }, - { - "entropy": 1.8777537867426872, - "epoch": 0.6078618430994785, - "grad_norm": 6.433634281158447, - "learning_rate": 3.24482549495264e-06, - "loss": 0.4071, - "mean_token_accuracy": 0.8643855184316636, - "num_tokens": 235922055.0, - "step": 196090 - }, - { - "entropy": 1.9086138397455215, - "epoch": 0.6078928422245282, - "grad_norm": 8.411335945129395, - "learning_rate": 3.244742759526575e-06, - "loss": 0.4146, - "mean_token_accuracy": 0.8512304544448852, - "num_tokens": 235933703.0, - "step": 196100 - }, - { - "entropy": 1.9222751021385194, - "epoch": 0.6079238413495779, - "grad_norm": 10.310203552246094, - "learning_rate": 3.244660030428864e-06, - "loss": 0.508, - "mean_token_accuracy": 0.8420830890536308, - "num_tokens": 235944993.0, - "step": 196110 - }, - { - "entropy": 1.90309539437294, - "epoch": 0.6079548404746276, - "grad_norm": 4.145473957061768, - "learning_rate": 3.2445773076587004e-06, - "loss": 0.4278, - "mean_token_accuracy": 0.8556212335824966, - "num_tokens": 235957125.0, - "step": 196120 - }, - { - "entropy": 1.8705694049596786, - "epoch": 0.6079858395996773, - "grad_norm": 7.968718528747559, - "learning_rate": 3.2444945912152776e-06, - "loss": 0.4219, - "mean_token_accuracy": 0.861976896226406, - "num_tokens": 235968426.0, - "step": 196130 - }, - { - "entropy": 1.8437914915382863, - "epoch": 0.608016838724727, - "grad_norm": 7.557614803314209, - "learning_rate": 3.244411881097789e-06, - "loss": 0.4583, - "mean_token_accuracy": 0.8543598294258118, - "num_tokens": 235981059.0, - "step": 196140 - }, - { - "entropy": 1.8821563974022866, - "epoch": 0.6080478378497767, - "grad_norm": 8.101224899291992, - "learning_rate": 3.2443291773054294e-06, - "loss": 0.4593, - "mean_token_accuracy": 0.848881970345974, - "num_tokens": 235992831.0, - "step": 196150 - }, - { - "entropy": 1.799989365041256, - "epoch": 0.6080788369748263, - "grad_norm": 6.886673450469971, - "learning_rate": 3.244246479837392e-06, - "loss": 0.4032, - "mean_token_accuracy": 0.8733502745628356, - "num_tokens": 236006483.0, - "step": 196160 - }, - { - "entropy": 1.9383499220013618, - "epoch": 0.6081098360998761, - "grad_norm": 7.37465763092041, - "learning_rate": 3.24416378869287e-06, - "loss": 0.4858, - "mean_token_accuracy": 0.8524863049387932, - "num_tokens": 236017916.0, - "step": 196170 - }, - { - "entropy": 1.8896848455071449, - "epoch": 0.6081408352249258, - "grad_norm": 6.178394794464111, - "learning_rate": 3.2440811038710583e-06, - "loss": 0.467, - "mean_token_accuracy": 0.8497600927948952, - "num_tokens": 236030461.0, - "step": 196180 - }, - { - "entropy": 1.9355302542448043, - "epoch": 0.6081718343499755, - "grad_norm": 7.876641750335693, - "learning_rate": 3.2439984253711515e-06, - "loss": 0.5212, - "mean_token_accuracy": 0.8411651849746704, - "num_tokens": 236042188.0, - "step": 196190 - }, - { - "entropy": 1.877279168367386, - "epoch": 0.6082028334750251, - "grad_norm": 6.912283420562744, - "learning_rate": 3.2439157531923432e-06, - "loss": 0.4101, - "mean_token_accuracy": 0.8578936800360679, - "num_tokens": 236053737.0, - "step": 196200 - }, - { - "entropy": 1.9325706675648688, - "epoch": 0.6082338326000749, - "grad_norm": 7.355538845062256, - "learning_rate": 3.2438330873338297e-06, - "loss": 0.4665, - "mean_token_accuracy": 0.8536846920847893, - "num_tokens": 236065060.0, - "step": 196210 - }, - { - "entropy": 1.9052503943443297, - "epoch": 0.6082648317251246, - "grad_norm": 7.190591812133789, - "learning_rate": 3.2437504277948032e-06, - "loss": 0.4317, - "mean_token_accuracy": 0.8584253296256066, - "num_tokens": 236077520.0, - "step": 196220 - }, - { - "entropy": 1.7885384008288383, - "epoch": 0.6082958308501742, - "grad_norm": 7.963929176330566, - "learning_rate": 3.2436677745744605e-06, - "loss": 0.3639, - "mean_token_accuracy": 0.865674552321434, - "num_tokens": 236090760.0, - "step": 196230 - }, - { - "entropy": 1.9493159145116805, - "epoch": 0.6083268299752239, - "grad_norm": 8.272683143615723, - "learning_rate": 3.243585127671996e-06, - "loss": 0.4766, - "mean_token_accuracy": 0.844498673081398, - "num_tokens": 236101923.0, - "step": 196240 - }, - { - "entropy": 1.92689688205719, - "epoch": 0.6083578291002737, - "grad_norm": 7.235065937042236, - "learning_rate": 3.2435024870866042e-06, - "loss": 0.4287, - "mean_token_accuracy": 0.8672019839286804, - "num_tokens": 236113110.0, - "step": 196250 - }, - { - "entropy": 1.9139114201068879, - "epoch": 0.6083888282253234, - "grad_norm": 8.594062805175781, - "learning_rate": 3.2434198528174823e-06, - "loss": 0.4418, - "mean_token_accuracy": 0.8535932213068008, - "num_tokens": 236124685.0, - "step": 196260 - }, - { - "entropy": 1.938517615199089, - "epoch": 0.608419827350373, - "grad_norm": 8.178302764892578, - "learning_rate": 3.2433372248638235e-06, - "loss": 0.4367, - "mean_token_accuracy": 0.8637794941663742, - "num_tokens": 236136003.0, - "step": 196270 - }, - { - "entropy": 1.825549528002739, - "epoch": 0.6084508264754227, - "grad_norm": 8.20291805267334, - "learning_rate": 3.2432546032248247e-06, - "loss": 0.3806, - "mean_token_accuracy": 0.866515477001667, - "num_tokens": 236149072.0, - "step": 196280 - }, - { - "entropy": 1.8921896636486053, - "epoch": 0.6084818256004725, - "grad_norm": 7.851335048675537, - "learning_rate": 3.2431719878996814e-06, - "loss": 0.4614, - "mean_token_accuracy": 0.8482361376285553, - "num_tokens": 236160335.0, - "step": 196290 - }, - { - "entropy": 1.8066128924489022, - "epoch": 0.6085128247255222, - "grad_norm": 8.3627347946167, - "learning_rate": 3.243089378887589e-06, - "loss": 0.4086, - "mean_token_accuracy": 0.8578595578670501, - "num_tokens": 236172757.0, - "step": 196300 - }, - { - "entropy": 1.8721036612987518, - "epoch": 0.6085438238505718, - "grad_norm": 8.146467208862305, - "learning_rate": 3.243006776187744e-06, - "loss": 0.4263, - "mean_token_accuracy": 0.8607116281986237, - "num_tokens": 236184544.0, - "step": 196310 - }, - { - "entropy": 1.8730005353689194, - "epoch": 0.6085748229756215, - "grad_norm": 3.757640838623047, - "learning_rate": 3.2429241797993433e-06, - "loss": 0.4125, - "mean_token_accuracy": 0.8655624583363533, - "num_tokens": 236196381.0, - "step": 196320 - }, - { - "entropy": 1.7163779377937316, - "epoch": 0.6086058221006712, - "grad_norm": 3.7942216396331787, - "learning_rate": 3.2428415897215815e-06, - "loss": 0.3292, - "mean_token_accuracy": 0.8697278842329978, - "num_tokens": 236210821.0, - "step": 196330 - }, - { - "entropy": 1.9790637254714967, - "epoch": 0.6086368212257209, - "grad_norm": 8.186054229736328, - "learning_rate": 3.242759005953657e-06, - "loss": 0.5292, - "mean_token_accuracy": 0.839950506389141, - "num_tokens": 236221650.0, - "step": 196340 - }, - { - "entropy": 1.9705463379621506, - "epoch": 0.6086678203507706, - "grad_norm": 6.929887771606445, - "learning_rate": 3.242676428494764e-06, - "loss": 0.478, - "mean_token_accuracy": 0.8496137440204621, - "num_tokens": 236232491.0, - "step": 196350 - }, - { - "entropy": 1.8872851014137269, - "epoch": 0.6086988194758203, - "grad_norm": 8.718164443969727, - "learning_rate": 3.2425938573441014e-06, - "loss": 0.4755, - "mean_token_accuracy": 0.8533385038375855, - "num_tokens": 236244089.0, - "step": 196360 - }, - { - "entropy": 1.7990367278456687, - "epoch": 0.6087298186008699, - "grad_norm": 8.366378784179688, - "learning_rate": 3.2425112925008656e-06, - "loss": 0.4352, - "mean_token_accuracy": 0.8653234079480171, - "num_tokens": 236258434.0, - "step": 196370 - }, - { - "entropy": 1.8804176807403565, - "epoch": 0.6087608177259197, - "grad_norm": 3.7890381813049316, - "learning_rate": 3.2424287339642526e-06, - "loss": 0.4348, - "mean_token_accuracy": 0.8637651592493057, - "num_tokens": 236269932.0, - "step": 196380 - }, - { - "entropy": 1.921053881943226, - "epoch": 0.6087918168509694, - "grad_norm": 8.576593399047852, - "learning_rate": 3.2423461817334606e-06, - "loss": 0.4843, - "mean_token_accuracy": 0.8536047399044037, - "num_tokens": 236281900.0, - "step": 196390 - }, - { - "entropy": 1.8900701761245728, - "epoch": 0.6088228159760191, - "grad_norm": 7.210069179534912, - "learning_rate": 3.2422636358076863e-06, - "loss": 0.447, - "mean_token_accuracy": 0.8583195567131042, - "num_tokens": 236292882.0, - "step": 196400 - }, - { - "entropy": 1.9627773761749268, - "epoch": 0.6088538151010687, - "grad_norm": 10.353533744812012, - "learning_rate": 3.242181096186128e-06, - "loss": 0.5226, - "mean_token_accuracy": 0.8441329196095466, - "num_tokens": 236303626.0, - "step": 196410 - }, - { - "entropy": 1.9172202914953231, - "epoch": 0.6088848142261185, - "grad_norm": 7.017533302307129, - "learning_rate": 3.2420985628679825e-06, - "loss": 0.4182, - "mean_token_accuracy": 0.8683144554495812, - "num_tokens": 236314947.0, - "step": 196420 - }, - { - "entropy": 1.7904640406370163, - "epoch": 0.6089158133511682, - "grad_norm": 8.5465087890625, - "learning_rate": 3.242016035852447e-06, - "loss": 0.393, - "mean_token_accuracy": 0.8550734579563141, - "num_tokens": 236327944.0, - "step": 196430 - }, - { - "entropy": 1.8345270901918411, - "epoch": 0.6089468124762178, - "grad_norm": 10.23214054107666, - "learning_rate": 3.241933515138721e-06, - "loss": 0.4352, - "mean_token_accuracy": 0.8526005238294602, - "num_tokens": 236339687.0, - "step": 196440 - }, - { - "entropy": 1.871364989876747, - "epoch": 0.6089778116012675, - "grad_norm": 3.8333592414855957, - "learning_rate": 3.2418510007260012e-06, - "loss": 0.4428, - "mean_token_accuracy": 0.8566494002938271, - "num_tokens": 236351160.0, - "step": 196450 - }, - { - "entropy": 1.9011761844158173, - "epoch": 0.6090088107263173, - "grad_norm": 9.281325340270996, - "learning_rate": 3.241768492613486e-06, - "loss": 0.4825, - "mean_token_accuracy": 0.8505636110901833, - "num_tokens": 236363054.0, - "step": 196460 - }, - { - "entropy": 1.8195012554526329, - "epoch": 0.609039809851367, - "grad_norm": 8.800650596618652, - "learning_rate": 3.241685990800375e-06, - "loss": 0.3941, - "mean_token_accuracy": 0.8715409860014915, - "num_tokens": 236375258.0, - "step": 196470 - }, - { - "entropy": 1.9295989215373992, - "epoch": 0.6090708089764166, - "grad_norm": 4.64478063583374, - "learning_rate": 3.2416034952858657e-06, - "loss": 0.4684, - "mean_token_accuracy": 0.8563613042235374, - "num_tokens": 236386829.0, - "step": 196480 - }, - { - "entropy": 1.882970041036606, - "epoch": 0.6091018081014663, - "grad_norm": 8.149700164794922, - "learning_rate": 3.2415210060691556e-06, - "loss": 0.4319, - "mean_token_accuracy": 0.8507476598024368, - "num_tokens": 236398588.0, - "step": 196490 - }, - { - "entropy": 1.8938290104269981, - "epoch": 0.6091328072265161, - "grad_norm": 7.915431022644043, - "learning_rate": 3.241438523149444e-06, - "loss": 0.4216, - "mean_token_accuracy": 0.866845078766346, - "num_tokens": 236409808.0, - "step": 196500 - }, - { - "entropy": 1.9040289625525475, - "epoch": 0.6091638063515658, - "grad_norm": 8.756927490234375, - "learning_rate": 3.241356046525931e-06, - "loss": 0.467, - "mean_token_accuracy": 0.8510834023356437, - "num_tokens": 236421042.0, - "step": 196510 - }, - { - "entropy": 1.863030880689621, - "epoch": 0.6091948054766154, - "grad_norm": 8.007649421691895, - "learning_rate": 3.241273576197815e-06, - "loss": 0.4034, - "mean_token_accuracy": 0.8647183448076248, - "num_tokens": 236432441.0, - "step": 196520 - }, - { - "entropy": 1.7401383429765702, - "epoch": 0.6092258046016651, - "grad_norm": 7.5847272872924805, - "learning_rate": 3.241191112164295e-06, - "loss": 0.3899, - "mean_token_accuracy": 0.8612486839294433, - "num_tokens": 236446359.0, - "step": 196530 - }, - { - "entropy": 1.8843446135520936, - "epoch": 0.6092568037267149, - "grad_norm": 7.633392810821533, - "learning_rate": 3.2411086544245702e-06, - "loss": 0.471, - "mean_token_accuracy": 0.8475699871778488, - "num_tokens": 236457568.0, - "step": 196540 - }, - { - "entropy": 1.9031406715512276, - "epoch": 0.6092878028517645, - "grad_norm": 7.588416576385498, - "learning_rate": 3.24102620297784e-06, - "loss": 0.459, - "mean_token_accuracy": 0.8562910869717598, - "num_tokens": 236469337.0, - "step": 196550 - }, - { - "entropy": 1.854507052898407, - "epoch": 0.6093188019768142, - "grad_norm": 10.432723999023438, - "learning_rate": 3.2409437578233037e-06, - "loss": 0.4633, - "mean_token_accuracy": 0.8570180460810661, - "num_tokens": 236481076.0, - "step": 196560 - }, - { - "entropy": 1.8487398087978364, - "epoch": 0.6093498011018639, - "grad_norm": 8.431697845458984, - "learning_rate": 3.2408613189601625e-06, - "loss": 0.4247, - "mean_token_accuracy": 0.8562931612133979, - "num_tokens": 236493481.0, - "step": 196570 - }, - { - "entropy": 1.7827741295099258, - "epoch": 0.6093808002269135, - "grad_norm": 7.114185810089111, - "learning_rate": 3.2407788863876144e-06, - "loss": 0.4015, - "mean_token_accuracy": 0.863072381913662, - "num_tokens": 236506265.0, - "step": 196580 - }, - { - "entropy": 1.8275396153330803, - "epoch": 0.6094117993519633, - "grad_norm": 8.118542671203613, - "learning_rate": 3.2406964601048606e-06, - "loss": 0.4293, - "mean_token_accuracy": 0.8519252866506577, - "num_tokens": 236518948.0, - "step": 196590 - }, - { - "entropy": 1.8712569296360015, - "epoch": 0.609442798477013, - "grad_norm": 7.7484660148620605, - "learning_rate": 3.2406140401111015e-06, - "loss": 0.4241, - "mean_token_accuracy": 0.8604287385940552, - "num_tokens": 236530875.0, - "step": 196600 - }, - { - "entropy": 1.8790629103779792, - "epoch": 0.6094737976020627, - "grad_norm": 7.387578010559082, - "learning_rate": 3.240531626405536e-06, - "loss": 0.415, - "mean_token_accuracy": 0.8621467888355255, - "num_tokens": 236542685.0, - "step": 196610 - }, - { - "entropy": 1.8917624711990357, - "epoch": 0.6095047967271123, - "grad_norm": 8.596790313720703, - "learning_rate": 3.240449218987366e-06, - "loss": 0.5302, - "mean_token_accuracy": 0.8511245831847191, - "num_tokens": 236554217.0, - "step": 196620 - }, - { - "entropy": 1.8684320464730262, - "epoch": 0.6095357958521621, - "grad_norm": 4.486491680145264, - "learning_rate": 3.240366817855792e-06, - "loss": 0.439, - "mean_token_accuracy": 0.8597828835248947, - "num_tokens": 236565677.0, - "step": 196630 - }, - { - "entropy": 1.8999014109373094, - "epoch": 0.6095667949772118, - "grad_norm": 8.079882621765137, - "learning_rate": 3.240284423010013e-06, - "loss": 0.4231, - "mean_token_accuracy": 0.864780393242836, - "num_tokens": 236576691.0, - "step": 196640 - }, - { - "entropy": 1.845630045235157, - "epoch": 0.6095977941022614, - "grad_norm": 7.2956109046936035, - "learning_rate": 3.240202034449232e-06, - "loss": 0.42, - "mean_token_accuracy": 0.8588384404778481, - "num_tokens": 236588267.0, - "step": 196650 - }, - { - "entropy": 1.8553462713956832, - "epoch": 0.6096287932273111, - "grad_norm": 9.026920318603516, - "learning_rate": 3.2401196521726493e-06, - "loss": 0.4567, - "mean_token_accuracy": 0.8575291231274604, - "num_tokens": 236600044.0, - "step": 196660 - }, - { - "entropy": 1.738269330561161, - "epoch": 0.6096597923523609, - "grad_norm": 4.796543121337891, - "learning_rate": 3.2400372761794647e-06, - "loss": 0.3288, - "mean_token_accuracy": 0.8745391935110092, - "num_tokens": 236613490.0, - "step": 196670 - }, - { - "entropy": 1.7987927034497262, - "epoch": 0.6096907914774106, - "grad_norm": 8.865738868713379, - "learning_rate": 3.239954906468882e-06, - "loss": 0.4463, - "mean_token_accuracy": 0.8437414780259133, - "num_tokens": 236626711.0, - "step": 196680 - }, - { - "entropy": 1.8533886075019836, - "epoch": 0.6097217906024602, - "grad_norm": 8.215073585510254, - "learning_rate": 3.239872543040101e-06, - "loss": 0.4249, - "mean_token_accuracy": 0.8536760002374649, - "num_tokens": 236638229.0, - "step": 196690 - }, - { - "entropy": 1.7952544182538985, - "epoch": 0.6097527897275099, - "grad_norm": 9.725830078125, - "learning_rate": 3.239790185892323e-06, - "loss": 0.4011, - "mean_token_accuracy": 0.8603607341647148, - "num_tokens": 236650934.0, - "step": 196700 - }, - { - "entropy": 1.7925215408205986, - "epoch": 0.6097837888525597, - "grad_norm": 3.792999744415283, - "learning_rate": 3.2397078350247505e-06, - "loss": 0.401, - "mean_token_accuracy": 0.8590068429708481, - "num_tokens": 236663905.0, - "step": 196710 - }, - { - "entropy": 1.8281165212392807, - "epoch": 0.6098147879776094, - "grad_norm": 7.636138439178467, - "learning_rate": 3.239625490436586e-06, - "loss": 0.4295, - "mean_token_accuracy": 0.863307175040245, - "num_tokens": 236675769.0, - "step": 196720 - }, - { - "entropy": 1.8852648198604585, - "epoch": 0.609845787102659, - "grad_norm": 6.831247329711914, - "learning_rate": 3.239543152127031e-06, - "loss": 0.4309, - "mean_token_accuracy": 0.8598569333553314, - "num_tokens": 236686930.0, - "step": 196730 - }, - { - "entropy": 1.71738261282444, - "epoch": 0.6098767862277087, - "grad_norm": 3.711994171142578, - "learning_rate": 3.2394608200952864e-06, - "loss": 0.3729, - "mean_token_accuracy": 0.8827052310109138, - "num_tokens": 236700123.0, - "step": 196740 - }, - { - "entropy": 1.794165775924921, - "epoch": 0.6099077853527585, - "grad_norm": 7.85221004486084, - "learning_rate": 3.239378494340556e-06, - "loss": 0.4, - "mean_token_accuracy": 0.8599389553070068, - "num_tokens": 236713096.0, - "step": 196750 - }, - { - "entropy": 1.8303263157606124, - "epoch": 0.6099387844778081, - "grad_norm": 4.312600612640381, - "learning_rate": 3.239296174862041e-06, - "loss": 0.4697, - "mean_token_accuracy": 0.8508070096373558, - "num_tokens": 236725536.0, - "step": 196760 - }, - { - "entropy": 1.85920792222023, - "epoch": 0.6099697836028578, - "grad_norm": 7.289279937744141, - "learning_rate": 3.239213861658946e-06, - "loss": 0.4479, - "mean_token_accuracy": 0.8590033814311028, - "num_tokens": 236736840.0, - "step": 196770 - }, - { - "entropy": 1.8310221433639526, - "epoch": 0.6100007827279075, - "grad_norm": 7.4863715171813965, - "learning_rate": 3.2391315547304714e-06, - "loss": 0.4155, - "mean_token_accuracy": 0.8629860013723374, - "num_tokens": 236749212.0, - "step": 196780 - }, - { - "entropy": 1.8335285529494285, - "epoch": 0.6100317818529573, - "grad_norm": 8.005071640014648, - "learning_rate": 3.2390492540758217e-06, - "loss": 0.4201, - "mean_token_accuracy": 0.8613960042595863, - "num_tokens": 236761681.0, - "step": 196790 - }, - { - "entropy": 1.7937164142727853, - "epoch": 0.6100627809780069, - "grad_norm": 7.301197052001953, - "learning_rate": 3.2389669596941985e-06, - "loss": 0.4242, - "mean_token_accuracy": 0.8637640863656998, - "num_tokens": 236774319.0, - "step": 196800 - }, - { - "entropy": 1.8970348417758942, - "epoch": 0.6100937801030566, - "grad_norm": 7.350532531738281, - "learning_rate": 3.2388846715848064e-06, - "loss": 0.4576, - "mean_token_accuracy": 0.848687818646431, - "num_tokens": 236785809.0, - "step": 196810 - }, - { - "entropy": 1.8532146289944649, - "epoch": 0.6101247792281063, - "grad_norm": 8.086607933044434, - "learning_rate": 3.2388023897468483e-06, - "loss": 0.4145, - "mean_token_accuracy": 0.8612830236554145, - "num_tokens": 236797749.0, - "step": 196820 - }, - { - "entropy": 1.8598564073443413, - "epoch": 0.6101557783531559, - "grad_norm": 3.8540594577789307, - "learning_rate": 3.2387201141795277e-06, - "loss": 0.4237, - "mean_token_accuracy": 0.8560948729515075, - "num_tokens": 236809315.0, - "step": 196830 - }, - { - "entropy": 1.693070538341999, - "epoch": 0.6101867774782057, - "grad_norm": 4.18641996383667, - "learning_rate": 3.238637844882047e-06, - "loss": 0.3489, - "mean_token_accuracy": 0.8701474830508232, - "num_tokens": 236823125.0, - "step": 196840 - }, - { - "entropy": 1.8074431777000428, - "epoch": 0.6102177766032554, - "grad_norm": 7.329400539398193, - "learning_rate": 3.238555581853611e-06, - "loss": 0.4006, - "mean_token_accuracy": 0.8615547999739647, - "num_tokens": 236836016.0, - "step": 196850 - }, - { - "entropy": 1.8871548235416413, - "epoch": 0.610248775728305, - "grad_norm": 8.988987922668457, - "learning_rate": 3.238473325093423e-06, - "loss": 0.4659, - "mean_token_accuracy": 0.8461986109614372, - "num_tokens": 236847417.0, - "step": 196860 - }, - { - "entropy": 1.8518931522965432, - "epoch": 0.6102797748533547, - "grad_norm": 7.389161586761475, - "learning_rate": 3.2383910746006876e-06, - "loss": 0.4374, - "mean_token_accuracy": 0.8587850123643875, - "num_tokens": 236859125.0, - "step": 196870 - }, - { - "entropy": 1.8223053202033044, - "epoch": 0.6103107739784045, - "grad_norm": 7.211876392364502, - "learning_rate": 3.2383088303746092e-06, - "loss": 0.4053, - "mean_token_accuracy": 0.8618249401450158, - "num_tokens": 236871551.0, - "step": 196880 - }, - { - "entropy": 1.8100453361868858, - "epoch": 0.6103417731034542, - "grad_norm": 4.357117176055908, - "learning_rate": 3.2382265924143915e-06, - "loss": 0.4501, - "mean_token_accuracy": 0.8459898293018341, - "num_tokens": 236884756.0, - "step": 196890 - }, - { - "entropy": 1.8431350201368333, - "epoch": 0.6103727722285038, - "grad_norm": 6.173425674438477, - "learning_rate": 3.238144360719238e-06, - "loss": 0.4408, - "mean_token_accuracy": 0.8548444300889969, - "num_tokens": 236897054.0, - "step": 196900 - }, - { - "entropy": 1.8529955729842187, - "epoch": 0.6104037713535535, - "grad_norm": 8.83094596862793, - "learning_rate": 3.2380621352883545e-06, - "loss": 0.4685, - "mean_token_accuracy": 0.8517158061265946, - "num_tokens": 236908851.0, - "step": 196910 - }, - { - "entropy": 1.8931651636958122, - "epoch": 0.6104347704786033, - "grad_norm": 9.133150100708008, - "learning_rate": 3.237979916120946e-06, - "loss": 0.4787, - "mean_token_accuracy": 0.8503081545233726, - "num_tokens": 236920724.0, - "step": 196920 - }, - { - "entropy": 1.7971744000911714, - "epoch": 0.610465769603653, - "grad_norm": 7.09853982925415, - "learning_rate": 3.237897703216217e-06, - "loss": 0.3723, - "mean_token_accuracy": 0.8749179974198341, - "num_tokens": 236933466.0, - "step": 196930 - }, - { - "entropy": 1.956950694322586, - "epoch": 0.6104967687287026, - "grad_norm": 7.854861259460449, - "learning_rate": 3.237815496573371e-06, - "loss": 0.5144, - "mean_token_accuracy": 0.851188787817955, - "num_tokens": 236944729.0, - "step": 196940 - }, - { - "entropy": 1.8420781329274178, - "epoch": 0.6105277678537523, - "grad_norm": 4.328573226928711, - "learning_rate": 3.2377332961916154e-06, - "loss": 0.4509, - "mean_token_accuracy": 0.851442477107048, - "num_tokens": 236958154.0, - "step": 196950 - }, - { - "entropy": 1.9192476660013198, - "epoch": 0.6105587669788021, - "grad_norm": 7.871819019317627, - "learning_rate": 3.237651102070154e-06, - "loss": 0.4576, - "mean_token_accuracy": 0.8586204499006271, - "num_tokens": 236968917.0, - "step": 196960 - }, - { - "entropy": 1.7532826662063599, - "epoch": 0.6105897661038517, - "grad_norm": 8.188135147094727, - "learning_rate": 3.237568914208193e-06, - "loss": 0.374, - "mean_token_accuracy": 0.8674729630351067, - "num_tokens": 236982705.0, - "step": 196970 - }, - { - "entropy": 1.9599376797676087, - "epoch": 0.6106207652289014, - "grad_norm": 11.441996574401855, - "learning_rate": 3.2374867326049374e-06, - "loss": 0.474, - "mean_token_accuracy": 0.8513784274458885, - "num_tokens": 236993636.0, - "step": 196980 - }, - { - "entropy": 1.9321315556764602, - "epoch": 0.6106517643539511, - "grad_norm": 10.580621719360352, - "learning_rate": 3.2374045572595936e-06, - "loss": 0.4976, - "mean_token_accuracy": 0.8470071226358413, - "num_tokens": 237004534.0, - "step": 196990 - }, - { - "entropy": 1.8774404481053353, - "epoch": 0.6106827634790009, - "grad_norm": 8.446964263916016, - "learning_rate": 3.237322388171367e-06, - "loss": 0.4343, - "mean_token_accuracy": 0.859823040664196, - "num_tokens": 237016327.0, - "step": 197000 - }, - { - "entropy": 1.9051348567008972, - "epoch": 0.6107137626040505, - "grad_norm": 8.405506134033203, - "learning_rate": 3.2372402253394627e-06, - "loss": 0.4877, - "mean_token_accuracy": 0.8614391669631004, - "num_tokens": 237027383.0, - "step": 197010 - }, - { - "entropy": 1.8986682042479515, - "epoch": 0.6107447617291002, - "grad_norm": 7.353382110595703, - "learning_rate": 3.2371580687630882e-06, - "loss": 0.4324, - "mean_token_accuracy": 0.8544711455702781, - "num_tokens": 237039405.0, - "step": 197020 - }, - { - "entropy": 1.8387581080198288, - "epoch": 0.6107757608541499, - "grad_norm": 7.9818501472473145, - "learning_rate": 3.2370759184414486e-06, - "loss": 0.4099, - "mean_token_accuracy": 0.8615599378943444, - "num_tokens": 237052054.0, - "step": 197030 - }, - { - "entropy": 1.907764096558094, - "epoch": 0.6108067599791995, - "grad_norm": 7.708189010620117, - "learning_rate": 3.2369937743737518e-06, - "loss": 0.4402, - "mean_token_accuracy": 0.8530245617032051, - "num_tokens": 237064397.0, - "step": 197040 - }, - { - "entropy": 1.8867248311638831, - "epoch": 0.6108377591042493, - "grad_norm": 9.634085655212402, - "learning_rate": 3.2369116365592037e-06, - "loss": 0.473, - "mean_token_accuracy": 0.8468973502516747, - "num_tokens": 237076147.0, - "step": 197050 - }, - { - "entropy": 1.859901525080204, - "epoch": 0.610868758229299, - "grad_norm": 8.066091537475586, - "learning_rate": 3.2368295049970094e-06, - "loss": 0.4365, - "mean_token_accuracy": 0.8572079196572304, - "num_tokens": 237087769.0, - "step": 197060 - }, - { - "entropy": 1.878857983648777, - "epoch": 0.6108997573543486, - "grad_norm": 7.823647499084473, - "learning_rate": 3.2367473796863778e-06, - "loss": 0.4699, - "mean_token_accuracy": 0.8438057050108909, - "num_tokens": 237099723.0, - "step": 197070 - }, - { - "entropy": 1.8205513462424279, - "epoch": 0.6109307564793983, - "grad_norm": 3.872516393661499, - "learning_rate": 3.236665260626515e-06, - "loss": 0.3891, - "mean_token_accuracy": 0.8547244861721992, - "num_tokens": 237111838.0, - "step": 197080 - }, - { - "entropy": 1.8410150706768036, - "epoch": 0.6109617556044481, - "grad_norm": 9.855649948120117, - "learning_rate": 3.2365831478166287e-06, - "loss": 0.4648, - "mean_token_accuracy": 0.8547208964824676, - "num_tokens": 237124565.0, - "step": 197090 - }, - { - "entropy": 1.7792586773633956, - "epoch": 0.6109927547294978, - "grad_norm": 9.24644947052002, - "learning_rate": 3.236501041255925e-06, - "loss": 0.3683, - "mean_token_accuracy": 0.8620190620422363, - "num_tokens": 237138066.0, - "step": 197100 - }, - { - "entropy": 1.893431168794632, - "epoch": 0.6110237538545474, - "grad_norm": 8.146915435791016, - "learning_rate": 3.2364189409436118e-06, - "loss": 0.4575, - "mean_token_accuracy": 0.8524102881550789, - "num_tokens": 237149822.0, - "step": 197110 - }, - { - "entropy": 1.898918354511261, - "epoch": 0.6110547529795971, - "grad_norm": 3.6508231163024902, - "learning_rate": 3.2363368468788973e-06, - "loss": 0.4527, - "mean_token_accuracy": 0.8517585694789886, - "num_tokens": 237160803.0, - "step": 197120 - }, - { - "entropy": 1.8995780304074288, - "epoch": 0.6110857521046469, - "grad_norm": 7.793694496154785, - "learning_rate": 3.2362547590609886e-06, - "loss": 0.4895, - "mean_token_accuracy": 0.8504430979490281, - "num_tokens": 237172435.0, - "step": 197130 - }, - { - "entropy": 1.7506232902407646, - "epoch": 0.6111167512296966, - "grad_norm": 8.007512092590332, - "learning_rate": 3.236172677489094e-06, - "loss": 0.4034, - "mean_token_accuracy": 0.8630235210061074, - "num_tokens": 237186683.0, - "step": 197140 - }, - { - "entropy": 1.7544875517487526, - "epoch": 0.6111477503547462, - "grad_norm": 8.81264877319336, - "learning_rate": 3.23609060216242e-06, - "loss": 0.3796, - "mean_token_accuracy": 0.8727211073040962, - "num_tokens": 237200218.0, - "step": 197150 - }, - { - "entropy": 1.9269504621624947, - "epoch": 0.6111787494797959, - "grad_norm": 7.985466957092285, - "learning_rate": 3.2360085330801767e-06, - "loss": 0.421, - "mean_token_accuracy": 0.8687825843691825, - "num_tokens": 237211313.0, - "step": 197160 - }, - { - "entropy": 1.859518238902092, - "epoch": 0.6112097486048457, - "grad_norm": 5.540771961212158, - "learning_rate": 3.23592647024157e-06, - "loss": 0.4441, - "mean_token_accuracy": 0.853747871518135, - "num_tokens": 237222938.0, - "step": 197170 - }, - { - "entropy": 1.9155102893710136, - "epoch": 0.6112407477298953, - "grad_norm": 8.322548866271973, - "learning_rate": 3.2358444136458106e-06, - "loss": 0.4616, - "mean_token_accuracy": 0.8527912616729736, - "num_tokens": 237233724.0, - "step": 197180 - }, - { - "entropy": 1.9483972951769828, - "epoch": 0.611271746854945, - "grad_norm": 8.302480697631836, - "learning_rate": 3.235762363292106e-06, - "loss": 0.4694, - "mean_token_accuracy": 0.8494256645441055, - "num_tokens": 237245105.0, - "step": 197190 - }, - { - "entropy": 1.8738413840532302, - "epoch": 0.6113027459799947, - "grad_norm": 3.559133529663086, - "learning_rate": 3.2356803191796642e-06, - "loss": 0.4215, - "mean_token_accuracy": 0.855416576564312, - "num_tokens": 237257405.0, - "step": 197200 - }, - { - "entropy": 1.8959673658013343, - "epoch": 0.6113337451050445, - "grad_norm": 10.196348190307617, - "learning_rate": 3.2355982813076953e-06, - "loss": 0.4681, - "mean_token_accuracy": 0.8532440841197968, - "num_tokens": 237268409.0, - "step": 197210 - }, - { - "entropy": 1.8422715082764625, - "epoch": 0.6113647442300941, - "grad_norm": 5.707858562469482, - "learning_rate": 3.2355162496754073e-06, - "loss": 0.4789, - "mean_token_accuracy": 0.84599878937006, - "num_tokens": 237281020.0, - "step": 197220 - }, - { - "entropy": 1.8892938852310182, - "epoch": 0.6113957433551438, - "grad_norm": 3.6076462268829346, - "learning_rate": 3.235434224282009e-06, - "loss": 0.4153, - "mean_token_accuracy": 0.8560446351766586, - "num_tokens": 237293369.0, - "step": 197230 - }, - { - "entropy": 1.9170384138822556, - "epoch": 0.6114267424801935, - "grad_norm": 5.8077921867370605, - "learning_rate": 3.235352205126711e-06, - "loss": 0.4699, - "mean_token_accuracy": 0.8565402716398239, - "num_tokens": 237304689.0, - "step": 197240 - }, - { - "entropy": 1.9104067966341973, - "epoch": 0.6114577416052432, - "grad_norm": 9.619139671325684, - "learning_rate": 3.2352701922087217e-06, - "loss": 0.4633, - "mean_token_accuracy": 0.8569847285747528, - "num_tokens": 237316300.0, - "step": 197250 - }, - { - "entropy": 1.7930829659104348, - "epoch": 0.6114887407302929, - "grad_norm": 3.60917592048645, - "learning_rate": 3.23518818552725e-06, - "loss": 0.3691, - "mean_token_accuracy": 0.8651748016476631, - "num_tokens": 237329389.0, - "step": 197260 - }, - { - "entropy": 1.8553033851087093, - "epoch": 0.6115197398553426, - "grad_norm": 2.5513739585876465, - "learning_rate": 3.2351061850815062e-06, - "loss": 0.47, - "mean_token_accuracy": 0.8540530279278755, - "num_tokens": 237342015.0, - "step": 197270 - }, - { - "entropy": 1.8537073224782943, - "epoch": 0.6115507389803922, - "grad_norm": 7.609178066253662, - "learning_rate": 3.235024190870701e-06, - "loss": 0.4222, - "mean_token_accuracy": 0.8554342269897461, - "num_tokens": 237354500.0, - "step": 197280 - }, - { - "entropy": 1.8396665275096893, - "epoch": 0.6115817381054419, - "grad_norm": 9.246115684509277, - "learning_rate": 3.2349422028940426e-06, - "loss": 0.4347, - "mean_token_accuracy": 0.852047860622406, - "num_tokens": 237366515.0, - "step": 197290 - }, - { - "entropy": 1.8538162797689437, - "epoch": 0.6116127372304917, - "grad_norm": 7.036844730377197, - "learning_rate": 3.2348602211507424e-06, - "loss": 0.3995, - "mean_token_accuracy": 0.8756770327687263, - "num_tokens": 237378222.0, - "step": 197300 - }, - { - "entropy": 1.9473214149475098, - "epoch": 0.6116437363555414, - "grad_norm": 7.349457263946533, - "learning_rate": 3.23477824564001e-06, - "loss": 0.4921, - "mean_token_accuracy": 0.8548978239297866, - "num_tokens": 237389710.0, - "step": 197310 - }, - { - "entropy": 1.9008982643485068, - "epoch": 0.611674735480591, - "grad_norm": 8.243997573852539, - "learning_rate": 3.2346962763610556e-06, - "loss": 0.4524, - "mean_token_accuracy": 0.8519371166825295, - "num_tokens": 237401797.0, - "step": 197320 - }, - { - "entropy": 1.831042182445526, - "epoch": 0.6117057346056407, - "grad_norm": 8.572514533996582, - "learning_rate": 3.2346143133130897e-06, - "loss": 0.4071, - "mean_token_accuracy": 0.8584729507565498, - "num_tokens": 237414474.0, - "step": 197330 - }, - { - "entropy": 1.8398546814918517, - "epoch": 0.6117367337306905, - "grad_norm": 10.016891479492188, - "learning_rate": 3.234532356495323e-06, - "loss": 0.4172, - "mean_token_accuracy": 0.8620932117104531, - "num_tokens": 237426310.0, - "step": 197340 - }, - { - "entropy": 1.8987890288233757, - "epoch": 0.6117677328557402, - "grad_norm": 4.706457614898682, - "learning_rate": 3.234450405906967e-06, - "loss": 0.4558, - "mean_token_accuracy": 0.8507528349757194, - "num_tokens": 237437770.0, - "step": 197350 - }, - { - "entropy": 1.851937872171402, - "epoch": 0.6117987319807898, - "grad_norm": 7.817611217498779, - "learning_rate": 3.2343684615472313e-06, - "loss": 0.3843, - "mean_token_accuracy": 0.8699416399002076, - "num_tokens": 237449562.0, - "step": 197360 - }, - { - "entropy": 1.8670279935002327, - "epoch": 0.6118297311058395, - "grad_norm": 7.869930744171143, - "learning_rate": 3.2342865234153282e-06, - "loss": 0.4245, - "mean_token_accuracy": 0.8561404958367348, - "num_tokens": 237461931.0, - "step": 197370 - }, - { - "entropy": 1.8751569628715514, - "epoch": 0.6118607302308893, - "grad_norm": 8.188568115234375, - "learning_rate": 3.2342045915104675e-06, - "loss": 0.461, - "mean_token_accuracy": 0.8529067426919937, - "num_tokens": 237473861.0, - "step": 197380 - }, - { - "entropy": 1.8241604372859002, - "epoch": 0.6118917293559389, - "grad_norm": 3.8533084392547607, - "learning_rate": 3.2341226658318614e-06, - "loss": 0.3897, - "mean_token_accuracy": 0.8676508396863938, - "num_tokens": 237486393.0, - "step": 197390 - }, - { - "entropy": 1.9073212459683417, - "epoch": 0.6119227284809886, - "grad_norm": 7.623655796051025, - "learning_rate": 3.234040746378722e-06, - "loss": 0.4447, - "mean_token_accuracy": 0.8545937642455101, - "num_tokens": 237497546.0, - "step": 197400 - }, - { - "entropy": 1.8588381856679916, - "epoch": 0.6119537276060383, - "grad_norm": 7.781834602355957, - "learning_rate": 3.2339588331502598e-06, - "loss": 0.4383, - "mean_token_accuracy": 0.8564682066440582, - "num_tokens": 237510315.0, - "step": 197410 - }, - { - "entropy": 1.8095207661390305, - "epoch": 0.6119847267310881, - "grad_norm": 4.215339660644531, - "learning_rate": 3.233876926145686e-06, - "loss": 0.3784, - "mean_token_accuracy": 0.8653566733002662, - "num_tokens": 237522650.0, - "step": 197420 - }, - { - "entropy": 1.8454365268349648, - "epoch": 0.6120157258561377, - "grad_norm": 6.672299385070801, - "learning_rate": 3.2337950253642135e-06, - "loss": 0.4404, - "mean_token_accuracy": 0.8600479036569595, - "num_tokens": 237534798.0, - "step": 197430 - }, - { - "entropy": 1.7626978799700737, - "epoch": 0.6120467249811874, - "grad_norm": 9.613987922668457, - "learning_rate": 3.2337131308050545e-06, - "loss": 0.3694, - "mean_token_accuracy": 0.8646810084581376, - "num_tokens": 237548710.0, - "step": 197440 - }, - { - "entropy": 1.9045309767127037, - "epoch": 0.6120777241062371, - "grad_norm": 8.034784317016602, - "learning_rate": 3.233631242467421e-06, - "loss": 0.4571, - "mean_token_accuracy": 0.8537096366286278, - "num_tokens": 237560491.0, - "step": 197450 - }, - { - "entropy": 1.75089842826128, - "epoch": 0.6121087232312868, - "grad_norm": 8.159855842590332, - "learning_rate": 3.2335493603505246e-06, - "loss": 0.3873, - "mean_token_accuracy": 0.869796434044838, - "num_tokens": 237573666.0, - "step": 197460 - }, - { - "entropy": 1.8842484802007675, - "epoch": 0.6121397223563365, - "grad_norm": 7.651088237762451, - "learning_rate": 3.2334674844535783e-06, - "loss": 0.4253, - "mean_token_accuracy": 0.8594219341874123, - "num_tokens": 237585141.0, - "step": 197470 - }, - { - "entropy": 1.8589984819293022, - "epoch": 0.6121707214813862, - "grad_norm": 3.8243820667266846, - "learning_rate": 3.233385614775794e-06, - "loss": 0.3806, - "mean_token_accuracy": 0.8726780936121941, - "num_tokens": 237596837.0, - "step": 197480 - }, - { - "entropy": 1.7896558150649071, - "epoch": 0.6122017206064359, - "grad_norm": 10.02405834197998, - "learning_rate": 3.2333037513163856e-06, - "loss": 0.3866, - "mean_token_accuracy": 0.8653994277119637, - "num_tokens": 237610276.0, - "step": 197490 - }, - { - "entropy": 1.7988707900047303, - "epoch": 0.6122327197314856, - "grad_norm": 7.22122049331665, - "learning_rate": 3.233221894074566e-06, - "loss": 0.3847, - "mean_token_accuracy": 0.863201479613781, - "num_tokens": 237623129.0, - "step": 197500 - }, - { - "entropy": 1.8126246452331543, - "epoch": 0.6122637188565353, - "grad_norm": 8.908790588378906, - "learning_rate": 3.233140043049547e-06, - "loss": 0.3593, - "mean_token_accuracy": 0.8658370718359947, - "num_tokens": 237635448.0, - "step": 197510 - }, - { - "entropy": 1.8080999478697777, - "epoch": 0.612294717981585, - "grad_norm": 2.396630048751831, - "learning_rate": 3.233058198240541e-06, - "loss": 0.3758, - "mean_token_accuracy": 0.8676091253757476, - "num_tokens": 237648272.0, - "step": 197520 - }, - { - "entropy": 1.7976605847477913, - "epoch": 0.6123257171066346, - "grad_norm": 9.25484561920166, - "learning_rate": 3.232976359646764e-06, - "loss": 0.35, - "mean_token_accuracy": 0.8769566550850868, - "num_tokens": 237660724.0, - "step": 197530 - }, - { - "entropy": 1.9517788350582124, - "epoch": 0.6123567162316843, - "grad_norm": 8.63337230682373, - "learning_rate": 3.232894527267427e-06, - "loss": 0.5639, - "mean_token_accuracy": 0.8373573496937752, - "num_tokens": 237672350.0, - "step": 197540 - }, - { - "entropy": 1.838518961519003, - "epoch": 0.6123877153567341, - "grad_norm": 8.386921882629395, - "learning_rate": 3.232812701101745e-06, - "loss": 0.4273, - "mean_token_accuracy": 0.8607329323887825, - "num_tokens": 237684833.0, - "step": 197550 - }, - { - "entropy": 1.9415549844503404, - "epoch": 0.6124187144817838, - "grad_norm": 7.513718605041504, - "learning_rate": 3.2327308811489318e-06, - "loss": 0.4443, - "mean_token_accuracy": 0.8599239453673363, - "num_tokens": 237695928.0, - "step": 197560 - }, - { - "entropy": 1.8711989670991898, - "epoch": 0.6124497136068334, - "grad_norm": 2.5814082622528076, - "learning_rate": 3.2326490674081996e-06, - "loss": 0.4596, - "mean_token_accuracy": 0.8575176253914834, - "num_tokens": 237708081.0, - "step": 197570 - }, - { - "entropy": 1.9002268463373184, - "epoch": 0.6124807127318831, - "grad_norm": 7.871204853057861, - "learning_rate": 3.2325672598787633e-06, - "loss": 0.4741, - "mean_token_accuracy": 0.8506250888109207, - "num_tokens": 237719581.0, - "step": 197580 - }, - { - "entropy": 1.858892673254013, - "epoch": 0.6125117118569329, - "grad_norm": 4.453184127807617, - "learning_rate": 3.232485458559837e-06, - "loss": 0.4198, - "mean_token_accuracy": 0.8605369016528129, - "num_tokens": 237731116.0, - "step": 197590 - }, - { - "entropy": 1.8251206517219543, - "epoch": 0.6125427109819825, - "grad_norm": 7.465099334716797, - "learning_rate": 3.232403663450635e-06, - "loss": 0.4134, - "mean_token_accuracy": 0.8610186651349068, - "num_tokens": 237743524.0, - "step": 197600 - }, - { - "entropy": 1.8249526843428612, - "epoch": 0.6125737101070322, - "grad_norm": 4.715644836425781, - "learning_rate": 3.2323218745503727e-06, - "loss": 0.4143, - "mean_token_accuracy": 0.8602005869150162, - "num_tokens": 237755768.0, - "step": 197610 - }, - { - "entropy": 1.9152856677770616, - "epoch": 0.6126047092320819, - "grad_norm": 3.7345130443573, - "learning_rate": 3.232240091858262e-06, - "loss": 0.4123, - "mean_token_accuracy": 0.8612644776701928, - "num_tokens": 237767214.0, - "step": 197620 - }, - { - "entropy": 1.812970919162035, - "epoch": 0.6126357083571317, - "grad_norm": 11.833577156066895, - "learning_rate": 3.23215831537352e-06, - "loss": 0.3969, - "mean_token_accuracy": 0.8618756428360939, - "num_tokens": 237780541.0, - "step": 197630 - }, - { - "entropy": 1.8648294284939766, - "epoch": 0.6126667074821813, - "grad_norm": 3.68322491645813, - "learning_rate": 3.2320765450953605e-06, - "loss": 0.4, - "mean_token_accuracy": 0.8567120462656022, - "num_tokens": 237793175.0, - "step": 197640 - }, - { - "entropy": 1.8529816403985024, - "epoch": 0.612697706607231, - "grad_norm": 7.8077592849731445, - "learning_rate": 3.2319947810229992e-06, - "loss": 0.3818, - "mean_token_accuracy": 0.866400220990181, - "num_tokens": 237805646.0, - "step": 197650 - }, - { - "entropy": 1.9075462460517882, - "epoch": 0.6127287057322807, - "grad_norm": 8.350885391235352, - "learning_rate": 3.23191302315565e-06, - "loss": 0.4491, - "mean_token_accuracy": 0.8558677047491073, - "num_tokens": 237817020.0, - "step": 197660 - }, - { - "entropy": 1.8251194074749946, - "epoch": 0.6127597048573304, - "grad_norm": 8.18161678314209, - "learning_rate": 3.2318312714925286e-06, - "loss": 0.4597, - "mean_token_accuracy": 0.8488647878170014, - "num_tokens": 237830190.0, - "step": 197670 - }, - { - "entropy": 1.910027140378952, - "epoch": 0.6127907039823801, - "grad_norm": 8.298951148986816, - "learning_rate": 3.2317495260328507e-06, - "loss": 0.4329, - "mean_token_accuracy": 0.8613663628697396, - "num_tokens": 237841186.0, - "step": 197680 - }, - { - "entropy": 1.8520949259400368, - "epoch": 0.6128217031074298, - "grad_norm": 7.493919372558594, - "learning_rate": 3.231667786775831e-06, - "loss": 0.417, - "mean_token_accuracy": 0.864731340110302, - "num_tokens": 237853366.0, - "step": 197690 - }, - { - "entropy": 1.8635297536849975, - "epoch": 0.6128527022324795, - "grad_norm": 10.218779563903809, - "learning_rate": 3.231586053720686e-06, - "loss": 0.4244, - "mean_token_accuracy": 0.8612703114748002, - "num_tokens": 237865400.0, - "step": 197700 - }, - { - "entropy": 1.8062796369194984, - "epoch": 0.6128837013575292, - "grad_norm": 3.485726833343506, - "learning_rate": 3.2315043268666313e-06, - "loss": 0.4039, - "mean_token_accuracy": 0.8678748309612274, - "num_tokens": 237877764.0, - "step": 197710 - }, - { - "entropy": 1.8527929231524467, - "epoch": 0.6129147004825789, - "grad_norm": 7.560081958770752, - "learning_rate": 3.231422606212883e-06, - "loss": 0.4175, - "mean_token_accuracy": 0.8541894242167473, - "num_tokens": 237889106.0, - "step": 197720 - }, - { - "entropy": 1.8421387121081352, - "epoch": 0.6129456996076286, - "grad_norm": 7.515761852264404, - "learning_rate": 3.2313408917586557e-06, - "loss": 0.3825, - "mean_token_accuracy": 0.8652972161769867, - "num_tokens": 237901508.0, - "step": 197730 - }, - { - "entropy": 1.8394855439662934, - "epoch": 0.6129766987326782, - "grad_norm": 4.9240546226501465, - "learning_rate": 3.231259183503168e-06, - "loss": 0.5128, - "mean_token_accuracy": 0.8508085206151008, - "num_tokens": 237913802.0, - "step": 197740 - }, - { - "entropy": 1.884807887673378, - "epoch": 0.613007697857728, - "grad_norm": 8.731439590454102, - "learning_rate": 3.231177481445634e-06, - "loss": 0.417, - "mean_token_accuracy": 0.8633570462465286, - "num_tokens": 237924909.0, - "step": 197750 - }, - { - "entropy": 1.8097985833883286, - "epoch": 0.6130386969827777, - "grad_norm": 8.296426773071289, - "learning_rate": 3.2310957855852715e-06, - "loss": 0.3985, - "mean_token_accuracy": 0.862330450117588, - "num_tokens": 237937637.0, - "step": 197760 - }, - { - "entropy": 1.8103355050086976, - "epoch": 0.6130696961078274, - "grad_norm": 4.088588714599609, - "learning_rate": 3.231014095921296e-06, - "loss": 0.387, - "mean_token_accuracy": 0.8579976573586464, - "num_tokens": 237950579.0, - "step": 197770 - }, - { - "entropy": 1.8344371646642685, - "epoch": 0.613100695232877, - "grad_norm": 7.0518388748168945, - "learning_rate": 3.2309324124529264e-06, - "loss": 0.4239, - "mean_token_accuracy": 0.8459786728024483, - "num_tokens": 237963468.0, - "step": 197780 - }, - { - "entropy": 1.8165979743003846, - "epoch": 0.6131316943579267, - "grad_norm": 8.043184280395508, - "learning_rate": 3.230850735179376e-06, - "loss": 0.377, - "mean_token_accuracy": 0.8663666322827339, - "num_tokens": 237976313.0, - "step": 197790 - }, - { - "entropy": 1.81719990670681, - "epoch": 0.6131626934829765, - "grad_norm": 6.555117130279541, - "learning_rate": 3.2307690640998657e-06, - "loss": 0.3561, - "mean_token_accuracy": 0.8760808929800987, - "num_tokens": 237989127.0, - "step": 197800 - }, - { - "entropy": 1.8462613999843598, - "epoch": 0.6131936926080261, - "grad_norm": 7.677376747131348, - "learning_rate": 3.2306873992136107e-06, - "loss": 0.3921, - "mean_token_accuracy": 0.8611488565802574, - "num_tokens": 238001168.0, - "step": 197810 - }, - { - "entropy": 1.9142689868807792, - "epoch": 0.6132246917330758, - "grad_norm": 6.646963119506836, - "learning_rate": 3.2306057405198276e-06, - "loss": 0.4428, - "mean_token_accuracy": 0.8539297908544541, - "num_tokens": 238012767.0, - "step": 197820 - }, - { - "entropy": 1.905218243598938, - "epoch": 0.6132556908581255, - "grad_norm": 8.068191528320312, - "learning_rate": 3.230524088017735e-06, - "loss": 0.4308, - "mean_token_accuracy": 0.8539548605680466, - "num_tokens": 238024831.0, - "step": 197830 - }, - { - "entropy": 1.8698325335979462, - "epoch": 0.6132866899831753, - "grad_norm": 7.347733020782471, - "learning_rate": 3.2304424417065505e-06, - "loss": 0.4083, - "mean_token_accuracy": 0.8575117960572243, - "num_tokens": 238036523.0, - "step": 197840 - }, - { - "entropy": 1.824147316813469, - "epoch": 0.6133176891082249, - "grad_norm": 9.090344429016113, - "learning_rate": 3.2303608015854916e-06, - "loss": 0.4235, - "mean_token_accuracy": 0.8511833533644676, - "num_tokens": 238049547.0, - "step": 197850 - }, - { - "entropy": 1.851954886317253, - "epoch": 0.6133486882332746, - "grad_norm": 8.417724609375, - "learning_rate": 3.2302791676537758e-06, - "loss": 0.3987, - "mean_token_accuracy": 0.8720298394560814, - "num_tokens": 238061858.0, - "step": 197860 - }, - { - "entropy": 1.7978620529174805, - "epoch": 0.6133796873583243, - "grad_norm": 4.008333206176758, - "learning_rate": 3.2301975399106215e-06, - "loss": 0.3852, - "mean_token_accuracy": 0.8638000205159188, - "num_tokens": 238075560.0, - "step": 197870 - }, - { - "entropy": 1.8916511505842208, - "epoch": 0.613410686483374, - "grad_norm": 7.247588634490967, - "learning_rate": 3.2301159183552466e-06, - "loss": 0.4714, - "mean_token_accuracy": 0.8495002686977386, - "num_tokens": 238086563.0, - "step": 197880 - }, - { - "entropy": 1.8061557114124298, - "epoch": 0.6134416856084237, - "grad_norm": 3.697267770767212, - "learning_rate": 3.2300343029868697e-06, - "loss": 0.4692, - "mean_token_accuracy": 0.8531102150678634, - "num_tokens": 238099873.0, - "step": 197890 - }, - { - "entropy": 1.9366677463054658, - "epoch": 0.6134726847334734, - "grad_norm": 8.543197631835938, - "learning_rate": 3.229952693804708e-06, - "loss": 0.4636, - "mean_token_accuracy": 0.8499609500169754, - "num_tokens": 238111212.0, - "step": 197900 - }, - { - "entropy": 1.9563579097390176, - "epoch": 0.613503683858523, - "grad_norm": 9.433308601379395, - "learning_rate": 3.2298710908079823e-06, - "loss": 0.4932, - "mean_token_accuracy": 0.8431360751390458, - "num_tokens": 238122557.0, - "step": 197910 - }, - { - "entropy": 1.815135808289051, - "epoch": 0.6135346829835728, - "grad_norm": 7.3137969970703125, - "learning_rate": 3.2297894939959094e-06, - "loss": 0.404, - "mean_token_accuracy": 0.8702261924743653, - "num_tokens": 238135193.0, - "step": 197920 - }, - { - "entropy": 1.8503555253148078, - "epoch": 0.6135656821086225, - "grad_norm": 4.021295070648193, - "learning_rate": 3.2297079033677094e-06, - "loss": 0.4389, - "mean_token_accuracy": 0.8535087972879409, - "num_tokens": 238147844.0, - "step": 197930 - }, - { - "entropy": 1.8699439376592637, - "epoch": 0.6135966812336722, - "grad_norm": 8.971370697021484, - "learning_rate": 3.2296263189225992e-06, - "loss": 0.4331, - "mean_token_accuracy": 0.8574245125055313, - "num_tokens": 238159624.0, - "step": 197940 - }, - { - "entropy": 1.837162759900093, - "epoch": 0.6136276803587218, - "grad_norm": 4.629751682281494, - "learning_rate": 3.2295447406598004e-06, - "loss": 0.4149, - "mean_token_accuracy": 0.8483832642436028, - "num_tokens": 238172709.0, - "step": 197950 - }, - { - "entropy": 1.7987508580088616, - "epoch": 0.6136586794837716, - "grad_norm": 8.438607215881348, - "learning_rate": 3.22946316857853e-06, - "loss": 0.3552, - "mean_token_accuracy": 0.8662834882736206, - "num_tokens": 238185176.0, - "step": 197960 - }, - { - "entropy": 1.833447128534317, - "epoch": 0.6136896786088213, - "grad_norm": 4.577413558959961, - "learning_rate": 3.22938160267801e-06, - "loss": 0.423, - "mean_token_accuracy": 0.8562129974365235, - "num_tokens": 238197981.0, - "step": 197970 - }, - { - "entropy": 1.8880952566862106, - "epoch": 0.613720677733871, - "grad_norm": 8.018782615661621, - "learning_rate": 3.2293000429574573e-06, - "loss": 0.4072, - "mean_token_accuracy": 0.8598210200667381, - "num_tokens": 238209028.0, - "step": 197980 - }, - { - "entropy": 1.797704565525055, - "epoch": 0.6137516768589206, - "grad_norm": 4.476016998291016, - "learning_rate": 3.2292184894160926e-06, - "loss": 0.3497, - "mean_token_accuracy": 0.8691965594887734, - "num_tokens": 238222150.0, - "step": 197990 - }, - { - "entropy": 1.7164287373423577, - "epoch": 0.6137826759839704, - "grad_norm": 8.452033996582031, - "learning_rate": 3.229136942053136e-06, - "loss": 0.3452, - "mean_token_accuracy": 0.8812255963683129, - "num_tokens": 238235915.0, - "step": 198000 - }, - { - "entropy": 1.8963288113474845, - "epoch": 0.6138136751090201, - "grad_norm": 8.194510459899902, - "learning_rate": 3.2290554008678066e-06, - "loss": 0.4848, - "mean_token_accuracy": 0.846800397336483, - "num_tokens": 238248130.0, - "step": 198010 - }, - { - "entropy": 1.9342078655958175, - "epoch": 0.6138446742340697, - "grad_norm": 8.6097993850708, - "learning_rate": 3.2289738658593254e-06, - "loss": 0.492, - "mean_token_accuracy": 0.8513077095150947, - "num_tokens": 238259188.0, - "step": 198020 - }, - { - "entropy": 1.7547577977180482, - "epoch": 0.6138756733591194, - "grad_norm": 6.928021430969238, - "learning_rate": 3.2288923370269125e-06, - "loss": 0.3452, - "mean_token_accuracy": 0.8803062126040458, - "num_tokens": 238272312.0, - "step": 198030 - }, - { - "entropy": 1.892691618204117, - "epoch": 0.6139066724841691, - "grad_norm": 8.509113311767578, - "learning_rate": 3.228810814369788e-06, - "loss": 0.4369, - "mean_token_accuracy": 0.8577762544155121, - "num_tokens": 238284154.0, - "step": 198040 - }, - { - "entropy": 1.9040351197123528, - "epoch": 0.6139376716092189, - "grad_norm": 9.362594604492188, - "learning_rate": 3.2287292978871717e-06, - "loss": 0.4842, - "mean_token_accuracy": 0.845809280872345, - "num_tokens": 238295201.0, - "step": 198050 - }, - { - "entropy": 1.9087980896234513, - "epoch": 0.6139686707342685, - "grad_norm": 8.504251480102539, - "learning_rate": 3.2286477875782848e-06, - "loss": 0.4468, - "mean_token_accuracy": 0.8549018561840057, - "num_tokens": 238306358.0, - "step": 198060 - }, - { - "entropy": 1.8494928494095801, - "epoch": 0.6139996698593182, - "grad_norm": 8.494181632995605, - "learning_rate": 3.2285662834423486e-06, - "loss": 0.4133, - "mean_token_accuracy": 0.853402042388916, - "num_tokens": 238319160.0, - "step": 198070 - }, - { - "entropy": 1.9097889050841332, - "epoch": 0.6140306689843679, - "grad_norm": 9.164204597473145, - "learning_rate": 3.228484785478583e-06, - "loss": 0.5, - "mean_token_accuracy": 0.847290362417698, - "num_tokens": 238330705.0, - "step": 198080 - }, - { - "entropy": 1.8471282333135606, - "epoch": 0.6140616681094176, - "grad_norm": 9.91257381439209, - "learning_rate": 3.2284032936862096e-06, - "loss": 0.4111, - "mean_token_accuracy": 0.8610455006361007, - "num_tokens": 238342634.0, - "step": 198090 - }, - { - "entropy": 1.888751471042633, - "epoch": 0.6140926672344673, - "grad_norm": 7.841835975646973, - "learning_rate": 3.2283218080644495e-06, - "loss": 0.4841, - "mean_token_accuracy": 0.8621149614453316, - "num_tokens": 238354440.0, - "step": 198100 - }, - { - "entropy": 1.874174064397812, - "epoch": 0.614123666359517, - "grad_norm": 3.7390925884246826, - "learning_rate": 3.2282403286125243e-06, - "loss": 0.4404, - "mean_token_accuracy": 0.8606786787509918, - "num_tokens": 238365755.0, - "step": 198110 - }, - { - "entropy": 1.933007836341858, - "epoch": 0.6141546654845667, - "grad_norm": 7.941004753112793, - "learning_rate": 3.2281588553296544e-06, - "loss": 0.4697, - "mean_token_accuracy": 0.8500162839889527, - "num_tokens": 238377048.0, - "step": 198120 - }, - { - "entropy": 1.8944669008255004, - "epoch": 0.6141856646096164, - "grad_norm": 9.369904518127441, - "learning_rate": 3.2280773882150623e-06, - "loss": 0.4184, - "mean_token_accuracy": 0.8600606888532638, - "num_tokens": 238388297.0, - "step": 198130 - }, - { - "entropy": 1.8376318320631981, - "epoch": 0.6142166637346661, - "grad_norm": 7.919347763061523, - "learning_rate": 3.2279959272679695e-06, - "loss": 0.4329, - "mean_token_accuracy": 0.8605689898133277, - "num_tokens": 238399407.0, - "step": 198140 - }, - { - "entropy": 1.8755689665675164, - "epoch": 0.6142476628597158, - "grad_norm": 9.534124374389648, - "learning_rate": 3.2279144724875984e-06, - "loss": 0.4495, - "mean_token_accuracy": 0.8619197174906731, - "num_tokens": 238410783.0, - "step": 198150 - }, - { - "entropy": 1.8431221053004265, - "epoch": 0.6142786619847654, - "grad_norm": 3.9078283309936523, - "learning_rate": 3.2278330238731697e-06, - "loss": 0.4836, - "mean_token_accuracy": 0.8388959527015686, - "num_tokens": 238422622.0, - "step": 198160 - }, - { - "entropy": 1.8363049641251563, - "epoch": 0.6143096611098152, - "grad_norm": 7.458052158355713, - "learning_rate": 3.2277515814239064e-06, - "loss": 0.4527, - "mean_token_accuracy": 0.8575503110885621, - "num_tokens": 238434950.0, - "step": 198170 - }, - { - "entropy": 1.757038240134716, - "epoch": 0.6143406602348649, - "grad_norm": 7.210783004760742, - "learning_rate": 3.2276701451390308e-06, - "loss": 0.3817, - "mean_token_accuracy": 0.8728042364120483, - "num_tokens": 238446797.0, - "step": 198180 - }, - { - "entropy": 1.7734515801072122, - "epoch": 0.6143716593599146, - "grad_norm": 8.399144172668457, - "learning_rate": 3.227588715017765e-06, - "loss": 0.3601, - "mean_token_accuracy": 0.8601013854146004, - "num_tokens": 238460683.0, - "step": 198190 - }, - { - "entropy": 1.7706726059317588, - "epoch": 0.6144026584849642, - "grad_norm": 7.503983020782471, - "learning_rate": 3.2275072910593307e-06, - "loss": 0.4025, - "mean_token_accuracy": 0.8651550650596619, - "num_tokens": 238473268.0, - "step": 198200 - }, - { - "entropy": 1.9047612845897675, - "epoch": 0.614433657610014, - "grad_norm": 8.110784530639648, - "learning_rate": 3.227425873262953e-06, - "loss": 0.472, - "mean_token_accuracy": 0.8519498959183693, - "num_tokens": 238484560.0, - "step": 198210 - }, - { - "entropy": 1.842281450331211, - "epoch": 0.6144646567350637, - "grad_norm": 2.75750470161438, - "learning_rate": 3.227344461627852e-06, - "loss": 0.4766, - "mean_token_accuracy": 0.8465745970606804, - "num_tokens": 238496894.0, - "step": 198220 - }, - { - "entropy": 1.8060670062899589, - "epoch": 0.6144956558601133, - "grad_norm": 8.336816787719727, - "learning_rate": 3.227263056153253e-06, - "loss": 0.3836, - "mean_token_accuracy": 0.8592736333608627, - "num_tokens": 238509479.0, - "step": 198230 - }, - { - "entropy": 1.8654377192258835, - "epoch": 0.614526654985163, - "grad_norm": 9.700325965881348, - "learning_rate": 3.2271816568383773e-06, - "loss": 0.4458, - "mean_token_accuracy": 0.8588502183556557, - "num_tokens": 238520909.0, - "step": 198240 - }, - { - "entropy": 1.8672282606363297, - "epoch": 0.6145576541102128, - "grad_norm": 8.815815925598145, - "learning_rate": 3.2271002636824487e-06, - "loss": 0.4492, - "mean_token_accuracy": 0.8577031642198563, - "num_tokens": 238532759.0, - "step": 198250 - }, - { - "entropy": 1.8973318859934807, - "epoch": 0.6145886532352625, - "grad_norm": 7.610883712768555, - "learning_rate": 3.2270188766846907e-06, - "loss": 0.4677, - "mean_token_accuracy": 0.847311581671238, - "num_tokens": 238544572.0, - "step": 198260 - }, - { - "entropy": 1.8408987671136856, - "epoch": 0.6146196523603121, - "grad_norm": 7.623629570007324, - "learning_rate": 3.2269374958443272e-06, - "loss": 0.4092, - "mean_token_accuracy": 0.8681899920105934, - "num_tokens": 238557228.0, - "step": 198270 - }, - { - "entropy": 1.8358715653419495, - "epoch": 0.6146506514853618, - "grad_norm": 4.767894744873047, - "learning_rate": 3.226856121160581e-06, - "loss": 0.4297, - "mean_token_accuracy": 0.8569270819425583, - "num_tokens": 238569345.0, - "step": 198280 - }, - { - "entropy": 1.8315407454967498, - "epoch": 0.6146816506104115, - "grad_norm": 10.133764266967773, - "learning_rate": 3.2267747526326765e-06, - "loss": 0.3687, - "mean_token_accuracy": 0.870010358095169, - "num_tokens": 238581506.0, - "step": 198290 - }, - { - "entropy": 1.861008095741272, - "epoch": 0.6147126497354612, - "grad_norm": 7.7988152503967285, - "learning_rate": 3.226693390259837e-06, - "loss": 0.488, - "mean_token_accuracy": 0.8421275913715363, - "num_tokens": 238593036.0, - "step": 198300 - }, - { - "entropy": 1.90427585542202, - "epoch": 0.6147436488605109, - "grad_norm": 8.522828102111816, - "learning_rate": 3.226612034041287e-06, - "loss": 0.4981, - "mean_token_accuracy": 0.8471856519579888, - "num_tokens": 238604178.0, - "step": 198310 - }, - { - "entropy": 1.7154094487428666, - "epoch": 0.6147746479855606, - "grad_norm": 7.768447399139404, - "learning_rate": 3.226530683976251e-06, - "loss": 0.3595, - "mean_token_accuracy": 0.8743997603654862, - "num_tokens": 238617882.0, - "step": 198320 - }, - { - "entropy": 1.9018635407090188, - "epoch": 0.6148056471106103, - "grad_norm": 8.199962615966797, - "learning_rate": 3.2264493400639522e-06, - "loss": 0.4756, - "mean_token_accuracy": 0.8509716719388962, - "num_tokens": 238629421.0, - "step": 198330 - }, - { - "entropy": 1.8207050204277038, - "epoch": 0.61483664623566, - "grad_norm": 7.46588134765625, - "learning_rate": 3.2263680023036163e-06, - "loss": 0.424, - "mean_token_accuracy": 0.8665485486388207, - "num_tokens": 238642304.0, - "step": 198340 - }, - { - "entropy": 1.8872560843825341, - "epoch": 0.6148676453607097, - "grad_norm": 6.688050270080566, - "learning_rate": 3.226286670694467e-06, - "loss": 0.4562, - "mean_token_accuracy": 0.8600884407758713, - "num_tokens": 238653847.0, - "step": 198350 - }, - { - "entropy": 1.8786954209208488, - "epoch": 0.6148986444857594, - "grad_norm": 8.85261344909668, - "learning_rate": 3.22620534523573e-06, - "loss": 0.5395, - "mean_token_accuracy": 0.8428824350237847, - "num_tokens": 238665976.0, - "step": 198360 - }, - { - "entropy": 1.865576508641243, - "epoch": 0.614929643610809, - "grad_norm": 7.852488994598389, - "learning_rate": 3.226124025926629e-06, - "loss": 0.4199, - "mean_token_accuracy": 0.8543688908219338, - "num_tokens": 238678533.0, - "step": 198370 - }, - { - "entropy": 1.8879351884126663, - "epoch": 0.6149606427358588, - "grad_norm": 8.607827186584473, - "learning_rate": 3.22604271276639e-06, - "loss": 0.421, - "mean_token_accuracy": 0.8636692270636559, - "num_tokens": 238690116.0, - "step": 198380 - }, - { - "entropy": 1.874523502588272, - "epoch": 0.6149916418609085, - "grad_norm": 8.846104621887207, - "learning_rate": 3.2259614057542377e-06, - "loss": 0.4378, - "mean_token_accuracy": 0.8568495228886605, - "num_tokens": 238701949.0, - "step": 198390 - }, - { - "entropy": 1.8623564064502716, - "epoch": 0.6150226409859582, - "grad_norm": 6.648087978363037, - "learning_rate": 3.2258801048893974e-06, - "loss": 0.443, - "mean_token_accuracy": 0.8549107179045677, - "num_tokens": 238713655.0, - "step": 198400 - }, - { - "entropy": 1.938394169509411, - "epoch": 0.6150536401110078, - "grad_norm": 8.953487396240234, - "learning_rate": 3.2257988101710937e-06, - "loss": 0.4643, - "mean_token_accuracy": 0.8423136845231056, - "num_tokens": 238724982.0, - "step": 198410 - }, - { - "entropy": 1.8558956488966942, - "epoch": 0.6150846392360576, - "grad_norm": 4.239894390106201, - "learning_rate": 3.2257175215985535e-06, - "loss": 0.4442, - "mean_token_accuracy": 0.8487705007195473, - "num_tokens": 238737393.0, - "step": 198420 - }, - { - "entropy": 1.8345892995595932, - "epoch": 0.6151156383611073, - "grad_norm": 4.324648380279541, - "learning_rate": 3.2256362391710023e-06, - "loss": 0.4015, - "mean_token_accuracy": 0.8563659995794296, - "num_tokens": 238749535.0, - "step": 198430 - }, - { - "entropy": 1.9343143850564957, - "epoch": 0.6151466374861569, - "grad_norm": 8.221383094787598, - "learning_rate": 3.2255549628876655e-06, - "loss": 0.4908, - "mean_token_accuracy": 0.847763192653656, - "num_tokens": 238760327.0, - "step": 198440 - }, - { - "entropy": 1.8079216808080674, - "epoch": 0.6151776366112066, - "grad_norm": 8.265410423278809, - "learning_rate": 3.225473692747769e-06, - "loss": 0.3999, - "mean_token_accuracy": 0.8656876266002655, - "num_tokens": 238773252.0, - "step": 198450 - }, - { - "entropy": 1.8814214587211608, - "epoch": 0.6152086357362564, - "grad_norm": 8.192183494567871, - "learning_rate": 3.2253924287505382e-06, - "loss": 0.4289, - "mean_token_accuracy": 0.8665597438812256, - "num_tokens": 238784746.0, - "step": 198460 - }, - { - "entropy": 1.8934792011976243, - "epoch": 0.6152396348613061, - "grad_norm": 9.08458137512207, - "learning_rate": 3.2253111708952007e-06, - "loss": 0.4584, - "mean_token_accuracy": 0.8449272364377975, - "num_tokens": 238796524.0, - "step": 198470 - }, - { - "entropy": 1.8874700501561166, - "epoch": 0.6152706339863557, - "grad_norm": 8.042974472045898, - "learning_rate": 3.225229919180983e-06, - "loss": 0.4448, - "mean_token_accuracy": 0.8547151833772659, - "num_tokens": 238807564.0, - "step": 198480 - }, - { - "entropy": 1.8576602458953857, - "epoch": 0.6153016331114054, - "grad_norm": 3.5211164951324463, - "learning_rate": 3.22514867360711e-06, - "loss": 0.4267, - "mean_token_accuracy": 0.8638330847024918, - "num_tokens": 238819320.0, - "step": 198490 - }, - { - "entropy": 1.9324481308460235, - "epoch": 0.6153326322364552, - "grad_norm": 8.405794143676758, - "learning_rate": 3.22506743417281e-06, - "loss": 0.4537, - "mean_token_accuracy": 0.8566653996706008, - "num_tokens": 238830154.0, - "step": 198500 - }, - { - "entropy": 1.8496293708682061, - "epoch": 0.6153636313615048, - "grad_norm": 8.828564643859863, - "learning_rate": 3.224986200877309e-06, - "loss": 0.4046, - "mean_token_accuracy": 0.8587576478719712, - "num_tokens": 238842348.0, - "step": 198510 - }, - { - "entropy": 1.9132272452116013, - "epoch": 0.6153946304865545, - "grad_norm": 7.063823223114014, - "learning_rate": 3.2249049737198336e-06, - "loss": 0.4393, - "mean_token_accuracy": 0.8504229575395584, - "num_tokens": 238853792.0, - "step": 198520 - }, - { - "entropy": 1.8963459238409996, - "epoch": 0.6154256296116042, - "grad_norm": 3.3602142333984375, - "learning_rate": 3.224823752699611e-06, - "loss": 0.4292, - "mean_token_accuracy": 0.8591250211000443, - "num_tokens": 238865334.0, - "step": 198530 - }, - { - "entropy": 1.8504900515079499, - "epoch": 0.6154566287366539, - "grad_norm": 9.155741691589355, - "learning_rate": 3.224742537815869e-06, - "loss": 0.4123, - "mean_token_accuracy": 0.862186798453331, - "num_tokens": 238877468.0, - "step": 198540 - }, - { - "entropy": 1.8866616129875182, - "epoch": 0.6154876278617036, - "grad_norm": 9.354654312133789, - "learning_rate": 3.2246613290678347e-06, - "loss": 0.4945, - "mean_token_accuracy": 0.84332734644413, - "num_tokens": 238887991.0, - "step": 198550 - }, - { - "entropy": 1.8416400015354157, - "epoch": 0.6155186269867533, - "grad_norm": 3.3999783992767334, - "learning_rate": 3.224580126454735e-06, - "loss": 0.3933, - "mean_token_accuracy": 0.8651275053620339, - "num_tokens": 238900140.0, - "step": 198560 - }, - { - "entropy": 1.8191387727856636, - "epoch": 0.615549626111803, - "grad_norm": 7.950167655944824, - "learning_rate": 3.2244989299757985e-06, - "loss": 0.3814, - "mean_token_accuracy": 0.873188602924347, - "num_tokens": 238911988.0, - "step": 198570 - }, - { - "entropy": 1.8519021973013878, - "epoch": 0.6155806252368526, - "grad_norm": 2.7390058040618896, - "learning_rate": 3.224417739630251e-06, - "loss": 0.425, - "mean_token_accuracy": 0.8561043664813042, - "num_tokens": 238924394.0, - "step": 198580 - }, - { - "entropy": 1.8018446296453476, - "epoch": 0.6156116243619024, - "grad_norm": 4.802493095397949, - "learning_rate": 3.2243365554173233e-06, - "loss": 0.4041, - "mean_token_accuracy": 0.8607585147023201, - "num_tokens": 238937932.0, - "step": 198590 - }, - { - "entropy": 1.8335761934518815, - "epoch": 0.6156426234869521, - "grad_norm": 7.089979648590088, - "learning_rate": 3.224255377336241e-06, - "loss": 0.3715, - "mean_token_accuracy": 0.8711427718400955, - "num_tokens": 238950359.0, - "step": 198600 - }, - { - "entropy": 1.814923171699047, - "epoch": 0.6156736226120018, - "grad_norm": 9.263649940490723, - "learning_rate": 3.224174205386233e-06, - "loss": 0.4192, - "mean_token_accuracy": 0.8567982107400894, - "num_tokens": 238963129.0, - "step": 198610 - }, - { - "entropy": 1.9202240645885467, - "epoch": 0.6157046217370514, - "grad_norm": 6.889978885650635, - "learning_rate": 3.224093039566528e-06, - "loss": 0.4209, - "mean_token_accuracy": 0.8707507133483887, - "num_tokens": 238974124.0, - "step": 198620 - }, - { - "entropy": 1.9241647884249686, - "epoch": 0.6157356208621012, - "grad_norm": 8.047388076782227, - "learning_rate": 3.224011879876354e-06, - "loss": 0.4879, - "mean_token_accuracy": 0.8526466712355614, - "num_tokens": 238985580.0, - "step": 198630 - }, - { - "entropy": 1.887078620493412, - "epoch": 0.6157666199871509, - "grad_norm": 9.372336387634277, - "learning_rate": 3.2239307263149396e-06, - "loss": 0.4833, - "mean_token_accuracy": 0.8520206958055496, - "num_tokens": 238997245.0, - "step": 198640 - }, - { - "entropy": 1.8250777184963227, - "epoch": 0.6157976191122005, - "grad_norm": 4.112492084503174, - "learning_rate": 3.2238495788815127e-06, - "loss": 0.403, - "mean_token_accuracy": 0.8616786897182465, - "num_tokens": 239010115.0, - "step": 198650 - }, - { - "entropy": 1.8797308295965194, - "epoch": 0.6158286182372502, - "grad_norm": 7.629178047180176, - "learning_rate": 3.223768437575303e-06, - "loss": 0.4646, - "mean_token_accuracy": 0.8530390352010727, - "num_tokens": 239022003.0, - "step": 198660 - }, - { - "entropy": 1.9422622606158257, - "epoch": 0.6158596173623, - "grad_norm": 8.976255416870117, - "learning_rate": 3.22368730239554e-06, - "loss": 0.4623, - "mean_token_accuracy": 0.855775335431099, - "num_tokens": 239033410.0, - "step": 198670 - }, - { - "entropy": 1.924929141998291, - "epoch": 0.6158906164873497, - "grad_norm": 9.170016288757324, - "learning_rate": 3.2236061733414515e-06, - "loss": 0.4611, - "mean_token_accuracy": 0.8503917545080185, - "num_tokens": 239044548.0, - "step": 198680 - }, - { - "entropy": 1.8258700281381608, - "epoch": 0.6159216156123993, - "grad_norm": 7.752175331115723, - "learning_rate": 3.2235250504122685e-06, - "loss": 0.4535, - "mean_token_accuracy": 0.8530758947134018, - "num_tokens": 239058126.0, - "step": 198690 - }, - { - "entropy": 1.8341179564595222, - "epoch": 0.615952614737449, - "grad_norm": 8.030282974243164, - "learning_rate": 3.2234439336072184e-06, - "loss": 0.3675, - "mean_token_accuracy": 0.8733653351664543, - "num_tokens": 239070897.0, - "step": 198700 - }, - { - "entropy": 1.9363200575113297, - "epoch": 0.6159836138624988, - "grad_norm": 3.9513907432556152, - "learning_rate": 3.2233628229255317e-06, - "loss": 0.469, - "mean_token_accuracy": 0.8473497003316879, - "num_tokens": 239082965.0, - "step": 198710 - }, - { - "entropy": 1.8560904487967491, - "epoch": 0.6160146129875484, - "grad_norm": 8.370074272155762, - "learning_rate": 3.223281718366438e-06, - "loss": 0.413, - "mean_token_accuracy": 0.8634010136127472, - "num_tokens": 239094946.0, - "step": 198720 - }, - { - "entropy": 1.9300329357385635, - "epoch": 0.6160456121125981, - "grad_norm": 7.569369792938232, - "learning_rate": 3.2232006199291665e-06, - "loss": 0.4474, - "mean_token_accuracy": 0.8554398715496063, - "num_tokens": 239106368.0, - "step": 198730 - }, - { - "entropy": 1.8099862188100815, - "epoch": 0.6160766112376478, - "grad_norm": 8.18184757232666, - "learning_rate": 3.223119527612948e-06, - "loss": 0.43, - "mean_token_accuracy": 0.8559572860598564, - "num_tokens": 239119049.0, - "step": 198740 - }, - { - "entropy": 1.869805746525526, - "epoch": 0.6161076103626976, - "grad_norm": 3.7304792404174805, - "learning_rate": 3.223038441417012e-06, - "loss": 0.4134, - "mean_token_accuracy": 0.8575934499502182, - "num_tokens": 239130718.0, - "step": 198750 - }, - { - "entropy": 1.8467522531747818, - "epoch": 0.6161386094877472, - "grad_norm": 9.405539512634277, - "learning_rate": 3.2229573613405883e-06, - "loss": 0.3907, - "mean_token_accuracy": 0.8660490214824677, - "num_tokens": 239142637.0, - "step": 198760 - }, - { - "entropy": 1.860187816619873, - "epoch": 0.6161696086127969, - "grad_norm": 9.967666625976562, - "learning_rate": 3.2228762873829078e-06, - "loss": 0.4259, - "mean_token_accuracy": 0.8545159950852395, - "num_tokens": 239154629.0, - "step": 198770 - }, - { - "entropy": 1.8846646919846535, - "epoch": 0.6162006077378466, - "grad_norm": 8.279898643493652, - "learning_rate": 3.222795219543201e-06, - "loss": 0.4543, - "mean_token_accuracy": 0.8574783384799958, - "num_tokens": 239165944.0, - "step": 198780 - }, - { - "entropy": 1.8572672247886657, - "epoch": 0.6162316068628962, - "grad_norm": 6.9327521324157715, - "learning_rate": 3.2227141578206988e-06, - "loss": 0.4497, - "mean_token_accuracy": 0.8502766817808152, - "num_tokens": 239178303.0, - "step": 198790 - }, - { - "entropy": 1.9343391239643097, - "epoch": 0.616262605987946, - "grad_norm": 9.41601848602295, - "learning_rate": 3.2226331022146312e-06, - "loss": 0.484, - "mean_token_accuracy": 0.8539301916956902, - "num_tokens": 239188981.0, - "step": 198800 - }, - { - "entropy": 1.9151773989200591, - "epoch": 0.6162936051129957, - "grad_norm": 8.566873550415039, - "learning_rate": 3.222552052724229e-06, - "loss": 0.4602, - "mean_token_accuracy": 0.8554543927311897, - "num_tokens": 239199823.0, - "step": 198810 - }, - { - "entropy": 1.8708260178565979, - "epoch": 0.6163246042380454, - "grad_norm": 7.07456636428833, - "learning_rate": 3.2224710093487233e-06, - "loss": 0.4185, - "mean_token_accuracy": 0.8673142939805984, - "num_tokens": 239211868.0, - "step": 198820 - }, - { - "entropy": 1.8244033619761466, - "epoch": 0.616355603363095, - "grad_norm": 7.7489800453186035, - "learning_rate": 3.222389972087346e-06, - "loss": 0.3842, - "mean_token_accuracy": 0.8668134346604347, - "num_tokens": 239224272.0, - "step": 198830 - }, - { - "entropy": 1.8696729049086571, - "epoch": 0.6163866024881448, - "grad_norm": 8.857129096984863, - "learning_rate": 3.2223089409393272e-06, - "loss": 0.4601, - "mean_token_accuracy": 0.8558687403798103, - "num_tokens": 239236145.0, - "step": 198840 - }, - { - "entropy": 1.862788826227188, - "epoch": 0.6164176016131945, - "grad_norm": 8.101411819458008, - "learning_rate": 3.2222279159038993e-06, - "loss": 0.4349, - "mean_token_accuracy": 0.8538533791899681, - "num_tokens": 239248699.0, - "step": 198850 - }, - { - "entropy": 1.9156145766377448, - "epoch": 0.6164486007382441, - "grad_norm": 7.721922397613525, - "learning_rate": 3.222146896980293e-06, - "loss": 0.4445, - "mean_token_accuracy": 0.8570765167474746, - "num_tokens": 239260281.0, - "step": 198860 - }, - { - "entropy": 1.8555357545614242, - "epoch": 0.6164795998632938, - "grad_norm": 7.878803730010986, - "learning_rate": 3.222065884167741e-06, - "loss": 0.4, - "mean_token_accuracy": 0.8711650311946869, - "num_tokens": 239272202.0, - "step": 198870 - }, - { - "entropy": 1.8976218074560165, - "epoch": 0.6165105989883436, - "grad_norm": 7.291401386260986, - "learning_rate": 3.2219848774654737e-06, - "loss": 0.4195, - "mean_token_accuracy": 0.8570414736866951, - "num_tokens": 239283994.0, - "step": 198880 - }, - { - "entropy": 1.9023361951112747, - "epoch": 0.6165415981133933, - "grad_norm": 4.05077600479126, - "learning_rate": 3.2219038768727244e-06, - "loss": 0.4414, - "mean_token_accuracy": 0.8495072141289711, - "num_tokens": 239296291.0, - "step": 198890 - }, - { - "entropy": 1.8692690685391427, - "epoch": 0.6165725972384429, - "grad_norm": 8.19445514678955, - "learning_rate": 3.221822882388725e-06, - "loss": 0.4228, - "mean_token_accuracy": 0.8572457298636437, - "num_tokens": 239308415.0, - "step": 198900 - }, - { - "entropy": 1.84363332092762, - "epoch": 0.6166035963634926, - "grad_norm": 8.040933609008789, - "learning_rate": 3.221741894012706e-06, - "loss": 0.4498, - "mean_token_accuracy": 0.8483372122049332, - "num_tokens": 239321387.0, - "step": 198910 - }, - { - "entropy": 1.9250966876745224, - "epoch": 0.6166345954885424, - "grad_norm": 8.463235855102539, - "learning_rate": 3.221660911743902e-06, - "loss": 0.485, - "mean_token_accuracy": 0.8497035041451454, - "num_tokens": 239332518.0, - "step": 198920 - }, - { - "entropy": 1.795482875406742, - "epoch": 0.616665594613592, - "grad_norm": 9.157936096191406, - "learning_rate": 3.221579935581544e-06, - "loss": 0.387, - "mean_token_accuracy": 0.867797677218914, - "num_tokens": 239346161.0, - "step": 198930 - }, - { - "entropy": 1.9090109691023827, - "epoch": 0.6166965937386417, - "grad_norm": 7.305553436279297, - "learning_rate": 3.221498965524865e-06, - "loss": 0.4669, - "mean_token_accuracy": 0.8539376124739647, - "num_tokens": 239357090.0, - "step": 198940 - }, - { - "entropy": 1.8361437231302262, - "epoch": 0.6167275928636914, - "grad_norm": 7.487894058227539, - "learning_rate": 3.221418001573099e-06, - "loss": 0.4262, - "mean_token_accuracy": 0.8567282602190971, - "num_tokens": 239369359.0, - "step": 198950 - }, - { - "entropy": 1.8894758448004723, - "epoch": 0.6167585919887412, - "grad_norm": 5.915857315063477, - "learning_rate": 3.221337043725477e-06, - "loss": 0.4348, - "mean_token_accuracy": 0.8565126731991768, - "num_tokens": 239381084.0, - "step": 198960 - }, - { - "entropy": 1.953935231268406, - "epoch": 0.6167895911137908, - "grad_norm": 8.806802749633789, - "learning_rate": 3.2212560919812327e-06, - "loss": 0.4583, - "mean_token_accuracy": 0.8550879210233688, - "num_tokens": 239392342.0, - "step": 198970 - }, - { - "entropy": 1.848163591325283, - "epoch": 0.6168205902388405, - "grad_norm": 3.8063647747039795, - "learning_rate": 3.2211751463395992e-06, - "loss": 0.3818, - "mean_token_accuracy": 0.862635625898838, - "num_tokens": 239404961.0, - "step": 198980 - }, - { - "entropy": 1.9147900685667991, - "epoch": 0.6168515893638902, - "grad_norm": 8.159584999084473, - "learning_rate": 3.2210942067998103e-06, - "loss": 0.4119, - "mean_token_accuracy": 0.8590657487511635, - "num_tokens": 239416335.0, - "step": 198990 - }, - { - "entropy": 1.9331546038389207, - "epoch": 0.61688258848894, - "grad_norm": 10.028764724731445, - "learning_rate": 3.221013273361099e-06, - "loss": 0.5019, - "mean_token_accuracy": 0.8451780170202255, - "num_tokens": 239427470.0, - "step": 199000 - }, - { - "entropy": 1.8391878455877304, - "epoch": 0.6169135876139896, - "grad_norm": 8.022989273071289, - "learning_rate": 3.220932346022699e-06, - "loss": 0.3846, - "mean_token_accuracy": 0.8623939231038094, - "num_tokens": 239440496.0, - "step": 199010 - }, - { - "entropy": 1.8676461964845656, - "epoch": 0.6169445867390393, - "grad_norm": 3.621432065963745, - "learning_rate": 3.220851424783843e-06, - "loss": 0.4093, - "mean_token_accuracy": 0.8650773599743843, - "num_tokens": 239452244.0, - "step": 199020 - }, - { - "entropy": 1.896689024567604, - "epoch": 0.616975585864089, - "grad_norm": 8.148860931396484, - "learning_rate": 3.2207705096437664e-06, - "loss": 0.4692, - "mean_token_accuracy": 0.8594577699899674, - "num_tokens": 239462795.0, - "step": 199030 - }, - { - "entropy": 1.82914230376482, - "epoch": 0.6170065849891386, - "grad_norm": 8.967019081115723, - "learning_rate": 3.220689600601702e-06, - "loss": 0.4341, - "mean_token_accuracy": 0.857168261706829, - "num_tokens": 239476029.0, - "step": 199040 - }, - { - "entropy": 1.8706032626330853, - "epoch": 0.6170375841141884, - "grad_norm": 8.116171836853027, - "learning_rate": 3.2206086976568857e-06, - "loss": 0.4525, - "mean_token_accuracy": 0.8567556083202362, - "num_tokens": 239487901.0, - "step": 199050 - }, - { - "entropy": 1.8878035575151444, - "epoch": 0.6170685832392381, - "grad_norm": 3.669631004333496, - "learning_rate": 3.220527800808549e-06, - "loss": 0.4185, - "mean_token_accuracy": 0.8642433434724808, - "num_tokens": 239500013.0, - "step": 199060 - }, - { - "entropy": 1.8171610817313195, - "epoch": 0.6170995823642877, - "grad_norm": 2.781083822250366, - "learning_rate": 3.220446910055929e-06, - "loss": 0.372, - "mean_token_accuracy": 0.8732234045863152, - "num_tokens": 239512409.0, - "step": 199070 - }, - { - "entropy": 1.8164382711052895, - "epoch": 0.6171305814893374, - "grad_norm": 7.623589038848877, - "learning_rate": 3.2203660253982575e-06, - "loss": 0.4445, - "mean_token_accuracy": 0.8559349969029426, - "num_tokens": 239525947.0, - "step": 199080 - }, - { - "entropy": 1.7972185373306275, - "epoch": 0.6171615806143872, - "grad_norm": 7.532422065734863, - "learning_rate": 3.2202851468347713e-06, - "loss": 0.3724, - "mean_token_accuracy": 0.8654538482427597, - "num_tokens": 239539242.0, - "step": 199090 - }, - { - "entropy": 1.9267156958580016, - "epoch": 0.6171925797394369, - "grad_norm": 8.321616172790527, - "learning_rate": 3.2202042743647046e-06, - "loss": 0.4779, - "mean_token_accuracy": 0.8527379527688026, - "num_tokens": 239550383.0, - "step": 199100 - }, - { - "entropy": 1.8652078002691268, - "epoch": 0.6172235788644865, - "grad_norm": 8.199543952941895, - "learning_rate": 3.220123407987291e-06, - "loss": 0.4072, - "mean_token_accuracy": 0.8678713470697403, - "num_tokens": 239562372.0, - "step": 199110 - }, - { - "entropy": 1.8801757156848908, - "epoch": 0.6172545779895362, - "grad_norm": 4.798285484313965, - "learning_rate": 3.2200425477017676e-06, - "loss": 0.4314, - "mean_token_accuracy": 0.8524357378482819, - "num_tokens": 239574140.0, - "step": 199120 - }, - { - "entropy": 1.871886597573757, - "epoch": 0.617285577114586, - "grad_norm": 9.886223793029785, - "learning_rate": 3.2199616935073684e-06, - "loss": 0.4191, - "mean_token_accuracy": 0.8601740583777427, - "num_tokens": 239586553.0, - "step": 199130 - }, - { - "entropy": 1.8011711463332176, - "epoch": 0.6173165762396357, - "grad_norm": 3.447469472885132, - "learning_rate": 3.219880845403329e-06, - "loss": 0.4188, - "mean_token_accuracy": 0.8566708326339721, - "num_tokens": 239599835.0, - "step": 199140 - }, - { - "entropy": 1.953110656142235, - "epoch": 0.6173475753646853, - "grad_norm": 7.4284796714782715, - "learning_rate": 3.2198000033888844e-06, - "loss": 0.4344, - "mean_token_accuracy": 0.8602241560816765, - "num_tokens": 239610521.0, - "step": 199150 - }, - { - "entropy": 1.7998080857098102, - "epoch": 0.617378574489735, - "grad_norm": 4.082189559936523, - "learning_rate": 3.2197191674632703e-06, - "loss": 0.4031, - "mean_token_accuracy": 0.8608902215957641, - "num_tokens": 239623309.0, - "step": 199160 - }, - { - "entropy": 1.7854876972734928, - "epoch": 0.6174095736147848, - "grad_norm": 10.280808448791504, - "learning_rate": 3.2196383376257232e-06, - "loss": 0.3503, - "mean_token_accuracy": 0.8712309464812279, - "num_tokens": 239637541.0, - "step": 199170 - }, - { - "entropy": 1.941126611828804, - "epoch": 0.6174405727398344, - "grad_norm": 7.390081882476807, - "learning_rate": 3.219557513875478e-06, - "loss": 0.4495, - "mean_token_accuracy": 0.8549496442079544, - "num_tokens": 239649091.0, - "step": 199180 - }, - { - "entropy": 1.7639221414923667, - "epoch": 0.6174715718648841, - "grad_norm": 6.774654388427734, - "learning_rate": 3.2194766962117714e-06, - "loss": 0.3335, - "mean_token_accuracy": 0.8723353713750839, - "num_tokens": 239662638.0, - "step": 199190 - }, - { - "entropy": 1.945545607805252, - "epoch": 0.6175025709899338, - "grad_norm": 8.495312690734863, - "learning_rate": 3.219395884633839e-06, - "loss": 0.5042, - "mean_token_accuracy": 0.8469432502985, - "num_tokens": 239673886.0, - "step": 199200 - }, - { - "entropy": 1.8317839153110982, - "epoch": 0.6175335701149836, - "grad_norm": 9.136716842651367, - "learning_rate": 3.219315079140918e-06, - "loss": 0.3809, - "mean_token_accuracy": 0.8626415833830834, - "num_tokens": 239686394.0, - "step": 199210 - }, - { - "entropy": 1.856505075097084, - "epoch": 0.6175645692400332, - "grad_norm": 3.8257100582122803, - "learning_rate": 3.219234279732243e-06, - "loss": 0.3951, - "mean_token_accuracy": 0.857763585448265, - "num_tokens": 239699160.0, - "step": 199220 - }, - { - "entropy": 1.9680979698896408, - "epoch": 0.6175955683650829, - "grad_norm": 10.98916244506836, - "learning_rate": 3.219153486407052e-06, - "loss": 0.4892, - "mean_token_accuracy": 0.8537120461463928, - "num_tokens": 239709462.0, - "step": 199230 - }, - { - "entropy": 1.8636929139494895, - "epoch": 0.6176265674901326, - "grad_norm": 9.323714256286621, - "learning_rate": 3.2190726991645806e-06, - "loss": 0.407, - "mean_token_accuracy": 0.8667399495840072, - "num_tokens": 239721459.0, - "step": 199240 - }, - { - "entropy": 1.7128648608922958, - "epoch": 0.6176575666151823, - "grad_norm": 3.4446444511413574, - "learning_rate": 3.2189919180040667e-06, - "loss": 0.3653, - "mean_token_accuracy": 0.8705388396978379, - "num_tokens": 239735044.0, - "step": 199250 - }, - { - "entropy": 1.8738186940550805, - "epoch": 0.617688565740232, - "grad_norm": 9.5772123336792, - "learning_rate": 3.2189111429247467e-06, - "loss": 0.399, - "mean_token_accuracy": 0.8661644160747528, - "num_tokens": 239747635.0, - "step": 199260 - }, - { - "entropy": 1.867714224755764, - "epoch": 0.6177195648652817, - "grad_norm": 8.063520431518555, - "learning_rate": 3.2188303739258574e-06, - "loss": 0.4329, - "mean_token_accuracy": 0.8557335734367371, - "num_tokens": 239759230.0, - "step": 199270 - }, - { - "entropy": 1.8949234545230866, - "epoch": 0.6177505639903313, - "grad_norm": 9.034475326538086, - "learning_rate": 3.2187496110066364e-06, - "loss": 0.4393, - "mean_token_accuracy": 0.8573550119996071, - "num_tokens": 239770185.0, - "step": 199280 - }, - { - "entropy": 1.9065477907657624, - "epoch": 0.617781563115381, - "grad_norm": 8.86186408996582, - "learning_rate": 3.218668854166321e-06, - "loss": 0.4439, - "mean_token_accuracy": 0.8593709141016006, - "num_tokens": 239781844.0, - "step": 199290 - }, - { - "entropy": 1.8973913431167602, - "epoch": 0.6178125622404308, - "grad_norm": 9.058403968811035, - "learning_rate": 3.2185881034041483e-06, - "loss": 0.465, - "mean_token_accuracy": 0.8546336054801941, - "num_tokens": 239793680.0, - "step": 199300 - }, - { - "entropy": 1.8704157873988152, - "epoch": 0.6178435613654805, - "grad_norm": 3.7460262775421143, - "learning_rate": 3.218507358719356e-06, - "loss": 0.4316, - "mean_token_accuracy": 0.871597295999527, - "num_tokens": 239805800.0, - "step": 199310 - }, - { - "entropy": 1.9420752674341202, - "epoch": 0.6178745604905301, - "grad_norm": 4.6435346603393555, - "learning_rate": 3.2184266201111815e-06, - "loss": 0.486, - "mean_token_accuracy": 0.8475510746240615, - "num_tokens": 239817299.0, - "step": 199320 - }, - { - "entropy": 1.8781623139977455, - "epoch": 0.6179055596155798, - "grad_norm": 6.959751129150391, - "learning_rate": 3.218345887578864e-06, - "loss": 0.4344, - "mean_token_accuracy": 0.8654584184288978, - "num_tokens": 239829124.0, - "step": 199330 - }, - { - "entropy": 1.9558769553899764, - "epoch": 0.6179365587406296, - "grad_norm": 7.644688606262207, - "learning_rate": 3.2182651611216393e-06, - "loss": 0.4525, - "mean_token_accuracy": 0.8626610890030861, - "num_tokens": 239840235.0, - "step": 199340 - }, - { - "entropy": 1.9544329941272736, - "epoch": 0.6179675578656793, - "grad_norm": 7.619667053222656, - "learning_rate": 3.2181844407387477e-06, - "loss": 0.4907, - "mean_token_accuracy": 0.8480888634920121, - "num_tokens": 239850897.0, - "step": 199350 - }, - { - "entropy": 1.9408447206020356, - "epoch": 0.6179985569907289, - "grad_norm": 9.406037330627441, - "learning_rate": 3.218103726429427e-06, - "loss": 0.5414, - "mean_token_accuracy": 0.8272959098219872, - "num_tokens": 239862352.0, - "step": 199360 - }, - { - "entropy": 1.8339162483811378, - "epoch": 0.6180295561157786, - "grad_norm": 7.903733253479004, - "learning_rate": 3.218023018192914e-06, - "loss": 0.3946, - "mean_token_accuracy": 0.864348916709423, - "num_tokens": 239875037.0, - "step": 199370 - }, - { - "entropy": 1.8993135869503022, - "epoch": 0.6180605552408284, - "grad_norm": 7.121351718902588, - "learning_rate": 3.2179423160284488e-06, - "loss": 0.4683, - "mean_token_accuracy": 0.8541018515825272, - "num_tokens": 239886000.0, - "step": 199380 - }, - { - "entropy": 1.9435019135475158, - "epoch": 0.618091554365878, - "grad_norm": 8.412862777709961, - "learning_rate": 3.2178616199352696e-06, - "loss": 0.4982, - "mean_token_accuracy": 0.8492094442248345, - "num_tokens": 239897551.0, - "step": 199390 - }, - { - "entropy": 1.8815071359276772, - "epoch": 0.6181225534909277, - "grad_norm": 10.336962699890137, - "learning_rate": 3.217780929912615e-06, - "loss": 0.4228, - "mean_token_accuracy": 0.8544108346104622, - "num_tokens": 239908859.0, - "step": 199400 - }, - { - "entropy": 1.789330853521824, - "epoch": 0.6181535526159774, - "grad_norm": 9.012761116027832, - "learning_rate": 3.2177002459597244e-06, - "loss": 0.3917, - "mean_token_accuracy": 0.8624624639749527, - "num_tokens": 239922173.0, - "step": 199410 - }, - { - "entropy": 1.7700959324836731, - "epoch": 0.6181845517410272, - "grad_norm": 8.16477108001709, - "learning_rate": 3.2176195680758367e-06, - "loss": 0.3792, - "mean_token_accuracy": 0.8740750655531884, - "num_tokens": 239935765.0, - "step": 199420 - }, - { - "entropy": 1.898264265060425, - "epoch": 0.6182155508660768, - "grad_norm": 7.839510917663574, - "learning_rate": 3.2175388962601905e-06, - "loss": 0.4812, - "mean_token_accuracy": 0.8386017501354217, - "num_tokens": 239947401.0, - "step": 199430 - }, - { - "entropy": 1.8775344803929328, - "epoch": 0.6182465499911265, - "grad_norm": 8.23879337310791, - "learning_rate": 3.217458230512026e-06, - "loss": 0.46, - "mean_token_accuracy": 0.8576970815658569, - "num_tokens": 239958844.0, - "step": 199440 - }, - { - "entropy": 1.9250696122646331, - "epoch": 0.6182775491161762, - "grad_norm": 8.848472595214844, - "learning_rate": 3.217377570830582e-06, - "loss": 0.49, - "mean_token_accuracy": 0.8477827280759811, - "num_tokens": 239969861.0, - "step": 199450 - }, - { - "entropy": 1.822139708697796, - "epoch": 0.6183085482412259, - "grad_norm": 6.093864440917969, - "learning_rate": 3.2172969172150988e-06, - "loss": 0.4089, - "mean_token_accuracy": 0.8548695519566536, - "num_tokens": 239982644.0, - "step": 199460 - }, - { - "entropy": 1.8985069781541823, - "epoch": 0.6183395473662756, - "grad_norm": 7.458169460296631, - "learning_rate": 3.2172162696648153e-06, - "loss": 0.4334, - "mean_token_accuracy": 0.8565488919615746, - "num_tokens": 239994171.0, - "step": 199470 - }, - { - "entropy": 1.8331923454999923, - "epoch": 0.6183705464913253, - "grad_norm": 7.46090841293335, - "learning_rate": 3.217135628178972e-06, - "loss": 0.4207, - "mean_token_accuracy": 0.8442671373486519, - "num_tokens": 240007032.0, - "step": 199480 - }, - { - "entropy": 1.864036823809147, - "epoch": 0.618401545616375, - "grad_norm": 3.682788848876953, - "learning_rate": 3.2170549927568086e-06, - "loss": 0.4336, - "mean_token_accuracy": 0.8551156163215637, - "num_tokens": 240019306.0, - "step": 199490 - }, - { - "entropy": 1.879542638361454, - "epoch": 0.6184325447414247, - "grad_norm": 8.343369483947754, - "learning_rate": 3.216974363397565e-06, - "loss": 0.4333, - "mean_token_accuracy": 0.856440258026123, - "num_tokens": 240031122.0, - "step": 199500 - }, - { - "entropy": 1.8944496288895607, - "epoch": 0.6184635438664744, - "grad_norm": 8.23908519744873, - "learning_rate": 3.2168937401004816e-06, - "loss": 0.4287, - "mean_token_accuracy": 0.8619818806648254, - "num_tokens": 240043420.0, - "step": 199510 - }, - { - "entropy": 1.831877386569977, - "epoch": 0.6184945429915241, - "grad_norm": 8.815319061279297, - "learning_rate": 3.2168131228647997e-06, - "loss": 0.3623, - "mean_token_accuracy": 0.8666740179061889, - "num_tokens": 240056129.0, - "step": 199520 - }, - { - "entropy": 1.861776551604271, - "epoch": 0.6185255421165737, - "grad_norm": 10.601828575134277, - "learning_rate": 3.216732511689758e-06, - "loss": 0.4158, - "mean_token_accuracy": 0.8628039136528969, - "num_tokens": 240067794.0, - "step": 199530 - }, - { - "entropy": 1.8404061213135718, - "epoch": 0.6185565412416234, - "grad_norm": 8.904887199401855, - "learning_rate": 3.2166519065745986e-06, - "loss": 0.4142, - "mean_token_accuracy": 0.8643636539578438, - "num_tokens": 240080470.0, - "step": 199540 - }, - { - "entropy": 1.9593759417533874, - "epoch": 0.6185875403666732, - "grad_norm": 8.867597579956055, - "learning_rate": 3.216571307518563e-06, - "loss": 0.5131, - "mean_token_accuracy": 0.837907250225544, - "num_tokens": 240091238.0, - "step": 199550 - }, - { - "entropy": 1.8562731474637986, - "epoch": 0.6186185394917229, - "grad_norm": 4.115611553192139, - "learning_rate": 3.2164907145208897e-06, - "loss": 0.4471, - "mean_token_accuracy": 0.850125202536583, - "num_tokens": 240103551.0, - "step": 199560 - }, - { - "entropy": 1.8324606716632843, - "epoch": 0.6186495386167725, - "grad_norm": 4.200334072113037, - "learning_rate": 3.2164101275808217e-06, - "loss": 0.3759, - "mean_token_accuracy": 0.863725571334362, - "num_tokens": 240115791.0, - "step": 199570 - }, - { - "entropy": 1.8296230912208558, - "epoch": 0.6186805377418222, - "grad_norm": 7.744045734405518, - "learning_rate": 3.2163295466975992e-06, - "loss": 0.4115, - "mean_token_accuracy": 0.8615129724144935, - "num_tokens": 240128381.0, - "step": 199580 - }, - { - "entropy": 1.860279454290867, - "epoch": 0.618711536866872, - "grad_norm": 10.000602722167969, - "learning_rate": 3.2162489718704637e-06, - "loss": 0.4181, - "mean_token_accuracy": 0.8575031846761704, - "num_tokens": 240140364.0, - "step": 199590 - }, - { - "entropy": 1.733563742041588, - "epoch": 0.6187425359919216, - "grad_norm": 1.9951728582382202, - "learning_rate": 3.2161684030986574e-06, - "loss": 0.3909, - "mean_token_accuracy": 0.8683977887034416, - "num_tokens": 240153746.0, - "step": 199600 - }, - { - "entropy": 1.7496012575924396, - "epoch": 0.6187735351169713, - "grad_norm": 10.270469665527344, - "learning_rate": 3.216087840381421e-06, - "loss": 0.3555, - "mean_token_accuracy": 0.8686364412307739, - "num_tokens": 240167779.0, - "step": 199610 - }, - { - "entropy": 1.8671685114502907, - "epoch": 0.618804534242021, - "grad_norm": 6.848788738250732, - "learning_rate": 3.2160072837179973e-06, - "loss": 0.4525, - "mean_token_accuracy": 0.859526140987873, - "num_tokens": 240179637.0, - "step": 199620 - }, - { - "entropy": 1.851687641441822, - "epoch": 0.6188355333670708, - "grad_norm": 7.782304286956787, - "learning_rate": 3.2159267331076264e-06, - "loss": 0.4502, - "mean_token_accuracy": 0.8539917662739753, - "num_tokens": 240191476.0, - "step": 199630 - }, - { - "entropy": 1.8153897300362587, - "epoch": 0.6188665324921204, - "grad_norm": 8.59154224395752, - "learning_rate": 3.2158461885495516e-06, - "loss": 0.3978, - "mean_token_accuracy": 0.8624886780977249, - "num_tokens": 240204196.0, - "step": 199640 - }, - { - "entropy": 1.8622885420918465, - "epoch": 0.6188975316171701, - "grad_norm": 7.064817428588867, - "learning_rate": 3.215765650043015e-06, - "loss": 0.4145, - "mean_token_accuracy": 0.8695329040288925, - "num_tokens": 240216191.0, - "step": 199650 - }, - { - "entropy": 1.8840928718447685, - "epoch": 0.6189285307422198, - "grad_norm": 7.8145341873168945, - "learning_rate": 3.215685117587258e-06, - "loss": 0.4572, - "mean_token_accuracy": 0.8516005665063858, - "num_tokens": 240227499.0, - "step": 199660 - }, - { - "entropy": 1.9540878593921662, - "epoch": 0.6189595298672695, - "grad_norm": 7.2002105712890625, - "learning_rate": 3.2156045911815255e-06, - "loss": 0.5119, - "mean_token_accuracy": 0.8494768559932708, - "num_tokens": 240238394.0, - "step": 199670 - }, - { - "entropy": 1.8394357457756996, - "epoch": 0.6189905289923192, - "grad_norm": 7.4307169914245605, - "learning_rate": 3.215524070825056e-06, - "loss": 0.3882, - "mean_token_accuracy": 0.8646936297416687, - "num_tokens": 240250932.0, - "step": 199680 - }, - { - "entropy": 1.9449533462524413, - "epoch": 0.6190215281173689, - "grad_norm": 9.829404830932617, - "learning_rate": 3.2154435565170947e-06, - "loss": 0.4779, - "mean_token_accuracy": 0.8556923270225525, - "num_tokens": 240261925.0, - "step": 199690 - }, - { - "entropy": 1.845785889029503, - "epoch": 0.6190525272424185, - "grad_norm": 4.0769944190979, - "learning_rate": 3.2153630482568847e-06, - "loss": 0.4125, - "mean_token_accuracy": 0.857506263256073, - "num_tokens": 240274462.0, - "step": 199700 - }, - { - "entropy": 1.8135719284415246, - "epoch": 0.6190835263674683, - "grad_norm": 4.39888334274292, - "learning_rate": 3.2152825460436675e-06, - "loss": 0.3959, - "mean_token_accuracy": 0.8683549582958221, - "num_tokens": 240287191.0, - "step": 199710 - }, - { - "entropy": 1.9001155897974968, - "epoch": 0.619114525492518, - "grad_norm": 7.417325019836426, - "learning_rate": 3.2152020498766874e-06, - "loss": 0.4515, - "mean_token_accuracy": 0.8583855494856835, - "num_tokens": 240298252.0, - "step": 199720 - }, - { - "entropy": 1.9175253897905349, - "epoch": 0.6191455246175677, - "grad_norm": 7.785533428192139, - "learning_rate": 3.215121559755187e-06, - "loss": 0.5209, - "mean_token_accuracy": 0.8480686709284783, - "num_tokens": 240309816.0, - "step": 199730 - }, - { - "entropy": 1.757785053551197, - "epoch": 0.6191765237426173, - "grad_norm": 3.814818859100342, - "learning_rate": 3.2150410756784088e-06, - "loss": 0.3694, - "mean_token_accuracy": 0.8653519213199615, - "num_tokens": 240322874.0, - "step": 199740 - }, - { - "entropy": 1.837675492465496, - "epoch": 0.6192075228676671, - "grad_norm": 6.441486358642578, - "learning_rate": 3.214960597645598e-06, - "loss": 0.4462, - "mean_token_accuracy": 0.8579330682754517, - "num_tokens": 240335672.0, - "step": 199750 - }, - { - "entropy": 1.9337011486291886, - "epoch": 0.6192385219927168, - "grad_norm": 8.365361213684082, - "learning_rate": 3.214880125655997e-06, - "loss": 0.4904, - "mean_token_accuracy": 0.8540092423558235, - "num_tokens": 240345914.0, - "step": 199760 - }, - { - "entropy": 1.8270794779062272, - "epoch": 0.6192695211177665, - "grad_norm": 7.633695125579834, - "learning_rate": 3.2147996597088495e-06, - "loss": 0.4056, - "mean_token_accuracy": 0.860172837972641, - "num_tokens": 240357519.0, - "step": 199770 - }, - { - "entropy": 1.9075476467609405, - "epoch": 0.6193005202428161, - "grad_norm": 10.322786331176758, - "learning_rate": 3.2147191998034004e-06, - "loss": 0.4528, - "mean_token_accuracy": 0.8539448142051697, - "num_tokens": 240369329.0, - "step": 199780 - }, - { - "entropy": 1.915835802257061, - "epoch": 0.6193315193678658, - "grad_norm": 3.6770076751708984, - "learning_rate": 3.2146387459388915e-06, - "loss": 0.434, - "mean_token_accuracy": 0.8576722934842109, - "num_tokens": 240380801.0, - "step": 199790 - }, - { - "entropy": 1.8189099460840226, - "epoch": 0.6193625184929156, - "grad_norm": 7.405015468597412, - "learning_rate": 3.2145582981145695e-06, - "loss": 0.3819, - "mean_token_accuracy": 0.8634320721030235, - "num_tokens": 240393144.0, - "step": 199800 - }, - { - "entropy": 1.860145990550518, - "epoch": 0.6193935176179652, - "grad_norm": 7.664626598358154, - "learning_rate": 3.214477856329677e-06, - "loss": 0.4422, - "mean_token_accuracy": 0.8565018594264984, - "num_tokens": 240404917.0, - "step": 199810 - }, - { - "entropy": 1.848432496190071, - "epoch": 0.6194245167430149, - "grad_norm": 8.589262962341309, - "learning_rate": 3.2143974205834592e-06, - "loss": 0.4368, - "mean_token_accuracy": 0.8619036614894867, - "num_tokens": 240416883.0, - "step": 199820 - }, - { - "entropy": 1.8772276252508164, - "epoch": 0.6194555158680646, - "grad_norm": 8.915504455566406, - "learning_rate": 3.2143169908751603e-06, - "loss": 0.4476, - "mean_token_accuracy": 0.8534212946891785, - "num_tokens": 240428587.0, - "step": 199830 - }, - { - "entropy": 1.86657772064209, - "epoch": 0.6194865149931144, - "grad_norm": 5.037930011749268, - "learning_rate": 3.214236567204025e-06, - "loss": 0.4766, - "mean_token_accuracy": 0.8429229214787484, - "num_tokens": 240441470.0, - "step": 199840 - }, - { - "entropy": 1.8196565955877304, - "epoch": 0.619517514118164, - "grad_norm": 7.531178951263428, - "learning_rate": 3.2141561495692973e-06, - "loss": 0.3875, - "mean_token_accuracy": 0.8649965927004815, - "num_tokens": 240453843.0, - "step": 199850 - }, - { - "entropy": 1.9064827859401703, - "epoch": 0.6195485132432137, - "grad_norm": 6.881914138793945, - "learning_rate": 3.214075737970223e-06, - "loss": 0.4427, - "mean_token_accuracy": 0.8601453080773354, - "num_tokens": 240465162.0, - "step": 199860 - }, - { - "entropy": 1.9310476690530778, - "epoch": 0.6195795123682634, - "grad_norm": 8.316636085510254, - "learning_rate": 3.2139953324060473e-06, - "loss": 0.4568, - "mean_token_accuracy": 0.8539797902107239, - "num_tokens": 240476257.0, - "step": 199870 - }, - { - "entropy": 1.723580890893936, - "epoch": 0.6196105114933131, - "grad_norm": 8.44727897644043, - "learning_rate": 3.213914932876015e-06, - "loss": 0.3252, - "mean_token_accuracy": 0.8854304000735282, - "num_tokens": 240490189.0, - "step": 199880 - }, - { - "entropy": 1.9352102607488633, - "epoch": 0.6196415106183628, - "grad_norm": 6.963158130645752, - "learning_rate": 3.2138345393793713e-06, - "loss": 0.4948, - "mean_token_accuracy": 0.8497881457209587, - "num_tokens": 240500887.0, - "step": 199890 - }, - { - "entropy": 1.7900185361504555, - "epoch": 0.6196725097434125, - "grad_norm": 3.517059326171875, - "learning_rate": 3.2137541519153613e-06, - "loss": 0.373, - "mean_token_accuracy": 0.8654786735773087, - "num_tokens": 240513549.0, - "step": 199900 - }, - { - "entropy": 1.8556710347533225, - "epoch": 0.6197035088684621, - "grad_norm": 8.985323905944824, - "learning_rate": 3.213673770483231e-06, - "loss": 0.4297, - "mean_token_accuracy": 0.8574738964438439, - "num_tokens": 240525617.0, - "step": 199910 - }, - { - "entropy": 1.8572029948234559, - "epoch": 0.6197345079935119, - "grad_norm": 8.606582641601562, - "learning_rate": 3.213593395082226e-06, - "loss": 0.4194, - "mean_token_accuracy": 0.8530072510242462, - "num_tokens": 240537926.0, - "step": 199920 - }, - { - "entropy": 1.823797370493412, - "epoch": 0.6197655071185616, - "grad_norm": 9.115023612976074, - "learning_rate": 3.2135130257115936e-06, - "loss": 0.4123, - "mean_token_accuracy": 0.8578477054834366, - "num_tokens": 240550835.0, - "step": 199930 - }, - { - "entropy": 1.853266417235136, - "epoch": 0.6197965062436113, - "grad_norm": 7.527276992797852, - "learning_rate": 3.2134326623705775e-06, - "loss": 0.4336, - "mean_token_accuracy": 0.8626913323998451, - "num_tokens": 240563639.0, - "step": 199940 - }, - { - "entropy": 1.882079529762268, - "epoch": 0.6198275053686609, - "grad_norm": 7.617688179016113, - "learning_rate": 3.213352305058424e-06, - "loss": 0.4216, - "mean_token_accuracy": 0.8646379739046097, - "num_tokens": 240575040.0, - "step": 199950 - }, - { - "entropy": 1.9121492326259613, - "epoch": 0.6198585044937107, - "grad_norm": 7.7319817543029785, - "learning_rate": 3.2132719537743813e-06, - "loss": 0.4168, - "mean_token_accuracy": 0.8699195802211761, - "num_tokens": 240585966.0, - "step": 199960 - }, - { - "entropy": 1.820683164894581, - "epoch": 0.6198895036187604, - "grad_norm": 8.092674255371094, - "learning_rate": 3.213191608517694e-06, - "loss": 0.399, - "mean_token_accuracy": 0.8665958195924759, - "num_tokens": 240599118.0, - "step": 199970 - }, - { - "entropy": 1.9012608036398888, - "epoch": 0.61992050274381, - "grad_norm": 8.381446838378906, - "learning_rate": 3.2131112692876087e-06, - "loss": 0.4323, - "mean_token_accuracy": 0.8655321434140205, - "num_tokens": 240611152.0, - "step": 199980 - }, - { - "entropy": 1.917787343263626, - "epoch": 0.6199515018688597, - "grad_norm": 9.232633590698242, - "learning_rate": 3.213030936083373e-06, - "loss": 0.4704, - "mean_token_accuracy": 0.8561913996934891, - "num_tokens": 240622027.0, - "step": 199990 - }, - { - "entropy": 1.8914671823382379, - "epoch": 0.6199825009939095, - "grad_norm": 8.929455757141113, - "learning_rate": 3.212950608904232e-06, - "loss": 0.4855, - "mean_token_accuracy": 0.8469156593084335, - "num_tokens": 240634070.0, - "step": 200000 } ], "logging_steps": 10, @@ -200027,7 +150027,7 @@ "attributes": {} } }, - "total_flos": 9.909176166603817e+17, + "total_flos": 7.432957984332841e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null