{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1370, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.8819891855120658, "epoch": 0.014603870025556773, "grad_norm": 0.8598580956459045, "learning_rate": 1.7475728155339808e-06, "loss": 0.8448, "mean_token_accuracy": 0.7740211486816406, "num_tokens": 770936.0, "step": 10 }, { "entropy": 0.9709278956055641, "epoch": 0.029207740051113547, "grad_norm": 0.615368127822876, "learning_rate": 3.689320388349515e-06, "loss": 0.8205, "mean_token_accuracy": 0.7773708969354629, "num_tokens": 1541367.0, "step": 20 }, { "entropy": 1.0815545290708541, "epoch": 0.04381161007667032, "grad_norm": 0.6463781595230103, "learning_rate": 5.631067961165049e-06, "loss": 0.7713, "mean_token_accuracy": 0.7880979612469673, "num_tokens": 2318852.0, "step": 30 }, { "entropy": 1.0128332734107972, "epoch": 0.058415480102227094, "grad_norm": 0.5948992371559143, "learning_rate": 7.572815533980583e-06, "loss": 0.7764, "mean_token_accuracy": 0.7841672986745835, "num_tokens": 3084308.0, "step": 40 }, { "entropy": 1.012282955646515, "epoch": 0.07301935012778386, "grad_norm": 0.6310043931007385, "learning_rate": 9.514563106796117e-06, "loss": 0.7564, "mean_token_accuracy": 0.7881451666355133, "num_tokens": 3834841.0, "step": 50 }, { "entropy": 1.0291071116924286, "epoch": 0.08762322015334063, "grad_norm": 0.6180324554443359, "learning_rate": 1.145631067961165e-05, "loss": 0.7502, "mean_token_accuracy": 0.7902129292488098, "num_tokens": 4596954.0, "step": 60 }, { "entropy": 1.0330789506435394, "epoch": 0.1022270901788974, "grad_norm": 0.6203945279121399, "learning_rate": 1.3398058252427187e-05, "loss": 0.7596, "mean_token_accuracy": 0.7884851574897767, "num_tokens": 5367587.0, "step": 70 }, { "entropy": 1.0233020395040513, "epoch": 0.11683096020445419, "grad_norm": 0.5948684811592102, "learning_rate": 1.533980582524272e-05, "loss": 0.7538, "mean_token_accuracy": 0.788135002553463, "num_tokens": 6132539.0, "step": 80 }, { "entropy": 1.022057643532753, "epoch": 0.13143483023001096, "grad_norm": 0.6073455810546875, "learning_rate": 1.7281553398058253e-05, "loss": 0.7418, "mean_token_accuracy": 0.7916493251919746, "num_tokens": 6910021.0, "step": 90 }, { "entropy": 1.0418485432863236, "epoch": 0.14603870025556773, "grad_norm": 0.56880784034729, "learning_rate": 1.922330097087379e-05, "loss": 0.7686, "mean_token_accuracy": 0.7854298010468483, "num_tokens": 7672070.0, "step": 100 }, { "entropy": 1.0249923586845398, "epoch": 0.1606425702811245, "grad_norm": 0.6060709357261658, "learning_rate": 1.9999839020034848e-05, "loss": 0.7376, "mean_token_accuracy": 0.7922860443592071, "num_tokens": 8437851.0, "step": 110 }, { "entropy": 1.0423746079206466, "epoch": 0.17524644030668127, "grad_norm": 0.5950225591659546, "learning_rate": 1.9998855272350457e-05, "loss": 0.75, "mean_token_accuracy": 0.7883219286799431, "num_tokens": 9220959.0, "step": 120 }, { "entropy": 1.058434711396694, "epoch": 0.18985031033223804, "grad_norm": 0.5805737972259521, "learning_rate": 1.99969772981684e-05, "loss": 0.754, "mean_token_accuracy": 0.7883959293365479, "num_tokens": 9998137.0, "step": 130 }, { "entropy": 1.055925799906254, "epoch": 0.2044541803577948, "grad_norm": 0.5844792723655701, "learning_rate": 1.9994205265441328e-05, "loss": 0.7443, "mean_token_accuracy": 0.7903072774410248, "num_tokens": 10776402.0, "step": 140 }, { "entropy": 1.0465368673205375, "epoch": 0.21905805038335158, "grad_norm": 0.6346914172172546, "learning_rate": 1.9990539422080134e-05, "loss": 0.7423, "mean_token_accuracy": 0.7909073531627655, "num_tokens": 11537876.0, "step": 150 }, { "entropy": 1.0467117950320244, "epoch": 0.23366192040890837, "grad_norm": 0.555743396282196, "learning_rate": 1.9985980095931774e-05, "loss": 0.7426, "mean_token_accuracy": 0.7902399882674217, "num_tokens": 12313854.0, "step": 160 }, { "entropy": 1.0474098443984985, "epoch": 0.24826579043446514, "grad_norm": 0.6585262417793274, "learning_rate": 1.9980527694749952e-05, "loss": 0.7502, "mean_token_accuracy": 0.7889217540621758, "num_tokens": 13059556.0, "step": 170 }, { "entropy": 1.046202352643013, "epoch": 0.2628696604600219, "grad_norm": 0.5887473225593567, "learning_rate": 1.9974182706158646e-05, "loss": 0.7287, "mean_token_accuracy": 0.7938704118132591, "num_tokens": 13829911.0, "step": 180 }, { "entropy": 1.0543677926063537, "epoch": 0.27747353048557866, "grad_norm": 0.5907500982284546, "learning_rate": 1.996694569760851e-05, "loss": 0.7465, "mean_token_accuracy": 0.790279072523117, "num_tokens": 14595687.0, "step": 190 }, { "entropy": 1.059071497619152, "epoch": 0.29207740051113545, "grad_norm": 0.5858122706413269, "learning_rate": 1.995881731632611e-05, "loss": 0.7376, "mean_token_accuracy": 0.7931964993476868, "num_tokens": 15362920.0, "step": 200 }, { "entropy": 1.0532679110765457, "epoch": 0.3066812705366922, "grad_norm": 0.5991299152374268, "learning_rate": 1.9949798289256054e-05, "loss": 0.7578, "mean_token_accuracy": 0.786410291492939, "num_tokens": 16128067.0, "step": 210 }, { "entropy": 1.0485742643475533, "epoch": 0.321285140562249, "grad_norm": 0.5720910429954529, "learning_rate": 1.993988942299598e-05, "loss": 0.7404, "mean_token_accuracy": 0.7914015546441078, "num_tokens": 16911462.0, "step": 220 }, { "entropy": 1.0678794473409652, "epoch": 0.3358890105878058, "grad_norm": 0.6100424528121948, "learning_rate": 1.9929091603724404e-05, "loss": 0.7479, "mean_token_accuracy": 0.7890788897871971, "num_tokens": 17697397.0, "step": 230 }, { "entropy": 1.0550393044948578, "epoch": 0.35049288061336253, "grad_norm": 0.5960067510604858, "learning_rate": 1.9917405797121484e-05, "loss": 0.7518, "mean_token_accuracy": 0.7884634032845497, "num_tokens": 18465317.0, "step": 240 }, { "entropy": 1.0647842451930045, "epoch": 0.36509675063891933, "grad_norm": 0.61286461353302, "learning_rate": 1.990483304828264e-05, "loss": 0.7568, "mean_token_accuracy": 0.7871849820017814, "num_tokens": 19218071.0, "step": 250 }, { "entropy": 1.02879488915205, "epoch": 0.3797006206644761, "grad_norm": 0.5549123883247375, "learning_rate": 1.9891374481625112e-05, "loss": 0.7221, "mean_token_accuracy": 0.7956910878419876, "num_tokens": 19985551.0, "step": 260 }, { "entropy": 1.0511873975396155, "epoch": 0.39430449069003287, "grad_norm": 0.542924702167511, "learning_rate": 1.987703130078737e-05, "loss": 0.7303, "mean_token_accuracy": 0.7930067017674446, "num_tokens": 20759854.0, "step": 270 }, { "entropy": 1.046246202290058, "epoch": 0.4089083607155896, "grad_norm": 0.5551313161849976, "learning_rate": 1.986180478852149e-05, "loss": 0.7252, "mean_token_accuracy": 0.7956920295953751, "num_tokens": 21543604.0, "step": 280 }, { "entropy": 1.0610669255256653, "epoch": 0.4235122307411464, "grad_norm": 0.629205048084259, "learning_rate": 1.9845696306578433e-05, "loss": 0.7377, "mean_token_accuracy": 0.7924546420574188, "num_tokens": 22310802.0, "step": 290 }, { "entropy": 1.0590673327445983, "epoch": 0.43811610076670315, "grad_norm": 0.6042408347129822, "learning_rate": 1.9828707295586253e-05, "loss": 0.7356, "mean_token_accuracy": 0.7922392532229423, "num_tokens": 23088849.0, "step": 300 }, { "entropy": 1.059952473640442, "epoch": 0.45271997079225995, "grad_norm": 0.6013913750648499, "learning_rate": 1.981083927492125e-05, "loss": 0.7379, "mean_token_accuracy": 0.7928509280085564, "num_tokens": 23858610.0, "step": 310 }, { "entropy": 1.0460969746112823, "epoch": 0.46732384081781675, "grad_norm": 0.5834254622459412, "learning_rate": 1.9792093842572106e-05, "loss": 0.7335, "mean_token_accuracy": 0.7924273937940598, "num_tokens": 24590316.0, "step": 320 }, { "entropy": 1.0532741829752923, "epoch": 0.4819277108433735, "grad_norm": 0.5749837160110474, "learning_rate": 1.9772472674996962e-05, "loss": 0.746, "mean_token_accuracy": 0.789176419377327, "num_tokens": 25353089.0, "step": 330 }, { "entropy": 1.0500716269016266, "epoch": 0.4965315808689303, "grad_norm": 0.5746060609817505, "learning_rate": 1.975197752697349e-05, "loss": 0.7414, "mean_token_accuracy": 0.7907495066523552, "num_tokens": 26117777.0, "step": 340 }, { "entropy": 1.051976852118969, "epoch": 0.5111354508944871, "grad_norm": 0.5868931412696838, "learning_rate": 1.973061023144196e-05, "loss": 0.7256, "mean_token_accuracy": 0.7959051370620728, "num_tokens": 26885934.0, "step": 350 }, { "entropy": 1.078227588534355, "epoch": 0.5257393209200438, "grad_norm": 0.5909886360168457, "learning_rate": 1.9708372699341297e-05, "loss": 0.7568, "mean_token_accuracy": 0.7874275401234627, "num_tokens": 27648234.0, "step": 360 }, { "entropy": 1.0532238066196442, "epoch": 0.5403431909456006, "grad_norm": 0.5472894310951233, "learning_rate": 1.9685266919438208e-05, "loss": 0.7351, "mean_token_accuracy": 0.7938356637954712, "num_tokens": 28411776.0, "step": 370 }, { "entropy": 1.0588267534971236, "epoch": 0.5549470609711573, "grad_norm": 0.5929501056671143, "learning_rate": 1.9661294958149312e-05, "loss": 0.745, "mean_token_accuracy": 0.7911377623677254, "num_tokens": 29169804.0, "step": 380 }, { "entropy": 1.0762585639953612, "epoch": 0.5695509309967142, "grad_norm": 0.6019272804260254, "learning_rate": 1.963645895935632e-05, "loss": 0.7458, "mean_token_accuracy": 0.7898744881153107, "num_tokens": 29942397.0, "step": 390 }, { "entropy": 1.0671012222766876, "epoch": 0.5841548010222709, "grad_norm": 0.6131295561790466, "learning_rate": 1.9610761144214307e-05, "loss": 0.7547, "mean_token_accuracy": 0.7888392016291619, "num_tokens": 30737478.0, "step": 400 }, { "entropy": 1.0662592992186546, "epoch": 0.5987586710478277, "grad_norm": 0.579192578792572, "learning_rate": 1.9584203810953094e-05, "loss": 0.731, "mean_token_accuracy": 0.7930623203516006, "num_tokens": 31487239.0, "step": 410 }, { "entropy": 1.0672000512480735, "epoch": 0.6133625410733844, "grad_norm": 0.5550879836082458, "learning_rate": 1.9556789334671668e-05, "loss": 0.7395, "mean_token_accuracy": 0.792091254889965, "num_tokens": 32248396.0, "step": 420 }, { "entropy": 1.0656457453966142, "epoch": 0.6279664110989412, "grad_norm": 1.1255723237991333, "learning_rate": 1.9528520167125803e-05, "loss": 0.7395, "mean_token_accuracy": 0.7922166392207146, "num_tokens": 33011134.0, "step": 430 }, { "entropy": 1.0731883138418197, "epoch": 0.642570281124498, "grad_norm": 0.5882017016410828, "learning_rate": 1.9499398836508776e-05, "loss": 0.7493, "mean_token_accuracy": 0.7877946421504021, "num_tokens": 33787966.0, "step": 440 }, { "entropy": 1.0552370175719261, "epoch": 0.6571741511500547, "grad_norm": 0.6130959987640381, "learning_rate": 1.9469427947225267e-05, "loss": 0.7111, "mean_token_accuracy": 0.7995760783553123, "num_tokens": 34559268.0, "step": 450 }, { "entropy": 1.069392178952694, "epoch": 0.6717780211756116, "grad_norm": 0.5893939733505249, "learning_rate": 1.9438610179658447e-05, "loss": 0.7364, "mean_token_accuracy": 0.7937344864010811, "num_tokens": 35306495.0, "step": 460 }, { "entropy": 1.0607215002179147, "epoch": 0.6863818912011683, "grad_norm": 0.5958935022354126, "learning_rate": 1.9406948289930247e-05, "loss": 0.7358, "mean_token_accuracy": 0.7919678211212158, "num_tokens": 36064111.0, "step": 470 }, { "entropy": 1.079184153676033, "epoch": 0.7009857612267251, "grad_norm": 0.5986253619194031, "learning_rate": 1.9374445109654888e-05, "loss": 0.7421, "mean_token_accuracy": 0.7907416477799416, "num_tokens": 36832765.0, "step": 480 }, { "entropy": 1.0671533614397049, "epoch": 0.7155896312522818, "grad_norm": 0.5463415384292603, "learning_rate": 1.9341103545685637e-05, "loss": 0.7304, "mean_token_accuracy": 0.7941543251276016, "num_tokens": 37609612.0, "step": 490 }, { "entropy": 1.0800618380308151, "epoch": 0.7301935012778387, "grad_norm": 0.7118626832962036, "learning_rate": 1.930692657985482e-05, "loss": 0.7289, "mean_token_accuracy": 0.7952305421233177, "num_tokens": 38378623.0, "step": 500 }, { "entropy": 1.051339966058731, "epoch": 0.7447973713033954, "grad_norm": 0.5645574331283569, "learning_rate": 1.927191726870718e-05, "loss": 0.7091, "mean_token_accuracy": 0.7985311895608902, "num_tokens": 39146551.0, "step": 510 }, { "entropy": 1.0769214510917664, "epoch": 0.7594012413289521, "grad_norm": 0.6002981066703796, "learning_rate": 1.9236078743226502e-05, "loss": 0.7288, "mean_token_accuracy": 0.7942462310194969, "num_tokens": 39908848.0, "step": 520 }, { "entropy": 1.0461754769086837, "epoch": 0.7740051113545089, "grad_norm": 0.5525550246238708, "learning_rate": 1.919941420855559e-05, "loss": 0.7058, "mean_token_accuracy": 0.8000339075922966, "num_tokens": 40694755.0, "step": 530 }, { "entropy": 1.0753471747040748, "epoch": 0.7886089813800657, "grad_norm": 0.5900936722755432, "learning_rate": 1.916192694370965e-05, "loss": 0.7535, "mean_token_accuracy": 0.7890919044613838, "num_tokens": 41468815.0, "step": 540 }, { "entropy": 1.0729323342442512, "epoch": 0.8032128514056225, "grad_norm": 0.6021299958229065, "learning_rate": 1.912362030128302e-05, "loss": 0.7149, "mean_token_accuracy": 0.7978271067142486, "num_tokens": 42243180.0, "step": 550 }, { "entropy": 1.0811306938529015, "epoch": 0.8178167214311792, "grad_norm": 0.5898880958557129, "learning_rate": 1.9084497707149337e-05, "loss": 0.7383, "mean_token_accuracy": 0.7915405228734016, "num_tokens": 43002684.0, "step": 560 }, { "entropy": 1.0548559844493866, "epoch": 0.8324205914567361, "grad_norm": 0.6246165633201599, "learning_rate": 1.9044562660155158e-05, "loss": 0.7287, "mean_token_accuracy": 0.7933633595705032, "num_tokens": 43756816.0, "step": 570 }, { "entropy": 1.0436548948287965, "epoch": 0.8470244614822928, "grad_norm": 0.5454269051551819, "learning_rate": 1.900381873180704e-05, "loss": 0.7019, "mean_token_accuracy": 0.8000532567501069, "num_tokens": 44502793.0, "step": 580 }, { "entropy": 1.061595305800438, "epoch": 0.8616283315078496, "grad_norm": 0.5577073693275452, "learning_rate": 1.896226956595214e-05, "loss": 0.7347, "mean_token_accuracy": 0.7934001550078392, "num_tokens": 45260661.0, "step": 590 }, { "entropy": 1.0671459570527078, "epoch": 0.8762322015334063, "grad_norm": 0.5666953325271606, "learning_rate": 1.891991887845233e-05, "loss": 0.7157, "mean_token_accuracy": 0.7979460313916207, "num_tokens": 46040873.0, "step": 600 }, { "entropy": 1.0621700644493104, "epoch": 0.8908360715589632, "grad_norm": 0.5529576539993286, "learning_rate": 1.887677045685188e-05, "loss": 0.7252, "mean_token_accuracy": 0.7951329663395882, "num_tokens": 46818910.0, "step": 610 }, { "entropy": 1.0564261555671692, "epoch": 0.9054399415845199, "grad_norm": 0.6081776022911072, "learning_rate": 1.8832828160038717e-05, "loss": 0.7224, "mean_token_accuracy": 0.7945129945874214, "num_tokens": 47581801.0, "step": 620 }, { "entropy": 1.0867107123136521, "epoch": 0.9200438116100766, "grad_norm": 0.5485532283782959, "learning_rate": 1.8788095917899322e-05, "loss": 0.7397, "mean_token_accuracy": 0.7922059059143066, "num_tokens": 48353228.0, "step": 630 }, { "entropy": 1.0725250199437142, "epoch": 0.9346476816356335, "grad_norm": 0.5478349924087524, "learning_rate": 1.8742577730967275e-05, "loss": 0.7282, "mean_token_accuracy": 0.7937524914741516, "num_tokens": 49128771.0, "step": 640 }, { "entropy": 1.065740318596363, "epoch": 0.9492515516611902, "grad_norm": 0.5200572609901428, "learning_rate": 1.8696277670065453e-05, "loss": 0.7158, "mean_token_accuracy": 0.7966950073838234, "num_tokens": 49898864.0, "step": 650 }, { "entropy": 1.0765223398804664, "epoch": 0.963855421686747, "grad_norm": 0.5623005032539368, "learning_rate": 1.8649199875942e-05, "loss": 0.7316, "mean_token_accuracy": 0.79328583329916, "num_tokens": 50667692.0, "step": 660 }, { "entropy": 1.0664428249001503, "epoch": 0.9784592917123037, "grad_norm": 0.5457090735435486, "learning_rate": 1.860134855889997e-05, "loss": 0.7192, "mean_token_accuracy": 0.7951617255806923, "num_tokens": 51439436.0, "step": 670 }, { "entropy": 1.054172757267952, "epoch": 0.9930631617378606, "grad_norm": 0.558342456817627, "learning_rate": 1.8552727998420815e-05, "loss": 0.7284, "mean_token_accuracy": 0.795482975244522, "num_tokens": 52216680.0, "step": 680 }, { "entropy": 1.0329305483744695, "epoch": 1.0073019350127783, "grad_norm": 0.519607424736023, "learning_rate": 1.850334254278164e-05, "loss": 0.6312, "mean_token_accuracy": 0.8187721814864721, "num_tokens": 52953330.0, "step": 690 }, { "entropy": 0.8502921864390374, "epoch": 1.0219058050383352, "grad_norm": 0.6058902740478516, "learning_rate": 1.845319660866635e-05, "loss": 0.5259, "mean_token_accuracy": 0.8443149983882904, "num_tokens": 53713129.0, "step": 700 }, { "entropy": 0.8117894425988197, "epoch": 1.036509675063892, "grad_norm": 0.5925081968307495, "learning_rate": 1.8402294680770607e-05, "loss": 0.5039, "mean_token_accuracy": 0.8498010948300362, "num_tokens": 54480149.0, "step": 710 }, { "entropy": 0.7792180389165878, "epoch": 1.0511135450894487, "grad_norm": 0.637313961982727, "learning_rate": 1.8350641311400813e-05, "loss": 0.4964, "mean_token_accuracy": 0.851611290872097, "num_tokens": 55232208.0, "step": 720 }, { "entropy": 0.8110285863280297, "epoch": 1.0657174151150055, "grad_norm": 0.6269355416297913, "learning_rate": 1.8298241120066923e-05, "loss": 0.5102, "mean_token_accuracy": 0.8473430201411247, "num_tokens": 56017632.0, "step": 730 }, { "entropy": 0.805873404443264, "epoch": 1.0803212851405624, "grad_norm": 0.6263405084609985, "learning_rate": 1.8245098793069353e-05, "loss": 0.4989, "mean_token_accuracy": 0.8503431648015976, "num_tokens": 56794094.0, "step": 740 }, { "entropy": 0.7984029710292816, "epoch": 1.094925155166119, "grad_norm": 0.6596930623054504, "learning_rate": 1.819121908307985e-05, "loss": 0.5029, "mean_token_accuracy": 0.8493917599320412, "num_tokens": 57569065.0, "step": 750 }, { "entropy": 0.8163655236363411, "epoch": 1.1095290251916758, "grad_norm": 0.6013880372047424, "learning_rate": 1.813660680871645e-05, "loss": 0.508, "mean_token_accuracy": 0.8476730227470398, "num_tokens": 58342947.0, "step": 760 }, { "entropy": 0.8022693067789077, "epoch": 1.1241328952172325, "grad_norm": 0.6172059774398804, "learning_rate": 1.8081266854112536e-05, "loss": 0.5066, "mean_token_accuracy": 0.847652480006218, "num_tokens": 59095076.0, "step": 770 }, { "entropy": 0.8082443997263908, "epoch": 1.1387367652427893, "grad_norm": 0.5798039436340332, "learning_rate": 1.8025204168480036e-05, "loss": 0.5084, "mean_token_accuracy": 0.8473975166678429, "num_tokens": 59855251.0, "step": 780 }, { "entropy": 0.8160747662186623, "epoch": 1.1533406352683462, "grad_norm": 0.6323524713516235, "learning_rate": 1.7968423765666805e-05, "loss": 0.5181, "mean_token_accuracy": 0.8460124984383584, "num_tokens": 60616729.0, "step": 790 }, { "entropy": 0.8220678076148034, "epoch": 1.1679445052939028, "grad_norm": 0.5704360008239746, "learning_rate": 1.7910930723708206e-05, "loss": 0.4989, "mean_token_accuracy": 0.8500168919563293, "num_tokens": 61391607.0, "step": 800 }, { "entropy": 0.8171453341841698, "epoch": 1.1825483753194597, "grad_norm": 0.6301026940345764, "learning_rate": 1.7852730184372996e-05, "loss": 0.5094, "mean_token_accuracy": 0.847735871374607, "num_tokens": 62176326.0, "step": 810 }, { "entropy": 0.8387730494141579, "epoch": 1.1971522453450165, "grad_norm": 0.6920040845870972, "learning_rate": 1.779382735270345e-05, "loss": 0.528, "mean_token_accuracy": 0.8430045962333679, "num_tokens": 62936900.0, "step": 820 }, { "entropy": 0.8173261970281601, "epoch": 1.2117561153705732, "grad_norm": 0.6174663305282593, "learning_rate": 1.773422749654988e-05, "loss": 0.505, "mean_token_accuracy": 0.849335603415966, "num_tokens": 63714937.0, "step": 830 }, { "entropy": 0.8201940849423408, "epoch": 1.22635998539613, "grad_norm": 0.6521441340446472, "learning_rate": 1.7673935946099515e-05, "loss": 0.5169, "mean_token_accuracy": 0.8452720895409584, "num_tokens": 64484592.0, "step": 840 }, { "entropy": 0.8129187062382698, "epoch": 1.2409638554216866, "grad_norm": 0.6480047106742859, "learning_rate": 1.7612958093399793e-05, "loss": 0.5101, "mean_token_accuracy": 0.847150382399559, "num_tokens": 65242523.0, "step": 850 }, { "entropy": 0.797595490515232, "epoch": 1.2555677254472435, "grad_norm": 0.6072301268577576, "learning_rate": 1.7551299391876147e-05, "loss": 0.4954, "mean_token_accuracy": 0.8505616560578346, "num_tokens": 66010969.0, "step": 860 }, { "entropy": 0.8110659316182136, "epoch": 1.2701715954728003, "grad_norm": 0.6669343709945679, "learning_rate": 1.7488965355844293e-05, "loss": 0.5132, "mean_token_accuracy": 0.8465436458587646, "num_tokens": 66767531.0, "step": 870 }, { "entropy": 0.8244558870792389, "epoch": 1.284775465498357, "grad_norm": 0.6384320855140686, "learning_rate": 1.742596156001705e-05, "loss": 0.5276, "mean_token_accuracy": 0.8408863857388497, "num_tokens": 67498692.0, "step": 880 }, { "entropy": 0.8114714965224266, "epoch": 1.2993793355239138, "grad_norm": 0.5711145401000977, "learning_rate": 1.73622936390058e-05, "loss": 0.5139, "mean_token_accuracy": 0.8463606715202332, "num_tokens": 68285940.0, "step": 890 }, { "entropy": 0.8195077747106552, "epoch": 1.3139832055494707, "grad_norm": 0.6267218589782715, "learning_rate": 1.7297967286816553e-05, "loss": 0.5154, "mean_token_accuracy": 0.8449677467346192, "num_tokens": 69039571.0, "step": 900 }, { "entropy": 0.8230465367436409, "epoch": 1.3285870755750273, "grad_norm": 0.6753976345062256, "learning_rate": 1.723298825634072e-05, "loss": 0.5357, "mean_token_accuracy": 0.8401415064930916, "num_tokens": 69793108.0, "step": 910 }, { "entropy": 0.8096449464559555, "epoch": 1.3431909456005842, "grad_norm": 0.6720008850097656, "learning_rate": 1.716736235884062e-05, "loss": 0.5098, "mean_token_accuracy": 0.8470739260315895, "num_tokens": 70568130.0, "step": 920 }, { "entropy": 0.8186160072684288, "epoch": 1.357794815626141, "grad_norm": 0.5903974771499634, "learning_rate": 1.7101095463429748e-05, "loss": 0.5198, "mean_token_accuracy": 0.8454992339015007, "num_tokens": 71349507.0, "step": 930 }, { "entropy": 0.8150006383657455, "epoch": 1.3723986856516976, "grad_norm": 0.5898988842964172, "learning_rate": 1.7034193496547903e-05, "loss": 0.5071, "mean_token_accuracy": 0.8491646945476532, "num_tokens": 72135471.0, "step": 940 }, { "entropy": 0.808189807832241, "epoch": 1.3870025556772545, "grad_norm": 0.5973511934280396, "learning_rate": 1.6966662441431157e-05, "loss": 0.5321, "mean_token_accuracy": 0.8418431639671325, "num_tokens": 72886499.0, "step": 950 }, { "entropy": 0.8094048380851746, "epoch": 1.4016064257028114, "grad_norm": 0.6130071878433228, "learning_rate": 1.6898508337576754e-05, "loss": 0.519, "mean_token_accuracy": 0.8449132606387139, "num_tokens": 73639876.0, "step": 960 }, { "entropy": 0.8094017982482911, "epoch": 1.416210295728368, "grad_norm": 0.6462528705596924, "learning_rate": 1.6829737280203e-05, "loss": 0.5158, "mean_token_accuracy": 0.8453432083129883, "num_tokens": 74401459.0, "step": 970 }, { "entropy": 0.8205527007579804, "epoch": 1.4308141657539248, "grad_norm": 0.6410425901412964, "learning_rate": 1.676035541970411e-05, "loss": 0.5366, "mean_token_accuracy": 0.8401491552591324, "num_tokens": 75180262.0, "step": 980 }, { "entropy": 0.8101997837424278, "epoch": 1.4454180357794817, "grad_norm": 0.6770428419113159, "learning_rate": 1.669036896110021e-05, "loss": 0.5278, "mean_token_accuracy": 0.8413688018918037, "num_tokens": 75935757.0, "step": 990 }, { "entropy": 0.8236281529068947, "epoch": 1.4600219058050383, "grad_norm": 0.5985122323036194, "learning_rate": 1.6619784163482372e-05, "loss": 0.5169, "mean_token_accuracy": 0.8454629138112069, "num_tokens": 76677214.0, "step": 1000 }, { "entropy": 0.8149201571941376, "epoch": 1.4746257758305952, "grad_norm": 0.6033229231834412, "learning_rate": 1.6548607339452853e-05, "loss": 0.5157, "mean_token_accuracy": 0.8454660639166832, "num_tokens": 77429839.0, "step": 1010 }, { "entropy": 0.8358469530940056, "epoch": 1.4892296458561518, "grad_norm": 0.6250749826431274, "learning_rate": 1.6476844854560537e-05, "loss": 0.5274, "mean_token_accuracy": 0.8430205255746841, "num_tokens": 78198665.0, "step": 1020 }, { "entropy": 0.8159400016069412, "epoch": 1.5038335158817087, "grad_norm": 0.6094375848770142, "learning_rate": 1.640450312673166e-05, "loss": 0.5237, "mean_token_accuracy": 0.8430356681346893, "num_tokens": 78963745.0, "step": 1030 }, { "entropy": 0.7948550447821617, "epoch": 1.5184373859072653, "grad_norm": 0.6190559267997742, "learning_rate": 1.6331588625695823e-05, "loss": 0.4984, "mean_token_accuracy": 0.850845368206501, "num_tokens": 79742482.0, "step": 1040 }, { "entropy": 0.8203089535236359, "epoch": 1.5330412559328221, "grad_norm": 0.627266526222229, "learning_rate": 1.6258107872407376e-05, "loss": 0.5222, "mean_token_accuracy": 0.8443042784929276, "num_tokens": 80513082.0, "step": 1050 }, { "entropy": 0.8034705385565758, "epoch": 1.547645125958379, "grad_norm": 0.5822563767433167, "learning_rate": 1.6184067438462268e-05, "loss": 0.5108, "mean_token_accuracy": 0.8467739015817642, "num_tokens": 81287042.0, "step": 1060 }, { "entropy": 0.7996939823031426, "epoch": 1.5622489959839356, "grad_norm": 0.8322076201438904, "learning_rate": 1.6109473945510277e-05, "loss": 0.5059, "mean_token_accuracy": 0.8484288737177849, "num_tokens": 82062918.0, "step": 1070 }, { "entropy": 0.8242404267191887, "epoch": 1.5768528660094925, "grad_norm": 0.6267576217651367, "learning_rate": 1.6034334064662868e-05, "loss": 0.5236, "mean_token_accuracy": 0.8428681313991546, "num_tokens": 82835236.0, "step": 1080 }, { "entropy": 0.8226122260093689, "epoch": 1.5914567360350493, "grad_norm": 0.578869104385376, "learning_rate": 1.595865451589654e-05, "loss": 0.522, "mean_token_accuracy": 0.8434418380260468, "num_tokens": 83610617.0, "step": 1090 }, { "entropy": 0.8142560958862305, "epoch": 1.606060606060606, "grad_norm": 0.652740478515625, "learning_rate": 1.5882442067451856e-05, "loss": 0.52, "mean_token_accuracy": 0.8449100449681282, "num_tokens": 84379036.0, "step": 1100 }, { "entropy": 0.8149377256631851, "epoch": 1.6206644760861628, "grad_norm": 0.5886592864990234, "learning_rate": 1.5805703535228137e-05, "loss": 0.5318, "mean_token_accuracy": 0.8409554123878479, "num_tokens": 85126712.0, "step": 1110 }, { "entropy": 0.8216987118124962, "epoch": 1.6352683461117197, "grad_norm": 0.625464916229248, "learning_rate": 1.5728445782173896e-05, "loss": 0.5213, "mean_token_accuracy": 0.8435990065336227, "num_tokens": 85888279.0, "step": 1120 }, { "entropy": 0.8107628434896469, "epoch": 1.6498722161372763, "grad_norm": 0.6127883195877075, "learning_rate": 1.565067571767306e-05, "loss": 0.5307, "mean_token_accuracy": 0.8404728651046753, "num_tokens": 86651718.0, "step": 1130 }, { "entropy": 0.8122030794620514, "epoch": 1.6644760861628332, "grad_norm": 0.6447589993476868, "learning_rate": 1.557240029692705e-05, "loss": 0.5288, "mean_token_accuracy": 0.8413975268602372, "num_tokens": 87418899.0, "step": 1140 }, { "entropy": 0.8063643991947174, "epoch": 1.67907995618839, "grad_norm": 0.6214607357978821, "learning_rate": 1.5493626520332758e-05, "loss": 0.5046, "mean_token_accuracy": 0.8487621054053307, "num_tokens": 88197755.0, "step": 1150 }, { "entropy": 0.811661048233509, "epoch": 1.6936838262139466, "grad_norm": 0.587363600730896, "learning_rate": 1.5414361432856475e-05, "loss": 0.5232, "mean_token_accuracy": 0.8434179335832596, "num_tokens": 88960882.0, "step": 1160 }, { "entropy": 0.824562780559063, "epoch": 1.7082876962395035, "grad_norm": 0.5840896964073181, "learning_rate": 1.533461212340384e-05, "loss": 0.5295, "mean_token_accuracy": 0.8425872087478637, "num_tokens": 89730659.0, "step": 1170 }, { "entropy": 0.8084111362695694, "epoch": 1.7228915662650603, "grad_norm": 0.6445785760879517, "learning_rate": 1.5254385724185872e-05, "loss": 0.5189, "mean_token_accuracy": 0.8443948805332184, "num_tokens": 90519374.0, "step": 1180 }, { "entropy": 0.802851003408432, "epoch": 1.737495436290617, "grad_norm": 0.5873962044715881, "learning_rate": 1.5173689410081091e-05, "loss": 0.5215, "mean_token_accuracy": 0.8436427339911461, "num_tokens": 91280437.0, "step": 1190 }, { "entropy": 0.8080698132514954, "epoch": 1.7520993063161738, "grad_norm": 0.594432532787323, "learning_rate": 1.5092530397993877e-05, "loss": 0.519, "mean_token_accuracy": 0.8449363052845001, "num_tokens": 92051212.0, "step": 1200 }, { "entropy": 0.8046671271324157, "epoch": 1.7667031763417307, "grad_norm": 0.6073573231697083, "learning_rate": 1.5010915946209013e-05, "loss": 0.5207, "mean_token_accuracy": 0.8437707021832466, "num_tokens": 92824656.0, "step": 1210 }, { "entropy": 0.799671696126461, "epoch": 1.7813070463672873, "grad_norm": 0.5851039886474609, "learning_rate": 1.492885335374258e-05, "loss": 0.5171, "mean_token_accuracy": 0.845556665956974, "num_tokens": 93601248.0, "step": 1220 }, { "entropy": 0.7995420083403587, "epoch": 1.795910916392844, "grad_norm": 0.6009591221809387, "learning_rate": 1.4846349959689166e-05, "loss": 0.5064, "mean_token_accuracy": 0.8478008210659027, "num_tokens": 94367653.0, "step": 1230 }, { "entropy": 0.8032964497804642, "epoch": 1.810514786418401, "grad_norm": 0.6265828013420105, "learning_rate": 1.4763413142565524e-05, "loss": 0.5126, "mean_token_accuracy": 0.8462978065013885, "num_tokens": 95134532.0, "step": 1240 }, { "entropy": 0.8157682687044143, "epoch": 1.8251186564439577, "grad_norm": 0.6303804516792297, "learning_rate": 1.468005031965068e-05, "loss": 0.5285, "mean_token_accuracy": 0.8411597847938538, "num_tokens": 95888124.0, "step": 1250 }, { "entropy": 0.8242416352033615, "epoch": 1.8397225264695143, "grad_norm": 0.6679196953773499, "learning_rate": 1.4596268946322587e-05, "loss": 0.5277, "mean_token_accuracy": 0.8423770368099213, "num_tokens": 96640919.0, "step": 1260 }, { "entropy": 0.8027110368013382, "epoch": 1.8543263964950711, "grad_norm": 0.6514571309089661, "learning_rate": 1.4512076515391375e-05, "loss": 0.5144, "mean_token_accuracy": 0.845685575902462, "num_tokens": 97402349.0, "step": 1270 }, { "entropy": 0.8184534996747971, "epoch": 1.868930266520628, "grad_norm": 0.6248390078544617, "learning_rate": 1.4427480556429237e-05, "loss": 0.5232, "mean_token_accuracy": 0.8441895946860314, "num_tokens": 98195965.0, "step": 1280 }, { "entropy": 0.8160968586802483, "epoch": 1.8835341365461846, "grad_norm": 0.6021704077720642, "learning_rate": 1.4342488635097044e-05, "loss": 0.5149, "mean_token_accuracy": 0.8458479061722756, "num_tokens": 98968374.0, "step": 1290 }, { "entropy": 0.8257978558540344, "epoch": 1.8981380065717415, "grad_norm": 0.5873125195503235, "learning_rate": 1.425710835246773e-05, "loss": 0.5265, "mean_token_accuracy": 0.8433559283614158, "num_tokens": 99729725.0, "step": 1300 }, { "entropy": 0.8362466841936111, "epoch": 1.9127418765972983, "grad_norm": 0.6376807689666748, "learning_rate": 1.4171347344346494e-05, "loss": 0.5343, "mean_token_accuracy": 0.8405352383852005, "num_tokens": 100511457.0, "step": 1310 }, { "entropy": 0.806542806327343, "epoch": 1.927345746622855, "grad_norm": 0.6307621002197266, "learning_rate": 1.4085213280587916e-05, "loss": 0.508, "mean_token_accuracy": 0.8474292501807212, "num_tokens": 101294648.0, "step": 1320 }, { "entropy": 0.828097864985466, "epoch": 1.9419496166484118, "grad_norm": 0.6401568055152893, "learning_rate": 1.3998713864410029e-05, "loss": 0.523, "mean_token_accuracy": 0.8436363622546196, "num_tokens": 102057660.0, "step": 1330 }, { "entropy": 0.8092041403055191, "epoch": 1.9565534866739687, "grad_norm": 0.5988907217979431, "learning_rate": 1.3911856831705372e-05, "loss": 0.5199, "mean_token_accuracy": 0.8438414841890335, "num_tokens": 102826372.0, "step": 1340 }, { "entropy": 0.8254723891615867, "epoch": 1.9711573566995253, "grad_norm": 0.6260574460029602, "learning_rate": 1.3824649950349173e-05, "loss": 0.5307, "mean_token_accuracy": 0.8420073762536049, "num_tokens": 103585161.0, "step": 1350 }, { "entropy": 0.8164636805653572, "epoch": 1.9857612267250822, "grad_norm": 0.6160163879394531, "learning_rate": 1.373710101950464e-05, "loss": 0.5213, "mean_token_accuracy": 0.8451445356011391, "num_tokens": 104380569.0, "step": 1360 }, { "entropy": 0.8088174370618967, "epoch": 2.0, "grad_norm": 0.6965954303741455, "learning_rate": 1.3649217868925435e-05, "loss": 0.5143, "mean_token_accuracy": 0.8466584361516513, "num_tokens": 105148277.0, "step": 1370 } ], "logging_steps": 10, "max_steps": 3425, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.54747959995138e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }