{ "best_global_step": 12500, "best_metric": 0.012303678318858147, "best_model_checkpoint": "./sft_model/checkpoint-12500", "epoch": 7.054176072234763, "eval_steps": 500, "global_step": 12500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.7264264225959778, "epoch": 0.0028216704288939053, "grad_norm": 1.9348883628845215, "learning_rate": 4.999999882129922e-06, "loss": 0.7593, "mean_token_accuracy": 0.8013904571533204, "num_tokens": 40154.0, "step": 5 }, { "entropy": 0.8295215606689453, "epoch": 0.0056433408577878106, "grad_norm": 1.6994550228118896, "learning_rate": 4.999999403282752e-06, "loss": 0.6675, "mean_token_accuracy": 0.8162339448928833, "num_tokens": 80736.0, "step": 10 }, { "entropy": 0.8583701252937317, "epoch": 0.008465011286681716, "grad_norm": 1.4073373079299927, "learning_rate": 4.999998556091706e-06, "loss": 0.7051, "mean_token_accuracy": 0.8052006244659424, "num_tokens": 121191.0, "step": 15 }, { "entropy": 0.8272644519805908, "epoch": 0.011286681715575621, "grad_norm": 1.322540044784546, "learning_rate": 4.999997340556951e-06, "loss": 0.6492, "mean_token_accuracy": 0.814669954776764, "num_tokens": 161873.0, "step": 20 }, { "entropy": 0.7710798859596253, "epoch": 0.014108352144469526, "grad_norm": 1.4983023405075073, "learning_rate": 4.999995756678724e-06, "loss": 0.6253, "mean_token_accuracy": 0.8207517623901367, "num_tokens": 202530.0, "step": 25 }, { "entropy": 0.7607816934585572, "epoch": 0.016930022573363433, "grad_norm": 1.315825343132019, "learning_rate": 4.999993804457336e-06, "loss": 0.6278, "mean_token_accuracy": 0.8195699453353882, "num_tokens": 242963.0, "step": 30 }, { "entropy": 0.6939435482025147, "epoch": 0.019751693002257337, "grad_norm": 1.3920537233352661, "learning_rate": 4.999991483893173e-06, "loss": 0.5477, "mean_token_accuracy": 0.8409396290779114, "num_tokens": 283687.0, "step": 35 }, { "entropy": 0.7462215900421143, "epoch": 0.022573363431151242, "grad_norm": 1.3442474603652954, "learning_rate": 4.999988794986688e-06, "loss": 0.6124, "mean_token_accuracy": 0.822938334941864, "num_tokens": 324425.0, "step": 40 }, { "entropy": 0.7756533026695251, "epoch": 0.025395033860045147, "grad_norm": 1.3415160179138184, "learning_rate": 4.999985737738411e-06, "loss": 0.6479, "mean_token_accuracy": 0.8175343155860901, "num_tokens": 365286.0, "step": 45 }, { "entropy": 0.7056244015693665, "epoch": 0.028216704288939052, "grad_norm": 1.4092621803283691, "learning_rate": 4.999982312148941e-06, "loss": 0.5813, "mean_token_accuracy": 0.8333210825920105, "num_tokens": 405893.0, "step": 50 }, { "entropy": 0.7671143412590027, "epoch": 0.031038374717832957, "grad_norm": 1.3218460083007812, "learning_rate": 4.999978518218954e-06, "loss": 0.6215, "mean_token_accuracy": 0.8200177073478698, "num_tokens": 446486.0, "step": 55 }, { "entropy": 0.71772620677948, "epoch": 0.033860045146726865, "grad_norm": 1.3232901096343994, "learning_rate": 4.999974355949192e-06, "loss": 0.5928, "mean_token_accuracy": 0.825872802734375, "num_tokens": 487151.0, "step": 60 }, { "entropy": 0.8025589108467102, "epoch": 0.03668171557562077, "grad_norm": 1.3470555543899536, "learning_rate": 4.999969825340475e-06, "loss": 0.6601, "mean_token_accuracy": 0.8110421895980835, "num_tokens": 527748.0, "step": 65 }, { "entropy": 0.6747750401496887, "epoch": 0.039503386004514675, "grad_norm": 1.3361178636550903, "learning_rate": 4.999964926393691e-06, "loss": 0.5425, "mean_token_accuracy": 0.8391481637954712, "num_tokens": 568462.0, "step": 70 }, { "entropy": 0.7461091756820679, "epoch": 0.04232505643340858, "grad_norm": 1.4912503957748413, "learning_rate": 4.999959659109804e-06, "loss": 0.5997, "mean_token_accuracy": 0.8242739796638489, "num_tokens": 608879.0, "step": 75 }, { "entropy": 0.7261611700057984, "epoch": 0.045146726862302484, "grad_norm": 1.2540172338485718, "learning_rate": 4.999954023489848e-06, "loss": 0.5904, "mean_token_accuracy": 0.8286155581474304, "num_tokens": 649552.0, "step": 80 }, { "entropy": 0.7150743484497071, "epoch": 0.04796839729119639, "grad_norm": 1.4506585597991943, "learning_rate": 4.99994801953493e-06, "loss": 0.5708, "mean_token_accuracy": 0.8319103837013244, "num_tokens": 690358.0, "step": 85 }, { "entropy": 0.6064900994300843, "epoch": 0.050790067720090294, "grad_norm": 1.2554188966751099, "learning_rate": 4.999941647246231e-06, "loss": 0.4921, "mean_token_accuracy": 0.851774251461029, "num_tokens": 731078.0, "step": 90 }, { "entropy": 0.6873465299606323, "epoch": 0.0536117381489842, "grad_norm": 1.2316175699234009, "learning_rate": 4.9999349066250014e-06, "loss": 0.5744, "mean_token_accuracy": 0.8320832848548889, "num_tokens": 771649.0, "step": 95 }, { "entropy": 0.7353047490119934, "epoch": 0.056433408577878104, "grad_norm": 2.000030994415283, "learning_rate": 4.9999277976725655e-06, "loss": 0.5994, "mean_token_accuracy": 0.8255870938301086, "num_tokens": 812286.0, "step": 100 }, { "entropy": 0.7349668025970459, "epoch": 0.05925507900677201, "grad_norm": 1.4496697187423706, "learning_rate": 4.999920320390319e-06, "loss": 0.5832, "mean_token_accuracy": 0.8332556962966919, "num_tokens": 853019.0, "step": 105 }, { "entropy": 0.7148452520370483, "epoch": 0.062076749435665914, "grad_norm": 1.39479660987854, "learning_rate": 4.999912474779733e-06, "loss": 0.6143, "mean_token_accuracy": 0.8211281180381775, "num_tokens": 893817.0, "step": 110 }, { "entropy": 0.7357437491416932, "epoch": 0.06489841986455983, "grad_norm": 1.3685375452041626, "learning_rate": 4.999904260842348e-06, "loss": 0.5979, "mean_token_accuracy": 0.8241528987884521, "num_tokens": 934521.0, "step": 115 }, { "entropy": 0.6611983060836792, "epoch": 0.06772009029345373, "grad_norm": 1.266114354133606, "learning_rate": 4.999895678579776e-06, "loss": 0.5311, "mean_token_accuracy": 0.8424633145332336, "num_tokens": 975258.0, "step": 120 }, { "entropy": 0.6882431030273437, "epoch": 0.07054176072234764, "grad_norm": 1.4671064615249634, "learning_rate": 4.999886727993704e-06, "loss": 0.5649, "mean_token_accuracy": 0.8341028809547424, "num_tokens": 1015761.0, "step": 125 }, { "entropy": 0.7173007011413575, "epoch": 0.07336343115124154, "grad_norm": 1.4032313823699951, "learning_rate": 4.999877409085892e-06, "loss": 0.5785, "mean_token_accuracy": 0.8307689428329468, "num_tokens": 1056694.0, "step": 130 }, { "entropy": 0.6990100502967834, "epoch": 0.07618510158013544, "grad_norm": 1.3346563577651978, "learning_rate": 4.999867721858168e-06, "loss": 0.591, "mean_token_accuracy": 0.8263980627059937, "num_tokens": 1097488.0, "step": 135 }, { "entropy": 0.6879770636558533, "epoch": 0.07900677200902935, "grad_norm": 1.5278775691986084, "learning_rate": 4.999857666312438e-06, "loss": 0.5686, "mean_token_accuracy": 0.8352189302444458, "num_tokens": 1138244.0, "step": 140 }, { "entropy": 0.6410947203636169, "epoch": 0.08182844243792325, "grad_norm": 1.5688629150390625, "learning_rate": 4.999847242450674e-06, "loss": 0.5137, "mean_token_accuracy": 0.8461119532585144, "num_tokens": 1179012.0, "step": 145 }, { "entropy": 0.669737708568573, "epoch": 0.08465011286681716, "grad_norm": 1.421505093574524, "learning_rate": 4.999836450274926e-06, "loss": 0.5398, "mean_token_accuracy": 0.8409364104270936, "num_tokens": 1219636.0, "step": 150 }, { "entropy": 0.7268872022628784, "epoch": 0.08747178329571106, "grad_norm": 1.4139888286590576, "learning_rate": 4.999825289787314e-06, "loss": 0.5906, "mean_token_accuracy": 0.8278973698616028, "num_tokens": 1260358.0, "step": 155 }, { "entropy": 0.6793547749519349, "epoch": 0.09029345372460497, "grad_norm": 1.3994945287704468, "learning_rate": 4.99981376099003e-06, "loss": 0.5473, "mean_token_accuracy": 0.8384710073471069, "num_tokens": 1300160.0, "step": 160 }, { "entropy": 0.7041888356208801, "epoch": 0.09311512415349887, "grad_norm": 1.2893599271774292, "learning_rate": 4.999801863885339e-06, "loss": 0.5769, "mean_token_accuracy": 0.8308470964431762, "num_tokens": 1340909.0, "step": 165 }, { "entropy": 0.7476347208023071, "epoch": 0.09593679458239278, "grad_norm": 1.4180552959442139, "learning_rate": 4.999789598475578e-06, "loss": 0.6191, "mean_token_accuracy": 0.8223087549209595, "num_tokens": 1381254.0, "step": 170 }, { "entropy": 0.7091502547264099, "epoch": 0.09875846501128668, "grad_norm": 1.2832435369491577, "learning_rate": 4.999776964763157e-06, "loss": 0.5698, "mean_token_accuracy": 0.8346888542175293, "num_tokens": 1421956.0, "step": 175 }, { "entropy": 0.6569717288017273, "epoch": 0.10158013544018059, "grad_norm": 1.4216198921203613, "learning_rate": 4.999763962750557e-06, "loss": 0.5289, "mean_token_accuracy": 0.8407601952552796, "num_tokens": 1462724.0, "step": 180 }, { "entropy": 0.7254927158355713, "epoch": 0.1044018058690745, "grad_norm": 1.367143154144287, "learning_rate": 4.999750592440333e-06, "loss": 0.603, "mean_token_accuracy": 0.8244819164276123, "num_tokens": 1503032.0, "step": 185 }, { "entropy": 0.6174264311790466, "epoch": 0.1072234762979684, "grad_norm": 1.1871576309204102, "learning_rate": 4.999736853835111e-06, "loss": 0.5176, "mean_token_accuracy": 0.8440237402915954, "num_tokens": 1543678.0, "step": 190 }, { "entropy": 0.6621920347213746, "epoch": 0.1100451467268623, "grad_norm": 1.3519974946975708, "learning_rate": 4.999722746937591e-06, "loss": 0.5392, "mean_token_accuracy": 0.8406000256538391, "num_tokens": 1584469.0, "step": 195 }, { "entropy": 0.72998526096344, "epoch": 0.11286681715575621, "grad_norm": 1.5044736862182617, "learning_rate": 4.999708271750544e-06, "loss": 0.5784, "mean_token_accuracy": 0.8296970129013062, "num_tokens": 1625117.0, "step": 200 }, { "entropy": 0.620005464553833, "epoch": 0.11568848758465011, "grad_norm": 1.2598135471343994, "learning_rate": 4.999693428276813e-06, "loss": 0.5077, "mean_token_accuracy": 0.8482168078422546, "num_tokens": 1665890.0, "step": 205 }, { "entropy": 0.7349537253379822, "epoch": 0.11851015801354402, "grad_norm": 1.290712833404541, "learning_rate": 4.999678216519314e-06, "loss": 0.5826, "mean_token_accuracy": 0.8309862017631531, "num_tokens": 1706501.0, "step": 210 }, { "entropy": 0.6702666759490967, "epoch": 0.12133182844243792, "grad_norm": 1.1792408227920532, "learning_rate": 4.999662636481035e-06, "loss": 0.5404, "mean_token_accuracy": 0.8420506477355957, "num_tokens": 1747287.0, "step": 215 }, { "entropy": 0.7209498405456543, "epoch": 0.12415349887133183, "grad_norm": 1.4285236597061157, "learning_rate": 4.999646688165039e-06, "loss": 0.5868, "mean_token_accuracy": 0.8266986727714538, "num_tokens": 1787814.0, "step": 220 }, { "entropy": 0.6917775988578796, "epoch": 0.12697516930022573, "grad_norm": 1.4500813484191895, "learning_rate": 4.999630371574457e-06, "loss": 0.5451, "mean_token_accuracy": 0.8405853033065795, "num_tokens": 1828330.0, "step": 225 }, { "entropy": 0.7286237478256226, "epoch": 0.12979683972911965, "grad_norm": 1.197171688079834, "learning_rate": 4.999613686712493e-06, "loss": 0.5945, "mean_token_accuracy": 0.8261087775230408, "num_tokens": 1868916.0, "step": 230 }, { "entropy": 0.7036802411079407, "epoch": 0.13261851015801354, "grad_norm": 1.3373335599899292, "learning_rate": 4.999596633582429e-06, "loss": 0.5736, "mean_token_accuracy": 0.8307413101196289, "num_tokens": 1909557.0, "step": 235 }, { "entropy": 0.6876640558242798, "epoch": 0.13544018058690746, "grad_norm": 1.2515480518341064, "learning_rate": 4.999579212187611e-06, "loss": 0.5421, "mean_token_accuracy": 0.8372658848762512, "num_tokens": 1949962.0, "step": 240 }, { "entropy": 0.6509540438652038, "epoch": 0.13826185101580135, "grad_norm": 1.2757313251495361, "learning_rate": 4.999561422531464e-06, "loss": 0.5263, "mean_token_accuracy": 0.8403837084770203, "num_tokens": 1990703.0, "step": 245 }, { "entropy": 0.6634021878242493, "epoch": 0.14108352144469527, "grad_norm": 1.3375166654586792, "learning_rate": 4.9995432646174815e-06, "loss": 0.5638, "mean_token_accuracy": 0.831636929512024, "num_tokens": 2031252.0, "step": 250 }, { "entropy": 0.7269237041473389, "epoch": 0.14390519187358916, "grad_norm": 1.3792425394058228, "learning_rate": 4.9995247384492314e-06, "loss": 0.5982, "mean_token_accuracy": 0.8250751614570617, "num_tokens": 2071662.0, "step": 255 }, { "entropy": 0.7198888540267945, "epoch": 0.14672686230248308, "grad_norm": 1.4608137607574463, "learning_rate": 4.999505844030352e-06, "loss": 0.5952, "mean_token_accuracy": 0.8260551571846009, "num_tokens": 2112264.0, "step": 260 }, { "entropy": 0.6873027205467224, "epoch": 0.14954853273137697, "grad_norm": 1.3915055990219116, "learning_rate": 4.999486581364557e-06, "loss": 0.5486, "mean_token_accuracy": 0.837960147857666, "num_tokens": 2152995.0, "step": 265 }, { "entropy": 0.7061834812164307, "epoch": 0.1523702031602709, "grad_norm": 1.3316701650619507, "learning_rate": 4.999466950455628e-06, "loss": 0.5651, "mean_token_accuracy": 0.8314638137817383, "num_tokens": 2193728.0, "step": 270 }, { "entropy": 0.6988749980926514, "epoch": 0.15519187358916478, "grad_norm": 1.4248734712600708, "learning_rate": 4.999446951307424e-06, "loss": 0.5708, "mean_token_accuracy": 0.8328610777854919, "num_tokens": 2234493.0, "step": 275 }, { "entropy": 0.7523432970046997, "epoch": 0.1580135440180587, "grad_norm": 1.5484312772750854, "learning_rate": 4.999426583923873e-06, "loss": 0.6236, "mean_token_accuracy": 0.8170114874839782, "num_tokens": 2274609.0, "step": 280 }, { "entropy": 0.6838536024093628, "epoch": 0.1608352144469526, "grad_norm": 1.367386817932129, "learning_rate": 4.999405848308975e-06, "loss": 0.5309, "mean_token_accuracy": 0.842476773262024, "num_tokens": 2315124.0, "step": 285 }, { "entropy": 0.6006429553031921, "epoch": 0.1636568848758465, "grad_norm": 1.3099923133850098, "learning_rate": 4.999384744466805e-06, "loss": 0.4904, "mean_token_accuracy": 0.8527785658836364, "num_tokens": 2355713.0, "step": 290 }, { "entropy": 0.6720806241035462, "epoch": 0.1664785553047404, "grad_norm": 1.2152323722839355, "learning_rate": 4.999363272401508e-06, "loss": 0.5263, "mean_token_accuracy": 0.8428802371025086, "num_tokens": 2395726.0, "step": 295 }, { "entropy": 0.730102264881134, "epoch": 0.16930022573363432, "grad_norm": 1.3833410739898682, "learning_rate": 4.9993414321173014e-06, "loss": 0.6063, "mean_token_accuracy": 0.8238815784454345, "num_tokens": 2436268.0, "step": 300 }, { "entropy": 0.7030096173286438, "epoch": 0.1721218961625282, "grad_norm": 1.3912793397903442, "learning_rate": 4.9993192236184786e-06, "loss": 0.5728, "mean_token_accuracy": 0.829773461818695, "num_tokens": 2476941.0, "step": 305 }, { "entropy": 0.692926001548767, "epoch": 0.17494356659142213, "grad_norm": 1.406414270401001, "learning_rate": 4.9992966469094005e-06, "loss": 0.5581, "mean_token_accuracy": 0.8338650107383728, "num_tokens": 2517495.0, "step": 310 }, { "entropy": 0.7009713649749756, "epoch": 0.17776523702031602, "grad_norm": 1.4576942920684814, "learning_rate": 4.999273701994501e-06, "loss": 0.5827, "mean_token_accuracy": 0.8266690850257874, "num_tokens": 2558100.0, "step": 315 }, { "entropy": 0.6718906760215759, "epoch": 0.18058690744920994, "grad_norm": 1.4303754568099976, "learning_rate": 4.999250388878291e-06, "loss": 0.5668, "mean_token_accuracy": 0.830142343044281, "num_tokens": 2598172.0, "step": 320 }, { "entropy": 0.6644892454147339, "epoch": 0.18340857787810383, "grad_norm": 1.3855103254318237, "learning_rate": 4.999226707565348e-06, "loss": 0.5289, "mean_token_accuracy": 0.8401578664779663, "num_tokens": 2638766.0, "step": 325 }, { "entropy": 0.6168012738227844, "epoch": 0.18623024830699775, "grad_norm": 1.3673598766326904, "learning_rate": 4.999202658060324e-06, "loss": 0.5022, "mean_token_accuracy": 0.8519076347351074, "num_tokens": 2679412.0, "step": 330 }, { "entropy": 0.7058925032615662, "epoch": 0.18905191873589164, "grad_norm": 1.2969095706939697, "learning_rate": 4.9991782403679445e-06, "loss": 0.5568, "mean_token_accuracy": 0.8322369575500488, "num_tokens": 2720202.0, "step": 335 }, { "entropy": 0.6692469239234924, "epoch": 0.19187358916478556, "grad_norm": 1.3060578107833862, "learning_rate": 4.999153454493006e-06, "loss": 0.5492, "mean_token_accuracy": 0.837912917137146, "num_tokens": 2760837.0, "step": 340 }, { "entropy": 0.6806850790977478, "epoch": 0.19469525959367945, "grad_norm": 1.3894520998001099, "learning_rate": 4.999128300440377e-06, "loss": 0.5959, "mean_token_accuracy": 0.8271700620651246, "num_tokens": 2801488.0, "step": 345 }, { "entropy": 0.6444134593009949, "epoch": 0.19751693002257337, "grad_norm": 1.2214727401733398, "learning_rate": 4.9991027782150005e-06, "loss": 0.4907, "mean_token_accuracy": 0.8515629053115845, "num_tokens": 2842036.0, "step": 350 }, { "entropy": 0.720831036567688, "epoch": 0.20033860045146726, "grad_norm": 1.3418818712234497, "learning_rate": 4.99907688782189e-06, "loss": 0.5878, "mean_token_accuracy": 0.8290072560310364, "num_tokens": 2882628.0, "step": 355 }, { "entropy": 0.6308605074882507, "epoch": 0.20316027088036118, "grad_norm": 1.3402037620544434, "learning_rate": 4.9990506292661315e-06, "loss": 0.5121, "mean_token_accuracy": 0.8474404573440552, "num_tokens": 2923198.0, "step": 360 }, { "entropy": 0.6803479313850402, "epoch": 0.20598194130925507, "grad_norm": 1.4827245473861694, "learning_rate": 4.9990240025528825e-06, "loss": 0.5541, "mean_token_accuracy": 0.8327592492103577, "num_tokens": 2963798.0, "step": 365 }, { "entropy": 0.6540728807449341, "epoch": 0.208803611738149, "grad_norm": 1.3350443840026855, "learning_rate": 4.998997007687375e-06, "loss": 0.5089, "mean_token_accuracy": 0.8447277069091796, "num_tokens": 3004683.0, "step": 370 }, { "entropy": 0.7112419605255127, "epoch": 0.21162528216704288, "grad_norm": 1.3664087057113647, "learning_rate": 4.998969644674911e-06, "loss": 0.5828, "mean_token_accuracy": 0.8266646385192871, "num_tokens": 3045439.0, "step": 375 }, { "entropy": 0.6424817204475403, "epoch": 0.2144469525959368, "grad_norm": 1.295762300491333, "learning_rate": 4.998941913520867e-06, "loss": 0.5321, "mean_token_accuracy": 0.8423674941062927, "num_tokens": 3086082.0, "step": 380 }, { "entropy": 0.6630952477455139, "epoch": 0.2172686230248307, "grad_norm": 1.189756989479065, "learning_rate": 4.998913814230691e-06, "loss": 0.5333, "mean_token_accuracy": 0.839359712600708, "num_tokens": 3126821.0, "step": 385 }, { "entropy": 0.7343137979507446, "epoch": 0.2200902934537246, "grad_norm": 1.3133877515792847, "learning_rate": 4.998885346809902e-06, "loss": 0.616, "mean_token_accuracy": 0.820310366153717, "num_tokens": 3167554.0, "step": 390 }, { "entropy": 0.7052167534828186, "epoch": 0.2229119638826185, "grad_norm": 1.2900844812393188, "learning_rate": 4.998856511264094e-06, "loss": 0.5687, "mean_token_accuracy": 0.8320470333099366, "num_tokens": 3208157.0, "step": 395 }, { "entropy": 0.7610344290733337, "epoch": 0.22573363431151242, "grad_norm": 1.5215799808502197, "learning_rate": 4.99882730759893e-06, "loss": 0.6134, "mean_token_accuracy": 0.81887845993042, "num_tokens": 3248261.0, "step": 400 }, { "entropy": 0.6336552977561951, "epoch": 0.22855530474040633, "grad_norm": 1.2715165615081787, "learning_rate": 4.9987977358201475e-06, "loss": 0.5034, "mean_token_accuracy": 0.8473275661468506, "num_tokens": 3288888.0, "step": 405 }, { "entropy": 0.6352953314781189, "epoch": 0.23137697516930023, "grad_norm": 1.3592379093170166, "learning_rate": 4.998767795933557e-06, "loss": 0.5094, "mean_token_accuracy": 0.8488934755325317, "num_tokens": 3329423.0, "step": 410 }, { "entropy": 0.6898619294166565, "epoch": 0.23419864559819414, "grad_norm": 1.5599400997161865, "learning_rate": 4.998737487945039e-06, "loss": 0.5475, "mean_token_accuracy": 0.8362764358520508, "num_tokens": 3369896.0, "step": 415 }, { "entropy": 0.6755191922187805, "epoch": 0.23702031602708803, "grad_norm": 1.3778337240219116, "learning_rate": 4.998706811860548e-06, "loss": 0.5762, "mean_token_accuracy": 0.8319776773452758, "num_tokens": 3410527.0, "step": 420 }, { "entropy": 0.6867754459381104, "epoch": 0.23984198645598195, "grad_norm": 1.3699222803115845, "learning_rate": 4.99867576768611e-06, "loss": 0.5414, "mean_token_accuracy": 0.839304780960083, "num_tokens": 3451077.0, "step": 425 }, { "entropy": 0.683435583114624, "epoch": 0.24266365688487584, "grad_norm": 1.3843740224838257, "learning_rate": 4.9986443554278244e-06, "loss": 0.5439, "mean_token_accuracy": 0.8384104609489441, "num_tokens": 3491423.0, "step": 430 }, { "entropy": 0.6476885080337524, "epoch": 0.24548532731376976, "grad_norm": 1.2700644731521606, "learning_rate": 4.998612575091861e-06, "loss": 0.5217, "mean_token_accuracy": 0.8458979368209839, "num_tokens": 3532023.0, "step": 435 }, { "entropy": 0.6664133548736573, "epoch": 0.24830699774266365, "grad_norm": 1.3746997117996216, "learning_rate": 4.998580426684464e-06, "loss": 0.5236, "mean_token_accuracy": 0.8449108362197876, "num_tokens": 3572699.0, "step": 440 }, { "entropy": 0.6357210874557495, "epoch": 0.25112866817155755, "grad_norm": 1.2438454627990723, "learning_rate": 4.99854791021195e-06, "loss": 0.512, "mean_token_accuracy": 0.8491321802139282, "num_tokens": 3613440.0, "step": 445 }, { "entropy": 0.627473509311676, "epoch": 0.25395033860045146, "grad_norm": 1.1443346738815308, "learning_rate": 4.998515025680703e-06, "loss": 0.5025, "mean_token_accuracy": 0.8488677740097046, "num_tokens": 3653969.0, "step": 450 }, { "entropy": 0.6401142597198486, "epoch": 0.2567720090293454, "grad_norm": 1.3476841449737549, "learning_rate": 4.998481773097187e-06, "loss": 0.503, "mean_token_accuracy": 0.8511731863021851, "num_tokens": 3693874.0, "step": 455 }, { "entropy": 0.645455002784729, "epoch": 0.2595936794582393, "grad_norm": 1.392980933189392, "learning_rate": 4.998448152467933e-06, "loss": 0.515, "mean_token_accuracy": 0.8437379837036133, "num_tokens": 3734684.0, "step": 460 }, { "entropy": 0.7039332389831543, "epoch": 0.26241534988713316, "grad_norm": 1.42012357711792, "learning_rate": 4.998414163799545e-06, "loss": 0.5646, "mean_token_accuracy": 0.8330328106880188, "num_tokens": 3775233.0, "step": 465 }, { "entropy": 0.6496623396873474, "epoch": 0.2652370203160271, "grad_norm": 1.168820858001709, "learning_rate": 4.998379807098703e-06, "loss": 0.512, "mean_token_accuracy": 0.8448665142059326, "num_tokens": 3815898.0, "step": 470 }, { "entropy": 0.6927712917327881, "epoch": 0.268058690744921, "grad_norm": 1.277652621269226, "learning_rate": 4.998345082372153e-06, "loss": 0.5485, "mean_token_accuracy": 0.8371129631996155, "num_tokens": 3856530.0, "step": 475 }, { "entropy": 0.6336887121200562, "epoch": 0.2708803611738149, "grad_norm": 1.3250688314437866, "learning_rate": 4.998309989626718e-06, "loss": 0.496, "mean_token_accuracy": 0.8498493075370789, "num_tokens": 3897409.0, "step": 480 }, { "entropy": 0.645351254940033, "epoch": 0.2737020316027088, "grad_norm": 1.1753090620040894, "learning_rate": 4.998274528869292e-06, "loss": 0.5256, "mean_token_accuracy": 0.8436009883880615, "num_tokens": 3938183.0, "step": 485 }, { "entropy": 0.6607337474822998, "epoch": 0.2765237020316027, "grad_norm": 1.386691927909851, "learning_rate": 4.998238700106842e-06, "loss": 0.5226, "mean_token_accuracy": 0.8447944283485412, "num_tokens": 3978774.0, "step": 490 }, { "entropy": 0.7330415606498718, "epoch": 0.2793453724604966, "grad_norm": 1.3130143880844116, "learning_rate": 4.998202503346405e-06, "loss": 0.5945, "mean_token_accuracy": 0.8254420638084412, "num_tokens": 4019301.0, "step": 495 }, { "entropy": 0.6656695961952209, "epoch": 0.28216704288939054, "grad_norm": 1.4055564403533936, "learning_rate": 4.998165938595094e-06, "loss": 0.5339, "mean_token_accuracy": 0.8389903545379639, "num_tokens": 4059906.0, "step": 500 }, { "epoch": 0.28216704288939054, "eval_entropy": 0.6397659778594971, "eval_loss": 0.5099606513977051, "eval_mean_token_accuracy": 0.857033371925354, "eval_num_tokens": 4059906.0, "eval_runtime": 0.1662, "eval_samples_per_second": 24.067, "eval_steps_per_second": 6.017, "step": 500 }, { "entropy": 0.6222930490970612, "epoch": 0.2849887133182844, "grad_norm": 1.3186826705932617, "learning_rate": 4.99812900586009e-06, "loss": 0.5115, "mean_token_accuracy": 0.8475234031677246, "num_tokens": 4100501.0, "step": 505 }, { "entropy": 0.6861591100692749, "epoch": 0.2878103837471783, "grad_norm": 1.2481554746627808, "learning_rate": 4.998091705148649e-06, "loss": 0.5386, "mean_token_accuracy": 0.8370915293693543, "num_tokens": 4141056.0, "step": 510 }, { "entropy": 0.6614163756370545, "epoch": 0.29063205417607224, "grad_norm": 1.3787559270858765, "learning_rate": 4.998054036468099e-06, "loss": 0.5331, "mean_token_accuracy": 0.8423222064971924, "num_tokens": 4181622.0, "step": 515 }, { "entropy": 0.6498169541358948, "epoch": 0.29345372460496616, "grad_norm": 1.140236735343933, "learning_rate": 4.9980159998258406e-06, "loss": 0.5086, "mean_token_accuracy": 0.846357011795044, "num_tokens": 4222377.0, "step": 520 }, { "entropy": 0.6879486322402955, "epoch": 0.29627539503386, "grad_norm": 1.4712022542953491, "learning_rate": 4.997977595229346e-06, "loss": 0.5573, "mean_token_accuracy": 0.835738730430603, "num_tokens": 4263122.0, "step": 525 }, { "entropy": 0.6771662116050721, "epoch": 0.29909706546275394, "grad_norm": 1.3491076231002808, "learning_rate": 4.997938822686158e-06, "loss": 0.547, "mean_token_accuracy": 0.8368203639984131, "num_tokens": 4303690.0, "step": 530 }, { "entropy": 0.614617896080017, "epoch": 0.30191873589164786, "grad_norm": 1.2441357374191284, "learning_rate": 4.9978996822038964e-06, "loss": 0.4844, "mean_token_accuracy": 0.8524455547332763, "num_tokens": 4344245.0, "step": 535 }, { "entropy": 0.6798507928848266, "epoch": 0.3047404063205418, "grad_norm": 1.3364735841751099, "learning_rate": 4.997860173790247e-06, "loss": 0.5325, "mean_token_accuracy": 0.8381256461143494, "num_tokens": 4384408.0, "step": 540 }, { "entropy": 0.6491321206092835, "epoch": 0.30756207674943564, "grad_norm": 1.3930151462554932, "learning_rate": 4.997820297452975e-06, "loss": 0.4999, "mean_token_accuracy": 0.8447855472564697, "num_tokens": 4424758.0, "step": 545 }, { "entropy": 0.6349780797958374, "epoch": 0.31038374717832956, "grad_norm": 1.406150460243225, "learning_rate": 4.99778005319991e-06, "loss": 0.51, "mean_token_accuracy": 0.8480640172958374, "num_tokens": 4465315.0, "step": 550 }, { "entropy": 0.6838363170623779, "epoch": 0.3132054176072235, "grad_norm": 1.5294631719589233, "learning_rate": 4.997739441038962e-06, "loss": 0.5587, "mean_token_accuracy": 0.8342589735984802, "num_tokens": 4505898.0, "step": 555 }, { "entropy": 0.7220178842544556, "epoch": 0.3160270880361174, "grad_norm": 1.3367938995361328, "learning_rate": 4.997698460978107e-06, "loss": 0.5938, "mean_token_accuracy": 0.8219369530677796, "num_tokens": 4546647.0, "step": 560 }, { "entropy": 0.6441617131233215, "epoch": 0.31884875846501126, "grad_norm": 1.1907154321670532, "learning_rate": 4.997657113025395e-06, "loss": 0.5127, "mean_token_accuracy": 0.8480484962463379, "num_tokens": 4587353.0, "step": 565 }, { "entropy": 0.7197385430335999, "epoch": 0.3216704288939052, "grad_norm": 1.5142972469329834, "learning_rate": 4.99761539718895e-06, "loss": 0.5569, "mean_token_accuracy": 0.8342020034790039, "num_tokens": 4627673.0, "step": 570 }, { "entropy": 0.7443452477455139, "epoch": 0.3244920993227991, "grad_norm": 1.2722396850585938, "learning_rate": 4.997573313476966e-06, "loss": 0.6036, "mean_token_accuracy": 0.8234388828277588, "num_tokens": 4668389.0, "step": 575 }, { "entropy": 0.6642969369888305, "epoch": 0.327313769751693, "grad_norm": 1.3420699834823608, "learning_rate": 4.997530861897713e-06, "loss": 0.5121, "mean_token_accuracy": 0.8478710532188416, "num_tokens": 4708970.0, "step": 580 }, { "entropy": 0.6479909062385559, "epoch": 0.3301354401805869, "grad_norm": 1.422701358795166, "learning_rate": 4.997488042459528e-06, "loss": 0.5289, "mean_token_accuracy": 0.8437910914421082, "num_tokens": 4749674.0, "step": 585 }, { "entropy": 0.6523343324661255, "epoch": 0.3329571106094808, "grad_norm": 1.3672442436218262, "learning_rate": 4.997444855170823e-06, "loss": 0.531, "mean_token_accuracy": 0.8426079154014587, "num_tokens": 4790249.0, "step": 590 }, { "entropy": 0.6659448742866516, "epoch": 0.3357787810383747, "grad_norm": 1.267858624458313, "learning_rate": 4.997401300040084e-06, "loss": 0.5406, "mean_token_accuracy": 0.8389504075050354, "num_tokens": 4831050.0, "step": 595 }, { "entropy": 0.6824786901473999, "epoch": 0.33860045146726864, "grad_norm": 1.3464597463607788, "learning_rate": 4.997357377075866e-06, "loss": 0.5328, "mean_token_accuracy": 0.8403973698616027, "num_tokens": 4871225.0, "step": 600 }, { "entropy": 0.6604692697525024, "epoch": 0.34142212189616256, "grad_norm": 1.4553182125091553, "learning_rate": 4.997313086286797e-06, "loss": 0.5313, "mean_token_accuracy": 0.8407448887825012, "num_tokens": 4911582.0, "step": 605 }, { "entropy": 0.6908664226531982, "epoch": 0.3442437923250564, "grad_norm": 1.3354190587997437, "learning_rate": 4.997268427681579e-06, "loss": 0.5585, "mean_token_accuracy": 0.8351940751075745, "num_tokens": 4952166.0, "step": 610 }, { "entropy": 0.6428348064422608, "epoch": 0.34706546275395034, "grad_norm": 1.2946590185165405, "learning_rate": 4.997223401268985e-06, "loss": 0.5154, "mean_token_accuracy": 0.8446980834007263, "num_tokens": 4992726.0, "step": 615 }, { "entropy": 0.6469574570655823, "epoch": 0.34988713318284426, "grad_norm": 1.3383957147598267, "learning_rate": 4.9971780070578605e-06, "loss": 0.5082, "mean_token_accuracy": 0.8440989255905151, "num_tokens": 5033375.0, "step": 620 }, { "entropy": 0.6753599047660828, "epoch": 0.3527088036117382, "grad_norm": 1.4977456331253052, "learning_rate": 4.997132245057124e-06, "loss": 0.5473, "mean_token_accuracy": 0.8380128145217896, "num_tokens": 5074101.0, "step": 625 }, { "entropy": 0.6009699106216431, "epoch": 0.35553047404063204, "grad_norm": 1.3025342226028442, "learning_rate": 4.997086115275763e-06, "loss": 0.4942, "mean_token_accuracy": 0.8532807230949402, "num_tokens": 5114789.0, "step": 630 }, { "entropy": 0.7020259857177734, "epoch": 0.35835214446952596, "grad_norm": 1.42304265499115, "learning_rate": 4.997039617722843e-06, "loss": 0.5778, "mean_token_accuracy": 0.8297113418579102, "num_tokens": 5155557.0, "step": 635 }, { "entropy": 0.6381742715835571, "epoch": 0.3611738148984199, "grad_norm": 1.3583488464355469, "learning_rate": 4.996992752407496e-06, "loss": 0.4815, "mean_token_accuracy": 0.8546654939651489, "num_tokens": 5195254.0, "step": 640 }, { "entropy": 0.7070492506027222, "epoch": 0.3639954853273138, "grad_norm": 1.4853992462158203, "learning_rate": 4.996945519338929e-06, "loss": 0.5622, "mean_token_accuracy": 0.8343408942222595, "num_tokens": 5235919.0, "step": 645 }, { "entropy": 0.7001669526100158, "epoch": 0.36681715575620766, "grad_norm": 1.263033390045166, "learning_rate": 4.996897918526422e-06, "loss": 0.5645, "mean_token_accuracy": 0.8342163324356079, "num_tokens": 5276443.0, "step": 650 }, { "entropy": 0.6899022579193115, "epoch": 0.3696388261851016, "grad_norm": 1.3277238607406616, "learning_rate": 4.996849949979325e-06, "loss": 0.5756, "mean_token_accuracy": 0.8299772024154664, "num_tokens": 5317131.0, "step": 655 }, { "entropy": 0.6805570363998413, "epoch": 0.3724604966139955, "grad_norm": 1.2228710651397705, "learning_rate": 4.996801613707063e-06, "loss": 0.5515, "mean_token_accuracy": 0.8347800970077515, "num_tokens": 5357866.0, "step": 660 }, { "entropy": 0.6331149935722351, "epoch": 0.3752821670428894, "grad_norm": 1.3809823989868164, "learning_rate": 4.9967529097191305e-06, "loss": 0.5098, "mean_token_accuracy": 0.8466669321060181, "num_tokens": 5398573.0, "step": 665 }, { "entropy": 0.6736877083778381, "epoch": 0.3781038374717833, "grad_norm": 1.3853009939193726, "learning_rate": 4.996703838025095e-06, "loss": 0.5324, "mean_token_accuracy": 0.839972174167633, "num_tokens": 5439412.0, "step": 670 }, { "entropy": 0.6803161859512329, "epoch": 0.3809255079006772, "grad_norm": 1.4266542196273804, "learning_rate": 4.996654398634597e-06, "loss": 0.5506, "mean_token_accuracy": 0.8333458185195923, "num_tokens": 5479945.0, "step": 675 }, { "entropy": 0.7106911301612854, "epoch": 0.3837471783295711, "grad_norm": 1.482615351676941, "learning_rate": 4.996604591557349e-06, "loss": 0.5753, "mean_token_accuracy": 0.8281079649925231, "num_tokens": 5520469.0, "step": 680 }, { "entropy": 0.6807895421981811, "epoch": 0.38656884875846503, "grad_norm": 1.2304539680480957, "learning_rate": 4.996554416803137e-06, "loss": 0.5497, "mean_token_accuracy": 0.8346430540084839, "num_tokens": 5561155.0, "step": 685 }, { "entropy": 0.6457465648651123, "epoch": 0.3893905191873589, "grad_norm": 1.470390796661377, "learning_rate": 4.996503874381815e-06, "loss": 0.5143, "mean_token_accuracy": 0.846334969997406, "num_tokens": 5601850.0, "step": 690 }, { "entropy": 0.5989644646644592, "epoch": 0.3922121896162528, "grad_norm": 1.3190268278121948, "learning_rate": 4.996452964303315e-06, "loss": 0.4601, "mean_token_accuracy": 0.8595061659812927, "num_tokens": 5642455.0, "step": 695 }, { "entropy": 0.6620395064353943, "epoch": 0.39503386004514673, "grad_norm": 1.4160014390945435, "learning_rate": 4.996401686577636e-06, "loss": 0.5315, "mean_token_accuracy": 0.8426364421844482, "num_tokens": 5683057.0, "step": 700 }, { "entropy": 0.6351073563098908, "epoch": 0.39785553047404065, "grad_norm": 1.417966604232788, "learning_rate": 4.996350041214852e-06, "loss": 0.5148, "mean_token_accuracy": 0.8437610864639282, "num_tokens": 5723964.0, "step": 705 }, { "entropy": 0.7087972402572632, "epoch": 0.4006772009029345, "grad_norm": 1.4004472494125366, "learning_rate": 4.996298028225111e-06, "loss": 0.57, "mean_token_accuracy": 0.8329956293106079, "num_tokens": 5764573.0, "step": 710 }, { "entropy": 0.7214209794998169, "epoch": 0.40349887133182843, "grad_norm": 1.5199602842330933, "learning_rate": 4.996245647618627e-06, "loss": 0.591, "mean_token_accuracy": 0.8271468758583069, "num_tokens": 5805361.0, "step": 715 }, { "entropy": 0.6683865189552307, "epoch": 0.40632054176072235, "grad_norm": 1.3417999744415283, "learning_rate": 4.996192899405693e-06, "loss": 0.5318, "mean_token_accuracy": 0.8423861384391784, "num_tokens": 5846241.0, "step": 720 }, { "entropy": 0.7061811447143554, "epoch": 0.40914221218961627, "grad_norm": 1.456238031387329, "learning_rate": 4.996139783596671e-06, "loss": 0.5438, "mean_token_accuracy": 0.8380335092544555, "num_tokens": 5886839.0, "step": 725 }, { "entropy": 0.6855465412139893, "epoch": 0.41196388261851014, "grad_norm": 1.3961089849472046, "learning_rate": 4.996086300201995e-06, "loss": 0.5363, "mean_token_accuracy": 0.838828194141388, "num_tokens": 5927433.0, "step": 730 }, { "entropy": 0.6245934367179871, "epoch": 0.41478555304740405, "grad_norm": 1.252845048904419, "learning_rate": 4.996032449232172e-06, "loss": 0.5042, "mean_token_accuracy": 0.8503703594207763, "num_tokens": 5968213.0, "step": 735 }, { "entropy": 0.6127149283885955, "epoch": 0.417607223476298, "grad_norm": 1.207445502281189, "learning_rate": 4.995978230697782e-06, "loss": 0.4892, "mean_token_accuracy": 0.8527963757514954, "num_tokens": 6008851.0, "step": 740 }, { "entropy": 0.7167877793312073, "epoch": 0.4204288939051919, "grad_norm": 1.266051173210144, "learning_rate": 4.995923644609474e-06, "loss": 0.5714, "mean_token_accuracy": 0.8294572830200195, "num_tokens": 6049545.0, "step": 745 }, { "entropy": 0.5922774732112884, "epoch": 0.42325056433408575, "grad_norm": 1.3726112842559814, "learning_rate": 4.995868690977974e-06, "loss": 0.4756, "mean_token_accuracy": 0.8563015341758728, "num_tokens": 6090230.0, "step": 750 }, { "entropy": 0.6408256888389587, "epoch": 0.4260722347629797, "grad_norm": 1.3127083778381348, "learning_rate": 4.995813369814075e-06, "loss": 0.5208, "mean_token_accuracy": 0.8428861260414123, "num_tokens": 6130808.0, "step": 755 }, { "entropy": 0.6796544313430786, "epoch": 0.4288939051918736, "grad_norm": 1.409447431564331, "learning_rate": 4.995757681128648e-06, "loss": 0.5331, "mean_token_accuracy": 0.8407068133354187, "num_tokens": 6171110.0, "step": 760 }, { "entropy": 0.6506537318229675, "epoch": 0.4317155756207675, "grad_norm": 1.2559049129486084, "learning_rate": 4.995701624932631e-06, "loss": 0.4936, "mean_token_accuracy": 0.8502960324287414, "num_tokens": 6211953.0, "step": 765 }, { "entropy": 0.6637347579002381, "epoch": 0.4345372460496614, "grad_norm": 1.309942603111267, "learning_rate": 4.995645201237036e-06, "loss": 0.5321, "mean_token_accuracy": 0.8399680137634278, "num_tokens": 6252408.0, "step": 770 }, { "entropy": 0.6979188203811646, "epoch": 0.4373589164785553, "grad_norm": 1.5395498275756836, "learning_rate": 4.995588410052948e-06, "loss": 0.5862, "mean_token_accuracy": 0.8252558946609497, "num_tokens": 6293047.0, "step": 775 }, { "entropy": 0.6070165634155273, "epoch": 0.4401805869074492, "grad_norm": 1.1005687713623047, "learning_rate": 4.995531251391524e-06, "loss": 0.488, "mean_token_accuracy": 0.8509817957878113, "num_tokens": 6333702.0, "step": 780 }, { "entropy": 0.6723381996154785, "epoch": 0.44300225733634313, "grad_norm": 1.27623450756073, "learning_rate": 4.995473725263992e-06, "loss": 0.5316, "mean_token_accuracy": 0.8400357604026795, "num_tokens": 6374319.0, "step": 785 }, { "entropy": 0.6453313708305359, "epoch": 0.445823927765237, "grad_norm": 1.3552383184432983, "learning_rate": 4.995415831681654e-06, "loss": 0.5294, "mean_token_accuracy": 0.8450021266937255, "num_tokens": 6415005.0, "step": 790 }, { "entropy": 0.7531963586807251, "epoch": 0.4486455981941309, "grad_norm": 1.303346037864685, "learning_rate": 4.9953575706558835e-06, "loss": 0.6133, "mean_token_accuracy": 0.8193735122680664, "num_tokens": 6455608.0, "step": 795 }, { "entropy": 0.6664739489555359, "epoch": 0.45146726862302483, "grad_norm": 1.3953213691711426, "learning_rate": 4.9952989421981244e-06, "loss": 0.5294, "mean_token_accuracy": 0.8387627482414246, "num_tokens": 6495958.0, "step": 800 }, { "entropy": 0.6475582718849182, "epoch": 0.45428893905191875, "grad_norm": 1.3377810716629028, "learning_rate": 4.995239946319895e-06, "loss": 0.5229, "mean_token_accuracy": 0.8416747093200684, "num_tokens": 6536670.0, "step": 805 }, { "entropy": 0.6350281596183777, "epoch": 0.45711060948081267, "grad_norm": 1.3273295164108276, "learning_rate": 4.995180583032784e-06, "loss": 0.513, "mean_token_accuracy": 0.8475698709487915, "num_tokens": 6577503.0, "step": 810 }, { "entropy": 0.6923723101615906, "epoch": 0.45993227990970653, "grad_norm": 1.4398008584976196, "learning_rate": 4.9951208523484555e-06, "loss": 0.5527, "mean_token_accuracy": 0.834313976764679, "num_tokens": 6617829.0, "step": 815 }, { "entropy": 0.6901078701019288, "epoch": 0.46275395033860045, "grad_norm": 1.2139265537261963, "learning_rate": 4.995060754278642e-06, "loss": 0.5589, "mean_token_accuracy": 0.832787299156189, "num_tokens": 6658320.0, "step": 820 }, { "entropy": 0.592526650428772, "epoch": 0.46557562076749437, "grad_norm": 1.34032142162323, "learning_rate": 4.9950002888351514e-06, "loss": 0.4736, "mean_token_accuracy": 0.8565322160720825, "num_tokens": 6699023.0, "step": 825 }, { "entropy": 0.713707959651947, "epoch": 0.4683972911963883, "grad_norm": 1.4716837406158447, "learning_rate": 4.994939456029859e-06, "loss": 0.5794, "mean_token_accuracy": 0.8286639451980591, "num_tokens": 6739446.0, "step": 830 }, { "entropy": 0.6702501654624939, "epoch": 0.47121896162528215, "grad_norm": 1.2845447063446045, "learning_rate": 4.994878255874719e-06, "loss": 0.5355, "mean_token_accuracy": 0.8418485879898071, "num_tokens": 6780190.0, "step": 835 }, { "entropy": 0.6634608149528504, "epoch": 0.47404063205417607, "grad_norm": 1.3624354600906372, "learning_rate": 4.994816688381751e-06, "loss": 0.5284, "mean_token_accuracy": 0.8404200553894043, "num_tokens": 6820877.0, "step": 840 }, { "entropy": 0.7302131295204163, "epoch": 0.47686230248307, "grad_norm": 1.4335999488830566, "learning_rate": 4.994754753563054e-06, "loss": 0.5943, "mean_token_accuracy": 0.8234637618064881, "num_tokens": 6861674.0, "step": 845 }, { "entropy": 0.6252121210098267, "epoch": 0.4796839729119639, "grad_norm": 1.423103928565979, "learning_rate": 4.994692451430791e-06, "loss": 0.5053, "mean_token_accuracy": 0.8479559659957886, "num_tokens": 6902472.0, "step": 850 }, { "entropy": 0.6240964293479919, "epoch": 0.48250564334085777, "grad_norm": 1.416658878326416, "learning_rate": 4.9946297819972025e-06, "loss": 0.509, "mean_token_accuracy": 0.8452272534370422, "num_tokens": 6942877.0, "step": 855 }, { "entropy": 0.6672571420669555, "epoch": 0.4853273137697517, "grad_norm": 1.3133835792541504, "learning_rate": 4.994566745274601e-06, "loss": 0.52, "mean_token_accuracy": 0.8426130771636963, "num_tokens": 6983426.0, "step": 860 }, { "entropy": 0.6581945300102234, "epoch": 0.4881489841986456, "grad_norm": 1.4337458610534668, "learning_rate": 4.994503341275369e-06, "loss": 0.5225, "mean_token_accuracy": 0.8437890887260437, "num_tokens": 7023732.0, "step": 865 }, { "entropy": 0.6551531553268433, "epoch": 0.4909706546275395, "grad_norm": 1.3199399709701538, "learning_rate": 4.994439570011963e-06, "loss": 0.518, "mean_token_accuracy": 0.8422786116600036, "num_tokens": 7064450.0, "step": 870 }, { "entropy": 0.6477535367012024, "epoch": 0.4937923250564334, "grad_norm": 1.2373626232147217, "learning_rate": 4.99437543149691e-06, "loss": 0.5278, "mean_token_accuracy": 0.8424580574035645, "num_tokens": 7105220.0, "step": 875 }, { "entropy": 0.6733549952507019, "epoch": 0.4966139954853273, "grad_norm": 1.3904764652252197, "learning_rate": 4.994310925742811e-06, "loss": 0.5219, "mean_token_accuracy": 0.8425598025321961, "num_tokens": 7145455.0, "step": 880 }, { "entropy": 0.6530658364295959, "epoch": 0.4994356659142212, "grad_norm": 1.387152075767517, "learning_rate": 4.9942460527623374e-06, "loss": 0.5397, "mean_token_accuracy": 0.8402209639549255, "num_tokens": 7185927.0, "step": 885 }, { "entropy": 0.6589832663536072, "epoch": 0.5022573363431151, "grad_norm": 1.2508260011672974, "learning_rate": 4.9941808125682336e-06, "loss": 0.5162, "mean_token_accuracy": 0.8469531655311584, "num_tokens": 7226531.0, "step": 890 }, { "entropy": 0.6467825770378113, "epoch": 0.505079006772009, "grad_norm": 1.2095719575881958, "learning_rate": 4.994115205173317e-06, "loss": 0.5325, "mean_token_accuracy": 0.8432755708694458, "num_tokens": 7267311.0, "step": 895 }, { "entropy": 0.6670600891113281, "epoch": 0.5079006772009029, "grad_norm": 1.3329170942306519, "learning_rate": 4.994049230590474e-06, "loss": 0.5145, "mean_token_accuracy": 0.8473351120948791, "num_tokens": 7308118.0, "step": 900 }, { "entropy": 0.654244887828827, "epoch": 0.5107223476297968, "grad_norm": 1.3469221591949463, "learning_rate": 4.993982888832667e-06, "loss": 0.5262, "mean_token_accuracy": 0.8424062848091125, "num_tokens": 7348961.0, "step": 905 }, { "entropy": 0.6805335998535156, "epoch": 0.5135440180586908, "grad_norm": 1.4335166215896606, "learning_rate": 4.993916179912929e-06, "loss": 0.5611, "mean_token_accuracy": 0.8342411160469055, "num_tokens": 7389295.0, "step": 910 }, { "entropy": 0.6766547679901123, "epoch": 0.5163656884875847, "grad_norm": 1.2276146411895752, "learning_rate": 4.993849103844365e-06, "loss": 0.5502, "mean_token_accuracy": 0.8335286498069763, "num_tokens": 7429891.0, "step": 915 }, { "entropy": 0.6441382646560669, "epoch": 0.5191873589164786, "grad_norm": 1.277137041091919, "learning_rate": 4.9937816606401506e-06, "loss": 0.5377, "mean_token_accuracy": 0.8357362985610962, "num_tokens": 7470461.0, "step": 920 }, { "entropy": 0.6156215190887451, "epoch": 0.5220090293453724, "grad_norm": 1.3269896507263184, "learning_rate": 4.993713850313537e-06, "loss": 0.4867, "mean_token_accuracy": 0.8524217247962952, "num_tokens": 7511054.0, "step": 925 }, { "entropy": 0.702682101726532, "epoch": 0.5248306997742663, "grad_norm": 1.4615741968154907, "learning_rate": 4.993645672877843e-06, "loss": 0.575, "mean_token_accuracy": 0.8279630184173584, "num_tokens": 7551567.0, "step": 930 }, { "entropy": 0.6455079674720764, "epoch": 0.5276523702031602, "grad_norm": 1.3673226833343506, "learning_rate": 4.993577128346465e-06, "loss": 0.5157, "mean_token_accuracy": 0.8442542672157287, "num_tokens": 7591635.0, "step": 935 }, { "entropy": 0.6908176302909851, "epoch": 0.5304740406320542, "grad_norm": 1.4456593990325928, "learning_rate": 4.993508216732867e-06, "loss": 0.558, "mean_token_accuracy": 0.832518482208252, "num_tokens": 7631859.0, "step": 940 }, { "entropy": 0.7004367709159851, "epoch": 0.5332957110609481, "grad_norm": 1.3898345232009888, "learning_rate": 4.993438938050587e-06, "loss": 0.5657, "mean_token_accuracy": 0.8318912148475647, "num_tokens": 7672520.0, "step": 945 }, { "entropy": 0.6737080454826355, "epoch": 0.536117381489842, "grad_norm": 1.398849606513977, "learning_rate": 4.993369292313235e-06, "loss": 0.5669, "mean_token_accuracy": 0.8320559740066529, "num_tokens": 7713151.0, "step": 950 }, { "entropy": 0.6230229139328003, "epoch": 0.5389390519187359, "grad_norm": 1.264159917831421, "learning_rate": 4.993299279534492e-06, "loss": 0.4986, "mean_token_accuracy": 0.8493775367736817, "num_tokens": 7754018.0, "step": 955 }, { "entropy": 0.6292937159538269, "epoch": 0.5417607223476298, "grad_norm": 1.344561219215393, "learning_rate": 4.993228899728113e-06, "loss": 0.5148, "mean_token_accuracy": 0.8411380529403687, "num_tokens": 7794654.0, "step": 960 }, { "entropy": 0.6116863548755646, "epoch": 0.5445823927765236, "grad_norm": 1.1279850006103516, "learning_rate": 4.993158152907923e-06, "loss": 0.4921, "mean_token_accuracy": 0.8529308915138245, "num_tokens": 7835397.0, "step": 965 }, { "entropy": 0.6285845994949341, "epoch": 0.5474040632054176, "grad_norm": 1.1283153295516968, "learning_rate": 4.993087039087823e-06, "loss": 0.4815, "mean_token_accuracy": 0.8528955459594727, "num_tokens": 7876148.0, "step": 970 }, { "entropy": 0.6560633659362793, "epoch": 0.5502257336343115, "grad_norm": 1.213463306427002, "learning_rate": 4.993015558281779e-06, "loss": 0.5345, "mean_token_accuracy": 0.8412879347801209, "num_tokens": 7916823.0, "step": 975 }, { "entropy": 0.6548992276191712, "epoch": 0.5530474040632054, "grad_norm": 1.2896618843078613, "learning_rate": 4.992943710503838e-06, "loss": 0.5147, "mean_token_accuracy": 0.8419336557388306, "num_tokens": 7957342.0, "step": 980 }, { "entropy": 0.6178439378738403, "epoch": 0.5558690744920993, "grad_norm": 1.1932618618011475, "learning_rate": 4.99287149576811e-06, "loss": 0.4988, "mean_token_accuracy": 0.850293779373169, "num_tokens": 7998078.0, "step": 985 }, { "entropy": 0.7176673412322998, "epoch": 0.5586907449209932, "grad_norm": 1.495336651802063, "learning_rate": 4.992798914088786e-06, "loss": 0.5872, "mean_token_accuracy": 0.8254365086555481, "num_tokens": 8038148.0, "step": 990 }, { "entropy": 0.6931470155715942, "epoch": 0.5615124153498872, "grad_norm": 1.343912124633789, "learning_rate": 4.992725965480121e-06, "loss": 0.5707, "mean_token_accuracy": 0.8297663688659668, "num_tokens": 8078718.0, "step": 995 }, { "entropy": 0.7093847036361695, "epoch": 0.5643340857787811, "grad_norm": 1.2880916595458984, "learning_rate": 4.992652649956448e-06, "loss": 0.5708, "mean_token_accuracy": 0.8289087891578675, "num_tokens": 8119465.0, "step": 1000 }, { "epoch": 0.5643340857787811, "eval_entropy": 0.6200407147407532, "eval_loss": 0.49860483407974243, "eval_mean_token_accuracy": 0.8631659746170044, "eval_num_tokens": 8119465.0, "eval_runtime": 0.1639, "eval_samples_per_second": 24.405, "eval_steps_per_second": 6.101, "step": 1000 }, { "entropy": 0.6551719307899475, "epoch": 0.5671557562076749, "grad_norm": 1.444422721862793, "learning_rate": 4.992578967532169e-06, "loss": 0.5098, "mean_token_accuracy": 0.8438852429389954, "num_tokens": 8160136.0, "step": 1005 }, { "entropy": 0.6948068380355835, "epoch": 0.5699774266365688, "grad_norm": 1.4045120477676392, "learning_rate": 4.992504918221759e-06, "loss": 0.578, "mean_token_accuracy": 0.8269060254096985, "num_tokens": 8200865.0, "step": 1010 }, { "entropy": 0.6053695321083069, "epoch": 0.5727990970654627, "grad_norm": 1.2733221054077148, "learning_rate": 4.9924305020397645e-06, "loss": 0.4762, "mean_token_accuracy": 0.8552789926528931, "num_tokens": 8241635.0, "step": 1015 }, { "entropy": 0.6861397624015808, "epoch": 0.5756207674943566, "grad_norm": 1.3532050848007202, "learning_rate": 4.992355719000805e-06, "loss": 0.5672, "mean_token_accuracy": 0.8298478722572327, "num_tokens": 8282118.0, "step": 1020 }, { "entropy": 0.6331191897392273, "epoch": 0.5784424379232506, "grad_norm": 1.2071319818496704, "learning_rate": 4.992280569119574e-06, "loss": 0.5005, "mean_token_accuracy": 0.850217878818512, "num_tokens": 8322734.0, "step": 1025 }, { "entropy": 0.6948285818099975, "epoch": 0.5812641083521445, "grad_norm": 1.375131368637085, "learning_rate": 4.99220505241083e-06, "loss": 0.5409, "mean_token_accuracy": 0.8394317030906677, "num_tokens": 8363311.0, "step": 1030 }, { "entropy": 0.6704018592834473, "epoch": 0.5840857787810384, "grad_norm": 1.373281478881836, "learning_rate": 4.992129168889412e-06, "loss": 0.5439, "mean_token_accuracy": 0.8385064721107482, "num_tokens": 8403747.0, "step": 1035 }, { "entropy": 0.6323675155639649, "epoch": 0.5869074492099323, "grad_norm": 1.2113394737243652, "learning_rate": 4.992052918570226e-06, "loss": 0.495, "mean_token_accuracy": 0.8503919124603272, "num_tokens": 8444647.0, "step": 1040 }, { "entropy": 0.6883712530136108, "epoch": 0.5897291196388262, "grad_norm": 1.329156517982483, "learning_rate": 4.991976301468251e-06, "loss": 0.5507, "mean_token_accuracy": 0.8347142577171326, "num_tokens": 8485437.0, "step": 1045 }, { "entropy": 0.6799022316932678, "epoch": 0.59255079006772, "grad_norm": 1.2880463600158691, "learning_rate": 4.9918993175985384e-06, "loss": 0.5525, "mean_token_accuracy": 0.8357504963874817, "num_tokens": 8526376.0, "step": 1050 }, { "entropy": 0.6652753591537476, "epoch": 0.595372460496614, "grad_norm": 1.3654208183288574, "learning_rate": 4.991821966976213e-06, "loss": 0.5473, "mean_token_accuracy": 0.8353073954582214, "num_tokens": 8567085.0, "step": 1055 }, { "entropy": 0.6146788358688354, "epoch": 0.5981941309255079, "grad_norm": 1.140661597251892, "learning_rate": 4.991744249616469e-06, "loss": 0.4757, "mean_token_accuracy": 0.8558377385139465, "num_tokens": 8607488.0, "step": 1060 }, { "entropy": 0.710556972026825, "epoch": 0.6010158013544018, "grad_norm": 1.3625504970550537, "learning_rate": 4.991666165534575e-06, "loss": 0.5542, "mean_token_accuracy": 0.8333917140960694, "num_tokens": 8648210.0, "step": 1065 }, { "entropy": 0.6810308933258057, "epoch": 0.6038374717832957, "grad_norm": 1.393760323524475, "learning_rate": 4.99158771474587e-06, "loss": 0.5602, "mean_token_accuracy": 0.8344960331916809, "num_tokens": 8688725.0, "step": 1070 }, { "entropy": 0.6881044745445252, "epoch": 0.6066591422121896, "grad_norm": 1.4435288906097412, "learning_rate": 4.991508897265766e-06, "loss": 0.5611, "mean_token_accuracy": 0.8323047876358032, "num_tokens": 8729344.0, "step": 1075 }, { "entropy": 0.6240251541137696, "epoch": 0.6094808126410836, "grad_norm": 1.3459124565124512, "learning_rate": 4.991429713109746e-06, "loss": 0.4968, "mean_token_accuracy": 0.8495769262313843, "num_tokens": 8770034.0, "step": 1080 }, { "entropy": 0.642047894001007, "epoch": 0.6123024830699775, "grad_norm": 1.186833143234253, "learning_rate": 4.991350162293367e-06, "loss": 0.5215, "mean_token_accuracy": 0.8443755626678466, "num_tokens": 8810541.0, "step": 1085 }, { "entropy": 0.6657473087310791, "epoch": 0.6151241534988713, "grad_norm": 1.4160339832305908, "learning_rate": 4.991270244832256e-06, "loss": 0.5373, "mean_token_accuracy": 0.836929440498352, "num_tokens": 8851054.0, "step": 1090 }, { "entropy": 0.664323914051056, "epoch": 0.6179458239277652, "grad_norm": 1.3085633516311646, "learning_rate": 4.9911899607421116e-06, "loss": 0.5336, "mean_token_accuracy": 0.8405984282493592, "num_tokens": 8891919.0, "step": 1095 }, { "entropy": 0.6161608934402466, "epoch": 0.6207674943566591, "grad_norm": 1.2209677696228027, "learning_rate": 4.991109310038707e-06, "loss": 0.4854, "mean_token_accuracy": 0.8494919776916504, "num_tokens": 8932520.0, "step": 1100 }, { "entropy": 0.633518648147583, "epoch": 0.623589164785553, "grad_norm": 1.422237515449524, "learning_rate": 4.991028292737887e-06, "loss": 0.5238, "mean_token_accuracy": 0.8427853345870971, "num_tokens": 8973154.0, "step": 1105 }, { "entropy": 0.7281236052513123, "epoch": 0.626410835214447, "grad_norm": 1.3925809860229492, "learning_rate": 4.990946908855565e-06, "loss": 0.6025, "mean_token_accuracy": 0.8230817198753357, "num_tokens": 9013989.0, "step": 1110 }, { "entropy": 0.6049828886985779, "epoch": 0.6292325056433409, "grad_norm": 1.2714378833770752, "learning_rate": 4.990865158407731e-06, "loss": 0.4823, "mean_token_accuracy": 0.8520822525024414, "num_tokens": 9054405.0, "step": 1115 }, { "entropy": 0.6474563360214234, "epoch": 0.6320541760722348, "grad_norm": 1.3699289560317993, "learning_rate": 4.990783041410444e-06, "loss": 0.5001, "mean_token_accuracy": 0.8497814536094666, "num_tokens": 9095024.0, "step": 1120 }, { "entropy": 0.665639317035675, "epoch": 0.6348758465011287, "grad_norm": 1.32370126247406, "learning_rate": 4.9907005578798366e-06, "loss": 0.5265, "mean_token_accuracy": 0.8440038084983825, "num_tokens": 9135786.0, "step": 1125 }, { "entropy": 0.6320675134658813, "epoch": 0.6376975169300225, "grad_norm": 1.2937510013580322, "learning_rate": 4.990617707832111e-06, "loss": 0.4979, "mean_token_accuracy": 0.8501017332077027, "num_tokens": 9176559.0, "step": 1130 }, { "entropy": 0.7049695611000061, "epoch": 0.6405191873589164, "grad_norm": 1.3041940927505493, "learning_rate": 4.990534491283545e-06, "loss": 0.5719, "mean_token_accuracy": 0.8350239872932435, "num_tokens": 9217381.0, "step": 1135 }, { "entropy": 0.6608424663543702, "epoch": 0.6433408577878104, "grad_norm": 1.3915507793426514, "learning_rate": 4.990450908250485e-06, "loss": 0.5267, "mean_token_accuracy": 0.8423561692237854, "num_tokens": 9258041.0, "step": 1140 }, { "entropy": 0.6109439551830291, "epoch": 0.6461625282167043, "grad_norm": 1.1623317003250122, "learning_rate": 4.990366958749352e-06, "loss": 0.4937, "mean_token_accuracy": 0.8506023287773132, "num_tokens": 9298558.0, "step": 1145 }, { "entropy": 0.6726685404777527, "epoch": 0.6489841986455982, "grad_norm": 1.4480568170547485, "learning_rate": 4.990282642796638e-06, "loss": 0.559, "mean_token_accuracy": 0.8336881279945374, "num_tokens": 9339287.0, "step": 1150 }, { "entropy": 0.6401282668113708, "epoch": 0.6518058690744921, "grad_norm": 1.405791997909546, "learning_rate": 4.9901979604089055e-06, "loss": 0.5005, "mean_token_accuracy": 0.8454666852951049, "num_tokens": 9379965.0, "step": 1155 }, { "entropy": 0.616789698600769, "epoch": 0.654627539503386, "grad_norm": 1.2432202100753784, "learning_rate": 4.990112911602792e-06, "loss": 0.4809, "mean_token_accuracy": 0.8562314510345459, "num_tokens": 9420544.0, "step": 1160 }, { "entropy": 0.6499378085136414, "epoch": 0.65744920993228, "grad_norm": 1.3318426609039307, "learning_rate": 4.990027496395003e-06, "loss": 0.5165, "mean_token_accuracy": 0.8456689953804016, "num_tokens": 9461170.0, "step": 1165 }, { "entropy": 0.6929373741149902, "epoch": 0.6602708803611738, "grad_norm": 1.501207709312439, "learning_rate": 4.989941714802321e-06, "loss": 0.5457, "mean_token_accuracy": 0.8362460851669311, "num_tokens": 9501863.0, "step": 1170 }, { "entropy": 0.6151013791561126, "epoch": 0.6630925507900677, "grad_norm": 1.291373372077942, "learning_rate": 4.989855566841597e-06, "loss": 0.4929, "mean_token_accuracy": 0.8491938352584839, "num_tokens": 9542539.0, "step": 1175 }, { "entropy": 0.6619673609733582, "epoch": 0.6659142212189616, "grad_norm": 1.8319789171218872, "learning_rate": 4.989769052529754e-06, "loss": 0.5406, "mean_token_accuracy": 0.8366817712783814, "num_tokens": 9583209.0, "step": 1180 }, { "entropy": 0.7023379445075989, "epoch": 0.6687358916478555, "grad_norm": 1.4245003461837769, "learning_rate": 4.989682171883789e-06, "loss": 0.5833, "mean_token_accuracy": 0.8261257648468018, "num_tokens": 9623904.0, "step": 1185 }, { "entropy": 0.6219159960746765, "epoch": 0.6715575620767494, "grad_norm": 1.5310255289077759, "learning_rate": 4.9895949249207674e-06, "loss": 0.4825, "mean_token_accuracy": 0.8541441679000854, "num_tokens": 9664582.0, "step": 1190 }, { "entropy": 0.6346824288368225, "epoch": 0.6743792325056434, "grad_norm": 1.3092063665390015, "learning_rate": 4.989507311657832e-06, "loss": 0.5112, "mean_token_accuracy": 0.8455924034118653, "num_tokens": 9705103.0, "step": 1195 }, { "entropy": 0.6392194151878356, "epoch": 0.6772009029345373, "grad_norm": 1.2018376588821411, "learning_rate": 4.9894193321121915e-06, "loss": 0.5101, "mean_token_accuracy": 0.847294807434082, "num_tokens": 9745685.0, "step": 1200 }, { "entropy": 0.7368967413902283, "epoch": 0.6800225733634312, "grad_norm": 1.4050233364105225, "learning_rate": 4.989330986301131e-06, "loss": 0.6216, "mean_token_accuracy": 0.8151282906532288, "num_tokens": 9786144.0, "step": 1205 }, { "entropy": 0.6174101948738098, "epoch": 0.6828442437923251, "grad_norm": 1.3221447467803955, "learning_rate": 4.989242274242007e-06, "loss": 0.4908, "mean_token_accuracy": 0.8495320796966552, "num_tokens": 9826811.0, "step": 1210 }, { "entropy": 0.6422483444213867, "epoch": 0.6856659142212189, "grad_norm": 1.1776975393295288, "learning_rate": 4.989153195952246e-06, "loss": 0.5173, "mean_token_accuracy": 0.8441770195960998, "num_tokens": 9867430.0, "step": 1215 }, { "entropy": 0.6633909225463868, "epoch": 0.6884875846501128, "grad_norm": 1.257130742073059, "learning_rate": 4.989063751449346e-06, "loss": 0.5406, "mean_token_accuracy": 0.8380695223808289, "num_tokens": 9907541.0, "step": 1220 }, { "entropy": 0.6803221344947815, "epoch": 0.6913092550790068, "grad_norm": 1.227628231048584, "learning_rate": 4.98897394075088e-06, "loss": 0.5413, "mean_token_accuracy": 0.8371790289878845, "num_tokens": 9948303.0, "step": 1225 }, { "entropy": 0.6370683073997497, "epoch": 0.6941309255079007, "grad_norm": 1.1942147016525269, "learning_rate": 4.9888837638744915e-06, "loss": 0.5116, "mean_token_accuracy": 0.8441259145736695, "num_tokens": 9989051.0, "step": 1230 }, { "entropy": 0.6529434204101563, "epoch": 0.6969525959367946, "grad_norm": 1.3808612823486328, "learning_rate": 4.988793220837895e-06, "loss": 0.5046, "mean_token_accuracy": 0.8458134651184082, "num_tokens": 10029798.0, "step": 1235 }, { "entropy": 0.6289508223533631, "epoch": 0.6997742663656885, "grad_norm": 1.1567049026489258, "learning_rate": 4.988702311658879e-06, "loss": 0.5103, "mean_token_accuracy": 0.8443259239196778, "num_tokens": 10070451.0, "step": 1240 }, { "entropy": 0.6437416434288025, "epoch": 0.7025959367945824, "grad_norm": 1.3937760591506958, "learning_rate": 4.9886110363553005e-06, "loss": 0.5103, "mean_token_accuracy": 0.846991765499115, "num_tokens": 10111198.0, "step": 1245 }, { "entropy": 0.710420835018158, "epoch": 0.7054176072234764, "grad_norm": 1.3196383714675903, "learning_rate": 4.988519394945092e-06, "loss": 0.5769, "mean_token_accuracy": 0.8278885602951049, "num_tokens": 10151900.0, "step": 1250 }, { "entropy": 0.6752119541168213, "epoch": 0.7082392776523702, "grad_norm": 1.4347295761108398, "learning_rate": 4.988427387446255e-06, "loss": 0.5473, "mean_token_accuracy": 0.8365179777145386, "num_tokens": 10192756.0, "step": 1255 }, { "entropy": 0.6734273076057434, "epoch": 0.7110609480812641, "grad_norm": 1.4099767208099365, "learning_rate": 4.988335013876867e-06, "loss": 0.5686, "mean_token_accuracy": 0.8306289792060852, "num_tokens": 10233478.0, "step": 1260 }, { "entropy": 0.7350514650344848, "epoch": 0.713882618510158, "grad_norm": 1.3657110929489136, "learning_rate": 4.988242274255073e-06, "loss": 0.6019, "mean_token_accuracy": 0.8200275421142578, "num_tokens": 10274135.0, "step": 1265 }, { "entropy": 0.6617794394493103, "epoch": 0.7167042889390519, "grad_norm": 1.4455223083496094, "learning_rate": 4.988149168599092e-06, "loss": 0.5323, "mean_token_accuracy": 0.8396545290946961, "num_tokens": 10314720.0, "step": 1270 }, { "entropy": 0.6266619205474854, "epoch": 0.7195259593679458, "grad_norm": 1.2425928115844727, "learning_rate": 4.988055696927214e-06, "loss": 0.5033, "mean_token_accuracy": 0.8460469722747803, "num_tokens": 10355383.0, "step": 1275 }, { "entropy": 0.6324691534042358, "epoch": 0.7223476297968398, "grad_norm": 1.2948311567306519, "learning_rate": 4.987961859257803e-06, "loss": 0.5149, "mean_token_accuracy": 0.8449645042419434, "num_tokens": 10395935.0, "step": 1280 }, { "entropy": 0.6536452293395996, "epoch": 0.7251693002257337, "grad_norm": 1.3648988008499146, "learning_rate": 4.987867655609292e-06, "loss": 0.5296, "mean_token_accuracy": 0.8420405626296997, "num_tokens": 10436654.0, "step": 1285 }, { "entropy": 0.6121232390403748, "epoch": 0.7279909706546276, "grad_norm": 1.2139304876327515, "learning_rate": 4.987773086000188e-06, "loss": 0.4986, "mean_token_accuracy": 0.8504088521003723, "num_tokens": 10477501.0, "step": 1290 }, { "entropy": 0.6448933601379394, "epoch": 0.7308126410835214, "grad_norm": 1.375567078590393, "learning_rate": 4.987678150449069e-06, "loss": 0.5122, "mean_token_accuracy": 0.8450644254684448, "num_tokens": 10518384.0, "step": 1295 }, { "entropy": 0.6476213932037354, "epoch": 0.7336343115124153, "grad_norm": 1.3577604293823242, "learning_rate": 4.987582848974586e-06, "loss": 0.5053, "mean_token_accuracy": 0.8453767418861389, "num_tokens": 10559194.0, "step": 1300 }, { "entropy": 0.6976300120353699, "epoch": 0.7364559819413092, "grad_norm": 1.3497973680496216, "learning_rate": 4.987487181595459e-06, "loss": 0.5519, "mean_token_accuracy": 0.8350682735443116, "num_tokens": 10599795.0, "step": 1305 }, { "entropy": 0.6458664059638977, "epoch": 0.7392776523702032, "grad_norm": 1.329574465751648, "learning_rate": 4.987391148330485e-06, "loss": 0.5169, "mean_token_accuracy": 0.8440695524215698, "num_tokens": 10640689.0, "step": 1310 }, { "entropy": 0.6906252384185791, "epoch": 0.7420993227990971, "grad_norm": 1.2950578927993774, "learning_rate": 4.987294749198526e-06, "loss": 0.5638, "mean_token_accuracy": 0.8313373565673828, "num_tokens": 10681276.0, "step": 1315 }, { "entropy": 0.6506387591362, "epoch": 0.744920993227991, "grad_norm": 1.5627875328063965, "learning_rate": 4.987197984218522e-06, "loss": 0.491, "mean_token_accuracy": 0.8504640936851502, "num_tokens": 10722061.0, "step": 1320 }, { "entropy": 0.6033547759056092, "epoch": 0.7477426636568849, "grad_norm": 1.4060436487197876, "learning_rate": 4.9871008534094825e-06, "loss": 0.4864, "mean_token_accuracy": 0.8535768985748291, "num_tokens": 10762663.0, "step": 1325 }, { "entropy": 0.6256420493125916, "epoch": 0.7505643340857788, "grad_norm": 1.2653220891952515, "learning_rate": 4.987003356790487e-06, "loss": 0.5119, "mean_token_accuracy": 0.845154881477356, "num_tokens": 10803082.0, "step": 1330 }, { "entropy": 0.627797293663025, "epoch": 0.7533860045146726, "grad_norm": 1.2059917449951172, "learning_rate": 4.986905494380691e-06, "loss": 0.483, "mean_token_accuracy": 0.8541393876075745, "num_tokens": 10843614.0, "step": 1335 }, { "entropy": 0.6704052805900573, "epoch": 0.7562076749435666, "grad_norm": 1.4258085489273071, "learning_rate": 4.986807266199318e-06, "loss": 0.5272, "mean_token_accuracy": 0.8418290138244628, "num_tokens": 10884087.0, "step": 1340 }, { "entropy": 0.7184508085250855, "epoch": 0.7590293453724605, "grad_norm": 1.3789485692977905, "learning_rate": 4.986708672265667e-06, "loss": 0.5641, "mean_token_accuracy": 0.8304919958114624, "num_tokens": 10924781.0, "step": 1345 }, { "entropy": 0.6704004406929016, "epoch": 0.7618510158013544, "grad_norm": 1.290982961654663, "learning_rate": 4.986609712599103e-06, "loss": 0.5249, "mean_token_accuracy": 0.8440407991409302, "num_tokens": 10965534.0, "step": 1350 }, { "entropy": 0.6607297778129577, "epoch": 0.7646726862302483, "grad_norm": 1.294542670249939, "learning_rate": 4.986510387219071e-06, "loss": 0.5247, "mean_token_accuracy": 0.8411241173744202, "num_tokens": 11006238.0, "step": 1355 }, { "entropy": 0.6355977773666381, "epoch": 0.7674943566591422, "grad_norm": 1.3880748748779297, "learning_rate": 4.98641069614508e-06, "loss": 0.5017, "mean_token_accuracy": 0.8476488828659058, "num_tokens": 11046967.0, "step": 1360 }, { "entropy": 0.6151192545890808, "epoch": 0.7703160270880361, "grad_norm": 1.2665696144104004, "learning_rate": 4.9863106393967165e-06, "loss": 0.4792, "mean_token_accuracy": 0.854477858543396, "num_tokens": 11086831.0, "step": 1365 }, { "entropy": 0.6570278882980347, "epoch": 0.7731376975169301, "grad_norm": 1.4179819822311401, "learning_rate": 4.986210216993636e-06, "loss": 0.5266, "mean_token_accuracy": 0.8439712643623352, "num_tokens": 11127372.0, "step": 1370 }, { "entropy": 0.6928212285041809, "epoch": 0.7759593679458239, "grad_norm": 1.3699414730072021, "learning_rate": 4.986109428955566e-06, "loss": 0.5622, "mean_token_accuracy": 0.8342428803443909, "num_tokens": 11167988.0, "step": 1375 }, { "entropy": 0.6448866724967957, "epoch": 0.7787810383747178, "grad_norm": 1.2906067371368408, "learning_rate": 4.986008275302307e-06, "loss": 0.5244, "mean_token_accuracy": 0.8406425952911377, "num_tokens": 11208797.0, "step": 1380 }, { "entropy": 0.6187806010246277, "epoch": 0.7816027088036117, "grad_norm": 1.25465726852417, "learning_rate": 4.98590675605373e-06, "loss": 0.4718, "mean_token_accuracy": 0.8553349137306213, "num_tokens": 11249307.0, "step": 1385 }, { "entropy": 0.6682080864906311, "epoch": 0.7844243792325056, "grad_norm": 1.278944969177246, "learning_rate": 4.98580487122978e-06, "loss": 0.5124, "mean_token_accuracy": 0.8448834180831909, "num_tokens": 11289926.0, "step": 1390 }, { "entropy": 0.6455815672874451, "epoch": 0.7872460496613995, "grad_norm": 1.4049651622772217, "learning_rate": 4.98570262085047e-06, "loss": 0.5146, "mean_token_accuracy": 0.8430341005325317, "num_tokens": 11330704.0, "step": 1395 }, { "entropy": 0.7064637899398803, "epoch": 0.7900677200902935, "grad_norm": 1.3357079029083252, "learning_rate": 4.985600004935889e-06, "loss": 0.565, "mean_token_accuracy": 0.8299904108047486, "num_tokens": 11371389.0, "step": 1400 }, { "entropy": 0.6495133399963379, "epoch": 0.7928893905191874, "grad_norm": 1.6325474977493286, "learning_rate": 4.985497023506195e-06, "loss": 0.5448, "mean_token_accuracy": 0.8373395681381226, "num_tokens": 11411887.0, "step": 1405 }, { "entropy": 0.6633385181427002, "epoch": 0.7957110609480813, "grad_norm": 1.4035420417785645, "learning_rate": 4.985393676581619e-06, "loss": 0.5252, "mean_token_accuracy": 0.8406889915466309, "num_tokens": 11452589.0, "step": 1410 }, { "entropy": 0.6448916912078857, "epoch": 0.7985327313769752, "grad_norm": 1.288009524345398, "learning_rate": 4.985289964182463e-06, "loss": 0.5105, "mean_token_accuracy": 0.8462629556655884, "num_tokens": 11493085.0, "step": 1415 }, { "entropy": 0.5965805172920227, "epoch": 0.801354401805869, "grad_norm": 1.2756577730178833, "learning_rate": 4.985185886329101e-06, "loss": 0.4867, "mean_token_accuracy": 0.8524753332138062, "num_tokens": 11533760.0, "step": 1420 }, { "entropy": 0.6665675163269043, "epoch": 0.804176072234763, "grad_norm": 1.3834420442581177, "learning_rate": 4.985081443041981e-06, "loss": 0.5285, "mean_token_accuracy": 0.8393696427345276, "num_tokens": 11574224.0, "step": 1425 }, { "entropy": 0.6934167623519898, "epoch": 0.8069977426636569, "grad_norm": 1.4740710258483887, "learning_rate": 4.98497663434162e-06, "loss": 0.5623, "mean_token_accuracy": 0.830849575996399, "num_tokens": 11614729.0, "step": 1430 }, { "entropy": 0.6643628239631653, "epoch": 0.8098194130925508, "grad_norm": 1.342612862586975, "learning_rate": 4.984871460248607e-06, "loss": 0.5361, "mean_token_accuracy": 0.840022873878479, "num_tokens": 11655327.0, "step": 1435 }, { "entropy": 0.6764454126358033, "epoch": 0.8126410835214447, "grad_norm": 1.5213992595672607, "learning_rate": 4.984765920783604e-06, "loss": 0.5688, "mean_token_accuracy": 0.8351798415184021, "num_tokens": 11695967.0, "step": 1440 }, { "entropy": 0.6788670063018799, "epoch": 0.8154627539503386, "grad_norm": 1.2199022769927979, "learning_rate": 4.984660015967343e-06, "loss": 0.5592, "mean_token_accuracy": 0.8351686835289002, "num_tokens": 11736108.0, "step": 1445 }, { "entropy": 0.6327372908592224, "epoch": 0.8182844243792325, "grad_norm": 1.4303865432739258, "learning_rate": 4.984553745820631e-06, "loss": 0.505, "mean_token_accuracy": 0.8465925574302673, "num_tokens": 11776854.0, "step": 1450 }, { "entropy": 0.6577797293663025, "epoch": 0.8211060948081265, "grad_norm": 1.3754053115844727, "learning_rate": 4.984447110364343e-06, "loss": 0.526, "mean_token_accuracy": 0.8419236660003662, "num_tokens": 11817612.0, "step": 1455 }, { "entropy": 0.6646520733833313, "epoch": 0.8239277652370203, "grad_norm": 1.260542631149292, "learning_rate": 4.98434010961943e-06, "loss": 0.533, "mean_token_accuracy": 0.8385473966598511, "num_tokens": 11858437.0, "step": 1460 }, { "entropy": 0.6954992532730102, "epoch": 0.8267494356659142, "grad_norm": 1.408316731452942, "learning_rate": 4.9842327436069105e-06, "loss": 0.5601, "mean_token_accuracy": 0.8325203895568848, "num_tokens": 11898873.0, "step": 1465 }, { "entropy": 0.6532683134078979, "epoch": 0.8295711060948081, "grad_norm": 1.1917073726654053, "learning_rate": 4.984125012347876e-06, "loss": 0.5346, "mean_token_accuracy": 0.8390217423439026, "num_tokens": 11939340.0, "step": 1470 }, { "entropy": 0.6418487191200256, "epoch": 0.832392776523702, "grad_norm": 1.243117332458496, "learning_rate": 4.984016915863491e-06, "loss": 0.4966, "mean_token_accuracy": 0.8492981314659118, "num_tokens": 11980119.0, "step": 1475 }, { "entropy": 0.6358325242996216, "epoch": 0.835214446952596, "grad_norm": 1.252548336982727, "learning_rate": 4.983908454174993e-06, "loss": 0.4919, "mean_token_accuracy": 0.8524085760116578, "num_tokens": 12020810.0, "step": 1480 }, { "entropy": 0.6454902291297913, "epoch": 0.8380361173814899, "grad_norm": 1.2286616563796997, "learning_rate": 4.983799627303685e-06, "loss": 0.4958, "mean_token_accuracy": 0.8518875241279602, "num_tokens": 12061121.0, "step": 1485 }, { "entropy": 0.6209899544715881, "epoch": 0.8408577878103838, "grad_norm": 1.2979341745376587, "learning_rate": 4.98369043527095e-06, "loss": 0.5001, "mean_token_accuracy": 0.8472349762916564, "num_tokens": 12101696.0, "step": 1490 }, { "entropy": 0.6519573330879211, "epoch": 0.8436794582392777, "grad_norm": 1.2703050374984741, "learning_rate": 4.9835808780982375e-06, "loss": 0.5115, "mean_token_accuracy": 0.8439043760299683, "num_tokens": 12142285.0, "step": 1495 }, { "entropy": 0.6138962626457214, "epoch": 0.8465011286681715, "grad_norm": 1.2608084678649902, "learning_rate": 4.9834709558070695e-06, "loss": 0.4987, "mean_token_accuracy": 0.8513062238693238, "num_tokens": 12183096.0, "step": 1500 }, { "epoch": 0.8465011286681715, "eval_entropy": 0.6190634965896606, "eval_loss": 0.450931191444397, "eval_mean_token_accuracy": 0.8727481961250305, "eval_num_tokens": 12183096.0, "eval_runtime": 0.1637, "eval_samples_per_second": 24.428, "eval_steps_per_second": 6.107, "step": 1500 }, { "entropy": 0.6505842447280884, "epoch": 0.8493227990970654, "grad_norm": 1.3207699060440063, "learning_rate": 4.983360668419041e-06, "loss": 0.4894, "mean_token_accuracy": 0.8519291162490845, "num_tokens": 12223690.0, "step": 1505 }, { "entropy": 0.664458155632019, "epoch": 0.8521444695259593, "grad_norm": 1.2953647375106812, "learning_rate": 4.983250015955818e-06, "loss": 0.5346, "mean_token_accuracy": 0.8392679691314697, "num_tokens": 12264356.0, "step": 1510 }, { "entropy": 0.6668670177459717, "epoch": 0.8549661399548533, "grad_norm": 1.276229739189148, "learning_rate": 4.983138998439137e-06, "loss": 0.5242, "mean_token_accuracy": 0.8406529307365418, "num_tokens": 12305094.0, "step": 1515 }, { "entropy": 0.7231775879859924, "epoch": 0.8577878103837472, "grad_norm": 1.3792636394500732, "learning_rate": 4.983027615890809e-06, "loss": 0.5792, "mean_token_accuracy": 0.827623724937439, "num_tokens": 12345899.0, "step": 1520 }, { "entropy": 0.6197749614715576, "epoch": 0.8606094808126411, "grad_norm": 1.3481459617614746, "learning_rate": 4.982915868332713e-06, "loss": 0.4994, "mean_token_accuracy": 0.8470370888710022, "num_tokens": 12386462.0, "step": 1525 }, { "entropy": 0.7093599200248718, "epoch": 0.863431151241535, "grad_norm": 1.4356274604797363, "learning_rate": 4.982803755786804e-06, "loss": 0.5867, "mean_token_accuracy": 0.825712525844574, "num_tokens": 12427019.0, "step": 1530 }, { "entropy": 0.7157705545425415, "epoch": 0.8662528216704289, "grad_norm": 1.3810184001922607, "learning_rate": 4.982691278275106e-06, "loss": 0.5696, "mean_token_accuracy": 0.8296974778175354, "num_tokens": 12467761.0, "step": 1535 }, { "entropy": 0.6354712247848511, "epoch": 0.8690744920993227, "grad_norm": 1.4308044910430908, "learning_rate": 4.982578435819714e-06, "loss": 0.5066, "mean_token_accuracy": 0.8471119284629822, "num_tokens": 12508405.0, "step": 1540 }, { "entropy": 0.7147751808166504, "epoch": 0.8718961625282167, "grad_norm": 1.399246096611023, "learning_rate": 4.982465228442797e-06, "loss": 0.5764, "mean_token_accuracy": 0.8266459822654724, "num_tokens": 12549081.0, "step": 1545 }, { "entropy": 0.643587851524353, "epoch": 0.8747178329571106, "grad_norm": 1.2349791526794434, "learning_rate": 4.982351656166595e-06, "loss": 0.5062, "mean_token_accuracy": 0.8451477646827698, "num_tokens": 12589617.0, "step": 1550 }, { "entropy": 0.6347296714782715, "epoch": 0.8775395033860045, "grad_norm": 1.2148078680038452, "learning_rate": 4.982237719013418e-06, "loss": 0.5073, "mean_token_accuracy": 0.8488546848297119, "num_tokens": 12629403.0, "step": 1555 }, { "entropy": 0.6973551154136658, "epoch": 0.8803611738148984, "grad_norm": 1.2895756959915161, "learning_rate": 4.982123417005651e-06, "loss": 0.561, "mean_token_accuracy": 0.8330285906791687, "num_tokens": 12670150.0, "step": 1560 }, { "entropy": 0.7544346213340759, "epoch": 0.8831828442437923, "grad_norm": 1.3327959775924683, "learning_rate": 4.982008750165746e-06, "loss": 0.615, "mean_token_accuracy": 0.818334436416626, "num_tokens": 12710858.0, "step": 1565 }, { "entropy": 0.7033024072647095, "epoch": 0.8860045146726863, "grad_norm": 1.3447669744491577, "learning_rate": 4.981893718516231e-06, "loss": 0.5641, "mean_token_accuracy": 0.8303789258003235, "num_tokens": 12751450.0, "step": 1570 }, { "entropy": 0.6676825523376465, "epoch": 0.8888261851015802, "grad_norm": 1.3791662454605103, "learning_rate": 4.981778322079704e-06, "loss": 0.5194, "mean_token_accuracy": 0.8445156216621399, "num_tokens": 12792048.0, "step": 1575 }, { "entropy": 0.6464922666549683, "epoch": 0.891647855530474, "grad_norm": 1.2325254678726196, "learning_rate": 4.981662560878835e-06, "loss": 0.5236, "mean_token_accuracy": 0.8373061299324036, "num_tokens": 12832714.0, "step": 1580 }, { "entropy": 0.6744041681289673, "epoch": 0.8944695259593679, "grad_norm": 1.4298688173294067, "learning_rate": 4.981546434936363e-06, "loss": 0.5531, "mean_token_accuracy": 0.8334176301956177, "num_tokens": 12873204.0, "step": 1585 }, { "entropy": 0.5900899410247803, "epoch": 0.8972911963882618, "grad_norm": 1.396270513534546, "learning_rate": 4.981429944275103e-06, "loss": 0.4826, "mean_token_accuracy": 0.8552528858184815, "num_tokens": 12912731.0, "step": 1590 }, { "entropy": 0.6489648938179016, "epoch": 0.9001128668171557, "grad_norm": 1.2255243062973022, "learning_rate": 4.981313088917939e-06, "loss": 0.5231, "mean_token_accuracy": 0.8412474513053894, "num_tokens": 12953377.0, "step": 1595 }, { "entropy": 0.7049468755722046, "epoch": 0.9029345372460497, "grad_norm": 1.6258138418197632, "learning_rate": 4.9811958688878274e-06, "loss": 0.5564, "mean_token_accuracy": 0.8347102165222168, "num_tokens": 12994028.0, "step": 1600 }, { "entropy": 0.6801372170448303, "epoch": 0.9057562076749436, "grad_norm": 1.2854009866714478, "learning_rate": 4.981078284207797e-06, "loss": 0.5349, "mean_token_accuracy": 0.8424756646156311, "num_tokens": 13034671.0, "step": 1605 }, { "entropy": 0.6932690858840942, "epoch": 0.9085778781038375, "grad_norm": 1.4718157052993774, "learning_rate": 4.980960334900945e-06, "loss": 0.5371, "mean_token_accuracy": 0.8399105072021484, "num_tokens": 13075220.0, "step": 1610 }, { "entropy": 0.6554085493087769, "epoch": 0.9113995485327314, "grad_norm": 1.1461048126220703, "learning_rate": 4.980842020990444e-06, "loss": 0.5251, "mean_token_accuracy": 0.8404485821723938, "num_tokens": 13115849.0, "step": 1615 }, { "entropy": 0.6477279543876648, "epoch": 0.9142212189616253, "grad_norm": 1.2576565742492676, "learning_rate": 4.980723342499538e-06, "loss": 0.5306, "mean_token_accuracy": 0.8431344985961914, "num_tokens": 13156403.0, "step": 1620 }, { "entropy": 0.6991305470466613, "epoch": 0.9170428893905191, "grad_norm": 1.3756641149520874, "learning_rate": 4.9806042994515395e-06, "loss": 0.5567, "mean_token_accuracy": 0.8329641699790955, "num_tokens": 13197015.0, "step": 1625 }, { "entropy": 0.6869478225708008, "epoch": 0.9198645598194131, "grad_norm": 1.265496015548706, "learning_rate": 4.980484891869835e-06, "loss": 0.5361, "mean_token_accuracy": 0.8395046353340149, "num_tokens": 13237500.0, "step": 1630 }, { "entropy": 0.6620127320289612, "epoch": 0.922686230248307, "grad_norm": 1.432976484298706, "learning_rate": 4.980365119777882e-06, "loss": 0.5251, "mean_token_accuracy": 0.8438527345657348, "num_tokens": 13277239.0, "step": 1635 }, { "entropy": 0.7288511991500854, "epoch": 0.9255079006772009, "grad_norm": 1.485297679901123, "learning_rate": 4.980244983199211e-06, "loss": 0.5528, "mean_token_accuracy": 0.8336746454238891, "num_tokens": 13317707.0, "step": 1640 }, { "entropy": 0.7077333211898804, "epoch": 0.9283295711060948, "grad_norm": 1.3443304300308228, "learning_rate": 4.9801244821574216e-06, "loss": 0.5556, "mean_token_accuracy": 0.8335135579109192, "num_tokens": 13358245.0, "step": 1645 }, { "entropy": 0.6646490931510926, "epoch": 0.9311512415349887, "grad_norm": 1.2345566749572754, "learning_rate": 4.9800036166761866e-06, "loss": 0.5416, "mean_token_accuracy": 0.8396414875984192, "num_tokens": 13398838.0, "step": 1650 }, { "entropy": 0.6034572184085846, "epoch": 0.9339729119638827, "grad_norm": 1.2768057584762573, "learning_rate": 4.979882386779249e-06, "loss": 0.463, "mean_token_accuracy": 0.8570885419845581, "num_tokens": 13439472.0, "step": 1655 }, { "entropy": 0.6616186738014221, "epoch": 0.9367945823927766, "grad_norm": 1.2942156791687012, "learning_rate": 4.979760792490426e-06, "loss": 0.5432, "mean_token_accuracy": 0.8383080959320068, "num_tokens": 13480194.0, "step": 1660 }, { "entropy": 0.6370395302772522, "epoch": 0.9396162528216704, "grad_norm": 1.384143352508545, "learning_rate": 4.979638833833604e-06, "loss": 0.4945, "mean_token_accuracy": 0.8490473747253418, "num_tokens": 13520923.0, "step": 1665 }, { "entropy": 0.648089063167572, "epoch": 0.9424379232505643, "grad_norm": 1.377890706062317, "learning_rate": 4.979516510832743e-06, "loss": 0.5125, "mean_token_accuracy": 0.8464336395263672, "num_tokens": 13561752.0, "step": 1670 }, { "entropy": 0.7131257057189941, "epoch": 0.9452595936794582, "grad_norm": 1.4829003810882568, "learning_rate": 4.979393823511871e-06, "loss": 0.5744, "mean_token_accuracy": 0.8287210106849671, "num_tokens": 13602311.0, "step": 1675 }, { "entropy": 0.7149917125701905, "epoch": 0.9480812641083521, "grad_norm": 1.355993390083313, "learning_rate": 4.979270771895093e-06, "loss": 0.5873, "mean_token_accuracy": 0.827799940109253, "num_tokens": 13642473.0, "step": 1680 }, { "entropy": 0.6665526628494263, "epoch": 0.9509029345372461, "grad_norm": 1.222419023513794, "learning_rate": 4.979147356006579e-06, "loss": 0.5285, "mean_token_accuracy": 0.8411856889724731, "num_tokens": 13683117.0, "step": 1685 }, { "entropy": 0.6773181915283203, "epoch": 0.95372460496614, "grad_norm": 1.3376954793930054, "learning_rate": 4.979023575870577e-06, "loss": 0.5327, "mean_token_accuracy": 0.8385852217674256, "num_tokens": 13723981.0, "step": 1690 }, { "entropy": 0.5895337462425232, "epoch": 0.9565462753950339, "grad_norm": 1.189743995666504, "learning_rate": 4.978899431511401e-06, "loss": 0.4602, "mean_token_accuracy": 0.8576975584030151, "num_tokens": 13764383.0, "step": 1695 }, { "entropy": 0.6576069235801697, "epoch": 0.9593679458239278, "grad_norm": 1.3030279874801636, "learning_rate": 4.978774922953442e-06, "loss": 0.5137, "mean_token_accuracy": 0.847059428691864, "num_tokens": 13805055.0, "step": 1700 }, { "entropy": 0.6367745637893677, "epoch": 0.9621896162528216, "grad_norm": 1.2039754390716553, "learning_rate": 4.978650050221159e-06, "loss": 0.5165, "mean_token_accuracy": 0.8448570847511292, "num_tokens": 13845904.0, "step": 1705 }, { "entropy": 0.6365428566932678, "epoch": 0.9650112866817155, "grad_norm": 1.157940149307251, "learning_rate": 4.978524813339082e-06, "loss": 0.5239, "mean_token_accuracy": 0.8424256920814515, "num_tokens": 13886682.0, "step": 1710 }, { "entropy": 0.66874258518219, "epoch": 0.9678329571106095, "grad_norm": 1.4614887237548828, "learning_rate": 4.978399212331814e-06, "loss": 0.5284, "mean_token_accuracy": 0.8407912611961365, "num_tokens": 13927181.0, "step": 1715 }, { "entropy": 0.643929636478424, "epoch": 0.9706546275395034, "grad_norm": 1.3897498846054077, "learning_rate": 4.97827324722403e-06, "loss": 0.5091, "mean_token_accuracy": 0.8482011795043946, "num_tokens": 13967885.0, "step": 1720 }, { "entropy": 0.6536713361740112, "epoch": 0.9734762979683973, "grad_norm": 1.3549083471298218, "learning_rate": 4.978146918040476e-06, "loss": 0.5065, "mean_token_accuracy": 0.8463730335235595, "num_tokens": 14008407.0, "step": 1725 }, { "entropy": 0.6258985161781311, "epoch": 0.9762979683972912, "grad_norm": 1.2369407415390015, "learning_rate": 4.97802022480597e-06, "loss": 0.4787, "mean_token_accuracy": 0.8545310378074646, "num_tokens": 14048919.0, "step": 1730 }, { "entropy": 0.6749670505523682, "epoch": 0.9791196388261851, "grad_norm": 1.5410014390945435, "learning_rate": 4.977893167545398e-06, "loss": 0.5411, "mean_token_accuracy": 0.8386905789375305, "num_tokens": 14089574.0, "step": 1735 }, { "entropy": 0.6478506565093994, "epoch": 0.981941309255079, "grad_norm": 1.3205047845840454, "learning_rate": 4.977765746283724e-06, "loss": 0.5138, "mean_token_accuracy": 0.8434589385986329, "num_tokens": 14130308.0, "step": 1740 }, { "entropy": 0.6606280326843261, "epoch": 0.9847629796839729, "grad_norm": 1.2447839975357056, "learning_rate": 4.977637961045977e-06, "loss": 0.5197, "mean_token_accuracy": 0.8446960568428039, "num_tokens": 14171031.0, "step": 1745 }, { "entropy": 0.6999424219131469, "epoch": 0.9875846501128668, "grad_norm": 1.2505875825881958, "learning_rate": 4.977509811857263e-06, "loss": 0.5772, "mean_token_accuracy": 0.828137469291687, "num_tokens": 14211646.0, "step": 1750 }, { "entropy": 0.6732224225997925, "epoch": 0.9904063205417607, "grad_norm": 1.4964361190795898, "learning_rate": 4.977381298742754e-06, "loss": 0.5528, "mean_token_accuracy": 0.8329331874847412, "num_tokens": 14252319.0, "step": 1755 }, { "entropy": 0.6761491894721985, "epoch": 0.9932279909706546, "grad_norm": 1.426883339881897, "learning_rate": 4.977252421727699e-06, "loss": 0.5398, "mean_token_accuracy": 0.8371304512023926, "num_tokens": 14293055.0, "step": 1760 }, { "entropy": 0.6125996708869934, "epoch": 0.9960496613995485, "grad_norm": 1.3476899862289429, "learning_rate": 4.977123180837416e-06, "loss": 0.4946, "mean_token_accuracy": 0.8501678228378295, "num_tokens": 14333762.0, "step": 1765 }, { "entropy": 0.6156837463378906, "epoch": 0.9988713318284425, "grad_norm": 1.3276349306106567, "learning_rate": 4.976993576097292e-06, "loss": 0.5002, "mean_token_accuracy": 0.8468546986579895, "num_tokens": 14374541.0, "step": 1770 }, { "entropy": 0.5651689410209656, "epoch": 1.0016930022573363, "grad_norm": 0.9538065195083618, "learning_rate": 4.97686360753279e-06, "loss": 0.4177, "mean_token_accuracy": 0.8691263794898987, "num_tokens": 14409063.0, "step": 1775 }, { "entropy": 0.6223705530166626, "epoch": 1.0045146726862302, "grad_norm": 1.2451618909835815, "learning_rate": 4.976733275169441e-06, "loss": 0.438, "mean_token_accuracy": 0.8648397326469421, "num_tokens": 14449840.0, "step": 1780 }, { "entropy": 0.5574550032615662, "epoch": 1.007336343115124, "grad_norm": 1.2733477354049683, "learning_rate": 4.976602579032849e-06, "loss": 0.4043, "mean_token_accuracy": 0.8714008688926697, "num_tokens": 14490449.0, "step": 1785 }, { "entropy": 0.5262600839138031, "epoch": 1.010158013544018, "grad_norm": 1.3276273012161255, "learning_rate": 4.976471519148691e-06, "loss": 0.4036, "mean_token_accuracy": 0.8747005462646484, "num_tokens": 14531107.0, "step": 1790 }, { "entropy": 0.49118422865867617, "epoch": 1.012979683972912, "grad_norm": 1.2698107957839966, "learning_rate": 4.976340095542711e-06, "loss": 0.3685, "mean_token_accuracy": 0.8826399683952332, "num_tokens": 14571891.0, "step": 1795 }, { "entropy": 0.48148898482322694, "epoch": 1.0158013544018059, "grad_norm": 1.3123940229415894, "learning_rate": 4.97620830824073e-06, "loss": 0.3673, "mean_token_accuracy": 0.8846578359603882, "num_tokens": 14612632.0, "step": 1800 }, { "entropy": 0.5638070046901703, "epoch": 1.0186230248306998, "grad_norm": 1.2908910512924194, "learning_rate": 4.976076157268636e-06, "loss": 0.4132, "mean_token_accuracy": 0.8727042317390442, "num_tokens": 14653317.0, "step": 1805 }, { "entropy": 0.5422739446163177, "epoch": 1.0214446952595937, "grad_norm": 1.5749247074127197, "learning_rate": 4.975943642652389e-06, "loss": 0.4113, "mean_token_accuracy": 0.8708185434341431, "num_tokens": 14693958.0, "step": 1810 }, { "entropy": 0.5249835729599, "epoch": 1.0242663656884876, "grad_norm": 1.3571456670761108, "learning_rate": 4.975810764418023e-06, "loss": 0.3962, "mean_token_accuracy": 0.8773154497146607, "num_tokens": 14734520.0, "step": 1815 }, { "entropy": 0.4799295425415039, "epoch": 1.0270880361173815, "grad_norm": 1.2057713270187378, "learning_rate": 4.975677522591642e-06, "loss": 0.3698, "mean_token_accuracy": 0.881498396396637, "num_tokens": 14775267.0, "step": 1820 }, { "entropy": 0.5456785798072815, "epoch": 1.0299097065462754, "grad_norm": 1.4665296077728271, "learning_rate": 4.975543917199422e-06, "loss": 0.4268, "mean_token_accuracy": 0.8673960208892822, "num_tokens": 14816106.0, "step": 1825 }, { "entropy": 0.540210634469986, "epoch": 1.0327313769751694, "grad_norm": 1.2391974925994873, "learning_rate": 4.975409948267608e-06, "loss": 0.416, "mean_token_accuracy": 0.8694531202316285, "num_tokens": 14856910.0, "step": 1830 }, { "entropy": 0.5396132886409759, "epoch": 1.0355530474040633, "grad_norm": 1.2638976573944092, "learning_rate": 4.97527561582252e-06, "loss": 0.4162, "mean_token_accuracy": 0.8690710544586182, "num_tokens": 14897584.0, "step": 1835 }, { "entropy": 0.5132744669914245, "epoch": 1.0383747178329572, "grad_norm": 1.2913990020751953, "learning_rate": 4.975140919890546e-06, "loss": 0.4015, "mean_token_accuracy": 0.873302161693573, "num_tokens": 14938181.0, "step": 1840 }, { "entropy": 0.5024015605449677, "epoch": 1.041196388261851, "grad_norm": 1.1151163578033447, "learning_rate": 4.975005860498148e-06, "loss": 0.3876, "mean_token_accuracy": 0.8748369932174682, "num_tokens": 14978884.0, "step": 1845 }, { "entropy": 0.513664311170578, "epoch": 1.0440180586907448, "grad_norm": 1.298097014427185, "learning_rate": 4.974870437671858e-06, "loss": 0.4012, "mean_token_accuracy": 0.8729772806167603, "num_tokens": 15019646.0, "step": 1850 }, { "entropy": 0.5189130663871765, "epoch": 1.0468397291196387, "grad_norm": 1.3805726766586304, "learning_rate": 4.97473465143828e-06, "loss": 0.405, "mean_token_accuracy": 0.8714963912963867, "num_tokens": 15060355.0, "step": 1855 }, { "entropy": 0.5300206661224365, "epoch": 1.0496613995485327, "grad_norm": 1.3854206800460815, "learning_rate": 4.9745985018240895e-06, "loss": 0.4155, "mean_token_accuracy": 0.8686167359352112, "num_tokens": 15101164.0, "step": 1860 }, { "entropy": 0.49498083591461184, "epoch": 1.0524830699774266, "grad_norm": 1.3298227787017822, "learning_rate": 4.974461988856033e-06, "loss": 0.3802, "mean_token_accuracy": 0.8796196699142456, "num_tokens": 15141937.0, "step": 1865 }, { "entropy": 0.5611091613769531, "epoch": 1.0553047404063205, "grad_norm": 1.3966474533081055, "learning_rate": 4.974325112560928e-06, "loss": 0.4562, "mean_token_accuracy": 0.8588791251182556, "num_tokens": 15182669.0, "step": 1870 }, { "entropy": 0.5468938708305359, "epoch": 1.0581264108352144, "grad_norm": 1.2755643129348755, "learning_rate": 4.974187872965665e-06, "loss": 0.4084, "mean_token_accuracy": 0.8714737057685852, "num_tokens": 15223336.0, "step": 1875 }, { "entropy": 0.5215595126152038, "epoch": 1.0609480812641083, "grad_norm": 1.296976923942566, "learning_rate": 4.974050270097203e-06, "loss": 0.378, "mean_token_accuracy": 0.8788340568542481, "num_tokens": 15264013.0, "step": 1880 }, { "entropy": 0.5012482941150666, "epoch": 1.0637697516930023, "grad_norm": 1.3608322143554688, "learning_rate": 4.973912303982575e-06, "loss": 0.3661, "mean_token_accuracy": 0.8823076605796814, "num_tokens": 15304788.0, "step": 1885 }, { "entropy": 0.5157119750976562, "epoch": 1.0665914221218962, "grad_norm": 1.3526833057403564, "learning_rate": 4.973773974648885e-06, "loss": 0.4074, "mean_token_accuracy": 0.8713153243064881, "num_tokens": 15345095.0, "step": 1890 }, { "entropy": 0.5500873923301697, "epoch": 1.06941309255079, "grad_norm": 1.2834185361862183, "learning_rate": 4.973635282123308e-06, "loss": 0.4127, "mean_token_accuracy": 0.8706364631652832, "num_tokens": 15385312.0, "step": 1895 }, { "entropy": 0.5158918261528015, "epoch": 1.072234762979684, "grad_norm": 1.3259310722351074, "learning_rate": 4.973496226433089e-06, "loss": 0.395, "mean_token_accuracy": 0.8755181312561036, "num_tokens": 15426061.0, "step": 1900 }, { "entropy": 0.538950902223587, "epoch": 1.075056433408578, "grad_norm": 1.3279820680618286, "learning_rate": 4.973356807605546e-06, "loss": 0.4104, "mean_token_accuracy": 0.8712123036384583, "num_tokens": 15466630.0, "step": 1905 }, { "entropy": 0.49377835392951963, "epoch": 1.0778781038374718, "grad_norm": 1.1958774328231812, "learning_rate": 4.973217025668068e-06, "loss": 0.379, "mean_token_accuracy": 0.8779303431510925, "num_tokens": 15507113.0, "step": 1910 }, { "entropy": 0.5405571818351745, "epoch": 1.0806997742663658, "grad_norm": 1.2505043745040894, "learning_rate": 4.973076880648115e-06, "loss": 0.4102, "mean_token_accuracy": 0.8709590196609497, "num_tokens": 15547424.0, "step": 1915 }, { "entropy": 0.5551674902439118, "epoch": 1.0835214446952597, "grad_norm": 1.366829514503479, "learning_rate": 4.972936372573218e-06, "loss": 0.45, "mean_token_accuracy": 0.8604497313499451, "num_tokens": 15588125.0, "step": 1920 }, { "entropy": 0.5369654476642609, "epoch": 1.0863431151241536, "grad_norm": 1.367258906364441, "learning_rate": 4.972795501470981e-06, "loss": 0.4292, "mean_token_accuracy": 0.8665614485740661, "num_tokens": 15628472.0, "step": 1925 }, { "entropy": 0.5272303640842437, "epoch": 1.0891647855530473, "grad_norm": 1.4138174057006836, "learning_rate": 4.972654267369078e-06, "loss": 0.4193, "mean_token_accuracy": 0.8680860280990601, "num_tokens": 15669129.0, "step": 1930 }, { "entropy": 0.5395491600036622, "epoch": 1.0919864559819412, "grad_norm": 1.3633450269699097, "learning_rate": 4.972512670295253e-06, "loss": 0.4003, "mean_token_accuracy": 0.8737997651100159, "num_tokens": 15709941.0, "step": 1935 }, { "entropy": 0.5212275326251984, "epoch": 1.0948081264108351, "grad_norm": 1.37870454788208, "learning_rate": 4.9723707102773235e-06, "loss": 0.3997, "mean_token_accuracy": 0.8710021495819091, "num_tokens": 15750468.0, "step": 1940 }, { "entropy": 0.533467584848404, "epoch": 1.097629796839729, "grad_norm": 1.5006767511367798, "learning_rate": 4.972228387343179e-06, "loss": 0.4071, "mean_token_accuracy": 0.8726310014724732, "num_tokens": 15791223.0, "step": 1945 }, { "entropy": 0.5062848865985871, "epoch": 1.100451467268623, "grad_norm": 1.1996793746948242, "learning_rate": 4.972085701520777e-06, "loss": 0.3961, "mean_token_accuracy": 0.8720014691352844, "num_tokens": 15831746.0, "step": 1950 }, { "entropy": 0.5338364660739898, "epoch": 1.103273137697517, "grad_norm": 1.4911237955093384, "learning_rate": 4.971942652838149e-06, "loss": 0.4187, "mean_token_accuracy": 0.8707549214363098, "num_tokens": 15872459.0, "step": 1955 }, { "entropy": 0.5420314610004425, "epoch": 1.1060948081264108, "grad_norm": 1.4543707370758057, "learning_rate": 4.971799241323397e-06, "loss": 0.4279, "mean_token_accuracy": 0.8651981711387634, "num_tokens": 15913132.0, "step": 1960 }, { "entropy": 0.533877170085907, "epoch": 1.1089164785553047, "grad_norm": 1.4339110851287842, "learning_rate": 4.971655467004693e-06, "loss": 0.4169, "mean_token_accuracy": 0.8666856288909912, "num_tokens": 15953596.0, "step": 1965 }, { "entropy": 0.525646859407425, "epoch": 1.1117381489841986, "grad_norm": 1.370369791984558, "learning_rate": 4.971511329910283e-06, "loss": 0.4201, "mean_token_accuracy": 0.8674192786216736, "num_tokens": 15994327.0, "step": 1970 }, { "entropy": 0.5182268440723419, "epoch": 1.1145598194130926, "grad_norm": 1.2392444610595703, "learning_rate": 4.971366830068483e-06, "loss": 0.3987, "mean_token_accuracy": 0.8725781679153443, "num_tokens": 16035039.0, "step": 1975 }, { "entropy": 0.5197380304336547, "epoch": 1.1173814898419865, "grad_norm": 1.3075342178344727, "learning_rate": 4.971221967507679e-06, "loss": 0.4074, "mean_token_accuracy": 0.8703087091445922, "num_tokens": 16075828.0, "step": 1980 }, { "entropy": 0.5025268793106079, "epoch": 1.1202031602708804, "grad_norm": 1.4279546737670898, "learning_rate": 4.9710767422563285e-06, "loss": 0.3803, "mean_token_accuracy": 0.8805047512054444, "num_tokens": 16116500.0, "step": 1985 }, { "entropy": 0.533239609003067, "epoch": 1.1230248306997743, "grad_norm": 1.2557505369186401, "learning_rate": 4.970931154342963e-06, "loss": 0.4162, "mean_token_accuracy": 0.8681887149810791, "num_tokens": 16157026.0, "step": 1990 }, { "entropy": 0.5435242295265198, "epoch": 1.1258465011286682, "grad_norm": 1.3608372211456299, "learning_rate": 4.970785203796182e-06, "loss": 0.4237, "mean_token_accuracy": 0.8656893134117126, "num_tokens": 16197572.0, "step": 1995 }, { "entropy": 0.48926340937614443, "epoch": 1.1286681715575622, "grad_norm": 1.314292311668396, "learning_rate": 4.970638890644658e-06, "loss": 0.3675, "mean_token_accuracy": 0.8834827184677124, "num_tokens": 16238388.0, "step": 2000 }, { "epoch": 1.1286681715575622, "eval_entropy": 0.5074299573898315, "eval_loss": 0.3934297561645508, "eval_mean_token_accuracy": 0.8853967189788818, "eval_num_tokens": 16238388.0, "eval_runtime": 0.1639, "eval_samples_per_second": 24.402, "eval_steps_per_second": 6.1, "step": 2000 }, { "entropy": 0.5785790205001831, "epoch": 1.1314898419864559, "grad_norm": 1.5585596561431885, "learning_rate": 4.970492214917133e-06, "loss": 0.4792, "mean_token_accuracy": 0.8502647757530213, "num_tokens": 16279114.0, "step": 2005 }, { "entropy": 0.5156497418880462, "epoch": 1.13431151241535, "grad_norm": 1.2416678667068481, "learning_rate": 4.970345176642424e-06, "loss": 0.409, "mean_token_accuracy": 0.8698260307312011, "num_tokens": 16319792.0, "step": 2010 }, { "entropy": 0.50312060713768, "epoch": 1.1371331828442437, "grad_norm": 1.343417763710022, "learning_rate": 4.9701977758494135e-06, "loss": 0.402, "mean_token_accuracy": 0.8738492012023926, "num_tokens": 16360729.0, "step": 2015 }, { "entropy": 0.47664583325386045, "epoch": 1.1399548532731376, "grad_norm": 1.3557952642440796, "learning_rate": 4.970050012567061e-06, "loss": 0.3691, "mean_token_accuracy": 0.8808457016944885, "num_tokens": 16401565.0, "step": 2020 }, { "entropy": 0.5471422076225281, "epoch": 1.1427765237020315, "grad_norm": 1.6078767776489258, "learning_rate": 4.969901886824394e-06, "loss": 0.4408, "mean_token_accuracy": 0.861269211769104, "num_tokens": 16442011.0, "step": 2025 }, { "entropy": 0.500029307603836, "epoch": 1.1455981941309255, "grad_norm": 1.2382087707519531, "learning_rate": 4.969753398650511e-06, "loss": 0.3831, "mean_token_accuracy": 0.8787751078605652, "num_tokens": 16482704.0, "step": 2030 }, { "entropy": 0.5860454142093658, "epoch": 1.1484198645598194, "grad_norm": 1.2965975999832153, "learning_rate": 4.969604548074583e-06, "loss": 0.4537, "mean_token_accuracy": 0.858244788646698, "num_tokens": 16523275.0, "step": 2035 }, { "entropy": 0.5472402393817901, "epoch": 1.1512415349887133, "grad_norm": 1.347503900527954, "learning_rate": 4.969455335125852e-06, "loss": 0.4212, "mean_token_accuracy": 0.8660460352897644, "num_tokens": 16563975.0, "step": 2040 }, { "entropy": 0.5726926565170288, "epoch": 1.1540632054176072, "grad_norm": 1.3004218339920044, "learning_rate": 4.969305759833631e-06, "loss": 0.4574, "mean_token_accuracy": 0.8575977206230163, "num_tokens": 16604607.0, "step": 2045 }, { "entropy": 0.554759293794632, "epoch": 1.1568848758465011, "grad_norm": 1.1861565113067627, "learning_rate": 4.969155822227304e-06, "loss": 0.4342, "mean_token_accuracy": 0.8664988040924072, "num_tokens": 16645243.0, "step": 2050 }, { "entropy": 0.5388511419296265, "epoch": 1.159706546275395, "grad_norm": 1.6149320602416992, "learning_rate": 4.969005522336324e-06, "loss": 0.4337, "mean_token_accuracy": 0.8644424319267273, "num_tokens": 16685870.0, "step": 2055 }, { "entropy": 0.5282904028892517, "epoch": 1.162528216704289, "grad_norm": 1.2274096012115479, "learning_rate": 4.968854860190222e-06, "loss": 0.427, "mean_token_accuracy": 0.8676736235618592, "num_tokens": 16726814.0, "step": 2060 }, { "entropy": 0.5028113007545472, "epoch": 1.1653498871331829, "grad_norm": 1.2310305833816528, "learning_rate": 4.9687038358185904e-06, "loss": 0.3945, "mean_token_accuracy": 0.8749886631965638, "num_tokens": 16767430.0, "step": 2065 }, { "entropy": 0.5054876983165741, "epoch": 1.1681715575620768, "grad_norm": 1.6121010780334473, "learning_rate": 4.968552449251103e-06, "loss": 0.4005, "mean_token_accuracy": 0.8731933832168579, "num_tokens": 16808070.0, "step": 2070 }, { "entropy": 0.510890108346939, "epoch": 1.1709932279909707, "grad_norm": 1.212408185005188, "learning_rate": 4.968400700517496e-06, "loss": 0.402, "mean_token_accuracy": 0.8713488578796387, "num_tokens": 16848790.0, "step": 2075 }, { "entropy": 0.5575174331665039, "epoch": 1.1738148984198646, "grad_norm": 1.3433505296707153, "learning_rate": 4.968248589647582e-06, "loss": 0.4241, "mean_token_accuracy": 0.8673989772796631, "num_tokens": 16888705.0, "step": 2080 }, { "entropy": 0.4738758623600006, "epoch": 1.1766365688487586, "grad_norm": 1.337335467338562, "learning_rate": 4.968096116671243e-06, "loss": 0.3633, "mean_token_accuracy": 0.8840928077697754, "num_tokens": 16929128.0, "step": 2085 }, { "entropy": 0.5100393950939178, "epoch": 1.1794582392776523, "grad_norm": 1.4914759397506714, "learning_rate": 4.9679432816184316e-06, "loss": 0.4018, "mean_token_accuracy": 0.8735705256462097, "num_tokens": 16969800.0, "step": 2090 }, { "entropy": 0.5905535221099854, "epoch": 1.1822799097065464, "grad_norm": 1.579995036125183, "learning_rate": 4.967790084519174e-06, "loss": 0.4902, "mean_token_accuracy": 0.850010359287262, "num_tokens": 17010171.0, "step": 2095 }, { "entropy": 0.5488203287124633, "epoch": 1.18510158013544, "grad_norm": 1.3214325904846191, "learning_rate": 4.9676365254035645e-06, "loss": 0.4295, "mean_token_accuracy": 0.8675566554069519, "num_tokens": 17050042.0, "step": 2100 }, { "entropy": 0.49664353728294375, "epoch": 1.187923250564334, "grad_norm": 1.3970224857330322, "learning_rate": 4.96748260430177e-06, "loss": 0.3776, "mean_token_accuracy": 0.8784841656684875, "num_tokens": 17090848.0, "step": 2105 }, { "entropy": 0.5250966966152191, "epoch": 1.190744920993228, "grad_norm": 1.3193445205688477, "learning_rate": 4.967328321244028e-06, "loss": 0.4011, "mean_token_accuracy": 0.8732633233070374, "num_tokens": 17131555.0, "step": 2110 }, { "entropy": 0.5157673597335816, "epoch": 1.1935665914221218, "grad_norm": 1.2861922979354858, "learning_rate": 4.967173676260648e-06, "loss": 0.4074, "mean_token_accuracy": 0.873046362400055, "num_tokens": 17172227.0, "step": 2115 }, { "entropy": 0.5405256807804107, "epoch": 1.1963882618510158, "grad_norm": 1.6772886514663696, "learning_rate": 4.96701866938201e-06, "loss": 0.4271, "mean_token_accuracy": 0.8637170910835266, "num_tokens": 17212727.0, "step": 2120 }, { "entropy": 0.5377147376537323, "epoch": 1.1992099322799097, "grad_norm": 1.3887752294540405, "learning_rate": 4.9668633006385655e-06, "loss": 0.4275, "mean_token_accuracy": 0.8655383706092834, "num_tokens": 17253478.0, "step": 2125 }, { "entropy": 0.5166448771953582, "epoch": 1.2020316027088036, "grad_norm": 1.2280800342559814, "learning_rate": 4.966707570060835e-06, "loss": 0.3918, "mean_token_accuracy": 0.8748661637306213, "num_tokens": 17294254.0, "step": 2130 }, { "entropy": 0.5040790915489197, "epoch": 1.2048532731376975, "grad_norm": 1.4066789150238037, "learning_rate": 4.9665514776794145e-06, "loss": 0.3728, "mean_token_accuracy": 0.8822136998176575, "num_tokens": 17335017.0, "step": 2135 }, { "entropy": 0.5264786601066589, "epoch": 1.2076749435665914, "grad_norm": 1.3595073223114014, "learning_rate": 4.9663950235249655e-06, "loss": 0.4151, "mean_token_accuracy": 0.8696430683135986, "num_tokens": 17375865.0, "step": 2140 }, { "entropy": 0.5130272388458252, "epoch": 1.2104966139954854, "grad_norm": 1.4424818754196167, "learning_rate": 4.966238207628225e-06, "loss": 0.4016, "mean_token_accuracy": 0.8746838808059693, "num_tokens": 17416458.0, "step": 2145 }, { "entropy": 0.49900220036506654, "epoch": 1.2133182844243793, "grad_norm": 1.3563543558120728, "learning_rate": 4.966081030019999e-06, "loss": 0.407, "mean_token_accuracy": 0.872953200340271, "num_tokens": 17457303.0, "step": 2150 }, { "entropy": 0.5183020770549774, "epoch": 1.2161399548532732, "grad_norm": 1.431388258934021, "learning_rate": 4.965923490731166e-06, "loss": 0.4001, "mean_token_accuracy": 0.8713582038879395, "num_tokens": 17498163.0, "step": 2155 }, { "entropy": 0.5679690599441528, "epoch": 1.2189616252821671, "grad_norm": 1.5881187915802002, "learning_rate": 4.965765589792674e-06, "loss": 0.4542, "mean_token_accuracy": 0.8564951181411743, "num_tokens": 17538730.0, "step": 2160 }, { "entropy": 0.5158195972442627, "epoch": 1.221783295711061, "grad_norm": 1.2861218452453613, "learning_rate": 4.965607327235542e-06, "loss": 0.3973, "mean_token_accuracy": 0.8753223657608032, "num_tokens": 17579319.0, "step": 2165 }, { "entropy": 0.5332009434700012, "epoch": 1.224604966139955, "grad_norm": 1.338793158531189, "learning_rate": 4.965448703090861e-06, "loss": 0.4198, "mean_token_accuracy": 0.8673290014266968, "num_tokens": 17620049.0, "step": 2170 }, { "entropy": 0.5594852328300476, "epoch": 1.2274266365688487, "grad_norm": 1.3682115077972412, "learning_rate": 4.965289717389794e-06, "loss": 0.4309, "mean_token_accuracy": 0.8634846925735473, "num_tokens": 17660479.0, "step": 2175 }, { "entropy": 0.5210170686244965, "epoch": 1.2302483069977426, "grad_norm": 1.4854122400283813, "learning_rate": 4.965130370163572e-06, "loss": 0.391, "mean_token_accuracy": 0.8751443147659301, "num_tokens": 17701156.0, "step": 2180 }, { "entropy": 0.5292206406593323, "epoch": 1.2330699774266365, "grad_norm": 1.3995417356491089, "learning_rate": 4.9649706614435e-06, "loss": 0.4127, "mean_token_accuracy": 0.8673221230506897, "num_tokens": 17742000.0, "step": 2185 }, { "entropy": 0.5528486132621765, "epoch": 1.2358916478555304, "grad_norm": 1.4038658142089844, "learning_rate": 4.9648105912609525e-06, "loss": 0.4354, "mean_token_accuracy": 0.863236916065216, "num_tokens": 17782813.0, "step": 2190 }, { "entropy": 0.520248967409134, "epoch": 1.2387133182844243, "grad_norm": 1.3488070964813232, "learning_rate": 4.964650159647375e-06, "loss": 0.4095, "mean_token_accuracy": 0.8726580619812012, "num_tokens": 17823603.0, "step": 2195 }, { "entropy": 0.5257691144943237, "epoch": 1.2415349887133182, "grad_norm": 1.3696351051330566, "learning_rate": 4.964489366634285e-06, "loss": 0.4045, "mean_token_accuracy": 0.8737285494804382, "num_tokens": 17864326.0, "step": 2200 }, { "entropy": 0.521408861875534, "epoch": 1.2443566591422122, "grad_norm": 1.2956690788269043, "learning_rate": 4.964328212253269e-06, "loss": 0.4046, "mean_token_accuracy": 0.8745855331420899, "num_tokens": 17904832.0, "step": 2205 }, { "entropy": 0.5116970539093018, "epoch": 1.247178329571106, "grad_norm": 1.1543620824813843, "learning_rate": 4.9641666965359865e-06, "loss": 0.4061, "mean_token_accuracy": 0.871479332447052, "num_tokens": 17945706.0, "step": 2210 }, { "entropy": 0.51253702044487, "epoch": 1.25, "grad_norm": 1.3165756464004517, "learning_rate": 4.9640048195141685e-06, "loss": 0.4001, "mean_token_accuracy": 0.8732204794883728, "num_tokens": 17986493.0, "step": 2215 }, { "entropy": 0.5361522495746612, "epoch": 1.252821670428894, "grad_norm": 1.3089770078659058, "learning_rate": 4.9638425812196145e-06, "loss": 0.3951, "mean_token_accuracy": 0.8748211622238159, "num_tokens": 18027243.0, "step": 2220 }, { "entropy": 0.4924753546714783, "epoch": 1.2556433408577878, "grad_norm": 1.3457053899765015, "learning_rate": 4.963679981684195e-06, "loss": 0.3846, "mean_token_accuracy": 0.8790703415870667, "num_tokens": 18067621.0, "step": 2225 }, { "entropy": 0.5386804461479187, "epoch": 1.2584650112866818, "grad_norm": 1.543764352798462, "learning_rate": 4.963517020939855e-06, "loss": 0.4387, "mean_token_accuracy": 0.8642153739929199, "num_tokens": 18108318.0, "step": 2230 }, { "entropy": 0.5258350789546966, "epoch": 1.2612866817155757, "grad_norm": 1.33335280418396, "learning_rate": 4.963353699018607e-06, "loss": 0.4202, "mean_token_accuracy": 0.867599892616272, "num_tokens": 18149098.0, "step": 2235 }, { "entropy": 0.4915396809577942, "epoch": 1.2641083521444696, "grad_norm": 1.3478151559829712, "learning_rate": 4.963190015952536e-06, "loss": 0.3941, "mean_token_accuracy": 0.8742651224136353, "num_tokens": 18189853.0, "step": 2240 }, { "entropy": 0.5037339508533478, "epoch": 1.2669300225733635, "grad_norm": 1.4654349088668823, "learning_rate": 4.963025971773798e-06, "loss": 0.3896, "mean_token_accuracy": 0.8763032555580139, "num_tokens": 18230648.0, "step": 2245 }, { "entropy": 0.5388745546340943, "epoch": 1.2697516930022572, "grad_norm": 1.3390324115753174, "learning_rate": 4.962861566514618e-06, "loss": 0.418, "mean_token_accuracy": 0.865935492515564, "num_tokens": 18271377.0, "step": 2250 }, { "entropy": 0.5285961091518402, "epoch": 1.2725733634311513, "grad_norm": 1.2784810066223145, "learning_rate": 4.962696800207295e-06, "loss": 0.4282, "mean_token_accuracy": 0.8682160973548889, "num_tokens": 18311967.0, "step": 2255 }, { "entropy": 0.5452647149562836, "epoch": 1.275395033860045, "grad_norm": 1.4500328302383423, "learning_rate": 4.9625316728841966e-06, "loss": 0.4185, "mean_token_accuracy": 0.8690374493598938, "num_tokens": 18352492.0, "step": 2260 }, { "entropy": 0.5347442328929901, "epoch": 1.2782167042889392, "grad_norm": 1.3491861820220947, "learning_rate": 4.962366184577762e-06, "loss": 0.3963, "mean_token_accuracy": 0.8743094444274903, "num_tokens": 18393148.0, "step": 2265 }, { "entropy": 0.5413298010826111, "epoch": 1.2810383747178329, "grad_norm": 1.3183116912841797, "learning_rate": 4.962200335320502e-06, "loss": 0.43, "mean_token_accuracy": 0.8649118781089783, "num_tokens": 18433888.0, "step": 2270 }, { "entropy": 0.49364901781082154, "epoch": 1.2838600451467268, "grad_norm": 1.4001456499099731, "learning_rate": 4.962034125144997e-06, "loss": 0.3979, "mean_token_accuracy": 0.8754386663436889, "num_tokens": 18474725.0, "step": 2275 }, { "entropy": 0.4968021988868713, "epoch": 1.2866817155756207, "grad_norm": 1.4670476913452148, "learning_rate": 4.961867554083899e-06, "loss": 0.3737, "mean_token_accuracy": 0.8795897960662842, "num_tokens": 18515515.0, "step": 2280 }, { "entropy": 0.49011672139167783, "epoch": 1.2895033860045146, "grad_norm": 1.5934028625488281, "learning_rate": 4.961700622169931e-06, "loss": 0.3733, "mean_token_accuracy": 0.8795024633407593, "num_tokens": 18556216.0, "step": 2285 }, { "entropy": 0.5093987703323364, "epoch": 1.2923250564334086, "grad_norm": 1.2995637655258179, "learning_rate": 4.961533329435888e-06, "loss": 0.4071, "mean_token_accuracy": 0.8725569367408752, "num_tokens": 18596786.0, "step": 2290 }, { "entropy": 0.5331693708896637, "epoch": 1.2951467268623025, "grad_norm": 1.4760665893554688, "learning_rate": 4.9613656759146335e-06, "loss": 0.4109, "mean_token_accuracy": 0.869564437866211, "num_tokens": 18637493.0, "step": 2295 }, { "entropy": 0.5001338601112366, "epoch": 1.2979683972911964, "grad_norm": 1.8200581073760986, "learning_rate": 4.961197661639102e-06, "loss": 0.3825, "mean_token_accuracy": 0.8765197157859802, "num_tokens": 18677593.0, "step": 2300 }, { "entropy": 0.5498757302761078, "epoch": 1.3007900677200903, "grad_norm": 1.2150392532348633, "learning_rate": 4.9610292866423036e-06, "loss": 0.4201, "mean_token_accuracy": 0.8701222181320191, "num_tokens": 18718374.0, "step": 2305 }, { "entropy": 0.5348264276981354, "epoch": 1.3036117381489842, "grad_norm": 1.4520456790924072, "learning_rate": 4.960860550957311e-06, "loss": 0.4204, "mean_token_accuracy": 0.8689911484718322, "num_tokens": 18758689.0, "step": 2310 }, { "entropy": 0.564541220664978, "epoch": 1.3064334085778782, "grad_norm": 1.4934542179107666, "learning_rate": 4.960691454617276e-06, "loss": 0.444, "mean_token_accuracy": 0.8587620854377747, "num_tokens": 18798953.0, "step": 2315 }, { "entropy": 0.5457278668880463, "epoch": 1.309255079006772, "grad_norm": 1.3648675680160522, "learning_rate": 4.960521997655415e-06, "loss": 0.4228, "mean_token_accuracy": 0.8687914133071899, "num_tokens": 18839547.0, "step": 2320 }, { "entropy": 0.5353664636611939, "epoch": 1.312076749435666, "grad_norm": 1.3305805921554565, "learning_rate": 4.960352180105019e-06, "loss": 0.4185, "mean_token_accuracy": 0.8669357776641846, "num_tokens": 18880107.0, "step": 2325 }, { "entropy": 0.5464560270309449, "epoch": 1.31489841986456, "grad_norm": 2.0227503776550293, "learning_rate": 4.9601820019994495e-06, "loss": 0.4141, "mean_token_accuracy": 0.8719035506248474, "num_tokens": 18920937.0, "step": 2330 }, { "entropy": 0.536427104473114, "epoch": 1.3177200902934536, "grad_norm": 1.3813787698745728, "learning_rate": 4.960011463372136e-06, "loss": 0.4265, "mean_token_accuracy": 0.8644273400306701, "num_tokens": 18961776.0, "step": 2335 }, { "entropy": 0.5107419192790985, "epoch": 1.3205417607223477, "grad_norm": 1.2994340658187866, "learning_rate": 4.959840564256583e-06, "loss": 0.4011, "mean_token_accuracy": 0.873857831954956, "num_tokens": 19002350.0, "step": 2340 }, { "entropy": 0.48056021332740784, "epoch": 1.3233634311512414, "grad_norm": 1.4537101984024048, "learning_rate": 4.959669304686362e-06, "loss": 0.3806, "mean_token_accuracy": 0.8777095079421997, "num_tokens": 19042920.0, "step": 2345 }, { "entropy": 0.5710622429847717, "epoch": 1.3261851015801354, "grad_norm": 1.362627625465393, "learning_rate": 4.959497684695118e-06, "loss": 0.4462, "mean_token_accuracy": 0.8591257691383362, "num_tokens": 19083640.0, "step": 2350 }, { "entropy": 0.521550840139389, "epoch": 1.3290067720090293, "grad_norm": 1.173241138458252, "learning_rate": 4.959325704316565e-06, "loss": 0.4025, "mean_token_accuracy": 0.8729618310928344, "num_tokens": 19123082.0, "step": 2355 }, { "entropy": 0.5515291333198548, "epoch": 1.3318284424379232, "grad_norm": 1.6536093950271606, "learning_rate": 4.959153363584489e-06, "loss": 0.4587, "mean_token_accuracy": 0.8603331565856933, "num_tokens": 19163643.0, "step": 2360 }, { "entropy": 0.5428348422050476, "epoch": 1.3346501128668171, "grad_norm": 1.6122465133666992, "learning_rate": 4.958980662532747e-06, "loss": 0.428, "mean_token_accuracy": 0.8602743864059448, "num_tokens": 19204151.0, "step": 2365 }, { "entropy": 0.558714485168457, "epoch": 1.337471783295711, "grad_norm": 1.321026086807251, "learning_rate": 4.9588076011952655e-06, "loss": 0.4514, "mean_token_accuracy": 0.8587347507476807, "num_tokens": 19244654.0, "step": 2370 }, { "entropy": 0.5276102185249328, "epoch": 1.340293453724605, "grad_norm": 1.311216950416565, "learning_rate": 4.958634179606041e-06, "loss": 0.4122, "mean_token_accuracy": 0.8710083127021789, "num_tokens": 19285304.0, "step": 2375 }, { "entropy": 0.5120981454849243, "epoch": 1.3431151241534989, "grad_norm": 1.2444289922714233, "learning_rate": 4.9584603977991445e-06, "loss": 0.3951, "mean_token_accuracy": 0.876779317855835, "num_tokens": 19326058.0, "step": 2380 }, { "entropy": 0.5315510213375092, "epoch": 1.3459367945823928, "grad_norm": 1.342822551727295, "learning_rate": 4.958286255808714e-06, "loss": 0.4153, "mean_token_accuracy": 0.86748708486557, "num_tokens": 19366695.0, "step": 2385 }, { "entropy": 0.5051746428012848, "epoch": 1.3487584650112867, "grad_norm": 1.3792295455932617, "learning_rate": 4.958111753668962e-06, "loss": 0.4084, "mean_token_accuracy": 0.8715560555458068, "num_tokens": 19407356.0, "step": 2390 }, { "entropy": 0.5739570260047913, "epoch": 1.3515801354401806, "grad_norm": 1.3936057090759277, "learning_rate": 4.957936891414166e-06, "loss": 0.443, "mean_token_accuracy": 0.8616944670677185, "num_tokens": 19446924.0, "step": 2395 }, { "entropy": 0.5593576908111573, "epoch": 1.3544018058690745, "grad_norm": 1.6129295825958252, "learning_rate": 4.957761669078679e-06, "loss": 0.453, "mean_token_accuracy": 0.8601847767829895, "num_tokens": 19487296.0, "step": 2400 }, { "entropy": 0.49403364062309263, "epoch": 1.3572234762979685, "grad_norm": 1.463240623474121, "learning_rate": 4.957586086696925e-06, "loss": 0.3719, "mean_token_accuracy": 0.882617199420929, "num_tokens": 19528192.0, "step": 2405 }, { "entropy": 0.5413905024528504, "epoch": 1.3600451467268622, "grad_norm": 1.3406250476837158, "learning_rate": 4.957410144303396e-06, "loss": 0.4149, "mean_token_accuracy": 0.8708963632583618, "num_tokens": 19568788.0, "step": 2410 }, { "entropy": 0.5129614770412445, "epoch": 1.3628668171557563, "grad_norm": 1.423504114151001, "learning_rate": 4.957233841932655e-06, "loss": 0.3972, "mean_token_accuracy": 0.8748751640319824, "num_tokens": 19609525.0, "step": 2415 }, { "entropy": 0.49986888766288756, "epoch": 1.36568848758465, "grad_norm": 1.3050624132156372, "learning_rate": 4.957057179619339e-06, "loss": 0.3815, "mean_token_accuracy": 0.8794636130332947, "num_tokens": 19650378.0, "step": 2420 }, { "entropy": 0.49531072974205015, "epoch": 1.3685101580135441, "grad_norm": 1.2032006978988647, "learning_rate": 4.956880157398151e-06, "loss": 0.3877, "mean_token_accuracy": 0.8791410803794861, "num_tokens": 19690935.0, "step": 2425 }, { "entropy": 0.5217344462871552, "epoch": 1.3713318284424378, "grad_norm": 1.3181284666061401, "learning_rate": 4.956702775303868e-06, "loss": 0.4113, "mean_token_accuracy": 0.8702506303787232, "num_tokens": 19731616.0, "step": 2430 }, { "entropy": 0.5071918427944183, "epoch": 1.3741534988713318, "grad_norm": 1.2046350240707397, "learning_rate": 4.956525033371336e-06, "loss": 0.4106, "mean_token_accuracy": 0.8688374757766724, "num_tokens": 19772129.0, "step": 2435 }, { "entropy": 0.5620993256568909, "epoch": 1.3769751693002257, "grad_norm": 1.3404937982559204, "learning_rate": 4.956346931635474e-06, "loss": 0.4299, "mean_token_accuracy": 0.8631010413169861, "num_tokens": 19812671.0, "step": 2440 }, { "entropy": 0.5249874234199524, "epoch": 1.3797968397291196, "grad_norm": 1.3918310403823853, "learning_rate": 4.956168470131269e-06, "loss": 0.4167, "mean_token_accuracy": 0.8694810628890991, "num_tokens": 19853386.0, "step": 2445 }, { "entropy": 0.48499825596809387, "epoch": 1.3826185101580135, "grad_norm": 1.3398149013519287, "learning_rate": 4.95598964889378e-06, "loss": 0.3775, "mean_token_accuracy": 0.8827056169509888, "num_tokens": 19893901.0, "step": 2450 }, { "entropy": 0.5267378866672516, "epoch": 1.3854401805869074, "grad_norm": 1.4285844564437866, "learning_rate": 4.9558104679581366e-06, "loss": 0.4089, "mean_token_accuracy": 0.8731061100959778, "num_tokens": 19934618.0, "step": 2455 }, { "entropy": 0.551910811662674, "epoch": 1.3882618510158014, "grad_norm": 1.462052822113037, "learning_rate": 4.955630927359538e-06, "loss": 0.4621, "mean_token_accuracy": 0.8564296126365661, "num_tokens": 19975257.0, "step": 2460 }, { "entropy": 0.5045246481895447, "epoch": 1.3910835214446953, "grad_norm": 1.2630540132522583, "learning_rate": 4.9554510271332575e-06, "loss": 0.3931, "mean_token_accuracy": 0.8743220806121826, "num_tokens": 20015796.0, "step": 2465 }, { "entropy": 0.4776579737663269, "epoch": 1.3939051918735892, "grad_norm": 1.3086442947387695, "learning_rate": 4.955270767314633e-06, "loss": 0.3594, "mean_token_accuracy": 0.8837788105010986, "num_tokens": 20056639.0, "step": 2470 }, { "entropy": 0.5198698997497558, "epoch": 1.396726862302483, "grad_norm": 1.1976763010025024, "learning_rate": 4.955090147939079e-06, "loss": 0.4014, "mean_token_accuracy": 0.8731474876403809, "num_tokens": 20097523.0, "step": 2475 }, { "entropy": 0.5507951974868774, "epoch": 1.399548532731377, "grad_norm": 1.595848798751831, "learning_rate": 4.954909169042078e-06, "loss": 0.4321, "mean_token_accuracy": 0.8633155941963195, "num_tokens": 20137741.0, "step": 2480 }, { "entropy": 0.5629419386386871, "epoch": 1.402370203160271, "grad_norm": 1.5942708253860474, "learning_rate": 4.954727830659182e-06, "loss": 0.4402, "mean_token_accuracy": 0.8617273569107056, "num_tokens": 20178295.0, "step": 2485 }, { "entropy": 0.5281494557857513, "epoch": 1.4051918735891649, "grad_norm": 1.2493270635604858, "learning_rate": 4.954546132826017e-06, "loss": 0.4143, "mean_token_accuracy": 0.8682329654693604, "num_tokens": 20219041.0, "step": 2490 }, { "entropy": 0.5261754095554352, "epoch": 1.4080135440180586, "grad_norm": 1.3487435579299927, "learning_rate": 4.954364075578276e-06, "loss": 0.4072, "mean_token_accuracy": 0.8690428853034973, "num_tokens": 20259747.0, "step": 2495 }, { "entropy": 0.5704293370246887, "epoch": 1.4108352144469527, "grad_norm": 1.5484710931777954, "learning_rate": 4.954181658951725e-06, "loss": 0.4643, "mean_token_accuracy": 0.8576129674911499, "num_tokens": 20300626.0, "step": 2500 }, { "epoch": 1.4108352144469527, "eval_entropy": 0.49934637546539307, "eval_loss": 0.3746188282966614, "eval_mean_token_accuracy": 0.8903794288635254, "eval_num_tokens": 20300626.0, "eval_runtime": 0.163, "eval_samples_per_second": 24.539, "eval_steps_per_second": 6.135, "step": 2500 }, { "entropy": 0.5491129279136657, "epoch": 1.4136568848758464, "grad_norm": 1.3182957172393799, "learning_rate": 4.953998882982197e-06, "loss": 0.4536, "mean_token_accuracy": 0.8601503491401672, "num_tokens": 20341333.0, "step": 2505 }, { "entropy": 0.5612141251564026, "epoch": 1.4164785553047405, "grad_norm": 1.5513429641723633, "learning_rate": 4.9538157477056025e-06, "loss": 0.4414, "mean_token_accuracy": 0.8637246131896973, "num_tokens": 20381822.0, "step": 2510 }, { "entropy": 0.5199264347553253, "epoch": 1.4193002257336342, "grad_norm": 1.3425822257995605, "learning_rate": 4.953632253157916e-06, "loss": 0.4135, "mean_token_accuracy": 0.8695431590080261, "num_tokens": 20422346.0, "step": 2515 }, { "entropy": 0.5060115993022919, "epoch": 1.4221218961625282, "grad_norm": 1.4134669303894043, "learning_rate": 4.953448399375187e-06, "loss": 0.387, "mean_token_accuracy": 0.8781610608100892, "num_tokens": 20462097.0, "step": 2520 }, { "entropy": 0.5111626803874969, "epoch": 1.424943566591422, "grad_norm": 1.3311017751693726, "learning_rate": 4.953264186393531e-06, "loss": 0.3961, "mean_token_accuracy": 0.87530437707901, "num_tokens": 20502526.0, "step": 2525 }, { "entropy": 0.5413293719291687, "epoch": 1.427765237020316, "grad_norm": 1.4240591526031494, "learning_rate": 4.953079614249138e-06, "loss": 0.4341, "mean_token_accuracy": 0.8609416246414184, "num_tokens": 20543142.0, "step": 2530 }, { "entropy": 0.5732130527496337, "epoch": 1.43058690744921, "grad_norm": 1.4825007915496826, "learning_rate": 4.952894682978268e-06, "loss": 0.4597, "mean_token_accuracy": 0.8564027547836304, "num_tokens": 20583174.0, "step": 2535 }, { "entropy": 0.4671943664550781, "epoch": 1.4334085778781038, "grad_norm": 1.1537166833877563, "learning_rate": 4.952709392617248e-06, "loss": 0.3627, "mean_token_accuracy": 0.8833828091621398, "num_tokens": 20623760.0, "step": 2540 }, { "entropy": 0.5528115510940552, "epoch": 1.4362302483069977, "grad_norm": 1.351750135421753, "learning_rate": 4.952523743202482e-06, "loss": 0.4395, "mean_token_accuracy": 0.8619906187057496, "num_tokens": 20664513.0, "step": 2545 }, { "entropy": 0.5158391892910004, "epoch": 1.4390519187358917, "grad_norm": 1.1706582307815552, "learning_rate": 4.952337734770439e-06, "loss": 0.4, "mean_token_accuracy": 0.8752634882926941, "num_tokens": 20705333.0, "step": 2550 }, { "entropy": 0.5181755602359772, "epoch": 1.4418735891647856, "grad_norm": 1.5158717632293701, "learning_rate": 4.95215136735766e-06, "loss": 0.3929, "mean_token_accuracy": 0.8729309439659119, "num_tokens": 20745887.0, "step": 2555 }, { "entropy": 0.5385544419288635, "epoch": 1.4446952595936795, "grad_norm": 1.2974624633789062, "learning_rate": 4.951964641000757e-06, "loss": 0.4326, "mean_token_accuracy": 0.8661515951156616, "num_tokens": 20786349.0, "step": 2560 }, { "entropy": 0.5237244606018067, "epoch": 1.4475169300225734, "grad_norm": 1.4556293487548828, "learning_rate": 4.951777555736414e-06, "loss": 0.415, "mean_token_accuracy": 0.8732018351554871, "num_tokens": 20826912.0, "step": 2565 }, { "entropy": 0.5504439949989319, "epoch": 1.4503386004514673, "grad_norm": 1.478190302848816, "learning_rate": 4.951590111601381e-06, "loss": 0.4226, "mean_token_accuracy": 0.8666379451751709, "num_tokens": 20867551.0, "step": 2570 }, { "entropy": 0.5655958175659179, "epoch": 1.4531602708803613, "grad_norm": 1.2426916360855103, "learning_rate": 4.951402308632485e-06, "loss": 0.4511, "mean_token_accuracy": 0.8618389010429383, "num_tokens": 20908206.0, "step": 2575 }, { "entropy": 0.5350692510604859, "epoch": 1.455981941309255, "grad_norm": 1.355296015739441, "learning_rate": 4.951214146866617e-06, "loss": 0.4225, "mean_token_accuracy": 0.868348228931427, "num_tokens": 20948945.0, "step": 2580 }, { "entropy": 0.5500153779983521, "epoch": 1.458803611738149, "grad_norm": 1.242108941078186, "learning_rate": 4.951025626340743e-06, "loss": 0.4279, "mean_token_accuracy": 0.864187228679657, "num_tokens": 20989560.0, "step": 2585 }, { "entropy": 0.5728635132312775, "epoch": 1.4616252821670428, "grad_norm": 1.4970265626907349, "learning_rate": 4.950836747091896e-06, "loss": 0.4321, "mean_token_accuracy": 0.8666630983352661, "num_tokens": 21030293.0, "step": 2590 }, { "entropy": 0.572597336769104, "epoch": 1.4644469525959367, "grad_norm": 1.5469214916229248, "learning_rate": 4.950647509157184e-06, "loss": 0.4535, "mean_token_accuracy": 0.860745084285736, "num_tokens": 21070816.0, "step": 2595 }, { "entropy": 0.4957478761672974, "epoch": 1.4672686230248306, "grad_norm": 1.1917364597320557, "learning_rate": 4.950457912573781e-06, "loss": 0.3959, "mean_token_accuracy": 0.8750966668128968, "num_tokens": 21111621.0, "step": 2600 }, { "entropy": 0.5851558446884155, "epoch": 1.4700902934537246, "grad_norm": 1.4717788696289062, "learning_rate": 4.950267957378934e-06, "loss": 0.4614, "mean_token_accuracy": 0.8556645750999451, "num_tokens": 21152027.0, "step": 2605 }, { "entropy": 0.4879822790622711, "epoch": 1.4729119638826185, "grad_norm": 1.244563341140747, "learning_rate": 4.950077643609959e-06, "loss": 0.3706, "mean_token_accuracy": 0.882515013217926, "num_tokens": 21192521.0, "step": 2610 }, { "entropy": 0.5324353992938995, "epoch": 1.4757336343115124, "grad_norm": 1.4513381719589233, "learning_rate": 4.949886971304245e-06, "loss": 0.4337, "mean_token_accuracy": 0.8638843178749085, "num_tokens": 21233396.0, "step": 2615 }, { "entropy": 0.5465445816516876, "epoch": 1.4785553047404063, "grad_norm": 1.4188737869262695, "learning_rate": 4.9496959404992475e-06, "loss": 0.4304, "mean_token_accuracy": 0.8674513459205627, "num_tokens": 21274180.0, "step": 2620 }, { "entropy": 0.5312675893306732, "epoch": 1.4813769751693002, "grad_norm": 1.3827718496322632, "learning_rate": 4.949504551232494e-06, "loss": 0.424, "mean_token_accuracy": 0.8689416646957397, "num_tokens": 21314947.0, "step": 2625 }, { "entropy": 0.5221801578998566, "epoch": 1.4841986455981941, "grad_norm": 1.344963788986206, "learning_rate": 4.949312803541586e-06, "loss": 0.3887, "mean_token_accuracy": 0.8764538764953613, "num_tokens": 21355658.0, "step": 2630 }, { "entropy": 0.5304463326930999, "epoch": 1.487020316027088, "grad_norm": 1.5296121835708618, "learning_rate": 4.94912069746419e-06, "loss": 0.4037, "mean_token_accuracy": 0.8726909160614014, "num_tokens": 21396355.0, "step": 2635 }, { "entropy": 0.5112833499908447, "epoch": 1.489841986455982, "grad_norm": 1.4764835834503174, "learning_rate": 4.948928233038046e-06, "loss": 0.3963, "mean_token_accuracy": 0.8752380728721618, "num_tokens": 21437172.0, "step": 2640 }, { "entropy": 0.5136803925037384, "epoch": 1.492663656884876, "grad_norm": 1.5744765996932983, "learning_rate": 4.948735410300964e-06, "loss": 0.4151, "mean_token_accuracy": 0.8673662304878235, "num_tokens": 21477969.0, "step": 2645 }, { "entropy": 0.5488377809524536, "epoch": 1.4954853273137698, "grad_norm": 1.3822392225265503, "learning_rate": 4.948542229290823e-06, "loss": 0.4442, "mean_token_accuracy": 0.8641608357429504, "num_tokens": 21517644.0, "step": 2650 }, { "entropy": 0.48912598490715026, "epoch": 1.4983069977426637, "grad_norm": 1.3357199430465698, "learning_rate": 4.948348690045574e-06, "loss": 0.3948, "mean_token_accuracy": 0.873920488357544, "num_tokens": 21558283.0, "step": 2655 }, { "entropy": 0.5133354067802429, "epoch": 1.5011286681715577, "grad_norm": 1.3202459812164307, "learning_rate": 4.948154792603237e-06, "loss": 0.414, "mean_token_accuracy": 0.8687997817993164, "num_tokens": 21598905.0, "step": 2660 }, { "entropy": 0.5592216491699219, "epoch": 1.5039503386004514, "grad_norm": 1.515934944152832, "learning_rate": 4.947960537001905e-06, "loss": 0.4425, "mean_token_accuracy": 0.8624489784240723, "num_tokens": 21639436.0, "step": 2665 }, { "entropy": 0.5013287782669067, "epoch": 1.5067720090293455, "grad_norm": 1.3164470195770264, "learning_rate": 4.947765923279738e-06, "loss": 0.3836, "mean_token_accuracy": 0.8810773968696595, "num_tokens": 21680256.0, "step": 2670 }, { "entropy": 0.5708201169967652, "epoch": 1.5095936794582392, "grad_norm": 1.5717384815216064, "learning_rate": 4.9475709514749695e-06, "loss": 0.4513, "mean_token_accuracy": 0.8578357696533203, "num_tokens": 21720710.0, "step": 2675 }, { "entropy": 0.5323946833610534, "epoch": 1.5124153498871333, "grad_norm": 1.3646914958953857, "learning_rate": 4.9473756216258996e-06, "loss": 0.4152, "mean_token_accuracy": 0.8670541882514954, "num_tokens": 21761305.0, "step": 2680 }, { "entropy": 0.5191471993923187, "epoch": 1.515237020316027, "grad_norm": 1.259047508239746, "learning_rate": 4.947179933770902e-06, "loss": 0.397, "mean_token_accuracy": 0.8745760560035706, "num_tokens": 21801879.0, "step": 2685 }, { "entropy": 0.5398116707801819, "epoch": 1.518058690744921, "grad_norm": 1.3516515493392944, "learning_rate": 4.94698388794842e-06, "loss": 0.4344, "mean_token_accuracy": 0.8646332740783691, "num_tokens": 21842560.0, "step": 2690 }, { "entropy": 0.5416356801986695, "epoch": 1.5208803611738149, "grad_norm": 1.3463817834854126, "learning_rate": 4.946787484196966e-06, "loss": 0.4341, "mean_token_accuracy": 0.8608946084976197, "num_tokens": 21883327.0, "step": 2695 }, { "entropy": 0.555895859003067, "epoch": 1.5237020316027088, "grad_norm": 1.376157283782959, "learning_rate": 4.9465907225551244e-06, "loss": 0.4479, "mean_token_accuracy": 0.8606306076049804, "num_tokens": 21923780.0, "step": 2700 }, { "entropy": 0.5254535734653473, "epoch": 1.5265237020316027, "grad_norm": 1.310943365097046, "learning_rate": 4.946393603061548e-06, "loss": 0.4071, "mean_token_accuracy": 0.8713202953338623, "num_tokens": 21964188.0, "step": 2705 }, { "entropy": 0.5179416954517364, "epoch": 1.5293453724604966, "grad_norm": 1.361799955368042, "learning_rate": 4.946196125754962e-06, "loss": 0.3959, "mean_token_accuracy": 0.8728214979171753, "num_tokens": 22004992.0, "step": 2710 }, { "entropy": 0.539003986120224, "epoch": 1.5321670428893905, "grad_norm": 1.5339001417160034, "learning_rate": 4.9459982906741596e-06, "loss": 0.4343, "mean_token_accuracy": 0.8599718093872071, "num_tokens": 22045651.0, "step": 2715 }, { "entropy": 0.48843835592269896, "epoch": 1.5349887133182845, "grad_norm": 1.1609723567962646, "learning_rate": 4.945800097858007e-06, "loss": 0.3754, "mean_token_accuracy": 0.8784770965576172, "num_tokens": 22085915.0, "step": 2720 }, { "entropy": 0.5748872756958008, "epoch": 1.5378103837471784, "grad_norm": 1.474261999130249, "learning_rate": 4.945601547345439e-06, "loss": 0.4581, "mean_token_accuracy": 0.8565932989120484, "num_tokens": 22126565.0, "step": 2725 }, { "entropy": 0.5497707307338715, "epoch": 1.540632054176072, "grad_norm": 1.328251600265503, "learning_rate": 4.945402639175459e-06, "loss": 0.4229, "mean_token_accuracy": 0.8657533526420593, "num_tokens": 22166897.0, "step": 2730 }, { "entropy": 0.5045753955841065, "epoch": 1.5434537246049662, "grad_norm": 1.4237067699432373, "learning_rate": 4.945203373387145e-06, "loss": 0.4037, "mean_token_accuracy": 0.8722402095794678, "num_tokens": 22207560.0, "step": 2735 }, { "entropy": 0.5575098991394043, "epoch": 1.54627539503386, "grad_norm": 1.7878057956695557, "learning_rate": 4.945003750019641e-06, "loss": 0.4381, "mean_token_accuracy": 0.8603370666503907, "num_tokens": 22247941.0, "step": 2740 }, { "entropy": 0.4867607891559601, "epoch": 1.549097065462754, "grad_norm": 1.526099443435669, "learning_rate": 4.944803769112164e-06, "loss": 0.401, "mean_token_accuracy": 0.874083411693573, "num_tokens": 22288838.0, "step": 2745 }, { "entropy": 0.5552621841430664, "epoch": 1.5519187358916477, "grad_norm": 1.3902403116226196, "learning_rate": 4.944603430704e-06, "loss": 0.4259, "mean_token_accuracy": 0.8668664813041687, "num_tokens": 22328956.0, "step": 2750 }, { "entropy": 0.5252302527427674, "epoch": 1.554740406320542, "grad_norm": 1.326379418373108, "learning_rate": 4.944402734834506e-06, "loss": 0.4095, "mean_token_accuracy": 0.8710361838340759, "num_tokens": 22369592.0, "step": 2755 }, { "entropy": 0.5309755086898804, "epoch": 1.5575620767494356, "grad_norm": 1.4432721138000488, "learning_rate": 4.944201681543107e-06, "loss": 0.423, "mean_token_accuracy": 0.8664063453674317, "num_tokens": 22410109.0, "step": 2760 }, { "entropy": 0.5249135076999665, "epoch": 1.5603837471783297, "grad_norm": 1.4632095098495483, "learning_rate": 4.944000270869302e-06, "loss": 0.3937, "mean_token_accuracy": 0.8737531065940857, "num_tokens": 22450758.0, "step": 2765 }, { "entropy": 0.509848439693451, "epoch": 1.5632054176072234, "grad_norm": 1.4338641166687012, "learning_rate": 4.943798502852657e-06, "loss": 0.4024, "mean_token_accuracy": 0.8742005109786988, "num_tokens": 22491593.0, "step": 2770 }, { "entropy": 0.5258462011814118, "epoch": 1.5660270880361173, "grad_norm": 1.3523592948913574, "learning_rate": 4.94359637753281e-06, "loss": 0.4196, "mean_token_accuracy": 0.867405092716217, "num_tokens": 22532315.0, "step": 2775 }, { "entropy": 0.5412358582019806, "epoch": 1.5688487584650113, "grad_norm": 1.4376283884048462, "learning_rate": 4.943393894949469e-06, "loss": 0.42, "mean_token_accuracy": 0.8682662725448609, "num_tokens": 22573023.0, "step": 2780 }, { "entropy": 0.530605137348175, "epoch": 1.5716704288939052, "grad_norm": 1.3493672609329224, "learning_rate": 4.943191055142409e-06, "loss": 0.4154, "mean_token_accuracy": 0.8707961201667785, "num_tokens": 22613743.0, "step": 2785 }, { "entropy": 0.5197601974010467, "epoch": 1.574492099322799, "grad_norm": 1.3924630880355835, "learning_rate": 4.942987858151481e-06, "loss": 0.4109, "mean_token_accuracy": 0.8706324815750122, "num_tokens": 22654339.0, "step": 2790 }, { "entropy": 0.48147222995758054, "epoch": 1.577313769751693, "grad_norm": 1.5284984111785889, "learning_rate": 4.942784304016602e-06, "loss": 0.3804, "mean_token_accuracy": 0.8784740090370178, "num_tokens": 22694670.0, "step": 2795 }, { "entropy": 0.46955564618110657, "epoch": 1.580135440180587, "grad_norm": 1.6057965755462646, "learning_rate": 4.942580392777761e-06, "loss": 0.3786, "mean_token_accuracy": 0.8798289656639099, "num_tokens": 22735192.0, "step": 2800 }, { "entropy": 0.5634661912918091, "epoch": 1.5829571106094809, "grad_norm": 1.395089030265808, "learning_rate": 4.942376124475014e-06, "loss": 0.4528, "mean_token_accuracy": 0.8594593167304992, "num_tokens": 22775738.0, "step": 2805 }, { "entropy": 0.6147814273834229, "epoch": 1.5857787810383748, "grad_norm": 1.4737554788589478, "learning_rate": 4.942171499148492e-06, "loss": 0.4841, "mean_token_accuracy": 0.851655924320221, "num_tokens": 22816401.0, "step": 2810 }, { "entropy": 0.581537914276123, "epoch": 1.5886004514672685, "grad_norm": 1.4444756507873535, "learning_rate": 4.941966516838393e-06, "loss": 0.4358, "mean_token_accuracy": 0.8650146484375, "num_tokens": 22857040.0, "step": 2815 }, { "entropy": 0.5177268862724305, "epoch": 1.5914221218961626, "grad_norm": 1.441490650177002, "learning_rate": 4.941761177584985e-06, "loss": 0.4075, "mean_token_accuracy": 0.8717929363250733, "num_tokens": 22897722.0, "step": 2820 }, { "entropy": 0.48996912837028506, "epoch": 1.5942437923250563, "grad_norm": 1.5225064754486084, "learning_rate": 4.941555481428607e-06, "loss": 0.3846, "mean_token_accuracy": 0.8782232880592347, "num_tokens": 22938229.0, "step": 2825 }, { "entropy": 0.5194081962108612, "epoch": 1.5970654627539504, "grad_norm": 1.3030446767807007, "learning_rate": 4.94134942840967e-06, "loss": 0.3979, "mean_token_accuracy": 0.8713892698287964, "num_tokens": 22978751.0, "step": 2830 }, { "entropy": 0.5596107244491577, "epoch": 1.5998871331828441, "grad_norm": 1.3859238624572754, "learning_rate": 4.9411430185686505e-06, "loss": 0.4545, "mean_token_accuracy": 0.8581524729728699, "num_tokens": 23019325.0, "step": 2835 }, { "entropy": 0.5840788006782531, "epoch": 1.6027088036117383, "grad_norm": 1.4645488262176514, "learning_rate": 4.940936251946099e-06, "loss": 0.466, "mean_token_accuracy": 0.8551864385604858, "num_tokens": 23060114.0, "step": 2840 }, { "entropy": 0.5338825523853302, "epoch": 1.605530474040632, "grad_norm": 1.3881251811981201, "learning_rate": 4.940729128582636e-06, "loss": 0.4135, "mean_token_accuracy": 0.8701825737953186, "num_tokens": 23100863.0, "step": 2845 }, { "entropy": 0.5128993272781373, "epoch": 1.6083521444695261, "grad_norm": 1.1398290395736694, "learning_rate": 4.940521648518948e-06, "loss": 0.3846, "mean_token_accuracy": 0.8785142302513123, "num_tokens": 23141333.0, "step": 2850 }, { "entropy": 0.5374301552772522, "epoch": 1.6111738148984198, "grad_norm": 1.459583044052124, "learning_rate": 4.940313811795797e-06, "loss": 0.4169, "mean_token_accuracy": 0.8682045102119446, "num_tokens": 23181557.0, "step": 2855 }, { "entropy": 0.530595988035202, "epoch": 1.6139954853273137, "grad_norm": 1.451833963394165, "learning_rate": 4.9401056184540115e-06, "loss": 0.4232, "mean_token_accuracy": 0.8683579087257385, "num_tokens": 23221979.0, "step": 2860 }, { "entropy": 0.5528306722640991, "epoch": 1.6168171557562077, "grad_norm": 1.3938990831375122, "learning_rate": 4.939897068534491e-06, "loss": 0.4351, "mean_token_accuracy": 0.8650489091873169, "num_tokens": 23262727.0, "step": 2865 }, { "entropy": 0.4935439705848694, "epoch": 1.6196388261851016, "grad_norm": 1.3947253227233887, "learning_rate": 4.939688162078205e-06, "loss": 0.3791, "mean_token_accuracy": 0.8790611743927002, "num_tokens": 23303214.0, "step": 2870 }, { "entropy": 0.5516513884067535, "epoch": 1.6224604966139955, "grad_norm": 1.4023181200027466, "learning_rate": 4.939478899126196e-06, "loss": 0.4402, "mean_token_accuracy": 0.8614558100700378, "num_tokens": 23343833.0, "step": 2875 }, { "entropy": 0.5269099056720734, "epoch": 1.6252821670428894, "grad_norm": 1.520416021347046, "learning_rate": 4.939269279719569e-06, "loss": 0.4226, "mean_token_accuracy": 0.868548309803009, "num_tokens": 23384399.0, "step": 2880 }, { "entropy": 0.5381790697574615, "epoch": 1.6281038374717833, "grad_norm": 1.252475380897522, "learning_rate": 4.939059303899507e-06, "loss": 0.4197, "mean_token_accuracy": 0.8675281643867493, "num_tokens": 23424933.0, "step": 2885 }, { "entropy": 0.5297176659107208, "epoch": 1.6309255079006773, "grad_norm": 1.3621821403503418, "learning_rate": 4.93884897170726e-06, "loss": 0.4147, "mean_token_accuracy": 0.8692261695861816, "num_tokens": 23465388.0, "step": 2890 }, { "entropy": 0.5208603262901306, "epoch": 1.6337471783295712, "grad_norm": 1.6305574178695679, "learning_rate": 4.9386382831841455e-06, "loss": 0.3942, "mean_token_accuracy": 0.8741989612579346, "num_tokens": 23506128.0, "step": 2895 }, { "entropy": 0.5832960605621338, "epoch": 1.6365688487584649, "grad_norm": 1.3824501037597656, "learning_rate": 4.9384272383715535e-06, "loss": 0.4412, "mean_token_accuracy": 0.8618869662284852, "num_tokens": 23546678.0, "step": 2900 }, { "entropy": 0.5429921388626099, "epoch": 1.639390519187359, "grad_norm": 1.4333373308181763, "learning_rate": 4.938215837310947e-06, "loss": 0.4234, "mean_token_accuracy": 0.8673049211502075, "num_tokens": 23587532.0, "step": 2905 }, { "entropy": 0.4961793005466461, "epoch": 1.6422121896162527, "grad_norm": 1.4071818590164185, "learning_rate": 4.938004080043852e-06, "loss": 0.3931, "mean_token_accuracy": 0.8742106318473816, "num_tokens": 23628159.0, "step": 2910 }, { "entropy": 0.5959875106811523, "epoch": 1.6450338600451468, "grad_norm": 1.366908073425293, "learning_rate": 4.93779196661187e-06, "loss": 0.4724, "mean_token_accuracy": 0.8543134689331054, "num_tokens": 23668932.0, "step": 2915 }, { "entropy": 0.48293390274047854, "epoch": 1.6478555304740405, "grad_norm": 1.2694140672683716, "learning_rate": 4.937579497056671e-06, "loss": 0.3597, "mean_token_accuracy": 0.8866869568824768, "num_tokens": 23709665.0, "step": 2920 }, { "entropy": 0.5429055094718933, "epoch": 1.6506772009029347, "grad_norm": 1.4968737363815308, "learning_rate": 4.937366671419994e-06, "loss": 0.4479, "mean_token_accuracy": 0.8604987978935241, "num_tokens": 23750495.0, "step": 2925 }, { "entropy": 0.49238319993019103, "epoch": 1.6534988713318284, "grad_norm": 1.4988641738891602, "learning_rate": 4.937153489743649e-06, "loss": 0.3955, "mean_token_accuracy": 0.875276243686676, "num_tokens": 23790793.0, "step": 2930 }, { "entropy": 0.5414620280265808, "epoch": 1.6563205417607223, "grad_norm": 1.4088878631591797, "learning_rate": 4.936939952069515e-06, "loss": 0.4192, "mean_token_accuracy": 0.8669666767120361, "num_tokens": 23831543.0, "step": 2935 }, { "entropy": 0.5236553966999054, "epoch": 1.6591422121896162, "grad_norm": 1.4377782344818115, "learning_rate": 4.936726058439542e-06, "loss": 0.4219, "mean_token_accuracy": 0.8676124572753906, "num_tokens": 23872266.0, "step": 2940 }, { "entropy": 0.5415829122066498, "epoch": 1.6619638826185101, "grad_norm": 1.4951739311218262, "learning_rate": 4.936511808895751e-06, "loss": 0.4225, "mean_token_accuracy": 0.865878450870514, "num_tokens": 23912731.0, "step": 2945 }, { "entropy": 0.5576195478439331, "epoch": 1.664785553047404, "grad_norm": 1.3496601581573486, "learning_rate": 4.936297203480227e-06, "loss": 0.4244, "mean_token_accuracy": 0.8663969993591308, "num_tokens": 23953363.0, "step": 2950 }, { "entropy": 0.5511267483234406, "epoch": 1.667607223476298, "grad_norm": 1.3088607788085938, "learning_rate": 4.936082242235133e-06, "loss": 0.4336, "mean_token_accuracy": 0.8627111554145813, "num_tokens": 23994093.0, "step": 2955 }, { "entropy": 0.5174029409885407, "epoch": 1.670428893905192, "grad_norm": 1.3591479063034058, "learning_rate": 4.935866925202697e-06, "loss": 0.3993, "mean_token_accuracy": 0.8729613065719605, "num_tokens": 24034859.0, "step": 2960 }, { "entropy": 0.5546496689319611, "epoch": 1.6732505643340858, "grad_norm": 1.4932035207748413, "learning_rate": 4.935651252425219e-06, "loss": 0.4278, "mean_token_accuracy": 0.8636979460716248, "num_tokens": 24075192.0, "step": 2965 }, { "entropy": 0.585486912727356, "epoch": 1.6760722347629797, "grad_norm": 1.5926809310913086, "learning_rate": 4.935435223945066e-06, "loss": 0.4578, "mean_token_accuracy": 0.8578457713127137, "num_tokens": 24115948.0, "step": 2970 }, { "entropy": 0.6288502216339111, "epoch": 1.6788939051918734, "grad_norm": 1.4562243223190308, "learning_rate": 4.935218839804678e-06, "loss": 0.5238, "mean_token_accuracy": 0.8391013741493225, "num_tokens": 24156616.0, "step": 2975 }, { "entropy": 0.518006545305252, "epoch": 1.6817155756207676, "grad_norm": 1.3895467519760132, "learning_rate": 4.9350021000465645e-06, "loss": 0.3946, "mean_token_accuracy": 0.8746880412101745, "num_tokens": 24196970.0, "step": 2980 }, { "entropy": 0.5556329131126404, "epoch": 1.6845372460496613, "grad_norm": 1.4481979608535767, "learning_rate": 4.9347850047133025e-06, "loss": 0.4324, "mean_token_accuracy": 0.8628229856491089, "num_tokens": 24237625.0, "step": 2985 }, { "entropy": 0.557821786403656, "epoch": 1.6873589164785554, "grad_norm": 1.4519360065460205, "learning_rate": 4.934567553847541e-06, "loss": 0.4377, "mean_token_accuracy": 0.8617467164993287, "num_tokens": 24278002.0, "step": 2990 }, { "entropy": 0.4816114008426666, "epoch": 1.690180586907449, "grad_norm": 1.354487657546997, "learning_rate": 4.934349747491998e-06, "loss": 0.3668, "mean_token_accuracy": 0.883508825302124, "num_tokens": 24318792.0, "step": 2995 }, { "entropy": 0.5235770165920257, "epoch": 1.6930022573363432, "grad_norm": 1.4343819618225098, "learning_rate": 4.934131585689462e-06, "loss": 0.4065, "mean_token_accuracy": 0.8717373967170715, "num_tokens": 24359410.0, "step": 3000 }, { "epoch": 1.6930022573363432, "eval_entropy": 0.49325600266456604, "eval_loss": 0.33416908979415894, "eval_mean_token_accuracy": 0.8984284996986389, "eval_num_tokens": 24359410.0, "eval_runtime": 0.1636, "eval_samples_per_second": 24.448, "eval_steps_per_second": 6.112, "step": 3000 }, { "entropy": 0.5068280339241028, "epoch": 1.695823927765237, "grad_norm": 1.4546306133270264, "learning_rate": 4.933913068482792e-06, "loss": 0.4058, "mean_token_accuracy": 0.8731536269187927, "num_tokens": 24400072.0, "step": 3005 }, { "entropy": 0.5603242635726928, "epoch": 1.698645598194131, "grad_norm": 1.3821967840194702, "learning_rate": 4.933694195914913e-06, "loss": 0.4325, "mean_token_accuracy": 0.8643564581871033, "num_tokens": 24440775.0, "step": 3010 }, { "entropy": 0.5545867264270783, "epoch": 1.7014672686230248, "grad_norm": 1.3357994556427002, "learning_rate": 4.9334749680288255e-06, "loss": 0.4158, "mean_token_accuracy": 0.8690497159957886, "num_tokens": 24481543.0, "step": 3015 }, { "entropy": 0.5338212966918945, "epoch": 1.7042889390519187, "grad_norm": 1.5392085313796997, "learning_rate": 4.9332553848675945e-06, "loss": 0.4163, "mean_token_accuracy": 0.867198383808136, "num_tokens": 24522074.0, "step": 3020 }, { "entropy": 0.5320624589920044, "epoch": 1.7071106094808126, "grad_norm": 1.3222757577896118, "learning_rate": 4.933035446474358e-06, "loss": 0.4333, "mean_token_accuracy": 0.8634262561798096, "num_tokens": 24562720.0, "step": 3025 }, { "entropy": 0.5418852508068085, "epoch": 1.7099322799097065, "grad_norm": 1.3481212854385376, "learning_rate": 4.932815152892323e-06, "loss": 0.4225, "mean_token_accuracy": 0.8684999108314514, "num_tokens": 24603277.0, "step": 3030 }, { "entropy": 0.5434010863304138, "epoch": 1.7127539503386005, "grad_norm": 1.4969969987869263, "learning_rate": 4.932594504164767e-06, "loss": 0.4051, "mean_token_accuracy": 0.8709228515625, "num_tokens": 24643930.0, "step": 3035 }, { "entropy": 0.4718973636627197, "epoch": 1.7155756207674944, "grad_norm": 1.2923210859298706, "learning_rate": 4.932373500335035e-06, "loss": 0.3696, "mean_token_accuracy": 0.8835036158561707, "num_tokens": 24684556.0, "step": 3040 }, { "entropy": 0.5055824279785156, "epoch": 1.7183972911963883, "grad_norm": 1.2986489534378052, "learning_rate": 4.932152141446545e-06, "loss": 0.3948, "mean_token_accuracy": 0.8733231663703919, "num_tokens": 24725118.0, "step": 3045 }, { "entropy": 0.5701562583446502, "epoch": 1.7212189616252822, "grad_norm": 1.3613872528076172, "learning_rate": 4.93193042754278e-06, "loss": 0.4687, "mean_token_accuracy": 0.8567159414291382, "num_tokens": 24765797.0, "step": 3050 }, { "entropy": 0.5157757997512817, "epoch": 1.7240406320541761, "grad_norm": 1.4309362173080444, "learning_rate": 4.931708358667299e-06, "loss": 0.4019, "mean_token_accuracy": 0.8720048069953918, "num_tokens": 24806387.0, "step": 3055 }, { "entropy": 0.5428893506526947, "epoch": 1.7268623024830698, "grad_norm": 1.565901756286621, "learning_rate": 4.9314859348637256e-06, "loss": 0.4145, "mean_token_accuracy": 0.8697570323944092, "num_tokens": 24846979.0, "step": 3060 }, { "entropy": 0.5166443586349487, "epoch": 1.729683972911964, "grad_norm": 1.3636257648468018, "learning_rate": 4.931263156175756e-06, "loss": 0.4157, "mean_token_accuracy": 0.8698770403862, "num_tokens": 24887888.0, "step": 3065 }, { "entropy": 0.5039295554161072, "epoch": 1.7325056433408577, "grad_norm": 1.1980102062225342, "learning_rate": 4.931040022647154e-06, "loss": 0.3905, "mean_token_accuracy": 0.8756434202194214, "num_tokens": 24928370.0, "step": 3070 }, { "entropy": 0.510296243429184, "epoch": 1.7353273137697518, "grad_norm": 1.393905758857727, "learning_rate": 4.930816534321755e-06, "loss": 0.3965, "mean_token_accuracy": 0.8748240351676941, "num_tokens": 24969015.0, "step": 3075 }, { "entropy": 0.4927513003349304, "epoch": 1.7381489841986455, "grad_norm": 1.4032167196273804, "learning_rate": 4.930592691243463e-06, "loss": 0.3913, "mean_token_accuracy": 0.8734770774841308, "num_tokens": 25009646.0, "step": 3080 }, { "entropy": 0.5289837539196014, "epoch": 1.7409706546275396, "grad_norm": 1.3683279752731323, "learning_rate": 4.930368493456252e-06, "loss": 0.4062, "mean_token_accuracy": 0.8745390772819519, "num_tokens": 25050335.0, "step": 3085 }, { "entropy": 0.5285290896892547, "epoch": 1.7437923250564333, "grad_norm": 1.4092698097229004, "learning_rate": 4.930143941004166e-06, "loss": 0.4262, "mean_token_accuracy": 0.8690651416778564, "num_tokens": 25090875.0, "step": 3090 }, { "entropy": 0.5130545377731324, "epoch": 1.7466139954853275, "grad_norm": 1.300351619720459, "learning_rate": 4.9299190339313186e-06, "loss": 0.3881, "mean_token_accuracy": 0.8777357459068298, "num_tokens": 25131068.0, "step": 3095 }, { "entropy": 0.5526080071926117, "epoch": 1.7494356659142212, "grad_norm": 1.5255002975463867, "learning_rate": 4.929693772281892e-06, "loss": 0.4491, "mean_token_accuracy": 0.859094786643982, "num_tokens": 25171825.0, "step": 3100 }, { "entropy": 0.554760754108429, "epoch": 1.752257336343115, "grad_norm": 1.6149297952651978, "learning_rate": 4.929468156100139e-06, "loss": 0.4283, "mean_token_accuracy": 0.8624596238136292, "num_tokens": 25212450.0, "step": 3105 }, { "entropy": 0.5364702701568603, "epoch": 1.755079006772009, "grad_norm": 1.3212907314300537, "learning_rate": 4.929242185430382e-06, "loss": 0.4295, "mean_token_accuracy": 0.8656898379325867, "num_tokens": 25252979.0, "step": 3110 }, { "entropy": 0.5041682958602905, "epoch": 1.757900677200903, "grad_norm": 1.4357374906539917, "learning_rate": 4.929015860317013e-06, "loss": 0.3979, "mean_token_accuracy": 0.8751791954040528, "num_tokens": 25293782.0, "step": 3115 }, { "entropy": 0.5505750179290771, "epoch": 1.7607223476297968, "grad_norm": 1.3972926139831543, "learning_rate": 4.928789180804494e-06, "loss": 0.4544, "mean_token_accuracy": 0.8585454225540161, "num_tokens": 25334421.0, "step": 3120 }, { "entropy": 0.5151427447795868, "epoch": 1.7635440180586908, "grad_norm": 1.1562422513961792, "learning_rate": 4.9285621469373565e-06, "loss": 0.3976, "mean_token_accuracy": 0.8717629671096802, "num_tokens": 25374820.0, "step": 3125 }, { "entropy": 0.5338060319423675, "epoch": 1.7663656884875847, "grad_norm": 1.4789332151412964, "learning_rate": 4.9283347587602e-06, "loss": 0.4161, "mean_token_accuracy": 0.8673757314682007, "num_tokens": 25415684.0, "step": 3130 }, { "entropy": 0.5420262277126312, "epoch": 1.7691873589164786, "grad_norm": 1.34184730052948, "learning_rate": 4.928107016317697e-06, "loss": 0.4364, "mean_token_accuracy": 0.8614213824272156, "num_tokens": 25456228.0, "step": 3135 }, { "entropy": 0.5315328538417816, "epoch": 1.7720090293453725, "grad_norm": 1.4150985479354858, "learning_rate": 4.927878919654585e-06, "loss": 0.4275, "mean_token_accuracy": 0.8663432002067566, "num_tokens": 25497007.0, "step": 3140 }, { "entropy": 0.5486104905605316, "epoch": 1.7748306997742662, "grad_norm": 1.6440742015838623, "learning_rate": 4.927650468815675e-06, "loss": 0.4464, "mean_token_accuracy": 0.8615983009338379, "num_tokens": 25537572.0, "step": 3145 }, { "entropy": 0.5485983848571777, "epoch": 1.7776523702031604, "grad_norm": 1.3117893934249878, "learning_rate": 4.927421663845847e-06, "loss": 0.4287, "mean_token_accuracy": 0.8655959486961364, "num_tokens": 25578393.0, "step": 3150 }, { "entropy": 0.5457029461860656, "epoch": 1.780474040632054, "grad_norm": 1.4625993967056274, "learning_rate": 4.927192504790048e-06, "loss": 0.4178, "mean_token_accuracy": 0.8674182772636414, "num_tokens": 25619127.0, "step": 3155 }, { "entropy": 0.5470890522003173, "epoch": 1.7832957110609482, "grad_norm": 1.3004167079925537, "learning_rate": 4.926962991693297e-06, "loss": 0.4472, "mean_token_accuracy": 0.8591425657272339, "num_tokens": 25659696.0, "step": 3160 }, { "entropy": 0.5210420250892639, "epoch": 1.786117381489842, "grad_norm": 1.358970046043396, "learning_rate": 4.926733124600682e-06, "loss": 0.4098, "mean_token_accuracy": 0.8688877105712891, "num_tokens": 25700593.0, "step": 3165 }, { "entropy": 0.5520116329193115, "epoch": 1.788939051918736, "grad_norm": 1.470733880996704, "learning_rate": 4.926502903557361e-06, "loss": 0.44, "mean_token_accuracy": 0.8626960635185241, "num_tokens": 25741390.0, "step": 3170 }, { "entropy": 0.5124341726303101, "epoch": 1.7917607223476297, "grad_norm": 1.2576587200164795, "learning_rate": 4.92627232860856e-06, "loss": 0.4181, "mean_token_accuracy": 0.8688732981681824, "num_tokens": 25781767.0, "step": 3175 }, { "entropy": 0.48606478571891787, "epoch": 1.7945823927765236, "grad_norm": 1.2571027278900146, "learning_rate": 4.926041399799576e-06, "loss": 0.3849, "mean_token_accuracy": 0.8780399203300476, "num_tokens": 25822389.0, "step": 3180 }, { "entropy": 0.5661851406097412, "epoch": 1.7974040632054176, "grad_norm": 1.4934290647506714, "learning_rate": 4.925810117175775e-06, "loss": 0.4423, "mean_token_accuracy": 0.8613282799720764, "num_tokens": 25862747.0, "step": 3185 }, { "entropy": 0.5048659920692444, "epoch": 1.8002257336343115, "grad_norm": 1.2331528663635254, "learning_rate": 4.925578480782593e-06, "loss": 0.3776, "mean_token_accuracy": 0.8817182898521423, "num_tokens": 25903452.0, "step": 3190 }, { "entropy": 0.4995622456073761, "epoch": 1.8030474040632054, "grad_norm": 1.370931625366211, "learning_rate": 4.925346490665533e-06, "loss": 0.3816, "mean_token_accuracy": 0.8787646174430848, "num_tokens": 25943952.0, "step": 3195 }, { "entropy": 0.5398005902767181, "epoch": 1.8058690744920993, "grad_norm": 1.33382248878479, "learning_rate": 4.925114146870172e-06, "loss": 0.4339, "mean_token_accuracy": 0.8646183371543884, "num_tokens": 25984602.0, "step": 3200 }, { "entropy": 0.563685005903244, "epoch": 1.8086907449209932, "grad_norm": 1.504831314086914, "learning_rate": 4.924881449442153e-06, "loss": 0.4739, "mean_token_accuracy": 0.8542726278305054, "num_tokens": 26024952.0, "step": 3205 }, { "entropy": 0.5182668328285217, "epoch": 1.8115124153498872, "grad_norm": 1.46697199344635, "learning_rate": 4.924648398427189e-06, "loss": 0.4174, "mean_token_accuracy": 0.8678057789802551, "num_tokens": 26065459.0, "step": 3210 }, { "entropy": 0.5408389866352081, "epoch": 1.814334085778781, "grad_norm": 1.30887770652771, "learning_rate": 4.924414993871063e-06, "loss": 0.419, "mean_token_accuracy": 0.8668316960334778, "num_tokens": 26106028.0, "step": 3215 }, { "entropy": 0.542147308588028, "epoch": 1.8171557562076748, "grad_norm": 1.524593710899353, "learning_rate": 4.924181235819627e-06, "loss": 0.4275, "mean_token_accuracy": 0.8657636642456055, "num_tokens": 26146540.0, "step": 3220 }, { "entropy": 0.5389390230178833, "epoch": 1.819977426636569, "grad_norm": 1.4004186391830444, "learning_rate": 4.923947124318804e-06, "loss": 0.4232, "mean_token_accuracy": 0.8676887512207031, "num_tokens": 26186216.0, "step": 3225 }, { "entropy": 0.5603163003921509, "epoch": 1.8227990970654626, "grad_norm": 1.5067588090896606, "learning_rate": 4.923712659414585e-06, "loss": 0.4433, "mean_token_accuracy": 0.8593494296073914, "num_tokens": 26226650.0, "step": 3230 }, { "entropy": 0.5345525562763214, "epoch": 1.8256207674943568, "grad_norm": 1.264466643333435, "learning_rate": 4.923477841153029e-06, "loss": 0.424, "mean_token_accuracy": 0.8678050875663758, "num_tokens": 26267478.0, "step": 3235 }, { "entropy": 0.4989795684814453, "epoch": 1.8284424379232505, "grad_norm": 1.2940160036087036, "learning_rate": 4.923242669580268e-06, "loss": 0.3812, "mean_token_accuracy": 0.8774576902389526, "num_tokens": 26308103.0, "step": 3240 }, { "entropy": 0.5003684043884278, "epoch": 1.8312641083521446, "grad_norm": 1.4770820140838623, "learning_rate": 4.923007144742501e-06, "loss": 0.4049, "mean_token_accuracy": 0.8739272236824036, "num_tokens": 26348574.0, "step": 3245 }, { "entropy": 0.4989498794078827, "epoch": 1.8340857787810383, "grad_norm": 1.2541260719299316, "learning_rate": 4.922771266685997e-06, "loss": 0.3849, "mean_token_accuracy": 0.8757034659385681, "num_tokens": 26389200.0, "step": 3250 }, { "entropy": 0.5914472699165344, "epoch": 1.8369074492099324, "grad_norm": 1.3079047203063965, "learning_rate": 4.922535035457094e-06, "loss": 0.4896, "mean_token_accuracy": 0.8483936309814453, "num_tokens": 26429900.0, "step": 3255 }, { "entropy": 0.5169390797615051, "epoch": 1.8397291196388261, "grad_norm": 1.5444384813308716, "learning_rate": 4.922298451102199e-06, "loss": 0.3953, "mean_token_accuracy": 0.8759800791740417, "num_tokens": 26470465.0, "step": 3260 }, { "entropy": 0.536392205953598, "epoch": 1.84255079006772, "grad_norm": 1.189184546470642, "learning_rate": 4.922061513667789e-06, "loss": 0.434, "mean_token_accuracy": 0.8635899901390076, "num_tokens": 26511105.0, "step": 3265 }, { "entropy": 0.5151565790176391, "epoch": 1.845372460496614, "grad_norm": 1.5389307737350464, "learning_rate": 4.921824223200412e-06, "loss": 0.3973, "mean_token_accuracy": 0.8712221384048462, "num_tokens": 26551938.0, "step": 3270 }, { "entropy": 0.5374954879283905, "epoch": 1.8481941309255079, "grad_norm": 1.3970680236816406, "learning_rate": 4.9215865797466826e-06, "loss": 0.4253, "mean_token_accuracy": 0.8682995200157165, "num_tokens": 26592540.0, "step": 3275 }, { "entropy": 0.5597237467765808, "epoch": 1.8510158013544018, "grad_norm": 1.3503648042678833, "learning_rate": 4.921348583353286e-06, "loss": 0.4328, "mean_token_accuracy": 0.8641868352890014, "num_tokens": 26633368.0, "step": 3280 }, { "entropy": 0.5081552624702453, "epoch": 1.8538374717832957, "grad_norm": 1.3734320402145386, "learning_rate": 4.921110234066977e-06, "loss": 0.4065, "mean_token_accuracy": 0.8727322340011596, "num_tokens": 26674050.0, "step": 3285 }, { "entropy": 0.5667516946792602, "epoch": 1.8566591422121896, "grad_norm": 1.4739806652069092, "learning_rate": 4.920871531934579e-06, "loss": 0.4482, "mean_token_accuracy": 0.8577787399291992, "num_tokens": 26714528.0, "step": 3290 }, { "entropy": 0.5383511900901794, "epoch": 1.8594808126410836, "grad_norm": 1.450299620628357, "learning_rate": 4.920632477002985e-06, "loss": 0.4157, "mean_token_accuracy": 0.8684499502182007, "num_tokens": 26755102.0, "step": 3295 }, { "entropy": 0.5264916241168975, "epoch": 1.8623024830699775, "grad_norm": 1.339902400970459, "learning_rate": 4.9203930693191575e-06, "loss": 0.4283, "mean_token_accuracy": 0.8646404504776001, "num_tokens": 26795847.0, "step": 3300 }, { "entropy": 0.501398503780365, "epoch": 1.8651241534988712, "grad_norm": 1.3118853569030762, "learning_rate": 4.920153308930128e-06, "loss": 0.3929, "mean_token_accuracy": 0.872932231426239, "num_tokens": 26836356.0, "step": 3305 }, { "entropy": 0.5669954061508179, "epoch": 1.8679458239277653, "grad_norm": 1.4395278692245483, "learning_rate": 4.919913195882997e-06, "loss": 0.4449, "mean_token_accuracy": 0.8604054570198059, "num_tokens": 26876942.0, "step": 3310 }, { "entropy": 0.4821522831916809, "epoch": 1.870767494356659, "grad_norm": 1.3885427713394165, "learning_rate": 4.919672730224936e-06, "loss": 0.3866, "mean_token_accuracy": 0.8788249969482422, "num_tokens": 26917741.0, "step": 3315 }, { "entropy": 0.5123970150947571, "epoch": 1.8735891647855532, "grad_norm": 1.3463451862335205, "learning_rate": 4.9194319120031836e-06, "loss": 0.3925, "mean_token_accuracy": 0.8763728380203247, "num_tokens": 26957483.0, "step": 3320 }, { "entropy": 0.5375806808471679, "epoch": 1.8764108352144468, "grad_norm": 1.339368462562561, "learning_rate": 4.91919074126505e-06, "loss": 0.4255, "mean_token_accuracy": 0.8660783290863037, "num_tokens": 26998053.0, "step": 3325 }, { "entropy": 0.5030182540416718, "epoch": 1.879232505643341, "grad_norm": 1.2726454734802246, "learning_rate": 4.91894921805791e-06, "loss": 0.3762, "mean_token_accuracy": 0.8820128679275513, "num_tokens": 27038835.0, "step": 3330 }, { "entropy": 0.5537118554115296, "epoch": 1.8820541760722347, "grad_norm": 1.3522690534591675, "learning_rate": 4.918707342429214e-06, "loss": 0.4299, "mean_token_accuracy": 0.8657578349113464, "num_tokens": 27079464.0, "step": 3335 }, { "entropy": 0.5303456306457519, "epoch": 1.8848758465011288, "grad_norm": 1.423553705215454, "learning_rate": 4.9184651144264776e-06, "loss": 0.4217, "mean_token_accuracy": 0.8663669109344483, "num_tokens": 27120201.0, "step": 3340 }, { "entropy": 0.5344128489494324, "epoch": 1.8876975169300225, "grad_norm": 1.223803162574768, "learning_rate": 4.918222534097286e-06, "loss": 0.4155, "mean_token_accuracy": 0.8672279477119446, "num_tokens": 27160686.0, "step": 3345 }, { "entropy": 0.4743356049060822, "epoch": 1.8905191873589164, "grad_norm": 1.3717639446258545, "learning_rate": 4.917979601489295e-06, "loss": 0.3674, "mean_token_accuracy": 0.8822289109230042, "num_tokens": 27201250.0, "step": 3350 }, { "entropy": 0.5208858251571655, "epoch": 1.8933408577878104, "grad_norm": 1.2312960624694824, "learning_rate": 4.917736316650228e-06, "loss": 0.3954, "mean_token_accuracy": 0.8755510807037353, "num_tokens": 27241854.0, "step": 3355 }, { "entropy": 0.5430815577507019, "epoch": 1.8961625282167043, "grad_norm": 1.385640025138855, "learning_rate": 4.917492679627879e-06, "loss": 0.436, "mean_token_accuracy": 0.8614830732345581, "num_tokens": 27282703.0, "step": 3360 }, { "entropy": 0.5193690776824951, "epoch": 1.8989841986455982, "grad_norm": 1.6769845485687256, "learning_rate": 4.917248690470109e-06, "loss": 0.3994, "mean_token_accuracy": 0.8753103852272034, "num_tokens": 27323255.0, "step": 3365 }, { "entropy": 0.5402388453483582, "epoch": 1.9018058690744921, "grad_norm": 1.4692338705062866, "learning_rate": 4.917004349224851e-06, "loss": 0.4094, "mean_token_accuracy": 0.8716601133346558, "num_tokens": 27363961.0, "step": 3370 }, { "entropy": 0.5165061771869659, "epoch": 1.904627539503386, "grad_norm": 1.2910758256912231, "learning_rate": 4.916759655940107e-06, "loss": 0.3992, "mean_token_accuracy": 0.8713883280754089, "num_tokens": 27404297.0, "step": 3375 }, { "entropy": 0.5438097000122071, "epoch": 1.90744920993228, "grad_norm": 1.5428695678710938, "learning_rate": 4.916514610663943e-06, "loss": 0.4398, "mean_token_accuracy": 0.8607651233673096, "num_tokens": 27444966.0, "step": 3380 }, { "entropy": 0.5148679494857789, "epoch": 1.9102708803611739, "grad_norm": 1.370609164237976, "learning_rate": 4.916269213444502e-06, "loss": 0.3973, "mean_token_accuracy": 0.8719611167907715, "num_tokens": 27485807.0, "step": 3385 }, { "entropy": 0.552492767572403, "epoch": 1.9130925507900676, "grad_norm": 1.3460413217544556, "learning_rate": 4.9160234643299935e-06, "loss": 0.4232, "mean_token_accuracy": 0.867039430141449, "num_tokens": 27526548.0, "step": 3390 }, { "entropy": 0.5185729682445526, "epoch": 1.9159142212189617, "grad_norm": 1.2497392892837524, "learning_rate": 4.91577736336869e-06, "loss": 0.4138, "mean_token_accuracy": 0.8712507843971252, "num_tokens": 27566660.0, "step": 3395 }, { "entropy": 0.5251140117645263, "epoch": 1.9187358916478554, "grad_norm": 1.4792594909667969, "learning_rate": 4.915530910608941e-06, "loss": 0.4159, "mean_token_accuracy": 0.8693275570869445, "num_tokens": 27607302.0, "step": 3400 }, { "entropy": 0.5528305053710938, "epoch": 1.9215575620767495, "grad_norm": 1.4354878664016724, "learning_rate": 4.915284106099162e-06, "loss": 0.4363, "mean_token_accuracy": 0.864462697505951, "num_tokens": 27648058.0, "step": 3405 }, { "entropy": 0.5455857455730438, "epoch": 1.9243792325056432, "grad_norm": 1.3338799476623535, "learning_rate": 4.915036949887838e-06, "loss": 0.4146, "mean_token_accuracy": 0.8690644383430481, "num_tokens": 27688285.0, "step": 3410 }, { "entropy": 0.524651050567627, "epoch": 1.9272009029345374, "grad_norm": 1.3062248229980469, "learning_rate": 4.914789442023523e-06, "loss": 0.4082, "mean_token_accuracy": 0.8696060180664062, "num_tokens": 27728945.0, "step": 3415 }, { "entropy": 0.5397283554077148, "epoch": 1.930022573363431, "grad_norm": 1.3946961164474487, "learning_rate": 4.914541582554838e-06, "loss": 0.4419, "mean_token_accuracy": 0.8649717569351196, "num_tokens": 27769755.0, "step": 3420 }, { "entropy": 0.5519603669643403, "epoch": 1.9328442437923252, "grad_norm": 1.3521655797958374, "learning_rate": 4.914293371530478e-06, "loss": 0.448, "mean_token_accuracy": 0.858772075176239, "num_tokens": 27810361.0, "step": 3425 }, { "entropy": 0.5126847267150879, "epoch": 1.935665914221219, "grad_norm": 1.469565510749817, "learning_rate": 4.914044808999202e-06, "loss": 0.3962, "mean_token_accuracy": 0.8749841451644897, "num_tokens": 27851096.0, "step": 3430 }, { "entropy": 0.5041461646556854, "epoch": 1.9384875846501128, "grad_norm": 1.2633264064788818, "learning_rate": 4.913795895009841e-06, "loss": 0.3762, "mean_token_accuracy": 0.8790737509727478, "num_tokens": 27891803.0, "step": 3435 }, { "entropy": 0.5427121460437775, "epoch": 1.9413092550790068, "grad_norm": 1.4089435338974, "learning_rate": 4.913546629611294e-06, "loss": 0.4497, "mean_token_accuracy": 0.8610092759132385, "num_tokens": 27932403.0, "step": 3440 }, { "entropy": 0.5261597633361816, "epoch": 1.9441309255079007, "grad_norm": 1.4115110635757446, "learning_rate": 4.913297012852528e-06, "loss": 0.4186, "mean_token_accuracy": 0.8676816701889039, "num_tokens": 27973177.0, "step": 3445 }, { "entropy": 0.5300165891647339, "epoch": 1.9469525959367946, "grad_norm": 1.4206523895263672, "learning_rate": 4.9130470447825816e-06, "loss": 0.413, "mean_token_accuracy": 0.8669154524803162, "num_tokens": 28013878.0, "step": 3450 }, { "entropy": 0.5528669118881225, "epoch": 1.9497742663656885, "grad_norm": 1.4294172525405884, "learning_rate": 4.912796725450562e-06, "loss": 0.4264, "mean_token_accuracy": 0.8657429218292236, "num_tokens": 28054452.0, "step": 3455 }, { "entropy": 0.5473185420036316, "epoch": 1.9525959367945824, "grad_norm": 1.4492708444595337, "learning_rate": 4.912546054905642e-06, "loss": 0.4413, "mean_token_accuracy": 0.8618425607681275, "num_tokens": 28095117.0, "step": 3460 }, { "entropy": 0.5194451570510864, "epoch": 1.9554176072234764, "grad_norm": 1.4601637125015259, "learning_rate": 4.912295033197068e-06, "loss": 0.4066, "mean_token_accuracy": 0.8705293416976929, "num_tokens": 28135700.0, "step": 3465 }, { "entropy": 0.5036539554595947, "epoch": 1.9582392776523703, "grad_norm": 1.4262093305587769, "learning_rate": 4.9120436603741515e-06, "loss": 0.3792, "mean_token_accuracy": 0.8786629676818848, "num_tokens": 28176234.0, "step": 3470 }, { "entropy": 0.5227799832820892, "epoch": 1.961060948081264, "grad_norm": 1.4064563512802124, "learning_rate": 4.911791936486276e-06, "loss": 0.4158, "mean_token_accuracy": 0.871609365940094, "num_tokens": 28216567.0, "step": 3475 }, { "entropy": 0.49351215958595274, "epoch": 1.963882618510158, "grad_norm": 1.401670217514038, "learning_rate": 4.911539861582893e-06, "loss": 0.3878, "mean_token_accuracy": 0.8766814708709717, "num_tokens": 28257305.0, "step": 3480 }, { "entropy": 0.49786993861198425, "epoch": 1.9667042889390518, "grad_norm": 1.5233041048049927, "learning_rate": 4.911287435713522e-06, "loss": 0.3905, "mean_token_accuracy": 0.8762277483940124, "num_tokens": 28297776.0, "step": 3485 }, { "entropy": 0.4954915583133698, "epoch": 1.969525959367946, "grad_norm": 1.241149663925171, "learning_rate": 4.911034658927751e-06, "loss": 0.3957, "mean_token_accuracy": 0.8773471236228942, "num_tokens": 28338384.0, "step": 3490 }, { "entropy": 0.5027383029460907, "epoch": 1.9723476297968396, "grad_norm": 1.4190332889556885, "learning_rate": 4.91078153127524e-06, "loss": 0.3892, "mean_token_accuracy": 0.8748612880706788, "num_tokens": 28379137.0, "step": 3495 }, { "entropy": 0.5188802659511567, "epoch": 1.9751693002257338, "grad_norm": 1.4441595077514648, "learning_rate": 4.910528052805714e-06, "loss": 0.4141, "mean_token_accuracy": 0.8701729416847229, "num_tokens": 28419987.0, "step": 3500 }, { "epoch": 1.9751693002257338, "eval_entropy": 0.5039494037628174, "eval_loss": 0.32582059502601624, "eval_mean_token_accuracy": 0.9022613763809204, "eval_num_tokens": 28419987.0, "eval_runtime": 0.1635, "eval_samples_per_second": 24.464, "eval_steps_per_second": 6.116, "step": 3500 }, { "entropy": 0.5050791263580322, "epoch": 1.9779909706546275, "grad_norm": 1.2525067329406738, "learning_rate": 4.910274223568971e-06, "loss": 0.4111, "mean_token_accuracy": 0.8697703242301941, "num_tokens": 28460369.0, "step": 3505 }, { "entropy": 0.5229144394397736, "epoch": 1.9808126410835214, "grad_norm": 1.4554386138916016, "learning_rate": 4.9100200436148735e-06, "loss": 0.4106, "mean_token_accuracy": 0.8699278473854065, "num_tokens": 28501101.0, "step": 3510 }, { "entropy": 0.5569809198379516, "epoch": 1.9836343115124153, "grad_norm": 1.4474997520446777, "learning_rate": 4.909765512993357e-06, "loss": 0.4563, "mean_token_accuracy": 0.857849943637848, "num_tokens": 28541729.0, "step": 3515 }, { "entropy": 0.5587629973888397, "epoch": 1.9864559819413092, "grad_norm": 1.3627238273620605, "learning_rate": 4.909510631754425e-06, "loss": 0.4268, "mean_token_accuracy": 0.8645955443382263, "num_tokens": 28582055.0, "step": 3520 }, { "entropy": 0.5122965276241302, "epoch": 1.9892776523702032, "grad_norm": 1.2392799854278564, "learning_rate": 4.909255399948146e-06, "loss": 0.3933, "mean_token_accuracy": 0.8727590441703796, "num_tokens": 28622596.0, "step": 3525 }, { "entropy": 0.5298355937004089, "epoch": 1.992099322799097, "grad_norm": 1.6109238862991333, "learning_rate": 4.908999817624661e-06, "loss": 0.4189, "mean_token_accuracy": 0.8664365530014038, "num_tokens": 28662503.0, "step": 3530 }, { "entropy": 0.5138360559940338, "epoch": 1.994920993227991, "grad_norm": 1.3417595624923706, "learning_rate": 4.9087438848341806e-06, "loss": 0.4158, "mean_token_accuracy": 0.8667647480964661, "num_tokens": 28702848.0, "step": 3535 }, { "entropy": 0.5267124712467194, "epoch": 1.997742663656885, "grad_norm": 1.2989228963851929, "learning_rate": 4.908487601626983e-06, "loss": 0.4113, "mean_token_accuracy": 0.8676295042037964, "num_tokens": 28743457.0, "step": 3540 }, { "entropy": 0.5033242404460907, "epoch": 2.000564334085779, "grad_norm": 1.0862282514572144, "learning_rate": 4.9082309680534134e-06, "loss": 0.3848, "mean_token_accuracy": 0.8765107035636902, "num_tokens": 28777728.0, "step": 3545 }, { "entropy": 0.4844656944274902, "epoch": 2.0033860045146725, "grad_norm": 1.1120542287826538, "learning_rate": 4.907973984163888e-06, "loss": 0.3023, "mean_token_accuracy": 0.9055742740631103, "num_tokens": 28818642.0, "step": 3550 }, { "entropy": 0.4910138726234436, "epoch": 2.0062076749435667, "grad_norm": 1.2199798822402954, "learning_rate": 4.907716650008893e-06, "loss": 0.3487, "mean_token_accuracy": 0.8892881631851196, "num_tokens": 28859021.0, "step": 3555 }, { "entropy": 0.4198545038700104, "epoch": 2.0090293453724604, "grad_norm": 1.348246693611145, "learning_rate": 4.907458965638979e-06, "loss": 0.3017, "mean_token_accuracy": 0.9059156775474548, "num_tokens": 28899517.0, "step": 3560 }, { "entropy": 0.42251318097114565, "epoch": 2.0118510158013545, "grad_norm": 1.4219539165496826, "learning_rate": 4.90720093110477e-06, "loss": 0.316, "mean_token_accuracy": 0.8997484803199768, "num_tokens": 28940360.0, "step": 3565 }, { "entropy": 0.4202295243740082, "epoch": 2.014672686230248, "grad_norm": 1.5008666515350342, "learning_rate": 4.906942546456957e-06, "loss": 0.3168, "mean_token_accuracy": 0.8997807145118714, "num_tokens": 28981158.0, "step": 3570 }, { "entropy": 0.39814443588256837, "epoch": 2.0174943566591423, "grad_norm": 1.4986308813095093, "learning_rate": 4.906683811746298e-06, "loss": 0.2957, "mean_token_accuracy": 0.9047508955001831, "num_tokens": 29021722.0, "step": 3575 }, { "entropy": 0.4490103781223297, "epoch": 2.020316027088036, "grad_norm": 1.7003601789474487, "learning_rate": 4.9064247270236235e-06, "loss": 0.3252, "mean_token_accuracy": 0.8966346383094788, "num_tokens": 29062372.0, "step": 3580 }, { "entropy": 0.440228271484375, "epoch": 2.02313769751693, "grad_norm": 1.8546236753463745, "learning_rate": 4.906165292339828e-06, "loss": 0.3164, "mean_token_accuracy": 0.899936830997467, "num_tokens": 29102844.0, "step": 3585 }, { "entropy": 0.39439972043037413, "epoch": 2.025959367945824, "grad_norm": 1.4125168323516846, "learning_rate": 4.905905507745881e-06, "loss": 0.2914, "mean_token_accuracy": 0.9061366796493531, "num_tokens": 29143188.0, "step": 3590 }, { "entropy": 0.40190274715423585, "epoch": 2.028781038374718, "grad_norm": 1.5160431861877441, "learning_rate": 4.905645373292815e-06, "loss": 0.2931, "mean_token_accuracy": 0.9052773237228393, "num_tokens": 29183656.0, "step": 3595 }, { "entropy": 0.4064796566963196, "epoch": 2.0316027088036117, "grad_norm": 1.6123907566070557, "learning_rate": 4.905384889031734e-06, "loss": 0.302, "mean_token_accuracy": 0.9012960076332093, "num_tokens": 29224451.0, "step": 3600 }, { "entropy": 0.4297832429409027, "epoch": 2.034424379232506, "grad_norm": 1.5297926664352417, "learning_rate": 4.90512405501381e-06, "loss": 0.3242, "mean_token_accuracy": 0.8968385696411133, "num_tokens": 29265091.0, "step": 3605 }, { "entropy": 0.41181485652923583, "epoch": 2.0372460496613995, "grad_norm": 1.4415594339370728, "learning_rate": 4.904862871290285e-06, "loss": 0.3107, "mean_token_accuracy": 0.9039250016212463, "num_tokens": 29305558.0, "step": 3610 }, { "entropy": 0.4206926107406616, "epoch": 2.0400677200902932, "grad_norm": 1.4016162157058716, "learning_rate": 4.904601337912467e-06, "loss": 0.3029, "mean_token_accuracy": 0.9045567750930786, "num_tokens": 29346298.0, "step": 3615 }, { "entropy": 0.3947125256061554, "epoch": 2.0428893905191874, "grad_norm": 1.505416989326477, "learning_rate": 4.9043394549317345e-06, "loss": 0.2789, "mean_token_accuracy": 0.9085012078285217, "num_tokens": 29387055.0, "step": 3620 }, { "entropy": 0.39603736996650696, "epoch": 2.045711060948081, "grad_norm": 1.6036330461502075, "learning_rate": 4.904077222399534e-06, "loss": 0.3023, "mean_token_accuracy": 0.90137619972229, "num_tokens": 29427763.0, "step": 3625 }, { "entropy": 0.3889478862285614, "epoch": 2.0485327313769752, "grad_norm": 1.672661304473877, "learning_rate": 4.903814640367383e-06, "loss": 0.2932, "mean_token_accuracy": 0.9071918845176696, "num_tokens": 29468411.0, "step": 3630 }, { "entropy": 0.39558014273643494, "epoch": 2.051354401805869, "grad_norm": 1.556510090827942, "learning_rate": 4.903551708886865e-06, "loss": 0.2879, "mean_token_accuracy": 0.9074760317802429, "num_tokens": 29509088.0, "step": 3635 }, { "entropy": 0.41549997925758364, "epoch": 2.054176072234763, "grad_norm": 1.4191838502883911, "learning_rate": 4.903288428009632e-06, "loss": 0.2912, "mean_token_accuracy": 0.9053076505661011, "num_tokens": 29549728.0, "step": 3640 }, { "entropy": 0.4345362961292267, "epoch": 2.0569977426636568, "grad_norm": 1.6131196022033691, "learning_rate": 4.9030247977874064e-06, "loss": 0.3214, "mean_token_accuracy": 0.8968990206718445, "num_tokens": 29590362.0, "step": 3645 }, { "entropy": 0.38488792777061465, "epoch": 2.059819413092551, "grad_norm": 1.391592025756836, "learning_rate": 4.902760818271978e-06, "loss": 0.2758, "mean_token_accuracy": 0.9102860927581787, "num_tokens": 29630543.0, "step": 3650 }, { "entropy": 0.4055006742477417, "epoch": 2.0626410835214446, "grad_norm": 1.5487695932388306, "learning_rate": 4.902496489515206e-06, "loss": 0.308, "mean_token_accuracy": 0.8991694211959839, "num_tokens": 29671134.0, "step": 3655 }, { "entropy": 0.4026013076305389, "epoch": 2.0654627539503387, "grad_norm": 1.4963382482528687, "learning_rate": 4.902231811569016e-06, "loss": 0.3016, "mean_token_accuracy": 0.9023310422897339, "num_tokens": 29711975.0, "step": 3660 }, { "entropy": 0.3941408574581146, "epoch": 2.0682844243792324, "grad_norm": 1.4817650318145752, "learning_rate": 4.901966784485407e-06, "loss": 0.2837, "mean_token_accuracy": 0.9089763045310975, "num_tokens": 29752694.0, "step": 3665 }, { "entropy": 0.4023356318473816, "epoch": 2.0711060948081266, "grad_norm": 1.6465110778808594, "learning_rate": 4.901701408316443e-06, "loss": 0.2836, "mean_token_accuracy": 0.908521831035614, "num_tokens": 29793192.0, "step": 3670 }, { "entropy": 0.38578653931617735, "epoch": 2.0739277652370203, "grad_norm": 1.5175875425338745, "learning_rate": 4.901435683114255e-06, "loss": 0.2886, "mean_token_accuracy": 0.9055356621742249, "num_tokens": 29834115.0, "step": 3675 }, { "entropy": 0.3811330258846283, "epoch": 2.0767494356659144, "grad_norm": 1.527234435081482, "learning_rate": 4.901169608931046e-06, "loss": 0.2779, "mean_token_accuracy": 0.9086094379425049, "num_tokens": 29874790.0, "step": 3680 }, { "entropy": 0.3947294354438782, "epoch": 2.079571106094808, "grad_norm": 1.5221364498138428, "learning_rate": 4.900903185819088e-06, "loss": 0.2842, "mean_token_accuracy": 0.9065226435661315, "num_tokens": 29915452.0, "step": 3685 }, { "entropy": 0.372803258895874, "epoch": 2.082392776523702, "grad_norm": 1.4854118824005127, "learning_rate": 4.900636413830717e-06, "loss": 0.2865, "mean_token_accuracy": 0.9063424587249755, "num_tokens": 29956198.0, "step": 3690 }, { "entropy": 0.3771813929080963, "epoch": 2.085214446952596, "grad_norm": 1.432308316230774, "learning_rate": 4.900369293018342e-06, "loss": 0.2663, "mean_token_accuracy": 0.9135713815689087, "num_tokens": 29996744.0, "step": 3695 }, { "entropy": 0.3829148352146149, "epoch": 2.0880361173814896, "grad_norm": 1.5369491577148438, "learning_rate": 4.900101823434438e-06, "loss": 0.2865, "mean_token_accuracy": 0.9079620480537415, "num_tokens": 30037615.0, "step": 3700 }, { "entropy": 0.40159552693367007, "epoch": 2.090857787810384, "grad_norm": 1.6757155656814575, "learning_rate": 4.8998340051315515e-06, "loss": 0.2867, "mean_token_accuracy": 0.9064762830734253, "num_tokens": 30078214.0, "step": 3705 }, { "entropy": 0.393253880739212, "epoch": 2.0936794582392775, "grad_norm": 1.5468506813049316, "learning_rate": 4.899565838162292e-06, "loss": 0.2948, "mean_token_accuracy": 0.9056425452232361, "num_tokens": 30118900.0, "step": 3710 }, { "entropy": 0.42414683699607847, "epoch": 2.0965011286681716, "grad_norm": 1.736525297164917, "learning_rate": 4.899297322579345e-06, "loss": 0.3287, "mean_token_accuracy": 0.8957733750343323, "num_tokens": 30159519.0, "step": 3715 }, { "entropy": 0.3917936205863953, "epoch": 2.0993227990970653, "grad_norm": 1.5324411392211914, "learning_rate": 4.899028458435458e-06, "loss": 0.2951, "mean_token_accuracy": 0.9044302463531494, "num_tokens": 30199519.0, "step": 3720 }, { "entropy": 0.38330122232437136, "epoch": 2.1021444695259595, "grad_norm": 1.4559762477874756, "learning_rate": 4.898759245783449e-06, "loss": 0.2755, "mean_token_accuracy": 0.9099204063415527, "num_tokens": 30239952.0, "step": 3725 }, { "entropy": 0.3951174974441528, "epoch": 2.104966139954853, "grad_norm": 1.5351479053497314, "learning_rate": 4.898489684676205e-06, "loss": 0.2907, "mean_token_accuracy": 0.9066962122917175, "num_tokens": 30280534.0, "step": 3730 }, { "entropy": 0.3635559678077698, "epoch": 2.1077878103837473, "grad_norm": 1.50784170627594, "learning_rate": 4.898219775166683e-06, "loss": 0.2621, "mean_token_accuracy": 0.9133730411529541, "num_tokens": 30321115.0, "step": 3735 }, { "entropy": 0.41765828132629396, "epoch": 2.110609480812641, "grad_norm": 1.554824709892273, "learning_rate": 4.897949517307905e-06, "loss": 0.3156, "mean_token_accuracy": 0.898926043510437, "num_tokens": 30361715.0, "step": 3740 }, { "entropy": 0.40577141046524046, "epoch": 2.113431151241535, "grad_norm": 1.543889045715332, "learning_rate": 4.897678911152964e-06, "loss": 0.302, "mean_token_accuracy": 0.9033358812332153, "num_tokens": 30402151.0, "step": 3745 }, { "entropy": 0.3970208168029785, "epoch": 2.116252821670429, "grad_norm": 1.4221984148025513, "learning_rate": 4.897407956755021e-06, "loss": 0.2984, "mean_token_accuracy": 0.9049067020416259, "num_tokens": 30442797.0, "step": 3750 }, { "entropy": 0.41688573360443115, "epoch": 2.119074492099323, "grad_norm": 1.6445451974868774, "learning_rate": 4.897136654167304e-06, "loss": 0.3151, "mean_token_accuracy": 0.9019560813903809, "num_tokens": 30483552.0, "step": 3755 }, { "entropy": 0.34632920026779174, "epoch": 2.1218961625282167, "grad_norm": 1.3843744993209839, "learning_rate": 4.896865003443111e-06, "loss": 0.2493, "mean_token_accuracy": 0.9194693088531494, "num_tokens": 30524141.0, "step": 3760 }, { "entropy": 0.41517892479896545, "epoch": 2.124717832957111, "grad_norm": 1.4717671871185303, "learning_rate": 4.896593004635807e-06, "loss": 0.2993, "mean_token_accuracy": 0.9012321352958679, "num_tokens": 30564727.0, "step": 3765 }, { "entropy": 0.3839627206325531, "epoch": 2.1275395033860045, "grad_norm": 1.5019539594650269, "learning_rate": 4.896320657798828e-06, "loss": 0.2671, "mean_token_accuracy": 0.9121496915817261, "num_tokens": 30605161.0, "step": 3770 }, { "entropy": 0.40346505045890807, "epoch": 2.130361173814898, "grad_norm": 1.8757257461547852, "learning_rate": 4.896047962985676e-06, "loss": 0.3044, "mean_token_accuracy": 0.9000169515609742, "num_tokens": 30645838.0, "step": 3775 }, { "entropy": 0.40536361932754517, "epoch": 2.1331828442437923, "grad_norm": 1.508545994758606, "learning_rate": 4.89577492024992e-06, "loss": 0.3083, "mean_token_accuracy": 0.899243688583374, "num_tokens": 30686256.0, "step": 3780 }, { "entropy": 0.4287443220615387, "epoch": 2.136004514672686, "grad_norm": 1.4562077522277832, "learning_rate": 4.895501529645201e-06, "loss": 0.3222, "mean_token_accuracy": 0.8983871340751648, "num_tokens": 30726917.0, "step": 3785 }, { "entropy": 0.434832763671875, "epoch": 2.13882618510158, "grad_norm": 1.5828227996826172, "learning_rate": 4.895227791225228e-06, "loss": 0.2999, "mean_token_accuracy": 0.905066967010498, "num_tokens": 30767481.0, "step": 3790 }, { "entropy": 0.4244759321212769, "epoch": 2.141647855530474, "grad_norm": 1.6923705339431763, "learning_rate": 4.894953705043774e-06, "loss": 0.3151, "mean_token_accuracy": 0.901056170463562, "num_tokens": 30808039.0, "step": 3795 }, { "entropy": 0.40609169006347656, "epoch": 2.144469525959368, "grad_norm": 1.470828652381897, "learning_rate": 4.894679271154684e-06, "loss": 0.3031, "mean_token_accuracy": 0.9032929420471192, "num_tokens": 30848528.0, "step": 3800 }, { "entropy": 0.38831552267074587, "epoch": 2.1472911963882617, "grad_norm": 1.4154300689697266, "learning_rate": 4.894404489611872e-06, "loss": 0.3071, "mean_token_accuracy": 0.9009493231773377, "num_tokens": 30889149.0, "step": 3805 }, { "entropy": 0.36803258061408994, "epoch": 2.150112866817156, "grad_norm": 1.5650067329406738, "learning_rate": 4.894129360469317e-06, "loss": 0.2586, "mean_token_accuracy": 0.9152506470680237, "num_tokens": 30929871.0, "step": 3810 }, { "entropy": 0.41122249364852903, "epoch": 2.1529345372460496, "grad_norm": 1.7758678197860718, "learning_rate": 4.89385388378107e-06, "loss": 0.3143, "mean_token_accuracy": 0.9019395470619201, "num_tokens": 30970228.0, "step": 3815 }, { "entropy": 0.3893428444862366, "epoch": 2.1557562076749437, "grad_norm": 1.581362009048462, "learning_rate": 4.893578059601249e-06, "loss": 0.2896, "mean_token_accuracy": 0.9072283625602722, "num_tokens": 31010919.0, "step": 3820 }, { "entropy": 0.3995703637599945, "epoch": 2.1585778781038374, "grad_norm": 1.6888242959976196, "learning_rate": 4.893301887984036e-06, "loss": 0.3082, "mean_token_accuracy": 0.8979094982147217, "num_tokens": 31051717.0, "step": 3825 }, { "entropy": 0.3909620702266693, "epoch": 2.1613995485327315, "grad_norm": 1.589010238647461, "learning_rate": 4.893025368983688e-06, "loss": 0.2876, "mean_token_accuracy": 0.9069844603538513, "num_tokens": 31092303.0, "step": 3830 }, { "entropy": 0.38756829500198364, "epoch": 2.1642212189616252, "grad_norm": 1.4219690561294556, "learning_rate": 4.892748502654527e-06, "loss": 0.2725, "mean_token_accuracy": 0.9117750763893128, "num_tokens": 31133212.0, "step": 3835 }, { "entropy": 0.39158605933189394, "epoch": 2.1670428893905194, "grad_norm": 1.5833606719970703, "learning_rate": 4.892471289050942e-06, "loss": 0.2781, "mean_token_accuracy": 0.9098140478134156, "num_tokens": 31173102.0, "step": 3840 }, { "entropy": 0.4036704897880554, "epoch": 2.169864559819413, "grad_norm": 1.4913930892944336, "learning_rate": 4.892193728227393e-06, "loss": 0.3055, "mean_token_accuracy": 0.9021237850189209, "num_tokens": 31213865.0, "step": 3845 }, { "entropy": 0.394330632686615, "epoch": 2.172686230248307, "grad_norm": 1.8555155992507935, "learning_rate": 4.891915820238406e-06, "loss": 0.3095, "mean_token_accuracy": 0.8976067423820495, "num_tokens": 31254408.0, "step": 3850 }, { "entropy": 0.4108275890350342, "epoch": 2.175507900677201, "grad_norm": 1.4580622911453247, "learning_rate": 4.891637565138578e-06, "loss": 0.3162, "mean_token_accuracy": 0.8978480458259582, "num_tokens": 31295211.0, "step": 3855 }, { "entropy": 0.42848613262176516, "epoch": 2.1783295711060946, "grad_norm": 1.54513680934906, "learning_rate": 4.891358962982569e-06, "loss": 0.308, "mean_token_accuracy": 0.9015236139297486, "num_tokens": 31335702.0, "step": 3860 }, { "entropy": 0.4215279221534729, "epoch": 2.1811512415349887, "grad_norm": 1.7928465604782104, "learning_rate": 4.891080013825112e-06, "loss": 0.3083, "mean_token_accuracy": 0.9013218641281128, "num_tokens": 31376407.0, "step": 3865 }, { "entropy": 0.4218919575214386, "epoch": 2.1839729119638824, "grad_norm": 1.5700446367263794, "learning_rate": 4.890800717721007e-06, "loss": 0.3067, "mean_token_accuracy": 0.9020668745040894, "num_tokens": 31416993.0, "step": 3870 }, { "entropy": 0.40297368764877317, "epoch": 2.1867945823927766, "grad_norm": 1.5355396270751953, "learning_rate": 4.890521074725122e-06, "loss": 0.2787, "mean_token_accuracy": 0.9106267809867858, "num_tokens": 31457735.0, "step": 3875 }, { "entropy": 0.4444594144821167, "epoch": 2.1896162528216703, "grad_norm": 1.762122631072998, "learning_rate": 4.890241084892392e-06, "loss": 0.324, "mean_token_accuracy": 0.8961127758026123, "num_tokens": 31498437.0, "step": 3880 }, { "entropy": 0.39649640321731566, "epoch": 2.1924379232505644, "grad_norm": 1.3777143955230713, "learning_rate": 4.889960748277821e-06, "loss": 0.2963, "mean_token_accuracy": 0.9072598814964294, "num_tokens": 31539261.0, "step": 3885 }, { "entropy": 0.36735162138938904, "epoch": 2.195259593679458, "grad_norm": 1.510568618774414, "learning_rate": 4.889680064936483e-06, "loss": 0.2806, "mean_token_accuracy": 0.9091784954071045, "num_tokens": 31580151.0, "step": 3890 }, { "entropy": 0.3932835698127747, "epoch": 2.1980812641083523, "grad_norm": 1.311471939086914, "learning_rate": 4.889399034923515e-06, "loss": 0.2901, "mean_token_accuracy": 0.9054880023002625, "num_tokens": 31620726.0, "step": 3895 }, { "entropy": 0.4081146061420441, "epoch": 2.200902934537246, "grad_norm": 1.7176251411437988, "learning_rate": 4.889117658294128e-06, "loss": 0.3022, "mean_token_accuracy": 0.9020537614822388, "num_tokens": 31661526.0, "step": 3900 }, { "entropy": 0.4014622807502747, "epoch": 2.20372460496614, "grad_norm": 1.5755354166030884, "learning_rate": 4.888835935103598e-06, "loss": 0.3051, "mean_token_accuracy": 0.9023280262947082, "num_tokens": 31702085.0, "step": 3905 }, { "entropy": 0.39336706399917604, "epoch": 2.206546275395034, "grad_norm": 1.734265923500061, "learning_rate": 4.888553865407269e-06, "loss": 0.2862, "mean_token_accuracy": 0.905624508857727, "num_tokens": 31742379.0, "step": 3910 }, { "entropy": 0.3690111577510834, "epoch": 2.209367945823928, "grad_norm": 1.5115963220596313, "learning_rate": 4.888271449260554e-06, "loss": 0.26, "mean_token_accuracy": 0.9165234088897705, "num_tokens": 31783016.0, "step": 3915 }, { "entropy": 0.3873672723770142, "epoch": 2.2121896162528216, "grad_norm": 1.7038564682006836, "learning_rate": 4.887988686718933e-06, "loss": 0.2887, "mean_token_accuracy": 0.9066754698753356, "num_tokens": 31823595.0, "step": 3920 }, { "entropy": 0.3954701006412506, "epoch": 2.2150112866817158, "grad_norm": 1.6473407745361328, "learning_rate": 4.887705577837957e-06, "loss": 0.2929, "mean_token_accuracy": 0.9045971989631653, "num_tokens": 31864135.0, "step": 3925 }, { "entropy": 0.38976659178733825, "epoch": 2.2178329571106095, "grad_norm": 1.4724841117858887, "learning_rate": 4.88742212267324e-06, "loss": 0.2907, "mean_token_accuracy": 0.9068511605262757, "num_tokens": 31904819.0, "step": 3930 }, { "entropy": 0.40335485339164734, "epoch": 2.2206546275395036, "grad_norm": 1.5783131122589111, "learning_rate": 4.887138321280468e-06, "loss": 0.3003, "mean_token_accuracy": 0.9015839457511902, "num_tokens": 31945530.0, "step": 3935 }, { "entropy": 0.3923260450363159, "epoch": 2.2234762979683973, "grad_norm": 1.4128057956695557, "learning_rate": 4.886854173715393e-06, "loss": 0.2819, "mean_token_accuracy": 0.9102500557899476, "num_tokens": 31986171.0, "step": 3940 }, { "entropy": 0.4288273572921753, "epoch": 2.226297968397291, "grad_norm": 1.6166179180145264, "learning_rate": 4.886569680033837e-06, "loss": 0.3106, "mean_token_accuracy": 0.8994893431663513, "num_tokens": 32026712.0, "step": 3945 }, { "entropy": 0.4336235702037811, "epoch": 2.229119638826185, "grad_norm": 1.6575360298156738, "learning_rate": 4.886284840291689e-06, "loss": 0.3289, "mean_token_accuracy": 0.8974316120147705, "num_tokens": 32067338.0, "step": 3950 }, { "entropy": 0.40959821343421937, "epoch": 2.231941309255079, "grad_norm": 1.4833348989486694, "learning_rate": 4.885999654544904e-06, "loss": 0.3031, "mean_token_accuracy": 0.9015701293945313, "num_tokens": 32108045.0, "step": 3955 }, { "entropy": 0.43567387461662294, "epoch": 2.234762979683973, "grad_norm": 1.6455271244049072, "learning_rate": 4.885714122849509e-06, "loss": 0.3236, "mean_token_accuracy": 0.8950015425682067, "num_tokens": 32148724.0, "step": 3960 }, { "entropy": 0.41589171886444093, "epoch": 2.2375846501128667, "grad_norm": 1.6400325298309326, "learning_rate": 4.885428245261596e-06, "loss": 0.3149, "mean_token_accuracy": 0.8991590142250061, "num_tokens": 32189079.0, "step": 3965 }, { "entropy": 0.4249677717685699, "epoch": 2.240406320541761, "grad_norm": 2.002429485321045, "learning_rate": 4.885142021837323e-06, "loss": 0.3116, "mean_token_accuracy": 0.9002622365951538, "num_tokens": 32229786.0, "step": 3970 }, { "entropy": 0.42671540975570676, "epoch": 2.2432279909706545, "grad_norm": 1.9370062351226807, "learning_rate": 4.8848554526329236e-06, "loss": 0.3381, "mean_token_accuracy": 0.8927700161933899, "num_tokens": 32270581.0, "step": 3975 }, { "entropy": 0.4029369592666626, "epoch": 2.2460496613995486, "grad_norm": 1.457308053970337, "learning_rate": 4.88456853770469e-06, "loss": 0.299, "mean_token_accuracy": 0.9022388100624085, "num_tokens": 32311159.0, "step": 3980 }, { "entropy": 0.4298437058925629, "epoch": 2.2488713318284423, "grad_norm": 1.4605598449707031, "learning_rate": 4.88428127710899e-06, "loss": 0.3317, "mean_token_accuracy": 0.8953496932983398, "num_tokens": 32351769.0, "step": 3985 }, { "entropy": 0.4445444464683533, "epoch": 2.2516930022573365, "grad_norm": 1.861961841583252, "learning_rate": 4.883993670902254e-06, "loss": 0.3314, "mean_token_accuracy": 0.895211398601532, "num_tokens": 32392495.0, "step": 3990 }, { "entropy": 0.4017652451992035, "epoch": 2.25451467268623, "grad_norm": 1.816008448600769, "learning_rate": 4.883705719140982e-06, "loss": 0.3018, "mean_token_accuracy": 0.9027621269226074, "num_tokens": 32432976.0, "step": 3995 }, { "entropy": 0.3940592765808105, "epoch": 2.2573363431151243, "grad_norm": 1.5475646257400513, "learning_rate": 4.883417421881744e-06, "loss": 0.294, "mean_token_accuracy": 0.9063061594963073, "num_tokens": 32473442.0, "step": 4000 }, { "epoch": 2.2573363431151243, "eval_entropy": 0.402061402797699, "eval_loss": 0.27405405044555664, "eval_mean_token_accuracy": 0.9164431095123291, "eval_num_tokens": 32473442.0, "eval_runtime": 0.1642, "eval_samples_per_second": 24.354, "eval_steps_per_second": 6.089, "step": 4000 }, { "entropy": 0.4588034152984619, "epoch": 2.260158013544018, "grad_norm": 1.6592538356781006, "learning_rate": 4.883128779181174e-06, "loss": 0.3559, "mean_token_accuracy": 0.889747953414917, "num_tokens": 32514004.0, "step": 4005 }, { "entropy": 0.3681566655635834, "epoch": 2.2629796839729117, "grad_norm": 1.5150611400604248, "learning_rate": 4.8828397910959766e-06, "loss": 0.2613, "mean_token_accuracy": 0.9151654362678527, "num_tokens": 32554858.0, "step": 4010 }, { "entropy": 0.40542457699775697, "epoch": 2.265801354401806, "grad_norm": 1.5727524757385254, "learning_rate": 4.882550457682924e-06, "loss": 0.3099, "mean_token_accuracy": 0.8985041737556457, "num_tokens": 32595341.0, "step": 4015 }, { "entropy": 0.38482665419578554, "epoch": 2.2686230248307, "grad_norm": 1.5396711826324463, "learning_rate": 4.8822607789988565e-06, "loss": 0.2944, "mean_token_accuracy": 0.9044581055641174, "num_tokens": 32636025.0, "step": 4020 }, { "entropy": 0.4148333430290222, "epoch": 2.2714446952595937, "grad_norm": 1.5504050254821777, "learning_rate": 4.881970755100679e-06, "loss": 0.3041, "mean_token_accuracy": 0.901849901676178, "num_tokens": 32676078.0, "step": 4025 }, { "entropy": 0.4355687737464905, "epoch": 2.2742663656884874, "grad_norm": 3.5764992237091064, "learning_rate": 4.88168038604537e-06, "loss": 0.3248, "mean_token_accuracy": 0.8947890400886536, "num_tokens": 32716756.0, "step": 4030 }, { "entropy": 0.4102937400341034, "epoch": 2.2770880361173815, "grad_norm": 1.5980452299118042, "learning_rate": 4.881389671889969e-06, "loss": 0.3088, "mean_token_accuracy": 0.9003346800804138, "num_tokens": 32757375.0, "step": 4035 }, { "entropy": 0.4062139928340912, "epoch": 2.2799097065462752, "grad_norm": 1.4203381538391113, "learning_rate": 4.881098612691589e-06, "loss": 0.2957, "mean_token_accuracy": 0.9053402423858643, "num_tokens": 32797973.0, "step": 4040 }, { "entropy": 0.42087835669517515, "epoch": 2.2827313769751694, "grad_norm": 1.6059361696243286, "learning_rate": 4.880807208507409e-06, "loss": 0.3123, "mean_token_accuracy": 0.8996002554893494, "num_tokens": 32838742.0, "step": 4045 }, { "entropy": 0.3775734961032867, "epoch": 2.285553047404063, "grad_norm": 1.5873465538024902, "learning_rate": 4.880515459394674e-06, "loss": 0.2564, "mean_token_accuracy": 0.9160468459129334, "num_tokens": 32879298.0, "step": 4050 }, { "entropy": 0.3923003554344177, "epoch": 2.288374717832957, "grad_norm": 1.7001484632492065, "learning_rate": 4.880223365410699e-06, "loss": 0.3002, "mean_token_accuracy": 0.9026227593421936, "num_tokens": 32920082.0, "step": 4055 }, { "entropy": 0.41630412340164186, "epoch": 2.291196388261851, "grad_norm": 1.824363350868225, "learning_rate": 4.879930926612866e-06, "loss": 0.3176, "mean_token_accuracy": 0.8960343599319458, "num_tokens": 32960716.0, "step": 4060 }, { "entropy": 0.4242856979370117, "epoch": 2.294018058690745, "grad_norm": 1.5266849994659424, "learning_rate": 4.879638143058625e-06, "loss": 0.3006, "mean_token_accuracy": 0.9029630541801452, "num_tokens": 33001292.0, "step": 4065 }, { "entropy": 0.4204494059085846, "epoch": 2.2968397291196387, "grad_norm": 1.8229271173477173, "learning_rate": 4.879345014805491e-06, "loss": 0.3137, "mean_token_accuracy": 0.9004954934120178, "num_tokens": 33041946.0, "step": 4070 }, { "entropy": 0.40997507572174074, "epoch": 2.299661399548533, "grad_norm": 1.7101879119873047, "learning_rate": 4.8790515419110516e-06, "loss": 0.3036, "mean_token_accuracy": 0.9026644349098205, "num_tokens": 33082717.0, "step": 4075 }, { "entropy": 0.3944500207901001, "epoch": 2.3024830699774266, "grad_norm": 1.7713676691055298, "learning_rate": 4.8787577244329585e-06, "loss": 0.2867, "mean_token_accuracy": 0.9076826214790344, "num_tokens": 33123211.0, "step": 4080 }, { "entropy": 0.4161556899547577, "epoch": 2.3053047404063207, "grad_norm": 1.6283953189849854, "learning_rate": 4.878463562428933e-06, "loss": 0.317, "mean_token_accuracy": 0.8988382816314697, "num_tokens": 33164049.0, "step": 4085 }, { "entropy": 0.4177008092403412, "epoch": 2.3081264108352144, "grad_norm": 1.6907880306243896, "learning_rate": 4.878169055956763e-06, "loss": 0.3069, "mean_token_accuracy": 0.8997668266296387, "num_tokens": 33204881.0, "step": 4090 }, { "entropy": 0.3897433698177338, "epoch": 2.310948081264108, "grad_norm": 1.4863002300262451, "learning_rate": 4.877874205074303e-06, "loss": 0.2881, "mean_token_accuracy": 0.9074371457099915, "num_tokens": 33245356.0, "step": 4095 }, { "entropy": 0.41926553249359133, "epoch": 2.3137697516930023, "grad_norm": 1.7184562683105469, "learning_rate": 4.877579009839478e-06, "loss": 0.3263, "mean_token_accuracy": 0.8960996985435485, "num_tokens": 33285939.0, "step": 4100 }, { "entropy": 0.39500190019607545, "epoch": 2.3165914221218964, "grad_norm": 1.7541210651397705, "learning_rate": 4.877283470310279e-06, "loss": 0.2844, "mean_token_accuracy": 0.9074315071105957, "num_tokens": 33326588.0, "step": 4105 }, { "entropy": 0.40492220520973204, "epoch": 2.31941309255079, "grad_norm": 1.7052255868911743, "learning_rate": 4.876987586544765e-06, "loss": 0.3244, "mean_token_accuracy": 0.8954932928085327, "num_tokens": 33367169.0, "step": 4110 }, { "entropy": 0.38237152695655824, "epoch": 2.322234762979684, "grad_norm": 1.6368436813354492, "learning_rate": 4.876691358601061e-06, "loss": 0.2849, "mean_token_accuracy": 0.9079111337661743, "num_tokens": 33407822.0, "step": 4115 }, { "entropy": 0.42206219434738157, "epoch": 2.325056433408578, "grad_norm": 1.7534505128860474, "learning_rate": 4.876394786537362e-06, "loss": 0.3165, "mean_token_accuracy": 0.899584424495697, "num_tokens": 33448607.0, "step": 4120 }, { "entropy": 0.3744909644126892, "epoch": 2.3278781038374716, "grad_norm": 1.4802114963531494, "learning_rate": 4.87609787041193e-06, "loss": 0.278, "mean_token_accuracy": 0.9111004710197449, "num_tokens": 33489164.0, "step": 4125 }, { "entropy": 0.3757921814918518, "epoch": 2.3306997742663658, "grad_norm": 1.6543906927108765, "learning_rate": 4.875800610283092e-06, "loss": 0.2765, "mean_token_accuracy": 0.9108646512031555, "num_tokens": 33529990.0, "step": 4130 }, { "entropy": 0.42251219749450686, "epoch": 2.3335214446952595, "grad_norm": 1.6640608310699463, "learning_rate": 4.875503006209249e-06, "loss": 0.334, "mean_token_accuracy": 0.8940789937973023, "num_tokens": 33570747.0, "step": 4135 }, { "entropy": 0.3960157215595245, "epoch": 2.3363431151241536, "grad_norm": 1.8502579927444458, "learning_rate": 4.875205058248861e-06, "loss": 0.2835, "mean_token_accuracy": 0.9076458573341369, "num_tokens": 33611154.0, "step": 4140 }, { "entropy": 0.41452285647392273, "epoch": 2.3391647855530473, "grad_norm": 1.5899200439453125, "learning_rate": 4.874906766460463e-06, "loss": 0.314, "mean_token_accuracy": 0.9006463170051575, "num_tokens": 33651898.0, "step": 4145 }, { "entropy": 0.3915266990661621, "epoch": 2.3419864559819414, "grad_norm": 1.3926752805709839, "learning_rate": 4.874608130902653e-06, "loss": 0.2881, "mean_token_accuracy": 0.9062711000442505, "num_tokens": 33692461.0, "step": 4150 }, { "entropy": 0.380002897977829, "epoch": 2.344808126410835, "grad_norm": 1.4338423013687134, "learning_rate": 4.874309151634098e-06, "loss": 0.2729, "mean_token_accuracy": 0.9096962571144104, "num_tokens": 33732460.0, "step": 4155 }, { "entropy": 0.40470706224441527, "epoch": 2.3476297968397293, "grad_norm": 1.7443944215774536, "learning_rate": 4.874009828713532e-06, "loss": 0.2983, "mean_token_accuracy": 0.9021994709968567, "num_tokens": 33773109.0, "step": 4160 }, { "entropy": 0.3907041311264038, "epoch": 2.350451467268623, "grad_norm": 1.3750447034835815, "learning_rate": 4.873710162199759e-06, "loss": 0.2994, "mean_token_accuracy": 0.9032772421836853, "num_tokens": 33813658.0, "step": 4165 }, { "entropy": 0.42153160572052, "epoch": 2.353273137697517, "grad_norm": 1.5966558456420898, "learning_rate": 4.873410152151648e-06, "loss": 0.3312, "mean_token_accuracy": 0.89634268283844, "num_tokens": 33854046.0, "step": 4170 }, { "entropy": 0.39306110739707945, "epoch": 2.356094808126411, "grad_norm": 1.5900177955627441, "learning_rate": 4.873109798628133e-06, "loss": 0.3014, "mean_token_accuracy": 0.902882993221283, "num_tokens": 33894451.0, "step": 4175 }, { "entropy": 0.3619803309440613, "epoch": 2.3589164785553045, "grad_norm": 1.5183881521224976, "learning_rate": 4.872809101688222e-06, "loss": 0.2644, "mean_token_accuracy": 0.9138602614402771, "num_tokens": 33935166.0, "step": 4180 }, { "entropy": 0.40472997426986695, "epoch": 2.3617381489841986, "grad_norm": 1.5294315814971924, "learning_rate": 4.872508061390986e-06, "loss": 0.3044, "mean_token_accuracy": 0.9031421303749084, "num_tokens": 33975747.0, "step": 4185 }, { "entropy": 0.39990630745887756, "epoch": 2.364559819413093, "grad_norm": 1.6503218412399292, "learning_rate": 4.872206677795564e-06, "loss": 0.2919, "mean_token_accuracy": 0.9047986268997192, "num_tokens": 34016543.0, "step": 4190 }, { "entropy": 0.3896356463432312, "epoch": 2.3673814898419865, "grad_norm": 1.6634739637374878, "learning_rate": 4.871904950961163e-06, "loss": 0.2923, "mean_token_accuracy": 0.9031400918960572, "num_tokens": 34057312.0, "step": 4195 }, { "entropy": 0.41114925742149355, "epoch": 2.37020316027088, "grad_norm": 1.5319340229034424, "learning_rate": 4.871602880947058e-06, "loss": 0.3089, "mean_token_accuracy": 0.9023614287376404, "num_tokens": 34098081.0, "step": 4200 }, { "entropy": 0.4191340744495392, "epoch": 2.3730248306997743, "grad_norm": 1.60099196434021, "learning_rate": 4.871300467812589e-06, "loss": 0.3044, "mean_token_accuracy": 0.9024178624153137, "num_tokens": 34138698.0, "step": 4205 }, { "entropy": 0.4011139750480652, "epoch": 2.375846501128668, "grad_norm": 1.5579471588134766, "learning_rate": 4.870997711617166e-06, "loss": 0.3093, "mean_token_accuracy": 0.9014039039611816, "num_tokens": 34179294.0, "step": 4210 }, { "entropy": 0.38470744490623476, "epoch": 2.378668171557562, "grad_norm": 1.4612574577331543, "learning_rate": 4.8706946124202666e-06, "loss": 0.2772, "mean_token_accuracy": 0.9105475664138794, "num_tokens": 34219877.0, "step": 4215 }, { "entropy": 0.4085557281970978, "epoch": 2.381489841986456, "grad_norm": 1.6769572496414185, "learning_rate": 4.8703911702814326e-06, "loss": 0.3239, "mean_token_accuracy": 0.8968191981315613, "num_tokens": 34260392.0, "step": 4220 }, { "entropy": 0.4069161355495453, "epoch": 2.38431151241535, "grad_norm": 1.8266639709472656, "learning_rate": 4.870087385260277e-06, "loss": 0.294, "mean_token_accuracy": 0.9052396297454834, "num_tokens": 34301087.0, "step": 4225 }, { "entropy": 0.41574978828430176, "epoch": 2.3871331828442437, "grad_norm": 1.4496681690216064, "learning_rate": 4.8697832574164786e-06, "loss": 0.3123, "mean_token_accuracy": 0.8990219950675964, "num_tokens": 34341510.0, "step": 4230 }, { "entropy": 0.42293376922607423, "epoch": 2.389954853273138, "grad_norm": 1.5617249011993408, "learning_rate": 4.86947878680978e-06, "loss": 0.3288, "mean_token_accuracy": 0.8938201785087585, "num_tokens": 34382177.0, "step": 4235 }, { "entropy": 0.43548256158828735, "epoch": 2.3927765237020315, "grad_norm": 1.653110146522522, "learning_rate": 4.869173973499999e-06, "loss": 0.3322, "mean_token_accuracy": 0.8943361163139343, "num_tokens": 34422859.0, "step": 4240 }, { "entropy": 0.4331204891204834, "epoch": 2.3955981941309257, "grad_norm": 1.483518123626709, "learning_rate": 4.868868817547013e-06, "loss": 0.3349, "mean_token_accuracy": 0.8963463306427002, "num_tokens": 34463566.0, "step": 4245 }, { "entropy": 0.40947118401527405, "epoch": 2.3984198645598194, "grad_norm": 1.3235682249069214, "learning_rate": 4.868563319010772e-06, "loss": 0.2896, "mean_token_accuracy": 0.9062234163284302, "num_tokens": 34504064.0, "step": 4250 }, { "entropy": 0.4543120741844177, "epoch": 2.4012415349887135, "grad_norm": 1.6390918493270874, "learning_rate": 4.86825747795129e-06, "loss": 0.3415, "mean_token_accuracy": 0.8885816693305969, "num_tokens": 34544665.0, "step": 4255 }, { "entropy": 0.42363797426223754, "epoch": 2.404063205417607, "grad_norm": 1.629384160041809, "learning_rate": 4.86795129442865e-06, "loss": 0.3201, "mean_token_accuracy": 0.8973617792129517, "num_tokens": 34585268.0, "step": 4260 }, { "entropy": 0.3826207935810089, "epoch": 2.406884875846501, "grad_norm": 1.5705349445343018, "learning_rate": 4.867644768503002e-06, "loss": 0.2778, "mean_token_accuracy": 0.9083486676216126, "num_tokens": 34625843.0, "step": 4265 }, { "entropy": 0.38043850660324097, "epoch": 2.409706546275395, "grad_norm": 1.3238205909729004, "learning_rate": 4.867337900234562e-06, "loss": 0.28, "mean_token_accuracy": 0.9097347378730773, "num_tokens": 34666723.0, "step": 4270 }, { "entropy": 0.40482637882232664, "epoch": 2.412528216704289, "grad_norm": 1.4541096687316895, "learning_rate": 4.867030689683615e-06, "loss": 0.2967, "mean_token_accuracy": 0.9034028768539428, "num_tokens": 34707314.0, "step": 4275 }, { "entropy": 0.3765712082386017, "epoch": 2.415349887133183, "grad_norm": 1.694923996925354, "learning_rate": 4.8667231369105126e-06, "loss": 0.2784, "mean_token_accuracy": 0.9113203883171082, "num_tokens": 34748089.0, "step": 4280 }, { "entropy": 0.41379693150520325, "epoch": 2.4181715575620766, "grad_norm": 1.8324544429779053, "learning_rate": 4.866415241975674e-06, "loss": 0.3173, "mean_token_accuracy": 0.8990246057510376, "num_tokens": 34788615.0, "step": 4285 }, { "entropy": 0.405905282497406, "epoch": 2.4209932279909707, "grad_norm": 1.5715994834899902, "learning_rate": 4.866107004939584e-06, "loss": 0.2953, "mean_token_accuracy": 0.9052024960517884, "num_tokens": 34829247.0, "step": 4290 }, { "entropy": 0.38263545036315916, "epoch": 2.4238148984198644, "grad_norm": 1.4427995681762695, "learning_rate": 4.865798425862797e-06, "loss": 0.2802, "mean_token_accuracy": 0.9089892625808715, "num_tokens": 34869192.0, "step": 4295 }, { "entropy": 0.41300634741783143, "epoch": 2.4266365688487586, "grad_norm": 1.6823080778121948, "learning_rate": 4.865489504805933e-06, "loss": 0.3148, "mean_token_accuracy": 0.8993620634078979, "num_tokens": 34909855.0, "step": 4300 }, { "entropy": 0.3476841390132904, "epoch": 2.4294582392776523, "grad_norm": 1.5836542844772339, "learning_rate": 4.865180241829679e-06, "loss": 0.2606, "mean_token_accuracy": 0.915683650970459, "num_tokens": 34950497.0, "step": 4305 }, { "entropy": 0.40133320689201357, "epoch": 2.4322799097065464, "grad_norm": 1.5993763208389282, "learning_rate": 4.8648706369947915e-06, "loss": 0.3077, "mean_token_accuracy": 0.9001999020576477, "num_tokens": 34990946.0, "step": 4310 }, { "entropy": 0.43380234241485593, "epoch": 2.43510158013544, "grad_norm": 1.6708790063858032, "learning_rate": 4.86456069036209e-06, "loss": 0.3327, "mean_token_accuracy": 0.8949172139167786, "num_tokens": 35031584.0, "step": 4315 }, { "entropy": 0.36903883814811705, "epoch": 2.4379232505643342, "grad_norm": 1.705231785774231, "learning_rate": 4.864250401992465e-06, "loss": 0.2643, "mean_token_accuracy": 0.9130355954170227, "num_tokens": 35072066.0, "step": 4320 }, { "entropy": 0.4150381624698639, "epoch": 2.440744920993228, "grad_norm": 1.5706596374511719, "learning_rate": 4.863939771946873e-06, "loss": 0.3097, "mean_token_accuracy": 0.8974992752075195, "num_tokens": 35112703.0, "step": 4325 }, { "entropy": 0.4148720264434814, "epoch": 2.443566591422122, "grad_norm": 1.6442214250564575, "learning_rate": 4.863628800286337e-06, "loss": 0.3247, "mean_token_accuracy": 0.8946601033210755, "num_tokens": 35153539.0, "step": 4330 }, { "entropy": 0.40285355448722837, "epoch": 2.4463882618510158, "grad_norm": 1.4223581552505493, "learning_rate": 4.863317487071946e-06, "loss": 0.3108, "mean_token_accuracy": 0.8984290480613708, "num_tokens": 35194160.0, "step": 4335 }, { "entropy": 0.41776869893074037, "epoch": 2.44920993227991, "grad_norm": 1.5501775741577148, "learning_rate": 4.8630058323648584e-06, "loss": 0.3111, "mean_token_accuracy": 0.8984562754631042, "num_tokens": 35234676.0, "step": 4340 }, { "entropy": 0.4286969780921936, "epoch": 2.4520316027088036, "grad_norm": 1.6209558248519897, "learning_rate": 4.862693836226301e-06, "loss": 0.3231, "mean_token_accuracy": 0.8965187311172486, "num_tokens": 35275453.0, "step": 4345 }, { "entropy": 0.40539962649345396, "epoch": 2.4548532731376973, "grad_norm": 1.6065021753311157, "learning_rate": 4.862381498717563e-06, "loss": 0.3012, "mean_token_accuracy": 0.9059406757354737, "num_tokens": 35315750.0, "step": 4350 }, { "entropy": 0.38936737179756165, "epoch": 2.4576749435665914, "grad_norm": 1.5308164358139038, "learning_rate": 4.862068819900003e-06, "loss": 0.289, "mean_token_accuracy": 0.905747401714325, "num_tokens": 35356475.0, "step": 4355 }, { "entropy": 0.3944389283657074, "epoch": 2.460496613995485, "grad_norm": 1.6397894620895386, "learning_rate": 4.8617557998350475e-06, "loss": 0.2897, "mean_token_accuracy": 0.9059877634048462, "num_tokens": 35397000.0, "step": 4360 }, { "entropy": 0.3817551672458649, "epoch": 2.4633182844243793, "grad_norm": 1.590949296951294, "learning_rate": 4.86144243858419e-06, "loss": 0.2994, "mean_token_accuracy": 0.9029469013214111, "num_tokens": 35437662.0, "step": 4365 }, { "entropy": 0.43104063868522646, "epoch": 2.466139954853273, "grad_norm": 1.7022565603256226, "learning_rate": 4.86112873620899e-06, "loss": 0.346, "mean_token_accuracy": 0.8878795146942139, "num_tokens": 35478304.0, "step": 4370 }, { "entropy": 0.4168141961097717, "epoch": 2.468961625282167, "grad_norm": 1.5154368877410889, "learning_rate": 4.860814692771072e-06, "loss": 0.3225, "mean_token_accuracy": 0.8962018251419067, "num_tokens": 35518948.0, "step": 4375 }, { "entropy": 0.4418110430240631, "epoch": 2.471783295711061, "grad_norm": 1.6656417846679688, "learning_rate": 4.860500308332134e-06, "loss": 0.3292, "mean_token_accuracy": 0.8942551970481872, "num_tokens": 35559540.0, "step": 4380 }, { "entropy": 0.40012283325195314, "epoch": 2.474604966139955, "grad_norm": 1.5913735628128052, "learning_rate": 4.8601855829539345e-06, "loss": 0.3069, "mean_token_accuracy": 0.8994509220123291, "num_tokens": 35600363.0, "step": 4385 }, { "entropy": 0.4134869039058685, "epoch": 2.4774266365688487, "grad_norm": 1.574020504951477, "learning_rate": 4.859870516698302e-06, "loss": 0.3175, "mean_token_accuracy": 0.8977964639663696, "num_tokens": 35641089.0, "step": 4390 }, { "entropy": 0.39518179297447203, "epoch": 2.480248306997743, "grad_norm": 1.6124846935272217, "learning_rate": 4.85955510962713e-06, "loss": 0.2935, "mean_token_accuracy": 0.9060514092445373, "num_tokens": 35681822.0, "step": 4395 }, { "entropy": 0.403781658411026, "epoch": 2.4830699774266365, "grad_norm": 1.7911304235458374, "learning_rate": 4.8592393618023816e-06, "loss": 0.3146, "mean_token_accuracy": 0.8975111484527588, "num_tokens": 35722308.0, "step": 4400 }, { "entropy": 0.4161704480648041, "epoch": 2.4858916478555306, "grad_norm": 1.7675319910049438, "learning_rate": 4.858923273286086e-06, "loss": 0.3117, "mean_token_accuracy": 0.8990342259407044, "num_tokens": 35762753.0, "step": 4405 }, { "entropy": 0.3830337405204773, "epoch": 2.4887133182844243, "grad_norm": 1.3579264879226685, "learning_rate": 4.858606844140337e-06, "loss": 0.2877, "mean_token_accuracy": 0.9072355508804322, "num_tokens": 35803606.0, "step": 4410 }, { "entropy": 0.4230438947677612, "epoch": 2.4915349887133185, "grad_norm": 1.622119665145874, "learning_rate": 4.8582900744272975e-06, "loss": 0.323, "mean_token_accuracy": 0.8985754609107971, "num_tokens": 35844338.0, "step": 4415 }, { "entropy": 0.42835283279418945, "epoch": 2.494356659142212, "grad_norm": 1.5371416807174683, "learning_rate": 4.857972964209199e-06, "loss": 0.3226, "mean_token_accuracy": 0.8958300590515137, "num_tokens": 35885113.0, "step": 4420 }, { "entropy": 0.42627652883529665, "epoch": 2.4971783295711063, "grad_norm": 1.5911802053451538, "learning_rate": 4.857655513548335e-06, "loss": 0.3174, "mean_token_accuracy": 0.8981908798217774, "num_tokens": 35925643.0, "step": 4425 }, { "entropy": 0.37044663429260255, "epoch": 2.5, "grad_norm": 1.3842355012893677, "learning_rate": 4.8573377225070715e-06, "loss": 0.2614, "mean_token_accuracy": 0.9132282495498657, "num_tokens": 35966310.0, "step": 4430 }, { "entropy": 0.3875482201576233, "epoch": 2.5028216704288937, "grad_norm": 1.421795129776001, "learning_rate": 4.857019591147836e-06, "loss": 0.2989, "mean_token_accuracy": 0.9054312467575073, "num_tokens": 36007072.0, "step": 4435 }, { "entropy": 0.4442818224430084, "epoch": 2.505643340857788, "grad_norm": 1.6131327152252197, "learning_rate": 4.856701119533128e-06, "loss": 0.3393, "mean_token_accuracy": 0.8924556851387024, "num_tokens": 36047636.0, "step": 4440 }, { "entropy": 0.4388956606388092, "epoch": 2.508465011286682, "grad_norm": 1.5778138637542725, "learning_rate": 4.856382307725509e-06, "loss": 0.3225, "mean_token_accuracy": 0.8961979508399963, "num_tokens": 36088203.0, "step": 4445 }, { "entropy": 0.402739155292511, "epoch": 2.5112866817155757, "grad_norm": 1.5481038093566895, "learning_rate": 4.856063155787611e-06, "loss": 0.3023, "mean_token_accuracy": 0.9025402426719665, "num_tokens": 36128987.0, "step": 4450 }, { "entropy": 0.3847965598106384, "epoch": 2.5141083521444694, "grad_norm": 1.4700496196746826, "learning_rate": 4.855743663782131e-06, "loss": 0.2893, "mean_token_accuracy": 0.9061774015426636, "num_tokens": 36169817.0, "step": 4455 }, { "entropy": 0.39482749700546266, "epoch": 2.5169300225733635, "grad_norm": 1.6306626796722412, "learning_rate": 4.855423831771832e-06, "loss": 0.2963, "mean_token_accuracy": 0.9046317577362061, "num_tokens": 36210322.0, "step": 4460 }, { "entropy": 0.44486273527145387, "epoch": 2.519751693002257, "grad_norm": 1.7283015251159668, "learning_rate": 4.8551036598195486e-06, "loss": 0.3437, "mean_token_accuracy": 0.8907332181930542, "num_tokens": 36251034.0, "step": 4465 }, { "entropy": 0.42003960609436036, "epoch": 2.5225733634311513, "grad_norm": 1.4865223169326782, "learning_rate": 4.8547831479881745e-06, "loss": 0.3239, "mean_token_accuracy": 0.8974966883659363, "num_tokens": 36291628.0, "step": 4470 }, { "entropy": 0.39935733675956725, "epoch": 2.525395033860045, "grad_norm": 1.3825920820236206, "learning_rate": 4.854462296340677e-06, "loss": 0.2872, "mean_token_accuracy": 0.9067023873329163, "num_tokens": 36332435.0, "step": 4475 }, { "entropy": 0.36378782987594604, "epoch": 2.528216704288939, "grad_norm": 1.507226586341858, "learning_rate": 4.854141104940087e-06, "loss": 0.2582, "mean_token_accuracy": 0.9157714605331421, "num_tokens": 36373117.0, "step": 4480 }, { "entropy": 0.41645694971084596, "epoch": 2.531038374717833, "grad_norm": 1.6471916437149048, "learning_rate": 4.853819573849502e-06, "loss": 0.3073, "mean_token_accuracy": 0.9004851222038269, "num_tokens": 36413658.0, "step": 4485 }, { "entropy": 0.4097161889076233, "epoch": 2.533860045146727, "grad_norm": 1.7997804880142212, "learning_rate": 4.8534977031320855e-06, "loss": 0.3056, "mean_token_accuracy": 0.901634693145752, "num_tokens": 36454096.0, "step": 4490 }, { "entropy": 0.38941258788108823, "epoch": 2.5366817155756207, "grad_norm": 1.6591458320617676, "learning_rate": 4.8531754928510725e-06, "loss": 0.3015, "mean_token_accuracy": 0.9032475352287292, "num_tokens": 36494625.0, "step": 4495 }, { "entropy": 0.39249064326286315, "epoch": 2.5395033860045144, "grad_norm": 1.5502938032150269, "learning_rate": 4.852852943069758e-06, "loss": 0.2938, "mean_token_accuracy": 0.9045561671257019, "num_tokens": 36535372.0, "step": 4500 }, { "epoch": 2.5395033860045144, "eval_entropy": 0.4075223505496979, "eval_loss": 0.2745997905731201, "eval_mean_token_accuracy": 0.918742835521698, "eval_num_tokens": 36535372.0, "eval_runtime": 0.164, "eval_samples_per_second": 24.395, "eval_steps_per_second": 6.099, "step": 4500 }, { "entropy": 0.42699538469314574, "epoch": 2.5423250564334086, "grad_norm": 1.4848240613937378, "learning_rate": 4.852530053851509e-06, "loss": 0.3234, "mean_token_accuracy": 0.8980291485786438, "num_tokens": 36576121.0, "step": 4505 }, { "entropy": 0.4450218975543976, "epoch": 2.5451467268623027, "grad_norm": 1.2995103597640991, "learning_rate": 4.852206825259756e-06, "loss": 0.3528, "mean_token_accuracy": 0.8892289876937867, "num_tokens": 36616673.0, "step": 4510 }, { "entropy": 0.4333424806594849, "epoch": 2.5479683972911964, "grad_norm": 1.4600845575332642, "learning_rate": 4.851883257357997e-06, "loss": 0.3108, "mean_token_accuracy": 0.9001733303070069, "num_tokens": 36657491.0, "step": 4515 }, { "entropy": 0.4113191843032837, "epoch": 2.55079006772009, "grad_norm": 1.7340246438980103, "learning_rate": 4.851559350209798e-06, "loss": 0.3183, "mean_token_accuracy": 0.8975558519363404, "num_tokens": 36698183.0, "step": 4520 }, { "entropy": 0.4206542193889618, "epoch": 2.5536117381489842, "grad_norm": 1.6161917448043823, "learning_rate": 4.85123510387879e-06, "loss": 0.326, "mean_token_accuracy": 0.8958797335624695, "num_tokens": 36738687.0, "step": 4525 }, { "entropy": 0.42369606494903567, "epoch": 2.5564334085778784, "grad_norm": 1.4928462505340576, "learning_rate": 4.850910518428672e-06, "loss": 0.3245, "mean_token_accuracy": 0.8953816771507264, "num_tokens": 36779100.0, "step": 4530 }, { "entropy": 0.4580428898334503, "epoch": 2.559255079006772, "grad_norm": 1.6802058219909668, "learning_rate": 4.850585593923209e-06, "loss": 0.3362, "mean_token_accuracy": 0.8919341087341308, "num_tokens": 36819464.0, "step": 4535 }, { "entropy": 0.44548857808113096, "epoch": 2.5620767494356658, "grad_norm": 1.7012122869491577, "learning_rate": 4.850260330426231e-06, "loss": 0.3257, "mean_token_accuracy": 0.8931902885437012, "num_tokens": 36860115.0, "step": 4540 }, { "entropy": 0.42452192306518555, "epoch": 2.56489841986456, "grad_norm": 1.4587101936340332, "learning_rate": 4.849934728001636e-06, "loss": 0.3252, "mean_token_accuracy": 0.8946157932281494, "num_tokens": 36900844.0, "step": 4545 }, { "entropy": 0.4206928193569183, "epoch": 2.5677200902934536, "grad_norm": 1.6573847532272339, "learning_rate": 4.84960878671339e-06, "loss": 0.3297, "mean_token_accuracy": 0.897615122795105, "num_tokens": 36941461.0, "step": 4550 }, { "entropy": 0.40249053835868837, "epoch": 2.5705417607223477, "grad_norm": 1.6816022396087646, "learning_rate": 4.849282506625525e-06, "loss": 0.2879, "mean_token_accuracy": 0.906862735748291, "num_tokens": 36982267.0, "step": 4555 }, { "entropy": 0.3981044590473175, "epoch": 2.5733634311512414, "grad_norm": 1.6164435148239136, "learning_rate": 4.848955887802135e-06, "loss": 0.3043, "mean_token_accuracy": 0.9023780941963195, "num_tokens": 37023035.0, "step": 4560 }, { "entropy": 0.3921403527259827, "epoch": 2.5761851015801356, "grad_norm": 1.7836408615112305, "learning_rate": 4.8486289303073884e-06, "loss": 0.2822, "mean_token_accuracy": 0.9081403493881226, "num_tokens": 37063570.0, "step": 4565 }, { "entropy": 0.41735507249832154, "epoch": 2.5790067720090293, "grad_norm": 1.6461783647537231, "learning_rate": 4.848301634205514e-06, "loss": 0.3012, "mean_token_accuracy": 0.9022216796875, "num_tokens": 37104278.0, "step": 4570 }, { "entropy": 0.3928376615047455, "epoch": 2.5818284424379234, "grad_norm": 1.6805448532104492, "learning_rate": 4.84797399956081e-06, "loss": 0.2898, "mean_token_accuracy": 0.9058070182800293, "num_tokens": 37144698.0, "step": 4575 }, { "entropy": 0.41108508706092833, "epoch": 2.584650112866817, "grad_norm": 1.6354347467422485, "learning_rate": 4.847646026437639e-06, "loss": 0.3039, "mean_token_accuracy": 0.9019859433174133, "num_tokens": 37185166.0, "step": 4580 }, { "entropy": 0.39545953273773193, "epoch": 2.587471783295711, "grad_norm": 1.4855034351348877, "learning_rate": 4.847317714900432e-06, "loss": 0.2978, "mean_token_accuracy": 0.9041112303733826, "num_tokens": 37225882.0, "step": 4585 }, { "entropy": 0.378242689371109, "epoch": 2.590293453724605, "grad_norm": 1.4001781940460205, "learning_rate": 4.846989065013687e-06, "loss": 0.2667, "mean_token_accuracy": 0.9122205972671509, "num_tokens": 37266182.0, "step": 4590 }, { "entropy": 0.41660374999046323, "epoch": 2.593115124153499, "grad_norm": 1.6582450866699219, "learning_rate": 4.846660076841966e-06, "loss": 0.3095, "mean_token_accuracy": 0.9031738400459289, "num_tokens": 37306948.0, "step": 4595 }, { "entropy": 0.4056197464466095, "epoch": 2.595936794582393, "grad_norm": 1.9591373205184937, "learning_rate": 4.8463307504498995e-06, "loss": 0.3038, "mean_token_accuracy": 0.9031343698501587, "num_tokens": 37347762.0, "step": 4600 }, { "entropy": 0.43581578731536863, "epoch": 2.5987584650112865, "grad_norm": 1.5790077447891235, "learning_rate": 4.846001085902182e-06, "loss": 0.3175, "mean_token_accuracy": 0.9001483678817749, "num_tokens": 37388225.0, "step": 4605 }, { "entropy": 0.4038006603717804, "epoch": 2.6015801354401806, "grad_norm": 1.5071346759796143, "learning_rate": 4.845671083263579e-06, "loss": 0.2923, "mean_token_accuracy": 0.9046383738517761, "num_tokens": 37429028.0, "step": 4610 }, { "entropy": 0.4395131945610046, "epoch": 2.6044018058690743, "grad_norm": 1.651118516921997, "learning_rate": 4.845340742598917e-06, "loss": 0.3422, "mean_token_accuracy": 0.8916622638702393, "num_tokens": 37469231.0, "step": 4615 }, { "entropy": 0.43975943326950073, "epoch": 2.6072234762979685, "grad_norm": 1.7351619005203247, "learning_rate": 4.8450100639730934e-06, "loss": 0.327, "mean_token_accuracy": 0.8946444272994996, "num_tokens": 37509748.0, "step": 4620 }, { "entropy": 0.3946446657180786, "epoch": 2.610045146726862, "grad_norm": 1.5668245553970337, "learning_rate": 4.844679047451068e-06, "loss": 0.2897, "mean_token_accuracy": 0.9054205894470215, "num_tokens": 37549968.0, "step": 4625 }, { "entropy": 0.4205393612384796, "epoch": 2.6128668171557563, "grad_norm": 1.5956112146377563, "learning_rate": 4.844347693097871e-06, "loss": 0.3276, "mean_token_accuracy": 0.8945238828659058, "num_tokens": 37590500.0, "step": 4630 }, { "entropy": 0.4266462683677673, "epoch": 2.61568848758465, "grad_norm": 1.8106273412704468, "learning_rate": 4.844016000978595e-06, "loss": 0.3106, "mean_token_accuracy": 0.9020758390426635, "num_tokens": 37631127.0, "step": 4635 }, { "entropy": 0.3949384868144989, "epoch": 2.618510158013544, "grad_norm": 1.6231563091278076, "learning_rate": 4.843683971158404e-06, "loss": 0.2972, "mean_token_accuracy": 0.9043740272521973, "num_tokens": 37671522.0, "step": 4640 }, { "entropy": 0.427196079492569, "epoch": 2.621331828442438, "grad_norm": 1.8242963552474976, "learning_rate": 4.843351603702522e-06, "loss": 0.3276, "mean_token_accuracy": 0.8958367943763733, "num_tokens": 37711505.0, "step": 4645 }, { "entropy": 0.4056568145751953, "epoch": 2.624153498871332, "grad_norm": 1.767072081565857, "learning_rate": 4.843018898676245e-06, "loss": 0.2968, "mean_token_accuracy": 0.9040299654006958, "num_tokens": 37751970.0, "step": 4650 }, { "entropy": 0.4182514131069183, "epoch": 2.6269751693002257, "grad_norm": 1.381471872329712, "learning_rate": 4.842685856144932e-06, "loss": 0.3202, "mean_token_accuracy": 0.8963116884231568, "num_tokens": 37792712.0, "step": 4655 }, { "entropy": 0.3773132681846619, "epoch": 2.62979683972912, "grad_norm": 1.4395767450332642, "learning_rate": 4.842352476174008e-06, "loss": 0.2735, "mean_token_accuracy": 0.9093364715576172, "num_tokens": 37833341.0, "step": 4660 }, { "entropy": 0.38550790548324587, "epoch": 2.6326185101580135, "grad_norm": 1.4654998779296875, "learning_rate": 4.842018758828968e-06, "loss": 0.2897, "mean_token_accuracy": 0.906914758682251, "num_tokens": 37874036.0, "step": 4665 }, { "entropy": 0.42129603028297424, "epoch": 2.635440180586907, "grad_norm": 1.8509776592254639, "learning_rate": 4.84168470417537e-06, "loss": 0.3046, "mean_token_accuracy": 0.9016175627708435, "num_tokens": 37914771.0, "step": 4670 }, { "entropy": 0.4129085302352905, "epoch": 2.6382618510158014, "grad_norm": 1.6427429914474487, "learning_rate": 4.841350312278838e-06, "loss": 0.3373, "mean_token_accuracy": 0.893082594871521, "num_tokens": 37955424.0, "step": 4675 }, { "entropy": 0.455252867937088, "epoch": 2.6410835214446955, "grad_norm": 1.7675665616989136, "learning_rate": 4.8410155832050635e-06, "loss": 0.346, "mean_token_accuracy": 0.8894224405288697, "num_tokens": 37995924.0, "step": 4680 }, { "entropy": 0.4129503607749939, "epoch": 2.643905191873589, "grad_norm": 1.8195785284042358, "learning_rate": 4.840680517019806e-06, "loss": 0.3097, "mean_token_accuracy": 0.9006987929344177, "num_tokens": 38036393.0, "step": 4685 }, { "entropy": 0.41016621589660646, "epoch": 2.646726862302483, "grad_norm": 1.335083246231079, "learning_rate": 4.840345113788887e-06, "loss": 0.2876, "mean_token_accuracy": 0.9052215456962586, "num_tokens": 38077059.0, "step": 4690 }, { "entropy": 0.38619791269302367, "epoch": 2.649548532731377, "grad_norm": 1.5167460441589355, "learning_rate": 4.840009373578197e-06, "loss": 0.2864, "mean_token_accuracy": 0.9078164935112, "num_tokens": 38117855.0, "step": 4695 }, { "entropy": 0.40981505513191224, "epoch": 2.6523702031602707, "grad_norm": 1.4652817249298096, "learning_rate": 4.839673296453694e-06, "loss": 0.3079, "mean_token_accuracy": 0.8995686173439026, "num_tokens": 38158633.0, "step": 4700 }, { "entropy": 0.40746703147888186, "epoch": 2.655191873589165, "grad_norm": 1.5045616626739502, "learning_rate": 4.839336882481398e-06, "loss": 0.3076, "mean_token_accuracy": 0.9018496155738831, "num_tokens": 38199207.0, "step": 4705 }, { "entropy": 0.4251773953437805, "epoch": 2.6580135440180586, "grad_norm": 1.7509130239486694, "learning_rate": 4.839000131727399e-06, "loss": 0.3138, "mean_token_accuracy": 0.8988733649253845, "num_tokens": 38239853.0, "step": 4710 }, { "entropy": 0.4144779920578003, "epoch": 2.6608352144469527, "grad_norm": 1.6336438655853271, "learning_rate": 4.8386630442578505e-06, "loss": 0.3176, "mean_token_accuracy": 0.8991317749023438, "num_tokens": 38280445.0, "step": 4715 }, { "entropy": 0.42009794116020205, "epoch": 2.6636568848758464, "grad_norm": 1.679404854774475, "learning_rate": 4.838325620138975e-06, "loss": 0.3136, "mean_token_accuracy": 0.8994911909103394, "num_tokens": 38321216.0, "step": 4720 }, { "entropy": 0.3979401350021362, "epoch": 2.6664785553047405, "grad_norm": 1.5892705917358398, "learning_rate": 4.837987859437058e-06, "loss": 0.2973, "mean_token_accuracy": 0.9033595561981201, "num_tokens": 38361943.0, "step": 4725 }, { "entropy": 0.4081549823284149, "epoch": 2.6693002257336342, "grad_norm": 1.4048125743865967, "learning_rate": 4.837649762218454e-06, "loss": 0.3061, "mean_token_accuracy": 0.9016985297203064, "num_tokens": 38402444.0, "step": 4730 }, { "entropy": 0.40113077163696287, "epoch": 2.6721218961625284, "grad_norm": 1.590847134590149, "learning_rate": 4.837311328549582e-06, "loss": 0.3019, "mean_token_accuracy": 0.9024832129478455, "num_tokens": 38443055.0, "step": 4735 }, { "entropy": 0.4067776739597321, "epoch": 2.674943566591422, "grad_norm": 1.6257827281951904, "learning_rate": 4.8369725584969265e-06, "loss": 0.2947, "mean_token_accuracy": 0.9031951904296875, "num_tokens": 38483816.0, "step": 4740 }, { "entropy": 0.386566686630249, "epoch": 2.677765237020316, "grad_norm": 1.3581163883209229, "learning_rate": 4.836633452127039e-06, "loss": 0.2919, "mean_token_accuracy": 0.9064470291137695, "num_tokens": 38523703.0, "step": 4745 }, { "entropy": 0.40097097754478456, "epoch": 2.68058690744921, "grad_norm": 1.6537374258041382, "learning_rate": 4.836294009506537e-06, "loss": 0.2976, "mean_token_accuracy": 0.9049440145492553, "num_tokens": 38564385.0, "step": 4750 }, { "entropy": 0.3615904927253723, "epoch": 2.6834085778781036, "grad_norm": 1.5579919815063477, "learning_rate": 4.835954230702105e-06, "loss": 0.2632, "mean_token_accuracy": 0.9136163473129273, "num_tokens": 38605052.0, "step": 4755 }, { "entropy": 0.43404104113578795, "epoch": 2.6862302483069977, "grad_norm": 1.5290402173995972, "learning_rate": 4.835614115780492e-06, "loss": 0.3276, "mean_token_accuracy": 0.8940969586372376, "num_tokens": 38645892.0, "step": 4760 }, { "entropy": 0.4232151687145233, "epoch": 2.689051918735892, "grad_norm": 1.3929665088653564, "learning_rate": 4.835273664808514e-06, "loss": 0.3277, "mean_token_accuracy": 0.8977612614631653, "num_tokens": 38686583.0, "step": 4765 }, { "entropy": 0.399210125207901, "epoch": 2.6918735891647856, "grad_norm": 1.4951647520065308, "learning_rate": 4.834932877853051e-06, "loss": 0.2983, "mean_token_accuracy": 0.9042705297470093, "num_tokens": 38726923.0, "step": 4770 }, { "entropy": 0.42027135491371154, "epoch": 2.6946952595936793, "grad_norm": 1.3856654167175293, "learning_rate": 4.834591754981053e-06, "loss": 0.3092, "mean_token_accuracy": 0.9001957535743713, "num_tokens": 38767439.0, "step": 4775 }, { "entropy": 0.4031683325767517, "epoch": 2.6975169300225734, "grad_norm": 1.5139154195785522, "learning_rate": 4.834250296259532e-06, "loss": 0.2899, "mean_token_accuracy": 0.9039840221405029, "num_tokens": 38808039.0, "step": 4780 }, { "entropy": 0.40245692133903505, "epoch": 2.700338600451467, "grad_norm": 1.7345075607299805, "learning_rate": 4.8339085017555685e-06, "loss": 0.307, "mean_token_accuracy": 0.9012099623680114, "num_tokens": 38848491.0, "step": 4785 }, { "entropy": 0.4227768838405609, "epoch": 2.7031602708803613, "grad_norm": 1.7092951536178589, "learning_rate": 4.833566371536307e-06, "loss": 0.3181, "mean_token_accuracy": 0.8983111381530762, "num_tokens": 38889280.0, "step": 4790 }, { "entropy": 0.4330504536628723, "epoch": 2.705981941309255, "grad_norm": 1.6095880270004272, "learning_rate": 4.83322390566896e-06, "loss": 0.3443, "mean_token_accuracy": 0.8882631182670593, "num_tokens": 38929955.0, "step": 4795 }, { "entropy": 0.40083847045898435, "epoch": 2.708803611738149, "grad_norm": 1.877541184425354, "learning_rate": 4.832881104220805e-06, "loss": 0.2985, "mean_token_accuracy": 0.9026248455047607, "num_tokens": 38970610.0, "step": 4800 }, { "entropy": 0.43792678117752076, "epoch": 2.711625282167043, "grad_norm": 1.6264429092407227, "learning_rate": 4.8325379672591845e-06, "loss": 0.331, "mean_token_accuracy": 0.8906769514083862, "num_tokens": 39011465.0, "step": 4805 }, { "entropy": 0.40115252137184143, "epoch": 2.714446952595937, "grad_norm": 1.7116504907608032, "learning_rate": 4.8321944948515085e-06, "loss": 0.272, "mean_token_accuracy": 0.9107529878616333, "num_tokens": 39052019.0, "step": 4810 }, { "entropy": 0.3959204196929932, "epoch": 2.7172686230248306, "grad_norm": 1.5745071172714233, "learning_rate": 4.831850687065253e-06, "loss": 0.2908, "mean_token_accuracy": 0.9042417287826539, "num_tokens": 39092820.0, "step": 4815 }, { "entropy": 0.4313171863555908, "epoch": 2.7200902934537243, "grad_norm": 1.4608080387115479, "learning_rate": 4.831506543967958e-06, "loss": 0.3263, "mean_token_accuracy": 0.8949733734130859, "num_tokens": 39133601.0, "step": 4820 }, { "entropy": 0.4190512001514435, "epoch": 2.7229119638826185, "grad_norm": 1.6403937339782715, "learning_rate": 4.831162065627229e-06, "loss": 0.3379, "mean_token_accuracy": 0.8949956893920898, "num_tokens": 39174455.0, "step": 4825 }, { "entropy": 0.39245215654373167, "epoch": 2.7257336343115126, "grad_norm": 1.5303268432617188, "learning_rate": 4.830817252110742e-06, "loss": 0.2972, "mean_token_accuracy": 0.9037534713745117, "num_tokens": 39215087.0, "step": 4830 }, { "entropy": 0.40173652172088625, "epoch": 2.7285553047404063, "grad_norm": 1.5214855670928955, "learning_rate": 4.830472103486233e-06, "loss": 0.2968, "mean_token_accuracy": 0.902808690071106, "num_tokens": 39255856.0, "step": 4835 }, { "entropy": 0.4071968972682953, "epoch": 2.7313769751693, "grad_norm": 1.669796109199524, "learning_rate": 4.830126619821508e-06, "loss": 0.2997, "mean_token_accuracy": 0.903080677986145, "num_tokens": 39296108.0, "step": 4840 }, { "entropy": 0.4280484437942505, "epoch": 2.734198645598194, "grad_norm": 1.5314013957977295, "learning_rate": 4.829780801184437e-06, "loss": 0.322, "mean_token_accuracy": 0.8953822731971741, "num_tokens": 39336454.0, "step": 4845 }, { "entropy": 0.38799464106559756, "epoch": 2.7370203160270883, "grad_norm": 1.6457887887954712, "learning_rate": 4.829434647642956e-06, "loss": 0.3007, "mean_token_accuracy": 0.902879273891449, "num_tokens": 39377316.0, "step": 4850 }, { "entropy": 0.43819814920425415, "epoch": 2.739841986455982, "grad_norm": 1.7149285078048706, "learning_rate": 4.829088159265067e-06, "loss": 0.3406, "mean_token_accuracy": 0.8905494093894959, "num_tokens": 39418143.0, "step": 4855 }, { "entropy": 0.41923085451126096, "epoch": 2.7426636568848757, "grad_norm": 1.331289529800415, "learning_rate": 4.828741336118837e-06, "loss": 0.3192, "mean_token_accuracy": 0.8980522155761719, "num_tokens": 39457182.0, "step": 4860 }, { "entropy": 0.40077372193336486, "epoch": 2.74548532731377, "grad_norm": 1.6222182512283325, "learning_rate": 4.828394178272401e-06, "loss": 0.302, "mean_token_accuracy": 0.9054031133651733, "num_tokens": 39497879.0, "step": 4865 }, { "entropy": 0.39027564525604247, "epoch": 2.7483069977426635, "grad_norm": 1.7052563428878784, "learning_rate": 4.828046685793957e-06, "loss": 0.2949, "mean_token_accuracy": 0.9051683902740478, "num_tokens": 39538388.0, "step": 4870 }, { "entropy": 0.42039836645126344, "epoch": 2.7511286681715577, "grad_norm": 1.6202837228775024, "learning_rate": 4.82769885875177e-06, "loss": 0.3195, "mean_token_accuracy": 0.8974061369895935, "num_tokens": 39579248.0, "step": 4875 }, { "entropy": 0.42434735894203185, "epoch": 2.7539503386004514, "grad_norm": 1.6943541765213013, "learning_rate": 4.8273506972141705e-06, "loss": 0.3145, "mean_token_accuracy": 0.8977732062339783, "num_tokens": 39619680.0, "step": 4880 }, { "entropy": 0.4002332389354706, "epoch": 2.7567720090293455, "grad_norm": 1.5710278749465942, "learning_rate": 4.827002201249556e-06, "loss": 0.298, "mean_token_accuracy": 0.9056402921676636, "num_tokens": 39660296.0, "step": 4885 }, { "entropy": 0.39988999366760253, "epoch": 2.759593679458239, "grad_norm": 1.5473220348358154, "learning_rate": 4.826653370926387e-06, "loss": 0.2995, "mean_token_accuracy": 0.9022600173950195, "num_tokens": 39701090.0, "step": 4890 }, { "entropy": 0.41356533765792847, "epoch": 2.7624153498871333, "grad_norm": 1.5488544702529907, "learning_rate": 4.826304206313193e-06, "loss": 0.3091, "mean_token_accuracy": 0.8972513794898986, "num_tokens": 39741947.0, "step": 4895 }, { "entropy": 0.39104182124137876, "epoch": 2.765237020316027, "grad_norm": 1.5953575372695923, "learning_rate": 4.825954707478565e-06, "loss": 0.3018, "mean_token_accuracy": 0.9020759582519531, "num_tokens": 39782510.0, "step": 4900 }, { "entropy": 0.41210404634475706, "epoch": 2.7680586907449207, "grad_norm": 1.4687649011611938, "learning_rate": 4.825604874491165e-06, "loss": 0.3201, "mean_token_accuracy": 0.8970984816551208, "num_tokens": 39823334.0, "step": 4905 }, { "entropy": 0.38967686891555786, "epoch": 2.770880361173815, "grad_norm": 1.563970923423767, "learning_rate": 4.825254707419716e-06, "loss": 0.2888, "mean_token_accuracy": 0.9057512521743775, "num_tokens": 39864124.0, "step": 4910 }, { "entropy": 0.4446266949176788, "epoch": 2.773702031602709, "grad_norm": 1.8769588470458984, "learning_rate": 4.824904206333009e-06, "loss": 0.3441, "mean_token_accuracy": 0.8903054833412171, "num_tokens": 39904728.0, "step": 4915 }, { "entropy": 0.3861452877521515, "epoch": 2.7765237020316027, "grad_norm": 1.4496490955352783, "learning_rate": 4.8245533712998995e-06, "loss": 0.283, "mean_token_accuracy": 0.9068530559539795, "num_tokens": 39945454.0, "step": 4920 }, { "entropy": 0.407192063331604, "epoch": 2.7793453724604964, "grad_norm": 1.4505201578140259, "learning_rate": 4.8242022023893095e-06, "loss": 0.3053, "mean_token_accuracy": 0.9021325826644897, "num_tokens": 39986255.0, "step": 4925 }, { "entropy": 0.3858388364315033, "epoch": 2.7821670428893905, "grad_norm": 1.4844613075256348, "learning_rate": 4.823850699670225e-06, "loss": 0.2856, "mean_token_accuracy": 0.9087419271469116, "num_tokens": 40026935.0, "step": 4930 }, { "entropy": 0.3972056210041046, "epoch": 2.7849887133182847, "grad_norm": 1.8050625324249268, "learning_rate": 4.823498863211701e-06, "loss": 0.3016, "mean_token_accuracy": 0.9033455014228821, "num_tokens": 40067747.0, "step": 4935 }, { "entropy": 0.4366573691368103, "epoch": 2.7878103837471784, "grad_norm": 1.7209434509277344, "learning_rate": 4.823146693082853e-06, "loss": 0.331, "mean_token_accuracy": 0.8931609511375427, "num_tokens": 40107827.0, "step": 4940 }, { "entropy": 0.44152148365974425, "epoch": 2.790632054176072, "grad_norm": 1.6366186141967773, "learning_rate": 4.822794189352867e-06, "loss": 0.3618, "mean_token_accuracy": 0.8887471437454224, "num_tokens": 40148719.0, "step": 4945 }, { "entropy": 0.3947724997997284, "epoch": 2.793453724604966, "grad_norm": 1.3238067626953125, "learning_rate": 4.822441352090992e-06, "loss": 0.2977, "mean_token_accuracy": 0.9030474781990051, "num_tokens": 40189420.0, "step": 4950 }, { "entropy": 0.42072648406028745, "epoch": 2.79627539503386, "grad_norm": 1.553117036819458, "learning_rate": 4.8220881813665435e-06, "loss": 0.3086, "mean_token_accuracy": 0.9006119012832642, "num_tokens": 40230144.0, "step": 4955 }, { "entropy": 0.39687870144844056, "epoch": 2.799097065462754, "grad_norm": 1.5955865383148193, "learning_rate": 4.8217346772489e-06, "loss": 0.2926, "mean_token_accuracy": 0.9029557347297669, "num_tokens": 40270958.0, "step": 4960 }, { "entropy": 0.38045825362205504, "epoch": 2.8019187358916477, "grad_norm": 1.9875127077102661, "learning_rate": 4.821380839807509e-06, "loss": 0.2799, "mean_token_accuracy": 0.9087697386741638, "num_tokens": 40311657.0, "step": 4965 }, { "entropy": 0.42601910829544065, "epoch": 2.804740406320542, "grad_norm": 1.7846463918685913, "learning_rate": 4.821026669111881e-06, "loss": 0.3235, "mean_token_accuracy": 0.8960994839668274, "num_tokens": 40352256.0, "step": 4970 }, { "entropy": 0.40002630949020385, "epoch": 2.8075620767494356, "grad_norm": 1.5628581047058105, "learning_rate": 4.820672165231595e-06, "loss": 0.3004, "mean_token_accuracy": 0.9042887330055237, "num_tokens": 40392932.0, "step": 4975 }, { "entropy": 0.40157084465026854, "epoch": 2.8103837471783297, "grad_norm": 1.515154480934143, "learning_rate": 4.8203173282362904e-06, "loss": 0.3059, "mean_token_accuracy": 0.900831151008606, "num_tokens": 40433461.0, "step": 4980 }, { "entropy": 0.4204134941101074, "epoch": 2.8132054176072234, "grad_norm": 1.5249524116516113, "learning_rate": 4.819962158195677e-06, "loss": 0.3223, "mean_token_accuracy": 0.8980574369430542, "num_tokens": 40473869.0, "step": 4985 }, { "entropy": 0.3809087574481964, "epoch": 2.816027088036117, "grad_norm": 1.5786020755767822, "learning_rate": 4.819606655179527e-06, "loss": 0.2748, "mean_token_accuracy": 0.9111071109771729, "num_tokens": 40514570.0, "step": 4990 }, { "entropy": 0.4299299597740173, "epoch": 2.8188487584650113, "grad_norm": 1.6505275964736938, "learning_rate": 4.819250819257679e-06, "loss": 0.3196, "mean_token_accuracy": 0.8989997148513794, "num_tokens": 40555393.0, "step": 4995 }, { "entropy": 0.41674472093582154, "epoch": 2.8216704288939054, "grad_norm": 1.8646577596664429, "learning_rate": 4.818894650500037e-06, "loss": 0.3096, "mean_token_accuracy": 0.9004587650299072, "num_tokens": 40596103.0, "step": 5000 }, { "epoch": 2.8216704288939054, "eval_entropy": 0.40231844782829285, "eval_loss": 0.2420196682214737, "eval_mean_token_accuracy": 0.9271751642227173, "eval_num_tokens": 40596103.0, "eval_runtime": 0.1641, "eval_samples_per_second": 24.368, "eval_steps_per_second": 6.092, "step": 5000 }, { "entropy": 0.4362106263637543, "epoch": 2.824492099322799, "grad_norm": 1.5646847486495972, "learning_rate": 4.818538148976572e-06, "loss": 0.3389, "mean_token_accuracy": 0.8909627199172974, "num_tokens": 40636724.0, "step": 5005 }, { "entropy": 0.41136063933372496, "epoch": 2.827313769751693, "grad_norm": 1.6143547296524048, "learning_rate": 4.8181813147573166e-06, "loss": 0.3059, "mean_token_accuracy": 0.9023926854133606, "num_tokens": 40677571.0, "step": 5010 }, { "entropy": 0.4300425052642822, "epoch": 2.830135440180587, "grad_norm": 1.538517713546753, "learning_rate": 4.817824147912371e-06, "loss": 0.3059, "mean_token_accuracy": 0.901062273979187, "num_tokens": 40717960.0, "step": 5015 }, { "entropy": 0.4204569935798645, "epoch": 2.832957110609481, "grad_norm": 1.51469886302948, "learning_rate": 4.817466648511903e-06, "loss": 0.317, "mean_token_accuracy": 0.9010179877281189, "num_tokens": 40758728.0, "step": 5020 }, { "entropy": 0.4019504964351654, "epoch": 2.8357787810383748, "grad_norm": 1.5685406923294067, "learning_rate": 4.817108816626142e-06, "loss": 0.3027, "mean_token_accuracy": 0.9020584464073181, "num_tokens": 40799222.0, "step": 5025 }, { "entropy": 0.4178150534629822, "epoch": 2.8386004514672685, "grad_norm": 1.5866230726242065, "learning_rate": 4.816750652325382e-06, "loss": 0.3083, "mean_token_accuracy": 0.9016893863677978, "num_tokens": 40839920.0, "step": 5030 }, { "entropy": 0.3792074918746948, "epoch": 2.8414221218961626, "grad_norm": 1.2347012758255005, "learning_rate": 4.8163921556799885e-06, "loss": 0.2941, "mean_token_accuracy": 0.904798150062561, "num_tokens": 40880474.0, "step": 5035 }, { "entropy": 0.42022097706794737, "epoch": 2.8442437923250563, "grad_norm": 2.0809385776519775, "learning_rate": 4.816033326760384e-06, "loss": 0.3125, "mean_token_accuracy": 0.9000694155693054, "num_tokens": 40920909.0, "step": 5040 }, { "entropy": 0.40716384053230287, "epoch": 2.8470654627539504, "grad_norm": 1.6952626705169678, "learning_rate": 4.815674165637065e-06, "loss": 0.3045, "mean_token_accuracy": 0.9000877380371094, "num_tokens": 40961474.0, "step": 5045 }, { "entropy": 0.4542030215263367, "epoch": 2.849887133182844, "grad_norm": 2.060497283935547, "learning_rate": 4.815314672380586e-06, "loss": 0.3413, "mean_token_accuracy": 0.8906081914901733, "num_tokens": 41002244.0, "step": 5050 }, { "entropy": 0.42580119967460633, "epoch": 2.8527088036117383, "grad_norm": 1.8272522687911987, "learning_rate": 4.814954847061568e-06, "loss": 0.3134, "mean_token_accuracy": 0.8984528660774231, "num_tokens": 41043006.0, "step": 5055 }, { "entropy": 0.39229719042778016, "epoch": 2.855530474040632, "grad_norm": 1.7011030912399292, "learning_rate": 4.8145946897507026e-06, "loss": 0.2907, "mean_token_accuracy": 0.9052790999412537, "num_tokens": 41083650.0, "step": 5060 }, { "entropy": 0.42324894666671753, "epoch": 2.858352144469526, "grad_norm": 1.7294552326202393, "learning_rate": 4.814234200518741e-06, "loss": 0.3193, "mean_token_accuracy": 0.8964131712913513, "num_tokens": 41124231.0, "step": 5065 }, { "entropy": 0.4176328182220459, "epoch": 2.86117381489842, "grad_norm": 1.8063850402832031, "learning_rate": 4.813873379436499e-06, "loss": 0.311, "mean_token_accuracy": 0.8978959679603576, "num_tokens": 41164837.0, "step": 5070 }, { "entropy": 0.4265771806240082, "epoch": 2.8639954853273135, "grad_norm": 1.4727050065994263, "learning_rate": 4.813512226574863e-06, "loss": 0.3381, "mean_token_accuracy": 0.894121527671814, "num_tokens": 41205500.0, "step": 5075 }, { "entropy": 0.4046603262424469, "epoch": 2.8668171557562077, "grad_norm": 1.5069383382797241, "learning_rate": 4.813150742004782e-06, "loss": 0.3129, "mean_token_accuracy": 0.8987318396568298, "num_tokens": 41246220.0, "step": 5080 }, { "entropy": 0.41487335562705996, "epoch": 2.869638826185102, "grad_norm": 1.5012837648391724, "learning_rate": 4.812788925797267e-06, "loss": 0.3078, "mean_token_accuracy": 0.9006859183311462, "num_tokens": 41287146.0, "step": 5085 }, { "entropy": 0.40536822080612184, "epoch": 2.8724604966139955, "grad_norm": 1.679571270942688, "learning_rate": 4.812426778023398e-06, "loss": 0.3058, "mean_token_accuracy": 0.9021418213844299, "num_tokens": 41327663.0, "step": 5090 }, { "entropy": 0.4111123144626617, "epoch": 2.875282167042889, "grad_norm": 1.7820204496383667, "learning_rate": 4.812064298754319e-06, "loss": 0.3202, "mean_token_accuracy": 0.8966802835464478, "num_tokens": 41367964.0, "step": 5095 }, { "entropy": 0.4519184172153473, "epoch": 2.8781038374717833, "grad_norm": 1.9354314804077148, "learning_rate": 4.811701488061239e-06, "loss": 0.3484, "mean_token_accuracy": 0.8897565484046936, "num_tokens": 41408264.0, "step": 5100 }, { "entropy": 0.3959288239479065, "epoch": 2.8809255079006775, "grad_norm": 1.4760626554489136, "learning_rate": 4.811338346015434e-06, "loss": 0.2947, "mean_token_accuracy": 0.9035079836845398, "num_tokens": 41448952.0, "step": 5105 }, { "entropy": 0.3818633139133453, "epoch": 2.883747178329571, "grad_norm": 1.3426601886749268, "learning_rate": 4.81097487268824e-06, "loss": 0.2856, "mean_token_accuracy": 0.9082074284553527, "num_tokens": 41489735.0, "step": 5110 }, { "entropy": 0.4096335232257843, "epoch": 2.886568848758465, "grad_norm": 1.6107584238052368, "learning_rate": 4.810611068151064e-06, "loss": 0.319, "mean_token_accuracy": 0.898867416381836, "num_tokens": 41530551.0, "step": 5115 }, { "entropy": 0.4616283357143402, "epoch": 2.889390519187359, "grad_norm": 1.9297223091125488, "learning_rate": 4.810246932475374e-06, "loss": 0.3473, "mean_token_accuracy": 0.8871626138687134, "num_tokens": 41571097.0, "step": 5120 }, { "entropy": 0.40603058934211733, "epoch": 2.8922121896162527, "grad_norm": 1.4726239442825317, "learning_rate": 4.809882465732706e-06, "loss": 0.3127, "mean_token_accuracy": 0.8997641682624817, "num_tokens": 41611601.0, "step": 5125 }, { "entropy": 0.4259068608283997, "epoch": 2.895033860045147, "grad_norm": 1.739429235458374, "learning_rate": 4.809517667994657e-06, "loss": 0.3129, "mean_token_accuracy": 0.8975944757461548, "num_tokens": 41651971.0, "step": 5130 }, { "entropy": 0.42120612859725953, "epoch": 2.8978555304740405, "grad_norm": 1.9967470169067383, "learning_rate": 4.809152539332895e-06, "loss": 0.3139, "mean_token_accuracy": 0.9000747799873352, "num_tokens": 41692648.0, "step": 5135 }, { "entropy": 0.4155091643333435, "epoch": 2.9006772009029347, "grad_norm": 1.7973077297210693, "learning_rate": 4.808787079819147e-06, "loss": 0.3242, "mean_token_accuracy": 0.8949877023696899, "num_tokens": 41733143.0, "step": 5140 }, { "entropy": 0.41151371598243713, "epoch": 2.9034988713318284, "grad_norm": 1.404732346534729, "learning_rate": 4.808421289525208e-06, "loss": 0.3154, "mean_token_accuracy": 0.8984189987182617, "num_tokens": 41773361.0, "step": 5145 }, { "entropy": 0.42156214118003843, "epoch": 2.9063205417607225, "grad_norm": 1.6763156652450562, "learning_rate": 4.808055168522938e-06, "loss": 0.3303, "mean_token_accuracy": 0.8945571660995484, "num_tokens": 41813917.0, "step": 5150 }, { "entropy": 0.4589288830757141, "epoch": 2.909142212189616, "grad_norm": 1.4989210367202759, "learning_rate": 4.807688716884262e-06, "loss": 0.359, "mean_token_accuracy": 0.8893625497817993, "num_tokens": 41854234.0, "step": 5155 }, { "entropy": 0.45793223977088926, "epoch": 2.91196388261851, "grad_norm": 1.8785529136657715, "learning_rate": 4.807321934681168e-06, "loss": 0.344, "mean_token_accuracy": 0.8890782237052918, "num_tokens": 41895042.0, "step": 5160 }, { "entropy": 0.42541446089744567, "epoch": 2.914785553047404, "grad_norm": 1.4923590421676636, "learning_rate": 4.806954821985711e-06, "loss": 0.3257, "mean_token_accuracy": 0.8974828958511353, "num_tokens": 41935666.0, "step": 5165 }, { "entropy": 0.411184823513031, "epoch": 2.917607223476298, "grad_norm": 1.8144638538360596, "learning_rate": 4.806587378870011e-06, "loss": 0.3096, "mean_token_accuracy": 0.9010491013526917, "num_tokens": 41976246.0, "step": 5170 }, { "entropy": 0.4226935863494873, "epoch": 2.920428893905192, "grad_norm": 1.6548048257827759, "learning_rate": 4.806219605406253e-06, "loss": 0.3246, "mean_token_accuracy": 0.8946655511856079, "num_tokens": 42016979.0, "step": 5175 }, { "entropy": 0.4352991938591003, "epoch": 2.9232505643340856, "grad_norm": 1.8966737985610962, "learning_rate": 4.805851501666683e-06, "loss": 0.3265, "mean_token_accuracy": 0.8924818277359009, "num_tokens": 42057536.0, "step": 5180 }, { "entropy": 0.43102755546569826, "epoch": 2.9260722347629797, "grad_norm": 1.7986955642700195, "learning_rate": 4.805483067723618e-06, "loss": 0.3331, "mean_token_accuracy": 0.8937864661216736, "num_tokens": 42097826.0, "step": 5185 }, { "entropy": 0.4175034463405609, "epoch": 2.9288939051918734, "grad_norm": 1.5185545682907104, "learning_rate": 4.805114303649436e-06, "loss": 0.3217, "mean_token_accuracy": 0.8953788757324219, "num_tokens": 42138400.0, "step": 5190 }, { "entropy": 0.4255950450897217, "epoch": 2.9317155756207676, "grad_norm": 1.8746482133865356, "learning_rate": 4.80474520951658e-06, "loss": 0.3113, "mean_token_accuracy": 0.9003988265991211, "num_tokens": 42178987.0, "step": 5195 }, { "entropy": 0.41336881518363955, "epoch": 2.9345372460496613, "grad_norm": 1.6806355714797974, "learning_rate": 4.80437578539756e-06, "loss": 0.3031, "mean_token_accuracy": 0.9040932536125184, "num_tokens": 42219657.0, "step": 5200 }, { "entropy": 0.4057203710079193, "epoch": 2.9373589164785554, "grad_norm": 1.564095377922058, "learning_rate": 4.804006031364948e-06, "loss": 0.2936, "mean_token_accuracy": 0.9054961562156677, "num_tokens": 42260473.0, "step": 5205 }, { "entropy": 0.39404407143592834, "epoch": 2.940180586907449, "grad_norm": 1.4518405199050903, "learning_rate": 4.8036359474913826e-06, "loss": 0.288, "mean_token_accuracy": 0.9059139609336853, "num_tokens": 42301055.0, "step": 5210 }, { "entropy": 0.3924104571342468, "epoch": 2.9430022573363432, "grad_norm": 1.319478154182434, "learning_rate": 4.803265533849569e-06, "loss": 0.3051, "mean_token_accuracy": 0.9025569319725036, "num_tokens": 42341730.0, "step": 5215 }, { "entropy": 0.3995777368545532, "epoch": 2.945823927765237, "grad_norm": 1.4966541528701782, "learning_rate": 4.802894790512271e-06, "loss": 0.3079, "mean_token_accuracy": 0.901541805267334, "num_tokens": 42382070.0, "step": 5220 }, { "entropy": 0.4068488717079163, "epoch": 2.948645598194131, "grad_norm": 1.8578685522079468, "learning_rate": 4.8025237175523245e-06, "loss": 0.2956, "mean_token_accuracy": 0.9044872045516967, "num_tokens": 42422271.0, "step": 5225 }, { "entropy": 0.40595067739486695, "epoch": 2.9514672686230248, "grad_norm": 1.6572576761245728, "learning_rate": 4.8021523150426255e-06, "loss": 0.3079, "mean_token_accuracy": 0.9003460884094239, "num_tokens": 42463045.0, "step": 5230 }, { "entropy": 0.4293433666229248, "epoch": 2.954288939051919, "grad_norm": 1.6312439441680908, "learning_rate": 4.801780583056135e-06, "loss": 0.3073, "mean_token_accuracy": 0.9005551815032959, "num_tokens": 42503743.0, "step": 5235 }, { "entropy": 0.43949187994003297, "epoch": 2.9571106094808126, "grad_norm": 1.5055447816848755, "learning_rate": 4.8014085216658824e-06, "loss": 0.3352, "mean_token_accuracy": 0.8934733510017395, "num_tokens": 42544394.0, "step": 5240 }, { "entropy": 0.3744525730609894, "epoch": 2.9599322799097063, "grad_norm": 1.7301849126815796, "learning_rate": 4.801036130944957e-06, "loss": 0.29, "mean_token_accuracy": 0.9071328043937683, "num_tokens": 42584858.0, "step": 5245 }, { "entropy": 0.41108238101005556, "epoch": 2.9627539503386005, "grad_norm": 1.419072151184082, "learning_rate": 4.800663410966516e-06, "loss": 0.3142, "mean_token_accuracy": 0.8972790360450744, "num_tokens": 42625252.0, "step": 5250 }, { "entropy": 0.4198480904102325, "epoch": 2.9655756207674946, "grad_norm": 1.5576399564743042, "learning_rate": 4.80029036180378e-06, "loss": 0.312, "mean_token_accuracy": 0.9000542402267456, "num_tokens": 42665780.0, "step": 5255 }, { "entropy": 0.4146255135536194, "epoch": 2.9683972911963883, "grad_norm": 1.5485845804214478, "learning_rate": 4.799916983530035e-06, "loss": 0.3035, "mean_token_accuracy": 0.9038738012313843, "num_tokens": 42706347.0, "step": 5260 }, { "entropy": 0.4407613933086395, "epoch": 2.971218961625282, "grad_norm": 1.6536972522735596, "learning_rate": 4.7995432762186305e-06, "loss": 0.3554, "mean_token_accuracy": 0.8906416058540344, "num_tokens": 42747026.0, "step": 5265 }, { "entropy": 0.4324296176433563, "epoch": 2.974040632054176, "grad_norm": 1.9314972162246704, "learning_rate": 4.799169239942982e-06, "loss": 0.323, "mean_token_accuracy": 0.8953546643257141, "num_tokens": 42787529.0, "step": 5270 }, { "entropy": 0.40229780673980714, "epoch": 2.97686230248307, "grad_norm": 1.437831163406372, "learning_rate": 4.798794874776569e-06, "loss": 0.301, "mean_token_accuracy": 0.9024040460586548, "num_tokens": 42828263.0, "step": 5275 }, { "entropy": 0.4193461239337921, "epoch": 2.979683972911964, "grad_norm": 1.6553016901016235, "learning_rate": 4.798420180792934e-06, "loss": 0.3335, "mean_token_accuracy": 0.8927172064781189, "num_tokens": 42868861.0, "step": 5280 }, { "entropy": 0.39387176036834715, "epoch": 2.9825056433408577, "grad_norm": 1.827269434928894, "learning_rate": 4.7980451580656884e-06, "loss": 0.3012, "mean_token_accuracy": 0.9013678908348084, "num_tokens": 42909719.0, "step": 5285 }, { "entropy": 0.4310999274253845, "epoch": 2.985327313769752, "grad_norm": 1.4325158596038818, "learning_rate": 4.797669806668504e-06, "loss": 0.3228, "mean_token_accuracy": 0.8959625005722046, "num_tokens": 42950331.0, "step": 5290 }, { "entropy": 0.3933727502822876, "epoch": 2.9881489841986455, "grad_norm": 1.5803815126419067, "learning_rate": 4.797294126675117e-06, "loss": 0.2916, "mean_token_accuracy": 0.9042976379394532, "num_tokens": 42991038.0, "step": 5295 }, { "entropy": 0.4161331236362457, "epoch": 2.9909706546275396, "grad_norm": 1.6079249382019043, "learning_rate": 4.796918118159333e-06, "loss": 0.3214, "mean_token_accuracy": 0.8964751839637757, "num_tokens": 43031524.0, "step": 5300 }, { "entropy": 0.41101468205451963, "epoch": 2.9937923250564333, "grad_norm": 1.6859304904937744, "learning_rate": 4.796541781195018e-06, "loss": 0.3066, "mean_token_accuracy": 0.8995181202888489, "num_tokens": 43072136.0, "step": 5305 }, { "entropy": 0.415380471944809, "epoch": 2.9966139954853275, "grad_norm": 1.5194308757781982, "learning_rate": 4.796165115856101e-06, "loss": 0.3134, "mean_token_accuracy": 0.8985631585121154, "num_tokens": 43111676.0, "step": 5310 }, { "entropy": 0.4073172271251678, "epoch": 2.999435665914221, "grad_norm": 1.6618201732635498, "learning_rate": 4.79578812221658e-06, "loss": 0.3191, "mean_token_accuracy": 0.8969994068145752, "num_tokens": 43152259.0, "step": 5315 }, { "entropy": 0.40902658700942995, "epoch": 3.0022573363431153, "grad_norm": 1.5331395864486694, "learning_rate": 4.795410800350516e-06, "loss": 0.2449, "mean_token_accuracy": 0.9244343876838684, "num_tokens": 43186514.0, "step": 5320 }, { "entropy": 0.3297205209732056, "epoch": 3.005079006772009, "grad_norm": 1.183812141418457, "learning_rate": 4.795033150332033e-06, "loss": 0.2, "mean_token_accuracy": 0.93786141872406, "num_tokens": 43227120.0, "step": 5325 }, { "entropy": 0.32953916788101195, "epoch": 3.007900677200903, "grad_norm": 1.991351842880249, "learning_rate": 4.79465517223532e-06, "loss": 0.2247, "mean_token_accuracy": 0.9283014059066772, "num_tokens": 43267656.0, "step": 5330 }, { "entropy": 0.3248750567436218, "epoch": 3.010722347629797, "grad_norm": 1.9206222295761108, "learning_rate": 4.794276866134631e-06, "loss": 0.2204, "mean_token_accuracy": 0.9299774169921875, "num_tokens": 43308512.0, "step": 5335 }, { "entropy": 0.29567849040031435, "epoch": 3.0135440180586905, "grad_norm": 1.8526611328125, "learning_rate": 4.793898232104286e-06, "loss": 0.2006, "mean_token_accuracy": 0.9349032402038574, "num_tokens": 43349318.0, "step": 5340 }, { "entropy": 0.29463449120521545, "epoch": 3.0163656884875847, "grad_norm": 1.8326590061187744, "learning_rate": 4.7935192702186655e-06, "loss": 0.1999, "mean_token_accuracy": 0.936297345161438, "num_tokens": 43389542.0, "step": 5345 }, { "entropy": 0.322478848695755, "epoch": 3.0191873589164784, "grad_norm": 1.6944438219070435, "learning_rate": 4.793139980552218e-06, "loss": 0.2056, "mean_token_accuracy": 0.9352605819702149, "num_tokens": 43430125.0, "step": 5350 }, { "entropy": 0.28133447766304015, "epoch": 3.0220090293453725, "grad_norm": 5.7940263748168945, "learning_rate": 4.792760363179454e-06, "loss": 0.1745, "mean_token_accuracy": 0.9443145155906677, "num_tokens": 43470808.0, "step": 5355 }, { "entropy": 0.31858010292053224, "epoch": 3.024830699774266, "grad_norm": 2.1480491161346436, "learning_rate": 4.79238041817495e-06, "loss": 0.1987, "mean_token_accuracy": 0.936722457408905, "num_tokens": 43511437.0, "step": 5360 }, { "entropy": 0.27969213724136355, "epoch": 3.0276523702031604, "grad_norm": 1.8360697031021118, "learning_rate": 4.792000145613346e-06, "loss": 0.1816, "mean_token_accuracy": 0.9421907782554626, "num_tokens": 43552179.0, "step": 5365 }, { "entropy": 0.3075882375240326, "epoch": 3.030474040632054, "grad_norm": 1.9553619623184204, "learning_rate": 4.791619545569347e-06, "loss": 0.2032, "mean_token_accuracy": 0.9348996162414551, "num_tokens": 43592847.0, "step": 5370 }, { "entropy": 0.3161883890628815, "epoch": 3.033295711060948, "grad_norm": 1.7996515035629272, "learning_rate": 4.7912386181177216e-06, "loss": 0.2017, "mean_token_accuracy": 0.9368625760078431, "num_tokens": 43633494.0, "step": 5375 }, { "entropy": 0.31576813459396363, "epoch": 3.036117381489842, "grad_norm": 1.7700996398925781, "learning_rate": 4.790857363333303e-06, "loss": 0.2138, "mean_token_accuracy": 0.9302696347236633, "num_tokens": 43674104.0, "step": 5380 }, { "entropy": 0.2922771334648132, "epoch": 3.038939051918736, "grad_norm": 2.0567281246185303, "learning_rate": 4.790475781290988e-06, "loss": 0.1944, "mean_token_accuracy": 0.9376009941101074, "num_tokens": 43714718.0, "step": 5385 }, { "entropy": 0.3182399868965149, "epoch": 3.0417607223476297, "grad_norm": 1.8016152381896973, "learning_rate": 4.79009387206574e-06, "loss": 0.2071, "mean_token_accuracy": 0.9343981266021728, "num_tokens": 43755355.0, "step": 5390 }, { "entropy": 0.27976890206336974, "epoch": 3.044582392776524, "grad_norm": 1.6723804473876953, "learning_rate": 4.7897116357325844e-06, "loss": 0.174, "mean_token_accuracy": 0.9448309302330017, "num_tokens": 43795791.0, "step": 5395 }, { "entropy": 0.30181885361671446, "epoch": 3.0474040632054176, "grad_norm": 1.6413596868515015, "learning_rate": 4.7893290723666116e-06, "loss": 0.192, "mean_token_accuracy": 0.938207459449768, "num_tokens": 43836503.0, "step": 5400 }, { "entropy": 0.2982407629489899, "epoch": 3.0502257336343117, "grad_norm": 1.594096064567566, "learning_rate": 4.788946182042976e-06, "loss": 0.1967, "mean_token_accuracy": 0.9364789009094239, "num_tokens": 43877133.0, "step": 5405 }, { "entropy": 0.34292480945587156, "epoch": 3.0530474040632054, "grad_norm": 1.880804181098938, "learning_rate": 4.788562964836897e-06, "loss": 0.2291, "mean_token_accuracy": 0.9263935923576355, "num_tokens": 43917640.0, "step": 5410 }, { "entropy": 0.32324235439300536, "epoch": 3.055869074492099, "grad_norm": 1.8121103048324585, "learning_rate": 4.788179420823657e-06, "loss": 0.2245, "mean_token_accuracy": 0.9280650019645691, "num_tokens": 43958238.0, "step": 5415 }, { "entropy": 0.31875272989273074, "epoch": 3.0586907449209932, "grad_norm": 1.8845757246017456, "learning_rate": 4.787795550078603e-06, "loss": 0.2127, "mean_token_accuracy": 0.9321471333503724, "num_tokens": 43999127.0, "step": 5420 }, { "entropy": 0.306513249874115, "epoch": 3.061512415349887, "grad_norm": 1.7889739274978638, "learning_rate": 4.787411352677148e-06, "loss": 0.2052, "mean_token_accuracy": 0.9344831585884095, "num_tokens": 44039844.0, "step": 5425 }, { "entropy": 0.3005764603614807, "epoch": 3.064334085778781, "grad_norm": 1.6967425346374512, "learning_rate": 4.787026828694767e-06, "loss": 0.1913, "mean_token_accuracy": 0.9385549426078796, "num_tokens": 44080390.0, "step": 5430 }, { "entropy": 0.28662807643413546, "epoch": 3.0671557562076748, "grad_norm": 1.8293821811676025, "learning_rate": 4.786641978206999e-06, "loss": 0.1713, "mean_token_accuracy": 0.9455971002578736, "num_tokens": 44120988.0, "step": 5435 }, { "entropy": 0.3011536240577698, "epoch": 3.069977426636569, "grad_norm": 1.7703379392623901, "learning_rate": 4.786256801289449e-06, "loss": 0.1839, "mean_token_accuracy": 0.9414896726608276, "num_tokens": 44161183.0, "step": 5440 }, { "entropy": 0.31452121734619143, "epoch": 3.0727990970654626, "grad_norm": 1.900452733039856, "learning_rate": 4.785871298017783e-06, "loss": 0.1999, "mean_token_accuracy": 0.9365707755088806, "num_tokens": 44201881.0, "step": 5445 }, { "entropy": 0.296010160446167, "epoch": 3.0756207674943568, "grad_norm": 2.206284761428833, "learning_rate": 4.7854854684677345e-06, "loss": 0.1944, "mean_token_accuracy": 0.9396218180656433, "num_tokens": 44242707.0, "step": 5450 }, { "entropy": 0.3006631314754486, "epoch": 3.0784424379232505, "grad_norm": 1.720755934715271, "learning_rate": 4.785099312715101e-06, "loss": 0.1956, "mean_token_accuracy": 0.9376758575439453, "num_tokens": 44283160.0, "step": 5455 }, { "entropy": 0.28210367560386657, "epoch": 3.0812641083521446, "grad_norm": 1.6647253036499023, "learning_rate": 4.7847128308357414e-06, "loss": 0.1767, "mean_token_accuracy": 0.943335497379303, "num_tokens": 44323781.0, "step": 5460 }, { "entropy": 0.32272166609764097, "epoch": 3.0840857787810383, "grad_norm": 2.004971742630005, "learning_rate": 4.7843260229055805e-06, "loss": 0.209, "mean_token_accuracy": 0.9329189896583557, "num_tokens": 44364364.0, "step": 5465 }, { "entropy": 0.3206708192825317, "epoch": 3.0869074492099324, "grad_norm": 2.0899529457092285, "learning_rate": 4.7839388890006065e-06, "loss": 0.2202, "mean_token_accuracy": 0.9304483771324158, "num_tokens": 44404981.0, "step": 5470 }, { "entropy": 0.33442134261131284, "epoch": 3.089729119638826, "grad_norm": 2.028860092163086, "learning_rate": 4.783551429196872e-06, "loss": 0.2262, "mean_token_accuracy": 0.9287503361701965, "num_tokens": 44445757.0, "step": 5475 }, { "entropy": 0.31830272674560545, "epoch": 3.0925507900677203, "grad_norm": 1.8910348415374756, "learning_rate": 4.783163643570493e-06, "loss": 0.2219, "mean_token_accuracy": 0.9306220650672913, "num_tokens": 44486469.0, "step": 5480 }, { "entropy": 0.2903516858816147, "epoch": 3.095372460496614, "grad_norm": 1.8281724452972412, "learning_rate": 4.782775532197652e-06, "loss": 0.2034, "mean_token_accuracy": 0.9371914029121399, "num_tokens": 44527281.0, "step": 5485 }, { "entropy": 0.2952991366386414, "epoch": 3.098194130925508, "grad_norm": 1.6057690382003784, "learning_rate": 4.7823870951545924e-06, "loss": 0.2059, "mean_token_accuracy": 0.9358728647232055, "num_tokens": 44567483.0, "step": 5490 }, { "entropy": 0.3091245710849762, "epoch": 3.101015801354402, "grad_norm": 1.645340919494629, "learning_rate": 4.781998332517621e-06, "loss": 0.2082, "mean_token_accuracy": 0.9315654993057251, "num_tokens": 44608171.0, "step": 5495 }, { "entropy": 0.28429390788078307, "epoch": 3.1038374717832955, "grad_norm": 1.748238205909729, "learning_rate": 4.781609244363113e-06, "loss": 0.1902, "mean_token_accuracy": 0.9393861174583436, "num_tokens": 44648818.0, "step": 5500 }, { "epoch": 3.1038374717832955, "eval_entropy": 0.31511667370796204, "eval_loss": 0.1905011534690857, "eval_mean_token_accuracy": 0.9417401552200317, "eval_num_tokens": 44648818.0, "eval_runtime": 0.1638, "eval_samples_per_second": 24.418, "eval_steps_per_second": 6.104, "step": 5500 }, { "entropy": 0.291202050447464, "epoch": 3.1066591422121896, "grad_norm": 1.549842357635498, "learning_rate": 4.781219830767503e-06, "loss": 0.1874, "mean_token_accuracy": 0.9398093223571777, "num_tokens": 44689512.0, "step": 5505 }, { "entropy": 0.32962539196014407, "epoch": 3.1094808126410833, "grad_norm": 1.8189475536346436, "learning_rate": 4.780830091807293e-06, "loss": 0.2264, "mean_token_accuracy": 0.9287977695465088, "num_tokens": 44729392.0, "step": 5510 }, { "entropy": 0.3322260320186615, "epoch": 3.1123024830699775, "grad_norm": 1.7237313985824585, "learning_rate": 4.780440027559045e-06, "loss": 0.2275, "mean_token_accuracy": 0.9315350294113159, "num_tokens": 44770052.0, "step": 5515 }, { "entropy": 0.3075240433216095, "epoch": 3.115124153498871, "grad_norm": 1.6741464138031006, "learning_rate": 4.780049638099389e-06, "loss": 0.2057, "mean_token_accuracy": 0.9344268202781677, "num_tokens": 44810590.0, "step": 5520 }, { "entropy": 0.3267399907112122, "epoch": 3.1179458239277653, "grad_norm": 1.500756859779358, "learning_rate": 4.779658923505016e-06, "loss": 0.2024, "mean_token_accuracy": 0.9370131492614746, "num_tokens": 44851209.0, "step": 5525 }, { "entropy": 0.30828378200531004, "epoch": 3.120767494356659, "grad_norm": 1.5910139083862305, "learning_rate": 4.779267883852683e-06, "loss": 0.1998, "mean_token_accuracy": 0.9370311021804809, "num_tokens": 44891316.0, "step": 5530 }, { "entropy": 0.29669389128685, "epoch": 3.123589164785553, "grad_norm": 1.7703447341918945, "learning_rate": 4.778876519219208e-06, "loss": 0.1821, "mean_token_accuracy": 0.9407304763793946, "num_tokens": 44932141.0, "step": 5535 }, { "entropy": 0.29509271383285524, "epoch": 3.126410835214447, "grad_norm": 1.945721983909607, "learning_rate": 4.778484829681477e-06, "loss": 0.1991, "mean_token_accuracy": 0.9373913645744324, "num_tokens": 44972742.0, "step": 5540 }, { "entropy": 0.2876304090023041, "epoch": 3.129232505643341, "grad_norm": 1.8770701885223389, "learning_rate": 4.778092815316436e-06, "loss": 0.1877, "mean_token_accuracy": 0.9409420013427734, "num_tokens": 45013573.0, "step": 5545 }, { "entropy": 0.33961347937583924, "epoch": 3.1320541760722347, "grad_norm": 1.8638192415237427, "learning_rate": 4.777700476201096e-06, "loss": 0.2317, "mean_token_accuracy": 0.9269558072090149, "num_tokens": 45054408.0, "step": 5550 }, { "entropy": 0.3194903790950775, "epoch": 3.134875846501129, "grad_norm": 1.9824572801589966, "learning_rate": 4.777307812412533e-06, "loss": 0.2087, "mean_token_accuracy": 0.9329106450080872, "num_tokens": 45094995.0, "step": 5555 }, { "entropy": 0.3165232837200165, "epoch": 3.1376975169300225, "grad_norm": 1.8498613834381104, "learning_rate": 4.776914824027885e-06, "loss": 0.1959, "mean_token_accuracy": 0.9351617097854614, "num_tokens": 45135825.0, "step": 5560 }, { "entropy": 0.31093028783798216, "epoch": 3.1405191873589167, "grad_norm": 1.8327690362930298, "learning_rate": 4.776521511124356e-06, "loss": 0.2014, "mean_token_accuracy": 0.9345197558403016, "num_tokens": 45176378.0, "step": 5565 }, { "entropy": 0.2836467266082764, "epoch": 3.1433408577878104, "grad_norm": 1.7564665079116821, "learning_rate": 4.7761278737792115e-06, "loss": 0.1741, "mean_token_accuracy": 0.9437781810760498, "num_tokens": 45216787.0, "step": 5570 }, { "entropy": 0.2903373181819916, "epoch": 3.1461625282167045, "grad_norm": 1.730517029762268, "learning_rate": 4.775733912069781e-06, "loss": 0.1928, "mean_token_accuracy": 0.9383692383766175, "num_tokens": 45257435.0, "step": 5575 }, { "entropy": 0.27841404676437376, "epoch": 3.148984198645598, "grad_norm": 1.6484125852584839, "learning_rate": 4.775339626073458e-06, "loss": 0.1796, "mean_token_accuracy": 0.9405380487442017, "num_tokens": 45298209.0, "step": 5580 }, { "entropy": 0.30049493312835696, "epoch": 3.151805869074492, "grad_norm": 1.5576494932174683, "learning_rate": 4.774945015867702e-06, "loss": 0.1956, "mean_token_accuracy": 0.9364784240722657, "num_tokens": 45338825.0, "step": 5585 }, { "entropy": 0.31532043814659116, "epoch": 3.154627539503386, "grad_norm": 1.8203058242797852, "learning_rate": 4.774550081530034e-06, "loss": 0.1975, "mean_token_accuracy": 0.9349407196044922, "num_tokens": 45379636.0, "step": 5590 }, { "entropy": 0.29182214140892027, "epoch": 3.1574492099322797, "grad_norm": 1.9475021362304688, "learning_rate": 4.774154823138037e-06, "loss": 0.1936, "mean_token_accuracy": 0.9374213933944702, "num_tokens": 45420353.0, "step": 5595 }, { "entropy": 0.32046533823013307, "epoch": 3.160270880361174, "grad_norm": 1.7757936716079712, "learning_rate": 4.773759240769361e-06, "loss": 0.2002, "mean_token_accuracy": 0.9345276355743408, "num_tokens": 45460828.0, "step": 5600 }, { "entropy": 0.29744908809661863, "epoch": 3.1630925507900676, "grad_norm": 1.8185851573944092, "learning_rate": 4.773363334501717e-06, "loss": 0.1847, "mean_token_accuracy": 0.939760959148407, "num_tokens": 45501378.0, "step": 5605 }, { "entropy": 0.2820535182952881, "epoch": 3.1659142212189617, "grad_norm": 1.843806266784668, "learning_rate": 4.772967104412882e-06, "loss": 0.1958, "mean_token_accuracy": 0.9386505484580994, "num_tokens": 45542183.0, "step": 5610 }, { "entropy": 0.2983327269554138, "epoch": 3.1687358916478554, "grad_norm": 1.7788043022155762, "learning_rate": 4.772570550580696e-06, "loss": 0.1908, "mean_token_accuracy": 0.939252245426178, "num_tokens": 45582722.0, "step": 5615 }, { "entropy": 0.3242061614990234, "epoch": 3.1715575620767495, "grad_norm": 1.9367573261260986, "learning_rate": 4.77217367308306e-06, "loss": 0.2153, "mean_token_accuracy": 0.9323940753936768, "num_tokens": 45623151.0, "step": 5620 }, { "entropy": 0.32598050832748415, "epoch": 3.1743792325056432, "grad_norm": 1.8925623893737793, "learning_rate": 4.7717764719979425e-06, "loss": 0.2206, "mean_token_accuracy": 0.9283890128135681, "num_tokens": 45663702.0, "step": 5625 }, { "entropy": 0.296782386302948, "epoch": 3.1772009029345374, "grad_norm": 2.1091501712799072, "learning_rate": 4.771378947403374e-06, "loss": 0.2048, "mean_token_accuracy": 0.9344670414924622, "num_tokens": 45704366.0, "step": 5630 }, { "entropy": 0.3019926369190216, "epoch": 3.180022573363431, "grad_norm": 1.747178077697754, "learning_rate": 4.770981099377445e-06, "loss": 0.2092, "mean_token_accuracy": 0.9341341137886048, "num_tokens": 45744753.0, "step": 5635 }, { "entropy": 0.31195615530014037, "epoch": 3.1828442437923252, "grad_norm": 2.096071243286133, "learning_rate": 4.7705829279983165e-06, "loss": 0.2089, "mean_token_accuracy": 0.9316715359687805, "num_tokens": 45784713.0, "step": 5640 }, { "entropy": 0.3224526882171631, "epoch": 3.185665914221219, "grad_norm": 2.126277446746826, "learning_rate": 4.770184433344207e-06, "loss": 0.2126, "mean_token_accuracy": 0.9330295085906982, "num_tokens": 45825418.0, "step": 5645 }, { "entropy": 0.33095919489860537, "epoch": 3.188487584650113, "grad_norm": 1.8362377882003784, "learning_rate": 4.769785615493403e-06, "loss": 0.2234, "mean_token_accuracy": 0.9276701092720032, "num_tokens": 45866068.0, "step": 5650 }, { "entropy": 0.290876042842865, "epoch": 3.1913092550790068, "grad_norm": 1.6160290241241455, "learning_rate": 4.76938647452425e-06, "loss": 0.1988, "mean_token_accuracy": 0.9370180249214173, "num_tokens": 45906929.0, "step": 5655 }, { "entropy": 0.2966496706008911, "epoch": 3.194130925507901, "grad_norm": 1.9973173141479492, "learning_rate": 4.76898701051516e-06, "loss": 0.2004, "mean_token_accuracy": 0.9346588611602783, "num_tokens": 45947800.0, "step": 5660 }, { "entropy": 0.3274839758872986, "epoch": 3.1969525959367946, "grad_norm": 1.7757952213287354, "learning_rate": 4.768587223544609e-06, "loss": 0.2114, "mean_token_accuracy": 0.9339724779129028, "num_tokens": 45987994.0, "step": 5665 }, { "entropy": 0.31054744124412537, "epoch": 3.1997742663656883, "grad_norm": 1.8302743434906006, "learning_rate": 4.768187113691134e-06, "loss": 0.1925, "mean_token_accuracy": 0.937913966178894, "num_tokens": 46028630.0, "step": 5670 }, { "entropy": 0.296200168132782, "epoch": 3.2025959367945824, "grad_norm": 1.6775931119918823, "learning_rate": 4.767786681033337e-06, "loss": 0.1923, "mean_token_accuracy": 0.9360583305358887, "num_tokens": 46069224.0, "step": 5675 }, { "entropy": 0.31996251940727233, "epoch": 3.205417607223476, "grad_norm": 2.0050246715545654, "learning_rate": 4.767385925649883e-06, "loss": 0.2124, "mean_token_accuracy": 0.9327707052230835, "num_tokens": 46109724.0, "step": 5680 }, { "entropy": 0.2851140141487122, "epoch": 3.2082392776523703, "grad_norm": 2.052438497543335, "learning_rate": 4.7669848476195005e-06, "loss": 0.1954, "mean_token_accuracy": 0.9375227808952331, "num_tokens": 46150400.0, "step": 5685 }, { "entropy": 0.2947817206382751, "epoch": 3.211060948081264, "grad_norm": 1.9339162111282349, "learning_rate": 4.766583447020981e-06, "loss": 0.2079, "mean_token_accuracy": 0.9334628462791443, "num_tokens": 46191151.0, "step": 5690 }, { "entropy": 0.3035987734794617, "epoch": 3.213882618510158, "grad_norm": 1.9785780906677246, "learning_rate": 4.76618172393318e-06, "loss": 0.2027, "mean_token_accuracy": 0.9366864800453186, "num_tokens": 46231643.0, "step": 5695 }, { "entropy": 0.31024947464466096, "epoch": 3.216704288939052, "grad_norm": 1.4870526790618896, "learning_rate": 4.765779678435016e-06, "loss": 0.2126, "mean_token_accuracy": 0.9308468461036682, "num_tokens": 46272392.0, "step": 5700 }, { "entropy": 0.31926954388618467, "epoch": 3.219525959367946, "grad_norm": 1.933563470840454, "learning_rate": 4.76537731060547e-06, "loss": 0.219, "mean_token_accuracy": 0.9304630041122437, "num_tokens": 46313068.0, "step": 5705 }, { "entropy": 0.3053418666124344, "epoch": 3.2223476297968396, "grad_norm": 1.7980693578720093, "learning_rate": 4.764974620523589e-06, "loss": 0.1975, "mean_token_accuracy": 0.9364609718322754, "num_tokens": 46353615.0, "step": 5710 }, { "entropy": 0.29312550127506254, "epoch": 3.225169300225734, "grad_norm": 1.8937259912490845, "learning_rate": 4.764571608268481e-06, "loss": 0.1914, "mean_token_accuracy": 0.9379542708396912, "num_tokens": 46394082.0, "step": 5715 }, { "entropy": 0.3227978229522705, "epoch": 3.2279909706546275, "grad_norm": 1.9451820850372314, "learning_rate": 4.764168273919317e-06, "loss": 0.2264, "mean_token_accuracy": 0.9285582900047302, "num_tokens": 46434664.0, "step": 5720 }, { "entropy": 0.2956021040678024, "epoch": 3.2308126410835216, "grad_norm": 2.080122947692871, "learning_rate": 4.7637646175553325e-06, "loss": 0.2137, "mean_token_accuracy": 0.9323472619056702, "num_tokens": 46475192.0, "step": 5725 }, { "entropy": 0.32532267570495604, "epoch": 3.2336343115124153, "grad_norm": 1.788669466972351, "learning_rate": 4.763360639255826e-06, "loss": 0.2257, "mean_token_accuracy": 0.9298007488250732, "num_tokens": 46515676.0, "step": 5730 }, { "entropy": 0.3365131139755249, "epoch": 3.2364559819413095, "grad_norm": 1.9396251440048218, "learning_rate": 4.762956339100158e-06, "loss": 0.2328, "mean_token_accuracy": 0.9281740784645081, "num_tokens": 46556056.0, "step": 5735 }, { "entropy": 0.2854636192321777, "epoch": 3.239277652370203, "grad_norm": 1.900076150894165, "learning_rate": 4.762551717167756e-06, "loss": 0.1883, "mean_token_accuracy": 0.9398433446884156, "num_tokens": 46596732.0, "step": 5740 }, { "entropy": 0.3118133544921875, "epoch": 3.2420993227990973, "grad_norm": 2.0240774154663086, "learning_rate": 4.762146773538105e-06, "loss": 0.211, "mean_token_accuracy": 0.9326440811157226, "num_tokens": 46637432.0, "step": 5745 }, { "entropy": 0.293948894739151, "epoch": 3.244920993227991, "grad_norm": 1.7449288368225098, "learning_rate": 4.7617415082907575e-06, "loss": 0.198, "mean_token_accuracy": 0.9358683586120605, "num_tokens": 46678134.0, "step": 5750 }, { "entropy": 0.32562840580940244, "epoch": 3.2477426636568847, "grad_norm": 1.7750238180160522, "learning_rate": 4.761335921505329e-06, "loss": 0.2237, "mean_token_accuracy": 0.929267966747284, "num_tokens": 46718630.0, "step": 5755 }, { "entropy": 0.2856466591358185, "epoch": 3.250564334085779, "grad_norm": 1.5713127851486206, "learning_rate": 4.760930013261495e-06, "loss": 0.1906, "mean_token_accuracy": 0.9386573076248169, "num_tokens": 46759404.0, "step": 5760 }, { "entropy": 0.31055004596710206, "epoch": 3.2533860045146725, "grad_norm": 1.7997633218765259, "learning_rate": 4.760523783638997e-06, "loss": 0.1985, "mean_token_accuracy": 0.9364279508590698, "num_tokens": 46799993.0, "step": 5765 }, { "entropy": 0.317557692527771, "epoch": 3.2562076749435667, "grad_norm": 1.7583611011505127, "learning_rate": 4.76011723271764e-06, "loss": 0.2218, "mean_token_accuracy": 0.9295771718025208, "num_tokens": 46840518.0, "step": 5770 }, { "entropy": 0.3186119019985199, "epoch": 3.2590293453724604, "grad_norm": 2.132106065750122, "learning_rate": 4.75971036057729e-06, "loss": 0.2093, "mean_token_accuracy": 0.9330814242362976, "num_tokens": 46881236.0, "step": 5775 }, { "entropy": 0.2968993723392487, "epoch": 3.2618510158013545, "grad_norm": 1.693253993988037, "learning_rate": 4.759303167297877e-06, "loss": 0.1979, "mean_token_accuracy": 0.9351974129676819, "num_tokens": 46921984.0, "step": 5780 }, { "entropy": 0.2921249568462372, "epoch": 3.264672686230248, "grad_norm": 2.1287200450897217, "learning_rate": 4.758895652959394e-06, "loss": 0.198, "mean_token_accuracy": 0.9361931085586548, "num_tokens": 46962730.0, "step": 5785 }, { "entropy": 0.3355319321155548, "epoch": 3.2674943566591423, "grad_norm": 1.6830389499664307, "learning_rate": 4.758487817641898e-06, "loss": 0.2355, "mean_token_accuracy": 0.9263626217842102, "num_tokens": 47002800.0, "step": 5790 }, { "entropy": 0.32140293121337893, "epoch": 3.270316027088036, "grad_norm": 1.845828652381897, "learning_rate": 4.758079661425508e-06, "loss": 0.2129, "mean_token_accuracy": 0.9305433511734009, "num_tokens": 47043526.0, "step": 5795 }, { "entropy": 0.32089059948921206, "epoch": 3.27313769751693, "grad_norm": 1.3842658996582031, "learning_rate": 4.757671184390406e-06, "loss": 0.2152, "mean_token_accuracy": 0.9322557806968689, "num_tokens": 47084134.0, "step": 5800 }, { "entropy": 0.33169599771499636, "epoch": 3.275959367945824, "grad_norm": 1.901238203048706, "learning_rate": 4.757262386616837e-06, "loss": 0.2173, "mean_token_accuracy": 0.9306012511253356, "num_tokens": 47124773.0, "step": 5805 }, { "entropy": 0.3124689519405365, "epoch": 3.278781038374718, "grad_norm": 2.120180368423462, "learning_rate": 4.75685326818511e-06, "loss": 0.2199, "mean_token_accuracy": 0.9284607768058777, "num_tokens": 47165252.0, "step": 5810 }, { "entropy": 0.28955113887786865, "epoch": 3.2816027088036117, "grad_norm": 2.0301990509033203, "learning_rate": 4.756443829175598e-06, "loss": 0.1989, "mean_token_accuracy": 0.9357622146606446, "num_tokens": 47205788.0, "step": 5815 }, { "entropy": 0.33322848081588746, "epoch": 3.2844243792325054, "grad_norm": 1.8951489925384521, "learning_rate": 4.756034069668732e-06, "loss": 0.2272, "mean_token_accuracy": 0.9254481196403503, "num_tokens": 47246598.0, "step": 5820 }, { "entropy": 0.3263582348823547, "epoch": 3.2872460496613995, "grad_norm": 1.7902400493621826, "learning_rate": 4.7556239897450116e-06, "loss": 0.2064, "mean_token_accuracy": 0.9346579432487487, "num_tokens": 47287035.0, "step": 5825 }, { "entropy": 0.3200534522533417, "epoch": 3.2900677200902937, "grad_norm": 2.2464659214019775, "learning_rate": 4.7552135894849965e-06, "loss": 0.2172, "mean_token_accuracy": 0.9306413531303406, "num_tokens": 47327903.0, "step": 5830 }, { "entropy": 0.29628766179084776, "epoch": 3.2928893905191874, "grad_norm": 1.7990299463272095, "learning_rate": 4.75480286896931e-06, "loss": 0.1918, "mean_token_accuracy": 0.9378930330276489, "num_tokens": 47368326.0, "step": 5835 }, { "entropy": 0.29361318349838256, "epoch": 3.295711060948081, "grad_norm": 2.020881175994873, "learning_rate": 4.754391828278638e-06, "loss": 0.2086, "mean_token_accuracy": 0.9333640217781067, "num_tokens": 47409098.0, "step": 5840 }, { "entropy": 0.3207947850227356, "epoch": 3.2985327313769752, "grad_norm": 2.123802423477173, "learning_rate": 4.753980467493729e-06, "loss": 0.2166, "mean_token_accuracy": 0.929839301109314, "num_tokens": 47449708.0, "step": 5845 }, { "entropy": 0.314586877822876, "epoch": 3.301354401805869, "grad_norm": 1.9354177713394165, "learning_rate": 4.753568786695395e-06, "loss": 0.2091, "mean_token_accuracy": 0.9324597477912903, "num_tokens": 47490310.0, "step": 5850 }, { "entropy": 0.30399842858314513, "epoch": 3.304176072234763, "grad_norm": 1.9815207719802856, "learning_rate": 4.753156785964512e-06, "loss": 0.2005, "mean_token_accuracy": 0.9356989383697509, "num_tokens": 47530834.0, "step": 5855 }, { "entropy": 0.2951928973197937, "epoch": 3.3069977426636568, "grad_norm": 2.1382501125335693, "learning_rate": 4.752744465382016e-06, "loss": 0.1979, "mean_token_accuracy": 0.9363240480422974, "num_tokens": 47571446.0, "step": 5860 }, { "entropy": 0.30987902283668517, "epoch": 3.309819413092551, "grad_norm": 1.7661712169647217, "learning_rate": 4.75233182502891e-06, "loss": 0.1998, "mean_token_accuracy": 0.9360772490501403, "num_tokens": 47612058.0, "step": 5865 }, { "entropy": 0.29606603980064394, "epoch": 3.3126410835214446, "grad_norm": 1.722391963005066, "learning_rate": 4.751918864986254e-06, "loss": 0.1834, "mean_token_accuracy": 0.9418554663658142, "num_tokens": 47652000.0, "step": 5870 }, { "entropy": 0.29225061237812044, "epoch": 3.3154627539503387, "grad_norm": 2.109041929244995, "learning_rate": 4.751505585335176e-06, "loss": 0.2005, "mean_token_accuracy": 0.9353228807449341, "num_tokens": 47692508.0, "step": 5875 }, { "entropy": 0.29792939126491547, "epoch": 3.3182844243792324, "grad_norm": 1.6946020126342773, "learning_rate": 4.751091986156864e-06, "loss": 0.1992, "mean_token_accuracy": 0.9354248166084289, "num_tokens": 47733266.0, "step": 5880 }, { "entropy": 0.29692640602588655, "epoch": 3.3211060948081266, "grad_norm": 2.0159807205200195, "learning_rate": 4.750678067532569e-06, "loss": 0.1937, "mean_token_accuracy": 0.9390650391578674, "num_tokens": 47773918.0, "step": 5885 }, { "entropy": 0.29629198312759397, "epoch": 3.3239277652370203, "grad_norm": 2.0621113777160645, "learning_rate": 4.750263829543608e-06, "loss": 0.1907, "mean_token_accuracy": 0.9389678955078125, "num_tokens": 47814708.0, "step": 5890 }, { "entropy": 0.3075540721416473, "epoch": 3.3267494356659144, "grad_norm": 2.2669737339019775, "learning_rate": 4.749849272271355e-06, "loss": 0.2074, "mean_token_accuracy": 0.9335337281227112, "num_tokens": 47855163.0, "step": 5895 }, { "entropy": 0.3051905155181885, "epoch": 3.329571106094808, "grad_norm": 1.7983601093292236, "learning_rate": 4.749434395797252e-06, "loss": 0.2013, "mean_token_accuracy": 0.9364488601684571, "num_tokens": 47895912.0, "step": 5900 }, { "entropy": 0.30544620752334595, "epoch": 3.332392776523702, "grad_norm": 2.127479076385498, "learning_rate": 4.749019200202801e-06, "loss": 0.2216, "mean_token_accuracy": 0.9293683290481567, "num_tokens": 47936365.0, "step": 5905 }, { "entropy": 0.3163418352603912, "epoch": 3.335214446952596, "grad_norm": 1.8929691314697266, "learning_rate": 4.748603685569566e-06, "loss": 0.2188, "mean_token_accuracy": 0.9315417289733887, "num_tokens": 47977087.0, "step": 5910 }, { "entropy": 0.2961422860622406, "epoch": 3.33803611738149, "grad_norm": 1.8902735710144043, "learning_rate": 4.7481878519791775e-06, "loss": 0.2024, "mean_token_accuracy": 0.9347827434539795, "num_tokens": 48017600.0, "step": 5915 }, { "entropy": 0.3079638063907623, "epoch": 3.340857787810384, "grad_norm": 3.9220144748687744, "learning_rate": 4.747771699513324e-06, "loss": 0.204, "mean_token_accuracy": 0.9359039187431335, "num_tokens": 48058181.0, "step": 5920 }, { "entropy": 0.30697737336158754, "epoch": 3.3436794582392775, "grad_norm": 1.9922391176223755, "learning_rate": 4.747355228253759e-06, "loss": 0.2005, "mean_token_accuracy": 0.9362172484397888, "num_tokens": 48098969.0, "step": 5925 }, { "entropy": 0.29278679490089415, "epoch": 3.3465011286681716, "grad_norm": 1.6647714376449585, "learning_rate": 4.746938438282297e-06, "loss": 0.1868, "mean_token_accuracy": 0.9388912916183472, "num_tokens": 48139626.0, "step": 5930 }, { "entropy": 0.32587441205978396, "epoch": 3.3493227990970653, "grad_norm": 1.9658551216125488, "learning_rate": 4.74652132968082e-06, "loss": 0.2199, "mean_token_accuracy": 0.931458306312561, "num_tokens": 48180389.0, "step": 5935 }, { "entropy": 0.3118497312068939, "epoch": 3.3521444695259595, "grad_norm": 1.961451768875122, "learning_rate": 4.746103902531266e-06, "loss": 0.2137, "mean_token_accuracy": 0.9307841777801513, "num_tokens": 48221035.0, "step": 5940 }, { "entropy": 0.29938756227493285, "epoch": 3.354966139954853, "grad_norm": 1.9882782697677612, "learning_rate": 4.7456861569156396e-06, "loss": 0.2026, "mean_token_accuracy": 0.9369753360748291, "num_tokens": 48261699.0, "step": 5945 }, { "entropy": 0.297002974152565, "epoch": 3.3577878103837473, "grad_norm": 1.5427496433258057, "learning_rate": 4.745268092916008e-06, "loss": 0.1919, "mean_token_accuracy": 0.9388345956802369, "num_tokens": 48302567.0, "step": 5950 }, { "entropy": 0.31234695911407473, "epoch": 3.360609480812641, "grad_norm": 1.856497049331665, "learning_rate": 4.744849710614498e-06, "loss": 0.2081, "mean_token_accuracy": 0.9339388251304627, "num_tokens": 48343071.0, "step": 5955 }, { "entropy": 0.26803739964962003, "epoch": 3.363431151241535, "grad_norm": 1.5363012552261353, "learning_rate": 4.744431010093302e-06, "loss": 0.1604, "mean_token_accuracy": 0.9466527342796326, "num_tokens": 48383501.0, "step": 5960 }, { "entropy": 0.3034593999385834, "epoch": 3.366252821670429, "grad_norm": 1.9353678226470947, "learning_rate": 4.744011991434673e-06, "loss": 0.1963, "mean_token_accuracy": 0.9345849275588989, "num_tokens": 48424004.0, "step": 5965 }, { "entropy": 0.3189900636672974, "epoch": 3.369074492099323, "grad_norm": 1.8158825635910034, "learning_rate": 4.743592654720929e-06, "loss": 0.2096, "mean_token_accuracy": 0.9319984436035156, "num_tokens": 48464769.0, "step": 5970 }, { "entropy": 0.29853619933128356, "epoch": 3.3718961625282167, "grad_norm": 2.1045563220977783, "learning_rate": 4.743173000034446e-06, "loss": 0.1981, "mean_token_accuracy": 0.9359387755393982, "num_tokens": 48505548.0, "step": 5975 }, { "entropy": 0.2892568349838257, "epoch": 3.374717832957111, "grad_norm": 1.914672613143921, "learning_rate": 4.7427530274576685e-06, "loss": 0.184, "mean_token_accuracy": 0.9403793454170227, "num_tokens": 48546282.0, "step": 5980 }, { "entropy": 0.3369791269302368, "epoch": 3.3775395033860045, "grad_norm": 1.855833649635315, "learning_rate": 4.742332737073098e-06, "loss": 0.2296, "mean_token_accuracy": 0.9269206643104553, "num_tokens": 48586933.0, "step": 5985 }, { "entropy": 0.33269967436790465, "epoch": 3.380361173814898, "grad_norm": 1.9749621152877808, "learning_rate": 4.741912128963301e-06, "loss": 0.2292, "mean_token_accuracy": 0.9259452581405639, "num_tokens": 48627643.0, "step": 5990 }, { "entropy": 0.3066904067993164, "epoch": 3.3831828442437923, "grad_norm": 2.0527172088623047, "learning_rate": 4.741491203210906e-06, "loss": 0.2026, "mean_token_accuracy": 0.9332493662834167, "num_tokens": 48668300.0, "step": 5995 }, { "entropy": 0.3129414677619934, "epoch": 3.386004514672686, "grad_norm": 1.7080130577087402, "learning_rate": 4.741069959898603e-06, "loss": 0.2183, "mean_token_accuracy": 0.929611599445343, "num_tokens": 48708814.0, "step": 6000 }, { "epoch": 3.386004514672686, "eval_entropy": 0.3240194022655487, "eval_loss": 0.18876124918460846, "eval_mean_token_accuracy": 0.9436565637588501, "eval_num_tokens": 48708814.0, "eval_runtime": 0.1645, "eval_samples_per_second": 24.322, "eval_steps_per_second": 6.08, "step": 6000 }, { "entropy": 0.31379024386405946, "epoch": 3.38882618510158, "grad_norm": 1.7492561340332031, "learning_rate": 4.740648399109148e-06, "loss": 0.2158, "mean_token_accuracy": 0.9303012132644654, "num_tokens": 48749515.0, "step": 6005 }, { "entropy": 0.27735379338264465, "epoch": 3.391647855530474, "grad_norm": 1.8119415044784546, "learning_rate": 4.740226520925354e-06, "loss": 0.1774, "mean_token_accuracy": 0.9421341061592102, "num_tokens": 48790013.0, "step": 6010 }, { "entropy": 0.27942020893096925, "epoch": 3.394469525959368, "grad_norm": 1.6023916006088257, "learning_rate": 4.7398043254301e-06, "loss": 0.1803, "mean_token_accuracy": 0.94042888879776, "num_tokens": 48830739.0, "step": 6015 }, { "entropy": 0.3403823971748352, "epoch": 3.3972911963882617, "grad_norm": 1.8019448518753052, "learning_rate": 4.739381812706326e-06, "loss": 0.2299, "mean_token_accuracy": 0.9285380721092225, "num_tokens": 48871272.0, "step": 6020 }, { "entropy": 0.30235469341278076, "epoch": 3.400112866817156, "grad_norm": 2.0274271965026855, "learning_rate": 4.738958982837036e-06, "loss": 0.208, "mean_token_accuracy": 0.9346428751945496, "num_tokens": 48911923.0, "step": 6025 }, { "entropy": 0.32721391320228577, "epoch": 3.4029345372460496, "grad_norm": 1.9175236225128174, "learning_rate": 4.738535835905294e-06, "loss": 0.2291, "mean_token_accuracy": 0.9262001872062683, "num_tokens": 48952801.0, "step": 6030 }, { "entropy": 0.3034025192260742, "epoch": 3.4057562076749437, "grad_norm": 1.9815678596496582, "learning_rate": 4.738112371994227e-06, "loss": 0.2018, "mean_token_accuracy": 0.9346770763397216, "num_tokens": 48993273.0, "step": 6035 }, { "entropy": 0.3161701142787933, "epoch": 3.4085778781038374, "grad_norm": 1.633199691772461, "learning_rate": 4.737688591187024e-06, "loss": 0.2118, "mean_token_accuracy": 0.9302195191383362, "num_tokens": 49033969.0, "step": 6040 }, { "entropy": 0.30224609375, "epoch": 3.4113995485327315, "grad_norm": 1.9072091579437256, "learning_rate": 4.737264493566939e-06, "loss": 0.2037, "mean_token_accuracy": 0.933539628982544, "num_tokens": 49074370.0, "step": 6045 }, { "entropy": 0.33060076236724856, "epoch": 3.4142212189616252, "grad_norm": 2.085325241088867, "learning_rate": 4.736840079217284e-06, "loss": 0.2364, "mean_token_accuracy": 0.9252678632736206, "num_tokens": 49114878.0, "step": 6050 }, { "entropy": 0.2963068068027496, "epoch": 3.4170428893905194, "grad_norm": 1.7405322790145874, "learning_rate": 4.736415348221435e-06, "loss": 0.1954, "mean_token_accuracy": 0.9355825662612915, "num_tokens": 49155589.0, "step": 6055 }, { "entropy": 0.3319820463657379, "epoch": 3.419864559819413, "grad_norm": 1.9911015033721924, "learning_rate": 4.735990300662833e-06, "loss": 0.2195, "mean_token_accuracy": 0.9308295607566833, "num_tokens": 49196038.0, "step": 6060 }, { "entropy": 0.29007573127746583, "epoch": 3.422686230248307, "grad_norm": 1.579049825668335, "learning_rate": 4.7355649366249755e-06, "loss": 0.1962, "mean_token_accuracy": 0.9360047936439514, "num_tokens": 49236253.0, "step": 6065 }, { "entropy": 0.28036363422870636, "epoch": 3.425507900677201, "grad_norm": 1.846561312675476, "learning_rate": 4.735139256191428e-06, "loss": 0.1917, "mean_token_accuracy": 0.937602186203003, "num_tokens": 49276934.0, "step": 6070 }, { "entropy": 0.30836820006370547, "epoch": 3.4283295711060946, "grad_norm": 2.027714967727661, "learning_rate": 4.734713259445814e-06, "loss": 0.2116, "mean_token_accuracy": 0.9310431718826294, "num_tokens": 49317527.0, "step": 6075 }, { "entropy": 0.2989709198474884, "epoch": 3.4311512415349887, "grad_norm": 1.928396224975586, "learning_rate": 4.734286946471821e-06, "loss": 0.2028, "mean_token_accuracy": 0.9341145396232605, "num_tokens": 49358187.0, "step": 6080 }, { "entropy": 0.29576932787895205, "epoch": 3.4339729119638824, "grad_norm": 1.8478814363479614, "learning_rate": 4.733860317353198e-06, "loss": 0.201, "mean_token_accuracy": 0.9338787913322448, "num_tokens": 49398870.0, "step": 6085 }, { "entropy": 0.34390089511871336, "epoch": 3.4367945823927766, "grad_norm": 2.0840272903442383, "learning_rate": 4.733433372173756e-06, "loss": 0.2391, "mean_token_accuracy": 0.9246963381767273, "num_tokens": 49439698.0, "step": 6090 }, { "entropy": 0.3055784046649933, "epoch": 3.4396162528216703, "grad_norm": 1.8206205368041992, "learning_rate": 4.73300611101737e-06, "loss": 0.2052, "mean_token_accuracy": 0.933589768409729, "num_tokens": 49480359.0, "step": 6095 }, { "entropy": 0.3226096034049988, "epoch": 3.4424379232505644, "grad_norm": 2.05794620513916, "learning_rate": 4.732578533967974e-06, "loss": 0.2108, "mean_token_accuracy": 0.9320708394050599, "num_tokens": 49521140.0, "step": 6100 }, { "entropy": 0.31547321677207946, "epoch": 3.445259593679458, "grad_norm": 2.0338547229766846, "learning_rate": 4.732150641109566e-06, "loss": 0.2055, "mean_token_accuracy": 0.93296377658844, "num_tokens": 49561801.0, "step": 6105 }, { "entropy": 0.32915824055671694, "epoch": 3.4480812641083523, "grad_norm": 1.9995898008346558, "learning_rate": 4.731722432526206e-06, "loss": 0.2213, "mean_token_accuracy": 0.9298017382621765, "num_tokens": 49602445.0, "step": 6110 }, { "entropy": 0.31122357249259947, "epoch": 3.450902934537246, "grad_norm": 1.9306919574737549, "learning_rate": 4.731293908302014e-06, "loss": 0.193, "mean_token_accuracy": 0.9369074702262878, "num_tokens": 49643049.0, "step": 6115 }, { "entropy": 0.283711576461792, "epoch": 3.45372460496614, "grad_norm": 1.7749558687210083, "learning_rate": 4.730865068521177e-06, "loss": 0.1865, "mean_token_accuracy": 0.9403268456459045, "num_tokens": 49683779.0, "step": 6120 }, { "entropy": 0.3096657872200012, "epoch": 3.456546275395034, "grad_norm": 1.9655077457427979, "learning_rate": 4.730435913267937e-06, "loss": 0.2029, "mean_token_accuracy": 0.9331951022148133, "num_tokens": 49724297.0, "step": 6125 }, { "entropy": 0.3291502416133881, "epoch": 3.459367945823928, "grad_norm": 1.9921224117279053, "learning_rate": 4.7300064426266035e-06, "loss": 0.2186, "mean_token_accuracy": 0.9291182398796082, "num_tokens": 49764911.0, "step": 6130 }, { "entropy": 0.31880820393562315, "epoch": 3.4621896162528216, "grad_norm": 1.8428974151611328, "learning_rate": 4.729576656681545e-06, "loss": 0.2154, "mean_token_accuracy": 0.9307239055633545, "num_tokens": 49805553.0, "step": 6135 }, { "entropy": 0.29867430329322814, "epoch": 3.4650112866817158, "grad_norm": 2.0829527378082275, "learning_rate": 4.729146555517195e-06, "loss": 0.2009, "mean_token_accuracy": 0.9337917327880859, "num_tokens": 49846334.0, "step": 6140 }, { "entropy": 0.3254325449466705, "epoch": 3.4678329571106095, "grad_norm": 1.9901976585388184, "learning_rate": 4.728716139218045e-06, "loss": 0.2165, "mean_token_accuracy": 0.9306549906730652, "num_tokens": 49886713.0, "step": 6145 }, { "entropy": 0.2868054747581482, "epoch": 3.4706546275395036, "grad_norm": 1.7727199792861938, "learning_rate": 4.728285407868651e-06, "loss": 0.1976, "mean_token_accuracy": 0.936365807056427, "num_tokens": 49927340.0, "step": 6150 }, { "entropy": 0.3261713206768036, "epoch": 3.4734762979683973, "grad_norm": 1.8373956680297852, "learning_rate": 4.72785436155363e-06, "loss": 0.2094, "mean_token_accuracy": 0.9324244618415832, "num_tokens": 49967868.0, "step": 6155 }, { "entropy": 0.3357150912284851, "epoch": 3.476297968397291, "grad_norm": 1.9818036556243896, "learning_rate": 4.7274230003576625e-06, "loss": 0.2388, "mean_token_accuracy": 0.9242081761360168, "num_tokens": 50008499.0, "step": 6160 }, { "entropy": 0.3065942347049713, "epoch": 3.479119638826185, "grad_norm": 1.6570138931274414, "learning_rate": 4.726991324365487e-06, "loss": 0.2021, "mean_token_accuracy": 0.9351112723350525, "num_tokens": 50049217.0, "step": 6165 }, { "entropy": 0.2773208677768707, "epoch": 3.481941309255079, "grad_norm": 1.8118728399276733, "learning_rate": 4.726559333661908e-06, "loss": 0.1871, "mean_token_accuracy": 0.9404791593551636, "num_tokens": 50090066.0, "step": 6170 }, { "entropy": 0.3141829252243042, "epoch": 3.484762979683973, "grad_norm": 1.8138505220413208, "learning_rate": 4.726127028331789e-06, "loss": 0.2124, "mean_token_accuracy": 0.9332842707633973, "num_tokens": 50130759.0, "step": 6175 }, { "entropy": 0.29586785435676577, "epoch": 3.4875846501128667, "grad_norm": 2.0417232513427734, "learning_rate": 4.725694408460059e-06, "loss": 0.1986, "mean_token_accuracy": 0.9360351800918579, "num_tokens": 50171307.0, "step": 6180 }, { "entropy": 0.3109571814537048, "epoch": 3.490406320541761, "grad_norm": 2.048214912414551, "learning_rate": 4.725261474131703e-06, "loss": 0.207, "mean_token_accuracy": 0.9328826665878296, "num_tokens": 50211913.0, "step": 6185 }, { "entropy": 0.3329827606678009, "epoch": 3.4932279909706545, "grad_norm": 1.681795597076416, "learning_rate": 4.724828225431772e-06, "loss": 0.2283, "mean_token_accuracy": 0.9269816160202027, "num_tokens": 50252502.0, "step": 6190 }, { "entropy": 0.29528833031654356, "epoch": 3.4960496613995486, "grad_norm": 1.5485318899154663, "learning_rate": 4.72439466244538e-06, "loss": 0.1888, "mean_token_accuracy": 0.9390155076980591, "num_tokens": 50293239.0, "step": 6195 }, { "entropy": 0.30525763630867003, "epoch": 3.4988713318284423, "grad_norm": 1.8242753744125366, "learning_rate": 4.723960785257697e-06, "loss": 0.207, "mean_token_accuracy": 0.9336790919303894, "num_tokens": 50333949.0, "step": 6200 }, { "entropy": 0.329301780462265, "epoch": 3.5016930022573365, "grad_norm": 1.7903170585632324, "learning_rate": 4.72352659395396e-06, "loss": 0.2453, "mean_token_accuracy": 0.9249015927314759, "num_tokens": 50374411.0, "step": 6205 }, { "entropy": 0.30608891248703, "epoch": 3.50451467268623, "grad_norm": 1.8584059476852417, "learning_rate": 4.7230920886194655e-06, "loss": 0.2178, "mean_token_accuracy": 0.9296943068504333, "num_tokens": 50415248.0, "step": 6210 }, { "entropy": 0.3110420048236847, "epoch": 3.5073363431151243, "grad_norm": 1.9278223514556885, "learning_rate": 4.722657269339573e-06, "loss": 0.2087, "mean_token_accuracy": 0.9321151494979858, "num_tokens": 50455903.0, "step": 6215 }, { "entropy": 0.29749436378479005, "epoch": 3.510158013544018, "grad_norm": 1.6023099422454834, "learning_rate": 4.722222136199703e-06, "loss": 0.1862, "mean_token_accuracy": 0.940057122707367, "num_tokens": 50496595.0, "step": 6220 }, { "entropy": 0.3107947289943695, "epoch": 3.5129796839729117, "grad_norm": 1.7268322706222534, "learning_rate": 4.7217866892853355e-06, "loss": 0.1982, "mean_token_accuracy": 0.9365177512168884, "num_tokens": 50537156.0, "step": 6225 }, { "entropy": 0.31932695508003234, "epoch": 3.515801354401806, "grad_norm": 1.9422742128372192, "learning_rate": 4.721350928682017e-06, "loss": 0.2254, "mean_token_accuracy": 0.9268533945083618, "num_tokens": 50577880.0, "step": 6230 }, { "entropy": 0.3243493378162384, "epoch": 3.5186230248307, "grad_norm": 1.8722442388534546, "learning_rate": 4.720914854475349e-06, "loss": 0.2181, "mean_token_accuracy": 0.9315003275871276, "num_tokens": 50618659.0, "step": 6235 }, { "entropy": 0.30051770210266116, "epoch": 3.5214446952595937, "grad_norm": 1.5247653722763062, "learning_rate": 4.720478466751002e-06, "loss": 0.1931, "mean_token_accuracy": 0.9378492355346679, "num_tokens": 50659414.0, "step": 6240 }, { "entropy": 0.3169930100440979, "epoch": 3.5242663656884874, "grad_norm": 2.1740469932556152, "learning_rate": 4.720041765594701e-06, "loss": 0.2171, "mean_token_accuracy": 0.9304054498672485, "num_tokens": 50700091.0, "step": 6245 }, { "entropy": 0.3257488250732422, "epoch": 3.5270880361173815, "grad_norm": 1.9426953792572021, "learning_rate": 4.719604751092239e-06, "loss": 0.221, "mean_token_accuracy": 0.928559148311615, "num_tokens": 50740627.0, "step": 6250 }, { "entropy": 0.31315135955810547, "epoch": 3.5299097065462757, "grad_norm": 1.9665589332580566, "learning_rate": 4.719167423329467e-06, "loss": 0.2139, "mean_token_accuracy": 0.9316304087638855, "num_tokens": 50781438.0, "step": 6255 }, { "entropy": 0.3040352761745453, "epoch": 3.5327313769751694, "grad_norm": 2.057408571243286, "learning_rate": 4.718729782392297e-06, "loss": 0.1961, "mean_token_accuracy": 0.9354169130325317, "num_tokens": 50822359.0, "step": 6260 }, { "entropy": 0.3147823929786682, "epoch": 3.535553047404063, "grad_norm": 2.11651349067688, "learning_rate": 4.718291828366703e-06, "loss": 0.2091, "mean_token_accuracy": 0.9335868835449219, "num_tokens": 50862733.0, "step": 6265 }, { "entropy": 0.3134510278701782, "epoch": 3.538374717832957, "grad_norm": 1.6482633352279663, "learning_rate": 4.717853561338723e-06, "loss": 0.2068, "mean_token_accuracy": 0.9318931818008422, "num_tokens": 50903490.0, "step": 6270 }, { "entropy": 0.3234701603651047, "epoch": 3.541196388261851, "grad_norm": 2.001359701156616, "learning_rate": 4.717414981394454e-06, "loss": 0.233, "mean_token_accuracy": 0.9272037386894226, "num_tokens": 50944012.0, "step": 6275 }, { "entropy": 0.2873412311077118, "epoch": 3.544018058690745, "grad_norm": 2.011967420578003, "learning_rate": 4.716976088620055e-06, "loss": 0.1917, "mean_token_accuracy": 0.9380563020706176, "num_tokens": 50984897.0, "step": 6280 }, { "entropy": 0.29418745040893557, "epoch": 3.5468397291196387, "grad_norm": 1.7138752937316895, "learning_rate": 4.716536883101746e-06, "loss": 0.1905, "mean_token_accuracy": 0.9367462277412415, "num_tokens": 51025455.0, "step": 6285 }, { "entropy": 0.3291135847568512, "epoch": 3.549661399548533, "grad_norm": 2.0384838581085205, "learning_rate": 4.716097364925809e-06, "loss": 0.2329, "mean_token_accuracy": 0.9264655709266663, "num_tokens": 51066205.0, "step": 6290 }, { "entropy": 0.31210439205169677, "epoch": 3.5524830699774266, "grad_norm": 1.871769905090332, "learning_rate": 4.715657534178589e-06, "loss": 0.2176, "mean_token_accuracy": 0.9300808906555176, "num_tokens": 51106895.0, "step": 6295 }, { "entropy": 0.2980155825614929, "epoch": 3.5553047404063207, "grad_norm": 1.6929700374603271, "learning_rate": 4.715217390946489e-06, "loss": 0.2014, "mean_token_accuracy": 0.9350870847702026, "num_tokens": 51147652.0, "step": 6300 }, { "entropy": 0.3304617702960968, "epoch": 3.5581264108352144, "grad_norm": 1.6658223867416382, "learning_rate": 4.714776935315976e-06, "loss": 0.2305, "mean_token_accuracy": 0.9265113472938538, "num_tokens": 51188269.0, "step": 6305 }, { "entropy": 0.30585951209068296, "epoch": 3.560948081264108, "grad_norm": 1.8482630252838135, "learning_rate": 4.7143361673735774e-06, "loss": 0.2076, "mean_token_accuracy": 0.9323771357536316, "num_tokens": 51228854.0, "step": 6310 }, { "entropy": 0.33988407254219055, "epoch": 3.5637697516930023, "grad_norm": 1.9136297702789307, "learning_rate": 4.713895087205882e-06, "loss": 0.2287, "mean_token_accuracy": 0.9266070127487183, "num_tokens": 51269073.0, "step": 6315 }, { "entropy": 0.3092219591140747, "epoch": 3.5665914221218964, "grad_norm": 1.8166900873184204, "learning_rate": 4.71345369489954e-06, "loss": 0.2046, "mean_token_accuracy": 0.9348546147346497, "num_tokens": 51309223.0, "step": 6320 }, { "entropy": 0.3356677830219269, "epoch": 3.56941309255079, "grad_norm": 1.9024507999420166, "learning_rate": 4.7130119905412635e-06, "loss": 0.2282, "mean_token_accuracy": 0.927457618713379, "num_tokens": 51349901.0, "step": 6325 }, { "entropy": 0.31340835690498353, "epoch": 3.572234762979684, "grad_norm": 1.8880701065063477, "learning_rate": 4.712569974217826e-06, "loss": 0.2102, "mean_token_accuracy": 0.9327018618583679, "num_tokens": 51390569.0, "step": 6330 }, { "entropy": 0.31486195921897886, "epoch": 3.575056433408578, "grad_norm": 2.056093215942383, "learning_rate": 4.712127646016059e-06, "loss": 0.211, "mean_token_accuracy": 0.9327765345573426, "num_tokens": 51431350.0, "step": 6335 }, { "entropy": 0.31965509057044983, "epoch": 3.5778781038374716, "grad_norm": 1.6555287837982178, "learning_rate": 4.71168500602286e-06, "loss": 0.2194, "mean_token_accuracy": 0.9297954082489014, "num_tokens": 51471728.0, "step": 6340 }, { "entropy": 0.34823336601257326, "epoch": 3.5806997742663658, "grad_norm": 1.9188921451568604, "learning_rate": 4.7112420543251854e-06, "loss": 0.2211, "mean_token_accuracy": 0.9278290867805481, "num_tokens": 51512451.0, "step": 6345 }, { "entropy": 0.27960309386253357, "epoch": 3.5835214446952595, "grad_norm": 1.7235885858535767, "learning_rate": 4.710798791010054e-06, "loss": 0.1939, "mean_token_accuracy": 0.93757244348526, "num_tokens": 51553127.0, "step": 6350 }, { "entropy": 0.30215221643447876, "epoch": 3.5863431151241536, "grad_norm": 1.8641575574874878, "learning_rate": 4.710355216164543e-06, "loss": 0.2143, "mean_token_accuracy": 0.9315236330032348, "num_tokens": 51593737.0, "step": 6355 }, { "entropy": 0.29333736300468444, "epoch": 3.5891647855530473, "grad_norm": 1.6100600957870483, "learning_rate": 4.7099113298757934e-06, "loss": 0.1993, "mean_token_accuracy": 0.9377033352851868, "num_tokens": 51634446.0, "step": 6360 }, { "entropy": 0.32711849808692933, "epoch": 3.5919864559819414, "grad_norm": 2.63490891456604, "learning_rate": 4.709467132231007e-06, "loss": 0.213, "mean_token_accuracy": 0.9318007826805115, "num_tokens": 51674890.0, "step": 6365 }, { "entropy": 0.3057248055934906, "epoch": 3.594808126410835, "grad_norm": 1.7993404865264893, "learning_rate": 4.709022623317447e-06, "loss": 0.2136, "mean_token_accuracy": 0.9311740756034851, "num_tokens": 51715328.0, "step": 6370 }, { "entropy": 0.32637726664543154, "epoch": 3.5976297968397293, "grad_norm": 1.7621092796325684, "learning_rate": 4.708577803222437e-06, "loss": 0.228, "mean_token_accuracy": 0.926943325996399, "num_tokens": 51755709.0, "step": 6375 }, { "entropy": 0.3093415260314941, "epoch": 3.600451467268623, "grad_norm": 1.7598847150802612, "learning_rate": 4.708132672033361e-06, "loss": 0.1968, "mean_token_accuracy": 0.9351221680641174, "num_tokens": 51796409.0, "step": 6380 }, { "entropy": 0.3067767798900604, "epoch": 3.603273137697517, "grad_norm": 1.894102931022644, "learning_rate": 4.707687229837667e-06, "loss": 0.2096, "mean_token_accuracy": 0.9340128540992737, "num_tokens": 51836874.0, "step": 6385 }, { "entropy": 0.31157588958740234, "epoch": 3.606094808126411, "grad_norm": 1.6844671964645386, "learning_rate": 4.70724147672286e-06, "loss": 0.2012, "mean_token_accuracy": 0.9360531449317933, "num_tokens": 51877598.0, "step": 6390 }, { "entropy": 0.30880475640296934, "epoch": 3.6089164785553045, "grad_norm": 1.8519737720489502, "learning_rate": 4.706795412776509e-06, "loss": 0.2139, "mean_token_accuracy": 0.9301542401313782, "num_tokens": 51918221.0, "step": 6395 }, { "entropy": 0.3159621596336365, "epoch": 3.6117381489841986, "grad_norm": 1.9711363315582275, "learning_rate": 4.706349038086244e-06, "loss": 0.218, "mean_token_accuracy": 0.9309246063232421, "num_tokens": 51959053.0, "step": 6400 }, { "entropy": 0.2915292739868164, "epoch": 3.614559819413093, "grad_norm": 1.8630083799362183, "learning_rate": 4.7059023527397556e-06, "loss": 0.1922, "mean_token_accuracy": 0.9368671178817749, "num_tokens": 51999218.0, "step": 6405 }, { "entropy": 0.3344090163707733, "epoch": 3.6173814898419865, "grad_norm": 1.93043851852417, "learning_rate": 4.705455356824794e-06, "loss": 0.2198, "mean_token_accuracy": 0.9300200700759887, "num_tokens": 52039875.0, "step": 6410 }, { "entropy": 0.3021121442317963, "epoch": 3.62020316027088, "grad_norm": 1.8353512287139893, "learning_rate": 4.705008050429171e-06, "loss": 0.2074, "mean_token_accuracy": 0.9331298470497131, "num_tokens": 52080363.0, "step": 6415 }, { "entropy": 0.30426907539367676, "epoch": 3.6230248306997743, "grad_norm": 1.7905832529067993, "learning_rate": 4.704560433640762e-06, "loss": 0.2117, "mean_token_accuracy": 0.9322558403015136, "num_tokens": 52121089.0, "step": 6420 }, { "entropy": 0.3197052538394928, "epoch": 3.625846501128668, "grad_norm": 1.7855573892593384, "learning_rate": 4.7041125065475e-06, "loss": 0.2154, "mean_token_accuracy": 0.9314280033111573, "num_tokens": 52161686.0, "step": 6425 }, { "entropy": 0.2890112340450287, "epoch": 3.628668171557562, "grad_norm": 2.0892858505249023, "learning_rate": 4.703664269237381e-06, "loss": 0.1937, "mean_token_accuracy": 0.9377867579460144, "num_tokens": 52202482.0, "step": 6430 }, { "entropy": 0.30858972668647766, "epoch": 3.631489841986456, "grad_norm": 1.9566696882247925, "learning_rate": 4.703215721798462e-06, "loss": 0.2021, "mean_token_accuracy": 0.9352177262306214, "num_tokens": 52243201.0, "step": 6435 }, { "entropy": 0.3409568965435028, "epoch": 3.63431151241535, "grad_norm": 1.81044602394104, "learning_rate": 4.702766864318858e-06, "loss": 0.2185, "mean_token_accuracy": 0.9314499139785767, "num_tokens": 52284031.0, "step": 6440 }, { "entropy": 0.3199546754360199, "epoch": 3.6371331828442437, "grad_norm": 1.9870702028274536, "learning_rate": 4.70231769688675e-06, "loss": 0.2192, "mean_token_accuracy": 0.929648220539093, "num_tokens": 52323965.0, "step": 6445 }, { "entropy": 0.3059984266757965, "epoch": 3.639954853273138, "grad_norm": 2.04311203956604, "learning_rate": 4.701868219590374e-06, "loss": 0.2123, "mean_token_accuracy": 0.9337385654449463, "num_tokens": 52363881.0, "step": 6450 }, { "entropy": 0.32826207876205443, "epoch": 3.6427765237020315, "grad_norm": 1.848895788192749, "learning_rate": 4.701418432518032e-06, "loss": 0.2206, "mean_token_accuracy": 0.9300814032554626, "num_tokens": 52404601.0, "step": 6455 }, { "entropy": 0.3011523485183716, "epoch": 3.6455981941309257, "grad_norm": 1.7186975479125977, "learning_rate": 4.700968335758084e-06, "loss": 0.1944, "mean_token_accuracy": 0.9379884481430054, "num_tokens": 52445278.0, "step": 6460 }, { "entropy": 0.3180157899856567, "epoch": 3.6484198645598194, "grad_norm": 2.0736236572265625, "learning_rate": 4.700517929398951e-06, "loss": 0.2125, "mean_token_accuracy": 0.9314456939697265, "num_tokens": 52486045.0, "step": 6465 }, { "entropy": 0.32199347019195557, "epoch": 3.6512415349887135, "grad_norm": 1.9433692693710327, "learning_rate": 4.7000672135291166e-06, "loss": 0.2133, "mean_token_accuracy": 0.9318039417266846, "num_tokens": 52525845.0, "step": 6470 }, { "entropy": 0.32936949729919435, "epoch": 3.654063205417607, "grad_norm": 1.8279980421066284, "learning_rate": 4.699616188237123e-06, "loss": 0.2376, "mean_token_accuracy": 0.9232208609580994, "num_tokens": 52566170.0, "step": 6475 }, { "entropy": 0.28730884194374084, "epoch": 3.656884875846501, "grad_norm": 1.848632574081421, "learning_rate": 4.699164853611574e-06, "loss": 0.1949, "mean_token_accuracy": 0.9368890285491943, "num_tokens": 52606933.0, "step": 6480 }, { "entropy": 0.2818956643342972, "epoch": 3.659706546275395, "grad_norm": 1.7565962076187134, "learning_rate": 4.698713209741136e-06, "loss": 0.1923, "mean_token_accuracy": 0.9376680016517639, "num_tokens": 52647547.0, "step": 6485 }, { "entropy": 0.30916563868522645, "epoch": 3.662528216704289, "grad_norm": 1.94307279586792, "learning_rate": 4.698261256714533e-06, "loss": 0.2076, "mean_token_accuracy": 0.9336765766143799, "num_tokens": 52688266.0, "step": 6490 }, { "entropy": 0.29969978928565977, "epoch": 3.665349887133183, "grad_norm": 1.9244630336761475, "learning_rate": 4.697808994620552e-06, "loss": 0.2085, "mean_token_accuracy": 0.9322393655776977, "num_tokens": 52728923.0, "step": 6495 }, { "entropy": 0.31762275099754333, "epoch": 3.6681715575620766, "grad_norm": 1.9523797035217285, "learning_rate": 4.697356423548038e-06, "loss": 0.2147, "mean_token_accuracy": 0.9313531517982483, "num_tokens": 52769425.0, "step": 6500 }, { "epoch": 3.6681715575620766, "eval_entropy": 0.32103675603866577, "eval_loss": 0.1689807027578354, "eval_mean_token_accuracy": 0.9474894404411316, "eval_num_tokens": 52769425.0, "eval_runtime": 0.1638, "eval_samples_per_second": 24.419, "eval_steps_per_second": 6.105, "step": 6500 }, { "entropy": 0.3029796838760376, "epoch": 3.6709932279909707, "grad_norm": 1.7941051721572876, "learning_rate": 4.696903543585902e-06, "loss": 0.2128, "mean_token_accuracy": 0.9323464274406433, "num_tokens": 52810073.0, "step": 6505 }, { "entropy": 0.2974651575088501, "epoch": 3.6738148984198644, "grad_norm": 2.0666654109954834, "learning_rate": 4.696450354823109e-06, "loss": 0.2107, "mean_token_accuracy": 0.9324890732765198, "num_tokens": 52850886.0, "step": 6510 }, { "entropy": 0.3096803486347198, "epoch": 3.6766365688487586, "grad_norm": 1.9316201210021973, "learning_rate": 4.69599685734869e-06, "loss": 0.2055, "mean_token_accuracy": 0.932162344455719, "num_tokens": 52891548.0, "step": 6515 }, { "entropy": 0.307836776971817, "epoch": 3.6794582392776523, "grad_norm": 1.7384241819381714, "learning_rate": 4.695543051251735e-06, "loss": 0.2106, "mean_token_accuracy": 0.930738651752472, "num_tokens": 52932381.0, "step": 6520 }, { "entropy": 0.31080532670021055, "epoch": 3.6822799097065464, "grad_norm": 1.620614767074585, "learning_rate": 4.695088936621393e-06, "loss": 0.206, "mean_token_accuracy": 0.9316131114959717, "num_tokens": 52972931.0, "step": 6525 }, { "entropy": 0.3173711597919464, "epoch": 3.68510158013544, "grad_norm": 1.8724675178527832, "learning_rate": 4.694634513546875e-06, "loss": 0.2149, "mean_token_accuracy": 0.9308315873146057, "num_tokens": 53013578.0, "step": 6530 }, { "entropy": 0.29013689756393435, "epoch": 3.6879232505643342, "grad_norm": 1.7273049354553223, "learning_rate": 4.6941797821174526e-06, "loss": 0.1904, "mean_token_accuracy": 0.9397516369819641, "num_tokens": 53054348.0, "step": 6535 }, { "entropy": 0.30396153330802916, "epoch": 3.690744920993228, "grad_norm": 1.865898847579956, "learning_rate": 4.693724742422458e-06, "loss": 0.2176, "mean_token_accuracy": 0.9278941035270691, "num_tokens": 53094975.0, "step": 6540 }, { "entropy": 0.26694598495960237, "epoch": 3.6935665914221216, "grad_norm": 1.6677098274230957, "learning_rate": 4.693269394551286e-06, "loss": 0.1822, "mean_token_accuracy": 0.9395427823066711, "num_tokens": 53135477.0, "step": 6545 }, { "entropy": 0.3146726369857788, "epoch": 3.6963882618510158, "grad_norm": 1.8670984506607056, "learning_rate": 4.6928137385933845e-06, "loss": 0.2115, "mean_token_accuracy": 0.9311745762825012, "num_tokens": 53176141.0, "step": 6550 }, { "entropy": 0.32695006132125853, "epoch": 3.69920993227991, "grad_norm": 2.1717782020568848, "learning_rate": 4.692357774638272e-06, "loss": 0.2342, "mean_token_accuracy": 0.9270057797431945, "num_tokens": 53216706.0, "step": 6555 }, { "entropy": 0.3012891411781311, "epoch": 3.7020316027088036, "grad_norm": 1.886801838874817, "learning_rate": 4.69190150277552e-06, "loss": 0.2062, "mean_token_accuracy": 0.9332509160041809, "num_tokens": 53257505.0, "step": 6560 }, { "entropy": 0.316040962934494, "epoch": 3.7048532731376973, "grad_norm": 1.6204224824905396, "learning_rate": 4.6914449230947645e-06, "loss": 0.2109, "mean_token_accuracy": 0.9321287989616394, "num_tokens": 53298201.0, "step": 6565 }, { "entropy": 0.28626594245433806, "epoch": 3.7076749435665914, "grad_norm": 1.7525380849838257, "learning_rate": 4.690988035685701e-06, "loss": 0.1939, "mean_token_accuracy": 0.9376110672950745, "num_tokens": 53338694.0, "step": 6570 }, { "entropy": 0.29510667324066164, "epoch": 3.7104966139954856, "grad_norm": 1.6574718952178955, "learning_rate": 4.690530840638083e-06, "loss": 0.1897, "mean_token_accuracy": 0.940097987651825, "num_tokens": 53379266.0, "step": 6575 }, { "entropy": 0.3167356073856354, "epoch": 3.7133182844243793, "grad_norm": 2.295748472213745, "learning_rate": 4.690073338041728e-06, "loss": 0.2168, "mean_token_accuracy": 0.9299868464469909, "num_tokens": 53419886.0, "step": 6580 }, { "entropy": 0.27687845230102537, "epoch": 3.716139954853273, "grad_norm": 1.9063183069229126, "learning_rate": 4.689615527986514e-06, "loss": 0.1954, "mean_token_accuracy": 0.9360902070999145, "num_tokens": 53460645.0, "step": 6585 }, { "entropy": 0.27884105145931243, "epoch": 3.718961625282167, "grad_norm": 1.709240436553955, "learning_rate": 4.689157410562374e-06, "loss": 0.1907, "mean_token_accuracy": 0.939925491809845, "num_tokens": 53501408.0, "step": 6590 }, { "entropy": 0.3242746412754059, "epoch": 3.721783295711061, "grad_norm": 1.7791012525558472, "learning_rate": 4.688698985859309e-06, "loss": 0.2124, "mean_token_accuracy": 0.9305900692939758, "num_tokens": 53542237.0, "step": 6595 }, { "entropy": 0.3170479595661163, "epoch": 3.724604966139955, "grad_norm": 1.7075973749160767, "learning_rate": 4.688240253967374e-06, "loss": 0.2162, "mean_token_accuracy": 0.9304636359214783, "num_tokens": 53582388.0, "step": 6600 }, { "entropy": 0.3019756257534027, "epoch": 3.7274266365688487, "grad_norm": 1.7577334642410278, "learning_rate": 4.6877812149766875e-06, "loss": 0.1985, "mean_token_accuracy": 0.9347709894180298, "num_tokens": 53623102.0, "step": 6605 }, { "entropy": 0.33137462139129636, "epoch": 3.730248306997743, "grad_norm": 1.6792027950286865, "learning_rate": 4.687321868977429e-06, "loss": 0.241, "mean_token_accuracy": 0.9231707096099854, "num_tokens": 53663401.0, "step": 6610 }, { "entropy": 0.33162534832954405, "epoch": 3.7330699774266365, "grad_norm": 1.8961528539657593, "learning_rate": 4.686862216059836e-06, "loss": 0.2269, "mean_token_accuracy": 0.9269180774688721, "num_tokens": 53703912.0, "step": 6615 }, { "entropy": 0.32055285573005676, "epoch": 3.7358916478555306, "grad_norm": 1.9672921895980835, "learning_rate": 4.686402256314208e-06, "loss": 0.219, "mean_token_accuracy": 0.9281070351600647, "num_tokens": 53744733.0, "step": 6620 }, { "entropy": 0.32325056195259094, "epoch": 3.7387133182844243, "grad_norm": 2.0314273834228516, "learning_rate": 4.685941989830903e-06, "loss": 0.2246, "mean_token_accuracy": 0.9259540200233459, "num_tokens": 53784519.0, "step": 6625 }, { "entropy": 0.3410928726196289, "epoch": 3.741534988713318, "grad_norm": 1.9816386699676514, "learning_rate": 4.685481416700342e-06, "loss": 0.2428, "mean_token_accuracy": 0.9229821920394897, "num_tokens": 53825215.0, "step": 6630 }, { "entropy": 0.3254290819168091, "epoch": 3.744356659142212, "grad_norm": 2.0321855545043945, "learning_rate": 4.685020537013004e-06, "loss": 0.2097, "mean_token_accuracy": 0.932454526424408, "num_tokens": 53865574.0, "step": 6635 }, { "entropy": 0.29887913465499877, "epoch": 3.7471783295711063, "grad_norm": 1.6626564264297485, "learning_rate": 4.684559350859428e-06, "loss": 0.206, "mean_token_accuracy": 0.9335589647293091, "num_tokens": 53906263.0, "step": 6640 }, { "entropy": 0.32658823728561404, "epoch": 3.75, "grad_norm": 1.9820867776870728, "learning_rate": 4.684097858330215e-06, "loss": 0.2207, "mean_token_accuracy": 0.9296117424964905, "num_tokens": 53946706.0, "step": 6645 }, { "entropy": 0.29330855011940005, "epoch": 3.7528216704288937, "grad_norm": 1.978268027305603, "learning_rate": 4.683636059516024e-06, "loss": 0.1986, "mean_token_accuracy": 0.9361773490905761, "num_tokens": 53987483.0, "step": 6650 }, { "entropy": 0.3071614384651184, "epoch": 3.755643340857788, "grad_norm": 1.5843520164489746, "learning_rate": 4.683173954507578e-06, "loss": 0.2018, "mean_token_accuracy": 0.935286819934845, "num_tokens": 54028166.0, "step": 6655 }, { "entropy": 0.2947255611419678, "epoch": 3.758465011286682, "grad_norm": 1.8720123767852783, "learning_rate": 4.682711543395656e-06, "loss": 0.1897, "mean_token_accuracy": 0.9393622994422912, "num_tokens": 54068959.0, "step": 6660 }, { "entropy": 0.3333085238933563, "epoch": 3.7612866817155757, "grad_norm": 2.160306453704834, "learning_rate": 4.6822488262710985e-06, "loss": 0.2458, "mean_token_accuracy": 0.9197412252426147, "num_tokens": 54109806.0, "step": 6665 }, { "entropy": 0.3107566237449646, "epoch": 3.7641083521444694, "grad_norm": 1.6867446899414062, "learning_rate": 4.681785803224807e-06, "loss": 0.2145, "mean_token_accuracy": 0.9298100113868714, "num_tokens": 54150377.0, "step": 6670 }, { "entropy": 0.34289470911026, "epoch": 3.7669300225733635, "grad_norm": 2.239997625350952, "learning_rate": 4.681322474347741e-06, "loss": 0.243, "mean_token_accuracy": 0.9235318303108215, "num_tokens": 54191007.0, "step": 6675 }, { "entropy": 0.300843334197998, "epoch": 3.769751693002257, "grad_norm": 1.7303624153137207, "learning_rate": 4.680858839730923e-06, "loss": 0.2098, "mean_token_accuracy": 0.9318808197975159, "num_tokens": 54231680.0, "step": 6680 }, { "entropy": 0.3099523186683655, "epoch": 3.7725733634311513, "grad_norm": 1.8123502731323242, "learning_rate": 4.680394899465435e-06, "loss": 0.2032, "mean_token_accuracy": 0.9328450918197632, "num_tokens": 54272199.0, "step": 6685 }, { "entropy": 0.3188793003559113, "epoch": 3.775395033860045, "grad_norm": 1.985037922859192, "learning_rate": 4.679930653642415e-06, "loss": 0.2159, "mean_token_accuracy": 0.9308086633682251, "num_tokens": 54312840.0, "step": 6690 }, { "entropy": 0.29359256029129027, "epoch": 3.778216704288939, "grad_norm": 1.7202750444412231, "learning_rate": 4.679466102353068e-06, "loss": 0.2156, "mean_token_accuracy": 0.9294952034950257, "num_tokens": 54353596.0, "step": 6695 }, { "entropy": 0.32029845714569094, "epoch": 3.781038374717833, "grad_norm": 2.217895746231079, "learning_rate": 4.679001245688651e-06, "loss": 0.2197, "mean_token_accuracy": 0.9296623706817627, "num_tokens": 54393283.0, "step": 6700 }, { "entropy": 0.3079906702041626, "epoch": 3.783860045146727, "grad_norm": 1.7621604204177856, "learning_rate": 4.678536083740488e-06, "loss": 0.2123, "mean_token_accuracy": 0.934044873714447, "num_tokens": 54433802.0, "step": 6705 }, { "entropy": 0.3148071825504303, "epoch": 3.7866817155756207, "grad_norm": 1.961150884628296, "learning_rate": 4.67807061659996e-06, "loss": 0.2128, "mean_token_accuracy": 0.9301833033561706, "num_tokens": 54474550.0, "step": 6710 }, { "entropy": 0.31343921422958376, "epoch": 3.7895033860045144, "grad_norm": 2.0900840759277344, "learning_rate": 4.677604844358507e-06, "loss": 0.2165, "mean_token_accuracy": 0.9303097009658814, "num_tokens": 54515386.0, "step": 6715 }, { "entropy": 0.32203570008277893, "epoch": 3.7923250564334086, "grad_norm": 1.9342727661132812, "learning_rate": 4.677138767107631e-06, "loss": 0.2161, "mean_token_accuracy": 0.9303983330726624, "num_tokens": 54556126.0, "step": 6720 }, { "entropy": 0.28948697447776794, "epoch": 3.7951467268623027, "grad_norm": 1.6458464860916138, "learning_rate": 4.676672384938891e-06, "loss": 0.1951, "mean_token_accuracy": 0.9385475873947143, "num_tokens": 54597036.0, "step": 6725 }, { "entropy": 0.3142231285572052, "epoch": 3.7979683972911964, "grad_norm": 1.8955562114715576, "learning_rate": 4.676205697943911e-06, "loss": 0.2115, "mean_token_accuracy": 0.9330004334449769, "num_tokens": 54637646.0, "step": 6730 }, { "entropy": 0.3165326237678528, "epoch": 3.80079006772009, "grad_norm": 1.7128779888153076, "learning_rate": 4.675738706214369e-06, "loss": 0.2125, "mean_token_accuracy": 0.9308281064033508, "num_tokens": 54678230.0, "step": 6735 }, { "entropy": 0.3183960378170013, "epoch": 3.8036117381489842, "grad_norm": 2.020563840866089, "learning_rate": 4.6752714098420065e-06, "loss": 0.225, "mean_token_accuracy": 0.9275296330451965, "num_tokens": 54718922.0, "step": 6740 }, { "entropy": 0.3038501560688019, "epoch": 3.8064334085778784, "grad_norm": 2.007378339767456, "learning_rate": 4.674803808918624e-06, "loss": 0.1919, "mean_token_accuracy": 0.9365889430046082, "num_tokens": 54759550.0, "step": 6745 }, { "entropy": 0.3290569126605988, "epoch": 3.809255079006772, "grad_norm": 1.8866428136825562, "learning_rate": 4.674335903536083e-06, "loss": 0.2219, "mean_token_accuracy": 0.9293844699859619, "num_tokens": 54800015.0, "step": 6750 }, { "entropy": 0.3428149461746216, "epoch": 3.8120767494356658, "grad_norm": 2.020414113998413, "learning_rate": 4.673867693786301e-06, "loss": 0.2348, "mean_token_accuracy": 0.924261474609375, "num_tokens": 54840616.0, "step": 6755 }, { "entropy": 0.32770676612854005, "epoch": 3.81489841986456, "grad_norm": 1.6898592710494995, "learning_rate": 4.6733991797612595e-06, "loss": 0.2039, "mean_token_accuracy": 0.9363934993743896, "num_tokens": 54881081.0, "step": 6760 }, { "entropy": 0.3508101344108582, "epoch": 3.8177200902934536, "grad_norm": 1.967923879623413, "learning_rate": 4.672930361552998e-06, "loss": 0.2206, "mean_token_accuracy": 0.9289634466171265, "num_tokens": 54921258.0, "step": 6765 }, { "entropy": 0.29331698417663576, "epoch": 3.8205417607223477, "grad_norm": 1.8218579292297363, "learning_rate": 4.672461239253616e-06, "loss": 0.1829, "mean_token_accuracy": 0.9402494430541992, "num_tokens": 54961736.0, "step": 6770 }, { "entropy": 0.31439730525016785, "epoch": 3.8233634311512414, "grad_norm": 2.268512487411499, "learning_rate": 4.671991812955273e-06, "loss": 0.2126, "mean_token_accuracy": 0.9317742347717285, "num_tokens": 55002382.0, "step": 6775 }, { "entropy": 0.3293932378292084, "epoch": 3.8261851015801356, "grad_norm": 2.075479507446289, "learning_rate": 4.671522082750186e-06, "loss": 0.2336, "mean_token_accuracy": 0.9250609993934631, "num_tokens": 55043115.0, "step": 6780 }, { "entropy": 0.32756795883178713, "epoch": 3.8290067720090293, "grad_norm": 1.9896371364593506, "learning_rate": 4.671052048730635e-06, "loss": 0.2114, "mean_token_accuracy": 0.9321682929992676, "num_tokens": 55082894.0, "step": 6785 }, { "entropy": 0.3386588513851166, "epoch": 3.8318284424379234, "grad_norm": 1.902786135673523, "learning_rate": 4.670581710988958e-06, "loss": 0.2271, "mean_token_accuracy": 0.9268991231918335, "num_tokens": 55123408.0, "step": 6790 }, { "entropy": 0.3413122892379761, "epoch": 3.834650112866817, "grad_norm": 2.145611047744751, "learning_rate": 4.6701110696175546e-06, "loss": 0.2295, "mean_token_accuracy": 0.9283112645149231, "num_tokens": 55163941.0, "step": 6795 }, { "entropy": 0.293692809343338, "epoch": 3.837471783295711, "grad_norm": 2.1431288719177246, "learning_rate": 4.669640124708879e-06, "loss": 0.2009, "mean_token_accuracy": 0.934659731388092, "num_tokens": 55204732.0, "step": 6800 }, { "entropy": 0.322798889875412, "epoch": 3.840293453724605, "grad_norm": 1.9868714809417725, "learning_rate": 4.66916887635545e-06, "loss": 0.2327, "mean_token_accuracy": 0.9276728153228759, "num_tokens": 55245323.0, "step": 6805 }, { "entropy": 0.3417269468307495, "epoch": 3.843115124153499, "grad_norm": 1.8813223838806152, "learning_rate": 4.668697324649845e-06, "loss": 0.2308, "mean_token_accuracy": 0.9265347599983216, "num_tokens": 55285830.0, "step": 6810 }, { "entropy": 0.32689463496208193, "epoch": 3.845936794582393, "grad_norm": 1.8665236234664917, "learning_rate": 4.6682254696847e-06, "loss": 0.2186, "mean_token_accuracy": 0.928902006149292, "num_tokens": 55326576.0, "step": 6815 }, { "entropy": 0.30429264307022097, "epoch": 3.8487584650112865, "grad_norm": 1.9757881164550781, "learning_rate": 4.667753311552711e-06, "loss": 0.2066, "mean_token_accuracy": 0.9331792950630188, "num_tokens": 55367147.0, "step": 6820 }, { "entropy": 0.3437939524650574, "epoch": 3.8515801354401806, "grad_norm": 1.7982680797576904, "learning_rate": 4.667280850346634e-06, "loss": 0.236, "mean_token_accuracy": 0.9242946028709411, "num_tokens": 55407720.0, "step": 6825 }, { "entropy": 0.3199367105960846, "epoch": 3.8544018058690743, "grad_norm": 1.9674409627914429, "learning_rate": 4.666808086159283e-06, "loss": 0.2236, "mean_token_accuracy": 0.9290154814720154, "num_tokens": 55448550.0, "step": 6830 }, { "entropy": 0.2929447114467621, "epoch": 3.8572234762979685, "grad_norm": 2.2248289585113525, "learning_rate": 4.666335019083532e-06, "loss": 0.1942, "mean_token_accuracy": 0.9364341259002685, "num_tokens": 55489200.0, "step": 6835 }, { "entropy": 0.29515990018844607, "epoch": 3.860045146726862, "grad_norm": 1.5697543621063232, "learning_rate": 4.665861649212316e-06, "loss": 0.1987, "mean_token_accuracy": 0.936658239364624, "num_tokens": 55530008.0, "step": 6840 }, { "entropy": 0.3109833776950836, "epoch": 3.8628668171557563, "grad_norm": 1.9397079944610596, "learning_rate": 4.6653879766386305e-06, "loss": 0.2049, "mean_token_accuracy": 0.9348199844360352, "num_tokens": 55570437.0, "step": 6845 }, { "entropy": 0.3064566671848297, "epoch": 3.86568848758465, "grad_norm": 1.5913760662078857, "learning_rate": 4.664914001455526e-06, "loss": 0.2074, "mean_token_accuracy": 0.9345534205436706, "num_tokens": 55611132.0, "step": 6850 }, { "entropy": 0.31966778039932253, "epoch": 3.868510158013544, "grad_norm": 1.8581488132476807, "learning_rate": 4.664439723756116e-06, "loss": 0.2171, "mean_token_accuracy": 0.9305494904518128, "num_tokens": 55651714.0, "step": 6855 }, { "entropy": 0.31408268213272095, "epoch": 3.871331828442438, "grad_norm": 1.9941738843917847, "learning_rate": 4.6639651436335705e-06, "loss": 0.2101, "mean_token_accuracy": 0.933535099029541, "num_tokens": 55692277.0, "step": 6860 }, { "entropy": 0.3109330773353577, "epoch": 3.874153498871332, "grad_norm": 2.0422751903533936, "learning_rate": 4.663490261181124e-06, "loss": 0.206, "mean_token_accuracy": 0.9325674891471862, "num_tokens": 55732896.0, "step": 6865 }, { "entropy": 0.29195868968963623, "epoch": 3.8769751693002257, "grad_norm": 1.6397924423217773, "learning_rate": 4.663015076492065e-06, "loss": 0.2098, "mean_token_accuracy": 0.9333557486534119, "num_tokens": 55773560.0, "step": 6870 }, { "entropy": 0.3036874562501907, "epoch": 3.87979683972912, "grad_norm": 2.033968448638916, "learning_rate": 4.662539589659746e-06, "loss": 0.2065, "mean_token_accuracy": 0.9338034749031067, "num_tokens": 55814207.0, "step": 6875 }, { "entropy": 0.33089557886123655, "epoch": 3.8826185101580135, "grad_norm": 1.942462682723999, "learning_rate": 4.6620638007775735e-06, "loss": 0.223, "mean_token_accuracy": 0.9286533951759338, "num_tokens": 55854785.0, "step": 6880 }, { "entropy": 0.3029870331287384, "epoch": 3.885440180586907, "grad_norm": 1.7418941259384155, "learning_rate": 4.661587709939017e-06, "loss": 0.2038, "mean_token_accuracy": 0.9331433176994324, "num_tokens": 55895399.0, "step": 6885 }, { "entropy": 0.3070501685142517, "epoch": 3.8882618510158014, "grad_norm": 1.9692811965942383, "learning_rate": 4.661111317237606e-06, "loss": 0.1969, "mean_token_accuracy": 0.9379673600196838, "num_tokens": 55935772.0, "step": 6890 }, { "entropy": 0.3398116111755371, "epoch": 3.8910835214446955, "grad_norm": 1.9820867776870728, "learning_rate": 4.660634622766926e-06, "loss": 0.2501, "mean_token_accuracy": 0.9237083911895752, "num_tokens": 55976410.0, "step": 6895 }, { "entropy": 0.32836961150169375, "epoch": 3.893905191873589, "grad_norm": 2.400958299636841, "learning_rate": 4.660157626620625e-06, "loss": 0.2248, "mean_token_accuracy": 0.9286513090133667, "num_tokens": 56016725.0, "step": 6900 }, { "entropy": 0.31524630188941954, "epoch": 3.896726862302483, "grad_norm": 1.8288160562515259, "learning_rate": 4.65968032889241e-06, "loss": 0.1952, "mean_token_accuracy": 0.9361767411231995, "num_tokens": 56057415.0, "step": 6905 }, { "entropy": 0.31340600848197936, "epoch": 3.899548532731377, "grad_norm": 1.9468982219696045, "learning_rate": 4.6592027296760435e-06, "loss": 0.2189, "mean_token_accuracy": 0.9306774020195008, "num_tokens": 56098039.0, "step": 6910 }, { "entropy": 0.2994066894054413, "epoch": 3.9023702031602707, "grad_norm": 1.8991196155548096, "learning_rate": 4.658724829065352e-06, "loss": 0.2006, "mean_token_accuracy": 0.9361102461814881, "num_tokens": 56138690.0, "step": 6915 }, { "entropy": 0.3208695352077484, "epoch": 3.905191873589165, "grad_norm": 2.3649332523345947, "learning_rate": 4.658246627154219e-06, "loss": 0.2117, "mean_token_accuracy": 0.9312704563140869, "num_tokens": 56179455.0, "step": 6920 }, { "entropy": 0.2794735461473465, "epoch": 3.9080135440180586, "grad_norm": 2.0509235858917236, "learning_rate": 4.6577681240365856e-06, "loss": 0.1958, "mean_token_accuracy": 0.9353933930397034, "num_tokens": 56219807.0, "step": 6925 }, { "entropy": 0.32446990013122556, "epoch": 3.9108352144469527, "grad_norm": 1.699033498764038, "learning_rate": 4.657289319806456e-06, "loss": 0.2198, "mean_token_accuracy": 0.929268729686737, "num_tokens": 56260537.0, "step": 6930 }, { "entropy": 0.29834595918655393, "epoch": 3.9136568848758464, "grad_norm": 1.6812763214111328, "learning_rate": 4.656810214557889e-06, "loss": 0.1882, "mean_token_accuracy": 0.937093997001648, "num_tokens": 56301037.0, "step": 6935 }, { "entropy": 0.2948793530464172, "epoch": 3.9164785553047405, "grad_norm": 1.752263069152832, "learning_rate": 4.6563308083850075e-06, "loss": 0.1935, "mean_token_accuracy": 0.9361865758895874, "num_tokens": 56341578.0, "step": 6940 }, { "entropy": 0.3016524910926819, "epoch": 3.9193002257336342, "grad_norm": 1.9265809059143066, "learning_rate": 4.655851101381988e-06, "loss": 0.1969, "mean_token_accuracy": 0.9359880805015564, "num_tokens": 56382334.0, "step": 6945 }, { "entropy": 0.31857663989067075, "epoch": 3.9221218961625284, "grad_norm": 2.062640905380249, "learning_rate": 4.655371093643073e-06, "loss": 0.2319, "mean_token_accuracy": 0.9254809617996216, "num_tokens": 56422744.0, "step": 6950 }, { "entropy": 0.3018131792545319, "epoch": 3.924943566591422, "grad_norm": 1.7309370040893555, "learning_rate": 4.6548907852625565e-06, "loss": 0.2042, "mean_token_accuracy": 0.9332278847694397, "num_tokens": 56463597.0, "step": 6955 }, { "entropy": 0.31196231842041017, "epoch": 3.927765237020316, "grad_norm": 1.6862263679504395, "learning_rate": 4.654410176334796e-06, "loss": 0.2161, "mean_token_accuracy": 0.9320029973983764, "num_tokens": 56504227.0, "step": 6960 }, { "entropy": 0.3238642394542694, "epoch": 3.93058690744921, "grad_norm": 1.998358130455017, "learning_rate": 4.653929266954208e-06, "loss": 0.2084, "mean_token_accuracy": 0.9316397666931152, "num_tokens": 56544904.0, "step": 6965 }, { "entropy": 0.34933393597602846, "epoch": 3.9334085778781036, "grad_norm": 2.1447250843048096, "learning_rate": 4.653448057215267e-06, "loss": 0.2264, "mean_token_accuracy": 0.9266282439231872, "num_tokens": 56585427.0, "step": 6970 }, { "entropy": 0.34060198068618774, "epoch": 3.9362302483069977, "grad_norm": 2.133936882019043, "learning_rate": 4.652966547212506e-06, "loss": 0.2445, "mean_token_accuracy": 0.9236889362335206, "num_tokens": 56626066.0, "step": 6975 }, { "entropy": 0.3082031667232513, "epoch": 3.939051918735892, "grad_norm": 1.9675047397613525, "learning_rate": 4.652484737040518e-06, "loss": 0.2007, "mean_token_accuracy": 0.9365509033203125, "num_tokens": 56666827.0, "step": 6980 }, { "entropy": 0.3459632158279419, "epoch": 3.9418735891647856, "grad_norm": 1.927329421043396, "learning_rate": 4.652002626793956e-06, "loss": 0.2278, "mean_token_accuracy": 0.9272386431694031, "num_tokens": 56707612.0, "step": 6985 }, { "entropy": 0.3056627333164215, "epoch": 3.9446952595936793, "grad_norm": 1.6812715530395508, "learning_rate": 4.651520216567528e-06, "loss": 0.1971, "mean_token_accuracy": 0.9381381869316101, "num_tokens": 56748245.0, "step": 6990 }, { "entropy": 0.2965980887413025, "epoch": 3.9475169300225734, "grad_norm": 1.688714861869812, "learning_rate": 4.651037506456006e-06, "loss": 0.2001, "mean_token_accuracy": 0.9357024908065796, "num_tokens": 56788909.0, "step": 6995 }, { "entropy": 0.32579890489578245, "epoch": 3.950338600451467, "grad_norm": 2.0904133319854736, "learning_rate": 4.650554496554217e-06, "loss": 0.2383, "mean_token_accuracy": 0.9262525558471679, "num_tokens": 56829431.0, "step": 7000 }, { "epoch": 3.950338600451467, "eval_entropy": 0.3211408257484436, "eval_loss": 0.14595237374305725, "eval_mean_token_accuracy": 0.9543886780738831, "eval_num_tokens": 56829431.0, "eval_runtime": 0.164, "eval_samples_per_second": 24.384, "eval_steps_per_second": 6.096, "step": 7000 }, { "entropy": 0.32015385031700133, "epoch": 3.9531602708803613, "grad_norm": 1.8100392818450928, "learning_rate": 4.650071186957049e-06, "loss": 0.2236, "mean_token_accuracy": 0.9274610161781311, "num_tokens": 56870084.0, "step": 7005 }, { "entropy": 0.31498663425445556, "epoch": 3.955981941309255, "grad_norm": 1.7717326879501343, "learning_rate": 4.6495875777594485e-06, "loss": 0.2223, "mean_token_accuracy": 0.927357268333435, "num_tokens": 56910874.0, "step": 7010 }, { "entropy": 0.2908495903015137, "epoch": 3.958803611738149, "grad_norm": 2.0041091442108154, "learning_rate": 4.64910366905642e-06, "loss": 0.1944, "mean_token_accuracy": 0.9367148995399475, "num_tokens": 56951468.0, "step": 7015 }, { "entropy": 0.3460938811302185, "epoch": 3.961625282167043, "grad_norm": 2.049760580062866, "learning_rate": 4.648619460943027e-06, "loss": 0.2347, "mean_token_accuracy": 0.9227741837501526, "num_tokens": 56992209.0, "step": 7020 }, { "entropy": 0.31545761227607727, "epoch": 3.964446952595937, "grad_norm": 2.2006888389587402, "learning_rate": 4.6481349535143934e-06, "loss": 0.21, "mean_token_accuracy": 0.9324324250221252, "num_tokens": 57033074.0, "step": 7025 }, { "entropy": 0.2913439154624939, "epoch": 3.9672686230248306, "grad_norm": 1.7497133016586304, "learning_rate": 4.6476501468657e-06, "loss": 0.1987, "mean_token_accuracy": 0.934621024131775, "num_tokens": 57073763.0, "step": 7030 }, { "entropy": 0.3054608583450317, "epoch": 3.9700902934537243, "grad_norm": 1.3627029657363892, "learning_rate": 4.647165041092187e-06, "loss": 0.1965, "mean_token_accuracy": 0.9352095484733581, "num_tokens": 57114140.0, "step": 7035 }, { "entropy": 0.31089224219322203, "epoch": 3.9729119638826185, "grad_norm": 1.926986575126648, "learning_rate": 4.646679636289154e-06, "loss": 0.2129, "mean_token_accuracy": 0.9315391659736634, "num_tokens": 57154688.0, "step": 7040 }, { "entropy": 0.29714728593826295, "epoch": 3.9757336343115126, "grad_norm": 2.036738872528076, "learning_rate": 4.646193932551959e-06, "loss": 0.2094, "mean_token_accuracy": 0.9334526896476746, "num_tokens": 57195301.0, "step": 7045 }, { "entropy": 0.3167958378791809, "epoch": 3.9785553047404063, "grad_norm": 2.016421318054199, "learning_rate": 4.645707929976018e-06, "loss": 0.2166, "mean_token_accuracy": 0.9300056099891663, "num_tokens": 57235979.0, "step": 7050 }, { "entropy": 0.2984152317047119, "epoch": 3.9813769751693, "grad_norm": 2.002079486846924, "learning_rate": 4.645221628656806e-06, "loss": 0.1954, "mean_token_accuracy": 0.9366149425506591, "num_tokens": 57276713.0, "step": 7055 }, { "entropy": 0.32194399237632754, "epoch": 3.984198645598194, "grad_norm": 1.7218655347824097, "learning_rate": 4.644735028689858e-06, "loss": 0.2147, "mean_token_accuracy": 0.9305589437484741, "num_tokens": 57317326.0, "step": 7060 }, { "entropy": 0.317461222410202, "epoch": 3.9870203160270883, "grad_norm": 2.030597686767578, "learning_rate": 4.644248130170766e-06, "loss": 0.225, "mean_token_accuracy": 0.927132534980774, "num_tokens": 57358188.0, "step": 7065 }, { "entropy": 0.30844367146492, "epoch": 3.989841986455982, "grad_norm": 1.770613193511963, "learning_rate": 4.643760933195182e-06, "loss": 0.2062, "mean_token_accuracy": 0.9343749284744263, "num_tokens": 57398939.0, "step": 7070 }, { "entropy": 0.2940232276916504, "epoch": 3.9926636568848757, "grad_norm": 1.9694218635559082, "learning_rate": 4.643273437858814e-06, "loss": 0.1933, "mean_token_accuracy": 0.9365314722061158, "num_tokens": 57439332.0, "step": 7075 }, { "entropy": 0.30736419558525085, "epoch": 3.99548532731377, "grad_norm": 1.9869581460952759, "learning_rate": 4.642785644257432e-06, "loss": 0.2046, "mean_token_accuracy": 0.9342207312583923, "num_tokens": 57479943.0, "step": 7080 }, { "entropy": 0.3101332366466522, "epoch": 3.9983069977426635, "grad_norm": 1.9268759489059448, "learning_rate": 4.6422975524868635e-06, "loss": 0.2151, "mean_token_accuracy": 0.9306729078292847, "num_tokens": 57520661.0, "step": 7085 }, { "entropy": 0.3069190442562103, "epoch": 4.001128668171558, "grad_norm": 1.7287434339523315, "learning_rate": 4.641809162642993e-06, "loss": 0.1752, "mean_token_accuracy": 0.9477459907531738, "num_tokens": 57555350.0, "step": 7090 }, { "entropy": 0.2743695855140686, "epoch": 4.003950338600451, "grad_norm": 1.4140337705612183, "learning_rate": 4.641320474821765e-06, "loss": 0.128, "mean_token_accuracy": 0.9617153882980347, "num_tokens": 57596053.0, "step": 7095 }, { "entropy": 0.2616827428340912, "epoch": 4.006772009029345, "grad_norm": 2.273259162902832, "learning_rate": 4.640831489119184e-06, "loss": 0.1496, "mean_token_accuracy": 0.9549853324890136, "num_tokens": 57636820.0, "step": 7100 }, { "entropy": 0.21969364285469056, "epoch": 4.00959367945824, "grad_norm": 1.9657589197158813, "learning_rate": 4.640342205631309e-06, "loss": 0.1234, "mean_token_accuracy": 0.961889123916626, "num_tokens": 57677102.0, "step": 7105 }, { "entropy": 0.2414553850889206, "epoch": 4.012415349887133, "grad_norm": 2.4210760593414307, "learning_rate": 4.639852624454261e-06, "loss": 0.1491, "mean_token_accuracy": 0.9532399654388428, "num_tokens": 57717694.0, "step": 7110 }, { "entropy": 0.22773428559303283, "epoch": 4.015237020316027, "grad_norm": 2.0775444507598877, "learning_rate": 4.639362745684219e-06, "loss": 0.1188, "mean_token_accuracy": 0.9613750100135803, "num_tokens": 57758420.0, "step": 7115 }, { "entropy": 0.24161579608917236, "epoch": 4.018058690744921, "grad_norm": 1.9448866844177246, "learning_rate": 4.638872569417417e-06, "loss": 0.1306, "mean_token_accuracy": 0.9591588497161865, "num_tokens": 57799103.0, "step": 7120 }, { "entropy": 0.2467570424079895, "epoch": 4.020880361173815, "grad_norm": 1.8447470664978027, "learning_rate": 4.638382095750152e-06, "loss": 0.1285, "mean_token_accuracy": 0.9602084875106811, "num_tokens": 57839297.0, "step": 7125 }, { "entropy": 0.24987946152687074, "epoch": 4.023702031602709, "grad_norm": 2.2855517864227295, "learning_rate": 4.6378913247787786e-06, "loss": 0.1235, "mean_token_accuracy": 0.9607668399810791, "num_tokens": 57879861.0, "step": 7130 }, { "entropy": 0.23636153638362883, "epoch": 4.026523702031603, "grad_norm": 1.8914059400558472, "learning_rate": 4.637400256599707e-06, "loss": 0.133, "mean_token_accuracy": 0.9598736882209777, "num_tokens": 57920707.0, "step": 7135 }, { "entropy": 0.2302469491958618, "epoch": 4.029345372460496, "grad_norm": 2.0456087589263916, "learning_rate": 4.636908891309408e-06, "loss": 0.1263, "mean_token_accuracy": 0.9601135969161987, "num_tokens": 57961297.0, "step": 7140 }, { "entropy": 0.23698365390300752, "epoch": 4.03216704288939, "grad_norm": 1.853820562362671, "learning_rate": 4.636417229004412e-06, "loss": 0.1325, "mean_token_accuracy": 0.9583729267120361, "num_tokens": 58001984.0, "step": 7145 }, { "entropy": 0.22845596373081206, "epoch": 4.034988713318285, "grad_norm": 1.8284083604812622, "learning_rate": 4.635925269781305e-06, "loss": 0.1264, "mean_token_accuracy": 0.9601821541786194, "num_tokens": 58042688.0, "step": 7150 }, { "entropy": 0.21759492456912993, "epoch": 4.037810383747178, "grad_norm": 2.047950267791748, "learning_rate": 4.6354330137367305e-06, "loss": 0.1157, "mean_token_accuracy": 0.963828158378601, "num_tokens": 58083388.0, "step": 7155 }, { "entropy": 0.22680849432945252, "epoch": 4.040632054176072, "grad_norm": 1.9068692922592163, "learning_rate": 4.634940460967396e-06, "loss": 0.1217, "mean_token_accuracy": 0.9618602633476258, "num_tokens": 58124041.0, "step": 7160 }, { "entropy": 0.22470956444740295, "epoch": 4.043453724604966, "grad_norm": 1.6333588361740112, "learning_rate": 4.634447611570061e-06, "loss": 0.1182, "mean_token_accuracy": 0.9632346987724304, "num_tokens": 58164726.0, "step": 7165 }, { "entropy": 0.2346551775932312, "epoch": 4.04627539503386, "grad_norm": 1.8143978118896484, "learning_rate": 4.633954465641546e-06, "loss": 0.1325, "mean_token_accuracy": 0.9576727986335755, "num_tokens": 58205286.0, "step": 7170 }, { "entropy": 0.23285722136497497, "epoch": 4.049097065462754, "grad_norm": 1.8555634021759033, "learning_rate": 4.633461023278731e-06, "loss": 0.1157, "mean_token_accuracy": 0.9634591937065125, "num_tokens": 58246019.0, "step": 7175 }, { "entropy": 0.23194559514522553, "epoch": 4.051918735891648, "grad_norm": 1.8216824531555176, "learning_rate": 4.632967284578551e-06, "loss": 0.1168, "mean_token_accuracy": 0.9622029185295105, "num_tokens": 58286809.0, "step": 7180 }, { "entropy": 0.25160720348358157, "epoch": 4.0547404063205414, "grad_norm": 1.9061291217803955, "learning_rate": 4.632473249638003e-06, "loss": 0.1255, "mean_token_accuracy": 0.9632068753242493, "num_tokens": 58327226.0, "step": 7185 }, { "entropy": 0.22852468192577363, "epoch": 4.057562076749436, "grad_norm": 1.9731613397598267, "learning_rate": 4.631978918554139e-06, "loss": 0.1235, "mean_token_accuracy": 0.9616599082946777, "num_tokens": 58367956.0, "step": 7190 }, { "entropy": 0.2459454208612442, "epoch": 4.06038374717833, "grad_norm": 1.9121125936508179, "learning_rate": 4.631484291424069e-06, "loss": 0.1461, "mean_token_accuracy": 0.9540547609329224, "num_tokens": 58408674.0, "step": 7195 }, { "entropy": 0.24432432651519775, "epoch": 4.063205417607223, "grad_norm": 1.7734456062316895, "learning_rate": 4.630989368344966e-06, "loss": 0.1334, "mean_token_accuracy": 0.9592110633850097, "num_tokens": 58448721.0, "step": 7200 }, { "entropy": 0.24515534937381744, "epoch": 4.066027088036117, "grad_norm": 2.12031888961792, "learning_rate": 4.630494149414054e-06, "loss": 0.1274, "mean_token_accuracy": 0.9601586699485779, "num_tokens": 58489397.0, "step": 7205 }, { "entropy": 0.23616698682308196, "epoch": 4.068848758465012, "grad_norm": 1.635472059249878, "learning_rate": 4.629998634728622e-06, "loss": 0.1233, "mean_token_accuracy": 0.9608691930770874, "num_tokens": 58529908.0, "step": 7210 }, { "entropy": 0.24083788096904754, "epoch": 4.071670428893905, "grad_norm": 1.929811716079712, "learning_rate": 4.629502824386013e-06, "loss": 0.1248, "mean_token_accuracy": 0.9608209013938904, "num_tokens": 58570278.0, "step": 7215 }, { "entropy": 0.24579550325870514, "epoch": 4.074492099322799, "grad_norm": 2.148528814315796, "learning_rate": 4.629006718483627e-06, "loss": 0.1347, "mean_token_accuracy": 0.9587805867195129, "num_tokens": 58611040.0, "step": 7220 }, { "entropy": 0.23470092117786406, "epoch": 4.077313769751693, "grad_norm": 1.7747644186019897, "learning_rate": 4.628510317118927e-06, "loss": 0.1234, "mean_token_accuracy": 0.961380934715271, "num_tokens": 58651615.0, "step": 7225 }, { "entropy": 0.21981629133224487, "epoch": 4.0801354401805865, "grad_norm": 2.0685737133026123, "learning_rate": 4.628013620389429e-06, "loss": 0.1237, "mean_token_accuracy": 0.960912036895752, "num_tokens": 58692473.0, "step": 7230 }, { "entropy": 0.22954968512058258, "epoch": 4.082957110609481, "grad_norm": 2.0184037685394287, "learning_rate": 4.62751662839271e-06, "loss": 0.1328, "mean_token_accuracy": 0.9574317932128906, "num_tokens": 58733055.0, "step": 7235 }, { "entropy": 0.2246454894542694, "epoch": 4.085778781038375, "grad_norm": 1.9148306846618652, "learning_rate": 4.627019341226404e-06, "loss": 0.1156, "mean_token_accuracy": 0.9631801724433899, "num_tokens": 58773853.0, "step": 7240 }, { "entropy": 0.23507523834705352, "epoch": 4.0886004514672685, "grad_norm": 1.794526219367981, "learning_rate": 4.626521758988204e-06, "loss": 0.1167, "mean_token_accuracy": 0.9632613658905029, "num_tokens": 58814514.0, "step": 7245 }, { "entropy": 0.23969656229019165, "epoch": 4.091422121896162, "grad_norm": 1.9400169849395752, "learning_rate": 4.626023881775858e-06, "loss": 0.1284, "mean_token_accuracy": 0.9598143339157105, "num_tokens": 58854908.0, "step": 7250 }, { "entropy": 0.2598514884710312, "epoch": 4.094243792325057, "grad_norm": 2.0811522006988525, "learning_rate": 4.625525709687176e-06, "loss": 0.1422, "mean_token_accuracy": 0.9565314888954163, "num_tokens": 58895394.0, "step": 7255 }, { "entropy": 0.24920229315757753, "epoch": 4.0970654627539504, "grad_norm": 1.6480082273483276, "learning_rate": 4.625027242820023e-06, "loss": 0.1286, "mean_token_accuracy": 0.9599669218063355, "num_tokens": 58935939.0, "step": 7260 }, { "entropy": 0.23545166254043579, "epoch": 4.099887133182844, "grad_norm": 2.00404691696167, "learning_rate": 4.6245284812723234e-06, "loss": 0.1257, "mean_token_accuracy": 0.9604501008987427, "num_tokens": 58976341.0, "step": 7265 }, { "entropy": 0.23354564011096954, "epoch": 4.102708803611738, "grad_norm": 2.3596303462982178, "learning_rate": 4.624029425142059e-06, "loss": 0.1312, "mean_token_accuracy": 0.9599619388580323, "num_tokens": 59016822.0, "step": 7270 }, { "entropy": 0.24546316564083098, "epoch": 4.105530474040632, "grad_norm": 1.6743675470352173, "learning_rate": 4.623530074527269e-06, "loss": 0.1286, "mean_token_accuracy": 0.9613336443901062, "num_tokens": 59057357.0, "step": 7275 }, { "entropy": 0.23465977013111114, "epoch": 4.108352144469526, "grad_norm": 1.9552369117736816, "learning_rate": 4.6230304295260504e-06, "loss": 0.1138, "mean_token_accuracy": 0.963888657093048, "num_tokens": 59098128.0, "step": 7280 }, { "entropy": 0.22476655542850493, "epoch": 4.11117381489842, "grad_norm": 1.9228246212005615, "learning_rate": 4.62253049023656e-06, "loss": 0.132, "mean_token_accuracy": 0.9577814340591431, "num_tokens": 59138682.0, "step": 7285 }, { "entropy": 0.23238305151462554, "epoch": 4.1139954853273135, "grad_norm": 1.8508713245391846, "learning_rate": 4.62203025675701e-06, "loss": 0.1336, "mean_token_accuracy": 0.9592737793922425, "num_tokens": 59179330.0, "step": 7290 }, { "entropy": 0.21901310980319977, "epoch": 4.116817155756208, "grad_norm": 1.9854564666748047, "learning_rate": 4.621529729185671e-06, "loss": 0.1146, "mean_token_accuracy": 0.9635023355484009, "num_tokens": 59220102.0, "step": 7295 }, { "entropy": 0.23093846142292024, "epoch": 4.119638826185102, "grad_norm": 1.7176353931427002, "learning_rate": 4.621028907620873e-06, "loss": 0.129, "mean_token_accuracy": 0.9593886017799378, "num_tokens": 59260732.0, "step": 7300 }, { "entropy": 0.2379404127597809, "epoch": 4.1224604966139955, "grad_norm": 1.8273789882659912, "learning_rate": 4.620527792161001e-06, "loss": 0.1124, "mean_token_accuracy": 0.9650070428848266, "num_tokens": 59301380.0, "step": 7305 }, { "entropy": 0.22208026945590972, "epoch": 4.125282167042889, "grad_norm": 1.7196441888809204, "learning_rate": 4.620026382904499e-06, "loss": 0.1052, "mean_token_accuracy": 0.9668426513671875, "num_tokens": 59342144.0, "step": 7310 }, { "entropy": 0.23061503767967223, "epoch": 4.128103837471783, "grad_norm": 2.7583887577056885, "learning_rate": 4.61952467994987e-06, "loss": 0.1236, "mean_token_accuracy": 0.9621009945869445, "num_tokens": 59382762.0, "step": 7315 }, { "entropy": 0.22966778576374053, "epoch": 4.1309255079006775, "grad_norm": 2.4049155712127686, "learning_rate": 4.619022683395675e-06, "loss": 0.131, "mean_token_accuracy": 0.9585760951042175, "num_tokens": 59423356.0, "step": 7320 }, { "entropy": 0.23155851662158966, "epoch": 4.133747178329571, "grad_norm": 1.8571085929870605, "learning_rate": 4.618520393340528e-06, "loss": 0.1282, "mean_token_accuracy": 0.9598821759223938, "num_tokens": 59463990.0, "step": 7325 }, { "entropy": 0.2637378484010696, "epoch": 4.136568848758465, "grad_norm": 2.0401177406311035, "learning_rate": 4.618017809883107e-06, "loss": 0.1305, "mean_token_accuracy": 0.9606005668640136, "num_tokens": 59504742.0, "step": 7330 }, { "entropy": 0.2353299468755722, "epoch": 4.139390519187359, "grad_norm": 2.042680263519287, "learning_rate": 4.617514933122142e-06, "loss": 0.1304, "mean_token_accuracy": 0.9596582651138306, "num_tokens": 59545424.0, "step": 7335 }, { "entropy": 0.22721785604953765, "epoch": 4.142212189616253, "grad_norm": 2.117658853530884, "learning_rate": 4.6170117631564246e-06, "loss": 0.115, "mean_token_accuracy": 0.9629755258560181, "num_tokens": 59586178.0, "step": 7340 }, { "entropy": 0.2232923239469528, "epoch": 4.145033860045147, "grad_norm": 1.8017418384552002, "learning_rate": 4.616508300084803e-06, "loss": 0.129, "mean_token_accuracy": 0.9588123679161071, "num_tokens": 59626782.0, "step": 7345 }, { "entropy": 0.22652737498283387, "epoch": 4.1478555304740405, "grad_norm": 2.2765512466430664, "learning_rate": 4.616004544006181e-06, "loss": 0.1175, "mean_token_accuracy": 0.9620057940483093, "num_tokens": 59667236.0, "step": 7350 }, { "entropy": 0.23027748465538025, "epoch": 4.150677200902934, "grad_norm": 1.8993479013442993, "learning_rate": 4.615500495019523e-06, "loss": 0.1223, "mean_token_accuracy": 0.9609902024269104, "num_tokens": 59708042.0, "step": 7355 }, { "entropy": 0.22403572499752045, "epoch": 4.153498871331829, "grad_norm": 2.1379618644714355, "learning_rate": 4.614996153223849e-06, "loss": 0.1266, "mean_token_accuracy": 0.9605488657951355, "num_tokens": 59748650.0, "step": 7360 }, { "entropy": 0.2490081638097763, "epoch": 4.1563205417607225, "grad_norm": 2.036970853805542, "learning_rate": 4.614491518718237e-06, "loss": 0.1421, "mean_token_accuracy": 0.9558643579483033, "num_tokens": 59789397.0, "step": 7365 }, { "entropy": 0.21482129395008087, "epoch": 4.159142212189616, "grad_norm": 1.7927038669586182, "learning_rate": 4.613986591601823e-06, "loss": 0.1081, "mean_token_accuracy": 0.9661535143852233, "num_tokens": 59830163.0, "step": 7370 }, { "entropy": 0.23096846640110016, "epoch": 4.16196388261851, "grad_norm": 2.013101816177368, "learning_rate": 4.613481371973799e-06, "loss": 0.1247, "mean_token_accuracy": 0.9610580563545227, "num_tokens": 59870838.0, "step": 7375 }, { "entropy": 0.23506858050823212, "epoch": 4.164785553047404, "grad_norm": 2.304497480392456, "learning_rate": 4.612975859933415e-06, "loss": 0.1292, "mean_token_accuracy": 0.9596829771995544, "num_tokens": 59911676.0, "step": 7380 }, { "entropy": 0.2367018163204193, "epoch": 4.167607223476298, "grad_norm": 2.2349987030029297, "learning_rate": 4.612470055579982e-06, "loss": 0.1316, "mean_token_accuracy": 0.9579625487327575, "num_tokens": 59952218.0, "step": 7385 }, { "entropy": 0.22246408462524414, "epoch": 4.170428893905192, "grad_norm": 1.9588614702224731, "learning_rate": 4.611963959012862e-06, "loss": 0.1166, "mean_token_accuracy": 0.9629753589630127, "num_tokens": 59992290.0, "step": 7390 }, { "entropy": 0.22336077094078063, "epoch": 4.173250564334086, "grad_norm": 1.994891881942749, "learning_rate": 4.611457570331479e-06, "loss": 0.1155, "mean_token_accuracy": 0.9629181385040283, "num_tokens": 60033130.0, "step": 7395 }, { "entropy": 0.24217330813407897, "epoch": 4.176072234762979, "grad_norm": 1.8636099100112915, "learning_rate": 4.610950889635313e-06, "loss": 0.1315, "mean_token_accuracy": 0.9586685538291931, "num_tokens": 60073764.0, "step": 7400 }, { "entropy": 0.2141832858324051, "epoch": 4.178893905191874, "grad_norm": 1.7918341159820557, "learning_rate": 4.6104439170239015e-06, "loss": 0.1143, "mean_token_accuracy": 0.9644059538841248, "num_tokens": 60114234.0, "step": 7405 }, { "entropy": 0.2275536447763443, "epoch": 4.181715575620768, "grad_norm": 1.8082565069198608, "learning_rate": 4.609936652596841e-06, "loss": 0.121, "mean_token_accuracy": 0.961764144897461, "num_tokens": 60154149.0, "step": 7410 }, { "entropy": 0.24505477845668794, "epoch": 4.184537246049661, "grad_norm": 1.9884941577911377, "learning_rate": 4.60942909645378e-06, "loss": 0.1369, "mean_token_accuracy": 0.956404197216034, "num_tokens": 60194901.0, "step": 7415 }, { "entropy": 0.2267145186662674, "epoch": 4.187358916478555, "grad_norm": 2.0457382202148438, "learning_rate": 4.608921248694431e-06, "loss": 0.129, "mean_token_accuracy": 0.9587201476097107, "num_tokens": 60235805.0, "step": 7420 }, { "entropy": 0.24519952833652497, "epoch": 4.1901805869074495, "grad_norm": 2.398599147796631, "learning_rate": 4.6084131094185594e-06, "loss": 0.1345, "mean_token_accuracy": 0.9566538453102111, "num_tokens": 60276550.0, "step": 7425 }, { "entropy": 0.23022598624229432, "epoch": 4.193002257336343, "grad_norm": 1.9493690729141235, "learning_rate": 4.607904678725989e-06, "loss": 0.1237, "mean_token_accuracy": 0.9610730051994324, "num_tokens": 60316464.0, "step": 7430 }, { "entropy": 0.22809267342090606, "epoch": 4.195823927765237, "grad_norm": 2.157726287841797, "learning_rate": 4.607395956716603e-06, "loss": 0.12, "mean_token_accuracy": 0.9622081279754638, "num_tokens": 60356857.0, "step": 7435 }, { "entropy": 0.2448444426059723, "epoch": 4.198645598194131, "grad_norm": 2.016291379928589, "learning_rate": 4.606886943490338e-06, "loss": 0.1329, "mean_token_accuracy": 0.9577858328819275, "num_tokens": 60397631.0, "step": 7440 }, { "entropy": 0.2451613575220108, "epoch": 4.201467268623025, "grad_norm": 2.0515503883361816, "learning_rate": 4.60637763914719e-06, "loss": 0.1414, "mean_token_accuracy": 0.9543289184570313, "num_tokens": 60437991.0, "step": 7445 }, { "entropy": 0.2392250567674637, "epoch": 4.204288939051919, "grad_norm": 1.8992726802825928, "learning_rate": 4.605868043787213e-06, "loss": 0.1336, "mean_token_accuracy": 0.9581851363182068, "num_tokens": 60478732.0, "step": 7450 }, { "entropy": 0.23091658651828767, "epoch": 4.207110609480813, "grad_norm": 1.9759114980697632, "learning_rate": 4.605358157510516e-06, "loss": 0.1108, "mean_token_accuracy": 0.9652905344963074, "num_tokens": 60518543.0, "step": 7455 }, { "entropy": 0.23277661204338074, "epoch": 4.209932279909706, "grad_norm": 1.8714420795440674, "learning_rate": 4.6048479804172666e-06, "loss": 0.1099, "mean_token_accuracy": 0.966291892528534, "num_tokens": 60559260.0, "step": 7460 }, { "entropy": 0.23967285752296447, "epoch": 4.2127539503386, "grad_norm": 2.056525468826294, "learning_rate": 4.604337512607689e-06, "loss": 0.1228, "mean_token_accuracy": 0.9611961007118225, "num_tokens": 60600020.0, "step": 7465 }, { "entropy": 0.24567246735095977, "epoch": 4.215575620767495, "grad_norm": 2.1545474529266357, "learning_rate": 4.603826754182065e-06, "loss": 0.1493, "mean_token_accuracy": 0.9537813544273377, "num_tokens": 60640575.0, "step": 7470 }, { "entropy": 0.2454807221889496, "epoch": 4.218397291196388, "grad_norm": 2.1680426597595215, "learning_rate": 4.603315705240732e-06, "loss": 0.1382, "mean_token_accuracy": 0.956198763847351, "num_tokens": 60681128.0, "step": 7475 }, { "entropy": 0.24590539932250977, "epoch": 4.221218961625282, "grad_norm": 1.8926993608474731, "learning_rate": 4.602804365884088e-06, "loss": 0.1257, "mean_token_accuracy": 0.960571038722992, "num_tokens": 60721973.0, "step": 7480 }, { "entropy": 0.23549313545227052, "epoch": 4.224040632054176, "grad_norm": 2.156057119369507, "learning_rate": 4.602292736212583e-06, "loss": 0.1329, "mean_token_accuracy": 0.9587239861488343, "num_tokens": 60762628.0, "step": 7485 }, { "entropy": 0.2402738958597183, "epoch": 4.22686230248307, "grad_norm": 1.9733823537826538, "learning_rate": 4.601780816326728e-06, "loss": 0.1245, "mean_token_accuracy": 0.9602890253067017, "num_tokens": 60803434.0, "step": 7490 }, { "entropy": 0.243062624335289, "epoch": 4.229683972911964, "grad_norm": 2.058126211166382, "learning_rate": 4.60126860632709e-06, "loss": 0.1329, "mean_token_accuracy": 0.9585610747337341, "num_tokens": 60844279.0, "step": 7495 }, { "entropy": 0.23409413993358613, "epoch": 4.232505643340858, "grad_norm": 1.9692258834838867, "learning_rate": 4.600756106314292e-06, "loss": 0.1167, "mean_token_accuracy": 0.9634551763534546, "num_tokens": 60884801.0, "step": 7500 }, { "epoch": 4.232505643340858, "eval_entropy": 0.2763104736804962, "eval_loss": 0.11991167068481445, "eval_mean_token_accuracy": 0.9616711139678955, "eval_num_tokens": 60884801.0, "eval_runtime": 0.1639, "eval_samples_per_second": 24.403, "eval_steps_per_second": 6.101, "step": 7500 }, { "entropy": 0.24236758947372436, "epoch": 4.235327313769751, "grad_norm": 2.3060202598571777, "learning_rate": 4.6002433163890156e-06, "loss": 0.1294, "mean_token_accuracy": 0.959705400466919, "num_tokens": 60925426.0, "step": 7505 }, { "entropy": 0.21380599737167358, "epoch": 4.238148984198646, "grad_norm": 2.0318894386291504, "learning_rate": 4.599730236651998e-06, "loss": 0.1058, "mean_token_accuracy": 0.965471088886261, "num_tokens": 60966196.0, "step": 7510 }, { "entropy": 0.22909758388996124, "epoch": 4.24097065462754, "grad_norm": 1.807207465171814, "learning_rate": 4.5992168672040335e-06, "loss": 0.1201, "mean_token_accuracy": 0.9617739319801331, "num_tokens": 61006851.0, "step": 7515 }, { "entropy": 0.2145596742630005, "epoch": 4.243792325056433, "grad_norm": 1.704498052597046, "learning_rate": 4.598703208145974e-06, "loss": 0.1079, "mean_token_accuracy": 0.9660002827644348, "num_tokens": 61047687.0, "step": 7520 }, { "entropy": 0.2247183084487915, "epoch": 4.246613995485327, "grad_norm": 2.1221797466278076, "learning_rate": 4.598189259578727e-06, "loss": 0.1279, "mean_token_accuracy": 0.9582555651664734, "num_tokens": 61088309.0, "step": 7525 }, { "entropy": 0.23647244274616241, "epoch": 4.249435665914222, "grad_norm": 2.2665932178497314, "learning_rate": 4.597675021603259e-06, "loss": 0.1294, "mean_token_accuracy": 0.9585322737693787, "num_tokens": 61129103.0, "step": 7530 }, { "entropy": 0.21924723386764527, "epoch": 4.252257336343115, "grad_norm": 2.0743370056152344, "learning_rate": 4.597160494320592e-06, "loss": 0.1204, "mean_token_accuracy": 0.9610903978347778, "num_tokens": 61169817.0, "step": 7535 }, { "entropy": 0.22164719700813293, "epoch": 4.255079006772009, "grad_norm": 1.8417187929153442, "learning_rate": 4.596645677831804e-06, "loss": 0.1236, "mean_token_accuracy": 0.9602578401565551, "num_tokens": 61210598.0, "step": 7540 }, { "entropy": 0.2332447052001953, "epoch": 4.257900677200903, "grad_norm": 1.8406648635864258, "learning_rate": 4.596130572238031e-06, "loss": 0.1174, "mean_token_accuracy": 0.9627135396003723, "num_tokens": 61251185.0, "step": 7545 }, { "entropy": 0.23616610169410707, "epoch": 4.260722347629796, "grad_norm": 1.8603347539901733, "learning_rate": 4.595615177640466e-06, "loss": 0.1263, "mean_token_accuracy": 0.9601802945137023, "num_tokens": 61291752.0, "step": 7550 }, { "entropy": 0.22554007172584534, "epoch": 4.263544018058691, "grad_norm": 2.3185107707977295, "learning_rate": 4.59509949414036e-06, "loss": 0.119, "mean_token_accuracy": 0.9624182105064392, "num_tokens": 61332441.0, "step": 7555 }, { "entropy": 0.2530226230621338, "epoch": 4.266365688487585, "grad_norm": 2.326624631881714, "learning_rate": 4.594583521839015e-06, "loss": 0.1415, "mean_token_accuracy": 0.9568467020988465, "num_tokens": 61373014.0, "step": 7560 }, { "entropy": 0.22727547585964203, "epoch": 4.269187358916478, "grad_norm": 2.162659168243408, "learning_rate": 4.594067260837796e-06, "loss": 0.1309, "mean_token_accuracy": 0.9592653393745423, "num_tokens": 61413745.0, "step": 7565 }, { "entropy": 0.24552667737007142, "epoch": 4.272009029345372, "grad_norm": 2.3903839588165283, "learning_rate": 4.593550711238123e-06, "loss": 0.1324, "mean_token_accuracy": 0.9594314217567443, "num_tokens": 61454234.0, "step": 7570 }, { "entropy": 0.23494355678558348, "epoch": 4.274830699774267, "grad_norm": 1.9757755994796753, "learning_rate": 4.5930338731414726e-06, "loss": 0.1304, "mean_token_accuracy": 0.9587761878967285, "num_tokens": 61494930.0, "step": 7575 }, { "entropy": 0.24661367535591125, "epoch": 4.27765237020316, "grad_norm": 2.0399813652038574, "learning_rate": 4.592516746649377e-06, "loss": 0.1321, "mean_token_accuracy": 0.9585609912872315, "num_tokens": 61535747.0, "step": 7580 }, { "entropy": 0.23092095851898192, "epoch": 4.280474040632054, "grad_norm": 2.1672401428222656, "learning_rate": 4.591999331863425e-06, "loss": 0.1239, "mean_token_accuracy": 0.9604844927787781, "num_tokens": 61576529.0, "step": 7585 }, { "entropy": 0.2526959002017975, "epoch": 4.283295711060948, "grad_norm": 2.322620153427124, "learning_rate": 4.5914816288852645e-06, "loss": 0.1319, "mean_token_accuracy": 0.9580205917358399, "num_tokens": 61617030.0, "step": 7590 }, { "entropy": 0.22551030814647674, "epoch": 4.286117381489842, "grad_norm": 1.7364836931228638, "learning_rate": 4.590963637816596e-06, "loss": 0.127, "mean_token_accuracy": 0.960225510597229, "num_tokens": 61657559.0, "step": 7595 }, { "entropy": 0.22876082360744476, "epoch": 4.288939051918736, "grad_norm": 1.735487461090088, "learning_rate": 4.590445358759181e-06, "loss": 0.1165, "mean_token_accuracy": 0.961608099937439, "num_tokens": 61697918.0, "step": 7600 }, { "entropy": 0.23216151893138887, "epoch": 4.29176072234763, "grad_norm": 2.070525646209717, "learning_rate": 4.589926791814836e-06, "loss": 0.1204, "mean_token_accuracy": 0.9610988855361938, "num_tokens": 61738812.0, "step": 7605 }, { "entropy": 0.23693689107894897, "epoch": 4.294582392776523, "grad_norm": 2.203585147857666, "learning_rate": 4.589407937085431e-06, "loss": 0.1247, "mean_token_accuracy": 0.9603411674499511, "num_tokens": 61779432.0, "step": 7610 }, { "entropy": 0.2458764672279358, "epoch": 4.297404063205418, "grad_norm": 2.2714271545410156, "learning_rate": 4.5888887946728966e-06, "loss": 0.1412, "mean_token_accuracy": 0.9558348536491394, "num_tokens": 61820028.0, "step": 7615 }, { "entropy": 0.23253954946994781, "epoch": 4.300225733634312, "grad_norm": 2.077993154525757, "learning_rate": 4.588369364679217e-06, "loss": 0.1219, "mean_token_accuracy": 0.9627041578292846, "num_tokens": 61860687.0, "step": 7620 }, { "entropy": 0.2426275998353958, "epoch": 4.303047404063205, "grad_norm": 2.0825798511505127, "learning_rate": 4.587849647206437e-06, "loss": 0.1395, "mean_token_accuracy": 0.9566293358802795, "num_tokens": 61901354.0, "step": 7625 }, { "entropy": 0.2425705909729004, "epoch": 4.305869074492099, "grad_norm": 1.9618566036224365, "learning_rate": 4.587329642356654e-06, "loss": 0.1382, "mean_token_accuracy": 0.9561069011688232, "num_tokens": 61941799.0, "step": 7630 }, { "entropy": 0.2432687759399414, "epoch": 4.308690744920993, "grad_norm": 1.9312220811843872, "learning_rate": 4.586809350232022e-06, "loss": 0.1262, "mean_token_accuracy": 0.9600926876068115, "num_tokens": 61982412.0, "step": 7635 }, { "entropy": 0.22498698830604552, "epoch": 4.311512415349887, "grad_norm": 2.0869600772857666, "learning_rate": 4.586288770934753e-06, "loss": 0.1247, "mean_token_accuracy": 0.9600854396820069, "num_tokens": 62022929.0, "step": 7640 }, { "entropy": 0.2627303659915924, "epoch": 4.314334085778781, "grad_norm": 2.0073163509368896, "learning_rate": 4.585767904567115e-06, "loss": 0.1315, "mean_token_accuracy": 0.9589227676391602, "num_tokens": 62063278.0, "step": 7645 }, { "entropy": 0.2304110735654831, "epoch": 4.317155756207675, "grad_norm": 1.8721083402633667, "learning_rate": 4.585246751231433e-06, "loss": 0.1155, "mean_token_accuracy": 0.962857437133789, "num_tokens": 62103307.0, "step": 7650 }, { "entropy": 0.24186222553253173, "epoch": 4.3199774266365685, "grad_norm": 2.2419540882110596, "learning_rate": 4.584725311030085e-06, "loss": 0.126, "mean_token_accuracy": 0.9607646465301514, "num_tokens": 62144023.0, "step": 7655 }, { "entropy": 0.22962496876716615, "epoch": 4.322799097065463, "grad_norm": 2.0205132961273193, "learning_rate": 4.584203584065512e-06, "loss": 0.1339, "mean_token_accuracy": 0.9580964565277099, "num_tokens": 62184285.0, "step": 7660 }, { "entropy": 0.22456653118133546, "epoch": 4.325620767494357, "grad_norm": 1.9401427507400513, "learning_rate": 4.583681570440204e-06, "loss": 0.1209, "mean_token_accuracy": 0.96328786611557, "num_tokens": 62224200.0, "step": 7665 }, { "entropy": 0.22748615145683287, "epoch": 4.3284424379232505, "grad_norm": 1.9831054210662842, "learning_rate": 4.583159270256712e-06, "loss": 0.1157, "mean_token_accuracy": 0.9626774907112121, "num_tokens": 62264973.0, "step": 7670 }, { "entropy": 0.2501515805721283, "epoch": 4.331264108352144, "grad_norm": 2.2783570289611816, "learning_rate": 4.582636683617643e-06, "loss": 0.1268, "mean_token_accuracy": 0.9602941393852233, "num_tokens": 62305564.0, "step": 7675 }, { "entropy": 0.238142928481102, "epoch": 4.334085778781039, "grad_norm": 2.0721852779388428, "learning_rate": 4.582113810625657e-06, "loss": 0.133, "mean_token_accuracy": 0.958099901676178, "num_tokens": 62345999.0, "step": 7680 }, { "entropy": 0.22630032598972322, "epoch": 4.336907449209932, "grad_norm": 1.8380122184753418, "learning_rate": 4.581590651383473e-06, "loss": 0.1168, "mean_token_accuracy": 0.9639391660690307, "num_tokens": 62386861.0, "step": 7685 }, { "entropy": 0.2205124467611313, "epoch": 4.339729119638826, "grad_norm": 2.063775062561035, "learning_rate": 4.581067205993867e-06, "loss": 0.1078, "mean_token_accuracy": 0.9652288436889649, "num_tokens": 62427526.0, "step": 7690 }, { "entropy": 0.23350533246994018, "epoch": 4.34255079006772, "grad_norm": 2.0544824600219727, "learning_rate": 4.580543474559669e-06, "loss": 0.1333, "mean_token_accuracy": 0.9583484888076782, "num_tokens": 62468033.0, "step": 7695 }, { "entropy": 0.23622917830944062, "epoch": 4.345372460496614, "grad_norm": 1.9537707567214966, "learning_rate": 4.580019457183766e-06, "loss": 0.1248, "mean_token_accuracy": 0.9608685970306396, "num_tokens": 62508513.0, "step": 7700 }, { "entropy": 0.22541263699531555, "epoch": 4.348194130925508, "grad_norm": 1.8316019773483276, "learning_rate": 4.579495153969102e-06, "loss": 0.1235, "mean_token_accuracy": 0.9609684944152832, "num_tokens": 62549331.0, "step": 7705 }, { "entropy": 0.2408158302307129, "epoch": 4.351015801354402, "grad_norm": 2.1585938930511475, "learning_rate": 4.578970565018676e-06, "loss": 0.1357, "mean_token_accuracy": 0.9570087313652038, "num_tokens": 62590028.0, "step": 7710 }, { "entropy": 0.2397814780473709, "epoch": 4.3538374717832955, "grad_norm": 1.9146168231964111, "learning_rate": 4.578445690435542e-06, "loss": 0.1205, "mean_token_accuracy": 0.9615342140197753, "num_tokens": 62630583.0, "step": 7715 }, { "entropy": 0.2495466351509094, "epoch": 4.356659142212189, "grad_norm": 2.213092565536499, "learning_rate": 4.577920530322815e-06, "loss": 0.1317, "mean_token_accuracy": 0.9592637538909912, "num_tokens": 62671379.0, "step": 7720 }, { "entropy": 0.25447321236133574, "epoch": 4.359480812641084, "grad_norm": 2.162130355834961, "learning_rate": 4.5773950847836604e-06, "loss": 0.1476, "mean_token_accuracy": 0.9534005284309387, "num_tokens": 62712032.0, "step": 7725 }, { "entropy": 0.242551389336586, "epoch": 4.3623024830699775, "grad_norm": 2.0905115604400635, "learning_rate": 4.576869353921302e-06, "loss": 0.1325, "mean_token_accuracy": 0.9586278200149536, "num_tokens": 62752939.0, "step": 7730 }, { "entropy": 0.2576675295829773, "epoch": 4.365124153498871, "grad_norm": 2.0513839721679688, "learning_rate": 4.5763433378390205e-06, "loss": 0.1467, "mean_token_accuracy": 0.9542353510856628, "num_tokens": 62793276.0, "step": 7735 }, { "entropy": 0.2226796418428421, "epoch": 4.367945823927765, "grad_norm": 2.049790382385254, "learning_rate": 4.575817036640153e-06, "loss": 0.1265, "mean_token_accuracy": 0.9590314865112305, "num_tokens": 62833681.0, "step": 7740 }, { "entropy": 0.22169795334339143, "epoch": 4.3707674943566595, "grad_norm": 2.1573617458343506, "learning_rate": 4.575290450428088e-06, "loss": 0.1141, "mean_token_accuracy": 0.9643651604652405, "num_tokens": 62874247.0, "step": 7745 }, { "entropy": 0.21064240634441375, "epoch": 4.373589164785553, "grad_norm": 2.0029399394989014, "learning_rate": 4.574763579306276e-06, "loss": 0.114, "mean_token_accuracy": 0.9654483675956727, "num_tokens": 62914852.0, "step": 7750 }, { "entropy": 0.2374986946582794, "epoch": 4.376410835214447, "grad_norm": 2.325894355773926, "learning_rate": 4.574236423378221e-06, "loss": 0.1237, "mean_token_accuracy": 0.9615754604339599, "num_tokens": 62955498.0, "step": 7755 }, { "entropy": 0.2390218436717987, "epoch": 4.3792325056433405, "grad_norm": 2.234550952911377, "learning_rate": 4.5737089827474826e-06, "loss": 0.1278, "mean_token_accuracy": 0.9586256504058838, "num_tokens": 62995636.0, "step": 7760 }, { "entropy": 0.2208180695772171, "epoch": 4.382054176072235, "grad_norm": 2.097153902053833, "learning_rate": 4.573181257517675e-06, "loss": 0.1101, "mean_token_accuracy": 0.9645330548286438, "num_tokens": 63036376.0, "step": 7765 }, { "entropy": 0.25867641568183897, "epoch": 4.384875846501129, "grad_norm": 1.9841035604476929, "learning_rate": 4.572653247792471e-06, "loss": 0.136, "mean_token_accuracy": 0.9573712944984436, "num_tokens": 63076910.0, "step": 7770 }, { "entropy": 0.22610498666763307, "epoch": 4.3876975169300225, "grad_norm": 1.8051813840866089, "learning_rate": 4.572124953675599e-06, "loss": 0.1146, "mean_token_accuracy": 0.963974404335022, "num_tokens": 63117280.0, "step": 7775 }, { "entropy": 0.24536430835723877, "epoch": 4.390519187358916, "grad_norm": 2.267284870147705, "learning_rate": 4.571596375270843e-06, "loss": 0.1263, "mean_token_accuracy": 0.9603652000427246, "num_tokens": 63158036.0, "step": 7780 }, { "entropy": 0.2449027419090271, "epoch": 4.393340857787811, "grad_norm": 1.866507649421692, "learning_rate": 4.5710675126820394e-06, "loss": 0.1414, "mean_token_accuracy": 0.9538567900657654, "num_tokens": 63198608.0, "step": 7785 }, { "entropy": 0.2529394537210464, "epoch": 4.3961625282167045, "grad_norm": 2.741823673248291, "learning_rate": 4.570538366013085e-06, "loss": 0.136, "mean_token_accuracy": 0.957249391078949, "num_tokens": 63239140.0, "step": 7790 }, { "entropy": 0.2259007692337036, "epoch": 4.398984198645598, "grad_norm": 1.8987911939620972, "learning_rate": 4.570008935367931e-06, "loss": 0.1297, "mean_token_accuracy": 0.9585871219635009, "num_tokens": 63279866.0, "step": 7795 }, { "entropy": 0.22481289207935334, "epoch": 4.401805869074492, "grad_norm": 2.1305699348449707, "learning_rate": 4.569479220850583e-06, "loss": 0.125, "mean_token_accuracy": 0.9605366587638855, "num_tokens": 63320412.0, "step": 7800 }, { "entropy": 0.23074941635131835, "epoch": 4.404627539503386, "grad_norm": 1.930199384689331, "learning_rate": 4.568949222565105e-06, "loss": 0.1218, "mean_token_accuracy": 0.9614872097969055, "num_tokens": 63361131.0, "step": 7805 }, { "entropy": 0.22710799276828766, "epoch": 4.40744920993228, "grad_norm": 1.8718585968017578, "learning_rate": 4.568418940615616e-06, "loss": 0.119, "mean_token_accuracy": 0.9626579761505127, "num_tokens": 63401708.0, "step": 7810 }, { "entropy": 0.24245786666870117, "epoch": 4.410270880361174, "grad_norm": 2.3666772842407227, "learning_rate": 4.567888375106286e-06, "loss": 0.1383, "mean_token_accuracy": 0.956715726852417, "num_tokens": 63442479.0, "step": 7815 }, { "entropy": 0.2398514539003372, "epoch": 4.413092550790068, "grad_norm": 2.2359414100646973, "learning_rate": 4.567357526141349e-06, "loss": 0.133, "mean_token_accuracy": 0.9572691440582275, "num_tokens": 63482841.0, "step": 7820 }, { "entropy": 0.23423077166080475, "epoch": 4.415914221218961, "grad_norm": 2.1519811153411865, "learning_rate": 4.5668263938250876e-06, "loss": 0.1264, "mean_token_accuracy": 0.9597557783126831, "num_tokens": 63523538.0, "step": 7825 }, { "entropy": 0.2178510457277298, "epoch": 4.418735891647856, "grad_norm": 1.9678175449371338, "learning_rate": 4.566294978261844e-06, "loss": 0.1131, "mean_token_accuracy": 0.9639857292175293, "num_tokens": 63564255.0, "step": 7830 }, { "entropy": 0.23961224555969238, "epoch": 4.4215575620767495, "grad_norm": 2.6398816108703613, "learning_rate": 4.565763279556014e-06, "loss": 0.137, "mean_token_accuracy": 0.9575521588325501, "num_tokens": 63604882.0, "step": 7835 }, { "entropy": 0.24535090029239653, "epoch": 4.424379232505643, "grad_norm": 2.135787010192871, "learning_rate": 4.565231297812051e-06, "loss": 0.127, "mean_token_accuracy": 0.9597046256065369, "num_tokens": 63645157.0, "step": 7840 }, { "entropy": 0.2281971275806427, "epoch": 4.427200902934537, "grad_norm": 2.0932908058166504, "learning_rate": 4.564699033134462e-06, "loss": 0.1229, "mean_token_accuracy": 0.9616926431655883, "num_tokens": 63685797.0, "step": 7845 }, { "entropy": 0.24232443869113923, "epoch": 4.4300225733634315, "grad_norm": 1.8690487146377563, "learning_rate": 4.564166485627811e-06, "loss": 0.1439, "mean_token_accuracy": 0.9539458394050598, "num_tokens": 63726385.0, "step": 7850 }, { "entropy": 0.2243878573179245, "epoch": 4.432844243792325, "grad_norm": 2.3627984523773193, "learning_rate": 4.563633655396717e-06, "loss": 0.1282, "mean_token_accuracy": 0.9588661789894104, "num_tokens": 63767053.0, "step": 7855 }, { "entropy": 0.25156151950359346, "epoch": 4.435665914221219, "grad_norm": 2.046766996383667, "learning_rate": 4.563100542545854e-06, "loss": 0.1355, "mean_token_accuracy": 0.9556037425994873, "num_tokens": 63807612.0, "step": 7860 }, { "entropy": 0.23665276169776917, "epoch": 4.438487584650113, "grad_norm": 2.13456130027771, "learning_rate": 4.5625671471799535e-06, "loss": 0.1412, "mean_token_accuracy": 0.955809724330902, "num_tokens": 63848217.0, "step": 7865 }, { "entropy": 0.22619423866271973, "epoch": 4.441309255079007, "grad_norm": 2.092322587966919, "learning_rate": 4.562033469403799e-06, "loss": 0.1183, "mean_token_accuracy": 0.9627872586250306, "num_tokens": 63888701.0, "step": 7870 }, { "entropy": 0.22337720394134522, "epoch": 4.444130925507901, "grad_norm": 1.8521798849105835, "learning_rate": 4.561499509322233e-06, "loss": 0.1193, "mean_token_accuracy": 0.9614850997924804, "num_tokens": 63929411.0, "step": 7875 }, { "entropy": 0.233727565407753, "epoch": 4.446952595936795, "grad_norm": 1.6389681100845337, "learning_rate": 4.560965267040151e-06, "loss": 0.1149, "mean_token_accuracy": 0.9633271217346191, "num_tokens": 63970272.0, "step": 7880 }, { "entropy": 0.21891143321990966, "epoch": 4.449774266365688, "grad_norm": 2.2698888778686523, "learning_rate": 4.560430742662506e-06, "loss": 0.1212, "mean_token_accuracy": 0.9615803718566894, "num_tokens": 64010993.0, "step": 7885 }, { "entropy": 0.257635697722435, "epoch": 4.452595936794582, "grad_norm": 2.019932270050049, "learning_rate": 4.559895936294305e-06, "loss": 0.1328, "mean_token_accuracy": 0.9580089688301087, "num_tokens": 64051747.0, "step": 7890 }, { "entropy": 0.2612759441137314, "epoch": 4.455417607223477, "grad_norm": 2.1498031616210938, "learning_rate": 4.559360848040611e-06, "loss": 0.1354, "mean_token_accuracy": 0.9571520805358886, "num_tokens": 64092298.0, "step": 7895 }, { "entropy": 0.24472321271896363, "epoch": 4.45823927765237, "grad_norm": 2.2443246841430664, "learning_rate": 4.558825478006543e-06, "loss": 0.1298, "mean_token_accuracy": 0.9582807421684265, "num_tokens": 64132966.0, "step": 7900 }, { "entropy": 0.24086927771568298, "epoch": 4.461060948081264, "grad_norm": 2.0586602687835693, "learning_rate": 4.5582898262972715e-06, "loss": 0.1147, "mean_token_accuracy": 0.9626161217689514, "num_tokens": 64173716.0, "step": 7905 }, { "entropy": 0.22729225158691407, "epoch": 4.463882618510158, "grad_norm": 2.192944049835205, "learning_rate": 4.557753893018028e-06, "loss": 0.1231, "mean_token_accuracy": 0.962181007862091, "num_tokens": 64214398.0, "step": 7910 }, { "entropy": 0.24374393820762635, "epoch": 4.466704288939052, "grad_norm": 2.042626142501831, "learning_rate": 4.557217678274097e-06, "loss": 0.1412, "mean_token_accuracy": 0.9556024193763732, "num_tokens": 64254854.0, "step": 7915 }, { "entropy": 0.23933674693107604, "epoch": 4.469525959367946, "grad_norm": 1.807453989982605, "learning_rate": 4.556681182170816e-06, "loss": 0.1334, "mean_token_accuracy": 0.9586431503295898, "num_tokens": 64295435.0, "step": 7920 }, { "entropy": 0.2443247377872467, "epoch": 4.47234762979684, "grad_norm": 2.1040701866149902, "learning_rate": 4.55614440481358e-06, "loss": 0.1288, "mean_token_accuracy": 0.9598910212516785, "num_tokens": 64336069.0, "step": 7925 }, { "entropy": 0.24531608521938325, "epoch": 4.475169300225733, "grad_norm": 2.0442538261413574, "learning_rate": 4.555607346307841e-06, "loss": 0.1359, "mean_token_accuracy": 0.9561710357666016, "num_tokens": 64376772.0, "step": 7930 }, { "entropy": 0.2342956393957138, "epoch": 4.477990970654628, "grad_norm": 1.9000775814056396, "learning_rate": 4.555070006759102e-06, "loss": 0.1351, "mean_token_accuracy": 0.9563033699989318, "num_tokens": 64417469.0, "step": 7935 }, { "entropy": 0.22305310368537903, "epoch": 4.480812641083522, "grad_norm": 1.8340357542037964, "learning_rate": 4.554532386272925e-06, "loss": 0.1233, "mean_token_accuracy": 0.9609080910682678, "num_tokens": 64458183.0, "step": 7940 }, { "entropy": 0.24366792142391205, "epoch": 4.483634311512415, "grad_norm": 2.0451269149780273, "learning_rate": 4.5539944849549244e-06, "loss": 0.1332, "mean_token_accuracy": 0.9595473170280456, "num_tokens": 64499084.0, "step": 7945 }, { "entropy": 0.2458895593881607, "epoch": 4.486455981941309, "grad_norm": 2.0097408294677734, "learning_rate": 4.553456302910771e-06, "loss": 0.1273, "mean_token_accuracy": 0.9585718154907227, "num_tokens": 64539911.0, "step": 7950 }, { "entropy": 0.26139111518859864, "epoch": 4.489277652370204, "grad_norm": 2.031979560852051, "learning_rate": 4.552917840246191e-06, "loss": 0.1372, "mean_token_accuracy": 0.9563012480735779, "num_tokens": 64580156.0, "step": 7955 }, { "entropy": 0.2303838014602661, "epoch": 4.492099322799097, "grad_norm": 1.742417573928833, "learning_rate": 4.552379097066967e-06, "loss": 0.128, "mean_token_accuracy": 0.959154736995697, "num_tokens": 64620865.0, "step": 7960 }, { "entropy": 0.24330146610736847, "epoch": 4.494920993227991, "grad_norm": 2.139296770095825, "learning_rate": 4.551840073478934e-06, "loss": 0.134, "mean_token_accuracy": 0.9584646821022034, "num_tokens": 64661404.0, "step": 7965 }, { "entropy": 0.24887127578258514, "epoch": 4.497742663656885, "grad_norm": 2.17922306060791, "learning_rate": 4.551300769587982e-06, "loss": 0.133, "mean_token_accuracy": 0.9567908644676208, "num_tokens": 64701858.0, "step": 7970 }, { "entropy": 0.2185787171125412, "epoch": 4.500564334085778, "grad_norm": 2.0657732486724854, "learning_rate": 4.550761185500059e-06, "loss": 0.1253, "mean_token_accuracy": 0.9603734970092773, "num_tokens": 64742554.0, "step": 7975 }, { "entropy": 0.2412070006132126, "epoch": 4.503386004514673, "grad_norm": 1.9913356304168701, "learning_rate": 4.550221321321165e-06, "loss": 0.1238, "mean_token_accuracy": 0.959455955028534, "num_tokens": 64783017.0, "step": 7980 }, { "entropy": 0.24967373609542848, "epoch": 4.506207674943567, "grad_norm": 2.3349766731262207, "learning_rate": 4.549681177157358e-06, "loss": 0.1318, "mean_token_accuracy": 0.9588298797607422, "num_tokens": 64823373.0, "step": 7985 }, { "entropy": 0.22077546417713165, "epoch": 4.50902934537246, "grad_norm": 1.7486339807510376, "learning_rate": 4.549140753114748e-06, "loss": 0.1058, "mean_token_accuracy": 0.9653358578681945, "num_tokens": 64864026.0, "step": 7990 }, { "entropy": 0.22933962047100068, "epoch": 4.511851015801354, "grad_norm": 1.8534975051879883, "learning_rate": 4.548600049299502e-06, "loss": 0.1226, "mean_token_accuracy": 0.9605401158332825, "num_tokens": 64904483.0, "step": 7995 }, { "entropy": 0.22776411473751068, "epoch": 4.514672686230249, "grad_norm": 1.8297226428985596, "learning_rate": 4.548059065817841e-06, "loss": 0.1186, "mean_token_accuracy": 0.96176518201828, "num_tokens": 64945138.0, "step": 8000 }, { "epoch": 4.514672686230249, "eval_entropy": 0.2636869251728058, "eval_loss": 0.0795026570558548, "eval_mean_token_accuracy": 0.9754695296287537, "eval_num_tokens": 64945138.0, "eval_runtime": 0.1637, "eval_samples_per_second": 24.429, "eval_steps_per_second": 6.107, "step": 8000 }, { "entropy": 0.21750448644161224, "epoch": 4.517494356659142, "grad_norm": 2.122471570968628, "learning_rate": 4.547517802776042e-06, "loss": 0.1224, "mean_token_accuracy": 0.9609117388725281, "num_tokens": 64985873.0, "step": 8005 }, { "entropy": 0.22921790778636933, "epoch": 4.520316027088036, "grad_norm": 2.1541335582733154, "learning_rate": 4.546976260280435e-06, "loss": 0.1131, "mean_token_accuracy": 0.9636703491210937, "num_tokens": 65026415.0, "step": 8010 }, { "entropy": 0.23894762694835664, "epoch": 4.52313769751693, "grad_norm": 2.3302664756774902, "learning_rate": 4.546434438437408e-06, "loss": 0.1196, "mean_token_accuracy": 0.9631991147994995, "num_tokens": 65067004.0, "step": 8015 }, { "entropy": 0.26045531034469604, "epoch": 4.525959367945823, "grad_norm": 2.2902634143829346, "learning_rate": 4.5458923373534e-06, "loss": 0.1442, "mean_token_accuracy": 0.9546260356903076, "num_tokens": 65107609.0, "step": 8020 }, { "entropy": 0.23849842548370362, "epoch": 4.528781038374718, "grad_norm": 2.1312549114227295, "learning_rate": 4.545349957134908e-06, "loss": 0.1253, "mean_token_accuracy": 0.9602569699287414, "num_tokens": 65148321.0, "step": 8025 }, { "entropy": 0.242272087931633, "epoch": 4.531602708803612, "grad_norm": 2.1353938579559326, "learning_rate": 4.544807297888482e-06, "loss": 0.1329, "mean_token_accuracy": 0.9582337856292724, "num_tokens": 65188859.0, "step": 8030 }, { "entropy": 0.21073226928710936, "epoch": 4.534424379232505, "grad_norm": 1.7557628154754639, "learning_rate": 4.544264359720728e-06, "loss": 0.1058, "mean_token_accuracy": 0.9665800213813782, "num_tokens": 65229611.0, "step": 8035 }, { "entropy": 0.2263183683156967, "epoch": 4.5372460496614, "grad_norm": 1.9420866966247559, "learning_rate": 4.543721142738306e-06, "loss": 0.1327, "mean_token_accuracy": 0.9569009423255921, "num_tokens": 65270007.0, "step": 8040 }, { "entropy": 0.24126074612140655, "epoch": 4.540067720090294, "grad_norm": 1.8423664569854736, "learning_rate": 4.543177647047931e-06, "loss": 0.1261, "mean_token_accuracy": 0.9594087600708008, "num_tokens": 65310681.0, "step": 8045 }, { "entropy": 0.2388361632823944, "epoch": 4.542889390519187, "grad_norm": 1.9569209814071655, "learning_rate": 4.542633872756374e-06, "loss": 0.1303, "mean_token_accuracy": 0.9593725204467773, "num_tokens": 65351285.0, "step": 8050 }, { "entropy": 0.23481654226779938, "epoch": 4.545711060948081, "grad_norm": 1.949885606765747, "learning_rate": 4.542089819970456e-06, "loss": 0.1192, "mean_token_accuracy": 0.9611479878425598, "num_tokens": 65391577.0, "step": 8055 }, { "entropy": 0.25715220272541045, "epoch": 4.548532731376975, "grad_norm": 2.203080892562866, "learning_rate": 4.541545488797061e-06, "loss": 0.1468, "mean_token_accuracy": 0.9533620238304138, "num_tokens": 65432197.0, "step": 8060 }, { "entropy": 0.23479729294776916, "epoch": 4.551354401805869, "grad_norm": 1.9314894676208496, "learning_rate": 4.541000879343119e-06, "loss": 0.1233, "mean_token_accuracy": 0.9603962182998658, "num_tokens": 65473100.0, "step": 8065 }, { "entropy": 0.23400575220584868, "epoch": 4.554176072234763, "grad_norm": 1.9281415939331055, "learning_rate": 4.540455991715621e-06, "loss": 0.139, "mean_token_accuracy": 0.9566599488258362, "num_tokens": 65513690.0, "step": 8070 }, { "entropy": 0.22714907824993133, "epoch": 4.556997742663657, "grad_norm": 1.8072905540466309, "learning_rate": 4.539910826021609e-06, "loss": 0.1232, "mean_token_accuracy": 0.9606075048446655, "num_tokens": 65554560.0, "step": 8075 }, { "entropy": 0.23093242347240447, "epoch": 4.5598194130925505, "grad_norm": 1.9328153133392334, "learning_rate": 4.539365382368182e-06, "loss": 0.1313, "mean_token_accuracy": 0.9587170481681824, "num_tokens": 65595278.0, "step": 8080 }, { "entropy": 0.2399729460477829, "epoch": 4.562641083521445, "grad_norm": 2.074904203414917, "learning_rate": 4.5388196608624915e-06, "loss": 0.126, "mean_token_accuracy": 0.9598425269126892, "num_tokens": 65635798.0, "step": 8085 }, { "entropy": 0.22223535776138306, "epoch": 4.565462753950339, "grad_norm": 1.8036304712295532, "learning_rate": 4.538273661611744e-06, "loss": 0.128, "mean_token_accuracy": 0.960320234298706, "num_tokens": 65676485.0, "step": 8090 }, { "entropy": 0.2333895742893219, "epoch": 4.568284424379232, "grad_norm": 2.1625099182128906, "learning_rate": 4.537727384723203e-06, "loss": 0.1396, "mean_token_accuracy": 0.9549428701400757, "num_tokens": 65717380.0, "step": 8095 }, { "entropy": 0.2488487184047699, "epoch": 4.571106094808126, "grad_norm": 2.21354079246521, "learning_rate": 4.537180830304183e-06, "loss": 0.1413, "mean_token_accuracy": 0.9569279074668884, "num_tokens": 65757893.0, "step": 8100 }, { "entropy": 0.22758035659790038, "epoch": 4.57392776523702, "grad_norm": 2.3330676555633545, "learning_rate": 4.536633998462055e-06, "loss": 0.1163, "mean_token_accuracy": 0.9631718873977662, "num_tokens": 65798444.0, "step": 8105 }, { "entropy": 0.235689178109169, "epoch": 4.576749435665914, "grad_norm": 2.2506041526794434, "learning_rate": 4.536086889304246e-06, "loss": 0.134, "mean_token_accuracy": 0.9576963305473327, "num_tokens": 65839127.0, "step": 8110 }, { "entropy": 0.2473171055316925, "epoch": 4.579571106094808, "grad_norm": 2.3321518898010254, "learning_rate": 4.535539502938233e-06, "loss": 0.128, "mean_token_accuracy": 0.9609445095062256, "num_tokens": 65879975.0, "step": 8115 }, { "entropy": 0.25173512697219846, "epoch": 4.582392776523702, "grad_norm": 2.5491793155670166, "learning_rate": 4.534991839471551e-06, "loss": 0.1372, "mean_token_accuracy": 0.9561426758766174, "num_tokens": 65920672.0, "step": 8120 }, { "entropy": 0.23555795848369598, "epoch": 4.585214446952596, "grad_norm": 2.0215837955474854, "learning_rate": 4.534443899011789e-06, "loss": 0.1274, "mean_token_accuracy": 0.9595631241798401, "num_tokens": 65960881.0, "step": 8125 }, { "entropy": 0.2525820404291153, "epoch": 4.58803611738149, "grad_norm": 2.073427438735962, "learning_rate": 4.533895681666591e-06, "loss": 0.1387, "mean_token_accuracy": 0.9559617161750793, "num_tokens": 66001503.0, "step": 8130 }, { "entropy": 0.23859679996967315, "epoch": 4.590857787810384, "grad_norm": 1.9522126913070679, "learning_rate": 4.533347187543652e-06, "loss": 0.125, "mean_token_accuracy": 0.9605787754058838, "num_tokens": 66042200.0, "step": 8135 }, { "entropy": 0.23188299536705018, "epoch": 4.5936794582392775, "grad_norm": 1.8732774257659912, "learning_rate": 4.5327984167507255e-06, "loss": 0.1232, "mean_token_accuracy": 0.9610357999801635, "num_tokens": 66082649.0, "step": 8140 }, { "entropy": 0.2427914947271347, "epoch": 4.596501128668171, "grad_norm": 2.2590503692626953, "learning_rate": 4.532249369395616e-06, "loss": 0.1242, "mean_token_accuracy": 0.9602537393569947, "num_tokens": 66123110.0, "step": 8145 }, { "entropy": 0.24748845994472504, "epoch": 4.599322799097066, "grad_norm": 2.055881977081299, "learning_rate": 4.531700045586187e-06, "loss": 0.1205, "mean_token_accuracy": 0.9622966647148132, "num_tokens": 66163660.0, "step": 8150 }, { "entropy": 0.23012515604496003, "epoch": 4.6021444695259595, "grad_norm": 1.8205794095993042, "learning_rate": 4.53115044543035e-06, "loss": 0.1247, "mean_token_accuracy": 0.9605541706085206, "num_tokens": 66204458.0, "step": 8155 }, { "entropy": 0.2356725037097931, "epoch": 4.604966139954853, "grad_norm": 2.3245763778686523, "learning_rate": 4.530600569036075e-06, "loss": 0.1337, "mean_token_accuracy": 0.9578644394874573, "num_tokens": 66245231.0, "step": 8160 }, { "entropy": 0.2583352416753769, "epoch": 4.607787810383747, "grad_norm": 2.2044198513031006, "learning_rate": 4.530050416511386e-06, "loss": 0.1491, "mean_token_accuracy": 0.9534392952919006, "num_tokens": 66285856.0, "step": 8165 }, { "entropy": 0.2439739376306534, "epoch": 4.610609480812641, "grad_norm": 2.181988477706909, "learning_rate": 4.529499987964359e-06, "loss": 0.1372, "mean_token_accuracy": 0.9561636447906494, "num_tokens": 66326451.0, "step": 8170 }, { "entropy": 0.24100883007049562, "epoch": 4.613431151241535, "grad_norm": 2.0313637256622314, "learning_rate": 4.5289492835031275e-06, "loss": 0.1421, "mean_token_accuracy": 0.9549835324287415, "num_tokens": 66367157.0, "step": 8175 }, { "entropy": 0.22571427822113038, "epoch": 4.616252821670429, "grad_norm": 1.996917963027954, "learning_rate": 4.528398303235877e-06, "loss": 0.1191, "mean_token_accuracy": 0.9621818661689758, "num_tokens": 66407802.0, "step": 8180 }, { "entropy": 0.23621368408203125, "epoch": 4.6190744920993225, "grad_norm": 1.7542959451675415, "learning_rate": 4.527847047270847e-06, "loss": 0.1379, "mean_token_accuracy": 0.9554514408111572, "num_tokens": 66448374.0, "step": 8185 }, { "entropy": 0.2359769821166992, "epoch": 4.621896162528216, "grad_norm": 2.465397834777832, "learning_rate": 4.527295515716332e-06, "loss": 0.128, "mean_token_accuracy": 0.9583778262138367, "num_tokens": 66488251.0, "step": 8190 }, { "entropy": 0.22702213227748871, "epoch": 4.624717832957111, "grad_norm": 2.1334855556488037, "learning_rate": 4.526743708680681e-06, "loss": 0.1218, "mean_token_accuracy": 0.9614168047904968, "num_tokens": 66528989.0, "step": 8195 }, { "entropy": 0.2207847625017166, "epoch": 4.6275395033860045, "grad_norm": 2.2564754486083984, "learning_rate": 4.526191626272297e-06, "loss": 0.1245, "mean_token_accuracy": 0.9604014396667481, "num_tokens": 66569555.0, "step": 8200 }, { "entropy": 0.24324011504650117, "epoch": 4.630361173814898, "grad_norm": 1.8627599477767944, "learning_rate": 4.525639268599635e-06, "loss": 0.1284, "mean_token_accuracy": 0.9589541673660278, "num_tokens": 66610160.0, "step": 8205 }, { "entropy": 0.2532497227191925, "epoch": 4.633182844243793, "grad_norm": 2.4542059898376465, "learning_rate": 4.5250866357712066e-06, "loss": 0.1418, "mean_token_accuracy": 0.9555447816848754, "num_tokens": 66650856.0, "step": 8210 }, { "entropy": 0.23340786695480348, "epoch": 4.6360045146726865, "grad_norm": 2.2496676445007324, "learning_rate": 4.524533727895577e-06, "loss": 0.118, "mean_token_accuracy": 0.9636736989021302, "num_tokens": 66691192.0, "step": 8215 }, { "entropy": 0.24236354231834412, "epoch": 4.63882618510158, "grad_norm": 1.9971568584442139, "learning_rate": 4.5239805450813646e-06, "loss": 0.134, "mean_token_accuracy": 0.9569501161575318, "num_tokens": 66731579.0, "step": 8220 }, { "entropy": 0.23493029475212096, "epoch": 4.641647855530474, "grad_norm": 2.242898941040039, "learning_rate": 4.523427087437241e-06, "loss": 0.1212, "mean_token_accuracy": 0.962970244884491, "num_tokens": 66771885.0, "step": 8225 }, { "entropy": 0.22397561073303224, "epoch": 4.644469525959368, "grad_norm": 1.9795989990234375, "learning_rate": 4.522873355071936e-06, "loss": 0.1089, "mean_token_accuracy": 0.9651905417442321, "num_tokens": 66812487.0, "step": 8230 }, { "entropy": 0.21759993433952332, "epoch": 4.647291196388262, "grad_norm": 5.298044204711914, "learning_rate": 4.5223193480942275e-06, "loss": 0.1171, "mean_token_accuracy": 0.9617223024368287, "num_tokens": 66853201.0, "step": 8235 }, { "entropy": 0.24825019538402557, "epoch": 4.650112866817156, "grad_norm": 2.4175469875335693, "learning_rate": 4.521765066612952e-06, "loss": 0.1532, "mean_token_accuracy": 0.9531420230865478, "num_tokens": 66893997.0, "step": 8240 }, { "entropy": 0.23018406629562377, "epoch": 4.6529345372460496, "grad_norm": 1.9281044006347656, "learning_rate": 4.521210510736998e-06, "loss": 0.1235, "mean_token_accuracy": 0.9599860429763794, "num_tokens": 66934876.0, "step": 8245 }, { "entropy": 0.23870559334754943, "epoch": 4.655756207674943, "grad_norm": 1.8871232271194458, "learning_rate": 4.520655680575306e-06, "loss": 0.1159, "mean_token_accuracy": 0.9634807467460632, "num_tokens": 66975475.0, "step": 8250 }, { "entropy": 0.244593945145607, "epoch": 4.658577878103838, "grad_norm": 2.5723750591278076, "learning_rate": 4.520100576236877e-06, "loss": 0.1325, "mean_token_accuracy": 0.9575012445449829, "num_tokens": 67016014.0, "step": 8255 }, { "entropy": 0.23622966110706328, "epoch": 4.6613995485327315, "grad_norm": 2.5368049144744873, "learning_rate": 4.5195451978307556e-06, "loss": 0.131, "mean_token_accuracy": 0.9596938133239746, "num_tokens": 67056774.0, "step": 8260 }, { "entropy": 0.22385245859622954, "epoch": 4.664221218961625, "grad_norm": 2.3085649013519287, "learning_rate": 4.51898954546605e-06, "loss": 0.1206, "mean_token_accuracy": 0.9616567969322205, "num_tokens": 67096578.0, "step": 8265 }, { "entropy": 0.24655192196369172, "epoch": 4.667042889390519, "grad_norm": 2.2656362056732178, "learning_rate": 4.518433619251918e-06, "loss": 0.1407, "mean_token_accuracy": 0.9564512014389038, "num_tokens": 67137138.0, "step": 8270 }, { "entropy": 0.24504003524780274, "epoch": 4.669864559819413, "grad_norm": 1.9192149639129639, "learning_rate": 4.5178774192975685e-06, "loss": 0.1408, "mean_token_accuracy": 0.9554814100265503, "num_tokens": 67176712.0, "step": 8275 }, { "entropy": 0.22429890930652618, "epoch": 4.672686230248307, "grad_norm": 2.07617449760437, "learning_rate": 4.51732094571227e-06, "loss": 0.1242, "mean_token_accuracy": 0.9604406833648682, "num_tokens": 67217311.0, "step": 8280 }, { "entropy": 0.250271201133728, "epoch": 4.675507900677201, "grad_norm": 1.9916068315505981, "learning_rate": 4.51676419860534e-06, "loss": 0.1517, "mean_token_accuracy": 0.9521410703659058, "num_tokens": 67257957.0, "step": 8285 }, { "entropy": 0.243993404507637, "epoch": 4.678329571106095, "grad_norm": 2.026092290878296, "learning_rate": 4.516207178086153e-06, "loss": 0.121, "mean_token_accuracy": 0.962063193321228, "num_tokens": 67298719.0, "step": 8290 }, { "entropy": 0.23810403943061828, "epoch": 4.681151241534989, "grad_norm": 1.7401542663574219, "learning_rate": 4.515649884264135e-06, "loss": 0.1302, "mean_token_accuracy": 0.9585550904273987, "num_tokens": 67339410.0, "step": 8295 }, { "entropy": 0.24030295610427857, "epoch": 4.683972911963883, "grad_norm": 1.8887089490890503, "learning_rate": 4.515092317248766e-06, "loss": 0.1216, "mean_token_accuracy": 0.9610766172409058, "num_tokens": 67379808.0, "step": 8300 }, { "entropy": 0.2178757071495056, "epoch": 4.686794582392777, "grad_norm": 2.0333640575408936, "learning_rate": 4.514534477149581e-06, "loss": 0.1048, "mean_token_accuracy": 0.9664766073226929, "num_tokens": 67420340.0, "step": 8305 }, { "entropy": 0.23881784081459045, "epoch": 4.68961625282167, "grad_norm": 2.146456480026245, "learning_rate": 4.513976364076167e-06, "loss": 0.1213, "mean_token_accuracy": 0.9607988595962524, "num_tokens": 67461118.0, "step": 8310 }, { "entropy": 0.22699067294597625, "epoch": 4.692437923250564, "grad_norm": 2.0983293056488037, "learning_rate": 4.513417978138166e-06, "loss": 0.1382, "mean_token_accuracy": 0.9564812779426575, "num_tokens": 67501770.0, "step": 8315 }, { "entropy": 0.21569800674915313, "epoch": 4.6952595936794586, "grad_norm": 1.7150790691375732, "learning_rate": 4.5128593194452725e-06, "loss": 0.124, "mean_token_accuracy": 0.9598898530006409, "num_tokens": 67542371.0, "step": 8320 }, { "entropy": 0.2471790909767151, "epoch": 4.698081264108352, "grad_norm": 2.265134811401367, "learning_rate": 4.5123003881072345e-06, "loss": 0.1441, "mean_token_accuracy": 0.9535491704940796, "num_tokens": 67582960.0, "step": 8325 }, { "entropy": 0.22231938540935517, "epoch": 4.700902934537246, "grad_norm": 1.7570245265960693, "learning_rate": 4.511741184233856e-06, "loss": 0.1158, "mean_token_accuracy": 0.9630581736564636, "num_tokens": 67623636.0, "step": 8330 }, { "entropy": 0.24155616462230683, "epoch": 4.70372460496614, "grad_norm": 2.0228629112243652, "learning_rate": 4.511181707934992e-06, "loss": 0.1354, "mean_token_accuracy": 0.9561177611351013, "num_tokens": 67664179.0, "step": 8335 }, { "entropy": 0.23686327040195465, "epoch": 4.706546275395034, "grad_norm": 2.0804860591888428, "learning_rate": 4.5106219593205505e-06, "loss": 0.1402, "mean_token_accuracy": 0.9536548256874084, "num_tokens": 67704834.0, "step": 8340 }, { "entropy": 0.22247822284698487, "epoch": 4.709367945823928, "grad_norm": 2.316359519958496, "learning_rate": 4.510061938500495e-06, "loss": 0.1317, "mean_token_accuracy": 0.9575009226799012, "num_tokens": 67745324.0, "step": 8345 }, { "entropy": 0.25336918532848357, "epoch": 4.712189616252822, "grad_norm": 1.8472719192504883, "learning_rate": 4.509501645584842e-06, "loss": 0.1437, "mean_token_accuracy": 0.9530521392822265, "num_tokens": 67786074.0, "step": 8350 }, { "entropy": 0.23464283645153045, "epoch": 4.715011286681715, "grad_norm": 2.232201337814331, "learning_rate": 4.508941080683661e-06, "loss": 0.1304, "mean_token_accuracy": 0.9586957097053528, "num_tokens": 67826602.0, "step": 8355 }, { "entropy": 0.2284877747297287, "epoch": 4.717832957110609, "grad_norm": 2.0694456100463867, "learning_rate": 4.508380243907074e-06, "loss": 0.1335, "mean_token_accuracy": 0.9572539687156677, "num_tokens": 67867168.0, "step": 8360 }, { "entropy": 0.23733165860176086, "epoch": 4.720654627539504, "grad_norm": 1.836682915687561, "learning_rate": 4.5078191353652575e-06, "loss": 0.1427, "mean_token_accuracy": 0.9549849033355713, "num_tokens": 67907809.0, "step": 8365 }, { "entropy": 0.23421334028244017, "epoch": 4.723476297968397, "grad_norm": 2.212164878845215, "learning_rate": 4.507257755168444e-06, "loss": 0.1401, "mean_token_accuracy": 0.9548245668411255, "num_tokens": 67948509.0, "step": 8370 }, { "entropy": 0.2503542214632034, "epoch": 4.726297968397291, "grad_norm": 2.0967366695404053, "learning_rate": 4.506696103426914e-06, "loss": 0.1403, "mean_token_accuracy": 0.9554270505905151, "num_tokens": 67989217.0, "step": 8375 }, { "entropy": 0.22551278471946717, "epoch": 4.729119638826186, "grad_norm": 2.0582823753356934, "learning_rate": 4.506134180251005e-06, "loss": 0.1292, "mean_token_accuracy": 0.9604162216186524, "num_tokens": 68029546.0, "step": 8380 }, { "entropy": 0.23066943287849426, "epoch": 4.731941309255079, "grad_norm": 1.9239495992660522, "learning_rate": 4.5055719857511065e-06, "loss": 0.1443, "mean_token_accuracy": 0.9540068626403808, "num_tokens": 68070249.0, "step": 8385 }, { "entropy": 0.24158512353897094, "epoch": 4.734762979683973, "grad_norm": 2.0336618423461914, "learning_rate": 4.505009520037662e-06, "loss": 0.1339, "mean_token_accuracy": 0.9556556820869446, "num_tokens": 68110991.0, "step": 8390 }, { "entropy": 0.24518156945705413, "epoch": 4.737584650112867, "grad_norm": 2.0292651653289795, "learning_rate": 4.504446783221168e-06, "loss": 0.1303, "mean_token_accuracy": 0.9584499716758728, "num_tokens": 68151625.0, "step": 8395 }, { "entropy": 0.23519628643989562, "epoch": 4.74040632054176, "grad_norm": 2.051119327545166, "learning_rate": 4.503883775412174e-06, "loss": 0.1334, "mean_token_accuracy": 0.9583625435829163, "num_tokens": 68192104.0, "step": 8400 }, { "entropy": 0.2450340747833252, "epoch": 4.743227990970655, "grad_norm": 2.1468701362609863, "learning_rate": 4.503320496721283e-06, "loss": 0.1437, "mean_token_accuracy": 0.9554325222969056, "num_tokens": 68232969.0, "step": 8405 }, { "entropy": 0.22379915416240692, "epoch": 4.746049661399549, "grad_norm": 2.243983268737793, "learning_rate": 4.5027569472591515e-06, "loss": 0.1262, "mean_token_accuracy": 0.9602833390235901, "num_tokens": 68273416.0, "step": 8410 }, { "entropy": 0.23107292354106904, "epoch": 4.748871331828442, "grad_norm": 1.9481990337371826, "learning_rate": 4.502193127136489e-06, "loss": 0.1195, "mean_token_accuracy": 0.9613822102546692, "num_tokens": 68314053.0, "step": 8415 }, { "entropy": 0.24034847021102906, "epoch": 4.751693002257336, "grad_norm": 2.4841561317443848, "learning_rate": 4.501629036464057e-06, "loss": 0.1295, "mean_token_accuracy": 0.9587523937225342, "num_tokens": 68354766.0, "step": 8420 }, { "entropy": 0.23407194018363953, "epoch": 4.754514672686231, "grad_norm": 2.2087182998657227, "learning_rate": 4.501064675352671e-06, "loss": 0.125, "mean_token_accuracy": 0.9598301172256469, "num_tokens": 68395431.0, "step": 8425 }, { "entropy": 0.25628364980220797, "epoch": 4.757336343115124, "grad_norm": 2.0910379886627197, "learning_rate": 4.500500043913203e-06, "loss": 0.1406, "mean_token_accuracy": 0.9561483979225158, "num_tokens": 68436109.0, "step": 8430 }, { "entropy": 0.26322224736213684, "epoch": 4.760158013544018, "grad_norm": 2.235393762588501, "learning_rate": 4.499935142256571e-06, "loss": 0.1605, "mean_token_accuracy": 0.9494641661643982, "num_tokens": 68476762.0, "step": 8435 }, { "entropy": 0.24928564131259917, "epoch": 4.762979683972912, "grad_norm": 2.2493717670440674, "learning_rate": 4.499369970493751e-06, "loss": 0.1457, "mean_token_accuracy": 0.9544100880622863, "num_tokens": 68517508.0, "step": 8440 }, { "entropy": 0.23149662911891938, "epoch": 4.765801354401805, "grad_norm": 1.8034436702728271, "learning_rate": 4.498804528735773e-06, "loss": 0.1316, "mean_token_accuracy": 0.958375072479248, "num_tokens": 68558346.0, "step": 8445 }, { "entropy": 0.2590792328119278, "epoch": 4.7686230248307, "grad_norm": 1.9933143854141235, "learning_rate": 4.498238817093717e-06, "loss": 0.1211, "mean_token_accuracy": 0.9613798260688782, "num_tokens": 68598995.0, "step": 8450 }, { "entropy": 0.21191737055778503, "epoch": 4.771444695259594, "grad_norm": 1.8412405252456665, "learning_rate": 4.497672835678716e-06, "loss": 0.1165, "mean_token_accuracy": 0.9624577045440674, "num_tokens": 68639676.0, "step": 8455 }, { "entropy": 0.2465408831834793, "epoch": 4.774266365688487, "grad_norm": 2.141119956970215, "learning_rate": 4.497106584601957e-06, "loss": 0.1499, "mean_token_accuracy": 0.9527180552482605, "num_tokens": 68680313.0, "step": 8460 }, { "entropy": 0.2483223021030426, "epoch": 4.777088036117382, "grad_norm": 2.286459445953369, "learning_rate": 4.496540063974683e-06, "loss": 0.1449, "mean_token_accuracy": 0.9531118035316467, "num_tokens": 68721184.0, "step": 8465 }, { "entropy": 0.23041402101516723, "epoch": 4.779909706546276, "grad_norm": 1.8058642148971558, "learning_rate": 4.495973273908184e-06, "loss": 0.1271, "mean_token_accuracy": 0.9595191001892089, "num_tokens": 68761833.0, "step": 8470 }, { "entropy": 0.24951161742210387, "epoch": 4.782731376975169, "grad_norm": 2.057999849319458, "learning_rate": 4.495406214513807e-06, "loss": 0.1334, "mean_token_accuracy": 0.9589525699615479, "num_tokens": 68802489.0, "step": 8475 }, { "entropy": 0.2543626993894577, "epoch": 4.785553047404063, "grad_norm": 2.1733288764953613, "learning_rate": 4.494838885902952e-06, "loss": 0.1569, "mean_token_accuracy": 0.9508596181869506, "num_tokens": 68843038.0, "step": 8480 }, { "entropy": 0.22672601640224457, "epoch": 4.788374717832957, "grad_norm": 3.0629308223724365, "learning_rate": 4.4942712881870684e-06, "loss": 0.13, "mean_token_accuracy": 0.9586154460906983, "num_tokens": 68883645.0, "step": 8485 }, { "entropy": 0.24403004348278046, "epoch": 4.791196388261851, "grad_norm": 2.2074005603790283, "learning_rate": 4.493703421477663e-06, "loss": 0.1449, "mean_token_accuracy": 0.9540403723716736, "num_tokens": 68924239.0, "step": 8490 }, { "entropy": 0.2356546252965927, "epoch": 4.794018058690745, "grad_norm": 1.874244213104248, "learning_rate": 4.493135285886293e-06, "loss": 0.1392, "mean_token_accuracy": 0.9564555406570434, "num_tokens": 68964945.0, "step": 8495 }, { "entropy": 0.24922839403152466, "epoch": 4.796839729119639, "grad_norm": 2.270205020904541, "learning_rate": 4.492566881524568e-06, "loss": 0.1371, "mean_token_accuracy": 0.9573128700256348, "num_tokens": 69005804.0, "step": 8500 }, { "epoch": 4.796839729119639, "eval_entropy": 0.25907859206199646, "eval_loss": 0.06230160593986511, "eval_mean_token_accuracy": 0.9831352829933167, "eval_num_tokens": 69005804.0, "eval_runtime": 0.164, "eval_samples_per_second": 24.392, "eval_steps_per_second": 6.098, "step": 8500 }, { "entropy": 0.2256260484457016, "epoch": 4.799661399548532, "grad_norm": 2.126230001449585, "learning_rate": 4.491998208504151e-06, "loss": 0.1214, "mean_token_accuracy": 0.9609059691429138, "num_tokens": 69045948.0, "step": 8505 }, { "entropy": 0.2439708322286606, "epoch": 4.802483069977427, "grad_norm": 1.7869462966918945, "learning_rate": 4.491429266936759e-06, "loss": 0.1453, "mean_token_accuracy": 0.9530874967575074, "num_tokens": 69086575.0, "step": 8510 }, { "entropy": 0.23215862214565278, "epoch": 4.805304740406321, "grad_norm": 1.871141791343689, "learning_rate": 4.490860056934158e-06, "loss": 0.1164, "mean_token_accuracy": 0.9630860209465026, "num_tokens": 69127332.0, "step": 8515 }, { "entropy": 0.22940534651279448, "epoch": 4.808126410835214, "grad_norm": 1.7535682916641235, "learning_rate": 4.490290578608173e-06, "loss": 0.1133, "mean_token_accuracy": 0.9624628305435181, "num_tokens": 69167989.0, "step": 8520 }, { "entropy": 0.22979909479618071, "epoch": 4.810948081264108, "grad_norm": 2.0037307739257812, "learning_rate": 4.489720832070676e-06, "loss": 0.1249, "mean_token_accuracy": 0.9602430582046508, "num_tokens": 69208619.0, "step": 8525 }, { "entropy": 0.24260205030441284, "epoch": 4.813769751693002, "grad_norm": 2.209491491317749, "learning_rate": 4.489150817433594e-06, "loss": 0.1345, "mean_token_accuracy": 0.9570428252220153, "num_tokens": 69249329.0, "step": 8530 }, { "entropy": 0.22956629991531372, "epoch": 4.816591422121896, "grad_norm": 2.2682173252105713, "learning_rate": 4.488580534808908e-06, "loss": 0.1358, "mean_token_accuracy": 0.9560045003890991, "num_tokens": 69289846.0, "step": 8535 }, { "entropy": 0.2220306247472763, "epoch": 4.81941309255079, "grad_norm": 1.8845446109771729, "learning_rate": 4.488009984308647e-06, "loss": 0.1276, "mean_token_accuracy": 0.9602849364280701, "num_tokens": 69330609.0, "step": 8540 }, { "entropy": 0.25677892565727234, "epoch": 4.822234762979684, "grad_norm": 2.3458547592163086, "learning_rate": 4.487439166044898e-06, "loss": 0.1333, "mean_token_accuracy": 0.9575768232345581, "num_tokens": 69370952.0, "step": 8545 }, { "entropy": 0.2511105537414551, "epoch": 4.825056433408578, "grad_norm": 2.185471773147583, "learning_rate": 4.486868080129797e-06, "loss": 0.1403, "mean_token_accuracy": 0.9561097264289856, "num_tokens": 69411446.0, "step": 8550 }, { "entropy": 0.2550018668174744, "epoch": 4.827878103837472, "grad_norm": 2.0615882873535156, "learning_rate": 4.486296726675535e-06, "loss": 0.1344, "mean_token_accuracy": 0.957464849948883, "num_tokens": 69452055.0, "step": 8555 }, { "entropy": 0.2429851323366165, "epoch": 4.830699774266366, "grad_norm": 2.123836040496826, "learning_rate": 4.485725105794354e-06, "loss": 0.1285, "mean_token_accuracy": 0.9590805530548095, "num_tokens": 69492630.0, "step": 8560 }, { "entropy": 0.23657283782958985, "epoch": 4.8335214446952595, "grad_norm": 2.050199508666992, "learning_rate": 4.48515321759855e-06, "loss": 0.1305, "mean_token_accuracy": 0.9578537583351135, "num_tokens": 69533313.0, "step": 8565 }, { "entropy": 0.23308176994323732, "epoch": 4.836343115124153, "grad_norm": 1.9955555200576782, "learning_rate": 4.4845810622004685e-06, "loss": 0.1443, "mean_token_accuracy": 0.9529533624649048, "num_tokens": 69573940.0, "step": 8570 }, { "entropy": 0.23426572382450103, "epoch": 4.839164785553048, "grad_norm": 1.7304738759994507, "learning_rate": 4.484008639712511e-06, "loss": 0.1191, "mean_token_accuracy": 0.9620694041252136, "num_tokens": 69614517.0, "step": 8575 }, { "entropy": 0.2388251841068268, "epoch": 4.841986455981941, "grad_norm": 2.242614269256592, "learning_rate": 4.48343595024713e-06, "loss": 0.138, "mean_token_accuracy": 0.9573504686355591, "num_tokens": 69654972.0, "step": 8580 }, { "entropy": 0.2321117788553238, "epoch": 4.844808126410835, "grad_norm": 2.4217350482940674, "learning_rate": 4.482862993916829e-06, "loss": 0.1388, "mean_token_accuracy": 0.95455002784729, "num_tokens": 69695607.0, "step": 8585 }, { "entropy": 0.22771554291248322, "epoch": 4.847629796839729, "grad_norm": 1.7584290504455566, "learning_rate": 4.482289770834168e-06, "loss": 0.1239, "mean_token_accuracy": 0.9602065324783325, "num_tokens": 69736187.0, "step": 8590 }, { "entropy": 0.22587930858135224, "epoch": 4.850451467268623, "grad_norm": 2.119394302368164, "learning_rate": 4.481716281111753e-06, "loss": 0.1207, "mean_token_accuracy": 0.9609213948249817, "num_tokens": 69777050.0, "step": 8595 }, { "entropy": 0.23252309560775758, "epoch": 4.853273137697517, "grad_norm": 2.079322576522827, "learning_rate": 4.481142524862249e-06, "loss": 0.1269, "mean_token_accuracy": 0.9575224757194519, "num_tokens": 69817801.0, "step": 8600 }, { "entropy": 0.2291330635547638, "epoch": 4.856094808126411, "grad_norm": 2.186591148376465, "learning_rate": 4.48056850219837e-06, "loss": 0.1236, "mean_token_accuracy": 0.9605337023735047, "num_tokens": 69858177.0, "step": 8605 }, { "entropy": 0.22314091324806212, "epoch": 4.8589164785553045, "grad_norm": 1.9909532070159912, "learning_rate": 4.479994213232882e-06, "loss": 0.1238, "mean_token_accuracy": 0.9604300022125244, "num_tokens": 69898890.0, "step": 8610 }, { "entropy": 0.2294975697994232, "epoch": 4.861738148984198, "grad_norm": 2.1407694816589355, "learning_rate": 4.479419658078606e-06, "loss": 0.1272, "mean_token_accuracy": 0.9603155970573425, "num_tokens": 69939533.0, "step": 8615 }, { "entropy": 0.23732683360576629, "epoch": 4.864559819413093, "grad_norm": 2.215397357940674, "learning_rate": 4.478844836848411e-06, "loss": 0.1376, "mean_token_accuracy": 0.9563262820243835, "num_tokens": 69980394.0, "step": 8620 }, { "entropy": 0.24985795021057128, "epoch": 4.8673814898419865, "grad_norm": 1.9875785112380981, "learning_rate": 4.478269749655222e-06, "loss": 0.1414, "mean_token_accuracy": 0.954399836063385, "num_tokens": 70020932.0, "step": 8625 }, { "entropy": 0.23618102967739105, "epoch": 4.87020316027088, "grad_norm": 2.1521682739257812, "learning_rate": 4.477694396612014e-06, "loss": 0.124, "mean_token_accuracy": 0.9608348488807679, "num_tokens": 70061566.0, "step": 8630 }, { "entropy": 0.21555082201957704, "epoch": 4.873024830699774, "grad_norm": 1.909771203994751, "learning_rate": 4.477118777831817e-06, "loss": 0.1278, "mean_token_accuracy": 0.9597031474113464, "num_tokens": 70102195.0, "step": 8635 }, { "entropy": 0.24607637226581575, "epoch": 4.8758465011286685, "grad_norm": 2.6042673587799072, "learning_rate": 4.47654289342771e-06, "loss": 0.1404, "mean_token_accuracy": 0.9529093623161315, "num_tokens": 70142718.0, "step": 8640 }, { "entropy": 0.22525620758533477, "epoch": 4.878668171557562, "grad_norm": 2.1628029346466064, "learning_rate": 4.475966743512826e-06, "loss": 0.1384, "mean_token_accuracy": 0.9556032299995423, "num_tokens": 70183458.0, "step": 8645 }, { "entropy": 0.24763034284114838, "epoch": 4.881489841986456, "grad_norm": 2.244004726409912, "learning_rate": 4.47539032820035e-06, "loss": 0.1385, "mean_token_accuracy": 0.9558951020240783, "num_tokens": 70224283.0, "step": 8650 }, { "entropy": 0.23140210211277007, "epoch": 4.8843115124153496, "grad_norm": 2.290135145187378, "learning_rate": 4.474813647603518e-06, "loss": 0.1229, "mean_token_accuracy": 0.9592395067214966, "num_tokens": 70264983.0, "step": 8655 }, { "entropy": 0.2398304522037506, "epoch": 4.887133182844244, "grad_norm": 1.9016507863998413, "learning_rate": 4.4742367018356195e-06, "loss": 0.1311, "mean_token_accuracy": 0.95786372423172, "num_tokens": 70305649.0, "step": 8660 }, { "entropy": 0.23110878467559814, "epoch": 4.889954853273138, "grad_norm": 1.8025792837142944, "learning_rate": 4.4736594910099956e-06, "loss": 0.1405, "mean_token_accuracy": 0.9554929614067078, "num_tokens": 70345976.0, "step": 8665 }, { "entropy": 0.24155546426773072, "epoch": 4.8927765237020315, "grad_norm": 1.9929896593093872, "learning_rate": 4.47308201524004e-06, "loss": 0.1383, "mean_token_accuracy": 0.955091404914856, "num_tokens": 70386380.0, "step": 8670 }, { "entropy": 0.23273320198059083, "epoch": 4.895598194130925, "grad_norm": 1.964606523513794, "learning_rate": 4.4725042746391965e-06, "loss": 0.1267, "mean_token_accuracy": 0.9588501572608947, "num_tokens": 70427041.0, "step": 8675 }, { "entropy": 0.24253876507282257, "epoch": 4.89841986455982, "grad_norm": 2.12473464012146, "learning_rate": 4.471926269320963e-06, "loss": 0.1228, "mean_token_accuracy": 0.9616016149520874, "num_tokens": 70467666.0, "step": 8680 }, { "entropy": 0.23109547793865204, "epoch": 4.9012415349887135, "grad_norm": 2.302859306335449, "learning_rate": 4.471347999398888e-06, "loss": 0.1198, "mean_token_accuracy": 0.9608020305633544, "num_tokens": 70507748.0, "step": 8685 }, { "entropy": 0.21591301262378693, "epoch": 4.904063205417607, "grad_norm": 1.8348546028137207, "learning_rate": 4.4707694649865755e-06, "loss": 0.1206, "mean_token_accuracy": 0.9611794590950012, "num_tokens": 70548511.0, "step": 8690 }, { "entropy": 0.2398153394460678, "epoch": 4.906884875846501, "grad_norm": 1.7756059169769287, "learning_rate": 4.470190666197675e-06, "loss": 0.132, "mean_token_accuracy": 0.9583917617797851, "num_tokens": 70589210.0, "step": 8695 }, { "entropy": 0.24644698202610016, "epoch": 4.909706546275395, "grad_norm": 2.212986469268799, "learning_rate": 4.469611603145895e-06, "loss": 0.1325, "mean_token_accuracy": 0.957687258720398, "num_tokens": 70629759.0, "step": 8700 }, { "entropy": 0.2506899446249008, "epoch": 4.912528216704289, "grad_norm": 2.2517426013946533, "learning_rate": 4.469032275944989e-06, "loss": 0.1354, "mean_token_accuracy": 0.9561644673347474, "num_tokens": 70669828.0, "step": 8705 }, { "entropy": 0.26197082698345187, "epoch": 4.915349887133183, "grad_norm": 2.4998831748962402, "learning_rate": 4.468452684708769e-06, "loss": 0.1506, "mean_token_accuracy": 0.953766405582428, "num_tokens": 70710349.0, "step": 8710 }, { "entropy": 0.2373470038175583, "epoch": 4.918171557562077, "grad_norm": 2.084594249725342, "learning_rate": 4.467872829551093e-06, "loss": 0.1306, "mean_token_accuracy": 0.9590035080909729, "num_tokens": 70751075.0, "step": 8715 }, { "entropy": 0.23836889564990998, "epoch": 4.92099322799097, "grad_norm": 2.043854236602783, "learning_rate": 4.467292710585876e-06, "loss": 0.1263, "mean_token_accuracy": 0.9599133729934692, "num_tokens": 70791615.0, "step": 8720 }, { "entropy": 0.2358907401561737, "epoch": 4.923814898419865, "grad_norm": 1.9788299798965454, "learning_rate": 4.466712327927082e-06, "loss": 0.1422, "mean_token_accuracy": 0.9546040058135986, "num_tokens": 70832420.0, "step": 8725 }, { "entropy": 0.2312130182981491, "epoch": 4.926636568848759, "grad_norm": 2.282057762145996, "learning_rate": 4.466131681688725e-06, "loss": 0.1265, "mean_token_accuracy": 0.9591808557510376, "num_tokens": 70873131.0, "step": 8730 }, { "entropy": 0.22316355705261232, "epoch": 4.929458239277652, "grad_norm": 2.02055025100708, "learning_rate": 4.465550771984877e-06, "loss": 0.1187, "mean_token_accuracy": 0.9624357938766479, "num_tokens": 70913503.0, "step": 8735 }, { "entropy": 0.2304389774799347, "epoch": 4.932279909706546, "grad_norm": 2.3464314937591553, "learning_rate": 4.464969598929654e-06, "loss": 0.1205, "mean_token_accuracy": 0.9610739946365356, "num_tokens": 70954047.0, "step": 8740 }, { "entropy": 0.23224081397056578, "epoch": 4.9351015801354405, "grad_norm": 2.0249598026275635, "learning_rate": 4.4643881626372305e-06, "loss": 0.125, "mean_token_accuracy": 0.959325659275055, "num_tokens": 70994812.0, "step": 8745 }, { "entropy": 0.2500057280063629, "epoch": 4.937923250564334, "grad_norm": 2.1669914722442627, "learning_rate": 4.463806463221827e-06, "loss": 0.1378, "mean_token_accuracy": 0.9567291259765625, "num_tokens": 71035388.0, "step": 8750 }, { "entropy": 0.23122502863407135, "epoch": 4.940744920993228, "grad_norm": 2.2984166145324707, "learning_rate": 4.463224500797721e-06, "loss": 0.1396, "mean_token_accuracy": 0.954986846446991, "num_tokens": 71076225.0, "step": 8755 }, { "entropy": 0.22869625687599182, "epoch": 4.943566591422122, "grad_norm": 1.968353509902954, "learning_rate": 4.462642275479236e-06, "loss": 0.1187, "mean_token_accuracy": 0.9607104420661926, "num_tokens": 71116934.0, "step": 8760 }, { "entropy": 0.23740582168102264, "epoch": 4.946388261851016, "grad_norm": 1.954420566558838, "learning_rate": 4.462059787380754e-06, "loss": 0.1371, "mean_token_accuracy": 0.9562184929847717, "num_tokens": 71157332.0, "step": 8765 }, { "entropy": 0.23098691403865815, "epoch": 4.94920993227991, "grad_norm": 1.9516210556030273, "learning_rate": 4.461477036616702e-06, "loss": 0.1415, "mean_token_accuracy": 0.9550253391265869, "num_tokens": 71198170.0, "step": 8770 }, { "entropy": 0.22629740536212922, "epoch": 4.952031602708804, "grad_norm": 2.029778480529785, "learning_rate": 4.460894023301563e-06, "loss": 0.1159, "mean_token_accuracy": 0.9626461505889893, "num_tokens": 71238652.0, "step": 8775 }, { "entropy": 0.22563746571540833, "epoch": 4.954853273137697, "grad_norm": 2.0067238807678223, "learning_rate": 4.460310747549869e-06, "loss": 0.128, "mean_token_accuracy": 0.9585816025733948, "num_tokens": 71279449.0, "step": 8780 }, { "entropy": 0.264061963558197, "epoch": 4.957674943566591, "grad_norm": 2.42386794090271, "learning_rate": 4.459727209476205e-06, "loss": 0.1534, "mean_token_accuracy": 0.9512446284294128, "num_tokens": 71320179.0, "step": 8785 }, { "entropy": 0.2313307523727417, "epoch": 4.960496613995486, "grad_norm": 1.9760738611221313, "learning_rate": 4.459143409195208e-06, "loss": 0.1184, "mean_token_accuracy": 0.9614473700523376, "num_tokens": 71360778.0, "step": 8790 }, { "entropy": 0.23215781152248383, "epoch": 4.963318284424379, "grad_norm": 2.0897891521453857, "learning_rate": 4.458559346821564e-06, "loss": 0.1406, "mean_token_accuracy": 0.9561804056167602, "num_tokens": 71401477.0, "step": 8795 }, { "entropy": 0.2455916315317154, "epoch": 4.966139954853273, "grad_norm": 2.2576193809509277, "learning_rate": 4.457975022470013e-06, "loss": 0.139, "mean_token_accuracy": 0.9552298545837402, "num_tokens": 71442131.0, "step": 8800 }, { "entropy": 0.2560196042060852, "epoch": 4.968961625282167, "grad_norm": 2.23545503616333, "learning_rate": 4.457390436255345e-06, "loss": 0.1483, "mean_token_accuracy": 0.952830684185028, "num_tokens": 71482712.0, "step": 8805 }, { "entropy": 0.24611234068870544, "epoch": 4.971783295711061, "grad_norm": 1.8639657497406006, "learning_rate": 4.456805588292404e-06, "loss": 0.1302, "mean_token_accuracy": 0.9585993766784668, "num_tokens": 71523349.0, "step": 8810 }, { "entropy": 0.2496563047170639, "epoch": 4.974604966139955, "grad_norm": 2.166790723800659, "learning_rate": 4.456220478696081e-06, "loss": 0.1514, "mean_token_accuracy": 0.9514712452888489, "num_tokens": 71563712.0, "step": 8815 }, { "entropy": 0.23716452717781067, "epoch": 4.977426636568849, "grad_norm": 2.0670976638793945, "learning_rate": 4.455635107581322e-06, "loss": 0.1365, "mean_token_accuracy": 0.9566124558448792, "num_tokens": 71604371.0, "step": 8820 }, { "entropy": 0.21934570670127868, "epoch": 4.980248306997742, "grad_norm": 2.0929582118988037, "learning_rate": 4.455049475063124e-06, "loss": 0.1163, "mean_token_accuracy": 0.9618055701255799, "num_tokens": 71645153.0, "step": 8825 }, { "entropy": 0.24300400614738465, "epoch": 4.983069977426637, "grad_norm": 1.9957592487335205, "learning_rate": 4.4544635812565335e-06, "loss": 0.1322, "mean_token_accuracy": 0.9577077746391296, "num_tokens": 71685968.0, "step": 8830 }, { "entropy": 0.24642634391784668, "epoch": 4.985891647855531, "grad_norm": 1.812232494354248, "learning_rate": 4.453877426276649e-06, "loss": 0.1327, "mean_token_accuracy": 0.9573553204536438, "num_tokens": 71726713.0, "step": 8835 }, { "entropy": 0.24452163875102997, "epoch": 4.988713318284424, "grad_norm": 2.2379374504089355, "learning_rate": 4.4532910102386215e-06, "loss": 0.1368, "mean_token_accuracy": 0.9563169717788697, "num_tokens": 71767291.0, "step": 8840 }, { "entropy": 0.23888054192066194, "epoch": 4.991534988713318, "grad_norm": 2.2258286476135254, "learning_rate": 4.452704333257653e-06, "loss": 0.1338, "mean_token_accuracy": 0.9571861267089844, "num_tokens": 71807947.0, "step": 8845 }, { "entropy": 0.23455857038497924, "epoch": 4.994356659142213, "grad_norm": 2.2029316425323486, "learning_rate": 4.452117395448995e-06, "loss": 0.1321, "mean_token_accuracy": 0.9573328852653503, "num_tokens": 71848678.0, "step": 8850 }, { "entropy": 0.23345192968845369, "epoch": 4.997178329571106, "grad_norm": 2.2948131561279297, "learning_rate": 4.451530196927952e-06, "loss": 0.1239, "mean_token_accuracy": 0.9603603482246399, "num_tokens": 71889260.0, "step": 8855 }, { "entropy": 0.22622182071208954, "epoch": 5.0, "grad_norm": 3.6322450637817383, "learning_rate": 4.45094273780988e-06, "loss": 0.1305, "mean_token_accuracy": 0.9572938919067383, "num_tokens": 71923845.0, "step": 8860 }, { "entropy": 0.22580887079238893, "epoch": 5.002821670428894, "grad_norm": 1.672566294670105, "learning_rate": 4.450355018210185e-06, "loss": 0.0748, "mean_token_accuracy": 0.9793183445930481, "num_tokens": 71964397.0, "step": 8865 }, { "entropy": 0.1980500638484955, "epoch": 5.005643340857787, "grad_norm": 1.7135627269744873, "learning_rate": 4.4497670382443235e-06, "loss": 0.0595, "mean_token_accuracy": 0.9829611539840698, "num_tokens": 72004845.0, "step": 8870 }, { "entropy": 0.19046223759651185, "epoch": 5.008465011286682, "grad_norm": 1.7195278406143188, "learning_rate": 4.449178798027806e-06, "loss": 0.0699, "mean_token_accuracy": 0.9804732322692871, "num_tokens": 72045486.0, "step": 8875 }, { "entropy": 0.1900404006242752, "epoch": 5.011286681715576, "grad_norm": 1.8688246011734009, "learning_rate": 4.4485902976761905e-06, "loss": 0.0699, "mean_token_accuracy": 0.9789183497428894, "num_tokens": 72086189.0, "step": 8880 }, { "entropy": 0.17116313576698303, "epoch": 5.014108352144469, "grad_norm": 2.521716833114624, "learning_rate": 4.44800153730509e-06, "loss": 0.0676, "mean_token_accuracy": 0.979660964012146, "num_tokens": 72126926.0, "step": 8885 }, { "entropy": 0.1692497670650482, "epoch": 5.016930022573363, "grad_norm": 2.2940783500671387, "learning_rate": 4.447412517030165e-06, "loss": 0.0638, "mean_token_accuracy": 0.980719518661499, "num_tokens": 72167664.0, "step": 8890 }, { "entropy": 0.18492531478405, "epoch": 5.019751693002258, "grad_norm": 2.423424005508423, "learning_rate": 4.446823236967129e-06, "loss": 0.086, "mean_token_accuracy": 0.9749364614486694, "num_tokens": 72208162.0, "step": 8895 }, { "entropy": 0.1805371016263962, "epoch": 5.022573363431151, "grad_norm": 2.1918728351593018, "learning_rate": 4.446233697231747e-06, "loss": 0.0717, "mean_token_accuracy": 0.9780835628509521, "num_tokens": 72248751.0, "step": 8900 }, { "entropy": 0.19484019577503203, "epoch": 5.025395033860045, "grad_norm": 2.126352071762085, "learning_rate": 4.445643897939832e-06, "loss": 0.0653, "mean_token_accuracy": 0.9807442188262939, "num_tokens": 72289553.0, "step": 8905 }, { "entropy": 0.16935117840766906, "epoch": 5.028216704288939, "grad_norm": 1.7222833633422852, "learning_rate": 4.445053839207252e-06, "loss": 0.0582, "mean_token_accuracy": 0.9815907716751099, "num_tokens": 72330231.0, "step": 8910 }, { "entropy": 0.17409595847129822, "epoch": 5.031038374717833, "grad_norm": 1.9635859727859497, "learning_rate": 4.4444635211499245e-06, "loss": 0.0644, "mean_token_accuracy": 0.9801131606101989, "num_tokens": 72370964.0, "step": 8915 }, { "entropy": 0.18664040863513948, "epoch": 5.033860045146727, "grad_norm": 2.004497528076172, "learning_rate": 4.443872943883817e-06, "loss": 0.0686, "mean_token_accuracy": 0.9801067471504211, "num_tokens": 72411632.0, "step": 8920 }, { "entropy": 0.19012978672981262, "epoch": 5.036681715575621, "grad_norm": 2.1313867568969727, "learning_rate": 4.443282107524947e-06, "loss": 0.0779, "mean_token_accuracy": 0.9768623113632202, "num_tokens": 72452367.0, "step": 8925 }, { "entropy": 0.18184551000595092, "epoch": 5.039503386004514, "grad_norm": 2.076591730117798, "learning_rate": 4.442691012189386e-06, "loss": 0.0734, "mean_token_accuracy": 0.9780202388763428, "num_tokens": 72493043.0, "step": 8930 }, { "entropy": 0.17082217931747437, "epoch": 5.042325056433409, "grad_norm": 1.6795393228530884, "learning_rate": 4.4420996579932555e-06, "loss": 0.0697, "mean_token_accuracy": 0.9782551288604736, "num_tokens": 72533217.0, "step": 8935 }, { "entropy": 0.19154114723205568, "epoch": 5.045146726862303, "grad_norm": 1.8840192556381226, "learning_rate": 4.4415080450527244e-06, "loss": 0.0783, "mean_token_accuracy": 0.9762571692466736, "num_tokens": 72573491.0, "step": 8940 }, { "entropy": 0.1921801507472992, "epoch": 5.047968397291196, "grad_norm": 2.051151752471924, "learning_rate": 4.440916173484018e-06, "loss": 0.0735, "mean_token_accuracy": 0.9775419950485229, "num_tokens": 72614111.0, "step": 8945 }, { "entropy": 0.171154123544693, "epoch": 5.05079006772009, "grad_norm": 1.7416431903839111, "learning_rate": 4.440324043403408e-06, "loss": 0.0562, "mean_token_accuracy": 0.982762086391449, "num_tokens": 72654942.0, "step": 8950 }, { "entropy": 0.1801618218421936, "epoch": 5.053611738148984, "grad_norm": 2.1165449619293213, "learning_rate": 4.439731654927218e-06, "loss": 0.0651, "mean_token_accuracy": 0.9803321242332459, "num_tokens": 72695401.0, "step": 8955 }, { "entropy": 0.17029935419559478, "epoch": 5.056433408577878, "grad_norm": 2.3758440017700195, "learning_rate": 4.439139008171824e-06, "loss": 0.0691, "mean_token_accuracy": 0.9787796020507813, "num_tokens": 72736277.0, "step": 8960 }, { "entropy": 0.1681983232498169, "epoch": 5.059255079006772, "grad_norm": 1.9630498886108398, "learning_rate": 4.43854610325365e-06, "loss": 0.0617, "mean_token_accuracy": 0.981269919872284, "num_tokens": 72777155.0, "step": 8965 }, { "entropy": 0.20426134169101715, "epoch": 5.062076749435666, "grad_norm": 2.2527589797973633, "learning_rate": 4.437952940289175e-06, "loss": 0.0815, "mean_token_accuracy": 0.9752603888511657, "num_tokens": 72817863.0, "step": 8970 }, { "entropy": 0.17029196619987488, "epoch": 5.0648984198645595, "grad_norm": 1.9497177600860596, "learning_rate": 4.437359519394923e-06, "loss": 0.0664, "mean_token_accuracy": 0.9784554839134216, "num_tokens": 72858530.0, "step": 8975 }, { "entropy": 0.18146368563175203, "epoch": 5.067720090293454, "grad_norm": 1.7516709566116333, "learning_rate": 4.436765840687473e-06, "loss": 0.0697, "mean_token_accuracy": 0.9783901810646057, "num_tokens": 72899175.0, "step": 8980 }, { "entropy": 0.17964912950992584, "epoch": 5.070541760722348, "grad_norm": 1.487107515335083, "learning_rate": 4.4361719042834525e-06, "loss": 0.0595, "mean_token_accuracy": 0.9815647959709167, "num_tokens": 72939846.0, "step": 8985 }, { "entropy": 0.1833242356777191, "epoch": 5.073363431151241, "grad_norm": 2.1468114852905273, "learning_rate": 4.435577710299542e-06, "loss": 0.0569, "mean_token_accuracy": 0.982574725151062, "num_tokens": 72979909.0, "step": 8990 }, { "entropy": 0.1870110362768173, "epoch": 5.076185101580135, "grad_norm": 1.998921513557434, "learning_rate": 4.43498325885247e-06, "loss": 0.0646, "mean_token_accuracy": 0.9806098222732544, "num_tokens": 73020545.0, "step": 8995 }, { "entropy": 0.18318285942077636, "epoch": 5.07900677200903, "grad_norm": 1.859528660774231, "learning_rate": 4.434388550059016e-06, "loss": 0.0739, "mean_token_accuracy": 0.9773783922195435, "num_tokens": 73061009.0, "step": 9000 }, { "epoch": 5.07900677200903, "eval_entropy": 0.23515231907367706, "eval_loss": 0.055877141654491425, "eval_mean_token_accuracy": 0.9827520251274109, "eval_num_tokens": 73061009.0, "eval_runtime": 0.1636, "eval_samples_per_second": 24.455, "eval_steps_per_second": 6.114, "step": 9000 }, { "entropy": 0.1605137288570404, "epoch": 5.081828442437923, "grad_norm": 2.0149075984954834, "learning_rate": 4.433793584036011e-06, "loss": 0.0591, "mean_token_accuracy": 0.9814065933227539, "num_tokens": 73101615.0, "step": 9005 }, { "entropy": 0.19753072261810303, "epoch": 5.084650112866817, "grad_norm": 1.9284378290176392, "learning_rate": 4.433198360900337e-06, "loss": 0.0738, "mean_token_accuracy": 0.9784594297409057, "num_tokens": 73142398.0, "step": 9010 }, { "entropy": 0.18846041858196258, "epoch": 5.087471783295711, "grad_norm": 2.1237173080444336, "learning_rate": 4.432602880768925e-06, "loss": 0.076, "mean_token_accuracy": 0.9769237160682678, "num_tokens": 73183058.0, "step": 9015 }, { "entropy": 0.1911979854106903, "epoch": 5.090293453724605, "grad_norm": 1.8477128744125366, "learning_rate": 4.4320071437587554e-06, "loss": 0.0704, "mean_token_accuracy": 0.9788993120193481, "num_tokens": 73223530.0, "step": 9020 }, { "entropy": 0.19110194444656373, "epoch": 5.093115124153499, "grad_norm": 2.072406053543091, "learning_rate": 4.431411149986865e-06, "loss": 0.074, "mean_token_accuracy": 0.9778715133666992, "num_tokens": 73264278.0, "step": 9025 }, { "entropy": 0.18987910449504852, "epoch": 5.095936794582393, "grad_norm": 1.7068977355957031, "learning_rate": 4.430814899570333e-06, "loss": 0.0745, "mean_token_accuracy": 0.9774025797843933, "num_tokens": 73304905.0, "step": 9030 }, { "entropy": 0.18311900794506072, "epoch": 5.0987584650112865, "grad_norm": 2.1360621452331543, "learning_rate": 4.430218392626295e-06, "loss": 0.065, "mean_token_accuracy": 0.9798977851867676, "num_tokens": 73345687.0, "step": 9035 }, { "entropy": 0.17857904732227325, "epoch": 5.10158013544018, "grad_norm": 1.964221477508545, "learning_rate": 4.429621629271933e-06, "loss": 0.0628, "mean_token_accuracy": 0.980586564540863, "num_tokens": 73386265.0, "step": 9040 }, { "entropy": 0.17305072247982026, "epoch": 5.104401805869075, "grad_norm": 2.037114143371582, "learning_rate": 4.429024609624482e-06, "loss": 0.0636, "mean_token_accuracy": 0.9802641272544861, "num_tokens": 73427151.0, "step": 9045 }, { "entropy": 0.18654557764530183, "epoch": 5.1072234762979685, "grad_norm": 2.1122262477874756, "learning_rate": 4.428427333801228e-06, "loss": 0.0626, "mean_token_accuracy": 0.9806604266166687, "num_tokens": 73467943.0, "step": 9050 }, { "entropy": 0.1888034909963608, "epoch": 5.110045146726862, "grad_norm": 2.032395124435425, "learning_rate": 4.4278298019195044e-06, "loss": 0.0677, "mean_token_accuracy": 0.9790974020957947, "num_tokens": 73508586.0, "step": 9055 }, { "entropy": 0.201117542386055, "epoch": 5.112866817155756, "grad_norm": 2.6121714115142822, "learning_rate": 4.4272320140966965e-06, "loss": 0.0789, "mean_token_accuracy": 0.9769583702087402, "num_tokens": 73549036.0, "step": 9060 }, { "entropy": 0.17489383816719056, "epoch": 5.1156884875846504, "grad_norm": 2.1628427505493164, "learning_rate": 4.4266339704502415e-06, "loss": 0.0704, "mean_token_accuracy": 0.9784565687179565, "num_tokens": 73589624.0, "step": 9065 }, { "entropy": 0.17749419510364534, "epoch": 5.118510158013544, "grad_norm": 2.2119405269622803, "learning_rate": 4.426035671097623e-06, "loss": 0.0672, "mean_token_accuracy": 0.980502438545227, "num_tokens": 73630265.0, "step": 9070 }, { "entropy": 0.17612397372722627, "epoch": 5.121331828442438, "grad_norm": 1.97671377658844, "learning_rate": 4.425437116156377e-06, "loss": 0.0741, "mean_token_accuracy": 0.9769485712051391, "num_tokens": 73670966.0, "step": 9075 }, { "entropy": 0.1710589587688446, "epoch": 5.1241534988713315, "grad_norm": 2.027311325073242, "learning_rate": 4.424838305744091e-06, "loss": 0.0636, "mean_token_accuracy": 0.9812980055809021, "num_tokens": 73711495.0, "step": 9080 }, { "entropy": 0.16620054244995117, "epoch": 5.126975169300226, "grad_norm": 2.079716920852661, "learning_rate": 4.4242392399784015e-06, "loss": 0.0599, "mean_token_accuracy": 0.9820851564407349, "num_tokens": 73751986.0, "step": 9085 }, { "entropy": 0.19693105220794677, "epoch": 5.12979683972912, "grad_norm": 1.8494852781295776, "learning_rate": 4.423639918976994e-06, "loss": 0.0841, "mean_token_accuracy": 0.9751617550849915, "num_tokens": 73792696.0, "step": 9090 }, { "entropy": 0.16846722960472107, "epoch": 5.1326185101580135, "grad_norm": 2.0323588848114014, "learning_rate": 4.4230403428576055e-06, "loss": 0.0576, "mean_token_accuracy": 0.983081865310669, "num_tokens": 73833479.0, "step": 9095 }, { "entropy": 0.18227221965789794, "epoch": 5.135440180586907, "grad_norm": 1.713901400566101, "learning_rate": 4.4224405117380235e-06, "loss": 0.0722, "mean_token_accuracy": 0.9784926295280456, "num_tokens": 73874141.0, "step": 9100 }, { "entropy": 0.17101970911026002, "epoch": 5.138261851015802, "grad_norm": 1.8212043046951294, "learning_rate": 4.421840425736084e-06, "loss": 0.0694, "mean_token_accuracy": 0.9785187363624572, "num_tokens": 73914676.0, "step": 9105 }, { "entropy": 0.18568507134914397, "epoch": 5.1410835214446955, "grad_norm": 2.0307278633117676, "learning_rate": 4.421240084969673e-06, "loss": 0.0681, "mean_token_accuracy": 0.9798570513725281, "num_tokens": 73955243.0, "step": 9110 }, { "entropy": 0.18314568400382997, "epoch": 5.143905191873589, "grad_norm": 2.26824951171875, "learning_rate": 4.42063948955673e-06, "loss": 0.0819, "mean_token_accuracy": 0.9749473333358765, "num_tokens": 73996064.0, "step": 9115 }, { "entropy": 0.18159986436367034, "epoch": 5.146726862302483, "grad_norm": 2.368440866470337, "learning_rate": 4.420038639615241e-06, "loss": 0.0759, "mean_token_accuracy": 0.97624671459198, "num_tokens": 74036627.0, "step": 9120 }, { "entropy": 0.18119294345378875, "epoch": 5.149548532731377, "grad_norm": 1.908693552017212, "learning_rate": 4.419437535263243e-06, "loss": 0.067, "mean_token_accuracy": 0.9788689136505127, "num_tokens": 74077307.0, "step": 9125 }, { "entropy": 0.18659307360649108, "epoch": 5.152370203160271, "grad_norm": 1.8129174709320068, "learning_rate": 4.418836176618823e-06, "loss": 0.0727, "mean_token_accuracy": 0.9778983354568481, "num_tokens": 74117925.0, "step": 9130 }, { "entropy": 0.19027889668941497, "epoch": 5.155191873589165, "grad_norm": 1.8075789213180542, "learning_rate": 4.418234563800117e-06, "loss": 0.0733, "mean_token_accuracy": 0.9768654942512512, "num_tokens": 74158594.0, "step": 9135 }, { "entropy": 0.17420806884765624, "epoch": 5.158013544018059, "grad_norm": 2.1815521717071533, "learning_rate": 4.417632696925314e-06, "loss": 0.0643, "mean_token_accuracy": 0.980584466457367, "num_tokens": 74199375.0, "step": 9140 }, { "entropy": 0.19676790237426758, "epoch": 5.160835214446952, "grad_norm": 2.2495217323303223, "learning_rate": 4.417030576112649e-06, "loss": 0.0635, "mean_token_accuracy": 0.9816282868385315, "num_tokens": 74239922.0, "step": 9145 }, { "entropy": 0.17044782042503356, "epoch": 5.163656884875847, "grad_norm": 2.1400983333587646, "learning_rate": 4.416428201480409e-06, "loss": 0.0741, "mean_token_accuracy": 0.9771974563598633, "num_tokens": 74280490.0, "step": 9150 }, { "entropy": 0.17132185995578766, "epoch": 5.1664785553047405, "grad_norm": 1.5042531490325928, "learning_rate": 4.415825573146931e-06, "loss": 0.066, "mean_token_accuracy": 0.9790427088737488, "num_tokens": 74321233.0, "step": 9155 }, { "entropy": 0.1608135759830475, "epoch": 5.169300225733634, "grad_norm": 1.817196011543274, "learning_rate": 4.415222691230602e-06, "loss": 0.0584, "mean_token_accuracy": 0.9813137888908386, "num_tokens": 74361867.0, "step": 9160 }, { "entropy": 0.19295158088207245, "epoch": 5.172121896162528, "grad_norm": 2.3909380435943604, "learning_rate": 4.414619555849857e-06, "loss": 0.0688, "mean_token_accuracy": 0.9787463188171387, "num_tokens": 74402471.0, "step": 9165 }, { "entropy": 0.1939996987581253, "epoch": 5.1749435665914225, "grad_norm": 2.0213077068328857, "learning_rate": 4.414016167123183e-06, "loss": 0.0715, "mean_token_accuracy": 0.978550124168396, "num_tokens": 74442930.0, "step": 9170 }, { "entropy": 0.18027763068675995, "epoch": 5.177765237020316, "grad_norm": 1.568903923034668, "learning_rate": 4.413412525169115e-06, "loss": 0.0628, "mean_token_accuracy": 0.9806607604026795, "num_tokens": 74483369.0, "step": 9175 }, { "entropy": 0.17010557353496553, "epoch": 5.18058690744921, "grad_norm": 2.0850884914398193, "learning_rate": 4.4128086301062405e-06, "loss": 0.0652, "mean_token_accuracy": 0.9795267939567566, "num_tokens": 74523998.0, "step": 9180 }, { "entropy": 0.1618386447429657, "epoch": 5.183408577878104, "grad_norm": 2.156888484954834, "learning_rate": 4.412204482053191e-06, "loss": 0.0653, "mean_token_accuracy": 0.9797653436660767, "num_tokens": 74564676.0, "step": 9185 }, { "entropy": 0.1769125282764435, "epoch": 5.186230248306998, "grad_norm": 1.982959270477295, "learning_rate": 4.411600081128655e-06, "loss": 0.0692, "mean_token_accuracy": 0.9781957626342773, "num_tokens": 74605459.0, "step": 9190 }, { "entropy": 0.1831894338130951, "epoch": 5.189051918735892, "grad_norm": 2.3006227016448975, "learning_rate": 4.410995427451365e-06, "loss": 0.0659, "mean_token_accuracy": 0.9792438864707946, "num_tokens": 74646245.0, "step": 9195 }, { "entropy": 0.16881802082061767, "epoch": 5.191873589164786, "grad_norm": 1.7162789106369019, "learning_rate": 4.410390521140107e-06, "loss": 0.0598, "mean_token_accuracy": 0.9816855788230896, "num_tokens": 74686744.0, "step": 9200 }, { "entropy": 0.17817852199077605, "epoch": 5.194695259593679, "grad_norm": 1.815919041633606, "learning_rate": 4.409785362313714e-06, "loss": 0.0659, "mean_token_accuracy": 0.9804519295692444, "num_tokens": 74727229.0, "step": 9205 }, { "entropy": 0.16955990195274354, "epoch": 5.197516930022573, "grad_norm": 1.5127193927764893, "learning_rate": 4.409179951091069e-06, "loss": 0.0704, "mean_token_accuracy": 0.9778647541999816, "num_tokens": 74767941.0, "step": 9210 }, { "entropy": 0.17718103229999543, "epoch": 5.200338600451468, "grad_norm": 1.9324251413345337, "learning_rate": 4.408574287591105e-06, "loss": 0.0684, "mean_token_accuracy": 0.9797194123268127, "num_tokens": 74808629.0, "step": 9215 }, { "entropy": 0.18118757009506226, "epoch": 5.203160270880361, "grad_norm": 1.6411625146865845, "learning_rate": 4.407968371932807e-06, "loss": 0.0668, "mean_token_accuracy": 0.9800692081451416, "num_tokens": 74849290.0, "step": 9220 }, { "entropy": 0.16765645444393157, "epoch": 5.205981941309255, "grad_norm": 2.054521322250366, "learning_rate": 4.407362204235205e-06, "loss": 0.0622, "mean_token_accuracy": 0.9807947397232055, "num_tokens": 74889988.0, "step": 9225 }, { "entropy": 0.18616848886013032, "epoch": 5.208803611738149, "grad_norm": 2.160146474838257, "learning_rate": 4.40675578461738e-06, "loss": 0.0672, "mean_token_accuracy": 0.9792033910751343, "num_tokens": 74930594.0, "step": 9230 }, { "entropy": 0.17592273354530336, "epoch": 5.211625282167043, "grad_norm": 2.01505708694458, "learning_rate": 4.4061491131984655e-06, "loss": 0.0708, "mean_token_accuracy": 0.9776349067687988, "num_tokens": 74971295.0, "step": 9235 }, { "entropy": 0.16667756140232087, "epoch": 5.214446952595937, "grad_norm": 1.9631502628326416, "learning_rate": 4.405542190097641e-06, "loss": 0.0639, "mean_token_accuracy": 0.9816940546035766, "num_tokens": 75011986.0, "step": 9240 }, { "entropy": 0.17029538750648499, "epoch": 5.217268623024831, "grad_norm": 1.7982895374298096, "learning_rate": 4.404935015434138e-06, "loss": 0.0641, "mean_token_accuracy": 0.9802758932113648, "num_tokens": 75052805.0, "step": 9245 }, { "entropy": 0.172264164686203, "epoch": 5.220090293453724, "grad_norm": 2.2566871643066406, "learning_rate": 4.404327589327234e-06, "loss": 0.0668, "mean_token_accuracy": 0.9802647709846497, "num_tokens": 75093268.0, "step": 9250 }, { "entropy": 0.16985282599925994, "epoch": 5.222911963882619, "grad_norm": 1.5910284519195557, "learning_rate": 4.403719911896258e-06, "loss": 0.0634, "mean_token_accuracy": 0.9810275197029114, "num_tokens": 75134071.0, "step": 9255 }, { "entropy": 0.19104610085487367, "epoch": 5.225733634311513, "grad_norm": 2.561847448348999, "learning_rate": 4.40311198326059e-06, "loss": 0.0704, "mean_token_accuracy": 0.9786494374275208, "num_tokens": 75174864.0, "step": 9260 }, { "entropy": 0.19318519532680511, "epoch": 5.228555304740406, "grad_norm": 1.789509654045105, "learning_rate": 4.402503803539656e-06, "loss": 0.0748, "mean_token_accuracy": 0.9768137574195862, "num_tokens": 75215612.0, "step": 9265 }, { "entropy": 0.1831961005926132, "epoch": 5.2313769751693, "grad_norm": 1.7263920307159424, "learning_rate": 4.401895372852935e-06, "loss": 0.0658, "mean_token_accuracy": 0.9797954320907593, "num_tokens": 75255813.0, "step": 9270 }, { "entropy": 0.16736743450164795, "epoch": 5.234198645598195, "grad_norm": 1.8194705247879028, "learning_rate": 4.401286691319951e-06, "loss": 0.0677, "mean_token_accuracy": 0.9790467739105224, "num_tokens": 75296532.0, "step": 9275 }, { "entropy": 0.17123483419418334, "epoch": 5.237020316027088, "grad_norm": 1.8370773792266846, "learning_rate": 4.40067775906028e-06, "loss": 0.0634, "mean_token_accuracy": 0.9816179037094116, "num_tokens": 75337252.0, "step": 9280 }, { "entropy": 0.18256573975086213, "epoch": 5.239841986455982, "grad_norm": 1.8240233659744263, "learning_rate": 4.400068576193549e-06, "loss": 0.0654, "mean_token_accuracy": 0.9805835843086242, "num_tokens": 75377833.0, "step": 9285 }, { "entropy": 0.19250957071781158, "epoch": 5.242663656884876, "grad_norm": 1.8019994497299194, "learning_rate": 4.399459142839429e-06, "loss": 0.0775, "mean_token_accuracy": 0.9766400456428528, "num_tokens": 75418751.0, "step": 9290 }, { "entropy": 0.204935023188591, "epoch": 5.245485327313769, "grad_norm": 2.1010115146636963, "learning_rate": 4.398849459117645e-06, "loss": 0.0687, "mean_token_accuracy": 0.9798161029815674, "num_tokens": 75459490.0, "step": 9295 }, { "entropy": 0.1841699182987213, "epoch": 5.248306997742664, "grad_norm": 1.935957431793213, "learning_rate": 4.3982395251479705e-06, "loss": 0.066, "mean_token_accuracy": 0.9796889901161194, "num_tokens": 75500096.0, "step": 9300 }, { "entropy": 0.16842667162418365, "epoch": 5.251128668171558, "grad_norm": 2.501762866973877, "learning_rate": 4.3976293410502245e-06, "loss": 0.0566, "mean_token_accuracy": 0.9825867414474487, "num_tokens": 75540820.0, "step": 9305 }, { "entropy": 0.18842605650424957, "epoch": 5.253950338600451, "grad_norm": 1.8708136081695557, "learning_rate": 4.397018906944279e-06, "loss": 0.0661, "mean_token_accuracy": 0.9798398852348328, "num_tokens": 75581478.0, "step": 9310 }, { "entropy": 0.17341751158237456, "epoch": 5.256772009029345, "grad_norm": 1.717519998550415, "learning_rate": 4.3964082229500545e-06, "loss": 0.0754, "mean_token_accuracy": 0.977244210243225, "num_tokens": 75622057.0, "step": 9315 }, { "entropy": 0.17311387956142427, "epoch": 5.25959367945824, "grad_norm": 2.0607640743255615, "learning_rate": 4.39579728918752e-06, "loss": 0.0712, "mean_token_accuracy": 0.977792501449585, "num_tokens": 75662674.0, "step": 9320 }, { "entropy": 0.17308418452739716, "epoch": 5.262415349887133, "grad_norm": 1.5914183855056763, "learning_rate": 4.395186105776691e-06, "loss": 0.0619, "mean_token_accuracy": 0.9809646487236023, "num_tokens": 75703277.0, "step": 9325 }, { "entropy": 0.15740045607089997, "epoch": 5.265237020316027, "grad_norm": 2.155003547668457, "learning_rate": 4.394574672837637e-06, "loss": 0.0599, "mean_token_accuracy": 0.9820808887481689, "num_tokens": 75743221.0, "step": 9330 }, { "entropy": 0.17217525243759155, "epoch": 5.268058690744921, "grad_norm": 2.087221384048462, "learning_rate": 4.393962990490475e-06, "loss": 0.0632, "mean_token_accuracy": 0.9806132674217224, "num_tokens": 75783998.0, "step": 9335 }, { "entropy": 0.1740059047937393, "epoch": 5.270880361173815, "grad_norm": 2.0336315631866455, "learning_rate": 4.393351058855366e-06, "loss": 0.0739, "mean_token_accuracy": 0.9778852939605713, "num_tokens": 75824530.0, "step": 9340 }, { "entropy": 0.19505321085453034, "epoch": 5.273702031602709, "grad_norm": 2.201986074447632, "learning_rate": 4.392738878052528e-06, "loss": 0.071, "mean_token_accuracy": 0.9772972106933594, "num_tokens": 75865197.0, "step": 9345 }, { "entropy": 0.17578763961791993, "epoch": 5.276523702031603, "grad_norm": 1.827558159828186, "learning_rate": 4.392126448202223e-06, "loss": 0.0617, "mean_token_accuracy": 0.9813929557800293, "num_tokens": 75905554.0, "step": 9350 }, { "entropy": 0.16351258158683776, "epoch": 5.279345372460496, "grad_norm": 2.258183479309082, "learning_rate": 4.391513769424762e-06, "loss": 0.0643, "mean_token_accuracy": 0.9805266618728637, "num_tokens": 75946320.0, "step": 9355 }, { "entropy": 0.17927370667457582, "epoch": 5.282167042889391, "grad_norm": 2.4332005977630615, "learning_rate": 4.390900841840506e-06, "loss": 0.0721, "mean_token_accuracy": 0.9778699398040771, "num_tokens": 75987141.0, "step": 9360 }, { "entropy": 0.16708399951457978, "epoch": 5.284988713318285, "grad_norm": 1.8501638174057007, "learning_rate": 4.3902876655698666e-06, "loss": 0.0672, "mean_token_accuracy": 0.978826928138733, "num_tokens": 76027705.0, "step": 9365 }, { "entropy": 0.18877400755882262, "epoch": 5.287810383747178, "grad_norm": 1.798592448234558, "learning_rate": 4.3896742407332995e-06, "loss": 0.0649, "mean_token_accuracy": 0.9791201710700989, "num_tokens": 76068472.0, "step": 9370 }, { "entropy": 0.1839359939098358, "epoch": 5.290632054176072, "grad_norm": 2.170732259750366, "learning_rate": 4.389060567451313e-06, "loss": 0.0688, "mean_token_accuracy": 0.9787888526916504, "num_tokens": 76109233.0, "step": 9375 }, { "entropy": 0.1790493279695511, "epoch": 5.293453724604966, "grad_norm": 1.9541733264923096, "learning_rate": 4.388446645844465e-06, "loss": 0.0708, "mean_token_accuracy": 0.9779038906097413, "num_tokens": 76149974.0, "step": 9380 }, { "entropy": 0.17843609154224396, "epoch": 5.29627539503386, "grad_norm": 1.8375680446624756, "learning_rate": 4.387832476033358e-06, "loss": 0.0692, "mean_token_accuracy": 0.9785267472267151, "num_tokens": 76190641.0, "step": 9385 }, { "entropy": 0.17534771859645842, "epoch": 5.299097065462754, "grad_norm": 2.224586009979248, "learning_rate": 4.3872180581386485e-06, "loss": 0.0762, "mean_token_accuracy": 0.9763685941696167, "num_tokens": 76231345.0, "step": 9390 }, { "entropy": 0.17150465846061708, "epoch": 5.301918735891648, "grad_norm": 2.095118999481201, "learning_rate": 4.3866033922810355e-06, "loss": 0.0719, "mean_token_accuracy": 0.9783193111419678, "num_tokens": 76272069.0, "step": 9395 }, { "entropy": 0.18151159584522247, "epoch": 5.3047404063205414, "grad_norm": 2.1895596981048584, "learning_rate": 4.385988478581274e-06, "loss": 0.0718, "mean_token_accuracy": 0.9773975610733032, "num_tokens": 76312608.0, "step": 9400 }, { "entropy": 0.19453785419464112, "epoch": 5.307562076749436, "grad_norm": 2.0922398567199707, "learning_rate": 4.38537331716016e-06, "loss": 0.0709, "mean_token_accuracy": 0.9785321354866028, "num_tokens": 76353353.0, "step": 9405 }, { "entropy": 0.18575883209705352, "epoch": 5.31038374717833, "grad_norm": 3.6228082180023193, "learning_rate": 4.384757908138545e-06, "loss": 0.0727, "mean_token_accuracy": 0.9786053776741028, "num_tokens": 76394028.0, "step": 9410 }, { "entropy": 0.18422328531742097, "epoch": 5.313205417607223, "grad_norm": 1.974879503250122, "learning_rate": 4.3841422516373255e-06, "loss": 0.0632, "mean_token_accuracy": 0.9804863810539246, "num_tokens": 76434694.0, "step": 9415 }, { "entropy": 0.17816999256610871, "epoch": 5.316027088036117, "grad_norm": 1.6818512678146362, "learning_rate": 4.383526347777446e-06, "loss": 0.0684, "mean_token_accuracy": 0.9786510825157165, "num_tokens": 76475096.0, "step": 9420 }, { "entropy": 0.18150631487369537, "epoch": 5.318848758465011, "grad_norm": 1.903784990310669, "learning_rate": 4.3829101966799025e-06, "loss": 0.0732, "mean_token_accuracy": 0.9780114173889161, "num_tokens": 76515735.0, "step": 9425 }, { "entropy": 0.17466551959514617, "epoch": 5.321670428893905, "grad_norm": 1.8316190242767334, "learning_rate": 4.382293798465738e-06, "loss": 0.0729, "mean_token_accuracy": 0.9783891916275025, "num_tokens": 76556187.0, "step": 9430 }, { "entropy": 0.1856124997138977, "epoch": 5.324492099322799, "grad_norm": 2.0181853771209717, "learning_rate": 4.381677153256042e-06, "loss": 0.077, "mean_token_accuracy": 0.9763926148414612, "num_tokens": 76596840.0, "step": 9435 }, { "entropy": 0.16592672169208528, "epoch": 5.327313769751693, "grad_norm": 2.4706497192382812, "learning_rate": 4.381060261171956e-06, "loss": 0.0765, "mean_token_accuracy": 0.9765696048736572, "num_tokens": 76637424.0, "step": 9440 }, { "entropy": 0.17519499063491822, "epoch": 5.3301354401805865, "grad_norm": 1.9613145589828491, "learning_rate": 4.38044312233467e-06, "loss": 0.0649, "mean_token_accuracy": 0.9802795290946961, "num_tokens": 76678086.0, "step": 9445 }, { "entropy": 0.19584991335868834, "epoch": 5.332957110609481, "grad_norm": 1.8354933261871338, "learning_rate": 4.379825736865419e-06, "loss": 0.068, "mean_token_accuracy": 0.9790003061294555, "num_tokens": 76718348.0, "step": 9450 }, { "entropy": 0.17431385517120362, "epoch": 5.335778781038375, "grad_norm": 2.1092381477355957, "learning_rate": 4.3792081048854875e-06, "loss": 0.072, "mean_token_accuracy": 0.9774962902069092, "num_tokens": 76759148.0, "step": 9455 }, { "entropy": 0.1774332642555237, "epoch": 5.3386004514672685, "grad_norm": 2.28005313873291, "learning_rate": 4.378590226516211e-06, "loss": 0.0746, "mean_token_accuracy": 0.9770366787910462, "num_tokens": 76799597.0, "step": 9460 }, { "entropy": 0.17988061010837555, "epoch": 5.341422121896162, "grad_norm": 2.3315978050231934, "learning_rate": 4.3779721018789735e-06, "loss": 0.0781, "mean_token_accuracy": 0.9758471369743347, "num_tokens": 76840462.0, "step": 9465 }, { "entropy": 0.16905871629714966, "epoch": 5.344243792325057, "grad_norm": 2.204232931137085, "learning_rate": 4.377353731095202e-06, "loss": 0.0604, "mean_token_accuracy": 0.9812986135482789, "num_tokens": 76881153.0, "step": 9470 }, { "entropy": 0.17078822553157808, "epoch": 5.3470654627539504, "grad_norm": 1.9415490627288818, "learning_rate": 4.376735114286378e-06, "loss": 0.0697, "mean_token_accuracy": 0.9775596261024475, "num_tokens": 76922045.0, "step": 9475 }, { "entropy": 0.17137598097324372, "epoch": 5.349887133182844, "grad_norm": 1.894740343093872, "learning_rate": 4.3761162515740276e-06, "loss": 0.0631, "mean_token_accuracy": 0.9806401252746582, "num_tokens": 76962699.0, "step": 9480 }, { "entropy": 0.1780911296606064, "epoch": 5.352708803611738, "grad_norm": 1.8693207502365112, "learning_rate": 4.375497143079726e-06, "loss": 0.0728, "mean_token_accuracy": 0.9785822868347168, "num_tokens": 77003042.0, "step": 9485 }, { "entropy": 0.18042166531085968, "epoch": 5.355530474040632, "grad_norm": 1.8227202892303467, "learning_rate": 4.3748777889250995e-06, "loss": 0.0743, "mean_token_accuracy": 0.9775307297706604, "num_tokens": 77043699.0, "step": 9490 }, { "entropy": 0.1943429082632065, "epoch": 5.358352144469526, "grad_norm": 1.9916659593582153, "learning_rate": 4.374258189231818e-06, "loss": 0.0667, "mean_token_accuracy": 0.9789568185806274, "num_tokens": 77084496.0, "step": 9495 }, { "entropy": 0.19134209752082826, "epoch": 5.36117381489842, "grad_norm": 2.394291877746582, "learning_rate": 4.3736383441216036e-06, "loss": 0.0672, "mean_token_accuracy": 0.9791797280311585, "num_tokens": 77124779.0, "step": 9500 }, { "epoch": 5.36117381489842, "eval_entropy": 0.22530123591423035, "eval_loss": 0.03946574404835701, "eval_mean_token_accuracy": 0.9888846278190613, "eval_num_tokens": 77124779.0, "eval_runtime": 0.1643, "eval_samples_per_second": 24.339, "eval_steps_per_second": 6.085, "step": 9500 }, { "entropy": 0.19695676267147064, "epoch": 5.3639954853273135, "grad_norm": 1.9585169553756714, "learning_rate": 4.3730182537162235e-06, "loss": 0.0782, "mean_token_accuracy": 0.9759560108184815, "num_tokens": 77165622.0, "step": 9505 }, { "entropy": 0.18560438454151154, "epoch": 5.366817155756207, "grad_norm": 2.292339563369751, "learning_rate": 4.3723979181374964e-06, "loss": 0.0743, "mean_token_accuracy": 0.9771399736404419, "num_tokens": 77206242.0, "step": 9510 }, { "entropy": 0.17267344892024994, "epoch": 5.369638826185102, "grad_norm": 2.0678932666778564, "learning_rate": 4.371777337507285e-06, "loss": 0.0692, "mean_token_accuracy": 0.979387903213501, "num_tokens": 77246858.0, "step": 9515 }, { "entropy": 0.18678390383720397, "epoch": 5.3724604966139955, "grad_norm": 2.482128858566284, "learning_rate": 4.371156511947504e-06, "loss": 0.0713, "mean_token_accuracy": 0.9776126742362976, "num_tokens": 77287179.0, "step": 9520 }, { "entropy": 0.16702256202697754, "epoch": 5.375282167042889, "grad_norm": 2.2040395736694336, "learning_rate": 4.370535441580114e-06, "loss": 0.0623, "mean_token_accuracy": 0.981655502319336, "num_tokens": 77327907.0, "step": 9525 }, { "entropy": 0.16512243151664735, "epoch": 5.378103837471783, "grad_norm": 1.9599778652191162, "learning_rate": 4.369914126527125e-06, "loss": 0.0643, "mean_token_accuracy": 0.9801242351531982, "num_tokens": 77368670.0, "step": 9530 }, { "entropy": 0.17791537344455718, "epoch": 5.3809255079006775, "grad_norm": 1.9389322996139526, "learning_rate": 4.369292566910594e-06, "loss": 0.0642, "mean_token_accuracy": 0.9804812669754028, "num_tokens": 77409415.0, "step": 9535 }, { "entropy": 0.18755693435668946, "epoch": 5.383747178329571, "grad_norm": 1.7628971338272095, "learning_rate": 4.368670762852626e-06, "loss": 0.0655, "mean_token_accuracy": 0.9798348426818848, "num_tokens": 77450190.0, "step": 9540 }, { "entropy": 0.1958528518676758, "epoch": 5.386568848758465, "grad_norm": 2.0312836170196533, "learning_rate": 4.368048714475375e-06, "loss": 0.0711, "mean_token_accuracy": 0.9780890107154846, "num_tokens": 77490984.0, "step": 9545 }, { "entropy": 0.1894278347492218, "epoch": 5.389390519187359, "grad_norm": 2.1138522624969482, "learning_rate": 4.367426421901042e-06, "loss": 0.0687, "mean_token_accuracy": 0.9790720820426941, "num_tokens": 77531707.0, "step": 9550 }, { "entropy": 0.18264504075050353, "epoch": 5.392212189616253, "grad_norm": 2.236631155014038, "learning_rate": 4.366803885251879e-06, "loss": 0.068, "mean_token_accuracy": 0.9791576147079468, "num_tokens": 77572138.0, "step": 9555 }, { "entropy": 0.17964340448379518, "epoch": 5.395033860045147, "grad_norm": 1.9369723796844482, "learning_rate": 4.366181104650179e-06, "loss": 0.0726, "mean_token_accuracy": 0.978190791606903, "num_tokens": 77612964.0, "step": 9560 }, { "entropy": 0.19620463848114014, "epoch": 5.3978555304740405, "grad_norm": 2.114414930343628, "learning_rate": 4.36555808021829e-06, "loss": 0.0781, "mean_token_accuracy": 0.975774621963501, "num_tokens": 77653620.0, "step": 9565 }, { "entropy": 0.18174751698970795, "epoch": 5.400677200902934, "grad_norm": 2.574272394180298, "learning_rate": 4.364934812078606e-06, "loss": 0.071, "mean_token_accuracy": 0.9776658058166504, "num_tokens": 77694231.0, "step": 9570 }, { "entropy": 0.19385533034801483, "epoch": 5.403498871331829, "grad_norm": 1.886120319366455, "learning_rate": 4.364311300353567e-06, "loss": 0.0706, "mean_token_accuracy": 0.9778653621673584, "num_tokens": 77734981.0, "step": 9575 }, { "entropy": 0.1768470048904419, "epoch": 5.4063205417607225, "grad_norm": 1.6945990324020386, "learning_rate": 4.363687545165661e-06, "loss": 0.0748, "mean_token_accuracy": 0.9762227416038514, "num_tokens": 77775525.0, "step": 9580 }, { "entropy": 0.18447025418281554, "epoch": 5.409142212189616, "grad_norm": 2.226775646209717, "learning_rate": 4.363063546637426e-06, "loss": 0.0772, "mean_token_accuracy": 0.9765905261039733, "num_tokens": 77816146.0, "step": 9585 }, { "entropy": 0.20254374146461487, "epoch": 5.41196388261851, "grad_norm": 1.4906258583068848, "learning_rate": 4.3624393048914465e-06, "loss": 0.0672, "mean_token_accuracy": 0.9796987056732178, "num_tokens": 77856760.0, "step": 9590 }, { "entropy": 0.20228754878044128, "epoch": 5.414785553047404, "grad_norm": 2.128209114074707, "learning_rate": 4.361814820050355e-06, "loss": 0.074, "mean_token_accuracy": 0.9770419836044312, "num_tokens": 77897182.0, "step": 9595 }, { "entropy": 0.18532134592533112, "epoch": 5.417607223476298, "grad_norm": 1.994026780128479, "learning_rate": 4.36119009223683e-06, "loss": 0.0698, "mean_token_accuracy": 0.9782418012619019, "num_tokens": 77937769.0, "step": 9600 }, { "entropy": 0.17942964434623718, "epoch": 5.420428893905192, "grad_norm": 1.7702689170837402, "learning_rate": 4.3605651215736025e-06, "loss": 0.0697, "mean_token_accuracy": 0.9785507321357727, "num_tokens": 77978608.0, "step": 9605 }, { "entropy": 0.18152170181274413, "epoch": 5.423250564334086, "grad_norm": 2.2829222679138184, "learning_rate": 4.359939908183445e-06, "loss": 0.0748, "mean_token_accuracy": 0.9763474941253663, "num_tokens": 78019201.0, "step": 9610 }, { "entropy": 0.16868273317813873, "epoch": 5.426072234762979, "grad_norm": 1.9242933988571167, "learning_rate": 4.3593144521891825e-06, "loss": 0.0684, "mean_token_accuracy": 0.9792532801628113, "num_tokens": 78059853.0, "step": 9615 }, { "entropy": 0.17837615609169005, "epoch": 5.428893905191874, "grad_norm": 1.9785670042037964, "learning_rate": 4.358688753713685e-06, "loss": 0.063, "mean_token_accuracy": 0.9807116985321045, "num_tokens": 78100351.0, "step": 9620 }, { "entropy": 0.16789911687374115, "epoch": 5.431715575620768, "grad_norm": 1.8135242462158203, "learning_rate": 4.358062812879873e-06, "loss": 0.0667, "mean_token_accuracy": 0.979233467578888, "num_tokens": 78140931.0, "step": 9625 }, { "entropy": 0.16792088150978088, "epoch": 5.434537246049661, "grad_norm": 2.1166341304779053, "learning_rate": 4.357436629810709e-06, "loss": 0.0682, "mean_token_accuracy": 0.9792353987693787, "num_tokens": 78181470.0, "step": 9630 }, { "entropy": 0.18325534760951995, "epoch": 5.437358916478555, "grad_norm": 1.728598952293396, "learning_rate": 4.35681020462921e-06, "loss": 0.0747, "mean_token_accuracy": 0.977009117603302, "num_tokens": 78222347.0, "step": 9635 }, { "entropy": 0.18267615139484406, "epoch": 5.4401805869074495, "grad_norm": 1.7328113317489624, "learning_rate": 4.356183537458436e-06, "loss": 0.0722, "mean_token_accuracy": 0.9778196811676025, "num_tokens": 78262982.0, "step": 9640 }, { "entropy": 0.180474916100502, "epoch": 5.443002257336343, "grad_norm": 1.5890209674835205, "learning_rate": 4.3555566284214955e-06, "loss": 0.0663, "mean_token_accuracy": 0.9788239240646363, "num_tokens": 78303575.0, "step": 9645 }, { "entropy": 0.20339135229587554, "epoch": 5.445823927765237, "grad_norm": 1.9945831298828125, "learning_rate": 4.354929477641547e-06, "loss": 0.0792, "mean_token_accuracy": 0.9748901128768921, "num_tokens": 78343987.0, "step": 9650 }, { "entropy": 0.18225070238113403, "epoch": 5.448645598194131, "grad_norm": 1.9023123979568481, "learning_rate": 4.354302085241791e-06, "loss": 0.072, "mean_token_accuracy": 0.9779296040534973, "num_tokens": 78384670.0, "step": 9655 }, { "entropy": 0.17491951882839202, "epoch": 5.451467268623025, "grad_norm": 1.9437018632888794, "learning_rate": 4.353674451345481e-06, "loss": 0.0649, "mean_token_accuracy": 0.9799412608146667, "num_tokens": 78425444.0, "step": 9660 }, { "entropy": 0.19293942749500276, "epoch": 5.454288939051919, "grad_norm": 2.0638647079467773, "learning_rate": 4.353046576075915e-06, "loss": 0.0706, "mean_token_accuracy": 0.9783964395523072, "num_tokens": 78465938.0, "step": 9665 }, { "entropy": 0.17496614456176757, "epoch": 5.457110609480813, "grad_norm": 2.2454192638397217, "learning_rate": 4.35241845955644e-06, "loss": 0.0733, "mean_token_accuracy": 0.9771980404853821, "num_tokens": 78506288.0, "step": 9670 }, { "entropy": 0.18418402671813966, "epoch": 5.459932279909706, "grad_norm": 1.9852421283721924, "learning_rate": 4.3517901019104494e-06, "loss": 0.0624, "mean_token_accuracy": 0.9815483212471008, "num_tokens": 78546897.0, "step": 9675 }, { "entropy": 0.17411540746688842, "epoch": 5.4627539503386, "grad_norm": 2.471421957015991, "learning_rate": 4.351161503261384e-06, "loss": 0.0593, "mean_token_accuracy": 0.981855833530426, "num_tokens": 78587494.0, "step": 9680 }, { "entropy": 0.18723950386047364, "epoch": 5.465575620767495, "grad_norm": 2.042299509048462, "learning_rate": 4.3505326637327315e-06, "loss": 0.0789, "mean_token_accuracy": 0.9755418181419373, "num_tokens": 78628058.0, "step": 9685 }, { "entropy": 0.1705529898405075, "epoch": 5.468397291196388, "grad_norm": 2.377067804336548, "learning_rate": 4.3499035834480275e-06, "loss": 0.0714, "mean_token_accuracy": 0.9782310485839844, "num_tokens": 78668915.0, "step": 9690 }, { "entropy": 0.1720373123884201, "epoch": 5.471218961625282, "grad_norm": 2.115959405899048, "learning_rate": 4.349274262530856e-06, "loss": 0.0724, "mean_token_accuracy": 0.9783347487449646, "num_tokens": 78709535.0, "step": 9695 }, { "entropy": 0.1738339751958847, "epoch": 5.474040632054176, "grad_norm": 1.7836189270019531, "learning_rate": 4.348644701104845e-06, "loss": 0.067, "mean_token_accuracy": 0.9795932054519654, "num_tokens": 78750224.0, "step": 9700 }, { "entropy": 0.19387414753437043, "epoch": 5.47686230248307, "grad_norm": 1.597865343093872, "learning_rate": 4.348014899293675e-06, "loss": 0.0782, "mean_token_accuracy": 0.9758282899856567, "num_tokens": 78790822.0, "step": 9705 }, { "entropy": 0.1638443261384964, "epoch": 5.479683972911964, "grad_norm": 1.7164875268936157, "learning_rate": 4.3473848572210685e-06, "loss": 0.0667, "mean_token_accuracy": 0.9790258288383484, "num_tokens": 78831384.0, "step": 9710 }, { "entropy": 0.17148119807243348, "epoch": 5.482505643340858, "grad_norm": 1.7814518213272095, "learning_rate": 4.346754575010798e-06, "loss": 0.055, "mean_token_accuracy": 0.9819474458694458, "num_tokens": 78871893.0, "step": 9715 }, { "entropy": 0.1933718204498291, "epoch": 5.485327313769751, "grad_norm": 2.326171398162842, "learning_rate": 4.346124052786682e-06, "loss": 0.0807, "mean_token_accuracy": 0.9755822420120239, "num_tokens": 78912523.0, "step": 9720 }, { "entropy": 0.17635531425476075, "epoch": 5.488148984198646, "grad_norm": 1.7191340923309326, "learning_rate": 4.3454932906725875e-06, "loss": 0.0705, "mean_token_accuracy": 0.9771863937377929, "num_tokens": 78952985.0, "step": 9725 }, { "entropy": 0.17937880754470825, "epoch": 5.49097065462754, "grad_norm": 2.046811819076538, "learning_rate": 4.3448622887924265e-06, "loss": 0.0618, "mean_token_accuracy": 0.981409227848053, "num_tokens": 78993356.0, "step": 9730 }, { "entropy": 0.19099608957767486, "epoch": 5.493792325056433, "grad_norm": 2.5957789421081543, "learning_rate": 4.3442310472701615e-06, "loss": 0.0765, "mean_token_accuracy": 0.9757651925086975, "num_tokens": 79034026.0, "step": 9735 }, { "entropy": 0.1835557132959366, "epoch": 5.496613995485327, "grad_norm": 2.360753059387207, "learning_rate": 4.343599566229799e-06, "loss": 0.0732, "mean_token_accuracy": 0.9772834777832031, "num_tokens": 79074662.0, "step": 9740 }, { "entropy": 0.17511331737041474, "epoch": 5.499435665914222, "grad_norm": 2.0439140796661377, "learning_rate": 4.342967845795393e-06, "loss": 0.0642, "mean_token_accuracy": 0.9799216628074646, "num_tokens": 79115192.0, "step": 9745 }, { "entropy": 0.18398658633232118, "epoch": 5.502257336343115, "grad_norm": 1.601022481918335, "learning_rate": 4.342335886091045e-06, "loss": 0.0729, "mean_token_accuracy": 0.9781609892845153, "num_tokens": 79155981.0, "step": 9750 }, { "entropy": 0.18661101460456847, "epoch": 5.505079006772009, "grad_norm": 2.0731983184814453, "learning_rate": 4.341703687240903e-06, "loss": 0.0754, "mean_token_accuracy": 0.9759285807609558, "num_tokens": 79196610.0, "step": 9755 }, { "entropy": 0.19010823965072632, "epoch": 5.507900677200903, "grad_norm": 1.9401448965072632, "learning_rate": 4.341071249369164e-06, "loss": 0.0742, "mean_token_accuracy": 0.9756409406661988, "num_tokens": 79237061.0, "step": 9760 }, { "entropy": 0.17404641211032867, "epoch": 5.510722347629796, "grad_norm": 2.2344675064086914, "learning_rate": 4.340438572600072e-06, "loss": 0.0677, "mean_token_accuracy": 0.9782544851303101, "num_tokens": 79277948.0, "step": 9765 }, { "entropy": 0.1966948091983795, "epoch": 5.513544018058691, "grad_norm": 2.11859130859375, "learning_rate": 4.3398056570579125e-06, "loss": 0.0865, "mean_token_accuracy": 0.9740624785423279, "num_tokens": 79317957.0, "step": 9770 }, { "entropy": 0.17077078223228453, "epoch": 5.516365688487585, "grad_norm": 1.9903873205184937, "learning_rate": 4.339172502867023e-06, "loss": 0.0667, "mean_token_accuracy": 0.9797818660736084, "num_tokens": 79358659.0, "step": 9775 }, { "entropy": 0.17696953415870667, "epoch": 5.519187358916478, "grad_norm": 2.100498676300049, "learning_rate": 4.338539110151789e-06, "loss": 0.0834, "mean_token_accuracy": 0.974258816242218, "num_tokens": 79399262.0, "step": 9780 }, { "entropy": 0.17644532322883605, "epoch": 5.522009029345372, "grad_norm": 2.0375876426696777, "learning_rate": 4.337905479036639e-06, "loss": 0.0722, "mean_token_accuracy": 0.9780573964118957, "num_tokens": 79439526.0, "step": 9785 }, { "entropy": 0.18530842363834382, "epoch": 5.524830699774267, "grad_norm": 2.0761780738830566, "learning_rate": 4.33727160964605e-06, "loss": 0.0706, "mean_token_accuracy": 0.9781399369239807, "num_tokens": 79480123.0, "step": 9790 }, { "entropy": 0.19907425343990326, "epoch": 5.52765237020316, "grad_norm": 1.8196872472763062, "learning_rate": 4.336637502104545e-06, "loss": 0.0821, "mean_token_accuracy": 0.9743606328964234, "num_tokens": 79520700.0, "step": 9795 }, { "entropy": 0.18495319783687592, "epoch": 5.530474040632054, "grad_norm": 1.844511866569519, "learning_rate": 4.336003156536696e-06, "loss": 0.0689, "mean_token_accuracy": 0.9784881234169006, "num_tokens": 79561218.0, "step": 9800 }, { "entropy": 0.17770458459854127, "epoch": 5.533295711060948, "grad_norm": 2.0217959880828857, "learning_rate": 4.335368573067118e-06, "loss": 0.0743, "mean_token_accuracy": 0.9774918794631958, "num_tokens": 79601835.0, "step": 9805 }, { "entropy": 0.18182687163352967, "epoch": 5.536117381489842, "grad_norm": 2.815741777420044, "learning_rate": 4.334733751820478e-06, "loss": 0.0754, "mean_token_accuracy": 0.9765426874160766, "num_tokens": 79642074.0, "step": 9810 }, { "entropy": 0.17728038728237153, "epoch": 5.538939051918736, "grad_norm": 1.9506757259368896, "learning_rate": 4.334098692921484e-06, "loss": 0.0674, "mean_token_accuracy": 0.9793436288833618, "num_tokens": 79682666.0, "step": 9815 }, { "entropy": 0.1820712596178055, "epoch": 5.54176072234763, "grad_norm": 2.2737324237823486, "learning_rate": 4.333463396494896e-06, "loss": 0.0664, "mean_token_accuracy": 0.9783164978027343, "num_tokens": 79723398.0, "step": 9820 }, { "entropy": 0.1908366858959198, "epoch": 5.544582392776523, "grad_norm": 1.7559250593185425, "learning_rate": 4.332827862665515e-06, "loss": 0.0673, "mean_token_accuracy": 0.9796145915985107, "num_tokens": 79764063.0, "step": 9825 }, { "entropy": 0.18218895196914672, "epoch": 5.547404063205418, "grad_norm": 2.057884693145752, "learning_rate": 4.332192091558195e-06, "loss": 0.084, "mean_token_accuracy": 0.9729039072990417, "num_tokens": 79804649.0, "step": 9830 }, { "entropy": 0.19951559603214264, "epoch": 5.550225733634312, "grad_norm": 2.015651226043701, "learning_rate": 4.331556083297831e-06, "loss": 0.0819, "mean_token_accuracy": 0.9746386528015136, "num_tokens": 79845171.0, "step": 9835 }, { "entropy": 0.17591853737831115, "epoch": 5.553047404063205, "grad_norm": 2.3496806621551514, "learning_rate": 4.330919838009368e-06, "loss": 0.0591, "mean_token_accuracy": 0.9815265893936157, "num_tokens": 79885183.0, "step": 9840 }, { "entropy": 0.176205512881279, "epoch": 5.555869074492099, "grad_norm": 2.1586620807647705, "learning_rate": 4.330283355817796e-06, "loss": 0.0767, "mean_token_accuracy": 0.9763123154640198, "num_tokens": 79925909.0, "step": 9845 }, { "entropy": 0.17635221481323243, "epoch": 5.558690744920993, "grad_norm": 2.0332553386688232, "learning_rate": 4.329646636848151e-06, "loss": 0.0669, "mean_token_accuracy": 0.9792371988296509, "num_tokens": 79966534.0, "step": 9850 }, { "entropy": 0.18909758031368257, "epoch": 5.561512415349887, "grad_norm": 1.8824989795684814, "learning_rate": 4.3290096812255185e-06, "loss": 0.0767, "mean_token_accuracy": 0.976757287979126, "num_tokens": 80007224.0, "step": 9855 }, { "entropy": 0.18777174949645997, "epoch": 5.564334085778781, "grad_norm": 2.0460257530212402, "learning_rate": 4.328372489075028e-06, "loss": 0.0755, "mean_token_accuracy": 0.9778550267219543, "num_tokens": 80047897.0, "step": 9860 }, { "entropy": 0.18754212260246278, "epoch": 5.567155756207675, "grad_norm": 2.3515195846557617, "learning_rate": 4.327735060521855e-06, "loss": 0.0802, "mean_token_accuracy": 0.9749362111091614, "num_tokens": 80088428.0, "step": 9865 }, { "entropy": 0.17782265245914458, "epoch": 5.5699774266365685, "grad_norm": 1.882233738899231, "learning_rate": 4.3270973956912225e-06, "loss": 0.0696, "mean_token_accuracy": 0.9777340531349182, "num_tokens": 80129154.0, "step": 9870 }, { "entropy": 0.16835958659648895, "epoch": 5.572799097065463, "grad_norm": 1.7510329484939575, "learning_rate": 4.326459494708401e-06, "loss": 0.0638, "mean_token_accuracy": 0.9795340061187744, "num_tokens": 80169798.0, "step": 9875 }, { "entropy": 0.17615444958209991, "epoch": 5.575620767494357, "grad_norm": 1.4721903800964355, "learning_rate": 4.325821357698705e-06, "loss": 0.0693, "mean_token_accuracy": 0.9783699035644531, "num_tokens": 80210503.0, "step": 9880 }, { "entropy": 0.1756734073162079, "epoch": 5.5784424379232505, "grad_norm": 2.1826069355010986, "learning_rate": 4.325182984787499e-06, "loss": 0.067, "mean_token_accuracy": 0.9788246631622315, "num_tokens": 80251068.0, "step": 9885 }, { "entropy": 0.19139577448368073, "epoch": 5.581264108352144, "grad_norm": 2.021207094192505, "learning_rate": 4.324544376100188e-06, "loss": 0.0744, "mean_token_accuracy": 0.9779517769813537, "num_tokens": 80291849.0, "step": 9890 }, { "entropy": 0.17277522683143615, "epoch": 5.584085778781039, "grad_norm": 2.1115806102752686, "learning_rate": 4.323905531762229e-06, "loss": 0.0694, "mean_token_accuracy": 0.9781728863716126, "num_tokens": 80332416.0, "step": 9895 }, { "entropy": 0.17857773303985597, "epoch": 5.586907449209932, "grad_norm": 1.9007774591445923, "learning_rate": 4.323266451899122e-06, "loss": 0.0666, "mean_token_accuracy": 0.9787236452102661, "num_tokens": 80373154.0, "step": 9900 }, { "entropy": 0.18921782374382018, "epoch": 5.589729119638826, "grad_norm": 1.874495267868042, "learning_rate": 4.322627136636415e-06, "loss": 0.0715, "mean_token_accuracy": 0.9781720876693726, "num_tokens": 80413856.0, "step": 9905 }, { "entropy": 0.18341380059719087, "epoch": 5.59255079006772, "grad_norm": 2.1999471187591553, "learning_rate": 4.321987586099702e-06, "loss": 0.0603, "mean_token_accuracy": 0.9813642621040344, "num_tokens": 80454655.0, "step": 9910 }, { "entropy": 0.17606990933418273, "epoch": 5.595372460496614, "grad_norm": 2.1019020080566406, "learning_rate": 4.32134780041462e-06, "loss": 0.0651, "mean_token_accuracy": 0.9796467542648315, "num_tokens": 80495377.0, "step": 9915 }, { "entropy": 0.18233582377433777, "epoch": 5.598194130925508, "grad_norm": 1.8605166673660278, "learning_rate": 4.320707779706859e-06, "loss": 0.0701, "mean_token_accuracy": 0.9781240105628968, "num_tokens": 80536019.0, "step": 9920 }, { "entropy": 0.18825688362121581, "epoch": 5.601015801354402, "grad_norm": 1.838043451309204, "learning_rate": 4.320067524102149e-06, "loss": 0.0755, "mean_token_accuracy": 0.9771787285804748, "num_tokens": 80576772.0, "step": 9925 }, { "entropy": 0.1714703172445297, "epoch": 5.6038374717832955, "grad_norm": 2.1907753944396973, "learning_rate": 4.319427033726268e-06, "loss": 0.0685, "mean_token_accuracy": 0.9775472521781922, "num_tokens": 80617419.0, "step": 9930 }, { "entropy": 0.1989262282848358, "epoch": 5.606659142212189, "grad_norm": 2.381535530090332, "learning_rate": 4.3187863087050405e-06, "loss": 0.0764, "mean_token_accuracy": 0.9764021635055542, "num_tokens": 80657689.0, "step": 9935 }, { "entropy": 0.1726620763540268, "epoch": 5.609480812641084, "grad_norm": 2.249459743499756, "learning_rate": 4.318145349164339e-06, "loss": 0.067, "mean_token_accuracy": 0.9787798404693604, "num_tokens": 80698324.0, "step": 9940 }, { "entropy": 0.17213599681854247, "epoch": 5.6123024830699775, "grad_norm": 1.9886410236358643, "learning_rate": 4.3175041552300775e-06, "loss": 0.062, "mean_token_accuracy": 0.9811339974403381, "num_tokens": 80738375.0, "step": 9945 }, { "entropy": 0.16617416441440583, "epoch": 5.615124153498871, "grad_norm": 2.2124135494232178, "learning_rate": 4.31686272702822e-06, "loss": 0.0702, "mean_token_accuracy": 0.978109622001648, "num_tokens": 80778942.0, "step": 9950 }, { "entropy": 0.1947992593050003, "epoch": 5.617945823927765, "grad_norm": 2.2856040000915527, "learning_rate": 4.316221064684775e-06, "loss": 0.0789, "mean_token_accuracy": 0.9748259305953979, "num_tokens": 80819710.0, "step": 9955 }, { "entropy": 0.19779794812202453, "epoch": 5.6207674943566595, "grad_norm": 3.436311721801758, "learning_rate": 4.3155791683257965e-06, "loss": 0.074, "mean_token_accuracy": 0.9769353747367859, "num_tokens": 80860043.0, "step": 9960 }, { "entropy": 0.18495135605335236, "epoch": 5.623589164785553, "grad_norm": 2.0917840003967285, "learning_rate": 4.314937038077386e-06, "loss": 0.0708, "mean_token_accuracy": 0.9777469873428345, "num_tokens": 80900666.0, "step": 9965 }, { "entropy": 0.1749154359102249, "epoch": 5.626410835214447, "grad_norm": 1.831464409828186, "learning_rate": 4.314294674065689e-06, "loss": 0.0673, "mean_token_accuracy": 0.9792719483375549, "num_tokens": 80941260.0, "step": 9970 }, { "entropy": 0.17395665049552916, "epoch": 5.6292325056433405, "grad_norm": 2.2329459190368652, "learning_rate": 4.3136520764168996e-06, "loss": 0.0716, "mean_token_accuracy": 0.9781460523605346, "num_tokens": 80981968.0, "step": 9975 }, { "entropy": 0.19316057562828065, "epoch": 5.632054176072235, "grad_norm": 1.6789857149124146, "learning_rate": 4.3130092452572545e-06, "loss": 0.0722, "mean_token_accuracy": 0.9778452634811401, "num_tokens": 81022614.0, "step": 9980 }, { "entropy": 0.17241209149360656, "epoch": 5.634875846501129, "grad_norm": 1.6188982725143433, "learning_rate": 4.312366180713039e-06, "loss": 0.0668, "mean_token_accuracy": 0.9798883080482483, "num_tokens": 81063456.0, "step": 9985 }, { "entropy": 0.18288934230804443, "epoch": 5.6376975169300225, "grad_norm": 12.512042999267578, "learning_rate": 4.311722882910584e-06, "loss": 0.0767, "mean_token_accuracy": 0.9758876323699951, "num_tokens": 81104260.0, "step": 9990 }, { "entropy": 0.17049558758735656, "epoch": 5.640519187358916, "grad_norm": 2.0652711391448975, "learning_rate": 4.3110793519762625e-06, "loss": 0.0753, "mean_token_accuracy": 0.9753929495811462, "num_tokens": 81145141.0, "step": 9995 }, { "entropy": 0.17534006536006927, "epoch": 5.643340857787811, "grad_norm": 1.9431458711624146, "learning_rate": 4.3104355880365e-06, "loss": 0.0686, "mean_token_accuracy": 0.9783696532249451, "num_tokens": 81185597.0, "step": 10000 }, { "epoch": 5.643340857787811, "eval_entropy": 0.2248619794845581, "eval_loss": 0.03424257040023804, "eval_mean_token_accuracy": 0.9908010959625244, "eval_num_tokens": 81185597.0, "eval_runtime": 0.164, "eval_samples_per_second": 24.396, "eval_steps_per_second": 6.099, "step": 10000 }, { "entropy": 0.17620908617973327, "epoch": 5.6461625282167045, "grad_norm": 2.0070059299468994, "learning_rate": 4.3097915912177615e-06, "loss": 0.0726, "mean_token_accuracy": 0.9765960216522217, "num_tokens": 81225965.0, "step": 10005 }, { "entropy": 0.19689911603927612, "epoch": 5.648984198645598, "grad_norm": 2.1444599628448486, "learning_rate": 4.30914736164656e-06, "loss": 0.0715, "mean_token_accuracy": 0.9776429057121276, "num_tokens": 81266144.0, "step": 10010 }, { "entropy": 0.1702759474515915, "epoch": 5.651805869074492, "grad_norm": 1.7831538915634155, "learning_rate": 4.308502899449456e-06, "loss": 0.0646, "mean_token_accuracy": 0.9793395638465882, "num_tokens": 81306666.0, "step": 10015 }, { "entropy": 0.19171405136585234, "epoch": 5.654627539503386, "grad_norm": 6.789673328399658, "learning_rate": 4.307858204753054e-06, "loss": 0.0827, "mean_token_accuracy": 0.9753363370895386, "num_tokens": 81347319.0, "step": 10020 }, { "entropy": 0.18764612972736358, "epoch": 5.65744920993228, "grad_norm": 2.124255895614624, "learning_rate": 4.3072132776840035e-06, "loss": 0.0692, "mean_token_accuracy": 0.9783503413200378, "num_tokens": 81387613.0, "step": 10025 }, { "entropy": 0.17401687800884247, "epoch": 5.660270880361174, "grad_norm": 2.335118293762207, "learning_rate": 4.306568118369001e-06, "loss": 0.0691, "mean_token_accuracy": 0.9781079888343811, "num_tokens": 81428246.0, "step": 10030 }, { "entropy": 0.1741603434085846, "epoch": 5.663092550790068, "grad_norm": 1.9242067337036133, "learning_rate": 4.305922726934788e-06, "loss": 0.0657, "mean_token_accuracy": 0.9787984848022461, "num_tokens": 81468991.0, "step": 10035 }, { "entropy": 0.18352454900741577, "epoch": 5.665914221218961, "grad_norm": 2.054337501525879, "learning_rate": 4.305277103508152e-06, "loss": 0.0738, "mean_token_accuracy": 0.976045835018158, "num_tokens": 81509669.0, "step": 10040 }, { "entropy": 0.19397950172424316, "epoch": 5.668735891647856, "grad_norm": 2.4687507152557373, "learning_rate": 4.304631248215925e-06, "loss": 0.0815, "mean_token_accuracy": 0.9752507686614991, "num_tokens": 81550264.0, "step": 10045 }, { "entropy": 0.19177696108818054, "epoch": 5.6715575620767495, "grad_norm": 2.18593168258667, "learning_rate": 4.303985161184986e-06, "loss": 0.081, "mean_token_accuracy": 0.9757527351379395, "num_tokens": 81590845.0, "step": 10050 }, { "entropy": 0.19264190196990966, "epoch": 5.674379232505643, "grad_norm": 1.6302083730697632, "learning_rate": 4.3033388425422595e-06, "loss": 0.0754, "mean_token_accuracy": 0.9771258950233459, "num_tokens": 81631530.0, "step": 10055 }, { "entropy": 0.18041291534900666, "epoch": 5.677200902934537, "grad_norm": 2.0120229721069336, "learning_rate": 4.302692292414713e-06, "loss": 0.0701, "mean_token_accuracy": 0.9786185026168823, "num_tokens": 81672307.0, "step": 10060 }, { "entropy": 0.18173338174819947, "epoch": 5.6800225733634315, "grad_norm": 1.9405615329742432, "learning_rate": 4.302045510929364e-06, "loss": 0.0692, "mean_token_accuracy": 0.9786619186401367, "num_tokens": 81712760.0, "step": 10065 }, { "entropy": 0.18286008834838868, "epoch": 5.682844243792325, "grad_norm": 2.343987464904785, "learning_rate": 4.3013984982132705e-06, "loss": 0.0807, "mean_token_accuracy": 0.9762983083724975, "num_tokens": 81753384.0, "step": 10070 }, { "entropy": 0.1787573516368866, "epoch": 5.685665914221219, "grad_norm": 2.1654746532440186, "learning_rate": 4.30075125439354e-06, "loss": 0.0803, "mean_token_accuracy": 0.9749480605125427, "num_tokens": 81793907.0, "step": 10075 }, { "entropy": 0.1876184046268463, "epoch": 5.688487584650113, "grad_norm": 2.1528499126434326, "learning_rate": 4.3001037795973225e-06, "loss": 0.065, "mean_token_accuracy": 0.9796617031097412, "num_tokens": 81834372.0, "step": 10080 }, { "entropy": 0.17926348149776458, "epoch": 5.691309255079007, "grad_norm": 2.1884028911590576, "learning_rate": 4.299456073951814e-06, "loss": 0.0723, "mean_token_accuracy": 0.9770533204078674, "num_tokens": 81875199.0, "step": 10085 }, { "entropy": 0.17883450984954835, "epoch": 5.694130925507901, "grad_norm": 1.955631971359253, "learning_rate": 4.2988081375842575e-06, "loss": 0.0658, "mean_token_accuracy": 0.9807510495185852, "num_tokens": 81915920.0, "step": 10090 }, { "entropy": 0.1908187061548233, "epoch": 5.696952595936795, "grad_norm": 2.022860288619995, "learning_rate": 4.29815997062194e-06, "loss": 0.0767, "mean_token_accuracy": 0.9755529642105103, "num_tokens": 81956148.0, "step": 10095 }, { "entropy": 0.1840174227952957, "epoch": 5.699774266365688, "grad_norm": 2.5862724781036377, "learning_rate": 4.297511573192194e-06, "loss": 0.0737, "mean_token_accuracy": 0.9770397782325745, "num_tokens": 81996746.0, "step": 10100 }, { "entropy": 0.18789410591125488, "epoch": 5.702595936794582, "grad_norm": 2.2070839405059814, "learning_rate": 4.296862945422396e-06, "loss": 0.0763, "mean_token_accuracy": 0.9756775379180909, "num_tokens": 82037275.0, "step": 10105 }, { "entropy": 0.1947762042284012, "epoch": 5.705417607223477, "grad_norm": 2.0239546298980713, "learning_rate": 4.2962140874399705e-06, "loss": 0.0736, "mean_token_accuracy": 0.9776275634765625, "num_tokens": 82077888.0, "step": 10110 }, { "entropy": 0.19028544127941133, "epoch": 5.70823927765237, "grad_norm": 2.6127588748931885, "learning_rate": 4.295564999372385e-06, "loss": 0.0712, "mean_token_accuracy": 0.97735435962677, "num_tokens": 82118455.0, "step": 10115 }, { "entropy": 0.18263841271400452, "epoch": 5.711060948081264, "grad_norm": 1.8415166139602661, "learning_rate": 4.294915681347154e-06, "loss": 0.0719, "mean_token_accuracy": 0.9764939785003662, "num_tokens": 82158890.0, "step": 10120 }, { "entropy": 0.18013592660427094, "epoch": 5.713882618510158, "grad_norm": 2.214005470275879, "learning_rate": 4.294266133491834e-06, "loss": 0.0654, "mean_token_accuracy": 0.9796643853187561, "num_tokens": 82199368.0, "step": 10125 }, { "entropy": 0.17933290004730223, "epoch": 5.716704288939052, "grad_norm": 2.40313720703125, "learning_rate": 4.293616355934032e-06, "loss": 0.0761, "mean_token_accuracy": 0.9756105422973633, "num_tokens": 82240162.0, "step": 10130 }, { "entropy": 0.19547135531902313, "epoch": 5.719525959367946, "grad_norm": 2.1128058433532715, "learning_rate": 4.292966348801394e-06, "loss": 0.0715, "mean_token_accuracy": 0.9782087802886963, "num_tokens": 82281077.0, "step": 10135 }, { "entropy": 0.17906437516212464, "epoch": 5.72234762979684, "grad_norm": 2.4157004356384277, "learning_rate": 4.292316112221615e-06, "loss": 0.0637, "mean_token_accuracy": 0.9807783722877502, "num_tokens": 82321293.0, "step": 10140 }, { "entropy": 0.17458663284778594, "epoch": 5.725169300225733, "grad_norm": 2.1524438858032227, "learning_rate": 4.291665646322434e-06, "loss": 0.0691, "mean_token_accuracy": 0.9782337784767151, "num_tokens": 82361543.0, "step": 10145 }, { "entropy": 0.18585495054721832, "epoch": 5.727990970654628, "grad_norm": 2.3089470863342285, "learning_rate": 4.2910149512316345e-06, "loss": 0.0743, "mean_token_accuracy": 0.976749575138092, "num_tokens": 82402317.0, "step": 10150 }, { "entropy": 0.1775616079568863, "epoch": 5.730812641083522, "grad_norm": 2.647921323776245, "learning_rate": 4.290364027077047e-06, "loss": 0.0775, "mean_token_accuracy": 0.975775396823883, "num_tokens": 82443034.0, "step": 10155 }, { "entropy": 0.16766503751277922, "epoch": 5.733634311512415, "grad_norm": 2.180415391921997, "learning_rate": 4.2897128739865446e-06, "loss": 0.0681, "mean_token_accuracy": 0.9786939501762391, "num_tokens": 82483796.0, "step": 10160 }, { "entropy": 0.20999548733234405, "epoch": 5.736455981941309, "grad_norm": 2.0857527256011963, "learning_rate": 4.289061492088047e-06, "loss": 0.086, "mean_token_accuracy": 0.9734646320343018, "num_tokens": 82524597.0, "step": 10165 }, { "entropy": 0.18432653844356536, "epoch": 5.739277652370204, "grad_norm": 2.0384151935577393, "learning_rate": 4.2884098815095175e-06, "loss": 0.0702, "mean_token_accuracy": 0.977328622341156, "num_tokens": 82565390.0, "step": 10170 }, { "entropy": 0.17555322945117952, "epoch": 5.742099322799097, "grad_norm": 2.352675199508667, "learning_rate": 4.287758042378966e-06, "loss": 0.0657, "mean_token_accuracy": 0.979662299156189, "num_tokens": 82605127.0, "step": 10175 }, { "entropy": 0.1978653311729431, "epoch": 5.744920993227991, "grad_norm": 2.1638011932373047, "learning_rate": 4.287105974824446e-06, "loss": 0.0753, "mean_token_accuracy": 0.9755738496780395, "num_tokens": 82645570.0, "step": 10180 }, { "entropy": 0.18076943159103392, "epoch": 5.747742663656885, "grad_norm": 2.197725534439087, "learning_rate": 4.286453678974055e-06, "loss": 0.0757, "mean_token_accuracy": 0.9774895668029785, "num_tokens": 82686162.0, "step": 10185 }, { "entropy": 0.1678744912147522, "epoch": 5.750564334085778, "grad_norm": 1.8131523132324219, "learning_rate": 4.28580115495594e-06, "loss": 0.0712, "mean_token_accuracy": 0.9775494694709778, "num_tokens": 82726804.0, "step": 10190 }, { "entropy": 0.1693742036819458, "epoch": 5.753386004514673, "grad_norm": 1.9890544414520264, "learning_rate": 4.285148402898285e-06, "loss": 0.0696, "mean_token_accuracy": 0.9783661007881165, "num_tokens": 82767575.0, "step": 10195 }, { "entropy": 0.16243847608566284, "epoch": 5.756207674943567, "grad_norm": 2.3177292346954346, "learning_rate": 4.284495422929326e-06, "loss": 0.063, "mean_token_accuracy": 0.9801602840423584, "num_tokens": 82808426.0, "step": 10200 }, { "entropy": 0.18572303354740144, "epoch": 5.75902934537246, "grad_norm": 2.198636531829834, "learning_rate": 4.283842215177341e-06, "loss": 0.0701, "mean_token_accuracy": 0.9777385950088501, "num_tokens": 82849159.0, "step": 10205 }, { "entropy": 0.18345086872577668, "epoch": 5.761851015801354, "grad_norm": 2.125354290008545, "learning_rate": 4.283188779770652e-06, "loss": 0.0782, "mean_token_accuracy": 0.9764749526977539, "num_tokens": 82889959.0, "step": 10210 }, { "entropy": 0.18493236899375914, "epoch": 5.764672686230249, "grad_norm": 2.407668352127075, "learning_rate": 4.2825351168376275e-06, "loss": 0.0869, "mean_token_accuracy": 0.971848976612091, "num_tokens": 82930471.0, "step": 10215 }, { "entropy": 0.18200556337833404, "epoch": 5.767494356659142, "grad_norm": 1.8151724338531494, "learning_rate": 4.281881226506677e-06, "loss": 0.0811, "mean_token_accuracy": 0.974114739894867, "num_tokens": 82971060.0, "step": 10220 }, { "entropy": 0.17582403719425202, "epoch": 5.770316027088036, "grad_norm": 1.7381181716918945, "learning_rate": 4.28122710890626e-06, "loss": 0.0811, "mean_token_accuracy": 0.974997091293335, "num_tokens": 83011536.0, "step": 10225 }, { "entropy": 0.19064405858516692, "epoch": 5.77313769751693, "grad_norm": 2.46132493019104, "learning_rate": 4.2805727641648775e-06, "loss": 0.0911, "mean_token_accuracy": 0.9718187093734741, "num_tokens": 83052238.0, "step": 10230 }, { "entropy": 0.18355910778045653, "epoch": 5.775959367945823, "grad_norm": 2.21754789352417, "learning_rate": 4.2799181924110755e-06, "loss": 0.075, "mean_token_accuracy": 0.9755819082260132, "num_tokens": 83092882.0, "step": 10235 }, { "entropy": 0.18562229871749877, "epoch": 5.778781038374718, "grad_norm": 2.161637306213379, "learning_rate": 4.279263393773444e-06, "loss": 0.0767, "mean_token_accuracy": 0.9759423017501831, "num_tokens": 83133727.0, "step": 10240 }, { "entropy": 0.1935652256011963, "epoch": 5.781602708803612, "grad_norm": 1.7781506776809692, "learning_rate": 4.278608368380618e-06, "loss": 0.0818, "mean_token_accuracy": 0.9746825933456421, "num_tokens": 83173500.0, "step": 10245 }, { "entropy": 0.19277122020721435, "epoch": 5.784424379232505, "grad_norm": 1.752469539642334, "learning_rate": 4.27795311636128e-06, "loss": 0.0694, "mean_token_accuracy": 0.9785934090614319, "num_tokens": 83214016.0, "step": 10250 }, { "entropy": 0.179434072971344, "epoch": 5.7872460496614, "grad_norm": 2.006405830383301, "learning_rate": 4.277297637844151e-06, "loss": 0.071, "mean_token_accuracy": 0.9783080697059632, "num_tokens": 83254826.0, "step": 10255 }, { "entropy": 0.17800212800502777, "epoch": 5.790067720090294, "grad_norm": 2.5060832500457764, "learning_rate": 4.276641932958002e-06, "loss": 0.074, "mean_token_accuracy": 0.9775895595550537, "num_tokens": 83295437.0, "step": 10260 }, { "entropy": 0.1740177422761917, "epoch": 5.792889390519187, "grad_norm": 1.8866690397262573, "learning_rate": 4.275986001831645e-06, "loss": 0.0737, "mean_token_accuracy": 0.9761949896812439, "num_tokens": 83336071.0, "step": 10265 }, { "entropy": 0.17147966027259826, "epoch": 5.795711060948081, "grad_norm": 2.06299090385437, "learning_rate": 4.275329844593938e-06, "loss": 0.0722, "mean_token_accuracy": 0.9773301362991333, "num_tokens": 83376866.0, "step": 10270 }, { "entropy": 0.18029634952545165, "epoch": 5.798532731376975, "grad_norm": 2.3944549560546875, "learning_rate": 4.274673461373784e-06, "loss": 0.0886, "mean_token_accuracy": 0.9716618418693542, "num_tokens": 83417485.0, "step": 10275 }, { "entropy": 0.18746162354946136, "epoch": 5.801354401805869, "grad_norm": 2.2615456581115723, "learning_rate": 4.274016852300129e-06, "loss": 0.0709, "mean_token_accuracy": 0.9775063872337342, "num_tokens": 83458100.0, "step": 10280 }, { "entropy": 0.18002947568893432, "epoch": 5.804176072234763, "grad_norm": 1.8940982818603516, "learning_rate": 4.273360017501964e-06, "loss": 0.0732, "mean_token_accuracy": 0.9775622367858887, "num_tokens": 83498674.0, "step": 10285 }, { "entropy": 0.1664568930864334, "epoch": 5.806997742663657, "grad_norm": 2.5354745388031006, "learning_rate": 4.272702957108325e-06, "loss": 0.0729, "mean_token_accuracy": 0.976758623123169, "num_tokens": 83539467.0, "step": 10290 }, { "entropy": 0.18044437170028688, "epoch": 5.8098194130925505, "grad_norm": 1.8379076719284058, "learning_rate": 4.272045671248289e-06, "loss": 0.0661, "mean_token_accuracy": 0.9795686602592468, "num_tokens": 83580065.0, "step": 10295 }, { "entropy": 0.18124404847621917, "epoch": 5.812641083521445, "grad_norm": 2.4792568683624268, "learning_rate": 4.2713881600509835e-06, "loss": 0.0632, "mean_token_accuracy": 0.9804369568824768, "num_tokens": 83619608.0, "step": 10300 }, { "entropy": 0.1773180991411209, "epoch": 5.815462753950339, "grad_norm": 2.801755428314209, "learning_rate": 4.270730423645574e-06, "loss": 0.066, "mean_token_accuracy": 0.9787346959114075, "num_tokens": 83659117.0, "step": 10305 }, { "entropy": 0.17246809601783752, "epoch": 5.818284424379232, "grad_norm": 1.8683533668518066, "learning_rate": 4.2700724621612745e-06, "loss": 0.0701, "mean_token_accuracy": 0.9784170150756836, "num_tokens": 83699987.0, "step": 10310 }, { "entropy": 0.17956862449645997, "epoch": 5.821106094808126, "grad_norm": 1.9856154918670654, "learning_rate": 4.269414275727341e-06, "loss": 0.083, "mean_token_accuracy": 0.9736566543579102, "num_tokens": 83740362.0, "step": 10315 }, { "entropy": 0.17001102566719056, "epoch": 5.82392776523702, "grad_norm": 2.348156452178955, "learning_rate": 4.2687558644730735e-06, "loss": 0.0603, "mean_token_accuracy": 0.9807537913322448, "num_tokens": 83781025.0, "step": 10320 }, { "entropy": 0.17963250279426574, "epoch": 5.826749435665914, "grad_norm": 2.1682050228118896, "learning_rate": 4.268097228527818e-06, "loss": 0.0684, "mean_token_accuracy": 0.9780291318893433, "num_tokens": 83821718.0, "step": 10325 }, { "entropy": 0.18213966786861419, "epoch": 5.829571106094808, "grad_norm": 2.4645328521728516, "learning_rate": 4.267438368020964e-06, "loss": 0.0777, "mean_token_accuracy": 0.9760030627250671, "num_tokens": 83862498.0, "step": 10330 }, { "entropy": 0.16510314047336577, "epoch": 5.832392776523702, "grad_norm": 1.8456097841262817, "learning_rate": 4.2667792830819435e-06, "loss": 0.0668, "mean_token_accuracy": 0.9788869500160218, "num_tokens": 83903238.0, "step": 10335 }, { "entropy": 0.17967475354671478, "epoch": 5.835214446952596, "grad_norm": 1.8277400732040405, "learning_rate": 4.266119973840235e-06, "loss": 0.0766, "mean_token_accuracy": 0.975653862953186, "num_tokens": 83944020.0, "step": 10340 }, { "entropy": 0.18739196360111238, "epoch": 5.83803611738149, "grad_norm": 2.124237060546875, "learning_rate": 4.2654604404253585e-06, "loss": 0.0773, "mean_token_accuracy": 0.9757479429244995, "num_tokens": 83984438.0, "step": 10345 }, { "entropy": 0.1835099220275879, "epoch": 5.840857787810384, "grad_norm": 2.48547625541687, "learning_rate": 4.264800682966881e-06, "loss": 0.0658, "mean_token_accuracy": 0.9790345907211304, "num_tokens": 84025304.0, "step": 10350 }, { "entropy": 0.17234547436237335, "epoch": 5.8436794582392775, "grad_norm": 2.024911642074585, "learning_rate": 4.26414070159441e-06, "loss": 0.0661, "mean_token_accuracy": 0.9787455081939698, "num_tokens": 84065848.0, "step": 10355 }, { "entropy": 0.19444843232631684, "epoch": 5.846501128668171, "grad_norm": 2.464071273803711, "learning_rate": 4.263480496437601e-06, "loss": 0.0811, "mean_token_accuracy": 0.974936056137085, "num_tokens": 84106680.0, "step": 10360 }, { "entropy": 0.173148912191391, "epoch": 5.849322799097066, "grad_norm": 1.9251296520233154, "learning_rate": 4.26282006762615e-06, "loss": 0.0701, "mean_token_accuracy": 0.9775914549827576, "num_tokens": 84147405.0, "step": 10365 }, { "entropy": 0.18452159464359283, "epoch": 5.8521444695259595, "grad_norm": 2.2456061840057373, "learning_rate": 4.262159415289799e-06, "loss": 0.0768, "mean_token_accuracy": 0.9760621428489685, "num_tokens": 84188045.0, "step": 10370 }, { "entropy": 0.18307125866413115, "epoch": 5.854966139954853, "grad_norm": 2.077425241470337, "learning_rate": 4.261498539558333e-06, "loss": 0.0692, "mean_token_accuracy": 0.9785082459449768, "num_tokens": 84227724.0, "step": 10375 }, { "entropy": 0.1773241400718689, "epoch": 5.857787810383747, "grad_norm": 2.191255569458008, "learning_rate": 4.260837440561583e-06, "loss": 0.0711, "mean_token_accuracy": 0.976841950416565, "num_tokens": 84268109.0, "step": 10380 }, { "entropy": 0.18978277146816253, "epoch": 5.860609480812641, "grad_norm": 2.40100359916687, "learning_rate": 4.260176118429418e-06, "loss": 0.0722, "mean_token_accuracy": 0.9782564878463745, "num_tokens": 84308793.0, "step": 10385 }, { "entropy": 0.18334400355815889, "epoch": 5.863431151241535, "grad_norm": 2.2183425426483154, "learning_rate": 4.259514573291757e-06, "loss": 0.0792, "mean_token_accuracy": 0.9754730105400086, "num_tokens": 84349159.0, "step": 10390 }, { "entropy": 0.2000194698572159, "epoch": 5.866252821670429, "grad_norm": 2.203062057495117, "learning_rate": 4.258852805278562e-06, "loss": 0.0838, "mean_token_accuracy": 0.9722742438316345, "num_tokens": 84389870.0, "step": 10395 }, { "entropy": 0.18452554047107697, "epoch": 5.8690744920993225, "grad_norm": 2.1212892532348633, "learning_rate": 4.258190814519834e-06, "loss": 0.0774, "mean_token_accuracy": 0.976058554649353, "num_tokens": 84430501.0, "step": 10400 }, { "entropy": 0.1830653041601181, "epoch": 5.871896162528216, "grad_norm": 2.3475375175476074, "learning_rate": 4.2575286011456255e-06, "loss": 0.074, "mean_token_accuracy": 0.9763685941696167, "num_tokens": 84471098.0, "step": 10405 }, { "entropy": 0.16962153613567352, "epoch": 5.874717832957111, "grad_norm": 1.6830201148986816, "learning_rate": 4.256866165286024e-06, "loss": 0.069, "mean_token_accuracy": 0.9781726360321045, "num_tokens": 84511664.0, "step": 10410 }, { "entropy": 0.1886516332626343, "epoch": 5.8775395033860045, "grad_norm": 2.5439491271972656, "learning_rate": 4.256203507071168e-06, "loss": 0.0834, "mean_token_accuracy": 0.974034309387207, "num_tokens": 84552223.0, "step": 10415 }, { "entropy": 0.17994187772274017, "epoch": 5.880361173814898, "grad_norm": 2.874234437942505, "learning_rate": 4.255540626631236e-06, "loss": 0.0763, "mean_token_accuracy": 0.9750635504722596, "num_tokens": 84592774.0, "step": 10420 }, { "entropy": 0.19643645584583283, "epoch": 5.883182844243793, "grad_norm": 2.5835118293762207, "learning_rate": 4.2548775240964515e-06, "loss": 0.0883, "mean_token_accuracy": 0.9736735224723816, "num_tokens": 84633572.0, "step": 10425 }, { "entropy": 0.17592164278030395, "epoch": 5.8860045146726865, "grad_norm": 2.1674015522003174, "learning_rate": 4.25421419959708e-06, "loss": 0.074, "mean_token_accuracy": 0.9766902089118957, "num_tokens": 84674314.0, "step": 10430 }, { "entropy": 0.19005840122699738, "epoch": 5.88882618510158, "grad_norm": 1.8254783153533936, "learning_rate": 4.253550653263432e-06, "loss": 0.0783, "mean_token_accuracy": 0.9754180431365966, "num_tokens": 84714985.0, "step": 10435 }, { "entropy": 0.17556642293930053, "epoch": 5.891647855530474, "grad_norm": 1.7899775505065918, "learning_rate": 4.2528868852258615e-06, "loss": 0.0664, "mean_token_accuracy": 0.9791262030601502, "num_tokens": 84755700.0, "step": 10440 }, { "entropy": 0.17945761382579803, "epoch": 5.894469525959368, "grad_norm": 2.6520256996154785, "learning_rate": 4.252222895614766e-06, "loss": 0.0713, "mean_token_accuracy": 0.9770576000213623, "num_tokens": 84796363.0, "step": 10445 }, { "entropy": 0.19162935614585877, "epoch": 5.897291196388262, "grad_norm": 2.3933403491973877, "learning_rate": 4.2515586845605864e-06, "loss": 0.0795, "mean_token_accuracy": 0.9750626444816589, "num_tokens": 84836849.0, "step": 10450 }, { "entropy": 0.17797043323516845, "epoch": 5.900112866817156, "grad_norm": 2.0211195945739746, "learning_rate": 4.250894252193806e-06, "loss": 0.0713, "mean_token_accuracy": 0.9775516271591187, "num_tokens": 84877674.0, "step": 10455 }, { "entropy": 0.1765431433916092, "epoch": 5.9029345372460496, "grad_norm": 2.351975440979004, "learning_rate": 4.250229598644954e-06, "loss": 0.0678, "mean_token_accuracy": 0.9786093235015869, "num_tokens": 84918343.0, "step": 10460 }, { "entropy": 0.18122887313365937, "epoch": 5.905756207674943, "grad_norm": 1.8683475255966187, "learning_rate": 4.249564724044602e-06, "loss": 0.072, "mean_token_accuracy": 0.9761911749839782, "num_tokens": 84958846.0, "step": 10465 }, { "entropy": 0.17490617036819459, "epoch": 5.908577878103838, "grad_norm": 1.9673949480056763, "learning_rate": 4.248899628523362e-06, "loss": 0.0763, "mean_token_accuracy": 0.9763616800308228, "num_tokens": 84999643.0, "step": 10470 }, { "entropy": 0.18483209908008574, "epoch": 5.9113995485327315, "grad_norm": 1.887024164199829, "learning_rate": 4.248234312211895e-06, "loss": 0.0688, "mean_token_accuracy": 0.978734838962555, "num_tokens": 85040465.0, "step": 10475 }, { "entropy": 0.19448497295379638, "epoch": 5.914221218961625, "grad_norm": 2.0865771770477295, "learning_rate": 4.247568775240901e-06, "loss": 0.0623, "mean_token_accuracy": 0.9806246161460876, "num_tokens": 85081222.0, "step": 10480 }, { "entropy": 0.1752565920352936, "epoch": 5.917042889390519, "grad_norm": 2.1277925968170166, "learning_rate": 4.246903017741124e-06, "loss": 0.0841, "mean_token_accuracy": 0.9734927892684937, "num_tokens": 85121900.0, "step": 10485 }, { "entropy": 0.18596406877040864, "epoch": 5.919864559819413, "grad_norm": 2.1438140869140625, "learning_rate": 4.246237039843355e-06, "loss": 0.0659, "mean_token_accuracy": 0.9792929530143738, "num_tokens": 85162395.0, "step": 10490 }, { "entropy": 0.1806576669216156, "epoch": 5.922686230248307, "grad_norm": 2.2096385955810547, "learning_rate": 4.2455708416784235e-06, "loss": 0.0719, "mean_token_accuracy": 0.977632212638855, "num_tokens": 85203275.0, "step": 10495 }, { "entropy": 0.20309909880161287, "epoch": 5.925507900677201, "grad_norm": 1.782074213027954, "learning_rate": 4.244904423377204e-06, "loss": 0.0736, "mean_token_accuracy": 0.9776381611824035, "num_tokens": 85243951.0, "step": 10500 }, { "epoch": 5.925507900677201, "eval_entropy": 0.2187255471944809, "eval_loss": 0.030472468584775925, "eval_mean_token_accuracy": 0.9911843538284302, "eval_num_tokens": 85243951.0, "eval_runtime": 0.1635, "eval_samples_per_second": 24.46, "eval_steps_per_second": 6.115, "step": 10500 }, { "entropy": 0.18639110922813415, "epoch": 5.928329571106095, "grad_norm": 2.001336097717285, "learning_rate": 4.244237785070615e-06, "loss": 0.0712, "mean_token_accuracy": 0.9782602667808533, "num_tokens": 85284411.0, "step": 10505 }, { "entropy": 0.18605276346206664, "epoch": 5.931151241534989, "grad_norm": 2.1870064735412598, "learning_rate": 4.243570926889618e-06, "loss": 0.0759, "mean_token_accuracy": 0.9759214639663696, "num_tokens": 85325253.0, "step": 10510 }, { "entropy": 0.17156625986099244, "epoch": 5.933972911963883, "grad_norm": 2.035531520843506, "learning_rate": 4.242903848965217e-06, "loss": 0.0656, "mean_token_accuracy": 0.9796592235565186, "num_tokens": 85365632.0, "step": 10515 }, { "entropy": 0.20702783465385438, "epoch": 5.936794582392777, "grad_norm": 2.6586201190948486, "learning_rate": 4.242236551428459e-06, "loss": 0.0684, "mean_token_accuracy": 0.9800093650817872, "num_tokens": 85405847.0, "step": 10520 }, { "entropy": 0.18220070600509644, "epoch": 5.93961625282167, "grad_norm": 2.27356219291687, "learning_rate": 4.241569034410436e-06, "loss": 0.0749, "mean_token_accuracy": 0.9759989261627198, "num_tokens": 85446548.0, "step": 10525 }, { "entropy": 0.17777936458587645, "epoch": 5.942437923250564, "grad_norm": 1.8858665227890015, "learning_rate": 4.24090129804228e-06, "loss": 0.0764, "mean_token_accuracy": 0.9763510704040528, "num_tokens": 85487166.0, "step": 10530 }, { "entropy": 0.19082893133163453, "epoch": 5.9452595936794586, "grad_norm": 2.3828372955322266, "learning_rate": 4.24023334245517e-06, "loss": 0.0726, "mean_token_accuracy": 0.9769440650939941, "num_tokens": 85527909.0, "step": 10535 }, { "entropy": 0.16436271667480468, "epoch": 5.948081264108352, "grad_norm": 2.2857789993286133, "learning_rate": 4.2395651677803244e-06, "loss": 0.0691, "mean_token_accuracy": 0.9776111841201782, "num_tokens": 85568713.0, "step": 10540 }, { "entropy": 0.19756191074848176, "epoch": 5.950902934537246, "grad_norm": 1.9905869960784912, "learning_rate": 4.238896774149007e-06, "loss": 0.0671, "mean_token_accuracy": 0.9784975528717041, "num_tokens": 85609278.0, "step": 10545 }, { "entropy": 0.17361958026885987, "epoch": 5.95372460496614, "grad_norm": 1.9324471950531006, "learning_rate": 4.2382281616925235e-06, "loss": 0.0715, "mean_token_accuracy": 0.9772069692611695, "num_tokens": 85649844.0, "step": 10550 }, { "entropy": 0.17714324295520784, "epoch": 5.956546275395034, "grad_norm": 2.4276859760284424, "learning_rate": 4.237559330542223e-06, "loss": 0.0732, "mean_token_accuracy": 0.9776072025299072, "num_tokens": 85690372.0, "step": 10555 }, { "entropy": 0.18243989944458008, "epoch": 5.959367945823928, "grad_norm": 2.2024104595184326, "learning_rate": 4.236890280829496e-06, "loss": 0.0758, "mean_token_accuracy": 0.9765413165092468, "num_tokens": 85730984.0, "step": 10560 }, { "entropy": 0.18148486614227294, "epoch": 5.962189616252822, "grad_norm": 2.123389720916748, "learning_rate": 4.236221012685781e-06, "loss": 0.0675, "mean_token_accuracy": 0.9791842937469483, "num_tokens": 85771776.0, "step": 10565 }, { "entropy": 0.19152417182922363, "epoch": 5.965011286681715, "grad_norm": 2.2187232971191406, "learning_rate": 4.235551526242552e-06, "loss": 0.0727, "mean_token_accuracy": 0.9772362232208252, "num_tokens": 85812366.0, "step": 10570 }, { "entropy": 0.18775533139705658, "epoch": 5.967832957110609, "grad_norm": 2.338989496231079, "learning_rate": 4.234881821631332e-06, "loss": 0.0823, "mean_token_accuracy": 0.974126398563385, "num_tokens": 85851986.0, "step": 10575 }, { "entropy": 0.19790259897708892, "epoch": 5.970654627539504, "grad_norm": 1.8901110887527466, "learning_rate": 4.234211898983684e-06, "loss": 0.0742, "mean_token_accuracy": 0.976418399810791, "num_tokens": 85892394.0, "step": 10580 }, { "entropy": 0.18636341392993927, "epoch": 5.973476297968397, "grad_norm": 2.382916212081909, "learning_rate": 4.233541758431213e-06, "loss": 0.083, "mean_token_accuracy": 0.9738220572471619, "num_tokens": 85932953.0, "step": 10585 }, { "entropy": 0.16605794429779053, "epoch": 5.976297968397291, "grad_norm": 1.9576960802078247, "learning_rate": 4.232871400105572e-06, "loss": 0.0698, "mean_token_accuracy": 0.9788704752922058, "num_tokens": 85973329.0, "step": 10590 }, { "entropy": 0.16949787735939026, "epoch": 5.979119638826186, "grad_norm": 1.836525559425354, "learning_rate": 4.232200824138448e-06, "loss": 0.0688, "mean_token_accuracy": 0.9781976103782654, "num_tokens": 86013764.0, "step": 10595 }, { "entropy": 0.17068464457988738, "epoch": 5.981941309255079, "grad_norm": 1.9783167839050293, "learning_rate": 4.231530030661579e-06, "loss": 0.0685, "mean_token_accuracy": 0.9786436915397644, "num_tokens": 86054432.0, "step": 10600 }, { "entropy": 0.18707202672958373, "epoch": 5.984762979683973, "grad_norm": 1.955723524093628, "learning_rate": 4.230859019806741e-06, "loss": 0.06, "mean_token_accuracy": 0.9814386010169983, "num_tokens": 86095074.0, "step": 10605 }, { "entropy": 0.19427139759063722, "epoch": 5.987584650112867, "grad_norm": 2.150771141052246, "learning_rate": 4.230187791705756e-06, "loss": 0.0887, "mean_token_accuracy": 0.9718084454536438, "num_tokens": 86135924.0, "step": 10610 }, { "entropy": 0.18580053150653839, "epoch": 5.99040632054176, "grad_norm": 1.954380989074707, "learning_rate": 4.229516346490485e-06, "loss": 0.0782, "mean_token_accuracy": 0.9761150360107422, "num_tokens": 86176503.0, "step": 10615 }, { "entropy": 0.18900538682937623, "epoch": 5.993227990970655, "grad_norm": 2.2284088134765625, "learning_rate": 4.2288446842928345e-06, "loss": 0.0784, "mean_token_accuracy": 0.9764190793037415, "num_tokens": 86217113.0, "step": 10620 }, { "entropy": 0.18157367706298827, "epoch": 5.996049661399549, "grad_norm": 2.151667833328247, "learning_rate": 4.228172805244753e-06, "loss": 0.0793, "mean_token_accuracy": 0.9748581528663636, "num_tokens": 86257786.0, "step": 10625 }, { "entropy": 0.19717211425304412, "epoch": 5.998871331828442, "grad_norm": 2.68282413482666, "learning_rate": 4.227500709478229e-06, "loss": 0.0845, "mean_token_accuracy": 0.9748492002487182, "num_tokens": 86298375.0, "step": 10630 }, { "entropy": 0.16193274557590484, "epoch": 6.001693002257336, "grad_norm": 1.6442623138427734, "learning_rate": 4.226828397125298e-06, "loss": 0.0563, "mean_token_accuracy": 0.9836606025695801, "num_tokens": 86333142.0, "step": 10635 }, { "entropy": 0.15360802710056304, "epoch": 6.004514672686231, "grad_norm": 1.2552261352539062, "learning_rate": 4.226155868318035e-06, "loss": 0.0358, "mean_token_accuracy": 0.9905863881111145, "num_tokens": 86373661.0, "step": 10640 }, { "entropy": 0.15118006765842437, "epoch": 6.007336343115124, "grad_norm": 2.012360095977783, "learning_rate": 4.225483123188559e-06, "loss": 0.0349, "mean_token_accuracy": 0.990486478805542, "num_tokens": 86414362.0, "step": 10645 }, { "entropy": 0.13885512351989746, "epoch": 6.010158013544018, "grad_norm": 2.042280435562134, "learning_rate": 4.224810161869029e-06, "loss": 0.0329, "mean_token_accuracy": 0.990878415107727, "num_tokens": 86454954.0, "step": 10650 }, { "entropy": 0.1436757594347, "epoch": 6.012979683972912, "grad_norm": 1.7394871711730957, "learning_rate": 4.224136984491651e-06, "loss": 0.0358, "mean_token_accuracy": 0.9893459677696228, "num_tokens": 86495647.0, "step": 10655 }, { "entropy": 0.1538625791668892, "epoch": 6.015801354401806, "grad_norm": 1.6942013502120972, "learning_rate": 4.22346359118867e-06, "loss": 0.0335, "mean_token_accuracy": 0.9908841848373413, "num_tokens": 86536010.0, "step": 10660 }, { "entropy": 0.1632068932056427, "epoch": 6.0186230248307, "grad_norm": 2.3148810863494873, "learning_rate": 4.222789982092373e-06, "loss": 0.0313, "mean_token_accuracy": 0.9914784550666809, "num_tokens": 86576406.0, "step": 10665 }, { "entropy": 0.1600690007209778, "epoch": 6.021444695259594, "grad_norm": 2.032419443130493, "learning_rate": 4.222116157335091e-06, "loss": 0.0394, "mean_token_accuracy": 0.9886528730392456, "num_tokens": 86616837.0, "step": 10670 }, { "entropy": 0.13843523859977722, "epoch": 6.024266365688487, "grad_norm": 2.1894493103027344, "learning_rate": 4.2214421170491975e-06, "loss": 0.0281, "mean_token_accuracy": 0.9916700005531311, "num_tokens": 86657356.0, "step": 10675 }, { "entropy": 0.1451241284608841, "epoch": 6.027088036117381, "grad_norm": 1.5372796058654785, "learning_rate": 4.220767861367108e-06, "loss": 0.032, "mean_token_accuracy": 0.9909778356552124, "num_tokens": 86697851.0, "step": 10680 }, { "entropy": 0.1586732506752014, "epoch": 6.029909706546276, "grad_norm": 1.9109724760055542, "learning_rate": 4.220093390421279e-06, "loss": 0.0396, "mean_token_accuracy": 0.9885246515274048, "num_tokens": 86738396.0, "step": 10685 }, { "entropy": 0.14336285442113877, "epoch": 6.032731376975169, "grad_norm": 1.615535855293274, "learning_rate": 4.219418704344211e-06, "loss": 0.0323, "mean_token_accuracy": 0.9910237193107605, "num_tokens": 86779215.0, "step": 10690 }, { "entropy": 0.15025009214878082, "epoch": 6.035553047404063, "grad_norm": 2.157390594482422, "learning_rate": 4.218743803268447e-06, "loss": 0.0399, "mean_token_accuracy": 0.9880735993385314, "num_tokens": 86819698.0, "step": 10695 }, { "entropy": 0.15084101557731627, "epoch": 6.038374717832957, "grad_norm": 1.7998473644256592, "learning_rate": 4.218068687326571e-06, "loss": 0.0354, "mean_token_accuracy": 0.9899451732635498, "num_tokens": 86860397.0, "step": 10700 }, { "entropy": 0.14166499078273773, "epoch": 6.041196388261851, "grad_norm": 1.578946590423584, "learning_rate": 4.217393356651208e-06, "loss": 0.0367, "mean_token_accuracy": 0.9898417115211486, "num_tokens": 86901077.0, "step": 10705 }, { "entropy": 0.14319468885660172, "epoch": 6.044018058690745, "grad_norm": 1.6669219732284546, "learning_rate": 4.216717811375028e-06, "loss": 0.0345, "mean_token_accuracy": 0.9898629069328309, "num_tokens": 86941792.0, "step": 10710 }, { "entropy": 0.13536919057369232, "epoch": 6.046839729119639, "grad_norm": 1.7521120309829712, "learning_rate": 4.216042051630743e-06, "loss": 0.0326, "mean_token_accuracy": 0.9901854276657105, "num_tokens": 86982279.0, "step": 10715 }, { "entropy": 0.1333491548895836, "epoch": 6.049661399548532, "grad_norm": 1.9163808822631836, "learning_rate": 4.215366077551105e-06, "loss": 0.0341, "mean_token_accuracy": 0.9900972962379455, "num_tokens": 87023083.0, "step": 10720 }, { "entropy": 0.15708822011947632, "epoch": 6.052483069977427, "grad_norm": 2.149850368499756, "learning_rate": 4.21468988926891e-06, "loss": 0.0382, "mean_token_accuracy": 0.9889817953109741, "num_tokens": 87063613.0, "step": 10725 }, { "entropy": 0.14288501143455506, "epoch": 6.055304740406321, "grad_norm": 1.6220556497573853, "learning_rate": 4.2140134869169934e-06, "loss": 0.0397, "mean_token_accuracy": 0.9880726933479309, "num_tokens": 87104292.0, "step": 10730 }, { "entropy": 0.13960061967372894, "epoch": 6.058126410835214, "grad_norm": 1.8582446575164795, "learning_rate": 4.213336870628236e-06, "loss": 0.0337, "mean_token_accuracy": 0.9908414721488953, "num_tokens": 87144991.0, "step": 10735 }, { "entropy": 0.1556618928909302, "epoch": 6.060948081264108, "grad_norm": 1.7569464445114136, "learning_rate": 4.21266004053556e-06, "loss": 0.035, "mean_token_accuracy": 0.9900591254234314, "num_tokens": 87185620.0, "step": 10740 }, { "entropy": 0.15336497128009796, "epoch": 6.063769751693002, "grad_norm": 1.2546908855438232, "learning_rate": 4.211982996771926e-06, "loss": 0.0348, "mean_token_accuracy": 0.9904587507247925, "num_tokens": 87226351.0, "step": 10745 }, { "entropy": 0.1377393662929535, "epoch": 6.066591422121896, "grad_norm": 1.875976800918579, "learning_rate": 4.211305739470342e-06, "loss": 0.0305, "mean_token_accuracy": 0.9914319157600403, "num_tokens": 87267095.0, "step": 10750 }, { "entropy": 0.1543271839618683, "epoch": 6.06941309255079, "grad_norm": 1.8693065643310547, "learning_rate": 4.210628268763854e-06, "loss": 0.0337, "mean_token_accuracy": 0.9902760028839112, "num_tokens": 87307784.0, "step": 10755 }, { "entropy": 0.14306709170341492, "epoch": 6.072234762979684, "grad_norm": 2.66241717338562, "learning_rate": 4.209950584785552e-06, "loss": 0.0307, "mean_token_accuracy": 0.9919659614562988, "num_tokens": 87348340.0, "step": 10760 }, { "entropy": 0.14404877722263337, "epoch": 6.0750564334085775, "grad_norm": 1.838407278060913, "learning_rate": 4.209272687668565e-06, "loss": 0.0314, "mean_token_accuracy": 0.9915964126586914, "num_tokens": 87389066.0, "step": 10765 }, { "entropy": 0.15369834899902343, "epoch": 6.077878103837472, "grad_norm": 1.1812386512756348, "learning_rate": 4.2085945775460685e-06, "loss": 0.0294, "mean_token_accuracy": 0.9916245102882385, "num_tokens": 87429380.0, "step": 10770 }, { "entropy": 0.15424088537693023, "epoch": 6.080699774266366, "grad_norm": 1.5837703943252563, "learning_rate": 4.207916254551276e-06, "loss": 0.0328, "mean_token_accuracy": 0.9913092613220215, "num_tokens": 87470021.0, "step": 10775 }, { "entropy": 0.1431649297475815, "epoch": 6.0835214446952595, "grad_norm": 1.973551630973816, "learning_rate": 4.207237718817446e-06, "loss": 0.0312, "mean_token_accuracy": 0.9908066749572754, "num_tokens": 87510700.0, "step": 10780 }, { "entropy": 0.14944909512996674, "epoch": 6.086343115124153, "grad_norm": 1.8268595933914185, "learning_rate": 4.2065589704778745e-06, "loss": 0.0325, "mean_token_accuracy": 0.9908172607421875, "num_tokens": 87551408.0, "step": 10785 }, { "entropy": 0.1600016713142395, "epoch": 6.089164785553048, "grad_norm": 1.7882065773010254, "learning_rate": 4.205880009665902e-06, "loss": 0.0378, "mean_token_accuracy": 0.9892760753631592, "num_tokens": 87592139.0, "step": 10790 }, { "entropy": 0.14885053634643555, "epoch": 6.091986455981941, "grad_norm": 1.7751688957214355, "learning_rate": 4.205200836514912e-06, "loss": 0.0372, "mean_token_accuracy": 0.9887823104858399, "num_tokens": 87632768.0, "step": 10795 }, { "entropy": 0.14843485057353972, "epoch": 6.094808126410835, "grad_norm": 1.5204575061798096, "learning_rate": 4.204521451158327e-06, "loss": 0.0345, "mean_token_accuracy": 0.9894916415214539, "num_tokens": 87673525.0, "step": 10800 }, { "entropy": 0.13986097276210785, "epoch": 6.097629796839729, "grad_norm": 1.9457536935806274, "learning_rate": 4.2038418537296126e-06, "loss": 0.032, "mean_token_accuracy": 0.9908498883247375, "num_tokens": 87714086.0, "step": 10805 }, { "entropy": 0.14706320762634278, "epoch": 6.100451467268623, "grad_norm": 1.7336926460266113, "learning_rate": 4.203162044362276e-06, "loss": 0.0347, "mean_token_accuracy": 0.9906623005867005, "num_tokens": 87753943.0, "step": 10810 }, { "entropy": 0.1470971792936325, "epoch": 6.103273137697517, "grad_norm": 2.0381176471710205, "learning_rate": 4.2024820231898655e-06, "loss": 0.0328, "mean_token_accuracy": 0.9907041072845459, "num_tokens": 87794673.0, "step": 10815 }, { "entropy": 0.1387518674135208, "epoch": 6.106094808126411, "grad_norm": 1.6845587491989136, "learning_rate": 4.201801790345971e-06, "loss": 0.0372, "mean_token_accuracy": 0.9895907640457153, "num_tokens": 87835382.0, "step": 10820 }, { "entropy": 0.14503337144851686, "epoch": 6.1089164785553045, "grad_norm": 1.6855789422988892, "learning_rate": 4.201121345964225e-06, "loss": 0.0334, "mean_token_accuracy": 0.9907157897949219, "num_tokens": 87876038.0, "step": 10825 }, { "entropy": 0.13844524025917054, "epoch": 6.111738148984198, "grad_norm": 1.9881229400634766, "learning_rate": 4.200440690178301e-06, "loss": 0.0339, "mean_token_accuracy": 0.9900532841682435, "num_tokens": 87916705.0, "step": 10830 }, { "entropy": 0.15506267845630645, "epoch": 6.114559819413093, "grad_norm": 1.9544332027435303, "learning_rate": 4.199759823121914e-06, "loss": 0.0346, "mean_token_accuracy": 0.9900209069252014, "num_tokens": 87957566.0, "step": 10835 }, { "entropy": 0.16678837686777115, "epoch": 6.1173814898419865, "grad_norm": 1.6686840057373047, "learning_rate": 4.199078744928819e-06, "loss": 0.0337, "mean_token_accuracy": 0.9909396409988404, "num_tokens": 87997564.0, "step": 10840 }, { "entropy": 0.14651824533939362, "epoch": 6.12020316027088, "grad_norm": 1.7264286279678345, "learning_rate": 4.198397455732816e-06, "loss": 0.0373, "mean_token_accuracy": 0.9898837804794312, "num_tokens": 88037998.0, "step": 10845 }, { "entropy": 0.15673071444034575, "epoch": 6.123024830699774, "grad_norm": 1.6216492652893066, "learning_rate": 4.197715955667742e-06, "loss": 0.0348, "mean_token_accuracy": 0.9899758458137512, "num_tokens": 88078569.0, "step": 10850 }, { "entropy": 0.1553318828344345, "epoch": 6.1258465011286685, "grad_norm": 2.029632329940796, "learning_rate": 4.19703424486748e-06, "loss": 0.0277, "mean_token_accuracy": 0.9923609852790832, "num_tokens": 88118867.0, "step": 10855 }, { "entropy": 0.13455870598554612, "epoch": 6.128668171557562, "grad_norm": 1.8276398181915283, "learning_rate": 4.196352323465951e-06, "loss": 0.0256, "mean_token_accuracy": 0.9924584746360778, "num_tokens": 88159338.0, "step": 10860 }, { "entropy": 0.13744568973779678, "epoch": 6.131489841986456, "grad_norm": 2.144724130630493, "learning_rate": 4.1956701915971196e-06, "loss": 0.0358, "mean_token_accuracy": 0.9901370286941529, "num_tokens": 88199943.0, "step": 10865 }, { "entropy": 0.1540704548358917, "epoch": 6.1343115124153496, "grad_norm": 2.058373212814331, "learning_rate": 4.194987849394988e-06, "loss": 0.0333, "mean_token_accuracy": 0.9897918820381164, "num_tokens": 88240426.0, "step": 10870 }, { "entropy": 0.14623642861843109, "epoch": 6.137133182844244, "grad_norm": 2.4471840858459473, "learning_rate": 4.194305296993606e-06, "loss": 0.0359, "mean_token_accuracy": 0.9897229671478271, "num_tokens": 88281095.0, "step": 10875 }, { "entropy": 0.1303885832428932, "epoch": 6.139954853273138, "grad_norm": 1.7725268602371216, "learning_rate": 4.193622534527058e-06, "loss": 0.0353, "mean_token_accuracy": 0.9896966218948364, "num_tokens": 88321683.0, "step": 10880 }, { "entropy": 0.13377143442630768, "epoch": 6.1427765237020315, "grad_norm": 2.0175435543060303, "learning_rate": 4.192939562129476e-06, "loss": 0.0306, "mean_token_accuracy": 0.9910183906555176, "num_tokens": 88362423.0, "step": 10885 }, { "entropy": 0.13516291081905366, "epoch": 6.145598194130925, "grad_norm": 2.2618424892425537, "learning_rate": 4.192256379935027e-06, "loss": 0.0277, "mean_token_accuracy": 0.9920958876609802, "num_tokens": 88402934.0, "step": 10890 }, { "entropy": 0.14001532047986984, "epoch": 6.14841986455982, "grad_norm": 1.8463703393936157, "learning_rate": 4.191572988077924e-06, "loss": 0.0333, "mean_token_accuracy": 0.9904056191444397, "num_tokens": 88443748.0, "step": 10895 }, { "entropy": 0.12817184776067733, "epoch": 6.1512415349887135, "grad_norm": 1.4544408321380615, "learning_rate": 4.190889386692418e-06, "loss": 0.0263, "mean_token_accuracy": 0.9931593537330627, "num_tokens": 88484016.0, "step": 10900 }, { "entropy": 0.14389062374830247, "epoch": 6.154063205417607, "grad_norm": 1.9694024324417114, "learning_rate": 4.190205575912804e-06, "loss": 0.0353, "mean_token_accuracy": 0.9898261427879333, "num_tokens": 88524836.0, "step": 10905 }, { "entropy": 0.14219014048576356, "epoch": 6.156884875846501, "grad_norm": 2.0647833347320557, "learning_rate": 4.189521555873416e-06, "loss": 0.035, "mean_token_accuracy": 0.9893984079360962, "num_tokens": 88565548.0, "step": 10910 }, { "entropy": 0.14767142832279206, "epoch": 6.159706546275395, "grad_norm": 1.9720888137817383, "learning_rate": 4.18883732670863e-06, "loss": 0.0334, "mean_token_accuracy": 0.9906517505645752, "num_tokens": 88605978.0, "step": 10915 }, { "entropy": 0.14821238815784454, "epoch": 6.162528216704289, "grad_norm": 2.0277600288391113, "learning_rate": 4.188152888552864e-06, "loss": 0.0366, "mean_token_accuracy": 0.9893009543418885, "num_tokens": 88646706.0, "step": 10920 }, { "entropy": 0.15523377060890198, "epoch": 6.165349887133183, "grad_norm": 1.6985244750976562, "learning_rate": 4.1874682415405735e-06, "loss": 0.0361, "mean_token_accuracy": 0.9894434928894043, "num_tokens": 88687198.0, "step": 10925 }, { "entropy": 0.15274632275104522, "epoch": 6.168171557562077, "grad_norm": 1.9225730895996094, "learning_rate": 4.186783385806259e-06, "loss": 0.0328, "mean_token_accuracy": 0.9911539673805236, "num_tokens": 88727828.0, "step": 10930 }, { "entropy": 0.13707538247108458, "epoch": 6.17099322799097, "grad_norm": 1.9640833139419556, "learning_rate": 4.186098321484459e-06, "loss": 0.0353, "mean_token_accuracy": 0.9896294713020325, "num_tokens": 88768488.0, "step": 10935 }, { "entropy": 0.14530183970928193, "epoch": 6.173814898419865, "grad_norm": 1.6039245128631592, "learning_rate": 4.185413048709757e-06, "loss": 0.0314, "mean_token_accuracy": 0.9907344460487366, "num_tokens": 88809035.0, "step": 10940 }, { "entropy": 0.14951497465372085, "epoch": 6.176636568848759, "grad_norm": 1.7432762384414673, "learning_rate": 4.184727567616775e-06, "loss": 0.0387, "mean_token_accuracy": 0.9889516711235047, "num_tokens": 88849740.0, "step": 10945 }, { "entropy": 0.14462248533964156, "epoch": 6.179458239277652, "grad_norm": 1.5465824604034424, "learning_rate": 4.184041878340174e-06, "loss": 0.0338, "mean_token_accuracy": 0.990413224697113, "num_tokens": 88890461.0, "step": 10950 }, { "entropy": 0.1421406477689743, "epoch": 6.182279909706546, "grad_norm": 2.111994743347168, "learning_rate": 4.183355981014658e-06, "loss": 0.0405, "mean_token_accuracy": 0.9883662700653076, "num_tokens": 88931166.0, "step": 10955 }, { "entropy": 0.1487989455461502, "epoch": 6.1851015801354405, "grad_norm": 2.039963722229004, "learning_rate": 4.1826698757749715e-06, "loss": 0.0361, "mean_token_accuracy": 0.989516270160675, "num_tokens": 88971798.0, "step": 10960 }, { "entropy": 0.14600317776203156, "epoch": 6.187923250564334, "grad_norm": 1.9209233522415161, "learning_rate": 4.1819835627559e-06, "loss": 0.0341, "mean_token_accuracy": 0.9893284320831299, "num_tokens": 89012519.0, "step": 10965 }, { "entropy": 0.1442936509847641, "epoch": 6.190744920993228, "grad_norm": 1.5550873279571533, "learning_rate": 4.1812970420922725e-06, "loss": 0.0373, "mean_token_accuracy": 0.9887455940246582, "num_tokens": 89053198.0, "step": 10970 }, { "entropy": 0.15094742476940154, "epoch": 6.193566591422122, "grad_norm": 1.8063254356384277, "learning_rate": 4.180610313918952e-06, "loss": 0.0401, "mean_token_accuracy": 0.9881560444831848, "num_tokens": 89093860.0, "step": 10975 }, { "entropy": 0.15021807849407195, "epoch": 6.196388261851016, "grad_norm": 2.1231496334075928, "learning_rate": 4.1799233783708474e-06, "loss": 0.0369, "mean_token_accuracy": 0.9901463985443115, "num_tokens": 89134320.0, "step": 10980 }, { "entropy": 0.1368527665734291, "epoch": 6.19920993227991, "grad_norm": 2.1702585220336914, "learning_rate": 4.1792362355829094e-06, "loss": 0.0336, "mean_token_accuracy": 0.9903720498085022, "num_tokens": 89175018.0, "step": 10985 }, { "entropy": 0.15346645712852477, "epoch": 6.202031602708804, "grad_norm": 1.8872740268707275, "learning_rate": 4.178548885690126e-06, "loss": 0.0384, "mean_token_accuracy": 0.9887521862983704, "num_tokens": 89215458.0, "step": 10990 }, { "entropy": 0.14233822673559188, "epoch": 6.204853273137697, "grad_norm": 1.8049365282058716, "learning_rate": 4.177861328827526e-06, "loss": 0.0391, "mean_token_accuracy": 0.9878924250602722, "num_tokens": 89255912.0, "step": 10995 }, { "entropy": 0.14570232629776, "epoch": 6.207674943566591, "grad_norm": 1.8370901346206665, "learning_rate": 4.1771735651301815e-06, "loss": 0.0314, "mean_token_accuracy": 0.9908752083778382, "num_tokens": 89296589.0, "step": 11000 }, { "epoch": 6.207674943566591, "eval_entropy": 0.20263180136680603, "eval_loss": 0.02532457746565342, "eval_mean_token_accuracy": 0.9938673973083496, "eval_num_tokens": 89296589.0, "eval_runtime": 0.1641, "eval_samples_per_second": 24.369, "eval_steps_per_second": 6.092, "step": 11000 }, { "entropy": 0.14294780194759368, "epoch": 6.210496613995486, "grad_norm": 1.8399924039840698, "learning_rate": 4.176485594733203e-06, "loss": 0.037, "mean_token_accuracy": 0.9889877915382386, "num_tokens": 89337302.0, "step": 11005 }, { "entropy": 0.14533422291278839, "epoch": 6.213318284424379, "grad_norm": 1.7887989282608032, "learning_rate": 4.175797417771744e-06, "loss": 0.0325, "mean_token_accuracy": 0.9902246236801148, "num_tokens": 89377799.0, "step": 11010 }, { "entropy": 0.1417243927717209, "epoch": 6.216139954853273, "grad_norm": 1.6449092626571655, "learning_rate": 4.175109034380994e-06, "loss": 0.0311, "mean_token_accuracy": 0.9912659168243408, "num_tokens": 89418429.0, "step": 11015 }, { "entropy": 0.14027564078569413, "epoch": 6.218961625282167, "grad_norm": 1.5085475444793701, "learning_rate": 4.1744204446961885e-06, "loss": 0.031, "mean_token_accuracy": 0.9910840392112732, "num_tokens": 89458957.0, "step": 11020 }, { "entropy": 0.1335515111684799, "epoch": 6.221783295711061, "grad_norm": 1.927368402481079, "learning_rate": 4.1737316488526005e-06, "loss": 0.0369, "mean_token_accuracy": 0.9889877438545227, "num_tokens": 89499568.0, "step": 11025 }, { "entropy": 0.14035135358572007, "epoch": 6.224604966139955, "grad_norm": 1.654311180114746, "learning_rate": 4.173042646985544e-06, "loss": 0.0325, "mean_token_accuracy": 0.9901265263557434, "num_tokens": 89540184.0, "step": 11030 }, { "entropy": 0.14204625636339188, "epoch": 6.227426636568849, "grad_norm": 1.987782597541809, "learning_rate": 4.172353439230372e-06, "loss": 0.0328, "mean_token_accuracy": 0.9901327252388, "num_tokens": 89580911.0, "step": 11035 }, { "entropy": 0.15092624723911285, "epoch": 6.230248306997742, "grad_norm": 1.7647333145141602, "learning_rate": 4.1716640257224815e-06, "loss": 0.0374, "mean_token_accuracy": 0.9883796095848083, "num_tokens": 89620532.0, "step": 11040 }, { "entropy": 0.15512611269950866, "epoch": 6.233069977426637, "grad_norm": 1.8042088747024536, "learning_rate": 4.170974406597307e-06, "loss": 0.0377, "mean_token_accuracy": 0.9896431684494018, "num_tokens": 89661252.0, "step": 11045 }, { "entropy": 0.13535745441913605, "epoch": 6.235891647855531, "grad_norm": 2.0197017192840576, "learning_rate": 4.170284581990325e-06, "loss": 0.0321, "mean_token_accuracy": 0.9908074617385865, "num_tokens": 89702049.0, "step": 11050 }, { "entropy": 0.14527721405029298, "epoch": 6.238713318284424, "grad_norm": 2.0421767234802246, "learning_rate": 4.1695945520370505e-06, "loss": 0.0355, "mean_token_accuracy": 0.9893774628639221, "num_tokens": 89742588.0, "step": 11055 }, { "entropy": 0.159025439620018, "epoch": 6.241534988713318, "grad_norm": 2.0698342323303223, "learning_rate": 4.16890431687304e-06, "loss": 0.0317, "mean_token_accuracy": 0.9911215782165528, "num_tokens": 89783240.0, "step": 11060 }, { "entropy": 0.15658071339130403, "epoch": 6.244356659142213, "grad_norm": 1.8461730480194092, "learning_rate": 4.168213876633891e-06, "loss": 0.0354, "mean_token_accuracy": 0.988905155658722, "num_tokens": 89823822.0, "step": 11065 }, { "entropy": 0.13386445641517639, "epoch": 6.247178329571106, "grad_norm": 2.02535343170166, "learning_rate": 4.167523231455241e-06, "loss": 0.0353, "mean_token_accuracy": 0.9896762609481812, "num_tokens": 89864500.0, "step": 11070 }, { "entropy": 0.14109272211790086, "epoch": 6.25, "grad_norm": 2.1409685611724854, "learning_rate": 4.166832381472766e-06, "loss": 0.0341, "mean_token_accuracy": 0.9898652195930481, "num_tokens": 89905040.0, "step": 11075 }, { "entropy": 0.1469561368227005, "epoch": 6.252821670428894, "grad_norm": 1.8599631786346436, "learning_rate": 4.166141326822184e-06, "loss": 0.0307, "mean_token_accuracy": 0.9907646656036377, "num_tokens": 89945211.0, "step": 11080 }, { "entropy": 0.13446701616048812, "epoch": 6.255643340857787, "grad_norm": 1.5020068883895874, "learning_rate": 4.165450067639254e-06, "loss": 0.0366, "mean_token_accuracy": 0.9891453385353088, "num_tokens": 89985851.0, "step": 11085 }, { "entropy": 0.14148662984371185, "epoch": 6.258465011286682, "grad_norm": 2.185192346572876, "learning_rate": 4.164758604059772e-06, "loss": 0.0323, "mean_token_accuracy": 0.9905752301216125, "num_tokens": 90026635.0, "step": 11090 }, { "entropy": 0.13741403222084045, "epoch": 6.261286681715576, "grad_norm": 1.7420563697814941, "learning_rate": 4.164066936219577e-06, "loss": 0.0321, "mean_token_accuracy": 0.9903434753417969, "num_tokens": 90066598.0, "step": 11095 }, { "entropy": 0.14443300664424896, "epoch": 6.264108352144469, "grad_norm": 1.9935277700424194, "learning_rate": 4.163375064254549e-06, "loss": 0.041, "mean_token_accuracy": 0.988189947605133, "num_tokens": 90107478.0, "step": 11100 }, { "entropy": 0.1385072499513626, "epoch": 6.266930022573363, "grad_norm": 2.011894702911377, "learning_rate": 4.162682988300602e-06, "loss": 0.0394, "mean_token_accuracy": 0.9881589770317077, "num_tokens": 90148095.0, "step": 11105 }, { "entropy": 0.15410477817058563, "epoch": 6.269751693002258, "grad_norm": 1.801830530166626, "learning_rate": 4.1619907084937e-06, "loss": 0.0365, "mean_token_accuracy": 0.9895232796669007, "num_tokens": 90188661.0, "step": 11110 }, { "entropy": 0.1497156322002411, "epoch": 6.272573363431151, "grad_norm": 2.2699999809265137, "learning_rate": 4.161298224969836e-06, "loss": 0.0365, "mean_token_accuracy": 0.9894330143928528, "num_tokens": 90229364.0, "step": 11115 }, { "entropy": 0.14462217986583709, "epoch": 6.275395033860045, "grad_norm": 2.064868450164795, "learning_rate": 4.1606055378650516e-06, "loss": 0.034, "mean_token_accuracy": 0.9895681977272034, "num_tokens": 90270064.0, "step": 11120 }, { "entropy": 0.14862312972545624, "epoch": 6.278216704288939, "grad_norm": 1.742047905921936, "learning_rate": 4.159912647315425e-06, "loss": 0.0354, "mean_token_accuracy": 0.9892676591873169, "num_tokens": 90310329.0, "step": 11125 }, { "entropy": 0.14780815839767455, "epoch": 6.281038374717833, "grad_norm": 1.733389139175415, "learning_rate": 4.159219553457074e-06, "loss": 0.0338, "mean_token_accuracy": 0.9897500872612, "num_tokens": 90351138.0, "step": 11130 }, { "entropy": 0.13568034768104553, "epoch": 6.283860045146727, "grad_norm": 1.8001052141189575, "learning_rate": 4.158526256426158e-06, "loss": 0.0333, "mean_token_accuracy": 0.9904277682304382, "num_tokens": 90391910.0, "step": 11135 }, { "entropy": 0.14421828389167785, "epoch": 6.286681715575621, "grad_norm": 2.015634298324585, "learning_rate": 4.157832756358874e-06, "loss": 0.0406, "mean_token_accuracy": 0.9873695731163025, "num_tokens": 90432746.0, "step": 11140 }, { "entropy": 0.14020881354808806, "epoch": 6.289503386004514, "grad_norm": 2.3647797107696533, "learning_rate": 4.157139053391461e-06, "loss": 0.0362, "mean_token_accuracy": 0.9892059206962586, "num_tokens": 90473415.0, "step": 11145 }, { "entropy": 0.1497344046831131, "epoch": 6.292325056433409, "grad_norm": 2.052783727645874, "learning_rate": 4.156445147660197e-06, "loss": 0.0382, "mean_token_accuracy": 0.9883453965187072, "num_tokens": 90514134.0, "step": 11150 }, { "entropy": 0.14050848484039308, "epoch": 6.295146726862303, "grad_norm": 1.6089463233947754, "learning_rate": 4.1557510393014e-06, "loss": 0.0388, "mean_token_accuracy": 0.9887239336967468, "num_tokens": 90554962.0, "step": 11155 }, { "entropy": 0.14407500624656677, "epoch": 6.297968397291196, "grad_norm": 2.434351682662964, "learning_rate": 4.155056728451426e-06, "loss": 0.0376, "mean_token_accuracy": 0.9894903421401977, "num_tokens": 90595436.0, "step": 11160 }, { "entropy": 0.13423256427049637, "epoch": 6.30079006772009, "grad_norm": 2.1055705547332764, "learning_rate": 4.154362215246675e-06, "loss": 0.0352, "mean_token_accuracy": 0.9897322297096253, "num_tokens": 90636082.0, "step": 11165 }, { "entropy": 0.1400793805718422, "epoch": 6.303611738148984, "grad_norm": 2.3050198554992676, "learning_rate": 4.1536674998235825e-06, "loss": 0.0382, "mean_token_accuracy": 0.9891103863716125, "num_tokens": 90676676.0, "step": 11170 }, { "entropy": 0.1425952732563019, "epoch": 6.306433408577878, "grad_norm": 2.2005064487457275, "learning_rate": 4.152972582318626e-06, "loss": 0.0357, "mean_token_accuracy": 0.9899725079536438, "num_tokens": 90717384.0, "step": 11175 }, { "entropy": 0.1526680827140808, "epoch": 6.309255079006772, "grad_norm": 1.6304471492767334, "learning_rate": 4.152277462868321e-06, "loss": 0.0371, "mean_token_accuracy": 0.9893090009689331, "num_tokens": 90757930.0, "step": 11180 }, { "entropy": 0.14085469841957093, "epoch": 6.312076749435666, "grad_norm": 1.5589280128479004, "learning_rate": 4.1515821416092264e-06, "loss": 0.0282, "mean_token_accuracy": 0.9919806241989135, "num_tokens": 90798800.0, "step": 11185 }, { "entropy": 0.13670676797628403, "epoch": 6.3148984198645595, "grad_norm": 1.8319333791732788, "learning_rate": 4.150886618677936e-06, "loss": 0.0367, "mean_token_accuracy": 0.9886664271354675, "num_tokens": 90838684.0, "step": 11190 }, { "entropy": 0.16441910862922668, "epoch": 6.317720090293454, "grad_norm": 2.1681110858917236, "learning_rate": 4.150190894211087e-06, "loss": 0.0416, "mean_token_accuracy": 0.9879634499549865, "num_tokens": 90879475.0, "step": 11195 }, { "entropy": 0.15294736325740815, "epoch": 6.320541760722348, "grad_norm": 1.9503849744796753, "learning_rate": 4.1494949683453525e-06, "loss": 0.033, "mean_token_accuracy": 0.990665853023529, "num_tokens": 90920247.0, "step": 11200 }, { "entropy": 0.13768713921308517, "epoch": 6.323363431151241, "grad_norm": 1.801827311515808, "learning_rate": 4.148798841217448e-06, "loss": 0.036, "mean_token_accuracy": 0.9888361096382141, "num_tokens": 90960908.0, "step": 11205 }, { "entropy": 0.1548486292362213, "epoch": 6.326185101580135, "grad_norm": 2.232868194580078, "learning_rate": 4.148102512964129e-06, "loss": 0.0361, "mean_token_accuracy": 0.9894163966178894, "num_tokens": 91001543.0, "step": 11210 }, { "entropy": 0.13173782229423522, "epoch": 6.32900677200903, "grad_norm": 1.801491141319275, "learning_rate": 4.1474059837221884e-06, "loss": 0.0347, "mean_token_accuracy": 0.9903063178062439, "num_tokens": 91042245.0, "step": 11215 }, { "entropy": 0.14220143854618073, "epoch": 6.331828442437923, "grad_norm": 2.110448122024536, "learning_rate": 4.146709253628458e-06, "loss": 0.0377, "mean_token_accuracy": 0.9890905261039734, "num_tokens": 91082706.0, "step": 11220 }, { "entropy": 0.1490049421787262, "epoch": 6.334650112866817, "grad_norm": 2.0357728004455566, "learning_rate": 4.146012322819814e-06, "loss": 0.0372, "mean_token_accuracy": 0.9890268802642822, "num_tokens": 91123300.0, "step": 11225 }, { "entropy": 0.16816553473472595, "epoch": 6.337471783295711, "grad_norm": 2.126213550567627, "learning_rate": 4.145315191433165e-06, "loss": 0.0379, "mean_token_accuracy": 0.989051103591919, "num_tokens": 91163864.0, "step": 11230 }, { "entropy": 0.15646686255931855, "epoch": 6.340293453724605, "grad_norm": 1.7959010601043701, "learning_rate": 4.144617859605464e-06, "loss": 0.0367, "mean_token_accuracy": 0.9899686336517334, "num_tokens": 91204489.0, "step": 11235 }, { "entropy": 0.14400748908519745, "epoch": 6.343115124153499, "grad_norm": 1.6874085664749146, "learning_rate": 4.1439203274737015e-06, "loss": 0.0352, "mean_token_accuracy": 0.9898784995079041, "num_tokens": 91245210.0, "step": 11240 }, { "entropy": 0.12948621511459352, "epoch": 6.345936794582393, "grad_norm": 1.4666175842285156, "learning_rate": 4.143222595174909e-06, "loss": 0.0392, "mean_token_accuracy": 0.9886078119277955, "num_tokens": 91285941.0, "step": 11245 }, { "entropy": 0.14075332880020142, "epoch": 6.3487584650112865, "grad_norm": 2.0497982501983643, "learning_rate": 4.142524662846156e-06, "loss": 0.0376, "mean_token_accuracy": 0.9891217827796936, "num_tokens": 91326822.0, "step": 11250 }, { "entropy": 0.15241701006889344, "epoch": 6.35158013544018, "grad_norm": 1.943670392036438, "learning_rate": 4.14182653062455e-06, "loss": 0.0322, "mean_token_accuracy": 0.9905896425247193, "num_tokens": 91367515.0, "step": 11255 }, { "entropy": 0.1496208757162094, "epoch": 6.354401805869075, "grad_norm": 2.349369525909424, "learning_rate": 4.14112819864724e-06, "loss": 0.0394, "mean_token_accuracy": 0.9895246982574463, "num_tokens": 91408287.0, "step": 11260 }, { "entropy": 0.12518291175365448, "epoch": 6.3572234762979685, "grad_norm": 1.957039475440979, "learning_rate": 4.140429667051412e-06, "loss": 0.034, "mean_token_accuracy": 0.9896358966827392, "num_tokens": 91448827.0, "step": 11265 }, { "entropy": 0.13367379158735276, "epoch": 6.360045146726862, "grad_norm": 1.9315341711044312, "learning_rate": 4.139730935974295e-06, "loss": 0.0331, "mean_token_accuracy": 0.9905829906463623, "num_tokens": 91489610.0, "step": 11270 }, { "entropy": 0.13425323665142058, "epoch": 6.362866817155756, "grad_norm": 1.8871325254440308, "learning_rate": 4.1390320055531545e-06, "loss": 0.0374, "mean_token_accuracy": 0.9884009838104248, "num_tokens": 91530511.0, "step": 11275 }, { "entropy": 0.1478370100259781, "epoch": 6.3656884875846504, "grad_norm": 1.611417531967163, "learning_rate": 4.1383328759252935e-06, "loss": 0.031, "mean_token_accuracy": 0.9910028457641602, "num_tokens": 91571131.0, "step": 11280 }, { "entropy": 0.14289433658123016, "epoch": 6.368510158013544, "grad_norm": 1.8513978719711304, "learning_rate": 4.137633547228058e-06, "loss": 0.0401, "mean_token_accuracy": 0.9881711721420288, "num_tokens": 91612000.0, "step": 11285 }, { "entropy": 0.1438445121049881, "epoch": 6.371331828442438, "grad_norm": 2.4446709156036377, "learning_rate": 4.13693401959883e-06, "loss": 0.0344, "mean_token_accuracy": 0.9899812698364258, "num_tokens": 91652645.0, "step": 11290 }, { "entropy": 0.1517265945672989, "epoch": 6.3741534988713315, "grad_norm": 1.8532661199569702, "learning_rate": 4.136234293175033e-06, "loss": 0.0325, "mean_token_accuracy": 0.9901606917381287, "num_tokens": 91693216.0, "step": 11295 }, { "entropy": 0.1488794654607773, "epoch": 6.376975169300226, "grad_norm": 1.7429970502853394, "learning_rate": 4.135534368094127e-06, "loss": 0.0413, "mean_token_accuracy": 0.9876135468482972, "num_tokens": 91733904.0, "step": 11300 }, { "entropy": 0.15159515142440796, "epoch": 6.37979683972912, "grad_norm": 1.6426184177398682, "learning_rate": 4.1348342444936134e-06, "loss": 0.0405, "mean_token_accuracy": 0.9885798454284668, "num_tokens": 91774778.0, "step": 11305 }, { "entropy": 0.14828484505414963, "epoch": 6.3826185101580135, "grad_norm": 1.2837458848953247, "learning_rate": 4.134133922511032e-06, "loss": 0.0389, "mean_token_accuracy": 0.9885473847389221, "num_tokens": 91815379.0, "step": 11310 }, { "entropy": 0.13970376253128053, "epoch": 6.385440180586907, "grad_norm": 1.9679341316223145, "learning_rate": 4.133433402283958e-06, "loss": 0.0327, "mean_token_accuracy": 0.9905824422836303, "num_tokens": 91855882.0, "step": 11315 }, { "entropy": 0.13457007706165314, "epoch": 6.388261851015802, "grad_norm": 1.7451153993606567, "learning_rate": 4.132732683950013e-06, "loss": 0.0351, "mean_token_accuracy": 0.9886767625808716, "num_tokens": 91896363.0, "step": 11320 }, { "entropy": 0.14434319883584976, "epoch": 6.3910835214446955, "grad_norm": 2.0313048362731934, "learning_rate": 4.13203176764685e-06, "loss": 0.0348, "mean_token_accuracy": 0.9894550204277038, "num_tokens": 91936957.0, "step": 11325 }, { "entropy": 0.14290865659713745, "epoch": 6.393905191873589, "grad_norm": 1.3977835178375244, "learning_rate": 4.131330653512167e-06, "loss": 0.0361, "mean_token_accuracy": 0.9899007439613342, "num_tokens": 91977634.0, "step": 11330 }, { "entropy": 0.14777366816997528, "epoch": 6.396726862302483, "grad_norm": 1.7547193765640259, "learning_rate": 4.130629341683695e-06, "loss": 0.0359, "mean_token_accuracy": 0.9890438318252563, "num_tokens": 92018299.0, "step": 11335 }, { "entropy": 0.16221860945224761, "epoch": 6.399548532731377, "grad_norm": 2.0626728534698486, "learning_rate": 4.129927832299209e-06, "loss": 0.0361, "mean_token_accuracy": 0.989508330821991, "num_tokens": 92058961.0, "step": 11340 }, { "entropy": 0.13226877599954606, "epoch": 6.402370203160271, "grad_norm": 1.7659369707107544, "learning_rate": 4.129226125496519e-06, "loss": 0.0363, "mean_token_accuracy": 0.9892664194107056, "num_tokens": 92099870.0, "step": 11345 }, { "entropy": 0.14557516872882842, "epoch": 6.405191873589165, "grad_norm": 2.086122751235962, "learning_rate": 4.128524221413477e-06, "loss": 0.0347, "mean_token_accuracy": 0.9899450778961182, "num_tokens": 92140746.0, "step": 11350 }, { "entropy": 0.15533352494239808, "epoch": 6.408013544018059, "grad_norm": 2.2414488792419434, "learning_rate": 4.12782212018797e-06, "loss": 0.0414, "mean_token_accuracy": 0.9879919528961182, "num_tokens": 92181327.0, "step": 11355 }, { "entropy": 0.14174574315547944, "epoch": 6.410835214446952, "grad_norm": 1.9369492530822754, "learning_rate": 4.127119821957927e-06, "loss": 0.0337, "mean_token_accuracy": 0.9892768621444702, "num_tokens": 92221857.0, "step": 11360 }, { "entropy": 0.13032906055450438, "epoch": 6.413656884875847, "grad_norm": 1.955140471458435, "learning_rate": 4.126417326861316e-06, "loss": 0.0331, "mean_token_accuracy": 0.9905326247215271, "num_tokens": 92262534.0, "step": 11365 }, { "entropy": 0.13662864565849303, "epoch": 6.4164785553047405, "grad_norm": 1.978060245513916, "learning_rate": 4.1257146350361395e-06, "loss": 0.0436, "mean_token_accuracy": 0.9868420958518982, "num_tokens": 92303036.0, "step": 11370 }, { "entropy": 0.15215873420238496, "epoch": 6.419300225733634, "grad_norm": 2.898237466812134, "learning_rate": 4.125011746620444e-06, "loss": 0.0403, "mean_token_accuracy": 0.9886619448661804, "num_tokens": 92342876.0, "step": 11375 }, { "entropy": 0.16599242389202118, "epoch": 6.422121896162528, "grad_norm": 1.9874509572982788, "learning_rate": 4.1243086617523105e-06, "loss": 0.0404, "mean_token_accuracy": 0.98833726644516, "num_tokens": 92383402.0, "step": 11380 }, { "entropy": 0.14803114980459214, "epoch": 6.4249435665914225, "grad_norm": 2.0754692554473877, "learning_rate": 4.12360538056986e-06, "loss": 0.0355, "mean_token_accuracy": 0.9896466255187988, "num_tokens": 92423873.0, "step": 11385 }, { "entropy": 0.14688670337200166, "epoch": 6.427765237020316, "grad_norm": 1.7059087753295898, "learning_rate": 4.122901903211254e-06, "loss": 0.0363, "mean_token_accuracy": 0.9896903872489929, "num_tokens": 92464592.0, "step": 11390 }, { "entropy": 0.137593112885952, "epoch": 6.43058690744921, "grad_norm": 1.807116985321045, "learning_rate": 4.122198229814689e-06, "loss": 0.0329, "mean_token_accuracy": 0.9902451157569885, "num_tokens": 92505357.0, "step": 11395 }, { "entropy": 0.1294647052884102, "epoch": 6.433408577878104, "grad_norm": 1.739324927330017, "learning_rate": 4.121494360518401e-06, "loss": 0.0359, "mean_token_accuracy": 0.9892550468444824, "num_tokens": 92546032.0, "step": 11400 }, { "entropy": 0.16134763658046722, "epoch": 6.436230248306998, "grad_norm": 1.8106449842453003, "learning_rate": 4.120790295460668e-06, "loss": 0.0378, "mean_token_accuracy": 0.9890878438949585, "num_tokens": 92586727.0, "step": 11405 }, { "entropy": 0.13998755365610122, "epoch": 6.439051918735892, "grad_norm": 1.5401268005371094, "learning_rate": 4.1200860347798e-06, "loss": 0.0369, "mean_token_accuracy": 0.9888973951339721, "num_tokens": 92627301.0, "step": 11410 }, { "entropy": 0.14669421464204788, "epoch": 6.441873589164786, "grad_norm": 1.9903889894485474, "learning_rate": 4.119381578614153e-06, "loss": 0.0388, "mean_token_accuracy": 0.9879800200462341, "num_tokens": 92668052.0, "step": 11415 }, { "entropy": 0.14707699716091155, "epoch": 6.444695259593679, "grad_norm": 1.9357092380523682, "learning_rate": 4.118676927102115e-06, "loss": 0.0356, "mean_token_accuracy": 0.9894900679588318, "num_tokens": 92708627.0, "step": 11420 }, { "entropy": 0.14949690401554108, "epoch": 6.447516930022573, "grad_norm": 1.9960310459136963, "learning_rate": 4.117972080382115e-06, "loss": 0.0378, "mean_token_accuracy": 0.9885159015655518, "num_tokens": 92749188.0, "step": 11425 }, { "entropy": 0.14854080080986024, "epoch": 6.450338600451468, "grad_norm": 1.9244252443313599, "learning_rate": 4.117267038592621e-06, "loss": 0.0391, "mean_token_accuracy": 0.988319730758667, "num_tokens": 92789998.0, "step": 11430 }, { "entropy": 0.14139661490917205, "epoch": 6.453160270880361, "grad_norm": 1.5770515203475952, "learning_rate": 4.1165618018721385e-06, "loss": 0.0379, "mean_token_accuracy": 0.9883534550666809, "num_tokens": 92830742.0, "step": 11435 }, { "entropy": 0.1313648045063019, "epoch": 6.455981941309255, "grad_norm": 1.4559335708618164, "learning_rate": 4.115856370359211e-06, "loss": 0.0352, "mean_token_accuracy": 0.9895216822624207, "num_tokens": 92871340.0, "step": 11440 }, { "entropy": 0.14764813482761383, "epoch": 6.458803611738149, "grad_norm": 1.990279197692871, "learning_rate": 4.115150744192421e-06, "loss": 0.0383, "mean_token_accuracy": 0.9883626222610473, "num_tokens": 92911986.0, "step": 11445 }, { "entropy": 0.14003989696502686, "epoch": 6.461625282167043, "grad_norm": 2.3407559394836426, "learning_rate": 4.114444923510388e-06, "loss": 0.0353, "mean_token_accuracy": 0.9898525953292847, "num_tokens": 92952713.0, "step": 11450 }, { "entropy": 0.13716760277748108, "epoch": 6.464446952595937, "grad_norm": 1.8746510744094849, "learning_rate": 4.113738908451771e-06, "loss": 0.0314, "mean_token_accuracy": 0.9901943325996398, "num_tokens": 92993264.0, "step": 11455 }, { "entropy": 0.14014555811882018, "epoch": 6.467268623024831, "grad_norm": 1.970062255859375, "learning_rate": 4.113032699155268e-06, "loss": 0.042, "mean_token_accuracy": 0.9883641839027405, "num_tokens": 93033836.0, "step": 11460 }, { "entropy": 0.15491693019866942, "epoch": 6.470090293453724, "grad_norm": 1.8556119203567505, "learning_rate": 4.112326295759612e-06, "loss": 0.0394, "mean_token_accuracy": 0.9879988074302674, "num_tokens": 93074450.0, "step": 11465 }, { "entropy": 0.1430242419242859, "epoch": 6.472911963882619, "grad_norm": 2.5160515308380127, "learning_rate": 4.111619698403577e-06, "loss": 0.0413, "mean_token_accuracy": 0.9870900750160218, "num_tokens": 93115187.0, "step": 11470 }, { "entropy": 0.1444158598780632, "epoch": 6.475733634311513, "grad_norm": 1.70442795753479, "learning_rate": 4.110912907225974e-06, "loss": 0.0362, "mean_token_accuracy": 0.9890662670135498, "num_tokens": 93155595.0, "step": 11475 }, { "entropy": 0.13908229172229766, "epoch": 6.478555304740406, "grad_norm": 1.6484752893447876, "learning_rate": 4.110205922365652e-06, "loss": 0.0341, "mean_token_accuracy": 0.9901829957962036, "num_tokens": 93196137.0, "step": 11480 }, { "entropy": 0.1475514531135559, "epoch": 6.4813769751693, "grad_norm": 1.759164571762085, "learning_rate": 4.1094987439615e-06, "loss": 0.0328, "mean_token_accuracy": 0.990956437587738, "num_tokens": 93236943.0, "step": 11485 }, { "entropy": 0.16040101349353791, "epoch": 6.484198645598195, "grad_norm": 2.2340142726898193, "learning_rate": 4.10879137215244e-06, "loss": 0.038, "mean_token_accuracy": 0.9892443895339966, "num_tokens": 93277309.0, "step": 11490 }, { "entropy": 0.1610852897167206, "epoch": 6.487020316027088, "grad_norm": 1.9512050151824951, "learning_rate": 4.108083807077437e-06, "loss": 0.0445, "mean_token_accuracy": 0.9860481977462768, "num_tokens": 93317646.0, "step": 11495 }, { "entropy": 0.15572181046009065, "epoch": 6.489841986455982, "grad_norm": 2.4991040229797363, "learning_rate": 4.1073760488754935e-06, "loss": 0.0448, "mean_token_accuracy": 0.9868571162223816, "num_tokens": 93358277.0, "step": 11500 }, { "epoch": 6.489841986455982, "eval_entropy": 0.1996920257806778, "eval_loss": 0.023422840982675552, "eval_mean_token_accuracy": 0.9938673973083496, "eval_num_tokens": 93358277.0, "eval_runtime": 0.1638, "eval_samples_per_second": 24.421, "eval_steps_per_second": 6.105, "step": 11500 }, { "entropy": 0.1445915535092354, "epoch": 6.492663656884876, "grad_norm": 1.8517767190933228, "learning_rate": 4.106668097685647e-06, "loss": 0.0377, "mean_token_accuracy": 0.9889588594436646, "num_tokens": 93398334.0, "step": 11505 }, { "entropy": 0.15220266580581665, "epoch": 6.495485327313769, "grad_norm": 1.8239866495132446, "learning_rate": 4.105959953646975e-06, "loss": 0.0392, "mean_token_accuracy": 0.9890074372291565, "num_tokens": 93438747.0, "step": 11510 }, { "entropy": 0.150148668885231, "epoch": 6.498306997742664, "grad_norm": 1.4895845651626587, "learning_rate": 4.105251616898592e-06, "loss": 0.0376, "mean_token_accuracy": 0.9892801523208619, "num_tokens": 93479380.0, "step": 11515 }, { "entropy": 0.14549075663089753, "epoch": 6.501128668171558, "grad_norm": 1.9964500665664673, "learning_rate": 4.104543087579652e-06, "loss": 0.0315, "mean_token_accuracy": 0.9905564427375794, "num_tokens": 93520126.0, "step": 11520 }, { "entropy": 0.1546286165714264, "epoch": 6.503950338600451, "grad_norm": 2.1431045532226562, "learning_rate": 4.103834365829346e-06, "loss": 0.0387, "mean_token_accuracy": 0.9886301279067993, "num_tokens": 93560843.0, "step": 11525 }, { "entropy": 0.14803966879844666, "epoch": 6.506772009029345, "grad_norm": 1.7916501760482788, "learning_rate": 4.1031254517869e-06, "loss": 0.0302, "mean_token_accuracy": 0.9909810662269593, "num_tokens": 93601247.0, "step": 11530 }, { "entropy": 0.14564967155456543, "epoch": 6.50959367945824, "grad_norm": 2.3298871517181396, "learning_rate": 4.102416345591583e-06, "loss": 0.0387, "mean_token_accuracy": 0.9888354897499084, "num_tokens": 93641820.0, "step": 11535 }, { "entropy": 0.1511426329612732, "epoch": 6.512415349887133, "grad_norm": 1.7131050825119019, "learning_rate": 4.101707047382697e-06, "loss": 0.0292, "mean_token_accuracy": 0.9912442445755005, "num_tokens": 93682410.0, "step": 11540 }, { "entropy": 0.14388006627559663, "epoch": 6.515237020316027, "grad_norm": 1.829458475112915, "learning_rate": 4.100997557299585e-06, "loss": 0.0371, "mean_token_accuracy": 0.9888779282569885, "num_tokens": 93723161.0, "step": 11545 }, { "entropy": 0.15411291718482972, "epoch": 6.518058690744921, "grad_norm": 2.0121374130249023, "learning_rate": 4.100287875481627e-06, "loss": 0.0366, "mean_token_accuracy": 0.9880946636199951, "num_tokens": 93763892.0, "step": 11550 }, { "entropy": 0.16307685375213624, "epoch": 6.520880361173814, "grad_norm": 2.0615103244781494, "learning_rate": 4.099578002068238e-06, "loss": 0.0366, "mean_token_accuracy": 0.9895309686660767, "num_tokens": 93804514.0, "step": 11555 }, { "entropy": 0.14214831590652466, "epoch": 6.523702031602709, "grad_norm": 1.7495334148406982, "learning_rate": 4.098867937198873e-06, "loss": 0.0356, "mean_token_accuracy": 0.989754056930542, "num_tokens": 93844886.0, "step": 11560 }, { "entropy": 0.13692098259925842, "epoch": 6.526523702031603, "grad_norm": 2.2061078548431396, "learning_rate": 4.098157681013027e-06, "loss": 0.0405, "mean_token_accuracy": 0.9879782795906067, "num_tokens": 93885343.0, "step": 11565 }, { "entropy": 0.1372235894203186, "epoch": 6.529345372460496, "grad_norm": 1.7683453559875488, "learning_rate": 4.097447233650226e-06, "loss": 0.0356, "mean_token_accuracy": 0.9889456748962402, "num_tokens": 93926121.0, "step": 11570 }, { "entropy": 0.15331811606884002, "epoch": 6.532167042889391, "grad_norm": 2.0020864009857178, "learning_rate": 4.0967365952500416e-06, "loss": 0.0332, "mean_token_accuracy": 0.9897879481315612, "num_tokens": 93966780.0, "step": 11575 }, { "entropy": 0.154784095287323, "epoch": 6.534988713318285, "grad_norm": 1.7012702226638794, "learning_rate": 4.096025765952076e-06, "loss": 0.0342, "mean_token_accuracy": 0.9907065868377686, "num_tokens": 94007365.0, "step": 11580 }, { "entropy": 0.13629684150218963, "epoch": 6.537810383747178, "grad_norm": 2.087913990020752, "learning_rate": 4.095314745895972e-06, "loss": 0.0376, "mean_token_accuracy": 0.9887621521949768, "num_tokens": 94048019.0, "step": 11585 }, { "entropy": 0.14386317431926726, "epoch": 6.540632054176072, "grad_norm": 1.9246793985366821, "learning_rate": 4.0946035352214106e-06, "loss": 0.0308, "mean_token_accuracy": 0.990863585472107, "num_tokens": 94088462.0, "step": 11590 }, { "entropy": 0.1515270948410034, "epoch": 6.543453724604966, "grad_norm": 2.080392837524414, "learning_rate": 4.093892134068108e-06, "loss": 0.0403, "mean_token_accuracy": 0.9882361173629761, "num_tokens": 94128923.0, "step": 11595 }, { "entropy": 0.15317191779613495, "epoch": 6.54627539503386, "grad_norm": 1.7559170722961426, "learning_rate": 4.09318054257582e-06, "loss": 0.036, "mean_token_accuracy": 0.9891046166419983, "num_tokens": 94169548.0, "step": 11600 }, { "entropy": 0.1485581949353218, "epoch": 6.549097065462754, "grad_norm": 1.9086334705352783, "learning_rate": 4.092468760884338e-06, "loss": 0.0403, "mean_token_accuracy": 0.9878690481185913, "num_tokens": 94210344.0, "step": 11605 }, { "entropy": 0.14423695504665374, "epoch": 6.551918735891648, "grad_norm": 2.0206217765808105, "learning_rate": 4.0917567891334935e-06, "loss": 0.0367, "mean_token_accuracy": 0.9891125440597535, "num_tokens": 94250622.0, "step": 11610 }, { "entropy": 0.14233740866184236, "epoch": 6.5547404063205414, "grad_norm": 2.0237369537353516, "learning_rate": 4.091044627463151e-06, "loss": 0.033, "mean_token_accuracy": 0.9899827241897583, "num_tokens": 94291129.0, "step": 11615 }, { "entropy": 0.1390757292509079, "epoch": 6.557562076749436, "grad_norm": 2.0195369720458984, "learning_rate": 4.0903322760132165e-06, "loss": 0.0393, "mean_token_accuracy": 0.9883224606513977, "num_tokens": 94331922.0, "step": 11620 }, { "entropy": 0.15587489008903505, "epoch": 6.56038374717833, "grad_norm": 2.020552158355713, "learning_rate": 4.0896197349236306e-06, "loss": 0.0397, "mean_token_accuracy": 0.9889486074447632, "num_tokens": 94372528.0, "step": 11625 }, { "entropy": 0.15144974291324614, "epoch": 6.563205417607223, "grad_norm": 1.6519806385040283, "learning_rate": 4.0889070043343725e-06, "loss": 0.0409, "mean_token_accuracy": 0.9875991225242615, "num_tokens": 94413314.0, "step": 11630 }, { "entropy": 0.16432562172412873, "epoch": 6.566027088036117, "grad_norm": 2.0881333351135254, "learning_rate": 4.088194084385459e-06, "loss": 0.0379, "mean_token_accuracy": 0.9884780287742615, "num_tokens": 94453628.0, "step": 11635 }, { "entropy": 0.15171082615852355, "epoch": 6.568848758465011, "grad_norm": 1.8627437353134155, "learning_rate": 4.08748097521694e-06, "loss": 0.0423, "mean_token_accuracy": 0.9872801780700684, "num_tokens": 94494421.0, "step": 11640 }, { "entropy": 0.1384830117225647, "epoch": 6.571670428893905, "grad_norm": 1.6030412912368774, "learning_rate": 4.0867676769689104e-06, "loss": 0.0408, "mean_token_accuracy": 0.988001823425293, "num_tokens": 94534981.0, "step": 11645 }, { "entropy": 0.15062596201896666, "epoch": 6.574492099322799, "grad_norm": 2.3316404819488525, "learning_rate": 4.086054189781495e-06, "loss": 0.0375, "mean_token_accuracy": 0.9890991806983948, "num_tokens": 94575864.0, "step": 11650 }, { "entropy": 0.15638395845890046, "epoch": 6.577313769751693, "grad_norm": 2.025421142578125, "learning_rate": 4.085340513794859e-06, "loss": 0.032, "mean_token_accuracy": 0.9899617671966553, "num_tokens": 94616541.0, "step": 11655 }, { "entropy": 0.1384786307811737, "epoch": 6.580135440180587, "grad_norm": 2.037663459777832, "learning_rate": 4.084626649149204e-06, "loss": 0.0357, "mean_token_accuracy": 0.9894083857536315, "num_tokens": 94657102.0, "step": 11660 }, { "entropy": 0.14564592242240906, "epoch": 6.582957110609481, "grad_norm": 2.0661332607269287, "learning_rate": 4.083912595984769e-06, "loss": 0.0367, "mean_token_accuracy": 0.9883635878562927, "num_tokens": 94697865.0, "step": 11665 }, { "entropy": 0.15481620132923127, "epoch": 6.585778781038375, "grad_norm": 1.89729642868042, "learning_rate": 4.083198354441831e-06, "loss": 0.0391, "mean_token_accuracy": 0.9885015726089478, "num_tokens": 94738274.0, "step": 11670 }, { "entropy": 0.14785862267017363, "epoch": 6.5886004514672685, "grad_norm": 2.5949220657348633, "learning_rate": 4.082483924660701e-06, "loss": 0.0326, "mean_token_accuracy": 0.9899281978607177, "num_tokens": 94778892.0, "step": 11675 }, { "entropy": 0.1447735384106636, "epoch": 6.591422121896162, "grad_norm": 1.6759907007217407, "learning_rate": 4.081769306781729e-06, "loss": 0.0374, "mean_token_accuracy": 0.9890196442604064, "num_tokens": 94819675.0, "step": 11680 }, { "entropy": 0.14461695849895478, "epoch": 6.594243792325057, "grad_norm": 1.7010953426361084, "learning_rate": 4.081054500945303e-06, "loss": 0.0343, "mean_token_accuracy": 0.9894821643829346, "num_tokens": 94860190.0, "step": 11685 }, { "entropy": 0.14982332289218903, "epoch": 6.5970654627539504, "grad_norm": 2.2242343425750732, "learning_rate": 4.080339507291845e-06, "loss": 0.0377, "mean_token_accuracy": 0.9886794209480285, "num_tokens": 94900923.0, "step": 11690 }, { "entropy": 0.15086464136838912, "epoch": 6.599887133182844, "grad_norm": 2.197476863861084, "learning_rate": 4.079624325961818e-06, "loss": 0.0359, "mean_token_accuracy": 0.988996946811676, "num_tokens": 94941485.0, "step": 11695 }, { "entropy": 0.1545707941055298, "epoch": 6.602708803611738, "grad_norm": 1.8683733940124512, "learning_rate": 4.0789089570957175e-06, "loss": 0.0394, "mean_token_accuracy": 0.9879536390304565, "num_tokens": 94982052.0, "step": 11700 }, { "entropy": 0.13810085207223893, "epoch": 6.605530474040632, "grad_norm": 2.827000856399536, "learning_rate": 4.078193400834078e-06, "loss": 0.0388, "mean_token_accuracy": 0.9887730002403259, "num_tokens": 95022767.0, "step": 11705 }, { "entropy": 0.14364120215177537, "epoch": 6.608352144469526, "grad_norm": 2.075237989425659, "learning_rate": 4.077477657317471e-06, "loss": 0.0323, "mean_token_accuracy": 0.9897770881652832, "num_tokens": 95063645.0, "step": 11710 }, { "entropy": 0.1465618520975113, "epoch": 6.61117381489842, "grad_norm": 2.806278705596924, "learning_rate": 4.076761726686505e-06, "loss": 0.0466, "mean_token_accuracy": 0.9863425970077515, "num_tokens": 95104053.0, "step": 11715 }, { "entropy": 0.14402187317609788, "epoch": 6.6139954853273135, "grad_norm": 1.9875422716140747, "learning_rate": 4.076045609081824e-06, "loss": 0.0403, "mean_token_accuracy": 0.9877901315689087, "num_tokens": 95144803.0, "step": 11720 }, { "entropy": 0.154840087890625, "epoch": 6.616817155756207, "grad_norm": 1.8063068389892578, "learning_rate": 4.075329304644109e-06, "loss": 0.0397, "mean_token_accuracy": 0.9879529476165771, "num_tokens": 95185612.0, "step": 11725 }, { "entropy": 0.13759687691926956, "epoch": 6.619638826185102, "grad_norm": 1.7499263286590576, "learning_rate": 4.074612813514079e-06, "loss": 0.0373, "mean_token_accuracy": 0.988704776763916, "num_tokens": 95226271.0, "step": 11730 }, { "entropy": 0.1382174924015999, "epoch": 6.6224604966139955, "grad_norm": 2.57856822013855, "learning_rate": 4.073896135832488e-06, "loss": 0.0423, "mean_token_accuracy": 0.987503731250763, "num_tokens": 95266935.0, "step": 11735 }, { "entropy": 0.15132977664470673, "epoch": 6.625282167042889, "grad_norm": 2.365727424621582, "learning_rate": 4.073179271740128e-06, "loss": 0.0439, "mean_token_accuracy": 0.9868868947029114, "num_tokens": 95307554.0, "step": 11740 }, { "entropy": 0.1502486765384674, "epoch": 6.628103837471784, "grad_norm": 2.003241777420044, "learning_rate": 4.072462221377827e-06, "loss": 0.0392, "mean_token_accuracy": 0.9876524209976196, "num_tokens": 95348372.0, "step": 11745 }, { "entropy": 0.15410052090883256, "epoch": 6.6309255079006775, "grad_norm": 1.5356718301773071, "learning_rate": 4.07174498488645e-06, "loss": 0.0353, "mean_token_accuracy": 0.989504873752594, "num_tokens": 95389076.0, "step": 11750 }, { "entropy": 0.1485922798514366, "epoch": 6.633747178329571, "grad_norm": 1.6456876993179321, "learning_rate": 4.071027562406896e-06, "loss": 0.0336, "mean_token_accuracy": 0.9904733896255493, "num_tokens": 95429753.0, "step": 11755 }, { "entropy": 0.1574637770652771, "epoch": 6.636568848758465, "grad_norm": 2.0578815937042236, "learning_rate": 4.070309954080106e-06, "loss": 0.0393, "mean_token_accuracy": 0.9888066649436951, "num_tokens": 95470465.0, "step": 11760 }, { "entropy": 0.1463371992111206, "epoch": 6.639390519187359, "grad_norm": 1.576945185661316, "learning_rate": 4.069592160047051e-06, "loss": 0.0363, "mean_token_accuracy": 0.9890714764595032, "num_tokens": 95510938.0, "step": 11765 }, { "entropy": 0.15714893043041228, "epoch": 6.642212189616253, "grad_norm": 1.784827709197998, "learning_rate": 4.0688741804487446e-06, "loss": 0.0374, "mean_token_accuracy": 0.9894795179367065, "num_tokens": 95551359.0, "step": 11770 }, { "entropy": 0.14306671023368836, "epoch": 6.645033860045147, "grad_norm": 1.9207254648208618, "learning_rate": 4.0681560154262326e-06, "loss": 0.0399, "mean_token_accuracy": 0.9883310794830322, "num_tokens": 95591909.0, "step": 11775 }, { "entropy": 0.14147693812847137, "epoch": 6.6478555304740405, "grad_norm": 2.073054552078247, "learning_rate": 4.067437665120598e-06, "loss": 0.0353, "mean_token_accuracy": 0.9901525735855102, "num_tokens": 95632630.0, "step": 11780 }, { "entropy": 0.12939851135015487, "epoch": 6.650677200902934, "grad_norm": 1.6735975742340088, "learning_rate": 4.066719129672962e-06, "loss": 0.0346, "mean_token_accuracy": 0.9895193934440613, "num_tokens": 95673068.0, "step": 11785 }, { "entropy": 0.13276245892047883, "epoch": 6.653498871331829, "grad_norm": 2.009474754333496, "learning_rate": 4.066000409224481e-06, "loss": 0.0377, "mean_token_accuracy": 0.9889036655426026, "num_tokens": 95713693.0, "step": 11790 }, { "entropy": 0.13992193043231965, "epoch": 6.6563205417607225, "grad_norm": 2.217224359512329, "learning_rate": 4.0652815039163475e-06, "loss": 0.0363, "mean_token_accuracy": 0.9892218232154846, "num_tokens": 95754580.0, "step": 11795 }, { "entropy": 0.14898284673690795, "epoch": 6.659142212189616, "grad_norm": 1.6819255352020264, "learning_rate": 4.06456241388979e-06, "loss": 0.0351, "mean_token_accuracy": 0.9898673295974731, "num_tokens": 95795256.0, "step": 11800 }, { "entropy": 0.14159813523292542, "epoch": 6.66196388261851, "grad_norm": 1.9627163410186768, "learning_rate": 4.063843139286073e-06, "loss": 0.0384, "mean_token_accuracy": 0.9878716945648194, "num_tokens": 95835802.0, "step": 11805 }, { "entropy": 0.1385001763701439, "epoch": 6.664785553047404, "grad_norm": 2.2941231727600098, "learning_rate": 4.063123680246501e-06, "loss": 0.0414, "mean_token_accuracy": 0.9872376441955566, "num_tokens": 95876651.0, "step": 11810 }, { "entropy": 0.14992179572582245, "epoch": 6.667607223476298, "grad_norm": 1.722368836402893, "learning_rate": 4.062404036912409e-06, "loss": 0.0365, "mean_token_accuracy": 0.9891926527023316, "num_tokens": 95917227.0, "step": 11815 }, { "entropy": 0.14835633486509323, "epoch": 6.670428893905192, "grad_norm": 2.5152008533477783, "learning_rate": 4.061684209425173e-06, "loss": 0.0407, "mean_token_accuracy": 0.9874385356903076, "num_tokens": 95957630.0, "step": 11820 }, { "entropy": 0.1346977487206459, "epoch": 6.673250564334086, "grad_norm": 2.1163482666015625, "learning_rate": 4.060964197926201e-06, "loss": 0.0327, "mean_token_accuracy": 0.9909256815910339, "num_tokens": 95998284.0, "step": 11825 }, { "entropy": 0.14587730467319487, "epoch": 6.67607223476298, "grad_norm": 2.0600922107696533, "learning_rate": 4.060244002556942e-06, "loss": 0.0354, "mean_token_accuracy": 0.9892189502716064, "num_tokens": 96039045.0, "step": 11830 }, { "entropy": 0.157058185338974, "epoch": 6.678893905191874, "grad_norm": 2.0479791164398193, "learning_rate": 4.0595236234588755e-06, "loss": 0.0423, "mean_token_accuracy": 0.9872640252113343, "num_tokens": 96079680.0, "step": 11835 }, { "entropy": 0.15050775408744813, "epoch": 6.681715575620768, "grad_norm": 2.05046010017395, "learning_rate": 4.058803060773523e-06, "loss": 0.041, "mean_token_accuracy": 0.9878550887107849, "num_tokens": 96120155.0, "step": 11840 }, { "entropy": 0.15183096528053283, "epoch": 6.684537246049661, "grad_norm": 2.1310324668884277, "learning_rate": 4.058082314642438e-06, "loss": 0.0417, "mean_token_accuracy": 0.9870881080627442, "num_tokens": 96160925.0, "step": 11845 }, { "entropy": 0.13959557116031646, "epoch": 6.687358916478555, "grad_norm": 2.0781123638153076, "learning_rate": 4.05736138520721e-06, "loss": 0.0383, "mean_token_accuracy": 0.9885563850402832, "num_tokens": 96201452.0, "step": 11850 }, { "entropy": 0.16393175423145295, "epoch": 6.6901805869074495, "grad_norm": 1.6570488214492798, "learning_rate": 4.056640272609467e-06, "loss": 0.0327, "mean_token_accuracy": 0.9903802037239074, "num_tokens": 96242166.0, "step": 11855 }, { "entropy": 0.1479017674922943, "epoch": 6.693002257336343, "grad_norm": 1.9548949003219604, "learning_rate": 4.055918976990872e-06, "loss": 0.0321, "mean_token_accuracy": 0.9904338598251343, "num_tokens": 96282855.0, "step": 11860 }, { "entropy": 0.1482335865497589, "epoch": 6.695823927765237, "grad_norm": 2.1175918579101562, "learning_rate": 4.055197498493123e-06, "loss": 0.0418, "mean_token_accuracy": 0.9874824643135071, "num_tokens": 96323775.0, "step": 11865 }, { "entropy": 0.14961565434932708, "epoch": 6.698645598194131, "grad_norm": 1.942327618598938, "learning_rate": 4.054475837257953e-06, "loss": 0.032, "mean_token_accuracy": 0.9901739597320557, "num_tokens": 96364548.0, "step": 11870 }, { "entropy": 0.15366184413433076, "epoch": 6.701467268623025, "grad_norm": 2.2145068645477295, "learning_rate": 4.053753993427135e-06, "loss": 0.0378, "mean_token_accuracy": 0.988074266910553, "num_tokens": 96404933.0, "step": 11875 }, { "entropy": 0.15021361410617828, "epoch": 6.704288939051919, "grad_norm": 1.6474920511245728, "learning_rate": 4.053031967142475e-06, "loss": 0.0341, "mean_token_accuracy": 0.98985835313797, "num_tokens": 96445402.0, "step": 11880 }, { "entropy": 0.16894918978214263, "epoch": 6.707110609480813, "grad_norm": 1.8737285137176514, "learning_rate": 4.052309758545813e-06, "loss": 0.0475, "mean_token_accuracy": 0.9867665410041809, "num_tokens": 96486119.0, "step": 11885 }, { "entropy": 0.15583183765411376, "epoch": 6.709932279909706, "grad_norm": 1.9295543432235718, "learning_rate": 4.051587367779029e-06, "loss": 0.0321, "mean_token_accuracy": 0.9901478886604309, "num_tokens": 96526926.0, "step": 11890 }, { "entropy": 0.15508732497692107, "epoch": 6.7127539503386, "grad_norm": 1.60502290725708, "learning_rate": 4.050864794984036e-06, "loss": 0.0316, "mean_token_accuracy": 0.9912713050842286, "num_tokens": 96567737.0, "step": 11895 }, { "entropy": 0.15691338181495668, "epoch": 6.715575620767495, "grad_norm": 1.7124838829040527, "learning_rate": 4.050142040302784e-06, "loss": 0.0368, "mean_token_accuracy": 0.9893094897270203, "num_tokens": 96607281.0, "step": 11900 }, { "entropy": 0.1330145627260208, "epoch": 6.718397291196388, "grad_norm": 1.8225212097167969, "learning_rate": 4.049419103877258e-06, "loss": 0.035, "mean_token_accuracy": 0.9894989252090454, "num_tokens": 96648016.0, "step": 11905 }, { "entropy": 0.15559065639972686, "epoch": 6.721218961625282, "grad_norm": 1.6894086599349976, "learning_rate": 4.048695985849479e-06, "loss": 0.036, "mean_token_accuracy": 0.9887082815170288, "num_tokens": 96688807.0, "step": 11910 }, { "entropy": 0.14794491827487946, "epoch": 6.724040632054177, "grad_norm": 1.7598350048065186, "learning_rate": 4.047972686361503e-06, "loss": 0.0399, "mean_token_accuracy": 0.9888983845710755, "num_tokens": 96729457.0, "step": 11915 }, { "entropy": 0.1369953766465187, "epoch": 6.72686230248307, "grad_norm": 1.9057563543319702, "learning_rate": 4.047249205555423e-06, "loss": 0.0321, "mean_token_accuracy": 0.9902025699615479, "num_tokens": 96769286.0, "step": 11920 }, { "entropy": 0.14871995151042938, "epoch": 6.729683972911964, "grad_norm": 2.072307586669922, "learning_rate": 4.046525543573366e-06, "loss": 0.0397, "mean_token_accuracy": 0.9880092859268188, "num_tokens": 96809834.0, "step": 11925 }, { "entropy": 0.13897217512130738, "epoch": 6.732505643340858, "grad_norm": 2.005223274230957, "learning_rate": 4.045801700557497e-06, "loss": 0.0378, "mean_token_accuracy": 0.9891113877296448, "num_tokens": 96850609.0, "step": 11930 }, { "entropy": 0.1497331142425537, "epoch": 6.735327313769751, "grad_norm": 2.2140676975250244, "learning_rate": 4.045077676650014e-06, "loss": 0.0385, "mean_token_accuracy": 0.9883991718292237, "num_tokens": 96891222.0, "step": 11935 }, { "entropy": 0.14488416612148286, "epoch": 6.738148984198646, "grad_norm": 1.802141547203064, "learning_rate": 4.044353471993152e-06, "loss": 0.0366, "mean_token_accuracy": 0.9881270170211792, "num_tokens": 96931928.0, "step": 11940 }, { "entropy": 0.14439502954483033, "epoch": 6.74097065462754, "grad_norm": 2.055842399597168, "learning_rate": 4.0436290867291806e-06, "loss": 0.0436, "mean_token_accuracy": 0.986508822441101, "num_tokens": 96972565.0, "step": 11945 }, { "entropy": 0.14606373012065887, "epoch": 6.743792325056433, "grad_norm": 1.8258408308029175, "learning_rate": 4.042904521000406e-06, "loss": 0.0348, "mean_token_accuracy": 0.9901708602905274, "num_tokens": 97013177.0, "step": 11950 }, { "entropy": 0.15913362801074982, "epoch": 6.746613995485327, "grad_norm": 2.4801416397094727, "learning_rate": 4.042179774949169e-06, "loss": 0.0495, "mean_token_accuracy": 0.9845641493797302, "num_tokens": 97053721.0, "step": 11955 }, { "entropy": 0.1454220324754715, "epoch": 6.749435665914222, "grad_norm": 2.5556042194366455, "learning_rate": 4.041454848717845e-06, "loss": 0.0492, "mean_token_accuracy": 0.9852368235588074, "num_tokens": 97094386.0, "step": 11960 }, { "entropy": 0.14474825710058212, "epoch": 6.752257336343115, "grad_norm": 1.8604875802993774, "learning_rate": 4.040729742448848e-06, "loss": 0.0389, "mean_token_accuracy": 0.9878225922584534, "num_tokens": 97134920.0, "step": 11965 }, { "entropy": 0.14345545172691346, "epoch": 6.755079006772009, "grad_norm": 2.0168981552124023, "learning_rate": 4.040004456284623e-06, "loss": 0.0399, "mean_token_accuracy": 0.9885003328323364, "num_tokens": 97175551.0, "step": 11970 }, { "entropy": 0.14142475724220277, "epoch": 6.757900677200903, "grad_norm": 1.5360438823699951, "learning_rate": 4.0392789903676545e-06, "loss": 0.0319, "mean_token_accuracy": 0.9909959673881531, "num_tokens": 97216255.0, "step": 11975 }, { "entropy": 0.14621337950229646, "epoch": 6.760722347629796, "grad_norm": 1.5952950716018677, "learning_rate": 4.03855334484046e-06, "loss": 0.0362, "mean_token_accuracy": 0.9889961838722229, "num_tokens": 97257015.0, "step": 11980 }, { "entropy": 0.14792815148830413, "epoch": 6.763544018058691, "grad_norm": 2.06760835647583, "learning_rate": 4.037827519845591e-06, "loss": 0.0299, "mean_token_accuracy": 0.9917079448699951, "num_tokens": 97297751.0, "step": 11985 }, { "entropy": 0.1481049656867981, "epoch": 6.766365688487585, "grad_norm": 1.856473684310913, "learning_rate": 4.037101515525637e-06, "loss": 0.0368, "mean_token_accuracy": 0.9893781185150147, "num_tokens": 97338310.0, "step": 11990 }, { "entropy": 0.14933787584304808, "epoch": 6.769187358916478, "grad_norm": 1.7966928482055664, "learning_rate": 4.036375332023222e-06, "loss": 0.0408, "mean_token_accuracy": 0.9873141407966614, "num_tokens": 97378801.0, "step": 11995 }, { "entropy": 0.15227662324905394, "epoch": 6.772009029345372, "grad_norm": 2.43129301071167, "learning_rate": 4.0356489694810055e-06, "loss": 0.0365, "mean_token_accuracy": 0.9891417384147644, "num_tokens": 97419558.0, "step": 12000 }, { "epoch": 6.772009029345372, "eval_entropy": 0.1956140100955963, "eval_loss": 0.012606015428900719, "eval_mean_token_accuracy": 0.9965503811836243, "eval_num_tokens": 97419558.0, "eval_runtime": 0.1638, "eval_samples_per_second": 24.418, "eval_steps_per_second": 6.105, "step": 12000 }, { "entropy": 0.15393645465373992, "epoch": 6.774830699774267, "grad_norm": 2.1532039642333984, "learning_rate": 4.03492242804168e-06, "loss": 0.0442, "mean_token_accuracy": 0.9862104415893554, "num_tokens": 97460095.0, "step": 12005 }, { "entropy": 0.1350156396627426, "epoch": 6.77765237020316, "grad_norm": 1.9720470905303955, "learning_rate": 4.034195707847975e-06, "loss": 0.0332, "mean_token_accuracy": 0.9898574113845825, "num_tokens": 97500756.0, "step": 12010 }, { "entropy": 0.1415847659111023, "epoch": 6.780474040632054, "grad_norm": 1.9140161275863647, "learning_rate": 4.033468809042655e-06, "loss": 0.0365, "mean_token_accuracy": 0.9890129208564759, "num_tokens": 97541530.0, "step": 12015 }, { "entropy": 0.14546948671340942, "epoch": 6.783295711060948, "grad_norm": 2.1538074016571045, "learning_rate": 4.032741731768519e-06, "loss": 0.0406, "mean_token_accuracy": 0.986963152885437, "num_tokens": 97581567.0, "step": 12020 }, { "entropy": 0.14426875561475755, "epoch": 6.786117381489842, "grad_norm": 1.9157716035842896, "learning_rate": 4.032014476168403e-06, "loss": 0.0355, "mean_token_accuracy": 0.9900845766067505, "num_tokens": 97622196.0, "step": 12025 }, { "entropy": 0.14931005537509917, "epoch": 6.788939051918736, "grad_norm": 1.9808646440505981, "learning_rate": 4.031287042385174e-06, "loss": 0.0398, "mean_token_accuracy": 0.9882163524627685, "num_tokens": 97662914.0, "step": 12030 }, { "entropy": 0.1658826380968094, "epoch": 6.79176072234763, "grad_norm": 2.1118521690368652, "learning_rate": 4.030559430561738e-06, "loss": 0.0371, "mean_token_accuracy": 0.9887429594993591, "num_tokens": 97703480.0, "step": 12035 }, { "entropy": 0.15551794469356536, "epoch": 6.794582392776523, "grad_norm": 2.5292036533355713, "learning_rate": 4.029831640841035e-06, "loss": 0.0463, "mean_token_accuracy": 0.9871610522270202, "num_tokens": 97744240.0, "step": 12040 }, { "entropy": 0.15738882422447203, "epoch": 6.797404063205418, "grad_norm": 1.7646291255950928, "learning_rate": 4.029103673366037e-06, "loss": 0.0386, "mean_token_accuracy": 0.988528847694397, "num_tokens": 97784307.0, "step": 12045 }, { "entropy": 0.1447429984807968, "epoch": 6.800225733634312, "grad_norm": 2.040116310119629, "learning_rate": 4.028375528279757e-06, "loss": 0.0399, "mean_token_accuracy": 0.9880939960479737, "num_tokens": 97825063.0, "step": 12050 }, { "entropy": 0.15236329436302185, "epoch": 6.803047404063205, "grad_norm": 1.7679275274276733, "learning_rate": 4.027647205725235e-06, "loss": 0.0347, "mean_token_accuracy": 0.9887324929237366, "num_tokens": 97865669.0, "step": 12055 }, { "entropy": 0.15214796960353852, "epoch": 6.805869074492099, "grad_norm": 1.926100730895996, "learning_rate": 4.026918705845553e-06, "loss": 0.0309, "mean_token_accuracy": 0.9904511570930481, "num_tokens": 97906513.0, "step": 12060 }, { "entropy": 0.14448803663253784, "epoch": 6.808690744920993, "grad_norm": 1.732896327972412, "learning_rate": 4.026190028783824e-06, "loss": 0.0366, "mean_token_accuracy": 0.9893592596054077, "num_tokens": 97946867.0, "step": 12065 }, { "entropy": 0.15263217091560363, "epoch": 6.811512415349887, "grad_norm": 1.5818462371826172, "learning_rate": 4.025461174683195e-06, "loss": 0.038, "mean_token_accuracy": 0.9896114826202392, "num_tokens": 97987436.0, "step": 12070 }, { "entropy": 0.14199225902557372, "epoch": 6.814334085778781, "grad_norm": 2.4520139694213867, "learning_rate": 4.024732143686854e-06, "loss": 0.0343, "mean_token_accuracy": 0.9893141746520996, "num_tokens": 98028061.0, "step": 12075 }, { "entropy": 0.1499510258436203, "epoch": 6.817155756207675, "grad_norm": 2.507272720336914, "learning_rate": 4.024002935938015e-06, "loss": 0.0353, "mean_token_accuracy": 0.9894481897354126, "num_tokens": 98068816.0, "step": 12080 }, { "entropy": 0.1536535143852234, "epoch": 6.8199774266365685, "grad_norm": 1.787207007408142, "learning_rate": 4.0232735515799325e-06, "loss": 0.04, "mean_token_accuracy": 0.9880955934524536, "num_tokens": 98109350.0, "step": 12085 }, { "entropy": 0.15066614747047424, "epoch": 6.822799097065463, "grad_norm": 2.0756945610046387, "learning_rate": 4.022543990755894e-06, "loss": 0.0397, "mean_token_accuracy": 0.9888906359672547, "num_tokens": 98149953.0, "step": 12090 }, { "entropy": 0.1614619642496109, "epoch": 6.825620767494357, "grad_norm": 2.169306516647339, "learning_rate": 4.021814253609222e-06, "loss": 0.0418, "mean_token_accuracy": 0.98787602186203, "num_tokens": 98190613.0, "step": 12095 }, { "entropy": 0.13885380923748017, "epoch": 6.8284424379232505, "grad_norm": 1.65521240234375, "learning_rate": 4.021084340283273e-06, "loss": 0.0377, "mean_token_accuracy": 0.9888322472572326, "num_tokens": 98231228.0, "step": 12100 }, { "entropy": 0.13987670242786407, "epoch": 6.831264108352144, "grad_norm": 2.0256147384643555, "learning_rate": 4.020354250921439e-06, "loss": 0.0374, "mean_token_accuracy": 0.9885939717292785, "num_tokens": 98271713.0, "step": 12105 }, { "entropy": 0.14241887032985687, "epoch": 6.834085778781039, "grad_norm": 2.178802728652954, "learning_rate": 4.0196239856671465e-06, "loss": 0.0343, "mean_token_accuracy": 0.9898516416549683, "num_tokens": 98312133.0, "step": 12110 }, { "entropy": 0.15942923724651337, "epoch": 6.836907449209932, "grad_norm": 1.7774022817611694, "learning_rate": 4.0188935446638545e-06, "loss": 0.0417, "mean_token_accuracy": 0.9870691418647766, "num_tokens": 98352906.0, "step": 12115 }, { "entropy": 0.13171774595975877, "epoch": 6.839729119638826, "grad_norm": 1.836889624595642, "learning_rate": 4.018162928055061e-06, "loss": 0.04, "mean_token_accuracy": 0.9872918844223022, "num_tokens": 98393669.0, "step": 12120 }, { "entropy": 0.1482748955488205, "epoch": 6.84255079006772, "grad_norm": 1.7053948640823364, "learning_rate": 4.017432135984293e-06, "loss": 0.034, "mean_token_accuracy": 0.990520977973938, "num_tokens": 98434346.0, "step": 12125 }, { "entropy": 0.15172266364097595, "epoch": 6.845372460496614, "grad_norm": 1.739862084388733, "learning_rate": 4.016701168595116e-06, "loss": 0.0434, "mean_token_accuracy": 0.9869125485420227, "num_tokens": 98474870.0, "step": 12130 }, { "entropy": 0.15046610832214355, "epoch": 6.848194130925508, "grad_norm": 2.3250515460968018, "learning_rate": 4.01597002603113e-06, "loss": 0.0452, "mean_token_accuracy": 0.9872011065483093, "num_tokens": 98515557.0, "step": 12135 }, { "entropy": 0.14062797725200654, "epoch": 6.851015801354402, "grad_norm": 2.180325746536255, "learning_rate": 4.015238708435965e-06, "loss": 0.0368, "mean_token_accuracy": 0.9888609290122986, "num_tokens": 98556251.0, "step": 12140 }, { "entropy": 0.1476264402270317, "epoch": 6.8538374717832955, "grad_norm": 2.2180349826812744, "learning_rate": 4.0145072159532906e-06, "loss": 0.0376, "mean_token_accuracy": 0.9886261582374573, "num_tokens": 98597018.0, "step": 12145 }, { "entropy": 0.1369374930858612, "epoch": 6.856659142212189, "grad_norm": 1.9766490459442139, "learning_rate": 4.013775548726807e-06, "loss": 0.0375, "mean_token_accuracy": 0.9877067446708679, "num_tokens": 98637642.0, "step": 12150 }, { "entropy": 0.15410472452640533, "epoch": 6.859480812641084, "grad_norm": 2.415297269821167, "learning_rate": 4.013043706900252e-06, "loss": 0.0404, "mean_token_accuracy": 0.9889116406440734, "num_tokens": 98678395.0, "step": 12155 }, { "entropy": 0.16218827664852142, "epoch": 6.8623024830699775, "grad_norm": 1.7888035774230957, "learning_rate": 4.012311690617396e-06, "loss": 0.0408, "mean_token_accuracy": 0.9877578258514405, "num_tokens": 98718623.0, "step": 12160 }, { "entropy": 0.1510297805070877, "epoch": 6.865124153498871, "grad_norm": 2.012989044189453, "learning_rate": 4.011579500022043e-06, "loss": 0.0388, "mean_token_accuracy": 0.9888627529144287, "num_tokens": 98759483.0, "step": 12165 }, { "entropy": 0.15312339663505553, "epoch": 6.867945823927765, "grad_norm": 1.867661714553833, "learning_rate": 4.010847135258031e-06, "loss": 0.0349, "mean_token_accuracy": 0.9899600625038147, "num_tokens": 98799890.0, "step": 12170 }, { "entropy": 0.14693421721458436, "epoch": 6.8707674943566595, "grad_norm": 1.5122771263122559, "learning_rate": 4.010114596469234e-06, "loss": 0.0333, "mean_token_accuracy": 0.9900424122810364, "num_tokens": 98840808.0, "step": 12175 }, { "entropy": 0.17182584404945372, "epoch": 6.873589164785553, "grad_norm": 1.7709141969680786, "learning_rate": 4.009381883799561e-06, "loss": 0.0355, "mean_token_accuracy": 0.989521062374115, "num_tokens": 98881544.0, "step": 12180 }, { "entropy": 0.15575533509254455, "epoch": 6.876410835214447, "grad_norm": 2.0675461292266846, "learning_rate": 4.00864899739295e-06, "loss": 0.0394, "mean_token_accuracy": 0.9877262592315674, "num_tokens": 98922210.0, "step": 12185 }, { "entropy": 0.15429140627384186, "epoch": 6.8792325056433405, "grad_norm": 2.4544715881347656, "learning_rate": 4.00791593739338e-06, "loss": 0.0494, "mean_token_accuracy": 0.9845258593559265, "num_tokens": 98962824.0, "step": 12190 }, { "entropy": 0.15189413875341415, "epoch": 6.882054176072235, "grad_norm": 2.8407533168792725, "learning_rate": 4.007182703944859e-06, "loss": 0.0446, "mean_token_accuracy": 0.9873796224594116, "num_tokens": 99003619.0, "step": 12195 }, { "entropy": 0.15428635776042937, "epoch": 6.884875846501129, "grad_norm": 2.224959135055542, "learning_rate": 4.006449297191432e-06, "loss": 0.0365, "mean_token_accuracy": 0.9887138962745666, "num_tokens": 99044216.0, "step": 12200 }, { "entropy": 0.14300165176391602, "epoch": 6.8876975169300225, "grad_norm": 1.9736629724502563, "learning_rate": 4.005715717277174e-06, "loss": 0.0366, "mean_token_accuracy": 0.988729465007782, "num_tokens": 99084439.0, "step": 12205 }, { "entropy": 0.13568673729896547, "epoch": 6.890519187358916, "grad_norm": 1.9142423868179321, "learning_rate": 4.004981964346201e-06, "loss": 0.0332, "mean_token_accuracy": 0.990368640422821, "num_tokens": 99124909.0, "step": 12210 }, { "entropy": 0.14406311213970185, "epoch": 6.893340857787811, "grad_norm": 2.4211018085479736, "learning_rate": 4.004248038542656e-06, "loss": 0.039, "mean_token_accuracy": 0.988343071937561, "num_tokens": 99165645.0, "step": 12215 }, { "entropy": 0.15503831803798676, "epoch": 6.8961625282167045, "grad_norm": 2.471714496612549, "learning_rate": 4.003513940010718e-06, "loss": 0.0432, "mean_token_accuracy": 0.9861461281776428, "num_tokens": 99206347.0, "step": 12220 }, { "entropy": 0.1480024516582489, "epoch": 6.898984198645598, "grad_norm": 2.0117077827453613, "learning_rate": 4.002779668894604e-06, "loss": 0.0362, "mean_token_accuracy": 0.9891183733940124, "num_tokens": 99246987.0, "step": 12225 }, { "entropy": 0.16451097428798675, "epoch": 6.901805869074492, "grad_norm": 1.6899313926696777, "learning_rate": 4.002045225338559e-06, "loss": 0.0404, "mean_token_accuracy": 0.9885852932929993, "num_tokens": 99287449.0, "step": 12230 }, { "entropy": 0.14375028908252716, "epoch": 6.904627539503386, "grad_norm": 1.818851351737976, "learning_rate": 4.001310609486866e-06, "loss": 0.0362, "mean_token_accuracy": 0.9893089056015014, "num_tokens": 99327979.0, "step": 12235 }, { "entropy": 0.15548242926597594, "epoch": 6.90744920993228, "grad_norm": 1.614169955253601, "learning_rate": 4.000575821483839e-06, "loss": 0.0383, "mean_token_accuracy": 0.9892736315727234, "num_tokens": 99368473.0, "step": 12240 }, { "entropy": 0.16284603476524354, "epoch": 6.910270880361174, "grad_norm": 1.6673612594604492, "learning_rate": 3.9998408614738295e-06, "loss": 0.039, "mean_token_accuracy": 0.9885139226913452, "num_tokens": 99408923.0, "step": 12245 }, { "entropy": 0.14805781245231628, "epoch": 6.913092550790068, "grad_norm": 2.2017159461975098, "learning_rate": 3.999105729601218e-06, "loss": 0.0378, "mean_token_accuracy": 0.9887848377227784, "num_tokens": 99449422.0, "step": 12250 }, { "entropy": 0.14871875047683716, "epoch": 6.915914221218961, "grad_norm": 1.9085617065429688, "learning_rate": 3.9983704260104225e-06, "loss": 0.0395, "mean_token_accuracy": 0.9885958671569824, "num_tokens": 99490171.0, "step": 12255 }, { "entropy": 0.1603003293275833, "epoch": 6.918735891647856, "grad_norm": 1.8411649465560913, "learning_rate": 3.997634950845893e-06, "loss": 0.0391, "mean_token_accuracy": 0.9881568551063538, "num_tokens": 99530789.0, "step": 12260 }, { "entropy": 0.14625448882579803, "epoch": 6.9215575620767495, "grad_norm": 1.8774826526641846, "learning_rate": 3.996899304252116e-06, "loss": 0.0342, "mean_token_accuracy": 0.9903077602386474, "num_tokens": 99571506.0, "step": 12265 }, { "entropy": 0.14785387217998505, "epoch": 6.924379232505643, "grad_norm": 1.910869836807251, "learning_rate": 3.996163486373605e-06, "loss": 0.0408, "mean_token_accuracy": 0.9881960272789001, "num_tokens": 99612260.0, "step": 12270 }, { "entropy": 0.15447575151920317, "epoch": 6.927200902934537, "grad_norm": 1.985054612159729, "learning_rate": 3.9954274973549144e-06, "loss": 0.0399, "mean_token_accuracy": 0.9876292586326599, "num_tokens": 99652726.0, "step": 12275 }, { "entropy": 0.15240433514118196, "epoch": 6.9300225733634315, "grad_norm": 1.6934657096862793, "learning_rate": 3.994691337340629e-06, "loss": 0.0353, "mean_token_accuracy": 0.9899464964866638, "num_tokens": 99693195.0, "step": 12280 }, { "entropy": 0.14032234251499176, "epoch": 6.932844243792325, "grad_norm": 2.0978596210479736, "learning_rate": 3.9939550064753674e-06, "loss": 0.0421, "mean_token_accuracy": 0.9877179741859436, "num_tokens": 99733715.0, "step": 12285 }, { "entropy": 0.1609266698360443, "epoch": 6.935665914221219, "grad_norm": 2.033520221710205, "learning_rate": 3.993218504903781e-06, "loss": 0.0396, "mean_token_accuracy": 0.9879401564598084, "num_tokens": 99774295.0, "step": 12290 }, { "entropy": 0.1533256560564041, "epoch": 6.938487584650113, "grad_norm": 2.056273937225342, "learning_rate": 3.992481832770558e-06, "loss": 0.0481, "mean_token_accuracy": 0.9857136845588684, "num_tokens": 99814847.0, "step": 12295 }, { "entropy": 0.14914492666721343, "epoch": 6.941309255079007, "grad_norm": 1.809752345085144, "learning_rate": 3.991744990220415e-06, "loss": 0.0403, "mean_token_accuracy": 0.9875779390335083, "num_tokens": 99855659.0, "step": 12300 }, { "entropy": 0.13904739618301393, "epoch": 6.944130925507901, "grad_norm": 1.739696979522705, "learning_rate": 3.9910079773981055e-06, "loss": 0.0357, "mean_token_accuracy": 0.989600396156311, "num_tokens": 99896329.0, "step": 12305 }, { "entropy": 0.1490867018699646, "epoch": 6.946952595936795, "grad_norm": 2.174006223678589, "learning_rate": 3.990270794448418e-06, "loss": 0.0397, "mean_token_accuracy": 0.9880833983421325, "num_tokens": 99936915.0, "step": 12310 }, { "entropy": 0.1486115574836731, "epoch": 6.949774266365688, "grad_norm": 2.1487553119659424, "learning_rate": 3.989533441516169e-06, "loss": 0.0412, "mean_token_accuracy": 0.988270103931427, "num_tokens": 99977400.0, "step": 12315 }, { "entropy": 0.15158516466617583, "epoch": 6.952595936794582, "grad_norm": 2.021366834640503, "learning_rate": 3.9887959187462145e-06, "loss": 0.0385, "mean_token_accuracy": 0.988451075553894, "num_tokens": 100018070.0, "step": 12320 }, { "entropy": 0.15477460920810698, "epoch": 6.955417607223477, "grad_norm": 2.3939874172210693, "learning_rate": 3.988058226283438e-06, "loss": 0.0459, "mean_token_accuracy": 0.9860603213310242, "num_tokens": 100058592.0, "step": 12325 }, { "entropy": 0.13858523964881897, "epoch": 6.95823927765237, "grad_norm": 1.722223162651062, "learning_rate": 3.987320364272761e-06, "loss": 0.0366, "mean_token_accuracy": 0.9893313646316528, "num_tokens": 100098381.0, "step": 12330 }, { "entropy": 0.14853320121765137, "epoch": 6.961060948081264, "grad_norm": 1.6259387731552124, "learning_rate": 3.986582332859138e-06, "loss": 0.029, "mean_token_accuracy": 0.9913646578788757, "num_tokens": 100138509.0, "step": 12335 }, { "entropy": 0.1412474274635315, "epoch": 6.963882618510158, "grad_norm": 1.655497431755066, "learning_rate": 3.985844132187552e-06, "loss": 0.0377, "mean_token_accuracy": 0.9879396677017211, "num_tokens": 100178940.0, "step": 12340 }, { "entropy": 0.14502420425415039, "epoch": 6.966704288939052, "grad_norm": 2.259775161743164, "learning_rate": 3.985105762403024e-06, "loss": 0.0385, "mean_token_accuracy": 0.9885640144348145, "num_tokens": 100219679.0, "step": 12345 }, { "entropy": 0.1412261575460434, "epoch": 6.969525959367946, "grad_norm": 2.183638095855713, "learning_rate": 3.984367223650608e-06, "loss": 0.0391, "mean_token_accuracy": 0.988431978225708, "num_tokens": 100260265.0, "step": 12350 }, { "entropy": 0.16001304388046264, "epoch": 6.97234762979684, "grad_norm": 1.7587566375732422, "learning_rate": 3.983628516075389e-06, "loss": 0.0422, "mean_token_accuracy": 0.9870441198348999, "num_tokens": 100301028.0, "step": 12355 }, { "entropy": 0.1356129452586174, "epoch": 6.975169300225733, "grad_norm": 1.582041621208191, "learning_rate": 3.982889639822487e-06, "loss": 0.0351, "mean_token_accuracy": 0.9892949461936951, "num_tokens": 100341780.0, "step": 12360 }, { "entropy": 0.15091146826744078, "epoch": 6.977990970654628, "grad_norm": 1.9201163053512573, "learning_rate": 3.982150595037053e-06, "loss": 0.0396, "mean_token_accuracy": 0.9874477505683898, "num_tokens": 100382504.0, "step": 12365 }, { "entropy": 0.14935719966888428, "epoch": 6.980812641083522, "grad_norm": 1.8997581005096436, "learning_rate": 3.981411381864274e-06, "loss": 0.0357, "mean_token_accuracy": 0.9886354565620422, "num_tokens": 100423339.0, "step": 12370 }, { "entropy": 0.15034773349761962, "epoch": 6.983634311512415, "grad_norm": 2.1917574405670166, "learning_rate": 3.980672000449367e-06, "loss": 0.0354, "mean_token_accuracy": 0.9892937660217285, "num_tokens": 100463746.0, "step": 12375 }, { "entropy": 0.15186431109905243, "epoch": 6.986455981941309, "grad_norm": 1.7991350889205933, "learning_rate": 3.9799324509375846e-06, "loss": 0.0449, "mean_token_accuracy": 0.9864747166633606, "num_tokens": 100504500.0, "step": 12380 }, { "entropy": 0.162436842918396, "epoch": 6.989277652370204, "grad_norm": 1.9422773122787476, "learning_rate": 3.979192733474211e-06, "loss": 0.0436, "mean_token_accuracy": 0.9862201929092407, "num_tokens": 100545119.0, "step": 12385 }, { "entropy": 0.15107800364494323, "epoch": 6.992099322799097, "grad_norm": 2.049860715866089, "learning_rate": 3.978452848204563e-06, "loss": 0.0403, "mean_token_accuracy": 0.9873886942863465, "num_tokens": 100585667.0, "step": 12390 }, { "entropy": 0.1658845365047455, "epoch": 6.994920993227991, "grad_norm": 2.350536346435547, "learning_rate": 3.977712795273993e-06, "loss": 0.0426, "mean_token_accuracy": 0.9870129108428956, "num_tokens": 100626337.0, "step": 12395 }, { "entropy": 0.14828409254550934, "epoch": 6.997742663656885, "grad_norm": 2.069523811340332, "learning_rate": 3.976972574827883e-06, "loss": 0.0379, "mean_token_accuracy": 0.9882864952087402, "num_tokens": 100666972.0, "step": 12400 }, { "entropy": 0.15599106550216674, "epoch": 7.000564334085778, "grad_norm": 1.5296047925949097, "learning_rate": 3.97623218701165e-06, "loss": 0.0328, "mean_token_accuracy": 0.9913263559341431, "num_tokens": 100701561.0, "step": 12405 }, { "entropy": 0.13645475655794143, "epoch": 7.003386004514673, "grad_norm": 1.3886555433273315, "learning_rate": 3.975491631970744e-06, "loss": 0.0159, "mean_token_accuracy": 0.9963740348815918, "num_tokens": 100742162.0, "step": 12410 }, { "entropy": 0.13274604827165604, "epoch": 7.006207674943567, "grad_norm": 1.7409141063690186, "learning_rate": 3.974750909850646e-06, "loss": 0.0181, "mean_token_accuracy": 0.9954428911209107, "num_tokens": 100782515.0, "step": 12415 }, { "entropy": 0.12155016213655472, "epoch": 7.00902934537246, "grad_norm": 1.6582471132278442, "learning_rate": 3.9740100207968716e-06, "loss": 0.0183, "mean_token_accuracy": 0.9957684516906739, "num_tokens": 100823151.0, "step": 12420 }, { "entropy": 0.12232804298400879, "epoch": 7.011851015801354, "grad_norm": 1.9185771942138672, "learning_rate": 3.973268964954967e-06, "loss": 0.0156, "mean_token_accuracy": 0.9957268714904786, "num_tokens": 100863921.0, "step": 12425 }, { "entropy": 0.13372556120157242, "epoch": 7.014672686230249, "grad_norm": 1.302709937095642, "learning_rate": 3.972527742470515e-06, "loss": 0.017, "mean_token_accuracy": 0.9956133961677551, "num_tokens": 100904652.0, "step": 12430 }, { "entropy": 0.1276037335395813, "epoch": 7.017494356659142, "grad_norm": 2.0763673782348633, "learning_rate": 3.971786353489127e-06, "loss": 0.0174, "mean_token_accuracy": 0.9954843640327453, "num_tokens": 100945350.0, "step": 12435 }, { "entropy": 0.1303536906838417, "epoch": 7.020316027088036, "grad_norm": 1.7125003337860107, "learning_rate": 3.97104479815645e-06, "loss": 0.0193, "mean_token_accuracy": 0.9947200775146484, "num_tokens": 100986018.0, "step": 12440 }, { "entropy": 0.13805813938379288, "epoch": 7.02313769751693, "grad_norm": 1.2138659954071045, "learning_rate": 3.9703030766181634e-06, "loss": 0.0166, "mean_token_accuracy": 0.9959219574928284, "num_tokens": 101026679.0, "step": 12445 }, { "entropy": 0.11290259510278702, "epoch": 7.025959367945824, "grad_norm": 1.9468364715576172, "learning_rate": 3.969561189019977e-06, "loss": 0.0156, "mean_token_accuracy": 0.9957970261573792, "num_tokens": 101067288.0, "step": 12450 }, { "entropy": 0.12177327871322632, "epoch": 7.028781038374718, "grad_norm": 1.656708836555481, "learning_rate": 3.968819135507636e-06, "loss": 0.0212, "mean_token_accuracy": 0.994585645198822, "num_tokens": 101107815.0, "step": 12455 }, { "entropy": 0.13150275498628616, "epoch": 7.031602708803612, "grad_norm": 1.2296918630599976, "learning_rate": 3.968076916226914e-06, "loss": 0.0143, "mean_token_accuracy": 0.9961319208145142, "num_tokens": 101148543.0, "step": 12460 }, { "entropy": 0.13319746106863023, "epoch": 7.034424379232505, "grad_norm": 1.9725842475891113, "learning_rate": 3.967334531323624e-06, "loss": 0.0166, "mean_token_accuracy": 0.995919120311737, "num_tokens": 101189223.0, "step": 12465 }, { "entropy": 0.13151731789112092, "epoch": 7.0372460496614, "grad_norm": 1.4561344385147095, "learning_rate": 3.966591980943605e-06, "loss": 0.0189, "mean_token_accuracy": 0.995412039756775, "num_tokens": 101229757.0, "step": 12470 }, { "entropy": 0.12682377845048903, "epoch": 7.040067720090294, "grad_norm": 1.5752171277999878, "learning_rate": 3.965849265232732e-06, "loss": 0.0162, "mean_token_accuracy": 0.996173620223999, "num_tokens": 101270029.0, "step": 12475 }, { "entropy": 0.1453809142112732, "epoch": 7.042889390519187, "grad_norm": 1.8831138610839844, "learning_rate": 3.965106384336912e-06, "loss": 0.0178, "mean_token_accuracy": 0.995201587677002, "num_tokens": 101310756.0, "step": 12480 }, { "entropy": 0.13473598957061766, "epoch": 7.045711060948081, "grad_norm": 1.9781758785247803, "learning_rate": 3.964363338402083e-06, "loss": 0.0181, "mean_token_accuracy": 0.9956030488014221, "num_tokens": 101350870.0, "step": 12485 }, { "entropy": 0.13227078914642335, "epoch": 7.048532731376975, "grad_norm": 1.273861289024353, "learning_rate": 3.9636201275742175e-06, "loss": 0.0178, "mean_token_accuracy": 0.9954041123390198, "num_tokens": 101391393.0, "step": 12490 }, { "entropy": 0.130999419093132, "epoch": 7.051354401805869, "grad_norm": 1.901928186416626, "learning_rate": 3.962876751999318e-06, "loss": 0.0211, "mean_token_accuracy": 0.9943174481391907, "num_tokens": 101431784.0, "step": 12495 }, { "entropy": 0.13049811720848084, "epoch": 7.054176072234763, "grad_norm": 1.6979538202285767, "learning_rate": 3.962133211823424e-06, "loss": 0.0206, "mean_token_accuracy": 0.9947118163108826, "num_tokens": 101472388.0, "step": 12500 }, { "epoch": 7.054176072234763, "eval_entropy": 0.1905188113451004, "eval_loss": 0.012303678318858147, "eval_mean_token_accuracy": 0.9973169565200806, "eval_num_tokens": 101472388.0, "eval_runtime": 0.1636, "eval_samples_per_second": 24.454, "eval_steps_per_second": 6.113, "step": 12500 } ], "logging_steps": 5, "max_steps": 35440, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.212206797834539e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }