diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,25309 @@ +{ + "best_global_step": 12500, + "best_metric": 0.012303678318858147, + "best_model_checkpoint": "./sft_model/checkpoint-12500", + "epoch": 7.054176072234763, + "eval_steps": 500, + "global_step": 12500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 0.7264264225959778, + "epoch": 0.0028216704288939053, + "grad_norm": 1.9348883628845215, + "learning_rate": 4.999999882129922e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.8013904571533204, + "num_tokens": 40154.0, + "step": 5 + }, + { + "entropy": 0.8295215606689453, + "epoch": 0.0056433408577878106, + "grad_norm": 1.6994550228118896, + "learning_rate": 4.999999403282752e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.8162339448928833, + "num_tokens": 80736.0, + "step": 10 + }, + { + "entropy": 0.8583701252937317, + "epoch": 0.008465011286681716, + "grad_norm": 1.4073373079299927, + "learning_rate": 4.999998556091706e-06, + "loss": 0.7051, + "mean_token_accuracy": 0.8052006244659424, + "num_tokens": 121191.0, + "step": 15 + }, + { + "entropy": 0.8272644519805908, + "epoch": 0.011286681715575621, + "grad_norm": 1.322540044784546, + "learning_rate": 4.999997340556951e-06, + "loss": 0.6492, + "mean_token_accuracy": 0.814669954776764, + "num_tokens": 161873.0, + "step": 20 + }, + { + "entropy": 0.7710798859596253, + "epoch": 0.014108352144469526, + "grad_norm": 1.4983023405075073, + "learning_rate": 4.999995756678724e-06, + "loss": 0.6253, + "mean_token_accuracy": 0.8207517623901367, + "num_tokens": 202530.0, + "step": 25 + }, + { + "entropy": 0.7607816934585572, + "epoch": 0.016930022573363433, + "grad_norm": 1.315825343132019, + "learning_rate": 4.999993804457336e-06, + "loss": 0.6278, + "mean_token_accuracy": 0.8195699453353882, + "num_tokens": 242963.0, + "step": 30 + }, + { + "entropy": 0.6939435482025147, + "epoch": 0.019751693002257337, + "grad_norm": 1.3920537233352661, + "learning_rate": 4.999991483893173e-06, + "loss": 0.5477, + "mean_token_accuracy": 0.8409396290779114, + "num_tokens": 283687.0, + "step": 35 + }, + { + "entropy": 0.7462215900421143, + "epoch": 0.022573363431151242, + "grad_norm": 1.3442474603652954, + "learning_rate": 4.999988794986688e-06, + "loss": 0.6124, + "mean_token_accuracy": 0.822938334941864, + "num_tokens": 324425.0, + "step": 40 + }, + { + "entropy": 0.7756533026695251, + "epoch": 0.025395033860045147, + "grad_norm": 1.3415160179138184, + "learning_rate": 4.999985737738411e-06, + "loss": 0.6479, + "mean_token_accuracy": 0.8175343155860901, + "num_tokens": 365286.0, + "step": 45 + }, + { + "entropy": 0.7056244015693665, + "epoch": 0.028216704288939052, + "grad_norm": 1.4092621803283691, + "learning_rate": 4.999982312148941e-06, + "loss": 0.5813, + "mean_token_accuracy": 0.8333210825920105, + "num_tokens": 405893.0, + "step": 50 + }, + { + "entropy": 0.7671143412590027, + "epoch": 0.031038374717832957, + "grad_norm": 1.3218460083007812, + "learning_rate": 4.999978518218954e-06, + "loss": 0.6215, + "mean_token_accuracy": 0.8200177073478698, + "num_tokens": 446486.0, + "step": 55 + }, + { + "entropy": 0.71772620677948, + "epoch": 0.033860045146726865, + "grad_norm": 1.3232901096343994, + "learning_rate": 4.999974355949192e-06, + "loss": 0.5928, + "mean_token_accuracy": 0.825872802734375, + "num_tokens": 487151.0, + "step": 60 + }, + { + "entropy": 0.8025589108467102, + "epoch": 0.03668171557562077, + "grad_norm": 1.3470555543899536, + "learning_rate": 4.999969825340475e-06, + "loss": 0.6601, + "mean_token_accuracy": 0.8110421895980835, + "num_tokens": 527748.0, + "step": 65 + }, + { + "entropy": 0.6747750401496887, + "epoch": 0.039503386004514675, + "grad_norm": 1.3361178636550903, + "learning_rate": 4.999964926393691e-06, + "loss": 0.5425, + "mean_token_accuracy": 0.8391481637954712, + "num_tokens": 568462.0, + "step": 70 + }, + { + "entropy": 0.7461091756820679, + "epoch": 0.04232505643340858, + "grad_norm": 1.4912503957748413, + "learning_rate": 4.999959659109804e-06, + "loss": 0.5997, + "mean_token_accuracy": 0.8242739796638489, + "num_tokens": 608879.0, + "step": 75 + }, + { + "entropy": 0.7261611700057984, + "epoch": 0.045146726862302484, + "grad_norm": 1.2540172338485718, + "learning_rate": 4.999954023489848e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.8286155581474304, + "num_tokens": 649552.0, + "step": 80 + }, + { + "entropy": 0.7150743484497071, + "epoch": 0.04796839729119639, + "grad_norm": 1.4506585597991943, + "learning_rate": 4.99994801953493e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8319103837013244, + "num_tokens": 690358.0, + "step": 85 + }, + { + "entropy": 0.6064900994300843, + "epoch": 0.050790067720090294, + "grad_norm": 1.2554188966751099, + "learning_rate": 4.999941647246231e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.851774251461029, + "num_tokens": 731078.0, + "step": 90 + }, + { + "entropy": 0.6873465299606323, + "epoch": 0.0536117381489842, + "grad_norm": 1.2316175699234009, + "learning_rate": 4.9999349066250014e-06, + "loss": 0.5744, + "mean_token_accuracy": 0.8320832848548889, + "num_tokens": 771649.0, + "step": 95 + }, + { + "entropy": 0.7353047490119934, + "epoch": 0.056433408577878104, + "grad_norm": 2.000030994415283, + "learning_rate": 4.9999277976725655e-06, + "loss": 0.5994, + "mean_token_accuracy": 0.8255870938301086, + "num_tokens": 812286.0, + "step": 100 + }, + { + "entropy": 0.7349668025970459, + "epoch": 0.05925507900677201, + "grad_norm": 1.4496697187423706, + "learning_rate": 4.999920320390319e-06, + "loss": 0.5832, + "mean_token_accuracy": 0.8332556962966919, + "num_tokens": 853019.0, + "step": 105 + }, + { + "entropy": 0.7148452520370483, + "epoch": 0.062076749435665914, + "grad_norm": 1.39479660987854, + "learning_rate": 4.999912474779733e-06, + "loss": 0.6143, + "mean_token_accuracy": 0.8211281180381775, + "num_tokens": 893817.0, + "step": 110 + }, + { + "entropy": 0.7357437491416932, + "epoch": 0.06489841986455983, + "grad_norm": 1.3685375452041626, + "learning_rate": 4.999904260842348e-06, + "loss": 0.5979, + "mean_token_accuracy": 0.8241528987884521, + "num_tokens": 934521.0, + "step": 115 + }, + { + "entropy": 0.6611983060836792, + "epoch": 0.06772009029345373, + "grad_norm": 1.266114354133606, + "learning_rate": 4.999895678579776e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.8424633145332336, + "num_tokens": 975258.0, + "step": 120 + }, + { + "entropy": 0.6882431030273437, + "epoch": 0.07054176072234764, + "grad_norm": 1.4671064615249634, + "learning_rate": 4.999886727993704e-06, + "loss": 0.5649, + "mean_token_accuracy": 0.8341028809547424, + "num_tokens": 1015761.0, + "step": 125 + }, + { + "entropy": 0.7173007011413575, + "epoch": 0.07336343115124154, + "grad_norm": 1.4032313823699951, + "learning_rate": 4.999877409085892e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8307689428329468, + "num_tokens": 1056694.0, + "step": 130 + }, + { + "entropy": 0.6990100502967834, + "epoch": 0.07618510158013544, + "grad_norm": 1.3346563577651978, + "learning_rate": 4.999867721858168e-06, + "loss": 0.591, + "mean_token_accuracy": 0.8263980627059937, + "num_tokens": 1097488.0, + "step": 135 + }, + { + "entropy": 0.6879770636558533, + "epoch": 0.07900677200902935, + "grad_norm": 1.5278775691986084, + "learning_rate": 4.999857666312438e-06, + "loss": 0.5686, + "mean_token_accuracy": 0.8352189302444458, + "num_tokens": 1138244.0, + "step": 140 + }, + { + "entropy": 0.6410947203636169, + "epoch": 0.08182844243792325, + "grad_norm": 1.5688629150390625, + "learning_rate": 4.999847242450674e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8461119532585144, + "num_tokens": 1179012.0, + "step": 145 + }, + { + "entropy": 0.669737708568573, + "epoch": 0.08465011286681716, + "grad_norm": 1.421505093574524, + "learning_rate": 4.999836450274926e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8409364104270936, + "num_tokens": 1219636.0, + "step": 150 + }, + { + "entropy": 0.7268872022628784, + "epoch": 0.08747178329571106, + "grad_norm": 1.4139888286590576, + "learning_rate": 4.999825289787314e-06, + "loss": 0.5906, + "mean_token_accuracy": 0.8278973698616028, + "num_tokens": 1260358.0, + "step": 155 + }, + { + "entropy": 0.6793547749519349, + "epoch": 0.09029345372460497, + "grad_norm": 1.3994945287704468, + "learning_rate": 4.99981376099003e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.8384710073471069, + "num_tokens": 1300160.0, + "step": 160 + }, + { + "entropy": 0.7041888356208801, + "epoch": 0.09311512415349887, + "grad_norm": 1.2893599271774292, + "learning_rate": 4.999801863885339e-06, + "loss": 0.5769, + "mean_token_accuracy": 0.8308470964431762, + "num_tokens": 1340909.0, + "step": 165 + }, + { + "entropy": 0.7476347208023071, + "epoch": 0.09593679458239278, + "grad_norm": 1.4180552959442139, + "learning_rate": 4.999789598475578e-06, + "loss": 0.6191, + "mean_token_accuracy": 0.8223087549209595, + "num_tokens": 1381254.0, + "step": 170 + }, + { + "entropy": 0.7091502547264099, + "epoch": 0.09875846501128668, + "grad_norm": 1.2832435369491577, + "learning_rate": 4.999776964763157e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8346888542175293, + "num_tokens": 1421956.0, + "step": 175 + }, + { + "entropy": 0.6569717288017273, + "epoch": 0.10158013544018059, + "grad_norm": 1.4216198921203613, + "learning_rate": 4.999763962750557e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8407601952552796, + "num_tokens": 1462724.0, + "step": 180 + }, + { + "entropy": 0.7254927158355713, + "epoch": 0.1044018058690745, + "grad_norm": 1.367143154144287, + "learning_rate": 4.999750592440333e-06, + "loss": 0.603, + "mean_token_accuracy": 0.8244819164276123, + "num_tokens": 1503032.0, + "step": 185 + }, + { + "entropy": 0.6174264311790466, + "epoch": 0.1072234762979684, + "grad_norm": 1.1871576309204102, + "learning_rate": 4.999736853835111e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8440237402915954, + "num_tokens": 1543678.0, + "step": 190 + }, + { + "entropy": 0.6621920347213746, + "epoch": 0.1100451467268623, + "grad_norm": 1.3519974946975708, + "learning_rate": 4.999722746937591e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8406000256538391, + "num_tokens": 1584469.0, + "step": 195 + }, + { + "entropy": 0.72998526096344, + "epoch": 0.11286681715575621, + "grad_norm": 1.5044736862182617, + "learning_rate": 4.999708271750544e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.8296970129013062, + "num_tokens": 1625117.0, + "step": 200 + }, + { + "entropy": 0.620005464553833, + "epoch": 0.11568848758465011, + "grad_norm": 1.2598135471343994, + "learning_rate": 4.999693428276813e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8482168078422546, + "num_tokens": 1665890.0, + "step": 205 + }, + { + "entropy": 0.7349537253379822, + "epoch": 0.11851015801354402, + "grad_norm": 1.290712833404541, + "learning_rate": 4.999678216519314e-06, + "loss": 0.5826, + "mean_token_accuracy": 0.8309862017631531, + "num_tokens": 1706501.0, + "step": 210 + }, + { + "entropy": 0.6702666759490967, + "epoch": 0.12133182844243792, + "grad_norm": 1.1792408227920532, + "learning_rate": 4.999662636481035e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.8420506477355957, + "num_tokens": 1747287.0, + "step": 215 + }, + { + "entropy": 0.7209498405456543, + "epoch": 0.12415349887133183, + "grad_norm": 1.4285236597061157, + "learning_rate": 4.999646688165039e-06, + "loss": 0.5868, + "mean_token_accuracy": 0.8266986727714538, + "num_tokens": 1787814.0, + "step": 220 + }, + { + "entropy": 0.6917775988578796, + "epoch": 0.12697516930022573, + "grad_norm": 1.4500813484191895, + "learning_rate": 4.999630371574457e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8405853033065795, + "num_tokens": 1828330.0, + "step": 225 + }, + { + "entropy": 0.7286237478256226, + "epoch": 0.12979683972911965, + "grad_norm": 1.197171688079834, + "learning_rate": 4.999613686712493e-06, + "loss": 0.5945, + "mean_token_accuracy": 0.8261087775230408, + "num_tokens": 1868916.0, + "step": 230 + }, + { + "entropy": 0.7036802411079407, + "epoch": 0.13261851015801354, + "grad_norm": 1.3373335599899292, + "learning_rate": 4.999596633582429e-06, + "loss": 0.5736, + "mean_token_accuracy": 0.8307413101196289, + "num_tokens": 1909557.0, + "step": 235 + }, + { + "entropy": 0.6876640558242798, + "epoch": 0.13544018058690746, + "grad_norm": 1.2515480518341064, + "learning_rate": 4.999579212187611e-06, + "loss": 0.5421, + "mean_token_accuracy": 0.8372658848762512, + "num_tokens": 1949962.0, + "step": 240 + }, + { + "entropy": 0.6509540438652038, + "epoch": 0.13826185101580135, + "grad_norm": 1.2757313251495361, + "learning_rate": 4.999561422531464e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8403837084770203, + "num_tokens": 1990703.0, + "step": 245 + }, + { + "entropy": 0.6634021878242493, + "epoch": 0.14108352144469527, + "grad_norm": 1.3375166654586792, + "learning_rate": 4.9995432646174815e-06, + "loss": 0.5638, + "mean_token_accuracy": 0.831636929512024, + "num_tokens": 2031252.0, + "step": 250 + }, + { + "entropy": 0.7269237041473389, + "epoch": 0.14390519187358916, + "grad_norm": 1.3792425394058228, + "learning_rate": 4.9995247384492314e-06, + "loss": 0.5982, + "mean_token_accuracy": 0.8250751614570617, + "num_tokens": 2071662.0, + "step": 255 + }, + { + "entropy": 0.7198888540267945, + "epoch": 0.14672686230248308, + "grad_norm": 1.4608137607574463, + "learning_rate": 4.999505844030352e-06, + "loss": 0.5952, + "mean_token_accuracy": 0.8260551571846009, + "num_tokens": 2112264.0, + "step": 260 + }, + { + "entropy": 0.6873027205467224, + "epoch": 0.14954853273137697, + "grad_norm": 1.3915055990219116, + "learning_rate": 4.999486581364557e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.837960147857666, + "num_tokens": 2152995.0, + "step": 265 + }, + { + "entropy": 0.7061834812164307, + "epoch": 0.1523702031602709, + "grad_norm": 1.3316701650619507, + "learning_rate": 4.999466950455628e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.8314638137817383, + "num_tokens": 2193728.0, + "step": 270 + }, + { + "entropy": 0.6988749980926514, + "epoch": 0.15519187358916478, + "grad_norm": 1.4248734712600708, + "learning_rate": 4.999446951307424e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8328610777854919, + "num_tokens": 2234493.0, + "step": 275 + }, + { + "entropy": 0.7523432970046997, + "epoch": 0.1580135440180587, + "grad_norm": 1.5484312772750854, + "learning_rate": 4.999426583923873e-06, + "loss": 0.6236, + "mean_token_accuracy": 0.8170114874839782, + "num_tokens": 2274609.0, + "step": 280 + }, + { + "entropy": 0.6838536024093628, + "epoch": 0.1608352144469526, + "grad_norm": 1.367386817932129, + "learning_rate": 4.999405848308975e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.842476773262024, + "num_tokens": 2315124.0, + "step": 285 + }, + { + "entropy": 0.6006429553031921, + "epoch": 0.1636568848758465, + "grad_norm": 1.3099923133850098, + "learning_rate": 4.999384744466805e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8527785658836364, + "num_tokens": 2355713.0, + "step": 290 + }, + { + "entropy": 0.6720806241035462, + "epoch": 0.1664785553047404, + "grad_norm": 1.2152323722839355, + "learning_rate": 4.999363272401508e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8428802371025086, + "num_tokens": 2395726.0, + "step": 295 + }, + { + "entropy": 0.730102264881134, + "epoch": 0.16930022573363432, + "grad_norm": 1.3833410739898682, + "learning_rate": 4.9993414321173014e-06, + "loss": 0.6063, + "mean_token_accuracy": 0.8238815784454345, + "num_tokens": 2436268.0, + "step": 300 + }, + { + "entropy": 0.7030096173286438, + "epoch": 0.1721218961625282, + "grad_norm": 1.3912793397903442, + "learning_rate": 4.9993192236184786e-06, + "loss": 0.5728, + "mean_token_accuracy": 0.829773461818695, + "num_tokens": 2476941.0, + "step": 305 + }, + { + "entropy": 0.692926001548767, + "epoch": 0.17494356659142213, + "grad_norm": 1.406414270401001, + "learning_rate": 4.9992966469094005e-06, + "loss": 0.5581, + "mean_token_accuracy": 0.8338650107383728, + "num_tokens": 2517495.0, + "step": 310 + }, + { + "entropy": 0.7009713649749756, + "epoch": 0.17776523702031602, + "grad_norm": 1.4576942920684814, + "learning_rate": 4.999273701994501e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.8266690850257874, + "num_tokens": 2558100.0, + "step": 315 + }, + { + "entropy": 0.6718906760215759, + "epoch": 0.18058690744920994, + "grad_norm": 1.4303754568099976, + "learning_rate": 4.999250388878291e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.830142343044281, + "num_tokens": 2598172.0, + "step": 320 + }, + { + "entropy": 0.6644892454147339, + "epoch": 0.18340857787810383, + "grad_norm": 1.3855103254318237, + "learning_rate": 4.999226707565348e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8401578664779663, + "num_tokens": 2638766.0, + "step": 325 + }, + { + "entropy": 0.6168012738227844, + "epoch": 0.18623024830699775, + "grad_norm": 1.3673598766326904, + "learning_rate": 4.999202658060324e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8519076347351074, + "num_tokens": 2679412.0, + "step": 330 + }, + { + "entropy": 0.7058925032615662, + "epoch": 0.18905191873589164, + "grad_norm": 1.2969095706939697, + "learning_rate": 4.9991782403679445e-06, + "loss": 0.5568, + "mean_token_accuracy": 0.8322369575500488, + "num_tokens": 2720202.0, + "step": 335 + }, + { + "entropy": 0.6692469239234924, + "epoch": 0.19187358916478556, + "grad_norm": 1.3060578107833862, + "learning_rate": 4.999153454493006e-06, + "loss": 0.5492, + "mean_token_accuracy": 0.837912917137146, + "num_tokens": 2760837.0, + "step": 340 + }, + { + "entropy": 0.6806850790977478, + "epoch": 0.19469525959367945, + "grad_norm": 1.3894520998001099, + "learning_rate": 4.999128300440377e-06, + "loss": 0.5959, + "mean_token_accuracy": 0.8271700620651246, + "num_tokens": 2801488.0, + "step": 345 + }, + { + "entropy": 0.6444134593009949, + "epoch": 0.19751693002257337, + "grad_norm": 1.2214727401733398, + "learning_rate": 4.9991027782150005e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8515629053115845, + "num_tokens": 2842036.0, + "step": 350 + }, + { + "entropy": 0.720831036567688, + "epoch": 0.20033860045146726, + "grad_norm": 1.3418818712234497, + "learning_rate": 4.99907688782189e-06, + "loss": 0.5878, + "mean_token_accuracy": 0.8290072560310364, + "num_tokens": 2882628.0, + "step": 355 + }, + { + "entropy": 0.6308605074882507, + "epoch": 0.20316027088036118, + "grad_norm": 1.3402037620544434, + "learning_rate": 4.9990506292661315e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8474404573440552, + "num_tokens": 2923198.0, + "step": 360 + }, + { + "entropy": 0.6803479313850402, + "epoch": 0.20598194130925507, + "grad_norm": 1.4827245473861694, + "learning_rate": 4.9990240025528825e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8327592492103577, + "num_tokens": 2963798.0, + "step": 365 + }, + { + "entropy": 0.6540728807449341, + "epoch": 0.208803611738149, + "grad_norm": 1.3350443840026855, + "learning_rate": 4.998997007687375e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8447277069091796, + "num_tokens": 3004683.0, + "step": 370 + }, + { + "entropy": 0.7112419605255127, + "epoch": 0.21162528216704288, + "grad_norm": 1.3664087057113647, + "learning_rate": 4.998969644674911e-06, + "loss": 0.5828, + "mean_token_accuracy": 0.8266646385192871, + "num_tokens": 3045439.0, + "step": 375 + }, + { + "entropy": 0.6424817204475403, + "epoch": 0.2144469525959368, + "grad_norm": 1.295762300491333, + "learning_rate": 4.998941913520867e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8423674941062927, + "num_tokens": 3086082.0, + "step": 380 + }, + { + "entropy": 0.6630952477455139, + "epoch": 0.2172686230248307, + "grad_norm": 1.189756989479065, + "learning_rate": 4.998913814230691e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.839359712600708, + "num_tokens": 3126821.0, + "step": 385 + }, + { + "entropy": 0.7343137979507446, + "epoch": 0.2200902934537246, + "grad_norm": 1.3133877515792847, + "learning_rate": 4.998885346809902e-06, + "loss": 0.616, + "mean_token_accuracy": 0.820310366153717, + "num_tokens": 3167554.0, + "step": 390 + }, + { + "entropy": 0.7052167534828186, + "epoch": 0.2229119638826185, + "grad_norm": 1.2900844812393188, + "learning_rate": 4.998856511264094e-06, + "loss": 0.5687, + "mean_token_accuracy": 0.8320470333099366, + "num_tokens": 3208157.0, + "step": 395 + }, + { + "entropy": 0.7610344290733337, + "epoch": 0.22573363431151242, + "grad_norm": 1.5215799808502197, + "learning_rate": 4.99882730759893e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.81887845993042, + "num_tokens": 3248261.0, + "step": 400 + }, + { + "entropy": 0.6336552977561951, + "epoch": 0.22855530474040633, + "grad_norm": 1.2715165615081787, + "learning_rate": 4.9987977358201475e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8473275661468506, + "num_tokens": 3288888.0, + "step": 405 + }, + { + "entropy": 0.6352953314781189, + "epoch": 0.23137697516930023, + "grad_norm": 1.3592379093170166, + "learning_rate": 4.998767795933557e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8488934755325317, + "num_tokens": 3329423.0, + "step": 410 + }, + { + "entropy": 0.6898619294166565, + "epoch": 0.23419864559819414, + "grad_norm": 1.5599400997161865, + "learning_rate": 4.998737487945039e-06, + "loss": 0.5475, + "mean_token_accuracy": 0.8362764358520508, + "num_tokens": 3369896.0, + "step": 415 + }, + { + "entropy": 0.6755191922187805, + "epoch": 0.23702031602708803, + "grad_norm": 1.3778337240219116, + "learning_rate": 4.998706811860548e-06, + "loss": 0.5762, + "mean_token_accuracy": 0.8319776773452758, + "num_tokens": 3410527.0, + "step": 420 + }, + { + "entropy": 0.6867754459381104, + "epoch": 0.23984198645598195, + "grad_norm": 1.3699222803115845, + "learning_rate": 4.99867576768611e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.839304780960083, + "num_tokens": 3451077.0, + "step": 425 + }, + { + "entropy": 0.683435583114624, + "epoch": 0.24266365688487584, + "grad_norm": 1.3843740224838257, + "learning_rate": 4.9986443554278244e-06, + "loss": 0.5439, + "mean_token_accuracy": 0.8384104609489441, + "num_tokens": 3491423.0, + "step": 430 + }, + { + "entropy": 0.6476885080337524, + "epoch": 0.24548532731376976, + "grad_norm": 1.2700644731521606, + "learning_rate": 4.998612575091861e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8458979368209839, + "num_tokens": 3532023.0, + "step": 435 + }, + { + "entropy": 0.6664133548736573, + "epoch": 0.24830699774266365, + "grad_norm": 1.3746997117996216, + "learning_rate": 4.998580426684464e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8449108362197876, + "num_tokens": 3572699.0, + "step": 440 + }, + { + "entropy": 0.6357210874557495, + "epoch": 0.25112866817155755, + "grad_norm": 1.2438454627990723, + "learning_rate": 4.99854791021195e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8491321802139282, + "num_tokens": 3613440.0, + "step": 445 + }, + { + "entropy": 0.627473509311676, + "epoch": 0.25395033860045146, + "grad_norm": 1.1443346738815308, + "learning_rate": 4.998515025680703e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8488677740097046, + "num_tokens": 3653969.0, + "step": 450 + }, + { + "entropy": 0.6401142597198486, + "epoch": 0.2567720090293454, + "grad_norm": 1.3476841449737549, + "learning_rate": 4.998481773097187e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8511731863021851, + "num_tokens": 3693874.0, + "step": 455 + }, + { + "entropy": 0.645455002784729, + "epoch": 0.2595936794582393, + "grad_norm": 1.392980933189392, + "learning_rate": 4.998448152467933e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8437379837036133, + "num_tokens": 3734684.0, + "step": 460 + }, + { + "entropy": 0.7039332389831543, + "epoch": 0.26241534988713316, + "grad_norm": 1.42012357711792, + "learning_rate": 4.998414163799545e-06, + "loss": 0.5646, + "mean_token_accuracy": 0.8330328106880188, + "num_tokens": 3775233.0, + "step": 465 + }, + { + "entropy": 0.6496623396873474, + "epoch": 0.2652370203160271, + "grad_norm": 1.168820858001709, + "learning_rate": 4.998379807098703e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8448665142059326, + "num_tokens": 3815898.0, + "step": 470 + }, + { + "entropy": 0.6927712917327881, + "epoch": 0.268058690744921, + "grad_norm": 1.277652621269226, + "learning_rate": 4.998345082372153e-06, + "loss": 0.5485, + "mean_token_accuracy": 0.8371129631996155, + "num_tokens": 3856530.0, + "step": 475 + }, + { + "entropy": 0.6336887121200562, + "epoch": 0.2708803611738149, + "grad_norm": 1.3250688314437866, + "learning_rate": 4.998309989626718e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8498493075370789, + "num_tokens": 3897409.0, + "step": 480 + }, + { + "entropy": 0.645351254940033, + "epoch": 0.2737020316027088, + "grad_norm": 1.1753090620040894, + "learning_rate": 4.998274528869292e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.8436009883880615, + "num_tokens": 3938183.0, + "step": 485 + }, + { + "entropy": 0.6607337474822998, + "epoch": 0.2765237020316027, + "grad_norm": 1.386691927909851, + "learning_rate": 4.998238700106842e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8447944283485412, + "num_tokens": 3978774.0, + "step": 490 + }, + { + "entropy": 0.7330415606498718, + "epoch": 0.2793453724604966, + "grad_norm": 1.3130143880844116, + "learning_rate": 4.998202503346405e-06, + "loss": 0.5945, + "mean_token_accuracy": 0.8254420638084412, + "num_tokens": 4019301.0, + "step": 495 + }, + { + "entropy": 0.6656695961952209, + "epoch": 0.28216704288939054, + "grad_norm": 1.4055564403533936, + "learning_rate": 4.998165938595094e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8389903545379639, + "num_tokens": 4059906.0, + "step": 500 + }, + { + "epoch": 0.28216704288939054, + "eval_entropy": 0.6397659778594971, + "eval_loss": 0.5099606513977051, + "eval_mean_token_accuracy": 0.857033371925354, + "eval_num_tokens": 4059906.0, + "eval_runtime": 0.1662, + "eval_samples_per_second": 24.067, + "eval_steps_per_second": 6.017, + "step": 500 + }, + { + "entropy": 0.6222930490970612, + "epoch": 0.2849887133182844, + "grad_norm": 1.3186826705932617, + "learning_rate": 4.99812900586009e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8475234031677246, + "num_tokens": 4100501.0, + "step": 505 + }, + { + "entropy": 0.6861591100692749, + "epoch": 0.2878103837471783, + "grad_norm": 1.2481554746627808, + "learning_rate": 4.998091705148649e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.8370915293693543, + "num_tokens": 4141056.0, + "step": 510 + }, + { + "entropy": 0.6614163756370545, + "epoch": 0.29063205417607224, + "grad_norm": 1.3787559270858765, + "learning_rate": 4.998054036468099e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8423222064971924, + "num_tokens": 4181622.0, + "step": 515 + }, + { + "entropy": 0.6498169541358948, + "epoch": 0.29345372460496616, + "grad_norm": 1.140236735343933, + "learning_rate": 4.9980159998258406e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.846357011795044, + "num_tokens": 4222377.0, + "step": 520 + }, + { + "entropy": 0.6879486322402955, + "epoch": 0.29627539503386, + "grad_norm": 1.4712022542953491, + "learning_rate": 4.997977595229346e-06, + "loss": 0.5573, + "mean_token_accuracy": 0.835738730430603, + "num_tokens": 4263122.0, + "step": 525 + }, + { + "entropy": 0.6771662116050721, + "epoch": 0.29909706546275394, + "grad_norm": 1.3491076231002808, + "learning_rate": 4.997938822686158e-06, + "loss": 0.547, + "mean_token_accuracy": 0.8368203639984131, + "num_tokens": 4303690.0, + "step": 530 + }, + { + "entropy": 0.614617896080017, + "epoch": 0.30191873589164786, + "grad_norm": 1.2441357374191284, + "learning_rate": 4.9978996822038964e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8524455547332763, + "num_tokens": 4344245.0, + "step": 535 + }, + { + "entropy": 0.6798507928848266, + "epoch": 0.3047404063205418, + "grad_norm": 1.3364735841751099, + "learning_rate": 4.997860173790247e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.8381256461143494, + "num_tokens": 4384408.0, + "step": 540 + }, + { + "entropy": 0.6491321206092835, + "epoch": 0.30756207674943564, + "grad_norm": 1.3930151462554932, + "learning_rate": 4.997820297452975e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8447855472564697, + "num_tokens": 4424758.0, + "step": 545 + }, + { + "entropy": 0.6349780797958374, + "epoch": 0.31038374717832956, + "grad_norm": 1.406150460243225, + "learning_rate": 4.99778005319991e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8480640172958374, + "num_tokens": 4465315.0, + "step": 550 + }, + { + "entropy": 0.6838363170623779, + "epoch": 0.3132054176072235, + "grad_norm": 1.5294631719589233, + "learning_rate": 4.997739441038962e-06, + "loss": 0.5587, + "mean_token_accuracy": 0.8342589735984802, + "num_tokens": 4505898.0, + "step": 555 + }, + { + "entropy": 0.7220178842544556, + "epoch": 0.3160270880361174, + "grad_norm": 1.3367938995361328, + "learning_rate": 4.997698460978107e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.8219369530677796, + "num_tokens": 4546647.0, + "step": 560 + }, + { + "entropy": 0.6441617131233215, + "epoch": 0.31884875846501126, + "grad_norm": 1.1907154321670532, + "learning_rate": 4.997657113025395e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8480484962463379, + "num_tokens": 4587353.0, + "step": 565 + }, + { + "entropy": 0.7197385430335999, + "epoch": 0.3216704288939052, + "grad_norm": 1.5142972469329834, + "learning_rate": 4.99761539718895e-06, + "loss": 0.5569, + "mean_token_accuracy": 0.8342020034790039, + "num_tokens": 4627673.0, + "step": 570 + }, + { + "entropy": 0.7443452477455139, + "epoch": 0.3244920993227991, + "grad_norm": 1.2722396850585938, + "learning_rate": 4.997573313476966e-06, + "loss": 0.6036, + "mean_token_accuracy": 0.8234388828277588, + "num_tokens": 4668389.0, + "step": 575 + }, + { + "entropy": 0.6642969369888305, + "epoch": 0.327313769751693, + "grad_norm": 1.3420699834823608, + "learning_rate": 4.997530861897713e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8478710532188416, + "num_tokens": 4708970.0, + "step": 580 + }, + { + "entropy": 0.6479909062385559, + "epoch": 0.3301354401805869, + "grad_norm": 1.422701358795166, + "learning_rate": 4.997488042459528e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8437910914421082, + "num_tokens": 4749674.0, + "step": 585 + }, + { + "entropy": 0.6523343324661255, + "epoch": 0.3329571106094808, + "grad_norm": 1.3672442436218262, + "learning_rate": 4.997444855170823e-06, + "loss": 0.531, + "mean_token_accuracy": 0.8426079154014587, + "num_tokens": 4790249.0, + "step": 590 + }, + { + "entropy": 0.6659448742866516, + "epoch": 0.3357787810383747, + "grad_norm": 1.267858624458313, + "learning_rate": 4.997401300040084e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8389504075050354, + "num_tokens": 4831050.0, + "step": 595 + }, + { + "entropy": 0.6824786901473999, + "epoch": 0.33860045146726864, + "grad_norm": 1.3464597463607788, + "learning_rate": 4.997357377075866e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.8403973698616027, + "num_tokens": 4871225.0, + "step": 600 + }, + { + "entropy": 0.6604692697525024, + "epoch": 0.34142212189616256, + "grad_norm": 1.4553182125091553, + "learning_rate": 4.997313086286797e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8407448887825012, + "num_tokens": 4911582.0, + "step": 605 + }, + { + "entropy": 0.6908664226531982, + "epoch": 0.3442437923250564, + "grad_norm": 1.3354190587997437, + "learning_rate": 4.997268427681579e-06, + "loss": 0.5585, + "mean_token_accuracy": 0.8351940751075745, + "num_tokens": 4952166.0, + "step": 610 + }, + { + "entropy": 0.6428348064422608, + "epoch": 0.34706546275395034, + "grad_norm": 1.2946590185165405, + "learning_rate": 4.997223401268985e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8446980834007263, + "num_tokens": 4992726.0, + "step": 615 + }, + { + "entropy": 0.6469574570655823, + "epoch": 0.34988713318284426, + "grad_norm": 1.3383957147598267, + "learning_rate": 4.9971780070578605e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8440989255905151, + "num_tokens": 5033375.0, + "step": 620 + }, + { + "entropy": 0.6753599047660828, + "epoch": 0.3527088036117382, + "grad_norm": 1.4977456331253052, + "learning_rate": 4.997132245057124e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.8380128145217896, + "num_tokens": 5074101.0, + "step": 625 + }, + { + "entropy": 0.6009699106216431, + "epoch": 0.35553047404063204, + "grad_norm": 1.3025342226028442, + "learning_rate": 4.997086115275763e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8532807230949402, + "num_tokens": 5114789.0, + "step": 630 + }, + { + "entropy": 0.7020259857177734, + "epoch": 0.35835214446952596, + "grad_norm": 1.42304265499115, + "learning_rate": 4.997039617722843e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.8297113418579102, + "num_tokens": 5155557.0, + "step": 635 + }, + { + "entropy": 0.6381742715835571, + "epoch": 0.3611738148984199, + "grad_norm": 1.3583488464355469, + "learning_rate": 4.996992752407496e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8546654939651489, + "num_tokens": 5195254.0, + "step": 640 + }, + { + "entropy": 0.7070492506027222, + "epoch": 0.3639954853273138, + "grad_norm": 1.4853992462158203, + "learning_rate": 4.996945519338929e-06, + "loss": 0.5622, + "mean_token_accuracy": 0.8343408942222595, + "num_tokens": 5235919.0, + "step": 645 + }, + { + "entropy": 0.7001669526100158, + "epoch": 0.36681715575620766, + "grad_norm": 1.263033390045166, + "learning_rate": 4.996897918526422e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.8342163324356079, + "num_tokens": 5276443.0, + "step": 650 + }, + { + "entropy": 0.6899022579193115, + "epoch": 0.3696388261851016, + "grad_norm": 1.3277238607406616, + "learning_rate": 4.996849949979325e-06, + "loss": 0.5756, + "mean_token_accuracy": 0.8299772024154664, + "num_tokens": 5317131.0, + "step": 655 + }, + { + "entropy": 0.6805570363998413, + "epoch": 0.3724604966139955, + "grad_norm": 1.2228710651397705, + "learning_rate": 4.996801613707063e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8347800970077515, + "num_tokens": 5357866.0, + "step": 660 + }, + { + "entropy": 0.6331149935722351, + "epoch": 0.3752821670428894, + "grad_norm": 1.3809823989868164, + "learning_rate": 4.9967529097191305e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8466669321060181, + "num_tokens": 5398573.0, + "step": 665 + }, + { + "entropy": 0.6736877083778381, + "epoch": 0.3781038374717833, + "grad_norm": 1.3853009939193726, + "learning_rate": 4.996703838025095e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.839972174167633, + "num_tokens": 5439412.0, + "step": 670 + }, + { + "entropy": 0.6803161859512329, + "epoch": 0.3809255079006772, + "grad_norm": 1.4266542196273804, + "learning_rate": 4.996654398634597e-06, + "loss": 0.5506, + "mean_token_accuracy": 0.8333458185195923, + "num_tokens": 5479945.0, + "step": 675 + }, + { + "entropy": 0.7106911301612854, + "epoch": 0.3837471783295711, + "grad_norm": 1.482615351676941, + "learning_rate": 4.996604591557349e-06, + "loss": 0.5753, + "mean_token_accuracy": 0.8281079649925231, + "num_tokens": 5520469.0, + "step": 680 + }, + { + "entropy": 0.6807895421981811, + "epoch": 0.38656884875846503, + "grad_norm": 1.2304539680480957, + "learning_rate": 4.996554416803137e-06, + "loss": 0.5497, + "mean_token_accuracy": 0.8346430540084839, + "num_tokens": 5561155.0, + "step": 685 + }, + { + "entropy": 0.6457465648651123, + "epoch": 0.3893905191873589, + "grad_norm": 1.470390796661377, + "learning_rate": 4.996503874381815e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.846334969997406, + "num_tokens": 5601850.0, + "step": 690 + }, + { + "entropy": 0.5989644646644592, + "epoch": 0.3922121896162528, + "grad_norm": 1.3190268278121948, + "learning_rate": 4.996452964303315e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8595061659812927, + "num_tokens": 5642455.0, + "step": 695 + }, + { + "entropy": 0.6620395064353943, + "epoch": 0.39503386004514673, + "grad_norm": 1.4160014390945435, + "learning_rate": 4.996401686577636e-06, + "loss": 0.5315, + "mean_token_accuracy": 0.8426364421844482, + "num_tokens": 5683057.0, + "step": 700 + }, + { + "entropy": 0.6351073563098908, + "epoch": 0.39785553047404065, + "grad_norm": 1.417966604232788, + "learning_rate": 4.996350041214852e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8437610864639282, + "num_tokens": 5723964.0, + "step": 705 + }, + { + "entropy": 0.7087972402572632, + "epoch": 0.4006772009029345, + "grad_norm": 1.4004472494125366, + "learning_rate": 4.996298028225111e-06, + "loss": 0.57, + "mean_token_accuracy": 0.8329956293106079, + "num_tokens": 5764573.0, + "step": 710 + }, + { + "entropy": 0.7214209794998169, + "epoch": 0.40349887133182843, + "grad_norm": 1.5199602842330933, + "learning_rate": 4.996245647618627e-06, + "loss": 0.591, + "mean_token_accuracy": 0.8271468758583069, + "num_tokens": 5805361.0, + "step": 715 + }, + { + "entropy": 0.6683865189552307, + "epoch": 0.40632054176072235, + "grad_norm": 1.3417999744415283, + "learning_rate": 4.996192899405693e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.8423861384391784, + "num_tokens": 5846241.0, + "step": 720 + }, + { + "entropy": 0.7061811447143554, + "epoch": 0.40914221218961627, + "grad_norm": 1.456238031387329, + "learning_rate": 4.996139783596671e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8380335092544555, + "num_tokens": 5886839.0, + "step": 725 + }, + { + "entropy": 0.6855465412139893, + "epoch": 0.41196388261851014, + "grad_norm": 1.3961089849472046, + "learning_rate": 4.996086300201995e-06, + "loss": 0.5363, + "mean_token_accuracy": 0.838828194141388, + "num_tokens": 5927433.0, + "step": 730 + }, + { + "entropy": 0.6245934367179871, + "epoch": 0.41478555304740405, + "grad_norm": 1.252845048904419, + "learning_rate": 4.996032449232172e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8503703594207763, + "num_tokens": 5968213.0, + "step": 735 + }, + { + "entropy": 0.6127149283885955, + "epoch": 0.417607223476298, + "grad_norm": 1.207445502281189, + "learning_rate": 4.995978230697782e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8527963757514954, + "num_tokens": 6008851.0, + "step": 740 + }, + { + "entropy": 0.7167877793312073, + "epoch": 0.4204288939051919, + "grad_norm": 1.266051173210144, + "learning_rate": 4.995923644609474e-06, + "loss": 0.5714, + "mean_token_accuracy": 0.8294572830200195, + "num_tokens": 6049545.0, + "step": 745 + }, + { + "entropy": 0.5922774732112884, + "epoch": 0.42325056433408575, + "grad_norm": 1.3726112842559814, + "learning_rate": 4.995868690977974e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8563015341758728, + "num_tokens": 6090230.0, + "step": 750 + }, + { + "entropy": 0.6408256888389587, + "epoch": 0.4260722347629797, + "grad_norm": 1.3127083778381348, + "learning_rate": 4.995813369814075e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8428861260414123, + "num_tokens": 6130808.0, + "step": 755 + }, + { + "entropy": 0.6796544313430786, + "epoch": 0.4288939051918736, + "grad_norm": 1.409447431564331, + "learning_rate": 4.995757681128648e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8407068133354187, + "num_tokens": 6171110.0, + "step": 760 + }, + { + "entropy": 0.6506537318229675, + "epoch": 0.4317155756207675, + "grad_norm": 1.2559049129486084, + "learning_rate": 4.995701624932631e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8502960324287414, + "num_tokens": 6211953.0, + "step": 765 + }, + { + "entropy": 0.6637347579002381, + "epoch": 0.4345372460496614, + "grad_norm": 1.309942603111267, + "learning_rate": 4.995645201237036e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8399680137634278, + "num_tokens": 6252408.0, + "step": 770 + }, + { + "entropy": 0.6979188203811646, + "epoch": 0.4373589164785553, + "grad_norm": 1.5395498275756836, + "learning_rate": 4.995588410052948e-06, + "loss": 0.5862, + "mean_token_accuracy": 0.8252558946609497, + "num_tokens": 6293047.0, + "step": 775 + }, + { + "entropy": 0.6070165634155273, + "epoch": 0.4401805869074492, + "grad_norm": 1.1005687713623047, + "learning_rate": 4.995531251391524e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8509817957878113, + "num_tokens": 6333702.0, + "step": 780 + }, + { + "entropy": 0.6723381996154785, + "epoch": 0.44300225733634313, + "grad_norm": 1.27623450756073, + "learning_rate": 4.995473725263992e-06, + "loss": 0.5316, + "mean_token_accuracy": 0.8400357604026795, + "num_tokens": 6374319.0, + "step": 785 + }, + { + "entropy": 0.6453313708305359, + "epoch": 0.445823927765237, + "grad_norm": 1.3552383184432983, + "learning_rate": 4.995415831681654e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8450021266937255, + "num_tokens": 6415005.0, + "step": 790 + }, + { + "entropy": 0.7531963586807251, + "epoch": 0.4486455981941309, + "grad_norm": 1.303346037864685, + "learning_rate": 4.9953575706558835e-06, + "loss": 0.6133, + "mean_token_accuracy": 0.8193735122680664, + "num_tokens": 6455608.0, + "step": 795 + }, + { + "entropy": 0.6664739489555359, + "epoch": 0.45146726862302483, + "grad_norm": 1.3953213691711426, + "learning_rate": 4.9952989421981244e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8387627482414246, + "num_tokens": 6495958.0, + "step": 800 + }, + { + "entropy": 0.6475582718849182, + "epoch": 0.45428893905191875, + "grad_norm": 1.3377810716629028, + "learning_rate": 4.995239946319895e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8416747093200684, + "num_tokens": 6536670.0, + "step": 805 + }, + { + "entropy": 0.6350281596183777, + "epoch": 0.45711060948081267, + "grad_norm": 1.3273295164108276, + "learning_rate": 4.995180583032784e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8475698709487915, + "num_tokens": 6577503.0, + "step": 810 + }, + { + "entropy": 0.6923723101615906, + "epoch": 0.45993227990970653, + "grad_norm": 1.4398008584976196, + "learning_rate": 4.9951208523484555e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.834313976764679, + "num_tokens": 6617829.0, + "step": 815 + }, + { + "entropy": 0.6901078701019288, + "epoch": 0.46275395033860045, + "grad_norm": 1.2139265537261963, + "learning_rate": 4.995060754278642e-06, + "loss": 0.5589, + "mean_token_accuracy": 0.832787299156189, + "num_tokens": 6658320.0, + "step": 820 + }, + { + "entropy": 0.592526650428772, + "epoch": 0.46557562076749437, + "grad_norm": 1.34032142162323, + "learning_rate": 4.9950002888351514e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8565322160720825, + "num_tokens": 6699023.0, + "step": 825 + }, + { + "entropy": 0.713707959651947, + "epoch": 0.4683972911963883, + "grad_norm": 1.4716837406158447, + "learning_rate": 4.994939456029859e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.8286639451980591, + "num_tokens": 6739446.0, + "step": 830 + }, + { + "entropy": 0.6702501654624939, + "epoch": 0.47121896162528215, + "grad_norm": 1.2845447063446045, + "learning_rate": 4.994878255874719e-06, + "loss": 0.5355, + "mean_token_accuracy": 0.8418485879898071, + "num_tokens": 6780190.0, + "step": 835 + }, + { + "entropy": 0.6634608149528504, + "epoch": 0.47404063205417607, + "grad_norm": 1.3624354600906372, + "learning_rate": 4.994816688381751e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.8404200553894043, + "num_tokens": 6820877.0, + "step": 840 + }, + { + "entropy": 0.7302131295204163, + "epoch": 0.47686230248307, + "grad_norm": 1.4335999488830566, + "learning_rate": 4.994754753563054e-06, + "loss": 0.5943, + "mean_token_accuracy": 0.8234637618064881, + "num_tokens": 6861674.0, + "step": 845 + }, + { + "entropy": 0.6252121210098267, + "epoch": 0.4796839729119639, + "grad_norm": 1.423103928565979, + "learning_rate": 4.994692451430791e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8479559659957886, + "num_tokens": 6902472.0, + "step": 850 + }, + { + "entropy": 0.6240964293479919, + "epoch": 0.48250564334085777, + "grad_norm": 1.416658878326416, + "learning_rate": 4.9946297819972025e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8452272534370422, + "num_tokens": 6942877.0, + "step": 855 + }, + { + "entropy": 0.6672571420669555, + "epoch": 0.4853273137697517, + "grad_norm": 1.3133835792541504, + "learning_rate": 4.994566745274601e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8426130771636963, + "num_tokens": 6983426.0, + "step": 860 + }, + { + "entropy": 0.6581945300102234, + "epoch": 0.4881489841986456, + "grad_norm": 1.4337458610534668, + "learning_rate": 4.994503341275369e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8437890887260437, + "num_tokens": 7023732.0, + "step": 865 + }, + { + "entropy": 0.6551531553268433, + "epoch": 0.4909706546275395, + "grad_norm": 1.3199399709701538, + "learning_rate": 4.994439570011963e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8422786116600036, + "num_tokens": 7064450.0, + "step": 870 + }, + { + "entropy": 0.6477535367012024, + "epoch": 0.4937923250564334, + "grad_norm": 1.2373626232147217, + "learning_rate": 4.99437543149691e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.8424580574035645, + "num_tokens": 7105220.0, + "step": 875 + }, + { + "entropy": 0.6733549952507019, + "epoch": 0.4966139954853273, + "grad_norm": 1.3904764652252197, + "learning_rate": 4.994310925742811e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8425598025321961, + "num_tokens": 7145455.0, + "step": 880 + }, + { + "entropy": 0.6530658364295959, + "epoch": 0.4994356659142212, + "grad_norm": 1.387152075767517, + "learning_rate": 4.9942460527623374e-06, + "loss": 0.5397, + "mean_token_accuracy": 0.8402209639549255, + "num_tokens": 7185927.0, + "step": 885 + }, + { + "entropy": 0.6589832663536072, + "epoch": 0.5022573363431151, + "grad_norm": 1.2508260011672974, + "learning_rate": 4.9941808125682336e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8469531655311584, + "num_tokens": 7226531.0, + "step": 890 + }, + { + "entropy": 0.6467825770378113, + "epoch": 0.505079006772009, + "grad_norm": 1.2095719575881958, + "learning_rate": 4.994115205173317e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.8432755708694458, + "num_tokens": 7267311.0, + "step": 895 + }, + { + "entropy": 0.6670600891113281, + "epoch": 0.5079006772009029, + "grad_norm": 1.3329170942306519, + "learning_rate": 4.994049230590474e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8473351120948791, + "num_tokens": 7308118.0, + "step": 900 + }, + { + "entropy": 0.654244887828827, + "epoch": 0.5107223476297968, + "grad_norm": 1.3469221591949463, + "learning_rate": 4.993982888832667e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8424062848091125, + "num_tokens": 7348961.0, + "step": 905 + }, + { + "entropy": 0.6805335998535156, + "epoch": 0.5135440180586908, + "grad_norm": 1.4335166215896606, + "learning_rate": 4.993916179912929e-06, + "loss": 0.5611, + "mean_token_accuracy": 0.8342411160469055, + "num_tokens": 7389295.0, + "step": 910 + }, + { + "entropy": 0.6766547679901123, + "epoch": 0.5163656884875847, + "grad_norm": 1.2276146411895752, + "learning_rate": 4.993849103844365e-06, + "loss": 0.5502, + "mean_token_accuracy": 0.8335286498069763, + "num_tokens": 7429891.0, + "step": 915 + }, + { + "entropy": 0.6441382646560669, + "epoch": 0.5191873589164786, + "grad_norm": 1.277137041091919, + "learning_rate": 4.9937816606401506e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.8357362985610962, + "num_tokens": 7470461.0, + "step": 920 + }, + { + "entropy": 0.6156215190887451, + "epoch": 0.5220090293453724, + "grad_norm": 1.3269896507263184, + "learning_rate": 4.993713850313537e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8524217247962952, + "num_tokens": 7511054.0, + "step": 925 + }, + { + "entropy": 0.702682101726532, + "epoch": 0.5248306997742663, + "grad_norm": 1.4615741968154907, + "learning_rate": 4.993645672877843e-06, + "loss": 0.575, + "mean_token_accuracy": 0.8279630184173584, + "num_tokens": 7551567.0, + "step": 930 + }, + { + "entropy": 0.6455079674720764, + "epoch": 0.5276523702031602, + "grad_norm": 1.3673226833343506, + "learning_rate": 4.993577128346465e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8442542672157287, + "num_tokens": 7591635.0, + "step": 935 + }, + { + "entropy": 0.6908176302909851, + "epoch": 0.5304740406320542, + "grad_norm": 1.4456593990325928, + "learning_rate": 4.993508216732867e-06, + "loss": 0.558, + "mean_token_accuracy": 0.832518482208252, + "num_tokens": 7631859.0, + "step": 940 + }, + { + "entropy": 0.7004367709159851, + "epoch": 0.5332957110609481, + "grad_norm": 1.3898345232009888, + "learning_rate": 4.993438938050587e-06, + "loss": 0.5657, + "mean_token_accuracy": 0.8318912148475647, + "num_tokens": 7672520.0, + "step": 945 + }, + { + "entropy": 0.6737080454826355, + "epoch": 0.536117381489842, + "grad_norm": 1.398849606513977, + "learning_rate": 4.993369292313235e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.8320559740066529, + "num_tokens": 7713151.0, + "step": 950 + }, + { + "entropy": 0.6230229139328003, + "epoch": 0.5389390519187359, + "grad_norm": 1.264159917831421, + "learning_rate": 4.993299279534492e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8493775367736817, + "num_tokens": 7754018.0, + "step": 955 + }, + { + "entropy": 0.6292937159538269, + "epoch": 0.5417607223476298, + "grad_norm": 1.344561219215393, + "learning_rate": 4.993228899728113e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8411380529403687, + "num_tokens": 7794654.0, + "step": 960 + }, + { + "entropy": 0.6116863548755646, + "epoch": 0.5445823927765236, + "grad_norm": 1.1279850006103516, + "learning_rate": 4.993158152907923e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8529308915138245, + "num_tokens": 7835397.0, + "step": 965 + }, + { + "entropy": 0.6285845994949341, + "epoch": 0.5474040632054176, + "grad_norm": 1.1283153295516968, + "learning_rate": 4.993087039087823e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8528955459594727, + "num_tokens": 7876148.0, + "step": 970 + }, + { + "entropy": 0.6560633659362793, + "epoch": 0.5502257336343115, + "grad_norm": 1.213463306427002, + "learning_rate": 4.993015558281779e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.8412879347801209, + "num_tokens": 7916823.0, + "step": 975 + }, + { + "entropy": 0.6548992276191712, + "epoch": 0.5530474040632054, + "grad_norm": 1.2896618843078613, + "learning_rate": 4.992943710503838e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8419336557388306, + "num_tokens": 7957342.0, + "step": 980 + }, + { + "entropy": 0.6178439378738403, + "epoch": 0.5558690744920993, + "grad_norm": 1.1932618618011475, + "learning_rate": 4.99287149576811e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.850293779373169, + "num_tokens": 7998078.0, + "step": 985 + }, + { + "entropy": 0.7176673412322998, + "epoch": 0.5586907449209932, + "grad_norm": 1.495336651802063, + "learning_rate": 4.992798914088786e-06, + "loss": 0.5872, + "mean_token_accuracy": 0.8254365086555481, + "num_tokens": 8038148.0, + "step": 990 + }, + { + "entropy": 0.6931470155715942, + "epoch": 0.5615124153498872, + "grad_norm": 1.343912124633789, + "learning_rate": 4.992725965480121e-06, + "loss": 0.5707, + "mean_token_accuracy": 0.8297663688659668, + "num_tokens": 8078718.0, + "step": 995 + }, + { + "entropy": 0.7093847036361695, + "epoch": 0.5643340857787811, + "grad_norm": 1.2880916595458984, + "learning_rate": 4.992652649956448e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8289087891578675, + "num_tokens": 8119465.0, + "step": 1000 + }, + { + "epoch": 0.5643340857787811, + "eval_entropy": 0.6200407147407532, + "eval_loss": 0.49860483407974243, + "eval_mean_token_accuracy": 0.8631659746170044, + "eval_num_tokens": 8119465.0, + "eval_runtime": 0.1639, + "eval_samples_per_second": 24.405, + "eval_steps_per_second": 6.101, + "step": 1000 + }, + { + "entropy": 0.6551719307899475, + "epoch": 0.5671557562076749, + "grad_norm": 1.444422721862793, + "learning_rate": 4.992578967532169e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8438852429389954, + "num_tokens": 8160136.0, + "step": 1005 + }, + { + "entropy": 0.6948068380355835, + "epoch": 0.5699774266365688, + "grad_norm": 1.4045120477676392, + "learning_rate": 4.992504918221759e-06, + "loss": 0.578, + "mean_token_accuracy": 0.8269060254096985, + "num_tokens": 8200865.0, + "step": 1010 + }, + { + "entropy": 0.6053695321083069, + "epoch": 0.5727990970654627, + "grad_norm": 1.2733221054077148, + "learning_rate": 4.9924305020397645e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8552789926528931, + "num_tokens": 8241635.0, + "step": 1015 + }, + { + "entropy": 0.6861397624015808, + "epoch": 0.5756207674943566, + "grad_norm": 1.3532050848007202, + "learning_rate": 4.992355719000805e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.8298478722572327, + "num_tokens": 8282118.0, + "step": 1020 + }, + { + "entropy": 0.6331191897392273, + "epoch": 0.5784424379232506, + "grad_norm": 1.2071319818496704, + "learning_rate": 4.992280569119574e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.850217878818512, + "num_tokens": 8322734.0, + "step": 1025 + }, + { + "entropy": 0.6948285818099975, + "epoch": 0.5812641083521445, + "grad_norm": 1.375131368637085, + "learning_rate": 4.99220505241083e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.8394317030906677, + "num_tokens": 8363311.0, + "step": 1030 + }, + { + "entropy": 0.6704018592834473, + "epoch": 0.5840857787810384, + "grad_norm": 1.373281478881836, + "learning_rate": 4.992129168889412e-06, + "loss": 0.5439, + "mean_token_accuracy": 0.8385064721107482, + "num_tokens": 8403747.0, + "step": 1035 + }, + { + "entropy": 0.6323675155639649, + "epoch": 0.5869074492099323, + "grad_norm": 1.2113394737243652, + "learning_rate": 4.992052918570226e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8503919124603272, + "num_tokens": 8444647.0, + "step": 1040 + }, + { + "entropy": 0.6883712530136108, + "epoch": 0.5897291196388262, + "grad_norm": 1.329156517982483, + "learning_rate": 4.991976301468251e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.8347142577171326, + "num_tokens": 8485437.0, + "step": 1045 + }, + { + "entropy": 0.6799022316932678, + "epoch": 0.59255079006772, + "grad_norm": 1.2880463600158691, + "learning_rate": 4.9918993175985384e-06, + "loss": 0.5525, + "mean_token_accuracy": 0.8357504963874817, + "num_tokens": 8526376.0, + "step": 1050 + }, + { + "entropy": 0.6652753591537476, + "epoch": 0.595372460496614, + "grad_norm": 1.3654208183288574, + "learning_rate": 4.991821966976213e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.8353073954582214, + "num_tokens": 8567085.0, + "step": 1055 + }, + { + "entropy": 0.6146788358688354, + "epoch": 0.5981941309255079, + "grad_norm": 1.140661597251892, + "learning_rate": 4.991744249616469e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8558377385139465, + "num_tokens": 8607488.0, + "step": 1060 + }, + { + "entropy": 0.710556972026825, + "epoch": 0.6010158013544018, + "grad_norm": 1.3625504970550537, + "learning_rate": 4.991666165534575e-06, + "loss": 0.5542, + "mean_token_accuracy": 0.8333917140960694, + "num_tokens": 8648210.0, + "step": 1065 + }, + { + "entropy": 0.6810308933258057, + "epoch": 0.6038374717832957, + "grad_norm": 1.393760323524475, + "learning_rate": 4.99158771474587e-06, + "loss": 0.5602, + "mean_token_accuracy": 0.8344960331916809, + "num_tokens": 8688725.0, + "step": 1070 + }, + { + "entropy": 0.6881044745445252, + "epoch": 0.6066591422121896, + "grad_norm": 1.4435288906097412, + "learning_rate": 4.991508897265766e-06, + "loss": 0.5611, + "mean_token_accuracy": 0.8323047876358032, + "num_tokens": 8729344.0, + "step": 1075 + }, + { + "entropy": 0.6240251541137696, + "epoch": 0.6094808126410836, + "grad_norm": 1.3459124565124512, + "learning_rate": 4.991429713109746e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8495769262313843, + "num_tokens": 8770034.0, + "step": 1080 + }, + { + "entropy": 0.642047894001007, + "epoch": 0.6123024830699775, + "grad_norm": 1.186833143234253, + "learning_rate": 4.991350162293367e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8443755626678466, + "num_tokens": 8810541.0, + "step": 1085 + }, + { + "entropy": 0.6657473087310791, + "epoch": 0.6151241534988713, + "grad_norm": 1.4160339832305908, + "learning_rate": 4.991270244832256e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.836929440498352, + "num_tokens": 8851054.0, + "step": 1090 + }, + { + "entropy": 0.664323914051056, + "epoch": 0.6179458239277652, + "grad_norm": 1.3085633516311646, + "learning_rate": 4.9911899607421116e-06, + "loss": 0.5336, + "mean_token_accuracy": 0.8405984282493592, + "num_tokens": 8891919.0, + "step": 1095 + }, + { + "entropy": 0.6161608934402466, + "epoch": 0.6207674943566591, + "grad_norm": 1.2209677696228027, + "learning_rate": 4.991109310038707e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8494919776916504, + "num_tokens": 8932520.0, + "step": 1100 + }, + { + "entropy": 0.633518648147583, + "epoch": 0.623589164785553, + "grad_norm": 1.422237515449524, + "learning_rate": 4.991028292737887e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8427853345870971, + "num_tokens": 8973154.0, + "step": 1105 + }, + { + "entropy": 0.7281236052513123, + "epoch": 0.626410835214447, + "grad_norm": 1.3925809860229492, + "learning_rate": 4.990946908855565e-06, + "loss": 0.6025, + "mean_token_accuracy": 0.8230817198753357, + "num_tokens": 9013989.0, + "step": 1110 + }, + { + "entropy": 0.6049828886985779, + "epoch": 0.6292325056433409, + "grad_norm": 1.2714378833770752, + "learning_rate": 4.990865158407731e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8520822525024414, + "num_tokens": 9054405.0, + "step": 1115 + }, + { + "entropy": 0.6474563360214234, + "epoch": 0.6320541760722348, + "grad_norm": 1.3699289560317993, + "learning_rate": 4.990783041410444e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8497814536094666, + "num_tokens": 9095024.0, + "step": 1120 + }, + { + "entropy": 0.665639317035675, + "epoch": 0.6348758465011287, + "grad_norm": 1.32370126247406, + "learning_rate": 4.9907005578798366e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8440038084983825, + "num_tokens": 9135786.0, + "step": 1125 + }, + { + "entropy": 0.6320675134658813, + "epoch": 0.6376975169300225, + "grad_norm": 1.2937510013580322, + "learning_rate": 4.990617707832111e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8501017332077027, + "num_tokens": 9176559.0, + "step": 1130 + }, + { + "entropy": 0.7049695611000061, + "epoch": 0.6405191873589164, + "grad_norm": 1.3041940927505493, + "learning_rate": 4.990534491283545e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.8350239872932435, + "num_tokens": 9217381.0, + "step": 1135 + }, + { + "entropy": 0.6608424663543702, + "epoch": 0.6433408577878104, + "grad_norm": 1.3915507793426514, + "learning_rate": 4.990450908250485e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8423561692237854, + "num_tokens": 9258041.0, + "step": 1140 + }, + { + "entropy": 0.6109439551830291, + "epoch": 0.6461625282167043, + "grad_norm": 1.1623317003250122, + "learning_rate": 4.990366958749352e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8506023287773132, + "num_tokens": 9298558.0, + "step": 1145 + }, + { + "entropy": 0.6726685404777527, + "epoch": 0.6489841986455982, + "grad_norm": 1.4480568170547485, + "learning_rate": 4.990282642796638e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8336881279945374, + "num_tokens": 9339287.0, + "step": 1150 + }, + { + "entropy": 0.6401282668113708, + "epoch": 0.6518058690744921, + "grad_norm": 1.405791997909546, + "learning_rate": 4.9901979604089055e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8454666852951049, + "num_tokens": 9379965.0, + "step": 1155 + }, + { + "entropy": 0.616789698600769, + "epoch": 0.654627539503386, + "grad_norm": 1.2432202100753784, + "learning_rate": 4.990112911602792e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8562314510345459, + "num_tokens": 9420544.0, + "step": 1160 + }, + { + "entropy": 0.6499378085136414, + "epoch": 0.65744920993228, + "grad_norm": 1.3318426609039307, + "learning_rate": 4.990027496395003e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8456689953804016, + "num_tokens": 9461170.0, + "step": 1165 + }, + { + "entropy": 0.6929373741149902, + "epoch": 0.6602708803611738, + "grad_norm": 1.501207709312439, + "learning_rate": 4.989941714802321e-06, + "loss": 0.5457, + "mean_token_accuracy": 0.8362460851669311, + "num_tokens": 9501863.0, + "step": 1170 + }, + { + "entropy": 0.6151013791561126, + "epoch": 0.6630925507900677, + "grad_norm": 1.291373372077942, + "learning_rate": 4.989855566841597e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8491938352584839, + "num_tokens": 9542539.0, + "step": 1175 + }, + { + "entropy": 0.6619673609733582, + "epoch": 0.6659142212189616, + "grad_norm": 1.8319789171218872, + "learning_rate": 4.989769052529754e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8366817712783814, + "num_tokens": 9583209.0, + "step": 1180 + }, + { + "entropy": 0.7023379445075989, + "epoch": 0.6687358916478555, + "grad_norm": 1.4245003461837769, + "learning_rate": 4.989682171883789e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.8261257648468018, + "num_tokens": 9623904.0, + "step": 1185 + }, + { + "entropy": 0.6219159960746765, + "epoch": 0.6715575620767494, + "grad_norm": 1.5310255289077759, + "learning_rate": 4.9895949249207674e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8541441679000854, + "num_tokens": 9664582.0, + "step": 1190 + }, + { + "entropy": 0.6346824288368225, + "epoch": 0.6743792325056434, + "grad_norm": 1.3092063665390015, + "learning_rate": 4.989507311657832e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8455924034118653, + "num_tokens": 9705103.0, + "step": 1195 + }, + { + "entropy": 0.6392194151878356, + "epoch": 0.6772009029345373, + "grad_norm": 1.2018376588821411, + "learning_rate": 4.9894193321121915e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.847294807434082, + "num_tokens": 9745685.0, + "step": 1200 + }, + { + "entropy": 0.7368967413902283, + "epoch": 0.6800225733634312, + "grad_norm": 1.4050233364105225, + "learning_rate": 4.989330986301131e-06, + "loss": 0.6216, + "mean_token_accuracy": 0.8151282906532288, + "num_tokens": 9786144.0, + "step": 1205 + }, + { + "entropy": 0.6174101948738098, + "epoch": 0.6828442437923251, + "grad_norm": 1.3221447467803955, + "learning_rate": 4.989242274242007e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8495320796966552, + "num_tokens": 9826811.0, + "step": 1210 + }, + { + "entropy": 0.6422483444213867, + "epoch": 0.6856659142212189, + "grad_norm": 1.1776975393295288, + "learning_rate": 4.989153195952246e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8441770195960998, + "num_tokens": 9867430.0, + "step": 1215 + }, + { + "entropy": 0.6633909225463868, + "epoch": 0.6884875846501128, + "grad_norm": 1.257130742073059, + "learning_rate": 4.989063751449346e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8380695223808289, + "num_tokens": 9907541.0, + "step": 1220 + }, + { + "entropy": 0.6803221344947815, + "epoch": 0.6913092550790068, + "grad_norm": 1.227628231048584, + "learning_rate": 4.98897394075088e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8371790289878845, + "num_tokens": 9948303.0, + "step": 1225 + }, + { + "entropy": 0.6370683073997497, + "epoch": 0.6941309255079007, + "grad_norm": 1.1942147016525269, + "learning_rate": 4.9888837638744915e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8441259145736695, + "num_tokens": 9989051.0, + "step": 1230 + }, + { + "entropy": 0.6529434204101563, + "epoch": 0.6969525959367946, + "grad_norm": 1.3808612823486328, + "learning_rate": 4.988793220837895e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8458134651184082, + "num_tokens": 10029798.0, + "step": 1235 + }, + { + "entropy": 0.6289508223533631, + "epoch": 0.6997742663656885, + "grad_norm": 1.1567049026489258, + "learning_rate": 4.988702311658879e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8443259239196778, + "num_tokens": 10070451.0, + "step": 1240 + }, + { + "entropy": 0.6437416434288025, + "epoch": 0.7025959367945824, + "grad_norm": 1.3937760591506958, + "learning_rate": 4.9886110363553005e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.846991765499115, + "num_tokens": 10111198.0, + "step": 1245 + }, + { + "entropy": 0.710420835018158, + "epoch": 0.7054176072234764, + "grad_norm": 1.3196383714675903, + "learning_rate": 4.988519394945092e-06, + "loss": 0.5769, + "mean_token_accuracy": 0.8278885602951049, + "num_tokens": 10151900.0, + "step": 1250 + }, + { + "entropy": 0.6752119541168213, + "epoch": 0.7082392776523702, + "grad_norm": 1.4347295761108398, + "learning_rate": 4.988427387446255e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.8365179777145386, + "num_tokens": 10192756.0, + "step": 1255 + }, + { + "entropy": 0.6734273076057434, + "epoch": 0.7110609480812641, + "grad_norm": 1.4099767208099365, + "learning_rate": 4.988335013876867e-06, + "loss": 0.5686, + "mean_token_accuracy": 0.8306289792060852, + "num_tokens": 10233478.0, + "step": 1260 + }, + { + "entropy": 0.7350514650344848, + "epoch": 0.713882618510158, + "grad_norm": 1.3657110929489136, + "learning_rate": 4.988242274255073e-06, + "loss": 0.6019, + "mean_token_accuracy": 0.8200275421142578, + "num_tokens": 10274135.0, + "step": 1265 + }, + { + "entropy": 0.6617794394493103, + "epoch": 0.7167042889390519, + "grad_norm": 1.4455223083496094, + "learning_rate": 4.988149168599092e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8396545290946961, + "num_tokens": 10314720.0, + "step": 1270 + }, + { + "entropy": 0.6266619205474854, + "epoch": 0.7195259593679458, + "grad_norm": 1.2425928115844727, + "learning_rate": 4.988055696927214e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8460469722747803, + "num_tokens": 10355383.0, + "step": 1275 + }, + { + "entropy": 0.6324691534042358, + "epoch": 0.7223476297968398, + "grad_norm": 1.2948311567306519, + "learning_rate": 4.987961859257803e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8449645042419434, + "num_tokens": 10395935.0, + "step": 1280 + }, + { + "entropy": 0.6536452293395996, + "epoch": 0.7251693002257337, + "grad_norm": 1.3648988008499146, + "learning_rate": 4.987867655609292e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8420405626296997, + "num_tokens": 10436654.0, + "step": 1285 + }, + { + "entropy": 0.6121232390403748, + "epoch": 0.7279909706546276, + "grad_norm": 1.2139304876327515, + "learning_rate": 4.987773086000188e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8504088521003723, + "num_tokens": 10477501.0, + "step": 1290 + }, + { + "entropy": 0.6448933601379394, + "epoch": 0.7308126410835214, + "grad_norm": 1.375567078590393, + "learning_rate": 4.987678150449069e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8450644254684448, + "num_tokens": 10518384.0, + "step": 1295 + }, + { + "entropy": 0.6476213932037354, + "epoch": 0.7336343115124153, + "grad_norm": 1.3577604293823242, + "learning_rate": 4.987582848974586e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8453767418861389, + "num_tokens": 10559194.0, + "step": 1300 + }, + { + "entropy": 0.6976300120353699, + "epoch": 0.7364559819413092, + "grad_norm": 1.3497973680496216, + "learning_rate": 4.987487181595459e-06, + "loss": 0.5519, + "mean_token_accuracy": 0.8350682735443116, + "num_tokens": 10599795.0, + "step": 1305 + }, + { + "entropy": 0.6458664059638977, + "epoch": 0.7392776523702032, + "grad_norm": 1.329574465751648, + "learning_rate": 4.987391148330485e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8440695524215698, + "num_tokens": 10640689.0, + "step": 1310 + }, + { + "entropy": 0.6906252384185791, + "epoch": 0.7420993227990971, + "grad_norm": 1.2950578927993774, + "learning_rate": 4.987294749198526e-06, + "loss": 0.5638, + "mean_token_accuracy": 0.8313373565673828, + "num_tokens": 10681276.0, + "step": 1315 + }, + { + "entropy": 0.6506387591362, + "epoch": 0.744920993227991, + "grad_norm": 1.5627875328063965, + "learning_rate": 4.987197984218522e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8504640936851502, + "num_tokens": 10722061.0, + "step": 1320 + }, + { + "entropy": 0.6033547759056092, + "epoch": 0.7477426636568849, + "grad_norm": 1.4060436487197876, + "learning_rate": 4.9871008534094825e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8535768985748291, + "num_tokens": 10762663.0, + "step": 1325 + }, + { + "entropy": 0.6256420493125916, + "epoch": 0.7505643340857788, + "grad_norm": 1.2653220891952515, + "learning_rate": 4.987003356790487e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.845154881477356, + "num_tokens": 10803082.0, + "step": 1330 + }, + { + "entropy": 0.627797293663025, + "epoch": 0.7533860045146726, + "grad_norm": 1.2059917449951172, + "learning_rate": 4.986905494380691e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8541393876075745, + "num_tokens": 10843614.0, + "step": 1335 + }, + { + "entropy": 0.6704052805900573, + "epoch": 0.7562076749435666, + "grad_norm": 1.4258085489273071, + "learning_rate": 4.986807266199318e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8418290138244628, + "num_tokens": 10884087.0, + "step": 1340 + }, + { + "entropy": 0.7184508085250855, + "epoch": 0.7590293453724605, + "grad_norm": 1.3789485692977905, + "learning_rate": 4.986708672265667e-06, + "loss": 0.5641, + "mean_token_accuracy": 0.8304919958114624, + "num_tokens": 10924781.0, + "step": 1345 + }, + { + "entropy": 0.6704004406929016, + "epoch": 0.7618510158013544, + "grad_norm": 1.290982961654663, + "learning_rate": 4.986609712599103e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.8440407991409302, + "num_tokens": 10965534.0, + "step": 1350 + }, + { + "entropy": 0.6607297778129577, + "epoch": 0.7646726862302483, + "grad_norm": 1.294542670249939, + "learning_rate": 4.986510387219071e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8411241173744202, + "num_tokens": 11006238.0, + "step": 1355 + }, + { + "entropy": 0.6355977773666381, + "epoch": 0.7674943566591422, + "grad_norm": 1.3880748748779297, + "learning_rate": 4.98641069614508e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8476488828659058, + "num_tokens": 11046967.0, + "step": 1360 + }, + { + "entropy": 0.6151192545890808, + "epoch": 0.7703160270880361, + "grad_norm": 1.2665696144104004, + "learning_rate": 4.9863106393967165e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.854477858543396, + "num_tokens": 11086831.0, + "step": 1365 + }, + { + "entropy": 0.6570278882980347, + "epoch": 0.7731376975169301, + "grad_norm": 1.4179819822311401, + "learning_rate": 4.986210216993636e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8439712643623352, + "num_tokens": 11127372.0, + "step": 1370 + }, + { + "entropy": 0.6928212285041809, + "epoch": 0.7759593679458239, + "grad_norm": 1.3699414730072021, + "learning_rate": 4.986109428955566e-06, + "loss": 0.5622, + "mean_token_accuracy": 0.8342428803443909, + "num_tokens": 11167988.0, + "step": 1375 + }, + { + "entropy": 0.6448866724967957, + "epoch": 0.7787810383747178, + "grad_norm": 1.2906067371368408, + "learning_rate": 4.986008275302307e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8406425952911377, + "num_tokens": 11208797.0, + "step": 1380 + }, + { + "entropy": 0.6187806010246277, + "epoch": 0.7816027088036117, + "grad_norm": 1.25465726852417, + "learning_rate": 4.98590675605373e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8553349137306213, + "num_tokens": 11249307.0, + "step": 1385 + }, + { + "entropy": 0.6682080864906311, + "epoch": 0.7844243792325056, + "grad_norm": 1.278944969177246, + "learning_rate": 4.98580487122978e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8448834180831909, + "num_tokens": 11289926.0, + "step": 1390 + }, + { + "entropy": 0.6455815672874451, + "epoch": 0.7872460496613995, + "grad_norm": 1.4049651622772217, + "learning_rate": 4.98570262085047e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8430341005325317, + "num_tokens": 11330704.0, + "step": 1395 + }, + { + "entropy": 0.7064637899398803, + "epoch": 0.7900677200902935, + "grad_norm": 1.3357079029083252, + "learning_rate": 4.985600004935889e-06, + "loss": 0.565, + "mean_token_accuracy": 0.8299904108047486, + "num_tokens": 11371389.0, + "step": 1400 + }, + { + "entropy": 0.6495133399963379, + "epoch": 0.7928893905191874, + "grad_norm": 1.6325474977493286, + "learning_rate": 4.985497023506195e-06, + "loss": 0.5448, + "mean_token_accuracy": 0.8373395681381226, + "num_tokens": 11411887.0, + "step": 1405 + }, + { + "entropy": 0.6633385181427002, + "epoch": 0.7957110609480813, + "grad_norm": 1.4035420417785645, + "learning_rate": 4.985393676581619e-06, + "loss": 0.5252, + "mean_token_accuracy": 0.8406889915466309, + "num_tokens": 11452589.0, + "step": 1410 + }, + { + "entropy": 0.6448916912078857, + "epoch": 0.7985327313769752, + "grad_norm": 1.288009524345398, + "learning_rate": 4.985289964182463e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8462629556655884, + "num_tokens": 11493085.0, + "step": 1415 + }, + { + "entropy": 0.5965805172920227, + "epoch": 0.801354401805869, + "grad_norm": 1.2756577730178833, + "learning_rate": 4.985185886329101e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8524753332138062, + "num_tokens": 11533760.0, + "step": 1420 + }, + { + "entropy": 0.6665675163269043, + "epoch": 0.804176072234763, + "grad_norm": 1.3834420442581177, + "learning_rate": 4.985081443041981e-06, + "loss": 0.5285, + "mean_token_accuracy": 0.8393696427345276, + "num_tokens": 11574224.0, + "step": 1425 + }, + { + "entropy": 0.6934167623519898, + "epoch": 0.8069977426636569, + "grad_norm": 1.4740710258483887, + "learning_rate": 4.98497663434162e-06, + "loss": 0.5623, + "mean_token_accuracy": 0.830849575996399, + "num_tokens": 11614729.0, + "step": 1430 + }, + { + "entropy": 0.6643628239631653, + "epoch": 0.8098194130925508, + "grad_norm": 1.342612862586975, + "learning_rate": 4.984871460248607e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.840022873878479, + "num_tokens": 11655327.0, + "step": 1435 + }, + { + "entropy": 0.6764454126358033, + "epoch": 0.8126410835214447, + "grad_norm": 1.5213992595672607, + "learning_rate": 4.984765920783604e-06, + "loss": 0.5688, + "mean_token_accuracy": 0.8351798415184021, + "num_tokens": 11695967.0, + "step": 1440 + }, + { + "entropy": 0.6788670063018799, + "epoch": 0.8154627539503386, + "grad_norm": 1.2199022769927979, + "learning_rate": 4.984660015967343e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8351686835289002, + "num_tokens": 11736108.0, + "step": 1445 + }, + { + "entropy": 0.6327372908592224, + "epoch": 0.8182844243792325, + "grad_norm": 1.4303865432739258, + "learning_rate": 4.984553745820631e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8465925574302673, + "num_tokens": 11776854.0, + "step": 1450 + }, + { + "entropy": 0.6577797293663025, + "epoch": 0.8211060948081265, + "grad_norm": 1.3754053115844727, + "learning_rate": 4.984447110364343e-06, + "loss": 0.526, + "mean_token_accuracy": 0.8419236660003662, + "num_tokens": 11817612.0, + "step": 1455 + }, + { + "entropy": 0.6646520733833313, + "epoch": 0.8239277652370203, + "grad_norm": 1.260542631149292, + "learning_rate": 4.98434010961943e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8385473966598511, + "num_tokens": 11858437.0, + "step": 1460 + }, + { + "entropy": 0.6954992532730102, + "epoch": 0.8267494356659142, + "grad_norm": 1.408316731452942, + "learning_rate": 4.9842327436069105e-06, + "loss": 0.5601, + "mean_token_accuracy": 0.8325203895568848, + "num_tokens": 11898873.0, + "step": 1465 + }, + { + "entropy": 0.6532683134078979, + "epoch": 0.8295711060948081, + "grad_norm": 1.1917073726654053, + "learning_rate": 4.984125012347876e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.8390217423439026, + "num_tokens": 11939340.0, + "step": 1470 + }, + { + "entropy": 0.6418487191200256, + "epoch": 0.832392776523702, + "grad_norm": 1.243117332458496, + "learning_rate": 4.984016915863491e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8492981314659118, + "num_tokens": 11980119.0, + "step": 1475 + }, + { + "entropy": 0.6358325242996216, + "epoch": 0.835214446952596, + "grad_norm": 1.252548336982727, + "learning_rate": 4.983908454174993e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8524085760116578, + "num_tokens": 12020810.0, + "step": 1480 + }, + { + "entropy": 0.6454902291297913, + "epoch": 0.8380361173814899, + "grad_norm": 1.2286616563796997, + "learning_rate": 4.983799627303685e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8518875241279602, + "num_tokens": 12061121.0, + "step": 1485 + }, + { + "entropy": 0.6209899544715881, + "epoch": 0.8408577878103838, + "grad_norm": 1.2979341745376587, + "learning_rate": 4.98369043527095e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8472349762916564, + "num_tokens": 12101696.0, + "step": 1490 + }, + { + "entropy": 0.6519573330879211, + "epoch": 0.8436794582392777, + "grad_norm": 1.2703050374984741, + "learning_rate": 4.9835808780982375e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8439043760299683, + "num_tokens": 12142285.0, + "step": 1495 + }, + { + "entropy": 0.6138962626457214, + "epoch": 0.8465011286681715, + "grad_norm": 1.2608084678649902, + "learning_rate": 4.9834709558070695e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8513062238693238, + "num_tokens": 12183096.0, + "step": 1500 + }, + { + "epoch": 0.8465011286681715, + "eval_entropy": 0.6190634965896606, + "eval_loss": 0.450931191444397, + "eval_mean_token_accuracy": 0.8727481961250305, + "eval_num_tokens": 12183096.0, + "eval_runtime": 0.1637, + "eval_samples_per_second": 24.428, + "eval_steps_per_second": 6.107, + "step": 1500 + }, + { + "entropy": 0.6505842447280884, + "epoch": 0.8493227990970654, + "grad_norm": 1.3207699060440063, + "learning_rate": 4.983360668419041e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8519291162490845, + "num_tokens": 12223690.0, + "step": 1505 + }, + { + "entropy": 0.664458155632019, + "epoch": 0.8521444695259593, + "grad_norm": 1.2953647375106812, + "learning_rate": 4.983250015955818e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.8392679691314697, + "num_tokens": 12264356.0, + "step": 1510 + }, + { + "entropy": 0.6668670177459717, + "epoch": 0.8549661399548533, + "grad_norm": 1.276229739189148, + "learning_rate": 4.983138998439137e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8406529307365418, + "num_tokens": 12305094.0, + "step": 1515 + }, + { + "entropy": 0.7231775879859924, + "epoch": 0.8577878103837472, + "grad_norm": 1.3792636394500732, + "learning_rate": 4.983027615890809e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.827623724937439, + "num_tokens": 12345899.0, + "step": 1520 + }, + { + "entropy": 0.6197749614715576, + "epoch": 0.8606094808126411, + "grad_norm": 1.3481459617614746, + "learning_rate": 4.982915868332713e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8470370888710022, + "num_tokens": 12386462.0, + "step": 1525 + }, + { + "entropy": 0.7093599200248718, + "epoch": 0.863431151241535, + "grad_norm": 1.4356274604797363, + "learning_rate": 4.982803755786804e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.825712525844574, + "num_tokens": 12427019.0, + "step": 1530 + }, + { + "entropy": 0.7157705545425415, + "epoch": 0.8662528216704289, + "grad_norm": 1.3810184001922607, + "learning_rate": 4.982691278275106e-06, + "loss": 0.5696, + "mean_token_accuracy": 0.8296974778175354, + "num_tokens": 12467761.0, + "step": 1535 + }, + { + "entropy": 0.6354712247848511, + "epoch": 0.8690744920993227, + "grad_norm": 1.4308044910430908, + "learning_rate": 4.982578435819714e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8471119284629822, + "num_tokens": 12508405.0, + "step": 1540 + }, + { + "entropy": 0.7147751808166504, + "epoch": 0.8718961625282167, + "grad_norm": 1.399246096611023, + "learning_rate": 4.982465228442797e-06, + "loss": 0.5764, + "mean_token_accuracy": 0.8266459822654724, + "num_tokens": 12549081.0, + "step": 1545 + }, + { + "entropy": 0.643587851524353, + "epoch": 0.8747178329571106, + "grad_norm": 1.2349791526794434, + "learning_rate": 4.982351656166595e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8451477646827698, + "num_tokens": 12589617.0, + "step": 1550 + }, + { + "entropy": 0.6347296714782715, + "epoch": 0.8775395033860045, + "grad_norm": 1.2148078680038452, + "learning_rate": 4.982237719013418e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8488546848297119, + "num_tokens": 12629403.0, + "step": 1555 + }, + { + "entropy": 0.6973551154136658, + "epoch": 0.8803611738148984, + "grad_norm": 1.2895756959915161, + "learning_rate": 4.982123417005651e-06, + "loss": 0.561, + "mean_token_accuracy": 0.8330285906791687, + "num_tokens": 12670150.0, + "step": 1560 + }, + { + "entropy": 0.7544346213340759, + "epoch": 0.8831828442437923, + "grad_norm": 1.3327959775924683, + "learning_rate": 4.982008750165746e-06, + "loss": 0.615, + "mean_token_accuracy": 0.818334436416626, + "num_tokens": 12710858.0, + "step": 1565 + }, + { + "entropy": 0.7033024072647095, + "epoch": 0.8860045146726863, + "grad_norm": 1.3447669744491577, + "learning_rate": 4.981893718516231e-06, + "loss": 0.5641, + "mean_token_accuracy": 0.8303789258003235, + "num_tokens": 12751450.0, + "step": 1570 + }, + { + "entropy": 0.6676825523376465, + "epoch": 0.8888261851015802, + "grad_norm": 1.3791662454605103, + "learning_rate": 4.981778322079704e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8445156216621399, + "num_tokens": 12792048.0, + "step": 1575 + }, + { + "entropy": 0.6464922666549683, + "epoch": 0.891647855530474, + "grad_norm": 1.2325254678726196, + "learning_rate": 4.981662560878835e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8373061299324036, + "num_tokens": 12832714.0, + "step": 1580 + }, + { + "entropy": 0.6744041681289673, + "epoch": 0.8944695259593679, + "grad_norm": 1.4298688173294067, + "learning_rate": 4.981546434936363e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.8334176301956177, + "num_tokens": 12873204.0, + "step": 1585 + }, + { + "entropy": 0.5900899410247803, + "epoch": 0.8972911963882618, + "grad_norm": 1.396270513534546, + "learning_rate": 4.981429944275103e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8552528858184815, + "num_tokens": 12912731.0, + "step": 1590 + }, + { + "entropy": 0.6489648938179016, + "epoch": 0.9001128668171557, + "grad_norm": 1.2255243062973022, + "learning_rate": 4.981313088917939e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8412474513053894, + "num_tokens": 12953377.0, + "step": 1595 + }, + { + "entropy": 0.7049468755722046, + "epoch": 0.9029345372460497, + "grad_norm": 1.6258138418197632, + "learning_rate": 4.9811958688878274e-06, + "loss": 0.5564, + "mean_token_accuracy": 0.8347102165222168, + "num_tokens": 12994028.0, + "step": 1600 + }, + { + "entropy": 0.6801372170448303, + "epoch": 0.9057562076749436, + "grad_norm": 1.2854009866714478, + "learning_rate": 4.981078284207797e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.8424756646156311, + "num_tokens": 13034671.0, + "step": 1605 + }, + { + "entropy": 0.6932690858840942, + "epoch": 0.9085778781038375, + "grad_norm": 1.4718157052993774, + "learning_rate": 4.980960334900945e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8399105072021484, + "num_tokens": 13075220.0, + "step": 1610 + }, + { + "entropy": 0.6554085493087769, + "epoch": 0.9113995485327314, + "grad_norm": 1.1461048126220703, + "learning_rate": 4.980842020990444e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8404485821723938, + "num_tokens": 13115849.0, + "step": 1615 + }, + { + "entropy": 0.6477279543876648, + "epoch": 0.9142212189616253, + "grad_norm": 1.2576565742492676, + "learning_rate": 4.980723342499538e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.8431344985961914, + "num_tokens": 13156403.0, + "step": 1620 + }, + { + "entropy": 0.6991305470466613, + "epoch": 0.9170428893905191, + "grad_norm": 1.3756641149520874, + "learning_rate": 4.9806042994515395e-06, + "loss": 0.5567, + "mean_token_accuracy": 0.8329641699790955, + "num_tokens": 13197015.0, + "step": 1625 + }, + { + "entropy": 0.6869478225708008, + "epoch": 0.9198645598194131, + "grad_norm": 1.265496015548706, + "learning_rate": 4.980484891869835e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8395046353340149, + "num_tokens": 13237500.0, + "step": 1630 + }, + { + "entropy": 0.6620127320289612, + "epoch": 0.922686230248307, + "grad_norm": 1.432976484298706, + "learning_rate": 4.980365119777882e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8438527345657348, + "num_tokens": 13277239.0, + "step": 1635 + }, + { + "entropy": 0.7288511991500854, + "epoch": 0.9255079006772009, + "grad_norm": 1.485297679901123, + "learning_rate": 4.980244983199211e-06, + "loss": 0.5528, + "mean_token_accuracy": 0.8336746454238891, + "num_tokens": 13317707.0, + "step": 1640 + }, + { + "entropy": 0.7077333211898804, + "epoch": 0.9283295711060948, + "grad_norm": 1.3443304300308228, + "learning_rate": 4.9801244821574216e-06, + "loss": 0.5556, + "mean_token_accuracy": 0.8335135579109192, + "num_tokens": 13358245.0, + "step": 1645 + }, + { + "entropy": 0.6646490931510926, + "epoch": 0.9311512415349887, + "grad_norm": 1.2345566749572754, + "learning_rate": 4.9800036166761866e-06, + "loss": 0.5416, + "mean_token_accuracy": 0.8396414875984192, + "num_tokens": 13398838.0, + "step": 1650 + }, + { + "entropy": 0.6034572184085846, + "epoch": 0.9339729119638827, + "grad_norm": 1.2768057584762573, + "learning_rate": 4.979882386779249e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8570885419845581, + "num_tokens": 13439472.0, + "step": 1655 + }, + { + "entropy": 0.6616186738014221, + "epoch": 0.9367945823927766, + "grad_norm": 1.2942156791687012, + "learning_rate": 4.979760792490426e-06, + "loss": 0.5432, + "mean_token_accuracy": 0.8383080959320068, + "num_tokens": 13480194.0, + "step": 1660 + }, + { + "entropy": 0.6370395302772522, + "epoch": 0.9396162528216704, + "grad_norm": 1.384143352508545, + "learning_rate": 4.979638833833604e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8490473747253418, + "num_tokens": 13520923.0, + "step": 1665 + }, + { + "entropy": 0.648089063167572, + "epoch": 0.9424379232505643, + "grad_norm": 1.377890706062317, + "learning_rate": 4.979516510832743e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8464336395263672, + "num_tokens": 13561752.0, + "step": 1670 + }, + { + "entropy": 0.7131257057189941, + "epoch": 0.9452595936794582, + "grad_norm": 1.4829003810882568, + "learning_rate": 4.979393823511871e-06, + "loss": 0.5744, + "mean_token_accuracy": 0.8287210106849671, + "num_tokens": 13602311.0, + "step": 1675 + }, + { + "entropy": 0.7149917125701905, + "epoch": 0.9480812641083521, + "grad_norm": 1.355993390083313, + "learning_rate": 4.979270771895093e-06, + "loss": 0.5873, + "mean_token_accuracy": 0.827799940109253, + "num_tokens": 13642473.0, + "step": 1680 + }, + { + "entropy": 0.6665526628494263, + "epoch": 0.9509029345372461, + "grad_norm": 1.222419023513794, + "learning_rate": 4.979147356006579e-06, + "loss": 0.5285, + "mean_token_accuracy": 0.8411856889724731, + "num_tokens": 13683117.0, + "step": 1685 + }, + { + "entropy": 0.6773181915283203, + "epoch": 0.95372460496614, + "grad_norm": 1.3376954793930054, + "learning_rate": 4.979023575870577e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8385852217674256, + "num_tokens": 13723981.0, + "step": 1690 + }, + { + "entropy": 0.5895337462425232, + "epoch": 0.9565462753950339, + "grad_norm": 1.189743995666504, + "learning_rate": 4.978899431511401e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8576975584030151, + "num_tokens": 13764383.0, + "step": 1695 + }, + { + "entropy": 0.6576069235801697, + "epoch": 0.9593679458239278, + "grad_norm": 1.3030279874801636, + "learning_rate": 4.978774922953442e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.847059428691864, + "num_tokens": 13805055.0, + "step": 1700 + }, + { + "entropy": 0.6367745637893677, + "epoch": 0.9621896162528216, + "grad_norm": 1.2039754390716553, + "learning_rate": 4.978650050221159e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8448570847511292, + "num_tokens": 13845904.0, + "step": 1705 + }, + { + "entropy": 0.6365428566932678, + "epoch": 0.9650112866817155, + "grad_norm": 1.157940149307251, + "learning_rate": 4.978524813339082e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8424256920814515, + "num_tokens": 13886682.0, + "step": 1710 + }, + { + "entropy": 0.66874258518219, + "epoch": 0.9678329571106095, + "grad_norm": 1.4614887237548828, + "learning_rate": 4.978399212331814e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.8407912611961365, + "num_tokens": 13927181.0, + "step": 1715 + }, + { + "entropy": 0.643929636478424, + "epoch": 0.9706546275395034, + "grad_norm": 1.3897498846054077, + "learning_rate": 4.97827324722403e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8482011795043946, + "num_tokens": 13967885.0, + "step": 1720 + }, + { + "entropy": 0.6536713361740112, + "epoch": 0.9734762979683973, + "grad_norm": 1.3549083471298218, + "learning_rate": 4.978146918040476e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8463730335235595, + "num_tokens": 14008407.0, + "step": 1725 + }, + { + "entropy": 0.6258985161781311, + "epoch": 0.9762979683972912, + "grad_norm": 1.2369407415390015, + "learning_rate": 4.97802022480597e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8545310378074646, + "num_tokens": 14048919.0, + "step": 1730 + }, + { + "entropy": 0.6749670505523682, + "epoch": 0.9791196388261851, + "grad_norm": 1.5410014390945435, + "learning_rate": 4.977893167545398e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8386905789375305, + "num_tokens": 14089574.0, + "step": 1735 + }, + { + "entropy": 0.6478506565093994, + "epoch": 0.981941309255079, + "grad_norm": 1.3205047845840454, + "learning_rate": 4.977765746283724e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8434589385986329, + "num_tokens": 14130308.0, + "step": 1740 + }, + { + "entropy": 0.6606280326843261, + "epoch": 0.9847629796839729, + "grad_norm": 1.2447839975357056, + "learning_rate": 4.977637961045977e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8446960568428039, + "num_tokens": 14171031.0, + "step": 1745 + }, + { + "entropy": 0.6999424219131469, + "epoch": 0.9875846501128668, + "grad_norm": 1.2505875825881958, + "learning_rate": 4.977509811857263e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.828137469291687, + "num_tokens": 14211646.0, + "step": 1750 + }, + { + "entropy": 0.6732224225997925, + "epoch": 0.9904063205417607, + "grad_norm": 1.4964361190795898, + "learning_rate": 4.977381298742754e-06, + "loss": 0.5528, + "mean_token_accuracy": 0.8329331874847412, + "num_tokens": 14252319.0, + "step": 1755 + }, + { + "entropy": 0.6761491894721985, + "epoch": 0.9932279909706546, + "grad_norm": 1.426883339881897, + "learning_rate": 4.977252421727699e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8371304512023926, + "num_tokens": 14293055.0, + "step": 1760 + }, + { + "entropy": 0.6125996708869934, + "epoch": 0.9960496613995485, + "grad_norm": 1.3476899862289429, + "learning_rate": 4.977123180837416e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8501678228378295, + "num_tokens": 14333762.0, + "step": 1765 + }, + { + "entropy": 0.6156837463378906, + "epoch": 0.9988713318284425, + "grad_norm": 1.3276349306106567, + "learning_rate": 4.976993576097292e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8468546986579895, + "num_tokens": 14374541.0, + "step": 1770 + }, + { + "entropy": 0.5651689410209656, + "epoch": 1.0016930022573363, + "grad_norm": 0.9538065195083618, + "learning_rate": 4.97686360753279e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8691263794898987, + "num_tokens": 14409063.0, + "step": 1775 + }, + { + "entropy": 0.6223705530166626, + "epoch": 1.0045146726862302, + "grad_norm": 1.2451618909835815, + "learning_rate": 4.976733275169441e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8648397326469421, + "num_tokens": 14449840.0, + "step": 1780 + }, + { + "entropy": 0.5574550032615662, + "epoch": 1.007336343115124, + "grad_norm": 1.2733477354049683, + "learning_rate": 4.976602579032849e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8714008688926697, + "num_tokens": 14490449.0, + "step": 1785 + }, + { + "entropy": 0.5262600839138031, + "epoch": 1.010158013544018, + "grad_norm": 1.3276273012161255, + "learning_rate": 4.976471519148691e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8747005462646484, + "num_tokens": 14531107.0, + "step": 1790 + }, + { + "entropy": 0.49118422865867617, + "epoch": 1.012979683972912, + "grad_norm": 1.2698107957839966, + "learning_rate": 4.976340095542711e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8826399683952332, + "num_tokens": 14571891.0, + "step": 1795 + }, + { + "entropy": 0.48148898482322694, + "epoch": 1.0158013544018059, + "grad_norm": 1.3123940229415894, + "learning_rate": 4.97620830824073e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8846578359603882, + "num_tokens": 14612632.0, + "step": 1800 + }, + { + "entropy": 0.5638070046901703, + "epoch": 1.0186230248306998, + "grad_norm": 1.2908910512924194, + "learning_rate": 4.976076157268636e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8727042317390442, + "num_tokens": 14653317.0, + "step": 1805 + }, + { + "entropy": 0.5422739446163177, + "epoch": 1.0214446952595937, + "grad_norm": 1.5749247074127197, + "learning_rate": 4.975943642652389e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8708185434341431, + "num_tokens": 14693958.0, + "step": 1810 + }, + { + "entropy": 0.5249835729599, + "epoch": 1.0242663656884876, + "grad_norm": 1.3571456670761108, + "learning_rate": 4.975810764418023e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8773154497146607, + "num_tokens": 14734520.0, + "step": 1815 + }, + { + "entropy": 0.4799295425415039, + "epoch": 1.0270880361173815, + "grad_norm": 1.2057713270187378, + "learning_rate": 4.975677522591642e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.881498396396637, + "num_tokens": 14775267.0, + "step": 1820 + }, + { + "entropy": 0.5456785798072815, + "epoch": 1.0299097065462754, + "grad_norm": 1.4665296077728271, + "learning_rate": 4.975543917199422e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8673960208892822, + "num_tokens": 14816106.0, + "step": 1825 + }, + { + "entropy": 0.540210634469986, + "epoch": 1.0327313769751694, + "grad_norm": 1.2391974925994873, + "learning_rate": 4.975409948267608e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8694531202316285, + "num_tokens": 14856910.0, + "step": 1830 + }, + { + "entropy": 0.5396132886409759, + "epoch": 1.0355530474040633, + "grad_norm": 1.2638976573944092, + "learning_rate": 4.97527561582252e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8690710544586182, + "num_tokens": 14897584.0, + "step": 1835 + }, + { + "entropy": 0.5132744669914245, + "epoch": 1.0383747178329572, + "grad_norm": 1.2913990020751953, + "learning_rate": 4.975140919890546e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.873302161693573, + "num_tokens": 14938181.0, + "step": 1840 + }, + { + "entropy": 0.5024015605449677, + "epoch": 1.041196388261851, + "grad_norm": 1.1151163578033447, + "learning_rate": 4.975005860498148e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8748369932174682, + "num_tokens": 14978884.0, + "step": 1845 + }, + { + "entropy": 0.513664311170578, + "epoch": 1.0440180586907448, + "grad_norm": 1.298097014427185, + "learning_rate": 4.974870437671858e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8729772806167603, + "num_tokens": 15019646.0, + "step": 1850 + }, + { + "entropy": 0.5189130663871765, + "epoch": 1.0468397291196387, + "grad_norm": 1.3805726766586304, + "learning_rate": 4.97473465143828e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8714963912963867, + "num_tokens": 15060355.0, + "step": 1855 + }, + { + "entropy": 0.5300206661224365, + "epoch": 1.0496613995485327, + "grad_norm": 1.3854206800460815, + "learning_rate": 4.9745985018240895e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8686167359352112, + "num_tokens": 15101164.0, + "step": 1860 + }, + { + "entropy": 0.49498083591461184, + "epoch": 1.0524830699774266, + "grad_norm": 1.3298227787017822, + "learning_rate": 4.974461988856033e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8796196699142456, + "num_tokens": 15141937.0, + "step": 1865 + }, + { + "entropy": 0.5611091613769531, + "epoch": 1.0553047404063205, + "grad_norm": 1.3966474533081055, + "learning_rate": 4.974325112560928e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8588791251182556, + "num_tokens": 15182669.0, + "step": 1870 + }, + { + "entropy": 0.5468938708305359, + "epoch": 1.0581264108352144, + "grad_norm": 1.2755643129348755, + "learning_rate": 4.974187872965665e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8714737057685852, + "num_tokens": 15223336.0, + "step": 1875 + }, + { + "entropy": 0.5215595126152038, + "epoch": 1.0609480812641083, + "grad_norm": 1.296976923942566, + "learning_rate": 4.974050270097203e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8788340568542481, + "num_tokens": 15264013.0, + "step": 1880 + }, + { + "entropy": 0.5012482941150666, + "epoch": 1.0637697516930023, + "grad_norm": 1.3608322143554688, + "learning_rate": 4.973912303982575e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8823076605796814, + "num_tokens": 15304788.0, + "step": 1885 + }, + { + "entropy": 0.5157119750976562, + "epoch": 1.0665914221218962, + "grad_norm": 1.3526833057403564, + "learning_rate": 4.973773974648885e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8713153243064881, + "num_tokens": 15345095.0, + "step": 1890 + }, + { + "entropy": 0.5500873923301697, + "epoch": 1.06941309255079, + "grad_norm": 1.2834185361862183, + "learning_rate": 4.973635282123308e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8706364631652832, + "num_tokens": 15385312.0, + "step": 1895 + }, + { + "entropy": 0.5158918261528015, + "epoch": 1.072234762979684, + "grad_norm": 1.3259310722351074, + "learning_rate": 4.973496226433089e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8755181312561036, + "num_tokens": 15426061.0, + "step": 1900 + }, + { + "entropy": 0.538950902223587, + "epoch": 1.075056433408578, + "grad_norm": 1.3279820680618286, + "learning_rate": 4.973356807605546e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8712123036384583, + "num_tokens": 15466630.0, + "step": 1905 + }, + { + "entropy": 0.49377835392951963, + "epoch": 1.0778781038374718, + "grad_norm": 1.1958774328231812, + "learning_rate": 4.973217025668068e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8779303431510925, + "num_tokens": 15507113.0, + "step": 1910 + }, + { + "entropy": 0.5405571818351745, + "epoch": 1.0806997742663658, + "grad_norm": 1.2505043745040894, + "learning_rate": 4.973076880648115e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8709590196609497, + "num_tokens": 15547424.0, + "step": 1915 + }, + { + "entropy": 0.5551674902439118, + "epoch": 1.0835214446952597, + "grad_norm": 1.366829514503479, + "learning_rate": 4.972936372573218e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8604497313499451, + "num_tokens": 15588125.0, + "step": 1920 + }, + { + "entropy": 0.5369654476642609, + "epoch": 1.0863431151241536, + "grad_norm": 1.367258906364441, + "learning_rate": 4.972795501470981e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8665614485740661, + "num_tokens": 15628472.0, + "step": 1925 + }, + { + "entropy": 0.5272303640842437, + "epoch": 1.0891647855530473, + "grad_norm": 1.4138174057006836, + "learning_rate": 4.972654267369078e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8680860280990601, + "num_tokens": 15669129.0, + "step": 1930 + }, + { + "entropy": 0.5395491600036622, + "epoch": 1.0919864559819412, + "grad_norm": 1.3633450269699097, + "learning_rate": 4.972512670295253e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8737997651100159, + "num_tokens": 15709941.0, + "step": 1935 + }, + { + "entropy": 0.5212275326251984, + "epoch": 1.0948081264108351, + "grad_norm": 1.37870454788208, + "learning_rate": 4.9723707102773235e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8710021495819091, + "num_tokens": 15750468.0, + "step": 1940 + }, + { + "entropy": 0.533467584848404, + "epoch": 1.097629796839729, + "grad_norm": 1.5006767511367798, + "learning_rate": 4.972228387343179e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8726310014724732, + "num_tokens": 15791223.0, + "step": 1945 + }, + { + "entropy": 0.5062848865985871, + "epoch": 1.100451467268623, + "grad_norm": 1.1996793746948242, + "learning_rate": 4.972085701520777e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8720014691352844, + "num_tokens": 15831746.0, + "step": 1950 + }, + { + "entropy": 0.5338364660739898, + "epoch": 1.103273137697517, + "grad_norm": 1.4911237955093384, + "learning_rate": 4.971942652838149e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8707549214363098, + "num_tokens": 15872459.0, + "step": 1955 + }, + { + "entropy": 0.5420314610004425, + "epoch": 1.1060948081264108, + "grad_norm": 1.4543707370758057, + "learning_rate": 4.971799241323397e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8651981711387634, + "num_tokens": 15913132.0, + "step": 1960 + }, + { + "entropy": 0.533877170085907, + "epoch": 1.1089164785553047, + "grad_norm": 1.4339110851287842, + "learning_rate": 4.971655467004693e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8666856288909912, + "num_tokens": 15953596.0, + "step": 1965 + }, + { + "entropy": 0.525646859407425, + "epoch": 1.1117381489841986, + "grad_norm": 1.370369791984558, + "learning_rate": 4.971511329910283e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8674192786216736, + "num_tokens": 15994327.0, + "step": 1970 + }, + { + "entropy": 0.5182268440723419, + "epoch": 1.1145598194130926, + "grad_norm": 1.2392444610595703, + "learning_rate": 4.971366830068483e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8725781679153443, + "num_tokens": 16035039.0, + "step": 1975 + }, + { + "entropy": 0.5197380304336547, + "epoch": 1.1173814898419865, + "grad_norm": 1.3075342178344727, + "learning_rate": 4.971221967507679e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8703087091445922, + "num_tokens": 16075828.0, + "step": 1980 + }, + { + "entropy": 0.5025268793106079, + "epoch": 1.1202031602708804, + "grad_norm": 1.4279546737670898, + "learning_rate": 4.9710767422563285e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8805047512054444, + "num_tokens": 16116500.0, + "step": 1985 + }, + { + "entropy": 0.533239609003067, + "epoch": 1.1230248306997743, + "grad_norm": 1.2557505369186401, + "learning_rate": 4.970931154342963e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8681887149810791, + "num_tokens": 16157026.0, + "step": 1990 + }, + { + "entropy": 0.5435242295265198, + "epoch": 1.1258465011286682, + "grad_norm": 1.3608372211456299, + "learning_rate": 4.970785203796182e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8656893134117126, + "num_tokens": 16197572.0, + "step": 1995 + }, + { + "entropy": 0.48926340937614443, + "epoch": 1.1286681715575622, + "grad_norm": 1.314292311668396, + "learning_rate": 4.970638890644658e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8834827184677124, + "num_tokens": 16238388.0, + "step": 2000 + }, + { + "epoch": 1.1286681715575622, + "eval_entropy": 0.5074299573898315, + "eval_loss": 0.3934297561645508, + "eval_mean_token_accuracy": 0.8853967189788818, + "eval_num_tokens": 16238388.0, + "eval_runtime": 0.1639, + "eval_samples_per_second": 24.402, + "eval_steps_per_second": 6.1, + "step": 2000 + }, + { + "entropy": 0.5785790205001831, + "epoch": 1.1314898419864559, + "grad_norm": 1.5585596561431885, + "learning_rate": 4.970492214917133e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8502647757530213, + "num_tokens": 16279114.0, + "step": 2005 + }, + { + "entropy": 0.5156497418880462, + "epoch": 1.13431151241535, + "grad_norm": 1.2416678667068481, + "learning_rate": 4.970345176642424e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8698260307312011, + "num_tokens": 16319792.0, + "step": 2010 + }, + { + "entropy": 0.50312060713768, + "epoch": 1.1371331828442437, + "grad_norm": 1.343417763710022, + "learning_rate": 4.9701977758494135e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8738492012023926, + "num_tokens": 16360729.0, + "step": 2015 + }, + { + "entropy": 0.47664583325386045, + "epoch": 1.1399548532731376, + "grad_norm": 1.3557952642440796, + "learning_rate": 4.970050012567061e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8808457016944885, + "num_tokens": 16401565.0, + "step": 2020 + }, + { + "entropy": 0.5471422076225281, + "epoch": 1.1427765237020315, + "grad_norm": 1.6078767776489258, + "learning_rate": 4.969901886824394e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.861269211769104, + "num_tokens": 16442011.0, + "step": 2025 + }, + { + "entropy": 0.500029307603836, + "epoch": 1.1455981941309255, + "grad_norm": 1.2382087707519531, + "learning_rate": 4.969753398650511e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8787751078605652, + "num_tokens": 16482704.0, + "step": 2030 + }, + { + "entropy": 0.5860454142093658, + "epoch": 1.1484198645598194, + "grad_norm": 1.2965975999832153, + "learning_rate": 4.969604548074583e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.858244788646698, + "num_tokens": 16523275.0, + "step": 2035 + }, + { + "entropy": 0.5472402393817901, + "epoch": 1.1512415349887133, + "grad_norm": 1.347503900527954, + "learning_rate": 4.969455335125852e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8660460352897644, + "num_tokens": 16563975.0, + "step": 2040 + }, + { + "entropy": 0.5726926565170288, + "epoch": 1.1540632054176072, + "grad_norm": 1.3004218339920044, + "learning_rate": 4.969305759833631e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8575977206230163, + "num_tokens": 16604607.0, + "step": 2045 + }, + { + "entropy": 0.554759293794632, + "epoch": 1.1568848758465011, + "grad_norm": 1.1861565113067627, + "learning_rate": 4.969155822227304e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8664988040924072, + "num_tokens": 16645243.0, + "step": 2050 + }, + { + "entropy": 0.5388511419296265, + "epoch": 1.159706546275395, + "grad_norm": 1.6149320602416992, + "learning_rate": 4.969005522336324e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8644424319267273, + "num_tokens": 16685870.0, + "step": 2055 + }, + { + "entropy": 0.5282904028892517, + "epoch": 1.162528216704289, + "grad_norm": 1.2274096012115479, + "learning_rate": 4.968854860190222e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8676736235618592, + "num_tokens": 16726814.0, + "step": 2060 + }, + { + "entropy": 0.5028113007545472, + "epoch": 1.1653498871331829, + "grad_norm": 1.2310305833816528, + "learning_rate": 4.9687038358185904e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8749886631965638, + "num_tokens": 16767430.0, + "step": 2065 + }, + { + "entropy": 0.5054876983165741, + "epoch": 1.1681715575620768, + "grad_norm": 1.6121010780334473, + "learning_rate": 4.968552449251103e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8731933832168579, + "num_tokens": 16808070.0, + "step": 2070 + }, + { + "entropy": 0.510890108346939, + "epoch": 1.1709932279909707, + "grad_norm": 1.212408185005188, + "learning_rate": 4.968400700517496e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8713488578796387, + "num_tokens": 16848790.0, + "step": 2075 + }, + { + "entropy": 0.5575174331665039, + "epoch": 1.1738148984198646, + "grad_norm": 1.3433505296707153, + "learning_rate": 4.968248589647582e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8673989772796631, + "num_tokens": 16888705.0, + "step": 2080 + }, + { + "entropy": 0.4738758623600006, + "epoch": 1.1766365688487586, + "grad_norm": 1.337335467338562, + "learning_rate": 4.968096116671243e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8840928077697754, + "num_tokens": 16929128.0, + "step": 2085 + }, + { + "entropy": 0.5100393950939178, + "epoch": 1.1794582392776523, + "grad_norm": 1.4914759397506714, + "learning_rate": 4.9679432816184316e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8735705256462097, + "num_tokens": 16969800.0, + "step": 2090 + }, + { + "entropy": 0.5905535221099854, + "epoch": 1.1822799097065464, + "grad_norm": 1.579995036125183, + "learning_rate": 4.967790084519174e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.850010359287262, + "num_tokens": 17010171.0, + "step": 2095 + }, + { + "entropy": 0.5488203287124633, + "epoch": 1.18510158013544, + "grad_norm": 1.3214325904846191, + "learning_rate": 4.9676365254035645e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8675566554069519, + "num_tokens": 17050042.0, + "step": 2100 + }, + { + "entropy": 0.49664353728294375, + "epoch": 1.187923250564334, + "grad_norm": 1.3970224857330322, + "learning_rate": 4.96748260430177e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8784841656684875, + "num_tokens": 17090848.0, + "step": 2105 + }, + { + "entropy": 0.5250966966152191, + "epoch": 1.190744920993228, + "grad_norm": 1.3193445205688477, + "learning_rate": 4.967328321244028e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8732633233070374, + "num_tokens": 17131555.0, + "step": 2110 + }, + { + "entropy": 0.5157673597335816, + "epoch": 1.1935665914221218, + "grad_norm": 1.2861922979354858, + "learning_rate": 4.967173676260648e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.873046362400055, + "num_tokens": 17172227.0, + "step": 2115 + }, + { + "entropy": 0.5405256807804107, + "epoch": 1.1963882618510158, + "grad_norm": 1.6772886514663696, + "learning_rate": 4.96701866938201e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8637170910835266, + "num_tokens": 17212727.0, + "step": 2120 + }, + { + "entropy": 0.5377147376537323, + "epoch": 1.1992099322799097, + "grad_norm": 1.3887752294540405, + "learning_rate": 4.9668633006385655e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8655383706092834, + "num_tokens": 17253478.0, + "step": 2125 + }, + { + "entropy": 0.5166448771953582, + "epoch": 1.2020316027088036, + "grad_norm": 1.2280800342559814, + "learning_rate": 4.966707570060835e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8748661637306213, + "num_tokens": 17294254.0, + "step": 2130 + }, + { + "entropy": 0.5040790915489197, + "epoch": 1.2048532731376975, + "grad_norm": 1.4066789150238037, + "learning_rate": 4.9665514776794145e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8822136998176575, + "num_tokens": 17335017.0, + "step": 2135 + }, + { + "entropy": 0.5264786601066589, + "epoch": 1.2076749435665914, + "grad_norm": 1.3595073223114014, + "learning_rate": 4.9663950235249655e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8696430683135986, + "num_tokens": 17375865.0, + "step": 2140 + }, + { + "entropy": 0.5130272388458252, + "epoch": 1.2104966139954854, + "grad_norm": 1.4424818754196167, + "learning_rate": 4.966238207628225e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8746838808059693, + "num_tokens": 17416458.0, + "step": 2145 + }, + { + "entropy": 0.49900220036506654, + "epoch": 1.2133182844243793, + "grad_norm": 1.3563543558120728, + "learning_rate": 4.966081030019999e-06, + "loss": 0.407, + "mean_token_accuracy": 0.872953200340271, + "num_tokens": 17457303.0, + "step": 2150 + }, + { + "entropy": 0.5183020770549774, + "epoch": 1.2161399548532732, + "grad_norm": 1.431388258934021, + "learning_rate": 4.965923490731166e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8713582038879395, + "num_tokens": 17498163.0, + "step": 2155 + }, + { + "entropy": 0.5679690599441528, + "epoch": 1.2189616252821671, + "grad_norm": 1.5881187915802002, + "learning_rate": 4.965765589792674e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8564951181411743, + "num_tokens": 17538730.0, + "step": 2160 + }, + { + "entropy": 0.5158195972442627, + "epoch": 1.221783295711061, + "grad_norm": 1.2861218452453613, + "learning_rate": 4.965607327235542e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8753223657608032, + "num_tokens": 17579319.0, + "step": 2165 + }, + { + "entropy": 0.5332009434700012, + "epoch": 1.224604966139955, + "grad_norm": 1.338793158531189, + "learning_rate": 4.965448703090861e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8673290014266968, + "num_tokens": 17620049.0, + "step": 2170 + }, + { + "entropy": 0.5594852328300476, + "epoch": 1.2274266365688487, + "grad_norm": 1.3682115077972412, + "learning_rate": 4.965289717389794e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8634846925735473, + "num_tokens": 17660479.0, + "step": 2175 + }, + { + "entropy": 0.5210170686244965, + "epoch": 1.2302483069977426, + "grad_norm": 1.4854122400283813, + "learning_rate": 4.965130370163572e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8751443147659301, + "num_tokens": 17701156.0, + "step": 2180 + }, + { + "entropy": 0.5292206406593323, + "epoch": 1.2330699774266365, + "grad_norm": 1.3995417356491089, + "learning_rate": 4.9649706614435e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8673221230506897, + "num_tokens": 17742000.0, + "step": 2185 + }, + { + "entropy": 0.5528486132621765, + "epoch": 1.2358916478555304, + "grad_norm": 1.4038658142089844, + "learning_rate": 4.9648105912609525e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.863236916065216, + "num_tokens": 17782813.0, + "step": 2190 + }, + { + "entropy": 0.520248967409134, + "epoch": 1.2387133182844243, + "grad_norm": 1.3488070964813232, + "learning_rate": 4.964650159647375e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8726580619812012, + "num_tokens": 17823603.0, + "step": 2195 + }, + { + "entropy": 0.5257691144943237, + "epoch": 1.2415349887133182, + "grad_norm": 1.3696351051330566, + "learning_rate": 4.964489366634285e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8737285494804382, + "num_tokens": 17864326.0, + "step": 2200 + }, + { + "entropy": 0.521408861875534, + "epoch": 1.2443566591422122, + "grad_norm": 1.2956690788269043, + "learning_rate": 4.964328212253269e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8745855331420899, + "num_tokens": 17904832.0, + "step": 2205 + }, + { + "entropy": 0.5116970539093018, + "epoch": 1.247178329571106, + "grad_norm": 1.1543620824813843, + "learning_rate": 4.9641666965359865e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.871479332447052, + "num_tokens": 17945706.0, + "step": 2210 + }, + { + "entropy": 0.51253702044487, + "epoch": 1.25, + "grad_norm": 1.3165756464004517, + "learning_rate": 4.9640048195141685e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8732204794883728, + "num_tokens": 17986493.0, + "step": 2215 + }, + { + "entropy": 0.5361522495746612, + "epoch": 1.252821670428894, + "grad_norm": 1.3089770078659058, + "learning_rate": 4.9638425812196145e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8748211622238159, + "num_tokens": 18027243.0, + "step": 2220 + }, + { + "entropy": 0.4924753546714783, + "epoch": 1.2556433408577878, + "grad_norm": 1.3457053899765015, + "learning_rate": 4.963679981684195e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8790703415870667, + "num_tokens": 18067621.0, + "step": 2225 + }, + { + "entropy": 0.5386804461479187, + "epoch": 1.2584650112866818, + "grad_norm": 1.543764352798462, + "learning_rate": 4.963517020939855e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8642153739929199, + "num_tokens": 18108318.0, + "step": 2230 + }, + { + "entropy": 0.5258350789546966, + "epoch": 1.2612866817155757, + "grad_norm": 1.33335280418396, + "learning_rate": 4.963353699018607e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.867599892616272, + "num_tokens": 18149098.0, + "step": 2235 + }, + { + "entropy": 0.4915396809577942, + "epoch": 1.2641083521444696, + "grad_norm": 1.3478151559829712, + "learning_rate": 4.963190015952536e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8742651224136353, + "num_tokens": 18189853.0, + "step": 2240 + }, + { + "entropy": 0.5037339508533478, + "epoch": 1.2669300225733635, + "grad_norm": 1.4654349088668823, + "learning_rate": 4.963025971773798e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8763032555580139, + "num_tokens": 18230648.0, + "step": 2245 + }, + { + "entropy": 0.5388745546340943, + "epoch": 1.2697516930022572, + "grad_norm": 1.3390324115753174, + "learning_rate": 4.962861566514618e-06, + "loss": 0.418, + "mean_token_accuracy": 0.865935492515564, + "num_tokens": 18271377.0, + "step": 2250 + }, + { + "entropy": 0.5285961091518402, + "epoch": 1.2725733634311513, + "grad_norm": 1.2784810066223145, + "learning_rate": 4.962696800207295e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8682160973548889, + "num_tokens": 18311967.0, + "step": 2255 + }, + { + "entropy": 0.5452647149562836, + "epoch": 1.275395033860045, + "grad_norm": 1.4500328302383423, + "learning_rate": 4.9625316728841966e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8690374493598938, + "num_tokens": 18352492.0, + "step": 2260 + }, + { + "entropy": 0.5347442328929901, + "epoch": 1.2782167042889392, + "grad_norm": 1.3491861820220947, + "learning_rate": 4.962366184577762e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8743094444274903, + "num_tokens": 18393148.0, + "step": 2265 + }, + { + "entropy": 0.5413298010826111, + "epoch": 1.2810383747178329, + "grad_norm": 1.3183116912841797, + "learning_rate": 4.962200335320502e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8649118781089783, + "num_tokens": 18433888.0, + "step": 2270 + }, + { + "entropy": 0.49364901781082154, + "epoch": 1.2838600451467268, + "grad_norm": 1.4001456499099731, + "learning_rate": 4.962034125144997e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8754386663436889, + "num_tokens": 18474725.0, + "step": 2275 + }, + { + "entropy": 0.4968021988868713, + "epoch": 1.2866817155756207, + "grad_norm": 1.4670476913452148, + "learning_rate": 4.961867554083899e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8795897960662842, + "num_tokens": 18515515.0, + "step": 2280 + }, + { + "entropy": 0.49011672139167783, + "epoch": 1.2895033860045146, + "grad_norm": 1.5934028625488281, + "learning_rate": 4.961700622169931e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8795024633407593, + "num_tokens": 18556216.0, + "step": 2285 + }, + { + "entropy": 0.5093987703323364, + "epoch": 1.2923250564334086, + "grad_norm": 1.2995637655258179, + "learning_rate": 4.961533329435888e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8725569367408752, + "num_tokens": 18596786.0, + "step": 2290 + }, + { + "entropy": 0.5331693708896637, + "epoch": 1.2951467268623025, + "grad_norm": 1.4760665893554688, + "learning_rate": 4.9613656759146335e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.869564437866211, + "num_tokens": 18637493.0, + "step": 2295 + }, + { + "entropy": 0.5001338601112366, + "epoch": 1.2979683972911964, + "grad_norm": 1.8200581073760986, + "learning_rate": 4.961197661639102e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8765197157859802, + "num_tokens": 18677593.0, + "step": 2300 + }, + { + "entropy": 0.5498757302761078, + "epoch": 1.3007900677200903, + "grad_norm": 1.2150392532348633, + "learning_rate": 4.9610292866423036e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8701222181320191, + "num_tokens": 18718374.0, + "step": 2305 + }, + { + "entropy": 0.5348264276981354, + "epoch": 1.3036117381489842, + "grad_norm": 1.4520456790924072, + "learning_rate": 4.960860550957311e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8689911484718322, + "num_tokens": 18758689.0, + "step": 2310 + }, + { + "entropy": 0.564541220664978, + "epoch": 1.3064334085778782, + "grad_norm": 1.4934542179107666, + "learning_rate": 4.960691454617276e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8587620854377747, + "num_tokens": 18798953.0, + "step": 2315 + }, + { + "entropy": 0.5457278668880463, + "epoch": 1.309255079006772, + "grad_norm": 1.3648675680160522, + "learning_rate": 4.960521997655415e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8687914133071899, + "num_tokens": 18839547.0, + "step": 2320 + }, + { + "entropy": 0.5353664636611939, + "epoch": 1.312076749435666, + "grad_norm": 1.3305805921554565, + "learning_rate": 4.960352180105019e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8669357776641846, + "num_tokens": 18880107.0, + "step": 2325 + }, + { + "entropy": 0.5464560270309449, + "epoch": 1.31489841986456, + "grad_norm": 2.0227503776550293, + "learning_rate": 4.9601820019994495e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8719035506248474, + "num_tokens": 18920937.0, + "step": 2330 + }, + { + "entropy": 0.536427104473114, + "epoch": 1.3177200902934536, + "grad_norm": 1.3813787698745728, + "learning_rate": 4.960011463372136e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8644273400306701, + "num_tokens": 18961776.0, + "step": 2335 + }, + { + "entropy": 0.5107419192790985, + "epoch": 1.3205417607223477, + "grad_norm": 1.2994340658187866, + "learning_rate": 4.959840564256583e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.873857831954956, + "num_tokens": 19002350.0, + "step": 2340 + }, + { + "entropy": 0.48056021332740784, + "epoch": 1.3233634311512414, + "grad_norm": 1.4537101984024048, + "learning_rate": 4.959669304686362e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8777095079421997, + "num_tokens": 19042920.0, + "step": 2345 + }, + { + "entropy": 0.5710622429847717, + "epoch": 1.3261851015801354, + "grad_norm": 1.362627625465393, + "learning_rate": 4.959497684695118e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8591257691383362, + "num_tokens": 19083640.0, + "step": 2350 + }, + { + "entropy": 0.521550840139389, + "epoch": 1.3290067720090293, + "grad_norm": 1.173241138458252, + "learning_rate": 4.959325704316565e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8729618310928344, + "num_tokens": 19123082.0, + "step": 2355 + }, + { + "entropy": 0.5515291333198548, + "epoch": 1.3318284424379232, + "grad_norm": 1.6536093950271606, + "learning_rate": 4.959153363584489e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8603331565856933, + "num_tokens": 19163643.0, + "step": 2360 + }, + { + "entropy": 0.5428348422050476, + "epoch": 1.3346501128668171, + "grad_norm": 1.6122465133666992, + "learning_rate": 4.958980662532747e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8602743864059448, + "num_tokens": 19204151.0, + "step": 2365 + }, + { + "entropy": 0.558714485168457, + "epoch": 1.337471783295711, + "grad_norm": 1.321026086807251, + "learning_rate": 4.9588076011952655e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8587347507476807, + "num_tokens": 19244654.0, + "step": 2370 + }, + { + "entropy": 0.5276102185249328, + "epoch": 1.340293453724605, + "grad_norm": 1.311216950416565, + "learning_rate": 4.958634179606041e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8710083127021789, + "num_tokens": 19285304.0, + "step": 2375 + }, + { + "entropy": 0.5120981454849243, + "epoch": 1.3431151241534989, + "grad_norm": 1.2444289922714233, + "learning_rate": 4.9584603977991445e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.876779317855835, + "num_tokens": 19326058.0, + "step": 2380 + }, + { + "entropy": 0.5315510213375092, + "epoch": 1.3459367945823928, + "grad_norm": 1.342822551727295, + "learning_rate": 4.958286255808714e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.86748708486557, + "num_tokens": 19366695.0, + "step": 2385 + }, + { + "entropy": 0.5051746428012848, + "epoch": 1.3487584650112867, + "grad_norm": 1.3792295455932617, + "learning_rate": 4.958111753668962e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8715560555458068, + "num_tokens": 19407356.0, + "step": 2390 + }, + { + "entropy": 0.5739570260047913, + "epoch": 1.3515801354401806, + "grad_norm": 1.3936057090759277, + "learning_rate": 4.957936891414166e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8616944670677185, + "num_tokens": 19446924.0, + "step": 2395 + }, + { + "entropy": 0.5593576908111573, + "epoch": 1.3544018058690745, + "grad_norm": 1.6129295825958252, + "learning_rate": 4.957761669078679e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8601847767829895, + "num_tokens": 19487296.0, + "step": 2400 + }, + { + "entropy": 0.49403364062309263, + "epoch": 1.3572234762979685, + "grad_norm": 1.463240623474121, + "learning_rate": 4.957586086696925e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.882617199420929, + "num_tokens": 19528192.0, + "step": 2405 + }, + { + "entropy": 0.5413905024528504, + "epoch": 1.3600451467268622, + "grad_norm": 1.3406250476837158, + "learning_rate": 4.957410144303396e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8708963632583618, + "num_tokens": 19568788.0, + "step": 2410 + }, + { + "entropy": 0.5129614770412445, + "epoch": 1.3628668171557563, + "grad_norm": 1.423504114151001, + "learning_rate": 4.957233841932655e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8748751640319824, + "num_tokens": 19609525.0, + "step": 2415 + }, + { + "entropy": 0.49986888766288756, + "epoch": 1.36568848758465, + "grad_norm": 1.3050624132156372, + "learning_rate": 4.957057179619339e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8794636130332947, + "num_tokens": 19650378.0, + "step": 2420 + }, + { + "entropy": 0.49531072974205015, + "epoch": 1.3685101580135441, + "grad_norm": 1.2032006978988647, + "learning_rate": 4.956880157398151e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8791410803794861, + "num_tokens": 19690935.0, + "step": 2425 + }, + { + "entropy": 0.5217344462871552, + "epoch": 1.3713318284424378, + "grad_norm": 1.3181284666061401, + "learning_rate": 4.956702775303868e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8702506303787232, + "num_tokens": 19731616.0, + "step": 2430 + }, + { + "entropy": 0.5071918427944183, + "epoch": 1.3741534988713318, + "grad_norm": 1.2046350240707397, + "learning_rate": 4.956525033371336e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8688374757766724, + "num_tokens": 19772129.0, + "step": 2435 + }, + { + "entropy": 0.5620993256568909, + "epoch": 1.3769751693002257, + "grad_norm": 1.3404937982559204, + "learning_rate": 4.956346931635474e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8631010413169861, + "num_tokens": 19812671.0, + "step": 2440 + }, + { + "entropy": 0.5249874234199524, + "epoch": 1.3797968397291196, + "grad_norm": 1.3918310403823853, + "learning_rate": 4.956168470131269e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8694810628890991, + "num_tokens": 19853386.0, + "step": 2445 + }, + { + "entropy": 0.48499825596809387, + "epoch": 1.3826185101580135, + "grad_norm": 1.3398149013519287, + "learning_rate": 4.95598964889378e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8827056169509888, + "num_tokens": 19893901.0, + "step": 2450 + }, + { + "entropy": 0.5267378866672516, + "epoch": 1.3854401805869074, + "grad_norm": 1.4285844564437866, + "learning_rate": 4.9558104679581366e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8731061100959778, + "num_tokens": 19934618.0, + "step": 2455 + }, + { + "entropy": 0.551910811662674, + "epoch": 1.3882618510158014, + "grad_norm": 1.462052822113037, + "learning_rate": 4.955630927359538e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8564296126365661, + "num_tokens": 19975257.0, + "step": 2460 + }, + { + "entropy": 0.5045246481895447, + "epoch": 1.3910835214446953, + "grad_norm": 1.2630540132522583, + "learning_rate": 4.9554510271332575e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8743220806121826, + "num_tokens": 20015796.0, + "step": 2465 + }, + { + "entropy": 0.4776579737663269, + "epoch": 1.3939051918735892, + "grad_norm": 1.3086442947387695, + "learning_rate": 4.955270767314633e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8837788105010986, + "num_tokens": 20056639.0, + "step": 2470 + }, + { + "entropy": 0.5198698997497558, + "epoch": 1.396726862302483, + "grad_norm": 1.1976763010025024, + "learning_rate": 4.955090147939079e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8731474876403809, + "num_tokens": 20097523.0, + "step": 2475 + }, + { + "entropy": 0.5507951974868774, + "epoch": 1.399548532731377, + "grad_norm": 1.595848798751831, + "learning_rate": 4.954909169042078e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8633155941963195, + "num_tokens": 20137741.0, + "step": 2480 + }, + { + "entropy": 0.5629419386386871, + "epoch": 1.402370203160271, + "grad_norm": 1.5942708253860474, + "learning_rate": 4.954727830659182e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8617273569107056, + "num_tokens": 20178295.0, + "step": 2485 + }, + { + "entropy": 0.5281494557857513, + "epoch": 1.4051918735891649, + "grad_norm": 1.2493270635604858, + "learning_rate": 4.954546132826017e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8682329654693604, + "num_tokens": 20219041.0, + "step": 2490 + }, + { + "entropy": 0.5261754095554352, + "epoch": 1.4080135440180586, + "grad_norm": 1.3487435579299927, + "learning_rate": 4.954364075578276e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8690428853034973, + "num_tokens": 20259747.0, + "step": 2495 + }, + { + "entropy": 0.5704293370246887, + "epoch": 1.4108352144469527, + "grad_norm": 1.5484710931777954, + "learning_rate": 4.954181658951725e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8576129674911499, + "num_tokens": 20300626.0, + "step": 2500 + }, + { + "epoch": 1.4108352144469527, + "eval_entropy": 0.49934637546539307, + "eval_loss": 0.3746188282966614, + "eval_mean_token_accuracy": 0.8903794288635254, + "eval_num_tokens": 20300626.0, + "eval_runtime": 0.163, + "eval_samples_per_second": 24.539, + "eval_steps_per_second": 6.135, + "step": 2500 + }, + { + "entropy": 0.5491129279136657, + "epoch": 1.4136568848758464, + "grad_norm": 1.3182957172393799, + "learning_rate": 4.953998882982197e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8601503491401672, + "num_tokens": 20341333.0, + "step": 2505 + }, + { + "entropy": 0.5612141251564026, + "epoch": 1.4164785553047405, + "grad_norm": 1.5513429641723633, + "learning_rate": 4.9538157477056025e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8637246131896973, + "num_tokens": 20381822.0, + "step": 2510 + }, + { + "entropy": 0.5199264347553253, + "epoch": 1.4193002257336342, + "grad_norm": 1.3425822257995605, + "learning_rate": 4.953632253157916e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8695431590080261, + "num_tokens": 20422346.0, + "step": 2515 + }, + { + "entropy": 0.5060115993022919, + "epoch": 1.4221218961625282, + "grad_norm": 1.4134669303894043, + "learning_rate": 4.953448399375187e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8781610608100892, + "num_tokens": 20462097.0, + "step": 2520 + }, + { + "entropy": 0.5111626803874969, + "epoch": 1.424943566591422, + "grad_norm": 1.3311017751693726, + "learning_rate": 4.953264186393531e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.87530437707901, + "num_tokens": 20502526.0, + "step": 2525 + }, + { + "entropy": 0.5413293719291687, + "epoch": 1.427765237020316, + "grad_norm": 1.4240591526031494, + "learning_rate": 4.953079614249138e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8609416246414184, + "num_tokens": 20543142.0, + "step": 2530 + }, + { + "entropy": 0.5732130527496337, + "epoch": 1.43058690744921, + "grad_norm": 1.4825007915496826, + "learning_rate": 4.952894682978268e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8564027547836304, + "num_tokens": 20583174.0, + "step": 2535 + }, + { + "entropy": 0.4671943664550781, + "epoch": 1.4334085778781038, + "grad_norm": 1.1537166833877563, + "learning_rate": 4.952709392617248e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8833828091621398, + "num_tokens": 20623760.0, + "step": 2540 + }, + { + "entropy": 0.5528115510940552, + "epoch": 1.4362302483069977, + "grad_norm": 1.351750135421753, + "learning_rate": 4.952523743202482e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8619906187057496, + "num_tokens": 20664513.0, + "step": 2545 + }, + { + "entropy": 0.5158391892910004, + "epoch": 1.4390519187358917, + "grad_norm": 1.1706582307815552, + "learning_rate": 4.952337734770439e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8752634882926941, + "num_tokens": 20705333.0, + "step": 2550 + }, + { + "entropy": 0.5181755602359772, + "epoch": 1.4418735891647856, + "grad_norm": 1.5158717632293701, + "learning_rate": 4.95215136735766e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8729309439659119, + "num_tokens": 20745887.0, + "step": 2555 + }, + { + "entropy": 0.5385544419288635, + "epoch": 1.4446952595936795, + "grad_norm": 1.2974624633789062, + "learning_rate": 4.951964641000757e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8661515951156616, + "num_tokens": 20786349.0, + "step": 2560 + }, + { + "entropy": 0.5237244606018067, + "epoch": 1.4475169300225734, + "grad_norm": 1.4556293487548828, + "learning_rate": 4.951777555736414e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8732018351554871, + "num_tokens": 20826912.0, + "step": 2565 + }, + { + "entropy": 0.5504439949989319, + "epoch": 1.4503386004514673, + "grad_norm": 1.478190302848816, + "learning_rate": 4.951590111601381e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8666379451751709, + "num_tokens": 20867551.0, + "step": 2570 + }, + { + "entropy": 0.5655958175659179, + "epoch": 1.4531602708803613, + "grad_norm": 1.2426916360855103, + "learning_rate": 4.951402308632485e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8618389010429383, + "num_tokens": 20908206.0, + "step": 2575 + }, + { + "entropy": 0.5350692510604859, + "epoch": 1.455981941309255, + "grad_norm": 1.355296015739441, + "learning_rate": 4.951214146866617e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.868348228931427, + "num_tokens": 20948945.0, + "step": 2580 + }, + { + "entropy": 0.5500153779983521, + "epoch": 1.458803611738149, + "grad_norm": 1.242108941078186, + "learning_rate": 4.951025626340743e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.864187228679657, + "num_tokens": 20989560.0, + "step": 2585 + }, + { + "entropy": 0.5728635132312775, + "epoch": 1.4616252821670428, + "grad_norm": 1.4970265626907349, + "learning_rate": 4.950836747091896e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8666630983352661, + "num_tokens": 21030293.0, + "step": 2590 + }, + { + "entropy": 0.572597336769104, + "epoch": 1.4644469525959367, + "grad_norm": 1.5469214916229248, + "learning_rate": 4.950647509157184e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.860745084285736, + "num_tokens": 21070816.0, + "step": 2595 + }, + { + "entropy": 0.4957478761672974, + "epoch": 1.4672686230248306, + "grad_norm": 1.1917364597320557, + "learning_rate": 4.950457912573781e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8750966668128968, + "num_tokens": 21111621.0, + "step": 2600 + }, + { + "entropy": 0.5851558446884155, + "epoch": 1.4700902934537246, + "grad_norm": 1.4717788696289062, + "learning_rate": 4.950267957378934e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8556645750999451, + "num_tokens": 21152027.0, + "step": 2605 + }, + { + "entropy": 0.4879822790622711, + "epoch": 1.4729119638826185, + "grad_norm": 1.244563341140747, + "learning_rate": 4.950077643609959e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.882515013217926, + "num_tokens": 21192521.0, + "step": 2610 + }, + { + "entropy": 0.5324353992938995, + "epoch": 1.4757336343115124, + "grad_norm": 1.4513381719589233, + "learning_rate": 4.949886971304245e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8638843178749085, + "num_tokens": 21233396.0, + "step": 2615 + }, + { + "entropy": 0.5465445816516876, + "epoch": 1.4785553047404063, + "grad_norm": 1.4188737869262695, + "learning_rate": 4.9496959404992475e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8674513459205627, + "num_tokens": 21274180.0, + "step": 2620 + }, + { + "entropy": 0.5312675893306732, + "epoch": 1.4813769751693002, + "grad_norm": 1.3827718496322632, + "learning_rate": 4.949504551232494e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8689416646957397, + "num_tokens": 21314947.0, + "step": 2625 + }, + { + "entropy": 0.5221801578998566, + "epoch": 1.4841986455981941, + "grad_norm": 1.344963788986206, + "learning_rate": 4.949312803541586e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8764538764953613, + "num_tokens": 21355658.0, + "step": 2630 + }, + { + "entropy": 0.5304463326930999, + "epoch": 1.487020316027088, + "grad_norm": 1.5296121835708618, + "learning_rate": 4.94912069746419e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8726909160614014, + "num_tokens": 21396355.0, + "step": 2635 + }, + { + "entropy": 0.5112833499908447, + "epoch": 1.489841986455982, + "grad_norm": 1.4764835834503174, + "learning_rate": 4.948928233038046e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8752380728721618, + "num_tokens": 21437172.0, + "step": 2640 + }, + { + "entropy": 0.5136803925037384, + "epoch": 1.492663656884876, + "grad_norm": 1.5744765996932983, + "learning_rate": 4.948735410300964e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8673662304878235, + "num_tokens": 21477969.0, + "step": 2645 + }, + { + "entropy": 0.5488377809524536, + "epoch": 1.4954853273137698, + "grad_norm": 1.3822392225265503, + "learning_rate": 4.948542229290823e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8641608357429504, + "num_tokens": 21517644.0, + "step": 2650 + }, + { + "entropy": 0.48912598490715026, + "epoch": 1.4983069977426637, + "grad_norm": 1.3357199430465698, + "learning_rate": 4.948348690045574e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.873920488357544, + "num_tokens": 21558283.0, + "step": 2655 + }, + { + "entropy": 0.5133354067802429, + "epoch": 1.5011286681715577, + "grad_norm": 1.3202459812164307, + "learning_rate": 4.948154792603237e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8687997817993164, + "num_tokens": 21598905.0, + "step": 2660 + }, + { + "entropy": 0.5592216491699219, + "epoch": 1.5039503386004514, + "grad_norm": 1.515934944152832, + "learning_rate": 4.947960537001905e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8624489784240723, + "num_tokens": 21639436.0, + "step": 2665 + }, + { + "entropy": 0.5013287782669067, + "epoch": 1.5067720090293455, + "grad_norm": 1.3164470195770264, + "learning_rate": 4.947765923279738e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8810773968696595, + "num_tokens": 21680256.0, + "step": 2670 + }, + { + "entropy": 0.5708201169967652, + "epoch": 1.5095936794582392, + "grad_norm": 1.5717384815216064, + "learning_rate": 4.9475709514749695e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8578357696533203, + "num_tokens": 21720710.0, + "step": 2675 + }, + { + "entropy": 0.5323946833610534, + "epoch": 1.5124153498871333, + "grad_norm": 1.3646914958953857, + "learning_rate": 4.9473756216258996e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8670541882514954, + "num_tokens": 21761305.0, + "step": 2680 + }, + { + "entropy": 0.5191471993923187, + "epoch": 1.515237020316027, + "grad_norm": 1.259047508239746, + "learning_rate": 4.947179933770902e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8745760560035706, + "num_tokens": 21801879.0, + "step": 2685 + }, + { + "entropy": 0.5398116707801819, + "epoch": 1.518058690744921, + "grad_norm": 1.3516515493392944, + "learning_rate": 4.94698388794842e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8646332740783691, + "num_tokens": 21842560.0, + "step": 2690 + }, + { + "entropy": 0.5416356801986695, + "epoch": 1.5208803611738149, + "grad_norm": 1.3463817834854126, + "learning_rate": 4.946787484196966e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8608946084976197, + "num_tokens": 21883327.0, + "step": 2695 + }, + { + "entropy": 0.555895859003067, + "epoch": 1.5237020316027088, + "grad_norm": 1.376157283782959, + "learning_rate": 4.9465907225551244e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8606306076049804, + "num_tokens": 21923780.0, + "step": 2700 + }, + { + "entropy": 0.5254535734653473, + "epoch": 1.5265237020316027, + "grad_norm": 1.310943365097046, + "learning_rate": 4.946393603061548e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8713202953338623, + "num_tokens": 21964188.0, + "step": 2705 + }, + { + "entropy": 0.5179416954517364, + "epoch": 1.5293453724604966, + "grad_norm": 1.361799955368042, + "learning_rate": 4.946196125754962e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8728214979171753, + "num_tokens": 22004992.0, + "step": 2710 + }, + { + "entropy": 0.539003986120224, + "epoch": 1.5321670428893905, + "grad_norm": 1.5339001417160034, + "learning_rate": 4.9459982906741596e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8599718093872071, + "num_tokens": 22045651.0, + "step": 2715 + }, + { + "entropy": 0.48843835592269896, + "epoch": 1.5349887133182845, + "grad_norm": 1.1609723567962646, + "learning_rate": 4.945800097858007e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8784770965576172, + "num_tokens": 22085915.0, + "step": 2720 + }, + { + "entropy": 0.5748872756958008, + "epoch": 1.5378103837471784, + "grad_norm": 1.474261999130249, + "learning_rate": 4.945601547345439e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8565932989120484, + "num_tokens": 22126565.0, + "step": 2725 + }, + { + "entropy": 0.5497707307338715, + "epoch": 1.540632054176072, + "grad_norm": 1.328251600265503, + "learning_rate": 4.945402639175459e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8657533526420593, + "num_tokens": 22166897.0, + "step": 2730 + }, + { + "entropy": 0.5045753955841065, + "epoch": 1.5434537246049662, + "grad_norm": 1.4237067699432373, + "learning_rate": 4.945203373387145e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8722402095794678, + "num_tokens": 22207560.0, + "step": 2735 + }, + { + "entropy": 0.5575098991394043, + "epoch": 1.54627539503386, + "grad_norm": 1.7878057956695557, + "learning_rate": 4.945003750019641e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8603370666503907, + "num_tokens": 22247941.0, + "step": 2740 + }, + { + "entropy": 0.4867607891559601, + "epoch": 1.549097065462754, + "grad_norm": 1.526099443435669, + "learning_rate": 4.944803769112164e-06, + "loss": 0.401, + "mean_token_accuracy": 0.874083411693573, + "num_tokens": 22288838.0, + "step": 2745 + }, + { + "entropy": 0.5552621841430664, + "epoch": 1.5519187358916477, + "grad_norm": 1.3902403116226196, + "learning_rate": 4.944603430704e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8668664813041687, + "num_tokens": 22328956.0, + "step": 2750 + }, + { + "entropy": 0.5252302527427674, + "epoch": 1.554740406320542, + "grad_norm": 1.326379418373108, + "learning_rate": 4.944402734834506e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8710361838340759, + "num_tokens": 22369592.0, + "step": 2755 + }, + { + "entropy": 0.5309755086898804, + "epoch": 1.5575620767494356, + "grad_norm": 1.4432721138000488, + "learning_rate": 4.944201681543107e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8664063453674317, + "num_tokens": 22410109.0, + "step": 2760 + }, + { + "entropy": 0.5249135076999665, + "epoch": 1.5603837471783297, + "grad_norm": 1.4632095098495483, + "learning_rate": 4.944000270869302e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8737531065940857, + "num_tokens": 22450758.0, + "step": 2765 + }, + { + "entropy": 0.509848439693451, + "epoch": 1.5632054176072234, + "grad_norm": 1.4338641166687012, + "learning_rate": 4.943798502852657e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8742005109786988, + "num_tokens": 22491593.0, + "step": 2770 + }, + { + "entropy": 0.5258462011814118, + "epoch": 1.5660270880361173, + "grad_norm": 1.3523592948913574, + "learning_rate": 4.94359637753281e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.867405092716217, + "num_tokens": 22532315.0, + "step": 2775 + }, + { + "entropy": 0.5412358582019806, + "epoch": 1.5688487584650113, + "grad_norm": 1.4376283884048462, + "learning_rate": 4.943393894949469e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8682662725448609, + "num_tokens": 22573023.0, + "step": 2780 + }, + { + "entropy": 0.530605137348175, + "epoch": 1.5716704288939052, + "grad_norm": 1.3493672609329224, + "learning_rate": 4.943191055142409e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8707961201667785, + "num_tokens": 22613743.0, + "step": 2785 + }, + { + "entropy": 0.5197601974010467, + "epoch": 1.574492099322799, + "grad_norm": 1.3924630880355835, + "learning_rate": 4.942987858151481e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8706324815750122, + "num_tokens": 22654339.0, + "step": 2790 + }, + { + "entropy": 0.48147222995758054, + "epoch": 1.577313769751693, + "grad_norm": 1.5284984111785889, + "learning_rate": 4.942784304016602e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8784740090370178, + "num_tokens": 22694670.0, + "step": 2795 + }, + { + "entropy": 0.46955564618110657, + "epoch": 1.580135440180587, + "grad_norm": 1.6057965755462646, + "learning_rate": 4.942580392777761e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8798289656639099, + "num_tokens": 22735192.0, + "step": 2800 + }, + { + "entropy": 0.5634661912918091, + "epoch": 1.5829571106094809, + "grad_norm": 1.395089030265808, + "learning_rate": 4.942376124475014e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8594593167304992, + "num_tokens": 22775738.0, + "step": 2805 + }, + { + "entropy": 0.6147814273834229, + "epoch": 1.5857787810383748, + "grad_norm": 1.4737554788589478, + "learning_rate": 4.942171499148492e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.851655924320221, + "num_tokens": 22816401.0, + "step": 2810 + }, + { + "entropy": 0.581537914276123, + "epoch": 1.5886004514672685, + "grad_norm": 1.4444756507873535, + "learning_rate": 4.941966516838393e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8650146484375, + "num_tokens": 22857040.0, + "step": 2815 + }, + { + "entropy": 0.5177268862724305, + "epoch": 1.5914221218961626, + "grad_norm": 1.441490650177002, + "learning_rate": 4.941761177584985e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8717929363250733, + "num_tokens": 22897722.0, + "step": 2820 + }, + { + "entropy": 0.48996912837028506, + "epoch": 1.5942437923250563, + "grad_norm": 1.5225064754486084, + "learning_rate": 4.941555481428607e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8782232880592347, + "num_tokens": 22938229.0, + "step": 2825 + }, + { + "entropy": 0.5194081962108612, + "epoch": 1.5970654627539504, + "grad_norm": 1.3030446767807007, + "learning_rate": 4.94134942840967e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8713892698287964, + "num_tokens": 22978751.0, + "step": 2830 + }, + { + "entropy": 0.5596107244491577, + "epoch": 1.5998871331828441, + "grad_norm": 1.3859238624572754, + "learning_rate": 4.9411430185686505e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8581524729728699, + "num_tokens": 23019325.0, + "step": 2835 + }, + { + "entropy": 0.5840788006782531, + "epoch": 1.6027088036117383, + "grad_norm": 1.4645488262176514, + "learning_rate": 4.940936251946099e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8551864385604858, + "num_tokens": 23060114.0, + "step": 2840 + }, + { + "entropy": 0.5338825523853302, + "epoch": 1.605530474040632, + "grad_norm": 1.3881251811981201, + "learning_rate": 4.940729128582636e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8701825737953186, + "num_tokens": 23100863.0, + "step": 2845 + }, + { + "entropy": 0.5128993272781373, + "epoch": 1.6083521444695261, + "grad_norm": 1.1398290395736694, + "learning_rate": 4.940521648518948e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8785142302513123, + "num_tokens": 23141333.0, + "step": 2850 + }, + { + "entropy": 0.5374301552772522, + "epoch": 1.6111738148984198, + "grad_norm": 1.459583044052124, + "learning_rate": 4.940313811795797e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8682045102119446, + "num_tokens": 23181557.0, + "step": 2855 + }, + { + "entropy": 0.530595988035202, + "epoch": 1.6139954853273137, + "grad_norm": 1.451833963394165, + "learning_rate": 4.9401056184540115e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8683579087257385, + "num_tokens": 23221979.0, + "step": 2860 + }, + { + "entropy": 0.5528306722640991, + "epoch": 1.6168171557562077, + "grad_norm": 1.3938990831375122, + "learning_rate": 4.939897068534491e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8650489091873169, + "num_tokens": 23262727.0, + "step": 2865 + }, + { + "entropy": 0.4935439705848694, + "epoch": 1.6196388261851016, + "grad_norm": 1.3947253227233887, + "learning_rate": 4.939688162078205e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8790611743927002, + "num_tokens": 23303214.0, + "step": 2870 + }, + { + "entropy": 0.5516513884067535, + "epoch": 1.6224604966139955, + "grad_norm": 1.4023181200027466, + "learning_rate": 4.939478899126196e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8614558100700378, + "num_tokens": 23343833.0, + "step": 2875 + }, + { + "entropy": 0.5269099056720734, + "epoch": 1.6252821670428894, + "grad_norm": 1.520416021347046, + "learning_rate": 4.939269279719569e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.868548309803009, + "num_tokens": 23384399.0, + "step": 2880 + }, + { + "entropy": 0.5381790697574615, + "epoch": 1.6281038374717833, + "grad_norm": 1.252475380897522, + "learning_rate": 4.939059303899507e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8675281643867493, + "num_tokens": 23424933.0, + "step": 2885 + }, + { + "entropy": 0.5297176659107208, + "epoch": 1.6309255079006773, + "grad_norm": 1.3621821403503418, + "learning_rate": 4.93884897170726e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8692261695861816, + "num_tokens": 23465388.0, + "step": 2890 + }, + { + "entropy": 0.5208603262901306, + "epoch": 1.6337471783295712, + "grad_norm": 1.6305574178695679, + "learning_rate": 4.9386382831841455e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8741989612579346, + "num_tokens": 23506128.0, + "step": 2895 + }, + { + "entropy": 0.5832960605621338, + "epoch": 1.6365688487584649, + "grad_norm": 1.3824501037597656, + "learning_rate": 4.9384272383715535e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8618869662284852, + "num_tokens": 23546678.0, + "step": 2900 + }, + { + "entropy": 0.5429921388626099, + "epoch": 1.639390519187359, + "grad_norm": 1.4333373308181763, + "learning_rate": 4.938215837310947e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8673049211502075, + "num_tokens": 23587532.0, + "step": 2905 + }, + { + "entropy": 0.4961793005466461, + "epoch": 1.6422121896162527, + "grad_norm": 1.4071818590164185, + "learning_rate": 4.938004080043852e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8742106318473816, + "num_tokens": 23628159.0, + "step": 2910 + }, + { + "entropy": 0.5959875106811523, + "epoch": 1.6450338600451468, + "grad_norm": 1.366908073425293, + "learning_rate": 4.93779196661187e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8543134689331054, + "num_tokens": 23668932.0, + "step": 2915 + }, + { + "entropy": 0.48293390274047854, + "epoch": 1.6478555304740405, + "grad_norm": 1.2694140672683716, + "learning_rate": 4.937579497056671e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8866869568824768, + "num_tokens": 23709665.0, + "step": 2920 + }, + { + "entropy": 0.5429055094718933, + "epoch": 1.6506772009029347, + "grad_norm": 1.4968737363815308, + "learning_rate": 4.937366671419994e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8604987978935241, + "num_tokens": 23750495.0, + "step": 2925 + }, + { + "entropy": 0.49238319993019103, + "epoch": 1.6534988713318284, + "grad_norm": 1.4988641738891602, + "learning_rate": 4.937153489743649e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.875276243686676, + "num_tokens": 23790793.0, + "step": 2930 + }, + { + "entropy": 0.5414620280265808, + "epoch": 1.6563205417607223, + "grad_norm": 1.4088878631591797, + "learning_rate": 4.936939952069515e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8669666767120361, + "num_tokens": 23831543.0, + "step": 2935 + }, + { + "entropy": 0.5236553966999054, + "epoch": 1.6591422121896162, + "grad_norm": 1.4377782344818115, + "learning_rate": 4.936726058439542e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8676124572753906, + "num_tokens": 23872266.0, + "step": 2940 + }, + { + "entropy": 0.5415829122066498, + "epoch": 1.6619638826185101, + "grad_norm": 1.4951739311218262, + "learning_rate": 4.936511808895751e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.865878450870514, + "num_tokens": 23912731.0, + "step": 2945 + }, + { + "entropy": 0.5576195478439331, + "epoch": 1.664785553047404, + "grad_norm": 1.3496601581573486, + "learning_rate": 4.936297203480227e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8663969993591308, + "num_tokens": 23953363.0, + "step": 2950 + }, + { + "entropy": 0.5511267483234406, + "epoch": 1.667607223476298, + "grad_norm": 1.3088607788085938, + "learning_rate": 4.936082242235133e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8627111554145813, + "num_tokens": 23994093.0, + "step": 2955 + }, + { + "entropy": 0.5174029409885407, + "epoch": 1.670428893905192, + "grad_norm": 1.3591479063034058, + "learning_rate": 4.935866925202697e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8729613065719605, + "num_tokens": 24034859.0, + "step": 2960 + }, + { + "entropy": 0.5546496689319611, + "epoch": 1.6732505643340858, + "grad_norm": 1.4932035207748413, + "learning_rate": 4.935651252425219e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8636979460716248, + "num_tokens": 24075192.0, + "step": 2965 + }, + { + "entropy": 0.585486912727356, + "epoch": 1.6760722347629797, + "grad_norm": 1.5926809310913086, + "learning_rate": 4.935435223945066e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8578457713127137, + "num_tokens": 24115948.0, + "step": 2970 + }, + { + "entropy": 0.6288502216339111, + "epoch": 1.6788939051918734, + "grad_norm": 1.4562243223190308, + "learning_rate": 4.935218839804678e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8391013741493225, + "num_tokens": 24156616.0, + "step": 2975 + }, + { + "entropy": 0.518006545305252, + "epoch": 1.6817155756207676, + "grad_norm": 1.3895467519760132, + "learning_rate": 4.9350021000465645e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8746880412101745, + "num_tokens": 24196970.0, + "step": 2980 + }, + { + "entropy": 0.5556329131126404, + "epoch": 1.6845372460496613, + "grad_norm": 1.4481979608535767, + "learning_rate": 4.9347850047133025e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8628229856491089, + "num_tokens": 24237625.0, + "step": 2985 + }, + { + "entropy": 0.557821786403656, + "epoch": 1.6873589164785554, + "grad_norm": 1.4519360065460205, + "learning_rate": 4.934567553847541e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8617467164993287, + "num_tokens": 24278002.0, + "step": 2990 + }, + { + "entropy": 0.4816114008426666, + "epoch": 1.690180586907449, + "grad_norm": 1.354487657546997, + "learning_rate": 4.934349747491998e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.883508825302124, + "num_tokens": 24318792.0, + "step": 2995 + }, + { + "entropy": 0.5235770165920257, + "epoch": 1.6930022573363432, + "grad_norm": 1.4343819618225098, + "learning_rate": 4.934131585689462e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8717373967170715, + "num_tokens": 24359410.0, + "step": 3000 + }, + { + "epoch": 1.6930022573363432, + "eval_entropy": 0.49325600266456604, + "eval_loss": 0.33416908979415894, + "eval_mean_token_accuracy": 0.8984284996986389, + "eval_num_tokens": 24359410.0, + "eval_runtime": 0.1636, + "eval_samples_per_second": 24.448, + "eval_steps_per_second": 6.112, + "step": 3000 + }, + { + "entropy": 0.5068280339241028, + "epoch": 1.695823927765237, + "grad_norm": 1.4546306133270264, + "learning_rate": 4.933913068482792e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8731536269187927, + "num_tokens": 24400072.0, + "step": 3005 + }, + { + "entropy": 0.5603242635726928, + "epoch": 1.698645598194131, + "grad_norm": 1.3821967840194702, + "learning_rate": 4.933694195914913e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8643564581871033, + "num_tokens": 24440775.0, + "step": 3010 + }, + { + "entropy": 0.5545867264270783, + "epoch": 1.7014672686230248, + "grad_norm": 1.3357994556427002, + "learning_rate": 4.9334749680288255e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8690497159957886, + "num_tokens": 24481543.0, + "step": 3015 + }, + { + "entropy": 0.5338212966918945, + "epoch": 1.7042889390519187, + "grad_norm": 1.5392085313796997, + "learning_rate": 4.9332553848675945e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.867198383808136, + "num_tokens": 24522074.0, + "step": 3020 + }, + { + "entropy": 0.5320624589920044, + "epoch": 1.7071106094808126, + "grad_norm": 1.3222757577896118, + "learning_rate": 4.933035446474358e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8634262561798096, + "num_tokens": 24562720.0, + "step": 3025 + }, + { + "entropy": 0.5418852508068085, + "epoch": 1.7099322799097065, + "grad_norm": 1.3481212854385376, + "learning_rate": 4.932815152892323e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8684999108314514, + "num_tokens": 24603277.0, + "step": 3030 + }, + { + "entropy": 0.5434010863304138, + "epoch": 1.7127539503386005, + "grad_norm": 1.4969969987869263, + "learning_rate": 4.932594504164767e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8709228515625, + "num_tokens": 24643930.0, + "step": 3035 + }, + { + "entropy": 0.4718973636627197, + "epoch": 1.7155756207674944, + "grad_norm": 1.2923210859298706, + "learning_rate": 4.932373500335035e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8835036158561707, + "num_tokens": 24684556.0, + "step": 3040 + }, + { + "entropy": 0.5055824279785156, + "epoch": 1.7183972911963883, + "grad_norm": 1.2986489534378052, + "learning_rate": 4.932152141446545e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8733231663703919, + "num_tokens": 24725118.0, + "step": 3045 + }, + { + "entropy": 0.5701562583446502, + "epoch": 1.7212189616252822, + "grad_norm": 1.3613872528076172, + "learning_rate": 4.93193042754278e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8567159414291382, + "num_tokens": 24765797.0, + "step": 3050 + }, + { + "entropy": 0.5157757997512817, + "epoch": 1.7240406320541761, + "grad_norm": 1.4309362173080444, + "learning_rate": 4.931708358667299e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8720048069953918, + "num_tokens": 24806387.0, + "step": 3055 + }, + { + "entropy": 0.5428893506526947, + "epoch": 1.7268623024830698, + "grad_norm": 1.565901756286621, + "learning_rate": 4.9314859348637256e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8697570323944092, + "num_tokens": 24846979.0, + "step": 3060 + }, + { + "entropy": 0.5166443586349487, + "epoch": 1.729683972911964, + "grad_norm": 1.3636257648468018, + "learning_rate": 4.931263156175756e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8698770403862, + "num_tokens": 24887888.0, + "step": 3065 + }, + { + "entropy": 0.5039295554161072, + "epoch": 1.7325056433408577, + "grad_norm": 1.1980102062225342, + "learning_rate": 4.931040022647154e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8756434202194214, + "num_tokens": 24928370.0, + "step": 3070 + }, + { + "entropy": 0.510296243429184, + "epoch": 1.7353273137697518, + "grad_norm": 1.393905758857727, + "learning_rate": 4.930816534321755e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8748240351676941, + "num_tokens": 24969015.0, + "step": 3075 + }, + { + "entropy": 0.4927513003349304, + "epoch": 1.7381489841986455, + "grad_norm": 1.4032167196273804, + "learning_rate": 4.930592691243463e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8734770774841308, + "num_tokens": 25009646.0, + "step": 3080 + }, + { + "entropy": 0.5289837539196014, + "epoch": 1.7409706546275396, + "grad_norm": 1.3683279752731323, + "learning_rate": 4.930368493456252e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8745390772819519, + "num_tokens": 25050335.0, + "step": 3085 + }, + { + "entropy": 0.5285290896892547, + "epoch": 1.7437923250564333, + "grad_norm": 1.4092698097229004, + "learning_rate": 4.930143941004166e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8690651416778564, + "num_tokens": 25090875.0, + "step": 3090 + }, + { + "entropy": 0.5130545377731324, + "epoch": 1.7466139954853275, + "grad_norm": 1.300351619720459, + "learning_rate": 4.9299190339313186e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8777357459068298, + "num_tokens": 25131068.0, + "step": 3095 + }, + { + "entropy": 0.5526080071926117, + "epoch": 1.7494356659142212, + "grad_norm": 1.5255002975463867, + "learning_rate": 4.929693772281892e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.859094786643982, + "num_tokens": 25171825.0, + "step": 3100 + }, + { + "entropy": 0.554760754108429, + "epoch": 1.752257336343115, + "grad_norm": 1.6149297952651978, + "learning_rate": 4.929468156100139e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8624596238136292, + "num_tokens": 25212450.0, + "step": 3105 + }, + { + "entropy": 0.5364702701568603, + "epoch": 1.755079006772009, + "grad_norm": 1.3212907314300537, + "learning_rate": 4.929242185430382e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8656898379325867, + "num_tokens": 25252979.0, + "step": 3110 + }, + { + "entropy": 0.5041682958602905, + "epoch": 1.757900677200903, + "grad_norm": 1.4357374906539917, + "learning_rate": 4.929015860317013e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8751791954040528, + "num_tokens": 25293782.0, + "step": 3115 + }, + { + "entropy": 0.5505750179290771, + "epoch": 1.7607223476297968, + "grad_norm": 1.3972926139831543, + "learning_rate": 4.928789180804494e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8585454225540161, + "num_tokens": 25334421.0, + "step": 3120 + }, + { + "entropy": 0.5151427447795868, + "epoch": 1.7635440180586908, + "grad_norm": 1.1562422513961792, + "learning_rate": 4.9285621469373565e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8717629671096802, + "num_tokens": 25374820.0, + "step": 3125 + }, + { + "entropy": 0.5338060319423675, + "epoch": 1.7663656884875847, + "grad_norm": 1.4789332151412964, + "learning_rate": 4.9283347587602e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8673757314682007, + "num_tokens": 25415684.0, + "step": 3130 + }, + { + "entropy": 0.5420262277126312, + "epoch": 1.7691873589164786, + "grad_norm": 1.34184730052948, + "learning_rate": 4.928107016317697e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8614213824272156, + "num_tokens": 25456228.0, + "step": 3135 + }, + { + "entropy": 0.5315328538417816, + "epoch": 1.7720090293453725, + "grad_norm": 1.4150985479354858, + "learning_rate": 4.927878919654585e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8663432002067566, + "num_tokens": 25497007.0, + "step": 3140 + }, + { + "entropy": 0.5486104905605316, + "epoch": 1.7748306997742662, + "grad_norm": 1.6440742015838623, + "learning_rate": 4.927650468815675e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8615983009338379, + "num_tokens": 25537572.0, + "step": 3145 + }, + { + "entropy": 0.5485983848571777, + "epoch": 1.7776523702031604, + "grad_norm": 1.3117893934249878, + "learning_rate": 4.927421663845847e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8655959486961364, + "num_tokens": 25578393.0, + "step": 3150 + }, + { + "entropy": 0.5457029461860656, + "epoch": 1.780474040632054, + "grad_norm": 1.4625993967056274, + "learning_rate": 4.927192504790048e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8674182772636414, + "num_tokens": 25619127.0, + "step": 3155 + }, + { + "entropy": 0.5470890522003173, + "epoch": 1.7832957110609482, + "grad_norm": 1.3004167079925537, + "learning_rate": 4.926962991693297e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8591425657272339, + "num_tokens": 25659696.0, + "step": 3160 + }, + { + "entropy": 0.5210420250892639, + "epoch": 1.786117381489842, + "grad_norm": 1.358970046043396, + "learning_rate": 4.926733124600682e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8688877105712891, + "num_tokens": 25700593.0, + "step": 3165 + }, + { + "entropy": 0.5520116329193115, + "epoch": 1.788939051918736, + "grad_norm": 1.470733880996704, + "learning_rate": 4.926502903557361e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8626960635185241, + "num_tokens": 25741390.0, + "step": 3170 + }, + { + "entropy": 0.5124341726303101, + "epoch": 1.7917607223476297, + "grad_norm": 1.2576587200164795, + "learning_rate": 4.92627232860856e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8688732981681824, + "num_tokens": 25781767.0, + "step": 3175 + }, + { + "entropy": 0.48606478571891787, + "epoch": 1.7945823927765236, + "grad_norm": 1.2571027278900146, + "learning_rate": 4.926041399799576e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8780399203300476, + "num_tokens": 25822389.0, + "step": 3180 + }, + { + "entropy": 0.5661851406097412, + "epoch": 1.7974040632054176, + "grad_norm": 1.4934290647506714, + "learning_rate": 4.925810117175775e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8613282799720764, + "num_tokens": 25862747.0, + "step": 3185 + }, + { + "entropy": 0.5048659920692444, + "epoch": 1.8002257336343115, + "grad_norm": 1.2331528663635254, + "learning_rate": 4.925578480782593e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8817182898521423, + "num_tokens": 25903452.0, + "step": 3190 + }, + { + "entropy": 0.4995622456073761, + "epoch": 1.8030474040632054, + "grad_norm": 1.370931625366211, + "learning_rate": 4.925346490665533e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8787646174430848, + "num_tokens": 25943952.0, + "step": 3195 + }, + { + "entropy": 0.5398005902767181, + "epoch": 1.8058690744920993, + "grad_norm": 1.33382248878479, + "learning_rate": 4.925114146870172e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8646183371543884, + "num_tokens": 25984602.0, + "step": 3200 + }, + { + "entropy": 0.563685005903244, + "epoch": 1.8086907449209932, + "grad_norm": 1.504831314086914, + "learning_rate": 4.924881449442153e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8542726278305054, + "num_tokens": 26024952.0, + "step": 3205 + }, + { + "entropy": 0.5182668328285217, + "epoch": 1.8115124153498872, + "grad_norm": 1.46697199344635, + "learning_rate": 4.924648398427189e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8678057789802551, + "num_tokens": 26065459.0, + "step": 3210 + }, + { + "entropy": 0.5408389866352081, + "epoch": 1.814334085778781, + "grad_norm": 1.30887770652771, + "learning_rate": 4.924414993871063e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8668316960334778, + "num_tokens": 26106028.0, + "step": 3215 + }, + { + "entropy": 0.542147308588028, + "epoch": 1.8171557562076748, + "grad_norm": 1.524593710899353, + "learning_rate": 4.924181235819627e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8657636642456055, + "num_tokens": 26146540.0, + "step": 3220 + }, + { + "entropy": 0.5389390230178833, + "epoch": 1.819977426636569, + "grad_norm": 1.4004186391830444, + "learning_rate": 4.923947124318804e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8676887512207031, + "num_tokens": 26186216.0, + "step": 3225 + }, + { + "entropy": 0.5603163003921509, + "epoch": 1.8227990970654626, + "grad_norm": 1.5067588090896606, + "learning_rate": 4.923712659414585e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8593494296073914, + "num_tokens": 26226650.0, + "step": 3230 + }, + { + "entropy": 0.5345525562763214, + "epoch": 1.8256207674943568, + "grad_norm": 1.264466643333435, + "learning_rate": 4.923477841153029e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8678050875663758, + "num_tokens": 26267478.0, + "step": 3235 + }, + { + "entropy": 0.4989795684814453, + "epoch": 1.8284424379232505, + "grad_norm": 1.2940160036087036, + "learning_rate": 4.923242669580268e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8774576902389526, + "num_tokens": 26308103.0, + "step": 3240 + }, + { + "entropy": 0.5003684043884278, + "epoch": 1.8312641083521446, + "grad_norm": 1.4770820140838623, + "learning_rate": 4.923007144742501e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8739272236824036, + "num_tokens": 26348574.0, + "step": 3245 + }, + { + "entropy": 0.4989498794078827, + "epoch": 1.8340857787810383, + "grad_norm": 1.2541260719299316, + "learning_rate": 4.922771266685997e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8757034659385681, + "num_tokens": 26389200.0, + "step": 3250 + }, + { + "entropy": 0.5914472699165344, + "epoch": 1.8369074492099324, + "grad_norm": 1.3079047203063965, + "learning_rate": 4.922535035457094e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8483936309814453, + "num_tokens": 26429900.0, + "step": 3255 + }, + { + "entropy": 0.5169390797615051, + "epoch": 1.8397291196388261, + "grad_norm": 1.5444384813308716, + "learning_rate": 4.922298451102199e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8759800791740417, + "num_tokens": 26470465.0, + "step": 3260 + }, + { + "entropy": 0.536392205953598, + "epoch": 1.84255079006772, + "grad_norm": 1.189184546470642, + "learning_rate": 4.922061513667789e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8635899901390076, + "num_tokens": 26511105.0, + "step": 3265 + }, + { + "entropy": 0.5151565790176391, + "epoch": 1.845372460496614, + "grad_norm": 1.5389307737350464, + "learning_rate": 4.921824223200412e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8712221384048462, + "num_tokens": 26551938.0, + "step": 3270 + }, + { + "entropy": 0.5374954879283905, + "epoch": 1.8481941309255079, + "grad_norm": 1.3970680236816406, + "learning_rate": 4.9215865797466826e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8682995200157165, + "num_tokens": 26592540.0, + "step": 3275 + }, + { + "entropy": 0.5597237467765808, + "epoch": 1.8510158013544018, + "grad_norm": 1.3503648042678833, + "learning_rate": 4.921348583353286e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8641868352890014, + "num_tokens": 26633368.0, + "step": 3280 + }, + { + "entropy": 0.5081552624702453, + "epoch": 1.8538374717832957, + "grad_norm": 1.3734320402145386, + "learning_rate": 4.921110234066977e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8727322340011596, + "num_tokens": 26674050.0, + "step": 3285 + }, + { + "entropy": 0.5667516946792602, + "epoch": 1.8566591422121896, + "grad_norm": 1.4739806652069092, + "learning_rate": 4.920871531934579e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8577787399291992, + "num_tokens": 26714528.0, + "step": 3290 + }, + { + "entropy": 0.5383511900901794, + "epoch": 1.8594808126410836, + "grad_norm": 1.450299620628357, + "learning_rate": 4.920632477002985e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8684499502182007, + "num_tokens": 26755102.0, + "step": 3295 + }, + { + "entropy": 0.5264916241168975, + "epoch": 1.8623024830699775, + "grad_norm": 1.339902400970459, + "learning_rate": 4.9203930693191575e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8646404504776001, + "num_tokens": 26795847.0, + "step": 3300 + }, + { + "entropy": 0.501398503780365, + "epoch": 1.8651241534988712, + "grad_norm": 1.3118853569030762, + "learning_rate": 4.920153308930128e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.872932231426239, + "num_tokens": 26836356.0, + "step": 3305 + }, + { + "entropy": 0.5669954061508179, + "epoch": 1.8679458239277653, + "grad_norm": 1.4395278692245483, + "learning_rate": 4.919913195882997e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8604054570198059, + "num_tokens": 26876942.0, + "step": 3310 + }, + { + "entropy": 0.4821522831916809, + "epoch": 1.870767494356659, + "grad_norm": 1.3885427713394165, + "learning_rate": 4.919672730224936e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8788249969482422, + "num_tokens": 26917741.0, + "step": 3315 + }, + { + "entropy": 0.5123970150947571, + "epoch": 1.8735891647855532, + "grad_norm": 1.3463451862335205, + "learning_rate": 4.9194319120031836e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8763728380203247, + "num_tokens": 26957483.0, + "step": 3320 + }, + { + "entropy": 0.5375806808471679, + "epoch": 1.8764108352144468, + "grad_norm": 1.339368462562561, + "learning_rate": 4.91919074126505e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8660783290863037, + "num_tokens": 26998053.0, + "step": 3325 + }, + { + "entropy": 0.5030182540416718, + "epoch": 1.879232505643341, + "grad_norm": 1.2726454734802246, + "learning_rate": 4.91894921805791e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8820128679275513, + "num_tokens": 27038835.0, + "step": 3330 + }, + { + "entropy": 0.5537118554115296, + "epoch": 1.8820541760722347, + "grad_norm": 1.3522690534591675, + "learning_rate": 4.918707342429214e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8657578349113464, + "num_tokens": 27079464.0, + "step": 3335 + }, + { + "entropy": 0.5303456306457519, + "epoch": 1.8848758465011288, + "grad_norm": 1.423553705215454, + "learning_rate": 4.9184651144264776e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8663669109344483, + "num_tokens": 27120201.0, + "step": 3340 + }, + { + "entropy": 0.5344128489494324, + "epoch": 1.8876975169300225, + "grad_norm": 1.223803162574768, + "learning_rate": 4.918222534097286e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8672279477119446, + "num_tokens": 27160686.0, + "step": 3345 + }, + { + "entropy": 0.4743356049060822, + "epoch": 1.8905191873589164, + "grad_norm": 1.3717639446258545, + "learning_rate": 4.917979601489295e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8822289109230042, + "num_tokens": 27201250.0, + "step": 3350 + }, + { + "entropy": 0.5208858251571655, + "epoch": 1.8933408577878104, + "grad_norm": 1.2312960624694824, + "learning_rate": 4.917736316650228e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8755510807037353, + "num_tokens": 27241854.0, + "step": 3355 + }, + { + "entropy": 0.5430815577507019, + "epoch": 1.8961625282167043, + "grad_norm": 1.385640025138855, + "learning_rate": 4.917492679627879e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8614830732345581, + "num_tokens": 27282703.0, + "step": 3360 + }, + { + "entropy": 0.5193690776824951, + "epoch": 1.8989841986455982, + "grad_norm": 1.6769845485687256, + "learning_rate": 4.917248690470109e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8753103852272034, + "num_tokens": 27323255.0, + "step": 3365 + }, + { + "entropy": 0.5402388453483582, + "epoch": 1.9018058690744921, + "grad_norm": 1.4692338705062866, + "learning_rate": 4.917004349224851e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8716601133346558, + "num_tokens": 27363961.0, + "step": 3370 + }, + { + "entropy": 0.5165061771869659, + "epoch": 1.904627539503386, + "grad_norm": 1.2910758256912231, + "learning_rate": 4.916759655940107e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8713883280754089, + "num_tokens": 27404297.0, + "step": 3375 + }, + { + "entropy": 0.5438097000122071, + "epoch": 1.90744920993228, + "grad_norm": 1.5428695678710938, + "learning_rate": 4.916514610663943e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8607651233673096, + "num_tokens": 27444966.0, + "step": 3380 + }, + { + "entropy": 0.5148679494857789, + "epoch": 1.9102708803611739, + "grad_norm": 1.370609164237976, + "learning_rate": 4.916269213444502e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8719611167907715, + "num_tokens": 27485807.0, + "step": 3385 + }, + { + "entropy": 0.552492767572403, + "epoch": 1.9130925507900676, + "grad_norm": 1.3460413217544556, + "learning_rate": 4.9160234643299935e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.867039430141449, + "num_tokens": 27526548.0, + "step": 3390 + }, + { + "entropy": 0.5185729682445526, + "epoch": 1.9159142212189617, + "grad_norm": 1.2497392892837524, + "learning_rate": 4.91577736336869e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8712507843971252, + "num_tokens": 27566660.0, + "step": 3395 + }, + { + "entropy": 0.5251140117645263, + "epoch": 1.9187358916478554, + "grad_norm": 1.4792594909667969, + "learning_rate": 4.915530910608941e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8693275570869445, + "num_tokens": 27607302.0, + "step": 3400 + }, + { + "entropy": 0.5528305053710938, + "epoch": 1.9215575620767495, + "grad_norm": 1.4354878664016724, + "learning_rate": 4.915284106099162e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.864462697505951, + "num_tokens": 27648058.0, + "step": 3405 + }, + { + "entropy": 0.5455857455730438, + "epoch": 1.9243792325056432, + "grad_norm": 1.3338799476623535, + "learning_rate": 4.915036949887838e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8690644383430481, + "num_tokens": 27688285.0, + "step": 3410 + }, + { + "entropy": 0.524651050567627, + "epoch": 1.9272009029345374, + "grad_norm": 1.3062248229980469, + "learning_rate": 4.914789442023523e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8696060180664062, + "num_tokens": 27728945.0, + "step": 3415 + }, + { + "entropy": 0.5397283554077148, + "epoch": 1.930022573363431, + "grad_norm": 1.3946961164474487, + "learning_rate": 4.914541582554838e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8649717569351196, + "num_tokens": 27769755.0, + "step": 3420 + }, + { + "entropy": 0.5519603669643403, + "epoch": 1.9328442437923252, + "grad_norm": 1.3521655797958374, + "learning_rate": 4.914293371530478e-06, + "loss": 0.448, + "mean_token_accuracy": 0.858772075176239, + "num_tokens": 27810361.0, + "step": 3425 + }, + { + "entropy": 0.5126847267150879, + "epoch": 1.935665914221219, + "grad_norm": 1.469565510749817, + "learning_rate": 4.914044808999202e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8749841451644897, + "num_tokens": 27851096.0, + "step": 3430 + }, + { + "entropy": 0.5041461646556854, + "epoch": 1.9384875846501128, + "grad_norm": 1.2633264064788818, + "learning_rate": 4.913795895009841e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8790737509727478, + "num_tokens": 27891803.0, + "step": 3435 + }, + { + "entropy": 0.5427121460437775, + "epoch": 1.9413092550790068, + "grad_norm": 1.4089435338974, + "learning_rate": 4.913546629611294e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8610092759132385, + "num_tokens": 27932403.0, + "step": 3440 + }, + { + "entropy": 0.5261597633361816, + "epoch": 1.9441309255079007, + "grad_norm": 1.4115110635757446, + "learning_rate": 4.913297012852528e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8676816701889039, + "num_tokens": 27973177.0, + "step": 3445 + }, + { + "entropy": 0.5300165891647339, + "epoch": 1.9469525959367946, + "grad_norm": 1.4206523895263672, + "learning_rate": 4.9130470447825816e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8669154524803162, + "num_tokens": 28013878.0, + "step": 3450 + }, + { + "entropy": 0.5528669118881225, + "epoch": 1.9497742663656885, + "grad_norm": 1.4294172525405884, + "learning_rate": 4.912796725450562e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8657429218292236, + "num_tokens": 28054452.0, + "step": 3455 + }, + { + "entropy": 0.5473185420036316, + "epoch": 1.9525959367945824, + "grad_norm": 1.4492708444595337, + "learning_rate": 4.912546054905642e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8618425607681275, + "num_tokens": 28095117.0, + "step": 3460 + }, + { + "entropy": 0.5194451570510864, + "epoch": 1.9554176072234764, + "grad_norm": 1.4601637125015259, + "learning_rate": 4.912295033197068e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8705293416976929, + "num_tokens": 28135700.0, + "step": 3465 + }, + { + "entropy": 0.5036539554595947, + "epoch": 1.9582392776523703, + "grad_norm": 1.4262093305587769, + "learning_rate": 4.9120436603741515e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8786629676818848, + "num_tokens": 28176234.0, + "step": 3470 + }, + { + "entropy": 0.5227799832820892, + "epoch": 1.961060948081264, + "grad_norm": 1.4064563512802124, + "learning_rate": 4.911791936486276e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.871609365940094, + "num_tokens": 28216567.0, + "step": 3475 + }, + { + "entropy": 0.49351215958595274, + "epoch": 1.963882618510158, + "grad_norm": 1.401670217514038, + "learning_rate": 4.911539861582893e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8766814708709717, + "num_tokens": 28257305.0, + "step": 3480 + }, + { + "entropy": 0.49786993861198425, + "epoch": 1.9667042889390518, + "grad_norm": 1.5233041048049927, + "learning_rate": 4.911287435713522e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8762277483940124, + "num_tokens": 28297776.0, + "step": 3485 + }, + { + "entropy": 0.4954915583133698, + "epoch": 1.969525959367946, + "grad_norm": 1.241149663925171, + "learning_rate": 4.911034658927751e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8773471236228942, + "num_tokens": 28338384.0, + "step": 3490 + }, + { + "entropy": 0.5027383029460907, + "epoch": 1.9723476297968396, + "grad_norm": 1.4190332889556885, + "learning_rate": 4.91078153127524e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8748612880706788, + "num_tokens": 28379137.0, + "step": 3495 + }, + { + "entropy": 0.5188802659511567, + "epoch": 1.9751693002257338, + "grad_norm": 1.4441595077514648, + "learning_rate": 4.910528052805714e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8701729416847229, + "num_tokens": 28419987.0, + "step": 3500 + }, + { + "epoch": 1.9751693002257338, + "eval_entropy": 0.5039494037628174, + "eval_loss": 0.32582059502601624, + "eval_mean_token_accuracy": 0.9022613763809204, + "eval_num_tokens": 28419987.0, + "eval_runtime": 0.1635, + "eval_samples_per_second": 24.464, + "eval_steps_per_second": 6.116, + "step": 3500 + }, + { + "entropy": 0.5050791263580322, + "epoch": 1.9779909706546275, + "grad_norm": 1.2525067329406738, + "learning_rate": 4.910274223568971e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8697703242301941, + "num_tokens": 28460369.0, + "step": 3505 + }, + { + "entropy": 0.5229144394397736, + "epoch": 1.9808126410835214, + "grad_norm": 1.4554386138916016, + "learning_rate": 4.9100200436148735e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8699278473854065, + "num_tokens": 28501101.0, + "step": 3510 + }, + { + "entropy": 0.5569809198379516, + "epoch": 1.9836343115124153, + "grad_norm": 1.4474997520446777, + "learning_rate": 4.909765512993357e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.857849943637848, + "num_tokens": 28541729.0, + "step": 3515 + }, + { + "entropy": 0.5587629973888397, + "epoch": 1.9864559819413092, + "grad_norm": 1.3627238273620605, + "learning_rate": 4.909510631754425e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8645955443382263, + "num_tokens": 28582055.0, + "step": 3520 + }, + { + "entropy": 0.5122965276241302, + "epoch": 1.9892776523702032, + "grad_norm": 1.2392799854278564, + "learning_rate": 4.909255399948146e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8727590441703796, + "num_tokens": 28622596.0, + "step": 3525 + }, + { + "entropy": 0.5298355937004089, + "epoch": 1.992099322799097, + "grad_norm": 1.6109238862991333, + "learning_rate": 4.908999817624661e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8664365530014038, + "num_tokens": 28662503.0, + "step": 3530 + }, + { + "entropy": 0.5138360559940338, + "epoch": 1.994920993227991, + "grad_norm": 1.3417595624923706, + "learning_rate": 4.9087438848341806e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8667647480964661, + "num_tokens": 28702848.0, + "step": 3535 + }, + { + "entropy": 0.5267124712467194, + "epoch": 1.997742663656885, + "grad_norm": 1.2989228963851929, + "learning_rate": 4.908487601626983e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8676295042037964, + "num_tokens": 28743457.0, + "step": 3540 + }, + { + "entropy": 0.5033242404460907, + "epoch": 2.000564334085779, + "grad_norm": 1.0862282514572144, + "learning_rate": 4.9082309680534134e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8765107035636902, + "num_tokens": 28777728.0, + "step": 3545 + }, + { + "entropy": 0.4844656944274902, + "epoch": 2.0033860045146725, + "grad_norm": 1.1120542287826538, + "learning_rate": 4.907973984163888e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.9055742740631103, + "num_tokens": 28818642.0, + "step": 3550 + }, + { + "entropy": 0.4910138726234436, + "epoch": 2.0062076749435667, + "grad_norm": 1.2199798822402954, + "learning_rate": 4.907716650008893e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8892881631851196, + "num_tokens": 28859021.0, + "step": 3555 + }, + { + "entropy": 0.4198545038700104, + "epoch": 2.0090293453724604, + "grad_norm": 1.348246693611145, + "learning_rate": 4.907458965638979e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.9059156775474548, + "num_tokens": 28899517.0, + "step": 3560 + }, + { + "entropy": 0.42251318097114565, + "epoch": 2.0118510158013545, + "grad_norm": 1.4219539165496826, + "learning_rate": 4.90720093110477e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8997484803199768, + "num_tokens": 28940360.0, + "step": 3565 + }, + { + "entropy": 0.4202295243740082, + "epoch": 2.014672686230248, + "grad_norm": 1.5008666515350342, + "learning_rate": 4.906942546456957e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8997807145118714, + "num_tokens": 28981158.0, + "step": 3570 + }, + { + "entropy": 0.39814443588256837, + "epoch": 2.0174943566591423, + "grad_norm": 1.4986308813095093, + "learning_rate": 4.906683811746298e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.9047508955001831, + "num_tokens": 29021722.0, + "step": 3575 + }, + { + "entropy": 0.4490103781223297, + "epoch": 2.020316027088036, + "grad_norm": 1.7003601789474487, + "learning_rate": 4.9064247270236235e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8966346383094788, + "num_tokens": 29062372.0, + "step": 3580 + }, + { + "entropy": 0.440228271484375, + "epoch": 2.02313769751693, + "grad_norm": 1.8546236753463745, + "learning_rate": 4.906165292339828e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.899936830997467, + "num_tokens": 29102844.0, + "step": 3585 + }, + { + "entropy": 0.39439972043037413, + "epoch": 2.025959367945824, + "grad_norm": 1.4125168323516846, + "learning_rate": 4.905905507745881e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.9061366796493531, + "num_tokens": 29143188.0, + "step": 3590 + }, + { + "entropy": 0.40190274715423585, + "epoch": 2.028781038374718, + "grad_norm": 1.5160431861877441, + "learning_rate": 4.905645373292815e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.9052773237228393, + "num_tokens": 29183656.0, + "step": 3595 + }, + { + "entropy": 0.4064796566963196, + "epoch": 2.0316027088036117, + "grad_norm": 1.6123907566070557, + "learning_rate": 4.905384889031734e-06, + "loss": 0.302, + "mean_token_accuracy": 0.9012960076332093, + "num_tokens": 29224451.0, + "step": 3600 + }, + { + "entropy": 0.4297832429409027, + "epoch": 2.034424379232506, + "grad_norm": 1.5297926664352417, + "learning_rate": 4.90512405501381e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8968385696411133, + "num_tokens": 29265091.0, + "step": 3605 + }, + { + "entropy": 0.41181485652923583, + "epoch": 2.0372460496613995, + "grad_norm": 1.4415594339370728, + "learning_rate": 4.904862871290285e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.9039250016212463, + "num_tokens": 29305558.0, + "step": 3610 + }, + { + "entropy": 0.4206926107406616, + "epoch": 2.0400677200902932, + "grad_norm": 1.4016162157058716, + "learning_rate": 4.904601337912467e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.9045567750930786, + "num_tokens": 29346298.0, + "step": 3615 + }, + { + "entropy": 0.3947125256061554, + "epoch": 2.0428893905191874, + "grad_norm": 1.505416989326477, + "learning_rate": 4.9043394549317345e-06, + "loss": 0.2789, + "mean_token_accuracy": 0.9085012078285217, + "num_tokens": 29387055.0, + "step": 3620 + }, + { + "entropy": 0.39603736996650696, + "epoch": 2.045711060948081, + "grad_norm": 1.6036330461502075, + "learning_rate": 4.904077222399534e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.90137619972229, + "num_tokens": 29427763.0, + "step": 3625 + }, + { + "entropy": 0.3889478862285614, + "epoch": 2.0485327313769752, + "grad_norm": 1.672661304473877, + "learning_rate": 4.903814640367383e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.9071918845176696, + "num_tokens": 29468411.0, + "step": 3630 + }, + { + "entropy": 0.39558014273643494, + "epoch": 2.051354401805869, + "grad_norm": 1.556510090827942, + "learning_rate": 4.903551708886865e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.9074760317802429, + "num_tokens": 29509088.0, + "step": 3635 + }, + { + "entropy": 0.41549997925758364, + "epoch": 2.054176072234763, + "grad_norm": 1.4191838502883911, + "learning_rate": 4.903288428009632e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.9053076505661011, + "num_tokens": 29549728.0, + "step": 3640 + }, + { + "entropy": 0.4345362961292267, + "epoch": 2.0569977426636568, + "grad_norm": 1.6131196022033691, + "learning_rate": 4.9030247977874064e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8968990206718445, + "num_tokens": 29590362.0, + "step": 3645 + }, + { + "entropy": 0.38488792777061465, + "epoch": 2.059819413092551, + "grad_norm": 1.391592025756836, + "learning_rate": 4.902760818271978e-06, + "loss": 0.2758, + "mean_token_accuracy": 0.9102860927581787, + "num_tokens": 29630543.0, + "step": 3650 + }, + { + "entropy": 0.4055006742477417, + "epoch": 2.0626410835214446, + "grad_norm": 1.5487695932388306, + "learning_rate": 4.902496489515206e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8991694211959839, + "num_tokens": 29671134.0, + "step": 3655 + }, + { + "entropy": 0.4026013076305389, + "epoch": 2.0654627539503387, + "grad_norm": 1.4963382482528687, + "learning_rate": 4.902231811569016e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.9023310422897339, + "num_tokens": 29711975.0, + "step": 3660 + }, + { + "entropy": 0.3941408574581146, + "epoch": 2.0682844243792324, + "grad_norm": 1.4817650318145752, + "learning_rate": 4.901966784485407e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.9089763045310975, + "num_tokens": 29752694.0, + "step": 3665 + }, + { + "entropy": 0.4023356318473816, + "epoch": 2.0711060948081266, + "grad_norm": 1.6465110778808594, + "learning_rate": 4.901701408316443e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.908521831035614, + "num_tokens": 29793192.0, + "step": 3670 + }, + { + "entropy": 0.38578653931617735, + "epoch": 2.0739277652370203, + "grad_norm": 1.5175875425338745, + "learning_rate": 4.901435683114255e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.9055356621742249, + "num_tokens": 29834115.0, + "step": 3675 + }, + { + "entropy": 0.3811330258846283, + "epoch": 2.0767494356659144, + "grad_norm": 1.527234435081482, + "learning_rate": 4.901169608931046e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.9086094379425049, + "num_tokens": 29874790.0, + "step": 3680 + }, + { + "entropy": 0.3947294354438782, + "epoch": 2.079571106094808, + "grad_norm": 1.5221364498138428, + "learning_rate": 4.900903185819088e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.9065226435661315, + "num_tokens": 29915452.0, + "step": 3685 + }, + { + "entropy": 0.372803258895874, + "epoch": 2.082392776523702, + "grad_norm": 1.4854118824005127, + "learning_rate": 4.900636413830717e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.9063424587249755, + "num_tokens": 29956198.0, + "step": 3690 + }, + { + "entropy": 0.3771813929080963, + "epoch": 2.085214446952596, + "grad_norm": 1.432308316230774, + "learning_rate": 4.900369293018342e-06, + "loss": 0.2663, + "mean_token_accuracy": 0.9135713815689087, + "num_tokens": 29996744.0, + "step": 3695 + }, + { + "entropy": 0.3829148352146149, + "epoch": 2.0880361173814896, + "grad_norm": 1.5369491577148438, + "learning_rate": 4.900101823434438e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.9079620480537415, + "num_tokens": 30037615.0, + "step": 3700 + }, + { + "entropy": 0.40159552693367007, + "epoch": 2.090857787810384, + "grad_norm": 1.6757155656814575, + "learning_rate": 4.8998340051315515e-06, + "loss": 0.2867, + "mean_token_accuracy": 0.9064762830734253, + "num_tokens": 30078214.0, + "step": 3705 + }, + { + "entropy": 0.393253880739212, + "epoch": 2.0936794582392775, + "grad_norm": 1.5468506813049316, + "learning_rate": 4.899565838162292e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.9056425452232361, + "num_tokens": 30118900.0, + "step": 3710 + }, + { + "entropy": 0.42414683699607847, + "epoch": 2.0965011286681716, + "grad_norm": 1.736525297164917, + "learning_rate": 4.899297322579345e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8957733750343323, + "num_tokens": 30159519.0, + "step": 3715 + }, + { + "entropy": 0.3917936205863953, + "epoch": 2.0993227990970653, + "grad_norm": 1.5324411392211914, + "learning_rate": 4.899028458435458e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.9044302463531494, + "num_tokens": 30199519.0, + "step": 3720 + }, + { + "entropy": 0.38330122232437136, + "epoch": 2.1021444695259595, + "grad_norm": 1.4559762477874756, + "learning_rate": 4.898759245783449e-06, + "loss": 0.2755, + "mean_token_accuracy": 0.9099204063415527, + "num_tokens": 30239952.0, + "step": 3725 + }, + { + "entropy": 0.3951174974441528, + "epoch": 2.104966139954853, + "grad_norm": 1.5351479053497314, + "learning_rate": 4.898489684676205e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.9066962122917175, + "num_tokens": 30280534.0, + "step": 3730 + }, + { + "entropy": 0.3635559678077698, + "epoch": 2.1077878103837473, + "grad_norm": 1.50784170627594, + "learning_rate": 4.898219775166683e-06, + "loss": 0.2621, + "mean_token_accuracy": 0.9133730411529541, + "num_tokens": 30321115.0, + "step": 3735 + }, + { + "entropy": 0.41765828132629396, + "epoch": 2.110609480812641, + "grad_norm": 1.554824709892273, + "learning_rate": 4.897949517307905e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.898926043510437, + "num_tokens": 30361715.0, + "step": 3740 + }, + { + "entropy": 0.40577141046524046, + "epoch": 2.113431151241535, + "grad_norm": 1.543889045715332, + "learning_rate": 4.897678911152964e-06, + "loss": 0.302, + "mean_token_accuracy": 0.9033358812332153, + "num_tokens": 30402151.0, + "step": 3745 + }, + { + "entropy": 0.3970208168029785, + "epoch": 2.116252821670429, + "grad_norm": 1.4221984148025513, + "learning_rate": 4.897407956755021e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.9049067020416259, + "num_tokens": 30442797.0, + "step": 3750 + }, + { + "entropy": 0.41688573360443115, + "epoch": 2.119074492099323, + "grad_norm": 1.6445451974868774, + "learning_rate": 4.897136654167304e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.9019560813903809, + "num_tokens": 30483552.0, + "step": 3755 + }, + { + "entropy": 0.34632920026779174, + "epoch": 2.1218961625282167, + "grad_norm": 1.3843744993209839, + "learning_rate": 4.896865003443111e-06, + "loss": 0.2493, + "mean_token_accuracy": 0.9194693088531494, + "num_tokens": 30524141.0, + "step": 3760 + }, + { + "entropy": 0.41517892479896545, + "epoch": 2.124717832957111, + "grad_norm": 1.4717671871185303, + "learning_rate": 4.896593004635807e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.9012321352958679, + "num_tokens": 30564727.0, + "step": 3765 + }, + { + "entropy": 0.3839627206325531, + "epoch": 2.1275395033860045, + "grad_norm": 1.5019539594650269, + "learning_rate": 4.896320657798828e-06, + "loss": 0.2671, + "mean_token_accuracy": 0.9121496915817261, + "num_tokens": 30605161.0, + "step": 3770 + }, + { + "entropy": 0.40346505045890807, + "epoch": 2.130361173814898, + "grad_norm": 1.8757257461547852, + "learning_rate": 4.896047962985676e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.9000169515609742, + "num_tokens": 30645838.0, + "step": 3775 + }, + { + "entropy": 0.40536361932754517, + "epoch": 2.1331828442437923, + "grad_norm": 1.508545994758606, + "learning_rate": 4.89577492024992e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.899243688583374, + "num_tokens": 30686256.0, + "step": 3780 + }, + { + "entropy": 0.4287443220615387, + "epoch": 2.136004514672686, + "grad_norm": 1.4562077522277832, + "learning_rate": 4.895501529645201e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8983871340751648, + "num_tokens": 30726917.0, + "step": 3785 + }, + { + "entropy": 0.434832763671875, + "epoch": 2.13882618510158, + "grad_norm": 1.5828227996826172, + "learning_rate": 4.895227791225228e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.905066967010498, + "num_tokens": 30767481.0, + "step": 3790 + }, + { + "entropy": 0.4244759321212769, + "epoch": 2.141647855530474, + "grad_norm": 1.6923705339431763, + "learning_rate": 4.894953705043774e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.901056170463562, + "num_tokens": 30808039.0, + "step": 3795 + }, + { + "entropy": 0.40609169006347656, + "epoch": 2.144469525959368, + "grad_norm": 1.470828652381897, + "learning_rate": 4.894679271154684e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.9032929420471192, + "num_tokens": 30848528.0, + "step": 3800 + }, + { + "entropy": 0.38831552267074587, + "epoch": 2.1472911963882617, + "grad_norm": 1.4154300689697266, + "learning_rate": 4.894404489611872e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.9009493231773377, + "num_tokens": 30889149.0, + "step": 3805 + }, + { + "entropy": 0.36803258061408994, + "epoch": 2.150112866817156, + "grad_norm": 1.5650067329406738, + "learning_rate": 4.894129360469317e-06, + "loss": 0.2586, + "mean_token_accuracy": 0.9152506470680237, + "num_tokens": 30929871.0, + "step": 3810 + }, + { + "entropy": 0.41122249364852903, + "epoch": 2.1529345372460496, + "grad_norm": 1.7758678197860718, + "learning_rate": 4.89385388378107e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.9019395470619201, + "num_tokens": 30970228.0, + "step": 3815 + }, + { + "entropy": 0.3893428444862366, + "epoch": 2.1557562076749437, + "grad_norm": 1.581362009048462, + "learning_rate": 4.893578059601249e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.9072283625602722, + "num_tokens": 31010919.0, + "step": 3820 + }, + { + "entropy": 0.3995703637599945, + "epoch": 2.1585778781038374, + "grad_norm": 1.6888242959976196, + "learning_rate": 4.893301887984036e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8979094982147217, + "num_tokens": 31051717.0, + "step": 3825 + }, + { + "entropy": 0.3909620702266693, + "epoch": 2.1613995485327315, + "grad_norm": 1.589010238647461, + "learning_rate": 4.893025368983688e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.9069844603538513, + "num_tokens": 31092303.0, + "step": 3830 + }, + { + "entropy": 0.38756829500198364, + "epoch": 2.1642212189616252, + "grad_norm": 1.4219690561294556, + "learning_rate": 4.892748502654527e-06, + "loss": 0.2725, + "mean_token_accuracy": 0.9117750763893128, + "num_tokens": 31133212.0, + "step": 3835 + }, + { + "entropy": 0.39158605933189394, + "epoch": 2.1670428893905194, + "grad_norm": 1.5833606719970703, + "learning_rate": 4.892471289050942e-06, + "loss": 0.2781, + "mean_token_accuracy": 0.9098140478134156, + "num_tokens": 31173102.0, + "step": 3840 + }, + { + "entropy": 0.4036704897880554, + "epoch": 2.169864559819413, + "grad_norm": 1.4913930892944336, + "learning_rate": 4.892193728227393e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.9021237850189209, + "num_tokens": 31213865.0, + "step": 3845 + }, + { + "entropy": 0.394330632686615, + "epoch": 2.172686230248307, + "grad_norm": 1.8555155992507935, + "learning_rate": 4.891915820238406e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8976067423820495, + "num_tokens": 31254408.0, + "step": 3850 + }, + { + "entropy": 0.4108275890350342, + "epoch": 2.175507900677201, + "grad_norm": 1.4580622911453247, + "learning_rate": 4.891637565138578e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8978480458259582, + "num_tokens": 31295211.0, + "step": 3855 + }, + { + "entropy": 0.42848613262176516, + "epoch": 2.1783295711060946, + "grad_norm": 1.54513680934906, + "learning_rate": 4.891358962982569e-06, + "loss": 0.308, + "mean_token_accuracy": 0.9015236139297486, + "num_tokens": 31335702.0, + "step": 3860 + }, + { + "entropy": 0.4215279221534729, + "epoch": 2.1811512415349887, + "grad_norm": 1.7928465604782104, + "learning_rate": 4.891080013825112e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.9013218641281128, + "num_tokens": 31376407.0, + "step": 3865 + }, + { + "entropy": 0.4218919575214386, + "epoch": 2.1839729119638824, + "grad_norm": 1.5700446367263794, + "learning_rate": 4.890800717721007e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.9020668745040894, + "num_tokens": 31416993.0, + "step": 3870 + }, + { + "entropy": 0.40297368764877317, + "epoch": 2.1867945823927766, + "grad_norm": 1.5355396270751953, + "learning_rate": 4.890521074725122e-06, + "loss": 0.2787, + "mean_token_accuracy": 0.9106267809867858, + "num_tokens": 31457735.0, + "step": 3875 + }, + { + "entropy": 0.4444594144821167, + "epoch": 2.1896162528216703, + "grad_norm": 1.762122631072998, + "learning_rate": 4.890241084892392e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8961127758026123, + "num_tokens": 31498437.0, + "step": 3880 + }, + { + "entropy": 0.39649640321731566, + "epoch": 2.1924379232505644, + "grad_norm": 1.3777143955230713, + "learning_rate": 4.889960748277821e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.9072598814964294, + "num_tokens": 31539261.0, + "step": 3885 + }, + { + "entropy": 0.36735162138938904, + "epoch": 2.195259593679458, + "grad_norm": 1.510568618774414, + "learning_rate": 4.889680064936483e-06, + "loss": 0.2806, + "mean_token_accuracy": 0.9091784954071045, + "num_tokens": 31580151.0, + "step": 3890 + }, + { + "entropy": 0.3932835698127747, + "epoch": 2.1980812641083523, + "grad_norm": 1.311471939086914, + "learning_rate": 4.889399034923515e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.9054880023002625, + "num_tokens": 31620726.0, + "step": 3895 + }, + { + "entropy": 0.4081146061420441, + "epoch": 2.200902934537246, + "grad_norm": 1.7176251411437988, + "learning_rate": 4.889117658294128e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.9020537614822388, + "num_tokens": 31661526.0, + "step": 3900 + }, + { + "entropy": 0.4014622807502747, + "epoch": 2.20372460496614, + "grad_norm": 1.5755354166030884, + "learning_rate": 4.888835935103598e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.9023280262947082, + "num_tokens": 31702085.0, + "step": 3905 + }, + { + "entropy": 0.39336706399917604, + "epoch": 2.206546275395034, + "grad_norm": 1.734265923500061, + "learning_rate": 4.888553865407269e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.905624508857727, + "num_tokens": 31742379.0, + "step": 3910 + }, + { + "entropy": 0.3690111577510834, + "epoch": 2.209367945823928, + "grad_norm": 1.5115963220596313, + "learning_rate": 4.888271449260554e-06, + "loss": 0.26, + "mean_token_accuracy": 0.9165234088897705, + "num_tokens": 31783016.0, + "step": 3915 + }, + { + "entropy": 0.3873672723770142, + "epoch": 2.2121896162528216, + "grad_norm": 1.7038564682006836, + "learning_rate": 4.887988686718933e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.9066754698753356, + "num_tokens": 31823595.0, + "step": 3920 + }, + { + "entropy": 0.3954701006412506, + "epoch": 2.2150112866817158, + "grad_norm": 1.6473407745361328, + "learning_rate": 4.887705577837957e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.9045971989631653, + "num_tokens": 31864135.0, + "step": 3925 + }, + { + "entropy": 0.38976659178733825, + "epoch": 2.2178329571106095, + "grad_norm": 1.4724841117858887, + "learning_rate": 4.88742212267324e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.9068511605262757, + "num_tokens": 31904819.0, + "step": 3930 + }, + { + "entropy": 0.40335485339164734, + "epoch": 2.2206546275395036, + "grad_norm": 1.5783131122589111, + "learning_rate": 4.887138321280468e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.9015839457511902, + "num_tokens": 31945530.0, + "step": 3935 + }, + { + "entropy": 0.3923260450363159, + "epoch": 2.2234762979683973, + "grad_norm": 1.4128057956695557, + "learning_rate": 4.886854173715393e-06, + "loss": 0.2819, + "mean_token_accuracy": 0.9102500557899476, + "num_tokens": 31986171.0, + "step": 3940 + }, + { + "entropy": 0.4288273572921753, + "epoch": 2.226297968397291, + "grad_norm": 1.6166179180145264, + "learning_rate": 4.886569680033837e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8994893431663513, + "num_tokens": 32026712.0, + "step": 3945 + }, + { + "entropy": 0.4336235702037811, + "epoch": 2.229119638826185, + "grad_norm": 1.6575360298156738, + "learning_rate": 4.886284840291689e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8974316120147705, + "num_tokens": 32067338.0, + "step": 3950 + }, + { + "entropy": 0.40959821343421937, + "epoch": 2.231941309255079, + "grad_norm": 1.4833348989486694, + "learning_rate": 4.885999654544904e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.9015701293945313, + "num_tokens": 32108045.0, + "step": 3955 + }, + { + "entropy": 0.43567387461662294, + "epoch": 2.234762979683973, + "grad_norm": 1.6455271244049072, + "learning_rate": 4.885714122849509e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8950015425682067, + "num_tokens": 32148724.0, + "step": 3960 + }, + { + "entropy": 0.41589171886444093, + "epoch": 2.2375846501128667, + "grad_norm": 1.6400325298309326, + "learning_rate": 4.885428245261596e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8991590142250061, + "num_tokens": 32189079.0, + "step": 3965 + }, + { + "entropy": 0.4249677717685699, + "epoch": 2.240406320541761, + "grad_norm": 2.002429485321045, + "learning_rate": 4.885142021837323e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.9002622365951538, + "num_tokens": 32229786.0, + "step": 3970 + }, + { + "entropy": 0.42671540975570676, + "epoch": 2.2432279909706545, + "grad_norm": 1.9370062351226807, + "learning_rate": 4.8848554526329236e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8927700161933899, + "num_tokens": 32270581.0, + "step": 3975 + }, + { + "entropy": 0.4029369592666626, + "epoch": 2.2460496613995486, + "grad_norm": 1.457308053970337, + "learning_rate": 4.88456853770469e-06, + "loss": 0.299, + "mean_token_accuracy": 0.9022388100624085, + "num_tokens": 32311159.0, + "step": 3980 + }, + { + "entropy": 0.4298437058925629, + "epoch": 2.2488713318284423, + "grad_norm": 1.4605598449707031, + "learning_rate": 4.88428127710899e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8953496932983398, + "num_tokens": 32351769.0, + "step": 3985 + }, + { + "entropy": 0.4445444464683533, + "epoch": 2.2516930022573365, + "grad_norm": 1.861961841583252, + "learning_rate": 4.883993670902254e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.895211398601532, + "num_tokens": 32392495.0, + "step": 3990 + }, + { + "entropy": 0.4017652451992035, + "epoch": 2.25451467268623, + "grad_norm": 1.816008448600769, + "learning_rate": 4.883705719140982e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.9027621269226074, + "num_tokens": 32432976.0, + "step": 3995 + }, + { + "entropy": 0.3940592765808105, + "epoch": 2.2573363431151243, + "grad_norm": 1.5475646257400513, + "learning_rate": 4.883417421881744e-06, + "loss": 0.294, + "mean_token_accuracy": 0.9063061594963073, + "num_tokens": 32473442.0, + "step": 4000 + }, + { + "epoch": 2.2573363431151243, + "eval_entropy": 0.402061402797699, + "eval_loss": 0.27405405044555664, + "eval_mean_token_accuracy": 0.9164431095123291, + "eval_num_tokens": 32473442.0, + "eval_runtime": 0.1642, + "eval_samples_per_second": 24.354, + "eval_steps_per_second": 6.089, + "step": 4000 + }, + { + "entropy": 0.4588034152984619, + "epoch": 2.260158013544018, + "grad_norm": 1.6592538356781006, + "learning_rate": 4.883128779181174e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.889747953414917, + "num_tokens": 32514004.0, + "step": 4005 + }, + { + "entropy": 0.3681566655635834, + "epoch": 2.2629796839729117, + "grad_norm": 1.5150611400604248, + "learning_rate": 4.8828397910959766e-06, + "loss": 0.2613, + "mean_token_accuracy": 0.9151654362678527, + "num_tokens": 32554858.0, + "step": 4010 + }, + { + "entropy": 0.40542457699775697, + "epoch": 2.265801354401806, + "grad_norm": 1.5727524757385254, + "learning_rate": 4.882550457682924e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8985041737556457, + "num_tokens": 32595341.0, + "step": 4015 + }, + { + "entropy": 0.38482665419578554, + "epoch": 2.2686230248307, + "grad_norm": 1.5396711826324463, + "learning_rate": 4.8822607789988565e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.9044581055641174, + "num_tokens": 32636025.0, + "step": 4020 + }, + { + "entropy": 0.4148333430290222, + "epoch": 2.2714446952595937, + "grad_norm": 1.5504050254821777, + "learning_rate": 4.881970755100679e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.901849901676178, + "num_tokens": 32676078.0, + "step": 4025 + }, + { + "entropy": 0.4355687737464905, + "epoch": 2.2742663656884874, + "grad_norm": 3.5764992237091064, + "learning_rate": 4.88168038604537e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8947890400886536, + "num_tokens": 32716756.0, + "step": 4030 + }, + { + "entropy": 0.4102937400341034, + "epoch": 2.2770880361173815, + "grad_norm": 1.5980452299118042, + "learning_rate": 4.881389671889969e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.9003346800804138, + "num_tokens": 32757375.0, + "step": 4035 + }, + { + "entropy": 0.4062139928340912, + "epoch": 2.2799097065462752, + "grad_norm": 1.4203381538391113, + "learning_rate": 4.881098612691589e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.9053402423858643, + "num_tokens": 32797973.0, + "step": 4040 + }, + { + "entropy": 0.42087835669517515, + "epoch": 2.2827313769751694, + "grad_norm": 1.6059361696243286, + "learning_rate": 4.880807208507409e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8996002554893494, + "num_tokens": 32838742.0, + "step": 4045 + }, + { + "entropy": 0.3775734961032867, + "epoch": 2.285553047404063, + "grad_norm": 1.5873465538024902, + "learning_rate": 4.880515459394674e-06, + "loss": 0.2564, + "mean_token_accuracy": 0.9160468459129334, + "num_tokens": 32879298.0, + "step": 4050 + }, + { + "entropy": 0.3923003554344177, + "epoch": 2.288374717832957, + "grad_norm": 1.7001484632492065, + "learning_rate": 4.880223365410699e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.9026227593421936, + "num_tokens": 32920082.0, + "step": 4055 + }, + { + "entropy": 0.41630412340164186, + "epoch": 2.291196388261851, + "grad_norm": 1.824363350868225, + "learning_rate": 4.879930926612866e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8960343599319458, + "num_tokens": 32960716.0, + "step": 4060 + }, + { + "entropy": 0.4242856979370117, + "epoch": 2.294018058690745, + "grad_norm": 1.5266849994659424, + "learning_rate": 4.879638143058625e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.9029630541801452, + "num_tokens": 33001292.0, + "step": 4065 + }, + { + "entropy": 0.4204494059085846, + "epoch": 2.2968397291196387, + "grad_norm": 1.8229271173477173, + "learning_rate": 4.879345014805491e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.9004954934120178, + "num_tokens": 33041946.0, + "step": 4070 + }, + { + "entropy": 0.40997507572174074, + "epoch": 2.299661399548533, + "grad_norm": 1.7101879119873047, + "learning_rate": 4.8790515419110516e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.9026644349098205, + "num_tokens": 33082717.0, + "step": 4075 + }, + { + "entropy": 0.3944500207901001, + "epoch": 2.3024830699774266, + "grad_norm": 1.7713676691055298, + "learning_rate": 4.8787577244329585e-06, + "loss": 0.2867, + "mean_token_accuracy": 0.9076826214790344, + "num_tokens": 33123211.0, + "step": 4080 + }, + { + "entropy": 0.4161556899547577, + "epoch": 2.3053047404063207, + "grad_norm": 1.6283953189849854, + "learning_rate": 4.878463562428933e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8988382816314697, + "num_tokens": 33164049.0, + "step": 4085 + }, + { + "entropy": 0.4177008092403412, + "epoch": 2.3081264108352144, + "grad_norm": 1.6907880306243896, + "learning_rate": 4.878169055956763e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8997668266296387, + "num_tokens": 33204881.0, + "step": 4090 + }, + { + "entropy": 0.3897433698177338, + "epoch": 2.310948081264108, + "grad_norm": 1.4863002300262451, + "learning_rate": 4.877874205074303e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.9074371457099915, + "num_tokens": 33245356.0, + "step": 4095 + }, + { + "entropy": 0.41926553249359133, + "epoch": 2.3137697516930023, + "grad_norm": 1.7184562683105469, + "learning_rate": 4.877579009839478e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8960996985435485, + "num_tokens": 33285939.0, + "step": 4100 + }, + { + "entropy": 0.39500190019607545, + "epoch": 2.3165914221218964, + "grad_norm": 1.7541210651397705, + "learning_rate": 4.877283470310279e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.9074315071105957, + "num_tokens": 33326588.0, + "step": 4105 + }, + { + "entropy": 0.40492220520973204, + "epoch": 2.31941309255079, + "grad_norm": 1.7052255868911743, + "learning_rate": 4.876987586544765e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8954932928085327, + "num_tokens": 33367169.0, + "step": 4110 + }, + { + "entropy": 0.38237152695655824, + "epoch": 2.322234762979684, + "grad_norm": 1.6368436813354492, + "learning_rate": 4.876691358601061e-06, + "loss": 0.2849, + "mean_token_accuracy": 0.9079111337661743, + "num_tokens": 33407822.0, + "step": 4115 + }, + { + "entropy": 0.42206219434738157, + "epoch": 2.325056433408578, + "grad_norm": 1.7534505128860474, + "learning_rate": 4.876394786537362e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.899584424495697, + "num_tokens": 33448607.0, + "step": 4120 + }, + { + "entropy": 0.3744909644126892, + "epoch": 2.3278781038374716, + "grad_norm": 1.4802114963531494, + "learning_rate": 4.87609787041193e-06, + "loss": 0.278, + "mean_token_accuracy": 0.9111004710197449, + "num_tokens": 33489164.0, + "step": 4125 + }, + { + "entropy": 0.3757921814918518, + "epoch": 2.3306997742663658, + "grad_norm": 1.6543906927108765, + "learning_rate": 4.875800610283092e-06, + "loss": 0.2765, + "mean_token_accuracy": 0.9108646512031555, + "num_tokens": 33529990.0, + "step": 4130 + }, + { + "entropy": 0.42251219749450686, + "epoch": 2.3335214446952595, + "grad_norm": 1.6640608310699463, + "learning_rate": 4.875503006209249e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8940789937973023, + "num_tokens": 33570747.0, + "step": 4135 + }, + { + "entropy": 0.3960157215595245, + "epoch": 2.3363431151241536, + "grad_norm": 1.8502579927444458, + "learning_rate": 4.875205058248861e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.9076458573341369, + "num_tokens": 33611154.0, + "step": 4140 + }, + { + "entropy": 0.41452285647392273, + "epoch": 2.3391647855530473, + "grad_norm": 1.5899200439453125, + "learning_rate": 4.874906766460463e-06, + "loss": 0.314, + "mean_token_accuracy": 0.9006463170051575, + "num_tokens": 33651898.0, + "step": 4145 + }, + { + "entropy": 0.3915266990661621, + "epoch": 2.3419864559819414, + "grad_norm": 1.3926752805709839, + "learning_rate": 4.874608130902653e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.9062711000442505, + "num_tokens": 33692461.0, + "step": 4150 + }, + { + "entropy": 0.380002897977829, + "epoch": 2.344808126410835, + "grad_norm": 1.4338423013687134, + "learning_rate": 4.874309151634098e-06, + "loss": 0.2729, + "mean_token_accuracy": 0.9096962571144104, + "num_tokens": 33732460.0, + "step": 4155 + }, + { + "entropy": 0.40470706224441527, + "epoch": 2.3476297968397293, + "grad_norm": 1.7443944215774536, + "learning_rate": 4.874009828713532e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.9021994709968567, + "num_tokens": 33773109.0, + "step": 4160 + }, + { + "entropy": 0.3907041311264038, + "epoch": 2.350451467268623, + "grad_norm": 1.3750447034835815, + "learning_rate": 4.873710162199759e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.9032772421836853, + "num_tokens": 33813658.0, + "step": 4165 + }, + { + "entropy": 0.42153160572052, + "epoch": 2.353273137697517, + "grad_norm": 1.5966558456420898, + "learning_rate": 4.873410152151648e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.89634268283844, + "num_tokens": 33854046.0, + "step": 4170 + }, + { + "entropy": 0.39306110739707945, + "epoch": 2.356094808126411, + "grad_norm": 1.5900177955627441, + "learning_rate": 4.873109798628133e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.902882993221283, + "num_tokens": 33894451.0, + "step": 4175 + }, + { + "entropy": 0.3619803309440613, + "epoch": 2.3589164785553045, + "grad_norm": 1.5183881521224976, + "learning_rate": 4.872809101688222e-06, + "loss": 0.2644, + "mean_token_accuracy": 0.9138602614402771, + "num_tokens": 33935166.0, + "step": 4180 + }, + { + "entropy": 0.40472997426986695, + "epoch": 2.3617381489841986, + "grad_norm": 1.5294315814971924, + "learning_rate": 4.872508061390986e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.9031421303749084, + "num_tokens": 33975747.0, + "step": 4185 + }, + { + "entropy": 0.39990630745887756, + "epoch": 2.364559819413093, + "grad_norm": 1.6503218412399292, + "learning_rate": 4.872206677795564e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.9047986268997192, + "num_tokens": 34016543.0, + "step": 4190 + }, + { + "entropy": 0.3896356463432312, + "epoch": 2.3673814898419865, + "grad_norm": 1.6634739637374878, + "learning_rate": 4.871904950961163e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.9031400918960572, + "num_tokens": 34057312.0, + "step": 4195 + }, + { + "entropy": 0.41114925742149355, + "epoch": 2.37020316027088, + "grad_norm": 1.5319340229034424, + "learning_rate": 4.871602880947058e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.9023614287376404, + "num_tokens": 34098081.0, + "step": 4200 + }, + { + "entropy": 0.4191340744495392, + "epoch": 2.3730248306997743, + "grad_norm": 1.60099196434021, + "learning_rate": 4.871300467812589e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.9024178624153137, + "num_tokens": 34138698.0, + "step": 4205 + }, + { + "entropy": 0.4011139750480652, + "epoch": 2.375846501128668, + "grad_norm": 1.5579471588134766, + "learning_rate": 4.870997711617166e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.9014039039611816, + "num_tokens": 34179294.0, + "step": 4210 + }, + { + "entropy": 0.38470744490623476, + "epoch": 2.378668171557562, + "grad_norm": 1.4612574577331543, + "learning_rate": 4.8706946124202666e-06, + "loss": 0.2772, + "mean_token_accuracy": 0.9105475664138794, + "num_tokens": 34219877.0, + "step": 4215 + }, + { + "entropy": 0.4085557281970978, + "epoch": 2.381489841986456, + "grad_norm": 1.6769572496414185, + "learning_rate": 4.8703911702814326e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8968191981315613, + "num_tokens": 34260392.0, + "step": 4220 + }, + { + "entropy": 0.4069161355495453, + "epoch": 2.38431151241535, + "grad_norm": 1.8266639709472656, + "learning_rate": 4.870087385260277e-06, + "loss": 0.294, + "mean_token_accuracy": 0.9052396297454834, + "num_tokens": 34301087.0, + "step": 4225 + }, + { + "entropy": 0.41574978828430176, + "epoch": 2.3871331828442437, + "grad_norm": 1.4496681690216064, + "learning_rate": 4.8697832574164786e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8990219950675964, + "num_tokens": 34341510.0, + "step": 4230 + }, + { + "entropy": 0.42293376922607423, + "epoch": 2.389954853273138, + "grad_norm": 1.5617249011993408, + "learning_rate": 4.86947878680978e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8938201785087585, + "num_tokens": 34382177.0, + "step": 4235 + }, + { + "entropy": 0.43548256158828735, + "epoch": 2.3927765237020315, + "grad_norm": 1.653110146522522, + "learning_rate": 4.869173973499999e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8943361163139343, + "num_tokens": 34422859.0, + "step": 4240 + }, + { + "entropy": 0.4331204891204834, + "epoch": 2.3955981941309257, + "grad_norm": 1.483518123626709, + "learning_rate": 4.868868817547013e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8963463306427002, + "num_tokens": 34463566.0, + "step": 4245 + }, + { + "entropy": 0.40947118401527405, + "epoch": 2.3984198645598194, + "grad_norm": 1.3235682249069214, + "learning_rate": 4.868563319010772e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.9062234163284302, + "num_tokens": 34504064.0, + "step": 4250 + }, + { + "entropy": 0.4543120741844177, + "epoch": 2.4012415349887135, + "grad_norm": 1.6390918493270874, + "learning_rate": 4.86825747795129e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8885816693305969, + "num_tokens": 34544665.0, + "step": 4255 + }, + { + "entropy": 0.42363797426223754, + "epoch": 2.404063205417607, + "grad_norm": 1.629384160041809, + "learning_rate": 4.86795129442865e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8973617792129517, + "num_tokens": 34585268.0, + "step": 4260 + }, + { + "entropy": 0.3826207935810089, + "epoch": 2.406884875846501, + "grad_norm": 1.5705349445343018, + "learning_rate": 4.867644768503002e-06, + "loss": 0.2778, + "mean_token_accuracy": 0.9083486676216126, + "num_tokens": 34625843.0, + "step": 4265 + }, + { + "entropy": 0.38043850660324097, + "epoch": 2.409706546275395, + "grad_norm": 1.3238205909729004, + "learning_rate": 4.867337900234562e-06, + "loss": 0.28, + "mean_token_accuracy": 0.9097347378730773, + "num_tokens": 34666723.0, + "step": 4270 + }, + { + "entropy": 0.40482637882232664, + "epoch": 2.412528216704289, + "grad_norm": 1.4541096687316895, + "learning_rate": 4.867030689683615e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.9034028768539428, + "num_tokens": 34707314.0, + "step": 4275 + }, + { + "entropy": 0.3765712082386017, + "epoch": 2.415349887133183, + "grad_norm": 1.694923996925354, + "learning_rate": 4.8667231369105126e-06, + "loss": 0.2784, + "mean_token_accuracy": 0.9113203883171082, + "num_tokens": 34748089.0, + "step": 4280 + }, + { + "entropy": 0.41379693150520325, + "epoch": 2.4181715575620766, + "grad_norm": 1.8324544429779053, + "learning_rate": 4.866415241975674e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8990246057510376, + "num_tokens": 34788615.0, + "step": 4285 + }, + { + "entropy": 0.405905282497406, + "epoch": 2.4209932279909707, + "grad_norm": 1.5715994834899902, + "learning_rate": 4.866107004939584e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.9052024960517884, + "num_tokens": 34829247.0, + "step": 4290 + }, + { + "entropy": 0.38263545036315916, + "epoch": 2.4238148984198644, + "grad_norm": 1.4427995681762695, + "learning_rate": 4.865798425862797e-06, + "loss": 0.2802, + "mean_token_accuracy": 0.9089892625808715, + "num_tokens": 34869192.0, + "step": 4295 + }, + { + "entropy": 0.41300634741783143, + "epoch": 2.4266365688487586, + "grad_norm": 1.6823080778121948, + "learning_rate": 4.865489504805933e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8993620634078979, + "num_tokens": 34909855.0, + "step": 4300 + }, + { + "entropy": 0.3476841390132904, + "epoch": 2.4294582392776523, + "grad_norm": 1.5836542844772339, + "learning_rate": 4.865180241829679e-06, + "loss": 0.2606, + "mean_token_accuracy": 0.915683650970459, + "num_tokens": 34950497.0, + "step": 4305 + }, + { + "entropy": 0.40133320689201357, + "epoch": 2.4322799097065464, + "grad_norm": 1.5993763208389282, + "learning_rate": 4.8648706369947915e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.9001999020576477, + "num_tokens": 34990946.0, + "step": 4310 + }, + { + "entropy": 0.43380234241485593, + "epoch": 2.43510158013544, + "grad_norm": 1.6708790063858032, + "learning_rate": 4.86456069036209e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8949172139167786, + "num_tokens": 35031584.0, + "step": 4315 + }, + { + "entropy": 0.36903883814811705, + "epoch": 2.4379232505643342, + "grad_norm": 1.705231785774231, + "learning_rate": 4.864250401992465e-06, + "loss": 0.2643, + "mean_token_accuracy": 0.9130355954170227, + "num_tokens": 35072066.0, + "step": 4320 + }, + { + "entropy": 0.4150381624698639, + "epoch": 2.440744920993228, + "grad_norm": 1.5706596374511719, + "learning_rate": 4.863939771946873e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8974992752075195, + "num_tokens": 35112703.0, + "step": 4325 + }, + { + "entropy": 0.4148720264434814, + "epoch": 2.443566591422122, + "grad_norm": 1.6442214250564575, + "learning_rate": 4.863628800286337e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8946601033210755, + "num_tokens": 35153539.0, + "step": 4330 + }, + { + "entropy": 0.40285355448722837, + "epoch": 2.4463882618510158, + "grad_norm": 1.4223581552505493, + "learning_rate": 4.863317487071946e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8984290480613708, + "num_tokens": 35194160.0, + "step": 4335 + }, + { + "entropy": 0.41776869893074037, + "epoch": 2.44920993227991, + "grad_norm": 1.5501775741577148, + "learning_rate": 4.8630058323648584e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8984562754631042, + "num_tokens": 35234676.0, + "step": 4340 + }, + { + "entropy": 0.4286969780921936, + "epoch": 2.4520316027088036, + "grad_norm": 1.6209558248519897, + "learning_rate": 4.862693836226301e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8965187311172486, + "num_tokens": 35275453.0, + "step": 4345 + }, + { + "entropy": 0.40539962649345396, + "epoch": 2.4548532731376973, + "grad_norm": 1.6065021753311157, + "learning_rate": 4.862381498717563e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.9059406757354737, + "num_tokens": 35315750.0, + "step": 4350 + }, + { + "entropy": 0.38936737179756165, + "epoch": 2.4576749435665914, + "grad_norm": 1.5308164358139038, + "learning_rate": 4.862068819900003e-06, + "loss": 0.289, + "mean_token_accuracy": 0.905747401714325, + "num_tokens": 35356475.0, + "step": 4355 + }, + { + "entropy": 0.3944389283657074, + "epoch": 2.460496613995485, + "grad_norm": 1.6397894620895386, + "learning_rate": 4.8617557998350475e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.9059877634048462, + "num_tokens": 35397000.0, + "step": 4360 + }, + { + "entropy": 0.3817551672458649, + "epoch": 2.4633182844243793, + "grad_norm": 1.590949296951294, + "learning_rate": 4.86144243858419e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.9029469013214111, + "num_tokens": 35437662.0, + "step": 4365 + }, + { + "entropy": 0.43104063868522646, + "epoch": 2.466139954853273, + "grad_norm": 1.7022565603256226, + "learning_rate": 4.86112873620899e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8878795146942139, + "num_tokens": 35478304.0, + "step": 4370 + }, + { + "entropy": 0.4168141961097717, + "epoch": 2.468961625282167, + "grad_norm": 1.5154368877410889, + "learning_rate": 4.860814692771072e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8962018251419067, + "num_tokens": 35518948.0, + "step": 4375 + }, + { + "entropy": 0.4418110430240631, + "epoch": 2.471783295711061, + "grad_norm": 1.6656417846679688, + "learning_rate": 4.860500308332134e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8942551970481872, + "num_tokens": 35559540.0, + "step": 4380 + }, + { + "entropy": 0.40012283325195314, + "epoch": 2.474604966139955, + "grad_norm": 1.5913735628128052, + "learning_rate": 4.8601855829539345e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8994509220123291, + "num_tokens": 35600363.0, + "step": 4385 + }, + { + "entropy": 0.4134869039058685, + "epoch": 2.4774266365688487, + "grad_norm": 1.574020504951477, + "learning_rate": 4.859870516698302e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8977964639663696, + "num_tokens": 35641089.0, + "step": 4390 + }, + { + "entropy": 0.39518179297447203, + "epoch": 2.480248306997743, + "grad_norm": 1.6124846935272217, + "learning_rate": 4.85955510962713e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.9060514092445373, + "num_tokens": 35681822.0, + "step": 4395 + }, + { + "entropy": 0.403781658411026, + "epoch": 2.4830699774266365, + "grad_norm": 1.7911304235458374, + "learning_rate": 4.8592393618023816e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8975111484527588, + "num_tokens": 35722308.0, + "step": 4400 + }, + { + "entropy": 0.4161704480648041, + "epoch": 2.4858916478555306, + "grad_norm": 1.7675319910049438, + "learning_rate": 4.858923273286086e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8990342259407044, + "num_tokens": 35762753.0, + "step": 4405 + }, + { + "entropy": 0.3830337405204773, + "epoch": 2.4887133182844243, + "grad_norm": 1.3579264879226685, + "learning_rate": 4.858606844140337e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.9072355508804322, + "num_tokens": 35803606.0, + "step": 4410 + }, + { + "entropy": 0.4230438947677612, + "epoch": 2.4915349887133185, + "grad_norm": 1.622119665145874, + "learning_rate": 4.8582900744272975e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8985754609107971, + "num_tokens": 35844338.0, + "step": 4415 + }, + { + "entropy": 0.42835283279418945, + "epoch": 2.494356659142212, + "grad_norm": 1.5371416807174683, + "learning_rate": 4.857972964209199e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8958300590515137, + "num_tokens": 35885113.0, + "step": 4420 + }, + { + "entropy": 0.42627652883529665, + "epoch": 2.4971783295711063, + "grad_norm": 1.5911802053451538, + "learning_rate": 4.857655513548335e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8981908798217774, + "num_tokens": 35925643.0, + "step": 4425 + }, + { + "entropy": 0.37044663429260255, + "epoch": 2.5, + "grad_norm": 1.3842355012893677, + "learning_rate": 4.8573377225070715e-06, + "loss": 0.2614, + "mean_token_accuracy": 0.9132282495498657, + "num_tokens": 35966310.0, + "step": 4430 + }, + { + "entropy": 0.3875482201576233, + "epoch": 2.5028216704288937, + "grad_norm": 1.421795129776001, + "learning_rate": 4.857019591147836e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.9054312467575073, + "num_tokens": 36007072.0, + "step": 4435 + }, + { + "entropy": 0.4442818224430084, + "epoch": 2.505643340857788, + "grad_norm": 1.6131327152252197, + "learning_rate": 4.856701119533128e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8924556851387024, + "num_tokens": 36047636.0, + "step": 4440 + }, + { + "entropy": 0.4388956606388092, + "epoch": 2.508465011286682, + "grad_norm": 1.5778138637542725, + "learning_rate": 4.856382307725509e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8961979508399963, + "num_tokens": 36088203.0, + "step": 4445 + }, + { + "entropy": 0.402739155292511, + "epoch": 2.5112866817155757, + "grad_norm": 1.5481038093566895, + "learning_rate": 4.856063155787611e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.9025402426719665, + "num_tokens": 36128987.0, + "step": 4450 + }, + { + "entropy": 0.3847965598106384, + "epoch": 2.5141083521444694, + "grad_norm": 1.4700496196746826, + "learning_rate": 4.855743663782131e-06, + "loss": 0.2893, + "mean_token_accuracy": 0.9061774015426636, + "num_tokens": 36169817.0, + "step": 4455 + }, + { + "entropy": 0.39482749700546266, + "epoch": 2.5169300225733635, + "grad_norm": 1.6306626796722412, + "learning_rate": 4.855423831771832e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.9046317577362061, + "num_tokens": 36210322.0, + "step": 4460 + }, + { + "entropy": 0.44486273527145387, + "epoch": 2.519751693002257, + "grad_norm": 1.7283015251159668, + "learning_rate": 4.8551036598195486e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8907332181930542, + "num_tokens": 36251034.0, + "step": 4465 + }, + { + "entropy": 0.42003960609436036, + "epoch": 2.5225733634311513, + "grad_norm": 1.4865223169326782, + "learning_rate": 4.8547831479881745e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8974966883659363, + "num_tokens": 36291628.0, + "step": 4470 + }, + { + "entropy": 0.39935733675956725, + "epoch": 2.525395033860045, + "grad_norm": 1.3825920820236206, + "learning_rate": 4.854462296340677e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.9067023873329163, + "num_tokens": 36332435.0, + "step": 4475 + }, + { + "entropy": 0.36378782987594604, + "epoch": 2.528216704288939, + "grad_norm": 1.507226586341858, + "learning_rate": 4.854141104940087e-06, + "loss": 0.2582, + "mean_token_accuracy": 0.9157714605331421, + "num_tokens": 36373117.0, + "step": 4480 + }, + { + "entropy": 0.41645694971084596, + "epoch": 2.531038374717833, + "grad_norm": 1.6471916437149048, + "learning_rate": 4.853819573849502e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.9004851222038269, + "num_tokens": 36413658.0, + "step": 4485 + }, + { + "entropy": 0.4097161889076233, + "epoch": 2.533860045146727, + "grad_norm": 1.7997804880142212, + "learning_rate": 4.8534977031320855e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.901634693145752, + "num_tokens": 36454096.0, + "step": 4490 + }, + { + "entropy": 0.38941258788108823, + "epoch": 2.5366817155756207, + "grad_norm": 1.6591458320617676, + "learning_rate": 4.8531754928510725e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.9032475352287292, + "num_tokens": 36494625.0, + "step": 4495 + }, + { + "entropy": 0.39249064326286315, + "epoch": 2.5395033860045144, + "grad_norm": 1.5502938032150269, + "learning_rate": 4.852852943069758e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.9045561671257019, + "num_tokens": 36535372.0, + "step": 4500 + }, + { + "epoch": 2.5395033860045144, + "eval_entropy": 0.4075223505496979, + "eval_loss": 0.2745997905731201, + "eval_mean_token_accuracy": 0.918742835521698, + "eval_num_tokens": 36535372.0, + "eval_runtime": 0.164, + "eval_samples_per_second": 24.395, + "eval_steps_per_second": 6.099, + "step": 4500 + }, + { + "entropy": 0.42699538469314574, + "epoch": 2.5423250564334086, + "grad_norm": 1.4848240613937378, + "learning_rate": 4.852530053851509e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8980291485786438, + "num_tokens": 36576121.0, + "step": 4505 + }, + { + "entropy": 0.4450218975543976, + "epoch": 2.5451467268623027, + "grad_norm": 1.2995103597640991, + "learning_rate": 4.852206825259756e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8892289876937867, + "num_tokens": 36616673.0, + "step": 4510 + }, + { + "entropy": 0.4333424806594849, + "epoch": 2.5479683972911964, + "grad_norm": 1.4600845575332642, + "learning_rate": 4.851883257357997e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.9001733303070069, + "num_tokens": 36657491.0, + "step": 4515 + }, + { + "entropy": 0.4113191843032837, + "epoch": 2.55079006772009, + "grad_norm": 1.7340246438980103, + "learning_rate": 4.851559350209798e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8975558519363404, + "num_tokens": 36698183.0, + "step": 4520 + }, + { + "entropy": 0.4206542193889618, + "epoch": 2.5536117381489842, + "grad_norm": 1.6161917448043823, + "learning_rate": 4.85123510387879e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8958797335624695, + "num_tokens": 36738687.0, + "step": 4525 + }, + { + "entropy": 0.42369606494903567, + "epoch": 2.5564334085778784, + "grad_norm": 1.4928462505340576, + "learning_rate": 4.850910518428672e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8953816771507264, + "num_tokens": 36779100.0, + "step": 4530 + }, + { + "entropy": 0.4580428898334503, + "epoch": 2.559255079006772, + "grad_norm": 1.6802058219909668, + "learning_rate": 4.850585593923209e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8919341087341308, + "num_tokens": 36819464.0, + "step": 4535 + }, + { + "entropy": 0.44548857808113096, + "epoch": 2.5620767494356658, + "grad_norm": 1.7012122869491577, + "learning_rate": 4.850260330426231e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8931902885437012, + "num_tokens": 36860115.0, + "step": 4540 + }, + { + "entropy": 0.42452192306518555, + "epoch": 2.56489841986456, + "grad_norm": 1.4587101936340332, + "learning_rate": 4.849934728001636e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8946157932281494, + "num_tokens": 36900844.0, + "step": 4545 + }, + { + "entropy": 0.4206928193569183, + "epoch": 2.5677200902934536, + "grad_norm": 1.6573847532272339, + "learning_rate": 4.84960878671339e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.897615122795105, + "num_tokens": 36941461.0, + "step": 4550 + }, + { + "entropy": 0.40249053835868837, + "epoch": 2.5705417607223477, + "grad_norm": 1.6816022396087646, + "learning_rate": 4.849282506625525e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.906862735748291, + "num_tokens": 36982267.0, + "step": 4555 + }, + { + "entropy": 0.3981044590473175, + "epoch": 2.5733634311512414, + "grad_norm": 1.6164435148239136, + "learning_rate": 4.848955887802135e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.9023780941963195, + "num_tokens": 37023035.0, + "step": 4560 + }, + { + "entropy": 0.3921403527259827, + "epoch": 2.5761851015801356, + "grad_norm": 1.7836408615112305, + "learning_rate": 4.8486289303073884e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.9081403493881226, + "num_tokens": 37063570.0, + "step": 4565 + }, + { + "entropy": 0.41735507249832154, + "epoch": 2.5790067720090293, + "grad_norm": 1.6461783647537231, + "learning_rate": 4.848301634205514e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.9022216796875, + "num_tokens": 37104278.0, + "step": 4570 + }, + { + "entropy": 0.3928376615047455, + "epoch": 2.5818284424379234, + "grad_norm": 1.6805448532104492, + "learning_rate": 4.84797399956081e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.9058070182800293, + "num_tokens": 37144698.0, + "step": 4575 + }, + { + "entropy": 0.41108508706092833, + "epoch": 2.584650112866817, + "grad_norm": 1.6354347467422485, + "learning_rate": 4.847646026437639e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.9019859433174133, + "num_tokens": 37185166.0, + "step": 4580 + }, + { + "entropy": 0.39545953273773193, + "epoch": 2.587471783295711, + "grad_norm": 1.4855034351348877, + "learning_rate": 4.847317714900432e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.9041112303733826, + "num_tokens": 37225882.0, + "step": 4585 + }, + { + "entropy": 0.378242689371109, + "epoch": 2.590293453724605, + "grad_norm": 1.4001781940460205, + "learning_rate": 4.846989065013687e-06, + "loss": 0.2667, + "mean_token_accuracy": 0.9122205972671509, + "num_tokens": 37266182.0, + "step": 4590 + }, + { + "entropy": 0.41660374999046323, + "epoch": 2.593115124153499, + "grad_norm": 1.6582450866699219, + "learning_rate": 4.846660076841966e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.9031738400459289, + "num_tokens": 37306948.0, + "step": 4595 + }, + { + "entropy": 0.4056197464466095, + "epoch": 2.595936794582393, + "grad_norm": 1.9591373205184937, + "learning_rate": 4.8463307504498995e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.9031343698501587, + "num_tokens": 37347762.0, + "step": 4600 + }, + { + "entropy": 0.43581578731536863, + "epoch": 2.5987584650112865, + "grad_norm": 1.5790077447891235, + "learning_rate": 4.846001085902182e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.9001483678817749, + "num_tokens": 37388225.0, + "step": 4605 + }, + { + "entropy": 0.4038006603717804, + "epoch": 2.6015801354401806, + "grad_norm": 1.5071346759796143, + "learning_rate": 4.845671083263579e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.9046383738517761, + "num_tokens": 37429028.0, + "step": 4610 + }, + { + "entropy": 0.4395131945610046, + "epoch": 2.6044018058690743, + "grad_norm": 1.651118516921997, + "learning_rate": 4.845340742598917e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8916622638702393, + "num_tokens": 37469231.0, + "step": 4615 + }, + { + "entropy": 0.43975943326950073, + "epoch": 2.6072234762979685, + "grad_norm": 1.7351619005203247, + "learning_rate": 4.8450100639730934e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8946444272994996, + "num_tokens": 37509748.0, + "step": 4620 + }, + { + "entropy": 0.3946446657180786, + "epoch": 2.610045146726862, + "grad_norm": 1.5668245553970337, + "learning_rate": 4.844679047451068e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.9054205894470215, + "num_tokens": 37549968.0, + "step": 4625 + }, + { + "entropy": 0.4205393612384796, + "epoch": 2.6128668171557563, + "grad_norm": 1.5956112146377563, + "learning_rate": 4.844347693097871e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8945238828659058, + "num_tokens": 37590500.0, + "step": 4630 + }, + { + "entropy": 0.4266462683677673, + "epoch": 2.61568848758465, + "grad_norm": 1.8106273412704468, + "learning_rate": 4.844016000978595e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.9020758390426635, + "num_tokens": 37631127.0, + "step": 4635 + }, + { + "entropy": 0.3949384868144989, + "epoch": 2.618510158013544, + "grad_norm": 1.6231563091278076, + "learning_rate": 4.843683971158404e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.9043740272521973, + "num_tokens": 37671522.0, + "step": 4640 + }, + { + "entropy": 0.427196079492569, + "epoch": 2.621331828442438, + "grad_norm": 1.8242963552474976, + "learning_rate": 4.843351603702522e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8958367943763733, + "num_tokens": 37711505.0, + "step": 4645 + }, + { + "entropy": 0.4056568145751953, + "epoch": 2.624153498871332, + "grad_norm": 1.767072081565857, + "learning_rate": 4.843018898676245e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.9040299654006958, + "num_tokens": 37751970.0, + "step": 4650 + }, + { + "entropy": 0.4182514131069183, + "epoch": 2.6269751693002257, + "grad_norm": 1.381471872329712, + "learning_rate": 4.842685856144932e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8963116884231568, + "num_tokens": 37792712.0, + "step": 4655 + }, + { + "entropy": 0.3773132681846619, + "epoch": 2.62979683972912, + "grad_norm": 1.4395767450332642, + "learning_rate": 4.842352476174008e-06, + "loss": 0.2735, + "mean_token_accuracy": 0.9093364715576172, + "num_tokens": 37833341.0, + "step": 4660 + }, + { + "entropy": 0.38550790548324587, + "epoch": 2.6326185101580135, + "grad_norm": 1.4654998779296875, + "learning_rate": 4.842018758828968e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.906914758682251, + "num_tokens": 37874036.0, + "step": 4665 + }, + { + "entropy": 0.42129603028297424, + "epoch": 2.635440180586907, + "grad_norm": 1.8509776592254639, + "learning_rate": 4.84168470417537e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.9016175627708435, + "num_tokens": 37914771.0, + "step": 4670 + }, + { + "entropy": 0.4129085302352905, + "epoch": 2.6382618510158014, + "grad_norm": 1.6427429914474487, + "learning_rate": 4.841350312278838e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.893082594871521, + "num_tokens": 37955424.0, + "step": 4675 + }, + { + "entropy": 0.455252867937088, + "epoch": 2.6410835214446955, + "grad_norm": 1.7675665616989136, + "learning_rate": 4.8410155832050635e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8894224405288697, + "num_tokens": 37995924.0, + "step": 4680 + }, + { + "entropy": 0.4129503607749939, + "epoch": 2.643905191873589, + "grad_norm": 1.8195785284042358, + "learning_rate": 4.840680517019806e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.9006987929344177, + "num_tokens": 38036393.0, + "step": 4685 + }, + { + "entropy": 0.41016621589660646, + "epoch": 2.646726862302483, + "grad_norm": 1.335083246231079, + "learning_rate": 4.840345113788887e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.9052215456962586, + "num_tokens": 38077059.0, + "step": 4690 + }, + { + "entropy": 0.38619791269302367, + "epoch": 2.649548532731377, + "grad_norm": 1.5167460441589355, + "learning_rate": 4.840009373578197e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.9078164935112, + "num_tokens": 38117855.0, + "step": 4695 + }, + { + "entropy": 0.40981505513191224, + "epoch": 2.6523702031602707, + "grad_norm": 1.4652817249298096, + "learning_rate": 4.839673296453694e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8995686173439026, + "num_tokens": 38158633.0, + "step": 4700 + }, + { + "entropy": 0.40746703147888186, + "epoch": 2.655191873589165, + "grad_norm": 1.5045616626739502, + "learning_rate": 4.839336882481398e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.9018496155738831, + "num_tokens": 38199207.0, + "step": 4705 + }, + { + "entropy": 0.4251773953437805, + "epoch": 2.6580135440180586, + "grad_norm": 1.7509130239486694, + "learning_rate": 4.839000131727399e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8988733649253845, + "num_tokens": 38239853.0, + "step": 4710 + }, + { + "entropy": 0.4144779920578003, + "epoch": 2.6608352144469527, + "grad_norm": 1.6336438655853271, + "learning_rate": 4.8386630442578505e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8991317749023438, + "num_tokens": 38280445.0, + "step": 4715 + }, + { + "entropy": 0.42009794116020205, + "epoch": 2.6636568848758464, + "grad_norm": 1.679404854774475, + "learning_rate": 4.838325620138975e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8994911909103394, + "num_tokens": 38321216.0, + "step": 4720 + }, + { + "entropy": 0.3979401350021362, + "epoch": 2.6664785553047405, + "grad_norm": 1.5892705917358398, + "learning_rate": 4.837987859437058e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.9033595561981201, + "num_tokens": 38361943.0, + "step": 4725 + }, + { + "entropy": 0.4081549823284149, + "epoch": 2.6693002257336342, + "grad_norm": 1.4048125743865967, + "learning_rate": 4.837649762218454e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.9016985297203064, + "num_tokens": 38402444.0, + "step": 4730 + }, + { + "entropy": 0.40113077163696287, + "epoch": 2.6721218961625284, + "grad_norm": 1.590847134590149, + "learning_rate": 4.837311328549582e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.9024832129478455, + "num_tokens": 38443055.0, + "step": 4735 + }, + { + "entropy": 0.4067776739597321, + "epoch": 2.674943566591422, + "grad_norm": 1.6257827281951904, + "learning_rate": 4.8369725584969265e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.9031951904296875, + "num_tokens": 38483816.0, + "step": 4740 + }, + { + "entropy": 0.386566686630249, + "epoch": 2.677765237020316, + "grad_norm": 1.3581163883209229, + "learning_rate": 4.836633452127039e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.9064470291137695, + "num_tokens": 38523703.0, + "step": 4745 + }, + { + "entropy": 0.40097097754478456, + "epoch": 2.68058690744921, + "grad_norm": 1.6537374258041382, + "learning_rate": 4.836294009506537e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.9049440145492553, + "num_tokens": 38564385.0, + "step": 4750 + }, + { + "entropy": 0.3615904927253723, + "epoch": 2.6834085778781036, + "grad_norm": 1.5579919815063477, + "learning_rate": 4.835954230702105e-06, + "loss": 0.2632, + "mean_token_accuracy": 0.9136163473129273, + "num_tokens": 38605052.0, + "step": 4755 + }, + { + "entropy": 0.43404104113578795, + "epoch": 2.6862302483069977, + "grad_norm": 1.5290402173995972, + "learning_rate": 4.835614115780492e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8940969586372376, + "num_tokens": 38645892.0, + "step": 4760 + }, + { + "entropy": 0.4232151687145233, + "epoch": 2.689051918735892, + "grad_norm": 1.3929665088653564, + "learning_rate": 4.835273664808514e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8977612614631653, + "num_tokens": 38686583.0, + "step": 4765 + }, + { + "entropy": 0.399210125207901, + "epoch": 2.6918735891647856, + "grad_norm": 1.4951647520065308, + "learning_rate": 4.834932877853051e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.9042705297470093, + "num_tokens": 38726923.0, + "step": 4770 + }, + { + "entropy": 0.42027135491371154, + "epoch": 2.6946952595936793, + "grad_norm": 1.3856654167175293, + "learning_rate": 4.834591754981053e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.9001957535743713, + "num_tokens": 38767439.0, + "step": 4775 + }, + { + "entropy": 0.4031683325767517, + "epoch": 2.6975169300225734, + "grad_norm": 1.5139154195785522, + "learning_rate": 4.834250296259532e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.9039840221405029, + "num_tokens": 38808039.0, + "step": 4780 + }, + { + "entropy": 0.40245692133903505, + "epoch": 2.700338600451467, + "grad_norm": 1.7345075607299805, + "learning_rate": 4.8339085017555685e-06, + "loss": 0.307, + "mean_token_accuracy": 0.9012099623680114, + "num_tokens": 38848491.0, + "step": 4785 + }, + { + "entropy": 0.4227768838405609, + "epoch": 2.7031602708803613, + "grad_norm": 1.7092951536178589, + "learning_rate": 4.833566371536307e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8983111381530762, + "num_tokens": 38889280.0, + "step": 4790 + }, + { + "entropy": 0.4330504536628723, + "epoch": 2.705981941309255, + "grad_norm": 1.6095880270004272, + "learning_rate": 4.83322390566896e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8882631182670593, + "num_tokens": 38929955.0, + "step": 4795 + }, + { + "entropy": 0.40083847045898435, + "epoch": 2.708803611738149, + "grad_norm": 1.877541184425354, + "learning_rate": 4.832881104220805e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.9026248455047607, + "num_tokens": 38970610.0, + "step": 4800 + }, + { + "entropy": 0.43792678117752076, + "epoch": 2.711625282167043, + "grad_norm": 1.6264429092407227, + "learning_rate": 4.8325379672591845e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8906769514083862, + "num_tokens": 39011465.0, + "step": 4805 + }, + { + "entropy": 0.40115252137184143, + "epoch": 2.714446952595937, + "grad_norm": 1.7116504907608032, + "learning_rate": 4.8321944948515085e-06, + "loss": 0.272, + "mean_token_accuracy": 0.9107529878616333, + "num_tokens": 39052019.0, + "step": 4810 + }, + { + "entropy": 0.3959204196929932, + "epoch": 2.7172686230248306, + "grad_norm": 1.5745071172714233, + "learning_rate": 4.831850687065253e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.9042417287826539, + "num_tokens": 39092820.0, + "step": 4815 + }, + { + "entropy": 0.4313171863555908, + "epoch": 2.7200902934537243, + "grad_norm": 1.4608080387115479, + "learning_rate": 4.831506543967958e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8949733734130859, + "num_tokens": 39133601.0, + "step": 4820 + }, + { + "entropy": 0.4190512001514435, + "epoch": 2.7229119638826185, + "grad_norm": 1.6403937339782715, + "learning_rate": 4.831162065627229e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8949956893920898, + "num_tokens": 39174455.0, + "step": 4825 + }, + { + "entropy": 0.39245215654373167, + "epoch": 2.7257336343115126, + "grad_norm": 1.5303268432617188, + "learning_rate": 4.830817252110742e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.9037534713745117, + "num_tokens": 39215087.0, + "step": 4830 + }, + { + "entropy": 0.40173652172088625, + "epoch": 2.7285553047404063, + "grad_norm": 1.5214855670928955, + "learning_rate": 4.830472103486233e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.902808690071106, + "num_tokens": 39255856.0, + "step": 4835 + }, + { + "entropy": 0.4071968972682953, + "epoch": 2.7313769751693, + "grad_norm": 1.669796109199524, + "learning_rate": 4.830126619821508e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.903080677986145, + "num_tokens": 39296108.0, + "step": 4840 + }, + { + "entropy": 0.4280484437942505, + "epoch": 2.734198645598194, + "grad_norm": 1.5314013957977295, + "learning_rate": 4.829780801184437e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8953822731971741, + "num_tokens": 39336454.0, + "step": 4845 + }, + { + "entropy": 0.38799464106559756, + "epoch": 2.7370203160270883, + "grad_norm": 1.6457887887954712, + "learning_rate": 4.829434647642956e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.902879273891449, + "num_tokens": 39377316.0, + "step": 4850 + }, + { + "entropy": 0.43819814920425415, + "epoch": 2.739841986455982, + "grad_norm": 1.7149285078048706, + "learning_rate": 4.829088159265067e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8905494093894959, + "num_tokens": 39418143.0, + "step": 4855 + }, + { + "entropy": 0.41923085451126096, + "epoch": 2.7426636568848757, + "grad_norm": 1.331289529800415, + "learning_rate": 4.828741336118837e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8980522155761719, + "num_tokens": 39457182.0, + "step": 4860 + }, + { + "entropy": 0.40077372193336486, + "epoch": 2.74548532731377, + "grad_norm": 1.6222182512283325, + "learning_rate": 4.828394178272401e-06, + "loss": 0.302, + "mean_token_accuracy": 0.9054031133651733, + "num_tokens": 39497879.0, + "step": 4865 + }, + { + "entropy": 0.39027564525604247, + "epoch": 2.7483069977426635, + "grad_norm": 1.7052563428878784, + "learning_rate": 4.828046685793957e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.9051683902740478, + "num_tokens": 39538388.0, + "step": 4870 + }, + { + "entropy": 0.42039836645126344, + "epoch": 2.7511286681715577, + "grad_norm": 1.6202837228775024, + "learning_rate": 4.82769885875177e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8974061369895935, + "num_tokens": 39579248.0, + "step": 4875 + }, + { + "entropy": 0.42434735894203185, + "epoch": 2.7539503386004514, + "grad_norm": 1.6943541765213013, + "learning_rate": 4.8273506972141705e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8977732062339783, + "num_tokens": 39619680.0, + "step": 4880 + }, + { + "entropy": 0.4002332389354706, + "epoch": 2.7567720090293455, + "grad_norm": 1.5710278749465942, + "learning_rate": 4.827002201249556e-06, + "loss": 0.298, + "mean_token_accuracy": 0.9056402921676636, + "num_tokens": 39660296.0, + "step": 4885 + }, + { + "entropy": 0.39988999366760253, + "epoch": 2.759593679458239, + "grad_norm": 1.5473220348358154, + "learning_rate": 4.826653370926387e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.9022600173950195, + "num_tokens": 39701090.0, + "step": 4890 + }, + { + "entropy": 0.41356533765792847, + "epoch": 2.7624153498871333, + "grad_norm": 1.5488544702529907, + "learning_rate": 4.826304206313193e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8972513794898986, + "num_tokens": 39741947.0, + "step": 4895 + }, + { + "entropy": 0.39104182124137876, + "epoch": 2.765237020316027, + "grad_norm": 1.5953575372695923, + "learning_rate": 4.825954707478565e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.9020759582519531, + "num_tokens": 39782510.0, + "step": 4900 + }, + { + "entropy": 0.41210404634475706, + "epoch": 2.7680586907449207, + "grad_norm": 1.4687649011611938, + "learning_rate": 4.825604874491165e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8970984816551208, + "num_tokens": 39823334.0, + "step": 4905 + }, + { + "entropy": 0.38967686891555786, + "epoch": 2.770880361173815, + "grad_norm": 1.563970923423767, + "learning_rate": 4.825254707419716e-06, + "loss": 0.2888, + "mean_token_accuracy": 0.9057512521743775, + "num_tokens": 39864124.0, + "step": 4910 + }, + { + "entropy": 0.4446266949176788, + "epoch": 2.773702031602709, + "grad_norm": 1.8769588470458984, + "learning_rate": 4.824904206333009e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8903054833412171, + "num_tokens": 39904728.0, + "step": 4915 + }, + { + "entropy": 0.3861452877521515, + "epoch": 2.7765237020316027, + "grad_norm": 1.4496490955352783, + "learning_rate": 4.8245533712998995e-06, + "loss": 0.283, + "mean_token_accuracy": 0.9068530559539795, + "num_tokens": 39945454.0, + "step": 4920 + }, + { + "entropy": 0.407192063331604, + "epoch": 2.7793453724604964, + "grad_norm": 1.4505201578140259, + "learning_rate": 4.8242022023893095e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.9021325826644897, + "num_tokens": 39986255.0, + "step": 4925 + }, + { + "entropy": 0.3858388364315033, + "epoch": 2.7821670428893905, + "grad_norm": 1.4844613075256348, + "learning_rate": 4.823850699670225e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.9087419271469116, + "num_tokens": 40026935.0, + "step": 4930 + }, + { + "entropy": 0.3972056210041046, + "epoch": 2.7849887133182847, + "grad_norm": 1.8050625324249268, + "learning_rate": 4.823498863211701e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.9033455014228821, + "num_tokens": 40067747.0, + "step": 4935 + }, + { + "entropy": 0.4366573691368103, + "epoch": 2.7878103837471784, + "grad_norm": 1.7209434509277344, + "learning_rate": 4.823146693082853e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8931609511375427, + "num_tokens": 40107827.0, + "step": 4940 + }, + { + "entropy": 0.44152148365974425, + "epoch": 2.790632054176072, + "grad_norm": 1.6366186141967773, + "learning_rate": 4.822794189352867e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8887471437454224, + "num_tokens": 40148719.0, + "step": 4945 + }, + { + "entropy": 0.3947724997997284, + "epoch": 2.793453724604966, + "grad_norm": 1.3238067626953125, + "learning_rate": 4.822441352090992e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.9030474781990051, + "num_tokens": 40189420.0, + "step": 4950 + }, + { + "entropy": 0.42072648406028745, + "epoch": 2.79627539503386, + "grad_norm": 1.553117036819458, + "learning_rate": 4.8220881813665435e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.9006119012832642, + "num_tokens": 40230144.0, + "step": 4955 + }, + { + "entropy": 0.39687870144844056, + "epoch": 2.799097065462754, + "grad_norm": 1.5955865383148193, + "learning_rate": 4.8217346772489e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.9029557347297669, + "num_tokens": 40270958.0, + "step": 4960 + }, + { + "entropy": 0.38045825362205504, + "epoch": 2.8019187358916477, + "grad_norm": 1.9875127077102661, + "learning_rate": 4.821380839807509e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.9087697386741638, + "num_tokens": 40311657.0, + "step": 4965 + }, + { + "entropy": 0.42601910829544065, + "epoch": 2.804740406320542, + "grad_norm": 1.7846463918685913, + "learning_rate": 4.821026669111881e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8960994839668274, + "num_tokens": 40352256.0, + "step": 4970 + }, + { + "entropy": 0.40002630949020385, + "epoch": 2.8075620767494356, + "grad_norm": 1.5628581047058105, + "learning_rate": 4.820672165231595e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.9042887330055237, + "num_tokens": 40392932.0, + "step": 4975 + }, + { + "entropy": 0.40157084465026854, + "epoch": 2.8103837471783297, + "grad_norm": 1.515154480934143, + "learning_rate": 4.8203173282362904e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.900831151008606, + "num_tokens": 40433461.0, + "step": 4980 + }, + { + "entropy": 0.4204134941101074, + "epoch": 2.8132054176072234, + "grad_norm": 1.5249524116516113, + "learning_rate": 4.819962158195677e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8980574369430542, + "num_tokens": 40473869.0, + "step": 4985 + }, + { + "entropy": 0.3809087574481964, + "epoch": 2.816027088036117, + "grad_norm": 1.5786020755767822, + "learning_rate": 4.819606655179527e-06, + "loss": 0.2748, + "mean_token_accuracy": 0.9111071109771729, + "num_tokens": 40514570.0, + "step": 4990 + }, + { + "entropy": 0.4299299597740173, + "epoch": 2.8188487584650113, + "grad_norm": 1.6505275964736938, + "learning_rate": 4.819250819257679e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8989997148513794, + "num_tokens": 40555393.0, + "step": 4995 + }, + { + "entropy": 0.41674472093582154, + "epoch": 2.8216704288939054, + "grad_norm": 1.8646577596664429, + "learning_rate": 4.818894650500037e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.9004587650299072, + "num_tokens": 40596103.0, + "step": 5000 + }, + { + "epoch": 2.8216704288939054, + "eval_entropy": 0.40231844782829285, + "eval_loss": 0.2420196682214737, + "eval_mean_token_accuracy": 0.9271751642227173, + "eval_num_tokens": 40596103.0, + "eval_runtime": 0.1641, + "eval_samples_per_second": 24.368, + "eval_steps_per_second": 6.092, + "step": 5000 + }, + { + "entropy": 0.4362106263637543, + "epoch": 2.824492099322799, + "grad_norm": 1.5646847486495972, + "learning_rate": 4.818538148976572e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8909627199172974, + "num_tokens": 40636724.0, + "step": 5005 + }, + { + "entropy": 0.41136063933372496, + "epoch": 2.827313769751693, + "grad_norm": 1.6143547296524048, + "learning_rate": 4.8181813147573166e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.9023926854133606, + "num_tokens": 40677571.0, + "step": 5010 + }, + { + "entropy": 0.4300425052642822, + "epoch": 2.830135440180587, + "grad_norm": 1.538517713546753, + "learning_rate": 4.817824147912371e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.901062273979187, + "num_tokens": 40717960.0, + "step": 5015 + }, + { + "entropy": 0.4204569935798645, + "epoch": 2.832957110609481, + "grad_norm": 1.51469886302948, + "learning_rate": 4.817466648511903e-06, + "loss": 0.317, + "mean_token_accuracy": 0.9010179877281189, + "num_tokens": 40758728.0, + "step": 5020 + }, + { + "entropy": 0.4019504964351654, + "epoch": 2.8357787810383748, + "grad_norm": 1.5685406923294067, + "learning_rate": 4.817108816626142e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.9020584464073181, + "num_tokens": 40799222.0, + "step": 5025 + }, + { + "entropy": 0.4178150534629822, + "epoch": 2.8386004514672685, + "grad_norm": 1.5866230726242065, + "learning_rate": 4.816750652325382e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.9016893863677978, + "num_tokens": 40839920.0, + "step": 5030 + }, + { + "entropy": 0.3792074918746948, + "epoch": 2.8414221218961626, + "grad_norm": 1.2347012758255005, + "learning_rate": 4.8163921556799885e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.904798150062561, + "num_tokens": 40880474.0, + "step": 5035 + }, + { + "entropy": 0.42022097706794737, + "epoch": 2.8442437923250563, + "grad_norm": 2.0809385776519775, + "learning_rate": 4.816033326760384e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.9000694155693054, + "num_tokens": 40920909.0, + "step": 5040 + }, + { + "entropy": 0.40716384053230287, + "epoch": 2.8470654627539504, + "grad_norm": 1.6952626705169678, + "learning_rate": 4.815674165637065e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.9000877380371094, + "num_tokens": 40961474.0, + "step": 5045 + }, + { + "entropy": 0.4542030215263367, + "epoch": 2.849887133182844, + "grad_norm": 2.060497283935547, + "learning_rate": 4.815314672380586e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8906081914901733, + "num_tokens": 41002244.0, + "step": 5050 + }, + { + "entropy": 0.42580119967460633, + "epoch": 2.8527088036117383, + "grad_norm": 1.8272522687911987, + "learning_rate": 4.814954847061568e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8984528660774231, + "num_tokens": 41043006.0, + "step": 5055 + }, + { + "entropy": 0.39229719042778016, + "epoch": 2.855530474040632, + "grad_norm": 1.7011030912399292, + "learning_rate": 4.8145946897507026e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.9052790999412537, + "num_tokens": 41083650.0, + "step": 5060 + }, + { + "entropy": 0.42324894666671753, + "epoch": 2.858352144469526, + "grad_norm": 1.7294552326202393, + "learning_rate": 4.814234200518741e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8964131712913513, + "num_tokens": 41124231.0, + "step": 5065 + }, + { + "entropy": 0.4176328182220459, + "epoch": 2.86117381489842, + "grad_norm": 1.8063850402832031, + "learning_rate": 4.813873379436499e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8978959679603576, + "num_tokens": 41164837.0, + "step": 5070 + }, + { + "entropy": 0.4265771806240082, + "epoch": 2.8639954853273135, + "grad_norm": 1.4727050065994263, + "learning_rate": 4.813512226574863e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.894121527671814, + "num_tokens": 41205500.0, + "step": 5075 + }, + { + "entropy": 0.4046603262424469, + "epoch": 2.8668171557562077, + "grad_norm": 1.5069383382797241, + "learning_rate": 4.813150742004782e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8987318396568298, + "num_tokens": 41246220.0, + "step": 5080 + }, + { + "entropy": 0.41487335562705996, + "epoch": 2.869638826185102, + "grad_norm": 1.5012837648391724, + "learning_rate": 4.812788925797267e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.9006859183311462, + "num_tokens": 41287146.0, + "step": 5085 + }, + { + "entropy": 0.40536822080612184, + "epoch": 2.8724604966139955, + "grad_norm": 1.679571270942688, + "learning_rate": 4.812426778023398e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.9021418213844299, + "num_tokens": 41327663.0, + "step": 5090 + }, + { + "entropy": 0.4111123144626617, + "epoch": 2.875282167042889, + "grad_norm": 1.7820204496383667, + "learning_rate": 4.812064298754319e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8966802835464478, + "num_tokens": 41367964.0, + "step": 5095 + }, + { + "entropy": 0.4519184172153473, + "epoch": 2.8781038374717833, + "grad_norm": 1.9354314804077148, + "learning_rate": 4.811701488061239e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8897565484046936, + "num_tokens": 41408264.0, + "step": 5100 + }, + { + "entropy": 0.3959288239479065, + "epoch": 2.8809255079006775, + "grad_norm": 1.4760626554489136, + "learning_rate": 4.811338346015434e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.9035079836845398, + "num_tokens": 41448952.0, + "step": 5105 + }, + { + "entropy": 0.3818633139133453, + "epoch": 2.883747178329571, + "grad_norm": 1.3426601886749268, + "learning_rate": 4.81097487268824e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.9082074284553527, + "num_tokens": 41489735.0, + "step": 5110 + }, + { + "entropy": 0.4096335232257843, + "epoch": 2.886568848758465, + "grad_norm": 1.6107584238052368, + "learning_rate": 4.810611068151064e-06, + "loss": 0.319, + "mean_token_accuracy": 0.898867416381836, + "num_tokens": 41530551.0, + "step": 5115 + }, + { + "entropy": 0.4616283357143402, + "epoch": 2.889390519187359, + "grad_norm": 1.9297223091125488, + "learning_rate": 4.810246932475374e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8871626138687134, + "num_tokens": 41571097.0, + "step": 5120 + }, + { + "entropy": 0.40603058934211733, + "epoch": 2.8922121896162527, + "grad_norm": 1.4726239442825317, + "learning_rate": 4.809882465732706e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8997641682624817, + "num_tokens": 41611601.0, + "step": 5125 + }, + { + "entropy": 0.4259068608283997, + "epoch": 2.895033860045147, + "grad_norm": 1.739429235458374, + "learning_rate": 4.809517667994657e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8975944757461548, + "num_tokens": 41651971.0, + "step": 5130 + }, + { + "entropy": 0.42120612859725953, + "epoch": 2.8978555304740405, + "grad_norm": 1.9967470169067383, + "learning_rate": 4.809152539332895e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.9000747799873352, + "num_tokens": 41692648.0, + "step": 5135 + }, + { + "entropy": 0.4155091643333435, + "epoch": 2.9006772009029347, + "grad_norm": 1.7973077297210693, + "learning_rate": 4.808787079819147e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8949877023696899, + "num_tokens": 41733143.0, + "step": 5140 + }, + { + "entropy": 0.41151371598243713, + "epoch": 2.9034988713318284, + "grad_norm": 1.404732346534729, + "learning_rate": 4.808421289525208e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8984189987182617, + "num_tokens": 41773361.0, + "step": 5145 + }, + { + "entropy": 0.42156214118003843, + "epoch": 2.9063205417607225, + "grad_norm": 1.6763156652450562, + "learning_rate": 4.808055168522938e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8945571660995484, + "num_tokens": 41813917.0, + "step": 5150 + }, + { + "entropy": 0.4589288830757141, + "epoch": 2.909142212189616, + "grad_norm": 1.4989210367202759, + "learning_rate": 4.807688716884262e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8893625497817993, + "num_tokens": 41854234.0, + "step": 5155 + }, + { + "entropy": 0.45793223977088926, + "epoch": 2.91196388261851, + "grad_norm": 1.8785529136657715, + "learning_rate": 4.807321934681168e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8890782237052918, + "num_tokens": 41895042.0, + "step": 5160 + }, + { + "entropy": 0.42541446089744567, + "epoch": 2.914785553047404, + "grad_norm": 1.4923590421676636, + "learning_rate": 4.806954821985711e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8974828958511353, + "num_tokens": 41935666.0, + "step": 5165 + }, + { + "entropy": 0.411184823513031, + "epoch": 2.917607223476298, + "grad_norm": 1.8144638538360596, + "learning_rate": 4.806587378870011e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.9010491013526917, + "num_tokens": 41976246.0, + "step": 5170 + }, + { + "entropy": 0.4226935863494873, + "epoch": 2.920428893905192, + "grad_norm": 1.6548048257827759, + "learning_rate": 4.806219605406253e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8946655511856079, + "num_tokens": 42016979.0, + "step": 5175 + }, + { + "entropy": 0.4352991938591003, + "epoch": 2.9232505643340856, + "grad_norm": 1.8966737985610962, + "learning_rate": 4.805851501666683e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8924818277359009, + "num_tokens": 42057536.0, + "step": 5180 + }, + { + "entropy": 0.43102755546569826, + "epoch": 2.9260722347629797, + "grad_norm": 1.7986955642700195, + "learning_rate": 4.805483067723618e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8937864661216736, + "num_tokens": 42097826.0, + "step": 5185 + }, + { + "entropy": 0.4175034463405609, + "epoch": 2.9288939051918734, + "grad_norm": 1.5185545682907104, + "learning_rate": 4.805114303649436e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8953788757324219, + "num_tokens": 42138400.0, + "step": 5190 + }, + { + "entropy": 0.4255950450897217, + "epoch": 2.9317155756207676, + "grad_norm": 1.8746482133865356, + "learning_rate": 4.80474520951658e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.9003988265991211, + "num_tokens": 42178987.0, + "step": 5195 + }, + { + "entropy": 0.41336881518363955, + "epoch": 2.9345372460496613, + "grad_norm": 1.6806355714797974, + "learning_rate": 4.80437578539756e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.9040932536125184, + "num_tokens": 42219657.0, + "step": 5200 + }, + { + "entropy": 0.4057203710079193, + "epoch": 2.9373589164785554, + "grad_norm": 1.564095377922058, + "learning_rate": 4.804006031364948e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.9054961562156677, + "num_tokens": 42260473.0, + "step": 5205 + }, + { + "entropy": 0.39404407143592834, + "epoch": 2.940180586907449, + "grad_norm": 1.4518405199050903, + "learning_rate": 4.8036359474913826e-06, + "loss": 0.288, + "mean_token_accuracy": 0.9059139609336853, + "num_tokens": 42301055.0, + "step": 5210 + }, + { + "entropy": 0.3924104571342468, + "epoch": 2.9430022573363432, + "grad_norm": 1.319478154182434, + "learning_rate": 4.803265533849569e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.9025569319725036, + "num_tokens": 42341730.0, + "step": 5215 + }, + { + "entropy": 0.3995777368545532, + "epoch": 2.945823927765237, + "grad_norm": 1.4966541528701782, + "learning_rate": 4.802894790512271e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.901541805267334, + "num_tokens": 42382070.0, + "step": 5220 + }, + { + "entropy": 0.4068488717079163, + "epoch": 2.948645598194131, + "grad_norm": 1.8578685522079468, + "learning_rate": 4.8025237175523245e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.9044872045516967, + "num_tokens": 42422271.0, + "step": 5225 + }, + { + "entropy": 0.40595067739486695, + "epoch": 2.9514672686230248, + "grad_norm": 1.6572576761245728, + "learning_rate": 4.8021523150426255e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.9003460884094239, + "num_tokens": 42463045.0, + "step": 5230 + }, + { + "entropy": 0.4293433666229248, + "epoch": 2.954288939051919, + "grad_norm": 1.6312439441680908, + "learning_rate": 4.801780583056135e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.9005551815032959, + "num_tokens": 42503743.0, + "step": 5235 + }, + { + "entropy": 0.43949187994003297, + "epoch": 2.9571106094808126, + "grad_norm": 1.5055447816848755, + "learning_rate": 4.8014085216658824e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8934733510017395, + "num_tokens": 42544394.0, + "step": 5240 + }, + { + "entropy": 0.3744525730609894, + "epoch": 2.9599322799097063, + "grad_norm": 1.7301849126815796, + "learning_rate": 4.801036130944957e-06, + "loss": 0.29, + "mean_token_accuracy": 0.9071328043937683, + "num_tokens": 42584858.0, + "step": 5245 + }, + { + "entropy": 0.41108238101005556, + "epoch": 2.9627539503386005, + "grad_norm": 1.419072151184082, + "learning_rate": 4.800663410966516e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8972790360450744, + "num_tokens": 42625252.0, + "step": 5250 + }, + { + "entropy": 0.4198480904102325, + "epoch": 2.9655756207674946, + "grad_norm": 1.5576399564743042, + "learning_rate": 4.80029036180378e-06, + "loss": 0.312, + "mean_token_accuracy": 0.9000542402267456, + "num_tokens": 42665780.0, + "step": 5255 + }, + { + "entropy": 0.4146255135536194, + "epoch": 2.9683972911963883, + "grad_norm": 1.5485845804214478, + "learning_rate": 4.799916983530035e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.9038738012313843, + "num_tokens": 42706347.0, + "step": 5260 + }, + { + "entropy": 0.4407613933086395, + "epoch": 2.971218961625282, + "grad_norm": 1.6536972522735596, + "learning_rate": 4.7995432762186305e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8906416058540344, + "num_tokens": 42747026.0, + "step": 5265 + }, + { + "entropy": 0.4324296176433563, + "epoch": 2.974040632054176, + "grad_norm": 1.9314972162246704, + "learning_rate": 4.799169239942982e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8953546643257141, + "num_tokens": 42787529.0, + "step": 5270 + }, + { + "entropy": 0.40229780673980714, + "epoch": 2.97686230248307, + "grad_norm": 1.437831163406372, + "learning_rate": 4.798794874776569e-06, + "loss": 0.301, + "mean_token_accuracy": 0.9024040460586548, + "num_tokens": 42828263.0, + "step": 5275 + }, + { + "entropy": 0.4193461239337921, + "epoch": 2.979683972911964, + "grad_norm": 1.6553016901016235, + "learning_rate": 4.798420180792934e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8927172064781189, + "num_tokens": 42868861.0, + "step": 5280 + }, + { + "entropy": 0.39387176036834715, + "epoch": 2.9825056433408577, + "grad_norm": 1.827269434928894, + "learning_rate": 4.7980451580656884e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.9013678908348084, + "num_tokens": 42909719.0, + "step": 5285 + }, + { + "entropy": 0.4310999274253845, + "epoch": 2.985327313769752, + "grad_norm": 1.4325158596038818, + "learning_rate": 4.797669806668504e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8959625005722046, + "num_tokens": 42950331.0, + "step": 5290 + }, + { + "entropy": 0.3933727502822876, + "epoch": 2.9881489841986455, + "grad_norm": 1.5803815126419067, + "learning_rate": 4.797294126675117e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.9042976379394532, + "num_tokens": 42991038.0, + "step": 5295 + }, + { + "entropy": 0.4161331236362457, + "epoch": 2.9909706546275396, + "grad_norm": 1.6079249382019043, + "learning_rate": 4.796918118159333e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8964751839637757, + "num_tokens": 43031524.0, + "step": 5300 + }, + { + "entropy": 0.41101468205451963, + "epoch": 2.9937923250564333, + "grad_norm": 1.6859304904937744, + "learning_rate": 4.796541781195018e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8995181202888489, + "num_tokens": 43072136.0, + "step": 5305 + }, + { + "entropy": 0.415380471944809, + "epoch": 2.9966139954853275, + "grad_norm": 1.5194308757781982, + "learning_rate": 4.796165115856101e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8985631585121154, + "num_tokens": 43111676.0, + "step": 5310 + }, + { + "entropy": 0.4073172271251678, + "epoch": 2.999435665914221, + "grad_norm": 1.6618201732635498, + "learning_rate": 4.79578812221658e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8969994068145752, + "num_tokens": 43152259.0, + "step": 5315 + }, + { + "entropy": 0.40902658700942995, + "epoch": 3.0022573363431153, + "grad_norm": 1.5331395864486694, + "learning_rate": 4.795410800350516e-06, + "loss": 0.2449, + "mean_token_accuracy": 0.9244343876838684, + "num_tokens": 43186514.0, + "step": 5320 + }, + { + "entropy": 0.3297205209732056, + "epoch": 3.005079006772009, + "grad_norm": 1.183812141418457, + "learning_rate": 4.795033150332033e-06, + "loss": 0.2, + "mean_token_accuracy": 0.93786141872406, + "num_tokens": 43227120.0, + "step": 5325 + }, + { + "entropy": 0.32953916788101195, + "epoch": 3.007900677200903, + "grad_norm": 1.991351842880249, + "learning_rate": 4.79465517223532e-06, + "loss": 0.2247, + "mean_token_accuracy": 0.9283014059066772, + "num_tokens": 43267656.0, + "step": 5330 + }, + { + "entropy": 0.3248750567436218, + "epoch": 3.010722347629797, + "grad_norm": 1.9206222295761108, + "learning_rate": 4.794276866134631e-06, + "loss": 0.2204, + "mean_token_accuracy": 0.9299774169921875, + "num_tokens": 43308512.0, + "step": 5335 + }, + { + "entropy": 0.29567849040031435, + "epoch": 3.0135440180586905, + "grad_norm": 1.8526611328125, + "learning_rate": 4.793898232104286e-06, + "loss": 0.2006, + "mean_token_accuracy": 0.9349032402038574, + "num_tokens": 43349318.0, + "step": 5340 + }, + { + "entropy": 0.29463449120521545, + "epoch": 3.0163656884875847, + "grad_norm": 1.8326590061187744, + "learning_rate": 4.7935192702186655e-06, + "loss": 0.1999, + "mean_token_accuracy": 0.936297345161438, + "num_tokens": 43389542.0, + "step": 5345 + }, + { + "entropy": 0.322478848695755, + "epoch": 3.0191873589164784, + "grad_norm": 1.6944438219070435, + "learning_rate": 4.793139980552218e-06, + "loss": 0.2056, + "mean_token_accuracy": 0.9352605819702149, + "num_tokens": 43430125.0, + "step": 5350 + }, + { + "entropy": 0.28133447766304015, + "epoch": 3.0220090293453725, + "grad_norm": 5.7940263748168945, + "learning_rate": 4.792760363179454e-06, + "loss": 0.1745, + "mean_token_accuracy": 0.9443145155906677, + "num_tokens": 43470808.0, + "step": 5355 + }, + { + "entropy": 0.31858010292053224, + "epoch": 3.024830699774266, + "grad_norm": 2.1480491161346436, + "learning_rate": 4.79238041817495e-06, + "loss": 0.1987, + "mean_token_accuracy": 0.936722457408905, + "num_tokens": 43511437.0, + "step": 5360 + }, + { + "entropy": 0.27969213724136355, + "epoch": 3.0276523702031604, + "grad_norm": 1.8360697031021118, + "learning_rate": 4.792000145613346e-06, + "loss": 0.1816, + "mean_token_accuracy": 0.9421907782554626, + "num_tokens": 43552179.0, + "step": 5365 + }, + { + "entropy": 0.3075882375240326, + "epoch": 3.030474040632054, + "grad_norm": 1.9553619623184204, + "learning_rate": 4.791619545569347e-06, + "loss": 0.2032, + "mean_token_accuracy": 0.9348996162414551, + "num_tokens": 43592847.0, + "step": 5370 + }, + { + "entropy": 0.3161883890628815, + "epoch": 3.033295711060948, + "grad_norm": 1.7996515035629272, + "learning_rate": 4.7912386181177216e-06, + "loss": 0.2017, + "mean_token_accuracy": 0.9368625760078431, + "num_tokens": 43633494.0, + "step": 5375 + }, + { + "entropy": 0.31576813459396363, + "epoch": 3.036117381489842, + "grad_norm": 1.7700996398925781, + "learning_rate": 4.790857363333303e-06, + "loss": 0.2138, + "mean_token_accuracy": 0.9302696347236633, + "num_tokens": 43674104.0, + "step": 5380 + }, + { + "entropy": 0.2922771334648132, + "epoch": 3.038939051918736, + "grad_norm": 2.0567281246185303, + "learning_rate": 4.790475781290988e-06, + "loss": 0.1944, + "mean_token_accuracy": 0.9376009941101074, + "num_tokens": 43714718.0, + "step": 5385 + }, + { + "entropy": 0.3182399868965149, + "epoch": 3.0417607223476297, + "grad_norm": 1.8016152381896973, + "learning_rate": 4.79009387206574e-06, + "loss": 0.2071, + "mean_token_accuracy": 0.9343981266021728, + "num_tokens": 43755355.0, + "step": 5390 + }, + { + "entropy": 0.27976890206336974, + "epoch": 3.044582392776524, + "grad_norm": 1.6723804473876953, + "learning_rate": 4.7897116357325844e-06, + "loss": 0.174, + "mean_token_accuracy": 0.9448309302330017, + "num_tokens": 43795791.0, + "step": 5395 + }, + { + "entropy": 0.30181885361671446, + "epoch": 3.0474040632054176, + "grad_norm": 1.6413596868515015, + "learning_rate": 4.7893290723666116e-06, + "loss": 0.192, + "mean_token_accuracy": 0.938207459449768, + "num_tokens": 43836503.0, + "step": 5400 + }, + { + "entropy": 0.2982407629489899, + "epoch": 3.0502257336343117, + "grad_norm": 1.594096064567566, + "learning_rate": 4.788946182042976e-06, + "loss": 0.1967, + "mean_token_accuracy": 0.9364789009094239, + "num_tokens": 43877133.0, + "step": 5405 + }, + { + "entropy": 0.34292480945587156, + "epoch": 3.0530474040632054, + "grad_norm": 1.880804181098938, + "learning_rate": 4.788562964836897e-06, + "loss": 0.2291, + "mean_token_accuracy": 0.9263935923576355, + "num_tokens": 43917640.0, + "step": 5410 + }, + { + "entropy": 0.32324235439300536, + "epoch": 3.055869074492099, + "grad_norm": 1.8121103048324585, + "learning_rate": 4.788179420823657e-06, + "loss": 0.2245, + "mean_token_accuracy": 0.9280650019645691, + "num_tokens": 43958238.0, + "step": 5415 + }, + { + "entropy": 0.31875272989273074, + "epoch": 3.0586907449209932, + "grad_norm": 1.8845757246017456, + "learning_rate": 4.787795550078603e-06, + "loss": 0.2127, + "mean_token_accuracy": 0.9321471333503724, + "num_tokens": 43999127.0, + "step": 5420 + }, + { + "entropy": 0.306513249874115, + "epoch": 3.061512415349887, + "grad_norm": 1.7889739274978638, + "learning_rate": 4.787411352677148e-06, + "loss": 0.2052, + "mean_token_accuracy": 0.9344831585884095, + "num_tokens": 44039844.0, + "step": 5425 + }, + { + "entropy": 0.3005764603614807, + "epoch": 3.064334085778781, + "grad_norm": 1.6967425346374512, + "learning_rate": 4.787026828694767e-06, + "loss": 0.1913, + "mean_token_accuracy": 0.9385549426078796, + "num_tokens": 44080390.0, + "step": 5430 + }, + { + "entropy": 0.28662807643413546, + "epoch": 3.0671557562076748, + "grad_norm": 1.8293821811676025, + "learning_rate": 4.786641978206999e-06, + "loss": 0.1713, + "mean_token_accuracy": 0.9455971002578736, + "num_tokens": 44120988.0, + "step": 5435 + }, + { + "entropy": 0.3011536240577698, + "epoch": 3.069977426636569, + "grad_norm": 1.7703379392623901, + "learning_rate": 4.786256801289449e-06, + "loss": 0.1839, + "mean_token_accuracy": 0.9414896726608276, + "num_tokens": 44161183.0, + "step": 5440 + }, + { + "entropy": 0.31452121734619143, + "epoch": 3.0727990970654626, + "grad_norm": 1.900452733039856, + "learning_rate": 4.785871298017783e-06, + "loss": 0.1999, + "mean_token_accuracy": 0.9365707755088806, + "num_tokens": 44201881.0, + "step": 5445 + }, + { + "entropy": 0.296010160446167, + "epoch": 3.0756207674943568, + "grad_norm": 2.206284761428833, + "learning_rate": 4.7854854684677345e-06, + "loss": 0.1944, + "mean_token_accuracy": 0.9396218180656433, + "num_tokens": 44242707.0, + "step": 5450 + }, + { + "entropy": 0.3006631314754486, + "epoch": 3.0784424379232505, + "grad_norm": 1.720755934715271, + "learning_rate": 4.785099312715101e-06, + "loss": 0.1956, + "mean_token_accuracy": 0.9376758575439453, + "num_tokens": 44283160.0, + "step": 5455 + }, + { + "entropy": 0.28210367560386657, + "epoch": 3.0812641083521446, + "grad_norm": 1.6647253036499023, + "learning_rate": 4.7847128308357414e-06, + "loss": 0.1767, + "mean_token_accuracy": 0.943335497379303, + "num_tokens": 44323781.0, + "step": 5460 + }, + { + "entropy": 0.32272166609764097, + "epoch": 3.0840857787810383, + "grad_norm": 2.004971742630005, + "learning_rate": 4.7843260229055805e-06, + "loss": 0.209, + "mean_token_accuracy": 0.9329189896583557, + "num_tokens": 44364364.0, + "step": 5465 + }, + { + "entropy": 0.3206708192825317, + "epoch": 3.0869074492099324, + "grad_norm": 2.0899529457092285, + "learning_rate": 4.7839388890006065e-06, + "loss": 0.2202, + "mean_token_accuracy": 0.9304483771324158, + "num_tokens": 44404981.0, + "step": 5470 + }, + { + "entropy": 0.33442134261131284, + "epoch": 3.089729119638826, + "grad_norm": 2.028860092163086, + "learning_rate": 4.783551429196872e-06, + "loss": 0.2262, + "mean_token_accuracy": 0.9287503361701965, + "num_tokens": 44445757.0, + "step": 5475 + }, + { + "entropy": 0.31830272674560545, + "epoch": 3.0925507900677203, + "grad_norm": 1.8910348415374756, + "learning_rate": 4.783163643570493e-06, + "loss": 0.2219, + "mean_token_accuracy": 0.9306220650672913, + "num_tokens": 44486469.0, + "step": 5480 + }, + { + "entropy": 0.2903516858816147, + "epoch": 3.095372460496614, + "grad_norm": 1.8281724452972412, + "learning_rate": 4.782775532197652e-06, + "loss": 0.2034, + "mean_token_accuracy": 0.9371914029121399, + "num_tokens": 44527281.0, + "step": 5485 + }, + { + "entropy": 0.2952991366386414, + "epoch": 3.098194130925508, + "grad_norm": 1.6057690382003784, + "learning_rate": 4.7823870951545924e-06, + "loss": 0.2059, + "mean_token_accuracy": 0.9358728647232055, + "num_tokens": 44567483.0, + "step": 5490 + }, + { + "entropy": 0.3091245710849762, + "epoch": 3.101015801354402, + "grad_norm": 1.645340919494629, + "learning_rate": 4.781998332517621e-06, + "loss": 0.2082, + "mean_token_accuracy": 0.9315654993057251, + "num_tokens": 44608171.0, + "step": 5495 + }, + { + "entropy": 0.28429390788078307, + "epoch": 3.1038374717832955, + "grad_norm": 1.748238205909729, + "learning_rate": 4.781609244363113e-06, + "loss": 0.1902, + "mean_token_accuracy": 0.9393861174583436, + "num_tokens": 44648818.0, + "step": 5500 + }, + { + "epoch": 3.1038374717832955, + "eval_entropy": 0.31511667370796204, + "eval_loss": 0.1905011534690857, + "eval_mean_token_accuracy": 0.9417401552200317, + "eval_num_tokens": 44648818.0, + "eval_runtime": 0.1638, + "eval_samples_per_second": 24.418, + "eval_steps_per_second": 6.104, + "step": 5500 + }, + { + "entropy": 0.291202050447464, + "epoch": 3.1066591422121896, + "grad_norm": 1.549842357635498, + "learning_rate": 4.781219830767503e-06, + "loss": 0.1874, + "mean_token_accuracy": 0.9398093223571777, + "num_tokens": 44689512.0, + "step": 5505 + }, + { + "entropy": 0.32962539196014407, + "epoch": 3.1094808126410833, + "grad_norm": 1.8189475536346436, + "learning_rate": 4.780830091807293e-06, + "loss": 0.2264, + "mean_token_accuracy": 0.9287977695465088, + "num_tokens": 44729392.0, + "step": 5510 + }, + { + "entropy": 0.3322260320186615, + "epoch": 3.1123024830699775, + "grad_norm": 1.7237313985824585, + "learning_rate": 4.780440027559045e-06, + "loss": 0.2275, + "mean_token_accuracy": 0.9315350294113159, + "num_tokens": 44770052.0, + "step": 5515 + }, + { + "entropy": 0.3075240433216095, + "epoch": 3.115124153498871, + "grad_norm": 1.6741464138031006, + "learning_rate": 4.780049638099389e-06, + "loss": 0.2057, + "mean_token_accuracy": 0.9344268202781677, + "num_tokens": 44810590.0, + "step": 5520 + }, + { + "entropy": 0.3267399907112122, + "epoch": 3.1179458239277653, + "grad_norm": 1.500756859779358, + "learning_rate": 4.779658923505016e-06, + "loss": 0.2024, + "mean_token_accuracy": 0.9370131492614746, + "num_tokens": 44851209.0, + "step": 5525 + }, + { + "entropy": 0.30828378200531004, + "epoch": 3.120767494356659, + "grad_norm": 1.5910139083862305, + "learning_rate": 4.779267883852683e-06, + "loss": 0.1998, + "mean_token_accuracy": 0.9370311021804809, + "num_tokens": 44891316.0, + "step": 5530 + }, + { + "entropy": 0.29669389128685, + "epoch": 3.123589164785553, + "grad_norm": 1.7703447341918945, + "learning_rate": 4.778876519219208e-06, + "loss": 0.1821, + "mean_token_accuracy": 0.9407304763793946, + "num_tokens": 44932141.0, + "step": 5535 + }, + { + "entropy": 0.29509271383285524, + "epoch": 3.126410835214447, + "grad_norm": 1.945721983909607, + "learning_rate": 4.778484829681477e-06, + "loss": 0.1991, + "mean_token_accuracy": 0.9373913645744324, + "num_tokens": 44972742.0, + "step": 5540 + }, + { + "entropy": 0.2876304090023041, + "epoch": 3.129232505643341, + "grad_norm": 1.8770701885223389, + "learning_rate": 4.778092815316436e-06, + "loss": 0.1877, + "mean_token_accuracy": 0.9409420013427734, + "num_tokens": 45013573.0, + "step": 5545 + }, + { + "entropy": 0.33961347937583924, + "epoch": 3.1320541760722347, + "grad_norm": 1.8638192415237427, + "learning_rate": 4.777700476201096e-06, + "loss": 0.2317, + "mean_token_accuracy": 0.9269558072090149, + "num_tokens": 45054408.0, + "step": 5550 + }, + { + "entropy": 0.3194903790950775, + "epoch": 3.134875846501129, + "grad_norm": 1.9824572801589966, + "learning_rate": 4.777307812412533e-06, + "loss": 0.2087, + "mean_token_accuracy": 0.9329106450080872, + "num_tokens": 45094995.0, + "step": 5555 + }, + { + "entropy": 0.3165232837200165, + "epoch": 3.1376975169300225, + "grad_norm": 1.8498613834381104, + "learning_rate": 4.776914824027885e-06, + "loss": 0.1959, + "mean_token_accuracy": 0.9351617097854614, + "num_tokens": 45135825.0, + "step": 5560 + }, + { + "entropy": 0.31093028783798216, + "epoch": 3.1405191873589167, + "grad_norm": 1.8327690362930298, + "learning_rate": 4.776521511124356e-06, + "loss": 0.2014, + "mean_token_accuracy": 0.9345197558403016, + "num_tokens": 45176378.0, + "step": 5565 + }, + { + "entropy": 0.2836467266082764, + "epoch": 3.1433408577878104, + "grad_norm": 1.7564665079116821, + "learning_rate": 4.7761278737792115e-06, + "loss": 0.1741, + "mean_token_accuracy": 0.9437781810760498, + "num_tokens": 45216787.0, + "step": 5570 + }, + { + "entropy": 0.2903373181819916, + "epoch": 3.1461625282167045, + "grad_norm": 1.730517029762268, + "learning_rate": 4.775733912069781e-06, + "loss": 0.1928, + "mean_token_accuracy": 0.9383692383766175, + "num_tokens": 45257435.0, + "step": 5575 + }, + { + "entropy": 0.27841404676437376, + "epoch": 3.148984198645598, + "grad_norm": 1.6484125852584839, + "learning_rate": 4.775339626073458e-06, + "loss": 0.1796, + "mean_token_accuracy": 0.9405380487442017, + "num_tokens": 45298209.0, + "step": 5580 + }, + { + "entropy": 0.30049493312835696, + "epoch": 3.151805869074492, + "grad_norm": 1.5576494932174683, + "learning_rate": 4.774945015867702e-06, + "loss": 0.1956, + "mean_token_accuracy": 0.9364784240722657, + "num_tokens": 45338825.0, + "step": 5585 + }, + { + "entropy": 0.31532043814659116, + "epoch": 3.154627539503386, + "grad_norm": 1.8203058242797852, + "learning_rate": 4.774550081530034e-06, + "loss": 0.1975, + "mean_token_accuracy": 0.9349407196044922, + "num_tokens": 45379636.0, + "step": 5590 + }, + { + "entropy": 0.29182214140892027, + "epoch": 3.1574492099322797, + "grad_norm": 1.9475021362304688, + "learning_rate": 4.774154823138037e-06, + "loss": 0.1936, + "mean_token_accuracy": 0.9374213933944702, + "num_tokens": 45420353.0, + "step": 5595 + }, + { + "entropy": 0.32046533823013307, + "epoch": 3.160270880361174, + "grad_norm": 1.7757936716079712, + "learning_rate": 4.773759240769361e-06, + "loss": 0.2002, + "mean_token_accuracy": 0.9345276355743408, + "num_tokens": 45460828.0, + "step": 5600 + }, + { + "entropy": 0.29744908809661863, + "epoch": 3.1630925507900676, + "grad_norm": 1.8185851573944092, + "learning_rate": 4.773363334501717e-06, + "loss": 0.1847, + "mean_token_accuracy": 0.939760959148407, + "num_tokens": 45501378.0, + "step": 5605 + }, + { + "entropy": 0.2820535182952881, + "epoch": 3.1659142212189617, + "grad_norm": 1.843806266784668, + "learning_rate": 4.772967104412882e-06, + "loss": 0.1958, + "mean_token_accuracy": 0.9386505484580994, + "num_tokens": 45542183.0, + "step": 5610 + }, + { + "entropy": 0.2983327269554138, + "epoch": 3.1687358916478554, + "grad_norm": 1.7788043022155762, + "learning_rate": 4.772570550580696e-06, + "loss": 0.1908, + "mean_token_accuracy": 0.939252245426178, + "num_tokens": 45582722.0, + "step": 5615 + }, + { + "entropy": 0.3242061614990234, + "epoch": 3.1715575620767495, + "grad_norm": 1.9367573261260986, + "learning_rate": 4.77217367308306e-06, + "loss": 0.2153, + "mean_token_accuracy": 0.9323940753936768, + "num_tokens": 45623151.0, + "step": 5620 + }, + { + "entropy": 0.32598050832748415, + "epoch": 3.1743792325056432, + "grad_norm": 1.8925623893737793, + "learning_rate": 4.7717764719979425e-06, + "loss": 0.2206, + "mean_token_accuracy": 0.9283890128135681, + "num_tokens": 45663702.0, + "step": 5625 + }, + { + "entropy": 0.296782386302948, + "epoch": 3.1772009029345374, + "grad_norm": 2.1091501712799072, + "learning_rate": 4.771378947403374e-06, + "loss": 0.2048, + "mean_token_accuracy": 0.9344670414924622, + "num_tokens": 45704366.0, + "step": 5630 + }, + { + "entropy": 0.3019926369190216, + "epoch": 3.180022573363431, + "grad_norm": 1.747178077697754, + "learning_rate": 4.770981099377445e-06, + "loss": 0.2092, + "mean_token_accuracy": 0.9341341137886048, + "num_tokens": 45744753.0, + "step": 5635 + }, + { + "entropy": 0.31195615530014037, + "epoch": 3.1828442437923252, + "grad_norm": 2.096071243286133, + "learning_rate": 4.7705829279983165e-06, + "loss": 0.2089, + "mean_token_accuracy": 0.9316715359687805, + "num_tokens": 45784713.0, + "step": 5640 + }, + { + "entropy": 0.3224526882171631, + "epoch": 3.185665914221219, + "grad_norm": 2.126277446746826, + "learning_rate": 4.770184433344207e-06, + "loss": 0.2126, + "mean_token_accuracy": 0.9330295085906982, + "num_tokens": 45825418.0, + "step": 5645 + }, + { + "entropy": 0.33095919489860537, + "epoch": 3.188487584650113, + "grad_norm": 1.8362377882003784, + "learning_rate": 4.769785615493403e-06, + "loss": 0.2234, + "mean_token_accuracy": 0.9276701092720032, + "num_tokens": 45866068.0, + "step": 5650 + }, + { + "entropy": 0.290876042842865, + "epoch": 3.1913092550790068, + "grad_norm": 1.6160290241241455, + "learning_rate": 4.76938647452425e-06, + "loss": 0.1988, + "mean_token_accuracy": 0.9370180249214173, + "num_tokens": 45906929.0, + "step": 5655 + }, + { + "entropy": 0.2966496706008911, + "epoch": 3.194130925507901, + "grad_norm": 1.9973173141479492, + "learning_rate": 4.76898701051516e-06, + "loss": 0.2004, + "mean_token_accuracy": 0.9346588611602783, + "num_tokens": 45947800.0, + "step": 5660 + }, + { + "entropy": 0.3274839758872986, + "epoch": 3.1969525959367946, + "grad_norm": 1.7757952213287354, + "learning_rate": 4.768587223544609e-06, + "loss": 0.2114, + "mean_token_accuracy": 0.9339724779129028, + "num_tokens": 45987994.0, + "step": 5665 + }, + { + "entropy": 0.31054744124412537, + "epoch": 3.1997742663656883, + "grad_norm": 1.8302743434906006, + "learning_rate": 4.768187113691134e-06, + "loss": 0.1925, + "mean_token_accuracy": 0.937913966178894, + "num_tokens": 46028630.0, + "step": 5670 + }, + { + "entropy": 0.296200168132782, + "epoch": 3.2025959367945824, + "grad_norm": 1.6775931119918823, + "learning_rate": 4.767786681033337e-06, + "loss": 0.1923, + "mean_token_accuracy": 0.9360583305358887, + "num_tokens": 46069224.0, + "step": 5675 + }, + { + "entropy": 0.31996251940727233, + "epoch": 3.205417607223476, + "grad_norm": 2.0050246715545654, + "learning_rate": 4.767385925649883e-06, + "loss": 0.2124, + "mean_token_accuracy": 0.9327707052230835, + "num_tokens": 46109724.0, + "step": 5680 + }, + { + "entropy": 0.2851140141487122, + "epoch": 3.2082392776523703, + "grad_norm": 2.052438497543335, + "learning_rate": 4.7669848476195005e-06, + "loss": 0.1954, + "mean_token_accuracy": 0.9375227808952331, + "num_tokens": 46150400.0, + "step": 5685 + }, + { + "entropy": 0.2947817206382751, + "epoch": 3.211060948081264, + "grad_norm": 1.9339162111282349, + "learning_rate": 4.766583447020981e-06, + "loss": 0.2079, + "mean_token_accuracy": 0.9334628462791443, + "num_tokens": 46191151.0, + "step": 5690 + }, + { + "entropy": 0.3035987734794617, + "epoch": 3.213882618510158, + "grad_norm": 1.9785780906677246, + "learning_rate": 4.76618172393318e-06, + "loss": 0.2027, + "mean_token_accuracy": 0.9366864800453186, + "num_tokens": 46231643.0, + "step": 5695 + }, + { + "entropy": 0.31024947464466096, + "epoch": 3.216704288939052, + "grad_norm": 1.4870526790618896, + "learning_rate": 4.765779678435016e-06, + "loss": 0.2126, + "mean_token_accuracy": 0.9308468461036682, + "num_tokens": 46272392.0, + "step": 5700 + }, + { + "entropy": 0.31926954388618467, + "epoch": 3.219525959367946, + "grad_norm": 1.933563470840454, + "learning_rate": 4.76537731060547e-06, + "loss": 0.219, + "mean_token_accuracy": 0.9304630041122437, + "num_tokens": 46313068.0, + "step": 5705 + }, + { + "entropy": 0.3053418666124344, + "epoch": 3.2223476297968396, + "grad_norm": 1.7980693578720093, + "learning_rate": 4.764974620523589e-06, + "loss": 0.1975, + "mean_token_accuracy": 0.9364609718322754, + "num_tokens": 46353615.0, + "step": 5710 + }, + { + "entropy": 0.29312550127506254, + "epoch": 3.225169300225734, + "grad_norm": 1.8937259912490845, + "learning_rate": 4.764571608268481e-06, + "loss": 0.1914, + "mean_token_accuracy": 0.9379542708396912, + "num_tokens": 46394082.0, + "step": 5715 + }, + { + "entropy": 0.3227978229522705, + "epoch": 3.2279909706546275, + "grad_norm": 1.9451820850372314, + "learning_rate": 4.764168273919317e-06, + "loss": 0.2264, + "mean_token_accuracy": 0.9285582900047302, + "num_tokens": 46434664.0, + "step": 5720 + }, + { + "entropy": 0.2956021040678024, + "epoch": 3.2308126410835216, + "grad_norm": 2.080122947692871, + "learning_rate": 4.7637646175553325e-06, + "loss": 0.2137, + "mean_token_accuracy": 0.9323472619056702, + "num_tokens": 46475192.0, + "step": 5725 + }, + { + "entropy": 0.32532267570495604, + "epoch": 3.2336343115124153, + "grad_norm": 1.788669466972351, + "learning_rate": 4.763360639255826e-06, + "loss": 0.2257, + "mean_token_accuracy": 0.9298007488250732, + "num_tokens": 46515676.0, + "step": 5730 + }, + { + "entropy": 0.3365131139755249, + "epoch": 3.2364559819413095, + "grad_norm": 1.9396251440048218, + "learning_rate": 4.762956339100158e-06, + "loss": 0.2328, + "mean_token_accuracy": 0.9281740784645081, + "num_tokens": 46556056.0, + "step": 5735 + }, + { + "entropy": 0.2854636192321777, + "epoch": 3.239277652370203, + "grad_norm": 1.900076150894165, + "learning_rate": 4.762551717167756e-06, + "loss": 0.1883, + "mean_token_accuracy": 0.9398433446884156, + "num_tokens": 46596732.0, + "step": 5740 + }, + { + "entropy": 0.3118133544921875, + "epoch": 3.2420993227990973, + "grad_norm": 2.0240774154663086, + "learning_rate": 4.762146773538105e-06, + "loss": 0.211, + "mean_token_accuracy": 0.9326440811157226, + "num_tokens": 46637432.0, + "step": 5745 + }, + { + "entropy": 0.293948894739151, + "epoch": 3.244920993227991, + "grad_norm": 1.7449288368225098, + "learning_rate": 4.7617415082907575e-06, + "loss": 0.198, + "mean_token_accuracy": 0.9358683586120605, + "num_tokens": 46678134.0, + "step": 5750 + }, + { + "entropy": 0.32562840580940244, + "epoch": 3.2477426636568847, + "grad_norm": 1.7750238180160522, + "learning_rate": 4.761335921505329e-06, + "loss": 0.2237, + "mean_token_accuracy": 0.929267966747284, + "num_tokens": 46718630.0, + "step": 5755 + }, + { + "entropy": 0.2856466591358185, + "epoch": 3.250564334085779, + "grad_norm": 1.5713127851486206, + "learning_rate": 4.760930013261495e-06, + "loss": 0.1906, + "mean_token_accuracy": 0.9386573076248169, + "num_tokens": 46759404.0, + "step": 5760 + }, + { + "entropy": 0.31055004596710206, + "epoch": 3.2533860045146725, + "grad_norm": 1.7997633218765259, + "learning_rate": 4.760523783638997e-06, + "loss": 0.1985, + "mean_token_accuracy": 0.9364279508590698, + "num_tokens": 46799993.0, + "step": 5765 + }, + { + "entropy": 0.317557692527771, + "epoch": 3.2562076749435667, + "grad_norm": 1.7583611011505127, + "learning_rate": 4.76011723271764e-06, + "loss": 0.2218, + "mean_token_accuracy": 0.9295771718025208, + "num_tokens": 46840518.0, + "step": 5770 + }, + { + "entropy": 0.3186119019985199, + "epoch": 3.2590293453724604, + "grad_norm": 2.132106065750122, + "learning_rate": 4.75971036057729e-06, + "loss": 0.2093, + "mean_token_accuracy": 0.9330814242362976, + "num_tokens": 46881236.0, + "step": 5775 + }, + { + "entropy": 0.2968993723392487, + "epoch": 3.2618510158013545, + "grad_norm": 1.693253993988037, + "learning_rate": 4.759303167297877e-06, + "loss": 0.1979, + "mean_token_accuracy": 0.9351974129676819, + "num_tokens": 46921984.0, + "step": 5780 + }, + { + "entropy": 0.2921249568462372, + "epoch": 3.264672686230248, + "grad_norm": 2.1287200450897217, + "learning_rate": 4.758895652959394e-06, + "loss": 0.198, + "mean_token_accuracy": 0.9361931085586548, + "num_tokens": 46962730.0, + "step": 5785 + }, + { + "entropy": 0.3355319321155548, + "epoch": 3.2674943566591423, + "grad_norm": 1.6830389499664307, + "learning_rate": 4.758487817641898e-06, + "loss": 0.2355, + "mean_token_accuracy": 0.9263626217842102, + "num_tokens": 47002800.0, + "step": 5790 + }, + { + "entropy": 0.32140293121337893, + "epoch": 3.270316027088036, + "grad_norm": 1.845828652381897, + "learning_rate": 4.758079661425508e-06, + "loss": 0.2129, + "mean_token_accuracy": 0.9305433511734009, + "num_tokens": 47043526.0, + "step": 5795 + }, + { + "entropy": 0.32089059948921206, + "epoch": 3.27313769751693, + "grad_norm": 1.3842658996582031, + "learning_rate": 4.757671184390406e-06, + "loss": 0.2152, + "mean_token_accuracy": 0.9322557806968689, + "num_tokens": 47084134.0, + "step": 5800 + }, + { + "entropy": 0.33169599771499636, + "epoch": 3.275959367945824, + "grad_norm": 1.901238203048706, + "learning_rate": 4.757262386616837e-06, + "loss": 0.2173, + "mean_token_accuracy": 0.9306012511253356, + "num_tokens": 47124773.0, + "step": 5805 + }, + { + "entropy": 0.3124689519405365, + "epoch": 3.278781038374718, + "grad_norm": 2.120180368423462, + "learning_rate": 4.75685326818511e-06, + "loss": 0.2199, + "mean_token_accuracy": 0.9284607768058777, + "num_tokens": 47165252.0, + "step": 5810 + }, + { + "entropy": 0.28955113887786865, + "epoch": 3.2816027088036117, + "grad_norm": 2.0301990509033203, + "learning_rate": 4.756443829175598e-06, + "loss": 0.1989, + "mean_token_accuracy": 0.9357622146606446, + "num_tokens": 47205788.0, + "step": 5815 + }, + { + "entropy": 0.33322848081588746, + "epoch": 3.2844243792325054, + "grad_norm": 1.8951489925384521, + "learning_rate": 4.756034069668732e-06, + "loss": 0.2272, + "mean_token_accuracy": 0.9254481196403503, + "num_tokens": 47246598.0, + "step": 5820 + }, + { + "entropy": 0.3263582348823547, + "epoch": 3.2872460496613995, + "grad_norm": 1.7902400493621826, + "learning_rate": 4.7556239897450116e-06, + "loss": 0.2064, + "mean_token_accuracy": 0.9346579432487487, + "num_tokens": 47287035.0, + "step": 5825 + }, + { + "entropy": 0.3200534522533417, + "epoch": 3.2900677200902937, + "grad_norm": 2.2464659214019775, + "learning_rate": 4.7552135894849965e-06, + "loss": 0.2172, + "mean_token_accuracy": 0.9306413531303406, + "num_tokens": 47327903.0, + "step": 5830 + }, + { + "entropy": 0.29628766179084776, + "epoch": 3.2928893905191874, + "grad_norm": 1.7990299463272095, + "learning_rate": 4.75480286896931e-06, + "loss": 0.1918, + "mean_token_accuracy": 0.9378930330276489, + "num_tokens": 47368326.0, + "step": 5835 + }, + { + "entropy": 0.29361318349838256, + "epoch": 3.295711060948081, + "grad_norm": 2.020881175994873, + "learning_rate": 4.754391828278638e-06, + "loss": 0.2086, + "mean_token_accuracy": 0.9333640217781067, + "num_tokens": 47409098.0, + "step": 5840 + }, + { + "entropy": 0.3207947850227356, + "epoch": 3.2985327313769752, + "grad_norm": 2.123802423477173, + "learning_rate": 4.753980467493729e-06, + "loss": 0.2166, + "mean_token_accuracy": 0.929839301109314, + "num_tokens": 47449708.0, + "step": 5845 + }, + { + "entropy": 0.314586877822876, + "epoch": 3.301354401805869, + "grad_norm": 1.9354177713394165, + "learning_rate": 4.753568786695395e-06, + "loss": 0.2091, + "mean_token_accuracy": 0.9324597477912903, + "num_tokens": 47490310.0, + "step": 5850 + }, + { + "entropy": 0.30399842858314513, + "epoch": 3.304176072234763, + "grad_norm": 1.9815207719802856, + "learning_rate": 4.753156785964512e-06, + "loss": 0.2005, + "mean_token_accuracy": 0.9356989383697509, + "num_tokens": 47530834.0, + "step": 5855 + }, + { + "entropy": 0.2951928973197937, + "epoch": 3.3069977426636568, + "grad_norm": 2.1382501125335693, + "learning_rate": 4.752744465382016e-06, + "loss": 0.1979, + "mean_token_accuracy": 0.9363240480422974, + "num_tokens": 47571446.0, + "step": 5860 + }, + { + "entropy": 0.30987902283668517, + "epoch": 3.309819413092551, + "grad_norm": 1.7661712169647217, + "learning_rate": 4.75233182502891e-06, + "loss": 0.1998, + "mean_token_accuracy": 0.9360772490501403, + "num_tokens": 47612058.0, + "step": 5865 + }, + { + "entropy": 0.29606603980064394, + "epoch": 3.3126410835214446, + "grad_norm": 1.722391963005066, + "learning_rate": 4.751918864986254e-06, + "loss": 0.1834, + "mean_token_accuracy": 0.9418554663658142, + "num_tokens": 47652000.0, + "step": 5870 + }, + { + "entropy": 0.29225061237812044, + "epoch": 3.3154627539503387, + "grad_norm": 2.109041929244995, + "learning_rate": 4.751505585335176e-06, + "loss": 0.2005, + "mean_token_accuracy": 0.9353228807449341, + "num_tokens": 47692508.0, + "step": 5875 + }, + { + "entropy": 0.29792939126491547, + "epoch": 3.3182844243792324, + "grad_norm": 1.6946020126342773, + "learning_rate": 4.751091986156864e-06, + "loss": 0.1992, + "mean_token_accuracy": 0.9354248166084289, + "num_tokens": 47733266.0, + "step": 5880 + }, + { + "entropy": 0.29692640602588655, + "epoch": 3.3211060948081266, + "grad_norm": 2.0159807205200195, + "learning_rate": 4.750678067532569e-06, + "loss": 0.1937, + "mean_token_accuracy": 0.9390650391578674, + "num_tokens": 47773918.0, + "step": 5885 + }, + { + "entropy": 0.29629198312759397, + "epoch": 3.3239277652370203, + "grad_norm": 2.0621113777160645, + "learning_rate": 4.750263829543608e-06, + "loss": 0.1907, + "mean_token_accuracy": 0.9389678955078125, + "num_tokens": 47814708.0, + "step": 5890 + }, + { + "entropy": 0.3075540721416473, + "epoch": 3.3267494356659144, + "grad_norm": 2.2669737339019775, + "learning_rate": 4.749849272271355e-06, + "loss": 0.2074, + "mean_token_accuracy": 0.9335337281227112, + "num_tokens": 47855163.0, + "step": 5895 + }, + { + "entropy": 0.3051905155181885, + "epoch": 3.329571106094808, + "grad_norm": 1.7983601093292236, + "learning_rate": 4.749434395797252e-06, + "loss": 0.2013, + "mean_token_accuracy": 0.9364488601684571, + "num_tokens": 47895912.0, + "step": 5900 + }, + { + "entropy": 0.30544620752334595, + "epoch": 3.332392776523702, + "grad_norm": 2.127479076385498, + "learning_rate": 4.749019200202801e-06, + "loss": 0.2216, + "mean_token_accuracy": 0.9293683290481567, + "num_tokens": 47936365.0, + "step": 5905 + }, + { + "entropy": 0.3163418352603912, + "epoch": 3.335214446952596, + "grad_norm": 1.8929691314697266, + "learning_rate": 4.748603685569566e-06, + "loss": 0.2188, + "mean_token_accuracy": 0.9315417289733887, + "num_tokens": 47977087.0, + "step": 5910 + }, + { + "entropy": 0.2961422860622406, + "epoch": 3.33803611738149, + "grad_norm": 1.8902735710144043, + "learning_rate": 4.7481878519791775e-06, + "loss": 0.2024, + "mean_token_accuracy": 0.9347827434539795, + "num_tokens": 48017600.0, + "step": 5915 + }, + { + "entropy": 0.3079638063907623, + "epoch": 3.340857787810384, + "grad_norm": 3.9220144748687744, + "learning_rate": 4.747771699513324e-06, + "loss": 0.204, + "mean_token_accuracy": 0.9359039187431335, + "num_tokens": 48058181.0, + "step": 5920 + }, + { + "entropy": 0.30697737336158754, + "epoch": 3.3436794582392775, + "grad_norm": 1.9922391176223755, + "learning_rate": 4.747355228253759e-06, + "loss": 0.2005, + "mean_token_accuracy": 0.9362172484397888, + "num_tokens": 48098969.0, + "step": 5925 + }, + { + "entropy": 0.29278679490089415, + "epoch": 3.3465011286681716, + "grad_norm": 1.6647714376449585, + "learning_rate": 4.746938438282297e-06, + "loss": 0.1868, + "mean_token_accuracy": 0.9388912916183472, + "num_tokens": 48139626.0, + "step": 5930 + }, + { + "entropy": 0.32587441205978396, + "epoch": 3.3493227990970653, + "grad_norm": 1.9658551216125488, + "learning_rate": 4.74652132968082e-06, + "loss": 0.2199, + "mean_token_accuracy": 0.931458306312561, + "num_tokens": 48180389.0, + "step": 5935 + }, + { + "entropy": 0.3118497312068939, + "epoch": 3.3521444695259595, + "grad_norm": 1.961451768875122, + "learning_rate": 4.746103902531266e-06, + "loss": 0.2137, + "mean_token_accuracy": 0.9307841777801513, + "num_tokens": 48221035.0, + "step": 5940 + }, + { + "entropy": 0.29938756227493285, + "epoch": 3.354966139954853, + "grad_norm": 1.9882782697677612, + "learning_rate": 4.7456861569156396e-06, + "loss": 0.2026, + "mean_token_accuracy": 0.9369753360748291, + "num_tokens": 48261699.0, + "step": 5945 + }, + { + "entropy": 0.297002974152565, + "epoch": 3.3577878103837473, + "grad_norm": 1.5427496433258057, + "learning_rate": 4.745268092916008e-06, + "loss": 0.1919, + "mean_token_accuracy": 0.9388345956802369, + "num_tokens": 48302567.0, + "step": 5950 + }, + { + "entropy": 0.31234695911407473, + "epoch": 3.360609480812641, + "grad_norm": 1.856497049331665, + "learning_rate": 4.744849710614498e-06, + "loss": 0.2081, + "mean_token_accuracy": 0.9339388251304627, + "num_tokens": 48343071.0, + "step": 5955 + }, + { + "entropy": 0.26803739964962003, + "epoch": 3.363431151241535, + "grad_norm": 1.5363012552261353, + "learning_rate": 4.744431010093302e-06, + "loss": 0.1604, + "mean_token_accuracy": 0.9466527342796326, + "num_tokens": 48383501.0, + "step": 5960 + }, + { + "entropy": 0.3034593999385834, + "epoch": 3.366252821670429, + "grad_norm": 1.9353678226470947, + "learning_rate": 4.744011991434673e-06, + "loss": 0.1963, + "mean_token_accuracy": 0.9345849275588989, + "num_tokens": 48424004.0, + "step": 5965 + }, + { + "entropy": 0.3189900636672974, + "epoch": 3.369074492099323, + "grad_norm": 1.8158825635910034, + "learning_rate": 4.743592654720929e-06, + "loss": 0.2096, + "mean_token_accuracy": 0.9319984436035156, + "num_tokens": 48464769.0, + "step": 5970 + }, + { + "entropy": 0.29853619933128356, + "epoch": 3.3718961625282167, + "grad_norm": 2.1045563220977783, + "learning_rate": 4.743173000034446e-06, + "loss": 0.1981, + "mean_token_accuracy": 0.9359387755393982, + "num_tokens": 48505548.0, + "step": 5975 + }, + { + "entropy": 0.2892568349838257, + "epoch": 3.374717832957111, + "grad_norm": 1.914672613143921, + "learning_rate": 4.7427530274576685e-06, + "loss": 0.184, + "mean_token_accuracy": 0.9403793454170227, + "num_tokens": 48546282.0, + "step": 5980 + }, + { + "entropy": 0.3369791269302368, + "epoch": 3.3775395033860045, + "grad_norm": 1.855833649635315, + "learning_rate": 4.742332737073098e-06, + "loss": 0.2296, + "mean_token_accuracy": 0.9269206643104553, + "num_tokens": 48586933.0, + "step": 5985 + }, + { + "entropy": 0.33269967436790465, + "epoch": 3.380361173814898, + "grad_norm": 1.9749621152877808, + "learning_rate": 4.741912128963301e-06, + "loss": 0.2292, + "mean_token_accuracy": 0.9259452581405639, + "num_tokens": 48627643.0, + "step": 5990 + }, + { + "entropy": 0.3066904067993164, + "epoch": 3.3831828442437923, + "grad_norm": 2.0527172088623047, + "learning_rate": 4.741491203210906e-06, + "loss": 0.2026, + "mean_token_accuracy": 0.9332493662834167, + "num_tokens": 48668300.0, + "step": 5995 + }, + { + "entropy": 0.3129414677619934, + "epoch": 3.386004514672686, + "grad_norm": 1.7080130577087402, + "learning_rate": 4.741069959898603e-06, + "loss": 0.2183, + "mean_token_accuracy": 0.929611599445343, + "num_tokens": 48708814.0, + "step": 6000 + }, + { + "epoch": 3.386004514672686, + "eval_entropy": 0.3240194022655487, + "eval_loss": 0.18876124918460846, + "eval_mean_token_accuracy": 0.9436565637588501, + "eval_num_tokens": 48708814.0, + "eval_runtime": 0.1645, + "eval_samples_per_second": 24.322, + "eval_steps_per_second": 6.08, + "step": 6000 + }, + { + "entropy": 0.31379024386405946, + "epoch": 3.38882618510158, + "grad_norm": 1.7492561340332031, + "learning_rate": 4.740648399109148e-06, + "loss": 0.2158, + "mean_token_accuracy": 0.9303012132644654, + "num_tokens": 48749515.0, + "step": 6005 + }, + { + "entropy": 0.27735379338264465, + "epoch": 3.391647855530474, + "grad_norm": 1.8119415044784546, + "learning_rate": 4.740226520925354e-06, + "loss": 0.1774, + "mean_token_accuracy": 0.9421341061592102, + "num_tokens": 48790013.0, + "step": 6010 + }, + { + "entropy": 0.27942020893096925, + "epoch": 3.394469525959368, + "grad_norm": 1.6023916006088257, + "learning_rate": 4.7398043254301e-06, + "loss": 0.1803, + "mean_token_accuracy": 0.94042888879776, + "num_tokens": 48830739.0, + "step": 6015 + }, + { + "entropy": 0.3403823971748352, + "epoch": 3.3972911963882617, + "grad_norm": 1.8019448518753052, + "learning_rate": 4.739381812706326e-06, + "loss": 0.2299, + "mean_token_accuracy": 0.9285380721092225, + "num_tokens": 48871272.0, + "step": 6020 + }, + { + "entropy": 0.30235469341278076, + "epoch": 3.400112866817156, + "grad_norm": 2.0274271965026855, + "learning_rate": 4.738958982837036e-06, + "loss": 0.208, + "mean_token_accuracy": 0.9346428751945496, + "num_tokens": 48911923.0, + "step": 6025 + }, + { + "entropy": 0.32721391320228577, + "epoch": 3.4029345372460496, + "grad_norm": 1.9175236225128174, + "learning_rate": 4.738535835905294e-06, + "loss": 0.2291, + "mean_token_accuracy": 0.9262001872062683, + "num_tokens": 48952801.0, + "step": 6030 + }, + { + "entropy": 0.3034025192260742, + "epoch": 3.4057562076749437, + "grad_norm": 1.9815678596496582, + "learning_rate": 4.738112371994227e-06, + "loss": 0.2018, + "mean_token_accuracy": 0.9346770763397216, + "num_tokens": 48993273.0, + "step": 6035 + }, + { + "entropy": 0.3161701142787933, + "epoch": 3.4085778781038374, + "grad_norm": 1.633199691772461, + "learning_rate": 4.737688591187024e-06, + "loss": 0.2118, + "mean_token_accuracy": 0.9302195191383362, + "num_tokens": 49033969.0, + "step": 6040 + }, + { + "entropy": 0.30224609375, + "epoch": 3.4113995485327315, + "grad_norm": 1.9072091579437256, + "learning_rate": 4.737264493566939e-06, + "loss": 0.2037, + "mean_token_accuracy": 0.933539628982544, + "num_tokens": 49074370.0, + "step": 6045 + }, + { + "entropy": 0.33060076236724856, + "epoch": 3.4142212189616252, + "grad_norm": 2.085325241088867, + "learning_rate": 4.736840079217284e-06, + "loss": 0.2364, + "mean_token_accuracy": 0.9252678632736206, + "num_tokens": 49114878.0, + "step": 6050 + }, + { + "entropy": 0.2963068068027496, + "epoch": 3.4170428893905194, + "grad_norm": 1.7405322790145874, + "learning_rate": 4.736415348221435e-06, + "loss": 0.1954, + "mean_token_accuracy": 0.9355825662612915, + "num_tokens": 49155589.0, + "step": 6055 + }, + { + "entropy": 0.3319820463657379, + "epoch": 3.419864559819413, + "grad_norm": 1.9911015033721924, + "learning_rate": 4.735990300662833e-06, + "loss": 0.2195, + "mean_token_accuracy": 0.9308295607566833, + "num_tokens": 49196038.0, + "step": 6060 + }, + { + "entropy": 0.29007573127746583, + "epoch": 3.422686230248307, + "grad_norm": 1.579049825668335, + "learning_rate": 4.7355649366249755e-06, + "loss": 0.1962, + "mean_token_accuracy": 0.9360047936439514, + "num_tokens": 49236253.0, + "step": 6065 + }, + { + "entropy": 0.28036363422870636, + "epoch": 3.425507900677201, + "grad_norm": 1.846561312675476, + "learning_rate": 4.735139256191428e-06, + "loss": 0.1917, + "mean_token_accuracy": 0.937602186203003, + "num_tokens": 49276934.0, + "step": 6070 + }, + { + "entropy": 0.30836820006370547, + "epoch": 3.4283295711060946, + "grad_norm": 2.027714967727661, + "learning_rate": 4.734713259445814e-06, + "loss": 0.2116, + "mean_token_accuracy": 0.9310431718826294, + "num_tokens": 49317527.0, + "step": 6075 + }, + { + "entropy": 0.2989709198474884, + "epoch": 3.4311512415349887, + "grad_norm": 1.928396224975586, + "learning_rate": 4.734286946471821e-06, + "loss": 0.2028, + "mean_token_accuracy": 0.9341145396232605, + "num_tokens": 49358187.0, + "step": 6080 + }, + { + "entropy": 0.29576932787895205, + "epoch": 3.4339729119638824, + "grad_norm": 1.8478814363479614, + "learning_rate": 4.733860317353198e-06, + "loss": 0.201, + "mean_token_accuracy": 0.9338787913322448, + "num_tokens": 49398870.0, + "step": 6085 + }, + { + "entropy": 0.34390089511871336, + "epoch": 3.4367945823927766, + "grad_norm": 2.0840272903442383, + "learning_rate": 4.733433372173756e-06, + "loss": 0.2391, + "mean_token_accuracy": 0.9246963381767273, + "num_tokens": 49439698.0, + "step": 6090 + }, + { + "entropy": 0.3055784046649933, + "epoch": 3.4396162528216703, + "grad_norm": 1.8206205368041992, + "learning_rate": 4.73300611101737e-06, + "loss": 0.2052, + "mean_token_accuracy": 0.933589768409729, + "num_tokens": 49480359.0, + "step": 6095 + }, + { + "entropy": 0.3226096034049988, + "epoch": 3.4424379232505644, + "grad_norm": 2.05794620513916, + "learning_rate": 4.732578533967974e-06, + "loss": 0.2108, + "mean_token_accuracy": 0.9320708394050599, + "num_tokens": 49521140.0, + "step": 6100 + }, + { + "entropy": 0.31547321677207946, + "epoch": 3.445259593679458, + "grad_norm": 2.0338547229766846, + "learning_rate": 4.732150641109566e-06, + "loss": 0.2055, + "mean_token_accuracy": 0.93296377658844, + "num_tokens": 49561801.0, + "step": 6105 + }, + { + "entropy": 0.32915824055671694, + "epoch": 3.4480812641083523, + "grad_norm": 1.9995898008346558, + "learning_rate": 4.731722432526206e-06, + "loss": 0.2213, + "mean_token_accuracy": 0.9298017382621765, + "num_tokens": 49602445.0, + "step": 6110 + }, + { + "entropy": 0.31122357249259947, + "epoch": 3.450902934537246, + "grad_norm": 1.9306919574737549, + "learning_rate": 4.731293908302014e-06, + "loss": 0.193, + "mean_token_accuracy": 0.9369074702262878, + "num_tokens": 49643049.0, + "step": 6115 + }, + { + "entropy": 0.283711576461792, + "epoch": 3.45372460496614, + "grad_norm": 1.7749558687210083, + "learning_rate": 4.730865068521177e-06, + "loss": 0.1865, + "mean_token_accuracy": 0.9403268456459045, + "num_tokens": 49683779.0, + "step": 6120 + }, + { + "entropy": 0.3096657872200012, + "epoch": 3.456546275395034, + "grad_norm": 1.9655077457427979, + "learning_rate": 4.730435913267937e-06, + "loss": 0.2029, + "mean_token_accuracy": 0.9331951022148133, + "num_tokens": 49724297.0, + "step": 6125 + }, + { + "entropy": 0.3291502416133881, + "epoch": 3.459367945823928, + "grad_norm": 1.9921224117279053, + "learning_rate": 4.7300064426266035e-06, + "loss": 0.2186, + "mean_token_accuracy": 0.9291182398796082, + "num_tokens": 49764911.0, + "step": 6130 + }, + { + "entropy": 0.31880820393562315, + "epoch": 3.4621896162528216, + "grad_norm": 1.8428974151611328, + "learning_rate": 4.729576656681545e-06, + "loss": 0.2154, + "mean_token_accuracy": 0.9307239055633545, + "num_tokens": 49805553.0, + "step": 6135 + }, + { + "entropy": 0.29867430329322814, + "epoch": 3.4650112866817158, + "grad_norm": 2.0829527378082275, + "learning_rate": 4.729146555517195e-06, + "loss": 0.2009, + "mean_token_accuracy": 0.9337917327880859, + "num_tokens": 49846334.0, + "step": 6140 + }, + { + "entropy": 0.3254325449466705, + "epoch": 3.4678329571106095, + "grad_norm": 1.9901976585388184, + "learning_rate": 4.728716139218045e-06, + "loss": 0.2165, + "mean_token_accuracy": 0.9306549906730652, + "num_tokens": 49886713.0, + "step": 6145 + }, + { + "entropy": 0.2868054747581482, + "epoch": 3.4706546275395036, + "grad_norm": 1.7727199792861938, + "learning_rate": 4.728285407868651e-06, + "loss": 0.1976, + "mean_token_accuracy": 0.936365807056427, + "num_tokens": 49927340.0, + "step": 6150 + }, + { + "entropy": 0.3261713206768036, + "epoch": 3.4734762979683973, + "grad_norm": 1.8373956680297852, + "learning_rate": 4.72785436155363e-06, + "loss": 0.2094, + "mean_token_accuracy": 0.9324244618415832, + "num_tokens": 49967868.0, + "step": 6155 + }, + { + "entropy": 0.3357150912284851, + "epoch": 3.476297968397291, + "grad_norm": 1.9818036556243896, + "learning_rate": 4.7274230003576625e-06, + "loss": 0.2388, + "mean_token_accuracy": 0.9242081761360168, + "num_tokens": 50008499.0, + "step": 6160 + }, + { + "entropy": 0.3065942347049713, + "epoch": 3.479119638826185, + "grad_norm": 1.6570138931274414, + "learning_rate": 4.726991324365487e-06, + "loss": 0.2021, + "mean_token_accuracy": 0.9351112723350525, + "num_tokens": 50049217.0, + "step": 6165 + }, + { + "entropy": 0.2773208677768707, + "epoch": 3.481941309255079, + "grad_norm": 1.8118728399276733, + "learning_rate": 4.726559333661908e-06, + "loss": 0.1871, + "mean_token_accuracy": 0.9404791593551636, + "num_tokens": 50090066.0, + "step": 6170 + }, + { + "entropy": 0.3141829252243042, + "epoch": 3.484762979683973, + "grad_norm": 1.8138505220413208, + "learning_rate": 4.726127028331789e-06, + "loss": 0.2124, + "mean_token_accuracy": 0.9332842707633973, + "num_tokens": 50130759.0, + "step": 6175 + }, + { + "entropy": 0.29586785435676577, + "epoch": 3.4875846501128667, + "grad_norm": 2.0417232513427734, + "learning_rate": 4.725694408460059e-06, + "loss": 0.1986, + "mean_token_accuracy": 0.9360351800918579, + "num_tokens": 50171307.0, + "step": 6180 + }, + { + "entropy": 0.3109571814537048, + "epoch": 3.490406320541761, + "grad_norm": 2.048214912414551, + "learning_rate": 4.725261474131703e-06, + "loss": 0.207, + "mean_token_accuracy": 0.9328826665878296, + "num_tokens": 50211913.0, + "step": 6185 + }, + { + "entropy": 0.3329827606678009, + "epoch": 3.4932279909706545, + "grad_norm": 1.681795597076416, + "learning_rate": 4.724828225431772e-06, + "loss": 0.2283, + "mean_token_accuracy": 0.9269816160202027, + "num_tokens": 50252502.0, + "step": 6190 + }, + { + "entropy": 0.29528833031654356, + "epoch": 3.4960496613995486, + "grad_norm": 1.5485318899154663, + "learning_rate": 4.72439466244538e-06, + "loss": 0.1888, + "mean_token_accuracy": 0.9390155076980591, + "num_tokens": 50293239.0, + "step": 6195 + }, + { + "entropy": 0.30525763630867003, + "epoch": 3.4988713318284423, + "grad_norm": 1.8242753744125366, + "learning_rate": 4.723960785257697e-06, + "loss": 0.207, + "mean_token_accuracy": 0.9336790919303894, + "num_tokens": 50333949.0, + "step": 6200 + }, + { + "entropy": 0.329301780462265, + "epoch": 3.5016930022573365, + "grad_norm": 1.7903170585632324, + "learning_rate": 4.72352659395396e-06, + "loss": 0.2453, + "mean_token_accuracy": 0.9249015927314759, + "num_tokens": 50374411.0, + "step": 6205 + }, + { + "entropy": 0.30608891248703, + "epoch": 3.50451467268623, + "grad_norm": 1.8584059476852417, + "learning_rate": 4.7230920886194655e-06, + "loss": 0.2178, + "mean_token_accuracy": 0.9296943068504333, + "num_tokens": 50415248.0, + "step": 6210 + }, + { + "entropy": 0.3110420048236847, + "epoch": 3.5073363431151243, + "grad_norm": 1.9278223514556885, + "learning_rate": 4.722657269339573e-06, + "loss": 0.2087, + "mean_token_accuracy": 0.9321151494979858, + "num_tokens": 50455903.0, + "step": 6215 + }, + { + "entropy": 0.29749436378479005, + "epoch": 3.510158013544018, + "grad_norm": 1.6023099422454834, + "learning_rate": 4.722222136199703e-06, + "loss": 0.1862, + "mean_token_accuracy": 0.940057122707367, + "num_tokens": 50496595.0, + "step": 6220 + }, + { + "entropy": 0.3107947289943695, + "epoch": 3.5129796839729117, + "grad_norm": 1.7268322706222534, + "learning_rate": 4.7217866892853355e-06, + "loss": 0.1982, + "mean_token_accuracy": 0.9365177512168884, + "num_tokens": 50537156.0, + "step": 6225 + }, + { + "entropy": 0.31932695508003234, + "epoch": 3.515801354401806, + "grad_norm": 1.9422742128372192, + "learning_rate": 4.721350928682017e-06, + "loss": 0.2254, + "mean_token_accuracy": 0.9268533945083618, + "num_tokens": 50577880.0, + "step": 6230 + }, + { + "entropy": 0.3243493378162384, + "epoch": 3.5186230248307, + "grad_norm": 1.8722442388534546, + "learning_rate": 4.720914854475349e-06, + "loss": 0.2181, + "mean_token_accuracy": 0.9315003275871276, + "num_tokens": 50618659.0, + "step": 6235 + }, + { + "entropy": 0.30051770210266116, + "epoch": 3.5214446952595937, + "grad_norm": 1.5247653722763062, + "learning_rate": 4.720478466751002e-06, + "loss": 0.1931, + "mean_token_accuracy": 0.9378492355346679, + "num_tokens": 50659414.0, + "step": 6240 + }, + { + "entropy": 0.3169930100440979, + "epoch": 3.5242663656884874, + "grad_norm": 2.1740469932556152, + "learning_rate": 4.720041765594701e-06, + "loss": 0.2171, + "mean_token_accuracy": 0.9304054498672485, + "num_tokens": 50700091.0, + "step": 6245 + }, + { + "entropy": 0.3257488250732422, + "epoch": 3.5270880361173815, + "grad_norm": 1.9426953792572021, + "learning_rate": 4.719604751092239e-06, + "loss": 0.221, + "mean_token_accuracy": 0.928559148311615, + "num_tokens": 50740627.0, + "step": 6250 + }, + { + "entropy": 0.31315135955810547, + "epoch": 3.5299097065462757, + "grad_norm": 1.9665589332580566, + "learning_rate": 4.719167423329467e-06, + "loss": 0.2139, + "mean_token_accuracy": 0.9316304087638855, + "num_tokens": 50781438.0, + "step": 6255 + }, + { + "entropy": 0.3040352761745453, + "epoch": 3.5327313769751694, + "grad_norm": 2.057408571243286, + "learning_rate": 4.718729782392297e-06, + "loss": 0.1961, + "mean_token_accuracy": 0.9354169130325317, + "num_tokens": 50822359.0, + "step": 6260 + }, + { + "entropy": 0.3147823929786682, + "epoch": 3.535553047404063, + "grad_norm": 2.11651349067688, + "learning_rate": 4.718291828366703e-06, + "loss": 0.2091, + "mean_token_accuracy": 0.9335868835449219, + "num_tokens": 50862733.0, + "step": 6265 + }, + { + "entropy": 0.3134510278701782, + "epoch": 3.538374717832957, + "grad_norm": 1.6482633352279663, + "learning_rate": 4.717853561338723e-06, + "loss": 0.2068, + "mean_token_accuracy": 0.9318931818008422, + "num_tokens": 50903490.0, + "step": 6270 + }, + { + "entropy": 0.3234701603651047, + "epoch": 3.541196388261851, + "grad_norm": 2.001359701156616, + "learning_rate": 4.717414981394454e-06, + "loss": 0.233, + "mean_token_accuracy": 0.9272037386894226, + "num_tokens": 50944012.0, + "step": 6275 + }, + { + "entropy": 0.2873412311077118, + "epoch": 3.544018058690745, + "grad_norm": 2.011967420578003, + "learning_rate": 4.716976088620055e-06, + "loss": 0.1917, + "mean_token_accuracy": 0.9380563020706176, + "num_tokens": 50984897.0, + "step": 6280 + }, + { + "entropy": 0.29418745040893557, + "epoch": 3.5468397291196387, + "grad_norm": 1.7138752937316895, + "learning_rate": 4.716536883101746e-06, + "loss": 0.1905, + "mean_token_accuracy": 0.9367462277412415, + "num_tokens": 51025455.0, + "step": 6285 + }, + { + "entropy": 0.3291135847568512, + "epoch": 3.549661399548533, + "grad_norm": 2.0384838581085205, + "learning_rate": 4.716097364925809e-06, + "loss": 0.2329, + "mean_token_accuracy": 0.9264655709266663, + "num_tokens": 51066205.0, + "step": 6290 + }, + { + "entropy": 0.31210439205169677, + "epoch": 3.5524830699774266, + "grad_norm": 1.871769905090332, + "learning_rate": 4.715657534178589e-06, + "loss": 0.2176, + "mean_token_accuracy": 0.9300808906555176, + "num_tokens": 51106895.0, + "step": 6295 + }, + { + "entropy": 0.2980155825614929, + "epoch": 3.5553047404063207, + "grad_norm": 1.6929700374603271, + "learning_rate": 4.715217390946489e-06, + "loss": 0.2014, + "mean_token_accuracy": 0.9350870847702026, + "num_tokens": 51147652.0, + "step": 6300 + }, + { + "entropy": 0.3304617702960968, + "epoch": 3.5581264108352144, + "grad_norm": 1.6658223867416382, + "learning_rate": 4.714776935315976e-06, + "loss": 0.2305, + "mean_token_accuracy": 0.9265113472938538, + "num_tokens": 51188269.0, + "step": 6305 + }, + { + "entropy": 0.30585951209068296, + "epoch": 3.560948081264108, + "grad_norm": 1.8482630252838135, + "learning_rate": 4.7143361673735774e-06, + "loss": 0.2076, + "mean_token_accuracy": 0.9323771357536316, + "num_tokens": 51228854.0, + "step": 6310 + }, + { + "entropy": 0.33988407254219055, + "epoch": 3.5637697516930023, + "grad_norm": 1.9136297702789307, + "learning_rate": 4.713895087205882e-06, + "loss": 0.2287, + "mean_token_accuracy": 0.9266070127487183, + "num_tokens": 51269073.0, + "step": 6315 + }, + { + "entropy": 0.3092219591140747, + "epoch": 3.5665914221218964, + "grad_norm": 1.8166900873184204, + "learning_rate": 4.71345369489954e-06, + "loss": 0.2046, + "mean_token_accuracy": 0.9348546147346497, + "num_tokens": 51309223.0, + "step": 6320 + }, + { + "entropy": 0.3356677830219269, + "epoch": 3.56941309255079, + "grad_norm": 1.9024507999420166, + "learning_rate": 4.7130119905412635e-06, + "loss": 0.2282, + "mean_token_accuracy": 0.927457618713379, + "num_tokens": 51349901.0, + "step": 6325 + }, + { + "entropy": 0.31340835690498353, + "epoch": 3.572234762979684, + "grad_norm": 1.8880701065063477, + "learning_rate": 4.712569974217826e-06, + "loss": 0.2102, + "mean_token_accuracy": 0.9327018618583679, + "num_tokens": 51390569.0, + "step": 6330 + }, + { + "entropy": 0.31486195921897886, + "epoch": 3.575056433408578, + "grad_norm": 2.056093215942383, + "learning_rate": 4.712127646016059e-06, + "loss": 0.211, + "mean_token_accuracy": 0.9327765345573426, + "num_tokens": 51431350.0, + "step": 6335 + }, + { + "entropy": 0.31965509057044983, + "epoch": 3.5778781038374716, + "grad_norm": 1.6555287837982178, + "learning_rate": 4.71168500602286e-06, + "loss": 0.2194, + "mean_token_accuracy": 0.9297954082489014, + "num_tokens": 51471728.0, + "step": 6340 + }, + { + "entropy": 0.34823336601257326, + "epoch": 3.5806997742663658, + "grad_norm": 1.9188921451568604, + "learning_rate": 4.7112420543251854e-06, + "loss": 0.2211, + "mean_token_accuracy": 0.9278290867805481, + "num_tokens": 51512451.0, + "step": 6345 + }, + { + "entropy": 0.27960309386253357, + "epoch": 3.5835214446952595, + "grad_norm": 1.7235885858535767, + "learning_rate": 4.710798791010054e-06, + "loss": 0.1939, + "mean_token_accuracy": 0.93757244348526, + "num_tokens": 51553127.0, + "step": 6350 + }, + { + "entropy": 0.30215221643447876, + "epoch": 3.5863431151241536, + "grad_norm": 1.8641575574874878, + "learning_rate": 4.710355216164543e-06, + "loss": 0.2143, + "mean_token_accuracy": 0.9315236330032348, + "num_tokens": 51593737.0, + "step": 6355 + }, + { + "entropy": 0.29333736300468444, + "epoch": 3.5891647855530473, + "grad_norm": 1.6100600957870483, + "learning_rate": 4.7099113298757934e-06, + "loss": 0.1993, + "mean_token_accuracy": 0.9377033352851868, + "num_tokens": 51634446.0, + "step": 6360 + }, + { + "entropy": 0.32711849808692933, + "epoch": 3.5919864559819414, + "grad_norm": 2.63490891456604, + "learning_rate": 4.709467132231007e-06, + "loss": 0.213, + "mean_token_accuracy": 0.9318007826805115, + "num_tokens": 51674890.0, + "step": 6365 + }, + { + "entropy": 0.3057248055934906, + "epoch": 3.594808126410835, + "grad_norm": 1.7993404865264893, + "learning_rate": 4.709022623317447e-06, + "loss": 0.2136, + "mean_token_accuracy": 0.9311740756034851, + "num_tokens": 51715328.0, + "step": 6370 + }, + { + "entropy": 0.32637726664543154, + "epoch": 3.5976297968397293, + "grad_norm": 1.7621092796325684, + "learning_rate": 4.708577803222437e-06, + "loss": 0.228, + "mean_token_accuracy": 0.926943325996399, + "num_tokens": 51755709.0, + "step": 6375 + }, + { + "entropy": 0.3093415260314941, + "epoch": 3.600451467268623, + "grad_norm": 1.7598847150802612, + "learning_rate": 4.708132672033361e-06, + "loss": 0.1968, + "mean_token_accuracy": 0.9351221680641174, + "num_tokens": 51796409.0, + "step": 6380 + }, + { + "entropy": 0.3067767798900604, + "epoch": 3.603273137697517, + "grad_norm": 1.894102931022644, + "learning_rate": 4.707687229837667e-06, + "loss": 0.2096, + "mean_token_accuracy": 0.9340128540992737, + "num_tokens": 51836874.0, + "step": 6385 + }, + { + "entropy": 0.31157588958740234, + "epoch": 3.606094808126411, + "grad_norm": 1.6844671964645386, + "learning_rate": 4.70724147672286e-06, + "loss": 0.2012, + "mean_token_accuracy": 0.9360531449317933, + "num_tokens": 51877598.0, + "step": 6390 + }, + { + "entropy": 0.30880475640296934, + "epoch": 3.6089164785553045, + "grad_norm": 1.8519737720489502, + "learning_rate": 4.706795412776509e-06, + "loss": 0.2139, + "mean_token_accuracy": 0.9301542401313782, + "num_tokens": 51918221.0, + "step": 6395 + }, + { + "entropy": 0.3159621596336365, + "epoch": 3.6117381489841986, + "grad_norm": 1.9711363315582275, + "learning_rate": 4.706349038086244e-06, + "loss": 0.218, + "mean_token_accuracy": 0.9309246063232421, + "num_tokens": 51959053.0, + "step": 6400 + }, + { + "entropy": 0.2915292739868164, + "epoch": 3.614559819413093, + "grad_norm": 1.8630083799362183, + "learning_rate": 4.7059023527397556e-06, + "loss": 0.1922, + "mean_token_accuracy": 0.9368671178817749, + "num_tokens": 51999218.0, + "step": 6405 + }, + { + "entropy": 0.3344090163707733, + "epoch": 3.6173814898419865, + "grad_norm": 1.93043851852417, + "learning_rate": 4.705455356824794e-06, + "loss": 0.2198, + "mean_token_accuracy": 0.9300200700759887, + "num_tokens": 52039875.0, + "step": 6410 + }, + { + "entropy": 0.3021121442317963, + "epoch": 3.62020316027088, + "grad_norm": 1.8353512287139893, + "learning_rate": 4.705008050429171e-06, + "loss": 0.2074, + "mean_token_accuracy": 0.9331298470497131, + "num_tokens": 52080363.0, + "step": 6415 + }, + { + "entropy": 0.30426907539367676, + "epoch": 3.6230248306997743, + "grad_norm": 1.7905832529067993, + "learning_rate": 4.704560433640762e-06, + "loss": 0.2117, + "mean_token_accuracy": 0.9322558403015136, + "num_tokens": 52121089.0, + "step": 6420 + }, + { + "entropy": 0.3197052538394928, + "epoch": 3.625846501128668, + "grad_norm": 1.7855573892593384, + "learning_rate": 4.7041125065475e-06, + "loss": 0.2154, + "mean_token_accuracy": 0.9314280033111573, + "num_tokens": 52161686.0, + "step": 6425 + }, + { + "entropy": 0.2890112340450287, + "epoch": 3.628668171557562, + "grad_norm": 2.0892858505249023, + "learning_rate": 4.703664269237381e-06, + "loss": 0.1937, + "mean_token_accuracy": 0.9377867579460144, + "num_tokens": 52202482.0, + "step": 6430 + }, + { + "entropy": 0.30858972668647766, + "epoch": 3.631489841986456, + "grad_norm": 1.9566696882247925, + "learning_rate": 4.703215721798462e-06, + "loss": 0.2021, + "mean_token_accuracy": 0.9352177262306214, + "num_tokens": 52243201.0, + "step": 6435 + }, + { + "entropy": 0.3409568965435028, + "epoch": 3.63431151241535, + "grad_norm": 1.81044602394104, + "learning_rate": 4.702766864318858e-06, + "loss": 0.2185, + "mean_token_accuracy": 0.9314499139785767, + "num_tokens": 52284031.0, + "step": 6440 + }, + { + "entropy": 0.3199546754360199, + "epoch": 3.6371331828442437, + "grad_norm": 1.9870702028274536, + "learning_rate": 4.70231769688675e-06, + "loss": 0.2192, + "mean_token_accuracy": 0.929648220539093, + "num_tokens": 52323965.0, + "step": 6445 + }, + { + "entropy": 0.3059984266757965, + "epoch": 3.639954853273138, + "grad_norm": 2.04311203956604, + "learning_rate": 4.701868219590374e-06, + "loss": 0.2123, + "mean_token_accuracy": 0.9337385654449463, + "num_tokens": 52363881.0, + "step": 6450 + }, + { + "entropy": 0.32826207876205443, + "epoch": 3.6427765237020315, + "grad_norm": 1.848895788192749, + "learning_rate": 4.701418432518032e-06, + "loss": 0.2206, + "mean_token_accuracy": 0.9300814032554626, + "num_tokens": 52404601.0, + "step": 6455 + }, + { + "entropy": 0.3011523485183716, + "epoch": 3.6455981941309257, + "grad_norm": 1.7186975479125977, + "learning_rate": 4.700968335758084e-06, + "loss": 0.1944, + "mean_token_accuracy": 0.9379884481430054, + "num_tokens": 52445278.0, + "step": 6460 + }, + { + "entropy": 0.3180157899856567, + "epoch": 3.6484198645598194, + "grad_norm": 2.0736236572265625, + "learning_rate": 4.700517929398951e-06, + "loss": 0.2125, + "mean_token_accuracy": 0.9314456939697265, + "num_tokens": 52486045.0, + "step": 6465 + }, + { + "entropy": 0.32199347019195557, + "epoch": 3.6512415349887135, + "grad_norm": 1.9433692693710327, + "learning_rate": 4.7000672135291166e-06, + "loss": 0.2133, + "mean_token_accuracy": 0.9318039417266846, + "num_tokens": 52525845.0, + "step": 6470 + }, + { + "entropy": 0.32936949729919435, + "epoch": 3.654063205417607, + "grad_norm": 1.8279980421066284, + "learning_rate": 4.699616188237123e-06, + "loss": 0.2376, + "mean_token_accuracy": 0.9232208609580994, + "num_tokens": 52566170.0, + "step": 6475 + }, + { + "entropy": 0.28730884194374084, + "epoch": 3.656884875846501, + "grad_norm": 1.848632574081421, + "learning_rate": 4.699164853611574e-06, + "loss": 0.1949, + "mean_token_accuracy": 0.9368890285491943, + "num_tokens": 52606933.0, + "step": 6480 + }, + { + "entropy": 0.2818956643342972, + "epoch": 3.659706546275395, + "grad_norm": 1.7565962076187134, + "learning_rate": 4.698713209741136e-06, + "loss": 0.1923, + "mean_token_accuracy": 0.9376680016517639, + "num_tokens": 52647547.0, + "step": 6485 + }, + { + "entropy": 0.30916563868522645, + "epoch": 3.662528216704289, + "grad_norm": 1.94307279586792, + "learning_rate": 4.698261256714533e-06, + "loss": 0.2076, + "mean_token_accuracy": 0.9336765766143799, + "num_tokens": 52688266.0, + "step": 6490 + }, + { + "entropy": 0.29969978928565977, + "epoch": 3.665349887133183, + "grad_norm": 1.9244630336761475, + "learning_rate": 4.697808994620552e-06, + "loss": 0.2085, + "mean_token_accuracy": 0.9322393655776977, + "num_tokens": 52728923.0, + "step": 6495 + }, + { + "entropy": 0.31762275099754333, + "epoch": 3.6681715575620766, + "grad_norm": 1.9523797035217285, + "learning_rate": 4.697356423548038e-06, + "loss": 0.2147, + "mean_token_accuracy": 0.9313531517982483, + "num_tokens": 52769425.0, + "step": 6500 + }, + { + "epoch": 3.6681715575620766, + "eval_entropy": 0.32103675603866577, + "eval_loss": 0.1689807027578354, + "eval_mean_token_accuracy": 0.9474894404411316, + "eval_num_tokens": 52769425.0, + "eval_runtime": 0.1638, + "eval_samples_per_second": 24.419, + "eval_steps_per_second": 6.105, + "step": 6500 + }, + { + "entropy": 0.3029796838760376, + "epoch": 3.6709932279909707, + "grad_norm": 1.7941051721572876, + "learning_rate": 4.696903543585902e-06, + "loss": 0.2128, + "mean_token_accuracy": 0.9323464274406433, + "num_tokens": 52810073.0, + "step": 6505 + }, + { + "entropy": 0.2974651575088501, + "epoch": 3.6738148984198644, + "grad_norm": 2.0666654109954834, + "learning_rate": 4.696450354823109e-06, + "loss": 0.2107, + "mean_token_accuracy": 0.9324890732765198, + "num_tokens": 52850886.0, + "step": 6510 + }, + { + "entropy": 0.3096803486347198, + "epoch": 3.6766365688487586, + "grad_norm": 1.9316201210021973, + "learning_rate": 4.69599685734869e-06, + "loss": 0.2055, + "mean_token_accuracy": 0.932162344455719, + "num_tokens": 52891548.0, + "step": 6515 + }, + { + "entropy": 0.307836776971817, + "epoch": 3.6794582392776523, + "grad_norm": 1.7384241819381714, + "learning_rate": 4.695543051251735e-06, + "loss": 0.2106, + "mean_token_accuracy": 0.930738651752472, + "num_tokens": 52932381.0, + "step": 6520 + }, + { + "entropy": 0.31080532670021055, + "epoch": 3.6822799097065464, + "grad_norm": 1.620614767074585, + "learning_rate": 4.695088936621393e-06, + "loss": 0.206, + "mean_token_accuracy": 0.9316131114959717, + "num_tokens": 52972931.0, + "step": 6525 + }, + { + "entropy": 0.3173711597919464, + "epoch": 3.68510158013544, + "grad_norm": 1.8724675178527832, + "learning_rate": 4.694634513546875e-06, + "loss": 0.2149, + "mean_token_accuracy": 0.9308315873146057, + "num_tokens": 53013578.0, + "step": 6530 + }, + { + "entropy": 0.29013689756393435, + "epoch": 3.6879232505643342, + "grad_norm": 1.7273049354553223, + "learning_rate": 4.6941797821174526e-06, + "loss": 0.1904, + "mean_token_accuracy": 0.9397516369819641, + "num_tokens": 53054348.0, + "step": 6535 + }, + { + "entropy": 0.30396153330802916, + "epoch": 3.690744920993228, + "grad_norm": 1.865898847579956, + "learning_rate": 4.693724742422458e-06, + "loss": 0.2176, + "mean_token_accuracy": 0.9278941035270691, + "num_tokens": 53094975.0, + "step": 6540 + }, + { + "entropy": 0.26694598495960237, + "epoch": 3.6935665914221216, + "grad_norm": 1.6677098274230957, + "learning_rate": 4.693269394551286e-06, + "loss": 0.1822, + "mean_token_accuracy": 0.9395427823066711, + "num_tokens": 53135477.0, + "step": 6545 + }, + { + "entropy": 0.3146726369857788, + "epoch": 3.6963882618510158, + "grad_norm": 1.8670984506607056, + "learning_rate": 4.6928137385933845e-06, + "loss": 0.2115, + "mean_token_accuracy": 0.9311745762825012, + "num_tokens": 53176141.0, + "step": 6550 + }, + { + "entropy": 0.32695006132125853, + "epoch": 3.69920993227991, + "grad_norm": 2.1717782020568848, + "learning_rate": 4.692357774638272e-06, + "loss": 0.2342, + "mean_token_accuracy": 0.9270057797431945, + "num_tokens": 53216706.0, + "step": 6555 + }, + { + "entropy": 0.3012891411781311, + "epoch": 3.7020316027088036, + "grad_norm": 1.886801838874817, + "learning_rate": 4.69190150277552e-06, + "loss": 0.2062, + "mean_token_accuracy": 0.9332509160041809, + "num_tokens": 53257505.0, + "step": 6560 + }, + { + "entropy": 0.316040962934494, + "epoch": 3.7048532731376973, + "grad_norm": 1.6204224824905396, + "learning_rate": 4.6914449230947645e-06, + "loss": 0.2109, + "mean_token_accuracy": 0.9321287989616394, + "num_tokens": 53298201.0, + "step": 6565 + }, + { + "entropy": 0.28626594245433806, + "epoch": 3.7076749435665914, + "grad_norm": 1.7525380849838257, + "learning_rate": 4.690988035685701e-06, + "loss": 0.1939, + "mean_token_accuracy": 0.9376110672950745, + "num_tokens": 53338694.0, + "step": 6570 + }, + { + "entropy": 0.29510667324066164, + "epoch": 3.7104966139954856, + "grad_norm": 1.6574718952178955, + "learning_rate": 4.690530840638083e-06, + "loss": 0.1897, + "mean_token_accuracy": 0.940097987651825, + "num_tokens": 53379266.0, + "step": 6575 + }, + { + "entropy": 0.3167356073856354, + "epoch": 3.7133182844243793, + "grad_norm": 2.295748472213745, + "learning_rate": 4.690073338041728e-06, + "loss": 0.2168, + "mean_token_accuracy": 0.9299868464469909, + "num_tokens": 53419886.0, + "step": 6580 + }, + { + "entropy": 0.27687845230102537, + "epoch": 3.716139954853273, + "grad_norm": 1.9063183069229126, + "learning_rate": 4.689615527986514e-06, + "loss": 0.1954, + "mean_token_accuracy": 0.9360902070999145, + "num_tokens": 53460645.0, + "step": 6585 + }, + { + "entropy": 0.27884105145931243, + "epoch": 3.718961625282167, + "grad_norm": 1.709240436553955, + "learning_rate": 4.689157410562374e-06, + "loss": 0.1907, + "mean_token_accuracy": 0.939925491809845, + "num_tokens": 53501408.0, + "step": 6590 + }, + { + "entropy": 0.3242746412754059, + "epoch": 3.721783295711061, + "grad_norm": 1.7791012525558472, + "learning_rate": 4.688698985859309e-06, + "loss": 0.2124, + "mean_token_accuracy": 0.9305900692939758, + "num_tokens": 53542237.0, + "step": 6595 + }, + { + "entropy": 0.3170479595661163, + "epoch": 3.724604966139955, + "grad_norm": 1.7075973749160767, + "learning_rate": 4.688240253967374e-06, + "loss": 0.2162, + "mean_token_accuracy": 0.9304636359214783, + "num_tokens": 53582388.0, + "step": 6600 + }, + { + "entropy": 0.3019756257534027, + "epoch": 3.7274266365688487, + "grad_norm": 1.7577334642410278, + "learning_rate": 4.6877812149766875e-06, + "loss": 0.1985, + "mean_token_accuracy": 0.9347709894180298, + "num_tokens": 53623102.0, + "step": 6605 + }, + { + "entropy": 0.33137462139129636, + "epoch": 3.730248306997743, + "grad_norm": 1.6792027950286865, + "learning_rate": 4.687321868977429e-06, + "loss": 0.241, + "mean_token_accuracy": 0.9231707096099854, + "num_tokens": 53663401.0, + "step": 6610 + }, + { + "entropy": 0.33162534832954405, + "epoch": 3.7330699774266365, + "grad_norm": 1.8961528539657593, + "learning_rate": 4.686862216059836e-06, + "loss": 0.2269, + "mean_token_accuracy": 0.9269180774688721, + "num_tokens": 53703912.0, + "step": 6615 + }, + { + "entropy": 0.32055285573005676, + "epoch": 3.7358916478555306, + "grad_norm": 1.9672921895980835, + "learning_rate": 4.686402256314208e-06, + "loss": 0.219, + "mean_token_accuracy": 0.9281070351600647, + "num_tokens": 53744733.0, + "step": 6620 + }, + { + "entropy": 0.32325056195259094, + "epoch": 3.7387133182844243, + "grad_norm": 2.0314273834228516, + "learning_rate": 4.685941989830903e-06, + "loss": 0.2246, + "mean_token_accuracy": 0.9259540200233459, + "num_tokens": 53784519.0, + "step": 6625 + }, + { + "entropy": 0.3410928726196289, + "epoch": 3.741534988713318, + "grad_norm": 1.9816386699676514, + "learning_rate": 4.685481416700342e-06, + "loss": 0.2428, + "mean_token_accuracy": 0.9229821920394897, + "num_tokens": 53825215.0, + "step": 6630 + }, + { + "entropy": 0.3254290819168091, + "epoch": 3.744356659142212, + "grad_norm": 2.0321855545043945, + "learning_rate": 4.685020537013004e-06, + "loss": 0.2097, + "mean_token_accuracy": 0.932454526424408, + "num_tokens": 53865574.0, + "step": 6635 + }, + { + "entropy": 0.29887913465499877, + "epoch": 3.7471783295711063, + "grad_norm": 1.6626564264297485, + "learning_rate": 4.684559350859428e-06, + "loss": 0.206, + "mean_token_accuracy": 0.9335589647293091, + "num_tokens": 53906263.0, + "step": 6640 + }, + { + "entropy": 0.32658823728561404, + "epoch": 3.75, + "grad_norm": 1.9820867776870728, + "learning_rate": 4.684097858330215e-06, + "loss": 0.2207, + "mean_token_accuracy": 0.9296117424964905, + "num_tokens": 53946706.0, + "step": 6645 + }, + { + "entropy": 0.29330855011940005, + "epoch": 3.7528216704288937, + "grad_norm": 1.978268027305603, + "learning_rate": 4.683636059516024e-06, + "loss": 0.1986, + "mean_token_accuracy": 0.9361773490905761, + "num_tokens": 53987483.0, + "step": 6650 + }, + { + "entropy": 0.3071614384651184, + "epoch": 3.755643340857788, + "grad_norm": 1.5843520164489746, + "learning_rate": 4.683173954507578e-06, + "loss": 0.2018, + "mean_token_accuracy": 0.935286819934845, + "num_tokens": 54028166.0, + "step": 6655 + }, + { + "entropy": 0.2947255611419678, + "epoch": 3.758465011286682, + "grad_norm": 1.8720123767852783, + "learning_rate": 4.682711543395656e-06, + "loss": 0.1897, + "mean_token_accuracy": 0.9393622994422912, + "num_tokens": 54068959.0, + "step": 6660 + }, + { + "entropy": 0.3333085238933563, + "epoch": 3.7612866817155757, + "grad_norm": 2.160306453704834, + "learning_rate": 4.6822488262710985e-06, + "loss": 0.2458, + "mean_token_accuracy": 0.9197412252426147, + "num_tokens": 54109806.0, + "step": 6665 + }, + { + "entropy": 0.3107566237449646, + "epoch": 3.7641083521444694, + "grad_norm": 1.6867446899414062, + "learning_rate": 4.681785803224807e-06, + "loss": 0.2145, + "mean_token_accuracy": 0.9298100113868714, + "num_tokens": 54150377.0, + "step": 6670 + }, + { + "entropy": 0.34289470911026, + "epoch": 3.7669300225733635, + "grad_norm": 2.239997625350952, + "learning_rate": 4.681322474347741e-06, + "loss": 0.243, + "mean_token_accuracy": 0.9235318303108215, + "num_tokens": 54191007.0, + "step": 6675 + }, + { + "entropy": 0.300843334197998, + "epoch": 3.769751693002257, + "grad_norm": 1.7303624153137207, + "learning_rate": 4.680858839730923e-06, + "loss": 0.2098, + "mean_token_accuracy": 0.9318808197975159, + "num_tokens": 54231680.0, + "step": 6680 + }, + { + "entropy": 0.3099523186683655, + "epoch": 3.7725733634311513, + "grad_norm": 1.8123502731323242, + "learning_rate": 4.680394899465435e-06, + "loss": 0.2032, + "mean_token_accuracy": 0.9328450918197632, + "num_tokens": 54272199.0, + "step": 6685 + }, + { + "entropy": 0.3188793003559113, + "epoch": 3.775395033860045, + "grad_norm": 1.985037922859192, + "learning_rate": 4.679930653642415e-06, + "loss": 0.2159, + "mean_token_accuracy": 0.9308086633682251, + "num_tokens": 54312840.0, + "step": 6690 + }, + { + "entropy": 0.29359256029129027, + "epoch": 3.778216704288939, + "grad_norm": 1.7202750444412231, + "learning_rate": 4.679466102353068e-06, + "loss": 0.2156, + "mean_token_accuracy": 0.9294952034950257, + "num_tokens": 54353596.0, + "step": 6695 + }, + { + "entropy": 0.32029845714569094, + "epoch": 3.781038374717833, + "grad_norm": 2.217895746231079, + "learning_rate": 4.679001245688651e-06, + "loss": 0.2197, + "mean_token_accuracy": 0.9296623706817627, + "num_tokens": 54393283.0, + "step": 6700 + }, + { + "entropy": 0.3079906702041626, + "epoch": 3.783860045146727, + "grad_norm": 1.7621604204177856, + "learning_rate": 4.678536083740488e-06, + "loss": 0.2123, + "mean_token_accuracy": 0.934044873714447, + "num_tokens": 54433802.0, + "step": 6705 + }, + { + "entropy": 0.3148071825504303, + "epoch": 3.7866817155756207, + "grad_norm": 1.961150884628296, + "learning_rate": 4.67807061659996e-06, + "loss": 0.2128, + "mean_token_accuracy": 0.9301833033561706, + "num_tokens": 54474550.0, + "step": 6710 + }, + { + "entropy": 0.31343921422958376, + "epoch": 3.7895033860045144, + "grad_norm": 2.0900840759277344, + "learning_rate": 4.677604844358507e-06, + "loss": 0.2165, + "mean_token_accuracy": 0.9303097009658814, + "num_tokens": 54515386.0, + "step": 6715 + }, + { + "entropy": 0.32203570008277893, + "epoch": 3.7923250564334086, + "grad_norm": 1.9342727661132812, + "learning_rate": 4.677138767107631e-06, + "loss": 0.2161, + "mean_token_accuracy": 0.9303983330726624, + "num_tokens": 54556126.0, + "step": 6720 + }, + { + "entropy": 0.28948697447776794, + "epoch": 3.7951467268623027, + "grad_norm": 1.6458464860916138, + "learning_rate": 4.676672384938891e-06, + "loss": 0.1951, + "mean_token_accuracy": 0.9385475873947143, + "num_tokens": 54597036.0, + "step": 6725 + }, + { + "entropy": 0.3142231285572052, + "epoch": 3.7979683972911964, + "grad_norm": 1.8955562114715576, + "learning_rate": 4.676205697943911e-06, + "loss": 0.2115, + "mean_token_accuracy": 0.9330004334449769, + "num_tokens": 54637646.0, + "step": 6730 + }, + { + "entropy": 0.3165326237678528, + "epoch": 3.80079006772009, + "grad_norm": 1.7128779888153076, + "learning_rate": 4.675738706214369e-06, + "loss": 0.2125, + "mean_token_accuracy": 0.9308281064033508, + "num_tokens": 54678230.0, + "step": 6735 + }, + { + "entropy": 0.3183960378170013, + "epoch": 3.8036117381489842, + "grad_norm": 2.020563840866089, + "learning_rate": 4.6752714098420065e-06, + "loss": 0.225, + "mean_token_accuracy": 0.9275296330451965, + "num_tokens": 54718922.0, + "step": 6740 + }, + { + "entropy": 0.3038501560688019, + "epoch": 3.8064334085778784, + "grad_norm": 2.007378339767456, + "learning_rate": 4.674803808918624e-06, + "loss": 0.1919, + "mean_token_accuracy": 0.9365889430046082, + "num_tokens": 54759550.0, + "step": 6745 + }, + { + "entropy": 0.3290569126605988, + "epoch": 3.809255079006772, + "grad_norm": 1.8866428136825562, + "learning_rate": 4.674335903536083e-06, + "loss": 0.2219, + "mean_token_accuracy": 0.9293844699859619, + "num_tokens": 54800015.0, + "step": 6750 + }, + { + "entropy": 0.3428149461746216, + "epoch": 3.8120767494356658, + "grad_norm": 2.020414113998413, + "learning_rate": 4.673867693786301e-06, + "loss": 0.2348, + "mean_token_accuracy": 0.924261474609375, + "num_tokens": 54840616.0, + "step": 6755 + }, + { + "entropy": 0.32770676612854005, + "epoch": 3.81489841986456, + "grad_norm": 1.6898592710494995, + "learning_rate": 4.6733991797612595e-06, + "loss": 0.2039, + "mean_token_accuracy": 0.9363934993743896, + "num_tokens": 54881081.0, + "step": 6760 + }, + { + "entropy": 0.3508101344108582, + "epoch": 3.8177200902934536, + "grad_norm": 1.967923879623413, + "learning_rate": 4.672930361552998e-06, + "loss": 0.2206, + "mean_token_accuracy": 0.9289634466171265, + "num_tokens": 54921258.0, + "step": 6765 + }, + { + "entropy": 0.29331698417663576, + "epoch": 3.8205417607223477, + "grad_norm": 1.8218579292297363, + "learning_rate": 4.672461239253616e-06, + "loss": 0.1829, + "mean_token_accuracy": 0.9402494430541992, + "num_tokens": 54961736.0, + "step": 6770 + }, + { + "entropy": 0.31439730525016785, + "epoch": 3.8233634311512414, + "grad_norm": 2.268512487411499, + "learning_rate": 4.671991812955273e-06, + "loss": 0.2126, + "mean_token_accuracy": 0.9317742347717285, + "num_tokens": 55002382.0, + "step": 6775 + }, + { + "entropy": 0.3293932378292084, + "epoch": 3.8261851015801356, + "grad_norm": 2.075479507446289, + "learning_rate": 4.671522082750186e-06, + "loss": 0.2336, + "mean_token_accuracy": 0.9250609993934631, + "num_tokens": 55043115.0, + "step": 6780 + }, + { + "entropy": 0.32756795883178713, + "epoch": 3.8290067720090293, + "grad_norm": 1.9896371364593506, + "learning_rate": 4.671052048730635e-06, + "loss": 0.2114, + "mean_token_accuracy": 0.9321682929992676, + "num_tokens": 55082894.0, + "step": 6785 + }, + { + "entropy": 0.3386588513851166, + "epoch": 3.8318284424379234, + "grad_norm": 1.902786135673523, + "learning_rate": 4.670581710988958e-06, + "loss": 0.2271, + "mean_token_accuracy": 0.9268991231918335, + "num_tokens": 55123408.0, + "step": 6790 + }, + { + "entropy": 0.3413122892379761, + "epoch": 3.834650112866817, + "grad_norm": 2.145611047744751, + "learning_rate": 4.6701110696175546e-06, + "loss": 0.2295, + "mean_token_accuracy": 0.9283112645149231, + "num_tokens": 55163941.0, + "step": 6795 + }, + { + "entropy": 0.293692809343338, + "epoch": 3.837471783295711, + "grad_norm": 2.1431288719177246, + "learning_rate": 4.669640124708879e-06, + "loss": 0.2009, + "mean_token_accuracy": 0.934659731388092, + "num_tokens": 55204732.0, + "step": 6800 + }, + { + "entropy": 0.322798889875412, + "epoch": 3.840293453724605, + "grad_norm": 1.9868714809417725, + "learning_rate": 4.66916887635545e-06, + "loss": 0.2327, + "mean_token_accuracy": 0.9276728153228759, + "num_tokens": 55245323.0, + "step": 6805 + }, + { + "entropy": 0.3417269468307495, + "epoch": 3.843115124153499, + "grad_norm": 1.8813223838806152, + "learning_rate": 4.668697324649845e-06, + "loss": 0.2308, + "mean_token_accuracy": 0.9265347599983216, + "num_tokens": 55285830.0, + "step": 6810 + }, + { + "entropy": 0.32689463496208193, + "epoch": 3.845936794582393, + "grad_norm": 1.8665236234664917, + "learning_rate": 4.6682254696847e-06, + "loss": 0.2186, + "mean_token_accuracy": 0.928902006149292, + "num_tokens": 55326576.0, + "step": 6815 + }, + { + "entropy": 0.30429264307022097, + "epoch": 3.8487584650112865, + "grad_norm": 1.9757881164550781, + "learning_rate": 4.667753311552711e-06, + "loss": 0.2066, + "mean_token_accuracy": 0.9331792950630188, + "num_tokens": 55367147.0, + "step": 6820 + }, + { + "entropy": 0.3437939524650574, + "epoch": 3.8515801354401806, + "grad_norm": 1.7982680797576904, + "learning_rate": 4.667280850346634e-06, + "loss": 0.236, + "mean_token_accuracy": 0.9242946028709411, + "num_tokens": 55407720.0, + "step": 6825 + }, + { + "entropy": 0.3199367105960846, + "epoch": 3.8544018058690743, + "grad_norm": 1.9674409627914429, + "learning_rate": 4.666808086159283e-06, + "loss": 0.2236, + "mean_token_accuracy": 0.9290154814720154, + "num_tokens": 55448550.0, + "step": 6830 + }, + { + "entropy": 0.2929447114467621, + "epoch": 3.8572234762979685, + "grad_norm": 2.2248289585113525, + "learning_rate": 4.666335019083532e-06, + "loss": 0.1942, + "mean_token_accuracy": 0.9364341259002685, + "num_tokens": 55489200.0, + "step": 6835 + }, + { + "entropy": 0.29515990018844607, + "epoch": 3.860045146726862, + "grad_norm": 1.5697543621063232, + "learning_rate": 4.665861649212316e-06, + "loss": 0.1987, + "mean_token_accuracy": 0.936658239364624, + "num_tokens": 55530008.0, + "step": 6840 + }, + { + "entropy": 0.3109833776950836, + "epoch": 3.8628668171557563, + "grad_norm": 1.9397079944610596, + "learning_rate": 4.6653879766386305e-06, + "loss": 0.2049, + "mean_token_accuracy": 0.9348199844360352, + "num_tokens": 55570437.0, + "step": 6845 + }, + { + "entropy": 0.3064566671848297, + "epoch": 3.86568848758465, + "grad_norm": 1.5913760662078857, + "learning_rate": 4.664914001455526e-06, + "loss": 0.2074, + "mean_token_accuracy": 0.9345534205436706, + "num_tokens": 55611132.0, + "step": 6850 + }, + { + "entropy": 0.31966778039932253, + "epoch": 3.868510158013544, + "grad_norm": 1.8581488132476807, + "learning_rate": 4.664439723756116e-06, + "loss": 0.2171, + "mean_token_accuracy": 0.9305494904518128, + "num_tokens": 55651714.0, + "step": 6855 + }, + { + "entropy": 0.31408268213272095, + "epoch": 3.871331828442438, + "grad_norm": 1.9941738843917847, + "learning_rate": 4.6639651436335705e-06, + "loss": 0.2101, + "mean_token_accuracy": 0.933535099029541, + "num_tokens": 55692277.0, + "step": 6860 + }, + { + "entropy": 0.3109330773353577, + "epoch": 3.874153498871332, + "grad_norm": 2.0422751903533936, + "learning_rate": 4.663490261181124e-06, + "loss": 0.206, + "mean_token_accuracy": 0.9325674891471862, + "num_tokens": 55732896.0, + "step": 6865 + }, + { + "entropy": 0.29195868968963623, + "epoch": 3.8769751693002257, + "grad_norm": 1.6397924423217773, + "learning_rate": 4.663015076492065e-06, + "loss": 0.2098, + "mean_token_accuracy": 0.9333557486534119, + "num_tokens": 55773560.0, + "step": 6870 + }, + { + "entropy": 0.3036874562501907, + "epoch": 3.87979683972912, + "grad_norm": 2.033968448638916, + "learning_rate": 4.662539589659746e-06, + "loss": 0.2065, + "mean_token_accuracy": 0.9338034749031067, + "num_tokens": 55814207.0, + "step": 6875 + }, + { + "entropy": 0.33089557886123655, + "epoch": 3.8826185101580135, + "grad_norm": 1.942462682723999, + "learning_rate": 4.6620638007775735e-06, + "loss": 0.223, + "mean_token_accuracy": 0.9286533951759338, + "num_tokens": 55854785.0, + "step": 6880 + }, + { + "entropy": 0.3029870331287384, + "epoch": 3.885440180586907, + "grad_norm": 1.7418941259384155, + "learning_rate": 4.661587709939017e-06, + "loss": 0.2038, + "mean_token_accuracy": 0.9331433176994324, + "num_tokens": 55895399.0, + "step": 6885 + }, + { + "entropy": 0.3070501685142517, + "epoch": 3.8882618510158014, + "grad_norm": 1.9692811965942383, + "learning_rate": 4.661111317237606e-06, + "loss": 0.1969, + "mean_token_accuracy": 0.9379673600196838, + "num_tokens": 55935772.0, + "step": 6890 + }, + { + "entropy": 0.3398116111755371, + "epoch": 3.8910835214446955, + "grad_norm": 1.9820867776870728, + "learning_rate": 4.660634622766926e-06, + "loss": 0.2501, + "mean_token_accuracy": 0.9237083911895752, + "num_tokens": 55976410.0, + "step": 6895 + }, + { + "entropy": 0.32836961150169375, + "epoch": 3.893905191873589, + "grad_norm": 2.400958299636841, + "learning_rate": 4.660157626620625e-06, + "loss": 0.2248, + "mean_token_accuracy": 0.9286513090133667, + "num_tokens": 56016725.0, + "step": 6900 + }, + { + "entropy": 0.31524630188941954, + "epoch": 3.896726862302483, + "grad_norm": 1.8288160562515259, + "learning_rate": 4.65968032889241e-06, + "loss": 0.1952, + "mean_token_accuracy": 0.9361767411231995, + "num_tokens": 56057415.0, + "step": 6905 + }, + { + "entropy": 0.31340600848197936, + "epoch": 3.899548532731377, + "grad_norm": 1.9468982219696045, + "learning_rate": 4.6592027296760435e-06, + "loss": 0.2189, + "mean_token_accuracy": 0.9306774020195008, + "num_tokens": 56098039.0, + "step": 6910 + }, + { + "entropy": 0.2994066894054413, + "epoch": 3.9023702031602707, + "grad_norm": 1.8991196155548096, + "learning_rate": 4.658724829065352e-06, + "loss": 0.2006, + "mean_token_accuracy": 0.9361102461814881, + "num_tokens": 56138690.0, + "step": 6915 + }, + { + "entropy": 0.3208695352077484, + "epoch": 3.905191873589165, + "grad_norm": 2.3649332523345947, + "learning_rate": 4.658246627154219e-06, + "loss": 0.2117, + "mean_token_accuracy": 0.9312704563140869, + "num_tokens": 56179455.0, + "step": 6920 + }, + { + "entropy": 0.2794735461473465, + "epoch": 3.9080135440180586, + "grad_norm": 2.0509235858917236, + "learning_rate": 4.6577681240365856e-06, + "loss": 0.1958, + "mean_token_accuracy": 0.9353933930397034, + "num_tokens": 56219807.0, + "step": 6925 + }, + { + "entropy": 0.32446990013122556, + "epoch": 3.9108352144469527, + "grad_norm": 1.699033498764038, + "learning_rate": 4.657289319806456e-06, + "loss": 0.2198, + "mean_token_accuracy": 0.929268729686737, + "num_tokens": 56260537.0, + "step": 6930 + }, + { + "entropy": 0.29834595918655393, + "epoch": 3.9136568848758464, + "grad_norm": 1.6812763214111328, + "learning_rate": 4.656810214557889e-06, + "loss": 0.1882, + "mean_token_accuracy": 0.937093997001648, + "num_tokens": 56301037.0, + "step": 6935 + }, + { + "entropy": 0.2948793530464172, + "epoch": 3.9164785553047405, + "grad_norm": 1.752263069152832, + "learning_rate": 4.6563308083850075e-06, + "loss": 0.1935, + "mean_token_accuracy": 0.9361865758895874, + "num_tokens": 56341578.0, + "step": 6940 + }, + { + "entropy": 0.3016524910926819, + "epoch": 3.9193002257336342, + "grad_norm": 1.9265809059143066, + "learning_rate": 4.655851101381988e-06, + "loss": 0.1969, + "mean_token_accuracy": 0.9359880805015564, + "num_tokens": 56382334.0, + "step": 6945 + }, + { + "entropy": 0.31857663989067075, + "epoch": 3.9221218961625284, + "grad_norm": 2.062640905380249, + "learning_rate": 4.655371093643073e-06, + "loss": 0.2319, + "mean_token_accuracy": 0.9254809617996216, + "num_tokens": 56422744.0, + "step": 6950 + }, + { + "entropy": 0.3018131792545319, + "epoch": 3.924943566591422, + "grad_norm": 1.7309370040893555, + "learning_rate": 4.6548907852625565e-06, + "loss": 0.2042, + "mean_token_accuracy": 0.9332278847694397, + "num_tokens": 56463597.0, + "step": 6955 + }, + { + "entropy": 0.31196231842041017, + "epoch": 3.927765237020316, + "grad_norm": 1.6862263679504395, + "learning_rate": 4.654410176334796e-06, + "loss": 0.2161, + "mean_token_accuracy": 0.9320029973983764, + "num_tokens": 56504227.0, + "step": 6960 + }, + { + "entropy": 0.3238642394542694, + "epoch": 3.93058690744921, + "grad_norm": 1.998358130455017, + "learning_rate": 4.653929266954208e-06, + "loss": 0.2084, + "mean_token_accuracy": 0.9316397666931152, + "num_tokens": 56544904.0, + "step": 6965 + }, + { + "entropy": 0.34933393597602846, + "epoch": 3.9334085778781036, + "grad_norm": 2.1447250843048096, + "learning_rate": 4.653448057215267e-06, + "loss": 0.2264, + "mean_token_accuracy": 0.9266282439231872, + "num_tokens": 56585427.0, + "step": 6970 + }, + { + "entropy": 0.34060198068618774, + "epoch": 3.9362302483069977, + "grad_norm": 2.133936882019043, + "learning_rate": 4.652966547212506e-06, + "loss": 0.2445, + "mean_token_accuracy": 0.9236889362335206, + "num_tokens": 56626066.0, + "step": 6975 + }, + { + "entropy": 0.3082031667232513, + "epoch": 3.939051918735892, + "grad_norm": 1.9675047397613525, + "learning_rate": 4.652484737040518e-06, + "loss": 0.2007, + "mean_token_accuracy": 0.9365509033203125, + "num_tokens": 56666827.0, + "step": 6980 + }, + { + "entropy": 0.3459632158279419, + "epoch": 3.9418735891647856, + "grad_norm": 1.927329421043396, + "learning_rate": 4.652002626793956e-06, + "loss": 0.2278, + "mean_token_accuracy": 0.9272386431694031, + "num_tokens": 56707612.0, + "step": 6985 + }, + { + "entropy": 0.3056627333164215, + "epoch": 3.9446952595936793, + "grad_norm": 1.6812715530395508, + "learning_rate": 4.651520216567528e-06, + "loss": 0.1971, + "mean_token_accuracy": 0.9381381869316101, + "num_tokens": 56748245.0, + "step": 6990 + }, + { + "entropy": 0.2965980887413025, + "epoch": 3.9475169300225734, + "grad_norm": 1.688714861869812, + "learning_rate": 4.651037506456006e-06, + "loss": 0.2001, + "mean_token_accuracy": 0.9357024908065796, + "num_tokens": 56788909.0, + "step": 6995 + }, + { + "entropy": 0.32579890489578245, + "epoch": 3.950338600451467, + "grad_norm": 2.0904133319854736, + "learning_rate": 4.650554496554217e-06, + "loss": 0.2383, + "mean_token_accuracy": 0.9262525558471679, + "num_tokens": 56829431.0, + "step": 7000 + }, + { + "epoch": 3.950338600451467, + "eval_entropy": 0.3211408257484436, + "eval_loss": 0.14595237374305725, + "eval_mean_token_accuracy": 0.9543886780738831, + "eval_num_tokens": 56829431.0, + "eval_runtime": 0.164, + "eval_samples_per_second": 24.384, + "eval_steps_per_second": 6.096, + "step": 7000 + }, + { + "entropy": 0.32015385031700133, + "epoch": 3.9531602708803613, + "grad_norm": 1.8100392818450928, + "learning_rate": 4.650071186957049e-06, + "loss": 0.2236, + "mean_token_accuracy": 0.9274610161781311, + "num_tokens": 56870084.0, + "step": 7005 + }, + { + "entropy": 0.31498663425445556, + "epoch": 3.955981941309255, + "grad_norm": 1.7717326879501343, + "learning_rate": 4.6495875777594485e-06, + "loss": 0.2223, + "mean_token_accuracy": 0.927357268333435, + "num_tokens": 56910874.0, + "step": 7010 + }, + { + "entropy": 0.2908495903015137, + "epoch": 3.958803611738149, + "grad_norm": 2.0041091442108154, + "learning_rate": 4.64910366905642e-06, + "loss": 0.1944, + "mean_token_accuracy": 0.9367148995399475, + "num_tokens": 56951468.0, + "step": 7015 + }, + { + "entropy": 0.3460938811302185, + "epoch": 3.961625282167043, + "grad_norm": 2.049760580062866, + "learning_rate": 4.648619460943027e-06, + "loss": 0.2347, + "mean_token_accuracy": 0.9227741837501526, + "num_tokens": 56992209.0, + "step": 7020 + }, + { + "entropy": 0.31545761227607727, + "epoch": 3.964446952595937, + "grad_norm": 2.2006888389587402, + "learning_rate": 4.6481349535143934e-06, + "loss": 0.21, + "mean_token_accuracy": 0.9324324250221252, + "num_tokens": 57033074.0, + "step": 7025 + }, + { + "entropy": 0.2913439154624939, + "epoch": 3.9672686230248306, + "grad_norm": 1.7497133016586304, + "learning_rate": 4.6476501468657e-06, + "loss": 0.1987, + "mean_token_accuracy": 0.934621024131775, + "num_tokens": 57073763.0, + "step": 7030 + }, + { + "entropy": 0.3054608583450317, + "epoch": 3.9700902934537243, + "grad_norm": 1.3627029657363892, + "learning_rate": 4.647165041092187e-06, + "loss": 0.1965, + "mean_token_accuracy": 0.9352095484733581, + "num_tokens": 57114140.0, + "step": 7035 + }, + { + "entropy": 0.31089224219322203, + "epoch": 3.9729119638826185, + "grad_norm": 1.926986575126648, + "learning_rate": 4.646679636289154e-06, + "loss": 0.2129, + "mean_token_accuracy": 0.9315391659736634, + "num_tokens": 57154688.0, + "step": 7040 + }, + { + "entropy": 0.29714728593826295, + "epoch": 3.9757336343115126, + "grad_norm": 2.036738872528076, + "learning_rate": 4.646193932551959e-06, + "loss": 0.2094, + "mean_token_accuracy": 0.9334526896476746, + "num_tokens": 57195301.0, + "step": 7045 + }, + { + "entropy": 0.3167958378791809, + "epoch": 3.9785553047404063, + "grad_norm": 2.016421318054199, + "learning_rate": 4.645707929976018e-06, + "loss": 0.2166, + "mean_token_accuracy": 0.9300056099891663, + "num_tokens": 57235979.0, + "step": 7050 + }, + { + "entropy": 0.2984152317047119, + "epoch": 3.9813769751693, + "grad_norm": 2.002079486846924, + "learning_rate": 4.645221628656806e-06, + "loss": 0.1954, + "mean_token_accuracy": 0.9366149425506591, + "num_tokens": 57276713.0, + "step": 7055 + }, + { + "entropy": 0.32194399237632754, + "epoch": 3.984198645598194, + "grad_norm": 1.7218655347824097, + "learning_rate": 4.644735028689858e-06, + "loss": 0.2147, + "mean_token_accuracy": 0.9305589437484741, + "num_tokens": 57317326.0, + "step": 7060 + }, + { + "entropy": 0.317461222410202, + "epoch": 3.9870203160270883, + "grad_norm": 2.030597686767578, + "learning_rate": 4.644248130170766e-06, + "loss": 0.225, + "mean_token_accuracy": 0.927132534980774, + "num_tokens": 57358188.0, + "step": 7065 + }, + { + "entropy": 0.30844367146492, + "epoch": 3.989841986455982, + "grad_norm": 1.770613193511963, + "learning_rate": 4.643760933195182e-06, + "loss": 0.2062, + "mean_token_accuracy": 0.9343749284744263, + "num_tokens": 57398939.0, + "step": 7070 + }, + { + "entropy": 0.2940232276916504, + "epoch": 3.9926636568848757, + "grad_norm": 1.9694218635559082, + "learning_rate": 4.643273437858814e-06, + "loss": 0.1933, + "mean_token_accuracy": 0.9365314722061158, + "num_tokens": 57439332.0, + "step": 7075 + }, + { + "entropy": 0.30736419558525085, + "epoch": 3.99548532731377, + "grad_norm": 1.9869581460952759, + "learning_rate": 4.642785644257432e-06, + "loss": 0.2046, + "mean_token_accuracy": 0.9342207312583923, + "num_tokens": 57479943.0, + "step": 7080 + }, + { + "entropy": 0.3101332366466522, + "epoch": 3.9983069977426635, + "grad_norm": 1.9268759489059448, + "learning_rate": 4.6422975524868635e-06, + "loss": 0.2151, + "mean_token_accuracy": 0.9306729078292847, + "num_tokens": 57520661.0, + "step": 7085 + }, + { + "entropy": 0.3069190442562103, + "epoch": 4.001128668171558, + "grad_norm": 1.7287434339523315, + "learning_rate": 4.641809162642993e-06, + "loss": 0.1752, + "mean_token_accuracy": 0.9477459907531738, + "num_tokens": 57555350.0, + "step": 7090 + }, + { + "entropy": 0.2743695855140686, + "epoch": 4.003950338600451, + "grad_norm": 1.4140337705612183, + "learning_rate": 4.641320474821765e-06, + "loss": 0.128, + "mean_token_accuracy": 0.9617153882980347, + "num_tokens": 57596053.0, + "step": 7095 + }, + { + "entropy": 0.2616827428340912, + "epoch": 4.006772009029345, + "grad_norm": 2.273259162902832, + "learning_rate": 4.640831489119184e-06, + "loss": 0.1496, + "mean_token_accuracy": 0.9549853324890136, + "num_tokens": 57636820.0, + "step": 7100 + }, + { + "entropy": 0.21969364285469056, + "epoch": 4.00959367945824, + "grad_norm": 1.9657589197158813, + "learning_rate": 4.640342205631309e-06, + "loss": 0.1234, + "mean_token_accuracy": 0.961889123916626, + "num_tokens": 57677102.0, + "step": 7105 + }, + { + "entropy": 0.2414553850889206, + "epoch": 4.012415349887133, + "grad_norm": 2.4210760593414307, + "learning_rate": 4.639852624454261e-06, + "loss": 0.1491, + "mean_token_accuracy": 0.9532399654388428, + "num_tokens": 57717694.0, + "step": 7110 + }, + { + "entropy": 0.22773428559303283, + "epoch": 4.015237020316027, + "grad_norm": 2.0775444507598877, + "learning_rate": 4.639362745684219e-06, + "loss": 0.1188, + "mean_token_accuracy": 0.9613750100135803, + "num_tokens": 57758420.0, + "step": 7115 + }, + { + "entropy": 0.24161579608917236, + "epoch": 4.018058690744921, + "grad_norm": 1.9448866844177246, + "learning_rate": 4.638872569417417e-06, + "loss": 0.1306, + "mean_token_accuracy": 0.9591588497161865, + "num_tokens": 57799103.0, + "step": 7120 + }, + { + "entropy": 0.2467570424079895, + "epoch": 4.020880361173815, + "grad_norm": 1.8447470664978027, + "learning_rate": 4.638382095750152e-06, + "loss": 0.1285, + "mean_token_accuracy": 0.9602084875106811, + "num_tokens": 57839297.0, + "step": 7125 + }, + { + "entropy": 0.24987946152687074, + "epoch": 4.023702031602709, + "grad_norm": 2.2855517864227295, + "learning_rate": 4.6378913247787786e-06, + "loss": 0.1235, + "mean_token_accuracy": 0.9607668399810791, + "num_tokens": 57879861.0, + "step": 7130 + }, + { + "entropy": 0.23636153638362883, + "epoch": 4.026523702031603, + "grad_norm": 1.8914059400558472, + "learning_rate": 4.637400256599707e-06, + "loss": 0.133, + "mean_token_accuracy": 0.9598736882209777, + "num_tokens": 57920707.0, + "step": 7135 + }, + { + "entropy": 0.2302469491958618, + "epoch": 4.029345372460496, + "grad_norm": 2.0456087589263916, + "learning_rate": 4.636908891309408e-06, + "loss": 0.1263, + "mean_token_accuracy": 0.9601135969161987, + "num_tokens": 57961297.0, + "step": 7140 + }, + { + "entropy": 0.23698365390300752, + "epoch": 4.03216704288939, + "grad_norm": 1.853820562362671, + "learning_rate": 4.636417229004412e-06, + "loss": 0.1325, + "mean_token_accuracy": 0.9583729267120361, + "num_tokens": 58001984.0, + "step": 7145 + }, + { + "entropy": 0.22845596373081206, + "epoch": 4.034988713318285, + "grad_norm": 1.8284083604812622, + "learning_rate": 4.635925269781305e-06, + "loss": 0.1264, + "mean_token_accuracy": 0.9601821541786194, + "num_tokens": 58042688.0, + "step": 7150 + }, + { + "entropy": 0.21759492456912993, + "epoch": 4.037810383747178, + "grad_norm": 2.047950267791748, + "learning_rate": 4.6354330137367305e-06, + "loss": 0.1157, + "mean_token_accuracy": 0.963828158378601, + "num_tokens": 58083388.0, + "step": 7155 + }, + { + "entropy": 0.22680849432945252, + "epoch": 4.040632054176072, + "grad_norm": 1.9068692922592163, + "learning_rate": 4.634940460967396e-06, + "loss": 0.1217, + "mean_token_accuracy": 0.9618602633476258, + "num_tokens": 58124041.0, + "step": 7160 + }, + { + "entropy": 0.22470956444740295, + "epoch": 4.043453724604966, + "grad_norm": 1.6333588361740112, + "learning_rate": 4.634447611570061e-06, + "loss": 0.1182, + "mean_token_accuracy": 0.9632346987724304, + "num_tokens": 58164726.0, + "step": 7165 + }, + { + "entropy": 0.2346551775932312, + "epoch": 4.04627539503386, + "grad_norm": 1.8143978118896484, + "learning_rate": 4.633954465641546e-06, + "loss": 0.1325, + "mean_token_accuracy": 0.9576727986335755, + "num_tokens": 58205286.0, + "step": 7170 + }, + { + "entropy": 0.23285722136497497, + "epoch": 4.049097065462754, + "grad_norm": 1.8555634021759033, + "learning_rate": 4.633461023278731e-06, + "loss": 0.1157, + "mean_token_accuracy": 0.9634591937065125, + "num_tokens": 58246019.0, + "step": 7175 + }, + { + "entropy": 0.23194559514522553, + "epoch": 4.051918735891648, + "grad_norm": 1.8216824531555176, + "learning_rate": 4.632967284578551e-06, + "loss": 0.1168, + "mean_token_accuracy": 0.9622029185295105, + "num_tokens": 58286809.0, + "step": 7180 + }, + { + "entropy": 0.25160720348358157, + "epoch": 4.0547404063205414, + "grad_norm": 1.9061291217803955, + "learning_rate": 4.632473249638003e-06, + "loss": 0.1255, + "mean_token_accuracy": 0.9632068753242493, + "num_tokens": 58327226.0, + "step": 7185 + }, + { + "entropy": 0.22852468192577363, + "epoch": 4.057562076749436, + "grad_norm": 1.9731613397598267, + "learning_rate": 4.631978918554139e-06, + "loss": 0.1235, + "mean_token_accuracy": 0.9616599082946777, + "num_tokens": 58367956.0, + "step": 7190 + }, + { + "entropy": 0.2459454208612442, + "epoch": 4.06038374717833, + "grad_norm": 1.9121125936508179, + "learning_rate": 4.631484291424069e-06, + "loss": 0.1461, + "mean_token_accuracy": 0.9540547609329224, + "num_tokens": 58408674.0, + "step": 7195 + }, + { + "entropy": 0.24432432651519775, + "epoch": 4.063205417607223, + "grad_norm": 1.7734456062316895, + "learning_rate": 4.630989368344966e-06, + "loss": 0.1334, + "mean_token_accuracy": 0.9592110633850097, + "num_tokens": 58448721.0, + "step": 7200 + }, + { + "entropy": 0.24515534937381744, + "epoch": 4.066027088036117, + "grad_norm": 2.12031888961792, + "learning_rate": 4.630494149414054e-06, + "loss": 0.1274, + "mean_token_accuracy": 0.9601586699485779, + "num_tokens": 58489397.0, + "step": 7205 + }, + { + "entropy": 0.23616698682308196, + "epoch": 4.068848758465012, + "grad_norm": 1.635472059249878, + "learning_rate": 4.629998634728622e-06, + "loss": 0.1233, + "mean_token_accuracy": 0.9608691930770874, + "num_tokens": 58529908.0, + "step": 7210 + }, + { + "entropy": 0.24083788096904754, + "epoch": 4.071670428893905, + "grad_norm": 1.929811716079712, + "learning_rate": 4.629502824386013e-06, + "loss": 0.1248, + "mean_token_accuracy": 0.9608209013938904, + "num_tokens": 58570278.0, + "step": 7215 + }, + { + "entropy": 0.24579550325870514, + "epoch": 4.074492099322799, + "grad_norm": 2.148528814315796, + "learning_rate": 4.629006718483627e-06, + "loss": 0.1347, + "mean_token_accuracy": 0.9587805867195129, + "num_tokens": 58611040.0, + "step": 7220 + }, + { + "entropy": 0.23470092117786406, + "epoch": 4.077313769751693, + "grad_norm": 1.7747644186019897, + "learning_rate": 4.628510317118927e-06, + "loss": 0.1234, + "mean_token_accuracy": 0.961380934715271, + "num_tokens": 58651615.0, + "step": 7225 + }, + { + "entropy": 0.21981629133224487, + "epoch": 4.0801354401805865, + "grad_norm": 2.0685737133026123, + "learning_rate": 4.628013620389429e-06, + "loss": 0.1237, + "mean_token_accuracy": 0.960912036895752, + "num_tokens": 58692473.0, + "step": 7230 + }, + { + "entropy": 0.22954968512058258, + "epoch": 4.082957110609481, + "grad_norm": 2.0184037685394287, + "learning_rate": 4.62751662839271e-06, + "loss": 0.1328, + "mean_token_accuracy": 0.9574317932128906, + "num_tokens": 58733055.0, + "step": 7235 + }, + { + "entropy": 0.2246454894542694, + "epoch": 4.085778781038375, + "grad_norm": 1.9148306846618652, + "learning_rate": 4.627019341226404e-06, + "loss": 0.1156, + "mean_token_accuracy": 0.9631801724433899, + "num_tokens": 58773853.0, + "step": 7240 + }, + { + "entropy": 0.23507523834705352, + "epoch": 4.0886004514672685, + "grad_norm": 1.794526219367981, + "learning_rate": 4.626521758988204e-06, + "loss": 0.1167, + "mean_token_accuracy": 0.9632613658905029, + "num_tokens": 58814514.0, + "step": 7245 + }, + { + "entropy": 0.23969656229019165, + "epoch": 4.091422121896162, + "grad_norm": 1.9400169849395752, + "learning_rate": 4.626023881775858e-06, + "loss": 0.1284, + "mean_token_accuracy": 0.9598143339157105, + "num_tokens": 58854908.0, + "step": 7250 + }, + { + "entropy": 0.2598514884710312, + "epoch": 4.094243792325057, + "grad_norm": 2.0811522006988525, + "learning_rate": 4.625525709687176e-06, + "loss": 0.1422, + "mean_token_accuracy": 0.9565314888954163, + "num_tokens": 58895394.0, + "step": 7255 + }, + { + "entropy": 0.24920229315757753, + "epoch": 4.0970654627539504, + "grad_norm": 1.6480082273483276, + "learning_rate": 4.625027242820023e-06, + "loss": 0.1286, + "mean_token_accuracy": 0.9599669218063355, + "num_tokens": 58935939.0, + "step": 7260 + }, + { + "entropy": 0.23545166254043579, + "epoch": 4.099887133182844, + "grad_norm": 2.00404691696167, + "learning_rate": 4.6245284812723234e-06, + "loss": 0.1257, + "mean_token_accuracy": 0.9604501008987427, + "num_tokens": 58976341.0, + "step": 7265 + }, + { + "entropy": 0.23354564011096954, + "epoch": 4.102708803611738, + "grad_norm": 2.3596303462982178, + "learning_rate": 4.624029425142059e-06, + "loss": 0.1312, + "mean_token_accuracy": 0.9599619388580323, + "num_tokens": 59016822.0, + "step": 7270 + }, + { + "entropy": 0.24546316564083098, + "epoch": 4.105530474040632, + "grad_norm": 1.6743675470352173, + "learning_rate": 4.623530074527269e-06, + "loss": 0.1286, + "mean_token_accuracy": 0.9613336443901062, + "num_tokens": 59057357.0, + "step": 7275 + }, + { + "entropy": 0.23465977013111114, + "epoch": 4.108352144469526, + "grad_norm": 1.9552369117736816, + "learning_rate": 4.6230304295260504e-06, + "loss": 0.1138, + "mean_token_accuracy": 0.963888657093048, + "num_tokens": 59098128.0, + "step": 7280 + }, + { + "entropy": 0.22476655542850493, + "epoch": 4.11117381489842, + "grad_norm": 1.9228246212005615, + "learning_rate": 4.62253049023656e-06, + "loss": 0.132, + "mean_token_accuracy": 0.9577814340591431, + "num_tokens": 59138682.0, + "step": 7285 + }, + { + "entropy": 0.23238305151462554, + "epoch": 4.1139954853273135, + "grad_norm": 1.8508713245391846, + "learning_rate": 4.62203025675701e-06, + "loss": 0.1336, + "mean_token_accuracy": 0.9592737793922425, + "num_tokens": 59179330.0, + "step": 7290 + }, + { + "entropy": 0.21901310980319977, + "epoch": 4.116817155756208, + "grad_norm": 1.9854564666748047, + "learning_rate": 4.621529729185671e-06, + "loss": 0.1146, + "mean_token_accuracy": 0.9635023355484009, + "num_tokens": 59220102.0, + "step": 7295 + }, + { + "entropy": 0.23093846142292024, + "epoch": 4.119638826185102, + "grad_norm": 1.7176353931427002, + "learning_rate": 4.621028907620873e-06, + "loss": 0.129, + "mean_token_accuracy": 0.9593886017799378, + "num_tokens": 59260732.0, + "step": 7300 + }, + { + "entropy": 0.2379404127597809, + "epoch": 4.1224604966139955, + "grad_norm": 1.8273789882659912, + "learning_rate": 4.620527792161001e-06, + "loss": 0.1124, + "mean_token_accuracy": 0.9650070428848266, + "num_tokens": 59301380.0, + "step": 7305 + }, + { + "entropy": 0.22208026945590972, + "epoch": 4.125282167042889, + "grad_norm": 1.7196441888809204, + "learning_rate": 4.620026382904499e-06, + "loss": 0.1052, + "mean_token_accuracy": 0.9668426513671875, + "num_tokens": 59342144.0, + "step": 7310 + }, + { + "entropy": 0.23061503767967223, + "epoch": 4.128103837471783, + "grad_norm": 2.7583887577056885, + "learning_rate": 4.61952467994987e-06, + "loss": 0.1236, + "mean_token_accuracy": 0.9621009945869445, + "num_tokens": 59382762.0, + "step": 7315 + }, + { + "entropy": 0.22966778576374053, + "epoch": 4.1309255079006775, + "grad_norm": 2.4049155712127686, + "learning_rate": 4.619022683395675e-06, + "loss": 0.131, + "mean_token_accuracy": 0.9585760951042175, + "num_tokens": 59423356.0, + "step": 7320 + }, + { + "entropy": 0.23155851662158966, + "epoch": 4.133747178329571, + "grad_norm": 1.8571085929870605, + "learning_rate": 4.618520393340528e-06, + "loss": 0.1282, + "mean_token_accuracy": 0.9598821759223938, + "num_tokens": 59463990.0, + "step": 7325 + }, + { + "entropy": 0.2637378484010696, + "epoch": 4.136568848758465, + "grad_norm": 2.0401177406311035, + "learning_rate": 4.618017809883107e-06, + "loss": 0.1305, + "mean_token_accuracy": 0.9606005668640136, + "num_tokens": 59504742.0, + "step": 7330 + }, + { + "entropy": 0.2353299468755722, + "epoch": 4.139390519187359, + "grad_norm": 2.042680263519287, + "learning_rate": 4.617514933122142e-06, + "loss": 0.1304, + "mean_token_accuracy": 0.9596582651138306, + "num_tokens": 59545424.0, + "step": 7335 + }, + { + "entropy": 0.22721785604953765, + "epoch": 4.142212189616253, + "grad_norm": 2.117658853530884, + "learning_rate": 4.6170117631564246e-06, + "loss": 0.115, + "mean_token_accuracy": 0.9629755258560181, + "num_tokens": 59586178.0, + "step": 7340 + }, + { + "entropy": 0.2232923239469528, + "epoch": 4.145033860045147, + "grad_norm": 1.8017418384552002, + "learning_rate": 4.616508300084803e-06, + "loss": 0.129, + "mean_token_accuracy": 0.9588123679161071, + "num_tokens": 59626782.0, + "step": 7345 + }, + { + "entropy": 0.22652737498283387, + "epoch": 4.1478555304740405, + "grad_norm": 2.2765512466430664, + "learning_rate": 4.616004544006181e-06, + "loss": 0.1175, + "mean_token_accuracy": 0.9620057940483093, + "num_tokens": 59667236.0, + "step": 7350 + }, + { + "entropy": 0.23027748465538025, + "epoch": 4.150677200902934, + "grad_norm": 1.8993479013442993, + "learning_rate": 4.615500495019523e-06, + "loss": 0.1223, + "mean_token_accuracy": 0.9609902024269104, + "num_tokens": 59708042.0, + "step": 7355 + }, + { + "entropy": 0.22403572499752045, + "epoch": 4.153498871331829, + "grad_norm": 2.1379618644714355, + "learning_rate": 4.614996153223849e-06, + "loss": 0.1266, + "mean_token_accuracy": 0.9605488657951355, + "num_tokens": 59748650.0, + "step": 7360 + }, + { + "entropy": 0.2490081638097763, + "epoch": 4.1563205417607225, + "grad_norm": 2.036970853805542, + "learning_rate": 4.614491518718237e-06, + "loss": 0.1421, + "mean_token_accuracy": 0.9558643579483033, + "num_tokens": 59789397.0, + "step": 7365 + }, + { + "entropy": 0.21482129395008087, + "epoch": 4.159142212189616, + "grad_norm": 1.7927038669586182, + "learning_rate": 4.613986591601823e-06, + "loss": 0.1081, + "mean_token_accuracy": 0.9661535143852233, + "num_tokens": 59830163.0, + "step": 7370 + }, + { + "entropy": 0.23096846640110016, + "epoch": 4.16196388261851, + "grad_norm": 2.013101816177368, + "learning_rate": 4.613481371973799e-06, + "loss": 0.1247, + "mean_token_accuracy": 0.9610580563545227, + "num_tokens": 59870838.0, + "step": 7375 + }, + { + "entropy": 0.23506858050823212, + "epoch": 4.164785553047404, + "grad_norm": 2.304497480392456, + "learning_rate": 4.612975859933415e-06, + "loss": 0.1292, + "mean_token_accuracy": 0.9596829771995544, + "num_tokens": 59911676.0, + "step": 7380 + }, + { + "entropy": 0.2367018163204193, + "epoch": 4.167607223476298, + "grad_norm": 2.2349987030029297, + "learning_rate": 4.612470055579982e-06, + "loss": 0.1316, + "mean_token_accuracy": 0.9579625487327575, + "num_tokens": 59952218.0, + "step": 7385 + }, + { + "entropy": 0.22246408462524414, + "epoch": 4.170428893905192, + "grad_norm": 1.9588614702224731, + "learning_rate": 4.611963959012862e-06, + "loss": 0.1166, + "mean_token_accuracy": 0.9629753589630127, + "num_tokens": 59992290.0, + "step": 7390 + }, + { + "entropy": 0.22336077094078063, + "epoch": 4.173250564334086, + "grad_norm": 1.994891881942749, + "learning_rate": 4.611457570331479e-06, + "loss": 0.1155, + "mean_token_accuracy": 0.9629181385040283, + "num_tokens": 60033130.0, + "step": 7395 + }, + { + "entropy": 0.24217330813407897, + "epoch": 4.176072234762979, + "grad_norm": 1.8636099100112915, + "learning_rate": 4.610950889635313e-06, + "loss": 0.1315, + "mean_token_accuracy": 0.9586685538291931, + "num_tokens": 60073764.0, + "step": 7400 + }, + { + "entropy": 0.2141832858324051, + "epoch": 4.178893905191874, + "grad_norm": 1.7918341159820557, + "learning_rate": 4.6104439170239015e-06, + "loss": 0.1143, + "mean_token_accuracy": 0.9644059538841248, + "num_tokens": 60114234.0, + "step": 7405 + }, + { + "entropy": 0.2275536447763443, + "epoch": 4.181715575620768, + "grad_norm": 1.8082565069198608, + "learning_rate": 4.609936652596841e-06, + "loss": 0.121, + "mean_token_accuracy": 0.961764144897461, + "num_tokens": 60154149.0, + "step": 7410 + }, + { + "entropy": 0.24505477845668794, + "epoch": 4.184537246049661, + "grad_norm": 1.9884941577911377, + "learning_rate": 4.60942909645378e-06, + "loss": 0.1369, + "mean_token_accuracy": 0.956404197216034, + "num_tokens": 60194901.0, + "step": 7415 + }, + { + "entropy": 0.2267145186662674, + "epoch": 4.187358916478555, + "grad_norm": 2.0457382202148438, + "learning_rate": 4.608921248694431e-06, + "loss": 0.129, + "mean_token_accuracy": 0.9587201476097107, + "num_tokens": 60235805.0, + "step": 7420 + }, + { + "entropy": 0.24519952833652497, + "epoch": 4.1901805869074495, + "grad_norm": 2.398599147796631, + "learning_rate": 4.6084131094185594e-06, + "loss": 0.1345, + "mean_token_accuracy": 0.9566538453102111, + "num_tokens": 60276550.0, + "step": 7425 + }, + { + "entropy": 0.23022598624229432, + "epoch": 4.193002257336343, + "grad_norm": 1.9493690729141235, + "learning_rate": 4.607904678725989e-06, + "loss": 0.1237, + "mean_token_accuracy": 0.9610730051994324, + "num_tokens": 60316464.0, + "step": 7430 + }, + { + "entropy": 0.22809267342090606, + "epoch": 4.195823927765237, + "grad_norm": 2.157726287841797, + "learning_rate": 4.607395956716603e-06, + "loss": 0.12, + "mean_token_accuracy": 0.9622081279754638, + "num_tokens": 60356857.0, + "step": 7435 + }, + { + "entropy": 0.2448444426059723, + "epoch": 4.198645598194131, + "grad_norm": 2.016291379928589, + "learning_rate": 4.606886943490338e-06, + "loss": 0.1329, + "mean_token_accuracy": 0.9577858328819275, + "num_tokens": 60397631.0, + "step": 7440 + }, + { + "entropy": 0.2451613575220108, + "epoch": 4.201467268623025, + "grad_norm": 2.0515503883361816, + "learning_rate": 4.60637763914719e-06, + "loss": 0.1414, + "mean_token_accuracy": 0.9543289184570313, + "num_tokens": 60437991.0, + "step": 7445 + }, + { + "entropy": 0.2392250567674637, + "epoch": 4.204288939051919, + "grad_norm": 1.8992726802825928, + "learning_rate": 4.605868043787213e-06, + "loss": 0.1336, + "mean_token_accuracy": 0.9581851363182068, + "num_tokens": 60478732.0, + "step": 7450 + }, + { + "entropy": 0.23091658651828767, + "epoch": 4.207110609480813, + "grad_norm": 1.9759114980697632, + "learning_rate": 4.605358157510516e-06, + "loss": 0.1108, + "mean_token_accuracy": 0.9652905344963074, + "num_tokens": 60518543.0, + "step": 7455 + }, + { + "entropy": 0.23277661204338074, + "epoch": 4.209932279909706, + "grad_norm": 1.8714420795440674, + "learning_rate": 4.6048479804172666e-06, + "loss": 0.1099, + "mean_token_accuracy": 0.966291892528534, + "num_tokens": 60559260.0, + "step": 7460 + }, + { + "entropy": 0.23967285752296447, + "epoch": 4.2127539503386, + "grad_norm": 2.056525468826294, + "learning_rate": 4.604337512607689e-06, + "loss": 0.1228, + "mean_token_accuracy": 0.9611961007118225, + "num_tokens": 60600020.0, + "step": 7465 + }, + { + "entropy": 0.24567246735095977, + "epoch": 4.215575620767495, + "grad_norm": 2.1545474529266357, + "learning_rate": 4.603826754182065e-06, + "loss": 0.1493, + "mean_token_accuracy": 0.9537813544273377, + "num_tokens": 60640575.0, + "step": 7470 + }, + { + "entropy": 0.2454807221889496, + "epoch": 4.218397291196388, + "grad_norm": 2.1680426597595215, + "learning_rate": 4.603315705240732e-06, + "loss": 0.1382, + "mean_token_accuracy": 0.956198763847351, + "num_tokens": 60681128.0, + "step": 7475 + }, + { + "entropy": 0.24590539932250977, + "epoch": 4.221218961625282, + "grad_norm": 1.8926993608474731, + "learning_rate": 4.602804365884088e-06, + "loss": 0.1257, + "mean_token_accuracy": 0.960571038722992, + "num_tokens": 60721973.0, + "step": 7480 + }, + { + "entropy": 0.23549313545227052, + "epoch": 4.224040632054176, + "grad_norm": 2.156057119369507, + "learning_rate": 4.602292736212583e-06, + "loss": 0.1329, + "mean_token_accuracy": 0.9587239861488343, + "num_tokens": 60762628.0, + "step": 7485 + }, + { + "entropy": 0.2402738958597183, + "epoch": 4.22686230248307, + "grad_norm": 1.9733823537826538, + "learning_rate": 4.601780816326728e-06, + "loss": 0.1245, + "mean_token_accuracy": 0.9602890253067017, + "num_tokens": 60803434.0, + "step": 7490 + }, + { + "entropy": 0.243062624335289, + "epoch": 4.229683972911964, + "grad_norm": 2.058126211166382, + "learning_rate": 4.60126860632709e-06, + "loss": 0.1329, + "mean_token_accuracy": 0.9585610747337341, + "num_tokens": 60844279.0, + "step": 7495 + }, + { + "entropy": 0.23409413993358613, + "epoch": 4.232505643340858, + "grad_norm": 1.9692258834838867, + "learning_rate": 4.600756106314292e-06, + "loss": 0.1167, + "mean_token_accuracy": 0.9634551763534546, + "num_tokens": 60884801.0, + "step": 7500 + }, + { + "epoch": 4.232505643340858, + "eval_entropy": 0.2763104736804962, + "eval_loss": 0.11991167068481445, + "eval_mean_token_accuracy": 0.9616711139678955, + "eval_num_tokens": 60884801.0, + "eval_runtime": 0.1639, + "eval_samples_per_second": 24.403, + "eval_steps_per_second": 6.101, + "step": 7500 + }, + { + "entropy": 0.24236758947372436, + "epoch": 4.235327313769751, + "grad_norm": 2.3060202598571777, + "learning_rate": 4.6002433163890156e-06, + "loss": 0.1294, + "mean_token_accuracy": 0.959705400466919, + "num_tokens": 60925426.0, + "step": 7505 + }, + { + "entropy": 0.21380599737167358, + "epoch": 4.238148984198646, + "grad_norm": 2.0318894386291504, + "learning_rate": 4.599730236651998e-06, + "loss": 0.1058, + "mean_token_accuracy": 0.965471088886261, + "num_tokens": 60966196.0, + "step": 7510 + }, + { + "entropy": 0.22909758388996124, + "epoch": 4.24097065462754, + "grad_norm": 1.807207465171814, + "learning_rate": 4.5992168672040335e-06, + "loss": 0.1201, + "mean_token_accuracy": 0.9617739319801331, + "num_tokens": 61006851.0, + "step": 7515 + }, + { + "entropy": 0.2145596742630005, + "epoch": 4.243792325056433, + "grad_norm": 1.704498052597046, + "learning_rate": 4.598703208145974e-06, + "loss": 0.1079, + "mean_token_accuracy": 0.9660002827644348, + "num_tokens": 61047687.0, + "step": 7520 + }, + { + "entropy": 0.2247183084487915, + "epoch": 4.246613995485327, + "grad_norm": 2.1221797466278076, + "learning_rate": 4.598189259578727e-06, + "loss": 0.1279, + "mean_token_accuracy": 0.9582555651664734, + "num_tokens": 61088309.0, + "step": 7525 + }, + { + "entropy": 0.23647244274616241, + "epoch": 4.249435665914222, + "grad_norm": 2.2665932178497314, + "learning_rate": 4.597675021603259e-06, + "loss": 0.1294, + "mean_token_accuracy": 0.9585322737693787, + "num_tokens": 61129103.0, + "step": 7530 + }, + { + "entropy": 0.21924723386764527, + "epoch": 4.252257336343115, + "grad_norm": 2.0743370056152344, + "learning_rate": 4.597160494320592e-06, + "loss": 0.1204, + "mean_token_accuracy": 0.9610903978347778, + "num_tokens": 61169817.0, + "step": 7535 + }, + { + "entropy": 0.22164719700813293, + "epoch": 4.255079006772009, + "grad_norm": 1.8417187929153442, + "learning_rate": 4.596645677831804e-06, + "loss": 0.1236, + "mean_token_accuracy": 0.9602578401565551, + "num_tokens": 61210598.0, + "step": 7540 + }, + { + "entropy": 0.2332447052001953, + "epoch": 4.257900677200903, + "grad_norm": 1.8406648635864258, + "learning_rate": 4.596130572238031e-06, + "loss": 0.1174, + "mean_token_accuracy": 0.9627135396003723, + "num_tokens": 61251185.0, + "step": 7545 + }, + { + "entropy": 0.23616610169410707, + "epoch": 4.260722347629796, + "grad_norm": 1.8603347539901733, + "learning_rate": 4.595615177640466e-06, + "loss": 0.1263, + "mean_token_accuracy": 0.9601802945137023, + "num_tokens": 61291752.0, + "step": 7550 + }, + { + "entropy": 0.22554007172584534, + "epoch": 4.263544018058691, + "grad_norm": 2.3185107707977295, + "learning_rate": 4.59509949414036e-06, + "loss": 0.119, + "mean_token_accuracy": 0.9624182105064392, + "num_tokens": 61332441.0, + "step": 7555 + }, + { + "entropy": 0.2530226230621338, + "epoch": 4.266365688487585, + "grad_norm": 2.326624631881714, + "learning_rate": 4.594583521839015e-06, + "loss": 0.1415, + "mean_token_accuracy": 0.9568467020988465, + "num_tokens": 61373014.0, + "step": 7560 + }, + { + "entropy": 0.22727547585964203, + "epoch": 4.269187358916478, + "grad_norm": 2.162659168243408, + "learning_rate": 4.594067260837796e-06, + "loss": 0.1309, + "mean_token_accuracy": 0.9592653393745423, + "num_tokens": 61413745.0, + "step": 7565 + }, + { + "entropy": 0.24552667737007142, + "epoch": 4.272009029345372, + "grad_norm": 2.3903839588165283, + "learning_rate": 4.593550711238123e-06, + "loss": 0.1324, + "mean_token_accuracy": 0.9594314217567443, + "num_tokens": 61454234.0, + "step": 7570 + }, + { + "entropy": 0.23494355678558348, + "epoch": 4.274830699774267, + "grad_norm": 1.9757755994796753, + "learning_rate": 4.5930338731414726e-06, + "loss": 0.1304, + "mean_token_accuracy": 0.9587761878967285, + "num_tokens": 61494930.0, + "step": 7575 + }, + { + "entropy": 0.24661367535591125, + "epoch": 4.27765237020316, + "grad_norm": 2.0399813652038574, + "learning_rate": 4.592516746649377e-06, + "loss": 0.1321, + "mean_token_accuracy": 0.9585609912872315, + "num_tokens": 61535747.0, + "step": 7580 + }, + { + "entropy": 0.23092095851898192, + "epoch": 4.280474040632054, + "grad_norm": 2.1672401428222656, + "learning_rate": 4.591999331863425e-06, + "loss": 0.1239, + "mean_token_accuracy": 0.9604844927787781, + "num_tokens": 61576529.0, + "step": 7585 + }, + { + "entropy": 0.2526959002017975, + "epoch": 4.283295711060948, + "grad_norm": 2.322620153427124, + "learning_rate": 4.5914816288852645e-06, + "loss": 0.1319, + "mean_token_accuracy": 0.9580205917358399, + "num_tokens": 61617030.0, + "step": 7590 + }, + { + "entropy": 0.22551030814647674, + "epoch": 4.286117381489842, + "grad_norm": 1.7364836931228638, + "learning_rate": 4.590963637816596e-06, + "loss": 0.127, + "mean_token_accuracy": 0.960225510597229, + "num_tokens": 61657559.0, + "step": 7595 + }, + { + "entropy": 0.22876082360744476, + "epoch": 4.288939051918736, + "grad_norm": 1.735487461090088, + "learning_rate": 4.590445358759181e-06, + "loss": 0.1165, + "mean_token_accuracy": 0.961608099937439, + "num_tokens": 61697918.0, + "step": 7600 + }, + { + "entropy": 0.23216151893138887, + "epoch": 4.29176072234763, + "grad_norm": 2.070525646209717, + "learning_rate": 4.589926791814836e-06, + "loss": 0.1204, + "mean_token_accuracy": 0.9610988855361938, + "num_tokens": 61738812.0, + "step": 7605 + }, + { + "entropy": 0.23693689107894897, + "epoch": 4.294582392776523, + "grad_norm": 2.203585147857666, + "learning_rate": 4.589407937085431e-06, + "loss": 0.1247, + "mean_token_accuracy": 0.9603411674499511, + "num_tokens": 61779432.0, + "step": 7610 + }, + { + "entropy": 0.2458764672279358, + "epoch": 4.297404063205418, + "grad_norm": 2.2714271545410156, + "learning_rate": 4.5888887946728966e-06, + "loss": 0.1412, + "mean_token_accuracy": 0.9558348536491394, + "num_tokens": 61820028.0, + "step": 7615 + }, + { + "entropy": 0.23253954946994781, + "epoch": 4.300225733634312, + "grad_norm": 2.077993154525757, + "learning_rate": 4.588369364679217e-06, + "loss": 0.1219, + "mean_token_accuracy": 0.9627041578292846, + "num_tokens": 61860687.0, + "step": 7620 + }, + { + "entropy": 0.2426275998353958, + "epoch": 4.303047404063205, + "grad_norm": 2.0825798511505127, + "learning_rate": 4.587849647206437e-06, + "loss": 0.1395, + "mean_token_accuracy": 0.9566293358802795, + "num_tokens": 61901354.0, + "step": 7625 + }, + { + "entropy": 0.2425705909729004, + "epoch": 4.305869074492099, + "grad_norm": 1.9618566036224365, + "learning_rate": 4.587329642356654e-06, + "loss": 0.1382, + "mean_token_accuracy": 0.9561069011688232, + "num_tokens": 61941799.0, + "step": 7630 + }, + { + "entropy": 0.2432687759399414, + "epoch": 4.308690744920993, + "grad_norm": 1.9312220811843872, + "learning_rate": 4.586809350232022e-06, + "loss": 0.1262, + "mean_token_accuracy": 0.9600926876068115, + "num_tokens": 61982412.0, + "step": 7635 + }, + { + "entropy": 0.22498698830604552, + "epoch": 4.311512415349887, + "grad_norm": 2.0869600772857666, + "learning_rate": 4.586288770934753e-06, + "loss": 0.1247, + "mean_token_accuracy": 0.9600854396820069, + "num_tokens": 62022929.0, + "step": 7640 + }, + { + "entropy": 0.2627303659915924, + "epoch": 4.314334085778781, + "grad_norm": 2.0073163509368896, + "learning_rate": 4.585767904567115e-06, + "loss": 0.1315, + "mean_token_accuracy": 0.9589227676391602, + "num_tokens": 62063278.0, + "step": 7645 + }, + { + "entropy": 0.2304110735654831, + "epoch": 4.317155756207675, + "grad_norm": 1.8721083402633667, + "learning_rate": 4.585246751231433e-06, + "loss": 0.1155, + "mean_token_accuracy": 0.962857437133789, + "num_tokens": 62103307.0, + "step": 7650 + }, + { + "entropy": 0.24186222553253173, + "epoch": 4.3199774266365685, + "grad_norm": 2.2419540882110596, + "learning_rate": 4.584725311030085e-06, + "loss": 0.126, + "mean_token_accuracy": 0.9607646465301514, + "num_tokens": 62144023.0, + "step": 7655 + }, + { + "entropy": 0.22962496876716615, + "epoch": 4.322799097065463, + "grad_norm": 2.0205132961273193, + "learning_rate": 4.584203584065512e-06, + "loss": 0.1339, + "mean_token_accuracy": 0.9580964565277099, + "num_tokens": 62184285.0, + "step": 7660 + }, + { + "entropy": 0.22456653118133546, + "epoch": 4.325620767494357, + "grad_norm": 1.9401427507400513, + "learning_rate": 4.583681570440204e-06, + "loss": 0.1209, + "mean_token_accuracy": 0.96328786611557, + "num_tokens": 62224200.0, + "step": 7665 + }, + { + "entropy": 0.22748615145683287, + "epoch": 4.3284424379232505, + "grad_norm": 1.9831054210662842, + "learning_rate": 4.583159270256712e-06, + "loss": 0.1157, + "mean_token_accuracy": 0.9626774907112121, + "num_tokens": 62264973.0, + "step": 7670 + }, + { + "entropy": 0.2501515805721283, + "epoch": 4.331264108352144, + "grad_norm": 2.2783570289611816, + "learning_rate": 4.582636683617643e-06, + "loss": 0.1268, + "mean_token_accuracy": 0.9602941393852233, + "num_tokens": 62305564.0, + "step": 7675 + }, + { + "entropy": 0.238142928481102, + "epoch": 4.334085778781039, + "grad_norm": 2.0721852779388428, + "learning_rate": 4.582113810625657e-06, + "loss": 0.133, + "mean_token_accuracy": 0.958099901676178, + "num_tokens": 62345999.0, + "step": 7680 + }, + { + "entropy": 0.22630032598972322, + "epoch": 4.336907449209932, + "grad_norm": 1.8380122184753418, + "learning_rate": 4.581590651383473e-06, + "loss": 0.1168, + "mean_token_accuracy": 0.9639391660690307, + "num_tokens": 62386861.0, + "step": 7685 + }, + { + "entropy": 0.2205124467611313, + "epoch": 4.339729119638826, + "grad_norm": 2.063775062561035, + "learning_rate": 4.581067205993867e-06, + "loss": 0.1078, + "mean_token_accuracy": 0.9652288436889649, + "num_tokens": 62427526.0, + "step": 7690 + }, + { + "entropy": 0.23350533246994018, + "epoch": 4.34255079006772, + "grad_norm": 2.0544824600219727, + "learning_rate": 4.580543474559669e-06, + "loss": 0.1333, + "mean_token_accuracy": 0.9583484888076782, + "num_tokens": 62468033.0, + "step": 7695 + }, + { + "entropy": 0.23622917830944062, + "epoch": 4.345372460496614, + "grad_norm": 1.9537707567214966, + "learning_rate": 4.580019457183766e-06, + "loss": 0.1248, + "mean_token_accuracy": 0.9608685970306396, + "num_tokens": 62508513.0, + "step": 7700 + }, + { + "entropy": 0.22541263699531555, + "epoch": 4.348194130925508, + "grad_norm": 1.8316019773483276, + "learning_rate": 4.579495153969102e-06, + "loss": 0.1235, + "mean_token_accuracy": 0.9609684944152832, + "num_tokens": 62549331.0, + "step": 7705 + }, + { + "entropy": 0.2408158302307129, + "epoch": 4.351015801354402, + "grad_norm": 2.1585938930511475, + "learning_rate": 4.578970565018676e-06, + "loss": 0.1357, + "mean_token_accuracy": 0.9570087313652038, + "num_tokens": 62590028.0, + "step": 7710 + }, + { + "entropy": 0.2397814780473709, + "epoch": 4.3538374717832955, + "grad_norm": 1.9146168231964111, + "learning_rate": 4.578445690435542e-06, + "loss": 0.1205, + "mean_token_accuracy": 0.9615342140197753, + "num_tokens": 62630583.0, + "step": 7715 + }, + { + "entropy": 0.2495466351509094, + "epoch": 4.356659142212189, + "grad_norm": 2.213092565536499, + "learning_rate": 4.577920530322815e-06, + "loss": 0.1317, + "mean_token_accuracy": 0.9592637538909912, + "num_tokens": 62671379.0, + "step": 7720 + }, + { + "entropy": 0.25447321236133574, + "epoch": 4.359480812641084, + "grad_norm": 2.162130355834961, + "learning_rate": 4.5773950847836604e-06, + "loss": 0.1476, + "mean_token_accuracy": 0.9534005284309387, + "num_tokens": 62712032.0, + "step": 7725 + }, + { + "entropy": 0.242551389336586, + "epoch": 4.3623024830699775, + "grad_norm": 2.0905115604400635, + "learning_rate": 4.576869353921302e-06, + "loss": 0.1325, + "mean_token_accuracy": 0.9586278200149536, + "num_tokens": 62752939.0, + "step": 7730 + }, + { + "entropy": 0.2576675295829773, + "epoch": 4.365124153498871, + "grad_norm": 2.0513839721679688, + "learning_rate": 4.5763433378390205e-06, + "loss": 0.1467, + "mean_token_accuracy": 0.9542353510856628, + "num_tokens": 62793276.0, + "step": 7735 + }, + { + "entropy": 0.2226796418428421, + "epoch": 4.367945823927765, + "grad_norm": 2.049790382385254, + "learning_rate": 4.575817036640153e-06, + "loss": 0.1265, + "mean_token_accuracy": 0.9590314865112305, + "num_tokens": 62833681.0, + "step": 7740 + }, + { + "entropy": 0.22169795334339143, + "epoch": 4.3707674943566595, + "grad_norm": 2.1573617458343506, + "learning_rate": 4.575290450428088e-06, + "loss": 0.1141, + "mean_token_accuracy": 0.9643651604652405, + "num_tokens": 62874247.0, + "step": 7745 + }, + { + "entropy": 0.21064240634441375, + "epoch": 4.373589164785553, + "grad_norm": 2.0029399394989014, + "learning_rate": 4.574763579306276e-06, + "loss": 0.114, + "mean_token_accuracy": 0.9654483675956727, + "num_tokens": 62914852.0, + "step": 7750 + }, + { + "entropy": 0.2374986946582794, + "epoch": 4.376410835214447, + "grad_norm": 2.325894355773926, + "learning_rate": 4.574236423378221e-06, + "loss": 0.1237, + "mean_token_accuracy": 0.9615754604339599, + "num_tokens": 62955498.0, + "step": 7755 + }, + { + "entropy": 0.2390218436717987, + "epoch": 4.3792325056433405, + "grad_norm": 2.234550952911377, + "learning_rate": 4.5737089827474826e-06, + "loss": 0.1278, + "mean_token_accuracy": 0.9586256504058838, + "num_tokens": 62995636.0, + "step": 7760 + }, + { + "entropy": 0.2208180695772171, + "epoch": 4.382054176072235, + "grad_norm": 2.097153902053833, + "learning_rate": 4.573181257517675e-06, + "loss": 0.1101, + "mean_token_accuracy": 0.9645330548286438, + "num_tokens": 63036376.0, + "step": 7765 + }, + { + "entropy": 0.25867641568183897, + "epoch": 4.384875846501129, + "grad_norm": 1.9841035604476929, + "learning_rate": 4.572653247792471e-06, + "loss": 0.136, + "mean_token_accuracy": 0.9573712944984436, + "num_tokens": 63076910.0, + "step": 7770 + }, + { + "entropy": 0.22610498666763307, + "epoch": 4.3876975169300225, + "grad_norm": 1.8051813840866089, + "learning_rate": 4.572124953675599e-06, + "loss": 0.1146, + "mean_token_accuracy": 0.963974404335022, + "num_tokens": 63117280.0, + "step": 7775 + }, + { + "entropy": 0.24536430835723877, + "epoch": 4.390519187358916, + "grad_norm": 2.267284870147705, + "learning_rate": 4.571596375270843e-06, + "loss": 0.1263, + "mean_token_accuracy": 0.9603652000427246, + "num_tokens": 63158036.0, + "step": 7780 + }, + { + "entropy": 0.2449027419090271, + "epoch": 4.393340857787811, + "grad_norm": 1.866507649421692, + "learning_rate": 4.5710675126820394e-06, + "loss": 0.1414, + "mean_token_accuracy": 0.9538567900657654, + "num_tokens": 63198608.0, + "step": 7785 + }, + { + "entropy": 0.2529394537210464, + "epoch": 4.3961625282167045, + "grad_norm": 2.741823673248291, + "learning_rate": 4.570538366013085e-06, + "loss": 0.136, + "mean_token_accuracy": 0.957249391078949, + "num_tokens": 63239140.0, + "step": 7790 + }, + { + "entropy": 0.2259007692337036, + "epoch": 4.398984198645598, + "grad_norm": 1.8987911939620972, + "learning_rate": 4.570008935367931e-06, + "loss": 0.1297, + "mean_token_accuracy": 0.9585871219635009, + "num_tokens": 63279866.0, + "step": 7795 + }, + { + "entropy": 0.22481289207935334, + "epoch": 4.401805869074492, + "grad_norm": 2.1305699348449707, + "learning_rate": 4.569479220850583e-06, + "loss": 0.125, + "mean_token_accuracy": 0.9605366587638855, + "num_tokens": 63320412.0, + "step": 7800 + }, + { + "entropy": 0.23074941635131835, + "epoch": 4.404627539503386, + "grad_norm": 1.930199384689331, + "learning_rate": 4.568949222565105e-06, + "loss": 0.1218, + "mean_token_accuracy": 0.9614872097969055, + "num_tokens": 63361131.0, + "step": 7805 + }, + { + "entropy": 0.22710799276828766, + "epoch": 4.40744920993228, + "grad_norm": 1.8718585968017578, + "learning_rate": 4.568418940615616e-06, + "loss": 0.119, + "mean_token_accuracy": 0.9626579761505127, + "num_tokens": 63401708.0, + "step": 7810 + }, + { + "entropy": 0.24245786666870117, + "epoch": 4.410270880361174, + "grad_norm": 2.3666772842407227, + "learning_rate": 4.567888375106286e-06, + "loss": 0.1383, + "mean_token_accuracy": 0.956715726852417, + "num_tokens": 63442479.0, + "step": 7815 + }, + { + "entropy": 0.2398514539003372, + "epoch": 4.413092550790068, + "grad_norm": 2.2359414100646973, + "learning_rate": 4.567357526141349e-06, + "loss": 0.133, + "mean_token_accuracy": 0.9572691440582275, + "num_tokens": 63482841.0, + "step": 7820 + }, + { + "entropy": 0.23423077166080475, + "epoch": 4.415914221218961, + "grad_norm": 2.1519811153411865, + "learning_rate": 4.5668263938250876e-06, + "loss": 0.1264, + "mean_token_accuracy": 0.9597557783126831, + "num_tokens": 63523538.0, + "step": 7825 + }, + { + "entropy": 0.2178510457277298, + "epoch": 4.418735891647856, + "grad_norm": 1.9678175449371338, + "learning_rate": 4.566294978261844e-06, + "loss": 0.1131, + "mean_token_accuracy": 0.9639857292175293, + "num_tokens": 63564255.0, + "step": 7830 + }, + { + "entropy": 0.23961224555969238, + "epoch": 4.4215575620767495, + "grad_norm": 2.6398816108703613, + "learning_rate": 4.565763279556014e-06, + "loss": 0.137, + "mean_token_accuracy": 0.9575521588325501, + "num_tokens": 63604882.0, + "step": 7835 + }, + { + "entropy": 0.24535090029239653, + "epoch": 4.424379232505643, + "grad_norm": 2.135787010192871, + "learning_rate": 4.565231297812051e-06, + "loss": 0.127, + "mean_token_accuracy": 0.9597046256065369, + "num_tokens": 63645157.0, + "step": 7840 + }, + { + "entropy": 0.2281971275806427, + "epoch": 4.427200902934537, + "grad_norm": 2.0932908058166504, + "learning_rate": 4.564699033134462e-06, + "loss": 0.1229, + "mean_token_accuracy": 0.9616926431655883, + "num_tokens": 63685797.0, + "step": 7845 + }, + { + "entropy": 0.24232443869113923, + "epoch": 4.4300225733634315, + "grad_norm": 1.8690487146377563, + "learning_rate": 4.564166485627811e-06, + "loss": 0.1439, + "mean_token_accuracy": 0.9539458394050598, + "num_tokens": 63726385.0, + "step": 7850 + }, + { + "entropy": 0.2243878573179245, + "epoch": 4.432844243792325, + "grad_norm": 2.3627984523773193, + "learning_rate": 4.563633655396717e-06, + "loss": 0.1282, + "mean_token_accuracy": 0.9588661789894104, + "num_tokens": 63767053.0, + "step": 7855 + }, + { + "entropy": 0.25156151950359346, + "epoch": 4.435665914221219, + "grad_norm": 2.046766996383667, + "learning_rate": 4.563100542545854e-06, + "loss": 0.1355, + "mean_token_accuracy": 0.9556037425994873, + "num_tokens": 63807612.0, + "step": 7860 + }, + { + "entropy": 0.23665276169776917, + "epoch": 4.438487584650113, + "grad_norm": 2.13456130027771, + "learning_rate": 4.5625671471799535e-06, + "loss": 0.1412, + "mean_token_accuracy": 0.955809724330902, + "num_tokens": 63848217.0, + "step": 7865 + }, + { + "entropy": 0.22619423866271973, + "epoch": 4.441309255079007, + "grad_norm": 2.092322587966919, + "learning_rate": 4.562033469403799e-06, + "loss": 0.1183, + "mean_token_accuracy": 0.9627872586250306, + "num_tokens": 63888701.0, + "step": 7870 + }, + { + "entropy": 0.22337720394134522, + "epoch": 4.444130925507901, + "grad_norm": 1.8521798849105835, + "learning_rate": 4.561499509322233e-06, + "loss": 0.1193, + "mean_token_accuracy": 0.9614850997924804, + "num_tokens": 63929411.0, + "step": 7875 + }, + { + "entropy": 0.233727565407753, + "epoch": 4.446952595936795, + "grad_norm": 1.6389681100845337, + "learning_rate": 4.560965267040151e-06, + "loss": 0.1149, + "mean_token_accuracy": 0.9633271217346191, + "num_tokens": 63970272.0, + "step": 7880 + }, + { + "entropy": 0.21891143321990966, + "epoch": 4.449774266365688, + "grad_norm": 2.2698888778686523, + "learning_rate": 4.560430742662506e-06, + "loss": 0.1212, + "mean_token_accuracy": 0.9615803718566894, + "num_tokens": 64010993.0, + "step": 7885 + }, + { + "entropy": 0.257635697722435, + "epoch": 4.452595936794582, + "grad_norm": 2.019932270050049, + "learning_rate": 4.559895936294305e-06, + "loss": 0.1328, + "mean_token_accuracy": 0.9580089688301087, + "num_tokens": 64051747.0, + "step": 7890 + }, + { + "entropy": 0.2612759441137314, + "epoch": 4.455417607223477, + "grad_norm": 2.1498031616210938, + "learning_rate": 4.559360848040611e-06, + "loss": 0.1354, + "mean_token_accuracy": 0.9571520805358886, + "num_tokens": 64092298.0, + "step": 7895 + }, + { + "entropy": 0.24472321271896363, + "epoch": 4.45823927765237, + "grad_norm": 2.2443246841430664, + "learning_rate": 4.558825478006543e-06, + "loss": 0.1298, + "mean_token_accuracy": 0.9582807421684265, + "num_tokens": 64132966.0, + "step": 7900 + }, + { + "entropy": 0.24086927771568298, + "epoch": 4.461060948081264, + "grad_norm": 2.0586602687835693, + "learning_rate": 4.5582898262972715e-06, + "loss": 0.1147, + "mean_token_accuracy": 0.9626161217689514, + "num_tokens": 64173716.0, + "step": 7905 + }, + { + "entropy": 0.22729225158691407, + "epoch": 4.463882618510158, + "grad_norm": 2.192944049835205, + "learning_rate": 4.557753893018028e-06, + "loss": 0.1231, + "mean_token_accuracy": 0.962181007862091, + "num_tokens": 64214398.0, + "step": 7910 + }, + { + "entropy": 0.24374393820762635, + "epoch": 4.466704288939052, + "grad_norm": 2.042626142501831, + "learning_rate": 4.557217678274097e-06, + "loss": 0.1412, + "mean_token_accuracy": 0.9556024193763732, + "num_tokens": 64254854.0, + "step": 7915 + }, + { + "entropy": 0.23933674693107604, + "epoch": 4.469525959367946, + "grad_norm": 1.807453989982605, + "learning_rate": 4.556681182170816e-06, + "loss": 0.1334, + "mean_token_accuracy": 0.9586431503295898, + "num_tokens": 64295435.0, + "step": 7920 + }, + { + "entropy": 0.2443247377872467, + "epoch": 4.47234762979684, + "grad_norm": 2.1040701866149902, + "learning_rate": 4.55614440481358e-06, + "loss": 0.1288, + "mean_token_accuracy": 0.9598910212516785, + "num_tokens": 64336069.0, + "step": 7925 + }, + { + "entropy": 0.24531608521938325, + "epoch": 4.475169300225733, + "grad_norm": 2.0442538261413574, + "learning_rate": 4.555607346307841e-06, + "loss": 0.1359, + "mean_token_accuracy": 0.9561710357666016, + "num_tokens": 64376772.0, + "step": 7930 + }, + { + "entropy": 0.2342956393957138, + "epoch": 4.477990970654628, + "grad_norm": 1.9000775814056396, + "learning_rate": 4.555070006759102e-06, + "loss": 0.1351, + "mean_token_accuracy": 0.9563033699989318, + "num_tokens": 64417469.0, + "step": 7935 + }, + { + "entropy": 0.22305310368537903, + "epoch": 4.480812641083522, + "grad_norm": 1.8340357542037964, + "learning_rate": 4.554532386272925e-06, + "loss": 0.1233, + "mean_token_accuracy": 0.9609080910682678, + "num_tokens": 64458183.0, + "step": 7940 + }, + { + "entropy": 0.24366792142391205, + "epoch": 4.483634311512415, + "grad_norm": 2.0451269149780273, + "learning_rate": 4.5539944849549244e-06, + "loss": 0.1332, + "mean_token_accuracy": 0.9595473170280456, + "num_tokens": 64499084.0, + "step": 7945 + }, + { + "entropy": 0.2458895593881607, + "epoch": 4.486455981941309, + "grad_norm": 2.0097408294677734, + "learning_rate": 4.553456302910771e-06, + "loss": 0.1273, + "mean_token_accuracy": 0.9585718154907227, + "num_tokens": 64539911.0, + "step": 7950 + }, + { + "entropy": 0.26139111518859864, + "epoch": 4.489277652370204, + "grad_norm": 2.031979560852051, + "learning_rate": 4.552917840246191e-06, + "loss": 0.1372, + "mean_token_accuracy": 0.9563012480735779, + "num_tokens": 64580156.0, + "step": 7955 + }, + { + "entropy": 0.2303838014602661, + "epoch": 4.492099322799097, + "grad_norm": 1.742417573928833, + "learning_rate": 4.552379097066967e-06, + "loss": 0.128, + "mean_token_accuracy": 0.959154736995697, + "num_tokens": 64620865.0, + "step": 7960 + }, + { + "entropy": 0.24330146610736847, + "epoch": 4.494920993227991, + "grad_norm": 2.139296770095825, + "learning_rate": 4.551840073478934e-06, + "loss": 0.134, + "mean_token_accuracy": 0.9584646821022034, + "num_tokens": 64661404.0, + "step": 7965 + }, + { + "entropy": 0.24887127578258514, + "epoch": 4.497742663656885, + "grad_norm": 2.17922306060791, + "learning_rate": 4.551300769587982e-06, + "loss": 0.133, + "mean_token_accuracy": 0.9567908644676208, + "num_tokens": 64701858.0, + "step": 7970 + }, + { + "entropy": 0.2185787171125412, + "epoch": 4.500564334085778, + "grad_norm": 2.0657732486724854, + "learning_rate": 4.550761185500059e-06, + "loss": 0.1253, + "mean_token_accuracy": 0.9603734970092773, + "num_tokens": 64742554.0, + "step": 7975 + }, + { + "entropy": 0.2412070006132126, + "epoch": 4.503386004514673, + "grad_norm": 1.9913356304168701, + "learning_rate": 4.550221321321165e-06, + "loss": 0.1238, + "mean_token_accuracy": 0.959455955028534, + "num_tokens": 64783017.0, + "step": 7980 + }, + { + "entropy": 0.24967373609542848, + "epoch": 4.506207674943567, + "grad_norm": 2.3349766731262207, + "learning_rate": 4.549681177157358e-06, + "loss": 0.1318, + "mean_token_accuracy": 0.9588298797607422, + "num_tokens": 64823373.0, + "step": 7985 + }, + { + "entropy": 0.22077546417713165, + "epoch": 4.50902934537246, + "grad_norm": 1.7486339807510376, + "learning_rate": 4.549140753114748e-06, + "loss": 0.1058, + "mean_token_accuracy": 0.9653358578681945, + "num_tokens": 64864026.0, + "step": 7990 + }, + { + "entropy": 0.22933962047100068, + "epoch": 4.511851015801354, + "grad_norm": 1.8534975051879883, + "learning_rate": 4.548600049299502e-06, + "loss": 0.1226, + "mean_token_accuracy": 0.9605401158332825, + "num_tokens": 64904483.0, + "step": 7995 + }, + { + "entropy": 0.22776411473751068, + "epoch": 4.514672686230249, + "grad_norm": 1.8297226428985596, + "learning_rate": 4.548059065817841e-06, + "loss": 0.1186, + "mean_token_accuracy": 0.96176518201828, + "num_tokens": 64945138.0, + "step": 8000 + }, + { + "epoch": 4.514672686230249, + "eval_entropy": 0.2636869251728058, + "eval_loss": 0.0795026570558548, + "eval_mean_token_accuracy": 0.9754695296287537, + "eval_num_tokens": 64945138.0, + "eval_runtime": 0.1637, + "eval_samples_per_second": 24.429, + "eval_steps_per_second": 6.107, + "step": 8000 + }, + { + "entropy": 0.21750448644161224, + "epoch": 4.517494356659142, + "grad_norm": 2.122471570968628, + "learning_rate": 4.547517802776042e-06, + "loss": 0.1224, + "mean_token_accuracy": 0.9609117388725281, + "num_tokens": 64985873.0, + "step": 8005 + }, + { + "entropy": 0.22921790778636933, + "epoch": 4.520316027088036, + "grad_norm": 2.1541335582733154, + "learning_rate": 4.546976260280435e-06, + "loss": 0.1131, + "mean_token_accuracy": 0.9636703491210937, + "num_tokens": 65026415.0, + "step": 8010 + }, + { + "entropy": 0.23894762694835664, + "epoch": 4.52313769751693, + "grad_norm": 2.3302664756774902, + "learning_rate": 4.546434438437408e-06, + "loss": 0.1196, + "mean_token_accuracy": 0.9631991147994995, + "num_tokens": 65067004.0, + "step": 8015 + }, + { + "entropy": 0.26045531034469604, + "epoch": 4.525959367945823, + "grad_norm": 2.2902634143829346, + "learning_rate": 4.5458923373534e-06, + "loss": 0.1442, + "mean_token_accuracy": 0.9546260356903076, + "num_tokens": 65107609.0, + "step": 8020 + }, + { + "entropy": 0.23849842548370362, + "epoch": 4.528781038374718, + "grad_norm": 2.1312549114227295, + "learning_rate": 4.545349957134908e-06, + "loss": 0.1253, + "mean_token_accuracy": 0.9602569699287414, + "num_tokens": 65148321.0, + "step": 8025 + }, + { + "entropy": 0.242272087931633, + "epoch": 4.531602708803612, + "grad_norm": 2.1353938579559326, + "learning_rate": 4.544807297888482e-06, + "loss": 0.1329, + "mean_token_accuracy": 0.9582337856292724, + "num_tokens": 65188859.0, + "step": 8030 + }, + { + "entropy": 0.21073226928710936, + "epoch": 4.534424379232505, + "grad_norm": 1.7557628154754639, + "learning_rate": 4.544264359720728e-06, + "loss": 0.1058, + "mean_token_accuracy": 0.9665800213813782, + "num_tokens": 65229611.0, + "step": 8035 + }, + { + "entropy": 0.2263183683156967, + "epoch": 4.5372460496614, + "grad_norm": 1.9420866966247559, + "learning_rate": 4.543721142738306e-06, + "loss": 0.1327, + "mean_token_accuracy": 0.9569009423255921, + "num_tokens": 65270007.0, + "step": 8040 + }, + { + "entropy": 0.24126074612140655, + "epoch": 4.540067720090294, + "grad_norm": 1.8423664569854736, + "learning_rate": 4.543177647047931e-06, + "loss": 0.1261, + "mean_token_accuracy": 0.9594087600708008, + "num_tokens": 65310681.0, + "step": 8045 + }, + { + "entropy": 0.2388361632823944, + "epoch": 4.542889390519187, + "grad_norm": 1.9569209814071655, + "learning_rate": 4.542633872756374e-06, + "loss": 0.1303, + "mean_token_accuracy": 0.9593725204467773, + "num_tokens": 65351285.0, + "step": 8050 + }, + { + "entropy": 0.23481654226779938, + "epoch": 4.545711060948081, + "grad_norm": 1.949885606765747, + "learning_rate": 4.542089819970456e-06, + "loss": 0.1192, + "mean_token_accuracy": 0.9611479878425598, + "num_tokens": 65391577.0, + "step": 8055 + }, + { + "entropy": 0.25715220272541045, + "epoch": 4.548532731376975, + "grad_norm": 2.203080892562866, + "learning_rate": 4.541545488797061e-06, + "loss": 0.1468, + "mean_token_accuracy": 0.9533620238304138, + "num_tokens": 65432197.0, + "step": 8060 + }, + { + "entropy": 0.23479729294776916, + "epoch": 4.551354401805869, + "grad_norm": 1.9314894676208496, + "learning_rate": 4.541000879343119e-06, + "loss": 0.1233, + "mean_token_accuracy": 0.9603962182998658, + "num_tokens": 65473100.0, + "step": 8065 + }, + { + "entropy": 0.23400575220584868, + "epoch": 4.554176072234763, + "grad_norm": 1.9281415939331055, + "learning_rate": 4.540455991715621e-06, + "loss": 0.139, + "mean_token_accuracy": 0.9566599488258362, + "num_tokens": 65513690.0, + "step": 8070 + }, + { + "entropy": 0.22714907824993133, + "epoch": 4.556997742663657, + "grad_norm": 1.8072905540466309, + "learning_rate": 4.539910826021609e-06, + "loss": 0.1232, + "mean_token_accuracy": 0.9606075048446655, + "num_tokens": 65554560.0, + "step": 8075 + }, + { + "entropy": 0.23093242347240447, + "epoch": 4.5598194130925505, + "grad_norm": 1.9328153133392334, + "learning_rate": 4.539365382368182e-06, + "loss": 0.1313, + "mean_token_accuracy": 0.9587170481681824, + "num_tokens": 65595278.0, + "step": 8080 + }, + { + "entropy": 0.2399729460477829, + "epoch": 4.562641083521445, + "grad_norm": 2.074904203414917, + "learning_rate": 4.5388196608624915e-06, + "loss": 0.126, + "mean_token_accuracy": 0.9598425269126892, + "num_tokens": 65635798.0, + "step": 8085 + }, + { + "entropy": 0.22223535776138306, + "epoch": 4.565462753950339, + "grad_norm": 1.8036304712295532, + "learning_rate": 4.538273661611744e-06, + "loss": 0.128, + "mean_token_accuracy": 0.960320234298706, + "num_tokens": 65676485.0, + "step": 8090 + }, + { + "entropy": 0.2333895742893219, + "epoch": 4.568284424379232, + "grad_norm": 2.1625099182128906, + "learning_rate": 4.537727384723203e-06, + "loss": 0.1396, + "mean_token_accuracy": 0.9549428701400757, + "num_tokens": 65717380.0, + "step": 8095 + }, + { + "entropy": 0.2488487184047699, + "epoch": 4.571106094808126, + "grad_norm": 2.21354079246521, + "learning_rate": 4.537180830304183e-06, + "loss": 0.1413, + "mean_token_accuracy": 0.9569279074668884, + "num_tokens": 65757893.0, + "step": 8100 + }, + { + "entropy": 0.22758035659790038, + "epoch": 4.57392776523702, + "grad_norm": 2.3330676555633545, + "learning_rate": 4.536633998462055e-06, + "loss": 0.1163, + "mean_token_accuracy": 0.9631718873977662, + "num_tokens": 65798444.0, + "step": 8105 + }, + { + "entropy": 0.235689178109169, + "epoch": 4.576749435665914, + "grad_norm": 2.2506041526794434, + "learning_rate": 4.536086889304246e-06, + "loss": 0.134, + "mean_token_accuracy": 0.9576963305473327, + "num_tokens": 65839127.0, + "step": 8110 + }, + { + "entropy": 0.2473171055316925, + "epoch": 4.579571106094808, + "grad_norm": 2.3321518898010254, + "learning_rate": 4.535539502938233e-06, + "loss": 0.128, + "mean_token_accuracy": 0.9609445095062256, + "num_tokens": 65879975.0, + "step": 8115 + }, + { + "entropy": 0.25173512697219846, + "epoch": 4.582392776523702, + "grad_norm": 2.5491793155670166, + "learning_rate": 4.534991839471551e-06, + "loss": 0.1372, + "mean_token_accuracy": 0.9561426758766174, + "num_tokens": 65920672.0, + "step": 8120 + }, + { + "entropy": 0.23555795848369598, + "epoch": 4.585214446952596, + "grad_norm": 2.0215837955474854, + "learning_rate": 4.534443899011789e-06, + "loss": 0.1274, + "mean_token_accuracy": 0.9595631241798401, + "num_tokens": 65960881.0, + "step": 8125 + }, + { + "entropy": 0.2525820404291153, + "epoch": 4.58803611738149, + "grad_norm": 2.073427438735962, + "learning_rate": 4.533895681666591e-06, + "loss": 0.1387, + "mean_token_accuracy": 0.9559617161750793, + "num_tokens": 66001503.0, + "step": 8130 + }, + { + "entropy": 0.23859679996967315, + "epoch": 4.590857787810384, + "grad_norm": 1.9522126913070679, + "learning_rate": 4.533347187543652e-06, + "loss": 0.125, + "mean_token_accuracy": 0.9605787754058838, + "num_tokens": 66042200.0, + "step": 8135 + }, + { + "entropy": 0.23188299536705018, + "epoch": 4.5936794582392775, + "grad_norm": 1.8732774257659912, + "learning_rate": 4.5327984167507255e-06, + "loss": 0.1232, + "mean_token_accuracy": 0.9610357999801635, + "num_tokens": 66082649.0, + "step": 8140 + }, + { + "entropy": 0.2427914947271347, + "epoch": 4.596501128668171, + "grad_norm": 2.2590503692626953, + "learning_rate": 4.532249369395616e-06, + "loss": 0.1242, + "mean_token_accuracy": 0.9602537393569947, + "num_tokens": 66123110.0, + "step": 8145 + }, + { + "entropy": 0.24748845994472504, + "epoch": 4.599322799097066, + "grad_norm": 2.055881977081299, + "learning_rate": 4.531700045586187e-06, + "loss": 0.1205, + "mean_token_accuracy": 0.9622966647148132, + "num_tokens": 66163660.0, + "step": 8150 + }, + { + "entropy": 0.23012515604496003, + "epoch": 4.6021444695259595, + "grad_norm": 1.8205794095993042, + "learning_rate": 4.53115044543035e-06, + "loss": 0.1247, + "mean_token_accuracy": 0.9605541706085206, + "num_tokens": 66204458.0, + "step": 8155 + }, + { + "entropy": 0.2356725037097931, + "epoch": 4.604966139954853, + "grad_norm": 2.3245763778686523, + "learning_rate": 4.530600569036075e-06, + "loss": 0.1337, + "mean_token_accuracy": 0.9578644394874573, + "num_tokens": 66245231.0, + "step": 8160 + }, + { + "entropy": 0.2583352416753769, + "epoch": 4.607787810383747, + "grad_norm": 2.2044198513031006, + "learning_rate": 4.530050416511386e-06, + "loss": 0.1491, + "mean_token_accuracy": 0.9534392952919006, + "num_tokens": 66285856.0, + "step": 8165 + }, + { + "entropy": 0.2439739376306534, + "epoch": 4.610609480812641, + "grad_norm": 2.181988477706909, + "learning_rate": 4.529499987964359e-06, + "loss": 0.1372, + "mean_token_accuracy": 0.9561636447906494, + "num_tokens": 66326451.0, + "step": 8170 + }, + { + "entropy": 0.24100883007049562, + "epoch": 4.613431151241535, + "grad_norm": 2.0313637256622314, + "learning_rate": 4.5289492835031275e-06, + "loss": 0.1421, + "mean_token_accuracy": 0.9549835324287415, + "num_tokens": 66367157.0, + "step": 8175 + }, + { + "entropy": 0.22571427822113038, + "epoch": 4.616252821670429, + "grad_norm": 1.996917963027954, + "learning_rate": 4.528398303235877e-06, + "loss": 0.1191, + "mean_token_accuracy": 0.9621818661689758, + "num_tokens": 66407802.0, + "step": 8180 + }, + { + "entropy": 0.23621368408203125, + "epoch": 4.6190744920993225, + "grad_norm": 1.7542959451675415, + "learning_rate": 4.527847047270847e-06, + "loss": 0.1379, + "mean_token_accuracy": 0.9554514408111572, + "num_tokens": 66448374.0, + "step": 8185 + }, + { + "entropy": 0.2359769821166992, + "epoch": 4.621896162528216, + "grad_norm": 2.465397834777832, + "learning_rate": 4.527295515716332e-06, + "loss": 0.128, + "mean_token_accuracy": 0.9583778262138367, + "num_tokens": 66488251.0, + "step": 8190 + }, + { + "entropy": 0.22702213227748871, + "epoch": 4.624717832957111, + "grad_norm": 2.1334855556488037, + "learning_rate": 4.526743708680681e-06, + "loss": 0.1218, + "mean_token_accuracy": 0.9614168047904968, + "num_tokens": 66528989.0, + "step": 8195 + }, + { + "entropy": 0.2207847625017166, + "epoch": 4.6275395033860045, + "grad_norm": 2.2564754486083984, + "learning_rate": 4.526191626272297e-06, + "loss": 0.1245, + "mean_token_accuracy": 0.9604014396667481, + "num_tokens": 66569555.0, + "step": 8200 + }, + { + "entropy": 0.24324011504650117, + "epoch": 4.630361173814898, + "grad_norm": 1.8627599477767944, + "learning_rate": 4.525639268599635e-06, + "loss": 0.1284, + "mean_token_accuracy": 0.9589541673660278, + "num_tokens": 66610160.0, + "step": 8205 + }, + { + "entropy": 0.2532497227191925, + "epoch": 4.633182844243793, + "grad_norm": 2.4542059898376465, + "learning_rate": 4.5250866357712066e-06, + "loss": 0.1418, + "mean_token_accuracy": 0.9555447816848754, + "num_tokens": 66650856.0, + "step": 8210 + }, + { + "entropy": 0.23340786695480348, + "epoch": 4.6360045146726865, + "grad_norm": 2.2496676445007324, + "learning_rate": 4.524533727895577e-06, + "loss": 0.118, + "mean_token_accuracy": 0.9636736989021302, + "num_tokens": 66691192.0, + "step": 8215 + }, + { + "entropy": 0.24236354231834412, + "epoch": 4.63882618510158, + "grad_norm": 1.9971568584442139, + "learning_rate": 4.5239805450813646e-06, + "loss": 0.134, + "mean_token_accuracy": 0.9569501161575318, + "num_tokens": 66731579.0, + "step": 8220 + }, + { + "entropy": 0.23493029475212096, + "epoch": 4.641647855530474, + "grad_norm": 2.242898941040039, + "learning_rate": 4.523427087437241e-06, + "loss": 0.1212, + "mean_token_accuracy": 0.962970244884491, + "num_tokens": 66771885.0, + "step": 8225 + }, + { + "entropy": 0.22397561073303224, + "epoch": 4.644469525959368, + "grad_norm": 1.9795989990234375, + "learning_rate": 4.522873355071936e-06, + "loss": 0.1089, + "mean_token_accuracy": 0.9651905417442321, + "num_tokens": 66812487.0, + "step": 8230 + }, + { + "entropy": 0.21759993433952332, + "epoch": 4.647291196388262, + "grad_norm": 5.298044204711914, + "learning_rate": 4.5223193480942275e-06, + "loss": 0.1171, + "mean_token_accuracy": 0.9617223024368287, + "num_tokens": 66853201.0, + "step": 8235 + }, + { + "entropy": 0.24825019538402557, + "epoch": 4.650112866817156, + "grad_norm": 2.4175469875335693, + "learning_rate": 4.521765066612952e-06, + "loss": 0.1532, + "mean_token_accuracy": 0.9531420230865478, + "num_tokens": 66893997.0, + "step": 8240 + }, + { + "entropy": 0.23018406629562377, + "epoch": 4.6529345372460496, + "grad_norm": 1.9281044006347656, + "learning_rate": 4.521210510736998e-06, + "loss": 0.1235, + "mean_token_accuracy": 0.9599860429763794, + "num_tokens": 66934876.0, + "step": 8245 + }, + { + "entropy": 0.23870559334754943, + "epoch": 4.655756207674943, + "grad_norm": 1.8871232271194458, + "learning_rate": 4.520655680575306e-06, + "loss": 0.1159, + "mean_token_accuracy": 0.9634807467460632, + "num_tokens": 66975475.0, + "step": 8250 + }, + { + "entropy": 0.244593945145607, + "epoch": 4.658577878103838, + "grad_norm": 2.5723750591278076, + "learning_rate": 4.520100576236877e-06, + "loss": 0.1325, + "mean_token_accuracy": 0.9575012445449829, + "num_tokens": 67016014.0, + "step": 8255 + }, + { + "entropy": 0.23622966110706328, + "epoch": 4.6613995485327315, + "grad_norm": 2.5368049144744873, + "learning_rate": 4.5195451978307556e-06, + "loss": 0.131, + "mean_token_accuracy": 0.9596938133239746, + "num_tokens": 67056774.0, + "step": 8260 + }, + { + "entropy": 0.22385245859622954, + "epoch": 4.664221218961625, + "grad_norm": 2.3085649013519287, + "learning_rate": 4.51898954546605e-06, + "loss": 0.1206, + "mean_token_accuracy": 0.9616567969322205, + "num_tokens": 67096578.0, + "step": 8265 + }, + { + "entropy": 0.24655192196369172, + "epoch": 4.667042889390519, + "grad_norm": 2.2656362056732178, + "learning_rate": 4.518433619251918e-06, + "loss": 0.1407, + "mean_token_accuracy": 0.9564512014389038, + "num_tokens": 67137138.0, + "step": 8270 + }, + { + "entropy": 0.24504003524780274, + "epoch": 4.669864559819413, + "grad_norm": 1.9192149639129639, + "learning_rate": 4.5178774192975685e-06, + "loss": 0.1408, + "mean_token_accuracy": 0.9554814100265503, + "num_tokens": 67176712.0, + "step": 8275 + }, + { + "entropy": 0.22429890930652618, + "epoch": 4.672686230248307, + "grad_norm": 2.07617449760437, + "learning_rate": 4.51732094571227e-06, + "loss": 0.1242, + "mean_token_accuracy": 0.9604406833648682, + "num_tokens": 67217311.0, + "step": 8280 + }, + { + "entropy": 0.250271201133728, + "epoch": 4.675507900677201, + "grad_norm": 1.9916068315505981, + "learning_rate": 4.51676419860534e-06, + "loss": 0.1517, + "mean_token_accuracy": 0.9521410703659058, + "num_tokens": 67257957.0, + "step": 8285 + }, + { + "entropy": 0.243993404507637, + "epoch": 4.678329571106095, + "grad_norm": 2.026092290878296, + "learning_rate": 4.516207178086153e-06, + "loss": 0.121, + "mean_token_accuracy": 0.962063193321228, + "num_tokens": 67298719.0, + "step": 8290 + }, + { + "entropy": 0.23810403943061828, + "epoch": 4.681151241534989, + "grad_norm": 1.7401542663574219, + "learning_rate": 4.515649884264135e-06, + "loss": 0.1302, + "mean_token_accuracy": 0.9585550904273987, + "num_tokens": 67339410.0, + "step": 8295 + }, + { + "entropy": 0.24030295610427857, + "epoch": 4.683972911963883, + "grad_norm": 1.8887089490890503, + "learning_rate": 4.515092317248766e-06, + "loss": 0.1216, + "mean_token_accuracy": 0.9610766172409058, + "num_tokens": 67379808.0, + "step": 8300 + }, + { + "entropy": 0.2178757071495056, + "epoch": 4.686794582392777, + "grad_norm": 2.0333640575408936, + "learning_rate": 4.514534477149581e-06, + "loss": 0.1048, + "mean_token_accuracy": 0.9664766073226929, + "num_tokens": 67420340.0, + "step": 8305 + }, + { + "entropy": 0.23881784081459045, + "epoch": 4.68961625282167, + "grad_norm": 2.146456480026245, + "learning_rate": 4.513976364076167e-06, + "loss": 0.1213, + "mean_token_accuracy": 0.9607988595962524, + "num_tokens": 67461118.0, + "step": 8310 + }, + { + "entropy": 0.22699067294597625, + "epoch": 4.692437923250564, + "grad_norm": 2.0983293056488037, + "learning_rate": 4.513417978138166e-06, + "loss": 0.1382, + "mean_token_accuracy": 0.9564812779426575, + "num_tokens": 67501770.0, + "step": 8315 + }, + { + "entropy": 0.21569800674915313, + "epoch": 4.6952595936794586, + "grad_norm": 1.7150790691375732, + "learning_rate": 4.5128593194452725e-06, + "loss": 0.124, + "mean_token_accuracy": 0.9598898530006409, + "num_tokens": 67542371.0, + "step": 8320 + }, + { + "entropy": 0.2471790909767151, + "epoch": 4.698081264108352, + "grad_norm": 2.265134811401367, + "learning_rate": 4.5123003881072345e-06, + "loss": 0.1441, + "mean_token_accuracy": 0.9535491704940796, + "num_tokens": 67582960.0, + "step": 8325 + }, + { + "entropy": 0.22231938540935517, + "epoch": 4.700902934537246, + "grad_norm": 1.7570245265960693, + "learning_rate": 4.511741184233856e-06, + "loss": 0.1158, + "mean_token_accuracy": 0.9630581736564636, + "num_tokens": 67623636.0, + "step": 8330 + }, + { + "entropy": 0.24155616462230683, + "epoch": 4.70372460496614, + "grad_norm": 2.0228629112243652, + "learning_rate": 4.511181707934992e-06, + "loss": 0.1354, + "mean_token_accuracy": 0.9561177611351013, + "num_tokens": 67664179.0, + "step": 8335 + }, + { + "entropy": 0.23686327040195465, + "epoch": 4.706546275395034, + "grad_norm": 2.0804860591888428, + "learning_rate": 4.5106219593205505e-06, + "loss": 0.1402, + "mean_token_accuracy": 0.9536548256874084, + "num_tokens": 67704834.0, + "step": 8340 + }, + { + "entropy": 0.22247822284698487, + "epoch": 4.709367945823928, + "grad_norm": 2.316359519958496, + "learning_rate": 4.510061938500495e-06, + "loss": 0.1317, + "mean_token_accuracy": 0.9575009226799012, + "num_tokens": 67745324.0, + "step": 8345 + }, + { + "entropy": 0.25336918532848357, + "epoch": 4.712189616252822, + "grad_norm": 1.8472719192504883, + "learning_rate": 4.509501645584842e-06, + "loss": 0.1437, + "mean_token_accuracy": 0.9530521392822265, + "num_tokens": 67786074.0, + "step": 8350 + }, + { + "entropy": 0.23464283645153045, + "epoch": 4.715011286681715, + "grad_norm": 2.232201337814331, + "learning_rate": 4.508941080683661e-06, + "loss": 0.1304, + "mean_token_accuracy": 0.9586957097053528, + "num_tokens": 67826602.0, + "step": 8355 + }, + { + "entropy": 0.2284877747297287, + "epoch": 4.717832957110609, + "grad_norm": 2.0694456100463867, + "learning_rate": 4.508380243907074e-06, + "loss": 0.1335, + "mean_token_accuracy": 0.9572539687156677, + "num_tokens": 67867168.0, + "step": 8360 + }, + { + "entropy": 0.23733165860176086, + "epoch": 4.720654627539504, + "grad_norm": 1.836682915687561, + "learning_rate": 4.5078191353652575e-06, + "loss": 0.1427, + "mean_token_accuracy": 0.9549849033355713, + "num_tokens": 67907809.0, + "step": 8365 + }, + { + "entropy": 0.23421334028244017, + "epoch": 4.723476297968397, + "grad_norm": 2.212164878845215, + "learning_rate": 4.507257755168444e-06, + "loss": 0.1401, + "mean_token_accuracy": 0.9548245668411255, + "num_tokens": 67948509.0, + "step": 8370 + }, + { + "entropy": 0.2503542214632034, + "epoch": 4.726297968397291, + "grad_norm": 2.0967366695404053, + "learning_rate": 4.506696103426914e-06, + "loss": 0.1403, + "mean_token_accuracy": 0.9554270505905151, + "num_tokens": 67989217.0, + "step": 8375 + }, + { + "entropy": 0.22551278471946717, + "epoch": 4.729119638826186, + "grad_norm": 2.0582823753356934, + "learning_rate": 4.506134180251005e-06, + "loss": 0.1292, + "mean_token_accuracy": 0.9604162216186524, + "num_tokens": 68029546.0, + "step": 8380 + }, + { + "entropy": 0.23066943287849426, + "epoch": 4.731941309255079, + "grad_norm": 1.9239495992660522, + "learning_rate": 4.5055719857511065e-06, + "loss": 0.1443, + "mean_token_accuracy": 0.9540068626403808, + "num_tokens": 68070249.0, + "step": 8385 + }, + { + "entropy": 0.24158512353897094, + "epoch": 4.734762979683973, + "grad_norm": 2.0336618423461914, + "learning_rate": 4.505009520037662e-06, + "loss": 0.1339, + "mean_token_accuracy": 0.9556556820869446, + "num_tokens": 68110991.0, + "step": 8390 + }, + { + "entropy": 0.24518156945705413, + "epoch": 4.737584650112867, + "grad_norm": 2.0292651653289795, + "learning_rate": 4.504446783221168e-06, + "loss": 0.1303, + "mean_token_accuracy": 0.9584499716758728, + "num_tokens": 68151625.0, + "step": 8395 + }, + { + "entropy": 0.23519628643989562, + "epoch": 4.74040632054176, + "grad_norm": 2.051119327545166, + "learning_rate": 4.503883775412174e-06, + "loss": 0.1334, + "mean_token_accuracy": 0.9583625435829163, + "num_tokens": 68192104.0, + "step": 8400 + }, + { + "entropy": 0.2450340747833252, + "epoch": 4.743227990970655, + "grad_norm": 2.1468701362609863, + "learning_rate": 4.503320496721283e-06, + "loss": 0.1437, + "mean_token_accuracy": 0.9554325222969056, + "num_tokens": 68232969.0, + "step": 8405 + }, + { + "entropy": 0.22379915416240692, + "epoch": 4.746049661399549, + "grad_norm": 2.243983268737793, + "learning_rate": 4.5027569472591515e-06, + "loss": 0.1262, + "mean_token_accuracy": 0.9602833390235901, + "num_tokens": 68273416.0, + "step": 8410 + }, + { + "entropy": 0.23107292354106904, + "epoch": 4.748871331828442, + "grad_norm": 1.9481990337371826, + "learning_rate": 4.502193127136489e-06, + "loss": 0.1195, + "mean_token_accuracy": 0.9613822102546692, + "num_tokens": 68314053.0, + "step": 8415 + }, + { + "entropy": 0.24034847021102906, + "epoch": 4.751693002257336, + "grad_norm": 2.4841561317443848, + "learning_rate": 4.501629036464057e-06, + "loss": 0.1295, + "mean_token_accuracy": 0.9587523937225342, + "num_tokens": 68354766.0, + "step": 8420 + }, + { + "entropy": 0.23407194018363953, + "epoch": 4.754514672686231, + "grad_norm": 2.2087182998657227, + "learning_rate": 4.501064675352671e-06, + "loss": 0.125, + "mean_token_accuracy": 0.9598301172256469, + "num_tokens": 68395431.0, + "step": 8425 + }, + { + "entropy": 0.25628364980220797, + "epoch": 4.757336343115124, + "grad_norm": 2.0910379886627197, + "learning_rate": 4.500500043913203e-06, + "loss": 0.1406, + "mean_token_accuracy": 0.9561483979225158, + "num_tokens": 68436109.0, + "step": 8430 + }, + { + "entropy": 0.26322224736213684, + "epoch": 4.760158013544018, + "grad_norm": 2.235393762588501, + "learning_rate": 4.499935142256571e-06, + "loss": 0.1605, + "mean_token_accuracy": 0.9494641661643982, + "num_tokens": 68476762.0, + "step": 8435 + }, + { + "entropy": 0.24928564131259917, + "epoch": 4.762979683972912, + "grad_norm": 2.2493717670440674, + "learning_rate": 4.499369970493751e-06, + "loss": 0.1457, + "mean_token_accuracy": 0.9544100880622863, + "num_tokens": 68517508.0, + "step": 8440 + }, + { + "entropy": 0.23149662911891938, + "epoch": 4.765801354401805, + "grad_norm": 1.8034436702728271, + "learning_rate": 4.498804528735773e-06, + "loss": 0.1316, + "mean_token_accuracy": 0.958375072479248, + "num_tokens": 68558346.0, + "step": 8445 + }, + { + "entropy": 0.2590792328119278, + "epoch": 4.7686230248307, + "grad_norm": 1.9933143854141235, + "learning_rate": 4.498238817093717e-06, + "loss": 0.1211, + "mean_token_accuracy": 0.9613798260688782, + "num_tokens": 68598995.0, + "step": 8450 + }, + { + "entropy": 0.21191737055778503, + "epoch": 4.771444695259594, + "grad_norm": 1.8412405252456665, + "learning_rate": 4.497672835678716e-06, + "loss": 0.1165, + "mean_token_accuracy": 0.9624577045440674, + "num_tokens": 68639676.0, + "step": 8455 + }, + { + "entropy": 0.2465408831834793, + "epoch": 4.774266365688487, + "grad_norm": 2.141119956970215, + "learning_rate": 4.497106584601957e-06, + "loss": 0.1499, + "mean_token_accuracy": 0.9527180552482605, + "num_tokens": 68680313.0, + "step": 8460 + }, + { + "entropy": 0.2483223021030426, + "epoch": 4.777088036117382, + "grad_norm": 2.286459445953369, + "learning_rate": 4.496540063974683e-06, + "loss": 0.1449, + "mean_token_accuracy": 0.9531118035316467, + "num_tokens": 68721184.0, + "step": 8465 + }, + { + "entropy": 0.23041402101516723, + "epoch": 4.779909706546276, + "grad_norm": 1.8058642148971558, + "learning_rate": 4.495973273908184e-06, + "loss": 0.1271, + "mean_token_accuracy": 0.9595191001892089, + "num_tokens": 68761833.0, + "step": 8470 + }, + { + "entropy": 0.24951161742210387, + "epoch": 4.782731376975169, + "grad_norm": 2.057999849319458, + "learning_rate": 4.495406214513807e-06, + "loss": 0.1334, + "mean_token_accuracy": 0.9589525699615479, + "num_tokens": 68802489.0, + "step": 8475 + }, + { + "entropy": 0.2543626993894577, + "epoch": 4.785553047404063, + "grad_norm": 2.1733288764953613, + "learning_rate": 4.494838885902952e-06, + "loss": 0.1569, + "mean_token_accuracy": 0.9508596181869506, + "num_tokens": 68843038.0, + "step": 8480 + }, + { + "entropy": 0.22672601640224457, + "epoch": 4.788374717832957, + "grad_norm": 3.0629308223724365, + "learning_rate": 4.4942712881870684e-06, + "loss": 0.13, + "mean_token_accuracy": 0.9586154460906983, + "num_tokens": 68883645.0, + "step": 8485 + }, + { + "entropy": 0.24403004348278046, + "epoch": 4.791196388261851, + "grad_norm": 2.2074005603790283, + "learning_rate": 4.493703421477663e-06, + "loss": 0.1449, + "mean_token_accuracy": 0.9540403723716736, + "num_tokens": 68924239.0, + "step": 8490 + }, + { + "entropy": 0.2356546252965927, + "epoch": 4.794018058690745, + "grad_norm": 1.874244213104248, + "learning_rate": 4.493135285886293e-06, + "loss": 0.1392, + "mean_token_accuracy": 0.9564555406570434, + "num_tokens": 68964945.0, + "step": 8495 + }, + { + "entropy": 0.24922839403152466, + "epoch": 4.796839729119639, + "grad_norm": 2.270205020904541, + "learning_rate": 4.492566881524568e-06, + "loss": 0.1371, + "mean_token_accuracy": 0.9573128700256348, + "num_tokens": 69005804.0, + "step": 8500 + }, + { + "epoch": 4.796839729119639, + "eval_entropy": 0.25907859206199646, + "eval_loss": 0.06230160593986511, + "eval_mean_token_accuracy": 0.9831352829933167, + "eval_num_tokens": 69005804.0, + "eval_runtime": 0.164, + "eval_samples_per_second": 24.392, + "eval_steps_per_second": 6.098, + "step": 8500 + }, + { + "entropy": 0.2256260484457016, + "epoch": 4.799661399548532, + "grad_norm": 2.126230001449585, + "learning_rate": 4.491998208504151e-06, + "loss": 0.1214, + "mean_token_accuracy": 0.9609059691429138, + "num_tokens": 69045948.0, + "step": 8505 + }, + { + "entropy": 0.2439708322286606, + "epoch": 4.802483069977427, + "grad_norm": 1.7869462966918945, + "learning_rate": 4.491429266936759e-06, + "loss": 0.1453, + "mean_token_accuracy": 0.9530874967575074, + "num_tokens": 69086575.0, + "step": 8510 + }, + { + "entropy": 0.23215862214565278, + "epoch": 4.805304740406321, + "grad_norm": 1.871141791343689, + "learning_rate": 4.490860056934158e-06, + "loss": 0.1164, + "mean_token_accuracy": 0.9630860209465026, + "num_tokens": 69127332.0, + "step": 8515 + }, + { + "entropy": 0.22940534651279448, + "epoch": 4.808126410835214, + "grad_norm": 1.7535682916641235, + "learning_rate": 4.490290578608173e-06, + "loss": 0.1133, + "mean_token_accuracy": 0.9624628305435181, + "num_tokens": 69167989.0, + "step": 8520 + }, + { + "entropy": 0.22979909479618071, + "epoch": 4.810948081264108, + "grad_norm": 2.0037307739257812, + "learning_rate": 4.489720832070676e-06, + "loss": 0.1249, + "mean_token_accuracy": 0.9602430582046508, + "num_tokens": 69208619.0, + "step": 8525 + }, + { + "entropy": 0.24260205030441284, + "epoch": 4.813769751693002, + "grad_norm": 2.209491491317749, + "learning_rate": 4.489150817433594e-06, + "loss": 0.1345, + "mean_token_accuracy": 0.9570428252220153, + "num_tokens": 69249329.0, + "step": 8530 + }, + { + "entropy": 0.22956629991531372, + "epoch": 4.816591422121896, + "grad_norm": 2.2682173252105713, + "learning_rate": 4.488580534808908e-06, + "loss": 0.1358, + "mean_token_accuracy": 0.9560045003890991, + "num_tokens": 69289846.0, + "step": 8535 + }, + { + "entropy": 0.2220306247472763, + "epoch": 4.81941309255079, + "grad_norm": 1.8845446109771729, + "learning_rate": 4.488009984308647e-06, + "loss": 0.1276, + "mean_token_accuracy": 0.9602849364280701, + "num_tokens": 69330609.0, + "step": 8540 + }, + { + "entropy": 0.25677892565727234, + "epoch": 4.822234762979684, + "grad_norm": 2.3458547592163086, + "learning_rate": 4.487439166044898e-06, + "loss": 0.1333, + "mean_token_accuracy": 0.9575768232345581, + "num_tokens": 69370952.0, + "step": 8545 + }, + { + "entropy": 0.2511105537414551, + "epoch": 4.825056433408578, + "grad_norm": 2.185471773147583, + "learning_rate": 4.486868080129797e-06, + "loss": 0.1403, + "mean_token_accuracy": 0.9561097264289856, + "num_tokens": 69411446.0, + "step": 8550 + }, + { + "entropy": 0.2550018668174744, + "epoch": 4.827878103837472, + "grad_norm": 2.0615882873535156, + "learning_rate": 4.486296726675535e-06, + "loss": 0.1344, + "mean_token_accuracy": 0.957464849948883, + "num_tokens": 69452055.0, + "step": 8555 + }, + { + "entropy": 0.2429851323366165, + "epoch": 4.830699774266366, + "grad_norm": 2.123836040496826, + "learning_rate": 4.485725105794354e-06, + "loss": 0.1285, + "mean_token_accuracy": 0.9590805530548095, + "num_tokens": 69492630.0, + "step": 8560 + }, + { + "entropy": 0.23657283782958985, + "epoch": 4.8335214446952595, + "grad_norm": 2.050199508666992, + "learning_rate": 4.48515321759855e-06, + "loss": 0.1305, + "mean_token_accuracy": 0.9578537583351135, + "num_tokens": 69533313.0, + "step": 8565 + }, + { + "entropy": 0.23308176994323732, + "epoch": 4.836343115124153, + "grad_norm": 1.9955555200576782, + "learning_rate": 4.4845810622004685e-06, + "loss": 0.1443, + "mean_token_accuracy": 0.9529533624649048, + "num_tokens": 69573940.0, + "step": 8570 + }, + { + "entropy": 0.23426572382450103, + "epoch": 4.839164785553048, + "grad_norm": 1.7304738759994507, + "learning_rate": 4.484008639712511e-06, + "loss": 0.1191, + "mean_token_accuracy": 0.9620694041252136, + "num_tokens": 69614517.0, + "step": 8575 + }, + { + "entropy": 0.2388251841068268, + "epoch": 4.841986455981941, + "grad_norm": 2.242614269256592, + "learning_rate": 4.48343595024713e-06, + "loss": 0.138, + "mean_token_accuracy": 0.9573504686355591, + "num_tokens": 69654972.0, + "step": 8580 + }, + { + "entropy": 0.2321117788553238, + "epoch": 4.844808126410835, + "grad_norm": 2.4217350482940674, + "learning_rate": 4.482862993916829e-06, + "loss": 0.1388, + "mean_token_accuracy": 0.95455002784729, + "num_tokens": 69695607.0, + "step": 8585 + }, + { + "entropy": 0.22771554291248322, + "epoch": 4.847629796839729, + "grad_norm": 1.7584290504455566, + "learning_rate": 4.482289770834168e-06, + "loss": 0.1239, + "mean_token_accuracy": 0.9602065324783325, + "num_tokens": 69736187.0, + "step": 8590 + }, + { + "entropy": 0.22587930858135224, + "epoch": 4.850451467268623, + "grad_norm": 2.119394302368164, + "learning_rate": 4.481716281111753e-06, + "loss": 0.1207, + "mean_token_accuracy": 0.9609213948249817, + "num_tokens": 69777050.0, + "step": 8595 + }, + { + "entropy": 0.23252309560775758, + "epoch": 4.853273137697517, + "grad_norm": 2.079322576522827, + "learning_rate": 4.481142524862249e-06, + "loss": 0.1269, + "mean_token_accuracy": 0.9575224757194519, + "num_tokens": 69817801.0, + "step": 8600 + }, + { + "entropy": 0.2291330635547638, + "epoch": 4.856094808126411, + "grad_norm": 2.186591148376465, + "learning_rate": 4.48056850219837e-06, + "loss": 0.1236, + "mean_token_accuracy": 0.9605337023735047, + "num_tokens": 69858177.0, + "step": 8605 + }, + { + "entropy": 0.22314091324806212, + "epoch": 4.8589164785553045, + "grad_norm": 1.9909532070159912, + "learning_rate": 4.479994213232882e-06, + "loss": 0.1238, + "mean_token_accuracy": 0.9604300022125244, + "num_tokens": 69898890.0, + "step": 8610 + }, + { + "entropy": 0.2294975697994232, + "epoch": 4.861738148984198, + "grad_norm": 2.1407694816589355, + "learning_rate": 4.479419658078606e-06, + "loss": 0.1272, + "mean_token_accuracy": 0.9603155970573425, + "num_tokens": 69939533.0, + "step": 8615 + }, + { + "entropy": 0.23732683360576629, + "epoch": 4.864559819413093, + "grad_norm": 2.215397357940674, + "learning_rate": 4.478844836848411e-06, + "loss": 0.1376, + "mean_token_accuracy": 0.9563262820243835, + "num_tokens": 69980394.0, + "step": 8620 + }, + { + "entropy": 0.24985795021057128, + "epoch": 4.8673814898419865, + "grad_norm": 1.9875785112380981, + "learning_rate": 4.478269749655222e-06, + "loss": 0.1414, + "mean_token_accuracy": 0.954399836063385, + "num_tokens": 70020932.0, + "step": 8625 + }, + { + "entropy": 0.23618102967739105, + "epoch": 4.87020316027088, + "grad_norm": 2.1521682739257812, + "learning_rate": 4.477694396612014e-06, + "loss": 0.124, + "mean_token_accuracy": 0.9608348488807679, + "num_tokens": 70061566.0, + "step": 8630 + }, + { + "entropy": 0.21555082201957704, + "epoch": 4.873024830699774, + "grad_norm": 1.909771203994751, + "learning_rate": 4.477118777831817e-06, + "loss": 0.1278, + "mean_token_accuracy": 0.9597031474113464, + "num_tokens": 70102195.0, + "step": 8635 + }, + { + "entropy": 0.24607637226581575, + "epoch": 4.8758465011286685, + "grad_norm": 2.6042673587799072, + "learning_rate": 4.47654289342771e-06, + "loss": 0.1404, + "mean_token_accuracy": 0.9529093623161315, + "num_tokens": 70142718.0, + "step": 8640 + }, + { + "entropy": 0.22525620758533477, + "epoch": 4.878668171557562, + "grad_norm": 2.1628029346466064, + "learning_rate": 4.475966743512826e-06, + "loss": 0.1384, + "mean_token_accuracy": 0.9556032299995423, + "num_tokens": 70183458.0, + "step": 8645 + }, + { + "entropy": 0.24763034284114838, + "epoch": 4.881489841986456, + "grad_norm": 2.244004726409912, + "learning_rate": 4.47539032820035e-06, + "loss": 0.1385, + "mean_token_accuracy": 0.9558951020240783, + "num_tokens": 70224283.0, + "step": 8650 + }, + { + "entropy": 0.23140210211277007, + "epoch": 4.8843115124153496, + "grad_norm": 2.290135145187378, + "learning_rate": 4.474813647603518e-06, + "loss": 0.1229, + "mean_token_accuracy": 0.9592395067214966, + "num_tokens": 70264983.0, + "step": 8655 + }, + { + "entropy": 0.2398304522037506, + "epoch": 4.887133182844244, + "grad_norm": 1.9016507863998413, + "learning_rate": 4.4742367018356195e-06, + "loss": 0.1311, + "mean_token_accuracy": 0.95786372423172, + "num_tokens": 70305649.0, + "step": 8660 + }, + { + "entropy": 0.23110878467559814, + "epoch": 4.889954853273138, + "grad_norm": 1.8025792837142944, + "learning_rate": 4.4736594910099956e-06, + "loss": 0.1405, + "mean_token_accuracy": 0.9554929614067078, + "num_tokens": 70345976.0, + "step": 8665 + }, + { + "entropy": 0.24155546426773072, + "epoch": 4.8927765237020315, + "grad_norm": 1.9929896593093872, + "learning_rate": 4.47308201524004e-06, + "loss": 0.1383, + "mean_token_accuracy": 0.955091404914856, + "num_tokens": 70386380.0, + "step": 8670 + }, + { + "entropy": 0.23273320198059083, + "epoch": 4.895598194130925, + "grad_norm": 1.964606523513794, + "learning_rate": 4.4725042746391965e-06, + "loss": 0.1267, + "mean_token_accuracy": 0.9588501572608947, + "num_tokens": 70427041.0, + "step": 8675 + }, + { + "entropy": 0.24253876507282257, + "epoch": 4.89841986455982, + "grad_norm": 2.12473464012146, + "learning_rate": 4.471926269320963e-06, + "loss": 0.1228, + "mean_token_accuracy": 0.9616016149520874, + "num_tokens": 70467666.0, + "step": 8680 + }, + { + "entropy": 0.23109547793865204, + "epoch": 4.9012415349887135, + "grad_norm": 2.302859306335449, + "learning_rate": 4.471347999398888e-06, + "loss": 0.1198, + "mean_token_accuracy": 0.9608020305633544, + "num_tokens": 70507748.0, + "step": 8685 + }, + { + "entropy": 0.21591301262378693, + "epoch": 4.904063205417607, + "grad_norm": 1.8348546028137207, + "learning_rate": 4.4707694649865755e-06, + "loss": 0.1206, + "mean_token_accuracy": 0.9611794590950012, + "num_tokens": 70548511.0, + "step": 8690 + }, + { + "entropy": 0.2398153394460678, + "epoch": 4.906884875846501, + "grad_norm": 1.7756059169769287, + "learning_rate": 4.470190666197675e-06, + "loss": 0.132, + "mean_token_accuracy": 0.9583917617797851, + "num_tokens": 70589210.0, + "step": 8695 + }, + { + "entropy": 0.24644698202610016, + "epoch": 4.909706546275395, + "grad_norm": 2.212986469268799, + "learning_rate": 4.469611603145895e-06, + "loss": 0.1325, + "mean_token_accuracy": 0.957687258720398, + "num_tokens": 70629759.0, + "step": 8700 + }, + { + "entropy": 0.2506899446249008, + "epoch": 4.912528216704289, + "grad_norm": 2.2517426013946533, + "learning_rate": 4.469032275944989e-06, + "loss": 0.1354, + "mean_token_accuracy": 0.9561644673347474, + "num_tokens": 70669828.0, + "step": 8705 + }, + { + "entropy": 0.26197082698345187, + "epoch": 4.915349887133183, + "grad_norm": 2.4998831748962402, + "learning_rate": 4.468452684708769e-06, + "loss": 0.1506, + "mean_token_accuracy": 0.953766405582428, + "num_tokens": 70710349.0, + "step": 8710 + }, + { + "entropy": 0.2373470038175583, + "epoch": 4.918171557562077, + "grad_norm": 2.084594249725342, + "learning_rate": 4.467872829551093e-06, + "loss": 0.1306, + "mean_token_accuracy": 0.9590035080909729, + "num_tokens": 70751075.0, + "step": 8715 + }, + { + "entropy": 0.23836889564990998, + "epoch": 4.92099322799097, + "grad_norm": 2.043854236602783, + "learning_rate": 4.467292710585876e-06, + "loss": 0.1263, + "mean_token_accuracy": 0.9599133729934692, + "num_tokens": 70791615.0, + "step": 8720 + }, + { + "entropy": 0.2358907401561737, + "epoch": 4.923814898419865, + "grad_norm": 1.9788299798965454, + "learning_rate": 4.466712327927082e-06, + "loss": 0.1422, + "mean_token_accuracy": 0.9546040058135986, + "num_tokens": 70832420.0, + "step": 8725 + }, + { + "entropy": 0.2312130182981491, + "epoch": 4.926636568848759, + "grad_norm": 2.282057762145996, + "learning_rate": 4.466131681688725e-06, + "loss": 0.1265, + "mean_token_accuracy": 0.9591808557510376, + "num_tokens": 70873131.0, + "step": 8730 + }, + { + "entropy": 0.22316355705261232, + "epoch": 4.929458239277652, + "grad_norm": 2.02055025100708, + "learning_rate": 4.465550771984877e-06, + "loss": 0.1187, + "mean_token_accuracy": 0.9624357938766479, + "num_tokens": 70913503.0, + "step": 8735 + }, + { + "entropy": 0.2304389774799347, + "epoch": 4.932279909706546, + "grad_norm": 2.3464314937591553, + "learning_rate": 4.464969598929654e-06, + "loss": 0.1205, + "mean_token_accuracy": 0.9610739946365356, + "num_tokens": 70954047.0, + "step": 8740 + }, + { + "entropy": 0.23224081397056578, + "epoch": 4.9351015801354405, + "grad_norm": 2.0249598026275635, + "learning_rate": 4.4643881626372305e-06, + "loss": 0.125, + "mean_token_accuracy": 0.959325659275055, + "num_tokens": 70994812.0, + "step": 8745 + }, + { + "entropy": 0.2500057280063629, + "epoch": 4.937923250564334, + "grad_norm": 2.1669914722442627, + "learning_rate": 4.463806463221827e-06, + "loss": 0.1378, + "mean_token_accuracy": 0.9567291259765625, + "num_tokens": 71035388.0, + "step": 8750 + }, + { + "entropy": 0.23122502863407135, + "epoch": 4.940744920993228, + "grad_norm": 2.2984166145324707, + "learning_rate": 4.463224500797721e-06, + "loss": 0.1396, + "mean_token_accuracy": 0.954986846446991, + "num_tokens": 71076225.0, + "step": 8755 + }, + { + "entropy": 0.22869625687599182, + "epoch": 4.943566591422122, + "grad_norm": 1.968353509902954, + "learning_rate": 4.462642275479236e-06, + "loss": 0.1187, + "mean_token_accuracy": 0.9607104420661926, + "num_tokens": 71116934.0, + "step": 8760 + }, + { + "entropy": 0.23740582168102264, + "epoch": 4.946388261851016, + "grad_norm": 1.954420566558838, + "learning_rate": 4.462059787380754e-06, + "loss": 0.1371, + "mean_token_accuracy": 0.9562184929847717, + "num_tokens": 71157332.0, + "step": 8765 + }, + { + "entropy": 0.23098691403865815, + "epoch": 4.94920993227991, + "grad_norm": 1.9516210556030273, + "learning_rate": 4.461477036616702e-06, + "loss": 0.1415, + "mean_token_accuracy": 0.9550253391265869, + "num_tokens": 71198170.0, + "step": 8770 + }, + { + "entropy": 0.22629740536212922, + "epoch": 4.952031602708804, + "grad_norm": 2.029778480529785, + "learning_rate": 4.460894023301563e-06, + "loss": 0.1159, + "mean_token_accuracy": 0.9626461505889893, + "num_tokens": 71238652.0, + "step": 8775 + }, + { + "entropy": 0.22563746571540833, + "epoch": 4.954853273137697, + "grad_norm": 2.0067238807678223, + "learning_rate": 4.460310747549869e-06, + "loss": 0.128, + "mean_token_accuracy": 0.9585816025733948, + "num_tokens": 71279449.0, + "step": 8780 + }, + { + "entropy": 0.264061963558197, + "epoch": 4.957674943566591, + "grad_norm": 2.42386794090271, + "learning_rate": 4.459727209476205e-06, + "loss": 0.1534, + "mean_token_accuracy": 0.9512446284294128, + "num_tokens": 71320179.0, + "step": 8785 + }, + { + "entropy": 0.2313307523727417, + "epoch": 4.960496613995486, + "grad_norm": 1.9760738611221313, + "learning_rate": 4.459143409195208e-06, + "loss": 0.1184, + "mean_token_accuracy": 0.9614473700523376, + "num_tokens": 71360778.0, + "step": 8790 + }, + { + "entropy": 0.23215781152248383, + "epoch": 4.963318284424379, + "grad_norm": 2.0897891521453857, + "learning_rate": 4.458559346821564e-06, + "loss": 0.1406, + "mean_token_accuracy": 0.9561804056167602, + "num_tokens": 71401477.0, + "step": 8795 + }, + { + "entropy": 0.2455916315317154, + "epoch": 4.966139954853273, + "grad_norm": 2.2576193809509277, + "learning_rate": 4.457975022470013e-06, + "loss": 0.139, + "mean_token_accuracy": 0.9552298545837402, + "num_tokens": 71442131.0, + "step": 8800 + }, + { + "entropy": 0.2560196042060852, + "epoch": 4.968961625282167, + "grad_norm": 2.23545503616333, + "learning_rate": 4.457390436255345e-06, + "loss": 0.1483, + "mean_token_accuracy": 0.952830684185028, + "num_tokens": 71482712.0, + "step": 8805 + }, + { + "entropy": 0.24611234068870544, + "epoch": 4.971783295711061, + "grad_norm": 1.8639657497406006, + "learning_rate": 4.456805588292404e-06, + "loss": 0.1302, + "mean_token_accuracy": 0.9585993766784668, + "num_tokens": 71523349.0, + "step": 8810 + }, + { + "entropy": 0.2496563047170639, + "epoch": 4.974604966139955, + "grad_norm": 2.166790723800659, + "learning_rate": 4.456220478696081e-06, + "loss": 0.1514, + "mean_token_accuracy": 0.9514712452888489, + "num_tokens": 71563712.0, + "step": 8815 + }, + { + "entropy": 0.23716452717781067, + "epoch": 4.977426636568849, + "grad_norm": 2.0670976638793945, + "learning_rate": 4.455635107581322e-06, + "loss": 0.1365, + "mean_token_accuracy": 0.9566124558448792, + "num_tokens": 71604371.0, + "step": 8820 + }, + { + "entropy": 0.21934570670127868, + "epoch": 4.980248306997742, + "grad_norm": 2.0929582118988037, + "learning_rate": 4.455049475063124e-06, + "loss": 0.1163, + "mean_token_accuracy": 0.9618055701255799, + "num_tokens": 71645153.0, + "step": 8825 + }, + { + "entropy": 0.24300400614738465, + "epoch": 4.983069977426637, + "grad_norm": 1.9957592487335205, + "learning_rate": 4.4544635812565335e-06, + "loss": 0.1322, + "mean_token_accuracy": 0.9577077746391296, + "num_tokens": 71685968.0, + "step": 8830 + }, + { + "entropy": 0.24642634391784668, + "epoch": 4.985891647855531, + "grad_norm": 1.812232494354248, + "learning_rate": 4.453877426276649e-06, + "loss": 0.1327, + "mean_token_accuracy": 0.9573553204536438, + "num_tokens": 71726713.0, + "step": 8835 + }, + { + "entropy": 0.24452163875102997, + "epoch": 4.988713318284424, + "grad_norm": 2.2379374504089355, + "learning_rate": 4.4532910102386215e-06, + "loss": 0.1368, + "mean_token_accuracy": 0.9563169717788697, + "num_tokens": 71767291.0, + "step": 8840 + }, + { + "entropy": 0.23888054192066194, + "epoch": 4.991534988713318, + "grad_norm": 2.2258286476135254, + "learning_rate": 4.452704333257653e-06, + "loss": 0.1338, + "mean_token_accuracy": 0.9571861267089844, + "num_tokens": 71807947.0, + "step": 8845 + }, + { + "entropy": 0.23455857038497924, + "epoch": 4.994356659142213, + "grad_norm": 2.2029316425323486, + "learning_rate": 4.452117395448995e-06, + "loss": 0.1321, + "mean_token_accuracy": 0.9573328852653503, + "num_tokens": 71848678.0, + "step": 8850 + }, + { + "entropy": 0.23345192968845369, + "epoch": 4.997178329571106, + "grad_norm": 2.2948131561279297, + "learning_rate": 4.451530196927952e-06, + "loss": 0.1239, + "mean_token_accuracy": 0.9603603482246399, + "num_tokens": 71889260.0, + "step": 8855 + }, + { + "entropy": 0.22622182071208954, + "epoch": 5.0, + "grad_norm": 3.6322450637817383, + "learning_rate": 4.45094273780988e-06, + "loss": 0.1305, + "mean_token_accuracy": 0.9572938919067383, + "num_tokens": 71923845.0, + "step": 8860 + }, + { + "entropy": 0.22580887079238893, + "epoch": 5.002821670428894, + "grad_norm": 1.672566294670105, + "learning_rate": 4.450355018210185e-06, + "loss": 0.0748, + "mean_token_accuracy": 0.9793183445930481, + "num_tokens": 71964397.0, + "step": 8865 + }, + { + "entropy": 0.1980500638484955, + "epoch": 5.005643340857787, + "grad_norm": 1.7135627269744873, + "learning_rate": 4.4497670382443235e-06, + "loss": 0.0595, + "mean_token_accuracy": 0.9829611539840698, + "num_tokens": 72004845.0, + "step": 8870 + }, + { + "entropy": 0.19046223759651185, + "epoch": 5.008465011286682, + "grad_norm": 1.7195278406143188, + "learning_rate": 4.449178798027806e-06, + "loss": 0.0699, + "mean_token_accuracy": 0.9804732322692871, + "num_tokens": 72045486.0, + "step": 8875 + }, + { + "entropy": 0.1900404006242752, + "epoch": 5.011286681715576, + "grad_norm": 1.8688246011734009, + "learning_rate": 4.4485902976761905e-06, + "loss": 0.0699, + "mean_token_accuracy": 0.9789183497428894, + "num_tokens": 72086189.0, + "step": 8880 + }, + { + "entropy": 0.17116313576698303, + "epoch": 5.014108352144469, + "grad_norm": 2.521716833114624, + "learning_rate": 4.44800153730509e-06, + "loss": 0.0676, + "mean_token_accuracy": 0.979660964012146, + "num_tokens": 72126926.0, + "step": 8885 + }, + { + "entropy": 0.1692497670650482, + "epoch": 5.016930022573363, + "grad_norm": 2.2940783500671387, + "learning_rate": 4.447412517030165e-06, + "loss": 0.0638, + "mean_token_accuracy": 0.980719518661499, + "num_tokens": 72167664.0, + "step": 8890 + }, + { + "entropy": 0.18492531478405, + "epoch": 5.019751693002258, + "grad_norm": 2.423424005508423, + "learning_rate": 4.446823236967129e-06, + "loss": 0.086, + "mean_token_accuracy": 0.9749364614486694, + "num_tokens": 72208162.0, + "step": 8895 + }, + { + "entropy": 0.1805371016263962, + "epoch": 5.022573363431151, + "grad_norm": 2.1918728351593018, + "learning_rate": 4.446233697231747e-06, + "loss": 0.0717, + "mean_token_accuracy": 0.9780835628509521, + "num_tokens": 72248751.0, + "step": 8900 + }, + { + "entropy": 0.19484019577503203, + "epoch": 5.025395033860045, + "grad_norm": 2.126352071762085, + "learning_rate": 4.445643897939832e-06, + "loss": 0.0653, + "mean_token_accuracy": 0.9807442188262939, + "num_tokens": 72289553.0, + "step": 8905 + }, + { + "entropy": 0.16935117840766906, + "epoch": 5.028216704288939, + "grad_norm": 1.7222833633422852, + "learning_rate": 4.445053839207252e-06, + "loss": 0.0582, + "mean_token_accuracy": 0.9815907716751099, + "num_tokens": 72330231.0, + "step": 8910 + }, + { + "entropy": 0.17409595847129822, + "epoch": 5.031038374717833, + "grad_norm": 1.9635859727859497, + "learning_rate": 4.4444635211499245e-06, + "loss": 0.0644, + "mean_token_accuracy": 0.9801131606101989, + "num_tokens": 72370964.0, + "step": 8915 + }, + { + "entropy": 0.18664040863513948, + "epoch": 5.033860045146727, + "grad_norm": 2.004497528076172, + "learning_rate": 4.443872943883817e-06, + "loss": 0.0686, + "mean_token_accuracy": 0.9801067471504211, + "num_tokens": 72411632.0, + "step": 8920 + }, + { + "entropy": 0.19012978672981262, + "epoch": 5.036681715575621, + "grad_norm": 2.1313867568969727, + "learning_rate": 4.443282107524947e-06, + "loss": 0.0779, + "mean_token_accuracy": 0.9768623113632202, + "num_tokens": 72452367.0, + "step": 8925 + }, + { + "entropy": 0.18184551000595092, + "epoch": 5.039503386004514, + "grad_norm": 2.076591730117798, + "learning_rate": 4.442691012189386e-06, + "loss": 0.0734, + "mean_token_accuracy": 0.9780202388763428, + "num_tokens": 72493043.0, + "step": 8930 + }, + { + "entropy": 0.17082217931747437, + "epoch": 5.042325056433409, + "grad_norm": 1.6795393228530884, + "learning_rate": 4.4420996579932555e-06, + "loss": 0.0697, + "mean_token_accuracy": 0.9782551288604736, + "num_tokens": 72533217.0, + "step": 8935 + }, + { + "entropy": 0.19154114723205568, + "epoch": 5.045146726862303, + "grad_norm": 1.8840192556381226, + "learning_rate": 4.4415080450527244e-06, + "loss": 0.0783, + "mean_token_accuracy": 0.9762571692466736, + "num_tokens": 72573491.0, + "step": 8940 + }, + { + "entropy": 0.1921801507472992, + "epoch": 5.047968397291196, + "grad_norm": 2.051151752471924, + "learning_rate": 4.440916173484018e-06, + "loss": 0.0735, + "mean_token_accuracy": 0.9775419950485229, + "num_tokens": 72614111.0, + "step": 8945 + }, + { + "entropy": 0.171154123544693, + "epoch": 5.05079006772009, + "grad_norm": 1.7416431903839111, + "learning_rate": 4.440324043403408e-06, + "loss": 0.0562, + "mean_token_accuracy": 0.982762086391449, + "num_tokens": 72654942.0, + "step": 8950 + }, + { + "entropy": 0.1801618218421936, + "epoch": 5.053611738148984, + "grad_norm": 2.1165449619293213, + "learning_rate": 4.439731654927218e-06, + "loss": 0.0651, + "mean_token_accuracy": 0.9803321242332459, + "num_tokens": 72695401.0, + "step": 8955 + }, + { + "entropy": 0.17029935419559478, + "epoch": 5.056433408577878, + "grad_norm": 2.3758440017700195, + "learning_rate": 4.439139008171824e-06, + "loss": 0.0691, + "mean_token_accuracy": 0.9787796020507813, + "num_tokens": 72736277.0, + "step": 8960 + }, + { + "entropy": 0.1681983232498169, + "epoch": 5.059255079006772, + "grad_norm": 1.9630498886108398, + "learning_rate": 4.43854610325365e-06, + "loss": 0.0617, + "mean_token_accuracy": 0.981269919872284, + "num_tokens": 72777155.0, + "step": 8965 + }, + { + "entropy": 0.20426134169101715, + "epoch": 5.062076749435666, + "grad_norm": 2.2527589797973633, + "learning_rate": 4.437952940289175e-06, + "loss": 0.0815, + "mean_token_accuracy": 0.9752603888511657, + "num_tokens": 72817863.0, + "step": 8970 + }, + { + "entropy": 0.17029196619987488, + "epoch": 5.0648984198645595, + "grad_norm": 1.9497177600860596, + "learning_rate": 4.437359519394923e-06, + "loss": 0.0664, + "mean_token_accuracy": 0.9784554839134216, + "num_tokens": 72858530.0, + "step": 8975 + }, + { + "entropy": 0.18146368563175203, + "epoch": 5.067720090293454, + "grad_norm": 1.7516709566116333, + "learning_rate": 4.436765840687473e-06, + "loss": 0.0697, + "mean_token_accuracy": 0.9783901810646057, + "num_tokens": 72899175.0, + "step": 8980 + }, + { + "entropy": 0.17964912950992584, + "epoch": 5.070541760722348, + "grad_norm": 1.487107515335083, + "learning_rate": 4.4361719042834525e-06, + "loss": 0.0595, + "mean_token_accuracy": 0.9815647959709167, + "num_tokens": 72939846.0, + "step": 8985 + }, + { + "entropy": 0.1833242356777191, + "epoch": 5.073363431151241, + "grad_norm": 2.1468114852905273, + "learning_rate": 4.435577710299542e-06, + "loss": 0.0569, + "mean_token_accuracy": 0.982574725151062, + "num_tokens": 72979909.0, + "step": 8990 + }, + { + "entropy": 0.1870110362768173, + "epoch": 5.076185101580135, + "grad_norm": 1.998921513557434, + "learning_rate": 4.43498325885247e-06, + "loss": 0.0646, + "mean_token_accuracy": 0.9806098222732544, + "num_tokens": 73020545.0, + "step": 8995 + }, + { + "entropy": 0.18318285942077636, + "epoch": 5.07900677200903, + "grad_norm": 1.859528660774231, + "learning_rate": 4.434388550059016e-06, + "loss": 0.0739, + "mean_token_accuracy": 0.9773783922195435, + "num_tokens": 73061009.0, + "step": 9000 + }, + { + "epoch": 5.07900677200903, + "eval_entropy": 0.23515231907367706, + "eval_loss": 0.055877141654491425, + "eval_mean_token_accuracy": 0.9827520251274109, + "eval_num_tokens": 73061009.0, + "eval_runtime": 0.1636, + "eval_samples_per_second": 24.455, + "eval_steps_per_second": 6.114, + "step": 9000 + }, + { + "entropy": 0.1605137288570404, + "epoch": 5.081828442437923, + "grad_norm": 2.0149075984954834, + "learning_rate": 4.433793584036011e-06, + "loss": 0.0591, + "mean_token_accuracy": 0.9814065933227539, + "num_tokens": 73101615.0, + "step": 9005 + }, + { + "entropy": 0.19753072261810303, + "epoch": 5.084650112866817, + "grad_norm": 1.9284378290176392, + "learning_rate": 4.433198360900337e-06, + "loss": 0.0738, + "mean_token_accuracy": 0.9784594297409057, + "num_tokens": 73142398.0, + "step": 9010 + }, + { + "entropy": 0.18846041858196258, + "epoch": 5.087471783295711, + "grad_norm": 2.1237173080444336, + "learning_rate": 4.432602880768925e-06, + "loss": 0.076, + "mean_token_accuracy": 0.9769237160682678, + "num_tokens": 73183058.0, + "step": 9015 + }, + { + "entropy": 0.1911979854106903, + "epoch": 5.090293453724605, + "grad_norm": 1.8477128744125366, + "learning_rate": 4.4320071437587554e-06, + "loss": 0.0704, + "mean_token_accuracy": 0.9788993120193481, + "num_tokens": 73223530.0, + "step": 9020 + }, + { + "entropy": 0.19110194444656373, + "epoch": 5.093115124153499, + "grad_norm": 2.072406053543091, + "learning_rate": 4.431411149986865e-06, + "loss": 0.074, + "mean_token_accuracy": 0.9778715133666992, + "num_tokens": 73264278.0, + "step": 9025 + }, + { + "entropy": 0.18987910449504852, + "epoch": 5.095936794582393, + "grad_norm": 1.7068977355957031, + "learning_rate": 4.430814899570333e-06, + "loss": 0.0745, + "mean_token_accuracy": 0.9774025797843933, + "num_tokens": 73304905.0, + "step": 9030 + }, + { + "entropy": 0.18311900794506072, + "epoch": 5.0987584650112865, + "grad_norm": 2.1360621452331543, + "learning_rate": 4.430218392626295e-06, + "loss": 0.065, + "mean_token_accuracy": 0.9798977851867676, + "num_tokens": 73345687.0, + "step": 9035 + }, + { + "entropy": 0.17857904732227325, + "epoch": 5.10158013544018, + "grad_norm": 1.964221477508545, + "learning_rate": 4.429621629271933e-06, + "loss": 0.0628, + "mean_token_accuracy": 0.980586564540863, + "num_tokens": 73386265.0, + "step": 9040 + }, + { + "entropy": 0.17305072247982026, + "epoch": 5.104401805869075, + "grad_norm": 2.037114143371582, + "learning_rate": 4.429024609624482e-06, + "loss": 0.0636, + "mean_token_accuracy": 0.9802641272544861, + "num_tokens": 73427151.0, + "step": 9045 + }, + { + "entropy": 0.18654557764530183, + "epoch": 5.1072234762979685, + "grad_norm": 2.1122262477874756, + "learning_rate": 4.428427333801228e-06, + "loss": 0.0626, + "mean_token_accuracy": 0.9806604266166687, + "num_tokens": 73467943.0, + "step": 9050 + }, + { + "entropy": 0.1888034909963608, + "epoch": 5.110045146726862, + "grad_norm": 2.032395124435425, + "learning_rate": 4.4278298019195044e-06, + "loss": 0.0677, + "mean_token_accuracy": 0.9790974020957947, + "num_tokens": 73508586.0, + "step": 9055 + }, + { + "entropy": 0.201117542386055, + "epoch": 5.112866817155756, + "grad_norm": 2.6121714115142822, + "learning_rate": 4.4272320140966965e-06, + "loss": 0.0789, + "mean_token_accuracy": 0.9769583702087402, + "num_tokens": 73549036.0, + "step": 9060 + }, + { + "entropy": 0.17489383816719056, + "epoch": 5.1156884875846504, + "grad_norm": 2.1628427505493164, + "learning_rate": 4.4266339704502415e-06, + "loss": 0.0704, + "mean_token_accuracy": 0.9784565687179565, + "num_tokens": 73589624.0, + "step": 9065 + }, + { + "entropy": 0.17749419510364534, + "epoch": 5.118510158013544, + "grad_norm": 2.2119405269622803, + "learning_rate": 4.426035671097623e-06, + "loss": 0.0672, + "mean_token_accuracy": 0.980502438545227, + "num_tokens": 73630265.0, + "step": 9070 + }, + { + "entropy": 0.17612397372722627, + "epoch": 5.121331828442438, + "grad_norm": 1.97671377658844, + "learning_rate": 4.425437116156377e-06, + "loss": 0.0741, + "mean_token_accuracy": 0.9769485712051391, + "num_tokens": 73670966.0, + "step": 9075 + }, + { + "entropy": 0.1710589587688446, + "epoch": 5.1241534988713315, + "grad_norm": 2.027311325073242, + "learning_rate": 4.424838305744091e-06, + "loss": 0.0636, + "mean_token_accuracy": 0.9812980055809021, + "num_tokens": 73711495.0, + "step": 9080 + }, + { + "entropy": 0.16620054244995117, + "epoch": 5.126975169300226, + "grad_norm": 2.079716920852661, + "learning_rate": 4.4242392399784015e-06, + "loss": 0.0599, + "mean_token_accuracy": 0.9820851564407349, + "num_tokens": 73751986.0, + "step": 9085 + }, + { + "entropy": 0.19693105220794677, + "epoch": 5.12979683972912, + "grad_norm": 1.8494852781295776, + "learning_rate": 4.423639918976994e-06, + "loss": 0.0841, + "mean_token_accuracy": 0.9751617550849915, + "num_tokens": 73792696.0, + "step": 9090 + }, + { + "entropy": 0.16846722960472107, + "epoch": 5.1326185101580135, + "grad_norm": 2.0323588848114014, + "learning_rate": 4.4230403428576055e-06, + "loss": 0.0576, + "mean_token_accuracy": 0.983081865310669, + "num_tokens": 73833479.0, + "step": 9095 + }, + { + "entropy": 0.18227221965789794, + "epoch": 5.135440180586907, + "grad_norm": 1.713901400566101, + "learning_rate": 4.4224405117380235e-06, + "loss": 0.0722, + "mean_token_accuracy": 0.9784926295280456, + "num_tokens": 73874141.0, + "step": 9100 + }, + { + "entropy": 0.17101970911026002, + "epoch": 5.138261851015802, + "grad_norm": 1.8212043046951294, + "learning_rate": 4.421840425736084e-06, + "loss": 0.0694, + "mean_token_accuracy": 0.9785187363624572, + "num_tokens": 73914676.0, + "step": 9105 + }, + { + "entropy": 0.18568507134914397, + "epoch": 5.1410835214446955, + "grad_norm": 2.0307278633117676, + "learning_rate": 4.421240084969673e-06, + "loss": 0.0681, + "mean_token_accuracy": 0.9798570513725281, + "num_tokens": 73955243.0, + "step": 9110 + }, + { + "entropy": 0.18314568400382997, + "epoch": 5.143905191873589, + "grad_norm": 2.26824951171875, + "learning_rate": 4.42063948955673e-06, + "loss": 0.0819, + "mean_token_accuracy": 0.9749473333358765, + "num_tokens": 73996064.0, + "step": 9115 + }, + { + "entropy": 0.18159986436367034, + "epoch": 5.146726862302483, + "grad_norm": 2.368440866470337, + "learning_rate": 4.420038639615241e-06, + "loss": 0.0759, + "mean_token_accuracy": 0.97624671459198, + "num_tokens": 74036627.0, + "step": 9120 + }, + { + "entropy": 0.18119294345378875, + "epoch": 5.149548532731377, + "grad_norm": 1.908693552017212, + "learning_rate": 4.419437535263243e-06, + "loss": 0.067, + "mean_token_accuracy": 0.9788689136505127, + "num_tokens": 74077307.0, + "step": 9125 + }, + { + "entropy": 0.18659307360649108, + "epoch": 5.152370203160271, + "grad_norm": 1.8129174709320068, + "learning_rate": 4.418836176618823e-06, + "loss": 0.0727, + "mean_token_accuracy": 0.9778983354568481, + "num_tokens": 74117925.0, + "step": 9130 + }, + { + "entropy": 0.19027889668941497, + "epoch": 5.155191873589165, + "grad_norm": 1.8075789213180542, + "learning_rate": 4.418234563800117e-06, + "loss": 0.0733, + "mean_token_accuracy": 0.9768654942512512, + "num_tokens": 74158594.0, + "step": 9135 + }, + { + "entropy": 0.17420806884765624, + "epoch": 5.158013544018059, + "grad_norm": 2.1815521717071533, + "learning_rate": 4.417632696925314e-06, + "loss": 0.0643, + "mean_token_accuracy": 0.980584466457367, + "num_tokens": 74199375.0, + "step": 9140 + }, + { + "entropy": 0.19676790237426758, + "epoch": 5.160835214446952, + "grad_norm": 2.2495217323303223, + "learning_rate": 4.417030576112649e-06, + "loss": 0.0635, + "mean_token_accuracy": 0.9816282868385315, + "num_tokens": 74239922.0, + "step": 9145 + }, + { + "entropy": 0.17044782042503356, + "epoch": 5.163656884875847, + "grad_norm": 2.1400983333587646, + "learning_rate": 4.416428201480409e-06, + "loss": 0.0741, + "mean_token_accuracy": 0.9771974563598633, + "num_tokens": 74280490.0, + "step": 9150 + }, + { + "entropy": 0.17132185995578766, + "epoch": 5.1664785553047405, + "grad_norm": 1.5042531490325928, + "learning_rate": 4.415825573146931e-06, + "loss": 0.066, + "mean_token_accuracy": 0.9790427088737488, + "num_tokens": 74321233.0, + "step": 9155 + }, + { + "entropy": 0.1608135759830475, + "epoch": 5.169300225733634, + "grad_norm": 1.817196011543274, + "learning_rate": 4.415222691230602e-06, + "loss": 0.0584, + "mean_token_accuracy": 0.9813137888908386, + "num_tokens": 74361867.0, + "step": 9160 + }, + { + "entropy": 0.19295158088207245, + "epoch": 5.172121896162528, + "grad_norm": 2.3909380435943604, + "learning_rate": 4.414619555849857e-06, + "loss": 0.0688, + "mean_token_accuracy": 0.9787463188171387, + "num_tokens": 74402471.0, + "step": 9165 + }, + { + "entropy": 0.1939996987581253, + "epoch": 5.1749435665914225, + "grad_norm": 2.0213077068328857, + "learning_rate": 4.414016167123183e-06, + "loss": 0.0715, + "mean_token_accuracy": 0.978550124168396, + "num_tokens": 74442930.0, + "step": 9170 + }, + { + "entropy": 0.18027763068675995, + "epoch": 5.177765237020316, + "grad_norm": 1.568903923034668, + "learning_rate": 4.413412525169115e-06, + "loss": 0.0628, + "mean_token_accuracy": 0.9806607604026795, + "num_tokens": 74483369.0, + "step": 9175 + }, + { + "entropy": 0.17010557353496553, + "epoch": 5.18058690744921, + "grad_norm": 2.0850884914398193, + "learning_rate": 4.4128086301062405e-06, + "loss": 0.0652, + "mean_token_accuracy": 0.9795267939567566, + "num_tokens": 74523998.0, + "step": 9180 + }, + { + "entropy": 0.1618386447429657, + "epoch": 5.183408577878104, + "grad_norm": 2.156888484954834, + "learning_rate": 4.412204482053191e-06, + "loss": 0.0653, + "mean_token_accuracy": 0.9797653436660767, + "num_tokens": 74564676.0, + "step": 9185 + }, + { + "entropy": 0.1769125282764435, + "epoch": 5.186230248306998, + "grad_norm": 1.982959270477295, + "learning_rate": 4.411600081128655e-06, + "loss": 0.0692, + "mean_token_accuracy": 0.9781957626342773, + "num_tokens": 74605459.0, + "step": 9190 + }, + { + "entropy": 0.1831894338130951, + "epoch": 5.189051918735892, + "grad_norm": 2.3006227016448975, + "learning_rate": 4.410995427451365e-06, + "loss": 0.0659, + "mean_token_accuracy": 0.9792438864707946, + "num_tokens": 74646245.0, + "step": 9195 + }, + { + "entropy": 0.16881802082061767, + "epoch": 5.191873589164786, + "grad_norm": 1.7162789106369019, + "learning_rate": 4.410390521140107e-06, + "loss": 0.0598, + "mean_token_accuracy": 0.9816855788230896, + "num_tokens": 74686744.0, + "step": 9200 + }, + { + "entropy": 0.17817852199077605, + "epoch": 5.194695259593679, + "grad_norm": 1.815919041633606, + "learning_rate": 4.409785362313714e-06, + "loss": 0.0659, + "mean_token_accuracy": 0.9804519295692444, + "num_tokens": 74727229.0, + "step": 9205 + }, + { + "entropy": 0.16955990195274354, + "epoch": 5.197516930022573, + "grad_norm": 1.5127193927764893, + "learning_rate": 4.409179951091069e-06, + "loss": 0.0704, + "mean_token_accuracy": 0.9778647541999816, + "num_tokens": 74767941.0, + "step": 9210 + }, + { + "entropy": 0.17718103229999543, + "epoch": 5.200338600451468, + "grad_norm": 1.9324251413345337, + "learning_rate": 4.408574287591105e-06, + "loss": 0.0684, + "mean_token_accuracy": 0.9797194123268127, + "num_tokens": 74808629.0, + "step": 9215 + }, + { + "entropy": 0.18118757009506226, + "epoch": 5.203160270880361, + "grad_norm": 1.6411625146865845, + "learning_rate": 4.407968371932807e-06, + "loss": 0.0668, + "mean_token_accuracy": 0.9800692081451416, + "num_tokens": 74849290.0, + "step": 9220 + }, + { + "entropy": 0.16765645444393157, + "epoch": 5.205981941309255, + "grad_norm": 2.054521322250366, + "learning_rate": 4.407362204235205e-06, + "loss": 0.0622, + "mean_token_accuracy": 0.9807947397232055, + "num_tokens": 74889988.0, + "step": 9225 + }, + { + "entropy": 0.18616848886013032, + "epoch": 5.208803611738149, + "grad_norm": 2.160146474838257, + "learning_rate": 4.40675578461738e-06, + "loss": 0.0672, + "mean_token_accuracy": 0.9792033910751343, + "num_tokens": 74930594.0, + "step": 9230 + }, + { + "entropy": 0.17592273354530336, + "epoch": 5.211625282167043, + "grad_norm": 2.01505708694458, + "learning_rate": 4.4061491131984655e-06, + "loss": 0.0708, + "mean_token_accuracy": 0.9776349067687988, + "num_tokens": 74971295.0, + "step": 9235 + }, + { + "entropy": 0.16667756140232087, + "epoch": 5.214446952595937, + "grad_norm": 1.9631502628326416, + "learning_rate": 4.405542190097641e-06, + "loss": 0.0639, + "mean_token_accuracy": 0.9816940546035766, + "num_tokens": 75011986.0, + "step": 9240 + }, + { + "entropy": 0.17029538750648499, + "epoch": 5.217268623024831, + "grad_norm": 1.7982895374298096, + "learning_rate": 4.404935015434138e-06, + "loss": 0.0641, + "mean_token_accuracy": 0.9802758932113648, + "num_tokens": 75052805.0, + "step": 9245 + }, + { + "entropy": 0.172264164686203, + "epoch": 5.220090293453724, + "grad_norm": 2.2566871643066406, + "learning_rate": 4.404327589327234e-06, + "loss": 0.0668, + "mean_token_accuracy": 0.9802647709846497, + "num_tokens": 75093268.0, + "step": 9250 + }, + { + "entropy": 0.16985282599925994, + "epoch": 5.222911963882619, + "grad_norm": 1.5910284519195557, + "learning_rate": 4.403719911896258e-06, + "loss": 0.0634, + "mean_token_accuracy": 0.9810275197029114, + "num_tokens": 75134071.0, + "step": 9255 + }, + { + "entropy": 0.19104610085487367, + "epoch": 5.225733634311513, + "grad_norm": 2.561847448348999, + "learning_rate": 4.40311198326059e-06, + "loss": 0.0704, + "mean_token_accuracy": 0.9786494374275208, + "num_tokens": 75174864.0, + "step": 9260 + }, + { + "entropy": 0.19318519532680511, + "epoch": 5.228555304740406, + "grad_norm": 1.789509654045105, + "learning_rate": 4.402503803539656e-06, + "loss": 0.0748, + "mean_token_accuracy": 0.9768137574195862, + "num_tokens": 75215612.0, + "step": 9265 + }, + { + "entropy": 0.1831961005926132, + "epoch": 5.2313769751693, + "grad_norm": 1.7263920307159424, + "learning_rate": 4.401895372852935e-06, + "loss": 0.0658, + "mean_token_accuracy": 0.9797954320907593, + "num_tokens": 75255813.0, + "step": 9270 + }, + { + "entropy": 0.16736743450164795, + "epoch": 5.234198645598195, + "grad_norm": 1.8194705247879028, + "learning_rate": 4.401286691319951e-06, + "loss": 0.0677, + "mean_token_accuracy": 0.9790467739105224, + "num_tokens": 75296532.0, + "step": 9275 + }, + { + "entropy": 0.17123483419418334, + "epoch": 5.237020316027088, + "grad_norm": 1.8370773792266846, + "learning_rate": 4.40067775906028e-06, + "loss": 0.0634, + "mean_token_accuracy": 0.9816179037094116, + "num_tokens": 75337252.0, + "step": 9280 + }, + { + "entropy": 0.18256573975086213, + "epoch": 5.239841986455982, + "grad_norm": 1.8240233659744263, + "learning_rate": 4.400068576193549e-06, + "loss": 0.0654, + "mean_token_accuracy": 0.9805835843086242, + "num_tokens": 75377833.0, + "step": 9285 + }, + { + "entropy": 0.19250957071781158, + "epoch": 5.242663656884876, + "grad_norm": 1.8019994497299194, + "learning_rate": 4.399459142839429e-06, + "loss": 0.0775, + "mean_token_accuracy": 0.9766400456428528, + "num_tokens": 75418751.0, + "step": 9290 + }, + { + "entropy": 0.204935023188591, + "epoch": 5.245485327313769, + "grad_norm": 2.1010115146636963, + "learning_rate": 4.398849459117645e-06, + "loss": 0.0687, + "mean_token_accuracy": 0.9798161029815674, + "num_tokens": 75459490.0, + "step": 9295 + }, + { + "entropy": 0.1841699182987213, + "epoch": 5.248306997742664, + "grad_norm": 1.935957431793213, + "learning_rate": 4.3982395251479705e-06, + "loss": 0.066, + "mean_token_accuracy": 0.9796889901161194, + "num_tokens": 75500096.0, + "step": 9300 + }, + { + "entropy": 0.16842667162418365, + "epoch": 5.251128668171558, + "grad_norm": 2.501762866973877, + "learning_rate": 4.3976293410502245e-06, + "loss": 0.0566, + "mean_token_accuracy": 0.9825867414474487, + "num_tokens": 75540820.0, + "step": 9305 + }, + { + "entropy": 0.18842605650424957, + "epoch": 5.253950338600451, + "grad_norm": 1.8708136081695557, + "learning_rate": 4.397018906944279e-06, + "loss": 0.0661, + "mean_token_accuracy": 0.9798398852348328, + "num_tokens": 75581478.0, + "step": 9310 + }, + { + "entropy": 0.17341751158237456, + "epoch": 5.256772009029345, + "grad_norm": 1.717519998550415, + "learning_rate": 4.3964082229500545e-06, + "loss": 0.0754, + "mean_token_accuracy": 0.977244210243225, + "num_tokens": 75622057.0, + "step": 9315 + }, + { + "entropy": 0.17311387956142427, + "epoch": 5.25959367945824, + "grad_norm": 2.0607640743255615, + "learning_rate": 4.39579728918752e-06, + "loss": 0.0712, + "mean_token_accuracy": 0.977792501449585, + "num_tokens": 75662674.0, + "step": 9320 + }, + { + "entropy": 0.17308418452739716, + "epoch": 5.262415349887133, + "grad_norm": 1.5914183855056763, + "learning_rate": 4.395186105776691e-06, + "loss": 0.0619, + "mean_token_accuracy": 0.9809646487236023, + "num_tokens": 75703277.0, + "step": 9325 + }, + { + "entropy": 0.15740045607089997, + "epoch": 5.265237020316027, + "grad_norm": 2.155003547668457, + "learning_rate": 4.394574672837637e-06, + "loss": 0.0599, + "mean_token_accuracy": 0.9820808887481689, + "num_tokens": 75743221.0, + "step": 9330 + }, + { + "entropy": 0.17217525243759155, + "epoch": 5.268058690744921, + "grad_norm": 2.087221384048462, + "learning_rate": 4.393962990490475e-06, + "loss": 0.0632, + "mean_token_accuracy": 0.9806132674217224, + "num_tokens": 75783998.0, + "step": 9335 + }, + { + "entropy": 0.1740059047937393, + "epoch": 5.270880361173815, + "grad_norm": 2.0336315631866455, + "learning_rate": 4.393351058855366e-06, + "loss": 0.0739, + "mean_token_accuracy": 0.9778852939605713, + "num_tokens": 75824530.0, + "step": 9340 + }, + { + "entropy": 0.19505321085453034, + "epoch": 5.273702031602709, + "grad_norm": 2.201986074447632, + "learning_rate": 4.392738878052528e-06, + "loss": 0.071, + "mean_token_accuracy": 0.9772972106933594, + "num_tokens": 75865197.0, + "step": 9345 + }, + { + "entropy": 0.17578763961791993, + "epoch": 5.276523702031603, + "grad_norm": 1.827558159828186, + "learning_rate": 4.392126448202223e-06, + "loss": 0.0617, + "mean_token_accuracy": 0.9813929557800293, + "num_tokens": 75905554.0, + "step": 9350 + }, + { + "entropy": 0.16351258158683776, + "epoch": 5.279345372460496, + "grad_norm": 2.258183479309082, + "learning_rate": 4.391513769424762e-06, + "loss": 0.0643, + "mean_token_accuracy": 0.9805266618728637, + "num_tokens": 75946320.0, + "step": 9355 + }, + { + "entropy": 0.17927370667457582, + "epoch": 5.282167042889391, + "grad_norm": 2.4332005977630615, + "learning_rate": 4.390900841840506e-06, + "loss": 0.0721, + "mean_token_accuracy": 0.9778699398040771, + "num_tokens": 75987141.0, + "step": 9360 + }, + { + "entropy": 0.16708399951457978, + "epoch": 5.284988713318285, + "grad_norm": 1.8501638174057007, + "learning_rate": 4.3902876655698666e-06, + "loss": 0.0672, + "mean_token_accuracy": 0.978826928138733, + "num_tokens": 76027705.0, + "step": 9365 + }, + { + "entropy": 0.18877400755882262, + "epoch": 5.287810383747178, + "grad_norm": 1.798592448234558, + "learning_rate": 4.3896742407332995e-06, + "loss": 0.0649, + "mean_token_accuracy": 0.9791201710700989, + "num_tokens": 76068472.0, + "step": 9370 + }, + { + "entropy": 0.1839359939098358, + "epoch": 5.290632054176072, + "grad_norm": 2.170732259750366, + "learning_rate": 4.389060567451313e-06, + "loss": 0.0688, + "mean_token_accuracy": 0.9787888526916504, + "num_tokens": 76109233.0, + "step": 9375 + }, + { + "entropy": 0.1790493279695511, + "epoch": 5.293453724604966, + "grad_norm": 1.9541733264923096, + "learning_rate": 4.388446645844465e-06, + "loss": 0.0708, + "mean_token_accuracy": 0.9779038906097413, + "num_tokens": 76149974.0, + "step": 9380 + }, + { + "entropy": 0.17843609154224396, + "epoch": 5.29627539503386, + "grad_norm": 1.8375680446624756, + "learning_rate": 4.387832476033358e-06, + "loss": 0.0692, + "mean_token_accuracy": 0.9785267472267151, + "num_tokens": 76190641.0, + "step": 9385 + }, + { + "entropy": 0.17534771859645842, + "epoch": 5.299097065462754, + "grad_norm": 2.224586009979248, + "learning_rate": 4.3872180581386485e-06, + "loss": 0.0762, + "mean_token_accuracy": 0.9763685941696167, + "num_tokens": 76231345.0, + "step": 9390 + }, + { + "entropy": 0.17150465846061708, + "epoch": 5.301918735891648, + "grad_norm": 2.095118999481201, + "learning_rate": 4.3866033922810355e-06, + "loss": 0.0719, + "mean_token_accuracy": 0.9783193111419678, + "num_tokens": 76272069.0, + "step": 9395 + }, + { + "entropy": 0.18151159584522247, + "epoch": 5.3047404063205414, + "grad_norm": 2.1895596981048584, + "learning_rate": 4.385988478581274e-06, + "loss": 0.0718, + "mean_token_accuracy": 0.9773975610733032, + "num_tokens": 76312608.0, + "step": 9400 + }, + { + "entropy": 0.19453785419464112, + "epoch": 5.307562076749436, + "grad_norm": 2.0922398567199707, + "learning_rate": 4.38537331716016e-06, + "loss": 0.0709, + "mean_token_accuracy": 0.9785321354866028, + "num_tokens": 76353353.0, + "step": 9405 + }, + { + "entropy": 0.18575883209705352, + "epoch": 5.31038374717833, + "grad_norm": 3.6228082180023193, + "learning_rate": 4.384757908138545e-06, + "loss": 0.0727, + "mean_token_accuracy": 0.9786053776741028, + "num_tokens": 76394028.0, + "step": 9410 + }, + { + "entropy": 0.18422328531742097, + "epoch": 5.313205417607223, + "grad_norm": 1.974879503250122, + "learning_rate": 4.3841422516373255e-06, + "loss": 0.0632, + "mean_token_accuracy": 0.9804863810539246, + "num_tokens": 76434694.0, + "step": 9415 + }, + { + "entropy": 0.17816999256610871, + "epoch": 5.316027088036117, + "grad_norm": 1.6818512678146362, + "learning_rate": 4.383526347777446e-06, + "loss": 0.0684, + "mean_token_accuracy": 0.9786510825157165, + "num_tokens": 76475096.0, + "step": 9420 + }, + { + "entropy": 0.18150631487369537, + "epoch": 5.318848758465011, + "grad_norm": 1.903784990310669, + "learning_rate": 4.3829101966799025e-06, + "loss": 0.0732, + "mean_token_accuracy": 0.9780114173889161, + "num_tokens": 76515735.0, + "step": 9425 + }, + { + "entropy": 0.17466551959514617, + "epoch": 5.321670428893905, + "grad_norm": 1.8316190242767334, + "learning_rate": 4.382293798465738e-06, + "loss": 0.0729, + "mean_token_accuracy": 0.9783891916275025, + "num_tokens": 76556187.0, + "step": 9430 + }, + { + "entropy": 0.1856124997138977, + "epoch": 5.324492099322799, + "grad_norm": 2.0181853771209717, + "learning_rate": 4.381677153256042e-06, + "loss": 0.077, + "mean_token_accuracy": 0.9763926148414612, + "num_tokens": 76596840.0, + "step": 9435 + }, + { + "entropy": 0.16592672169208528, + "epoch": 5.327313769751693, + "grad_norm": 2.4706497192382812, + "learning_rate": 4.381060261171956e-06, + "loss": 0.0765, + "mean_token_accuracy": 0.9765696048736572, + "num_tokens": 76637424.0, + "step": 9440 + }, + { + "entropy": 0.17519499063491822, + "epoch": 5.3301354401805865, + "grad_norm": 1.9613145589828491, + "learning_rate": 4.38044312233467e-06, + "loss": 0.0649, + "mean_token_accuracy": 0.9802795290946961, + "num_tokens": 76678086.0, + "step": 9445 + }, + { + "entropy": 0.19584991335868834, + "epoch": 5.332957110609481, + "grad_norm": 1.8354933261871338, + "learning_rate": 4.379825736865419e-06, + "loss": 0.068, + "mean_token_accuracy": 0.9790003061294555, + "num_tokens": 76718348.0, + "step": 9450 + }, + { + "entropy": 0.17431385517120362, + "epoch": 5.335778781038375, + "grad_norm": 2.1092381477355957, + "learning_rate": 4.3792081048854875e-06, + "loss": 0.072, + "mean_token_accuracy": 0.9774962902069092, + "num_tokens": 76759148.0, + "step": 9455 + }, + { + "entropy": 0.1774332642555237, + "epoch": 5.3386004514672685, + "grad_norm": 2.28005313873291, + "learning_rate": 4.378590226516211e-06, + "loss": 0.0746, + "mean_token_accuracy": 0.9770366787910462, + "num_tokens": 76799597.0, + "step": 9460 + }, + { + "entropy": 0.17988061010837555, + "epoch": 5.341422121896162, + "grad_norm": 2.3315978050231934, + "learning_rate": 4.3779721018789735e-06, + "loss": 0.0781, + "mean_token_accuracy": 0.9758471369743347, + "num_tokens": 76840462.0, + "step": 9465 + }, + { + "entropy": 0.16905871629714966, + "epoch": 5.344243792325057, + "grad_norm": 2.204232931137085, + "learning_rate": 4.377353731095202e-06, + "loss": 0.0604, + "mean_token_accuracy": 0.9812986135482789, + "num_tokens": 76881153.0, + "step": 9470 + }, + { + "entropy": 0.17078822553157808, + "epoch": 5.3470654627539504, + "grad_norm": 1.9415490627288818, + "learning_rate": 4.376735114286378e-06, + "loss": 0.0697, + "mean_token_accuracy": 0.9775596261024475, + "num_tokens": 76922045.0, + "step": 9475 + }, + { + "entropy": 0.17137598097324372, + "epoch": 5.349887133182844, + "grad_norm": 1.894740343093872, + "learning_rate": 4.3761162515740276e-06, + "loss": 0.0631, + "mean_token_accuracy": 0.9806401252746582, + "num_tokens": 76962699.0, + "step": 9480 + }, + { + "entropy": 0.1780911296606064, + "epoch": 5.352708803611738, + "grad_norm": 1.8693207502365112, + "learning_rate": 4.375497143079726e-06, + "loss": 0.0728, + "mean_token_accuracy": 0.9785822868347168, + "num_tokens": 77003042.0, + "step": 9485 + }, + { + "entropy": 0.18042166531085968, + "epoch": 5.355530474040632, + "grad_norm": 1.8227202892303467, + "learning_rate": 4.3748777889250995e-06, + "loss": 0.0743, + "mean_token_accuracy": 0.9775307297706604, + "num_tokens": 77043699.0, + "step": 9490 + }, + { + "entropy": 0.1943429082632065, + "epoch": 5.358352144469526, + "grad_norm": 1.9916659593582153, + "learning_rate": 4.374258189231818e-06, + "loss": 0.0667, + "mean_token_accuracy": 0.9789568185806274, + "num_tokens": 77084496.0, + "step": 9495 + }, + { + "entropy": 0.19134209752082826, + "epoch": 5.36117381489842, + "grad_norm": 2.394291877746582, + "learning_rate": 4.3736383441216036e-06, + "loss": 0.0672, + "mean_token_accuracy": 0.9791797280311585, + "num_tokens": 77124779.0, + "step": 9500 + }, + { + "epoch": 5.36117381489842, + "eval_entropy": 0.22530123591423035, + "eval_loss": 0.03946574404835701, + "eval_mean_token_accuracy": 0.9888846278190613, + "eval_num_tokens": 77124779.0, + "eval_runtime": 0.1643, + "eval_samples_per_second": 24.339, + "eval_steps_per_second": 6.085, + "step": 9500 + }, + { + "entropy": 0.19695676267147064, + "epoch": 5.3639954853273135, + "grad_norm": 1.9585169553756714, + "learning_rate": 4.3730182537162235e-06, + "loss": 0.0782, + "mean_token_accuracy": 0.9759560108184815, + "num_tokens": 77165622.0, + "step": 9505 + }, + { + "entropy": 0.18560438454151154, + "epoch": 5.366817155756207, + "grad_norm": 2.292339563369751, + "learning_rate": 4.3723979181374964e-06, + "loss": 0.0743, + "mean_token_accuracy": 0.9771399736404419, + "num_tokens": 77206242.0, + "step": 9510 + }, + { + "entropy": 0.17267344892024994, + "epoch": 5.369638826185102, + "grad_norm": 2.0678932666778564, + "learning_rate": 4.371777337507285e-06, + "loss": 0.0692, + "mean_token_accuracy": 0.979387903213501, + "num_tokens": 77246858.0, + "step": 9515 + }, + { + "entropy": 0.18678390383720397, + "epoch": 5.3724604966139955, + "grad_norm": 2.482128858566284, + "learning_rate": 4.371156511947504e-06, + "loss": 0.0713, + "mean_token_accuracy": 0.9776126742362976, + "num_tokens": 77287179.0, + "step": 9520 + }, + { + "entropy": 0.16702256202697754, + "epoch": 5.375282167042889, + "grad_norm": 2.2040395736694336, + "learning_rate": 4.370535441580114e-06, + "loss": 0.0623, + "mean_token_accuracy": 0.981655502319336, + "num_tokens": 77327907.0, + "step": 9525 + }, + { + "entropy": 0.16512243151664735, + "epoch": 5.378103837471783, + "grad_norm": 1.9599778652191162, + "learning_rate": 4.369914126527125e-06, + "loss": 0.0643, + "mean_token_accuracy": 0.9801242351531982, + "num_tokens": 77368670.0, + "step": 9530 + }, + { + "entropy": 0.17791537344455718, + "epoch": 5.3809255079006775, + "grad_norm": 1.9389322996139526, + "learning_rate": 4.369292566910594e-06, + "loss": 0.0642, + "mean_token_accuracy": 0.9804812669754028, + "num_tokens": 77409415.0, + "step": 9535 + }, + { + "entropy": 0.18755693435668946, + "epoch": 5.383747178329571, + "grad_norm": 1.7628971338272095, + "learning_rate": 4.368670762852626e-06, + "loss": 0.0655, + "mean_token_accuracy": 0.9798348426818848, + "num_tokens": 77450190.0, + "step": 9540 + }, + { + "entropy": 0.1958528518676758, + "epoch": 5.386568848758465, + "grad_norm": 2.0312836170196533, + "learning_rate": 4.368048714475375e-06, + "loss": 0.0711, + "mean_token_accuracy": 0.9780890107154846, + "num_tokens": 77490984.0, + "step": 9545 + }, + { + "entropy": 0.1894278347492218, + "epoch": 5.389390519187359, + "grad_norm": 2.1138522624969482, + "learning_rate": 4.367426421901042e-06, + "loss": 0.0687, + "mean_token_accuracy": 0.9790720820426941, + "num_tokens": 77531707.0, + "step": 9550 + }, + { + "entropy": 0.18264504075050353, + "epoch": 5.392212189616253, + "grad_norm": 2.236631155014038, + "learning_rate": 4.366803885251879e-06, + "loss": 0.068, + "mean_token_accuracy": 0.9791576147079468, + "num_tokens": 77572138.0, + "step": 9555 + }, + { + "entropy": 0.17964340448379518, + "epoch": 5.395033860045147, + "grad_norm": 1.9369723796844482, + "learning_rate": 4.366181104650179e-06, + "loss": 0.0726, + "mean_token_accuracy": 0.978190791606903, + "num_tokens": 77612964.0, + "step": 9560 + }, + { + "entropy": 0.19620463848114014, + "epoch": 5.3978555304740405, + "grad_norm": 2.114414930343628, + "learning_rate": 4.36555808021829e-06, + "loss": 0.0781, + "mean_token_accuracy": 0.975774621963501, + "num_tokens": 77653620.0, + "step": 9565 + }, + { + "entropy": 0.18174751698970795, + "epoch": 5.400677200902934, + "grad_norm": 2.574272394180298, + "learning_rate": 4.364934812078606e-06, + "loss": 0.071, + "mean_token_accuracy": 0.9776658058166504, + "num_tokens": 77694231.0, + "step": 9570 + }, + { + "entropy": 0.19385533034801483, + "epoch": 5.403498871331829, + "grad_norm": 1.886120319366455, + "learning_rate": 4.364311300353567e-06, + "loss": 0.0706, + "mean_token_accuracy": 0.9778653621673584, + "num_tokens": 77734981.0, + "step": 9575 + }, + { + "entropy": 0.1768470048904419, + "epoch": 5.4063205417607225, + "grad_norm": 1.6945990324020386, + "learning_rate": 4.363687545165661e-06, + "loss": 0.0748, + "mean_token_accuracy": 0.9762227416038514, + "num_tokens": 77775525.0, + "step": 9580 + }, + { + "entropy": 0.18447025418281554, + "epoch": 5.409142212189616, + "grad_norm": 2.226775646209717, + "learning_rate": 4.363063546637426e-06, + "loss": 0.0772, + "mean_token_accuracy": 0.9765905261039733, + "num_tokens": 77816146.0, + "step": 9585 + }, + { + "entropy": 0.20254374146461487, + "epoch": 5.41196388261851, + "grad_norm": 1.4906258583068848, + "learning_rate": 4.3624393048914465e-06, + "loss": 0.0672, + "mean_token_accuracy": 0.9796987056732178, + "num_tokens": 77856760.0, + "step": 9590 + }, + { + "entropy": 0.20228754878044128, + "epoch": 5.414785553047404, + "grad_norm": 2.128209114074707, + "learning_rate": 4.361814820050355e-06, + "loss": 0.074, + "mean_token_accuracy": 0.9770419836044312, + "num_tokens": 77897182.0, + "step": 9595 + }, + { + "entropy": 0.18532134592533112, + "epoch": 5.417607223476298, + "grad_norm": 1.994026780128479, + "learning_rate": 4.36119009223683e-06, + "loss": 0.0698, + "mean_token_accuracy": 0.9782418012619019, + "num_tokens": 77937769.0, + "step": 9600 + }, + { + "entropy": 0.17942964434623718, + "epoch": 5.420428893905192, + "grad_norm": 1.7702689170837402, + "learning_rate": 4.3605651215736025e-06, + "loss": 0.0697, + "mean_token_accuracy": 0.9785507321357727, + "num_tokens": 77978608.0, + "step": 9605 + }, + { + "entropy": 0.18152170181274413, + "epoch": 5.423250564334086, + "grad_norm": 2.2829222679138184, + "learning_rate": 4.359939908183445e-06, + "loss": 0.0748, + "mean_token_accuracy": 0.9763474941253663, + "num_tokens": 78019201.0, + "step": 9610 + }, + { + "entropy": 0.16868273317813873, + "epoch": 5.426072234762979, + "grad_norm": 1.9242933988571167, + "learning_rate": 4.3593144521891825e-06, + "loss": 0.0684, + "mean_token_accuracy": 0.9792532801628113, + "num_tokens": 78059853.0, + "step": 9615 + }, + { + "entropy": 0.17837615609169005, + "epoch": 5.428893905191874, + "grad_norm": 1.9785670042037964, + "learning_rate": 4.358688753713685e-06, + "loss": 0.063, + "mean_token_accuracy": 0.9807116985321045, + "num_tokens": 78100351.0, + "step": 9620 + }, + { + "entropy": 0.16789911687374115, + "epoch": 5.431715575620768, + "grad_norm": 1.8135242462158203, + "learning_rate": 4.358062812879873e-06, + "loss": 0.0667, + "mean_token_accuracy": 0.979233467578888, + "num_tokens": 78140931.0, + "step": 9625 + }, + { + "entropy": 0.16792088150978088, + "epoch": 5.434537246049661, + "grad_norm": 2.1166341304779053, + "learning_rate": 4.357436629810709e-06, + "loss": 0.0682, + "mean_token_accuracy": 0.9792353987693787, + "num_tokens": 78181470.0, + "step": 9630 + }, + { + "entropy": 0.18325534760951995, + "epoch": 5.437358916478555, + "grad_norm": 1.728598952293396, + "learning_rate": 4.35681020462921e-06, + "loss": 0.0747, + "mean_token_accuracy": 0.977009117603302, + "num_tokens": 78222347.0, + "step": 9635 + }, + { + "entropy": 0.18267615139484406, + "epoch": 5.4401805869074495, + "grad_norm": 1.7328113317489624, + "learning_rate": 4.356183537458436e-06, + "loss": 0.0722, + "mean_token_accuracy": 0.9778196811676025, + "num_tokens": 78262982.0, + "step": 9640 + }, + { + "entropy": 0.180474916100502, + "epoch": 5.443002257336343, + "grad_norm": 1.5890209674835205, + "learning_rate": 4.3555566284214955e-06, + "loss": 0.0663, + "mean_token_accuracy": 0.9788239240646363, + "num_tokens": 78303575.0, + "step": 9645 + }, + { + "entropy": 0.20339135229587554, + "epoch": 5.445823927765237, + "grad_norm": 1.9945831298828125, + "learning_rate": 4.354929477641547e-06, + "loss": 0.0792, + "mean_token_accuracy": 0.9748901128768921, + "num_tokens": 78343987.0, + "step": 9650 + }, + { + "entropy": 0.18225070238113403, + "epoch": 5.448645598194131, + "grad_norm": 1.9023123979568481, + "learning_rate": 4.354302085241791e-06, + "loss": 0.072, + "mean_token_accuracy": 0.9779296040534973, + "num_tokens": 78384670.0, + "step": 9655 + }, + { + "entropy": 0.17491951882839202, + "epoch": 5.451467268623025, + "grad_norm": 1.9437018632888794, + "learning_rate": 4.353674451345481e-06, + "loss": 0.0649, + "mean_token_accuracy": 0.9799412608146667, + "num_tokens": 78425444.0, + "step": 9660 + }, + { + "entropy": 0.19293942749500276, + "epoch": 5.454288939051919, + "grad_norm": 2.0638647079467773, + "learning_rate": 4.353046576075915e-06, + "loss": 0.0706, + "mean_token_accuracy": 0.9783964395523072, + "num_tokens": 78465938.0, + "step": 9665 + }, + { + "entropy": 0.17496614456176757, + "epoch": 5.457110609480813, + "grad_norm": 2.2454192638397217, + "learning_rate": 4.35241845955644e-06, + "loss": 0.0733, + "mean_token_accuracy": 0.9771980404853821, + "num_tokens": 78506288.0, + "step": 9670 + }, + { + "entropy": 0.18418402671813966, + "epoch": 5.459932279909706, + "grad_norm": 1.9852421283721924, + "learning_rate": 4.3517901019104494e-06, + "loss": 0.0624, + "mean_token_accuracy": 0.9815483212471008, + "num_tokens": 78546897.0, + "step": 9675 + }, + { + "entropy": 0.17411540746688842, + "epoch": 5.4627539503386, + "grad_norm": 2.471421957015991, + "learning_rate": 4.351161503261384e-06, + "loss": 0.0593, + "mean_token_accuracy": 0.981855833530426, + "num_tokens": 78587494.0, + "step": 9680 + }, + { + "entropy": 0.18723950386047364, + "epoch": 5.465575620767495, + "grad_norm": 2.042299509048462, + "learning_rate": 4.3505326637327315e-06, + "loss": 0.0789, + "mean_token_accuracy": 0.9755418181419373, + "num_tokens": 78628058.0, + "step": 9685 + }, + { + "entropy": 0.1705529898405075, + "epoch": 5.468397291196388, + "grad_norm": 2.377067804336548, + "learning_rate": 4.3499035834480275e-06, + "loss": 0.0714, + "mean_token_accuracy": 0.9782310485839844, + "num_tokens": 78668915.0, + "step": 9690 + }, + { + "entropy": 0.1720373123884201, + "epoch": 5.471218961625282, + "grad_norm": 2.115959405899048, + "learning_rate": 4.349274262530856e-06, + "loss": 0.0724, + "mean_token_accuracy": 0.9783347487449646, + "num_tokens": 78709535.0, + "step": 9695 + }, + { + "entropy": 0.1738339751958847, + "epoch": 5.474040632054176, + "grad_norm": 1.7836189270019531, + "learning_rate": 4.348644701104845e-06, + "loss": 0.067, + "mean_token_accuracy": 0.9795932054519654, + "num_tokens": 78750224.0, + "step": 9700 + }, + { + "entropy": 0.19387414753437043, + "epoch": 5.47686230248307, + "grad_norm": 1.597865343093872, + "learning_rate": 4.348014899293675e-06, + "loss": 0.0782, + "mean_token_accuracy": 0.9758282899856567, + "num_tokens": 78790822.0, + "step": 9705 + }, + { + "entropy": 0.1638443261384964, + "epoch": 5.479683972911964, + "grad_norm": 1.7164875268936157, + "learning_rate": 4.3473848572210685e-06, + "loss": 0.0667, + "mean_token_accuracy": 0.9790258288383484, + "num_tokens": 78831384.0, + "step": 9710 + }, + { + "entropy": 0.17148119807243348, + "epoch": 5.482505643340858, + "grad_norm": 1.7814518213272095, + "learning_rate": 4.346754575010798e-06, + "loss": 0.055, + "mean_token_accuracy": 0.9819474458694458, + "num_tokens": 78871893.0, + "step": 9715 + }, + { + "entropy": 0.1933718204498291, + "epoch": 5.485327313769751, + "grad_norm": 2.326171398162842, + "learning_rate": 4.346124052786682e-06, + "loss": 0.0807, + "mean_token_accuracy": 0.9755822420120239, + "num_tokens": 78912523.0, + "step": 9720 + }, + { + "entropy": 0.17635531425476075, + "epoch": 5.488148984198646, + "grad_norm": 1.7191340923309326, + "learning_rate": 4.3454932906725875e-06, + "loss": 0.0705, + "mean_token_accuracy": 0.9771863937377929, + "num_tokens": 78952985.0, + "step": 9725 + }, + { + "entropy": 0.17937880754470825, + "epoch": 5.49097065462754, + "grad_norm": 2.046811819076538, + "learning_rate": 4.3448622887924265e-06, + "loss": 0.0618, + "mean_token_accuracy": 0.981409227848053, + "num_tokens": 78993356.0, + "step": 9730 + }, + { + "entropy": 0.19099608957767486, + "epoch": 5.493792325056433, + "grad_norm": 2.5957789421081543, + "learning_rate": 4.3442310472701615e-06, + "loss": 0.0765, + "mean_token_accuracy": 0.9757651925086975, + "num_tokens": 79034026.0, + "step": 9735 + }, + { + "entropy": 0.1835557132959366, + "epoch": 5.496613995485327, + "grad_norm": 2.360753059387207, + "learning_rate": 4.343599566229799e-06, + "loss": 0.0732, + "mean_token_accuracy": 0.9772834777832031, + "num_tokens": 79074662.0, + "step": 9740 + }, + { + "entropy": 0.17511331737041474, + "epoch": 5.499435665914222, + "grad_norm": 2.0439140796661377, + "learning_rate": 4.342967845795393e-06, + "loss": 0.0642, + "mean_token_accuracy": 0.9799216628074646, + "num_tokens": 79115192.0, + "step": 9745 + }, + { + "entropy": 0.18398658633232118, + "epoch": 5.502257336343115, + "grad_norm": 1.601022481918335, + "learning_rate": 4.342335886091045e-06, + "loss": 0.0729, + "mean_token_accuracy": 0.9781609892845153, + "num_tokens": 79155981.0, + "step": 9750 + }, + { + "entropy": 0.18661101460456847, + "epoch": 5.505079006772009, + "grad_norm": 2.0731983184814453, + "learning_rate": 4.341703687240903e-06, + "loss": 0.0754, + "mean_token_accuracy": 0.9759285807609558, + "num_tokens": 79196610.0, + "step": 9755 + }, + { + "entropy": 0.19010823965072632, + "epoch": 5.507900677200903, + "grad_norm": 1.9401448965072632, + "learning_rate": 4.341071249369164e-06, + "loss": 0.0742, + "mean_token_accuracy": 0.9756409406661988, + "num_tokens": 79237061.0, + "step": 9760 + }, + { + "entropy": 0.17404641211032867, + "epoch": 5.510722347629796, + "grad_norm": 2.2344675064086914, + "learning_rate": 4.340438572600072e-06, + "loss": 0.0677, + "mean_token_accuracy": 0.9782544851303101, + "num_tokens": 79277948.0, + "step": 9765 + }, + { + "entropy": 0.1966948091983795, + "epoch": 5.513544018058691, + "grad_norm": 2.11859130859375, + "learning_rate": 4.3398056570579125e-06, + "loss": 0.0865, + "mean_token_accuracy": 0.9740624785423279, + "num_tokens": 79317957.0, + "step": 9770 + }, + { + "entropy": 0.17077078223228453, + "epoch": 5.516365688487585, + "grad_norm": 1.9903873205184937, + "learning_rate": 4.339172502867023e-06, + "loss": 0.0667, + "mean_token_accuracy": 0.9797818660736084, + "num_tokens": 79358659.0, + "step": 9775 + }, + { + "entropy": 0.17696953415870667, + "epoch": 5.519187358916478, + "grad_norm": 2.100498676300049, + "learning_rate": 4.338539110151789e-06, + "loss": 0.0834, + "mean_token_accuracy": 0.974258816242218, + "num_tokens": 79399262.0, + "step": 9780 + }, + { + "entropy": 0.17644532322883605, + "epoch": 5.522009029345372, + "grad_norm": 2.0375876426696777, + "learning_rate": 4.337905479036639e-06, + "loss": 0.0722, + "mean_token_accuracy": 0.9780573964118957, + "num_tokens": 79439526.0, + "step": 9785 + }, + { + "entropy": 0.18530842363834382, + "epoch": 5.524830699774267, + "grad_norm": 2.0761780738830566, + "learning_rate": 4.33727160964605e-06, + "loss": 0.0706, + "mean_token_accuracy": 0.9781399369239807, + "num_tokens": 79480123.0, + "step": 9790 + }, + { + "entropy": 0.19907425343990326, + "epoch": 5.52765237020316, + "grad_norm": 1.8196872472763062, + "learning_rate": 4.336637502104545e-06, + "loss": 0.0821, + "mean_token_accuracy": 0.9743606328964234, + "num_tokens": 79520700.0, + "step": 9795 + }, + { + "entropy": 0.18495319783687592, + "epoch": 5.530474040632054, + "grad_norm": 1.844511866569519, + "learning_rate": 4.336003156536696e-06, + "loss": 0.0689, + "mean_token_accuracy": 0.9784881234169006, + "num_tokens": 79561218.0, + "step": 9800 + }, + { + "entropy": 0.17770458459854127, + "epoch": 5.533295711060948, + "grad_norm": 2.0217959880828857, + "learning_rate": 4.335368573067118e-06, + "loss": 0.0743, + "mean_token_accuracy": 0.9774918794631958, + "num_tokens": 79601835.0, + "step": 9805 + }, + { + "entropy": 0.18182687163352967, + "epoch": 5.536117381489842, + "grad_norm": 2.815741777420044, + "learning_rate": 4.334733751820478e-06, + "loss": 0.0754, + "mean_token_accuracy": 0.9765426874160766, + "num_tokens": 79642074.0, + "step": 9810 + }, + { + "entropy": 0.17728038728237153, + "epoch": 5.538939051918736, + "grad_norm": 1.9506757259368896, + "learning_rate": 4.334098692921484e-06, + "loss": 0.0674, + "mean_token_accuracy": 0.9793436288833618, + "num_tokens": 79682666.0, + "step": 9815 + }, + { + "entropy": 0.1820712596178055, + "epoch": 5.54176072234763, + "grad_norm": 2.2737324237823486, + "learning_rate": 4.333463396494896e-06, + "loss": 0.0664, + "mean_token_accuracy": 0.9783164978027343, + "num_tokens": 79723398.0, + "step": 9820 + }, + { + "entropy": 0.1908366858959198, + "epoch": 5.544582392776523, + "grad_norm": 1.7559250593185425, + "learning_rate": 4.332827862665515e-06, + "loss": 0.0673, + "mean_token_accuracy": 0.9796145915985107, + "num_tokens": 79764063.0, + "step": 9825 + }, + { + "entropy": 0.18218895196914672, + "epoch": 5.547404063205418, + "grad_norm": 2.057884693145752, + "learning_rate": 4.332192091558195e-06, + "loss": 0.084, + "mean_token_accuracy": 0.9729039072990417, + "num_tokens": 79804649.0, + "step": 9830 + }, + { + "entropy": 0.19951559603214264, + "epoch": 5.550225733634312, + "grad_norm": 2.015651226043701, + "learning_rate": 4.331556083297831e-06, + "loss": 0.0819, + "mean_token_accuracy": 0.9746386528015136, + "num_tokens": 79845171.0, + "step": 9835 + }, + { + "entropy": 0.17591853737831115, + "epoch": 5.553047404063205, + "grad_norm": 2.3496806621551514, + "learning_rate": 4.330919838009368e-06, + "loss": 0.0591, + "mean_token_accuracy": 0.9815265893936157, + "num_tokens": 79885183.0, + "step": 9840 + }, + { + "entropy": 0.176205512881279, + "epoch": 5.555869074492099, + "grad_norm": 2.1586620807647705, + "learning_rate": 4.330283355817796e-06, + "loss": 0.0767, + "mean_token_accuracy": 0.9763123154640198, + "num_tokens": 79925909.0, + "step": 9845 + }, + { + "entropy": 0.17635221481323243, + "epoch": 5.558690744920993, + "grad_norm": 2.0332553386688232, + "learning_rate": 4.329646636848151e-06, + "loss": 0.0669, + "mean_token_accuracy": 0.9792371988296509, + "num_tokens": 79966534.0, + "step": 9850 + }, + { + "entropy": 0.18909758031368257, + "epoch": 5.561512415349887, + "grad_norm": 1.8824989795684814, + "learning_rate": 4.3290096812255185e-06, + "loss": 0.0767, + "mean_token_accuracy": 0.976757287979126, + "num_tokens": 80007224.0, + "step": 9855 + }, + { + "entropy": 0.18777174949645997, + "epoch": 5.564334085778781, + "grad_norm": 2.0460257530212402, + "learning_rate": 4.328372489075028e-06, + "loss": 0.0755, + "mean_token_accuracy": 0.9778550267219543, + "num_tokens": 80047897.0, + "step": 9860 + }, + { + "entropy": 0.18754212260246278, + "epoch": 5.567155756207675, + "grad_norm": 2.3515195846557617, + "learning_rate": 4.327735060521855e-06, + "loss": 0.0802, + "mean_token_accuracy": 0.9749362111091614, + "num_tokens": 80088428.0, + "step": 9865 + }, + { + "entropy": 0.17782265245914458, + "epoch": 5.5699774266365685, + "grad_norm": 1.882233738899231, + "learning_rate": 4.3270973956912225e-06, + "loss": 0.0696, + "mean_token_accuracy": 0.9777340531349182, + "num_tokens": 80129154.0, + "step": 9870 + }, + { + "entropy": 0.16835958659648895, + "epoch": 5.572799097065463, + "grad_norm": 1.7510329484939575, + "learning_rate": 4.326459494708401e-06, + "loss": 0.0638, + "mean_token_accuracy": 0.9795340061187744, + "num_tokens": 80169798.0, + "step": 9875 + }, + { + "entropy": 0.17615444958209991, + "epoch": 5.575620767494357, + "grad_norm": 1.4721903800964355, + "learning_rate": 4.325821357698705e-06, + "loss": 0.0693, + "mean_token_accuracy": 0.9783699035644531, + "num_tokens": 80210503.0, + "step": 9880 + }, + { + "entropy": 0.1756734073162079, + "epoch": 5.5784424379232505, + "grad_norm": 2.1826069355010986, + "learning_rate": 4.325182984787499e-06, + "loss": 0.067, + "mean_token_accuracy": 0.9788246631622315, + "num_tokens": 80251068.0, + "step": 9885 + }, + { + "entropy": 0.19139577448368073, + "epoch": 5.581264108352144, + "grad_norm": 2.021207094192505, + "learning_rate": 4.324544376100188e-06, + "loss": 0.0744, + "mean_token_accuracy": 0.9779517769813537, + "num_tokens": 80291849.0, + "step": 9890 + }, + { + "entropy": 0.17277522683143615, + "epoch": 5.584085778781039, + "grad_norm": 2.1115806102752686, + "learning_rate": 4.323905531762229e-06, + "loss": 0.0694, + "mean_token_accuracy": 0.9781728863716126, + "num_tokens": 80332416.0, + "step": 9895 + }, + { + "entropy": 0.17857773303985597, + "epoch": 5.586907449209932, + "grad_norm": 1.9007774591445923, + "learning_rate": 4.323266451899122e-06, + "loss": 0.0666, + "mean_token_accuracy": 0.9787236452102661, + "num_tokens": 80373154.0, + "step": 9900 + }, + { + "entropy": 0.18921782374382018, + "epoch": 5.589729119638826, + "grad_norm": 1.874495267868042, + "learning_rate": 4.322627136636415e-06, + "loss": 0.0715, + "mean_token_accuracy": 0.9781720876693726, + "num_tokens": 80413856.0, + "step": 9905 + }, + { + "entropy": 0.18341380059719087, + "epoch": 5.59255079006772, + "grad_norm": 2.1999471187591553, + "learning_rate": 4.321987586099702e-06, + "loss": 0.0603, + "mean_token_accuracy": 0.9813642621040344, + "num_tokens": 80454655.0, + "step": 9910 + }, + { + "entropy": 0.17606990933418273, + "epoch": 5.595372460496614, + "grad_norm": 2.1019020080566406, + "learning_rate": 4.32134780041462e-06, + "loss": 0.0651, + "mean_token_accuracy": 0.9796467542648315, + "num_tokens": 80495377.0, + "step": 9915 + }, + { + "entropy": 0.18233582377433777, + "epoch": 5.598194130925508, + "grad_norm": 1.8605166673660278, + "learning_rate": 4.320707779706859e-06, + "loss": 0.0701, + "mean_token_accuracy": 0.9781240105628968, + "num_tokens": 80536019.0, + "step": 9920 + }, + { + "entropy": 0.18825688362121581, + "epoch": 5.601015801354402, + "grad_norm": 1.838043451309204, + "learning_rate": 4.320067524102149e-06, + "loss": 0.0755, + "mean_token_accuracy": 0.9771787285804748, + "num_tokens": 80576772.0, + "step": 9925 + }, + { + "entropy": 0.1714703172445297, + "epoch": 5.6038374717832955, + "grad_norm": 2.1907753944396973, + "learning_rate": 4.319427033726268e-06, + "loss": 0.0685, + "mean_token_accuracy": 0.9775472521781922, + "num_tokens": 80617419.0, + "step": 9930 + }, + { + "entropy": 0.1989262282848358, + "epoch": 5.606659142212189, + "grad_norm": 2.381535530090332, + "learning_rate": 4.3187863087050405e-06, + "loss": 0.0764, + "mean_token_accuracy": 0.9764021635055542, + "num_tokens": 80657689.0, + "step": 9935 + }, + { + "entropy": 0.1726620763540268, + "epoch": 5.609480812641084, + "grad_norm": 2.249459743499756, + "learning_rate": 4.318145349164339e-06, + "loss": 0.067, + "mean_token_accuracy": 0.9787798404693604, + "num_tokens": 80698324.0, + "step": 9940 + }, + { + "entropy": 0.17213599681854247, + "epoch": 5.6123024830699775, + "grad_norm": 1.9886410236358643, + "learning_rate": 4.3175041552300775e-06, + "loss": 0.062, + "mean_token_accuracy": 0.9811339974403381, + "num_tokens": 80738375.0, + "step": 9945 + }, + { + "entropy": 0.16617416441440583, + "epoch": 5.615124153498871, + "grad_norm": 2.2124135494232178, + "learning_rate": 4.31686272702822e-06, + "loss": 0.0702, + "mean_token_accuracy": 0.978109622001648, + "num_tokens": 80778942.0, + "step": 9950 + }, + { + "entropy": 0.1947992593050003, + "epoch": 5.617945823927765, + "grad_norm": 2.2856040000915527, + "learning_rate": 4.316221064684775e-06, + "loss": 0.0789, + "mean_token_accuracy": 0.9748259305953979, + "num_tokens": 80819710.0, + "step": 9955 + }, + { + "entropy": 0.19779794812202453, + "epoch": 5.6207674943566595, + "grad_norm": 3.436311721801758, + "learning_rate": 4.3155791683257965e-06, + "loss": 0.074, + "mean_token_accuracy": 0.9769353747367859, + "num_tokens": 80860043.0, + "step": 9960 + }, + { + "entropy": 0.18495135605335236, + "epoch": 5.623589164785553, + "grad_norm": 2.0917840003967285, + "learning_rate": 4.314937038077386e-06, + "loss": 0.0708, + "mean_token_accuracy": 0.9777469873428345, + "num_tokens": 80900666.0, + "step": 9965 + }, + { + "entropy": 0.1749154359102249, + "epoch": 5.626410835214447, + "grad_norm": 1.831464409828186, + "learning_rate": 4.314294674065689e-06, + "loss": 0.0673, + "mean_token_accuracy": 0.9792719483375549, + "num_tokens": 80941260.0, + "step": 9970 + }, + { + "entropy": 0.17395665049552916, + "epoch": 5.6292325056433405, + "grad_norm": 2.2329459190368652, + "learning_rate": 4.3136520764168996e-06, + "loss": 0.0716, + "mean_token_accuracy": 0.9781460523605346, + "num_tokens": 80981968.0, + "step": 9975 + }, + { + "entropy": 0.19316057562828065, + "epoch": 5.632054176072235, + "grad_norm": 1.6789857149124146, + "learning_rate": 4.3130092452572545e-06, + "loss": 0.0722, + "mean_token_accuracy": 0.9778452634811401, + "num_tokens": 81022614.0, + "step": 9980 + }, + { + "entropy": 0.17241209149360656, + "epoch": 5.634875846501129, + "grad_norm": 1.6188982725143433, + "learning_rate": 4.312366180713039e-06, + "loss": 0.0668, + "mean_token_accuracy": 0.9798883080482483, + "num_tokens": 81063456.0, + "step": 9985 + }, + { + "entropy": 0.18288934230804443, + "epoch": 5.6376975169300225, + "grad_norm": 12.512042999267578, + "learning_rate": 4.311722882910584e-06, + "loss": 0.0767, + "mean_token_accuracy": 0.9758876323699951, + "num_tokens": 81104260.0, + "step": 9990 + }, + { + "entropy": 0.17049558758735656, + "epoch": 5.640519187358916, + "grad_norm": 2.0652711391448975, + "learning_rate": 4.3110793519762625e-06, + "loss": 0.0753, + "mean_token_accuracy": 0.9753929495811462, + "num_tokens": 81145141.0, + "step": 9995 + }, + { + "entropy": 0.17534006536006927, + "epoch": 5.643340857787811, + "grad_norm": 1.9431458711624146, + "learning_rate": 4.3104355880365e-06, + "loss": 0.0686, + "mean_token_accuracy": 0.9783696532249451, + "num_tokens": 81185597.0, + "step": 10000 + }, + { + "epoch": 5.643340857787811, + "eval_entropy": 0.2248619794845581, + "eval_loss": 0.03424257040023804, + "eval_mean_token_accuracy": 0.9908010959625244, + "eval_num_tokens": 81185597.0, + "eval_runtime": 0.164, + "eval_samples_per_second": 24.396, + "eval_steps_per_second": 6.099, + "step": 10000 + }, + { + "entropy": 0.17620908617973327, + "epoch": 5.6461625282167045, + "grad_norm": 2.0070059299468994, + "learning_rate": 4.3097915912177615e-06, + "loss": 0.0726, + "mean_token_accuracy": 0.9765960216522217, + "num_tokens": 81225965.0, + "step": 10005 + }, + { + "entropy": 0.19689911603927612, + "epoch": 5.648984198645598, + "grad_norm": 2.1444599628448486, + "learning_rate": 4.30914736164656e-06, + "loss": 0.0715, + "mean_token_accuracy": 0.9776429057121276, + "num_tokens": 81266144.0, + "step": 10010 + }, + { + "entropy": 0.1702759474515915, + "epoch": 5.651805869074492, + "grad_norm": 1.7831538915634155, + "learning_rate": 4.308502899449456e-06, + "loss": 0.0646, + "mean_token_accuracy": 0.9793395638465882, + "num_tokens": 81306666.0, + "step": 10015 + }, + { + "entropy": 0.19171405136585234, + "epoch": 5.654627539503386, + "grad_norm": 6.789673328399658, + "learning_rate": 4.307858204753054e-06, + "loss": 0.0827, + "mean_token_accuracy": 0.9753363370895386, + "num_tokens": 81347319.0, + "step": 10020 + }, + { + "entropy": 0.18764612972736358, + "epoch": 5.65744920993228, + "grad_norm": 2.124255895614624, + "learning_rate": 4.3072132776840035e-06, + "loss": 0.0692, + "mean_token_accuracy": 0.9783503413200378, + "num_tokens": 81387613.0, + "step": 10025 + }, + { + "entropy": 0.17401687800884247, + "epoch": 5.660270880361174, + "grad_norm": 2.335118293762207, + "learning_rate": 4.306568118369001e-06, + "loss": 0.0691, + "mean_token_accuracy": 0.9781079888343811, + "num_tokens": 81428246.0, + "step": 10030 + }, + { + "entropy": 0.1741603434085846, + "epoch": 5.663092550790068, + "grad_norm": 1.9242067337036133, + "learning_rate": 4.305922726934788e-06, + "loss": 0.0657, + "mean_token_accuracy": 0.9787984848022461, + "num_tokens": 81468991.0, + "step": 10035 + }, + { + "entropy": 0.18352454900741577, + "epoch": 5.665914221218961, + "grad_norm": 2.054337501525879, + "learning_rate": 4.305277103508152e-06, + "loss": 0.0738, + "mean_token_accuracy": 0.976045835018158, + "num_tokens": 81509669.0, + "step": 10040 + }, + { + "entropy": 0.19397950172424316, + "epoch": 5.668735891647856, + "grad_norm": 2.4687507152557373, + "learning_rate": 4.304631248215925e-06, + "loss": 0.0815, + "mean_token_accuracy": 0.9752507686614991, + "num_tokens": 81550264.0, + "step": 10045 + }, + { + "entropy": 0.19177696108818054, + "epoch": 5.6715575620767495, + "grad_norm": 2.18593168258667, + "learning_rate": 4.303985161184986e-06, + "loss": 0.081, + "mean_token_accuracy": 0.9757527351379395, + "num_tokens": 81590845.0, + "step": 10050 + }, + { + "entropy": 0.19264190196990966, + "epoch": 5.674379232505643, + "grad_norm": 1.6302083730697632, + "learning_rate": 4.3033388425422595e-06, + "loss": 0.0754, + "mean_token_accuracy": 0.9771258950233459, + "num_tokens": 81631530.0, + "step": 10055 + }, + { + "entropy": 0.18041291534900666, + "epoch": 5.677200902934537, + "grad_norm": 2.0120229721069336, + "learning_rate": 4.302692292414713e-06, + "loss": 0.0701, + "mean_token_accuracy": 0.9786185026168823, + "num_tokens": 81672307.0, + "step": 10060 + }, + { + "entropy": 0.18173338174819947, + "epoch": 5.6800225733634315, + "grad_norm": 1.9405615329742432, + "learning_rate": 4.302045510929364e-06, + "loss": 0.0692, + "mean_token_accuracy": 0.9786619186401367, + "num_tokens": 81712760.0, + "step": 10065 + }, + { + "entropy": 0.18286008834838868, + "epoch": 5.682844243792325, + "grad_norm": 2.343987464904785, + "learning_rate": 4.3013984982132705e-06, + "loss": 0.0807, + "mean_token_accuracy": 0.9762983083724975, + "num_tokens": 81753384.0, + "step": 10070 + }, + { + "entropy": 0.1787573516368866, + "epoch": 5.685665914221219, + "grad_norm": 2.1654746532440186, + "learning_rate": 4.30075125439354e-06, + "loss": 0.0803, + "mean_token_accuracy": 0.9749480605125427, + "num_tokens": 81793907.0, + "step": 10075 + }, + { + "entropy": 0.1876184046268463, + "epoch": 5.688487584650113, + "grad_norm": 2.1528499126434326, + "learning_rate": 4.3001037795973225e-06, + "loss": 0.065, + "mean_token_accuracy": 0.9796617031097412, + "num_tokens": 81834372.0, + "step": 10080 + }, + { + "entropy": 0.17926348149776458, + "epoch": 5.691309255079007, + "grad_norm": 2.1884028911590576, + "learning_rate": 4.299456073951814e-06, + "loss": 0.0723, + "mean_token_accuracy": 0.9770533204078674, + "num_tokens": 81875199.0, + "step": 10085 + }, + { + "entropy": 0.17883450984954835, + "epoch": 5.694130925507901, + "grad_norm": 1.955631971359253, + "learning_rate": 4.2988081375842575e-06, + "loss": 0.0658, + "mean_token_accuracy": 0.9807510495185852, + "num_tokens": 81915920.0, + "step": 10090 + }, + { + "entropy": 0.1908187061548233, + "epoch": 5.696952595936795, + "grad_norm": 2.022860288619995, + "learning_rate": 4.29815997062194e-06, + "loss": 0.0767, + "mean_token_accuracy": 0.9755529642105103, + "num_tokens": 81956148.0, + "step": 10095 + }, + { + "entropy": 0.1840174227952957, + "epoch": 5.699774266365688, + "grad_norm": 2.5862724781036377, + "learning_rate": 4.297511573192194e-06, + "loss": 0.0737, + "mean_token_accuracy": 0.9770397782325745, + "num_tokens": 81996746.0, + "step": 10100 + }, + { + "entropy": 0.18789410591125488, + "epoch": 5.702595936794582, + "grad_norm": 2.2070839405059814, + "learning_rate": 4.296862945422396e-06, + "loss": 0.0763, + "mean_token_accuracy": 0.9756775379180909, + "num_tokens": 82037275.0, + "step": 10105 + }, + { + "entropy": 0.1947762042284012, + "epoch": 5.705417607223477, + "grad_norm": 2.0239546298980713, + "learning_rate": 4.2962140874399705e-06, + "loss": 0.0736, + "mean_token_accuracy": 0.9776275634765625, + "num_tokens": 82077888.0, + "step": 10110 + }, + { + "entropy": 0.19028544127941133, + "epoch": 5.70823927765237, + "grad_norm": 2.6127588748931885, + "learning_rate": 4.295564999372385e-06, + "loss": 0.0712, + "mean_token_accuracy": 0.97735435962677, + "num_tokens": 82118455.0, + "step": 10115 + }, + { + "entropy": 0.18263841271400452, + "epoch": 5.711060948081264, + "grad_norm": 1.8415166139602661, + "learning_rate": 4.294915681347154e-06, + "loss": 0.0719, + "mean_token_accuracy": 0.9764939785003662, + "num_tokens": 82158890.0, + "step": 10120 + }, + { + "entropy": 0.18013592660427094, + "epoch": 5.713882618510158, + "grad_norm": 2.214005470275879, + "learning_rate": 4.294266133491834e-06, + "loss": 0.0654, + "mean_token_accuracy": 0.9796643853187561, + "num_tokens": 82199368.0, + "step": 10125 + }, + { + "entropy": 0.17933290004730223, + "epoch": 5.716704288939052, + "grad_norm": 2.40313720703125, + "learning_rate": 4.293616355934032e-06, + "loss": 0.0761, + "mean_token_accuracy": 0.9756105422973633, + "num_tokens": 82240162.0, + "step": 10130 + }, + { + "entropy": 0.19547135531902313, + "epoch": 5.719525959367946, + "grad_norm": 2.1128058433532715, + "learning_rate": 4.292966348801394e-06, + "loss": 0.0715, + "mean_token_accuracy": 0.9782087802886963, + "num_tokens": 82281077.0, + "step": 10135 + }, + { + "entropy": 0.17906437516212464, + "epoch": 5.72234762979684, + "grad_norm": 2.4157004356384277, + "learning_rate": 4.292316112221615e-06, + "loss": 0.0637, + "mean_token_accuracy": 0.9807783722877502, + "num_tokens": 82321293.0, + "step": 10140 + }, + { + "entropy": 0.17458663284778594, + "epoch": 5.725169300225733, + "grad_norm": 2.1524438858032227, + "learning_rate": 4.291665646322434e-06, + "loss": 0.0691, + "mean_token_accuracy": 0.9782337784767151, + "num_tokens": 82361543.0, + "step": 10145 + }, + { + "entropy": 0.18585495054721832, + "epoch": 5.727990970654628, + "grad_norm": 2.3089470863342285, + "learning_rate": 4.2910149512316345e-06, + "loss": 0.0743, + "mean_token_accuracy": 0.976749575138092, + "num_tokens": 82402317.0, + "step": 10150 + }, + { + "entropy": 0.1775616079568863, + "epoch": 5.730812641083522, + "grad_norm": 2.647921323776245, + "learning_rate": 4.290364027077047e-06, + "loss": 0.0775, + "mean_token_accuracy": 0.975775396823883, + "num_tokens": 82443034.0, + "step": 10155 + }, + { + "entropy": 0.16766503751277922, + "epoch": 5.733634311512415, + "grad_norm": 2.180415391921997, + "learning_rate": 4.2897128739865446e-06, + "loss": 0.0681, + "mean_token_accuracy": 0.9786939501762391, + "num_tokens": 82483796.0, + "step": 10160 + }, + { + "entropy": 0.20999548733234405, + "epoch": 5.736455981941309, + "grad_norm": 2.0857527256011963, + "learning_rate": 4.289061492088047e-06, + "loss": 0.086, + "mean_token_accuracy": 0.9734646320343018, + "num_tokens": 82524597.0, + "step": 10165 + }, + { + "entropy": 0.18432653844356536, + "epoch": 5.739277652370204, + "grad_norm": 2.0384151935577393, + "learning_rate": 4.2884098815095175e-06, + "loss": 0.0702, + "mean_token_accuracy": 0.977328622341156, + "num_tokens": 82565390.0, + "step": 10170 + }, + { + "entropy": 0.17555322945117952, + "epoch": 5.742099322799097, + "grad_norm": 2.352675199508667, + "learning_rate": 4.287758042378966e-06, + "loss": 0.0657, + "mean_token_accuracy": 0.979662299156189, + "num_tokens": 82605127.0, + "step": 10175 + }, + { + "entropy": 0.1978653311729431, + "epoch": 5.744920993227991, + "grad_norm": 2.1638011932373047, + "learning_rate": 4.287105974824446e-06, + "loss": 0.0753, + "mean_token_accuracy": 0.9755738496780395, + "num_tokens": 82645570.0, + "step": 10180 + }, + { + "entropy": 0.18076943159103392, + "epoch": 5.747742663656885, + "grad_norm": 2.197725534439087, + "learning_rate": 4.286453678974055e-06, + "loss": 0.0757, + "mean_token_accuracy": 0.9774895668029785, + "num_tokens": 82686162.0, + "step": 10185 + }, + { + "entropy": 0.1678744912147522, + "epoch": 5.750564334085778, + "grad_norm": 1.8131523132324219, + "learning_rate": 4.28580115495594e-06, + "loss": 0.0712, + "mean_token_accuracy": 0.9775494694709778, + "num_tokens": 82726804.0, + "step": 10190 + }, + { + "entropy": 0.1693742036819458, + "epoch": 5.753386004514673, + "grad_norm": 1.9890544414520264, + "learning_rate": 4.285148402898285e-06, + "loss": 0.0696, + "mean_token_accuracy": 0.9783661007881165, + "num_tokens": 82767575.0, + "step": 10195 + }, + { + "entropy": 0.16243847608566284, + "epoch": 5.756207674943567, + "grad_norm": 2.3177292346954346, + "learning_rate": 4.284495422929326e-06, + "loss": 0.063, + "mean_token_accuracy": 0.9801602840423584, + "num_tokens": 82808426.0, + "step": 10200 + }, + { + "entropy": 0.18572303354740144, + "epoch": 5.75902934537246, + "grad_norm": 2.198636531829834, + "learning_rate": 4.283842215177341e-06, + "loss": 0.0701, + "mean_token_accuracy": 0.9777385950088501, + "num_tokens": 82849159.0, + "step": 10205 + }, + { + "entropy": 0.18345086872577668, + "epoch": 5.761851015801354, + "grad_norm": 2.125354290008545, + "learning_rate": 4.283188779770652e-06, + "loss": 0.0782, + "mean_token_accuracy": 0.9764749526977539, + "num_tokens": 82889959.0, + "step": 10210 + }, + { + "entropy": 0.18493236899375914, + "epoch": 5.764672686230249, + "grad_norm": 2.407668352127075, + "learning_rate": 4.2825351168376275e-06, + "loss": 0.0869, + "mean_token_accuracy": 0.971848976612091, + "num_tokens": 82930471.0, + "step": 10215 + }, + { + "entropy": 0.18200556337833404, + "epoch": 5.767494356659142, + "grad_norm": 1.8151724338531494, + "learning_rate": 4.281881226506677e-06, + "loss": 0.0811, + "mean_token_accuracy": 0.974114739894867, + "num_tokens": 82971060.0, + "step": 10220 + }, + { + "entropy": 0.17582403719425202, + "epoch": 5.770316027088036, + "grad_norm": 1.7381181716918945, + "learning_rate": 4.28122710890626e-06, + "loss": 0.0811, + "mean_token_accuracy": 0.974997091293335, + "num_tokens": 83011536.0, + "step": 10225 + }, + { + "entropy": 0.19064405858516692, + "epoch": 5.77313769751693, + "grad_norm": 2.46132493019104, + "learning_rate": 4.2805727641648775e-06, + "loss": 0.0911, + "mean_token_accuracy": 0.9718187093734741, + "num_tokens": 83052238.0, + "step": 10230 + }, + { + "entropy": 0.18355910778045653, + "epoch": 5.775959367945823, + "grad_norm": 2.21754789352417, + "learning_rate": 4.2799181924110755e-06, + "loss": 0.075, + "mean_token_accuracy": 0.9755819082260132, + "num_tokens": 83092882.0, + "step": 10235 + }, + { + "entropy": 0.18562229871749877, + "epoch": 5.778781038374718, + "grad_norm": 2.161637306213379, + "learning_rate": 4.279263393773444e-06, + "loss": 0.0767, + "mean_token_accuracy": 0.9759423017501831, + "num_tokens": 83133727.0, + "step": 10240 + }, + { + "entropy": 0.1935652256011963, + "epoch": 5.781602708803612, + "grad_norm": 1.7781506776809692, + "learning_rate": 4.278608368380618e-06, + "loss": 0.0818, + "mean_token_accuracy": 0.9746825933456421, + "num_tokens": 83173500.0, + "step": 10245 + }, + { + "entropy": 0.19277122020721435, + "epoch": 5.784424379232505, + "grad_norm": 1.752469539642334, + "learning_rate": 4.27795311636128e-06, + "loss": 0.0694, + "mean_token_accuracy": 0.9785934090614319, + "num_tokens": 83214016.0, + "step": 10250 + }, + { + "entropy": 0.179434072971344, + "epoch": 5.7872460496614, + "grad_norm": 2.006405830383301, + "learning_rate": 4.277297637844151e-06, + "loss": 0.071, + "mean_token_accuracy": 0.9783080697059632, + "num_tokens": 83254826.0, + "step": 10255 + }, + { + "entropy": 0.17800212800502777, + "epoch": 5.790067720090294, + "grad_norm": 2.5060832500457764, + "learning_rate": 4.276641932958002e-06, + "loss": 0.074, + "mean_token_accuracy": 0.9775895595550537, + "num_tokens": 83295437.0, + "step": 10260 + }, + { + "entropy": 0.1740177422761917, + "epoch": 5.792889390519187, + "grad_norm": 1.8866690397262573, + "learning_rate": 4.275986001831645e-06, + "loss": 0.0737, + "mean_token_accuracy": 0.9761949896812439, + "num_tokens": 83336071.0, + "step": 10265 + }, + { + "entropy": 0.17147966027259826, + "epoch": 5.795711060948081, + "grad_norm": 2.06299090385437, + "learning_rate": 4.275329844593938e-06, + "loss": 0.0722, + "mean_token_accuracy": 0.9773301362991333, + "num_tokens": 83376866.0, + "step": 10270 + }, + { + "entropy": 0.18029634952545165, + "epoch": 5.798532731376975, + "grad_norm": 2.3944549560546875, + "learning_rate": 4.274673461373784e-06, + "loss": 0.0886, + "mean_token_accuracy": 0.9716618418693542, + "num_tokens": 83417485.0, + "step": 10275 + }, + { + "entropy": 0.18746162354946136, + "epoch": 5.801354401805869, + "grad_norm": 2.2615456581115723, + "learning_rate": 4.274016852300129e-06, + "loss": 0.0709, + "mean_token_accuracy": 0.9775063872337342, + "num_tokens": 83458100.0, + "step": 10280 + }, + { + "entropy": 0.18002947568893432, + "epoch": 5.804176072234763, + "grad_norm": 1.8940982818603516, + "learning_rate": 4.273360017501964e-06, + "loss": 0.0732, + "mean_token_accuracy": 0.9775622367858887, + "num_tokens": 83498674.0, + "step": 10285 + }, + { + "entropy": 0.1664568930864334, + "epoch": 5.806997742663657, + "grad_norm": 2.5354745388031006, + "learning_rate": 4.272702957108325e-06, + "loss": 0.0729, + "mean_token_accuracy": 0.976758623123169, + "num_tokens": 83539467.0, + "step": 10290 + }, + { + "entropy": 0.18044437170028688, + "epoch": 5.8098194130925505, + "grad_norm": 1.8379076719284058, + "learning_rate": 4.272045671248289e-06, + "loss": 0.0661, + "mean_token_accuracy": 0.9795686602592468, + "num_tokens": 83580065.0, + "step": 10295 + }, + { + "entropy": 0.18124404847621917, + "epoch": 5.812641083521445, + "grad_norm": 2.4792568683624268, + "learning_rate": 4.2713881600509835e-06, + "loss": 0.0632, + "mean_token_accuracy": 0.9804369568824768, + "num_tokens": 83619608.0, + "step": 10300 + }, + { + "entropy": 0.1773180991411209, + "epoch": 5.815462753950339, + "grad_norm": 2.801755428314209, + "learning_rate": 4.270730423645574e-06, + "loss": 0.066, + "mean_token_accuracy": 0.9787346959114075, + "num_tokens": 83659117.0, + "step": 10305 + }, + { + "entropy": 0.17246809601783752, + "epoch": 5.818284424379232, + "grad_norm": 1.8683533668518066, + "learning_rate": 4.2700724621612745e-06, + "loss": 0.0701, + "mean_token_accuracy": 0.9784170150756836, + "num_tokens": 83699987.0, + "step": 10310 + }, + { + "entropy": 0.17956862449645997, + "epoch": 5.821106094808126, + "grad_norm": 1.9856154918670654, + "learning_rate": 4.269414275727341e-06, + "loss": 0.083, + "mean_token_accuracy": 0.9736566543579102, + "num_tokens": 83740362.0, + "step": 10315 + }, + { + "entropy": 0.17001102566719056, + "epoch": 5.82392776523702, + "grad_norm": 2.348156452178955, + "learning_rate": 4.2687558644730735e-06, + "loss": 0.0603, + "mean_token_accuracy": 0.9807537913322448, + "num_tokens": 83781025.0, + "step": 10320 + }, + { + "entropy": 0.17963250279426574, + "epoch": 5.826749435665914, + "grad_norm": 2.1682050228118896, + "learning_rate": 4.268097228527818e-06, + "loss": 0.0684, + "mean_token_accuracy": 0.9780291318893433, + "num_tokens": 83821718.0, + "step": 10325 + }, + { + "entropy": 0.18213966786861419, + "epoch": 5.829571106094808, + "grad_norm": 2.4645328521728516, + "learning_rate": 4.267438368020964e-06, + "loss": 0.0777, + "mean_token_accuracy": 0.9760030627250671, + "num_tokens": 83862498.0, + "step": 10330 + }, + { + "entropy": 0.16510314047336577, + "epoch": 5.832392776523702, + "grad_norm": 1.8456097841262817, + "learning_rate": 4.2667792830819435e-06, + "loss": 0.0668, + "mean_token_accuracy": 0.9788869500160218, + "num_tokens": 83903238.0, + "step": 10335 + }, + { + "entropy": 0.17967475354671478, + "epoch": 5.835214446952596, + "grad_norm": 1.8277400732040405, + "learning_rate": 4.266119973840235e-06, + "loss": 0.0766, + "mean_token_accuracy": 0.975653862953186, + "num_tokens": 83944020.0, + "step": 10340 + }, + { + "entropy": 0.18739196360111238, + "epoch": 5.83803611738149, + "grad_norm": 2.124237060546875, + "learning_rate": 4.2654604404253585e-06, + "loss": 0.0773, + "mean_token_accuracy": 0.9757479429244995, + "num_tokens": 83984438.0, + "step": 10345 + }, + { + "entropy": 0.1835099220275879, + "epoch": 5.840857787810384, + "grad_norm": 2.48547625541687, + "learning_rate": 4.264800682966881e-06, + "loss": 0.0658, + "mean_token_accuracy": 0.9790345907211304, + "num_tokens": 84025304.0, + "step": 10350 + }, + { + "entropy": 0.17234547436237335, + "epoch": 5.8436794582392775, + "grad_norm": 2.024911642074585, + "learning_rate": 4.26414070159441e-06, + "loss": 0.0661, + "mean_token_accuracy": 0.9787455081939698, + "num_tokens": 84065848.0, + "step": 10355 + }, + { + "entropy": 0.19444843232631684, + "epoch": 5.846501128668171, + "grad_norm": 2.464071273803711, + "learning_rate": 4.263480496437601e-06, + "loss": 0.0811, + "mean_token_accuracy": 0.974936056137085, + "num_tokens": 84106680.0, + "step": 10360 + }, + { + "entropy": 0.173148912191391, + "epoch": 5.849322799097066, + "grad_norm": 1.9251296520233154, + "learning_rate": 4.26282006762615e-06, + "loss": 0.0701, + "mean_token_accuracy": 0.9775914549827576, + "num_tokens": 84147405.0, + "step": 10365 + }, + { + "entropy": 0.18452159464359283, + "epoch": 5.8521444695259595, + "grad_norm": 2.2456061840057373, + "learning_rate": 4.262159415289799e-06, + "loss": 0.0768, + "mean_token_accuracy": 0.9760621428489685, + "num_tokens": 84188045.0, + "step": 10370 + }, + { + "entropy": 0.18307125866413115, + "epoch": 5.854966139954853, + "grad_norm": 2.077425241470337, + "learning_rate": 4.261498539558333e-06, + "loss": 0.0692, + "mean_token_accuracy": 0.9785082459449768, + "num_tokens": 84227724.0, + "step": 10375 + }, + { + "entropy": 0.1773241400718689, + "epoch": 5.857787810383747, + "grad_norm": 2.191255569458008, + "learning_rate": 4.260837440561583e-06, + "loss": 0.0711, + "mean_token_accuracy": 0.976841950416565, + "num_tokens": 84268109.0, + "step": 10380 + }, + { + "entropy": 0.18978277146816253, + "epoch": 5.860609480812641, + "grad_norm": 2.40100359916687, + "learning_rate": 4.260176118429418e-06, + "loss": 0.0722, + "mean_token_accuracy": 0.9782564878463745, + "num_tokens": 84308793.0, + "step": 10385 + }, + { + "entropy": 0.18334400355815889, + "epoch": 5.863431151241535, + "grad_norm": 2.2183425426483154, + "learning_rate": 4.259514573291757e-06, + "loss": 0.0792, + "mean_token_accuracy": 0.9754730105400086, + "num_tokens": 84349159.0, + "step": 10390 + }, + { + "entropy": 0.2000194698572159, + "epoch": 5.866252821670429, + "grad_norm": 2.203062057495117, + "learning_rate": 4.258852805278562e-06, + "loss": 0.0838, + "mean_token_accuracy": 0.9722742438316345, + "num_tokens": 84389870.0, + "step": 10395 + }, + { + "entropy": 0.18452554047107697, + "epoch": 5.8690744920993225, + "grad_norm": 2.1212892532348633, + "learning_rate": 4.258190814519834e-06, + "loss": 0.0774, + "mean_token_accuracy": 0.976058554649353, + "num_tokens": 84430501.0, + "step": 10400 + }, + { + "entropy": 0.1830653041601181, + "epoch": 5.871896162528216, + "grad_norm": 2.3475375175476074, + "learning_rate": 4.2575286011456255e-06, + "loss": 0.074, + "mean_token_accuracy": 0.9763685941696167, + "num_tokens": 84471098.0, + "step": 10405 + }, + { + "entropy": 0.16962153613567352, + "epoch": 5.874717832957111, + "grad_norm": 1.6830201148986816, + "learning_rate": 4.256866165286024e-06, + "loss": 0.069, + "mean_token_accuracy": 0.9781726360321045, + "num_tokens": 84511664.0, + "step": 10410 + }, + { + "entropy": 0.1886516332626343, + "epoch": 5.8775395033860045, + "grad_norm": 2.5439491271972656, + "learning_rate": 4.256203507071168e-06, + "loss": 0.0834, + "mean_token_accuracy": 0.974034309387207, + "num_tokens": 84552223.0, + "step": 10415 + }, + { + "entropy": 0.17994187772274017, + "epoch": 5.880361173814898, + "grad_norm": 2.874234437942505, + "learning_rate": 4.255540626631236e-06, + "loss": 0.0763, + "mean_token_accuracy": 0.9750635504722596, + "num_tokens": 84592774.0, + "step": 10420 + }, + { + "entropy": 0.19643645584583283, + "epoch": 5.883182844243793, + "grad_norm": 2.5835118293762207, + "learning_rate": 4.2548775240964515e-06, + "loss": 0.0883, + "mean_token_accuracy": 0.9736735224723816, + "num_tokens": 84633572.0, + "step": 10425 + }, + { + "entropy": 0.17592164278030395, + "epoch": 5.8860045146726865, + "grad_norm": 2.1674015522003174, + "learning_rate": 4.25421419959708e-06, + "loss": 0.074, + "mean_token_accuracy": 0.9766902089118957, + "num_tokens": 84674314.0, + "step": 10430 + }, + { + "entropy": 0.19005840122699738, + "epoch": 5.88882618510158, + "grad_norm": 1.8254783153533936, + "learning_rate": 4.253550653263432e-06, + "loss": 0.0783, + "mean_token_accuracy": 0.9754180431365966, + "num_tokens": 84714985.0, + "step": 10435 + }, + { + "entropy": 0.17556642293930053, + "epoch": 5.891647855530474, + "grad_norm": 1.7899775505065918, + "learning_rate": 4.2528868852258615e-06, + "loss": 0.0664, + "mean_token_accuracy": 0.9791262030601502, + "num_tokens": 84755700.0, + "step": 10440 + }, + { + "entropy": 0.17945761382579803, + "epoch": 5.894469525959368, + "grad_norm": 2.6520256996154785, + "learning_rate": 4.252222895614766e-06, + "loss": 0.0713, + "mean_token_accuracy": 0.9770576000213623, + "num_tokens": 84796363.0, + "step": 10445 + }, + { + "entropy": 0.19162935614585877, + "epoch": 5.897291196388262, + "grad_norm": 2.3933403491973877, + "learning_rate": 4.2515586845605864e-06, + "loss": 0.0795, + "mean_token_accuracy": 0.9750626444816589, + "num_tokens": 84836849.0, + "step": 10450 + }, + { + "entropy": 0.17797043323516845, + "epoch": 5.900112866817156, + "grad_norm": 2.0211195945739746, + "learning_rate": 4.250894252193806e-06, + "loss": 0.0713, + "mean_token_accuracy": 0.9775516271591187, + "num_tokens": 84877674.0, + "step": 10455 + }, + { + "entropy": 0.1765431433916092, + "epoch": 5.9029345372460496, + "grad_norm": 2.351975440979004, + "learning_rate": 4.250229598644954e-06, + "loss": 0.0678, + "mean_token_accuracy": 0.9786093235015869, + "num_tokens": 84918343.0, + "step": 10460 + }, + { + "entropy": 0.18122887313365937, + "epoch": 5.905756207674943, + "grad_norm": 1.8683475255966187, + "learning_rate": 4.249564724044602e-06, + "loss": 0.072, + "mean_token_accuracy": 0.9761911749839782, + "num_tokens": 84958846.0, + "step": 10465 + }, + { + "entropy": 0.17490617036819459, + "epoch": 5.908577878103838, + "grad_norm": 1.9673949480056763, + "learning_rate": 4.248899628523362e-06, + "loss": 0.0763, + "mean_token_accuracy": 0.9763616800308228, + "num_tokens": 84999643.0, + "step": 10470 + }, + { + "entropy": 0.18483209908008574, + "epoch": 5.9113995485327315, + "grad_norm": 1.887024164199829, + "learning_rate": 4.248234312211895e-06, + "loss": 0.0688, + "mean_token_accuracy": 0.978734838962555, + "num_tokens": 85040465.0, + "step": 10475 + }, + { + "entropy": 0.19448497295379638, + "epoch": 5.914221218961625, + "grad_norm": 2.0865771770477295, + "learning_rate": 4.247568775240901e-06, + "loss": 0.0623, + "mean_token_accuracy": 0.9806246161460876, + "num_tokens": 85081222.0, + "step": 10480 + }, + { + "entropy": 0.1752565920352936, + "epoch": 5.917042889390519, + "grad_norm": 2.1277925968170166, + "learning_rate": 4.246903017741124e-06, + "loss": 0.0841, + "mean_token_accuracy": 0.9734927892684937, + "num_tokens": 85121900.0, + "step": 10485 + }, + { + "entropy": 0.18596406877040864, + "epoch": 5.919864559819413, + "grad_norm": 2.1438140869140625, + "learning_rate": 4.246237039843355e-06, + "loss": 0.0659, + "mean_token_accuracy": 0.9792929530143738, + "num_tokens": 85162395.0, + "step": 10490 + }, + { + "entropy": 0.1806576669216156, + "epoch": 5.922686230248307, + "grad_norm": 2.2096385955810547, + "learning_rate": 4.2455708416784235e-06, + "loss": 0.0719, + "mean_token_accuracy": 0.977632212638855, + "num_tokens": 85203275.0, + "step": 10495 + }, + { + "entropy": 0.20309909880161287, + "epoch": 5.925507900677201, + "grad_norm": 1.782074213027954, + "learning_rate": 4.244904423377204e-06, + "loss": 0.0736, + "mean_token_accuracy": 0.9776381611824035, + "num_tokens": 85243951.0, + "step": 10500 + }, + { + "epoch": 5.925507900677201, + "eval_entropy": 0.2187255471944809, + "eval_loss": 0.030472468584775925, + "eval_mean_token_accuracy": 0.9911843538284302, + "eval_num_tokens": 85243951.0, + "eval_runtime": 0.1635, + "eval_samples_per_second": 24.46, + "eval_steps_per_second": 6.115, + "step": 10500 + }, + { + "entropy": 0.18639110922813415, + "epoch": 5.928329571106095, + "grad_norm": 2.001336097717285, + "learning_rate": 4.244237785070615e-06, + "loss": 0.0712, + "mean_token_accuracy": 0.9782602667808533, + "num_tokens": 85284411.0, + "step": 10505 + }, + { + "entropy": 0.18605276346206664, + "epoch": 5.931151241534989, + "grad_norm": 2.1870064735412598, + "learning_rate": 4.243570926889618e-06, + "loss": 0.0759, + "mean_token_accuracy": 0.9759214639663696, + "num_tokens": 85325253.0, + "step": 10510 + }, + { + "entropy": 0.17156625986099244, + "epoch": 5.933972911963883, + "grad_norm": 2.035531520843506, + "learning_rate": 4.242903848965217e-06, + "loss": 0.0656, + "mean_token_accuracy": 0.9796592235565186, + "num_tokens": 85365632.0, + "step": 10515 + }, + { + "entropy": 0.20702783465385438, + "epoch": 5.936794582392777, + "grad_norm": 2.6586201190948486, + "learning_rate": 4.242236551428459e-06, + "loss": 0.0684, + "mean_token_accuracy": 0.9800093650817872, + "num_tokens": 85405847.0, + "step": 10520 + }, + { + "entropy": 0.18220070600509644, + "epoch": 5.93961625282167, + "grad_norm": 2.27356219291687, + "learning_rate": 4.241569034410436e-06, + "loss": 0.0749, + "mean_token_accuracy": 0.9759989261627198, + "num_tokens": 85446548.0, + "step": 10525 + }, + { + "entropy": 0.17777936458587645, + "epoch": 5.942437923250564, + "grad_norm": 1.8858665227890015, + "learning_rate": 4.24090129804228e-06, + "loss": 0.0764, + "mean_token_accuracy": 0.9763510704040528, + "num_tokens": 85487166.0, + "step": 10530 + }, + { + "entropy": 0.19082893133163453, + "epoch": 5.9452595936794586, + "grad_norm": 2.3828372955322266, + "learning_rate": 4.24023334245517e-06, + "loss": 0.0726, + "mean_token_accuracy": 0.9769440650939941, + "num_tokens": 85527909.0, + "step": 10535 + }, + { + "entropy": 0.16436271667480468, + "epoch": 5.948081264108352, + "grad_norm": 2.2857789993286133, + "learning_rate": 4.2395651677803244e-06, + "loss": 0.0691, + "mean_token_accuracy": 0.9776111841201782, + "num_tokens": 85568713.0, + "step": 10540 + }, + { + "entropy": 0.19756191074848176, + "epoch": 5.950902934537246, + "grad_norm": 1.9905869960784912, + "learning_rate": 4.238896774149007e-06, + "loss": 0.0671, + "mean_token_accuracy": 0.9784975528717041, + "num_tokens": 85609278.0, + "step": 10545 + }, + { + "entropy": 0.17361958026885987, + "epoch": 5.95372460496614, + "grad_norm": 1.9324471950531006, + "learning_rate": 4.2382281616925235e-06, + "loss": 0.0715, + "mean_token_accuracy": 0.9772069692611695, + "num_tokens": 85649844.0, + "step": 10550 + }, + { + "entropy": 0.17714324295520784, + "epoch": 5.956546275395034, + "grad_norm": 2.4276859760284424, + "learning_rate": 4.237559330542223e-06, + "loss": 0.0732, + "mean_token_accuracy": 0.9776072025299072, + "num_tokens": 85690372.0, + "step": 10555 + }, + { + "entropy": 0.18243989944458008, + "epoch": 5.959367945823928, + "grad_norm": 2.2024104595184326, + "learning_rate": 4.236890280829496e-06, + "loss": 0.0758, + "mean_token_accuracy": 0.9765413165092468, + "num_tokens": 85730984.0, + "step": 10560 + }, + { + "entropy": 0.18148486614227294, + "epoch": 5.962189616252822, + "grad_norm": 2.123389720916748, + "learning_rate": 4.236221012685781e-06, + "loss": 0.0675, + "mean_token_accuracy": 0.9791842937469483, + "num_tokens": 85771776.0, + "step": 10565 + }, + { + "entropy": 0.19152417182922363, + "epoch": 5.965011286681715, + "grad_norm": 2.2187232971191406, + "learning_rate": 4.235551526242552e-06, + "loss": 0.0727, + "mean_token_accuracy": 0.9772362232208252, + "num_tokens": 85812366.0, + "step": 10570 + }, + { + "entropy": 0.18775533139705658, + "epoch": 5.967832957110609, + "grad_norm": 2.338989496231079, + "learning_rate": 4.234881821631332e-06, + "loss": 0.0823, + "mean_token_accuracy": 0.974126398563385, + "num_tokens": 85851986.0, + "step": 10575 + }, + { + "entropy": 0.19790259897708892, + "epoch": 5.970654627539504, + "grad_norm": 1.8901110887527466, + "learning_rate": 4.234211898983684e-06, + "loss": 0.0742, + "mean_token_accuracy": 0.976418399810791, + "num_tokens": 85892394.0, + "step": 10580 + }, + { + "entropy": 0.18636341392993927, + "epoch": 5.973476297968397, + "grad_norm": 2.382916212081909, + "learning_rate": 4.233541758431213e-06, + "loss": 0.083, + "mean_token_accuracy": 0.9738220572471619, + "num_tokens": 85932953.0, + "step": 10585 + }, + { + "entropy": 0.16605794429779053, + "epoch": 5.976297968397291, + "grad_norm": 1.9576960802078247, + "learning_rate": 4.232871400105572e-06, + "loss": 0.0698, + "mean_token_accuracy": 0.9788704752922058, + "num_tokens": 85973329.0, + "step": 10590 + }, + { + "entropy": 0.16949787735939026, + "epoch": 5.979119638826186, + "grad_norm": 1.836525559425354, + "learning_rate": 4.232200824138448e-06, + "loss": 0.0688, + "mean_token_accuracy": 0.9781976103782654, + "num_tokens": 86013764.0, + "step": 10595 + }, + { + "entropy": 0.17068464457988738, + "epoch": 5.981941309255079, + "grad_norm": 1.9783167839050293, + "learning_rate": 4.231530030661579e-06, + "loss": 0.0685, + "mean_token_accuracy": 0.9786436915397644, + "num_tokens": 86054432.0, + "step": 10600 + }, + { + "entropy": 0.18707202672958373, + "epoch": 5.984762979683973, + "grad_norm": 1.955723524093628, + "learning_rate": 4.230859019806741e-06, + "loss": 0.06, + "mean_token_accuracy": 0.9814386010169983, + "num_tokens": 86095074.0, + "step": 10605 + }, + { + "entropy": 0.19427139759063722, + "epoch": 5.987584650112867, + "grad_norm": 2.150771141052246, + "learning_rate": 4.230187791705756e-06, + "loss": 0.0887, + "mean_token_accuracy": 0.9718084454536438, + "num_tokens": 86135924.0, + "step": 10610 + }, + { + "entropy": 0.18580053150653839, + "epoch": 5.99040632054176, + "grad_norm": 1.954380989074707, + "learning_rate": 4.229516346490485e-06, + "loss": 0.0782, + "mean_token_accuracy": 0.9761150360107422, + "num_tokens": 86176503.0, + "step": 10615 + }, + { + "entropy": 0.18900538682937623, + "epoch": 5.993227990970655, + "grad_norm": 2.2284088134765625, + "learning_rate": 4.2288446842928345e-06, + "loss": 0.0784, + "mean_token_accuracy": 0.9764190793037415, + "num_tokens": 86217113.0, + "step": 10620 + }, + { + "entropy": 0.18157367706298827, + "epoch": 5.996049661399549, + "grad_norm": 2.151667833328247, + "learning_rate": 4.228172805244753e-06, + "loss": 0.0793, + "mean_token_accuracy": 0.9748581528663636, + "num_tokens": 86257786.0, + "step": 10625 + }, + { + "entropy": 0.19717211425304412, + "epoch": 5.998871331828442, + "grad_norm": 2.68282413482666, + "learning_rate": 4.227500709478229e-06, + "loss": 0.0845, + "mean_token_accuracy": 0.9748492002487182, + "num_tokens": 86298375.0, + "step": 10630 + }, + { + "entropy": 0.16193274557590484, + "epoch": 6.001693002257336, + "grad_norm": 1.6442623138427734, + "learning_rate": 4.226828397125298e-06, + "loss": 0.0563, + "mean_token_accuracy": 0.9836606025695801, + "num_tokens": 86333142.0, + "step": 10635 + }, + { + "entropy": 0.15360802710056304, + "epoch": 6.004514672686231, + "grad_norm": 1.2552261352539062, + "learning_rate": 4.226155868318035e-06, + "loss": 0.0358, + "mean_token_accuracy": 0.9905863881111145, + "num_tokens": 86373661.0, + "step": 10640 + }, + { + "entropy": 0.15118006765842437, + "epoch": 6.007336343115124, + "grad_norm": 2.012360095977783, + "learning_rate": 4.225483123188559e-06, + "loss": 0.0349, + "mean_token_accuracy": 0.990486478805542, + "num_tokens": 86414362.0, + "step": 10645 + }, + { + "entropy": 0.13885512351989746, + "epoch": 6.010158013544018, + "grad_norm": 2.042280435562134, + "learning_rate": 4.224810161869029e-06, + "loss": 0.0329, + "mean_token_accuracy": 0.990878415107727, + "num_tokens": 86454954.0, + "step": 10650 + }, + { + "entropy": 0.1436757594347, + "epoch": 6.012979683972912, + "grad_norm": 1.7394871711730957, + "learning_rate": 4.224136984491651e-06, + "loss": 0.0358, + "mean_token_accuracy": 0.9893459677696228, + "num_tokens": 86495647.0, + "step": 10655 + }, + { + "entropy": 0.1538625791668892, + "epoch": 6.015801354401806, + "grad_norm": 1.6942013502120972, + "learning_rate": 4.22346359118867e-06, + "loss": 0.0335, + "mean_token_accuracy": 0.9908841848373413, + "num_tokens": 86536010.0, + "step": 10660 + }, + { + "entropy": 0.1632068932056427, + "epoch": 6.0186230248307, + "grad_norm": 2.3148810863494873, + "learning_rate": 4.222789982092373e-06, + "loss": 0.0313, + "mean_token_accuracy": 0.9914784550666809, + "num_tokens": 86576406.0, + "step": 10665 + }, + { + "entropy": 0.1600690007209778, + "epoch": 6.021444695259594, + "grad_norm": 2.032419443130493, + "learning_rate": 4.222116157335091e-06, + "loss": 0.0394, + "mean_token_accuracy": 0.9886528730392456, + "num_tokens": 86616837.0, + "step": 10670 + }, + { + "entropy": 0.13843523859977722, + "epoch": 6.024266365688487, + "grad_norm": 2.1894493103027344, + "learning_rate": 4.2214421170491975e-06, + "loss": 0.0281, + "mean_token_accuracy": 0.9916700005531311, + "num_tokens": 86657356.0, + "step": 10675 + }, + { + "entropy": 0.1451241284608841, + "epoch": 6.027088036117381, + "grad_norm": 1.5372796058654785, + "learning_rate": 4.220767861367108e-06, + "loss": 0.032, + "mean_token_accuracy": 0.9909778356552124, + "num_tokens": 86697851.0, + "step": 10680 + }, + { + "entropy": 0.1586732506752014, + "epoch": 6.029909706546276, + "grad_norm": 1.9109724760055542, + "learning_rate": 4.220093390421279e-06, + "loss": 0.0396, + "mean_token_accuracy": 0.9885246515274048, + "num_tokens": 86738396.0, + "step": 10685 + }, + { + "entropy": 0.14336285442113877, + "epoch": 6.032731376975169, + "grad_norm": 1.615535855293274, + "learning_rate": 4.219418704344211e-06, + "loss": 0.0323, + "mean_token_accuracy": 0.9910237193107605, + "num_tokens": 86779215.0, + "step": 10690 + }, + { + "entropy": 0.15025009214878082, + "epoch": 6.035553047404063, + "grad_norm": 2.157390594482422, + "learning_rate": 4.218743803268447e-06, + "loss": 0.0399, + "mean_token_accuracy": 0.9880735993385314, + "num_tokens": 86819698.0, + "step": 10695 + }, + { + "entropy": 0.15084101557731627, + "epoch": 6.038374717832957, + "grad_norm": 1.7998473644256592, + "learning_rate": 4.218068687326571e-06, + "loss": 0.0354, + "mean_token_accuracy": 0.9899451732635498, + "num_tokens": 86860397.0, + "step": 10700 + }, + { + "entropy": 0.14166499078273773, + "epoch": 6.041196388261851, + "grad_norm": 1.578946590423584, + "learning_rate": 4.217393356651208e-06, + "loss": 0.0367, + "mean_token_accuracy": 0.9898417115211486, + "num_tokens": 86901077.0, + "step": 10705 + }, + { + "entropy": 0.14319468885660172, + "epoch": 6.044018058690745, + "grad_norm": 1.6669219732284546, + "learning_rate": 4.216717811375028e-06, + "loss": 0.0345, + "mean_token_accuracy": 0.9898629069328309, + "num_tokens": 86941792.0, + "step": 10710 + }, + { + "entropy": 0.13536919057369232, + "epoch": 6.046839729119639, + "grad_norm": 1.7521120309829712, + "learning_rate": 4.216042051630743e-06, + "loss": 0.0326, + "mean_token_accuracy": 0.9901854276657105, + "num_tokens": 86982279.0, + "step": 10715 + }, + { + "entropy": 0.1333491548895836, + "epoch": 6.049661399548532, + "grad_norm": 1.9163808822631836, + "learning_rate": 4.215366077551105e-06, + "loss": 0.0341, + "mean_token_accuracy": 0.9900972962379455, + "num_tokens": 87023083.0, + "step": 10720 + }, + { + "entropy": 0.15708822011947632, + "epoch": 6.052483069977427, + "grad_norm": 2.149850368499756, + "learning_rate": 4.21468988926891e-06, + "loss": 0.0382, + "mean_token_accuracy": 0.9889817953109741, + "num_tokens": 87063613.0, + "step": 10725 + }, + { + "entropy": 0.14288501143455506, + "epoch": 6.055304740406321, + "grad_norm": 1.6220556497573853, + "learning_rate": 4.2140134869169934e-06, + "loss": 0.0397, + "mean_token_accuracy": 0.9880726933479309, + "num_tokens": 87104292.0, + "step": 10730 + }, + { + "entropy": 0.13960061967372894, + "epoch": 6.058126410835214, + "grad_norm": 1.8582446575164795, + "learning_rate": 4.213336870628236e-06, + "loss": 0.0337, + "mean_token_accuracy": 0.9908414721488953, + "num_tokens": 87144991.0, + "step": 10735 + }, + { + "entropy": 0.1556618928909302, + "epoch": 6.060948081264108, + "grad_norm": 1.7569464445114136, + "learning_rate": 4.21266004053556e-06, + "loss": 0.035, + "mean_token_accuracy": 0.9900591254234314, + "num_tokens": 87185620.0, + "step": 10740 + }, + { + "entropy": 0.15336497128009796, + "epoch": 6.063769751693002, + "grad_norm": 1.2546908855438232, + "learning_rate": 4.211982996771926e-06, + "loss": 0.0348, + "mean_token_accuracy": 0.9904587507247925, + "num_tokens": 87226351.0, + "step": 10745 + }, + { + "entropy": 0.1377393662929535, + "epoch": 6.066591422121896, + "grad_norm": 1.875976800918579, + "learning_rate": 4.211305739470342e-06, + "loss": 0.0305, + "mean_token_accuracy": 0.9914319157600403, + "num_tokens": 87267095.0, + "step": 10750 + }, + { + "entropy": 0.1543271839618683, + "epoch": 6.06941309255079, + "grad_norm": 1.8693065643310547, + "learning_rate": 4.210628268763854e-06, + "loss": 0.0337, + "mean_token_accuracy": 0.9902760028839112, + "num_tokens": 87307784.0, + "step": 10755 + }, + { + "entropy": 0.14306709170341492, + "epoch": 6.072234762979684, + "grad_norm": 2.66241717338562, + "learning_rate": 4.209950584785552e-06, + "loss": 0.0307, + "mean_token_accuracy": 0.9919659614562988, + "num_tokens": 87348340.0, + "step": 10760 + }, + { + "entropy": 0.14404877722263337, + "epoch": 6.0750564334085775, + "grad_norm": 1.838407278060913, + "learning_rate": 4.209272687668565e-06, + "loss": 0.0314, + "mean_token_accuracy": 0.9915964126586914, + "num_tokens": 87389066.0, + "step": 10765 + }, + { + "entropy": 0.15369834899902343, + "epoch": 6.077878103837472, + "grad_norm": 1.1812386512756348, + "learning_rate": 4.2085945775460685e-06, + "loss": 0.0294, + "mean_token_accuracy": 0.9916245102882385, + "num_tokens": 87429380.0, + "step": 10770 + }, + { + "entropy": 0.15424088537693023, + "epoch": 6.080699774266366, + "grad_norm": 1.5837703943252563, + "learning_rate": 4.207916254551276e-06, + "loss": 0.0328, + "mean_token_accuracy": 0.9913092613220215, + "num_tokens": 87470021.0, + "step": 10775 + }, + { + "entropy": 0.1431649297475815, + "epoch": 6.0835214446952595, + "grad_norm": 1.973551630973816, + "learning_rate": 4.207237718817446e-06, + "loss": 0.0312, + "mean_token_accuracy": 0.9908066749572754, + "num_tokens": 87510700.0, + "step": 10780 + }, + { + "entropy": 0.14944909512996674, + "epoch": 6.086343115124153, + "grad_norm": 1.8268595933914185, + "learning_rate": 4.2065589704778745e-06, + "loss": 0.0325, + "mean_token_accuracy": 0.9908172607421875, + "num_tokens": 87551408.0, + "step": 10785 + }, + { + "entropy": 0.1600016713142395, + "epoch": 6.089164785553048, + "grad_norm": 1.7882065773010254, + "learning_rate": 4.205880009665902e-06, + "loss": 0.0378, + "mean_token_accuracy": 0.9892760753631592, + "num_tokens": 87592139.0, + "step": 10790 + }, + { + "entropy": 0.14885053634643555, + "epoch": 6.091986455981941, + "grad_norm": 1.7751688957214355, + "learning_rate": 4.205200836514912e-06, + "loss": 0.0372, + "mean_token_accuracy": 0.9887823104858399, + "num_tokens": 87632768.0, + "step": 10795 + }, + { + "entropy": 0.14843485057353972, + "epoch": 6.094808126410835, + "grad_norm": 1.5204575061798096, + "learning_rate": 4.204521451158327e-06, + "loss": 0.0345, + "mean_token_accuracy": 0.9894916415214539, + "num_tokens": 87673525.0, + "step": 10800 + }, + { + "entropy": 0.13986097276210785, + "epoch": 6.097629796839729, + "grad_norm": 1.9457536935806274, + "learning_rate": 4.2038418537296126e-06, + "loss": 0.032, + "mean_token_accuracy": 0.9908498883247375, + "num_tokens": 87714086.0, + "step": 10805 + }, + { + "entropy": 0.14706320762634278, + "epoch": 6.100451467268623, + "grad_norm": 1.7336926460266113, + "learning_rate": 4.203162044362276e-06, + "loss": 0.0347, + "mean_token_accuracy": 0.9906623005867005, + "num_tokens": 87753943.0, + "step": 10810 + }, + { + "entropy": 0.1470971792936325, + "epoch": 6.103273137697517, + "grad_norm": 2.0381176471710205, + "learning_rate": 4.2024820231898655e-06, + "loss": 0.0328, + "mean_token_accuracy": 0.9907041072845459, + "num_tokens": 87794673.0, + "step": 10815 + }, + { + "entropy": 0.1387518674135208, + "epoch": 6.106094808126411, + "grad_norm": 1.6845587491989136, + "learning_rate": 4.201801790345971e-06, + "loss": 0.0372, + "mean_token_accuracy": 0.9895907640457153, + "num_tokens": 87835382.0, + "step": 10820 + }, + { + "entropy": 0.14503337144851686, + "epoch": 6.1089164785553045, + "grad_norm": 1.6855789422988892, + "learning_rate": 4.201121345964225e-06, + "loss": 0.0334, + "mean_token_accuracy": 0.9907157897949219, + "num_tokens": 87876038.0, + "step": 10825 + }, + { + "entropy": 0.13844524025917054, + "epoch": 6.111738148984198, + "grad_norm": 1.9881229400634766, + "learning_rate": 4.200440690178301e-06, + "loss": 0.0339, + "mean_token_accuracy": 0.9900532841682435, + "num_tokens": 87916705.0, + "step": 10830 + }, + { + "entropy": 0.15506267845630645, + "epoch": 6.114559819413093, + "grad_norm": 1.9544332027435303, + "learning_rate": 4.199759823121914e-06, + "loss": 0.0346, + "mean_token_accuracy": 0.9900209069252014, + "num_tokens": 87957566.0, + "step": 10835 + }, + { + "entropy": 0.16678837686777115, + "epoch": 6.1173814898419865, + "grad_norm": 1.6686840057373047, + "learning_rate": 4.199078744928819e-06, + "loss": 0.0337, + "mean_token_accuracy": 0.9909396409988404, + "num_tokens": 87997564.0, + "step": 10840 + }, + { + "entropy": 0.14651824533939362, + "epoch": 6.12020316027088, + "grad_norm": 1.7264286279678345, + "learning_rate": 4.198397455732816e-06, + "loss": 0.0373, + "mean_token_accuracy": 0.9898837804794312, + "num_tokens": 88037998.0, + "step": 10845 + }, + { + "entropy": 0.15673071444034575, + "epoch": 6.123024830699774, + "grad_norm": 1.6216492652893066, + "learning_rate": 4.197715955667742e-06, + "loss": 0.0348, + "mean_token_accuracy": 0.9899758458137512, + "num_tokens": 88078569.0, + "step": 10850 + }, + { + "entropy": 0.1553318828344345, + "epoch": 6.1258465011286685, + "grad_norm": 2.029632329940796, + "learning_rate": 4.19703424486748e-06, + "loss": 0.0277, + "mean_token_accuracy": 0.9923609852790832, + "num_tokens": 88118867.0, + "step": 10855 + }, + { + "entropy": 0.13455870598554612, + "epoch": 6.128668171557562, + "grad_norm": 1.8276398181915283, + "learning_rate": 4.196352323465951e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.9924584746360778, + "num_tokens": 88159338.0, + "step": 10860 + }, + { + "entropy": 0.13744568973779678, + "epoch": 6.131489841986456, + "grad_norm": 2.144724130630493, + "learning_rate": 4.1956701915971196e-06, + "loss": 0.0358, + "mean_token_accuracy": 0.9901370286941529, + "num_tokens": 88199943.0, + "step": 10865 + }, + { + "entropy": 0.1540704548358917, + "epoch": 6.1343115124153496, + "grad_norm": 2.058373212814331, + "learning_rate": 4.194987849394988e-06, + "loss": 0.0333, + "mean_token_accuracy": 0.9897918820381164, + "num_tokens": 88240426.0, + "step": 10870 + }, + { + "entropy": 0.14623642861843109, + "epoch": 6.137133182844244, + "grad_norm": 2.4471840858459473, + "learning_rate": 4.194305296993606e-06, + "loss": 0.0359, + "mean_token_accuracy": 0.9897229671478271, + "num_tokens": 88281095.0, + "step": 10875 + }, + { + "entropy": 0.1303885832428932, + "epoch": 6.139954853273138, + "grad_norm": 1.7725268602371216, + "learning_rate": 4.193622534527058e-06, + "loss": 0.0353, + "mean_token_accuracy": 0.9896966218948364, + "num_tokens": 88321683.0, + "step": 10880 + }, + { + "entropy": 0.13377143442630768, + "epoch": 6.1427765237020315, + "grad_norm": 2.0175435543060303, + "learning_rate": 4.192939562129476e-06, + "loss": 0.0306, + "mean_token_accuracy": 0.9910183906555176, + "num_tokens": 88362423.0, + "step": 10885 + }, + { + "entropy": 0.13516291081905366, + "epoch": 6.145598194130925, + "grad_norm": 2.2618424892425537, + "learning_rate": 4.192256379935027e-06, + "loss": 0.0277, + "mean_token_accuracy": 0.9920958876609802, + "num_tokens": 88402934.0, + "step": 10890 + }, + { + "entropy": 0.14001532047986984, + "epoch": 6.14841986455982, + "grad_norm": 1.8463703393936157, + "learning_rate": 4.191572988077924e-06, + "loss": 0.0333, + "mean_token_accuracy": 0.9904056191444397, + "num_tokens": 88443748.0, + "step": 10895 + }, + { + "entropy": 0.12817184776067733, + "epoch": 6.1512415349887135, + "grad_norm": 1.4544408321380615, + "learning_rate": 4.190889386692418e-06, + "loss": 0.0263, + "mean_token_accuracy": 0.9931593537330627, + "num_tokens": 88484016.0, + "step": 10900 + }, + { + "entropy": 0.14389062374830247, + "epoch": 6.154063205417607, + "grad_norm": 1.9694024324417114, + "learning_rate": 4.190205575912804e-06, + "loss": 0.0353, + "mean_token_accuracy": 0.9898261427879333, + "num_tokens": 88524836.0, + "step": 10905 + }, + { + "entropy": 0.14219014048576356, + "epoch": 6.156884875846501, + "grad_norm": 2.0647833347320557, + "learning_rate": 4.189521555873416e-06, + "loss": 0.035, + "mean_token_accuracy": 0.9893984079360962, + "num_tokens": 88565548.0, + "step": 10910 + }, + { + "entropy": 0.14767142832279206, + "epoch": 6.159706546275395, + "grad_norm": 1.9720888137817383, + "learning_rate": 4.18883732670863e-06, + "loss": 0.0334, + "mean_token_accuracy": 0.9906517505645752, + "num_tokens": 88605978.0, + "step": 10915 + }, + { + "entropy": 0.14821238815784454, + "epoch": 6.162528216704289, + "grad_norm": 2.0277600288391113, + "learning_rate": 4.188152888552864e-06, + "loss": 0.0366, + "mean_token_accuracy": 0.9893009543418885, + "num_tokens": 88646706.0, + "step": 10920 + }, + { + "entropy": 0.15523377060890198, + "epoch": 6.165349887133183, + "grad_norm": 1.6985244750976562, + "learning_rate": 4.1874682415405735e-06, + "loss": 0.0361, + "mean_token_accuracy": 0.9894434928894043, + "num_tokens": 88687198.0, + "step": 10925 + }, + { + "entropy": 0.15274632275104522, + "epoch": 6.168171557562077, + "grad_norm": 1.9225730895996094, + "learning_rate": 4.186783385806259e-06, + "loss": 0.0328, + "mean_token_accuracy": 0.9911539673805236, + "num_tokens": 88727828.0, + "step": 10930 + }, + { + "entropy": 0.13707538247108458, + "epoch": 6.17099322799097, + "grad_norm": 1.9640833139419556, + "learning_rate": 4.186098321484459e-06, + "loss": 0.0353, + "mean_token_accuracy": 0.9896294713020325, + "num_tokens": 88768488.0, + "step": 10935 + }, + { + "entropy": 0.14530183970928193, + "epoch": 6.173814898419865, + "grad_norm": 1.6039245128631592, + "learning_rate": 4.185413048709757e-06, + "loss": 0.0314, + "mean_token_accuracy": 0.9907344460487366, + "num_tokens": 88809035.0, + "step": 10940 + }, + { + "entropy": 0.14951497465372085, + "epoch": 6.176636568848759, + "grad_norm": 1.7432762384414673, + "learning_rate": 4.184727567616775e-06, + "loss": 0.0387, + "mean_token_accuracy": 0.9889516711235047, + "num_tokens": 88849740.0, + "step": 10945 + }, + { + "entropy": 0.14462248533964156, + "epoch": 6.179458239277652, + "grad_norm": 1.5465824604034424, + "learning_rate": 4.184041878340174e-06, + "loss": 0.0338, + "mean_token_accuracy": 0.990413224697113, + "num_tokens": 88890461.0, + "step": 10950 + }, + { + "entropy": 0.1421406477689743, + "epoch": 6.182279909706546, + "grad_norm": 2.111994743347168, + "learning_rate": 4.183355981014658e-06, + "loss": 0.0405, + "mean_token_accuracy": 0.9883662700653076, + "num_tokens": 88931166.0, + "step": 10955 + }, + { + "entropy": 0.1487989455461502, + "epoch": 6.1851015801354405, + "grad_norm": 2.039963722229004, + "learning_rate": 4.1826698757749715e-06, + "loss": 0.0361, + "mean_token_accuracy": 0.989516270160675, + "num_tokens": 88971798.0, + "step": 10960 + }, + { + "entropy": 0.14600317776203156, + "epoch": 6.187923250564334, + "grad_norm": 1.9209233522415161, + "learning_rate": 4.1819835627559e-06, + "loss": 0.0341, + "mean_token_accuracy": 0.9893284320831299, + "num_tokens": 89012519.0, + "step": 10965 + }, + { + "entropy": 0.1442936509847641, + "epoch": 6.190744920993228, + "grad_norm": 1.5550873279571533, + "learning_rate": 4.1812970420922725e-06, + "loss": 0.0373, + "mean_token_accuracy": 0.9887455940246582, + "num_tokens": 89053198.0, + "step": 10970 + }, + { + "entropy": 0.15094742476940154, + "epoch": 6.193566591422122, + "grad_norm": 1.8063254356384277, + "learning_rate": 4.180610313918952e-06, + "loss": 0.0401, + "mean_token_accuracy": 0.9881560444831848, + "num_tokens": 89093860.0, + "step": 10975 + }, + { + "entropy": 0.15021807849407195, + "epoch": 6.196388261851016, + "grad_norm": 2.1231496334075928, + "learning_rate": 4.1799233783708474e-06, + "loss": 0.0369, + "mean_token_accuracy": 0.9901463985443115, + "num_tokens": 89134320.0, + "step": 10980 + }, + { + "entropy": 0.1368527665734291, + "epoch": 6.19920993227991, + "grad_norm": 2.1702585220336914, + "learning_rate": 4.1792362355829094e-06, + "loss": 0.0336, + "mean_token_accuracy": 0.9903720498085022, + "num_tokens": 89175018.0, + "step": 10985 + }, + { + "entropy": 0.15346645712852477, + "epoch": 6.202031602708804, + "grad_norm": 1.8872740268707275, + "learning_rate": 4.178548885690126e-06, + "loss": 0.0384, + "mean_token_accuracy": 0.9887521862983704, + "num_tokens": 89215458.0, + "step": 10990 + }, + { + "entropy": 0.14233822673559188, + "epoch": 6.204853273137697, + "grad_norm": 1.8049365282058716, + "learning_rate": 4.177861328827526e-06, + "loss": 0.0391, + "mean_token_accuracy": 0.9878924250602722, + "num_tokens": 89255912.0, + "step": 10995 + }, + { + "entropy": 0.14570232629776, + "epoch": 6.207674943566591, + "grad_norm": 1.8370901346206665, + "learning_rate": 4.1771735651301815e-06, + "loss": 0.0314, + "mean_token_accuracy": 0.9908752083778382, + "num_tokens": 89296589.0, + "step": 11000 + }, + { + "epoch": 6.207674943566591, + "eval_entropy": 0.20263180136680603, + "eval_loss": 0.02532457746565342, + "eval_mean_token_accuracy": 0.9938673973083496, + "eval_num_tokens": 89296589.0, + "eval_runtime": 0.1641, + "eval_samples_per_second": 24.369, + "eval_steps_per_second": 6.092, + "step": 11000 + }, + { + "entropy": 0.14294780194759368, + "epoch": 6.210496613995486, + "grad_norm": 1.8399924039840698, + "learning_rate": 4.176485594733203e-06, + "loss": 0.037, + "mean_token_accuracy": 0.9889877915382386, + "num_tokens": 89337302.0, + "step": 11005 + }, + { + "entropy": 0.14533422291278839, + "epoch": 6.213318284424379, + "grad_norm": 1.7887989282608032, + "learning_rate": 4.175797417771744e-06, + "loss": 0.0325, + "mean_token_accuracy": 0.9902246236801148, + "num_tokens": 89377799.0, + "step": 11010 + }, + { + "entropy": 0.1417243927717209, + "epoch": 6.216139954853273, + "grad_norm": 1.6449092626571655, + "learning_rate": 4.175109034380994e-06, + "loss": 0.0311, + "mean_token_accuracy": 0.9912659168243408, + "num_tokens": 89418429.0, + "step": 11015 + }, + { + "entropy": 0.14027564078569413, + "epoch": 6.218961625282167, + "grad_norm": 1.5085475444793701, + "learning_rate": 4.1744204446961885e-06, + "loss": 0.031, + "mean_token_accuracy": 0.9910840392112732, + "num_tokens": 89458957.0, + "step": 11020 + }, + { + "entropy": 0.1335515111684799, + "epoch": 6.221783295711061, + "grad_norm": 1.927368402481079, + "learning_rate": 4.1737316488526005e-06, + "loss": 0.0369, + "mean_token_accuracy": 0.9889877438545227, + "num_tokens": 89499568.0, + "step": 11025 + }, + { + "entropy": 0.14035135358572007, + "epoch": 6.224604966139955, + "grad_norm": 1.654311180114746, + "learning_rate": 4.173042646985544e-06, + "loss": 0.0325, + "mean_token_accuracy": 0.9901265263557434, + "num_tokens": 89540184.0, + "step": 11030 + }, + { + "entropy": 0.14204625636339188, + "epoch": 6.227426636568849, + "grad_norm": 1.987782597541809, + "learning_rate": 4.172353439230372e-06, + "loss": 0.0328, + "mean_token_accuracy": 0.9901327252388, + "num_tokens": 89580911.0, + "step": 11035 + }, + { + "entropy": 0.15092624723911285, + "epoch": 6.230248306997742, + "grad_norm": 1.7647333145141602, + "learning_rate": 4.1716640257224815e-06, + "loss": 0.0374, + "mean_token_accuracy": 0.9883796095848083, + "num_tokens": 89620532.0, + "step": 11040 + }, + { + "entropy": 0.15512611269950866, + "epoch": 6.233069977426637, + "grad_norm": 1.8042088747024536, + "learning_rate": 4.170974406597307e-06, + "loss": 0.0377, + "mean_token_accuracy": 0.9896431684494018, + "num_tokens": 89661252.0, + "step": 11045 + }, + { + "entropy": 0.13535745441913605, + "epoch": 6.235891647855531, + "grad_norm": 2.0197017192840576, + "learning_rate": 4.170284581990325e-06, + "loss": 0.0321, + "mean_token_accuracy": 0.9908074617385865, + "num_tokens": 89702049.0, + "step": 11050 + }, + { + "entropy": 0.14527721405029298, + "epoch": 6.238713318284424, + "grad_norm": 2.0421767234802246, + "learning_rate": 4.1695945520370505e-06, + "loss": 0.0355, + "mean_token_accuracy": 0.9893774628639221, + "num_tokens": 89742588.0, + "step": 11055 + }, + { + "entropy": 0.159025439620018, + "epoch": 6.241534988713318, + "grad_norm": 2.0698342323303223, + "learning_rate": 4.16890431687304e-06, + "loss": 0.0317, + "mean_token_accuracy": 0.9911215782165528, + "num_tokens": 89783240.0, + "step": 11060 + }, + { + "entropy": 0.15658071339130403, + "epoch": 6.244356659142213, + "grad_norm": 1.8461730480194092, + "learning_rate": 4.168213876633891e-06, + "loss": 0.0354, + "mean_token_accuracy": 0.988905155658722, + "num_tokens": 89823822.0, + "step": 11065 + }, + { + "entropy": 0.13386445641517639, + "epoch": 6.247178329571106, + "grad_norm": 2.02535343170166, + "learning_rate": 4.167523231455241e-06, + "loss": 0.0353, + "mean_token_accuracy": 0.9896762609481812, + "num_tokens": 89864500.0, + "step": 11070 + }, + { + "entropy": 0.14109272211790086, + "epoch": 6.25, + "grad_norm": 2.1409685611724854, + "learning_rate": 4.166832381472766e-06, + "loss": 0.0341, + "mean_token_accuracy": 0.9898652195930481, + "num_tokens": 89905040.0, + "step": 11075 + }, + { + "entropy": 0.1469561368227005, + "epoch": 6.252821670428894, + "grad_norm": 1.8599631786346436, + "learning_rate": 4.166141326822184e-06, + "loss": 0.0307, + "mean_token_accuracy": 0.9907646656036377, + "num_tokens": 89945211.0, + "step": 11080 + }, + { + "entropy": 0.13446701616048812, + "epoch": 6.255643340857787, + "grad_norm": 1.5020068883895874, + "learning_rate": 4.165450067639254e-06, + "loss": 0.0366, + "mean_token_accuracy": 0.9891453385353088, + "num_tokens": 89985851.0, + "step": 11085 + }, + { + "entropy": 0.14148662984371185, + "epoch": 6.258465011286682, + "grad_norm": 2.185192346572876, + "learning_rate": 4.164758604059772e-06, + "loss": 0.0323, + "mean_token_accuracy": 0.9905752301216125, + "num_tokens": 90026635.0, + "step": 11090 + }, + { + "entropy": 0.13741403222084045, + "epoch": 6.261286681715576, + "grad_norm": 1.7420563697814941, + "learning_rate": 4.164066936219577e-06, + "loss": 0.0321, + "mean_token_accuracy": 0.9903434753417969, + "num_tokens": 90066598.0, + "step": 11095 + }, + { + "entropy": 0.14443300664424896, + "epoch": 6.264108352144469, + "grad_norm": 1.9935277700424194, + "learning_rate": 4.163375064254549e-06, + "loss": 0.041, + "mean_token_accuracy": 0.988189947605133, + "num_tokens": 90107478.0, + "step": 11100 + }, + { + "entropy": 0.1385072499513626, + "epoch": 6.266930022573363, + "grad_norm": 2.011894702911377, + "learning_rate": 4.162682988300602e-06, + "loss": 0.0394, + "mean_token_accuracy": 0.9881589770317077, + "num_tokens": 90148095.0, + "step": 11105 + }, + { + "entropy": 0.15410477817058563, + "epoch": 6.269751693002258, + "grad_norm": 1.801830530166626, + "learning_rate": 4.1619907084937e-06, + "loss": 0.0365, + "mean_token_accuracy": 0.9895232796669007, + "num_tokens": 90188661.0, + "step": 11110 + }, + { + "entropy": 0.1497156322002411, + "epoch": 6.272573363431151, + "grad_norm": 2.2699999809265137, + "learning_rate": 4.161298224969836e-06, + "loss": 0.0365, + "mean_token_accuracy": 0.9894330143928528, + "num_tokens": 90229364.0, + "step": 11115 + }, + { + "entropy": 0.14462217986583709, + "epoch": 6.275395033860045, + "grad_norm": 2.064868450164795, + "learning_rate": 4.1606055378650516e-06, + "loss": 0.034, + "mean_token_accuracy": 0.9895681977272034, + "num_tokens": 90270064.0, + "step": 11120 + }, + { + "entropy": 0.14862312972545624, + "epoch": 6.278216704288939, + "grad_norm": 1.742047905921936, + "learning_rate": 4.159912647315425e-06, + "loss": 0.0354, + "mean_token_accuracy": 0.9892676591873169, + "num_tokens": 90310329.0, + "step": 11125 + }, + { + "entropy": 0.14780815839767455, + "epoch": 6.281038374717833, + "grad_norm": 1.733389139175415, + "learning_rate": 4.159219553457074e-06, + "loss": 0.0338, + "mean_token_accuracy": 0.9897500872612, + "num_tokens": 90351138.0, + "step": 11130 + }, + { + "entropy": 0.13568034768104553, + "epoch": 6.283860045146727, + "grad_norm": 1.8001052141189575, + "learning_rate": 4.158526256426158e-06, + "loss": 0.0333, + "mean_token_accuracy": 0.9904277682304382, + "num_tokens": 90391910.0, + "step": 11135 + }, + { + "entropy": 0.14421828389167785, + "epoch": 6.286681715575621, + "grad_norm": 2.015634298324585, + "learning_rate": 4.157832756358874e-06, + "loss": 0.0406, + "mean_token_accuracy": 0.9873695731163025, + "num_tokens": 90432746.0, + "step": 11140 + }, + { + "entropy": 0.14020881354808806, + "epoch": 6.289503386004514, + "grad_norm": 2.3647797107696533, + "learning_rate": 4.157139053391461e-06, + "loss": 0.0362, + "mean_token_accuracy": 0.9892059206962586, + "num_tokens": 90473415.0, + "step": 11145 + }, + { + "entropy": 0.1497344046831131, + "epoch": 6.292325056433409, + "grad_norm": 2.052783727645874, + "learning_rate": 4.156445147660197e-06, + "loss": 0.0382, + "mean_token_accuracy": 0.9883453965187072, + "num_tokens": 90514134.0, + "step": 11150 + }, + { + "entropy": 0.14050848484039308, + "epoch": 6.295146726862303, + "grad_norm": 1.6089463233947754, + "learning_rate": 4.1557510393014e-06, + "loss": 0.0388, + "mean_token_accuracy": 0.9887239336967468, + "num_tokens": 90554962.0, + "step": 11155 + }, + { + "entropy": 0.14407500624656677, + "epoch": 6.297968397291196, + "grad_norm": 2.434351682662964, + "learning_rate": 4.155056728451426e-06, + "loss": 0.0376, + "mean_token_accuracy": 0.9894903421401977, + "num_tokens": 90595436.0, + "step": 11160 + }, + { + "entropy": 0.13423256427049637, + "epoch": 6.30079006772009, + "grad_norm": 2.1055705547332764, + "learning_rate": 4.154362215246675e-06, + "loss": 0.0352, + "mean_token_accuracy": 0.9897322297096253, + "num_tokens": 90636082.0, + "step": 11165 + }, + { + "entropy": 0.1400793805718422, + "epoch": 6.303611738148984, + "grad_norm": 2.3050198554992676, + "learning_rate": 4.1536674998235825e-06, + "loss": 0.0382, + "mean_token_accuracy": 0.9891103863716125, + "num_tokens": 90676676.0, + "step": 11170 + }, + { + "entropy": 0.1425952732563019, + "epoch": 6.306433408577878, + "grad_norm": 2.2005064487457275, + "learning_rate": 4.152972582318626e-06, + "loss": 0.0357, + "mean_token_accuracy": 0.9899725079536438, + "num_tokens": 90717384.0, + "step": 11175 + }, + { + "entropy": 0.1526680827140808, + "epoch": 6.309255079006772, + "grad_norm": 1.6304471492767334, + "learning_rate": 4.152277462868321e-06, + "loss": 0.0371, + "mean_token_accuracy": 0.9893090009689331, + "num_tokens": 90757930.0, + "step": 11180 + }, + { + "entropy": 0.14085469841957093, + "epoch": 6.312076749435666, + "grad_norm": 1.5589280128479004, + "learning_rate": 4.1515821416092264e-06, + "loss": 0.0282, + "mean_token_accuracy": 0.9919806241989135, + "num_tokens": 90798800.0, + "step": 11185 + }, + { + "entropy": 0.13670676797628403, + "epoch": 6.3148984198645595, + "grad_norm": 1.8319333791732788, + "learning_rate": 4.150886618677936e-06, + "loss": 0.0367, + "mean_token_accuracy": 0.9886664271354675, + "num_tokens": 90838684.0, + "step": 11190 + }, + { + "entropy": 0.16441910862922668, + "epoch": 6.317720090293454, + "grad_norm": 2.1681110858917236, + "learning_rate": 4.150190894211087e-06, + "loss": 0.0416, + "mean_token_accuracy": 0.9879634499549865, + "num_tokens": 90879475.0, + "step": 11195 + }, + { + "entropy": 0.15294736325740815, + "epoch": 6.320541760722348, + "grad_norm": 1.9503849744796753, + "learning_rate": 4.1494949683453525e-06, + "loss": 0.033, + "mean_token_accuracy": 0.990665853023529, + "num_tokens": 90920247.0, + "step": 11200 + }, + { + "entropy": 0.13768713921308517, + "epoch": 6.323363431151241, + "grad_norm": 1.801827311515808, + "learning_rate": 4.148798841217448e-06, + "loss": 0.036, + "mean_token_accuracy": 0.9888361096382141, + "num_tokens": 90960908.0, + "step": 11205 + }, + { + "entropy": 0.1548486292362213, + "epoch": 6.326185101580135, + "grad_norm": 2.232868194580078, + "learning_rate": 4.148102512964129e-06, + "loss": 0.0361, + "mean_token_accuracy": 0.9894163966178894, + "num_tokens": 91001543.0, + "step": 11210 + }, + { + "entropy": 0.13173782229423522, + "epoch": 6.32900677200903, + "grad_norm": 1.801491141319275, + "learning_rate": 4.1474059837221884e-06, + "loss": 0.0347, + "mean_token_accuracy": 0.9903063178062439, + "num_tokens": 91042245.0, + "step": 11215 + }, + { + "entropy": 0.14220143854618073, + "epoch": 6.331828442437923, + "grad_norm": 2.110448122024536, + "learning_rate": 4.146709253628458e-06, + "loss": 0.0377, + "mean_token_accuracy": 0.9890905261039734, + "num_tokens": 91082706.0, + "step": 11220 + }, + { + "entropy": 0.1490049421787262, + "epoch": 6.334650112866817, + "grad_norm": 2.0357728004455566, + "learning_rate": 4.146012322819814e-06, + "loss": 0.0372, + "mean_token_accuracy": 0.9890268802642822, + "num_tokens": 91123300.0, + "step": 11225 + }, + { + "entropy": 0.16816553473472595, + "epoch": 6.337471783295711, + "grad_norm": 2.126213550567627, + "learning_rate": 4.145315191433165e-06, + "loss": 0.0379, + "mean_token_accuracy": 0.989051103591919, + "num_tokens": 91163864.0, + "step": 11230 + }, + { + "entropy": 0.15646686255931855, + "epoch": 6.340293453724605, + "grad_norm": 1.7959010601043701, + "learning_rate": 4.144617859605464e-06, + "loss": 0.0367, + "mean_token_accuracy": 0.9899686336517334, + "num_tokens": 91204489.0, + "step": 11235 + }, + { + "entropy": 0.14400748908519745, + "epoch": 6.343115124153499, + "grad_norm": 1.6874085664749146, + "learning_rate": 4.1439203274737015e-06, + "loss": 0.0352, + "mean_token_accuracy": 0.9898784995079041, + "num_tokens": 91245210.0, + "step": 11240 + }, + { + "entropy": 0.12948621511459352, + "epoch": 6.345936794582393, + "grad_norm": 1.4666175842285156, + "learning_rate": 4.143222595174909e-06, + "loss": 0.0392, + "mean_token_accuracy": 0.9886078119277955, + "num_tokens": 91285941.0, + "step": 11245 + }, + { + "entropy": 0.14075332880020142, + "epoch": 6.3487584650112865, + "grad_norm": 2.0497982501983643, + "learning_rate": 4.142524662846156e-06, + "loss": 0.0376, + "mean_token_accuracy": 0.9891217827796936, + "num_tokens": 91326822.0, + "step": 11250 + }, + { + "entropy": 0.15241701006889344, + "epoch": 6.35158013544018, + "grad_norm": 1.943670392036438, + "learning_rate": 4.14182653062455e-06, + "loss": 0.0322, + "mean_token_accuracy": 0.9905896425247193, + "num_tokens": 91367515.0, + "step": 11255 + }, + { + "entropy": 0.1496208757162094, + "epoch": 6.354401805869075, + "grad_norm": 2.349369525909424, + "learning_rate": 4.14112819864724e-06, + "loss": 0.0394, + "mean_token_accuracy": 0.9895246982574463, + "num_tokens": 91408287.0, + "step": 11260 + }, + { + "entropy": 0.12518291175365448, + "epoch": 6.3572234762979685, + "grad_norm": 1.957039475440979, + "learning_rate": 4.140429667051412e-06, + "loss": 0.034, + "mean_token_accuracy": 0.9896358966827392, + "num_tokens": 91448827.0, + "step": 11265 + }, + { + "entropy": 0.13367379158735276, + "epoch": 6.360045146726862, + "grad_norm": 1.9315341711044312, + "learning_rate": 4.139730935974295e-06, + "loss": 0.0331, + "mean_token_accuracy": 0.9905829906463623, + "num_tokens": 91489610.0, + "step": 11270 + }, + { + "entropy": 0.13425323665142058, + "epoch": 6.362866817155756, + "grad_norm": 1.8871325254440308, + "learning_rate": 4.1390320055531545e-06, + "loss": 0.0374, + "mean_token_accuracy": 0.9884009838104248, + "num_tokens": 91530511.0, + "step": 11275 + }, + { + "entropy": 0.1478370100259781, + "epoch": 6.3656884875846504, + "grad_norm": 1.611417531967163, + "learning_rate": 4.1383328759252935e-06, + "loss": 0.031, + "mean_token_accuracy": 0.9910028457641602, + "num_tokens": 91571131.0, + "step": 11280 + }, + { + "entropy": 0.14289433658123016, + "epoch": 6.368510158013544, + "grad_norm": 1.8513978719711304, + "learning_rate": 4.137633547228058e-06, + "loss": 0.0401, + "mean_token_accuracy": 0.9881711721420288, + "num_tokens": 91612000.0, + "step": 11285 + }, + { + "entropy": 0.1438445121049881, + "epoch": 6.371331828442438, + "grad_norm": 2.4446709156036377, + "learning_rate": 4.13693401959883e-06, + "loss": 0.0344, + "mean_token_accuracy": 0.9899812698364258, + "num_tokens": 91652645.0, + "step": 11290 + }, + { + "entropy": 0.1517265945672989, + "epoch": 6.3741534988713315, + "grad_norm": 1.8532661199569702, + "learning_rate": 4.136234293175033e-06, + "loss": 0.0325, + "mean_token_accuracy": 0.9901606917381287, + "num_tokens": 91693216.0, + "step": 11295 + }, + { + "entropy": 0.1488794654607773, + "epoch": 6.376975169300226, + "grad_norm": 1.7429970502853394, + "learning_rate": 4.135534368094127e-06, + "loss": 0.0413, + "mean_token_accuracy": 0.9876135468482972, + "num_tokens": 91733904.0, + "step": 11300 + }, + { + "entropy": 0.15159515142440796, + "epoch": 6.37979683972912, + "grad_norm": 1.6426184177398682, + "learning_rate": 4.1348342444936134e-06, + "loss": 0.0405, + "mean_token_accuracy": 0.9885798454284668, + "num_tokens": 91774778.0, + "step": 11305 + }, + { + "entropy": 0.14828484505414963, + "epoch": 6.3826185101580135, + "grad_norm": 1.2837458848953247, + "learning_rate": 4.134133922511032e-06, + "loss": 0.0389, + "mean_token_accuracy": 0.9885473847389221, + "num_tokens": 91815379.0, + "step": 11310 + }, + { + "entropy": 0.13970376253128053, + "epoch": 6.385440180586907, + "grad_norm": 1.9679341316223145, + "learning_rate": 4.133433402283958e-06, + "loss": 0.0327, + "mean_token_accuracy": 0.9905824422836303, + "num_tokens": 91855882.0, + "step": 11315 + }, + { + "entropy": 0.13457007706165314, + "epoch": 6.388261851015802, + "grad_norm": 1.7451153993606567, + "learning_rate": 4.132732683950013e-06, + "loss": 0.0351, + "mean_token_accuracy": 0.9886767625808716, + "num_tokens": 91896363.0, + "step": 11320 + }, + { + "entropy": 0.14434319883584976, + "epoch": 6.3910835214446955, + "grad_norm": 2.0313048362731934, + "learning_rate": 4.13203176764685e-06, + "loss": 0.0348, + "mean_token_accuracy": 0.9894550204277038, + "num_tokens": 91936957.0, + "step": 11325 + }, + { + "entropy": 0.14290865659713745, + "epoch": 6.393905191873589, + "grad_norm": 1.3977835178375244, + "learning_rate": 4.131330653512167e-06, + "loss": 0.0361, + "mean_token_accuracy": 0.9899007439613342, + "num_tokens": 91977634.0, + "step": 11330 + }, + { + "entropy": 0.14777366816997528, + "epoch": 6.396726862302483, + "grad_norm": 1.7547193765640259, + "learning_rate": 4.130629341683695e-06, + "loss": 0.0359, + "mean_token_accuracy": 0.9890438318252563, + "num_tokens": 92018299.0, + "step": 11335 + }, + { + "entropy": 0.16221860945224761, + "epoch": 6.399548532731377, + "grad_norm": 2.0626728534698486, + "learning_rate": 4.129927832299209e-06, + "loss": 0.0361, + "mean_token_accuracy": 0.989508330821991, + "num_tokens": 92058961.0, + "step": 11340 + }, + { + "entropy": 0.13226877599954606, + "epoch": 6.402370203160271, + "grad_norm": 1.7659369707107544, + "learning_rate": 4.129226125496519e-06, + "loss": 0.0363, + "mean_token_accuracy": 0.9892664194107056, + "num_tokens": 92099870.0, + "step": 11345 + }, + { + "entropy": 0.14557516872882842, + "epoch": 6.405191873589165, + "grad_norm": 2.086122751235962, + "learning_rate": 4.128524221413477e-06, + "loss": 0.0347, + "mean_token_accuracy": 0.9899450778961182, + "num_tokens": 92140746.0, + "step": 11350 + }, + { + "entropy": 0.15533352494239808, + "epoch": 6.408013544018059, + "grad_norm": 2.2414488792419434, + "learning_rate": 4.12782212018797e-06, + "loss": 0.0414, + "mean_token_accuracy": 0.9879919528961182, + "num_tokens": 92181327.0, + "step": 11355 + }, + { + "entropy": 0.14174574315547944, + "epoch": 6.410835214446952, + "grad_norm": 1.9369492530822754, + "learning_rate": 4.127119821957927e-06, + "loss": 0.0337, + "mean_token_accuracy": 0.9892768621444702, + "num_tokens": 92221857.0, + "step": 11360 + }, + { + "entropy": 0.13032906055450438, + "epoch": 6.413656884875847, + "grad_norm": 1.955140471458435, + "learning_rate": 4.126417326861316e-06, + "loss": 0.0331, + "mean_token_accuracy": 0.9905326247215271, + "num_tokens": 92262534.0, + "step": 11365 + }, + { + "entropy": 0.13662864565849303, + "epoch": 6.4164785553047405, + "grad_norm": 1.978060245513916, + "learning_rate": 4.1257146350361395e-06, + "loss": 0.0436, + "mean_token_accuracy": 0.9868420958518982, + "num_tokens": 92303036.0, + "step": 11370 + }, + { + "entropy": 0.15215873420238496, + "epoch": 6.419300225733634, + "grad_norm": 2.898237466812134, + "learning_rate": 4.125011746620444e-06, + "loss": 0.0403, + "mean_token_accuracy": 0.9886619448661804, + "num_tokens": 92342876.0, + "step": 11375 + }, + { + "entropy": 0.16599242389202118, + "epoch": 6.422121896162528, + "grad_norm": 1.9874509572982788, + "learning_rate": 4.1243086617523105e-06, + "loss": 0.0404, + "mean_token_accuracy": 0.98833726644516, + "num_tokens": 92383402.0, + "step": 11380 + }, + { + "entropy": 0.14803114980459214, + "epoch": 6.4249435665914225, + "grad_norm": 2.0754692554473877, + "learning_rate": 4.12360538056986e-06, + "loss": 0.0355, + "mean_token_accuracy": 0.9896466255187988, + "num_tokens": 92423873.0, + "step": 11385 + }, + { + "entropy": 0.14688670337200166, + "epoch": 6.427765237020316, + "grad_norm": 1.7059087753295898, + "learning_rate": 4.122901903211254e-06, + "loss": 0.0363, + "mean_token_accuracy": 0.9896903872489929, + "num_tokens": 92464592.0, + "step": 11390 + }, + { + "entropy": 0.137593112885952, + "epoch": 6.43058690744921, + "grad_norm": 1.807116985321045, + "learning_rate": 4.122198229814689e-06, + "loss": 0.0329, + "mean_token_accuracy": 0.9902451157569885, + "num_tokens": 92505357.0, + "step": 11395 + }, + { + "entropy": 0.1294647052884102, + "epoch": 6.433408577878104, + "grad_norm": 1.739324927330017, + "learning_rate": 4.121494360518401e-06, + "loss": 0.0359, + "mean_token_accuracy": 0.9892550468444824, + "num_tokens": 92546032.0, + "step": 11400 + }, + { + "entropy": 0.16134763658046722, + "epoch": 6.436230248306998, + "grad_norm": 1.8106449842453003, + "learning_rate": 4.120790295460668e-06, + "loss": 0.0378, + "mean_token_accuracy": 0.9890878438949585, + "num_tokens": 92586727.0, + "step": 11405 + }, + { + "entropy": 0.13998755365610122, + "epoch": 6.439051918735892, + "grad_norm": 1.5401268005371094, + "learning_rate": 4.1200860347798e-06, + "loss": 0.0369, + "mean_token_accuracy": 0.9888973951339721, + "num_tokens": 92627301.0, + "step": 11410 + }, + { + "entropy": 0.14669421464204788, + "epoch": 6.441873589164786, + "grad_norm": 1.9903889894485474, + "learning_rate": 4.119381578614153e-06, + "loss": 0.0388, + "mean_token_accuracy": 0.9879800200462341, + "num_tokens": 92668052.0, + "step": 11415 + }, + { + "entropy": 0.14707699716091155, + "epoch": 6.444695259593679, + "grad_norm": 1.9357092380523682, + "learning_rate": 4.118676927102115e-06, + "loss": 0.0356, + "mean_token_accuracy": 0.9894900679588318, + "num_tokens": 92708627.0, + "step": 11420 + }, + { + "entropy": 0.14949690401554108, + "epoch": 6.447516930022573, + "grad_norm": 1.9960310459136963, + "learning_rate": 4.117972080382115e-06, + "loss": 0.0378, + "mean_token_accuracy": 0.9885159015655518, + "num_tokens": 92749188.0, + "step": 11425 + }, + { + "entropy": 0.14854080080986024, + "epoch": 6.450338600451468, + "grad_norm": 1.9244252443313599, + "learning_rate": 4.117267038592621e-06, + "loss": 0.0391, + "mean_token_accuracy": 0.988319730758667, + "num_tokens": 92789998.0, + "step": 11430 + }, + { + "entropy": 0.14139661490917205, + "epoch": 6.453160270880361, + "grad_norm": 1.5770515203475952, + "learning_rate": 4.1165618018721385e-06, + "loss": 0.0379, + "mean_token_accuracy": 0.9883534550666809, + "num_tokens": 92830742.0, + "step": 11435 + }, + { + "entropy": 0.1313648045063019, + "epoch": 6.455981941309255, + "grad_norm": 1.4559335708618164, + "learning_rate": 4.115856370359211e-06, + "loss": 0.0352, + "mean_token_accuracy": 0.9895216822624207, + "num_tokens": 92871340.0, + "step": 11440 + }, + { + "entropy": 0.14764813482761383, + "epoch": 6.458803611738149, + "grad_norm": 1.990279197692871, + "learning_rate": 4.115150744192421e-06, + "loss": 0.0383, + "mean_token_accuracy": 0.9883626222610473, + "num_tokens": 92911986.0, + "step": 11445 + }, + { + "entropy": 0.14003989696502686, + "epoch": 6.461625282167043, + "grad_norm": 2.3407559394836426, + "learning_rate": 4.114444923510388e-06, + "loss": 0.0353, + "mean_token_accuracy": 0.9898525953292847, + "num_tokens": 92952713.0, + "step": 11450 + }, + { + "entropy": 0.13716760277748108, + "epoch": 6.464446952595937, + "grad_norm": 1.8746510744094849, + "learning_rate": 4.113738908451771e-06, + "loss": 0.0314, + "mean_token_accuracy": 0.9901943325996398, + "num_tokens": 92993264.0, + "step": 11455 + }, + { + "entropy": 0.14014555811882018, + "epoch": 6.467268623024831, + "grad_norm": 1.970062255859375, + "learning_rate": 4.113032699155268e-06, + "loss": 0.042, + "mean_token_accuracy": 0.9883641839027405, + "num_tokens": 93033836.0, + "step": 11460 + }, + { + "entropy": 0.15491693019866942, + "epoch": 6.470090293453724, + "grad_norm": 1.8556119203567505, + "learning_rate": 4.112326295759612e-06, + "loss": 0.0394, + "mean_token_accuracy": 0.9879988074302674, + "num_tokens": 93074450.0, + "step": 11465 + }, + { + "entropy": 0.1430242419242859, + "epoch": 6.472911963882619, + "grad_norm": 2.5160515308380127, + "learning_rate": 4.111619698403577e-06, + "loss": 0.0413, + "mean_token_accuracy": 0.9870900750160218, + "num_tokens": 93115187.0, + "step": 11470 + }, + { + "entropy": 0.1444158598780632, + "epoch": 6.475733634311513, + "grad_norm": 1.70442795753479, + "learning_rate": 4.110912907225974e-06, + "loss": 0.0362, + "mean_token_accuracy": 0.9890662670135498, + "num_tokens": 93155595.0, + "step": 11475 + }, + { + "entropy": 0.13908229172229766, + "epoch": 6.478555304740406, + "grad_norm": 1.6484752893447876, + "learning_rate": 4.110205922365652e-06, + "loss": 0.0341, + "mean_token_accuracy": 0.9901829957962036, + "num_tokens": 93196137.0, + "step": 11480 + }, + { + "entropy": 0.1475514531135559, + "epoch": 6.4813769751693, + "grad_norm": 1.759164571762085, + "learning_rate": 4.1094987439615e-06, + "loss": 0.0328, + "mean_token_accuracy": 0.990956437587738, + "num_tokens": 93236943.0, + "step": 11485 + }, + { + "entropy": 0.16040101349353791, + "epoch": 6.484198645598195, + "grad_norm": 2.2340142726898193, + "learning_rate": 4.10879137215244e-06, + "loss": 0.038, + "mean_token_accuracy": 0.9892443895339966, + "num_tokens": 93277309.0, + "step": 11490 + }, + { + "entropy": 0.1610852897167206, + "epoch": 6.487020316027088, + "grad_norm": 1.9512050151824951, + "learning_rate": 4.108083807077437e-06, + "loss": 0.0445, + "mean_token_accuracy": 0.9860481977462768, + "num_tokens": 93317646.0, + "step": 11495 + }, + { + "entropy": 0.15572181046009065, + "epoch": 6.489841986455982, + "grad_norm": 2.4991040229797363, + "learning_rate": 4.1073760488754935e-06, + "loss": 0.0448, + "mean_token_accuracy": 0.9868571162223816, + "num_tokens": 93358277.0, + "step": 11500 + }, + { + "epoch": 6.489841986455982, + "eval_entropy": 0.1996920257806778, + "eval_loss": 0.023422840982675552, + "eval_mean_token_accuracy": 0.9938673973083496, + "eval_num_tokens": 93358277.0, + "eval_runtime": 0.1638, + "eval_samples_per_second": 24.421, + "eval_steps_per_second": 6.105, + "step": 11500 + }, + { + "entropy": 0.1445915535092354, + "epoch": 6.492663656884876, + "grad_norm": 1.8517767190933228, + "learning_rate": 4.106668097685647e-06, + "loss": 0.0377, + "mean_token_accuracy": 0.9889588594436646, + "num_tokens": 93398334.0, + "step": 11505 + }, + { + "entropy": 0.15220266580581665, + "epoch": 6.495485327313769, + "grad_norm": 1.8239866495132446, + "learning_rate": 4.105959953646975e-06, + "loss": 0.0392, + "mean_token_accuracy": 0.9890074372291565, + "num_tokens": 93438747.0, + "step": 11510 + }, + { + "entropy": 0.150148668885231, + "epoch": 6.498306997742664, + "grad_norm": 1.4895845651626587, + "learning_rate": 4.105251616898592e-06, + "loss": 0.0376, + "mean_token_accuracy": 0.9892801523208619, + "num_tokens": 93479380.0, + "step": 11515 + }, + { + "entropy": 0.14549075663089753, + "epoch": 6.501128668171558, + "grad_norm": 1.9964500665664673, + "learning_rate": 4.104543087579652e-06, + "loss": 0.0315, + "mean_token_accuracy": 0.9905564427375794, + "num_tokens": 93520126.0, + "step": 11520 + }, + { + "entropy": 0.1546286165714264, + "epoch": 6.503950338600451, + "grad_norm": 2.1431045532226562, + "learning_rate": 4.103834365829346e-06, + "loss": 0.0387, + "mean_token_accuracy": 0.9886301279067993, + "num_tokens": 93560843.0, + "step": 11525 + }, + { + "entropy": 0.14803966879844666, + "epoch": 6.506772009029345, + "grad_norm": 1.7916501760482788, + "learning_rate": 4.1031254517869e-06, + "loss": 0.0302, + "mean_token_accuracy": 0.9909810662269593, + "num_tokens": 93601247.0, + "step": 11530 + }, + { + "entropy": 0.14564967155456543, + "epoch": 6.50959367945824, + "grad_norm": 2.3298871517181396, + "learning_rate": 4.102416345591583e-06, + "loss": 0.0387, + "mean_token_accuracy": 0.9888354897499084, + "num_tokens": 93641820.0, + "step": 11535 + }, + { + "entropy": 0.1511426329612732, + "epoch": 6.512415349887133, + "grad_norm": 1.7131050825119019, + "learning_rate": 4.101707047382697e-06, + "loss": 0.0292, + "mean_token_accuracy": 0.9912442445755005, + "num_tokens": 93682410.0, + "step": 11540 + }, + { + "entropy": 0.14388006627559663, + "epoch": 6.515237020316027, + "grad_norm": 1.829458475112915, + "learning_rate": 4.100997557299585e-06, + "loss": 0.0371, + "mean_token_accuracy": 0.9888779282569885, + "num_tokens": 93723161.0, + "step": 11545 + }, + { + "entropy": 0.15411291718482972, + "epoch": 6.518058690744921, + "grad_norm": 2.0121374130249023, + "learning_rate": 4.100287875481627e-06, + "loss": 0.0366, + "mean_token_accuracy": 0.9880946636199951, + "num_tokens": 93763892.0, + "step": 11550 + }, + { + "entropy": 0.16307685375213624, + "epoch": 6.520880361173814, + "grad_norm": 2.0615103244781494, + "learning_rate": 4.099578002068238e-06, + "loss": 0.0366, + "mean_token_accuracy": 0.9895309686660767, + "num_tokens": 93804514.0, + "step": 11555 + }, + { + "entropy": 0.14214831590652466, + "epoch": 6.523702031602709, + "grad_norm": 1.7495334148406982, + "learning_rate": 4.098867937198873e-06, + "loss": 0.0356, + "mean_token_accuracy": 0.989754056930542, + "num_tokens": 93844886.0, + "step": 11560 + }, + { + "entropy": 0.13692098259925842, + "epoch": 6.526523702031603, + "grad_norm": 2.2061078548431396, + "learning_rate": 4.098157681013027e-06, + "loss": 0.0405, + "mean_token_accuracy": 0.9879782795906067, + "num_tokens": 93885343.0, + "step": 11565 + }, + { + "entropy": 0.1372235894203186, + "epoch": 6.529345372460496, + "grad_norm": 1.7683453559875488, + "learning_rate": 4.097447233650226e-06, + "loss": 0.0356, + "mean_token_accuracy": 0.9889456748962402, + "num_tokens": 93926121.0, + "step": 11570 + }, + { + "entropy": 0.15331811606884002, + "epoch": 6.532167042889391, + "grad_norm": 2.0020864009857178, + "learning_rate": 4.0967365952500416e-06, + "loss": 0.0332, + "mean_token_accuracy": 0.9897879481315612, + "num_tokens": 93966780.0, + "step": 11575 + }, + { + "entropy": 0.154784095287323, + "epoch": 6.534988713318285, + "grad_norm": 1.7012702226638794, + "learning_rate": 4.096025765952076e-06, + "loss": 0.0342, + "mean_token_accuracy": 0.9907065868377686, + "num_tokens": 94007365.0, + "step": 11580 + }, + { + "entropy": 0.13629684150218963, + "epoch": 6.537810383747178, + "grad_norm": 2.087913990020752, + "learning_rate": 4.095314745895972e-06, + "loss": 0.0376, + "mean_token_accuracy": 0.9887621521949768, + "num_tokens": 94048019.0, + "step": 11585 + }, + { + "entropy": 0.14386317431926726, + "epoch": 6.540632054176072, + "grad_norm": 1.9246793985366821, + "learning_rate": 4.0946035352214106e-06, + "loss": 0.0308, + "mean_token_accuracy": 0.990863585472107, + "num_tokens": 94088462.0, + "step": 11590 + }, + { + "entropy": 0.1515270948410034, + "epoch": 6.543453724604966, + "grad_norm": 2.080392837524414, + "learning_rate": 4.093892134068108e-06, + "loss": 0.0403, + "mean_token_accuracy": 0.9882361173629761, + "num_tokens": 94128923.0, + "step": 11595 + }, + { + "entropy": 0.15317191779613495, + "epoch": 6.54627539503386, + "grad_norm": 1.7559170722961426, + "learning_rate": 4.09318054257582e-06, + "loss": 0.036, + "mean_token_accuracy": 0.9891046166419983, + "num_tokens": 94169548.0, + "step": 11600 + }, + { + "entropy": 0.1485581949353218, + "epoch": 6.549097065462754, + "grad_norm": 1.9086334705352783, + "learning_rate": 4.092468760884338e-06, + "loss": 0.0403, + "mean_token_accuracy": 0.9878690481185913, + "num_tokens": 94210344.0, + "step": 11605 + }, + { + "entropy": 0.14423695504665374, + "epoch": 6.551918735891648, + "grad_norm": 2.0206217765808105, + "learning_rate": 4.0917567891334935e-06, + "loss": 0.0367, + "mean_token_accuracy": 0.9891125440597535, + "num_tokens": 94250622.0, + "step": 11610 + }, + { + "entropy": 0.14233740866184236, + "epoch": 6.5547404063205414, + "grad_norm": 2.0237369537353516, + "learning_rate": 4.091044627463151e-06, + "loss": 0.033, + "mean_token_accuracy": 0.9899827241897583, + "num_tokens": 94291129.0, + "step": 11615 + }, + { + "entropy": 0.1390757292509079, + "epoch": 6.557562076749436, + "grad_norm": 2.0195369720458984, + "learning_rate": 4.0903322760132165e-06, + "loss": 0.0393, + "mean_token_accuracy": 0.9883224606513977, + "num_tokens": 94331922.0, + "step": 11620 + }, + { + "entropy": 0.15587489008903505, + "epoch": 6.56038374717833, + "grad_norm": 2.020552158355713, + "learning_rate": 4.0896197349236306e-06, + "loss": 0.0397, + "mean_token_accuracy": 0.9889486074447632, + "num_tokens": 94372528.0, + "step": 11625 + }, + { + "entropy": 0.15144974291324614, + "epoch": 6.563205417607223, + "grad_norm": 1.6519806385040283, + "learning_rate": 4.0889070043343725e-06, + "loss": 0.0409, + "mean_token_accuracy": 0.9875991225242615, + "num_tokens": 94413314.0, + "step": 11630 + }, + { + "entropy": 0.16432562172412873, + "epoch": 6.566027088036117, + "grad_norm": 2.0881333351135254, + "learning_rate": 4.088194084385459e-06, + "loss": 0.0379, + "mean_token_accuracy": 0.9884780287742615, + "num_tokens": 94453628.0, + "step": 11635 + }, + { + "entropy": 0.15171082615852355, + "epoch": 6.568848758465011, + "grad_norm": 1.8627437353134155, + "learning_rate": 4.08748097521694e-06, + "loss": 0.0423, + "mean_token_accuracy": 0.9872801780700684, + "num_tokens": 94494421.0, + "step": 11640 + }, + { + "entropy": 0.1384830117225647, + "epoch": 6.571670428893905, + "grad_norm": 1.6030412912368774, + "learning_rate": 4.0867676769689104e-06, + "loss": 0.0408, + "mean_token_accuracy": 0.988001823425293, + "num_tokens": 94534981.0, + "step": 11645 + }, + { + "entropy": 0.15062596201896666, + "epoch": 6.574492099322799, + "grad_norm": 2.3316404819488525, + "learning_rate": 4.086054189781495e-06, + "loss": 0.0375, + "mean_token_accuracy": 0.9890991806983948, + "num_tokens": 94575864.0, + "step": 11650 + }, + { + "entropy": 0.15638395845890046, + "epoch": 6.577313769751693, + "grad_norm": 2.025421142578125, + "learning_rate": 4.085340513794859e-06, + "loss": 0.032, + "mean_token_accuracy": 0.9899617671966553, + "num_tokens": 94616541.0, + "step": 11655 + }, + { + "entropy": 0.1384786307811737, + "epoch": 6.580135440180587, + "grad_norm": 2.037663459777832, + "learning_rate": 4.084626649149204e-06, + "loss": 0.0357, + "mean_token_accuracy": 0.9894083857536315, + "num_tokens": 94657102.0, + "step": 11660 + }, + { + "entropy": 0.14564592242240906, + "epoch": 6.582957110609481, + "grad_norm": 2.0661332607269287, + "learning_rate": 4.083912595984769e-06, + "loss": 0.0367, + "mean_token_accuracy": 0.9883635878562927, + "num_tokens": 94697865.0, + "step": 11665 + }, + { + "entropy": 0.15481620132923127, + "epoch": 6.585778781038375, + "grad_norm": 1.89729642868042, + "learning_rate": 4.083198354441831e-06, + "loss": 0.0391, + "mean_token_accuracy": 0.9885015726089478, + "num_tokens": 94738274.0, + "step": 11670 + }, + { + "entropy": 0.14785862267017363, + "epoch": 6.5886004514672685, + "grad_norm": 2.5949220657348633, + "learning_rate": 4.082483924660701e-06, + "loss": 0.0326, + "mean_token_accuracy": 0.9899281978607177, + "num_tokens": 94778892.0, + "step": 11675 + }, + { + "entropy": 0.1447735384106636, + "epoch": 6.591422121896162, + "grad_norm": 1.6759907007217407, + "learning_rate": 4.081769306781729e-06, + "loss": 0.0374, + "mean_token_accuracy": 0.9890196442604064, + "num_tokens": 94819675.0, + "step": 11680 + }, + { + "entropy": 0.14461695849895478, + "epoch": 6.594243792325057, + "grad_norm": 1.7010953426361084, + "learning_rate": 4.081054500945303e-06, + "loss": 0.0343, + "mean_token_accuracy": 0.9894821643829346, + "num_tokens": 94860190.0, + "step": 11685 + }, + { + "entropy": 0.14982332289218903, + "epoch": 6.5970654627539504, + "grad_norm": 2.2242343425750732, + "learning_rate": 4.080339507291845e-06, + "loss": 0.0377, + "mean_token_accuracy": 0.9886794209480285, + "num_tokens": 94900923.0, + "step": 11690 + }, + { + "entropy": 0.15086464136838912, + "epoch": 6.599887133182844, + "grad_norm": 2.197476863861084, + "learning_rate": 4.079624325961818e-06, + "loss": 0.0359, + "mean_token_accuracy": 0.988996946811676, + "num_tokens": 94941485.0, + "step": 11695 + }, + { + "entropy": 0.1545707941055298, + "epoch": 6.602708803611738, + "grad_norm": 1.8683733940124512, + "learning_rate": 4.0789089570957175e-06, + "loss": 0.0394, + "mean_token_accuracy": 0.9879536390304565, + "num_tokens": 94982052.0, + "step": 11700 + }, + { + "entropy": 0.13810085207223893, + "epoch": 6.605530474040632, + "grad_norm": 2.827000856399536, + "learning_rate": 4.078193400834078e-06, + "loss": 0.0388, + "mean_token_accuracy": 0.9887730002403259, + "num_tokens": 95022767.0, + "step": 11705 + }, + { + "entropy": 0.14364120215177537, + "epoch": 6.608352144469526, + "grad_norm": 2.075237989425659, + "learning_rate": 4.077477657317471e-06, + "loss": 0.0323, + "mean_token_accuracy": 0.9897770881652832, + "num_tokens": 95063645.0, + "step": 11710 + }, + { + "entropy": 0.1465618520975113, + "epoch": 6.61117381489842, + "grad_norm": 2.806278705596924, + "learning_rate": 4.076761726686505e-06, + "loss": 0.0466, + "mean_token_accuracy": 0.9863425970077515, + "num_tokens": 95104053.0, + "step": 11715 + }, + { + "entropy": 0.14402187317609788, + "epoch": 6.6139954853273135, + "grad_norm": 1.9875422716140747, + "learning_rate": 4.076045609081824e-06, + "loss": 0.0403, + "mean_token_accuracy": 0.9877901315689087, + "num_tokens": 95144803.0, + "step": 11720 + }, + { + "entropy": 0.154840087890625, + "epoch": 6.616817155756207, + "grad_norm": 1.8063068389892578, + "learning_rate": 4.075329304644109e-06, + "loss": 0.0397, + "mean_token_accuracy": 0.9879529476165771, + "num_tokens": 95185612.0, + "step": 11725 + }, + { + "entropy": 0.13759687691926956, + "epoch": 6.619638826185102, + "grad_norm": 1.7499263286590576, + "learning_rate": 4.074612813514079e-06, + "loss": 0.0373, + "mean_token_accuracy": 0.988704776763916, + "num_tokens": 95226271.0, + "step": 11730 + }, + { + "entropy": 0.1382174924015999, + "epoch": 6.6224604966139955, + "grad_norm": 2.57856822013855, + "learning_rate": 4.073896135832488e-06, + "loss": 0.0423, + "mean_token_accuracy": 0.987503731250763, + "num_tokens": 95266935.0, + "step": 11735 + }, + { + "entropy": 0.15132977664470673, + "epoch": 6.625282167042889, + "grad_norm": 2.365727424621582, + "learning_rate": 4.073179271740128e-06, + "loss": 0.0439, + "mean_token_accuracy": 0.9868868947029114, + "num_tokens": 95307554.0, + "step": 11740 + }, + { + "entropy": 0.1502486765384674, + "epoch": 6.628103837471784, + "grad_norm": 2.003241777420044, + "learning_rate": 4.072462221377827e-06, + "loss": 0.0392, + "mean_token_accuracy": 0.9876524209976196, + "num_tokens": 95348372.0, + "step": 11745 + }, + { + "entropy": 0.15410052090883256, + "epoch": 6.6309255079006775, + "grad_norm": 1.5356718301773071, + "learning_rate": 4.07174498488645e-06, + "loss": 0.0353, + "mean_token_accuracy": 0.989504873752594, + "num_tokens": 95389076.0, + "step": 11750 + }, + { + "entropy": 0.1485922798514366, + "epoch": 6.633747178329571, + "grad_norm": 1.6456876993179321, + "learning_rate": 4.071027562406896e-06, + "loss": 0.0336, + "mean_token_accuracy": 0.9904733896255493, + "num_tokens": 95429753.0, + "step": 11755 + }, + { + "entropy": 0.1574637770652771, + "epoch": 6.636568848758465, + "grad_norm": 2.0578815937042236, + "learning_rate": 4.070309954080106e-06, + "loss": 0.0393, + "mean_token_accuracy": 0.9888066649436951, + "num_tokens": 95470465.0, + "step": 11760 + }, + { + "entropy": 0.1463371992111206, + "epoch": 6.639390519187359, + "grad_norm": 1.576945185661316, + "learning_rate": 4.069592160047051e-06, + "loss": 0.0363, + "mean_token_accuracy": 0.9890714764595032, + "num_tokens": 95510938.0, + "step": 11765 + }, + { + "entropy": 0.15714893043041228, + "epoch": 6.642212189616253, + "grad_norm": 1.784827709197998, + "learning_rate": 4.0688741804487446e-06, + "loss": 0.0374, + "mean_token_accuracy": 0.9894795179367065, + "num_tokens": 95551359.0, + "step": 11770 + }, + { + "entropy": 0.14306671023368836, + "epoch": 6.645033860045147, + "grad_norm": 1.9207254648208618, + "learning_rate": 4.0681560154262326e-06, + "loss": 0.0399, + "mean_token_accuracy": 0.9883310794830322, + "num_tokens": 95591909.0, + "step": 11775 + }, + { + "entropy": 0.14147693812847137, + "epoch": 6.6478555304740405, + "grad_norm": 2.073054552078247, + "learning_rate": 4.067437665120598e-06, + "loss": 0.0353, + "mean_token_accuracy": 0.9901525735855102, + "num_tokens": 95632630.0, + "step": 11780 + }, + { + "entropy": 0.12939851135015487, + "epoch": 6.650677200902934, + "grad_norm": 1.6735975742340088, + "learning_rate": 4.066719129672962e-06, + "loss": 0.0346, + "mean_token_accuracy": 0.9895193934440613, + "num_tokens": 95673068.0, + "step": 11785 + }, + { + "entropy": 0.13276245892047883, + "epoch": 6.653498871331829, + "grad_norm": 2.009474754333496, + "learning_rate": 4.066000409224481e-06, + "loss": 0.0377, + "mean_token_accuracy": 0.9889036655426026, + "num_tokens": 95713693.0, + "step": 11790 + }, + { + "entropy": 0.13992193043231965, + "epoch": 6.6563205417607225, + "grad_norm": 2.217224359512329, + "learning_rate": 4.0652815039163475e-06, + "loss": 0.0363, + "mean_token_accuracy": 0.9892218232154846, + "num_tokens": 95754580.0, + "step": 11795 + }, + { + "entropy": 0.14898284673690795, + "epoch": 6.659142212189616, + "grad_norm": 1.6819255352020264, + "learning_rate": 4.06456241388979e-06, + "loss": 0.0351, + "mean_token_accuracy": 0.9898673295974731, + "num_tokens": 95795256.0, + "step": 11800 + }, + { + "entropy": 0.14159813523292542, + "epoch": 6.66196388261851, + "grad_norm": 1.9627163410186768, + "learning_rate": 4.063843139286073e-06, + "loss": 0.0384, + "mean_token_accuracy": 0.9878716945648194, + "num_tokens": 95835802.0, + "step": 11805 + }, + { + "entropy": 0.1385001763701439, + "epoch": 6.664785553047404, + "grad_norm": 2.2941231727600098, + "learning_rate": 4.063123680246501e-06, + "loss": 0.0414, + "mean_token_accuracy": 0.9872376441955566, + "num_tokens": 95876651.0, + "step": 11810 + }, + { + "entropy": 0.14992179572582245, + "epoch": 6.667607223476298, + "grad_norm": 1.722368836402893, + "learning_rate": 4.062404036912409e-06, + "loss": 0.0365, + "mean_token_accuracy": 0.9891926527023316, + "num_tokens": 95917227.0, + "step": 11815 + }, + { + "entropy": 0.14835633486509323, + "epoch": 6.670428893905192, + "grad_norm": 2.5152008533477783, + "learning_rate": 4.061684209425173e-06, + "loss": 0.0407, + "mean_token_accuracy": 0.9874385356903076, + "num_tokens": 95957630.0, + "step": 11820 + }, + { + "entropy": 0.1346977487206459, + "epoch": 6.673250564334086, + "grad_norm": 2.1163482666015625, + "learning_rate": 4.060964197926201e-06, + "loss": 0.0327, + "mean_token_accuracy": 0.9909256815910339, + "num_tokens": 95998284.0, + "step": 11825 + }, + { + "entropy": 0.14587730467319487, + "epoch": 6.67607223476298, + "grad_norm": 2.0600922107696533, + "learning_rate": 4.060244002556942e-06, + "loss": 0.0354, + "mean_token_accuracy": 0.9892189502716064, + "num_tokens": 96039045.0, + "step": 11830 + }, + { + "entropy": 0.157058185338974, + "epoch": 6.678893905191874, + "grad_norm": 2.0479791164398193, + "learning_rate": 4.0595236234588755e-06, + "loss": 0.0423, + "mean_token_accuracy": 0.9872640252113343, + "num_tokens": 96079680.0, + "step": 11835 + }, + { + "entropy": 0.15050775408744813, + "epoch": 6.681715575620768, + "grad_norm": 2.05046010017395, + "learning_rate": 4.058803060773523e-06, + "loss": 0.041, + "mean_token_accuracy": 0.9878550887107849, + "num_tokens": 96120155.0, + "step": 11840 + }, + { + "entropy": 0.15183096528053283, + "epoch": 6.684537246049661, + "grad_norm": 2.1310324668884277, + "learning_rate": 4.058082314642438e-06, + "loss": 0.0417, + "mean_token_accuracy": 0.9870881080627442, + "num_tokens": 96160925.0, + "step": 11845 + }, + { + "entropy": 0.13959557116031646, + "epoch": 6.687358916478555, + "grad_norm": 2.0781123638153076, + "learning_rate": 4.05736138520721e-06, + "loss": 0.0383, + "mean_token_accuracy": 0.9885563850402832, + "num_tokens": 96201452.0, + "step": 11850 + }, + { + "entropy": 0.16393175423145295, + "epoch": 6.6901805869074495, + "grad_norm": 1.6570488214492798, + "learning_rate": 4.056640272609467e-06, + "loss": 0.0327, + "mean_token_accuracy": 0.9903802037239074, + "num_tokens": 96242166.0, + "step": 11855 + }, + { + "entropy": 0.1479017674922943, + "epoch": 6.693002257336343, + "grad_norm": 1.9548949003219604, + "learning_rate": 4.055918976990872e-06, + "loss": 0.0321, + "mean_token_accuracy": 0.9904338598251343, + "num_tokens": 96282855.0, + "step": 11860 + }, + { + "entropy": 0.1482335865497589, + "epoch": 6.695823927765237, + "grad_norm": 2.1175918579101562, + "learning_rate": 4.055197498493123e-06, + "loss": 0.0418, + "mean_token_accuracy": 0.9874824643135071, + "num_tokens": 96323775.0, + "step": 11865 + }, + { + "entropy": 0.14961565434932708, + "epoch": 6.698645598194131, + "grad_norm": 1.942327618598938, + "learning_rate": 4.054475837257953e-06, + "loss": 0.032, + "mean_token_accuracy": 0.9901739597320557, + "num_tokens": 96364548.0, + "step": 11870 + }, + { + "entropy": 0.15366184413433076, + "epoch": 6.701467268623025, + "grad_norm": 2.2145068645477295, + "learning_rate": 4.053753993427135e-06, + "loss": 0.0378, + "mean_token_accuracy": 0.988074266910553, + "num_tokens": 96404933.0, + "step": 11875 + }, + { + "entropy": 0.15021361410617828, + "epoch": 6.704288939051919, + "grad_norm": 1.6474920511245728, + "learning_rate": 4.053031967142475e-06, + "loss": 0.0341, + "mean_token_accuracy": 0.98985835313797, + "num_tokens": 96445402.0, + "step": 11880 + }, + { + "entropy": 0.16894918978214263, + "epoch": 6.707110609480813, + "grad_norm": 1.8737285137176514, + "learning_rate": 4.052309758545813e-06, + "loss": 0.0475, + "mean_token_accuracy": 0.9867665410041809, + "num_tokens": 96486119.0, + "step": 11885 + }, + { + "entropy": 0.15583183765411376, + "epoch": 6.709932279909706, + "grad_norm": 1.9295543432235718, + "learning_rate": 4.051587367779029e-06, + "loss": 0.0321, + "mean_token_accuracy": 0.9901478886604309, + "num_tokens": 96526926.0, + "step": 11890 + }, + { + "entropy": 0.15508732497692107, + "epoch": 6.7127539503386, + "grad_norm": 1.60502290725708, + "learning_rate": 4.050864794984036e-06, + "loss": 0.0316, + "mean_token_accuracy": 0.9912713050842286, + "num_tokens": 96567737.0, + "step": 11895 + }, + { + "entropy": 0.15691338181495668, + "epoch": 6.715575620767495, + "grad_norm": 1.7124838829040527, + "learning_rate": 4.050142040302784e-06, + "loss": 0.0368, + "mean_token_accuracy": 0.9893094897270203, + "num_tokens": 96607281.0, + "step": 11900 + }, + { + "entropy": 0.1330145627260208, + "epoch": 6.718397291196388, + "grad_norm": 1.8225212097167969, + "learning_rate": 4.049419103877258e-06, + "loss": 0.035, + "mean_token_accuracy": 0.9894989252090454, + "num_tokens": 96648016.0, + "step": 11905 + }, + { + "entropy": 0.15559065639972686, + "epoch": 6.721218961625282, + "grad_norm": 1.6894086599349976, + "learning_rate": 4.048695985849479e-06, + "loss": 0.036, + "mean_token_accuracy": 0.9887082815170288, + "num_tokens": 96688807.0, + "step": 11910 + }, + { + "entropy": 0.14794491827487946, + "epoch": 6.724040632054177, + "grad_norm": 1.7598350048065186, + "learning_rate": 4.047972686361503e-06, + "loss": 0.0399, + "mean_token_accuracy": 0.9888983845710755, + "num_tokens": 96729457.0, + "step": 11915 + }, + { + "entropy": 0.1369953766465187, + "epoch": 6.72686230248307, + "grad_norm": 1.9057563543319702, + "learning_rate": 4.047249205555423e-06, + "loss": 0.0321, + "mean_token_accuracy": 0.9902025699615479, + "num_tokens": 96769286.0, + "step": 11920 + }, + { + "entropy": 0.14871995151042938, + "epoch": 6.729683972911964, + "grad_norm": 2.072307586669922, + "learning_rate": 4.046525543573366e-06, + "loss": 0.0397, + "mean_token_accuracy": 0.9880092859268188, + "num_tokens": 96809834.0, + "step": 11925 + }, + { + "entropy": 0.13897217512130738, + "epoch": 6.732505643340858, + "grad_norm": 2.005223274230957, + "learning_rate": 4.045801700557497e-06, + "loss": 0.0378, + "mean_token_accuracy": 0.9891113877296448, + "num_tokens": 96850609.0, + "step": 11930 + }, + { + "entropy": 0.1497331142425537, + "epoch": 6.735327313769751, + "grad_norm": 2.2140676975250244, + "learning_rate": 4.045077676650014e-06, + "loss": 0.0385, + "mean_token_accuracy": 0.9883991718292237, + "num_tokens": 96891222.0, + "step": 11935 + }, + { + "entropy": 0.14488416612148286, + "epoch": 6.738148984198646, + "grad_norm": 1.802141547203064, + "learning_rate": 4.044353471993152e-06, + "loss": 0.0366, + "mean_token_accuracy": 0.9881270170211792, + "num_tokens": 96931928.0, + "step": 11940 + }, + { + "entropy": 0.14439502954483033, + "epoch": 6.74097065462754, + "grad_norm": 2.055842399597168, + "learning_rate": 4.0436290867291806e-06, + "loss": 0.0436, + "mean_token_accuracy": 0.986508822441101, + "num_tokens": 96972565.0, + "step": 11945 + }, + { + "entropy": 0.14606373012065887, + "epoch": 6.743792325056433, + "grad_norm": 1.8258408308029175, + "learning_rate": 4.042904521000406e-06, + "loss": 0.0348, + "mean_token_accuracy": 0.9901708602905274, + "num_tokens": 97013177.0, + "step": 11950 + }, + { + "entropy": 0.15913362801074982, + "epoch": 6.746613995485327, + "grad_norm": 2.4801416397094727, + "learning_rate": 4.042179774949169e-06, + "loss": 0.0495, + "mean_token_accuracy": 0.9845641493797302, + "num_tokens": 97053721.0, + "step": 11955 + }, + { + "entropy": 0.1454220324754715, + "epoch": 6.749435665914222, + "grad_norm": 2.5556042194366455, + "learning_rate": 4.041454848717845e-06, + "loss": 0.0492, + "mean_token_accuracy": 0.9852368235588074, + "num_tokens": 97094386.0, + "step": 11960 + }, + { + "entropy": 0.14474825710058212, + "epoch": 6.752257336343115, + "grad_norm": 1.8604875802993774, + "learning_rate": 4.040729742448848e-06, + "loss": 0.0389, + "mean_token_accuracy": 0.9878225922584534, + "num_tokens": 97134920.0, + "step": 11965 + }, + { + "entropy": 0.14345545172691346, + "epoch": 6.755079006772009, + "grad_norm": 2.0168981552124023, + "learning_rate": 4.040004456284623e-06, + "loss": 0.0399, + "mean_token_accuracy": 0.9885003328323364, + "num_tokens": 97175551.0, + "step": 11970 + }, + { + "entropy": 0.14142475724220277, + "epoch": 6.757900677200903, + "grad_norm": 1.5360438823699951, + "learning_rate": 4.0392789903676545e-06, + "loss": 0.0319, + "mean_token_accuracy": 0.9909959673881531, + "num_tokens": 97216255.0, + "step": 11975 + }, + { + "entropy": 0.14621337950229646, + "epoch": 6.760722347629796, + "grad_norm": 1.5952950716018677, + "learning_rate": 4.03855334484046e-06, + "loss": 0.0362, + "mean_token_accuracy": 0.9889961838722229, + "num_tokens": 97257015.0, + "step": 11980 + }, + { + "entropy": 0.14792815148830413, + "epoch": 6.763544018058691, + "grad_norm": 2.06760835647583, + "learning_rate": 4.037827519845591e-06, + "loss": 0.0299, + "mean_token_accuracy": 0.9917079448699951, + "num_tokens": 97297751.0, + "step": 11985 + }, + { + "entropy": 0.1481049656867981, + "epoch": 6.766365688487585, + "grad_norm": 1.856473684310913, + "learning_rate": 4.037101515525637e-06, + "loss": 0.0368, + "mean_token_accuracy": 0.9893781185150147, + "num_tokens": 97338310.0, + "step": 11990 + }, + { + "entropy": 0.14933787584304808, + "epoch": 6.769187358916478, + "grad_norm": 1.7966928482055664, + "learning_rate": 4.036375332023222e-06, + "loss": 0.0408, + "mean_token_accuracy": 0.9873141407966614, + "num_tokens": 97378801.0, + "step": 11995 + }, + { + "entropy": 0.15227662324905394, + "epoch": 6.772009029345372, + "grad_norm": 2.43129301071167, + "learning_rate": 4.0356489694810055e-06, + "loss": 0.0365, + "mean_token_accuracy": 0.9891417384147644, + "num_tokens": 97419558.0, + "step": 12000 + }, + { + "epoch": 6.772009029345372, + "eval_entropy": 0.1956140100955963, + "eval_loss": 0.012606015428900719, + "eval_mean_token_accuracy": 0.9965503811836243, + "eval_num_tokens": 97419558.0, + "eval_runtime": 0.1638, + "eval_samples_per_second": 24.418, + "eval_steps_per_second": 6.105, + "step": 12000 + }, + { + "entropy": 0.15393645465373992, + "epoch": 6.774830699774267, + "grad_norm": 2.1532039642333984, + "learning_rate": 4.03492242804168e-06, + "loss": 0.0442, + "mean_token_accuracy": 0.9862104415893554, + "num_tokens": 97460095.0, + "step": 12005 + }, + { + "entropy": 0.1350156396627426, + "epoch": 6.77765237020316, + "grad_norm": 1.9720470905303955, + "learning_rate": 4.034195707847975e-06, + "loss": 0.0332, + "mean_token_accuracy": 0.9898574113845825, + "num_tokens": 97500756.0, + "step": 12010 + }, + { + "entropy": 0.1415847659111023, + "epoch": 6.780474040632054, + "grad_norm": 1.9140161275863647, + "learning_rate": 4.033468809042655e-06, + "loss": 0.0365, + "mean_token_accuracy": 0.9890129208564759, + "num_tokens": 97541530.0, + "step": 12015 + }, + { + "entropy": 0.14546948671340942, + "epoch": 6.783295711060948, + "grad_norm": 2.1538074016571045, + "learning_rate": 4.032741731768519e-06, + "loss": 0.0406, + "mean_token_accuracy": 0.986963152885437, + "num_tokens": 97581567.0, + "step": 12020 + }, + { + "entropy": 0.14426875561475755, + "epoch": 6.786117381489842, + "grad_norm": 1.9157716035842896, + "learning_rate": 4.032014476168403e-06, + "loss": 0.0355, + "mean_token_accuracy": 0.9900845766067505, + "num_tokens": 97622196.0, + "step": 12025 + }, + { + "entropy": 0.14931005537509917, + "epoch": 6.788939051918736, + "grad_norm": 1.9808646440505981, + "learning_rate": 4.031287042385174e-06, + "loss": 0.0398, + "mean_token_accuracy": 0.9882163524627685, + "num_tokens": 97662914.0, + "step": 12030 + }, + { + "entropy": 0.1658826380968094, + "epoch": 6.79176072234763, + "grad_norm": 2.1118521690368652, + "learning_rate": 4.030559430561738e-06, + "loss": 0.0371, + "mean_token_accuracy": 0.9887429594993591, + "num_tokens": 97703480.0, + "step": 12035 + }, + { + "entropy": 0.15551794469356536, + "epoch": 6.794582392776523, + "grad_norm": 2.5292036533355713, + "learning_rate": 4.029831640841035e-06, + "loss": 0.0463, + "mean_token_accuracy": 0.9871610522270202, + "num_tokens": 97744240.0, + "step": 12040 + }, + { + "entropy": 0.15738882422447203, + "epoch": 6.797404063205418, + "grad_norm": 1.7646291255950928, + "learning_rate": 4.029103673366037e-06, + "loss": 0.0386, + "mean_token_accuracy": 0.988528847694397, + "num_tokens": 97784307.0, + "step": 12045 + }, + { + "entropy": 0.1447429984807968, + "epoch": 6.800225733634312, + "grad_norm": 2.040116310119629, + "learning_rate": 4.028375528279757e-06, + "loss": 0.0399, + "mean_token_accuracy": 0.9880939960479737, + "num_tokens": 97825063.0, + "step": 12050 + }, + { + "entropy": 0.15236329436302185, + "epoch": 6.803047404063205, + "grad_norm": 1.7679275274276733, + "learning_rate": 4.027647205725235e-06, + "loss": 0.0347, + "mean_token_accuracy": 0.9887324929237366, + "num_tokens": 97865669.0, + "step": 12055 + }, + { + "entropy": 0.15214796960353852, + "epoch": 6.805869074492099, + "grad_norm": 1.926100730895996, + "learning_rate": 4.026918705845553e-06, + "loss": 0.0309, + "mean_token_accuracy": 0.9904511570930481, + "num_tokens": 97906513.0, + "step": 12060 + }, + { + "entropy": 0.14448803663253784, + "epoch": 6.808690744920993, + "grad_norm": 1.732896327972412, + "learning_rate": 4.026190028783824e-06, + "loss": 0.0366, + "mean_token_accuracy": 0.9893592596054077, + "num_tokens": 97946867.0, + "step": 12065 + }, + { + "entropy": 0.15263217091560363, + "epoch": 6.811512415349887, + "grad_norm": 1.5818462371826172, + "learning_rate": 4.025461174683195e-06, + "loss": 0.038, + "mean_token_accuracy": 0.9896114826202392, + "num_tokens": 97987436.0, + "step": 12070 + }, + { + "entropy": 0.14199225902557372, + "epoch": 6.814334085778781, + "grad_norm": 2.4520139694213867, + "learning_rate": 4.024732143686854e-06, + "loss": 0.0343, + "mean_token_accuracy": 0.9893141746520996, + "num_tokens": 98028061.0, + "step": 12075 + }, + { + "entropy": 0.1499510258436203, + "epoch": 6.817155756207675, + "grad_norm": 2.507272720336914, + "learning_rate": 4.024002935938015e-06, + "loss": 0.0353, + "mean_token_accuracy": 0.9894481897354126, + "num_tokens": 98068816.0, + "step": 12080 + }, + { + "entropy": 0.1536535143852234, + "epoch": 6.8199774266365685, + "grad_norm": 1.787207007408142, + "learning_rate": 4.0232735515799325e-06, + "loss": 0.04, + "mean_token_accuracy": 0.9880955934524536, + "num_tokens": 98109350.0, + "step": 12085 + }, + { + "entropy": 0.15066614747047424, + "epoch": 6.822799097065463, + "grad_norm": 2.0756945610046387, + "learning_rate": 4.022543990755894e-06, + "loss": 0.0397, + "mean_token_accuracy": 0.9888906359672547, + "num_tokens": 98149953.0, + "step": 12090 + }, + { + "entropy": 0.1614619642496109, + "epoch": 6.825620767494357, + "grad_norm": 2.169306516647339, + "learning_rate": 4.021814253609222e-06, + "loss": 0.0418, + "mean_token_accuracy": 0.98787602186203, + "num_tokens": 98190613.0, + "step": 12095 + }, + { + "entropy": 0.13885380923748017, + "epoch": 6.8284424379232505, + "grad_norm": 1.65521240234375, + "learning_rate": 4.021084340283273e-06, + "loss": 0.0377, + "mean_token_accuracy": 0.9888322472572326, + "num_tokens": 98231228.0, + "step": 12100 + }, + { + "entropy": 0.13987670242786407, + "epoch": 6.831264108352144, + "grad_norm": 2.0256147384643555, + "learning_rate": 4.020354250921439e-06, + "loss": 0.0374, + "mean_token_accuracy": 0.9885939717292785, + "num_tokens": 98271713.0, + "step": 12105 + }, + { + "entropy": 0.14241887032985687, + "epoch": 6.834085778781039, + "grad_norm": 2.178802728652954, + "learning_rate": 4.0196239856671465e-06, + "loss": 0.0343, + "mean_token_accuracy": 0.9898516416549683, + "num_tokens": 98312133.0, + "step": 12110 + }, + { + "entropy": 0.15942923724651337, + "epoch": 6.836907449209932, + "grad_norm": 1.7774022817611694, + "learning_rate": 4.0188935446638545e-06, + "loss": 0.0417, + "mean_token_accuracy": 0.9870691418647766, + "num_tokens": 98352906.0, + "step": 12115 + }, + { + "entropy": 0.13171774595975877, + "epoch": 6.839729119638826, + "grad_norm": 1.836889624595642, + "learning_rate": 4.018162928055061e-06, + "loss": 0.04, + "mean_token_accuracy": 0.9872918844223022, + "num_tokens": 98393669.0, + "step": 12120 + }, + { + "entropy": 0.1482748955488205, + "epoch": 6.84255079006772, + "grad_norm": 1.7053948640823364, + "learning_rate": 4.017432135984293e-06, + "loss": 0.034, + "mean_token_accuracy": 0.990520977973938, + "num_tokens": 98434346.0, + "step": 12125 + }, + { + "entropy": 0.15172266364097595, + "epoch": 6.845372460496614, + "grad_norm": 1.739862084388733, + "learning_rate": 4.016701168595116e-06, + "loss": 0.0434, + "mean_token_accuracy": 0.9869125485420227, + "num_tokens": 98474870.0, + "step": 12130 + }, + { + "entropy": 0.15046610832214355, + "epoch": 6.848194130925508, + "grad_norm": 2.3250515460968018, + "learning_rate": 4.01597002603113e-06, + "loss": 0.0452, + "mean_token_accuracy": 0.9872011065483093, + "num_tokens": 98515557.0, + "step": 12135 + }, + { + "entropy": 0.14062797725200654, + "epoch": 6.851015801354402, + "grad_norm": 2.180325746536255, + "learning_rate": 4.015238708435965e-06, + "loss": 0.0368, + "mean_token_accuracy": 0.9888609290122986, + "num_tokens": 98556251.0, + "step": 12140 + }, + { + "entropy": 0.1476264402270317, + "epoch": 6.8538374717832955, + "grad_norm": 2.2180349826812744, + "learning_rate": 4.0145072159532906e-06, + "loss": 0.0376, + "mean_token_accuracy": 0.9886261582374573, + "num_tokens": 98597018.0, + "step": 12145 + }, + { + "entropy": 0.1369374930858612, + "epoch": 6.856659142212189, + "grad_norm": 1.9766490459442139, + "learning_rate": 4.013775548726807e-06, + "loss": 0.0375, + "mean_token_accuracy": 0.9877067446708679, + "num_tokens": 98637642.0, + "step": 12150 + }, + { + "entropy": 0.15410472452640533, + "epoch": 6.859480812641084, + "grad_norm": 2.415297269821167, + "learning_rate": 4.013043706900252e-06, + "loss": 0.0404, + "mean_token_accuracy": 0.9889116406440734, + "num_tokens": 98678395.0, + "step": 12155 + }, + { + "entropy": 0.16218827664852142, + "epoch": 6.8623024830699775, + "grad_norm": 1.7888035774230957, + "learning_rate": 4.012311690617396e-06, + "loss": 0.0408, + "mean_token_accuracy": 0.9877578258514405, + "num_tokens": 98718623.0, + "step": 12160 + }, + { + "entropy": 0.1510297805070877, + "epoch": 6.865124153498871, + "grad_norm": 2.012989044189453, + "learning_rate": 4.011579500022043e-06, + "loss": 0.0388, + "mean_token_accuracy": 0.9888627529144287, + "num_tokens": 98759483.0, + "step": 12165 + }, + { + "entropy": 0.15312339663505553, + "epoch": 6.867945823927765, + "grad_norm": 1.867661714553833, + "learning_rate": 4.010847135258031e-06, + "loss": 0.0349, + "mean_token_accuracy": 0.9899600625038147, + "num_tokens": 98799890.0, + "step": 12170 + }, + { + "entropy": 0.14693421721458436, + "epoch": 6.8707674943566595, + "grad_norm": 1.5122771263122559, + "learning_rate": 4.010114596469234e-06, + "loss": 0.0333, + "mean_token_accuracy": 0.9900424122810364, + "num_tokens": 98840808.0, + "step": 12175 + }, + { + "entropy": 0.17182584404945372, + "epoch": 6.873589164785553, + "grad_norm": 1.7709141969680786, + "learning_rate": 4.009381883799561e-06, + "loss": 0.0355, + "mean_token_accuracy": 0.989521062374115, + "num_tokens": 98881544.0, + "step": 12180 + }, + { + "entropy": 0.15575533509254455, + "epoch": 6.876410835214447, + "grad_norm": 2.0675461292266846, + "learning_rate": 4.00864899739295e-06, + "loss": 0.0394, + "mean_token_accuracy": 0.9877262592315674, + "num_tokens": 98922210.0, + "step": 12185 + }, + { + "entropy": 0.15429140627384186, + "epoch": 6.8792325056433405, + "grad_norm": 2.4544715881347656, + "learning_rate": 4.00791593739338e-06, + "loss": 0.0494, + "mean_token_accuracy": 0.9845258593559265, + "num_tokens": 98962824.0, + "step": 12190 + }, + { + "entropy": 0.15189413875341415, + "epoch": 6.882054176072235, + "grad_norm": 2.8407533168792725, + "learning_rate": 4.007182703944859e-06, + "loss": 0.0446, + "mean_token_accuracy": 0.9873796224594116, + "num_tokens": 99003619.0, + "step": 12195 + }, + { + "entropy": 0.15428635776042937, + "epoch": 6.884875846501129, + "grad_norm": 2.224959135055542, + "learning_rate": 4.006449297191432e-06, + "loss": 0.0365, + "mean_token_accuracy": 0.9887138962745666, + "num_tokens": 99044216.0, + "step": 12200 + }, + { + "entropy": 0.14300165176391602, + "epoch": 6.8876975169300225, + "grad_norm": 1.9736629724502563, + "learning_rate": 4.005715717277174e-06, + "loss": 0.0366, + "mean_token_accuracy": 0.988729465007782, + "num_tokens": 99084439.0, + "step": 12205 + }, + { + "entropy": 0.13568673729896547, + "epoch": 6.890519187358916, + "grad_norm": 1.9142423868179321, + "learning_rate": 4.004981964346201e-06, + "loss": 0.0332, + "mean_token_accuracy": 0.990368640422821, + "num_tokens": 99124909.0, + "step": 12210 + }, + { + "entropy": 0.14406311213970185, + "epoch": 6.893340857787811, + "grad_norm": 2.4211018085479736, + "learning_rate": 4.004248038542656e-06, + "loss": 0.039, + "mean_token_accuracy": 0.988343071937561, + "num_tokens": 99165645.0, + "step": 12215 + }, + { + "entropy": 0.15503831803798676, + "epoch": 6.8961625282167045, + "grad_norm": 2.471714496612549, + "learning_rate": 4.003513940010718e-06, + "loss": 0.0432, + "mean_token_accuracy": 0.9861461281776428, + "num_tokens": 99206347.0, + "step": 12220 + }, + { + "entropy": 0.1480024516582489, + "epoch": 6.898984198645598, + "grad_norm": 2.0117077827453613, + "learning_rate": 4.002779668894604e-06, + "loss": 0.0362, + "mean_token_accuracy": 0.9891183733940124, + "num_tokens": 99246987.0, + "step": 12225 + }, + { + "entropy": 0.16451097428798675, + "epoch": 6.901805869074492, + "grad_norm": 1.6899313926696777, + "learning_rate": 4.002045225338559e-06, + "loss": 0.0404, + "mean_token_accuracy": 0.9885852932929993, + "num_tokens": 99287449.0, + "step": 12230 + }, + { + "entropy": 0.14375028908252716, + "epoch": 6.904627539503386, + "grad_norm": 1.818851351737976, + "learning_rate": 4.001310609486866e-06, + "loss": 0.0362, + "mean_token_accuracy": 0.9893089056015014, + "num_tokens": 99327979.0, + "step": 12235 + }, + { + "entropy": 0.15548242926597594, + "epoch": 6.90744920993228, + "grad_norm": 1.614169955253601, + "learning_rate": 4.000575821483839e-06, + "loss": 0.0383, + "mean_token_accuracy": 0.9892736315727234, + "num_tokens": 99368473.0, + "step": 12240 + }, + { + "entropy": 0.16284603476524354, + "epoch": 6.910270880361174, + "grad_norm": 1.6673612594604492, + "learning_rate": 3.9998408614738295e-06, + "loss": 0.039, + "mean_token_accuracy": 0.9885139226913452, + "num_tokens": 99408923.0, + "step": 12245 + }, + { + "entropy": 0.14805781245231628, + "epoch": 6.913092550790068, + "grad_norm": 2.2017159461975098, + "learning_rate": 3.999105729601218e-06, + "loss": 0.0378, + "mean_token_accuracy": 0.9887848377227784, + "num_tokens": 99449422.0, + "step": 12250 + }, + { + "entropy": 0.14871875047683716, + "epoch": 6.915914221218961, + "grad_norm": 1.9085617065429688, + "learning_rate": 3.9983704260104225e-06, + "loss": 0.0395, + "mean_token_accuracy": 0.9885958671569824, + "num_tokens": 99490171.0, + "step": 12255 + }, + { + "entropy": 0.1603003293275833, + "epoch": 6.918735891647856, + "grad_norm": 1.8411649465560913, + "learning_rate": 3.997634950845893e-06, + "loss": 0.0391, + "mean_token_accuracy": 0.9881568551063538, + "num_tokens": 99530789.0, + "step": 12260 + }, + { + "entropy": 0.14625448882579803, + "epoch": 6.9215575620767495, + "grad_norm": 1.8774826526641846, + "learning_rate": 3.996899304252116e-06, + "loss": 0.0342, + "mean_token_accuracy": 0.9903077602386474, + "num_tokens": 99571506.0, + "step": 12265 + }, + { + "entropy": 0.14785387217998505, + "epoch": 6.924379232505643, + "grad_norm": 1.910869836807251, + "learning_rate": 3.996163486373605e-06, + "loss": 0.0408, + "mean_token_accuracy": 0.9881960272789001, + "num_tokens": 99612260.0, + "step": 12270 + }, + { + "entropy": 0.15447575151920317, + "epoch": 6.927200902934537, + "grad_norm": 1.985054612159729, + "learning_rate": 3.9954274973549144e-06, + "loss": 0.0399, + "mean_token_accuracy": 0.9876292586326599, + "num_tokens": 99652726.0, + "step": 12275 + }, + { + "entropy": 0.15240433514118196, + "epoch": 6.9300225733634315, + "grad_norm": 1.6934657096862793, + "learning_rate": 3.994691337340629e-06, + "loss": 0.0353, + "mean_token_accuracy": 0.9899464964866638, + "num_tokens": 99693195.0, + "step": 12280 + }, + { + "entropy": 0.14032234251499176, + "epoch": 6.932844243792325, + "grad_norm": 2.0978596210479736, + "learning_rate": 3.9939550064753674e-06, + "loss": 0.0421, + "mean_token_accuracy": 0.9877179741859436, + "num_tokens": 99733715.0, + "step": 12285 + }, + { + "entropy": 0.1609266698360443, + "epoch": 6.935665914221219, + "grad_norm": 2.033520221710205, + "learning_rate": 3.993218504903781e-06, + "loss": 0.0396, + "mean_token_accuracy": 0.9879401564598084, + "num_tokens": 99774295.0, + "step": 12290 + }, + { + "entropy": 0.1533256560564041, + "epoch": 6.938487584650113, + "grad_norm": 2.056273937225342, + "learning_rate": 3.992481832770558e-06, + "loss": 0.0481, + "mean_token_accuracy": 0.9857136845588684, + "num_tokens": 99814847.0, + "step": 12295 + }, + { + "entropy": 0.14914492666721343, + "epoch": 6.941309255079007, + "grad_norm": 1.809752345085144, + "learning_rate": 3.991744990220415e-06, + "loss": 0.0403, + "mean_token_accuracy": 0.9875779390335083, + "num_tokens": 99855659.0, + "step": 12300 + }, + { + "entropy": 0.13904739618301393, + "epoch": 6.944130925507901, + "grad_norm": 1.739696979522705, + "learning_rate": 3.9910079773981055e-06, + "loss": 0.0357, + "mean_token_accuracy": 0.989600396156311, + "num_tokens": 99896329.0, + "step": 12305 + }, + { + "entropy": 0.1490867018699646, + "epoch": 6.946952595936795, + "grad_norm": 2.174006223678589, + "learning_rate": 3.990270794448418e-06, + "loss": 0.0397, + "mean_token_accuracy": 0.9880833983421325, + "num_tokens": 99936915.0, + "step": 12310 + }, + { + "entropy": 0.1486115574836731, + "epoch": 6.949774266365688, + "grad_norm": 2.1487553119659424, + "learning_rate": 3.989533441516169e-06, + "loss": 0.0412, + "mean_token_accuracy": 0.988270103931427, + "num_tokens": 99977400.0, + "step": 12315 + }, + { + "entropy": 0.15158516466617583, + "epoch": 6.952595936794582, + "grad_norm": 2.021366834640503, + "learning_rate": 3.9887959187462145e-06, + "loss": 0.0385, + "mean_token_accuracy": 0.988451075553894, + "num_tokens": 100018070.0, + "step": 12320 + }, + { + "entropy": 0.15477460920810698, + "epoch": 6.955417607223477, + "grad_norm": 2.3939874172210693, + "learning_rate": 3.988058226283438e-06, + "loss": 0.0459, + "mean_token_accuracy": 0.9860603213310242, + "num_tokens": 100058592.0, + "step": 12325 + }, + { + "entropy": 0.13858523964881897, + "epoch": 6.95823927765237, + "grad_norm": 1.722223162651062, + "learning_rate": 3.987320364272761e-06, + "loss": 0.0366, + "mean_token_accuracy": 0.9893313646316528, + "num_tokens": 100098381.0, + "step": 12330 + }, + { + "entropy": 0.14853320121765137, + "epoch": 6.961060948081264, + "grad_norm": 1.6259387731552124, + "learning_rate": 3.986582332859138e-06, + "loss": 0.029, + "mean_token_accuracy": 0.9913646578788757, + "num_tokens": 100138509.0, + "step": 12335 + }, + { + "entropy": 0.1412474274635315, + "epoch": 6.963882618510158, + "grad_norm": 1.655497431755066, + "learning_rate": 3.985844132187552e-06, + "loss": 0.0377, + "mean_token_accuracy": 0.9879396677017211, + "num_tokens": 100178940.0, + "step": 12340 + }, + { + "entropy": 0.14502420425415039, + "epoch": 6.966704288939052, + "grad_norm": 2.259775161743164, + "learning_rate": 3.985105762403024e-06, + "loss": 0.0385, + "mean_token_accuracy": 0.9885640144348145, + "num_tokens": 100219679.0, + "step": 12345 + }, + { + "entropy": 0.1412261575460434, + "epoch": 6.969525959367946, + "grad_norm": 2.183638095855713, + "learning_rate": 3.984367223650608e-06, + "loss": 0.0391, + "mean_token_accuracy": 0.988431978225708, + "num_tokens": 100260265.0, + "step": 12350 + }, + { + "entropy": 0.16001304388046264, + "epoch": 6.97234762979684, + "grad_norm": 1.7587566375732422, + "learning_rate": 3.983628516075389e-06, + "loss": 0.0422, + "mean_token_accuracy": 0.9870441198348999, + "num_tokens": 100301028.0, + "step": 12355 + }, + { + "entropy": 0.1356129452586174, + "epoch": 6.975169300225733, + "grad_norm": 1.582041621208191, + "learning_rate": 3.982889639822487e-06, + "loss": 0.0351, + "mean_token_accuracy": 0.9892949461936951, + "num_tokens": 100341780.0, + "step": 12360 + }, + { + "entropy": 0.15091146826744078, + "epoch": 6.977990970654628, + "grad_norm": 1.9201163053512573, + "learning_rate": 3.982150595037053e-06, + "loss": 0.0396, + "mean_token_accuracy": 0.9874477505683898, + "num_tokens": 100382504.0, + "step": 12365 + }, + { + "entropy": 0.14935719966888428, + "epoch": 6.980812641083522, + "grad_norm": 1.8997581005096436, + "learning_rate": 3.981411381864274e-06, + "loss": 0.0357, + "mean_token_accuracy": 0.9886354565620422, + "num_tokens": 100423339.0, + "step": 12370 + }, + { + "entropy": 0.15034773349761962, + "epoch": 6.983634311512415, + "grad_norm": 2.1917574405670166, + "learning_rate": 3.980672000449367e-06, + "loss": 0.0354, + "mean_token_accuracy": 0.9892937660217285, + "num_tokens": 100463746.0, + "step": 12375 + }, + { + "entropy": 0.15186431109905243, + "epoch": 6.986455981941309, + "grad_norm": 1.7991350889205933, + "learning_rate": 3.9799324509375846e-06, + "loss": 0.0449, + "mean_token_accuracy": 0.9864747166633606, + "num_tokens": 100504500.0, + "step": 12380 + }, + { + "entropy": 0.162436842918396, + "epoch": 6.989277652370204, + "grad_norm": 1.9422773122787476, + "learning_rate": 3.979192733474211e-06, + "loss": 0.0436, + "mean_token_accuracy": 0.9862201929092407, + "num_tokens": 100545119.0, + "step": 12385 + }, + { + "entropy": 0.15107800364494323, + "epoch": 6.992099322799097, + "grad_norm": 2.049860715866089, + "learning_rate": 3.978452848204563e-06, + "loss": 0.0403, + "mean_token_accuracy": 0.9873886942863465, + "num_tokens": 100585667.0, + "step": 12390 + }, + { + "entropy": 0.1658845365047455, + "epoch": 6.994920993227991, + "grad_norm": 2.350536346435547, + "learning_rate": 3.977712795273993e-06, + "loss": 0.0426, + "mean_token_accuracy": 0.9870129108428956, + "num_tokens": 100626337.0, + "step": 12395 + }, + { + "entropy": 0.14828409254550934, + "epoch": 6.997742663656885, + "grad_norm": 2.069523811340332, + "learning_rate": 3.976972574827883e-06, + "loss": 0.0379, + "mean_token_accuracy": 0.9882864952087402, + "num_tokens": 100666972.0, + "step": 12400 + }, + { + "entropy": 0.15599106550216674, + "epoch": 7.000564334085778, + "grad_norm": 1.5296047925949097, + "learning_rate": 3.97623218701165e-06, + "loss": 0.0328, + "mean_token_accuracy": 0.9913263559341431, + "num_tokens": 100701561.0, + "step": 12405 + }, + { + "entropy": 0.13645475655794143, + "epoch": 7.003386004514673, + "grad_norm": 1.3886555433273315, + "learning_rate": 3.975491631970744e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9963740348815918, + "num_tokens": 100742162.0, + "step": 12410 + }, + { + "entropy": 0.13274604827165604, + "epoch": 7.006207674943567, + "grad_norm": 1.7409141063690186, + "learning_rate": 3.974750909850646e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9954428911209107, + "num_tokens": 100782515.0, + "step": 12415 + }, + { + "entropy": 0.12155016213655472, + "epoch": 7.00902934537246, + "grad_norm": 1.6582471132278442, + "learning_rate": 3.9740100207968716e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9957684516906739, + "num_tokens": 100823151.0, + "step": 12420 + }, + { + "entropy": 0.12232804298400879, + "epoch": 7.011851015801354, + "grad_norm": 1.9185771942138672, + "learning_rate": 3.973268964954967e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9957268714904786, + "num_tokens": 100863921.0, + "step": 12425 + }, + { + "entropy": 0.13372556120157242, + "epoch": 7.014672686230249, + "grad_norm": 1.302709937095642, + "learning_rate": 3.972527742470515e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9956133961677551, + "num_tokens": 100904652.0, + "step": 12430 + }, + { + "entropy": 0.1276037335395813, + "epoch": 7.017494356659142, + "grad_norm": 2.0763673782348633, + "learning_rate": 3.971786353489127e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9954843640327453, + "num_tokens": 100945350.0, + "step": 12435 + }, + { + "entropy": 0.1303536906838417, + "epoch": 7.020316027088036, + "grad_norm": 1.7125003337860107, + "learning_rate": 3.97104479815645e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9947200775146484, + "num_tokens": 100986018.0, + "step": 12440 + }, + { + "entropy": 0.13805813938379288, + "epoch": 7.02313769751693, + "grad_norm": 1.2138659954071045, + "learning_rate": 3.9703030766181634e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9959219574928284, + "num_tokens": 101026679.0, + "step": 12445 + }, + { + "entropy": 0.11290259510278702, + "epoch": 7.025959367945824, + "grad_norm": 1.9468364715576172, + "learning_rate": 3.969561189019977e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9957970261573792, + "num_tokens": 101067288.0, + "step": 12450 + }, + { + "entropy": 0.12177327871322632, + "epoch": 7.028781038374718, + "grad_norm": 1.656708836555481, + "learning_rate": 3.968819135507636e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.994585645198822, + "num_tokens": 101107815.0, + "step": 12455 + }, + { + "entropy": 0.13150275498628616, + "epoch": 7.031602708803612, + "grad_norm": 1.2296918630599976, + "learning_rate": 3.968076916226914e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9961319208145142, + "num_tokens": 101148543.0, + "step": 12460 + }, + { + "entropy": 0.13319746106863023, + "epoch": 7.034424379232505, + "grad_norm": 1.9725842475891113, + "learning_rate": 3.967334531323624e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.995919120311737, + "num_tokens": 101189223.0, + "step": 12465 + }, + { + "entropy": 0.13151731789112092, + "epoch": 7.0372460496614, + "grad_norm": 1.4561344385147095, + "learning_rate": 3.966591980943605e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.995412039756775, + "num_tokens": 101229757.0, + "step": 12470 + }, + { + "entropy": 0.12682377845048903, + "epoch": 7.040067720090294, + "grad_norm": 1.5752171277999878, + "learning_rate": 3.965849265232732e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.996173620223999, + "num_tokens": 101270029.0, + "step": 12475 + }, + { + "entropy": 0.1453809142112732, + "epoch": 7.042889390519187, + "grad_norm": 1.8831138610839844, + "learning_rate": 3.965106384336912e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.995201587677002, + "num_tokens": 101310756.0, + "step": 12480 + }, + { + "entropy": 0.13473598957061766, + "epoch": 7.045711060948081, + "grad_norm": 1.9781758785247803, + "learning_rate": 3.964363338402083e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9956030488014221, + "num_tokens": 101350870.0, + "step": 12485 + }, + { + "entropy": 0.13227078914642335, + "epoch": 7.048532731376975, + "grad_norm": 1.273861289024353, + "learning_rate": 3.9636201275742175e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9954041123390198, + "num_tokens": 101391393.0, + "step": 12490 + }, + { + "entropy": 0.130999419093132, + "epoch": 7.051354401805869, + "grad_norm": 1.901928186416626, + "learning_rate": 3.962876751999318e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9943174481391907, + "num_tokens": 101431784.0, + "step": 12495 + }, + { + "entropy": 0.13049811720848084, + "epoch": 7.054176072234763, + "grad_norm": 1.6979538202285767, + "learning_rate": 3.962133211823424e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9947118163108826, + "num_tokens": 101472388.0, + "step": 12500 + }, + { + "epoch": 7.054176072234763, + "eval_entropy": 0.1905188113451004, + "eval_loss": 0.012303678318858147, + "eval_mean_token_accuracy": 0.9973169565200806, + "eval_num_tokens": 101472388.0, + "eval_runtime": 0.1636, + "eval_samples_per_second": 24.454, + "eval_steps_per_second": 6.113, + "step": 12500 + } + ], + "logging_steps": 5, + "max_steps": 35440, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.212206797834539e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}