diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8120 @@ +{ + "best_global_step": 7200, + "best_metric": 0.17618873715400696, + "best_model_checkpoint": "checkpoints/Qwen2.5-3B-Instruct-liar-dice-sft/checkpoint-7200", + "epoch": 1.2605252525252526, + "eval_steps": 300, + "global_step": 7800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 0.46959872394800184, + "epoch": 0.0016161616161616162, + "grad_norm": 13.960037231445312, + "learning_rate": 8.999999999999999e-06, + "loss": 1.2876, + "mean_token_accuracy": 0.7336300253868103, + "num_tokens": 192055.0, + "step": 10 + }, + { + "entropy": 0.500729388743639, + "epoch": 0.0032323232323232323, + "grad_norm": 6.7432475090026855, + "learning_rate": 1.8999999999999998e-05, + "loss": 1.0227, + "mean_token_accuracy": 0.7338824585080147, + "num_tokens": 386411.0, + "step": 20 + }, + { + "entropy": 0.5788386285305023, + "epoch": 0.0048484848484848485, + "grad_norm": 1.9439020156860352, + "learning_rate": 2.9e-05, + "loss": 0.5673, + "mean_token_accuracy": 0.823445089161396, + "num_tokens": 584042.0, + "step": 30 + }, + { + "entropy": 0.5961214371025563, + "epoch": 0.006464646464646465, + "grad_norm": 1.6919704675674438, + "learning_rate": 3.499999597539e-05, + "loss": 0.4976, + "mean_token_accuracy": 0.8367778733372688, + "num_tokens": 786741.0, + "step": 40 + }, + { + "entropy": 0.6113797709345817, + "epoch": 0.00808080808080808, + "grad_norm": 1.7834748029708862, + "learning_rate": 3.4999950698548777e-05, + "loss": 0.4251, + "mean_token_accuracy": 0.8491482272744179, + "num_tokens": 984632.0, + "step": 50 + }, + { + "entropy": 0.6576851613819599, + "epoch": 0.009696969696969697, + "grad_norm": 1.1660228967666626, + "learning_rate": 3.499985511423442e-05, + "loss": 0.3977, + "mean_token_accuracy": 0.8635194897651672, + "num_tokens": 1172287.0, + "step": 60 + }, + { + "entropy": 0.6415456309914589, + "epoch": 0.011313131313131313, + "grad_norm": 0.8897055983543396, + "learning_rate": 3.499970922272172e-05, + "loss": 0.3657, + "mean_token_accuracy": 0.8693153902888298, + "num_tokens": 1362395.0, + "step": 70 + }, + { + "entropy": 0.6485456451773643, + "epoch": 0.01292929292929293, + "grad_norm": 1.4095648527145386, + "learning_rate": 3.499951302443006e-05, + "loss": 0.3765, + "mean_token_accuracy": 0.8600584954023361, + "num_tokens": 1552405.0, + "step": 80 + }, + { + "entropy": 0.6257273025810719, + "epoch": 0.014545454545454545, + "grad_norm": 2.536198139190674, + "learning_rate": 3.499926651992347e-05, + "loss": 0.4095, + "mean_token_accuracy": 0.8548074960708618, + "num_tokens": 1753707.0, + "step": 90 + }, + { + "entropy": 0.615520790219307, + "epoch": 0.01616161616161616, + "grad_norm": 1.2256574630737305, + "learning_rate": 3.499896970991057e-05, + "loss": 0.4175, + "mean_token_accuracy": 0.8375065699219704, + "num_tokens": 1957057.0, + "step": 100 + }, + { + "entropy": 0.6429425723850727, + "epoch": 0.017777777777777778, + "grad_norm": 1.1664628982543945, + "learning_rate": 3.4998622595244605e-05, + "loss": 0.3776, + "mean_token_accuracy": 0.8666251391172409, + "num_tokens": 2153468.0, + "step": 110 + }, + { + "entropy": 0.6582260139286518, + "epoch": 0.019393939393939394, + "grad_norm": 1.757544755935669, + "learning_rate": 3.499822517692344e-05, + "loss": 0.3251, + "mean_token_accuracy": 0.8730518639087677, + "num_tokens": 2342417.0, + "step": 120 + }, + { + "entropy": 0.6374198235571384, + "epoch": 0.02101010101010101, + "grad_norm": 1.7501511573791504, + "learning_rate": 3.4997777456089535e-05, + "loss": 0.3408, + "mean_token_accuracy": 0.8802526473999024, + "num_tokens": 2540898.0, + "step": 130 + }, + { + "entropy": 0.6632717259228229, + "epoch": 0.022626262626262626, + "grad_norm": 1.2834315299987793, + "learning_rate": 3.499727943402997e-05, + "loss": 0.3466, + "mean_token_accuracy": 0.8864644199609757, + "num_tokens": 2730096.0, + "step": 140 + }, + { + "entropy": 0.6132969424128533, + "epoch": 0.024242424242424242, + "grad_norm": 1.649055004119873, + "learning_rate": 3.499673111217642e-05, + "loss": 0.3472, + "mean_token_accuracy": 0.8800297752022743, + "num_tokens": 2933186.0, + "step": 150 + }, + { + "entropy": 0.6532419972121716, + "epoch": 0.02585858585858586, + "grad_norm": 2.080294609069824, + "learning_rate": 3.4996132492105146e-05, + "loss": 0.3189, + "mean_token_accuracy": 0.8902329325675964, + "num_tokens": 3125684.0, + "step": 160 + }, + { + "entropy": 0.6288332067430019, + "epoch": 0.027474747474747475, + "grad_norm": 1.31842839717865, + "learning_rate": 3.499548357553703e-05, + "loss": 0.2968, + "mean_token_accuracy": 0.8945111632347107, + "num_tokens": 3323087.0, + "step": 170 + }, + { + "entropy": 0.6411679066717625, + "epoch": 0.02909090909090909, + "grad_norm": 1.308049201965332, + "learning_rate": 3.49947843643375e-05, + "loss": 0.3174, + "mean_token_accuracy": 0.8880784347653389, + "num_tokens": 3517966.0, + "step": 180 + }, + { + "entropy": 0.6363876968622207, + "epoch": 0.030707070707070707, + "grad_norm": 1.4614847898483276, + "learning_rate": 3.4994034860516625e-05, + "loss": 0.3138, + "mean_token_accuracy": 0.8888434395194054, + "num_tokens": 3714854.0, + "step": 190 + }, + { + "entropy": 0.6365263275802135, + "epoch": 0.03232323232323232, + "grad_norm": 1.3670061826705933, + "learning_rate": 3.4993235066228996e-05, + "loss": 0.3034, + "mean_token_accuracy": 0.8956289395689965, + "num_tokens": 3912912.0, + "step": 200 + }, + { + "entropy": 0.6736746333539486, + "epoch": 0.03393939393939394, + "grad_norm": 4.420541763305664, + "learning_rate": 3.499238498377381e-05, + "loss": 0.2881, + "mean_token_accuracy": 0.9016141295433044, + "num_tokens": 4099783.0, + "step": 210 + }, + { + "entropy": 0.6174106113612652, + "epoch": 0.035555555555555556, + "grad_norm": 1.7219263315200806, + "learning_rate": 3.49914846155948e-05, + "loss": 0.2821, + "mean_token_accuracy": 0.9015349850058556, + "num_tokens": 4302700.0, + "step": 220 + }, + { + "entropy": 0.654355899989605, + "epoch": 0.037171717171717175, + "grad_norm": 1.486142873764038, + "learning_rate": 3.4990533964280305e-05, + "loss": 0.2979, + "mean_token_accuracy": 0.8962426647543907, + "num_tokens": 4496033.0, + "step": 230 + }, + { + "entropy": 0.6416108801960945, + "epoch": 0.03878787878787879, + "grad_norm": 1.2947139739990234, + "learning_rate": 3.4989533032563156e-05, + "loss": 0.2985, + "mean_token_accuracy": 0.8903466835618019, + "num_tokens": 4691716.0, + "step": 240 + }, + { + "entropy": 0.5999645918607712, + "epoch": 0.04040404040404041, + "grad_norm": 1.6406580209732056, + "learning_rate": 3.498848182332076e-05, + "loss": 0.3048, + "mean_token_accuracy": 0.8953321948647499, + "num_tokens": 4901668.0, + "step": 250 + }, + { + "entropy": 0.6618356920778752, + "epoch": 0.04202020202020202, + "grad_norm": 2.5923011302948, + "learning_rate": 3.498738033957504e-05, + "loss": 0.2815, + "mean_token_accuracy": 0.8963434055447579, + "num_tokens": 5093439.0, + "step": 260 + }, + { + "entropy": 0.6510407045483589, + "epoch": 0.04363636363636364, + "grad_norm": 2.0043606758117676, + "learning_rate": 3.498622858449248e-05, + "loss": 0.2657, + "mean_token_accuracy": 0.8961230248212815, + "num_tokens": 5287355.0, + "step": 270 + }, + { + "entropy": 0.6206977590918541, + "epoch": 0.04525252525252525, + "grad_norm": 1.1375564336776733, + "learning_rate": 3.4985026561384024e-05, + "loss": 0.293, + "mean_token_accuracy": 0.8910468384623528, + "num_tokens": 5489325.0, + "step": 280 + }, + { + "entropy": 0.6492680199444294, + "epoch": 0.04686868686868687, + "grad_norm": 1.0635576248168945, + "learning_rate": 3.498377427370518e-05, + "loss": 0.2758, + "mean_token_accuracy": 0.8996513709425926, + "num_tokens": 5685198.0, + "step": 290 + }, + { + "entropy": 0.6693574033677578, + "epoch": 0.048484848484848485, + "grad_norm": 0.8263881206512451, + "learning_rate": 3.4982471725055907e-05, + "loss": 0.2574, + "mean_token_accuracy": 0.9080115720629692, + "num_tokens": 5874431.0, + "step": 300 + }, + { + "epoch": 0.048484848484848485, + "eval_entropy": 0.7083358879685402, + "eval_loss": 0.24814815819263458, + "eval_mean_token_accuracy": 0.907939182460308, + "eval_num_tokens": 5874431.0, + "eval_runtime": 106.7029, + "eval_samples_per_second": 9.372, + "eval_steps_per_second": 9.372, + "step": 300 + }, + { + "entropy": 0.6249252423644066, + "epoch": 0.050101010101010104, + "grad_norm": 0.9064458012580872, + "learning_rate": 3.498111891918067e-05, + "loss": 0.2907, + "mean_token_accuracy": 0.8954597547650337, + "num_tokens": 6076107.0, + "step": 310 + }, + { + "entropy": 0.6337138824164867, + "epoch": 0.05171717171717172, + "grad_norm": 1.4427143335342407, + "learning_rate": 3.4979715859968415e-05, + "loss": 0.2712, + "mean_token_accuracy": 0.8949789464473724, + "num_tokens": 6276008.0, + "step": 320 + }, + { + "entropy": 0.6436790756881237, + "epoch": 0.05333333333333334, + "grad_norm": 1.2344701290130615, + "learning_rate": 3.4978262551452546e-05, + "loss": 0.2425, + "mean_token_accuracy": 0.9087216407060623, + "num_tokens": 6473180.0, + "step": 330 + }, + { + "entropy": 0.6058064438402653, + "epoch": 0.05494949494949495, + "grad_norm": 1.8228733539581299, + "learning_rate": 3.497675899781091e-05, + "loss": 0.2708, + "mean_token_accuracy": 0.8938016831874848, + "num_tokens": 6679706.0, + "step": 340 + }, + { + "entropy": 0.6307087175548076, + "epoch": 0.05656565656565657, + "grad_norm": 1.1663655042648315, + "learning_rate": 3.497520520336582e-05, + "loss": 0.2704, + "mean_token_accuracy": 0.8911788880825042, + "num_tokens": 6876536.0, + "step": 350 + }, + { + "entropy": 0.642133416980505, + "epoch": 0.05818181818181818, + "grad_norm": 1.7720001935958862, + "learning_rate": 3.497360117258399e-05, + "loss": 0.2548, + "mean_token_accuracy": 0.9016376256942749, + "num_tokens": 7069255.0, + "step": 360 + }, + { + "entropy": 0.639679954200983, + "epoch": 0.0597979797979798, + "grad_norm": 1.618810772895813, + "learning_rate": 3.4971946910076555e-05, + "loss": 0.2544, + "mean_token_accuracy": 0.8990731582045555, + "num_tokens": 7263687.0, + "step": 370 + }, + { + "entropy": 0.6632651142776013, + "epoch": 0.061414141414141414, + "grad_norm": 1.4762156009674072, + "learning_rate": 3.497024242059907e-05, + "loss": 0.2271, + "mean_token_accuracy": 0.9126387253403664, + "num_tokens": 7449167.0, + "step": 380 + }, + { + "entropy": 0.6455851793289185, + "epoch": 0.06303030303030303, + "grad_norm": 1.648486852645874, + "learning_rate": 3.496848770905146e-05, + "loss": 0.2462, + "mean_token_accuracy": 0.9119722321629524, + "num_tokens": 7640249.0, + "step": 390 + }, + { + "entropy": 0.6278689287602901, + "epoch": 0.06464646464646465, + "grad_norm": 1.5724526643753052, + "learning_rate": 3.496668278047804e-05, + "loss": 0.2712, + "mean_token_accuracy": 0.8938238501548768, + "num_tokens": 7836242.0, + "step": 400 + }, + { + "entropy": 0.6397264793515205, + "epoch": 0.06626262626262626, + "grad_norm": 1.1819778680801392, + "learning_rate": 3.496482764006746e-05, + "loss": 0.2441, + "mean_token_accuracy": 0.9028481721878052, + "num_tokens": 8030712.0, + "step": 410 + }, + { + "entropy": 0.6572954833507538, + "epoch": 0.06787878787878789, + "grad_norm": 1.456351637840271, + "learning_rate": 3.496292229315275e-05, + "loss": 0.2615, + "mean_token_accuracy": 0.9067741006612777, + "num_tokens": 8219433.0, + "step": 420 + }, + { + "entropy": 0.6125221528112889, + "epoch": 0.0694949494949495, + "grad_norm": 1.8550266027450562, + "learning_rate": 3.4960966745211225e-05, + "loss": 0.2411, + "mean_token_accuracy": 0.9120300561189651, + "num_tokens": 8420536.0, + "step": 430 + }, + { + "entropy": 0.6642718121409417, + "epoch": 0.07111111111111111, + "grad_norm": 1.758142113685608, + "learning_rate": 3.495896100186456e-05, + "loss": 0.2263, + "mean_token_accuracy": 0.9210765823721886, + "num_tokens": 8606059.0, + "step": 440 + }, + { + "entropy": 0.6295181460678577, + "epoch": 0.07272727272727272, + "grad_norm": 3.6077258586883545, + "learning_rate": 3.4956905068878704e-05, + "loss": 0.2435, + "mean_token_accuracy": 0.9080678433179855, + "num_tokens": 8804802.0, + "step": 450 + }, + { + "entropy": 0.641046067327261, + "epoch": 0.07434343434343435, + "grad_norm": 0.8795793056488037, + "learning_rate": 3.4954798952163886e-05, + "loss": 0.2521, + "mean_token_accuracy": 0.9062324479222298, + "num_tokens": 9000444.0, + "step": 460 + }, + { + "entropy": 0.6226008631289005, + "epoch": 0.07595959595959596, + "grad_norm": 1.1648048162460327, + "learning_rate": 3.495264265777461e-05, + "loss": 0.2298, + "mean_token_accuracy": 0.9145419552922249, + "num_tokens": 9200956.0, + "step": 470 + }, + { + "entropy": 0.6410299509763717, + "epoch": 0.07757575757575758, + "grad_norm": 2.069148302078247, + "learning_rate": 3.4950436191909614e-05, + "loss": 0.2371, + "mean_token_accuracy": 0.9139557749032974, + "num_tokens": 9394938.0, + "step": 480 + }, + { + "entropy": 0.6107208199799061, + "epoch": 0.07919191919191919, + "grad_norm": 3.762711524963379, + "learning_rate": 3.494817956091187e-05, + "loss": 0.2274, + "mean_token_accuracy": 0.9104640632867813, + "num_tokens": 9599158.0, + "step": 490 + }, + { + "entropy": 0.6289502464234829, + "epoch": 0.08080808080808081, + "grad_norm": 0.7265241742134094, + "learning_rate": 3.4945872771268564e-05, + "loss": 0.2181, + "mean_token_accuracy": 0.9099911272525787, + "num_tokens": 9795341.0, + "step": 500 + }, + { + "entropy": 0.6372130006551743, + "epoch": 0.08242424242424243, + "grad_norm": 1.3274513483047485, + "learning_rate": 3.494351582961108e-05, + "loss": 0.2592, + "mean_token_accuracy": 0.9113820254802704, + "num_tokens": 9991485.0, + "step": 510 + }, + { + "entropy": 0.6244701214134694, + "epoch": 0.08404040404040404, + "grad_norm": 1.3778245449066162, + "learning_rate": 3.494110874271496e-05, + "loss": 0.2589, + "mean_token_accuracy": 0.9008386582136154, + "num_tokens": 10192963.0, + "step": 520 + }, + { + "entropy": 0.6476377807557583, + "epoch": 0.08565656565656565, + "grad_norm": 7.508720397949219, + "learning_rate": 3.4938651517499906e-05, + "loss": 0.2454, + "mean_token_accuracy": 0.9027299627661705, + "num_tokens": 10384151.0, + "step": 530 + }, + { + "entropy": 0.6151685245335102, + "epoch": 0.08727272727272728, + "grad_norm": 1.208046317100525, + "learning_rate": 3.493614416102976e-05, + "loss": 0.2519, + "mean_token_accuracy": 0.8936711862683296, + "num_tokens": 10587963.0, + "step": 540 + }, + { + "entropy": 0.6441866233944893, + "epoch": 0.08888888888888889, + "grad_norm": 0.9904794692993164, + "learning_rate": 3.493358668051246e-05, + "loss": 0.2338, + "mean_token_accuracy": 0.911143635213375, + "num_tokens": 10782441.0, + "step": 550 + }, + { + "entropy": 0.6711594641208649, + "epoch": 0.0905050505050505, + "grad_norm": 1.4392328262329102, + "learning_rate": 3.493097908330007e-05, + "loss": 0.2343, + "mean_token_accuracy": 0.9069328650832176, + "num_tokens": 10966642.0, + "step": 560 + }, + { + "entropy": 0.6202519237995148, + "epoch": 0.09212121212121212, + "grad_norm": 1.1660218238830566, + "learning_rate": 3.4928321376888685e-05, + "loss": 0.2342, + "mean_token_accuracy": 0.9107801094651222, + "num_tokens": 11165742.0, + "step": 570 + }, + { + "entropy": 0.6474331930279732, + "epoch": 0.09373737373737374, + "grad_norm": 1.130745530128479, + "learning_rate": 3.492561356891847e-05, + "loss": 0.2208, + "mean_token_accuracy": 0.9123849824070931, + "num_tokens": 11356135.0, + "step": 580 + }, + { + "entropy": 0.6123579762876034, + "epoch": 0.09535353535353536, + "grad_norm": 1.5646748542785645, + "learning_rate": 3.4922855667173627e-05, + "loss": 0.2376, + "mean_token_accuracy": 0.9094289064407348, + "num_tokens": 11558279.0, + "step": 590 + }, + { + "entropy": 0.6709182240068913, + "epoch": 0.09696969696969697, + "grad_norm": 2.955382823944092, + "learning_rate": 3.492004767958235e-05, + "loss": 0.2205, + "mean_token_accuracy": 0.9199464812874794, + "num_tokens": 11743893.0, + "step": 600 + }, + { + "epoch": 0.09696969696969697, + "eval_entropy": 0.6894149232506752, + "eval_loss": 0.21707852184772491, + "eval_mean_token_accuracy": 0.9174256545305252, + "eval_num_tokens": 11743893.0, + "eval_runtime": 106.8539, + "eval_samples_per_second": 9.359, + "eval_steps_per_second": 9.359, + "step": 600 + }, + { + "entropy": 0.631628192961216, + "epoch": 0.09858585858585858, + "grad_norm": 0.8649947047233582, + "learning_rate": 3.4917189614216816e-05, + "loss": 0.2417, + "mean_token_accuracy": 0.90289586186409, + "num_tokens": 11940257.0, + "step": 610 + }, + { + "entropy": 0.5916264042258262, + "epoch": 0.10020202020202021, + "grad_norm": 1.269579529762268, + "learning_rate": 3.491428147929317e-05, + "loss": 0.2283, + "mean_token_accuracy": 0.9115998461842537, + "num_tokens": 12151169.0, + "step": 620 + }, + { + "entropy": 0.6250050470232964, + "epoch": 0.10181818181818182, + "grad_norm": 3.079771041870117, + "learning_rate": 3.4911323283171485e-05, + "loss": 0.2391, + "mean_token_accuracy": 0.9097789540886879, + "num_tokens": 12349255.0, + "step": 630 + }, + { + "entropy": 0.6837345652282238, + "epoch": 0.10343434343434343, + "grad_norm": 2.3608956336975098, + "learning_rate": 3.490831503435575e-05, + "loss": 0.2381, + "mean_token_accuracy": 0.909605085849762, + "num_tokens": 12531982.0, + "step": 640 + }, + { + "entropy": 0.6474835090339184, + "epoch": 0.10505050505050505, + "grad_norm": 0.9185977578163147, + "learning_rate": 3.490525674149384e-05, + "loss": 0.2228, + "mean_token_accuracy": 0.9189437657594681, + "num_tokens": 12726420.0, + "step": 650 + }, + { + "entropy": 0.6150617614388466, + "epoch": 0.10666666666666667, + "grad_norm": 2.9653398990631104, + "learning_rate": 3.490214841337749e-05, + "loss": 0.2466, + "mean_token_accuracy": 0.9002894461154938, + "num_tokens": 12929338.0, + "step": 660 + }, + { + "entropy": 0.6306007914245129, + "epoch": 0.10828282828282829, + "grad_norm": 2.005779266357422, + "learning_rate": 3.4898990058942284e-05, + "loss": 0.2402, + "mean_token_accuracy": 0.9071647629141808, + "num_tokens": 13126452.0, + "step": 670 + }, + { + "entropy": 0.6232900455594063, + "epoch": 0.1098989898989899, + "grad_norm": 1.0521694421768188, + "learning_rate": 3.48957816872676e-05, + "loss": 0.2222, + "mean_token_accuracy": 0.9119127243757248, + "num_tokens": 13323868.0, + "step": 680 + }, + { + "entropy": 0.6038605846464634, + "epoch": 0.11151515151515151, + "grad_norm": 2.1727254390716553, + "learning_rate": 3.489252330757662e-05, + "loss": 0.2527, + "mean_token_accuracy": 0.8974953025579453, + "num_tokens": 13532123.0, + "step": 690 + }, + { + "entropy": 0.6080361865460873, + "epoch": 0.11313131313131314, + "grad_norm": 2.5059597492218018, + "learning_rate": 3.4889214929236264e-05, + "loss": 0.2359, + "mean_token_accuracy": 0.9091552272439003, + "num_tokens": 13739707.0, + "step": 700 + }, + { + "entropy": 0.6336839586496353, + "epoch": 0.11474747474747475, + "grad_norm": 0.798665463924408, + "learning_rate": 3.4885856561757215e-05, + "loss": 0.23, + "mean_token_accuracy": 0.9074552163481713, + "num_tokens": 13939220.0, + "step": 710 + }, + { + "entropy": 0.6417612984776497, + "epoch": 0.11636363636363636, + "grad_norm": 1.036450982093811, + "learning_rate": 3.488244821479382e-05, + "loss": 0.2191, + "mean_token_accuracy": 0.9127443626523017, + "num_tokens": 14131430.0, + "step": 720 + }, + { + "entropy": 0.6245701543986797, + "epoch": 0.11797979797979798, + "grad_norm": 1.9606401920318604, + "learning_rate": 3.487898989814414e-05, + "loss": 0.2244, + "mean_token_accuracy": 0.9075841188430787, + "num_tokens": 14328348.0, + "step": 730 + }, + { + "entropy": 0.6564237147569656, + "epoch": 0.1195959595959596, + "grad_norm": 1.6049528121948242, + "learning_rate": 3.487548162174987e-05, + "loss": 0.2296, + "mean_token_accuracy": 0.9106339663267136, + "num_tokens": 14515807.0, + "step": 740 + }, + { + "entropy": 0.6185222245752812, + "epoch": 0.12121212121212122, + "grad_norm": 1.29709792137146, + "learning_rate": 3.487192339569631e-05, + "loss": 0.2366, + "mean_token_accuracy": 0.9005967259407044, + "num_tokens": 14714178.0, + "step": 750 + }, + { + "entropy": 0.6413687355816364, + "epoch": 0.12282828282828283, + "grad_norm": 1.3318425416946411, + "learning_rate": 3.486831523021239e-05, + "loss": 0.2311, + "mean_token_accuracy": 0.9086028903722763, + "num_tokens": 14908741.0, + "step": 760 + }, + { + "entropy": 0.6336495697498321, + "epoch": 0.12444444444444444, + "grad_norm": 0.9258722066879272, + "learning_rate": 3.4864657135670555e-05, + "loss": 0.2459, + "mean_token_accuracy": 0.9087097436189652, + "num_tokens": 15106645.0, + "step": 770 + }, + { + "entropy": 0.6563303679227829, + "epoch": 0.12606060606060607, + "grad_norm": 1.5138893127441406, + "learning_rate": 3.486094912258683e-05, + "loss": 0.2287, + "mean_token_accuracy": 0.8989679113030433, + "num_tokens": 15294657.0, + "step": 780 + }, + { + "entropy": 0.6279109574854373, + "epoch": 0.12767676767676767, + "grad_norm": 1.7884769439697266, + "learning_rate": 3.485719120162069e-05, + "loss": 0.2441, + "mean_token_accuracy": 0.9058216944336891, + "num_tokens": 15492333.0, + "step": 790 + }, + { + "entropy": 0.6399269141256809, + "epoch": 0.1292929292929293, + "grad_norm": 0.9560442566871643, + "learning_rate": 3.485338338357513e-05, + "loss": 0.2125, + "mean_token_accuracy": 0.9148769214749336, + "num_tokens": 15685417.0, + "step": 800 + }, + { + "entropy": 0.6472979053854943, + "epoch": 0.13090909090909092, + "grad_norm": 2.49409556388855, + "learning_rate": 3.484952567939656e-05, + "loss": 0.2409, + "mean_token_accuracy": 0.907583674788475, + "num_tokens": 15882414.0, + "step": 810 + }, + { + "entropy": 0.5940545335412025, + "epoch": 0.13252525252525252, + "grad_norm": 1.5602918863296509, + "learning_rate": 3.48456181001748e-05, + "loss": 0.2404, + "mean_token_accuracy": 0.89293831884861, + "num_tokens": 16094120.0, + "step": 820 + }, + { + "entropy": 0.6304773099720478, + "epoch": 0.13414141414141414, + "grad_norm": 1.1661272048950195, + "learning_rate": 3.484166065714304e-05, + "loss": 0.2339, + "mean_token_accuracy": 0.9048717796802521, + "num_tokens": 16293723.0, + "step": 830 + }, + { + "entropy": 0.6498629376292229, + "epoch": 0.13575757575757577, + "grad_norm": 1.2386293411254883, + "learning_rate": 3.483765336167784e-05, + "loss": 0.2305, + "mean_token_accuracy": 0.9080037698149681, + "num_tokens": 16485193.0, + "step": 840 + }, + { + "entropy": 0.6220371380448342, + "epoch": 0.13737373737373737, + "grad_norm": 1.8108175992965698, + "learning_rate": 3.483359622529905e-05, + "loss": 0.226, + "mean_token_accuracy": 0.9119313061237335, + "num_tokens": 16687488.0, + "step": 850 + }, + { + "entropy": 0.6563889265060425, + "epoch": 0.138989898989899, + "grad_norm": 0.8232795000076294, + "learning_rate": 3.48294892596698e-05, + "loss": 0.2128, + "mean_token_accuracy": 0.9188828706741333, + "num_tokens": 16876673.0, + "step": 860 + }, + { + "entropy": 0.6596408911049366, + "epoch": 0.1406060606060606, + "grad_norm": 1.3460944890975952, + "learning_rate": 3.482533247659647e-05, + "loss": 0.2372, + "mean_token_accuracy": 0.9061458364129067, + "num_tokens": 17063863.0, + "step": 870 + }, + { + "entropy": 0.6356192789971828, + "epoch": 0.14222222222222222, + "grad_norm": 1.2244527339935303, + "learning_rate": 3.482112588802866e-05, + "loss": 0.2386, + "mean_token_accuracy": 0.9038299396634102, + "num_tokens": 17258452.0, + "step": 880 + }, + { + "entropy": 0.6439319156110287, + "epoch": 0.14383838383838385, + "grad_norm": 0.8409541249275208, + "learning_rate": 3.4816869506059134e-05, + "loss": 0.2163, + "mean_token_accuracy": 0.9198831230401993, + "num_tokens": 17451380.0, + "step": 890 + }, + { + "entropy": 0.640556076169014, + "epoch": 0.14545454545454545, + "grad_norm": 1.9304518699645996, + "learning_rate": 3.4812563342923794e-05, + "loss": 0.2141, + "mean_token_accuracy": 0.9013576105237007, + "num_tokens": 17644862.0, + "step": 900 + }, + { + "epoch": 0.14545454545454545, + "eval_entropy": 0.6915190102458, + "eval_loss": 0.20909403264522552, + "eval_mean_token_accuracy": 0.9142436196208, + "eval_num_tokens": 17644862.0, + "eval_runtime": 107.0886, + "eval_samples_per_second": 9.338, + "eval_steps_per_second": 9.338, + "step": 900 + }, + { + "entropy": 0.6096978440880776, + "epoch": 0.14707070707070707, + "grad_norm": 0.9783409237861633, + "learning_rate": 3.480820741100166e-05, + "loss": 0.2454, + "mean_token_accuracy": 0.9017274558544159, + "num_tokens": 17848157.0, + "step": 910 + }, + { + "entropy": 0.6275364428758621, + "epoch": 0.1486868686868687, + "grad_norm": 1.266980528831482, + "learning_rate": 3.480380172281483e-05, + "loss": 0.221, + "mean_token_accuracy": 0.909178127348423, + "num_tokens": 18047501.0, + "step": 920 + }, + { + "entropy": 0.6168364368379116, + "epoch": 0.1503030303030303, + "grad_norm": 0.7472752928733826, + "learning_rate": 3.4799346291028415e-05, + "loss": 0.2458, + "mean_token_accuracy": 0.9119211822748184, + "num_tokens": 18249869.0, + "step": 930 + }, + { + "entropy": 0.5978960894048214, + "epoch": 0.15191919191919193, + "grad_norm": 0.7442205548286438, + "learning_rate": 3.4794841128450554e-05, + "loss": 0.2559, + "mean_token_accuracy": 0.8956459879875183, + "num_tokens": 18457950.0, + "step": 940 + }, + { + "entropy": 0.6468767315149307, + "epoch": 0.15353535353535352, + "grad_norm": 1.178752064704895, + "learning_rate": 3.4790286248032314e-05, + "loss": 0.2113, + "mean_token_accuracy": 0.9203948318958283, + "num_tokens": 18650543.0, + "step": 950 + }, + { + "entropy": 0.6555621646344661, + "epoch": 0.15515151515151515, + "grad_norm": 0.9539858102798462, + "learning_rate": 3.478568166286771e-05, + "loss": 0.2289, + "mean_token_accuracy": 0.9062513440847397, + "num_tokens": 18841642.0, + "step": 960 + }, + { + "entropy": 0.6141032330691815, + "epoch": 0.15676767676767678, + "grad_norm": 2.01023006439209, + "learning_rate": 3.4781027386193646e-05, + "loss": 0.2389, + "mean_token_accuracy": 0.9044842913746833, + "num_tokens": 19045515.0, + "step": 970 + }, + { + "entropy": 0.6368382506072521, + "epoch": 0.15838383838383838, + "grad_norm": 0.5178106427192688, + "learning_rate": 3.4776323431389866e-05, + "loss": 0.2171, + "mean_token_accuracy": 0.9083613112568856, + "num_tokens": 19241213.0, + "step": 980 + }, + { + "entropy": 0.634531506896019, + "epoch": 0.16, + "grad_norm": 1.3576446771621704, + "learning_rate": 3.4771569811978915e-05, + "loss": 0.2216, + "mean_token_accuracy": 0.906193482875824, + "num_tokens": 19437628.0, + "step": 990 + }, + { + "entropy": 0.631604690849781, + "epoch": 0.16161616161616163, + "grad_norm": 1.0033283233642578, + "learning_rate": 3.476676654162613e-05, + "loss": 0.23, + "mean_token_accuracy": 0.9103793069720268, + "num_tokens": 19635494.0, + "step": 1000 + }, + { + "entropy": 0.6327577441930771, + "epoch": 0.16323232323232323, + "grad_norm": 0.9308726787567139, + "learning_rate": 3.4761913634139584e-05, + "loss": 0.2344, + "mean_token_accuracy": 0.9024915874004364, + "num_tokens": 19834517.0, + "step": 1010 + }, + { + "entropy": 0.6257577292621136, + "epoch": 0.16484848484848486, + "grad_norm": 1.1238154172897339, + "learning_rate": 3.475701110347001e-05, + "loss": 0.2315, + "mean_token_accuracy": 0.915093657374382, + "num_tokens": 20033584.0, + "step": 1020 + }, + { + "entropy": 0.6169027276337147, + "epoch": 0.16646464646464645, + "grad_norm": 1.1264071464538574, + "learning_rate": 3.4752058963710835e-05, + "loss": 0.2326, + "mean_token_accuracy": 0.8968415692448616, + "num_tokens": 20234492.0, + "step": 1030 + }, + { + "entropy": 0.6385836571455001, + "epoch": 0.16808080808080808, + "grad_norm": 0.7746083736419678, + "learning_rate": 3.474705722909807e-05, + "loss": 0.2305, + "mean_token_accuracy": 0.908117501437664, + "num_tokens": 20429582.0, + "step": 1040 + }, + { + "entropy": 0.6292243875563145, + "epoch": 0.1696969696969697, + "grad_norm": 0.7928685545921326, + "learning_rate": 3.4742005914010296e-05, + "loss": 0.2096, + "mean_token_accuracy": 0.9107841208577157, + "num_tokens": 20626277.0, + "step": 1050 + }, + { + "entropy": 0.6654087409377099, + "epoch": 0.1713131313131313, + "grad_norm": 0.6060704588890076, + "learning_rate": 3.473690503296865e-05, + "loss": 0.1949, + "mean_token_accuracy": 0.9214139148592949, + "num_tokens": 20809912.0, + "step": 1060 + }, + { + "entropy": 0.6308883085846901, + "epoch": 0.17292929292929293, + "grad_norm": 1.6825069189071655, + "learning_rate": 3.4731754600636734e-05, + "loss": 0.2161, + "mean_token_accuracy": 0.9196865290403367, + "num_tokens": 21004610.0, + "step": 1070 + }, + { + "entropy": 0.6125972785055638, + "epoch": 0.17454545454545456, + "grad_norm": 1.3060930967330933, + "learning_rate": 3.47265546318206e-05, + "loss": 0.2341, + "mean_token_accuracy": 0.9092792198061943, + "num_tokens": 21205277.0, + "step": 1080 + }, + { + "entropy": 0.6183511838316917, + "epoch": 0.17616161616161616, + "grad_norm": 0.9683547019958496, + "learning_rate": 3.472130514146871e-05, + "loss": 0.2087, + "mean_token_accuracy": 0.9153425708413124, + "num_tokens": 21404921.0, + "step": 1090 + }, + { + "entropy": 0.6358366332948208, + "epoch": 0.17777777777777778, + "grad_norm": 1.0839707851409912, + "learning_rate": 3.471600614467188e-05, + "loss": 0.2266, + "mean_token_accuracy": 0.9070942506194115, + "num_tokens": 21598578.0, + "step": 1100 + }, + { + "entropy": 0.6339450895786285, + "epoch": 0.17939393939393938, + "grad_norm": 1.1610348224639893, + "learning_rate": 3.4710657656663257e-05, + "loss": 0.2206, + "mean_token_accuracy": 0.908353665471077, + "num_tokens": 21795249.0, + "step": 1110 + }, + { + "entropy": 0.6443082444369793, + "epoch": 0.181010101010101, + "grad_norm": 1.1031250953674316, + "learning_rate": 3.470525969281824e-05, + "loss": 0.1938, + "mean_token_accuracy": 0.9259483084082604, + "num_tokens": 21988155.0, + "step": 1120 + }, + { + "entropy": 0.6319502085447312, + "epoch": 0.18262626262626264, + "grad_norm": 1.0539134740829468, + "learning_rate": 3.469981226865448e-05, + "loss": 0.1993, + "mean_token_accuracy": 0.9268665567040444, + "num_tokens": 22183128.0, + "step": 1130 + }, + { + "entropy": 0.6007440723478794, + "epoch": 0.18424242424242424, + "grad_norm": 1.4432859420776367, + "learning_rate": 3.469431539983178e-05, + "loss": 0.232, + "mean_token_accuracy": 0.9083610430359841, + "num_tokens": 22386990.0, + "step": 1140 + }, + { + "entropy": 0.6132355526089668, + "epoch": 0.18585858585858586, + "grad_norm": 0.695091962814331, + "learning_rate": 3.468876910215212e-05, + "loss": 0.2379, + "mean_token_accuracy": 0.9031597882509231, + "num_tokens": 22587971.0, + "step": 1150 + }, + { + "entropy": 0.5998403012752533, + "epoch": 0.1874747474747475, + "grad_norm": 0.9048874974250793, + "learning_rate": 3.468317339155955e-05, + "loss": 0.2397, + "mean_token_accuracy": 0.9018667727708817, + "num_tokens": 22793134.0, + "step": 1160 + }, + { + "entropy": 0.6309370696544647, + "epoch": 0.1890909090909091, + "grad_norm": 0.9386253952980042, + "learning_rate": 3.467752828414019e-05, + "loss": 0.2087, + "mean_token_accuracy": 0.9095293834805489, + "num_tokens": 22989872.0, + "step": 1170 + }, + { + "entropy": 0.6670469336211682, + "epoch": 0.1907070707070707, + "grad_norm": 0.8335705995559692, + "learning_rate": 3.467183379612213e-05, + "loss": 0.2037, + "mean_token_accuracy": 0.9193868711590767, + "num_tokens": 23175070.0, + "step": 1180 + }, + { + "entropy": 0.605170601606369, + "epoch": 0.1923232323232323, + "grad_norm": 0.7726657390594482, + "learning_rate": 3.4666089943875444e-05, + "loss": 0.2185, + "mean_token_accuracy": 0.9151586100459099, + "num_tokens": 23379679.0, + "step": 1190 + }, + { + "entropy": 0.6216241672635079, + "epoch": 0.19393939393939394, + "grad_norm": 1.46378493309021, + "learning_rate": 3.466029674391211e-05, + "loss": 0.2369, + "mean_token_accuracy": 0.8999699011445046, + "num_tokens": 23579844.0, + "step": 1200 + }, + { + "epoch": 0.19393939393939394, + "eval_entropy": 0.6973449417352676, + "eval_loss": 0.19888541102409363, + "eval_mean_token_accuracy": 0.9178089879155159, + "eval_num_tokens": 23579844.0, + "eval_runtime": 106.7423, + "eval_samples_per_second": 9.368, + "eval_steps_per_second": 9.368, + "step": 1200 + }, + { + "entropy": 0.6532895624637604, + "epoch": 0.19555555555555557, + "grad_norm": 3.021833658218384, + "learning_rate": 3.465445421288594e-05, + "loss": 0.2319, + "mean_token_accuracy": 0.9079471796751022, + "num_tokens": 23770536.0, + "step": 1210 + }, + { + "entropy": 0.6232981145381927, + "epoch": 0.19717171717171716, + "grad_norm": 0.7231025099754333, + "learning_rate": 3.4648562367592595e-05, + "loss": 0.2117, + "mean_token_accuracy": 0.9194033876061439, + "num_tokens": 23971970.0, + "step": 1220 + }, + { + "entropy": 0.6648551635444164, + "epoch": 0.1987878787878788, + "grad_norm": 1.4859992265701294, + "learning_rate": 3.4642621224969474e-05, + "loss": 0.203, + "mean_token_accuracy": 0.9099412858486176, + "num_tokens": 24160322.0, + "step": 1230 + }, + { + "entropy": 0.6378097325563431, + "epoch": 0.20040404040404042, + "grad_norm": 0.5669660568237305, + "learning_rate": 3.4636630802095723e-05, + "loss": 0.2115, + "mean_token_accuracy": 0.9144275858998299, + "num_tokens": 24354426.0, + "step": 1240 + }, + { + "entropy": 0.6563280507922172, + "epoch": 0.20202020202020202, + "grad_norm": 2.1159043312072754, + "learning_rate": 3.463059111619212e-05, + "loss": 0.2019, + "mean_token_accuracy": 0.9190993249416352, + "num_tokens": 24543374.0, + "step": 1250 + }, + { + "entropy": 0.6134343653917312, + "epoch": 0.20363636363636364, + "grad_norm": 1.0814077854156494, + "learning_rate": 3.462450218462108e-05, + "loss": 0.2252, + "mean_token_accuracy": 0.90730449706316, + "num_tokens": 24747550.0, + "step": 1260 + }, + { + "entropy": 0.639523645490408, + "epoch": 0.20525252525252524, + "grad_norm": 2.090059995651245, + "learning_rate": 3.461836402488658e-05, + "loss": 0.232, + "mean_token_accuracy": 0.9036611437797546, + "num_tokens": 24942137.0, + "step": 1270 + }, + { + "entropy": 0.6455445013940334, + "epoch": 0.20686868686868687, + "grad_norm": 1.184032917022705, + "learning_rate": 3.4612176654634124e-05, + "loss": 0.1844, + "mean_token_accuracy": 0.9213565349578857, + "num_tokens": 25136742.0, + "step": 1280 + }, + { + "entropy": 0.5931920140981675, + "epoch": 0.2084848484848485, + "grad_norm": 1.39106285572052, + "learning_rate": 3.460594009165066e-05, + "loss": 0.2198, + "mean_token_accuracy": 0.9083201169967652, + "num_tokens": 25344524.0, + "step": 1290 + }, + { + "entropy": 0.6356636583805084, + "epoch": 0.2101010101010101, + "grad_norm": 1.8506667613983154, + "learning_rate": 3.4599654353864576e-05, + "loss": 0.2199, + "mean_token_accuracy": 0.9171806886792183, + "num_tokens": 25540892.0, + "step": 1300 + }, + { + "entropy": 0.6352035351097584, + "epoch": 0.21171717171717172, + "grad_norm": 1.501383662223816, + "learning_rate": 3.459331945934561e-05, + "loss": 0.2049, + "mean_token_accuracy": 0.9134323224425316, + "num_tokens": 25736036.0, + "step": 1310 + }, + { + "entropy": 0.6150361925363541, + "epoch": 0.21333333333333335, + "grad_norm": 0.6642631888389587, + "learning_rate": 3.458693542630481e-05, + "loss": 0.2099, + "mean_token_accuracy": 0.9162977918982506, + "num_tokens": 25938119.0, + "step": 1320 + }, + { + "entropy": 0.641390497982502, + "epoch": 0.21494949494949495, + "grad_norm": 1.7314903736114502, + "learning_rate": 3.4580502273094506e-05, + "loss": 0.2247, + "mean_token_accuracy": 0.9050250992178916, + "num_tokens": 26129891.0, + "step": 1330 + }, + { + "entropy": 0.6408931516110897, + "epoch": 0.21656565656565657, + "grad_norm": 0.5409038066864014, + "learning_rate": 3.4574020018208206e-05, + "loss": 0.2202, + "mean_token_accuracy": 0.9055180490016937, + "num_tokens": 26321688.0, + "step": 1340 + }, + { + "entropy": 0.6626883774995804, + "epoch": 0.21818181818181817, + "grad_norm": 3.052995443344116, + "learning_rate": 3.456748868028058e-05, + "loss": 0.2088, + "mean_token_accuracy": 0.9178230226039886, + "num_tokens": 26511190.0, + "step": 1350 + }, + { + "entropy": 0.645967397838831, + "epoch": 0.2197979797979798, + "grad_norm": 0.5310812592506409, + "learning_rate": 3.45609082780874e-05, + "loss": 0.2065, + "mean_token_accuracy": 0.9129741847515106, + "num_tokens": 26702954.0, + "step": 1360 + }, + { + "entropy": 0.635505760461092, + "epoch": 0.22141414141414142, + "grad_norm": 0.7930052280426025, + "learning_rate": 3.4554278830545494e-05, + "loss": 0.2288, + "mean_token_accuracy": 0.9053992003202438, + "num_tokens": 26898915.0, + "step": 1370 + }, + { + "entropy": 0.6399193912744522, + "epoch": 0.22303030303030302, + "grad_norm": 1.7344346046447754, + "learning_rate": 3.4547600356712673e-05, + "loss": 0.2026, + "mean_token_accuracy": 0.917083628475666, + "num_tokens": 27092903.0, + "step": 1380 + }, + { + "entropy": 0.6360017918050289, + "epoch": 0.22464646464646465, + "grad_norm": 0.5752031803131104, + "learning_rate": 3.454087287578768e-05, + "loss": 0.2007, + "mean_token_accuracy": 0.9171236485242844, + "num_tokens": 27286898.0, + "step": 1390 + }, + { + "entropy": 0.657283715903759, + "epoch": 0.22626262626262628, + "grad_norm": 1.8767166137695312, + "learning_rate": 3.4534096407110144e-05, + "loss": 0.216, + "mean_token_accuracy": 0.9128377422690391, + "num_tokens": 27478561.0, + "step": 1400 + }, + { + "entropy": 0.6192032106220722, + "epoch": 0.22787878787878788, + "grad_norm": 1.462258219718933, + "learning_rate": 3.452727097016053e-05, + "loss": 0.2269, + "mean_token_accuracy": 0.9088853120803833, + "num_tokens": 27679828.0, + "step": 1410 + }, + { + "entropy": 0.6551795959472656, + "epoch": 0.2294949494949495, + "grad_norm": 0.6006298661231995, + "learning_rate": 3.452039658456005e-05, + "loss": 0.2303, + "mean_token_accuracy": 0.905583493411541, + "num_tokens": 27869832.0, + "step": 1420 + }, + { + "entropy": 0.648159421235323, + "epoch": 0.2311111111111111, + "grad_norm": 1.1963809728622437, + "learning_rate": 3.4513473270070655e-05, + "loss": 0.2388, + "mean_token_accuracy": 0.9001779198646546, + "num_tokens": 28063539.0, + "step": 1430 + }, + { + "entropy": 0.6477998718619347, + "epoch": 0.23272727272727273, + "grad_norm": 0.4751994013786316, + "learning_rate": 3.450650104659493e-05, + "loss": 0.2048, + "mean_token_accuracy": 0.9150430023670196, + "num_tokens": 28256696.0, + "step": 1440 + }, + { + "entropy": 0.6262367948889732, + "epoch": 0.23434343434343435, + "grad_norm": 1.1237725019454956, + "learning_rate": 3.449947993417608e-05, + "loss": 0.2174, + "mean_token_accuracy": 0.9058823600411415, + "num_tokens": 28459175.0, + "step": 1450 + }, + { + "entropy": 0.6371082656085492, + "epoch": 0.23595959595959595, + "grad_norm": 0.3717547357082367, + "learning_rate": 3.4492409952997846e-05, + "loss": 0.2099, + "mean_token_accuracy": 0.9148850768804551, + "num_tokens": 28656115.0, + "step": 1460 + }, + { + "entropy": 0.6463325545191765, + "epoch": 0.23757575757575758, + "grad_norm": 0.8946523070335388, + "learning_rate": 3.4485291123384445e-05, + "loss": 0.2103, + "mean_token_accuracy": 0.9115211561322212, + "num_tokens": 28850698.0, + "step": 1470 + }, + { + "entropy": 0.6228535048663616, + "epoch": 0.2391919191919192, + "grad_norm": 1.0412840843200684, + "learning_rate": 3.447812346580053e-05, + "loss": 0.2099, + "mean_token_accuracy": 0.9091267749667168, + "num_tokens": 29049617.0, + "step": 1480 + }, + { + "entropy": 0.6189112387597561, + "epoch": 0.2408080808080808, + "grad_norm": 1.1727371215820312, + "learning_rate": 3.447090700085111e-05, + "loss": 0.2184, + "mean_token_accuracy": 0.915630365908146, + "num_tokens": 29250403.0, + "step": 1490 + }, + { + "entropy": 0.6186697401106358, + "epoch": 0.24242424242424243, + "grad_norm": 0.5877743363380432, + "learning_rate": 3.4463641749281495e-05, + "loss": 0.1967, + "mean_token_accuracy": 0.9188372850418091, + "num_tokens": 29450472.0, + "step": 1500 + }, + { + "epoch": 0.24242424242424243, + "eval_entropy": 0.6933836719095707, + "eval_loss": 0.19674202799797058, + "eval_mean_token_accuracy": 0.9202139660716057, + "eval_num_tokens": 29450472.0, + "eval_runtime": 107.0097, + "eval_samples_per_second": 9.345, + "eval_steps_per_second": 9.345, + "step": 1500 + }, + { + "entropy": 0.6295811668038368, + "epoch": 0.24404040404040403, + "grad_norm": 1.9190826416015625, + "learning_rate": 3.445632773197728e-05, + "loss": 0.2389, + "mean_token_accuracy": 0.9088302314281463, + "num_tokens": 29650737.0, + "step": 1510 + }, + { + "entropy": 0.629815524816513, + "epoch": 0.24565656565656566, + "grad_norm": 0.5053853392601013, + "learning_rate": 3.444896496996421e-05, + "loss": 0.2055, + "mean_token_accuracy": 0.9128904730081558, + "num_tokens": 29850237.0, + "step": 1520 + }, + { + "entropy": 0.6402494020760059, + "epoch": 0.24727272727272728, + "grad_norm": 1.2792322635650635, + "learning_rate": 3.444155348440817e-05, + "loss": 0.2039, + "mean_token_accuracy": 0.9191207081079483, + "num_tokens": 30045394.0, + "step": 1530 + }, + { + "entropy": 0.6441243663430214, + "epoch": 0.24888888888888888, + "grad_norm": 0.9373044967651367, + "learning_rate": 3.443409329661512e-05, + "loss": 0.2121, + "mean_token_accuracy": 0.9115235760807991, + "num_tokens": 30237651.0, + "step": 1540 + }, + { + "entropy": 0.6224880084395409, + "epoch": 0.2505050505050505, + "grad_norm": 0.5471675395965576, + "learning_rate": 3.442658442803101e-05, + "loss": 0.1991, + "mean_token_accuracy": 0.915025606751442, + "num_tokens": 30438010.0, + "step": 1550 + }, + { + "entropy": 0.6422186933457852, + "epoch": 0.25212121212121213, + "grad_norm": 2.163301944732666, + "learning_rate": 3.441902690024174e-05, + "loss": 0.2081, + "mean_token_accuracy": 0.9188799828290939, + "num_tokens": 30631524.0, + "step": 1560 + }, + { + "entropy": 0.6336139030754566, + "epoch": 0.25373737373737376, + "grad_norm": 0.5002826452255249, + "learning_rate": 3.44114207349731e-05, + "loss": 0.1961, + "mean_token_accuracy": 0.9159276992082596, + "num_tokens": 30827142.0, + "step": 1570 + }, + { + "entropy": 0.6471013821661472, + "epoch": 0.25535353535353533, + "grad_norm": 0.5277097225189209, + "learning_rate": 3.440376595409068e-05, + "loss": 0.2112, + "mean_token_accuracy": 0.9091186106204987, + "num_tokens": 31018112.0, + "step": 1580 + }, + { + "entropy": 0.6442371979355812, + "epoch": 0.25696969696969696, + "grad_norm": 3.8261702060699463, + "learning_rate": 3.4396062579599855e-05, + "loss": 0.2012, + "mean_token_accuracy": 0.9144791051745415, + "num_tokens": 31211638.0, + "step": 1590 + }, + { + "entropy": 0.6473165228962898, + "epoch": 0.2585858585858586, + "grad_norm": 0.6906265020370483, + "learning_rate": 3.438831063364566e-05, + "loss": 0.2015, + "mean_token_accuracy": 0.9193555861711502, + "num_tokens": 31403481.0, + "step": 1600 + }, + { + "entropy": 0.6608973711729049, + "epoch": 0.2602020202020202, + "grad_norm": 1.506940484046936, + "learning_rate": 3.4380510138512785e-05, + "loss": 0.1908, + "mean_token_accuracy": 0.9326013177633286, + "num_tokens": 31592824.0, + "step": 1610 + }, + { + "entropy": 0.6413066022098064, + "epoch": 0.26181818181818184, + "grad_norm": 1.2020001411437988, + "learning_rate": 3.437266111662548e-05, + "loss": 0.2001, + "mean_token_accuracy": 0.9156825304031372, + "num_tokens": 31785892.0, + "step": 1620 + }, + { + "entropy": 0.6559233710169792, + "epoch": 0.2634343434343434, + "grad_norm": 1.3252395391464233, + "learning_rate": 3.436476359054747e-05, + "loss": 0.2206, + "mean_token_accuracy": 0.9107002332806587, + "num_tokens": 31975303.0, + "step": 1630 + }, + { + "entropy": 0.6213893130421638, + "epoch": 0.26505050505050504, + "grad_norm": 0.9649196267127991, + "learning_rate": 3.435681758298196e-05, + "loss": 0.2396, + "mean_token_accuracy": 0.9061437651515007, + "num_tokens": 32176608.0, + "step": 1640 + }, + { + "entropy": 0.6706323623657227, + "epoch": 0.26666666666666666, + "grad_norm": 0.7337570786476135, + "learning_rate": 3.43488231167715e-05, + "loss": 0.1941, + "mean_token_accuracy": 0.9181428372859954, + "num_tokens": 32365469.0, + "step": 1650 + }, + { + "entropy": 0.6438912704586983, + "epoch": 0.2682828282828283, + "grad_norm": 0.5835590958595276, + "learning_rate": 3.4340780214897944e-05, + "loss": 0.2114, + "mean_token_accuracy": 0.9103723153471946, + "num_tokens": 32558625.0, + "step": 1660 + }, + { + "entropy": 0.6183638513088227, + "epoch": 0.2698989898989899, + "grad_norm": 0.5338345170021057, + "learning_rate": 3.433268890048239e-05, + "loss": 0.212, + "mean_token_accuracy": 0.9069491773843765, + "num_tokens": 32758742.0, + "step": 1670 + }, + { + "entropy": 0.6191523462533951, + "epoch": 0.27151515151515154, + "grad_norm": 0.659136176109314, + "learning_rate": 3.4324549196785114e-05, + "loss": 0.2267, + "mean_token_accuracy": 0.9071033582091331, + "num_tokens": 32960876.0, + "step": 1680 + }, + { + "entropy": 0.6082011148333549, + "epoch": 0.2731313131313131, + "grad_norm": 0.40289682149887085, + "learning_rate": 3.4316361127205486e-05, + "loss": 0.2271, + "mean_token_accuracy": 0.9086646243929863, + "num_tokens": 33165337.0, + "step": 1690 + }, + { + "entropy": 0.6146745294332504, + "epoch": 0.27474747474747474, + "grad_norm": 1.1410349607467651, + "learning_rate": 3.4308124715281916e-05, + "loss": 0.2113, + "mean_token_accuracy": 0.9149732172489167, + "num_tokens": 33367594.0, + "step": 1700 + }, + { + "entropy": 0.6495476797223091, + "epoch": 0.27636363636363637, + "grad_norm": 2.567011594772339, + "learning_rate": 3.42998399846918e-05, + "loss": 0.2199, + "mean_token_accuracy": 0.910026653110981, + "num_tokens": 33558500.0, + "step": 1710 + }, + { + "entropy": 0.6399842962622643, + "epoch": 0.277979797979798, + "grad_norm": 0.49004054069519043, + "learning_rate": 3.429150695925142e-05, + "loss": 0.196, + "mean_token_accuracy": 0.912572592496872, + "num_tokens": 33756919.0, + "step": 1720 + }, + { + "entropy": 0.6292979188263417, + "epoch": 0.2795959595959596, + "grad_norm": 0.6200940012931824, + "learning_rate": 3.4283125662915895e-05, + "loss": 0.1993, + "mean_token_accuracy": 0.9184976890683174, + "num_tokens": 33955439.0, + "step": 1730 + }, + { + "entropy": 0.6242897778749465, + "epoch": 0.2812121212121212, + "grad_norm": 0.46822234988212585, + "learning_rate": 3.427469611977912e-05, + "loss": 0.214, + "mean_token_accuracy": 0.9052839800715446, + "num_tokens": 34151935.0, + "step": 1740 + }, + { + "entropy": 0.6312501393258572, + "epoch": 0.2828282828282828, + "grad_norm": 0.6502590775489807, + "learning_rate": 3.426621835407367e-05, + "loss": 0.1963, + "mean_token_accuracy": 0.9219602450728417, + "num_tokens": 34347358.0, + "step": 1750 + }, + { + "entropy": 0.6140806257724762, + "epoch": 0.28444444444444444, + "grad_norm": 0.7690215706825256, + "learning_rate": 3.425769239017077e-05, + "loss": 0.2079, + "mean_token_accuracy": 0.9129775792360306, + "num_tokens": 34549789.0, + "step": 1760 + }, + { + "entropy": 0.6185090109705925, + "epoch": 0.28606060606060607, + "grad_norm": 0.5466774702072144, + "learning_rate": 3.424911825258016e-05, + "loss": 0.2121, + "mean_token_accuracy": 0.9113013401627541, + "num_tokens": 34750972.0, + "step": 1770 + }, + { + "entropy": 0.6002763979136944, + "epoch": 0.2876767676767677, + "grad_norm": 0.599219799041748, + "learning_rate": 3.4240495965950124e-05, + "loss": 0.2317, + "mean_token_accuracy": 0.9014593094587326, + "num_tokens": 34956175.0, + "step": 1780 + }, + { + "entropy": 0.6477742247283459, + "epoch": 0.28929292929292927, + "grad_norm": 0.6608160138130188, + "learning_rate": 3.42318255550673e-05, + "loss": 0.2057, + "mean_token_accuracy": 0.9176206976175308, + "num_tokens": 35148784.0, + "step": 1790 + }, + { + "entropy": 0.6291337795555592, + "epoch": 0.2909090909090909, + "grad_norm": 2.33884596824646, + "learning_rate": 3.422310704485672e-05, + "loss": 0.199, + "mean_token_accuracy": 0.9241131842136383, + "num_tokens": 35345505.0, + "step": 1800 + }, + { + "epoch": 0.2909090909090909, + "eval_entropy": 0.6954512582421303, + "eval_loss": 0.1912153661251068, + "eval_mean_token_accuracy": 0.9204178621172905, + "eval_num_tokens": 35345505.0, + "eval_runtime": 107.4175, + "eval_samples_per_second": 9.309, + "eval_steps_per_second": 9.309, + "step": 1800 + }, + { + "entropy": 0.6612498946487904, + "epoch": 0.2925252525252525, + "grad_norm": 0.5786094069480896, + "learning_rate": 3.421434046038165e-05, + "loss": 0.1898, + "mean_token_accuracy": 0.9201330304145813, + "num_tokens": 35534904.0, + "step": 1810 + }, + { + "entropy": 0.6251860365271569, + "epoch": 0.29414141414141415, + "grad_norm": 0.6408361196517944, + "learning_rate": 3.4205525826843576e-05, + "loss": 0.227, + "mean_token_accuracy": 0.9126361146569252, + "num_tokens": 35735057.0, + "step": 1820 + }, + { + "entropy": 0.6235713072121143, + "epoch": 0.2957575757575758, + "grad_norm": 0.4577428102493286, + "learning_rate": 3.4196663169582125e-05, + "loss": 0.2142, + "mean_token_accuracy": 0.9085043162107468, + "num_tokens": 35935733.0, + "step": 1830 + }, + { + "entropy": 0.6569493114948273, + "epoch": 0.2973737373737374, + "grad_norm": 0.7934256196022034, + "learning_rate": 3.4187752514074955e-05, + "loss": 0.2143, + "mean_token_accuracy": 0.9169334411621094, + "num_tokens": 36125039.0, + "step": 1840 + }, + { + "entropy": 0.6233655147254467, + "epoch": 0.298989898989899, + "grad_norm": 0.4999740719795227, + "learning_rate": 3.4178793885937705e-05, + "loss": 0.2174, + "mean_token_accuracy": 0.908802704513073, + "num_tokens": 36324044.0, + "step": 1850 + }, + { + "entropy": 0.6451633021235466, + "epoch": 0.3006060606060606, + "grad_norm": 0.720961332321167, + "learning_rate": 3.416978731092394e-05, + "loss": 0.192, + "mean_token_accuracy": 0.9189713701605797, + "num_tokens": 36515735.0, + "step": 1860 + }, + { + "entropy": 0.6397333383560181, + "epoch": 0.3022222222222222, + "grad_norm": 2.212775230407715, + "learning_rate": 3.416073281492504e-05, + "loss": 0.2236, + "mean_token_accuracy": 0.9063498586416244, + "num_tokens": 36712700.0, + "step": 1870 + }, + { + "entropy": 0.6438430123031139, + "epoch": 0.30383838383838385, + "grad_norm": 0.5264093279838562, + "learning_rate": 3.4151630423970153e-05, + "loss": 0.1993, + "mean_token_accuracy": 0.91684380620718, + "num_tokens": 36908572.0, + "step": 1880 + }, + { + "entropy": 0.6554016143083572, + "epoch": 0.3054545454545455, + "grad_norm": 0.41331690549850464, + "learning_rate": 3.414248016422613e-05, + "loss": 0.2099, + "mean_token_accuracy": 0.9113001599907875, + "num_tokens": 37101597.0, + "step": 1890 + }, + { + "entropy": 0.6380980283021926, + "epoch": 0.30707070707070705, + "grad_norm": 0.4134485721588135, + "learning_rate": 3.413328206199739e-05, + "loss": 0.2273, + "mean_token_accuracy": 0.9148691385984421, + "num_tokens": 37300456.0, + "step": 1900 + }, + { + "entropy": 0.6408122353255749, + "epoch": 0.3086868686868687, + "grad_norm": 2.0843093395233154, + "learning_rate": 3.412403614372592e-05, + "loss": 0.2067, + "mean_token_accuracy": 0.9162870794534683, + "num_tokens": 37496092.0, + "step": 1910 + }, + { + "entropy": 0.6569594420492649, + "epoch": 0.3103030303030303, + "grad_norm": 1.7086161375045776, + "learning_rate": 3.411474243599116e-05, + "loss": 0.2071, + "mean_token_accuracy": 0.9168068438768386, + "num_tokens": 37685676.0, + "step": 1920 + }, + { + "entropy": 0.6387952454388142, + "epoch": 0.31191919191919193, + "grad_norm": 0.40981894731521606, + "learning_rate": 3.4105400965509906e-05, + "loss": 0.1963, + "mean_token_accuracy": 0.9146794468164444, + "num_tokens": 37882882.0, + "step": 1930 + }, + { + "entropy": 0.6661320142447948, + "epoch": 0.31353535353535356, + "grad_norm": 0.6330037713050842, + "learning_rate": 3.40960117591363e-05, + "loss": 0.1991, + "mean_token_accuracy": 0.9190028592944145, + "num_tokens": 38069595.0, + "step": 1940 + }, + { + "entropy": 0.6375580407679081, + "epoch": 0.3151515151515151, + "grad_norm": 0.5483776926994324, + "learning_rate": 3.408657484386168e-05, + "loss": 0.2166, + "mean_token_accuracy": 0.9116458058357239, + "num_tokens": 38267168.0, + "step": 1950 + }, + { + "entropy": 0.6482452765107155, + "epoch": 0.31676767676767675, + "grad_norm": 2.778130531311035, + "learning_rate": 3.4077090246814544e-05, + "loss": 0.2098, + "mean_token_accuracy": 0.9136871173977852, + "num_tokens": 38461292.0, + "step": 1960 + }, + { + "entropy": 0.674566724896431, + "epoch": 0.3183838383838384, + "grad_norm": 1.4035710096359253, + "learning_rate": 3.406755799526046e-05, + "loss": 0.1931, + "mean_token_accuracy": 0.9202602237462998, + "num_tokens": 38647952.0, + "step": 1970 + }, + { + "entropy": 0.6578912548720837, + "epoch": 0.32, + "grad_norm": 0.8172881603240967, + "learning_rate": 3.405797811660199e-05, + "loss": 0.1821, + "mean_token_accuracy": 0.9241382718086243, + "num_tokens": 38838677.0, + "step": 1980 + }, + { + "entropy": 0.6281578689813614, + "epoch": 0.32161616161616163, + "grad_norm": 0.9093276858329773, + "learning_rate": 3.4048350638378606e-05, + "loss": 0.2202, + "mean_token_accuracy": 0.9049888834357261, + "num_tokens": 39035872.0, + "step": 1990 + }, + { + "entropy": 0.6333778567612172, + "epoch": 0.32323232323232326, + "grad_norm": 0.880991518497467, + "learning_rate": 3.403867558826663e-05, + "loss": 0.2041, + "mean_token_accuracy": 0.9181326001882553, + "num_tokens": 39235091.0, + "step": 2000 + }, + { + "entropy": 0.6394575096666812, + "epoch": 0.32484848484848483, + "grad_norm": 0.5308339595794678, + "learning_rate": 3.402895299407913e-05, + "loss": 0.1872, + "mean_token_accuracy": 0.9277254670858384, + "num_tokens": 39429550.0, + "step": 2010 + }, + { + "entropy": 0.6256730824708938, + "epoch": 0.32646464646464646, + "grad_norm": 0.37278324365615845, + "learning_rate": 3.4019182883765844e-05, + "loss": 0.1827, + "mean_token_accuracy": 0.9239986538887024, + "num_tokens": 39629223.0, + "step": 2020 + }, + { + "entropy": 0.6522976204752922, + "epoch": 0.3280808080808081, + "grad_norm": 0.3719552457332611, + "learning_rate": 3.400936528541311e-05, + "loss": 0.215, + "mean_token_accuracy": 0.912938816845417, + "num_tokens": 39820805.0, + "step": 2030 + }, + { + "entropy": 0.6549306355416775, + "epoch": 0.3296969696969697, + "grad_norm": 0.9718757271766663, + "learning_rate": 3.399950022724379e-05, + "loss": 0.1936, + "mean_token_accuracy": 0.9173328787088394, + "num_tokens": 40013643.0, + "step": 2040 + }, + { + "entropy": 0.6376727804541588, + "epoch": 0.33131313131313134, + "grad_norm": 2.6161608695983887, + "learning_rate": 3.398958773761717e-05, + "loss": 0.1937, + "mean_token_accuracy": 0.918268159031868, + "num_tokens": 40213829.0, + "step": 2050 + }, + { + "entropy": 0.6503816410899163, + "epoch": 0.3329292929292929, + "grad_norm": 0.4555739760398865, + "learning_rate": 3.3979627845028884e-05, + "loss": 0.1945, + "mean_token_accuracy": 0.9175498813390732, + "num_tokens": 40408625.0, + "step": 2060 + }, + { + "entropy": 0.6709971487522125, + "epoch": 0.33454545454545453, + "grad_norm": 0.30291101336479187, + "learning_rate": 3.396962057811085e-05, + "loss": 0.1906, + "mean_token_accuracy": 0.923813234269619, + "num_tokens": 40596756.0, + "step": 2070 + }, + { + "entropy": 0.6471885599195957, + "epoch": 0.33616161616161616, + "grad_norm": 1.0948461294174194, + "learning_rate": 3.395956596563117e-05, + "loss": 0.2153, + "mean_token_accuracy": 0.909675869345665, + "num_tokens": 40791510.0, + "step": 2080 + }, + { + "entropy": 0.6256736367940903, + "epoch": 0.3377777777777778, + "grad_norm": 0.5939026474952698, + "learning_rate": 3.394946403649405e-05, + "loss": 0.21, + "mean_token_accuracy": 0.9117529839277267, + "num_tokens": 40990851.0, + "step": 2090 + }, + { + "entropy": 0.6239214479923249, + "epoch": 0.3393939393939394, + "grad_norm": 0.7415814995765686, + "learning_rate": 3.3939314819739696e-05, + "loss": 0.2074, + "mean_token_accuracy": 0.912028856575489, + "num_tokens": 41191154.0, + "step": 2100 + }, + { + "epoch": 0.3393939393939394, + "eval_entropy": 0.6998262491822242, + "eval_loss": 0.1984918862581253, + "eval_mean_token_accuracy": 0.9189635333418846, + "eval_num_tokens": 41191154.0, + "eval_runtime": 106.9645, + "eval_samples_per_second": 9.349, + "eval_steps_per_second": 9.349, + "step": 2100 + }, + { + "entropy": 0.651825649291277, + "epoch": 0.341010101010101, + "grad_norm": 1.52384352684021, + "learning_rate": 3.392911834454429e-05, + "loss": 0.2159, + "mean_token_accuracy": 0.9134291365742684, + "num_tokens": 41384003.0, + "step": 2110 + }, + { + "entropy": 0.6138758823275566, + "epoch": 0.3426262626262626, + "grad_norm": 0.583720862865448, + "learning_rate": 3.391887464021985e-05, + "loss": 0.2322, + "mean_token_accuracy": 0.9039447426795959, + "num_tokens": 41590543.0, + "step": 2120 + }, + { + "entropy": 0.620711050182581, + "epoch": 0.34424242424242424, + "grad_norm": 0.7128543853759766, + "learning_rate": 3.3908583736214166e-05, + "loss": 0.2233, + "mean_token_accuracy": 0.9042671546339989, + "num_tokens": 41792676.0, + "step": 2130 + }, + { + "entropy": 0.6385029278695583, + "epoch": 0.34585858585858587, + "grad_norm": 0.3531809449195862, + "learning_rate": 3.389824566211071e-05, + "loss": 0.2012, + "mean_token_accuracy": 0.9119970798492432, + "num_tokens": 41989615.0, + "step": 2140 + }, + { + "entropy": 0.6404819719493389, + "epoch": 0.3474747474747475, + "grad_norm": 1.1511902809143066, + "learning_rate": 3.388786044762857e-05, + "loss": 0.1978, + "mean_token_accuracy": 0.9193251192569732, + "num_tokens": 42189774.0, + "step": 2150 + }, + { + "entropy": 0.6125321023166179, + "epoch": 0.3490909090909091, + "grad_norm": 1.0474456548690796, + "learning_rate": 3.387742812262232e-05, + "loss": 0.2248, + "mean_token_accuracy": 0.9014544919133186, + "num_tokens": 42397597.0, + "step": 2160 + }, + { + "entropy": 0.6697261020541191, + "epoch": 0.3507070707070707, + "grad_norm": 2.6372017860412598, + "learning_rate": 3.386694871708201e-05, + "loss": 0.2131, + "mean_token_accuracy": 0.9181707888841629, + "num_tokens": 42589672.0, + "step": 2170 + }, + { + "entropy": 0.6931374616920948, + "epoch": 0.3523232323232323, + "grad_norm": 0.5862610340118408, + "learning_rate": 3.385642226113298e-05, + "loss": 0.1822, + "mean_token_accuracy": 0.9239647507667541, + "num_tokens": 42773524.0, + "step": 2180 + }, + { + "entropy": 0.6293456293642521, + "epoch": 0.35393939393939394, + "grad_norm": 0.27295875549316406, + "learning_rate": 3.384584878503585e-05, + "loss": 0.2034, + "mean_token_accuracy": 0.9137175157666206, + "num_tokens": 42975451.0, + "step": 2190 + }, + { + "entropy": 0.6602464765310287, + "epoch": 0.35555555555555557, + "grad_norm": 0.38715091347694397, + "learning_rate": 3.383522831918644e-05, + "loss": 0.207, + "mean_token_accuracy": 0.9117765665054322, + "num_tokens": 43167229.0, + "step": 2200 + }, + { + "entropy": 0.6907435119152069, + "epoch": 0.3571717171717172, + "grad_norm": 0.7388333082199097, + "learning_rate": 3.3824560894115604e-05, + "loss": 0.1895, + "mean_token_accuracy": 0.9175972148776055, + "num_tokens": 43349083.0, + "step": 2210 + }, + { + "entropy": 0.641204608976841, + "epoch": 0.35878787878787877, + "grad_norm": 1.022162914276123, + "learning_rate": 3.3813846540489214e-05, + "loss": 0.2306, + "mean_token_accuracy": 0.9055288776755333, + "num_tokens": 43545849.0, + "step": 2220 + }, + { + "entropy": 0.6882462851703167, + "epoch": 0.3604040404040404, + "grad_norm": 0.6299819946289062, + "learning_rate": 3.3803085289108046e-05, + "loss": 0.1825, + "mean_token_accuracy": 0.9197423487901688, + "num_tokens": 43728913.0, + "step": 2230 + }, + { + "entropy": 0.66824596747756, + "epoch": 0.362020202020202, + "grad_norm": 0.5263814926147461, + "learning_rate": 3.3792277170907697e-05, + "loss": 0.1979, + "mean_token_accuracy": 0.9203566953539848, + "num_tokens": 43920222.0, + "step": 2240 + }, + { + "entropy": 0.6338320925831795, + "epoch": 0.36363636363636365, + "grad_norm": 0.5668702125549316, + "learning_rate": 3.378142221695848e-05, + "loss": 0.2046, + "mean_token_accuracy": 0.9108695581555366, + "num_tokens": 44120455.0, + "step": 2250 + }, + { + "entropy": 0.6575196944177151, + "epoch": 0.3652525252525253, + "grad_norm": 0.3751061260700226, + "learning_rate": 3.377052045846537e-05, + "loss": 0.2018, + "mean_token_accuracy": 0.913050290942192, + "num_tokens": 44314078.0, + "step": 2260 + }, + { + "entropy": 0.6516093999147415, + "epoch": 0.36686868686868684, + "grad_norm": 2.1195967197418213, + "learning_rate": 3.375957192676788e-05, + "loss": 0.1907, + "mean_token_accuracy": 0.9181628227233887, + "num_tokens": 44509018.0, + "step": 2270 + }, + { + "entropy": 0.6486793115735054, + "epoch": 0.36848484848484847, + "grad_norm": 0.49226877093315125, + "learning_rate": 3.374857665333997e-05, + "loss": 0.184, + "mean_token_accuracy": 0.9247466400265694, + "num_tokens": 44705376.0, + "step": 2280 + }, + { + "entropy": 0.6480357632040977, + "epoch": 0.3701010101010101, + "grad_norm": 0.4141157567501068, + "learning_rate": 3.373753466978999e-05, + "loss": 0.209, + "mean_token_accuracy": 0.9143023118376732, + "num_tokens": 44902479.0, + "step": 2290 + }, + { + "entropy": 0.6157522663474083, + "epoch": 0.3717171717171717, + "grad_norm": 0.41298553347587585, + "learning_rate": 3.3726446007860556e-05, + "loss": 0.199, + "mean_token_accuracy": 0.9163665294647216, + "num_tokens": 45110713.0, + "step": 2300 + }, + { + "entropy": 0.6643727004528046, + "epoch": 0.37333333333333335, + "grad_norm": 0.6356022357940674, + "learning_rate": 3.3715310699428484e-05, + "loss": 0.2009, + "mean_token_accuracy": 0.9170802712440491, + "num_tokens": 45300077.0, + "step": 2310 + }, + { + "entropy": 0.6490449465811252, + "epoch": 0.374949494949495, + "grad_norm": 0.4972356855869293, + "learning_rate": 3.370412877650467e-05, + "loss": 0.2056, + "mean_token_accuracy": 0.9179009929299354, + "num_tokens": 45494685.0, + "step": 2320 + }, + { + "entropy": 0.6338639289140702, + "epoch": 0.37656565656565655, + "grad_norm": 0.3716679811477661, + "learning_rate": 3.369290027123402e-05, + "loss": 0.1985, + "mean_token_accuracy": 0.9173924580216408, + "num_tokens": 45695194.0, + "step": 2330 + }, + { + "entropy": 0.6402531735599041, + "epoch": 0.3781818181818182, + "grad_norm": 3.9543232917785645, + "learning_rate": 3.368162521589536e-05, + "loss": 0.198, + "mean_token_accuracy": 0.9143711969256401, + "num_tokens": 45894902.0, + "step": 2340 + }, + { + "entropy": 0.6678831689059734, + "epoch": 0.3797979797979798, + "grad_norm": 0.3957677185535431, + "learning_rate": 3.367030364290132e-05, + "loss": 0.1814, + "mean_token_accuracy": 0.921611288189888, + "num_tokens": 46084398.0, + "step": 2350 + }, + { + "entropy": 0.6259508073329926, + "epoch": 0.3814141414141414, + "grad_norm": 1.1085342168807983, + "learning_rate": 3.3658935584798255e-05, + "loss": 0.2234, + "mean_token_accuracy": 0.9109225928783417, + "num_tokens": 46288730.0, + "step": 2360 + }, + { + "entropy": 0.645773047208786, + "epoch": 0.38303030303030305, + "grad_norm": 0.8026268482208252, + "learning_rate": 3.364752107426618e-05, + "loss": 0.1937, + "mean_token_accuracy": 0.9164649412035942, + "num_tokens": 46484836.0, + "step": 2370 + }, + { + "entropy": 0.6290055423974991, + "epoch": 0.3846464646464646, + "grad_norm": 0.4032319188117981, + "learning_rate": 3.363606014411861e-05, + "loss": 0.211, + "mean_token_accuracy": 0.9055109471082687, + "num_tokens": 46686427.0, + "step": 2380 + }, + { + "entropy": 0.633245699852705, + "epoch": 0.38626262626262625, + "grad_norm": 0.4805126488208771, + "learning_rate": 3.362455282730252e-05, + "loss": 0.2138, + "mean_token_accuracy": 0.9034733757376671, + "num_tokens": 46887489.0, + "step": 2390 + }, + { + "entropy": 0.6522969849407673, + "epoch": 0.3878787878787879, + "grad_norm": 1.1404281854629517, + "learning_rate": 3.361299915689824e-05, + "loss": 0.2043, + "mean_token_accuracy": 0.9089002668857574, + "num_tokens": 47083820.0, + "step": 2400 + }, + { + "epoch": 0.3878787878787879, + "eval_entropy": 0.7113876877129078, + "eval_loss": 0.18623696267604828, + "eval_mean_token_accuracy": 0.921599247455597, + "eval_num_tokens": 47083820.0, + "eval_runtime": 107.4465, + "eval_samples_per_second": 9.307, + "eval_steps_per_second": 9.307, + "step": 2400 + }, + { + "entropy": 0.6595964454114437, + "epoch": 0.3894949494949495, + "grad_norm": 1.8339920043945312, + "learning_rate": 3.360139916611934e-05, + "loss": 0.2007, + "mean_token_accuracy": 0.9141246557235718, + "num_tokens": 47277248.0, + "step": 2410 + }, + { + "entropy": 0.6597648195922374, + "epoch": 0.39111111111111113, + "grad_norm": 0.3694554269313812, + "learning_rate": 3.358975288831256e-05, + "loss": 0.1863, + "mean_token_accuracy": 0.9253092393279075, + "num_tokens": 47470531.0, + "step": 2420 + }, + { + "entropy": 0.6970138788223267, + "epoch": 0.3927272727272727, + "grad_norm": 0.5019353032112122, + "learning_rate": 3.35780603569577e-05, + "loss": 0.1963, + "mean_token_accuracy": 0.9200589671730995, + "num_tokens": 47652005.0, + "step": 2430 + }, + { + "entropy": 0.6489242658019065, + "epoch": 0.39434343434343433, + "grad_norm": 1.1797176599502563, + "learning_rate": 3.356632160566752e-05, + "loss": 0.2294, + "mean_token_accuracy": 0.9104439795017243, + "num_tokens": 47847890.0, + "step": 2440 + }, + { + "entropy": 0.6623602546751499, + "epoch": 0.39595959595959596, + "grad_norm": 0.25269877910614014, + "learning_rate": 3.355453666818765e-05, + "loss": 0.1866, + "mean_token_accuracy": 0.9208436816930771, + "num_tokens": 48044273.0, + "step": 2450 + }, + { + "entropy": 0.6277916461229325, + "epoch": 0.3975757575757576, + "grad_norm": 0.38579049706459045, + "learning_rate": 3.35427055783965e-05, + "loss": 0.2269, + "mean_token_accuracy": 0.9083513021469116, + "num_tokens": 48248754.0, + "step": 2460 + }, + { + "entropy": 0.6581358797848225, + "epoch": 0.3991919191919192, + "grad_norm": 2.619389533996582, + "learning_rate": 3.3530828370305157e-05, + "loss": 0.1934, + "mean_token_accuracy": 0.9179710894823074, + "num_tokens": 48440794.0, + "step": 2470 + }, + { + "entropy": 0.6409321166574955, + "epoch": 0.40080808080808084, + "grad_norm": 0.5129874348640442, + "learning_rate": 3.3518905078057266e-05, + "loss": 0.2087, + "mean_token_accuracy": 0.9138506144285202, + "num_tokens": 48636796.0, + "step": 2480 + }, + { + "entropy": 0.6473928295075894, + "epoch": 0.4024242424242424, + "grad_norm": 0.4147140085697174, + "learning_rate": 3.3506935735928976e-05, + "loss": 0.195, + "mean_token_accuracy": 0.9118334755301476, + "num_tokens": 48830484.0, + "step": 2490 + }, + { + "entropy": 0.6263864435255527, + "epoch": 0.40404040404040403, + "grad_norm": 0.4629824757575989, + "learning_rate": 3.349492037832879e-05, + "loss": 0.2279, + "mean_token_accuracy": 0.9032794684171677, + "num_tokens": 49032476.0, + "step": 2500 + }, + { + "entropy": 0.6438064053654671, + "epoch": 0.40565656565656566, + "grad_norm": 1.7570719718933105, + "learning_rate": 3.3482859039797516e-05, + "loss": 0.1957, + "mean_token_accuracy": 0.915298792719841, + "num_tokens": 49228395.0, + "step": 2510 + }, + { + "entropy": 0.6471666194498539, + "epoch": 0.4072727272727273, + "grad_norm": 0.5772570967674255, + "learning_rate": 3.3470751755008134e-05, + "loss": 0.2092, + "mean_token_accuracy": 0.9125421524047852, + "num_tokens": 49424437.0, + "step": 2520 + }, + { + "entropy": 0.6377446949481964, + "epoch": 0.4088888888888889, + "grad_norm": 0.8318358063697815, + "learning_rate": 3.345859855876571e-05, + "loss": 0.1897, + "mean_token_accuracy": 0.921054682135582, + "num_tokens": 49621544.0, + "step": 2530 + }, + { + "entropy": 0.641574353724718, + "epoch": 0.4105050505050505, + "grad_norm": 0.5469814538955688, + "learning_rate": 3.344639948600729e-05, + "loss": 0.1909, + "mean_token_accuracy": 0.9170561015605927, + "num_tokens": 49818401.0, + "step": 2540 + }, + { + "entropy": 0.6623684763908386, + "epoch": 0.4121212121212121, + "grad_norm": 1.5764508247375488, + "learning_rate": 3.34341545718018e-05, + "loss": 0.1956, + "mean_token_accuracy": 0.9195842504501343, + "num_tokens": 50011755.0, + "step": 2550 + }, + { + "entropy": 0.6670712187886239, + "epoch": 0.41373737373737374, + "grad_norm": 0.5596409440040588, + "learning_rate": 3.342186385134995e-05, + "loss": 0.2234, + "mean_token_accuracy": 0.9089864656329155, + "num_tokens": 50202582.0, + "step": 2560 + }, + { + "entropy": 0.6491748370230198, + "epoch": 0.41535353535353536, + "grad_norm": 0.8959507942199707, + "learning_rate": 3.340952735998413e-05, + "loss": 0.2093, + "mean_token_accuracy": 0.9129490926861763, + "num_tokens": 50398268.0, + "step": 2570 + }, + { + "entropy": 0.6661599427461624, + "epoch": 0.416969696969697, + "grad_norm": 0.45862722396850586, + "learning_rate": 3.339714513316831e-05, + "loss": 0.1963, + "mean_token_accuracy": 0.918558169901371, + "num_tokens": 50589126.0, + "step": 2580 + }, + { + "entropy": 0.6533463180065155, + "epoch": 0.41858585858585856, + "grad_norm": 0.2936107814311981, + "learning_rate": 3.338471720649795e-05, + "loss": 0.2197, + "mean_token_accuracy": 0.9082069784402848, + "num_tokens": 50786106.0, + "step": 2590 + }, + { + "entropy": 0.646408773958683, + "epoch": 0.4202020202020202, + "grad_norm": 0.9799319505691528, + "learning_rate": 3.337224361569984e-05, + "loss": 0.2195, + "mean_token_accuracy": 0.9127225950360298, + "num_tokens": 50985940.0, + "step": 2600 + }, + { + "entropy": 0.6364965550601482, + "epoch": 0.4218181818181818, + "grad_norm": 0.6174752712249756, + "learning_rate": 3.33597243966321e-05, + "loss": 0.2215, + "mean_token_accuracy": 0.9099404335021972, + "num_tokens": 51185481.0, + "step": 2610 + }, + { + "entropy": 0.6501048274338246, + "epoch": 0.42343434343434344, + "grad_norm": 0.3903461694717407, + "learning_rate": 3.334715958528397e-05, + "loss": 0.2021, + "mean_token_accuracy": 0.9131413713097573, + "num_tokens": 51381030.0, + "step": 2620 + }, + { + "entropy": 0.626953698694706, + "epoch": 0.42505050505050507, + "grad_norm": 0.45153191685676575, + "learning_rate": 3.3334549217775794e-05, + "loss": 0.2098, + "mean_token_accuracy": 0.9122917667031288, + "num_tokens": 51581647.0, + "step": 2630 + }, + { + "entropy": 0.6413272753357887, + "epoch": 0.4266666666666667, + "grad_norm": 0.7007961869239807, + "learning_rate": 3.332189333035883e-05, + "loss": 0.2002, + "mean_token_accuracy": 0.9182598188519477, + "num_tokens": 51777371.0, + "step": 2640 + }, + { + "entropy": 0.6396277464926243, + "epoch": 0.42828282828282827, + "grad_norm": 1.0117080211639404, + "learning_rate": 3.330919195941525e-05, + "loss": 0.2046, + "mean_token_accuracy": 0.9140521854162216, + "num_tokens": 51976736.0, + "step": 2650 + }, + { + "entropy": 0.6547877825796604, + "epoch": 0.4298989898989899, + "grad_norm": 1.506662130355835, + "learning_rate": 3.3296445141457936e-05, + "loss": 0.1826, + "mean_token_accuracy": 0.922282612323761, + "num_tokens": 52173296.0, + "step": 2660 + }, + { + "entropy": 0.6676691561937332, + "epoch": 0.4315151515151515, + "grad_norm": 0.3631212115287781, + "learning_rate": 3.328365291313044e-05, + "loss": 0.1981, + "mean_token_accuracy": 0.9073918789625168, + "num_tokens": 52363050.0, + "step": 2670 + }, + { + "entropy": 0.6451698906719685, + "epoch": 0.43313131313131314, + "grad_norm": 1.5293185710906982, + "learning_rate": 3.327081531120684e-05, + "loss": 0.2149, + "mean_token_accuracy": 0.9066063776612282, + "num_tokens": 52559410.0, + "step": 2680 + }, + { + "entropy": 0.6223693639039993, + "epoch": 0.43474747474747477, + "grad_norm": 0.3108581304550171, + "learning_rate": 3.325793237259165e-05, + "loss": 0.2094, + "mean_token_accuracy": 0.9071380957961083, + "num_tokens": 52763857.0, + "step": 2690 + }, + { + "entropy": 0.6278824508190155, + "epoch": 0.43636363636363634, + "grad_norm": 0.30205193161964417, + "learning_rate": 3.324500413431974e-05, + "loss": 0.2072, + "mean_token_accuracy": 0.9219203218817711, + "num_tokens": 52965075.0, + "step": 2700 + }, + { + "epoch": 0.43636363636363634, + "eval_entropy": 0.7081856344938278, + "eval_loss": 0.18622006475925446, + "eval_mean_token_accuracy": 0.920462992310524, + "eval_num_tokens": 52965075.0, + "eval_runtime": 108.4723, + "eval_samples_per_second": 9.219, + "eval_steps_per_second": 9.219, + "step": 2700 + }, + { + "entropy": 0.6290528163313865, + "epoch": 0.43797979797979797, + "grad_norm": 0.9835038781166077, + "learning_rate": 3.323203063355618e-05, + "loss": 0.22, + "mean_token_accuracy": 0.9061377301812172, + "num_tokens": 53167823.0, + "step": 2710 + }, + { + "entropy": 0.6957326963543892, + "epoch": 0.4395959595959596, + "grad_norm": 3.408257484436035, + "learning_rate": 3.3219011907596154e-05, + "loss": 0.201, + "mean_token_accuracy": 0.9151417210698127, + "num_tokens": 53351450.0, + "step": 2720 + }, + { + "entropy": 0.6469338901340962, + "epoch": 0.4412121212121212, + "grad_norm": 1.0536452531814575, + "learning_rate": 3.3205947993864884e-05, + "loss": 0.214, + "mean_token_accuracy": 0.914907206594944, + "num_tokens": 53549942.0, + "step": 2730 + }, + { + "entropy": 0.6235164746642112, + "epoch": 0.44282828282828285, + "grad_norm": 0.4027160406112671, + "learning_rate": 3.3192838929917455e-05, + "loss": 0.2132, + "mean_token_accuracy": 0.917492838203907, + "num_tokens": 53752441.0, + "step": 2740 + }, + { + "entropy": 0.6557296566665173, + "epoch": 0.4444444444444444, + "grad_norm": 0.6262427568435669, + "learning_rate": 3.317968475343877e-05, + "loss": 0.2142, + "mean_token_accuracy": 0.9105125978589058, + "num_tokens": 53946273.0, + "step": 2750 + }, + { + "entropy": 0.6702249869704247, + "epoch": 0.44606060606060605, + "grad_norm": 0.38556161522865295, + "learning_rate": 3.316648550224342e-05, + "loss": 0.1995, + "mean_token_accuracy": 0.9102703362703324, + "num_tokens": 54137777.0, + "step": 2760 + }, + { + "entropy": 0.657077157497406, + "epoch": 0.4476767676767677, + "grad_norm": 1.3603203296661377, + "learning_rate": 3.315324121427557e-05, + "loss": 0.2007, + "mean_token_accuracy": 0.9166198000311852, + "num_tokens": 54332311.0, + "step": 2770 + }, + { + "entropy": 0.6409480340778828, + "epoch": 0.4492929292929293, + "grad_norm": 0.40199100971221924, + "learning_rate": 3.3139951927608844e-05, + "loss": 0.216, + "mean_token_accuracy": 0.9097805380821228, + "num_tokens": 54534578.0, + "step": 2780 + }, + { + "entropy": 0.6437219373881817, + "epoch": 0.4509090909090909, + "grad_norm": 0.6849948167800903, + "learning_rate": 3.312661768044624e-05, + "loss": 0.2002, + "mean_token_accuracy": 0.913942052423954, + "num_tokens": 54733184.0, + "step": 2790 + }, + { + "entropy": 0.6447813183069229, + "epoch": 0.45252525252525255, + "grad_norm": 0.3659138083457947, + "learning_rate": 3.3113238511119986e-05, + "loss": 0.215, + "mean_token_accuracy": 0.9110051110386849, + "num_tokens": 54931000.0, + "step": 2800 + }, + { + "entropy": 0.6239227756857872, + "epoch": 0.4541414141414141, + "grad_norm": 0.4973262548446655, + "learning_rate": 3.3099814458091474e-05, + "loss": 0.2082, + "mean_token_accuracy": 0.9057865276932716, + "num_tokens": 55133960.0, + "step": 2810 + }, + { + "entropy": 0.6393618829548359, + "epoch": 0.45575757575757575, + "grad_norm": 0.40064364671707153, + "learning_rate": 3.308634555995109e-05, + "loss": 0.2157, + "mean_token_accuracy": 0.9042444139719009, + "num_tokens": 55331051.0, + "step": 2820 + }, + { + "entropy": 0.6447009883821011, + "epoch": 0.4573737373737374, + "grad_norm": 0.3533497452735901, + "learning_rate": 3.307283185541817e-05, + "loss": 0.2058, + "mean_token_accuracy": 0.9148957833647728, + "num_tokens": 55529610.0, + "step": 2830 + }, + { + "entropy": 0.6316937655210495, + "epoch": 0.458989898989899, + "grad_norm": 0.23120789229869843, + "learning_rate": 3.305927338334084e-05, + "loss": 0.2047, + "mean_token_accuracy": 0.9060420542955399, + "num_tokens": 55730141.0, + "step": 2840 + }, + { + "entropy": 0.6504737250506878, + "epoch": 0.46060606060606063, + "grad_norm": 0.4959610402584076, + "learning_rate": 3.3045670182695905e-05, + "loss": 0.2039, + "mean_token_accuracy": 0.9110078886151314, + "num_tokens": 55927592.0, + "step": 2850 + }, + { + "entropy": 0.6488144524395466, + "epoch": 0.4622222222222222, + "grad_norm": 0.8291785717010498, + "learning_rate": 3.30320222925888e-05, + "loss": 0.2039, + "mean_token_accuracy": 0.9138716921210289, + "num_tokens": 56124051.0, + "step": 2860 + }, + { + "entropy": 0.6518881164491177, + "epoch": 0.4638383838383838, + "grad_norm": 0.3788766860961914, + "learning_rate": 3.301832975225338e-05, + "loss": 0.18, + "mean_token_accuracy": 0.9274351745843887, + "num_tokens": 56319141.0, + "step": 2870 + }, + { + "entropy": 0.655502051115036, + "epoch": 0.46545454545454545, + "grad_norm": 0.3806280493736267, + "learning_rate": 3.300459260105188e-05, + "loss": 0.1978, + "mean_token_accuracy": 0.9226665228605271, + "num_tokens": 56510794.0, + "step": 2880 + }, + { + "entropy": 0.6256422609090805, + "epoch": 0.4670707070707071, + "grad_norm": 0.2849152684211731, + "learning_rate": 3.2990810878474766e-05, + "loss": 0.1973, + "mean_token_accuracy": 0.9150432854890823, + "num_tokens": 56713261.0, + "step": 2890 + }, + { + "entropy": 0.6748034112155438, + "epoch": 0.4686868686868687, + "grad_norm": 0.6893365979194641, + "learning_rate": 3.297698462414066e-05, + "loss": 0.189, + "mean_token_accuracy": 0.914380231499672, + "num_tokens": 56901633.0, + "step": 2900 + }, + { + "entropy": 0.6465104728937149, + "epoch": 0.4703030303030303, + "grad_norm": 0.6201600432395935, + "learning_rate": 3.296311387779618e-05, + "loss": 0.2043, + "mean_token_accuracy": 0.9120270356535911, + "num_tokens": 57096947.0, + "step": 2910 + }, + { + "entropy": 0.6560635149478913, + "epoch": 0.4719191919191919, + "grad_norm": 0.4038831293582916, + "learning_rate": 3.294919867931584e-05, + "loss": 0.1841, + "mean_token_accuracy": 0.9193960189819336, + "num_tokens": 57291443.0, + "step": 2920 + }, + { + "entropy": 0.6498475447297096, + "epoch": 0.47353535353535353, + "grad_norm": 0.3524320423603058, + "learning_rate": 3.293523906870196e-05, + "loss": 0.1982, + "mean_token_accuracy": 0.9188156425952911, + "num_tokens": 57486365.0, + "step": 2930 + }, + { + "entropy": 0.6489122800529004, + "epoch": 0.47515151515151516, + "grad_norm": 0.34468188881874084, + "learning_rate": 3.292123508608451e-05, + "loss": 0.2095, + "mean_token_accuracy": 0.9128650277853012, + "num_tokens": 57680721.0, + "step": 2940 + }, + { + "entropy": 0.6383394666016102, + "epoch": 0.4767676767676768, + "grad_norm": 0.6034473180770874, + "learning_rate": 3.290718677172105e-05, + "loss": 0.2052, + "mean_token_accuracy": 0.9086369514465332, + "num_tokens": 57879619.0, + "step": 2950 + }, + { + "entropy": 0.6529298670589924, + "epoch": 0.4783838383838384, + "grad_norm": 0.4649339020252228, + "learning_rate": 3.289309416599655e-05, + "loss": 0.1917, + "mean_token_accuracy": 0.9165951952338218, + "num_tokens": 58074943.0, + "step": 2960 + }, + { + "entropy": 0.6440173640847207, + "epoch": 0.48, + "grad_norm": 0.3674682080745697, + "learning_rate": 3.287895730942333e-05, + "loss": 0.1879, + "mean_token_accuracy": 0.9216477259993553, + "num_tokens": 58269804.0, + "step": 2970 + }, + { + "entropy": 0.6634586147964001, + "epoch": 0.4816161616161616, + "grad_norm": 0.9639015793800354, + "learning_rate": 3.286477624264089e-05, + "loss": 0.1978, + "mean_token_accuracy": 0.9219118982553482, + "num_tokens": 58461635.0, + "step": 2980 + }, + { + "entropy": 0.6251708209514618, + "epoch": 0.48323232323232324, + "grad_norm": 0.3933483064174652, + "learning_rate": 3.285055100641584e-05, + "loss": 0.2082, + "mean_token_accuracy": 0.9080576360225677, + "num_tokens": 58662756.0, + "step": 2990 + }, + { + "entropy": 0.6697501085698605, + "epoch": 0.48484848484848486, + "grad_norm": 0.8007460236549377, + "learning_rate": 3.283628164164178e-05, + "loss": 0.201, + "mean_token_accuracy": 0.9163200870156288, + "num_tokens": 58850692.0, + "step": 3000 + }, + { + "epoch": 0.48484848484848486, + "eval_entropy": 0.7009831846058369, + "eval_loss": 0.18165792524814606, + "eval_mean_token_accuracy": 0.92101721316576, + "eval_num_tokens": 58850692.0, + "eval_runtime": 108.1573, + "eval_samples_per_second": 9.246, + "eval_steps_per_second": 9.246, + "step": 3000 + }, + { + "entropy": 0.6666446149349212, + "epoch": 0.4864646464646465, + "grad_norm": 0.9564417004585266, + "learning_rate": 3.282196818933913e-05, + "loss": 0.2069, + "mean_token_accuracy": 0.9102333173155784, + "num_tokens": 59039812.0, + "step": 3010 + }, + { + "entropy": 0.6625177763402462, + "epoch": 0.48808080808080806, + "grad_norm": 0.7849810123443604, + "learning_rate": 3.280761069065508e-05, + "loss": 0.1879, + "mean_token_accuracy": 0.9257912486791611, + "num_tokens": 59230478.0, + "step": 3020 + }, + { + "entropy": 0.6120976060628891, + "epoch": 0.4896969696969697, + "grad_norm": 0.3264128863811493, + "learning_rate": 3.279320918686344e-05, + "loss": 0.2151, + "mean_token_accuracy": 0.9100072458386421, + "num_tokens": 59437566.0, + "step": 3030 + }, + { + "entropy": 0.646418509632349, + "epoch": 0.4913131313131313, + "grad_norm": 0.5998213291168213, + "learning_rate": 3.2778763719364486e-05, + "loss": 0.2037, + "mean_token_accuracy": 0.9138255193829536, + "num_tokens": 59632261.0, + "step": 3040 + }, + { + "entropy": 0.6323723264038563, + "epoch": 0.49292929292929294, + "grad_norm": 0.2643972337245941, + "learning_rate": 3.276427432968493e-05, + "loss": 0.1898, + "mean_token_accuracy": 0.9208777904510498, + "num_tokens": 59834305.0, + "step": 3050 + }, + { + "entropy": 0.6356162540614605, + "epoch": 0.49454545454545457, + "grad_norm": 0.3861073851585388, + "learning_rate": 3.274974105947772e-05, + "loss": 0.1968, + "mean_token_accuracy": 0.9171579584479332, + "num_tokens": 60034048.0, + "step": 3060 + }, + { + "entropy": 0.6675376549363137, + "epoch": 0.49616161616161614, + "grad_norm": 0.594031572341919, + "learning_rate": 3.273516395052193e-05, + "loss": 0.1956, + "mean_token_accuracy": 0.9162991613149643, + "num_tokens": 60225137.0, + "step": 3070 + }, + { + "entropy": 0.6268281377851963, + "epoch": 0.49777777777777776, + "grad_norm": 0.4130667746067047, + "learning_rate": 3.2720543044722707e-05, + "loss": 0.2067, + "mean_token_accuracy": 0.911859753727913, + "num_tokens": 60427325.0, + "step": 3080 + }, + { + "entropy": 0.6532427452504634, + "epoch": 0.4993939393939394, + "grad_norm": 0.6098483800888062, + "learning_rate": 3.270587838411106e-05, + "loss": 0.2006, + "mean_token_accuracy": 0.9138939917087555, + "num_tokens": 60624604.0, + "step": 3090 + }, + { + "entropy": 0.6927797332406044, + "epoch": 0.501010101010101, + "grad_norm": 0.45501190423965454, + "learning_rate": 3.2691170010843785e-05, + "loss": 0.19, + "mean_token_accuracy": 0.922791239619255, + "num_tokens": 60807502.0, + "step": 3100 + }, + { + "entropy": 0.6616372317075729, + "epoch": 0.5026262626262626, + "grad_norm": 1.7559648752212524, + "learning_rate": 3.2676417967203366e-05, + "loss": 0.1995, + "mean_token_accuracy": 0.9174562796950341, + "num_tokens": 61002222.0, + "step": 3110 + }, + { + "entropy": 0.6428518198430538, + "epoch": 0.5042424242424243, + "grad_norm": 0.8280075788497925, + "learning_rate": 3.266162229559781e-05, + "loss": 0.1933, + "mean_token_accuracy": 0.9215355843305588, + "num_tokens": 61203301.0, + "step": 3120 + }, + { + "entropy": 0.6523866213858127, + "epoch": 0.5058585858585859, + "grad_norm": 0.4117162823677063, + "learning_rate": 3.2646783038560525e-05, + "loss": 0.2059, + "mean_token_accuracy": 0.9134928032755851, + "num_tokens": 61400432.0, + "step": 3130 + }, + { + "entropy": 0.6734721213579178, + "epoch": 0.5074747474747475, + "grad_norm": 0.746486485004425, + "learning_rate": 3.263190023875025e-05, + "loss": 0.1826, + "mean_token_accuracy": 0.9212172508239747, + "num_tokens": 61589776.0, + "step": 3140 + }, + { + "entropy": 0.6552940599620343, + "epoch": 0.509090909090909, + "grad_norm": 0.31145575642585754, + "learning_rate": 3.261697393895088e-05, + "loss": 0.1917, + "mean_token_accuracy": 0.9138196483254433, + "num_tokens": 61785756.0, + "step": 3150 + }, + { + "entropy": 0.6406723141670227, + "epoch": 0.5107070707070707, + "grad_norm": 0.9990244507789612, + "learning_rate": 3.2602004182071346e-05, + "loss": 0.1972, + "mean_token_accuracy": 0.9117845878005028, + "num_tokens": 61983863.0, + "step": 3160 + }, + { + "entropy": 0.6623590111732482, + "epoch": 0.5123232323232323, + "grad_norm": 0.48632344603538513, + "learning_rate": 3.2586991011145524e-05, + "loss": 0.1902, + "mean_token_accuracy": 0.9184399858117104, + "num_tokens": 62175690.0, + "step": 3170 + }, + { + "entropy": 0.6356962986290455, + "epoch": 0.5139393939393939, + "grad_norm": 5.5877909660339355, + "learning_rate": 3.25719344693321e-05, + "loss": 0.2244, + "mean_token_accuracy": 0.8993245542049408, + "num_tokens": 62376577.0, + "step": 3180 + }, + { + "entropy": 0.6909607768058776, + "epoch": 0.5155555555555555, + "grad_norm": 0.42147713899612427, + "learning_rate": 3.2556834599914425e-05, + "loss": 0.1747, + "mean_token_accuracy": 0.9327822804450989, + "num_tokens": 62558544.0, + "step": 3190 + }, + { + "entropy": 0.6543774232268333, + "epoch": 0.5171717171717172, + "grad_norm": 2.5656116008758545, + "learning_rate": 3.25416914463004e-05, + "loss": 0.1921, + "mean_token_accuracy": 0.9150905042886734, + "num_tokens": 62750996.0, + "step": 3200 + }, + { + "entropy": 0.6566679209470749, + "epoch": 0.5187878787878788, + "grad_norm": 0.42310506105422974, + "learning_rate": 3.252650505202238e-05, + "loss": 0.1969, + "mean_token_accuracy": 0.9137048035860061, + "num_tokens": 62944360.0, + "step": 3210 + }, + { + "entropy": 0.6445956945419311, + "epoch": 0.5204040404040404, + "grad_norm": 1.61677086353302, + "learning_rate": 3.251127546073699e-05, + "loss": 0.191, + "mean_token_accuracy": 0.928897674381733, + "num_tokens": 63140963.0, + "step": 3220 + }, + { + "entropy": 0.668046285957098, + "epoch": 0.522020202020202, + "grad_norm": 0.38541457056999207, + "learning_rate": 3.249600271622507e-05, + "loss": 0.1923, + "mean_token_accuracy": 0.9224132910370827, + "num_tokens": 63332657.0, + "step": 3230 + }, + { + "entropy": 0.6093958213925361, + "epoch": 0.5236363636363637, + "grad_norm": 0.49875500798225403, + "learning_rate": 3.248068686239149e-05, + "loss": 0.223, + "mean_token_accuracy": 0.9000680401921273, + "num_tokens": 63541910.0, + "step": 3240 + }, + { + "entropy": 0.6547015912830829, + "epoch": 0.5252525252525253, + "grad_norm": 0.3443615436553955, + "learning_rate": 3.246532794326505e-05, + "loss": 0.2227, + "mean_token_accuracy": 0.9069190487265587, + "num_tokens": 63736206.0, + "step": 3250 + }, + { + "entropy": 0.6406685844063759, + "epoch": 0.5268686868686868, + "grad_norm": 0.3350508213043213, + "learning_rate": 3.244992600299836e-05, + "loss": 0.1973, + "mean_token_accuracy": 0.9136427164077758, + "num_tokens": 63935477.0, + "step": 3260 + }, + { + "entropy": 0.6515379846096039, + "epoch": 0.5284848484848484, + "grad_norm": 0.5605429410934448, + "learning_rate": 3.2434481085867705e-05, + "loss": 0.2061, + "mean_token_accuracy": 0.9114924788475036, + "num_tokens": 64129764.0, + "step": 3270 + }, + { + "entropy": 0.6315380461513996, + "epoch": 0.5301010101010101, + "grad_norm": 0.3819006681442261, + "learning_rate": 3.24189932362729e-05, + "loss": 0.2169, + "mean_token_accuracy": 0.8984061792492867, + "num_tokens": 64330659.0, + "step": 3280 + }, + { + "entropy": 0.6346470937132835, + "epoch": 0.5317171717171717, + "grad_norm": 0.4472454786300659, + "learning_rate": 3.240346249873719e-05, + "loss": 0.2215, + "mean_token_accuracy": 0.9077972799539566, + "num_tokens": 64530704.0, + "step": 3290 + }, + { + "entropy": 0.6334054812788963, + "epoch": 0.5333333333333333, + "grad_norm": 0.47427433729171753, + "learning_rate": 3.238788891790712e-05, + "loss": 0.2156, + "mean_token_accuracy": 0.9178689196705818, + "num_tokens": 64730951.0, + "step": 3300 + }, + { + "epoch": 0.5333333333333333, + "eval_entropy": 0.708876410573721, + "eval_loss": 0.18830136954784393, + "eval_mean_token_accuracy": 0.9212830139994621, + "eval_num_tokens": 64730951.0, + "eval_runtime": 107.508, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 9.302, + "step": 3300 + }, + { + "entropy": 0.6263887412846089, + "epoch": 0.534949494949495, + "grad_norm": 0.4745093286037445, + "learning_rate": 3.2372272538552376e-05, + "loss": 0.2178, + "mean_token_accuracy": 0.908173656463623, + "num_tokens": 64932897.0, + "step": 3310 + }, + { + "entropy": 0.6270258404314518, + "epoch": 0.5365656565656566, + "grad_norm": 0.25539788603782654, + "learning_rate": 3.235661340556569e-05, + "loss": 0.1932, + "mean_token_accuracy": 0.9180809631943703, + "num_tokens": 65135403.0, + "step": 3320 + }, + { + "entropy": 0.6297943659126759, + "epoch": 0.5381818181818182, + "grad_norm": 0.24494622647762299, + "learning_rate": 3.2340911563962706e-05, + "loss": 0.2108, + "mean_token_accuracy": 0.9115460008382797, + "num_tokens": 65336219.0, + "step": 3330 + }, + { + "entropy": 0.6433823242783546, + "epoch": 0.5397979797979798, + "grad_norm": 0.7900083065032959, + "learning_rate": 3.232516705888183e-05, + "loss": 0.2157, + "mean_token_accuracy": 0.9078005447983741, + "num_tokens": 65533545.0, + "step": 3340 + }, + { + "entropy": 0.6677606172859669, + "epoch": 0.5414141414141415, + "grad_norm": 0.8443840146064758, + "learning_rate": 3.2309379935584125e-05, + "loss": 0.2102, + "mean_token_accuracy": 0.9176926985383034, + "num_tokens": 65726131.0, + "step": 3350 + }, + { + "entropy": 0.6547842122614383, + "epoch": 0.5430303030303031, + "grad_norm": 0.4855356812477112, + "learning_rate": 3.229355023945315e-05, + "loss": 0.1831, + "mean_token_accuracy": 0.9250451758503914, + "num_tokens": 65918536.0, + "step": 3360 + }, + { + "entropy": 0.642189259827137, + "epoch": 0.5446464646464646, + "grad_norm": 0.32615113258361816, + "learning_rate": 3.2277678015994886e-05, + "loss": 0.1925, + "mean_token_accuracy": 0.9199791148304939, + "num_tokens": 66115421.0, + "step": 3370 + }, + { + "entropy": 0.6465832196176052, + "epoch": 0.5462626262626262, + "grad_norm": 0.28738975524902344, + "learning_rate": 3.226176331083752e-05, + "loss": 0.1837, + "mean_token_accuracy": 0.9207375660538674, + "num_tokens": 66311951.0, + "step": 3380 + }, + { + "entropy": 0.6554206393659114, + "epoch": 0.5478787878787879, + "grad_norm": 0.4349115192890167, + "learning_rate": 3.2245806169731395e-05, + "loss": 0.1896, + "mean_token_accuracy": 0.9180210024118424, + "num_tokens": 66504523.0, + "step": 3390 + }, + { + "entropy": 0.6807047441601753, + "epoch": 0.5494949494949495, + "grad_norm": 0.38425713777542114, + "learning_rate": 3.222980663854884e-05, + "loss": 0.1829, + "mean_token_accuracy": 0.9236632093787194, + "num_tokens": 66689376.0, + "step": 3400 + }, + { + "entropy": 0.6507133312523365, + "epoch": 0.5511111111111111, + "grad_norm": 0.4921931028366089, + "learning_rate": 3.221376476328404e-05, + "loss": 0.2082, + "mean_token_accuracy": 0.9143338233232499, + "num_tokens": 66884196.0, + "step": 3410 + }, + { + "entropy": 0.6452019922435284, + "epoch": 0.5527272727272727, + "grad_norm": 0.361701637506485, + "learning_rate": 3.219768059005291e-05, + "loss": 0.2227, + "mean_token_accuracy": 0.9031536251306533, + "num_tokens": 67084098.0, + "step": 3420 + }, + { + "entropy": 0.6425276100635529, + "epoch": 0.5543434343434344, + "grad_norm": 0.23792065680027008, + "learning_rate": 3.218155416509296e-05, + "loss": 0.2079, + "mean_token_accuracy": 0.900030605494976, + "num_tokens": 67280863.0, + "step": 3430 + }, + { + "entropy": 0.6363083481788635, + "epoch": 0.555959595959596, + "grad_norm": 0.7842856645584106, + "learning_rate": 3.216538553476315e-05, + "loss": 0.2109, + "mean_token_accuracy": 0.9114381566643714, + "num_tokens": 67478386.0, + "step": 3440 + }, + { + "entropy": 0.648232850432396, + "epoch": 0.5575757575757576, + "grad_norm": 0.38298118114471436, + "learning_rate": 3.214917474554378e-05, + "loss": 0.2047, + "mean_token_accuracy": 0.9171234712004661, + "num_tokens": 67676237.0, + "step": 3450 + }, + { + "entropy": 0.6573429599404335, + "epoch": 0.5591919191919192, + "grad_norm": 0.3366295099258423, + "learning_rate": 3.213292184403636e-05, + "loss": 0.1797, + "mean_token_accuracy": 0.9248432412743568, + "num_tokens": 67866540.0, + "step": 3460 + }, + { + "entropy": 0.6503325551748276, + "epoch": 0.5608080808080808, + "grad_norm": 0.27914008498191833, + "learning_rate": 3.211662687696343e-05, + "loss": 0.198, + "mean_token_accuracy": 0.9210797488689423, + "num_tokens": 68062045.0, + "step": 3470 + }, + { + "entropy": 0.6173742033541203, + "epoch": 0.5624242424242424, + "grad_norm": 1.0394847393035889, + "learning_rate": 3.210028989116848e-05, + "loss": 0.2158, + "mean_token_accuracy": 0.9063221380114556, + "num_tokens": 68267313.0, + "step": 3480 + }, + { + "entropy": 0.6302667535841465, + "epoch": 0.564040404040404, + "grad_norm": 0.5718628764152527, + "learning_rate": 3.208391093361577e-05, + "loss": 0.1897, + "mean_token_accuracy": 0.9212815120816231, + "num_tokens": 68467768.0, + "step": 3490 + }, + { + "entropy": 0.6221346028149128, + "epoch": 0.5656565656565656, + "grad_norm": 0.3520769774913788, + "learning_rate": 3.206749005139024e-05, + "loss": 0.2248, + "mean_token_accuracy": 0.9028334498405457, + "num_tokens": 68671058.0, + "step": 3500 + }, + { + "entropy": 0.620377691090107, + "epoch": 0.5672727272727273, + "grad_norm": 0.6622844338417053, + "learning_rate": 3.205102729169734e-05, + "loss": 0.2338, + "mean_token_accuracy": 0.902023434638977, + "num_tokens": 68874421.0, + "step": 3510 + }, + { + "entropy": 0.6653469420969487, + "epoch": 0.5688888888888889, + "grad_norm": 0.895131528377533, + "learning_rate": 3.203452270186292e-05, + "loss": 0.1764, + "mean_token_accuracy": 0.9239086017012597, + "num_tokens": 69064415.0, + "step": 3520 + }, + { + "entropy": 0.660456845164299, + "epoch": 0.5705050505050505, + "grad_norm": 3.0434823036193848, + "learning_rate": 3.201797632933305e-05, + "loss": 0.1984, + "mean_token_accuracy": 0.9147211536765099, + "num_tokens": 69254291.0, + "step": 3530 + }, + { + "entropy": 0.6673716694116593, + "epoch": 0.5721212121212121, + "grad_norm": 1.063805341720581, + "learning_rate": 3.2001388221673945e-05, + "loss": 0.2082, + "mean_token_accuracy": 0.9198906376957894, + "num_tokens": 69442287.0, + "step": 3540 + }, + { + "entropy": 0.6692168071866036, + "epoch": 0.5737373737373738, + "grad_norm": 0.4066961705684662, + "learning_rate": 3.198475842657178e-05, + "loss": 0.2128, + "mean_token_accuracy": 0.9147928357124329, + "num_tokens": 69631036.0, + "step": 3550 + }, + { + "entropy": 0.6522227242588997, + "epoch": 0.5753535353535354, + "grad_norm": 0.3503301739692688, + "learning_rate": 3.196808699183258e-05, + "loss": 0.2078, + "mean_token_accuracy": 0.9150220856070519, + "num_tokens": 69824448.0, + "step": 3560 + }, + { + "entropy": 0.6434777349233627, + "epoch": 0.576969696969697, + "grad_norm": 1.1538275480270386, + "learning_rate": 3.195137396538205e-05, + "loss": 0.1943, + "mean_token_accuracy": 0.9231508269906044, + "num_tokens": 70021561.0, + "step": 3570 + }, + { + "entropy": 0.6639960631728172, + "epoch": 0.5785858585858585, + "grad_norm": 2.492892265319824, + "learning_rate": 3.193461939526549e-05, + "loss": 0.1798, + "mean_token_accuracy": 0.9285740494728089, + "num_tokens": 70212443.0, + "step": 3580 + }, + { + "entropy": 0.6517272099852562, + "epoch": 0.5802020202020202, + "grad_norm": 0.6861459612846375, + "learning_rate": 3.191782332964761e-05, + "loss": 0.2182, + "mean_token_accuracy": 0.898492357134819, + "num_tokens": 70409374.0, + "step": 3590 + }, + { + "entropy": 0.609511935710907, + "epoch": 0.5818181818181818, + "grad_norm": 0.28647559881210327, + "learning_rate": 3.19009858168124e-05, + "loss": 0.1919, + "mean_token_accuracy": 0.9192790776491165, + "num_tokens": 70618309.0, + "step": 3600 + }, + { + "epoch": 0.5818181818181818, + "eval_entropy": 0.7097669740915299, + "eval_loss": 0.18217602372169495, + "eval_mean_token_accuracy": 0.9196950270533562, + "eval_num_tokens": 70618309.0, + "eval_runtime": 107.3842, + "eval_samples_per_second": 9.312, + "eval_steps_per_second": 9.312, + "step": 3600 + }, + { + "entropy": 0.6385140128433704, + "epoch": 0.5834343434343434, + "grad_norm": 0.32677140831947327, + "learning_rate": 3.188410690516302e-05, + "loss": 0.2138, + "mean_token_accuracy": 0.9102208271622658, + "num_tokens": 70817872.0, + "step": 3610 + }, + { + "entropy": 0.6724786482751369, + "epoch": 0.585050505050505, + "grad_norm": 1.1922396421432495, + "learning_rate": 3.186718664322163e-05, + "loss": 0.19, + "mean_token_accuracy": 0.9211466312408447, + "num_tokens": 71006020.0, + "step": 3620 + }, + { + "entropy": 0.6604351043701172, + "epoch": 0.5866666666666667, + "grad_norm": 1.7478338479995728, + "learning_rate": 3.185022507962925e-05, + "loss": 0.2117, + "mean_token_accuracy": 0.9231618851423263, + "num_tokens": 71201778.0, + "step": 3630 + }, + { + "entropy": 0.6615160658955574, + "epoch": 0.5882828282828283, + "grad_norm": 0.9100177884101868, + "learning_rate": 3.183322226314565e-05, + "loss": 0.1905, + "mean_token_accuracy": 0.9153360441327095, + "num_tokens": 71394174.0, + "step": 3640 + }, + { + "entropy": 0.6542910009622573, + "epoch": 0.5898989898989899, + "grad_norm": 0.5457777976989746, + "learning_rate": 3.181617824264917e-05, + "loss": 0.2087, + "mean_token_accuracy": 0.909763066470623, + "num_tokens": 71589313.0, + "step": 3650 + }, + { + "entropy": 0.6248301893472672, + "epoch": 0.5915151515151515, + "grad_norm": 1.1501109600067139, + "learning_rate": 3.179909306713663e-05, + "loss": 0.221, + "mean_token_accuracy": 0.910381156206131, + "num_tokens": 71795222.0, + "step": 3660 + }, + { + "entropy": 0.6577152475714684, + "epoch": 0.5931313131313132, + "grad_norm": 0.9693170785903931, + "learning_rate": 3.178196678572312e-05, + "loss": 0.2076, + "mean_token_accuracy": 0.9101708441972732, + "num_tokens": 71987627.0, + "step": 3670 + }, + { + "entropy": 0.6558754108846188, + "epoch": 0.5947474747474748, + "grad_norm": 0.38402456045150757, + "learning_rate": 3.176479944764193e-05, + "loss": 0.1928, + "mean_token_accuracy": 0.9119617655873299, + "num_tokens": 72182126.0, + "step": 3680 + }, + { + "entropy": 0.6468130461871624, + "epoch": 0.5963636363636363, + "grad_norm": 0.5037944316864014, + "learning_rate": 3.174759110224436e-05, + "loss": 0.193, + "mean_token_accuracy": 0.9178726747632027, + "num_tokens": 72381065.0, + "step": 3690 + }, + { + "entropy": 0.6679630935192108, + "epoch": 0.597979797979798, + "grad_norm": 0.9890159964561462, + "learning_rate": 3.17303417989996e-05, + "loss": 0.1912, + "mean_token_accuracy": 0.9213204249739647, + "num_tokens": 72571262.0, + "step": 3700 + }, + { + "entropy": 0.6567414380609989, + "epoch": 0.5995959595959596, + "grad_norm": 1.5467326641082764, + "learning_rate": 3.1713051587494584e-05, + "loss": 0.1799, + "mean_token_accuracy": 0.926344147324562, + "num_tokens": 72767159.0, + "step": 3710 + }, + { + "entropy": 0.6576608344912529, + "epoch": 0.6012121212121212, + "grad_norm": 0.5661543011665344, + "learning_rate": 3.1695720517433844e-05, + "loss": 0.1887, + "mean_token_accuracy": 0.9229953482747077, + "num_tokens": 72964546.0, + "step": 3720 + }, + { + "entropy": 0.6437795154750348, + "epoch": 0.6028282828282828, + "grad_norm": 0.4989408850669861, + "learning_rate": 3.1678348638639365e-05, + "loss": 0.193, + "mean_token_accuracy": 0.9184434622526169, + "num_tokens": 73161389.0, + "step": 3730 + }, + { + "entropy": 0.647753156721592, + "epoch": 0.6044444444444445, + "grad_norm": 1.8060365915298462, + "learning_rate": 3.166093600105045e-05, + "loss": 0.1904, + "mean_token_accuracy": 0.9116377130150795, + "num_tokens": 73358408.0, + "step": 3740 + }, + { + "entropy": 0.6552592545747757, + "epoch": 0.6060606060606061, + "grad_norm": 0.34557202458381653, + "learning_rate": 3.164348265472357e-05, + "loss": 0.1988, + "mean_token_accuracy": 0.9147962421178818, + "num_tokens": 73553460.0, + "step": 3750 + }, + { + "entropy": 0.6839084707200527, + "epoch": 0.6076767676767677, + "grad_norm": 0.668044924736023, + "learning_rate": 3.1625988649832224e-05, + "loss": 0.1848, + "mean_token_accuracy": 0.9240136578679085, + "num_tokens": 73740151.0, + "step": 3760 + }, + { + "entropy": 0.6638677589595318, + "epoch": 0.6092929292929293, + "grad_norm": 0.3268148601055145, + "learning_rate": 3.16084540366668e-05, + "loss": 0.1932, + "mean_token_accuracy": 0.9274547725915909, + "num_tokens": 73935155.0, + "step": 3770 + }, + { + "entropy": 0.6938288390636445, + "epoch": 0.610909090909091, + "grad_norm": 0.31285151839256287, + "learning_rate": 3.15908788656344e-05, + "loss": 0.1959, + "mean_token_accuracy": 0.9191734343767166, + "num_tokens": 74119650.0, + "step": 3780 + }, + { + "entropy": 0.6699846811592579, + "epoch": 0.6125252525252525, + "grad_norm": 0.4750641882419586, + "learning_rate": 3.1573263187258754e-05, + "loss": 0.2092, + "mean_token_accuracy": 0.914281564950943, + "num_tokens": 74311830.0, + "step": 3790 + }, + { + "entropy": 0.6347973830997944, + "epoch": 0.6141414141414141, + "grad_norm": 0.45926523208618164, + "learning_rate": 3.155560705218e-05, + "loss": 0.2272, + "mean_token_accuracy": 0.9005576968193054, + "num_tokens": 74513798.0, + "step": 3800 + }, + { + "entropy": 0.6711005762219429, + "epoch": 0.6157575757575757, + "grad_norm": 0.32855984568595886, + "learning_rate": 3.1537910511154625e-05, + "loss": 0.1899, + "mean_token_accuracy": 0.9199647232890129, + "num_tokens": 74705123.0, + "step": 3810 + }, + { + "entropy": 0.6577563665807247, + "epoch": 0.6173737373737374, + "grad_norm": 0.2792748510837555, + "learning_rate": 3.152017361505522e-05, + "loss": 0.1991, + "mean_token_accuracy": 0.91060761064291, + "num_tokens": 74900348.0, + "step": 3820 + }, + { + "entropy": 0.6500893220305443, + "epoch": 0.618989898989899, + "grad_norm": 0.38441580533981323, + "learning_rate": 3.150239641487041e-05, + "loss": 0.2066, + "mean_token_accuracy": 0.907078555226326, + "num_tokens": 75095080.0, + "step": 3830 + }, + { + "entropy": 0.6571944713592529, + "epoch": 0.6206060606060606, + "grad_norm": 0.45710915327072144, + "learning_rate": 3.1484578961704694e-05, + "loss": 0.1982, + "mean_token_accuracy": 0.917755238711834, + "num_tokens": 75288294.0, + "step": 3840 + }, + { + "entropy": 0.6590827241539955, + "epoch": 0.6222222222222222, + "grad_norm": 0.3001457154750824, + "learning_rate": 3.1466721306778277e-05, + "loss": 0.1927, + "mean_token_accuracy": 0.9156096473336219, + "num_tokens": 75482456.0, + "step": 3850 + }, + { + "entropy": 0.6585754707455636, + "epoch": 0.6238383838383839, + "grad_norm": 0.27338069677352905, + "learning_rate": 3.144882350142693e-05, + "loss": 0.1812, + "mean_token_accuracy": 0.9205604672431946, + "num_tokens": 75675408.0, + "step": 3860 + }, + { + "entropy": 0.6397463321685791, + "epoch": 0.6254545454545455, + "grad_norm": 0.2751981317996979, + "learning_rate": 3.143088559710183e-05, + "loss": 0.1917, + "mean_token_accuracy": 0.9194412946701049, + "num_tokens": 75874715.0, + "step": 3870 + }, + { + "entropy": 0.6339182935655117, + "epoch": 0.6270707070707071, + "grad_norm": 0.5759657025337219, + "learning_rate": 3.141290764536947e-05, + "loss": 0.2002, + "mean_token_accuracy": 0.9099746659398079, + "num_tokens": 76072600.0, + "step": 3880 + }, + { + "entropy": 0.6439682267606258, + "epoch": 0.6286868686868687, + "grad_norm": 3.8243777751922607, + "learning_rate": 3.139488969791144e-05, + "loss": 0.2248, + "mean_token_accuracy": 0.9037980034947395, + "num_tokens": 76267369.0, + "step": 3890 + }, + { + "entropy": 0.6569941058754921, + "epoch": 0.6303030303030303, + "grad_norm": 4.018685817718506, + "learning_rate": 3.137683180652429e-05, + "loss": 0.1805, + "mean_token_accuracy": 0.9241274476051331, + "num_tokens": 76461234.0, + "step": 3900 + }, + { + "epoch": 0.6303030303030303, + "eval_entropy": 0.7052705428898335, + "eval_loss": 0.19400236010551453, + "eval_mean_token_accuracy": 0.9165776029825211, + "eval_num_tokens": 76461234.0, + "eval_runtime": 107.827, + "eval_samples_per_second": 9.274, + "eval_steps_per_second": 9.274, + "step": 3900 + }, + { + "entropy": 0.6134035527706146, + "epoch": 0.6319191919191919, + "grad_norm": 0.47339844703674316, + "learning_rate": 3.1358734023119434e-05, + "loss": 0.1963, + "mean_token_accuracy": 0.9142026767134667, + "num_tokens": 76668510.0, + "step": 3910 + }, + { + "entropy": 0.6287301488220691, + "epoch": 0.6335353535353535, + "grad_norm": 3.310734510421753, + "learning_rate": 3.134059639972293e-05, + "loss": 0.2095, + "mean_token_accuracy": 0.9131061762571335, + "num_tokens": 76869415.0, + "step": 3920 + }, + { + "entropy": 0.6743476808071136, + "epoch": 0.6351515151515151, + "grad_norm": 0.40917423367500305, + "learning_rate": 3.132241898847541e-05, + "loss": 0.1931, + "mean_token_accuracy": 0.9163327530026436, + "num_tokens": 77055631.0, + "step": 3930 + }, + { + "entropy": 0.6846744433045387, + "epoch": 0.6367676767676768, + "grad_norm": 0.257769376039505, + "learning_rate": 3.130420184163183e-05, + "loss": 0.1795, + "mean_token_accuracy": 0.9251757651567459, + "num_tokens": 77240945.0, + "step": 3940 + }, + { + "entropy": 0.6451074630022049, + "epoch": 0.6383838383838384, + "grad_norm": 0.4214618504047394, + "learning_rate": 3.128594501156141e-05, + "loss": 0.1987, + "mean_token_accuracy": 0.913572546839714, + "num_tokens": 77439671.0, + "step": 3950 + }, + { + "entropy": 0.636054840683937, + "epoch": 0.64, + "grad_norm": 0.30621734261512756, + "learning_rate": 3.126764855074745e-05, + "loss": 0.2131, + "mean_token_accuracy": 0.9086759075522423, + "num_tokens": 77639210.0, + "step": 3960 + }, + { + "entropy": 0.669957785308361, + "epoch": 0.6416161616161616, + "grad_norm": 0.3592897355556488, + "learning_rate": 3.124931251178716e-05, + "loss": 0.1822, + "mean_token_accuracy": 0.9224239885807037, + "num_tokens": 77829537.0, + "step": 3970 + }, + { + "entropy": 0.6642787851393223, + "epoch": 0.6432323232323233, + "grad_norm": 3.2593376636505127, + "learning_rate": 3.123093694739153e-05, + "loss": 0.2133, + "mean_token_accuracy": 0.913386145234108, + "num_tokens": 78019508.0, + "step": 3980 + }, + { + "entropy": 0.6268266074359417, + "epoch": 0.6448484848484849, + "grad_norm": 2.4404337406158447, + "learning_rate": 3.1212521910385174e-05, + "loss": 0.2343, + "mean_token_accuracy": 0.9024699181318283, + "num_tokens": 78222951.0, + "step": 3990 + }, + { + "entropy": 0.6415668934583664, + "epoch": 0.6464646464646465, + "grad_norm": 2.0136265754699707, + "learning_rate": 3.11940674537062e-05, + "loss": 0.1948, + "mean_token_accuracy": 0.9218700021505356, + "num_tokens": 78421003.0, + "step": 4000 + }, + { + "entropy": 0.6442751929163932, + "epoch": 0.648080808080808, + "grad_norm": 0.34446606040000916, + "learning_rate": 3.117557363040601e-05, + "loss": 0.2121, + "mean_token_accuracy": 0.9129304736852646, + "num_tokens": 78619565.0, + "step": 4010 + }, + { + "entropy": 0.6459943115711212, + "epoch": 0.6496969696969697, + "grad_norm": 0.46052300930023193, + "learning_rate": 3.115704049364918e-05, + "loss": 0.1923, + "mean_token_accuracy": 0.9172181561589241, + "num_tokens": 78815948.0, + "step": 4020 + }, + { + "entropy": 0.658907701075077, + "epoch": 0.6513131313131313, + "grad_norm": 0.4204244613647461, + "learning_rate": 3.1138468096713306e-05, + "loss": 0.1981, + "mean_token_accuracy": 0.9092022016644478, + "num_tokens": 79007755.0, + "step": 4030 + }, + { + "entropy": 0.6481206513941288, + "epoch": 0.6529292929292929, + "grad_norm": 1.92702054977417, + "learning_rate": 3.111985649298885e-05, + "loss": 0.2269, + "mean_token_accuracy": 0.9099973410367965, + "num_tokens": 79204470.0, + "step": 4040 + }, + { + "entropy": 0.6662653639912606, + "epoch": 0.6545454545454545, + "grad_norm": 0.41740235686302185, + "learning_rate": 3.110120573597897e-05, + "loss": 0.2198, + "mean_token_accuracy": 0.9084355965256691, + "num_tokens": 79396622.0, + "step": 4050 + }, + { + "entropy": 0.647067503631115, + "epoch": 0.6561616161616162, + "grad_norm": 0.3785872757434845, + "learning_rate": 3.1082515879299394e-05, + "loss": 0.2078, + "mean_token_accuracy": 0.912786665558815, + "num_tokens": 79594893.0, + "step": 4060 + }, + { + "entropy": 0.6522647351026535, + "epoch": 0.6577777777777778, + "grad_norm": 3.0412800312042236, + "learning_rate": 3.106378697667823e-05, + "loss": 0.2232, + "mean_token_accuracy": 0.9029908329248428, + "num_tokens": 79788535.0, + "step": 4070 + }, + { + "entropy": 0.665669298171997, + "epoch": 0.6593939393939394, + "grad_norm": 2.1263978481292725, + "learning_rate": 3.104501908195586e-05, + "loss": 0.2054, + "mean_token_accuracy": 0.907913924753666, + "num_tokens": 79978326.0, + "step": 4080 + }, + { + "entropy": 0.665568544715643, + "epoch": 0.661010101010101, + "grad_norm": 0.8354134559631348, + "learning_rate": 3.102621224908474e-05, + "loss": 0.2094, + "mean_token_accuracy": 0.9120962634682656, + "num_tokens": 80169109.0, + "step": 4090 + }, + { + "entropy": 0.6687669172883034, + "epoch": 0.6626262626262627, + "grad_norm": 0.27629363536834717, + "learning_rate": 3.100736653212925e-05, + "loss": 0.1931, + "mean_token_accuracy": 0.9171461597084999, + "num_tokens": 80360696.0, + "step": 4100 + }, + { + "entropy": 0.6284529320895672, + "epoch": 0.6642424242424242, + "grad_norm": 1.274771809577942, + "learning_rate": 3.0988481985265585e-05, + "loss": 0.1957, + "mean_token_accuracy": 0.9178080677986145, + "num_tokens": 80565133.0, + "step": 4110 + }, + { + "entropy": 0.6715195573866367, + "epoch": 0.6658585858585858, + "grad_norm": 0.6915074586868286, + "learning_rate": 3.0969558662781524e-05, + "loss": 0.2111, + "mean_token_accuracy": 0.9070376634597779, + "num_tokens": 80755904.0, + "step": 4120 + }, + { + "entropy": 0.6908825971186161, + "epoch": 0.6674747474747474, + "grad_norm": 0.31496939063072205, + "learning_rate": 3.0950596619076354e-05, + "loss": 0.1949, + "mean_token_accuracy": 0.9198219880461693, + "num_tokens": 80941817.0, + "step": 4130 + }, + { + "entropy": 0.6395300135016442, + "epoch": 0.6690909090909091, + "grad_norm": 0.2790515124797821, + "learning_rate": 3.093159590866066e-05, + "loss": 0.1929, + "mean_token_accuracy": 0.9128642991185189, + "num_tokens": 81142844.0, + "step": 4140 + }, + { + "entropy": 0.6719032816588879, + "epoch": 0.6707070707070707, + "grad_norm": 0.2999343276023865, + "learning_rate": 3.0912556586156157e-05, + "loss": 0.2079, + "mean_token_accuracy": 0.9178296446800231, + "num_tokens": 81336252.0, + "step": 4150 + }, + { + "entropy": 0.6469911426305771, + "epoch": 0.6723232323232323, + "grad_norm": 0.33051058650016785, + "learning_rate": 3.0893478706295616e-05, + "loss": 0.2052, + "mean_token_accuracy": 0.9086741656064987, + "num_tokens": 81535515.0, + "step": 4160 + }, + { + "entropy": 0.647654651850462, + "epoch": 0.673939393939394, + "grad_norm": 0.24572958052158356, + "learning_rate": 3.0874362323922583e-05, + "loss": 0.2028, + "mean_token_accuracy": 0.9146606475114822, + "num_tokens": 81735276.0, + "step": 4170 + }, + { + "entropy": 0.6480786517262459, + "epoch": 0.6755555555555556, + "grad_norm": 0.4676873981952667, + "learning_rate": 3.0855207493991344e-05, + "loss": 0.1963, + "mean_token_accuracy": 0.9165736556053161, + "num_tokens": 81935012.0, + "step": 4180 + }, + { + "entropy": 0.6712435901165008, + "epoch": 0.6771717171717172, + "grad_norm": 0.2698163092136383, + "learning_rate": 3.083601427156668e-05, + "loss": 0.1943, + "mean_token_accuracy": 0.9149259775876999, + "num_tokens": 82126662.0, + "step": 4190 + }, + { + "entropy": 0.6606485776603221, + "epoch": 0.6787878787878788, + "grad_norm": 0.602709949016571, + "learning_rate": 3.081678271182374e-05, + "loss": 0.1991, + "mean_token_accuracy": 0.9160255298018456, + "num_tokens": 82320333.0, + "step": 4200 + }, + { + "epoch": 0.6787878787878788, + "eval_entropy": 0.7162625348865986, + "eval_loss": 0.1796865314245224, + "eval_mean_token_accuracy": 0.9215285767912864, + "eval_num_tokens": 82320333.0, + "eval_runtime": 107.3663, + "eval_samples_per_second": 9.314, + "eval_steps_per_second": 9.314, + "step": 4200 + }, + { + "entropy": 0.6206379361450672, + "epoch": 0.6804040404040405, + "grad_norm": 0.536363422870636, + "learning_rate": 3.07975128700479e-05, + "loss": 0.2351, + "mean_token_accuracy": 0.8985056817531586, + "num_tokens": 82529314.0, + "step": 4210 + }, + { + "entropy": 0.663483539223671, + "epoch": 0.682020202020202, + "grad_norm": 0.48275747895240784, + "learning_rate": 3.077820480163457e-05, + "loss": 0.186, + "mean_token_accuracy": 0.9269337847828865, + "num_tokens": 82722207.0, + "step": 4220 + }, + { + "entropy": 0.6566696122288704, + "epoch": 0.6836363636363636, + "grad_norm": 0.41870349645614624, + "learning_rate": 3.0758858562089064e-05, + "loss": 0.1976, + "mean_token_accuracy": 0.9154109835624695, + "num_tokens": 82917862.0, + "step": 4230 + }, + { + "entropy": 0.6778297878801822, + "epoch": 0.6852525252525252, + "grad_norm": 0.9168776869773865, + "learning_rate": 3.0739474207026414e-05, + "loss": 0.1852, + "mean_token_accuracy": 0.913340613245964, + "num_tokens": 83105019.0, + "step": 4240 + }, + { + "entropy": 0.6792197197675705, + "epoch": 0.6868686868686869, + "grad_norm": 0.30777159333229065, + "learning_rate": 3.072005179217123e-05, + "loss": 0.1869, + "mean_token_accuracy": 0.9207609817385674, + "num_tokens": 83293836.0, + "step": 4250 + }, + { + "entropy": 0.6453257068991661, + "epoch": 0.6884848484848485, + "grad_norm": 0.7975980639457703, + "learning_rate": 3.0700591373357545e-05, + "loss": 0.191, + "mean_token_accuracy": 0.9087596118450165, + "num_tokens": 83491257.0, + "step": 4260 + }, + { + "entropy": 0.6475184738636017, + "epoch": 0.6901010101010101, + "grad_norm": 1.9192277193069458, + "learning_rate": 3.068109300652861e-05, + "loss": 0.2109, + "mean_token_accuracy": 0.9065216913819313, + "num_tokens": 83687806.0, + "step": 4270 + }, + { + "entropy": 0.6501183703541755, + "epoch": 0.6917171717171717, + "grad_norm": 0.3584229648113251, + "learning_rate": 3.0661556747736804e-05, + "loss": 0.1842, + "mean_token_accuracy": 0.9226197183132172, + "num_tokens": 83884039.0, + "step": 4280 + }, + { + "entropy": 0.6619088158011437, + "epoch": 0.6933333333333334, + "grad_norm": 0.4995156228542328, + "learning_rate": 3.06419826531434e-05, + "loss": 0.2075, + "mean_token_accuracy": 0.9091604053974152, + "num_tokens": 84081632.0, + "step": 4290 + }, + { + "entropy": 0.6826878771185875, + "epoch": 0.694949494949495, + "grad_norm": 1.2546823024749756, + "learning_rate": 3.0622370779018476e-05, + "loss": 0.1722, + "mean_token_accuracy": 0.9264982849359512, + "num_tokens": 84271214.0, + "step": 4300 + }, + { + "entropy": 0.6403830215334892, + "epoch": 0.6965656565656566, + "grad_norm": 2.722322702407837, + "learning_rate": 3.060272118174067e-05, + "loss": 0.2108, + "mean_token_accuracy": 0.9086841389536857, + "num_tokens": 84472924.0, + "step": 4310 + }, + { + "entropy": 0.6621344015002251, + "epoch": 0.6981818181818182, + "grad_norm": 0.30751991271972656, + "learning_rate": 3.058303391779712e-05, + "loss": 0.1911, + "mean_token_accuracy": 0.9191326126456261, + "num_tokens": 84666549.0, + "step": 4320 + }, + { + "entropy": 0.6759060591459274, + "epoch": 0.6997979797979798, + "grad_norm": 0.38084861636161804, + "learning_rate": 3.05633090437832e-05, + "loss": 0.1908, + "mean_token_accuracy": 0.918217821419239, + "num_tokens": 84857667.0, + "step": 4330 + }, + { + "entropy": 0.6639383666217327, + "epoch": 0.7014141414141414, + "grad_norm": 0.5564208626747131, + "learning_rate": 3.054354661640241e-05, + "loss": 0.1975, + "mean_token_accuracy": 0.916983051598072, + "num_tokens": 85052396.0, + "step": 4340 + }, + { + "entropy": 0.6601137340068817, + "epoch": 0.703030303030303, + "grad_norm": 0.26238831877708435, + "learning_rate": 3.052374669246622e-05, + "loss": 0.208, + "mean_token_accuracy": 0.9125534147024155, + "num_tokens": 85247718.0, + "step": 4350 + }, + { + "entropy": 0.6844325631856918, + "epoch": 0.7046464646464646, + "grad_norm": 0.4119778871536255, + "learning_rate": 3.0503909328893877e-05, + "loss": 0.1864, + "mean_token_accuracy": 0.9183766141533851, + "num_tokens": 85436202.0, + "step": 4360 + }, + { + "entropy": 0.6351237051188946, + "epoch": 0.7062626262626263, + "grad_norm": 0.2481747567653656, + "learning_rate": 3.048403458271227e-05, + "loss": 0.202, + "mean_token_accuracy": 0.9135596618056298, + "num_tokens": 85638337.0, + "step": 4370 + }, + { + "entropy": 0.6510977402329445, + "epoch": 0.7078787878787879, + "grad_norm": 0.2582281231880188, + "learning_rate": 3.0464122511055742e-05, + "loss": 0.2139, + "mean_token_accuracy": 0.9152849763631821, + "num_tokens": 85835739.0, + "step": 4380 + }, + { + "entropy": 0.6571626104414463, + "epoch": 0.7094949494949495, + "grad_norm": 0.39865022897720337, + "learning_rate": 3.0444173171165943e-05, + "loss": 0.1845, + "mean_token_accuracy": 0.9241902351379394, + "num_tokens": 86031887.0, + "step": 4390 + }, + { + "entropy": 0.6707207970321178, + "epoch": 0.7111111111111111, + "grad_norm": 4.553739070892334, + "learning_rate": 3.0424186620391658e-05, + "loss": 0.1986, + "mean_token_accuracy": 0.9116879284381867, + "num_tokens": 86221613.0, + "step": 4400 + }, + { + "entropy": 0.6952664569020272, + "epoch": 0.7127272727272728, + "grad_norm": 1.644877314567566, + "learning_rate": 3.040416291618864e-05, + "loss": 0.1896, + "mean_token_accuracy": 0.9263504967093468, + "num_tokens": 86405357.0, + "step": 4410 + }, + { + "entropy": 0.6387695834040642, + "epoch": 0.7143434343434344, + "grad_norm": 0.33424562215805054, + "learning_rate": 3.0384102116119443e-05, + "loss": 0.2039, + "mean_token_accuracy": 0.9096469655632973, + "num_tokens": 86605382.0, + "step": 4420 + }, + { + "entropy": 0.6235728591680527, + "epoch": 0.7159595959595959, + "grad_norm": 0.2764752209186554, + "learning_rate": 3.0364004277853282e-05, + "loss": 0.1944, + "mean_token_accuracy": 0.913192069530487, + "num_tokens": 86808324.0, + "step": 4430 + }, + { + "entropy": 0.6314373418688775, + "epoch": 0.7175757575757575, + "grad_norm": 0.512512743473053, + "learning_rate": 3.0343869459165815e-05, + "loss": 0.2098, + "mean_token_accuracy": 0.9095560878515243, + "num_tokens": 87010056.0, + "step": 4440 + }, + { + "entropy": 0.6400863215327263, + "epoch": 0.7191919191919192, + "grad_norm": 0.3962872326374054, + "learning_rate": 3.0323697717939035e-05, + "loss": 0.1925, + "mean_token_accuracy": 0.9209304124116897, + "num_tokens": 87206033.0, + "step": 4450 + }, + { + "entropy": 0.6568309247493744, + "epoch": 0.7208080808080808, + "grad_norm": 0.43984800577163696, + "learning_rate": 3.030348911216107e-05, + "loss": 0.184, + "mean_token_accuracy": 0.9168927937746048, + "num_tokens": 87400060.0, + "step": 4460 + }, + { + "entropy": 0.644003939628601, + "epoch": 0.7224242424242424, + "grad_norm": 0.7947312593460083, + "learning_rate": 3.0283243699926004e-05, + "loss": 0.1939, + "mean_token_accuracy": 0.9169272065162659, + "num_tokens": 87596089.0, + "step": 4470 + }, + { + "entropy": 0.6618608511984349, + "epoch": 0.724040404040404, + "grad_norm": 0.20077355206012726, + "learning_rate": 3.026296153943376e-05, + "loss": 0.1869, + "mean_token_accuracy": 0.92214535176754, + "num_tokens": 87786718.0, + "step": 4480 + }, + { + "entropy": 0.6560651198029518, + "epoch": 0.7256565656565657, + "grad_norm": 0.24577167630195618, + "learning_rate": 3.0242642688989887e-05, + "loss": 0.1927, + "mean_token_accuracy": 0.9134400516748429, + "num_tokens": 87977636.0, + "step": 4490 + }, + { + "entropy": 0.6534604743123055, + "epoch": 0.7272727272727273, + "grad_norm": 0.22389015555381775, + "learning_rate": 3.02222872070054e-05, + "loss": 0.1913, + "mean_token_accuracy": 0.9150966987013817, + "num_tokens": 88173839.0, + "step": 4500 + }, + { + "epoch": 0.7272727272727273, + "eval_entropy": 0.7084573372304439, + "eval_loss": 0.17917364835739136, + "eval_mean_token_accuracy": 0.9231415635347366, + "eval_num_tokens": 88173839.0, + "eval_runtime": 107.7522, + "eval_samples_per_second": 9.281, + "eval_steps_per_second": 9.281, + "step": 4500 + }, + { + "entropy": 0.625628887116909, + "epoch": 0.7288888888888889, + "grad_norm": 0.34484541416168213, + "learning_rate": 3.0201895151996636e-05, + "loss": 0.1992, + "mean_token_accuracy": 0.915931510925293, + "num_tokens": 88378636.0, + "step": 4510 + }, + { + "entropy": 0.6244528807699681, + "epoch": 0.7305050505050505, + "grad_norm": 0.253776490688324, + "learning_rate": 3.0181466582585056e-05, + "loss": 0.2, + "mean_token_accuracy": 0.9082011923193931, + "num_tokens": 88580751.0, + "step": 4520 + }, + { + "entropy": 0.6235019870102405, + "epoch": 0.7321212121212122, + "grad_norm": 0.41761407256126404, + "learning_rate": 3.0161001557497097e-05, + "loss": 0.2076, + "mean_token_accuracy": 0.9085301622748375, + "num_tokens": 88784953.0, + "step": 4530 + }, + { + "entropy": 0.6446406342089176, + "epoch": 0.7337373737373737, + "grad_norm": 0.31629061698913574, + "learning_rate": 3.0140500135563988e-05, + "loss": 0.1894, + "mean_token_accuracy": 0.9079845041036606, + "num_tokens": 88980717.0, + "step": 4540 + }, + { + "entropy": 0.6352882876992225, + "epoch": 0.7353535353535353, + "grad_norm": 0.311393678188324, + "learning_rate": 3.01199623757216e-05, + "loss": 0.2006, + "mean_token_accuracy": 0.9123006537556648, + "num_tokens": 89179270.0, + "step": 4550 + }, + { + "entropy": 0.6552980750799179, + "epoch": 0.7369696969696969, + "grad_norm": 0.2929733693599701, + "learning_rate": 3.0099388337010258e-05, + "loss": 0.1975, + "mean_token_accuracy": 0.9153749331831932, + "num_tokens": 89371171.0, + "step": 4560 + }, + { + "entropy": 0.6586298391222953, + "epoch": 0.7385858585858586, + "grad_norm": 0.35867592692375183, + "learning_rate": 3.0078778078574578e-05, + "loss": 0.1988, + "mean_token_accuracy": 0.9191118314862251, + "num_tokens": 89562627.0, + "step": 4570 + }, + { + "entropy": 0.6490816324949265, + "epoch": 0.7402020202020202, + "grad_norm": 0.41531920433044434, + "learning_rate": 3.00581316596633e-05, + "loss": 0.1954, + "mean_token_accuracy": 0.9126431494951248, + "num_tokens": 89756798.0, + "step": 4580 + }, + { + "entropy": 0.652235820889473, + "epoch": 0.7418181818181818, + "grad_norm": 0.2528453767299652, + "learning_rate": 3.0037449139629118e-05, + "loss": 0.2169, + "mean_token_accuracy": 0.9148132383823395, + "num_tokens": 89949016.0, + "step": 4590 + }, + { + "entropy": 0.6657333552837372, + "epoch": 0.7434343434343434, + "grad_norm": 0.35651499032974243, + "learning_rate": 3.0016730577928502e-05, + "loss": 0.2013, + "mean_token_accuracy": 0.9121545031666756, + "num_tokens": 90137136.0, + "step": 4600 + }, + { + "entropy": 0.6497416846454144, + "epoch": 0.7450505050505051, + "grad_norm": 0.8100721836090088, + "learning_rate": 2.9995976034121533e-05, + "loss": 0.1845, + "mean_token_accuracy": 0.9251446709036827, + "num_tokens": 90330201.0, + "step": 4610 + }, + { + "entropy": 0.6356344550848008, + "epoch": 0.7466666666666667, + "grad_norm": 0.55918288230896, + "learning_rate": 2.9975185567871724e-05, + "loss": 0.1885, + "mean_token_accuracy": 0.9203934907913208, + "num_tokens": 90527700.0, + "step": 4620 + }, + { + "entropy": 0.6411231979727745, + "epoch": 0.7482828282828283, + "grad_norm": 0.3770994246006012, + "learning_rate": 2.9954359238945874e-05, + "loss": 0.185, + "mean_token_accuracy": 0.9203880429267883, + "num_tokens": 90726375.0, + "step": 4630 + }, + { + "entropy": 0.6443192966282367, + "epoch": 0.74989898989899, + "grad_norm": 1.301261067390442, + "learning_rate": 2.993349710721386e-05, + "loss": 0.2015, + "mean_token_accuracy": 0.9126525431871414, + "num_tokens": 90922076.0, + "step": 4640 + }, + { + "entropy": 0.6357131384313106, + "epoch": 0.7515151515151515, + "grad_norm": 0.5413894057273865, + "learning_rate": 2.9912599232648484e-05, + "loss": 0.2105, + "mean_token_accuracy": 0.9099463567137718, + "num_tokens": 91120775.0, + "step": 4650 + }, + { + "entropy": 0.6498973920941353, + "epoch": 0.7531313131313131, + "grad_norm": 0.3530432879924774, + "learning_rate": 2.9891665675325303e-05, + "loss": 0.1886, + "mean_token_accuracy": 0.9196889087557792, + "num_tokens": 91313565.0, + "step": 4660 + }, + { + "entropy": 0.6522077709436417, + "epoch": 0.7547474747474747, + "grad_norm": 0.3330952525138855, + "learning_rate": 2.9870696495422457e-05, + "loss": 0.1916, + "mean_token_accuracy": 0.9102325558662414, + "num_tokens": 91508313.0, + "step": 4670 + }, + { + "entropy": 0.6606266908347607, + "epoch": 0.7563636363636363, + "grad_norm": 0.3526689410209656, + "learning_rate": 2.9849691753220472e-05, + "loss": 0.1852, + "mean_token_accuracy": 0.9151232793927193, + "num_tokens": 91698872.0, + "step": 4680 + }, + { + "entropy": 0.6377008631825447, + "epoch": 0.757979797979798, + "grad_norm": 0.9076177477836609, + "learning_rate": 2.982865150910213e-05, + "loss": 0.1972, + "mean_token_accuracy": 0.9114378541707993, + "num_tokens": 91897368.0, + "step": 4690 + }, + { + "entropy": 0.6670403979718685, + "epoch": 0.7595959595959596, + "grad_norm": 0.41317087411880493, + "learning_rate": 2.9807575823552256e-05, + "loss": 0.1976, + "mean_token_accuracy": 0.9175861835479736, + "num_tokens": 92089611.0, + "step": 4700 + }, + { + "entropy": 0.6318577192723751, + "epoch": 0.7612121212121212, + "grad_norm": 1.2537201642990112, + "learning_rate": 2.978646475715756e-05, + "loss": 0.2033, + "mean_token_accuracy": 0.9059315130114556, + "num_tokens": 92289093.0, + "step": 4710 + }, + { + "entropy": 0.641324719786644, + "epoch": 0.7628282828282829, + "grad_norm": 3.2999162673950195, + "learning_rate": 2.976531837060646e-05, + "loss": 0.2063, + "mean_token_accuracy": 0.9138477474451066, + "num_tokens": 92486668.0, + "step": 4720 + }, + { + "entropy": 0.6670213177800178, + "epoch": 0.7644444444444445, + "grad_norm": 0.544033408164978, + "learning_rate": 2.9744136724688925e-05, + "loss": 0.2029, + "mean_token_accuracy": 0.9136773809790611, + "num_tokens": 92676682.0, + "step": 4730 + }, + { + "entropy": 0.6242363020777703, + "epoch": 0.7660606060606061, + "grad_norm": 0.23182179033756256, + "learning_rate": 2.9722919880296275e-05, + "loss": 0.2127, + "mean_token_accuracy": 0.9077824458479882, + "num_tokens": 92881908.0, + "step": 4740 + }, + { + "entropy": 0.646241495758295, + "epoch": 0.7676767676767676, + "grad_norm": 0.46250054240226746, + "learning_rate": 2.9701667898421007e-05, + "loss": 0.1877, + "mean_token_accuracy": 0.9231104418635369, + "num_tokens": 93079703.0, + "step": 4750 + }, + { + "entropy": 0.6419562876224518, + "epoch": 0.7692929292929293, + "grad_norm": 0.5598077178001404, + "learning_rate": 2.968038084015664e-05, + "loss": 0.1999, + "mean_token_accuracy": 0.90995651781559, + "num_tokens": 93279631.0, + "step": 4760 + }, + { + "entropy": 0.6329437598586083, + "epoch": 0.7709090909090909, + "grad_norm": 0.44706058502197266, + "learning_rate": 2.9659058766697517e-05, + "loss": 0.192, + "mean_token_accuracy": 0.9213619500398635, + "num_tokens": 93485133.0, + "step": 4770 + }, + { + "entropy": 0.641933137178421, + "epoch": 0.7725252525252525, + "grad_norm": 0.26708972454071045, + "learning_rate": 2.9637701739338663e-05, + "loss": 0.1998, + "mean_token_accuracy": 0.9142438322305679, + "num_tokens": 93685004.0, + "step": 4780 + }, + { + "entropy": 0.661033497005701, + "epoch": 0.7741414141414141, + "grad_norm": 0.330924391746521, + "learning_rate": 2.9616309819475555e-05, + "loss": 0.1965, + "mean_token_accuracy": 0.9215885639190674, + "num_tokens": 93876811.0, + "step": 4790 + }, + { + "entropy": 0.6610300816595555, + "epoch": 0.7757575757575758, + "grad_norm": 0.5013708472251892, + "learning_rate": 2.9594883068603994e-05, + "loss": 0.1939, + "mean_token_accuracy": 0.9239798724651337, + "num_tokens": 94069193.0, + "step": 4800 + }, + { + "epoch": 0.7757575757575758, + "eval_entropy": 0.7066668328940868, + "eval_loss": 0.1791316419839859, + "eval_mean_token_accuracy": 0.9220727326869964, + "eval_num_tokens": 94069193.0, + "eval_runtime": 107.0928, + "eval_samples_per_second": 9.338, + "eval_steps_per_second": 9.338, + "step": 4800 + }, + { + "entropy": 0.6644454948604107, + "epoch": 0.7773737373737374, + "grad_norm": 0.3091444671154022, + "learning_rate": 2.9573421548319915e-05, + "loss": 0.1709, + "mean_token_accuracy": 0.9266034245491028, + "num_tokens": 94259025.0, + "step": 4810 + }, + { + "entropy": 0.661138878762722, + "epoch": 0.778989898989899, + "grad_norm": 3.5806069374084473, + "learning_rate": 2.955192532031919e-05, + "loss": 0.177, + "mean_token_accuracy": 0.9198731109499931, + "num_tokens": 94448772.0, + "step": 4820 + }, + { + "entropy": 0.6519260667264462, + "epoch": 0.7806060606060606, + "grad_norm": 0.3555883765220642, + "learning_rate": 2.9530394446397478e-05, + "loss": 0.1986, + "mean_token_accuracy": 0.9150562450289726, + "num_tokens": 94641890.0, + "step": 4830 + }, + { + "entropy": 0.648599949479103, + "epoch": 0.7822222222222223, + "grad_norm": 0.1755499243736267, + "learning_rate": 2.950882898845003e-05, + "loss": 0.1838, + "mean_token_accuracy": 0.9226464763283729, + "num_tokens": 94837343.0, + "step": 4840 + }, + { + "entropy": 0.6322360306978225, + "epoch": 0.7838383838383839, + "grad_norm": 0.6269310712814331, + "learning_rate": 2.9487229008471516e-05, + "loss": 0.1975, + "mean_token_accuracy": 0.9124949604272843, + "num_tokens": 95037833.0, + "step": 4850 + }, + { + "entropy": 0.6398171879351139, + "epoch": 0.7854545454545454, + "grad_norm": 0.7617530226707458, + "learning_rate": 2.9465594568555848e-05, + "loss": 0.2033, + "mean_token_accuracy": 0.9138290837407113, + "num_tokens": 95234489.0, + "step": 4860 + }, + { + "entropy": 0.6249017030000686, + "epoch": 0.787070707070707, + "grad_norm": 1.4149338006973267, + "learning_rate": 2.9443925730896002e-05, + "loss": 0.2005, + "mean_token_accuracy": 0.9207390502095223, + "num_tokens": 95437222.0, + "step": 4870 + }, + { + "entropy": 0.6542009584605694, + "epoch": 0.7886868686868687, + "grad_norm": 0.2479691058397293, + "learning_rate": 2.942222255778384e-05, + "loss": 0.1836, + "mean_token_accuracy": 0.9258310168981552, + "num_tokens": 95632246.0, + "step": 4880 + }, + { + "entropy": 0.6555586464703083, + "epoch": 0.7903030303030303, + "grad_norm": 0.37414678931236267, + "learning_rate": 2.940048511160993e-05, + "loss": 0.2025, + "mean_token_accuracy": 0.9188988924026489, + "num_tokens": 95826517.0, + "step": 4890 + }, + { + "entropy": 0.6773953974246979, + "epoch": 0.7919191919191919, + "grad_norm": 0.3192891478538513, + "learning_rate": 2.937871345486335e-05, + "loss": 0.1793, + "mean_token_accuracy": 0.9206574469804764, + "num_tokens": 96010001.0, + "step": 4900 + }, + { + "entropy": 0.6655720636248589, + "epoch": 0.7935353535353535, + "grad_norm": 0.29721325635910034, + "learning_rate": 2.935690765013155e-05, + "loss": 0.2051, + "mean_token_accuracy": 0.9204679220914841, + "num_tokens": 96199172.0, + "step": 4910 + }, + { + "entropy": 0.6329097993671894, + "epoch": 0.7951515151515152, + "grad_norm": 3.6277847290039062, + "learning_rate": 2.933506776010012e-05, + "loss": 0.1848, + "mean_token_accuracy": 0.9226977825164795, + "num_tokens": 96397606.0, + "step": 4920 + }, + { + "entropy": 0.6353421777486801, + "epoch": 0.7967676767676768, + "grad_norm": 0.5429826378822327, + "learning_rate": 2.9313193847552646e-05, + "loss": 0.1969, + "mean_token_accuracy": 0.9134786337614059, + "num_tokens": 96598079.0, + "step": 4930 + }, + { + "entropy": 0.6452288366854191, + "epoch": 0.7983838383838384, + "grad_norm": 0.6199573278427124, + "learning_rate": 2.9291285975370532e-05, + "loss": 0.1896, + "mean_token_accuracy": 0.9198967650532722, + "num_tokens": 96794052.0, + "step": 4940 + }, + { + "entropy": 0.6411104649305344, + "epoch": 0.8, + "grad_norm": 3.42073130607605, + "learning_rate": 2.9269344206532787e-05, + "loss": 0.2022, + "mean_token_accuracy": 0.9114478975534439, + "num_tokens": 96990153.0, + "step": 4950 + }, + { + "entropy": 0.6478223502635956, + "epoch": 0.8016161616161617, + "grad_norm": 0.543823778629303, + "learning_rate": 2.9247368604115868e-05, + "loss": 0.1914, + "mean_token_accuracy": 0.9210754543542862, + "num_tokens": 97187085.0, + "step": 4960 + }, + { + "entropy": 0.6267038069665432, + "epoch": 0.8032323232323232, + "grad_norm": 0.38325926661491394, + "learning_rate": 2.9225359231293504e-05, + "loss": 0.2113, + "mean_token_accuracy": 0.9096709057688713, + "num_tokens": 97391001.0, + "step": 4970 + }, + { + "entropy": 0.6416891925036907, + "epoch": 0.8048484848484848, + "grad_norm": 0.28709155321121216, + "learning_rate": 2.9203316151336503e-05, + "loss": 0.2057, + "mean_token_accuracy": 0.9100401312112808, + "num_tokens": 97589847.0, + "step": 4980 + }, + { + "entropy": 0.6513305693864823, + "epoch": 0.8064646464646464, + "grad_norm": 0.3699948191642761, + "learning_rate": 2.9181239427612553e-05, + "loss": 0.2023, + "mean_token_accuracy": 0.9095991387963295, + "num_tokens": 97784709.0, + "step": 4990 + }, + { + "entropy": 0.6572237811982632, + "epoch": 0.8080808080808081, + "grad_norm": 0.22213219106197357, + "learning_rate": 2.915912912358608e-05, + "loss": 0.1874, + "mean_token_accuracy": 0.9166789174079895, + "num_tokens": 97976974.0, + "step": 5000 + }, + { + "entropy": 0.6352049507200718, + "epoch": 0.8096969696969697, + "grad_norm": 0.3106616735458374, + "learning_rate": 2.9136985302818037e-05, + "loss": 0.1961, + "mean_token_accuracy": 0.9133357793092728, + "num_tokens": 98178430.0, + "step": 5010 + }, + { + "entropy": 0.6425909124314785, + "epoch": 0.8113131313131313, + "grad_norm": 0.21424593031406403, + "learning_rate": 2.9114808028965725e-05, + "loss": 0.206, + "mean_token_accuracy": 0.9140076264739037, + "num_tokens": 98378932.0, + "step": 5020 + }, + { + "entropy": 0.633851234614849, + "epoch": 0.812929292929293, + "grad_norm": 0.6443716883659363, + "learning_rate": 2.909259736578261e-05, + "loss": 0.2156, + "mean_token_accuracy": 0.9081557050347329, + "num_tokens": 98582018.0, + "step": 5030 + }, + { + "entropy": 0.6445118576288223, + "epoch": 0.8145454545454546, + "grad_norm": 0.2837905287742615, + "learning_rate": 2.9070353377118157e-05, + "loss": 0.2013, + "mean_token_accuracy": 0.9167410224676132, + "num_tokens": 98780254.0, + "step": 5040 + }, + { + "entropy": 0.6554682970046997, + "epoch": 0.8161616161616162, + "grad_norm": 0.19515164196491241, + "learning_rate": 2.904807612691762e-05, + "loss": 0.1841, + "mean_token_accuracy": 0.9176914617419243, + "num_tokens": 98975630.0, + "step": 5050 + }, + { + "entropy": 0.6432619698345661, + "epoch": 0.8177777777777778, + "grad_norm": 0.4333808422088623, + "learning_rate": 2.9025765679221877e-05, + "loss": 0.1874, + "mean_token_accuracy": 0.9236332550644875, + "num_tokens": 99172630.0, + "step": 5060 + }, + { + "entropy": 0.6520089380443096, + "epoch": 0.8193939393939393, + "grad_norm": 0.2755585312843323, + "learning_rate": 2.9003422098167233e-05, + "loss": 0.2044, + "mean_token_accuracy": 0.9035430192947388, + "num_tokens": 99366672.0, + "step": 5070 + }, + { + "entropy": 0.6506100505590439, + "epoch": 0.821010101010101, + "grad_norm": 0.2337871491909027, + "learning_rate": 2.8981045447985252e-05, + "loss": 0.1886, + "mean_token_accuracy": 0.9145638585090637, + "num_tokens": 99560531.0, + "step": 5080 + }, + { + "entropy": 0.6408963188529014, + "epoch": 0.8226262626262626, + "grad_norm": 0.6002528071403503, + "learning_rate": 2.8958635793002555e-05, + "loss": 0.2027, + "mean_token_accuracy": 0.9141564324498177, + "num_tokens": 99763014.0, + "step": 5090 + }, + { + "entropy": 0.6397250548005105, + "epoch": 0.8242424242424242, + "grad_norm": 0.4990832209587097, + "learning_rate": 2.893619319764064e-05, + "loss": 0.2051, + "mean_token_accuracy": 0.9149850279092788, + "num_tokens": 99963749.0, + "step": 5100 + }, + { + "epoch": 0.8242424242424242, + "eval_entropy": 0.711084859162569, + "eval_loss": 0.17738723754882812, + "eval_mean_token_accuracy": 0.9236658059358597, + "eval_num_tokens": 99963749.0, + "eval_runtime": 108.7249, + "eval_samples_per_second": 9.198, + "eval_steps_per_second": 9.198, + "step": 5100 + }, + { + "entropy": 0.6357235059142112, + "epoch": 0.8258585858585858, + "grad_norm": 0.3885038495063782, + "learning_rate": 2.8913717726415703e-05, + "loss": 0.1997, + "mean_token_accuracy": 0.9168539077043534, + "num_tokens": 100163382.0, + "step": 5110 + }, + { + "entropy": 0.6695950977504254, + "epoch": 0.8274747474747475, + "grad_norm": 0.3567816913127899, + "learning_rate": 2.8891209443938462e-05, + "loss": 0.1857, + "mean_token_accuracy": 0.9217189520597457, + "num_tokens": 100352951.0, + "step": 5120 + }, + { + "entropy": 0.644576845318079, + "epoch": 0.8290909090909091, + "grad_norm": 1.626629114151001, + "learning_rate": 2.8868668414913932e-05, + "loss": 0.2001, + "mean_token_accuracy": 0.9181193798780442, + "num_tokens": 100550280.0, + "step": 5130 + }, + { + "entropy": 0.6233297750353813, + "epoch": 0.8307070707070707, + "grad_norm": 0.2700617015361786, + "learning_rate": 2.8846094704141283e-05, + "loss": 0.2024, + "mean_token_accuracy": 0.917197747528553, + "num_tokens": 100755619.0, + "step": 5140 + }, + { + "entropy": 0.6268128864467144, + "epoch": 0.8323232323232324, + "grad_norm": 0.3646371066570282, + "learning_rate": 2.8823488376513636e-05, + "loss": 0.1905, + "mean_token_accuracy": 0.9136819630861283, + "num_tokens": 100957153.0, + "step": 5150 + }, + { + "entropy": 0.6693423956632614, + "epoch": 0.833939393939394, + "grad_norm": 0.41190770268440247, + "learning_rate": 2.8800849497017874e-05, + "loss": 0.1982, + "mean_token_accuracy": 0.912401182949543, + "num_tokens": 101146659.0, + "step": 5160 + }, + { + "entropy": 0.6590829059481621, + "epoch": 0.8355555555555556, + "grad_norm": 1.4731428623199463, + "learning_rate": 2.8778178130734445e-05, + "loss": 0.1928, + "mean_token_accuracy": 0.9157117456197739, + "num_tokens": 101340632.0, + "step": 5170 + }, + { + "entropy": 0.6364330008625985, + "epoch": 0.8371717171717171, + "grad_norm": 0.4675506055355072, + "learning_rate": 2.8755474342837214e-05, + "loss": 0.2048, + "mean_token_accuracy": 0.9125443175435066, + "num_tokens": 101540607.0, + "step": 5180 + }, + { + "entropy": 0.646899762749672, + "epoch": 0.8387878787878787, + "grad_norm": 0.6047652363777161, + "learning_rate": 2.873273819859321e-05, + "loss": 0.1922, + "mean_token_accuracy": 0.914633323252201, + "num_tokens": 101737070.0, + "step": 5190 + }, + { + "entropy": 0.6807426974177361, + "epoch": 0.8404040404040404, + "grad_norm": 0.26485979557037354, + "learning_rate": 2.8709969763362528e-05, + "loss": 0.1828, + "mean_token_accuracy": 0.9270256280899047, + "num_tokens": 101924748.0, + "step": 5200 + }, + { + "entropy": 0.6325490467250348, + "epoch": 0.842020202020202, + "grad_norm": 0.6139155030250549, + "learning_rate": 2.8687169102598045e-05, + "loss": 0.1959, + "mean_token_accuracy": 0.9126851588487626, + "num_tokens": 102126864.0, + "step": 5210 + }, + { + "entropy": 0.6704427659511566, + "epoch": 0.8436363636363636, + "grad_norm": 0.47554439306259155, + "learning_rate": 2.8664336281845305e-05, + "loss": 0.1859, + "mean_token_accuracy": 0.9185951977968216, + "num_tokens": 102317161.0, + "step": 5220 + }, + { + "entropy": 0.6308740340173244, + "epoch": 0.8452525252525253, + "grad_norm": 0.28282374143600464, + "learning_rate": 2.864147136674229e-05, + "loss": 0.2062, + "mean_token_accuracy": 0.9082028090953826, + "num_tokens": 102520346.0, + "step": 5230 + }, + { + "entropy": 0.6597593754529953, + "epoch": 0.8468686868686869, + "grad_norm": 0.29790395498275757, + "learning_rate": 2.8618574423019245e-05, + "loss": 0.1866, + "mean_token_accuracy": 0.9192267313599587, + "num_tokens": 102711472.0, + "step": 5240 + }, + { + "entropy": 0.6680999815464019, + "epoch": 0.8484848484848485, + "grad_norm": 0.33394286036491394, + "learning_rate": 2.85956455164985e-05, + "loss": 0.1689, + "mean_token_accuracy": 0.9244133904576302, + "num_tokens": 102900401.0, + "step": 5250 + }, + { + "entropy": 0.6864773757755757, + "epoch": 0.8501010101010101, + "grad_norm": 0.5447540879249573, + "learning_rate": 2.8572684713094258e-05, + "loss": 0.1772, + "mean_token_accuracy": 0.9200161874294281, + "num_tokens": 103084516.0, + "step": 5260 + }, + { + "entropy": 0.6241940252482892, + "epoch": 0.8517171717171718, + "grad_norm": 0.2910633981227875, + "learning_rate": 2.8549692078812408e-05, + "loss": 0.1815, + "mean_token_accuracy": 0.9207342237234115, + "num_tokens": 103288299.0, + "step": 5270 + }, + { + "entropy": 0.6231748268008233, + "epoch": 0.8533333333333334, + "grad_norm": 1.1171069145202637, + "learning_rate": 2.8526667679750373e-05, + "loss": 0.2025, + "mean_token_accuracy": 0.9095341920852661, + "num_tokens": 103490561.0, + "step": 5280 + }, + { + "entropy": 0.678689093887806, + "epoch": 0.8549494949494949, + "grad_norm": 0.9614298939704895, + "learning_rate": 2.850361158209686e-05, + "loss": 0.1854, + "mean_token_accuracy": 0.916099627315998, + "num_tokens": 103676969.0, + "step": 5290 + }, + { + "entropy": 0.6450789928436279, + "epoch": 0.8565656565656565, + "grad_norm": 1.1223703622817993, + "learning_rate": 2.848052385213172e-05, + "loss": 0.1967, + "mean_token_accuracy": 0.9124213516712188, + "num_tokens": 103874801.0, + "step": 5300 + }, + { + "entropy": 0.61056489944458, + "epoch": 0.8581818181818182, + "grad_norm": 0.6901957988739014, + "learning_rate": 2.845740455622573e-05, + "loss": 0.2225, + "mean_token_accuracy": 0.9036866158246994, + "num_tokens": 104084220.0, + "step": 5310 + }, + { + "entropy": 0.6487591087818145, + "epoch": 0.8597979797979798, + "grad_norm": 0.2818031907081604, + "learning_rate": 2.843425376084041e-05, + "loss": 0.1711, + "mean_token_accuracy": 0.9231310933828354, + "num_tokens": 104279547.0, + "step": 5320 + }, + { + "entropy": 0.6611220106482506, + "epoch": 0.8614141414141414, + "grad_norm": 2.2702674865722656, + "learning_rate": 2.8411071532527836e-05, + "loss": 0.1863, + "mean_token_accuracy": 0.9261722102761268, + "num_tokens": 104471791.0, + "step": 5330 + }, + { + "entropy": 0.6651269473135472, + "epoch": 0.863030303030303, + "grad_norm": 0.36874091625213623, + "learning_rate": 2.8387857937930444e-05, + "loss": 0.1985, + "mean_token_accuracy": 0.9155077993869781, + "num_tokens": 104666419.0, + "step": 5340 + }, + { + "entropy": 0.6907488480210304, + "epoch": 0.8646464646464647, + "grad_norm": 0.34566083550453186, + "learning_rate": 2.8364613043780834e-05, + "loss": 0.1836, + "mean_token_accuracy": 0.9205912709236145, + "num_tokens": 104851104.0, + "step": 5350 + }, + { + "entropy": 0.6465521849691868, + "epoch": 0.8662626262626263, + "grad_norm": 0.5255801677703857, + "learning_rate": 2.8341336916901593e-05, + "loss": 0.1931, + "mean_token_accuracy": 0.9175161227583886, + "num_tokens": 105048989.0, + "step": 5360 + }, + { + "entropy": 0.6434522531926632, + "epoch": 0.8678787878787879, + "grad_norm": 0.6469079256057739, + "learning_rate": 2.8318029624205086e-05, + "loss": 0.1932, + "mean_token_accuracy": 0.9107877269387246, + "num_tokens": 105247416.0, + "step": 5370 + }, + { + "entropy": 0.6635441213846207, + "epoch": 0.8694949494949495, + "grad_norm": 0.2386147379875183, + "learning_rate": 2.829469123269327e-05, + "loss": 0.1974, + "mean_token_accuracy": 0.916090327501297, + "num_tokens": 105438507.0, + "step": 5380 + }, + { + "entropy": 0.6538894563913346, + "epoch": 0.8711111111111111, + "grad_norm": 0.24012506008148193, + "learning_rate": 2.8271321809457514e-05, + "loss": 0.201, + "mean_token_accuracy": 0.9102769285440445, + "num_tokens": 105634997.0, + "step": 5390 + }, + { + "entropy": 0.6566796422004699, + "epoch": 0.8727272727272727, + "grad_norm": 0.6380462646484375, + "learning_rate": 2.824792142167838e-05, + "loss": 0.1768, + "mean_token_accuracy": 0.9162303432822227, + "num_tokens": 105829263.0, + "step": 5400 + }, + { + "epoch": 0.8727272727272727, + "eval_entropy": 0.7135846571624279, + "eval_loss": 0.17874300479888916, + "eval_mean_token_accuracy": 0.9199917802810669, + "eval_num_tokens": 105829263.0, + "eval_runtime": 107.6177, + "eval_samples_per_second": 9.292, + "eval_steps_per_second": 9.292, + "step": 5400 + }, + { + "entropy": 0.6447012484073639, + "epoch": 0.8743434343434343, + "grad_norm": 2.0912580490112305, + "learning_rate": 2.8224490136625455e-05, + "loss": 0.2109, + "mean_token_accuracy": 0.9130528718233109, + "num_tokens": 106026045.0, + "step": 5410 + }, + { + "entropy": 0.6804825678467751, + "epoch": 0.8759595959595959, + "grad_norm": 0.47214868664741516, + "learning_rate": 2.8201028021657138e-05, + "loss": 0.173, + "mean_token_accuracy": 0.9197008013725281, + "num_tokens": 106215632.0, + "step": 5420 + }, + { + "entropy": 0.6902777642011643, + "epoch": 0.8775757575757576, + "grad_norm": 0.18231706321239471, + "learning_rate": 2.8177535144220456e-05, + "loss": 0.1848, + "mean_token_accuracy": 0.9233282685279847, + "num_tokens": 106401153.0, + "step": 5430 + }, + { + "entropy": 0.6623747788369656, + "epoch": 0.8791919191919192, + "grad_norm": 0.36545881628990173, + "learning_rate": 2.8154011571850884e-05, + "loss": 0.2218, + "mean_token_accuracy": 0.9072560966014862, + "num_tokens": 106598360.0, + "step": 5440 + }, + { + "entropy": 0.685894726216793, + "epoch": 0.8808080808080808, + "grad_norm": 0.4281963109970093, + "learning_rate": 2.8130457372172125e-05, + "loss": 0.1909, + "mean_token_accuracy": 0.9133310407400131, + "num_tokens": 106790942.0, + "step": 5450 + }, + { + "entropy": 0.683454804122448, + "epoch": 0.8824242424242424, + "grad_norm": 0.3858519196510315, + "learning_rate": 2.8106872612895914e-05, + "loss": 0.1755, + "mean_token_accuracy": 0.9197027593851089, + "num_tokens": 106978744.0, + "step": 5460 + }, + { + "entropy": 0.6165321469306946, + "epoch": 0.8840404040404041, + "grad_norm": 0.2674444019794464, + "learning_rate": 2.8083257361821872e-05, + "loss": 0.2052, + "mean_token_accuracy": 0.9134606182575226, + "num_tokens": 107190687.0, + "step": 5470 + }, + { + "entropy": 0.6856359049677849, + "epoch": 0.8856565656565657, + "grad_norm": 0.40422874689102173, + "learning_rate": 2.8059611686837236e-05, + "loss": 0.2047, + "mean_token_accuracy": 0.9142091274261475, + "num_tokens": 107380321.0, + "step": 5480 + }, + { + "entropy": 0.674786227196455, + "epoch": 0.8872727272727273, + "grad_norm": 0.30188801884651184, + "learning_rate": 2.8035935655916723e-05, + "loss": 0.204, + "mean_token_accuracy": 0.9160023897886276, + "num_tokens": 107575323.0, + "step": 5490 + }, + { + "entropy": 0.6544559545814991, + "epoch": 0.8888888888888888, + "grad_norm": 0.6562331318855286, + "learning_rate": 2.8012229337122324e-05, + "loss": 0.1947, + "mean_token_accuracy": 0.9191029623150826, + "num_tokens": 107774878.0, + "step": 5500 + }, + { + "entropy": 0.6787748537957669, + "epoch": 0.8905050505050505, + "grad_norm": 0.27502256631851196, + "learning_rate": 2.7988492798603072e-05, + "loss": 0.1792, + "mean_token_accuracy": 0.9223506987094879, + "num_tokens": 107967851.0, + "step": 5510 + }, + { + "entropy": 0.6641766116023063, + "epoch": 0.8921212121212121, + "grad_norm": 0.442937970161438, + "learning_rate": 2.7964726108594904e-05, + "loss": 0.1962, + "mean_token_accuracy": 0.9132106631994248, + "num_tokens": 108163235.0, + "step": 5520 + }, + { + "entropy": 0.6542678773403168, + "epoch": 0.8937373737373737, + "grad_norm": 0.30644455552101135, + "learning_rate": 2.794092933542041e-05, + "loss": 0.2097, + "mean_token_accuracy": 0.9084485098719597, + "num_tokens": 108361379.0, + "step": 5530 + }, + { + "entropy": 0.671286852657795, + "epoch": 0.8953535353535353, + "grad_norm": 0.2679237723350525, + "learning_rate": 2.7917102547488676e-05, + "loss": 0.1995, + "mean_token_accuracy": 0.9114113479852677, + "num_tokens": 108554126.0, + "step": 5540 + }, + { + "entropy": 0.6369331762194633, + "epoch": 0.896969696969697, + "grad_norm": 2.665994882583618, + "learning_rate": 2.789324581329506e-05, + "loss": 0.2008, + "mean_token_accuracy": 0.9015925779938698, + "num_tokens": 108756149.0, + "step": 5550 + }, + { + "entropy": 0.6549828127026558, + "epoch": 0.8985858585858586, + "grad_norm": 0.23621822893619537, + "learning_rate": 2.786935920142102e-05, + "loss": 0.1956, + "mean_token_accuracy": 0.9176381945610046, + "num_tokens": 108955510.0, + "step": 5560 + }, + { + "entropy": 0.6628460936248303, + "epoch": 0.9002020202020202, + "grad_norm": 0.28806036710739136, + "learning_rate": 2.784544278053389e-05, + "loss": 0.199, + "mean_token_accuracy": 0.9112476229667663, + "num_tokens": 109153359.0, + "step": 5570 + }, + { + "entropy": 0.6598017416894436, + "epoch": 0.9018181818181819, + "grad_norm": 0.3448881506919861, + "learning_rate": 2.7821496619386704e-05, + "loss": 0.1925, + "mean_token_accuracy": 0.915037252008915, + "num_tokens": 109350258.0, + "step": 5580 + }, + { + "entropy": 0.6612473607063294, + "epoch": 0.9034343434343435, + "grad_norm": 0.2845333218574524, + "learning_rate": 2.7797520786817996e-05, + "loss": 0.1962, + "mean_token_accuracy": 0.9182912409305573, + "num_tokens": 109544964.0, + "step": 5590 + }, + { + "entropy": 0.6518812745809555, + "epoch": 0.9050505050505051, + "grad_norm": 0.21649111807346344, + "learning_rate": 2.7773515351751585e-05, + "loss": 0.2054, + "mean_token_accuracy": 0.9045742139220238, + "num_tokens": 109742959.0, + "step": 5600 + }, + { + "entropy": 0.6644902803003788, + "epoch": 0.9066666666666666, + "grad_norm": 0.33554190397262573, + "learning_rate": 2.774948038319641e-05, + "loss": 0.1939, + "mean_token_accuracy": 0.919256579875946, + "num_tokens": 109935296.0, + "step": 5610 + }, + { + "entropy": 0.6862445369362831, + "epoch": 0.9082828282828282, + "grad_norm": 0.2826671302318573, + "learning_rate": 2.772541595024628e-05, + "loss": 0.1894, + "mean_token_accuracy": 0.9180306404829025, + "num_tokens": 110124649.0, + "step": 5620 + }, + { + "entropy": 0.6782503843307495, + "epoch": 0.9098989898989899, + "grad_norm": 0.2841434180736542, + "learning_rate": 2.770132212207973e-05, + "loss": 0.1823, + "mean_token_accuracy": 0.9190478280186654, + "num_tokens": 110313361.0, + "step": 5630 + }, + { + "entropy": 0.6932250164449215, + "epoch": 0.9115151515151515, + "grad_norm": 0.29083821177482605, + "learning_rate": 2.767719896795979e-05, + "loss": 0.1973, + "mean_token_accuracy": 0.9196494951844215, + "num_tokens": 110501349.0, + "step": 5640 + }, + { + "entropy": 0.6889181569218635, + "epoch": 0.9131313131313131, + "grad_norm": 0.37333911657333374, + "learning_rate": 2.765304655723379e-05, + "loss": 0.1845, + "mean_token_accuracy": 0.9226689338684082, + "num_tokens": 110687222.0, + "step": 5650 + }, + { + "entropy": 0.6439741283655167, + "epoch": 0.9147474747474748, + "grad_norm": 0.4025020897388458, + "learning_rate": 2.7628864959333172e-05, + "loss": 0.2101, + "mean_token_accuracy": 0.910821832716465, + "num_tokens": 110888988.0, + "step": 5660 + }, + { + "entropy": 0.6726826570928097, + "epoch": 0.9163636363636364, + "grad_norm": 0.35746780037879944, + "learning_rate": 2.760465424377328e-05, + "loss": 0.1783, + "mean_token_accuracy": 0.9297507956624032, + "num_tokens": 111080942.0, + "step": 5670 + }, + { + "entropy": 0.6542132571339607, + "epoch": 0.917979797979798, + "grad_norm": 0.35613134503364563, + "learning_rate": 2.758041448015316e-05, + "loss": 0.2085, + "mean_token_accuracy": 0.9105470031499863, + "num_tokens": 111276582.0, + "step": 5680 + }, + { + "entropy": 0.6565234795212745, + "epoch": 0.9195959595959596, + "grad_norm": 0.3237346112728119, + "learning_rate": 2.7556145738155364e-05, + "loss": 0.1977, + "mean_token_accuracy": 0.9160599425435066, + "num_tokens": 111471036.0, + "step": 5690 + }, + { + "entropy": 0.6647991955280304, + "epoch": 0.9212121212121213, + "grad_norm": 0.2765544652938843, + "learning_rate": 2.7531848087545762e-05, + "loss": 0.1969, + "mean_token_accuracy": 0.9130775973200798, + "num_tokens": 111664540.0, + "step": 5700 + }, + { + "epoch": 0.9212121212121213, + "eval_entropy": 0.7171694939434529, + "eval_loss": 0.18071074783802032, + "eval_mean_token_accuracy": 0.9210708927512169, + "eval_num_tokens": 111664540.0, + "eval_runtime": 108.214, + "eval_samples_per_second": 9.241, + "eval_steps_per_second": 9.241, + "step": 5700 + }, + { + "entropy": 0.6711388848721981, + "epoch": 0.9228282828282828, + "grad_norm": 0.2098732590675354, + "learning_rate": 2.7507521598173307e-05, + "loss": 0.188, + "mean_token_accuracy": 0.9198487937450409, + "num_tokens": 111855354.0, + "step": 5710 + }, + { + "entropy": 0.6491121411323547, + "epoch": 0.9244444444444444, + "grad_norm": 0.24254560470581055, + "learning_rate": 2.748316633996987e-05, + "loss": 0.2103, + "mean_token_accuracy": 0.9093293219804763, + "num_tokens": 112052332.0, + "step": 5720 + }, + { + "entropy": 0.6290779180824757, + "epoch": 0.926060606060606, + "grad_norm": 0.2390562742948532, + "learning_rate": 2.745878238295001e-05, + "loss": 0.1966, + "mean_token_accuracy": 0.9128098532557487, + "num_tokens": 112258189.0, + "step": 5730 + }, + { + "entropy": 0.6869820304214954, + "epoch": 0.9276767676767677, + "grad_norm": 1.397542953491211, + "learning_rate": 2.7434369797210803e-05, + "loss": 0.1831, + "mean_token_accuracy": 0.9231995150446892, + "num_tokens": 112445227.0, + "step": 5740 + }, + { + "entropy": 0.6402010828256607, + "epoch": 0.9292929292929293, + "grad_norm": 0.2345576286315918, + "learning_rate": 2.740992865293162e-05, + "loss": 0.207, + "mean_token_accuracy": 0.9165661379694938, + "num_tokens": 112644322.0, + "step": 5750 + }, + { + "entropy": 0.6373597458004951, + "epoch": 0.9309090909090909, + "grad_norm": 0.3196069002151489, + "learning_rate": 2.7385459020373933e-05, + "loss": 0.2046, + "mean_token_accuracy": 0.9103137984871864, + "num_tokens": 112844093.0, + "step": 5760 + }, + { + "entropy": 0.6428131617605686, + "epoch": 0.9325252525252525, + "grad_norm": 1.4492580890655518, + "learning_rate": 2.736096096988109e-05, + "loss": 0.1984, + "mean_token_accuracy": 0.917301295697689, + "num_tokens": 113041205.0, + "step": 5770 + }, + { + "entropy": 0.6344161719083786, + "epoch": 0.9341414141414142, + "grad_norm": 0.3058554232120514, + "learning_rate": 2.733643457187816e-05, + "loss": 0.1981, + "mean_token_accuracy": 0.9163832649588585, + "num_tokens": 113242489.0, + "step": 5780 + }, + { + "entropy": 0.6687356859445572, + "epoch": 0.9357575757575758, + "grad_norm": 0.550967812538147, + "learning_rate": 2.731187989687169e-05, + "loss": 0.1944, + "mean_token_accuracy": 0.9172602981328964, + "num_tokens": 113432928.0, + "step": 5790 + }, + { + "entropy": 0.6460176803171634, + "epoch": 0.9373737373737374, + "grad_norm": 0.7452371716499329, + "learning_rate": 2.728729701544951e-05, + "loss": 0.2064, + "mean_token_accuracy": 0.9076276451349259, + "num_tokens": 113631585.0, + "step": 5800 + }, + { + "entropy": 0.6974510416388512, + "epoch": 0.938989898989899, + "grad_norm": 0.24986976385116577, + "learning_rate": 2.7262685998280537e-05, + "loss": 0.2043, + "mean_token_accuracy": 0.915585121512413, + "num_tokens": 113816162.0, + "step": 5810 + }, + { + "entropy": 0.6628655359148979, + "epoch": 0.9406060606060606, + "grad_norm": 0.2667369246482849, + "learning_rate": 2.723804691611459e-05, + "loss": 0.1967, + "mean_token_accuracy": 0.9135117039084435, + "num_tokens": 114010508.0, + "step": 5820 + }, + { + "entropy": 0.6733184188604355, + "epoch": 0.9422222222222222, + "grad_norm": 0.29969391226768494, + "learning_rate": 2.7213379839782144e-05, + "loss": 0.1909, + "mean_token_accuracy": 0.9115293577313424, + "num_tokens": 114202584.0, + "step": 5830 + }, + { + "entropy": 0.6488926939666271, + "epoch": 0.9438383838383838, + "grad_norm": 0.43534573912620544, + "learning_rate": 2.7188684840194158e-05, + "loss": 0.1955, + "mean_token_accuracy": 0.9078608691692353, + "num_tokens": 114401106.0, + "step": 5840 + }, + { + "entropy": 0.6797570586204529, + "epoch": 0.9454545454545454, + "grad_norm": 1.7962315082550049, + "learning_rate": 2.716396198834186e-05, + "loss": 0.2005, + "mean_token_accuracy": 0.9150436997413636, + "num_tokens": 114589015.0, + "step": 5850 + }, + { + "entropy": 0.6667471393942833, + "epoch": 0.9470707070707071, + "grad_norm": 0.4936676025390625, + "learning_rate": 2.7139211355296555e-05, + "loss": 0.1994, + "mean_token_accuracy": 0.9127405688166619, + "num_tokens": 114781132.0, + "step": 5860 + }, + { + "entropy": 0.6693764880299569, + "epoch": 0.9486868686868687, + "grad_norm": 0.26227378845214844, + "learning_rate": 2.7114433012209403e-05, + "loss": 0.1936, + "mean_token_accuracy": 0.9173475831747055, + "num_tokens": 114973668.0, + "step": 5870 + }, + { + "entropy": 0.6575251325964928, + "epoch": 0.9503030303030303, + "grad_norm": 0.5430612564086914, + "learning_rate": 2.7089627030311216e-05, + "loss": 0.1863, + "mean_token_accuracy": 0.9149983078241348, + "num_tokens": 115170150.0, + "step": 5880 + }, + { + "entropy": 0.6451747253537178, + "epoch": 0.9519191919191919, + "grad_norm": 0.9014168381690979, + "learning_rate": 2.706479348091227e-05, + "loss": 0.1887, + "mean_token_accuracy": 0.913795605301857, + "num_tokens": 115368295.0, + "step": 5890 + }, + { + "entropy": 0.6573713317513465, + "epoch": 0.9535353535353536, + "grad_norm": 0.31116336584091187, + "learning_rate": 2.7039932435402087e-05, + "loss": 0.2089, + "mean_token_accuracy": 0.9058459445834159, + "num_tokens": 115562448.0, + "step": 5900 + }, + { + "entropy": 0.674033111333847, + "epoch": 0.9551515151515152, + "grad_norm": 0.9241631031036377, + "learning_rate": 2.7015043965249235e-05, + "loss": 0.2043, + "mean_token_accuracy": 0.9132018253207207, + "num_tokens": 115752918.0, + "step": 5910 + }, + { + "entropy": 0.6824505791068077, + "epoch": 0.9567676767676768, + "grad_norm": 0.5457546710968018, + "learning_rate": 2.6990128142001117e-05, + "loss": 0.1871, + "mean_token_accuracy": 0.9239314749836922, + "num_tokens": 115944094.0, + "step": 5920 + }, + { + "entropy": 0.6651259288191795, + "epoch": 0.9583838383838383, + "grad_norm": 0.2708912491798401, + "learning_rate": 2.696518503728377e-05, + "loss": 0.1831, + "mean_token_accuracy": 0.9139492645859718, + "num_tokens": 116139742.0, + "step": 5930 + }, + { + "entropy": 0.6485146664083004, + "epoch": 0.96, + "grad_norm": 0.6327102780342102, + "learning_rate": 2.694021472280166e-05, + "loss": 0.1829, + "mean_token_accuracy": 0.9205643236637115, + "num_tokens": 116337348.0, + "step": 5940 + }, + { + "entropy": 0.6744761653244495, + "epoch": 0.9616161616161616, + "grad_norm": 0.5170925855636597, + "learning_rate": 2.691521727033746e-05, + "loss": 0.1849, + "mean_token_accuracy": 0.9192163914442062, + "num_tokens": 116528104.0, + "step": 5950 + }, + { + "entropy": 0.6350755199790001, + "epoch": 0.9632323232323232, + "grad_norm": 0.329481303691864, + "learning_rate": 2.689019275175188e-05, + "loss": 0.1839, + "mean_token_accuracy": 0.9138085007667541, + "num_tokens": 116730380.0, + "step": 5960 + }, + { + "entropy": 0.6517668567597866, + "epoch": 0.9648484848484848, + "grad_norm": 0.3096514940261841, + "learning_rate": 2.686514123898342e-05, + "loss": 0.2099, + "mean_token_accuracy": 0.913357961177826, + "num_tokens": 116928370.0, + "step": 5970 + }, + { + "entropy": 0.6759242668747902, + "epoch": 0.9664646464646465, + "grad_norm": 0.21771784126758575, + "learning_rate": 2.6840062804048187e-05, + "loss": 0.2098, + "mean_token_accuracy": 0.9091755867004394, + "num_tokens": 117117162.0, + "step": 5980 + }, + { + "entropy": 0.6561018228530884, + "epoch": 0.9680808080808081, + "grad_norm": 0.24175840616226196, + "learning_rate": 2.6814957519039685e-05, + "loss": 0.2049, + "mean_token_accuracy": 0.9110913261771202, + "num_tokens": 117315990.0, + "step": 5990 + }, + { + "entropy": 0.6435799233615398, + "epoch": 0.9696969696969697, + "grad_norm": 0.22681868076324463, + "learning_rate": 2.678982545612859e-05, + "loss": 0.2221, + "mean_token_accuracy": 0.897762194275856, + "num_tokens": 117517220.0, + "step": 6000 + }, + { + "epoch": 0.9696969696969697, + "eval_entropy": 0.718082972228527, + "eval_loss": 0.18191839754581451, + "eval_mean_token_accuracy": 0.9208413473963738, + "eval_num_tokens": 117517220.0, + "eval_runtime": 107.7777, + "eval_samples_per_second": 9.278, + "eval_steps_per_second": 9.278, + "step": 6000 + }, + { + "entropy": 0.6580222062766552, + "epoch": 0.9713131313131314, + "grad_norm": 0.21151787042617798, + "learning_rate": 2.6764666687562574e-05, + "loss": 0.2032, + "mean_token_accuracy": 0.920425595343113, + "num_tokens": 117713683.0, + "step": 6010 + }, + { + "entropy": 0.6417085506021977, + "epoch": 0.972929292929293, + "grad_norm": 0.38823527097702026, + "learning_rate": 2.6739481285666075e-05, + "loss": 0.1896, + "mean_token_accuracy": 0.9096645340323448, + "num_tokens": 117915811.0, + "step": 6020 + }, + { + "entropy": 0.6489227615296841, + "epoch": 0.9745454545454545, + "grad_norm": 0.33198609948158264, + "learning_rate": 2.671426932284009e-05, + "loss": 0.1965, + "mean_token_accuracy": 0.913950651884079, + "num_tokens": 118112545.0, + "step": 6030 + }, + { + "entropy": 0.6406939096748829, + "epoch": 0.9761616161616161, + "grad_norm": 2.0861010551452637, + "learning_rate": 2.668903087156197e-05, + "loss": 0.2113, + "mean_token_accuracy": 0.9125285148620605, + "num_tokens": 118312944.0, + "step": 6040 + }, + { + "entropy": 0.6449309058487416, + "epoch": 0.9777777777777777, + "grad_norm": 0.4663407802581787, + "learning_rate": 2.6663766004385226e-05, + "loss": 0.2039, + "mean_token_accuracy": 0.9149981364607811, + "num_tokens": 118511619.0, + "step": 6050 + }, + { + "entropy": 0.6443940982222557, + "epoch": 0.9793939393939394, + "grad_norm": 0.21795040369033813, + "learning_rate": 2.6638474793939294e-05, + "loss": 0.197, + "mean_token_accuracy": 0.9152833878993988, + "num_tokens": 118713929.0, + "step": 6060 + }, + { + "entropy": 0.6551195085048676, + "epoch": 0.981010101010101, + "grad_norm": 0.22683417797088623, + "learning_rate": 2.661315731292934e-05, + "loss": 0.1959, + "mean_token_accuracy": 0.9164972469210625, + "num_tokens": 118911555.0, + "step": 6070 + }, + { + "entropy": 0.6454559281468392, + "epoch": 0.9826262626262626, + "grad_norm": 0.20692016184329987, + "learning_rate": 2.6587813634136063e-05, + "loss": 0.1943, + "mean_token_accuracy": 0.9109898418188095, + "num_tokens": 119114755.0, + "step": 6080 + }, + { + "entropy": 0.6683458849787712, + "epoch": 0.9842424242424243, + "grad_norm": 0.2784108519554138, + "learning_rate": 2.656244383041546e-05, + "loss": 0.2057, + "mean_token_accuracy": 0.9093471348285675, + "num_tokens": 119306781.0, + "step": 6090 + }, + { + "entropy": 0.685344897210598, + "epoch": 0.9858585858585859, + "grad_norm": 0.47598379850387573, + "learning_rate": 2.6537047974698633e-05, + "loss": 0.1765, + "mean_token_accuracy": 0.9268097966909409, + "num_tokens": 119495832.0, + "step": 6100 + }, + { + "entropy": 0.6490506112575531, + "epoch": 0.9874747474747475, + "grad_norm": 0.33167073130607605, + "learning_rate": 2.651162613999158e-05, + "loss": 0.1872, + "mean_token_accuracy": 0.9159244641661644, + "num_tokens": 119693853.0, + "step": 6110 + }, + { + "entropy": 0.6624149113893509, + "epoch": 0.9890909090909091, + "grad_norm": 0.2675837576389313, + "learning_rate": 2.6486178399374978e-05, + "loss": 0.207, + "mean_token_accuracy": 0.91104756295681, + "num_tokens": 119888005.0, + "step": 6120 + }, + { + "entropy": 0.6676151186227799, + "epoch": 0.9907070707070708, + "grad_norm": 0.4086419939994812, + "learning_rate": 2.6460704826003985e-05, + "loss": 0.192, + "mean_token_accuracy": 0.9200485736131668, + "num_tokens": 120079316.0, + "step": 6130 + }, + { + "entropy": 0.6681792497634887, + "epoch": 0.9923232323232323, + "grad_norm": 0.1883692890405655, + "learning_rate": 2.6435205493108e-05, + "loss": 0.1946, + "mean_token_accuracy": 0.9129543766379357, + "num_tokens": 120271231.0, + "step": 6140 + }, + { + "entropy": 0.6727742575109005, + "epoch": 0.9939393939393939, + "grad_norm": 0.8306013941764832, + "learning_rate": 2.64096804739905e-05, + "loss": 0.1874, + "mean_token_accuracy": 0.9178223893046379, + "num_tokens": 120461022.0, + "step": 6150 + }, + { + "entropy": 0.6303834930062294, + "epoch": 0.9955555555555555, + "grad_norm": 0.36598384380340576, + "learning_rate": 2.638412984202878e-05, + "loss": 0.2026, + "mean_token_accuracy": 0.9190863534808159, + "num_tokens": 120664236.0, + "step": 6160 + }, + { + "entropy": 0.6283733353018761, + "epoch": 0.9971717171717172, + "grad_norm": 0.6193094849586487, + "learning_rate": 2.6358553670673776e-05, + "loss": 0.2024, + "mean_token_accuracy": 0.910734897851944, + "num_tokens": 120867537.0, + "step": 6170 + }, + { + "entropy": 0.6580278515815735, + "epoch": 0.9987878787878788, + "grad_norm": 0.3590032160282135, + "learning_rate": 2.6332952033449848e-05, + "loss": 0.1837, + "mean_token_accuracy": 0.916428716480732, + "num_tokens": 121061748.0, + "step": 6180 + }, + { + "entropy": 0.6319364136771152, + "epoch": 1.0003232323232323, + "grad_norm": 0.2722172737121582, + "learning_rate": 2.630732500395455e-05, + "loss": 0.1927, + "mean_token_accuracy": 0.9176036417484283, + "num_tokens": 121252607.0, + "step": 6190 + }, + { + "entropy": 0.6554017476737499, + "epoch": 1.001939393939394, + "grad_norm": 0.236890971660614, + "learning_rate": 2.6281672655858437e-05, + "loss": 0.183, + "mean_token_accuracy": 0.9223996490240097, + "num_tokens": 121447664.0, + "step": 6200 + }, + { + "entropy": 0.64825743958354, + "epoch": 1.0035555555555555, + "grad_norm": 0.281246542930603, + "learning_rate": 2.6255995062904855e-05, + "loss": 0.2065, + "mean_token_accuracy": 0.9148237839341163, + "num_tokens": 121643626.0, + "step": 6210 + }, + { + "entropy": 0.6417653873562813, + "epoch": 1.0051717171717172, + "grad_norm": 0.4624505639076233, + "learning_rate": 2.6230292298909708e-05, + "loss": 0.1869, + "mean_token_accuracy": 0.9187545537948608, + "num_tokens": 121840163.0, + "step": 6220 + }, + { + "entropy": 0.6188324891030789, + "epoch": 1.0067878787878788, + "grad_norm": 1.9852811098098755, + "learning_rate": 2.6204564437761272e-05, + "loss": 0.1893, + "mean_token_accuracy": 0.9187879309058189, + "num_tokens": 122048180.0, + "step": 6230 + }, + { + "entropy": 0.6713766917586327, + "epoch": 1.0084040404040404, + "grad_norm": 0.26395949721336365, + "learning_rate": 2.6178811553419968e-05, + "loss": 0.1929, + "mean_token_accuracy": 0.9171957895159721, + "num_tokens": 122237720.0, + "step": 6240 + }, + { + "entropy": 0.657070753723383, + "epoch": 1.010020202020202, + "grad_norm": 0.26647141575813293, + "learning_rate": 2.615303371991815e-05, + "loss": 0.1835, + "mean_token_accuracy": 0.9234165251255035, + "num_tokens": 122432536.0, + "step": 6250 + }, + { + "entropy": 0.6808216124773026, + "epoch": 1.0116363636363637, + "grad_norm": 0.25760984420776367, + "learning_rate": 2.6127231011359885e-05, + "loss": 0.1933, + "mean_token_accuracy": 0.9138244792819024, + "num_tokens": 122621444.0, + "step": 6260 + }, + { + "entropy": 0.6886962063610553, + "epoch": 1.0132525252525253, + "grad_norm": 1.3057711124420166, + "learning_rate": 2.610140350192077e-05, + "loss": 0.1942, + "mean_token_accuracy": 0.913233257830143, + "num_tokens": 122806380.0, + "step": 6270 + }, + { + "entropy": 0.6293446570634842, + "epoch": 1.014868686868687, + "grad_norm": 0.59806227684021, + "learning_rate": 2.6075551265847668e-05, + "loss": 0.2113, + "mean_token_accuracy": 0.9158453986048698, + "num_tokens": 123011023.0, + "step": 6280 + }, + { + "entropy": 0.6543448261916638, + "epoch": 1.0164848484848485, + "grad_norm": 0.2948601543903351, + "learning_rate": 2.604967437745856e-05, + "loss": 0.2066, + "mean_token_accuracy": 0.9178432136774063, + "num_tokens": 123206708.0, + "step": 6290 + }, + { + "entropy": 0.6323048695921898, + "epoch": 1.0181010101010102, + "grad_norm": 0.3663761019706726, + "learning_rate": 2.6023772911142255e-05, + "loss": 0.2188, + "mean_token_accuracy": 0.9081894502043724, + "num_tokens": 123408860.0, + "step": 6300 + }, + { + "epoch": 1.0181010101010102, + "eval_entropy": 0.7149012831747532, + "eval_loss": 0.1789740025997162, + "eval_mean_token_accuracy": 0.9207139663100242, + "eval_num_tokens": 123408860.0, + "eval_runtime": 109.427, + "eval_samples_per_second": 9.139, + "eval_steps_per_second": 9.139, + "step": 6300 + }, + { + "entropy": 0.6567400619387627, + "epoch": 1.0197171717171718, + "grad_norm": 0.2657247483730316, + "learning_rate": 2.599784694135826e-05, + "loss": 0.1917, + "mean_token_accuracy": 0.9149983212351799, + "num_tokens": 123604715.0, + "step": 6310 + }, + { + "entropy": 0.6590967506170273, + "epoch": 1.0213333333333334, + "grad_norm": 0.2769843637943268, + "learning_rate": 2.597189654263649e-05, + "loss": 0.1929, + "mean_token_accuracy": 0.9187929838895798, + "num_tokens": 123798586.0, + "step": 6320 + }, + { + "entropy": 0.6535114169120788, + "epoch": 1.022949494949495, + "grad_norm": 0.6922852993011475, + "learning_rate": 2.5945921789577096e-05, + "loss": 0.1965, + "mean_token_accuracy": 0.9142646953463555, + "num_tokens": 123995581.0, + "step": 6330 + }, + { + "entropy": 0.6440669246017933, + "epoch": 1.0245656565656565, + "grad_norm": 0.25289684534072876, + "learning_rate": 2.5919922756850243e-05, + "loss": 0.191, + "mean_token_accuracy": 0.9152944028377533, + "num_tokens": 124193656.0, + "step": 6340 + }, + { + "entropy": 0.6575203105807305, + "epoch": 1.026181818181818, + "grad_norm": 0.3846961557865143, + "learning_rate": 2.5893899519195903e-05, + "loss": 0.2014, + "mean_token_accuracy": 0.914886112511158, + "num_tokens": 124386546.0, + "step": 6350 + }, + { + "entropy": 0.6285572461783886, + "epoch": 1.0277979797979797, + "grad_norm": 0.3510042130947113, + "learning_rate": 2.58678521514236e-05, + "loss": 0.2134, + "mean_token_accuracy": 0.9082665666937828, + "num_tokens": 124591229.0, + "step": 6360 + }, + { + "entropy": 0.693169579654932, + "epoch": 1.0294141414141413, + "grad_norm": 0.5278009176254272, + "learning_rate": 2.5841780728412267e-05, + "loss": 0.174, + "mean_token_accuracy": 0.9229836270213128, + "num_tokens": 124775056.0, + "step": 6370 + }, + { + "entropy": 0.645633889734745, + "epoch": 1.031030303030303, + "grad_norm": 0.3005501627922058, + "learning_rate": 2.581568532510995e-05, + "loss": 0.189, + "mean_token_accuracy": 0.915327088534832, + "num_tokens": 124972056.0, + "step": 6380 + }, + { + "entropy": 0.6465482771396637, + "epoch": 1.0326464646464646, + "grad_norm": 0.17836037278175354, + "learning_rate": 2.578956601653365e-05, + "loss": 0.2005, + "mean_token_accuracy": 0.9092096760869026, + "num_tokens": 125169814.0, + "step": 6390 + }, + { + "entropy": 0.6293865792453289, + "epoch": 1.0342626262626262, + "grad_norm": 0.35403016209602356, + "learning_rate": 2.5763422877769105e-05, + "loss": 0.2058, + "mean_token_accuracy": 0.9146534383296967, + "num_tokens": 125374489.0, + "step": 6400 + }, + { + "entropy": 0.6683484628796578, + "epoch": 1.0358787878787878, + "grad_norm": 0.7300609946250916, + "learning_rate": 2.5737255983970523e-05, + "loss": 0.1901, + "mean_token_accuracy": 0.9273339122533798, + "num_tokens": 125564889.0, + "step": 6410 + }, + { + "entropy": 0.6528873711824417, + "epoch": 1.0374949494949495, + "grad_norm": 0.23353512585163116, + "learning_rate": 2.571106541036043e-05, + "loss": 0.2066, + "mean_token_accuracy": 0.9112766414880753, + "num_tokens": 125761240.0, + "step": 6420 + }, + { + "entropy": 0.6671967059373856, + "epoch": 1.039111111111111, + "grad_norm": 0.21339759230613708, + "learning_rate": 2.56848512322294e-05, + "loss": 0.1761, + "mean_token_accuracy": 0.9232214778661728, + "num_tokens": 125953805.0, + "step": 6430 + }, + { + "entropy": 0.6670616880059242, + "epoch": 1.0407272727272727, + "grad_norm": 0.2723545730113983, + "learning_rate": 2.5658613524935897e-05, + "loss": 0.1996, + "mean_token_accuracy": 0.9158126816153527, + "num_tokens": 126146527.0, + "step": 6440 + }, + { + "entropy": 0.6801604807376862, + "epoch": 1.0423434343434343, + "grad_norm": 0.3326534926891327, + "learning_rate": 2.5632352363905992e-05, + "loss": 0.1744, + "mean_token_accuracy": 0.9291215389966965, + "num_tokens": 126335687.0, + "step": 6450 + }, + { + "entropy": 0.640099360793829, + "epoch": 1.043959595959596, + "grad_norm": 0.16790306568145752, + "learning_rate": 2.5606067824633193e-05, + "loss": 0.1937, + "mean_token_accuracy": 0.9159406557679176, + "num_tokens": 126538790.0, + "step": 6460 + }, + { + "entropy": 0.6603272095322609, + "epoch": 1.0455757575757576, + "grad_norm": 0.22010697424411774, + "learning_rate": 2.5579759982678216e-05, + "loss": 0.182, + "mean_token_accuracy": 0.9223829731345177, + "num_tokens": 126731453.0, + "step": 6470 + }, + { + "entropy": 0.6681752309203148, + "epoch": 1.0471919191919192, + "grad_norm": 0.2423912137746811, + "learning_rate": 2.555342891366876e-05, + "loss": 0.193, + "mean_token_accuracy": 0.9237019583582878, + "num_tokens": 126924683.0, + "step": 6480 + }, + { + "entropy": 0.6576780676841736, + "epoch": 1.0488080808080809, + "grad_norm": 0.36850905418395996, + "learning_rate": 2.5527074693299307e-05, + "loss": 0.2105, + "mean_token_accuracy": 0.9122251257300377, + "num_tokens": 127123350.0, + "step": 6490 + }, + { + "entropy": 0.6331621743738651, + "epoch": 1.0504242424242425, + "grad_norm": 0.15457026660442352, + "learning_rate": 2.550069739733087e-05, + "loss": 0.2001, + "mean_token_accuracy": 0.9060256168246269, + "num_tokens": 127327771.0, + "step": 6500 + }, + { + "entropy": 0.6503718107938766, + "epoch": 1.052040404040404, + "grad_norm": 2.223614454269409, + "learning_rate": 2.5474297101590827e-05, + "loss": 0.2042, + "mean_token_accuracy": 0.9112634479999542, + "num_tokens": 127527473.0, + "step": 6510 + }, + { + "entropy": 0.6804512321949006, + "epoch": 1.0536565656565657, + "grad_norm": 0.23660320043563843, + "learning_rate": 2.5447873881972643e-05, + "loss": 0.1898, + "mean_token_accuracy": 0.9214923486113549, + "num_tokens": 127716869.0, + "step": 6520 + }, + { + "entropy": 0.6566047713160514, + "epoch": 1.0552727272727274, + "grad_norm": 0.26549968123435974, + "learning_rate": 2.542142781443571e-05, + "loss": 0.1938, + "mean_token_accuracy": 0.912337064743042, + "num_tokens": 127912264.0, + "step": 6530 + }, + { + "entropy": 0.6710340820252896, + "epoch": 1.056888888888889, + "grad_norm": 3.7274105548858643, + "learning_rate": 2.5394958975005075e-05, + "loss": 0.2002, + "mean_token_accuracy": 0.9160495191812515, + "num_tokens": 128104518.0, + "step": 6540 + }, + { + "entropy": 0.6394494399428368, + "epoch": 1.0585050505050506, + "grad_norm": 0.29730379581451416, + "learning_rate": 2.536846743977128e-05, + "loss": 0.1955, + "mean_token_accuracy": 0.9120063051581383, + "num_tokens": 128306195.0, + "step": 6550 + }, + { + "entropy": 0.6369179256260395, + "epoch": 1.060121212121212, + "grad_norm": 0.7845156788825989, + "learning_rate": 2.5341953284890086e-05, + "loss": 0.1922, + "mean_token_accuracy": 0.9154770240187645, + "num_tokens": 128507191.0, + "step": 6560 + }, + { + "entropy": 0.6614749670028687, + "epoch": 1.0617373737373736, + "grad_norm": 0.5670087337493896, + "learning_rate": 2.531541658658229e-05, + "loss": 0.1766, + "mean_token_accuracy": 0.9220707610249519, + "num_tokens": 128699358.0, + "step": 6570 + }, + { + "entropy": 0.6655206590890884, + "epoch": 1.0633535353535353, + "grad_norm": 0.212821364402771, + "learning_rate": 2.528885742113349e-05, + "loss": 0.1884, + "mean_token_accuracy": 0.914807352423668, + "num_tokens": 128891949.0, + "step": 6580 + }, + { + "entropy": 0.6582912176847457, + "epoch": 1.064969696969697, + "grad_norm": 0.7031955718994141, + "learning_rate": 2.5262275864893865e-05, + "loss": 0.2008, + "mean_token_accuracy": 0.9075666755437851, + "num_tokens": 129087041.0, + "step": 6590 + }, + { + "entropy": 0.6500703394412994, + "epoch": 1.0665858585858585, + "grad_norm": 0.43128007650375366, + "learning_rate": 2.5235671994277984e-05, + "loss": 0.1903, + "mean_token_accuracy": 0.9136385723948479, + "num_tokens": 129284757.0, + "step": 6600 + }, + { + "epoch": 1.0665858585858585, + "eval_entropy": 0.7165414272248745, + "eval_loss": 0.1776740849018097, + "eval_mean_token_accuracy": 0.9204751133918763, + "eval_num_tokens": 129284757.0, + "eval_runtime": 107.5492, + "eval_samples_per_second": 9.298, + "eval_steps_per_second": 9.298, + "step": 6600 + }, + { + "entropy": 0.6414588749408722, + "epoch": 1.0682020202020202, + "grad_norm": 0.3216888904571533, + "learning_rate": 2.520904588576453e-05, + "loss": 0.2099, + "mean_token_accuracy": 0.9127737268805504, + "num_tokens": 129484015.0, + "step": 6610 + }, + { + "entropy": 0.6716010585427284, + "epoch": 1.0698181818181818, + "grad_norm": 2.989790439605713, + "learning_rate": 2.518239761589615e-05, + "loss": 0.1904, + "mean_token_accuracy": 0.9218545705080032, + "num_tokens": 129677753.0, + "step": 6620 + }, + { + "entropy": 0.6624714002013207, + "epoch": 1.0714343434343434, + "grad_norm": 1.18606436252594, + "learning_rate": 2.515572726127917e-05, + "loss": 0.1784, + "mean_token_accuracy": 0.9170515671372413, + "num_tokens": 129874528.0, + "step": 6630 + }, + { + "entropy": 0.6804944217205048, + "epoch": 1.073050505050505, + "grad_norm": 0.28340503573417664, + "learning_rate": 2.512903489858342e-05, + "loss": 0.1812, + "mean_token_accuracy": 0.9200245469808579, + "num_tokens": 130064215.0, + "step": 6640 + }, + { + "entropy": 0.6577471405267715, + "epoch": 1.0746666666666667, + "grad_norm": 1.1137200593948364, + "learning_rate": 2.510232060454199e-05, + "loss": 0.185, + "mean_token_accuracy": 0.9237975597381591, + "num_tokens": 130258589.0, + "step": 6650 + }, + { + "entropy": 0.653445017337799, + "epoch": 1.0762828282828283, + "grad_norm": 0.24963843822479248, + "learning_rate": 2.5075584455951016e-05, + "loss": 0.2006, + "mean_token_accuracy": 0.9116239801049233, + "num_tokens": 130455050.0, + "step": 6660 + }, + { + "entropy": 0.6773878857493401, + "epoch": 1.07789898989899, + "grad_norm": 0.2457326203584671, + "learning_rate": 2.5048826529669466e-05, + "loss": 0.1698, + "mean_token_accuracy": 0.9252761200070381, + "num_tokens": 130643260.0, + "step": 6670 + }, + { + "entropy": 0.6505807377398014, + "epoch": 1.0795151515151515, + "grad_norm": 0.30657973885536194, + "learning_rate": 2.5022046902618903e-05, + "loss": 0.1933, + "mean_token_accuracy": 0.917384472489357, + "num_tokens": 130841384.0, + "step": 6680 + }, + { + "entropy": 0.6616685807704925, + "epoch": 1.0811313131313132, + "grad_norm": 0.25688493251800537, + "learning_rate": 2.499524565178328e-05, + "loss": 0.1969, + "mean_token_accuracy": 0.922076341509819, + "num_tokens": 131039846.0, + "step": 6690 + }, + { + "entropy": 0.6403779342770577, + "epoch": 1.0827474747474748, + "grad_norm": 0.24423788487911224, + "learning_rate": 2.4968422854208715e-05, + "loss": 0.204, + "mean_token_accuracy": 0.905488808453083, + "num_tokens": 131240325.0, + "step": 6700 + }, + { + "entropy": 0.6553829908370972, + "epoch": 1.0843636363636364, + "grad_norm": 0.2003633677959442, + "learning_rate": 2.4941578587003267e-05, + "loss": 0.1927, + "mean_token_accuracy": 0.9127710714936257, + "num_tokens": 131436899.0, + "step": 6710 + }, + { + "entropy": 0.7037200927734375, + "epoch": 1.085979797979798, + "grad_norm": 0.2803226113319397, + "learning_rate": 2.4914712927336702e-05, + "loss": 0.1845, + "mean_token_accuracy": 0.9239676401019097, + "num_tokens": 131621236.0, + "step": 6720 + }, + { + "entropy": 0.6559601046144963, + "epoch": 1.0875959595959597, + "grad_norm": 0.24596357345581055, + "learning_rate": 2.4887825952440296e-05, + "loss": 0.1838, + "mean_token_accuracy": 0.9203213959932327, + "num_tokens": 131816568.0, + "step": 6730 + }, + { + "entropy": 0.6305412597954273, + "epoch": 1.0892121212121213, + "grad_norm": 0.49728894233703613, + "learning_rate": 2.4860917739606592e-05, + "loss": 0.2032, + "mean_token_accuracy": 0.9125052660703659, + "num_tokens": 132022157.0, + "step": 6740 + }, + { + "entropy": 0.6523553773760795, + "epoch": 1.090828282828283, + "grad_norm": 0.33660855889320374, + "learning_rate": 2.4833988366189203e-05, + "loss": 0.193, + "mean_token_accuracy": 0.9162264108657837, + "num_tokens": 132219642.0, + "step": 6750 + }, + { + "entropy": 0.6663720726966857, + "epoch": 1.0924444444444443, + "grad_norm": 0.28930526971817017, + "learning_rate": 2.4807037909602542e-05, + "loss": 0.1978, + "mean_token_accuracy": 0.9147111624479294, + "num_tokens": 132412588.0, + "step": 6760 + }, + { + "entropy": 0.6918739549815655, + "epoch": 1.094060606060606, + "grad_norm": 0.6057730317115784, + "learning_rate": 2.478006644732166e-05, + "loss": 0.1862, + "mean_token_accuracy": 0.9180864945054055, + "num_tokens": 132599244.0, + "step": 6770 + }, + { + "entropy": 0.6754172123968601, + "epoch": 1.0956767676767676, + "grad_norm": 0.20356015861034393, + "learning_rate": 2.475307405688199e-05, + "loss": 0.1993, + "mean_token_accuracy": 0.9170787900686264, + "num_tokens": 132789658.0, + "step": 6780 + }, + { + "entropy": 0.675231696665287, + "epoch": 1.0972929292929292, + "grad_norm": 0.31911930441856384, + "learning_rate": 2.47260608158791e-05, + "loss": 0.1791, + "mean_token_accuracy": 0.9208205997943878, + "num_tokens": 132980503.0, + "step": 6790 + }, + { + "entropy": 0.6837145708501339, + "epoch": 1.0989090909090908, + "grad_norm": 0.1739879846572876, + "learning_rate": 2.469902680196853e-05, + "loss": 0.1886, + "mean_token_accuracy": 0.9194065168499946, + "num_tokens": 133167839.0, + "step": 6800 + }, + { + "entropy": 0.6514686979353428, + "epoch": 1.1005252525252525, + "grad_norm": 0.2011003941297531, + "learning_rate": 2.4671972092865518e-05, + "loss": 0.1849, + "mean_token_accuracy": 0.9203078612685204, + "num_tokens": 133366242.0, + "step": 6810 + }, + { + "entropy": 0.652017392218113, + "epoch": 1.102141414141414, + "grad_norm": 1.491286039352417, + "learning_rate": 2.4644896766344803e-05, + "loss": 0.1895, + "mean_token_accuracy": 0.9165545701980591, + "num_tokens": 133565786.0, + "step": 6820 + }, + { + "entropy": 0.6563273631036282, + "epoch": 1.1037575757575757, + "grad_norm": 0.20642946660518646, + "learning_rate": 2.461780090024039e-05, + "loss": 0.2086, + "mean_token_accuracy": 0.9042300581932068, + "num_tokens": 133762796.0, + "step": 6830 + }, + { + "entropy": 0.6549685873091221, + "epoch": 1.1053737373737373, + "grad_norm": 0.34660500288009644, + "learning_rate": 2.459068457244533e-05, + "loss": 0.209, + "mean_token_accuracy": 0.9060993060469628, + "num_tokens": 133958518.0, + "step": 6840 + }, + { + "entropy": 0.692388217151165, + "epoch": 1.106989898989899, + "grad_norm": 0.1916242241859436, + "learning_rate": 2.4563547860911495e-05, + "loss": 0.192, + "mean_token_accuracy": 0.9103460937738419, + "num_tokens": 134142419.0, + "step": 6850 + }, + { + "entropy": 0.6678758606314659, + "epoch": 1.1086060606060606, + "grad_norm": 0.26181909441947937, + "learning_rate": 2.453639084364936e-05, + "loss": 0.2012, + "mean_token_accuracy": 0.9134058818221092, + "num_tokens": 134334970.0, + "step": 6860 + }, + { + "entropy": 0.6088161051273346, + "epoch": 1.1102222222222222, + "grad_norm": 0.28219732642173767, + "learning_rate": 2.450921359872776e-05, + "loss": 0.2057, + "mean_token_accuracy": 0.9087446928024292, + "num_tokens": 134548254.0, + "step": 6870 + }, + { + "entropy": 0.6716838963329792, + "epoch": 1.1118383838383838, + "grad_norm": 0.3689417243003845, + "learning_rate": 2.44820162042737e-05, + "loss": 0.1966, + "mean_token_accuracy": 0.9242041930556297, + "num_tokens": 134738981.0, + "step": 6880 + }, + { + "entropy": 0.6553657658398151, + "epoch": 1.1134545454545455, + "grad_norm": 0.25713449716567993, + "learning_rate": 2.4454798738472085e-05, + "loss": 0.1819, + "mean_token_accuracy": 0.9258597359061241, + "num_tokens": 134935804.0, + "step": 6890 + }, + { + "entropy": 0.639024917781353, + "epoch": 1.115070707070707, + "grad_norm": 0.22210142016410828, + "learning_rate": 2.4427561279565535e-05, + "loss": 0.1986, + "mean_token_accuracy": 0.9139626592397689, + "num_tokens": 135136821.0, + "step": 6900 + }, + { + "epoch": 1.115070707070707, + "eval_entropy": 0.7123623175621032, + "eval_loss": 0.17761555314064026, + "eval_mean_token_accuracy": 0.9209292259812355, + "eval_num_tokens": 135136821.0, + "eval_runtime": 107.7787, + "eval_samples_per_second": 9.278, + "eval_steps_per_second": 9.278, + "step": 6900 + }, + { + "entropy": 0.6710393786430359, + "epoch": 1.1166868686868687, + "grad_norm": 0.24489854276180267, + "learning_rate": 2.4400303905854143e-05, + "loss": 0.1739, + "mean_token_accuracy": 0.9210480153560638, + "num_tokens": 135327176.0, + "step": 6910 + }, + { + "entropy": 0.6592464700341225, + "epoch": 1.1183030303030304, + "grad_norm": 0.2485123574733734, + "learning_rate": 2.437302669569525e-05, + "loss": 0.2069, + "mean_token_accuracy": 0.9071068927645684, + "num_tokens": 135522817.0, + "step": 6920 + }, + { + "entropy": 0.6490606270730496, + "epoch": 1.119919191919192, + "grad_norm": 0.2856655418872833, + "learning_rate": 2.434572972750322e-05, + "loss": 0.201, + "mean_token_accuracy": 0.9150787323713303, + "num_tokens": 135720129.0, + "step": 6930 + }, + { + "entropy": 0.6843857243657112, + "epoch": 1.1215353535353536, + "grad_norm": 0.2778134346008301, + "learning_rate": 2.4318413079749212e-05, + "loss": 0.1906, + "mean_token_accuracy": 0.9118962764739991, + "num_tokens": 135908000.0, + "step": 6940 + }, + { + "entropy": 0.6615097917616367, + "epoch": 1.1231515151515152, + "grad_norm": 0.2848396897315979, + "learning_rate": 2.429107683096097e-05, + "loss": 0.1981, + "mean_token_accuracy": 0.9107768446207046, + "num_tokens": 136099920.0, + "step": 6950 + }, + { + "entropy": 0.6721059113740921, + "epoch": 1.1247676767676769, + "grad_norm": 0.2825098931789398, + "learning_rate": 2.426372105972258e-05, + "loss": 0.1896, + "mean_token_accuracy": 0.9258805438876152, + "num_tokens": 136291929.0, + "step": 6960 + }, + { + "entropy": 0.6725196681916714, + "epoch": 1.1263838383838385, + "grad_norm": 0.22564974427223206, + "learning_rate": 2.4236345844674235e-05, + "loss": 0.1981, + "mean_token_accuracy": 0.9194081515073776, + "num_tokens": 136486610.0, + "step": 6970 + }, + { + "entropy": 0.655806089937687, + "epoch": 1.1280000000000001, + "grad_norm": 0.19149194657802582, + "learning_rate": 2.4208951264512046e-05, + "loss": 0.1996, + "mean_token_accuracy": 0.9169903472065926, + "num_tokens": 136683848.0, + "step": 6980 + }, + { + "entropy": 0.6414538450539112, + "epoch": 1.1296161616161617, + "grad_norm": 0.21564777195453644, + "learning_rate": 2.4181537397987785e-05, + "loss": 0.1901, + "mean_token_accuracy": 0.9169015690684319, + "num_tokens": 136886358.0, + "step": 6990 + }, + { + "entropy": 0.6110823594033719, + "epoch": 1.1312323232323231, + "grad_norm": 0.3679966628551483, + "learning_rate": 2.415410432390866e-05, + "loss": 0.2059, + "mean_token_accuracy": 0.9113464534282685, + "num_tokens": 137095694.0, + "step": 7000 + }, + { + "entropy": 0.6325623497366906, + "epoch": 1.1328484848484848, + "grad_norm": 0.2635299563407898, + "learning_rate": 2.41266521211371e-05, + "loss": 0.198, + "mean_token_accuracy": 0.9193664342164993, + "num_tokens": 137299226.0, + "step": 7010 + }, + { + "entropy": 0.647395196557045, + "epoch": 1.1344646464646464, + "grad_norm": 0.3164098262786865, + "learning_rate": 2.409918086859054e-05, + "loss": 0.1898, + "mean_token_accuracy": 0.9130165934562683, + "num_tokens": 137496269.0, + "step": 7020 + }, + { + "entropy": 0.6482592269778251, + "epoch": 1.136080808080808, + "grad_norm": 0.25383561849594116, + "learning_rate": 2.4071690645241142e-05, + "loss": 0.1946, + "mean_token_accuracy": 0.910372956097126, + "num_tokens": 137695852.0, + "step": 7030 + }, + { + "entropy": 0.6379775069653988, + "epoch": 1.1376969696969697, + "grad_norm": 0.31261879205703735, + "learning_rate": 2.404418153011564e-05, + "loss": 0.1975, + "mean_token_accuracy": 0.9110258087515831, + "num_tokens": 137897409.0, + "step": 7040 + }, + { + "entropy": 0.6479882016777992, + "epoch": 1.1393131313131313, + "grad_norm": 0.1875302940607071, + "learning_rate": 2.401665360229504e-05, + "loss": 0.1932, + "mean_token_accuracy": 0.9154945403337479, + "num_tokens": 138093818.0, + "step": 7050 + }, + { + "entropy": 0.6449267700314522, + "epoch": 1.140929292929293, + "grad_norm": 0.19978782534599304, + "learning_rate": 2.398910694091448e-05, + "loss": 0.1761, + "mean_token_accuracy": 0.91689632833004, + "num_tokens": 138293505.0, + "step": 7060 + }, + { + "entropy": 0.632693299651146, + "epoch": 1.1425454545454545, + "grad_norm": 0.22115358710289001, + "learning_rate": 2.3961541625162895e-05, + "loss": 0.1771, + "mean_token_accuracy": 0.9192465275526047, + "num_tokens": 138494339.0, + "step": 7070 + }, + { + "entropy": 0.6531041838228703, + "epoch": 1.1441616161616162, + "grad_norm": 0.18521378934383392, + "learning_rate": 2.393395773428289e-05, + "loss": 0.1976, + "mean_token_accuracy": 0.9157662749290466, + "num_tokens": 138690939.0, + "step": 7080 + }, + { + "entropy": 0.6616611868143082, + "epoch": 1.1457777777777778, + "grad_norm": 0.5136955976486206, + "learning_rate": 2.3906355347570438e-05, + "loss": 0.2008, + "mean_token_accuracy": 0.9052021831274033, + "num_tokens": 138886124.0, + "step": 7090 + }, + { + "entropy": 0.6661837339401245, + "epoch": 1.1473939393939394, + "grad_norm": 0.18936645984649658, + "learning_rate": 2.3878734544374708e-05, + "loss": 0.1767, + "mean_token_accuracy": 0.919096226990223, + "num_tokens": 139080717.0, + "step": 7100 + }, + { + "entropy": 0.6578754298388958, + "epoch": 1.149010101010101, + "grad_norm": 0.2449623942375183, + "learning_rate": 2.3851095404097795e-05, + "loss": 0.1888, + "mean_token_accuracy": 0.9126079857349396, + "num_tokens": 139275344.0, + "step": 7110 + }, + { + "entropy": 0.6601323708891869, + "epoch": 1.1506262626262627, + "grad_norm": 0.23257870972156525, + "learning_rate": 2.3823438006194505e-05, + "loss": 0.1995, + "mean_token_accuracy": 0.9094919070601464, + "num_tokens": 139467719.0, + "step": 7120 + }, + { + "entropy": 0.664834751188755, + "epoch": 1.1522424242424243, + "grad_norm": 0.24553577601909637, + "learning_rate": 2.3795762430172146e-05, + "loss": 0.2118, + "mean_token_accuracy": 0.9074599102139473, + "num_tokens": 139659844.0, + "step": 7130 + }, + { + "entropy": 0.6728641912341118, + "epoch": 1.153858585858586, + "grad_norm": 0.19290964305400848, + "learning_rate": 2.3768068755590265e-05, + "loss": 0.195, + "mean_token_accuracy": 0.9208357721567154, + "num_tokens": 139849410.0, + "step": 7140 + }, + { + "entropy": 0.6649326175451279, + "epoch": 1.1554747474747475, + "grad_norm": 0.3070566952228546, + "learning_rate": 2.374035706206045e-05, + "loss": 0.199, + "mean_token_accuracy": 0.9067992687225341, + "num_tokens": 140042165.0, + "step": 7150 + }, + { + "entropy": 0.6905647456645966, + "epoch": 1.1570909090909092, + "grad_norm": 0.18495376408100128, + "learning_rate": 2.3712627429246083e-05, + "loss": 0.1788, + "mean_token_accuracy": 0.9231453701853752, + "num_tokens": 140227677.0, + "step": 7160 + }, + { + "entropy": 0.647196751832962, + "epoch": 1.1587070707070708, + "grad_norm": 0.19866421818733215, + "learning_rate": 2.368487993686212e-05, + "loss": 0.1929, + "mean_token_accuracy": 0.9224782362580299, + "num_tokens": 140424940.0, + "step": 7170 + }, + { + "entropy": 0.6779814459383487, + "epoch": 1.1603232323232322, + "grad_norm": 0.3933802843093872, + "learning_rate": 2.3657114664674854e-05, + "loss": 0.1819, + "mean_token_accuracy": 0.9179326862096786, + "num_tokens": 140615273.0, + "step": 7180 + }, + { + "entropy": 0.6495263047516346, + "epoch": 1.1619393939393938, + "grad_norm": 0.3412734568119049, + "learning_rate": 2.3629331692501692e-05, + "loss": 0.1896, + "mean_token_accuracy": 0.914878611266613, + "num_tokens": 140811787.0, + "step": 7190 + }, + { + "entropy": 0.6819436699151993, + "epoch": 1.1635555555555555, + "grad_norm": 0.2559371888637543, + "learning_rate": 2.360153110021092e-05, + "loss": 0.1862, + "mean_token_accuracy": 0.9186094254255295, + "num_tokens": 140998555.0, + "step": 7200 + }, + { + "epoch": 1.1635555555555555, + "eval_entropy": 0.7138508333265782, + "eval_loss": 0.17618873715400696, + "eval_mean_token_accuracy": 0.9209619098305702, + "eval_num_tokens": 140998555.0, + "eval_runtime": 108.408, + "eval_samples_per_second": 9.224, + "eval_steps_per_second": 9.224, + "step": 7200 + }, + { + "entropy": 0.6654595993459225, + "epoch": 1.165171717171717, + "grad_norm": 0.6554359793663025, + "learning_rate": 2.357371296772149e-05, + "loss": 0.1827, + "mean_token_accuracy": 0.9218516707420349, + "num_tokens": 141192711.0, + "step": 7210 + }, + { + "entropy": 0.6439177483320236, + "epoch": 1.1667878787878787, + "grad_norm": 0.28542885184288025, + "learning_rate": 2.3545877375002758e-05, + "loss": 0.2051, + "mean_token_accuracy": 0.9158839777112007, + "num_tokens": 141392304.0, + "step": 7220 + }, + { + "entropy": 0.6547999024391175, + "epoch": 1.1684040404040403, + "grad_norm": 0.29079747200012207, + "learning_rate": 2.3518024402074285e-05, + "loss": 0.1878, + "mean_token_accuracy": 0.9213658019900322, + "num_tokens": 141585982.0, + "step": 7230 + }, + { + "entropy": 0.631399916857481, + "epoch": 1.170020202020202, + "grad_norm": 0.19328245520591736, + "learning_rate": 2.349015412900559e-05, + "loss": 0.2094, + "mean_token_accuracy": 0.9080318629741668, + "num_tokens": 141784861.0, + "step": 7240 + }, + { + "entropy": 0.6290282666683197, + "epoch": 1.1716363636363636, + "grad_norm": 0.24472235143184662, + "learning_rate": 2.3462266635915932e-05, + "loss": 0.1855, + "mean_token_accuracy": 0.9161780208349228, + "num_tokens": 141987990.0, + "step": 7250 + }, + { + "entropy": 0.6455445989966393, + "epoch": 1.1732525252525252, + "grad_norm": 0.22559289634227753, + "learning_rate": 2.3434362002974078e-05, + "loss": 0.2115, + "mean_token_accuracy": 0.9121286839246749, + "num_tokens": 142185404.0, + "step": 7260 + }, + { + "entropy": 0.6576178468763828, + "epoch": 1.1748686868686868, + "grad_norm": 0.18964362144470215, + "learning_rate": 2.340644031039804e-05, + "loss": 0.1778, + "mean_token_accuracy": 0.928297358751297, + "num_tokens": 142379048.0, + "step": 7270 + }, + { + "entropy": 0.6815750733017921, + "epoch": 1.1764848484848485, + "grad_norm": 0.3642102777957916, + "learning_rate": 2.3378501638454905e-05, + "loss": 0.1947, + "mean_token_accuracy": 0.9149638995528221, + "num_tokens": 142564772.0, + "step": 7280 + }, + { + "entropy": 0.6569184005260468, + "epoch": 1.17810101010101, + "grad_norm": 0.25654488801956177, + "learning_rate": 2.3350546067460542e-05, + "loss": 0.1929, + "mean_token_accuracy": 0.9159114107489585, + "num_tokens": 142758169.0, + "step": 7290 + }, + { + "entropy": 0.6607706993818283, + "epoch": 1.1797171717171717, + "grad_norm": 0.1952294409275055, + "learning_rate": 2.332257367777943e-05, + "loss": 0.1793, + "mean_token_accuracy": 0.922367176413536, + "num_tokens": 142949852.0, + "step": 7300 + }, + { + "entropy": 0.6690223574638366, + "epoch": 1.1813333333333333, + "grad_norm": 0.22762024402618408, + "learning_rate": 2.3294584549824365e-05, + "loss": 0.2005, + "mean_token_accuracy": 0.9172046884894371, + "num_tokens": 143137499.0, + "step": 7310 + }, + { + "entropy": 0.6326002225279808, + "epoch": 1.182949494949495, + "grad_norm": 0.3268709182739258, + "learning_rate": 2.3266578764056283e-05, + "loss": 0.1946, + "mean_token_accuracy": 0.9179033607244491, + "num_tokens": 143337556.0, + "step": 7320 + }, + { + "entropy": 0.6689287826418877, + "epoch": 1.1845656565656566, + "grad_norm": 0.5372002720832825, + "learning_rate": 2.3238556400984002e-05, + "loss": 0.1666, + "mean_token_accuracy": 0.928965862095356, + "num_tokens": 143525422.0, + "step": 7330 + }, + { + "entropy": 0.6906199634075165, + "epoch": 1.1861818181818182, + "grad_norm": 0.19594059884548187, + "learning_rate": 2.321051754116399e-05, + "loss": 0.177, + "mean_token_accuracy": 0.9256522223353386, + "num_tokens": 143710200.0, + "step": 7340 + }, + { + "entropy": 0.6540770314633846, + "epoch": 1.1877979797979799, + "grad_norm": 0.3353155255317688, + "learning_rate": 2.318246226520015e-05, + "loss": 0.2016, + "mean_token_accuracy": 0.908722198009491, + "num_tokens": 143904760.0, + "step": 7350 + }, + { + "entropy": 0.6404020361602306, + "epoch": 1.1894141414141415, + "grad_norm": 0.2372790426015854, + "learning_rate": 2.3154390653743558e-05, + "loss": 0.1851, + "mean_token_accuracy": 0.9156206473708153, + "num_tokens": 144104075.0, + "step": 7360 + }, + { + "entropy": 0.6292147316038609, + "epoch": 1.191030303030303, + "grad_norm": 0.2508028745651245, + "learning_rate": 2.3126302787492272e-05, + "loss": 0.205, + "mean_token_accuracy": 0.9077795252203942, + "num_tokens": 144306681.0, + "step": 7370 + }, + { + "entropy": 0.6434667043387889, + "epoch": 1.1926464646464647, + "grad_norm": 0.25540539622306824, + "learning_rate": 2.3098198747191053e-05, + "loss": 0.2227, + "mean_token_accuracy": 0.9046233102679253, + "num_tokens": 144505803.0, + "step": 7380 + }, + { + "entropy": 0.6272190093994141, + "epoch": 1.1942626262626264, + "grad_norm": 0.20317929983139038, + "learning_rate": 2.307007861363118e-05, + "loss": 0.2064, + "mean_token_accuracy": 0.9116634547710418, + "num_tokens": 144707799.0, + "step": 7390 + }, + { + "entropy": 0.6925327144563198, + "epoch": 1.195878787878788, + "grad_norm": 0.24650558829307556, + "learning_rate": 2.3041942467650182e-05, + "loss": 0.1892, + "mean_token_accuracy": 0.9244462102651596, + "num_tokens": 144891128.0, + "step": 7400 + }, + { + "entropy": 0.6295531697571277, + "epoch": 1.1974949494949496, + "grad_norm": 0.2404937744140625, + "learning_rate": 2.3013790390131635e-05, + "loss": 0.1909, + "mean_token_accuracy": 0.9272438645362854, + "num_tokens": 145093402.0, + "step": 7410 + }, + { + "entropy": 0.6406836315989495, + "epoch": 1.199111111111111, + "grad_norm": 0.2630082964897156, + "learning_rate": 2.2985622462004894e-05, + "loss": 0.2024, + "mean_token_accuracy": 0.9157192766666412, + "num_tokens": 145292219.0, + "step": 7420 + }, + { + "entropy": 0.6289153560996056, + "epoch": 1.2007272727272726, + "grad_norm": 0.3908150792121887, + "learning_rate": 2.2957438764244894e-05, + "loss": 0.1821, + "mean_token_accuracy": 0.916665130853653, + "num_tokens": 145492163.0, + "step": 7430 + }, + { + "entropy": 0.6399331271648407, + "epoch": 1.2023434343434343, + "grad_norm": 1.9838895797729492, + "learning_rate": 2.292923937787189e-05, + "loss": 0.194, + "mean_token_accuracy": 0.9150435760617256, + "num_tokens": 145690169.0, + "step": 7440 + }, + { + "entropy": 0.6503728218376637, + "epoch": 1.203959595959596, + "grad_norm": 0.4047127366065979, + "learning_rate": 2.2901024383951265e-05, + "loss": 0.2091, + "mean_token_accuracy": 0.9137666761875153, + "num_tokens": 145884192.0, + "step": 7450 + }, + { + "entropy": 0.6296872481703758, + "epoch": 1.2055757575757575, + "grad_norm": 0.7709885239601135, + "learning_rate": 2.287279386359323e-05, + "loss": 0.1875, + "mean_token_accuracy": 0.9288541987538338, + "num_tokens": 146084734.0, + "step": 7460 + }, + { + "entropy": 0.6353645235300064, + "epoch": 1.2071919191919191, + "grad_norm": 0.3050624132156372, + "learning_rate": 2.2844547897952655e-05, + "loss": 0.1885, + "mean_token_accuracy": 0.9098729580640793, + "num_tokens": 146284306.0, + "step": 7470 + }, + { + "entropy": 0.6429247371852398, + "epoch": 1.2088080808080808, + "grad_norm": 0.5625445246696472, + "learning_rate": 2.2816286568228812e-05, + "loss": 0.1978, + "mean_token_accuracy": 0.914405246078968, + "num_tokens": 146482486.0, + "step": 7480 + }, + { + "entropy": 0.6462002150714398, + "epoch": 1.2104242424242424, + "grad_norm": 0.5488247871398926, + "learning_rate": 2.2788009955665133e-05, + "loss": 0.2028, + "mean_token_accuracy": 0.9175966009497643, + "num_tokens": 146678318.0, + "step": 7490 + }, + { + "entropy": 0.6897264942526817, + "epoch": 1.212040404040404, + "grad_norm": 3.5190200805664062, + "learning_rate": 2.2759718141548985e-05, + "loss": 0.2213, + "mean_token_accuracy": 0.9151095658540725, + "num_tokens": 146861526.0, + "step": 7500 + }, + { + "epoch": 1.212040404040404, + "eval_entropy": 0.7134856638014316, + "eval_loss": 0.1802089363336563, + "eval_mean_token_accuracy": 0.9210226243138313, + "eval_num_tokens": 146861526.0, + "eval_runtime": 107.8276, + "eval_samples_per_second": 9.274, + "eval_steps_per_second": 9.274, + "step": 7500 + }, + { + "entropy": 0.6661449395120144, + "epoch": 1.2136565656565657, + "grad_norm": 0.746952474117279, + "learning_rate": 2.2731411207211436e-05, + "loss": 0.1967, + "mean_token_accuracy": 0.9125927239656448, + "num_tokens": 147053824.0, + "step": 7510 + }, + { + "entropy": 0.6528176456689835, + "epoch": 1.2152727272727273, + "grad_norm": 0.24343125522136688, + "learning_rate": 2.270308923402702e-05, + "loss": 0.1841, + "mean_token_accuracy": 0.9207581833004952, + "num_tokens": 147250379.0, + "step": 7520 + }, + { + "entropy": 0.6335774019360543, + "epoch": 1.216888888888889, + "grad_norm": 0.19108474254608154, + "learning_rate": 2.26747523034135e-05, + "loss": 0.2007, + "mean_token_accuracy": 0.9169000715017319, + "num_tokens": 147452752.0, + "step": 7530 + }, + { + "entropy": 0.6925512567162514, + "epoch": 1.2185050505050505, + "grad_norm": 1.3014295101165771, + "learning_rate": 2.264640049683165e-05, + "loss": 0.1876, + "mean_token_accuracy": 0.9175386264920234, + "num_tokens": 147638076.0, + "step": 7540 + }, + { + "entropy": 0.6489442780613899, + "epoch": 1.2201212121212122, + "grad_norm": 0.24760295450687408, + "learning_rate": 2.2618033895784995e-05, + "loss": 0.1883, + "mean_token_accuracy": 0.9158783406019211, + "num_tokens": 147836030.0, + "step": 7550 + }, + { + "entropy": 0.649086344242096, + "epoch": 1.2217373737373738, + "grad_norm": 0.4136185050010681, + "learning_rate": 2.2589652581819586e-05, + "loss": 0.1962, + "mean_token_accuracy": 0.9124460712075233, + "num_tokens": 148034420.0, + "step": 7560 + }, + { + "entropy": 0.6605263993144035, + "epoch": 1.2233535353535354, + "grad_norm": 0.27574583888053894, + "learning_rate": 2.2561256636523796e-05, + "loss": 0.192, + "mean_token_accuracy": 0.905976240336895, + "num_tokens": 148229801.0, + "step": 7570 + }, + { + "entropy": 0.6664862349629402, + "epoch": 1.224969696969697, + "grad_norm": 0.25330832600593567, + "learning_rate": 2.2532846141528023e-05, + "loss": 0.1918, + "mean_token_accuracy": 0.9155817702412605, + "num_tokens": 148424374.0, + "step": 7580 + }, + { + "entropy": 0.6453304678201676, + "epoch": 1.2265858585858587, + "grad_norm": 0.48292797803878784, + "learning_rate": 2.2504421178504528e-05, + "loss": 0.2119, + "mean_token_accuracy": 0.9098923563957214, + "num_tokens": 148624497.0, + "step": 7590 + }, + { + "entropy": 0.6781962931156158, + "epoch": 1.2282020202020203, + "grad_norm": 0.26671740412712097, + "learning_rate": 2.2475981829167126e-05, + "loss": 0.1897, + "mean_token_accuracy": 0.9188299715518952, + "num_tokens": 148811307.0, + "step": 7600 + }, + { + "entropy": 0.6696497678756714, + "epoch": 1.2298181818181817, + "grad_norm": 0.21028929948806763, + "learning_rate": 2.244752817527102e-05, + "loss": 0.1834, + "mean_token_accuracy": 0.9069130301475525, + "num_tokens": 149001143.0, + "step": 7610 + }, + { + "entropy": 0.6347329325973987, + "epoch": 1.2314343434343433, + "grad_norm": 0.24588893353939056, + "learning_rate": 2.241906029861251e-05, + "loss": 0.181, + "mean_token_accuracy": 0.9176601111888886, + "num_tokens": 149203763.0, + "step": 7620 + }, + { + "entropy": 0.6485351033508777, + "epoch": 1.233050505050505, + "grad_norm": 0.18820928037166595, + "learning_rate": 2.239057828102881e-05, + "loss": 0.1958, + "mean_token_accuracy": 0.9130016818642617, + "num_tokens": 149403173.0, + "step": 7630 + }, + { + "entropy": 0.6550226002931595, + "epoch": 1.2346666666666666, + "grad_norm": 0.2959213852882385, + "learning_rate": 2.2362082204397756e-05, + "loss": 0.188, + "mean_token_accuracy": 0.9180138662457467, + "num_tokens": 149597427.0, + "step": 7640 + }, + { + "entropy": 0.6599608227610588, + "epoch": 1.2362828282828282, + "grad_norm": 0.2471340447664261, + "learning_rate": 2.233357215063762e-05, + "loss": 0.1939, + "mean_token_accuracy": 0.9141124650835991, + "num_tokens": 149790440.0, + "step": 7650 + }, + { + "entropy": 0.6660330504179001, + "epoch": 1.2378989898989898, + "grad_norm": 0.1872570961713791, + "learning_rate": 2.2305048201706855e-05, + "loss": 0.1961, + "mean_token_accuracy": 0.9194521516561508, + "num_tokens": 149980879.0, + "step": 7660 + }, + { + "entropy": 0.6459147520363331, + "epoch": 1.2395151515151515, + "grad_norm": 0.6696406006813049, + "learning_rate": 2.2276510439603838e-05, + "loss": 0.1949, + "mean_token_accuracy": 0.917964231967926, + "num_tokens": 150180086.0, + "step": 7670 + }, + { + "entropy": 0.6270775929093361, + "epoch": 1.241131313131313, + "grad_norm": 0.21063180267810822, + "learning_rate": 2.2247958946366676e-05, + "loss": 0.187, + "mean_token_accuracy": 0.9169306978583336, + "num_tokens": 150388547.0, + "step": 7680 + }, + { + "entropy": 0.6520090967416763, + "epoch": 1.2427474747474747, + "grad_norm": 0.20971941947937012, + "learning_rate": 2.221939380407294e-05, + "loss": 0.1884, + "mean_token_accuracy": 0.915847373008728, + "num_tokens": 150584388.0, + "step": 7690 + }, + { + "entropy": 0.6569691218435765, + "epoch": 1.2443636363636363, + "grad_norm": 0.3301805257797241, + "learning_rate": 2.2190815094839442e-05, + "loss": 0.2204, + "mean_token_accuracy": 0.9099266052246093, + "num_tokens": 150782164.0, + "step": 7700 + }, + { + "entropy": 0.6681100189685821, + "epoch": 1.245979797979798, + "grad_norm": 0.3229370713233948, + "learning_rate": 2.2162222900821992e-05, + "loss": 0.1999, + "mean_token_accuracy": 0.9191615000367165, + "num_tokens": 150973852.0, + "step": 7710 + }, + { + "entropy": 0.6337154619395733, + "epoch": 1.2475959595959596, + "grad_norm": 0.23271264135837555, + "learning_rate": 2.2133617304215167e-05, + "loss": 0.1952, + "mean_token_accuracy": 0.9135285466909409, + "num_tokens": 151176552.0, + "step": 7720 + }, + { + "entropy": 0.6439542829990387, + "epoch": 1.2492121212121212, + "grad_norm": 0.25363296270370483, + "learning_rate": 2.2104998387252066e-05, + "loss": 0.2087, + "mean_token_accuracy": 0.9085622906684876, + "num_tokens": 151376726.0, + "step": 7730 + }, + { + "entropy": 0.6427732348442078, + "epoch": 1.2508282828282828, + "grad_norm": 0.20807453989982605, + "learning_rate": 2.20763662322041e-05, + "loss": 0.2028, + "mean_token_accuracy": 0.9118719816207885, + "num_tokens": 151574565.0, + "step": 7740 + }, + { + "entropy": 0.6582729101181031, + "epoch": 1.2524444444444445, + "grad_norm": 0.17287397384643555, + "learning_rate": 2.204772092138071e-05, + "loss": 0.1882, + "mean_token_accuracy": 0.917538258433342, + "num_tokens": 151767869.0, + "step": 7750 + }, + { + "entropy": 0.6312249414622784, + "epoch": 1.254060606060606, + "grad_norm": 0.21336060762405396, + "learning_rate": 2.2019062537129172e-05, + "loss": 0.1962, + "mean_token_accuracy": 0.9108826741576195, + "num_tokens": 151970671.0, + "step": 7760 + }, + { + "entropy": 0.6726974606513977, + "epoch": 1.2556767676767677, + "grad_norm": 0.6455129384994507, + "learning_rate": 2.199039116183434e-05, + "loss": 0.1972, + "mean_token_accuracy": 0.9136315107345581, + "num_tokens": 152158479.0, + "step": 7770 + }, + { + "entropy": 0.6610306613147259, + "epoch": 1.2572929292929294, + "grad_norm": 0.22136108577251434, + "learning_rate": 2.1961706877918418e-05, + "loss": 0.1753, + "mean_token_accuracy": 0.921593825519085, + "num_tokens": 152353114.0, + "step": 7780 + }, + { + "entropy": 0.656276173144579, + "epoch": 1.258909090909091, + "grad_norm": 0.16673600673675537, + "learning_rate": 2.1933009767840713e-05, + "loss": 0.1933, + "mean_token_accuracy": 0.9160093143582344, + "num_tokens": 152550398.0, + "step": 7790 + }, + { + "entropy": 0.644133622944355, + "epoch": 1.2605252525252526, + "grad_norm": 0.23686234652996063, + "learning_rate": 2.1904299914097394e-05, + "loss": 0.2018, + "mean_token_accuracy": 0.9210563838481903, + "num_tokens": 152749548.0, + "step": 7800 + }, + { + "epoch": 1.2605252525252526, + "eval_entropy": 0.7122493233680726, + "eval_loss": 0.17709016799926758, + "eval_mean_token_accuracy": 0.9228101785778999, + "eval_num_tokens": 152749548.0, + "eval_runtime": 108.1994, + "eval_samples_per_second": 9.242, + "eval_steps_per_second": 9.242, + "step": 7800 + } + ], + "logging_steps": 10, + "max_steps": 18564, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.4877359534504673e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}