| { |
| "best_global_step": 7200, |
| "best_metric": 0.17618873715400696, |
| "best_model_checkpoint": "checkpoints/Qwen2.5-3B-Instruct-liar-dice-sft/checkpoint-7200", |
| "epoch": 1.2605252525252526, |
| "eval_steps": 300, |
| "global_step": 7800, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.46959872394800184, |
| "epoch": 0.0016161616161616162, |
| "grad_norm": 13.960037231445312, |
| "learning_rate": 8.999999999999999e-06, |
| "loss": 1.2876, |
| "mean_token_accuracy": 0.7336300253868103, |
| "num_tokens": 192055.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.500729388743639, |
| "epoch": 0.0032323232323232323, |
| "grad_norm": 6.7432475090026855, |
| "learning_rate": 1.8999999999999998e-05, |
| "loss": 1.0227, |
| "mean_token_accuracy": 0.7338824585080147, |
| "num_tokens": 386411.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.5788386285305023, |
| "epoch": 0.0048484848484848485, |
| "grad_norm": 1.9439020156860352, |
| "learning_rate": 2.9e-05, |
| "loss": 0.5673, |
| "mean_token_accuracy": 0.823445089161396, |
| "num_tokens": 584042.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.5961214371025563, |
| "epoch": 0.006464646464646465, |
| "grad_norm": 1.6919704675674438, |
| "learning_rate": 3.499999597539e-05, |
| "loss": 0.4976, |
| "mean_token_accuracy": 0.8367778733372688, |
| "num_tokens": 786741.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.6113797709345817, |
| "epoch": 0.00808080808080808, |
| "grad_norm": 1.7834748029708862, |
| "learning_rate": 3.4999950698548777e-05, |
| "loss": 0.4251, |
| "mean_token_accuracy": 0.8491482272744179, |
| "num_tokens": 984632.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.6576851613819599, |
| "epoch": 0.009696969696969697, |
| "grad_norm": 1.1660228967666626, |
| "learning_rate": 3.499985511423442e-05, |
| "loss": 0.3977, |
| "mean_token_accuracy": 0.8635194897651672, |
| "num_tokens": 1172287.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.6415456309914589, |
| "epoch": 0.011313131313131313, |
| "grad_norm": 0.8897055983543396, |
| "learning_rate": 3.499970922272172e-05, |
| "loss": 0.3657, |
| "mean_token_accuracy": 0.8693153902888298, |
| "num_tokens": 1362395.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.6485456451773643, |
| "epoch": 0.01292929292929293, |
| "grad_norm": 1.4095648527145386, |
| "learning_rate": 3.499951302443006e-05, |
| "loss": 0.3765, |
| "mean_token_accuracy": 0.8600584954023361, |
| "num_tokens": 1552405.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.6257273025810719, |
| "epoch": 0.014545454545454545, |
| "grad_norm": 2.536198139190674, |
| "learning_rate": 3.499926651992347e-05, |
| "loss": 0.4095, |
| "mean_token_accuracy": 0.8548074960708618, |
| "num_tokens": 1753707.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.615520790219307, |
| "epoch": 0.01616161616161616, |
| "grad_norm": 1.2256574630737305, |
| "learning_rate": 3.499896970991057e-05, |
| "loss": 0.4175, |
| "mean_token_accuracy": 0.8375065699219704, |
| "num_tokens": 1957057.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.6429425723850727, |
| "epoch": 0.017777777777777778, |
| "grad_norm": 1.1664628982543945, |
| "learning_rate": 3.4998622595244605e-05, |
| "loss": 0.3776, |
| "mean_token_accuracy": 0.8666251391172409, |
| "num_tokens": 2153468.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.6582260139286518, |
| "epoch": 0.019393939393939394, |
| "grad_norm": 1.757544755935669, |
| "learning_rate": 3.499822517692344e-05, |
| "loss": 0.3251, |
| "mean_token_accuracy": 0.8730518639087677, |
| "num_tokens": 2342417.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.6374198235571384, |
| "epoch": 0.02101010101010101, |
| "grad_norm": 1.7501511573791504, |
| "learning_rate": 3.4997777456089535e-05, |
| "loss": 0.3408, |
| "mean_token_accuracy": 0.8802526473999024, |
| "num_tokens": 2540898.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.6632717259228229, |
| "epoch": 0.022626262626262626, |
| "grad_norm": 1.2834315299987793, |
| "learning_rate": 3.499727943402997e-05, |
| "loss": 0.3466, |
| "mean_token_accuracy": 0.8864644199609757, |
| "num_tokens": 2730096.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.6132969424128533, |
| "epoch": 0.024242424242424242, |
| "grad_norm": 1.649055004119873, |
| "learning_rate": 3.499673111217642e-05, |
| "loss": 0.3472, |
| "mean_token_accuracy": 0.8800297752022743, |
| "num_tokens": 2933186.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.6532419972121716, |
| "epoch": 0.02585858585858586, |
| "grad_norm": 2.080294609069824, |
| "learning_rate": 3.4996132492105146e-05, |
| "loss": 0.3189, |
| "mean_token_accuracy": 0.8902329325675964, |
| "num_tokens": 3125684.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.6288332067430019, |
| "epoch": 0.027474747474747475, |
| "grad_norm": 1.31842839717865, |
| "learning_rate": 3.499548357553703e-05, |
| "loss": 0.2968, |
| "mean_token_accuracy": 0.8945111632347107, |
| "num_tokens": 3323087.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.6411679066717625, |
| "epoch": 0.02909090909090909, |
| "grad_norm": 1.308049201965332, |
| "learning_rate": 3.49947843643375e-05, |
| "loss": 0.3174, |
| "mean_token_accuracy": 0.8880784347653389, |
| "num_tokens": 3517966.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.6363876968622207, |
| "epoch": 0.030707070707070707, |
| "grad_norm": 1.4614847898483276, |
| "learning_rate": 3.4994034860516625e-05, |
| "loss": 0.3138, |
| "mean_token_accuracy": 0.8888434395194054, |
| "num_tokens": 3714854.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.6365263275802135, |
| "epoch": 0.03232323232323232, |
| "grad_norm": 1.3670061826705933, |
| "learning_rate": 3.4993235066228996e-05, |
| "loss": 0.3034, |
| "mean_token_accuracy": 0.8956289395689965, |
| "num_tokens": 3912912.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.6736746333539486, |
| "epoch": 0.03393939393939394, |
| "grad_norm": 4.420541763305664, |
| "learning_rate": 3.499238498377381e-05, |
| "loss": 0.2881, |
| "mean_token_accuracy": 0.9016141295433044, |
| "num_tokens": 4099783.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.6174106113612652, |
| "epoch": 0.035555555555555556, |
| "grad_norm": 1.7219263315200806, |
| "learning_rate": 3.49914846155948e-05, |
| "loss": 0.2821, |
| "mean_token_accuracy": 0.9015349850058556, |
| "num_tokens": 4302700.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.654355899989605, |
| "epoch": 0.037171717171717175, |
| "grad_norm": 1.486142873764038, |
| "learning_rate": 3.4990533964280305e-05, |
| "loss": 0.2979, |
| "mean_token_accuracy": 0.8962426647543907, |
| "num_tokens": 4496033.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.6416108801960945, |
| "epoch": 0.03878787878787879, |
| "grad_norm": 1.2947139739990234, |
| "learning_rate": 3.4989533032563156e-05, |
| "loss": 0.2985, |
| "mean_token_accuracy": 0.8903466835618019, |
| "num_tokens": 4691716.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.5999645918607712, |
| "epoch": 0.04040404040404041, |
| "grad_norm": 1.6406580209732056, |
| "learning_rate": 3.498848182332076e-05, |
| "loss": 0.3048, |
| "mean_token_accuracy": 0.8953321948647499, |
| "num_tokens": 4901668.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.6618356920778752, |
| "epoch": 0.04202020202020202, |
| "grad_norm": 2.5923011302948, |
| "learning_rate": 3.498738033957504e-05, |
| "loss": 0.2815, |
| "mean_token_accuracy": 0.8963434055447579, |
| "num_tokens": 5093439.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.6510407045483589, |
| "epoch": 0.04363636363636364, |
| "grad_norm": 2.0043606758117676, |
| "learning_rate": 3.498622858449248e-05, |
| "loss": 0.2657, |
| "mean_token_accuracy": 0.8961230248212815, |
| "num_tokens": 5287355.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.6206977590918541, |
| "epoch": 0.04525252525252525, |
| "grad_norm": 1.1375564336776733, |
| "learning_rate": 3.4985026561384024e-05, |
| "loss": 0.293, |
| "mean_token_accuracy": 0.8910468384623528, |
| "num_tokens": 5489325.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.6492680199444294, |
| "epoch": 0.04686868686868687, |
| "grad_norm": 1.0635576248168945, |
| "learning_rate": 3.498377427370518e-05, |
| "loss": 0.2758, |
| "mean_token_accuracy": 0.8996513709425926, |
| "num_tokens": 5685198.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.6693574033677578, |
| "epoch": 0.048484848484848485, |
| "grad_norm": 0.8263881206512451, |
| "learning_rate": 3.4982471725055907e-05, |
| "loss": 0.2574, |
| "mean_token_accuracy": 0.9080115720629692, |
| "num_tokens": 5874431.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.048484848484848485, |
| "eval_entropy": 0.7083358879685402, |
| "eval_loss": 0.24814815819263458, |
| "eval_mean_token_accuracy": 0.907939182460308, |
| "eval_num_tokens": 5874431.0, |
| "eval_runtime": 106.7029, |
| "eval_samples_per_second": 9.372, |
| "eval_steps_per_second": 9.372, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.6249252423644066, |
| "epoch": 0.050101010101010104, |
| "grad_norm": 0.9064458012580872, |
| "learning_rate": 3.498111891918067e-05, |
| "loss": 0.2907, |
| "mean_token_accuracy": 0.8954597547650337, |
| "num_tokens": 6076107.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.6337138824164867, |
| "epoch": 0.05171717171717172, |
| "grad_norm": 1.4427143335342407, |
| "learning_rate": 3.4979715859968415e-05, |
| "loss": 0.2712, |
| "mean_token_accuracy": 0.8949789464473724, |
| "num_tokens": 6276008.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.6436790756881237, |
| "epoch": 0.05333333333333334, |
| "grad_norm": 1.2344701290130615, |
| "learning_rate": 3.4978262551452546e-05, |
| "loss": 0.2425, |
| "mean_token_accuracy": 0.9087216407060623, |
| "num_tokens": 6473180.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.6058064438402653, |
| "epoch": 0.05494949494949495, |
| "grad_norm": 1.8228733539581299, |
| "learning_rate": 3.497675899781091e-05, |
| "loss": 0.2708, |
| "mean_token_accuracy": 0.8938016831874848, |
| "num_tokens": 6679706.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.6307087175548076, |
| "epoch": 0.05656565656565657, |
| "grad_norm": 1.1663655042648315, |
| "learning_rate": 3.497520520336582e-05, |
| "loss": 0.2704, |
| "mean_token_accuracy": 0.8911788880825042, |
| "num_tokens": 6876536.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.642133416980505, |
| "epoch": 0.05818181818181818, |
| "grad_norm": 1.7720001935958862, |
| "learning_rate": 3.497360117258399e-05, |
| "loss": 0.2548, |
| "mean_token_accuracy": 0.9016376256942749, |
| "num_tokens": 7069255.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.639679954200983, |
| "epoch": 0.0597979797979798, |
| "grad_norm": 1.618810772895813, |
| "learning_rate": 3.4971946910076555e-05, |
| "loss": 0.2544, |
| "mean_token_accuracy": 0.8990731582045555, |
| "num_tokens": 7263687.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.6632651142776013, |
| "epoch": 0.061414141414141414, |
| "grad_norm": 1.4762156009674072, |
| "learning_rate": 3.497024242059907e-05, |
| "loss": 0.2271, |
| "mean_token_accuracy": 0.9126387253403664, |
| "num_tokens": 7449167.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.6455851793289185, |
| "epoch": 0.06303030303030303, |
| "grad_norm": 1.648486852645874, |
| "learning_rate": 3.496848770905146e-05, |
| "loss": 0.2462, |
| "mean_token_accuracy": 0.9119722321629524, |
| "num_tokens": 7640249.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.6278689287602901, |
| "epoch": 0.06464646464646465, |
| "grad_norm": 1.5724526643753052, |
| "learning_rate": 3.496668278047804e-05, |
| "loss": 0.2712, |
| "mean_token_accuracy": 0.8938238501548768, |
| "num_tokens": 7836242.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.6397264793515205, |
| "epoch": 0.06626262626262626, |
| "grad_norm": 1.1819778680801392, |
| "learning_rate": 3.496482764006746e-05, |
| "loss": 0.2441, |
| "mean_token_accuracy": 0.9028481721878052, |
| "num_tokens": 8030712.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.6572954833507538, |
| "epoch": 0.06787878787878789, |
| "grad_norm": 1.456351637840271, |
| "learning_rate": 3.496292229315275e-05, |
| "loss": 0.2615, |
| "mean_token_accuracy": 0.9067741006612777, |
| "num_tokens": 8219433.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.6125221528112889, |
| "epoch": 0.0694949494949495, |
| "grad_norm": 1.8550266027450562, |
| "learning_rate": 3.4960966745211225e-05, |
| "loss": 0.2411, |
| "mean_token_accuracy": 0.9120300561189651, |
| "num_tokens": 8420536.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.6642718121409417, |
| "epoch": 0.07111111111111111, |
| "grad_norm": 1.758142113685608, |
| "learning_rate": 3.495896100186456e-05, |
| "loss": 0.2263, |
| "mean_token_accuracy": 0.9210765823721886, |
| "num_tokens": 8606059.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.6295181460678577, |
| "epoch": 0.07272727272727272, |
| "grad_norm": 3.6077258586883545, |
| "learning_rate": 3.4956905068878704e-05, |
| "loss": 0.2435, |
| "mean_token_accuracy": 0.9080678433179855, |
| "num_tokens": 8804802.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.641046067327261, |
| "epoch": 0.07434343434343435, |
| "grad_norm": 0.8795793056488037, |
| "learning_rate": 3.4954798952163886e-05, |
| "loss": 0.2521, |
| "mean_token_accuracy": 0.9062324479222298, |
| "num_tokens": 9000444.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.6226008631289005, |
| "epoch": 0.07595959595959596, |
| "grad_norm": 1.1648048162460327, |
| "learning_rate": 3.495264265777461e-05, |
| "loss": 0.2298, |
| "mean_token_accuracy": 0.9145419552922249, |
| "num_tokens": 9200956.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.6410299509763717, |
| "epoch": 0.07757575757575758, |
| "grad_norm": 2.069148302078247, |
| "learning_rate": 3.4950436191909614e-05, |
| "loss": 0.2371, |
| "mean_token_accuracy": 0.9139557749032974, |
| "num_tokens": 9394938.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.6107208199799061, |
| "epoch": 0.07919191919191919, |
| "grad_norm": 3.762711524963379, |
| "learning_rate": 3.494817956091187e-05, |
| "loss": 0.2274, |
| "mean_token_accuracy": 0.9104640632867813, |
| "num_tokens": 9599158.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.6289502464234829, |
| "epoch": 0.08080808080808081, |
| "grad_norm": 0.7265241742134094, |
| "learning_rate": 3.4945872771268564e-05, |
| "loss": 0.2181, |
| "mean_token_accuracy": 0.9099911272525787, |
| "num_tokens": 9795341.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.6372130006551743, |
| "epoch": 0.08242424242424243, |
| "grad_norm": 1.3274513483047485, |
| "learning_rate": 3.494351582961108e-05, |
| "loss": 0.2592, |
| "mean_token_accuracy": 0.9113820254802704, |
| "num_tokens": 9991485.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.6244701214134694, |
| "epoch": 0.08404040404040404, |
| "grad_norm": 1.3778245449066162, |
| "learning_rate": 3.494110874271496e-05, |
| "loss": 0.2589, |
| "mean_token_accuracy": 0.9008386582136154, |
| "num_tokens": 10192963.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.6476377807557583, |
| "epoch": 0.08565656565656565, |
| "grad_norm": 7.508720397949219, |
| "learning_rate": 3.4938651517499906e-05, |
| "loss": 0.2454, |
| "mean_token_accuracy": 0.9027299627661705, |
| "num_tokens": 10384151.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.6151685245335102, |
| "epoch": 0.08727272727272728, |
| "grad_norm": 1.208046317100525, |
| "learning_rate": 3.493614416102976e-05, |
| "loss": 0.2519, |
| "mean_token_accuracy": 0.8936711862683296, |
| "num_tokens": 10587963.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.6441866233944893, |
| "epoch": 0.08888888888888889, |
| "grad_norm": 0.9904794692993164, |
| "learning_rate": 3.493358668051246e-05, |
| "loss": 0.2338, |
| "mean_token_accuracy": 0.911143635213375, |
| "num_tokens": 10782441.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.6711594641208649, |
| "epoch": 0.0905050505050505, |
| "grad_norm": 1.4392328262329102, |
| "learning_rate": 3.493097908330007e-05, |
| "loss": 0.2343, |
| "mean_token_accuracy": 0.9069328650832176, |
| "num_tokens": 10966642.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.6202519237995148, |
| "epoch": 0.09212121212121212, |
| "grad_norm": 1.1660218238830566, |
| "learning_rate": 3.4928321376888685e-05, |
| "loss": 0.2342, |
| "mean_token_accuracy": 0.9107801094651222, |
| "num_tokens": 11165742.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.6474331930279732, |
| "epoch": 0.09373737373737374, |
| "grad_norm": 1.130745530128479, |
| "learning_rate": 3.492561356891847e-05, |
| "loss": 0.2208, |
| "mean_token_accuracy": 0.9123849824070931, |
| "num_tokens": 11356135.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.6123579762876034, |
| "epoch": 0.09535353535353536, |
| "grad_norm": 1.5646748542785645, |
| "learning_rate": 3.4922855667173627e-05, |
| "loss": 0.2376, |
| "mean_token_accuracy": 0.9094289064407348, |
| "num_tokens": 11558279.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.6709182240068913, |
| "epoch": 0.09696969696969697, |
| "grad_norm": 2.955382823944092, |
| "learning_rate": 3.492004767958235e-05, |
| "loss": 0.2205, |
| "mean_token_accuracy": 0.9199464812874794, |
| "num_tokens": 11743893.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.09696969696969697, |
| "eval_entropy": 0.6894149232506752, |
| "eval_loss": 0.21707852184772491, |
| "eval_mean_token_accuracy": 0.9174256545305252, |
| "eval_num_tokens": 11743893.0, |
| "eval_runtime": 106.8539, |
| "eval_samples_per_second": 9.359, |
| "eval_steps_per_second": 9.359, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.631628192961216, |
| "epoch": 0.09858585858585858, |
| "grad_norm": 0.8649947047233582, |
| "learning_rate": 3.4917189614216816e-05, |
| "loss": 0.2417, |
| "mean_token_accuracy": 0.90289586186409, |
| "num_tokens": 11940257.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.5916264042258262, |
| "epoch": 0.10020202020202021, |
| "grad_norm": 1.269579529762268, |
| "learning_rate": 3.491428147929317e-05, |
| "loss": 0.2283, |
| "mean_token_accuracy": 0.9115998461842537, |
| "num_tokens": 12151169.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.6250050470232964, |
| "epoch": 0.10181818181818182, |
| "grad_norm": 3.079771041870117, |
| "learning_rate": 3.4911323283171485e-05, |
| "loss": 0.2391, |
| "mean_token_accuracy": 0.9097789540886879, |
| "num_tokens": 12349255.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.6837345652282238, |
| "epoch": 0.10343434343434343, |
| "grad_norm": 2.3608956336975098, |
| "learning_rate": 3.490831503435575e-05, |
| "loss": 0.2381, |
| "mean_token_accuracy": 0.909605085849762, |
| "num_tokens": 12531982.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.6474835090339184, |
| "epoch": 0.10505050505050505, |
| "grad_norm": 0.9185977578163147, |
| "learning_rate": 3.490525674149384e-05, |
| "loss": 0.2228, |
| "mean_token_accuracy": 0.9189437657594681, |
| "num_tokens": 12726420.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.6150617614388466, |
| "epoch": 0.10666666666666667, |
| "grad_norm": 2.9653398990631104, |
| "learning_rate": 3.490214841337749e-05, |
| "loss": 0.2466, |
| "mean_token_accuracy": 0.9002894461154938, |
| "num_tokens": 12929338.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.6306007914245129, |
| "epoch": 0.10828282828282829, |
| "grad_norm": 2.005779266357422, |
| "learning_rate": 3.4898990058942284e-05, |
| "loss": 0.2402, |
| "mean_token_accuracy": 0.9071647629141808, |
| "num_tokens": 13126452.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.6232900455594063, |
| "epoch": 0.1098989898989899, |
| "grad_norm": 1.0521694421768188, |
| "learning_rate": 3.48957816872676e-05, |
| "loss": 0.2222, |
| "mean_token_accuracy": 0.9119127243757248, |
| "num_tokens": 13323868.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.6038605846464634, |
| "epoch": 0.11151515151515151, |
| "grad_norm": 2.1727254390716553, |
| "learning_rate": 3.489252330757662e-05, |
| "loss": 0.2527, |
| "mean_token_accuracy": 0.8974953025579453, |
| "num_tokens": 13532123.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.6080361865460873, |
| "epoch": 0.11313131313131314, |
| "grad_norm": 2.5059597492218018, |
| "learning_rate": 3.4889214929236264e-05, |
| "loss": 0.2359, |
| "mean_token_accuracy": 0.9091552272439003, |
| "num_tokens": 13739707.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.6336839586496353, |
| "epoch": 0.11474747474747475, |
| "grad_norm": 0.798665463924408, |
| "learning_rate": 3.4885856561757215e-05, |
| "loss": 0.23, |
| "mean_token_accuracy": 0.9074552163481713, |
| "num_tokens": 13939220.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.6417612984776497, |
| "epoch": 0.11636363636363636, |
| "grad_norm": 1.036450982093811, |
| "learning_rate": 3.488244821479382e-05, |
| "loss": 0.2191, |
| "mean_token_accuracy": 0.9127443626523017, |
| "num_tokens": 14131430.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.6245701543986797, |
| "epoch": 0.11797979797979798, |
| "grad_norm": 1.9606401920318604, |
| "learning_rate": 3.487898989814414e-05, |
| "loss": 0.2244, |
| "mean_token_accuracy": 0.9075841188430787, |
| "num_tokens": 14328348.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 0.6564237147569656, |
| "epoch": 0.1195959595959596, |
| "grad_norm": 1.6049528121948242, |
| "learning_rate": 3.487548162174987e-05, |
| "loss": 0.2296, |
| "mean_token_accuracy": 0.9106339663267136, |
| "num_tokens": 14515807.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.6185222245752812, |
| "epoch": 0.12121212121212122, |
| "grad_norm": 1.29709792137146, |
| "learning_rate": 3.487192339569631e-05, |
| "loss": 0.2366, |
| "mean_token_accuracy": 0.9005967259407044, |
| "num_tokens": 14714178.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.6413687355816364, |
| "epoch": 0.12282828282828283, |
| "grad_norm": 1.3318425416946411, |
| "learning_rate": 3.486831523021239e-05, |
| "loss": 0.2311, |
| "mean_token_accuracy": 0.9086028903722763, |
| "num_tokens": 14908741.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 0.6336495697498321, |
| "epoch": 0.12444444444444444, |
| "grad_norm": 0.9258722066879272, |
| "learning_rate": 3.4864657135670555e-05, |
| "loss": 0.2459, |
| "mean_token_accuracy": 0.9087097436189652, |
| "num_tokens": 15106645.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.6563303679227829, |
| "epoch": 0.12606060606060607, |
| "grad_norm": 1.5138893127441406, |
| "learning_rate": 3.486094912258683e-05, |
| "loss": 0.2287, |
| "mean_token_accuracy": 0.8989679113030433, |
| "num_tokens": 15294657.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 0.6279109574854373, |
| "epoch": 0.12767676767676767, |
| "grad_norm": 1.7884769439697266, |
| "learning_rate": 3.485719120162069e-05, |
| "loss": 0.2441, |
| "mean_token_accuracy": 0.9058216944336891, |
| "num_tokens": 15492333.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.6399269141256809, |
| "epoch": 0.1292929292929293, |
| "grad_norm": 0.9560442566871643, |
| "learning_rate": 3.485338338357513e-05, |
| "loss": 0.2125, |
| "mean_token_accuracy": 0.9148769214749336, |
| "num_tokens": 15685417.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.6472979053854943, |
| "epoch": 0.13090909090909092, |
| "grad_norm": 2.49409556388855, |
| "learning_rate": 3.484952567939656e-05, |
| "loss": 0.2409, |
| "mean_token_accuracy": 0.907583674788475, |
| "num_tokens": 15882414.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 0.5940545335412025, |
| "epoch": 0.13252525252525252, |
| "grad_norm": 1.5602918863296509, |
| "learning_rate": 3.48456181001748e-05, |
| "loss": 0.2404, |
| "mean_token_accuracy": 0.89293831884861, |
| "num_tokens": 16094120.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 0.6304773099720478, |
| "epoch": 0.13414141414141414, |
| "grad_norm": 1.1661272048950195, |
| "learning_rate": 3.484166065714304e-05, |
| "loss": 0.2339, |
| "mean_token_accuracy": 0.9048717796802521, |
| "num_tokens": 16293723.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 0.6498629376292229, |
| "epoch": 0.13575757575757577, |
| "grad_norm": 1.2386293411254883, |
| "learning_rate": 3.483765336167784e-05, |
| "loss": 0.2305, |
| "mean_token_accuracy": 0.9080037698149681, |
| "num_tokens": 16485193.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 0.6220371380448342, |
| "epoch": 0.13737373737373737, |
| "grad_norm": 1.8108175992965698, |
| "learning_rate": 3.483359622529905e-05, |
| "loss": 0.226, |
| "mean_token_accuracy": 0.9119313061237335, |
| "num_tokens": 16687488.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.6563889265060425, |
| "epoch": 0.138989898989899, |
| "grad_norm": 0.8232795000076294, |
| "learning_rate": 3.48294892596698e-05, |
| "loss": 0.2128, |
| "mean_token_accuracy": 0.9188828706741333, |
| "num_tokens": 16876673.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 0.6596408911049366, |
| "epoch": 0.1406060606060606, |
| "grad_norm": 1.3460944890975952, |
| "learning_rate": 3.482533247659647e-05, |
| "loss": 0.2372, |
| "mean_token_accuracy": 0.9061458364129067, |
| "num_tokens": 17063863.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 0.6356192789971828, |
| "epoch": 0.14222222222222222, |
| "grad_norm": 1.2244527339935303, |
| "learning_rate": 3.482112588802866e-05, |
| "loss": 0.2386, |
| "mean_token_accuracy": 0.9038299396634102, |
| "num_tokens": 17258452.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 0.6439319156110287, |
| "epoch": 0.14383838383838385, |
| "grad_norm": 0.8409541249275208, |
| "learning_rate": 3.4816869506059134e-05, |
| "loss": 0.2163, |
| "mean_token_accuracy": 0.9198831230401993, |
| "num_tokens": 17451380.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 0.640556076169014, |
| "epoch": 0.14545454545454545, |
| "grad_norm": 1.9304518699645996, |
| "learning_rate": 3.4812563342923794e-05, |
| "loss": 0.2141, |
| "mean_token_accuracy": 0.9013576105237007, |
| "num_tokens": 17644862.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.14545454545454545, |
| "eval_entropy": 0.6915190102458, |
| "eval_loss": 0.20909403264522552, |
| "eval_mean_token_accuracy": 0.9142436196208, |
| "eval_num_tokens": 17644862.0, |
| "eval_runtime": 107.0886, |
| "eval_samples_per_second": 9.338, |
| "eval_steps_per_second": 9.338, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.6096978440880776, |
| "epoch": 0.14707070707070707, |
| "grad_norm": 0.9783409237861633, |
| "learning_rate": 3.480820741100166e-05, |
| "loss": 0.2454, |
| "mean_token_accuracy": 0.9017274558544159, |
| "num_tokens": 17848157.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 0.6275364428758621, |
| "epoch": 0.1486868686868687, |
| "grad_norm": 1.266980528831482, |
| "learning_rate": 3.480380172281483e-05, |
| "loss": 0.221, |
| "mean_token_accuracy": 0.909178127348423, |
| "num_tokens": 18047501.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 0.6168364368379116, |
| "epoch": 0.1503030303030303, |
| "grad_norm": 0.7472752928733826, |
| "learning_rate": 3.4799346291028415e-05, |
| "loss": 0.2458, |
| "mean_token_accuracy": 0.9119211822748184, |
| "num_tokens": 18249869.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 0.5978960894048214, |
| "epoch": 0.15191919191919193, |
| "grad_norm": 0.7442205548286438, |
| "learning_rate": 3.4794841128450554e-05, |
| "loss": 0.2559, |
| "mean_token_accuracy": 0.8956459879875183, |
| "num_tokens": 18457950.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 0.6468767315149307, |
| "epoch": 0.15353535353535352, |
| "grad_norm": 1.178752064704895, |
| "learning_rate": 3.4790286248032314e-05, |
| "loss": 0.2113, |
| "mean_token_accuracy": 0.9203948318958283, |
| "num_tokens": 18650543.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 0.6555621646344661, |
| "epoch": 0.15515151515151515, |
| "grad_norm": 0.9539858102798462, |
| "learning_rate": 3.478568166286771e-05, |
| "loss": 0.2289, |
| "mean_token_accuracy": 0.9062513440847397, |
| "num_tokens": 18841642.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 0.6141032330691815, |
| "epoch": 0.15676767676767678, |
| "grad_norm": 2.01023006439209, |
| "learning_rate": 3.4781027386193646e-05, |
| "loss": 0.2389, |
| "mean_token_accuracy": 0.9044842913746833, |
| "num_tokens": 19045515.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 0.6368382506072521, |
| "epoch": 0.15838383838383838, |
| "grad_norm": 0.5178106427192688, |
| "learning_rate": 3.4776323431389866e-05, |
| "loss": 0.2171, |
| "mean_token_accuracy": 0.9083613112568856, |
| "num_tokens": 19241213.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 0.634531506896019, |
| "epoch": 0.16, |
| "grad_norm": 1.3576446771621704, |
| "learning_rate": 3.4771569811978915e-05, |
| "loss": 0.2216, |
| "mean_token_accuracy": 0.906193482875824, |
| "num_tokens": 19437628.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 0.631604690849781, |
| "epoch": 0.16161616161616163, |
| "grad_norm": 1.0033283233642578, |
| "learning_rate": 3.476676654162613e-05, |
| "loss": 0.23, |
| "mean_token_accuracy": 0.9103793069720268, |
| "num_tokens": 19635494.0, |
| "step": 1000 |
| }, |
| { |
| "entropy": 0.6327577441930771, |
| "epoch": 0.16323232323232323, |
| "grad_norm": 0.9308726787567139, |
| "learning_rate": 3.4761913634139584e-05, |
| "loss": 0.2344, |
| "mean_token_accuracy": 0.9024915874004364, |
| "num_tokens": 19834517.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 0.6257577292621136, |
| "epoch": 0.16484848484848486, |
| "grad_norm": 1.1238154172897339, |
| "learning_rate": 3.475701110347001e-05, |
| "loss": 0.2315, |
| "mean_token_accuracy": 0.915093657374382, |
| "num_tokens": 20033584.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 0.6169027276337147, |
| "epoch": 0.16646464646464645, |
| "grad_norm": 1.1264071464538574, |
| "learning_rate": 3.4752058963710835e-05, |
| "loss": 0.2326, |
| "mean_token_accuracy": 0.8968415692448616, |
| "num_tokens": 20234492.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 0.6385836571455001, |
| "epoch": 0.16808080808080808, |
| "grad_norm": 0.7746083736419678, |
| "learning_rate": 3.474705722909807e-05, |
| "loss": 0.2305, |
| "mean_token_accuracy": 0.908117501437664, |
| "num_tokens": 20429582.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 0.6292243875563145, |
| "epoch": 0.1696969696969697, |
| "grad_norm": 0.7928685545921326, |
| "learning_rate": 3.4742005914010296e-05, |
| "loss": 0.2096, |
| "mean_token_accuracy": 0.9107841208577157, |
| "num_tokens": 20626277.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 0.6654087409377099, |
| "epoch": 0.1713131313131313, |
| "grad_norm": 0.6060704588890076, |
| "learning_rate": 3.473690503296865e-05, |
| "loss": 0.1949, |
| "mean_token_accuracy": 0.9214139148592949, |
| "num_tokens": 20809912.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 0.6308883085846901, |
| "epoch": 0.17292929292929293, |
| "grad_norm": 1.6825069189071655, |
| "learning_rate": 3.4731754600636734e-05, |
| "loss": 0.2161, |
| "mean_token_accuracy": 0.9196865290403367, |
| "num_tokens": 21004610.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 0.6125972785055638, |
| "epoch": 0.17454545454545456, |
| "grad_norm": 1.3060930967330933, |
| "learning_rate": 3.47265546318206e-05, |
| "loss": 0.2341, |
| "mean_token_accuracy": 0.9092792198061943, |
| "num_tokens": 21205277.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 0.6183511838316917, |
| "epoch": 0.17616161616161616, |
| "grad_norm": 0.9683547019958496, |
| "learning_rate": 3.472130514146871e-05, |
| "loss": 0.2087, |
| "mean_token_accuracy": 0.9153425708413124, |
| "num_tokens": 21404921.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 0.6358366332948208, |
| "epoch": 0.17777777777777778, |
| "grad_norm": 1.0839707851409912, |
| "learning_rate": 3.471600614467188e-05, |
| "loss": 0.2266, |
| "mean_token_accuracy": 0.9070942506194115, |
| "num_tokens": 21598578.0, |
| "step": 1100 |
| }, |
| { |
| "entropy": 0.6339450895786285, |
| "epoch": 0.17939393939393938, |
| "grad_norm": 1.1610348224639893, |
| "learning_rate": 3.4710657656663257e-05, |
| "loss": 0.2206, |
| "mean_token_accuracy": 0.908353665471077, |
| "num_tokens": 21795249.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 0.6443082444369793, |
| "epoch": 0.181010101010101, |
| "grad_norm": 1.1031250953674316, |
| "learning_rate": 3.470525969281824e-05, |
| "loss": 0.1938, |
| "mean_token_accuracy": 0.9259483084082604, |
| "num_tokens": 21988155.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 0.6319502085447312, |
| "epoch": 0.18262626262626264, |
| "grad_norm": 1.0539134740829468, |
| "learning_rate": 3.469981226865448e-05, |
| "loss": 0.1993, |
| "mean_token_accuracy": 0.9268665567040444, |
| "num_tokens": 22183128.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 0.6007440723478794, |
| "epoch": 0.18424242424242424, |
| "grad_norm": 1.4432859420776367, |
| "learning_rate": 3.469431539983178e-05, |
| "loss": 0.232, |
| "mean_token_accuracy": 0.9083610430359841, |
| "num_tokens": 22386990.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 0.6132355526089668, |
| "epoch": 0.18585858585858586, |
| "grad_norm": 0.695091962814331, |
| "learning_rate": 3.468876910215212e-05, |
| "loss": 0.2379, |
| "mean_token_accuracy": 0.9031597882509231, |
| "num_tokens": 22587971.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 0.5998403012752533, |
| "epoch": 0.1874747474747475, |
| "grad_norm": 0.9048874974250793, |
| "learning_rate": 3.468317339155955e-05, |
| "loss": 0.2397, |
| "mean_token_accuracy": 0.9018667727708817, |
| "num_tokens": 22793134.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 0.6309370696544647, |
| "epoch": 0.1890909090909091, |
| "grad_norm": 0.9386253952980042, |
| "learning_rate": 3.467752828414019e-05, |
| "loss": 0.2087, |
| "mean_token_accuracy": 0.9095293834805489, |
| "num_tokens": 22989872.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 0.6670469336211682, |
| "epoch": 0.1907070707070707, |
| "grad_norm": 0.8335705995559692, |
| "learning_rate": 3.467183379612213e-05, |
| "loss": 0.2037, |
| "mean_token_accuracy": 0.9193868711590767, |
| "num_tokens": 23175070.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 0.605170601606369, |
| "epoch": 0.1923232323232323, |
| "grad_norm": 0.7726657390594482, |
| "learning_rate": 3.4666089943875444e-05, |
| "loss": 0.2185, |
| "mean_token_accuracy": 0.9151586100459099, |
| "num_tokens": 23379679.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 0.6216241672635079, |
| "epoch": 0.19393939393939394, |
| "grad_norm": 1.46378493309021, |
| "learning_rate": 3.466029674391211e-05, |
| "loss": 0.2369, |
| "mean_token_accuracy": 0.8999699011445046, |
| "num_tokens": 23579844.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.19393939393939394, |
| "eval_entropy": 0.6973449417352676, |
| "eval_loss": 0.19888541102409363, |
| "eval_mean_token_accuracy": 0.9178089879155159, |
| "eval_num_tokens": 23579844.0, |
| "eval_runtime": 106.7423, |
| "eval_samples_per_second": 9.368, |
| "eval_steps_per_second": 9.368, |
| "step": 1200 |
| }, |
| { |
| "entropy": 0.6532895624637604, |
| "epoch": 0.19555555555555557, |
| "grad_norm": 3.021833658218384, |
| "learning_rate": 3.465445421288594e-05, |
| "loss": 0.2319, |
| "mean_token_accuracy": 0.9079471796751022, |
| "num_tokens": 23770536.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 0.6232981145381927, |
| "epoch": 0.19717171717171716, |
| "grad_norm": 0.7231025099754333, |
| "learning_rate": 3.4648562367592595e-05, |
| "loss": 0.2117, |
| "mean_token_accuracy": 0.9194033876061439, |
| "num_tokens": 23971970.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 0.6648551635444164, |
| "epoch": 0.1987878787878788, |
| "grad_norm": 1.4859992265701294, |
| "learning_rate": 3.4642621224969474e-05, |
| "loss": 0.203, |
| "mean_token_accuracy": 0.9099412858486176, |
| "num_tokens": 24160322.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 0.6378097325563431, |
| "epoch": 0.20040404040404042, |
| "grad_norm": 0.5669660568237305, |
| "learning_rate": 3.4636630802095723e-05, |
| "loss": 0.2115, |
| "mean_token_accuracy": 0.9144275858998299, |
| "num_tokens": 24354426.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 0.6563280507922172, |
| "epoch": 0.20202020202020202, |
| "grad_norm": 2.1159043312072754, |
| "learning_rate": 3.463059111619212e-05, |
| "loss": 0.2019, |
| "mean_token_accuracy": 0.9190993249416352, |
| "num_tokens": 24543374.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 0.6134343653917312, |
| "epoch": 0.20363636363636364, |
| "grad_norm": 1.0814077854156494, |
| "learning_rate": 3.462450218462108e-05, |
| "loss": 0.2252, |
| "mean_token_accuracy": 0.90730449706316, |
| "num_tokens": 24747550.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 0.639523645490408, |
| "epoch": 0.20525252525252524, |
| "grad_norm": 2.090059995651245, |
| "learning_rate": 3.461836402488658e-05, |
| "loss": 0.232, |
| "mean_token_accuracy": 0.9036611437797546, |
| "num_tokens": 24942137.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 0.6455445013940334, |
| "epoch": 0.20686868686868687, |
| "grad_norm": 1.184032917022705, |
| "learning_rate": 3.4612176654634124e-05, |
| "loss": 0.1844, |
| "mean_token_accuracy": 0.9213565349578857, |
| "num_tokens": 25136742.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 0.5931920140981675, |
| "epoch": 0.2084848484848485, |
| "grad_norm": 1.39106285572052, |
| "learning_rate": 3.460594009165066e-05, |
| "loss": 0.2198, |
| "mean_token_accuracy": 0.9083201169967652, |
| "num_tokens": 25344524.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 0.6356636583805084, |
| "epoch": 0.2101010101010101, |
| "grad_norm": 1.8506667613983154, |
| "learning_rate": 3.4599654353864576e-05, |
| "loss": 0.2199, |
| "mean_token_accuracy": 0.9171806886792183, |
| "num_tokens": 25540892.0, |
| "step": 1300 |
| }, |
| { |
| "entropy": 0.6352035351097584, |
| "epoch": 0.21171717171717172, |
| "grad_norm": 1.501383662223816, |
| "learning_rate": 3.459331945934561e-05, |
| "loss": 0.2049, |
| "mean_token_accuracy": 0.9134323224425316, |
| "num_tokens": 25736036.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 0.6150361925363541, |
| "epoch": 0.21333333333333335, |
| "grad_norm": 0.6642631888389587, |
| "learning_rate": 3.458693542630481e-05, |
| "loss": 0.2099, |
| "mean_token_accuracy": 0.9162977918982506, |
| "num_tokens": 25938119.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 0.641390497982502, |
| "epoch": 0.21494949494949495, |
| "grad_norm": 1.7314903736114502, |
| "learning_rate": 3.4580502273094506e-05, |
| "loss": 0.2247, |
| "mean_token_accuracy": 0.9050250992178916, |
| "num_tokens": 26129891.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 0.6408931516110897, |
| "epoch": 0.21656565656565657, |
| "grad_norm": 0.5409038066864014, |
| "learning_rate": 3.4574020018208206e-05, |
| "loss": 0.2202, |
| "mean_token_accuracy": 0.9055180490016937, |
| "num_tokens": 26321688.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 0.6626883774995804, |
| "epoch": 0.21818181818181817, |
| "grad_norm": 3.052995443344116, |
| "learning_rate": 3.456748868028058e-05, |
| "loss": 0.2088, |
| "mean_token_accuracy": 0.9178230226039886, |
| "num_tokens": 26511190.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 0.645967397838831, |
| "epoch": 0.2197979797979798, |
| "grad_norm": 0.5310812592506409, |
| "learning_rate": 3.45609082780874e-05, |
| "loss": 0.2065, |
| "mean_token_accuracy": 0.9129741847515106, |
| "num_tokens": 26702954.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 0.635505760461092, |
| "epoch": 0.22141414141414142, |
| "grad_norm": 0.7930052280426025, |
| "learning_rate": 3.4554278830545494e-05, |
| "loss": 0.2288, |
| "mean_token_accuracy": 0.9053992003202438, |
| "num_tokens": 26898915.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 0.6399193912744522, |
| "epoch": 0.22303030303030302, |
| "grad_norm": 1.7344346046447754, |
| "learning_rate": 3.4547600356712673e-05, |
| "loss": 0.2026, |
| "mean_token_accuracy": 0.917083628475666, |
| "num_tokens": 27092903.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 0.6360017918050289, |
| "epoch": 0.22464646464646465, |
| "grad_norm": 0.5752031803131104, |
| "learning_rate": 3.454087287578768e-05, |
| "loss": 0.2007, |
| "mean_token_accuracy": 0.9171236485242844, |
| "num_tokens": 27286898.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 0.657283715903759, |
| "epoch": 0.22626262626262628, |
| "grad_norm": 1.8767166137695312, |
| "learning_rate": 3.4534096407110144e-05, |
| "loss": 0.216, |
| "mean_token_accuracy": 0.9128377422690391, |
| "num_tokens": 27478561.0, |
| "step": 1400 |
| }, |
| { |
| "entropy": 0.6192032106220722, |
| "epoch": 0.22787878787878788, |
| "grad_norm": 1.462258219718933, |
| "learning_rate": 3.452727097016053e-05, |
| "loss": 0.2269, |
| "mean_token_accuracy": 0.9088853120803833, |
| "num_tokens": 27679828.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 0.6551795959472656, |
| "epoch": 0.2294949494949495, |
| "grad_norm": 0.6006298661231995, |
| "learning_rate": 3.452039658456005e-05, |
| "loss": 0.2303, |
| "mean_token_accuracy": 0.905583493411541, |
| "num_tokens": 27869832.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 0.648159421235323, |
| "epoch": 0.2311111111111111, |
| "grad_norm": 1.1963809728622437, |
| "learning_rate": 3.4513473270070655e-05, |
| "loss": 0.2388, |
| "mean_token_accuracy": 0.9001779198646546, |
| "num_tokens": 28063539.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 0.6477998718619347, |
| "epoch": 0.23272727272727273, |
| "grad_norm": 0.4751994013786316, |
| "learning_rate": 3.450650104659493e-05, |
| "loss": 0.2048, |
| "mean_token_accuracy": 0.9150430023670196, |
| "num_tokens": 28256696.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 0.6262367948889732, |
| "epoch": 0.23434343434343435, |
| "grad_norm": 1.1237725019454956, |
| "learning_rate": 3.449947993417608e-05, |
| "loss": 0.2174, |
| "mean_token_accuracy": 0.9058823600411415, |
| "num_tokens": 28459175.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 0.6371082656085492, |
| "epoch": 0.23595959595959595, |
| "grad_norm": 0.3717547357082367, |
| "learning_rate": 3.4492409952997846e-05, |
| "loss": 0.2099, |
| "mean_token_accuracy": 0.9148850768804551, |
| "num_tokens": 28656115.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 0.6463325545191765, |
| "epoch": 0.23757575757575758, |
| "grad_norm": 0.8946523070335388, |
| "learning_rate": 3.4485291123384445e-05, |
| "loss": 0.2103, |
| "mean_token_accuracy": 0.9115211561322212, |
| "num_tokens": 28850698.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 0.6228535048663616, |
| "epoch": 0.2391919191919192, |
| "grad_norm": 1.0412840843200684, |
| "learning_rate": 3.447812346580053e-05, |
| "loss": 0.2099, |
| "mean_token_accuracy": 0.9091267749667168, |
| "num_tokens": 29049617.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 0.6189112387597561, |
| "epoch": 0.2408080808080808, |
| "grad_norm": 1.1727371215820312, |
| "learning_rate": 3.447090700085111e-05, |
| "loss": 0.2184, |
| "mean_token_accuracy": 0.915630365908146, |
| "num_tokens": 29250403.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 0.6186697401106358, |
| "epoch": 0.24242424242424243, |
| "grad_norm": 0.5877743363380432, |
| "learning_rate": 3.4463641749281495e-05, |
| "loss": 0.1967, |
| "mean_token_accuracy": 0.9188372850418091, |
| "num_tokens": 29450472.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.24242424242424243, |
| "eval_entropy": 0.6933836719095707, |
| "eval_loss": 0.19674202799797058, |
| "eval_mean_token_accuracy": 0.9202139660716057, |
| "eval_num_tokens": 29450472.0, |
| "eval_runtime": 107.0097, |
| "eval_samples_per_second": 9.345, |
| "eval_steps_per_second": 9.345, |
| "step": 1500 |
| }, |
| { |
| "entropy": 0.6295811668038368, |
| "epoch": 0.24404040404040403, |
| "grad_norm": 1.9190826416015625, |
| "learning_rate": 3.445632773197728e-05, |
| "loss": 0.2389, |
| "mean_token_accuracy": 0.9088302314281463, |
| "num_tokens": 29650737.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 0.629815524816513, |
| "epoch": 0.24565656565656566, |
| "grad_norm": 0.5053853392601013, |
| "learning_rate": 3.444896496996421e-05, |
| "loss": 0.2055, |
| "mean_token_accuracy": 0.9128904730081558, |
| "num_tokens": 29850237.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 0.6402494020760059, |
| "epoch": 0.24727272727272728, |
| "grad_norm": 1.2792322635650635, |
| "learning_rate": 3.444155348440817e-05, |
| "loss": 0.2039, |
| "mean_token_accuracy": 0.9191207081079483, |
| "num_tokens": 30045394.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 0.6441243663430214, |
| "epoch": 0.24888888888888888, |
| "grad_norm": 0.9373044967651367, |
| "learning_rate": 3.443409329661512e-05, |
| "loss": 0.2121, |
| "mean_token_accuracy": 0.9115235760807991, |
| "num_tokens": 30237651.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 0.6224880084395409, |
| "epoch": 0.2505050505050505, |
| "grad_norm": 0.5471675395965576, |
| "learning_rate": 3.442658442803101e-05, |
| "loss": 0.1991, |
| "mean_token_accuracy": 0.915025606751442, |
| "num_tokens": 30438010.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 0.6422186933457852, |
| "epoch": 0.25212121212121213, |
| "grad_norm": 2.163301944732666, |
| "learning_rate": 3.441902690024174e-05, |
| "loss": 0.2081, |
| "mean_token_accuracy": 0.9188799828290939, |
| "num_tokens": 30631524.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 0.6336139030754566, |
| "epoch": 0.25373737373737376, |
| "grad_norm": 0.5002826452255249, |
| "learning_rate": 3.44114207349731e-05, |
| "loss": 0.1961, |
| "mean_token_accuracy": 0.9159276992082596, |
| "num_tokens": 30827142.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 0.6471013821661472, |
| "epoch": 0.25535353535353533, |
| "grad_norm": 0.5277097225189209, |
| "learning_rate": 3.440376595409068e-05, |
| "loss": 0.2112, |
| "mean_token_accuracy": 0.9091186106204987, |
| "num_tokens": 31018112.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 0.6442371979355812, |
| "epoch": 0.25696969696969696, |
| "grad_norm": 3.8261702060699463, |
| "learning_rate": 3.4396062579599855e-05, |
| "loss": 0.2012, |
| "mean_token_accuracy": 0.9144791051745415, |
| "num_tokens": 31211638.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 0.6473165228962898, |
| "epoch": 0.2585858585858586, |
| "grad_norm": 0.6906265020370483, |
| "learning_rate": 3.438831063364566e-05, |
| "loss": 0.2015, |
| "mean_token_accuracy": 0.9193555861711502, |
| "num_tokens": 31403481.0, |
| "step": 1600 |
| }, |
| { |
| "entropy": 0.6608973711729049, |
| "epoch": 0.2602020202020202, |
| "grad_norm": 1.506940484046936, |
| "learning_rate": 3.4380510138512785e-05, |
| "loss": 0.1908, |
| "mean_token_accuracy": 0.9326013177633286, |
| "num_tokens": 31592824.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 0.6413066022098064, |
| "epoch": 0.26181818181818184, |
| "grad_norm": 1.2020001411437988, |
| "learning_rate": 3.437266111662548e-05, |
| "loss": 0.2001, |
| "mean_token_accuracy": 0.9156825304031372, |
| "num_tokens": 31785892.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 0.6559233710169792, |
| "epoch": 0.2634343434343434, |
| "grad_norm": 1.3252395391464233, |
| "learning_rate": 3.436476359054747e-05, |
| "loss": 0.2206, |
| "mean_token_accuracy": 0.9107002332806587, |
| "num_tokens": 31975303.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 0.6213893130421638, |
| "epoch": 0.26505050505050504, |
| "grad_norm": 0.9649196267127991, |
| "learning_rate": 3.435681758298196e-05, |
| "loss": 0.2396, |
| "mean_token_accuracy": 0.9061437651515007, |
| "num_tokens": 32176608.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 0.6706323623657227, |
| "epoch": 0.26666666666666666, |
| "grad_norm": 0.7337570786476135, |
| "learning_rate": 3.43488231167715e-05, |
| "loss": 0.1941, |
| "mean_token_accuracy": 0.9181428372859954, |
| "num_tokens": 32365469.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 0.6438912704586983, |
| "epoch": 0.2682828282828283, |
| "grad_norm": 0.5835590958595276, |
| "learning_rate": 3.4340780214897944e-05, |
| "loss": 0.2114, |
| "mean_token_accuracy": 0.9103723153471946, |
| "num_tokens": 32558625.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 0.6183638513088227, |
| "epoch": 0.2698989898989899, |
| "grad_norm": 0.5338345170021057, |
| "learning_rate": 3.433268890048239e-05, |
| "loss": 0.212, |
| "mean_token_accuracy": 0.9069491773843765, |
| "num_tokens": 32758742.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 0.6191523462533951, |
| "epoch": 0.27151515151515154, |
| "grad_norm": 0.659136176109314, |
| "learning_rate": 3.4324549196785114e-05, |
| "loss": 0.2267, |
| "mean_token_accuracy": 0.9071033582091331, |
| "num_tokens": 32960876.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 0.6082011148333549, |
| "epoch": 0.2731313131313131, |
| "grad_norm": 0.40289682149887085, |
| "learning_rate": 3.4316361127205486e-05, |
| "loss": 0.2271, |
| "mean_token_accuracy": 0.9086646243929863, |
| "num_tokens": 33165337.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 0.6146745294332504, |
| "epoch": 0.27474747474747474, |
| "grad_norm": 1.1410349607467651, |
| "learning_rate": 3.4308124715281916e-05, |
| "loss": 0.2113, |
| "mean_token_accuracy": 0.9149732172489167, |
| "num_tokens": 33367594.0, |
| "step": 1700 |
| }, |
| { |
| "entropy": 0.6495476797223091, |
| "epoch": 0.27636363636363637, |
| "grad_norm": 2.567011594772339, |
| "learning_rate": 3.42998399846918e-05, |
| "loss": 0.2199, |
| "mean_token_accuracy": 0.910026653110981, |
| "num_tokens": 33558500.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 0.6399842962622643, |
| "epoch": 0.277979797979798, |
| "grad_norm": 0.49004054069519043, |
| "learning_rate": 3.429150695925142e-05, |
| "loss": 0.196, |
| "mean_token_accuracy": 0.912572592496872, |
| "num_tokens": 33756919.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 0.6292979188263417, |
| "epoch": 0.2795959595959596, |
| "grad_norm": 0.6200940012931824, |
| "learning_rate": 3.4283125662915895e-05, |
| "loss": 0.1993, |
| "mean_token_accuracy": 0.9184976890683174, |
| "num_tokens": 33955439.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 0.6242897778749465, |
| "epoch": 0.2812121212121212, |
| "grad_norm": 0.46822234988212585, |
| "learning_rate": 3.427469611977912e-05, |
| "loss": 0.214, |
| "mean_token_accuracy": 0.9052839800715446, |
| "num_tokens": 34151935.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 0.6312501393258572, |
| "epoch": 0.2828282828282828, |
| "grad_norm": 0.6502590775489807, |
| "learning_rate": 3.426621835407367e-05, |
| "loss": 0.1963, |
| "mean_token_accuracy": 0.9219602450728417, |
| "num_tokens": 34347358.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 0.6140806257724762, |
| "epoch": 0.28444444444444444, |
| "grad_norm": 0.7690215706825256, |
| "learning_rate": 3.425769239017077e-05, |
| "loss": 0.2079, |
| "mean_token_accuracy": 0.9129775792360306, |
| "num_tokens": 34549789.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 0.6185090109705925, |
| "epoch": 0.28606060606060607, |
| "grad_norm": 0.5466774702072144, |
| "learning_rate": 3.424911825258016e-05, |
| "loss": 0.2121, |
| "mean_token_accuracy": 0.9113013401627541, |
| "num_tokens": 34750972.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 0.6002763979136944, |
| "epoch": 0.2876767676767677, |
| "grad_norm": 0.599219799041748, |
| "learning_rate": 3.4240495965950124e-05, |
| "loss": 0.2317, |
| "mean_token_accuracy": 0.9014593094587326, |
| "num_tokens": 34956175.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 0.6477742247283459, |
| "epoch": 0.28929292929292927, |
| "grad_norm": 0.6608160138130188, |
| "learning_rate": 3.42318255550673e-05, |
| "loss": 0.2057, |
| "mean_token_accuracy": 0.9176206976175308, |
| "num_tokens": 35148784.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 0.6291337795555592, |
| "epoch": 0.2909090909090909, |
| "grad_norm": 2.33884596824646, |
| "learning_rate": 3.422310704485672e-05, |
| "loss": 0.199, |
| "mean_token_accuracy": 0.9241131842136383, |
| "num_tokens": 35345505.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.2909090909090909, |
| "eval_entropy": 0.6954512582421303, |
| "eval_loss": 0.1912153661251068, |
| "eval_mean_token_accuracy": 0.9204178621172905, |
| "eval_num_tokens": 35345505.0, |
| "eval_runtime": 107.4175, |
| "eval_samples_per_second": 9.309, |
| "eval_steps_per_second": 9.309, |
| "step": 1800 |
| }, |
| { |
| "entropy": 0.6612498946487904, |
| "epoch": 0.2925252525252525, |
| "grad_norm": 0.5786094069480896, |
| "learning_rate": 3.421434046038165e-05, |
| "loss": 0.1898, |
| "mean_token_accuracy": 0.9201330304145813, |
| "num_tokens": 35534904.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 0.6251860365271569, |
| "epoch": 0.29414141414141415, |
| "grad_norm": 0.6408361196517944, |
| "learning_rate": 3.4205525826843576e-05, |
| "loss": 0.227, |
| "mean_token_accuracy": 0.9126361146569252, |
| "num_tokens": 35735057.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 0.6235713072121143, |
| "epoch": 0.2957575757575758, |
| "grad_norm": 0.4577428102493286, |
| "learning_rate": 3.4196663169582125e-05, |
| "loss": 0.2142, |
| "mean_token_accuracy": 0.9085043162107468, |
| "num_tokens": 35935733.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 0.6569493114948273, |
| "epoch": 0.2973737373737374, |
| "grad_norm": 0.7934256196022034, |
| "learning_rate": 3.4187752514074955e-05, |
| "loss": 0.2143, |
| "mean_token_accuracy": 0.9169334411621094, |
| "num_tokens": 36125039.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 0.6233655147254467, |
| "epoch": 0.298989898989899, |
| "grad_norm": 0.4999740719795227, |
| "learning_rate": 3.4178793885937705e-05, |
| "loss": 0.2174, |
| "mean_token_accuracy": 0.908802704513073, |
| "num_tokens": 36324044.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 0.6451633021235466, |
| "epoch": 0.3006060606060606, |
| "grad_norm": 0.720961332321167, |
| "learning_rate": 3.416978731092394e-05, |
| "loss": 0.192, |
| "mean_token_accuracy": 0.9189713701605797, |
| "num_tokens": 36515735.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 0.6397333383560181, |
| "epoch": 0.3022222222222222, |
| "grad_norm": 2.212775230407715, |
| "learning_rate": 3.416073281492504e-05, |
| "loss": 0.2236, |
| "mean_token_accuracy": 0.9063498586416244, |
| "num_tokens": 36712700.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 0.6438430123031139, |
| "epoch": 0.30383838383838385, |
| "grad_norm": 0.5264093279838562, |
| "learning_rate": 3.4151630423970153e-05, |
| "loss": 0.1993, |
| "mean_token_accuracy": 0.91684380620718, |
| "num_tokens": 36908572.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 0.6554016143083572, |
| "epoch": 0.3054545454545455, |
| "grad_norm": 0.41331690549850464, |
| "learning_rate": 3.414248016422613e-05, |
| "loss": 0.2099, |
| "mean_token_accuracy": 0.9113001599907875, |
| "num_tokens": 37101597.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 0.6380980283021926, |
| "epoch": 0.30707070707070705, |
| "grad_norm": 0.4134485721588135, |
| "learning_rate": 3.413328206199739e-05, |
| "loss": 0.2273, |
| "mean_token_accuracy": 0.9148691385984421, |
| "num_tokens": 37300456.0, |
| "step": 1900 |
| }, |
| { |
| "entropy": 0.6408122353255749, |
| "epoch": 0.3086868686868687, |
| "grad_norm": 2.0843093395233154, |
| "learning_rate": 3.412403614372592e-05, |
| "loss": 0.2067, |
| "mean_token_accuracy": 0.9162870794534683, |
| "num_tokens": 37496092.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 0.6569594420492649, |
| "epoch": 0.3103030303030303, |
| "grad_norm": 1.7086161375045776, |
| "learning_rate": 3.411474243599116e-05, |
| "loss": 0.2071, |
| "mean_token_accuracy": 0.9168068438768386, |
| "num_tokens": 37685676.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 0.6387952454388142, |
| "epoch": 0.31191919191919193, |
| "grad_norm": 0.40981894731521606, |
| "learning_rate": 3.4105400965509906e-05, |
| "loss": 0.1963, |
| "mean_token_accuracy": 0.9146794468164444, |
| "num_tokens": 37882882.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 0.6661320142447948, |
| "epoch": 0.31353535353535356, |
| "grad_norm": 0.6330037713050842, |
| "learning_rate": 3.40960117591363e-05, |
| "loss": 0.1991, |
| "mean_token_accuracy": 0.9190028592944145, |
| "num_tokens": 38069595.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 0.6375580407679081, |
| "epoch": 0.3151515151515151, |
| "grad_norm": 0.5483776926994324, |
| "learning_rate": 3.408657484386168e-05, |
| "loss": 0.2166, |
| "mean_token_accuracy": 0.9116458058357239, |
| "num_tokens": 38267168.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 0.6482452765107155, |
| "epoch": 0.31676767676767675, |
| "grad_norm": 2.778130531311035, |
| "learning_rate": 3.4077090246814544e-05, |
| "loss": 0.2098, |
| "mean_token_accuracy": 0.9136871173977852, |
| "num_tokens": 38461292.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 0.674566724896431, |
| "epoch": 0.3183838383838384, |
| "grad_norm": 1.4035710096359253, |
| "learning_rate": 3.406755799526046e-05, |
| "loss": 0.1931, |
| "mean_token_accuracy": 0.9202602237462998, |
| "num_tokens": 38647952.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 0.6578912548720837, |
| "epoch": 0.32, |
| "grad_norm": 0.8172881603240967, |
| "learning_rate": 3.405797811660199e-05, |
| "loss": 0.1821, |
| "mean_token_accuracy": 0.9241382718086243, |
| "num_tokens": 38838677.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 0.6281578689813614, |
| "epoch": 0.32161616161616163, |
| "grad_norm": 0.9093276858329773, |
| "learning_rate": 3.4048350638378606e-05, |
| "loss": 0.2202, |
| "mean_token_accuracy": 0.9049888834357261, |
| "num_tokens": 39035872.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 0.6333778567612172, |
| "epoch": 0.32323232323232326, |
| "grad_norm": 0.880991518497467, |
| "learning_rate": 3.403867558826663e-05, |
| "loss": 0.2041, |
| "mean_token_accuracy": 0.9181326001882553, |
| "num_tokens": 39235091.0, |
| "step": 2000 |
| }, |
| { |
| "entropy": 0.6394575096666812, |
| "epoch": 0.32484848484848483, |
| "grad_norm": 0.5308339595794678, |
| "learning_rate": 3.402895299407913e-05, |
| "loss": 0.1872, |
| "mean_token_accuracy": 0.9277254670858384, |
| "num_tokens": 39429550.0, |
| "step": 2010 |
| }, |
| { |
| "entropy": 0.6256730824708938, |
| "epoch": 0.32646464646464646, |
| "grad_norm": 0.37278324365615845, |
| "learning_rate": 3.4019182883765844e-05, |
| "loss": 0.1827, |
| "mean_token_accuracy": 0.9239986538887024, |
| "num_tokens": 39629223.0, |
| "step": 2020 |
| }, |
| { |
| "entropy": 0.6522976204752922, |
| "epoch": 0.3280808080808081, |
| "grad_norm": 0.3719552457332611, |
| "learning_rate": 3.400936528541311e-05, |
| "loss": 0.215, |
| "mean_token_accuracy": 0.912938816845417, |
| "num_tokens": 39820805.0, |
| "step": 2030 |
| }, |
| { |
| "entropy": 0.6549306355416775, |
| "epoch": 0.3296969696969697, |
| "grad_norm": 0.9718757271766663, |
| "learning_rate": 3.399950022724379e-05, |
| "loss": 0.1936, |
| "mean_token_accuracy": 0.9173328787088394, |
| "num_tokens": 40013643.0, |
| "step": 2040 |
| }, |
| { |
| "entropy": 0.6376727804541588, |
| "epoch": 0.33131313131313134, |
| "grad_norm": 2.6161608695983887, |
| "learning_rate": 3.398958773761717e-05, |
| "loss": 0.1937, |
| "mean_token_accuracy": 0.918268159031868, |
| "num_tokens": 40213829.0, |
| "step": 2050 |
| }, |
| { |
| "entropy": 0.6503816410899163, |
| "epoch": 0.3329292929292929, |
| "grad_norm": 0.4555739760398865, |
| "learning_rate": 3.3979627845028884e-05, |
| "loss": 0.1945, |
| "mean_token_accuracy": 0.9175498813390732, |
| "num_tokens": 40408625.0, |
| "step": 2060 |
| }, |
| { |
| "entropy": 0.6709971487522125, |
| "epoch": 0.33454545454545453, |
| "grad_norm": 0.30291101336479187, |
| "learning_rate": 3.396962057811085e-05, |
| "loss": 0.1906, |
| "mean_token_accuracy": 0.923813234269619, |
| "num_tokens": 40596756.0, |
| "step": 2070 |
| }, |
| { |
| "entropy": 0.6471885599195957, |
| "epoch": 0.33616161616161616, |
| "grad_norm": 1.0948461294174194, |
| "learning_rate": 3.395956596563117e-05, |
| "loss": 0.2153, |
| "mean_token_accuracy": 0.909675869345665, |
| "num_tokens": 40791510.0, |
| "step": 2080 |
| }, |
| { |
| "entropy": 0.6256736367940903, |
| "epoch": 0.3377777777777778, |
| "grad_norm": 0.5939026474952698, |
| "learning_rate": 3.394946403649405e-05, |
| "loss": 0.21, |
| "mean_token_accuracy": 0.9117529839277267, |
| "num_tokens": 40990851.0, |
| "step": 2090 |
| }, |
| { |
| "entropy": 0.6239214479923249, |
| "epoch": 0.3393939393939394, |
| "grad_norm": 0.7415814995765686, |
| "learning_rate": 3.3939314819739696e-05, |
| "loss": 0.2074, |
| "mean_token_accuracy": 0.912028856575489, |
| "num_tokens": 41191154.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.3393939393939394, |
| "eval_entropy": 0.6998262491822242, |
| "eval_loss": 0.1984918862581253, |
| "eval_mean_token_accuracy": 0.9189635333418846, |
| "eval_num_tokens": 41191154.0, |
| "eval_runtime": 106.9645, |
| "eval_samples_per_second": 9.349, |
| "eval_steps_per_second": 9.349, |
| "step": 2100 |
| }, |
| { |
| "entropy": 0.651825649291277, |
| "epoch": 0.341010101010101, |
| "grad_norm": 1.52384352684021, |
| "learning_rate": 3.392911834454429e-05, |
| "loss": 0.2159, |
| "mean_token_accuracy": 0.9134291365742684, |
| "num_tokens": 41384003.0, |
| "step": 2110 |
| }, |
| { |
| "entropy": 0.6138758823275566, |
| "epoch": 0.3426262626262626, |
| "grad_norm": 0.583720862865448, |
| "learning_rate": 3.391887464021985e-05, |
| "loss": 0.2322, |
| "mean_token_accuracy": 0.9039447426795959, |
| "num_tokens": 41590543.0, |
| "step": 2120 |
| }, |
| { |
| "entropy": 0.620711050182581, |
| "epoch": 0.34424242424242424, |
| "grad_norm": 0.7128543853759766, |
| "learning_rate": 3.3908583736214166e-05, |
| "loss": 0.2233, |
| "mean_token_accuracy": 0.9042671546339989, |
| "num_tokens": 41792676.0, |
| "step": 2130 |
| }, |
| { |
| "entropy": 0.6385029278695583, |
| "epoch": 0.34585858585858587, |
| "grad_norm": 0.3531809449195862, |
| "learning_rate": 3.389824566211071e-05, |
| "loss": 0.2012, |
| "mean_token_accuracy": 0.9119970798492432, |
| "num_tokens": 41989615.0, |
| "step": 2140 |
| }, |
| { |
| "entropy": 0.6404819719493389, |
| "epoch": 0.3474747474747475, |
| "grad_norm": 1.1511902809143066, |
| "learning_rate": 3.388786044762857e-05, |
| "loss": 0.1978, |
| "mean_token_accuracy": 0.9193251192569732, |
| "num_tokens": 42189774.0, |
| "step": 2150 |
| }, |
| { |
| "entropy": 0.6125321023166179, |
| "epoch": 0.3490909090909091, |
| "grad_norm": 1.0474456548690796, |
| "learning_rate": 3.387742812262232e-05, |
| "loss": 0.2248, |
| "mean_token_accuracy": 0.9014544919133186, |
| "num_tokens": 42397597.0, |
| "step": 2160 |
| }, |
| { |
| "entropy": 0.6697261020541191, |
| "epoch": 0.3507070707070707, |
| "grad_norm": 2.6372017860412598, |
| "learning_rate": 3.386694871708201e-05, |
| "loss": 0.2131, |
| "mean_token_accuracy": 0.9181707888841629, |
| "num_tokens": 42589672.0, |
| "step": 2170 |
| }, |
| { |
| "entropy": 0.6931374616920948, |
| "epoch": 0.3523232323232323, |
| "grad_norm": 0.5862610340118408, |
| "learning_rate": 3.385642226113298e-05, |
| "loss": 0.1822, |
| "mean_token_accuracy": 0.9239647507667541, |
| "num_tokens": 42773524.0, |
| "step": 2180 |
| }, |
| { |
| "entropy": 0.6293456293642521, |
| "epoch": 0.35393939393939394, |
| "grad_norm": 0.27295875549316406, |
| "learning_rate": 3.384584878503585e-05, |
| "loss": 0.2034, |
| "mean_token_accuracy": 0.9137175157666206, |
| "num_tokens": 42975451.0, |
| "step": 2190 |
| }, |
| { |
| "entropy": 0.6602464765310287, |
| "epoch": 0.35555555555555557, |
| "grad_norm": 0.38715091347694397, |
| "learning_rate": 3.383522831918644e-05, |
| "loss": 0.207, |
| "mean_token_accuracy": 0.9117765665054322, |
| "num_tokens": 43167229.0, |
| "step": 2200 |
| }, |
| { |
| "entropy": 0.6907435119152069, |
| "epoch": 0.3571717171717172, |
| "grad_norm": 0.7388333082199097, |
| "learning_rate": 3.3824560894115604e-05, |
| "loss": 0.1895, |
| "mean_token_accuracy": 0.9175972148776055, |
| "num_tokens": 43349083.0, |
| "step": 2210 |
| }, |
| { |
| "entropy": 0.641204608976841, |
| "epoch": 0.35878787878787877, |
| "grad_norm": 1.022162914276123, |
| "learning_rate": 3.3813846540489214e-05, |
| "loss": 0.2306, |
| "mean_token_accuracy": 0.9055288776755333, |
| "num_tokens": 43545849.0, |
| "step": 2220 |
| }, |
| { |
| "entropy": 0.6882462851703167, |
| "epoch": 0.3604040404040404, |
| "grad_norm": 0.6299819946289062, |
| "learning_rate": 3.3803085289108046e-05, |
| "loss": 0.1825, |
| "mean_token_accuracy": 0.9197423487901688, |
| "num_tokens": 43728913.0, |
| "step": 2230 |
| }, |
| { |
| "entropy": 0.66824596747756, |
| "epoch": 0.362020202020202, |
| "grad_norm": 0.5263814926147461, |
| "learning_rate": 3.3792277170907697e-05, |
| "loss": 0.1979, |
| "mean_token_accuracy": 0.9203566953539848, |
| "num_tokens": 43920222.0, |
| "step": 2240 |
| }, |
| { |
| "entropy": 0.6338320925831795, |
| "epoch": 0.36363636363636365, |
| "grad_norm": 0.5668702125549316, |
| "learning_rate": 3.378142221695848e-05, |
| "loss": 0.2046, |
| "mean_token_accuracy": 0.9108695581555366, |
| "num_tokens": 44120455.0, |
| "step": 2250 |
| }, |
| { |
| "entropy": 0.6575196944177151, |
| "epoch": 0.3652525252525253, |
| "grad_norm": 0.3751061260700226, |
| "learning_rate": 3.377052045846537e-05, |
| "loss": 0.2018, |
| "mean_token_accuracy": 0.913050290942192, |
| "num_tokens": 44314078.0, |
| "step": 2260 |
| }, |
| { |
| "entropy": 0.6516093999147415, |
| "epoch": 0.36686868686868684, |
| "grad_norm": 2.1195967197418213, |
| "learning_rate": 3.375957192676788e-05, |
| "loss": 0.1907, |
| "mean_token_accuracy": 0.9181628227233887, |
| "num_tokens": 44509018.0, |
| "step": 2270 |
| }, |
| { |
| "entropy": 0.6486793115735054, |
| "epoch": 0.36848484848484847, |
| "grad_norm": 0.49226877093315125, |
| "learning_rate": 3.374857665333997e-05, |
| "loss": 0.184, |
| "mean_token_accuracy": 0.9247466400265694, |
| "num_tokens": 44705376.0, |
| "step": 2280 |
| }, |
| { |
| "entropy": 0.6480357632040977, |
| "epoch": 0.3701010101010101, |
| "grad_norm": 0.4141157567501068, |
| "learning_rate": 3.373753466978999e-05, |
| "loss": 0.209, |
| "mean_token_accuracy": 0.9143023118376732, |
| "num_tokens": 44902479.0, |
| "step": 2290 |
| }, |
| { |
| "entropy": 0.6157522663474083, |
| "epoch": 0.3717171717171717, |
| "grad_norm": 0.41298553347587585, |
| "learning_rate": 3.3726446007860556e-05, |
| "loss": 0.199, |
| "mean_token_accuracy": 0.9163665294647216, |
| "num_tokens": 45110713.0, |
| "step": 2300 |
| }, |
| { |
| "entropy": 0.6643727004528046, |
| "epoch": 0.37333333333333335, |
| "grad_norm": 0.6356022357940674, |
| "learning_rate": 3.3715310699428484e-05, |
| "loss": 0.2009, |
| "mean_token_accuracy": 0.9170802712440491, |
| "num_tokens": 45300077.0, |
| "step": 2310 |
| }, |
| { |
| "entropy": 0.6490449465811252, |
| "epoch": 0.374949494949495, |
| "grad_norm": 0.4972356855869293, |
| "learning_rate": 3.370412877650467e-05, |
| "loss": 0.2056, |
| "mean_token_accuracy": 0.9179009929299354, |
| "num_tokens": 45494685.0, |
| "step": 2320 |
| }, |
| { |
| "entropy": 0.6338639289140702, |
| "epoch": 0.37656565656565655, |
| "grad_norm": 0.3716679811477661, |
| "learning_rate": 3.369290027123402e-05, |
| "loss": 0.1985, |
| "mean_token_accuracy": 0.9173924580216408, |
| "num_tokens": 45695194.0, |
| "step": 2330 |
| }, |
| { |
| "entropy": 0.6402531735599041, |
| "epoch": 0.3781818181818182, |
| "grad_norm": 3.9543232917785645, |
| "learning_rate": 3.368162521589536e-05, |
| "loss": 0.198, |
| "mean_token_accuracy": 0.9143711969256401, |
| "num_tokens": 45894902.0, |
| "step": 2340 |
| }, |
| { |
| "entropy": 0.6678831689059734, |
| "epoch": 0.3797979797979798, |
| "grad_norm": 0.3957677185535431, |
| "learning_rate": 3.367030364290132e-05, |
| "loss": 0.1814, |
| "mean_token_accuracy": 0.921611288189888, |
| "num_tokens": 46084398.0, |
| "step": 2350 |
| }, |
| { |
| "entropy": 0.6259508073329926, |
| "epoch": 0.3814141414141414, |
| "grad_norm": 1.1085342168807983, |
| "learning_rate": 3.3658935584798255e-05, |
| "loss": 0.2234, |
| "mean_token_accuracy": 0.9109225928783417, |
| "num_tokens": 46288730.0, |
| "step": 2360 |
| }, |
| { |
| "entropy": 0.645773047208786, |
| "epoch": 0.38303030303030305, |
| "grad_norm": 0.8026268482208252, |
| "learning_rate": 3.364752107426618e-05, |
| "loss": 0.1937, |
| "mean_token_accuracy": 0.9164649412035942, |
| "num_tokens": 46484836.0, |
| "step": 2370 |
| }, |
| { |
| "entropy": 0.6290055423974991, |
| "epoch": 0.3846464646464646, |
| "grad_norm": 0.4032319188117981, |
| "learning_rate": 3.363606014411861e-05, |
| "loss": 0.211, |
| "mean_token_accuracy": 0.9055109471082687, |
| "num_tokens": 46686427.0, |
| "step": 2380 |
| }, |
| { |
| "entropy": 0.633245699852705, |
| "epoch": 0.38626262626262625, |
| "grad_norm": 0.4805126488208771, |
| "learning_rate": 3.362455282730252e-05, |
| "loss": 0.2138, |
| "mean_token_accuracy": 0.9034733757376671, |
| "num_tokens": 46887489.0, |
| "step": 2390 |
| }, |
| { |
| "entropy": 0.6522969849407673, |
| "epoch": 0.3878787878787879, |
| "grad_norm": 1.1404281854629517, |
| "learning_rate": 3.361299915689824e-05, |
| "loss": 0.2043, |
| "mean_token_accuracy": 0.9089002668857574, |
| "num_tokens": 47083820.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.3878787878787879, |
| "eval_entropy": 0.7113876877129078, |
| "eval_loss": 0.18623696267604828, |
| "eval_mean_token_accuracy": 0.921599247455597, |
| "eval_num_tokens": 47083820.0, |
| "eval_runtime": 107.4465, |
| "eval_samples_per_second": 9.307, |
| "eval_steps_per_second": 9.307, |
| "step": 2400 |
| }, |
| { |
| "entropy": 0.6595964454114437, |
| "epoch": 0.3894949494949495, |
| "grad_norm": 1.8339920043945312, |
| "learning_rate": 3.360139916611934e-05, |
| "loss": 0.2007, |
| "mean_token_accuracy": 0.9141246557235718, |
| "num_tokens": 47277248.0, |
| "step": 2410 |
| }, |
| { |
| "entropy": 0.6597648195922374, |
| "epoch": 0.39111111111111113, |
| "grad_norm": 0.3694554269313812, |
| "learning_rate": 3.358975288831256e-05, |
| "loss": 0.1863, |
| "mean_token_accuracy": 0.9253092393279075, |
| "num_tokens": 47470531.0, |
| "step": 2420 |
| }, |
| { |
| "entropy": 0.6970138788223267, |
| "epoch": 0.3927272727272727, |
| "grad_norm": 0.5019353032112122, |
| "learning_rate": 3.35780603569577e-05, |
| "loss": 0.1963, |
| "mean_token_accuracy": 0.9200589671730995, |
| "num_tokens": 47652005.0, |
| "step": 2430 |
| }, |
| { |
| "entropy": 0.6489242658019065, |
| "epoch": 0.39434343434343433, |
| "grad_norm": 1.1797176599502563, |
| "learning_rate": 3.356632160566752e-05, |
| "loss": 0.2294, |
| "mean_token_accuracy": 0.9104439795017243, |
| "num_tokens": 47847890.0, |
| "step": 2440 |
| }, |
| { |
| "entropy": 0.6623602546751499, |
| "epoch": 0.39595959595959596, |
| "grad_norm": 0.25269877910614014, |
| "learning_rate": 3.355453666818765e-05, |
| "loss": 0.1866, |
| "mean_token_accuracy": 0.9208436816930771, |
| "num_tokens": 48044273.0, |
| "step": 2450 |
| }, |
| { |
| "entropy": 0.6277916461229325, |
| "epoch": 0.3975757575757576, |
| "grad_norm": 0.38579049706459045, |
| "learning_rate": 3.35427055783965e-05, |
| "loss": 0.2269, |
| "mean_token_accuracy": 0.9083513021469116, |
| "num_tokens": 48248754.0, |
| "step": 2460 |
| }, |
| { |
| "entropy": 0.6581358797848225, |
| "epoch": 0.3991919191919192, |
| "grad_norm": 2.619389533996582, |
| "learning_rate": 3.3530828370305157e-05, |
| "loss": 0.1934, |
| "mean_token_accuracy": 0.9179710894823074, |
| "num_tokens": 48440794.0, |
| "step": 2470 |
| }, |
| { |
| "entropy": 0.6409321166574955, |
| "epoch": 0.40080808080808084, |
| "grad_norm": 0.5129874348640442, |
| "learning_rate": 3.3518905078057266e-05, |
| "loss": 0.2087, |
| "mean_token_accuracy": 0.9138506144285202, |
| "num_tokens": 48636796.0, |
| "step": 2480 |
| }, |
| { |
| "entropy": 0.6473928295075894, |
| "epoch": 0.4024242424242424, |
| "grad_norm": 0.4147140085697174, |
| "learning_rate": 3.3506935735928976e-05, |
| "loss": 0.195, |
| "mean_token_accuracy": 0.9118334755301476, |
| "num_tokens": 48830484.0, |
| "step": 2490 |
| }, |
| { |
| "entropy": 0.6263864435255527, |
| "epoch": 0.40404040404040403, |
| "grad_norm": 0.4629824757575989, |
| "learning_rate": 3.349492037832879e-05, |
| "loss": 0.2279, |
| "mean_token_accuracy": 0.9032794684171677, |
| "num_tokens": 49032476.0, |
| "step": 2500 |
| }, |
| { |
| "entropy": 0.6438064053654671, |
| "epoch": 0.40565656565656566, |
| "grad_norm": 1.7570719718933105, |
| "learning_rate": 3.3482859039797516e-05, |
| "loss": 0.1957, |
| "mean_token_accuracy": 0.915298792719841, |
| "num_tokens": 49228395.0, |
| "step": 2510 |
| }, |
| { |
| "entropy": 0.6471666194498539, |
| "epoch": 0.4072727272727273, |
| "grad_norm": 0.5772570967674255, |
| "learning_rate": 3.3470751755008134e-05, |
| "loss": 0.2092, |
| "mean_token_accuracy": 0.9125421524047852, |
| "num_tokens": 49424437.0, |
| "step": 2520 |
| }, |
| { |
| "entropy": 0.6377446949481964, |
| "epoch": 0.4088888888888889, |
| "grad_norm": 0.8318358063697815, |
| "learning_rate": 3.345859855876571e-05, |
| "loss": 0.1897, |
| "mean_token_accuracy": 0.921054682135582, |
| "num_tokens": 49621544.0, |
| "step": 2530 |
| }, |
| { |
| "entropy": 0.641574353724718, |
| "epoch": 0.4105050505050505, |
| "grad_norm": 0.5469814538955688, |
| "learning_rate": 3.344639948600729e-05, |
| "loss": 0.1909, |
| "mean_token_accuracy": 0.9170561015605927, |
| "num_tokens": 49818401.0, |
| "step": 2540 |
| }, |
| { |
| "entropy": 0.6623684763908386, |
| "epoch": 0.4121212121212121, |
| "grad_norm": 1.5764508247375488, |
| "learning_rate": 3.34341545718018e-05, |
| "loss": 0.1956, |
| "mean_token_accuracy": 0.9195842504501343, |
| "num_tokens": 50011755.0, |
| "step": 2550 |
| }, |
| { |
| "entropy": 0.6670712187886239, |
| "epoch": 0.41373737373737374, |
| "grad_norm": 0.5596409440040588, |
| "learning_rate": 3.342186385134995e-05, |
| "loss": 0.2234, |
| "mean_token_accuracy": 0.9089864656329155, |
| "num_tokens": 50202582.0, |
| "step": 2560 |
| }, |
| { |
| "entropy": 0.6491748370230198, |
| "epoch": 0.41535353535353536, |
| "grad_norm": 0.8959507942199707, |
| "learning_rate": 3.340952735998413e-05, |
| "loss": 0.2093, |
| "mean_token_accuracy": 0.9129490926861763, |
| "num_tokens": 50398268.0, |
| "step": 2570 |
| }, |
| { |
| "entropy": 0.6661599427461624, |
| "epoch": 0.416969696969697, |
| "grad_norm": 0.45862722396850586, |
| "learning_rate": 3.339714513316831e-05, |
| "loss": 0.1963, |
| "mean_token_accuracy": 0.918558169901371, |
| "num_tokens": 50589126.0, |
| "step": 2580 |
| }, |
| { |
| "entropy": 0.6533463180065155, |
| "epoch": 0.41858585858585856, |
| "grad_norm": 0.2936107814311981, |
| "learning_rate": 3.338471720649795e-05, |
| "loss": 0.2197, |
| "mean_token_accuracy": 0.9082069784402848, |
| "num_tokens": 50786106.0, |
| "step": 2590 |
| }, |
| { |
| "entropy": 0.646408773958683, |
| "epoch": 0.4202020202020202, |
| "grad_norm": 0.9799319505691528, |
| "learning_rate": 3.337224361569984e-05, |
| "loss": 0.2195, |
| "mean_token_accuracy": 0.9127225950360298, |
| "num_tokens": 50985940.0, |
| "step": 2600 |
| }, |
| { |
| "entropy": 0.6364965550601482, |
| "epoch": 0.4218181818181818, |
| "grad_norm": 0.6174752712249756, |
| "learning_rate": 3.33597243966321e-05, |
| "loss": 0.2215, |
| "mean_token_accuracy": 0.9099404335021972, |
| "num_tokens": 51185481.0, |
| "step": 2610 |
| }, |
| { |
| "entropy": 0.6501048274338246, |
| "epoch": 0.42343434343434344, |
| "grad_norm": 0.3903461694717407, |
| "learning_rate": 3.334715958528397e-05, |
| "loss": 0.2021, |
| "mean_token_accuracy": 0.9131413713097573, |
| "num_tokens": 51381030.0, |
| "step": 2620 |
| }, |
| { |
| "entropy": 0.626953698694706, |
| "epoch": 0.42505050505050507, |
| "grad_norm": 0.45153191685676575, |
| "learning_rate": 3.3334549217775794e-05, |
| "loss": 0.2098, |
| "mean_token_accuracy": 0.9122917667031288, |
| "num_tokens": 51581647.0, |
| "step": 2630 |
| }, |
| { |
| "entropy": 0.6413272753357887, |
| "epoch": 0.4266666666666667, |
| "grad_norm": 0.7007961869239807, |
| "learning_rate": 3.332189333035883e-05, |
| "loss": 0.2002, |
| "mean_token_accuracy": 0.9182598188519477, |
| "num_tokens": 51777371.0, |
| "step": 2640 |
| }, |
| { |
| "entropy": 0.6396277464926243, |
| "epoch": 0.42828282828282827, |
| "grad_norm": 1.0117080211639404, |
| "learning_rate": 3.330919195941525e-05, |
| "loss": 0.2046, |
| "mean_token_accuracy": 0.9140521854162216, |
| "num_tokens": 51976736.0, |
| "step": 2650 |
| }, |
| { |
| "entropy": 0.6547877825796604, |
| "epoch": 0.4298989898989899, |
| "grad_norm": 1.506662130355835, |
| "learning_rate": 3.3296445141457936e-05, |
| "loss": 0.1826, |
| "mean_token_accuracy": 0.922282612323761, |
| "num_tokens": 52173296.0, |
| "step": 2660 |
| }, |
| { |
| "entropy": 0.6676691561937332, |
| "epoch": 0.4315151515151515, |
| "grad_norm": 0.3631212115287781, |
| "learning_rate": 3.328365291313044e-05, |
| "loss": 0.1981, |
| "mean_token_accuracy": 0.9073918789625168, |
| "num_tokens": 52363050.0, |
| "step": 2670 |
| }, |
| { |
| "entropy": 0.6451698906719685, |
| "epoch": 0.43313131313131314, |
| "grad_norm": 1.5293185710906982, |
| "learning_rate": 3.327081531120684e-05, |
| "loss": 0.2149, |
| "mean_token_accuracy": 0.9066063776612282, |
| "num_tokens": 52559410.0, |
| "step": 2680 |
| }, |
| { |
| "entropy": 0.6223693639039993, |
| "epoch": 0.43474747474747477, |
| "grad_norm": 0.3108581304550171, |
| "learning_rate": 3.325793237259165e-05, |
| "loss": 0.2094, |
| "mean_token_accuracy": 0.9071380957961083, |
| "num_tokens": 52763857.0, |
| "step": 2690 |
| }, |
| { |
| "entropy": 0.6278824508190155, |
| "epoch": 0.43636363636363634, |
| "grad_norm": 0.30205193161964417, |
| "learning_rate": 3.324500413431974e-05, |
| "loss": 0.2072, |
| "mean_token_accuracy": 0.9219203218817711, |
| "num_tokens": 52965075.0, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.43636363636363634, |
| "eval_entropy": 0.7081856344938278, |
| "eval_loss": 0.18622006475925446, |
| "eval_mean_token_accuracy": 0.920462992310524, |
| "eval_num_tokens": 52965075.0, |
| "eval_runtime": 108.4723, |
| "eval_samples_per_second": 9.219, |
| "eval_steps_per_second": 9.219, |
| "step": 2700 |
| }, |
| { |
| "entropy": 0.6290528163313865, |
| "epoch": 0.43797979797979797, |
| "grad_norm": 0.9835038781166077, |
| "learning_rate": 3.323203063355618e-05, |
| "loss": 0.22, |
| "mean_token_accuracy": 0.9061377301812172, |
| "num_tokens": 53167823.0, |
| "step": 2710 |
| }, |
| { |
| "entropy": 0.6957326963543892, |
| "epoch": 0.4395959595959596, |
| "grad_norm": 3.408257484436035, |
| "learning_rate": 3.3219011907596154e-05, |
| "loss": 0.201, |
| "mean_token_accuracy": 0.9151417210698127, |
| "num_tokens": 53351450.0, |
| "step": 2720 |
| }, |
| { |
| "entropy": 0.6469338901340962, |
| "epoch": 0.4412121212121212, |
| "grad_norm": 1.0536452531814575, |
| "learning_rate": 3.3205947993864884e-05, |
| "loss": 0.214, |
| "mean_token_accuracy": 0.914907206594944, |
| "num_tokens": 53549942.0, |
| "step": 2730 |
| }, |
| { |
| "entropy": 0.6235164746642112, |
| "epoch": 0.44282828282828285, |
| "grad_norm": 0.4027160406112671, |
| "learning_rate": 3.3192838929917455e-05, |
| "loss": 0.2132, |
| "mean_token_accuracy": 0.917492838203907, |
| "num_tokens": 53752441.0, |
| "step": 2740 |
| }, |
| { |
| "entropy": 0.6557296566665173, |
| "epoch": 0.4444444444444444, |
| "grad_norm": 0.6262427568435669, |
| "learning_rate": 3.317968475343877e-05, |
| "loss": 0.2142, |
| "mean_token_accuracy": 0.9105125978589058, |
| "num_tokens": 53946273.0, |
| "step": 2750 |
| }, |
| { |
| "entropy": 0.6702249869704247, |
| "epoch": 0.44606060606060605, |
| "grad_norm": 0.38556161522865295, |
| "learning_rate": 3.316648550224342e-05, |
| "loss": 0.1995, |
| "mean_token_accuracy": 0.9102703362703324, |
| "num_tokens": 54137777.0, |
| "step": 2760 |
| }, |
| { |
| "entropy": 0.657077157497406, |
| "epoch": 0.4476767676767677, |
| "grad_norm": 1.3603203296661377, |
| "learning_rate": 3.315324121427557e-05, |
| "loss": 0.2007, |
| "mean_token_accuracy": 0.9166198000311852, |
| "num_tokens": 54332311.0, |
| "step": 2770 |
| }, |
| { |
| "entropy": 0.6409480340778828, |
| "epoch": 0.4492929292929293, |
| "grad_norm": 0.40199100971221924, |
| "learning_rate": 3.3139951927608844e-05, |
| "loss": 0.216, |
| "mean_token_accuracy": 0.9097805380821228, |
| "num_tokens": 54534578.0, |
| "step": 2780 |
| }, |
| { |
| "entropy": 0.6437219373881817, |
| "epoch": 0.4509090909090909, |
| "grad_norm": 0.6849948167800903, |
| "learning_rate": 3.312661768044624e-05, |
| "loss": 0.2002, |
| "mean_token_accuracy": 0.913942052423954, |
| "num_tokens": 54733184.0, |
| "step": 2790 |
| }, |
| { |
| "entropy": 0.6447813183069229, |
| "epoch": 0.45252525252525255, |
| "grad_norm": 0.3659138083457947, |
| "learning_rate": 3.3113238511119986e-05, |
| "loss": 0.215, |
| "mean_token_accuracy": 0.9110051110386849, |
| "num_tokens": 54931000.0, |
| "step": 2800 |
| }, |
| { |
| "entropy": 0.6239227756857872, |
| "epoch": 0.4541414141414141, |
| "grad_norm": 0.4973262548446655, |
| "learning_rate": 3.3099814458091474e-05, |
| "loss": 0.2082, |
| "mean_token_accuracy": 0.9057865276932716, |
| "num_tokens": 55133960.0, |
| "step": 2810 |
| }, |
| { |
| "entropy": 0.6393618829548359, |
| "epoch": 0.45575757575757575, |
| "grad_norm": 0.40064364671707153, |
| "learning_rate": 3.308634555995109e-05, |
| "loss": 0.2157, |
| "mean_token_accuracy": 0.9042444139719009, |
| "num_tokens": 55331051.0, |
| "step": 2820 |
| }, |
| { |
| "entropy": 0.6447009883821011, |
| "epoch": 0.4573737373737374, |
| "grad_norm": 0.3533497452735901, |
| "learning_rate": 3.307283185541817e-05, |
| "loss": 0.2058, |
| "mean_token_accuracy": 0.9148957833647728, |
| "num_tokens": 55529610.0, |
| "step": 2830 |
| }, |
| { |
| "entropy": 0.6316937655210495, |
| "epoch": 0.458989898989899, |
| "grad_norm": 0.23120789229869843, |
| "learning_rate": 3.305927338334084e-05, |
| "loss": 0.2047, |
| "mean_token_accuracy": 0.9060420542955399, |
| "num_tokens": 55730141.0, |
| "step": 2840 |
| }, |
| { |
| "entropy": 0.6504737250506878, |
| "epoch": 0.46060606060606063, |
| "grad_norm": 0.4959610402584076, |
| "learning_rate": 3.3045670182695905e-05, |
| "loss": 0.2039, |
| "mean_token_accuracy": 0.9110078886151314, |
| "num_tokens": 55927592.0, |
| "step": 2850 |
| }, |
| { |
| "entropy": 0.6488144524395466, |
| "epoch": 0.4622222222222222, |
| "grad_norm": 0.8291785717010498, |
| "learning_rate": 3.30320222925888e-05, |
| "loss": 0.2039, |
| "mean_token_accuracy": 0.9138716921210289, |
| "num_tokens": 56124051.0, |
| "step": 2860 |
| }, |
| { |
| "entropy": 0.6518881164491177, |
| "epoch": 0.4638383838383838, |
| "grad_norm": 0.3788766860961914, |
| "learning_rate": 3.301832975225338e-05, |
| "loss": 0.18, |
| "mean_token_accuracy": 0.9274351745843887, |
| "num_tokens": 56319141.0, |
| "step": 2870 |
| }, |
| { |
| "entropy": 0.655502051115036, |
| "epoch": 0.46545454545454545, |
| "grad_norm": 0.3806280493736267, |
| "learning_rate": 3.300459260105188e-05, |
| "loss": 0.1978, |
| "mean_token_accuracy": 0.9226665228605271, |
| "num_tokens": 56510794.0, |
| "step": 2880 |
| }, |
| { |
| "entropy": 0.6256422609090805, |
| "epoch": 0.4670707070707071, |
| "grad_norm": 0.2849152684211731, |
| "learning_rate": 3.2990810878474766e-05, |
| "loss": 0.1973, |
| "mean_token_accuracy": 0.9150432854890823, |
| "num_tokens": 56713261.0, |
| "step": 2890 |
| }, |
| { |
| "entropy": 0.6748034112155438, |
| "epoch": 0.4686868686868687, |
| "grad_norm": 0.6893365979194641, |
| "learning_rate": 3.297698462414066e-05, |
| "loss": 0.189, |
| "mean_token_accuracy": 0.914380231499672, |
| "num_tokens": 56901633.0, |
| "step": 2900 |
| }, |
| { |
| "entropy": 0.6465104728937149, |
| "epoch": 0.4703030303030303, |
| "grad_norm": 0.6201600432395935, |
| "learning_rate": 3.296311387779618e-05, |
| "loss": 0.2043, |
| "mean_token_accuracy": 0.9120270356535911, |
| "num_tokens": 57096947.0, |
| "step": 2910 |
| }, |
| { |
| "entropy": 0.6560635149478913, |
| "epoch": 0.4719191919191919, |
| "grad_norm": 0.4038831293582916, |
| "learning_rate": 3.294919867931584e-05, |
| "loss": 0.1841, |
| "mean_token_accuracy": 0.9193960189819336, |
| "num_tokens": 57291443.0, |
| "step": 2920 |
| }, |
| { |
| "entropy": 0.6498475447297096, |
| "epoch": 0.47353535353535353, |
| "grad_norm": 0.3524320423603058, |
| "learning_rate": 3.293523906870196e-05, |
| "loss": 0.1982, |
| "mean_token_accuracy": 0.9188156425952911, |
| "num_tokens": 57486365.0, |
| "step": 2930 |
| }, |
| { |
| "entropy": 0.6489122800529004, |
| "epoch": 0.47515151515151516, |
| "grad_norm": 0.34468188881874084, |
| "learning_rate": 3.292123508608451e-05, |
| "loss": 0.2095, |
| "mean_token_accuracy": 0.9128650277853012, |
| "num_tokens": 57680721.0, |
| "step": 2940 |
| }, |
| { |
| "entropy": 0.6383394666016102, |
| "epoch": 0.4767676767676768, |
| "grad_norm": 0.6034473180770874, |
| "learning_rate": 3.290718677172105e-05, |
| "loss": 0.2052, |
| "mean_token_accuracy": 0.9086369514465332, |
| "num_tokens": 57879619.0, |
| "step": 2950 |
| }, |
| { |
| "entropy": 0.6529298670589924, |
| "epoch": 0.4783838383838384, |
| "grad_norm": 0.4649339020252228, |
| "learning_rate": 3.289309416599655e-05, |
| "loss": 0.1917, |
| "mean_token_accuracy": 0.9165951952338218, |
| "num_tokens": 58074943.0, |
| "step": 2960 |
| }, |
| { |
| "entropy": 0.6440173640847207, |
| "epoch": 0.48, |
| "grad_norm": 0.3674682080745697, |
| "learning_rate": 3.287895730942333e-05, |
| "loss": 0.1879, |
| "mean_token_accuracy": 0.9216477259993553, |
| "num_tokens": 58269804.0, |
| "step": 2970 |
| }, |
| { |
| "entropy": 0.6634586147964001, |
| "epoch": 0.4816161616161616, |
| "grad_norm": 0.9639015793800354, |
| "learning_rate": 3.286477624264089e-05, |
| "loss": 0.1978, |
| "mean_token_accuracy": 0.9219118982553482, |
| "num_tokens": 58461635.0, |
| "step": 2980 |
| }, |
| { |
| "entropy": 0.6251708209514618, |
| "epoch": 0.48323232323232324, |
| "grad_norm": 0.3933483064174652, |
| "learning_rate": 3.285055100641584e-05, |
| "loss": 0.2082, |
| "mean_token_accuracy": 0.9080576360225677, |
| "num_tokens": 58662756.0, |
| "step": 2990 |
| }, |
| { |
| "entropy": 0.6697501085698605, |
| "epoch": 0.48484848484848486, |
| "grad_norm": 0.8007460236549377, |
| "learning_rate": 3.283628164164178e-05, |
| "loss": 0.201, |
| "mean_token_accuracy": 0.9163200870156288, |
| "num_tokens": 58850692.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.48484848484848486, |
| "eval_entropy": 0.7009831846058369, |
| "eval_loss": 0.18165792524814606, |
| "eval_mean_token_accuracy": 0.92101721316576, |
| "eval_num_tokens": 58850692.0, |
| "eval_runtime": 108.1573, |
| "eval_samples_per_second": 9.246, |
| "eval_steps_per_second": 9.246, |
| "step": 3000 |
| }, |
| { |
| "entropy": 0.6666446149349212, |
| "epoch": 0.4864646464646465, |
| "grad_norm": 0.9564417004585266, |
| "learning_rate": 3.282196818933913e-05, |
| "loss": 0.2069, |
| "mean_token_accuracy": 0.9102333173155784, |
| "num_tokens": 59039812.0, |
| "step": 3010 |
| }, |
| { |
| "entropy": 0.6625177763402462, |
| "epoch": 0.48808080808080806, |
| "grad_norm": 0.7849810123443604, |
| "learning_rate": 3.280761069065508e-05, |
| "loss": 0.1879, |
| "mean_token_accuracy": 0.9257912486791611, |
| "num_tokens": 59230478.0, |
| "step": 3020 |
| }, |
| { |
| "entropy": 0.6120976060628891, |
| "epoch": 0.4896969696969697, |
| "grad_norm": 0.3264128863811493, |
| "learning_rate": 3.279320918686344e-05, |
| "loss": 0.2151, |
| "mean_token_accuracy": 0.9100072458386421, |
| "num_tokens": 59437566.0, |
| "step": 3030 |
| }, |
| { |
| "entropy": 0.646418509632349, |
| "epoch": 0.4913131313131313, |
| "grad_norm": 0.5998213291168213, |
| "learning_rate": 3.2778763719364486e-05, |
| "loss": 0.2037, |
| "mean_token_accuracy": 0.9138255193829536, |
| "num_tokens": 59632261.0, |
| "step": 3040 |
| }, |
| { |
| "entropy": 0.6323723264038563, |
| "epoch": 0.49292929292929294, |
| "grad_norm": 0.2643972337245941, |
| "learning_rate": 3.276427432968493e-05, |
| "loss": 0.1898, |
| "mean_token_accuracy": 0.9208777904510498, |
| "num_tokens": 59834305.0, |
| "step": 3050 |
| }, |
| { |
| "entropy": 0.6356162540614605, |
| "epoch": 0.49454545454545457, |
| "grad_norm": 0.3861073851585388, |
| "learning_rate": 3.274974105947772e-05, |
| "loss": 0.1968, |
| "mean_token_accuracy": 0.9171579584479332, |
| "num_tokens": 60034048.0, |
| "step": 3060 |
| }, |
| { |
| "entropy": 0.6675376549363137, |
| "epoch": 0.49616161616161614, |
| "grad_norm": 0.594031572341919, |
| "learning_rate": 3.273516395052193e-05, |
| "loss": 0.1956, |
| "mean_token_accuracy": 0.9162991613149643, |
| "num_tokens": 60225137.0, |
| "step": 3070 |
| }, |
| { |
| "entropy": 0.6268281377851963, |
| "epoch": 0.49777777777777776, |
| "grad_norm": 0.4130667746067047, |
| "learning_rate": 3.2720543044722707e-05, |
| "loss": 0.2067, |
| "mean_token_accuracy": 0.911859753727913, |
| "num_tokens": 60427325.0, |
| "step": 3080 |
| }, |
| { |
| "entropy": 0.6532427452504634, |
| "epoch": 0.4993939393939394, |
| "grad_norm": 0.6098483800888062, |
| "learning_rate": 3.270587838411106e-05, |
| "loss": 0.2006, |
| "mean_token_accuracy": 0.9138939917087555, |
| "num_tokens": 60624604.0, |
| "step": 3090 |
| }, |
| { |
| "entropy": 0.6927797332406044, |
| "epoch": 0.501010101010101, |
| "grad_norm": 0.45501190423965454, |
| "learning_rate": 3.2691170010843785e-05, |
| "loss": 0.19, |
| "mean_token_accuracy": 0.922791239619255, |
| "num_tokens": 60807502.0, |
| "step": 3100 |
| }, |
| { |
| "entropy": 0.6616372317075729, |
| "epoch": 0.5026262626262626, |
| "grad_norm": 1.7559648752212524, |
| "learning_rate": 3.2676417967203366e-05, |
| "loss": 0.1995, |
| "mean_token_accuracy": 0.9174562796950341, |
| "num_tokens": 61002222.0, |
| "step": 3110 |
| }, |
| { |
| "entropy": 0.6428518198430538, |
| "epoch": 0.5042424242424243, |
| "grad_norm": 0.8280075788497925, |
| "learning_rate": 3.266162229559781e-05, |
| "loss": 0.1933, |
| "mean_token_accuracy": 0.9215355843305588, |
| "num_tokens": 61203301.0, |
| "step": 3120 |
| }, |
| { |
| "entropy": 0.6523866213858127, |
| "epoch": 0.5058585858585859, |
| "grad_norm": 0.4117162823677063, |
| "learning_rate": 3.2646783038560525e-05, |
| "loss": 0.2059, |
| "mean_token_accuracy": 0.9134928032755851, |
| "num_tokens": 61400432.0, |
| "step": 3130 |
| }, |
| { |
| "entropy": 0.6734721213579178, |
| "epoch": 0.5074747474747475, |
| "grad_norm": 0.746486485004425, |
| "learning_rate": 3.263190023875025e-05, |
| "loss": 0.1826, |
| "mean_token_accuracy": 0.9212172508239747, |
| "num_tokens": 61589776.0, |
| "step": 3140 |
| }, |
| { |
| "entropy": 0.6552940599620343, |
| "epoch": 0.509090909090909, |
| "grad_norm": 0.31145575642585754, |
| "learning_rate": 3.261697393895088e-05, |
| "loss": 0.1917, |
| "mean_token_accuracy": 0.9138196483254433, |
| "num_tokens": 61785756.0, |
| "step": 3150 |
| }, |
| { |
| "entropy": 0.6406723141670227, |
| "epoch": 0.5107070707070707, |
| "grad_norm": 0.9990244507789612, |
| "learning_rate": 3.2602004182071346e-05, |
| "loss": 0.1972, |
| "mean_token_accuracy": 0.9117845878005028, |
| "num_tokens": 61983863.0, |
| "step": 3160 |
| }, |
| { |
| "entropy": 0.6623590111732482, |
| "epoch": 0.5123232323232323, |
| "grad_norm": 0.48632344603538513, |
| "learning_rate": 3.2586991011145524e-05, |
| "loss": 0.1902, |
| "mean_token_accuracy": 0.9184399858117104, |
| "num_tokens": 62175690.0, |
| "step": 3170 |
| }, |
| { |
| "entropy": 0.6356962986290455, |
| "epoch": 0.5139393939393939, |
| "grad_norm": 5.5877909660339355, |
| "learning_rate": 3.25719344693321e-05, |
| "loss": 0.2244, |
| "mean_token_accuracy": 0.8993245542049408, |
| "num_tokens": 62376577.0, |
| "step": 3180 |
| }, |
| { |
| "entropy": 0.6909607768058776, |
| "epoch": 0.5155555555555555, |
| "grad_norm": 0.42147713899612427, |
| "learning_rate": 3.2556834599914425e-05, |
| "loss": 0.1747, |
| "mean_token_accuracy": 0.9327822804450989, |
| "num_tokens": 62558544.0, |
| "step": 3190 |
| }, |
| { |
| "entropy": 0.6543774232268333, |
| "epoch": 0.5171717171717172, |
| "grad_norm": 2.5656116008758545, |
| "learning_rate": 3.25416914463004e-05, |
| "loss": 0.1921, |
| "mean_token_accuracy": 0.9150905042886734, |
| "num_tokens": 62750996.0, |
| "step": 3200 |
| }, |
| { |
| "entropy": 0.6566679209470749, |
| "epoch": 0.5187878787878788, |
| "grad_norm": 0.42310506105422974, |
| "learning_rate": 3.252650505202238e-05, |
| "loss": 0.1969, |
| "mean_token_accuracy": 0.9137048035860061, |
| "num_tokens": 62944360.0, |
| "step": 3210 |
| }, |
| { |
| "entropy": 0.6445956945419311, |
| "epoch": 0.5204040404040404, |
| "grad_norm": 1.61677086353302, |
| "learning_rate": 3.251127546073699e-05, |
| "loss": 0.191, |
| "mean_token_accuracy": 0.928897674381733, |
| "num_tokens": 63140963.0, |
| "step": 3220 |
| }, |
| { |
| "entropy": 0.668046285957098, |
| "epoch": 0.522020202020202, |
| "grad_norm": 0.38541457056999207, |
| "learning_rate": 3.249600271622507e-05, |
| "loss": 0.1923, |
| "mean_token_accuracy": 0.9224132910370827, |
| "num_tokens": 63332657.0, |
| "step": 3230 |
| }, |
| { |
| "entropy": 0.6093958213925361, |
| "epoch": 0.5236363636363637, |
| "grad_norm": 0.49875500798225403, |
| "learning_rate": 3.248068686239149e-05, |
| "loss": 0.223, |
| "mean_token_accuracy": 0.9000680401921273, |
| "num_tokens": 63541910.0, |
| "step": 3240 |
| }, |
| { |
| "entropy": 0.6547015912830829, |
| "epoch": 0.5252525252525253, |
| "grad_norm": 0.3443615436553955, |
| "learning_rate": 3.246532794326505e-05, |
| "loss": 0.2227, |
| "mean_token_accuracy": 0.9069190487265587, |
| "num_tokens": 63736206.0, |
| "step": 3250 |
| }, |
| { |
| "entropy": 0.6406685844063759, |
| "epoch": 0.5268686868686868, |
| "grad_norm": 0.3350508213043213, |
| "learning_rate": 3.244992600299836e-05, |
| "loss": 0.1973, |
| "mean_token_accuracy": 0.9136427164077758, |
| "num_tokens": 63935477.0, |
| "step": 3260 |
| }, |
| { |
| "entropy": 0.6515379846096039, |
| "epoch": 0.5284848484848484, |
| "grad_norm": 0.5605429410934448, |
| "learning_rate": 3.2434481085867705e-05, |
| "loss": 0.2061, |
| "mean_token_accuracy": 0.9114924788475036, |
| "num_tokens": 64129764.0, |
| "step": 3270 |
| }, |
| { |
| "entropy": 0.6315380461513996, |
| "epoch": 0.5301010101010101, |
| "grad_norm": 0.3819006681442261, |
| "learning_rate": 3.24189932362729e-05, |
| "loss": 0.2169, |
| "mean_token_accuracy": 0.8984061792492867, |
| "num_tokens": 64330659.0, |
| "step": 3280 |
| }, |
| { |
| "entropy": 0.6346470937132835, |
| "epoch": 0.5317171717171717, |
| "grad_norm": 0.4472454786300659, |
| "learning_rate": 3.240346249873719e-05, |
| "loss": 0.2215, |
| "mean_token_accuracy": 0.9077972799539566, |
| "num_tokens": 64530704.0, |
| "step": 3290 |
| }, |
| { |
| "entropy": 0.6334054812788963, |
| "epoch": 0.5333333333333333, |
| "grad_norm": 0.47427433729171753, |
| "learning_rate": 3.238788891790712e-05, |
| "loss": 0.2156, |
| "mean_token_accuracy": 0.9178689196705818, |
| "num_tokens": 64730951.0, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.5333333333333333, |
| "eval_entropy": 0.708876410573721, |
| "eval_loss": 0.18830136954784393, |
| "eval_mean_token_accuracy": 0.9212830139994621, |
| "eval_num_tokens": 64730951.0, |
| "eval_runtime": 107.508, |
| "eval_samples_per_second": 9.302, |
| "eval_steps_per_second": 9.302, |
| "step": 3300 |
| }, |
| { |
| "entropy": 0.6263887412846089, |
| "epoch": 0.534949494949495, |
| "grad_norm": 0.4745093286037445, |
| "learning_rate": 3.2372272538552376e-05, |
| "loss": 0.2178, |
| "mean_token_accuracy": 0.908173656463623, |
| "num_tokens": 64932897.0, |
| "step": 3310 |
| }, |
| { |
| "entropy": 0.6270258404314518, |
| "epoch": 0.5365656565656566, |
| "grad_norm": 0.25539788603782654, |
| "learning_rate": 3.235661340556569e-05, |
| "loss": 0.1932, |
| "mean_token_accuracy": 0.9180809631943703, |
| "num_tokens": 65135403.0, |
| "step": 3320 |
| }, |
| { |
| "entropy": 0.6297943659126759, |
| "epoch": 0.5381818181818182, |
| "grad_norm": 0.24494622647762299, |
| "learning_rate": 3.2340911563962706e-05, |
| "loss": 0.2108, |
| "mean_token_accuracy": 0.9115460008382797, |
| "num_tokens": 65336219.0, |
| "step": 3330 |
| }, |
| { |
| "entropy": 0.6433823242783546, |
| "epoch": 0.5397979797979798, |
| "grad_norm": 0.7900083065032959, |
| "learning_rate": 3.232516705888183e-05, |
| "loss": 0.2157, |
| "mean_token_accuracy": 0.9078005447983741, |
| "num_tokens": 65533545.0, |
| "step": 3340 |
| }, |
| { |
| "entropy": 0.6677606172859669, |
| "epoch": 0.5414141414141415, |
| "grad_norm": 0.8443840146064758, |
| "learning_rate": 3.2309379935584125e-05, |
| "loss": 0.2102, |
| "mean_token_accuracy": 0.9176926985383034, |
| "num_tokens": 65726131.0, |
| "step": 3350 |
| }, |
| { |
| "entropy": 0.6547842122614383, |
| "epoch": 0.5430303030303031, |
| "grad_norm": 0.4855356812477112, |
| "learning_rate": 3.229355023945315e-05, |
| "loss": 0.1831, |
| "mean_token_accuracy": 0.9250451758503914, |
| "num_tokens": 65918536.0, |
| "step": 3360 |
| }, |
| { |
| "entropy": 0.642189259827137, |
| "epoch": 0.5446464646464646, |
| "grad_norm": 0.32615113258361816, |
| "learning_rate": 3.2277678015994886e-05, |
| "loss": 0.1925, |
| "mean_token_accuracy": 0.9199791148304939, |
| "num_tokens": 66115421.0, |
| "step": 3370 |
| }, |
| { |
| "entropy": 0.6465832196176052, |
| "epoch": 0.5462626262626262, |
| "grad_norm": 0.28738975524902344, |
| "learning_rate": 3.226176331083752e-05, |
| "loss": 0.1837, |
| "mean_token_accuracy": 0.9207375660538674, |
| "num_tokens": 66311951.0, |
| "step": 3380 |
| }, |
| { |
| "entropy": 0.6554206393659114, |
| "epoch": 0.5478787878787879, |
| "grad_norm": 0.4349115192890167, |
| "learning_rate": 3.2245806169731395e-05, |
| "loss": 0.1896, |
| "mean_token_accuracy": 0.9180210024118424, |
| "num_tokens": 66504523.0, |
| "step": 3390 |
| }, |
| { |
| "entropy": 0.6807047441601753, |
| "epoch": 0.5494949494949495, |
| "grad_norm": 0.38425713777542114, |
| "learning_rate": 3.222980663854884e-05, |
| "loss": 0.1829, |
| "mean_token_accuracy": 0.9236632093787194, |
| "num_tokens": 66689376.0, |
| "step": 3400 |
| }, |
| { |
| "entropy": 0.6507133312523365, |
| "epoch": 0.5511111111111111, |
| "grad_norm": 0.4921931028366089, |
| "learning_rate": 3.221376476328404e-05, |
| "loss": 0.2082, |
| "mean_token_accuracy": 0.9143338233232499, |
| "num_tokens": 66884196.0, |
| "step": 3410 |
| }, |
| { |
| "entropy": 0.6452019922435284, |
| "epoch": 0.5527272727272727, |
| "grad_norm": 0.361701637506485, |
| "learning_rate": 3.219768059005291e-05, |
| "loss": 0.2227, |
| "mean_token_accuracy": 0.9031536251306533, |
| "num_tokens": 67084098.0, |
| "step": 3420 |
| }, |
| { |
| "entropy": 0.6425276100635529, |
| "epoch": 0.5543434343434344, |
| "grad_norm": 0.23792065680027008, |
| "learning_rate": 3.218155416509296e-05, |
| "loss": 0.2079, |
| "mean_token_accuracy": 0.900030605494976, |
| "num_tokens": 67280863.0, |
| "step": 3430 |
| }, |
| { |
| "entropy": 0.6363083481788635, |
| "epoch": 0.555959595959596, |
| "grad_norm": 0.7842856645584106, |
| "learning_rate": 3.216538553476315e-05, |
| "loss": 0.2109, |
| "mean_token_accuracy": 0.9114381566643714, |
| "num_tokens": 67478386.0, |
| "step": 3440 |
| }, |
| { |
| "entropy": 0.648232850432396, |
| "epoch": 0.5575757575757576, |
| "grad_norm": 0.38298118114471436, |
| "learning_rate": 3.214917474554378e-05, |
| "loss": 0.2047, |
| "mean_token_accuracy": 0.9171234712004661, |
| "num_tokens": 67676237.0, |
| "step": 3450 |
| }, |
| { |
| "entropy": 0.6573429599404335, |
| "epoch": 0.5591919191919192, |
| "grad_norm": 0.3366295099258423, |
| "learning_rate": 3.213292184403636e-05, |
| "loss": 0.1797, |
| "mean_token_accuracy": 0.9248432412743568, |
| "num_tokens": 67866540.0, |
| "step": 3460 |
| }, |
| { |
| "entropy": 0.6503325551748276, |
| "epoch": 0.5608080808080808, |
| "grad_norm": 0.27914008498191833, |
| "learning_rate": 3.211662687696343e-05, |
| "loss": 0.198, |
| "mean_token_accuracy": 0.9210797488689423, |
| "num_tokens": 68062045.0, |
| "step": 3470 |
| }, |
| { |
| "entropy": 0.6173742033541203, |
| "epoch": 0.5624242424242424, |
| "grad_norm": 1.0394847393035889, |
| "learning_rate": 3.210028989116848e-05, |
| "loss": 0.2158, |
| "mean_token_accuracy": 0.9063221380114556, |
| "num_tokens": 68267313.0, |
| "step": 3480 |
| }, |
| { |
| "entropy": 0.6302667535841465, |
| "epoch": 0.564040404040404, |
| "grad_norm": 0.5718628764152527, |
| "learning_rate": 3.208391093361577e-05, |
| "loss": 0.1897, |
| "mean_token_accuracy": 0.9212815120816231, |
| "num_tokens": 68467768.0, |
| "step": 3490 |
| }, |
| { |
| "entropy": 0.6221346028149128, |
| "epoch": 0.5656565656565656, |
| "grad_norm": 0.3520769774913788, |
| "learning_rate": 3.206749005139024e-05, |
| "loss": 0.2248, |
| "mean_token_accuracy": 0.9028334498405457, |
| "num_tokens": 68671058.0, |
| "step": 3500 |
| }, |
| { |
| "entropy": 0.620377691090107, |
| "epoch": 0.5672727272727273, |
| "grad_norm": 0.6622844338417053, |
| "learning_rate": 3.205102729169734e-05, |
| "loss": 0.2338, |
| "mean_token_accuracy": 0.902023434638977, |
| "num_tokens": 68874421.0, |
| "step": 3510 |
| }, |
| { |
| "entropy": 0.6653469420969487, |
| "epoch": 0.5688888888888889, |
| "grad_norm": 0.895131528377533, |
| "learning_rate": 3.203452270186292e-05, |
| "loss": 0.1764, |
| "mean_token_accuracy": 0.9239086017012597, |
| "num_tokens": 69064415.0, |
| "step": 3520 |
| }, |
| { |
| "entropy": 0.660456845164299, |
| "epoch": 0.5705050505050505, |
| "grad_norm": 3.0434823036193848, |
| "learning_rate": 3.201797632933305e-05, |
| "loss": 0.1984, |
| "mean_token_accuracy": 0.9147211536765099, |
| "num_tokens": 69254291.0, |
| "step": 3530 |
| }, |
| { |
| "entropy": 0.6673716694116593, |
| "epoch": 0.5721212121212121, |
| "grad_norm": 1.063805341720581, |
| "learning_rate": 3.2001388221673945e-05, |
| "loss": 0.2082, |
| "mean_token_accuracy": 0.9198906376957894, |
| "num_tokens": 69442287.0, |
| "step": 3540 |
| }, |
| { |
| "entropy": 0.6692168071866036, |
| "epoch": 0.5737373737373738, |
| "grad_norm": 0.4066961705684662, |
| "learning_rate": 3.198475842657178e-05, |
| "loss": 0.2128, |
| "mean_token_accuracy": 0.9147928357124329, |
| "num_tokens": 69631036.0, |
| "step": 3550 |
| }, |
| { |
| "entropy": 0.6522227242588997, |
| "epoch": 0.5753535353535354, |
| "grad_norm": 0.3503301739692688, |
| "learning_rate": 3.196808699183258e-05, |
| "loss": 0.2078, |
| "mean_token_accuracy": 0.9150220856070519, |
| "num_tokens": 69824448.0, |
| "step": 3560 |
| }, |
| { |
| "entropy": 0.6434777349233627, |
| "epoch": 0.576969696969697, |
| "grad_norm": 1.1538275480270386, |
| "learning_rate": 3.195137396538205e-05, |
| "loss": 0.1943, |
| "mean_token_accuracy": 0.9231508269906044, |
| "num_tokens": 70021561.0, |
| "step": 3570 |
| }, |
| { |
| "entropy": 0.6639960631728172, |
| "epoch": 0.5785858585858585, |
| "grad_norm": 2.492892265319824, |
| "learning_rate": 3.193461939526549e-05, |
| "loss": 0.1798, |
| "mean_token_accuracy": 0.9285740494728089, |
| "num_tokens": 70212443.0, |
| "step": 3580 |
| }, |
| { |
| "entropy": 0.6517272099852562, |
| "epoch": 0.5802020202020202, |
| "grad_norm": 0.6861459612846375, |
| "learning_rate": 3.191782332964761e-05, |
| "loss": 0.2182, |
| "mean_token_accuracy": 0.898492357134819, |
| "num_tokens": 70409374.0, |
| "step": 3590 |
| }, |
| { |
| "entropy": 0.609511935710907, |
| "epoch": 0.5818181818181818, |
| "grad_norm": 0.28647559881210327, |
| "learning_rate": 3.19009858168124e-05, |
| "loss": 0.1919, |
| "mean_token_accuracy": 0.9192790776491165, |
| "num_tokens": 70618309.0, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.5818181818181818, |
| "eval_entropy": 0.7097669740915299, |
| "eval_loss": 0.18217602372169495, |
| "eval_mean_token_accuracy": 0.9196950270533562, |
| "eval_num_tokens": 70618309.0, |
| "eval_runtime": 107.3842, |
| "eval_samples_per_second": 9.312, |
| "eval_steps_per_second": 9.312, |
| "step": 3600 |
| }, |
| { |
| "entropy": 0.6385140128433704, |
| "epoch": 0.5834343434343434, |
| "grad_norm": 0.32677140831947327, |
| "learning_rate": 3.188410690516302e-05, |
| "loss": 0.2138, |
| "mean_token_accuracy": 0.9102208271622658, |
| "num_tokens": 70817872.0, |
| "step": 3610 |
| }, |
| { |
| "entropy": 0.6724786482751369, |
| "epoch": 0.585050505050505, |
| "grad_norm": 1.1922396421432495, |
| "learning_rate": 3.186718664322163e-05, |
| "loss": 0.19, |
| "mean_token_accuracy": 0.9211466312408447, |
| "num_tokens": 71006020.0, |
| "step": 3620 |
| }, |
| { |
| "entropy": 0.6604351043701172, |
| "epoch": 0.5866666666666667, |
| "grad_norm": 1.7478338479995728, |
| "learning_rate": 3.185022507962925e-05, |
| "loss": 0.2117, |
| "mean_token_accuracy": 0.9231618851423263, |
| "num_tokens": 71201778.0, |
| "step": 3630 |
| }, |
| { |
| "entropy": 0.6615160658955574, |
| "epoch": 0.5882828282828283, |
| "grad_norm": 0.9100177884101868, |
| "learning_rate": 3.183322226314565e-05, |
| "loss": 0.1905, |
| "mean_token_accuracy": 0.9153360441327095, |
| "num_tokens": 71394174.0, |
| "step": 3640 |
| }, |
| { |
| "entropy": 0.6542910009622573, |
| "epoch": 0.5898989898989899, |
| "grad_norm": 0.5457777976989746, |
| "learning_rate": 3.181617824264917e-05, |
| "loss": 0.2087, |
| "mean_token_accuracy": 0.909763066470623, |
| "num_tokens": 71589313.0, |
| "step": 3650 |
| }, |
| { |
| "entropy": 0.6248301893472672, |
| "epoch": 0.5915151515151515, |
| "grad_norm": 1.1501109600067139, |
| "learning_rate": 3.179909306713663e-05, |
| "loss": 0.221, |
| "mean_token_accuracy": 0.910381156206131, |
| "num_tokens": 71795222.0, |
| "step": 3660 |
| }, |
| { |
| "entropy": 0.6577152475714684, |
| "epoch": 0.5931313131313132, |
| "grad_norm": 0.9693170785903931, |
| "learning_rate": 3.178196678572312e-05, |
| "loss": 0.2076, |
| "mean_token_accuracy": 0.9101708441972732, |
| "num_tokens": 71987627.0, |
| "step": 3670 |
| }, |
| { |
| "entropy": 0.6558754108846188, |
| "epoch": 0.5947474747474748, |
| "grad_norm": 0.38402456045150757, |
| "learning_rate": 3.176479944764193e-05, |
| "loss": 0.1928, |
| "mean_token_accuracy": 0.9119617655873299, |
| "num_tokens": 72182126.0, |
| "step": 3680 |
| }, |
| { |
| "entropy": 0.6468130461871624, |
| "epoch": 0.5963636363636363, |
| "grad_norm": 0.5037944316864014, |
| "learning_rate": 3.174759110224436e-05, |
| "loss": 0.193, |
| "mean_token_accuracy": 0.9178726747632027, |
| "num_tokens": 72381065.0, |
| "step": 3690 |
| }, |
| { |
| "entropy": 0.6679630935192108, |
| "epoch": 0.597979797979798, |
| "grad_norm": 0.9890159964561462, |
| "learning_rate": 3.17303417989996e-05, |
| "loss": 0.1912, |
| "mean_token_accuracy": 0.9213204249739647, |
| "num_tokens": 72571262.0, |
| "step": 3700 |
| }, |
| { |
| "entropy": 0.6567414380609989, |
| "epoch": 0.5995959595959596, |
| "grad_norm": 1.5467326641082764, |
| "learning_rate": 3.1713051587494584e-05, |
| "loss": 0.1799, |
| "mean_token_accuracy": 0.926344147324562, |
| "num_tokens": 72767159.0, |
| "step": 3710 |
| }, |
| { |
| "entropy": 0.6576608344912529, |
| "epoch": 0.6012121212121212, |
| "grad_norm": 0.5661543011665344, |
| "learning_rate": 3.1695720517433844e-05, |
| "loss": 0.1887, |
| "mean_token_accuracy": 0.9229953482747077, |
| "num_tokens": 72964546.0, |
| "step": 3720 |
| }, |
| { |
| "entropy": 0.6437795154750348, |
| "epoch": 0.6028282828282828, |
| "grad_norm": 0.4989408850669861, |
| "learning_rate": 3.1678348638639365e-05, |
| "loss": 0.193, |
| "mean_token_accuracy": 0.9184434622526169, |
| "num_tokens": 73161389.0, |
| "step": 3730 |
| }, |
| { |
| "entropy": 0.647753156721592, |
| "epoch": 0.6044444444444445, |
| "grad_norm": 1.8060365915298462, |
| "learning_rate": 3.166093600105045e-05, |
| "loss": 0.1904, |
| "mean_token_accuracy": 0.9116377130150795, |
| "num_tokens": 73358408.0, |
| "step": 3740 |
| }, |
| { |
| "entropy": 0.6552592545747757, |
| "epoch": 0.6060606060606061, |
| "grad_norm": 0.34557202458381653, |
| "learning_rate": 3.164348265472357e-05, |
| "loss": 0.1988, |
| "mean_token_accuracy": 0.9147962421178818, |
| "num_tokens": 73553460.0, |
| "step": 3750 |
| }, |
| { |
| "entropy": 0.6839084707200527, |
| "epoch": 0.6076767676767677, |
| "grad_norm": 0.668044924736023, |
| "learning_rate": 3.1625988649832224e-05, |
| "loss": 0.1848, |
| "mean_token_accuracy": 0.9240136578679085, |
| "num_tokens": 73740151.0, |
| "step": 3760 |
| }, |
| { |
| "entropy": 0.6638677589595318, |
| "epoch": 0.6092929292929293, |
| "grad_norm": 0.3268148601055145, |
| "learning_rate": 3.16084540366668e-05, |
| "loss": 0.1932, |
| "mean_token_accuracy": 0.9274547725915909, |
| "num_tokens": 73935155.0, |
| "step": 3770 |
| }, |
| { |
| "entropy": 0.6938288390636445, |
| "epoch": 0.610909090909091, |
| "grad_norm": 0.31285151839256287, |
| "learning_rate": 3.15908788656344e-05, |
| "loss": 0.1959, |
| "mean_token_accuracy": 0.9191734343767166, |
| "num_tokens": 74119650.0, |
| "step": 3780 |
| }, |
| { |
| "entropy": 0.6699846811592579, |
| "epoch": 0.6125252525252525, |
| "grad_norm": 0.4750641882419586, |
| "learning_rate": 3.1573263187258754e-05, |
| "loss": 0.2092, |
| "mean_token_accuracy": 0.914281564950943, |
| "num_tokens": 74311830.0, |
| "step": 3790 |
| }, |
| { |
| "entropy": 0.6347973830997944, |
| "epoch": 0.6141414141414141, |
| "grad_norm": 0.45926523208618164, |
| "learning_rate": 3.155560705218e-05, |
| "loss": 0.2272, |
| "mean_token_accuracy": 0.9005576968193054, |
| "num_tokens": 74513798.0, |
| "step": 3800 |
| }, |
| { |
| "entropy": 0.6711005762219429, |
| "epoch": 0.6157575757575757, |
| "grad_norm": 0.32855984568595886, |
| "learning_rate": 3.1537910511154625e-05, |
| "loss": 0.1899, |
| "mean_token_accuracy": 0.9199647232890129, |
| "num_tokens": 74705123.0, |
| "step": 3810 |
| }, |
| { |
| "entropy": 0.6577563665807247, |
| "epoch": 0.6173737373737374, |
| "grad_norm": 0.2792748510837555, |
| "learning_rate": 3.152017361505522e-05, |
| "loss": 0.1991, |
| "mean_token_accuracy": 0.91060761064291, |
| "num_tokens": 74900348.0, |
| "step": 3820 |
| }, |
| { |
| "entropy": 0.6500893220305443, |
| "epoch": 0.618989898989899, |
| "grad_norm": 0.38441580533981323, |
| "learning_rate": 3.150239641487041e-05, |
| "loss": 0.2066, |
| "mean_token_accuracy": 0.907078555226326, |
| "num_tokens": 75095080.0, |
| "step": 3830 |
| }, |
| { |
| "entropy": 0.6571944713592529, |
| "epoch": 0.6206060606060606, |
| "grad_norm": 0.45710915327072144, |
| "learning_rate": 3.1484578961704694e-05, |
| "loss": 0.1982, |
| "mean_token_accuracy": 0.917755238711834, |
| "num_tokens": 75288294.0, |
| "step": 3840 |
| }, |
| { |
| "entropy": 0.6590827241539955, |
| "epoch": 0.6222222222222222, |
| "grad_norm": 0.3001457154750824, |
| "learning_rate": 3.1466721306778277e-05, |
| "loss": 0.1927, |
| "mean_token_accuracy": 0.9156096473336219, |
| "num_tokens": 75482456.0, |
| "step": 3850 |
| }, |
| { |
| "entropy": 0.6585754707455636, |
| "epoch": 0.6238383838383839, |
| "grad_norm": 0.27338069677352905, |
| "learning_rate": 3.144882350142693e-05, |
| "loss": 0.1812, |
| "mean_token_accuracy": 0.9205604672431946, |
| "num_tokens": 75675408.0, |
| "step": 3860 |
| }, |
| { |
| "entropy": 0.6397463321685791, |
| "epoch": 0.6254545454545455, |
| "grad_norm": 0.2751981317996979, |
| "learning_rate": 3.143088559710183e-05, |
| "loss": 0.1917, |
| "mean_token_accuracy": 0.9194412946701049, |
| "num_tokens": 75874715.0, |
| "step": 3870 |
| }, |
| { |
| "entropy": 0.6339182935655117, |
| "epoch": 0.6270707070707071, |
| "grad_norm": 0.5759657025337219, |
| "learning_rate": 3.141290764536947e-05, |
| "loss": 0.2002, |
| "mean_token_accuracy": 0.9099746659398079, |
| "num_tokens": 76072600.0, |
| "step": 3880 |
| }, |
| { |
| "entropy": 0.6439682267606258, |
| "epoch": 0.6286868686868687, |
| "grad_norm": 3.8243777751922607, |
| "learning_rate": 3.139488969791144e-05, |
| "loss": 0.2248, |
| "mean_token_accuracy": 0.9037980034947395, |
| "num_tokens": 76267369.0, |
| "step": 3890 |
| }, |
| { |
| "entropy": 0.6569941058754921, |
| "epoch": 0.6303030303030303, |
| "grad_norm": 4.018685817718506, |
| "learning_rate": 3.137683180652429e-05, |
| "loss": 0.1805, |
| "mean_token_accuracy": 0.9241274476051331, |
| "num_tokens": 76461234.0, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.6303030303030303, |
| "eval_entropy": 0.7052705428898335, |
| "eval_loss": 0.19400236010551453, |
| "eval_mean_token_accuracy": 0.9165776029825211, |
| "eval_num_tokens": 76461234.0, |
| "eval_runtime": 107.827, |
| "eval_samples_per_second": 9.274, |
| "eval_steps_per_second": 9.274, |
| "step": 3900 |
| }, |
| { |
| "entropy": 0.6134035527706146, |
| "epoch": 0.6319191919191919, |
| "grad_norm": 0.47339844703674316, |
| "learning_rate": 3.1358734023119434e-05, |
| "loss": 0.1963, |
| "mean_token_accuracy": 0.9142026767134667, |
| "num_tokens": 76668510.0, |
| "step": 3910 |
| }, |
| { |
| "entropy": 0.6287301488220691, |
| "epoch": 0.6335353535353535, |
| "grad_norm": 3.310734510421753, |
| "learning_rate": 3.134059639972293e-05, |
| "loss": 0.2095, |
| "mean_token_accuracy": 0.9131061762571335, |
| "num_tokens": 76869415.0, |
| "step": 3920 |
| }, |
| { |
| "entropy": 0.6743476808071136, |
| "epoch": 0.6351515151515151, |
| "grad_norm": 0.40917423367500305, |
| "learning_rate": 3.132241898847541e-05, |
| "loss": 0.1931, |
| "mean_token_accuracy": 0.9163327530026436, |
| "num_tokens": 77055631.0, |
| "step": 3930 |
| }, |
| { |
| "entropy": 0.6846744433045387, |
| "epoch": 0.6367676767676768, |
| "grad_norm": 0.257769376039505, |
| "learning_rate": 3.130420184163183e-05, |
| "loss": 0.1795, |
| "mean_token_accuracy": 0.9251757651567459, |
| "num_tokens": 77240945.0, |
| "step": 3940 |
| }, |
| { |
| "entropy": 0.6451074630022049, |
| "epoch": 0.6383838383838384, |
| "grad_norm": 0.4214618504047394, |
| "learning_rate": 3.128594501156141e-05, |
| "loss": 0.1987, |
| "mean_token_accuracy": 0.913572546839714, |
| "num_tokens": 77439671.0, |
| "step": 3950 |
| }, |
| { |
| "entropy": 0.636054840683937, |
| "epoch": 0.64, |
| "grad_norm": 0.30621734261512756, |
| "learning_rate": 3.126764855074745e-05, |
| "loss": 0.2131, |
| "mean_token_accuracy": 0.9086759075522423, |
| "num_tokens": 77639210.0, |
| "step": 3960 |
| }, |
| { |
| "entropy": 0.669957785308361, |
| "epoch": 0.6416161616161616, |
| "grad_norm": 0.3592897355556488, |
| "learning_rate": 3.124931251178716e-05, |
| "loss": 0.1822, |
| "mean_token_accuracy": 0.9224239885807037, |
| "num_tokens": 77829537.0, |
| "step": 3970 |
| }, |
| { |
| "entropy": 0.6642787851393223, |
| "epoch": 0.6432323232323233, |
| "grad_norm": 3.2593376636505127, |
| "learning_rate": 3.123093694739153e-05, |
| "loss": 0.2133, |
| "mean_token_accuracy": 0.913386145234108, |
| "num_tokens": 78019508.0, |
| "step": 3980 |
| }, |
| { |
| "entropy": 0.6268266074359417, |
| "epoch": 0.6448484848484849, |
| "grad_norm": 2.4404337406158447, |
| "learning_rate": 3.1212521910385174e-05, |
| "loss": 0.2343, |
| "mean_token_accuracy": 0.9024699181318283, |
| "num_tokens": 78222951.0, |
| "step": 3990 |
| }, |
| { |
| "entropy": 0.6415668934583664, |
| "epoch": 0.6464646464646465, |
| "grad_norm": 2.0136265754699707, |
| "learning_rate": 3.11940674537062e-05, |
| "loss": 0.1948, |
| "mean_token_accuracy": 0.9218700021505356, |
| "num_tokens": 78421003.0, |
| "step": 4000 |
| }, |
| { |
| "entropy": 0.6442751929163932, |
| "epoch": 0.648080808080808, |
| "grad_norm": 0.34446606040000916, |
| "learning_rate": 3.117557363040601e-05, |
| "loss": 0.2121, |
| "mean_token_accuracy": 0.9129304736852646, |
| "num_tokens": 78619565.0, |
| "step": 4010 |
| }, |
| { |
| "entropy": 0.6459943115711212, |
| "epoch": 0.6496969696969697, |
| "grad_norm": 0.46052300930023193, |
| "learning_rate": 3.115704049364918e-05, |
| "loss": 0.1923, |
| "mean_token_accuracy": 0.9172181561589241, |
| "num_tokens": 78815948.0, |
| "step": 4020 |
| }, |
| { |
| "entropy": 0.658907701075077, |
| "epoch": 0.6513131313131313, |
| "grad_norm": 0.4204244613647461, |
| "learning_rate": 3.1138468096713306e-05, |
| "loss": 0.1981, |
| "mean_token_accuracy": 0.9092022016644478, |
| "num_tokens": 79007755.0, |
| "step": 4030 |
| }, |
| { |
| "entropy": 0.6481206513941288, |
| "epoch": 0.6529292929292929, |
| "grad_norm": 1.92702054977417, |
| "learning_rate": 3.111985649298885e-05, |
| "loss": 0.2269, |
| "mean_token_accuracy": 0.9099973410367965, |
| "num_tokens": 79204470.0, |
| "step": 4040 |
| }, |
| { |
| "entropy": 0.6662653639912606, |
| "epoch": 0.6545454545454545, |
| "grad_norm": 0.41740235686302185, |
| "learning_rate": 3.110120573597897e-05, |
| "loss": 0.2198, |
| "mean_token_accuracy": 0.9084355965256691, |
| "num_tokens": 79396622.0, |
| "step": 4050 |
| }, |
| { |
| "entropy": 0.647067503631115, |
| "epoch": 0.6561616161616162, |
| "grad_norm": 0.3785872757434845, |
| "learning_rate": 3.1082515879299394e-05, |
| "loss": 0.2078, |
| "mean_token_accuracy": 0.912786665558815, |
| "num_tokens": 79594893.0, |
| "step": 4060 |
| }, |
| { |
| "entropy": 0.6522647351026535, |
| "epoch": 0.6577777777777778, |
| "grad_norm": 3.0412800312042236, |
| "learning_rate": 3.106378697667823e-05, |
| "loss": 0.2232, |
| "mean_token_accuracy": 0.9029908329248428, |
| "num_tokens": 79788535.0, |
| "step": 4070 |
| }, |
| { |
| "entropy": 0.665669298171997, |
| "epoch": 0.6593939393939394, |
| "grad_norm": 2.1263978481292725, |
| "learning_rate": 3.104501908195586e-05, |
| "loss": 0.2054, |
| "mean_token_accuracy": 0.907913924753666, |
| "num_tokens": 79978326.0, |
| "step": 4080 |
| }, |
| { |
| "entropy": 0.665568544715643, |
| "epoch": 0.661010101010101, |
| "grad_norm": 0.8354134559631348, |
| "learning_rate": 3.102621224908474e-05, |
| "loss": 0.2094, |
| "mean_token_accuracy": 0.9120962634682656, |
| "num_tokens": 80169109.0, |
| "step": 4090 |
| }, |
| { |
| "entropy": 0.6687669172883034, |
| "epoch": 0.6626262626262627, |
| "grad_norm": 0.27629363536834717, |
| "learning_rate": 3.100736653212925e-05, |
| "loss": 0.1931, |
| "mean_token_accuracy": 0.9171461597084999, |
| "num_tokens": 80360696.0, |
| "step": 4100 |
| }, |
| { |
| "entropy": 0.6284529320895672, |
| "epoch": 0.6642424242424242, |
| "grad_norm": 1.274771809577942, |
| "learning_rate": 3.0988481985265585e-05, |
| "loss": 0.1957, |
| "mean_token_accuracy": 0.9178080677986145, |
| "num_tokens": 80565133.0, |
| "step": 4110 |
| }, |
| { |
| "entropy": 0.6715195573866367, |
| "epoch": 0.6658585858585858, |
| "grad_norm": 0.6915074586868286, |
| "learning_rate": 3.0969558662781524e-05, |
| "loss": 0.2111, |
| "mean_token_accuracy": 0.9070376634597779, |
| "num_tokens": 80755904.0, |
| "step": 4120 |
| }, |
| { |
| "entropy": 0.6908825971186161, |
| "epoch": 0.6674747474747474, |
| "grad_norm": 0.31496939063072205, |
| "learning_rate": 3.0950596619076354e-05, |
| "loss": 0.1949, |
| "mean_token_accuracy": 0.9198219880461693, |
| "num_tokens": 80941817.0, |
| "step": 4130 |
| }, |
| { |
| "entropy": 0.6395300135016442, |
| "epoch": 0.6690909090909091, |
| "grad_norm": 0.2790515124797821, |
| "learning_rate": 3.093159590866066e-05, |
| "loss": 0.1929, |
| "mean_token_accuracy": 0.9128642991185189, |
| "num_tokens": 81142844.0, |
| "step": 4140 |
| }, |
| { |
| "entropy": 0.6719032816588879, |
| "epoch": 0.6707070707070707, |
| "grad_norm": 0.2999343276023865, |
| "learning_rate": 3.0912556586156157e-05, |
| "loss": 0.2079, |
| "mean_token_accuracy": 0.9178296446800231, |
| "num_tokens": 81336252.0, |
| "step": 4150 |
| }, |
| { |
| "entropy": 0.6469911426305771, |
| "epoch": 0.6723232323232323, |
| "grad_norm": 0.33051058650016785, |
| "learning_rate": 3.0893478706295616e-05, |
| "loss": 0.2052, |
| "mean_token_accuracy": 0.9086741656064987, |
| "num_tokens": 81535515.0, |
| "step": 4160 |
| }, |
| { |
| "entropy": 0.647654651850462, |
| "epoch": 0.673939393939394, |
| "grad_norm": 0.24572958052158356, |
| "learning_rate": 3.0874362323922583e-05, |
| "loss": 0.2028, |
| "mean_token_accuracy": 0.9146606475114822, |
| "num_tokens": 81735276.0, |
| "step": 4170 |
| }, |
| { |
| "entropy": 0.6480786517262459, |
| "epoch": 0.6755555555555556, |
| "grad_norm": 0.4676873981952667, |
| "learning_rate": 3.0855207493991344e-05, |
| "loss": 0.1963, |
| "mean_token_accuracy": 0.9165736556053161, |
| "num_tokens": 81935012.0, |
| "step": 4180 |
| }, |
| { |
| "entropy": 0.6712435901165008, |
| "epoch": 0.6771717171717172, |
| "grad_norm": 0.2698163092136383, |
| "learning_rate": 3.083601427156668e-05, |
| "loss": 0.1943, |
| "mean_token_accuracy": 0.9149259775876999, |
| "num_tokens": 82126662.0, |
| "step": 4190 |
| }, |
| { |
| "entropy": 0.6606485776603221, |
| "epoch": 0.6787878787878788, |
| "grad_norm": 0.602709949016571, |
| "learning_rate": 3.081678271182374e-05, |
| "loss": 0.1991, |
| "mean_token_accuracy": 0.9160255298018456, |
| "num_tokens": 82320333.0, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.6787878787878788, |
| "eval_entropy": 0.7162625348865986, |
| "eval_loss": 0.1796865314245224, |
| "eval_mean_token_accuracy": 0.9215285767912864, |
| "eval_num_tokens": 82320333.0, |
| "eval_runtime": 107.3663, |
| "eval_samples_per_second": 9.314, |
| "eval_steps_per_second": 9.314, |
| "step": 4200 |
| }, |
| { |
| "entropy": 0.6206379361450672, |
| "epoch": 0.6804040404040405, |
| "grad_norm": 0.536363422870636, |
| "learning_rate": 3.07975128700479e-05, |
| "loss": 0.2351, |
| "mean_token_accuracy": 0.8985056817531586, |
| "num_tokens": 82529314.0, |
| "step": 4210 |
| }, |
| { |
| "entropy": 0.663483539223671, |
| "epoch": 0.682020202020202, |
| "grad_norm": 0.48275747895240784, |
| "learning_rate": 3.077820480163457e-05, |
| "loss": 0.186, |
| "mean_token_accuracy": 0.9269337847828865, |
| "num_tokens": 82722207.0, |
| "step": 4220 |
| }, |
| { |
| "entropy": 0.6566696122288704, |
| "epoch": 0.6836363636363636, |
| "grad_norm": 0.41870349645614624, |
| "learning_rate": 3.0758858562089064e-05, |
| "loss": 0.1976, |
| "mean_token_accuracy": 0.9154109835624695, |
| "num_tokens": 82917862.0, |
| "step": 4230 |
| }, |
| { |
| "entropy": 0.6778297878801822, |
| "epoch": 0.6852525252525252, |
| "grad_norm": 0.9168776869773865, |
| "learning_rate": 3.0739474207026414e-05, |
| "loss": 0.1852, |
| "mean_token_accuracy": 0.913340613245964, |
| "num_tokens": 83105019.0, |
| "step": 4240 |
| }, |
| { |
| "entropy": 0.6792197197675705, |
| "epoch": 0.6868686868686869, |
| "grad_norm": 0.30777159333229065, |
| "learning_rate": 3.072005179217123e-05, |
| "loss": 0.1869, |
| "mean_token_accuracy": 0.9207609817385674, |
| "num_tokens": 83293836.0, |
| "step": 4250 |
| }, |
| { |
| "entropy": 0.6453257068991661, |
| "epoch": 0.6884848484848485, |
| "grad_norm": 0.7975980639457703, |
| "learning_rate": 3.0700591373357545e-05, |
| "loss": 0.191, |
| "mean_token_accuracy": 0.9087596118450165, |
| "num_tokens": 83491257.0, |
| "step": 4260 |
| }, |
| { |
| "entropy": 0.6475184738636017, |
| "epoch": 0.6901010101010101, |
| "grad_norm": 1.9192277193069458, |
| "learning_rate": 3.068109300652861e-05, |
| "loss": 0.2109, |
| "mean_token_accuracy": 0.9065216913819313, |
| "num_tokens": 83687806.0, |
| "step": 4270 |
| }, |
| { |
| "entropy": 0.6501183703541755, |
| "epoch": 0.6917171717171717, |
| "grad_norm": 0.3584229648113251, |
| "learning_rate": 3.0661556747736804e-05, |
| "loss": 0.1842, |
| "mean_token_accuracy": 0.9226197183132172, |
| "num_tokens": 83884039.0, |
| "step": 4280 |
| }, |
| { |
| "entropy": 0.6619088158011437, |
| "epoch": 0.6933333333333334, |
| "grad_norm": 0.4995156228542328, |
| "learning_rate": 3.06419826531434e-05, |
| "loss": 0.2075, |
| "mean_token_accuracy": 0.9091604053974152, |
| "num_tokens": 84081632.0, |
| "step": 4290 |
| }, |
| { |
| "entropy": 0.6826878771185875, |
| "epoch": 0.694949494949495, |
| "grad_norm": 1.2546823024749756, |
| "learning_rate": 3.0622370779018476e-05, |
| "loss": 0.1722, |
| "mean_token_accuracy": 0.9264982849359512, |
| "num_tokens": 84271214.0, |
| "step": 4300 |
| }, |
| { |
| "entropy": 0.6403830215334892, |
| "epoch": 0.6965656565656566, |
| "grad_norm": 2.722322702407837, |
| "learning_rate": 3.060272118174067e-05, |
| "loss": 0.2108, |
| "mean_token_accuracy": 0.9086841389536857, |
| "num_tokens": 84472924.0, |
| "step": 4310 |
| }, |
| { |
| "entropy": 0.6621344015002251, |
| "epoch": 0.6981818181818182, |
| "grad_norm": 0.30751991271972656, |
| "learning_rate": 3.058303391779712e-05, |
| "loss": 0.1911, |
| "mean_token_accuracy": 0.9191326126456261, |
| "num_tokens": 84666549.0, |
| "step": 4320 |
| }, |
| { |
| "entropy": 0.6759060591459274, |
| "epoch": 0.6997979797979798, |
| "grad_norm": 0.38084861636161804, |
| "learning_rate": 3.05633090437832e-05, |
| "loss": 0.1908, |
| "mean_token_accuracy": 0.918217821419239, |
| "num_tokens": 84857667.0, |
| "step": 4330 |
| }, |
| { |
| "entropy": 0.6639383666217327, |
| "epoch": 0.7014141414141414, |
| "grad_norm": 0.5564208626747131, |
| "learning_rate": 3.054354661640241e-05, |
| "loss": 0.1975, |
| "mean_token_accuracy": 0.916983051598072, |
| "num_tokens": 85052396.0, |
| "step": 4340 |
| }, |
| { |
| "entropy": 0.6601137340068817, |
| "epoch": 0.703030303030303, |
| "grad_norm": 0.26238831877708435, |
| "learning_rate": 3.052374669246622e-05, |
| "loss": 0.208, |
| "mean_token_accuracy": 0.9125534147024155, |
| "num_tokens": 85247718.0, |
| "step": 4350 |
| }, |
| { |
| "entropy": 0.6844325631856918, |
| "epoch": 0.7046464646464646, |
| "grad_norm": 0.4119778871536255, |
| "learning_rate": 3.0503909328893877e-05, |
| "loss": 0.1864, |
| "mean_token_accuracy": 0.9183766141533851, |
| "num_tokens": 85436202.0, |
| "step": 4360 |
| }, |
| { |
| "entropy": 0.6351237051188946, |
| "epoch": 0.7062626262626263, |
| "grad_norm": 0.2481747567653656, |
| "learning_rate": 3.048403458271227e-05, |
| "loss": 0.202, |
| "mean_token_accuracy": 0.9135596618056298, |
| "num_tokens": 85638337.0, |
| "step": 4370 |
| }, |
| { |
| "entropy": 0.6510977402329445, |
| "epoch": 0.7078787878787879, |
| "grad_norm": 0.2582281231880188, |
| "learning_rate": 3.0464122511055742e-05, |
| "loss": 0.2139, |
| "mean_token_accuracy": 0.9152849763631821, |
| "num_tokens": 85835739.0, |
| "step": 4380 |
| }, |
| { |
| "entropy": 0.6571626104414463, |
| "epoch": 0.7094949494949495, |
| "grad_norm": 0.39865022897720337, |
| "learning_rate": 3.0444173171165943e-05, |
| "loss": 0.1845, |
| "mean_token_accuracy": 0.9241902351379394, |
| "num_tokens": 86031887.0, |
| "step": 4390 |
| }, |
| { |
| "entropy": 0.6707207970321178, |
| "epoch": 0.7111111111111111, |
| "grad_norm": 4.553739070892334, |
| "learning_rate": 3.0424186620391658e-05, |
| "loss": 0.1986, |
| "mean_token_accuracy": 0.9116879284381867, |
| "num_tokens": 86221613.0, |
| "step": 4400 |
| }, |
| { |
| "entropy": 0.6952664569020272, |
| "epoch": 0.7127272727272728, |
| "grad_norm": 1.644877314567566, |
| "learning_rate": 3.040416291618864e-05, |
| "loss": 0.1896, |
| "mean_token_accuracy": 0.9263504967093468, |
| "num_tokens": 86405357.0, |
| "step": 4410 |
| }, |
| { |
| "entropy": 0.6387695834040642, |
| "epoch": 0.7143434343434344, |
| "grad_norm": 0.33424562215805054, |
| "learning_rate": 3.0384102116119443e-05, |
| "loss": 0.2039, |
| "mean_token_accuracy": 0.9096469655632973, |
| "num_tokens": 86605382.0, |
| "step": 4420 |
| }, |
| { |
| "entropy": 0.6235728591680527, |
| "epoch": 0.7159595959595959, |
| "grad_norm": 0.2764752209186554, |
| "learning_rate": 3.0364004277853282e-05, |
| "loss": 0.1944, |
| "mean_token_accuracy": 0.913192069530487, |
| "num_tokens": 86808324.0, |
| "step": 4430 |
| }, |
| { |
| "entropy": 0.6314373418688775, |
| "epoch": 0.7175757575757575, |
| "grad_norm": 0.512512743473053, |
| "learning_rate": 3.0343869459165815e-05, |
| "loss": 0.2098, |
| "mean_token_accuracy": 0.9095560878515243, |
| "num_tokens": 87010056.0, |
| "step": 4440 |
| }, |
| { |
| "entropy": 0.6400863215327263, |
| "epoch": 0.7191919191919192, |
| "grad_norm": 0.3962872326374054, |
| "learning_rate": 3.0323697717939035e-05, |
| "loss": 0.1925, |
| "mean_token_accuracy": 0.9209304124116897, |
| "num_tokens": 87206033.0, |
| "step": 4450 |
| }, |
| { |
| "entropy": 0.6568309247493744, |
| "epoch": 0.7208080808080808, |
| "grad_norm": 0.43984800577163696, |
| "learning_rate": 3.030348911216107e-05, |
| "loss": 0.184, |
| "mean_token_accuracy": 0.9168927937746048, |
| "num_tokens": 87400060.0, |
| "step": 4460 |
| }, |
| { |
| "entropy": 0.644003939628601, |
| "epoch": 0.7224242424242424, |
| "grad_norm": 0.7947312593460083, |
| "learning_rate": 3.0283243699926004e-05, |
| "loss": 0.1939, |
| "mean_token_accuracy": 0.9169272065162659, |
| "num_tokens": 87596089.0, |
| "step": 4470 |
| }, |
| { |
| "entropy": 0.6618608511984349, |
| "epoch": 0.724040404040404, |
| "grad_norm": 0.20077355206012726, |
| "learning_rate": 3.026296153943376e-05, |
| "loss": 0.1869, |
| "mean_token_accuracy": 0.92214535176754, |
| "num_tokens": 87786718.0, |
| "step": 4480 |
| }, |
| { |
| "entropy": 0.6560651198029518, |
| "epoch": 0.7256565656565657, |
| "grad_norm": 0.24577167630195618, |
| "learning_rate": 3.0242642688989887e-05, |
| "loss": 0.1927, |
| "mean_token_accuracy": 0.9134400516748429, |
| "num_tokens": 87977636.0, |
| "step": 4490 |
| }, |
| { |
| "entropy": 0.6534604743123055, |
| "epoch": 0.7272727272727273, |
| "grad_norm": 0.22389015555381775, |
| "learning_rate": 3.02222872070054e-05, |
| "loss": 0.1913, |
| "mean_token_accuracy": 0.9150966987013817, |
| "num_tokens": 88173839.0, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.7272727272727273, |
| "eval_entropy": 0.7084573372304439, |
| "eval_loss": 0.17917364835739136, |
| "eval_mean_token_accuracy": 0.9231415635347366, |
| "eval_num_tokens": 88173839.0, |
| "eval_runtime": 107.7522, |
| "eval_samples_per_second": 9.281, |
| "eval_steps_per_second": 9.281, |
| "step": 4500 |
| }, |
| { |
| "entropy": 0.625628887116909, |
| "epoch": 0.7288888888888889, |
| "grad_norm": 0.34484541416168213, |
| "learning_rate": 3.0201895151996636e-05, |
| "loss": 0.1992, |
| "mean_token_accuracy": 0.915931510925293, |
| "num_tokens": 88378636.0, |
| "step": 4510 |
| }, |
| { |
| "entropy": 0.6244528807699681, |
| "epoch": 0.7305050505050505, |
| "grad_norm": 0.253776490688324, |
| "learning_rate": 3.0181466582585056e-05, |
| "loss": 0.2, |
| "mean_token_accuracy": 0.9082011923193931, |
| "num_tokens": 88580751.0, |
| "step": 4520 |
| }, |
| { |
| "entropy": 0.6235019870102405, |
| "epoch": 0.7321212121212122, |
| "grad_norm": 0.41761407256126404, |
| "learning_rate": 3.0161001557497097e-05, |
| "loss": 0.2076, |
| "mean_token_accuracy": 0.9085301622748375, |
| "num_tokens": 88784953.0, |
| "step": 4530 |
| }, |
| { |
| "entropy": 0.6446406342089176, |
| "epoch": 0.7337373737373737, |
| "grad_norm": 0.31629061698913574, |
| "learning_rate": 3.0140500135563988e-05, |
| "loss": 0.1894, |
| "mean_token_accuracy": 0.9079845041036606, |
| "num_tokens": 88980717.0, |
| "step": 4540 |
| }, |
| { |
| "entropy": 0.6352882876992225, |
| "epoch": 0.7353535353535353, |
| "grad_norm": 0.311393678188324, |
| "learning_rate": 3.01199623757216e-05, |
| "loss": 0.2006, |
| "mean_token_accuracy": 0.9123006537556648, |
| "num_tokens": 89179270.0, |
| "step": 4550 |
| }, |
| { |
| "entropy": 0.6552980750799179, |
| "epoch": 0.7369696969696969, |
| "grad_norm": 0.2929733693599701, |
| "learning_rate": 3.0099388337010258e-05, |
| "loss": 0.1975, |
| "mean_token_accuracy": 0.9153749331831932, |
| "num_tokens": 89371171.0, |
| "step": 4560 |
| }, |
| { |
| "entropy": 0.6586298391222953, |
| "epoch": 0.7385858585858586, |
| "grad_norm": 0.35867592692375183, |
| "learning_rate": 3.0078778078574578e-05, |
| "loss": 0.1988, |
| "mean_token_accuracy": 0.9191118314862251, |
| "num_tokens": 89562627.0, |
| "step": 4570 |
| }, |
| { |
| "entropy": 0.6490816324949265, |
| "epoch": 0.7402020202020202, |
| "grad_norm": 0.41531920433044434, |
| "learning_rate": 3.00581316596633e-05, |
| "loss": 0.1954, |
| "mean_token_accuracy": 0.9126431494951248, |
| "num_tokens": 89756798.0, |
| "step": 4580 |
| }, |
| { |
| "entropy": 0.652235820889473, |
| "epoch": 0.7418181818181818, |
| "grad_norm": 0.2528453767299652, |
| "learning_rate": 3.0037449139629118e-05, |
| "loss": 0.2169, |
| "mean_token_accuracy": 0.9148132383823395, |
| "num_tokens": 89949016.0, |
| "step": 4590 |
| }, |
| { |
| "entropy": 0.6657333552837372, |
| "epoch": 0.7434343434343434, |
| "grad_norm": 0.35651499032974243, |
| "learning_rate": 3.0016730577928502e-05, |
| "loss": 0.2013, |
| "mean_token_accuracy": 0.9121545031666756, |
| "num_tokens": 90137136.0, |
| "step": 4600 |
| }, |
| { |
| "entropy": 0.6497416846454144, |
| "epoch": 0.7450505050505051, |
| "grad_norm": 0.8100721836090088, |
| "learning_rate": 2.9995976034121533e-05, |
| "loss": 0.1845, |
| "mean_token_accuracy": 0.9251446709036827, |
| "num_tokens": 90330201.0, |
| "step": 4610 |
| }, |
| { |
| "entropy": 0.6356344550848008, |
| "epoch": 0.7466666666666667, |
| "grad_norm": 0.55918288230896, |
| "learning_rate": 2.9975185567871724e-05, |
| "loss": 0.1885, |
| "mean_token_accuracy": 0.9203934907913208, |
| "num_tokens": 90527700.0, |
| "step": 4620 |
| }, |
| { |
| "entropy": 0.6411231979727745, |
| "epoch": 0.7482828282828283, |
| "grad_norm": 0.3770994246006012, |
| "learning_rate": 2.9954359238945874e-05, |
| "loss": 0.185, |
| "mean_token_accuracy": 0.9203880429267883, |
| "num_tokens": 90726375.0, |
| "step": 4630 |
| }, |
| { |
| "entropy": 0.6443192966282367, |
| "epoch": 0.74989898989899, |
| "grad_norm": 1.301261067390442, |
| "learning_rate": 2.993349710721386e-05, |
| "loss": 0.2015, |
| "mean_token_accuracy": 0.9126525431871414, |
| "num_tokens": 90922076.0, |
| "step": 4640 |
| }, |
| { |
| "entropy": 0.6357131384313106, |
| "epoch": 0.7515151515151515, |
| "grad_norm": 0.5413894057273865, |
| "learning_rate": 2.9912599232648484e-05, |
| "loss": 0.2105, |
| "mean_token_accuracy": 0.9099463567137718, |
| "num_tokens": 91120775.0, |
| "step": 4650 |
| }, |
| { |
| "entropy": 0.6498973920941353, |
| "epoch": 0.7531313131313131, |
| "grad_norm": 0.3530432879924774, |
| "learning_rate": 2.9891665675325303e-05, |
| "loss": 0.1886, |
| "mean_token_accuracy": 0.9196889087557792, |
| "num_tokens": 91313565.0, |
| "step": 4660 |
| }, |
| { |
| "entropy": 0.6522077709436417, |
| "epoch": 0.7547474747474747, |
| "grad_norm": 0.3330952525138855, |
| "learning_rate": 2.9870696495422457e-05, |
| "loss": 0.1916, |
| "mean_token_accuracy": 0.9102325558662414, |
| "num_tokens": 91508313.0, |
| "step": 4670 |
| }, |
| { |
| "entropy": 0.6606266908347607, |
| "epoch": 0.7563636363636363, |
| "grad_norm": 0.3526689410209656, |
| "learning_rate": 2.9849691753220472e-05, |
| "loss": 0.1852, |
| "mean_token_accuracy": 0.9151232793927193, |
| "num_tokens": 91698872.0, |
| "step": 4680 |
| }, |
| { |
| "entropy": 0.6377008631825447, |
| "epoch": 0.757979797979798, |
| "grad_norm": 0.9076177477836609, |
| "learning_rate": 2.982865150910213e-05, |
| "loss": 0.1972, |
| "mean_token_accuracy": 0.9114378541707993, |
| "num_tokens": 91897368.0, |
| "step": 4690 |
| }, |
| { |
| "entropy": 0.6670403979718685, |
| "epoch": 0.7595959595959596, |
| "grad_norm": 0.41317087411880493, |
| "learning_rate": 2.9807575823552256e-05, |
| "loss": 0.1976, |
| "mean_token_accuracy": 0.9175861835479736, |
| "num_tokens": 92089611.0, |
| "step": 4700 |
| }, |
| { |
| "entropy": 0.6318577192723751, |
| "epoch": 0.7612121212121212, |
| "grad_norm": 1.2537201642990112, |
| "learning_rate": 2.978646475715756e-05, |
| "loss": 0.2033, |
| "mean_token_accuracy": 0.9059315130114556, |
| "num_tokens": 92289093.0, |
| "step": 4710 |
| }, |
| { |
| "entropy": 0.641324719786644, |
| "epoch": 0.7628282828282829, |
| "grad_norm": 3.2999162673950195, |
| "learning_rate": 2.976531837060646e-05, |
| "loss": 0.2063, |
| "mean_token_accuracy": 0.9138477474451066, |
| "num_tokens": 92486668.0, |
| "step": 4720 |
| }, |
| { |
| "entropy": 0.6670213177800178, |
| "epoch": 0.7644444444444445, |
| "grad_norm": 0.544033408164978, |
| "learning_rate": 2.9744136724688925e-05, |
| "loss": 0.2029, |
| "mean_token_accuracy": 0.9136773809790611, |
| "num_tokens": 92676682.0, |
| "step": 4730 |
| }, |
| { |
| "entropy": 0.6242363020777703, |
| "epoch": 0.7660606060606061, |
| "grad_norm": 0.23182179033756256, |
| "learning_rate": 2.9722919880296275e-05, |
| "loss": 0.2127, |
| "mean_token_accuracy": 0.9077824458479882, |
| "num_tokens": 92881908.0, |
| "step": 4740 |
| }, |
| { |
| "entropy": 0.646241495758295, |
| "epoch": 0.7676767676767676, |
| "grad_norm": 0.46250054240226746, |
| "learning_rate": 2.9701667898421007e-05, |
| "loss": 0.1877, |
| "mean_token_accuracy": 0.9231104418635369, |
| "num_tokens": 93079703.0, |
| "step": 4750 |
| }, |
| { |
| "entropy": 0.6419562876224518, |
| "epoch": 0.7692929292929293, |
| "grad_norm": 0.5598077178001404, |
| "learning_rate": 2.968038084015664e-05, |
| "loss": 0.1999, |
| "mean_token_accuracy": 0.90995651781559, |
| "num_tokens": 93279631.0, |
| "step": 4760 |
| }, |
| { |
| "entropy": 0.6329437598586083, |
| "epoch": 0.7709090909090909, |
| "grad_norm": 0.44706058502197266, |
| "learning_rate": 2.9659058766697517e-05, |
| "loss": 0.192, |
| "mean_token_accuracy": 0.9213619500398635, |
| "num_tokens": 93485133.0, |
| "step": 4770 |
| }, |
| { |
| "entropy": 0.641933137178421, |
| "epoch": 0.7725252525252525, |
| "grad_norm": 0.26708972454071045, |
| "learning_rate": 2.9637701739338663e-05, |
| "loss": 0.1998, |
| "mean_token_accuracy": 0.9142438322305679, |
| "num_tokens": 93685004.0, |
| "step": 4780 |
| }, |
| { |
| "entropy": 0.661033497005701, |
| "epoch": 0.7741414141414141, |
| "grad_norm": 0.330924391746521, |
| "learning_rate": 2.9616309819475555e-05, |
| "loss": 0.1965, |
| "mean_token_accuracy": 0.9215885639190674, |
| "num_tokens": 93876811.0, |
| "step": 4790 |
| }, |
| { |
| "entropy": 0.6610300816595555, |
| "epoch": 0.7757575757575758, |
| "grad_norm": 0.5013708472251892, |
| "learning_rate": 2.9594883068603994e-05, |
| "loss": 0.1939, |
| "mean_token_accuracy": 0.9239798724651337, |
| "num_tokens": 94069193.0, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.7757575757575758, |
| "eval_entropy": 0.7066668328940868, |
| "eval_loss": 0.1791316419839859, |
| "eval_mean_token_accuracy": 0.9220727326869964, |
| "eval_num_tokens": 94069193.0, |
| "eval_runtime": 107.0928, |
| "eval_samples_per_second": 9.338, |
| "eval_steps_per_second": 9.338, |
| "step": 4800 |
| }, |
| { |
| "entropy": 0.6644454948604107, |
| "epoch": 0.7773737373737374, |
| "grad_norm": 0.3091444671154022, |
| "learning_rate": 2.9573421548319915e-05, |
| "loss": 0.1709, |
| "mean_token_accuracy": 0.9266034245491028, |
| "num_tokens": 94259025.0, |
| "step": 4810 |
| }, |
| { |
| "entropy": 0.661138878762722, |
| "epoch": 0.778989898989899, |
| "grad_norm": 3.5806069374084473, |
| "learning_rate": 2.955192532031919e-05, |
| "loss": 0.177, |
| "mean_token_accuracy": 0.9198731109499931, |
| "num_tokens": 94448772.0, |
| "step": 4820 |
| }, |
| { |
| "entropy": 0.6519260667264462, |
| "epoch": 0.7806060606060606, |
| "grad_norm": 0.3555883765220642, |
| "learning_rate": 2.9530394446397478e-05, |
| "loss": 0.1986, |
| "mean_token_accuracy": 0.9150562450289726, |
| "num_tokens": 94641890.0, |
| "step": 4830 |
| }, |
| { |
| "entropy": 0.648599949479103, |
| "epoch": 0.7822222222222223, |
| "grad_norm": 0.1755499243736267, |
| "learning_rate": 2.950882898845003e-05, |
| "loss": 0.1838, |
| "mean_token_accuracy": 0.9226464763283729, |
| "num_tokens": 94837343.0, |
| "step": 4840 |
| }, |
| { |
| "entropy": 0.6322360306978225, |
| "epoch": 0.7838383838383839, |
| "grad_norm": 0.6269310712814331, |
| "learning_rate": 2.9487229008471516e-05, |
| "loss": 0.1975, |
| "mean_token_accuracy": 0.9124949604272843, |
| "num_tokens": 95037833.0, |
| "step": 4850 |
| }, |
| { |
| "entropy": 0.6398171879351139, |
| "epoch": 0.7854545454545454, |
| "grad_norm": 0.7617530226707458, |
| "learning_rate": 2.9465594568555848e-05, |
| "loss": 0.2033, |
| "mean_token_accuracy": 0.9138290837407113, |
| "num_tokens": 95234489.0, |
| "step": 4860 |
| }, |
| { |
| "entropy": 0.6249017030000686, |
| "epoch": 0.787070707070707, |
| "grad_norm": 1.4149338006973267, |
| "learning_rate": 2.9443925730896002e-05, |
| "loss": 0.2005, |
| "mean_token_accuracy": 0.9207390502095223, |
| "num_tokens": 95437222.0, |
| "step": 4870 |
| }, |
| { |
| "entropy": 0.6542009584605694, |
| "epoch": 0.7886868686868687, |
| "grad_norm": 0.2479691058397293, |
| "learning_rate": 2.942222255778384e-05, |
| "loss": 0.1836, |
| "mean_token_accuracy": 0.9258310168981552, |
| "num_tokens": 95632246.0, |
| "step": 4880 |
| }, |
| { |
| "entropy": 0.6555586464703083, |
| "epoch": 0.7903030303030303, |
| "grad_norm": 0.37414678931236267, |
| "learning_rate": 2.940048511160993e-05, |
| "loss": 0.2025, |
| "mean_token_accuracy": 0.9188988924026489, |
| "num_tokens": 95826517.0, |
| "step": 4890 |
| }, |
| { |
| "entropy": 0.6773953974246979, |
| "epoch": 0.7919191919191919, |
| "grad_norm": 0.3192891478538513, |
| "learning_rate": 2.937871345486335e-05, |
| "loss": 0.1793, |
| "mean_token_accuracy": 0.9206574469804764, |
| "num_tokens": 96010001.0, |
| "step": 4900 |
| }, |
| { |
| "entropy": 0.6655720636248589, |
| "epoch": 0.7935353535353535, |
| "grad_norm": 0.29721325635910034, |
| "learning_rate": 2.935690765013155e-05, |
| "loss": 0.2051, |
| "mean_token_accuracy": 0.9204679220914841, |
| "num_tokens": 96199172.0, |
| "step": 4910 |
| }, |
| { |
| "entropy": 0.6329097993671894, |
| "epoch": 0.7951515151515152, |
| "grad_norm": 3.6277847290039062, |
| "learning_rate": 2.933506776010012e-05, |
| "loss": 0.1848, |
| "mean_token_accuracy": 0.9226977825164795, |
| "num_tokens": 96397606.0, |
| "step": 4920 |
| }, |
| { |
| "entropy": 0.6353421777486801, |
| "epoch": 0.7967676767676768, |
| "grad_norm": 0.5429826378822327, |
| "learning_rate": 2.9313193847552646e-05, |
| "loss": 0.1969, |
| "mean_token_accuracy": 0.9134786337614059, |
| "num_tokens": 96598079.0, |
| "step": 4930 |
| }, |
| { |
| "entropy": 0.6452288366854191, |
| "epoch": 0.7983838383838384, |
| "grad_norm": 0.6199573278427124, |
| "learning_rate": 2.9291285975370532e-05, |
| "loss": 0.1896, |
| "mean_token_accuracy": 0.9198967650532722, |
| "num_tokens": 96794052.0, |
| "step": 4940 |
| }, |
| { |
| "entropy": 0.6411104649305344, |
| "epoch": 0.8, |
| "grad_norm": 3.42073130607605, |
| "learning_rate": 2.9269344206532787e-05, |
| "loss": 0.2022, |
| "mean_token_accuracy": 0.9114478975534439, |
| "num_tokens": 96990153.0, |
| "step": 4950 |
| }, |
| { |
| "entropy": 0.6478223502635956, |
| "epoch": 0.8016161616161617, |
| "grad_norm": 0.543823778629303, |
| "learning_rate": 2.9247368604115868e-05, |
| "loss": 0.1914, |
| "mean_token_accuracy": 0.9210754543542862, |
| "num_tokens": 97187085.0, |
| "step": 4960 |
| }, |
| { |
| "entropy": 0.6267038069665432, |
| "epoch": 0.8032323232323232, |
| "grad_norm": 0.38325926661491394, |
| "learning_rate": 2.9225359231293504e-05, |
| "loss": 0.2113, |
| "mean_token_accuracy": 0.9096709057688713, |
| "num_tokens": 97391001.0, |
| "step": 4970 |
| }, |
| { |
| "entropy": 0.6416891925036907, |
| "epoch": 0.8048484848484848, |
| "grad_norm": 0.28709155321121216, |
| "learning_rate": 2.9203316151336503e-05, |
| "loss": 0.2057, |
| "mean_token_accuracy": 0.9100401312112808, |
| "num_tokens": 97589847.0, |
| "step": 4980 |
| }, |
| { |
| "entropy": 0.6513305693864823, |
| "epoch": 0.8064646464646464, |
| "grad_norm": 0.3699948191642761, |
| "learning_rate": 2.9181239427612553e-05, |
| "loss": 0.2023, |
| "mean_token_accuracy": 0.9095991387963295, |
| "num_tokens": 97784709.0, |
| "step": 4990 |
| }, |
| { |
| "entropy": 0.6572237811982632, |
| "epoch": 0.8080808080808081, |
| "grad_norm": 0.22213219106197357, |
| "learning_rate": 2.915912912358608e-05, |
| "loss": 0.1874, |
| "mean_token_accuracy": 0.9166789174079895, |
| "num_tokens": 97976974.0, |
| "step": 5000 |
| }, |
| { |
| "entropy": 0.6352049507200718, |
| "epoch": 0.8096969696969697, |
| "grad_norm": 0.3106616735458374, |
| "learning_rate": 2.9136985302818037e-05, |
| "loss": 0.1961, |
| "mean_token_accuracy": 0.9133357793092728, |
| "num_tokens": 98178430.0, |
| "step": 5010 |
| }, |
| { |
| "entropy": 0.6425909124314785, |
| "epoch": 0.8113131313131313, |
| "grad_norm": 0.21424593031406403, |
| "learning_rate": 2.9114808028965725e-05, |
| "loss": 0.206, |
| "mean_token_accuracy": 0.9140076264739037, |
| "num_tokens": 98378932.0, |
| "step": 5020 |
| }, |
| { |
| "entropy": 0.633851234614849, |
| "epoch": 0.812929292929293, |
| "grad_norm": 0.6443716883659363, |
| "learning_rate": 2.909259736578261e-05, |
| "loss": 0.2156, |
| "mean_token_accuracy": 0.9081557050347329, |
| "num_tokens": 98582018.0, |
| "step": 5030 |
| }, |
| { |
| "entropy": 0.6445118576288223, |
| "epoch": 0.8145454545454546, |
| "grad_norm": 0.2837905287742615, |
| "learning_rate": 2.9070353377118157e-05, |
| "loss": 0.2013, |
| "mean_token_accuracy": 0.9167410224676132, |
| "num_tokens": 98780254.0, |
| "step": 5040 |
| }, |
| { |
| "entropy": 0.6554682970046997, |
| "epoch": 0.8161616161616162, |
| "grad_norm": 0.19515164196491241, |
| "learning_rate": 2.904807612691762e-05, |
| "loss": 0.1841, |
| "mean_token_accuracy": 0.9176914617419243, |
| "num_tokens": 98975630.0, |
| "step": 5050 |
| }, |
| { |
| "entropy": 0.6432619698345661, |
| "epoch": 0.8177777777777778, |
| "grad_norm": 0.4333808422088623, |
| "learning_rate": 2.9025765679221877e-05, |
| "loss": 0.1874, |
| "mean_token_accuracy": 0.9236332550644875, |
| "num_tokens": 99172630.0, |
| "step": 5060 |
| }, |
| { |
| "entropy": 0.6520089380443096, |
| "epoch": 0.8193939393939393, |
| "grad_norm": 0.2755585312843323, |
| "learning_rate": 2.9003422098167233e-05, |
| "loss": 0.2044, |
| "mean_token_accuracy": 0.9035430192947388, |
| "num_tokens": 99366672.0, |
| "step": 5070 |
| }, |
| { |
| "entropy": 0.6506100505590439, |
| "epoch": 0.821010101010101, |
| "grad_norm": 0.2337871491909027, |
| "learning_rate": 2.8981045447985252e-05, |
| "loss": 0.1886, |
| "mean_token_accuracy": 0.9145638585090637, |
| "num_tokens": 99560531.0, |
| "step": 5080 |
| }, |
| { |
| "entropy": 0.6408963188529014, |
| "epoch": 0.8226262626262626, |
| "grad_norm": 0.6002528071403503, |
| "learning_rate": 2.8958635793002555e-05, |
| "loss": 0.2027, |
| "mean_token_accuracy": 0.9141564324498177, |
| "num_tokens": 99763014.0, |
| "step": 5090 |
| }, |
| { |
| "entropy": 0.6397250548005105, |
| "epoch": 0.8242424242424242, |
| "grad_norm": 0.4990832209587097, |
| "learning_rate": 2.893619319764064e-05, |
| "loss": 0.2051, |
| "mean_token_accuracy": 0.9149850279092788, |
| "num_tokens": 99963749.0, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.8242424242424242, |
| "eval_entropy": 0.711084859162569, |
| "eval_loss": 0.17738723754882812, |
| "eval_mean_token_accuracy": 0.9236658059358597, |
| "eval_num_tokens": 99963749.0, |
| "eval_runtime": 108.7249, |
| "eval_samples_per_second": 9.198, |
| "eval_steps_per_second": 9.198, |
| "step": 5100 |
| }, |
| { |
| "entropy": 0.6357235059142112, |
| "epoch": 0.8258585858585858, |
| "grad_norm": 0.3885038495063782, |
| "learning_rate": 2.8913717726415703e-05, |
| "loss": 0.1997, |
| "mean_token_accuracy": 0.9168539077043534, |
| "num_tokens": 100163382.0, |
| "step": 5110 |
| }, |
| { |
| "entropy": 0.6695950977504254, |
| "epoch": 0.8274747474747475, |
| "grad_norm": 0.3567816913127899, |
| "learning_rate": 2.8891209443938462e-05, |
| "loss": 0.1857, |
| "mean_token_accuracy": 0.9217189520597457, |
| "num_tokens": 100352951.0, |
| "step": 5120 |
| }, |
| { |
| "entropy": 0.644576845318079, |
| "epoch": 0.8290909090909091, |
| "grad_norm": 1.626629114151001, |
| "learning_rate": 2.8868668414913932e-05, |
| "loss": 0.2001, |
| "mean_token_accuracy": 0.9181193798780442, |
| "num_tokens": 100550280.0, |
| "step": 5130 |
| }, |
| { |
| "entropy": 0.6233297750353813, |
| "epoch": 0.8307070707070707, |
| "grad_norm": 0.2700617015361786, |
| "learning_rate": 2.8846094704141283e-05, |
| "loss": 0.2024, |
| "mean_token_accuracy": 0.917197747528553, |
| "num_tokens": 100755619.0, |
| "step": 5140 |
| }, |
| { |
| "entropy": 0.6268128864467144, |
| "epoch": 0.8323232323232324, |
| "grad_norm": 0.3646371066570282, |
| "learning_rate": 2.8823488376513636e-05, |
| "loss": 0.1905, |
| "mean_token_accuracy": 0.9136819630861283, |
| "num_tokens": 100957153.0, |
| "step": 5150 |
| }, |
| { |
| "entropy": 0.6693423956632614, |
| "epoch": 0.833939393939394, |
| "grad_norm": 0.41190770268440247, |
| "learning_rate": 2.8800849497017874e-05, |
| "loss": 0.1982, |
| "mean_token_accuracy": 0.912401182949543, |
| "num_tokens": 101146659.0, |
| "step": 5160 |
| }, |
| { |
| "entropy": 0.6590829059481621, |
| "epoch": 0.8355555555555556, |
| "grad_norm": 1.4731428623199463, |
| "learning_rate": 2.8778178130734445e-05, |
| "loss": 0.1928, |
| "mean_token_accuracy": 0.9157117456197739, |
| "num_tokens": 101340632.0, |
| "step": 5170 |
| }, |
| { |
| "entropy": 0.6364330008625985, |
| "epoch": 0.8371717171717171, |
| "grad_norm": 0.4675506055355072, |
| "learning_rate": 2.8755474342837214e-05, |
| "loss": 0.2048, |
| "mean_token_accuracy": 0.9125443175435066, |
| "num_tokens": 101540607.0, |
| "step": 5180 |
| }, |
| { |
| "entropy": 0.646899762749672, |
| "epoch": 0.8387878787878787, |
| "grad_norm": 0.6047652363777161, |
| "learning_rate": 2.873273819859321e-05, |
| "loss": 0.1922, |
| "mean_token_accuracy": 0.914633323252201, |
| "num_tokens": 101737070.0, |
| "step": 5190 |
| }, |
| { |
| "entropy": 0.6807426974177361, |
| "epoch": 0.8404040404040404, |
| "grad_norm": 0.26485979557037354, |
| "learning_rate": 2.8709969763362528e-05, |
| "loss": 0.1828, |
| "mean_token_accuracy": 0.9270256280899047, |
| "num_tokens": 101924748.0, |
| "step": 5200 |
| }, |
| { |
| "entropy": 0.6325490467250348, |
| "epoch": 0.842020202020202, |
| "grad_norm": 0.6139155030250549, |
| "learning_rate": 2.8687169102598045e-05, |
| "loss": 0.1959, |
| "mean_token_accuracy": 0.9126851588487626, |
| "num_tokens": 102126864.0, |
| "step": 5210 |
| }, |
| { |
| "entropy": 0.6704427659511566, |
| "epoch": 0.8436363636363636, |
| "grad_norm": 0.47554439306259155, |
| "learning_rate": 2.8664336281845305e-05, |
| "loss": 0.1859, |
| "mean_token_accuracy": 0.9185951977968216, |
| "num_tokens": 102317161.0, |
| "step": 5220 |
| }, |
| { |
| "entropy": 0.6308740340173244, |
| "epoch": 0.8452525252525253, |
| "grad_norm": 0.28282374143600464, |
| "learning_rate": 2.864147136674229e-05, |
| "loss": 0.2062, |
| "mean_token_accuracy": 0.9082028090953826, |
| "num_tokens": 102520346.0, |
| "step": 5230 |
| }, |
| { |
| "entropy": 0.6597593754529953, |
| "epoch": 0.8468686868686869, |
| "grad_norm": 0.29790395498275757, |
| "learning_rate": 2.8618574423019245e-05, |
| "loss": 0.1866, |
| "mean_token_accuracy": 0.9192267313599587, |
| "num_tokens": 102711472.0, |
| "step": 5240 |
| }, |
| { |
| "entropy": 0.6680999815464019, |
| "epoch": 0.8484848484848485, |
| "grad_norm": 0.33394286036491394, |
| "learning_rate": 2.85956455164985e-05, |
| "loss": 0.1689, |
| "mean_token_accuracy": 0.9244133904576302, |
| "num_tokens": 102900401.0, |
| "step": 5250 |
| }, |
| { |
| "entropy": 0.6864773757755757, |
| "epoch": 0.8501010101010101, |
| "grad_norm": 0.5447540879249573, |
| "learning_rate": 2.8572684713094258e-05, |
| "loss": 0.1772, |
| "mean_token_accuracy": 0.9200161874294281, |
| "num_tokens": 103084516.0, |
| "step": 5260 |
| }, |
| { |
| "entropy": 0.6241940252482892, |
| "epoch": 0.8517171717171718, |
| "grad_norm": 0.2910633981227875, |
| "learning_rate": 2.8549692078812408e-05, |
| "loss": 0.1815, |
| "mean_token_accuracy": 0.9207342237234115, |
| "num_tokens": 103288299.0, |
| "step": 5270 |
| }, |
| { |
| "entropy": 0.6231748268008233, |
| "epoch": 0.8533333333333334, |
| "grad_norm": 1.1171069145202637, |
| "learning_rate": 2.8526667679750373e-05, |
| "loss": 0.2025, |
| "mean_token_accuracy": 0.9095341920852661, |
| "num_tokens": 103490561.0, |
| "step": 5280 |
| }, |
| { |
| "entropy": 0.678689093887806, |
| "epoch": 0.8549494949494949, |
| "grad_norm": 0.9614298939704895, |
| "learning_rate": 2.850361158209686e-05, |
| "loss": 0.1854, |
| "mean_token_accuracy": 0.916099627315998, |
| "num_tokens": 103676969.0, |
| "step": 5290 |
| }, |
| { |
| "entropy": 0.6450789928436279, |
| "epoch": 0.8565656565656565, |
| "grad_norm": 1.1223703622817993, |
| "learning_rate": 2.848052385213172e-05, |
| "loss": 0.1967, |
| "mean_token_accuracy": 0.9124213516712188, |
| "num_tokens": 103874801.0, |
| "step": 5300 |
| }, |
| { |
| "entropy": 0.61056489944458, |
| "epoch": 0.8581818181818182, |
| "grad_norm": 0.6901957988739014, |
| "learning_rate": 2.845740455622573e-05, |
| "loss": 0.2225, |
| "mean_token_accuracy": 0.9036866158246994, |
| "num_tokens": 104084220.0, |
| "step": 5310 |
| }, |
| { |
| "entropy": 0.6487591087818145, |
| "epoch": 0.8597979797979798, |
| "grad_norm": 0.2818031907081604, |
| "learning_rate": 2.843425376084041e-05, |
| "loss": 0.1711, |
| "mean_token_accuracy": 0.9231310933828354, |
| "num_tokens": 104279547.0, |
| "step": 5320 |
| }, |
| { |
| "entropy": 0.6611220106482506, |
| "epoch": 0.8614141414141414, |
| "grad_norm": 2.2702674865722656, |
| "learning_rate": 2.8411071532527836e-05, |
| "loss": 0.1863, |
| "mean_token_accuracy": 0.9261722102761268, |
| "num_tokens": 104471791.0, |
| "step": 5330 |
| }, |
| { |
| "entropy": 0.6651269473135472, |
| "epoch": 0.863030303030303, |
| "grad_norm": 0.36874091625213623, |
| "learning_rate": 2.8387857937930444e-05, |
| "loss": 0.1985, |
| "mean_token_accuracy": 0.9155077993869781, |
| "num_tokens": 104666419.0, |
| "step": 5340 |
| }, |
| { |
| "entropy": 0.6907488480210304, |
| "epoch": 0.8646464646464647, |
| "grad_norm": 0.34566083550453186, |
| "learning_rate": 2.8364613043780834e-05, |
| "loss": 0.1836, |
| "mean_token_accuracy": 0.9205912709236145, |
| "num_tokens": 104851104.0, |
| "step": 5350 |
| }, |
| { |
| "entropy": 0.6465521849691868, |
| "epoch": 0.8662626262626263, |
| "grad_norm": 0.5255801677703857, |
| "learning_rate": 2.8341336916901593e-05, |
| "loss": 0.1931, |
| "mean_token_accuracy": 0.9175161227583886, |
| "num_tokens": 105048989.0, |
| "step": 5360 |
| }, |
| { |
| "entropy": 0.6434522531926632, |
| "epoch": 0.8678787878787879, |
| "grad_norm": 0.6469079256057739, |
| "learning_rate": 2.8318029624205086e-05, |
| "loss": 0.1932, |
| "mean_token_accuracy": 0.9107877269387246, |
| "num_tokens": 105247416.0, |
| "step": 5370 |
| }, |
| { |
| "entropy": 0.6635441213846207, |
| "epoch": 0.8694949494949495, |
| "grad_norm": 0.2386147379875183, |
| "learning_rate": 2.829469123269327e-05, |
| "loss": 0.1974, |
| "mean_token_accuracy": 0.916090327501297, |
| "num_tokens": 105438507.0, |
| "step": 5380 |
| }, |
| { |
| "entropy": 0.6538894563913346, |
| "epoch": 0.8711111111111111, |
| "grad_norm": 0.24012506008148193, |
| "learning_rate": 2.8271321809457514e-05, |
| "loss": 0.201, |
| "mean_token_accuracy": 0.9102769285440445, |
| "num_tokens": 105634997.0, |
| "step": 5390 |
| }, |
| { |
| "entropy": 0.6566796422004699, |
| "epoch": 0.8727272727272727, |
| "grad_norm": 0.6380462646484375, |
| "learning_rate": 2.824792142167838e-05, |
| "loss": 0.1768, |
| "mean_token_accuracy": 0.9162303432822227, |
| "num_tokens": 105829263.0, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.8727272727272727, |
| "eval_entropy": 0.7135846571624279, |
| "eval_loss": 0.17874300479888916, |
| "eval_mean_token_accuracy": 0.9199917802810669, |
| "eval_num_tokens": 105829263.0, |
| "eval_runtime": 107.6177, |
| "eval_samples_per_second": 9.292, |
| "eval_steps_per_second": 9.292, |
| "step": 5400 |
| }, |
| { |
| "entropy": 0.6447012484073639, |
| "epoch": 0.8743434343434343, |
| "grad_norm": 2.0912580490112305, |
| "learning_rate": 2.8224490136625455e-05, |
| "loss": 0.2109, |
| "mean_token_accuracy": 0.9130528718233109, |
| "num_tokens": 106026045.0, |
| "step": 5410 |
| }, |
| { |
| "entropy": 0.6804825678467751, |
| "epoch": 0.8759595959595959, |
| "grad_norm": 0.47214868664741516, |
| "learning_rate": 2.8201028021657138e-05, |
| "loss": 0.173, |
| "mean_token_accuracy": 0.9197008013725281, |
| "num_tokens": 106215632.0, |
| "step": 5420 |
| }, |
| { |
| "entropy": 0.6902777642011643, |
| "epoch": 0.8775757575757576, |
| "grad_norm": 0.18231706321239471, |
| "learning_rate": 2.8177535144220456e-05, |
| "loss": 0.1848, |
| "mean_token_accuracy": 0.9233282685279847, |
| "num_tokens": 106401153.0, |
| "step": 5430 |
| }, |
| { |
| "entropy": 0.6623747788369656, |
| "epoch": 0.8791919191919192, |
| "grad_norm": 0.36545881628990173, |
| "learning_rate": 2.8154011571850884e-05, |
| "loss": 0.2218, |
| "mean_token_accuracy": 0.9072560966014862, |
| "num_tokens": 106598360.0, |
| "step": 5440 |
| }, |
| { |
| "entropy": 0.685894726216793, |
| "epoch": 0.8808080808080808, |
| "grad_norm": 0.4281963109970093, |
| "learning_rate": 2.8130457372172125e-05, |
| "loss": 0.1909, |
| "mean_token_accuracy": 0.9133310407400131, |
| "num_tokens": 106790942.0, |
| "step": 5450 |
| }, |
| { |
| "entropy": 0.683454804122448, |
| "epoch": 0.8824242424242424, |
| "grad_norm": 0.3858519196510315, |
| "learning_rate": 2.8106872612895914e-05, |
| "loss": 0.1755, |
| "mean_token_accuracy": 0.9197027593851089, |
| "num_tokens": 106978744.0, |
| "step": 5460 |
| }, |
| { |
| "entropy": 0.6165321469306946, |
| "epoch": 0.8840404040404041, |
| "grad_norm": 0.2674444019794464, |
| "learning_rate": 2.8083257361821872e-05, |
| "loss": 0.2052, |
| "mean_token_accuracy": 0.9134606182575226, |
| "num_tokens": 107190687.0, |
| "step": 5470 |
| }, |
| { |
| "entropy": 0.6856359049677849, |
| "epoch": 0.8856565656565657, |
| "grad_norm": 0.40422874689102173, |
| "learning_rate": 2.8059611686837236e-05, |
| "loss": 0.2047, |
| "mean_token_accuracy": 0.9142091274261475, |
| "num_tokens": 107380321.0, |
| "step": 5480 |
| }, |
| { |
| "entropy": 0.674786227196455, |
| "epoch": 0.8872727272727273, |
| "grad_norm": 0.30188801884651184, |
| "learning_rate": 2.8035935655916723e-05, |
| "loss": 0.204, |
| "mean_token_accuracy": 0.9160023897886276, |
| "num_tokens": 107575323.0, |
| "step": 5490 |
| }, |
| { |
| "entropy": 0.6544559545814991, |
| "epoch": 0.8888888888888888, |
| "grad_norm": 0.6562331318855286, |
| "learning_rate": 2.8012229337122324e-05, |
| "loss": 0.1947, |
| "mean_token_accuracy": 0.9191029623150826, |
| "num_tokens": 107774878.0, |
| "step": 5500 |
| }, |
| { |
| "entropy": 0.6787748537957669, |
| "epoch": 0.8905050505050505, |
| "grad_norm": 0.27502256631851196, |
| "learning_rate": 2.7988492798603072e-05, |
| "loss": 0.1792, |
| "mean_token_accuracy": 0.9223506987094879, |
| "num_tokens": 107967851.0, |
| "step": 5510 |
| }, |
| { |
| "entropy": 0.6641766116023063, |
| "epoch": 0.8921212121212121, |
| "grad_norm": 0.442937970161438, |
| "learning_rate": 2.7964726108594904e-05, |
| "loss": 0.1962, |
| "mean_token_accuracy": 0.9132106631994248, |
| "num_tokens": 108163235.0, |
| "step": 5520 |
| }, |
| { |
| "entropy": 0.6542678773403168, |
| "epoch": 0.8937373737373737, |
| "grad_norm": 0.30644455552101135, |
| "learning_rate": 2.794092933542041e-05, |
| "loss": 0.2097, |
| "mean_token_accuracy": 0.9084485098719597, |
| "num_tokens": 108361379.0, |
| "step": 5530 |
| }, |
| { |
| "entropy": 0.671286852657795, |
| "epoch": 0.8953535353535353, |
| "grad_norm": 0.2679237723350525, |
| "learning_rate": 2.7917102547488676e-05, |
| "loss": 0.1995, |
| "mean_token_accuracy": 0.9114113479852677, |
| "num_tokens": 108554126.0, |
| "step": 5540 |
| }, |
| { |
| "entropy": 0.6369331762194633, |
| "epoch": 0.896969696969697, |
| "grad_norm": 2.665994882583618, |
| "learning_rate": 2.789324581329506e-05, |
| "loss": 0.2008, |
| "mean_token_accuracy": 0.9015925779938698, |
| "num_tokens": 108756149.0, |
| "step": 5550 |
| }, |
| { |
| "entropy": 0.6549828127026558, |
| "epoch": 0.8985858585858586, |
| "grad_norm": 0.23621822893619537, |
| "learning_rate": 2.786935920142102e-05, |
| "loss": 0.1956, |
| "mean_token_accuracy": 0.9176381945610046, |
| "num_tokens": 108955510.0, |
| "step": 5560 |
| }, |
| { |
| "entropy": 0.6628460936248303, |
| "epoch": 0.9002020202020202, |
| "grad_norm": 0.28806036710739136, |
| "learning_rate": 2.784544278053389e-05, |
| "loss": 0.199, |
| "mean_token_accuracy": 0.9112476229667663, |
| "num_tokens": 109153359.0, |
| "step": 5570 |
| }, |
| { |
| "entropy": 0.6598017416894436, |
| "epoch": 0.9018181818181819, |
| "grad_norm": 0.3448881506919861, |
| "learning_rate": 2.7821496619386704e-05, |
| "loss": 0.1925, |
| "mean_token_accuracy": 0.915037252008915, |
| "num_tokens": 109350258.0, |
| "step": 5580 |
| }, |
| { |
| "entropy": 0.6612473607063294, |
| "epoch": 0.9034343434343435, |
| "grad_norm": 0.2845333218574524, |
| "learning_rate": 2.7797520786817996e-05, |
| "loss": 0.1962, |
| "mean_token_accuracy": 0.9182912409305573, |
| "num_tokens": 109544964.0, |
| "step": 5590 |
| }, |
| { |
| "entropy": 0.6518812745809555, |
| "epoch": 0.9050505050505051, |
| "grad_norm": 0.21649111807346344, |
| "learning_rate": 2.7773515351751585e-05, |
| "loss": 0.2054, |
| "mean_token_accuracy": 0.9045742139220238, |
| "num_tokens": 109742959.0, |
| "step": 5600 |
| }, |
| { |
| "entropy": 0.6644902803003788, |
| "epoch": 0.9066666666666666, |
| "grad_norm": 0.33554190397262573, |
| "learning_rate": 2.774948038319641e-05, |
| "loss": 0.1939, |
| "mean_token_accuracy": 0.919256579875946, |
| "num_tokens": 109935296.0, |
| "step": 5610 |
| }, |
| { |
| "entropy": 0.6862445369362831, |
| "epoch": 0.9082828282828282, |
| "grad_norm": 0.2826671302318573, |
| "learning_rate": 2.772541595024628e-05, |
| "loss": 0.1894, |
| "mean_token_accuracy": 0.9180306404829025, |
| "num_tokens": 110124649.0, |
| "step": 5620 |
| }, |
| { |
| "entropy": 0.6782503843307495, |
| "epoch": 0.9098989898989899, |
| "grad_norm": 0.2841434180736542, |
| "learning_rate": 2.770132212207973e-05, |
| "loss": 0.1823, |
| "mean_token_accuracy": 0.9190478280186654, |
| "num_tokens": 110313361.0, |
| "step": 5630 |
| }, |
| { |
| "entropy": 0.6932250164449215, |
| "epoch": 0.9115151515151515, |
| "grad_norm": 0.29083821177482605, |
| "learning_rate": 2.767719896795979e-05, |
| "loss": 0.1973, |
| "mean_token_accuracy": 0.9196494951844215, |
| "num_tokens": 110501349.0, |
| "step": 5640 |
| }, |
| { |
| "entropy": 0.6889181569218635, |
| "epoch": 0.9131313131313131, |
| "grad_norm": 0.37333911657333374, |
| "learning_rate": 2.765304655723379e-05, |
| "loss": 0.1845, |
| "mean_token_accuracy": 0.9226689338684082, |
| "num_tokens": 110687222.0, |
| "step": 5650 |
| }, |
| { |
| "entropy": 0.6439741283655167, |
| "epoch": 0.9147474747474748, |
| "grad_norm": 0.4025020897388458, |
| "learning_rate": 2.7628864959333172e-05, |
| "loss": 0.2101, |
| "mean_token_accuracy": 0.910821832716465, |
| "num_tokens": 110888988.0, |
| "step": 5660 |
| }, |
| { |
| "entropy": 0.6726826570928097, |
| "epoch": 0.9163636363636364, |
| "grad_norm": 0.35746780037879944, |
| "learning_rate": 2.760465424377328e-05, |
| "loss": 0.1783, |
| "mean_token_accuracy": 0.9297507956624032, |
| "num_tokens": 111080942.0, |
| "step": 5670 |
| }, |
| { |
| "entropy": 0.6542132571339607, |
| "epoch": 0.917979797979798, |
| "grad_norm": 0.35613134503364563, |
| "learning_rate": 2.758041448015316e-05, |
| "loss": 0.2085, |
| "mean_token_accuracy": 0.9105470031499863, |
| "num_tokens": 111276582.0, |
| "step": 5680 |
| }, |
| { |
| "entropy": 0.6565234795212745, |
| "epoch": 0.9195959595959596, |
| "grad_norm": 0.3237346112728119, |
| "learning_rate": 2.7556145738155364e-05, |
| "loss": 0.1977, |
| "mean_token_accuracy": 0.9160599425435066, |
| "num_tokens": 111471036.0, |
| "step": 5690 |
| }, |
| { |
| "entropy": 0.6647991955280304, |
| "epoch": 0.9212121212121213, |
| "grad_norm": 0.2765544652938843, |
| "learning_rate": 2.7531848087545762e-05, |
| "loss": 0.1969, |
| "mean_token_accuracy": 0.9130775973200798, |
| "num_tokens": 111664540.0, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.9212121212121213, |
| "eval_entropy": 0.7171694939434529, |
| "eval_loss": 0.18071074783802032, |
| "eval_mean_token_accuracy": 0.9210708927512169, |
| "eval_num_tokens": 111664540.0, |
| "eval_runtime": 108.214, |
| "eval_samples_per_second": 9.241, |
| "eval_steps_per_second": 9.241, |
| "step": 5700 |
| }, |
| { |
| "entropy": 0.6711388848721981, |
| "epoch": 0.9228282828282828, |
| "grad_norm": 0.2098732590675354, |
| "learning_rate": 2.7507521598173307e-05, |
| "loss": 0.188, |
| "mean_token_accuracy": 0.9198487937450409, |
| "num_tokens": 111855354.0, |
| "step": 5710 |
| }, |
| { |
| "entropy": 0.6491121411323547, |
| "epoch": 0.9244444444444444, |
| "grad_norm": 0.24254560470581055, |
| "learning_rate": 2.748316633996987e-05, |
| "loss": 0.2103, |
| "mean_token_accuracy": 0.9093293219804763, |
| "num_tokens": 112052332.0, |
| "step": 5720 |
| }, |
| { |
| "entropy": 0.6290779180824757, |
| "epoch": 0.926060606060606, |
| "grad_norm": 0.2390562742948532, |
| "learning_rate": 2.745878238295001e-05, |
| "loss": 0.1966, |
| "mean_token_accuracy": 0.9128098532557487, |
| "num_tokens": 112258189.0, |
| "step": 5730 |
| }, |
| { |
| "entropy": 0.6869820304214954, |
| "epoch": 0.9276767676767677, |
| "grad_norm": 1.397542953491211, |
| "learning_rate": 2.7434369797210803e-05, |
| "loss": 0.1831, |
| "mean_token_accuracy": 0.9231995150446892, |
| "num_tokens": 112445227.0, |
| "step": 5740 |
| }, |
| { |
| "entropy": 0.6402010828256607, |
| "epoch": 0.9292929292929293, |
| "grad_norm": 0.2345576286315918, |
| "learning_rate": 2.740992865293162e-05, |
| "loss": 0.207, |
| "mean_token_accuracy": 0.9165661379694938, |
| "num_tokens": 112644322.0, |
| "step": 5750 |
| }, |
| { |
| "entropy": 0.6373597458004951, |
| "epoch": 0.9309090909090909, |
| "grad_norm": 0.3196069002151489, |
| "learning_rate": 2.7385459020373933e-05, |
| "loss": 0.2046, |
| "mean_token_accuracy": 0.9103137984871864, |
| "num_tokens": 112844093.0, |
| "step": 5760 |
| }, |
| { |
| "entropy": 0.6428131617605686, |
| "epoch": 0.9325252525252525, |
| "grad_norm": 1.4492580890655518, |
| "learning_rate": 2.736096096988109e-05, |
| "loss": 0.1984, |
| "mean_token_accuracy": 0.917301295697689, |
| "num_tokens": 113041205.0, |
| "step": 5770 |
| }, |
| { |
| "entropy": 0.6344161719083786, |
| "epoch": 0.9341414141414142, |
| "grad_norm": 0.3058554232120514, |
| "learning_rate": 2.733643457187816e-05, |
| "loss": 0.1981, |
| "mean_token_accuracy": 0.9163832649588585, |
| "num_tokens": 113242489.0, |
| "step": 5780 |
| }, |
| { |
| "entropy": 0.6687356859445572, |
| "epoch": 0.9357575757575758, |
| "grad_norm": 0.550967812538147, |
| "learning_rate": 2.731187989687169e-05, |
| "loss": 0.1944, |
| "mean_token_accuracy": 0.9172602981328964, |
| "num_tokens": 113432928.0, |
| "step": 5790 |
| }, |
| { |
| "entropy": 0.6460176803171634, |
| "epoch": 0.9373737373737374, |
| "grad_norm": 0.7452371716499329, |
| "learning_rate": 2.728729701544951e-05, |
| "loss": 0.2064, |
| "mean_token_accuracy": 0.9076276451349259, |
| "num_tokens": 113631585.0, |
| "step": 5800 |
| }, |
| { |
| "entropy": 0.6974510416388512, |
| "epoch": 0.938989898989899, |
| "grad_norm": 0.24986976385116577, |
| "learning_rate": 2.7262685998280537e-05, |
| "loss": 0.2043, |
| "mean_token_accuracy": 0.915585121512413, |
| "num_tokens": 113816162.0, |
| "step": 5810 |
| }, |
| { |
| "entropy": 0.6628655359148979, |
| "epoch": 0.9406060606060606, |
| "grad_norm": 0.2667369246482849, |
| "learning_rate": 2.723804691611459e-05, |
| "loss": 0.1967, |
| "mean_token_accuracy": 0.9135117039084435, |
| "num_tokens": 114010508.0, |
| "step": 5820 |
| }, |
| { |
| "entropy": 0.6733184188604355, |
| "epoch": 0.9422222222222222, |
| "grad_norm": 0.29969391226768494, |
| "learning_rate": 2.7213379839782144e-05, |
| "loss": 0.1909, |
| "mean_token_accuracy": 0.9115293577313424, |
| "num_tokens": 114202584.0, |
| "step": 5830 |
| }, |
| { |
| "entropy": 0.6488926939666271, |
| "epoch": 0.9438383838383838, |
| "grad_norm": 0.43534573912620544, |
| "learning_rate": 2.7188684840194158e-05, |
| "loss": 0.1955, |
| "mean_token_accuracy": 0.9078608691692353, |
| "num_tokens": 114401106.0, |
| "step": 5840 |
| }, |
| { |
| "entropy": 0.6797570586204529, |
| "epoch": 0.9454545454545454, |
| "grad_norm": 1.7962315082550049, |
| "learning_rate": 2.716396198834186e-05, |
| "loss": 0.2005, |
| "mean_token_accuracy": 0.9150436997413636, |
| "num_tokens": 114589015.0, |
| "step": 5850 |
| }, |
| { |
| "entropy": 0.6667471393942833, |
| "epoch": 0.9470707070707071, |
| "grad_norm": 0.4936676025390625, |
| "learning_rate": 2.7139211355296555e-05, |
| "loss": 0.1994, |
| "mean_token_accuracy": 0.9127405688166619, |
| "num_tokens": 114781132.0, |
| "step": 5860 |
| }, |
| { |
| "entropy": 0.6693764880299569, |
| "epoch": 0.9486868686868687, |
| "grad_norm": 0.26227378845214844, |
| "learning_rate": 2.7114433012209403e-05, |
| "loss": 0.1936, |
| "mean_token_accuracy": 0.9173475831747055, |
| "num_tokens": 114973668.0, |
| "step": 5870 |
| }, |
| { |
| "entropy": 0.6575251325964928, |
| "epoch": 0.9503030303030303, |
| "grad_norm": 0.5430612564086914, |
| "learning_rate": 2.7089627030311216e-05, |
| "loss": 0.1863, |
| "mean_token_accuracy": 0.9149983078241348, |
| "num_tokens": 115170150.0, |
| "step": 5880 |
| }, |
| { |
| "entropy": 0.6451747253537178, |
| "epoch": 0.9519191919191919, |
| "grad_norm": 0.9014168381690979, |
| "learning_rate": 2.706479348091227e-05, |
| "loss": 0.1887, |
| "mean_token_accuracy": 0.913795605301857, |
| "num_tokens": 115368295.0, |
| "step": 5890 |
| }, |
| { |
| "entropy": 0.6573713317513465, |
| "epoch": 0.9535353535353536, |
| "grad_norm": 0.31116336584091187, |
| "learning_rate": 2.7039932435402087e-05, |
| "loss": 0.2089, |
| "mean_token_accuracy": 0.9058459445834159, |
| "num_tokens": 115562448.0, |
| "step": 5900 |
| }, |
| { |
| "entropy": 0.674033111333847, |
| "epoch": 0.9551515151515152, |
| "grad_norm": 0.9241631031036377, |
| "learning_rate": 2.7015043965249235e-05, |
| "loss": 0.2043, |
| "mean_token_accuracy": 0.9132018253207207, |
| "num_tokens": 115752918.0, |
| "step": 5910 |
| }, |
| { |
| "entropy": 0.6824505791068077, |
| "epoch": 0.9567676767676768, |
| "grad_norm": 0.5457546710968018, |
| "learning_rate": 2.6990128142001117e-05, |
| "loss": 0.1871, |
| "mean_token_accuracy": 0.9239314749836922, |
| "num_tokens": 115944094.0, |
| "step": 5920 |
| }, |
| { |
| "entropy": 0.6651259288191795, |
| "epoch": 0.9583838383838383, |
| "grad_norm": 0.2708912491798401, |
| "learning_rate": 2.696518503728377e-05, |
| "loss": 0.1831, |
| "mean_token_accuracy": 0.9139492645859718, |
| "num_tokens": 116139742.0, |
| "step": 5930 |
| }, |
| { |
| "entropy": 0.6485146664083004, |
| "epoch": 0.96, |
| "grad_norm": 0.6327102780342102, |
| "learning_rate": 2.694021472280166e-05, |
| "loss": 0.1829, |
| "mean_token_accuracy": 0.9205643236637115, |
| "num_tokens": 116337348.0, |
| "step": 5940 |
| }, |
| { |
| "entropy": 0.6744761653244495, |
| "epoch": 0.9616161616161616, |
| "grad_norm": 0.5170925855636597, |
| "learning_rate": 2.691521727033746e-05, |
| "loss": 0.1849, |
| "mean_token_accuracy": 0.9192163914442062, |
| "num_tokens": 116528104.0, |
| "step": 5950 |
| }, |
| { |
| "entropy": 0.6350755199790001, |
| "epoch": 0.9632323232323232, |
| "grad_norm": 0.329481303691864, |
| "learning_rate": 2.689019275175188e-05, |
| "loss": 0.1839, |
| "mean_token_accuracy": 0.9138085007667541, |
| "num_tokens": 116730380.0, |
| "step": 5960 |
| }, |
| { |
| "entropy": 0.6517668567597866, |
| "epoch": 0.9648484848484848, |
| "grad_norm": 0.3096514940261841, |
| "learning_rate": 2.686514123898342e-05, |
| "loss": 0.2099, |
| "mean_token_accuracy": 0.913357961177826, |
| "num_tokens": 116928370.0, |
| "step": 5970 |
| }, |
| { |
| "entropy": 0.6759242668747902, |
| "epoch": 0.9664646464646465, |
| "grad_norm": 0.21771784126758575, |
| "learning_rate": 2.6840062804048187e-05, |
| "loss": 0.2098, |
| "mean_token_accuracy": 0.9091755867004394, |
| "num_tokens": 117117162.0, |
| "step": 5980 |
| }, |
| { |
| "entropy": 0.6561018228530884, |
| "epoch": 0.9680808080808081, |
| "grad_norm": 0.24175840616226196, |
| "learning_rate": 2.6814957519039685e-05, |
| "loss": 0.2049, |
| "mean_token_accuracy": 0.9110913261771202, |
| "num_tokens": 117315990.0, |
| "step": 5990 |
| }, |
| { |
| "entropy": 0.6435799233615398, |
| "epoch": 0.9696969696969697, |
| "grad_norm": 0.22681868076324463, |
| "learning_rate": 2.678982545612859e-05, |
| "loss": 0.2221, |
| "mean_token_accuracy": 0.897762194275856, |
| "num_tokens": 117517220.0, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.9696969696969697, |
| "eval_entropy": 0.718082972228527, |
| "eval_loss": 0.18191839754581451, |
| "eval_mean_token_accuracy": 0.9208413473963738, |
| "eval_num_tokens": 117517220.0, |
| "eval_runtime": 107.7777, |
| "eval_samples_per_second": 9.278, |
| "eval_steps_per_second": 9.278, |
| "step": 6000 |
| }, |
| { |
| "entropy": 0.6580222062766552, |
| "epoch": 0.9713131313131314, |
| "grad_norm": 0.21151787042617798, |
| "learning_rate": 2.6764666687562574e-05, |
| "loss": 0.2032, |
| "mean_token_accuracy": 0.920425595343113, |
| "num_tokens": 117713683.0, |
| "step": 6010 |
| }, |
| { |
| "entropy": 0.6417085506021977, |
| "epoch": 0.972929292929293, |
| "grad_norm": 0.38823527097702026, |
| "learning_rate": 2.6739481285666075e-05, |
| "loss": 0.1896, |
| "mean_token_accuracy": 0.9096645340323448, |
| "num_tokens": 117915811.0, |
| "step": 6020 |
| }, |
| { |
| "entropy": 0.6489227615296841, |
| "epoch": 0.9745454545454545, |
| "grad_norm": 0.33198609948158264, |
| "learning_rate": 2.671426932284009e-05, |
| "loss": 0.1965, |
| "mean_token_accuracy": 0.913950651884079, |
| "num_tokens": 118112545.0, |
| "step": 6030 |
| }, |
| { |
| "entropy": 0.6406939096748829, |
| "epoch": 0.9761616161616161, |
| "grad_norm": 2.0861010551452637, |
| "learning_rate": 2.668903087156197e-05, |
| "loss": 0.2113, |
| "mean_token_accuracy": 0.9125285148620605, |
| "num_tokens": 118312944.0, |
| "step": 6040 |
| }, |
| { |
| "entropy": 0.6449309058487416, |
| "epoch": 0.9777777777777777, |
| "grad_norm": 0.4663407802581787, |
| "learning_rate": 2.6663766004385226e-05, |
| "loss": 0.2039, |
| "mean_token_accuracy": 0.9149981364607811, |
| "num_tokens": 118511619.0, |
| "step": 6050 |
| }, |
| { |
| "entropy": 0.6443940982222557, |
| "epoch": 0.9793939393939394, |
| "grad_norm": 0.21795040369033813, |
| "learning_rate": 2.6638474793939294e-05, |
| "loss": 0.197, |
| "mean_token_accuracy": 0.9152833878993988, |
| "num_tokens": 118713929.0, |
| "step": 6060 |
| }, |
| { |
| "entropy": 0.6551195085048676, |
| "epoch": 0.981010101010101, |
| "grad_norm": 0.22683417797088623, |
| "learning_rate": 2.661315731292934e-05, |
| "loss": 0.1959, |
| "mean_token_accuracy": 0.9164972469210625, |
| "num_tokens": 118911555.0, |
| "step": 6070 |
| }, |
| { |
| "entropy": 0.6454559281468392, |
| "epoch": 0.9826262626262626, |
| "grad_norm": 0.20692016184329987, |
| "learning_rate": 2.6587813634136063e-05, |
| "loss": 0.1943, |
| "mean_token_accuracy": 0.9109898418188095, |
| "num_tokens": 119114755.0, |
| "step": 6080 |
| }, |
| { |
| "entropy": 0.6683458849787712, |
| "epoch": 0.9842424242424243, |
| "grad_norm": 0.2784108519554138, |
| "learning_rate": 2.656244383041546e-05, |
| "loss": 0.2057, |
| "mean_token_accuracy": 0.9093471348285675, |
| "num_tokens": 119306781.0, |
| "step": 6090 |
| }, |
| { |
| "entropy": 0.685344897210598, |
| "epoch": 0.9858585858585859, |
| "grad_norm": 0.47598379850387573, |
| "learning_rate": 2.6537047974698633e-05, |
| "loss": 0.1765, |
| "mean_token_accuracy": 0.9268097966909409, |
| "num_tokens": 119495832.0, |
| "step": 6100 |
| }, |
| { |
| "entropy": 0.6490506112575531, |
| "epoch": 0.9874747474747475, |
| "grad_norm": 0.33167073130607605, |
| "learning_rate": 2.651162613999158e-05, |
| "loss": 0.1872, |
| "mean_token_accuracy": 0.9159244641661644, |
| "num_tokens": 119693853.0, |
| "step": 6110 |
| }, |
| { |
| "entropy": 0.6624149113893509, |
| "epoch": 0.9890909090909091, |
| "grad_norm": 0.2675837576389313, |
| "learning_rate": 2.6486178399374978e-05, |
| "loss": 0.207, |
| "mean_token_accuracy": 0.91104756295681, |
| "num_tokens": 119888005.0, |
| "step": 6120 |
| }, |
| { |
| "entropy": 0.6676151186227799, |
| "epoch": 0.9907070707070708, |
| "grad_norm": 0.4086419939994812, |
| "learning_rate": 2.6460704826003985e-05, |
| "loss": 0.192, |
| "mean_token_accuracy": 0.9200485736131668, |
| "num_tokens": 120079316.0, |
| "step": 6130 |
| }, |
| { |
| "entropy": 0.6681792497634887, |
| "epoch": 0.9923232323232323, |
| "grad_norm": 0.1883692890405655, |
| "learning_rate": 2.6435205493108e-05, |
| "loss": 0.1946, |
| "mean_token_accuracy": 0.9129543766379357, |
| "num_tokens": 120271231.0, |
| "step": 6140 |
| }, |
| { |
| "entropy": 0.6727742575109005, |
| "epoch": 0.9939393939393939, |
| "grad_norm": 0.8306013941764832, |
| "learning_rate": 2.64096804739905e-05, |
| "loss": 0.1874, |
| "mean_token_accuracy": 0.9178223893046379, |
| "num_tokens": 120461022.0, |
| "step": 6150 |
| }, |
| { |
| "entropy": 0.6303834930062294, |
| "epoch": 0.9955555555555555, |
| "grad_norm": 0.36598384380340576, |
| "learning_rate": 2.638412984202878e-05, |
| "loss": 0.2026, |
| "mean_token_accuracy": 0.9190863534808159, |
| "num_tokens": 120664236.0, |
| "step": 6160 |
| }, |
| { |
| "entropy": 0.6283733353018761, |
| "epoch": 0.9971717171717172, |
| "grad_norm": 0.6193094849586487, |
| "learning_rate": 2.6358553670673776e-05, |
| "loss": 0.2024, |
| "mean_token_accuracy": 0.910734897851944, |
| "num_tokens": 120867537.0, |
| "step": 6170 |
| }, |
| { |
| "entropy": 0.6580278515815735, |
| "epoch": 0.9987878787878788, |
| "grad_norm": 0.3590032160282135, |
| "learning_rate": 2.6332952033449848e-05, |
| "loss": 0.1837, |
| "mean_token_accuracy": 0.916428716480732, |
| "num_tokens": 121061748.0, |
| "step": 6180 |
| }, |
| { |
| "entropy": 0.6319364136771152, |
| "epoch": 1.0003232323232323, |
| "grad_norm": 0.2722172737121582, |
| "learning_rate": 2.630732500395455e-05, |
| "loss": 0.1927, |
| "mean_token_accuracy": 0.9176036417484283, |
| "num_tokens": 121252607.0, |
| "step": 6190 |
| }, |
| { |
| "entropy": 0.6554017476737499, |
| "epoch": 1.001939393939394, |
| "grad_norm": 0.236890971660614, |
| "learning_rate": 2.6281672655858437e-05, |
| "loss": 0.183, |
| "mean_token_accuracy": 0.9223996490240097, |
| "num_tokens": 121447664.0, |
| "step": 6200 |
| }, |
| { |
| "entropy": 0.64825743958354, |
| "epoch": 1.0035555555555555, |
| "grad_norm": 0.281246542930603, |
| "learning_rate": 2.6255995062904855e-05, |
| "loss": 0.2065, |
| "mean_token_accuracy": 0.9148237839341163, |
| "num_tokens": 121643626.0, |
| "step": 6210 |
| }, |
| { |
| "entropy": 0.6417653873562813, |
| "epoch": 1.0051717171717172, |
| "grad_norm": 0.4624505639076233, |
| "learning_rate": 2.6230292298909708e-05, |
| "loss": 0.1869, |
| "mean_token_accuracy": 0.9187545537948608, |
| "num_tokens": 121840163.0, |
| "step": 6220 |
| }, |
| { |
| "entropy": 0.6188324891030789, |
| "epoch": 1.0067878787878788, |
| "grad_norm": 1.9852811098098755, |
| "learning_rate": 2.6204564437761272e-05, |
| "loss": 0.1893, |
| "mean_token_accuracy": 0.9187879309058189, |
| "num_tokens": 122048180.0, |
| "step": 6230 |
| }, |
| { |
| "entropy": 0.6713766917586327, |
| "epoch": 1.0084040404040404, |
| "grad_norm": 0.26395949721336365, |
| "learning_rate": 2.6178811553419968e-05, |
| "loss": 0.1929, |
| "mean_token_accuracy": 0.9171957895159721, |
| "num_tokens": 122237720.0, |
| "step": 6240 |
| }, |
| { |
| "entropy": 0.657070753723383, |
| "epoch": 1.010020202020202, |
| "grad_norm": 0.26647141575813293, |
| "learning_rate": 2.615303371991815e-05, |
| "loss": 0.1835, |
| "mean_token_accuracy": 0.9234165251255035, |
| "num_tokens": 122432536.0, |
| "step": 6250 |
| }, |
| { |
| "entropy": 0.6808216124773026, |
| "epoch": 1.0116363636363637, |
| "grad_norm": 0.25760984420776367, |
| "learning_rate": 2.6127231011359885e-05, |
| "loss": 0.1933, |
| "mean_token_accuracy": 0.9138244792819024, |
| "num_tokens": 122621444.0, |
| "step": 6260 |
| }, |
| { |
| "entropy": 0.6886962063610553, |
| "epoch": 1.0132525252525253, |
| "grad_norm": 1.3057711124420166, |
| "learning_rate": 2.610140350192077e-05, |
| "loss": 0.1942, |
| "mean_token_accuracy": 0.913233257830143, |
| "num_tokens": 122806380.0, |
| "step": 6270 |
| }, |
| { |
| "entropy": 0.6293446570634842, |
| "epoch": 1.014868686868687, |
| "grad_norm": 0.59806227684021, |
| "learning_rate": 2.6075551265847668e-05, |
| "loss": 0.2113, |
| "mean_token_accuracy": 0.9158453986048698, |
| "num_tokens": 123011023.0, |
| "step": 6280 |
| }, |
| { |
| "entropy": 0.6543448261916638, |
| "epoch": 1.0164848484848485, |
| "grad_norm": 0.2948601543903351, |
| "learning_rate": 2.604967437745856e-05, |
| "loss": 0.2066, |
| "mean_token_accuracy": 0.9178432136774063, |
| "num_tokens": 123206708.0, |
| "step": 6290 |
| }, |
| { |
| "entropy": 0.6323048695921898, |
| "epoch": 1.0181010101010102, |
| "grad_norm": 0.3663761019706726, |
| "learning_rate": 2.6023772911142255e-05, |
| "loss": 0.2188, |
| "mean_token_accuracy": 0.9081894502043724, |
| "num_tokens": 123408860.0, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.0181010101010102, |
| "eval_entropy": 0.7149012831747532, |
| "eval_loss": 0.1789740025997162, |
| "eval_mean_token_accuracy": 0.9207139663100242, |
| "eval_num_tokens": 123408860.0, |
| "eval_runtime": 109.427, |
| "eval_samples_per_second": 9.139, |
| "eval_steps_per_second": 9.139, |
| "step": 6300 |
| }, |
| { |
| "entropy": 0.6567400619387627, |
| "epoch": 1.0197171717171718, |
| "grad_norm": 0.2657247483730316, |
| "learning_rate": 2.599784694135826e-05, |
| "loss": 0.1917, |
| "mean_token_accuracy": 0.9149983212351799, |
| "num_tokens": 123604715.0, |
| "step": 6310 |
| }, |
| { |
| "entropy": 0.6590967506170273, |
| "epoch": 1.0213333333333334, |
| "grad_norm": 0.2769843637943268, |
| "learning_rate": 2.597189654263649e-05, |
| "loss": 0.1929, |
| "mean_token_accuracy": 0.9187929838895798, |
| "num_tokens": 123798586.0, |
| "step": 6320 |
| }, |
| { |
| "entropy": 0.6535114169120788, |
| "epoch": 1.022949494949495, |
| "grad_norm": 0.6922852993011475, |
| "learning_rate": 2.5945921789577096e-05, |
| "loss": 0.1965, |
| "mean_token_accuracy": 0.9142646953463555, |
| "num_tokens": 123995581.0, |
| "step": 6330 |
| }, |
| { |
| "entropy": 0.6440669246017933, |
| "epoch": 1.0245656565656565, |
| "grad_norm": 0.25289684534072876, |
| "learning_rate": 2.5919922756850243e-05, |
| "loss": 0.191, |
| "mean_token_accuracy": 0.9152944028377533, |
| "num_tokens": 124193656.0, |
| "step": 6340 |
| }, |
| { |
| "entropy": 0.6575203105807305, |
| "epoch": 1.026181818181818, |
| "grad_norm": 0.3846961557865143, |
| "learning_rate": 2.5893899519195903e-05, |
| "loss": 0.2014, |
| "mean_token_accuracy": 0.914886112511158, |
| "num_tokens": 124386546.0, |
| "step": 6350 |
| }, |
| { |
| "entropy": 0.6285572461783886, |
| "epoch": 1.0277979797979797, |
| "grad_norm": 0.3510042130947113, |
| "learning_rate": 2.58678521514236e-05, |
| "loss": 0.2134, |
| "mean_token_accuracy": 0.9082665666937828, |
| "num_tokens": 124591229.0, |
| "step": 6360 |
| }, |
| { |
| "entropy": 0.693169579654932, |
| "epoch": 1.0294141414141413, |
| "grad_norm": 0.5278009176254272, |
| "learning_rate": 2.5841780728412267e-05, |
| "loss": 0.174, |
| "mean_token_accuracy": 0.9229836270213128, |
| "num_tokens": 124775056.0, |
| "step": 6370 |
| }, |
| { |
| "entropy": 0.645633889734745, |
| "epoch": 1.031030303030303, |
| "grad_norm": 0.3005501627922058, |
| "learning_rate": 2.581568532510995e-05, |
| "loss": 0.189, |
| "mean_token_accuracy": 0.915327088534832, |
| "num_tokens": 124972056.0, |
| "step": 6380 |
| }, |
| { |
| "entropy": 0.6465482771396637, |
| "epoch": 1.0326464646464646, |
| "grad_norm": 0.17836037278175354, |
| "learning_rate": 2.578956601653365e-05, |
| "loss": 0.2005, |
| "mean_token_accuracy": 0.9092096760869026, |
| "num_tokens": 125169814.0, |
| "step": 6390 |
| }, |
| { |
| "entropy": 0.6293865792453289, |
| "epoch": 1.0342626262626262, |
| "grad_norm": 0.35403016209602356, |
| "learning_rate": 2.5763422877769105e-05, |
| "loss": 0.2058, |
| "mean_token_accuracy": 0.9146534383296967, |
| "num_tokens": 125374489.0, |
| "step": 6400 |
| }, |
| { |
| "entropy": 0.6683484628796578, |
| "epoch": 1.0358787878787878, |
| "grad_norm": 0.7300609946250916, |
| "learning_rate": 2.5737255983970523e-05, |
| "loss": 0.1901, |
| "mean_token_accuracy": 0.9273339122533798, |
| "num_tokens": 125564889.0, |
| "step": 6410 |
| }, |
| { |
| "entropy": 0.6528873711824417, |
| "epoch": 1.0374949494949495, |
| "grad_norm": 0.23353512585163116, |
| "learning_rate": 2.571106541036043e-05, |
| "loss": 0.2066, |
| "mean_token_accuracy": 0.9112766414880753, |
| "num_tokens": 125761240.0, |
| "step": 6420 |
| }, |
| { |
| "entropy": 0.6671967059373856, |
| "epoch": 1.039111111111111, |
| "grad_norm": 0.21339759230613708, |
| "learning_rate": 2.56848512322294e-05, |
| "loss": 0.1761, |
| "mean_token_accuracy": 0.9232214778661728, |
| "num_tokens": 125953805.0, |
| "step": 6430 |
| }, |
| { |
| "entropy": 0.6670616880059242, |
| "epoch": 1.0407272727272727, |
| "grad_norm": 0.2723545730113983, |
| "learning_rate": 2.5658613524935897e-05, |
| "loss": 0.1996, |
| "mean_token_accuracy": 0.9158126816153527, |
| "num_tokens": 126146527.0, |
| "step": 6440 |
| }, |
| { |
| "entropy": 0.6801604807376862, |
| "epoch": 1.0423434343434343, |
| "grad_norm": 0.3326534926891327, |
| "learning_rate": 2.5632352363905992e-05, |
| "loss": 0.1744, |
| "mean_token_accuracy": 0.9291215389966965, |
| "num_tokens": 126335687.0, |
| "step": 6450 |
| }, |
| { |
| "entropy": 0.640099360793829, |
| "epoch": 1.043959595959596, |
| "grad_norm": 0.16790306568145752, |
| "learning_rate": 2.5606067824633193e-05, |
| "loss": 0.1937, |
| "mean_token_accuracy": 0.9159406557679176, |
| "num_tokens": 126538790.0, |
| "step": 6460 |
| }, |
| { |
| "entropy": 0.6603272095322609, |
| "epoch": 1.0455757575757576, |
| "grad_norm": 0.22010697424411774, |
| "learning_rate": 2.5579759982678216e-05, |
| "loss": 0.182, |
| "mean_token_accuracy": 0.9223829731345177, |
| "num_tokens": 126731453.0, |
| "step": 6470 |
| }, |
| { |
| "entropy": 0.6681752309203148, |
| "epoch": 1.0471919191919192, |
| "grad_norm": 0.2423912137746811, |
| "learning_rate": 2.555342891366876e-05, |
| "loss": 0.193, |
| "mean_token_accuracy": 0.9237019583582878, |
| "num_tokens": 126924683.0, |
| "step": 6480 |
| }, |
| { |
| "entropy": 0.6576780676841736, |
| "epoch": 1.0488080808080809, |
| "grad_norm": 0.36850905418395996, |
| "learning_rate": 2.5527074693299307e-05, |
| "loss": 0.2105, |
| "mean_token_accuracy": 0.9122251257300377, |
| "num_tokens": 127123350.0, |
| "step": 6490 |
| }, |
| { |
| "entropy": 0.6331621743738651, |
| "epoch": 1.0504242424242425, |
| "grad_norm": 0.15457026660442352, |
| "learning_rate": 2.550069739733087e-05, |
| "loss": 0.2001, |
| "mean_token_accuracy": 0.9060256168246269, |
| "num_tokens": 127327771.0, |
| "step": 6500 |
| }, |
| { |
| "entropy": 0.6503718107938766, |
| "epoch": 1.052040404040404, |
| "grad_norm": 2.223614454269409, |
| "learning_rate": 2.5474297101590827e-05, |
| "loss": 0.2042, |
| "mean_token_accuracy": 0.9112634479999542, |
| "num_tokens": 127527473.0, |
| "step": 6510 |
| }, |
| { |
| "entropy": 0.6804512321949006, |
| "epoch": 1.0536565656565657, |
| "grad_norm": 0.23660320043563843, |
| "learning_rate": 2.5447873881972643e-05, |
| "loss": 0.1898, |
| "mean_token_accuracy": 0.9214923486113549, |
| "num_tokens": 127716869.0, |
| "step": 6520 |
| }, |
| { |
| "entropy": 0.6566047713160514, |
| "epoch": 1.0552727272727274, |
| "grad_norm": 0.26549968123435974, |
| "learning_rate": 2.542142781443571e-05, |
| "loss": 0.1938, |
| "mean_token_accuracy": 0.912337064743042, |
| "num_tokens": 127912264.0, |
| "step": 6530 |
| }, |
| { |
| "entropy": 0.6710340820252896, |
| "epoch": 1.056888888888889, |
| "grad_norm": 3.7274105548858643, |
| "learning_rate": 2.5394958975005075e-05, |
| "loss": 0.2002, |
| "mean_token_accuracy": 0.9160495191812515, |
| "num_tokens": 128104518.0, |
| "step": 6540 |
| }, |
| { |
| "entropy": 0.6394494399428368, |
| "epoch": 1.0585050505050506, |
| "grad_norm": 0.29730379581451416, |
| "learning_rate": 2.536846743977128e-05, |
| "loss": 0.1955, |
| "mean_token_accuracy": 0.9120063051581383, |
| "num_tokens": 128306195.0, |
| "step": 6550 |
| }, |
| { |
| "entropy": 0.6369179256260395, |
| "epoch": 1.060121212121212, |
| "grad_norm": 0.7845156788825989, |
| "learning_rate": 2.5341953284890086e-05, |
| "loss": 0.1922, |
| "mean_token_accuracy": 0.9154770240187645, |
| "num_tokens": 128507191.0, |
| "step": 6560 |
| }, |
| { |
| "entropy": 0.6614749670028687, |
| "epoch": 1.0617373737373736, |
| "grad_norm": 0.5670087337493896, |
| "learning_rate": 2.531541658658229e-05, |
| "loss": 0.1766, |
| "mean_token_accuracy": 0.9220707610249519, |
| "num_tokens": 128699358.0, |
| "step": 6570 |
| }, |
| { |
| "entropy": 0.6655206590890884, |
| "epoch": 1.0633535353535353, |
| "grad_norm": 0.212821364402771, |
| "learning_rate": 2.528885742113349e-05, |
| "loss": 0.1884, |
| "mean_token_accuracy": 0.914807352423668, |
| "num_tokens": 128891949.0, |
| "step": 6580 |
| }, |
| { |
| "entropy": 0.6582912176847457, |
| "epoch": 1.064969696969697, |
| "grad_norm": 0.7031955718994141, |
| "learning_rate": 2.5262275864893865e-05, |
| "loss": 0.2008, |
| "mean_token_accuracy": 0.9075666755437851, |
| "num_tokens": 129087041.0, |
| "step": 6590 |
| }, |
| { |
| "entropy": 0.6500703394412994, |
| "epoch": 1.0665858585858585, |
| "grad_norm": 0.43128007650375366, |
| "learning_rate": 2.5235671994277984e-05, |
| "loss": 0.1903, |
| "mean_token_accuracy": 0.9136385723948479, |
| "num_tokens": 129284757.0, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.0665858585858585, |
| "eval_entropy": 0.7165414272248745, |
| "eval_loss": 0.1776740849018097, |
| "eval_mean_token_accuracy": 0.9204751133918763, |
| "eval_num_tokens": 129284757.0, |
| "eval_runtime": 107.5492, |
| "eval_samples_per_second": 9.298, |
| "eval_steps_per_second": 9.298, |
| "step": 6600 |
| }, |
| { |
| "entropy": 0.6414588749408722, |
| "epoch": 1.0682020202020202, |
| "grad_norm": 0.3216888904571533, |
| "learning_rate": 2.520904588576453e-05, |
| "loss": 0.2099, |
| "mean_token_accuracy": 0.9127737268805504, |
| "num_tokens": 129484015.0, |
| "step": 6610 |
| }, |
| { |
| "entropy": 0.6716010585427284, |
| "epoch": 1.0698181818181818, |
| "grad_norm": 2.989790439605713, |
| "learning_rate": 2.518239761589615e-05, |
| "loss": 0.1904, |
| "mean_token_accuracy": 0.9218545705080032, |
| "num_tokens": 129677753.0, |
| "step": 6620 |
| }, |
| { |
| "entropy": 0.6624714002013207, |
| "epoch": 1.0714343434343434, |
| "grad_norm": 1.18606436252594, |
| "learning_rate": 2.515572726127917e-05, |
| "loss": 0.1784, |
| "mean_token_accuracy": 0.9170515671372413, |
| "num_tokens": 129874528.0, |
| "step": 6630 |
| }, |
| { |
| "entropy": 0.6804944217205048, |
| "epoch": 1.073050505050505, |
| "grad_norm": 0.28340503573417664, |
| "learning_rate": 2.512903489858342e-05, |
| "loss": 0.1812, |
| "mean_token_accuracy": 0.9200245469808579, |
| "num_tokens": 130064215.0, |
| "step": 6640 |
| }, |
| { |
| "entropy": 0.6577471405267715, |
| "epoch": 1.0746666666666667, |
| "grad_norm": 1.1137200593948364, |
| "learning_rate": 2.510232060454199e-05, |
| "loss": 0.185, |
| "mean_token_accuracy": 0.9237975597381591, |
| "num_tokens": 130258589.0, |
| "step": 6650 |
| }, |
| { |
| "entropy": 0.653445017337799, |
| "epoch": 1.0762828282828283, |
| "grad_norm": 0.24963843822479248, |
| "learning_rate": 2.5075584455951016e-05, |
| "loss": 0.2006, |
| "mean_token_accuracy": 0.9116239801049233, |
| "num_tokens": 130455050.0, |
| "step": 6660 |
| }, |
| { |
| "entropy": 0.6773878857493401, |
| "epoch": 1.07789898989899, |
| "grad_norm": 0.2457326203584671, |
| "learning_rate": 2.5048826529669466e-05, |
| "loss": 0.1698, |
| "mean_token_accuracy": 0.9252761200070381, |
| "num_tokens": 130643260.0, |
| "step": 6670 |
| }, |
| { |
| "entropy": 0.6505807377398014, |
| "epoch": 1.0795151515151515, |
| "grad_norm": 0.30657973885536194, |
| "learning_rate": 2.5022046902618903e-05, |
| "loss": 0.1933, |
| "mean_token_accuracy": 0.917384472489357, |
| "num_tokens": 130841384.0, |
| "step": 6680 |
| }, |
| { |
| "entropy": 0.6616685807704925, |
| "epoch": 1.0811313131313132, |
| "grad_norm": 0.25688493251800537, |
| "learning_rate": 2.499524565178328e-05, |
| "loss": 0.1969, |
| "mean_token_accuracy": 0.922076341509819, |
| "num_tokens": 131039846.0, |
| "step": 6690 |
| }, |
| { |
| "entropy": 0.6403779342770577, |
| "epoch": 1.0827474747474748, |
| "grad_norm": 0.24423788487911224, |
| "learning_rate": 2.4968422854208715e-05, |
| "loss": 0.204, |
| "mean_token_accuracy": 0.905488808453083, |
| "num_tokens": 131240325.0, |
| "step": 6700 |
| }, |
| { |
| "entropy": 0.6553829908370972, |
| "epoch": 1.0843636363636364, |
| "grad_norm": 0.2003633677959442, |
| "learning_rate": 2.4941578587003267e-05, |
| "loss": 0.1927, |
| "mean_token_accuracy": 0.9127710714936257, |
| "num_tokens": 131436899.0, |
| "step": 6710 |
| }, |
| { |
| "entropy": 0.7037200927734375, |
| "epoch": 1.085979797979798, |
| "grad_norm": 0.2803226113319397, |
| "learning_rate": 2.4914712927336702e-05, |
| "loss": 0.1845, |
| "mean_token_accuracy": 0.9239676401019097, |
| "num_tokens": 131621236.0, |
| "step": 6720 |
| }, |
| { |
| "entropy": 0.6559601046144963, |
| "epoch": 1.0875959595959597, |
| "grad_norm": 0.24596357345581055, |
| "learning_rate": 2.4887825952440296e-05, |
| "loss": 0.1838, |
| "mean_token_accuracy": 0.9203213959932327, |
| "num_tokens": 131816568.0, |
| "step": 6730 |
| }, |
| { |
| "entropy": 0.6305412597954273, |
| "epoch": 1.0892121212121213, |
| "grad_norm": 0.49728894233703613, |
| "learning_rate": 2.4860917739606592e-05, |
| "loss": 0.2032, |
| "mean_token_accuracy": 0.9125052660703659, |
| "num_tokens": 132022157.0, |
| "step": 6740 |
| }, |
| { |
| "entropy": 0.6523553773760795, |
| "epoch": 1.090828282828283, |
| "grad_norm": 0.33660855889320374, |
| "learning_rate": 2.4833988366189203e-05, |
| "loss": 0.193, |
| "mean_token_accuracy": 0.9162264108657837, |
| "num_tokens": 132219642.0, |
| "step": 6750 |
| }, |
| { |
| "entropy": 0.6663720726966857, |
| "epoch": 1.0924444444444443, |
| "grad_norm": 0.28930526971817017, |
| "learning_rate": 2.4807037909602542e-05, |
| "loss": 0.1978, |
| "mean_token_accuracy": 0.9147111624479294, |
| "num_tokens": 132412588.0, |
| "step": 6760 |
| }, |
| { |
| "entropy": 0.6918739549815655, |
| "epoch": 1.094060606060606, |
| "grad_norm": 0.6057730317115784, |
| "learning_rate": 2.478006644732166e-05, |
| "loss": 0.1862, |
| "mean_token_accuracy": 0.9180864945054055, |
| "num_tokens": 132599244.0, |
| "step": 6770 |
| }, |
| { |
| "entropy": 0.6754172123968601, |
| "epoch": 1.0956767676767676, |
| "grad_norm": 0.20356015861034393, |
| "learning_rate": 2.475307405688199e-05, |
| "loss": 0.1993, |
| "mean_token_accuracy": 0.9170787900686264, |
| "num_tokens": 132789658.0, |
| "step": 6780 |
| }, |
| { |
| "entropy": 0.675231696665287, |
| "epoch": 1.0972929292929292, |
| "grad_norm": 0.31911930441856384, |
| "learning_rate": 2.47260608158791e-05, |
| "loss": 0.1791, |
| "mean_token_accuracy": 0.9208205997943878, |
| "num_tokens": 132980503.0, |
| "step": 6790 |
| }, |
| { |
| "entropy": 0.6837145708501339, |
| "epoch": 1.0989090909090908, |
| "grad_norm": 0.1739879846572876, |
| "learning_rate": 2.469902680196853e-05, |
| "loss": 0.1886, |
| "mean_token_accuracy": 0.9194065168499946, |
| "num_tokens": 133167839.0, |
| "step": 6800 |
| }, |
| { |
| "entropy": 0.6514686979353428, |
| "epoch": 1.1005252525252525, |
| "grad_norm": 0.2011003941297531, |
| "learning_rate": 2.4671972092865518e-05, |
| "loss": 0.1849, |
| "mean_token_accuracy": 0.9203078612685204, |
| "num_tokens": 133366242.0, |
| "step": 6810 |
| }, |
| { |
| "entropy": 0.652017392218113, |
| "epoch": 1.102141414141414, |
| "grad_norm": 1.491286039352417, |
| "learning_rate": 2.4644896766344803e-05, |
| "loss": 0.1895, |
| "mean_token_accuracy": 0.9165545701980591, |
| "num_tokens": 133565786.0, |
| "step": 6820 |
| }, |
| { |
| "entropy": 0.6563273631036282, |
| "epoch": 1.1037575757575757, |
| "grad_norm": 0.20642946660518646, |
| "learning_rate": 2.461780090024039e-05, |
| "loss": 0.2086, |
| "mean_token_accuracy": 0.9042300581932068, |
| "num_tokens": 133762796.0, |
| "step": 6830 |
| }, |
| { |
| "entropy": 0.6549685873091221, |
| "epoch": 1.1053737373737373, |
| "grad_norm": 0.34660500288009644, |
| "learning_rate": 2.459068457244533e-05, |
| "loss": 0.209, |
| "mean_token_accuracy": 0.9060993060469628, |
| "num_tokens": 133958518.0, |
| "step": 6840 |
| }, |
| { |
| "entropy": 0.692388217151165, |
| "epoch": 1.106989898989899, |
| "grad_norm": 0.1916242241859436, |
| "learning_rate": 2.4563547860911495e-05, |
| "loss": 0.192, |
| "mean_token_accuracy": 0.9103460937738419, |
| "num_tokens": 134142419.0, |
| "step": 6850 |
| }, |
| { |
| "entropy": 0.6678758606314659, |
| "epoch": 1.1086060606060606, |
| "grad_norm": 0.26181909441947937, |
| "learning_rate": 2.453639084364936e-05, |
| "loss": 0.2012, |
| "mean_token_accuracy": 0.9134058818221092, |
| "num_tokens": 134334970.0, |
| "step": 6860 |
| }, |
| { |
| "entropy": 0.6088161051273346, |
| "epoch": 1.1102222222222222, |
| "grad_norm": 0.28219732642173767, |
| "learning_rate": 2.450921359872776e-05, |
| "loss": 0.2057, |
| "mean_token_accuracy": 0.9087446928024292, |
| "num_tokens": 134548254.0, |
| "step": 6870 |
| }, |
| { |
| "entropy": 0.6716838963329792, |
| "epoch": 1.1118383838383838, |
| "grad_norm": 0.3689417243003845, |
| "learning_rate": 2.44820162042737e-05, |
| "loss": 0.1966, |
| "mean_token_accuracy": 0.9242041930556297, |
| "num_tokens": 134738981.0, |
| "step": 6880 |
| }, |
| { |
| "entropy": 0.6553657658398151, |
| "epoch": 1.1134545454545455, |
| "grad_norm": 0.25713449716567993, |
| "learning_rate": 2.4454798738472085e-05, |
| "loss": 0.1819, |
| "mean_token_accuracy": 0.9258597359061241, |
| "num_tokens": 134935804.0, |
| "step": 6890 |
| }, |
| { |
| "entropy": 0.639024917781353, |
| "epoch": 1.115070707070707, |
| "grad_norm": 0.22210142016410828, |
| "learning_rate": 2.4427561279565535e-05, |
| "loss": 0.1986, |
| "mean_token_accuracy": 0.9139626592397689, |
| "num_tokens": 135136821.0, |
| "step": 6900 |
| }, |
| { |
| "epoch": 1.115070707070707, |
| "eval_entropy": 0.7123623175621032, |
| "eval_loss": 0.17761555314064026, |
| "eval_mean_token_accuracy": 0.9209292259812355, |
| "eval_num_tokens": 135136821.0, |
| "eval_runtime": 107.7787, |
| "eval_samples_per_second": 9.278, |
| "eval_steps_per_second": 9.278, |
| "step": 6900 |
| }, |
| { |
| "entropy": 0.6710393786430359, |
| "epoch": 1.1166868686868687, |
| "grad_norm": 0.24489854276180267, |
| "learning_rate": 2.4400303905854143e-05, |
| "loss": 0.1739, |
| "mean_token_accuracy": 0.9210480153560638, |
| "num_tokens": 135327176.0, |
| "step": 6910 |
| }, |
| { |
| "entropy": 0.6592464700341225, |
| "epoch": 1.1183030303030304, |
| "grad_norm": 0.2485123574733734, |
| "learning_rate": 2.437302669569525e-05, |
| "loss": 0.2069, |
| "mean_token_accuracy": 0.9071068927645684, |
| "num_tokens": 135522817.0, |
| "step": 6920 |
| }, |
| { |
| "entropy": 0.6490606270730496, |
| "epoch": 1.119919191919192, |
| "grad_norm": 0.2856655418872833, |
| "learning_rate": 2.434572972750322e-05, |
| "loss": 0.201, |
| "mean_token_accuracy": 0.9150787323713303, |
| "num_tokens": 135720129.0, |
| "step": 6930 |
| }, |
| { |
| "entropy": 0.6843857243657112, |
| "epoch": 1.1215353535353536, |
| "grad_norm": 0.2778134346008301, |
| "learning_rate": 2.4318413079749212e-05, |
| "loss": 0.1906, |
| "mean_token_accuracy": 0.9118962764739991, |
| "num_tokens": 135908000.0, |
| "step": 6940 |
| }, |
| { |
| "entropy": 0.6615097917616367, |
| "epoch": 1.1231515151515152, |
| "grad_norm": 0.2848396897315979, |
| "learning_rate": 2.429107683096097e-05, |
| "loss": 0.1981, |
| "mean_token_accuracy": 0.9107768446207046, |
| "num_tokens": 136099920.0, |
| "step": 6950 |
| }, |
| { |
| "entropy": 0.6721059113740921, |
| "epoch": 1.1247676767676769, |
| "grad_norm": 0.2825098931789398, |
| "learning_rate": 2.426372105972258e-05, |
| "loss": 0.1896, |
| "mean_token_accuracy": 0.9258805438876152, |
| "num_tokens": 136291929.0, |
| "step": 6960 |
| }, |
| { |
| "entropy": 0.6725196681916714, |
| "epoch": 1.1263838383838385, |
| "grad_norm": 0.22564974427223206, |
| "learning_rate": 2.4236345844674235e-05, |
| "loss": 0.1981, |
| "mean_token_accuracy": 0.9194081515073776, |
| "num_tokens": 136486610.0, |
| "step": 6970 |
| }, |
| { |
| "entropy": 0.655806089937687, |
| "epoch": 1.1280000000000001, |
| "grad_norm": 0.19149194657802582, |
| "learning_rate": 2.4208951264512046e-05, |
| "loss": 0.1996, |
| "mean_token_accuracy": 0.9169903472065926, |
| "num_tokens": 136683848.0, |
| "step": 6980 |
| }, |
| { |
| "entropy": 0.6414538450539112, |
| "epoch": 1.1296161616161617, |
| "grad_norm": 0.21564777195453644, |
| "learning_rate": 2.4181537397987785e-05, |
| "loss": 0.1901, |
| "mean_token_accuracy": 0.9169015690684319, |
| "num_tokens": 136886358.0, |
| "step": 6990 |
| }, |
| { |
| "entropy": 0.6110823594033719, |
| "epoch": 1.1312323232323231, |
| "grad_norm": 0.3679966628551483, |
| "learning_rate": 2.415410432390866e-05, |
| "loss": 0.2059, |
| "mean_token_accuracy": 0.9113464534282685, |
| "num_tokens": 137095694.0, |
| "step": 7000 |
| }, |
| { |
| "entropy": 0.6325623497366906, |
| "epoch": 1.1328484848484848, |
| "grad_norm": 0.2635299563407898, |
| "learning_rate": 2.41266521211371e-05, |
| "loss": 0.198, |
| "mean_token_accuracy": 0.9193664342164993, |
| "num_tokens": 137299226.0, |
| "step": 7010 |
| }, |
| { |
| "entropy": 0.647395196557045, |
| "epoch": 1.1344646464646464, |
| "grad_norm": 0.3164098262786865, |
| "learning_rate": 2.409918086859054e-05, |
| "loss": 0.1898, |
| "mean_token_accuracy": 0.9130165934562683, |
| "num_tokens": 137496269.0, |
| "step": 7020 |
| }, |
| { |
| "entropy": 0.6482592269778251, |
| "epoch": 1.136080808080808, |
| "grad_norm": 0.25383561849594116, |
| "learning_rate": 2.4071690645241142e-05, |
| "loss": 0.1946, |
| "mean_token_accuracy": 0.910372956097126, |
| "num_tokens": 137695852.0, |
| "step": 7030 |
| }, |
| { |
| "entropy": 0.6379775069653988, |
| "epoch": 1.1376969696969697, |
| "grad_norm": 0.31261879205703735, |
| "learning_rate": 2.404418153011564e-05, |
| "loss": 0.1975, |
| "mean_token_accuracy": 0.9110258087515831, |
| "num_tokens": 137897409.0, |
| "step": 7040 |
| }, |
| { |
| "entropy": 0.6479882016777992, |
| "epoch": 1.1393131313131313, |
| "grad_norm": 0.1875302940607071, |
| "learning_rate": 2.401665360229504e-05, |
| "loss": 0.1932, |
| "mean_token_accuracy": 0.9154945403337479, |
| "num_tokens": 138093818.0, |
| "step": 7050 |
| }, |
| { |
| "entropy": 0.6449267700314522, |
| "epoch": 1.140929292929293, |
| "grad_norm": 0.19978782534599304, |
| "learning_rate": 2.398910694091448e-05, |
| "loss": 0.1761, |
| "mean_token_accuracy": 0.91689632833004, |
| "num_tokens": 138293505.0, |
| "step": 7060 |
| }, |
| { |
| "entropy": 0.632693299651146, |
| "epoch": 1.1425454545454545, |
| "grad_norm": 0.22115358710289001, |
| "learning_rate": 2.3961541625162895e-05, |
| "loss": 0.1771, |
| "mean_token_accuracy": 0.9192465275526047, |
| "num_tokens": 138494339.0, |
| "step": 7070 |
| }, |
| { |
| "entropy": 0.6531041838228703, |
| "epoch": 1.1441616161616162, |
| "grad_norm": 0.18521378934383392, |
| "learning_rate": 2.393395773428289e-05, |
| "loss": 0.1976, |
| "mean_token_accuracy": 0.9157662749290466, |
| "num_tokens": 138690939.0, |
| "step": 7080 |
| }, |
| { |
| "entropy": 0.6616611868143082, |
| "epoch": 1.1457777777777778, |
| "grad_norm": 0.5136955976486206, |
| "learning_rate": 2.3906355347570438e-05, |
| "loss": 0.2008, |
| "mean_token_accuracy": 0.9052021831274033, |
| "num_tokens": 138886124.0, |
| "step": 7090 |
| }, |
| { |
| "entropy": 0.6661837339401245, |
| "epoch": 1.1473939393939394, |
| "grad_norm": 0.18936645984649658, |
| "learning_rate": 2.3878734544374708e-05, |
| "loss": 0.1767, |
| "mean_token_accuracy": 0.919096226990223, |
| "num_tokens": 139080717.0, |
| "step": 7100 |
| }, |
| { |
| "entropy": 0.6578754298388958, |
| "epoch": 1.149010101010101, |
| "grad_norm": 0.2449623942375183, |
| "learning_rate": 2.3851095404097795e-05, |
| "loss": 0.1888, |
| "mean_token_accuracy": 0.9126079857349396, |
| "num_tokens": 139275344.0, |
| "step": 7110 |
| }, |
| { |
| "entropy": 0.6601323708891869, |
| "epoch": 1.1506262626262627, |
| "grad_norm": 0.23257870972156525, |
| "learning_rate": 2.3823438006194505e-05, |
| "loss": 0.1995, |
| "mean_token_accuracy": 0.9094919070601464, |
| "num_tokens": 139467719.0, |
| "step": 7120 |
| }, |
| { |
| "entropy": 0.664834751188755, |
| "epoch": 1.1522424242424243, |
| "grad_norm": 0.24553577601909637, |
| "learning_rate": 2.3795762430172146e-05, |
| "loss": 0.2118, |
| "mean_token_accuracy": 0.9074599102139473, |
| "num_tokens": 139659844.0, |
| "step": 7130 |
| }, |
| { |
| "entropy": 0.6728641912341118, |
| "epoch": 1.153858585858586, |
| "grad_norm": 0.19290964305400848, |
| "learning_rate": 2.3768068755590265e-05, |
| "loss": 0.195, |
| "mean_token_accuracy": 0.9208357721567154, |
| "num_tokens": 139849410.0, |
| "step": 7140 |
| }, |
| { |
| "entropy": 0.6649326175451279, |
| "epoch": 1.1554747474747475, |
| "grad_norm": 0.3070566952228546, |
| "learning_rate": 2.374035706206045e-05, |
| "loss": 0.199, |
| "mean_token_accuracy": 0.9067992687225341, |
| "num_tokens": 140042165.0, |
| "step": 7150 |
| }, |
| { |
| "entropy": 0.6905647456645966, |
| "epoch": 1.1570909090909092, |
| "grad_norm": 0.18495376408100128, |
| "learning_rate": 2.3712627429246083e-05, |
| "loss": 0.1788, |
| "mean_token_accuracy": 0.9231453701853752, |
| "num_tokens": 140227677.0, |
| "step": 7160 |
| }, |
| { |
| "entropy": 0.647196751832962, |
| "epoch": 1.1587070707070708, |
| "grad_norm": 0.19866421818733215, |
| "learning_rate": 2.368487993686212e-05, |
| "loss": 0.1929, |
| "mean_token_accuracy": 0.9224782362580299, |
| "num_tokens": 140424940.0, |
| "step": 7170 |
| }, |
| { |
| "entropy": 0.6779814459383487, |
| "epoch": 1.1603232323232322, |
| "grad_norm": 0.3933802843093872, |
| "learning_rate": 2.3657114664674854e-05, |
| "loss": 0.1819, |
| "mean_token_accuracy": 0.9179326862096786, |
| "num_tokens": 140615273.0, |
| "step": 7180 |
| }, |
| { |
| "entropy": 0.6495263047516346, |
| "epoch": 1.1619393939393938, |
| "grad_norm": 0.3412734568119049, |
| "learning_rate": 2.3629331692501692e-05, |
| "loss": 0.1896, |
| "mean_token_accuracy": 0.914878611266613, |
| "num_tokens": 140811787.0, |
| "step": 7190 |
| }, |
| { |
| "entropy": 0.6819436699151993, |
| "epoch": 1.1635555555555555, |
| "grad_norm": 0.2559371888637543, |
| "learning_rate": 2.360153110021092e-05, |
| "loss": 0.1862, |
| "mean_token_accuracy": 0.9186094254255295, |
| "num_tokens": 140998555.0, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.1635555555555555, |
| "eval_entropy": 0.7138508333265782, |
| "eval_loss": 0.17618873715400696, |
| "eval_mean_token_accuracy": 0.9209619098305702, |
| "eval_num_tokens": 140998555.0, |
| "eval_runtime": 108.408, |
| "eval_samples_per_second": 9.224, |
| "eval_steps_per_second": 9.224, |
| "step": 7200 |
| }, |
| { |
| "entropy": 0.6654595993459225, |
| "epoch": 1.165171717171717, |
| "grad_norm": 0.6554359793663025, |
| "learning_rate": 2.357371296772149e-05, |
| "loss": 0.1827, |
| "mean_token_accuracy": 0.9218516707420349, |
| "num_tokens": 141192711.0, |
| "step": 7210 |
| }, |
| { |
| "entropy": 0.6439177483320236, |
| "epoch": 1.1667878787878787, |
| "grad_norm": 0.28542885184288025, |
| "learning_rate": 2.3545877375002758e-05, |
| "loss": 0.2051, |
| "mean_token_accuracy": 0.9158839777112007, |
| "num_tokens": 141392304.0, |
| "step": 7220 |
| }, |
| { |
| "entropy": 0.6547999024391175, |
| "epoch": 1.1684040404040403, |
| "grad_norm": 0.29079747200012207, |
| "learning_rate": 2.3518024402074285e-05, |
| "loss": 0.1878, |
| "mean_token_accuracy": 0.9213658019900322, |
| "num_tokens": 141585982.0, |
| "step": 7230 |
| }, |
| { |
| "entropy": 0.631399916857481, |
| "epoch": 1.170020202020202, |
| "grad_norm": 0.19328245520591736, |
| "learning_rate": 2.349015412900559e-05, |
| "loss": 0.2094, |
| "mean_token_accuracy": 0.9080318629741668, |
| "num_tokens": 141784861.0, |
| "step": 7240 |
| }, |
| { |
| "entropy": 0.6290282666683197, |
| "epoch": 1.1716363636363636, |
| "grad_norm": 0.24472235143184662, |
| "learning_rate": 2.3462266635915932e-05, |
| "loss": 0.1855, |
| "mean_token_accuracy": 0.9161780208349228, |
| "num_tokens": 141987990.0, |
| "step": 7250 |
| }, |
| { |
| "entropy": 0.6455445989966393, |
| "epoch": 1.1732525252525252, |
| "grad_norm": 0.22559289634227753, |
| "learning_rate": 2.3434362002974078e-05, |
| "loss": 0.2115, |
| "mean_token_accuracy": 0.9121286839246749, |
| "num_tokens": 142185404.0, |
| "step": 7260 |
| }, |
| { |
| "entropy": 0.6576178468763828, |
| "epoch": 1.1748686868686868, |
| "grad_norm": 0.18964362144470215, |
| "learning_rate": 2.340644031039804e-05, |
| "loss": 0.1778, |
| "mean_token_accuracy": 0.928297358751297, |
| "num_tokens": 142379048.0, |
| "step": 7270 |
| }, |
| { |
| "entropy": 0.6815750733017921, |
| "epoch": 1.1764848484848485, |
| "grad_norm": 0.3642102777957916, |
| "learning_rate": 2.3378501638454905e-05, |
| "loss": 0.1947, |
| "mean_token_accuracy": 0.9149638995528221, |
| "num_tokens": 142564772.0, |
| "step": 7280 |
| }, |
| { |
| "entropy": 0.6569184005260468, |
| "epoch": 1.17810101010101, |
| "grad_norm": 0.25654488801956177, |
| "learning_rate": 2.3350546067460542e-05, |
| "loss": 0.1929, |
| "mean_token_accuracy": 0.9159114107489585, |
| "num_tokens": 142758169.0, |
| "step": 7290 |
| }, |
| { |
| "entropy": 0.6607706993818283, |
| "epoch": 1.1797171717171717, |
| "grad_norm": 0.1952294409275055, |
| "learning_rate": 2.332257367777943e-05, |
| "loss": 0.1793, |
| "mean_token_accuracy": 0.922367176413536, |
| "num_tokens": 142949852.0, |
| "step": 7300 |
| }, |
| { |
| "entropy": 0.6690223574638366, |
| "epoch": 1.1813333333333333, |
| "grad_norm": 0.22762024402618408, |
| "learning_rate": 2.3294584549824365e-05, |
| "loss": 0.2005, |
| "mean_token_accuracy": 0.9172046884894371, |
| "num_tokens": 143137499.0, |
| "step": 7310 |
| }, |
| { |
| "entropy": 0.6326002225279808, |
| "epoch": 1.182949494949495, |
| "grad_norm": 0.3268709182739258, |
| "learning_rate": 2.3266578764056283e-05, |
| "loss": 0.1946, |
| "mean_token_accuracy": 0.9179033607244491, |
| "num_tokens": 143337556.0, |
| "step": 7320 |
| }, |
| { |
| "entropy": 0.6689287826418877, |
| "epoch": 1.1845656565656566, |
| "grad_norm": 0.5372002720832825, |
| "learning_rate": 2.3238556400984002e-05, |
| "loss": 0.1666, |
| "mean_token_accuracy": 0.928965862095356, |
| "num_tokens": 143525422.0, |
| "step": 7330 |
| }, |
| { |
| "entropy": 0.6906199634075165, |
| "epoch": 1.1861818181818182, |
| "grad_norm": 0.19594059884548187, |
| "learning_rate": 2.321051754116399e-05, |
| "loss": 0.177, |
| "mean_token_accuracy": 0.9256522223353386, |
| "num_tokens": 143710200.0, |
| "step": 7340 |
| }, |
| { |
| "entropy": 0.6540770314633846, |
| "epoch": 1.1877979797979799, |
| "grad_norm": 0.3353155255317688, |
| "learning_rate": 2.318246226520015e-05, |
| "loss": 0.2016, |
| "mean_token_accuracy": 0.908722198009491, |
| "num_tokens": 143904760.0, |
| "step": 7350 |
| }, |
| { |
| "entropy": 0.6404020361602306, |
| "epoch": 1.1894141414141415, |
| "grad_norm": 0.2372790426015854, |
| "learning_rate": 2.3154390653743558e-05, |
| "loss": 0.1851, |
| "mean_token_accuracy": 0.9156206473708153, |
| "num_tokens": 144104075.0, |
| "step": 7360 |
| }, |
| { |
| "entropy": 0.6292147316038609, |
| "epoch": 1.191030303030303, |
| "grad_norm": 0.2508028745651245, |
| "learning_rate": 2.3126302787492272e-05, |
| "loss": 0.205, |
| "mean_token_accuracy": 0.9077795252203942, |
| "num_tokens": 144306681.0, |
| "step": 7370 |
| }, |
| { |
| "entropy": 0.6434667043387889, |
| "epoch": 1.1926464646464647, |
| "grad_norm": 0.25540539622306824, |
| "learning_rate": 2.3098198747191053e-05, |
| "loss": 0.2227, |
| "mean_token_accuracy": 0.9046233102679253, |
| "num_tokens": 144505803.0, |
| "step": 7380 |
| }, |
| { |
| "entropy": 0.6272190093994141, |
| "epoch": 1.1942626262626264, |
| "grad_norm": 0.20317929983139038, |
| "learning_rate": 2.307007861363118e-05, |
| "loss": 0.2064, |
| "mean_token_accuracy": 0.9116634547710418, |
| "num_tokens": 144707799.0, |
| "step": 7390 |
| }, |
| { |
| "entropy": 0.6925327144563198, |
| "epoch": 1.195878787878788, |
| "grad_norm": 0.24650558829307556, |
| "learning_rate": 2.3041942467650182e-05, |
| "loss": 0.1892, |
| "mean_token_accuracy": 0.9244462102651596, |
| "num_tokens": 144891128.0, |
| "step": 7400 |
| }, |
| { |
| "entropy": 0.6295531697571277, |
| "epoch": 1.1974949494949496, |
| "grad_norm": 0.2404937744140625, |
| "learning_rate": 2.3013790390131635e-05, |
| "loss": 0.1909, |
| "mean_token_accuracy": 0.9272438645362854, |
| "num_tokens": 145093402.0, |
| "step": 7410 |
| }, |
| { |
| "entropy": 0.6406836315989495, |
| "epoch": 1.199111111111111, |
| "grad_norm": 0.2630082964897156, |
| "learning_rate": 2.2985622462004894e-05, |
| "loss": 0.2024, |
| "mean_token_accuracy": 0.9157192766666412, |
| "num_tokens": 145292219.0, |
| "step": 7420 |
| }, |
| { |
| "entropy": 0.6289153560996056, |
| "epoch": 1.2007272727272726, |
| "grad_norm": 0.3908150792121887, |
| "learning_rate": 2.2957438764244894e-05, |
| "loss": 0.1821, |
| "mean_token_accuracy": 0.916665130853653, |
| "num_tokens": 145492163.0, |
| "step": 7430 |
| }, |
| { |
| "entropy": 0.6399331271648407, |
| "epoch": 1.2023434343434343, |
| "grad_norm": 1.9838895797729492, |
| "learning_rate": 2.292923937787189e-05, |
| "loss": 0.194, |
| "mean_token_accuracy": 0.9150435760617256, |
| "num_tokens": 145690169.0, |
| "step": 7440 |
| }, |
| { |
| "entropy": 0.6503728218376637, |
| "epoch": 1.203959595959596, |
| "grad_norm": 0.4047127366065979, |
| "learning_rate": 2.2901024383951265e-05, |
| "loss": 0.2091, |
| "mean_token_accuracy": 0.9137666761875153, |
| "num_tokens": 145884192.0, |
| "step": 7450 |
| }, |
| { |
| "entropy": 0.6296872481703758, |
| "epoch": 1.2055757575757575, |
| "grad_norm": 0.7709885239601135, |
| "learning_rate": 2.287279386359323e-05, |
| "loss": 0.1875, |
| "mean_token_accuracy": 0.9288541987538338, |
| "num_tokens": 146084734.0, |
| "step": 7460 |
| }, |
| { |
| "entropy": 0.6353645235300064, |
| "epoch": 1.2071919191919191, |
| "grad_norm": 0.3050624132156372, |
| "learning_rate": 2.2844547897952655e-05, |
| "loss": 0.1885, |
| "mean_token_accuracy": 0.9098729580640793, |
| "num_tokens": 146284306.0, |
| "step": 7470 |
| }, |
| { |
| "entropy": 0.6429247371852398, |
| "epoch": 1.2088080808080808, |
| "grad_norm": 0.5625445246696472, |
| "learning_rate": 2.2816286568228812e-05, |
| "loss": 0.1978, |
| "mean_token_accuracy": 0.914405246078968, |
| "num_tokens": 146482486.0, |
| "step": 7480 |
| }, |
| { |
| "entropy": 0.6462002150714398, |
| "epoch": 1.2104242424242424, |
| "grad_norm": 0.5488247871398926, |
| "learning_rate": 2.2788009955665133e-05, |
| "loss": 0.2028, |
| "mean_token_accuracy": 0.9175966009497643, |
| "num_tokens": 146678318.0, |
| "step": 7490 |
| }, |
| { |
| "entropy": 0.6897264942526817, |
| "epoch": 1.212040404040404, |
| "grad_norm": 3.5190200805664062, |
| "learning_rate": 2.2759718141548985e-05, |
| "loss": 0.2213, |
| "mean_token_accuracy": 0.9151095658540725, |
| "num_tokens": 146861526.0, |
| "step": 7500 |
| }, |
| { |
| "epoch": 1.212040404040404, |
| "eval_entropy": 0.7134856638014316, |
| "eval_loss": 0.1802089363336563, |
| "eval_mean_token_accuracy": 0.9210226243138313, |
| "eval_num_tokens": 146861526.0, |
| "eval_runtime": 107.8276, |
| "eval_samples_per_second": 9.274, |
| "eval_steps_per_second": 9.274, |
| "step": 7500 |
| }, |
| { |
| "entropy": 0.6661449395120144, |
| "epoch": 1.2136565656565657, |
| "grad_norm": 0.746952474117279, |
| "learning_rate": 2.2731411207211436e-05, |
| "loss": 0.1967, |
| "mean_token_accuracy": 0.9125927239656448, |
| "num_tokens": 147053824.0, |
| "step": 7510 |
| }, |
| { |
| "entropy": 0.6528176456689835, |
| "epoch": 1.2152727272727273, |
| "grad_norm": 0.24343125522136688, |
| "learning_rate": 2.270308923402702e-05, |
| "loss": 0.1841, |
| "mean_token_accuracy": 0.9207581833004952, |
| "num_tokens": 147250379.0, |
| "step": 7520 |
| }, |
| { |
| "entropy": 0.6335774019360543, |
| "epoch": 1.216888888888889, |
| "grad_norm": 0.19108474254608154, |
| "learning_rate": 2.26747523034135e-05, |
| "loss": 0.2007, |
| "mean_token_accuracy": 0.9169000715017319, |
| "num_tokens": 147452752.0, |
| "step": 7530 |
| }, |
| { |
| "entropy": 0.6925512567162514, |
| "epoch": 1.2185050505050505, |
| "grad_norm": 1.3014295101165771, |
| "learning_rate": 2.264640049683165e-05, |
| "loss": 0.1876, |
| "mean_token_accuracy": 0.9175386264920234, |
| "num_tokens": 147638076.0, |
| "step": 7540 |
| }, |
| { |
| "entropy": 0.6489442780613899, |
| "epoch": 1.2201212121212122, |
| "grad_norm": 0.24760295450687408, |
| "learning_rate": 2.2618033895784995e-05, |
| "loss": 0.1883, |
| "mean_token_accuracy": 0.9158783406019211, |
| "num_tokens": 147836030.0, |
| "step": 7550 |
| }, |
| { |
| "entropy": 0.649086344242096, |
| "epoch": 1.2217373737373738, |
| "grad_norm": 0.4136185050010681, |
| "learning_rate": 2.2589652581819586e-05, |
| "loss": 0.1962, |
| "mean_token_accuracy": 0.9124460712075233, |
| "num_tokens": 148034420.0, |
| "step": 7560 |
| }, |
| { |
| "entropy": 0.6605263993144035, |
| "epoch": 1.2233535353535354, |
| "grad_norm": 0.27574583888053894, |
| "learning_rate": 2.2561256636523796e-05, |
| "loss": 0.192, |
| "mean_token_accuracy": 0.905976240336895, |
| "num_tokens": 148229801.0, |
| "step": 7570 |
| }, |
| { |
| "entropy": 0.6664862349629402, |
| "epoch": 1.224969696969697, |
| "grad_norm": 0.25330832600593567, |
| "learning_rate": 2.2532846141528023e-05, |
| "loss": 0.1918, |
| "mean_token_accuracy": 0.9155817702412605, |
| "num_tokens": 148424374.0, |
| "step": 7580 |
| }, |
| { |
| "entropy": 0.6453304678201676, |
| "epoch": 1.2265858585858587, |
| "grad_norm": 0.48292797803878784, |
| "learning_rate": 2.2504421178504528e-05, |
| "loss": 0.2119, |
| "mean_token_accuracy": 0.9098923563957214, |
| "num_tokens": 148624497.0, |
| "step": 7590 |
| }, |
| { |
| "entropy": 0.6781962931156158, |
| "epoch": 1.2282020202020203, |
| "grad_norm": 0.26671740412712097, |
| "learning_rate": 2.2475981829167126e-05, |
| "loss": 0.1897, |
| "mean_token_accuracy": 0.9188299715518952, |
| "num_tokens": 148811307.0, |
| "step": 7600 |
| }, |
| { |
| "entropy": 0.6696497678756714, |
| "epoch": 1.2298181818181817, |
| "grad_norm": 0.21028929948806763, |
| "learning_rate": 2.244752817527102e-05, |
| "loss": 0.1834, |
| "mean_token_accuracy": 0.9069130301475525, |
| "num_tokens": 149001143.0, |
| "step": 7610 |
| }, |
| { |
| "entropy": 0.6347329325973987, |
| "epoch": 1.2314343434343433, |
| "grad_norm": 0.24588893353939056, |
| "learning_rate": 2.241906029861251e-05, |
| "loss": 0.181, |
| "mean_token_accuracy": 0.9176601111888886, |
| "num_tokens": 149203763.0, |
| "step": 7620 |
| }, |
| { |
| "entropy": 0.6485351033508777, |
| "epoch": 1.233050505050505, |
| "grad_norm": 0.18820928037166595, |
| "learning_rate": 2.239057828102881e-05, |
| "loss": 0.1958, |
| "mean_token_accuracy": 0.9130016818642617, |
| "num_tokens": 149403173.0, |
| "step": 7630 |
| }, |
| { |
| "entropy": 0.6550226002931595, |
| "epoch": 1.2346666666666666, |
| "grad_norm": 0.2959213852882385, |
| "learning_rate": 2.2362082204397756e-05, |
| "loss": 0.188, |
| "mean_token_accuracy": 0.9180138662457467, |
| "num_tokens": 149597427.0, |
| "step": 7640 |
| }, |
| { |
| "entropy": 0.6599608227610588, |
| "epoch": 1.2362828282828282, |
| "grad_norm": 0.2471340447664261, |
| "learning_rate": 2.233357215063762e-05, |
| "loss": 0.1939, |
| "mean_token_accuracy": 0.9141124650835991, |
| "num_tokens": 149790440.0, |
| "step": 7650 |
| }, |
| { |
| "entropy": 0.6660330504179001, |
| "epoch": 1.2378989898989898, |
| "grad_norm": 0.1872570961713791, |
| "learning_rate": 2.2305048201706855e-05, |
| "loss": 0.1961, |
| "mean_token_accuracy": 0.9194521516561508, |
| "num_tokens": 149980879.0, |
| "step": 7660 |
| }, |
| { |
| "entropy": 0.6459147520363331, |
| "epoch": 1.2395151515151515, |
| "grad_norm": 0.6696406006813049, |
| "learning_rate": 2.2276510439603838e-05, |
| "loss": 0.1949, |
| "mean_token_accuracy": 0.917964231967926, |
| "num_tokens": 150180086.0, |
| "step": 7670 |
| }, |
| { |
| "entropy": 0.6270775929093361, |
| "epoch": 1.241131313131313, |
| "grad_norm": 0.21063180267810822, |
| "learning_rate": 2.2247958946366676e-05, |
| "loss": 0.187, |
| "mean_token_accuracy": 0.9169306978583336, |
| "num_tokens": 150388547.0, |
| "step": 7680 |
| }, |
| { |
| "entropy": 0.6520090967416763, |
| "epoch": 1.2427474747474747, |
| "grad_norm": 0.20971941947937012, |
| "learning_rate": 2.221939380407294e-05, |
| "loss": 0.1884, |
| "mean_token_accuracy": 0.915847373008728, |
| "num_tokens": 150584388.0, |
| "step": 7690 |
| }, |
| { |
| "entropy": 0.6569691218435765, |
| "epoch": 1.2443636363636363, |
| "grad_norm": 0.3301805257797241, |
| "learning_rate": 2.2190815094839442e-05, |
| "loss": 0.2204, |
| "mean_token_accuracy": 0.9099266052246093, |
| "num_tokens": 150782164.0, |
| "step": 7700 |
| }, |
| { |
| "entropy": 0.6681100189685821, |
| "epoch": 1.245979797979798, |
| "grad_norm": 0.3229370713233948, |
| "learning_rate": 2.2162222900821992e-05, |
| "loss": 0.1999, |
| "mean_token_accuracy": 0.9191615000367165, |
| "num_tokens": 150973852.0, |
| "step": 7710 |
| }, |
| { |
| "entropy": 0.6337154619395733, |
| "epoch": 1.2475959595959596, |
| "grad_norm": 0.23271264135837555, |
| "learning_rate": 2.2133617304215167e-05, |
| "loss": 0.1952, |
| "mean_token_accuracy": 0.9135285466909409, |
| "num_tokens": 151176552.0, |
| "step": 7720 |
| }, |
| { |
| "entropy": 0.6439542829990387, |
| "epoch": 1.2492121212121212, |
| "grad_norm": 0.25363296270370483, |
| "learning_rate": 2.2104998387252066e-05, |
| "loss": 0.2087, |
| "mean_token_accuracy": 0.9085622906684876, |
| "num_tokens": 151376726.0, |
| "step": 7730 |
| }, |
| { |
| "entropy": 0.6427732348442078, |
| "epoch": 1.2508282828282828, |
| "grad_norm": 0.20807453989982605, |
| "learning_rate": 2.20763662322041e-05, |
| "loss": 0.2028, |
| "mean_token_accuracy": 0.9118719816207885, |
| "num_tokens": 151574565.0, |
| "step": 7740 |
| }, |
| { |
| "entropy": 0.6582729101181031, |
| "epoch": 1.2524444444444445, |
| "grad_norm": 0.17287397384643555, |
| "learning_rate": 2.204772092138071e-05, |
| "loss": 0.1882, |
| "mean_token_accuracy": 0.917538258433342, |
| "num_tokens": 151767869.0, |
| "step": 7750 |
| }, |
| { |
| "entropy": 0.6312249414622784, |
| "epoch": 1.254060606060606, |
| "grad_norm": 0.21336060762405396, |
| "learning_rate": 2.2019062537129172e-05, |
| "loss": 0.1962, |
| "mean_token_accuracy": 0.9108826741576195, |
| "num_tokens": 151970671.0, |
| "step": 7760 |
| }, |
| { |
| "entropy": 0.6726974606513977, |
| "epoch": 1.2556767676767677, |
| "grad_norm": 0.6455129384994507, |
| "learning_rate": 2.199039116183434e-05, |
| "loss": 0.1972, |
| "mean_token_accuracy": 0.9136315107345581, |
| "num_tokens": 152158479.0, |
| "step": 7770 |
| }, |
| { |
| "entropy": 0.6610306613147259, |
| "epoch": 1.2572929292929294, |
| "grad_norm": 0.22136108577251434, |
| "learning_rate": 2.1961706877918418e-05, |
| "loss": 0.1753, |
| "mean_token_accuracy": 0.921593825519085, |
| "num_tokens": 152353114.0, |
| "step": 7780 |
| }, |
| { |
| "entropy": 0.656276173144579, |
| "epoch": 1.258909090909091, |
| "grad_norm": 0.16673600673675537, |
| "learning_rate": 2.1933009767840713e-05, |
| "loss": 0.1933, |
| "mean_token_accuracy": 0.9160093143582344, |
| "num_tokens": 152550398.0, |
| "step": 7790 |
| }, |
| { |
| "entropy": 0.644133622944355, |
| "epoch": 1.2605252525252526, |
| "grad_norm": 0.23686234652996063, |
| "learning_rate": 2.1904299914097394e-05, |
| "loss": 0.2018, |
| "mean_token_accuracy": 0.9210563838481903, |
| "num_tokens": 152749548.0, |
| "step": 7800 |
| }, |
| { |
| "epoch": 1.2605252525252526, |
| "eval_entropy": 0.7122493233680726, |
| "eval_loss": 0.17709016799926758, |
| "eval_mean_token_accuracy": 0.9228101785778999, |
| "eval_num_tokens": 152749548.0, |
| "eval_runtime": 108.1994, |
| "eval_samples_per_second": 9.242, |
| "eval_steps_per_second": 9.242, |
| "step": 7800 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 18564, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 300, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.4877359534504673e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|