| { |
| "best_global_step": 512, |
| "best_metric": 0.22544851899147034, |
| "best_model_checkpoint": "DQwen3-1.7B-uncensored/checkpoint-512", |
| "epoch": 0.07895142636854278, |
| "eval_steps": 128, |
| "global_step": 512, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.9247070550918579, |
| "epoch": 0.00015420200462606013, |
| "grad_norm": 31.125, |
| "learning_rate": 0.0, |
| "loss": 2.181769847869873, |
| "mean_token_accuracy": 0.6662116050720215, |
| "num_tokens": 1473.0, |
| "step": 1 |
| }, |
| { |
| "entropy": 1.0056357383728027, |
| "epoch": 0.00030840400925212025, |
| "grad_norm": 43.0, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 2.696769952774048, |
| "mean_token_accuracy": 0.617977499961853, |
| "num_tokens": 2460.0, |
| "step": 2 |
| }, |
| { |
| "entropy": 1.0802278518676758, |
| "epoch": 0.0004626060138781804, |
| "grad_norm": 41.5, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 2.6830270290374756, |
| "mean_token_accuracy": 0.6033275127410889, |
| "num_tokens": 3610.0, |
| "step": 3 |
| }, |
| { |
| "entropy": 0.9580708742141724, |
| "epoch": 0.0006168080185042405, |
| "grad_norm": 33.75, |
| "learning_rate": 5e-06, |
| "loss": 2.234797716140747, |
| "mean_token_accuracy": 0.6668869853019714, |
| "num_tokens": 5131.0, |
| "step": 4 |
| }, |
| { |
| "entropy": 1.0076204538345337, |
| "epoch": 0.0007710100231303007, |
| "grad_norm": 34.75, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 2.374027729034424, |
| "mean_token_accuracy": 0.6405493021011353, |
| "num_tokens": 6377.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 1.0595803260803223, |
| "epoch": 0.0009252120277563608, |
| "grad_norm": 34.25, |
| "learning_rate": 8.333333333333334e-06, |
| "loss": 2.216482400894165, |
| "mean_token_accuracy": 0.6401006579399109, |
| "num_tokens": 7577.0, |
| "step": 6 |
| }, |
| { |
| "entropy": 1.1813561916351318, |
| "epoch": 0.001079414032382421, |
| "grad_norm": 25.375, |
| "learning_rate": 1e-05, |
| "loss": 2.0130465030670166, |
| "mean_token_accuracy": 0.6524437665939331, |
| "num_tokens": 8874.0, |
| "step": 7 |
| }, |
| { |
| "entropy": 1.4208881855010986, |
| "epoch": 0.001233616037008481, |
| "grad_norm": 24.125, |
| "learning_rate": 1.1666666666666668e-05, |
| "loss": 2.3102152347564697, |
| "mean_token_accuracy": 0.5884244441986084, |
| "num_tokens": 9815.0, |
| "step": 8 |
| }, |
| { |
| "entropy": 0.8640963435173035, |
| "epoch": 0.0013878180416345412, |
| "grad_norm": 12.3125, |
| "learning_rate": 1.3333333333333333e-05, |
| "loss": 1.3446450233459473, |
| "mean_token_accuracy": 0.7377659678459167, |
| "num_tokens": 11703.0, |
| "step": 9 |
| }, |
| { |
| "entropy": 1.0817725658416748, |
| "epoch": 0.0015420200462606013, |
| "grad_norm": 14.125, |
| "learning_rate": 1.5e-05, |
| "loss": 1.6320915222167969, |
| "mean_token_accuracy": 0.6808972358703613, |
| "num_tokens": 13093.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.9536824226379395, |
| "epoch": 0.0016962220508866615, |
| "grad_norm": 10.5, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 1.2360026836395264, |
| "mean_token_accuracy": 0.741475522518158, |
| "num_tokens": 14714.0, |
| "step": 11 |
| }, |
| { |
| "entropy": 1.049913763999939, |
| "epoch": 0.0018504240555127216, |
| "grad_norm": 10.875, |
| "learning_rate": 1.8333333333333333e-05, |
| "loss": 1.2552467584609985, |
| "mean_token_accuracy": 0.7327285408973694, |
| "num_tokens": 16155.0, |
| "step": 12 |
| }, |
| { |
| "entropy": 0.7939231395721436, |
| "epoch": 0.0020046260601387818, |
| "grad_norm": 7.28125, |
| "learning_rate": 2e-05, |
| "loss": 0.8604422211647034, |
| "mean_token_accuracy": 0.7944584488868713, |
| "num_tokens": 18148.0, |
| "step": 13 |
| }, |
| { |
| "entropy": 0.9421704411506653, |
| "epoch": 0.002158828064764842, |
| "grad_norm": 9.375, |
| "learning_rate": 2.1666666666666667e-05, |
| "loss": 0.9789397716522217, |
| "mean_token_accuracy": 0.7728531956672668, |
| "num_tokens": 19600.0, |
| "step": 14 |
| }, |
| { |
| "entropy": 1.101209044456482, |
| "epoch": 0.002313030069390902, |
| "grad_norm": 9.5, |
| "learning_rate": 2.3333333333333336e-05, |
| "loss": 1.1167230606079102, |
| "mean_token_accuracy": 0.759087085723877, |
| "num_tokens": 20791.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 0.8545694351196289, |
| "epoch": 0.002467232074016962, |
| "grad_norm": 8.875, |
| "learning_rate": 2.5e-05, |
| "loss": 0.8782606720924377, |
| "mean_token_accuracy": 0.8138889074325562, |
| "num_tokens": 22239.0, |
| "step": 16 |
| }, |
| { |
| "entropy": 0.5610961318016052, |
| "epoch": 0.0026214340786430224, |
| "grad_norm": 5.5, |
| "learning_rate": 2.6666666666666667e-05, |
| "loss": 0.5257444381713867, |
| "mean_token_accuracy": 0.8684759736061096, |
| "num_tokens": 24642.0, |
| "step": 17 |
| }, |
| { |
| "entropy": 0.9791864156723022, |
| "epoch": 0.0027756360832690823, |
| "grad_norm": 12.0, |
| "learning_rate": 2.8333333333333335e-05, |
| "loss": 1.0223743915557861, |
| "mean_token_accuracy": 0.780053436756134, |
| "num_tokens": 25773.0, |
| "step": 18 |
| }, |
| { |
| "entropy": 0.6505466103553772, |
| "epoch": 0.0029298380878951427, |
| "grad_norm": 7.34375, |
| "learning_rate": 3e-05, |
| "loss": 0.5793496370315552, |
| "mean_token_accuracy": 0.8571428656578064, |
| "num_tokens": 27258.0, |
| "step": 19 |
| }, |
| { |
| "entropy": 0.8408939242362976, |
| "epoch": 0.0030840400925212026, |
| "grad_norm": 8.1875, |
| "learning_rate": 3.1666666666666666e-05, |
| "loss": 0.7403033375740051, |
| "mean_token_accuracy": 0.8192341923713684, |
| "num_tokens": 28389.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.6034911274909973, |
| "epoch": 0.003238242097147263, |
| "grad_norm": 9.125, |
| "learning_rate": 3.3333333333333335e-05, |
| "loss": 0.5590517520904541, |
| "mean_token_accuracy": 0.8626410365104675, |
| "num_tokens": 29904.0, |
| "step": 21 |
| }, |
| { |
| "entropy": 0.9795640110969543, |
| "epoch": 0.003392444101773323, |
| "grad_norm": 13.25, |
| "learning_rate": 3.5e-05, |
| "loss": 0.9148629307746887, |
| "mean_token_accuracy": 0.7654135227203369, |
| "num_tokens": 30577.0, |
| "step": 22 |
| }, |
| { |
| "entropy": 0.7439587712287903, |
| "epoch": 0.0035466461063993833, |
| "grad_norm": 7.25, |
| "learning_rate": 3.6666666666666666e-05, |
| "loss": 0.5953554511070251, |
| "mean_token_accuracy": 0.8356807231903076, |
| "num_tokens": 31863.0, |
| "step": 23 |
| }, |
| { |
| "entropy": 0.7064525485038757, |
| "epoch": 0.0037008481110254433, |
| "grad_norm": 12.3125, |
| "learning_rate": 3.8333333333333334e-05, |
| "loss": 0.6644932627677917, |
| "mean_token_accuracy": 0.8366890549659729, |
| "num_tokens": 32765.0, |
| "step": 24 |
| }, |
| { |
| "entropy": 0.671248733997345, |
| "epoch": 0.0038550501156515036, |
| "grad_norm": 10.6875, |
| "learning_rate": 4e-05, |
| "loss": 0.6164469718933105, |
| "mean_token_accuracy": 0.826787531375885, |
| "num_tokens": 33766.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 0.5062084794044495, |
| "epoch": 0.0040092521202775636, |
| "grad_norm": 5.4375, |
| "learning_rate": 4.166666666666667e-05, |
| "loss": 0.40353134274482727, |
| "mean_token_accuracy": 0.8768283128738403, |
| "num_tokens": 35073.0, |
| "step": 26 |
| }, |
| { |
| "entropy": 0.5384562015533447, |
| "epoch": 0.0041634541249036235, |
| "grad_norm": 5.78125, |
| "learning_rate": 4.3333333333333334e-05, |
| "loss": 0.5431831479072571, |
| "mean_token_accuracy": 0.8347339034080505, |
| "num_tokens": 36152.0, |
| "step": 27 |
| }, |
| { |
| "entropy": 0.4401922821998596, |
| "epoch": 0.004317656129529684, |
| "grad_norm": 4.0, |
| "learning_rate": 4.5e-05, |
| "loss": 0.412110298871994, |
| "mean_token_accuracy": 0.8691341876983643, |
| "num_tokens": 37673.0, |
| "step": 28 |
| }, |
| { |
| "entropy": 0.3748137652873993, |
| "epoch": 0.004471858134155744, |
| "grad_norm": 3.484375, |
| "learning_rate": 4.666666666666667e-05, |
| "loss": 0.360552042722702, |
| "mean_token_accuracy": 0.8896728754043579, |
| "num_tokens": 39240.0, |
| "step": 29 |
| }, |
| { |
| "entropy": 0.45826223492622375, |
| "epoch": 0.004626060138781804, |
| "grad_norm": 4.53125, |
| "learning_rate": 4.8333333333333334e-05, |
| "loss": 0.455470472574234, |
| "mean_token_accuracy": 0.8571428656578064, |
| "num_tokens": 40256.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.33488166332244873, |
| "epoch": 0.004780262143407864, |
| "grad_norm": 3.9375, |
| "learning_rate": 5e-05, |
| "loss": 0.3165741562843323, |
| "mean_token_accuracy": 0.8937432765960693, |
| "num_tokens": 42118.0, |
| "step": 31 |
| }, |
| { |
| "entropy": 0.3528728783130646, |
| "epoch": 0.004934464148033924, |
| "grad_norm": 7.6875, |
| "learning_rate": 4.9999468976006514e-05, |
| "loss": 0.38620343804359436, |
| "mean_token_accuracy": 0.8836023807525635, |
| "num_tokens": 43303.0, |
| "step": 32 |
| }, |
| { |
| "entropy": 0.36625728011131287, |
| "epoch": 0.005088666152659985, |
| "grad_norm": 4.84375, |
| "learning_rate": 4.999787592658497e-05, |
| "loss": 0.446200966835022, |
| "mean_token_accuracy": 0.8777328133583069, |
| "num_tokens": 44546.0, |
| "step": 33 |
| }, |
| { |
| "entropy": 0.26643213629722595, |
| "epoch": 0.005242868157286045, |
| "grad_norm": 4.40625, |
| "learning_rate": 4.999522091941117e-05, |
| "loss": 0.2737399637699127, |
| "mean_token_accuracy": 0.8979820609092712, |
| "num_tokens": 46338.0, |
| "step": 34 |
| }, |
| { |
| "entropy": 0.26414045691490173, |
| "epoch": 0.005397070161912105, |
| "grad_norm": 3.296875, |
| "learning_rate": 4.999150406727491e-05, |
| "loss": 0.2983474135398865, |
| "mean_token_accuracy": 0.9013499617576599, |
| "num_tokens": 48272.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 0.370844304561615, |
| "epoch": 0.005551272166538165, |
| "grad_norm": 3.546875, |
| "learning_rate": 4.9986725528075205e-05, |
| "loss": 0.3454509377479553, |
| "mean_token_accuracy": 0.8820555806159973, |
| "num_tokens": 49467.0, |
| "step": 36 |
| }, |
| { |
| "entropy": 0.2591751515865326, |
| "epoch": 0.0057054741711642255, |
| "grad_norm": 2.8125, |
| "learning_rate": 4.998088550481357e-05, |
| "loss": 0.2637964189052582, |
| "mean_token_accuracy": 0.9097625613212585, |
| "num_tokens": 51370.0, |
| "step": 37 |
| }, |
| { |
| "entropy": 0.30300796031951904, |
| "epoch": 0.005859676175790285, |
| "grad_norm": 2.71875, |
| "learning_rate": 4.997398424558541e-05, |
| "loss": 0.30886220932006836, |
| "mean_token_accuracy": 0.901874303817749, |
| "num_tokens": 53192.0, |
| "step": 38 |
| }, |
| { |
| "entropy": 0.27792277932167053, |
| "epoch": 0.006013878180416345, |
| "grad_norm": 2.6875, |
| "learning_rate": 4.996602204356945e-05, |
| "loss": 0.2732873558998108, |
| "mean_token_accuracy": 0.9054564833641052, |
| "num_tokens": 55051.0, |
| "step": 39 |
| }, |
| { |
| "entropy": 0.2960520088672638, |
| "epoch": 0.006168080185042405, |
| "grad_norm": 2.390625, |
| "learning_rate": 4.9956999237015336e-05, |
| "loss": 0.28190645575523376, |
| "mean_token_accuracy": 0.8957963585853577, |
| "num_tokens": 56748.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.27777737379074097, |
| "epoch": 0.006322282189668466, |
| "grad_norm": 2.390625, |
| "learning_rate": 4.994691620922919e-05, |
| "loss": 0.2448980063199997, |
| "mean_token_accuracy": 0.918410062789917, |
| "num_tokens": 58668.0, |
| "step": 41 |
| }, |
| { |
| "entropy": 0.3428345024585724, |
| "epoch": 0.006476484194294526, |
| "grad_norm": 5.15625, |
| "learning_rate": 4.993577338855741e-05, |
| "loss": 0.354027658700943, |
| "mean_token_accuracy": 0.8893527984619141, |
| "num_tokens": 60113.0, |
| "step": 42 |
| }, |
| { |
| "entropy": 0.30580252408981323, |
| "epoch": 0.006630686198920586, |
| "grad_norm": 2.75, |
| "learning_rate": 4.9923571248368375e-05, |
| "loss": 0.28440362215042114, |
| "mean_token_accuracy": 0.9064558744430542, |
| "num_tokens": 61639.0, |
| "step": 43 |
| }, |
| { |
| "entropy": 0.43973830342292786, |
| "epoch": 0.006784888203546646, |
| "grad_norm": 4.5625, |
| "learning_rate": 4.991031030703244e-05, |
| "loss": 0.4269028902053833, |
| "mean_token_accuracy": 0.859668493270874, |
| "num_tokens": 62552.0, |
| "step": 44 |
| }, |
| { |
| "entropy": 0.33187365531921387, |
| "epoch": 0.006939090208172706, |
| "grad_norm": 3.546875, |
| "learning_rate": 4.989599112789984e-05, |
| "loss": 0.34638962149620056, |
| "mean_token_accuracy": 0.8819671869277954, |
| "num_tokens": 63780.0, |
| "step": 45 |
| }, |
| { |
| "entropy": 0.31787073612213135, |
| "epoch": 0.007093292212798767, |
| "grad_norm": 3.703125, |
| "learning_rate": 4.988061431927681e-05, |
| "loss": 0.3301301598548889, |
| "mean_token_accuracy": 0.8874788284301758, |
| "num_tokens": 64970.0, |
| "step": 46 |
| }, |
| { |
| "entropy": 0.2509302496910095, |
| "epoch": 0.0072474942174248266, |
| "grad_norm": 3.375, |
| "learning_rate": 4.9864180534399674e-05, |
| "loss": 0.2752370238304138, |
| "mean_token_accuracy": 0.9063180685043335, |
| "num_tokens": 66814.0, |
| "step": 47 |
| }, |
| { |
| "entropy": 0.28000396490097046, |
| "epoch": 0.0074016962220508865, |
| "grad_norm": 3.46875, |
| "learning_rate": 4.984669047140716e-05, |
| "loss": 0.3101637363433838, |
| "mean_token_accuracy": 0.8998599648475647, |
| "num_tokens": 68250.0, |
| "step": 48 |
| }, |
| { |
| "entropy": 0.263468474149704, |
| "epoch": 0.0075558982266769464, |
| "grad_norm": 3.046875, |
| "learning_rate": 4.982814487331072e-05, |
| "loss": 0.29188624024391174, |
| "mean_token_accuracy": 0.9051411747932434, |
| "num_tokens": 69639.0, |
| "step": 49 |
| }, |
| { |
| "entropy": 0.30314338207244873, |
| "epoch": 0.007710100231303007, |
| "grad_norm": 3.296875, |
| "learning_rate": 4.9808544527962964e-05, |
| "loss": 0.3129803538322449, |
| "mean_token_accuracy": 0.8961228728294373, |
| "num_tokens": 71014.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.299335241317749, |
| "epoch": 0.007864302235929066, |
| "grad_norm": 2.9375, |
| "learning_rate": 4.978789026802419e-05, |
| "loss": 0.3139159679412842, |
| "mean_token_accuracy": 0.8981788158416748, |
| "num_tokens": 72230.0, |
| "step": 51 |
| }, |
| { |
| "entropy": 0.33503401279449463, |
| "epoch": 0.008018504240555127, |
| "grad_norm": 3.265625, |
| "learning_rate": 4.9766182970926975e-05, |
| "loss": 0.3325449526309967, |
| "mean_token_accuracy": 0.8837863206863403, |
| "num_tokens": 73305.0, |
| "step": 52 |
| }, |
| { |
| "entropy": 0.3575332760810852, |
| "epoch": 0.008172706245181188, |
| "grad_norm": 3.21875, |
| "learning_rate": 4.9743423558839e-05, |
| "loss": 0.3531642258167267, |
| "mean_token_accuracy": 0.8720735907554626, |
| "num_tokens": 74509.0, |
| "step": 53 |
| }, |
| { |
| "entropy": 0.3179467022418976, |
| "epoch": 0.008326908249807247, |
| "grad_norm": 2.90625, |
| "learning_rate": 4.971961299862376e-05, |
| "loss": 0.32219475507736206, |
| "mean_token_accuracy": 0.898162305355072, |
| "num_tokens": 75823.0, |
| "step": 54 |
| }, |
| { |
| "entropy": 0.27880457043647766, |
| "epoch": 0.008481110254433308, |
| "grad_norm": 5.5625, |
| "learning_rate": 4.9694752301799566e-05, |
| "loss": 0.2741078734397888, |
| "mean_token_accuracy": 0.9016189575195312, |
| "num_tokens": 77437.0, |
| "step": 55 |
| }, |
| { |
| "entropy": 0.34488171339035034, |
| "epoch": 0.008635312259059369, |
| "grad_norm": 3.90625, |
| "learning_rate": 4.9668842524496526e-05, |
| "loss": 0.366953045129776, |
| "mean_token_accuracy": 0.8817635178565979, |
| "num_tokens": 78942.0, |
| "step": 56 |
| }, |
| { |
| "entropy": 0.2759552299976349, |
| "epoch": 0.008789514263685428, |
| "grad_norm": 2.390625, |
| "learning_rate": 4.9641884767411714e-05, |
| "loss": 0.23074716329574585, |
| "mean_token_accuracy": 0.9223560690879822, |
| "num_tokens": 80444.0, |
| "step": 57 |
| }, |
| { |
| "entropy": 0.2680215835571289, |
| "epoch": 0.008943716268311488, |
| "grad_norm": 2.375, |
| "learning_rate": 4.9613880175762414e-05, |
| "loss": 0.2504393458366394, |
| "mean_token_accuracy": 0.9081172347068787, |
| "num_tokens": 82226.0, |
| "step": 58 |
| }, |
| { |
| "entropy": 0.3482436537742615, |
| "epoch": 0.009097918272937548, |
| "grad_norm": 2.859375, |
| "learning_rate": 4.958482993923742e-05, |
| "loss": 0.3350726068019867, |
| "mean_token_accuracy": 0.8843283653259277, |
| "num_tokens": 83306.0, |
| "step": 59 |
| }, |
| { |
| "entropy": 0.3756292164325714, |
| "epoch": 0.009252120277563608, |
| "grad_norm": 2.75, |
| "learning_rate": 4.955473529194654e-05, |
| "loss": 0.3661136329174042, |
| "mean_token_accuracy": 0.8703535795211792, |
| "num_tokens": 84417.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.30815669894218445, |
| "epoch": 0.00940632228218967, |
| "grad_norm": 2.546875, |
| "learning_rate": 4.952359751236817e-05, |
| "loss": 0.2678232789039612, |
| "mean_token_accuracy": 0.9128242135047913, |
| "num_tokens": 85813.0, |
| "step": 61 |
| }, |
| { |
| "entropy": 0.2885396480560303, |
| "epoch": 0.009560524286815728, |
| "grad_norm": 2.28125, |
| "learning_rate": 4.9491417923294934e-05, |
| "loss": 0.2961139976978302, |
| "mean_token_accuracy": 0.897884726524353, |
| "num_tokens": 87192.0, |
| "step": 62 |
| }, |
| { |
| "entropy": 0.3488570749759674, |
| "epoch": 0.009714726291441789, |
| "grad_norm": 4.09375, |
| "learning_rate": 4.9458197891777556e-05, |
| "loss": 0.41754454374313354, |
| "mean_token_accuracy": 0.8623949289321899, |
| "num_tokens": 88152.0, |
| "step": 63 |
| }, |
| { |
| "entropy": 0.30880776047706604, |
| "epoch": 0.009868928296067848, |
| "grad_norm": 3.59375, |
| "learning_rate": 4.942393882906674e-05, |
| "loss": 0.3697586953639984, |
| "mean_token_accuracy": 0.8826446533203125, |
| "num_tokens": 89370.0, |
| "step": 64 |
| }, |
| { |
| "entropy": 0.33239686489105225, |
| "epoch": 0.010023130300693909, |
| "grad_norm": 3.078125, |
| "learning_rate": 4.9388642190553226e-05, |
| "loss": 0.3398675322532654, |
| "mean_token_accuracy": 0.8863636255264282, |
| "num_tokens": 90478.0, |
| "step": 65 |
| }, |
| { |
| "entropy": 0.2123962938785553, |
| "epoch": 0.01017733230531997, |
| "grad_norm": 2.65625, |
| "learning_rate": 4.935230947570597e-05, |
| "loss": 0.24962244927883148, |
| "mean_token_accuracy": 0.9207471013069153, |
| "num_tokens": 92467.0, |
| "step": 66 |
| }, |
| { |
| "entropy": 0.2466113418340683, |
| "epoch": 0.010331534309946029, |
| "grad_norm": 2.984375, |
| "learning_rate": 4.931494222800844e-05, |
| "loss": 0.2969174385070801, |
| "mean_token_accuracy": 0.8992460370063782, |
| "num_tokens": 93934.0, |
| "step": 67 |
| }, |
| { |
| "entropy": 0.2843821942806244, |
| "epoch": 0.01048573631457209, |
| "grad_norm": 2.765625, |
| "learning_rate": 4.9276542034893044e-05, |
| "loss": 0.32256507873535156, |
| "mean_token_accuracy": 0.8970125913619995, |
| "num_tokens": 95214.0, |
| "step": 68 |
| }, |
| { |
| "entropy": 0.2835680842399597, |
| "epoch": 0.01063993831919815, |
| "grad_norm": 2.359375, |
| "learning_rate": 4.923711052767369e-05, |
| "loss": 0.25358864665031433, |
| "mean_token_accuracy": 0.9075269103050232, |
| "num_tokens": 96617.0, |
| "step": 69 |
| }, |
| { |
| "entropy": 0.24416741728782654, |
| "epoch": 0.01079414032382421, |
| "grad_norm": 2.453125, |
| "learning_rate": 4.9196649381476504e-05, |
| "loss": 0.2399137169122696, |
| "mean_token_accuracy": 0.9191176295280457, |
| "num_tokens": 98257.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.29682713747024536, |
| "epoch": 0.01094834232845027, |
| "grad_norm": 3.109375, |
| "learning_rate": 4.915516031516863e-05, |
| "loss": 0.3011798858642578, |
| "mean_token_accuracy": 0.9019886255264282, |
| "num_tokens": 99673.0, |
| "step": 71 |
| }, |
| { |
| "entropy": 0.31920552253723145, |
| "epoch": 0.01110254433307633, |
| "grad_norm": 2.84375, |
| "learning_rate": 4.911264509128524e-05, |
| "loss": 0.3012612462043762, |
| "mean_token_accuracy": 0.8963922262191772, |
| "num_tokens": 100762.0, |
| "step": 72 |
| }, |
| { |
| "entropy": 0.30763909220695496, |
| "epoch": 0.01125674633770239, |
| "grad_norm": 2.5, |
| "learning_rate": 4.906910551595466e-05, |
| "loss": 0.2967263460159302, |
| "mean_token_accuracy": 0.9000924825668335, |
| "num_tokens": 101851.0, |
| "step": 73 |
| }, |
| { |
| "entropy": 0.3040732145309448, |
| "epoch": 0.011410948342328451, |
| "grad_norm": 3.0625, |
| "learning_rate": 4.902454343882162e-05, |
| "loss": 0.3297285735607147, |
| "mean_token_accuracy": 0.8881889581680298, |
| "num_tokens": 103129.0, |
| "step": 74 |
| }, |
| { |
| "entropy": 0.2837064564228058, |
| "epoch": 0.01156515034695451, |
| "grad_norm": 2.546875, |
| "learning_rate": 4.8978960752968695e-05, |
| "loss": 0.28480246663093567, |
| "mean_token_accuracy": 0.8954154849052429, |
| "num_tokens": 104533.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 0.3194725811481476, |
| "epoch": 0.01171935235158057, |
| "grad_norm": 3.171875, |
| "learning_rate": 4.893235939483587e-05, |
| "loss": 0.3251062333583832, |
| "mean_token_accuracy": 0.8847517967224121, |
| "num_tokens": 105669.0, |
| "step": 76 |
| }, |
| { |
| "entropy": 0.2741483747959137, |
| "epoch": 0.01187355435620663, |
| "grad_norm": 2.34375, |
| "learning_rate": 4.8884741344138294e-05, |
| "loss": 0.2672386169433594, |
| "mean_token_accuracy": 0.9074759483337402, |
| "num_tokens": 107028.0, |
| "step": 77 |
| }, |
| { |
| "entropy": 0.2283252328634262, |
| "epoch": 0.01202775636083269, |
| "grad_norm": 2.265625, |
| "learning_rate": 4.8836108623782154e-05, |
| "loss": 0.23968154191970825, |
| "mean_token_accuracy": 0.9172775149345398, |
| "num_tokens": 108946.0, |
| "step": 78 |
| }, |
| { |
| "entropy": 0.3795450031757355, |
| "epoch": 0.012181958365458751, |
| "grad_norm": 2.921875, |
| "learning_rate": 4.8786463299778773e-05, |
| "loss": 0.4119304120540619, |
| "mean_token_accuracy": 0.8639523386955261, |
| "num_tokens": 109961.0, |
| "step": 79 |
| }, |
| { |
| "entropy": 0.3368295431137085, |
| "epoch": 0.01233616037008481, |
| "grad_norm": 2.984375, |
| "learning_rate": 4.873580748115679e-05, |
| "loss": 0.3614250719547272, |
| "mean_token_accuracy": 0.8688679337501526, |
| "num_tokens": 111029.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.2857000231742859, |
| "epoch": 0.012490362374710871, |
| "grad_norm": 2.890625, |
| "learning_rate": 4.8684143319872636e-05, |
| "loss": 0.2805139422416687, |
| "mean_token_accuracy": 0.8976377844810486, |
| "num_tokens": 112307.0, |
| "step": 81 |
| }, |
| { |
| "entropy": 0.3508206605911255, |
| "epoch": 0.012644564379336932, |
| "grad_norm": 2.6875, |
| "learning_rate": 4.863147301071903e-05, |
| "loss": 0.3427751362323761, |
| "mean_token_accuracy": 0.8861867785453796, |
| "num_tokens": 113343.0, |
| "step": 82 |
| }, |
| { |
| "entropy": 0.33843186497688293, |
| "epoch": 0.012798766383962991, |
| "grad_norm": 2.796875, |
| "learning_rate": 4.8577798791231815e-05, |
| "loss": 0.32030197978019714, |
| "mean_token_accuracy": 0.8884462118148804, |
| "num_tokens": 114606.0, |
| "step": 83 |
| }, |
| { |
| "entropy": 0.2596885859966278, |
| "epoch": 0.012952968388589052, |
| "grad_norm": 2.15625, |
| "learning_rate": 4.852312294159486e-05, |
| "loss": 0.2479410469532013, |
| "mean_token_accuracy": 0.916926920413971, |
| "num_tokens": 116215.0, |
| "step": 84 |
| }, |
| { |
| "entropy": 0.3047274947166443, |
| "epoch": 0.013107170393215111, |
| "grad_norm": 3.03125, |
| "learning_rate": 4.8467447784543205e-05, |
| "loss": 0.30305323004722595, |
| "mean_token_accuracy": 0.8943606019020081, |
| "num_tokens": 117482.0, |
| "step": 85 |
| }, |
| { |
| "entropy": 0.18909737467765808, |
| "epoch": 0.013261372397841172, |
| "grad_norm": 1.984375, |
| "learning_rate": 4.841077568526439e-05, |
| "loss": 0.2026541382074356, |
| "mean_token_accuracy": 0.9290540814399719, |
| "num_tokens": 119858.0, |
| "step": 86 |
| }, |
| { |
| "entropy": 0.2918696403503418, |
| "epoch": 0.013415574402467233, |
| "grad_norm": 2.953125, |
| "learning_rate": 4.8353109051297976e-05, |
| "loss": 0.3184109330177307, |
| "mean_token_accuracy": 0.8954508900642395, |
| "num_tokens": 121119.0, |
| "step": 87 |
| }, |
| { |
| "entropy": 0.3284008800983429, |
| "epoch": 0.013569776407093292, |
| "grad_norm": 2.953125, |
| "learning_rate": 4.829445033243326e-05, |
| "loss": 0.3101221024990082, |
| "mean_token_accuracy": 0.8810949325561523, |
| "num_tokens": 122296.0, |
| "step": 88 |
| }, |
| { |
| "entropy": 0.2881852090358734, |
| "epoch": 0.013723978411719353, |
| "grad_norm": 2.375, |
| "learning_rate": 4.823480202060521e-05, |
| "loss": 0.28734254837036133, |
| "mean_token_accuracy": 0.897292971611023, |
| "num_tokens": 123560.0, |
| "step": 89 |
| }, |
| { |
| "entropy": 0.3923459053039551, |
| "epoch": 0.013878180416345412, |
| "grad_norm": 3.40625, |
| "learning_rate": 4.817416664978861e-05, |
| "loss": 0.4181320071220398, |
| "mean_token_accuracy": 0.857782781124115, |
| "num_tokens": 124461.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.38047465682029724, |
| "epoch": 0.014032382420971472, |
| "grad_norm": 3.296875, |
| "learning_rate": 4.81125467958904e-05, |
| "loss": 0.4139612317085266, |
| "mean_token_accuracy": 0.8700189590454102, |
| "num_tokens": 125523.0, |
| "step": 91 |
| }, |
| { |
| "entropy": 0.32315686345100403, |
| "epoch": 0.014186584425597533, |
| "grad_norm": 3.125, |
| "learning_rate": 4.804994507664026e-05, |
| "loss": 0.29804831743240356, |
| "mean_token_accuracy": 0.9056603908538818, |
| "num_tokens": 126962.0, |
| "step": 92 |
| }, |
| { |
| "entropy": 0.3598167598247528, |
| "epoch": 0.014340786430223592, |
| "grad_norm": 4.375, |
| "learning_rate": 4.798636415147938e-05, |
| "loss": 0.33338406682014465, |
| "mean_token_accuracy": 0.876142144203186, |
| "num_tokens": 127955.0, |
| "step": 93 |
| }, |
| { |
| "entropy": 0.2664312422275543, |
| "epoch": 0.014494988434849653, |
| "grad_norm": 2.46875, |
| "learning_rate": 4.7921806721447494e-05, |
| "loss": 0.24038437008857727, |
| "mean_token_accuracy": 0.9096692204475403, |
| "num_tokens": 129535.0, |
| "step": 94 |
| }, |
| { |
| "entropy": 0.40390363335609436, |
| "epoch": 0.014649190439475714, |
| "grad_norm": 5.46875, |
| "learning_rate": 4.785627552906816e-05, |
| "loss": 0.39077234268188477, |
| "mean_token_accuracy": 0.8630303144454956, |
| "num_tokens": 130368.0, |
| "step": 95 |
| }, |
| { |
| "entropy": 0.25566768646240234, |
| "epoch": 0.014803392444101773, |
| "grad_norm": 2.359375, |
| "learning_rate": 4.77897733582322e-05, |
| "loss": 0.24936963617801666, |
| "mean_token_accuracy": 0.9094488024711609, |
| "num_tokens": 132154.0, |
| "step": 96 |
| }, |
| { |
| "entropy": 0.27786779403686523, |
| "epoch": 0.014957594448727834, |
| "grad_norm": 2.65625, |
| "learning_rate": 4.77223030340795e-05, |
| "loss": 0.26183679699897766, |
| "mean_token_accuracy": 0.9076694250106812, |
| "num_tokens": 133505.0, |
| "step": 97 |
| }, |
| { |
| "entropy": 0.2191360741853714, |
| "epoch": 0.015111796453353893, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.7653867422878926e-05, |
| "loss": 0.20657718181610107, |
| "mean_token_accuracy": 0.9271235466003418, |
| "num_tokens": 135585.0, |
| "step": 98 |
| }, |
| { |
| "entropy": 0.24974940717220306, |
| "epoch": 0.015265998457979954, |
| "grad_norm": 2.6875, |
| "learning_rate": 4.758446943190661e-05, |
| "loss": 0.2656131982803345, |
| "mean_token_accuracy": 0.9067688584327698, |
| "num_tokens": 137159.0, |
| "step": 99 |
| }, |
| { |
| "entropy": 0.2313736230134964, |
| "epoch": 0.015420200462606014, |
| "grad_norm": 2.046875, |
| "learning_rate": 4.751411200932242e-05, |
| "loss": 0.23317928612232208, |
| "mean_token_accuracy": 0.9191856980323792, |
| "num_tokens": 138788.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.2997652590274811, |
| "epoch": 0.015574402467232074, |
| "grad_norm": 2.4375, |
| "learning_rate": 4.7442798144044695e-05, |
| "loss": 0.3120857775211334, |
| "mean_token_accuracy": 0.8966366052627563, |
| "num_tokens": 140015.0, |
| "step": 101 |
| }, |
| { |
| "entropy": 0.3081951141357422, |
| "epoch": 0.015728604471858133, |
| "grad_norm": 2.6875, |
| "learning_rate": 4.7370530865623334e-05, |
| "loss": 0.34071362018585205, |
| "mean_token_accuracy": 0.8860557675361633, |
| "num_tokens": 141278.0, |
| "step": 102 |
| }, |
| { |
| "entropy": 0.2699045240879059, |
| "epoch": 0.015882806476484195, |
| "grad_norm": 2.234375, |
| "learning_rate": 4.729731324411104e-05, |
| "loss": 0.27989721298217773, |
| "mean_token_accuracy": 0.90031898021698, |
| "num_tokens": 142540.0, |
| "step": 103 |
| }, |
| { |
| "entropy": 0.2760254144668579, |
| "epoch": 0.016037008481110254, |
| "grad_norm": 2.234375, |
| "learning_rate": 4.722314838993291e-05, |
| "loss": 0.3070385158061981, |
| "mean_token_accuracy": 0.9090268015861511, |
| "num_tokens": 143966.0, |
| "step": 104 |
| }, |
| { |
| "entropy": 0.29715025424957275, |
| "epoch": 0.016191210485736313, |
| "grad_norm": 2.703125, |
| "learning_rate": 4.714803945375431e-05, |
| "loss": 0.3124261796474457, |
| "mean_token_accuracy": 0.8967213034629822, |
| "num_tokens": 145194.0, |
| "step": 105 |
| }, |
| { |
| "entropy": 0.3180467486381531, |
| "epoch": 0.016345412490362376, |
| "grad_norm": 2.8125, |
| "learning_rate": 4.707198962634701e-05, |
| "loss": 0.3431381285190582, |
| "mean_token_accuracy": 0.8840726017951965, |
| "num_tokens": 146194.0, |
| "step": 106 |
| }, |
| { |
| "entropy": 0.25070613622665405, |
| "epoch": 0.016499614494988435, |
| "grad_norm": 2.3125, |
| "learning_rate": 4.699500213845367e-05, |
| "loss": 0.290202796459198, |
| "mean_token_accuracy": 0.9046997427940369, |
| "num_tokens": 147734.0, |
| "step": 107 |
| }, |
| { |
| "entropy": 0.22292165458202362, |
| "epoch": 0.016653816499614494, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.691708026065055e-05, |
| "loss": 0.2274986356496811, |
| "mean_token_accuracy": 0.9138554334640503, |
| "num_tokens": 149402.0, |
| "step": 108 |
| }, |
| { |
| "entropy": 0.3579561412334442, |
| "epoch": 0.016808018504240557, |
| "grad_norm": 2.78125, |
| "learning_rate": 4.683822730320858e-05, |
| "loss": 0.31315499544143677, |
| "mean_token_accuracy": 0.889497697353363, |
| "num_tokens": 150505.0, |
| "step": 109 |
| }, |
| { |
| "entropy": 0.23602542281150818, |
| "epoch": 0.016962220508866616, |
| "grad_norm": 1.609375, |
| "learning_rate": 4.6758446615952746e-05, |
| "loss": 0.20407229661941528, |
| "mean_token_accuracy": 0.9303831458091736, |
| "num_tokens": 152366.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.3046983480453491, |
| "epoch": 0.017116422513492675, |
| "grad_norm": 3.25, |
| "learning_rate": 4.6677741588119784e-05, |
| "loss": 0.3156391382217407, |
| "mean_token_accuracy": 0.8897196054458618, |
| "num_tokens": 153444.0, |
| "step": 111 |
| }, |
| { |
| "entropy": 0.25662004947662354, |
| "epoch": 0.017270624518118737, |
| "grad_norm": 2.515625, |
| "learning_rate": 4.6596115648214196e-05, |
| "loss": 0.2515248656272888, |
| "mean_token_accuracy": 0.907616376876831, |
| "num_tokens": 154870.0, |
| "step": 112 |
| }, |
| { |
| "entropy": 0.28677132725715637, |
| "epoch": 0.017424826522744796, |
| "grad_norm": 3.265625, |
| "learning_rate": 4.651357226386258e-05, |
| "loss": 0.2942817211151123, |
| "mean_token_accuracy": 0.8936970829963684, |
| "num_tokens": 155941.0, |
| "step": 113 |
| }, |
| { |
| "entropy": 0.21182145178318024, |
| "epoch": 0.017579028527370855, |
| "grad_norm": 2.3125, |
| "learning_rate": 4.6430114941666334e-05, |
| "loss": 0.23567034304141998, |
| "mean_token_accuracy": 0.9196969866752625, |
| "num_tokens": 157269.0, |
| "step": 114 |
| }, |
| { |
| "entropy": 0.18196314573287964, |
| "epoch": 0.017733230531996914, |
| "grad_norm": 1.71875, |
| "learning_rate": 4.6345747227052726e-05, |
| "loss": 0.18516698479652405, |
| "mean_token_accuracy": 0.9305768013000488, |
| "num_tokens": 159236.0, |
| "step": 115 |
| }, |
| { |
| "entropy": 0.23556780815124512, |
| "epoch": 0.017887432536622977, |
| "grad_norm": 2.40625, |
| "learning_rate": 4.626047270412419e-05, |
| "loss": 0.22876134514808655, |
| "mean_token_accuracy": 0.9182389974594116, |
| "num_tokens": 160516.0, |
| "step": 116 |
| }, |
| { |
| "entropy": 0.24857133626937866, |
| "epoch": 0.018041634541249036, |
| "grad_norm": 3.0625, |
| "learning_rate": 4.6174294995506154e-05, |
| "loss": 0.2965892255306244, |
| "mean_token_accuracy": 0.90025794506073, |
| "num_tokens": 161687.0, |
| "step": 117 |
| }, |
| { |
| "entropy": 0.21330931782722473, |
| "epoch": 0.018195836545875095, |
| "grad_norm": 2.421875, |
| "learning_rate": 4.6087217762193105e-05, |
| "loss": 0.23048508167266846, |
| "mean_token_accuracy": 0.9241044521331787, |
| "num_tokens": 163342.0, |
| "step": 118 |
| }, |
| { |
| "entropy": 0.25938084721565247, |
| "epoch": 0.018350038550501158, |
| "grad_norm": 2.734375, |
| "learning_rate": 4.599924470339303e-05, |
| "loss": 0.27338430285453796, |
| "mean_token_accuracy": 0.9029850959777832, |
| "num_tokens": 164690.0, |
| "step": 119 |
| }, |
| { |
| "entropy": 0.3166216015815735, |
| "epoch": 0.018504240555127217, |
| "grad_norm": 3.609375, |
| "learning_rate": 4.5910379556370355e-05, |
| "loss": 0.3654600977897644, |
| "mean_token_accuracy": 0.871026337146759, |
| "num_tokens": 165799.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.21709276735782623, |
| "epoch": 0.018658442559753276, |
| "grad_norm": 1.8359375, |
| "learning_rate": 4.582062609628709e-05, |
| "loss": 0.214874729514122, |
| "mean_token_accuracy": 0.9245843291282654, |
| "num_tokens": 167491.0, |
| "step": 121 |
| }, |
| { |
| "entropy": 0.24251380562782288, |
| "epoch": 0.01881264456437934, |
| "grad_norm": 1.9921875, |
| "learning_rate": 4.57299881360425e-05, |
| "loss": 0.26085519790649414, |
| "mean_token_accuracy": 0.9065860509872437, |
| "num_tokens": 168987.0, |
| "step": 122 |
| }, |
| { |
| "entropy": 0.2558088004589081, |
| "epoch": 0.018966846569005397, |
| "grad_norm": 2.359375, |
| "learning_rate": 4.563846952611112e-05, |
| "loss": 0.2583191692829132, |
| "mean_token_accuracy": 0.9092382788658142, |
| "num_tokens": 170229.0, |
| "step": 123 |
| }, |
| { |
| "entropy": 0.28851792216300964, |
| "epoch": 0.019121048573631456, |
| "grad_norm": 2.25, |
| "learning_rate": 4.554607415437915e-05, |
| "loss": 0.28650322556495667, |
| "mean_token_accuracy": 0.8939759135246277, |
| "num_tokens": 171482.0, |
| "step": 124 |
| }, |
| { |
| "entropy": 0.3131585419178009, |
| "epoch": 0.01927525057825752, |
| "grad_norm": 2.578125, |
| "learning_rate": 4.545280594597935e-05, |
| "loss": 0.2936202585697174, |
| "mean_token_accuracy": 0.8922480344772339, |
| "num_tokens": 172780.0, |
| "step": 125 |
| }, |
| { |
| "entropy": 0.24182380735874176, |
| "epoch": 0.019429452582883578, |
| "grad_norm": 2.234375, |
| "learning_rate": 4.535866886312423e-05, |
| "loss": 0.2440458983182907, |
| "mean_token_accuracy": 0.9163833856582642, |
| "num_tokens": 174259.0, |
| "step": 126 |
| }, |
| { |
| "entropy": 0.2646311819553375, |
| "epoch": 0.019583654587509637, |
| "grad_norm": 2.109375, |
| "learning_rate": 4.526366690493777e-05, |
| "loss": 0.2328074872493744, |
| "mean_token_accuracy": 0.9140625, |
| "num_tokens": 175675.0, |
| "step": 127 |
| }, |
| { |
| "entropy": 0.2266581654548645, |
| "epoch": 0.019737856592135696, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.5167804107285514e-05, |
| "loss": 0.21153169870376587, |
| "mean_token_accuracy": 0.922784149646759, |
| "num_tokens": 177522.0, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.019737856592135696, |
| "eval_entropy": 0.27021819719097073, |
| "eval_loss": 0.26394832134246826, |
| "eval_mean_token_accuracy": 0.9077995745410696, |
| "eval_num_tokens": 177522.0, |
| "eval_runtime": 35.0787, |
| "eval_samples_per_second": 77.854, |
| "eval_steps_per_second": 9.75, |
| "step": 128 |
| }, |
| { |
| "entropy": 0.3175150752067566, |
| "epoch": 0.01989205859676176, |
| "grad_norm": 2.5, |
| "learning_rate": 4.507108454260309e-05, |
| "loss": 0.32345065474510193, |
| "mean_token_accuracy": 0.895765483379364, |
| "num_tokens": 178758.0, |
| "step": 129 |
| }, |
| { |
| "entropy": 0.26202577352523804, |
| "epoch": 0.020046260601387818, |
| "grad_norm": 2.59375, |
| "learning_rate": 4.497351231972329e-05, |
| "loss": 0.247625470161438, |
| "mean_token_accuracy": 0.915336549282074, |
| "num_tokens": 180207.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.23124445974826813, |
| "epoch": 0.020200462606013877, |
| "grad_norm": 2.265625, |
| "learning_rate": 4.487509158370139e-05, |
| "loss": 0.221195787191391, |
| "mean_token_accuracy": 0.9168797731399536, |
| "num_tokens": 181779.0, |
| "step": 131 |
| }, |
| { |
| "entropy": 0.3099311590194702, |
| "epoch": 0.02035466461063994, |
| "grad_norm": 3.03125, |
| "learning_rate": 4.4775826515639205e-05, |
| "loss": 0.3427657186985016, |
| "mean_token_accuracy": 0.8853210806846619, |
| "num_tokens": 182877.0, |
| "step": 132 |
| }, |
| { |
| "entropy": 0.19146594405174255, |
| "epoch": 0.020508866615266, |
| "grad_norm": 1.9296875, |
| "learning_rate": 4.4675721332507345e-05, |
| "loss": 0.18723616003990173, |
| "mean_token_accuracy": 0.9326805472373962, |
| "num_tokens": 184519.0, |
| "step": 133 |
| }, |
| { |
| "entropy": 0.29960504174232483, |
| "epoch": 0.020663068619892058, |
| "grad_norm": 2.796875, |
| "learning_rate": 4.4574780286966154e-05, |
| "loss": 0.31267160177230835, |
| "mean_token_accuracy": 0.890625, |
| "num_tokens": 185423.0, |
| "step": 134 |
| }, |
| { |
| "entropy": 0.26278653740882874, |
| "epoch": 0.02081727062451812, |
| "grad_norm": 2.28125, |
| "learning_rate": 4.4473007667184995e-05, |
| "loss": 0.27267012000083923, |
| "mean_token_accuracy": 0.9038869142532349, |
| "num_tokens": 186846.0, |
| "step": 135 |
| }, |
| { |
| "entropy": 0.18965409696102142, |
| "epoch": 0.02097147262914418, |
| "grad_norm": 2.390625, |
| "learning_rate": 4.43704077966601e-05, |
| "loss": 0.21876873075962067, |
| "mean_token_accuracy": 0.9245041608810425, |
| "num_tokens": 188417.0, |
| "step": 136 |
| }, |
| { |
| "entropy": 0.20953340828418732, |
| "epoch": 0.021125674633770238, |
| "grad_norm": 2.109375, |
| "learning_rate": 4.426698503403091e-05, |
| "loss": 0.205082505941391, |
| "mean_token_accuracy": 0.926571249961853, |
| "num_tokens": 190032.0, |
| "step": 137 |
| }, |
| { |
| "entropy": 0.2490757405757904, |
| "epoch": 0.0212798766383963, |
| "grad_norm": 2.0, |
| "learning_rate": 4.4162743772894905e-05, |
| "loss": 0.23051951825618744, |
| "mean_token_accuracy": 0.9111841917037964, |
| "num_tokens": 191256.0, |
| "step": 138 |
| }, |
| { |
| "entropy": 0.3277740180492401, |
| "epoch": 0.02143407864302236, |
| "grad_norm": 5.03125, |
| "learning_rate": 4.405768844162094e-05, |
| "loss": 0.37247925996780396, |
| "mean_token_accuracy": 0.8656716346740723, |
| "num_tokens": 192202.0, |
| "step": 139 |
| }, |
| { |
| "entropy": 0.20335228741168976, |
| "epoch": 0.02158828064764842, |
| "grad_norm": 2.0625, |
| "learning_rate": 4.395182350316115e-05, |
| "loss": 0.20390284061431885, |
| "mean_token_accuracy": 0.9318037033081055, |
| "num_tokens": 193779.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.222616046667099, |
| "epoch": 0.021742482652274478, |
| "grad_norm": 2.15625, |
| "learning_rate": 4.384515345486131e-05, |
| "loss": 0.22837010025978088, |
| "mean_token_accuracy": 0.9107261896133423, |
| "num_tokens": 195288.0, |
| "step": 141 |
| }, |
| { |
| "entropy": 0.2554439902305603, |
| "epoch": 0.02189668465690054, |
| "grad_norm": 2.65625, |
| "learning_rate": 4.373768282826983e-05, |
| "loss": 0.28548112511634827, |
| "mean_token_accuracy": 0.905958354473114, |
| "num_tokens": 196689.0, |
| "step": 142 |
| }, |
| { |
| "entropy": 0.23849214613437653, |
| "epoch": 0.0220508866615266, |
| "grad_norm": 2.21875, |
| "learning_rate": 4.3629416188945224e-05, |
| "loss": 0.25381097197532654, |
| "mean_token_accuracy": 0.9149101972579956, |
| "num_tokens": 197978.0, |
| "step": 143 |
| }, |
| { |
| "entropy": 0.26421603560447693, |
| "epoch": 0.02220508866615266, |
| "grad_norm": 3.5625, |
| "learning_rate": 4.352035813626214e-05, |
| "loss": 0.27579382061958313, |
| "mean_token_accuracy": 0.8979591727256775, |
| "num_tokens": 199260.0, |
| "step": 144 |
| }, |
| { |
| "entropy": 0.20953713357448578, |
| "epoch": 0.02235929067077872, |
| "grad_norm": 2.328125, |
| "learning_rate": 4.3410513303215985e-05, |
| "loss": 0.1990606188774109, |
| "mean_token_accuracy": 0.9306029677391052, |
| "num_tokens": 201026.0, |
| "step": 145 |
| }, |
| { |
| "entropy": 0.32288917899131775, |
| "epoch": 0.02251349267540478, |
| "grad_norm": 2.984375, |
| "learning_rate": 4.329988635622611e-05, |
| "loss": 0.3260837197303772, |
| "mean_token_accuracy": 0.893796980381012, |
| "num_tokens": 202098.0, |
| "step": 146 |
| }, |
| { |
| "entropy": 0.21132293343544006, |
| "epoch": 0.02266769468003084, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.318848199493758e-05, |
| "loss": 0.19785253703594208, |
| "mean_token_accuracy": 0.9298823475837708, |
| "num_tokens": 204231.0, |
| "step": 147 |
| }, |
| { |
| "entropy": 0.3431147038936615, |
| "epoch": 0.022821896684656902, |
| "grad_norm": 2.84375, |
| "learning_rate": 4.30763049520215e-05, |
| "loss": 0.3377273380756378, |
| "mean_token_accuracy": 0.8919667601585388, |
| "num_tokens": 205322.0, |
| "step": 148 |
| }, |
| { |
| "entropy": 0.24553008377552032, |
| "epoch": 0.02297609868928296, |
| "grad_norm": 2.546875, |
| "learning_rate": 4.296335999297397e-05, |
| "loss": 0.23867689073085785, |
| "mean_token_accuracy": 0.9165446758270264, |
| "num_tokens": 206696.0, |
| "step": 149 |
| }, |
| { |
| "entropy": 0.27541691064834595, |
| "epoch": 0.02313030069390902, |
| "grad_norm": 2.03125, |
| "learning_rate": 4.284965191591364e-05, |
| "loss": 0.25213125348091125, |
| "mean_token_accuracy": 0.914050817489624, |
| "num_tokens": 208042.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.23892685770988464, |
| "epoch": 0.023284502698535083, |
| "grad_norm": 2.03125, |
| "learning_rate": 4.2735185551377895e-05, |
| "loss": 0.20277726650238037, |
| "mean_token_accuracy": 0.9304635524749756, |
| "num_tokens": 209560.0, |
| "step": 151 |
| }, |
| { |
| "entropy": 0.2151283323764801, |
| "epoch": 0.02343870470316114, |
| "grad_norm": 2.015625, |
| "learning_rate": 4.261996576211761e-05, |
| "loss": 0.2226867973804474, |
| "mean_token_accuracy": 0.9178715944290161, |
| "num_tokens": 211297.0, |
| "step": 152 |
| }, |
| { |
| "entropy": 0.2410528063774109, |
| "epoch": 0.0235929067077872, |
| "grad_norm": 2.015625, |
| "learning_rate": 4.25039974428906e-05, |
| "loss": 0.22763265669345856, |
| "mean_token_accuracy": 0.9149277806282043, |
| "num_tokens": 212551.0, |
| "step": 153 |
| }, |
| { |
| "entropy": 0.2535974383354187, |
| "epoch": 0.02374710871241326, |
| "grad_norm": 2.328125, |
| "learning_rate": 4.238728552025365e-05, |
| "loss": 0.2421426922082901, |
| "mean_token_accuracy": 0.9143372178077698, |
| "num_tokens": 213668.0, |
| "step": 154 |
| }, |
| { |
| "entropy": 0.2121782749891281, |
| "epoch": 0.023901310717039322, |
| "grad_norm": 1.5, |
| "learning_rate": 4.226983495235328e-05, |
| "loss": 0.20025445520877838, |
| "mean_token_accuracy": 0.9322981238365173, |
| "num_tokens": 215286.0, |
| "step": 155 |
| }, |
| { |
| "entropy": 0.14580558240413666, |
| "epoch": 0.02405551272166538, |
| "grad_norm": 1.625, |
| "learning_rate": 4.215165072871505e-05, |
| "loss": 0.14826127886772156, |
| "mean_token_accuracy": 0.9467787146568298, |
| "num_tokens": 217436.0, |
| "step": 156 |
| }, |
| { |
| "entropy": 0.2315557599067688, |
| "epoch": 0.02420971472629144, |
| "grad_norm": 2.078125, |
| "learning_rate": 4.203273787003162e-05, |
| "loss": 0.2486051321029663, |
| "mean_token_accuracy": 0.9164133667945862, |
| "num_tokens": 218760.0, |
| "step": 157 |
| }, |
| { |
| "entropy": 0.25005754828453064, |
| "epoch": 0.024363916730917503, |
| "grad_norm": 2.390625, |
| "learning_rate": 4.1913101427949505e-05, |
| "loss": 0.2627011835575104, |
| "mean_token_accuracy": 0.9080632925033569, |
| "num_tokens": 220095.0, |
| "step": 158 |
| }, |
| { |
| "entropy": 0.2149634212255478, |
| "epoch": 0.024518118735543562, |
| "grad_norm": 2.28125, |
| "learning_rate": 4.179274648485438e-05, |
| "loss": 0.21630343794822693, |
| "mean_token_accuracy": 0.9172714352607727, |
| "num_tokens": 221481.0, |
| "step": 159 |
| }, |
| { |
| "entropy": 0.2316989302635193, |
| "epoch": 0.02467232074016962, |
| "grad_norm": 2.3125, |
| "learning_rate": 4.1671678153655256e-05, |
| "loss": 0.240981787443161, |
| "mean_token_accuracy": 0.9135708808898926, |
| "num_tokens": 222808.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.29497963190078735, |
| "epoch": 0.024826522744795684, |
| "grad_norm": 2.40625, |
| "learning_rate": 4.154990157756722e-05, |
| "loss": 0.2961036264896393, |
| "mean_token_accuracy": 0.9030969142913818, |
| "num_tokens": 223817.0, |
| "step": 161 |
| }, |
| { |
| "entropy": 0.22725972533226013, |
| "epoch": 0.024980724749421743, |
| "grad_norm": 2.546875, |
| "learning_rate": 4.142742192989299e-05, |
| "loss": 0.22807390987873077, |
| "mean_token_accuracy": 0.9114027619361877, |
| "num_tokens": 225044.0, |
| "step": 162 |
| }, |
| { |
| "entropy": 0.2280416190624237, |
| "epoch": 0.025134926754047802, |
| "grad_norm": 2.421875, |
| "learning_rate": 4.1304244413803076e-05, |
| "loss": 0.24813513457775116, |
| "mean_token_accuracy": 0.9090909361839294, |
| "num_tokens": 226339.0, |
| "step": 163 |
| }, |
| { |
| "entropy": 0.20092645287513733, |
| "epoch": 0.025289128758673864, |
| "grad_norm": 2.015625, |
| "learning_rate": 4.118037426211482e-05, |
| "loss": 0.22428975999355316, |
| "mean_token_accuracy": 0.9173313975334167, |
| "num_tokens": 227726.0, |
| "step": 164 |
| }, |
| { |
| "entropy": 0.20079851150512695, |
| "epoch": 0.025443330763299923, |
| "grad_norm": 9.5625, |
| "learning_rate": 4.105581673707002e-05, |
| "loss": 0.21033848822116852, |
| "mean_token_accuracy": 0.9232493042945862, |
| "num_tokens": 229519.0, |
| "step": 165 |
| }, |
| { |
| "entropy": 0.25729137659072876, |
| "epoch": 0.025597532767925982, |
| "grad_norm": 2.3125, |
| "learning_rate": 4.0930577130111424e-05, |
| "loss": 0.2733251452445984, |
| "mean_token_accuracy": 0.9045871496200562, |
| "num_tokens": 230617.0, |
| "step": 166 |
| }, |
| { |
| "entropy": 0.20442764461040497, |
| "epoch": 0.02575173477255204, |
| "grad_norm": 1.890625, |
| "learning_rate": 4.080466076165793e-05, |
| "loss": 0.20845486223697662, |
| "mean_token_accuracy": 0.9209572076797485, |
| "num_tokens": 232004.0, |
| "step": 167 |
| }, |
| { |
| "entropy": 0.20175087451934814, |
| "epoch": 0.025905936777178104, |
| "grad_norm": 2.453125, |
| "learning_rate": 4.067807298087857e-05, |
| "loss": 0.21334150433540344, |
| "mean_token_accuracy": 0.9243085980415344, |
| "num_tokens": 233386.0, |
| "step": 168 |
| }, |
| { |
| "entropy": 0.26961395144462585, |
| "epoch": 0.026060138781804163, |
| "grad_norm": 2.125, |
| "learning_rate": 4.055081916546525e-05, |
| "loss": 0.24742326140403748, |
| "mean_token_accuracy": 0.9157986044883728, |
| "num_tokens": 234546.0, |
| "step": 169 |
| }, |
| { |
| "entropy": 0.20450648665428162, |
| "epoch": 0.026214340786430222, |
| "grad_norm": 1.6953125, |
| "learning_rate": 4.042290472140431e-05, |
| "loss": 0.20523257553577423, |
| "mean_token_accuracy": 0.9297789335250854, |
| "num_tokens": 236092.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.2690446972846985, |
| "epoch": 0.026368542791056285, |
| "grad_norm": 2.15625, |
| "learning_rate": 4.029433508274686e-05, |
| "loss": 0.26763197779655457, |
| "mean_token_accuracy": 0.9070660471916199, |
| "num_tokens": 237402.0, |
| "step": 171 |
| }, |
| { |
| "entropy": 0.22288963198661804, |
| "epoch": 0.026522744795682344, |
| "grad_norm": 2.03125, |
| "learning_rate": 4.0165115711377945e-05, |
| "loss": 0.24567259848117828, |
| "mean_token_accuracy": 0.9189382791519165, |
| "num_tokens": 238804.0, |
| "step": 172 |
| }, |
| { |
| "entropy": 0.19029025733470917, |
| "epoch": 0.026676946800308403, |
| "grad_norm": 1.8671875, |
| "learning_rate": 4.003525209678449e-05, |
| "loss": 0.18879841268062592, |
| "mean_token_accuracy": 0.9351808428764343, |
| "num_tokens": 240941.0, |
| "step": 173 |
| }, |
| { |
| "entropy": 0.2573792338371277, |
| "epoch": 0.026831148804934465, |
| "grad_norm": 2.96875, |
| "learning_rate": 3.9904749755822114e-05, |
| "loss": 0.2607381045818329, |
| "mean_token_accuracy": 0.906000018119812, |
| "num_tokens": 242449.0, |
| "step": 174 |
| }, |
| { |
| "entropy": 0.2028045505285263, |
| "epoch": 0.026985350809560524, |
| "grad_norm": 1.3984375, |
| "learning_rate": 3.977361423248075e-05, |
| "loss": 0.1825239360332489, |
| "mean_token_accuracy": 0.9339895844459534, |
| "num_tokens": 244184.0, |
| "step": 175 |
| }, |
| { |
| "entropy": 0.27057698369026184, |
| "epoch": 0.027139552814186584, |
| "grad_norm": 3.140625, |
| "learning_rate": 3.964185109764915e-05, |
| "loss": 0.30133944749832153, |
| "mean_token_accuracy": 0.8857142925262451, |
| "num_tokens": 245347.0, |
| "step": 176 |
| }, |
| { |
| "entropy": 0.18647152185440063, |
| "epoch": 0.027293754818812646, |
| "grad_norm": 1.8046875, |
| "learning_rate": 3.95094659488782e-05, |
| "loss": 0.1798812299966812, |
| "mean_token_accuracy": 0.9323040246963501, |
| "num_tokens": 247039.0, |
| "step": 177 |
| }, |
| { |
| "entropy": 0.2583964765071869, |
| "epoch": 0.027447956823438705, |
| "grad_norm": 2.28125, |
| "learning_rate": 3.9376464410143124e-05, |
| "loss": 0.2609320878982544, |
| "mean_token_accuracy": 0.9023405909538269, |
| "num_tokens": 248286.0, |
| "step": 178 |
| }, |
| { |
| "entropy": 0.24908345937728882, |
| "epoch": 0.027602158828064764, |
| "grad_norm": 2.09375, |
| "learning_rate": 3.9242852131604585e-05, |
| "loss": 0.2381179928779602, |
| "mean_token_accuracy": 0.9222641587257385, |
| "num_tokens": 249619.0, |
| "step": 179 |
| }, |
| { |
| "entropy": 0.21503198146820068, |
| "epoch": 0.027756360832690823, |
| "grad_norm": 2.5, |
| "learning_rate": 3.910863478936864e-05, |
| "loss": 0.2604519724845886, |
| "mean_token_accuracy": 0.9127399921417236, |
| "num_tokens": 251346.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.22753889858722687, |
| "epoch": 0.027910562837316886, |
| "grad_norm": 1.84375, |
| "learning_rate": 3.897381808524562e-05, |
| "loss": 0.23742565512657166, |
| "mean_token_accuracy": 0.9219380617141724, |
| "num_tokens": 252840.0, |
| "step": 181 |
| }, |
| { |
| "entropy": 0.25326159596443176, |
| "epoch": 0.028064764841942945, |
| "grad_norm": 2.203125, |
| "learning_rate": 3.883840774650788e-05, |
| "loss": 0.28680431842803955, |
| "mean_token_accuracy": 0.9005083441734314, |
| "num_tokens": 254225.0, |
| "step": 182 |
| }, |
| { |
| "entropy": 0.24126410484313965, |
| "epoch": 0.028218966846569004, |
| "grad_norm": 2.109375, |
| "learning_rate": 3.870240952564653e-05, |
| "loss": 0.2406134009361267, |
| "mean_token_accuracy": 0.9119541645050049, |
| "num_tokens": 255630.0, |
| "step": 183 |
| }, |
| { |
| "entropy": 0.2304130345582962, |
| "epoch": 0.028373168851195067, |
| "grad_norm": 1.6953125, |
| "learning_rate": 3.856582920012706e-05, |
| "loss": 0.22154204547405243, |
| "mean_token_accuracy": 0.9195979833602905, |
| "num_tokens": 257031.0, |
| "step": 184 |
| }, |
| { |
| "entropy": 0.16509661078453064, |
| "epoch": 0.028527370855821126, |
| "grad_norm": 1.3125, |
| "learning_rate": 3.842867257214383e-05, |
| "loss": 0.15430063009262085, |
| "mean_token_accuracy": 0.940733790397644, |
| "num_tokens": 259165.0, |
| "step": 185 |
| }, |
| { |
| "entropy": 0.24022063612937927, |
| "epoch": 0.028681572860447185, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.8290945468373684e-05, |
| "loss": 0.20412693917751312, |
| "mean_token_accuracy": 0.9327940344810486, |
| "num_tokens": 260780.0, |
| "step": 186 |
| }, |
| { |
| "entropy": 0.2785824239253998, |
| "epoch": 0.028835774865073247, |
| "grad_norm": 2.390625, |
| "learning_rate": 3.8152653739728363e-05, |
| "loss": 0.2689974308013916, |
| "mean_token_accuracy": 0.9066666960716248, |
| "num_tokens": 261988.0, |
| "step": 187 |
| }, |
| { |
| "entropy": 0.20374569296836853, |
| "epoch": 0.028989976869699306, |
| "grad_norm": 2.0, |
| "learning_rate": 3.8013803261105916e-05, |
| "loss": 0.21978892385959625, |
| "mean_token_accuracy": 0.9233038425445557, |
| "num_tokens": 263691.0, |
| "step": 188 |
| }, |
| { |
| "entropy": 0.2387579381465912, |
| "epoch": 0.029144178874325365, |
| "grad_norm": 1.984375, |
| "learning_rate": 3.787439993114123e-05, |
| "loss": 0.23546524345874786, |
| "mean_token_accuracy": 0.9189907312393188, |
| "num_tokens": 265205.0, |
| "step": 189 |
| }, |
| { |
| "entropy": 0.22492903470993042, |
| "epoch": 0.029298380878951428, |
| "grad_norm": 1.8671875, |
| "learning_rate": 3.7734449671955326e-05, |
| "loss": 0.21074332296848297, |
| "mean_token_accuracy": 0.9219586849212646, |
| "num_tokens": 266520.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.19710952043533325, |
| "epoch": 0.029452582883577487, |
| "grad_norm": 1.9296875, |
| "learning_rate": 3.759395842890384e-05, |
| "loss": 0.1993340104818344, |
| "mean_token_accuracy": 0.9277042150497437, |
| "num_tokens": 268340.0, |
| "step": 191 |
| }, |
| { |
| "entropy": 0.24934346973896027, |
| "epoch": 0.029606784888203546, |
| "grad_norm": 1.890625, |
| "learning_rate": 3.7452932170324464e-05, |
| "loss": 0.24506257474422455, |
| "mean_token_accuracy": 0.9209383130073547, |
| "num_tokens": 269499.0, |
| "step": 192 |
| }, |
| { |
| "entropy": 0.2751508355140686, |
| "epoch": 0.029760986892829605, |
| "grad_norm": 2.4375, |
| "learning_rate": 3.731137688728335e-05, |
| "loss": 0.28203558921813965, |
| "mean_token_accuracy": 0.9066317677497864, |
| "num_tokens": 270653.0, |
| "step": 193 |
| }, |
| { |
| "entropy": 0.2998161017894745, |
| "epoch": 0.029915188897455668, |
| "grad_norm": 2.640625, |
| "learning_rate": 3.716929859332063e-05, |
| "loss": 0.2953347861766815, |
| "mean_token_accuracy": 0.9018287062644958, |
| "num_tokens": 271700.0, |
| "step": 194 |
| }, |
| { |
| "entropy": 0.2493629902601242, |
| "epoch": 0.030069390902081727, |
| "grad_norm": 2.1875, |
| "learning_rate": 3.7026703324194966e-05, |
| "loss": 0.26706650853157043, |
| "mean_token_accuracy": 0.9076277017593384, |
| "num_tokens": 273137.0, |
| "step": 195 |
| }, |
| { |
| "entropy": 0.20723779499530792, |
| "epoch": 0.030223592906707786, |
| "grad_norm": 2.140625, |
| "learning_rate": 3.688359713762707e-05, |
| "loss": 0.22939355671405792, |
| "mean_token_accuracy": 0.9125827550888062, |
| "num_tokens": 274655.0, |
| "step": 196 |
| }, |
| { |
| "entropy": 0.22990985214710236, |
| "epoch": 0.03037779491133385, |
| "grad_norm": 2.046875, |
| "learning_rate": 3.673998611304246e-05, |
| "loss": 0.2153758704662323, |
| "mean_token_accuracy": 0.9279279112815857, |
| "num_tokens": 275773.0, |
| "step": 197 |
| }, |
| { |
| "entropy": 0.29038283228874207, |
| "epoch": 0.030531996915959907, |
| "grad_norm": 2.71875, |
| "learning_rate": 3.6595876351313116e-05, |
| "loss": 0.304492324590683, |
| "mean_token_accuracy": 0.9004576802253723, |
| "num_tokens": 276655.0, |
| "step": 198 |
| }, |
| { |
| "entropy": 0.19836601614952087, |
| "epoch": 0.030686198920585966, |
| "grad_norm": 1.8359375, |
| "learning_rate": 3.645127397449832e-05, |
| "loss": 0.2065221518278122, |
| "mean_token_accuracy": 0.9339622855186462, |
| "num_tokens": 278359.0, |
| "step": 199 |
| }, |
| { |
| "entropy": 0.25179192423820496, |
| "epoch": 0.03084040092521203, |
| "grad_norm": 2.265625, |
| "learning_rate": 3.6306185125584615e-05, |
| "loss": 0.2616140842437744, |
| "mean_token_accuracy": 0.9063336253166199, |
| "num_tokens": 279488.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.18242394924163818, |
| "epoch": 0.030994602929838088, |
| "grad_norm": 1.734375, |
| "learning_rate": 3.616061596822478e-05, |
| "loss": 0.17770832777023315, |
| "mean_token_accuracy": 0.9277376532554626, |
| "num_tokens": 281295.0, |
| "step": 201 |
| }, |
| { |
| "entropy": 0.24629506468772888, |
| "epoch": 0.031148804934464147, |
| "grad_norm": 2.4375, |
| "learning_rate": 3.601457268647606e-05, |
| "loss": 0.2535253167152405, |
| "mean_token_accuracy": 0.9059450030326843, |
| "num_tokens": 282430.0, |
| "step": 202 |
| }, |
| { |
| "entropy": 0.19920703768730164, |
| "epoch": 0.03130300693909021, |
| "grad_norm": 1.921875, |
| "learning_rate": 3.586806148453736e-05, |
| "loss": 0.20293940603733063, |
| "mean_token_accuracy": 0.9283132553100586, |
| "num_tokens": 284098.0, |
| "step": 203 |
| }, |
| { |
| "entropy": 0.1916186809539795, |
| "epoch": 0.031457208943716265, |
| "grad_norm": 1.578125, |
| "learning_rate": 3.572108858648579e-05, |
| "loss": 0.1925540268421173, |
| "mean_token_accuracy": 0.9329091906547546, |
| "num_tokens": 285835.0, |
| "step": 204 |
| }, |
| { |
| "entropy": 0.24154330790042877, |
| "epoch": 0.03161141094834233, |
| "grad_norm": 2.0625, |
| "learning_rate": 3.557366023601216e-05, |
| "loss": 0.2560335099697113, |
| "mean_token_accuracy": 0.9222126007080078, |
| "num_tokens": 287000.0, |
| "step": 205 |
| }, |
| { |
| "entropy": 0.24839094281196594, |
| "epoch": 0.03176561295296839, |
| "grad_norm": 2.09375, |
| "learning_rate": 3.542578269615579e-05, |
| "loss": 0.24170006811618805, |
| "mean_token_accuracy": 0.9167927503585815, |
| "num_tokens": 288330.0, |
| "step": 206 |
| }, |
| { |
| "entropy": 0.19456236064434052, |
| "epoch": 0.031919814957594446, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.527746224903842e-05, |
| "loss": 0.18520742654800415, |
| "mean_token_accuracy": 0.9366295337677002, |
| "num_tokens": 289774.0, |
| "step": 207 |
| }, |
| { |
| "entropy": 0.24151258170604706, |
| "epoch": 0.03207401696222051, |
| "grad_norm": 1.8828125, |
| "learning_rate": 3.512870519559733e-05, |
| "loss": 0.22108638286590576, |
| "mean_token_accuracy": 0.9167962670326233, |
| "num_tokens": 291068.0, |
| "step": 208 |
| }, |
| { |
| "entropy": 0.3510158658027649, |
| "epoch": 0.03222821896684657, |
| "grad_norm": 3.71875, |
| "learning_rate": 3.49795178553177e-05, |
| "loss": 0.41906648874282837, |
| "mean_token_accuracy": 0.8701754212379456, |
| "num_tokens": 291931.0, |
| "step": 209 |
| }, |
| { |
| "entropy": 0.3286966383457184, |
| "epoch": 0.03238242097147263, |
| "grad_norm": 3.171875, |
| "learning_rate": 3.48299065659641e-05, |
| "loss": 0.343354731798172, |
| "mean_token_accuracy": 0.8834951519966125, |
| "num_tokens": 292866.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.19397929310798645, |
| "epoch": 0.03253662297609869, |
| "grad_norm": 1.6875, |
| "learning_rate": 3.467987768331127e-05, |
| "loss": 0.1917928159236908, |
| "mean_token_accuracy": 0.9349930882453918, |
| "num_tokens": 294320.0, |
| "step": 211 |
| }, |
| { |
| "entropy": 0.2259572446346283, |
| "epoch": 0.03269082498072475, |
| "grad_norm": 2.203125, |
| "learning_rate": 3.452943758087414e-05, |
| "loss": 0.24537329375743866, |
| "mean_token_accuracy": 0.9182724356651306, |
| "num_tokens": 295833.0, |
| "step": 212 |
| }, |
| { |
| "entropy": 0.22965691983699799, |
| "epoch": 0.03284502698535081, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.437859264963702e-05, |
| "loss": 0.2151767462491989, |
| "mean_token_accuracy": 0.9223232865333557, |
| "num_tokens": 297270.0, |
| "step": 213 |
| }, |
| { |
| "entropy": 0.2611003518104553, |
| "epoch": 0.03299922898997687, |
| "grad_norm": 2.890625, |
| "learning_rate": 3.422734929778213e-05, |
| "loss": 0.2612400949001312, |
| "mean_token_accuracy": 0.8977055549621582, |
| "num_tokens": 298324.0, |
| "step": 214 |
| }, |
| { |
| "entropy": 0.1909189224243164, |
| "epoch": 0.03315343099460293, |
| "grad_norm": 1.8671875, |
| "learning_rate": 3.407571395041736e-05, |
| "loss": 0.20462700724601746, |
| "mean_token_accuracy": 0.9242695569992065, |
| "num_tokens": 300009.0, |
| "step": 215 |
| }, |
| { |
| "entropy": 0.2556368410587311, |
| "epoch": 0.03330763299922899, |
| "grad_norm": 2.03125, |
| "learning_rate": 3.392369304930334e-05, |
| "loss": 0.2566298246383667, |
| "mean_token_accuracy": 0.9090163707733154, |
| "num_tokens": 301237.0, |
| "step": 216 |
| }, |
| { |
| "entropy": 0.27811554074287415, |
| "epoch": 0.03346183500385505, |
| "grad_norm": 2.0625, |
| "learning_rate": 3.377129305257975e-05, |
| "loss": 0.2745239734649658, |
| "mean_token_accuracy": 0.9044750332832336, |
| "num_tokens": 302407.0, |
| "step": 217 |
| }, |
| { |
| "entropy": 0.21509166061878204, |
| "epoch": 0.03361603700848111, |
| "grad_norm": 1.84375, |
| "learning_rate": 3.361852043449096e-05, |
| "loss": 0.2006048709154129, |
| "mean_token_accuracy": 0.9250646233558655, |
| "num_tokens": 303963.0, |
| "step": 218 |
| }, |
| { |
| "entropy": 0.2612791359424591, |
| "epoch": 0.03377023901310717, |
| "grad_norm": 2.0, |
| "learning_rate": 3.3465381685111054e-05, |
| "loss": 0.27390342950820923, |
| "mean_token_accuracy": 0.8982036113739014, |
| "num_tokens": 305140.0, |
| "step": 219 |
| }, |
| { |
| "entropy": 0.2126745879650116, |
| "epoch": 0.03392444101773323, |
| "grad_norm": 1.609375, |
| "learning_rate": 3.331188331006804e-05, |
| "loss": 0.20790794491767883, |
| "mean_token_accuracy": 0.9276844263076782, |
| "num_tokens": 306517.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.216102734208107, |
| "epoch": 0.034078643022359294, |
| "grad_norm": 1.53125, |
| "learning_rate": 3.315803183026753e-05, |
| "loss": 0.2031707614660263, |
| "mean_token_accuracy": 0.9320327043533325, |
| "num_tokens": 308114.0, |
| "step": 221 |
| }, |
| { |
| "entropy": 0.23003709316253662, |
| "epoch": 0.03423284502698535, |
| "grad_norm": 2.09375, |
| "learning_rate": 3.30038337816157e-05, |
| "loss": 0.24152696132659912, |
| "mean_token_accuracy": 0.9172229766845703, |
| "num_tokens": 309620.0, |
| "step": 222 |
| }, |
| { |
| "entropy": 0.25657832622528076, |
| "epoch": 0.03438704703161141, |
| "grad_norm": 1.9375, |
| "learning_rate": 3.284929571474164e-05, |
| "loss": 0.2669946551322937, |
| "mean_token_accuracy": 0.9029045701026917, |
| "num_tokens": 310833.0, |
| "step": 223 |
| }, |
| { |
| "entropy": 0.23583689332008362, |
| "epoch": 0.034541249036237474, |
| "grad_norm": 2.125, |
| "learning_rate": 3.2694424194719046e-05, |
| "loss": 0.24596942961215973, |
| "mean_token_accuracy": 0.9083665609359741, |
| "num_tokens": 312096.0, |
| "step": 224 |
| }, |
| { |
| "entropy": 0.197276309132576, |
| "epoch": 0.03469545104086353, |
| "grad_norm": 1.703125, |
| "learning_rate": 3.2539225800787385e-05, |
| "loss": 0.19344845414161682, |
| "mean_token_accuracy": 0.93291836977005, |
| "num_tokens": 313550.0, |
| "step": 225 |
| }, |
| { |
| "entropy": 0.3082696497440338, |
| "epoch": 0.03484965304548959, |
| "grad_norm": 3.484375, |
| "learning_rate": 3.2383707126072315e-05, |
| "loss": 0.3064239025115967, |
| "mean_token_accuracy": 0.8925233483314514, |
| "num_tokens": 314628.0, |
| "step": 226 |
| }, |
| { |
| "entropy": 0.19953380525112152, |
| "epoch": 0.03500385505011565, |
| "grad_norm": 1.734375, |
| "learning_rate": 3.222787477730567e-05, |
| "loss": 0.19340643286705017, |
| "mean_token_accuracy": 0.9274017214775085, |
| "num_tokens": 316468.0, |
| "step": 227 |
| }, |
| { |
| "entropy": 0.27000153064727783, |
| "epoch": 0.03515805705474171, |
| "grad_norm": 3.828125, |
| "learning_rate": 3.207173537454472e-05, |
| "loss": 0.2817123830318451, |
| "mean_token_accuracy": 0.9068965315818787, |
| "num_tokens": 317636.0, |
| "step": 228 |
| }, |
| { |
| "entropy": 0.22825853526592255, |
| "epoch": 0.03531225905936777, |
| "grad_norm": 2.125, |
| "learning_rate": 3.191529555089102e-05, |
| "loss": 0.22379839420318604, |
| "mean_token_accuracy": 0.9244868159294128, |
| "num_tokens": 319008.0, |
| "step": 229 |
| }, |
| { |
| "entropy": 0.2942773997783661, |
| "epoch": 0.03546646106399383, |
| "grad_norm": 2.6875, |
| "learning_rate": 3.175856195220855e-05, |
| "loss": 0.2916644215583801, |
| "mean_token_accuracy": 0.8996211886405945, |
| "num_tokens": 320072.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.2531821131706238, |
| "epoch": 0.03562066306861989, |
| "grad_norm": 2.265625, |
| "learning_rate": 3.160154123684143e-05, |
| "loss": 0.2512527108192444, |
| "mean_token_accuracy": 0.9058629274368286, |
| "num_tokens": 321291.0, |
| "step": 231 |
| }, |
| { |
| "entropy": 0.234887957572937, |
| "epoch": 0.035774865073245954, |
| "grad_norm": 1.9140625, |
| "learning_rate": 3.1444240075331054e-05, |
| "loss": 0.2259407341480255, |
| "mean_token_accuracy": 0.9231894612312317, |
| "num_tokens": 322666.0, |
| "step": 232 |
| }, |
| { |
| "entropy": 0.23325884342193604, |
| "epoch": 0.03592906707787201, |
| "grad_norm": 1.96875, |
| "learning_rate": 3.128666515013269e-05, |
| "loss": 0.2157772332429886, |
| "mean_token_accuracy": 0.9207017421722412, |
| "num_tokens": 324099.0, |
| "step": 233 |
| }, |
| { |
| "entropy": 0.15830406546592712, |
| "epoch": 0.03608326908249807, |
| "grad_norm": 1.03125, |
| "learning_rate": 3.112882315533163e-05, |
| "loss": 0.1372249573469162, |
| "mean_token_accuracy": 0.9470046162605286, |
| "num_tokens": 326277.0, |
| "step": 234 |
| }, |
| { |
| "entropy": 0.25762706995010376, |
| "epoch": 0.036237471087124135, |
| "grad_norm": 1.828125, |
| "learning_rate": 3.097072079635878e-05, |
| "loss": 0.23957906663417816, |
| "mean_token_accuracy": 0.915335476398468, |
| "num_tokens": 327537.0, |
| "step": 235 |
| }, |
| { |
| "entropy": 0.21047890186309814, |
| "epoch": 0.03639167309175019, |
| "grad_norm": 1.7421875, |
| "learning_rate": 3.081236478970583e-05, |
| "loss": 0.22065354883670807, |
| "mean_token_accuracy": 0.9236826300621033, |
| "num_tokens": 329196.0, |
| "step": 236 |
| }, |
| { |
| "entropy": 0.22569093108177185, |
| "epoch": 0.03654587509637625, |
| "grad_norm": 1.90625, |
| "learning_rate": 3.065376186263991e-05, |
| "loss": 0.21428702771663666, |
| "mean_token_accuracy": 0.9252577424049377, |
| "num_tokens": 330368.0, |
| "step": 237 |
| }, |
| { |
| "entropy": 0.2325230836868286, |
| "epoch": 0.036700077101002315, |
| "grad_norm": 1.7578125, |
| "learning_rate": 3.049491875291778e-05, |
| "loss": 0.23734821379184723, |
| "mean_token_accuracy": 0.9114202260971069, |
| "num_tokens": 331742.0, |
| "step": 238 |
| }, |
| { |
| "entropy": 0.2122831493616104, |
| "epoch": 0.03685427910562837, |
| "grad_norm": 1.609375, |
| "learning_rate": 3.0335842208499637e-05, |
| "loss": 0.2174147367477417, |
| "mean_token_accuracy": 0.9171270728111267, |
| "num_tokens": 333198.0, |
| "step": 239 |
| }, |
| { |
| "entropy": 0.23024694621562958, |
| "epoch": 0.03700848111025443, |
| "grad_norm": 2.046875, |
| "learning_rate": 3.0176538987262442e-05, |
| "loss": 0.2907542288303375, |
| "mean_token_accuracy": 0.9019264578819275, |
| "num_tokens": 334348.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.2648603022098541, |
| "epoch": 0.037162683114880496, |
| "grad_norm": 1.875, |
| "learning_rate": 3.0017015856712814e-05, |
| "loss": 0.2652634382247925, |
| "mean_token_accuracy": 0.9065656661987305, |
| "num_tokens": 335544.0, |
| "step": 241 |
| }, |
| { |
| "entropy": 0.2533347010612488, |
| "epoch": 0.03731688511950655, |
| "grad_norm": 1.96875, |
| "learning_rate": 2.9857279593699544e-05, |
| "loss": 0.2646684944629669, |
| "mean_token_accuracy": 0.9075286388397217, |
| "num_tokens": 336774.0, |
| "step": 242 |
| }, |
| { |
| "entropy": 0.22679953277111053, |
| "epoch": 0.037471087124132614, |
| "grad_norm": 2.078125, |
| "learning_rate": 2.9697336984125683e-05, |
| "loss": 0.22257877886295319, |
| "mean_token_accuracy": 0.9175019264221191, |
| "num_tokens": 338079.0, |
| "step": 243 |
| }, |
| { |
| "entropy": 0.19455574452877045, |
| "epoch": 0.03762528912875868, |
| "grad_norm": 1.5546875, |
| "learning_rate": 2.9537194822660295e-05, |
| "loss": 0.19329281151294708, |
| "mean_token_accuracy": 0.9266055226325989, |
| "num_tokens": 339722.0, |
| "step": 244 |
| }, |
| { |
| "entropy": 0.20773011445999146, |
| "epoch": 0.03777949113338473, |
| "grad_norm": 1.9453125, |
| "learning_rate": 2.9376859912449794e-05, |
| "loss": 0.20826096832752228, |
| "mean_token_accuracy": 0.9232895374298096, |
| "num_tokens": 341177.0, |
| "step": 245 |
| }, |
| { |
| "entropy": 0.2844797372817993, |
| "epoch": 0.037933693138010795, |
| "grad_norm": 1.796875, |
| "learning_rate": 2.9216339064828914e-05, |
| "loss": 0.2653990387916565, |
| "mean_token_accuracy": 0.910646378993988, |
| "num_tokens": 342237.0, |
| "step": 246 |
| }, |
| { |
| "entropy": 0.19197861850261688, |
| "epoch": 0.03808789514263686, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.9055639099031386e-05, |
| "loss": 0.191925048828125, |
| "mean_token_accuracy": 0.9356250166893005, |
| "num_tokens": 343845.0, |
| "step": 247 |
| }, |
| { |
| "entropy": 0.28776344656944275, |
| "epoch": 0.03824209714726291, |
| "grad_norm": 2.59375, |
| "learning_rate": 2.8894766841900223e-05, |
| "loss": 0.27679842710494995, |
| "mean_token_accuracy": 0.9086069464683533, |
| "num_tokens": 344980.0, |
| "step": 248 |
| }, |
| { |
| "entropy": 0.23193758726119995, |
| "epoch": 0.038396299151888975, |
| "grad_norm": 1.9765625, |
| "learning_rate": 2.8733729127597692e-05, |
| "loss": 0.2313500940799713, |
| "mean_token_accuracy": 0.9189602732658386, |
| "num_tokens": 346296.0, |
| "step": 249 |
| }, |
| { |
| "entropy": 0.19187554717063904, |
| "epoch": 0.03855050115651504, |
| "grad_norm": 1.4765625, |
| "learning_rate": 2.8572532797315006e-05, |
| "loss": 0.17860986292362213, |
| "mean_token_accuracy": 0.9357484579086304, |
| "num_tokens": 347767.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.26534777879714966, |
| "epoch": 0.038704703161141094, |
| "grad_norm": 2.234375, |
| "learning_rate": 2.8411184698981684e-05, |
| "loss": 0.2811349630355835, |
| "mean_token_accuracy": 0.9026548862457275, |
| "num_tokens": 349131.0, |
| "step": 251 |
| }, |
| { |
| "entropy": 0.19166985154151917, |
| "epoch": 0.038858905165767156, |
| "grad_norm": 1.4375, |
| "learning_rate": 2.824969168697466e-05, |
| "loss": 0.1818903237581253, |
| "mean_token_accuracy": 0.9364994764328003, |
| "num_tokens": 351013.0, |
| "step": 252 |
| }, |
| { |
| "entropy": 0.2197422981262207, |
| "epoch": 0.03901310717039321, |
| "grad_norm": 2.0, |
| "learning_rate": 2.808806062182705e-05, |
| "loss": 0.24899303913116455, |
| "mean_token_accuracy": 0.9060351252555847, |
| "num_tokens": 352330.0, |
| "step": 253 |
| }, |
| { |
| "entropy": 0.24478891491889954, |
| "epoch": 0.039167309175019274, |
| "grad_norm": 2.046875, |
| "learning_rate": 2.792629836993676e-05, |
| "loss": 0.24458467960357666, |
| "mean_token_accuracy": 0.914650559425354, |
| "num_tokens": 353826.0, |
| "step": 254 |
| }, |
| { |
| "entropy": 0.17300452291965485, |
| "epoch": 0.03932151117964534, |
| "grad_norm": 1.453125, |
| "learning_rate": 2.776441180327475e-05, |
| "loss": 0.1748412549495697, |
| "mean_token_accuracy": 0.9393326640129089, |
| "num_tokens": 355812.0, |
| "step": 255 |
| }, |
| { |
| "entropy": 0.28217461705207825, |
| "epoch": 0.03947571318427139, |
| "grad_norm": 2.375, |
| "learning_rate": 2.76024077990931e-05, |
| "loss": 0.28308406472206116, |
| "mean_token_accuracy": 0.908906877040863, |
| "num_tokens": 356808.0, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.03947571318427139, |
| "eval_entropy": 0.2422610384068991, |
| "eval_loss": 0.2376217544078827, |
| "eval_mean_token_accuracy": 0.9154835451416105, |
| "eval_num_tokens": 356808.0, |
| "eval_runtime": 34.9417, |
| "eval_samples_per_second": 78.159, |
| "eval_steps_per_second": 9.788, |
| "step": 256 |
| }, |
| { |
| "entropy": 0.2056795060634613, |
| "epoch": 0.039629915188897455, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.7440293239632885e-05, |
| "loss": 0.1848773956298828, |
| "mean_token_accuracy": 0.9414348602294922, |
| "num_tokens": 358182.0, |
| "step": 257 |
| }, |
| { |
| "entropy": 0.21008774638175964, |
| "epoch": 0.03978411719352352, |
| "grad_norm": 2.125, |
| "learning_rate": 2.7278075011831757e-05, |
| "loss": 0.23831506073474884, |
| "mean_token_accuracy": 0.9120956659317017, |
| "num_tokens": 359612.0, |
| "step": 258 |
| }, |
| { |
| "entropy": 0.22274059057235718, |
| "epoch": 0.03993831919814957, |
| "grad_norm": 2.078125, |
| "learning_rate": 2.711576000703141e-05, |
| "loss": 0.22159968316555023, |
| "mean_token_accuracy": 0.9259036183357239, |
| "num_tokens": 361280.0, |
| "step": 259 |
| }, |
| { |
| "entropy": 0.24206753075122833, |
| "epoch": 0.040092521202775636, |
| "grad_norm": 2.21875, |
| "learning_rate": 2.6953355120684802e-05, |
| "loss": 0.2599974274635315, |
| "mean_token_accuracy": 0.915960431098938, |
| "num_tokens": 362704.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.22195129096508026, |
| "epoch": 0.0402467232074017, |
| "grad_norm": 1.8203125, |
| "learning_rate": 2.6790867252063247e-05, |
| "loss": 0.22732976078987122, |
| "mean_token_accuracy": 0.9146426320075989, |
| "num_tokens": 364153.0, |
| "step": 261 |
| }, |
| { |
| "entropy": 0.19769293069839478, |
| "epoch": 0.040400925212027754, |
| "grad_norm": 1.5390625, |
| "learning_rate": 2.6628303303963288e-05, |
| "loss": 0.18025925755500793, |
| "mean_token_accuracy": 0.9401107430458069, |
| "num_tokens": 366148.0, |
| "step": 262 |
| }, |
| { |
| "entropy": 0.36093661189079285, |
| "epoch": 0.040555127216653816, |
| "grad_norm": 2.828125, |
| "learning_rate": 2.646567018241349e-05, |
| "loss": 0.36829474568367004, |
| "mean_token_accuracy": 0.8780487775802612, |
| "num_tokens": 367140.0, |
| "step": 263 |
| }, |
| { |
| "entropy": 0.28070077300071716, |
| "epoch": 0.04070932922127988, |
| "grad_norm": 2.171875, |
| "learning_rate": 2.6302974796381015e-05, |
| "loss": 0.27073192596435547, |
| "mean_token_accuracy": 0.9048058986663818, |
| "num_tokens": 368230.0, |
| "step": 264 |
| }, |
| { |
| "entropy": 0.28238415718078613, |
| "epoch": 0.040863531225905934, |
| "grad_norm": 2.078125, |
| "learning_rate": 2.6140224057478158e-05, |
| "loss": 0.2595861256122589, |
| "mean_token_accuracy": 0.9181897044181824, |
| "num_tokens": 369387.0, |
| "step": 265 |
| }, |
| { |
| "entropy": 0.24161042273044586, |
| "epoch": 0.041017733230532, |
| "grad_norm": 1.6328125, |
| "learning_rate": 2.5977424879668705e-05, |
| "loss": 0.22480149567127228, |
| "mean_token_accuracy": 0.9269341230392456, |
| "num_tokens": 370791.0, |
| "step": 266 |
| }, |
| { |
| "entropy": 0.1969321221113205, |
| "epoch": 0.04117193523515806, |
| "grad_norm": 1.53125, |
| "learning_rate": 2.5814584178974218e-05, |
| "loss": 0.1720927655696869, |
| "mean_token_accuracy": 0.934974730014801, |
| "num_tokens": 372383.0, |
| "step": 267 |
| }, |
| { |
| "entropy": 0.23700961470603943, |
| "epoch": 0.041326137239784115, |
| "grad_norm": 1.921875, |
| "learning_rate": 2.5651708873180223e-05, |
| "loss": 0.22749063372612, |
| "mean_token_accuracy": 0.917475700378418, |
| "num_tokens": 373627.0, |
| "step": 268 |
| }, |
| { |
| "entropy": 0.22176285088062286, |
| "epoch": 0.04148033924441018, |
| "grad_norm": 1.4375, |
| "learning_rate": 2.5488805881542356e-05, |
| "loss": 0.19518814980983734, |
| "mean_token_accuracy": 0.922112226486206, |
| "num_tokens": 375150.0, |
| "step": 269 |
| }, |
| { |
| "entropy": 0.19811592996120453, |
| "epoch": 0.04163454124903624, |
| "grad_norm": 1.65625, |
| "learning_rate": 2.5325882124492395e-05, |
| "loss": 0.2038094401359558, |
| "mean_token_accuracy": 0.9243918657302856, |
| "num_tokens": 376679.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.16331960260868073, |
| "epoch": 0.041788743253662296, |
| "grad_norm": 1.296875, |
| "learning_rate": 2.5162944523344256e-05, |
| "loss": 0.15330754220485687, |
| "mean_token_accuracy": 0.9463318586349487, |
| "num_tokens": 378718.0, |
| "step": 271 |
| }, |
| { |
| "entropy": 0.2266637682914734, |
| "epoch": 0.04194294525828836, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.5e-05, |
| "loss": 0.20924291014671326, |
| "mean_token_accuracy": 0.9225251078605652, |
| "num_tokens": 380120.0, |
| "step": 272 |
| }, |
| { |
| "entropy": 0.27386748790740967, |
| "epoch": 0.04209714726291442, |
| "grad_norm": 2.296875, |
| "learning_rate": 2.4837055476655746e-05, |
| "loss": 0.28491681814193726, |
| "mean_token_accuracy": 0.9068265557289124, |
| "num_tokens": 381212.0, |
| "step": 273 |
| }, |
| { |
| "entropy": 0.2462942749261856, |
| "epoch": 0.042251349267540476, |
| "grad_norm": 1.9375, |
| "learning_rate": 2.4674117875507615e-05, |
| "loss": 0.23223665356636047, |
| "mean_token_accuracy": 0.9165329337120056, |
| "num_tokens": 382466.0, |
| "step": 274 |
| }, |
| { |
| "entropy": 0.2614425718784332, |
| "epoch": 0.04240555127216654, |
| "grad_norm": 2.265625, |
| "learning_rate": 2.451119411845765e-05, |
| "loss": 0.27489128708839417, |
| "mean_token_accuracy": 0.9016948938369751, |
| "num_tokens": 383654.0, |
| "step": 275 |
| }, |
| { |
| "entropy": 0.21999643743038177, |
| "epoch": 0.0425597532767926, |
| "grad_norm": 2.140625, |
| "learning_rate": 2.4348291126819783e-05, |
| "loss": 0.2654040455818176, |
| "mean_token_accuracy": 0.9077669978141785, |
| "num_tokens": 385104.0, |
| "step": 276 |
| }, |
| { |
| "entropy": 0.2447359710931778, |
| "epoch": 0.04271395528141866, |
| "grad_norm": 2.546875, |
| "learning_rate": 2.4185415821025795e-05, |
| "loss": 0.2940978705883026, |
| "mean_token_accuracy": 0.8986432552337646, |
| "num_tokens": 386365.0, |
| "step": 277 |
| }, |
| { |
| "entropy": 0.24432024359703064, |
| "epoch": 0.04286815728604472, |
| "grad_norm": 2.171875, |
| "learning_rate": 2.4022575120331307e-05, |
| "loss": 0.2683406174182892, |
| "mean_token_accuracy": 0.9004524946212769, |
| "num_tokens": 387478.0, |
| "step": 278 |
| }, |
| { |
| "entropy": 0.19444933533668518, |
| "epoch": 0.04302235929067078, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.3859775942521854e-05, |
| "loss": 0.18984566628932953, |
| "mean_token_accuracy": 0.9271844625473022, |
| "num_tokens": 388928.0, |
| "step": 279 |
| }, |
| { |
| "entropy": 0.25862905383110046, |
| "epoch": 0.04317656129529684, |
| "grad_norm": 2.359375, |
| "learning_rate": 2.3697025203618987e-05, |
| "loss": 0.2914562523365021, |
| "mean_token_accuracy": 0.906593382358551, |
| "num_tokens": 390210.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.2573435604572296, |
| "epoch": 0.0433307632999229, |
| "grad_norm": 2.15625, |
| "learning_rate": 2.3534329817586513e-05, |
| "loss": 0.25994932651519775, |
| "mean_token_accuracy": 0.9036144614219666, |
| "num_tokens": 391214.0, |
| "step": 281 |
| }, |
| { |
| "entropy": 0.25984057784080505, |
| "epoch": 0.043484965304548956, |
| "grad_norm": 2.109375, |
| "learning_rate": 2.3371696696036715e-05, |
| "loss": 0.23992516100406647, |
| "mean_token_accuracy": 0.9247743487358093, |
| "num_tokens": 392219.0, |
| "step": 282 |
| }, |
| { |
| "entropy": 0.20528267323970795, |
| "epoch": 0.04363916730917502, |
| "grad_norm": 1.9140625, |
| "learning_rate": 2.320913274793676e-05, |
| "loss": 0.20434120297431946, |
| "mean_token_accuracy": 0.9243749976158142, |
| "num_tokens": 393827.0, |
| "step": 283 |
| }, |
| { |
| "entropy": 0.44059571623802185, |
| "epoch": 0.04379336931380108, |
| "grad_norm": 3.546875, |
| "learning_rate": 2.30466448793152e-05, |
| "loss": 0.49274563789367676, |
| "mean_token_accuracy": 0.834419846534729, |
| "num_tokens": 394602.0, |
| "step": 284 |
| }, |
| { |
| "entropy": 0.24022506177425385, |
| "epoch": 0.04394757131842714, |
| "grad_norm": 1.921875, |
| "learning_rate": 2.28842399929686e-05, |
| "loss": 0.23765617609024048, |
| "mean_token_accuracy": 0.9164490699768066, |
| "num_tokens": 395759.0, |
| "step": 285 |
| }, |
| { |
| "entropy": 0.23994681239128113, |
| "epoch": 0.0441017733230532, |
| "grad_norm": 1.84375, |
| "learning_rate": 2.272192498816825e-05, |
| "loss": 0.2343621850013733, |
| "mean_token_accuracy": 0.9188445806503296, |
| "num_tokens": 397221.0, |
| "step": 286 |
| }, |
| { |
| "entropy": 0.27961966395378113, |
| "epoch": 0.04425597532767926, |
| "grad_norm": 2.25, |
| "learning_rate": 2.255970676036712e-05, |
| "loss": 0.27381986379623413, |
| "mean_token_accuracy": 0.8992950916290283, |
| "num_tokens": 398222.0, |
| "step": 287 |
| }, |
| { |
| "entropy": 0.1786043792963028, |
| "epoch": 0.04441017733230532, |
| "grad_norm": 1.4921875, |
| "learning_rate": 2.2397592200906906e-05, |
| "loss": 0.17795482277870178, |
| "mean_token_accuracy": 0.9386597871780396, |
| "num_tokens": 400170.0, |
| "step": 288 |
| }, |
| { |
| "entropy": 0.1822587549686432, |
| "epoch": 0.04456437933693138, |
| "grad_norm": 1.375, |
| "learning_rate": 2.223558819672526e-05, |
| "loss": 0.1628590077161789, |
| "mean_token_accuracy": 0.9355238676071167, |
| "num_tokens": 401791.0, |
| "step": 289 |
| }, |
| { |
| "entropy": 0.22401201725006104, |
| "epoch": 0.04471858134155744, |
| "grad_norm": 1.9765625, |
| "learning_rate": 2.2073701630063243e-05, |
| "loss": 0.23397932946681976, |
| "mean_token_accuracy": 0.9228187799453735, |
| "num_tokens": 403289.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.26227450370788574, |
| "epoch": 0.0448727833461835, |
| "grad_norm": 2.28125, |
| "learning_rate": 2.1911939378172956e-05, |
| "loss": 0.2669812738895416, |
| "mean_token_accuracy": 0.9153226017951965, |
| "num_tokens": 404537.0, |
| "step": 291 |
| }, |
| { |
| "entropy": 0.21649585664272308, |
| "epoch": 0.04502698535080956, |
| "grad_norm": 1.46875, |
| "learning_rate": 2.175030831302535e-05, |
| "loss": 0.18651390075683594, |
| "mean_token_accuracy": 0.9295774698257446, |
| "num_tokens": 405894.0, |
| "step": 292 |
| }, |
| { |
| "entropy": 0.2264479100704193, |
| "epoch": 0.04518118735543562, |
| "grad_norm": 2.046875, |
| "learning_rate": 2.158881530101832e-05, |
| "loss": 0.24527707695960999, |
| "mean_token_accuracy": 0.9157626032829285, |
| "num_tokens": 407469.0, |
| "step": 293 |
| }, |
| { |
| "entropy": 0.19007329642772675, |
| "epoch": 0.04533538936006168, |
| "grad_norm": 1.828125, |
| "learning_rate": 2.1427467202685007e-05, |
| "loss": 0.18996097147464752, |
| "mean_token_accuracy": 0.9266110062599182, |
| "num_tokens": 409153.0, |
| "step": 294 |
| }, |
| { |
| "entropy": 0.2581518888473511, |
| "epoch": 0.04548959136468774, |
| "grad_norm": 1.890625, |
| "learning_rate": 2.126627087240231e-05, |
| "loss": 0.2599462568759918, |
| "mean_token_accuracy": 0.9158653616905212, |
| "num_tokens": 410409.0, |
| "step": 295 |
| }, |
| { |
| "entropy": 0.22935496270656586, |
| "epoch": 0.045643793369313804, |
| "grad_norm": 2.09375, |
| "learning_rate": 2.110523315809978e-05, |
| "loss": 0.21854767203330994, |
| "mean_token_accuracy": 0.9225852489471436, |
| "num_tokens": 411825.0, |
| "step": 296 |
| }, |
| { |
| "entropy": 0.25962114334106445, |
| "epoch": 0.04579799537393986, |
| "grad_norm": 2.296875, |
| "learning_rate": 2.0944360900968617e-05, |
| "loss": 0.28228771686553955, |
| "mean_token_accuracy": 0.8985915780067444, |
| "num_tokens": 412898.0, |
| "step": 297 |
| }, |
| { |
| "entropy": 0.25601744651794434, |
| "epoch": 0.04595219737856592, |
| "grad_norm": 1.9765625, |
| "learning_rate": 2.0783660935171092e-05, |
| "loss": 0.26037973165512085, |
| "mean_token_accuracy": 0.9110707640647888, |
| "num_tokens": 414008.0, |
| "step": 298 |
| }, |
| { |
| "entropy": 0.2810611128807068, |
| "epoch": 0.046106399383191984, |
| "grad_norm": 2.328125, |
| "learning_rate": 2.0623140087550215e-05, |
| "loss": 0.29850900173187256, |
| "mean_token_accuracy": 0.9104072451591492, |
| "num_tokens": 415121.0, |
| "step": 299 |
| }, |
| { |
| "entropy": 0.22841358184814453, |
| "epoch": 0.04626060138781804, |
| "grad_norm": 1.84375, |
| "learning_rate": 2.046280517733971e-05, |
| "loss": 0.22839921712875366, |
| "mean_token_accuracy": 0.923349916934967, |
| "num_tokens": 416538.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.2764427959918976, |
| "epoch": 0.0464148033924441, |
| "grad_norm": 2.34375, |
| "learning_rate": 2.0302663015874322e-05, |
| "loss": 0.2636858820915222, |
| "mean_token_accuracy": 0.9106976985931396, |
| "num_tokens": 417621.0, |
| "step": 301 |
| }, |
| { |
| "entropy": 0.18497152626514435, |
| "epoch": 0.046569005397070165, |
| "grad_norm": 1.5, |
| "learning_rate": 2.0142720406300465e-05, |
| "loss": 0.18430255353450775, |
| "mean_token_accuracy": 0.929759681224823, |
| "num_tokens": 419252.0, |
| "step": 302 |
| }, |
| { |
| "entropy": 0.2483554184436798, |
| "epoch": 0.04672320740169622, |
| "grad_norm": 1.9140625, |
| "learning_rate": 1.9982984143287188e-05, |
| "loss": 0.24268567562103271, |
| "mean_token_accuracy": 0.9065420627593994, |
| "num_tokens": 420437.0, |
| "step": 303 |
| }, |
| { |
| "entropy": 0.2957545518875122, |
| "epoch": 0.04687740940632228, |
| "grad_norm": 2.59375, |
| "learning_rate": 1.9823461012737564e-05, |
| "loss": 0.3344174325466156, |
| "mean_token_accuracy": 0.8834766149520874, |
| "num_tokens": 421492.0, |
| "step": 304 |
| }, |
| { |
| "entropy": 0.23411741852760315, |
| "epoch": 0.047031611410948346, |
| "grad_norm": 1.5703125, |
| "learning_rate": 1.966415779150037e-05, |
| "loss": 0.21458064019680023, |
| "mean_token_accuracy": 0.9274131059646606, |
| "num_tokens": 422795.0, |
| "step": 305 |
| }, |
| { |
| "entropy": 0.2103796899318695, |
| "epoch": 0.0471858134155744, |
| "grad_norm": 1.671875, |
| "learning_rate": 1.9505081247082237e-05, |
| "loss": 0.20959612727165222, |
| "mean_token_accuracy": 0.9208722710609436, |
| "num_tokens": 424408.0, |
| "step": 306 |
| }, |
| { |
| "entropy": 0.2197587639093399, |
| "epoch": 0.047340015420200464, |
| "grad_norm": 1.6796875, |
| "learning_rate": 1.9346238137360106e-05, |
| "loss": 0.20553667843341827, |
| "mean_token_accuracy": 0.9193548560142517, |
| "num_tokens": 425718.0, |
| "step": 307 |
| }, |
| { |
| "entropy": 0.24315893650054932, |
| "epoch": 0.04749421742482652, |
| "grad_norm": 1.6484375, |
| "learning_rate": 1.918763521029418e-05, |
| "loss": 0.22866766154766083, |
| "mean_token_accuracy": 0.9147771596908569, |
| "num_tokens": 427005.0, |
| "step": 308 |
| }, |
| { |
| "entropy": 0.2538098990917206, |
| "epoch": 0.04764841942945258, |
| "grad_norm": 2.078125, |
| "learning_rate": 1.9029279203641232e-05, |
| "loss": 0.2357470542192459, |
| "mean_token_accuracy": 0.9233912229537964, |
| "num_tokens": 427992.0, |
| "step": 309 |
| }, |
| { |
| "entropy": 0.3305405378341675, |
| "epoch": 0.047802621434078645, |
| "grad_norm": 2.875, |
| "learning_rate": 1.8871176844668374e-05, |
| "loss": 0.3201872408390045, |
| "mean_token_accuracy": 0.8776978254318237, |
| "num_tokens": 428834.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.22924208641052246, |
| "epoch": 0.0479568234387047, |
| "grad_norm": 1.703125, |
| "learning_rate": 1.8713334849867315e-05, |
| "loss": 0.2193642556667328, |
| "mean_token_accuracy": 0.9297805428504944, |
| "num_tokens": 430437.0, |
| "step": 311 |
| }, |
| { |
| "entropy": 0.2438676506280899, |
| "epoch": 0.04811102544333076, |
| "grad_norm": 1.7578125, |
| "learning_rate": 1.8555759924668952e-05, |
| "loss": 0.2391282469034195, |
| "mean_token_accuracy": 0.9204819202423096, |
| "num_tokens": 431690.0, |
| "step": 312 |
| }, |
| { |
| "entropy": 0.30626124143600464, |
| "epoch": 0.048265227447956825, |
| "grad_norm": 2.484375, |
| "learning_rate": 1.8398458763158578e-05, |
| "loss": 0.31509530544281006, |
| "mean_token_accuracy": 0.8954593539237976, |
| "num_tokens": 432645.0, |
| "step": 313 |
| }, |
| { |
| "entropy": 0.26661908626556396, |
| "epoch": 0.04841942945258288, |
| "grad_norm": 1.9921875, |
| "learning_rate": 1.8241438047791454e-05, |
| "loss": 0.2524988651275635, |
| "mean_token_accuracy": 0.9092437028884888, |
| "num_tokens": 433843.0, |
| "step": 314 |
| }, |
| { |
| "entropy": 0.22748155891895294, |
| "epoch": 0.04857363145720894, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.8084704449108985e-05, |
| "loss": 0.2243906408548355, |
| "mean_token_accuracy": 0.9239205121994019, |
| "num_tokens": 435310.0, |
| "step": 315 |
| }, |
| { |
| "entropy": 0.17577649652957916, |
| "epoch": 0.048727833461835006, |
| "grad_norm": 1.671875, |
| "learning_rate": 1.7928264625455282e-05, |
| "loss": 0.1813218891620636, |
| "mean_token_accuracy": 0.9322709441184998, |
| "num_tokens": 437326.0, |
| "step": 316 |
| }, |
| { |
| "entropy": 0.27867627143859863, |
| "epoch": 0.04888203546646106, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.7772125222694337e-05, |
| "loss": 0.28030475974082947, |
| "mean_token_accuracy": 0.8948306441307068, |
| "num_tokens": 438456.0, |
| "step": 317 |
| }, |
| { |
| "entropy": 0.23422475159168243, |
| "epoch": 0.049036237471087124, |
| "grad_norm": 1.65625, |
| "learning_rate": 1.7616292873927688e-05, |
| "loss": 0.2259235829114914, |
| "mean_token_accuracy": 0.915672242641449, |
| "num_tokens": 439721.0, |
| "step": 318 |
| }, |
| { |
| "entropy": 0.20051687955856323, |
| "epoch": 0.04919043947571319, |
| "grad_norm": 1.5625, |
| "learning_rate": 1.7460774199212625e-05, |
| "loss": 0.20561350882053375, |
| "mean_token_accuracy": 0.9247232675552368, |
| "num_tokens": 441084.0, |
| "step": 319 |
| }, |
| { |
| "entropy": 0.17916183173656464, |
| "epoch": 0.04934464148033924, |
| "grad_norm": 1.265625, |
| "learning_rate": 1.7305575805280956e-05, |
| "loss": 0.16743285953998566, |
| "mean_token_accuracy": 0.9406779408454895, |
| "num_tokens": 442862.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.18751926720142365, |
| "epoch": 0.049498843484965305, |
| "grad_norm": 1.3671875, |
| "learning_rate": 1.7150704285258375e-05, |
| "loss": 0.16947750747203827, |
| "mean_token_accuracy": 0.9436795711517334, |
| "num_tokens": 444468.0, |
| "step": 321 |
| }, |
| { |
| "entropy": 0.17793025076389313, |
| "epoch": 0.04965304548959137, |
| "grad_norm": 1.28125, |
| "learning_rate": 1.6996166218384307e-05, |
| "loss": 0.16534742712974548, |
| "mean_token_accuracy": 0.939068078994751, |
| "num_tokens": 446150.0, |
| "step": 322 |
| }, |
| { |
| "entropy": 0.2475776970386505, |
| "epoch": 0.04980724749421742, |
| "grad_norm": 2.15625, |
| "learning_rate": 1.684196816973248e-05, |
| "loss": 0.2468724101781845, |
| "mean_token_accuracy": 0.919457733631134, |
| "num_tokens": 447412.0, |
| "step": 323 |
| }, |
| { |
| "entropy": 0.2225208878517151, |
| "epoch": 0.049961449498843485, |
| "grad_norm": 1.625, |
| "learning_rate": 1.6688116689931972e-05, |
| "loss": 0.20401687920093536, |
| "mean_token_accuracy": 0.9311926364898682, |
| "num_tokens": 448946.0, |
| "step": 324 |
| }, |
| { |
| "entropy": 0.2503822445869446, |
| "epoch": 0.05011565150346955, |
| "grad_norm": 1.96875, |
| "learning_rate": 1.6534618314888945e-05, |
| "loss": 0.22844718396663666, |
| "mean_token_accuracy": 0.9175724387168884, |
| "num_tokens": 450058.0, |
| "step": 325 |
| }, |
| { |
| "entropy": 0.25004157423973083, |
| "epoch": 0.050269853508095604, |
| "grad_norm": 2.203125, |
| "learning_rate": 1.638147956550904e-05, |
| "loss": 0.25791749358177185, |
| "mean_token_accuracy": 0.9117646813392639, |
| "num_tokens": 451324.0, |
| "step": 326 |
| }, |
| { |
| "entropy": 0.22011376917362213, |
| "epoch": 0.050424055512721666, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.622870694742026e-05, |
| "loss": 0.19179725646972656, |
| "mean_token_accuracy": 0.9320175647735596, |
| "num_tokens": 452700.0, |
| "step": 327 |
| }, |
| { |
| "entropy": 0.193440780043602, |
| "epoch": 0.05057825751734773, |
| "grad_norm": 1.625, |
| "learning_rate": 1.6076306950696658e-05, |
| "loss": 0.19295921921730042, |
| "mean_token_accuracy": 0.9318463206291199, |
| "num_tokens": 454322.0, |
| "step": 328 |
| }, |
| { |
| "entropy": 0.17849111557006836, |
| "epoch": 0.050732459521973784, |
| "grad_norm": 1.46875, |
| "learning_rate": 1.592428604958264e-05, |
| "loss": 0.16607390344142914, |
| "mean_token_accuracy": 0.9433174133300781, |
| "num_tokens": 456006.0, |
| "step": 329 |
| }, |
| { |
| "entropy": 0.2486262321472168, |
| "epoch": 0.05088666152659985, |
| "grad_norm": 1.953125, |
| "learning_rate": 1.5772650702217878e-05, |
| "loss": 0.2480083853006363, |
| "mean_token_accuracy": 0.9057851433753967, |
| "num_tokens": 457224.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.27837270498275757, |
| "epoch": 0.05104086353122591, |
| "grad_norm": 2.59375, |
| "learning_rate": 1.5621407350362986e-05, |
| "loss": 0.2996099293231964, |
| "mean_token_accuracy": 0.9042253494262695, |
| "num_tokens": 458297.0, |
| "step": 331 |
| }, |
| { |
| "entropy": 0.20956268906593323, |
| "epoch": 0.051195065535851965, |
| "grad_norm": 1.5625, |
| "learning_rate": 1.5470562419125868e-05, |
| "loss": 0.18728220462799072, |
| "mean_token_accuracy": 0.9295774698257446, |
| "num_tokens": 459796.0, |
| "step": 332 |
| }, |
| { |
| "entropy": 0.29057589173316956, |
| "epoch": 0.05134926754047803, |
| "grad_norm": 2.40625, |
| "learning_rate": 1.5320122316688735e-05, |
| "loss": 0.29962292313575745, |
| "mean_token_accuracy": 0.8858093023300171, |
| "num_tokens": 460706.0, |
| "step": 333 |
| }, |
| { |
| "entropy": 0.1948358118534088, |
| "epoch": 0.05150346954510408, |
| "grad_norm": 1.578125, |
| "learning_rate": 1.517009343403591e-05, |
| "loss": 0.1801883429288864, |
| "mean_token_accuracy": 0.93376624584198, |
| "num_tokens": 462254.0, |
| "step": 334 |
| }, |
| { |
| "entropy": 0.22513329982757568, |
| "epoch": 0.051657671549730146, |
| "grad_norm": 2.046875, |
| "learning_rate": 1.5020482144682308e-05, |
| "loss": 0.22428080439567566, |
| "mean_token_accuracy": 0.9161764979362488, |
| "num_tokens": 463622.0, |
| "step": 335 |
| }, |
| { |
| "entropy": 0.2175763100385666, |
| "epoch": 0.05181187355435621, |
| "grad_norm": 2.15625, |
| "learning_rate": 1.4871294804402675e-05, |
| "loss": 0.21439555287361145, |
| "mean_token_accuracy": 0.9237037301063538, |
| "num_tokens": 464980.0, |
| "step": 336 |
| }, |
| { |
| "entropy": 0.1653544306755066, |
| "epoch": 0.051966075558982264, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.472253775096159e-05, |
| "loss": 0.16475962102413177, |
| "mean_token_accuracy": 0.9355390667915344, |
| "num_tokens": 466741.0, |
| "step": 337 |
| }, |
| { |
| "entropy": 0.20776669681072235, |
| "epoch": 0.052120277563608326, |
| "grad_norm": 1.9453125, |
| "learning_rate": 1.4574217303844211e-05, |
| "loss": 0.19919782876968384, |
| "mean_token_accuracy": 0.9283204674720764, |
| "num_tokens": 468172.0, |
| "step": 338 |
| }, |
| { |
| "entropy": 0.18218226730823517, |
| "epoch": 0.05227447956823439, |
| "grad_norm": 1.6875, |
| "learning_rate": 1.4426339763987844e-05, |
| "loss": 0.1778276562690735, |
| "mean_token_accuracy": 0.9303686618804932, |
| "num_tokens": 469889.0, |
| "step": 339 |
| }, |
| { |
| "entropy": 0.25532829761505127, |
| "epoch": 0.052428681572860444, |
| "grad_norm": 1.9375, |
| "learning_rate": 1.4278911413514204e-05, |
| "loss": 0.26636841893196106, |
| "mean_token_accuracy": 0.9083333611488342, |
| "num_tokens": 471217.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.19937695562839508, |
| "epoch": 0.05258288357748651, |
| "grad_norm": 1.6015625, |
| "learning_rate": 1.4131938515462639e-05, |
| "loss": 0.1952292025089264, |
| "mean_token_accuracy": 0.9280303120613098, |
| "num_tokens": 472809.0, |
| "step": 341 |
| }, |
| { |
| "entropy": 0.28071922063827515, |
| "epoch": 0.05273708558211257, |
| "grad_norm": 2.4375, |
| "learning_rate": 1.3985427313523947e-05, |
| "loss": 0.28267180919647217, |
| "mean_token_accuracy": 0.885199248790741, |
| "num_tokens": 473871.0, |
| "step": 342 |
| }, |
| { |
| "entropy": 0.1708391159772873, |
| "epoch": 0.052891287586738625, |
| "grad_norm": 1.40625, |
| "learning_rate": 1.3839384031775226e-05, |
| "loss": 0.1682218760251999, |
| "mean_token_accuracy": 0.9421338438987732, |
| "num_tokens": 475538.0, |
| "step": 343 |
| }, |
| { |
| "entropy": 0.17169421911239624, |
| "epoch": 0.05304548959136469, |
| "grad_norm": 1.671875, |
| "learning_rate": 1.3693814874415389e-05, |
| "loss": 0.1755795031785965, |
| "mean_token_accuracy": 0.9377777576446533, |
| "num_tokens": 477346.0, |
| "step": 344 |
| }, |
| { |
| "entropy": 0.2197735607624054, |
| "epoch": 0.05319969159599075, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.3548726025501688e-05, |
| "loss": 0.22578758001327515, |
| "mean_token_accuracy": 0.9094029068946838, |
| "num_tokens": 478811.0, |
| "step": 345 |
| }, |
| { |
| "entropy": 0.21483223140239716, |
| "epoch": 0.053353893600616806, |
| "grad_norm": 1.6484375, |
| "learning_rate": 1.340412364868689e-05, |
| "loss": 0.21270032227039337, |
| "mean_token_accuracy": 0.9238030910491943, |
| "num_tokens": 480302.0, |
| "step": 346 |
| }, |
| { |
| "entropy": 0.27951836585998535, |
| "epoch": 0.05350809560524287, |
| "grad_norm": 2.28125, |
| "learning_rate": 1.3260013886957538e-05, |
| "loss": 0.2666223645210266, |
| "mean_token_accuracy": 0.9077869057655334, |
| "num_tokens": 481286.0, |
| "step": 347 |
| }, |
| { |
| "entropy": 0.1917494833469391, |
| "epoch": 0.05366229760986893, |
| "grad_norm": 1.578125, |
| "learning_rate": 1.3116402862372933e-05, |
| "loss": 0.19692182540893555, |
| "mean_token_accuracy": 0.9339783787727356, |
| "num_tokens": 483051.0, |
| "step": 348 |
| }, |
| { |
| "entropy": 0.20676381886005402, |
| "epoch": 0.053816499614494986, |
| "grad_norm": 1.6328125, |
| "learning_rate": 1.2973296675805041e-05, |
| "loss": 0.20207884907722473, |
| "mean_token_accuracy": 0.9374217987060547, |
| "num_tokens": 484657.0, |
| "step": 349 |
| }, |
| { |
| "entropy": 0.19531835615634918, |
| "epoch": 0.05397070161912105, |
| "grad_norm": 1.7421875, |
| "learning_rate": 1.2830701406679375e-05, |
| "loss": 0.18931494653224945, |
| "mean_token_accuracy": 0.9317750930786133, |
| "num_tokens": 486248.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.3396989405155182, |
| "epoch": 0.05412490362374711, |
| "grad_norm": 5.1875, |
| "learning_rate": 1.2688623112716652e-05, |
| "loss": 0.37070798873901367, |
| "mean_token_accuracy": 0.869767427444458, |
| "num_tokens": 487116.0, |
| "step": 351 |
| }, |
| { |
| "entropy": 0.17527468502521515, |
| "epoch": 0.05427910562837317, |
| "grad_norm": 1.859375, |
| "learning_rate": 1.2547067829675535e-05, |
| "loss": 0.17982880771160126, |
| "mean_token_accuracy": 0.9339567422866821, |
| "num_tokens": 488835.0, |
| "step": 352 |
| }, |
| { |
| "entropy": 0.2687583565711975, |
| "epoch": 0.05443330763299923, |
| "grad_norm": 2.03125, |
| "learning_rate": 1.2406041571096164e-05, |
| "loss": 0.2823106646537781, |
| "mean_token_accuracy": 0.9135371446609497, |
| "num_tokens": 489988.0, |
| "step": 353 |
| }, |
| { |
| "entropy": 0.1937769502401352, |
| "epoch": 0.05458750963762529, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.2265550328044681e-05, |
| "loss": 0.19238050282001495, |
| "mean_token_accuracy": 0.9310998916625977, |
| "num_tokens": 491578.0, |
| "step": 354 |
| }, |
| { |
| "entropy": 0.17158617079257965, |
| "epoch": 0.05474171164225135, |
| "grad_norm": 1.4765625, |
| "learning_rate": 1.2125600068858772e-05, |
| "loss": 0.16338223218917847, |
| "mean_token_accuracy": 0.9456647634506226, |
| "num_tokens": 493316.0, |
| "step": 355 |
| }, |
| { |
| "entropy": 0.19250212609767914, |
| "epoch": 0.05489591364687741, |
| "grad_norm": 1.671875, |
| "learning_rate": 1.1986196738894078e-05, |
| "loss": 0.17621511220932007, |
| "mean_token_accuracy": 0.9345238208770752, |
| "num_tokens": 494668.0, |
| "step": 356 |
| }, |
| { |
| "entropy": 0.19578416645526886, |
| "epoch": 0.05505011565150347, |
| "grad_norm": 1.8828125, |
| "learning_rate": 1.1847346260271647e-05, |
| "loss": 0.183770090341568, |
| "mean_token_accuracy": 0.9346092343330383, |
| "num_tokens": 495930.0, |
| "step": 357 |
| }, |
| { |
| "entropy": 0.22412899136543274, |
| "epoch": 0.05520431765612953, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.1709054531626313e-05, |
| "loss": 0.2516805827617645, |
| "mean_token_accuracy": 0.9137670397758484, |
| "num_tokens": 497260.0, |
| "step": 358 |
| }, |
| { |
| "entropy": 0.2025316208600998, |
| "epoch": 0.05535851966075559, |
| "grad_norm": 1.3203125, |
| "learning_rate": 1.1571327427856177e-05, |
| "loss": 0.19299444556236267, |
| "mean_token_accuracy": 0.9367007613182068, |
| "num_tokens": 498832.0, |
| "step": 359 |
| }, |
| { |
| "entropy": 0.2235983908176422, |
| "epoch": 0.05551272166538165, |
| "grad_norm": 1.5859375, |
| "learning_rate": 1.1434170799872947e-05, |
| "loss": 0.200628861784935, |
| "mean_token_accuracy": 0.929682195186615, |
| "num_tokens": 500319.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.28108713030815125, |
| "epoch": 0.05566692367000771, |
| "grad_norm": 2.40625, |
| "learning_rate": 1.1297590474353464e-05, |
| "loss": 0.2882252335548401, |
| "mean_token_accuracy": 0.8986828923225403, |
| "num_tokens": 501314.0, |
| "step": 361 |
| }, |
| { |
| "entropy": 0.21756984293460846, |
| "epoch": 0.05582112567463377, |
| "grad_norm": 2.125, |
| "learning_rate": 1.116159225349213e-05, |
| "loss": 0.23450873792171478, |
| "mean_token_accuracy": 0.9163208603858948, |
| "num_tokens": 502768.0, |
| "step": 362 |
| }, |
| { |
| "entropy": 0.2556920051574707, |
| "epoch": 0.05597532767925983, |
| "grad_norm": 2.453125, |
| "learning_rate": 1.1026181914754388e-05, |
| "loss": 0.2757260203361511, |
| "mean_token_accuracy": 0.9049859046936035, |
| "num_tokens": 503839.0, |
| "step": 363 |
| }, |
| { |
| "entropy": 0.21779917180538177, |
| "epoch": 0.05612952968388589, |
| "grad_norm": 1.953125, |
| "learning_rate": 1.089136521063137e-05, |
| "loss": 0.22174124419689178, |
| "mean_token_accuracy": 0.9221984148025513, |
| "num_tokens": 505248.0, |
| "step": 364 |
| }, |
| { |
| "entropy": 0.3109717071056366, |
| "epoch": 0.05628373168851195, |
| "grad_norm": 2.578125, |
| "learning_rate": 1.075714786839542e-05, |
| "loss": 0.2979055345058441, |
| "mean_token_accuracy": 0.8831614851951599, |
| "num_tokens": 506129.0, |
| "step": 365 |
| }, |
| { |
| "entropy": 0.22565557062625885, |
| "epoch": 0.05643793369313801, |
| "grad_norm": 1.859375, |
| "learning_rate": 1.0623535589856887e-05, |
| "loss": 0.23962406814098358, |
| "mean_token_accuracy": 0.9183965921401978, |
| "num_tokens": 507534.0, |
| "step": 366 |
| }, |
| { |
| "entropy": 0.16417403519153595, |
| "epoch": 0.05659213569776407, |
| "grad_norm": 2.25, |
| "learning_rate": 1.0490534051121808e-05, |
| "loss": 0.16284841299057007, |
| "mean_token_accuracy": 0.937706708908081, |
| "num_tokens": 509356.0, |
| "step": 367 |
| }, |
| { |
| "entropy": 0.18802893161773682, |
| "epoch": 0.05674633770239013, |
| "grad_norm": 1.6640625, |
| "learning_rate": 1.0358148902350853e-05, |
| "loss": 0.19001488387584686, |
| "mean_token_accuracy": 0.930488646030426, |
| "num_tokens": 510817.0, |
| "step": 368 |
| }, |
| { |
| "entropy": 0.22402897477149963, |
| "epoch": 0.05690053970701619, |
| "grad_norm": 2.125, |
| "learning_rate": 1.0226385767519259e-05, |
| "loss": 0.228716179728508, |
| "mean_token_accuracy": 0.924344539642334, |
| "num_tokens": 512160.0, |
| "step": 369 |
| }, |
| { |
| "entropy": 0.24438747763633728, |
| "epoch": 0.05705474171164225, |
| "grad_norm": 1.984375, |
| "learning_rate": 1.0095250244177887e-05, |
| "loss": 0.22704952955245972, |
| "mean_token_accuracy": 0.918749988079071, |
| "num_tokens": 513288.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.23192906379699707, |
| "epoch": 0.057208943716268314, |
| "grad_norm": 1.96875, |
| "learning_rate": 9.964747903215513e-06, |
| "loss": 0.22084636986255646, |
| "mean_token_accuracy": 0.929665744304657, |
| "num_tokens": 514732.0, |
| "step": 371 |
| }, |
| { |
| "entropy": 0.1626010537147522, |
| "epoch": 0.05736314572089437, |
| "grad_norm": 1.3203125, |
| "learning_rate": 9.834884288622054e-06, |
| "loss": 0.15189611911773682, |
| "mean_token_accuracy": 0.941209077835083, |
| "num_tokens": 516543.0, |
| "step": 372 |
| }, |
| { |
| "entropy": 0.16602161526679993, |
| "epoch": 0.05751734772552043, |
| "grad_norm": 1.3828125, |
| "learning_rate": 9.705664917253143e-06, |
| "loss": 0.18036378920078278, |
| "mean_token_accuracy": 0.9382113814353943, |
| "num_tokens": 518396.0, |
| "step": 373 |
| }, |
| { |
| "entropy": 0.16473768651485443, |
| "epoch": 0.057671549730146494, |
| "grad_norm": 1.3046875, |
| "learning_rate": 9.577095278595694e-06, |
| "loss": 0.15197424590587616, |
| "mean_token_accuracy": 0.9414084553718567, |
| "num_tokens": 520179.0, |
| "step": 374 |
| }, |
| { |
| "entropy": 0.1879141479730606, |
| "epoch": 0.05782575173477255, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.449180834534749e-06, |
| "loss": 0.18156398832798004, |
| "mean_token_accuracy": 0.9304715991020203, |
| "num_tokens": 521841.0, |
| "step": 375 |
| }, |
| { |
| "entropy": 0.2549605369567871, |
| "epoch": 0.05797995373939861, |
| "grad_norm": 2.203125, |
| "learning_rate": 9.321927019121435e-06, |
| "loss": 0.257169634103775, |
| "mean_token_accuracy": 0.9048386812210083, |
| "num_tokens": 523089.0, |
| "step": 376 |
| }, |
| { |
| "entropy": 0.18407224118709564, |
| "epoch": 0.058134155744024675, |
| "grad_norm": 1.609375, |
| "learning_rate": 9.195339238342071e-06, |
| "loss": 0.18074241280555725, |
| "mean_token_accuracy": 0.936096727848053, |
| "num_tokens": 524834.0, |
| "step": 377 |
| }, |
| { |
| "entropy": 0.21801158785820007, |
| "epoch": 0.05828835774865073, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.069422869888583e-06, |
| "loss": 0.22194962203502655, |
| "mean_token_accuracy": 0.923652708530426, |
| "num_tokens": 526178.0, |
| "step": 378 |
| }, |
| { |
| "entropy": 0.18715234100818634, |
| "epoch": 0.05844255975327679, |
| "grad_norm": 1.34375, |
| "learning_rate": 8.944183262929984e-06, |
| "loss": 0.17807839810848236, |
| "mean_token_accuracy": 0.9365825057029724, |
| "num_tokens": 527889.0, |
| "step": 379 |
| }, |
| { |
| "entropy": 0.196278914809227, |
| "epoch": 0.058596761757902856, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.819625737885187e-06, |
| "loss": 0.20651084184646606, |
| "mean_token_accuracy": 0.9256097674369812, |
| "num_tokens": 529537.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.35177287459373474, |
| "epoch": 0.05875096376252891, |
| "grad_norm": 3.296875, |
| "learning_rate": 8.695755586196924e-06, |
| "loss": 0.385383665561676, |
| "mean_token_accuracy": 0.8580645322799683, |
| "num_tokens": 530475.0, |
| "step": 381 |
| }, |
| { |
| "entropy": 0.25344812870025635, |
| "epoch": 0.058905165767154974, |
| "grad_norm": 2.078125, |
| "learning_rate": 8.572578070107016e-06, |
| "loss": 0.25393110513687134, |
| "mean_token_accuracy": 0.917894721031189, |
| "num_tokens": 531433.0, |
| "step": 382 |
| }, |
| { |
| "entropy": 0.3020297884941101, |
| "epoch": 0.059059367771781036, |
| "grad_norm": 2.359375, |
| "learning_rate": 8.450098422432787e-06, |
| "loss": 0.3018152415752411, |
| "mean_token_accuracy": 0.9065510630607605, |
| "num_tokens": 532479.0, |
| "step": 383 |
| }, |
| { |
| "entropy": 0.15192678570747375, |
| "epoch": 0.05921356977640709, |
| "grad_norm": 1.4296875, |
| "learning_rate": 8.328321846344755e-06, |
| "loss": 0.1450488418340683, |
| "mean_token_accuracy": 0.9468623399734497, |
| "num_tokens": 534463.0, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.05921356977640709, |
| "eval_entropy": 0.22466930076044206, |
| "eval_loss": 0.22621265053749084, |
| "eval_mean_token_accuracy": 0.9194652596760912, |
| "eval_num_tokens": 534463.0, |
| "eval_runtime": 34.9665, |
| "eval_samples_per_second": 78.103, |
| "eval_steps_per_second": 9.781, |
| "step": 384 |
| }, |
| { |
| "entropy": 0.18735887110233307, |
| "epoch": 0.059367771781033155, |
| "grad_norm": 1.375, |
| "learning_rate": 8.207253515145625e-06, |
| "loss": 0.18456675112247467, |
| "mean_token_accuracy": 0.9276748299598694, |
| "num_tokens": 536144.0, |
| "step": 385 |
| }, |
| { |
| "entropy": 0.2384348064661026, |
| "epoch": 0.05952197378565921, |
| "grad_norm": 1.9375, |
| "learning_rate": 8.086898572050494e-06, |
| "loss": 0.24932722747325897, |
| "mean_token_accuracy": 0.9125475287437439, |
| "num_tokens": 537467.0, |
| "step": 386 |
| }, |
| { |
| "entropy": 0.21620430052280426, |
| "epoch": 0.05967617579028527, |
| "grad_norm": 1.8515625, |
| "learning_rate": 7.967262129968378e-06, |
| "loss": 0.20638106763362885, |
| "mean_token_accuracy": 0.9262917637825012, |
| "num_tokens": 538791.0, |
| "step": 387 |
| }, |
| { |
| "entropy": 0.22282716631889343, |
| "epoch": 0.059830377794911335, |
| "grad_norm": 1.8203125, |
| "learning_rate": 7.848349271284952e-06, |
| "loss": 0.24068771302700043, |
| "mean_token_accuracy": 0.911854088306427, |
| "num_tokens": 540115.0, |
| "step": 388 |
| }, |
| { |
| "entropy": 0.19987352192401886, |
| "epoch": 0.05998457979953739, |
| "grad_norm": 1.7109375, |
| "learning_rate": 7.730165047646723e-06, |
| "loss": 0.19121116399765015, |
| "mean_token_accuracy": 0.93138587474823, |
| "num_tokens": 541595.0, |
| "step": 389 |
| }, |
| { |
| "entropy": 0.2530774772167206, |
| "epoch": 0.06013878180416345, |
| "grad_norm": 2.484375, |
| "learning_rate": 7.612714479746347e-06, |
| "loss": 0.250463604927063, |
| "mean_token_accuracy": 0.9078303575515747, |
| "num_tokens": 542829.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.2623169720172882, |
| "epoch": 0.060292983808789516, |
| "grad_norm": 2.515625, |
| "learning_rate": 7.4960025571094025e-06, |
| "loss": 0.27675166726112366, |
| "mean_token_accuracy": 0.9017013311386108, |
| "num_tokens": 543895.0, |
| "step": 391 |
| }, |
| { |
| "entropy": 0.2155791074037552, |
| "epoch": 0.06044718581341557, |
| "grad_norm": 1.7890625, |
| "learning_rate": 7.380034237882394e-06, |
| "loss": 0.21280765533447266, |
| "mean_token_accuracy": 0.9217687249183655, |
| "num_tokens": 545373.0, |
| "step": 392 |
| }, |
| { |
| "entropy": 0.3150392770767212, |
| "epoch": 0.060601387818041634, |
| "grad_norm": 2.5, |
| "learning_rate": 7.264814448622106e-06, |
| "loss": 0.3080776035785675, |
| "mean_token_accuracy": 0.898815929889679, |
| "num_tokens": 546310.0, |
| "step": 393 |
| }, |
| { |
| "entropy": 0.19685329496860504, |
| "epoch": 0.0607555898226677, |
| "grad_norm": 2.125, |
| "learning_rate": 7.150348084086367e-06, |
| "loss": 0.22213543951511383, |
| "mean_token_accuracy": 0.9212239384651184, |
| "num_tokens": 547854.0, |
| "step": 394 |
| }, |
| { |
| "entropy": 0.1816016435623169, |
| "epoch": 0.06090979182729375, |
| "grad_norm": 1.4140625, |
| "learning_rate": 7.036640007026038e-06, |
| "loss": 0.17253060638904572, |
| "mean_token_accuracy": 0.9350804090499878, |
| "num_tokens": 549541.0, |
| "step": 395 |
| }, |
| { |
| "entropy": 0.19817869365215302, |
| "epoch": 0.061063993831919815, |
| "grad_norm": 1.7890625, |
| "learning_rate": 6.923695047978502e-06, |
| "loss": 0.191897913813591, |
| "mean_token_accuracy": 0.9271523356437683, |
| "num_tokens": 551059.0, |
| "step": 396 |
| }, |
| { |
| "entropy": 0.24792121350765228, |
| "epoch": 0.06121819583654588, |
| "grad_norm": 2.25, |
| "learning_rate": 6.811518005062423e-06, |
| "loss": 0.2625022828578949, |
| "mean_token_accuracy": 0.9022988677024841, |
| "num_tokens": 552111.0, |
| "step": 397 |
| }, |
| { |
| "entropy": 0.24607616662979126, |
| "epoch": 0.06137239784117193, |
| "grad_norm": 2.28125, |
| "learning_rate": 6.700113643773892e-06, |
| "loss": 0.22993192076683044, |
| "mean_token_accuracy": 0.9271889328956604, |
| "num_tokens": 553204.0, |
| "step": 398 |
| }, |
| { |
| "entropy": 0.25920623540878296, |
| "epoch": 0.061526599845797995, |
| "grad_norm": 2.453125, |
| "learning_rate": 6.589486696784028e-06, |
| "loss": 0.27900075912475586, |
| "mean_token_accuracy": 0.9022931456565857, |
| "num_tokens": 554215.0, |
| "step": 399 |
| }, |
| { |
| "entropy": 0.28530606627464294, |
| "epoch": 0.06168080185042406, |
| "grad_norm": 2.4375, |
| "learning_rate": 6.47964186373787e-06, |
| "loss": 0.2928396165370941, |
| "mean_token_accuracy": 0.8845500946044922, |
| "num_tokens": 555401.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.2927665114402771, |
| "epoch": 0.061835003855050114, |
| "grad_norm": 2.25, |
| "learning_rate": 6.370583811054778e-06, |
| "loss": 0.2968969941139221, |
| "mean_token_accuracy": 0.9039433598518372, |
| "num_tokens": 556398.0, |
| "step": 401 |
| }, |
| { |
| "entropy": 0.23018132150173187, |
| "epoch": 0.061989205859676176, |
| "grad_norm": 1.96875, |
| "learning_rate": 6.262317171730167e-06, |
| "loss": 0.23996573686599731, |
| "mean_token_accuracy": 0.9214015007019043, |
| "num_tokens": 557462.0, |
| "step": 402 |
| }, |
| { |
| "entropy": 0.25166183710098267, |
| "epoch": 0.06214340786430224, |
| "grad_norm": 2.0, |
| "learning_rate": 6.154846545138695e-06, |
| "loss": 0.2649187445640564, |
| "mean_token_accuracy": 0.9033687710762024, |
| "num_tokens": 558598.0, |
| "step": 403 |
| }, |
| { |
| "entropy": 0.23649781942367554, |
| "epoch": 0.062297609868928294, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.048176496838856e-06, |
| "loss": 0.21528743207454681, |
| "mean_token_accuracy": 0.9269746541976929, |
| "num_tokens": 559948.0, |
| "step": 404 |
| }, |
| { |
| "entropy": 0.22737731039524078, |
| "epoch": 0.06245181187355436, |
| "grad_norm": 1.796875, |
| "learning_rate": 5.9423115583790604e-06, |
| "loss": 0.21719223260879517, |
| "mean_token_accuracy": 0.9225531816482544, |
| "num_tokens": 561131.0, |
| "step": 405 |
| }, |
| { |
| "entropy": 0.21060694754123688, |
| "epoch": 0.06260601387818042, |
| "grad_norm": 1.4453125, |
| "learning_rate": 5.8372562271051e-06, |
| "loss": 0.19261834025382996, |
| "mean_token_accuracy": 0.9304878115653992, |
| "num_tokens": 562779.0, |
| "step": 406 |
| }, |
| { |
| "entropy": 0.24134337902069092, |
| "epoch": 0.06276021588280647, |
| "grad_norm": 1.8671875, |
| "learning_rate": 5.733014965969091e-06, |
| "loss": 0.2224052995443344, |
| "mean_token_accuracy": 0.9310910701751709, |
| "num_tokens": 564006.0, |
| "step": 407 |
| }, |
| { |
| "entropy": 0.19692017138004303, |
| "epoch": 0.06291441788743253, |
| "grad_norm": 1.6328125, |
| "learning_rate": 5.629592203339909e-06, |
| "loss": 0.18327265977859497, |
| "mean_token_accuracy": 0.9346548914909363, |
| "num_tokens": 565376.0, |
| "step": 408 |
| }, |
| { |
| "entropy": 0.2016250342130661, |
| "epoch": 0.0630686198920586, |
| "grad_norm": 1.4765625, |
| "learning_rate": 5.526992332815012e-06, |
| "loss": 0.20120908319950104, |
| "mean_token_accuracy": 0.9263085126876831, |
| "num_tokens": 566836.0, |
| "step": 409 |
| }, |
| { |
| "entropy": 0.14676110446453094, |
| "epoch": 0.06322282189668466, |
| "grad_norm": 1.3046875, |
| "learning_rate": 5.4252197130338525e-06, |
| "loss": 0.1583862602710724, |
| "mean_token_accuracy": 0.9458128213882446, |
| "num_tokens": 569280.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.1877201646566391, |
| "epoch": 0.06337702390131071, |
| "grad_norm": 2.09375, |
| "learning_rate": 5.3242786674926545e-06, |
| "loss": 0.18557564914226532, |
| "mean_token_accuracy": 0.9334638118743896, |
| "num_tokens": 570821.0, |
| "step": 411 |
| }, |
| { |
| "entropy": 0.21993833780288696, |
| "epoch": 0.06353122590593678, |
| "grad_norm": 1.7421875, |
| "learning_rate": 5.224173484360798e-06, |
| "loss": 0.19618681073188782, |
| "mean_token_accuracy": 0.9358024597167969, |
| "num_tokens": 572044.0, |
| "step": 412 |
| }, |
| { |
| "entropy": 0.20039010047912598, |
| "epoch": 0.06368542791056284, |
| "grad_norm": 1.3671875, |
| "learning_rate": 5.124908416298615e-06, |
| "loss": 0.18724791705608368, |
| "mean_token_accuracy": 0.9329929947853088, |
| "num_tokens": 573619.0, |
| "step": 413 |
| }, |
| { |
| "entropy": 0.21013715863227844, |
| "epoch": 0.06383962991518889, |
| "grad_norm": 1.796875, |
| "learning_rate": 5.026487680276723e-06, |
| "loss": 0.21998311579227448, |
| "mean_token_accuracy": 0.9184691905975342, |
| "num_tokens": 574829.0, |
| "step": 414 |
| }, |
| { |
| "entropy": 0.26953125, |
| "epoch": 0.06399383191981496, |
| "grad_norm": 2.171875, |
| "learning_rate": 4.928915457396913e-06, |
| "loss": 0.26942914724349976, |
| "mean_token_accuracy": 0.9191489219665527, |
| "num_tokens": 576012.0, |
| "step": 415 |
| }, |
| { |
| "entropy": 0.23597829043865204, |
| "epoch": 0.06414803392444102, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.832195892714489e-06, |
| "loss": 0.22428561747074127, |
| "mean_token_accuracy": 0.9230215549468994, |
| "num_tokens": 577410.0, |
| "step": 416 |
| }, |
| { |
| "entropy": 0.28713032603263855, |
| "epoch": 0.06430223592906707, |
| "grad_norm": 2.0625, |
| "learning_rate": 4.736333095062228e-06, |
| "loss": 0.2505059242248535, |
| "mean_token_accuracy": 0.9073724150657654, |
| "num_tokens": 578476.0, |
| "step": 417 |
| }, |
| { |
| "entropy": 0.2858028709888458, |
| "epoch": 0.06445643793369314, |
| "grad_norm": 2.015625, |
| "learning_rate": 4.641331136875768e-06, |
| "loss": 0.2911134958267212, |
| "mean_token_accuracy": 0.9045093059539795, |
| "num_tokens": 579615.0, |
| "step": 418 |
| }, |
| { |
| "entropy": 0.282069593667984, |
| "epoch": 0.0646106399383192, |
| "grad_norm": 2.09375, |
| "learning_rate": 4.547194054020651e-06, |
| "loss": 0.27553999423980713, |
| "mean_token_accuracy": 0.90444016456604, |
| "num_tokens": 580659.0, |
| "step": 419 |
| }, |
| { |
| "entropy": 0.22959555685520172, |
| "epoch": 0.06476484194294525, |
| "grad_norm": 1.9453125, |
| "learning_rate": 4.453925845620854e-06, |
| "loss": 0.22032871842384338, |
| "mean_token_accuracy": 0.9136531352996826, |
| "num_tokens": 582022.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.2052592635154724, |
| "epoch": 0.06491904394757132, |
| "grad_norm": 1.7734375, |
| "learning_rate": 4.361530473888889e-06, |
| "loss": 0.20798712968826294, |
| "mean_token_accuracy": 0.9232394099235535, |
| "num_tokens": 583450.0, |
| "step": 421 |
| }, |
| { |
| "entropy": 0.32572290301322937, |
| "epoch": 0.06507324595219738, |
| "grad_norm": 2.578125, |
| "learning_rate": 4.270011863957507e-06, |
| "loss": 0.33982253074645996, |
| "mean_token_accuracy": 0.8741418719291687, |
| "num_tokens": 584332.0, |
| "step": 422 |
| }, |
| { |
| "entropy": 0.3089931607246399, |
| "epoch": 0.06522744795682343, |
| "grad_norm": 2.578125, |
| "learning_rate": 4.179373903712913e-06, |
| "loss": 0.30327266454696655, |
| "mean_token_accuracy": 0.8930232524871826, |
| "num_tokens": 585200.0, |
| "step": 423 |
| }, |
| { |
| "entropy": 0.19629529118537903, |
| "epoch": 0.0653816499614495, |
| "grad_norm": 1.703125, |
| "learning_rate": 4.089620443629652e-06, |
| "loss": 0.2054092288017273, |
| "mean_token_accuracy": 0.9246435761451721, |
| "num_tokens": 586681.0, |
| "step": 424 |
| }, |
| { |
| "entropy": 0.18628910183906555, |
| "epoch": 0.06553585196607556, |
| "grad_norm": 1.3359375, |
| "learning_rate": 4.000755296606973e-06, |
| "loss": 0.1760605424642563, |
| "mean_token_accuracy": 0.9416413307189941, |
| "num_tokens": 588334.0, |
| "step": 425 |
| }, |
| { |
| "entropy": 0.194645494222641, |
| "epoch": 0.06569005397070161, |
| "grad_norm": 1.90625, |
| "learning_rate": 3.912782237806903e-06, |
| "loss": 0.19329358637332916, |
| "mean_token_accuracy": 0.9218025207519531, |
| "num_tokens": 589851.0, |
| "step": 426 |
| }, |
| { |
| "entropy": 0.19448570907115936, |
| "epoch": 0.06584425597532768, |
| "grad_norm": 1.671875, |
| "learning_rate": 3.825705004493849e-06, |
| "loss": 0.18638762831687927, |
| "mean_token_accuracy": 0.9315856695175171, |
| "num_tokens": 591423.0, |
| "step": 427 |
| }, |
| { |
| "entropy": 0.26799967885017395, |
| "epoch": 0.06599845797995374, |
| "grad_norm": 2.125, |
| "learning_rate": 3.739527295875811e-06, |
| "loss": 0.2695932686328888, |
| "mean_token_accuracy": 0.9055441617965698, |
| "num_tokens": 592405.0, |
| "step": 428 |
| }, |
| { |
| "entropy": 0.20886771380901337, |
| "epoch": 0.0661526599845798, |
| "grad_norm": 1.875, |
| "learning_rate": 3.6542527729472836e-06, |
| "loss": 0.22071963548660278, |
| "mean_token_accuracy": 0.9178168177604675, |
| "num_tokens": 594007.0, |
| "step": 429 |
| }, |
| { |
| "entropy": 0.19780333340168, |
| "epoch": 0.06630686198920586, |
| "grad_norm": 1.4296875, |
| "learning_rate": 3.5698850583336663e-06, |
| "loss": 0.19298632442951202, |
| "mean_token_accuracy": 0.9317794442176819, |
| "num_tokens": 595774.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.2335851490497589, |
| "epoch": 0.06646106399383192, |
| "grad_norm": 1.5859375, |
| "learning_rate": 3.4864277361374264e-06, |
| "loss": 0.21905845403671265, |
| "mean_token_accuracy": 0.9286743402481079, |
| "num_tokens": 597170.0, |
| "step": 431 |
| }, |
| { |
| "entropy": 0.17323604226112366, |
| "epoch": 0.06661526599845798, |
| "grad_norm": 1.640625, |
| "learning_rate": 3.4038843517858075e-06, |
| "loss": 0.17967088520526886, |
| "mean_token_accuracy": 0.9360523819923401, |
| "num_tokens": 599164.0, |
| "step": 432 |
| }, |
| { |
| "entropy": 0.2514375150203705, |
| "epoch": 0.06676946800308405, |
| "grad_norm": 1.8125, |
| "learning_rate": 3.3222584118802192e-06, |
| "loss": 0.2490684688091278, |
| "mean_token_accuracy": 0.9187192320823669, |
| "num_tokens": 600390.0, |
| "step": 433 |
| }, |
| { |
| "entropy": 0.22465308010578156, |
| "epoch": 0.0669236700077101, |
| "grad_norm": 2.515625, |
| "learning_rate": 3.241553384047258e-06, |
| "loss": 0.26371464133262634, |
| "mean_token_accuracy": 0.9116766452789307, |
| "num_tokens": 601734.0, |
| "step": 434 |
| }, |
| { |
| "entropy": 0.20948569476604462, |
| "epoch": 0.06707787201233616, |
| "grad_norm": 1.5703125, |
| "learning_rate": 3.1617726967914235e-06, |
| "loss": 0.21372012794017792, |
| "mean_token_accuracy": 0.9316811561584473, |
| "num_tokens": 603235.0, |
| "step": 435 |
| }, |
| { |
| "entropy": 0.20347538590431213, |
| "epoch": 0.06723207401696223, |
| "grad_norm": 1.6796875, |
| "learning_rate": 3.0829197393494548e-06, |
| "loss": 0.17981462180614471, |
| "mean_token_accuracy": 0.9269624352455139, |
| "num_tokens": 604708.0, |
| "step": 436 |
| }, |
| { |
| "entropy": 0.23263585567474365, |
| "epoch": 0.06738627602158828, |
| "grad_norm": 2.1875, |
| "learning_rate": 3.004997861546327e-06, |
| "loss": 0.23778997361660004, |
| "mean_token_accuracy": 0.9214986562728882, |
| "num_tokens": 605837.0, |
| "step": 437 |
| }, |
| { |
| "entropy": 0.23302724957466125, |
| "epoch": 0.06754047802621434, |
| "grad_norm": 2.203125, |
| "learning_rate": 2.9280103736529896e-06, |
| "loss": 0.23127038776874542, |
| "mean_token_accuracy": 0.9103972911834717, |
| "num_tokens": 607028.0, |
| "step": 438 |
| }, |
| { |
| "entropy": 0.18138211965560913, |
| "epoch": 0.0676946800308404, |
| "grad_norm": 1.4140625, |
| "learning_rate": 2.8519605462456965e-06, |
| "loss": 0.1681656837463379, |
| "mean_token_accuracy": 0.9345430731773376, |
| "num_tokens": 608579.0, |
| "step": 439 |
| }, |
| { |
| "entropy": 0.17149963974952698, |
| "epoch": 0.06784888203546646, |
| "grad_norm": 1.6171875, |
| "learning_rate": 2.776851610067094e-06, |
| "loss": 0.1811680942773819, |
| "mean_token_accuracy": 0.932692289352417, |
| "num_tokens": 610563.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.15687499940395355, |
| "epoch": 0.06800308404009252, |
| "grad_norm": 1.34375, |
| "learning_rate": 2.7026867558889694e-06, |
| "loss": 0.15128004550933838, |
| "mean_token_accuracy": 0.9400107264518738, |
| "num_tokens": 612438.0, |
| "step": 441 |
| }, |
| { |
| "entropy": 0.22530966997146606, |
| "epoch": 0.06815728604471859, |
| "grad_norm": 2.046875, |
| "learning_rate": 2.6294691343766718e-06, |
| "loss": 0.22919264435768127, |
| "mean_token_accuracy": 0.9237637519836426, |
| "num_tokens": 613902.0, |
| "step": 442 |
| }, |
| { |
| "entropy": 0.21813379228115082, |
| "epoch": 0.06831148804934464, |
| "grad_norm": 1.7890625, |
| "learning_rate": 2.557201855955316e-06, |
| "loss": 0.20722565054893494, |
| "mean_token_accuracy": 0.9286713004112244, |
| "num_tokens": 615340.0, |
| "step": 443 |
| }, |
| { |
| "entropy": 0.22816047072410583, |
| "epoch": 0.0684656900539707, |
| "grad_norm": 1.7890625, |
| "learning_rate": 2.4858879906775904e-06, |
| "loss": 0.2418501079082489, |
| "mean_token_accuracy": 0.9141337275505066, |
| "num_tokens": 616664.0, |
| "step": 444 |
| }, |
| { |
| "entropy": 0.24174243211746216, |
| "epoch": 0.06861989205859677, |
| "grad_norm": 1.7421875, |
| "learning_rate": 2.4155305680933938e-06, |
| "loss": 0.24712735414505005, |
| "mean_token_accuracy": 0.9127676486968994, |
| "num_tokens": 617933.0, |
| "step": 445 |
| }, |
| { |
| "entropy": 0.23680631816387177, |
| "epoch": 0.06877409406322282, |
| "grad_norm": 2.15625, |
| "learning_rate": 2.3461325771210683e-06, |
| "loss": 0.24274389445781708, |
| "mean_token_accuracy": 0.9137291312217712, |
| "num_tokens": 619019.0, |
| "step": 446 |
| }, |
| { |
| "entropy": 0.21051788330078125, |
| "epoch": 0.06892829606784888, |
| "grad_norm": 1.5703125, |
| "learning_rate": 2.2776969659205005e-06, |
| "loss": 0.19205066561698914, |
| "mean_token_accuracy": 0.9310897588729858, |
| "num_tokens": 620275.0, |
| "step": 447 |
| }, |
| { |
| "entropy": 0.19069823622703552, |
| "epoch": 0.06908249807247495, |
| "grad_norm": 1.640625, |
| "learning_rate": 2.2102266417677985e-06, |
| "loss": 0.193171888589859, |
| "mean_token_accuracy": 0.9300353527069092, |
| "num_tokens": 621698.0, |
| "step": 448 |
| }, |
| { |
| "entropy": 0.26176121830940247, |
| "epoch": 0.069236700077101, |
| "grad_norm": 2.203125, |
| "learning_rate": 2.143724470931846e-06, |
| "loss": 0.2646713852882385, |
| "mean_token_accuracy": 0.9019434452056885, |
| "num_tokens": 622838.0, |
| "step": 449 |
| }, |
| { |
| "entropy": 0.37524735927581787, |
| "epoch": 0.06939090208172706, |
| "grad_norm": 3.421875, |
| "learning_rate": 2.0781932785525122e-06, |
| "loss": 0.3872081935405731, |
| "mean_token_accuracy": 0.8746479153633118, |
| "num_tokens": 623556.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.20446714758872986, |
| "epoch": 0.06954510408635313, |
| "grad_norm": 1.984375, |
| "learning_rate": 2.013635848520626e-06, |
| "loss": 0.21962465345859528, |
| "mean_token_accuracy": 0.9238095283508301, |
| "num_tokens": 624824.0, |
| "step": 451 |
| }, |
| { |
| "entropy": 0.18340152502059937, |
| "epoch": 0.06969930609097919, |
| "grad_norm": 1.6796875, |
| "learning_rate": 1.9500549233597453e-06, |
| "loss": 0.1832038313150406, |
| "mean_token_accuracy": 0.9371029138565063, |
| "num_tokens": 626406.0, |
| "step": 452 |
| }, |
| { |
| "entropy": 0.3325141668319702, |
| "epoch": 0.06985350809560524, |
| "grad_norm": 3.046875, |
| "learning_rate": 1.8874532041095989e-06, |
| "loss": 0.34842032194137573, |
| "mean_token_accuracy": 0.8773234486579895, |
| "num_tokens": 627221.0, |
| "step": 453 |
| }, |
| { |
| "entropy": 0.20056799054145813, |
| "epoch": 0.0700077101002313, |
| "grad_norm": 1.59375, |
| "learning_rate": 1.825833350211395e-06, |
| "loss": 0.1930190622806549, |
| "mean_token_accuracy": 0.9300291538238525, |
| "num_tokens": 628944.0, |
| "step": 454 |
| }, |
| { |
| "entropy": 0.3074391484260559, |
| "epoch": 0.07016191210485737, |
| "grad_norm": 2.671875, |
| "learning_rate": 1.7651979793947949e-06, |
| "loss": 0.320962131023407, |
| "mean_token_accuracy": 0.8794258236885071, |
| "num_tokens": 629997.0, |
| "step": 455 |
| }, |
| { |
| "entropy": 0.2851220667362213, |
| "epoch": 0.07031611410948342, |
| "grad_norm": 2.4375, |
| "learning_rate": 1.705549667566747e-06, |
| "loss": 0.305853009223938, |
| "mean_token_accuracy": 0.884324312210083, |
| "num_tokens": 630930.0, |
| "step": 456 |
| }, |
| { |
| "entropy": 0.213734969496727, |
| "epoch": 0.07047031611410948, |
| "grad_norm": 1.875, |
| "learning_rate": 1.6468909487020318e-06, |
| "loss": 0.21344000101089478, |
| "mean_token_accuracy": 0.9156540632247925, |
| "num_tokens": 632337.0, |
| "step": 457 |
| }, |
| { |
| "entropy": 0.23210836946964264, |
| "epoch": 0.07062451811873555, |
| "grad_norm": 1.8984375, |
| "learning_rate": 1.5892243147356128e-06, |
| "loss": 0.22123272716999054, |
| "mean_token_accuracy": 0.921897828578949, |
| "num_tokens": 633715.0, |
| "step": 458 |
| }, |
| { |
| "entropy": 0.2013556957244873, |
| "epoch": 0.0707787201233616, |
| "grad_norm": 1.90625, |
| "learning_rate": 1.5325522154568006e-06, |
| "loss": 0.2120433896780014, |
| "mean_token_accuracy": 0.9267473220825195, |
| "num_tokens": 635211.0, |
| "step": 459 |
| }, |
| { |
| "entropy": 0.1748819798231125, |
| "epoch": 0.07093292212798766, |
| "grad_norm": 1.40625, |
| "learning_rate": 1.4768770584051433e-06, |
| "loss": 0.16574399173259735, |
| "mean_token_accuracy": 0.9330986142158508, |
| "num_tokens": 636923.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.20135805010795593, |
| "epoch": 0.07108712413261373, |
| "grad_norm": 1.59375, |
| "learning_rate": 1.422201208768187e-06, |
| "loss": 0.20329774916172028, |
| "mean_token_accuracy": 0.9288026094436646, |
| "num_tokens": 638476.0, |
| "step": 461 |
| }, |
| { |
| "entropy": 0.19482704997062683, |
| "epoch": 0.07124132613723978, |
| "grad_norm": 1.5, |
| "learning_rate": 1.3685269892809715e-06, |
| "loss": 0.18484120070934296, |
| "mean_token_accuracy": 0.9365351796150208, |
| "num_tokens": 640233.0, |
| "step": 462 |
| }, |
| { |
| "entropy": 0.2483380138874054, |
| "epoch": 0.07139552814186584, |
| "grad_norm": 2.453125, |
| "learning_rate": 1.315856680127367e-06, |
| "loss": 0.2574044167995453, |
| "mean_token_accuracy": 0.9011474251747131, |
| "num_tokens": 641374.0, |
| "step": 463 |
| }, |
| { |
| "entropy": 0.25926902890205383, |
| "epoch": 0.07154973014649191, |
| "grad_norm": 2.03125, |
| "learning_rate": 1.2641925188432102e-06, |
| "loss": 0.2751407325267792, |
| "mean_token_accuracy": 0.9096437692642212, |
| "num_tokens": 642533.0, |
| "step": 464 |
| }, |
| { |
| "entropy": 0.19511115550994873, |
| "epoch": 0.07170393215111796, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.2135367002212321e-06, |
| "loss": 0.19707168638706207, |
| "mean_token_accuracy": 0.9302915334701538, |
| "num_tokens": 644119.0, |
| "step": 465 |
| }, |
| { |
| "entropy": 0.2082238495349884, |
| "epoch": 0.07185813415574402, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.1638913762178489e-06, |
| "loss": 0.2105921357870102, |
| "mean_token_accuracy": 0.9202454090118408, |
| "num_tokens": 645431.0, |
| "step": 466 |
| }, |
| { |
| "entropy": 0.19069121778011322, |
| "epoch": 0.07201233616037009, |
| "grad_norm": 1.6796875, |
| "learning_rate": 1.1152586558617118e-06, |
| "loss": 0.17696255445480347, |
| "mean_token_accuracy": 0.9442567825317383, |
| "num_tokens": 647215.0, |
| "step": 467 |
| }, |
| { |
| "entropy": 0.22916826605796814, |
| "epoch": 0.07216653816499614, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.0676406051641357e-06, |
| "loss": 0.22586072981357574, |
| "mean_token_accuracy": 0.9183526039123535, |
| "num_tokens": 648607.0, |
| "step": 468 |
| }, |
| { |
| "entropy": 0.26740562915802, |
| "epoch": 0.0723207401696222, |
| "grad_norm": 2.125, |
| "learning_rate": 1.0210392470313078e-06, |
| "loss": 0.2589561343193054, |
| "mean_token_accuracy": 0.9052631855010986, |
| "num_tokens": 649660.0, |
| "step": 469 |
| }, |
| { |
| "entropy": 0.22609063982963562, |
| "epoch": 0.07247494217424827, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.754565611783812e-07, |
| "loss": 0.23183754086494446, |
| "mean_token_accuracy": 0.9105263352394104, |
| "num_tokens": 650998.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.2637474536895752, |
| "epoch": 0.07262914417887432, |
| "grad_norm": 2.15625, |
| "learning_rate": 9.308944840453415e-07, |
| "loss": 0.2506449520587921, |
| "mean_token_accuracy": 0.9153633713722229, |
| "num_tokens": 652093.0, |
| "step": 471 |
| }, |
| { |
| "entropy": 0.2541276514530182, |
| "epoch": 0.07278334618350038, |
| "grad_norm": 2.125, |
| "learning_rate": 8.873549087147604e-07, |
| "loss": 0.25114259123802185, |
| "mean_token_accuracy": 0.9045345783233643, |
| "num_tokens": 653358.0, |
| "step": 472 |
| }, |
| { |
| "entropy": 0.20104283094406128, |
| "epoch": 0.07293754818812645, |
| "grad_norm": 1.5234375, |
| "learning_rate": 8.44839684831375e-07, |
| "loss": 0.18859422206878662, |
| "mean_token_accuracy": 0.9310559034347534, |
| "num_tokens": 654976.0, |
| "step": 473 |
| }, |
| { |
| "entropy": 0.23548080027103424, |
| "epoch": 0.0730917501927525, |
| "grad_norm": 2.125, |
| "learning_rate": 8.03350618523499e-07, |
| "loss": 0.2508711516857147, |
| "mean_token_accuracy": 0.9083601236343384, |
| "num_tokens": 656228.0, |
| "step": 474 |
| }, |
| { |
| "entropy": 0.2388007789850235, |
| "epoch": 0.07324595219737856, |
| "grad_norm": 1.9609375, |
| "learning_rate": 7.628894723263086e-07, |
| "loss": 0.25423818826675415, |
| "mean_token_accuracy": 0.9074475765228271, |
| "num_tokens": 657619.0, |
| "step": 475 |
| }, |
| { |
| "entropy": 0.2098216712474823, |
| "epoch": 0.07340015420200463, |
| "grad_norm": 1.6640625, |
| "learning_rate": 7.234579651069578e-07, |
| "loss": 0.19636894762516022, |
| "mean_token_accuracy": 0.9344852566719055, |
| "num_tokens": 659016.0, |
| "step": 476 |
| }, |
| { |
| "entropy": 0.19445836544036865, |
| "epoch": 0.07355435620663069, |
| "grad_norm": 1.4296875, |
| "learning_rate": 6.850577719915624e-07, |
| "loss": 0.18777857720851898, |
| "mean_token_accuracy": 0.9340922832489014, |
| "num_tokens": 660693.0, |
| "step": 477 |
| }, |
| { |
| "entropy": 0.2021363377571106, |
| "epoch": 0.07370855821125674, |
| "grad_norm": 1.65625, |
| "learning_rate": 6.47690524294034e-07, |
| "loss": 0.1869696080684662, |
| "mean_token_accuracy": 0.9333333373069763, |
| "num_tokens": 662111.0, |
| "step": 478 |
| }, |
| { |
| "entropy": 0.19528843462467194, |
| "epoch": 0.07386276021588281, |
| "grad_norm": 2.03125, |
| "learning_rate": 6.113578094467775e-07, |
| "loss": 0.17778527736663818, |
| "mean_token_accuracy": 0.9368270039558411, |
| "num_tokens": 663512.0, |
| "step": 479 |
| }, |
| { |
| "entropy": 0.17402714490890503, |
| "epoch": 0.07401696222050887, |
| "grad_norm": 1.2734375, |
| "learning_rate": 5.760611709332648e-07, |
| "loss": 0.15594635903835297, |
| "mean_token_accuracy": 0.9421712756156921, |
| "num_tokens": 665353.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.14156945049762726, |
| "epoch": 0.07417116422513492, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.418021082224472e-07, |
| "loss": 0.1273384541273117, |
| "mean_token_accuracy": 0.9454138875007629, |
| "num_tokens": 667596.0, |
| "step": 481 |
| }, |
| { |
| "entropy": 0.15703719854354858, |
| "epoch": 0.07432536622976099, |
| "grad_norm": 1.4140625, |
| "learning_rate": 5.08582076705072e-07, |
| "loss": 0.15257099270820618, |
| "mean_token_accuracy": 0.9451599717140198, |
| "num_tokens": 670011.0, |
| "step": 482 |
| }, |
| { |
| "entropy": 0.15173302590847015, |
| "epoch": 0.07447956823438705, |
| "grad_norm": 1.359375, |
| "learning_rate": 4.764024876318357e-07, |
| "loss": 0.14840558171272278, |
| "mean_token_accuracy": 0.9457720518112183, |
| "num_tokens": 672195.0, |
| "step": 483 |
| }, |
| { |
| "entropy": 0.251803457736969, |
| "epoch": 0.0746337702390131, |
| "grad_norm": 2.140625, |
| "learning_rate": 4.4526470805345554e-07, |
| "loss": 0.23033595085144043, |
| "mean_token_accuracy": 0.9138405323028564, |
| "num_tokens": 673294.0, |
| "step": 484 |
| }, |
| { |
| "entropy": 0.22149844467639923, |
| "epoch": 0.07478797224363917, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.1517006076257914e-07, |
| "loss": 0.20876595377922058, |
| "mean_token_accuracy": 0.920634925365448, |
| "num_tokens": 674751.0, |
| "step": 485 |
| }, |
| { |
| "entropy": 0.19992657005786896, |
| "epoch": 0.07494217424826523, |
| "grad_norm": 1.609375, |
| "learning_rate": 3.861198242375852e-07, |
| "loss": 0.20208041369915009, |
| "mean_token_accuracy": 0.9220055937767029, |
| "num_tokens": 676195.0, |
| "step": 486 |
| }, |
| { |
| "entropy": 0.2647709846496582, |
| "epoch": 0.07509637625289128, |
| "grad_norm": 2.09375, |
| "learning_rate": 3.581152325882825e-07, |
| "loss": 0.26581087708473206, |
| "mean_token_accuracy": 0.9089347124099731, |
| "num_tokens": 677367.0, |
| "step": 487 |
| }, |
| { |
| "entropy": 0.18075726926326752, |
| "epoch": 0.07525057825751735, |
| "grad_norm": 1.7109375, |
| "learning_rate": 3.311574755034796e-07, |
| "loss": 0.19126133620738983, |
| "mean_token_accuracy": 0.9318181872367859, |
| "num_tokens": 678959.0, |
| "step": 488 |
| }, |
| { |
| "entropy": 0.23724618554115295, |
| "epoch": 0.07540478026214341, |
| "grad_norm": 2.25, |
| "learning_rate": 3.0524769820044487e-07, |
| "loss": 0.23674722015857697, |
| "mean_token_accuracy": 0.9180327653884888, |
| "num_tokens": 680248.0, |
| "step": 489 |
| }, |
| { |
| "entropy": 0.22051914036273956, |
| "epoch": 0.07555898226676946, |
| "grad_norm": 1.8828125, |
| "learning_rate": 2.8038700137624495e-07, |
| "loss": 0.2116030901670456, |
| "mean_token_accuracy": 0.9300605058670044, |
| "num_tokens": 681743.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.1757911741733551, |
| "epoch": 0.07571318427139553, |
| "grad_norm": 1.6015625, |
| "learning_rate": 2.5657644116100497e-07, |
| "loss": 0.17098675668239594, |
| "mean_token_accuracy": 0.9406231641769409, |
| "num_tokens": 683452.0, |
| "step": 491 |
| }, |
| { |
| "entropy": 0.18268117308616638, |
| "epoch": 0.07586738627602159, |
| "grad_norm": 1.4140625, |
| "learning_rate": 2.338170290730246e-07, |
| "loss": 0.17703530192375183, |
| "mean_token_accuracy": 0.9361584782600403, |
| "num_tokens": 685277.0, |
| "step": 492 |
| }, |
| { |
| "entropy": 0.19099417328834534, |
| "epoch": 0.07602158828064765, |
| "grad_norm": 1.5, |
| "learning_rate": 2.1210973197582085e-07, |
| "loss": 0.19510860741138458, |
| "mean_token_accuracy": 0.9316887855529785, |
| "num_tokens": 686866.0, |
| "step": 493 |
| }, |
| { |
| "entropy": 0.21786467730998993, |
| "epoch": 0.07617579028527371, |
| "grad_norm": 2.078125, |
| "learning_rate": 1.9145547203703597e-07, |
| "loss": 0.2253967970609665, |
| "mean_token_accuracy": 0.9227994084358215, |
| "num_tokens": 688260.0, |
| "step": 494 |
| }, |
| { |
| "entropy": 0.22731785476207733, |
| "epoch": 0.07632999228989977, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.7185512668927706e-07, |
| "loss": 0.21878266334533691, |
| "mean_token_accuracy": 0.9235293865203857, |
| "num_tokens": 689628.0, |
| "step": 495 |
| }, |
| { |
| "entropy": 0.31587833166122437, |
| "epoch": 0.07648419429452583, |
| "grad_norm": 2.515625, |
| "learning_rate": 1.533095285928432e-07, |
| "loss": 0.31676945090293884, |
| "mean_token_accuracy": 0.8903688788414001, |
| "num_tokens": 690612.0, |
| "step": 496 |
| }, |
| { |
| "entropy": 0.22072257101535797, |
| "epoch": 0.0766383962991519, |
| "grad_norm": 1.875, |
| "learning_rate": 1.3581946560033142e-07, |
| "loss": 0.20424997806549072, |
| "mean_token_accuracy": 0.9260969758033752, |
| "num_tokens": 691919.0, |
| "step": 497 |
| }, |
| { |
| "entropy": 0.2378959059715271, |
| "epoch": 0.07679259830377795, |
| "grad_norm": 2.125, |
| "learning_rate": 1.1938568072319412e-07, |
| "loss": 0.23960573971271515, |
| "mean_token_accuracy": 0.908172607421875, |
| "num_tokens": 693016.0, |
| "step": 498 |
| }, |
| { |
| "entropy": 0.18599998950958252, |
| "epoch": 0.076946800308404, |
| "grad_norm": 1.4453125, |
| "learning_rate": 1.0400887210015586e-07, |
| "loss": 0.17737571895122528, |
| "mean_token_accuracy": 0.9337517619132996, |
| "num_tokens": 694458.0, |
| "step": 499 |
| }, |
| { |
| "entropy": 0.1896909922361374, |
| "epoch": 0.07710100231303008, |
| "grad_norm": 1.5546875, |
| "learning_rate": 8.968969296756224e-08, |
| "loss": 0.1934422105550766, |
| "mean_token_accuracy": 0.9257456064224243, |
| "num_tokens": 696109.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.18347270786762238, |
| "epoch": 0.07725520431765613, |
| "grad_norm": 1.5, |
| "learning_rate": 7.642875163162977e-08, |
| "loss": 0.17866890132427216, |
| "mean_token_accuracy": 0.9312201142311096, |
| "num_tokens": 697789.0, |
| "step": 501 |
| }, |
| { |
| "entropy": 0.27496322989463806, |
| "epoch": 0.07740940632228219, |
| "grad_norm": 2.375, |
| "learning_rate": 6.422661144259989e-08, |
| "loss": 0.2631693482398987, |
| "mean_token_accuracy": 0.9099326729774475, |
| "num_tokens": 698985.0, |
| "step": 502 |
| }, |
| { |
| "entropy": 0.21727091073989868, |
| "epoch": 0.07756360832690826, |
| "grad_norm": 1.984375, |
| "learning_rate": 5.308379077080816e-08, |
| "loss": 0.22967125475406647, |
| "mean_token_accuracy": 0.9191842675209045, |
| "num_tokens": 700317.0, |
| "step": 503 |
| }, |
| { |
| "entropy": 0.21876828372478485, |
| "epoch": 0.07771781033153431, |
| "grad_norm": 1.8359375, |
| "learning_rate": 4.300076298466571e-08, |
| "loss": 0.22112873196601868, |
| "mean_token_accuracy": 0.9281525015830994, |
| "num_tokens": 701689.0, |
| "step": 504 |
| }, |
| { |
| "entropy": 0.1745622456073761, |
| "epoch": 0.07787201233616037, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.3977956430547576e-08, |
| "loss": 0.1722312867641449, |
| "mean_token_accuracy": 0.9407705664634705, |
| "num_tokens": 703436.0, |
| "step": 505 |
| }, |
| { |
| "entropy": 0.22518332302570343, |
| "epoch": 0.07802621434078642, |
| "grad_norm": 1.7421875, |
| "learning_rate": 2.6015754414593363e-08, |
| "loss": 0.22960630059242249, |
| "mean_token_accuracy": 0.9271809458732605, |
| "num_tokens": 704831.0, |
| "step": 506 |
| }, |
| { |
| "entropy": 0.2924734055995941, |
| "epoch": 0.07818041634541249, |
| "grad_norm": 2.609375, |
| "learning_rate": 1.911449518643138e-08, |
| "loss": 0.28948456048965454, |
| "mean_token_accuracy": 0.8898043036460876, |
| "num_tokens": 705810.0, |
| "step": 507 |
| }, |
| { |
| "entropy": 0.29374387860298157, |
| "epoch": 0.07833461835003855, |
| "grad_norm": 2.53125, |
| "learning_rate": 1.3274471924798471e-08, |
| "loss": 0.2914823293685913, |
| "mean_token_accuracy": 0.9072463512420654, |
| "num_tokens": 706853.0, |
| "step": 508 |
| }, |
| { |
| "entropy": 0.18828892707824707, |
| "epoch": 0.0784888203546646, |
| "grad_norm": 1.5390625, |
| "learning_rate": 8.495932725094414e-09, |
| "loss": 0.19034327566623688, |
| "mean_token_accuracy": 0.9341492056846619, |
| "num_tokens": 708577.0, |
| "step": 509 |
| }, |
| { |
| "entropy": 0.3254898488521576, |
| "epoch": 0.07864302235929067, |
| "grad_norm": 2.6875, |
| "learning_rate": 4.779080588834806e-09, |
| "loss": 0.3536283075809479, |
| "mean_token_accuracy": 0.8856015801429749, |
| "num_tokens": 709599.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.19601193070411682, |
| "epoch": 0.07879722436391673, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.124073415030181e-09, |
| "loss": 0.19777625799179077, |
| "mean_token_accuracy": 0.9288975596427917, |
| "num_tokens": 711140.0, |
| "step": 511 |
| }, |
| { |
| "entropy": 0.22275681793689728, |
| "epoch": 0.07895142636854278, |
| "grad_norm": 2.234375, |
| "learning_rate": 5.310239934885885e-10, |
| "loss": 0.23580928146839142, |
| "mean_token_accuracy": 0.9170305728912354, |
| "num_tokens": 712522.0, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.07895142636854278, |
| "eval_entropy": 0.22568650308408236, |
| "eval_loss": 0.22544851899147034, |
| "eval_mean_token_accuracy": 0.919665330160431, |
| "eval_num_tokens": 712522.0, |
| "eval_runtime": 34.9909, |
| "eval_samples_per_second": 78.049, |
| "eval_steps_per_second": 9.774, |
| "step": 512 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 512, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 128, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.4536491036033024e+16, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|