TopologicalQwen / trainer_state (3).json
reaperdoesntknow's picture
Upload 2 files
4c89ea0 verified
{
"best_global_step": 512,
"best_metric": 0.22544851899147034,
"best_model_checkpoint": "DQwen3-1.7B-uncensored/checkpoint-512",
"epoch": 0.07895142636854278,
"eval_steps": 128,
"global_step": 512,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.9247070550918579,
"epoch": 0.00015420200462606013,
"grad_norm": 31.125,
"learning_rate": 0.0,
"loss": 2.181769847869873,
"mean_token_accuracy": 0.6662116050720215,
"num_tokens": 1473.0,
"step": 1
},
{
"entropy": 1.0056357383728027,
"epoch": 0.00030840400925212025,
"grad_norm": 43.0,
"learning_rate": 1.6666666666666667e-06,
"loss": 2.696769952774048,
"mean_token_accuracy": 0.617977499961853,
"num_tokens": 2460.0,
"step": 2
},
{
"entropy": 1.0802278518676758,
"epoch": 0.0004626060138781804,
"grad_norm": 41.5,
"learning_rate": 3.3333333333333333e-06,
"loss": 2.6830270290374756,
"mean_token_accuracy": 0.6033275127410889,
"num_tokens": 3610.0,
"step": 3
},
{
"entropy": 0.9580708742141724,
"epoch": 0.0006168080185042405,
"grad_norm": 33.75,
"learning_rate": 5e-06,
"loss": 2.234797716140747,
"mean_token_accuracy": 0.6668869853019714,
"num_tokens": 5131.0,
"step": 4
},
{
"entropy": 1.0076204538345337,
"epoch": 0.0007710100231303007,
"grad_norm": 34.75,
"learning_rate": 6.666666666666667e-06,
"loss": 2.374027729034424,
"mean_token_accuracy": 0.6405493021011353,
"num_tokens": 6377.0,
"step": 5
},
{
"entropy": 1.0595803260803223,
"epoch": 0.0009252120277563608,
"grad_norm": 34.25,
"learning_rate": 8.333333333333334e-06,
"loss": 2.216482400894165,
"mean_token_accuracy": 0.6401006579399109,
"num_tokens": 7577.0,
"step": 6
},
{
"entropy": 1.1813561916351318,
"epoch": 0.001079414032382421,
"grad_norm": 25.375,
"learning_rate": 1e-05,
"loss": 2.0130465030670166,
"mean_token_accuracy": 0.6524437665939331,
"num_tokens": 8874.0,
"step": 7
},
{
"entropy": 1.4208881855010986,
"epoch": 0.001233616037008481,
"grad_norm": 24.125,
"learning_rate": 1.1666666666666668e-05,
"loss": 2.3102152347564697,
"mean_token_accuracy": 0.5884244441986084,
"num_tokens": 9815.0,
"step": 8
},
{
"entropy": 0.8640963435173035,
"epoch": 0.0013878180416345412,
"grad_norm": 12.3125,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.3446450233459473,
"mean_token_accuracy": 0.7377659678459167,
"num_tokens": 11703.0,
"step": 9
},
{
"entropy": 1.0817725658416748,
"epoch": 0.0015420200462606013,
"grad_norm": 14.125,
"learning_rate": 1.5e-05,
"loss": 1.6320915222167969,
"mean_token_accuracy": 0.6808972358703613,
"num_tokens": 13093.0,
"step": 10
},
{
"entropy": 0.9536824226379395,
"epoch": 0.0016962220508866615,
"grad_norm": 10.5,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.2360026836395264,
"mean_token_accuracy": 0.741475522518158,
"num_tokens": 14714.0,
"step": 11
},
{
"entropy": 1.049913763999939,
"epoch": 0.0018504240555127216,
"grad_norm": 10.875,
"learning_rate": 1.8333333333333333e-05,
"loss": 1.2552467584609985,
"mean_token_accuracy": 0.7327285408973694,
"num_tokens": 16155.0,
"step": 12
},
{
"entropy": 0.7939231395721436,
"epoch": 0.0020046260601387818,
"grad_norm": 7.28125,
"learning_rate": 2e-05,
"loss": 0.8604422211647034,
"mean_token_accuracy": 0.7944584488868713,
"num_tokens": 18148.0,
"step": 13
},
{
"entropy": 0.9421704411506653,
"epoch": 0.002158828064764842,
"grad_norm": 9.375,
"learning_rate": 2.1666666666666667e-05,
"loss": 0.9789397716522217,
"mean_token_accuracy": 0.7728531956672668,
"num_tokens": 19600.0,
"step": 14
},
{
"entropy": 1.101209044456482,
"epoch": 0.002313030069390902,
"grad_norm": 9.5,
"learning_rate": 2.3333333333333336e-05,
"loss": 1.1167230606079102,
"mean_token_accuracy": 0.759087085723877,
"num_tokens": 20791.0,
"step": 15
},
{
"entropy": 0.8545694351196289,
"epoch": 0.002467232074016962,
"grad_norm": 8.875,
"learning_rate": 2.5e-05,
"loss": 0.8782606720924377,
"mean_token_accuracy": 0.8138889074325562,
"num_tokens": 22239.0,
"step": 16
},
{
"entropy": 0.5610961318016052,
"epoch": 0.0026214340786430224,
"grad_norm": 5.5,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.5257444381713867,
"mean_token_accuracy": 0.8684759736061096,
"num_tokens": 24642.0,
"step": 17
},
{
"entropy": 0.9791864156723022,
"epoch": 0.0027756360832690823,
"grad_norm": 12.0,
"learning_rate": 2.8333333333333335e-05,
"loss": 1.0223743915557861,
"mean_token_accuracy": 0.780053436756134,
"num_tokens": 25773.0,
"step": 18
},
{
"entropy": 0.6505466103553772,
"epoch": 0.0029298380878951427,
"grad_norm": 7.34375,
"learning_rate": 3e-05,
"loss": 0.5793496370315552,
"mean_token_accuracy": 0.8571428656578064,
"num_tokens": 27258.0,
"step": 19
},
{
"entropy": 0.8408939242362976,
"epoch": 0.0030840400925212026,
"grad_norm": 8.1875,
"learning_rate": 3.1666666666666666e-05,
"loss": 0.7403033375740051,
"mean_token_accuracy": 0.8192341923713684,
"num_tokens": 28389.0,
"step": 20
},
{
"entropy": 0.6034911274909973,
"epoch": 0.003238242097147263,
"grad_norm": 9.125,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.5590517520904541,
"mean_token_accuracy": 0.8626410365104675,
"num_tokens": 29904.0,
"step": 21
},
{
"entropy": 0.9795640110969543,
"epoch": 0.003392444101773323,
"grad_norm": 13.25,
"learning_rate": 3.5e-05,
"loss": 0.9148629307746887,
"mean_token_accuracy": 0.7654135227203369,
"num_tokens": 30577.0,
"step": 22
},
{
"entropy": 0.7439587712287903,
"epoch": 0.0035466461063993833,
"grad_norm": 7.25,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.5953554511070251,
"mean_token_accuracy": 0.8356807231903076,
"num_tokens": 31863.0,
"step": 23
},
{
"entropy": 0.7064525485038757,
"epoch": 0.0037008481110254433,
"grad_norm": 12.3125,
"learning_rate": 3.8333333333333334e-05,
"loss": 0.6644932627677917,
"mean_token_accuracy": 0.8366890549659729,
"num_tokens": 32765.0,
"step": 24
},
{
"entropy": 0.671248733997345,
"epoch": 0.0038550501156515036,
"grad_norm": 10.6875,
"learning_rate": 4e-05,
"loss": 0.6164469718933105,
"mean_token_accuracy": 0.826787531375885,
"num_tokens": 33766.0,
"step": 25
},
{
"entropy": 0.5062084794044495,
"epoch": 0.0040092521202775636,
"grad_norm": 5.4375,
"learning_rate": 4.166666666666667e-05,
"loss": 0.40353134274482727,
"mean_token_accuracy": 0.8768283128738403,
"num_tokens": 35073.0,
"step": 26
},
{
"entropy": 0.5384562015533447,
"epoch": 0.0041634541249036235,
"grad_norm": 5.78125,
"learning_rate": 4.3333333333333334e-05,
"loss": 0.5431831479072571,
"mean_token_accuracy": 0.8347339034080505,
"num_tokens": 36152.0,
"step": 27
},
{
"entropy": 0.4401922821998596,
"epoch": 0.004317656129529684,
"grad_norm": 4.0,
"learning_rate": 4.5e-05,
"loss": 0.412110298871994,
"mean_token_accuracy": 0.8691341876983643,
"num_tokens": 37673.0,
"step": 28
},
{
"entropy": 0.3748137652873993,
"epoch": 0.004471858134155744,
"grad_norm": 3.484375,
"learning_rate": 4.666666666666667e-05,
"loss": 0.360552042722702,
"mean_token_accuracy": 0.8896728754043579,
"num_tokens": 39240.0,
"step": 29
},
{
"entropy": 0.45826223492622375,
"epoch": 0.004626060138781804,
"grad_norm": 4.53125,
"learning_rate": 4.8333333333333334e-05,
"loss": 0.455470472574234,
"mean_token_accuracy": 0.8571428656578064,
"num_tokens": 40256.0,
"step": 30
},
{
"entropy": 0.33488166332244873,
"epoch": 0.004780262143407864,
"grad_norm": 3.9375,
"learning_rate": 5e-05,
"loss": 0.3165741562843323,
"mean_token_accuracy": 0.8937432765960693,
"num_tokens": 42118.0,
"step": 31
},
{
"entropy": 0.3528728783130646,
"epoch": 0.004934464148033924,
"grad_norm": 7.6875,
"learning_rate": 4.9999468976006514e-05,
"loss": 0.38620343804359436,
"mean_token_accuracy": 0.8836023807525635,
"num_tokens": 43303.0,
"step": 32
},
{
"entropy": 0.36625728011131287,
"epoch": 0.005088666152659985,
"grad_norm": 4.84375,
"learning_rate": 4.999787592658497e-05,
"loss": 0.446200966835022,
"mean_token_accuracy": 0.8777328133583069,
"num_tokens": 44546.0,
"step": 33
},
{
"entropy": 0.26643213629722595,
"epoch": 0.005242868157286045,
"grad_norm": 4.40625,
"learning_rate": 4.999522091941117e-05,
"loss": 0.2737399637699127,
"mean_token_accuracy": 0.8979820609092712,
"num_tokens": 46338.0,
"step": 34
},
{
"entropy": 0.26414045691490173,
"epoch": 0.005397070161912105,
"grad_norm": 3.296875,
"learning_rate": 4.999150406727491e-05,
"loss": 0.2983474135398865,
"mean_token_accuracy": 0.9013499617576599,
"num_tokens": 48272.0,
"step": 35
},
{
"entropy": 0.370844304561615,
"epoch": 0.005551272166538165,
"grad_norm": 3.546875,
"learning_rate": 4.9986725528075205e-05,
"loss": 0.3454509377479553,
"mean_token_accuracy": 0.8820555806159973,
"num_tokens": 49467.0,
"step": 36
},
{
"entropy": 0.2591751515865326,
"epoch": 0.0057054741711642255,
"grad_norm": 2.8125,
"learning_rate": 4.998088550481357e-05,
"loss": 0.2637964189052582,
"mean_token_accuracy": 0.9097625613212585,
"num_tokens": 51370.0,
"step": 37
},
{
"entropy": 0.30300796031951904,
"epoch": 0.005859676175790285,
"grad_norm": 2.71875,
"learning_rate": 4.997398424558541e-05,
"loss": 0.30886220932006836,
"mean_token_accuracy": 0.901874303817749,
"num_tokens": 53192.0,
"step": 38
},
{
"entropy": 0.27792277932167053,
"epoch": 0.006013878180416345,
"grad_norm": 2.6875,
"learning_rate": 4.996602204356945e-05,
"loss": 0.2732873558998108,
"mean_token_accuracy": 0.9054564833641052,
"num_tokens": 55051.0,
"step": 39
},
{
"entropy": 0.2960520088672638,
"epoch": 0.006168080185042405,
"grad_norm": 2.390625,
"learning_rate": 4.9956999237015336e-05,
"loss": 0.28190645575523376,
"mean_token_accuracy": 0.8957963585853577,
"num_tokens": 56748.0,
"step": 40
},
{
"entropy": 0.27777737379074097,
"epoch": 0.006322282189668466,
"grad_norm": 2.390625,
"learning_rate": 4.994691620922919e-05,
"loss": 0.2448980063199997,
"mean_token_accuracy": 0.918410062789917,
"num_tokens": 58668.0,
"step": 41
},
{
"entropy": 0.3428345024585724,
"epoch": 0.006476484194294526,
"grad_norm": 5.15625,
"learning_rate": 4.993577338855741e-05,
"loss": 0.354027658700943,
"mean_token_accuracy": 0.8893527984619141,
"num_tokens": 60113.0,
"step": 42
},
{
"entropy": 0.30580252408981323,
"epoch": 0.006630686198920586,
"grad_norm": 2.75,
"learning_rate": 4.9923571248368375e-05,
"loss": 0.28440362215042114,
"mean_token_accuracy": 0.9064558744430542,
"num_tokens": 61639.0,
"step": 43
},
{
"entropy": 0.43973830342292786,
"epoch": 0.006784888203546646,
"grad_norm": 4.5625,
"learning_rate": 4.991031030703244e-05,
"loss": 0.4269028902053833,
"mean_token_accuracy": 0.859668493270874,
"num_tokens": 62552.0,
"step": 44
},
{
"entropy": 0.33187365531921387,
"epoch": 0.006939090208172706,
"grad_norm": 3.546875,
"learning_rate": 4.989599112789984e-05,
"loss": 0.34638962149620056,
"mean_token_accuracy": 0.8819671869277954,
"num_tokens": 63780.0,
"step": 45
},
{
"entropy": 0.31787073612213135,
"epoch": 0.007093292212798767,
"grad_norm": 3.703125,
"learning_rate": 4.988061431927681e-05,
"loss": 0.3301301598548889,
"mean_token_accuracy": 0.8874788284301758,
"num_tokens": 64970.0,
"step": 46
},
{
"entropy": 0.2509302496910095,
"epoch": 0.0072474942174248266,
"grad_norm": 3.375,
"learning_rate": 4.9864180534399674e-05,
"loss": 0.2752370238304138,
"mean_token_accuracy": 0.9063180685043335,
"num_tokens": 66814.0,
"step": 47
},
{
"entropy": 0.28000396490097046,
"epoch": 0.0074016962220508865,
"grad_norm": 3.46875,
"learning_rate": 4.984669047140716e-05,
"loss": 0.3101637363433838,
"mean_token_accuracy": 0.8998599648475647,
"num_tokens": 68250.0,
"step": 48
},
{
"entropy": 0.263468474149704,
"epoch": 0.0075558982266769464,
"grad_norm": 3.046875,
"learning_rate": 4.982814487331072e-05,
"loss": 0.29188624024391174,
"mean_token_accuracy": 0.9051411747932434,
"num_tokens": 69639.0,
"step": 49
},
{
"entropy": 0.30314338207244873,
"epoch": 0.007710100231303007,
"grad_norm": 3.296875,
"learning_rate": 4.9808544527962964e-05,
"loss": 0.3129803538322449,
"mean_token_accuracy": 0.8961228728294373,
"num_tokens": 71014.0,
"step": 50
},
{
"entropy": 0.299335241317749,
"epoch": 0.007864302235929066,
"grad_norm": 2.9375,
"learning_rate": 4.978789026802419e-05,
"loss": 0.3139159679412842,
"mean_token_accuracy": 0.8981788158416748,
"num_tokens": 72230.0,
"step": 51
},
{
"entropy": 0.33503401279449463,
"epoch": 0.008018504240555127,
"grad_norm": 3.265625,
"learning_rate": 4.9766182970926975e-05,
"loss": 0.3325449526309967,
"mean_token_accuracy": 0.8837863206863403,
"num_tokens": 73305.0,
"step": 52
},
{
"entropy": 0.3575332760810852,
"epoch": 0.008172706245181188,
"grad_norm": 3.21875,
"learning_rate": 4.9743423558839e-05,
"loss": 0.3531642258167267,
"mean_token_accuracy": 0.8720735907554626,
"num_tokens": 74509.0,
"step": 53
},
{
"entropy": 0.3179467022418976,
"epoch": 0.008326908249807247,
"grad_norm": 2.90625,
"learning_rate": 4.971961299862376e-05,
"loss": 0.32219475507736206,
"mean_token_accuracy": 0.898162305355072,
"num_tokens": 75823.0,
"step": 54
},
{
"entropy": 0.27880457043647766,
"epoch": 0.008481110254433308,
"grad_norm": 5.5625,
"learning_rate": 4.9694752301799566e-05,
"loss": 0.2741078734397888,
"mean_token_accuracy": 0.9016189575195312,
"num_tokens": 77437.0,
"step": 55
},
{
"entropy": 0.34488171339035034,
"epoch": 0.008635312259059369,
"grad_norm": 3.90625,
"learning_rate": 4.9668842524496526e-05,
"loss": 0.366953045129776,
"mean_token_accuracy": 0.8817635178565979,
"num_tokens": 78942.0,
"step": 56
},
{
"entropy": 0.2759552299976349,
"epoch": 0.008789514263685428,
"grad_norm": 2.390625,
"learning_rate": 4.9641884767411714e-05,
"loss": 0.23074716329574585,
"mean_token_accuracy": 0.9223560690879822,
"num_tokens": 80444.0,
"step": 57
},
{
"entropy": 0.2680215835571289,
"epoch": 0.008943716268311488,
"grad_norm": 2.375,
"learning_rate": 4.9613880175762414e-05,
"loss": 0.2504393458366394,
"mean_token_accuracy": 0.9081172347068787,
"num_tokens": 82226.0,
"step": 58
},
{
"entropy": 0.3482436537742615,
"epoch": 0.009097918272937548,
"grad_norm": 2.859375,
"learning_rate": 4.958482993923742e-05,
"loss": 0.3350726068019867,
"mean_token_accuracy": 0.8843283653259277,
"num_tokens": 83306.0,
"step": 59
},
{
"entropy": 0.3756292164325714,
"epoch": 0.009252120277563608,
"grad_norm": 2.75,
"learning_rate": 4.955473529194654e-05,
"loss": 0.3661136329174042,
"mean_token_accuracy": 0.8703535795211792,
"num_tokens": 84417.0,
"step": 60
},
{
"entropy": 0.30815669894218445,
"epoch": 0.00940632228218967,
"grad_norm": 2.546875,
"learning_rate": 4.952359751236817e-05,
"loss": 0.2678232789039612,
"mean_token_accuracy": 0.9128242135047913,
"num_tokens": 85813.0,
"step": 61
},
{
"entropy": 0.2885396480560303,
"epoch": 0.009560524286815728,
"grad_norm": 2.28125,
"learning_rate": 4.9491417923294934e-05,
"loss": 0.2961139976978302,
"mean_token_accuracy": 0.897884726524353,
"num_tokens": 87192.0,
"step": 62
},
{
"entropy": 0.3488570749759674,
"epoch": 0.009714726291441789,
"grad_norm": 4.09375,
"learning_rate": 4.9458197891777556e-05,
"loss": 0.41754454374313354,
"mean_token_accuracy": 0.8623949289321899,
"num_tokens": 88152.0,
"step": 63
},
{
"entropy": 0.30880776047706604,
"epoch": 0.009868928296067848,
"grad_norm": 3.59375,
"learning_rate": 4.942393882906674e-05,
"loss": 0.3697586953639984,
"mean_token_accuracy": 0.8826446533203125,
"num_tokens": 89370.0,
"step": 64
},
{
"entropy": 0.33239686489105225,
"epoch": 0.010023130300693909,
"grad_norm": 3.078125,
"learning_rate": 4.9388642190553226e-05,
"loss": 0.3398675322532654,
"mean_token_accuracy": 0.8863636255264282,
"num_tokens": 90478.0,
"step": 65
},
{
"entropy": 0.2123962938785553,
"epoch": 0.01017733230531997,
"grad_norm": 2.65625,
"learning_rate": 4.935230947570597e-05,
"loss": 0.24962244927883148,
"mean_token_accuracy": 0.9207471013069153,
"num_tokens": 92467.0,
"step": 66
},
{
"entropy": 0.2466113418340683,
"epoch": 0.010331534309946029,
"grad_norm": 2.984375,
"learning_rate": 4.931494222800844e-05,
"loss": 0.2969174385070801,
"mean_token_accuracy": 0.8992460370063782,
"num_tokens": 93934.0,
"step": 67
},
{
"entropy": 0.2843821942806244,
"epoch": 0.01048573631457209,
"grad_norm": 2.765625,
"learning_rate": 4.9276542034893044e-05,
"loss": 0.32256507873535156,
"mean_token_accuracy": 0.8970125913619995,
"num_tokens": 95214.0,
"step": 68
},
{
"entropy": 0.2835680842399597,
"epoch": 0.01063993831919815,
"grad_norm": 2.359375,
"learning_rate": 4.923711052767369e-05,
"loss": 0.25358864665031433,
"mean_token_accuracy": 0.9075269103050232,
"num_tokens": 96617.0,
"step": 69
},
{
"entropy": 0.24416741728782654,
"epoch": 0.01079414032382421,
"grad_norm": 2.453125,
"learning_rate": 4.9196649381476504e-05,
"loss": 0.2399137169122696,
"mean_token_accuracy": 0.9191176295280457,
"num_tokens": 98257.0,
"step": 70
},
{
"entropy": 0.29682713747024536,
"epoch": 0.01094834232845027,
"grad_norm": 3.109375,
"learning_rate": 4.915516031516863e-05,
"loss": 0.3011798858642578,
"mean_token_accuracy": 0.9019886255264282,
"num_tokens": 99673.0,
"step": 71
},
{
"entropy": 0.31920552253723145,
"epoch": 0.01110254433307633,
"grad_norm": 2.84375,
"learning_rate": 4.911264509128524e-05,
"loss": 0.3012612462043762,
"mean_token_accuracy": 0.8963922262191772,
"num_tokens": 100762.0,
"step": 72
},
{
"entropy": 0.30763909220695496,
"epoch": 0.01125674633770239,
"grad_norm": 2.5,
"learning_rate": 4.906910551595466e-05,
"loss": 0.2967263460159302,
"mean_token_accuracy": 0.9000924825668335,
"num_tokens": 101851.0,
"step": 73
},
{
"entropy": 0.3040732145309448,
"epoch": 0.011410948342328451,
"grad_norm": 3.0625,
"learning_rate": 4.902454343882162e-05,
"loss": 0.3297285735607147,
"mean_token_accuracy": 0.8881889581680298,
"num_tokens": 103129.0,
"step": 74
},
{
"entropy": 0.2837064564228058,
"epoch": 0.01156515034695451,
"grad_norm": 2.546875,
"learning_rate": 4.8978960752968695e-05,
"loss": 0.28480246663093567,
"mean_token_accuracy": 0.8954154849052429,
"num_tokens": 104533.0,
"step": 75
},
{
"entropy": 0.3194725811481476,
"epoch": 0.01171935235158057,
"grad_norm": 3.171875,
"learning_rate": 4.893235939483587e-05,
"loss": 0.3251062333583832,
"mean_token_accuracy": 0.8847517967224121,
"num_tokens": 105669.0,
"step": 76
},
{
"entropy": 0.2741483747959137,
"epoch": 0.01187355435620663,
"grad_norm": 2.34375,
"learning_rate": 4.8884741344138294e-05,
"loss": 0.2672386169433594,
"mean_token_accuracy": 0.9074759483337402,
"num_tokens": 107028.0,
"step": 77
},
{
"entropy": 0.2283252328634262,
"epoch": 0.01202775636083269,
"grad_norm": 2.265625,
"learning_rate": 4.8836108623782154e-05,
"loss": 0.23968154191970825,
"mean_token_accuracy": 0.9172775149345398,
"num_tokens": 108946.0,
"step": 78
},
{
"entropy": 0.3795450031757355,
"epoch": 0.012181958365458751,
"grad_norm": 2.921875,
"learning_rate": 4.8786463299778773e-05,
"loss": 0.4119304120540619,
"mean_token_accuracy": 0.8639523386955261,
"num_tokens": 109961.0,
"step": 79
},
{
"entropy": 0.3368295431137085,
"epoch": 0.01233616037008481,
"grad_norm": 2.984375,
"learning_rate": 4.873580748115679e-05,
"loss": 0.3614250719547272,
"mean_token_accuracy": 0.8688679337501526,
"num_tokens": 111029.0,
"step": 80
},
{
"entropy": 0.2857000231742859,
"epoch": 0.012490362374710871,
"grad_norm": 2.890625,
"learning_rate": 4.8684143319872636e-05,
"loss": 0.2805139422416687,
"mean_token_accuracy": 0.8976377844810486,
"num_tokens": 112307.0,
"step": 81
},
{
"entropy": 0.3508206605911255,
"epoch": 0.012644564379336932,
"grad_norm": 2.6875,
"learning_rate": 4.863147301071903e-05,
"loss": 0.3427751362323761,
"mean_token_accuracy": 0.8861867785453796,
"num_tokens": 113343.0,
"step": 82
},
{
"entropy": 0.33843186497688293,
"epoch": 0.012798766383962991,
"grad_norm": 2.796875,
"learning_rate": 4.8577798791231815e-05,
"loss": 0.32030197978019714,
"mean_token_accuracy": 0.8884462118148804,
"num_tokens": 114606.0,
"step": 83
},
{
"entropy": 0.2596885859966278,
"epoch": 0.012952968388589052,
"grad_norm": 2.15625,
"learning_rate": 4.852312294159486e-05,
"loss": 0.2479410469532013,
"mean_token_accuracy": 0.916926920413971,
"num_tokens": 116215.0,
"step": 84
},
{
"entropy": 0.3047274947166443,
"epoch": 0.013107170393215111,
"grad_norm": 3.03125,
"learning_rate": 4.8467447784543205e-05,
"loss": 0.30305323004722595,
"mean_token_accuracy": 0.8943606019020081,
"num_tokens": 117482.0,
"step": 85
},
{
"entropy": 0.18909737467765808,
"epoch": 0.013261372397841172,
"grad_norm": 1.984375,
"learning_rate": 4.841077568526439e-05,
"loss": 0.2026541382074356,
"mean_token_accuracy": 0.9290540814399719,
"num_tokens": 119858.0,
"step": 86
},
{
"entropy": 0.2918696403503418,
"epoch": 0.013415574402467233,
"grad_norm": 2.953125,
"learning_rate": 4.8353109051297976e-05,
"loss": 0.3184109330177307,
"mean_token_accuracy": 0.8954508900642395,
"num_tokens": 121119.0,
"step": 87
},
{
"entropy": 0.3284008800983429,
"epoch": 0.013569776407093292,
"grad_norm": 2.953125,
"learning_rate": 4.829445033243326e-05,
"loss": 0.3101221024990082,
"mean_token_accuracy": 0.8810949325561523,
"num_tokens": 122296.0,
"step": 88
},
{
"entropy": 0.2881852090358734,
"epoch": 0.013723978411719353,
"grad_norm": 2.375,
"learning_rate": 4.823480202060521e-05,
"loss": 0.28734254837036133,
"mean_token_accuracy": 0.897292971611023,
"num_tokens": 123560.0,
"step": 89
},
{
"entropy": 0.3923459053039551,
"epoch": 0.013878180416345412,
"grad_norm": 3.40625,
"learning_rate": 4.817416664978861e-05,
"loss": 0.4181320071220398,
"mean_token_accuracy": 0.857782781124115,
"num_tokens": 124461.0,
"step": 90
},
{
"entropy": 0.38047465682029724,
"epoch": 0.014032382420971472,
"grad_norm": 3.296875,
"learning_rate": 4.81125467958904e-05,
"loss": 0.4139612317085266,
"mean_token_accuracy": 0.8700189590454102,
"num_tokens": 125523.0,
"step": 91
},
{
"entropy": 0.32315686345100403,
"epoch": 0.014186584425597533,
"grad_norm": 3.125,
"learning_rate": 4.804994507664026e-05,
"loss": 0.29804831743240356,
"mean_token_accuracy": 0.9056603908538818,
"num_tokens": 126962.0,
"step": 92
},
{
"entropy": 0.3598167598247528,
"epoch": 0.014340786430223592,
"grad_norm": 4.375,
"learning_rate": 4.798636415147938e-05,
"loss": 0.33338406682014465,
"mean_token_accuracy": 0.876142144203186,
"num_tokens": 127955.0,
"step": 93
},
{
"entropy": 0.2664312422275543,
"epoch": 0.014494988434849653,
"grad_norm": 2.46875,
"learning_rate": 4.7921806721447494e-05,
"loss": 0.24038437008857727,
"mean_token_accuracy": 0.9096692204475403,
"num_tokens": 129535.0,
"step": 94
},
{
"entropy": 0.40390363335609436,
"epoch": 0.014649190439475714,
"grad_norm": 5.46875,
"learning_rate": 4.785627552906816e-05,
"loss": 0.39077234268188477,
"mean_token_accuracy": 0.8630303144454956,
"num_tokens": 130368.0,
"step": 95
},
{
"entropy": 0.25566768646240234,
"epoch": 0.014803392444101773,
"grad_norm": 2.359375,
"learning_rate": 4.77897733582322e-05,
"loss": 0.24936963617801666,
"mean_token_accuracy": 0.9094488024711609,
"num_tokens": 132154.0,
"step": 96
},
{
"entropy": 0.27786779403686523,
"epoch": 0.014957594448727834,
"grad_norm": 2.65625,
"learning_rate": 4.77223030340795e-05,
"loss": 0.26183679699897766,
"mean_token_accuracy": 0.9076694250106812,
"num_tokens": 133505.0,
"step": 97
},
{
"entropy": 0.2191360741853714,
"epoch": 0.015111796453353893,
"grad_norm": 1.7109375,
"learning_rate": 4.7653867422878926e-05,
"loss": 0.20657718181610107,
"mean_token_accuracy": 0.9271235466003418,
"num_tokens": 135585.0,
"step": 98
},
{
"entropy": 0.24974940717220306,
"epoch": 0.015265998457979954,
"grad_norm": 2.6875,
"learning_rate": 4.758446943190661e-05,
"loss": 0.2656131982803345,
"mean_token_accuracy": 0.9067688584327698,
"num_tokens": 137159.0,
"step": 99
},
{
"entropy": 0.2313736230134964,
"epoch": 0.015420200462606014,
"grad_norm": 2.046875,
"learning_rate": 4.751411200932242e-05,
"loss": 0.23317928612232208,
"mean_token_accuracy": 0.9191856980323792,
"num_tokens": 138788.0,
"step": 100
},
{
"entropy": 0.2997652590274811,
"epoch": 0.015574402467232074,
"grad_norm": 2.4375,
"learning_rate": 4.7442798144044695e-05,
"loss": 0.3120857775211334,
"mean_token_accuracy": 0.8966366052627563,
"num_tokens": 140015.0,
"step": 101
},
{
"entropy": 0.3081951141357422,
"epoch": 0.015728604471858133,
"grad_norm": 2.6875,
"learning_rate": 4.7370530865623334e-05,
"loss": 0.34071362018585205,
"mean_token_accuracy": 0.8860557675361633,
"num_tokens": 141278.0,
"step": 102
},
{
"entropy": 0.2699045240879059,
"epoch": 0.015882806476484195,
"grad_norm": 2.234375,
"learning_rate": 4.729731324411104e-05,
"loss": 0.27989721298217773,
"mean_token_accuracy": 0.90031898021698,
"num_tokens": 142540.0,
"step": 103
},
{
"entropy": 0.2760254144668579,
"epoch": 0.016037008481110254,
"grad_norm": 2.234375,
"learning_rate": 4.722314838993291e-05,
"loss": 0.3070385158061981,
"mean_token_accuracy": 0.9090268015861511,
"num_tokens": 143966.0,
"step": 104
},
{
"entropy": 0.29715025424957275,
"epoch": 0.016191210485736313,
"grad_norm": 2.703125,
"learning_rate": 4.714803945375431e-05,
"loss": 0.3124261796474457,
"mean_token_accuracy": 0.8967213034629822,
"num_tokens": 145194.0,
"step": 105
},
{
"entropy": 0.3180467486381531,
"epoch": 0.016345412490362376,
"grad_norm": 2.8125,
"learning_rate": 4.707198962634701e-05,
"loss": 0.3431381285190582,
"mean_token_accuracy": 0.8840726017951965,
"num_tokens": 146194.0,
"step": 106
},
{
"entropy": 0.25070613622665405,
"epoch": 0.016499614494988435,
"grad_norm": 2.3125,
"learning_rate": 4.699500213845367e-05,
"loss": 0.290202796459198,
"mean_token_accuracy": 0.9046997427940369,
"num_tokens": 147734.0,
"step": 107
},
{
"entropy": 0.22292165458202362,
"epoch": 0.016653816499614494,
"grad_norm": 1.7109375,
"learning_rate": 4.691708026065055e-05,
"loss": 0.2274986356496811,
"mean_token_accuracy": 0.9138554334640503,
"num_tokens": 149402.0,
"step": 108
},
{
"entropy": 0.3579561412334442,
"epoch": 0.016808018504240557,
"grad_norm": 2.78125,
"learning_rate": 4.683822730320858e-05,
"loss": 0.31315499544143677,
"mean_token_accuracy": 0.889497697353363,
"num_tokens": 150505.0,
"step": 109
},
{
"entropy": 0.23602542281150818,
"epoch": 0.016962220508866616,
"grad_norm": 1.609375,
"learning_rate": 4.6758446615952746e-05,
"loss": 0.20407229661941528,
"mean_token_accuracy": 0.9303831458091736,
"num_tokens": 152366.0,
"step": 110
},
{
"entropy": 0.3046983480453491,
"epoch": 0.017116422513492675,
"grad_norm": 3.25,
"learning_rate": 4.6677741588119784e-05,
"loss": 0.3156391382217407,
"mean_token_accuracy": 0.8897196054458618,
"num_tokens": 153444.0,
"step": 111
},
{
"entropy": 0.25662004947662354,
"epoch": 0.017270624518118737,
"grad_norm": 2.515625,
"learning_rate": 4.6596115648214196e-05,
"loss": 0.2515248656272888,
"mean_token_accuracy": 0.907616376876831,
"num_tokens": 154870.0,
"step": 112
},
{
"entropy": 0.28677132725715637,
"epoch": 0.017424826522744796,
"grad_norm": 3.265625,
"learning_rate": 4.651357226386258e-05,
"loss": 0.2942817211151123,
"mean_token_accuracy": 0.8936970829963684,
"num_tokens": 155941.0,
"step": 113
},
{
"entropy": 0.21182145178318024,
"epoch": 0.017579028527370855,
"grad_norm": 2.3125,
"learning_rate": 4.6430114941666334e-05,
"loss": 0.23567034304141998,
"mean_token_accuracy": 0.9196969866752625,
"num_tokens": 157269.0,
"step": 114
},
{
"entropy": 0.18196314573287964,
"epoch": 0.017733230531996914,
"grad_norm": 1.71875,
"learning_rate": 4.6345747227052726e-05,
"loss": 0.18516698479652405,
"mean_token_accuracy": 0.9305768013000488,
"num_tokens": 159236.0,
"step": 115
},
{
"entropy": 0.23556780815124512,
"epoch": 0.017887432536622977,
"grad_norm": 2.40625,
"learning_rate": 4.626047270412419e-05,
"loss": 0.22876134514808655,
"mean_token_accuracy": 0.9182389974594116,
"num_tokens": 160516.0,
"step": 116
},
{
"entropy": 0.24857133626937866,
"epoch": 0.018041634541249036,
"grad_norm": 3.0625,
"learning_rate": 4.6174294995506154e-05,
"loss": 0.2965892255306244,
"mean_token_accuracy": 0.90025794506073,
"num_tokens": 161687.0,
"step": 117
},
{
"entropy": 0.21330931782722473,
"epoch": 0.018195836545875095,
"grad_norm": 2.421875,
"learning_rate": 4.6087217762193105e-05,
"loss": 0.23048508167266846,
"mean_token_accuracy": 0.9241044521331787,
"num_tokens": 163342.0,
"step": 118
},
{
"entropy": 0.25938084721565247,
"epoch": 0.018350038550501158,
"grad_norm": 2.734375,
"learning_rate": 4.599924470339303e-05,
"loss": 0.27338430285453796,
"mean_token_accuracy": 0.9029850959777832,
"num_tokens": 164690.0,
"step": 119
},
{
"entropy": 0.3166216015815735,
"epoch": 0.018504240555127217,
"grad_norm": 3.609375,
"learning_rate": 4.5910379556370355e-05,
"loss": 0.3654600977897644,
"mean_token_accuracy": 0.871026337146759,
"num_tokens": 165799.0,
"step": 120
},
{
"entropy": 0.21709276735782623,
"epoch": 0.018658442559753276,
"grad_norm": 1.8359375,
"learning_rate": 4.582062609628709e-05,
"loss": 0.214874729514122,
"mean_token_accuracy": 0.9245843291282654,
"num_tokens": 167491.0,
"step": 121
},
{
"entropy": 0.24251380562782288,
"epoch": 0.01881264456437934,
"grad_norm": 1.9921875,
"learning_rate": 4.57299881360425e-05,
"loss": 0.26085519790649414,
"mean_token_accuracy": 0.9065860509872437,
"num_tokens": 168987.0,
"step": 122
},
{
"entropy": 0.2558088004589081,
"epoch": 0.018966846569005397,
"grad_norm": 2.359375,
"learning_rate": 4.563846952611112e-05,
"loss": 0.2583191692829132,
"mean_token_accuracy": 0.9092382788658142,
"num_tokens": 170229.0,
"step": 123
},
{
"entropy": 0.28851792216300964,
"epoch": 0.019121048573631456,
"grad_norm": 2.25,
"learning_rate": 4.554607415437915e-05,
"loss": 0.28650322556495667,
"mean_token_accuracy": 0.8939759135246277,
"num_tokens": 171482.0,
"step": 124
},
{
"entropy": 0.3131585419178009,
"epoch": 0.01927525057825752,
"grad_norm": 2.578125,
"learning_rate": 4.545280594597935e-05,
"loss": 0.2936202585697174,
"mean_token_accuracy": 0.8922480344772339,
"num_tokens": 172780.0,
"step": 125
},
{
"entropy": 0.24182380735874176,
"epoch": 0.019429452582883578,
"grad_norm": 2.234375,
"learning_rate": 4.535866886312423e-05,
"loss": 0.2440458983182907,
"mean_token_accuracy": 0.9163833856582642,
"num_tokens": 174259.0,
"step": 126
},
{
"entropy": 0.2646311819553375,
"epoch": 0.019583654587509637,
"grad_norm": 2.109375,
"learning_rate": 4.526366690493777e-05,
"loss": 0.2328074872493744,
"mean_token_accuracy": 0.9140625,
"num_tokens": 175675.0,
"step": 127
},
{
"entropy": 0.2266581654548645,
"epoch": 0.019737856592135696,
"grad_norm": 1.671875,
"learning_rate": 4.5167804107285514e-05,
"loss": 0.21153169870376587,
"mean_token_accuracy": 0.922784149646759,
"num_tokens": 177522.0,
"step": 128
},
{
"epoch": 0.019737856592135696,
"eval_entropy": 0.27021819719097073,
"eval_loss": 0.26394832134246826,
"eval_mean_token_accuracy": 0.9077995745410696,
"eval_num_tokens": 177522.0,
"eval_runtime": 35.0787,
"eval_samples_per_second": 77.854,
"eval_steps_per_second": 9.75,
"step": 128
},
{
"entropy": 0.3175150752067566,
"epoch": 0.01989205859676176,
"grad_norm": 2.5,
"learning_rate": 4.507108454260309e-05,
"loss": 0.32345065474510193,
"mean_token_accuracy": 0.895765483379364,
"num_tokens": 178758.0,
"step": 129
},
{
"entropy": 0.26202577352523804,
"epoch": 0.020046260601387818,
"grad_norm": 2.59375,
"learning_rate": 4.497351231972329e-05,
"loss": 0.247625470161438,
"mean_token_accuracy": 0.915336549282074,
"num_tokens": 180207.0,
"step": 130
},
{
"entropy": 0.23124445974826813,
"epoch": 0.020200462606013877,
"grad_norm": 2.265625,
"learning_rate": 4.487509158370139e-05,
"loss": 0.221195787191391,
"mean_token_accuracy": 0.9168797731399536,
"num_tokens": 181779.0,
"step": 131
},
{
"entropy": 0.3099311590194702,
"epoch": 0.02035466461063994,
"grad_norm": 3.03125,
"learning_rate": 4.4775826515639205e-05,
"loss": 0.3427657186985016,
"mean_token_accuracy": 0.8853210806846619,
"num_tokens": 182877.0,
"step": 132
},
{
"entropy": 0.19146594405174255,
"epoch": 0.020508866615266,
"grad_norm": 1.9296875,
"learning_rate": 4.4675721332507345e-05,
"loss": 0.18723616003990173,
"mean_token_accuracy": 0.9326805472373962,
"num_tokens": 184519.0,
"step": 133
},
{
"entropy": 0.29960504174232483,
"epoch": 0.020663068619892058,
"grad_norm": 2.796875,
"learning_rate": 4.4574780286966154e-05,
"loss": 0.31267160177230835,
"mean_token_accuracy": 0.890625,
"num_tokens": 185423.0,
"step": 134
},
{
"entropy": 0.26278653740882874,
"epoch": 0.02081727062451812,
"grad_norm": 2.28125,
"learning_rate": 4.4473007667184995e-05,
"loss": 0.27267012000083923,
"mean_token_accuracy": 0.9038869142532349,
"num_tokens": 186846.0,
"step": 135
},
{
"entropy": 0.18965409696102142,
"epoch": 0.02097147262914418,
"grad_norm": 2.390625,
"learning_rate": 4.43704077966601e-05,
"loss": 0.21876873075962067,
"mean_token_accuracy": 0.9245041608810425,
"num_tokens": 188417.0,
"step": 136
},
{
"entropy": 0.20953340828418732,
"epoch": 0.021125674633770238,
"grad_norm": 2.109375,
"learning_rate": 4.426698503403091e-05,
"loss": 0.205082505941391,
"mean_token_accuracy": 0.926571249961853,
"num_tokens": 190032.0,
"step": 137
},
{
"entropy": 0.2490757405757904,
"epoch": 0.0212798766383963,
"grad_norm": 2.0,
"learning_rate": 4.4162743772894905e-05,
"loss": 0.23051951825618744,
"mean_token_accuracy": 0.9111841917037964,
"num_tokens": 191256.0,
"step": 138
},
{
"entropy": 0.3277740180492401,
"epoch": 0.02143407864302236,
"grad_norm": 5.03125,
"learning_rate": 4.405768844162094e-05,
"loss": 0.37247925996780396,
"mean_token_accuracy": 0.8656716346740723,
"num_tokens": 192202.0,
"step": 139
},
{
"entropy": 0.20335228741168976,
"epoch": 0.02158828064764842,
"grad_norm": 2.0625,
"learning_rate": 4.395182350316115e-05,
"loss": 0.20390284061431885,
"mean_token_accuracy": 0.9318037033081055,
"num_tokens": 193779.0,
"step": 140
},
{
"entropy": 0.222616046667099,
"epoch": 0.021742482652274478,
"grad_norm": 2.15625,
"learning_rate": 4.384515345486131e-05,
"loss": 0.22837010025978088,
"mean_token_accuracy": 0.9107261896133423,
"num_tokens": 195288.0,
"step": 141
},
{
"entropy": 0.2554439902305603,
"epoch": 0.02189668465690054,
"grad_norm": 2.65625,
"learning_rate": 4.373768282826983e-05,
"loss": 0.28548112511634827,
"mean_token_accuracy": 0.905958354473114,
"num_tokens": 196689.0,
"step": 142
},
{
"entropy": 0.23849214613437653,
"epoch": 0.0220508866615266,
"grad_norm": 2.21875,
"learning_rate": 4.3629416188945224e-05,
"loss": 0.25381097197532654,
"mean_token_accuracy": 0.9149101972579956,
"num_tokens": 197978.0,
"step": 143
},
{
"entropy": 0.26421603560447693,
"epoch": 0.02220508866615266,
"grad_norm": 3.5625,
"learning_rate": 4.352035813626214e-05,
"loss": 0.27579382061958313,
"mean_token_accuracy": 0.8979591727256775,
"num_tokens": 199260.0,
"step": 144
},
{
"entropy": 0.20953713357448578,
"epoch": 0.02235929067077872,
"grad_norm": 2.328125,
"learning_rate": 4.3410513303215985e-05,
"loss": 0.1990606188774109,
"mean_token_accuracy": 0.9306029677391052,
"num_tokens": 201026.0,
"step": 145
},
{
"entropy": 0.32288917899131775,
"epoch": 0.02251349267540478,
"grad_norm": 2.984375,
"learning_rate": 4.329988635622611e-05,
"loss": 0.3260837197303772,
"mean_token_accuracy": 0.893796980381012,
"num_tokens": 202098.0,
"step": 146
},
{
"entropy": 0.21132293343544006,
"epoch": 0.02266769468003084,
"grad_norm": 1.84375,
"learning_rate": 4.318848199493758e-05,
"loss": 0.19785253703594208,
"mean_token_accuracy": 0.9298823475837708,
"num_tokens": 204231.0,
"step": 147
},
{
"entropy": 0.3431147038936615,
"epoch": 0.022821896684656902,
"grad_norm": 2.84375,
"learning_rate": 4.30763049520215e-05,
"loss": 0.3377273380756378,
"mean_token_accuracy": 0.8919667601585388,
"num_tokens": 205322.0,
"step": 148
},
{
"entropy": 0.24553008377552032,
"epoch": 0.02297609868928296,
"grad_norm": 2.546875,
"learning_rate": 4.296335999297397e-05,
"loss": 0.23867689073085785,
"mean_token_accuracy": 0.9165446758270264,
"num_tokens": 206696.0,
"step": 149
},
{
"entropy": 0.27541691064834595,
"epoch": 0.02313030069390902,
"grad_norm": 2.03125,
"learning_rate": 4.284965191591364e-05,
"loss": 0.25213125348091125,
"mean_token_accuracy": 0.914050817489624,
"num_tokens": 208042.0,
"step": 150
},
{
"entropy": 0.23892685770988464,
"epoch": 0.023284502698535083,
"grad_norm": 2.03125,
"learning_rate": 4.2735185551377895e-05,
"loss": 0.20277726650238037,
"mean_token_accuracy": 0.9304635524749756,
"num_tokens": 209560.0,
"step": 151
},
{
"entropy": 0.2151283323764801,
"epoch": 0.02343870470316114,
"grad_norm": 2.015625,
"learning_rate": 4.261996576211761e-05,
"loss": 0.2226867973804474,
"mean_token_accuracy": 0.9178715944290161,
"num_tokens": 211297.0,
"step": 152
},
{
"entropy": 0.2410528063774109,
"epoch": 0.0235929067077872,
"grad_norm": 2.015625,
"learning_rate": 4.25039974428906e-05,
"loss": 0.22763265669345856,
"mean_token_accuracy": 0.9149277806282043,
"num_tokens": 212551.0,
"step": 153
},
{
"entropy": 0.2535974383354187,
"epoch": 0.02374710871241326,
"grad_norm": 2.328125,
"learning_rate": 4.238728552025365e-05,
"loss": 0.2421426922082901,
"mean_token_accuracy": 0.9143372178077698,
"num_tokens": 213668.0,
"step": 154
},
{
"entropy": 0.2121782749891281,
"epoch": 0.023901310717039322,
"grad_norm": 1.5,
"learning_rate": 4.226983495235328e-05,
"loss": 0.20025445520877838,
"mean_token_accuracy": 0.9322981238365173,
"num_tokens": 215286.0,
"step": 155
},
{
"entropy": 0.14580558240413666,
"epoch": 0.02405551272166538,
"grad_norm": 1.625,
"learning_rate": 4.215165072871505e-05,
"loss": 0.14826127886772156,
"mean_token_accuracy": 0.9467787146568298,
"num_tokens": 217436.0,
"step": 156
},
{
"entropy": 0.2315557599067688,
"epoch": 0.02420971472629144,
"grad_norm": 2.078125,
"learning_rate": 4.203273787003162e-05,
"loss": 0.2486051321029663,
"mean_token_accuracy": 0.9164133667945862,
"num_tokens": 218760.0,
"step": 157
},
{
"entropy": 0.25005754828453064,
"epoch": 0.024363916730917503,
"grad_norm": 2.390625,
"learning_rate": 4.1913101427949505e-05,
"loss": 0.2627011835575104,
"mean_token_accuracy": 0.9080632925033569,
"num_tokens": 220095.0,
"step": 158
},
{
"entropy": 0.2149634212255478,
"epoch": 0.024518118735543562,
"grad_norm": 2.28125,
"learning_rate": 4.179274648485438e-05,
"loss": 0.21630343794822693,
"mean_token_accuracy": 0.9172714352607727,
"num_tokens": 221481.0,
"step": 159
},
{
"entropy": 0.2316989302635193,
"epoch": 0.02467232074016962,
"grad_norm": 2.3125,
"learning_rate": 4.1671678153655256e-05,
"loss": 0.240981787443161,
"mean_token_accuracy": 0.9135708808898926,
"num_tokens": 222808.0,
"step": 160
},
{
"entropy": 0.29497963190078735,
"epoch": 0.024826522744795684,
"grad_norm": 2.40625,
"learning_rate": 4.154990157756722e-05,
"loss": 0.2961036264896393,
"mean_token_accuracy": 0.9030969142913818,
"num_tokens": 223817.0,
"step": 161
},
{
"entropy": 0.22725972533226013,
"epoch": 0.024980724749421743,
"grad_norm": 2.546875,
"learning_rate": 4.142742192989299e-05,
"loss": 0.22807390987873077,
"mean_token_accuracy": 0.9114027619361877,
"num_tokens": 225044.0,
"step": 162
},
{
"entropy": 0.2280416190624237,
"epoch": 0.025134926754047802,
"grad_norm": 2.421875,
"learning_rate": 4.1304244413803076e-05,
"loss": 0.24813513457775116,
"mean_token_accuracy": 0.9090909361839294,
"num_tokens": 226339.0,
"step": 163
},
{
"entropy": 0.20092645287513733,
"epoch": 0.025289128758673864,
"grad_norm": 2.015625,
"learning_rate": 4.118037426211482e-05,
"loss": 0.22428975999355316,
"mean_token_accuracy": 0.9173313975334167,
"num_tokens": 227726.0,
"step": 164
},
{
"entropy": 0.20079851150512695,
"epoch": 0.025443330763299923,
"grad_norm": 9.5625,
"learning_rate": 4.105581673707002e-05,
"loss": 0.21033848822116852,
"mean_token_accuracy": 0.9232493042945862,
"num_tokens": 229519.0,
"step": 165
},
{
"entropy": 0.25729137659072876,
"epoch": 0.025597532767925982,
"grad_norm": 2.3125,
"learning_rate": 4.0930577130111424e-05,
"loss": 0.2733251452445984,
"mean_token_accuracy": 0.9045871496200562,
"num_tokens": 230617.0,
"step": 166
},
{
"entropy": 0.20442764461040497,
"epoch": 0.02575173477255204,
"grad_norm": 1.890625,
"learning_rate": 4.080466076165793e-05,
"loss": 0.20845486223697662,
"mean_token_accuracy": 0.9209572076797485,
"num_tokens": 232004.0,
"step": 167
},
{
"entropy": 0.20175087451934814,
"epoch": 0.025905936777178104,
"grad_norm": 2.453125,
"learning_rate": 4.067807298087857e-05,
"loss": 0.21334150433540344,
"mean_token_accuracy": 0.9243085980415344,
"num_tokens": 233386.0,
"step": 168
},
{
"entropy": 0.26961395144462585,
"epoch": 0.026060138781804163,
"grad_norm": 2.125,
"learning_rate": 4.055081916546525e-05,
"loss": 0.24742326140403748,
"mean_token_accuracy": 0.9157986044883728,
"num_tokens": 234546.0,
"step": 169
},
{
"entropy": 0.20450648665428162,
"epoch": 0.026214340786430222,
"grad_norm": 1.6953125,
"learning_rate": 4.042290472140431e-05,
"loss": 0.20523257553577423,
"mean_token_accuracy": 0.9297789335250854,
"num_tokens": 236092.0,
"step": 170
},
{
"entropy": 0.2690446972846985,
"epoch": 0.026368542791056285,
"grad_norm": 2.15625,
"learning_rate": 4.029433508274686e-05,
"loss": 0.26763197779655457,
"mean_token_accuracy": 0.9070660471916199,
"num_tokens": 237402.0,
"step": 171
},
{
"entropy": 0.22288963198661804,
"epoch": 0.026522744795682344,
"grad_norm": 2.03125,
"learning_rate": 4.0165115711377945e-05,
"loss": 0.24567259848117828,
"mean_token_accuracy": 0.9189382791519165,
"num_tokens": 238804.0,
"step": 172
},
{
"entropy": 0.19029025733470917,
"epoch": 0.026676946800308403,
"grad_norm": 1.8671875,
"learning_rate": 4.003525209678449e-05,
"loss": 0.18879841268062592,
"mean_token_accuracy": 0.9351808428764343,
"num_tokens": 240941.0,
"step": 173
},
{
"entropy": 0.2573792338371277,
"epoch": 0.026831148804934465,
"grad_norm": 2.96875,
"learning_rate": 3.9904749755822114e-05,
"loss": 0.2607381045818329,
"mean_token_accuracy": 0.906000018119812,
"num_tokens": 242449.0,
"step": 174
},
{
"entropy": 0.2028045505285263,
"epoch": 0.026985350809560524,
"grad_norm": 1.3984375,
"learning_rate": 3.977361423248075e-05,
"loss": 0.1825239360332489,
"mean_token_accuracy": 0.9339895844459534,
"num_tokens": 244184.0,
"step": 175
},
{
"entropy": 0.27057698369026184,
"epoch": 0.027139552814186584,
"grad_norm": 3.140625,
"learning_rate": 3.964185109764915e-05,
"loss": 0.30133944749832153,
"mean_token_accuracy": 0.8857142925262451,
"num_tokens": 245347.0,
"step": 176
},
{
"entropy": 0.18647152185440063,
"epoch": 0.027293754818812646,
"grad_norm": 1.8046875,
"learning_rate": 3.95094659488782e-05,
"loss": 0.1798812299966812,
"mean_token_accuracy": 0.9323040246963501,
"num_tokens": 247039.0,
"step": 177
},
{
"entropy": 0.2583964765071869,
"epoch": 0.027447956823438705,
"grad_norm": 2.28125,
"learning_rate": 3.9376464410143124e-05,
"loss": 0.2609320878982544,
"mean_token_accuracy": 0.9023405909538269,
"num_tokens": 248286.0,
"step": 178
},
{
"entropy": 0.24908345937728882,
"epoch": 0.027602158828064764,
"grad_norm": 2.09375,
"learning_rate": 3.9242852131604585e-05,
"loss": 0.2381179928779602,
"mean_token_accuracy": 0.9222641587257385,
"num_tokens": 249619.0,
"step": 179
},
{
"entropy": 0.21503198146820068,
"epoch": 0.027756360832690823,
"grad_norm": 2.5,
"learning_rate": 3.910863478936864e-05,
"loss": 0.2604519724845886,
"mean_token_accuracy": 0.9127399921417236,
"num_tokens": 251346.0,
"step": 180
},
{
"entropy": 0.22753889858722687,
"epoch": 0.027910562837316886,
"grad_norm": 1.84375,
"learning_rate": 3.897381808524562e-05,
"loss": 0.23742565512657166,
"mean_token_accuracy": 0.9219380617141724,
"num_tokens": 252840.0,
"step": 181
},
{
"entropy": 0.25326159596443176,
"epoch": 0.028064764841942945,
"grad_norm": 2.203125,
"learning_rate": 3.883840774650788e-05,
"loss": 0.28680431842803955,
"mean_token_accuracy": 0.9005083441734314,
"num_tokens": 254225.0,
"step": 182
},
{
"entropy": 0.24126410484313965,
"epoch": 0.028218966846569004,
"grad_norm": 2.109375,
"learning_rate": 3.870240952564653e-05,
"loss": 0.2406134009361267,
"mean_token_accuracy": 0.9119541645050049,
"num_tokens": 255630.0,
"step": 183
},
{
"entropy": 0.2304130345582962,
"epoch": 0.028373168851195067,
"grad_norm": 1.6953125,
"learning_rate": 3.856582920012706e-05,
"loss": 0.22154204547405243,
"mean_token_accuracy": 0.9195979833602905,
"num_tokens": 257031.0,
"step": 184
},
{
"entropy": 0.16509661078453064,
"epoch": 0.028527370855821126,
"grad_norm": 1.3125,
"learning_rate": 3.842867257214383e-05,
"loss": 0.15430063009262085,
"mean_token_accuracy": 0.940733790397644,
"num_tokens": 259165.0,
"step": 185
},
{
"entropy": 0.24022063612937927,
"epoch": 0.028681572860447185,
"grad_norm": 1.7890625,
"learning_rate": 3.8290945468373684e-05,
"loss": 0.20412693917751312,
"mean_token_accuracy": 0.9327940344810486,
"num_tokens": 260780.0,
"step": 186
},
{
"entropy": 0.2785824239253998,
"epoch": 0.028835774865073247,
"grad_norm": 2.390625,
"learning_rate": 3.8152653739728363e-05,
"loss": 0.2689974308013916,
"mean_token_accuracy": 0.9066666960716248,
"num_tokens": 261988.0,
"step": 187
},
{
"entropy": 0.20374569296836853,
"epoch": 0.028989976869699306,
"grad_norm": 2.0,
"learning_rate": 3.8013803261105916e-05,
"loss": 0.21978892385959625,
"mean_token_accuracy": 0.9233038425445557,
"num_tokens": 263691.0,
"step": 188
},
{
"entropy": 0.2387579381465912,
"epoch": 0.029144178874325365,
"grad_norm": 1.984375,
"learning_rate": 3.787439993114123e-05,
"loss": 0.23546524345874786,
"mean_token_accuracy": 0.9189907312393188,
"num_tokens": 265205.0,
"step": 189
},
{
"entropy": 0.22492903470993042,
"epoch": 0.029298380878951428,
"grad_norm": 1.8671875,
"learning_rate": 3.7734449671955326e-05,
"loss": 0.21074332296848297,
"mean_token_accuracy": 0.9219586849212646,
"num_tokens": 266520.0,
"step": 190
},
{
"entropy": 0.19710952043533325,
"epoch": 0.029452582883577487,
"grad_norm": 1.9296875,
"learning_rate": 3.759395842890384e-05,
"loss": 0.1993340104818344,
"mean_token_accuracy": 0.9277042150497437,
"num_tokens": 268340.0,
"step": 191
},
{
"entropy": 0.24934346973896027,
"epoch": 0.029606784888203546,
"grad_norm": 1.890625,
"learning_rate": 3.7452932170324464e-05,
"loss": 0.24506257474422455,
"mean_token_accuracy": 0.9209383130073547,
"num_tokens": 269499.0,
"step": 192
},
{
"entropy": 0.2751508355140686,
"epoch": 0.029760986892829605,
"grad_norm": 2.4375,
"learning_rate": 3.731137688728335e-05,
"loss": 0.28203558921813965,
"mean_token_accuracy": 0.9066317677497864,
"num_tokens": 270653.0,
"step": 193
},
{
"entropy": 0.2998161017894745,
"epoch": 0.029915188897455668,
"grad_norm": 2.640625,
"learning_rate": 3.716929859332063e-05,
"loss": 0.2953347861766815,
"mean_token_accuracy": 0.9018287062644958,
"num_tokens": 271700.0,
"step": 194
},
{
"entropy": 0.2493629902601242,
"epoch": 0.030069390902081727,
"grad_norm": 2.1875,
"learning_rate": 3.7026703324194966e-05,
"loss": 0.26706650853157043,
"mean_token_accuracy": 0.9076277017593384,
"num_tokens": 273137.0,
"step": 195
},
{
"entropy": 0.20723779499530792,
"epoch": 0.030223592906707786,
"grad_norm": 2.140625,
"learning_rate": 3.688359713762707e-05,
"loss": 0.22939355671405792,
"mean_token_accuracy": 0.9125827550888062,
"num_tokens": 274655.0,
"step": 196
},
{
"entropy": 0.22990985214710236,
"epoch": 0.03037779491133385,
"grad_norm": 2.046875,
"learning_rate": 3.673998611304246e-05,
"loss": 0.2153758704662323,
"mean_token_accuracy": 0.9279279112815857,
"num_tokens": 275773.0,
"step": 197
},
{
"entropy": 0.29038283228874207,
"epoch": 0.030531996915959907,
"grad_norm": 2.71875,
"learning_rate": 3.6595876351313116e-05,
"loss": 0.304492324590683,
"mean_token_accuracy": 0.9004576802253723,
"num_tokens": 276655.0,
"step": 198
},
{
"entropy": 0.19836601614952087,
"epoch": 0.030686198920585966,
"grad_norm": 1.8359375,
"learning_rate": 3.645127397449832e-05,
"loss": 0.2065221518278122,
"mean_token_accuracy": 0.9339622855186462,
"num_tokens": 278359.0,
"step": 199
},
{
"entropy": 0.25179192423820496,
"epoch": 0.03084040092521203,
"grad_norm": 2.265625,
"learning_rate": 3.6306185125584615e-05,
"loss": 0.2616140842437744,
"mean_token_accuracy": 0.9063336253166199,
"num_tokens": 279488.0,
"step": 200
},
{
"entropy": 0.18242394924163818,
"epoch": 0.030994602929838088,
"grad_norm": 1.734375,
"learning_rate": 3.616061596822478e-05,
"loss": 0.17770832777023315,
"mean_token_accuracy": 0.9277376532554626,
"num_tokens": 281295.0,
"step": 201
},
{
"entropy": 0.24629506468772888,
"epoch": 0.031148804934464147,
"grad_norm": 2.4375,
"learning_rate": 3.601457268647606e-05,
"loss": 0.2535253167152405,
"mean_token_accuracy": 0.9059450030326843,
"num_tokens": 282430.0,
"step": 202
},
{
"entropy": 0.19920703768730164,
"epoch": 0.03130300693909021,
"grad_norm": 1.921875,
"learning_rate": 3.586806148453736e-05,
"loss": 0.20293940603733063,
"mean_token_accuracy": 0.9283132553100586,
"num_tokens": 284098.0,
"step": 203
},
{
"entropy": 0.1916186809539795,
"epoch": 0.031457208943716265,
"grad_norm": 1.578125,
"learning_rate": 3.572108858648579e-05,
"loss": 0.1925540268421173,
"mean_token_accuracy": 0.9329091906547546,
"num_tokens": 285835.0,
"step": 204
},
{
"entropy": 0.24154330790042877,
"epoch": 0.03161141094834233,
"grad_norm": 2.0625,
"learning_rate": 3.557366023601216e-05,
"loss": 0.2560335099697113,
"mean_token_accuracy": 0.9222126007080078,
"num_tokens": 287000.0,
"step": 205
},
{
"entropy": 0.24839094281196594,
"epoch": 0.03176561295296839,
"grad_norm": 2.09375,
"learning_rate": 3.542578269615579e-05,
"loss": 0.24170006811618805,
"mean_token_accuracy": 0.9167927503585815,
"num_tokens": 288330.0,
"step": 206
},
{
"entropy": 0.19456236064434052,
"epoch": 0.031919814957594446,
"grad_norm": 1.640625,
"learning_rate": 3.527746224903842e-05,
"loss": 0.18520742654800415,
"mean_token_accuracy": 0.9366295337677002,
"num_tokens": 289774.0,
"step": 207
},
{
"entropy": 0.24151258170604706,
"epoch": 0.03207401696222051,
"grad_norm": 1.8828125,
"learning_rate": 3.512870519559733e-05,
"loss": 0.22108638286590576,
"mean_token_accuracy": 0.9167962670326233,
"num_tokens": 291068.0,
"step": 208
},
{
"entropy": 0.3510158658027649,
"epoch": 0.03222821896684657,
"grad_norm": 3.71875,
"learning_rate": 3.49795178553177e-05,
"loss": 0.41906648874282837,
"mean_token_accuracy": 0.8701754212379456,
"num_tokens": 291931.0,
"step": 209
},
{
"entropy": 0.3286966383457184,
"epoch": 0.03238242097147263,
"grad_norm": 3.171875,
"learning_rate": 3.48299065659641e-05,
"loss": 0.343354731798172,
"mean_token_accuracy": 0.8834951519966125,
"num_tokens": 292866.0,
"step": 210
},
{
"entropy": 0.19397929310798645,
"epoch": 0.03253662297609869,
"grad_norm": 1.6875,
"learning_rate": 3.467987768331127e-05,
"loss": 0.1917928159236908,
"mean_token_accuracy": 0.9349930882453918,
"num_tokens": 294320.0,
"step": 211
},
{
"entropy": 0.2259572446346283,
"epoch": 0.03269082498072475,
"grad_norm": 2.203125,
"learning_rate": 3.452943758087414e-05,
"loss": 0.24537329375743866,
"mean_token_accuracy": 0.9182724356651306,
"num_tokens": 295833.0,
"step": 212
},
{
"entropy": 0.22965691983699799,
"epoch": 0.03284502698535081,
"grad_norm": 1.7890625,
"learning_rate": 3.437859264963702e-05,
"loss": 0.2151767462491989,
"mean_token_accuracy": 0.9223232865333557,
"num_tokens": 297270.0,
"step": 213
},
{
"entropy": 0.2611003518104553,
"epoch": 0.03299922898997687,
"grad_norm": 2.890625,
"learning_rate": 3.422734929778213e-05,
"loss": 0.2612400949001312,
"mean_token_accuracy": 0.8977055549621582,
"num_tokens": 298324.0,
"step": 214
},
{
"entropy": 0.1909189224243164,
"epoch": 0.03315343099460293,
"grad_norm": 1.8671875,
"learning_rate": 3.407571395041736e-05,
"loss": 0.20462700724601746,
"mean_token_accuracy": 0.9242695569992065,
"num_tokens": 300009.0,
"step": 215
},
{
"entropy": 0.2556368410587311,
"epoch": 0.03330763299922899,
"grad_norm": 2.03125,
"learning_rate": 3.392369304930334e-05,
"loss": 0.2566298246383667,
"mean_token_accuracy": 0.9090163707733154,
"num_tokens": 301237.0,
"step": 216
},
{
"entropy": 0.27811554074287415,
"epoch": 0.03346183500385505,
"grad_norm": 2.0625,
"learning_rate": 3.377129305257975e-05,
"loss": 0.2745239734649658,
"mean_token_accuracy": 0.9044750332832336,
"num_tokens": 302407.0,
"step": 217
},
{
"entropy": 0.21509166061878204,
"epoch": 0.03361603700848111,
"grad_norm": 1.84375,
"learning_rate": 3.361852043449096e-05,
"loss": 0.2006048709154129,
"mean_token_accuracy": 0.9250646233558655,
"num_tokens": 303963.0,
"step": 218
},
{
"entropy": 0.2612791359424591,
"epoch": 0.03377023901310717,
"grad_norm": 2.0,
"learning_rate": 3.3465381685111054e-05,
"loss": 0.27390342950820923,
"mean_token_accuracy": 0.8982036113739014,
"num_tokens": 305140.0,
"step": 219
},
{
"entropy": 0.2126745879650116,
"epoch": 0.03392444101773323,
"grad_norm": 1.609375,
"learning_rate": 3.331188331006804e-05,
"loss": 0.20790794491767883,
"mean_token_accuracy": 0.9276844263076782,
"num_tokens": 306517.0,
"step": 220
},
{
"entropy": 0.216102734208107,
"epoch": 0.034078643022359294,
"grad_norm": 1.53125,
"learning_rate": 3.315803183026753e-05,
"loss": 0.2031707614660263,
"mean_token_accuracy": 0.9320327043533325,
"num_tokens": 308114.0,
"step": 221
},
{
"entropy": 0.23003709316253662,
"epoch": 0.03423284502698535,
"grad_norm": 2.09375,
"learning_rate": 3.30038337816157e-05,
"loss": 0.24152696132659912,
"mean_token_accuracy": 0.9172229766845703,
"num_tokens": 309620.0,
"step": 222
},
{
"entropy": 0.25657832622528076,
"epoch": 0.03438704703161141,
"grad_norm": 1.9375,
"learning_rate": 3.284929571474164e-05,
"loss": 0.2669946551322937,
"mean_token_accuracy": 0.9029045701026917,
"num_tokens": 310833.0,
"step": 223
},
{
"entropy": 0.23583689332008362,
"epoch": 0.034541249036237474,
"grad_norm": 2.125,
"learning_rate": 3.2694424194719046e-05,
"loss": 0.24596942961215973,
"mean_token_accuracy": 0.9083665609359741,
"num_tokens": 312096.0,
"step": 224
},
{
"entropy": 0.197276309132576,
"epoch": 0.03469545104086353,
"grad_norm": 1.703125,
"learning_rate": 3.2539225800787385e-05,
"loss": 0.19344845414161682,
"mean_token_accuracy": 0.93291836977005,
"num_tokens": 313550.0,
"step": 225
},
{
"entropy": 0.3082696497440338,
"epoch": 0.03484965304548959,
"grad_norm": 3.484375,
"learning_rate": 3.2383707126072315e-05,
"loss": 0.3064239025115967,
"mean_token_accuracy": 0.8925233483314514,
"num_tokens": 314628.0,
"step": 226
},
{
"entropy": 0.19953380525112152,
"epoch": 0.03500385505011565,
"grad_norm": 1.734375,
"learning_rate": 3.222787477730567e-05,
"loss": 0.19340643286705017,
"mean_token_accuracy": 0.9274017214775085,
"num_tokens": 316468.0,
"step": 227
},
{
"entropy": 0.27000153064727783,
"epoch": 0.03515805705474171,
"grad_norm": 3.828125,
"learning_rate": 3.207173537454472e-05,
"loss": 0.2817123830318451,
"mean_token_accuracy": 0.9068965315818787,
"num_tokens": 317636.0,
"step": 228
},
{
"entropy": 0.22825853526592255,
"epoch": 0.03531225905936777,
"grad_norm": 2.125,
"learning_rate": 3.191529555089102e-05,
"loss": 0.22379839420318604,
"mean_token_accuracy": 0.9244868159294128,
"num_tokens": 319008.0,
"step": 229
},
{
"entropy": 0.2942773997783661,
"epoch": 0.03546646106399383,
"grad_norm": 2.6875,
"learning_rate": 3.175856195220855e-05,
"loss": 0.2916644215583801,
"mean_token_accuracy": 0.8996211886405945,
"num_tokens": 320072.0,
"step": 230
},
{
"entropy": 0.2531821131706238,
"epoch": 0.03562066306861989,
"grad_norm": 2.265625,
"learning_rate": 3.160154123684143e-05,
"loss": 0.2512527108192444,
"mean_token_accuracy": 0.9058629274368286,
"num_tokens": 321291.0,
"step": 231
},
{
"entropy": 0.234887957572937,
"epoch": 0.035774865073245954,
"grad_norm": 1.9140625,
"learning_rate": 3.1444240075331054e-05,
"loss": 0.2259407341480255,
"mean_token_accuracy": 0.9231894612312317,
"num_tokens": 322666.0,
"step": 232
},
{
"entropy": 0.23325884342193604,
"epoch": 0.03592906707787201,
"grad_norm": 1.96875,
"learning_rate": 3.128666515013269e-05,
"loss": 0.2157772332429886,
"mean_token_accuracy": 0.9207017421722412,
"num_tokens": 324099.0,
"step": 233
},
{
"entropy": 0.15830406546592712,
"epoch": 0.03608326908249807,
"grad_norm": 1.03125,
"learning_rate": 3.112882315533163e-05,
"loss": 0.1372249573469162,
"mean_token_accuracy": 0.9470046162605286,
"num_tokens": 326277.0,
"step": 234
},
{
"entropy": 0.25762706995010376,
"epoch": 0.036237471087124135,
"grad_norm": 1.828125,
"learning_rate": 3.097072079635878e-05,
"loss": 0.23957906663417816,
"mean_token_accuracy": 0.915335476398468,
"num_tokens": 327537.0,
"step": 235
},
{
"entropy": 0.21047890186309814,
"epoch": 0.03639167309175019,
"grad_norm": 1.7421875,
"learning_rate": 3.081236478970583e-05,
"loss": 0.22065354883670807,
"mean_token_accuracy": 0.9236826300621033,
"num_tokens": 329196.0,
"step": 236
},
{
"entropy": 0.22569093108177185,
"epoch": 0.03654587509637625,
"grad_norm": 1.90625,
"learning_rate": 3.065376186263991e-05,
"loss": 0.21428702771663666,
"mean_token_accuracy": 0.9252577424049377,
"num_tokens": 330368.0,
"step": 237
},
{
"entropy": 0.2325230836868286,
"epoch": 0.036700077101002315,
"grad_norm": 1.7578125,
"learning_rate": 3.049491875291778e-05,
"loss": 0.23734821379184723,
"mean_token_accuracy": 0.9114202260971069,
"num_tokens": 331742.0,
"step": 238
},
{
"entropy": 0.2122831493616104,
"epoch": 0.03685427910562837,
"grad_norm": 1.609375,
"learning_rate": 3.0335842208499637e-05,
"loss": 0.2174147367477417,
"mean_token_accuracy": 0.9171270728111267,
"num_tokens": 333198.0,
"step": 239
},
{
"entropy": 0.23024694621562958,
"epoch": 0.03700848111025443,
"grad_norm": 2.046875,
"learning_rate": 3.0176538987262442e-05,
"loss": 0.2907542288303375,
"mean_token_accuracy": 0.9019264578819275,
"num_tokens": 334348.0,
"step": 240
},
{
"entropy": 0.2648603022098541,
"epoch": 0.037162683114880496,
"grad_norm": 1.875,
"learning_rate": 3.0017015856712814e-05,
"loss": 0.2652634382247925,
"mean_token_accuracy": 0.9065656661987305,
"num_tokens": 335544.0,
"step": 241
},
{
"entropy": 0.2533347010612488,
"epoch": 0.03731688511950655,
"grad_norm": 1.96875,
"learning_rate": 2.9857279593699544e-05,
"loss": 0.2646684944629669,
"mean_token_accuracy": 0.9075286388397217,
"num_tokens": 336774.0,
"step": 242
},
{
"entropy": 0.22679953277111053,
"epoch": 0.037471087124132614,
"grad_norm": 2.078125,
"learning_rate": 2.9697336984125683e-05,
"loss": 0.22257877886295319,
"mean_token_accuracy": 0.9175019264221191,
"num_tokens": 338079.0,
"step": 243
},
{
"entropy": 0.19455574452877045,
"epoch": 0.03762528912875868,
"grad_norm": 1.5546875,
"learning_rate": 2.9537194822660295e-05,
"loss": 0.19329281151294708,
"mean_token_accuracy": 0.9266055226325989,
"num_tokens": 339722.0,
"step": 244
},
{
"entropy": 0.20773011445999146,
"epoch": 0.03777949113338473,
"grad_norm": 1.9453125,
"learning_rate": 2.9376859912449794e-05,
"loss": 0.20826096832752228,
"mean_token_accuracy": 0.9232895374298096,
"num_tokens": 341177.0,
"step": 245
},
{
"entropy": 0.2844797372817993,
"epoch": 0.037933693138010795,
"grad_norm": 1.796875,
"learning_rate": 2.9216339064828914e-05,
"loss": 0.2653990387916565,
"mean_token_accuracy": 0.910646378993988,
"num_tokens": 342237.0,
"step": 246
},
{
"entropy": 0.19197861850261688,
"epoch": 0.03808789514263686,
"grad_norm": 1.5859375,
"learning_rate": 2.9055639099031386e-05,
"loss": 0.191925048828125,
"mean_token_accuracy": 0.9356250166893005,
"num_tokens": 343845.0,
"step": 247
},
{
"entropy": 0.28776344656944275,
"epoch": 0.03824209714726291,
"grad_norm": 2.59375,
"learning_rate": 2.8894766841900223e-05,
"loss": 0.27679842710494995,
"mean_token_accuracy": 0.9086069464683533,
"num_tokens": 344980.0,
"step": 248
},
{
"entropy": 0.23193758726119995,
"epoch": 0.038396299151888975,
"grad_norm": 1.9765625,
"learning_rate": 2.8733729127597692e-05,
"loss": 0.2313500940799713,
"mean_token_accuracy": 0.9189602732658386,
"num_tokens": 346296.0,
"step": 249
},
{
"entropy": 0.19187554717063904,
"epoch": 0.03855050115651504,
"grad_norm": 1.4765625,
"learning_rate": 2.8572532797315006e-05,
"loss": 0.17860986292362213,
"mean_token_accuracy": 0.9357484579086304,
"num_tokens": 347767.0,
"step": 250
},
{
"entropy": 0.26534777879714966,
"epoch": 0.038704703161141094,
"grad_norm": 2.234375,
"learning_rate": 2.8411184698981684e-05,
"loss": 0.2811349630355835,
"mean_token_accuracy": 0.9026548862457275,
"num_tokens": 349131.0,
"step": 251
},
{
"entropy": 0.19166985154151917,
"epoch": 0.038858905165767156,
"grad_norm": 1.4375,
"learning_rate": 2.824969168697466e-05,
"loss": 0.1818903237581253,
"mean_token_accuracy": 0.9364994764328003,
"num_tokens": 351013.0,
"step": 252
},
{
"entropy": 0.2197422981262207,
"epoch": 0.03901310717039321,
"grad_norm": 2.0,
"learning_rate": 2.808806062182705e-05,
"loss": 0.24899303913116455,
"mean_token_accuracy": 0.9060351252555847,
"num_tokens": 352330.0,
"step": 253
},
{
"entropy": 0.24478891491889954,
"epoch": 0.039167309175019274,
"grad_norm": 2.046875,
"learning_rate": 2.792629836993676e-05,
"loss": 0.24458467960357666,
"mean_token_accuracy": 0.914650559425354,
"num_tokens": 353826.0,
"step": 254
},
{
"entropy": 0.17300452291965485,
"epoch": 0.03932151117964534,
"grad_norm": 1.453125,
"learning_rate": 2.776441180327475e-05,
"loss": 0.1748412549495697,
"mean_token_accuracy": 0.9393326640129089,
"num_tokens": 355812.0,
"step": 255
},
{
"entropy": 0.28217461705207825,
"epoch": 0.03947571318427139,
"grad_norm": 2.375,
"learning_rate": 2.76024077990931e-05,
"loss": 0.28308406472206116,
"mean_token_accuracy": 0.908906877040863,
"num_tokens": 356808.0,
"step": 256
},
{
"epoch": 0.03947571318427139,
"eval_entropy": 0.2422610384068991,
"eval_loss": 0.2376217544078827,
"eval_mean_token_accuracy": 0.9154835451416105,
"eval_num_tokens": 356808.0,
"eval_runtime": 34.9417,
"eval_samples_per_second": 78.159,
"eval_steps_per_second": 9.788,
"step": 256
},
{
"entropy": 0.2056795060634613,
"epoch": 0.039629915188897455,
"grad_norm": 1.7265625,
"learning_rate": 2.7440293239632885e-05,
"loss": 0.1848773956298828,
"mean_token_accuracy": 0.9414348602294922,
"num_tokens": 358182.0,
"step": 257
},
{
"entropy": 0.21008774638175964,
"epoch": 0.03978411719352352,
"grad_norm": 2.125,
"learning_rate": 2.7278075011831757e-05,
"loss": 0.23831506073474884,
"mean_token_accuracy": 0.9120956659317017,
"num_tokens": 359612.0,
"step": 258
},
{
"entropy": 0.22274059057235718,
"epoch": 0.03993831919814957,
"grad_norm": 2.078125,
"learning_rate": 2.711576000703141e-05,
"loss": 0.22159968316555023,
"mean_token_accuracy": 0.9259036183357239,
"num_tokens": 361280.0,
"step": 259
},
{
"entropy": 0.24206753075122833,
"epoch": 0.040092521202775636,
"grad_norm": 2.21875,
"learning_rate": 2.6953355120684802e-05,
"loss": 0.2599974274635315,
"mean_token_accuracy": 0.915960431098938,
"num_tokens": 362704.0,
"step": 260
},
{
"entropy": 0.22195129096508026,
"epoch": 0.0402467232074017,
"grad_norm": 1.8203125,
"learning_rate": 2.6790867252063247e-05,
"loss": 0.22732976078987122,
"mean_token_accuracy": 0.9146426320075989,
"num_tokens": 364153.0,
"step": 261
},
{
"entropy": 0.19769293069839478,
"epoch": 0.040400925212027754,
"grad_norm": 1.5390625,
"learning_rate": 2.6628303303963288e-05,
"loss": 0.18025925755500793,
"mean_token_accuracy": 0.9401107430458069,
"num_tokens": 366148.0,
"step": 262
},
{
"entropy": 0.36093661189079285,
"epoch": 0.040555127216653816,
"grad_norm": 2.828125,
"learning_rate": 2.646567018241349e-05,
"loss": 0.36829474568367004,
"mean_token_accuracy": 0.8780487775802612,
"num_tokens": 367140.0,
"step": 263
},
{
"entropy": 0.28070077300071716,
"epoch": 0.04070932922127988,
"grad_norm": 2.171875,
"learning_rate": 2.6302974796381015e-05,
"loss": 0.27073192596435547,
"mean_token_accuracy": 0.9048058986663818,
"num_tokens": 368230.0,
"step": 264
},
{
"entropy": 0.28238415718078613,
"epoch": 0.040863531225905934,
"grad_norm": 2.078125,
"learning_rate": 2.6140224057478158e-05,
"loss": 0.2595861256122589,
"mean_token_accuracy": 0.9181897044181824,
"num_tokens": 369387.0,
"step": 265
},
{
"entropy": 0.24161042273044586,
"epoch": 0.041017733230532,
"grad_norm": 1.6328125,
"learning_rate": 2.5977424879668705e-05,
"loss": 0.22480149567127228,
"mean_token_accuracy": 0.9269341230392456,
"num_tokens": 370791.0,
"step": 266
},
{
"entropy": 0.1969321221113205,
"epoch": 0.04117193523515806,
"grad_norm": 1.53125,
"learning_rate": 2.5814584178974218e-05,
"loss": 0.1720927655696869,
"mean_token_accuracy": 0.934974730014801,
"num_tokens": 372383.0,
"step": 267
},
{
"entropy": 0.23700961470603943,
"epoch": 0.041326137239784115,
"grad_norm": 1.921875,
"learning_rate": 2.5651708873180223e-05,
"loss": 0.22749063372612,
"mean_token_accuracy": 0.917475700378418,
"num_tokens": 373627.0,
"step": 268
},
{
"entropy": 0.22176285088062286,
"epoch": 0.04148033924441018,
"grad_norm": 1.4375,
"learning_rate": 2.5488805881542356e-05,
"loss": 0.19518814980983734,
"mean_token_accuracy": 0.922112226486206,
"num_tokens": 375150.0,
"step": 269
},
{
"entropy": 0.19811592996120453,
"epoch": 0.04163454124903624,
"grad_norm": 1.65625,
"learning_rate": 2.5325882124492395e-05,
"loss": 0.2038094401359558,
"mean_token_accuracy": 0.9243918657302856,
"num_tokens": 376679.0,
"step": 270
},
{
"entropy": 0.16331960260868073,
"epoch": 0.041788743253662296,
"grad_norm": 1.296875,
"learning_rate": 2.5162944523344256e-05,
"loss": 0.15330754220485687,
"mean_token_accuracy": 0.9463318586349487,
"num_tokens": 378718.0,
"step": 271
},
{
"entropy": 0.2266637682914734,
"epoch": 0.04194294525828836,
"grad_norm": 1.7734375,
"learning_rate": 2.5e-05,
"loss": 0.20924291014671326,
"mean_token_accuracy": 0.9225251078605652,
"num_tokens": 380120.0,
"step": 272
},
{
"entropy": 0.27386748790740967,
"epoch": 0.04209714726291442,
"grad_norm": 2.296875,
"learning_rate": 2.4837055476655746e-05,
"loss": 0.28491681814193726,
"mean_token_accuracy": 0.9068265557289124,
"num_tokens": 381212.0,
"step": 273
},
{
"entropy": 0.2462942749261856,
"epoch": 0.042251349267540476,
"grad_norm": 1.9375,
"learning_rate": 2.4674117875507615e-05,
"loss": 0.23223665356636047,
"mean_token_accuracy": 0.9165329337120056,
"num_tokens": 382466.0,
"step": 274
},
{
"entropy": 0.2614425718784332,
"epoch": 0.04240555127216654,
"grad_norm": 2.265625,
"learning_rate": 2.451119411845765e-05,
"loss": 0.27489128708839417,
"mean_token_accuracy": 0.9016948938369751,
"num_tokens": 383654.0,
"step": 275
},
{
"entropy": 0.21999643743038177,
"epoch": 0.0425597532767926,
"grad_norm": 2.140625,
"learning_rate": 2.4348291126819783e-05,
"loss": 0.2654040455818176,
"mean_token_accuracy": 0.9077669978141785,
"num_tokens": 385104.0,
"step": 276
},
{
"entropy": 0.2447359710931778,
"epoch": 0.04271395528141866,
"grad_norm": 2.546875,
"learning_rate": 2.4185415821025795e-05,
"loss": 0.2940978705883026,
"mean_token_accuracy": 0.8986432552337646,
"num_tokens": 386365.0,
"step": 277
},
{
"entropy": 0.24432024359703064,
"epoch": 0.04286815728604472,
"grad_norm": 2.171875,
"learning_rate": 2.4022575120331307e-05,
"loss": 0.2683406174182892,
"mean_token_accuracy": 0.9004524946212769,
"num_tokens": 387478.0,
"step": 278
},
{
"entropy": 0.19444933533668518,
"epoch": 0.04302235929067078,
"grad_norm": 1.7265625,
"learning_rate": 2.3859775942521854e-05,
"loss": 0.18984566628932953,
"mean_token_accuracy": 0.9271844625473022,
"num_tokens": 388928.0,
"step": 279
},
{
"entropy": 0.25862905383110046,
"epoch": 0.04317656129529684,
"grad_norm": 2.359375,
"learning_rate": 2.3697025203618987e-05,
"loss": 0.2914562523365021,
"mean_token_accuracy": 0.906593382358551,
"num_tokens": 390210.0,
"step": 280
},
{
"entropy": 0.2573435604572296,
"epoch": 0.0433307632999229,
"grad_norm": 2.15625,
"learning_rate": 2.3534329817586513e-05,
"loss": 0.25994932651519775,
"mean_token_accuracy": 0.9036144614219666,
"num_tokens": 391214.0,
"step": 281
},
{
"entropy": 0.25984057784080505,
"epoch": 0.043484965304548956,
"grad_norm": 2.109375,
"learning_rate": 2.3371696696036715e-05,
"loss": 0.23992516100406647,
"mean_token_accuracy": 0.9247743487358093,
"num_tokens": 392219.0,
"step": 282
},
{
"entropy": 0.20528267323970795,
"epoch": 0.04363916730917502,
"grad_norm": 1.9140625,
"learning_rate": 2.320913274793676e-05,
"loss": 0.20434120297431946,
"mean_token_accuracy": 0.9243749976158142,
"num_tokens": 393827.0,
"step": 283
},
{
"entropy": 0.44059571623802185,
"epoch": 0.04379336931380108,
"grad_norm": 3.546875,
"learning_rate": 2.30466448793152e-05,
"loss": 0.49274563789367676,
"mean_token_accuracy": 0.834419846534729,
"num_tokens": 394602.0,
"step": 284
},
{
"entropy": 0.24022506177425385,
"epoch": 0.04394757131842714,
"grad_norm": 1.921875,
"learning_rate": 2.28842399929686e-05,
"loss": 0.23765617609024048,
"mean_token_accuracy": 0.9164490699768066,
"num_tokens": 395759.0,
"step": 285
},
{
"entropy": 0.23994681239128113,
"epoch": 0.0441017733230532,
"grad_norm": 1.84375,
"learning_rate": 2.272192498816825e-05,
"loss": 0.2343621850013733,
"mean_token_accuracy": 0.9188445806503296,
"num_tokens": 397221.0,
"step": 286
},
{
"entropy": 0.27961966395378113,
"epoch": 0.04425597532767926,
"grad_norm": 2.25,
"learning_rate": 2.255970676036712e-05,
"loss": 0.27381986379623413,
"mean_token_accuracy": 0.8992950916290283,
"num_tokens": 398222.0,
"step": 287
},
{
"entropy": 0.1786043792963028,
"epoch": 0.04441017733230532,
"grad_norm": 1.4921875,
"learning_rate": 2.2397592200906906e-05,
"loss": 0.17795482277870178,
"mean_token_accuracy": 0.9386597871780396,
"num_tokens": 400170.0,
"step": 288
},
{
"entropy": 0.1822587549686432,
"epoch": 0.04456437933693138,
"grad_norm": 1.375,
"learning_rate": 2.223558819672526e-05,
"loss": 0.1628590077161789,
"mean_token_accuracy": 0.9355238676071167,
"num_tokens": 401791.0,
"step": 289
},
{
"entropy": 0.22401201725006104,
"epoch": 0.04471858134155744,
"grad_norm": 1.9765625,
"learning_rate": 2.2073701630063243e-05,
"loss": 0.23397932946681976,
"mean_token_accuracy": 0.9228187799453735,
"num_tokens": 403289.0,
"step": 290
},
{
"entropy": 0.26227450370788574,
"epoch": 0.0448727833461835,
"grad_norm": 2.28125,
"learning_rate": 2.1911939378172956e-05,
"loss": 0.2669812738895416,
"mean_token_accuracy": 0.9153226017951965,
"num_tokens": 404537.0,
"step": 291
},
{
"entropy": 0.21649585664272308,
"epoch": 0.04502698535080956,
"grad_norm": 1.46875,
"learning_rate": 2.175030831302535e-05,
"loss": 0.18651390075683594,
"mean_token_accuracy": 0.9295774698257446,
"num_tokens": 405894.0,
"step": 292
},
{
"entropy": 0.2264479100704193,
"epoch": 0.04518118735543562,
"grad_norm": 2.046875,
"learning_rate": 2.158881530101832e-05,
"loss": 0.24527707695960999,
"mean_token_accuracy": 0.9157626032829285,
"num_tokens": 407469.0,
"step": 293
},
{
"entropy": 0.19007329642772675,
"epoch": 0.04533538936006168,
"grad_norm": 1.828125,
"learning_rate": 2.1427467202685007e-05,
"loss": 0.18996097147464752,
"mean_token_accuracy": 0.9266110062599182,
"num_tokens": 409153.0,
"step": 294
},
{
"entropy": 0.2581518888473511,
"epoch": 0.04548959136468774,
"grad_norm": 1.890625,
"learning_rate": 2.126627087240231e-05,
"loss": 0.2599462568759918,
"mean_token_accuracy": 0.9158653616905212,
"num_tokens": 410409.0,
"step": 295
},
{
"entropy": 0.22935496270656586,
"epoch": 0.045643793369313804,
"grad_norm": 2.09375,
"learning_rate": 2.110523315809978e-05,
"loss": 0.21854767203330994,
"mean_token_accuracy": 0.9225852489471436,
"num_tokens": 411825.0,
"step": 296
},
{
"entropy": 0.25962114334106445,
"epoch": 0.04579799537393986,
"grad_norm": 2.296875,
"learning_rate": 2.0944360900968617e-05,
"loss": 0.28228771686553955,
"mean_token_accuracy": 0.8985915780067444,
"num_tokens": 412898.0,
"step": 297
},
{
"entropy": 0.25601744651794434,
"epoch": 0.04595219737856592,
"grad_norm": 1.9765625,
"learning_rate": 2.0783660935171092e-05,
"loss": 0.26037973165512085,
"mean_token_accuracy": 0.9110707640647888,
"num_tokens": 414008.0,
"step": 298
},
{
"entropy": 0.2810611128807068,
"epoch": 0.046106399383191984,
"grad_norm": 2.328125,
"learning_rate": 2.0623140087550215e-05,
"loss": 0.29850900173187256,
"mean_token_accuracy": 0.9104072451591492,
"num_tokens": 415121.0,
"step": 299
},
{
"entropy": 0.22841358184814453,
"epoch": 0.04626060138781804,
"grad_norm": 1.84375,
"learning_rate": 2.046280517733971e-05,
"loss": 0.22839921712875366,
"mean_token_accuracy": 0.923349916934967,
"num_tokens": 416538.0,
"step": 300
},
{
"entropy": 0.2764427959918976,
"epoch": 0.0464148033924441,
"grad_norm": 2.34375,
"learning_rate": 2.0302663015874322e-05,
"loss": 0.2636858820915222,
"mean_token_accuracy": 0.9106976985931396,
"num_tokens": 417621.0,
"step": 301
},
{
"entropy": 0.18497152626514435,
"epoch": 0.046569005397070165,
"grad_norm": 1.5,
"learning_rate": 2.0142720406300465e-05,
"loss": 0.18430255353450775,
"mean_token_accuracy": 0.929759681224823,
"num_tokens": 419252.0,
"step": 302
},
{
"entropy": 0.2483554184436798,
"epoch": 0.04672320740169622,
"grad_norm": 1.9140625,
"learning_rate": 1.9982984143287188e-05,
"loss": 0.24268567562103271,
"mean_token_accuracy": 0.9065420627593994,
"num_tokens": 420437.0,
"step": 303
},
{
"entropy": 0.2957545518875122,
"epoch": 0.04687740940632228,
"grad_norm": 2.59375,
"learning_rate": 1.9823461012737564e-05,
"loss": 0.3344174325466156,
"mean_token_accuracy": 0.8834766149520874,
"num_tokens": 421492.0,
"step": 304
},
{
"entropy": 0.23411741852760315,
"epoch": 0.047031611410948346,
"grad_norm": 1.5703125,
"learning_rate": 1.966415779150037e-05,
"loss": 0.21458064019680023,
"mean_token_accuracy": 0.9274131059646606,
"num_tokens": 422795.0,
"step": 305
},
{
"entropy": 0.2103796899318695,
"epoch": 0.0471858134155744,
"grad_norm": 1.671875,
"learning_rate": 1.9505081247082237e-05,
"loss": 0.20959612727165222,
"mean_token_accuracy": 0.9208722710609436,
"num_tokens": 424408.0,
"step": 306
},
{
"entropy": 0.2197587639093399,
"epoch": 0.047340015420200464,
"grad_norm": 1.6796875,
"learning_rate": 1.9346238137360106e-05,
"loss": 0.20553667843341827,
"mean_token_accuracy": 0.9193548560142517,
"num_tokens": 425718.0,
"step": 307
},
{
"entropy": 0.24315893650054932,
"epoch": 0.04749421742482652,
"grad_norm": 1.6484375,
"learning_rate": 1.918763521029418e-05,
"loss": 0.22866766154766083,
"mean_token_accuracy": 0.9147771596908569,
"num_tokens": 427005.0,
"step": 308
},
{
"entropy": 0.2538098990917206,
"epoch": 0.04764841942945258,
"grad_norm": 2.078125,
"learning_rate": 1.9029279203641232e-05,
"loss": 0.2357470542192459,
"mean_token_accuracy": 0.9233912229537964,
"num_tokens": 427992.0,
"step": 309
},
{
"entropy": 0.3305405378341675,
"epoch": 0.047802621434078645,
"grad_norm": 2.875,
"learning_rate": 1.8871176844668374e-05,
"loss": 0.3201872408390045,
"mean_token_accuracy": 0.8776978254318237,
"num_tokens": 428834.0,
"step": 310
},
{
"entropy": 0.22924208641052246,
"epoch": 0.0479568234387047,
"grad_norm": 1.703125,
"learning_rate": 1.8713334849867315e-05,
"loss": 0.2193642556667328,
"mean_token_accuracy": 0.9297805428504944,
"num_tokens": 430437.0,
"step": 311
},
{
"entropy": 0.2438676506280899,
"epoch": 0.04811102544333076,
"grad_norm": 1.7578125,
"learning_rate": 1.8555759924668952e-05,
"loss": 0.2391282469034195,
"mean_token_accuracy": 0.9204819202423096,
"num_tokens": 431690.0,
"step": 312
},
{
"entropy": 0.30626124143600464,
"epoch": 0.048265227447956825,
"grad_norm": 2.484375,
"learning_rate": 1.8398458763158578e-05,
"loss": 0.31509530544281006,
"mean_token_accuracy": 0.8954593539237976,
"num_tokens": 432645.0,
"step": 313
},
{
"entropy": 0.26661908626556396,
"epoch": 0.04841942945258288,
"grad_norm": 1.9921875,
"learning_rate": 1.8241438047791454e-05,
"loss": 0.2524988651275635,
"mean_token_accuracy": 0.9092437028884888,
"num_tokens": 433843.0,
"step": 314
},
{
"entropy": 0.22748155891895294,
"epoch": 0.04857363145720894,
"grad_norm": 1.8515625,
"learning_rate": 1.8084704449108985e-05,
"loss": 0.2243906408548355,
"mean_token_accuracy": 0.9239205121994019,
"num_tokens": 435310.0,
"step": 315
},
{
"entropy": 0.17577649652957916,
"epoch": 0.048727833461835006,
"grad_norm": 1.671875,
"learning_rate": 1.7928264625455282e-05,
"loss": 0.1813218891620636,
"mean_token_accuracy": 0.9322709441184998,
"num_tokens": 437326.0,
"step": 316
},
{
"entropy": 0.27867627143859863,
"epoch": 0.04888203546646106,
"grad_norm": 2.390625,
"learning_rate": 1.7772125222694337e-05,
"loss": 0.28030475974082947,
"mean_token_accuracy": 0.8948306441307068,
"num_tokens": 438456.0,
"step": 317
},
{
"entropy": 0.23422475159168243,
"epoch": 0.049036237471087124,
"grad_norm": 1.65625,
"learning_rate": 1.7616292873927688e-05,
"loss": 0.2259235829114914,
"mean_token_accuracy": 0.915672242641449,
"num_tokens": 439721.0,
"step": 318
},
{
"entropy": 0.20051687955856323,
"epoch": 0.04919043947571319,
"grad_norm": 1.5625,
"learning_rate": 1.7460774199212625e-05,
"loss": 0.20561350882053375,
"mean_token_accuracy": 0.9247232675552368,
"num_tokens": 441084.0,
"step": 319
},
{
"entropy": 0.17916183173656464,
"epoch": 0.04934464148033924,
"grad_norm": 1.265625,
"learning_rate": 1.7305575805280956e-05,
"loss": 0.16743285953998566,
"mean_token_accuracy": 0.9406779408454895,
"num_tokens": 442862.0,
"step": 320
},
{
"entropy": 0.18751926720142365,
"epoch": 0.049498843484965305,
"grad_norm": 1.3671875,
"learning_rate": 1.7150704285258375e-05,
"loss": 0.16947750747203827,
"mean_token_accuracy": 0.9436795711517334,
"num_tokens": 444468.0,
"step": 321
},
{
"entropy": 0.17793025076389313,
"epoch": 0.04965304548959137,
"grad_norm": 1.28125,
"learning_rate": 1.6996166218384307e-05,
"loss": 0.16534742712974548,
"mean_token_accuracy": 0.939068078994751,
"num_tokens": 446150.0,
"step": 322
},
{
"entropy": 0.2475776970386505,
"epoch": 0.04980724749421742,
"grad_norm": 2.15625,
"learning_rate": 1.684196816973248e-05,
"loss": 0.2468724101781845,
"mean_token_accuracy": 0.919457733631134,
"num_tokens": 447412.0,
"step": 323
},
{
"entropy": 0.2225208878517151,
"epoch": 0.049961449498843485,
"grad_norm": 1.625,
"learning_rate": 1.6688116689931972e-05,
"loss": 0.20401687920093536,
"mean_token_accuracy": 0.9311926364898682,
"num_tokens": 448946.0,
"step": 324
},
{
"entropy": 0.2503822445869446,
"epoch": 0.05011565150346955,
"grad_norm": 1.96875,
"learning_rate": 1.6534618314888945e-05,
"loss": 0.22844718396663666,
"mean_token_accuracy": 0.9175724387168884,
"num_tokens": 450058.0,
"step": 325
},
{
"entropy": 0.25004157423973083,
"epoch": 0.050269853508095604,
"grad_norm": 2.203125,
"learning_rate": 1.638147956550904e-05,
"loss": 0.25791749358177185,
"mean_token_accuracy": 0.9117646813392639,
"num_tokens": 451324.0,
"step": 326
},
{
"entropy": 0.22011376917362213,
"epoch": 0.050424055512721666,
"grad_norm": 1.8515625,
"learning_rate": 1.622870694742026e-05,
"loss": 0.19179725646972656,
"mean_token_accuracy": 0.9320175647735596,
"num_tokens": 452700.0,
"step": 327
},
{
"entropy": 0.193440780043602,
"epoch": 0.05057825751734773,
"grad_norm": 1.625,
"learning_rate": 1.6076306950696658e-05,
"loss": 0.19295921921730042,
"mean_token_accuracy": 0.9318463206291199,
"num_tokens": 454322.0,
"step": 328
},
{
"entropy": 0.17849111557006836,
"epoch": 0.050732459521973784,
"grad_norm": 1.46875,
"learning_rate": 1.592428604958264e-05,
"loss": 0.16607390344142914,
"mean_token_accuracy": 0.9433174133300781,
"num_tokens": 456006.0,
"step": 329
},
{
"entropy": 0.2486262321472168,
"epoch": 0.05088666152659985,
"grad_norm": 1.953125,
"learning_rate": 1.5772650702217878e-05,
"loss": 0.2480083853006363,
"mean_token_accuracy": 0.9057851433753967,
"num_tokens": 457224.0,
"step": 330
},
{
"entropy": 0.27837270498275757,
"epoch": 0.05104086353122591,
"grad_norm": 2.59375,
"learning_rate": 1.5621407350362986e-05,
"loss": 0.2996099293231964,
"mean_token_accuracy": 0.9042253494262695,
"num_tokens": 458297.0,
"step": 331
},
{
"entropy": 0.20956268906593323,
"epoch": 0.051195065535851965,
"grad_norm": 1.5625,
"learning_rate": 1.5470562419125868e-05,
"loss": 0.18728220462799072,
"mean_token_accuracy": 0.9295774698257446,
"num_tokens": 459796.0,
"step": 332
},
{
"entropy": 0.29057589173316956,
"epoch": 0.05134926754047803,
"grad_norm": 2.40625,
"learning_rate": 1.5320122316688735e-05,
"loss": 0.29962292313575745,
"mean_token_accuracy": 0.8858093023300171,
"num_tokens": 460706.0,
"step": 333
},
{
"entropy": 0.1948358118534088,
"epoch": 0.05150346954510408,
"grad_norm": 1.578125,
"learning_rate": 1.517009343403591e-05,
"loss": 0.1801883429288864,
"mean_token_accuracy": 0.93376624584198,
"num_tokens": 462254.0,
"step": 334
},
{
"entropy": 0.22513329982757568,
"epoch": 0.051657671549730146,
"grad_norm": 2.046875,
"learning_rate": 1.5020482144682308e-05,
"loss": 0.22428080439567566,
"mean_token_accuracy": 0.9161764979362488,
"num_tokens": 463622.0,
"step": 335
},
{
"entropy": 0.2175763100385666,
"epoch": 0.05181187355435621,
"grad_norm": 2.15625,
"learning_rate": 1.4871294804402675e-05,
"loss": 0.21439555287361145,
"mean_token_accuracy": 0.9237037301063538,
"num_tokens": 464980.0,
"step": 336
},
{
"entropy": 0.1653544306755066,
"epoch": 0.051966075558982264,
"grad_norm": 1.796875,
"learning_rate": 1.472253775096159e-05,
"loss": 0.16475962102413177,
"mean_token_accuracy": 0.9355390667915344,
"num_tokens": 466741.0,
"step": 337
},
{
"entropy": 0.20776669681072235,
"epoch": 0.052120277563608326,
"grad_norm": 1.9453125,
"learning_rate": 1.4574217303844211e-05,
"loss": 0.19919782876968384,
"mean_token_accuracy": 0.9283204674720764,
"num_tokens": 468172.0,
"step": 338
},
{
"entropy": 0.18218226730823517,
"epoch": 0.05227447956823439,
"grad_norm": 1.6875,
"learning_rate": 1.4426339763987844e-05,
"loss": 0.1778276562690735,
"mean_token_accuracy": 0.9303686618804932,
"num_tokens": 469889.0,
"step": 339
},
{
"entropy": 0.25532829761505127,
"epoch": 0.052428681572860444,
"grad_norm": 1.9375,
"learning_rate": 1.4278911413514204e-05,
"loss": 0.26636841893196106,
"mean_token_accuracy": 0.9083333611488342,
"num_tokens": 471217.0,
"step": 340
},
{
"entropy": 0.19937695562839508,
"epoch": 0.05258288357748651,
"grad_norm": 1.6015625,
"learning_rate": 1.4131938515462639e-05,
"loss": 0.1952292025089264,
"mean_token_accuracy": 0.9280303120613098,
"num_tokens": 472809.0,
"step": 341
},
{
"entropy": 0.28071922063827515,
"epoch": 0.05273708558211257,
"grad_norm": 2.4375,
"learning_rate": 1.3985427313523947e-05,
"loss": 0.28267180919647217,
"mean_token_accuracy": 0.885199248790741,
"num_tokens": 473871.0,
"step": 342
},
{
"entropy": 0.1708391159772873,
"epoch": 0.052891287586738625,
"grad_norm": 1.40625,
"learning_rate": 1.3839384031775226e-05,
"loss": 0.1682218760251999,
"mean_token_accuracy": 0.9421338438987732,
"num_tokens": 475538.0,
"step": 343
},
{
"entropy": 0.17169421911239624,
"epoch": 0.05304548959136469,
"grad_norm": 1.671875,
"learning_rate": 1.3693814874415389e-05,
"loss": 0.1755795031785965,
"mean_token_accuracy": 0.9377777576446533,
"num_tokens": 477346.0,
"step": 344
},
{
"entropy": 0.2197735607624054,
"epoch": 0.05319969159599075,
"grad_norm": 1.8515625,
"learning_rate": 1.3548726025501688e-05,
"loss": 0.22578758001327515,
"mean_token_accuracy": 0.9094029068946838,
"num_tokens": 478811.0,
"step": 345
},
{
"entropy": 0.21483223140239716,
"epoch": 0.053353893600616806,
"grad_norm": 1.6484375,
"learning_rate": 1.340412364868689e-05,
"loss": 0.21270032227039337,
"mean_token_accuracy": 0.9238030910491943,
"num_tokens": 480302.0,
"step": 346
},
{
"entropy": 0.27951836585998535,
"epoch": 0.05350809560524287,
"grad_norm": 2.28125,
"learning_rate": 1.3260013886957538e-05,
"loss": 0.2666223645210266,
"mean_token_accuracy": 0.9077869057655334,
"num_tokens": 481286.0,
"step": 347
},
{
"entropy": 0.1917494833469391,
"epoch": 0.05366229760986893,
"grad_norm": 1.578125,
"learning_rate": 1.3116402862372933e-05,
"loss": 0.19692182540893555,
"mean_token_accuracy": 0.9339783787727356,
"num_tokens": 483051.0,
"step": 348
},
{
"entropy": 0.20676381886005402,
"epoch": 0.053816499614494986,
"grad_norm": 1.6328125,
"learning_rate": 1.2973296675805041e-05,
"loss": 0.20207884907722473,
"mean_token_accuracy": 0.9374217987060547,
"num_tokens": 484657.0,
"step": 349
},
{
"entropy": 0.19531835615634918,
"epoch": 0.05397070161912105,
"grad_norm": 1.7421875,
"learning_rate": 1.2830701406679375e-05,
"loss": 0.18931494653224945,
"mean_token_accuracy": 0.9317750930786133,
"num_tokens": 486248.0,
"step": 350
},
{
"entropy": 0.3396989405155182,
"epoch": 0.05412490362374711,
"grad_norm": 5.1875,
"learning_rate": 1.2688623112716652e-05,
"loss": 0.37070798873901367,
"mean_token_accuracy": 0.869767427444458,
"num_tokens": 487116.0,
"step": 351
},
{
"entropy": 0.17527468502521515,
"epoch": 0.05427910562837317,
"grad_norm": 1.859375,
"learning_rate": 1.2547067829675535e-05,
"loss": 0.17982880771160126,
"mean_token_accuracy": 0.9339567422866821,
"num_tokens": 488835.0,
"step": 352
},
{
"entropy": 0.2687583565711975,
"epoch": 0.05443330763299923,
"grad_norm": 2.03125,
"learning_rate": 1.2406041571096164e-05,
"loss": 0.2823106646537781,
"mean_token_accuracy": 0.9135371446609497,
"num_tokens": 489988.0,
"step": 353
},
{
"entropy": 0.1937769502401352,
"epoch": 0.05458750963762529,
"grad_norm": 1.8515625,
"learning_rate": 1.2265550328044681e-05,
"loss": 0.19238050282001495,
"mean_token_accuracy": 0.9310998916625977,
"num_tokens": 491578.0,
"step": 354
},
{
"entropy": 0.17158617079257965,
"epoch": 0.05474171164225135,
"grad_norm": 1.4765625,
"learning_rate": 1.2125600068858772e-05,
"loss": 0.16338223218917847,
"mean_token_accuracy": 0.9456647634506226,
"num_tokens": 493316.0,
"step": 355
},
{
"entropy": 0.19250212609767914,
"epoch": 0.05489591364687741,
"grad_norm": 1.671875,
"learning_rate": 1.1986196738894078e-05,
"loss": 0.17621511220932007,
"mean_token_accuracy": 0.9345238208770752,
"num_tokens": 494668.0,
"step": 356
},
{
"entropy": 0.19578416645526886,
"epoch": 0.05505011565150347,
"grad_norm": 1.8828125,
"learning_rate": 1.1847346260271647e-05,
"loss": 0.183770090341568,
"mean_token_accuracy": 0.9346092343330383,
"num_tokens": 495930.0,
"step": 357
},
{
"entropy": 0.22412899136543274,
"epoch": 0.05520431765612953,
"grad_norm": 1.828125,
"learning_rate": 1.1709054531626313e-05,
"loss": 0.2516805827617645,
"mean_token_accuracy": 0.9137670397758484,
"num_tokens": 497260.0,
"step": 358
},
{
"entropy": 0.2025316208600998,
"epoch": 0.05535851966075559,
"grad_norm": 1.3203125,
"learning_rate": 1.1571327427856177e-05,
"loss": 0.19299444556236267,
"mean_token_accuracy": 0.9367007613182068,
"num_tokens": 498832.0,
"step": 359
},
{
"entropy": 0.2235983908176422,
"epoch": 0.05551272166538165,
"grad_norm": 1.5859375,
"learning_rate": 1.1434170799872947e-05,
"loss": 0.200628861784935,
"mean_token_accuracy": 0.929682195186615,
"num_tokens": 500319.0,
"step": 360
},
{
"entropy": 0.28108713030815125,
"epoch": 0.05566692367000771,
"grad_norm": 2.40625,
"learning_rate": 1.1297590474353464e-05,
"loss": 0.2882252335548401,
"mean_token_accuracy": 0.8986828923225403,
"num_tokens": 501314.0,
"step": 361
},
{
"entropy": 0.21756984293460846,
"epoch": 0.05582112567463377,
"grad_norm": 2.125,
"learning_rate": 1.116159225349213e-05,
"loss": 0.23450873792171478,
"mean_token_accuracy": 0.9163208603858948,
"num_tokens": 502768.0,
"step": 362
},
{
"entropy": 0.2556920051574707,
"epoch": 0.05597532767925983,
"grad_norm": 2.453125,
"learning_rate": 1.1026181914754388e-05,
"loss": 0.2757260203361511,
"mean_token_accuracy": 0.9049859046936035,
"num_tokens": 503839.0,
"step": 363
},
{
"entropy": 0.21779917180538177,
"epoch": 0.05612952968388589,
"grad_norm": 1.953125,
"learning_rate": 1.089136521063137e-05,
"loss": 0.22174124419689178,
"mean_token_accuracy": 0.9221984148025513,
"num_tokens": 505248.0,
"step": 364
},
{
"entropy": 0.3109717071056366,
"epoch": 0.05628373168851195,
"grad_norm": 2.578125,
"learning_rate": 1.075714786839542e-05,
"loss": 0.2979055345058441,
"mean_token_accuracy": 0.8831614851951599,
"num_tokens": 506129.0,
"step": 365
},
{
"entropy": 0.22565557062625885,
"epoch": 0.05643793369313801,
"grad_norm": 1.859375,
"learning_rate": 1.0623535589856887e-05,
"loss": 0.23962406814098358,
"mean_token_accuracy": 0.9183965921401978,
"num_tokens": 507534.0,
"step": 366
},
{
"entropy": 0.16417403519153595,
"epoch": 0.05659213569776407,
"grad_norm": 2.25,
"learning_rate": 1.0490534051121808e-05,
"loss": 0.16284841299057007,
"mean_token_accuracy": 0.937706708908081,
"num_tokens": 509356.0,
"step": 367
},
{
"entropy": 0.18802893161773682,
"epoch": 0.05674633770239013,
"grad_norm": 1.6640625,
"learning_rate": 1.0358148902350853e-05,
"loss": 0.19001488387584686,
"mean_token_accuracy": 0.930488646030426,
"num_tokens": 510817.0,
"step": 368
},
{
"entropy": 0.22402897477149963,
"epoch": 0.05690053970701619,
"grad_norm": 2.125,
"learning_rate": 1.0226385767519259e-05,
"loss": 0.228716179728508,
"mean_token_accuracy": 0.924344539642334,
"num_tokens": 512160.0,
"step": 369
},
{
"entropy": 0.24438747763633728,
"epoch": 0.05705474171164225,
"grad_norm": 1.984375,
"learning_rate": 1.0095250244177887e-05,
"loss": 0.22704952955245972,
"mean_token_accuracy": 0.918749988079071,
"num_tokens": 513288.0,
"step": 370
},
{
"entropy": 0.23192906379699707,
"epoch": 0.057208943716268314,
"grad_norm": 1.96875,
"learning_rate": 9.964747903215513e-06,
"loss": 0.22084636986255646,
"mean_token_accuracy": 0.929665744304657,
"num_tokens": 514732.0,
"step": 371
},
{
"entropy": 0.1626010537147522,
"epoch": 0.05736314572089437,
"grad_norm": 1.3203125,
"learning_rate": 9.834884288622054e-06,
"loss": 0.15189611911773682,
"mean_token_accuracy": 0.941209077835083,
"num_tokens": 516543.0,
"step": 372
},
{
"entropy": 0.16602161526679993,
"epoch": 0.05751734772552043,
"grad_norm": 1.3828125,
"learning_rate": 9.705664917253143e-06,
"loss": 0.18036378920078278,
"mean_token_accuracy": 0.9382113814353943,
"num_tokens": 518396.0,
"step": 373
},
{
"entropy": 0.16473768651485443,
"epoch": 0.057671549730146494,
"grad_norm": 1.3046875,
"learning_rate": 9.577095278595694e-06,
"loss": 0.15197424590587616,
"mean_token_accuracy": 0.9414084553718567,
"num_tokens": 520179.0,
"step": 374
},
{
"entropy": 0.1879141479730606,
"epoch": 0.05782575173477255,
"grad_norm": 1.8046875,
"learning_rate": 9.449180834534749e-06,
"loss": 0.18156398832798004,
"mean_token_accuracy": 0.9304715991020203,
"num_tokens": 521841.0,
"step": 375
},
{
"entropy": 0.2549605369567871,
"epoch": 0.05797995373939861,
"grad_norm": 2.203125,
"learning_rate": 9.321927019121435e-06,
"loss": 0.257169634103775,
"mean_token_accuracy": 0.9048386812210083,
"num_tokens": 523089.0,
"step": 376
},
{
"entropy": 0.18407224118709564,
"epoch": 0.058134155744024675,
"grad_norm": 1.609375,
"learning_rate": 9.195339238342071e-06,
"loss": 0.18074241280555725,
"mean_token_accuracy": 0.936096727848053,
"num_tokens": 524834.0,
"step": 377
},
{
"entropy": 0.21801158785820007,
"epoch": 0.05828835774865073,
"grad_norm": 1.9609375,
"learning_rate": 9.069422869888583e-06,
"loss": 0.22194962203502655,
"mean_token_accuracy": 0.923652708530426,
"num_tokens": 526178.0,
"step": 378
},
{
"entropy": 0.18715234100818634,
"epoch": 0.05844255975327679,
"grad_norm": 1.34375,
"learning_rate": 8.944183262929984e-06,
"loss": 0.17807839810848236,
"mean_token_accuracy": 0.9365825057029724,
"num_tokens": 527889.0,
"step": 379
},
{
"entropy": 0.196278914809227,
"epoch": 0.058596761757902856,
"grad_norm": 1.6953125,
"learning_rate": 8.819625737885187e-06,
"loss": 0.20651084184646606,
"mean_token_accuracy": 0.9256097674369812,
"num_tokens": 529537.0,
"step": 380
},
{
"entropy": 0.35177287459373474,
"epoch": 0.05875096376252891,
"grad_norm": 3.296875,
"learning_rate": 8.695755586196924e-06,
"loss": 0.385383665561676,
"mean_token_accuracy": 0.8580645322799683,
"num_tokens": 530475.0,
"step": 381
},
{
"entropy": 0.25344812870025635,
"epoch": 0.058905165767154974,
"grad_norm": 2.078125,
"learning_rate": 8.572578070107016e-06,
"loss": 0.25393110513687134,
"mean_token_accuracy": 0.917894721031189,
"num_tokens": 531433.0,
"step": 382
},
{
"entropy": 0.3020297884941101,
"epoch": 0.059059367771781036,
"grad_norm": 2.359375,
"learning_rate": 8.450098422432787e-06,
"loss": 0.3018152415752411,
"mean_token_accuracy": 0.9065510630607605,
"num_tokens": 532479.0,
"step": 383
},
{
"entropy": 0.15192678570747375,
"epoch": 0.05921356977640709,
"grad_norm": 1.4296875,
"learning_rate": 8.328321846344755e-06,
"loss": 0.1450488418340683,
"mean_token_accuracy": 0.9468623399734497,
"num_tokens": 534463.0,
"step": 384
},
{
"epoch": 0.05921356977640709,
"eval_entropy": 0.22466930076044206,
"eval_loss": 0.22621265053749084,
"eval_mean_token_accuracy": 0.9194652596760912,
"eval_num_tokens": 534463.0,
"eval_runtime": 34.9665,
"eval_samples_per_second": 78.103,
"eval_steps_per_second": 9.781,
"step": 384
},
{
"entropy": 0.18735887110233307,
"epoch": 0.059367771781033155,
"grad_norm": 1.375,
"learning_rate": 8.207253515145625e-06,
"loss": 0.18456675112247467,
"mean_token_accuracy": 0.9276748299598694,
"num_tokens": 536144.0,
"step": 385
},
{
"entropy": 0.2384348064661026,
"epoch": 0.05952197378565921,
"grad_norm": 1.9375,
"learning_rate": 8.086898572050494e-06,
"loss": 0.24932722747325897,
"mean_token_accuracy": 0.9125475287437439,
"num_tokens": 537467.0,
"step": 386
},
{
"entropy": 0.21620430052280426,
"epoch": 0.05967617579028527,
"grad_norm": 1.8515625,
"learning_rate": 7.967262129968378e-06,
"loss": 0.20638106763362885,
"mean_token_accuracy": 0.9262917637825012,
"num_tokens": 538791.0,
"step": 387
},
{
"entropy": 0.22282716631889343,
"epoch": 0.059830377794911335,
"grad_norm": 1.8203125,
"learning_rate": 7.848349271284952e-06,
"loss": 0.24068771302700043,
"mean_token_accuracy": 0.911854088306427,
"num_tokens": 540115.0,
"step": 388
},
{
"entropy": 0.19987352192401886,
"epoch": 0.05998457979953739,
"grad_norm": 1.7109375,
"learning_rate": 7.730165047646723e-06,
"loss": 0.19121116399765015,
"mean_token_accuracy": 0.93138587474823,
"num_tokens": 541595.0,
"step": 389
},
{
"entropy": 0.2530774772167206,
"epoch": 0.06013878180416345,
"grad_norm": 2.484375,
"learning_rate": 7.612714479746347e-06,
"loss": 0.250463604927063,
"mean_token_accuracy": 0.9078303575515747,
"num_tokens": 542829.0,
"step": 390
},
{
"entropy": 0.2623169720172882,
"epoch": 0.060292983808789516,
"grad_norm": 2.515625,
"learning_rate": 7.4960025571094025e-06,
"loss": 0.27675166726112366,
"mean_token_accuracy": 0.9017013311386108,
"num_tokens": 543895.0,
"step": 391
},
{
"entropy": 0.2155791074037552,
"epoch": 0.06044718581341557,
"grad_norm": 1.7890625,
"learning_rate": 7.380034237882394e-06,
"loss": 0.21280765533447266,
"mean_token_accuracy": 0.9217687249183655,
"num_tokens": 545373.0,
"step": 392
},
{
"entropy": 0.3150392770767212,
"epoch": 0.060601387818041634,
"grad_norm": 2.5,
"learning_rate": 7.264814448622106e-06,
"loss": 0.3080776035785675,
"mean_token_accuracy": 0.898815929889679,
"num_tokens": 546310.0,
"step": 393
},
{
"entropy": 0.19685329496860504,
"epoch": 0.0607555898226677,
"grad_norm": 2.125,
"learning_rate": 7.150348084086367e-06,
"loss": 0.22213543951511383,
"mean_token_accuracy": 0.9212239384651184,
"num_tokens": 547854.0,
"step": 394
},
{
"entropy": 0.1816016435623169,
"epoch": 0.06090979182729375,
"grad_norm": 1.4140625,
"learning_rate": 7.036640007026038e-06,
"loss": 0.17253060638904572,
"mean_token_accuracy": 0.9350804090499878,
"num_tokens": 549541.0,
"step": 395
},
{
"entropy": 0.19817869365215302,
"epoch": 0.061063993831919815,
"grad_norm": 1.7890625,
"learning_rate": 6.923695047978502e-06,
"loss": 0.191897913813591,
"mean_token_accuracy": 0.9271523356437683,
"num_tokens": 551059.0,
"step": 396
},
{
"entropy": 0.24792121350765228,
"epoch": 0.06121819583654588,
"grad_norm": 2.25,
"learning_rate": 6.811518005062423e-06,
"loss": 0.2625022828578949,
"mean_token_accuracy": 0.9022988677024841,
"num_tokens": 552111.0,
"step": 397
},
{
"entropy": 0.24607616662979126,
"epoch": 0.06137239784117193,
"grad_norm": 2.28125,
"learning_rate": 6.700113643773892e-06,
"loss": 0.22993192076683044,
"mean_token_accuracy": 0.9271889328956604,
"num_tokens": 553204.0,
"step": 398
},
{
"entropy": 0.25920623540878296,
"epoch": 0.061526599845797995,
"grad_norm": 2.453125,
"learning_rate": 6.589486696784028e-06,
"loss": 0.27900075912475586,
"mean_token_accuracy": 0.9022931456565857,
"num_tokens": 554215.0,
"step": 399
},
{
"entropy": 0.28530606627464294,
"epoch": 0.06168080185042406,
"grad_norm": 2.4375,
"learning_rate": 6.47964186373787e-06,
"loss": 0.2928396165370941,
"mean_token_accuracy": 0.8845500946044922,
"num_tokens": 555401.0,
"step": 400
},
{
"entropy": 0.2927665114402771,
"epoch": 0.061835003855050114,
"grad_norm": 2.25,
"learning_rate": 6.370583811054778e-06,
"loss": 0.2968969941139221,
"mean_token_accuracy": 0.9039433598518372,
"num_tokens": 556398.0,
"step": 401
},
{
"entropy": 0.23018132150173187,
"epoch": 0.061989205859676176,
"grad_norm": 1.96875,
"learning_rate": 6.262317171730167e-06,
"loss": 0.23996573686599731,
"mean_token_accuracy": 0.9214015007019043,
"num_tokens": 557462.0,
"step": 402
},
{
"entropy": 0.25166183710098267,
"epoch": 0.06214340786430224,
"grad_norm": 2.0,
"learning_rate": 6.154846545138695e-06,
"loss": 0.2649187445640564,
"mean_token_accuracy": 0.9033687710762024,
"num_tokens": 558598.0,
"step": 403
},
{
"entropy": 0.23649781942367554,
"epoch": 0.062297609868928294,
"grad_norm": 2.15625,
"learning_rate": 6.048176496838856e-06,
"loss": 0.21528743207454681,
"mean_token_accuracy": 0.9269746541976929,
"num_tokens": 559948.0,
"step": 404
},
{
"entropy": 0.22737731039524078,
"epoch": 0.06245181187355436,
"grad_norm": 1.796875,
"learning_rate": 5.9423115583790604e-06,
"loss": 0.21719223260879517,
"mean_token_accuracy": 0.9225531816482544,
"num_tokens": 561131.0,
"step": 405
},
{
"entropy": 0.21060694754123688,
"epoch": 0.06260601387818042,
"grad_norm": 1.4453125,
"learning_rate": 5.8372562271051e-06,
"loss": 0.19261834025382996,
"mean_token_accuracy": 0.9304878115653992,
"num_tokens": 562779.0,
"step": 406
},
{
"entropy": 0.24134337902069092,
"epoch": 0.06276021588280647,
"grad_norm": 1.8671875,
"learning_rate": 5.733014965969091e-06,
"loss": 0.2224052995443344,
"mean_token_accuracy": 0.9310910701751709,
"num_tokens": 564006.0,
"step": 407
},
{
"entropy": 0.19692017138004303,
"epoch": 0.06291441788743253,
"grad_norm": 1.6328125,
"learning_rate": 5.629592203339909e-06,
"loss": 0.18327265977859497,
"mean_token_accuracy": 0.9346548914909363,
"num_tokens": 565376.0,
"step": 408
},
{
"entropy": 0.2016250342130661,
"epoch": 0.0630686198920586,
"grad_norm": 1.4765625,
"learning_rate": 5.526992332815012e-06,
"loss": 0.20120908319950104,
"mean_token_accuracy": 0.9263085126876831,
"num_tokens": 566836.0,
"step": 409
},
{
"entropy": 0.14676110446453094,
"epoch": 0.06322282189668466,
"grad_norm": 1.3046875,
"learning_rate": 5.4252197130338525e-06,
"loss": 0.1583862602710724,
"mean_token_accuracy": 0.9458128213882446,
"num_tokens": 569280.0,
"step": 410
},
{
"entropy": 0.1877201646566391,
"epoch": 0.06337702390131071,
"grad_norm": 2.09375,
"learning_rate": 5.3242786674926545e-06,
"loss": 0.18557564914226532,
"mean_token_accuracy": 0.9334638118743896,
"num_tokens": 570821.0,
"step": 411
},
{
"entropy": 0.21993833780288696,
"epoch": 0.06353122590593678,
"grad_norm": 1.7421875,
"learning_rate": 5.224173484360798e-06,
"loss": 0.19618681073188782,
"mean_token_accuracy": 0.9358024597167969,
"num_tokens": 572044.0,
"step": 412
},
{
"entropy": 0.20039010047912598,
"epoch": 0.06368542791056284,
"grad_norm": 1.3671875,
"learning_rate": 5.124908416298615e-06,
"loss": 0.18724791705608368,
"mean_token_accuracy": 0.9329929947853088,
"num_tokens": 573619.0,
"step": 413
},
{
"entropy": 0.21013715863227844,
"epoch": 0.06383962991518889,
"grad_norm": 1.796875,
"learning_rate": 5.026487680276723e-06,
"loss": 0.21998311579227448,
"mean_token_accuracy": 0.9184691905975342,
"num_tokens": 574829.0,
"step": 414
},
{
"entropy": 0.26953125,
"epoch": 0.06399383191981496,
"grad_norm": 2.171875,
"learning_rate": 4.928915457396913e-06,
"loss": 0.26942914724349976,
"mean_token_accuracy": 0.9191489219665527,
"num_tokens": 576012.0,
"step": 415
},
{
"entropy": 0.23597829043865204,
"epoch": 0.06414803392444102,
"grad_norm": 1.84375,
"learning_rate": 4.832195892714489e-06,
"loss": 0.22428561747074127,
"mean_token_accuracy": 0.9230215549468994,
"num_tokens": 577410.0,
"step": 416
},
{
"entropy": 0.28713032603263855,
"epoch": 0.06430223592906707,
"grad_norm": 2.0625,
"learning_rate": 4.736333095062228e-06,
"loss": 0.2505059242248535,
"mean_token_accuracy": 0.9073724150657654,
"num_tokens": 578476.0,
"step": 417
},
{
"entropy": 0.2858028709888458,
"epoch": 0.06445643793369314,
"grad_norm": 2.015625,
"learning_rate": 4.641331136875768e-06,
"loss": 0.2911134958267212,
"mean_token_accuracy": 0.9045093059539795,
"num_tokens": 579615.0,
"step": 418
},
{
"entropy": 0.282069593667984,
"epoch": 0.0646106399383192,
"grad_norm": 2.09375,
"learning_rate": 4.547194054020651e-06,
"loss": 0.27553999423980713,
"mean_token_accuracy": 0.90444016456604,
"num_tokens": 580659.0,
"step": 419
},
{
"entropy": 0.22959555685520172,
"epoch": 0.06476484194294525,
"grad_norm": 1.9453125,
"learning_rate": 4.453925845620854e-06,
"loss": 0.22032871842384338,
"mean_token_accuracy": 0.9136531352996826,
"num_tokens": 582022.0,
"step": 420
},
{
"entropy": 0.2052592635154724,
"epoch": 0.06491904394757132,
"grad_norm": 1.7734375,
"learning_rate": 4.361530473888889e-06,
"loss": 0.20798712968826294,
"mean_token_accuracy": 0.9232394099235535,
"num_tokens": 583450.0,
"step": 421
},
{
"entropy": 0.32572290301322937,
"epoch": 0.06507324595219738,
"grad_norm": 2.578125,
"learning_rate": 4.270011863957507e-06,
"loss": 0.33982253074645996,
"mean_token_accuracy": 0.8741418719291687,
"num_tokens": 584332.0,
"step": 422
},
{
"entropy": 0.3089931607246399,
"epoch": 0.06522744795682343,
"grad_norm": 2.578125,
"learning_rate": 4.179373903712913e-06,
"loss": 0.30327266454696655,
"mean_token_accuracy": 0.8930232524871826,
"num_tokens": 585200.0,
"step": 423
},
{
"entropy": 0.19629529118537903,
"epoch": 0.0653816499614495,
"grad_norm": 1.703125,
"learning_rate": 4.089620443629652e-06,
"loss": 0.2054092288017273,
"mean_token_accuracy": 0.9246435761451721,
"num_tokens": 586681.0,
"step": 424
},
{
"entropy": 0.18628910183906555,
"epoch": 0.06553585196607556,
"grad_norm": 1.3359375,
"learning_rate": 4.000755296606973e-06,
"loss": 0.1760605424642563,
"mean_token_accuracy": 0.9416413307189941,
"num_tokens": 588334.0,
"step": 425
},
{
"entropy": 0.194645494222641,
"epoch": 0.06569005397070161,
"grad_norm": 1.90625,
"learning_rate": 3.912782237806903e-06,
"loss": 0.19329358637332916,
"mean_token_accuracy": 0.9218025207519531,
"num_tokens": 589851.0,
"step": 426
},
{
"entropy": 0.19448570907115936,
"epoch": 0.06584425597532768,
"grad_norm": 1.671875,
"learning_rate": 3.825705004493849e-06,
"loss": 0.18638762831687927,
"mean_token_accuracy": 0.9315856695175171,
"num_tokens": 591423.0,
"step": 427
},
{
"entropy": 0.26799967885017395,
"epoch": 0.06599845797995374,
"grad_norm": 2.125,
"learning_rate": 3.739527295875811e-06,
"loss": 0.2695932686328888,
"mean_token_accuracy": 0.9055441617965698,
"num_tokens": 592405.0,
"step": 428
},
{
"entropy": 0.20886771380901337,
"epoch": 0.0661526599845798,
"grad_norm": 1.875,
"learning_rate": 3.6542527729472836e-06,
"loss": 0.22071963548660278,
"mean_token_accuracy": 0.9178168177604675,
"num_tokens": 594007.0,
"step": 429
},
{
"entropy": 0.19780333340168,
"epoch": 0.06630686198920586,
"grad_norm": 1.4296875,
"learning_rate": 3.5698850583336663e-06,
"loss": 0.19298632442951202,
"mean_token_accuracy": 0.9317794442176819,
"num_tokens": 595774.0,
"step": 430
},
{
"entropy": 0.2335851490497589,
"epoch": 0.06646106399383192,
"grad_norm": 1.5859375,
"learning_rate": 3.4864277361374264e-06,
"loss": 0.21905845403671265,
"mean_token_accuracy": 0.9286743402481079,
"num_tokens": 597170.0,
"step": 431
},
{
"entropy": 0.17323604226112366,
"epoch": 0.06661526599845798,
"grad_norm": 1.640625,
"learning_rate": 3.4038843517858075e-06,
"loss": 0.17967088520526886,
"mean_token_accuracy": 0.9360523819923401,
"num_tokens": 599164.0,
"step": 432
},
{
"entropy": 0.2514375150203705,
"epoch": 0.06676946800308405,
"grad_norm": 1.8125,
"learning_rate": 3.3222584118802192e-06,
"loss": 0.2490684688091278,
"mean_token_accuracy": 0.9187192320823669,
"num_tokens": 600390.0,
"step": 433
},
{
"entropy": 0.22465308010578156,
"epoch": 0.0669236700077101,
"grad_norm": 2.515625,
"learning_rate": 3.241553384047258e-06,
"loss": 0.26371464133262634,
"mean_token_accuracy": 0.9116766452789307,
"num_tokens": 601734.0,
"step": 434
},
{
"entropy": 0.20948569476604462,
"epoch": 0.06707787201233616,
"grad_norm": 1.5703125,
"learning_rate": 3.1617726967914235e-06,
"loss": 0.21372012794017792,
"mean_token_accuracy": 0.9316811561584473,
"num_tokens": 603235.0,
"step": 435
},
{
"entropy": 0.20347538590431213,
"epoch": 0.06723207401696223,
"grad_norm": 1.6796875,
"learning_rate": 3.0829197393494548e-06,
"loss": 0.17981462180614471,
"mean_token_accuracy": 0.9269624352455139,
"num_tokens": 604708.0,
"step": 436
},
{
"entropy": 0.23263585567474365,
"epoch": 0.06738627602158828,
"grad_norm": 2.1875,
"learning_rate": 3.004997861546327e-06,
"loss": 0.23778997361660004,
"mean_token_accuracy": 0.9214986562728882,
"num_tokens": 605837.0,
"step": 437
},
{
"entropy": 0.23302724957466125,
"epoch": 0.06754047802621434,
"grad_norm": 2.203125,
"learning_rate": 2.9280103736529896e-06,
"loss": 0.23127038776874542,
"mean_token_accuracy": 0.9103972911834717,
"num_tokens": 607028.0,
"step": 438
},
{
"entropy": 0.18138211965560913,
"epoch": 0.0676946800308404,
"grad_norm": 1.4140625,
"learning_rate": 2.8519605462456965e-06,
"loss": 0.1681656837463379,
"mean_token_accuracy": 0.9345430731773376,
"num_tokens": 608579.0,
"step": 439
},
{
"entropy": 0.17149963974952698,
"epoch": 0.06784888203546646,
"grad_norm": 1.6171875,
"learning_rate": 2.776851610067094e-06,
"loss": 0.1811680942773819,
"mean_token_accuracy": 0.932692289352417,
"num_tokens": 610563.0,
"step": 440
},
{
"entropy": 0.15687499940395355,
"epoch": 0.06800308404009252,
"grad_norm": 1.34375,
"learning_rate": 2.7026867558889694e-06,
"loss": 0.15128004550933838,
"mean_token_accuracy": 0.9400107264518738,
"num_tokens": 612438.0,
"step": 441
},
{
"entropy": 0.22530966997146606,
"epoch": 0.06815728604471859,
"grad_norm": 2.046875,
"learning_rate": 2.6294691343766718e-06,
"loss": 0.22919264435768127,
"mean_token_accuracy": 0.9237637519836426,
"num_tokens": 613902.0,
"step": 442
},
{
"entropy": 0.21813379228115082,
"epoch": 0.06831148804934464,
"grad_norm": 1.7890625,
"learning_rate": 2.557201855955316e-06,
"loss": 0.20722565054893494,
"mean_token_accuracy": 0.9286713004112244,
"num_tokens": 615340.0,
"step": 443
},
{
"entropy": 0.22816047072410583,
"epoch": 0.0684656900539707,
"grad_norm": 1.7890625,
"learning_rate": 2.4858879906775904e-06,
"loss": 0.2418501079082489,
"mean_token_accuracy": 0.9141337275505066,
"num_tokens": 616664.0,
"step": 444
},
{
"entropy": 0.24174243211746216,
"epoch": 0.06861989205859677,
"grad_norm": 1.7421875,
"learning_rate": 2.4155305680933938e-06,
"loss": 0.24712735414505005,
"mean_token_accuracy": 0.9127676486968994,
"num_tokens": 617933.0,
"step": 445
},
{
"entropy": 0.23680631816387177,
"epoch": 0.06877409406322282,
"grad_norm": 2.15625,
"learning_rate": 2.3461325771210683e-06,
"loss": 0.24274389445781708,
"mean_token_accuracy": 0.9137291312217712,
"num_tokens": 619019.0,
"step": 446
},
{
"entropy": 0.21051788330078125,
"epoch": 0.06892829606784888,
"grad_norm": 1.5703125,
"learning_rate": 2.2776969659205005e-06,
"loss": 0.19205066561698914,
"mean_token_accuracy": 0.9310897588729858,
"num_tokens": 620275.0,
"step": 447
},
{
"entropy": 0.19069823622703552,
"epoch": 0.06908249807247495,
"grad_norm": 1.640625,
"learning_rate": 2.2102266417677985e-06,
"loss": 0.193171888589859,
"mean_token_accuracy": 0.9300353527069092,
"num_tokens": 621698.0,
"step": 448
},
{
"entropy": 0.26176121830940247,
"epoch": 0.069236700077101,
"grad_norm": 2.203125,
"learning_rate": 2.143724470931846e-06,
"loss": 0.2646713852882385,
"mean_token_accuracy": 0.9019434452056885,
"num_tokens": 622838.0,
"step": 449
},
{
"entropy": 0.37524735927581787,
"epoch": 0.06939090208172706,
"grad_norm": 3.421875,
"learning_rate": 2.0781932785525122e-06,
"loss": 0.3872081935405731,
"mean_token_accuracy": 0.8746479153633118,
"num_tokens": 623556.0,
"step": 450
},
{
"entropy": 0.20446714758872986,
"epoch": 0.06954510408635313,
"grad_norm": 1.984375,
"learning_rate": 2.013635848520626e-06,
"loss": 0.21962465345859528,
"mean_token_accuracy": 0.9238095283508301,
"num_tokens": 624824.0,
"step": 451
},
{
"entropy": 0.18340152502059937,
"epoch": 0.06969930609097919,
"grad_norm": 1.6796875,
"learning_rate": 1.9500549233597453e-06,
"loss": 0.1832038313150406,
"mean_token_accuracy": 0.9371029138565063,
"num_tokens": 626406.0,
"step": 452
},
{
"entropy": 0.3325141668319702,
"epoch": 0.06985350809560524,
"grad_norm": 3.046875,
"learning_rate": 1.8874532041095989e-06,
"loss": 0.34842032194137573,
"mean_token_accuracy": 0.8773234486579895,
"num_tokens": 627221.0,
"step": 453
},
{
"entropy": 0.20056799054145813,
"epoch": 0.0700077101002313,
"grad_norm": 1.59375,
"learning_rate": 1.825833350211395e-06,
"loss": 0.1930190622806549,
"mean_token_accuracy": 0.9300291538238525,
"num_tokens": 628944.0,
"step": 454
},
{
"entropy": 0.3074391484260559,
"epoch": 0.07016191210485737,
"grad_norm": 2.671875,
"learning_rate": 1.7651979793947949e-06,
"loss": 0.320962131023407,
"mean_token_accuracy": 0.8794258236885071,
"num_tokens": 629997.0,
"step": 455
},
{
"entropy": 0.2851220667362213,
"epoch": 0.07031611410948342,
"grad_norm": 2.4375,
"learning_rate": 1.705549667566747e-06,
"loss": 0.305853009223938,
"mean_token_accuracy": 0.884324312210083,
"num_tokens": 630930.0,
"step": 456
},
{
"entropy": 0.213734969496727,
"epoch": 0.07047031611410948,
"grad_norm": 1.875,
"learning_rate": 1.6468909487020318e-06,
"loss": 0.21344000101089478,
"mean_token_accuracy": 0.9156540632247925,
"num_tokens": 632337.0,
"step": 457
},
{
"entropy": 0.23210836946964264,
"epoch": 0.07062451811873555,
"grad_norm": 1.8984375,
"learning_rate": 1.5892243147356128e-06,
"loss": 0.22123272716999054,
"mean_token_accuracy": 0.921897828578949,
"num_tokens": 633715.0,
"step": 458
},
{
"entropy": 0.2013556957244873,
"epoch": 0.0707787201233616,
"grad_norm": 1.90625,
"learning_rate": 1.5325522154568006e-06,
"loss": 0.2120433896780014,
"mean_token_accuracy": 0.9267473220825195,
"num_tokens": 635211.0,
"step": 459
},
{
"entropy": 0.1748819798231125,
"epoch": 0.07093292212798766,
"grad_norm": 1.40625,
"learning_rate": 1.4768770584051433e-06,
"loss": 0.16574399173259735,
"mean_token_accuracy": 0.9330986142158508,
"num_tokens": 636923.0,
"step": 460
},
{
"entropy": 0.20135805010795593,
"epoch": 0.07108712413261373,
"grad_norm": 1.59375,
"learning_rate": 1.422201208768187e-06,
"loss": 0.20329774916172028,
"mean_token_accuracy": 0.9288026094436646,
"num_tokens": 638476.0,
"step": 461
},
{
"entropy": 0.19482704997062683,
"epoch": 0.07124132613723978,
"grad_norm": 1.5,
"learning_rate": 1.3685269892809715e-06,
"loss": 0.18484120070934296,
"mean_token_accuracy": 0.9365351796150208,
"num_tokens": 640233.0,
"step": 462
},
{
"entropy": 0.2483380138874054,
"epoch": 0.07139552814186584,
"grad_norm": 2.453125,
"learning_rate": 1.315856680127367e-06,
"loss": 0.2574044167995453,
"mean_token_accuracy": 0.9011474251747131,
"num_tokens": 641374.0,
"step": 463
},
{
"entropy": 0.25926902890205383,
"epoch": 0.07154973014649191,
"grad_norm": 2.03125,
"learning_rate": 1.2641925188432102e-06,
"loss": 0.2751407325267792,
"mean_token_accuracy": 0.9096437692642212,
"num_tokens": 642533.0,
"step": 464
},
{
"entropy": 0.19511115550994873,
"epoch": 0.07170393215111796,
"grad_norm": 1.7265625,
"learning_rate": 1.2135367002212321e-06,
"loss": 0.19707168638706207,
"mean_token_accuracy": 0.9302915334701538,
"num_tokens": 644119.0,
"step": 465
},
{
"entropy": 0.2082238495349884,
"epoch": 0.07185813415574402,
"grad_norm": 1.828125,
"learning_rate": 1.1638913762178489e-06,
"loss": 0.2105921357870102,
"mean_token_accuracy": 0.9202454090118408,
"num_tokens": 645431.0,
"step": 466
},
{
"entropy": 0.19069121778011322,
"epoch": 0.07201233616037009,
"grad_norm": 1.6796875,
"learning_rate": 1.1152586558617118e-06,
"loss": 0.17696255445480347,
"mean_token_accuracy": 0.9442567825317383,
"num_tokens": 647215.0,
"step": 467
},
{
"entropy": 0.22916826605796814,
"epoch": 0.07216653816499614,
"grad_norm": 1.7890625,
"learning_rate": 1.0676406051641357e-06,
"loss": 0.22586072981357574,
"mean_token_accuracy": 0.9183526039123535,
"num_tokens": 648607.0,
"step": 468
},
{
"entropy": 0.26740562915802,
"epoch": 0.0723207401696222,
"grad_norm": 2.125,
"learning_rate": 1.0210392470313078e-06,
"loss": 0.2589561343193054,
"mean_token_accuracy": 0.9052631855010986,
"num_tokens": 649660.0,
"step": 469
},
{
"entropy": 0.22609063982963562,
"epoch": 0.07247494217424827,
"grad_norm": 1.859375,
"learning_rate": 9.754565611783812e-07,
"loss": 0.23183754086494446,
"mean_token_accuracy": 0.9105263352394104,
"num_tokens": 650998.0,
"step": 470
},
{
"entropy": 0.2637474536895752,
"epoch": 0.07262914417887432,
"grad_norm": 2.15625,
"learning_rate": 9.308944840453415e-07,
"loss": 0.2506449520587921,
"mean_token_accuracy": 0.9153633713722229,
"num_tokens": 652093.0,
"step": 471
},
{
"entropy": 0.2541276514530182,
"epoch": 0.07278334618350038,
"grad_norm": 2.125,
"learning_rate": 8.873549087147604e-07,
"loss": 0.25114259123802185,
"mean_token_accuracy": 0.9045345783233643,
"num_tokens": 653358.0,
"step": 472
},
{
"entropy": 0.20104283094406128,
"epoch": 0.07293754818812645,
"grad_norm": 1.5234375,
"learning_rate": 8.44839684831375e-07,
"loss": 0.18859422206878662,
"mean_token_accuracy": 0.9310559034347534,
"num_tokens": 654976.0,
"step": 473
},
{
"entropy": 0.23548080027103424,
"epoch": 0.0730917501927525,
"grad_norm": 2.125,
"learning_rate": 8.03350618523499e-07,
"loss": 0.2508711516857147,
"mean_token_accuracy": 0.9083601236343384,
"num_tokens": 656228.0,
"step": 474
},
{
"entropy": 0.2388007789850235,
"epoch": 0.07324595219737856,
"grad_norm": 1.9609375,
"learning_rate": 7.628894723263086e-07,
"loss": 0.25423818826675415,
"mean_token_accuracy": 0.9074475765228271,
"num_tokens": 657619.0,
"step": 475
},
{
"entropy": 0.2098216712474823,
"epoch": 0.07340015420200463,
"grad_norm": 1.6640625,
"learning_rate": 7.234579651069578e-07,
"loss": 0.19636894762516022,
"mean_token_accuracy": 0.9344852566719055,
"num_tokens": 659016.0,
"step": 476
},
{
"entropy": 0.19445836544036865,
"epoch": 0.07355435620663069,
"grad_norm": 1.4296875,
"learning_rate": 6.850577719915624e-07,
"loss": 0.18777857720851898,
"mean_token_accuracy": 0.9340922832489014,
"num_tokens": 660693.0,
"step": 477
},
{
"entropy": 0.2021363377571106,
"epoch": 0.07370855821125674,
"grad_norm": 1.65625,
"learning_rate": 6.47690524294034e-07,
"loss": 0.1869696080684662,
"mean_token_accuracy": 0.9333333373069763,
"num_tokens": 662111.0,
"step": 478
},
{
"entropy": 0.19528843462467194,
"epoch": 0.07386276021588281,
"grad_norm": 2.03125,
"learning_rate": 6.113578094467775e-07,
"loss": 0.17778527736663818,
"mean_token_accuracy": 0.9368270039558411,
"num_tokens": 663512.0,
"step": 479
},
{
"entropy": 0.17402714490890503,
"epoch": 0.07401696222050887,
"grad_norm": 1.2734375,
"learning_rate": 5.760611709332648e-07,
"loss": 0.15594635903835297,
"mean_token_accuracy": 0.9421712756156921,
"num_tokens": 665353.0,
"step": 480
},
{
"entropy": 0.14156945049762726,
"epoch": 0.07417116422513492,
"grad_norm": 1.1640625,
"learning_rate": 5.418021082224472e-07,
"loss": 0.1273384541273117,
"mean_token_accuracy": 0.9454138875007629,
"num_tokens": 667596.0,
"step": 481
},
{
"entropy": 0.15703719854354858,
"epoch": 0.07432536622976099,
"grad_norm": 1.4140625,
"learning_rate": 5.08582076705072e-07,
"loss": 0.15257099270820618,
"mean_token_accuracy": 0.9451599717140198,
"num_tokens": 670011.0,
"step": 482
},
{
"entropy": 0.15173302590847015,
"epoch": 0.07447956823438705,
"grad_norm": 1.359375,
"learning_rate": 4.764024876318357e-07,
"loss": 0.14840558171272278,
"mean_token_accuracy": 0.9457720518112183,
"num_tokens": 672195.0,
"step": 483
},
{
"entropy": 0.251803457736969,
"epoch": 0.0746337702390131,
"grad_norm": 2.140625,
"learning_rate": 4.4526470805345554e-07,
"loss": 0.23033595085144043,
"mean_token_accuracy": 0.9138405323028564,
"num_tokens": 673294.0,
"step": 484
},
{
"entropy": 0.22149844467639923,
"epoch": 0.07478797224363917,
"grad_norm": 1.671875,
"learning_rate": 4.1517006076257914e-07,
"loss": 0.20876595377922058,
"mean_token_accuracy": 0.920634925365448,
"num_tokens": 674751.0,
"step": 485
},
{
"entropy": 0.19992657005786896,
"epoch": 0.07494217424826523,
"grad_norm": 1.609375,
"learning_rate": 3.861198242375852e-07,
"loss": 0.20208041369915009,
"mean_token_accuracy": 0.9220055937767029,
"num_tokens": 676195.0,
"step": 486
},
{
"entropy": 0.2647709846496582,
"epoch": 0.07509637625289128,
"grad_norm": 2.09375,
"learning_rate": 3.581152325882825e-07,
"loss": 0.26581087708473206,
"mean_token_accuracy": 0.9089347124099731,
"num_tokens": 677367.0,
"step": 487
},
{
"entropy": 0.18075726926326752,
"epoch": 0.07525057825751735,
"grad_norm": 1.7109375,
"learning_rate": 3.311574755034796e-07,
"loss": 0.19126133620738983,
"mean_token_accuracy": 0.9318181872367859,
"num_tokens": 678959.0,
"step": 488
},
{
"entropy": 0.23724618554115295,
"epoch": 0.07540478026214341,
"grad_norm": 2.25,
"learning_rate": 3.0524769820044487e-07,
"loss": 0.23674722015857697,
"mean_token_accuracy": 0.9180327653884888,
"num_tokens": 680248.0,
"step": 489
},
{
"entropy": 0.22051914036273956,
"epoch": 0.07555898226676946,
"grad_norm": 1.8828125,
"learning_rate": 2.8038700137624495e-07,
"loss": 0.2116030901670456,
"mean_token_accuracy": 0.9300605058670044,
"num_tokens": 681743.0,
"step": 490
},
{
"entropy": 0.1757911741733551,
"epoch": 0.07571318427139553,
"grad_norm": 1.6015625,
"learning_rate": 2.5657644116100497e-07,
"loss": 0.17098675668239594,
"mean_token_accuracy": 0.9406231641769409,
"num_tokens": 683452.0,
"step": 491
},
{
"entropy": 0.18268117308616638,
"epoch": 0.07586738627602159,
"grad_norm": 1.4140625,
"learning_rate": 2.338170290730246e-07,
"loss": 0.17703530192375183,
"mean_token_accuracy": 0.9361584782600403,
"num_tokens": 685277.0,
"step": 492
},
{
"entropy": 0.19099417328834534,
"epoch": 0.07602158828064765,
"grad_norm": 1.5,
"learning_rate": 2.1210973197582085e-07,
"loss": 0.19510860741138458,
"mean_token_accuracy": 0.9316887855529785,
"num_tokens": 686866.0,
"step": 493
},
{
"entropy": 0.21786467730998993,
"epoch": 0.07617579028527371,
"grad_norm": 2.078125,
"learning_rate": 1.9145547203703597e-07,
"loss": 0.2253967970609665,
"mean_token_accuracy": 0.9227994084358215,
"num_tokens": 688260.0,
"step": 494
},
{
"entropy": 0.22731785476207733,
"epoch": 0.07632999228989977,
"grad_norm": 1.8203125,
"learning_rate": 1.7185512668927706e-07,
"loss": 0.21878266334533691,
"mean_token_accuracy": 0.9235293865203857,
"num_tokens": 689628.0,
"step": 495
},
{
"entropy": 0.31587833166122437,
"epoch": 0.07648419429452583,
"grad_norm": 2.515625,
"learning_rate": 1.533095285928432e-07,
"loss": 0.31676945090293884,
"mean_token_accuracy": 0.8903688788414001,
"num_tokens": 690612.0,
"step": 496
},
{
"entropy": 0.22072257101535797,
"epoch": 0.0766383962991519,
"grad_norm": 1.875,
"learning_rate": 1.3581946560033142e-07,
"loss": 0.20424997806549072,
"mean_token_accuracy": 0.9260969758033752,
"num_tokens": 691919.0,
"step": 497
},
{
"entropy": 0.2378959059715271,
"epoch": 0.07679259830377795,
"grad_norm": 2.125,
"learning_rate": 1.1938568072319412e-07,
"loss": 0.23960573971271515,
"mean_token_accuracy": 0.908172607421875,
"num_tokens": 693016.0,
"step": 498
},
{
"entropy": 0.18599998950958252,
"epoch": 0.076946800308404,
"grad_norm": 1.4453125,
"learning_rate": 1.0400887210015586e-07,
"loss": 0.17737571895122528,
"mean_token_accuracy": 0.9337517619132996,
"num_tokens": 694458.0,
"step": 499
},
{
"entropy": 0.1896909922361374,
"epoch": 0.07710100231303008,
"grad_norm": 1.5546875,
"learning_rate": 8.968969296756224e-08,
"loss": 0.1934422105550766,
"mean_token_accuracy": 0.9257456064224243,
"num_tokens": 696109.0,
"step": 500
},
{
"entropy": 0.18347270786762238,
"epoch": 0.07725520431765613,
"grad_norm": 1.5,
"learning_rate": 7.642875163162977e-08,
"loss": 0.17866890132427216,
"mean_token_accuracy": 0.9312201142311096,
"num_tokens": 697789.0,
"step": 501
},
{
"entropy": 0.27496322989463806,
"epoch": 0.07740940632228219,
"grad_norm": 2.375,
"learning_rate": 6.422661144259989e-08,
"loss": 0.2631693482398987,
"mean_token_accuracy": 0.9099326729774475,
"num_tokens": 698985.0,
"step": 502
},
{
"entropy": 0.21727091073989868,
"epoch": 0.07756360832690826,
"grad_norm": 1.984375,
"learning_rate": 5.308379077080816e-08,
"loss": 0.22967125475406647,
"mean_token_accuracy": 0.9191842675209045,
"num_tokens": 700317.0,
"step": 503
},
{
"entropy": 0.21876828372478485,
"epoch": 0.07771781033153431,
"grad_norm": 1.8359375,
"learning_rate": 4.300076298466571e-08,
"loss": 0.22112873196601868,
"mean_token_accuracy": 0.9281525015830994,
"num_tokens": 701689.0,
"step": 504
},
{
"entropy": 0.1745622456073761,
"epoch": 0.07787201233616037,
"grad_norm": 1.140625,
"learning_rate": 3.3977956430547576e-08,
"loss": 0.1722312867641449,
"mean_token_accuracy": 0.9407705664634705,
"num_tokens": 703436.0,
"step": 505
},
{
"entropy": 0.22518332302570343,
"epoch": 0.07802621434078642,
"grad_norm": 1.7421875,
"learning_rate": 2.6015754414593363e-08,
"loss": 0.22960630059242249,
"mean_token_accuracy": 0.9271809458732605,
"num_tokens": 704831.0,
"step": 506
},
{
"entropy": 0.2924734055995941,
"epoch": 0.07818041634541249,
"grad_norm": 2.609375,
"learning_rate": 1.911449518643138e-08,
"loss": 0.28948456048965454,
"mean_token_accuracy": 0.8898043036460876,
"num_tokens": 705810.0,
"step": 507
},
{
"entropy": 0.29374387860298157,
"epoch": 0.07833461835003855,
"grad_norm": 2.53125,
"learning_rate": 1.3274471924798471e-08,
"loss": 0.2914823293685913,
"mean_token_accuracy": 0.9072463512420654,
"num_tokens": 706853.0,
"step": 508
},
{
"entropy": 0.18828892707824707,
"epoch": 0.0784888203546646,
"grad_norm": 1.5390625,
"learning_rate": 8.495932725094414e-09,
"loss": 0.19034327566623688,
"mean_token_accuracy": 0.9341492056846619,
"num_tokens": 708577.0,
"step": 509
},
{
"entropy": 0.3254898488521576,
"epoch": 0.07864302235929067,
"grad_norm": 2.6875,
"learning_rate": 4.779080588834806e-09,
"loss": 0.3536283075809479,
"mean_token_accuracy": 0.8856015801429749,
"num_tokens": 709599.0,
"step": 510
},
{
"entropy": 0.19601193070411682,
"epoch": 0.07879722436391673,
"grad_norm": 1.71875,
"learning_rate": 2.124073415030181e-09,
"loss": 0.19777625799179077,
"mean_token_accuracy": 0.9288975596427917,
"num_tokens": 711140.0,
"step": 511
},
{
"entropy": 0.22275681793689728,
"epoch": 0.07895142636854278,
"grad_norm": 2.234375,
"learning_rate": 5.310239934885885e-10,
"loss": 0.23580928146839142,
"mean_token_accuracy": 0.9170305728912354,
"num_tokens": 712522.0,
"step": 512
},
{
"epoch": 0.07895142636854278,
"eval_entropy": 0.22568650308408236,
"eval_loss": 0.22544851899147034,
"eval_mean_token_accuracy": 0.919665330160431,
"eval_num_tokens": 712522.0,
"eval_runtime": 34.9909,
"eval_samples_per_second": 78.049,
"eval_steps_per_second": 9.774,
"step": 512
}
],
"logging_steps": 1,
"max_steps": 512,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 128,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4536491036033024e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}